@govtechsg/oobee 0.10.83 → 0.10.84
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -1
- package/dist/constants/common.js +13 -1
- package/dist/crawlers/crawlDomain.js +220 -120
- package/dist/crawlers/crawlIntelligentSitemap.js +22 -7
- package/dist/crawlers/runCustom.js +8 -2
- package/dist/mergeAxeResults/itemReferences.js +55 -0
- package/dist/mergeAxeResults/jsonArtifacts.js +335 -0
- package/dist/mergeAxeResults/scanPages.js +159 -0
- package/dist/mergeAxeResults/sentryTelemetry.js +152 -0
- package/dist/mergeAxeResults/types.js +1 -0
- package/dist/mergeAxeResults/writeCsv.js +125 -0
- package/dist/mergeAxeResults/writeScanDetailsCsv.js +35 -0
- package/dist/mergeAxeResults/writeSitemap.js +10 -0
- package/dist/mergeAxeResults.js +24 -929
- package/dist/proxyService.js +90 -5
- package/dist/utils.js +20 -7
- package/package.json +6 -6
- package/src/constants/common.ts +13 -1
- package/src/crawlers/crawlDomain.ts +248 -137
- package/src/crawlers/crawlIntelligentSitemap.ts +22 -8
- package/src/crawlers/runCustom.ts +10 -2
- package/src/mergeAxeResults/itemReferences.ts +62 -0
- package/src/mergeAxeResults/jsonArtifacts.ts +451 -0
- package/src/mergeAxeResults/scanPages.ts +207 -0
- package/src/mergeAxeResults/sentryTelemetry.ts +183 -0
- package/src/mergeAxeResults/types.ts +99 -0
- package/src/mergeAxeResults/writeCsv.ts +145 -0
- package/src/mergeAxeResults/writeScanDetailsCsv.ts +51 -0
- package/src/mergeAxeResults/writeSitemap.ts +13 -0
- package/src/mergeAxeResults.ts +82 -1318
- package/src/proxyService.ts +96 -4
- package/src/utils.ts +19 -7
package/dist/proxyService.js
CHANGED
|
@@ -281,7 +281,7 @@ function parseWindowsRegistry() {
|
|
|
281
281
|
if (enabledManual && v.ProxyServer) {
|
|
282
282
|
const s = v.ProxyServer.trim();
|
|
283
283
|
if (s.includes('=')) {
|
|
284
|
-
for (const part of s.split(
|
|
284
|
+
for (const part of s.split(/[,;]/)) {
|
|
285
285
|
const [proto, addr] = part.split('=');
|
|
286
286
|
if (!proto || !addr)
|
|
287
287
|
continue;
|
|
@@ -315,15 +315,100 @@ function parseWindowsRegistry() {
|
|
|
315
315
|
/* ============================ Public API ============================ */
|
|
316
316
|
/**
 * Detect the proxy configuration for the current machine.
 *
 * Environment-derived settings (parseEnvProxyCommon) are tried first. When they
 * yield nothing, Windows falls back to parseWindowsRegistry and macOS to
 * parseMacScutil; other platforms have no fallback.
 *
 * If the INCLUDE_PROXY environment variable holds at least one non-empty entry
 * (entries are separated by commas or semicolons) and a proxy was detected, the
 * entries are stored on `info.includeList` and any bypass list is discarded,
 * because INCLUDE_PROXY takes precedence over NO_PROXY.
 *
 * @returns The detected proxy info object (possibly augmented with
 *          `includeList`), or the falsy value produced by the parsers when no
 *          proxy is configured.
 */
export function getProxyInfo() {
    let info = parseEnvProxyCommon();
    if (!info) {
        // Only consult the OS-level settings when the environment gave us nothing.
        switch (os.platform()) {
            case 'win32':
                info = parseWindowsRegistry();
                break;
            case 'darwin':
                info = parseMacScutil();
                break;
            default:
                // Linux/others: environment variables are the only source.
                break;
        }
    }
    const rawInclude = process.env.INCLUDE_PROXY;
    if (!rawInclude || !info) {
        return info;
    }
    const patterns = rawInclude
        .split(/[,;]/)
        .map(entry => entry.trim())
        .filter(Boolean);
    if (patterns.length === 0) {
        return info;
    }
    // INCLUDE_PROXY and NO_PROXY are mutually exclusive; INCLUDE_PROXY wins.
    const noProxy = process.env.NO_PROXY || process.env.no_proxy;
    if (noProxy || info.bypassList) {
        console.warn('INCLUDE_PROXY is set — ignoring NO_PROXY / bypass list. ' +
            'These two settings cannot be mixed; INCLUDE_PROXY takes precedence.');
        info.bypassList = undefined;
    }
    info.includeList = patterns;
    return info;
}
|
|
345
|
+
/**
 * Convert a glob-style domain pattern (e.g. *.example.com) to an anchored regex source string.
 *
 * Every regex metacharacter except `*` is escaped so that it matches literally.
 * Each `*` becomes `.*`, which matches any run of characters (dots included), so
 * `*.example.com` matches `a.example.com` as well as `a.b.example.com`.
 *
 * @param {string} pattern - Glob-style domain pattern.
 * @returns {string} Regex source wrapped in `^...$`, ready for `new RegExp(...)`.
 */
function domainPatternToRegex(pattern) {
    const escaped = pattern
        // `?` must be escaped as well: left raw it acts as a quantifier on the
        // preceding character instead of matching literally.
        .replace(/[.+?^${}()|[\]\\]/g, '\\$&')
        .replace(/\*/g, '.*'); // convert glob * to regex .*
    return `^${escaped}$`;
}
|
|
356
|
+
/**
 * Build a PAC script that routes only includeList domains through the given proxy,
 * and returns DIRECT for everything else.
 *
 * @param {string} proxyServer - Proxy address such as "http://host:port" or
 *        "socks5://host:port". The scheme is matched case-insensitively; a value
 *        without a recognised scheme is treated as an HTTP proxy "host:port".
 * @param {string[]} includeList - Glob patterns (e.g. "*.example.com") tested
 *        against the request host with the PAC built-in shExpMatch.
 * @returns {string} Source text of a PAC script defining FindProxyForURL.
 */
function buildIncludeOnlyPac(proxyServer, includeList) {
    const patterns = Array.isArray(includeList) ? includeList.filter(Boolean) : [];
    // With nothing to include, an `if ()` would be a syntax error in the PAC
    // script, so emit a script that simply sends everything DIRECT.
    if (patterns.length === 0) {
        return [
            'function FindProxyForURL(url, host) {',
            ' return "DIRECT";',
            '}',
        ].join('\n');
    }
    // Split "scheme://host:port" into the scheme and the PAC "host:port" part.
    const schemeMatch = /^(socks5|socks4|socks|https?):\/\//i.exec(proxyServer);
    const scheme = schemeMatch ? schemeMatch[1].toLowerCase() : '';
    const hostPort = schemeMatch ? proxyServer.slice(schemeMatch[0].length) : proxyServer;
    let directive;
    if (scheme === 'socks5') {
        directive = 'SOCKS5';
    }
    else if (scheme === 'socks' || scheme === 'socks4') {
        directive = 'SOCKS';
    }
    else {
        // http:// or https:// (or no scheme) → PROXY host:port
        directive = 'PROXY';
    }
    const pacProxy = `${directive} ${hostPort}`;
    // JSON.stringify yields safely quoted JS string literals for the generated script.
    const conditions = patterns
        .map(pattern => `shExpMatch(host, ${JSON.stringify(pattern)})`)
        .join(' || ');
    return [
        'function FindProxyForURL(url, host) {',
        ` if (${conditions}) {`,
        ` return ${JSON.stringify(pacProxy)};`,
        ' }',
        ' return "DIRECT";',
        '}',
    ].join('\n');
}
|
|
324
387
|
export function proxyInfoToResolution(info) {
|
|
325
388
|
if (!info)
|
|
326
389
|
return { kind: 'none' };
|
|
390
|
+
// If INCLUDE_PROXY is set, generate a PAC that only proxies the listed domains
|
|
391
|
+
if (info.includeList && info.includeList.length > 0) {
|
|
392
|
+
// Determine the proxy server string
|
|
393
|
+
let proxyServer;
|
|
394
|
+
if (info.http)
|
|
395
|
+
proxyServer = `http://${info.http}`;
|
|
396
|
+
else if (info.https)
|
|
397
|
+
proxyServer = `http://${info.https}`;
|
|
398
|
+
else if (info.socks)
|
|
399
|
+
proxyServer = `socks5://${info.socks}`;
|
|
400
|
+
if (proxyServer) {
|
|
401
|
+
// If credentials exist, embed them for the manual proxy auth
|
|
402
|
+
// PAC scripts themselves don't carry auth, but Playwright's proxy option can
|
|
403
|
+
// We use PAC for routing, and if creds are needed, Playwright will prompt or
|
|
404
|
+
// we set them via the proxy option. Unfortunately Playwright doesn't allow
|
|
405
|
+
// proxy.username with a PAC, so we embed in the launch arg and rely on
|
|
406
|
+
// Chromium's built-in auth handler for proxy auth challenges.
|
|
407
|
+
const pac = buildIncludeOnlyPac(proxyServer, info.includeList);
|
|
408
|
+
const pacDataUrl = `data:application/x-ns-proxy-autoconfig;base64,${Buffer.from(pac).toString('base64')}`;
|
|
409
|
+
return { kind: 'pac', pacUrl: pacDataUrl, bypass: info.bypassList };
|
|
410
|
+
}
|
|
411
|
+
}
|
|
327
412
|
// Prefer manual proxies first (these work with Playwright's proxy option)
|
|
328
413
|
if (info.http) {
|
|
329
414
|
return { kind: 'manual', settings: {
|
package/dist/utils.js
CHANGED
|
@@ -3,6 +3,7 @@ import os from 'os';
|
|
|
3
3
|
import fs from 'fs-extra';
|
|
4
4
|
import axe from 'axe-core';
|
|
5
5
|
import { v4 as uuidv4 } from 'uuid';
|
|
6
|
+
import { getDomain } from 'tldts';
|
|
6
7
|
import constants, { destinationPath, getIntermediateScreenshotsPath, } from './constants/constants.js';
|
|
7
8
|
import { consoleLogger, errorsTxtPath } from './logs.js';
|
|
8
9
|
import { getAxeConfiguration } from './crawlers/custom/getAxeConfiguration.js';
|
|
@@ -852,14 +853,26 @@ export const randomThreeDigitNumberString = () => {
|
|
|
852
853
|
return String(threeDigitNumber);
|
|
853
854
|
};
|
|
854
855
|
/**
 * Decide whether link2 may be followed from link1 under the given strategy.
 *
 * Both links must parse as absolute URLs; if either fails to parse (or anything
 * else throws while comparing) the result is false.
 *
 * @param link1 - First URL string.
 * @param link2 - Second URL string.
 * @param rule - 'all' (always follow), 'same-origin' (origins must be equal),
 *        'same-domain' (the domain reported by tldts getDomain must be equal,
 *        falling back to the hostname when getDomain returns nothing), or any
 *        other value for the default same-hostname comparison.
 * @returns true when the pair satisfies the rule, false otherwise.
 */
export const isFollowStrategy = (link1, link2, rule) => {
    try {
        const first = new URL(link1);
        const second = new URL(link2);
        switch (rule) {
            case 'all':
                return true;
            case 'same-origin':
                return first.origin === second.origin;
            case 'same-domain': {
                const domainOf = ({ hostname }) => getDomain(hostname, { allowPrivateDomains: true }) || hostname;
                const firstDomain = domainOf(first);
                const secondDomain = domainOf(second);
                return firstDomain.toLowerCase() === secondDomain.toLowerCase();
            }
            default:
                // default: same-hostname
                return first.hostname === second.hostname;
        }
    }
    catch {
        return false;
    }
};
|
|
864
877
|
export const retryFunction = async (func, maxAttempt) => {
|
|
865
878
|
let attemptCount = 0;
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@govtechsg/oobee",
|
|
3
3
|
"main": "dist/npmIndex.js",
|
|
4
|
-
"version": "0.10.
|
|
4
|
+
"version": "0.10.84",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"author": "Government Technology Agency <info@tech.gov.sg>",
|
|
7
7
|
"bin": {
|
|
@@ -19,12 +19,12 @@
|
|
|
19
19
|
"cheerio": "^1.0.0-rc.12",
|
|
20
20
|
"crawlee": "^3.13.10",
|
|
21
21
|
"ejs": "^3.1.9",
|
|
22
|
-
"file-type": "^
|
|
22
|
+
"file-type": "^21.3.3",
|
|
23
23
|
"fs-extra": "^11.2.0",
|
|
24
24
|
"glob": "^13.0.6",
|
|
25
25
|
"https": "^1.0.0",
|
|
26
26
|
"inquirer": "^9.2.12",
|
|
27
|
-
"jsdom": "^
|
|
27
|
+
"jsdom": "^29.0.0",
|
|
28
28
|
"jszip": "^3.10.1",
|
|
29
29
|
"lodash": "^4.17.21",
|
|
30
30
|
"mime": "^4.0.7",
|
|
@@ -36,6 +36,7 @@
|
|
|
36
36
|
"print-message": "^3.0.1",
|
|
37
37
|
"safe-regex": "^2.1.1",
|
|
38
38
|
"text-readability": "^1.1.0",
|
|
39
|
+
"tldts": "^7.0.26",
|
|
39
40
|
"typescript": "^5.4.5",
|
|
40
41
|
"url": "^0.11.3",
|
|
41
42
|
"uuid": "^11.0.3",
|
|
@@ -87,9 +88,8 @@
|
|
|
87
88
|
"minimatch": "^10.2.4",
|
|
88
89
|
"brace-expansion": "^5.0.4",
|
|
89
90
|
"glob": "^13.0.6",
|
|
90
|
-
"
|
|
91
|
-
|
|
92
|
-
}
|
|
91
|
+
"flatted": "^3.4.1",
|
|
92
|
+
"file-type": "^21.3.3"
|
|
93
93
|
},
|
|
94
94
|
"optionalDependencies": {
|
|
95
95
|
"@napi-rs/canvas-darwin-arm64": "^0.1.53",
|
package/src/constants/common.ts
CHANGED
|
@@ -446,7 +446,19 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
446
446
|
|
|
447
447
|
if (!response) throw new Error('No response from navigation');
|
|
448
448
|
|
|
449
|
-
//
|
|
449
|
+
// Wait briefly for JS/meta-refresh redirects to settle before reading the final URL.
|
|
450
|
+
// Server-side redirects are already reflected after goto(), but client-side redirects
|
|
451
|
+
// (e.g. domain.tld -> www.domain.tld via JS or meta-refresh) need extra time.
|
|
452
|
+
try {
|
|
453
|
+
await Promise.race([
|
|
454
|
+
page.waitForURL(currentUrl => currentUrl !== url, { timeout: 5000 }),
|
|
455
|
+
new Promise(resolve => setTimeout(resolve, 1000)), // minimum settle time
|
|
456
|
+
]);
|
|
457
|
+
} catch {
|
|
458
|
+
// No redirect happened within the window — that's fine, continue with current URL
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
// Re-read page.url() AFTER potential client-side redirects have resolved
|
|
450
462
|
const finalUrl = page.url();
|
|
451
463
|
const finalStatus = response.status();
|
|
452
464
|
const headers = response.headers();
|