@govtechsg/oobee 0.10.63 → 0.10.65
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Dockerfile +8 -3
- package/README.md +2 -0
- package/package.json +2 -1
- package/src/cli.ts +1 -2
- package/src/combine.ts +2 -2
- package/src/constants/common.ts +58 -52
- package/src/constants/constants.ts +45 -71
- package/src/constants/questions.ts +15 -2
- package/src/crawlers/crawlDomain.ts +1 -0
- package/src/crawlers/crawlIntelligentSitemap.ts +0 -3
- package/src/crawlers/crawlLocalFile.ts +29 -32
- package/src/crawlers/crawlSitemap.ts +7 -6
- package/src/crawlers/pdfScanFunc.ts +22 -50
- package/src/mergeAxeResults.ts +14 -3
- package/src/npmIndex.ts +2 -3
- package/src/proxyService.ts +405 -0
- package/src/screenshotFunc/pdfScreenshotFunc.ts +2 -5
- package/src/utils.ts +58 -60
package/Dockerfile
CHANGED
|
@@ -2,9 +2,14 @@
|
|
|
2
2
|
# Node version is v22
|
|
3
3
|
FROM mcr.microsoft.com/playwright:v1.50.1-noble
|
|
4
4
|
|
|
5
|
-
# Installation of packages for oobee and runner
|
|
6
|
-
RUN apt-get update && apt-get install -y
|
|
7
|
-
|
|
5
|
+
# Installation of packages for oobee and runner (locked versions from build log)
|
|
6
|
+
RUN apt-get update && apt-get install -y \
|
|
7
|
+
git=1:2.43.0-1ubuntu7.3 \
|
|
8
|
+
git-man=1:2.43.0-1ubuntu7.3 \
|
|
9
|
+
unzip=6.0-28ubuntu4.1 \
|
|
10
|
+
zip=3.0-13ubuntu0.2 \
|
|
11
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
12
|
+
|
|
8
13
|
WORKDIR /app/oobee
|
|
9
14
|
|
|
10
15
|
# Clone oobee repository
|
package/README.md
CHANGED
|
@@ -88,6 +88,8 @@ verapdf --version
|
|
|
88
88
|
| OOBEE_VALIDATE_URL| When set to `true`, validates if URLs are valid and exits. | `false` |
|
|
89
89
|
| OOBEE_LOGS_PATH | When set, logs are written to this path. | |
|
|
90
90
|
| WARN_LEVEL | Only used in tests. | |
|
|
91
|
+
| OOBEE_DISABLE_BROWSER_DOWNLOAD | Experimental flag to disable file downloads on Chrome/Chromium/Edge. Does not affect Local File scans. | |
|
|
92
|
+
| OOBEE_SLOWMO | Experimental flag to slow down web browser behaviour by specified duration (in milliseconds) | |
|
|
91
93
|
|
|
92
94
|
#### Environment variables used internally (Do not set)
|
|
93
95
|
Do not set these environment variables or behaviour might change unexpectedly.
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@govtechsg/oobee",
|
|
3
3
|
"main": "dist/npmIndex.js",
|
|
4
|
-
"version": "0.10.
|
|
4
|
+
"version": "0.10.65",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"author": "Government Technology Agency <info@tech.gov.sg>",
|
|
7
7
|
"dependencies": {
|
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
"https": "^1.0.0",
|
|
21
21
|
"inquirer": "^9.2.12",
|
|
22
22
|
"jsdom": "^21.1.2",
|
|
23
|
+
"jszip": "^3.10.1",
|
|
23
24
|
"lodash": "^4.17.21",
|
|
24
25
|
"mime-types": "^2.1.35",
|
|
25
26
|
"minimatch": "^9.0.3",
|
package/src/cli.ts
CHANGED
|
@@ -5,7 +5,7 @@ import printMessage from 'print-message';
|
|
|
5
5
|
import { devices } from 'playwright';
|
|
6
6
|
import { fileURLToPath } from 'url';
|
|
7
7
|
import path from 'path';
|
|
8
|
-
import {
|
|
8
|
+
import { setHeadlessMode, getVersion, getStoragePath, listenForCleanUp, cleanUpAndExit } from './utils.js';
|
|
9
9
|
import {
|
|
10
10
|
checkUrl,
|
|
11
11
|
prepareData,
|
|
@@ -16,7 +16,6 @@ import {
|
|
|
16
16
|
validateDirPath,
|
|
17
17
|
validateFilePath,
|
|
18
18
|
validateCustomFlowLabel,
|
|
19
|
-
parseHeaders,
|
|
20
19
|
} from './constants/common.js';
|
|
21
20
|
import constants, { ScannerTypes } from './constants/constants.js';
|
|
22
21
|
import { cliOptions, messageOptions } from './constants/cliFunctions.js';
|
package/src/combine.ts
CHANGED
|
@@ -5,7 +5,7 @@ import crawlDomain from './crawlers/crawlDomain.js';
|
|
|
5
5
|
import crawlLocalFile from './crawlers/crawlLocalFile.js';
|
|
6
6
|
import crawlIntelligentSitemap from './crawlers/crawlIntelligentSitemap.js';
|
|
7
7
|
import generateArtifacts from './mergeAxeResults.js';
|
|
8
|
-
import { getHost, createAndUpdateResultsFolders,
|
|
8
|
+
import { getHost, createAndUpdateResultsFolders, cleanUpAndExit } from './utils.js';
|
|
9
9
|
import { ScannerTypes, UrlsCrawled } from './constants/constants.js';
|
|
10
10
|
import { getBlackListedPatterns, submitForm } from './constants/common.js';
|
|
11
11
|
import { consoleLogger, silentLogger } from './logs.js';
|
|
@@ -218,7 +218,7 @@ const combineRun = async (details: Data, deviceToScan: string) => {
|
|
|
218
218
|
|
|
219
219
|
scanDetails.endTime = new Date();
|
|
220
220
|
scanDetails.urlsCrawled = urlsCrawledObj;
|
|
221
|
-
|
|
221
|
+
|
|
222
222
|
if (scanDetails.urlsCrawled) {
|
|
223
223
|
if (scanDetails.urlsCrawled.scanned.length > 0) {
|
|
224
224
|
await createAndUpdateResultsFolders(randomToken);
|
package/src/constants/common.ts
CHANGED
|
@@ -22,18 +22,18 @@ import constants, {
|
|
|
22
22
|
getDefaultChromeDataDir,
|
|
23
23
|
getDefaultEdgeDataDir,
|
|
24
24
|
getDefaultChromiumDataDir,
|
|
25
|
-
proxy,
|
|
26
25
|
// Legacy code start - Google Sheets submission
|
|
27
26
|
formDataFields,
|
|
28
27
|
// Legacy code end - Google Sheets submission
|
|
29
28
|
ScannerTypes,
|
|
30
29
|
BrowserTypes,
|
|
31
30
|
} from './constants.js';
|
|
32
|
-
import { consoleLogger
|
|
31
|
+
import { consoleLogger } from '../logs.js';
|
|
33
32
|
import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
|
|
34
33
|
import { cleanUpAndExit, randomThreeDigitNumberString, register } from '../utils.js';
|
|
35
34
|
import { Answers, Data } from '../index.js';
|
|
36
35
|
import { DeviceDescriptor } from '../types/types.js';
|
|
36
|
+
import { getProxyInfo, proxyInfoToResolution, ProxySettings } from '../proxyService.js';
|
|
37
37
|
|
|
38
38
|
// validateDirPath validates a provided directory path
|
|
39
39
|
// returns null if no error
|
|
@@ -304,6 +304,7 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
304
304
|
ignoreHTTPSErrors: true,
|
|
305
305
|
...getPlaywrightLaunchOptions(browserToRun),
|
|
306
306
|
...playwrightDeviceDetailsObject,
|
|
307
|
+
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
307
308
|
});
|
|
308
309
|
|
|
309
310
|
register(browserContext);
|
|
@@ -485,7 +486,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
|
485
486
|
viewportWidth,
|
|
486
487
|
maxpages,
|
|
487
488
|
strategy,
|
|
488
|
-
isLocalFileScan,
|
|
489
|
+
isLocalFileScan = false,
|
|
489
490
|
browserToRun,
|
|
490
491
|
nameEmail,
|
|
491
492
|
customFlowLabel,
|
|
@@ -510,6 +511,10 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
|
510
511
|
let username = '';
|
|
511
512
|
let password = '';
|
|
512
513
|
|
|
514
|
+
if (isFilePath(url)) {
|
|
515
|
+
argv.isLocalFileScan = true;
|
|
516
|
+
}
|
|
517
|
+
|
|
513
518
|
// Remove credentials from URL if not a local file scan
|
|
514
519
|
url = argv.isLocalFileScan
|
|
515
520
|
? url
|
|
@@ -550,7 +555,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
|
550
555
|
viewportWidth,
|
|
551
556
|
);
|
|
552
557
|
|
|
553
|
-
const { browserToRun: resolvedBrowser, clonedBrowserDataDir } = getBrowserToRun(browserToRun, true
|
|
558
|
+
const { browserToRun: resolvedBrowser, clonedBrowserDataDir } = getBrowserToRun(resultFilename, browserToRun, true);
|
|
554
559
|
browserToRun = resolvedBrowser;
|
|
555
560
|
|
|
556
561
|
const resolvedUserDataDirectory = getClonedProfilesWithRandomToken(browserToRun, resultFilename);
|
|
@@ -1005,14 +1010,10 @@ export const validName = (name: string) => {
|
|
|
1005
1010
|
* @returns object consisting of browser to run and cloned data directory
|
|
1006
1011
|
*/
|
|
1007
1012
|
export const getBrowserToRun = (
|
|
1013
|
+
randomToken: string,
|
|
1008
1014
|
preferredBrowser?: BrowserTypes,
|
|
1009
1015
|
isCli = false,
|
|
1010
|
-
randomToken?: string
|
|
1011
1016
|
): { browserToRun: BrowserTypes; clonedBrowserDataDir: string } => {
|
|
1012
|
-
|
|
1013
|
-
if (!randomToken) {
|
|
1014
|
-
randomToken = '';
|
|
1015
|
-
}
|
|
1016
1017
|
|
|
1017
1018
|
const platform = os.platform();
|
|
1018
1019
|
|
|
@@ -1597,15 +1598,8 @@ export const submitFormViaPlaywright = async (
|
|
|
1597
1598
|
userDataDirectory: string,
|
|
1598
1599
|
finalUrl: string,
|
|
1599
1600
|
) => {
|
|
1600
|
-
const dirName = `clone-${Date.now()}`;
|
|
1601
|
-
let clonedDir = null;
|
|
1602
|
-
if (proxy && browserToRun === BrowserTypes.EDGE) {
|
|
1603
|
-
clonedDir = cloneEdgeProfiles(dirName);
|
|
1604
|
-
} else if (proxy && browserToRun === BrowserTypes.CHROME) {
|
|
1605
|
-
clonedDir = cloneChromeProfiles(dirName);
|
|
1606
|
-
}
|
|
1607
1601
|
const browserContext = await constants.launcher.launchPersistentContext(
|
|
1608
|
-
|
|
1602
|
+
userDataDirectory,
|
|
1609
1603
|
{
|
|
1610
1604
|
...getPlaywrightLaunchOptions(browserToRun),
|
|
1611
1605
|
},
|
|
@@ -1618,7 +1612,7 @@ export const submitFormViaPlaywright = async (
|
|
|
1618
1612
|
try {
|
|
1619
1613
|
await page.goto(finalUrl, {
|
|
1620
1614
|
timeout: 30000,
|
|
1621
|
-
|
|
1615
|
+
waitUntil: 'commit',
|
|
1622
1616
|
});
|
|
1623
1617
|
|
|
1624
1618
|
try {
|
|
@@ -1630,11 +1624,6 @@ export const submitFormViaPlaywright = async (
|
|
|
1630
1624
|
consoleLogger.error(error);
|
|
1631
1625
|
} finally {
|
|
1632
1626
|
await browserContext.close();
|
|
1633
|
-
if (proxy && browserToRun === BrowserTypes.EDGE) {
|
|
1634
|
-
deleteClonedEdgeProfiles(clonedDir);
|
|
1635
|
-
} else if (proxy && browserToRun === BrowserTypes.CHROME) {
|
|
1636
|
-
deleteClonedChromeProfiles(clonedDir);
|
|
1637
|
-
}
|
|
1638
1627
|
}
|
|
1639
1628
|
};
|
|
1640
1629
|
|
|
@@ -1673,19 +1662,17 @@ export const submitForm = async (
|
|
|
1673
1662
|
finalUrl += `&${formDataFields.redirectUrlField}=${scannedUrl}`;
|
|
1674
1663
|
}
|
|
1675
1664
|
|
|
1676
|
-
|
|
1677
|
-
|
|
1678
|
-
|
|
1679
|
-
|
|
1680
|
-
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
if (browserToRun || constants.launcher === webkit) {
|
|
1684
|
-
await submitFormViaPlaywright(browserToRun, userDataDirectory, finalUrl);
|
|
1685
|
-
}
|
|
1665
|
+
|
|
1666
|
+
try {
|
|
1667
|
+
await axios.get(finalUrl, { timeout: 2000 });
|
|
1668
|
+
} catch (error) {
|
|
1669
|
+
if (error.code === 'ECONNABORTED') {
|
|
1670
|
+
if (browserToRun || constants.launcher === webkit) {
|
|
1671
|
+
await submitFormViaPlaywright(browserToRun, userDataDirectory, finalUrl);
|
|
1686
1672
|
}
|
|
1687
1673
|
}
|
|
1688
1674
|
}
|
|
1675
|
+
|
|
1689
1676
|
};
|
|
1690
1677
|
// Legacy code end - Google Sheets submission
|
|
1691
1678
|
|
|
@@ -1736,42 +1723,61 @@ export async function initModifiedUserAgent(
|
|
|
1736
1723
|
// console.log('Modified User Agent:', modifiedUA);
|
|
1737
1724
|
}
|
|
1738
1725
|
|
|
1726
|
+
const cacheProxyInfo = getProxyInfo();
|
|
1727
|
+
|
|
1739
1728
|
/**
|
|
1740
1729
|
* @param {string} browser browser name ("chrome" or "edge", null for chromium, the default Playwright browser)
|
|
1741
1730
|
* @returns playwright launch options object. For more details: https://playwright.dev/docs/api/class-browsertype#browser-type-launch
|
|
1742
1731
|
*/
|
|
1743
1732
|
export const getPlaywrightLaunchOptions = (browser?: string): LaunchOptions => {
|
|
1744
|
-
|
|
1745
|
-
|
|
1746
|
-
|
|
1747
|
-
|
|
1733
|
+
const channel = browser || undefined;
|
|
1734
|
+
|
|
1735
|
+
const resolution = proxyInfoToResolution(cacheProxyInfo);
|
|
1736
|
+
|
|
1737
|
+
// Start with your base args
|
|
1738
|
+
const finalArgs = [...constants.launchOptionsArgs];
|
|
1748
1739
|
|
|
1749
|
-
//
|
|
1750
|
-
// Also mute audio
|
|
1740
|
+
// Headless flags (unchanged)
|
|
1751
1741
|
if (process.env.CRAWLEE_HEADLESS === '1') {
|
|
1752
|
-
|
|
1753
|
-
|
|
1742
|
+
if (!finalArgs.includes('--headless=new')) finalArgs.push('--headless=new');
|
|
1743
|
+
if (!finalArgs.includes('--mute-audio')) finalArgs.push('--mute-audio');
|
|
1744
|
+
}
|
|
1745
|
+
|
|
1746
|
+
// Map resolution to Playwright options
|
|
1747
|
+
let proxyOpt: ProxySettings | undefined;
|
|
1748
|
+
switch (resolution.kind) {
|
|
1749
|
+
case 'manual':
|
|
1750
|
+
proxyOpt = resolution.settings;
|
|
1751
|
+
break;
|
|
1752
|
+
case 'pac': {
|
|
1753
|
+
finalArgs.push(`--proxy-pac-url=${resolution.pacUrl}`);
|
|
1754
|
+
if (resolution.bypass) finalArgs.push(`--proxy-bypass-list=${resolution.bypass}`);
|
|
1755
|
+
break;
|
|
1756
|
+
}
|
|
1757
|
+
case 'none':
|
|
1758
|
+
// nothing
|
|
1759
|
+
break;
|
|
1754
1760
|
}
|
|
1755
1761
|
|
|
1756
1762
|
const options: LaunchOptions = {
|
|
1757
|
-
// Drop the --use-mock-keychain flag to allow MacOS devices
|
|
1758
|
-
// to use the cloned cookies.
|
|
1759
1763
|
ignoreDefaultArgs: ['--use-mock-keychain', '--headless'],
|
|
1760
|
-
|
|
1761
|
-
args: constants.launchOptionsArgs,
|
|
1764
|
+
args: finalArgs,
|
|
1762
1765
|
headless: false,
|
|
1763
|
-
...(channel && { channel }),
|
|
1766
|
+
...(channel && { channel }),
|
|
1767
|
+
...(proxyOpt ? { proxy: proxyOpt } : {}),
|
|
1764
1768
|
};
|
|
1765
1769
|
|
|
1766
|
-
//
|
|
1767
|
-
options.
|
|
1770
|
+
// SlowMo (unchanged)
|
|
1771
|
+
if (!options.slowMo && process.env.OOBEE_SLOWMO && Number(process.env.OOBEE_SLOWMO) >= 1) {
|
|
1772
|
+
options.slowMo = Number(process.env.OOBEE_SLOWMO);
|
|
1773
|
+
consoleLogger.info(`Enabled browser slowMo with value: ${process.env.OOBEE_SLOWMO}ms`);
|
|
1774
|
+
}
|
|
1768
1775
|
|
|
1769
|
-
|
|
1770
|
-
|
|
1771
|
-
} else if (browser === BrowserTypes.EDGE && os.platform() === 'win32') {
|
|
1772
|
-
// edge should be in non-headless mode
|
|
1776
|
+
// Edge on Windows should not be headless (unchanged)
|
|
1777
|
+
if (browser === BrowserTypes.EDGE && os.platform() === 'win32') {
|
|
1773
1778
|
options.headless = false;
|
|
1774
1779
|
}
|
|
1780
|
+
|
|
1775
1781
|
return options;
|
|
1776
1782
|
};
|
|
1777
1783
|
|
|
@@ -141,14 +141,53 @@ export const getDefaultChromiumDataDir = () => {
|
|
|
141
141
|
}
|
|
142
142
|
};
|
|
143
143
|
|
|
144
|
-
export
|
|
145
|
-
if (os.platform()
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
144
|
+
export function removeQuarantineFlag(searchPattern: string, allowedRoot = process.cwd()) {
|
|
145
|
+
if (os.platform() !== 'darwin') return;
|
|
146
|
+
|
|
147
|
+
const matches = globSync(searchPattern, {
|
|
148
|
+
absolute: true,
|
|
149
|
+
nodir: true,
|
|
150
|
+
dot: true,
|
|
151
|
+
follow: false, // don't follow symlinks
|
|
152
|
+
});
|
|
153
|
+
|
|
154
|
+
const root = path.resolve(allowedRoot);
|
|
155
|
+
|
|
156
|
+
for (const p of matches) {
|
|
157
|
+
const resolved = path.resolve(p);
|
|
158
|
+
|
|
159
|
+
// Ensure the file is under the allowed root (containment check)
|
|
160
|
+
if (!resolved.startsWith(root + path.sep)) continue;
|
|
161
|
+
|
|
162
|
+
// lstat: skip if not a regular file or if it's a symlink
|
|
163
|
+
let st: fs.Stats;
|
|
164
|
+
try {
|
|
165
|
+
st = fs.lstatSync(resolved);
|
|
166
|
+
} catch {
|
|
167
|
+
continue;
|
|
168
|
+
}
|
|
169
|
+
if (!st.isFile() || st.isSymbolicLink()) continue;
|
|
170
|
+
|
|
171
|
+
// basic filename sanity: no control chars
|
|
172
|
+
const base = path.basename(resolved);
|
|
173
|
+
if (/[\x00-\x1F]/.test(base)) continue;
|
|
174
|
+
|
|
175
|
+
// Use absolute binary path and terminate options with "--"
|
|
176
|
+
const proc = spawnSync('/usr/bin/xattr', ['-d', 'com.apple.quarantine', '--', resolved], {
|
|
177
|
+
stdio: ['ignore', 'ignore', 'pipe'],
|
|
178
|
+
});
|
|
179
|
+
|
|
180
|
+
// Optional: inspect errors (common benign case is "No such xattr")
|
|
181
|
+
if (proc.status !== 0) {
|
|
182
|
+
const err = proc.stderr?.toString() || '';
|
|
183
|
+
// swallow benign errors; otherwise log if you have a logger
|
|
184
|
+
if (!/No such xattr/i.test(err)) {
|
|
185
|
+
// console.warn(`xattr failed for ${resolved}: ${err.trim()}`);
|
|
186
|
+
}
|
|
149
187
|
}
|
|
150
188
|
}
|
|
151
|
-
}
|
|
189
|
+
}
|
|
190
|
+
|
|
152
191
|
|
|
153
192
|
export const getExecutablePath = function (dir: string, file: string): string {
|
|
154
193
|
let execPaths = globSync(`${dir}/${file}`, { absolute: true, nodir: true });
|
|
@@ -228,71 +267,6 @@ if (fs.existsSync('/.dockerenv')) {
|
|
|
228
267
|
launchOptionsArgs = ['--disable-gpu', '--no-sandbox', '--disable-dev-shm-usage'];
|
|
229
268
|
}
|
|
230
269
|
|
|
231
|
-
type ProxyInfo = { type: 'autoConfig' | 'manualProxy'; url: string } | null;
|
|
232
|
-
|
|
233
|
-
function queryRegKey(key: string): Record<string, string> {
|
|
234
|
-
try {
|
|
235
|
-
const out = execSync(`reg query "${key}"`, { encoding: 'utf8', stdio: ['ignore', 'pipe', 'pipe'] });
|
|
236
|
-
const values: Record<string, string> = {};
|
|
237
|
-
for (const line of out.split(/\r?\n/)) {
|
|
238
|
-
const parts = line.trim().split(/\s{2,}/);
|
|
239
|
-
if (parts.length >= 3) {
|
|
240
|
-
const [name, _type, ...rest] = parts;
|
|
241
|
-
values[name] = rest.join(' ');
|
|
242
|
-
}
|
|
243
|
-
}
|
|
244
|
-
return values;
|
|
245
|
-
} catch {
|
|
246
|
-
return {};
|
|
247
|
-
}
|
|
248
|
-
}
|
|
249
|
-
|
|
250
|
-
function parseDwordFlag(v: unknown): number {
|
|
251
|
-
if (v == null) return 0;
|
|
252
|
-
const s = String(v).trim();
|
|
253
|
-
// Handles "1", "0", "0x1", "0x0"
|
|
254
|
-
if (/^0x[0-9a-f]+$/i.test(s)) return parseInt(s, 16);
|
|
255
|
-
if (/^\d+$/.test(s)) return parseInt(s, 10);
|
|
256
|
-
return 0;
|
|
257
|
-
}
|
|
258
|
-
|
|
259
|
-
function normalizePacUrl(u: string): string {
|
|
260
|
-
const s = u.trim();
|
|
261
|
-
// If it lacks a scheme, assume http:// (Chrome requires a full URL)
|
|
262
|
-
return /^(https?|file):/i.test(s) ? s : `http://${s}`;
|
|
263
|
-
}
|
|
264
|
-
|
|
265
|
-
export const getProxy = (): ProxyInfo => {
|
|
266
|
-
if (os.platform() !== 'win32') return null;
|
|
267
|
-
|
|
268
|
-
const values = queryRegKey('HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Internet Settings');
|
|
269
|
-
const pacUrlRaw = (values['AutoConfigURL'] || '').trim();
|
|
270
|
-
const proxyEnableRaw = (values['ProxyEnable'] || '').trim();
|
|
271
|
-
const proxyServerRaw = (values['ProxyServer'] || '').trim();
|
|
272
|
-
|
|
273
|
-
// 1) PAC beats manual proxy if present
|
|
274
|
-
if (pacUrlRaw) {
|
|
275
|
-
return { type: 'autoConfig', url: normalizePacUrl(pacUrlRaw) };
|
|
276
|
-
}
|
|
277
|
-
|
|
278
|
-
// 2) Manual proxy only if enabled
|
|
279
|
-
const enabled = parseDwordFlag(proxyEnableRaw) === 1;
|
|
280
|
-
if (enabled && proxyServerRaw) {
|
|
281
|
-
return { type: 'manualProxy', url: proxyServerRaw };
|
|
282
|
-
}
|
|
283
|
-
|
|
284
|
-
return null;
|
|
285
|
-
};
|
|
286
|
-
|
|
287
|
-
// Usage
|
|
288
|
-
export const proxy = getProxy();
|
|
289
|
-
|
|
290
|
-
if (proxy?.type === 'autoConfig') {
|
|
291
|
-
launchOptionsArgs.push(`--proxy-pac-url=${proxy.url}`);
|
|
292
|
-
} else if (proxy?.type === 'manualProxy') {
|
|
293
|
-
launchOptionsArgs.push(`--proxy-server=${proxy.url}`);
|
|
294
|
-
}
|
|
295
|
-
|
|
296
270
|
export const impactOrder = {
|
|
297
271
|
minor: 0,
|
|
298
272
|
moderate: 1,
|
|
@@ -81,13 +81,26 @@ const startScanQuestions = [
|
|
|
81
81
|
|
|
82
82
|
// construct filename for scan results
|
|
83
83
|
const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
|
|
84
|
-
|
|
84
|
+
let domain = '';
|
|
85
|
+
try {
|
|
86
|
+
domain = new URL(url).hostname;
|
|
87
|
+
} catch (error) {
|
|
88
|
+
// If the input is a local filepath, try to resolve it
|
|
89
|
+
const finalFilePath = getFileSitemap(url);
|
|
90
|
+
if (finalFilePath) {
|
|
91
|
+
answers.isLocalFileScan = true;
|
|
92
|
+
answers.finalUrl = finalFilePath;
|
|
93
|
+
return true;
|
|
94
|
+
}
|
|
95
|
+
return 'Invalid URL';
|
|
96
|
+
}
|
|
97
|
+
|
|
85
98
|
let resultFilename: string;
|
|
86
99
|
const randomThreeDigitNumber = randomThreeDigitNumberString();
|
|
87
100
|
resultFilename = `${date}_${time}_${domain}_${randomThreeDigitNumber}`;
|
|
88
101
|
|
|
89
102
|
const statuses = constants.urlCheckStatuses;
|
|
90
|
-
const { browserToRun, clonedBrowserDataDir } = getBrowserToRun(BrowserTypes.CHROME, false
|
|
103
|
+
const { browserToRun, clonedBrowserDataDir } = getBrowserToRun(resultFilename, BrowserTypes.CHROME, false);
|
|
91
104
|
|
|
92
105
|
setHeadlessMode(browserToRun, answers.headless);
|
|
93
106
|
|
|
@@ -368,6 +368,7 @@ const crawlDomain = async ({
|
|
|
368
368
|
...launchContext.launchOptions,
|
|
369
369
|
ignoreHTTPSErrors: true,
|
|
370
370
|
...playwrightDeviceDetailsObject,
|
|
371
|
+
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
371
372
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
372
373
|
};
|
|
373
374
|
|
|
@@ -41,9 +41,6 @@ const crawlIntelligentSitemap = async (
|
|
|
41
41
|
|
|
42
42
|
function getHomeUrl(parsedUrl: string) {
|
|
43
43
|
const urlObject = new URL(parsedUrl);
|
|
44
|
-
if (urlObject.username && urlObject.password) {
|
|
45
|
-
return `${urlObject.protocol}//${urlObject.username}:${urlObject.password}@${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
|
|
46
|
-
}
|
|
47
44
|
return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
|
|
48
45
|
}
|
|
49
46
|
|
|
@@ -6,6 +6,7 @@ import constants, {
|
|
|
6
6
|
guiInfoStatusTypes,
|
|
7
7
|
basicAuthRegex,
|
|
8
8
|
UrlsCrawled,
|
|
9
|
+
STATUS_CODE_METADATA,
|
|
9
10
|
} from '../constants/constants.js';
|
|
10
11
|
import { ViewportSettingsClass } from '../combine.js';
|
|
11
12
|
import {
|
|
@@ -17,7 +18,7 @@ import {
|
|
|
17
18
|
import { runPdfScan, mapPdfScanResults, doPdfScreenshots } from './pdfScanFunc.js';
|
|
18
19
|
import { guiInfoLog } from '../logs.js';
|
|
19
20
|
import crawlSitemap from './crawlSitemap.js';
|
|
20
|
-
import { register } from '../utils.js';
|
|
21
|
+
import { getPdfStoragePath, getStoragePath, register } from '../utils.js';
|
|
21
22
|
|
|
22
23
|
export const crawlLocalFile = async ({
|
|
23
24
|
url,
|
|
@@ -59,7 +60,7 @@ export const crawlLocalFile = async ({
|
|
|
59
60
|
let dataset: any;
|
|
60
61
|
let urlsCrawled: UrlsCrawled;
|
|
61
62
|
let linksFromSitemap = [];
|
|
62
|
-
let sitemapUrl
|
|
63
|
+
let sitemapUrl: string;
|
|
63
64
|
|
|
64
65
|
// Boolean to omit axe scan for basic auth URL
|
|
65
66
|
let isBasicAuth: boolean;
|
|
@@ -76,10 +77,13 @@ export const crawlLocalFile = async ({
|
|
|
76
77
|
|
|
77
78
|
}
|
|
78
79
|
|
|
80
|
+
// Checks if its in the right file format, and change it before placing into linksFromSitemap
|
|
81
|
+
url = convertLocalFileToPath(url);
|
|
82
|
+
|
|
79
83
|
// Check if the sitemapUrl is a local file and if it exists
|
|
80
|
-
if (!
|
|
84
|
+
if (!fs.existsSync(url) && !isFilePath(url)) {
|
|
81
85
|
// Convert to an absolute path
|
|
82
|
-
let normalizedPath = path.resolve(
|
|
86
|
+
let normalizedPath = path.resolve(url);
|
|
83
87
|
|
|
84
88
|
// Normalize the path to handle different path separators
|
|
85
89
|
normalizedPath = path.normalize(normalizedPath);
|
|
@@ -90,17 +94,15 @@ export const crawlLocalFile = async ({
|
|
|
90
94
|
}
|
|
91
95
|
|
|
92
96
|
// At this point, normalizedPath is a valid and existing file path
|
|
93
|
-
|
|
97
|
+
url = normalizedPath;
|
|
94
98
|
}
|
|
95
99
|
|
|
96
|
-
// Checks if its in the right file format, and change it before placing into linksFromSitemap
|
|
97
|
-
convertLocalFileToPath(sitemapUrl);
|
|
98
|
-
|
|
99
100
|
// XML Files
|
|
100
|
-
if (!(
|
|
101
|
-
linksFromSitemap = [new Request({ url:
|
|
101
|
+
if (!(url.match(/\.xml$/i) || url.match(/\.txt$/i))) {
|
|
102
|
+
linksFromSitemap = [new Request({ url: url })];
|
|
102
103
|
// Non XML file
|
|
103
104
|
} else {
|
|
105
|
+
sitemapUrl = url;
|
|
104
106
|
// Put it to crawlSitemap function to handle xml files
|
|
105
107
|
const updatedUrlsCrawled = await crawlSitemap({
|
|
106
108
|
sitemapUrl,
|
|
@@ -127,12 +129,6 @@ export const crawlLocalFile = async ({
|
|
|
127
129
|
return urlsCrawled;
|
|
128
130
|
}
|
|
129
131
|
|
|
130
|
-
try {
|
|
131
|
-
sitemapUrl = encodeURI(sitemapUrl);
|
|
132
|
-
} catch (e) {
|
|
133
|
-
console.log(e);
|
|
134
|
-
}
|
|
135
|
-
|
|
136
132
|
const uuidToPdfMapping: Record<string, string> = {}; // key and value of string type
|
|
137
133
|
|
|
138
134
|
finalLinks = [...finalLinks, ...linksFromSitemap];
|
|
@@ -142,16 +138,10 @@ export const crawlLocalFile = async ({
|
|
|
142
138
|
});
|
|
143
139
|
|
|
144
140
|
const request = linksFromSitemap[0];
|
|
145
|
-
const pdfFileName = path.basename(request.url);
|
|
146
|
-
const trimmedUrl: string = request.url;
|
|
147
|
-
const destinationFilePath: string = `${randomToken}/${pdfFileName}`;
|
|
148
|
-
const data: Buffer = fs.readFileSync(trimmedUrl);
|
|
149
|
-
fs.writeFileSync(destinationFilePath, data);
|
|
150
|
-
uuidToPdfMapping[pdfFileName] = trimmedUrl;
|
|
151
141
|
|
|
152
142
|
let shouldAbort = false;
|
|
153
143
|
|
|
154
|
-
if (!isUrlPdf(
|
|
144
|
+
if (!isUrlPdf(url)) {
|
|
155
145
|
const effectiveUserDataDirectory = process.env.CRAWLEE_HEADLESS === '1'
|
|
156
146
|
? userDataDirectory
|
|
157
147
|
: '';
|
|
@@ -160,6 +150,7 @@ export const crawlLocalFile = async ({
|
|
|
160
150
|
headless: process.env.CRAWLEE_HEADLESS === '1',
|
|
161
151
|
...getPlaywrightLaunchOptions(browser),
|
|
162
152
|
...playwrightDeviceDetailsObject,
|
|
153
|
+
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
163
154
|
});
|
|
164
155
|
|
|
165
156
|
register(browserContext);
|
|
@@ -172,8 +163,8 @@ export const crawlLocalFile = async ({
|
|
|
172
163
|
: null;
|
|
173
164
|
|
|
174
165
|
const page = await browserContext.newPage();
|
|
175
|
-
|
|
176
|
-
await page.goto(
|
|
166
|
+
url = convertPathToLocalFile(url);
|
|
167
|
+
await page.goto(url);
|
|
177
168
|
|
|
178
169
|
if (shouldAbort) {
|
|
179
170
|
console.warn('Scan aborted due to timeout before page scan.');
|
|
@@ -184,33 +175,39 @@ export const crawlLocalFile = async ({
|
|
|
184
175
|
|
|
185
176
|
const results = await runAxeScript({ includeScreenshots, page, randomToken });
|
|
186
177
|
|
|
187
|
-
const actualUrl = page.url() || request.loadedUrl ||
|
|
178
|
+
const actualUrl = page.url() || request.loadedUrl || url;
|
|
188
179
|
|
|
189
180
|
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
190
181
|
numScanned: urlsCrawled.scanned.length,
|
|
191
|
-
urlScanned:
|
|
182
|
+
urlScanned: url,
|
|
192
183
|
});
|
|
193
184
|
|
|
194
185
|
urlsCrawled.scanned.push({
|
|
195
|
-
url:
|
|
186
|
+
url: url,
|
|
196
187
|
pageTitle: results.pageTitle,
|
|
197
188
|
actualUrl: actualUrl, // i.e. actualUrl
|
|
198
189
|
});
|
|
199
190
|
|
|
200
191
|
urlsCrawled.scannedRedirects.push({
|
|
201
|
-
fromUrl:
|
|
192
|
+
fromUrl: url,
|
|
202
193
|
toUrl: actualUrl, // i.e. actualUrl
|
|
203
194
|
});
|
|
204
195
|
|
|
205
|
-
results.url =
|
|
196
|
+
results.url = url;
|
|
206
197
|
results.actualUrl = actualUrl;
|
|
207
198
|
|
|
208
199
|
await dataset.pushData(results);
|
|
209
200
|
} else {
|
|
201
|
+
|
|
202
|
+
const pdfFileName = path.basename(url);
|
|
203
|
+
const destinationFilePath: string = path.join(getPdfStoragePath(randomToken), pdfFileName);
|
|
204
|
+
fs.copyFileSync(url, destinationFilePath);
|
|
205
|
+
uuidToPdfMapping[pdfFileName] = url;
|
|
206
|
+
|
|
210
207
|
urlsCrawled.scanned.push({
|
|
211
|
-
url:
|
|
208
|
+
url: url,
|
|
212
209
|
pageTitle: pdfFileName,
|
|
213
|
-
actualUrl:
|
|
210
|
+
actualUrl: url,
|
|
214
211
|
});
|
|
215
212
|
|
|
216
213
|
await runPdfScan(randomToken);
|
|
@@ -21,7 +21,7 @@ import {
|
|
|
21
21
|
isFilePath,
|
|
22
22
|
} from '../constants/common.js';
|
|
23
23
|
import { areLinksEqual, isWhitelistedContentType, register } from '../utils.js';
|
|
24
|
-
import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
|
|
24
|
+
import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots } from './pdfScanFunc.js';
|
|
25
25
|
import { guiInfoLog } from '../logs.js';
|
|
26
26
|
import { ViewportSettingsClass } from '../combine.js';
|
|
27
27
|
import * as path from 'path';
|
|
@@ -135,6 +135,7 @@ const crawlSitemap = async ({
|
|
|
135
135
|
...launchContext.launchOptions,
|
|
136
136
|
ignoreHTTPSErrors: true,
|
|
137
137
|
...playwrightDeviceDetailsObject,
|
|
138
|
+
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
138
139
|
};
|
|
139
140
|
|
|
140
141
|
// Optionally log for debugging
|
|
@@ -412,11 +413,11 @@ const crawlSitemap = async ({
|
|
|
412
413
|
const pdfResults = await mapPdfScanResults(randomToken, uuidToPdfMapping);
|
|
413
414
|
|
|
414
415
|
// get screenshots from pdf docs
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
416
|
+
if (includeScreenshots) {
|
|
417
|
+
await Promise.all(pdfResults.map(
|
|
418
|
+
async result => await doPdfScreenshots(randomToken, result)
|
|
419
|
+
));
|
|
420
|
+
}
|
|
420
421
|
|
|
421
422
|
// push results for each pdf document to key value store
|
|
422
423
|
await Promise.all(pdfResults.map(result => dataset.pushData(result)));
|