@govtechsg/oobee 0.10.65 → 0.10.68
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Dockerfile +1 -1
- package/README.md +20 -0
- package/package.json +4 -2
- package/scripts/install_oobee_dependencies.command +1 -1
- package/scripts/install_oobee_dependencies.ps1 +2 -2
- package/src/cli.ts +23 -78
- package/src/constants/common.ts +245 -126
- package/src/constants/constants.ts +23 -16
- package/src/constants/questions.ts +16 -33
- package/src/crawlers/commonCrawlerFunc.ts +2 -2
- package/src/crawlers/crawlDomain.ts +16 -4
- package/src/crawlers/crawlIntelligentSitemap.ts +2 -2
- package/src/crawlers/crawlLocalFile.ts +2 -1
- package/src/crawlers/crawlSitemap.ts +4 -3
- package/src/crawlers/pdfScanFunc.ts +29 -25
- package/src/index.ts +4 -5
- package/src/mergeAxeResults.ts +27 -27
- package/src/utils.ts +0 -2
package/Dockerfile
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Use Microsoft Playwright image as base image
|
|
2
2
|
# Node version is v22
|
|
3
|
-
FROM mcr.microsoft.com/playwright:v1.
|
|
3
|
+
FROM mcr.microsoft.com/playwright:v1.55.0-noble
|
|
4
4
|
|
|
5
5
|
# Installation of packages for oobee and runner (locked versions from build log)
|
|
6
6
|
RUN apt-get update && apt-get install -y \
|
package/README.md
CHANGED
|
@@ -562,6 +562,26 @@ For details on which accessibility scan results triggers a "Must Fix" / "Good to
|
|
|
562
562
|
|
|
563
563
|
Please refer to the information below to assist in debugging. Most of the errors below are caused by switching between Node.js versions.
|
|
564
564
|
|
|
565
|
+
### URL Validation Errors
|
|
566
|
+
The following URL and file validation error codes are provided to help you troubleshoot a failed scan.
|
|
567
|
+
|
|
568
|
+
| Code | Error Name | Error Message | Troubleshooting Steps |
|
|
569
|
+
|------|----------------------|-------------------------------------------------------------------------------|------------------------|
|
|
570
|
+
| 0 | success | (undefined) | No action needed. Connection successful. |
|
|
571
|
+
| 11 | invalidUrl | Invalid URL. Please check and try again. | • Ensure the URL starts with `http://` or `https://`.<br>• Check for typos in the URL. |
|
|
572
|
+
| 12 | cannotBeResolved | URL cannot be accessed. Please verify whether the website exists. | • Confirm the domain name is correct.<br>• Check DNS resolution with `ping` or `nslookup`.<br>• Ensure the site is publicly accessible (not behind VPN/firewall). |
|
|
573
|
+
| 14 | systemError | Something went wrong when verifying the URL. Please try again in a few minutes. If this issue persists, please contact the Oobee team. | • Retry after a few minutes.<br>• Check internet connection.<br>• If persistent, report as a system issue. |
|
|
574
|
+
| 15 | notASitemap | Invalid sitemap URL format. Please enter a valid sitemap URL ending with .XML e.g. https://www.example.com/sitemap.xml. | • Ensure the URL points to a valid XML sitemap.<br>• See the [sitemaps.org protocol page](https://www.sitemaps.org/protocol.html) for examples of valid sitemaps.<br>• Test the URL in a browser to confirm it returns XML. |
|
|
575
|
+
| 16 | unauthorised | Login required. Please enter your credentials and try again. | • Check if the site requires username/password.<br>• Provide credentials in Oobee if supported. |
|
|
576
|
+
| 17 | browserError | Incompatible browser. Please ensure you are using Chrome or Edge browser. | • Install the latest version of Chrome or Edge.|
|
|
577
|
+
| 18 | sslProtocolError | SSL certificate error. Please check the SSL configuration of your website and try again. | • Verify SSL certificate validity (not expired, issued by trusted CA).<br>• Check for mismatched TLS versions or cipher issues.<br>• Use an SSL checker tool (e.g., Qualys SSL Labs). |
|
|
578
|
+
| 19 | notALocalFile | Uploaded file format is incorrect. Please upload a HTML, PDF, XML or TXT file. | • Verify the file format.<br>• Ensure you are selecting `.html`, `.pdf`, `.xml`, or `.txt`. |
|
|
579
|
+
| 20 | notAPdf | URL/file format is incorrect. Please upload a PDF file. | • Ensure the file ends with `.pdf`.<br>• Open the file manually to confirm it is a valid PDF. |
|
|
580
|
+
| 21 | notASupportedDocument| Uploaded file format is incorrect. Please upload a HTML, PDF, XML or TXT file. | • Confirm file format.<br>• Convert to a supported type if necessary. |
|
|
581
|
+
| 22 | connectionRefused | Connection refused. Please try again in a few minutes. If this issue persists, please contact the Oobee team. | • Check if the server is running.<br>• Verify firewall settings.<br>• Retry after a short interval. |
|
|
582
|
+
| 23 | timedOut | Request timed out. Please try again in a few minutes. If this issue persists, please contact the Oobee team. | • Check your internet speed and stability.<br>• Retry when the server load is lower. |
|
|
583
|
+
|
|
584
|
+
|
|
565
585
|
### Incompatible Node.js versions
|
|
566
586
|
|
|
567
587
|
**Issue**: When your Node.js version is incompatible, you may face the following syntax error.
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@govtechsg/oobee",
|
|
3
3
|
"main": "dist/npmIndex.js",
|
|
4
|
-
"version": "0.10.
|
|
4
|
+
"version": "0.10.68",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"author": "Government Technology Agency <info@tech.gov.sg>",
|
|
7
7
|
"dependencies": {
|
|
@@ -22,10 +22,11 @@
|
|
|
22
22
|
"jsdom": "^21.1.2",
|
|
23
23
|
"jszip": "^3.10.1",
|
|
24
24
|
"lodash": "^4.17.21",
|
|
25
|
+
"mime": "^4.0.7",
|
|
25
26
|
"mime-types": "^2.1.35",
|
|
26
27
|
"minimatch": "^9.0.3",
|
|
27
28
|
"pdfjs-dist": "github:veraPDF/pdfjs-dist#v4.4.168-taggedPdf-0.1.20",
|
|
28
|
-
"playwright": "1.
|
|
29
|
+
"playwright": "^1.55.0",
|
|
29
30
|
"prettier": "^3.1.0",
|
|
30
31
|
"print-message": "^3.0.1",
|
|
31
32
|
"safe-regex": "^2.1.1",
|
|
@@ -48,6 +49,7 @@
|
|
|
48
49
|
"@types/fs-extra": "^11.0.4",
|
|
49
50
|
"@types/inquirer": "^9.0.7",
|
|
50
51
|
"@types/lodash": "^4.17.7",
|
|
52
|
+
"@types/mime": "^3.0.4",
|
|
51
53
|
"@types/mime-types": "^2.1.4",
|
|
52
54
|
"@types/safe-regex": "^1.1.6",
|
|
53
55
|
"@types/validator": "^13.11.10",
|
|
@@ -9,11 +9,11 @@ $ErrorActionPreference = 'Stop'
|
|
|
9
9
|
# Install NodeJS binaries
|
|
10
10
|
if (-Not (Test-Path nodejs-win\node.exe)) {
|
|
11
11
|
Write-Output "Downloading Node"
|
|
12
|
-
Invoke-WebRequest -o ./nodejs-win.zip "https://nodejs.org/dist/v22.
|
|
12
|
+
Invoke-WebRequest -o ./nodejs-win.zip "https://nodejs.org/dist/v22.19.0/node-v22.19.0-win-x64.zip"
|
|
13
13
|
|
|
14
14
|
Write-Output "Unzip Node"
|
|
15
15
|
Expand-Archive .\nodejs-win.zip -DestinationPath .
|
|
16
|
-
Rename-Item node-v22.
|
|
16
|
+
Rename-Item node-v22.19.0-win-x64 -NewName nodejs-win
|
|
17
17
|
Remove-Item -Force .\nodejs-win.zip
|
|
18
18
|
}
|
|
19
19
|
|
package/src/cli.ts
CHANGED
|
@@ -211,18 +211,19 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`,
|
|
|
211
211
|
.parse() as unknown as Answers;
|
|
212
212
|
|
|
213
213
|
const scanInit = async (argvs: Answers): Promise<string> => {
|
|
214
|
-
let isCustomFlow = false;
|
|
215
|
-
if (argvs.scanner === ScannerTypes.CUSTOM) {
|
|
216
|
-
isCustomFlow = true;
|
|
217
|
-
}
|
|
218
|
-
|
|
219
214
|
const updatedArgvs = { ...argvs };
|
|
220
215
|
|
|
221
216
|
// Cannot use data.browser and data.isHeadless as the connectivity check comes first before prepareData
|
|
222
217
|
setHeadlessMode(updatedArgvs.browserToRun, updatedArgvs.headless);
|
|
223
218
|
const statuses = constants.urlCheckStatuses;
|
|
224
219
|
|
|
225
|
-
|
|
220
|
+
let data;
|
|
221
|
+
try {
|
|
222
|
+
data = await prepareData(updatedArgvs);
|
|
223
|
+
} catch (e) {
|
|
224
|
+
consoleLogger.error(`Error preparing data: ${e.message}\n${e.stack}`);
|
|
225
|
+
cleanUpAndExit(1);
|
|
226
|
+
}
|
|
226
227
|
|
|
227
228
|
// Executes cleanUp script if error encountered
|
|
228
229
|
listenForCleanUp(data.randomToken);
|
|
@@ -233,83 +234,27 @@ const scanInit = async (argvs: Answers): Promise<string> => {
|
|
|
233
234
|
data.browser,
|
|
234
235
|
data.userDataDirectory,
|
|
235
236
|
data.playwrightDeviceDetailsObject,
|
|
236
|
-
data.extraHTTPHeaders
|
|
237
|
+
data.extraHTTPHeaders,
|
|
238
|
+
data.fileTypes
|
|
237
239
|
);
|
|
238
240
|
|
|
239
241
|
if (res.httpStatus) consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
|
|
240
242
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
}
|
|
248
|
-
|
|
249
|
-
break;
|
|
250
|
-
}
|
|
251
|
-
case statuses.unauthorised.code: {
|
|
252
|
-
printMessage([statuses.unauthorised.message], messageOptions);
|
|
253
|
-
consoleLogger.info(statuses.unauthorised.message);
|
|
254
|
-
cleanUpAndExit(res.status);
|
|
255
|
-
}
|
|
256
|
-
case statuses.cannotBeResolved.code: {
|
|
257
|
-
printMessage([statuses.cannotBeResolved.message], messageOptions);
|
|
258
|
-
consoleLogger.info(statuses.cannotBeResolved.message);
|
|
259
|
-
cleanUpAndExit(res.status);
|
|
260
|
-
}
|
|
261
|
-
case statuses.systemError.code: {
|
|
262
|
-
printMessage([statuses.systemError.message], messageOptions);
|
|
263
|
-
consoleLogger.info(statuses.systemError.message);
|
|
264
|
-
cleanUpAndExit(res.status);
|
|
265
|
-
}
|
|
266
|
-
case statuses.invalidUrl.code: {
|
|
267
|
-
if (
|
|
268
|
-
updatedArgvs.scanner !== ScannerTypes.SITEMAP &&
|
|
269
|
-
updatedArgvs.scanner !== ScannerTypes.LOCALFILE
|
|
270
|
-
) {
|
|
271
|
-
printMessage([statuses.invalidUrl.message], messageOptions);
|
|
272
|
-
consoleLogger.info(statuses.invalidUrl.message);
|
|
273
|
-
cleanUpAndExit(res.status);
|
|
274
|
-
}
|
|
275
|
-
|
|
276
|
-
const finalFilePath = getFileSitemap(updatedArgvs.url);
|
|
277
|
-
if (finalFilePath) {
|
|
278
|
-
data.isLocalFileScan = true;
|
|
279
|
-
data.url = finalFilePath;
|
|
280
|
-
|
|
281
|
-
if (process.env.OOBEE_VALIDATE_URL) {
|
|
282
|
-
console.log('Url is valid');
|
|
283
|
-
cleanUpAndExit(0);
|
|
284
|
-
}
|
|
285
|
-
} else if (updatedArgvs.scanner === ScannerTypes.LOCALFILE) {
|
|
286
|
-
printMessage([statuses.notALocalFile.message], messageOptions);
|
|
287
|
-
consoleLogger.info(statuses.notALocalFile.message);
|
|
288
|
-
cleanUpAndExit(statuses.notALocalFile.code);
|
|
289
|
-
} else if (updatedArgvs.scanner !== ScannerTypes.SITEMAP) {
|
|
290
|
-
printMessage([statuses.notASitemap.message], messageOptions);
|
|
291
|
-
consoleLogger.info(statuses.notASitemap.message);
|
|
292
|
-
cleanUpAndExit(statuses.notASitemap.code);
|
|
293
|
-
}
|
|
294
|
-
break;
|
|
295
|
-
}
|
|
296
|
-
case statuses.notASitemap.code: {
|
|
297
|
-
printMessage([statuses.notASitemap.message], messageOptions);
|
|
298
|
-
consoleLogger.info(statuses.notASitemap.message);
|
|
299
|
-
cleanUpAndExit(res.status);
|
|
300
|
-
}
|
|
301
|
-
case statuses.notALocalFile.code: {
|
|
302
|
-
printMessage([statuses.notALocalFile.message], messageOptions);
|
|
303
|
-
consoleLogger.info(statuses.notALocalFile.message);
|
|
304
|
-
cleanUpAndExit(res.status);
|
|
305
|
-
}
|
|
306
|
-
case statuses.browserError.code: {
|
|
307
|
-
printMessage([statuses.browserError.message], messageOptions);
|
|
308
|
-
consoleLogger.info(statuses.browserError.message);
|
|
309
|
-
cleanUpAndExit(res.status);
|
|
243
|
+
if (res.status === statuses.success.code) {
|
|
244
|
+
data.url = res.url;
|
|
245
|
+
if (process.env.OOBEE_VALIDATE_URL) {
|
|
246
|
+
consoleLogger.info('Url is valid');
|
|
247
|
+
cleanUpAndExit(0, data.randomToken);
|
|
248
|
+
return;
|
|
310
249
|
}
|
|
311
|
-
|
|
312
|
-
|
|
250
|
+
// fall through (continue normal flow after success)
|
|
251
|
+
} else {
|
|
252
|
+
const match = Object.values(statuses).find((s: any) => s.code === res.status);
|
|
253
|
+
const msg = match && 'message' in match ? match.message : 'Unknown error';
|
|
254
|
+
printMessage([msg], messageOptions);
|
|
255
|
+
consoleLogger.info(msg);
|
|
256
|
+
cleanUpAndExit(res.status);
|
|
257
|
+
return;
|
|
313
258
|
}
|
|
314
259
|
|
|
315
260
|
if (process.env.OOBEE_VERBOSE) {
|
package/src/constants/common.ts
CHANGED
|
@@ -14,6 +14,7 @@ import url, { fileURLToPath, pathToFileURL } from 'url';
|
|
|
14
14
|
import safe from 'safe-regex';
|
|
15
15
|
import * as https from 'https';
|
|
16
16
|
import os from 'os';
|
|
17
|
+
import mime from 'mime';
|
|
17
18
|
import { minimatch } from 'minimatch';
|
|
18
19
|
import { globSync, GlobOptionsWithFileTypesFalse } from 'glob';
|
|
19
20
|
import { LaunchOptions, Locator, Page, devices, webkit } from 'playwright';
|
|
@@ -27,6 +28,8 @@ import constants, {
|
|
|
27
28
|
// Legacy code end - Google Sheets submission
|
|
28
29
|
ScannerTypes,
|
|
29
30
|
BrowserTypes,
|
|
31
|
+
FileTypes,
|
|
32
|
+
getEnumKey,
|
|
30
33
|
} from './constants.js';
|
|
31
34
|
import { consoleLogger } from '../logs.js';
|
|
32
35
|
import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
|
|
@@ -172,9 +175,14 @@ export const messageOptions = {
|
|
|
172
175
|
};
|
|
173
176
|
|
|
174
177
|
const urlOptions = {
|
|
175
|
-
|
|
178
|
+
// http and https for normal scans, file for local file scan
|
|
179
|
+
protocols: ['http', 'https', 'file'],
|
|
176
180
|
require_protocol: true,
|
|
177
181
|
require_tld: false,
|
|
182
|
+
require_host: false,
|
|
183
|
+
// being explicit; fragments/queries are fine for local files
|
|
184
|
+
allow_fragments: true,
|
|
185
|
+
allow_query_components: true,
|
|
178
186
|
};
|
|
179
187
|
|
|
180
188
|
const queryCheck = (s: string) => document.createDocumentFragment().querySelector(s);
|
|
@@ -187,8 +195,9 @@ export const isSelectorValid = (selector: string): boolean => {
|
|
|
187
195
|
return true;
|
|
188
196
|
};
|
|
189
197
|
|
|
190
|
-
//
|
|
191
|
-
|
|
198
|
+
// Don't sanitise for now as we have changed the logic for URL validation / local file scan
|
|
199
|
+
// Only use this when we find characters to validate against
|
|
200
|
+
const blackListCharacters = '';
|
|
192
201
|
|
|
193
202
|
export const validateXML = (content: string): { isValid: boolean; parsedContent: string } => {
|
|
194
203
|
let isValid: boolean;
|
|
@@ -271,12 +280,25 @@ export const isInputValid = (inputString: string): boolean => {
|
|
|
271
280
|
export const sanitizeUrlInput = (url: string): { isValid: boolean; url: string } => {
|
|
272
281
|
// Sanitize that there is no blacklist characters
|
|
273
282
|
const sanitizeUrl = validator.blacklist(url, blackListCharacters);
|
|
274
|
-
if (validator.isURL(sanitizeUrl, urlOptions)) {
|
|
283
|
+
if (url.toLowerCase().startsWith('file://') || validator.isURL(sanitizeUrl, urlOptions)) {
|
|
275
284
|
return { isValid: true, url: sanitizeUrl };
|
|
276
285
|
}
|
|
277
286
|
return { isValid: false, url: sanitizeUrl };
|
|
278
287
|
};
|
|
279
288
|
|
|
289
|
+
const isAllowedContentType = (ct: string): boolean => {
|
|
290
|
+
const c = (ct || '').toLowerCase();
|
|
291
|
+
return (
|
|
292
|
+
c.startsWith('text/html') || // html
|
|
293
|
+
c.startsWith('application/xhtml+xml') || // xhtml
|
|
294
|
+
c.startsWith('text/plain') || // txt
|
|
295
|
+
c.startsWith('application/xml') || // xml
|
|
296
|
+
c.startsWith('text/xml') || // xml (alt)
|
|
297
|
+
c.startsWith('application/pdf') // pdf
|
|
298
|
+
);
|
|
299
|
+
};
|
|
300
|
+
|
|
301
|
+
|
|
280
302
|
const checkUrlConnectivityWithBrowser = async (
|
|
281
303
|
url: string,
|
|
282
304
|
browserToRun: string,
|
|
@@ -292,6 +314,44 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
292
314
|
return res;
|
|
293
315
|
}
|
|
294
316
|
|
|
317
|
+
// STEP 1: For local file scans
|
|
318
|
+
let contentType = '';
|
|
319
|
+
|
|
320
|
+
const protocol = new URL(url).protocol;
|
|
321
|
+
|
|
322
|
+
if (protocol !== 'http:' && protocol !== 'https:') {
|
|
323
|
+
try {
|
|
324
|
+
const filePath = fileURLToPath(url);
|
|
325
|
+
const stat = fs.statSync(filePath);
|
|
326
|
+
|
|
327
|
+
if (!stat.isFile()) {
|
|
328
|
+
res.status = constants.urlCheckStatuses.notALocalFile.code;
|
|
329
|
+
return res;
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
const statusCode = 200;
|
|
333
|
+
contentType = mime.getType(filePath) || 'application/octet-stream';
|
|
334
|
+
|
|
335
|
+
if (!isAllowedContentType(contentType)) {
|
|
336
|
+
res.status = constants.urlCheckStatuses.notASupportedDocument.code;
|
|
337
|
+
return res;
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
// Short-circuit for pdfs
|
|
341
|
+
if (contentType.includes('pdf')) {
|
|
342
|
+
res.status = constants.urlCheckStatuses.success.code;
|
|
343
|
+
res.httpStatus = statusCode;
|
|
344
|
+
res.url = url;
|
|
345
|
+
res.content = '%PDF-'; // Avoid putting the binary in memory
|
|
346
|
+
return res;
|
|
347
|
+
}
|
|
348
|
+
} catch (e) {
|
|
349
|
+
consoleLogger.info(`Local file check failed: ${e.message}`);
|
|
350
|
+
res.status = constants.urlCheckStatuses.systemError.code;
|
|
351
|
+
return res;
|
|
352
|
+
}
|
|
353
|
+
}
|
|
354
|
+
|
|
295
355
|
// Ensure Accept header for non-html content fallback
|
|
296
356
|
extraHTTPHeaders['Accept'] ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
|
|
297
357
|
|
|
@@ -302,6 +362,7 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
302
362
|
browserContext = await constants.launcher.launchPersistentContext(clonedDataDir, {
|
|
303
363
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
304
364
|
ignoreHTTPSErrors: true,
|
|
365
|
+
headless: true,
|
|
305
366
|
...getPlaywrightLaunchOptions(browserToRun),
|
|
306
367
|
...playwrightDeviceDetailsObject,
|
|
307
368
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
@@ -317,77 +378,94 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
317
378
|
try {
|
|
318
379
|
const page = await browserContext.newPage();
|
|
319
380
|
|
|
320
|
-
//
|
|
321
|
-
let statusCode = 0;
|
|
322
|
-
let contentType = '';
|
|
323
|
-
let disposition = '';
|
|
324
|
-
|
|
381
|
+
// Block native Chrome download UI
|
|
325
382
|
try {
|
|
326
|
-
const
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
});
|
|
330
|
-
|
|
331
|
-
statusCode = headResp.status();
|
|
332
|
-
contentType = headResp.headers()['content-type'] || '';
|
|
333
|
-
disposition = headResp.headers()['content-disposition'] || '';
|
|
383
|
+
const cdp = await browserContext.newCDPSession(page as any);
|
|
384
|
+
await cdp.send('Page.setDownloadBehavior', { behavior: 'deny' });
|
|
385
|
+
} catch (e) {
|
|
386
|
+
consoleLogger.info(`Unable to set download deny: ${(e as Error).message}`);
|
|
387
|
+
}
|
|
334
388
|
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
389
|
+
// STEP 2: Navigate (follows server-side redirects)
|
|
390
|
+
page.once('download', () => {
|
|
391
|
+
res.status = constants.urlCheckStatuses.notASupportedDocument.code;
|
|
392
|
+
return res;
|
|
393
|
+
});
|
|
394
|
+
|
|
395
|
+
const response = await page.goto(url, {
|
|
396
|
+
timeout: 15000,
|
|
397
|
+
waitUntil: 'domcontentloaded', // enough to get status + allow potential client redirects to kick in
|
|
398
|
+
});
|
|
344
399
|
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
400
|
+
// Give client-side redirects (meta refresh / JS location.*) a moment
|
|
401
|
+
try {
|
|
402
|
+
await page.waitForLoadState('networkidle', { timeout: 8000 });
|
|
403
|
+
} catch {
|
|
404
|
+
consoleLogger.info('networkidle not reached; proceeding with verification GET');
|
|
405
|
+
}
|
|
348
406
|
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
407
|
+
// STEP 3: Verify final URL with a GET (follows redirects)
|
|
408
|
+
const finalUrl = page.url();
|
|
409
|
+
let verifyResp = response;
|
|
410
|
+
try {
|
|
411
|
+
verifyResp = await page.request.fetch(finalUrl, {
|
|
412
|
+
method: 'GET',
|
|
413
|
+
headers: extraHTTPHeaders,
|
|
414
|
+
});
|
|
352
415
|
} catch (e) {
|
|
353
|
-
consoleLogger.info(`
|
|
354
|
-
res.status = constants.urlCheckStatuses.systemError.code;
|
|
355
|
-
await browserContext.close();
|
|
356
|
-
return res;
|
|
416
|
+
consoleLogger.info(`Verification GET failed, falling back to navigation response: ${e.message}`);
|
|
357
417
|
}
|
|
358
418
|
|
|
359
|
-
//
|
|
360
|
-
const
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
});
|
|
419
|
+
// Prefer verification GET; fall back to nav response
|
|
420
|
+
const finalStatus = verifyResp?.status?.() ?? response?.status?.() ?? 0;
|
|
421
|
+
const headers = (verifyResp?.headers?.() ?? response?.headers?.()) || {};
|
|
422
|
+
contentType = headers['content-type'] || '';
|
|
364
423
|
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
424
|
+
if (!isAllowedContentType(contentType)) {
|
|
425
|
+
res.status = constants.urlCheckStatuses.notASupportedDocument.code;
|
|
426
|
+
return res;
|
|
427
|
+
}
|
|
369
428
|
|
|
370
429
|
res.httpStatus = finalStatus;
|
|
371
|
-
res.url =
|
|
430
|
+
res.url = finalUrl;
|
|
372
431
|
|
|
373
|
-
|
|
432
|
+
if (finalStatus === 401) {
|
|
433
|
+
res.status = constants.urlCheckStatuses.unauthorised.code;
|
|
434
|
+
} else if (finalStatus >= 200 && finalStatus < 400) {
|
|
435
|
+
res.status = constants.urlCheckStatuses.success.code;
|
|
436
|
+
} else if (finalStatus === 405 || finalStatus === 501) {
|
|
437
|
+
// Some origins 405/501 but the browser-rendered page is still reachable after client redirects.
|
|
438
|
+
// As a last resort, consider DOM presence as success if we actually have a document.
|
|
439
|
+
const hasDOM = await page.evaluate(() => !!document && !!document.documentElement);
|
|
440
|
+
res.status = hasDOM ? constants.urlCheckStatuses.success.code : constants.urlCheckStatuses.systemError.code;
|
|
441
|
+
} else {
|
|
442
|
+
res.status = constants.urlCheckStatuses.systemError.code;
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
// Content handling
|
|
374
446
|
if (contentType.includes('pdf') || contentType.includes('octet-stream')) {
|
|
375
|
-
res.content = ''; //
|
|
447
|
+
res.content = '%PDF-'; // avoid binary in memory / download
|
|
376
448
|
} else {
|
|
377
449
|
try {
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
}
|
|
382
|
-
|
|
450
|
+
// Try to get a stable DOM; don't fail the check if it times out
|
|
451
|
+
await page.waitForLoadState('domcontentloaded', { timeout: 5000 });
|
|
452
|
+
} catch {}
|
|
383
453
|
res.content = await page.content();
|
|
384
454
|
}
|
|
385
455
|
|
|
386
456
|
} catch (error) {
|
|
387
457
|
if (error.message.includes('net::ERR_INVALID_AUTH_CREDENTIALS')) {
|
|
388
458
|
res.status = constants.urlCheckStatuses.unauthorised.code;
|
|
459
|
+
} else if (error.message.includes('net::ERR_NAME_NOT_RESOLVED')) {
|
|
460
|
+
res.status = constants.urlCheckStatuses.cannotBeResolved.code;
|
|
461
|
+
} else if (error.message.includes('net::ERR_CONNECTION_REFUSED')) {
|
|
462
|
+
res.status = constants.urlCheckStatuses.connectionRefused.code;
|
|
463
|
+
} else if (error.message.includes('net::ERR_TIMED_OUT')) {
|
|
464
|
+
res.status = constants.urlCheckStatuses.timedOut.code;
|
|
465
|
+
} else if (error.message.includes('net::ERR_SSL_PROTOCOL_ERROR')) {
|
|
466
|
+
res.status = constants.urlCheckStatuses.sslProtocolError.code;
|
|
389
467
|
} else {
|
|
390
|
-
|
|
468
|
+
consoleLogger.error(error);
|
|
391
469
|
res.status = constants.urlCheckStatuses.systemError.code;
|
|
392
470
|
}
|
|
393
471
|
} finally {
|
|
@@ -397,6 +475,16 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
397
475
|
return res;
|
|
398
476
|
};
|
|
399
477
|
|
|
478
|
+
export const isPdfContent = (content: Buffer | string): boolean => {
|
|
479
|
+
let header: string;
|
|
480
|
+
if (Buffer.isBuffer(content)) {
|
|
481
|
+
header = content.toString('utf8', 0, 5);
|
|
482
|
+
} else {
|
|
483
|
+
header = content.substring(0, 5);
|
|
484
|
+
}
|
|
485
|
+
return header === '%PDF-';
|
|
486
|
+
};
|
|
487
|
+
|
|
400
488
|
export const isSitemapContent = (content: string) => {
|
|
401
489
|
const { isValid } = validateXML(content);
|
|
402
490
|
if (isValid) {
|
|
@@ -426,27 +514,43 @@ export const checkUrl = async (
|
|
|
426
514
|
clonedDataDir: string,
|
|
427
515
|
playwrightDeviceDetailsObject: DeviceDescriptor,
|
|
428
516
|
extraHTTPHeaders: Record<string, string>,
|
|
517
|
+
fileTypes: FileTypes
|
|
429
518
|
) => {
|
|
519
|
+
|
|
430
520
|
const res = await checkUrlConnectivityWithBrowser(
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
521
|
+
url,
|
|
522
|
+
browser,
|
|
523
|
+
clonedDataDir,
|
|
524
|
+
playwrightDeviceDetailsObject,
|
|
525
|
+
extraHTTPHeaders,
|
|
436
526
|
);
|
|
437
527
|
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
const
|
|
528
|
+
// If the connectivity check succeeded (meaning no error status code was set earlier)
|
|
529
|
+
if (res.status === constants.urlCheckStatuses.success.code) {
|
|
530
|
+
|
|
531
|
+
// Check if document is pdf type
|
|
532
|
+
const isPdf = isPdfContent(res.content);
|
|
533
|
+
|
|
534
|
+
// Check if only HTML document is allowed to be scanned
|
|
535
|
+
if (fileTypes === FileTypes.HtmlOnly && isPdf) {
|
|
536
|
+
res.status = constants.urlCheckStatuses.notASupportedDocument.code;
|
|
537
|
+
|
|
538
|
+
// Check if only PDF document is allowed to be scanned
|
|
539
|
+
} else if (fileTypes === FileTypes.PdfOnly && !isPdf) {
|
|
540
|
+
res.status = constants.urlCheckStatuses.notAPdf.code;
|
|
443
541
|
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
542
|
+
// Check if sitemap is expected
|
|
543
|
+
} else if (scanner === ScannerTypes.SITEMAP) {
|
|
544
|
+
const isSitemap = isSitemapContent(res.content);
|
|
545
|
+
|
|
546
|
+
if (!isSitemap) {
|
|
547
|
+
res.status = constants.urlCheckStatuses.notASitemap.code;
|
|
548
|
+
}
|
|
448
549
|
}
|
|
550
|
+
|
|
551
|
+
// else proceed as normal
|
|
449
552
|
}
|
|
553
|
+
|
|
450
554
|
return res;
|
|
451
555
|
};
|
|
452
556
|
|
|
@@ -486,7 +590,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
|
486
590
|
viewportWidth,
|
|
487
591
|
maxpages,
|
|
488
592
|
strategy,
|
|
489
|
-
isLocalFileScan =
|
|
593
|
+
isLocalFileScan = argv.scanner === ScannerTypes.LOCALFILE,
|
|
490
594
|
browserToRun,
|
|
491
595
|
nameEmail,
|
|
492
596
|
customFlowLabel,
|
|
@@ -511,30 +615,34 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
|
511
615
|
let username = '';
|
|
512
616
|
let password = '';
|
|
513
617
|
|
|
618
|
+
// If a file path is provided
|
|
514
619
|
if (isFilePath(url)) {
|
|
515
|
-
|
|
516
|
-
|
|
620
|
+
// Set it as a local file scan if not already so
|
|
621
|
+
isLocalFileScan = true;
|
|
517
622
|
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
? url
|
|
521
|
-
: (() => {
|
|
522
|
-
const temp = new URL(url);
|
|
523
|
-
username = temp.username;
|
|
524
|
-
password = temp.password;
|
|
623
|
+
// Convert to absolute path
|
|
624
|
+
url = path.resolve(url);
|
|
525
625
|
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
626
|
+
// Convert to file:// URL
|
|
627
|
+
url = convertPathToLocalFile(url);
|
|
628
|
+
} else {
|
|
629
|
+
// Check URL for basic auth embedded and move it to extraHTTPHeaders
|
|
630
|
+
const temp = new URL(url);
|
|
631
|
+
username = temp.username;
|
|
632
|
+
password = temp.password;
|
|
529
633
|
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
634
|
+
if (username !== '' || password !== '') {
|
|
635
|
+
extraHTTPHeaders['Authorization'] = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
|
|
636
|
+
}
|
|
637
|
+
|
|
638
|
+
temp.username = '';
|
|
639
|
+
temp.password = '';
|
|
640
|
+
url = temp.toString();
|
|
641
|
+
}
|
|
534
642
|
|
|
535
643
|
// construct filename for scan results
|
|
536
644
|
const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
|
|
537
|
-
const domain =
|
|
645
|
+
const domain = isLocalFileScan ? path.basename(url) : new URL(url).hostname;
|
|
538
646
|
|
|
539
647
|
const sanitisedLabel = customFlowLabel ? `_${customFlowLabel.replaceAll(' ', '_')}` : '';
|
|
540
648
|
let resultFilename: string;
|
|
@@ -586,7 +694,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
|
586
694
|
customFlowLabel,
|
|
587
695
|
specifiedMaxConcurrency,
|
|
588
696
|
randomToken: resultFilename,
|
|
589
|
-
fileTypes,
|
|
697
|
+
fileTypes: FileTypes[getEnumKey(FileTypes, fileTypes) as keyof typeof FileTypes],
|
|
590
698
|
blacklistedPatternsFilename,
|
|
591
699
|
includeScreenshots: !(additional === 'none'),
|
|
592
700
|
metadata,
|
|
@@ -1335,26 +1443,28 @@ export const cloneChromeProfiles = (randomToken: string): string => {
|
|
|
1335
1443
|
destDir = path.join(baseDir, `oobee-${randomToken}`);
|
|
1336
1444
|
|
|
1337
1445
|
if (fs.existsSync(destDir)) {
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
fs.
|
|
1343
|
-
|
|
1446
|
+
// Don't delete since it will be handled at the end of the scan
|
|
1447
|
+
// deleteClonedChromeProfiles(randomToken);
|
|
1448
|
+
// Assume it cloned and don't re-clone
|
|
1449
|
+
} else {
|
|
1450
|
+
if (!fs.existsSync(destDir)) {
|
|
1451
|
+
fs.mkdirSync(destDir, { recursive: true });
|
|
1452
|
+
}
|
|
1344
1453
|
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1354
|
-
|
|
1454
|
+
const baseOptions = {
|
|
1455
|
+
cwd: baseDir,
|
|
1456
|
+
recursive: true,
|
|
1457
|
+
absolute: true,
|
|
1458
|
+
nodir: true,
|
|
1459
|
+
};
|
|
1460
|
+
const cloneLocalStateFileSuccess = cloneLocalStateFile(baseOptions, destDir);
|
|
1461
|
+
if (cloneChromeProfileCookieFiles(baseOptions, destDir) && cloneLocalStateFileSuccess) {
|
|
1462
|
+
return destDir;
|
|
1463
|
+
}
|
|
1355
1464
|
|
|
1356
|
-
|
|
1465
|
+
consoleLogger.error('Failed to clone Chrome profiles. You may be logged out of your accounts.');
|
|
1357
1466
|
|
|
1467
|
+
}
|
|
1358
1468
|
// For future reference, return a null instead to halt the scan
|
|
1359
1469
|
return destDir;
|
|
1360
1470
|
};
|
|
@@ -1371,10 +1481,11 @@ export const cloneChromiumProfiles = (randomToken: string): string => {
|
|
|
1371
1481
|
destDir = path.join(baseDir, `oobee-${randomToken}`);
|
|
1372
1482
|
|
|
1373
1483
|
if (fs.existsSync(destDir)) {
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
|
|
1484
|
+
|
|
1485
|
+
// Don't delete since it will be handled at the end of the scan
|
|
1486
|
+
// deleteClonedChromiumProfiles(randomToken);
|
|
1487
|
+
// Assume it cloned and don't re-clone
|
|
1488
|
+
} else {
|
|
1378
1489
|
fs.mkdirSync(destDir, { recursive: true });
|
|
1379
1490
|
}
|
|
1380
1491
|
|
|
@@ -1401,26 +1512,31 @@ export const cloneEdgeProfiles = (randomToken: string): string => {
|
|
|
1401
1512
|
destDir = path.join(baseDir, `oobee-${randomToken}`);
|
|
1402
1513
|
|
|
1403
1514
|
if (fs.existsSync(destDir)) {
|
|
1404
|
-
deleteClonedEdgeProfiles(randomToken);
|
|
1405
|
-
}
|
|
1406
1515
|
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1516
|
+
// Don't delete since it will be handled at the end of the scan
|
|
1517
|
+
// deleteClonedEdgeProfiles(randomToken);
|
|
1518
|
+
// Assume it cloned and don't re-clone
|
|
1410
1519
|
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
nodir: true,
|
|
1416
|
-
};
|
|
1520
|
+
} else {
|
|
1521
|
+
if (!fs.existsSync(destDir)) {
|
|
1522
|
+
fs.mkdirSync(destDir, { recursive: true });
|
|
1523
|
+
}
|
|
1417
1524
|
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1525
|
+
const baseOptions = {
|
|
1526
|
+
cwd: baseDir,
|
|
1527
|
+
recursive: true,
|
|
1528
|
+
absolute: true,
|
|
1529
|
+
nodir: true,
|
|
1530
|
+
};
|
|
1531
|
+
|
|
1532
|
+
const cloneLocalStateFileSuccess = cloneLocalStateFile(baseOptions, destDir);
|
|
1533
|
+
if (cloneEdgeProfileCookieFiles(baseOptions, destDir) && cloneLocalStateFileSuccess) {
|
|
1534
|
+
return destDir;
|
|
1535
|
+
}
|
|
1536
|
+
|
|
1537
|
+
consoleLogger.error('Failed to clone Edge profiles. You may be logged out of your accounts.');
|
|
1422
1538
|
|
|
1423
|
-
|
|
1539
|
+
}
|
|
1424
1540
|
|
|
1425
1541
|
// For future reference, return a null instead to halt the scan
|
|
1426
1542
|
return destDir;
|
|
@@ -1863,10 +1979,13 @@ export const isFilePath = (url: string): boolean => {
|
|
|
1863
1979
|
const driveLetterPattern = /^[A-Z]:/i;
|
|
1864
1980
|
const backslashPattern = /\\/;
|
|
1865
1981
|
return (
|
|
1866
|
-
url.startsWith('file://') ||
|
|
1867
1982
|
url.startsWith('/') ||
|
|
1868
1983
|
driveLetterPattern.test(url) ||
|
|
1869
|
-
backslashPattern.test(url)
|
|
1984
|
+
backslashPattern.test(url) ||
|
|
1985
|
+
url.startsWith('./') ||
|
|
1986
|
+
url.startsWith('../') ||
|
|
1987
|
+
url.startsWith('.\\') ||
|
|
1988
|
+
url.startsWith('..\\')
|
|
1870
1989
|
);
|
|
1871
1990
|
};
|
|
1872
1991
|
|
|
@@ -252,6 +252,16 @@ export enum ScannerTypes {
|
|
|
252
252
|
}
|
|
253
253
|
/* eslint-enable no-unused-vars */
|
|
254
254
|
|
|
255
|
+
/**
 * Selects which kinds of documents a scan should process.
 * The crawlers test membership against these values to decide whether
 * HTML pages are scanned (All, HtmlOnly) and whether PDFs are scanned
 * (All, PdfOnly).
 */
export enum FileTypes {
|
|
256
|
+
All = 'all',
|
|
257
|
+
PdfOnly = 'pdf-only',
|
|
258
|
+
HtmlOnly = 'html-only',
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
/**
 * Looks up the key of an enum-like object by its string value.
 *
 * @param enumObj - Object mapping string keys to string values (e.g. a string enum).
 * @param value - The value to search for; compared with strict equality.
 * @returns The first key whose value equals `value`, or `undefined` if none matches.
 */
export function getEnumKey<E extends Record<string, string>>(enumObj: E, value: string): keyof E | undefined {
|
|
262
|
+
return (Object.keys(enumObj) as Array<keyof E>).find(k => enumObj[k] === value);
|
|
263
|
+
}
|
|
264
|
+
|
|
255
265
|
export const guiInfoStatusTypes = {
|
|
256
266
|
SCANNED: 'scanned',
|
|
257
267
|
SKIPPED: 'skipped',
|
|
@@ -379,31 +389,28 @@ const wcagLinks = {
|
|
|
379
389
|
|
|
380
390
|
const urlCheckStatuses = {
|
|
381
391
|
success: { code: 0 },
|
|
382
|
-
invalidUrl: { code: 11, message: 'Invalid URL
|
|
383
|
-
cannotBeResolved: {
|
|
384
|
-
code: 12,
|
|
385
|
-
message:
|
|
386
|
-
'Provided URL cannot be accessed. Please verify your internet connectivity and the correctness of the domain.',
|
|
387
|
-
},
|
|
392
|
+
invalidUrl: { code: 11, message: 'Invalid URL. Please check and try again.' },
|
|
393
|
+
cannotBeResolved: { code: 12, message: 'URL cannot be accessed. Please verify whether the website exists.' },
|
|
388
394
|
errorStatusReceived: {
|
|
389
395
|
// unused for now
|
|
390
396
|
code: 13,
|
|
391
397
|
message: 'Provided URL cannot be accessed. Server responded with code ', // append it with the response code received,
|
|
392
398
|
},
|
|
393
|
-
systemError: {
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
notASitemap: { code: 15, message: 'Provided URL is not a sitemap.' },
|
|
398
|
-
unauthorised: { code: 16, message: 'Provided URL needs basic authorisation.' },
|
|
399
|
+
systemError: { code: 14, message: 'Something went wrong when verifying the URL. Please try again in a few minutes. If this issue persists, please contact the Oobee team.'},
|
|
400
|
+
notASitemap: { code: 15, message: 'Invalid sitemap URL format. Please enter a valid sitemap URL ending with .XML e.g. https://www.example.com/sitemap.xml.' },
|
|
401
|
+
unauthorised: { code: 16, message: 'Login required. Please enter your credentials and try again.' },
|
|
402
|
+
// browserError means engine could not find a browser to run the scan
|
|
399
403
|
browserError: {
|
|
400
404
|
code: 17,
|
|
401
405
|
message:
|
|
402
|
-
'
|
|
406
|
+
'Incompatible browser. Please ensure you are using Chrome or Edge browser.',
|
|
403
407
|
},
|
|
404
|
-
|
|
405
|
-
notALocalFile: { code: 19, message: '
|
|
406
|
-
|
|
408
|
+
sslProtocolError: { code: 18, message: 'SSL certificate error. Please check the SSL configuration of your website and try again.' },
|
|
409
|
+
notALocalFile: { code: 19, message: 'Uploaded file format is incorrect. Please upload a HTML, PDF, XML or TXT file.' },
|
|
410
|
+
notAPdf: { code: 20, message: 'URL/file format is incorrect. Please upload a PDF file.' },
|
|
411
|
+
notASupportedDocument: { code: 21, message: 'Uploaded file format is incorrect. Please upload a HTML, PDF, XML or TXT file.' },
|
|
412
|
+
connectionRefused: { code: 22, message: 'Connection refused. Please try again in a few minutes. If this issue persists, please contact the Oobee team.' },
|
|
413
|
+
timedOut: { code: 23, message: 'Request timed out. Please try again in a few minutes. If this issue persists, please contact the Oobee team.' },
|
|
407
414
|
};
|
|
408
415
|
|
|
409
416
|
/* eslint-disable no-unused-vars */
|
|
@@ -3,7 +3,6 @@ import { Answers } from '../index.js';
|
|
|
3
3
|
import { getUserDataTxt, randomThreeDigitNumberString, setHeadlessMode } from '../utils.js';
|
|
4
4
|
import {
|
|
5
5
|
checkUrl,
|
|
6
|
-
deleteClonedProfiles,
|
|
7
6
|
getBrowserToRun,
|
|
8
7
|
getPlaywrightDeviceDetailsObject,
|
|
9
8
|
getUrlMessage,
|
|
@@ -14,7 +13,7 @@ import {
|
|
|
14
13
|
validateCustomFlowLabel,
|
|
15
14
|
parseHeaders,
|
|
16
15
|
} from './common.js';
|
|
17
|
-
import constants, { BrowserTypes, ScannerTypes } from './constants.js';
|
|
16
|
+
import constants, { BrowserTypes, FileTypes, ScannerTypes } from './constants.js';
|
|
18
17
|
import { random } from 'lodash';
|
|
19
18
|
|
|
20
19
|
const userData = getUserDataTxt();
|
|
@@ -58,6 +57,13 @@ const startScanQuestions = [
|
|
|
58
57
|
name: 'viewportWidth',
|
|
59
58
|
message: 'Specify width of the viewport in pixels (e.g. 360):',
|
|
60
59
|
when: (answers: Answers) => answers.customDevice === 'Specify viewport',
|
|
60
|
+
filter: (input) => {
|
|
61
|
+
if (input === '' || input === undefined) {
|
|
62
|
+
return undefined; // return nothing instead of NaN
|
|
63
|
+
}
|
|
64
|
+
const n = Number(input);
|
|
65
|
+
return Number.isInteger(n) ? n : undefined;
|
|
66
|
+
},
|
|
61
67
|
validate: (viewport: number) => {
|
|
62
68
|
if (!Number.isInteger(viewport)) {
|
|
63
69
|
return 'Invalid viewport width. Please provide an integer.';
|
|
@@ -117,39 +123,16 @@ const startScanQuestions = [
|
|
|
117
123
|
clonedBrowserDataDir,
|
|
118
124
|
playwrightDeviceDetailsObject,
|
|
119
125
|
parseHeaders(answers.header),
|
|
126
|
+
FileTypes.All,
|
|
120
127
|
);
|
|
121
128
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
return statuses.systemError.message;
|
|
130
|
-
case statuses.invalidUrl.code:
|
|
131
|
-
if (answers.scanner !== (ScannerTypes.SITEMAP || ScannerTypes.LOCALFILE)) {
|
|
132
|
-
return statuses.invalidUrl.message;
|
|
133
|
-
}
|
|
134
|
-
|
|
135
|
-
/* if sitemap scan is selected, treat this URL as a filepath
|
|
136
|
-
isFileSitemap will tell whether the filepath exists, and if it does, whether the
|
|
137
|
-
file is a sitemap */
|
|
138
|
-
const finalFilePath = getFileSitemap(url);
|
|
139
|
-
if (finalFilePath) {
|
|
140
|
-
answers.isLocalFileScan = true;
|
|
141
|
-
answers.finalUrl = finalFilePath;
|
|
142
|
-
return true;
|
|
143
|
-
}
|
|
144
|
-
if (answers.scanner === ScannerTypes.LOCALFILE) {
|
|
145
|
-
return statuses.notALocalFile.message;
|
|
146
|
-
}
|
|
147
|
-
return statuses.notASitemap.message;
|
|
148
|
-
|
|
149
|
-
case statuses.notASitemap.code:
|
|
150
|
-
return statuses.notASitemap.message;
|
|
151
|
-
case statuses.notALocalFile.code:
|
|
152
|
-
return statuses.notALocalFile.message;
|
|
129
|
+
if (res.status === statuses.success.code) {
|
|
130
|
+
answers.finalUrl = res.url;
|
|
131
|
+
return true;
|
|
132
|
+
} else {
|
|
133
|
+
const match = Object.values(statuses).find((s: any) => s.code === res.status);
|
|
134
|
+
const msg = match && 'message' in match ? match.message : 'Unknown error';
|
|
135
|
+
return msg;
|
|
153
136
|
}
|
|
154
137
|
},
|
|
155
138
|
filter: (input: string) => sanitizeUrlInput(input.trim()).url,
|
|
@@ -318,9 +318,9 @@ export const runAxeScript = async ({
|
|
|
318
318
|
page.on('console', msg => {
|
|
319
319
|
const type = msg.type();
|
|
320
320
|
if (type === 'error') {
|
|
321
|
-
consoleLogger.
|
|
321
|
+
consoleLogger.error(msg.text());
|
|
322
322
|
} else {
|
|
323
|
-
consoleLogger.
|
|
323
|
+
consoleLogger.info(msg.text());
|
|
324
324
|
}
|
|
325
325
|
});
|
|
326
326
|
*/
|
|
@@ -20,6 +20,7 @@ import constants, {
|
|
|
20
20
|
STATUS_CODE_METADATA,
|
|
21
21
|
disallowedListOfPatterns,
|
|
22
22
|
disallowedSelectorPatterns,
|
|
23
|
+
FileTypes,
|
|
23
24
|
} from '../constants/constants.js';
|
|
24
25
|
import {
|
|
25
26
|
getPlaywrightLaunchOptions,
|
|
@@ -88,7 +89,7 @@ const crawlDomain = async ({
|
|
|
88
89
|
userDataDirectory: string;
|
|
89
90
|
strategy: EnqueueStrategy;
|
|
90
91
|
specifiedMaxConcurrency: number;
|
|
91
|
-
fileTypes:
|
|
92
|
+
fileTypes: FileTypes;
|
|
92
93
|
blacklistedPatterns: string[];
|
|
93
94
|
includeScreenshots: boolean;
|
|
94
95
|
followRobots: boolean;
|
|
@@ -117,8 +118,8 @@ const crawlDomain = async ({
|
|
|
117
118
|
|
|
118
119
|
const pdfDownloads: Promise<void>[] = [];
|
|
119
120
|
const uuidToPdfMapping: Record<string, string> = {};
|
|
120
|
-
const isScanHtml = [
|
|
121
|
-
const isScanPdfs = [
|
|
121
|
+
const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes as FileTypes);
|
|
122
|
+
const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes as FileTypes);
|
|
122
123
|
const { maxConcurrency } = constants;
|
|
123
124
|
const { playwrightDeviceDetailsObject } = viewportSettings;
|
|
124
125
|
|
|
@@ -484,6 +485,9 @@ const crawlDomain = async ({
|
|
|
484
485
|
// handle pdfs
|
|
485
486
|
if (shouldSkipDueToUnsupportedContent(response, request.url) || (request.skipNavigation && actualUrl === 'about:blank')) {
|
|
486
487
|
if (!isScanPdfs) {
|
|
488
|
+
|
|
489
|
+
// Don't inform the user it is skipped since web crawler is best-effort.
|
|
490
|
+
/*
|
|
487
491
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
488
492
|
numScanned: urlsCrawled.scanned.length,
|
|
489
493
|
urlScanned: request.url,
|
|
@@ -495,6 +499,7 @@ const crawlDomain = async ({
|
|
|
495
499
|
metadata: STATUS_CODE_METADATA[1],
|
|
496
500
|
httpStatusCode: 0,
|
|
497
501
|
});
|
|
502
|
+
*/
|
|
498
503
|
|
|
499
504
|
return;
|
|
500
505
|
}
|
|
@@ -511,6 +516,9 @@ const crawlDomain = async ({
|
|
|
511
516
|
}
|
|
512
517
|
|
|
513
518
|
if (isBlacklistedFileExtensions(actualUrl, blackListedFileExtensions)) {
|
|
519
|
+
|
|
520
|
+
// Don't inform the user it is skipped since web crawler is best-effort.
|
|
521
|
+
/*
|
|
514
522
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
515
523
|
numScanned: urlsCrawled.scanned.length,
|
|
516
524
|
urlScanned: request.url,
|
|
@@ -522,7 +530,7 @@ const crawlDomain = async ({
|
|
|
522
530
|
metadata: STATUS_CODE_METADATA[1],
|
|
523
531
|
httpStatusCode: 0,
|
|
524
532
|
});
|
|
525
|
-
|
|
533
|
+
*/
|
|
526
534
|
return;
|
|
527
535
|
}
|
|
528
536
|
|
|
@@ -631,6 +639,9 @@ const crawlDomain = async ({
|
|
|
631
639
|
}
|
|
632
640
|
}
|
|
633
641
|
} else {
|
|
642
|
+
|
|
643
|
+
// Don't inform the user it is skipped since web crawler is best-effort.
|
|
644
|
+
/*
|
|
634
645
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
635
646
|
numScanned: urlsCrawled.scanned.length,
|
|
636
647
|
urlScanned: request.url,
|
|
@@ -642,6 +653,7 @@ const crawlDomain = async ({
|
|
|
642
653
|
metadata: STATUS_CODE_METADATA[1],
|
|
643
654
|
httpStatusCode: 0,
|
|
644
655
|
});
|
|
656
|
+
*/
|
|
645
657
|
}
|
|
646
658
|
|
|
647
659
|
if (followRobots) await getUrlsFromRobotsTxt(request.url, browser, userDataDirectory, extraHTTPHeaders);
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import fs from 'fs';
|
|
2
2
|
import { chromium, Page } from 'playwright';
|
|
3
3
|
import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
|
|
4
|
-
import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
|
|
4
|
+
import constants, { FileTypes, guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
|
|
5
5
|
import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
6
6
|
import crawlDomain from './crawlDomain.js';
|
|
7
7
|
import crawlSitemap from './crawlSitemap.js';
|
|
@@ -20,7 +20,7 @@ const crawlIntelligentSitemap = async (
|
|
|
20
20
|
userDataDirectory: string,
|
|
21
21
|
strategy: EnqueueStrategy,
|
|
22
22
|
specifiedMaxConcurrency: number,
|
|
23
|
-
fileTypes:
|
|
23
|
+
fileTypes: FileTypes,
|
|
24
24
|
blacklistedPatterns: string[],
|
|
25
25
|
includeScreenshots: boolean,
|
|
26
26
|
followRobots: boolean,
|
|
@@ -7,6 +7,7 @@ import constants, {
|
|
|
7
7
|
basicAuthRegex,
|
|
8
8
|
UrlsCrawled,
|
|
9
9
|
STATUS_CODE_METADATA,
|
|
10
|
+
FileTypes,
|
|
10
11
|
} from '../constants/constants.js';
|
|
11
12
|
import { ViewportSettingsClass } from '../combine.js';
|
|
12
13
|
import {
|
|
@@ -47,7 +48,7 @@ export const crawlLocalFile = async ({
|
|
|
47
48
|
browser: string;
|
|
48
49
|
userDataDirectory: string;
|
|
49
50
|
specifiedMaxConcurrency: number;
|
|
50
|
-
fileTypes:
|
|
51
|
+
fileTypes: FileTypes;
|
|
51
52
|
blacklistedPatterns: string[];
|
|
52
53
|
includeScreenshots: boolean;
|
|
53
54
|
extraHTTPHeaders: Record<string, string>;
|
|
@@ -12,6 +12,7 @@ import constants, {
|
|
|
12
12
|
guiInfoStatusTypes,
|
|
13
13
|
UrlsCrawled,
|
|
14
14
|
disallowedListOfPatterns,
|
|
15
|
+
FileTypes,
|
|
15
16
|
} from '../constants/constants.js';
|
|
16
17
|
import {
|
|
17
18
|
getLinksFromSitemap,
|
|
@@ -55,7 +56,7 @@ const crawlSitemap = async ({
|
|
|
55
56
|
browser: string;
|
|
56
57
|
userDataDirectory: string;
|
|
57
58
|
specifiedMaxConcurrency: number;
|
|
58
|
-
fileTypes:
|
|
59
|
+
fileTypes: FileTypes;
|
|
59
60
|
blacklistedPatterns: string[];
|
|
60
61
|
includeScreenshots: boolean;
|
|
61
62
|
extraHTTPHeaders: Record<string, string>;
|
|
@@ -97,8 +98,8 @@ const crawlSitemap = async ({
|
|
|
97
98
|
|
|
98
99
|
const pdfDownloads: Promise<void>[] = [];
|
|
99
100
|
const uuidToPdfMapping: Record<string, string> = {};
|
|
100
|
-
const isScanHtml = [
|
|
101
|
-
const isScanPdfs = [
|
|
101
|
+
const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes as FileTypes);
|
|
102
|
+
const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes as FileTypes);
|
|
102
103
|
const { playwrightDeviceDetailsObject } = viewportSettings;
|
|
103
104
|
const { maxConcurrency } = constants;
|
|
104
105
|
|
|
@@ -288,30 +288,31 @@ export const handlePdfDownload = (
|
|
|
288
288
|
downloadFile.write(buf, 'binary');
|
|
289
289
|
downloadFile.end();
|
|
290
290
|
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
291
|
+
downloadFile.on('finish', () => {
|
|
292
|
+
if (isPDF(buf)) {
|
|
293
|
+
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
294
|
+
numScanned: urlsCrawled.scanned.length,
|
|
295
|
+
urlScanned: request.url,
|
|
296
|
+
});
|
|
297
|
+
urlsCrawled.scanned.push({
|
|
298
|
+
url: request.url,
|
|
299
|
+
pageTitle,
|
|
300
|
+
actualUrl: url,
|
|
301
|
+
});
|
|
302
|
+
} else {
|
|
303
|
+
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
304
|
+
numScanned: urlsCrawled.scanned.length,
|
|
305
|
+
urlScanned: request.url,
|
|
306
|
+
});
|
|
307
|
+
urlsCrawled.invalid.push({
|
|
308
|
+
url: request.url,
|
|
309
|
+
pageTitle: url,
|
|
310
|
+
actualUrl: url,
|
|
311
|
+
metadata: STATUS_CODE_METADATA[1],
|
|
312
|
+
});
|
|
313
|
+
}
|
|
314
|
+
resolve();
|
|
315
|
+
});
|
|
315
316
|
|
|
316
317
|
}),
|
|
317
318
|
);
|
|
@@ -346,6 +347,9 @@ export const runPdfScan = async (randomToken: string) => {
|
|
|
346
347
|
];
|
|
347
348
|
|
|
348
349
|
const ls = spawnSync(veraPdfExe, veraPdfCmdArgs, { shell: true });
|
|
350
|
+
if (ls.stderr && ls.stderr.length > 0)
|
|
351
|
+
consoleLogger.error(ls.stderr.toString());
|
|
352
|
+
|
|
349
353
|
fs.writeFileSync(intermediateResultPath, ls.stdout, { encoding: 'utf-8' });
|
|
350
354
|
};
|
|
351
355
|
|
|
@@ -363,7 +367,7 @@ export const mapPdfScanResults = async (
|
|
|
363
367
|
try {
|
|
364
368
|
parsedJsonData = JSON.parse(rawdata);
|
|
365
369
|
} catch (err) {
|
|
366
|
-
consoleLogger.
|
|
370
|
+
consoleLogger.error(err);
|
|
367
371
|
}
|
|
368
372
|
|
|
369
373
|
const errorMeta = require('../constants/errorMeta.json');
|
package/src/index.ts
CHANGED
|
@@ -4,7 +4,6 @@ import inquirer from 'inquirer';
|
|
|
4
4
|
import { EnqueueStrategy } from 'crawlee';
|
|
5
5
|
import {
|
|
6
6
|
getVersion,
|
|
7
|
-
cleanUp,
|
|
8
7
|
getUserDataTxt,
|
|
9
8
|
writeToUserDataTxt,
|
|
10
9
|
listenForCleanUp,
|
|
@@ -21,7 +20,7 @@ import {
|
|
|
21
20
|
} from './constants/common.js';
|
|
22
21
|
import questions from './constants/questions.js';
|
|
23
22
|
import combineRun from './combine.js';
|
|
24
|
-
import { BrowserTypes, RuleFlags, ScannerTypes } from './constants/constants.js';
|
|
23
|
+
import { BrowserTypes, FileTypes, RuleFlags, ScannerTypes } from './constants/constants.js';
|
|
25
24
|
import { DeviceDescriptor } from './types/types.js';
|
|
26
25
|
|
|
27
26
|
export type Answers = {
|
|
@@ -35,7 +34,7 @@ export type Answers = {
|
|
|
35
34
|
clonedBrowserDataDir: string;
|
|
36
35
|
playwrightDeviceDetailsObject: DeviceDescriptor;
|
|
37
36
|
nameEmail: string;
|
|
38
|
-
fileTypes:
|
|
37
|
+
fileTypes: FileTypes;
|
|
39
38
|
metadata: string;
|
|
40
39
|
maxpages: number;
|
|
41
40
|
strategy: string;
|
|
@@ -72,7 +71,7 @@ export type Data = {
|
|
|
72
71
|
customFlowLabel: string;
|
|
73
72
|
specifiedMaxConcurrency: number;
|
|
74
73
|
randomToken: string;
|
|
75
|
-
fileTypes:
|
|
74
|
+
fileTypes: FileTypes;
|
|
76
75
|
blacklistedPatternsFilename: string;
|
|
77
76
|
includeScreenshots: boolean;
|
|
78
77
|
metadata: string;
|
|
@@ -104,7 +103,7 @@ const runScan = async (answers: Answers) => {
|
|
|
104
103
|
answers.nameEmail = `${userData.name}:${userData.email}`;
|
|
105
104
|
}
|
|
106
105
|
|
|
107
|
-
answers.fileTypes =
|
|
106
|
+
answers.fileTypes = FileTypes.All;
|
|
108
107
|
answers.metadata = '{}';
|
|
109
108
|
|
|
110
109
|
const data: Data = await prepareData(answers);
|
package/src/mergeAxeResults.ts
CHANGED
|
@@ -970,7 +970,7 @@ const writeSummaryPdf = async (storagePath: string, pagesScanned: number, filena
|
|
|
970
970
|
? userDataDirectory
|
|
971
971
|
: '';
|
|
972
972
|
const context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
|
|
973
|
-
headless:
|
|
973
|
+
headless: true,
|
|
974
974
|
...getPlaywrightLaunchOptions(browser),
|
|
975
975
|
});
|
|
976
976
|
|
|
@@ -2014,35 +2014,35 @@ const generateArtifacts = async (
|
|
|
2014
2014
|
constants.cliZipFileName = path.join(storagePath, constants.cliZipFileName);
|
|
2015
2015
|
}
|
|
2016
2016
|
|
|
2017
|
-
|
|
2018
|
-
.ensureDir(storagePath)
|
|
2019
|
-
.then(() => {
|
|
2020
|
-
zipResults(constants.cliZipFileName, storagePath);
|
|
2021
|
-
const messageToDisplay = [
|
|
2022
|
-
`Report of this run is at ${constants.cliZipFileName}`,
|
|
2023
|
-
`Results directory is at ${storagePath}`,
|
|
2024
|
-
];
|
|
2025
|
-
|
|
2026
|
-
if (process.send && process.env.OOBEE_VERBOSE) {
|
|
2027
|
-
const zipFileNameMessage = {
|
|
2028
|
-
type: 'zipFileName',
|
|
2029
|
-
payload: `${constants.cliZipFileName}`,
|
|
2030
|
-
};
|
|
2031
|
-
const storagePathMessage = {
|
|
2032
|
-
type: 'storagePath',
|
|
2033
|
-
payload: `${storagePath}`,
|
|
2034
|
-
};
|
|
2017
|
+
try {
|
|
2018
|
+
await fs.ensureDir(storagePath);
|
|
2035
2019
|
|
|
2036
|
-
|
|
2020
|
+
await zipResults(constants.cliZipFileName, storagePath);
|
|
2037
2021
|
|
|
2038
|
-
|
|
2039
|
-
}
|
|
2022
|
+
const messageToDisplay = [
|
|
2023
|
+
`Report of this run is at ${constants.cliZipFileName}`,
|
|
2024
|
+
`Results directory is at ${storagePath}`,
|
|
2025
|
+
];
|
|
2040
2026
|
|
|
2041
|
-
|
|
2042
|
-
|
|
2043
|
-
|
|
2044
|
-
|
|
2045
|
-
|
|
2027
|
+
if (process.send && process.env.OOBEE_VERBOSE) {
|
|
2028
|
+
const zipFileNameMessage = {
|
|
2029
|
+
type: 'zipFileName',
|
|
2030
|
+
payload: `${constants.cliZipFileName}`,
|
|
2031
|
+
};
|
|
2032
|
+
const storagePathMessage = {
|
|
2033
|
+
type: 'storagePath',
|
|
2034
|
+
payload: `${storagePath}`,
|
|
2035
|
+
};
|
|
2036
|
+
|
|
2037
|
+
process.send(JSON.stringify(storagePathMessage));
|
|
2038
|
+
|
|
2039
|
+
process.send(JSON.stringify(zipFileNameMessage));
|
|
2040
|
+
}
|
|
2041
|
+
|
|
2042
|
+
printMessage(messageToDisplay);
|
|
2043
|
+
} catch (error) {
|
|
2044
|
+
printMessage([`Error in zipping results: ${error}`]);
|
|
2045
|
+
}
|
|
2046
2046
|
|
|
2047
2047
|
// Generate scrubbed HTML Code Snippets
|
|
2048
2048
|
const ruleIdJson = createRuleIdJson(allIssues);
|
package/src/utils.ts
CHANGED