@govtechsg/oobee 0.10.65 → 0.10.68

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Dockerfile CHANGED
@@ -1,6 +1,6 @@
1
1
  # Use Microsoft Playwright image as base image
2
2
  # Node version is v22
3
- FROM mcr.microsoft.com/playwright:v1.50.1-noble
3
+ FROM mcr.microsoft.com/playwright:v1.55.0-noble
4
4
 
5
5
  # Installation of packages for oobee and runner (locked versions from build log)
6
6
  RUN apt-get update && apt-get install -y \
package/README.md CHANGED
@@ -562,6 +562,26 @@ For details on which accessibility scan results triggers a "Must Fix" / "Good to
562
562
 
563
563
  Please refer to the information below to assist in debugging. Most errors below are due to the switching between Node.js versions.
564
564
 
565
+ ### URL Validation Errors
566
+ The following URL and file validation error codes can help you troubleshoot scan failures.
567
+
568
+ | Code | Error Name | Error Message | Troubleshooting Steps |
569
+ |------|----------------------|-------------------------------------------------------------------------------|------------------------|
570
+ | 0 | success | (undefined) | No action needed. Connection successful. |
571
+ | 11 | invalidUrl | Invalid URL. Please check and try again. | • Ensure the URL starts with `http://` or `https://`.<br>• Check for typos in the URL. |
572
+ | 12 | cannotBeResolved | URL cannot be accessed. Please verify whether the website exists. | • Confirm the domain name is correct.<br>• Check DNS resolution with `ping` or `nslookup`.<br>• Ensure the site is publicly accessible (not behind VPN/firewall). |
573
+ | 14 | systemError | Something went wrong when verifying the URL. Please try again in a few minutes. If this issue persists, please contact the Oobee team. | • Retry after a few minutes.<br>• Check internet connection.<br>• If persistent, report as a system issue. |
574
+ | 15 | notASitemap | Invalid sitemap URL format. Please enter a valid sitemap URL ending with .XML e.g. https://www.example.com/sitemap.xml. | • Ensure the URL points to a valid XML sitemap.<br>• See [examples of sitemaps (sitemaps.org protocol)](https://www.sitemaps.org/protocol.html)<br>• Test the URL in a browser to confirm it returns XML. |
575
+ | 16 | unauthorised | Login required. Please enter your credentials and try again. | • Check if the site requires username/password.<br>• Provide credentials in Oobee if supported. |
576
+ | 17 | browserError | Incompatible browser. Please ensure you are using Chrome or Edge browser. | • Install the latest version of Chrome or Edge. |
577
+ | 18 | sslProtocolError | SSL certificate error. Please check the SSL configuration of your website and try again. | • Verify SSL certificate validity (not expired, issued by trusted CA).<br>• Check for mismatched TLS versions or cipher issues.<br>• Use an SSL checker tool (e.g., Qualys SSL Labs). |
578
+ | 19 | notALocalFile | Uploaded file format is incorrect. Please upload a HTML, PDF, XML or TXT file. | • Verify the file format.<br>• Ensure you are selecting `.html`, `.pdf`, `.xml`, or `.txt`. |
579
+ | 20 | notAPdf | URL/file format is incorrect. Please upload a PDF file. | • Ensure the file ends with `.pdf`.<br>• Open the file manually to confirm it is a valid PDF. |
580
+ | 21 | notASupportedDocument| Uploaded file format is incorrect. Please upload a HTML, PDF, XML or TXT file. | • Confirm file format.<br>• Convert to a supported type if necessary. |
581
+ | 22 | connectionRefused | Connection refused. Please try again in a few minutes. If this issue persists, please contact the Oobee team. | • Check if the server is running.<br>• Verify firewall settings.<br>• Retry after a short interval. |
582
+ | 23 | timedOut | Request timed out. Please try again in a few minutes. If this issue persists, please contact the Oobee team. | • Check your internet speed and stability.<br>• Retry when the server load is lower. |
583
+
584
+
565
585
  ### Incompatible Node.js versions
566
586
 
567
587
  **Issue**: When your Node.js version is incompatible, you may face the following syntax error.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@govtechsg/oobee",
3
3
  "main": "dist/npmIndex.js",
4
- "version": "0.10.65",
4
+ "version": "0.10.68",
5
5
  "type": "module",
6
6
  "author": "Government Technology Agency <info@tech.gov.sg>",
7
7
  "dependencies": {
@@ -22,10 +22,11 @@
22
22
  "jsdom": "^21.1.2",
23
23
  "jszip": "^3.10.1",
24
24
  "lodash": "^4.17.21",
25
+ "mime": "^4.0.7",
25
26
  "mime-types": "^2.1.35",
26
27
  "minimatch": "^9.0.3",
27
28
  "pdfjs-dist": "github:veraPDF/pdfjs-dist#v4.4.168-taggedPdf-0.1.20",
28
- "playwright": "1.50.1",
29
+ "playwright": "^1.55.0",
29
30
  "prettier": "^3.1.0",
30
31
  "print-message": "^3.0.1",
31
32
  "safe-regex": "^2.1.1",
@@ -48,6 +49,7 @@
48
49
  "@types/fs-extra": "^11.0.4",
49
50
  "@types/inquirer": "^9.0.7",
50
51
  "@types/lodash": "^4.17.7",
52
+ "@types/mime": "^3.0.4",
51
53
  "@types/mime-types": "^2.1.4",
52
54
  "@types/safe-regex": "^1.1.6",
53
55
  "@types/validator": "^13.11.10",
@@ -1,6 +1,6 @@
1
1
  #!/bin/bash
2
2
 
3
- NODE_VERSION="22.13.1"
3
+ NODE_VERSION="22.19.0"
4
4
 
5
5
  # Get current shell command
6
6
  SHELL_COMMAND=$(ps -o comm= -p $$)
@@ -9,11 +9,11 @@ $ErrorActionPreference = 'Stop'
9
9
  # Install NodeJS binaries
10
10
  if (-Not (Test-Path nodejs-win\node.exe)) {
11
11
  Write-Output "Downloading Node"
12
- Invoke-WebRequest -o ./nodejs-win.zip "https://nodejs.org/dist/v22.13.1/node-v22.13.1-win-x64.zip"
12
+ Invoke-WebRequest -o ./nodejs-win.zip "https://nodejs.org/dist/v22.19.0/node-v22.19.0-win-x64.zip"
13
13
 
14
14
  Write-Output "Unzip Node"
15
15
  Expand-Archive .\nodejs-win.zip -DestinationPath .
16
- Rename-Item node-v22.13.1-win-x64 -NewName nodejs-win
16
+ Rename-Item node-v22.19.0-win-x64 -NewName nodejs-win
17
17
  Remove-Item -Force .\nodejs-win.zip
18
18
  }
19
19
 
package/src/cli.ts CHANGED
@@ -211,18 +211,19 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`,
211
211
  .parse() as unknown as Answers;
212
212
 
213
213
  const scanInit = async (argvs: Answers): Promise<string> => {
214
- let isCustomFlow = false;
215
- if (argvs.scanner === ScannerTypes.CUSTOM) {
216
- isCustomFlow = true;
217
- }
218
-
219
214
  const updatedArgvs = { ...argvs };
220
215
 
221
216
  // Cannot use data.browser and data.isHeadless as the connectivity check comes first before prepareData
222
217
  setHeadlessMode(updatedArgvs.browserToRun, updatedArgvs.headless);
223
218
  const statuses = constants.urlCheckStatuses;
224
219
 
225
- const data = await prepareData(updatedArgvs);
220
+ let data;
221
+ try {
222
+ data = await prepareData(updatedArgvs);
223
+ } catch (e) {
224
+ consoleLogger.error(`Error preparing data: ${e.message}\n${e.stack}`);
225
+ cleanUpAndExit(1);
226
+ }
226
227
 
227
228
  // Executes cleanUp script if error encountered
228
229
  listenForCleanUp(data.randomToken);
@@ -233,83 +234,27 @@ const scanInit = async (argvs: Answers): Promise<string> => {
233
234
  data.browser,
234
235
  data.userDataDirectory,
235
236
  data.playwrightDeviceDetailsObject,
236
- data.extraHTTPHeaders
237
+ data.extraHTTPHeaders,
238
+ data.fileTypes
237
239
  );
238
240
 
239
241
  if (res.httpStatus) consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
240
242
 
241
- switch (res.status) {
242
- case statuses.success.code: {
243
- data.url = res.url;
244
- if (process.env.OOBEE_VALIDATE_URL) {
245
- console.log('Url is valid');
246
- cleanUpAndExit(0, data.randomToken);
247
- }
248
-
249
- break;
250
- }
251
- case statuses.unauthorised.code: {
252
- printMessage([statuses.unauthorised.message], messageOptions);
253
- consoleLogger.info(statuses.unauthorised.message);
254
- cleanUpAndExit(res.status);
255
- }
256
- case statuses.cannotBeResolved.code: {
257
- printMessage([statuses.cannotBeResolved.message], messageOptions);
258
- consoleLogger.info(statuses.cannotBeResolved.message);
259
- cleanUpAndExit(res.status);
260
- }
261
- case statuses.systemError.code: {
262
- printMessage([statuses.systemError.message], messageOptions);
263
- consoleLogger.info(statuses.systemError.message);
264
- cleanUpAndExit(res.status);
265
- }
266
- case statuses.invalidUrl.code: {
267
- if (
268
- updatedArgvs.scanner !== ScannerTypes.SITEMAP &&
269
- updatedArgvs.scanner !== ScannerTypes.LOCALFILE
270
- ) {
271
- printMessage([statuses.invalidUrl.message], messageOptions);
272
- consoleLogger.info(statuses.invalidUrl.message);
273
- cleanUpAndExit(res.status);
274
- }
275
-
276
- const finalFilePath = getFileSitemap(updatedArgvs.url);
277
- if (finalFilePath) {
278
- data.isLocalFileScan = true;
279
- data.url = finalFilePath;
280
-
281
- if (process.env.OOBEE_VALIDATE_URL) {
282
- console.log('Url is valid');
283
- cleanUpAndExit(0);
284
- }
285
- } else if (updatedArgvs.scanner === ScannerTypes.LOCALFILE) {
286
- printMessage([statuses.notALocalFile.message], messageOptions);
287
- consoleLogger.info(statuses.notALocalFile.message);
288
- cleanUpAndExit(statuses.notALocalFile.code);
289
- } else if (updatedArgvs.scanner !== ScannerTypes.SITEMAP) {
290
- printMessage([statuses.notASitemap.message], messageOptions);
291
- consoleLogger.info(statuses.notASitemap.message);
292
- cleanUpAndExit(statuses.notASitemap.code);
293
- }
294
- break;
295
- }
296
- case statuses.notASitemap.code: {
297
- printMessage([statuses.notASitemap.message], messageOptions);
298
- consoleLogger.info(statuses.notASitemap.message);
299
- cleanUpAndExit(res.status);
300
- }
301
- case statuses.notALocalFile.code: {
302
- printMessage([statuses.notALocalFile.message], messageOptions);
303
- consoleLogger.info(statuses.notALocalFile.message);
304
- cleanUpAndExit(res.status);
305
- }
306
- case statuses.browserError.code: {
307
- printMessage([statuses.browserError.message], messageOptions);
308
- consoleLogger.info(statuses.browserError.message);
309
- cleanUpAndExit(res.status);
243
+ if (res.status === statuses.success.code) {
244
+ data.url = res.url;
245
+ if (process.env.OOBEE_VALIDATE_URL) {
246
+ consoleLogger.info('Url is valid');
247
+ cleanUpAndExit(0, data.randomToken);
248
+ return;
310
249
  }
311
- default:
312
- break;
250
+ // fall through (continue normal flow after success)
251
+ } else {
252
+ const match = Object.values(statuses).find((s: any) => s.code === res.status);
253
+ const msg = match && 'message' in match ? match.message : 'Unknown error';
254
+ printMessage([msg], messageOptions);
255
+ consoleLogger.info(msg);
256
+ cleanUpAndExit(res.status);
257
+ return;
313
258
  }
314
259
 
315
260
  if (process.env.OOBEE_VERBOSE) {
@@ -14,6 +14,7 @@ import url, { fileURLToPath, pathToFileURL } from 'url';
14
14
  import safe from 'safe-regex';
15
15
  import * as https from 'https';
16
16
  import os from 'os';
17
+ import mime from 'mime';
17
18
  import { minimatch } from 'minimatch';
18
19
  import { globSync, GlobOptionsWithFileTypesFalse } from 'glob';
19
20
  import { LaunchOptions, Locator, Page, devices, webkit } from 'playwright';
@@ -27,6 +28,8 @@ import constants, {
27
28
  // Legacy code end - Google Sheets submission
28
29
  ScannerTypes,
29
30
  BrowserTypes,
31
+ FileTypes,
32
+ getEnumKey,
30
33
  } from './constants.js';
31
34
  import { consoleLogger } from '../logs.js';
32
35
  import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
@@ -172,9 +175,14 @@ export const messageOptions = {
172
175
  };
173
176
 
174
177
  const urlOptions = {
175
- protocols: ['http', 'https'],
178
+ // http and https for normal scans, file for local file scan
179
+ protocols: ['http', 'https', 'file'],
176
180
  require_protocol: true,
177
181
  require_tld: false,
182
+ require_host: false,
183
+ // being explicit; fragments/queries are fine for local files
184
+ allow_fragments: true,
185
+ allow_query_components: true,
178
186
  };
179
187
 
180
188
  const queryCheck = (s: string) => document.createDocumentFragment().querySelector(s);
@@ -187,8 +195,9 @@ export const isSelectorValid = (selector: string): boolean => {
187
195
  return true;
188
196
  };
189
197
 
190
- // Refer to NPM validator's special characters under sanitizers for escape()
191
- const blackListCharacters = '\\<>&\'"';
198
+ // Don't sanitise for now as we have changed the logic for URL validation / local file scan
199
+ // Only use this when we find characters to validate against
200
+ const blackListCharacters = '';
192
201
 
193
202
  export const validateXML = (content: string): { isValid: boolean; parsedContent: string } => {
194
203
  let isValid: boolean;
@@ -271,12 +280,25 @@ export const isInputValid = (inputString: string): boolean => {
271
280
  export const sanitizeUrlInput = (url: string): { isValid: boolean; url: string } => {
272
281
  // Sanitize that there is no blacklist characters
273
282
  const sanitizeUrl = validator.blacklist(url, blackListCharacters);
274
- if (validator.isURL(sanitizeUrl, urlOptions)) {
283
+ if (url.toLowerCase().startsWith('file://') || validator.isURL(sanitizeUrl, urlOptions)) {
275
284
  return { isValid: true, url: sanitizeUrl };
276
285
  }
277
286
  return { isValid: false, url: sanitizeUrl };
278
287
  };
279
288
 
289
+ const isAllowedContentType = (ct: string): boolean => {
290
+ const c = (ct || '').toLowerCase();
291
+ return (
292
+ c.startsWith('text/html') || // html
293
+ c.startsWith('application/xhtml+xml') || // xhtml
294
+ c.startsWith('text/plain') || // txt
295
+ c.startsWith('application/xml') || // xml
296
+ c.startsWith('text/xml') || // xml (alt)
297
+ c.startsWith('application/pdf') // pdf
298
+ );
299
+ };
300
+
301
+
280
302
  const checkUrlConnectivityWithBrowser = async (
281
303
  url: string,
282
304
  browserToRun: string,
@@ -292,6 +314,44 @@ const checkUrlConnectivityWithBrowser = async (
292
314
  return res;
293
315
  }
294
316
 
317
+ // STEP 1: For local file scans
318
+ let contentType = '';
319
+
320
+ const protocol = new URL(url).protocol;
321
+
322
+ if (protocol !== 'http:' && protocol !== 'https:') {
323
+ try {
324
+ const filePath = fileURLToPath(url);
325
+ const stat = fs.statSync(filePath);
326
+
327
+ if (!stat.isFile()) {
328
+ res.status = constants.urlCheckStatuses.notALocalFile.code;
329
+ return res;
330
+ }
331
+
332
+ const statusCode = 200;
333
+ contentType = mime.getType(filePath) || 'application/octet-stream';
334
+
335
+ if (!isAllowedContentType(contentType)) {
336
+ res.status = constants.urlCheckStatuses.notASupportedDocument.code;
337
+ return res;
338
+ }
339
+
340
+ // Short-circuit for pdfs
341
+ if (contentType.includes('pdf')) {
342
+ res.status = constants.urlCheckStatuses.success.code;
343
+ res.httpStatus = statusCode;
344
+ res.url = url;
345
+ res.content = '%PDF-'; // Avoid putting the binary in memory
346
+ return res;
347
+ }
348
+ } catch (e) {
349
+ consoleLogger.info(`Local file check failed: ${e.message}`);
350
+ res.status = constants.urlCheckStatuses.systemError.code;
351
+ return res;
352
+ }
353
+ }
354
+
295
355
  // Ensure Accept header for non-html content fallback
296
356
  extraHTTPHeaders['Accept'] ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
297
357
 
@@ -302,6 +362,7 @@ const checkUrlConnectivityWithBrowser = async (
302
362
  browserContext = await constants.launcher.launchPersistentContext(clonedDataDir, {
303
363
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
304
364
  ignoreHTTPSErrors: true,
365
+ headless: true,
305
366
  ...getPlaywrightLaunchOptions(browserToRun),
306
367
  ...playwrightDeviceDetailsObject,
307
368
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
@@ -317,77 +378,94 @@ const checkUrlConnectivityWithBrowser = async (
317
378
  try {
318
379
  const page = await browserContext.newPage();
319
380
 
320
- // STEP 1: HEAD request before actual navigation
321
- let statusCode = 0;
322
- let contentType = '';
323
- let disposition = '';
324
-
381
+ // Block native Chrome download UI
325
382
  try {
326
- const headResp = await page.request.fetch(url, {
327
- method: 'HEAD',
328
- headers: extraHTTPHeaders,
329
- });
330
-
331
- statusCode = headResp.status();
332
- contentType = headResp.headers()['content-type'] || '';
333
- disposition = headResp.headers()['content-disposition'] || '';
383
+ const cdp = await browserContext.newCDPSession(page as any);
384
+ await cdp.send('Page.setDownloadBehavior', { behavior: 'deny' });
385
+ } catch (e) {
386
+ consoleLogger.info(`Unable to set download deny: ${(e as Error).message}`);
387
+ }
334
388
 
335
- // If it looks like a downloadable file, skip goto entirely
336
- if (
337
- contentType.includes('pdf') ||
338
- contentType.includes('octet-stream') ||
339
- disposition.includes('attachment')
340
- ) {
341
- res.status = statusCode === 401
342
- ? constants.urlCheckStatuses.unauthorised.code
343
- : constants.urlCheckStatuses.success.code;
389
+ // STEP 2: Navigate (follows server-side redirects)
390
+ page.once('download', () => {
391
+ res.status = constants.urlCheckStatuses.notASupportedDocument.code;
392
+ return res;
393
+ });
394
+
395
+ const response = await page.goto(url, {
396
+ timeout: 15000,
397
+ waitUntil: 'domcontentloaded', // enough to get status + allow potential client redirects to kick in
398
+ });
344
399
 
345
- res.httpStatus = statusCode;
346
- res.url = url;
347
- res.content = ''; // Don't try to render binary
400
+ // Give client-side redirects (meta refresh / JS location.*) a moment
401
+ try {
402
+ await page.waitForLoadState('networkidle', { timeout: 8000 });
403
+ } catch {
404
+ consoleLogger.info('networkidle not reached; proceeding with verification GET');
405
+ }
348
406
 
349
- await browserContext.close();
350
- return res;
351
- }
407
+ // STEP 3: Verify final URL with a GET (follows redirects)
408
+ const finalUrl = page.url();
409
+ let verifyResp = response;
410
+ try {
411
+ verifyResp = await page.request.fetch(finalUrl, {
412
+ method: 'GET',
413
+ headers: extraHTTPHeaders,
414
+ });
352
415
  } catch (e) {
353
- consoleLogger.info(`HEAD request failed: ${e.message}`);
354
- res.status = constants.urlCheckStatuses.systemError.code;
355
- await browserContext.close();
356
- return res;
416
+ consoleLogger.info(`Verification GET failed, falling back to navigation response: ${e.message}`);
357
417
  }
358
418
 
359
- // STEP 2: Safe to proceed with navigation
360
- const response = await page.goto(url, {
361
- timeout: 30000,
362
- waitUntil: 'commit', // Don't wait for full load
363
- });
419
+ // Prefer verification GET; fall back to nav response
420
+ const finalStatus = verifyResp?.status?.() ?? response?.status?.() ?? 0;
421
+ const headers = (verifyResp?.headers?.() ?? response?.headers?.()) || {};
422
+ contentType = headers['content-type'] || '';
364
423
 
365
- const finalStatus = statusCode || (response?.status?.() ?? 0);
366
- res.status = finalStatus === 401
367
- ? constants.urlCheckStatuses.unauthorised.code
368
- : constants.urlCheckStatuses.success.code;
424
+ if (!isAllowedContentType(contentType)) {
425
+ res.status = constants.urlCheckStatuses.notASupportedDocument.code;
426
+ return res;
427
+ }
369
428
 
370
429
  res.httpStatus = finalStatus;
371
- res.url = page.url();
430
+ res.url = finalUrl;
372
431
 
373
- contentType = response?.headers()?.['content-type'] || '';
432
+ if (finalStatus === 401) {
433
+ res.status = constants.urlCheckStatuses.unauthorised.code;
434
+ } else if (finalStatus >= 200 && finalStatus < 400) {
435
+ res.status = constants.urlCheckStatuses.success.code;
436
+ } else if (finalStatus === 405 || finalStatus === 501) {
437
+ // Some origins 405/501 but the browser-rendered page is still reachable after client redirects.
438
+ // As a last resort, consider DOM presence as success if we actually have a document.
439
+ const hasDOM = await page.evaluate(() => !!document && !!document.documentElement);
440
+ res.status = hasDOM ? constants.urlCheckStatuses.success.code : constants.urlCheckStatuses.systemError.code;
441
+ } else {
442
+ res.status = constants.urlCheckStatuses.systemError.code;
443
+ }
444
+
445
+ // Content handling
374
446
  if (contentType.includes('pdf') || contentType.includes('octet-stream')) {
375
- res.content = ''; // Avoid triggering render/download
447
+ res.content = '%PDF-'; // avoid binary in memory / download
376
448
  } else {
377
449
  try {
378
- await page.waitForLoadState('networkidle', { timeout: 10000 });
379
- } catch {
380
- consoleLogger.info('Unable to detect networkidle');
381
- }
382
-
450
+ // Try to get a stable DOM; don't fail the check if it times out
451
+ await page.waitForLoadState('domcontentloaded', { timeout: 5000 });
452
+ } catch {}
383
453
  res.content = await page.content();
384
454
  }
385
455
 
386
456
  } catch (error) {
387
457
  if (error.message.includes('net::ERR_INVALID_AUTH_CREDENTIALS')) {
388
458
  res.status = constants.urlCheckStatuses.unauthorised.code;
459
+ } else if (error.message.includes('net::ERR_NAME_NOT_RESOLVED')) {
460
+ res.status = constants.urlCheckStatuses.cannotBeResolved.code;
461
+ } else if (error.message.includes('net::ERR_CONNECTION_REFUSED')) {
462
+ res.status = constants.urlCheckStatuses.connectionRefused.code;
463
+ } else if (error.message.includes('net::ERR_TIMED_OUT')) {
464
+ res.status = constants.urlCheckStatuses.timedOut.code;
465
+ } else if (error.message.includes('net::ERR_SSL_PROTOCOL_ERROR')) {
466
+ res.status = constants.urlCheckStatuses.sslProtocolError.code;
389
467
  } else {
390
- console.log(error);
468
+ consoleLogger.error(error);
391
469
  res.status = constants.urlCheckStatuses.systemError.code;
392
470
  }
393
471
  } finally {
@@ -397,6 +475,16 @@ const checkUrlConnectivityWithBrowser = async (
397
475
  return res;
398
476
  };
399
477
 
478
+ export const isPdfContent = (content: Buffer | string): boolean => {
479
+ let header: string;
480
+ if (Buffer.isBuffer(content)) {
481
+ header = content.toString('utf8', 0, 5);
482
+ } else {
483
+ header = content.substring(0, 5);
484
+ }
485
+ return header === '%PDF-';
486
+ };
487
+
400
488
  export const isSitemapContent = (content: string) => {
401
489
  const { isValid } = validateXML(content);
402
490
  if (isValid) {
@@ -426,27 +514,43 @@ export const checkUrl = async (
426
514
  clonedDataDir: string,
427
515
  playwrightDeviceDetailsObject: DeviceDescriptor,
428
516
  extraHTTPHeaders: Record<string, string>,
517
+ fileTypes: FileTypes
429
518
  ) => {
519
+
430
520
  const res = await checkUrlConnectivityWithBrowser(
431
- url,
432
- browser,
433
- clonedDataDir,
434
- playwrightDeviceDetailsObject,
435
- extraHTTPHeaders,
521
+ url,
522
+ browser,
523
+ clonedDataDir,
524
+ playwrightDeviceDetailsObject,
525
+ extraHTTPHeaders,
436
526
  );
437
527
 
438
- if (
439
- res.status === constants.urlCheckStatuses.success.code &&
440
- (scanner === ScannerTypes.SITEMAP || scanner === ScannerTypes.LOCALFILE)
441
- ) {
442
- const isSitemap = isSitemapContent(res.content);
528
+ // If response is 200 (meaning no other code was set earlier)
529
+ if (res.status === constants.urlCheckStatuses.success.code) {
530
+
531
+ // Check if document is pdf type
532
+ const isPdf = isPdfContent(res.content);
533
+
534
+ // Check if only HTML document is allowed to be scanned
535
+ if (fileTypes === FileTypes.HtmlOnly && isPdf) {
536
+ res.status = constants.urlCheckStatuses.notASupportedDocument.code;
537
+
538
+ // Check if only PDF document is allowed to be scanned
539
+ } else if (fileTypes === FileTypes.PdfOnly && !isPdf) {
540
+ res.status = constants.urlCheckStatuses.notAPdf.code;
443
541
 
444
- if (!isSitemap && scanner === ScannerTypes.LOCALFILE) {
445
- res.status = constants.urlCheckStatuses.notALocalFile.code;
446
- } else if (!isSitemap) {
447
- res.status = constants.urlCheckStatuses.notASitemap.code;
542
+ // Check if sitemap is expected
543
+ } else if (scanner === ScannerTypes.SITEMAP) {
544
+ const isSitemap = isSitemapContent(res.content);
545
+
546
+ if (!isSitemap) {
547
+ res.status = constants.urlCheckStatuses.notASitemap.code;
548
+ }
448
549
  }
550
+
551
+ // else proceed as normal
449
552
  }
553
+
450
554
  return res;
451
555
  };
452
556
 
@@ -486,7 +590,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
486
590
  viewportWidth,
487
591
  maxpages,
488
592
  strategy,
489
- isLocalFileScan = false,
593
+ isLocalFileScan = argv.scanner === ScannerTypes.LOCALFILE,
490
594
  browserToRun,
491
595
  nameEmail,
492
596
  customFlowLabel,
@@ -511,30 +615,34 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
511
615
  let username = '';
512
616
  let password = '';
513
617
 
618
+ // If a file path is provided
514
619
  if (isFilePath(url)) {
515
- argv.isLocalFileScan = true;
516
- }
620
+ // Set is as local file scan if not already so
621
+ isLocalFileScan = true;
517
622
 
518
- // Remove credentials from URL if not a local file scan
519
- url = argv.isLocalFileScan
520
- ? url
521
- : (() => {
522
- const temp = new URL(url);
523
- username = temp.username;
524
- password = temp.password;
623
+ // Convert to absolute path
624
+ url = path.resolve(url);
525
625
 
526
- if (username !== '' || password !== '') {
527
- extraHTTPHeaders['Authorization'] = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
528
- }
626
+ // Convert to file:// URL
627
+ url = convertPathToLocalFile(url);
628
+ } else {
629
+ // Check URL for basic auth embedded and move it to extraHTTPHeaders
630
+ const temp = new URL(url);
631
+ username = temp.username;
632
+ password = temp.password;
529
633
 
530
- temp.username = '';
531
- temp.password = '';
532
- return temp.toString();
533
- })();
634
+ if (username !== '' || password !== '') {
635
+ extraHTTPHeaders['Authorization'] = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
636
+ }
637
+
638
+ temp.username = '';
639
+ temp.password = '';
640
+ url = temp.toString();
641
+ }
534
642
 
535
643
  // construct filename for scan results
536
644
  const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
537
- const domain = argv.isLocalFileScan ? path.basename(argv.url) : new URL(argv.url).hostname;
645
+ const domain = isLocalFileScan ? path.basename(url) : new URL(url).hostname;
538
646
 
539
647
  const sanitisedLabel = customFlowLabel ? `_${customFlowLabel.replaceAll(' ', '_')}` : '';
540
648
  let resultFilename: string;
@@ -586,7 +694,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
586
694
  customFlowLabel,
587
695
  specifiedMaxConcurrency,
588
696
  randomToken: resultFilename,
589
- fileTypes,
697
+ fileTypes: FileTypes[getEnumKey(FileTypes, fileTypes) as keyof typeof FileTypes],
590
698
  blacklistedPatternsFilename,
591
699
  includeScreenshots: !(additional === 'none'),
592
700
  metadata,
@@ -1335,26 +1443,28 @@ export const cloneChromeProfiles = (randomToken: string): string => {
1335
1443
  destDir = path.join(baseDir, `oobee-${randomToken}`);
1336
1444
 
1337
1445
  if (fs.existsSync(destDir)) {
1338
- deleteClonedChromeProfiles(randomToken);
1339
- }
1340
-
1341
- if (!fs.existsSync(destDir)) {
1342
- fs.mkdirSync(destDir, { recursive: true });
1343
- }
1446
+ // Don't delete since it will be handled at the end of the scan
1447
+ // deleteClonedChromeProfiles(randomToken);
1448
+ // Assume it cloned and don't re-clone
1449
+ } else {
1450
+ if (!fs.existsSync(destDir)) {
1451
+ fs.mkdirSync(destDir, { recursive: true });
1452
+ }
1344
1453
 
1345
- const baseOptions = {
1346
- cwd: baseDir,
1347
- recursive: true,
1348
- absolute: true,
1349
- nodir: true,
1350
- };
1351
- const cloneLocalStateFileSuccess = cloneLocalStateFile(baseOptions, destDir);
1352
- if (cloneChromeProfileCookieFiles(baseOptions, destDir) && cloneLocalStateFileSuccess) {
1353
- return destDir;
1354
- }
1454
+ const baseOptions = {
1455
+ cwd: baseDir,
1456
+ recursive: true,
1457
+ absolute: true,
1458
+ nodir: true,
1459
+ };
1460
+ const cloneLocalStateFileSuccess = cloneLocalStateFile(baseOptions, destDir);
1461
+ if (cloneChromeProfileCookieFiles(baseOptions, destDir) && cloneLocalStateFileSuccess) {
1462
+ return destDir;
1463
+ }
1355
1464
 
1356
- consoleLogger.error('Failed to clone Chrome profiles. You may be logged out of your accounts.');
1465
+ consoleLogger.error('Failed to clone Chrome profiles. You may be logged out of your accounts.');
1357
1466
 
1467
+ }
1358
1468
  // For future reference, return a null instead to halt the scan
1359
1469
  return destDir;
1360
1470
  };
@@ -1371,10 +1481,11 @@ export const cloneChromiumProfiles = (randomToken: string): string => {
1371
1481
  destDir = path.join(baseDir, `oobee-${randomToken}`);
1372
1482
 
1373
1483
  if (fs.existsSync(destDir)) {
1374
- deleteClonedChromiumProfiles(randomToken);
1375
- }
1376
-
1377
- if (!fs.existsSync(destDir)) {
1484
+
1485
+ // Don't delete since it will be handled at the end of the scan
1486
+ // deleteClonedChromiumProfiles(randomToken);
1487
+ // Assume it cloned and don't re-clone
1488
+ } else {
1378
1489
  fs.mkdirSync(destDir, { recursive: true });
1379
1490
  }
1380
1491
 
@@ -1401,26 +1512,31 @@ export const cloneEdgeProfiles = (randomToken: string): string => {
1401
1512
  destDir = path.join(baseDir, `oobee-${randomToken}`);
1402
1513
 
1403
1514
  if (fs.existsSync(destDir)) {
1404
- deleteClonedEdgeProfiles(randomToken);
1405
- }
1406
1515
 
1407
- if (!fs.existsSync(destDir)) {
1408
- fs.mkdirSync(destDir, { recursive: true });
1409
- }
1516
+ // Don't delete since it will be handled at the end of the scan
1517
+ // deleteClonedEdgeProfiles(randomToken);
1518
+ // Assume it cloned and don't re-clone
1410
1519
 
1411
- const baseOptions = {
1412
- cwd: baseDir,
1413
- recursive: true,
1414
- absolute: true,
1415
- nodir: true,
1416
- };
1520
+ } else {
1521
+ if (!fs.existsSync(destDir)) {
1522
+ fs.mkdirSync(destDir, { recursive: true });
1523
+ }
1417
1524
 
1418
- const cloneLocalStateFileSuccess = cloneLocalStateFile(baseOptions, destDir);
1419
- if (cloneEdgeProfileCookieFiles(baseOptions, destDir) && cloneLocalStateFileSuccess) {
1420
- return destDir;
1421
- }
1525
+ const baseOptions = {
1526
+ cwd: baseDir,
1527
+ recursive: true,
1528
+ absolute: true,
1529
+ nodir: true,
1530
+ };
1531
+
1532
+ const cloneLocalStateFileSuccess = cloneLocalStateFile(baseOptions, destDir);
1533
+ if (cloneEdgeProfileCookieFiles(baseOptions, destDir) && cloneLocalStateFileSuccess) {
1534
+ return destDir;
1535
+ }
1536
+
1537
+ consoleLogger.error('Failed to clone Edge profiles. You may be logged out of your accounts.');
1422
1538
 
1423
- consoleLogger.error('Failed to clone Edge profiles. You may be logged out of your accounts.');
1539
+ }
1424
1540
 
1425
1541
  // For future reference, return a null instead to halt the scan
1426
1542
  return destDir;
@@ -1863,10 +1979,13 @@ export const isFilePath = (url: string): boolean => {
1863
1979
  const driveLetterPattern = /^[A-Z]:/i;
1864
1980
  const backslashPattern = /\\/;
1865
1981
  return (
1866
- url.startsWith('file://') ||
1867
1982
  url.startsWith('/') ||
1868
1983
  driveLetterPattern.test(url) ||
1869
- backslashPattern.test(url)
1984
+ backslashPattern.test(url) ||
1985
+ url.startsWith('./') ||
1986
+ url.startsWith('../') ||
1987
+ url.startsWith('.\\') ||
1988
+ url.startsWith('..\\')
1870
1989
  );
1871
1990
  };
1872
1991
 
@@ -252,6 +252,16 @@ export enum ScannerTypes {
252
252
  }
253
253
  /* eslint-enable no-unused-vars */
254
254
 
255
+ export enum FileTypes {
256
+ All = 'all',
257
+ PdfOnly = 'pdf-only',
258
+ HtmlOnly = 'html-only',
259
+ }
260
+
261
+ export function getEnumKey<E extends Record<string, string>>(enumObj: E, value: string): keyof E | undefined {
262
+ return (Object.keys(enumObj) as Array<keyof E>).find(k => enumObj[k] === value);
263
+ }
264
+
255
265
  export const guiInfoStatusTypes = {
256
266
  SCANNED: 'scanned',
257
267
  SKIPPED: 'skipped',
@@ -379,31 +389,28 @@ const wcagLinks = {
379
389
 
380
390
  const urlCheckStatuses = {
381
391
  success: { code: 0 },
382
- invalidUrl: { code: 11, message: 'Invalid URL or URL is not using http or https.' },
383
- cannotBeResolved: {
384
- code: 12,
385
- message:
386
- 'Provided URL cannot be accessed. Please verify your internet connectivity and the correctness of the domain.',
387
- },
392
+ invalidUrl: { code: 11, message: 'Invalid URL. Please check and try again.' },
393
+ cannotBeResolved: { code: 12, message: 'URL cannot be accessed. Please verify whether the website exists.' },
388
394
  errorStatusReceived: {
389
395
  // unused for now
390
396
  code: 13,
391
397
  message: 'Provided URL cannot be accessed. Server responded with code ', // append it with the response code received,
392
398
  },
393
- systemError: {
394
- code: 14,
395
- message: 'Something went wrong when verifying the URL. Please try again later.',
396
- },
397
- notASitemap: { code: 15, message: 'Provided URL is not a sitemap.' },
398
- unauthorised: { code: 16, message: 'Provided URL needs basic authorisation.' },
399
+ systemError: { code: 14, message: 'Something went wrong when verifying the URL. Please try again in a few minutes. If this issue persists, please contact the Oobee team.'},
400
+ notASitemap: { code: 15, message: 'Invalid sitemap URL format. Please enter a valid sitemap URL ending with .XML e.g. https://www.example.com/sitemap.xml.' },
401
+ unauthorised: { code: 16, message: 'Login required. Please enter your credentials and try again.' },
402
+ // browserError means engine could not find a browser to run the scan
399
403
  browserError: {
400
404
  code: 17,
401
405
  message:
402
- 'No browser available to run scans. Please ensure you have Chrome or Edge (for Windows only) installed.',
406
+ 'Incompatible browser. Please ensure you are using Chrome or Edge browser.',
403
407
  },
404
- axiosTimeout: { code: 18, message: 'Axios timeout exceeded. Falling back on browser checks.' },
405
- notALocalFile: { code: 19, message: 'Provided filepath is not a local html or sitemap file.' },
406
- terminationRequested: { code: 15, message: 'Termination requested.' }
408
+ sslProtocolError: { code: 18, message: 'SSL certificate error. Please check the SSL configuration of your website and try again.' },
409
+ notALocalFile: { code: 19, message: 'Uploaded file format is incorrect. Please upload a HTML, PDF, XML or TXT file.' },
410
+ notAPdf: { code: 20, message: 'URL/file format is incorrect. Please upload a PDF file.' },
411
+ notASupportedDocument: { code: 21, message: 'Uploaded file format is incorrect. Please upload a HTML, PDF, XML or TXT file.' },
412
+ connectionRefused: { code: 22, message: 'Connection refused. Please try again in a few minutes. If this issue persists, please contact the Oobee team.' },
413
+ timedOut: { code: 23, message: 'Request timed out. Please try again in a few minutes. If this issue persists, please contact the Oobee team.' },
407
414
  };
408
415
 
409
416
  /* eslint-disable no-unused-vars */
@@ -3,7 +3,6 @@ import { Answers } from '../index.js';
3
3
  import { getUserDataTxt, randomThreeDigitNumberString, setHeadlessMode } from '../utils.js';
4
4
  import {
5
5
  checkUrl,
6
- deleteClonedProfiles,
7
6
  getBrowserToRun,
8
7
  getPlaywrightDeviceDetailsObject,
9
8
  getUrlMessage,
@@ -14,7 +13,7 @@ import {
14
13
  validateCustomFlowLabel,
15
14
  parseHeaders,
16
15
  } from './common.js';
17
- import constants, { BrowserTypes, ScannerTypes } from './constants.js';
16
+ import constants, { BrowserTypes, FileTypes, ScannerTypes } from './constants.js';
18
17
  import { random } from 'lodash';
19
18
 
20
19
  const userData = getUserDataTxt();
@@ -58,6 +57,13 @@ const startScanQuestions = [
58
57
  name: 'viewportWidth',
59
58
  message: 'Specify width of the viewport in pixels (e.g. 360):',
60
59
  when: (answers: Answers) => answers.customDevice === 'Specify viewport',
60
+ filter: (input) => {
61
+ if (input === '' || input === undefined) {
62
+ return undefined; // return nothing instead of NaN
63
+ }
64
+ const n = Number(input);
65
+ return Number.isInteger(n) ? n : undefined;
66
+ },
61
67
  validate: (viewport: number) => {
62
68
  if (!Number.isInteger(viewport)) {
63
69
  return 'Invalid viewport width. Please provide an integer.';
@@ -117,39 +123,16 @@ const startScanQuestions = [
117
123
  clonedBrowserDataDir,
118
124
  playwrightDeviceDetailsObject,
119
125
  parseHeaders(answers.header),
126
+ FileTypes.All,
120
127
  );
121
128
 
122
- switch (res.status) {
123
- case statuses.success.code:
124
- answers.finalUrl = res.url;
125
- return true;
126
- case statuses.cannotBeResolved.code:
127
- return statuses.cannotBeResolved.message;
128
- case statuses.systemError.code:
129
- return statuses.systemError.message;
130
- case statuses.invalidUrl.code:
131
- if (answers.scanner !== (ScannerTypes.SITEMAP || ScannerTypes.LOCALFILE)) {
132
- return statuses.invalidUrl.message;
133
- }
134
-
135
- /* if sitemap scan is selected, treat this URL as a filepath
136
- isFileSitemap will tell whether the filepath exists, and if it does, whether the
137
- file is a sitemap */
138
- const finalFilePath = getFileSitemap(url);
139
- if (finalFilePath) {
140
- answers.isLocalFileScan = true;
141
- answers.finalUrl = finalFilePath;
142
- return true;
143
- }
144
- if (answers.scanner === ScannerTypes.LOCALFILE) {
145
- return statuses.notALocalFile.message;
146
- }
147
- return statuses.notASitemap.message;
148
-
149
- case statuses.notASitemap.code:
150
- return statuses.notASitemap.message;
151
- case statuses.notALocalFile.code:
152
- return statuses.notALocalFile.message;
129
+ if (res.status === statuses.success.code) {
130
+ answers.finalUrl = res.url;
131
+ return true;
132
+ } else {
133
+ const match = Object.values(statuses).find((s: any) => s.code === res.status);
134
+ const msg = match && 'message' in match ? match.message : 'Unknown error';
135
+ return msg;
153
136
  }
154
137
  },
155
138
  filter: (input: string) => sanitizeUrlInput(input.trim()).url,
@@ -318,9 +318,9 @@ export const runAxeScript = async ({
318
318
  page.on('console', msg => {
319
319
  const type = msg.type();
320
320
  if (type === 'error') {
321
- consoleLogger.log({ level: 'error', message: msg.text() });
321
+ consoleLogger.error(msg.text());
322
322
  } else {
323
- consoleLogger.log({ level: 'info', message: msg.text() });
323
+ consoleLogger.info(msg.text());
324
324
  }
325
325
  });
326
326
  */
@@ -20,6 +20,7 @@ import constants, {
20
20
  STATUS_CODE_METADATA,
21
21
  disallowedListOfPatterns,
22
22
  disallowedSelectorPatterns,
23
+ FileTypes,
23
24
  } from '../constants/constants.js';
24
25
  import {
25
26
  getPlaywrightLaunchOptions,
@@ -88,7 +89,7 @@ const crawlDomain = async ({
88
89
  userDataDirectory: string;
89
90
  strategy: EnqueueStrategy;
90
91
  specifiedMaxConcurrency: number;
91
- fileTypes: string;
92
+ fileTypes: FileTypes;
92
93
  blacklistedPatterns: string[];
93
94
  includeScreenshots: boolean;
94
95
  followRobots: boolean;
@@ -117,8 +118,8 @@ const crawlDomain = async ({
117
118
 
118
119
  const pdfDownloads: Promise<void>[] = [];
119
120
  const uuidToPdfMapping: Record<string, string> = {};
120
- const isScanHtml = ['all', 'html-only'].includes(fileTypes);
121
- const isScanPdfs = ['all', 'pdf-only'].includes(fileTypes);
121
+ const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes as FileTypes);
122
+ const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes as FileTypes);
122
123
  const { maxConcurrency } = constants;
123
124
  const { playwrightDeviceDetailsObject } = viewportSettings;
124
125
 
@@ -484,6 +485,9 @@ const crawlDomain = async ({
484
485
  // handle pdfs
485
486
  if (shouldSkipDueToUnsupportedContent(response, request.url) || (request.skipNavigation && actualUrl === 'about:blank')) {
486
487
  if (!isScanPdfs) {
488
+
489
+ // Don't inform the user it is skipped since web crawler is best-effort.
490
+ /*
487
491
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
488
492
  numScanned: urlsCrawled.scanned.length,
489
493
  urlScanned: request.url,
@@ -495,6 +499,7 @@ const crawlDomain = async ({
495
499
  metadata: STATUS_CODE_METADATA[1],
496
500
  httpStatusCode: 0,
497
501
  });
502
+ */
498
503
 
499
504
  return;
500
505
  }
@@ -511,6 +516,9 @@ const crawlDomain = async ({
511
516
  }
512
517
 
513
518
  if (isBlacklistedFileExtensions(actualUrl, blackListedFileExtensions)) {
519
+
520
+ // Don't inform the user it is skipped since web crawler is best-effort.
521
+ /*
514
522
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
515
523
  numScanned: urlsCrawled.scanned.length,
516
524
  urlScanned: request.url,
@@ -522,7 +530,7 @@ const crawlDomain = async ({
522
530
  metadata: STATUS_CODE_METADATA[1],
523
531
  httpStatusCode: 0,
524
532
  });
525
-
533
+ */
526
534
  return;
527
535
  }
528
536
 
@@ -631,6 +639,9 @@ const crawlDomain = async ({
631
639
  }
632
640
  }
633
641
  } else {
642
+
643
+ // Don't inform the user it is skipped since web crawler is best-effort.
644
+ /*
634
645
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
635
646
  numScanned: urlsCrawled.scanned.length,
636
647
  urlScanned: request.url,
@@ -642,6 +653,7 @@ const crawlDomain = async ({
642
653
  metadata: STATUS_CODE_METADATA[1],
643
654
  httpStatusCode: 0,
644
655
  });
656
+ */
645
657
  }
646
658
 
647
659
  if (followRobots) await getUrlsFromRobotsTxt(request.url, browser, userDataDirectory, extraHTTPHeaders);
@@ -1,7 +1,7 @@
1
1
  import fs from 'fs';
2
2
  import { chromium, Page } from 'playwright';
3
3
  import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
4
- import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
4
+ import constants, { FileTypes, guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
5
5
  import { consoleLogger, guiInfoLog } from '../logs.js';
6
6
  import crawlDomain from './crawlDomain.js';
7
7
  import crawlSitemap from './crawlSitemap.js';
@@ -20,7 +20,7 @@ const crawlIntelligentSitemap = async (
20
20
  userDataDirectory: string,
21
21
  strategy: EnqueueStrategy,
22
22
  specifiedMaxConcurrency: number,
23
- fileTypes: string,
23
+ fileTypes: FileTypes,
24
24
  blacklistedPatterns: string[],
25
25
  includeScreenshots: boolean,
26
26
  followRobots: boolean,
@@ -7,6 +7,7 @@ import constants, {
7
7
  basicAuthRegex,
8
8
  UrlsCrawled,
9
9
  STATUS_CODE_METADATA,
10
+ FileTypes,
10
11
  } from '../constants/constants.js';
11
12
  import { ViewportSettingsClass } from '../combine.js';
12
13
  import {
@@ -47,7 +48,7 @@ export const crawlLocalFile = async ({
47
48
  browser: string;
48
49
  userDataDirectory: string;
49
50
  specifiedMaxConcurrency: number;
50
- fileTypes: string;
51
+ fileTypes: FileTypes;
51
52
  blacklistedPatterns: string[];
52
53
  includeScreenshots: boolean;
53
54
  extraHTTPHeaders: Record<string, string>;
@@ -12,6 +12,7 @@ import constants, {
12
12
  guiInfoStatusTypes,
13
13
  UrlsCrawled,
14
14
  disallowedListOfPatterns,
15
+ FileTypes,
15
16
  } from '../constants/constants.js';
16
17
  import {
17
18
  getLinksFromSitemap,
@@ -55,7 +56,7 @@ const crawlSitemap = async ({
55
56
  browser: string;
56
57
  userDataDirectory: string;
57
58
  specifiedMaxConcurrency: number;
58
- fileTypes: string;
59
+ fileTypes: FileTypes;
59
60
  blacklistedPatterns: string[];
60
61
  includeScreenshots: boolean;
61
62
  extraHTTPHeaders: Record<string, string>;
@@ -97,8 +98,8 @@ const crawlSitemap = async ({
97
98
 
98
99
  const pdfDownloads: Promise<void>[] = [];
99
100
  const uuidToPdfMapping: Record<string, string> = {};
100
- const isScanHtml = ['all', 'html-only'].includes(fileTypes);
101
- const isScanPdfs = ['all', 'pdf-only'].includes(fileTypes);
101
+ const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes as FileTypes);
102
+ const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes as FileTypes);
102
103
  const { playwrightDeviceDetailsObject } = viewportSettings;
103
104
  const { maxConcurrency } = constants;
104
105
 
@@ -288,30 +288,31 @@ export const handlePdfDownload = (
288
288
  downloadFile.write(buf, 'binary');
289
289
  downloadFile.end();
290
290
 
291
- if (isPDF(buf)) {
292
- guiInfoLog(guiInfoStatusTypes.SCANNED, {
293
- numScanned: urlsCrawled.scanned.length,
294
- urlScanned: request.url,
295
- });
296
- urlsCrawled.scanned.push({
297
- url: request.url,
298
- pageTitle,
299
- actualUrl: url,
300
- });
301
- } else {
302
- guiInfoLog(guiInfoStatusTypes.SKIPPED, {
303
- numScanned: urlsCrawled.scanned.length,
304
- urlScanned: request.url,
305
- });
306
- urlsCrawled.invalid.push({
307
- url: request.url,
308
- pageTitle: url,
309
- actualUrl: url,
310
- metadata: STATUS_CODE_METADATA[1],
311
- });
312
- }
313
-
314
- resolve();
291
+ downloadFile.on('finish', () => {
292
+ if (isPDF(buf)) {
293
+ guiInfoLog(guiInfoStatusTypes.SCANNED, {
294
+ numScanned: urlsCrawled.scanned.length,
295
+ urlScanned: request.url,
296
+ });
297
+ urlsCrawled.scanned.push({
298
+ url: request.url,
299
+ pageTitle,
300
+ actualUrl: url,
301
+ });
302
+ } else {
303
+ guiInfoLog(guiInfoStatusTypes.SKIPPED, {
304
+ numScanned: urlsCrawled.scanned.length,
305
+ urlScanned: request.url,
306
+ });
307
+ urlsCrawled.invalid.push({
308
+ url: request.url,
309
+ pageTitle: url,
310
+ actualUrl: url,
311
+ metadata: STATUS_CODE_METADATA[1],
312
+ });
313
+ }
314
+ resolve();
315
+ });
315
316
 
316
317
  }),
317
318
  );
@@ -346,6 +347,9 @@ export const runPdfScan = async (randomToken: string) => {
346
347
  ];
347
348
 
348
349
  const ls = spawnSync(veraPdfExe, veraPdfCmdArgs, { shell: true });
350
+ if (ls.stderr && ls.stderr.length > 0)
351
+ consoleLogger.error(ls.stderr.toString());
352
+
349
353
  fs.writeFileSync(intermediateResultPath, ls.stdout, { encoding: 'utf-8' });
350
354
  };
351
355
 
@@ -363,7 +367,7 @@ export const mapPdfScanResults = async (
363
367
  try {
364
368
  parsedJsonData = JSON.parse(rawdata);
365
369
  } catch (err) {
366
- consoleLogger.log(err);
370
+ consoleLogger.error(err);
367
371
  }
368
372
 
369
373
  const errorMeta = require('../constants/errorMeta.json');
package/src/index.ts CHANGED
@@ -4,7 +4,6 @@ import inquirer from 'inquirer';
4
4
  import { EnqueueStrategy } from 'crawlee';
5
5
  import {
6
6
  getVersion,
7
- cleanUp,
8
7
  getUserDataTxt,
9
8
  writeToUserDataTxt,
10
9
  listenForCleanUp,
@@ -21,7 +20,7 @@ import {
21
20
  } from './constants/common.js';
22
21
  import questions from './constants/questions.js';
23
22
  import combineRun from './combine.js';
24
- import { BrowserTypes, RuleFlags, ScannerTypes } from './constants/constants.js';
23
+ import { BrowserTypes, FileTypes, RuleFlags, ScannerTypes } from './constants/constants.js';
25
24
  import { DeviceDescriptor } from './types/types.js';
26
25
 
27
26
  export type Answers = {
@@ -35,7 +34,7 @@ export type Answers = {
35
34
  clonedBrowserDataDir: string;
36
35
  playwrightDeviceDetailsObject: DeviceDescriptor;
37
36
  nameEmail: string;
38
- fileTypes: string;
37
+ fileTypes: FileTypes;
39
38
  metadata: string;
40
39
  maxpages: number;
41
40
  strategy: string;
@@ -72,7 +71,7 @@ export type Data = {
72
71
  customFlowLabel: string;
73
72
  specifiedMaxConcurrency: number;
74
73
  randomToken: string;
75
- fileTypes: string;
74
+ fileTypes: FileTypes;
76
75
  blacklistedPatternsFilename: string;
77
76
  includeScreenshots: boolean;
78
77
  metadata: string;
@@ -104,7 +103,7 @@ const runScan = async (answers: Answers) => {
104
103
  answers.nameEmail = `${userData.name}:${userData.email}`;
105
104
  }
106
105
 
107
- answers.fileTypes = 'html-only';
106
+ answers.fileTypes = FileTypes.All;
108
107
  answers.metadata = '{}';
109
108
 
110
109
  const data: Data = await prepareData(answers);
@@ -970,7 +970,7 @@ const writeSummaryPdf = async (storagePath: string, pagesScanned: number, filena
970
970
  ? userDataDirectory
971
971
  : '';
972
972
  const context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
973
- headless: process.env.CRAWLEE_HEADLESS === '1',
973
+ headless: true,
974
974
  ...getPlaywrightLaunchOptions(browser),
975
975
  });
976
976
 
@@ -2014,35 +2014,35 @@ const generateArtifacts = async (
2014
2014
  constants.cliZipFileName = path.join(storagePath, constants.cliZipFileName);
2015
2015
  }
2016
2016
 
2017
- await fs
2018
- .ensureDir(storagePath)
2019
- .then(() => {
2020
- zipResults(constants.cliZipFileName, storagePath);
2021
- const messageToDisplay = [
2022
- `Report of this run is at ${constants.cliZipFileName}`,
2023
- `Results directory is at ${storagePath}`,
2024
- ];
2025
-
2026
- if (process.send && process.env.OOBEE_VERBOSE) {
2027
- const zipFileNameMessage = {
2028
- type: 'zipFileName',
2029
- payload: `${constants.cliZipFileName}`,
2030
- };
2031
- const storagePathMessage = {
2032
- type: 'storagePath',
2033
- payload: `${storagePath}`,
2034
- };
2017
+ try {
2018
+ await fs.ensureDir(storagePath);
2035
2019
 
2036
- process.send(JSON.stringify(storagePathMessage));
2020
+ await zipResults(constants.cliZipFileName, storagePath);
2037
2021
 
2038
- process.send(JSON.stringify(zipFileNameMessage));
2039
- }
2022
+ const messageToDisplay = [
2023
+ `Report of this run is at ${constants.cliZipFileName}`,
2024
+ `Results directory is at ${storagePath}`,
2025
+ ];
2040
2026
 
2041
- printMessage(messageToDisplay);
2042
- })
2043
- .catch(error => {
2044
- printMessage([`Error in zipping results: ${error}`]);
2045
- });
2027
+ if (process.send && process.env.OOBEE_VERBOSE) {
2028
+ const zipFileNameMessage = {
2029
+ type: 'zipFileName',
2030
+ payload: `${constants.cliZipFileName}`,
2031
+ };
2032
+ const storagePathMessage = {
2033
+ type: 'storagePath',
2034
+ payload: `${storagePath}`,
2035
+ };
2036
+
2037
+ process.send(JSON.stringify(storagePathMessage));
2038
+
2039
+ process.send(JSON.stringify(zipFileNameMessage));
2040
+ }
2041
+
2042
+ printMessage(messageToDisplay);
2043
+ } catch (error) {
2044
+ printMessage([`Error in zipping results: ${error}`]);
2045
+ }
2046
2046
 
2047
2047
  // Generate scrubbed HTML Code Snippets
2048
2048
  const ruleIdJson = createRuleIdJson(allIssues);
package/src/utils.ts CHANGED
@@ -1002,8 +1002,6 @@ export const zipResults = async (zipName: string, resultsPath: string): Promise<
1002
1002
  }
1003
1003
  }
1004
1004
 
1005
- await addFolderToZip(resultsPath, new JSZip());
1006
-
1007
1005
  const zip = new JSZip();
1008
1006
  await addFolderToZip(resultsPath, zip);
1009
1007