@govtechsg/oobee 0.10.42 → 0.10.43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/REPORTS.md CHANGED
@@ -242,7 +242,9 @@ This file contains a summary of pages affected by accessibility issues.
242
242
  {
243
243
  "url": "<string>",
244
244
  "pageTitle": "<string>",
245
- "actualUrl": "about:blank"
245
+ "actualUrl": "<string>",
246
+ "metadata": "<string>",
247
+ "httpStatusCode": number
246
248
  },
247
249
  ],
248
250
  "pagesNotScannedCount": <number>
@@ -340,7 +342,9 @@ This file contains a summary of accessibility issues found in a scan, categorize
340
342
  {
341
343
  "url": "<string>",
342
344
  "pageTitle": "<string>",
343
- "actualUrl": "about:blank"
345
+ "actualUrl": "<string>",
346
+ "metadata": "<string>",
347
+ "httpStatusCode": number
344
348
  },
345
349
  ],
346
350
  "pagesNotScannedCount": <number>
@@ -360,3 +364,68 @@ To deflate the .json.gz.b64, use the following with `pako` library installed:
360
364
  // Parse and return the JSON object
361
365
  return JSON.parse(jsonString);
362
366
  ```
367
+
368
+ ## HTTP Status Codes Returned for Skipped Pages
369
+ In scanPagesSummary.json and scanPagesDetail.json, within each `pagesNotScanned` entry, the following httpStatusCode and metadata values are stored to provide a reason why the page could not be scanned.
370
+
371
+ | httpStatusCode | metadata |
372
+ |------|------------------------------------------------|
373
+ | 0 | Page Excluded |
374
+ | 1 | Not A Supported Document |
375
+ | 2 | Web Crawler Errored |
376
+ | 100 | 100 – Continue |
377
+ | 101 | 101 – Switching Protocols |
378
+ | 102 | 102 – Processing |
379
+ | 103 | 103 – Early Hints |
380
+ | 200 | 200 – However Page Could Not Be Scanned |
381
+ | 204 | 204 – No Content |
382
+ | 205 | 205 – Reset Content |
383
+ | 300 | 300 – Multiple Choices |
384
+ | 301 | 301 – Moved Permanently |
385
+ | 302 | 302 – Found |
386
+ | 303 | 303 – See Other |
387
+ | 304 | 304 – Not Modified |
388
+ | 305 | 305 – Use Proxy |
389
+ | 307 | 307 – Temporary Redirect |
390
+ | 308 | 308 – Permanent Redirect |
391
+ | 400 | 400 – Bad Request |
392
+ | 401 | 401 – Unauthorized |
393
+ | 402 | 402 – Payment Required |
394
+ | 403 | 403 – Forbidden |
395
+ | 404 | 404 – Not Found |
396
+ | 405 | 405 – Method Not Allowed |
397
+ | 406 | 406 – Not Acceptable |
398
+ | 407 | 407 – Proxy Authentication Required |
399
+ | 408 | 408 – Request Timeout |
400
+ | 409 | 409 – Conflict |
401
+ | 410 | 410 – Gone |
402
+ | 411 | 411 – Length Required |
403
+ | 412 | 412 – Precondition Failed |
404
+ | 413 | 413 – Payload Too Large |
405
+ | 414 | 414 – URI Too Long |
406
+ | 415 | 415 – Unsupported Media Type |
407
+ | 416 | 416 – Range Not Satisfiable |
408
+ | 417 | 417 – Expectation Failed |
409
+ | 418 | 418 – I’m a teapot |
410
+ | 421 | 421 – Misdirected Request |
411
+ | 422 | 422 – Unprocessable Content |
412
+ | 423 | 423 – Locked |
413
+ | 424 | 424 – Failed Dependency |
414
+ | 425 | 425 – Too Early |
415
+ | 426 | 426 – Upgrade Required |
416
+ | 428 | 428 – Precondition Required |
417
+ | 429 | 429 – Too Many Requests |
418
+ | 431 | 431 – Request Header Fields Too Large |
419
+ | 451 | 451 – Unavailable For Legal Reasons |
420
+ | 500 | 500 – Internal Server Error |
421
+ | 501 | 501 – Not Implemented |
422
+ | 502 | 502 – Bad Gateway |
423
+ | 503 | 503 – Service Unavailable |
424
+ | 504 | 504 – Gateway Timeout |
425
+ | 505 | 505 – HTTP Version Not Supported |
426
+ | 506 | 506 – Variant Also Negotiates |
427
+ | 507 | 507 – Insufficient Storage |
428
+ | 508 | 508 – Loop Detected |
429
+ | 510 | 510 – Not Extended |
430
+ | 511 | 511 – Network Authentication Required |
431
+ | 599 | Uncommon Response Status Code Received |
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@govtechsg/oobee",
3
3
  "main": "dist/npmIndex.js",
4
- "version": "0.10.42",
4
+ "version": "0.10.43",
5
5
  "type": "module",
6
6
  "author": "Government Technology Agency <info@tech.gov.sg>",
7
7
  "dependencies": {
@@ -46,6 +46,7 @@
46
46
  "@types/fs-extra": "^11.0.4",
47
47
  "@types/inquirer": "^9.0.7",
48
48
  "@types/lodash": "^4.17.7",
49
+ "@types/mime-types": "^2.1.4",
49
50
  "@types/safe-regex": "^1.1.6",
50
51
  "@types/validator": "^13.11.10",
51
52
  "@types/which": "^3.0.4",
@@ -97,4 +98,4 @@
97
98
  "url": "https://github.com/GovTechSG/oobee/issues"
98
99
  },
99
100
  "homepage": "https://github.com/GovTechSG/oobee#readme"
100
- }
101
+ }
package/src/cli.ts CHANGED
@@ -137,9 +137,6 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`,
137
137
  printMessage([`Invalid blacklistedPatternsFilename file path. ${err}`], messageOptions);
138
138
  process.exit(1);
139
139
  }
140
-
141
- // eslint-disable-next-line no-unreachable
142
- return null;
143
140
  })
144
141
  .coerce('i', option => {
145
142
  const { choices } = cliOptions.i;
@@ -241,7 +238,7 @@ const scanInit = async (argvs: Answers): Promise<string> => {
241
238
  clonedDataDir,
242
239
  updatedArgvs.playwrightDeviceDetailsObject,
243
240
  isCustomFlow,
244
- updatedArgvs.header,
241
+ parseHeaders(updatedArgvs.header),
245
242
  );
246
243
  switch (res.status) {
247
244
  case statuses.success.code: {
@@ -255,17 +252,14 @@ const scanInit = async (argvs: Answers): Promise<string> => {
255
252
  case statuses.unauthorised.code: {
256
253
  printMessage([statuses.unauthorised.message], messageOptions);
257
254
  process.exit(res.status);
258
- break;
259
255
  }
260
256
  case statuses.cannotBeResolved.code: {
261
257
  printMessage([statuses.cannotBeResolved.message], messageOptions);
262
258
  process.exit(res.status);
263
- break;
264
259
  }
265
260
  case statuses.systemError.code: {
266
261
  printMessage([statuses.systemError.message], messageOptions);
267
262
  process.exit(res.status);
268
- break;
269
263
  }
270
264
  case statuses.invalidUrl.code: {
271
265
  if (
@@ -296,17 +290,14 @@ const scanInit = async (argvs: Answers): Promise<string> => {
296
290
  case statuses.notASitemap.code: {
297
291
  printMessage([statuses.notASitemap.message], messageOptions);
298
292
  process.exit(res.status);
299
- break;
300
293
  }
301
294
  case statuses.notALocalFile.code: {
302
295
  printMessage([statuses.notALocalFile.message], messageOptions);
303
296
  process.exit(res.status);
304
- break;
305
297
  }
306
298
  case statuses.browserError.code: {
307
299
  printMessage([statuses.browserError.message], messageOptions);
308
300
  process.exit(res.status);
309
- break;
310
301
  }
311
302
  default:
312
303
  break;
@@ -362,7 +353,7 @@ const scanInit = async (argvs: Answers): Promise<string> => {
362
353
  }
363
354
 
364
355
  // Delete dataset and request queues
365
- await cleanUp(data.randomToken);
356
+ cleanUp(data.randomToken);
366
357
 
367
358
  return getStoragePath(data.randomToken);
368
359
  };
@@ -15,8 +15,8 @@ import safe from 'safe-regex';
15
15
  import * as https from 'https';
16
16
  import os from 'os';
17
17
  import { minimatch } from 'minimatch';
18
- import { globSync } from 'glob';
19
- import { LaunchOptions, devices, webkit } from 'playwright';
18
+ import { globSync, GlobOptionsWithFileTypesFalse } from 'glob';
19
+ import { LaunchOptions, Locator, Page, devices, webkit } from 'playwright';
20
20
  import printMessage from 'print-message';
21
21
  import constants, {
22
22
  getDefaultChromeDataDir,
@@ -31,6 +31,7 @@ import { silentLogger } from '../logs.js';
31
31
  import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
32
32
  import { randomThreeDigitNumberString } from '../utils.js';
33
33
  import { Answers, Data } from '../index.js';
34
+ import { DeviceDescriptor } from '../types/types.js';
34
35
 
35
36
  // validateDirPath validates a provided directory path
36
37
  // returns null if no error
@@ -252,7 +253,7 @@ export const getUrlMessage = (scanner: ScannerTypes): string => {
252
253
  }
253
254
  };
254
255
 
255
- export const isInputValid = inputString => {
256
+ export const isInputValid = (inputString: string): boolean => {
256
257
  if (!validator.isEmpty(inputString)) {
257
258
  const removeBlackListCharacters = validator.escape(inputString);
258
259
 
@@ -373,12 +374,12 @@ const requestToUrl = async (
373
374
  };
374
375
 
375
376
  const checkUrlConnectivityWithBrowser = async (
376
- url,
377
- browserToRun,
378
- clonedDataDir,
379
- playwrightDeviceDetailsObject,
380
- isCustomFlow,
381
- extraHTTPHeaders,
377
+ url: string,
378
+ browserToRun: string,
379
+ clonedDataDir: string,
380
+ playwrightDeviceDetailsObject: DeviceDescriptor,
381
+ isCustomFlow: boolean,
382
+ extraHTTPHeaders: Record<string, string>,
382
383
  ) => {
383
384
  const res = new RES();
384
385
 
@@ -468,7 +469,6 @@ const checkUrlConnectivityWithBrowser = async (
468
469
  res.content = responseFromUrl.content;
469
470
  }
470
471
  } catch (error) {
471
-
472
472
  // But this does work with the headless=new flag
473
473
  if (error.message.includes('net::ERR_INVALID_AUTH_CREDENTIALS')) {
474
474
  res.status = constants.urlCheckStatuses.unauthorised.code;
@@ -510,13 +510,13 @@ export const isSitemapContent = (content: string) => {
510
510
  };
511
511
 
512
512
  export const checkUrl = async (
513
- scanner,
514
- url,
515
- browser,
516
- clonedDataDir,
517
- playwrightDeviceDetailsObject,
518
- isCustomFlow,
519
- extraHTTPHeaders,
513
+ scanner: ScannerTypes,
514
+ url: string,
515
+ browser: string,
516
+ clonedDataDir: string,
517
+ playwrightDeviceDetailsObject: DeviceDescriptor,
518
+ isCustomFlow: boolean,
519
+ extraHTTPHeaders: Record<string, string>,
520
520
  ) => {
521
521
  const res = await checkUrlConnectivityWithBrowser(
522
522
  url,
@@ -548,7 +548,7 @@ export const parseHeaders = (header?: string): Record<string, string> => {
548
548
  // parse HTTP headers from string
549
549
  if (!header) return {};
550
550
  const headerValues = header.split(', ');
551
- const allHeaders = {};
551
+ const allHeaders: Record<string, string> = {};
552
552
  headerValues.map((headerValue: string) => {
553
553
  const headerValuePair = headerValue.split(/ (.*)/s);
554
554
  if (headerValuePair.length < 2) {
@@ -776,11 +776,11 @@ export const getLinksFromSitemap = async (
776
776
  password: string,
777
777
  ) => {
778
778
  const scannedSitemaps = new Set<string>();
779
- const urls = {}; // dictionary of requests to urls to be scanned
779
+ const urls: Record<string, Request> = {}; // dictionary of requests to urls to be scanned
780
780
 
781
781
  const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;
782
782
 
783
- const addToUrlList = url => {
783
+ const addToUrlList = (url: string) => {
784
784
  if (!url) return;
785
785
  if (isDisallowedInRobotsTxt(url)) return;
786
786
 
@@ -803,14 +803,14 @@ export const getLinksFromSitemap = async (
803
803
  urls[url] = request;
804
804
  };
805
805
 
806
- const addBasicAuthCredentials = (url, username, password) => {
806
+ const addBasicAuthCredentials = (url: string, username: string, password: string) => {
807
807
  const urlObject = new URL(url);
808
808
  urlObject.username = username;
809
809
  urlObject.password = password;
810
810
  return urlObject.toString();
811
811
  };
812
812
 
813
- const calculateCloseness = sitemapUrl => {
813
+ const calculateCloseness = (sitemapUrl: string) => {
814
814
  // Remove 'http://', 'https://', and 'www.' prefixes from the URLs
815
815
  const normalizedSitemapUrl = sitemapUrl.replace(/^(https?:\/\/)?(www\.)?/, '');
816
816
  const normalizedUserUrlInput = userUrlInput
@@ -825,10 +825,16 @@ export const getLinksFromSitemap = async (
825
825
  }
826
826
  return 0;
827
827
  };
828
- const processXmlSitemap = async ($, sitemapType, linkSelector, dateSelector, sectionSelector) => {
829
- const urlList = [];
828
+ const processXmlSitemap = async (
829
+ $: cheerio.CheerioAPI,
830
+ sitemapType: number,
831
+ linkSelector: string,
832
+ dateSelector: string,
833
+ sectionSelector: string,
834
+ ) => {
835
+ const urlList: { url: string; lastModifiedDate: Date }[] = [];
830
836
  // Iterate through each URL element in the sitemap, collect url and modified date
831
- $(sectionSelector).each((index, urlElement) => {
837
+ $(sectionSelector).each((_index, urlElement) => {
832
838
  let url;
833
839
  if (sitemapType === constants.xmlSitemapTypes.atom) {
834
840
  url = $(urlElement).find(linkSelector).prop('href');
@@ -850,8 +856,7 @@ export const getLinksFromSitemap = async (
850
856
  }
851
857
 
852
858
  // If closeness is the same, sort by last modified date in descending order
853
- const dateDifference = (b.lastModifiedDate || 0) - (a.lastModifiedDate || 0);
854
- return dateDifference !== 0 ? dateDifference : 0; // Maintain original order for equal dates
859
+ return (b.lastModifiedDate?.getTime() || 0) - (a.lastModifiedDate?.getTime() || 0);
855
860
  });
856
861
  }
857
862
 
@@ -861,7 +866,7 @@ export const getLinksFromSitemap = async (
861
866
  }
862
867
  };
863
868
 
864
- const processNonStandardSitemap = data => {
869
+ const processNonStandardSitemap = (data: string) => {
865
870
  const urlsFromData = crawlee
866
871
  .extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') })
867
872
  .slice(0, maxLinksCount);
@@ -934,7 +939,7 @@ export const getLinksFromSitemap = async (
934
939
  const sitemapIndex = page.locator('sitemapindex');
935
940
  const rss = page.locator('rss');
936
941
  const feed = page.locator('feed');
937
- const isRoot = async locator => (await locator.count()) > 0;
942
+ const isRoot = async (locator: Locator) => (await locator.count()) > 0;
938
943
 
939
944
  if (await isRoot(urlSet)) {
940
945
  data = await urlSet.evaluate(elem => elem.outerHTML);
@@ -1054,14 +1059,14 @@ export const getLinksFromSitemap = async (
1054
1059
  return requestList;
1055
1060
  };
1056
1061
 
1057
- export const validEmail = email => {
1062
+ export const validEmail = (email: string) => {
1058
1063
  const emailRegex = /^.+@.+\..+$/u;
1059
1064
 
1060
1065
  return emailRegex.test(email);
1061
1066
  };
1062
1067
 
1063
1068
  // For new user flow.
1064
- export const validName = name => {
1069
+ export const validName = (name: string) => {
1065
1070
  // Allow only printable characters from any language
1066
1071
  const regex = /^[\p{L}\p{N}\s'".,()\[\]{}!?:؛،؟…]+$/u;
1067
1072
 
@@ -1213,11 +1218,11 @@ export const getEdgeData = () => {
1213
1218
  * @param {*} destDir destination directory
1214
1219
  * @returns boolean indicating whether the operation was successful
1215
1220
  */
1216
- const cloneChromeProfileCookieFiles = (options, destDir) => {
1221
+ const cloneChromeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, destDir: string) => {
1217
1222
  let profileCookiesDir;
1218
1223
  // Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
1219
1224
  // and ../Chrome/<profile name>/Cookies for mac
1220
- let profileNamesRegex;
1225
+ let profileNamesRegex: RegExp;
1221
1226
  if (os.platform() === 'win32') {
1222
1227
  profileCookiesDir = globSync('**/Network/Cookies', {
1223
1228
  ...options,
@@ -1288,11 +1293,11 @@ const cloneChromeProfileCookieFiles = (options, destDir) => {
1288
1293
  * @param {*} destDir destination directory
1289
1294
  * @returns boolean indicating whether the operation was successful
1290
1295
  */
1291
- const cloneEdgeProfileCookieFiles = (options, destDir) => {
1296
+ const cloneEdgeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, destDir: string) => {
1292
1297
  let profileCookiesDir;
1293
1298
  // Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
1294
1299
  // and ../Chrome/<profile name>/Cookies for mac
1295
- let profileNamesRegex;
1300
+ let profileNamesRegex: RegExp;
1296
1301
  // Ignores the cloned oobee directory if exists
1297
1302
  if (os.platform() === 'win32') {
1298
1303
  profileCookiesDir = globSync('**/Network/Cookies', {
@@ -1361,7 +1366,7 @@ const cloneEdgeProfileCookieFiles = (options, destDir) => {
1361
1366
  * @param {string} destDir - destination directory
1362
1367
  * @returns boolean indicating whether the operation was successful
1363
1368
  */
1364
- const cloneLocalStateFile = (options, destDir) => {
1369
+ const cloneLocalStateFile = (options: GlobOptionsWithFileTypesFalse, destDir: string) => {
1365
1370
  const localState = globSync('**/*Local State', {
1366
1371
  ...options,
1367
1372
  maxDepth: 1,
@@ -1647,8 +1652,9 @@ export const getPlaywrightDeviceDetailsObject = (
1647
1652
  deviceChosen: string,
1648
1653
  customDevice: string,
1649
1654
  viewportWidth: number,
1650
- ) => {
1651
- let playwrightDeviceDetailsObject = {};
1655
+ ): DeviceDescriptor => {
1656
+ let playwrightDeviceDetailsObject = devices['Desktop Chrome']; // default to Desktop Chrome
1657
+
1652
1658
  if (deviceChosen === 'Mobile' || customDevice === 'iPhone 11') {
1653
1659
  playwrightDeviceDetailsObject = devices['iPhone 11'];
1654
1660
  } else if (customDevice === 'Samsung Galaxy S9+') {
@@ -1656,6 +1662,11 @@ export const getPlaywrightDeviceDetailsObject = (
1656
1662
  } else if (viewportWidth) {
1657
1663
  playwrightDeviceDetailsObject = {
1658
1664
  viewport: { width: viewportWidth, height: 720 },
1665
+ isMobile: false,
1666
+ hasTouch: false,
1667
+ userAgent: devices['Desktop Chrome'].userAgent,
1668
+ deviceScaleFactor: 1,
1669
+ defaultBrowserType: 'chromium',
1659
1670
  };
1660
1671
  } else if (customDevice) {
1661
1672
  playwrightDeviceDetailsObject = devices[customDevice.replace(/_/g, ' ')];
@@ -1777,14 +1788,17 @@ export const submitForm = async (
1777
1788
  }
1778
1789
  };
1779
1790
 
1780
- export async function initModifiedUserAgent(browser?: string, playwrightDeviceDetailsObject?: object) {
1791
+ export async function initModifiedUserAgent(
1792
+ browser?: string,
1793
+ playwrightDeviceDetailsObject?: object,
1794
+ ) {
1781
1795
  const isHeadless = process.env.CRAWLEE_HEADLESS === '1';
1782
-
1796
+
1783
1797
  // If headless mode is enabled, ensure the headless flag is set.
1784
1798
  if (isHeadless && !constants.launchOptionsArgs.includes('--headless=new')) {
1785
1799
  constants.launchOptionsArgs.push('--headless=new');
1786
1800
  }
1787
-
1801
+
1788
1802
  // Build the launch options using your production settings.
1789
1803
  // headless is forced to false as in your persistent context, and we merge in getPlaywrightLaunchOptions and device details.
1790
1804
  const launchOptions = {
@@ -1803,17 +1817,16 @@ export async function initModifiedUserAgent(browser?: string, playwrightDeviceDe
1803
1817
 
1804
1818
  // Modify the UA:
1805
1819
  // Replace "HeadlessChrome" with "Chrome" if present.
1806
- let modifiedUA = defaultUA.includes('HeadlessChrome')
1820
+ const modifiedUA = defaultUA.includes('HeadlessChrome')
1807
1821
  ? defaultUA.replace('HeadlessChrome', 'Chrome')
1808
1822
  : defaultUA;
1809
-
1823
+
1810
1824
  // Push the modified UA flag into your global launch options.
1811
1825
  constants.launchOptionsArgs.push(`--user-agent=${modifiedUA}`);
1812
1826
  // Optionally log the modified UA.
1813
1827
  // console.log('Modified User Agent:', modifiedUA);
1814
1828
  }
1815
1829
 
1816
-
1817
1830
  /**
1818
1831
  * @param {string} browser browser name ("chrome" or "edge", null for chromium, the default Playwright browser)
1819
1832
  * @returns playwright launch options object. For more details: https://playwright.dev/docs/api/class-browsertype#browser-type-launch
@@ -1856,25 +1869,25 @@ export const urlWithoutAuth = (url: string): string => {
1856
1869
  return parsedUrl.toString();
1857
1870
  };
1858
1871
 
1859
- export const waitForPageLoaded = async (page, timeout = 10000) => {
1872
+ export const waitForPageLoaded = async (page: Page, timeout = 10000) => {
1860
1873
  const OBSERVER_TIMEOUT = timeout; // Ensure observer timeout does not exceed the main timeout
1861
1874
 
1862
1875
  return Promise.race([
1863
1876
  page.waitForLoadState('load'), // Ensure page load completes
1864
1877
  page.waitForLoadState('networkidle'), // Wait for network requests to settle
1865
1878
  new Promise(resolve => setTimeout(resolve, timeout)), // Hard timeout as a fallback
1866
- page.evaluate((OBSERVER_TIMEOUT) => {
1867
- return new Promise((resolve) => {
1879
+ page.evaluate(OBSERVER_TIMEOUT => {
1880
+ return new Promise(resolve => {
1868
1881
  // Skip mutation check for PDFs
1869
1882
  if (document.contentType === 'application/pdf') {
1870
1883
  resolve('Skipping DOM mutation check for PDF.');
1871
1884
  return;
1872
1885
  }
1873
1886
 
1874
- let timeout;
1887
+ let timeout: NodeJS.Timeout;
1875
1888
  let mutationCount = 0;
1876
1889
  const MAX_MUTATIONS = 250; // Limit max mutations
1877
- const mutationHash = {};
1890
+ const mutationHash: Record<string, number> = {};
1878
1891
 
1879
1892
  const observer = new MutationObserver(mutationsList => {
1880
1893
  clearTimeout(timeout);
@@ -1916,14 +1929,17 @@ export const waitForPageLoaded = async (page, timeout = 10000) => {
1916
1929
  resolve('Observer timeout reached, exiting.');
1917
1930
  }, OBSERVER_TIMEOUT);
1918
1931
 
1919
- observer.observe(document.documentElement, { childList: true, subtree: true, attributes: true });
1932
+ observer.observe(document.documentElement, {
1933
+ childList: true,
1934
+ subtree: true,
1935
+ attributes: true,
1936
+ });
1920
1937
  });
1921
1938
  }, OBSERVER_TIMEOUT), // Pass OBSERVER_TIMEOUT dynamically to the browser context
1922
1939
  ]);
1923
1940
  };
1924
1941
 
1925
-
1926
- function isValidHttpUrl(urlString) {
1942
+ function isValidHttpUrl(urlString: string) {
1927
1943
  const pattern = /^(http|https):\/\/[^ "]+$/;
1928
1944
  return pattern.test(urlString);
1929
1945
  }
@@ -29,6 +29,7 @@ export const blackListedFileExtensions = [
29
29
  'zip',
30
30
  'webp',
31
31
  'json',
32
+ 'xml'
32
33
  ];
33
34
 
34
35
  export const getIntermediateScreenshotsPath = (datasetsPath: string): string =>
@@ -217,7 +218,7 @@ export const guiInfoStatusTypes = {
217
218
  DUPLICATE: 'duplicate',
218
219
  };
219
220
 
220
- let launchOptionsArgs = [];
221
+ let launchOptionsArgs: string[] = [];
221
222
 
222
223
  // Check if running in docker container
223
224
  if (fs.existsSync('/.dockerenv')) {
@@ -444,3 +445,82 @@ export enum RuleFlags {
444
445
  DISABLE_OOBEE = 'disable-oobee',
445
446
  ENABLE_WCAG_AAA = 'enable-wcag-aaa',
446
447
  }
448
+
449
+ // Note: Not all status codes will appear as Crawler will handle it as best effort first. E.g. try to handle redirect
450
+ export const STATUS_CODE_METADATA: Record<number,string> = {
451
+ // Custom Codes for Oobee's use
452
+ 0: 'Page Excluded',
453
+ 1: 'Not A Supported Document',
454
+ 2: 'Web Crawler Errored',
455
+
456
+ // 599 is set because Crawlee returns response status 100, 102, 103 as 599
457
+ 599: 'Uncommon Response Status Code Received',
458
+
459
+ // This is Status OK but thrown when the crawler cannot scan the page
460
+ 200: '200 - However Page Could Not Be Scanned',
461
+
462
+ // 1xx - Informational
463
+ 100: '100 - Continue',
464
+ 101: '101 - Switching Protocols',
465
+ 102: '102 - Processing',
466
+ 103: '103 - Early Hints',
467
+
468
+ // 2xx - Browser Doesn't Support
469
+ 204: '204 - No Content',
470
+ 205: '205 - Reset Content',
471
+
472
+ // 3xx - Redirection
473
+ 300: '300 - Multiple Choices',
474
+ 301: '301 - Moved Permanently',
475
+ 302: '302 - Found',
476
+ 303: '303 - See Other',
477
+ 304: '304 - Not Modified',
478
+ 305: '305 - Use Proxy',
479
+ 307: '307 - Temporary Redirect',
480
+ 308: '308 - Permanent Redirect',
481
+
482
+ // 4xx - Client Error
483
+ 400: '400 - Bad Request',
484
+ 401: '401 - Unauthorized',
485
+ 402: '402 - Payment Required',
486
+ 403: '403 - Forbidden',
487
+ 404: '404 - Not Found',
488
+ 405: '405 - Method Not Allowed',
489
+ 406: '406 - Not Acceptable',
490
+ 407: '407 - Proxy Authentication Required',
491
+ 408: '408 - Request Timeout',
492
+ 409: '409 - Conflict',
493
+ 410: '410 - Gone',
494
+ 411: '411 - Length Required',
495
+ 412: '412 - Precondition Failed',
496
+ 413: '413 - Payload Too Large',
497
+ 414: '414 - URI Too Long',
498
+ 415: '415 - Unsupported Media Type',
499
+ 416: '416 - Range Not Satisfiable',
500
+ 417: '417 - Expectation Failed',
501
+ 418: "418 - I'm a teapot",
502
+ 421: '421 - Misdirected Request',
503
+ 422: '422 - Unprocessable Content',
504
+ 423: '423 - Locked',
505
+ 424: '424 - Failed Dependency',
506
+ 425: '425 - Too Early',
507
+ 426: '426 - Upgrade Required',
508
+ 428: '428 - Precondition Required',
509
+ 429: '429 - Too Many Requests',
510
+ 431: '431 - Request Header Fields Too Large',
511
+ 451: '451 - Unavailable For Legal Reasons',
512
+
513
+ // 5xx - Server Error
514
+ 500: '500 - Internal Server Error',
515
+ 501: '501 - Not Implemented',
516
+ 502: '502 - Bad Gateway',
517
+ 503: '503 - Service Unavailable',
518
+ 504: '504 - Gateway Timeout',
519
+ 505: '505 - HTTP Version Not Supported',
520
+ 506: '506 - Variant Also Negotiates',
521
+ 507: '507 - Insufficient Storage',
522
+ 508: '508 - Loop Detected',
523
+ 510: '510 - Not Extended',
524
+ 511: '511 - Network Authentication Required',
525
+
526
+ };
@@ -24,7 +24,7 @@ export const oobeeAiRules = [
24
24
  'autocomplete-valid',
25
25
  ];
26
26
 
27
- export const oobeeAiHtmlETL = htmlSnippet => {
27
+ export const oobeeAiHtmlETL = (htmlSnippet: string) => {
28
28
  // Whitelisted attributes (to not drop)
29
29
  // i.e. any other attribute will be dropped
30
30
  const whitelistedAttributes = [
@@ -60,12 +60,12 @@ export const oobeeAiHtmlETL = htmlSnippet => {
60
60
  `aria-labelledby`,
61
61
  ];
62
62
 
63
- const sortAlphaAttributes = html => {
63
+ const sortAlphaAttributes = (html: string) => {
64
64
  let entireHtml = '';
65
65
  const htmlOpeningTagRegex = /<[^>]+/g;
66
66
  const htmlTagmatches = html.match(htmlOpeningTagRegex);
67
67
 
68
- let sortedHtmlTag;
68
+ let sortedHtmlTag: string = '';
69
69
 
70
70
  htmlTagmatches.forEach(htmlTag => {
71
71
  const closingTag = htmlTag.trim().slice(-1) === '/' ? '/>' : '>';
@@ -112,7 +112,7 @@ export const oobeeAiHtmlETL = htmlSnippet => {
112
112
 
113
113
  // For all attributes within mutedAttributeValues array
114
114
  // replace their values with "something" while maintaining the attribute
115
- const muteAttributeValues = html => {
115
+ const muteAttributeValues = (html: string) => {
116
116
  const regex = /(\s+)([\w-]+)(\s*=\s*")([^"]*)(")/g;
117
117
 
118
118
  // p1 is the whitespace before the attribute
@@ -120,7 +120,7 @@ export const oobeeAiHtmlETL = htmlSnippet => {
120
120
  // p3 is the attribute value before the replacement
121
121
  // p4 is the attribute value (replaced with "...")
122
122
  // p5 is the closing quote of the attribute value
123
- return html.replace(regex, (match, p1, p2, p3, p4, p5) => {
123
+ return html.replace(regex, (match, p1, p2, p3, _p4, p5) => {
124
124
  if (mutedAttributeValues.includes(p2)) {
125
125
  return `${p1}${p2}${p3}...${p5}`;
126
126
  }
@@ -129,7 +129,7 @@ export const oobeeAiHtmlETL = htmlSnippet => {
129
129
  };
130
130
 
131
131
  // Drop all attributes from the HTML snippet except whitelisted
132
- const dropAllExceptWhitelisted = html => {
132
+ const dropAllExceptWhitelisted = (html: string) => {
133
133
  const regex = new RegExp(
134
134
  `(\\s+)(?!${whitelistedAttributes.join(`|`)})([\\w-]+)(\\s*=\\s*"[^"]*")`,
135
135
  `g`,
@@ -12,12 +12,13 @@ import {
12
12
  validEmail,
13
13
  validName,
14
14
  validateCustomFlowLabel,
15
+ parseHeaders,
15
16
  } from './common.js';
16
17
  import constants, { BrowserTypes, ScannerTypes } from './constants.js';
17
18
 
18
19
  const userData = getUserDataTxt();
19
20
 
20
- const questions = [];
21
+ const questions: Question[] = [];
21
22
 
22
23
  const startScanQuestions = [
23
24
  {
@@ -95,7 +96,7 @@ const startScanQuestions = [
95
96
  clonedBrowserDataDir,
96
97
  playwrightDeviceDetailsObject,
97
98
  answers.scanner === ScannerTypes.CUSTOM,
98
- answers.header,
99
+ parseHeaders(answers.header),
99
100
  );
100
101
 
101
102
  deleteClonedProfiles(browserToRun);