@govtechsg/oobee 0.10.42 → 0.10.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,15 +15,20 @@ import safe from 'safe-regex';
15
15
  import * as https from 'https';
16
16
  import os from 'os';
17
17
  import { minimatch } from 'minimatch';
18
- import { globSync } from 'glob';
19
- import { LaunchOptions, devices, webkit } from 'playwright';
18
+ import { globSync, GlobOptionsWithFileTypesFalse } from 'glob';
19
+ import { LaunchOptions, Locator, Page, devices, webkit } from 'playwright';
20
20
  import printMessage from 'print-message';
21
+ // @ts-ignore
22
+ import * as Sentry from '@sentry/node';
21
23
  import constants, {
22
24
  getDefaultChromeDataDir,
23
25
  getDefaultEdgeDataDir,
24
26
  getDefaultChromiumDataDir,
25
27
  proxy,
28
+ sentryConfig,
29
+ // Legacy code start - Google Sheets submission
26
30
  formDataFields,
31
+ // Legacy code end - Google Sheets submission
27
32
  ScannerTypes,
28
33
  BrowserTypes,
29
34
  } from './constants.js';
@@ -31,6 +36,7 @@ import { silentLogger } from '../logs.js';
31
36
  import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
32
37
  import { randomThreeDigitNumberString } from '../utils.js';
33
38
  import { Answers, Data } from '../index.js';
39
+ import { DeviceDescriptor } from '../types/types.js';
34
40
 
35
41
  // validateDirPath validates a provided directory path
36
42
  // returns null if no error
@@ -252,7 +258,7 @@ export const getUrlMessage = (scanner: ScannerTypes): string => {
252
258
  }
253
259
  };
254
260
 
255
- export const isInputValid = inputString => {
261
+ export const isInputValid = (inputString: string): boolean => {
256
262
  if (!validator.isEmpty(inputString)) {
257
263
  const removeBlackListCharacters = validator.escape(inputString);
258
264
 
@@ -373,12 +379,12 @@ const requestToUrl = async (
373
379
  };
374
380
 
375
381
  const checkUrlConnectivityWithBrowser = async (
376
- url,
377
- browserToRun,
378
- clonedDataDir,
379
- playwrightDeviceDetailsObject,
380
- isCustomFlow,
381
- extraHTTPHeaders,
382
+ url: string,
383
+ browserToRun: string,
384
+ clonedDataDir: string,
385
+ playwrightDeviceDetailsObject: DeviceDescriptor,
386
+ isCustomFlow: boolean,
387
+ extraHTTPHeaders: Record<string, string>,
382
388
  ) => {
383
389
  const res = new RES();
384
390
 
@@ -468,7 +474,6 @@ const checkUrlConnectivityWithBrowser = async (
468
474
  res.content = responseFromUrl.content;
469
475
  }
470
476
  } catch (error) {
471
-
472
477
  // But this does work with the headless=new flag
473
478
  if (error.message.includes('net::ERR_INVALID_AUTH_CREDENTIALS')) {
474
479
  res.status = constants.urlCheckStatuses.unauthorised.code;
@@ -510,13 +515,13 @@ export const isSitemapContent = (content: string) => {
510
515
  };
511
516
 
512
517
  export const checkUrl = async (
513
- scanner,
514
- url,
515
- browser,
516
- clonedDataDir,
517
- playwrightDeviceDetailsObject,
518
- isCustomFlow,
519
- extraHTTPHeaders,
518
+ scanner: ScannerTypes,
519
+ url: string,
520
+ browser: string,
521
+ clonedDataDir: string,
522
+ playwrightDeviceDetailsObject: DeviceDescriptor,
523
+ isCustomFlow: boolean,
524
+ extraHTTPHeaders: Record<string, string>,
520
525
  ) => {
521
526
  const res = await checkUrlConnectivityWithBrowser(
522
527
  url,
@@ -548,7 +553,7 @@ export const parseHeaders = (header?: string): Record<string, string> => {
548
553
  // parse HTTP headers from string
549
554
  if (!header) return {};
550
555
  const headerValues = header.split(', ');
551
- const allHeaders = {};
556
+ const allHeaders: Record<string, string> = {};
552
557
  headerValues.map((headerValue: string) => {
553
558
  const headerValuePair = headerValue.split(/ (.*)/s);
554
559
  if (headerValuePair.length < 2) {
@@ -776,11 +781,11 @@ export const getLinksFromSitemap = async (
776
781
  password: string,
777
782
  ) => {
778
783
  const scannedSitemaps = new Set<string>();
779
- const urls = {}; // dictionary of requests to urls to be scanned
784
+ const urls: Record<string, Request> = {}; // dictionary of requests to urls to be scanned
780
785
 
781
786
  const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;
782
787
 
783
- const addToUrlList = url => {
788
+ const addToUrlList = (url: string) => {
784
789
  if (!url) return;
785
790
  if (isDisallowedInRobotsTxt(url)) return;
786
791
 
@@ -803,14 +808,14 @@ export const getLinksFromSitemap = async (
803
808
  urls[url] = request;
804
809
  };
805
810
 
806
- const addBasicAuthCredentials = (url, username, password) => {
811
+ const addBasicAuthCredentials = (url: string, username: string, password: string) => {
807
812
  const urlObject = new URL(url);
808
813
  urlObject.username = username;
809
814
  urlObject.password = password;
810
815
  return urlObject.toString();
811
816
  };
812
817
 
813
- const calculateCloseness = sitemapUrl => {
818
+ const calculateCloseness = (sitemapUrl: string) => {
814
819
  // Remove 'http://', 'https://', and 'www.' prefixes from the URLs
815
820
  const normalizedSitemapUrl = sitemapUrl.replace(/^(https?:\/\/)?(www\.)?/, '');
816
821
  const normalizedUserUrlInput = userUrlInput
@@ -825,10 +830,16 @@ export const getLinksFromSitemap = async (
825
830
  }
826
831
  return 0;
827
832
  };
828
- const processXmlSitemap = async ($, sitemapType, linkSelector, dateSelector, sectionSelector) => {
829
- const urlList = [];
833
+ const processXmlSitemap = async (
834
+ $: cheerio.CheerioAPI,
835
+ sitemapType: number,
836
+ linkSelector: string,
837
+ dateSelector: string,
838
+ sectionSelector: string,
839
+ ) => {
840
+ const urlList: { url: string; lastModifiedDate: Date }[] = [];
830
841
  // Iterate through each URL element in the sitemap, collect url and modified date
831
- $(sectionSelector).each((index, urlElement) => {
842
+ $(sectionSelector).each((_index, urlElement) => {
832
843
  let url;
833
844
  if (sitemapType === constants.xmlSitemapTypes.atom) {
834
845
  url = $(urlElement).find(linkSelector).prop('href');
@@ -850,8 +861,7 @@ export const getLinksFromSitemap = async (
850
861
  }
851
862
 
852
863
  // If closeness is the same, sort by last modified date in descending order
853
- const dateDifference = (b.lastModifiedDate || 0) - (a.lastModifiedDate || 0);
854
- return dateDifference !== 0 ? dateDifference : 0; // Maintain original order for equal dates
864
+ return (b.lastModifiedDate?.getTime() || 0) - (a.lastModifiedDate?.getTime() || 0);
855
865
  });
856
866
  }
857
867
 
@@ -861,7 +871,7 @@ export const getLinksFromSitemap = async (
861
871
  }
862
872
  };
863
873
 
864
- const processNonStandardSitemap = data => {
874
+ const processNonStandardSitemap = (data: string) => {
865
875
  const urlsFromData = crawlee
866
876
  .extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') })
867
877
  .slice(0, maxLinksCount);
@@ -934,7 +944,7 @@ export const getLinksFromSitemap = async (
934
944
  const sitemapIndex = page.locator('sitemapindex');
935
945
  const rss = page.locator('rss');
936
946
  const feed = page.locator('feed');
937
- const isRoot = async locator => (await locator.count()) > 0;
947
+ const isRoot = async (locator: Locator) => (await locator.count()) > 0;
938
948
 
939
949
  if (await isRoot(urlSet)) {
940
950
  data = await urlSet.evaluate(elem => elem.outerHTML);
@@ -1054,14 +1064,14 @@ export const getLinksFromSitemap = async (
1054
1064
  return requestList;
1055
1065
  };
1056
1066
 
1057
- export const validEmail = email => {
1067
+ export const validEmail = (email: string) => {
1058
1068
  const emailRegex = /^.+@.+\..+$/u;
1059
1069
 
1060
1070
  return emailRegex.test(email);
1061
1071
  };
1062
1072
 
1063
1073
  // For new user flow.
1064
- export const validName = name => {
1074
+ export const validName = (name: string) => {
1065
1075
  // Allow only printable characters from any language
1066
1076
  const regex = /^[\p{L}\p{N}\s'".,()\[\]{}!?:؛،؟…]+$/u;
1067
1077
 
@@ -1213,11 +1223,11 @@ export const getEdgeData = () => {
1213
1223
  * @param {*} destDir destination directory
1214
1224
  * @returns boolean indicating whether the operation was successful
1215
1225
  */
1216
- const cloneChromeProfileCookieFiles = (options, destDir) => {
1226
+ const cloneChromeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, destDir: string) => {
1217
1227
  let profileCookiesDir;
1218
1228
  // Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
1219
1229
  // and ../Chrome/<profile name>/Cookies for mac
1220
- let profileNamesRegex;
1230
+ let profileNamesRegex: RegExp;
1221
1231
  if (os.platform() === 'win32') {
1222
1232
  profileCookiesDir = globSync('**/Network/Cookies', {
1223
1233
  ...options,
@@ -1288,11 +1298,11 @@ const cloneChromeProfileCookieFiles = (options, destDir) => {
1288
1298
  * @param {*} destDir destination directory
1289
1299
  * @returns boolean indicating whether the operation was successful
1290
1300
  */
1291
- const cloneEdgeProfileCookieFiles = (options, destDir) => {
1301
+ const cloneEdgeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, destDir: string) => {
1292
1302
  let profileCookiesDir;
1293
1303
  // Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
1294
1304
  // and ../Chrome/<profile name>/Cookies for mac
1295
- let profileNamesRegex;
1305
+ let profileNamesRegex: RegExp;
1296
1306
  // Ignores the cloned oobee directory if exists
1297
1307
  if (os.platform() === 'win32') {
1298
1308
  profileCookiesDir = globSync('**/Network/Cookies', {
@@ -1361,7 +1371,7 @@ const cloneEdgeProfileCookieFiles = (options, destDir) => {
1361
1371
  * @param {string} destDir - destination directory
1362
1372
  * @returns boolean indicating whether the operation was successful
1363
1373
  */
1364
- const cloneLocalStateFile = (options, destDir) => {
1374
+ const cloneLocalStateFile = (options: GlobOptionsWithFileTypesFalse, destDir: string) => {
1365
1375
  const localState = globSync('**/*Local State', {
1366
1376
  ...options,
1367
1377
  maxDepth: 1,
@@ -1647,8 +1657,9 @@ export const getPlaywrightDeviceDetailsObject = (
1647
1657
  deviceChosen: string,
1648
1658
  customDevice: string,
1649
1659
  viewportWidth: number,
1650
- ) => {
1651
- let playwrightDeviceDetailsObject = {};
1660
+ ): DeviceDescriptor => {
1661
+ let playwrightDeviceDetailsObject = devices['Desktop Chrome']; // default to Desktop Chrome
1662
+
1652
1663
  if (deviceChosen === 'Mobile' || customDevice === 'iPhone 11') {
1653
1664
  playwrightDeviceDetailsObject = devices['iPhone 11'];
1654
1665
  } else if (customDevice === 'Samsung Galaxy S9+') {
@@ -1656,6 +1667,11 @@ export const getPlaywrightDeviceDetailsObject = (
1656
1667
  } else if (viewportWidth) {
1657
1668
  playwrightDeviceDetailsObject = {
1658
1669
  viewport: { width: viewportWidth, height: 720 },
1670
+ isMobile: false,
1671
+ hasTouch: false,
1672
+ userAgent: devices['Desktop Chrome'].userAgent,
1673
+ deviceScaleFactor: 1,
1674
+ defaultBrowserType: 'chromium',
1659
1675
  };
1660
1676
  } else if (customDevice) {
1661
1677
  playwrightDeviceDetailsObject = devices[customDevice.replace(/_/g, ' ')];
@@ -1742,49 +1758,171 @@ export const submitForm = async (
1742
1758
  numberOfPagesNotScanned: number,
1743
1759
  metadata: string,
1744
1760
  ) => {
1745
- const additionalPageDataJson = JSON.stringify({
1746
- redirectsScanned: numberOfRedirectsScanned,
1761
+ // Initialize Sentry
1762
+ Sentry.init(sentryConfig);
1763
+
1764
+ // Format the data as you want it to appear in Sentry
1765
+ const additionalPageData = {
1747
1766
  pagesNotScanned: numberOfPagesNotScanned,
1748
- });
1767
+ redirectsScanned: numberOfRedirectsScanned
1768
+ };
1749
1769
 
1750
- let finalUrl =
1751
- `${formDataFields.formUrl}?` +
1752
- `${formDataFields.entryUrlField}=${entryUrl}&` +
1753
- `${formDataFields.scanTypeField}=${scanType}&` +
1754
- `${formDataFields.emailField}=${email}&` +
1755
- `${formDataFields.nameField}=${name}&` +
1756
- `${formDataFields.resultsField}=${encodeURIComponent(scanResultsJson)}&` +
1757
- `${formDataFields.numberOfPagesScannedField}=${numberOfPagesScanned}&` +
1758
- `${formDataFields.additionalPageDataField}=${encodeURIComponent(additionalPageDataJson)}&` +
1759
- `${formDataFields.metadataField}=${encodeURIComponent(metadata)}`;
1770
+ // Extract issue occurrences from scan results if possible
1771
+ const issueOccurrences = extractIssueOccurrences(scanResultsJson);
1772
+
1773
+ // Determine if it's a government website
1774
+ const isGov = entryUrl.includes('.gov');
1775
+
1776
+ // Get email domain/tag
1777
+ const emailTag = email.split('@')[1] || '';
1778
+
1779
+ // Format timestamp
1780
+ const timestamp = new Date().toISOString();
1781
+
1782
+ // Prepare redirect URL if different from entry URL
1783
+ const redirectUrl = scannedUrl !== entryUrl ? scannedUrl : null;
1784
+
1785
+ try {
1786
+ // Capture the scan data as a Sentry event with each field as a separate entry
1787
+ Sentry.captureEvent({
1788
+ message: `Accessibility scan completed for ${entryUrl}`,
1789
+ level: 'info',
1790
+ tags: {
1791
+ scanType: scanType,
1792
+ browser: browserToRun,
1793
+ isGov: isGov,
1794
+ emailDomain: emailTag,
1795
+ },
1796
+ user: {
1797
+ email: email,
1798
+ username: name,
1799
+ },
1800
+ extra: {
1801
+ // Top-level fields as shown in your screenshot
1802
+ entryUrl: entryUrl,
1803
+ websiteUrl: scannedUrl,
1804
+ scanType: scanType,
1805
+ numberOfPagesScanned: numberOfPagesScanned,
1806
+ metadata: metadata ? JSON.parse(metadata) : {},
1807
+ scanResults: scanResultsJson.length > 8000 ?
1808
+ scanResultsJson.substring(0, 8000) + '...[truncated]' :
1809
+ scanResultsJson,
1810
+
1811
+ // Additional fields you requested
1812
+ additionalPageData: additionalPageData,
1813
+ additionalScan: additionalPageData,
1814
+ additionalPagesData: additionalPageData,
1815
+
1816
+ // Individual fields as requested
1817
+ timestamp: timestamp,
1818
+ redirectUrl: redirectUrl,
1819
+ isGov: isGov,
1820
+ emailTag: emailTag,
1821
+ consolidatedScanType: scanType.toLowerCase(),
1822
+ email: email,
1823
+ name: name,
1824
+ filledNoPagesScanned: numberOfPagesScanned > 0,
1825
+ redirectsScanned: numberOfRedirectsScanned,
1826
+ pagesNotScanned: numberOfPagesNotScanned,
1827
+ issueOccurrences: issueOccurrences
1828
+ }
1829
+ });
1760
1830
 
1761
- if (scannedUrl !== entryUrl) {
1762
- finalUrl += `&${formDataFields.redirectUrlField}=${scannedUrl}`;
1831
+ // IMPORTANT: Wait for the event to be sent
1832
+ await Sentry.flush(2000); // Wait up to 2 seconds for the event to be sent
1833
+
1834
+ } catch (error) {
1835
+ console.error('Error sending data to Sentry:', error);
1763
1836
  }
1764
1837
 
1765
- if (proxy) {
1766
- await submitFormViaPlaywright(browserToRun, userDataDirectory, finalUrl);
1767
- } else {
1768
- try {
1769
- await axios.get(finalUrl, { timeout: 2000 });
1770
- } catch (error) {
1771
- if (error.code === 'ECONNABORTED') {
1772
- if (browserToRun || constants.launcher === webkit) {
1773
- await submitFormViaPlaywright(browserToRun, userDataDirectory, finalUrl);
1838
+ // Legacy code start - Google Sheets submission
1839
+ try {
1840
+ const additionalPageDataJson = JSON.stringify({
1841
+ redirectsScanned: numberOfRedirectsScanned,
1842
+ pagesNotScanned: numberOfPagesNotScanned,
1843
+ });
1844
+
1845
+ let finalUrl =
1846
+ `${formDataFields.formUrl}?` +
1847
+ `${formDataFields.entryUrlField}=${entryUrl}&` +
1848
+ `${formDataFields.scanTypeField}=${scanType}&` +
1849
+ `${formDataFields.emailField}=${email}&` +
1850
+ `${formDataFields.nameField}=${name}&` +
1851
+ `${formDataFields.resultsField}=${encodeURIComponent(scanResultsJson)}&` +
1852
+ `${formDataFields.numberOfPagesScannedField}=${numberOfPagesScanned}&` +
1853
+ `${formDataFields.additionalPageDataField}=${encodeURIComponent(additionalPageDataJson)}&` +
1854
+ `${formDataFields.metadataField}=${encodeURIComponent(metadata)}`;
1855
+
1856
+ if (scannedUrl !== entryUrl) {
1857
+ finalUrl += `&${formDataFields.redirectUrlField}=${scannedUrl}`;
1858
+ }
1859
+
1860
+ if (proxy) {
1861
+ await submitFormViaPlaywright(browserToRun, userDataDirectory, finalUrl);
1862
+ } else {
1863
+ try {
1864
+ await axios.get(finalUrl, { timeout: 2000 });
1865
+ } catch (error) {
1866
+ if (error.code === 'ECONNABORTED') {
1867
+ if (browserToRun || constants.launcher === webkit) {
1868
+ await submitFormViaPlaywright(browserToRun, userDataDirectory, finalUrl);
1869
+ }
1774
1870
  }
1775
1871
  }
1776
1872
  }
1873
+ console.log('Legacy Google Sheets form submitted successfully');
1874
+ } catch (legacyError) {
1875
+ console.error('Error submitting legacy Google Sheets form:', legacyError);
1777
1876
  }
1877
+ // Legacy code end - Google Sheets submission
1778
1878
  };
1779
1879
 
1780
- export async function initModifiedUserAgent(browser?: string, playwrightDeviceDetailsObject?: object) {
1880
+ // Helper function to extract issue occurrences from scan results
1881
+ function extractIssueOccurrences(scanResultsJson: string): number {
1882
+ try {
1883
+ const results = JSON.parse(scanResultsJson);
1884
+ // Count total occurrences from all issues in the scan results
1885
+ // This may need adjustment based on your specific JSON structure
1886
+ let totalOccurrences = 0;
1887
+
1888
+ // Try to parse the format shown in your screenshot
1889
+ if (typeof results === 'object') {
1890
+ // Loop through all keys that have "occurrences" properties
1891
+ Object.keys(results).forEach(key => {
1892
+ if (results[key] && typeof results[key] === 'object' && 'occurrences' in results[key]) {
1893
+ totalOccurrences += parseInt(results[key].occurrences, 10) || 0;
1894
+ }
1895
+ });
1896
+
1897
+ // If we found any occurrences, return the total
1898
+ if (totalOccurrences > 0) {
1899
+ return totalOccurrences;
1900
+ }
1901
+ }
1902
+
1903
+ // Fallback to direct occurrences property if available
1904
+ if (results && results.occurrences) {
1905
+ return parseInt(results.occurrences, 10) || 0;
1906
+ }
1907
+
1908
+ return 0;
1909
+ } catch (e) {
1910
+ console.error('Error extracting issue occurrences:', e);
1911
+ return 0;
1912
+ }
1913
+ }
1914
+
1915
+ export async function initModifiedUserAgent(
1916
+ browser?: string,
1917
+ playwrightDeviceDetailsObject?: object,
1918
+ ) {
1781
1919
  const isHeadless = process.env.CRAWLEE_HEADLESS === '1';
1782
-
1920
+
1783
1921
  // If headless mode is enabled, ensure the headless flag is set.
1784
1922
  if (isHeadless && !constants.launchOptionsArgs.includes('--headless=new')) {
1785
1923
  constants.launchOptionsArgs.push('--headless=new');
1786
1924
  }
1787
-
1925
+
1788
1926
  // Build the launch options using your production settings.
1789
1927
  // headless is forced to false as in your persistent context, and we merge in getPlaywrightLaunchOptions and device details.
1790
1928
  const launchOptions = {
@@ -1803,17 +1941,16 @@ export async function initModifiedUserAgent(browser?: string, playwrightDeviceDe
1803
1941
 
1804
1942
  // Modify the UA:
1805
1943
  // Replace "HeadlessChrome" with "Chrome" if present.
1806
- let modifiedUA = defaultUA.includes('HeadlessChrome')
1944
+ const modifiedUA = defaultUA.includes('HeadlessChrome')
1807
1945
  ? defaultUA.replace('HeadlessChrome', 'Chrome')
1808
1946
  : defaultUA;
1809
-
1947
+
1810
1948
  // Push the modified UA flag into your global launch options.
1811
1949
  constants.launchOptionsArgs.push(`--user-agent=${modifiedUA}`);
1812
1950
  // Optionally log the modified UA.
1813
1951
  // console.log('Modified User Agent:', modifiedUA);
1814
1952
  }
1815
1953
 
1816
-
1817
1954
  /**
1818
1955
  * @param {string} browser browser name ("chrome" or "edge", null for chromium, the default Playwright browser)
1819
1956
  * @returns playwright launch options object. For more details: https://playwright.dev/docs/api/class-browsertype#browser-type-launch
@@ -1856,25 +1993,25 @@ export const urlWithoutAuth = (url: string): string => {
1856
1993
  return parsedUrl.toString();
1857
1994
  };
1858
1995
 
1859
- export const waitForPageLoaded = async (page, timeout = 10000) => {
1996
+ export const waitForPageLoaded = async (page: Page, timeout = 10000) => {
1860
1997
  const OBSERVER_TIMEOUT = timeout; // Ensure observer timeout does not exceed the main timeout
1861
1998
 
1862
1999
  return Promise.race([
1863
2000
  page.waitForLoadState('load'), // Ensure page load completes
1864
2001
  page.waitForLoadState('networkidle'), // Wait for network requests to settle
1865
2002
  new Promise(resolve => setTimeout(resolve, timeout)), // Hard timeout as a fallback
1866
- page.evaluate((OBSERVER_TIMEOUT) => {
1867
- return new Promise((resolve) => {
2003
+ page.evaluate(OBSERVER_TIMEOUT => {
2004
+ return new Promise(resolve => {
1868
2005
  // Skip mutation check for PDFs
1869
2006
  if (document.contentType === 'application/pdf') {
1870
2007
  resolve('Skipping DOM mutation check for PDF.');
1871
2008
  return;
1872
2009
  }
1873
2010
 
1874
- let timeout;
2011
+ let timeout: NodeJS.Timeout;
1875
2012
  let mutationCount = 0;
1876
2013
  const MAX_MUTATIONS = 250; // Limit max mutations
1877
- const mutationHash = {};
2014
+ const mutationHash: Record<string, number> = {};
1878
2015
 
1879
2016
  const observer = new MutationObserver(mutationsList => {
1880
2017
  clearTimeout(timeout);
@@ -1916,14 +2053,17 @@ export const waitForPageLoaded = async (page, timeout = 10000) => {
1916
2053
  resolve('Observer timeout reached, exiting.');
1917
2054
  }, OBSERVER_TIMEOUT);
1918
2055
 
1919
- observer.observe(document.documentElement, { childList: true, subtree: true, attributes: true });
2056
+ observer.observe(document.documentElement, {
2057
+ childList: true,
2058
+ subtree: true,
2059
+ attributes: true,
2060
+ });
1920
2061
  });
1921
2062
  }, OBSERVER_TIMEOUT), // Pass OBSERVER_TIMEOUT dynamically to the browser context
1922
2063
  ]);
1923
2064
  };
1924
2065
 
1925
-
1926
- function isValidHttpUrl(urlString) {
2066
+ function isValidHttpUrl(urlString: string) {
1927
2067
  const pattern = /^(http|https):\/\/[^ "]+$/;
1928
2068
  return pattern.test(urlString);
1929
2069
  }
@@ -6,6 +6,7 @@ import which from 'which';
6
6
  import os from 'os';
7
7
  import { spawnSync, execSync } from 'child_process';
8
8
  import { chromium } from 'playwright';
9
+ import * as Sentry from '@sentry/node';
9
10
  import { silentLogger } from '../logs.js';
10
11
  import { PageInfo } from '../mergeAxeResults.js';
11
12
 
@@ -29,6 +30,7 @@ export const blackListedFileExtensions = [
29
30
  'zip',
30
31
  'webp',
31
32
  'json',
33
+ 'xml'
32
34
  ];
33
35
 
34
36
  export const getIntermediateScreenshotsPath = (datasetsPath: string): string =>
@@ -217,7 +219,7 @@ export const guiInfoStatusTypes = {
217
219
  DUPLICATE: 'duplicate',
218
220
  };
219
221
 
220
- let launchOptionsArgs = [];
222
+ let launchOptionsArgs: string[] = [];
221
223
 
222
224
  // Check if running in docker container
223
225
  if (fs.existsSync('/.dockerenv')) {
@@ -273,6 +275,12 @@ export const impactOrder = {
273
275
  critical: 3,
274
276
  };
275
277
 
278
+ export const sentryConfig = {
279
+ dsn: "https://e4ab99e457c531e7bde4a8dc3dd2b1ab@o4509047624761344.ingest.us.sentry.io/4509192349548544",
280
+ tracesSampleRate: 1.0, // Capture 100% of transactions for performance monitoring
281
+ profilesSampleRate: 1.0, // Capture 100% of profiles
282
+ };
283
+ // Legacy code start - Google Sheets submission
276
284
  export const formDataFields = {
277
285
  formUrl: `https://docs.google.com/forms/d/e/1FAIpQLSem5C8fyNs5TiU5Vv2Y63-SH7CHN86f-LEPxeN_1u_ldUbgUA/formResponse`, // prod
278
286
  entryUrlField: 'entry.1562345227',
@@ -285,6 +293,7 @@ export const formDataFields = {
285
293
  additionalPageDataField: 'entry.2090887881',
286
294
  metadataField: 'entry.1027769131',
287
295
  };
296
+ // Legacy code end - Google Sheets submission
288
297
 
289
298
  export const sitemapPaths = [
290
299
  '/sitemap.xml',
@@ -444,3 +453,82 @@ export enum RuleFlags {
444
453
  DISABLE_OOBEE = 'disable-oobee',
445
454
  ENABLE_WCAG_AAA = 'enable-wcag-aaa',
446
455
  }
456
+
457
+ // Note: Not all status codes will appear as Crawler will handle it as best effort first. E.g. try to handle redirect
458
+ export const STATUS_CODE_METADATA: Record<number,string> = {
459
+ // Custom Codes for Oobee's use
460
+ 0: 'Page Excluded',
461
+ 1: 'Not A Supported Document',
462
+ 2: 'Web Crawler Errored',
463
+
464
+ // 599 is set because Crawlee returns response status 100, 102, 103 as 599
465
+ 599: 'Uncommon Response Status Code Received',
466
+
467
+ // This is Status OK but thrown when the crawler cannot scan the page
468
+ 200: '200 - However Page Could Not Be Scanned',
469
+
470
+ // 1xx - Informational
471
+ 100: '100 - Continue',
472
+ 101: '101 - Switching Protocols',
473
+ 102: '102 - Processing',
474
+ 103: '103 - Early Hints',
475
+
476
+ // 2xx - Browser Doesn't Support
477
+ 204: '204 - No Content',
478
+ 205: '205 - Reset Content',
479
+
480
+ // 3xx - Redirection
481
+ 300: '300 - Multiple Choices',
482
+ 301: '301 - Moved Permanently',
483
+ 302: '302 - Found',
484
+ 303: '303 - See Other',
485
+ 304: '304 - Not Modified',
486
+ 305: '305 - Use Proxy',
487
+ 307: '307 - Temporary Redirect',
488
+ 308: '308 - Permanent Redirect',
489
+
490
+ // 4xx - Client Error
491
+ 400: '400 - Bad Request',
492
+ 401: '401 - Unauthorized',
493
+ 402: '402 - Payment Required',
494
+ 403: '403 - Forbidden',
495
+ 404: '404 - Not Found',
496
+ 405: '405 - Method Not Allowed',
497
+ 406: '406 - Not Acceptable',
498
+ 407: '407 - Proxy Authentication Required',
499
+ 408: '408 - Request Timeout',
500
+ 409: '409 - Conflict',
501
+ 410: '410 - Gone',
502
+ 411: '411 - Length Required',
503
+ 412: '412 - Precondition Failed',
504
+ 413: '413 - Payload Too Large',
505
+ 414: '414 - URI Too Long',
506
+ 415: '415 - Unsupported Media Type',
507
+ 416: '416 - Range Not Satisfiable',
508
+ 417: '417 - Expectation Failed',
509
+ 418: "418 - I'm a teapot",
510
+ 421: '421 - Misdirected Request',
511
+ 422: '422 - Unprocessable Content',
512
+ 423: '423 - Locked',
513
+ 424: '424 - Failed Dependency',
514
+ 425: '425 - Too Early',
515
+ 426: '426 - Upgrade Required',
516
+ 428: '428 - Precondition Required',
517
+ 429: '429 - Too Many Requests',
518
+ 431: '431 - Request Header Fields Too Large',
519
+ 451: '451 - Unavailable For Legal Reasons',
520
+
521
+ // 5xx - Server Error
522
+ 500: '500 - Internal Server Error',
523
+ 501: '501 - Not Implemented',
524
+ 502: '502 - Bad Gateway',
525
+ 503: '503 - Service Unavailable',
526
+ 504: '504 - Gateway Timeout',
527
+ 505: '505 - HTTP Version Not Supported',
528
+ 506: '506 - Variant Also Negotiates',
529
+ 507: '507 - Insufficient Storage',
530
+ 508: '508 - Loop Detected',
531
+ 510: '510 - Not Extended',
532
+ 511: '511 - Network Authentication Required',
533
+
534
+ };