@govtechsg/oobee 0.10.42 → 0.10.45
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/REPORTS.md +71 -2
- package/package.json +4 -2
- package/src/cli.ts +2 -11
- package/src/constants/common.ts +216 -76
- package/src/constants/constants.ts +89 -1
- package/src/constants/oobeeAi.ts +6 -6
- package/src/constants/questions.ts +3 -2
- package/src/crawlers/commonCrawlerFunc.ts +16 -15
- package/src/crawlers/crawlDomain.ts +82 -84
- package/src/crawlers/crawlIntelligentSitemap.ts +21 -19
- package/src/crawlers/crawlSitemap.ts +120 -109
- package/src/crawlers/custom/findElementByCssSelector.ts +1 -1
- package/src/crawlers/custom/flagUnlabelledClickableElements.ts +8 -8
- package/src/crawlers/custom/xPathToCss.ts +10 -10
- package/src/crawlers/runCustom.ts +1 -1
- package/src/index.ts +3 -4
- package/src/logs.ts +1 -1
- package/src/mergeAxeResults.ts +126 -7
- package/src/npmIndex.ts +12 -8
- package/src/screenshotFunc/htmlScreenshotFunc.ts +8 -20
- package/src/types/text-readability.d.ts +3 -0
- package/src/types/types.ts +1 -1
- package/src/utils.ts +254 -114
- package/src/xPathToCss.ts +0 -186
- package/src/xPathToCssCypress.ts +0 -178
package/src/constants/common.ts
CHANGED
@@ -15,15 +15,20 @@ import safe from 'safe-regex';
|
|
15
15
|
import * as https from 'https';
|
16
16
|
import os from 'os';
|
17
17
|
import { minimatch } from 'minimatch';
|
18
|
-
import { globSync } from 'glob';
|
19
|
-
import { LaunchOptions, devices, webkit } from 'playwright';
|
18
|
+
import { globSync, GlobOptionsWithFileTypesFalse } from 'glob';
|
19
|
+
import { LaunchOptions, Locator, Page, devices, webkit } from 'playwright';
|
20
20
|
import printMessage from 'print-message';
|
21
|
+
// @ts-ignore
|
22
|
+
import * as Sentry from '@sentry/node';
|
21
23
|
import constants, {
|
22
24
|
getDefaultChromeDataDir,
|
23
25
|
getDefaultEdgeDataDir,
|
24
26
|
getDefaultChromiumDataDir,
|
25
27
|
proxy,
|
28
|
+
sentryConfig,
|
29
|
+
// Legacy code start - Google Sheets submission
|
26
30
|
formDataFields,
|
31
|
+
// Legacy code end - Google Sheets submission
|
27
32
|
ScannerTypes,
|
28
33
|
BrowserTypes,
|
29
34
|
} from './constants.js';
|
@@ -31,6 +36,7 @@ import { silentLogger } from '../logs.js';
|
|
31
36
|
import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
|
32
37
|
import { randomThreeDigitNumberString } from '../utils.js';
|
33
38
|
import { Answers, Data } from '../index.js';
|
39
|
+
import { DeviceDescriptor } from '../types/types.js';
|
34
40
|
|
35
41
|
// validateDirPath validates a provided directory path
|
36
42
|
// returns null if no error
|
@@ -252,7 +258,7 @@ export const getUrlMessage = (scanner: ScannerTypes): string => {
|
|
252
258
|
}
|
253
259
|
};
|
254
260
|
|
255
|
-
export const isInputValid = inputString => {
|
261
|
+
export const isInputValid = (inputString: string): boolean => {
|
256
262
|
if (!validator.isEmpty(inputString)) {
|
257
263
|
const removeBlackListCharacters = validator.escape(inputString);
|
258
264
|
|
@@ -373,12 +379,12 @@ const requestToUrl = async (
|
|
373
379
|
};
|
374
380
|
|
375
381
|
const checkUrlConnectivityWithBrowser = async (
|
376
|
-
url,
|
377
|
-
browserToRun,
|
378
|
-
clonedDataDir,
|
379
|
-
playwrightDeviceDetailsObject,
|
380
|
-
isCustomFlow,
|
381
|
-
extraHTTPHeaders,
|
382
|
+
url: string,
|
383
|
+
browserToRun: string,
|
384
|
+
clonedDataDir: string,
|
385
|
+
playwrightDeviceDetailsObject: DeviceDescriptor,
|
386
|
+
isCustomFlow: boolean,
|
387
|
+
extraHTTPHeaders: Record<string, string>,
|
382
388
|
) => {
|
383
389
|
const res = new RES();
|
384
390
|
|
@@ -468,7 +474,6 @@ const checkUrlConnectivityWithBrowser = async (
|
|
468
474
|
res.content = responseFromUrl.content;
|
469
475
|
}
|
470
476
|
} catch (error) {
|
471
|
-
|
472
477
|
// But this does work with the headless=new flag
|
473
478
|
if (error.message.includes('net::ERR_INVALID_AUTH_CREDENTIALS')) {
|
474
479
|
res.status = constants.urlCheckStatuses.unauthorised.code;
|
@@ -510,13 +515,13 @@ export const isSitemapContent = (content: string) => {
|
|
510
515
|
};
|
511
516
|
|
512
517
|
export const checkUrl = async (
|
513
|
-
scanner,
|
514
|
-
url,
|
515
|
-
browser,
|
516
|
-
clonedDataDir,
|
517
|
-
playwrightDeviceDetailsObject,
|
518
|
-
isCustomFlow,
|
519
|
-
extraHTTPHeaders,
|
518
|
+
scanner: ScannerTypes,
|
519
|
+
url: string,
|
520
|
+
browser: string,
|
521
|
+
clonedDataDir: string,
|
522
|
+
playwrightDeviceDetailsObject: DeviceDescriptor,
|
523
|
+
isCustomFlow: boolean,
|
524
|
+
extraHTTPHeaders: Record<string, string>,
|
520
525
|
) => {
|
521
526
|
const res = await checkUrlConnectivityWithBrowser(
|
522
527
|
url,
|
@@ -548,7 +553,7 @@ export const parseHeaders = (header?: string): Record<string, string> => {
|
|
548
553
|
// parse HTTP headers from string
|
549
554
|
if (!header) return {};
|
550
555
|
const headerValues = header.split(', ');
|
551
|
-
const allHeaders = {};
|
556
|
+
const allHeaders: Record<string, string> = {};
|
552
557
|
headerValues.map((headerValue: string) => {
|
553
558
|
const headerValuePair = headerValue.split(/ (.*)/s);
|
554
559
|
if (headerValuePair.length < 2) {
|
@@ -776,11 +781,11 @@ export const getLinksFromSitemap = async (
|
|
776
781
|
password: string,
|
777
782
|
) => {
|
778
783
|
const scannedSitemaps = new Set<string>();
|
779
|
-
const urls = {}; // dictionary of requests to urls to be scanned
|
784
|
+
const urls: Record<string, Request> = {}; // dictionary of requests to urls to be scanned
|
780
785
|
|
781
786
|
const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;
|
782
787
|
|
783
|
-
const addToUrlList = url => {
|
788
|
+
const addToUrlList = (url: string) => {
|
784
789
|
if (!url) return;
|
785
790
|
if (isDisallowedInRobotsTxt(url)) return;
|
786
791
|
|
@@ -803,14 +808,14 @@ export const getLinksFromSitemap = async (
|
|
803
808
|
urls[url] = request;
|
804
809
|
};
|
805
810
|
|
806
|
-
const addBasicAuthCredentials = (url, username, password) => {
|
811
|
+
/**
 * Embeds basic-auth credentials into a URL.
 *
 * @param url absolute URL to decorate; `new URL` throws if it cannot be parsed
 * @param username value assigned to the URL's username component
 * @param password value assigned to the URL's password component
 * @returns the serialized URL including the credentials
 */
const addBasicAuthCredentials = (url: string, username: string, password: string) => {
  const withCredentials = new URL(url);
  // Username is assigned before password, matching the original statement order.
  [withCredentials.username, withCredentials.password] = [username, password];
  return withCredentials.href;
};
|
812
817
|
|
813
|
-
const calculateCloseness = sitemapUrl => {
|
818
|
+
const calculateCloseness = (sitemapUrl: string) => {
|
814
819
|
// Remove 'http://', 'https://', and 'www.' prefixes from the URLs
|
815
820
|
const normalizedSitemapUrl = sitemapUrl.replace(/^(https?:\/\/)?(www\.)?/, '');
|
816
821
|
const normalizedUserUrlInput = userUrlInput
|
@@ -825,10 +830,16 @@ export const getLinksFromSitemap = async (
|
|
825
830
|
}
|
826
831
|
return 0;
|
827
832
|
};
|
828
|
-
const processXmlSitemap = async (
|
829
|
-
|
833
|
+
const processXmlSitemap = async (
|
834
|
+
$: cheerio.CheerioAPI,
|
835
|
+
sitemapType: number,
|
836
|
+
linkSelector: string,
|
837
|
+
dateSelector: string,
|
838
|
+
sectionSelector: string,
|
839
|
+
) => {
|
840
|
+
const urlList: { url: string; lastModifiedDate: Date }[] = [];
|
830
841
|
// Iterate through each URL element in the sitemap, collect url and modified date
|
831
|
-
$(sectionSelector).each((
|
842
|
+
$(sectionSelector).each((_index, urlElement) => {
|
832
843
|
let url;
|
833
844
|
if (sitemapType === constants.xmlSitemapTypes.atom) {
|
834
845
|
url = $(urlElement).find(linkSelector).prop('href');
|
@@ -850,8 +861,7 @@ export const getLinksFromSitemap = async (
|
|
850
861
|
}
|
851
862
|
|
852
863
|
// If closeness is the same, sort by last modified date in descending order
|
853
|
-
|
854
|
-
return dateDifference !== 0 ? dateDifference : 0; // Maintain original order for equal dates
|
864
|
+
return (b.lastModifiedDate?.getTime() || 0) - (a.lastModifiedDate?.getTime() || 0);
|
855
865
|
});
|
856
866
|
}
|
857
867
|
|
@@ -861,7 +871,7 @@ export const getLinksFromSitemap = async (
|
|
861
871
|
}
|
862
872
|
};
|
863
873
|
|
864
|
-
const processNonStandardSitemap = data => {
|
874
|
+
const processNonStandardSitemap = (data: string) => {
|
865
875
|
const urlsFromData = crawlee
|
866
876
|
.extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') })
|
867
877
|
.slice(0, maxLinksCount);
|
@@ -934,7 +944,7 @@ export const getLinksFromSitemap = async (
|
|
934
944
|
const sitemapIndex = page.locator('sitemapindex');
|
935
945
|
const rss = page.locator('rss');
|
936
946
|
const feed = page.locator('feed');
|
937
|
-
const isRoot = async locator => (await locator.count()) > 0;
|
947
|
+
const isRoot = async (locator: Locator) => (await locator.count()) > 0;
|
938
948
|
|
939
949
|
if (await isRoot(urlSet)) {
|
940
950
|
data = await urlSet.evaluate(elem => elem.outerHTML);
|
@@ -1054,14 +1064,14 @@ export const getLinksFromSitemap = async (
|
|
1054
1064
|
return requestList;
|
1055
1065
|
};
|
1056
1066
|
|
1057
|
-
export const validEmail = email => {
|
1067
|
+
/**
 * Loose shape check for an e-mail address: at least one character, an `@`,
 * at least one character, a `.`, and at least one more character.
 *
 * @param email candidate address
 * @returns true when `email` matches that shape
 */
export const validEmail = (email: string) => /^.+@.+\..+$/u.test(email);
|
1062
1072
|
|
1063
1073
|
// For new user flow.
|
1064
|
-
export const validName = name => {
|
1074
|
+
export const validName = (name: string) => {
|
1065
1075
|
// Allow only printable characters from any language
|
1066
1076
|
const regex = /^[\p{L}\p{N}\s'".,()\[\]{}!?:؛،؟…]+$/u;
|
1067
1077
|
|
@@ -1213,11 +1223,11 @@ export const getEdgeData = () => {
|
|
1213
1223
|
* @param {*} destDir destination directory
|
1214
1224
|
* @returns boolean indicating whether the operation was successful
|
1215
1225
|
*/
|
1216
|
-
const cloneChromeProfileCookieFiles = (options, destDir) => {
|
1226
|
+
const cloneChromeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, destDir: string) => {
|
1217
1227
|
let profileCookiesDir;
|
1218
1228
|
// Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
|
1219
1229
|
// and ../Chrome/<profile name>/Cookies for mac
|
1220
|
-
let profileNamesRegex;
|
1230
|
+
let profileNamesRegex: RegExp;
|
1221
1231
|
if (os.platform() === 'win32') {
|
1222
1232
|
profileCookiesDir = globSync('**/Network/Cookies', {
|
1223
1233
|
...options,
|
@@ -1288,11 +1298,11 @@ const cloneChromeProfileCookieFiles = (options, destDir) => {
|
|
1288
1298
|
* @param {*} destDir destination directory
|
1289
1299
|
* @returns boolean indicating whether the operation was successful
|
1290
1300
|
*/
|
1291
|
-
const cloneEdgeProfileCookieFiles = (options, destDir) => {
|
1301
|
+
const cloneEdgeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, destDir: string) => {
|
1292
1302
|
let profileCookiesDir;
|
1293
1303
|
// Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
|
1294
1304
|
// and ../Chrome/<profile name>/Cookies for mac
|
1295
|
-
let profileNamesRegex;
|
1305
|
+
let profileNamesRegex: RegExp;
|
1296
1306
|
// Ignores the cloned oobee directory if exists
|
1297
1307
|
if (os.platform() === 'win32') {
|
1298
1308
|
profileCookiesDir = globSync('**/Network/Cookies', {
|
@@ -1361,7 +1371,7 @@ const cloneEdgeProfileCookieFiles = (options, destDir) => {
|
|
1361
1371
|
* @param {string} destDir - destination directory
|
1362
1372
|
* @returns boolean indicating whether the operation was successful
|
1363
1373
|
*/
|
1364
|
-
const cloneLocalStateFile = (options, destDir) => {
|
1374
|
+
const cloneLocalStateFile = (options: GlobOptionsWithFileTypesFalse, destDir: string) => {
|
1365
1375
|
const localState = globSync('**/*Local State', {
|
1366
1376
|
...options,
|
1367
1377
|
maxDepth: 1,
|
@@ -1647,8 +1657,9 @@ export const getPlaywrightDeviceDetailsObject = (
|
|
1647
1657
|
deviceChosen: string,
|
1648
1658
|
customDevice: string,
|
1649
1659
|
viewportWidth: number,
|
1650
|
-
) => {
|
1651
|
-
let playwrightDeviceDetailsObject =
|
1660
|
+
): DeviceDescriptor => {
|
1661
|
+
let playwrightDeviceDetailsObject = devices['Desktop Chrome']; // default to Desktop Chrome
|
1662
|
+
|
1652
1663
|
if (deviceChosen === 'Mobile' || customDevice === 'iPhone 11') {
|
1653
1664
|
playwrightDeviceDetailsObject = devices['iPhone 11'];
|
1654
1665
|
} else if (customDevice === 'Samsung Galaxy S9+') {
|
@@ -1656,6 +1667,11 @@ export const getPlaywrightDeviceDetailsObject = (
|
|
1656
1667
|
} else if (viewportWidth) {
|
1657
1668
|
playwrightDeviceDetailsObject = {
|
1658
1669
|
viewport: { width: viewportWidth, height: 720 },
|
1670
|
+
isMobile: false,
|
1671
|
+
hasTouch: false,
|
1672
|
+
userAgent: devices['Desktop Chrome'].userAgent,
|
1673
|
+
deviceScaleFactor: 1,
|
1674
|
+
defaultBrowserType: 'chromium',
|
1659
1675
|
};
|
1660
1676
|
} else if (customDevice) {
|
1661
1677
|
playwrightDeviceDetailsObject = devices[customDevice.replace(/_/g, ' ')];
|
@@ -1742,49 +1758,171 @@ export const submitForm = async (
|
|
1742
1758
|
numberOfPagesNotScanned: number,
|
1743
1759
|
metadata: string,
|
1744
1760
|
) => {
|
1745
|
-
|
1746
|
-
|
1761
|
+
// Initialize Sentry
|
1762
|
+
Sentry.init(sentryConfig);
|
1763
|
+
|
1764
|
+
// Format the data as you want it to appear in Sentry
|
1765
|
+
const additionalPageData = {
|
1747
1766
|
pagesNotScanned: numberOfPagesNotScanned,
|
1748
|
-
|
1767
|
+
redirectsScanned: numberOfRedirectsScanned
|
1768
|
+
};
|
1749
1769
|
|
1750
|
-
|
1751
|
-
|
1752
|
-
|
1753
|
-
|
1754
|
-
|
1755
|
-
|
1756
|
-
|
1757
|
-
|
1758
|
-
|
1759
|
-
|
1770
|
+
// Extract issue occurrences from scan results if possible
|
1771
|
+
const issueOccurrences = extractIssueOccurrences(scanResultsJson);
|
1772
|
+
|
1773
|
+
// Determine if it's a government website
|
1774
|
+
const isGov = entryUrl.includes('.gov');
|
1775
|
+
|
1776
|
+
// Get email domain/tag
|
1777
|
+
const emailTag = email.split('@')[1] || '';
|
1778
|
+
|
1779
|
+
// Format timestamp
|
1780
|
+
const timestamp = new Date().toISOString();
|
1781
|
+
|
1782
|
+
// Prepare redirect URL if different from entry URL
|
1783
|
+
const redirectUrl = scannedUrl !== entryUrl ? scannedUrl : null;
|
1784
|
+
|
1785
|
+
try {
|
1786
|
+
// Capture the scan data as a Sentry event with each field as a separate entry
|
1787
|
+
Sentry.captureEvent({
|
1788
|
+
message: `Accessibility scan completed for ${entryUrl}`,
|
1789
|
+
level: 'info',
|
1790
|
+
tags: {
|
1791
|
+
scanType: scanType,
|
1792
|
+
browser: browserToRun,
|
1793
|
+
isGov: isGov,
|
1794
|
+
emailDomain: emailTag,
|
1795
|
+
},
|
1796
|
+
user: {
|
1797
|
+
email: email,
|
1798
|
+
username: name,
|
1799
|
+
},
|
1800
|
+
extra: {
|
1801
|
+
// Top-level fields as shown in your screenshot
|
1802
|
+
entryUrl: entryUrl,
|
1803
|
+
websiteUrl: scannedUrl,
|
1804
|
+
scanType: scanType,
|
1805
|
+
numberOfPagesScanned: numberOfPagesScanned,
|
1806
|
+
metadata: metadata ? JSON.parse(metadata) : {},
|
1807
|
+
scanResults: scanResultsJson.length > 8000 ?
|
1808
|
+
scanResultsJson.substring(0, 8000) + '...[truncated]' :
|
1809
|
+
scanResultsJson,
|
1810
|
+
|
1811
|
+
// Additional fields you requested
|
1812
|
+
additionalPageData: additionalPageData,
|
1813
|
+
additionalScan: additionalPageData,
|
1814
|
+
additionalPagesData: additionalPageData,
|
1815
|
+
|
1816
|
+
// Individual fields as requested
|
1817
|
+
timestamp: timestamp,
|
1818
|
+
redirectUrl: redirectUrl,
|
1819
|
+
isGov: isGov,
|
1820
|
+
emailTag: emailTag,
|
1821
|
+
consolidatedScanType: scanType.toLowerCase(),
|
1822
|
+
email: email,
|
1823
|
+
name: name,
|
1824
|
+
filledNoPagesScanned: numberOfPagesScanned > 0,
|
1825
|
+
redirectsScanned: numberOfRedirectsScanned,
|
1826
|
+
pagesNotScanned: numberOfPagesNotScanned,
|
1827
|
+
issueOccurrences: issueOccurrences
|
1828
|
+
}
|
1829
|
+
});
|
1760
1830
|
|
1761
|
-
|
1762
|
-
|
1831
|
+
// IMPORTANT: Wait for the event to be sent
|
1832
|
+
await Sentry.flush(2000); // Wait up to 2 seconds for the event to be sent
|
1833
|
+
|
1834
|
+
} catch (error) {
|
1835
|
+
console.error('Error sending data to Sentry:', error);
|
1763
1836
|
}
|
1764
1837
|
|
1765
|
-
|
1766
|
-
|
1767
|
-
|
1768
|
-
|
1769
|
-
|
1770
|
-
}
|
1771
|
-
|
1772
|
-
|
1773
|
-
|
1838
|
+
// Legacy code start - Google Sheets submission
|
1839
|
+
try {
|
1840
|
+
const additionalPageDataJson = JSON.stringify({
|
1841
|
+
redirectsScanned: numberOfRedirectsScanned,
|
1842
|
+
pagesNotScanned: numberOfPagesNotScanned,
|
1843
|
+
});
|
1844
|
+
|
1845
|
+
let finalUrl =
|
1846
|
+
`${formDataFields.formUrl}?` +
|
1847
|
+
`${formDataFields.entryUrlField}=${entryUrl}&` +
|
1848
|
+
`${formDataFields.scanTypeField}=${scanType}&` +
|
1849
|
+
`${formDataFields.emailField}=${email}&` +
|
1850
|
+
`${formDataFields.nameField}=${name}&` +
|
1851
|
+
`${formDataFields.resultsField}=${encodeURIComponent(scanResultsJson)}&` +
|
1852
|
+
`${formDataFields.numberOfPagesScannedField}=${numberOfPagesScanned}&` +
|
1853
|
+
`${formDataFields.additionalPageDataField}=${encodeURIComponent(additionalPageDataJson)}&` +
|
1854
|
+
`${formDataFields.metadataField}=${encodeURIComponent(metadata)}`;
|
1855
|
+
|
1856
|
+
if (scannedUrl !== entryUrl) {
|
1857
|
+
finalUrl += `&${formDataFields.redirectUrlField}=${scannedUrl}`;
|
1858
|
+
}
|
1859
|
+
|
1860
|
+
if (proxy) {
|
1861
|
+
await submitFormViaPlaywright(browserToRun, userDataDirectory, finalUrl);
|
1862
|
+
} else {
|
1863
|
+
try {
|
1864
|
+
await axios.get(finalUrl, { timeout: 2000 });
|
1865
|
+
} catch (error) {
|
1866
|
+
if (error.code === 'ECONNABORTED') {
|
1867
|
+
if (browserToRun || constants.launcher === webkit) {
|
1868
|
+
await submitFormViaPlaywright(browserToRun, userDataDirectory, finalUrl);
|
1869
|
+
}
|
1774
1870
|
}
|
1775
1871
|
}
|
1776
1872
|
}
|
1873
|
+
console.log('Legacy Google Sheets form submitted successfully');
|
1874
|
+
} catch (legacyError) {
|
1875
|
+
console.error('Error submitting legacy Google Sheets form:', legacyError);
|
1777
1876
|
}
|
1877
|
+
// Legacy code end - Google Sheets submission
|
1778
1878
|
};
|
1779
1879
|
|
1780
|
-
|
1880
|
+
/**
 * Extracts the total number of issue occurrences from a serialized scan result.
 *
 * The parsed JSON is inspected in two steps:
 *  1. every own property whose value is an object carrying an `occurrences`
 *     field contributes that field to a running total; a positive total is
 *     returned as-is;
 *  2. otherwise a truthy top-level `occurrences` field is returned.
 *
 * @param scanResultsJson JSON text of the scan results
 * @returns the occurrence count, or 0 when none can be determined (including
 *          when the input is not valid JSON, which is also logged)
 */
function extractIssueOccurrences(scanResultsJson: string): number {
  // `typeof null === 'object'`, so null must be excluded explicitly before key access.
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null;

  // Coerces a raw `occurrences` value to an integer count; unusable values count as 0.
  // Numbers are truncated directly rather than round-tripped through parseInt, which
  // would stringify them first and mis-read exponent notation (1e21 -> 1, 5e-7 -> 5).
  const toCount = (value: unknown): number => {
    const count = typeof value === 'number' ? Math.trunc(value) : parseInt(String(value), 10);
    // `|| 0` normalizes -0 to 0, as the original `parseInt(...) || 0` did.
    return Number.isFinite(count) ? count || 0 : 0;
  };

  try {
    const results: unknown = JSON.parse(scanResultsJson);

    // Primitives and null carry no occurrence data.
    if (!isRecord(results)) {
      return 0;
    }

    // Sum the `occurrences` of every nested object that has one.
    const totalOccurrences = Object.values(results).reduce<number>(
      (sum, entry) =>
        isRecord(entry) && 'occurrences' in entry ? sum + toCount(entry.occurrences) : sum,
      0,
    );
    if (totalOccurrences > 0) {
      return totalOccurrences;
    }

    // Fallback to a direct top-level `occurrences` property if available.
    return results.occurrences ? toCount(results.occurrences) : 0;
  } catch (e) {
    console.error('Error extracting issue occurrences:', e);
    return 0;
  }
}
|
1914
|
+
|
1915
|
+
export async function initModifiedUserAgent(
|
1916
|
+
browser?: string,
|
1917
|
+
playwrightDeviceDetailsObject?: object,
|
1918
|
+
) {
|
1781
1919
|
const isHeadless = process.env.CRAWLEE_HEADLESS === '1';
|
1782
|
-
|
1920
|
+
|
1783
1921
|
// If headless mode is enabled, ensure the headless flag is set.
|
1784
1922
|
if (isHeadless && !constants.launchOptionsArgs.includes('--headless=new')) {
|
1785
1923
|
constants.launchOptionsArgs.push('--headless=new');
|
1786
1924
|
}
|
1787
|
-
|
1925
|
+
|
1788
1926
|
// Build the launch options using your production settings.
|
1789
1927
|
// headless is forced to false as in your persistent context, and we merge in getPlaywrightLaunchOptions and device details.
|
1790
1928
|
const launchOptions = {
|
@@ -1803,17 +1941,16 @@ export async function initModifiedUserAgent(browser?: string, playwrightDeviceDe
|
|
1803
1941
|
|
1804
1942
|
// Modify the UA:
|
1805
1943
|
// Replace "HeadlessChrome" with "Chrome" if present.
|
1806
|
-
|
1944
|
+
const modifiedUA = defaultUA.includes('HeadlessChrome')
|
1807
1945
|
? defaultUA.replace('HeadlessChrome', 'Chrome')
|
1808
1946
|
: defaultUA;
|
1809
|
-
|
1947
|
+
|
1810
1948
|
// Push the modified UA flag into your global launch options.
|
1811
1949
|
constants.launchOptionsArgs.push(`--user-agent=${modifiedUA}`);
|
1812
1950
|
// Optionally log the modified UA.
|
1813
1951
|
// console.log('Modified User Agent:', modifiedUA);
|
1814
1952
|
}
|
1815
1953
|
|
1816
|
-
|
1817
1954
|
/**
|
1818
1955
|
* @param {string} browser browser name ("chrome" or "edge", null for chromium, the default Playwright browser)
|
1819
1956
|
* @returns playwright launch options object. For more details: https://playwright.dev/docs/api/class-browsertype#browser-type-launch
|
@@ -1856,25 +1993,25 @@ export const urlWithoutAuth = (url: string): string => {
|
|
1856
1993
|
return parsedUrl.toString();
|
1857
1994
|
};
|
1858
1995
|
|
1859
|
-
export const waitForPageLoaded = async (page, timeout = 10000) => {
|
1996
|
+
export const waitForPageLoaded = async (page: Page, timeout = 10000) => {
|
1860
1997
|
const OBSERVER_TIMEOUT = timeout; // Ensure observer timeout does not exceed the main timeout
|
1861
1998
|
|
1862
1999
|
return Promise.race([
|
1863
2000
|
page.waitForLoadState('load'), // Ensure page load completes
|
1864
2001
|
page.waitForLoadState('networkidle'), // Wait for network requests to settle
|
1865
2002
|
new Promise(resolve => setTimeout(resolve, timeout)), // Hard timeout as a fallback
|
1866
|
-
page.evaluate(
|
1867
|
-
return new Promise(
|
2003
|
+
page.evaluate(OBSERVER_TIMEOUT => {
|
2004
|
+
return new Promise(resolve => {
|
1868
2005
|
// Skip mutation check for PDFs
|
1869
2006
|
if (document.contentType === 'application/pdf') {
|
1870
2007
|
resolve('Skipping DOM mutation check for PDF.');
|
1871
2008
|
return;
|
1872
2009
|
}
|
1873
2010
|
|
1874
|
-
let timeout;
|
2011
|
+
let timeout: NodeJS.Timeout;
|
1875
2012
|
let mutationCount = 0;
|
1876
2013
|
const MAX_MUTATIONS = 250; // Limit max mutations
|
1877
|
-
const mutationHash = {};
|
2014
|
+
const mutationHash: Record<string, number> = {};
|
1878
2015
|
|
1879
2016
|
const observer = new MutationObserver(mutationsList => {
|
1880
2017
|
clearTimeout(timeout);
|
@@ -1916,14 +2053,17 @@ export const waitForPageLoaded = async (page, timeout = 10000) => {
|
|
1916
2053
|
resolve('Observer timeout reached, exiting.');
|
1917
2054
|
}, OBSERVER_TIMEOUT);
|
1918
2055
|
|
1919
|
-
observer.observe(document.documentElement, {
|
2056
|
+
observer.observe(document.documentElement, {
|
2057
|
+
childList: true,
|
2058
|
+
subtree: true,
|
2059
|
+
attributes: true,
|
2060
|
+
});
|
1920
2061
|
});
|
1921
2062
|
}, OBSERVER_TIMEOUT), // Pass OBSERVER_TIMEOUT dynamically to the browser context
|
1922
2063
|
]);
|
1923
2064
|
};
|
1924
2065
|
|
1925
|
-
|
1926
|
-
function isValidHttpUrl(urlString) {
|
2066
|
+
/**
 * Checks that a string starts with `http://` or `https://` and is followed
 * by at least one character, none of which is a space or a double quote.
 *
 * @param urlString candidate URL text
 * @returns true when the whole string matches that shape
 */
function isValidHttpUrl(urlString: string) {
  return /^(http|https):\/\/[^ "]+$/.test(urlString);
}
|
@@ -6,6 +6,7 @@ import which from 'which';
|
|
6
6
|
import os from 'os';
|
7
7
|
import { spawnSync, execSync } from 'child_process';
|
8
8
|
import { chromium } from 'playwright';
|
9
|
+
import * as Sentry from '@sentry/node';
|
9
10
|
import { silentLogger } from '../logs.js';
|
10
11
|
import { PageInfo } from '../mergeAxeResults.js';
|
11
12
|
|
@@ -29,6 +30,7 @@ export const blackListedFileExtensions = [
|
|
29
30
|
'zip',
|
30
31
|
'webp',
|
31
32
|
'json',
|
33
|
+
'xml'
|
32
34
|
];
|
33
35
|
|
34
36
|
export const getIntermediateScreenshotsPath = (datasetsPath: string): string =>
|
@@ -217,7 +219,7 @@ export const guiInfoStatusTypes = {
|
|
217
219
|
DUPLICATE: 'duplicate',
|
218
220
|
};
|
219
221
|
|
220
|
-
let launchOptionsArgs = [];
|
222
|
+
let launchOptionsArgs: string[] = [];
|
221
223
|
|
222
224
|
// Check if running in docker container
|
223
225
|
if (fs.existsSync('/.dockerenv')) {
|
@@ -273,6 +275,12 @@ export const impactOrder = {
|
|
273
275
|
critical: 3,
|
274
276
|
};
|
275
277
|
|
278
|
+
export const sentryConfig = {
|
279
|
+
dsn: "https://e4ab99e457c531e7bde4a8dc3dd2b1ab@o4509047624761344.ingest.us.sentry.io/4509192349548544",
|
280
|
+
tracesSampleRate: 1.0, // Capture 100% of transactions for performance monitoring
|
281
|
+
profilesSampleRate: 1.0, // Capture 100% of profiles
|
282
|
+
};
|
283
|
+
// Legacy code start - Google Sheets submission
|
276
284
|
export const formDataFields = {
|
277
285
|
formUrl: `https://docs.google.com/forms/d/e/1FAIpQLSem5C8fyNs5TiU5Vv2Y63-SH7CHN86f-LEPxeN_1u_ldUbgUA/formResponse`, // prod
|
278
286
|
entryUrlField: 'entry.1562345227',
|
@@ -285,6 +293,7 @@ export const formDataFields = {
|
|
285
293
|
additionalPageDataField: 'entry.2090887881',
|
286
294
|
metadataField: 'entry.1027769131',
|
287
295
|
};
|
296
|
+
// Legacy code end - Google Sheets submission
|
288
297
|
|
289
298
|
export const sitemapPaths = [
|
290
299
|
'/sitemap.xml',
|
@@ -444,3 +453,82 @@ export enum RuleFlags {
|
|
444
453
|
DISABLE_OOBEE = 'disable-oobee',
|
445
454
|
ENABLE_WCAG_AAA = 'enable-wcag-aaa',
|
446
455
|
}
|
456
|
+
|
457
|
+
// Note: Not all status codes will appear as Crawler will handle it as best effort first. E.g. try to handle redirect
|
458
|
+
export const STATUS_CODE_METADATA: Record<number,string> = {
|
459
|
+
// Custom Codes for Oobee's use
|
460
|
+
0: 'Page Excluded',
|
461
|
+
1: 'Not A Supported Document',
|
462
|
+
2: 'Web Crawler Errored',
|
463
|
+
|
464
|
+
// 599 is set because Crawlee returns response status 100, 102, 103 as 599
|
465
|
+
599: 'Uncommon Response Status Code Received',
|
466
|
+
|
467
|
+
// This is Status OK but thrown when the crawler cannot scan the page
|
468
|
+
200: '200 - However Page Could Not Be Scanned',
|
469
|
+
|
470
|
+
// 1xx - Informational
|
471
|
+
100: '100 - Continue',
|
472
|
+
101: '101 - Switching Protocols',
|
473
|
+
102: '102 - Processing',
|
474
|
+
103: '103 - Early Hints',
|
475
|
+
|
476
|
+
// 2xx - Browser Doesn't Support
|
477
|
+
204: '204 - No Content',
|
478
|
+
205: '205 - Reset Content',
|
479
|
+
|
480
|
+
// 3xx - Redirection
|
481
|
+
300: '300 - Multiple Choices',
|
482
|
+
301: '301 - Moved Permanently',
|
483
|
+
302: '302 - Found',
|
484
|
+
303: '303 - See Other',
|
485
|
+
304: '304 - Not Modified',
|
486
|
+
305: '305 - Use Proxy',
|
487
|
+
307: '307 - Temporary Redirect',
|
488
|
+
308: '308 - Permanent Redirect',
|
489
|
+
|
490
|
+
// 4xx - Client Error
|
491
|
+
400: '400 - Bad Request',
|
492
|
+
401: '401 - Unauthorized',
|
493
|
+
402: '402 - Payment Required',
|
494
|
+
403: '403 - Forbidden',
|
495
|
+
404: '404 - Not Found',
|
496
|
+
405: '405 - Method Not Allowed',
|
497
|
+
406: '406 - Not Acceptable',
|
498
|
+
407: '407 - Proxy Authentication Required',
|
499
|
+
408: '408 - Request Timeout',
|
500
|
+
409: '409 - Conflict',
|
501
|
+
410: '410 - Gone',
|
502
|
+
411: '411 - Length Required',
|
503
|
+
412: '412 - Precondition Failed',
|
504
|
+
413: '413 - Payload Too Large',
|
505
|
+
414: '414 - URI Too Long',
|
506
|
+
415: '415 - Unsupported Media Type',
|
507
|
+
416: '416 - Range Not Satisfiable',
|
508
|
+
417: '417 - Expectation Failed',
|
509
|
+
418: "418 - I'm a teapot",
|
510
|
+
421: '421 - Misdirected Request',
|
511
|
+
422: '422 - Unprocessable Content',
|
512
|
+
423: '423 - Locked',
|
513
|
+
424: '424 - Failed Dependency',
|
514
|
+
425: '425 - Too Early',
|
515
|
+
426: '426 - Upgrade Required',
|
516
|
+
428: '428 - Precondition Required',
|
517
|
+
429: '429 - Too Many Requests',
|
518
|
+
431: '431 - Request Header Fields Too Large',
|
519
|
+
451: '451 - Unavailable For Legal Reasons',
|
520
|
+
|
521
|
+
// 5xx - Server Error
|
522
|
+
500: '500 - Internal Server Error',
|
523
|
+
501: '501 - Not Implemented',
|
524
|
+
502: '502 - Bad Gateway',
|
525
|
+
503: '503 - Service Unavailable',
|
526
|
+
504: '504 - Gateway Timeout',
|
527
|
+
505: '505 - HTTP Version Not Supported',
|
528
|
+
506: '506 - Variant Also Negotiates',
|
529
|
+
507: '507 - Insufficient Storage',
|
530
|
+
508: '508 - Loop Detected',
|
531
|
+
510: '510 - Not Extended',
|
532
|
+
511: '511 - Network Authentication Required',
|
533
|
+
|
534
|
+
};
|