@govtechsg/oobee 0.10.42 → 0.10.43
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/REPORTS.md +71 -2
- package/package.json +3 -2
- package/src/cli.ts +2 -11
- package/src/constants/common.ts +68 -52
- package/src/constants/constants.ts +81 -1
- package/src/constants/oobeeAi.ts +6 -6
- package/src/constants/questions.ts +3 -2
- package/src/crawlers/commonCrawlerFunc.ts +16 -15
- package/src/crawlers/crawlDomain.ts +82 -84
- package/src/crawlers/crawlIntelligentSitemap.ts +21 -19
- package/src/crawlers/crawlSitemap.ts +120 -109
- package/src/crawlers/custom/findElementByCssSelector.ts +1 -1
- package/src/crawlers/custom/flagUnlabelledClickableElements.ts +8 -8
- package/src/crawlers/custom/xPathToCss.ts +10 -10
- package/src/crawlers/runCustom.ts +1 -1
- package/src/index.ts +3 -4
- package/src/logs.ts +1 -1
- package/src/mergeAxeResults.ts +3 -5
- package/src/npmIndex.ts +12 -8
- package/src/screenshotFunc/htmlScreenshotFunc.ts +7 -19
- package/src/types/text-readability.d.ts +3 -0
- package/src/types/types.ts +1 -1
- package/src/utils.ts +128 -114
- package/src/xPathToCss.ts +0 -186
- package/src/xPathToCssCypress.ts +0 -178
package/REPORTS.md
CHANGED
@@ -242,7 +242,9 @@ This file contains a summary of pages affected by accessibility issues.
       {
         "url": "<string>",
         "pageTitle": "<string>",
-        "actualUrl": "<string>"
+        "actualUrl": "<string>",
+        "metadata": "<string>",
+        "httpStatusCode": number
       },
     ],
     "pagesNotScannedCount": <number>
@@ -340,7 +342,9 @@ This file contains a summary of accessibility issues found in a scan, categorize
       {
         "url": "<string>",
         "pageTitle": "<string>",
-        "actualUrl": "<string>"
+        "actualUrl": "<string>",
+        "metadata": "<string>",
+        "httpStatusCode": number
       },
     ],
     "pagesNotScannedCount": <number>
@@ -360,3 +364,68 @@ To deflate the .json.gz.b64, use the following with `pako` library installed:
   // Parse and return the JSON object
   return JSON.parse(jsonString);
 ```
+
+## HTTP Status Codes Returned for Skipped Pages
+In scanPagesSummary.json and scanPagesDetail,json, within each `pagesNotScanned`, the following HTTP and Metadata is stored to provide a reason why the apge could not be scanned.
+
+| httpStatusCode | metadata |
+|------|------------------------------------------------|
+| 0 | Page Excluded |
+| 1 | Not A Supported Document |
+| 2 | Web Crawler Errored |
+| 100 | 100 – Continue |
+| 101 | 101 – Switching Protocols |
+| 102 | 102 – Processing |
+| 103 | 103 – Early Hints |
+| 200 | 200 – However Page Could Not Be Scanned |
+| 204 | 204 – No Content |
+| 205 | 205 – Reset Content |
+| 300 | 300 – Multiple Choices |
+| 301 | 301 – Moved Permanently |
+| 302 | 302 – Found |
+| 303 | 303 – See Other |
+| 304 | 304 – Not Modified |
+| 305 | 305 – Use Proxy |
+| 307 | 307 – Temporary Redirect |
+| 308 | 308 – Permanent Redirect |
+| 400 | 400 – Bad Request |
+| 401 | 401 – Unauthorized |
+| 402 | 402 – Payment Required |
+| 403 | 403 – Forbidden |
+| 404 | 404 – Not Found |
+| 405 | 405 – Method Not Allowed |
+| 406 | 406 – Not Acceptable |
+| 407 | 407 – Proxy Authentication Required |
+| 408 | 408 – Request Timeout |
+| 409 | 409 – Conflict |
+| 410 | 410 – Gone |
+| 411 | 411 – Length Required |
+| 412 | 412 – Precondition Failed |
+| 413 | 413 – Payload Too Large |
+| 414 | 414 – URI Too Long |
+| 415 | 415 – Unsupported Media Type |
+| 416 | 416 – Range Not Satisfiable |
+| 417 | 417 – Expectation Failed |
+| 418 | 418 – I’m a teapot |
+| 421 | 421 – Misdirected Request |
+| 422 | 422 – Unprocessable Content |
+| 423 | 423 – Locked |
+| 424 | 424 – Failed Dependency |
+| 425 | 425 – Too Early |
+| 426 | 426 – Upgrade Required |
+| 428 | 428 – Precondition Required |
+| 429 | 429 – Too Many Requests |
+| 431 | 431 – Request Header Fields Too Large |
+| 451 | 451 – Unavailable For Legal Reasons |
+| 500 | 500 – Internal Server Error |
+| 501 | 501 – Not Implemented |
+| 502 | 502 – Bad Gateway |
+| 503 | 503 – Service Unavailable |
+| 504 | 504 – Gateway Timeout |
+| 505 | 505 – HTTP Version Not Supported |
+| 506 | 506 – Variant Also Negotiates |
+| 507 | 507 – Insufficient Storage |
+| 508 | 508 – Loop Detected |
+| 510 | 510 – Not Extended |
+| 511 | 511 – Network Authentication Required |
+| 599 | Uncommon Response Code Received |
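The new `metadata` and `httpStatusCode` fields in each `pagesNotScanned` entry let report consumers surface why a page was skipped. A minimal consumer-side sketch in TypeScript — the interface and helper names are illustrative and not part of the package; only the field names come from the schema above:

```typescript
// Illustrative types for the new pagesNotScanned fields in
// scanPagesSummary.json / scanPagesDetail.json. The names below are
// assumptions; only the fields themselves come from the schema above.
interface PageNotScanned {
  url: string;
  pageTitle: string;
  actualUrl: string;
  metadata: string; // human-readable reason, e.g. '403 - Forbidden'
  httpStatusCode: number; // 0-2 are Oobee-specific codes; others mirror HTTP statuses
}

function describeSkippedPage(page: PageNotScanned): string {
  // Codes 0-2 are internal to Oobee (excluded, unsupported document, crawler error);
  // anything else is the HTTP status observed for the page.
  const origin = page.httpStatusCode <= 2 ? 'oobee' : 'http';
  return `${page.url} skipped (${origin}): ${page.metadata}`;
}
```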
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@govtechsg/oobee",
   "main": "dist/npmIndex.js",
-  "version": "0.10.42",
+  "version": "0.10.43",
   "type": "module",
   "author": "Government Technology Agency <info@tech.gov.sg>",
   "dependencies": {
@@ -46,6 +46,7 @@
     "@types/fs-extra": "^11.0.4",
     "@types/inquirer": "^9.0.7",
     "@types/lodash": "^4.17.7",
+    "@types/mime-types": "^2.1.4",
     "@types/safe-regex": "^1.1.6",
     "@types/validator": "^13.11.10",
     "@types/which": "^3.0.4",
@@ -97,4 +98,4 @@
     "url": "https://github.com/GovTechSG/oobee/issues"
   },
   "homepage": "https://github.com/GovTechSG/oobee#readme"
-}
+}
package/src/cli.ts
CHANGED
@@ -137,9 +137,6 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`,
       printMessage([`Invalid blacklistedPatternsFilename file path. ${err}`], messageOptions);
       process.exit(1);
     }
-
-    // eslint-disable-next-line no-unreachable
-    return null;
   })
   .coerce('i', option => {
     const { choices } = cliOptions.i;
@@ -241,7 +238,7 @@ const scanInit = async (argvs: Answers): Promise<string> => {
     clonedDataDir,
     updatedArgvs.playwrightDeviceDetailsObject,
     isCustomFlow,
-    updatedArgvs.header,
+    parseHeaders(updatedArgvs.header),
   );
   switch (res.status) {
     case statuses.success.code: {
@@ -255,17 +252,14 @@ const scanInit = async (argvs: Answers): Promise<string> => {
     case statuses.unauthorised.code: {
       printMessage([statuses.unauthorised.message], messageOptions);
       process.exit(res.status);
-      break;
     }
     case statuses.cannotBeResolved.code: {
       printMessage([statuses.cannotBeResolved.message], messageOptions);
       process.exit(res.status);
-      break;
     }
     case statuses.systemError.code: {
       printMessage([statuses.systemError.message], messageOptions);
       process.exit(res.status);
-      break;
     }
     case statuses.invalidUrl.code: {
       if (
@@ -296,17 +290,14 @@ const scanInit = async (argvs: Answers): Promise<string> => {
     case statuses.notASitemap.code: {
       printMessage([statuses.notASitemap.message], messageOptions);
       process.exit(res.status);
-      break;
     }
     case statuses.notALocalFile.code: {
       printMessage([statuses.notALocalFile.message], messageOptions);
       process.exit(res.status);
-      break;
     }
     case statuses.browserError.code: {
       printMessage([statuses.browserError.message], messageOptions);
       process.exit(res.status);
-      break;
     }
     default:
       break;
@@ -362,7 +353,7 @@ const scanInit = async (argvs: Answers): Promise<string> => {
   }
 
   // Delete dataset and request queues
-
+  cleanUp(data.randomToken);
 
   return getStoragePath(data.randomToken);
 };
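With this change the CLI no longer hands the raw `--header` string to `checkUrl`; it is parsed into a `Record<string, string>` first. Based on the `parseHeaders` implementation visible in the `common.ts` changes below (split on `', '`, then split each pair on the first space), a sketch of the expected input format — the header values and the import path are assumptions for illustration:

```typescript
// Assumed import path from src/cli.ts; parseHeaders is exported from
// src/constants/common.ts.
import { parseHeaders } from './constants/common.js';

// '--header' takes pairs in the form "Name Value", separated by ", ".
const raw = 'Authorization Basic dXNlcjpwYXNz, X-Custom-Header some-value';

// parseHeaders(raw) should yield approximately:
// {
//   Authorization: 'Basic dXNlcjpwYXNz',
//   'X-Custom-Header': 'some-value',
// }
const extraHTTPHeaders = parseHeaders(raw);
```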
package/src/constants/common.ts
CHANGED
@@ -15,8 +15,8 @@ import safe from 'safe-regex';
 import * as https from 'https';
 import os from 'os';
 import { minimatch } from 'minimatch';
-import { globSync } from 'glob';
-import { LaunchOptions, devices, webkit } from 'playwright';
+import { globSync, GlobOptionsWithFileTypesFalse } from 'glob';
+import { LaunchOptions, Locator, Page, devices, webkit } from 'playwright';
 import printMessage from 'print-message';
 import constants, {
   getDefaultChromeDataDir,
@@ -31,6 +31,7 @@ import { silentLogger } from '../logs.js';
 import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
 import { randomThreeDigitNumberString } from '../utils.js';
 import { Answers, Data } from '../index.js';
+import { DeviceDescriptor } from '../types/types.js';
 
 // validateDirPath validates a provided directory path
 // returns null if no error
@@ -252,7 +253,7 @@ export const getUrlMessage = (scanner: ScannerTypes): string => {
   }
 };
 
-export const isInputValid = inputString => {
+export const isInputValid = (inputString: string): boolean => {
   if (!validator.isEmpty(inputString)) {
     const removeBlackListCharacters = validator.escape(inputString);
 
@@ -373,12 +374,12 @@ const requestToUrl = async (
 };
 
 const checkUrlConnectivityWithBrowser = async (
-  url,
-  browserToRun,
-  clonedDataDir,
-  playwrightDeviceDetailsObject,
-  isCustomFlow,
-  extraHTTPHeaders,
+  url: string,
+  browserToRun: string,
+  clonedDataDir: string,
+  playwrightDeviceDetailsObject: DeviceDescriptor,
+  isCustomFlow: boolean,
+  extraHTTPHeaders: Record<string, string>,
 ) => {
   const res = new RES();
 
@@ -468,7 +469,6 @@ const checkUrlConnectivityWithBrowser = async (
       res.content = responseFromUrl.content;
     }
   } catch (error) {
-
     // But this does work with the headless=new flag
     if (error.message.includes('net::ERR_INVALID_AUTH_CREDENTIALS')) {
       res.status = constants.urlCheckStatuses.unauthorised.code;
@@ -510,13 +510,13 @@ export const isSitemapContent = (content: string) => {
 };
 
 export const checkUrl = async (
-  scanner,
-  url,
-  browser,
-  clonedDataDir,
-  playwrightDeviceDetailsObject,
-  isCustomFlow,
-  extraHTTPHeaders,
+  scanner: ScannerTypes,
+  url: string,
+  browser: string,
+  clonedDataDir: string,
+  playwrightDeviceDetailsObject: DeviceDescriptor,
+  isCustomFlow: boolean,
+  extraHTTPHeaders: Record<string, string>,
 ) => {
   const res = await checkUrlConnectivityWithBrowser(
     url,
@@ -548,7 +548,7 @@ export const parseHeaders = (header?: string): Record<string, string> => {
   // parse HTTP headers from string
   if (!header) return {};
   const headerValues = header.split(', ');
-  const allHeaders = {};
+  const allHeaders: Record<string, string> = {};
   headerValues.map((headerValue: string) => {
     const headerValuePair = headerValue.split(/ (.*)/s);
     if (headerValuePair.length < 2) {
@@ -776,11 +776,11 @@ export const getLinksFromSitemap = async (
   password: string,
 ) => {
   const scannedSitemaps = new Set<string>();
-  const urls = {}; // dictionary of requests to urls to be scanned
+  const urls: Record<string, Request> = {}; // dictionary of requests to urls to be scanned
 
   const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;
 
-  const addToUrlList = url => {
+  const addToUrlList = (url: string) => {
     if (!url) return;
     if (isDisallowedInRobotsTxt(url)) return;
 
@@ -803,14 +803,14 @@ export const getLinksFromSitemap = async (
     urls[url] = request;
   };
 
-  const addBasicAuthCredentials = (url, username, password) => {
+  const addBasicAuthCredentials = (url: string, username: string, password: string) => {
     const urlObject = new URL(url);
     urlObject.username = username;
     urlObject.password = password;
     return urlObject.toString();
   };
 
-  const calculateCloseness = sitemapUrl => {
+  const calculateCloseness = (sitemapUrl: string) => {
     // Remove 'http://', 'https://', and 'www.' prefixes from the URLs
     const normalizedSitemapUrl = sitemapUrl.replace(/^(https?:\/\/)?(www\.)?/, '');
     const normalizedUserUrlInput = userUrlInput
@@ -825,10 +825,16 @@ export const getLinksFromSitemap = async (
     }
     return 0;
   };
-  const processXmlSitemap = async (
-
+  const processXmlSitemap = async (
+    $: cheerio.CheerioAPI,
+    sitemapType: number,
+    linkSelector: string,
+    dateSelector: string,
+    sectionSelector: string,
+  ) => {
+    const urlList: { url: string; lastModifiedDate: Date }[] = [];
     // Iterate through each URL element in the sitemap, collect url and modified date
-    $(sectionSelector).each((
+    $(sectionSelector).each((_index, urlElement) => {
       let url;
       if (sitemapType === constants.xmlSitemapTypes.atom) {
         url = $(urlElement).find(linkSelector).prop('href');
@@ -850,8 +856,7 @@ export const getLinksFromSitemap = async (
       }
 
       // If closeness is the same, sort by last modified date in descending order
-
-      return dateDifference !== 0 ? dateDifference : 0; // Maintain original order for equal dates
+      return (b.lastModifiedDate?.getTime() || 0) - (a.lastModifiedDate?.getTime() || 0);
     });
   }
 
@@ -861,7 +866,7 @@ export const getLinksFromSitemap = async (
   }
 };
 
-  const processNonStandardSitemap = data => {
+  const processNonStandardSitemap = (data: string) => {
     const urlsFromData = crawlee
       .extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') })
       .slice(0, maxLinksCount);
@@ -934,7 +939,7 @@ export const getLinksFromSitemap = async (
   const sitemapIndex = page.locator('sitemapindex');
   const rss = page.locator('rss');
   const feed = page.locator('feed');
-  const isRoot = async locator => (await locator.count()) > 0;
+  const isRoot = async (locator: Locator) => (await locator.count()) > 0;
 
   if (await isRoot(urlSet)) {
     data = await urlSet.evaluate(elem => elem.outerHTML);
@@ -1054,14 +1059,14 @@ export const getLinksFromSitemap = async (
   return requestList;
 };
 
-export const validEmail = email => {
+export const validEmail = (email: string) => {
   const emailRegex = /^.+@.+\..+$/u;
 
   return emailRegex.test(email);
 };
 
 // For new user flow.
-export const validName = name => {
+export const validName = (name: string) => {
   // Allow only printable characters from any language
   const regex = /^[\p{L}\p{N}\s'".,()\[\]{}!?:؛،؟…]+$/u;
 
@@ -1213,11 +1218,11 @@ export const getEdgeData = () => {
 * @param {*} destDir destination directory
 * @returns boolean indicating whether the operation was successful
 */
-const cloneChromeProfileCookieFiles = (options, destDir) => {
+const cloneChromeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, destDir: string) => {
  let profileCookiesDir;
  // Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
  // and ../Chrome/<profile name>/Cookies for mac
-  let profileNamesRegex;
+  let profileNamesRegex: RegExp;
  if (os.platform() === 'win32') {
    profileCookiesDir = globSync('**/Network/Cookies', {
      ...options,
@@ -1288,11 +1293,11 @@ const cloneChromeProfileCookieFiles = (options, destDir) => {
 * @param {*} destDir destination directory
 * @returns boolean indicating whether the operation was successful
 */
-const cloneEdgeProfileCookieFiles = (options, destDir) => {
+const cloneEdgeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, destDir: string) => {
  let profileCookiesDir;
  // Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
  // and ../Chrome/<profile name>/Cookies for mac
-  let profileNamesRegex;
+  let profileNamesRegex: RegExp;
  // Ignores the cloned oobee directory if exists
  if (os.platform() === 'win32') {
    profileCookiesDir = globSync('**/Network/Cookies', {
@@ -1361,7 +1366,7 @@ const cloneEdgeProfileCookieFiles = (options, destDir) => {
 * @param {string} destDir - destination directory
 * @returns boolean indicating whether the operation was successful
 */
-const cloneLocalStateFile = (options, destDir) => {
+const cloneLocalStateFile = (options: GlobOptionsWithFileTypesFalse, destDir: string) => {
  const localState = globSync('**/*Local State', {
    ...options,
    maxDepth: 1,
@@ -1647,8 +1652,9 @@ export const getPlaywrightDeviceDetailsObject = (
  deviceChosen: string,
  customDevice: string,
  viewportWidth: number,
-) => {
-  let playwrightDeviceDetailsObject =
+): DeviceDescriptor => {
+  let playwrightDeviceDetailsObject = devices['Desktop Chrome']; // default to Desktop Chrome
+
  if (deviceChosen === 'Mobile' || customDevice === 'iPhone 11') {
    playwrightDeviceDetailsObject = devices['iPhone 11'];
  } else if (customDevice === 'Samsung Galaxy S9+') {
@@ -1656,6 +1662,11 @@ export const getPlaywrightDeviceDetailsObject = (
  } else if (viewportWidth) {
    playwrightDeviceDetailsObject = {
      viewport: { width: viewportWidth, height: 720 },
+      isMobile: false,
+      hasTouch: false,
+      userAgent: devices['Desktop Chrome'].userAgent,
+      deviceScaleFactor: 1,
+      defaultBrowserType: 'chromium',
    };
  } else if (customDevice) {
    playwrightDeviceDetailsObject = devices[customDevice.replace(/_/g, ' ')];
@@ -1777,14 +1788,17 @@ export const submitForm = async (
  }
 };
 
-export async function initModifiedUserAgent(browser?: string, playwrightDeviceDetailsObject?: object) {
+export async function initModifiedUserAgent(
+  browser?: string,
+  playwrightDeviceDetailsObject?: object,
+) {
  const isHeadless = process.env.CRAWLEE_HEADLESS === '1';
-
+
  // If headless mode is enabled, ensure the headless flag is set.
  if (isHeadless && !constants.launchOptionsArgs.includes('--headless=new')) {
    constants.launchOptionsArgs.push('--headless=new');
  }
-
+
  // Build the launch options using your production settings.
  // headless is forced to false as in your persistent context, and we merge in getPlaywrightLaunchOptions and device details.
  const launchOptions = {
@@ -1803,17 +1817,16 @@ export async function initModifiedUserAgent(browser?: string, playwrightDeviceDe
 
  // Modify the UA:
  // Replace "HeadlessChrome" with "Chrome" if present.
-  const modifiedUA = defaultUA.includes('HeadlessChrome')
+  const modifiedUA = defaultUA.includes('HeadlessChrome')
    ? defaultUA.replace('HeadlessChrome', 'Chrome')
    : defaultUA;
-
+
  // Push the modified UA flag into your global launch options.
  constants.launchOptionsArgs.push(`--user-agent=${modifiedUA}`);
  // Optionally log the modified UA.
  // console.log('Modified User Agent:', modifiedUA);
 }
 
-
 /**
 * @param {string} browser browser name ("chrome" or "edge", null for chromium, the default Playwright browser)
 * @returns playwright launch options object. For more details: https://playwright.dev/docs/api/class-browsertype#browser-type-launch
@@ -1856,25 +1869,25 @@ export const urlWithoutAuth = (url: string): string => {
  return parsedUrl.toString();
 };
 
-export const waitForPageLoaded = async (page, timeout = 10000) => {
+export const waitForPageLoaded = async (page: Page, timeout = 10000) => {
  const OBSERVER_TIMEOUT = timeout; // Ensure observer timeout does not exceed the main timeout
 
  return Promise.race([
    page.waitForLoadState('load'), // Ensure page load completes
    page.waitForLoadState('networkidle'), // Wait for network requests to settle
    new Promise(resolve => setTimeout(resolve, timeout)), // Hard timeout as a fallback
-    page.evaluate(
-      return new Promise(
+    page.evaluate(OBSERVER_TIMEOUT => {
+      return new Promise(resolve => {
        // Skip mutation check for PDFs
        if (document.contentType === 'application/pdf') {
          resolve('Skipping DOM mutation check for PDF.');
          return;
        }
 
-        let timeout;
+        let timeout: NodeJS.Timeout;
        let mutationCount = 0;
        const MAX_MUTATIONS = 250; // Limit max mutations
-        const mutationHash = {};
+        const mutationHash: Record<string, number> = {};
 
        const observer = new MutationObserver(mutationsList => {
          clearTimeout(timeout);
@@ -1916,14 +1929,17 @@ export const waitForPageLoaded = async (page, timeout = 10000) => {
          resolve('Observer timeout reached, exiting.');
        }, OBSERVER_TIMEOUT);
 
-        observer.observe(document.documentElement, {
+        observer.observe(document.documentElement, {
+          childList: true,
+          subtree: true,
+          attributes: true,
+        });
      });
    }, OBSERVER_TIMEOUT), // Pass OBSERVER_TIMEOUT dynamically to the browser context
  ]);
 };
 
-
-function isValidHttpUrl(urlString) {
+function isValidHttpUrl(urlString: string) {
  const pattern = /^(http|https):\/\/[^ "]+$/;
  return pattern.test(urlString);
 }
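`getPlaywrightDeviceDetailsObject` now always returns a complete `DeviceDescriptor` (falling back to `devices['Desktop Chrome']`), so the result can be spread straight into a Playwright browser context. A rough usage sketch — the launch flow shown here is generic Playwright usage, not the package's own crawler setup:

```typescript
import { chromium, devices } from 'playwright';

// Sketch only: stand-in for the value returned by getPlaywrightDeviceDetailsObject,
// which now defaults to a full DeviceDescriptor instead of a bare viewport object.
const deviceDetails = devices['Desktop Chrome'];

async function openPage() {
  const browser = await chromium.launch();
  // A complete descriptor (viewport, userAgent, isMobile, hasTouch, ...) can be
  // spread directly into newContext without patching in missing fields.
  const context = await browser.newContext({ ...deviceDetails });
  return context.newPage();
}
```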
package/src/constants/constants.ts
CHANGED
@@ -29,6 +29,7 @@ export const blackListedFileExtensions = [
   'zip',
   'webp',
   'json',
+  'xml'
 ];
 
 export const getIntermediateScreenshotsPath = (datasetsPath: string): string =>
@@ -217,7 +218,7 @@ export const guiInfoStatusTypes = {
   DUPLICATE: 'duplicate',
 };
 
-let launchOptionsArgs = [];
+let launchOptionsArgs: string[] = [];
 
 // Check if running in docker container
 if (fs.existsSync('/.dockerenv')) {
@@ -444,3 +445,82 @@ export enum RuleFlags {
   DISABLE_OOBEE = 'disable-oobee',
   ENABLE_WCAG_AAA = 'enable-wcag-aaa',
 }
+
+// Note: Not all status codes will appear as Crawler will handle it as best effort first. E.g. try to handle redirect
+export const STATUS_CODE_METADATA: Record<number,string> = {
+  // Custom Codes for Oobee's use
+  0: 'Page Excluded',
+  1: 'Not A Supported Document',
+  2: 'Web Crawler Errored',
+
+  // 599 is set because Crawlee returns response status 100, 102, 103 as 599
+  599: 'Uncommon Response Status Code Received',
+
+  // This is Status OK but thrown when the crawler cannot scan the page
+  200: '200 - However Page Could Not Be Scanned',
+
+  // 1xx - Informational
+  100: '100 - Continue',
+  101: '101 - Switching Protocols',
+  102: '102 - Processing',
+  103: '103 - Early Hints',
+
+  // 2xx - Browser Doesn't Support
+  204: '204 - No Content',
+  205: '205 - Reset Content',
+
+  // 3xx - Redirection
+  300: '300 - Multiple Choices',
+  301: '301 - Moved Permanently',
+  302: '302 - Found',
+  303: '303 - See Other',
+  304: '304 - Not Modified',
+  305: '305 - Use Proxy',
+  307: '307 - Temporary Redirect',
+  308: '308 - Permanent Redirect',
+
+  // 4xx - Client Error
+  400: '400 - Bad Request',
+  401: '401 - Unauthorized',
+  402: '402 - Payment Required',
+  403: '403 - Forbidden',
+  404: '404 - Not Found',
+  405: '405 - Method Not Allowed',
+  406: '406 - Not Acceptable',
+  407: '407 - Proxy Authentication Required',
+  408: '408 - Request Timeout',
+  409: '409 - Conflict',
+  410: '410 - Gone',
+  411: '411 - Length Required',
+  412: '412 - Precondition Failed',
+  413: '413 - Payload Too Large',
+  414: '414 - URI Too Long',
+  415: '415 - Unsupported Media Type',
+  416: '416 - Range Not Satisfiable',
+  417: '417 - Expectation Failed',
+  418: "418 - I'm a teapot",
+  421: '421 - Misdirected Request',
+  422: '422 - Unprocessable Content',
+  423: '423 - Locked',
+  424: '424 - Failed Dependency',
+  425: '425 - Too Early',
+  426: '426 - Upgrade Required',
+  428: '428 - Precondition Required',
+  429: '429 - Too Many Requests',
+  431: '431 - Request Header Fields Too Large',
+  451: '451 - Unavailable For Legal Reasons',
+
+  // 5xx - Server Error
+  500: '500 - Internal Server Error',
+  501: '501 - Not Implemented',
+  502: '502 - Bad Gateway',
+  503: '503 - Service Unavailable',
+  504: '504 - Gateway Timeout',
+  505: '505 - HTTP Version Not Supported',
+  506: '506 - Variant Also Negotiates',
+  507: '507 - Insufficient Storage',
+  508: '508 - Loop Detected',
+  510: '510 - Not Extended',
+  511: '511 - Network Authentication Required',
+
+};
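A small sketch of how a caller might resolve a crawler response against `STATUS_CODE_METADATA`, falling back to the catch-all 599 entry for codes that are not mapped; the helper itself is illustrative and not part of the package:

```typescript
import { STATUS_CODE_METADATA } from './constants.js';

// Illustrative helper: map a response status to its human-readable metadata,
// using the 599 catch-all for anything not listed in STATUS_CODE_METADATA.
function statusToMetadata(statusCode: number): string {
  return STATUS_CODE_METADATA[statusCode] ?? STATUS_CODE_METADATA[599];
}

statusToMetadata(404); // '404 - Not Found'
statusToMetadata(799); // 'Uncommon Response Status Code Received'
```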
package/src/constants/oobeeAi.ts
CHANGED
@@ -24,7 +24,7 @@ export const oobeeAiRules = [
   'autocomplete-valid',
 ];
 
-export const oobeeAiHtmlETL = htmlSnippet => {
+export const oobeeAiHtmlETL = (htmlSnippet: string) => {
   // Whitelisted attributes (to not drop)
   // i.e. any other attribute will be dropped
   const whitelistedAttributes = [
@@ -60,12 +60,12 @@ export const oobeeAiHtmlETL = htmlSnippet => {
     `aria-labelledby`,
   ];
 
-  const sortAlphaAttributes = html => {
+  const sortAlphaAttributes = (html: string) => {
     let entireHtml = '';
     const htmlOpeningTagRegex = /<[^>]+/g;
     const htmlTagmatches = html.match(htmlOpeningTagRegex);
 
-    let sortedHtmlTag;
+    let sortedHtmlTag: string = '';
 
     htmlTagmatches.forEach(htmlTag => {
       const closingTag = htmlTag.trim().slice(-1) === '/' ? '/>' : '>';
@@ -112,7 +112,7 @@ export const oobeeAiHtmlETL = htmlSnippet => {
 
   // For all attributes within mutedAttributeValues array
   // replace their values with "something" while maintaining the attribute
-  const muteAttributeValues = html => {
+  const muteAttributeValues = (html: string) => {
     const regex = /(\s+)([\w-]+)(\s*=\s*")([^"]*)(")/g;
 
     // p1 is the whitespace before the attribute
@@ -120,7 +120,7 @@ export const oobeeAiHtmlETL = htmlSnippet => {
     // p3 is the attribute value before the replacement
     // p4 is the attribute value (replaced with "...")
     // p5 is the closing quote of the attribute value
-    return html.replace(regex, (match, p1, p2, p3, p4, p5) => {
+    return html.replace(regex, (match, p1, p2, p3, _p4, p5) => {
       if (mutedAttributeValues.includes(p2)) {
         return `${p1}${p2}${p3}...${p5}`;
       }
@@ -129,7 +129,7 @@ export const oobeeAiHtmlETL = htmlSnippet => {
   };
 
   // Drop all attributes from the HTML snippet except whitelisted
-  const dropAllExceptWhitelisted = html => {
+  const dropAllExceptWhitelisted = (html: string) => {
     const regex = new RegExp(
       `(\\s+)(?!${whitelistedAttributes.join(`|`)})([\\w-]+)(\\s*=\\s*"[^"]*")`,
       `g`,
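For illustration, the `muteAttributeValues` step above keeps every attribute but blanks out the values of attributes in `mutedAttributeValues`. A standalone sketch of the same regex replace — the muted-attribute list here is a placeholder, not the package's actual list:

```typescript
// Standalone sketch of the muteAttributeValues step in oobeeAiHtmlETL.
// The real mutedAttributeValues list lives in oobeeAi.ts; ['href', 'src'] is a placeholder.
const mutedAttributeValues = ['href', 'src'];
const regex = /(\s+)([\w-]+)(\s*=\s*")([^"]*)(")/g;

const html = '<a href="https://example.com" aria-label="Home">Home</a>';
const muted = html.replace(regex, (match, p1, p2, p3, _p4, p5) =>
  mutedAttributeValues.includes(p2) ? `${p1}${p2}${p3}...${p5}` : match,
);
// muted === '<a href="..." aria-label="Home">Home</a>'
```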
package/src/constants/questions.ts
CHANGED
@@ -12,12 +12,13 @@ import {
   validEmail,
   validName,
   validateCustomFlowLabel,
+  parseHeaders,
 } from './common.js';
 import constants, { BrowserTypes, ScannerTypes } from './constants.js';
 
 const userData = getUserDataTxt();
 
-const questions = [];
+const questions: Question[] = [];
 
 const startScanQuestions = [
   {
@@ -95,7 +96,7 @@ const startScanQuestions = [
     clonedBrowserDataDir,
     playwrightDeviceDetailsObject,
     answers.scanner === ScannerTypes.CUSTOM,
-    answers.header,
+    parseHeaders(answers.header),
   );
 
   deleteClonedProfiles(browserToRun);