@govtechsg/oobee 0.10.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.dockerignore +22 -0
- package/.github/pull_request_template.md +11 -0
- package/.github/workflows/docker-test.yml +54 -0
- package/.github/workflows/image.yml +107 -0
- package/.github/workflows/publish.yml +18 -0
- package/.idea/modules.xml +8 -0
- package/.idea/purple-a11y.iml +9 -0
- package/.idea/vcs.xml +6 -0
- package/.prettierrc.json +12 -0
- package/.vscode/extensions.json +5 -0
- package/.vscode/settings.json +10 -0
- package/CODE_OF_CONDUCT.md +128 -0
- package/DETAILS.md +163 -0
- package/Dockerfile +60 -0
- package/INSTALLATION.md +146 -0
- package/INTEGRATION.md +785 -0
- package/LICENSE +22 -0
- package/README.md +587 -0
- package/SECURITY.md +5 -0
- package/__mocks__/mock-report.html +1431 -0
- package/__mocks__/mockFunctions.ts +32 -0
- package/__mocks__/mockIssues.ts +64 -0
- package/__mocks__/mock_all_issues/000000001.json +64 -0
- package/__mocks__/mock_all_issues/000000002.json +53 -0
- package/__mocks__/mock_all_issues/fake-file.txt +0 -0
- package/__tests__/logs.test.ts +25 -0
- package/__tests__/mergeAxeResults.test.ts +278 -0
- package/__tests__/utils.test.ts +118 -0
- package/a11y-scan-results.zip +0 -0
- package/eslint.config.js +53 -0
- package/exclusions.txt +2 -0
- package/gitlab-pipeline-template.yml +54 -0
- package/jest.config.js +1 -0
- package/package.json +96 -0
- package/scripts/copyFiles.js +44 -0
- package/scripts/install_oobee_dependencies.cmd +13 -0
- package/scripts/install_oobee_dependencies.command +101 -0
- package/scripts/install_oobee_dependencies.ps1 +110 -0
- package/scripts/oobee_shell.cmd +13 -0
- package/scripts/oobee_shell.command +11 -0
- package/scripts/oobee_shell.sh +55 -0
- package/scripts/oobee_shell_ps.ps1 +54 -0
- package/src/cli.ts +401 -0
- package/src/combine.ts +240 -0
- package/src/constants/__tests__/common.test.ts +44 -0
- package/src/constants/cliFunctions.ts +305 -0
- package/src/constants/common.ts +1840 -0
- package/src/constants/constants.ts +443 -0
- package/src/constants/errorMeta.json +319 -0
- package/src/constants/itemTypeDescription.ts +11 -0
- package/src/constants/oobeeAi.ts +141 -0
- package/src/constants/questions.ts +181 -0
- package/src/constants/sampleData.ts +187 -0
- package/src/crawlers/__tests__/commonCrawlerFunc.test.ts +51 -0
- package/src/crawlers/commonCrawlerFunc.ts +656 -0
- package/src/crawlers/crawlDomain.ts +877 -0
- package/src/crawlers/crawlIntelligentSitemap.ts +156 -0
- package/src/crawlers/crawlLocalFile.ts +193 -0
- package/src/crawlers/crawlSitemap.ts +356 -0
- package/src/crawlers/custom/extractAndGradeText.ts +57 -0
- package/src/crawlers/custom/flagUnlabelledClickableElements.ts +964 -0
- package/src/crawlers/custom/utils.ts +486 -0
- package/src/crawlers/customAxeFunctions.ts +82 -0
- package/src/crawlers/pdfScanFunc.ts +468 -0
- package/src/crawlers/runCustom.ts +117 -0
- package/src/index.ts +173 -0
- package/src/logs.ts +66 -0
- package/src/mergeAxeResults.ts +964 -0
- package/src/npmIndex.ts +284 -0
- package/src/screenshotFunc/htmlScreenshotFunc.ts +411 -0
- package/src/screenshotFunc/pdfScreenshotFunc.ts +762 -0
- package/src/static/ejs/partials/components/categorySelector.ejs +4 -0
- package/src/static/ejs/partials/components/categorySelectorDropdown.ejs +57 -0
- package/src/static/ejs/partials/components/pagesScannedModal.ejs +70 -0
- package/src/static/ejs/partials/components/reportSearch.ejs +47 -0
- package/src/static/ejs/partials/components/ruleOffcanvas.ejs +105 -0
- package/src/static/ejs/partials/components/scanAbout.ejs +263 -0
- package/src/static/ejs/partials/components/screenshotLightbox.ejs +13 -0
- package/src/static/ejs/partials/components/summaryScanAbout.ejs +141 -0
- package/src/static/ejs/partials/components/summaryScanResults.ejs +16 -0
- package/src/static/ejs/partials/components/summaryTable.ejs +20 -0
- package/src/static/ejs/partials/components/summaryWcagCompliance.ejs +94 -0
- package/src/static/ejs/partials/components/topFive.ejs +6 -0
- package/src/static/ejs/partials/components/wcagCompliance.ejs +70 -0
- package/src/static/ejs/partials/footer.ejs +21 -0
- package/src/static/ejs/partials/header.ejs +230 -0
- package/src/static/ejs/partials/main.ejs +40 -0
- package/src/static/ejs/partials/scripts/bootstrap.ejs +8 -0
- package/src/static/ejs/partials/scripts/categorySelectorDropdownScript.ejs +190 -0
- package/src/static/ejs/partials/scripts/categorySummary.ejs +141 -0
- package/src/static/ejs/partials/scripts/highlightjs.ejs +335 -0
- package/src/static/ejs/partials/scripts/popper.ejs +7 -0
- package/src/static/ejs/partials/scripts/reportSearch.ejs +248 -0
- package/src/static/ejs/partials/scripts/ruleOffcanvas.ejs +801 -0
- package/src/static/ejs/partials/scripts/screenshotLightbox.ejs +71 -0
- package/src/static/ejs/partials/scripts/summaryScanResults.ejs +14 -0
- package/src/static/ejs/partials/scripts/summaryTable.ejs +78 -0
- package/src/static/ejs/partials/scripts/utils.ejs +441 -0
- package/src/static/ejs/partials/styles/bootstrap.ejs +12375 -0
- package/src/static/ejs/partials/styles/highlightjs.ejs +54 -0
- package/src/static/ejs/partials/styles/styles.ejs +1843 -0
- package/src/static/ejs/partials/styles/summaryBootstrap.ejs +12458 -0
- package/src/static/ejs/partials/summaryHeader.ejs +70 -0
- package/src/static/ejs/partials/summaryMain.ejs +75 -0
- package/src/static/ejs/report.ejs +420 -0
- package/src/static/ejs/summary.ejs +47 -0
- package/src/static/mustache/.prettierrc +4 -0
- package/src/static/mustache/Attention Deficit.mustache +11 -0
- package/src/static/mustache/Blind.mustache +11 -0
- package/src/static/mustache/Cognitive.mustache +7 -0
- package/src/static/mustache/Colorblindness.mustache +20 -0
- package/src/static/mustache/Deaf.mustache +12 -0
- package/src/static/mustache/Deafblind.mustache +7 -0
- package/src/static/mustache/Dyslexia.mustache +14 -0
- package/src/static/mustache/Low Vision.mustache +7 -0
- package/src/static/mustache/Mobility.mustache +15 -0
- package/src/static/mustache/Sighted Keyboard Users.mustache +42 -0
- package/src/static/mustache/report.mustache +1709 -0
- package/src/types/print-message.d.ts +28 -0
- package/src/types/types.ts +46 -0
- package/src/types/xpath-to-css.d.ts +3 -0
- package/src/utils.ts +332 -0
- package/tsconfig.json +15 -0
@@ -0,0 +1,1840 @@
|
|
1
|
+
/* eslint-disable consistent-return */
|
2
|
+
/* eslint-disable no-console */
|
3
|
+
/* eslint-disable camelcase */
|
4
|
+
/* eslint-disable no-use-before-define */
|
5
|
+
import validator from 'validator';
|
6
|
+
import axios from 'axios';
|
7
|
+
import { JSDOM } from 'jsdom';
|
8
|
+
import * as cheerio from 'cheerio';
|
9
|
+
import crawlee, { EnqueueStrategy, Request } from 'crawlee';
|
10
|
+
import { parseString } from 'xml2js';
|
11
|
+
import fs from 'fs';
|
12
|
+
import path from 'path';
|
13
|
+
import url, { fileURLToPath, pathToFileURL } from 'url';
|
14
|
+
import safe from 'safe-regex';
|
15
|
+
import * as https from 'https';
|
16
|
+
import os from 'os';
|
17
|
+
import { minimatch } from 'minimatch';
|
18
|
+
import { globSync } from 'glob';
|
19
|
+
import { LaunchOptions, devices, webkit } from 'playwright';
|
20
|
+
import printMessage from 'print-message';
|
21
|
+
import constants, {
|
22
|
+
getDefaultChromeDataDir,
|
23
|
+
getDefaultEdgeDataDir,
|
24
|
+
getDefaultChromiumDataDir,
|
25
|
+
proxy,
|
26
|
+
formDataFields,
|
27
|
+
ScannerTypes,
|
28
|
+
BrowserTypes,
|
29
|
+
} from './constants.js';
|
30
|
+
import { silentLogger } from '../logs.js';
|
31
|
+
import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
|
32
|
+
import { randomThreeDigitNumberString } from '../utils.js';
|
33
|
+
import { Answers, Data } from '../index.js';
|
34
|
+
|
35
|
+
// validateDirPath validates a provided directory path
|
36
|
+
// returns null if no error
|
37
|
+
/**
 * Validates that `dirPath` points to an existing, accessible directory.
 *
 * @param dirPath - Directory path to check.
 * @returns `null` when the path is a usable directory, otherwise a
 *   human-readable message describing what is wrong with it.
 */
export const validateDirPath = (dirPath: string): string | null => {
  if (typeof dirPath !== 'string') {
    return 'Please provide string value of directory path.';
  }

  try {
    // accessSync / statSync throw when the path is missing or not accessible.
    fs.accessSync(dirPath);
    if (!fs.statSync(dirPath).isDirectory()) {
      return 'Please provide a directory path.';
    }

    return null;
  } catch {
    return 'Please ensure path provided exists.';
  }
};
|
53
|
+
|
54
|
+
/**
 * Result record produced by the URL connectivity checks in this module.
 * Any subset of the fields may be supplied at construction time.
 */
export class RES {
  /** Status code; the checks in this file assign values from `constants.urlCheckStatuses`. */
  status: number;

  /** URL the result refers to. */
  url: string;

  /** Body / page content retrieved for the URL. */
  content: string;

  constructor(res?: Partial<RES>) {
    // Nothing to copy when no initial values were given.
    if (!res) return;
    Object.assign(this, res);
  }
}
|
64
|
+
|
65
|
+
/**
 * Validates a custom flow label.
 *
 * A label is rejected, in this order of precedence, when it
 *  1. contains any entry of `constants.forbiddenCharactersInDirPath`,
 *  2. is longer than 80 characters, or
 *  3. contains (case-insensitively) an entry of
 *     `constants.reserveFileNameKeywords` immediately followed by a dot.
 *
 * @param customFlowLabel - Label to validate.
 * @returns `{ isValid: true }` when acceptable, otherwise
 *   `{ isValid: false, errorMessage }`.
 */
export const validateCustomFlowLabel = (customFlowLabel: string) => {
  // Renders a list as "a , b , c" for the error messages below.
  const toDisplayList = (items: { toString(): string }): string =>
    items.toString().replaceAll(',', ' , ');

  const hasReservedKeywordWithDot = constants.reserveFileNameKeywords.some(keyword =>
    customFlowLabel.toLowerCase().includes(`${keyword.toLowerCase()}.`),
  );
  const hasForbiddenCharacter = constants.forbiddenCharactersInDirPath.some(forbidden =>
    customFlowLabel.includes(forbidden),
  );
  const isTooLong = customFlowLabel.length > 80;

  if (hasForbiddenCharacter) {
    return {
      isValid: false,
      errorMessage: `Invalid label. Cannot contain ${toDisplayList(
        constants.forbiddenCharactersInDirPath,
      )}`,
    };
  }

  if (isTooLong) {
    return { isValid: false, errorMessage: `Invalid label. Cannot exceed 80 characters.` };
  }

  if (hasReservedKeywordWithDot) {
    return {
      isValid: false,
      errorMessage: `Invalid label. Cannot have '.' appended to ${toDisplayList(
        constants.reserveFileNameKeywords,
      )} as they are reserved keywords.`,
    };
  }

  return { isValid: true };
};
|
97
|
+
|
98
|
+
// validateFilePath validates a provided file path
|
99
|
+
// returns the resolved absolute path; throws an Error if validation fails
|
100
|
+
/**
 * Validates that `filePath` refers to an existing `.txt` file.
 *
 * @param filePath - Absolute path, or a path relative to `cliDir`.
 * @param cliDir - Directory against which a relative `filePath` is resolved.
 * @returns The absolute path of the validated file.
 * @throws Error when `filePath` is not a string, the path cannot be accessed,
 *   the path is not a regular file, or its extension is not `.txt`.
 */
export const validateFilePath = (filePath: string, cliDir: string) => {
  if (typeof filePath !== 'string') {
    throw new Error('Please provide string value of file path.');
  }

  const absolutePath = path.isAbsolute(filePath) ? filePath : path.resolve(cliDir, filePath);

  // Only the filesystem probes live inside the try block, so the specific
  // validation errors below are not masked by the generic "does not exist" one.
  let isFile: boolean;
  try {
    fs.accessSync(absolutePath);
    isFile = fs.statSync(absolutePath).isFile();
  } catch {
    throw new Error(`Please ensure path provided exists: ${absolutePath}`);
  }

  if (!isFile) {
    throw new Error('Please provide a file path.');
  }

  if (path.extname(absolutePath) !== '.txt') {
    throw new Error('Please provide a file with txt extension.');
  }

  return absolutePath;
};
|
121
|
+
|
122
|
+
/**
 * Loads URL exclusion patterns from a text file, one pattern per line.
 *
 * The file used is `blacklistedPatternsFilename` when given, otherwise
 * `exclusions.txt` in the current working directory if it exists.
 *
 * @param blacklistedPatternsFilename - Explicit exclusions file, or `null`.
 * @returns The trimmed, non-empty lines of the file, or `null` when no
 *   exclusions file applies.
 * @throws Error when any pattern is rejected by the `safe-regex` check.
 */
export const getBlackListedPatterns = (
  blacklistedPatternsFilename: string | null,
): string[] | null => {
  // An explicit filename wins; the default file is only probed as a fallback.
  const exclusionsFile =
    blacklistedPatternsFilename || (fs.existsSync('exclusions.txt') ? 'exclusions.txt' : null);
  if (!exclusionsFile) {
    return null;
  }

  const blacklistedPatterns: string[] = [];
  for (const line of fs.readFileSync(exclusionsFile).toString().split('\n')) {
    const candidate = line.trim();
    if (candidate !== '') {
      blacklistedPatterns.push(candidate);
    }
  }

  const unsafe = blacklistedPatterns.filter(pattern => !safe(pattern));
  if (unsafe.length > 0) {
    throw new Error(`Unsafe expressions detected: ${unsafe} Please revise ${exclusionsFile}`);
  }

  return blacklistedPatterns;
};
|
150
|
+
|
151
|
+
/**
 * Tells whether the text after the last `.` in `url` is one of the given
 * extensions. When `url` contains no dot, the whole string is compared.
 *
 * @param url - URL (or any string) to inspect.
 * @param blacklistedFileExtensions - Extensions without the leading dot.
 */
export const isBlacklistedFileExtensions = (url: string, blacklistedFileExtensions: string[]) => {
  // lastIndexOf yields -1 when there is no dot, so the slice starts at 0.
  const urlExtension = url.slice(url.lastIndexOf('.') + 1);
  return blacklistedFileExtensions.includes(urlExtension);
};
|
155
|
+
|
156
|
+
// NOTE(review): despite its name, this is the `window` of an empty JSDOM instance, not a Document.
const document = new JSDOM('').window;
|
157
|
+
|
158
|
+
// HTTPS agent used for the axios requests in this module. Certificate
// verification is deliberately switched off so scans can run in environments
// with custom certificates; keep-alive is enabled.
const httpsAgent = new https.Agent({ keepAlive: true, rejectUnauthorized: false });
|
163
|
+
|
164
|
+
// Options passed to printMessage() calls: no border, margin of 2 above and below.
export const messageOptions = { border: false, marginTop: 2, marginBottom: 2 };
|
169
|
+
|
170
|
+
// Options for validator.isURL(): http/https only, protocol mandatory, TLD optional.
const urlOptions = { protocols: ['http', 'https'], require_protocol: true, require_tld: false };
|
175
|
+
|
176
|
+
// The module-level `document` constant is the JSDOM window, so the actual
// Document has to be reached through its `.document` property:
// createDocumentFragment() is a Document method and is not available on the
// window object itself.
const queryCheck = (s: string) => document.document.createDocumentFragment().querySelector(s);

/**
 * Checks whether `selector` can be parsed as a CSS selector.
 *
 * The selector is run against an empty document fragment; `querySelector`
 * throws for selectors it cannot parse, which is reported as `false`.
 *
 * @param selector - CSS selector text to validate.
 * @returns `true` when the selector parses, `false` otherwise.
 */
export const isSelectorValid = (selector: string): boolean => {
  try {
    queryCheck(selector);
  } catch {
    return false;
  }
  return true;
};
|
185
|
+
|
186
|
+
// Refer to NPM validator's special characters under sanitizers for escape()
|
187
|
+
// Character set handed to validator.blacklist() by sanitizeUrlInput to strip characters from user-supplied URLs.
const blackListCharacters = '\\<>&\'"';
|
188
|
+
|
189
|
+
/**
 * Attempts to parse `content` as XML with xml2js.
 *
 * The outcome is captured from the `parseString` callback, so this relies on
 * the callback having run by the time `parseString` returns.
 * NOTE(review): confirm xml2js is used in its synchronous (default) mode.
 *
 * @param content - Text to parse.
 * @returns `isValid` is true when parsing produced a result, in which case
 *   `parsedContent` holds that result; a parse error is not surfaced.
 */
export const validateXML = (content: string): { isValid: boolean; parsedContent: string } => {
  let isValid: boolean;
  let parsedContent: string;

  parseString(content, (_err, result) => {
    // Any truthy parse result counts as valid XML.
    isValid = Boolean(result);
    if (isValid) {
      parsedContent = result;
    }
  });

  return { isValid, parsedContent };
};
|
202
|
+
|
203
|
+
/**
 * Tells whether `pageUrl` matches any entry of `whitelistedDomains`.
 *
 * Each entry has its CR/LF characters removed and is then treated either as
 * an exact URL (when it starts with `http` and equals `pageUrl`) or as a
 * regular expression tested against `pageUrl`.
 *
 * Evaluation stops at the first matching entry, so entries after a match are
 * not compiled or tested.
 *
 * @param pageUrl - URL of the page being considered.
 * @param whitelistedDomains - Exact URLs and/or regular-expression sources.
 * @returns `true` when at least one entry matches.
 * @throws SyntaxError when an entry that gets evaluated is not a valid
 *   regular expression.
 */
export const isSkippedUrl = (pageUrl: string, whitelistedDomains: string[]) =>
  whitelistedDomains.some(entry => {
    const pattern = entry.replace(/[\n\r]+/g, '');

    // Exact URL match.
    if (pattern.startsWith('http') && pattern === pageUrl) {
      return true;
    }

    // Otherwise treat the entry as a regular expression (default).
    return new RegExp(pattern).test(pageUrl);
  });
|
219
|
+
|
220
|
+
/**
 * Resolves a local file reference (plain path or `file:///` URL) to a path on
 * disk.
 *
 * For `file:///` input the path portion is extracted first (drive-letter form
 * on Windows, absolute POSIX form elsewhere), dropping any `?query` or
 * `#fragment`. The value is then passed through `convertToFilePath`.
 *
 * @param filePath - File path or `file:///` URL.
 * @returns The converted path when the file exists, otherwise `null`.
 */
export const getFileSitemap = (filePath: string): string | null => {
  let candidate = filePath;

  if (candidate.startsWith('file:///')) {
    const fileUrlPattern =
      os.platform() === 'win32' ? /^file:\/\/\/([A-Z]:\/[^?#]+)/ : /^file:\/\/(\/[^?#]+)/;
    // Leaves `candidate` undefined when the URL does not fit the pattern.
    candidate = candidate.match(fileUrlPattern)?.[1];
  }

  candidate = convertToFilePath(candidate);

  if (!fs.existsSync(candidate)) {
    return null;
  }

  const file = fs.readFileSync(candidate, 'utf8');
  const isLocalFileScan = isSitemapContent(file);

  // NOTE(review): `file` comes from a successful readFileSync, so the
  // `file !== undefined` arm appears to make this condition always true —
  // confirm whether the sitemap check was meant to gate the result.
  return isLocalFileScan || file !== undefined ? candidate : null;
};
|
239
|
+
|
240
|
+
/**
 * Returns the prompt text asking the user for the scan target, depending on
 * the scanner type.
 *
 * @param scanner - Selected scanner type.
 * @returns The prompt string, or `'Invalid option'` for an unrecognised type.
 */
export const getUrlMessage = (scanner: ScannerTypes): string => {
  // Website, custom-flow and intelligent scans all start from a website URL.
  if (
    scanner === ScannerTypes.WEBSITE ||
    scanner === ScannerTypes.CUSTOM ||
    scanner === ScannerTypes.INTELLIGENT
  ) {
    return 'Please enter URL of website: ';
  }

  if (scanner === ScannerTypes.SITEMAP) {
    return 'Please enter URL or file path to sitemap, or drag and drop a sitemap file here: ';
  }

  if (scanner === ScannerTypes.LOCALFILE) {
    return 'Please enter file path: ';
  }

  return 'Invalid option';
};
|
254
|
+
|
255
|
+
/**
 * Accepts an input string when it is non-empty and, after being passed
 * through `validator.escape`, consists of ASCII characters only.
 *
 * @param inputString - Text to check.
 * @returns `true` when the input passes both checks, `false` otherwise.
 */
export const isInputValid = inputString => {
  if (validator.isEmpty(inputString)) {
    return false;
  }

  const escapedInput = validator.escape(inputString);
  return validator.isAscii(escapedInput);
};
|
266
|
+
|
267
|
+
/**
 * Strips the characters in `blackListCharacters` from `url` and reports
 * whether the remainder is a URL according to `validator.isURL` with
 * `urlOptions`.
 *
 * @param url - Raw user-supplied URL.
 * @returns The stripped URL together with its validity flag.
 */
export const sanitizeUrlInput = (url: string): { isValid: boolean; url: string } => {
  const cleanedUrl = validator.blacklist(url, blackListCharacters);
  return { isValid: validator.isURL(cleanedUrl, urlOptions), url: cleanedUrl };
};
|
275
|
+
|
276
|
+
/**
 * Issues a plain HTTP GET (via axios) against `url` and summarises the outcome
 * in a `RES` record.
 *
 * - On a successful response, `status` is the success code, `content` is the
 *   response body and `url` is the final (post-redirect) URL, unless this is a
 *   custom flow without a meta refresh, in which case the input URL is kept.
 *   A `<meta http-equiv="refresh">` target, if found outside `<noscript>`,
 *   replaces `url`.
 * - A 401 response maps to the "unauthorised" status; any other HTTP error
 *   response still maps to success so that error pages can be scanned.
 * - Timeouts / too many redirects, unreachable hosts and other failures map to
 *   their respective `constants.urlCheckStatuses` codes.
 *
 * @param url - URL to request; credentials embedded in it are sent as Basic auth.
 * @param isCustomFlow - Whether the check is made for a custom flow scan.
 * @param extraHTTPHeaders - Additional request headers.
 * @returns The populated `RES` record (never rejects for request failures).
 */
const requestToUrl = async (
  url: string,
  isCustomFlow: boolean,
  extraHTTPHeaders: Record<string, string>,
) => {
  const res = new RES();
  const parsedUrl = new URL(url);

  // Only pass Basic credentials when the URL actually carries them. An `auth`
  // object made of empty strings would still make axios send a Basic header,
  // displacing any Authorization header provided through extraHTTPHeaders.
  const hasUrlCredentials = parsedUrl.username !== '' || parsedUrl.password !== '';
  const requestConfig = {
    headers: {
      ...extraHTTPHeaders,
      // Browser-like User-Agent: some sites answer 403 to non-browser agents.
      'User-Agent': devices['Desktop Chrome HiDPI'].userAgent,
      Host: parsedUrl.host,
    },
    ...(hasUrlCredentials && {
      auth: {
        username: decodeURIComponent(parsedUrl.username),
        password: decodeURIComponent(parsedUrl.password),
      },
    }),
    httpsAgent,
    timeout: 5000,
  };

  try {
    const response = await axios.get(parsedUrl.href, requestConfig);

    const redirectUrl = new URL(response.request.res.responseUrl).href;
    res.status = constants.urlCheckStatuses.success.code;

    // Obtain a textual form of the body for the meta-refresh scan.
    let data;
    if (typeof response.data === 'string' || response.data instanceof String) {
      data = response.data;
    } else if (typeof response.data === 'object' && response.data !== null) {
      try {
        data = JSON.stringify(response.data);
      } catch (error) {
        console.log('Error converting object to JSON:', error);
      }
    } else {
      console.log('Unsupported data type:', typeof response.data);
    }

    // Bodies that could not be turned into text are scanned as empty instead of
    // failing the whole check on `.replace` of undefined.
    const modifiedHTML = (data ?? '').replace(/<noscript>[\s\S]*?<\/noscript>/gi, '');

    const metaRefreshMatch =
      /<meta\s+http-equiv="refresh"\s+content="(?:\d+;)?\s*url=(?:'([^']*)'|"([^"]*)"|([^>]*))"/i.exec(
        modifiedHTML,
      );
    const hasMetaRefresh = metaRefreshMatch !== null;

    res.url = hasMetaRefresh || !isCustomFlow ? redirectUrl : url;

    if (hasMetaRefresh) {
      // The target sits in whichever of the three capture groups matched.
      const urlOrRelativePath = metaRefreshMatch
        .slice(1)
        .find(group => group !== undefined && group !== null);

      if (urlOrRelativePath !== undefined) {
        if (urlOrRelativePath.includes('URL=')) {
          res.url = urlOrRelativePath.split('URL=').pop();
        } else {
          // Resolve against the full page URL so that relative targets stay
          // inside the page's own directory (standard URL resolution).
          res.url = new URL(urlOrRelativePath, res.url).toString();
        }
      }
    }

    res.content = response.data;
  } catch (error) {
    if (error.code === 'ECONNABORTED' || error.code === 'ERR_FR_TOO_MANY_REDIRECTS') {
      res.status = constants.urlCheckStatuses.axiosTimeout.code;
    } else if (error.response) {
      // 401: URL is protected by basic auth. Any other non-2xx status is still
      // treated as success so accessibility scans of error pages can proceed.
      res.status =
        error.response.status === 401
          ? constants.urlCheckStatuses.unauthorised.code
          : constants.urlCheckStatuses.success.code;
      res.url = url;
      res.content = error.response.data;
      return res;
    } else if (error.request) {
      // The request was sent but the URL could not be reached.
      res.status = constants.urlCheckStatuses.cannotBeResolved.code;
    } else {
      res.status = constants.urlCheckStatuses.systemError.code;
    }
    silentLogger.error(error);
  }

  return res;
};
|
374
|
+
|
375
|
+
/**
 * Checks that `url` can be loaded in a Playwright persistent browser context
 * and records the outcome in a `RES` record.
 *
 * - Input that fails `sanitizeUrlInput` yields the "invalid URL" status.
 * - A browser launch failure yields the "browser error" status.
 * - PDF URLs are checked with a plain HTTP request (`requestToUrl`) instead of
 *   navigation, since headless Playwright cannot navigate to PDF documents.
 * - A 401 response yields "unauthorised"; any other completed navigation is
 *   treated as success, including navigations that produce no response object.
 * - For XML responses the content is re-fetched with `requestToUrl` and used
 *   in place of the rendered page content.
 *
 * @param url - URL to check.
 * @param browserToRun - Browser identifier passed to `getPlaywrightLaunchOptions`.
 * @param clonedDataDir - User data directory for the persistent context.
 * @param playwrightDeviceDetailsObject - Device profile; `viewport` and
 *   `userAgent` are applied when present.
 * @param isCustomFlow - When true, `res.url` keeps the input URL rather than
 *   the final page URL.
 * @param extraHTTPHeaders - Extra headers for the context and HTTP requests.
 * @returns The populated `RES` record.
 */
const checkUrlConnectivityWithBrowser = async (
  url,
  browserToRun,
  clonedDataDir,
  playwrightDeviceDetailsObject,
  isCustomFlow,
  extraHTTPHeaders,
) => {
  const res = new RES();

  let viewport = null;
  let userAgent = null;

  if (Object.keys(playwrightDeviceDetailsObject).length > 0) {
    if ('viewport' in playwrightDeviceDetailsObject) {
      viewport = playwrightDeviceDetailsObject.viewport;
    }

    if ('userAgent' in playwrightDeviceDetailsObject) {
      userAgent = playwrightDeviceDetailsObject.userAgent;
    }
  }

  // Connectivity is only checked when the input has URL format.
  const data = sanitizeUrlInput(url);

  if (!data.isValid) {
    // Input is not a URL or does not use the http/https protocols.
    res.status = constants.urlCheckStatuses.invalidUrl.code;
    return res;
  }

  let browserContext;

  try {
    browserContext = await constants.launcher.launchPersistentContext(clonedDataDir, {
      ...getPlaywrightLaunchOptions(browserToRun),
      ...(viewport && { viewport }),
      ...(userAgent && { userAgent }),
      ...(extraHTTPHeaders && { extraHTTPHeaders }),
    });
  } catch (err) {
    printMessage([`Unable to launch browser\n${err}`], messageOptions);
    res.status = constants.urlCheckStatuses.browserError.code;
    return res;
  }

  const page = await browserContext.newPage();

  // page.goto does not throw for valid HTTP status codes (404, 500, ...), and
  // resolves to null for about:blank or same-URL navigations with a new hash.
  try {
    // Playwright headless mode does not support navigation to PDF documents,
    // so those are checked with a plain HTTP request instead.
    if (isUrlPdf(url)) {
      return await requestToUrl(url, false, extraHTTPHeaders);
    }

    const response = await page.goto(url, {
      timeout: 30000,
      ...(proxy && { waitUntil: 'commit' }),
    });

    try {
      await page.waitForLoadState('networkidle', { timeout: 10000 });
    } catch {
      silentLogger.info('Unable to detect networkidle');
    }

    // `response` may be null (see above); such navigations count as success
    // rather than crashing into the system-error path.
    if (response?.status() === 401) {
      res.status = constants.urlCheckStatuses.unauthorised.code;
    } else {
      res.status = constants.urlCheckStatuses.success.code;
    }

    // Keep the entry URL for custom flows; otherwise use the final page URL.
    res.url = isCustomFlow ? url : page.url();

    res.content = await page.content();

    // The content-type header can be absent; only re-fetch when it is XML.
    const contentType = response?.headers()['content-type'];
    if (contentType?.includes('xml')) {
      const responseFromUrl = await requestToUrl(res.url, true, extraHTTPHeaders);

      res.content = responseFromUrl.content;
    }
  } catch (error) {
    silentLogger.error(error);
    res.status = constants.urlCheckStatuses.systemError.code;
  } finally {
    await browserContext.close();
  }

  return res;
};
|
474
|
+
|
475
|
+
/**
 * Decides whether `content` should be treated as a sitemap.
 *
 * Content counts as a sitemap when it
 *  - parses as XML (`validateXML`), or
 *  - contains HTML markers together with `<urlset>`, `<feed>` or `<rss>`
 *    markup (an XML sitemap wrapped in an HTML document), or
 *  - contains no HTML markers but at least one line with an http(s) URL
 *    (a text sitemap whose URLs will be extracted for crawling).
 *
 * @param content - Text to classify.
 * @returns `true` for sitemap-like content, `false` for an ordinary HTML page
 *   or anything else.
 */
export const isSitemapContent = (content: string) => {
  if (validateXML(content).isValid) {
    return true;
  }

  const regexForHtml = new RegExp('<(?:!doctype html|html|head|body)+?>', 'gmi');
  const regexForXmlSitemap = new RegExp('<(?:urlset|feed|rss)+?.*>', 'gmi');
  const regexForUrl = new RegExp('^.*(http|https):/{2}.*$', 'gmi');

  const looksLikeHtml = content.match(regexForHtml) !== null;

  if (looksLikeHtml) {
    // HTML wrapping sitemap markup is a sitemap; plain HTML is a web page.
    return content.match(regexForXmlSitemap) !== null;
  }

  // No HTML markers: a text sitemap if it lists at least one URL.
  return content.match(regexForUrl) !== null;
};
|
496
|
+
|
497
|
+
/**
 * Runs the browser-based connectivity check for `url` and, for sitemap and
 * local-file scans that connected successfully, additionally verifies that
 * the retrieved content looks like a sitemap.
 *
 * @returns The `RES` from the connectivity check; its status is downgraded to
 *   "not a local file" / "not a sitemap" when the content check fails.
 */
export const checkUrl = async (
  scanner,
  url,
  browser,
  clonedDataDir,
  playwrightDeviceDetailsObject,
  isCustomFlow,
  extraHTTPHeaders,
) => {
  const res = await checkUrlConnectivityWithBrowser(
    url,
    browser,
    clonedDataDir,
    playwrightDeviceDetailsObject,
    isCustomFlow,
    extraHTTPHeaders,
  );

  const needsSitemapContent =
    scanner === ScannerTypes.SITEMAP || scanner === ScannerTypes.LOCALFILE;

  // Nothing more to verify for failed checks or other scanner types.
  if (res.status !== constants.urlCheckStatuses.success.code || !needsSitemapContent) {
    return res;
  }

  if (!isSitemapContent(res.content)) {
    res.status =
      scanner === ScannerTypes.LOCALFILE
        ? constants.urlCheckStatuses.notALocalFile.code
        : constants.urlCheckStatuses.notASitemap.code;
  }

  return res;
};
|
529
|
+
|
530
|
+
// True when `obj` has no own enumerable string-keyed properties.
const isEmptyObject = (obj: object): boolean => Object.keys(obj).length === 0;
|
531
|
+
|
532
|
+
/**
 * Parses a header string of the form `"<header> <value>, <header2> <value2>"`
 * into a header map.
 *
 * Entries are separated by `", "`; within an entry the first space separates
 * the header name from its value (the value may itself contain spaces).
 * An entry without a space prints a usage message and terminates the process
 * with exit code 1.
 *
 * @param header - Raw header string; empty/undefined yields an empty map.
 * @returns Map of header name to value.
 */
export const parseHeaders = (header?: string): Record<string, string> => {
  if (!header) return {};

  const allHeaders: Record<string, string> = {};

  for (const headerValue of header.split(', ')) {
    // Split on the first space only: [name, rest-of-entry, ''].
    const headerValuePair = headerValue.split(/ (.*)/s);

    if (headerValuePair.length < 2) {
      printMessage(
        [
          `Invalid value for authorisation request header. Please provide valid keywords in the format: "<header> <value>". For multiple authentication headers, please provide the keywords in the format: "<header> <value>, <header2> <value2>, ..." .`,
        ],
        messageOptions,
      );
      process.exit(1);
    }

    allHeaders[headerValuePair[0]] = headerValuePair[1]; // {"header": "value", "header2": "value2", ...}
  }

  return allHeaders;
};
|
552
|
+
|
553
|
+
/**
 * Converts the collected CLI / prompt answers into the `Data` object that
 * drives a scan.
 *
 * Besides mapping fields one-to-one, this
 *  - builds the result name (`randomToken`) from the current date and time,
 *    the optional custom flow label and the scanned domain or file name,
 *    adding a random three-digit suffix when `OOBEE_VERBOSE` is set;
 *  - resets `constants.robotsTxtUrls` and loads the robots.txt rules for
 *    `url` when `followRobots` is enabled;
 *  - parses the `header` string into `extraHTTPHeaders`.
 *
 * @param argv - Answers gathered from the user.
 * @returns The scan configuration.
 * @throws Error when `argv` has no properties, or when `url` cannot be parsed
 *   for a non-local-file scan.
 */
export const prepareData = async (argv: Answers): Promise<Data> => {
  if (isEmptyObject(argv)) {
    throw Error('No inputs should be provided');
  }

  const {
    scanner,
    headless,
    url,
    deviceChosen,
    customDevice,
    viewportWidth,
    playwrightDeviceDetailsObject,
    maxpages,
    strategy,
    isLocalFileScan,
    finalUrl,
    browserToRun,
    nameEmail,
    customFlowLabel,
    specifiedMaxConcurrency,
    fileTypes,
    blacklistedPatternsFilename,
    additional,
    metadata,
    followRobots,
    header,
    safeMode,
    zip,
    ruleset,
  } = argv;

  // Result name pieces: the 'sv'-locale timestamp with '-' and ':' removed,
  // split at the space into its date and time parts.
  const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
  const domain = isLocalFileScan ? path.basename(url) : new URL(url).hostname;
  const sanitisedLabel = customFlowLabel ? `_${customFlowLabel.replaceAll(' ', '_')}` : '';
  const randomThreeDigitNumber = randomThreeDigitNumberString();

  // The random suffix is only appended in verbose mode.
  const verboseSuffix = process.env.OOBEE_VERBOSE ? `_${randomThreeDigitNumber}` : '';
  const resultFilename = `${date}_${time}${sanitisedLabel}_${domain}${verboseSuffix}`;

  if (followRobots) {
    constants.robotsTxtUrls = {};
    await getUrlsFromRobotsTxt(url, browserToRun);
  }

  const enqueueStrategy =
    strategy === 'same-hostname' ? EnqueueStrategy.SameHostname : EnqueueStrategy.SameDomain;

  return {
    type: scanner,
    url: finalUrl,
    entryUrl: url,
    isHeadless: headless,
    deviceChosen,
    customDevice,
    viewportWidth,
    playwrightDeviceDetailsObject,
    maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl,
    strategy: enqueueStrategy,
    isLocalFileScan,
    browser: browserToRun,
    nameEmail,
    customFlowLabel,
    specifiedMaxConcurrency,
    randomToken: resultFilename,
    fileTypes,
    blacklistedPatternsFilename,
    includeScreenshots: additional !== 'none',
    metadata,
    followRobots,
    extraHTTPHeaders: parseHeaders(header),
    safeMode,
    zip,
    ruleset,
  };
};
|
630
|
+
|
631
|
+
/**
 * Fetches `<origin>/robots.txt` for `url` and caches the Allow / Disallow
 * rules of the `User-agent: *` group in `constants.robotsTxtUrls[origin]`.
 *
 * Does nothing when `constants.robotsTxtUrls` is unset or already has an entry
 * for the origin. When robots.txt cannot be retrieved (or is not text), an
 * empty object is cached for the origin. The file is fetched through
 * Playwright when `proxy` is set, otherwise through axios.
 *
 * Each rule path is turned into a pattern prefixed with the origin: `/*\/`
 * segments become `/**\/`, and directory-like paths get a trailing wildcard.
 * NOTE(review): the patterns look like globs meant for minimatch — confirm
 * against the consumer.
 *
 * @param url - Any URL on the site whose robots.txt should be loaded.
 * @param browserToRun - Browser used for the Playwright fetch.
 */
export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string): Promise<void> => {
  if (!constants.robotsTxtUrls) return;

  const domain = new URL(url).origin;
  if (constants.robotsTxtUrls[domain]) return;
  const robotsUrl = domain.concat('/robots.txt');

  let robotsTxt: string | undefined;
  try {
    robotsTxt = proxy
      ? await getRobotsTxtViaPlaywright(robotsUrl, browserToRun)
      : await getRobotsTxtViaAxios(robotsUrl);
  } catch (e) {
    // Best effort: a missing or unreachable robots.txt means "no rules".
    silentLogger.info(e);
  }

  // Also guards against a non-text body (e.g. a response parsed into an object).
  if (typeof robotsTxt !== 'string' || !robotsTxt) {
    constants.robotsTxtUrls[domain] = {};
    return;
  }

  console.log('Found robots.txt: ', robotsUrl);

  const lines = robotsTxt.split(/\r?\n/);
  let shouldCapture = false;
  const disallowedUrls: string[] = [];
  const allowedUrls: string[] = [];

  // Converts a robots.txt path rule into an origin-prefixed pattern.
  const sanitisePattern = (pattern: string): string => {
    const directoryRegex = /^\/(?:[^?#/]+\/)*[^?#]*$/;
    const subdirWildcardRegex = /\/\*\//g;
    const filePathRegex = /^\/(?:[^\/]+\/)*[^\/]+\.[a-zA-Z0-9]{1,6}$/;

    let result = pattern.replace(subdirWildcardRegex, '/**/');

    if (result.match(directoryRegex) && !result.match(filePathRegex)) {
      if (result.endsWith('*')) {
        result = result.concat('*');
      } else {
        if (!result.endsWith('/')) result = result.concat('/');
        result = result.concat('**');
      }
    }

    return domain.concat(result);
  };

  // Extracts the value after "<directive>:" however much whitespace follows the
  // colon, so that "Disallow:/private" is not truncated.
  const directiveValue = (line: string, directive: string): string =>
    line.substring(`${directive}:`.length).trim();

  for (const line of lines) {
    const lowered = line.toLowerCase();

    if (/^user-agent:\s*\*/.test(lowered)) {
      shouldCapture = true;
    } else if (lowered.startsWith('user-agent:') && shouldCapture) {
      // The wildcard group ends where the next user-agent group begins.
      break;
    } else if (shouldCapture && lowered.startsWith('disallow:')) {
      const disallowed = directiveValue(line, 'disallow');
      if (disallowed) {
        disallowedUrls.push(sanitisePattern(disallowed));
      }
    } else if (shouldCapture && lowered.startsWith('allow:')) {
      const allowed = directiveValue(line, 'allow');
      if (allowed) {
        allowedUrls.push(sanitisePattern(allowed));
      }
    }
  }

  constants.robotsTxtUrls[domain] = { disallowedUrls, allowedUrls };
};
|
702
|
+
|
703
|
+
/**
 * Fetches robots.txt by loading it in a Playwright browser context.
 *
 * @param robotsUrl absolute URL of the robots.txt file
 * @param browser browser identifier forwarded to getPlaywrightLaunchOptions
 * @returns the text content of the loaded page body, or '' when there is none
 */
const getRobotsTxtViaPlaywright = async (robotsUrl: string, browser: string): Promise<string> => {
  const browserContext = await constants.launcher.launchPersistentContext('', {
    ...getPlaywrightLaunchOptions(browser),
  });

  try {
    const page = await browserContext.newPage();
    await page.goto(robotsUrl, { waitUntil: 'networkidle', timeout: 30000 });

    const robotsTxt: string | null = await page.evaluate(() => document.body.textContent);
    return robotsTxt ?? '';
  } finally {
    // Always release the browser, including when navigation times out or throws.
    await browserContext.close();
  }
};
|
714
|
+
|
715
|
+
/**
 * Fetches robots.txt with a plain HTTP(S) request.
 *
 * TLS certificate verification is disabled for this request (rejectUnauthorized: false).
 *
 * @param robotsUrl absolute URL of the robots.txt file
 * @returns the response body when it is a string, otherwise ''
 * @throws whatever the HTTP client throws (e.g. on a non-2xx status or the 2s timeout)
 */
const getRobotsTxtViaAxios = async (robotsUrl: string): Promise<string> => {
  const instance = axios.create({
    httpsAgent: new https.Agent({
      rejectUnauthorized: false,
      keepAlive: true,
    }),
  });

  const { data } = await instance.get(robotsUrl, { timeout: 2000 });

  // The client may hand back a parsed object (e.g. a JSON body); callers split the result
  // as text, so anything that is not a string is treated as "no robots.txt".
  return typeof data === 'string' ? data : '';
};
|
726
|
+
|
727
|
+
/**
 * Tells whether a URL is blocked by the cached robots.txt rules of its origin.
 *
 * A URL is blocked when it matches at least one disallow pattern and no allow pattern.
 * Origins without cached rules, or cached as an empty object (no robots.txt found),
 * never block anything.
 *
 * @param url absolute URL to check
 * @returns true when the URL is disallowed, false otherwise
 * @throws TypeError when rules are cached and `url` cannot be parsed by `new URL`
 */
export const isDisallowedInRobotsTxt = (url: string): boolean => {
  if (!constants.robotsTxtUrls) return false;

  const domain = new URL(url).origin;
  const rules = constants.robotsTxtUrls[domain];
  if (!rules) return false;

  // The cache entry is `{}` when the origin has no robots.txt, so both lists may be absent.
  const { disallowedUrls = [], allowedUrls = [] } = rules;
  const matchesUrl = (pattern: string): boolean => minimatch(url, pattern);

  return disallowedUrls.some(matchesUrl) && !allowedUrls.some(matchesUrl);
};
|
750
|
+
|
751
|
+
/**
 * Collects the links listed in a sitemap and returns them as request objects.
 *
 * Handles XML sitemaps, XML sitemap indexes (followed recursively), RSS and Atom feeds,
 * and falls back to plain URL extraction for anything else. The sitemap may be a remote
 * URL or a local file.
 *
 * @param sitemapUrl URL or file path of the sitemap to read
 * @param maxLinksCount maximum number of links taken from each individual sitemap; also
 *   stops the traversal of a sitemap index once that many links have been collected
 * @param browser browser identifier forwarded to getPlaywrightLaunchOptions
 * @param userDataDirectory browser profile directory for Playwright ('' when null/undefined)
 * @param userUrlInput URL entered by the user, used to rank links when isIntelligent is set
 * @param isIntelligent when true, links closest to userUrlInput (then most recent) come first
 * @param username basic-auth user name added to every collected link ('' for none)
 * @param password basic-auth password added to every collected link ('' for none)
 * @returns the collected requests, de-duplicated by URL
 */
export const getLinksFromSitemap = async (
  sitemapUrl: string,
  maxLinksCount: number,
  browser: string,
  userDataDirectory: string,
  userUrlInput: string,
  isIntelligent: boolean,
  username: string,
  password: string,
) => {
  const scannedSitemaps = new Set<string>();
  const urls = {}; // dictionary of requests to urls to be scanned

  const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;

  const addBasicAuthCredentials = (url, username, password) => {
    const urlObject = new URL(url);
    urlObject.username = username;
    urlObject.password = password;
    return urlObject.toString();
  };

  // Registers one link, unless it is empty, blocked by robots.txt or cannot form a request.
  const addToUrlList = url => {
    if (!url) return;
    if (isDisallowedInRobotsTxt(url)) return;

    // add basic auth credentials to the URL
    if (username !== '' && password !== '') {
      url = addBasicAuthCredentials(url, username, password);
    }

    url = convertPathToLocalFile(url);

    let request;
    try {
      request = new Request({ url });
    } catch (e) {
      console.log('Error creating request', e);
      // Without a request object there is nothing to schedule for this link.
      return;
    }
    if (isUrlPdf(url)) {
      request.skipNavigation = true;
    }
    urls[url] = request;
  };

  // Ranks a link against the user's input: 2 = same address, 1 = underneath it, 0 = unrelated.
  const calculateCloseness = sitemapUrl => {
    // Remove 'http://', 'https://', and 'www.' prefixes from the URLs
    const normalizedSitemapUrl = sitemapUrl.replace(/^(https?:\/\/)?(www\.)?/, '');
    const normalizedUserUrlInput = userUrlInput
      .replace(/^(https?:\/\/)?(www\.)?/, '')
      .replace(/\/$/, ''); // Remove trailing slash also

    if (normalizedSitemapUrl === normalizedUserUrlInput) {
      return 2;
    }
    if (normalizedSitemapUrl.startsWith(normalizedUserUrlInput)) {
      return 1;
    }
    return 0;
  };

  // Reads every entry of an XML / RSS / Atom document and registers up to maxLinksCount links.
  const processXmlSitemap = async ($, sitemapType, linkSelector, dateSelector, sectionSelector) => {
    const urlList = [];
    // Iterate through each URL element in the sitemap, collect url and modified date
    $(sectionSelector).each((index, urlElement) => {
      const linkElement = $(urlElement).find(linkSelector);
      const url =
        sitemapType === constants.xmlSitemapTypes.atom
          ? linkElement.prop('href')
          : linkElement.text();
      // Entries without a link cannot be scanned and would break the closeness ranking below.
      if (!url) return;

      const lastModified = $(urlElement).find(dateSelector).text();
      const lastModifiedDate = lastModified ? new Date(lastModified) : null;

      urlList.push({ url, lastModifiedDate });
    });

    if (isIntelligent) {
      // Missing or unparsable dates rank as the oldest possible entry.
      const timeOf = entry => {
        const time = entry.lastModifiedDate ? entry.lastModifiedDate.getTime() : 0;
        return Number.isNaN(time) ? 0 : time;
      };

      // Sort by closeness to userUrlInput in descending order
      urlList.sort((a, b) => {
        const closenessDifference = calculateCloseness(b.url) - calculateCloseness(a.url);
        if (closenessDifference !== 0) {
          return closenessDifference;
        }
        // If closeness is the same, sort by last modified date in descending order
        return timeOf(b) - timeOf(a);
      });
    }

    // Add the sorted URLs to the main URL list
    for (const { url } of urlList.slice(0, maxLinksCount)) {
      addToUrlList(url);
    }
  };

  // Fallback for non-XML or unrecognised documents: take every line that is an http(s) URL.
  const processNonStandardSitemap = data => {
    const urlsFromData = crawlee
      .extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') })
      .slice(0, maxLinksCount);
    urlsFromData.forEach(url => {
      addToUrlList(url);
    });
  };

  const finalUserDataDirectory =
    userDataDirectory === null || userDataDirectory === undefined ? '' : userDataDirectory;

  // Loads one sitemap (remote or local), detects its format and registers its links.
  const fetchUrls = async (url: string) => {
    let data;
    let sitemapType;

    // Credentials embedded in the sitemap URL itself, used for the HTTP request below.
    let sitemapUsername = '';
    let sitemapPassword = '';

    if (scannedSitemaps.has(url)) {
      // Skip processing if the sitemap has already been scanned
      return;
    }

    scannedSitemaps.add(url);

    // Convert file if its not local file path
    url = convertLocalFileToPath(url);

    // Check whether its a file path or a URL
    if (isFilePath(url)) {
      if (!fs.existsSync(url)) {
        return;
      }
    } else if (isValidHttpUrl(url)) {
      const parsedUrl = new URL(url);

      if (parsedUrl.username !== '' && parsedUrl.password !== '') {
        sitemapUsername = decodeURIComponent(parsedUrl.username);
        sitemapPassword = decodeURIComponent(parsedUrl.password);
      }
    } else {
      printMessage([`Invalid Url/Filepath: ${url}`], messageOptions);
      return;
    }

    const getDataUsingPlaywright = async () => {
      const browserContext = await constants.launcher.launchPersistentContext(
        finalUserDataDirectory,
        {
          ...getPlaywrightLaunchOptions(browser),
          // Not necessary to parse http_credentials as I am parsing it directly in URL
        },
      );

      try {
        const page = await browserContext.newPage();
        await page.goto(url, { waitUntil: 'networkidle', timeout: 60000 });

        if (constants.launcher === webkit) {
          data = await page.locator('body').innerText();
        } else {
          // Take the markup of the first known sitemap/feed root element present on the page.
          for (const rootTag of ['urlset', 'sitemapindex', 'rss', 'feed']) {
            const rootLocator = page.locator(rootTag);
            if ((await rootLocator.count()) > 0) {
              data = await rootLocator.evaluate(elem => elem.outerHTML);
              break;
            }
          }
        }
      } finally {
        // Release the browser even when navigation times out or throws.
        await browserContext.close();
      }
    };

    if (validator.isURL(url, urlOptions)) {
      if (isUrlPdf(url)) {
        addToUrlList(url);
        return;
      }
      if (proxy) {
        await getDataUsingPlaywright();
      } else {
        try {
          const instance = axios.create({
            httpsAgent: new https.Agent({
              rejectUnauthorized: false,
              keepAlive: true,
            }),
            auth: {
              username: sitemapUsername,
              password: sitemapPassword,
            },
          });
          data = (await instance.get(url, { timeout: 80000 })).data;
        } catch {
          // Any request failure (including a timeout) skips this sitemap.
          return;
        }
      }
    } else {
      url = convertLocalFileToPath(url);
      data = fs.readFileSync(url, 'utf8');
    }

    if (data === undefined || data === null) {
      // Nothing was retrieved (e.g. the page had no recognisable sitemap root element).
      silentLogger.info('Sitemap returned no readable content; skipping.');
      return;
    }

    const $ = cheerio.load(data, { xml: true });

    // This case is when the document is not an XML format document
    if ($(':root').length === 0) {
      processNonStandardSitemap(data);
      return;
    }

    // Root element
    const root = $(':root')[0];

    // The namespace attribute is optional in practice; treat a missing one as "no namespace".
    const xmlns = root.attribs.xmlns || '';

    const xmlFormatNamespace = '/schemas/sitemap';
    if (root.name === 'urlset' && xmlns.includes(xmlFormatNamespace)) {
      sitemapType = constants.xmlSitemapTypes.xml;
    } else if (root.name === 'sitemapindex' && xmlns.includes(xmlFormatNamespace)) {
      sitemapType = constants.xmlSitemapTypes.xmlIndex;
    } else if (root.name === 'rss') {
      sitemapType = constants.xmlSitemapTypes.rss;
    } else if (root.name === 'feed') {
      sitemapType = constants.xmlSitemapTypes.atom;
    } else {
      sitemapType = constants.xmlSitemapTypes.unknown;
    }

    switch (sitemapType) {
      case constants.xmlSitemapTypes.xmlIndex:
        silentLogger.info(`This is a XML format sitemap index.`);
        for (const childSitemapUrl of $('loc')) {
          const childSitemapUrlText = $(childSitemapUrl).text();
          if (isLimitReached()) {
            break;
          }
          if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) {
            await fetchUrls(childSitemapUrlText); // Recursive call for nested sitemaps
          } else {
            addToUrlList(childSitemapUrlText); // Add regular URLs to the list
          }
        }
        break;
      case constants.xmlSitemapTypes.xml:
        silentLogger.info(`This is a XML format sitemap.`);
        await processXmlSitemap($, sitemapType, 'loc', 'lastmod', 'url');
        break;
      case constants.xmlSitemapTypes.rss:
        silentLogger.info(`This is a RSS format sitemap.`);
        await processXmlSitemap($, sitemapType, 'link', 'pubDate', 'item');
        break;
      case constants.xmlSitemapTypes.atom:
        silentLogger.info(`This is a Atom format sitemap.`);
        await processXmlSitemap($, sitemapType, 'link', 'published', 'entry');
        break;
      default:
        silentLogger.info(`This is an unrecognised XML sitemap format.`);
        processNonStandardSitemap(data);
    }
  };

  try {
    await fetchUrls(sitemapUrl);
  } catch (e) {
    silentLogger.error(e);
  }

  const requestList = Object.values(urls);

  return requestList;
};
|
1039
|
+
|
1040
|
+
/**
 * Loose e-mail shape check: at least one character, an "@", at least one character,
 * a ".", and at least one more character, all on a single line.
 */
export const validEmail = email => /^.+@.+\..+$/u.test(email);
|
1045
|
+
|
1046
|
+
// For new user flow.
|
1047
|
+
/**
 * Validates a user-supplied name.
 *
 * A name is accepted when it is 2 to 32000 characters long, consists only of letters and
 * digits from any language, whitespace and the punctuation allowed by the first pattern,
 * and contains none of the characters listed in the injection pattern.
 */
export const validName = name => {
  const isLengthAcceptable = name.length >= 2 && name.length <= 32000;
  if (!isLengthAcceptable) {
    return false;
  }

  // Printable characters from any language plus a small set of punctuation.
  const allowedCharacters = /^[\p{L}\p{N}\s'".,()\[\]{}!?:؛،؟…]+$/u;
  // Characters commonly used in injection payloads; any occurrence rejects the name.
  const injectionCharacters = /[<>'"\\/;|&!$*{}()\[\]\r\n\t]/;

  return allowedCharacters.test(name) && !injectionCharacters.test(name);
};
|
1071
|
+
|
1072
|
+
/**
 * Picks the browser to scan with and prepares its cloned data directory.
 *
 * Fallback order per preference:
 *  - Chrome: Chrome, then webkit on macOS / Edge on Windows / Chromium elsewhere.
 *  - Edge: Edge, then Chrome, then webkit on macOS / Chromium elsewhere.
 * On Windows the process exits with the browserError status code when neither Chrome nor
 * Edge can be used. Any other preference resolves to Chromium.
 *
 * @param preferredBrowser string of user's preferred browser; when empty, Chrome is assumed
 *   on Windows and macOS
 * @param isCli boolean flag to indicate if function is called from cli (enables fallback notices)
 * @returns object consisting of browser to run and cloned data directory
 */
export const getBrowserToRun = (
  preferredBrowser: BrowserTypes,
  isCli = false,
): { browserToRun: BrowserTypes; clonedBrowserDataDir: string } => {
  const platform = os.platform();
  const isMac = platform === 'darwin';
  const isWindows = platform === 'win32';

  // Fallback notices are only shown to CLI users.
  const notify = (message: string): void => {
    if (isCli) printMessage([message], messageOptions);
  };

  // Switches the launcher to webkit; no browser name or cloned profile applies in that case.
  const useWebkit = (): { browserToRun: BrowserTypes; clonedBrowserDataDir: string } => {
    constants.launcher = webkit;
    return { browserToRun: null, clonedBrowserDataDir: '' };
  };

  // Prioritise Chrome on Windows and Mac platforms if user does not specify a browser
  if (!preferredBrowser && (isWindows || isMac)) {
    preferredBrowser = BrowserTypes.CHROME;
  }

  printMessage([`Preferred browser ${preferredBrowser}`], messageOptions);

  switch (preferredBrowser) {
    case BrowserTypes.CHROME: {
      const chromeData = getChromeData();
      if (chromeData) return chromeData;

      if (isMac) {
        // mac user who specified -b chrome but does not have chrome
        notify('Unable to use Chrome, falling back to webkit...');
        return useWebkit();
      }

      if (isWindows) {
        notify('Unable to use Chrome, falling back to Edge browser...');

        const edgeData = getEdgeData();
        if (edgeData) return edgeData;

        notify('Unable to use both Chrome and Edge. Please try again.');
        process.exit(constants.urlCheckStatuses.browserError.code);
      }

      notify('Unable to use Chrome, falling back to Chromium browser...');
      break;
    }

    case BrowserTypes.EDGE: {
      const edgeData = getEdgeData();
      if (edgeData) return edgeData;

      notify('Unable to use Edge, falling back to Chrome browser...');
      const chromeData = getChromeData();
      if (chromeData) return chromeData;

      if (isMac) {
        // mac user who specified -b edge but does not have edge or chrome
        notify('Unable to use both Edge and Chrome, falling back to webkit...');
        return useWebkit();
      }

      if (isWindows) {
        notify('Unable to use both Edge and Chrome. Please try again.');
        process.exit(constants.urlCheckStatuses.browserError.code);
      } else {
        // linux and other OS
        notify('Unable to use both Edge and Chrome, falling back to Chromium browser...');
      }
      break;
    }

    default:
      break;
  }

  // defaults to chromium
  return {
    browserToRun: BrowserTypes.CHROMIUM,
    clonedBrowserDataDir: cloneChromiumProfiles(),
  };
};
|
1157
|
+
|
1158
|
+
/**
 * Clones the browser profile a second time into a directory suffixed with a random token,
 * so that parallel browser sessions each get their own copy.
 * This also mitigates a known bug where cookies are overridden after each browser
 * session, i.e. the user is logged out after checkingUrl and the same cookie cannot be
 * reused for the scan.
 *
 * @param browser browser identifier; anything other than Chrome or Edge is treated as Chromium
 * @param randomToken token appended to the cloned directory name
 * @returns the value returned by the matching clone function
 */
export const getClonedProfilesWithRandomToken = (browser: string, randomToken: string): string => {
  switch (browser) {
    case BrowserTypes.CHROME:
      return cloneChromeProfiles(randomToken);
    case BrowserTypes.EDGE:
      return cloneEdgeProfiles(randomToken);
    default:
      return cloneChromiumProfiles(randomToken);
  }
};
|
1173
|
+
|
1174
|
+
/**
 * Resolves the Chrome data directory and clones its profiles.
 *
 * @returns the Chrome browser type with the cloned data directory, or null when either the
 *   data directory or the clone is unavailable
 */
export const getChromeData = () => {
  const browserDataDir = getDefaultChromeDataDir();
  const clonedBrowserDataDir = cloneChromeProfiles();

  if (!browserDataDir || !clonedBrowserDataDir) {
    return null;
  }

  return { browserToRun: BrowserTypes.CHROME, clonedBrowserDataDir };
};
|
1183
|
+
|
1184
|
+
/**
 * Resolves the Edge data directory and clones its profiles.
 *
 * @returns the Edge browser type with the cloned data directory, or null when either the
 *   data directory or the clone is unavailable
 */
export const getEdgeData = () => {
  const browserDataDir = getDefaultEdgeDataDir();
  const clonedBrowserDataDir = cloneEdgeProfiles();
  if (browserDataDir && clonedBrowserDataDir) {
    const browserToRun = BrowserTypes.EDGE;
    return { browserToRun, clonedBrowserDataDir };
  }
  // Explicit "not available" result, matching getChromeData.
  return null;
};
|
1192
|
+
|
1193
|
+
/**
 * Clone the Chrome profile cookie files to the destination directory
 *
 * Only Windows and macOS layouts are searched; on any other platform no cookie files are
 * found and the function reports failure.
 *
 * @param {*} options glob options object
 * @param {*} destDir destination directory
 * @returns boolean indicating whether the operation was successful
 */
const cloneChromeProfileCookieFiles = (options, destDir) => {
  const isWindows = os.platform() === 'win32';
  const isMac = os.platform() === 'darwin';

  // Never copy from previously cloned directories: the default "oobee" clone and the
  // token-suffixed "oobee-<token>" clones created for parallel sessions.
  const ignore = ['oobee/**', 'oobee-*/**'];

  // Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
  // and ../Chrome/<profile name>/Cookies for mac
  let profileCookiesDir: string[] = [];
  let profileNamesRegex: RegExp | undefined;
  if (isWindows) {
    profileCookiesDir = globSync('**/Network/Cookies', { ...options, ignore });
    profileNamesRegex = /User Data\\(.*?)\\Network/;
  } else if (isMac) {
    profileCookiesDir = globSync('**/Cookies', { ...options, ignore });
    profileNamesRegex = /Chrome\/(.*?)\/Cookies/;
  }

  if (profileCookiesDir.length === 0) {
    silentLogger.warn('Unable to find Chrome profile cookies file in the system.');
    printMessage(['Unable to find Chrome profile cookies file in the system.'], messageOptions);
    return false;
  }

  let success = true;
  for (const dir of profileCookiesDir) {
    // Paths that do not follow the expected profile layout are skipped.
    const profileName = dir.match(profileNamesRegex)?.[1];
    if (!profileName) continue;

    const destProfileDir = isWindows
      ? path.join(destDir, profileName, 'Network')
      : path.join(destDir, profileName);

    // Recursive true to create all parent directories (e.g. PbProfile/Default/Cookies)
    if (!fs.existsSync(destProfileDir)) {
      fs.mkdirSync(destProfileDir, { recursive: true });
    }

    // Prevents duplicate cookies file if the cookies already exist
    const destCookiesFile = path.join(destProfileDir, 'Cookies');
    if (fs.existsSync(destCookiesFile)) continue;

    try {
      fs.copyFileSync(dir, destCookiesFile);
    } catch (err) {
      silentLogger.error(err);
      if (err.code === 'EBUSY') {
        console.log(
          `Unable to copy the file for ${profileName} because it is currently in use.`,
        );
        console.log(
          'Please close any applications that might be using this file and try again.',
        );
      } else {
        console.log(
          `An unexpected error occurred for ${profileName} while copying the file: ${err.message}`,
        );
      }
      // One failed profile marks the whole operation as failed; remaining profiles are still tried.
      success = false;
    }
  }
  return success;
};
|
1267
|
+
|
1268
|
+
/**
 * Clone the Edge profile cookie files to the destination directory
 *
 * Only Windows and macOS layouts are searched; on any other platform no cookie files are
 * found and the function reports failure.
 *
 * @param {*} options glob options object
 * @param {*} destDir destination directory
 * @returns boolean indicating whether the operation was successful
 */
const cloneEdgeProfileCookieFiles = (options, destDir) => {
  const isWindows = os.platform() === 'win32';
  const isMac = os.platform() === 'darwin';

  // Never copy from previously cloned directories: the default "oobee" clone and the
  // token-suffixed "oobee-<token>" clones created for parallel sessions.
  const ignore = ['oobee/**', 'oobee-*/**'];

  // Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
  // and ../Microsoft Edge/<profile name>/Cookies for mac
  let profileCookiesDir: string[] = [];
  let profileNamesRegex: RegExp | undefined;
  if (isWindows) {
    profileCookiesDir = globSync('**/Network/Cookies', { ...options, ignore });
    profileNamesRegex = /User Data\\(.*?)\\Network/;
  } else if (isMac) {
    profileCookiesDir = globSync('**/Cookies', { ...options, ignore });
    profileNamesRegex = /Microsoft Edge\/(.*?)\/Cookies/;
  }

  if (profileCookiesDir.length === 0) {
    silentLogger.warn('Unable to find Edge profile cookies file in the system.');
    printMessage(['Unable to find Edge profile cookies file in the system.'], messageOptions);
    return false;
  }

  let success = true;
  for (const dir of profileCookiesDir) {
    // Paths that do not follow the expected profile layout are skipped.
    const profileName = dir.match(profileNamesRegex)?.[1];
    if (!profileName) continue;

    const destProfileDir = isWindows
      ? path.join(destDir, profileName, 'Network')
      : path.join(destDir, profileName);

    // Recursive true to create all parent directories (e.g. PbProfile/Default/Cookies)
    if (!fs.existsSync(destProfileDir)) {
      fs.mkdirSync(destProfileDir, { recursive: true });
    }

    // Prevents duplicate cookies file if the cookies already exist
    const destCookiesFile = path.join(destProfileDir, 'Cookies');
    if (fs.existsSync(destCookiesFile)) continue;

    try {
      fs.copyFileSync(dir, destCookiesFile);
    } catch (err) {
      silentLogger.error(err);
      if (err.code === 'EBUSY') {
        console.log(
          `Unable to copy the file for ${profileName} because it is currently in use.`,
        );
        console.log(
          'Please close any applications that might be using this file and try again.',
        );
      } else {
        console.log(`An unexpected error occurred while copying the file: ${err.message}`);
      }
      // One failed profile marks the whole operation as failed; remaining profiles are still tried.
      success = false;
    }
  }
  return success;
};
|
1340
|
+
|
1341
|
+
/**
 * Copies the browser's "Local State" file into the destination directory.
 * Both Edge and Chrome Local State files are located in the .../User Data directory
 *
 * @param {*} options - glob options object
 * @param {string} destDir - destination directory
 * @returns boolean indicating whether the operation was successful
 */
const cloneLocalStateFile = (options, destDir) => {
  const profileNamesRegex = /([^/\\]+)[/\\]Local State$/;

  // The glob also matches files whose name merely ends with "Local State"; keep only files
  // named exactly "Local State" and remember the directory each one was found in.
  const localState = globSync('**/*Local State', {
    ...options,
    maxDepth: 1,
  })
    .map(file => ({ file, profileName: file.match(profileNamesRegex)?.[1] }))
    .filter(entry => Boolean(entry.profileName));

  if (localState.length === 0) {
    silentLogger.warn('Unable to find local state file in the system.');
    printMessage(['Unable to find local state file in the system.'], messageOptions);
    return false;
  }

  let success = true;
  for (const { file, profileName } of localState) {
    try {
      fs.copyFileSync(file, path.join(destDir, 'Local State'));
    } catch (err) {
      silentLogger.error(err);
      if (err.code === 'EBUSY') {
        console.log(`Unable to copy the file because it is currently in use.`);
        console.log('Please close any applications that might be using this file and try again.');
      } else {
        console.log(
          `An unexpected error occurred for ${profileName} while copying the file: ${err.message}`,
        );
      }
      printMessage([err], messageOptions);
      success = false;
    }
  }
  return success;
};
|
1381
|
+
|
1382
|
+
/**
 * Clones the Chrome Local State and per-profile cookie files into an "oobee" directory
 * (or "oobee-<randomToken>") inside the Chrome data directory, i.e.
 * .../User Data on Windows and .../Chrome on Mac.
 *
 * An already existing destination is cleaned up through deleteClonedChromeProfiles first.
 *
 * @param {string} randomToken - random token to append to the cloned directory
 * @returns {string} cloned data directory, null if any of the sub files failed to copy,
 *   undefined when no Chrome data directory is available
 */
export const cloneChromeProfiles = (randomToken?: string): string => {
  const baseDir = getDefaultChromeDataDir();
  if (!baseDir) {
    return;
  }

  const destDir = path.join(baseDir, randomToken ? `oobee-${randomToken}` : 'oobee');

  if (fs.existsSync(destDir)) {
    // With OOBEE_VERBOSE set the token is forwarded to the clean-up; otherwise it is
    // called without a token.
    deleteClonedChromeProfiles(process.env.OOBEE_VERBOSE ? randomToken : undefined);
  }

  if (!fs.existsSync(destDir)) {
    fs.mkdirSync(destDir, { recursive: true });
  }

  const baseOptions = { cwd: baseDir, recursive: true, absolute: true, nodir: true };

  // Both copies are always attempted, Local State first.
  const localStateCloned = cloneLocalStateFile(baseOptions, destDir);
  const cookiesCloned = cloneChromeProfileCookieFiles(baseOptions, destDir);

  return cookiesCloned && localStateCloned ? destDir : null;
};
|
1430
|
+
|
1431
|
+
/**
 * Ensures an "oobee" directory (or "oobee-<randomToken>") exists inside the Chromium data
 * directory. No profile files are copied.
 *
 * @param randomToken optional token appended to the directory name
 * @returns the directory path, or undefined when no Chromium data directory is available
 */
export const cloneChromiumProfiles = (randomToken?: string): string => {
  const baseDir = getDefaultChromiumDataDir();
  if (!baseDir) {
    return;
  }

  const folderName = randomToken ? `oobee-${randomToken}` : 'oobee';
  const destDir: string = path.join(baseDir, folderName);

  const alreadyExists = fs.existsSync(destDir);
  if (!alreadyExists) {
    fs.mkdirSync(destDir, { recursive: true });
  }

  return destDir;
};
|
1452
|
+
|
1453
|
+
/**
 * Clones the Edge Local State and per-profile cookie files into an "oobee" directory
 * (or "oobee-<randomToken>") inside the Edge data directory, i.e.
 * .../User Data on Windows and .../Microsoft Edge on Mac.
 *
 * An already existing destination is cleaned up through deleteClonedEdgeProfiles first.
 *
 * @param {string} randomToken - random token to append to the cloned directory
 * @returns {string} cloned data directory, null if any of the sub files failed to copy,
 *   undefined when no Edge data directory is available
 */
export const cloneEdgeProfiles = (randomToken?: string): string => {
  const baseDir = getDefaultEdgeDataDir();
  if (!baseDir) {
    return;
  }

  const destDir = path.join(baseDir, randomToken ? `oobee-${randomToken}` : 'oobee');

  if (fs.existsSync(destDir)) {
    // With OOBEE_VERBOSE set the token is forwarded to the clean-up; otherwise it is
    // called without a token.
    deleteClonedEdgeProfiles(process.env.OOBEE_VERBOSE ? randomToken : undefined);
  }

  if (!fs.existsSync(destDir)) {
    fs.mkdirSync(destDir, { recursive: true });
  }

  const baseOptions = { cwd: baseDir, recursive: true, absolute: true, nodir: true };

  // Both copies are always attempted, Local State first.
  const localStateCloned = cloneLocalStateFile(baseOptions, destDir);
  const cookiesCloned = cloneEdgeProfileCookieFiles(baseOptions, destDir);

  return cookiesCloned && localStateCloned ? destDir : null;
};
|
1502
|
+
|
1503
|
+
/**
 * Removes the cloned profile directories of the given browser.
 * Browsers other than Chrome, Edge and Chromium are ignored.
 *
 * @param browser browser identifier
 * @param randomToken optional token forwarded to the browser-specific clean-up
 */
export const deleteClonedProfiles = (browser: string, randomToken?: string): void => {
  switch (browser) {
    case BrowserTypes.CHROME:
      deleteClonedChromeProfiles(randomToken);
      break;
    case BrowserTypes.EDGE:
      deleteClonedEdgeProfiles(randomToken);
      break;
    case BrowserTypes.CHROMIUM:
      deleteClonedChromiumProfiles(randomToken);
      break;
    default:
      break;
  }
};
|
1512
|
+
|
1513
|
+
/**
 * Deletes cloned oobee directories in the Chrome data directory.
 *
 * With a token only "oobee-<randomToken>" is targeted; without one, every entry whose name
 * starts with "oobee" anywhere under the data directory is targeted. Deletion failures are
 * logged and do not stop the remaining deletions.
 *
 * @param randomToken optional token identifying a single cloned directory
 */
export const deleteClonedChromeProfiles = (randomToken?: string): void => {
  const baseDir = getDefaultChromeDataDir();
  if (!baseDir) {
    return;
  }

  // Find all the oobee directories in the Chrome data directory when no token is given
  const clonedDirs: string[] = randomToken
    ? [`${baseDir}/oobee-${randomToken}`]
    : globSync('**/oobee*', {
        cwd: baseDir,
        absolute: true,
      });

  if (clonedDirs.length === 0) {
    silentLogger.warn('Unable to find oobee directory in the Chrome data directory.');
    console.warn('Unable to find oobee directory in the Chrome data directory.');
    return;
  }

  for (const dir of clonedDirs) {
    if (!fs.existsSync(dir)) continue;

    try {
      fs.rmSync(dir, { recursive: true });
    } catch (err) {
      silentLogger.error(
        `CHROME Unable to delete ${dir} folder in the Chrome data directory. ${err}`,
      );
    }
  }
};
|
1552
|
+
|
1553
|
+
/**
 * Deletes cloned oobee directories from the Edge data directory.
 *
 * Does nothing when the OOBEE_VERBOSE environment variable is set, so cloned
 * profiles are left on disk in that mode. Deletion failures are logged and do
 * not stop the remaining deletions.
 *
 * @param randomToken when provided, only the "oobee-<randomToken>" folder is
 * targeted; otherwise every entry matching the recursive "oobee*" glob is targeted
 */
export const deleteClonedEdgeProfiles = (randomToken?: string): void => {
  if (process.env.OOBEE_VERBOSE) {
    return;
  }

  const baseDir = getDefaultEdgeDataDir();
  if (!baseDir) {
    console.warn(`Unable to find Edge data directory in the system.`);
    return;
  }

  // Either the single token-specific folder, or every oobee entry in the Edge data directory.
  const destDir: string[] = randomToken
    ? [`${baseDir}/oobee-${randomToken}`]
    : globSync('**/oobee*', {
        cwd: baseDir,
        absolute: true,
      });

  destDir.forEach(dir => {
    // An earlier iteration may already have removed a parent of this entry.
    if (!fs.existsSync(dir)) {
      return;
    }
    try {
      fs.rmSync(dir, { recursive: true });
    } catch (err) {
      // The message previously named the Chrome data directory by mistake.
      silentLogger.error(`EDGE Unable to delete ${dir} folder in the Edge data directory. ${err}`);
    }
  });
};
|
1592
|
+
|
1593
|
+
/**
 * Deletes cloned oobee directories from the Chromium data directory.
 *
 * Returns silently when no Chromium data directory is resolved. Deletion
 * failures are logged and do not stop the remaining deletions.
 *
 * @param randomToken when provided, only the "oobee-<randomToken>" folder is
 * targeted; otherwise every entry matching the recursive "oobee*" glob is targeted
 */
export const deleteClonedChromiumProfiles = (randomToken?: string): void => {
  const chromiumDataDir = getDefaultChromiumDataDir();
  if (!chromiumDataDir) {
    return;
  }

  let candidates: string[];
  if (!randomToken) {
    // Find all the oobee directories in the Chromium data directory
    candidates = globSync('**/oobee*', { cwd: chromiumDataDir, absolute: true });
  } else {
    candidates = [`${chromiumDataDir}/oobee-${randomToken}`];
  }

  if (candidates.length === 0) {
    silentLogger.warn('Unable to find oobee directory in Chromium support directory');
    console.warn('Unable to find oobee directory in Chromium support directory');
    return;
  }

  for (const candidate of candidates) {
    // An earlier iteration may already have removed a parent of this entry.
    if (fs.existsSync(candidate)) {
      try {
        fs.rmSync(candidate, { recursive: true });
      } catch (err) {
        silentLogger.error(
          `CHROMIUM Unable to delete ${candidate} folder in the Chromium data directory. ${err}`,
        );
      }
    }
  }
};
|
1628
|
+
|
1629
|
+
/**
 * Builds the device-details object for the chosen screen.
 *
 * Precedence: the iPhone 11 descriptor (for 'Mobile' or the 'iPhone 11' custom
 * device), then the Galaxy S9+ descriptor, then a custom-width viewport with a
 * fixed height of 720, then a lookup of the custom device name in `devices`
 * with underscores replaced by spaces. An empty object is returned when
 * nothing applies.
 *
 * @param deviceChosen preset device choice
 * @param customDevice custom device name
 * @param viewportWidth custom viewport width; a falsy value means "not set"
 * @returns the selected device-details object
 */
export const getPlaywrightDeviceDetailsObject = (
  deviceChosen: string,
  customDevice: string,
  viewportWidth: number,
) => {
  const wantsIphone11 = deviceChosen === 'Mobile' || customDevice === 'iPhone 11';
  const wantsGalaxyS9Plus = customDevice === 'Samsung Galaxy S9+';

  let deviceDetails = {};
  if (wantsIphone11) {
    deviceDetails = devices['iPhone 11'];
  } else if (wantsGalaxyS9Plus) {
    deviceDetails = devices['Galaxy S9+'];
  } else if (viewportWidth) {
    deviceDetails = { viewport: { width: viewportWidth, height: 720 } };
  } else if (customDevice) {
    // Custom device names use underscores where the `devices` keys use spaces.
    const deviceKey = customDevice.replace(/_/g, ' ');
    deviceDetails = devices[deviceKey];
  }
  return deviceDetails;
};
|
1648
|
+
|
1649
|
+
/**
 * Produces the label describing which screen is being scanned.
 *
 * The first non-empty value wins, in this order: the chosen device, the custom
 * device, a "CustomWidth_<n>px" label built from the viewport width, and
 * finally 'Desktop'.
 *
 * @param deviceChosen preset device choice
 * @param customDevice custom device name
 * @param viewportWidth custom viewport width; a falsy value means "not set"
 * @returns the screen label
 */
export const getScreenToScan = (
  deviceChosen: string,
  customDevice: string,
  viewportWidth: number,
): string => {
  const widthLabel = viewportWidth ? `CustomWidth_${viewportWidth}px` : 'Desktop';
  return deviceChosen || customDevice || widthLabel;
};
|
1665
|
+
|
1666
|
+
/**
 * Submits the form by opening `finalUrl` in a persistent browser context.
 *
 * When `proxy` is set and the browser is Edge or Chrome, the browser profile
 * is cloned first and the clone is used as the user data directory; otherwise
 * `userDataDirectory` is used. Navigation problems are logged and swallowed.
 * Failures to launch the context, open a page or close the context still
 * propagate to the caller.
 *
 * The browser context is always closed once it has been launched, and cloned
 * profiles are always cleaned up (unless OOBEE_VERBOSE is set), even when
 * launching, opening the page or closing the context throws.
 *
 * @param browserToRun browser identifier, compared against the BrowserTypes values
 * @param userDataDirectory user data directory used when no clone is made
 * @param finalUrl fully built form submission URL to open
 */
export const submitFormViaPlaywright = async (
  browserToRun: string,
  userDataDirectory: string,
  finalUrl: string,
) => {
  const dirName = `clone-${Date.now()}`;
  let clonedDir = null;
  if (proxy && browserToRun === BrowserTypes.EDGE) {
    clonedDir = cloneEdgeProfiles(dirName);
  } else if (proxy && browserToRun === BrowserTypes.CHROME) {
    clonedDir = cloneChromeProfiles(dirName);
  }

  try {
    const browserContext = await constants.launcher.launchPersistentContext(
      clonedDir || userDataDirectory,
      {
        ...getPlaywrightLaunchOptions(browserToRun),
      },
    );

    try {
      // Inside the try/finally so a failure here cannot leak the browser context.
      const page = await browserContext.newPage();

      try {
        await page.goto(finalUrl, {
          timeout: 30000,
          ...(proxy && { waitUntil: 'commit' }),
        });

        try {
          await page.waitForLoadState('networkidle', { timeout: 10000 });
        } catch {
          silentLogger.info('Unable to detect networkidle');
        }
      } catch (error) {
        // Submission is best effort: navigation errors are logged, not rethrown.
        silentLogger.error(error);
      }
    } finally {
      await browserContext.close();
    }
  } finally {
    // Runs after the context is closed. Called without a token, so every
    // cloned oobee folder for that browser is requested for removal.
    if (proxy && !process.env.OOBEE_VERBOSE) {
      if (browserToRun === BrowserTypes.EDGE) {
        deleteClonedEdgeProfiles();
      } else if (browserToRun === BrowserTypes.CHROME) {
        deleteClonedChromeProfiles();
      }
    }
  }
};
|
1713
|
+
|
1714
|
+
/**
 * Builds the form submission URL from the scan details and sends it.
 *
 * Every free-text value is percent-encoded before it is placed in the query
 * string, so characters such as `&`, `#`, `+` or `=` inside a URL, name or
 * email address cannot split or truncate the submitted fields.
 *
 * With `proxy` set the URL is opened through Playwright. Otherwise it is sent
 * with an axios GET (2 second timeout); if that request fails with
 * ECONNABORTED and a browser is specified (or the launcher is webkit), the
 * submission is retried through Playwright. Other request errors are ignored.
 *
 * @param browserToRun browser identifier used for the Playwright submission
 * @param userDataDirectory user data directory passed to the Playwright submission
 * @param scannedUrl URL that was actually scanned; sent as the redirect URL when it differs from entryUrl
 * @param entryUrl URL the scan was started with
 * @param scanType scan type label
 * @param email email address to submit
 * @param name name to submit
 * @param scanResultsJson scan results as a JSON string
 * @param numberOfPagesScanned number of pages scanned
 * @param numberOfRedirectsScanned number of redirects scanned
 * @param numberOfPagesNotScanned number of pages not scanned
 * @param metadata metadata string to submit
 */
export const submitForm = async (
  browserToRun: string,
  userDataDirectory: string,
  scannedUrl: string,
  entryUrl: string,
  scanType: string,
  email: string,
  name: string,
  scanResultsJson: string,
  numberOfPagesScanned: number,
  numberOfRedirectsScanned: number,
  numberOfPagesNotScanned: number,
  metadata: string,
) => {
  const additionalPageDataJson = JSON.stringify({
    redirectsScanned: numberOfRedirectsScanned,
    pagesNotScanned: numberOfPagesNotScanned,
  });

  let finalUrl =
    `${formDataFields.formUrl}?` +
    `${formDataFields.entryUrlField}=${encodeURIComponent(entryUrl)}&` +
    `${formDataFields.scanTypeField}=${encodeURIComponent(scanType)}&` +
    `${formDataFields.emailField}=${encodeURIComponent(email)}&` +
    `${formDataFields.nameField}=${encodeURIComponent(name)}&` +
    `${formDataFields.resultsField}=${encodeURIComponent(scanResultsJson)}&` +
    `${formDataFields.numberOfPagesScannedField}=${numberOfPagesScanned}&` +
    `${formDataFields.additionalPageDataField}=${encodeURIComponent(additionalPageDataJson)}&` +
    `${formDataFields.metadataField}=${encodeURIComponent(metadata)}`;

  // Only report the redirect target when the scan ended up on a different URL.
  if (scannedUrl !== entryUrl) {
    finalUrl += `&${formDataFields.redirectUrlField}=${encodeURIComponent(scannedUrl)}`;
  }

  if (proxy) {
    await submitFormViaPlaywright(browserToRun, userDataDirectory, finalUrl);
  } else {
    try {
      await axios.get(finalUrl, { timeout: 2000 });
    } catch (error) {
      // Fall back to a real browser only when the request was aborted (e.g. timed out).
      if (error.code === 'ECONNABORTED') {
        if (browserToRun || constants.launcher === webkit) {
          await submitFormViaPlaywright(browserToRun, userDataDirectory, finalUrl);
        }
      }
    }
  }
};
|
1762
|
+
/**
 * Builds the Playwright launch options for the requested browser.
 *
 * The browser name, when given, is used as the Playwright channel; leaving the
 * channel out is equivalent to "chromium". With `proxy` set the browser runs
 * headed with a 1000 ms slow-motion delay. Without a proxy, Edge on Windows
 * also runs headed.
 *
 * @param browser browser name ("chrome" or "edge", null for chromium, the default Playwright browser)
 * @returns playwright launch options object. For more details: https://playwright.dev/docs/api/class-browsertype#browser-type-launch
 */
export const getPlaywrightLaunchOptions = (browser?: string): LaunchOptions => {
  const launchOptions: LaunchOptions = {
    // Drop the --use-mock-keychain flag to allow MacOS devices
    // to use the cloned cookies.
    ignoreDefaultArgs: ['--use-mock-keychain'],
    args: constants.launchOptionsArgs,
    ...(browser ? { channel: browser } : {}),
  };

  if (proxy) {
    launchOptions.headless = false;
    // To ensure server-side rendered proxy page is loaded
    launchOptions.slowMo = 1000;
    return launchOptions;
  }

  const isEdgeOnWindows = browser === BrowserTypes.EDGE && os.platform() === 'win32';
  if (isEdgeOnWindows) {
    // edge should be in non-headless mode
    launchOptions.headless = false;
  }
  return launchOptions;
};
|
1787
|
+
|
1788
|
+
/**
 * Strips the username and password from a URL.
 * @param url absolute URL string; an unparsable value makes the URL constructor throw
 * @returns the serialized URL without credentials
 */
export const urlWithoutAuth = (url: string): string => {
  const sanitized = new URL(url);
  sanitized.password = '';
  sanitized.username = '';
  return sanitized.href;
};
|
1794
|
+
|
1795
|
+
/**
 * Waits until the page reaches the 'load' state or the 'networkidle' state, or
 * until `timeout` milliseconds have passed, whichever happens first.
 *
 * The fallback timer is cleared as soon as the race settles, so it does not
 * keep the event loop alive after the page has already loaded.
 *
 * @param page object exposing `waitForLoadState(state)`, presumably a Playwright Page
 * @param timeout maximum time to wait in milliseconds (default 10000)
 * @returns a promise that settles with the outcome of the first wait to finish
 */
export const waitForPageLoaded = async (page, timeout = 10000) => {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const timeoutElapsed = new Promise(resolve => {
    timer = setTimeout(resolve, timeout);
  });

  try {
    return await Promise.race([
      page.waitForLoadState('load'),
      page.waitForLoadState('networkidle'),
      timeoutElapsed,
    ]);
  } finally {
    // Without this the timer stays pending for the full timeout.
    clearTimeout(timer);
  }
};
|
1802
|
+
|
1803
|
+
/**
 * Checks that a string starts with a lowercase "http://" or "https://" and is
 * followed by at least one character, none of which is a space or a double quote.
 * @param urlString value to check
 * @returns true when the value matches that shape
 */
function isValidHttpUrl(urlString) {
  return /^https?:\/\/[^ "]+$/.test(urlString);
}
|
1807
|
+
|
1808
|
+
/**
 * Heuristically decides whether a string refers to a local file rather than a web URL.
 *
 * A value counts as a file path when it starts with "file://" or "/", starts
 * with a drive letter followed by a colon (e.g. "C:"), or contains a backslash.
 *
 * @param url value to inspect
 * @returns true when the value looks like a file path or file URL
 */
export const isFilePath = (url: string): boolean => {
  if (url.startsWith('file://') || url.startsWith('/')) {
    return true;
  }
  const startsWithDriveLetter = /^[A-Z]:/i.test(url);
  return startsWithDriveLetter || url.includes('\\');
};
|
1818
|
+
|
1819
|
+
/**
 * Converts a "file://" URL into a file system path.
 * @param url file URL or any other string
 * @returns the path for a "file://" URL; any other value is returned unchanged
 */
export function convertLocalFileToPath(url: string): string {
  return url.startsWith('file://') ? fileURLToPath(url) : url;
}
|
1825
|
+
|
1826
|
+
/**
 * Converts a path that starts with "/" into a "file://" URL string.
 * @param filePath path or any other string
 * @returns the file URL for a path starting with "/"; any other value is returned unchanged
 */
export function convertPathToLocalFile(filePath: string): string {
  const startsAtRoot = filePath.startsWith('/');
  return startsAtRoot ? pathToFileURL(filePath).toString() : filePath;
}
|
1832
|
+
|
1833
|
+
/**
 * Extracts the percent-decoded path component from a file URL.
 *
 * The URL is parsed with the legacy `url.parse` API and its `path` field is
 * decoded. NOTE(review): per the Node.js docs `path` also carries the query
 * string when one is present — confirm callers never pass one.
 *
 * @param fileUrl file URL to convert
 * @returns the decoded path, without the 'file://' prefix
 */
export function convertToFilePath(fileUrl: string) {
  const { path: encodedPath } = url.parse(fileUrl);
  return decodeURIComponent(encodedPath);
}
|