@govtechsg/oobee 0.10.76 → 0.10.78-alpha1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/publish.yml +8 -1
- package/INTEGRATION.md +50 -3
- package/dist/cli.js +252 -0
- package/dist/combine.js +221 -0
- package/dist/constants/cliFunctions.js +306 -0
- package/dist/constants/common.js +1669 -0
- package/dist/constants/constants.js +913 -0
- package/dist/constants/errorMeta.json +319 -0
- package/dist/constants/itemTypeDescription.js +7 -0
- package/dist/constants/oobeeAi.js +121 -0
- package/dist/constants/questions.js +151 -0
- package/dist/constants/sampleData.js +176 -0
- package/dist/crawlers/commonCrawlerFunc.js +428 -0
- package/dist/crawlers/crawlDomain.js +613 -0
- package/dist/crawlers/crawlIntelligentSitemap.js +135 -0
- package/dist/crawlers/crawlLocalFile.js +151 -0
- package/dist/crawlers/crawlSitemap.js +303 -0
- package/dist/crawlers/custom/escapeCssSelector.js +10 -0
- package/dist/crawlers/custom/evaluateAltText.js +11 -0
- package/dist/crawlers/custom/extractAndGradeText.js +44 -0
- package/dist/crawlers/custom/extractText.js +27 -0
- package/dist/crawlers/custom/findElementByCssSelector.js +36 -0
- package/dist/crawlers/custom/flagUnlabelledClickableElements.js +963 -0
- package/dist/crawlers/custom/framesCheck.js +37 -0
- package/dist/crawlers/custom/getAxeConfiguration.js +111 -0
- package/dist/crawlers/custom/gradeReadability.js +23 -0
- package/dist/crawlers/custom/utils.js +1024 -0
- package/dist/crawlers/custom/xPathToCss.js +147 -0
- package/dist/crawlers/guards/urlGuard.js +71 -0
- package/dist/crawlers/pdfScanFunc.js +276 -0
- package/dist/crawlers/runCustom.js +89 -0
- package/dist/exclusions.txt +7 -0
- package/dist/generateHtmlReport.js +144 -0
- package/dist/index.js +62 -0
- package/dist/logs.js +84 -0
- package/dist/mergeAxeResults.js +1588 -0
- package/dist/npmIndex.js +640 -0
- package/dist/proxyService.js +360 -0
- package/dist/runGenerateJustHtmlReport.js +16 -0
- package/dist/screenshotFunc/htmlScreenshotFunc.js +355 -0
- package/dist/screenshotFunc/pdfScreenshotFunc.js +645 -0
- package/dist/services/s3Uploader.js +127 -0
- package/dist/static/ejs/partials/components/allIssues/AllIssues.ejs +9 -0
- package/dist/static/ejs/partials/components/allIssues/CategoryBadges.ejs +82 -0
- package/dist/static/ejs/partials/components/allIssues/FilterBar.ejs +33 -0
- package/dist/static/ejs/partials/components/allIssues/IssuesTable.ejs +41 -0
- package/dist/static/ejs/partials/components/header/SiteInfo.ejs +119 -0
- package/dist/static/ejs/partials/components/header/aboutScanModal/AboutScanModal.ejs +15 -0
- package/dist/static/ejs/partials/components/header/aboutScanModal/ScanConfiguration.ejs +44 -0
- package/dist/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs +142 -0
- package/dist/static/ejs/partials/components/prioritiseIssues/IssueDetailCard.ejs +36 -0
- package/dist/static/ejs/partials/components/prioritiseIssues/PrioritiseIssues.ejs +47 -0
- package/dist/static/ejs/partials/components/ruleModal/ruleOffcanvas.ejs +196 -0
- package/dist/static/ejs/partials/components/scannedPagesSegmentedTabs.ejs +48 -0
- package/dist/static/ejs/partials/components/screenshotLightbox.ejs +13 -0
- package/dist/static/ejs/partials/components/shared/InfoAlert.ejs +3 -0
- package/dist/static/ejs/partials/components/summaryScanAbout.ejs +141 -0
- package/dist/static/ejs/partials/components/summaryScanResults.ejs +16 -0
- package/dist/static/ejs/partials/components/summaryTable.ejs +20 -0
- package/dist/static/ejs/partials/components/summaryWcagCompliance.ejs +94 -0
- package/dist/static/ejs/partials/components/topTen.ejs +6 -0
- package/dist/static/ejs/partials/components/wcagCompliance/FailedCriteria.ejs +47 -0
- package/dist/static/ejs/partials/components/wcagCompliance/WcagCompliance.ejs +16 -0
- package/dist/static/ejs/partials/components/wcagCompliance/WcagGaugeBar.ejs +16 -0
- package/dist/static/ejs/partials/components/wcagCoverageDetails.ejs +18 -0
- package/dist/static/ejs/partials/footer.ejs +24 -0
- package/dist/static/ejs/partials/header.ejs +14 -0
- package/dist/static/ejs/partials/main.ejs +29 -0
- package/dist/static/ejs/partials/scripts/allIssues/AllIssues.ejs +376 -0
- package/dist/static/ejs/partials/scripts/bootstrap.ejs +8 -0
- package/dist/static/ejs/partials/scripts/categorySummary.ejs +141 -0
- package/dist/static/ejs/partials/scripts/decodeUnzipParse.ejs +3 -0
- package/dist/static/ejs/partials/scripts/header/SiteInfo.ejs +44 -0
- package/dist/static/ejs/partials/scripts/header/aboutScanModal/AboutScanModal.ejs +51 -0
- package/dist/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +127 -0
- package/dist/static/ejs/partials/scripts/header/aboutScanModal/ScanDetails.ejs +60 -0
- package/dist/static/ejs/partials/scripts/highlightjs.ejs +335 -0
- package/dist/static/ejs/partials/scripts/popper.ejs +7 -0
- package/dist/static/ejs/partials/scripts/prioritiseIssues/IssueDetailCard.ejs +137 -0
- package/dist/static/ejs/partials/scripts/prioritiseIssues/PrioritiseIssues.ejs +214 -0
- package/dist/static/ejs/partials/scripts/prioritiseIssues/wcagSvgMap.ejs +861 -0
- package/dist/static/ejs/partials/scripts/ruleModal/constants.ejs +957 -0
- package/dist/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +353 -0
- package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +468 -0
- package/dist/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +306 -0
- package/dist/static/ejs/partials/scripts/ruleModal/utilities.ejs +483 -0
- package/dist/static/ejs/partials/scripts/scannedPagesSegmentedTabs.ejs +35 -0
- package/dist/static/ejs/partials/scripts/screenshotLightbox.ejs +75 -0
- package/dist/static/ejs/partials/scripts/summaryScanResults.ejs +14 -0
- package/dist/static/ejs/partials/scripts/summaryTable.ejs +78 -0
- package/dist/static/ejs/partials/scripts/topTen.ejs +61 -0
- package/dist/static/ejs/partials/scripts/utils.ejs +453 -0
- package/dist/static/ejs/partials/scripts/wcagCompliance/FailedCriteria.ejs +103 -0
- package/dist/static/ejs/partials/scripts/wcagCompliance/WcagGaugeBar.ejs +47 -0
- package/dist/static/ejs/partials/scripts/wcagCompliance.ejs +15 -0
- package/dist/static/ejs/partials/scripts/wcagCoverageDetails.ejs +75 -0
- package/dist/static/ejs/partials/styles/allIssues/AllIssues.ejs +384 -0
- package/dist/static/ejs/partials/styles/bootstrap.ejs +12391 -0
- package/dist/static/ejs/partials/styles/header/SiteInfo.ejs +121 -0
- package/dist/static/ejs/partials/styles/header/aboutScanModal/AboutScanModal.ejs +82 -0
- package/dist/static/ejs/partials/styles/header/aboutScanModal/ScanConfiguration.ejs +50 -0
- package/dist/static/ejs/partials/styles/header/aboutScanModal/ScanDetails.ejs +149 -0
- package/dist/static/ejs/partials/styles/header.ejs +7 -0
- package/dist/static/ejs/partials/styles/highlightjs.ejs +54 -0
- package/dist/static/ejs/partials/styles/prioritiseIssues/IssueDetailCard.ejs +141 -0
- package/dist/static/ejs/partials/styles/prioritiseIssues/PrioritiseIssues.ejs +204 -0
- package/dist/static/ejs/partials/styles/ruleModal/ruleOffcanvas.ejs +456 -0
- package/dist/static/ejs/partials/styles/scannedPagesSegmentedTabs.ejs +46 -0
- package/dist/static/ejs/partials/styles/shared/InfoAlert.ejs +12 -0
- package/dist/static/ejs/partials/styles/styles.ejs +1607 -0
- package/dist/static/ejs/partials/styles/summaryBootstrap.ejs +12458 -0
- package/dist/static/ejs/partials/styles/topTenCard.ejs +44 -0
- package/dist/static/ejs/partials/styles/wcagCompliance/FailedCriteria.ejs +59 -0
- package/dist/static/ejs/partials/styles/wcagCompliance/WcagGaugeBar.ejs +62 -0
- package/dist/static/ejs/partials/styles/wcagCompliance.ejs +36 -0
- package/dist/static/ejs/partials/styles/wcagCoverageDetails.ejs +33 -0
- package/dist/static/ejs/partials/summaryHeader.ejs +70 -0
- package/dist/static/ejs/partials/summaryMain.ejs +49 -0
- package/dist/static/ejs/report.ejs +226 -0
- package/dist/static/ejs/summary.ejs +47 -0
- package/dist/types/types.js +1 -0
- package/dist/utils.js +1070 -0
- package/examples/oobee-cypress-integration-js/cypress/support/e2e.js +36 -6
- package/examples/oobee-cypress-integration-js/cypress.config.js +45 -1
- package/examples/oobee-cypress-integration-ts/cypress.config.ts +47 -1
- package/examples/oobee-cypress-integration-ts/src/cypress/support/e2e.ts +36 -6
- package/examples/oobee-playwright-integration-js/oobee-playwright-demo.js +2 -1
- package/examples/oobee-playwright-integration-ts/src/oobee-playwright-demo.ts +2 -1
- package/examples/oobee-scan-html-demo.js +51 -0
- package/examples/oobee-scan-page-demo.js +40 -0
- package/package.json +9 -3
- package/src/constants/common.ts +2 -2
- package/src/constants/constants.ts +3 -1
- package/src/crawlers/crawlDomain.ts +1 -0
- package/src/crawlers/runCustom.ts +0 -1
- package/src/mergeAxeResults.ts +43 -22
- package/src/npmIndex.ts +500 -131
|
@@ -0,0 +1,1669 @@
|
|
|
1
|
+
/* eslint-disable consistent-return */
|
|
2
|
+
/* eslint-disable no-console */
|
|
3
|
+
/* eslint-disable camelcase */
|
|
4
|
+
/* eslint-disable no-use-before-define */
|
|
5
|
+
import validator from 'validator';
|
|
6
|
+
import axios from 'axios';
|
|
7
|
+
import { JSDOM } from 'jsdom';
|
|
8
|
+
import * as cheerio from 'cheerio';
|
|
9
|
+
import crawlee, { EnqueueStrategy, Request } from 'crawlee';
|
|
10
|
+
import { parseString } from 'xml2js';
|
|
11
|
+
import fs from 'fs';
|
|
12
|
+
import path from 'path';
|
|
13
|
+
import url, { fileURLToPath, pathToFileURL } from 'url';
|
|
14
|
+
import safe from 'safe-regex';
|
|
15
|
+
import * as https from 'https';
|
|
16
|
+
import os from 'os';
|
|
17
|
+
import mime from 'mime';
|
|
18
|
+
import { minimatch } from 'minimatch';
|
|
19
|
+
import { globSync } from 'glob';
|
|
20
|
+
import { devices, webkit } from 'playwright';
|
|
21
|
+
import printMessage from 'print-message';
|
|
22
|
+
import constants, { getDefaultChromeDataDir, getDefaultEdgeDataDir, getDefaultChromiumDataDir,
|
|
23
|
+
// Legacy code start - Google Sheets submission
|
|
24
|
+
formDataFields,
|
|
25
|
+
// Legacy code end - Google Sheets submission
|
|
26
|
+
ScannerTypes, BrowserTypes, FileTypes, getEnumKey, } from './constants.js';
|
|
27
|
+
import { consoleLogger } from '../logs.js';
|
|
28
|
+
import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
|
|
29
|
+
import { cleanUpAndExit, randomThreeDigitNumberString, register } from '../utils.js';
|
|
30
|
+
import { getProxyInfo, proxyInfoToResolution } from '../proxyService.js';
|
|
31
|
+
/**
 * Checks that `dirPath` refers to an existing, accessible directory.
 *
 * @param {*} dirPath - Candidate directory path supplied by the caller.
 * @returns {string|null} A human-readable error message, or `null` when the
 *   path is a string that points at a directory.
 */
export const validateDirPath = (dirPath) => {
  if (typeof dirPath !== 'string') {
    return 'Please provide string value of directory path.';
  }

  let pointsAtDirectory;
  try {
    // accessSync/statSync both throw when the path is missing or unreachable.
    fs.accessSync(dirPath);
    pointsAtDirectory = fs.statSync(dirPath).isDirectory();
  } catch {
    return 'Please ensure path provided exists.';
  }

  return pointsAtDirectory ? null : 'Please provide a directory path.';
};
|
|
48
|
+
/**
 * Plain result holder. Any own enumerable properties of the optional `res`
 * argument are copied onto the new instance; a falsy argument leaves the
 * instance empty.
 */
export class RES {
  constructor(res) {
    if (!res) {
      return;
    }
    Object.assign(this, res);
  }
}
|
|
55
|
+
/**
 * Validates a user-supplied custom flow label.
 *
 * Checks are reported in this order: forbidden characters, length over 80
 * characters, then a reserved keyword immediately followed by a dot
 * (compared case-insensitively).
 *
 * @param {string} customFlowLabel - Label to validate.
 * @returns {{isValid: boolean, errorMessage?: string}} `isValid: true` when the
 *   label passes; otherwise `isValid: false` with the reason in `errorMessage`.
 */
export const validateCustomFlowLabel = (customFlowLabel) => {
  // Renders a list as "a , b , c" for display in error messages.
  const spacedList = (items) => items.toString().replaceAll(',', ' , ');

  const hasReservedKeywordWithDot = constants.reserveFileNameKeywords.some((keyword) =>
    customFlowLabel.toLowerCase().includes(`${keyword.toLowerCase()}.`),
  );
  const hasForbiddenCharacter = constants.forbiddenCharactersInDirPath.some((character) =>
    customFlowLabel.includes(character),
  );
  const isTooLong = customFlowLabel.length > 80;

  if (hasForbiddenCharacter) {
    return {
      isValid: false,
      errorMessage: `Invalid label. Cannot contain ${spacedList(constants.forbiddenCharactersInDirPath)}`,
    };
  }

  if (isTooLong) {
    return { isValid: false, errorMessage: `Invalid label. Cannot exceed 80 characters.` };
  }

  if (hasReservedKeywordWithDot) {
    return {
      isValid: false,
      errorMessage: `Invalid label. Cannot have '.' appended to ${spacedList(constants.reserveFileNameKeywords)} as they are reserved keywords.`,
    };
  }

  return { isValid: true };
};
|
|
82
|
+
/**
 * Resolves `filePath` to an absolute path and validates that it points at an
 * existing `.txt` file.
 *
 * Only the filesystem probes are wrapped in the try/catch. Previously the
 * "not a file" and "wrong extension" errors were thrown inside the same `try`
 * and were then swallowed by the bare `catch`, so callers always received the
 * generic "exists and writable" message.
 *
 * @param {string} filePath - Absolute path, or a path relative to `cliDir`.
 * @param {string} cliDir - Base directory used to resolve a relative `filePath`.
 * @returns {string} The absolute path of the validated file.
 * @throws {Error} When `filePath` is not a string, the path cannot be accessed,
 *   the path is not a regular file, or its extension is not `.txt`.
 */
export const validateFilePath = (filePath, cliDir) => {
  if (typeof filePath !== 'string') {
    throw new Error('Please provide string value of file path.');
  }

  const absolutePath = path.isAbsolute(filePath) ? filePath : path.resolve(cliDir, filePath);

  let stats;
  try {
    fs.accessSync(absolutePath);
    stats = fs.statSync(absolutePath);
  } catch (error) {
    throw new Error(`Please ensure path provided exists and writable: ${absolutePath}`, {
      cause: error,
    });
  }

  if (!stats.isFile()) {
    throw new Error('Please provide a file path.');
  }
  if (path.extname(absolutePath) !== '.txt') {
    throw new Error('Please provide a file with txt extension.');
  }

  return absolutePath;
};
|
|
103
|
+
/**
 * Loads URL exclusion patterns from a text file, one pattern per line.
 *
 * The file is `blacklistedPatternsFilename` when given; otherwise
 * `exclusions.txt` in the current working directory is used if it exists.
 * Lines are trimmed and blank lines are dropped. Every pattern is passed
 * through `safe` (safe-regex) and the whole load is rejected if any fails.
 *
 * @param {string} [blacklistedPatternsFilename] - Path of the exclusions file.
 * @returns {string[]|null} The patterns, or `null` when no exclusions file applies.
 * @throws {Error} When at least one pattern is reported unsafe by `safe`.
 */
export const getBlackListedPatterns = (blacklistedPatternsFilename) => {
  const exclusionsFile =
    blacklistedPatternsFilename || (fs.existsSync('exclusions.txt') ? 'exclusions.txt' : null);
  if (!exclusionsFile) {
    return null;
  }

  const blacklistedPatterns = [];
  for (const line of fs.readFileSync(exclusionsFile).toString().split('\n')) {
    const pattern = line.trim();
    if (pattern !== '') {
      blacklistedPatterns.push(pattern);
    }
  }

  const unsafe = blacklistedPatterns.filter((pattern) => !safe(pattern));
  if (unsafe.length > 0) {
    throw new Error(`Unsafe expressions detected: ${unsafe} Please revise ${exclusionsFile}`);
  }

  return blacklistedPatterns;
};
|
|
126
|
+
/**
 * Reports whether the file extension of `url` is in `blacklistedFileExtensions`.
 *
 * The extension is taken from the last path segment only, so a query string or
 * fragment (`file.zip?dl=1`, `file.zip#top`) no longer hides it, and URLs
 * without an extension never match on host or path residue.
 *
 * @param {string} url - Absolute URL, or a relative path / bare file name.
 * @param {string[]} blacklistedFileExtensions - Extensions without the leading
 *   dot; matching is case-sensitive.
 * @returns {boolean} `true` when the extension is blacklisted.
 */
export const isBlacklistedFileExtensions = (url, blacklistedFileExtensions) => {
  let pathname;
  try {
    pathname = new URL(url).pathname;
  } catch {
    // Not an absolute URL: strip the query string / fragment by hand.
    [pathname] = url.split(/[?#]/, 1);
  }

  const fileName = pathname.slice(pathname.lastIndexOf('/') + 1);
  const dotIndex = fileName.lastIndexOf('.');
  if (dotIndex === -1) {
    return false;
  }

  return blacklistedFileExtensions.includes(fileName.slice(dotIndex + 1));
};
|
|
130
|
+
// NOTE(review): despite its name, this constant holds the JSDOM *window*
// object (`new JSDOM('').window`), not `window.document`.
const document = new JSDOM('').window;
// HTTPS agent with keep-alive enabled and certificate verification turned off
// (`rejectUnauthorized: false`).
const httpsAgent = new https.Agent({
    // Run in environments with custom certificates
    rejectUnauthorized: false,
    keepAlive: true,
});
// Options object passed as the second argument to printMessage in this file.
export const messageOptions = {
    border: false,
    marginTop: 2,
    marginBottom: 2,
};
// Options passed to validator.isURL by sanitizeUrlInput.
const urlOptions = {
    // http and https for normal scans, file for local file scan
    protocols: ['http', 'https', 'file'],
    require_protocol: true,
    require_tld: false,
    require_host: false,
    // being explicit; fragments/queries are fine for local files
    allow_fragments: true,
    allow_query_components: true,
};
|
|
151
|
+
/**
 * Runs `s` through `querySelector` on a detached document fragment, which
 * throws when the selector is syntactically invalid.
 *
 * The module-level `document` constant is created from `new JSDOM('').window`,
 * i.e. it is a window object. `createDocumentFragment` is a method of the
 * window's `document`, so resolve that first; fall back to the constant itself
 * in case it already is a document.
 */
const queryCheck = (s) => {
  const dom = document.document ?? document;
  return dom.createDocumentFragment().querySelector(s);
};

/**
 * Reports whether `selector` is a syntactically valid CSS selector.
 *
 * @param {string} selector - Selector text to check.
 * @returns {boolean} `true` when the selector parses, `false` when evaluating
 *   it throws.
 */
export const isSelectorValid = (selector) => {
  try {
    queryCheck(selector);
    return true;
  } catch {
    return false;
  }
};
|
|
161
|
+
// Characters that sanitizeUrlInput strips from a URL via validator.blacklist.
// Intentionally empty: sanitisation is switched off for now because the logic
// for URL validation / local file scan changed. Only populate this once there
// are specific characters to validate against.
const blackListCharacters = '';
|
|
164
|
+
/**
 * Attempts to parse `content` as XML with xml2js `parseString`.
 *
 * The result is collected inside the `parseString` callback, so both fields
 * stay `undefined` if the callback has not run by the time this returns.
 *
 * @param {string} content - Text to parse.
 * @returns {{isValid: (boolean|undefined), parsedContent: *}} `isValid` is
 *   `true` when the parser produced a result; `parsedContent` is that result.
 */
export const validateXML = (content) => {
  let isValid;
  let parsedContent;

  parseString(content, (_err, result) => {
    isValid = Boolean(result);
    if (isValid) {
      parsedContent = result;
    }
  });

  return { isValid, parsedContent };
};
|
|
178
|
+
/**
 * Reports whether `pageUrl` matches any entry of `whitelistedDomains`.
 *
 * Each entry has CR/LF characters removed, then matches either as an exact URL
 * (entries starting with `http`) or as a regular expression tested against
 * `pageUrl`. Every entry is evaluated, so an entry that is not a valid regular
 * expression throws.
 *
 * @param {string} pageUrl - URL being considered.
 * @param {string[]} whitelistedDomains - Exact URLs and/or regex sources.
 * @returns {boolean} `true` when at least one entry matches.
 */
export const isSkippedUrl = (pageUrl, whitelistedDomains) => {
  let matchCount = 0;

  for (const entry of whitelistedDomains) {
    const pattern = entry.replace(/[\n\r]+/g, '');
    const isExactUrl = pattern.startsWith('http') && pattern === pageUrl;
    // Anything that is not an exact URL hit is treated as a regex (default).
    if (isExactUrl || new RegExp(pattern).test(pageUrl)) {
      matchCount += 1;
    }
  }

  return matchCount > 0;
};
|
|
190
|
+
/**
 * Resolves a local path or `file:///` URL to a filesystem path that exists.
 *
 * For `file:///` URLs the query string and fragment are dropped. A URL that
 * does not match the expected shape now yields `null` instead of passing
 * `undefined` on to `convertToFilePath`, and Windows drive letters are accepted
 * in either case.
 *
 * NOTE(review): the previous final expression,
 * `isLocalFileScan || file !== undefined ? filePath : null`, was always truthy
 * because `readFileSync(..., 'utf8')` returns a string, so the sitemap sniffing
 * never influenced the result. The path is therefore returned for any readable
 * file, exactly as before; confirm whether non-sitemap files were meant to be
 * rejected.
 *
 * @param {string} filePath - Filesystem path or `file:///` URL.
 * @returns {string|null} The resolved path, or `null` when it cannot be
 *   resolved or does not exist.
 */
export const getFileSitemap = (filePath) => {
  let localPath = filePath;

  if (localPath.startsWith('file:///')) {
    const fileUrlPattern =
      os.platform() === 'win32'
        ? /^file:\/\/\/([A-Za-z]:\/[^?#]+)/
        : /^file:\/\/(\/[^?#]+)/;
    localPath = localPath.match(fileUrlPattern)?.[1];
    if (!localPath) {
      return null;
    }
  }

  localPath = convertToFilePath(localPath);
  if (!fs.existsSync(localPath)) {
    return null;
  }

  // Kept from the original flow: reading still throws for paths that exist but
  // cannot be read as a file (e.g. a directory).
  fs.readFileSync(localPath, 'utf8');
  return localPath;
};
|
|
207
|
+
/**
 * Returns the input prompt shown for the given scanner type.
 *
 * @param {*} scanner - One of the `ScannerTypes` values.
 * @returns {string} The prompt text, or `'Invalid option'` for any other value.
 */
export const getUrlMessage = (scanner) => {
  if (
    scanner === ScannerTypes.WEBSITE ||
    scanner === ScannerTypes.CUSTOM ||
    scanner === ScannerTypes.INTELLIGENT
  ) {
    return 'Please enter URL of website: ';
  }
  if (scanner === ScannerTypes.SITEMAP) {
    return 'Please enter URL or file path to sitemap, or drag and drop a sitemap file here: ';
  }
  if (scanner === ScannerTypes.LOCALFILE) {
    return 'Please enter file path: ';
  }
  return 'Invalid option';
};
|
|
221
|
+
/**
 * Reports whether `inputString` is non-empty and, after `validator.escape`,
 * passes `validator.isAscii`.
 *
 * @param {string} inputString - Raw user input.
 * @returns {boolean} `true` when the input passes both checks.
 */
export const isInputValid = (inputString) => {
  if (validator.isEmpty(inputString)) {
    return false;
  }
  const escapedInput = validator.escape(inputString);
  return Boolean(validator.isAscii(escapedInput));
};
|
|
230
|
+
/**
 * Strips the characters listed in `blackListCharacters` from `url` and reports
 * whether the input is acceptable.
 *
 * The input is accepted when the original string starts with `file://`
 * (case-insensitive) or when the stripped string passes `validator.isURL`
 * with `urlOptions`.
 *
 * @param {string} url - Raw URL input.
 * @returns {{isValid: boolean, url: string}} Validity flag plus the stripped URL.
 */
export const sanitizeUrlInput = (url) => {
  const sanitizeUrl = validator.blacklist(url, blackListCharacters);
  const isValid = Boolean(
    url.toLowerCase().startsWith('file://') || validator.isURL(sanitizeUrl, urlOptions),
  );
  return { isValid, url: sanitizeUrl };
};
|
|
238
|
+
/**
 * Reports whether a Content-Type value belongs to a supported document type
 * (HTML, XHTML, plain text, XML or PDF).
 *
 * Matching is case-insensitive and prefix-based, so parameters such as
 * `; charset=utf-8` are ignored. A falsy value is treated as an empty string.
 *
 * @param {string} [ct] - Content-Type header value or MIME type.
 * @returns {boolean} `true` when the type is supported.
 */
const isAllowedContentType = (ct) => {
  const normalised = (ct || '').toLowerCase();
  const supportedPrefixes = [
    'text/html', // html
    'application/xhtml+xml', // xhtml
    'text/plain', // txt
    'application/xml', // xml
    'text/xml', // xml (alt)
    'application/pdf', // pdf
  ];
  return supportedPrefixes.some((prefix) => normalised.startsWith(prefix));
};
|
|
248
|
+
/**
 * Checks whether `url` can be reached and holds a supported document, using a
 * headless persistent browser context.
 *
 * Flow:
 *  1. Reject input that fails `sanitizeUrlInput`.
 *  2. For non-http(s) URLs, stat the local file and check its MIME type; a
 *     local PDF succeeds immediately without launching the browser.
 *  3. Otherwise launch the browser, navigate with image/media/font/stylesheet
 *     requests aborted, and map the HTTP status or navigation error onto
 *     `constants.urlCheckStatuses`.
 *
 * Changes from the previous version:
 *  - `extraHTTPHeaders` may be omitted; previously setting the default
 *    `Accept` header threw a TypeError when it was undefined.
 *  - A `download` event now wins over the generic error mapping when
 *    navigation rejects, so the result is `notASupportedDocument` rather than
 *    `systemError`. The handler's `return res` had no effect and was removed.
 *
 * @param {string} url - URL (http, https or file) to probe.
 * @param {*} browserToRun - Passed to `initModifiedUserAgent` and
 *   `getPlaywrightLaunchOptions`.
 * @param {string} clonedDataDir - User data directory for the persistent context.
 * @param {object} playwrightDeviceDetailsObject - Spread into the context options.
 * @param {object} [extraHTTPHeaders] - Extra request headers. When an object is
 *   supplied, a default `Accept` header is added to it in place if missing.
 * @returns {Promise<RES>} Result whose `status` is a `urlCheckStatuses` code;
 *   `httpStatus`, `url` and `content` are set when a document was obtained.
 */
const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir, playwrightDeviceDetailsObject, extraHTTPHeaders) => {
  const res = new RES();
  const data = sanitizeUrlInput(url);
  if (!data.isValid) {
    res.status = constants.urlCheckStatuses.invalidUrl.code;
    return res;
  }

  // STEP 1: For local file scans
  let contentType = '';
  const { protocol } = new URL(url);
  if (protocol !== 'http:' && protocol !== 'https:') {
    try {
      const filePath = fileURLToPath(url);
      const stat = fs.statSync(filePath);
      if (!stat.isFile()) {
        res.status = constants.urlCheckStatuses.notALocalFile.code;
        return res;
      }
      const statusCode = 200;
      contentType = mime.getType(filePath) || 'application/octet-stream';
      if (!isAllowedContentType(contentType)) {
        res.status = constants.urlCheckStatuses.notASupportedDocument.code;
        return res;
      }
      // Short-circuit for pdfs
      if (contentType.includes('pdf')) {
        res.status = constants.urlCheckStatuses.success.code;
        res.httpStatus = statusCode;
        res.url = url;
        res.content = '%PDF-'; // Avoid putting the binary in memory
        return res;
      }
    } catch (e) {
      consoleLogger.info(`Local file check failed: ${e.message}`);
      res.status = constants.urlCheckStatuses.systemError.code;
      return res;
    }
  }

  // Ensure Accept header for non-html content fallback.
  // A caller-supplied object is still updated in place (existing behaviour);
  // a missing one is replaced by a fresh object instead of throwing.
  const requestHeaders = extraHTTPHeaders ?? {};
  requestHeaders.Accept ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';

  await initModifiedUserAgent(browserToRun, playwrightDeviceDetailsObject, clonedDataDir);

  let browserContext;
  try {
    browserContext = await constants.launcher.launchPersistentContext(clonedDataDir, {
      extraHTTPHeaders: requestHeaders,
      ignoreHTTPSErrors: true,
      headless: true,
      ...getPlaywrightLaunchOptions(browserToRun),
      ...playwrightDeviceDetailsObject,
      ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
    });
    register(browserContext);
  } catch (err) {
    printMessage([`Unable to launch browser\n${err}`], messageOptions);
    res.status = constants.urlCheckStatuses.browserError.code;
    return res;
  }

  // Set by the 'download' handler so the error mapping below can tell a
  // download-triggered navigation failure apart from a generic one.
  let downloadStarted = false;

  try {
    const page = await browserContext.newPage();

    // Block native Chrome download UI
    try {
      const cdp = await browserContext.newCDPSession(page);
      await cdp.send('Page.setDownloadBehavior', { behavior: 'deny' });
    } catch (e) {
      consoleLogger.info(`Unable to set download deny: ${e.message}`);
    }

    // OPTIMIZATION: Block heavy visual resources (Images/Fonts/CSS)
    // This allows the "Connectivity Check" to pass as soon as HTML is ready
    await page.route('**/*', (route) => {
      const type = route.request().resourceType();
      if (['image', 'media', 'font', 'stylesheet'].includes(type)) {
        return route.abort();
      }
      return route.continue();
    });

    // STEP 2: Navigate (follows server-side redirects)
    page.once('download', () => {
      downloadStarted = true;
      res.status = constants.urlCheckStatuses.notASupportedDocument.code;
    });

    // OPTIMIZATION: Wait for 'domcontentloaded' only
    const response = await page.goto(url, {
      timeout: 15000,
      waitUntil: 'domcontentloaded', // enough to get status + allow potential client redirects to kick in
    });
    if (!response) {
      throw new Error('No response from navigation');
    }

    // We use the response headers from the navigation we just performed.
    const finalUrl = page.url();
    const finalStatus = response.status();
    const headers = response.headers();
    contentType = headers['content-type'] || '';
    if (!isAllowedContentType(contentType)) {
      res.status = constants.urlCheckStatuses.notASupportedDocument.code;
      return res;
    }

    res.httpStatus = finalStatus;
    res.url = finalUrl;
    if (finalStatus === 401) {
      res.status = constants.urlCheckStatuses.unauthorised.code;
    } else if (finalStatus >= 200 && finalStatus < 400) {
      res.status = constants.urlCheckStatuses.success.code;
    } else if (finalStatus === 405 || finalStatus === 501) {
      // Some origins 405/501 but the browser-rendered page is still reachable after client redirects.
      // As a last resort, consider DOM presence as success if we actually have a document.
      const hasDOM = await page.evaluate(() => !!document && !!document.documentElement);
      res.status = hasDOM
        ? constants.urlCheckStatuses.success.code
        : constants.urlCheckStatuses.systemError.code;
    } else {
      res.status = constants.urlCheckStatuses.systemError.code;
    }

    // Content handling
    if (contentType.includes('pdf') || contentType.includes('octet-stream')) {
      res.content = '%PDF-'; // avoid binary in memory / download
    } else {
      try {
        // Try to get a stable DOM; don't fail the check if it times out
        // Note: Since we used 'domcontentloaded' in goto, this is fast, but kept for safety/stability
        await page.waitForLoadState('domcontentloaded', { timeout: 5000 });
      } catch {
        // Deliberately ignored: a timeout here must not fail the connectivity check.
      }
      res.content = await page.content();
    }
  } catch (error) {
    const message = error?.message ?? String(error);
    if (downloadStarted) {
      // The navigation was turned into a download; keep the handler's verdict.
      res.status = constants.urlCheckStatuses.notASupportedDocument.code;
    } else if (message.includes('net::ERR_INVALID_AUTH_CREDENTIALS')) {
      res.status = constants.urlCheckStatuses.unauthorised.code;
    } else if (message.includes('net::ERR_NAME_NOT_RESOLVED')) {
      res.status = constants.urlCheckStatuses.cannotBeResolved.code;
    } else if (message.includes('net::ERR_CONNECTION_REFUSED')) {
      res.status = constants.urlCheckStatuses.connectionRefused.code;
    } else if (message.includes('net::ERR_TIMED_OUT')) {
      res.status = constants.urlCheckStatuses.timedOut.code;
    } else if (message.includes('net::ERR_SSL_PROTOCOL_ERROR')) {
      res.status = constants.urlCheckStatuses.sslProtocolError.code;
    } else {
      consoleLogger.error(error);
      res.status = constants.urlCheckStatuses.systemError.code;
    }
  } finally {
    await browserContext.close();
  }

  return res;
};
|
|
406
|
+
/**
 * Reports whether `content` starts with the PDF signature `%PDF-`.
 *
 * Values that are neither a Buffer nor a string (including `undefined` and
 * `null`) are reported as not-PDF instead of throwing a TypeError.
 *
 * @param {Buffer|string|null|undefined} content - Document bytes or text.
 * @returns {boolean} `true` when the first five characters are `%PDF-`.
 */
export const isPdfContent = (content) => {
  const pdfSignature = '%PDF-';
  if (Buffer.isBuffer(content)) {
    return content.toString('utf8', 0, pdfSignature.length) === pdfSignature;
  }
  if (typeof content === 'string') {
    return content.startsWith(pdfSignature);
  }
  return false;
};
|
|
416
|
+
/**
 * Heuristically decides whether `content` should be treated as a sitemap.
 *
 * It is a sitemap when it parses as XML, when it is HTML that also contains a
 * `urlset`/`feed`/`rss` tag (an XML sitemap wrapped in an HTML document), or
 * when it is not HTML but has a line containing an http(s) URL (a text
 * sitemap whose URLs will be extracted for crawling).
 *
 * @param {string} content - Document text.
 * @returns {boolean} `true` for sitemap-like content, `false` for an HTML page
 *   or anything else.
 */
export const isSitemapContent = (content) => {
  if (validateXML(content).isValid) {
    return true;
  }

  const htmlTagPattern = /<(?:!doctype html|html|head|body)+?>/im;
  const xmlSitemapTagPattern = /<(?:urlset|feed|rss)+?.*>/im;
  const urlLinePattern = /^.*(http|https):\/{2}.*$/im;

  if (content.match(htmlTagPattern) !== null) {
    // HTML only counts when it wraps an XML sitemap.
    return content.match(xmlSitemapTagPattern) !== null;
  }

  // Not HTML: a text sitemap needs at least one URL line.
  return content.match(urlLinePattern) !== null;
};
|
|
435
|
+
/**
 * Runs the browser connectivity check for `url`, then downgrades a successful
 * result when the fetched document does not fit the requested scan.
 *
 * Post-checks, applied only to a successful result and in this order:
 *  - HTML-only scans reject PDF content (`notASupportedDocument`);
 *  - PDF-only scans reject non-PDF content (`notAPdf`);
 *  - sitemap scans reject content that is not a sitemap (`notASitemap`).
 *
 * @param {*} scanner - A `ScannerTypes` value.
 * @param {string} url - URL to check.
 * @param {*} browser - Browser to run the check with.
 * @param {string} clonedDataDir - Browser user data directory.
 * @param {object} playwrightDeviceDetailsObject - Device/context options.
 * @param {object} extraHTTPHeaders - Extra request headers.
 * @param {*} fileTypes - A `FileTypes` value.
 * @returns {Promise<object>} The connectivity result, with `status` possibly adjusted.
 */
export const checkUrl = async (scanner, url, browser, clonedDataDir, playwrightDeviceDetailsObject, extraHTTPHeaders, fileTypes) => {
  const res = await checkUrlConnectivityWithBrowser(
    url,
    browser,
    clonedDataDir,
    playwrightDeviceDetailsObject,
    extraHTTPHeaders,
  );

  // Anything other than success was already classified by the connectivity check.
  if (res.status !== constants.urlCheckStatuses.success.code) {
    return res;
  }

  const isPdf = isPdfContent(res.content);

  if (fileTypes === FileTypes.HtmlOnly && isPdf) {
    // Only HTML documents are allowed to be scanned
    res.status = constants.urlCheckStatuses.notASupportedDocument.code;
    return res;
  }

  if (fileTypes === FileTypes.PdfOnly && !isPdf) {
    // Only PDF documents are allowed to be scanned
    res.status = constants.urlCheckStatuses.notAPdf.code;
    return res;
  }

  if (scanner === ScannerTypes.SITEMAP && !isSitemapContent(res.content)) {
    // A sitemap is expected
    res.status = constants.urlCheckStatuses.notASitemap.code;
  }

  return res;
};
|
|
460
|
+
// True when `obj` has no own enumerable string-keyed properties.
const isEmptyObject = (obj) => Object.keys(obj).length === 0;
|
|
461
|
+
/**
 * Parses a header string of the form `"<header> <value>, <header2> <value2>"`
 * into an object keyed by header name.
 *
 * Entries are separated by `", "`; within an entry the first space separates
 * the name from the value, and the value may itself contain spaces. For an
 * entry without a value an error message is printed and `cleanUpAndExit(1)`
 * is called.
 *
 * @param {string} [header] - Raw header string; a falsy value yields `{}`.
 * @returns {Object<string, string>} Map of header name to value.
 */
export const parseHeaders = (header) => {
  if (!header) {
    return {};
  }

  const allHeaders = {};
  for (const entry of header.split(', ')) {
    // Split on the first space only; the capture group keeps the remainder as the value.
    const parts = entry.split(/ (.*)/s);
    if (parts.length < 2) {
      printMessage(
        [
          `Invalid value for authorisation request header. Please provide valid keywords in the format: "<header> <value>". For multiple authentication headers, please provide the keywords in the format: "<header> <value>, <header2> <value2>, ..." .`,
        ],
        messageOptions,
      );
      cleanUpAndExit(1);
    }
    const [name, value] = parts;
    allHeaders[name] = value; // {"header": "value", "header2": "value2", ...}
  }

  return allHeaders;
};
|
|
479
|
+
/**
 * Builds the scan configuration object from the parsed input arguments.
 *
 * Along the way it:
 *  - parses the `header` argument into `extraHTTPHeaders`;
 *  - turns a file-path `url` into an absolute `file://` URL and marks the scan as local,
 *    otherwise strips credentials embedded in the URL and moves them into a
 *    `Basic` Authorization header;
 *  - derives the result folder name (`<date>_<time>[_label]_<domain>_<3 digits>`);
 *  - resolves the browser and its cloned user data directory;
 *  - optionally loads robots.txt rules when `followRobots` is set.
 *
 * Side effects: writes `constants.exportDirectory` (when `exportDirectory` is given),
 * `constants.robotsTxtUrls` (when `followRobots` is set), `constants.userDataDirectory`
 * and `constants.randomToken`.
 *
 * @param {object} argv parsed input arguments
 * @returns {Promise<object>} the data object consumed by the scan
 * @throws {Error} when `argv` is an empty object, or when `url` is neither a file path
 *   nor a parseable URL (thrown by `new URL`)
 */
export const prepareData = async (argv) => {
    if (isEmptyObject(argv)) {
        throw Error('No inputs should be provided');
    }

    // Values that are passed through untouched.
    const {
        scanner,
        headless,
        customDevice,
        viewportWidth,
        maxpages,
        strategy,
        nameEmail,
        customFlowLabel,
        specifiedMaxConcurrency,
        fileTypes,
        blacklistedPatternsFilename,
        additional,
        metadata,
        followRobots,
        header,
        safeMode,
        exportDirectory,
        zip,
        ruleset,
        generateJsonFiles,
        scanDuration,
    } = argv;
    // Values that get normalised below.
    let {
        url,
        deviceChosen,
        browserToRun,
        isLocalFileScan = argv.scanner === ScannerTypes.LOCALFILE,
    } = argv;

    const extraHTTPHeaders = parseHeaders(header);

    if (isFilePath(url)) {
        // A file path always means a local file scan; store it as an absolute file:// URL.
        isLocalFileScan = true;
        url = convertPathToLocalFile(path.resolve(url));
    }
    else {
        // Move basic-auth credentials embedded in the URL into extraHTTPHeaders.
        const parsedUrl = new URL(url);
        const { username, password } = parsedUrl;
        if (username !== '' || password !== '') {
            const encodedCredentials = Buffer.from(`${username}:${password}`).toString('base64');
            extraHTTPHeaders.Authorization = `Basic ${encodedCredentials}`;
        }
        parsedUrl.username = '';
        parsedUrl.password = '';
        url = parsedUrl.toString();
    }

    // Construct the filename for scan results.
    const timestamp = new Date().toLocaleString('sv').replaceAll(/-|:/g, '');
    const [date, time] = timestamp.split(' ');
    const domain = isLocalFileScan ? path.basename(url) : new URL(url).hostname;
    const sanitisedLabel = customFlowLabel ? `_${customFlowLabel.replaceAll(' ', '_')}` : '';
    const resultFilename = `${date}_${time}${sanitisedLabel}_${domain}_${randomThreeDigitNumberString()}`;

    if (exportDirectory) {
        constants.exportDirectory = path.join(exportDirectory, resultFilename);
    }

    // 'Desktop' / 'Mobile' custom devices take precedence over the chosen device.
    if (customDevice === 'Desktop' || customDevice === 'Mobile') {
        deviceChosen = customDevice;
    }
    const playwrightDeviceDetailsObject = getPlaywrightDeviceDetailsObject(deviceChosen, customDevice, viewportWidth);

    ({ browserToRun } = getBrowserToRun(resultFilename, browserToRun, true));
    const resolvedUserDataDirectory = getClonedProfilesWithRandomToken(browserToRun, resultFilename);

    if (followRobots) {
        constants.robotsTxtUrls = {};
        await getUrlsFromRobotsTxt(url, browserToRun, resolvedUserDataDirectory, extraHTTPHeaders);
    }

    constants.userDataDirectory = resolvedUserDataDirectory;
    constants.randomToken = resultFilename;

    const enqueueStrategy = strategy === 'same-hostname'
        ? EnqueueStrategy.SameHostname
        : EnqueueStrategy.SameDomain;

    return {
        type: scanner,
        url,
        entryUrl: url,
        isHeadless: headless,
        deviceChosen,
        customDevice,
        viewportWidth,
        playwrightDeviceDetailsObject,
        maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl,
        strategy: enqueueStrategy,
        isLocalFileScan,
        browser: browserToRun,
        nameEmail,
        customFlowLabel,
        specifiedMaxConcurrency,
        randomToken: resultFilename,
        fileTypes: FileTypes[getEnumKey(FileTypes, fileTypes)],
        blacklistedPatternsFilename,
        includeScreenshots: additional !== 'none',
        metadata,
        followRobots,
        extraHTTPHeaders,
        safeMode,
        userDataDirectory: resolvedUserDataDirectory,
        zip,
        ruleset,
        generateJsonFiles,
        scanDuration,
    };
};
|
|
564
|
+
/**
 * Fetches robots.txt for the origin of `url` and caches the Allow/Disallow rules of the
 * `User-agent: *` group in `constants.robotsTxtUrls[origin]` as glob patterns prefixed
 * with the origin.
 *
 * Does nothing when `constants.robotsTxtUrls` is not initialised or when the origin is
 * already cached. When robots.txt cannot be fetched (or is empty) an empty object is
 * cached for the origin.
 *
 * Lines are split on the first colon, so both `Disallow: /path` and `Disallow:/path`
 * (no space after the colon) and lines with leading whitespace are understood.
 *
 * @param {string} url any URL on the site whose robots.txt should be read
 * @param {string} browserToRun browser passed through to the robots.txt fetcher
 * @param {string} userDataDirectory user data directory passed through to the fetcher
 * @param {object} extraHTTPHeaders extra HTTP headers passed through to the fetcher
 * @returns {Promise<void>}
 */
export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory, extraHTTPHeaders) => {
    if (!constants.robotsTxtUrls)
        return;
    const domain = new URL(url).origin;
    if (constants.robotsTxtUrls[domain])
        return;
    const robotsUrl = domain.concat('/robots.txt');
    let robotsTxt;
    try {
        robotsTxt = await getRobotsTxtViaPlaywright(robotsUrl, browserToRun, userDataDirectory, extraHTTPHeaders);
        consoleLogger.info(`Fetched robots.txt from ${robotsUrl}`);
    }
    catch (e) {
        // if robots.txt is not found, do nothing
        consoleLogger.info(`Unable to fetch robots.txt from ${robotsUrl}`);
    }
    if (!robotsTxt) {
        constants.robotsTxtUrls[domain] = {};
        return;
    }

    // Converts a robots.txt path pattern into an origin-prefixed glob pattern.
    const sanitisePattern = (pattern) => {
        const directoryRegex = /^\/(?:[^?#/]+\/)*[^?#]*$/;
        const filePathRegex = /^\/(?:[^\/]+\/)*[^\/]+\.[a-zA-Z0-9]{1,6}$/;
        // A `*` path segment should match any depth of sub-directories.
        let globPattern = pattern.replace(/\/\*\//g, '/**/');
        if (globPattern.match(directoryRegex) && !globPattern.match(filePathRegex)) {
            if (globPattern.endsWith('*')) {
                globPattern = globPattern.concat('*');
            }
            else {
                if (!globPattern.endsWith('/'))
                    globPattern = globPattern.concat('/');
                globPattern = globPattern.concat('**');
            }
        }
        return domain.concat(globPattern);
    };

    const disallowedUrls = [];
    const allowedUrls = [];
    let shouldCapture = false;
    for (const rawLine of robotsTxt.split(/\r?\n/)) {
        const line = rawLine.trim();
        const separatorIndex = line.indexOf(':');
        if (separatorIndex === -1)
            continue; // blank line or something that is not a `field: value` record
        const directive = line.slice(0, separatorIndex).trim().toLowerCase();
        const value = line.slice(separatorIndex + 1).trim();
        if (directive === 'user-agent') {
            if (value.startsWith('*')) {
                shouldCapture = true;
            }
            else if (shouldCapture) {
                // Next user-agent group starts: the wildcard group is complete.
                break;
            }
        }
        else if (shouldCapture && directive === 'disallow') {
            if (value) {
                disallowedUrls.push(sanitisePattern(value));
            }
        }
        else if (shouldCapture && directive === 'allow') {
            if (value) {
                allowedUrls.push(sanitisePattern(value));
            }
        }
    }
    constants.robotsTxtUrls[domain] = { disallowedUrls, allowedUrls };
};
|
|
632
|
+
/**
 * Loads `robotsUrl` in a persistent browser context and returns the text content of
 * the page body.
 *
 * When `process.env.CRAWLEE_HEADLESS === '1'` a dedicated `robots` sub-directory of
 * `userDataDirectory` is created and used as the context's user data directory;
 * otherwise an empty string is passed to the launcher.
 * The browser context is always closed before returning, including on failure.
 *
 * @param {string} robotsUrl absolute URL of the robots.txt file
 * @param {string} browser browser identifier passed to `getPlaywrightLaunchOptions`
 * @param {string} userDataDirectory parent directory for the dedicated `robots` profile
 * @param {object} [extraHTTPHeaders] extra HTTP headers applied to the context when truthy
 * @returns {Promise<string>} text content of the fetched page body
 */
const getRobotsTxtViaPlaywright = async (robotsUrl, browser, userDataDirectory, extraHTTPHeaders) => {
    let robotsDataDir = '';
    // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
    if (process.env.CRAWLEE_HEADLESS === '1') {
        // Create robots own user data directory else SingletonLock: File exists (17) with crawlDomain or crawlSitemap's own browser
        robotsDataDir = path.join(userDataDirectory, 'robots');
        if (!fs.existsSync(robotsDataDir)) {
            fs.mkdirSync(robotsDataDir, { recursive: true });
        }
    }
    const browserContext = await constants.launcher.launchPersistentContext(robotsDataDir, {
        ...getPlaywrightLaunchOptions(browser),
        ...(extraHTTPHeaders && { extraHTTPHeaders }),
    });
    register(browserContext);
    try {
        const page = await browserContext.newPage();
        await page.goto(robotsUrl, { waitUntil: 'networkidle', timeout: 30000 });
        return await page.evaluate(() => document.body.textContent);
    }
    finally {
        // Release the browser even when navigation fails or times out.
        await browserContext.close();
    }
};
|
|
652
|
+
/**
 * Tells whether `url` is blocked by the robots.txt rules cached in
 * `constants.robotsTxtUrls` for the URL's origin.
 *
 * A URL is blocked when it matches at least one disallow pattern and no allow pattern.
 * Returns `false` when no rules are cached at all, when nothing is cached for the
 * origin, or when the cached entry is empty (robots.txt could not be fetched).
 *
 * @param {string} url absolute URL to check
 * @returns {boolean} `true` when the URL is disallowed, `false` otherwise
 */
export const isDisallowedInRobotsTxt = (url) => {
    if (!constants.robotsTxtUrls)
        return false;
    const rules = constants.robotsTxtUrls[new URL(url).origin];
    if (!rules)
        return false;
    // An empty `{}` entry is cached when robots.txt was unavailable, so default both lists.
    const { disallowedUrls = [], allowedUrls = [] } = rules;
    const matchesUrl = (pattern) => minimatch(url, pattern);
    return disallowedUrls.some(matchesUrl) && !allowedUrls.some(matchesUrl);
};
|
|
670
|
+
/**
 * Collects the links listed in a sitemap (XML sitemap, sitemap index, RSS, Atom or a
 * plain text list of URLs) and returns them as a list of `Request` objects.
 *
 * The sitemap may be an http(s) URL (loaded through a persistent browser context) or a
 * local file. Sitemap indexes are followed recursively; each sitemap location is
 * processed at most once. URLs blocked by the cached robots.txt rules are skipped, and
 * PDF URLs are flagged with `skipNavigation`.
 *
 * Errors raised while processing are logged through `consoleLogger.error` and the
 * links gathered so far are returned.
 *
 * @param {string} sitemapUrl URL or file path of the sitemap
 * @param {number} maxLinksCount maximum number of links taken from each sitemap document
 * @param {string} browser browser identifier passed to `getPlaywrightLaunchOptions`
 * @param {string|null|undefined} userDataDirectory browser user data directory
 * @param {string} userUrlInput URL entered by the user; used to rank links when `isIntelligent`
 * @param {boolean} isIntelligent when truthy, sort links by closeness to `userUrlInput`, then by date
 * @param {object} [extraHTTPHeaders] extra HTTP headers applied to the browser context when truthy
 * @returns {Promise<Array>} list of requests, one per collected URL
 */
export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders) => {
    const scannedSitemaps = new Set();
    const urls = {}; // dictionary of requests to urls to be scanned
    const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;

    const addToUrlList = (url) => {
        if (!url)
            return;
        if (isDisallowedInRobotsTxt(url))
            return;
        const targetUrl = convertPathToLocalFile(url);
        let request;
        try {
            request = new Request({ url: targetUrl });
        }
        catch (e) {
            console.log('Error creating request', e);
            // Without a request there is nothing to enqueue for this URL.
            return;
        }
        if (isUrlPdf(targetUrl)) {
            request.skipNavigation = true;
        }
        urls[targetUrl] = request;
    };

    // 2 = same as the user's URL, 1 = underneath the user's URL, 0 = unrelated.
    const calculateCloseness = (candidateUrl) => {
        // Remove 'http://', 'https://', and 'www.' prefixes from the URLs
        const normalizedSitemapUrl = (candidateUrl ?? '').replace(/^(https?:\/\/)?(www\.)?/, '');
        const normalizedUserUrlInput = userUrlInput
            .replace(/^(https?:\/\/)?(www\.)?/, '')
            .replace(/\/$/, ''); // Remove trailing slash also
        if (normalizedSitemapUrl === normalizedUserUrlInput) {
            return 2;
        }
        if (normalizedSitemapUrl.startsWith(normalizedUserUrlInput)) {
            return 1;
        }
        return 0;
    };

    const processXmlSitemap = async ($, sitemapType, linkSelector, dateSelector, sectionSelector) => {
        const urlList = [];
        // Iterate through each URL element in the sitemap, collect url and modified date
        $(sectionSelector).each((_index, urlElement) => {
            const linkElement = $(urlElement).find(linkSelector);
            // Atom feeds carry the link in the href attribute, the other formats in the text.
            const url = sitemapType === constants.xmlSitemapTypes.atom
                ? linkElement.prop('href')
                : linkElement.text();
            const lastModified = $(urlElement).find(dateSelector).text();
            const lastModifiedDate = lastModified ? new Date(lastModified) : null;
            urlList.push({ url, lastModifiedDate });
        });
        if (isIntelligent) {
            // Sort by closeness to userUrlInput in descending order
            urlList.sort((a, b) => {
                const closenessA = calculateCloseness(a.url);
                const closenessB = calculateCloseness(b.url);
                if (closenessA !== closenessB) {
                    return closenessB - closenessA;
                }
                // If closeness is the same, sort by last modified date in descending order
                return (b.lastModifiedDate?.getTime() || 0) - (a.lastModifiedDate?.getTime() || 0);
            });
        }
        // Add the sorted URLs to the main URL list
        for (const { url } of urlList.slice(0, maxLinksCount)) {
            addToUrlList(url);
        }
    };

    const processNonStandardSitemap = (data) => {
        const urlsFromData = crawlee
            .extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') })
            .slice(0, maxLinksCount);
        urlsFromData.forEach(url => {
            addToUrlList(url);
        });
    };

    let finalUserDataDirectory = userDataDirectory;
    if (userDataDirectory === null || userDataDirectory === undefined) {
        finalUserDataDirectory = '';
    }

    const fetchUrls = async (sitemapLocation, headers) => {
        if (scannedSitemaps.has(sitemapLocation)) {
            // Skip processing if the sitemap has already been scanned
            return;
        }
        scannedSitemaps.add(sitemapLocation);
        // Convert file if its not local file path
        let target = convertLocalFileToPath(sitemapLocation);
        // Check whether its a file path or a URL
        if (isFilePath(target)) {
            if (!fs.existsSync(target)) {
                return;
            }
        }
        else if (!isValidHttpUrl(target)) {
            printMessage([`Invalid Url/Filepath: ${target}`], messageOptions);
            return;
        }

        // Loads the sitemap in a browser and returns its text, or undefined when the
        // page exposes neither a body nor a recognised sitemap root element.
        const getDataUsingPlaywright = async () => {
            const browserContext = await constants.launcher.launchPersistentContext(finalUserDataDirectory, {
                ...getPlaywrightLaunchOptions(browser),
                // Not necessary to parse http_credentials as I am parsing it directly in URL
                // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
                ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
                ...(headers && { extraHTTPHeaders: headers }),
            });
            register(browserContext);
            try {
                const page = await browserContext.newPage();
                await page.goto(target, { waitUntil: 'networkidle', timeout: 60000 });
                if ((await page.locator('body').count()) > 0) {
                    return await page.locator('body').innerText();
                }
                for (const rootTag of ['urlset', 'sitemapindex', 'rss', 'feed']) {
                    const rootLocator = page.locator(rootTag);
                    if ((await rootLocator.count()) > 0) {
                        return await rootLocator.evaluate(elem => elem.outerHTML);
                    }
                }
                return undefined;
            }
            finally {
                // Release the browser even when navigation fails or times out.
                await browserContext.close();
            }
        };

        let data;
        if (validator.isURL(target, urlOptions)) {
            if (isUrlPdf(target)) {
                addToUrlList(target);
                return;
            }
            data = await getDataUsingPlaywright();
        }
        else {
            target = convertLocalFileToPath(target);
            data = fs.readFileSync(target, 'utf8');
        }
        if (data === undefined || data === null) {
            // Nothing could be read from the page, so there is nothing to parse.
            consoleLogger.info(`No sitemap content found at ${target}`);
            return;
        }

        const $ = cheerio.load(data, { xml: true });
        // This case is when the document is not an XML format document
        if ($(':root').length === 0) {
            processNonStandardSitemap(data);
            return;
        }
        // Root element
        const root = $(':root')[0];
        const { xmlns } = root.attribs;
        const xmlFormatNamespace = '/schemas/sitemap';
        // xmlns may be absent; such documents fall through to the unknown format.
        const hasSitemapNamespace = Boolean(xmlns?.includes(xmlFormatNamespace));
        let sitemapType;
        if (root.name === 'urlset' && hasSitemapNamespace) {
            sitemapType = constants.xmlSitemapTypes.xml;
        }
        else if (root.name === 'sitemapindex' && hasSitemapNamespace) {
            sitemapType = constants.xmlSitemapTypes.xmlIndex;
        }
        else if (root.name === 'rss') {
            sitemapType = constants.xmlSitemapTypes.rss;
        }
        else if (root.name === 'feed') {
            sitemapType = constants.xmlSitemapTypes.atom;
        }
        else {
            sitemapType = constants.xmlSitemapTypes.unknown;
        }
        switch (sitemapType) {
            case constants.xmlSitemapTypes.xmlIndex:
                consoleLogger.info(`This is a XML format sitemap index.`);
                for (const childSitemapUrl of $('loc')) {
                    const childSitemapUrlText = $(childSitemapUrl).text();
                    if (isLimitReached()) {
                        break;
                    }
                    if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) {
                        await fetchUrls(childSitemapUrlText, headers); // Recursive call for nested sitemaps
                    }
                    else {
                        addToUrlList(childSitemapUrlText); // Add regular URLs to the list
                    }
                }
                break;
            case constants.xmlSitemapTypes.xml:
                consoleLogger.info(`This is a XML format sitemap.`);
                await processXmlSitemap($, sitemapType, 'loc', 'lastmod', 'url');
                break;
            case constants.xmlSitemapTypes.rss:
                consoleLogger.info(`This is a RSS format sitemap.`);
                await processXmlSitemap($, sitemapType, 'link', 'pubDate', 'item');
                break;
            case constants.xmlSitemapTypes.atom:
                consoleLogger.info(`This is a Atom format sitemap.`);
                await processXmlSitemap($, sitemapType, 'link', 'published', 'entry');
                break;
            default:
                consoleLogger.info(`This is an unrecognised XML sitemap format.`);
                processNonStandardSitemap(data);
        }
    };

    try {
        await fetchUrls(sitemapUrl, extraHTTPHeaders);
    }
    catch (e) {
        consoleLogger.error(e);
    }
    return Object.values(urls);
};
|
|
886
|
+
/**
 * Loose e-mail shape check: something, an `@`, something, a dot, something.
 *
 * @param {string} email value to check
 * @returns {boolean} `true` when the value has the `<text>@<text>.<text>` shape
 */
export const validEmail = (email) => /^.+@.+\..+$/u.test(email);
|
|
890
|
+
// For new user flow.
/**
 * Validates a user supplied name.
 *
 * A name is accepted when all of the following hold:
 *  - its length is not below 2 and not above 32000;
 *  - it consists only of letters/digits from any language, whitespace and the
 *    punctuation listed in the first pattern;
 *  - it contains none of the characters in the injection pattern (this second check
 *    rejects several characters the first pattern lets through, e.g. quotes and brackets).
 *
 * @param {string} name value to check
 * @returns {boolean} `true` when the name passes every check
 */
export const validName = (name) => {
    const hasInvalidLength = name.length < 2 || name.length > 32000;
    if (hasInvalidLength) {
        return false;
    }
    // Printable characters from any language.
    const allowedCharactersPattern = /^[\p{L}\p{N}\s'".,()\[\]{}!?:؛،؟…]+$/u;
    // Characters commonly used in injection patterns.
    const injectionCharactersPattern = /[<>'"\\/;|&!$*{}()\[\]\r\n\t]/;
    return allowedCharactersPattern.test(name) && !injectionCharactersPattern.test(name);
};
|
|
911
|
+
/**
 * Check for browser available to run scan and clone data directory of the browser if needed.
 *
 * Resolution order:
 *  - no preference on Windows/Mac: Chrome is preferred;
 *  - Chrome preferred: Chrome, else webkit on Mac, else Edge on Windows (the process
 *    exits when Edge is unavailable too), else Chromium;
 *  - Edge preferred: Edge, else Chrome, else webkit on Mac, the process exits on
 *    Windows, else Chromium;
 *  - anything else: Chromium.
 *
 * Falling back to webkit sets `constants.launcher = webkit` and returns
 * `browserToRun: null` with an empty cloned data directory.
 *
 * @param randomToken string token used to name the cloned browser data directory
 * @param preferredBrowser string of user's preferred browser
 * @param isCli boolean flag to indicate if function is called from cli
 * @returns object consisting of browser to run and cloned data directory
 */
export const getBrowserToRun = (randomToken, preferredBrowser, isCli = false) => {
    const platform = os.platform();
    // Fallback notices are only shown when running from the CLI.
    const notifyCli = (message) => {
        if (isCli)
            printMessage([message], messageOptions);
    };

    let browserChoice = preferredBrowser;
    // Prioritise Chrome on Windows and Mac platforms if user does not specify a browser
    if (!browserChoice && (platform === 'win32' || platform === 'darwin')) {
        browserChoice = BrowserTypes.CHROME;
    }
    else if (browserChoice) {
        // Only announce a preference that was actually given.
        printMessage([`Preferred browser ${browserChoice}`], messageOptions);
    }

    if (browserChoice === BrowserTypes.CHROME) {
        const chromeData = getChromeData(randomToken);
        if (chromeData)
            return chromeData;
        if (platform === 'darwin') {
            // mac user who specified -b chrome but does not have chrome
            notifyCli('Unable to use Chrome, falling back to webkit...');
            constants.launcher = webkit;
            return { browserToRun: null, clonedBrowserDataDir: '' };
        }
        if (platform === 'win32') {
            notifyCli('Unable to use Chrome, falling back to Edge browser...');
            const edgeData = getEdgeData(randomToken);
            if (edgeData)
                return edgeData;
            notifyCli('Unable to use both Chrome and Edge. Please try again.');
            process.exit(constants.urlCheckStatuses.browserError.code);
        }
        notifyCli('Unable to use Chrome, falling back to Chromium browser...');
    }
    else if (browserChoice === BrowserTypes.EDGE) {
        const edgeData = getEdgeData(randomToken);
        if (edgeData)
            return edgeData;
        notifyCli('Unable to use Edge, falling back to Chrome browser...');
        const chromeData = getChromeData(randomToken);
        if (chromeData)
            return chromeData;
        if (platform === 'darwin') {
            // mac user who specified -b edge but does not have edge or chrome
            notifyCli('Unable to use both Edge and Chrome, falling back to webkit...');
            constants.launcher = webkit;
            return { browserToRun: null, clonedBrowserDataDir: '' };
        }
        if (platform === 'win32') {
            notifyCli('Unable to use both Edge and Chrome. Please try again.');
            process.exit(constants.urlCheckStatuses.browserError.code);
        }
        else {
            // linux and other OS
            notifyCli('Unable to use both Edge and Chrome, falling back to Chromium browser...');
        }
    }
    // defaults to chromium
    return {
        browserToRun: BrowserTypes.CHROMIUM,
        clonedBrowserDataDir: cloneChromiumProfiles(randomToken),
    };
};
|
|
984
|
+
/**
 * Clones the browser profile a second time, keyed by the random token, so that
 * parallel browser sessions each get their own copy.
 * This also mitigates a known bug where cookies are overridden after each browser
 * session - i.e. the user is logged out after checkingUrl and the same cookie cannot
 * be reused for the scan.
 *
 * @param browser browser identifier; Chrome and Edge use their own cloner, anything else uses Chromium's
 * @param randomToken token passed to the cloner
 * @returns whatever the selected profile cloner returns
 */
export const getClonedProfilesWithRandomToken = (browser, randomToken) => {
    switch (browser) {
        case BrowserTypes.CHROME:
            return cloneChromeProfiles(randomToken);
        case BrowserTypes.EDGE:
            return cloneEdgeProfiles(randomToken);
        default:
            return cloneChromiumProfiles(randomToken);
    }
};
|
|
999
|
+
/**
 * Resolves Chrome as the browser to run, together with its cloned data directory.
 *
 * @param randomToken token passed to the Chrome profile cloner
 * @returns `{ browserToRun, clonedBrowserDataDir }` when both the default Chrome data
 *   directory and the cloned directory are available, otherwise `null`
 */
export const getChromeData = (randomToken) => {
    const defaultDataDir = getDefaultChromeDataDir();
    const clonedBrowserDataDir = cloneChromeProfiles(randomToken);
    const isChromeUsable = defaultDataDir && clonedBrowserDataDir;
    return isChromeUsable
        ? { browserToRun: BrowserTypes.CHROME, clonedBrowserDataDir }
        : null;
};
|
|
1008
|
+
/**
 * Resolves Edge as the browser to run, together with its cloned data directory.
 *
 * @param randomToken token passed to the Edge profile cloner
 * @returns `{ browserToRun, clonedBrowserDataDir }` when both the default Edge data
 *   directory and the cloned directory are available, otherwise `null`
 */
export const getEdgeData = (randomToken) => {
    const browserDataDir = getDefaultEdgeDataDir();
    const clonedBrowserDataDir = cloneEdgeProfiles(randomToken);
    if (browserDataDir && clonedBrowserDataDir) {
        const browserToRun = BrowserTypes.EDGE;
        return { browserToRun, clonedBrowserDataDir };
    }
    // Explicit "not available" result, matching getChromeData.
    return null;
};
|
|
1016
|
+
/**
 * Clone the Chrome profile cookie files to the destination directory.
 *
 * Cookie files are looked up on Windows and Mac only; on any other platform nothing is
 * found and the function reports failure instead of throwing. A cookies file that
 * already exists at the destination is left untouched.
 *
 * @param {*} options glob options object
 * @param {*} destDir destination directory
 * @returns boolean indicating whether the operation was successful
 */
const cloneChromeProfileCookieFiles = (options, destDir) => {
    const platform = os.platform();
    let profileCookiesDir = [];
    // Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
    // and ../Chrome/<profile name>/Cookies for mac
    let profileNamesRegex;
    // Both globs ignore the cloned oobee directory if it exists.
    if (platform === 'win32') {
        profileCookiesDir = globSync('**/Network/Cookies', {
            ...options,
            ignore: ['oobee*/**'],
        });
        profileNamesRegex = /User Data\\(.*?)\\Network/;
    }
    else if (platform === 'darwin') {
        profileCookiesDir = globSync('**/Cookies', {
            ...options,
            ignore: 'oobee*/**',
        });
        profileNamesRegex = /Chrome\/(.*?)\/Cookies/;
    }

    if (profileCookiesDir.length === 0) {
        consoleLogger.warn('Unable to find Chrome profile cookies file in the system.');
        printMessage(['Unable to find Chrome profile cookies file in the system.'], messageOptions);
        return false;
    }

    let success = true;
    for (const cookiesFile of profileCookiesDir) {
        // Skip paths the profile name cannot be derived from.
        const profileName = cookiesFile.match(profileNamesRegex)?.[1];
        if (!profileName)
            continue;
        let destProfileDir = path.join(destDir, profileName);
        if (platform === 'win32') {
            destProfileDir = path.join(destProfileDir, 'Network');
        }
        // Recursive true to create all parent directories (e.g. PbProfile/Default/Cookies)
        if (!fs.existsSync(destProfileDir)) {
            fs.mkdirSync(destProfileDir, { recursive: true });
        }
        const destCookiesFile = path.join(destProfileDir, 'Cookies');
        // Prevents duplicate cookies file if the cookies already exist
        if (fs.existsSync(destCookiesFile))
            continue;
        try {
            fs.copyFileSync(cookiesFile, destCookiesFile);
        }
        catch (err) {
            consoleLogger.error(err);
            if (err.code === 'EBUSY') {
                console.log(`Unable to copy the file for ${profileName} because it is currently in use.`);
                console.log('Please close any applications that might be using this file and try again.');
            }
            else {
                console.log(`An unexpected error occurred for ${profileName} while copying the file: ${err.message}`);
            }
            success = false;
        }
    }
    return success;
};
|
|
1084
|
+
/**
 * Clone the Edge profile cookie files to the destination directory.
 *
 * Cookie files are looked up on Windows and Mac only; on any other platform nothing is
 * found and the function reports failure instead of throwing. A cookies file that
 * already exists at the destination is left untouched.
 *
 * @param {*} options glob options object
 * @param {*} destDir destination directory
 * @returns boolean indicating whether the operation was successful
 */
const cloneEdgeProfileCookieFiles = (options, destDir) => {
    const platform = os.platform();
    let profileCookiesDir = [];
    // Cookies file per profile is located in .../User Data/<profile name>/Network/Cookies for windows
    // and ../Microsoft Edge/<profile name>/Cookies for mac
    let profileNamesRegex;
    // Ignores the cloned oobee directory if exists
    if (platform === 'win32') {
        profileCookiesDir = globSync('**/Network/Cookies', {
            ...options,
            ignore: 'oobee*/**',
        });
        profileNamesRegex = /User Data\\(.*?)\\Network/;
    }
    else if (platform === 'darwin') {
        profileCookiesDir = globSync('**/Cookies', {
            ...options,
            ignore: 'oobee*/**',
        });
        profileNamesRegex = /Microsoft Edge\/(.*?)\/Cookies/;
    }

    if (profileCookiesDir.length === 0) {
        consoleLogger.warn('Unable to find Edge profile cookies file in the system.');
        printMessage(['Unable to find Edge profile cookies file in the system.'], messageOptions);
        return false;
    }

    let success = true;
    for (const cookiesFile of profileCookiesDir) {
        // Skip paths the profile name cannot be derived from.
        const profileName = cookiesFile.match(profileNamesRegex)?.[1];
        if (!profileName)
            continue;
        let destProfileDir = path.join(destDir, profileName);
        if (platform === 'win32') {
            destProfileDir = path.join(destProfileDir, 'Network');
        }
        // Recursive true to create all parent directories (e.g. PbProfile/Default/Cookies)
        if (!fs.existsSync(destProfileDir)) {
            fs.mkdirSync(destProfileDir, { recursive: true });
        }
        const destCookiesFile = path.join(destProfileDir, 'Cookies');
        // Prevents duplicate cookies file if the cookies already exist
        if (fs.existsSync(destCookiesFile))
            continue;
        try {
            fs.copyFileSync(cookiesFile, destCookiesFile);
        }
        catch (err) {
            consoleLogger.error(err);
            if (err.code === 'EBUSY') {
                console.log(`Unable to copy the file for ${profileName} because it is currently in use.`);
                console.log('Please close any applications that might be using this file and try again.');
            }
            else {
                console.log(`An unexpected error occurred while copying the file: ${err.message}`);
            }
            success = false;
        }
    }
    return success;
};
|
|
1153
|
+
/**
 * Copies the browser's `Local State` file into the destination directory.
 * Both Edge and Chrome Local State files are located in the .../User Data directory.
 *
 * Every file matched by the `**\/*Local State` glob (depth 1) is copied to
 * `<destDir>/Local State`; a later match overwrites an earlier one.
 *
 * @param {*} options - glob options object
 * @param {string} destDir - destination directory
 * @returns boolean indicating whether the operation was successful
 */
const cloneLocalStateFile = (options, destDir) => {
    const localState = globSync('**/*Local State', {
        ...options,
        maxDepth: 1,
    });
    if (localState.length === 0) {
        consoleLogger.warn('Unable to find local state file in the system.');
        printMessage(['Unable to find local state file in the system.'], messageOptions);
        return false;
    }

    const profileNamesRegex = /([^/\\]+)[/\\]Local State$/;
    let success = true;
    for (const localStateFile of localState) {
        try {
            fs.copyFileSync(localStateFile, path.join(destDir, 'Local State'));
        }
        catch (err) {
            consoleLogger.error(err);
            if (err.code === 'EBUSY') {
                console.log(`Unable to copy the file because it is currently in use.`);
                console.log('Please close any applications that might be using this file and try again.');
            }
            else {
                // The name is only needed for this message; fall back to the full path
                // when the matched file name does not fit the expected layout.
                const profileName = localStateFile.match(profileNamesRegex)?.[1] ?? localStateFile;
                console.log(`An unexpected error occurred for ${profileName} while copying the file: ${err.message}`);
            }
            printMessage([err], messageOptions);
            success = false;
        }
    }
    return success;
};
|
|
1191
|
+
/**
 * Creates (or reuses) the `oobee-<randomToken>` clone directory inside the default
 * Chrome data directory (.../User Data on Windows, .../Chrome on Mac) and copies the
 * Local State file and the profile cookie files into it.
 *
 * An already existing clone directory is assumed to be complete and is returned as is;
 * it is cleaned up at the end of the scan, not here.
 *
 * @param {string} randomToken - random token to append to the cloned directory
 * @returns {string|undefined} the cloned data directory (also returned when copying
 *   some files failed, in which case an error is logged), or undefined when the
 *   default Chrome data directory is not available
 */
export const cloneChromeProfiles = (randomToken) => {
    const baseDir = getDefaultChromeDataDir();
    if (!baseDir) {
        return;
    }
    const destDir = path.join(baseDir, `oobee-${randomToken}`);
    if (fs.existsSync(destDir)) {
        // Assume it was cloned already and don't re-clone.
        return destDir;
    }

    fs.mkdirSync(destDir, { recursive: true });
    const globOptions = {
        cwd: baseDir,
        recursive: true,
        absolute: true,
        nodir: true,
    };
    // Both copies are always attempted, local state first.
    const isLocalStateCloned = cloneLocalStateFile(globOptions, destDir);
    const areCookiesCloned = cloneChromeProfileCookieFiles(globOptions, destDir);
    if (!(areCookiesCloned && isLocalStateCloned)) {
        consoleLogger.error('Failed to clone Chrome profiles. You may be logged out of your accounts.');
    }
    // For future reference, return a null instead to halt the scan
    return destDir;
};
|
|
1230
|
+
/**
 * Ensures an `oobee-<randomToken>` directory exists inside the default
 * Chromium data directory. No files are copied into it here.
 * @param {string} randomToken - random token to append to the directory name
 * @returns {string|undefined} the directory path; undefined when no Chromium
 * data directory is available
 */
export const cloneChromiumProfiles = (randomToken) => {
  const chromiumDataDir = getDefaultChromiumDataDir();
  if (!chromiumDataDir) {
    return;
  }
  const cloneDir = path.join(chromiumDataDir, `oobee-${randomToken}`);
  // An existing directory is kept as-is; it is cleaned up at the end of the scan.
  if (!fs.existsSync(cloneDir)) {
    fs.mkdirSync(cloneDir, { recursive: true });
  }
  return cloneDir;
};
|
|
1247
|
+
/**
 * Prepares an `oobee-<randomToken>` directory inside the default Edge data
 * directory and populates it via cloneLocalStateFile and
 * cloneEdgeProfileCookieFiles.
 * If the directory already exists it is assumed to be a finished clone and is
 * left untouched (cleanup happens at the end of the scan).
 * @param {string} randomToken - random token to append to the cloned directory
 * @returns {string|undefined} the cloned data directory (returned even when a
 * copy step failed, after logging an error); undefined when no Edge data
 * directory is available
 */
export const cloneEdgeProfiles = (randomToken) => {
  const edgeDataDir = getDefaultEdgeDataDir();
  if (!edgeDataDir) {
    return;
  }

  const cloneDir = path.join(edgeDataDir, `oobee-${randomToken}`);
  if (fs.existsSync(cloneDir)) {
    // Already cloned earlier in this scan: don't delete, don't re-clone.
    return cloneDir;
  }

  fs.mkdirSync(cloneDir, { recursive: true });
  const globOptions = {
    cwd: edgeDataDir,
    recursive: true,
    absolute: true,
    nodir: true,
  };
  // Both copy steps always run; Local State first, then the cookie files.
  const localStateCloned = cloneLocalStateFile(globOptions, cloneDir);
  const cookiesCloned = cloneEdgeProfileCookieFiles(globOptions, cloneDir);
  if (!(cookiesCloned && localStateCloned)) {
    consoleLogger.error('Failed to clone Edge profiles. You may be logged out of your accounts.');
  }
  // For future reference, return a null instead to halt the scan
  return cloneDir;
};
|
|
1286
|
+
/**
 * Dispatches to the browser-specific cleanup routine for cloned profiles.
 * Browsers other than Chrome, Edge and Chromium are ignored.
 * @param {string} browser - one of the BrowserTypes values
 * @param {string} randomToken - token identifying the cloned directory
 */
export const deleteClonedProfiles = (browser, randomToken) => {
  switch (browser) {
    case BrowserTypes.CHROME:
      deleteClonedChromeProfiles(randomToken);
      break;
    case BrowserTypes.EDGE:
      deleteClonedEdgeProfiles(randomToken);
      break;
    case BrowserTypes.CHROMIUM:
      deleteClonedChromiumProfiles(randomToken);
      break;
    default:
      break;
  }
};
|
|
1297
|
+
/**
 * Removes cloned oobee directories from the Chrome data directory.
 * @param {string} [randomToken] - when provided, only `oobee-<randomToken>` is
 * targeted; otherwise every `oobee*` match under the data directory is
 * @returns {void}
 */
export const deleteClonedChromeProfiles = (randomToken) => {
  const chromeDataDir = getDefaultChromeDataDir();
  if (!chromeDataDir) {
    return;
  }

  const targets = randomToken
    ? [`${chromeDataDir}/oobee-${randomToken}`]
    : // Find all the oobee directories in the Chrome data directory
      globSync('**/oobee*', {
        cwd: chromeDataDir,
        absolute: true,
      });

  if (!(targets.length > 0)) {
    consoleLogger.warn('Unable to find oobee directory in the Chrome data directory.');
    console.warn('Unable to find oobee directory in the Chrome data directory.');
    return;
  }

  for (const target of targets) {
    if (!fs.existsSync(target)) {
      continue;
    }
    try {
      fs.rmSync(target, { recursive: true });
    } catch (err) {
      // Best effort: log and move on to the next directory.
      consoleLogger.error(`CHROME Unable to delete ${target} folder in the Chrome data directory. ${err}`);
    }
  }
};
|
|
1333
|
+
/**
 * Removes cloned oobee directories from the Edge data directory.
 * Prints a console warning and does nothing when no Edge data directory is found.
 * @param {string} [randomToken] - when provided, only `oobee-<randomToken>` is
 * targeted; otherwise every `oobee*` match under the data directory is
 * @returns {void}
 */
export const deleteClonedEdgeProfiles = (randomToken) => {
  const baseDir = getDefaultEdgeDataDir();
  if (!baseDir) {
    console.warn(`Unable to find Edge data directory in the system.`);
    return;
  }

  let destDir;
  if (randomToken) {
    destDir = [`${baseDir}/oobee-${randomToken}`];
  } else {
    // Find all the oobee directories in the Edge data directory
    destDir = globSync('**/oobee*', {
      cwd: baseDir,
      absolute: true,
    });
  }

  if (destDir.length > 0) {
    destDir.forEach(dir => {
      if (fs.existsSync(dir)) {
        try {
          fs.rmSync(dir, { recursive: true });
        } catch (err) {
          // Best effort: log and continue. The message names the Edge data
          // directory (it previously said "Chrome", a copy-paste slip).
          consoleLogger.error(`EDGE Unable to delete ${dir} folder in the Edge data directory. ${err}`);
        }
      }
    });
  }
};
|
|
1367
|
+
/**
 * Removes cloned oobee directories from the Chromium data directory.
 * @param {string} [randomToken] - when provided, only `oobee-<randomToken>` is
 * targeted; otherwise every `oobee*` match under the data directory is
 * @returns {void}
 */
export const deleteClonedChromiumProfiles = (randomToken) => {
  const chromiumDataDir = getDefaultChromiumDataDir();
  if (!chromiumDataDir) {
    return;
  }

  const targets = randomToken
    ? [`${chromiumDataDir}/oobee-${randomToken}`]
    : // Find all the oobee directories in the Chromium data directory
      globSync('**/oobee*', {
        cwd: chromiumDataDir,
        absolute: true,
      });

  if (!(targets.length > 0)) {
    consoleLogger.warn('Unable to find oobee directory in Chromium support directory');
    console.warn('Unable to find oobee directory in Chromium support directory');
    return;
  }

  for (const target of targets) {
    if (!fs.existsSync(target)) {
      continue;
    }
    try {
      fs.rmSync(target, { recursive: true });
    } catch (err) {
      // Best effort: log and move on to the next directory.
      consoleLogger.error(`CHROMIUM Unable to delete ${target} folder in the Chromium data directory. ${err}`);
    }
  }
};
|
|
1399
|
+
/**
 * Picks the Playwright device descriptor for a scan.
 * Precedence: the 'Mobile' choice or an 'iPhone 11' custom device, then the
 * 'Samsung Galaxy S9+' custom device, then a custom viewport width, then any
 * other custom device (underscores in its name are treated as spaces), and
 * finally the 'Desktop Chrome' default.
 * @param {string} deviceChosen - selected device category
 * @param {string} customDevice - custom device name
 * @param {number} viewportWidth - custom viewport width in pixels
 * @returns {object} device descriptor; the lookup result may be undefined for
 * a custom device name that `devices` does not contain
 */
export const getPlaywrightDeviceDetailsObject = (deviceChosen, customDevice, viewportWidth) => {
  if (deviceChosen === 'Mobile' || customDevice === 'iPhone 11') {
    return devices['iPhone 11'];
  }
  if (customDevice === 'Samsung Galaxy S9+') {
    return devices['Galaxy S9+'];
  }
  if (viewportWidth) {
    // Desktop-like descriptor with only the width customised.
    return {
      viewport: { width: viewportWidth, height: 720 },
      isMobile: false,
      hasTouch: false,
      userAgent: devices['Desktop Chrome'].userAgent,
      deviceScaleFactor: 1,
      defaultBrowserType: 'chromium',
    };
  }
  if (customDevice) {
    return devices[customDevice.replace(/_/g, ' ')];
  }
  return devices['Desktop Chrome']; // default to Desktop Chrome
};
|
|
1422
|
+
/**
 * Produces the label describing which screen/device a scan targets.
 * The first truthy value among the chosen device and the custom device wins;
 * failing that, a custom width yields `CustomWidth_<width>px`, else 'Desktop'.
 * @param {string} deviceChosen - selected device category
 * @param {string} customDevice - custom device name
 * @param {number} viewportWidth - custom viewport width in pixels
 * @returns {string} screen label
 */
export const getScreenToScan = (deviceChosen, customDevice, viewportWidth) => {
  const namedDevice = deviceChosen || customDevice;
  if (namedDevice) {
    return namedDevice;
  }
  return viewportWidth ? `CustomWidth_${viewportWidth}px` : 'Desktop';
};
|
|
1434
|
+
/**
 * Opens `finalUrl` in a temporary persistent browser context so that the form
 * submission encoded in the URL is performed by a real browser.
 * Navigation problems are logged rather than thrown, and the context is
 * always closed once it has been launched.
 * @param {string} browserToRun - browser channel passed to getPlaywrightLaunchOptions
 * @param {string} userDataDirectory - user data directory for the persistent context
 * @param {string} finalUrl - fully built submission URL
 * @returns {Promise<void>}
 */
export const submitFormViaPlaywright = async (browserToRun, userDataDirectory, finalUrl) => {
  const browserContext = await constants.launcher.launchPersistentContext(userDataDirectory, {
    ...getPlaywrightLaunchOptions(browserToRun),
  });
  register(browserContext);
  try {
    // newPage() is inside the try so a failure here still reaches `finally`
    // and the freshly launched context is not leaked.
    const page = await browserContext.newPage();
    await page.goto(finalUrl, {
      timeout: 30000,
      waitUntil: 'commit',
    });
    try {
      await page.waitForLoadState('networkidle', { timeout: 10000 });
    } catch {
      // Not reaching network idle is tolerated; the request was already committed.
      consoleLogger.info('Unable to detect networkidle');
    }
  } catch (error) {
    consoleLogger.error(error);
  } finally {
    await browserContext.close();
  }
};
|
|
1459
|
+
/**
 * Submits scan details to the configured form endpoint as a GET request.
 * If the request times out (ECONNABORTED) and a browser is available, the
 * same URL is opened through Playwright instead. Other request errors are
 * ignored, since the submission is best-effort.
 * @param {string} browserToRun - browser channel used for the Playwright fallback
 * @param {string} userDataDirectory - user data directory for the Playwright fallback
 * @param {string} scannedUrl - URL that was actually scanned
 * @param {string} entryUrl - URL the user entered
 * @param {string} scanType - type of scan performed
 * @param {string} email - submitter email
 * @param {string} name - submitter name
 * @param {string} scanResultsJson - serialized scan results
 * @param {number} numberOfPagesScanned - count of scanned pages
 * @param {number} numberOfRedirectsScanned - count of scanned redirects
 * @param {number} numberOfPagesNotScanned - count of pages not scanned
 * @param {string} metadata - additional metadata
 * @returns {Promise<void>}
 */
export const submitForm = async (browserToRun, userDataDirectory, scannedUrl, entryUrl, scanType, email, name, scanResultsJson, numberOfPagesScanned, numberOfRedirectsScanned, numberOfPagesNotScanned, metadata) => {
  // Legacy code start - Google Sheets submission
  const additionalPageDataJson = JSON.stringify({
    redirectsScanned: numberOfRedirectsScanned,
    pagesNotScanned: numberOfPagesNotScanned,
  });
  // Every value is percent-encoded. Left raw, characters such as `&`, `#`,
  // `+` or spaces inside a URL, e-mail address or name would split or
  // truncate the query string and corrupt the submitted fields.
  let finalUrl = `${formDataFields.formUrl}?` +
    `${formDataFields.entryUrlField}=${encodeURIComponent(entryUrl)}&` +
    `${formDataFields.scanTypeField}=${encodeURIComponent(scanType)}&` +
    `${formDataFields.emailField}=${encodeURIComponent(email)}&` +
    `${formDataFields.nameField}=${encodeURIComponent(name)}&` +
    `${formDataFields.resultsField}=${encodeURIComponent(scanResultsJson)}&` +
    `${formDataFields.numberOfPagesScannedField}=${encodeURIComponent(numberOfPagesScanned)}&` +
    `${formDataFields.additionalPageDataField}=${encodeURIComponent(additionalPageDataJson)}&` +
    `${formDataFields.metadataField}=${encodeURIComponent(metadata)}`;
  if (scannedUrl !== entryUrl) {
    // Only report the redirect target when it differs from what was entered.
    finalUrl += `&${formDataFields.redirectUrlField}=${encodeURIComponent(scannedUrl)}`;
  }
  try {
    await axios.get(finalUrl, { timeout: 2000 });
  } catch (error) {
    if (error.code === 'ECONNABORTED') {
      if (browserToRun || constants.launcher === webkit) {
        await submitFormViaPlaywright(browserToRun, userDataDirectory, finalUrl);
      }
    }
  }
};
|
|
1488
|
+
// Legacy code end - Google Sheets submission
|
|
1489
|
+
/**
 * Launches a short-lived persistent browser context to read the browser's
 * default user agent, rewrites "HeadlessChrome" to "Chrome" in it, and appends
 * the result to the shared launch arguments as a `--user-agent=` flag.
 * @param {string} browser - browser channel passed to getPlaywrightLaunchOptions
 * @param {object} playwrightDeviceDetailsObject - device options merged into the launch options
 * @param {string} userDataDirectory - user data directory, used only in headless mode
 * @returns {Promise<void>}
 */
export async function initModifiedUserAgent(browser, playwrightDeviceDetailsObject, userDataDirectory) {
  const isHeadless = process.env.CRAWLEE_HEADLESS === '1';
  // If headless mode is enabled, ensure the headless flag is set.
  if (isHeadless && !constants.launchOptionsArgs.includes('--headless=new')) {
    constants.launchOptionsArgs.push('--headless=new');
  }
  // `headless: false` is only a default here: the launch options and the device
  // details are spread afterwards and win on any key they define.
  const launchOptions = {
    headless: false,
    ...getPlaywrightLaunchOptions(browser),
    ...playwrightDeviceDetailsObject,
  };
  // The supplied user data directory is used in headless mode; otherwise an
  // empty string is passed instead.
  const effectiveUserDataDirectory = isHeadless ? userDataDirectory : '';
  const browserContext = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, launchOptions);
  register(browserContext);
  let defaultUA;
  try {
    const page = await browserContext.newPage();
    // Retrieve the default user agent.
    defaultUA = await page.evaluate(() => navigator.userAgent);
  } finally {
    // Close the temporary context even when opening the page or reading the
    // user agent fails, so no browser is left running.
    await browserContext.close();
  }
  // Replace "HeadlessChrome" with "Chrome" if present (no-op otherwise).
  const modifiedUA = defaultUA.replace('HeadlessChrome', 'Chrome');
  // Push the modified UA flag into the global launch options.
  constants.launchOptionsArgs.push(`--user-agent=${modifiedUA}`);
}
|
|
1520
|
+
// Proxy information is looked up once, when this module is evaluated, and the
// cached value is what getPlaywrightLaunchOptions passes to proxyInfoToResolution.
const cacheProxyInfo = getProxyInfo();
|
|
1521
|
+
/**
 * Builds the Playwright launch options from the shared launch arguments, the
 * cached proxy information and the CRAWLEE_HEADLESS / OOBEE_SLOWMO environment
 * variables.
 * @param {string} browser browser name ("chrome" or "edge", null for chromium, the default Playwright browser)
 * @returns playwright launch options object. For more details: https://playwright.dev/docs/api/class-browsertype#browser-type-launch
 */
export const getPlaywrightLaunchOptions = (browser) => {
  const proxyResolution = proxyInfoToResolution(cacheProxyInfo);

  // Work on a copy so the shared base args are never mutated here.
  const launchArgs = [...constants.launchOptionsArgs];
  const addArgOnce = flag => {
    if (!launchArgs.includes(flag)) {
      launchArgs.push(flag);
    }
  };

  // Headless is requested through a Chromium flag; the Playwright `headless`
  // option itself stays false and the default `--headless` arg is ignored.
  if (process.env.CRAWLEE_HEADLESS === '1') {
    addArgOnce('--headless=new');
    addArgOnce('--mute-audio');
  }

  // Manual proxies go through Playwright's `proxy` option, PAC scripts through
  // Chromium flags; any other kind adds nothing.
  let manualProxy;
  if (proxyResolution.kind === 'manual') {
    manualProxy = proxyResolution.settings;
  } else if (proxyResolution.kind === 'pac') {
    launchArgs.push(`--proxy-pac-url=${proxyResolution.pacUrl}`);
    if (proxyResolution.bypass) {
      launchArgs.push(`--proxy-bypass-list=${proxyResolution.bypass}`);
    }
  }

  const options = {
    ignoreDefaultArgs: ['--use-mock-keychain', '--headless'],
    args: launchArgs,
    headless: false,
  };
  if (browser) {
    options.channel = browser;
  }
  if (manualProxy) {
    options.proxy = manualProxy;
  }

  // Optional slow motion, enabled only for values of at least 1.
  const slowMoSetting = process.env.OOBEE_SLOWMO;
  if (slowMoSetting && Number(slowMoSetting) >= 1) {
    options.slowMo = Number(slowMoSetting);
    consoleLogger.info(`Enabled browser slowMo with value: ${process.env.OOBEE_SLOWMO}ms`);
  }

  // Edge on Windows should not be headless
  if (browser === BrowserTypes.EDGE && os.platform() === 'win32') {
    options.headless = false;
  }
  return options;
};
|
|
1571
|
+
/**
 * Waits until the page looks loaded, whichever of these happens first:
 * the 'load' state, the 'networkidle' state, a hard timeout, or an in-page
 * check that watches DOM mutations until they settle.
 * @param {object} page - Playwright page
 * @param {number} [timeout=10000] - upper bound in milliseconds, used both for
 * the hard fallback and for the in-page observer
 * @returns {Promise<*>} the value of whichever waiter settled first (undefined
 * for the hard timeout, a short status string for the in-page check)
 */
export const waitForPageLoaded = async (page, timeout = 10000) => {
  const OBSERVER_TIMEOUT = timeout; // Ensure observer timeout does not exceed the main timeout
  let fallbackTimer;
  try {
    return await Promise.race([
      page.waitForLoadState('load'), // Ensure page load completes
      page.waitForLoadState('networkidle'), // Wait for network requests to settle
      // Hard timeout as a fallback
      new Promise(resolve => {
        fallbackTimer = setTimeout(resolve, timeout);
      }),
      // NOTE: this callback is executed inside the page, so it must stay self-contained.
      page.evaluate(OBSERVER_TIMEOUT => {
        return new Promise(resolve => {
          // Skip mutation check for PDFs
          if (document.contentType === 'application/pdf') {
            resolve('Skipping DOM mutation check for PDF.');
            return;
          }
          const root = document.documentElement || document.body;
          if (!(root instanceof Node)) {
            // Not a valid DOM root—treat as loaded
            resolve('No valid root to observe; treating as loaded.');
            return;
          }
          // Two separate timers: the quiet-period timer is restarted on every
          // mutation batch, while the hard-stop timer is never reset, so the
          // observer cannot outlive OBSERVER_TIMEOUT.
          let quietTimer;
          let hardStopTimer;
          let mutationCount = 0;
          const MAX_MUTATIONS = 500;
          const mutationHash = {};
          const finish = message => {
            clearTimeout(quietTimer);
            clearTimeout(hardStopTimer);
            observer.disconnect();
            resolve(message);
          };
          const observer = new MutationObserver(mutationsList => {
            clearTimeout(quietTimer);
            mutationCount++;
            if (mutationCount > MAX_MUTATIONS) {
              finish('Too many mutations detected, exiting.');
              return;
            }
            // Count, per node name and attribute name, how often a mutated
            // element has been seen; repeated hits are treated as a page that
            // mutates continuously and will never settle.
            for (const mutation of mutationsList) {
              if (mutation.target instanceof Element) {
                for (const attr of Array.from(mutation.target.attributes)) {
                  const key = `${mutation.target.nodeName}-${attr.name}`;
                  mutationHash[key] = (mutationHash[key] || 0) + 1;
                  if (mutationHash[key] >= 10) {
                    finish(`Repeated mutation detected for ${key}, exiting.`);
                    return;
                  }
                }
              }
            }
            // No further mutations for one second means the DOM has stabilized.
            quietTimer = setTimeout(() => {
              finish('DOM stabilized after mutations.');
            }, 1000);
          });
          // Final timeout to avoid infinite waiting
          hardStopTimer = setTimeout(() => {
            finish('Observer timeout reached, exiting.');
          }, OBSERVER_TIMEOUT);
          // Only observe if root is a Node
          observer.observe(root, {
            childList: true,
            subtree: true,
            attributes: true,
          });
        });
      }, OBSERVER_TIMEOUT), // Pass OBSERVER_TIMEOUT dynamically to the browser context
    ]);
  } finally {
    // Don't leave the fallback timer pending once the race has settled.
    clearTimeout(fallbackTimer);
  }
};
|
|
1635
|
+
/**
 * Tells whether a string starts with `http://` or `https://` and is followed
 * by at least one character, with no space or double quote anywhere after the scheme.
 * @param {string} urlString - candidate URL
 * @returns {boolean} true when the string matches
 */
function isValidHttpUrl(urlString) {
  return /^(http|https):\/\/[^ "]+$/.test(urlString);
}
|
|
1639
|
+
/**
 * Heuristically decides whether a string is a local file path rather than a URL.
 * A value counts as a path when it starts with `/`, starts with a drive letter
 * and colon, contains a backslash, or starts with `./`, `../`, `.\` or `..\`.
 * @param {string} url - value to inspect
 * @returns {boolean} true when the value looks like a file path
 */
export const isFilePath = (url) => {
  if (url.startsWith('/')) {
    return true;
  }
  const hasDriveLetter = /^[A-Z]:/i.test(url);
  const hasBackslash = /\\/.test(url);
  if (hasDriveLetter || hasBackslash) {
    return true;
  }
  const relativePrefixes = ['./', '../', '.\\', '..\\'];
  return relativePrefixes.some(prefix => url.startsWith(prefix));
};
|
|
1650
|
+
/**
 * Converts a `file://` URL into a file system path via fileURLToPath; any
 * other value is returned unchanged.
 * @param {string} url - `file://` URL or any other string
 * @returns {string} file system path, or the input as given
 */
export function convertLocalFileToPath(url) {
  return url.startsWith('file://') ? fileURLToPath(url) : url;
}
|
|
1656
|
+
/**
 * Converts a path that starts with `/` into a `file://` URL string via
 * pathToFileURL; any other value is returned unchanged.
 * @param {string} filePath - path or any other string
 * @returns {string} `file://` URL string, or the input as given
 */
export function convertPathToLocalFile(filePath) {
  return filePath.startsWith('/') ? pathToFileURL(filePath).toString() : filePath;
}
|
|
1662
|
+
/**
 * Extracts the percent-decoded path portion of a file URL.
 * The URL is parsed with `url.parse` and its `path` component is decoded,
 * which drops the `file://` prefix.
 * @param {string} fileUrl - file URL to convert
 * @returns {string} decoded path
 */
export function convertToFilePath(fileUrl) {
  const { path: encodedPath } = url.parse(fileUrl);
  return decodeURIComponent(encodedPath);
}
|