@govtechsg/oobee 0.10.76 → 0.10.78-alpha1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. package/.github/workflows/publish.yml +8 -1
  2. package/INTEGRATION.md +50 -3
  3. package/dist/cli.js +252 -0
  4. package/dist/combine.js +221 -0
  5. package/dist/constants/cliFunctions.js +306 -0
  6. package/dist/constants/common.js +1669 -0
  7. package/dist/constants/constants.js +913 -0
  8. package/dist/constants/errorMeta.json +319 -0
  9. package/dist/constants/itemTypeDescription.js +7 -0
  10. package/dist/constants/oobeeAi.js +121 -0
  11. package/dist/constants/questions.js +151 -0
  12. package/dist/constants/sampleData.js +176 -0
  13. package/dist/crawlers/commonCrawlerFunc.js +428 -0
  14. package/dist/crawlers/crawlDomain.js +613 -0
  15. package/dist/crawlers/crawlIntelligentSitemap.js +135 -0
  16. package/dist/crawlers/crawlLocalFile.js +151 -0
  17. package/dist/crawlers/crawlSitemap.js +303 -0
  18. package/dist/crawlers/custom/escapeCssSelector.js +10 -0
  19. package/dist/crawlers/custom/evaluateAltText.js +11 -0
  20. package/dist/crawlers/custom/extractAndGradeText.js +44 -0
  21. package/dist/crawlers/custom/extractText.js +27 -0
  22. package/dist/crawlers/custom/findElementByCssSelector.js +36 -0
  23. package/dist/crawlers/custom/flagUnlabelledClickableElements.js +963 -0
  24. package/dist/crawlers/custom/framesCheck.js +37 -0
  25. package/dist/crawlers/custom/getAxeConfiguration.js +111 -0
  26. package/dist/crawlers/custom/gradeReadability.js +23 -0
  27. package/dist/crawlers/custom/utils.js +1024 -0
  28. package/dist/crawlers/custom/xPathToCss.js +147 -0
  29. package/dist/crawlers/guards/urlGuard.js +71 -0
  30. package/dist/crawlers/pdfScanFunc.js +276 -0
  31. package/dist/crawlers/runCustom.js +89 -0
  32. package/dist/exclusions.txt +7 -0
  33. package/dist/generateHtmlReport.js +144 -0
  34. package/dist/index.js +62 -0
  35. package/dist/logs.js +84 -0
  36. package/dist/mergeAxeResults.js +1588 -0
  37. package/dist/npmIndex.js +640 -0
  38. package/dist/proxyService.js +360 -0
  39. package/dist/runGenerateJustHtmlReport.js +16 -0
  40. package/dist/screenshotFunc/htmlScreenshotFunc.js +355 -0
  41. package/dist/screenshotFunc/pdfScreenshotFunc.js +645 -0
  42. package/dist/services/s3Uploader.js +127 -0
  43. package/dist/static/ejs/partials/components/allIssues/AllIssues.ejs +9 -0
  44. package/dist/static/ejs/partials/components/allIssues/CategoryBadges.ejs +82 -0
  45. package/dist/static/ejs/partials/components/allIssues/FilterBar.ejs +33 -0
  46. package/dist/static/ejs/partials/components/allIssues/IssuesTable.ejs +41 -0
  47. package/dist/static/ejs/partials/components/header/SiteInfo.ejs +119 -0
  48. package/dist/static/ejs/partials/components/header/aboutScanModal/AboutScanModal.ejs +15 -0
  49. package/dist/static/ejs/partials/components/header/aboutScanModal/ScanConfiguration.ejs +44 -0
  50. package/dist/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs +142 -0
  51. package/dist/static/ejs/partials/components/prioritiseIssues/IssueDetailCard.ejs +36 -0
  52. package/dist/static/ejs/partials/components/prioritiseIssues/PrioritiseIssues.ejs +47 -0
  53. package/dist/static/ejs/partials/components/ruleModal/ruleOffcanvas.ejs +196 -0
  54. package/dist/static/ejs/partials/components/scannedPagesSegmentedTabs.ejs +48 -0
  55. package/dist/static/ejs/partials/components/screenshotLightbox.ejs +13 -0
  56. package/dist/static/ejs/partials/components/shared/InfoAlert.ejs +3 -0
  57. package/dist/static/ejs/partials/components/summaryScanAbout.ejs +141 -0
  58. package/dist/static/ejs/partials/components/summaryScanResults.ejs +16 -0
  59. package/dist/static/ejs/partials/components/summaryTable.ejs +20 -0
  60. package/dist/static/ejs/partials/components/summaryWcagCompliance.ejs +94 -0
  61. package/dist/static/ejs/partials/components/topTen.ejs +6 -0
  62. package/dist/static/ejs/partials/components/wcagCompliance/FailedCriteria.ejs +47 -0
  63. package/dist/static/ejs/partials/components/wcagCompliance/WcagCompliance.ejs +16 -0
  64. package/dist/static/ejs/partials/components/wcagCompliance/WcagGaugeBar.ejs +16 -0
  65. package/dist/static/ejs/partials/components/wcagCoverageDetails.ejs +18 -0
  66. package/dist/static/ejs/partials/footer.ejs +24 -0
  67. package/dist/static/ejs/partials/header.ejs +14 -0
  68. package/dist/static/ejs/partials/main.ejs +29 -0
  69. package/dist/static/ejs/partials/scripts/allIssues/AllIssues.ejs +376 -0
  70. package/dist/static/ejs/partials/scripts/bootstrap.ejs +8 -0
  71. package/dist/static/ejs/partials/scripts/categorySummary.ejs +141 -0
  72. package/dist/static/ejs/partials/scripts/decodeUnzipParse.ejs +3 -0
  73. package/dist/static/ejs/partials/scripts/header/SiteInfo.ejs +44 -0
  74. package/dist/static/ejs/partials/scripts/header/aboutScanModal/AboutScanModal.ejs +51 -0
  75. package/dist/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +127 -0
  76. package/dist/static/ejs/partials/scripts/header/aboutScanModal/ScanDetails.ejs +60 -0
  77. package/dist/static/ejs/partials/scripts/highlightjs.ejs +335 -0
  78. package/dist/static/ejs/partials/scripts/popper.ejs +7 -0
  79. package/dist/static/ejs/partials/scripts/prioritiseIssues/IssueDetailCard.ejs +137 -0
  80. package/dist/static/ejs/partials/scripts/prioritiseIssues/PrioritiseIssues.ejs +214 -0
  81. package/dist/static/ejs/partials/scripts/prioritiseIssues/wcagSvgMap.ejs +861 -0
  82. package/dist/static/ejs/partials/scripts/ruleModal/constants.ejs +957 -0
  83. package/dist/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +353 -0
  84. package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +468 -0
  85. package/dist/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +306 -0
  86. package/dist/static/ejs/partials/scripts/ruleModal/utilities.ejs +483 -0
  87. package/dist/static/ejs/partials/scripts/scannedPagesSegmentedTabs.ejs +35 -0
  88. package/dist/static/ejs/partials/scripts/screenshotLightbox.ejs +75 -0
  89. package/dist/static/ejs/partials/scripts/summaryScanResults.ejs +14 -0
  90. package/dist/static/ejs/partials/scripts/summaryTable.ejs +78 -0
  91. package/dist/static/ejs/partials/scripts/topTen.ejs +61 -0
  92. package/dist/static/ejs/partials/scripts/utils.ejs +453 -0
  93. package/dist/static/ejs/partials/scripts/wcagCompliance/FailedCriteria.ejs +103 -0
  94. package/dist/static/ejs/partials/scripts/wcagCompliance/WcagGaugeBar.ejs +47 -0
  95. package/dist/static/ejs/partials/scripts/wcagCompliance.ejs +15 -0
  96. package/dist/static/ejs/partials/scripts/wcagCoverageDetails.ejs +75 -0
  97. package/dist/static/ejs/partials/styles/allIssues/AllIssues.ejs +384 -0
  98. package/dist/static/ejs/partials/styles/bootstrap.ejs +12391 -0
  99. package/dist/static/ejs/partials/styles/header/SiteInfo.ejs +121 -0
  100. package/dist/static/ejs/partials/styles/header/aboutScanModal/AboutScanModal.ejs +82 -0
  101. package/dist/static/ejs/partials/styles/header/aboutScanModal/ScanConfiguration.ejs +50 -0
  102. package/dist/static/ejs/partials/styles/header/aboutScanModal/ScanDetails.ejs +149 -0
  103. package/dist/static/ejs/partials/styles/header.ejs +7 -0
  104. package/dist/static/ejs/partials/styles/highlightjs.ejs +54 -0
  105. package/dist/static/ejs/partials/styles/prioritiseIssues/IssueDetailCard.ejs +141 -0
  106. package/dist/static/ejs/partials/styles/prioritiseIssues/PrioritiseIssues.ejs +204 -0
  107. package/dist/static/ejs/partials/styles/ruleModal/ruleOffcanvas.ejs +456 -0
  108. package/dist/static/ejs/partials/styles/scannedPagesSegmentedTabs.ejs +46 -0
  109. package/dist/static/ejs/partials/styles/shared/InfoAlert.ejs +12 -0
  110. package/dist/static/ejs/partials/styles/styles.ejs +1607 -0
  111. package/dist/static/ejs/partials/styles/summaryBootstrap.ejs +12458 -0
  112. package/dist/static/ejs/partials/styles/topTenCard.ejs +44 -0
  113. package/dist/static/ejs/partials/styles/wcagCompliance/FailedCriteria.ejs +59 -0
  114. package/dist/static/ejs/partials/styles/wcagCompliance/WcagGaugeBar.ejs +62 -0
  115. package/dist/static/ejs/partials/styles/wcagCompliance.ejs +36 -0
  116. package/dist/static/ejs/partials/styles/wcagCoverageDetails.ejs +33 -0
  117. package/dist/static/ejs/partials/summaryHeader.ejs +70 -0
  118. package/dist/static/ejs/partials/summaryMain.ejs +49 -0
  119. package/dist/static/ejs/report.ejs +226 -0
  120. package/dist/static/ejs/summary.ejs +47 -0
  121. package/dist/types/types.js +1 -0
  122. package/dist/utils.js +1070 -0
  123. package/examples/oobee-cypress-integration-js/cypress/support/e2e.js +36 -6
  124. package/examples/oobee-cypress-integration-js/cypress.config.js +45 -1
  125. package/examples/oobee-cypress-integration-ts/cypress.config.ts +47 -1
  126. package/examples/oobee-cypress-integration-ts/src/cypress/support/e2e.ts +36 -6
  127. package/examples/oobee-playwright-integration-js/oobee-playwright-demo.js +2 -1
  128. package/examples/oobee-playwright-integration-ts/src/oobee-playwright-demo.ts +2 -1
  129. package/examples/oobee-scan-html-demo.js +51 -0
  130. package/examples/oobee-scan-page-demo.js +40 -0
  131. package/package.json +9 -3
  132. package/src/constants/common.ts +2 -2
  133. package/src/constants/constants.ts +3 -1
  134. package/src/crawlers/crawlDomain.ts +1 -0
  135. package/src/crawlers/runCustom.ts +0 -1
  136. package/src/mergeAxeResults.ts +43 -22
  137. package/src/npmIndex.ts +500 -131
@@ -0,0 +1,1669 @@
1
+ /* eslint-disable consistent-return */
2
+ /* eslint-disable no-console */
3
+ /* eslint-disable camelcase */
4
+ /* eslint-disable no-use-before-define */
5
+ import validator from 'validator';
6
+ import axios from 'axios';
7
+ import { JSDOM } from 'jsdom';
8
+ import * as cheerio from 'cheerio';
9
+ import crawlee, { EnqueueStrategy, Request } from 'crawlee';
10
+ import { parseString } from 'xml2js';
11
+ import fs from 'fs';
12
+ import path from 'path';
13
+ import url, { fileURLToPath, pathToFileURL } from 'url';
14
+ import safe from 'safe-regex';
15
+ import * as https from 'https';
16
+ import os from 'os';
17
+ import mime from 'mime';
18
+ import { minimatch } from 'minimatch';
19
+ import { globSync } from 'glob';
20
+ import { devices, webkit } from 'playwright';
21
+ import printMessage from 'print-message';
22
+ import constants, { getDefaultChromeDataDir, getDefaultEdgeDataDir, getDefaultChromiumDataDir,
23
+ // Legacy code start - Google Sheets submission
24
+ formDataFields,
25
+ // Legacy code end - Google Sheets submission
26
+ ScannerTypes, BrowserTypes, FileTypes, getEnumKey, } from './constants.js';
27
+ import { consoleLogger } from '../logs.js';
28
+ import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
29
+ import { cleanUpAndExit, randomThreeDigitNumberString, register } from '../utils.js';
30
+ import { getProxyInfo, proxyInfoToResolution } from '../proxyService.js';
31
/**
 * Validates that `dirPath` points at an existing, accessible directory.
 *
 * @param {string} dirPath - Directory path to validate.
 * @returns {string|null} A user-facing error message, or null when there is no error.
 */
export const validateDirPath = (dirPath) => {
  if (typeof dirPath !== 'string') {
    return 'Please provide string value of directory path.';
  }
  let stats;
  try {
    fs.accessSync(dirPath);
    stats = fs.statSync(dirPath);
  } catch {
    // Any access/stat failure is reported as a missing path.
    return 'Please ensure path provided exists.';
  }
  return stats.isDirectory() ? null : 'Please provide a directory path.';
};
48
/**
 * Plain result container. When constructed with a truthy `res`, its own
 * enumerable properties are copied onto the new instance.
 */
export class RES {
  constructor(res) {
    if (!res) {
      return;
    }
    Object.assign(this, res);
  }
}
55
/**
 * Validates a custom flow label against the forbidden directory characters,
 * the 80 character limit and the reserved file name keywords held in `constants`.
 *
 * @param {string} customFlowLabel - Label supplied by the user.
 * @returns {{isValid: boolean, errorMessage?: string}} Validation outcome.
 */
export const validateCustomFlowLabel = (customFlowLabel) => {
  const formatList = (items) => items.toString().replaceAll(',', ' , ');
  const reject = (errorMessage) => ({ isValid: false, errorMessage });

  // All three checks are evaluated up front, in this order.
  const hasReservedKeywordWithDot = constants.reserveFileNameKeywords.some((keyword) =>
    customFlowLabel.toLowerCase().includes(`${keyword.toLowerCase()}.`),
  );
  const hasForbiddenCharacter = constants.forbiddenCharactersInDirPath.some((character) =>
    customFlowLabel.includes(character),
  );
  const isTooLong = customFlowLabel.length > 80;

  if (hasForbiddenCharacter) {
    return reject(
      `Invalid label. Cannot contain ${formatList(constants.forbiddenCharactersInDirPath)}`,
    );
  }
  if (isTooLong) {
    return reject(`Invalid label. Cannot exceed 80 characters.`);
  }
  if (hasReservedKeywordWithDot) {
    return reject(
      `Invalid label. Cannot have '.' appended to ${formatList(constants.reserveFileNameKeywords)} as they are reserved keywords.`,
    );
  }
  return { isValid: true };
};
82
/**
 * Validates that `filePath` points at an existing `.txt` file.
 *
 * Relative paths are resolved against `cliDir`.
 *
 * @param {string} filePath - Absolute or relative file path.
 * @param {string} cliDir - Base directory used to resolve relative paths.
 * @returns {string} The absolute path of the validated file.
 * @throws {Error} When the path is not a string, cannot be accessed, is not a
 *   regular file, or does not have a `.txt` extension.
 */
export const validateFilePath = (filePath, cliDir) => {
  if (typeof filePath !== 'string') {
    throw new Error('Please provide string value of file path.');
  }
  const absolutePath = path.isAbsolute(filePath) ? filePath : path.resolve(cliDir, filePath);

  let stats;
  try {
    fs.accessSync(absolutePath);
    stats = fs.statSync(absolutePath);
  } catch (err) {
    throw new Error(`Please ensure path provided exists and writable: ${absolutePath}`, {
      cause: err,
    });
  }

  // These checks live outside the try block so their specific messages are
  // not replaced by the generic "exists and writable" error above.
  if (!stats.isFile()) {
    throw new Error('Please provide a file path.');
  }
  if (path.extname(absolutePath) !== '.txt') {
    throw new Error('Please provide a file with txt extension.');
  }
  return absolutePath;
};
103
/**
 * Loads exclusion patterns from a file, one pattern per line.
 *
 * Uses `blacklistedPatternsFilename` when given, otherwise falls back to
 * `exclusions.txt` in the current working directory if it exists.
 *
 * @param {string} [blacklistedPatternsFilename] - Path of the patterns file.
 * @returns {string[]|null} Trimmed, non-empty patterns, or null when no file applies.
 * @throws {Error} When any pattern is rejected by the `safe` regex check.
 */
export const getBlackListedPatterns = (blacklistedPatternsFilename) => {
  const defaultExclusionsFile = 'exclusions.txt';
  let exclusionsFile = blacklistedPatternsFilename;
  if (!exclusionsFile) {
    exclusionsFile = fs.existsSync(defaultExclusionsFile) ? defaultExclusionsFile : null;
  }
  if (!exclusionsFile) {
    return null;
  }

  const blacklistedPatterns = [];
  for (const line of fs.readFileSync(exclusionsFile).toString().split('\n')) {
    const pattern = line.trim();
    if (pattern !== '') {
      blacklistedPatterns.push(pattern);
    }
  }

  const unsafe = blacklistedPatterns.filter((pattern) => !safe(pattern));
  if (unsafe.length > 0) {
    throw new Error(`Unsafe expressions detected: ${unsafe} Please revise ${exclusionsFile}`);
  }
  return blacklistedPatterns;
};
126
/**
 * Tells whether the text after the last '.' in `url` is one of the
 * blacklisted extensions. When `url` has no '.', the whole string is compared.
 *
 * @param {string} url - URL or path to inspect.
 * @param {string[]} blacklistedFileExtensions - Extensions without the leading dot.
 * @returns {boolean} True when the trailing segment is blacklisted.
 */
export const isBlacklistedFileExtensions = (url, blacklistedFileExtensions) => {
  const trailingSegment = url.slice(url.lastIndexOf('.') + 1);
  return blacklistedFileExtensions.includes(trailingSegment);
};
130
// NOTE(review): despite its name this holds the jsdom *window* object.
const { window: document } = new JSDOM('');

// HTTPS agent with certificate verification disabled and keep-alive enabled.
const httpsAgent = new https.Agent({
  // Run in environments with custom certificates
  rejectUnauthorized: false,
  keepAlive: true,
});

// Layout options handed to printMessage.
export const messageOptions = {
  border: false,
  marginTop: 2,
  marginBottom: 2,
};

// Options handed to validator.isURL when checking user-supplied URLs.
const urlOptions = {
  // http and https for normal scans, file for local file scan
  protocols: ['http', 'https', 'file'],
  require_protocol: true,
  require_tld: false,
  require_host: false,
  // being explicit; fragments/queries are fine for local files
  allow_fragments: true,
  allow_query_components: true,
};
151
/**
 * Runs `querySelector(s)` on an empty document fragment; throws when `s` is
 * not a syntactically valid selector.
 *
 * The module-level `document` binding is a jsdom Window, which does not expose
 * `createDocumentFragment`; that method lives on `window.document`. Fall back
 * to `document` itself in case the binding is ever a real Document.
 */
const queryCheck = (s) => {
  const ownerDocument = document.document ?? document;
  return ownerDocument.createDocumentFragment().querySelector(s);
};

/**
 * Tells whether `selector` can be parsed as a CSS selector.
 *
 * @param {string} selector - Selector text to check.
 * @returns {boolean} True when querying with the selector does not throw.
 */
export const isSelectorValid = (selector) => {
  try {
    queryCheck(selector);
  } catch {
    return false;
  }
  return true;
};

// Don't sanitise for now as we have changed the logic for URL validation / local file scan
// Only use this when we find characters to validate against
const blackListCharacters = '';
164
/**
 * Attempts to parse `content` as XML via `parseString`.
 *
 * @param {string} content - Text to parse.
 * @returns {{isValid: boolean|undefined, parsedContent: *}} `isValid` reflects
 *   whether the parser produced a result; `parsedContent` holds that result.
 *   Both stay undefined if the parser never invokes the callback before returning.
 */
export const validateXML = (content) => {
  const outcome = { isValid: undefined, parsedContent: undefined };
  parseString(content, (_err, result) => {
    outcome.isValid = Boolean(result);
    if (outcome.isValid) {
      outcome.parsedContent = result;
    }
  });
  return { isValid: outcome.isValid, parsedContent: outcome.parsedContent };
};
178
/**
 * Tells whether `pageUrl` matches any whitelisted entry.
 *
 * Each entry has CR/LF characters stripped. It matches when it starts with
 * `http` and equals `pageUrl` exactly, or when it matches `pageUrl` as a
 * regular expression.
 *
 * @param {string} pageUrl - URL being considered.
 * @param {string[]} whitelistedDomains - Exact URLs or regex sources.
 * @returns {boolean} True when at least one entry matches.
 * @throws {SyntaxError} When an entry that gets evaluated is not a valid regex.
 */
export const isSkippedUrl = (pageUrl, whitelistedDomains) =>
  // `some` stops at the first match instead of evaluating every entry.
  whitelistedDomains.some((entry) => {
    const pattern = entry.replace(/[\n\r]+/g, '');
    // A blank entry would compile to a regex that matches every URL.
    if (pattern === '') {
      return false;
    }
    // is url
    if (pattern.startsWith('http') && pattern === pageUrl) {
      return true;
    }
    // is regex (default)
    return new RegExp(pattern).test(pageUrl);
  });
190
/**
 * Resolves a sitemap location given as a path or `file:///` URL to a local
 * file path.
 *
 * @param {string} filePath - Local path or `file:///` URL.
 * @returns {string|null} The converted file path when the file exists, otherwise null.
 */
export const getFileSitemap = (filePath) => {
  let candidate = filePath;
  if (candidate.startsWith('file:///')) {
    // Drop the scheme plus any query/fragment; Windows keeps the drive letter.
    const pathExtractor =
      os.platform() === 'win32' ? /^file:\/\/\/([A-Z]:\/[^?#]+)/ : /^file:\/\/(\/[^?#]+)/;
    candidate = candidate.match(pathExtractor)?.[1];
  }
  const resolvedPath = convertToFilePath(candidate);
  if (!fs.existsSync(resolvedPath)) {
    return null;
  }
  const file = fs.readFileSync(resolvedPath, 'utf8');
  const isLocalFileScan = isSitemapContent(file);
  // NOTE(review): `file` is always a string here, so this condition is always
  // true regardless of the sitemap check — confirm whether that is intended.
  if (isLocalFileScan || file !== undefined) {
    return resolvedPath;
  }
  return null;
};
207
/**
 * Returns the input prompt text for the given scanner type.
 *
 * @param {*} scanner - One of the `ScannerTypes` values.
 * @returns {string} Prompt text, or 'Invalid option' for an unknown scanner.
 */
export const getUrlMessage = (scanner) => {
  if (
    scanner === ScannerTypes.WEBSITE ||
    scanner === ScannerTypes.CUSTOM ||
    scanner === ScannerTypes.INTELLIGENT
  ) {
    return 'Please enter URL of website: ';
  }
  if (scanner === ScannerTypes.SITEMAP) {
    return 'Please enter URL or file path to sitemap, or drag and drop a sitemap file here: ';
  }
  if (scanner === ScannerTypes.LOCALFILE) {
    return 'Please enter file path: ';
  }
  return 'Invalid option';
};
221
/**
 * Tells whether `inputString` is non-empty and, after `validator.escape`,
 * consists of ASCII characters only.
 *
 * @param {string} inputString - Raw user input.
 * @returns {boolean} True when the input passes both checks.
 */
export const isInputValid = (inputString) => {
  if (validator.isEmpty(inputString)) {
    return false;
  }
  const escapedInput = validator.escape(inputString);
  return validator.isAscii(escapedInput) ? true : false;
};
230
/**
 * Strips blacklisted characters from `url` and reports whether the input is
 * acceptable: either it starts with `file://` (case-insensitive) or the
 * stripped value passes `validator.isURL` with the module's URL options.
 *
 * @param {string} url - Raw URL input.
 * @returns {{isValid: boolean, url: string}} Validity flag and the stripped URL.
 */
export const sanitizeUrlInput = (url) => {
  // Sanitize that there is no blacklist characters
  const sanitizeUrl = validator.blacklist(url, blackListCharacters);
  const isValid = Boolean(
    url.toLowerCase().startsWith('file://') || validator.isURL(sanitizeUrl, urlOptions),
  );
  return { isValid, url: sanitizeUrl };
};
238
/**
 * Tells whether a Content-Type value denotes a supported document type
 * (HTML, XHTML, plain text, XML or PDF). Matching is case-insensitive and
 * prefix-based, so parameters such as `; charset=utf-8` are tolerated.
 *
 * @param {string} [ct] - Content-Type value; falsy values are treated as empty.
 * @returns {boolean} True when the type is supported.
 */
const isAllowedContentType = (ct) => {
  const allowedPrefixes = [
    'text/html', // html
    'application/xhtml+xml', // xhtml
    'text/plain', // txt
    'application/xml', // xml
    'text/xml', // xml (alt)
    'application/pdf', // pdf
  ];
  const normalised = (ct || '').toLowerCase();
  return allowedPrefixes.some((prefix) => normalised.startsWith(prefix));
};
248
/**
 * Maps a navigation error message to a `constants.urlCheckStatuses` code.
 * Returns undefined when the message matches none of the known network errors.
 */
const getStatusForNavigationError = (message) => {
  const { urlCheckStatuses } = constants;
  const knownErrors = [
    ['net::ERR_INVALID_AUTH_CREDENTIALS', urlCheckStatuses.unauthorised],
    ['net::ERR_NAME_NOT_RESOLVED', urlCheckStatuses.cannotBeResolved],
    ['net::ERR_CONNECTION_REFUSED', urlCheckStatuses.connectionRefused],
    ['net::ERR_TIMED_OUT', urlCheckStatuses.timedOut],
    ['net::ERR_SSL_PROTOCOL_ERROR', urlCheckStatuses.sslProtocolError],
  ];
  const match = knownErrors.find(([needle]) => message.includes(needle));
  return match ? match[1].code : undefined;
};

/**
 * Checks that `url` can be opened, using a real browser for http(s) URLs and
 * the file system for other protocols.
 *
 * Local PDF files short-circuit without launching a browser. Everything else
 * is navigated to in a persistent browser context with images, media, fonts
 * and stylesheets blocked.
 *
 * @param {string} url - URL to check.
 * @param {*} browserToRun - Browser identifier passed to the launch helpers.
 * @param {string} clonedDataDir - User data directory for the persistent context.
 * @param {object} playwrightDeviceDetailsObject - Device options spread into the launch options.
 * @param {object} [extraHTTPHeaders] - Extra request headers. When provided, a
 *   default `Accept` header is added to this same object if it has none.
 * @returns {Promise<RES>} Result carrying `status` and, depending on the path
 *   taken, `httpStatus`, `url` and `content`.
 */
const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir, playwrightDeviceDetailsObject, extraHTTPHeaders) => {
  const res = new RES();
  const data = sanitizeUrlInput(url);
  if (!data.isValid) {
    res.status = constants.urlCheckStatuses.invalidUrl.code;
    return res;
  }

  // STEP 1: For local file scans
  let contentType = '';
  const { protocol } = new URL(url);
  if (protocol !== 'http:' && protocol !== 'https:') {
    try {
      const filePath = fileURLToPath(url);
      if (!fs.statSync(filePath).isFile()) {
        res.status = constants.urlCheckStatuses.notALocalFile.code;
        return res;
      }
      contentType = mime.getType(filePath) || 'application/octet-stream';
      if (!isAllowedContentType(contentType)) {
        res.status = constants.urlCheckStatuses.notASupportedDocument.code;
        return res;
      }
      // Short-circuit for pdfs
      if (contentType.includes('pdf')) {
        res.status = constants.urlCheckStatuses.success.code;
        res.httpStatus = 200;
        res.url = url;
        res.content = '%PDF-'; // Avoid putting the binary in memory
        return res;
      }
    } catch (e) {
      consoleLogger.info(`Local file check failed: ${e.message}`);
      res.status = constants.urlCheckStatuses.systemError.code;
      return res;
    }
  }

  // Ensure Accept header for non-html content fallback. The caller's object is
  // updated in place when one is supplied; a fresh object is used otherwise so
  // that a missing argument no longer throws.
  const requestHeaders = extraHTTPHeaders ?? {};
  requestHeaders.Accept ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';

  await initModifiedUserAgent(browserToRun, playwrightDeviceDetailsObject, clonedDataDir);

  let browserContext;
  try {
    browserContext = await constants.launcher.launchPersistentContext(clonedDataDir, {
      extraHTTPHeaders: requestHeaders,
      ignoreHTTPSErrors: true,
      headless: true,
      ...getPlaywrightLaunchOptions(browserToRun),
      ...playwrightDeviceDetailsObject,
      ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
    });
    register(browserContext);
  } catch (err) {
    printMessage([`Unable to launch browser\n${err}`], messageOptions);
    res.status = constants.urlCheckStatuses.browserError.code;
    return res;
  }

  // Set when the navigation turns into a file download.
  let downloadStarted = false;
  try {
    const page = await browserContext.newPage();

    // Block native Chrome download UI
    try {
      const cdp = await browserContext.newCDPSession(page);
      await cdp.send('Page.setDownloadBehavior', { behavior: 'deny' });
    } catch (e) {
      consoleLogger.info(`Unable to set download deny: ${e.message}`);
    }

    // OPTIMIZATION: Block heavy visual resources (Images/Fonts/CSS)
    // This allows the "Connectivity Check" to pass as soon as HTML is ready
    await page.route('**/*', (route) => {
      const type = route.request().resourceType();
      if (['image', 'media', 'font', 'stylesheet'].includes(type)) {
        return route.abort();
      }
      return route.continue();
    });

    // STEP 2: Navigate (follows server-side redirects)
    page.once('download', () => {
      downloadStarted = true;
      res.status = constants.urlCheckStatuses.notASupportedDocument.code;
    });

    // OPTIMIZATION: Wait for 'domcontentloaded' only
    const response = await page.goto(url, {
      timeout: 15000,
      waitUntil: 'domcontentloaded', // enough to get status + allow potential client redirects to kick in
    });
    if (!response) {
      throw new Error('No response from navigation');
    }

    // We use the response headers from the navigation we just performed.
    const finalUrl = page.url();
    const finalStatus = response.status();
    contentType = response.headers()['content-type'] || '';
    if (!isAllowedContentType(contentType)) {
      res.status = constants.urlCheckStatuses.notASupportedDocument.code;
      return res;
    }
    res.httpStatus = finalStatus;
    res.url = finalUrl;

    if (finalStatus === 401) {
      res.status = constants.urlCheckStatuses.unauthorised.code;
    } else if (finalStatus >= 200 && finalStatus < 400) {
      res.status = constants.urlCheckStatuses.success.code;
    } else if (finalStatus === 405 || finalStatus === 501) {
      // Some origins 405/501 but the browser-rendered page is still reachable after client redirects.
      // As a last resort, consider DOM presence as success if we actually have a document.
      const hasDOM = await page.evaluate(() => !!document && !!document.documentElement);
      res.status = hasDOM
        ? constants.urlCheckStatuses.success.code
        : constants.urlCheckStatuses.systemError.code;
    } else {
      res.status = constants.urlCheckStatuses.systemError.code;
    }

    // Content handling
    if (contentType.includes('pdf') || contentType.includes('octet-stream')) {
      res.content = '%PDF-'; // avoid binary in memory / download
    } else {
      try {
        // Try to get a stable DOM; don't fail the check if it times out
        // Note: Since we used 'domcontentloaded' in goto, this is fast, but kept for safety/stability
        await page.waitForLoadState('domcontentloaded', { timeout: 5000 });
      } catch {
        // Deliberately ignored: a timeout here must not fail the check.
      }
      res.content = await page.content();
    }
  } catch (error) {
    const message = String(error?.message ?? error);
    const knownStatus = getStatusForNavigationError(message);
    if (downloadStarted || message.includes('Download is starting')) {
      // The navigation failed because it became a download; keep reporting an
      // unsupported document instead of overwriting it with a system error.
      res.status = constants.urlCheckStatuses.notASupportedDocument.code;
    } else if (knownStatus !== undefined) {
      res.status = knownStatus;
    } else {
      consoleLogger.error(error);
      res.status = constants.urlCheckStatuses.systemError.code;
    }
  } finally {
    await browserContext.close();
  }
  return res;
};
406
/**
 * Tells whether `content` starts with the PDF magic header `%PDF-`.
 *
 * @param {Buffer|string|null|undefined} content - Document content.
 * @returns {boolean} True when the first five characters are `%PDF-`;
 *   false for null/undefined content.
 */
export const isPdfContent = (content) => {
  const pdfHeader = '%PDF-';
  if (content == null) {
    // Missing content cannot be a PDF; avoids a TypeError on `.substring`.
    return false;
  }
  if (Buffer.isBuffer(content)) {
    return content.toString('utf8', 0, pdfHeader.length) === pdfHeader;
  }
  return String(content).startsWith(pdfHeader);
};
416
/**
 * Heuristically decides whether `content` is a sitemap.
 *
 * It is treated as a sitemap when it parses as XML, when it is HTML that
 * wraps a `urlset`/`feed`/`rss` element, or when it is not HTML but contains
 * at least one http(s) URL (a plain-text URL list).
 *
 * @param {string} content - Document text.
 * @returns {boolean} True when the content looks like a sitemap.
 */
export const isSitemapContent = (content) => {
  if (validateXML(content).isValid) {
    return true;
  }
  const looksLikeHtml = content.match(/<(?:!doctype html|html|head|body)+?>/gim) !== null;
  if (looksLikeHtml) {
    // is an XML sitemap wrapped in a HTML document, otherwise a HTML webpage
    return content.match(/<(?:urlset|feed|rss)+?.*>/gim) !== null;
  }
  // treat this as a txt sitemap where all URLs will be extracted for crawling
  return content.match(/^.*(http|https):\/{2}.*$/gim) !== null;
};
435
/**
 * Runs the browser connectivity check for `url` and then refines a successful
 * result against the requested file types and scanner type.
 *
 * @param {*} scanner - One of the `ScannerTypes` values.
 * @param {string} url - URL to check.
 * @param {*} browser - Browser identifier forwarded to the connectivity check.
 * @param {string} clonedDataDir - User data directory forwarded to the check.
 * @param {object} playwrightDeviceDetailsObject - Device options forwarded to the check.
 * @param {object} extraHTTPHeaders - Extra request headers forwarded to the check.
 * @param {*} fileTypes - One of the `FileTypes` values.
 * @returns {Promise<RES>} The connectivity result, with `status` possibly downgraded.
 */
export const checkUrl = async (scanner, url, browser, clonedDataDir, playwrightDeviceDetailsObject, extraHTTPHeaders, fileTypes) => {
  const res = await checkUrlConnectivityWithBrowser(url, browser, clonedDataDir, playwrightDeviceDetailsObject, extraHTTPHeaders);

  // Only a successful check (no other code set earlier) needs refining.
  if (res.status !== constants.urlCheckStatuses.success.code) {
    return res;
  }

  const isPdf = isPdfContent(res.content);
  if (fileTypes === FileTypes.HtmlOnly && isPdf) {
    // Only HTML documents may be scanned
    res.status = constants.urlCheckStatuses.notASupportedDocument.code;
  } else if (fileTypes === FileTypes.PdfOnly && !isPdf) {
    // Only PDF documents may be scanned
    res.status = constants.urlCheckStatuses.notAPdf.code;
  } else if (scanner === ScannerTypes.SITEMAP && !isSitemapContent(res.content)) {
    // A sitemap was expected
    res.status = constants.urlCheckStatuses.notASitemap.code;
  }
  // else proceed as normal
  return res;
};
460
/** Tells whether `obj` has no own enumerable string keys. */
const isEmptyObject = (obj) => Object.keys(obj).length === 0;
461
/**
 * Parses HTTP headers from a string of the form
 * `"<header> <value>, <header2> <value2>, ..."`.
 *
 * Entries are separated by `', '`; within an entry the first space separates
 * the header name from its value. An entry without a space prints an error
 * message and calls `cleanUpAndExit(1)`.
 *
 * @param {string} [header] - Raw header string; falsy yields an empty object.
 * @returns {Object<string, string>} Map of header name to value.
 */
export const parseHeaders = (header) => {
  if (!header) {
    return {};
  }
  const allHeaders = {};
  for (const entry of header.split(', ')) {
    // Splitting on the first space yields [name, value, ''] when a space exists.
    const parts = entry.split(/ (.*)/s);
    if (parts.length < 2) {
      printMessage([
        `Invalid value for authorisation request header. Please provide valid keywords in the format: "<header> <value>". For multiple authentication headers, please provide the keywords in the format: "<header> <value>, <header2> <value2>, ..." .`,
      ], messageOptions);
      cleanUpAndExit(1);
    }
    const [name, value] = parts;
    allHeaders[name] = value; // {"header": "value", "header2": "value2", ...}
  }
  return allHeaders;
};
479
/**
 * Turns parsed CLI/API arguments into the scan configuration object.
 *
 * Along the way it converts a local file path into a `file://` URL, moves
 * basic-auth credentials embedded in the URL into an `Authorization` header,
 * builds the result folder name, resolves the browser and its cloned profile
 * directory, optionally pre-loads robots.txt rules, and records the export
 * directory, user data directory and random token on `constants`.
 *
 * @param {object} argv - Parsed arguments.
 * @returns {Promise<object>} Scan configuration.
 * @throws {Error} When `argv` has no keys.
 */
export const prepareData = async (argv) => {
  if (isEmptyObject(argv)) {
    throw new Error('No inputs should be provided');
  }

  const {
    scanner,
    headless,
    customDevice,
    viewportWidth,
    maxpages,
    strategy,
    nameEmail,
    customFlowLabel,
    specifiedMaxConcurrency,
    fileTypes,
    blacklistedPatternsFilename,
    additional,
    metadata,
    followRobots,
    header,
    safeMode,
    exportDirectory,
    zip,
    ruleset,
    generateJsonFiles,
    scanDuration,
  } = argv;
  // These four are adjusted below, so they are bound separately.
  let {
    url,
    deviceChosen,
    browserToRun,
    isLocalFileScan = argv.scanner === ScannerTypes.LOCALFILE,
  } = argv;

  const extraHTTPHeaders = parseHeaders(header);

  if (isFilePath(url)) {
    // A file path always means a local file scan; use its absolute file:// URL.
    isLocalFileScan = true;
    url = convertPathToLocalFile(path.resolve(url));
  } else {
    // Move basic auth embedded in the URL into extraHTTPHeaders.
    const parsedUrl = new URL(url);
    const { username, password } = parsedUrl;
    if (username !== '' || password !== '') {
      const credentials = Buffer.from(`${username}:${password}`).toString('base64');
      extraHTTPHeaders.Authorization = `Basic ${credentials}`;
    }
    parsedUrl.username = '';
    parsedUrl.password = '';
    url = parsedUrl.toString();
  }

  // construct filename for scan results
  const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
  const domain = isLocalFileScan ? path.basename(url) : new URL(url).hostname;
  const sanitisedLabel = customFlowLabel ? `_${customFlowLabel.replaceAll(' ', '_')}` : '';
  const resultFilename = `${date}_${time}${sanitisedLabel}_${domain}_${randomThreeDigitNumberString()}`;

  // Set exported directory
  if (exportDirectory) {
    constants.exportDirectory = path.join(exportDirectory, resultFilename);
  }

  // Creating the playwrightDeviceDetailObject
  if (customDevice === 'Desktop' || customDevice === 'Mobile') {
    deviceChosen = customDevice;
  }
  const playwrightDeviceDetailsObject = getPlaywrightDeviceDetailsObject(deviceChosen, customDevice, viewportWidth);

  ({ browserToRun } = getBrowserToRun(resultFilename, browserToRun, true));
  const resolvedUserDataDirectory = getClonedProfilesWithRandomToken(browserToRun, resultFilename);

  if (followRobots) {
    constants.robotsTxtUrls = {};
    await getUrlsFromRobotsTxt(url, browserToRun, resolvedUserDataDirectory, extraHTTPHeaders);
  }
  constants.userDataDirectory = resolvedUserDataDirectory;
  constants.randomToken = resultFilename;

  return {
    type: scanner,
    url,
    entryUrl: url,
    isHeadless: headless,
    deviceChosen,
    customDevice,
    viewportWidth,
    playwrightDeviceDetailsObject,
    maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl,
    strategy: strategy === 'same-hostname' ? EnqueueStrategy.SameHostname : EnqueueStrategy.SameDomain,
    isLocalFileScan,
    browser: browserToRun,
    nameEmail,
    customFlowLabel,
    specifiedMaxConcurrency,
    randomToken: resultFilename,
    fileTypes: FileTypes[getEnumKey(FileTypes, fileTypes)],
    blacklistedPatternsFilename,
    includeScreenshots: additional !== 'none',
    metadata,
    followRobots,
    extraHTTPHeaders,
    safeMode,
    userDataDirectory: resolvedUserDataDirectory,
    zip,
    ruleset,
    generateJsonFiles,
    scanDuration,
  };
};
564
/**
 * Fetches robots.txt for the origin of `url` and caches the rules that apply
 * to every crawler (`User-agent: *`) in `constants.robotsTxtUrls[origin]` as
 * `{ disallowedUrls, allowedUrls }`, each a list of glob patterns prefixed
 * with the origin.
 *
 * Does nothing when `constants.robotsTxtUrls` is not initialised or when the
 * origin already has a cached entry. When robots.txt cannot be fetched (or is
 * empty) the origin is cached with two empty rule lists so it is not fetched
 * again and so consumers can always iterate the lists.
 *
 * @param {string} url any URL on the site; only its origin is used
 * @param {string} browserToRun browser identifier forwarded to the fetcher
 * @param {string} userDataDirectory browser profile directory forwarded to the fetcher
 * @param {object} [extraHTTPHeaders] extra headers forwarded to the fetcher
 * @returns {Promise<void>}
 */
export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory, extraHTTPHeaders) => {
  if (!constants.robotsTxtUrls) return;

  const domain = new URL(url).origin;
  if (constants.robotsTxtUrls[domain]) return;

  const robotsUrl = domain.concat('/robots.txt');
  let robotsTxt;
  try {
    robotsTxt = await getRobotsTxtViaPlaywright(robotsUrl, browserToRun, userDataDirectory, extraHTTPHeaders);
    consoleLogger.info(`Fetched robots.txt from ${robotsUrl}`);
  } catch (e) {
    // if robots.txt is not found, do nothing
    consoleLogger.info(`Unable to fetch robots.txt from ${robotsUrl}`);
  }

  if (!robotsTxt) {
    constants.robotsTxtUrls[domain] = { disallowedUrls: [], allowedUrls: [] };
    return;
  }

  const directoryRegex = /^\/(?:[^?#/]+\/)*[^?#]*$/;
  const subdirWildcardRegex = /\/\*\//g;
  const filePathRegex = /^\/(?:[^\/]+\/)*[^\/]+\.[a-zA-Z0-9]{1,6}$/;

  // Converts a robots.txt path pattern into an absolute glob for this origin.
  const sanitisePattern = (pattern) => {
    // A single-segment wildcard ("/*/") becomes a multi-segment glob ("/**/").
    let glob = pattern.replace(subdirWildcardRegex, '/**/');
    // Directory-like patterns (no file extension) cover everything beneath them.
    if (directoryRegex.test(glob) && !filePathRegex.test(glob)) {
      if (glob.endsWith('*')) {
        glob = glob.concat('*');
      } else {
        if (!glob.endsWith('/')) glob = glob.concat('/');
        glob = glob.concat('**');
      }
    }
    return domain.concat(glob);
  };

  // Splits "Field: value # comment" into a lower-cased field and a trimmed value.
  // The space after the colon is optional, so the value is taken from the colon
  // itself rather than from a fixed "field: " prefix length.
  const parseDirective = (line) => {
    const withoutComment = line.split('#')[0];
    const separatorIndex = withoutComment.indexOf(':');
    if (separatorIndex === -1) return null;
    return {
      field: withoutComment.slice(0, separatorIndex).trim().toLowerCase(),
      value: withoutComment.slice(separatorIndex + 1).trim(),
    };
  };

  const disallowedUrls = [];
  const allowedUrls = [];
  let shouldCapture = false;

  for (const line of robotsTxt.split(/\r?\n/)) {
    const directive = parseDirective(line);
    if (!directive) continue;
    const { field, value } = directive;

    if (field === 'user-agent') {
      if (value === '*') {
        shouldCapture = true;
      } else if (shouldCapture) {
        // The wildcard group has ended; later groups target other crawlers.
        break;
      }
    } else if (shouldCapture && value) {
      if (field === 'disallow') {
        disallowedUrls.push(sanitisePattern(value));
      } else if (field === 'allow') {
        allowedUrls.push(sanitisePattern(value));
      }
    }
  }

  constants.robotsTxtUrls[domain] = { disallowedUrls, allowedUrls };
};
632
/**
 * Loads `robotsUrl` in a persistent browser context and returns the text
 * content of the page body.
 *
 * In headless mode (`CRAWLEE_HEADLESS === '1'`) a dedicated `robots`
 * sub-directory of `userDataDirectory` is used as the profile directory so
 * this context does not contend for the profile lock with the crawler's own
 * browser. Otherwise an empty string is passed as the profile directory.
 *
 * @param {string} robotsUrl absolute URL of the robots.txt file
 * @param {string} browser browser identifier passed to getPlaywrightLaunchOptions
 * @param {string} userDataDirectory base profile directory (may be empty)
 * @param {object} [extraHTTPHeaders] extra headers to send with the request
 * @returns {Promise<string>} text content of the fetched document
 */
const getRobotsTxtViaPlaywright = async (robotsUrl, browser, userDataDirectory, extraHTTPHeaders) => {
  let robotsDataDir = '';
  // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
  if (process.env.CRAWLEE_HEADLESS === '1' && userDataDirectory) {
    // Create robots own user data directory else SingletonLock: File exists (17) with crawlDomain or crawlSitemap's own browser
    robotsDataDir = path.join(userDataDirectory, 'robots');
    fs.mkdirSync(robotsDataDir, { recursive: true });
  }

  const browserContext = await constants.launcher.launchPersistentContext(robotsDataDir, {
    ...getPlaywrightLaunchOptions(browser),
    ...(extraHTTPHeaders && { extraHTTPHeaders }),
  });
  register(browserContext);

  try {
    const page = await browserContext.newPage();
    await page.goto(robotsUrl, { waitUntil: 'networkidle', timeout: 30000 });
    return await page.evaluate(() => document.body.textContent);
  } finally {
    // Always release the browser, including when navigation fails or times out.
    await browserContext.close();
  }
};
652
/**
 * Tells whether `url` is excluded by the cached robots.txt rules of its origin.
 *
 * A URL is excluded when it matches at least one disallow pattern and no
 * allow pattern. Returns `false` when robots handling is not initialised
 * (`constants.robotsTxtUrls` unset), when the origin has no cached rules,
 * when the cached entry has no rule lists, or when `url` is not a parseable
 * absolute URL (e.g. a local file path).
 *
 * @param {string} url absolute URL to check
 * @returns {boolean} true only if the URL must be skipped
 */
export const isDisallowedInRobotsTxt = (url) => {
  if (!constants.robotsTxtUrls) return false;

  let origin;
  try {
    origin = new URL(url).origin;
  } catch {
    // Not an absolute URL, so no robots.txt rule can apply to it.
    return false;
  }

  const rules = constants.robotsTxtUrls[origin];
  if (!rules) return false;

  // The entry may be an empty object when robots.txt could not be fetched.
  const { disallowedUrls = [], allowedUrls = [] } = rules;
  const matchesAny = (patterns) => patterns.some((pattern) => minimatch(url, pattern));

  return matchesAny(disallowedUrls) && !matchesAny(allowedUrls);
};
670
/**
 * Collects the page URLs listed in a sitemap and returns them as crawler
 * `Request` objects.
 *
 * Handles XML sitemaps, XML sitemap indexes (followed recursively), RSS and
 * Atom feeds, and falls back to plain URL extraction for anything else.
 * Remote sitemaps are loaded through a persistent browser context; local
 * ones are read from disk. URLs excluded by robots.txt are dropped, PDF URLs
 * are marked with `skipNavigation`, and each sitemap is visited at most once.
 * Errors raised while traversing are logged and whatever was collected so
 * far is returned.
 *
 * @param {string} sitemapUrl URL or local path of the root sitemap
 * @param {number} maxLinksCount upper bound applied per sitemap and used to stop index traversal
 * @param {string} browser browser identifier passed to getPlaywrightLaunchOptions
 * @param {string} userDataDirectory browser profile directory (null/undefined treated as '')
 * @param {string} userUrlInput URL entered by the user; used for closeness ranking
 * @param {boolean} isIntelligent when true, rank entries by closeness to userUrlInput, then by date
 * @param {object} [extraHTTPHeaders] extra headers for remote sitemap requests
 * @returns {Promise<Array>} requests in insertion order, one per unique URL
 */
export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders) => {
  const scannedSitemaps = new Set();
  const urls = {}; // dictionary of requests to urls to be scanned
  const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;

  const addToUrlList = (url) => {
    if (!url) return;
    if (isDisallowedInRobotsTxt(url)) return;

    const resolvedUrl = convertPathToLocalFile(url);
    let request;
    try {
      request = new Request({ url: resolvedUrl });
    } catch (e) {
      console.log('Error creating request', e);
      // Without a request there is nothing to enqueue; skip this entry.
      return;
    }
    if (isUrlPdf(resolvedUrl)) {
      request.skipNavigation = true;
    }
    urls[resolvedUrl] = request;
  };

  // 2 = same as the user's URL, 1 = beneath it, 0 = unrelated.
  const calculateCloseness = (candidateUrl) => {
    // Remove 'http://', 'https://', and 'www.' prefixes from the URLs
    const prefixPattern = /^(https?:\/\/)?(www\.)?/;
    const candidate = typeof candidateUrl === 'string' ? candidateUrl : '';
    const normalizedSitemapUrl = candidate.replace(prefixPattern, '');
    const normalizedUserUrlInput = userUrlInput
      .replace(prefixPattern, '')
      .replace(/\/$/, ''); // Remove trailing slash also
    if (normalizedSitemapUrl === normalizedUserUrlInput) {
      return 2;
    }
    if (normalizedSitemapUrl.startsWith(normalizedUserUrlInput)) {
      return 1;
    }
    return 0;
  };

  const processXmlSitemap = async ($, sitemapType, linkSelector, dateSelector, sectionSelector) => {
    const urlList = [];
    // Iterate through each URL element in the sitemap, collect url and modified date
    $(sectionSelector).each((_index, urlElement) => {
      const linkElement = $(urlElement).find(linkSelector);
      // Atom feeds carry the target in the href attribute rather than the element text.
      const url = sitemapType === constants.xmlSitemapTypes.atom
        ? linkElement.prop('href')
        : linkElement.text();
      const lastModified = $(urlElement).find(dateSelector).text();
      const lastModifiedDate = lastModified ? new Date(lastModified) : null;
      urlList.push({ url, lastModifiedDate });
    });

    if (isIntelligent) {
      // Sort by closeness to userUrlInput in descending order
      urlList.sort((a, b) => {
        const closenessA = calculateCloseness(a.url);
        const closenessB = calculateCloseness(b.url);
        if (closenessA !== closenessB) {
          return closenessB - closenessA;
        }
        // If closeness is the same, sort by last modified date in descending order
        return (b.lastModifiedDate?.getTime() || 0) - (a.lastModifiedDate?.getTime() || 0);
      });
    }

    // Add the sorted URLs to the main URL list
    for (const { url } of urlList.slice(0, maxLinksCount)) {
      addToUrlList(url);
    }
  };

  const processNonStandardSitemap = (data) => {
    const urlsFromData = crawlee
      .extractUrls({ string: data, urlRegExp: new RegExp('^(http|https):/{2}.+$', 'gmi') })
      .slice(0, maxLinksCount);
    urlsFromData.forEach(url => {
      addToUrlList(url);
    });
  };

  const finalUserDataDirectory =
    userDataDirectory === null || userDataDirectory === undefined ? '' : userDataDirectory;

  const fetchUrls = async (url, requestHeaders) => {
    if (scannedSitemaps.has(url)) {
      // Skip processing if the sitemap has already been scanned
      return;
    }
    scannedSitemaps.add(url);

    // Convert file if its not local file path
    let target = convertLocalFileToPath(url);

    // Check whether its a file path or a URL
    if (isFilePath(target)) {
      if (!fs.existsSync(target)) {
        return;
      }
    } else if (!isValidHttpUrl(target)) {
      printMessage([`Invalid Url/Filepath: ${target}`], messageOptions);
      return;
    }

    const getDataUsingPlaywright = async () => {
      const browserContext = await constants.launcher.launchPersistentContext(finalUserDataDirectory, {
        ...getPlaywrightLaunchOptions(browser),
        // Not necessary to parse http_credentials as I am parsing it directly in URL
        // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
        ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
        ...(requestHeaders && { extraHTTPHeaders: requestHeaders }),
      });
      register(browserContext);
      try {
        const page = await browserContext.newPage();
        await page.goto(target, { waitUntil: 'networkidle', timeout: 60000 });
        if ((await page.locator('body').count()) > 0) {
          return await page.locator('body').innerText();
        }
        // No <body>: the browser rendered raw XML, so serialise the first known root element.
        for (const rootTag of ['urlset', 'sitemapindex', 'rss', 'feed']) {
          const rootLocator = page.locator(rootTag);
          if ((await rootLocator.count()) > 0) {
            return await rootLocator.evaluate(elem => elem.outerHTML);
          }
        }
        return undefined;
      } finally {
        // Release the browser even when navigation fails or times out.
        await browserContext.close();
      }
    };

    let data;
    if (validator.isURL(target, urlOptions)) {
      if (isUrlPdf(target)) {
        addToUrlList(target);
        return;
      }
      data = await getDataUsingPlaywright();
    } else {
      target = convertLocalFileToPath(target);
      data = fs.readFileSync(target, 'utf8');
    }

    if (data === undefined) {
      // Nothing could be read from this sitemap; leave the other sitemaps unaffected.
      return;
    }

    const $ = cheerio.load(data, { xml: true });

    // This case is when the document is not an XML format document
    if ($(':root').length === 0) {
      processNonStandardSitemap(data);
      return;
    }

    // Root element
    const root = $(':root')[0];
    const { xmlns } = root.attribs;
    const xmlFormatNamespace = '/schemas/sitemap';
    // The xmlns attribute may be absent; treat that as "not the sitemap namespace".
    const hasSitemapNamespace = typeof xmlns === 'string' && xmlns.includes(xmlFormatNamespace);

    let sitemapType;
    if (root.name === 'urlset' && hasSitemapNamespace) {
      sitemapType = constants.xmlSitemapTypes.xml;
    } else if (root.name === 'sitemapindex' && hasSitemapNamespace) {
      sitemapType = constants.xmlSitemapTypes.xmlIndex;
    } else if (root.name === 'rss') {
      sitemapType = constants.xmlSitemapTypes.rss;
    } else if (root.name === 'feed') {
      sitemapType = constants.xmlSitemapTypes.atom;
    } else {
      sitemapType = constants.xmlSitemapTypes.unknown;
    }

    switch (sitemapType) {
      case constants.xmlSitemapTypes.xmlIndex:
        consoleLogger.info(`This is a XML format sitemap index.`);
        for (const childSitemapUrl of $('loc')) {
          const childSitemapUrlText = $(childSitemapUrl).text();
          if (isLimitReached()) {
            break;
          }
          if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) {
            await fetchUrls(childSitemapUrlText, requestHeaders); // Recursive call for nested sitemaps
          } else {
            addToUrlList(childSitemapUrlText); // Add regular URLs to the list
          }
        }
        break;
      case constants.xmlSitemapTypes.xml:
        consoleLogger.info(`This is a XML format sitemap.`);
        await processXmlSitemap($, sitemapType, 'loc', 'lastmod', 'url');
        break;
      case constants.xmlSitemapTypes.rss:
        consoleLogger.info(`This is a RSS format sitemap.`);
        await processXmlSitemap($, sitemapType, 'link', 'pubDate', 'item');
        break;
      case constants.xmlSitemapTypes.atom:
        consoleLogger.info(`This is a Atom format sitemap.`);
        await processXmlSitemap($, sitemapType, 'link', 'published', 'entry');
        break;
      default:
        consoleLogger.info(`This is an unrecognised XML sitemap format.`);
        processNonStandardSitemap(data);
    }
  };

  try {
    await fetchUrls(sitemapUrl, extraHTTPHeaders);
  } catch (e) {
    consoleLogger.error(e);
  }

  return Object.values(urls);
};
886
/**
 * Loose shape check for an e-mail address: some text, an `@`, some text,
 * a dot, and some more text, all on a single line.
 * @param {string} email candidate address
 * @returns {boolean} true when the value has that shape
 */
export const validEmail = (email) => /^.+@.+\..+$/u.test(email);
890
+ // For new user flow.
891
/**
 * Validates a user-supplied name.
 *
 * A name is accepted when it is 2 to 32000 characters long, consists only of
 * letters and digits from any script, whitespace and a small set of
 * punctuation, and contains none of the characters treated as injection
 * risks (quotes, brackets, slashes, `;`, `|`, `&`, `!`, `$`, `*`, CR, LF, tab).
 *
 * @param {string} name candidate name
 * @returns {boolean} true when every check passes
 */
export const validName = (name) => {
  // Letters/digits of any language plus whitespace and basic punctuation.
  const allowedCharacters = /^[\p{L}\p{N}\s'".,()\[\]{}!?:؛،؟…]+$/u;
  // Characters rejected outright to guard against injection patterns.
  const injectionCharacters = /[<>'"\\/;|&!$*{}()\[\]\r\n\t]/;

  const hasAcceptableLength = name.length >= 2 && name.length <= 32000;

  return hasAcceptableLength && allowedCharacters.test(name) && !injectionCharacters.test(name);
};
911
/**
 * Picks the browser to scan with and clones its data directory.
 *
 * Chrome is assumed on Windows and macOS when no browser is requested. If the
 * requested browser cannot be used, the function falls back in this order:
 * Chrome -> webkit (macOS) / Edge (Windows) / Chromium (other platforms);
 * Edge -> Chrome -> webkit (macOS) / Chromium (other platforms). On Windows
 * the process exits with the browser-error status code when neither Chrome
 * nor Edge is usable. Any other request resolves to Chromium.
 *
 * @param randomToken string token used to name the cloned data directory
 * @param preferredBrowser string of user's preferred browser
 * @param isCli boolean flag to indicate if function is called from cli (enables fallback messages)
 * @returns object consisting of browser to run and cloned data directory
 */
export const getBrowserToRun = (randomToken, preferredBrowser, isCli = false) => {
  const platform = os.platform();
  const onMac = platform === 'darwin';
  const onWindows = platform === 'win32';

  // Fallback notices are only shown to CLI users.
  const announce = (text) => {
    if (isCli) printMessage([text], messageOptions);
  };
  const fallBackToWebkit = () => {
    constants.launcher = webkit;
    return { browserToRun: null, clonedBrowserDataDir: '' };
  };
  const exitWithBrowserError = () => process.exit(constants.urlCheckStatuses.browserError.code);

  let requestedBrowser = preferredBrowser;
  if (!requestedBrowser && (onWindows || onMac)) {
    // Prioritise Chrome on Windows and Mac platforms if user does not specify a browser
    requestedBrowser = BrowserTypes.CHROME;
  } else {
    printMessage([`Preferred browser ${requestedBrowser}`], messageOptions);
  }

  switch (requestedBrowser) {
    case BrowserTypes.CHROME: {
      const chromeData = getChromeData(randomToken);
      if (chromeData) return chromeData;

      if (onMac) {
        // mac user who specified -b chrome but does not have chrome
        announce('Unable to use Chrome, falling back to webkit...');
        return fallBackToWebkit();
      }
      if (onWindows) {
        announce('Unable to use Chrome, falling back to Edge browser...');
        const edgeData = getEdgeData(randomToken);
        if (edgeData) return edgeData;
        announce('Unable to use both Chrome and Edge. Please try again.');
        exitWithBrowserError();
      }
      announce('Unable to use Chrome, falling back to Chromium browser...');
      break;
    }
    case BrowserTypes.EDGE: {
      const edgeData = getEdgeData(randomToken);
      if (edgeData) return edgeData;

      announce('Unable to use Edge, falling back to Chrome browser...');
      const chromeData = getChromeData(randomToken);
      if (chromeData) return chromeData;

      if (onMac) {
        // mac user who specified -b edge but does not have edge or chrome
        announce('Unable to use both Edge and Chrome, falling back to webkit...');
        return fallBackToWebkit();
      }
      if (onWindows) {
        announce('Unable to use both Edge and Chrome. Please try again.');
        exitWithBrowserError();
      } else {
        // linux and other OS
        announce('Unable to use both Edge and Chrome, falling back to Chromium browser...');
      }
      break;
    }
    default:
      break;
  }

  // defaults to chromium
  return {
    browserToRun: BrowserTypes.CHROMIUM,
    clonedBrowserDataDir: cloneChromiumProfiles(randomToken),
  };
};
984
/**
 * Clones the profile directory of the given browser under a directory named
 * after `randomToken`, delegating to the browser-specific clone function
 * (Chromium is used for anything that is not Chrome or Edge).
 *
 * Cloning a second time with a random token allows parallel browser sessions
 * and mitigates a known bug where cookies are overridden after each browser
 * session, i.e. the user is logged out after checkingUrl and the same cookie
 * cannot be used for the scan.
 *
 * @param browser browser identifier (one of BrowserTypes)
 * @param randomToken token appended to the cloned directory name
 * @returns whatever the selected clone function returns
 */
export const getClonedProfilesWithRandomToken = (browser, randomToken) => {
  switch (browser) {
    case BrowserTypes.CHROME:
      return cloneChromeProfiles(randomToken);
    case BrowserTypes.EDGE:
      return cloneEdgeProfiles(randomToken);
    default:
      return cloneChromiumProfiles(randomToken);
  }
};
999
/**
 * Resolves the Chrome browser selection for a scan.
 * Looks up the default Chrome data directory and clones the Chrome profiles
 * for `randomToken`; both calls are always made.
 * @param randomToken token used to name the cloned directory
 * @returns `{ browserToRun, clonedBrowserDataDir }` when both the default
 *          directory and the clone are available, otherwise null
 */
export const getChromeData = (randomToken) => {
  const defaultDataDir = getDefaultChromeDataDir();
  const clonedBrowserDataDir = cloneChromeProfiles(randomToken);

  return defaultDataDir && clonedBrowserDataDir
    ? { browserToRun: BrowserTypes.CHROME, clonedBrowserDataDir }
    : null;
};
1008
/**
 * Resolves the Edge browser selection for a scan.
 * Looks up the default Edge data directory and clones the Edge profiles for
 * `randomToken`; both calls are always made.
 * @param randomToken token used to name the cloned directory
 * @returns `{ browserToRun, clonedBrowserDataDir }` when both the default
 *          directory and the clone are available, otherwise null (matching
 *          getChromeData)
 */
export const getEdgeData = (randomToken) => {
  const browserDataDir = getDefaultEdgeDataDir();
  const clonedBrowserDataDir = cloneEdgeProfiles(randomToken);
  if (browserDataDir && clonedBrowserDataDir) {
    const browserToRun = BrowserTypes.EDGE;
    return { browserToRun, clonedBrowserDataDir };
  }
  // Explicit null keeps the "not available" result consistent with getChromeData.
  return null;
};
1016
/**
 * Clone the Chrome profile cookie files to the destination directory.
 *
 * Cookie files are searched under `options.cwd`:
 * `.../User Data/<profile name>/Network/Cookies` on Windows and
 * `.../Chrome/<profile name>/Cookies` on macOS. Previously cloned `oobee*`
 * directories are ignored. Each file is copied to
 * `<destDir>/<profile name>[/Network]/Cookies` unless a copy already exists.
 * On any other platform no files are looked up and the call reports failure.
 *
 * @param {*} options glob options object
 * @param {*} destDir destination directory
 * @returns boolean indicating whether the operation was successful (false
 *          when no cookie file was found or any copy failed)
 */
const cloneChromeProfileCookieFiles = (options, destDir) => {
  const platform = os.platform();
  let cookieFiles = [];
  let profileNamesRegex;

  if (platform === 'win32') {
    cookieFiles = globSync('**/Network/Cookies', {
      ...options,
      ignore: ['oobee*/**'],
    });
    profileNamesRegex = /User Data\\(.*?)\\Network/;
  } else if (platform === 'darwin') {
    cookieFiles = globSync('**/Cookies', {
      ...options,
      ignore: 'oobee*/**',
    });
    profileNamesRegex = /Chrome\/(.*?)\/Cookies/;
  }

  if (cookieFiles.length === 0) {
    consoleLogger.warn('Unable to find Chrome profile cookies file in the system.');
    printMessage(['Unable to find Chrome profile cookies file in the system.'], messageOptions);
    return false;
  }

  let success = true;
  for (const cookieFile of cookieFiles) {
    // Paths that do not follow the expected profile layout are skipped.
    const profileName = cookieFile.match(profileNamesRegex)?.[1];
    if (!profileName) continue;

    const destProfileDir = platform === 'win32'
      ? path.join(destDir, profileName, 'Network')
      : path.join(destDir, profileName);
    // Recursive true to create all parent directories (e.g. PbProfile/Default/Cookies)
    fs.mkdirSync(destProfileDir, { recursive: true });

    const destCookieFile = path.join(destProfileDir, 'Cookies');
    // Prevents duplicate cookies file if the cookies already exist
    if (fs.existsSync(destCookieFile)) continue;

    try {
      fs.copyFileSync(cookieFile, destCookieFile);
    } catch (err) {
      consoleLogger.error(err);
      if (err.code === 'EBUSY') {
        console.log(`Unable to copy the file for ${profileName} because it is currently in use.`);
        console.log('Please close any applications that might be using this file and try again.');
      } else {
        console.log(`An unexpected error occurred for ${profileName} while copying the file: ${err.message}`);
      }
      success = false;
    }
  }
  return success;
};
1084
/**
 * Clone the Edge profile cookie files to the destination directory.
 *
 * Cookie files are searched under `options.cwd`:
 * `.../User Data/<profile name>/Network/Cookies` on Windows and
 * `.../Microsoft Edge/<profile name>/Cookies` on macOS. Previously cloned
 * `oobee*` directories are ignored. Each file is copied to
 * `<destDir>/<profile name>[/Network]/Cookies` unless a copy already exists.
 * On any other platform no files are looked up and the call reports failure.
 *
 * @param {*} options glob options object
 * @param {*} destDir destination directory
 * @returns boolean indicating whether the operation was successful (false
 *          when no cookie file was found or any copy failed)
 */
const cloneEdgeProfileCookieFiles = (options, destDir) => {
  const platform = os.platform();
  let cookieFiles = [];
  let profileNamesRegex;

  // Ignores the cloned oobee directory if exists
  if (platform === 'win32') {
    cookieFiles = globSync('**/Network/Cookies', {
      ...options,
      ignore: 'oobee*/**',
    });
    profileNamesRegex = /User Data\\(.*?)\\Network/;
  } else if (platform === 'darwin') {
    cookieFiles = globSync('**/Cookies', {
      ...options,
      ignore: 'oobee*/**',
    });
    profileNamesRegex = /Microsoft Edge\/(.*?)\/Cookies/;
  }

  if (cookieFiles.length === 0) {
    consoleLogger.warn('Unable to find Edge profile cookies file in the system.');
    printMessage(['Unable to find Edge profile cookies file in the system.'], messageOptions);
    return false;
  }

  let success = true;
  for (const cookieFile of cookieFiles) {
    // Paths that do not follow the expected profile layout are skipped.
    const profileName = cookieFile.match(profileNamesRegex)?.[1];
    if (!profileName) continue;

    const destProfileDir = platform === 'win32'
      ? path.join(destDir, profileName, 'Network')
      : path.join(destDir, profileName);
    // Recursive true to create all parent directories (e.g. PbProfile/Default/Cookies)
    fs.mkdirSync(destProfileDir, { recursive: true });

    const destCookieFile = path.join(destProfileDir, 'Cookies');
    // Prevents duplicate cookies file if the cookies already exist
    if (fs.existsSync(destCookieFile)) continue;

    try {
      fs.copyFileSync(cookieFile, destCookieFile);
    } catch (err) {
      consoleLogger.error(err);
      if (err.code === 'EBUSY') {
        console.log(`Unable to copy the file for ${profileName} because it is currently in use.`);
        console.log('Please close any applications that might be using this file and try again.');
      } else {
        console.log(`An unexpected error occurred while copying the file: ${err.message}`);
      }
      success = false;
    }
  }
  return success;
};
1153
/**
 * Copies the browser's `Local State` file into the destination directory.
 * Both Edge and Chrome Local State files are located in the .../User Data directory.
 *
 * Files are looked up with the glob `**\/*Local State` (depth limited to 1)
 * relative to `options.cwd`; every match is copied to `<destDir>/Local State`,
 * so a later match overwrites an earlier one.
 *
 * @param {*} options - glob options object
 * @param {string} destDir - destination directory
 * @returns boolean indicating whether the operation was successful (false
 *          when no file was found or any copy failed)
 */
const cloneLocalStateFile = (options, destDir) => {
  const localStateFiles = globSync('**/*Local State', {
    ...options,
    maxDepth: 1,
  });
  const profileNamesRegex = /([^/\\]+)[/\\]Local State$/;

  if (localStateFiles.length === 0) {
    consoleLogger.warn('Unable to find local state file in the system.');
    printMessage(['Unable to find local state file in the system.'], messageOptions);
    return false;
  }

  let success = true;
  for (const localStateFile of localStateFiles) {
    // Only used for error reporting. The glob can match names such as
    // "<prefix>Local State" that the regex does not, so fall back to the path.
    const profileName = localStateFile.match(profileNamesRegex)?.[1] ?? localStateFile;
    try {
      fs.copyFileSync(localStateFile, path.join(destDir, 'Local State'));
    } catch (err) {
      consoleLogger.error(err);
      if (err.code === 'EBUSY') {
        console.log(`Unable to copy the file because it is currently in use.`);
        console.log('Please close any applications that might be using this file and try again.');
      } else {
        console.log(`An unexpected error occurred for ${profileName} while copying the file: ${err.message}`);
      }
      printMessage([err], messageOptions);
      success = false;
    }
  }
  return success;
};
1191
/**
 * Clones the Chrome profiles into an `oobee-<randomToken>` directory inside
 * the default Chrome data directory (.../User Data on Windows,
 * .../Chrome on Mac).
 *
 * An existing clone directory is reused as-is. Otherwise the directory is
 * created and the Local State file and per-profile cookie files are copied
 * into it; a failed copy is logged but the directory is still returned.
 *
 * @param {string} randomToken - random token to append to the cloned directory
 * @returns {string|undefined} cloned data directory, or undefined when the
 *          default Chrome data directory cannot be determined
 */
export const cloneChromeProfiles = (randomToken) => {
  const chromeDataDir = getDefaultChromeDataDir();
  if (!chromeDataDir) {
    return;
  }

  const cloneDir = path.join(chromeDataDir, `oobee-${randomToken}`);

  // Don't delete since it will be handled at the end of the scan.
  // Assume it cloned and don't re-clone.
  if (fs.existsSync(cloneDir)) {
    return cloneDir;
  }

  fs.mkdirSync(cloneDir, { recursive: true });

  const globOptions = {
    cwd: chromeDataDir,
    recursive: true,
    absolute: true,
    nodir: true,
  };
  // Both copies are always attempted, Local State first.
  const localStateCloned = cloneLocalStateFile(globOptions, cloneDir);
  const cookiesCloned = cloneChromeProfileCookieFiles(globOptions, cloneDir);

  if (!(cookiesCloned && localStateCloned)) {
    consoleLogger.error('Failed to clone Chrome profiles. You may be logged out of your accounts.');
  }

  // For future reference, return a null instead to halt the scan
  return cloneDir;
};
1230
/**
 * Ensures an `oobee-<randomToken>` directory exists inside the default
 * Chromium data directory and returns its path. An existing directory is
 * reused; nothing is copied into it.
 *
 * @param {string} randomToken - random token to append to the directory name
 * @returns {string|undefined} the directory path, or undefined when the
 *          default Chromium data directory cannot be determined
 */
export const cloneChromiumProfiles = (randomToken) => {
  const chromiumDataDir = getDefaultChromiumDataDir();
  if (!chromiumDataDir) {
    return;
  }

  const cloneDir = path.join(chromiumDataDir, `oobee-${randomToken}`);
  // An existing directory is left alone; it is cleaned up at the end of the scan.
  if (!fs.existsSync(cloneDir)) {
    fs.mkdirSync(cloneDir, { recursive: true });
  }
  return cloneDir;
};
1247
/**
 * Clones the Edge profiles into an `oobee-<randomToken>` directory inside
 * the default Edge data directory (.../User Data on Windows,
 * .../Microsoft Edge on Mac).
 *
 * An existing clone directory is reused as-is. Otherwise the directory is
 * created and the Local State file and per-profile cookie files are copied
 * into it; a failed copy is logged but the directory is still returned.
 *
 * @param {string} randomToken - random token to append to the cloned directory
 * @returns {string|undefined} cloned data directory, or undefined when the
 *          default Edge data directory cannot be determined
 */
export const cloneEdgeProfiles = (randomToken) => {
  const edgeDataDir = getDefaultEdgeDataDir();
  if (!edgeDataDir) {
    return;
  }

  const cloneDir = path.join(edgeDataDir, `oobee-${randomToken}`);

  // Don't delete since it will be handled at the end of the scan.
  // Assume it cloned and don't re-clone.
  if (fs.existsSync(cloneDir)) {
    return cloneDir;
  }

  fs.mkdirSync(cloneDir, { recursive: true });

  const globOptions = {
    cwd: edgeDataDir,
    recursive: true,
    absolute: true,
    nodir: true,
  };
  // Both copies are always attempted, Local State first.
  const localStateCloned = cloneLocalStateFile(globOptions, cloneDir);
  const cookiesCloned = cloneEdgeProfileCookieFiles(globOptions, cloneDir);

  if (!(cookiesCloned && localStateCloned)) {
    consoleLogger.error('Failed to clone Edge profiles. You may be logged out of your accounts.');
  }

  // For future reference, return a null instead to halt the scan
  return cloneDir;
};
1286
/**
 * Removes the cloned profile directory of the given browser by delegating to
 * the matching browser-specific delete function. Unrecognised browser values
 * are ignored.
 * @param browser browser identifier (one of BrowserTypes)
 * @param randomToken token identifying the cloned directory
 */
export const deleteClonedProfiles = (browser, randomToken) => {
  switch (browser) {
    case BrowserTypes.CHROME:
      deleteClonedChromeProfiles(randomToken);
      break;
    case BrowserTypes.EDGE:
      deleteClonedEdgeProfiles(randomToken);
      break;
    case BrowserTypes.CHROMIUM:
      deleteClonedChromiumProfiles(randomToken);
      break;
    default:
      break;
  }
};
1297
/**
 * Deletes cloned oobee directories in the Chrome data directory.
 *
 * With a `randomToken` only `oobee-<randomToken>` is targeted; without one,
 * every `oobee*` entry found under the Chrome data directory is removed.
 * Deletion failures are logged and do not stop the remaining deletions.
 * A warning is emitted when the search finds no oobee directory.
 *
 * @param {string} [randomToken] token identifying a single cloned directory
 * @returns undefined
 */
export const deleteClonedChromeProfiles = (randomToken) => {
  const chromeDataDir = getDefaultChromeDataDir();
  if (!chromeDataDir) {
    return;
  }

  const clonedDirs = randomToken
    ? [`${chromeDataDir}/oobee-${randomToken}`]
    // Find all the oobee directories in the Chrome data directory
    : globSync('**/oobee*', {
      cwd: chromeDataDir,
      absolute: true,
    });

  if (clonedDirs.length === 0) {
    consoleLogger.warn('Unable to find oobee directory in the Chrome data directory.');
    console.warn('Unable to find oobee directory in the Chrome data directory.');
    return;
  }

  for (const clonedDir of clonedDirs) {
    if (!fs.existsSync(clonedDir)) continue;
    try {
      fs.rmSync(clonedDir, { recursive: true });
    } catch (err) {
      consoleLogger.error(`CHROME Unable to delete ${clonedDir} folder in the Chrome data directory. ${err}`);
    }
  }
};
1333
/**
 * Deletes cloned oobee directories in the Edge data directory.
 *
 * With a `randomToken` only `oobee-<randomToken>` is targeted; without one,
 * every `oobee*` entry found under the Edge data directory is removed.
 * Deletion failures are logged and do not stop the remaining deletions.
 *
 * @param {string} [randomToken] token identifying a single cloned directory
 * @returns undefined
 */
export const deleteClonedEdgeProfiles = (randomToken) => {
  const baseDir = getDefaultEdgeDataDir();
  if (!baseDir) {
    console.warn(`Unable to find Edge data directory in the system.`);
    return;
  }

  const destDir = randomToken
    ? [`${baseDir}/oobee-${randomToken}`]
    // Find all the oobee directories in the Edge data directory
    : globSync('**/oobee*', {
      cwd: baseDir,
      absolute: true,
    });

  for (const dir of destDir) {
    if (!fs.existsSync(dir)) continue;
    try {
      fs.rmSync(dir, { recursive: true });
    } catch (err) {
      // Names the Edge data directory so the log points at the right location.
      consoleLogger.error(`EDGE Unable to delete ${dir} folder in the Edge data directory. ${err}`);
    }
  }
};
1367
/**
 * Deletes cloned oobee directories in the Chromium data directory.
 *
 * With a `randomToken` only `oobee-<randomToken>` is targeted; without one,
 * every `oobee*` entry found under the Chromium data directory is removed.
 * Deletion failures are logged and do not stop the remaining deletions.
 * A warning is emitted when the search finds no oobee directory.
 *
 * @param {string} [randomToken] token identifying a single cloned directory
 * @returns undefined
 */
export const deleteClonedChromiumProfiles = (randomToken) => {
  const chromiumDataDir = getDefaultChromiumDataDir();
  if (!chromiumDataDir) {
    return;
  }

  const clonedDirs = randomToken
    ? [`${chromiumDataDir}/oobee-${randomToken}`]
    // Find all the oobee directories in the Chromium data directory
    : globSync('**/oobee*', {
      cwd: chromiumDataDir,
      absolute: true,
    });

  if (clonedDirs.length === 0) {
    consoleLogger.warn('Unable to find oobee directory in Chromium support directory');
    console.warn('Unable to find oobee directory in Chromium support directory');
    return;
  }

  for (const clonedDir of clonedDirs) {
    if (!fs.existsSync(clonedDir)) continue;
    try {
      fs.rmSync(clonedDir, { recursive: true });
    } catch (err) {
      consoleLogger.error(`CHROMIUM Unable to delete ${clonedDir} folder in the Chromium data directory. ${err}`);
    }
  }
};
1399
/**
 * Resolves the Playwright device descriptor to use for a scan.
 *
 * Precedence: 'Mobile' / 'iPhone 11' -> 'Samsung Galaxy S9+' -> custom viewport width ->
 * any other custom device name -> 'Desktop Chrome' (the default).
 *
 * @param {string} [deviceChosen] Device preset; only 'Mobile' is treated specially here.
 * @param {string} [customDevice] Device name; underscores are converted to spaces before lookup.
 * @param {number} [viewportWidth] Custom viewport width in pixels (height is fixed at 720).
 * @returns {object|undefined} The device descriptor. The generic custom-device lookup
 *   returns whatever `devices[...]` holds, which is `undefined` for an unknown name.
 */
export const getPlaywrightDeviceDetailsObject = (deviceChosen, customDevice, viewportWidth) => {
    if (deviceChosen === 'Mobile' || customDevice === 'iPhone 11') {
        return devices['iPhone 11'];
    }
    if (customDevice === 'Samsung Galaxy S9+') {
        return devices['Galaxy S9+'];
    }
    if (viewportWidth) {
        // Desktop-like profile with only the width customised.
        return {
            viewport: { width: viewportWidth, height: 720 },
            isMobile: false,
            hasTouch: false,
            userAgent: devices['Desktop Chrome'].userAgent,
            deviceScaleFactor: 1,
            defaultBrowserType: 'chromium',
        };
    }
    if (customDevice) {
        const deviceName = customDevice.replace(/_/g, ' ');
        return devices[deviceName];
    }
    return devices['Desktop Chrome'];
};
1422
/**
 * Picks the label describing which screen/device a scan targets.
 *
 * The first truthy value wins, in this order: `deviceChosen`, `customDevice`,
 * a `CustomWidth_<n>px` label built from `viewportWidth`, and finally 'Desktop'.
 *
 * @param {string} [deviceChosen] Device preset name.
 * @param {string} [customDevice] Custom device name.
 * @param {number} [viewportWidth] Custom viewport width in pixels.
 * @returns {string} The screen label.
 */
export const getScreenToScan = (deviceChosen, customDevice, viewportWidth) => {
    const widthLabel = viewportWidth ? `CustomWidth_${viewportWidth}px` : 'Desktop';
    return deviceChosen || customDevice || widthLabel;
};
1434
/**
 * Submits the form by navigating a real browser to the pre-built submission URL.
 *
 * A persistent browser context is launched, the URL is opened, and the function waits
 * briefly for the network to go idle. Navigation problems are logged, not thrown.
 *
 * @param {string} browserToRun Browser identifier passed to `getPlaywrightLaunchOptions`.
 * @param {string} userDataDirectory User data directory for the persistent context.
 * @param {string} finalUrl Fully built submission URL to open.
 * @returns {Promise<void>}
 */
export const submitFormViaPlaywright = async (browserToRun, userDataDirectory, finalUrl) => {
    const browserContext = await constants.launcher.launchPersistentContext(userDataDirectory, {
        ...getPlaywrightLaunchOptions(browserToRun),
    });
    register(browserContext);
    try {
        // Page creation is inside the try so the context is closed even if it fails.
        const page = await browserContext.newPage();
        await page.goto(finalUrl, {
            timeout: 30000,
            waitUntil: 'commit',
        });
        try {
            await page.waitForLoadState('networkidle', { timeout: 10000 });
        }
        catch {
            // Not reaching networkidle is tolerated; the request has already been committed.
            consoleLogger.info('Unable to detect networkidle');
        }
    }
    catch (error) {
        consoleLogger.error(error);
    }
    finally {
        await browserContext.close();
    }
};
1459
/**
 * Submits scan metadata to the form endpoint described by `formDataFields`
 * (legacy Google Sheets submission).
 *
 * The data is sent as query parameters of a GET request. If the request times out
 * (`ECONNABORTED`) and a browser is available, the same URL is opened through
 * Playwright instead. Any other request error is ignored (best-effort submission).
 *
 * @param {string} browserToRun Browser identifier for the Playwright fallback.
 * @param {string} userDataDirectory User data directory for the Playwright fallback.
 * @param {string} scannedUrl URL actually scanned; sent as the redirect URL when it differs from `entryUrl`.
 * @param {string} entryUrl URL the scan was started with.
 * @param {string} scanType Scan type label.
 * @param {string} email Submitter email.
 * @param {string} name Submitter name.
 * @param {string} scanResultsJson Scan results, already serialised.
 * @param {number} numberOfPagesScanned Count of scanned pages.
 * @param {number} numberOfRedirectsScanned Count of scanned redirects.
 * @param {number} numberOfPagesNotScanned Count of pages that were not scanned.
 * @param {string} metadata Additional metadata, already serialised.
 * @returns {Promise<void>}
 */
export const submitForm = async (browserToRun, userDataDirectory, scannedUrl, entryUrl, scanType, email, name, scanResultsJson, numberOfPagesScanned, numberOfRedirectsScanned, numberOfPagesNotScanned, metadata) => {
    // Legacy code start - Google Sheets submission
    const additionalPageDataJson = JSON.stringify({
        redirectsScanned: numberOfRedirectsScanned,
        pagesNotScanned: numberOfPagesNotScanned,
    });
    const queryPairs = [
        [formDataFields.entryUrlField, entryUrl],
        [formDataFields.scanTypeField, scanType],
        [formDataFields.emailField, email],
        [formDataFields.nameField, name],
        [formDataFields.resultsField, scanResultsJson],
        [formDataFields.numberOfPagesScannedField, numberOfPagesScanned],
        [formDataFields.additionalPageDataField, additionalPageDataJson],
        [formDataFields.metadataField, metadata],
    ];
    if (scannedUrl !== entryUrl) {
        queryPairs.push([formDataFields.redirectUrlField, scannedUrl]);
    }
    // Every value is percent-encoded: URLs, names and emails may contain `&`, `#`, `=`
    // or spaces, which would otherwise truncate or corrupt the query string.
    const queryString = queryPairs
        .map(([field, value]) => `${field}=${encodeURIComponent(value)}`)
        .join('&');
    const finalUrl = `${formDataFields.formUrl}?${queryString}`;
    try {
        await axios.get(finalUrl, { timeout: 2000 });
    }
    catch (error) {
        // Only a timeout triggers the browser fallback; other failures are deliberately ignored.
        if (error.code === 'ECONNABORTED') {
            if (browserToRun || constants.launcher === webkit) {
                await submitFormViaPlaywright(browserToRun, userDataDirectory, finalUrl);
            }
        }
    }
};
1488
+ // Legacy code end - Google Sheets submission
1489
/**
 * Detects the browser's default user agent and registers a cleaned-up copy of it
 * (with "HeadlessChrome" replaced by "Chrome") as a `--user-agent=` launch argument
 * in `constants.launchOptionsArgs`.
 *
 * A temporary persistent context is launched to read `navigator.userAgent`; it is
 * always closed again, even when page creation or evaluation fails.
 *
 * @param {string} browser Browser identifier passed to `getPlaywrightLaunchOptions`.
 * @param {object} playwrightDeviceDetailsObject Device descriptor merged into the launch options.
 * @param {string} userDataDirectory User data directory, used only when CRAWLEE_HEADLESS is '1'.
 * @returns {Promise<void>}
 */
export async function initModifiedUserAgent(browser, playwrightDeviceDetailsObject, userDataDirectory) {
    const isHeadless = process.env.CRAWLEE_HEADLESS === '1';
    // If headless mode is enabled, ensure the headless flag is set.
    if (isHeadless && !constants.launchOptionsArgs.includes('--headless=new')) {
        constants.launchOptionsArgs.push('--headless=new');
    }
    // Later spreads win: device details override the generic launch options.
    const launchOptions = {
        headless: false,
        ...getPlaywrightLaunchOptions(browser),
        ...playwrightDeviceDetailsObject,
    };
    // The supplied user data directory is only used in headless mode; otherwise an
    // empty string is passed as the user data directory.
    const effectiveUserDataDirectory = isHeadless ? userDataDirectory : '';
    const browserContext = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, launchOptions);
    register(browserContext);
    let defaultUA;
    try {
        const page = await browserContext.newPage();
        // Retrieve the default user agent.
        defaultUA = await page.evaluate(() => navigator.userAgent);
    }
    finally {
        // Close the temporary context on every path so a failure does not leak a browser.
        await browserContext.close();
    }
    // Replace "HeadlessChrome" with "Chrome" if present (no-op otherwise).
    const modifiedUA = defaultUA.replace('HeadlessChrome', 'Chrome');
    // Push the modified UA flag into the global launch options.
    constants.launchOptionsArgs.push(`--user-agent=${modifiedUA}`);
}
1520
const cacheProxyInfo = getProxyInfo();
/**
 * Builds the Playwright launch options for the given browser.
 *
 * Starts from a copy of `constants.launchOptionsArgs`, adds headless/mute flags when
 * CRAWLEE_HEADLESS is '1', applies the proxy resolution derived from the proxy info
 * cached at module load, and optionally enables slowMo via OOBEE_SLOWMO.
 *
 * @param {string} browser browser name ("chrome" or "edge", null for chromium, the default Playwright browser)
 * @returns playwright launch options object. For more details: https://playwright.dev/docs/api/class-browsertype#browser-type-launch
 */
export const getPlaywrightLaunchOptions = (browser) => {
    const proxyResolution = proxyInfoToResolution(cacheProxyInfo);
    // Work on a copy so the shared base args are not mutated here.
    const launchArgs = [...constants.launchOptionsArgs];
    if (process.env.CRAWLEE_HEADLESS === '1') {
        for (const flag of ['--headless=new', '--mute-audio']) {
            if (!launchArgs.includes(flag)) {
                launchArgs.push(flag);
            }
        }
    }
    // Manual proxies go through Playwright's `proxy` option; PAC proxies through Chromium flags.
    let proxySettings;
    if (proxyResolution.kind === 'manual') {
        proxySettings = proxyResolution.settings;
    }
    else if (proxyResolution.kind === 'pac') {
        launchArgs.push(`--proxy-pac-url=${proxyResolution.pacUrl}`);
        if (proxyResolution.bypass) {
            launchArgs.push(`--proxy-bypass-list=${proxyResolution.bypass}`);
        }
    }
    const launchOptions = {
        ignoreDefaultArgs: ['--use-mock-keychain', '--headless'],
        args: launchArgs,
        headless: false,
    };
    if (browser) {
        launchOptions.channel = browser;
    }
    if (proxySettings) {
        launchOptions.proxy = proxySettings;
    }
    // Optional slowMo, enabled only for OOBEE_SLOWMO values of at least 1.
    const slowMoSetting = process.env.OOBEE_SLOWMO;
    if (!launchOptions.slowMo && slowMoSetting && Number(slowMoSetting) >= 1) {
        launchOptions.slowMo = Number(slowMoSetting);
        consoleLogger.info(`Enabled browser slowMo with value: ${process.env.OOBEE_SLOWMO}ms`);
    }
    // Edge on Windows should not be headless
    if (browser === BrowserTypes.EDGE && os.platform() === 'win32') {
        launchOptions.headless = false;
    }
    return launchOptions;
};
1571
/**
 * Waits for the first of several "page is ready" signals and resolves with its value:
 * the `load` state, the `networkidle` state, a hard timeout, or an in-page
 * MutationObserver check that resolves once the DOM stops changing.
 *
 * @param {object} page Playwright page (needs `waitForLoadState` and `evaluate`).
 * @param {number} [timeout=10000] Hard timeout in milliseconds; also used as the
 *   in-page observer timeout.
 * @returns {Promise<*>} The value of whichever signal settled first (`undefined` for the
 *   load states and the hard timeout, a status string for the in-page check).
 */
export const waitForPageLoaded = async (page, timeout = 10000) => {
    const OBSERVER_TIMEOUT = timeout; // Ensure observer timeout does not exceed the main timeout
    let hardTimeoutId;
    const hardTimeout = new Promise(resolve => {
        hardTimeoutId = setTimeout(resolve, timeout);
    });
    try {
        return await Promise.race([
            page.waitForLoadState('load'),
            page.waitForLoadState('networkidle'),
            hardTimeout, // Hard timeout as a fallback
            // This callback runs in the browser context, so it must stay self-contained.
            page.evaluate(OBSERVER_TIMEOUT => {
                return new Promise(resolve => {
                    // Skip mutation check for PDFs
                    if (document.contentType === 'application/pdf') {
                        resolve('Skipping DOM mutation check for PDF.');
                        return;
                    }
                    const root = document.documentElement || document.body;
                    if (!(root instanceof Node)) {
                        // Not a valid DOM root, so treat the page as loaded
                        resolve('No valid root to observe; treating as loaded.');
                        return;
                    }
                    let timeout;
                    let mutationCount = 0;
                    const MAX_MUTATIONS = 500;
                    const mutationHash = {};
                    const observer = new MutationObserver(mutationsList => {
                        // Any mutation cancels the pending timer; a 1s quiet period is re-armed below.
                        clearTimeout(timeout);
                        mutationCount++;
                        if (mutationCount > MAX_MUTATIONS) {
                            observer.disconnect();
                            resolve('Too many mutations detected, exiting.');
                            return;
                        }
                        for (const mutation of mutationsList) {
                            if (mutation.target instanceof Element) {
                                // Count per node-name/attribute pair to spot elements that keep changing.
                                for (const attr of Array.from(mutation.target.attributes)) {
                                    const key = `${mutation.target.nodeName}-${attr.name}`;
                                    mutationHash[key] = (mutationHash[key] || 0) + 1;
                                    if (mutationHash[key] >= 10) {
                                        observer.disconnect();
                                        resolve(`Repeated mutation detected for ${key}, exiting.`);
                                        return;
                                    }
                                }
                            }
                        }
                        timeout = setTimeout(() => {
                            observer.disconnect();
                            resolve('DOM stabilized after mutations.');
                        }, 1000);
                    });
                    // Final timeout to avoid infinite waiting
                    timeout = setTimeout(() => {
                        observer.disconnect();
                        resolve('Observer timeout reached, exiting.');
                    }, OBSERVER_TIMEOUT);
                    observer.observe(root, {
                        childList: true,
                        subtree: true,
                        attributes: true,
                    });
                });
            }, OBSERVER_TIMEOUT), // Pass OBSERVER_TIMEOUT dynamically to the browser context
        ]);
    }
    finally {
        // Once the race settles, cancel the fallback timer so it does not stay pending
        // in the Node process for the rest of `timeout`.
        clearTimeout(hardTimeoutId);
    }
};
1635
/**
 * Checks whether a string looks like an http(s) URL: it must start with `http://` or
 * `https://` and contain no spaces or double quotes afterwards.
 *
 * @param {string} urlString Candidate URL.
 * @returns {boolean} True when the string matches the pattern.
 */
function isValidHttpUrl(urlString) {
    return /^(http|https):\/\/[^ "]+$/.test(urlString);
}
1639
/**
 * Heuristically decides whether a string is a local file path rather than a URL.
 *
 * A value counts as a file path when it starts with `/`, `./`, `../`, `.\` or `..\`,
 * starts with a drive letter followed by a colon, or contains a backslash anywhere.
 *
 * @param {string} url Candidate string.
 * @returns {boolean} True when the string looks like a file path.
 */
export const isFilePath = (url) => {
    if (url.startsWith('/')) {
        return true;
    }
    if (/^[A-Z]:/i.test(url) || /\\/.test(url)) {
        return true;
    }
    const relativePrefixes = ['./', '../', '.\\', '..\\'];
    return relativePrefixes.some(prefix => url.startsWith(prefix));
};
1650
/**
 * Converts a `file://` URL into a file system path.
 *
 * @param {string} url A `file://` URL, or any other string.
 * @returns {string} The path for `file://` input; any other input is returned unchanged.
 */
export function convertLocalFileToPath(url) {
    return url.startsWith('file://') ? fileURLToPath(url) : url;
}
1656
/**
 * Converts a path that starts with `/` into a `file://` URL string.
 *
 * @param {string} filePath A path, or any other string.
 * @returns {string} The `file://` URL for input starting with `/`; any other input
 *   (including paths that do not start with `/`) is returned unchanged.
 */
export function convertPathToLocalFile(filePath) {
    if (!filePath.startsWith('/')) {
        return filePath;
    }
    return pathToFileURL(filePath).toString();
}
1662
/**
 * Extracts the decoded path portion of a file URL.
 *
 * The URL is parsed with `url.parse`, and its `path` component (which includes any
 * query string) is URL-decoded and returned without the `file://` prefix.
 *
 * @param {string} fileUrl File URL to convert.
 * @returns {string} The decoded path.
 */
export function convertToFilePath(fileUrl) {
    const { path: encodedPath } = url.parse(fileUrl);
    return decodeURIComponent(encodedPath);
}