@govtechsg/oobee 0.10.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. package/.dockerignore +22 -0
  2. package/.github/pull_request_template.md +11 -0
  3. package/.github/workflows/docker-test.yml +54 -0
  4. package/.github/workflows/image.yml +107 -0
  5. package/.github/workflows/publish.yml +18 -0
  6. package/.idea/modules.xml +8 -0
  7. package/.idea/purple-a11y.iml +9 -0
  8. package/.idea/vcs.xml +6 -0
  9. package/.prettierrc.json +12 -0
  10. package/.vscode/extensions.json +5 -0
  11. package/.vscode/settings.json +10 -0
  12. package/CODE_OF_CONDUCT.md +128 -0
  13. package/DETAILS.md +163 -0
  14. package/Dockerfile +60 -0
  15. package/INSTALLATION.md +146 -0
  16. package/INTEGRATION.md +785 -0
  17. package/LICENSE +22 -0
  18. package/README.md +587 -0
  19. package/SECURITY.md +5 -0
  20. package/__mocks__/mock-report.html +1431 -0
  21. package/__mocks__/mockFunctions.ts +32 -0
  22. package/__mocks__/mockIssues.ts +64 -0
  23. package/__mocks__/mock_all_issues/000000001.json +64 -0
  24. package/__mocks__/mock_all_issues/000000002.json +53 -0
  25. package/__mocks__/mock_all_issues/fake-file.txt +0 -0
  26. package/__tests__/logs.test.ts +25 -0
  27. package/__tests__/mergeAxeResults.test.ts +278 -0
  28. package/__tests__/utils.test.ts +118 -0
  29. package/a11y-scan-results.zip +0 -0
  30. package/eslint.config.js +53 -0
  31. package/exclusions.txt +2 -0
  32. package/gitlab-pipeline-template.yml +54 -0
  33. package/jest.config.js +1 -0
  34. package/package.json +96 -0
  35. package/scripts/copyFiles.js +44 -0
  36. package/scripts/install_oobee_dependencies.cmd +13 -0
  37. package/scripts/install_oobee_dependencies.command +101 -0
  38. package/scripts/install_oobee_dependencies.ps1 +110 -0
  39. package/scripts/oobee_shell.cmd +13 -0
  40. package/scripts/oobee_shell.command +11 -0
  41. package/scripts/oobee_shell.sh +55 -0
  42. package/scripts/oobee_shell_ps.ps1 +54 -0
  43. package/src/cli.ts +401 -0
  44. package/src/combine.ts +240 -0
  45. package/src/constants/__tests__/common.test.ts +44 -0
  46. package/src/constants/cliFunctions.ts +305 -0
  47. package/src/constants/common.ts +1840 -0
  48. package/src/constants/constants.ts +443 -0
  49. package/src/constants/errorMeta.json +319 -0
  50. package/src/constants/itemTypeDescription.ts +11 -0
  51. package/src/constants/oobeeAi.ts +141 -0
  52. package/src/constants/questions.ts +181 -0
  53. package/src/constants/sampleData.ts +187 -0
  54. package/src/crawlers/__tests__/commonCrawlerFunc.test.ts +51 -0
  55. package/src/crawlers/commonCrawlerFunc.ts +656 -0
  56. package/src/crawlers/crawlDomain.ts +877 -0
  57. package/src/crawlers/crawlIntelligentSitemap.ts +156 -0
  58. package/src/crawlers/crawlLocalFile.ts +193 -0
  59. package/src/crawlers/crawlSitemap.ts +356 -0
  60. package/src/crawlers/custom/extractAndGradeText.ts +57 -0
  61. package/src/crawlers/custom/flagUnlabelledClickableElements.ts +964 -0
  62. package/src/crawlers/custom/utils.ts +486 -0
  63. package/src/crawlers/customAxeFunctions.ts +82 -0
  64. package/src/crawlers/pdfScanFunc.ts +468 -0
  65. package/src/crawlers/runCustom.ts +117 -0
  66. package/src/index.ts +173 -0
  67. package/src/logs.ts +66 -0
  68. package/src/mergeAxeResults.ts +964 -0
  69. package/src/npmIndex.ts +284 -0
  70. package/src/screenshotFunc/htmlScreenshotFunc.ts +411 -0
  71. package/src/screenshotFunc/pdfScreenshotFunc.ts +762 -0
  72. package/src/static/ejs/partials/components/categorySelector.ejs +4 -0
  73. package/src/static/ejs/partials/components/categorySelectorDropdown.ejs +57 -0
  74. package/src/static/ejs/partials/components/pagesScannedModal.ejs +70 -0
  75. package/src/static/ejs/partials/components/reportSearch.ejs +47 -0
  76. package/src/static/ejs/partials/components/ruleOffcanvas.ejs +105 -0
  77. package/src/static/ejs/partials/components/scanAbout.ejs +263 -0
  78. package/src/static/ejs/partials/components/screenshotLightbox.ejs +13 -0
  79. package/src/static/ejs/partials/components/summaryScanAbout.ejs +141 -0
  80. package/src/static/ejs/partials/components/summaryScanResults.ejs +16 -0
  81. package/src/static/ejs/partials/components/summaryTable.ejs +20 -0
  82. package/src/static/ejs/partials/components/summaryWcagCompliance.ejs +94 -0
  83. package/src/static/ejs/partials/components/topFive.ejs +6 -0
  84. package/src/static/ejs/partials/components/wcagCompliance.ejs +70 -0
  85. package/src/static/ejs/partials/footer.ejs +21 -0
  86. package/src/static/ejs/partials/header.ejs +230 -0
  87. package/src/static/ejs/partials/main.ejs +40 -0
  88. package/src/static/ejs/partials/scripts/bootstrap.ejs +8 -0
  89. package/src/static/ejs/partials/scripts/categorySelectorDropdownScript.ejs +190 -0
  90. package/src/static/ejs/partials/scripts/categorySummary.ejs +141 -0
  91. package/src/static/ejs/partials/scripts/highlightjs.ejs +335 -0
  92. package/src/static/ejs/partials/scripts/popper.ejs +7 -0
  93. package/src/static/ejs/partials/scripts/reportSearch.ejs +248 -0
  94. package/src/static/ejs/partials/scripts/ruleOffcanvas.ejs +801 -0
  95. package/src/static/ejs/partials/scripts/screenshotLightbox.ejs +71 -0
  96. package/src/static/ejs/partials/scripts/summaryScanResults.ejs +14 -0
  97. package/src/static/ejs/partials/scripts/summaryTable.ejs +78 -0
  98. package/src/static/ejs/partials/scripts/utils.ejs +441 -0
  99. package/src/static/ejs/partials/styles/bootstrap.ejs +12375 -0
  100. package/src/static/ejs/partials/styles/highlightjs.ejs +54 -0
  101. package/src/static/ejs/partials/styles/styles.ejs +1843 -0
  102. package/src/static/ejs/partials/styles/summaryBootstrap.ejs +12458 -0
  103. package/src/static/ejs/partials/summaryHeader.ejs +70 -0
  104. package/src/static/ejs/partials/summaryMain.ejs +75 -0
  105. package/src/static/ejs/report.ejs +420 -0
  106. package/src/static/ejs/summary.ejs +47 -0
  107. package/src/static/mustache/.prettierrc +4 -0
  108. package/src/static/mustache/Attention Deficit.mustache +11 -0
  109. package/src/static/mustache/Blind.mustache +11 -0
  110. package/src/static/mustache/Cognitive.mustache +7 -0
  111. package/src/static/mustache/Colorblindness.mustache +20 -0
  112. package/src/static/mustache/Deaf.mustache +12 -0
  113. package/src/static/mustache/Deafblind.mustache +7 -0
  114. package/src/static/mustache/Dyslexia.mustache +14 -0
  115. package/src/static/mustache/Low Vision.mustache +7 -0
  116. package/src/static/mustache/Mobility.mustache +15 -0
  117. package/src/static/mustache/Sighted Keyboard Users.mustache +42 -0
  118. package/src/static/mustache/report.mustache +1709 -0
  119. package/src/types/print-message.d.ts +28 -0
  120. package/src/types/types.ts +46 -0
  121. package/src/types/xpath-to-css.d.ts +3 -0
  122. package/src/utils.ts +332 -0
  123. package/tsconfig.json +15 -0
package/src/crawlers/crawlDomain.ts
@@ -0,0 +1,877 @@
+ import crawlee, { EnqueueStrategy } from 'crawlee';
+ import fs from 'fs';
+ import type { BrowserContext, ElementHandle, Frame, Page } from 'playwright';
+ import type { EnqueueLinksOptions, RequestOptions } from 'crawlee';
+ import axios from 'axios';
+ import { fileTypeFromBuffer } from 'file-type';
+ import mime from 'mime-types';
+ import https from 'https';
+ import type { BatchAddRequestsResult } from '@crawlee/types';
+ import {
+   createCrawleeSubFolders,
+   preNavigationHooks,
+   runAxeScript,
+   isUrlPdf,
+ } from './commonCrawlerFunc.js';
+ import constants, {
+   UrlsCrawled,
+   blackListedFileExtensions,
+   guiInfoStatusTypes,
+   cssQuerySelectors,
+   RuleFlags,
+ } from '../constants/constants.js';
+ import {
+   getPlaywrightLaunchOptions,
+   isBlacklistedFileExtensions,
+   isSkippedUrl,
+   isDisallowedInRobotsTxt,
+   getUrlsFromRobotsTxt,
+   getBlackListedPatterns,
+   urlWithoutAuth,
+   waitForPageLoaded,
+ } from '../constants/common.js';
+ import { areLinksEqual, isFollowStrategy } from '../utils.js';
+ import {
+   handlePdfDownload,
+   runPdfScan,
+   mapPdfScanResults,
+   doPdfScreenshots,
+ } from './pdfScanFunc.js';
+ import { silentLogger, guiInfoLog } from '../logs.js';
+ import { ViewportSettingsClass } from '../combine.js';
+
+ const isBlacklisted = (url: string) => {
+   const blacklistedPatterns = getBlackListedPatterns(null);
+   if (!blacklistedPatterns) {
+     return false;
+   }
+   try {
+     const parsedUrl = new URL(url);
+
+     return blacklistedPatterns.some(
+       pattern => new RegExp(pattern).test(parsedUrl.hostname) || new RegExp(pattern).test(url),
+     );
+   } catch (error) {
+     console.error(`Error parsing URL: ${url}`, error);
+     return false;
+   }
+ };
+
+ const crawlDomain = async ({
+   url,
+   randomToken,
+   host: _host,
+   viewportSettings,
+   maxRequestsPerCrawl,
+   browser,
+   userDataDirectory,
+   strategy,
+   specifiedMaxConcurrency,
+   fileTypes,
+   blacklistedPatterns,
+   includeScreenshots,
+   followRobots,
+   extraHTTPHeaders,
+   safeMode = false,
+   fromCrawlIntelligentSitemap = false,
+   datasetFromIntelligent = null,
+   urlsCrawledFromIntelligent = null,
+   ruleset = [],
+ }: {
+   url: string;
+   randomToken: string;
+   host: string;
+   viewportSettings: ViewportSettingsClass;
+   maxRequestsPerCrawl: number;
+   browser: string;
+   userDataDirectory: string;
+   strategy: EnqueueStrategy;
+   specifiedMaxConcurrency: number;
+   fileTypes: string;
+   blacklistedPatterns: string[];
+   includeScreenshots: boolean;
+   followRobots: boolean;
+   extraHTTPHeaders: Record<string, string>;
+   safeMode?: boolean;
+   fromCrawlIntelligentSitemap?: boolean;
+   datasetFromIntelligent?: crawlee.Dataset;
+   urlsCrawledFromIntelligent?: UrlsCrawled;
+   ruleset?: RuleFlags[];
+ }) => {
+   let dataset: crawlee.Dataset;
+   let urlsCrawled: UrlsCrawled;
+   let requestQueue: crawlee.RequestQueue;
+
+   if (fromCrawlIntelligentSitemap) {
+     dataset = datasetFromIntelligent;
+     urlsCrawled = urlsCrawledFromIntelligent;
+   } else {
+     ({ dataset } = await createCrawleeSubFolders(randomToken));
+     urlsCrawled = { ...constants.urlsCrawledObj };
+   }
+
+   ({ requestQueue } = await createCrawleeSubFolders(randomToken));
+
+   if (!fs.existsSync(randomToken)) {
+     fs.mkdirSync(randomToken);
+   }
+
+   const pdfDownloads = [];
+   const uuidToPdfMapping = {};
+   const isScanHtml = ['all', 'html-only'].includes(fileTypes);
+   const isScanPdfs = ['all', 'pdf-only'].includes(fileTypes);
+   const { maxConcurrency } = constants;
+   const { playwrightDeviceDetailsObject } = viewportSettings;
+   const isBlacklistedUrl = isBlacklisted(url);
+
+   const httpsAgent = new https.Agent({ rejectUnauthorized: false });
+
+   if (isBlacklistedUrl) {
+     guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+       numScanned: urlsCrawled.scanned.length,
+       urlScanned: url,
+     });
+     return;
+   }
+
+   // Boolean to omit axe scan for basic auth URL
+   let isBasicAuth = false;
+   let authHeader = '';
+
+   // Test for basic auth and add an auth header if credentials exist
+   const parsedUrl = new URL(url);
+   let username: string;
+   let password: string;
+   if (parsedUrl.username !== '' && parsedUrl.password !== '') {
+     isBasicAuth = true;
+     username = decodeURIComponent(parsedUrl.username);
+     password = decodeURIComponent(parsedUrl.password);
+
+     // Create auth header
+     authHeader = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
+
+     // Remove credentials from parsedUrl
+     parsedUrl.username = '';
+     parsedUrl.password = '';
+     // Send the finalUrl without credentials by setting the auth header instead
+     const finalUrl = parsedUrl.toString();
+
+     await requestQueue.addRequest({
+       url: finalUrl,
+       skipNavigation: isUrlPdf(finalUrl),
+       headers: {
+         Authorization: authHeader,
+       },
+       label: finalUrl,
+     });
+   } else {
+     await requestQueue.addRequest({
+       url,
+       skipNavigation: isUrlPdf(url),
+       label: url,
+     });
+   }
+
+   const httpHeadCache = new Map<string, boolean>();
+   const isProcessibleUrl = async (url: string): Promise<boolean> => {
+     if (httpHeadCache.has(url)) {
+       silentLogger.info('cache hit', url, httpHeadCache.get(url));
+       return false; // return false to avoid processing the url again
+     }
+
+     try {
+       // Send a HEAD request to check headers without downloading the file
+       const headResponse = await axios.head(url, {
+         headers: { Authorization: authHeader },
+         httpsAgent,
+       });
+       const contentType = headResponse.headers['content-type'] || '';
+       const contentDisposition = headResponse.headers['content-disposition'] || '';
+
+       // Check if the response suggests it's a downloadable file based on Content-Disposition header
+       if (contentDisposition.includes('attachment')) {
+         silentLogger.info(`Skipping URL due to attachment header: ${url}`);
+         httpHeadCache.set(url, false);
+         return false;
+       }
+
+       // Check if the MIME type suggests it's a downloadable file
+       if (contentType.startsWith('application/') || contentType.includes('octet-stream')) {
+         silentLogger.info(`Skipping potential downloadable file: ${contentType} at URL ${url}`);
+         httpHeadCache.set(url, false);
+         return false;
+       }
+
+       // Use the mime-types library to ensure it's processible content (e.g., HTML or plain text)
+       const mimeType = mime.lookup(contentType);
+       if (mimeType && !mimeType.startsWith('text/html') && !mimeType.startsWith('text/')) {
+         silentLogger.info(`Detected non-processible MIME type: ${mimeType} at URL ${url}`);
+         httpHeadCache.set(url, false);
+         return false;
+       }
+
+       // Additional check for zip files by their magic number (PK\x03\x04)
+       if (url.endsWith('.zip')) {
+         silentLogger.info(`Checking for zip file magic number at URL ${url}`);
+
+         // Download the first few bytes of the file to check for the magic number
+         const byteResponse = await axios.get(url, {
+           headers: { Range: 'bytes=0-3', Authorization: authHeader },
+           responseType: 'arraybuffer',
+           httpsAgent,
+         });
+
+         const magicNumber = byteResponse.data.toString('hex');
+         if (magicNumber === '504b0304') {
+           silentLogger.info(`Skipping zip file at URL ${url}`);
+           httpHeadCache.set(url, false);
+           return false;
+         }
+         silentLogger.info(
+           `Not skipping ${url}, magic number does not match ZIP file: ${magicNumber}`,
+         );
+       }
+
+       // If you want more robust checks, you can download a portion of the content and use the file-type package to detect file types by content
+       const response = await axios.get(url, {
+         headers: { Range: 'bytes=0-4100', Authorization: authHeader },
+         responseType: 'arraybuffer',
+         httpsAgent,
+       });
+
+       const fileType = await fileTypeFromBuffer(response.data);
+       if (
+         fileType &&
+         !fileType.mime.startsWith('text/html') &&
+         !fileType.mime.startsWith('text/')
+       ) {
+         silentLogger.info(`Detected downloadable file of type ${fileType.mime} at URL ${url}`);
+         httpHeadCache.set(url, false);
+         return false;
+       }
+     } catch (e) {
+       // silentLogger.error(`Error checking the MIME type of ${url}: ${e.message}`);
+       // If an error occurs (e.g., a network issue), assume the URL is processible
+       httpHeadCache.set(url, true);
+       return true;
+     }
+
+     // If none of the conditions to skip are met, allow processing of the URL
+     httpHeadCache.set(url, true);
+     return true;
+   };
+
+   const enqueueProcess = async (
+     page: Page,
+     enqueueLinks: (options: EnqueueLinksOptions) => Promise<BatchAddRequestsResult>,
+     browserContext: BrowserContext,
+   ) => {
+     try {
+       await enqueueLinks({
+         // the selector matches anchor elements with an href that does not contain # and does not start with mailto:
+         selector: 'a:not(a[href*="#"],a[href^="mailto:"])',
+         strategy,
+         requestQueue,
+         transformRequestFunction: (req: RequestOptions): RequestOptions | null => {
+           try {
+             req.url = req.url.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
+           } catch (e) {
+             silentLogger.error(e);
+           }
+           if (urlsCrawled.scanned.some(item => item.url === req.url)) {
+             req.skipNavigation = true;
+           }
+           if (isDisallowedInRobotsTxt(req.url)) return null;
+           if (isUrlPdf(req.url)) {
+             // playwright headless mode does not support navigation to pdf documents
+             req.skipNavigation = true;
+           }
+           req.label = req.url;
+
+           return req;
+         },
+       });
+
+       // If the safeMode flag is enabled, skip enqueueLinksByClickingElements
+       if (!safeMode) {
+         // Try/catch is necessary as clicking links is best effort; it may open new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
+         try {
+           await customEnqueueLinksByClickingElements(page, browserContext);
+         } catch (e) {
+           silentLogger.info(e);
+         }
+       }
+     } catch {
+       // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
+       // Handles the browser page object being closed.
+     }
+   };
+
+   const customEnqueueLinksByClickingElements = async (
+     page: Page,
+     browserContext: BrowserContext,
+   ): Promise<void> => {
+     const initialPageUrl: string = page.url().toString();
+
+     const isExcluded = (newPageUrl: string): boolean => {
+       const isAlreadyScanned: boolean = urlsCrawled.scanned.some(item => item.url === newPageUrl);
+       const isBlacklistedUrl: boolean = isBlacklisted(newPageUrl);
+       const isNotFollowStrategy: boolean = !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
+       return isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy;
+     };
+     const setPageListeners = (page: Page): void => {
+       // event listener to handle new page popups upon button click
+       page.on('popup', async (newPage: Page) => {
+         try {
+           if (newPage.url() != initialPageUrl && !isExcluded(newPage.url())) {
+             const newPageUrl: string = newPage.url().replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
+             await requestQueue.addRequest({
+               url: newPageUrl,
+               skipNavigation: isUrlPdf(newPage.url()),
+               label: newPageUrl,
+             });
+           } else {
+             try {
+               await newPage.close();
+             } catch {
+               // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
+               // Handles the browser page object being closed.
+             }
+           }
+         } catch {
+           // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
+           // Handles the browser page object being closed.
+         }
+       });
+
+       // event listener to handle navigation to new url within same page upon element click
+       page.on('framenavigated', async (newFrame: Frame) => {
+         try {
+           if (
+             newFrame.url() !== initialPageUrl &&
+             !isExcluded(newFrame.url()) &&
+             !(newFrame.url() == 'about:blank')
+           ) {
+             const newFrameUrl: string = newFrame.url().replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
+             await requestQueue.addRequest({
+               url: newFrameUrl,
+               skipNavigation: isUrlPdf(newFrame.url()),
+               label: newFrameUrl,
+             });
+           }
+         } catch {
+           // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
+           // Handles the browser page object being closed.
+         }
+       });
+     };
+     setPageListeners(page);
+     let currentElementIndex: number = 0;
+     let isAllElementsHandled: boolean = false;
+     while (!isAllElementsHandled) {
+       try {
+         // navigate back to the initial page if clicking on an element previously caused it to navigate to a new url
+         if (page.url() != initialPageUrl) {
+           try {
+             await page.close();
+           } catch {
+             // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
+             // Handles the browser page object being closed.
+           }
+           page = await browserContext.newPage();
+           await page.goto(initialPageUrl, {
+             waitUntil: 'domcontentloaded',
+           });
+           setPageListeners(page);
+         }
+         const selectedElementsString = cssQuerySelectors.join(', ');
+         const selectedElements: ElementHandle<SVGElement | HTMLElement>[] =
+           await page.$$(selectedElementsString);
+         // edge case where there might be elements on the page that appear intermittently
+         if (currentElementIndex + 1 > selectedElements.length || !selectedElements) {
+           break;
+         }
+         // handle the last element in selectedElements
+         if (currentElementIndex + 1 === selectedElements.length) {
+           isAllElementsHandled = true;
+         }
+         const element: ElementHandle<SVGElement | HTMLElement> =
+           selectedElements[currentElementIndex];
+         currentElementIndex += 1;
+         let newUrlFoundInElement: string = null;
+         if (await element.isVisible()) {
+           // Find url in html elements without clicking them
+           await page
+             .evaluate(element => {
+               // find href attribute
+               const hrefUrl: string = element.getAttribute('href');
+
+               // find url in the data-path attribute
+               const dataPathUrl: string = element.getAttribute('data-path');
+
+               return hrefUrl || dataPathUrl;
+             }, element)
+             .then(result => {
+               if (result) {
+                 newUrlFoundInElement = result;
+                 const pageUrl: URL = new URL(page.url());
+                 const baseUrl: string = `${pageUrl.protocol}//${pageUrl.host}`;
+                 let absoluteUrl: URL;
+                 // Construct absolute URL using base URL
+                 try {
+                   // Check if newUrlFoundInElement is a valid absolute URL
+                   absoluteUrl = new URL(newUrlFoundInElement);
+                 } catch (e) {
+                   // If it's not a valid URL, treat it as a relative URL
+                   absoluteUrl = new URL(newUrlFoundInElement, baseUrl);
+                 }
+                 newUrlFoundInElement = absoluteUrl.href;
+               }
+             });
+           if (newUrlFoundInElement && !isExcluded(newUrlFoundInElement)) {
+             const newUrlFoundInElementUrl: string = newUrlFoundInElement.replace(
+               /(?<=&|\?)utm_.*?(&|$)/gim,
+               '',
+             );
+
+             await requestQueue.addRequest({
+               url: newUrlFoundInElementUrl,
+               skipNavigation: isUrlPdf(newUrlFoundInElement),
+               label: newUrlFoundInElementUrl,
+             });
+           } else if (!newUrlFoundInElement) {
+             try {
+               // Find url in html elements by manually clicking them. New page navigation/popups will be handled by the event listeners above
+               await element.click({ force: true });
+               await page.waitForTimeout(1000); // Add a delay of 1 second between each element click
+             } catch {
+               // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
+               // Handles the browser page object being closed.
+             }
+           }
+         }
+       } catch {
+         // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
+         // Handles the browser page object being closed.
+       }
+     }
+   };
+
+   let isAbortingScanNow = false;
+
+   let userDataDir = '';
+   if (userDataDirectory) {
+     userDataDir = process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '';
+   }
+
+   const crawler = new crawlee.PlaywrightCrawler({
+     launchContext: {
+       launcher: constants.launcher,
+       launchOptions: getPlaywrightLaunchOptions(browser),
+       // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
+       userDataDir,
+     },
+     retryOnBlocked: true,
+     browserPoolOptions: {
+       useFingerprints: false,
+       preLaunchHooks: [
+         async (_pageId, launchContext) => {
+           launchContext.launchOptions = {
+             ...launchContext.launchOptions,
+             bypassCSP: true,
+             ignoreHTTPSErrors: true,
+             ...playwrightDeviceDetailsObject,
+           };
+         },
+       ],
+     },
+     requestQueue,
+     postNavigationHooks: [
+       async crawlingContext => {
+         const { page, request } = crawlingContext;
+
+         request.skipNavigation = true;
+
+         await page.evaluate(() => {
+           return new Promise(resolve => {
+             let timeout;
+             let mutationCount = 0;
+             const MAX_MUTATIONS = 100;
+             const MAX_SAME_MUTATION_LIMIT = 10;
+             const mutationHash = {};
+
+             const observer = new MutationObserver(mutationsList => {
+               clearTimeout(timeout);
+
+               mutationCount += 1;
+
+               if (mutationCount > MAX_MUTATIONS) {
+                 observer.disconnect();
+                 resolve('Too many mutations detected');
+               }
+
+               // To handle the scenario where DOM elements are constantly changing and the observer would otherwise never exit
+               mutationsList.forEach(mutation => {
+                 let mutationKey;
+
+                 if (mutation.target instanceof Element) {
+                   Array.from(mutation.target.attributes).forEach(attr => {
+                     mutationKey = `${mutation.target.nodeName}-${attr.name}`;
+
+                     if (mutationKey) {
+                       if (!mutationHash[mutationKey]) {
+                         mutationHash[mutationKey] = 1;
+                       } else {
+                         mutationHash[mutationKey]++;
+                       }
+
+                       if (mutationHash[mutationKey] >= MAX_SAME_MUTATION_LIMIT) {
+                         observer.disconnect();
+                         resolve(`Repeated mutation detected for ${mutationKey}`);
+                       }
+                     }
+                   });
+                 }
+               });
+
+               timeout = setTimeout(() => {
+                 observer.disconnect();
+                 resolve('DOM stabilized after mutations.');
+               }, 1000);
+             });
+
+             timeout = setTimeout(() => {
+               observer.disconnect();
+               resolve('No mutations detected, exit from idle state');
+             }, 1000);
+
+             observer.observe(document, { childList: true, subtree: true, attributes: true });
+           });
+         });
+
+         let finalUrl = page.url();
+         const requestLabelUrl = request.label;
+
+         // to handle the scenario where the redirected link is not within the website being scanned
+         const isLoadedUrlFollowStrategy = isFollowStrategy(finalUrl, requestLabelUrl, strategy);
+         if (!isLoadedUrlFollowStrategy) {
+           finalUrl = requestLabelUrl;
+         }
+
+         const isRedirected = !areLinksEqual(finalUrl, requestLabelUrl);
+         if (isRedirected) {
+           await requestQueue.addRequest({ url: finalUrl, label: finalUrl });
+         } else {
+           request.skipNavigation = false;
+         }
+       },
+     ],
+     preNavigationHooks: isBasicAuth
+       ? [
+           async ({ page, request }) => {
+             await page.setExtraHTTPHeaders({
+               Authorization: authHeader,
+               ...extraHTTPHeaders,
+             });
+             const processible = await isProcessibleUrl(request.url);
+             if (!processible) {
+               request.skipNavigation = true;
+               return null;
+             }
+           },
+         ]
+       : [
+           async (crawlingContext, gotoOptions) => {
+             const { page, request } = crawlingContext;
+
+             await page.setExtraHTTPHeaders({
+               ...extraHTTPHeaders,
+             });
+
+             Object.assign(gotoOptions, {
+               waitUntil: 'networkidle',
+               timeout: 30000,
+             });
+
+             const processible = await isProcessibleUrl(request.url);
+             if (!processible) {
+               request.skipNavigation = true;
+               return null;
+             }
+           },
+         ],
+     requestHandlerTimeoutSecs: 90, // Allow each page up to 90 seconds to be processed, up from the default of 60
+     requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
+       const browserContext: BrowserContext = page.context();
+       try {
+         // Set basic auth header if needed
+         if (isBasicAuth) {
+           await page.setExtraHTTPHeaders({
+             Authorization: authHeader,
+           });
+           const currentUrl = new URL(request.url);
+           currentUrl.username = username;
+           currentUrl.password = password;
+           request.url = currentUrl.href;
+         }
+
+         await waitForPageLoaded(page, 10000);
+         let actualUrl = request.url;
+
+         if (page.url() !== 'about:blank') {
+           actualUrl = page.url();
+         }
+
+         if (isBlacklisted(actualUrl) || (isUrlPdf(actualUrl) && !isScanPdfs)) {
+           guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+             numScanned: urlsCrawled.scanned.length,
+             urlScanned: actualUrl,
+           });
+           return;
+         }
+
+         if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
+           isAbortingScanNow = true;
+           crawler.autoscaledPool.abort();
+           return;
+         }
+
+         // if the URL has already been scanned
+         if (urlsCrawled.scanned.some(item => item.url === request.url)) {
+           // await enqueueProcess(page, enqueueLinks, browserContext);
+           return;
+         }
+
+         if (isDisallowedInRobotsTxt(request.url)) {
+           await enqueueProcess(page, enqueueLinks, browserContext);
+           return;
+         }
+
+         // handle pdfs
+         if (request.skipNavigation && isUrlPdf(actualUrl)) {
+           if (!isScanPdfs) {
+             guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+               numScanned: urlsCrawled.scanned.length,
+               urlScanned: request.url,
+             });
+             urlsCrawled.blacklisted.push(request.url);
+             return;
+           }
+           const { pdfFileName, url } = handlePdfDownload(
+             randomToken,
+             pdfDownloads,
+             request,
+             sendRequest,
+             urlsCrawled,
+           );
+
+           uuidToPdfMapping[pdfFileName] = url;
+           return;
+         }
+
+         const resHeaders = response ? response.headers() : {}; // Safely access response headers
+         const contentType = resHeaders['content-type'] || ''; // Ensure contentType is defined
+
+         // Skip non-HTML and non-PDF URLs
+         if (!contentType.includes('text/html') && !contentType.includes('application/pdf')) {
+           guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+             numScanned: urlsCrawled.scanned.length,
+             urlScanned: request.url,
+           });
+           urlsCrawled.blacklisted.push(request.url);
+           return;
+         }
+
+         if (isBlacklistedFileExtensions(actualUrl, blackListedFileExtensions)) {
+           guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+             numScanned: urlsCrawled.scanned.length,
+             urlScanned: request.url,
+           });
+           urlsCrawled.blacklisted.push(request.url);
+           return;
+         }
+
+         if (blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
+           urlsCrawled.userExcluded.push(request.url);
+           await enqueueProcess(page, enqueueLinks, browserContext);
+           return;
+         }
+
+         if (response.status() === 403) {
+           guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+             numScanned: urlsCrawled.scanned.length,
+             urlScanned: request.url,
+           });
+           urlsCrawled.forbidden.push(request.url);
+           return;
+         }
+
+         if (response.status() !== 200) {
+           guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+             numScanned: urlsCrawled.scanned.length,
+             urlScanned: request.url,
+           });
+           urlsCrawled.invalid.push(request.url);
+           return;
+         }
+
+         if (isScanHtml) {
+           // For deduplication, if the URL is redirected, we want to store the original URL and the redirected URL (actualUrl)
+           const isRedirected = !areLinksEqual(request.loadedUrl, request.url);
+
+           // check if the redirected link follows the strategy (same-domain/same-hostname)
+           const isLoadedUrlFollowStrategy = isFollowStrategy(
+             request.loadedUrl,
+             request.url,
+             strategy,
+           );
+           if (isRedirected && !isLoadedUrlFollowStrategy) {
+             urlsCrawled.notScannedRedirects.push({
+               fromUrl: request.url,
+               toUrl: request.loadedUrl, // i.e. actualUrl
+             });
+             return;
+           }
+
+           const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });
+
+           if (isRedirected) {
+             const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
+               item => (item.actualUrl || item.url) === request.loadedUrl,
+             );
+
+             if (isLoadedUrlInCrawledUrls) {
+               urlsCrawled.notScannedRedirects.push({
+                 fromUrl: request.url,
+                 toUrl: request.loadedUrl, // i.e. actualUrl
+               });
+               return;
+             }
+
+             // One more check on whether scanned pages have reached the limit, since multiple instances of the handler run concurrently
+             if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
+               guiInfoLog(guiInfoStatusTypes.SCANNED, {
+                 numScanned: urlsCrawled.scanned.length,
+                 urlScanned: request.url,
+               });
+
+               urlsCrawled.scanned.push({
+                 url: urlWithoutAuth(request.url),
+                 pageTitle: results.pageTitle,
+                 actualUrl: request.loadedUrl, // i.e. actualUrl
+               });
+
+               urlsCrawled.scannedRedirects.push({
+                 fromUrl: urlWithoutAuth(request.url),
+                 toUrl: request.loadedUrl, // i.e. actualUrl
+               });
+
+               results.url = request.url;
+               results.actualUrl = request.loadedUrl;
+               await dataset.pushData(results);
+             }
+           } else {
+             // One more check on whether scanned pages have reached the limit, since multiple instances of the handler run concurrently
+             if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
+               guiInfoLog(guiInfoStatusTypes.SCANNED, {
+                 numScanned: urlsCrawled.scanned.length,
+                 urlScanned: urlWithoutAuth(request.url),
+               });
+               urlsCrawled.scanned.push({
+                 url: urlWithoutAuth(request.url),
+                 actualUrl: request.url,
+                 pageTitle: results.pageTitle,
+               });
+               await dataset.pushData(results);
+             }
+           }
+         } else {
+           guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+             numScanned: urlsCrawled.scanned.length,
+             urlScanned: request.url,
+           });
+           urlsCrawled.blacklisted.push(request.url);
+         }
+
+         if (followRobots) await getUrlsFromRobotsTxt(request.url, browser);
+         await enqueueProcess(page, enqueueLinks, browserContext);
+       } catch (e) {
+         try {
+           if (!e.message.includes('page.evaluate')) {
+             silentLogger.info(e);
+             guiInfoLog(guiInfoStatusTypes.ERROR, {
+               numScanned: urlsCrawled.scanned.length,
+               urlScanned: request.url,
+             });
+
+             page = await browserContext.newPage();
+             await page.goto(request.url);
+
+             await page.route('**/*', async route => {
+               const interceptedRequest = route.request();
+               if (interceptedRequest.resourceType() === 'document') {
+                 const interceptedRequestUrl = interceptedRequest
+                   .url()
+                   .replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
+                 await requestQueue.addRequest({
+                   url: interceptedRequestUrl,
+                   skipNavigation: isUrlPdf(interceptedRequest.url()),
+                   label: interceptedRequestUrl,
+                 });
+               }
+             });
+           }
+         } catch {
+           // Do nothing since the error will be pushed
+         }
+
+         // when the maximum number of pages has been scanned, the scan aborts and all relevant pages still open are closed instantly.
+         // a browser close error will then be flagged. Since this is intended behaviour, that error is excluded.
+         if (!isAbortingScanNow) {
+           urlsCrawled.error.push({ url: request.url });
+         }
+       }
+     },
+     failedRequestHandler: async ({ request }) => {
+       guiInfoLog(guiInfoStatusTypes.ERROR, {
+         numScanned: urlsCrawled.scanned.length,
+         urlScanned: request.url,
+       });
+       urlsCrawled.error.push({ url: request.url });
+       crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
+     },
+     maxRequestsPerCrawl: Infinity,
+     maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+   });
+
+   await crawler.run();
+
+   if (pdfDownloads.length > 0) {
+     // wait for pdf downloads to complete
+     await Promise.all(pdfDownloads);
+
+     // scan and process pdf documents
+     await runPdfScan(randomToken);
+
+     // transform result format
+     const pdfResults = await mapPdfScanResults(randomToken, uuidToPdfMapping);
+
+     // get screenshots from pdf docs
+     if (includeScreenshots) {
+       await Promise.all(
+         pdfResults.map(async result => await doPdfScreenshots(randomToken, result)),
+       );
+     }
+
+     // push results for each pdf document to key value store
+     await Promise.all(pdfResults.map(result => dataset.pushData(result)));
+   }
+
+   if (!fromCrawlIntelligentSitemap) {
+     guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
+   }
+
+   return urlsCrawled;
+ };
+
+ export default crawlDomain;
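
The hunk above defines and exports a single crawlDomain function, which the package's own CLI drives. A minimal, hypothetical direct invocation is sketched below for orientation only; the import path, the option values, and the simplified viewportSettings object are illustrative assumptions, not values taken from this diff.

import { EnqueueStrategy } from 'crawlee';
import crawlDomain from './crawlers/crawlDomain.js'; // assumed relative path

// Hypothetical invocation; all values below are illustrative placeholders.
const urlsCrawled = await crawlDomain({
  url: 'https://example.com',
  randomToken: 'scan_2024', // also used as the working/results directory name
  host: 'example.com',
  viewportSettings: { playwrightDeviceDetailsObject: {} } as any, // simplified stand-in for ViewportSettingsClass
  maxRequestsPerCrawl: 100,
  browser: 'chromium',
  userDataDirectory: '',
  strategy: EnqueueStrategy.SameDomain, // follow links on the same domain only
  specifiedMaxConcurrency: 5,
  fileTypes: 'html-only', // 'all' | 'html-only' | 'pdf-only'
  blacklistedPatterns: null, // no user-excluded URL patterns
  includeScreenshots: false,
  followRobots: false,
  extraHTTPHeaders: {},
});

console.log(`Scanned ${urlsCrawled?.scanned.length ?? 0} pages`);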