@govtechsg/oobee 0.10.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. package/.dockerignore +22 -0
  2. package/.github/pull_request_template.md +11 -0
  3. package/.github/workflows/docker-test.yml +54 -0
  4. package/.github/workflows/image.yml +107 -0
  5. package/.github/workflows/publish.yml +18 -0
  6. package/.idea/modules.xml +8 -0
  7. package/.idea/purple-a11y.iml +9 -0
  8. package/.idea/vcs.xml +6 -0
  9. package/.prettierrc.json +12 -0
  10. package/.vscode/extensions.json +5 -0
  11. package/.vscode/settings.json +10 -0
  12. package/CODE_OF_CONDUCT.md +128 -0
  13. package/DETAILS.md +163 -0
  14. package/Dockerfile +60 -0
  15. package/INSTALLATION.md +146 -0
  16. package/INTEGRATION.md +785 -0
  17. package/LICENSE +22 -0
  18. package/README.md +587 -0
  19. package/SECURITY.md +5 -0
  20. package/__mocks__/mock-report.html +1431 -0
  21. package/__mocks__/mockFunctions.ts +32 -0
  22. package/__mocks__/mockIssues.ts +64 -0
  23. package/__mocks__/mock_all_issues/000000001.json +64 -0
  24. package/__mocks__/mock_all_issues/000000002.json +53 -0
  25. package/__mocks__/mock_all_issues/fake-file.txt +0 -0
  26. package/__tests__/logs.test.ts +25 -0
  27. package/__tests__/mergeAxeResults.test.ts +278 -0
  28. package/__tests__/utils.test.ts +118 -0
  29. package/a11y-scan-results.zip +0 -0
  30. package/eslint.config.js +53 -0
  31. package/exclusions.txt +2 -0
  32. package/gitlab-pipeline-template.yml +54 -0
  33. package/jest.config.js +1 -0
  34. package/package.json +96 -0
  35. package/scripts/copyFiles.js +44 -0
  36. package/scripts/install_oobee_dependencies.cmd +13 -0
  37. package/scripts/install_oobee_dependencies.command +101 -0
  38. package/scripts/install_oobee_dependencies.ps1 +110 -0
  39. package/scripts/oobee_shell.cmd +13 -0
  40. package/scripts/oobee_shell.command +11 -0
  41. package/scripts/oobee_shell.sh +55 -0
  42. package/scripts/oobee_shell_ps.ps1 +54 -0
  43. package/src/cli.ts +401 -0
  44. package/src/combine.ts +240 -0
  45. package/src/constants/__tests__/common.test.ts +44 -0
  46. package/src/constants/cliFunctions.ts +305 -0
  47. package/src/constants/common.ts +1840 -0
  48. package/src/constants/constants.ts +443 -0
  49. package/src/constants/errorMeta.json +319 -0
  50. package/src/constants/itemTypeDescription.ts +11 -0
  51. package/src/constants/oobeeAi.ts +141 -0
  52. package/src/constants/questions.ts +181 -0
  53. package/src/constants/sampleData.ts +187 -0
  54. package/src/crawlers/__tests__/commonCrawlerFunc.test.ts +51 -0
  55. package/src/crawlers/commonCrawlerFunc.ts +656 -0
  56. package/src/crawlers/crawlDomain.ts +877 -0
  57. package/src/crawlers/crawlIntelligentSitemap.ts +156 -0
  58. package/src/crawlers/crawlLocalFile.ts +193 -0
  59. package/src/crawlers/crawlSitemap.ts +356 -0
  60. package/src/crawlers/custom/extractAndGradeText.ts +57 -0
  61. package/src/crawlers/custom/flagUnlabelledClickableElements.ts +964 -0
  62. package/src/crawlers/custom/utils.ts +486 -0
  63. package/src/crawlers/customAxeFunctions.ts +82 -0
  64. package/src/crawlers/pdfScanFunc.ts +468 -0
  65. package/src/crawlers/runCustom.ts +117 -0
  66. package/src/index.ts +173 -0
  67. package/src/logs.ts +66 -0
  68. package/src/mergeAxeResults.ts +964 -0
  69. package/src/npmIndex.ts +284 -0
  70. package/src/screenshotFunc/htmlScreenshotFunc.ts +411 -0
  71. package/src/screenshotFunc/pdfScreenshotFunc.ts +762 -0
  72. package/src/static/ejs/partials/components/categorySelector.ejs +4 -0
  73. package/src/static/ejs/partials/components/categorySelectorDropdown.ejs +57 -0
  74. package/src/static/ejs/partials/components/pagesScannedModal.ejs +70 -0
  75. package/src/static/ejs/partials/components/reportSearch.ejs +47 -0
  76. package/src/static/ejs/partials/components/ruleOffcanvas.ejs +105 -0
  77. package/src/static/ejs/partials/components/scanAbout.ejs +263 -0
  78. package/src/static/ejs/partials/components/screenshotLightbox.ejs +13 -0
  79. package/src/static/ejs/partials/components/summaryScanAbout.ejs +141 -0
  80. package/src/static/ejs/partials/components/summaryScanResults.ejs +16 -0
  81. package/src/static/ejs/partials/components/summaryTable.ejs +20 -0
  82. package/src/static/ejs/partials/components/summaryWcagCompliance.ejs +94 -0
  83. package/src/static/ejs/partials/components/topFive.ejs +6 -0
  84. package/src/static/ejs/partials/components/wcagCompliance.ejs +70 -0
  85. package/src/static/ejs/partials/footer.ejs +21 -0
  86. package/src/static/ejs/partials/header.ejs +230 -0
  87. package/src/static/ejs/partials/main.ejs +40 -0
  88. package/src/static/ejs/partials/scripts/bootstrap.ejs +8 -0
  89. package/src/static/ejs/partials/scripts/categorySelectorDropdownScript.ejs +190 -0
  90. package/src/static/ejs/partials/scripts/categorySummary.ejs +141 -0
  91. package/src/static/ejs/partials/scripts/highlightjs.ejs +335 -0
  92. package/src/static/ejs/partials/scripts/popper.ejs +7 -0
  93. package/src/static/ejs/partials/scripts/reportSearch.ejs +248 -0
  94. package/src/static/ejs/partials/scripts/ruleOffcanvas.ejs +801 -0
  95. package/src/static/ejs/partials/scripts/screenshotLightbox.ejs +71 -0
  96. package/src/static/ejs/partials/scripts/summaryScanResults.ejs +14 -0
  97. package/src/static/ejs/partials/scripts/summaryTable.ejs +78 -0
  98. package/src/static/ejs/partials/scripts/utils.ejs +441 -0
  99. package/src/static/ejs/partials/styles/bootstrap.ejs +12375 -0
  100. package/src/static/ejs/partials/styles/highlightjs.ejs +54 -0
  101. package/src/static/ejs/partials/styles/styles.ejs +1843 -0
  102. package/src/static/ejs/partials/styles/summaryBootstrap.ejs +12458 -0
  103. package/src/static/ejs/partials/summaryHeader.ejs +70 -0
  104. package/src/static/ejs/partials/summaryMain.ejs +75 -0
  105. package/src/static/ejs/report.ejs +420 -0
  106. package/src/static/ejs/summary.ejs +47 -0
  107. package/src/static/mustache/.prettierrc +4 -0
  108. package/src/static/mustache/Attention Deficit.mustache +11 -0
  109. package/src/static/mustache/Blind.mustache +11 -0
  110. package/src/static/mustache/Cognitive.mustache +7 -0
  111. package/src/static/mustache/Colorblindness.mustache +20 -0
  112. package/src/static/mustache/Deaf.mustache +12 -0
  113. package/src/static/mustache/Deafblind.mustache +7 -0
  114. package/src/static/mustache/Dyslexia.mustache +14 -0
  115. package/src/static/mustache/Low Vision.mustache +7 -0
  116. package/src/static/mustache/Mobility.mustache +15 -0
  117. package/src/static/mustache/Sighted Keyboard Users.mustache +42 -0
  118. package/src/static/mustache/report.mustache +1709 -0
  119. package/src/types/print-message.d.ts +28 -0
  120. package/src/types/types.ts +46 -0
  121. package/src/types/xpath-to-css.d.ts +3 -0
  122. package/src/utils.ts +332 -0
  123. package/tsconfig.json +15 -0
package/src/crawlers/crawlIntelligentSitemap.ts
@@ -0,0 +1,156 @@
+ import fs from 'fs';
+ import { chromium } from 'playwright';
+ import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
+ import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
+ import { silentLogger, guiInfoLog } from '../logs.js';
+ import crawlDomain from './crawlDomain.js';
+ import crawlSitemap from './crawlSitemap.js';
+
+ const crawlIntelligentSitemap = async (
+ url,
+ randomToken,
+ host,
+ viewportSettings,
+ maxRequestsPerCrawl,
+ browser,
+ userDataDirectory,
+ strategy,
+ specifiedMaxConcurrency,
+ fileTypes,
+ blacklistedPatterns,
+ includeScreenshots,
+ followRobots,
+ extraHTTPHeaders,
+ safeMode,
+ ) => {
+ let urlsCrawledFinal;
+ let urlsCrawled;
+ let dataset;
+ let sitemapExist = false;
+ const fromCrawlIntelligentSitemap = true;
+ let sitemapUrl;
+
+ urlsCrawled = { ...constants.urlsCrawledObj };
+ ({ dataset } = await createCrawleeSubFolders(randomToken));
+
+ if (!fs.existsSync(randomToken)) {
+ fs.mkdirSync(randomToken);
+ }
+
+ function getHomeUrl(parsedUrl) {
+ const urlObject = new URL(parsedUrl);
+ if (urlObject.username !== '' && urlObject.password !== '') {
+ return `${urlObject.protocol}//${urlObject.username}:${urlObject.password}@${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
+ }
+
+ return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
+ }
+
+ async function findSitemap(link) {
+ const homeUrl = getHomeUrl(link);
+ let sitemapLinkFound = false;
+ let sitemapLink = '';
+ const chromiumBrowser = await chromium.launch({ headless: true, channel: 'chrome' });
+ const page = await chromiumBrowser.newPage();
+ for (const path of sitemapPaths) {
+ sitemapLink = homeUrl + path;
+ sitemapLinkFound = await checkUrlExists(page, sitemapLink);
+ if (sitemapLinkFound) {
+ sitemapExist = true;
+ break;
+ }
+ }
+ await chromiumBrowser.close();
+ return sitemapExist ? sitemapLink : '';
+ }
+
+ const checkUrlExists = async (page, parsedUrl) => {
+ try {
+ const response = await page.goto(parsedUrl);
+ if (response.ok()) {
+ return true;
+ }
+ return false;
+ } catch (e) {
+ silentLogger.error(e);
+ return false;
+ }
+ };
+
+ try {
+ sitemapUrl = await findSitemap(url);
+ } catch (error) {
+ silentLogger.error(error);
+ }
+
+ if (!sitemapExist) {
+ console.log('Unable to find sitemap. Commencing website crawl instead.');
+ // run crawlDomain as per normal
+ urlsCrawledFinal = await crawlDomain({
+ url,
+ randomToken,
+ host,
+ viewportSettings,
+ maxRequestsPerCrawl,
+ browser,
+ userDataDirectory,
+ strategy,
+ specifiedMaxConcurrency,
+ fileTypes,
+ blacklistedPatterns,
+ includeScreenshots,
+ followRobots,
+ extraHTTPHeaders,
+ });
+ return urlsCrawledFinal;
+ }
+ console.log(`Sitemap found at ${sitemapUrl}`);
+ // run crawlSitemap then crawDomain subsequently if urlsCrawled.scanned.length < maxRequestsPerCrawl
+ urlsCrawledFinal = await crawlSitemap(
+ sitemapUrl,
+ randomToken,
+ host,
+ viewportSettings,
+ maxRequestsPerCrawl,
+ browser,
+ userDataDirectory,
+ specifiedMaxConcurrency,
+ fileTypes,
+ blacklistedPatterns,
+ includeScreenshots,
+ extraHTTPHeaders,
+ fromCrawlIntelligentSitemap,
+ url,
+ dataset, // for crawlSitemap to add on to
+ urlsCrawled, // for crawlSitemap to add on to
+ false,
+ );
+
+ if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
+ // run crawl domain starting from root website, only on pages not scanned before
+ urlsCrawledFinal = await crawlDomain({
+ url,
+ randomToken,
+ host,
+ viewportSettings,
+ maxRequestsPerCrawl,
+ browser,
+ userDataDirectory,
+ strategy,
+ specifiedMaxConcurrency,
+ fileTypes,
+ blacklistedPatterns,
+ includeScreenshots,
+ followRobots,
+ extraHTTPHeaders,
+ safeMode,
+ fromCrawlIntelligentSitemap,
+ datasetFromIntelligent: dataset, // for crawlDomain to add on to
+ urlsCrawledFromIntelligent: urlsCrawledFinal, // urls for crawlDomain to exclude
+ });
+ }
+
+ guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
+ return urlsCrawledFinal;
+ };
+ export default crawlIntelligentSitemap;
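
Note: crawlIntelligentSitemap decides between a sitemap scan and a full domain crawl by probing a list of well-known sitemap locations derived from the home URL. A minimal standalone sketch of that probe follows, assuming Playwright is available; CANDIDATE_PATHS and probeSitemap are illustrative stand-ins, since the actual values of the package's sitemapPaths constant are not shown in this diff.

import { chromium } from 'playwright';

// Illustrative candidate paths; the real list comes from '../constants/constants.js' (sitemapPaths).
const CANDIDATE_PATHS = ['/sitemap.xml', '/sitemap_index.xml'];

// Probe each candidate URL and return the first one that responds with a 2xx status, or null.
const probeSitemap = async (homeUrl: string): Promise<string | null> => {
  const browser = await chromium.launch({ headless: true });
  const page = await browser.newPage();
  try {
    for (const candidatePath of CANDIDATE_PATHS) {
      const candidate = homeUrl + candidatePath;
      try {
        const response = await page.goto(candidate);
        if (response && response.ok()) {
          return candidate;
        }
      } catch {
        // Candidate not reachable; try the next path.
      }
    }
    return null;
  } finally {
    await browser.close();
  }
};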
package/src/crawlers/crawlLocalFile.ts
@@ -0,0 +1,193 @@
+ import { Request, RequestList } from 'crawlee';
+ import printMessage from 'print-message';
+ import fs from 'fs';
+ import path from 'path';
+ import { createCrawleeSubFolders, runAxeScript, isUrlPdf } from './commonCrawlerFunc.js';
+ import constants, { guiInfoStatusTypes, basicAuthRegex } from '../constants/constants.js';
+ import {
+ getPlaywrightLaunchOptions,
+ messageOptions,
+ isFilePath,
+ convertLocalFileToPath,
+ convertPathToLocalFile,
+ } from '../constants/common.js';
+ import { runPdfScan, mapPdfScanResults, doPdfScreenshots } from './pdfScanFunc.js';
+ import { guiInfoLog } from '../logs.js';
+ import crawlSitemap from './crawlSitemap.js';
+
+ const crawlLocalFile = async (
+ sitemapUrl: string,
+ randomToken: string,
+ host: string,
+ viewportSettings: any,
+ maxRequestsPerCrawl: number,
+ browser: string,
+ userDataDirectory: string,
+ specifiedMaxConcurrency: number,
+ fileTypes: string,
+ blacklistedPatterns: string[],
+ includeScreenshots: boolean,
+ extraHTTPHeaders: any,
+ fromCrawlIntelligentSitemap: boolean = false, // optional
+ userUrlInputFromIntelligent: any = null, // optional
+ datasetFromIntelligent: any = null, // optional
+ urlsCrawledFromIntelligent: any = null, // optional
+ ) => {
+ let dataset: any;
+ let urlsCrawled: any;
+ let linksFromSitemap = [];
+
+ // Boolean to omit axe scan for basic auth URL
+ let isBasicAuth: boolean;
+ let basicAuthPage: number = 0;
+ let finalLinks: Request[] = [];
+ const { playwrightDeviceDetailsObject } = viewportSettings;
+
+ if (fromCrawlIntelligentSitemap) {
+ dataset = datasetFromIntelligent;
+ urlsCrawled = urlsCrawledFromIntelligent;
+ } else {
+ ({ dataset } = await createCrawleeSubFolders(randomToken));
+ urlsCrawled = { ...constants.urlsCrawledObj };
+
+ if (!fs.existsSync(randomToken)) {
+ fs.mkdirSync(randomToken);
+ }
+ }
+
+ // Check if the sitemapUrl is a local file and if it exists
+ if (!isFilePath(sitemapUrl) || !fs.existsSync(sitemapUrl)) {
+ // Convert to an absolute path
+ let normalizedPath = path.resolve(sitemapUrl);
+
+ // Normalize the path to handle different path separators
+ normalizedPath = path.normalize(normalizedPath);
+
+ // Check if the normalized path exists
+ if (!fs.existsSync(normalizedPath)) {
+ return;
+ }
+
+ // At this point, normalizedPath is a valid and existing file path
+ sitemapUrl = normalizedPath;
+ }
+
+ // Checks if its in the right file format, and change it before placing into linksFromSitemap
+ convertLocalFileToPath(sitemapUrl);
+
+ // XML Files
+ if (!(sitemapUrl.match(/\.xml$/i) || sitemapUrl.match(/\.txt$/i))) {
+ linksFromSitemap = [new Request({ url: sitemapUrl })];
+ // Non XML file
+ } else {
+ // Put it to crawlSitemap function to handle xml files
+ const updatedUrlsCrawled = await crawlSitemap(
+ sitemapUrl,
+ randomToken,
+ host,
+ viewportSettings,
+ maxRequestsPerCrawl,
+ browser,
+ userDataDirectory,
+ specifiedMaxConcurrency,
+ fileTypes,
+ blacklistedPatterns,
+ includeScreenshots,
+ extraHTTPHeaders,
+ (fromCrawlIntelligentSitemap = false), // optional
+ (userUrlInputFromIntelligent = null), // optional
+ (datasetFromIntelligent = null), // optional
+ (urlsCrawledFromIntelligent = null), // optional
+ true,
+ );
+
+ urlsCrawled = { ...urlsCrawled, ...updatedUrlsCrawled };
+ return urlsCrawled;
+ }
+
+ try {
+ sitemapUrl = encodeURI(sitemapUrl);
+ } catch (e) {
+ console.log(e);
+ }
+
+ if (basicAuthRegex.test(sitemapUrl)) {
+ isBasicAuth = true;
+ // request to basic auth URL to authenticate for browser session
+ finalLinks.push(new Request({ url: sitemapUrl, uniqueKey: `auth:${sitemapUrl}` }));
+ const finalUrl = `${sitemapUrl.split('://')[0]}://${sitemapUrl.split('@')[1]}`;
+ // obtain base URL without credentials so that subsequent URLs within the same domain can be scanned
+ finalLinks.push(new Request({ url: finalUrl }));
+ basicAuthPage = -2;
+ }
+
+ const uuidToPdfMapping: Record<string, string> = {}; // key and value of string type
+
+ printMessage(['Fetching URLs. This might take some time...'], { border: false });
+
+ finalLinks = [...finalLinks, ...linksFromSitemap];
+
+ await RequestList.open({
+ sources: finalLinks,
+ });
+
+ printMessage(['Fetch URLs completed. Beginning scan'], messageOptions);
+
+ const request = linksFromSitemap[0];
+ const pdfFileName = path.basename(request.url);
+ const trimmedUrl: string = request.url;
+ const destinationFilePath: string = `${randomToken}/${pdfFileName}`;
+ const data: Buffer = fs.readFileSync(trimmedUrl);
+ fs.writeFileSync(destinationFilePath, data);
+ uuidToPdfMapping[pdfFileName] = trimmedUrl;
+
+ if (!isUrlPdf(request.url)) {
+ const browserContext = await constants.launcher.launchPersistentContext('', {
+ headless: process.env.CRAWLEE_HEADLESS === '1',
+ ...getPlaywrightLaunchOptions(browser),
+ ...playwrightDeviceDetailsObject,
+ });
+
+ const page = await browserContext.newPage();
+ request.url = convertPathToLocalFile(request.url);
+ await page.goto(request.url);
+ const results = await runAxeScript({ includeScreenshots, page, randomToken });
+
+ guiInfoLog(guiInfoStatusTypes.SCANNED, {
+ numScanned: urlsCrawled.scanned.length,
+ urlScanned: request.url,
+ });
+
+ urlsCrawled.scanned.push({
+ url: request.url,
+ pageTitle: results.pageTitle,
+ actualUrl: request.loadedUrl, // i.e. actualUrl
+ });
+
+ urlsCrawled.scannedRedirects.push({
+ fromUrl: request.url,
+ toUrl: request.loadedUrl, // i.e. actualUrl
+ });
+
+ results.url = request.url;
+ // results.actualUrl = request.loadedUrl;
+
+ await dataset.pushData(results);
+ } else {
+ urlsCrawled.scanned.push({ url: trimmedUrl, pageTitle: pdfFileName });
+
+ await runPdfScan(randomToken);
+ // transform result format
+ const pdfResults = await mapPdfScanResults(randomToken, uuidToPdfMapping);
+
+ // get screenshots from pdf docs
+ if (includeScreenshots) {
+ await Promise.all(pdfResults.map(result => doPdfScreenshots(randomToken, result)));
+ }
+
+ // push results for each pdf document to key value store
+ await Promise.all(pdfResults.map(result => dataset.pushData(result)));
+ }
+ return urlsCrawled;
+ };
+ export default crawlLocalFile;
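
Note: crawlLocalFile routes a local input by file type: .xml and .txt inputs are handed to crawlSitemap, PDFs go through the PDF scan pipeline, and any other file is opened in a browser page and scanned with axe. A small sketch of that routing decision follows; routeLocalFile is a hypothetical helper written for illustration and is not part of the package.

import path from 'path';

type LocalScanRoute = 'sitemap' | 'pdf' | 'html';

// Mirror the branching above: .xml/.txt files are treated as sitemaps,
// .pdf files go to the PDF scanner, everything else is scanned as a page.
const routeLocalFile = (filePath: string): LocalScanRoute => {
  const ext = path.extname(filePath).toLowerCase();
  if (ext === '.xml' || ext === '.txt') return 'sitemap';
  if (ext === '.pdf') return 'pdf';
  return 'html';
};

// Example: routeLocalFile('./fixtures/index.html') returns 'html'.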
package/src/crawlers/crawlSitemap.ts
@@ -0,0 +1,356 @@
+ import crawlee, { Request, RequestList } from 'crawlee';
+ import printMessage from 'print-message';
+ import fs from 'fs';
+ import {
+ createCrawleeSubFolders,
+ preNavigationHooks,
+ runAxeScript,
+ isUrlPdf,
+ } from './commonCrawlerFunc.js';
+
+ import constants, { guiInfoStatusTypes } from '../constants/constants.js';
+ import {
+ getLinksFromSitemap,
+ getPlaywrightLaunchOptions,
+ messageOptions,
+ isSkippedUrl,
+ urlWithoutAuth,
+ waitForPageLoaded,
+ isFilePath,
+ } from '../constants/common.js';
+ import { areLinksEqual, isWhitelistedContentType } from '../utils.js';
+ import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
+ import { guiInfoLog } from '../logs.js';
+
+ const crawlSitemap = async (
+ sitemapUrl,
+ randomToken,
+ host,
+ viewportSettings,
+ maxRequestsPerCrawl,
+ browser,
+ userDataDirectory,
+ specifiedMaxConcurrency,
+ fileTypes,
+ blacklistedPatterns,
+ includeScreenshots,
+ extraHTTPHeaders,
+ fromCrawlIntelligentSitemap = false, // optional
+ userUrlInputFromIntelligent = null, // optional
+ datasetFromIntelligent = null, // optional
+ urlsCrawledFromIntelligent = null, // optional
+ crawledFromLocalFile = false, // optional
+ ) => {
+ let dataset;
+ let urlsCrawled;
+
+ // Boolean to omit axe scan for basic auth URL
+ let isBasicAuth;
+ let basicAuthPage = 0;
+ let finalLinks = [];
+ let authHeader = '';
+
+ if (fromCrawlIntelligentSitemap) {
+ dataset = datasetFromIntelligent;
+ urlsCrawled = urlsCrawledFromIntelligent;
+ } else {
+ ({ dataset } = await createCrawleeSubFolders(randomToken));
+ urlsCrawled = { ...constants.urlsCrawledObj };
+
+ if (!fs.existsSync(randomToken)) {
+ fs.mkdirSync(randomToken);
+ }
+ }
+
+ let parsedUrl;
+ let username = '';
+ let password = '';
+
+ if (!crawledFromLocalFile && isFilePath(sitemapUrl)) {
+ console.log('Local file crawling not supported for sitemap. Please provide a valid URL.');
+ return;
+ }
+
+ if (isFilePath(sitemapUrl)) {
+ parsedUrl = sitemapUrl;
+ } else {
+ parsedUrl = new URL(sitemapUrl);
+ if (parsedUrl.username !== '' && parsedUrl.password !== '') {
+ isBasicAuth = true;
+ username = decodeURIComponent(parsedUrl.username);
+ password = decodeURIComponent(parsedUrl.password);
+
+ // Create auth header
+ authHeader = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
+
+ parsedUrl.username = '';
+ parsedUrl.password = '';
+ }
+ }
+
+ const linksFromSitemap = await getLinksFromSitemap(
+ sitemapUrl,
+ maxRequestsPerCrawl,
+ browser,
+ userDataDirectory,
+ userUrlInputFromIntelligent,
+ fromCrawlIntelligentSitemap,
+ username,
+ password,
+ );
+ /**
+ * Regex to match http://username:password@hostname.com
+ * utilised in scan strategy to ensure subsequent URLs within the same domain are scanned.
+ * First time scan with original `url` containing credentials is strictly to authenticate for browser session
+ * subsequent URLs are without credentials.
+ * basicAuthPage is set to -1 for basic auth URL to ensure it is not counted towards maxRequestsPerCrawl
+ */
+
+ sitemapUrl = encodeURI(sitemapUrl);
+
+ if (isBasicAuth) {
+ // request to basic auth URL to authenticate for browser session
+ finalLinks.push(new Request({ url: sitemapUrl, uniqueKey: `auth:${sitemapUrl}` }));
+ const finalUrl = `${sitemapUrl.split('://')[0]}://${sitemapUrl.split('@')[1]}`;
+
+ // obtain base URL without credentials so that subsequent URLs within the same domain can be scanned
+ finalLinks.push(new Request({ url: finalUrl }));
+ basicAuthPage = -2;
+ }
+
+ const pdfDownloads = [];
+ const uuidToPdfMapping = {};
+ const isScanHtml = ['all', 'html-only'].includes(fileTypes);
+ const isScanPdfs = ['all', 'pdf-only'].includes(fileTypes);
+ const { playwrightDeviceDetailsObject } = viewportSettings;
+ const { maxConcurrency } = constants;
+
+ printMessage(['Fetching URLs. This might take some time...'], { border: false });
+
+ finalLinks = [...finalLinks, ...linksFromSitemap];
+
+ const requestList = await RequestList.open({
+ sources: finalLinks,
+ });
+ printMessage(['Fetch URLs completed. Beginning scan'], messageOptions);
+
+ let userDataDir = '';
+ if (userDataDirectory) {
+ userDataDir = process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '';
+ }
+
+ const crawler = new crawlee.PlaywrightCrawler({
+ launchContext: {
+ launcher: constants.launcher,
+ launchOptions: getPlaywrightLaunchOptions(browser),
+ // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
+ userDataDir,
+ },
+ retryOnBlocked: true,
+ browserPoolOptions: {
+ useFingerprints: false,
+ preLaunchHooks: [
+ async (pageId, launchContext) => {
+ launchContext.launchOptions = {
+ ...launchContext.launchOptions,
+ bypassCSP: true,
+ ignoreHTTPSErrors: true,
+ ...playwrightDeviceDetailsObject,
+ };
+ },
+ ],
+ },
+ requestList,
+ preNavigationHooks: isBasicAuth
+ ? [
+ async ({ page }) => {
+ await page.setExtraHTTPHeaders({
+ Authorization: authHeader,
+ ...extraHTTPHeaders,
+ });
+ },
+ ]
+ : [
+ async () => {
+ preNavigationHooks(extraHTTPHeaders);
+ // insert other code here
+ },
+ ],
+ requestHandlerTimeoutSecs: 90,
+ requestHandler: async ({ page, request, response, sendRequest }) => {
+ await waitForPageLoaded(page, 10000);
+
+ // Set basic auth header if needed
+ if (isBasicAuth) {
+ await page.setExtraHTTPHeaders({
+ Authorization: authHeader,
+ });
+ const currentUrl = new URL(request.url);
+ currentUrl.username = username;
+ currentUrl.password = password;
+ request.url = currentUrl.href;
+ }
+
+ const actualUrl = request.loadedUrl || request.url;
+
+ if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
+ crawler.autoscaledPool.abort();
+ return;
+ }
+
+ if (isUrlPdf(actualUrl)) {
+ if (!isScanPdfs) {
+ guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+ numScanned: urlsCrawled.scanned.length,
+ urlScanned: request.url,
+ });
+ urlsCrawled.blacklisted.push(request.url);
+ return;
+ }
+ // pushes download promise into pdfDownloads
+ const { pdfFileName, url } = handlePdfDownload(
+ randomToken,
+ pdfDownloads,
+ request,
+ sendRequest,
+ urlsCrawled,
+ );
+
+ uuidToPdfMapping[pdfFileName] = url;
+ return;
+ }
+
+ const contentType = response.headers()['content-type'];
+ const status = response.status();
+
+ if (blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
+ urlsCrawled.userExcluded.push(request.url);
+ return;
+ }
+
+ if (status === 403) {
+ guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+ numScanned: urlsCrawled.scanned.length,
+ urlScanned: request.url,
+ });
+ urlsCrawled.forbidden.push({ url: request.url });
+ return;
+ }
+
+ if (status !== 200) {
+ guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+ numScanned: urlsCrawled.scanned.length,
+ urlScanned: request.url,
+ });
+ urlsCrawled.invalid.push(request.url);
+ return;
+ }
+
+ if (basicAuthPage < 0) {
+ basicAuthPage += 1;
+ } else if (isScanHtml && status === 200 && isWhitelistedContentType(contentType)) {
+ const results = await runAxeScript({ includeScreenshots, page, randomToken });
+ guiInfoLog(guiInfoStatusTypes.SCANNED, {
+ numScanned: urlsCrawled.scanned.length,
+ urlScanned: request.url,
+ });
+
+ const isRedirected = !areLinksEqual(request.loadedUrl, request.url);
+ if (isRedirected) {
+ const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
+ item => (item.actualUrl || item.url.href) === request.loadedUrl,
+ );
+
+ if (isLoadedUrlInCrawledUrls) {
+ urlsCrawled.notScannedRedirects.push({
+ fromUrl: request.url,
+ toUrl: request.loadedUrl, // i.e. actualUrl
+ });
+ return;
+ }
+
+ urlsCrawled.scanned.push({
+ url: urlWithoutAuth(request.url),
+ pageTitle: results.pageTitle,
+ actualUrl: request.loadedUrl, // i.e. actualUrl
+ });
+
+ urlsCrawled.scannedRedirects.push({
+ fromUrl: urlWithoutAuth(request.url),
+ toUrl: request.loadedUrl, // i.e. actualUrl
+ });
+
+ results.url = request.url;
+ results.actualUrl = request.loadedUrl;
+ } else {
+ urlsCrawled.scanned.push({
+ url: urlWithoutAuth(request.url),
+ pageTitle: results.pageTitle,
+ });
+ }
+ await dataset.pushData(results);
+ } else {
+ guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+ numScanned: urlsCrawled.scanned.length,
+ urlScanned: request.url,
+ });
+
+ if (isScanHtml) {
+ urlsCrawled.invalid.push(actualUrl);
+ }
+ }
+ },
+ failedRequestHandler: async ({ request }) => {
+ if (isBasicAuth && request.url) {
+ request.url = `${request.url.split('://')[0]}://${request.url.split('@')[1]}`;
+ }
+
+ // check if scanned pages have reached limit due to multi-instances of handler running
+ if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
+ return;
+ }
+
+ guiInfoLog(guiInfoStatusTypes.ERROR, {
+ numScanned: urlsCrawled.scanned.length,
+ urlScanned: request.url,
+ });
+ urlsCrawled.error.push({ url: request.url });
+ crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
+ },
+ maxRequestsPerCrawl: Infinity,
+ maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+ });
+
+ await crawler.run();
+
+ await requestList.isFinished();
+
+ if (pdfDownloads.length > 0) {
+ // wait for pdf downloads to complete
+ await Promise.all(pdfDownloads);
+
+ // scan and process pdf documents
+ await runPdfScan(randomToken);
+
+ // transform result format
+ const pdfResults = await mapPdfScanResults(randomToken, uuidToPdfMapping);
+
+ // get screenshots from pdf docs
+ // if (includeScreenshots) {
+ // await Promise.all(pdfResults.map(
+ // async result => await doPdfScreenshots(randomToken, result)
+ // ));
+ // }
+
+ // push results for each pdf document to key value store
+ await Promise.all(pdfResults.map(result => dataset.pushData(result)));
+ }
+
+ if (!fromCrawlIntelligentSitemap) {
+ guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
+ }
+
+ return urlsCrawled;
+ };
+
+ export default crawlSitemap;
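
Note: when the sitemap URL carries embedded credentials, crawlSitemap moves them into a Basic Authorization header, queues one authenticating request, and then scans a credential-free copy of the URL so the rest of the domain can be crawled normally. A minimal sketch of the credential-splitting step follows; splitBasicAuth is a hypothetical standalone helper written for illustration, not an export of the package.

// Move credentials embedded in a URL into a Basic Authorization header and
// return a credential-free URL for queueing.
const splitBasicAuth = (rawUrl: string): { url: string; authHeader?: string } => {
  const parsed = new URL(rawUrl);
  if (parsed.username === '' || parsed.password === '') {
    return { url: rawUrl };
  }
  const username = decodeURIComponent(parsed.username);
  const password = decodeURIComponent(parsed.password);
  const authHeader = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
  parsed.username = '';
  parsed.password = '';
  return { url: parsed.href, authHeader };
};

// Example: splitBasicAuth('https://user:pass@example.com/sitemap.xml')
// returns { url: 'https://example.com/sitemap.xml', authHeader: 'Basic dXNlcjpwYXNz' }.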