@govtechsg/oobee 0.10.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.dockerignore +22 -0
- package/.github/pull_request_template.md +11 -0
- package/.github/workflows/docker-test.yml +54 -0
- package/.github/workflows/image.yml +107 -0
- package/.github/workflows/publish.yml +18 -0
- package/.idea/modules.xml +8 -0
- package/.idea/purple-a11y.iml +9 -0
- package/.idea/vcs.xml +6 -0
- package/.prettierrc.json +12 -0
- package/.vscode/extensions.json +5 -0
- package/.vscode/settings.json +10 -0
- package/CODE_OF_CONDUCT.md +128 -0
- package/DETAILS.md +163 -0
- package/Dockerfile +60 -0
- package/INSTALLATION.md +146 -0
- package/INTEGRATION.md +785 -0
- package/LICENSE +22 -0
- package/README.md +587 -0
- package/SECURITY.md +5 -0
- package/__mocks__/mock-report.html +1431 -0
- package/__mocks__/mockFunctions.ts +32 -0
- package/__mocks__/mockIssues.ts +64 -0
- package/__mocks__/mock_all_issues/000000001.json +64 -0
- package/__mocks__/mock_all_issues/000000002.json +53 -0
- package/__mocks__/mock_all_issues/fake-file.txt +0 -0
- package/__tests__/logs.test.ts +25 -0
- package/__tests__/mergeAxeResults.test.ts +278 -0
- package/__tests__/utils.test.ts +118 -0
- package/a11y-scan-results.zip +0 -0
- package/eslint.config.js +53 -0
- package/exclusions.txt +2 -0
- package/gitlab-pipeline-template.yml +54 -0
- package/jest.config.js +1 -0
- package/package.json +96 -0
- package/scripts/copyFiles.js +44 -0
- package/scripts/install_oobee_dependencies.cmd +13 -0
- package/scripts/install_oobee_dependencies.command +101 -0
- package/scripts/install_oobee_dependencies.ps1 +110 -0
- package/scripts/oobee_shell.cmd +13 -0
- package/scripts/oobee_shell.command +11 -0
- package/scripts/oobee_shell.sh +55 -0
- package/scripts/oobee_shell_ps.ps1 +54 -0
- package/src/cli.ts +401 -0
- package/src/combine.ts +240 -0
- package/src/constants/__tests__/common.test.ts +44 -0
- package/src/constants/cliFunctions.ts +305 -0
- package/src/constants/common.ts +1840 -0
- package/src/constants/constants.ts +443 -0
- package/src/constants/errorMeta.json +319 -0
- package/src/constants/itemTypeDescription.ts +11 -0
- package/src/constants/oobeeAi.ts +141 -0
- package/src/constants/questions.ts +181 -0
- package/src/constants/sampleData.ts +187 -0
- package/src/crawlers/__tests__/commonCrawlerFunc.test.ts +51 -0
- package/src/crawlers/commonCrawlerFunc.ts +656 -0
- package/src/crawlers/crawlDomain.ts +877 -0
- package/src/crawlers/crawlIntelligentSitemap.ts +156 -0
- package/src/crawlers/crawlLocalFile.ts +193 -0
- package/src/crawlers/crawlSitemap.ts +356 -0
- package/src/crawlers/custom/extractAndGradeText.ts +57 -0
- package/src/crawlers/custom/flagUnlabelledClickableElements.ts +964 -0
- package/src/crawlers/custom/utils.ts +486 -0
- package/src/crawlers/customAxeFunctions.ts +82 -0
- package/src/crawlers/pdfScanFunc.ts +468 -0
- package/src/crawlers/runCustom.ts +117 -0
- package/src/index.ts +173 -0
- package/src/logs.ts +66 -0
- package/src/mergeAxeResults.ts +964 -0
- package/src/npmIndex.ts +284 -0
- package/src/screenshotFunc/htmlScreenshotFunc.ts +411 -0
- package/src/screenshotFunc/pdfScreenshotFunc.ts +762 -0
- package/src/static/ejs/partials/components/categorySelector.ejs +4 -0
- package/src/static/ejs/partials/components/categorySelectorDropdown.ejs +57 -0
- package/src/static/ejs/partials/components/pagesScannedModal.ejs +70 -0
- package/src/static/ejs/partials/components/reportSearch.ejs +47 -0
- package/src/static/ejs/partials/components/ruleOffcanvas.ejs +105 -0
- package/src/static/ejs/partials/components/scanAbout.ejs +263 -0
- package/src/static/ejs/partials/components/screenshotLightbox.ejs +13 -0
- package/src/static/ejs/partials/components/summaryScanAbout.ejs +141 -0
- package/src/static/ejs/partials/components/summaryScanResults.ejs +16 -0
- package/src/static/ejs/partials/components/summaryTable.ejs +20 -0
- package/src/static/ejs/partials/components/summaryWcagCompliance.ejs +94 -0
- package/src/static/ejs/partials/components/topFive.ejs +6 -0
- package/src/static/ejs/partials/components/wcagCompliance.ejs +70 -0
- package/src/static/ejs/partials/footer.ejs +21 -0
- package/src/static/ejs/partials/header.ejs +230 -0
- package/src/static/ejs/partials/main.ejs +40 -0
- package/src/static/ejs/partials/scripts/bootstrap.ejs +8 -0
- package/src/static/ejs/partials/scripts/categorySelectorDropdownScript.ejs +190 -0
- package/src/static/ejs/partials/scripts/categorySummary.ejs +141 -0
- package/src/static/ejs/partials/scripts/highlightjs.ejs +335 -0
- package/src/static/ejs/partials/scripts/popper.ejs +7 -0
- package/src/static/ejs/partials/scripts/reportSearch.ejs +248 -0
- package/src/static/ejs/partials/scripts/ruleOffcanvas.ejs +801 -0
- package/src/static/ejs/partials/scripts/screenshotLightbox.ejs +71 -0
- package/src/static/ejs/partials/scripts/summaryScanResults.ejs +14 -0
- package/src/static/ejs/partials/scripts/summaryTable.ejs +78 -0
- package/src/static/ejs/partials/scripts/utils.ejs +441 -0
- package/src/static/ejs/partials/styles/bootstrap.ejs +12375 -0
- package/src/static/ejs/partials/styles/highlightjs.ejs +54 -0
- package/src/static/ejs/partials/styles/styles.ejs +1843 -0
- package/src/static/ejs/partials/styles/summaryBootstrap.ejs +12458 -0
- package/src/static/ejs/partials/summaryHeader.ejs +70 -0
- package/src/static/ejs/partials/summaryMain.ejs +75 -0
- package/src/static/ejs/report.ejs +420 -0
- package/src/static/ejs/summary.ejs +47 -0
- package/src/static/mustache/.prettierrc +4 -0
- package/src/static/mustache/Attention Deficit.mustache +11 -0
- package/src/static/mustache/Blind.mustache +11 -0
- package/src/static/mustache/Cognitive.mustache +7 -0
- package/src/static/mustache/Colorblindness.mustache +20 -0
- package/src/static/mustache/Deaf.mustache +12 -0
- package/src/static/mustache/Deafblind.mustache +7 -0
- package/src/static/mustache/Dyslexia.mustache +14 -0
- package/src/static/mustache/Low Vision.mustache +7 -0
- package/src/static/mustache/Mobility.mustache +15 -0
- package/src/static/mustache/Sighted Keyboard Users.mustache +42 -0
- package/src/static/mustache/report.mustache +1709 -0
- package/src/types/print-message.d.ts +28 -0
- package/src/types/types.ts +46 -0
- package/src/types/xpath-to-css.d.ts +3 -0
- package/src/utils.ts +332 -0
- package/tsconfig.json +15 -0
package/src/crawlers/crawlIntelligentSitemap.ts
@@ -0,0 +1,156 @@
+import fs from 'fs';
+import { chromium } from 'playwright';
+import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
+import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
+import { silentLogger, guiInfoLog } from '../logs.js';
+import crawlDomain from './crawlDomain.js';
+import crawlSitemap from './crawlSitemap.js';
+
+const crawlIntelligentSitemap = async (
+  url,
+  randomToken,
+  host,
+  viewportSettings,
+  maxRequestsPerCrawl,
+  browser,
+  userDataDirectory,
+  strategy,
+  specifiedMaxConcurrency,
+  fileTypes,
+  blacklistedPatterns,
+  includeScreenshots,
+  followRobots,
+  extraHTTPHeaders,
+  safeMode,
+) => {
+  let urlsCrawledFinal;
+  let urlsCrawled;
+  let dataset;
+  let sitemapExist = false;
+  const fromCrawlIntelligentSitemap = true;
+  let sitemapUrl;
+
+  urlsCrawled = { ...constants.urlsCrawledObj };
+  ({ dataset } = await createCrawleeSubFolders(randomToken));
+
+  if (!fs.existsSync(randomToken)) {
+    fs.mkdirSync(randomToken);
+  }
+
+  function getHomeUrl(parsedUrl) {
+    const urlObject = new URL(parsedUrl);
+    if (urlObject.username !== '' && urlObject.password !== '') {
+      return `${urlObject.protocol}//${urlObject.username}:${urlObject.password}@${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
+    }
+
+    return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
+  }
+
+  async function findSitemap(link) {
+    const homeUrl = getHomeUrl(link);
+    let sitemapLinkFound = false;
+    let sitemapLink = '';
+    const chromiumBrowser = await chromium.launch({ headless: true, channel: 'chrome' });
+    const page = await chromiumBrowser.newPage();
+    for (const path of sitemapPaths) {
+      sitemapLink = homeUrl + path;
+      sitemapLinkFound = await checkUrlExists(page, sitemapLink);
+      if (sitemapLinkFound) {
+        sitemapExist = true;
+        break;
+      }
+    }
+    await chromiumBrowser.close();
+    return sitemapExist ? sitemapLink : '';
+  }
+
+  const checkUrlExists = async (page, parsedUrl) => {
+    try {
+      const response = await page.goto(parsedUrl);
+      if (response.ok()) {
+        return true;
+      }
+      return false;
+    } catch (e) {
+      silentLogger.error(e);
+      return false;
+    }
+  };
+
+  try {
+    sitemapUrl = await findSitemap(url);
+  } catch (error) {
+    silentLogger.error(error);
+  }
+
+  if (!sitemapExist) {
+    console.log('Unable to find sitemap. Commencing website crawl instead.');
+    // run crawlDomain as per normal
+    urlsCrawledFinal = await crawlDomain({
+      url,
+      randomToken,
+      host,
+      viewportSettings,
+      maxRequestsPerCrawl,
+      browser,
+      userDataDirectory,
+      strategy,
+      specifiedMaxConcurrency,
+      fileTypes,
+      blacklistedPatterns,
+      includeScreenshots,
+      followRobots,
+      extraHTTPHeaders,
+    });
+    return urlsCrawledFinal;
+  }
+  console.log(`Sitemap found at ${sitemapUrl}`);
+  // run crawlSitemap, then crawlDomain subsequently if urlsCrawled.scanned.length < maxRequestsPerCrawl
+  urlsCrawledFinal = await crawlSitemap(
+    sitemapUrl,
+    randomToken,
+    host,
+    viewportSettings,
+    maxRequestsPerCrawl,
+    browser,
+    userDataDirectory,
+    specifiedMaxConcurrency,
+    fileTypes,
+    blacklistedPatterns,
+    includeScreenshots,
+    extraHTTPHeaders,
+    fromCrawlIntelligentSitemap,
+    url,
+    dataset, // for crawlSitemap to add on to
+    urlsCrawled, // for crawlSitemap to add on to
+    false,
+  );
+
+  if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
+    // run crawlDomain starting from the root website, only on pages not scanned before
+    urlsCrawledFinal = await crawlDomain({
+      url,
+      randomToken,
+      host,
+      viewportSettings,
+      maxRequestsPerCrawl,
+      browser,
+      userDataDirectory,
+      strategy,
+      specifiedMaxConcurrency,
+      fileTypes,
+      blacklistedPatterns,
+      includeScreenshots,
+      followRobots,
+      extraHTTPHeaders,
+      safeMode,
+      fromCrawlIntelligentSitemap,
+      datasetFromIntelligent: dataset, // for crawlDomain to add on to
+      urlsCrawledFromIntelligent: urlsCrawledFinal, // urls for crawlDomain to exclude
+    });
+  }
+
+  guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
+  return urlsCrawledFinal;
+};
+export default crawlIntelligentSitemap;
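The sitemap discovery above comes down to probing a fixed list of well-known paths with a headless browser and taking the first one that responds OK; if none do, the caller falls back to a full domain crawl. A minimal standalone sketch of that idea, assuming a hypothetical three-entry path list (the actual list is `sitemapPaths` in `src/constants/constants.ts`):

    import { chromium } from 'playwright';

    // Illustrative candidates only; the package keeps its own list in constants.
    const candidateSitemapPaths = ['/sitemap.xml', '/sitemap_index.xml', '/sitemap.txt'];

    const findSitemapUrl = async (homeUrl: string): Promise<string> => {
      const browser = await chromium.launch({ headless: true });
      const page = await browser.newPage();
      try {
        for (const p of candidateSitemapPaths) {
          try {
            const response = await page.goto(homeUrl + p);
            if (response?.ok()) return homeUrl + p; // first 2xx hit wins
          } catch {
            // navigation error: treat as "no sitemap here" and keep probing
          }
        }
        return ''; // nothing found; caller falls back to a domain crawl
      } finally {
        await browser.close();
      }
    };

For example, `await findSitemapUrl('https://example.com')` resolves to a sitemap URL or an empty string for the caller to branch on, mirroring how `findSitemap` feeds the `sitemapExist` check above.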
package/src/crawlers/crawlLocalFile.ts
@@ -0,0 +1,193 @@
+import { Request, RequestList } from 'crawlee';
+import printMessage from 'print-message';
+import fs from 'fs';
+import path from 'path';
+import { createCrawleeSubFolders, runAxeScript, isUrlPdf } from './commonCrawlerFunc.js';
+import constants, { guiInfoStatusTypes, basicAuthRegex } from '../constants/constants.js';
+import {
+  getPlaywrightLaunchOptions,
+  messageOptions,
+  isFilePath,
+  convertLocalFileToPath,
+  convertPathToLocalFile,
+} from '../constants/common.js';
+import { runPdfScan, mapPdfScanResults, doPdfScreenshots } from './pdfScanFunc.js';
+import { guiInfoLog } from '../logs.js';
+import crawlSitemap from './crawlSitemap.js';
+
+const crawlLocalFile = async (
+  sitemapUrl: string,
+  randomToken: string,
+  host: string,
+  viewportSettings: any,
+  maxRequestsPerCrawl: number,
+  browser: string,
+  userDataDirectory: string,
+  specifiedMaxConcurrency: number,
+  fileTypes: string,
+  blacklistedPatterns: string[],
+  includeScreenshots: boolean,
+  extraHTTPHeaders: any,
+  fromCrawlIntelligentSitemap: boolean = false, // optional
+  userUrlInputFromIntelligent: any = null, // optional
+  datasetFromIntelligent: any = null, // optional
+  urlsCrawledFromIntelligent: any = null, // optional
+) => {
+  let dataset: any;
+  let urlsCrawled: any;
+  let linksFromSitemap = [];
+
+  // Boolean to omit axe scan for basic auth URL
+  let isBasicAuth: boolean;
+  let basicAuthPage: number = 0;
+  let finalLinks: Request[] = [];
+  const { playwrightDeviceDetailsObject } = viewportSettings;
+
+  if (fromCrawlIntelligentSitemap) {
+    dataset = datasetFromIntelligent;
+    urlsCrawled = urlsCrawledFromIntelligent;
+  } else {
+    ({ dataset } = await createCrawleeSubFolders(randomToken));
+    urlsCrawled = { ...constants.urlsCrawledObj };
+
+    if (!fs.existsSync(randomToken)) {
+      fs.mkdirSync(randomToken);
+    }
+  }
+
+  // Check if the sitemapUrl is a local file and if it exists
+  if (!isFilePath(sitemapUrl) || !fs.existsSync(sitemapUrl)) {
+    // Convert to an absolute path
+    let normalizedPath = path.resolve(sitemapUrl);
+
+    // Normalize the path to handle different path separators
+    normalizedPath = path.normalize(normalizedPath);
+
+    // Check if the normalized path exists
+    if (!fs.existsSync(normalizedPath)) {
+      return;
+    }
+
+    // At this point, normalizedPath is a valid and existing file path
+    sitemapUrl = normalizedPath;
+  }
+
+  // Checks that it's in the right file format, and converts it before placing into linksFromSitemap
+  convertLocalFileToPath(sitemapUrl);
+
+  // Non-XML, non-TXT file: scan it directly as a single request
+  if (!(sitemapUrl.match(/\.xml$/i) || sitemapUrl.match(/\.txt$/i))) {
+    linksFromSitemap = [new Request({ url: sitemapUrl })];
+  } else {
+    // XML or TXT sitemap: hand it to crawlSitemap, which handles these formats
+    const updatedUrlsCrawled = await crawlSitemap(
+      sitemapUrl,
+      randomToken,
+      host,
+      viewportSettings,
+      maxRequestsPerCrawl,
+      browser,
+      userDataDirectory,
+      specifiedMaxConcurrency,
+      fileTypes,
+      blacklistedPatterns,
+      includeScreenshots,
+      extraHTTPHeaders,
+      (fromCrawlIntelligentSitemap = false), // optional
+      (userUrlInputFromIntelligent = null), // optional
+      (datasetFromIntelligent = null), // optional
+      (urlsCrawledFromIntelligent = null), // optional
+      true,
+    );
+
+    urlsCrawled = { ...urlsCrawled, ...updatedUrlsCrawled };
+    return urlsCrawled;
+  }
+
+  try {
+    sitemapUrl = encodeURI(sitemapUrl);
+  } catch (e) {
+    console.log(e);
+  }
+
+  if (basicAuthRegex.test(sitemapUrl)) {
+    isBasicAuth = true;
+    // request to basic auth URL to authenticate for browser session
+    finalLinks.push(new Request({ url: sitemapUrl, uniqueKey: `auth:${sitemapUrl}` }));
+    const finalUrl = `${sitemapUrl.split('://')[0]}://${sitemapUrl.split('@')[1]}`;
+    // obtain base URL without credentials so that subsequent URLs within the same domain can be scanned
+    finalLinks.push(new Request({ url: finalUrl }));
+    basicAuthPage = -2;
+  }
+
+  const uuidToPdfMapping: Record<string, string> = {}; // key and value of string type
+
+  printMessage(['Fetching URLs. This might take some time...'], { border: false });
+
+  finalLinks = [...finalLinks, ...linksFromSitemap];
+
+  await RequestList.open({
+    sources: finalLinks,
+  });
+
+  printMessage(['Fetch URLs completed. Beginning scan'], messageOptions);
+
+  const request = linksFromSitemap[0];
+  const pdfFileName = path.basename(request.url);
+  const trimmedUrl: string = request.url;
+  const destinationFilePath: string = `${randomToken}/${pdfFileName}`;
+  const data: Buffer = fs.readFileSync(trimmedUrl);
+  fs.writeFileSync(destinationFilePath, data);
+  uuidToPdfMapping[pdfFileName] = trimmedUrl;
+
+  if (!isUrlPdf(request.url)) {
+    const browserContext = await constants.launcher.launchPersistentContext('', {
+      headless: process.env.CRAWLEE_HEADLESS === '1',
+      ...getPlaywrightLaunchOptions(browser),
+      ...playwrightDeviceDetailsObject,
+    });
+
+    const page = await browserContext.newPage();
+    request.url = convertPathToLocalFile(request.url);
+    await page.goto(request.url);
+    const results = await runAxeScript({ includeScreenshots, page, randomToken });
+
+    guiInfoLog(guiInfoStatusTypes.SCANNED, {
+      numScanned: urlsCrawled.scanned.length,
+      urlScanned: request.url,
+    });
+
+    urlsCrawled.scanned.push({
+      url: request.url,
+      pageTitle: results.pageTitle,
+      actualUrl: request.loadedUrl, // i.e. actualUrl
+    });
+
+    urlsCrawled.scannedRedirects.push({
+      fromUrl: request.url,
+      toUrl: request.loadedUrl, // i.e. actualUrl
+    });
+
+    results.url = request.url;
+    // results.actualUrl = request.loadedUrl;
+
+    await dataset.pushData(results);
+  } else {
+    urlsCrawled.scanned.push({ url: trimmedUrl, pageTitle: pdfFileName });
+
+    await runPdfScan(randomToken);
+    // transform result format
+    const pdfResults = await mapPdfScanResults(randomToken, uuidToPdfMapping);
+
+    // get screenshots from pdf docs
+    if (includeScreenshots) {
+      await Promise.all(pdfResults.map(result => doPdfScreenshots(randomToken, result)));
+    }
+
+    // push results for each pdf document to key value store
+    await Promise.all(pdfResults.map(result => dataset.pushData(result)));
+  }
+  return urlsCrawled;
+};
+export default crawlLocalFile;
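Both crawlLocalFile.ts above and crawlSitemap.ts below rebuild a credential-free URL from a basic-auth URL with the same string-splitting expression. A small sketch of what that expression does, with a hypothetical input URL:

    // Sketch of the credential-stripping used above. Splitting on '://' and '@'
    // drops the user:pass@ segment; like the source, this assumes the URL
    // contains exactly one '@' (the one separating credentials from host).
    const stripBasicAuth = (url: string): string =>
      `${url.split('://')[0]}://${url.split('@')[1]}`;

    // stripBasicAuth('https://user:pass@example.com/sitemap.xml')
    //   -> 'https://example.com/sitemap.xml'

The original URL (with credentials) is queued once purely to authenticate the browser session, the stripped URL is queued for the actual scan, and `basicAuthPage = -2` flags those two bootstrap requests so the scan loop in crawlSitemap.ts leaves them out of the scanned-page count.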
package/src/crawlers/crawlSitemap.ts
@@ -0,0 +1,356 @@
+import crawlee, { Request, RequestList } from 'crawlee';
+import printMessage from 'print-message';
+import fs from 'fs';
+import {
+  createCrawleeSubFolders,
+  preNavigationHooks,
+  runAxeScript,
+  isUrlPdf,
+} from './commonCrawlerFunc.js';
+
+import constants, { guiInfoStatusTypes } from '../constants/constants.js';
+import {
+  getLinksFromSitemap,
+  getPlaywrightLaunchOptions,
+  messageOptions,
+  isSkippedUrl,
+  urlWithoutAuth,
+  waitForPageLoaded,
+  isFilePath,
+} from '../constants/common.js';
+import { areLinksEqual, isWhitelistedContentType } from '../utils.js';
+import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
+import { guiInfoLog } from '../logs.js';
+
+const crawlSitemap = async (
+  sitemapUrl,
+  randomToken,
+  host,
+  viewportSettings,
+  maxRequestsPerCrawl,
+  browser,
+  userDataDirectory,
+  specifiedMaxConcurrency,
+  fileTypes,
+  blacklistedPatterns,
+  includeScreenshots,
+  extraHTTPHeaders,
+  fromCrawlIntelligentSitemap = false, // optional
+  userUrlInputFromIntelligent = null, // optional
+  datasetFromIntelligent = null, // optional
+  urlsCrawledFromIntelligent = null, // optional
+  crawledFromLocalFile = false, // optional
+) => {
+  let dataset;
+  let urlsCrawled;
+
+  // Boolean to omit axe scan for basic auth URL
+  let isBasicAuth;
+  let basicAuthPage = 0;
+  let finalLinks = [];
+  let authHeader = '';
+
+  if (fromCrawlIntelligentSitemap) {
+    dataset = datasetFromIntelligent;
+    urlsCrawled = urlsCrawledFromIntelligent;
+  } else {
+    ({ dataset } = await createCrawleeSubFolders(randomToken));
+    urlsCrawled = { ...constants.urlsCrawledObj };
+
+    if (!fs.existsSync(randomToken)) {
+      fs.mkdirSync(randomToken);
+    }
+  }
+
+  let parsedUrl;
+  let username = '';
+  let password = '';
+
+  if (!crawledFromLocalFile && isFilePath(sitemapUrl)) {
+    console.log('Local file crawling not supported for sitemap. Please provide a valid URL.');
+    return;
+  }
+
+  if (isFilePath(sitemapUrl)) {
+    parsedUrl = sitemapUrl;
+  } else {
+    parsedUrl = new URL(sitemapUrl);
+    if (parsedUrl.username !== '' && parsedUrl.password !== '') {
+      isBasicAuth = true;
+      username = decodeURIComponent(parsedUrl.username);
+      password = decodeURIComponent(parsedUrl.password);
+
+      // Create auth header
+      authHeader = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
+
+      parsedUrl.username = '';
+      parsedUrl.password = '';
+    }
+  }
+
+  const linksFromSitemap = await getLinksFromSitemap(
+    sitemapUrl,
+    maxRequestsPerCrawl,
+    browser,
+    userDataDirectory,
+    userUrlInputFromIntelligent,
+    fromCrawlIntelligentSitemap,
+    username,
+    password,
+  );
+  /**
+   * Handling for basic-auth URLs of the form http://username:password@hostname.com.
+   * The first request, made with the original URL still carrying credentials, is strictly to
+   * authenticate the browser session; subsequent URLs within the same domain are scanned
+   * without credentials. basicAuthPage is set to -2 so that the two bootstrap requests
+   * are not counted towards maxRequestsPerCrawl.
+   */
+
+  sitemapUrl = encodeURI(sitemapUrl);
+
+  if (isBasicAuth) {
+    // request to basic auth URL to authenticate for browser session
+    finalLinks.push(new Request({ url: sitemapUrl, uniqueKey: `auth:${sitemapUrl}` }));
+    const finalUrl = `${sitemapUrl.split('://')[0]}://${sitemapUrl.split('@')[1]}`;
+
+    // obtain base URL without credentials so that subsequent URLs within the same domain can be scanned
+    finalLinks.push(new Request({ url: finalUrl }));
+    basicAuthPage = -2;
+  }
+
+  const pdfDownloads = [];
+  const uuidToPdfMapping = {};
+  const isScanHtml = ['all', 'html-only'].includes(fileTypes);
+  const isScanPdfs = ['all', 'pdf-only'].includes(fileTypes);
+  const { playwrightDeviceDetailsObject } = viewportSettings;
+  const { maxConcurrency } = constants;
+
+  printMessage(['Fetching URLs. This might take some time...'], { border: false });
+
+  finalLinks = [...finalLinks, ...linksFromSitemap];
+
+  const requestList = await RequestList.open({
+    sources: finalLinks,
+  });
+  printMessage(['Fetch URLs completed. Beginning scan'], messageOptions);
+
+  let userDataDir = '';
+  if (userDataDirectory) {
+    userDataDir = process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '';
+  }
+
+  const crawler = new crawlee.PlaywrightCrawler({
+    launchContext: {
+      launcher: constants.launcher,
+      launchOptions: getPlaywrightLaunchOptions(browser),
+      // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
+      userDataDir,
+    },
+    retryOnBlocked: true,
+    browserPoolOptions: {
+      useFingerprints: false,
+      preLaunchHooks: [
+        async (pageId, launchContext) => {
+          launchContext.launchOptions = {
+            ...launchContext.launchOptions,
+            bypassCSP: true,
+            ignoreHTTPSErrors: true,
+            ...playwrightDeviceDetailsObject,
+          };
+        },
+      ],
+    },
+    requestList,
+    preNavigationHooks: isBasicAuth
+      ? [
+          async ({ page }) => {
+            await page.setExtraHTTPHeaders({
+              Authorization: authHeader,
+              ...extraHTTPHeaders,
+            });
+          },
+        ]
+      : [
+          async () => {
+            preNavigationHooks(extraHTTPHeaders);
+            // insert other code here
+          },
+        ],
+    requestHandlerTimeoutSecs: 90,
+    requestHandler: async ({ page, request, response, sendRequest }) => {
+      await waitForPageLoaded(page, 10000);
+
+      // Set basic auth header if needed
+      if (isBasicAuth) {
+        await page.setExtraHTTPHeaders({
+          Authorization: authHeader,
+        });
+        const currentUrl = new URL(request.url);
+        currentUrl.username = username;
+        currentUrl.password = password;
+        request.url = currentUrl.href;
+      }
+
+      const actualUrl = request.loadedUrl || request.url;
+
+      if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
+        crawler.autoscaledPool.abort();
+        return;
+      }
+
+      if (isUrlPdf(actualUrl)) {
+        if (!isScanPdfs) {
+          guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+            numScanned: urlsCrawled.scanned.length,
+            urlScanned: request.url,
+          });
+          urlsCrawled.blacklisted.push(request.url);
+          return;
+        }
+        // pushes download promise into pdfDownloads
+        const { pdfFileName, url } = handlePdfDownload(
+          randomToken,
+          pdfDownloads,
+          request,
+          sendRequest,
+          urlsCrawled,
+        );
+
+        uuidToPdfMapping[pdfFileName] = url;
+        return;
+      }
+
+      const contentType = response.headers()['content-type'];
+      const status = response.status();
+
+      if (blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
+        urlsCrawled.userExcluded.push(request.url);
+        return;
+      }
+
+      if (status === 403) {
+        guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+          numScanned: urlsCrawled.scanned.length,
+          urlScanned: request.url,
+        });
+        urlsCrawled.forbidden.push({ url: request.url });
+        return;
+      }
+
+      if (status !== 200) {
+        guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+          numScanned: urlsCrawled.scanned.length,
+          urlScanned: request.url,
+        });
+        urlsCrawled.invalid.push(request.url);
+        return;
+      }
+
+      if (basicAuthPage < 0) {
+        basicAuthPage += 1;
+      } else if (isScanHtml && status === 200 && isWhitelistedContentType(contentType)) {
+        const results = await runAxeScript({ includeScreenshots, page, randomToken });
+        guiInfoLog(guiInfoStatusTypes.SCANNED, {
+          numScanned: urlsCrawled.scanned.length,
+          urlScanned: request.url,
+        });
+
+        const isRedirected = !areLinksEqual(request.loadedUrl, request.url);
+        if (isRedirected) {
+          const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
+            item => (item.actualUrl || item.url.href) === request.loadedUrl,
+          );
+
+          if (isLoadedUrlInCrawledUrls) {
+            urlsCrawled.notScannedRedirects.push({
+              fromUrl: request.url,
+              toUrl: request.loadedUrl, // i.e. actualUrl
+            });
+            return;
+          }
+
+          urlsCrawled.scanned.push({
+            url: urlWithoutAuth(request.url),
+            pageTitle: results.pageTitle,
+            actualUrl: request.loadedUrl, // i.e. actualUrl
+          });
+
+          urlsCrawled.scannedRedirects.push({
+            fromUrl: urlWithoutAuth(request.url),
+            toUrl: request.loadedUrl, // i.e. actualUrl
+          });
+
+          results.url = request.url;
+          results.actualUrl = request.loadedUrl;
+        } else {
+          urlsCrawled.scanned.push({
+            url: urlWithoutAuth(request.url),
+            pageTitle: results.pageTitle,
+          });
+        }
+        await dataset.pushData(results);
+      } else {
+        guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+          numScanned: urlsCrawled.scanned.length,
+          urlScanned: request.url,
+        });
+
+        if (isScanHtml) {
+          urlsCrawled.invalid.push(actualUrl);
+        }
+      }
+    },
+    failedRequestHandler: async ({ request }) => {
+      if (isBasicAuth && request.url) {
+        request.url = `${request.url.split('://')[0]}://${request.url.split('@')[1]}`;
+      }
+
+      // check if scanned pages have reached the limit due to multiple instances of the handler running
+      if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
+        return;
+      }
+
+      guiInfoLog(guiInfoStatusTypes.ERROR, {
+        numScanned: urlsCrawled.scanned.length,
+        urlScanned: request.url,
+      });
+      urlsCrawled.error.push({ url: request.url });
+      crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
+    },
+    maxRequestsPerCrawl: Infinity,
+    maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+  });
+
+  await crawler.run();
+
+  await requestList.isFinished();
+
+  if (pdfDownloads.length > 0) {
+    // wait for pdf downloads to complete
+    await Promise.all(pdfDownloads);
+
+    // scan and process pdf documents
+    await runPdfScan(randomToken);
+
+    // transform result format
+    const pdfResults = await mapPdfScanResults(randomToken, uuidToPdfMapping);
+
+    // get screenshots from pdf docs
+    // if (includeScreenshots) {
+    //   await Promise.all(pdfResults.map(
+    //     async result => await doPdfScreenshots(randomToken, result)
+    //   ));
+    // }
+
+    // push results for each pdf document to key value store
+    await Promise.all(pdfResults.map(result => dataset.pushData(result)));
+  }
+
+  if (!fromCrawlIntelligentSitemap) {
+    guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
+  }
+
+  return urlsCrawled;
+};
+
+export default crawlSitemap;
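When crawlSitemap.ts finds credentials embedded in the sitemap URL, it strips them from the parsed URL and re-sends them as a standard basic-auth Authorization header on each navigation via `page.setExtraHTTPHeaders`. A sketch of the header construction, with hypothetical credentials:

    // Sketch of the auth header built in crawlSitemap above: the credentials
    // parsed out of the URL are joined with ':' and base64-encoded (RFC 7617).
    const basicAuthHeader = (username: string, password: string): string =>
      `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;

    // basicAuthHeader('user', 'pass') -> 'Basic dXNlcjpwYXNz'

Encoding the header once up front lets the crawler drop the credentials from every queued URL while still authenticating each request.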