@govtechsg/oobee 0.10.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.dockerignore +22 -0
- package/.github/pull_request_template.md +11 -0
- package/.github/workflows/docker-test.yml +54 -0
- package/.github/workflows/image.yml +107 -0
- package/.github/workflows/publish.yml +18 -0
- package/.idea/modules.xml +8 -0
- package/.idea/purple-a11y.iml +9 -0
- package/.idea/vcs.xml +6 -0
- package/.prettierrc.json +12 -0
- package/.vscode/extensions.json +5 -0
- package/.vscode/settings.json +10 -0
- package/CODE_OF_CONDUCT.md +128 -0
- package/DETAILS.md +163 -0
- package/Dockerfile +60 -0
- package/INSTALLATION.md +146 -0
- package/INTEGRATION.md +785 -0
- package/LICENSE +22 -0
- package/README.md +587 -0
- package/SECURITY.md +5 -0
- package/__mocks__/mock-report.html +1431 -0
- package/__mocks__/mockFunctions.ts +32 -0
- package/__mocks__/mockIssues.ts +64 -0
- package/__mocks__/mock_all_issues/000000001.json +64 -0
- package/__mocks__/mock_all_issues/000000002.json +53 -0
- package/__mocks__/mock_all_issues/fake-file.txt +0 -0
- package/__tests__/logs.test.ts +25 -0
- package/__tests__/mergeAxeResults.test.ts +278 -0
- package/__tests__/utils.test.ts +118 -0
- package/a11y-scan-results.zip +0 -0
- package/eslint.config.js +53 -0
- package/exclusions.txt +2 -0
- package/gitlab-pipeline-template.yml +54 -0
- package/jest.config.js +1 -0
- package/package.json +96 -0
- package/scripts/copyFiles.js +44 -0
- package/scripts/install_oobee_dependencies.cmd +13 -0
- package/scripts/install_oobee_dependencies.command +101 -0
- package/scripts/install_oobee_dependencies.ps1 +110 -0
- package/scripts/oobee_shell.cmd +13 -0
- package/scripts/oobee_shell.command +11 -0
- package/scripts/oobee_shell.sh +55 -0
- package/scripts/oobee_shell_ps.ps1 +54 -0
- package/src/cli.ts +401 -0
- package/src/combine.ts +240 -0
- package/src/constants/__tests__/common.test.ts +44 -0
- package/src/constants/cliFunctions.ts +305 -0
- package/src/constants/common.ts +1840 -0
- package/src/constants/constants.ts +443 -0
- package/src/constants/errorMeta.json +319 -0
- package/src/constants/itemTypeDescription.ts +11 -0
- package/src/constants/oobeeAi.ts +141 -0
- package/src/constants/questions.ts +181 -0
- package/src/constants/sampleData.ts +187 -0
- package/src/crawlers/__tests__/commonCrawlerFunc.test.ts +51 -0
- package/src/crawlers/commonCrawlerFunc.ts +656 -0
- package/src/crawlers/crawlDomain.ts +877 -0
- package/src/crawlers/crawlIntelligentSitemap.ts +156 -0
- package/src/crawlers/crawlLocalFile.ts +193 -0
- package/src/crawlers/crawlSitemap.ts +356 -0
- package/src/crawlers/custom/extractAndGradeText.ts +57 -0
- package/src/crawlers/custom/flagUnlabelledClickableElements.ts +964 -0
- package/src/crawlers/custom/utils.ts +486 -0
- package/src/crawlers/customAxeFunctions.ts +82 -0
- package/src/crawlers/pdfScanFunc.ts +468 -0
- package/src/crawlers/runCustom.ts +117 -0
- package/src/index.ts +173 -0
- package/src/logs.ts +66 -0
- package/src/mergeAxeResults.ts +964 -0
- package/src/npmIndex.ts +284 -0
- package/src/screenshotFunc/htmlScreenshotFunc.ts +411 -0
- package/src/screenshotFunc/pdfScreenshotFunc.ts +762 -0
- package/src/static/ejs/partials/components/categorySelector.ejs +4 -0
- package/src/static/ejs/partials/components/categorySelectorDropdown.ejs +57 -0
- package/src/static/ejs/partials/components/pagesScannedModal.ejs +70 -0
- package/src/static/ejs/partials/components/reportSearch.ejs +47 -0
- package/src/static/ejs/partials/components/ruleOffcanvas.ejs +105 -0
- package/src/static/ejs/partials/components/scanAbout.ejs +263 -0
- package/src/static/ejs/partials/components/screenshotLightbox.ejs +13 -0
- package/src/static/ejs/partials/components/summaryScanAbout.ejs +141 -0
- package/src/static/ejs/partials/components/summaryScanResults.ejs +16 -0
- package/src/static/ejs/partials/components/summaryTable.ejs +20 -0
- package/src/static/ejs/partials/components/summaryWcagCompliance.ejs +94 -0
- package/src/static/ejs/partials/components/topFive.ejs +6 -0
- package/src/static/ejs/partials/components/wcagCompliance.ejs +70 -0
- package/src/static/ejs/partials/footer.ejs +21 -0
- package/src/static/ejs/partials/header.ejs +230 -0
- package/src/static/ejs/partials/main.ejs +40 -0
- package/src/static/ejs/partials/scripts/bootstrap.ejs +8 -0
- package/src/static/ejs/partials/scripts/categorySelectorDropdownScript.ejs +190 -0
- package/src/static/ejs/partials/scripts/categorySummary.ejs +141 -0
- package/src/static/ejs/partials/scripts/highlightjs.ejs +335 -0
- package/src/static/ejs/partials/scripts/popper.ejs +7 -0
- package/src/static/ejs/partials/scripts/reportSearch.ejs +248 -0
- package/src/static/ejs/partials/scripts/ruleOffcanvas.ejs +801 -0
- package/src/static/ejs/partials/scripts/screenshotLightbox.ejs +71 -0
- package/src/static/ejs/partials/scripts/summaryScanResults.ejs +14 -0
- package/src/static/ejs/partials/scripts/summaryTable.ejs +78 -0
- package/src/static/ejs/partials/scripts/utils.ejs +441 -0
- package/src/static/ejs/partials/styles/bootstrap.ejs +12375 -0
- package/src/static/ejs/partials/styles/highlightjs.ejs +54 -0
- package/src/static/ejs/partials/styles/styles.ejs +1843 -0
- package/src/static/ejs/partials/styles/summaryBootstrap.ejs +12458 -0
- package/src/static/ejs/partials/summaryHeader.ejs +70 -0
- package/src/static/ejs/partials/summaryMain.ejs +75 -0
- package/src/static/ejs/report.ejs +420 -0
- package/src/static/ejs/summary.ejs +47 -0
- package/src/static/mustache/.prettierrc +4 -0
- package/src/static/mustache/Attention Deficit.mustache +11 -0
- package/src/static/mustache/Blind.mustache +11 -0
- package/src/static/mustache/Cognitive.mustache +7 -0
- package/src/static/mustache/Colorblindness.mustache +20 -0
- package/src/static/mustache/Deaf.mustache +12 -0
- package/src/static/mustache/Deafblind.mustache +7 -0
- package/src/static/mustache/Dyslexia.mustache +14 -0
- package/src/static/mustache/Low Vision.mustache +7 -0
- package/src/static/mustache/Mobility.mustache +15 -0
- package/src/static/mustache/Sighted Keyboard Users.mustache +42 -0
- package/src/static/mustache/report.mustache +1709 -0
- package/src/types/print-message.d.ts +28 -0
- package/src/types/types.ts +46 -0
- package/src/types/xpath-to-css.d.ts +3 -0
- package/src/utils.ts +332 -0
- package/tsconfig.json +15 -0
--- /dev/null
+++ package/src/crawlers/crawlDomain.ts
@@ -0,0 +1,877 @@
+import crawlee, { EnqueueStrategy } from 'crawlee';
+import fs from 'fs';
+import type { BrowserContext, ElementHandle, Frame, Page } from 'playwright';
+import type { EnqueueLinksOptions, RequestOptions } from 'crawlee';
+import axios from 'axios';
+import { fileTypeFromBuffer } from 'file-type';
+import mime from 'mime-types';
+import https from 'https';
+import type { BatchAddRequestsResult } from '@crawlee/types';
+import {
+  createCrawleeSubFolders,
+  preNavigationHooks,
+  runAxeScript,
+  isUrlPdf,
+} from './commonCrawlerFunc.js';
+import constants, {
+  UrlsCrawled,
+  blackListedFileExtensions,
+  guiInfoStatusTypes,
+  cssQuerySelectors,
+  RuleFlags,
+} from '../constants/constants.js';
+import {
+  getPlaywrightLaunchOptions,
+  isBlacklistedFileExtensions,
+  isSkippedUrl,
+  isDisallowedInRobotsTxt,
+  getUrlsFromRobotsTxt,
+  getBlackListedPatterns,
+  urlWithoutAuth,
+  waitForPageLoaded,
+} from '../constants/common.js';
+import { areLinksEqual, isFollowStrategy } from '../utils.js';
+import {
+  handlePdfDownload,
+  runPdfScan,
+  mapPdfScanResults,
+  doPdfScreenshots,
+} from './pdfScanFunc.js';
+import { silentLogger, guiInfoLog } from '../logs.js';
+import { ViewportSettingsClass } from '../combine.js';
+
+const isBlacklisted = (url: string) => {
+  const blacklistedPatterns = getBlackListedPatterns(null);
+  if (!blacklistedPatterns) {
+    return false;
+  }
+  try {
+    const parsedUrl = new URL(url);
+
+    return blacklistedPatterns.some(
+      pattern => new RegExp(pattern).test(parsedUrl.hostname) || new RegExp(pattern).test(url),
+    );
+  } catch (error) {
+    console.error(`Error parsing URL: ${url}`, error);
+    return false;
+  }
+};
+
+const crawlDomain = async ({
+  url,
+  randomToken,
+  host: _host,
+  viewportSettings,
+  maxRequestsPerCrawl,
+  browser,
+  userDataDirectory,
+  strategy,
+  specifiedMaxConcurrency,
+  fileTypes,
+  blacklistedPatterns,
+  includeScreenshots,
+  followRobots,
+  extraHTTPHeaders,
+  safeMode = false,
+  fromCrawlIntelligentSitemap = false,
+  datasetFromIntelligent = null,
+  urlsCrawledFromIntelligent = null,
+  ruleset = [],
+}: {
+  url: string;
+  randomToken: string;
+  host: string;
+  viewportSettings: ViewportSettingsClass;
+  maxRequestsPerCrawl: number;
+  browser: string;
+  userDataDirectory: string;
+  strategy: EnqueueStrategy;
+  specifiedMaxConcurrency: number;
+  fileTypes: string;
+  blacklistedPatterns: string[];
+  includeScreenshots: boolean;
+  followRobots: boolean;
+  extraHTTPHeaders: Record<string, string>;
+  safeMode?: boolean;
+  fromCrawlIntelligentSitemap?: boolean;
+  datasetFromIntelligent?: crawlee.Dataset;
+  urlsCrawledFromIntelligent?: UrlsCrawled;
+  ruleset?: RuleFlags[];
+}) => {
+  let dataset: crawlee.Dataset;
+  let urlsCrawled: UrlsCrawled;
+  let requestQueue: crawlee.RequestQueue;
+
+  if (fromCrawlIntelligentSitemap) {
+    dataset = datasetFromIntelligent;
+    urlsCrawled = urlsCrawledFromIntelligent;
+  } else {
+    ({ dataset } = await createCrawleeSubFolders(randomToken));
+    urlsCrawled = { ...constants.urlsCrawledObj };
+  }
+
+  ({ requestQueue } = await createCrawleeSubFolders(randomToken));
+
+  if (!fs.existsSync(randomToken)) {
+    fs.mkdirSync(randomToken);
+  }
+
+  const pdfDownloads = [];
+  const uuidToPdfMapping = {};
+  const isScanHtml = ['all', 'html-only'].includes(fileTypes);
+  const isScanPdfs = ['all', 'pdf-only'].includes(fileTypes);
+  const { maxConcurrency } = constants;
+  const { playwrightDeviceDetailsObject } = viewportSettings;
+  const isBlacklistedUrl = isBlacklisted(url);
+
+  const httpsAgent = new https.Agent({ rejectUnauthorized: false });
+
+  if (isBlacklistedUrl) {
+    guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+      numScanned: urlsCrawled.scanned.length,
+      urlScanned: url,
+    });
+    return;
+  }
+
+  // Boolean to omit axe scan for basic auth URL
+  let isBasicAuth = false;
+  let authHeader = '';
+
+  // Test basic auth and add auth header if auth exist
+  const parsedUrl = new URL(url);
+  let username: string;
+  let password: string;
+  if (parsedUrl.username !== '' && parsedUrl.password !== '') {
+    isBasicAuth = true;
+    username = decodeURIComponent(parsedUrl.username);
+    password = decodeURIComponent(parsedUrl.password);
+
+    // Create auth header
+    authHeader = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
+
+    // Remove username from parsedUrl
+    parsedUrl.username = '';
+    parsedUrl.password = '';
+    // Send the finalUrl without credentials by setting auth header instead
+    const finalUrl = parsedUrl.toString();
+
+    await requestQueue.addRequest({
+      url: finalUrl,
+      skipNavigation: isUrlPdf(finalUrl),
+      headers: {
+        Authorization: authHeader,
+      },
+      label: finalUrl,
+    });
+  } else {
+    await requestQueue.addRequest({
+      url,
+      skipNavigation: isUrlPdf(url),
+      label: url,
+    });
+  }
+
+  const httpHeadCache = new Map<string, boolean>();
+  const isProcessibleUrl = async (url: string): Promise<boolean> => {
+    if (httpHeadCache.has(url)) {
+      silentLogger.info('cache hit', url, httpHeadCache.get(url));
+      return false; // return false to avoid processing the url again
+    }
+
+    try {
+      // Send a HEAD request to check headers without downloading the file
+      const headResponse = await axios.head(url, {
+        headers: { Authorization: authHeader },
+        httpsAgent,
+      });
+      const contentType = headResponse.headers['content-type'] || '';
+      const contentDisposition = headResponse.headers['content-disposition'] || '';
+
+      // Check if the response suggests it's a downloadable file based on Content-Disposition header
+      if (contentDisposition.includes('attachment')) {
+        silentLogger.info(`Skipping URL due to attachment header: ${url}`);
+        httpHeadCache.set(url, false);
+        return false;
+      }
+
+      // Check if the MIME type suggests it's a downloadable file
+      if (contentType.startsWith('application/') || contentType.includes('octet-stream')) {
+        silentLogger.info(`Skipping potential downloadable file: ${contentType} at URL ${url}`);
+        httpHeadCache.set(url, false);
+        return false;
+      }
+
+      // Use the mime-types library to ensure it's processible content (e.g., HTML or plain text)
+      const mimeType = mime.lookup(contentType);
+      if (mimeType && !mimeType.startsWith('text/html') && !mimeType.startsWith('text/')) {
+        silentLogger.info(`Detected non-processible MIME type: ${mimeType} at URL ${url}`);
+        httpHeadCache.set(url, false);
+        return false;
+      }
+
+      // Additional check for zip files by their magic number (PK\x03\x04)
+      if (url.endsWith('.zip')) {
+        silentLogger.info(`Checking for zip file magic number at URL ${url}`);
+
+        // Download the first few bytes of the file to check for the magic number
+        const byteResponse = await axios.get(url, {
+          headers: { Range: 'bytes=0-3', Authorization: authHeader },
+          responseType: 'arraybuffer',
+          httpsAgent,
+        });
+
+        const magicNumber = byteResponse.data.toString('hex');
+        if (magicNumber === '504b0304') {
+          silentLogger.info(`Skipping zip file at URL ${url}`);
+          httpHeadCache.set(url, false);
+          return false;
+        }
+        silentLogger.info(
+          `Not skipping ${url}, magic number does not match ZIP file: ${magicNumber}`,
+        );
+      }
+
+      // If you want more robust checks, you can download a portion of the content and use the file-type package to detect file types by content
+      const response = await axios.get(url, {
+        headers: { Range: 'bytes=0-4100', Authorization: authHeader },
+        responseType: 'arraybuffer',
+        httpsAgent,
+      });
+
+      const fileType = await fileTypeFromBuffer(response.data);
+      if (
+        fileType &&
+        !fileType.mime.startsWith('text/html') &&
+        !fileType.mime.startsWith('text/')
+      ) {
+        silentLogger.info(`Detected downloadable file of type ${fileType.mime} at URL ${url}`);
+        httpHeadCache.set(url, false);
+        return false;
+      }
+    } catch (e) {
+      // silentLogger.error(`Error checking the MIME type of ${url}: ${e.message}`);
+      // If an error occurs (e.g., a network issue), assume the URL is processible
+      httpHeadCache.set(url, true);
+      return true;
+    }
+
+    // If none of the conditions to skip are met, allow processing of the URL
+    httpHeadCache.set(url, true);
+    return true;
+  };
+
+  const enqueueProcess = async (
+    page: Page,
+    enqueueLinks: (options: EnqueueLinksOptions) => Promise<BatchAddRequestsResult>,
+    browserContext: BrowserContext,
+  ) => {
+    try {
+      await enqueueLinks({
+        // set selector matches anchor elements with href but not contains # or starting with mailto:
+        selector: 'a:not(a[href*="#"],a[href^="mailto:"])',
+        strategy,
+        requestQueue,
+        transformRequestFunction: (req: RequestOptions): RequestOptions | null => {
+          try {
+            req.url = req.url.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
+          } catch (e) {
+            silentLogger.error(e);
+          }
+          if (urlsCrawled.scanned.some(item => item.url === req.url)) {
+            req.skipNavigation = true;
+          }
+          if (isDisallowedInRobotsTxt(req.url)) return null;
+          if (isUrlPdf(req.url)) {
+            // playwright headless mode does not support navigation to pdf document
+            req.skipNavigation = true;
+          }
+          req.label = req.url;
+
+          return req;
+        },
+      });
+
+      // If safeMode flag is enabled, skip enqueueLinksByClickingElements
+      if (!safeMode) {
+        // Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
+        try {
+          await customEnqueueLinksByClickingElements(page, browserContext);
+        } catch (e) {
+          silentLogger.info(e);
+        }
+      }
+    } catch {
+      // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
+      // Handles browser page object been closed.
+    }
+  };
+
+  const customEnqueueLinksByClickingElements = async (
+    page: Page,
+    browserContext: BrowserContext,
+  ): Promise<void> => {
+    const initialPageUrl: string = page.url().toString();
+
+    const isExcluded = (newPageUrl: string): boolean => {
+      const isAlreadyScanned: boolean = urlsCrawled.scanned.some(item => item.url === newPageUrl);
+      const isBlacklistedUrl: boolean = isBlacklisted(newPageUrl);
+      const isNotFollowStrategy: boolean = !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
+      return isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy;
+    };
+    const setPageListeners = (page: Page): void => {
+      // event listener to handle new page popups upon button click
+      page.on('popup', async (newPage: Page) => {
+        try {
+          if (newPage.url() != initialPageUrl && !isExcluded(newPage.url())) {
+            const newPageUrl: string = newPage.url().replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
+            await requestQueue.addRequest({
+              url: newPageUrl,
+              skipNavigation: isUrlPdf(newPage.url()),
+              label: newPageUrl,
+            });
+          } else {
+            try {
+              await newPage.close();
+            } catch {
+              // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
+              // Handles browser page object been closed.
+            }
+          }
+        } catch {
+          // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
+          // Handles browser page object been closed.
+        }
+      });
+
+      // event listener to handle navigation to new url within same page upon element click
+      page.on('framenavigated', async (newFrame: Frame) => {
+        try {
+          if (
+            newFrame.url() !== initialPageUrl &&
+            !isExcluded(newFrame.url()) &&
+            !(newFrame.url() == 'about:blank')
+          ) {
+            const newFrameUrl: string = newFrame.url().replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
+            await requestQueue.addRequest({
+              url: newFrameUrl,
+              skipNavigation: isUrlPdf(newFrame.url()),
+              label: newFrameUrl,
+            });
+          }
+        } catch {
+          // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
+          // Handles browser page object been closed.
+        }
+      });
+    };
+    setPageListeners(page);
+    let currentElementIndex: number = 0;
+    let isAllElementsHandled: boolean = false;
+    while (!isAllElementsHandled) {
+      try {
+        // navigate back to initial page if clicking on a element previously caused it to navigate to a new url
+        if (page.url() != initialPageUrl) {
+          try {
+            await page.close();
+          } catch {
+            // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
+            // Handles browser page object been closed.
+          }
+          page = await browserContext.newPage();
+          await page.goto(initialPageUrl, {
+            waitUntil: 'domcontentloaded',
+          });
+          setPageListeners(page);
+        }
+        const selectedElementsString = cssQuerySelectors.join(', ');
+        const selectedElements: ElementHandle<SVGElement | HTMLElement>[] =
+          await page.$$(selectedElementsString);
+        // edge case where there might be elements on page that appears intermittently
+        if (currentElementIndex + 1 > selectedElements.length || !selectedElements) {
+          break;
+        }
+        // handle the last element in selectedElements
+        if (currentElementIndex + 1 === selectedElements.length) {
+          isAllElementsHandled = true;
+        }
+        const element: ElementHandle<SVGElement | HTMLElement> =
+          selectedElements[currentElementIndex];
+        currentElementIndex += 1;
+        let newUrlFoundInElement: string = null;
+        if (await element.isVisible()) {
+          // Find url in html elements without clicking them
+          await page
+            .evaluate(element => {
+              // find href attribute
+              const hrefUrl: string = element.getAttribute('href');
+
+              // find url in datapath
+              const dataPathUrl: string = element.getAttribute('data-path');
+
+              return hrefUrl || dataPathUrl;
+            }, element)
+            .then(result => {
+              if (result) {
+                newUrlFoundInElement = result;
+                const pageUrl: URL = new URL(page.url());
+                const baseUrl: string = `${pageUrl.protocol}//${pageUrl.host}`;
+                let absoluteUrl: URL;
+                // Construct absolute URL using base URL
+                try {
+                  // Check if newUrlFoundInElement is a valid absolute URL
+                  absoluteUrl = new URL(newUrlFoundInElement);
+                } catch (e) {
+                  // If it's not a valid URL, treat it as a relative URL
+                  absoluteUrl = new URL(newUrlFoundInElement, baseUrl);
+                }
+                newUrlFoundInElement = absoluteUrl.href;
+              }
+            });
+          if (newUrlFoundInElement && !isExcluded(newUrlFoundInElement)) {
+            const newUrlFoundInElementUrl: string = newUrlFoundInElement.replace(
+              /(?<=&|\?)utm_.*?(&|$)/gim,
+              '',
+            );
+
+            await requestQueue.addRequest({
+              url: newUrlFoundInElementUrl,
+              skipNavigation: isUrlPdf(newUrlFoundInElement),
+              label: newUrlFoundInElementUrl,
+            });
+          } else if (!newUrlFoundInElement) {
+            try {
+              // Find url in html elements by manually clicking them. New page navigation/popups will be handled by event listeners above
+              await element.click({ force: true });
+              await page.waitForTimeout(1000); // Add a delay of 1 second between each Element click
+            } catch {
+              // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
+              // Handles browser page object been closed.
+            }
+          }
+        }
+      } catch {
+        // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
+        // Handles browser page object been closed.
+      }
+    }
+  };
+
+  let isAbortingScanNow = false;
+
+  let userDataDir = '';
+  if (userDataDirectory) {
+    userDataDir = process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '';
+  }
+
+  const crawler = new crawlee.PlaywrightCrawler({
+    launchContext: {
+      launcher: constants.launcher,
+      launchOptions: getPlaywrightLaunchOptions(browser),
+      // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
+      userDataDir,
+    },
+    retryOnBlocked: true,
+    browserPoolOptions: {
+      useFingerprints: false,
+      preLaunchHooks: [
+        async (_pageId, launchContext) => {
+          launchContext.launchOptions = {
+            ...launchContext.launchOptions,
+            bypassCSP: true,
+            ignoreHTTPSErrors: true,
+            ...playwrightDeviceDetailsObject,
+          };
+        },
+      ],
+    },
+    requestQueue,
+    postNavigationHooks: [
+      async crawlingContext => {
+        const { page, request } = crawlingContext;
+
+        request.skipNavigation = true;
+
+        await page.evaluate(() => {
+          return new Promise(resolve => {
+            let timeout;
+            let mutationCount = 0;
+            const MAX_MUTATIONS = 100;
+            const MAX_SAME_MUTATION_LIMIT = 10;
+            const mutationHash = {};
+
+            const observer = new MutationObserver(mutationsList => {
+              clearTimeout(timeout);
+
+              mutationCount += 1;
+
+              if (mutationCount > MAX_MUTATIONS) {
+                observer.disconnect();
+                resolve('Too many mutations detected');
+              }
+
+              // To handle scenario where DOM elements are constantly changing and unable to exit
+              mutationsList.forEach(mutation => {
+                let mutationKey;
+
+                if (mutation.target instanceof Element) {
+                  Array.from(mutation.target.attributes).forEach(attr => {
+                    mutationKey = `${mutation.target.nodeName}-${attr.name}`;
+
+                    if (mutationKey) {
+                      if (!mutationHash[mutationKey]) {
+                        mutationHash[mutationKey] = 1;
+                      } else {
+                        mutationHash[mutationKey]++;
+                      }
+
+                      if (mutationHash[mutationKey] >= MAX_SAME_MUTATION_LIMIT) {
+                        observer.disconnect();
+                        resolve(`Repeated mutation detected for ${mutationKey}`);
+                      }
+                    }
+                  });
+                }
+              });
+
+              timeout = setTimeout(() => {
+                observer.disconnect();
+                resolve('DOM stabilized after mutations.');
+              }, 1000);
+            });
+
+            timeout = setTimeout(() => {
+              observer.disconnect();
+              resolve('No mutations detected, exit from idle state');
+            }, 1000);
+
+            observer.observe(document, { childList: true, subtree: true, attributes: true });
+          });
+        });
+
+        let finalUrl = page.url();
+        const requestLabelUrl = request.label;
+
+        // to handle scenario where the redirected link is not within the scanning website
+        const isLoadedUrlFollowStrategy = isFollowStrategy(finalUrl, requestLabelUrl, strategy);
+        if (!isLoadedUrlFollowStrategy) {
+          finalUrl = requestLabelUrl;
+        }
+
+        const isRedirected = !areLinksEqual(finalUrl, requestLabelUrl);
+        if (isRedirected) {
+          await requestQueue.addRequest({ url: finalUrl, label: finalUrl });
+        } else {
+          request.skipNavigation = false;
+        }
+      },
+    ],
+    preNavigationHooks: isBasicAuth
+      ? [
+          async ({ page, request }) => {
+            await page.setExtraHTTPHeaders({
+              Authorization: authHeader,
+              ...extraHTTPHeaders,
+            });
+            const processible = await isProcessibleUrl(request.url);
+            if (!processible) {
+              request.skipNavigation = true;
+              return null;
+            }
+          },
+        ]
+      : [
+          async (crawlingContext, gotoOptions) => {
+            const { page, request } = crawlingContext;
+
+            await page.setExtraHTTPHeaders({
+              ...extraHTTPHeaders,
+            });
+
+            Object.assign(gotoOptions, {
+              waitUntil: 'networkidle',
+              timeout: 30000,
+            });
+
+            const processible = await isProcessibleUrl(request.url);
+            if (!processible) {
+              request.skipNavigation = true;
+              return null;
+            }
+          },
+        ],
+    requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds
+    requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
+      const browserContext: BrowserContext = page.context();
+      try {
+        // Set basic auth header if needed
+        if (isBasicAuth) {
+          await page.setExtraHTTPHeaders({
+            Authorization: authHeader,
+          });
+          const currentUrl = new URL(request.url);
+          currentUrl.username = username;
+          currentUrl.password = password;
+          request.url = currentUrl.href;
+        }
+
+        await waitForPageLoaded(page, 10000);
+        let actualUrl = request.url;
+
+        if (page.url() !== 'about:blank') {
+          actualUrl = page.url();
+        }
+
+        if (isBlacklisted(actualUrl) || (isUrlPdf(actualUrl) && !isScanPdfs)) {
+          guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+            numScanned: urlsCrawled.scanned.length,
+            urlScanned: actualUrl,
+          });
+          return;
+        }
+
+        if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
+          isAbortingScanNow = true;
+          crawler.autoscaledPool.abort();
+          return;
+        }
+
+        // if URL has already been scanned
+        if (urlsCrawled.scanned.some(item => item.url === request.url)) {
+          // await enqueueProcess(page, enqueueLinks, browserContext);
+          return;
+        }
+
+        if (isDisallowedInRobotsTxt(request.url)) {
+          await enqueueProcess(page, enqueueLinks, browserContext);
+          return;
+        }
+
+        // handle pdfs
+        if (request.skipNavigation && isUrlPdf(actualUrl)) {
+          if (!isScanPdfs) {
+            guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+              numScanned: urlsCrawled.scanned.length,
+              urlScanned: request.url,
+            });
+            urlsCrawled.blacklisted.push(request.url);
+            return;
+          }
+          const { pdfFileName, url } = handlePdfDownload(
+            randomToken,
+            pdfDownloads,
+            request,
+            sendRequest,
+            urlsCrawled,
+          );
+
+          uuidToPdfMapping[pdfFileName] = url;
+          return;
+        }
+
+        const resHeaders = response ? response.headers() : {}; // Safely access response headers
+        const contentType = resHeaders['content-type'] || ''; // Ensure contentType is defined
+
+        // Skip non-HTML and non-PDF URLs
+        if (!contentType.includes('text/html') && !contentType.includes('application/pdf')) {
+          guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+            numScanned: urlsCrawled.scanned.length,
+            urlScanned: request.url,
+          });
+          urlsCrawled.blacklisted.push(request.url);
+          return;
+        }
+
+        if (isBlacklistedFileExtensions(actualUrl, blackListedFileExtensions)) {
+          guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+            numScanned: urlsCrawled.scanned.length,
+            urlScanned: request.url,
+          });
+          urlsCrawled.blacklisted.push(request.url);
+          return;
+        }
+
+        if (blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
+          urlsCrawled.userExcluded.push(request.url);
+          await enqueueProcess(page, enqueueLinks, browserContext);
+          return;
+        }
+
+        if (response.status() === 403) {
+          guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+            numScanned: urlsCrawled.scanned.length,
+            urlScanned: request.url,
+          });
+          urlsCrawled.forbidden.push(request.url);
+          return;
+        }
+
+        if (response.status() !== 200) {
+          guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+            numScanned: urlsCrawled.scanned.length,
+            urlScanned: request.url,
+          });
+          urlsCrawled.invalid.push(request.url);
+          return;
+        }
+
+        if (isScanHtml) {
+          // For deduplication, if the URL is redirected, we want to store the original URL and the redirected URL (actualUrl)
+          const isRedirected = !areLinksEqual(request.loadedUrl, request.url);
+
+          // check if redirected link is following strategy (same-domain/same-hostname)
+          const isLoadedUrlFollowStrategy = isFollowStrategy(
+            request.loadedUrl,
+            request.url,
+            strategy,
+          );
+          if (isRedirected && !isLoadedUrlFollowStrategy) {
+            urlsCrawled.notScannedRedirects.push({
+              fromUrl: request.url,
+              toUrl: request.loadedUrl, // i.e. actualUrl
+            });
+            return;
+          }
+
+          const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });
+
+          if (isRedirected) {
+            const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
+              item => (item.actualUrl || item.url) === request.loadedUrl,
+            );
+
+            if (isLoadedUrlInCrawledUrls) {
+              urlsCrawled.notScannedRedirects.push({
+                fromUrl: request.url,
+                toUrl: request.loadedUrl, // i.e. actualUrl
+              });
+              return;
+            }
+
+            // One more check if scanned pages have reached limit due to multi-instances of handler running
+            if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
+              guiInfoLog(guiInfoStatusTypes.SCANNED, {
+                numScanned: urlsCrawled.scanned.length,
+                urlScanned: request.url,
+              });
+
+              urlsCrawled.scanned.push({
+                url: urlWithoutAuth(request.url),
+                pageTitle: results.pageTitle,
+                actualUrl: request.loadedUrl, // i.e. actualUrl
+              });
+
+              urlsCrawled.scannedRedirects.push({
+                fromUrl: urlWithoutAuth(request.url),
+                toUrl: request.loadedUrl, // i.e. actualUrl
+              });
+
+              results.url = request.url;
+              results.actualUrl = request.loadedUrl;
+              await dataset.pushData(results);
+            }
+          } else {
+            // One more check if scanned pages have reached limit due to multi-instances of handler running
+            if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
+              guiInfoLog(guiInfoStatusTypes.SCANNED, {
+                numScanned: urlsCrawled.scanned.length,
+                urlScanned: urlWithoutAuth(request.url),
+              });
+              urlsCrawled.scanned.push({
+                url: urlWithoutAuth(request.url),
+                actualUrl: request.url,
+                pageTitle: results.pageTitle,
+              });
+              await dataset.pushData(results);
+            }
+          }
+        } else {
+          guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+            numScanned: urlsCrawled.scanned.length,
+            urlScanned: request.url,
+          });
+          urlsCrawled.blacklisted.push(request.url);
+        }
+
+        if (followRobots) await getUrlsFromRobotsTxt(request.url, browser);
+        await enqueueProcess(page, enqueueLinks, browserContext);
+      } catch (e) {
+        try {
+          if (!e.message.includes('page.evaluate')) {
+            silentLogger.info(e);
+            guiInfoLog(guiInfoStatusTypes.ERROR, {
+              numScanned: urlsCrawled.scanned.length,
+              urlScanned: request.url,
+            });
+
+            page = await browserContext.newPage();
+            await page.goto(request.url);
+
+            await page.route('**/*', async route => {
+              const interceptedRequest = route.request();
+              if (interceptedRequest.resourceType() === 'document') {
+                const interceptedRequestUrl = interceptedRequest
+                  .url()
+                  .replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
+                await requestQueue.addRequest({
+                  url: interceptedRequestUrl,
+                  skipNavigation: isUrlPdf(interceptedRequest.url()),
+                  label: interceptedRequestUrl,
+                });
+              }
+            });
+          }
+        } catch {
+          // Do nothing since the error will be pushed
+        }
+
+        // when max pages have been scanned, scan will abort and all relevant pages still opened will close instantly.
+        // a browser close error will then be flagged. Since this is an intended behaviour, this error will be excluded.
+        if (!isAbortingScanNow) {
+          urlsCrawled.error.push({ url: request.url });
+        }
+      }
+    },
+    failedRequestHandler: async ({ request }) => {
+      guiInfoLog(guiInfoStatusTypes.ERROR, {
+        numScanned: urlsCrawled.scanned.length,
+        urlScanned: request.url,
+      });
+      urlsCrawled.error.push({ url: request.url });
+      crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
+    },
+    maxRequestsPerCrawl: Infinity,
+    maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+  });
+
+  await crawler.run();
+
+  if (pdfDownloads.length > 0) {
+    // wait for pdf downloads to complete
+    await Promise.all(pdfDownloads);
+
+    // scan and process pdf documents
+    await runPdfScan(randomToken);
+
+    // transform result format
+    const pdfResults = await mapPdfScanResults(randomToken, uuidToPdfMapping);
+
+    // get screenshots from pdf docs
+    if (includeScreenshots) {
+      await Promise.all(
+        pdfResults.map(async result => await doPdfScreenshots(randomToken, result)),
+      );
+    }
+
+    // push results for each pdf document to key value store
+    await Promise.all(pdfResults.map(result => dataset.pushData(result)));
+  }
+
+  if (!fromCrawlIntelligentSitemap) {
+    guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
+  }
+
+  return urlsCrawled;
+};
+
+export default crawlDomain;
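
For orientation, below is a minimal, hypothetical invocation sketch of the exported crawlDomain function. Only the parameter names and types come from the signature in the diff above; every argument value, and the assumed shape of ViewportSettingsClass beyond its playwrightDeviceDetailsObject field, is an illustrative assumption rather than a documented default.

    // Hypothetical usage sketch -- argument values are assumptions, not package defaults.
    // Assumes an ES module context (the package uses ESM imports), so top-level await is available.
    import { EnqueueStrategy } from 'crawlee';
    import crawlDomain from './crawlers/crawlDomain.js';
    import { ViewportSettingsClass } from './combine.js';

    const urlsCrawled = await crawlDomain({
      url: 'https://example.com',
      randomToken: 'scan-1234', // also used as the results folder name (fs.mkdirSync(randomToken) above)
      host: 'example.com',
      // Shape of ViewportSettingsClass is assumed; crawlDomain only destructures playwrightDeviceDetailsObject from it
      viewportSettings: { playwrightDeviceDetailsObject: {} } as unknown as ViewportSettingsClass,
      maxRequestsPerCrawl: 100,
      browser: 'chromium',
      userDataDirectory: '',
      strategy: EnqueueStrategy.SameDomain, // keep the crawl on the starting domain
      specifiedMaxConcurrency: 5,
      fileTypes: 'html-only', // 'all' | 'html-only' | 'pdf-only', per the isScanHtml/isScanPdfs checks
      blacklistedPatterns: [],
      includeScreenshots: false,
      followRobots: true,
      extraHTTPHeaders: {},
    });

    console.log(`Scanned ${urlsCrawled.scanned.length} pages`);

The optional parameters (safeMode, fromCrawlIntelligentSitemap, datasetFromIntelligent, urlsCrawledFromIntelligent, ruleset) are omitted here and fall back to the defaults visible in the destructuring above.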