@govtechsg/oobee 0.10.58 → 0.10.61
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/DETAILS.md +1 -1
- package/package.json +1 -1
- package/src/cli.ts +17 -64
- package/src/combine.ts +18 -4
- package/src/constants/common.ts +193 -293
- package/src/constants/constants.ts +2 -1
- package/src/constants/questions.ts +12 -4
- package/src/crawlers/commonCrawlerFunc.ts +9 -3
- package/src/crawlers/crawlDomain.ts +31 -83
- package/src/crawlers/crawlIntelligentSitemap.ts +16 -11
- package/src/crawlers/crawlLocalFile.ts +6 -17
- package/src/crawlers/crawlSitemap.ts +27 -93
- package/src/crawlers/custom/utils.ts +4 -4
- package/src/index.ts +2 -5
- package/src/logs.ts +1 -2
- package/src/mergeAxeResults.ts +35 -30
- package/src/npmIndex.ts +4 -4
- package/src/utils.ts +56 -14
package/src/constants/constants.ts

@@ -452,7 +452,7 @@ const reserveFileNameKeywords = [
 
 export default {
   cliZipFileName: 'oobee-scan-results.zip',
-  exportDirectory:
+  exportDirectory: undefined,
   maxRequestsPerCrawl,
   maxConcurrency: 25,
   urlsCrawledObj,
@@ -466,6 +466,7 @@ export default {
   reserveFileNameKeywords,
   wcagLinks,
   robotsTxtUrls: null,
+  userDataDirectory: null, // This will be set later in the code
 };
 
 export const rootPath = dirname;
package/src/constants/questions.ts

@@ -1,6 +1,6 @@
 import { Question } from 'inquirer';
 import { Answers } from '../index.js';
-import { getUserDataTxt, setHeadlessMode } from '../utils.js';
+import { getUserDataTxt, randomThreeDigitNumberString, setHeadlessMode } from '../utils.js';
 import {
   checkUrl,
   deleteClonedProfiles,
@@ -15,6 +15,7 @@ import {
   parseHeaders,
 } from './common.js';
 import constants, { BrowserTypes, ScannerTypes } from './constants.js';
+import { random } from 'lodash';
 
 const userData = getUserDataTxt();
 
@@ -78,8 +79,15 @@ const startScanQuestions = [
        process.exit(1);
      }
 
+     // construct filename for scan results
+     const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
+     const domain = new URL(url).hostname;
+     let resultFilename: string;
+     const randomThreeDigitNumber = randomThreeDigitNumberString();
+     resultFilename = `${date}_${time}_${domain}_${randomThreeDigitNumber}`;
+
      const statuses = constants.urlCheckStatuses;
-     const { browserToRun, clonedBrowserDataDir } = getBrowserToRun(BrowserTypes.CHROME);
+     const { browserToRun, clonedBrowserDataDir } = getBrowserToRun(BrowserTypes.CHROME, false, resultFilename);
 
      setHeadlessMode(browserToRun, answers.headless);
 
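The block added above derives a per-scan result filename of the form date_time_domain_random. A self-contained sketch of the same construction, assuming Node 18+; the three-digit suffix below is a local stand-in for the package's randomThreeDigitNumberString helper, not its actual implementation:

// Hedged sketch of the result-filename scheme introduced in questions.ts above.
function buildResultFilename(url: string): string {
  // The 'sv' locale formats dates as "YYYY-MM-DD HH:MM:SS".
  const [date, time] = new Date()
    .toLocaleString('sv')
    .replaceAll(/-|:/g, '')
    .split(' ');
  const domain = new URL(url).hostname;
  // Stand-in for randomThreeDigitNumberString() from utils.ts.
  const randomThreeDigits = String(Math.floor(100 + Math.random() * 900));
  return `${date}_${time}_${domain}_${randomThreeDigits}`;
}

// e.g. buildResultFilename('https://example.com') -> "20250214_093045_example.com_417"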
@@ -95,11 +103,11 @@ const startScanQuestions = [
        browserToRun,
        clonedBrowserDataDir,
        playwrightDeviceDetailsObject,
-       answers.scanner === ScannerTypes.CUSTOM,
        parseHeaders(answers.header),
      );
 
-     deleteClonedProfiles(browserToRun);
+     deleteClonedProfiles(browserToRun, resultFilename);
+
      switch (res.status) {
        case statuses.success.code:
          answers.finalUrl = res.url;
package/src/crawlers/commonCrawlerFunc.ts

@@ -21,6 +21,9 @@ import { getAxeConfiguration } from './custom/getAxeConfiguration.js';
 import { flagUnlabelledClickableElements } from './custom/flagUnlabelledClickableElements.js';
 import xPathToCss from './custom/xPathToCss.js';
 import type { Response as PlaywrightResponse } from 'playwright';
+import fs from 'fs';
+import { getStoragePath } from '../utils.js';
+import path from 'path';
 
 // types
 interface AxeResultsWithScreenshot extends AxeResults {
@@ -254,7 +257,7 @@ export const runAxeScript = async ({
    return new Promise(resolve => {
      let timeout: NodeJS.Timeout;
      let mutationCount = 0;
-     const MAX_MUTATIONS =
+     const MAX_MUTATIONS = 500;
      const MAX_SAME_MUTATION_LIMIT = 10;
      const mutationHash: Record<string, number> = {};
 
@@ -476,8 +479,11 @@ export const runAxeScript = async ({
 export const createCrawleeSubFolders = async (
   randomToken: string,
 ): Promise<{ dataset: crawlee.Dataset; requestQueue: crawlee.RequestQueue }> => {
-
-  const
+
+  const crawleeDir = path.join(getStoragePath(randomToken),"crawlee");
+
+  const dataset = await crawlee.Dataset.open(crawleeDir);
+  const requestQueue = await crawlee.RequestQueue.open(crawleeDir);
   return { dataset, requestQueue };
 };
 
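createCrawleeSubFolders now keys the Crawlee Dataset and RequestQueue to a per-scan path under getStoragePath(randomToken) instead of the default store. A minimal usage sketch of that isolation, assuming Crawlee resolves the key it is given to a folder under its storage directory; the key shown is illustrative:

import * as crawlee from 'crawlee';

// Hedged sketch: opening stores under a per-scan key keeps concurrent scans from
// sharing crawl state.
const openScanStores = async (scanKey: string) => {
  const dataset = await crawlee.Dataset.open(scanKey); // results for this scan only
  const requestQueue = await crawlee.RequestQueue.open(scanKey); // URLs queued for this scan only
  return { dataset, requestQueue };
};

// const { dataset, requestQueue } = await openScanStores('oobee-example-scan/crawlee');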
package/src/crawlers/crawlDomain.ts

@@ -27,9 +27,7 @@ import {
   isSkippedUrl,
   isDisallowedInRobotsTxt,
   getUrlsFromRobotsTxt,
-  urlWithoutAuth,
   waitForPageLoaded,
-  initModifiedUserAgent,
 } from '../constants/common.js';
 import { areLinksEqual, isFollowStrategy } from '../utils.js';
 import {
@@ -40,6 +38,8 @@ import {
 } from './pdfScanFunc.js';
 import { consoleLogger, guiInfoLog, silentLogger } from '../logs.js';
 import { ViewportSettingsClass } from '../combine.js';
+import * as path from 'path';
+import fsp from 'fs/promises';
 
 const isBlacklisted = (url: string, blacklistedPatterns: string[]) => {
   if (!blacklistedPatterns) {
@@ -115,10 +115,6 @@ const crawlDomain = async ({
 
   ({ requestQueue } = await createCrawleeSubFolders(randomToken));
 
-  if (!fs.existsSync(randomToken)) {
-    fs.mkdirSync(randomToken);
-  }
-
   const pdfDownloads: Promise<void>[] = [];
   const uuidToPdfMapping: Record<string, string> = {};
   const isScanHtml = ['all', 'html-only'].includes(fileTypes);
@@ -126,45 +122,11 @@ const crawlDomain = async ({
   const { maxConcurrency } = constants;
   const { playwrightDeviceDetailsObject } = viewportSettings;
 
-
-
-
-
-
-
-  // Test basic auth and add auth header if auth exist
-  const parsedUrl = new URL(url);
-  let username: string;
-  let password: string;
-  if (parsedUrl.username !== '' && parsedUrl.password !== '') {
-    isBasicAuth = true;
-    username = decodeURIComponent(parsedUrl.username);
-    password = decodeURIComponent(parsedUrl.password);
-
-    // Create auth header
-    authHeader = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
-
-    // Remove username from parsedUrl
-    parsedUrl.username = '';
-    parsedUrl.password = '';
-    // Send the finalUrl without credentials by setting auth header instead
-    const finalUrl = parsedUrl.toString();
-
-    await requestQueue.addRequest({
-      url: finalUrl,
-      skipNavigation: isUrlPdf(finalUrl),
-      headers: {
-        Authorization: authHeader,
-      },
-      label: finalUrl,
-    });
-  } else {
-    await requestQueue.addRequest({
-      url,
-      skipNavigation: isUrlPdf(url),
-      label: url,
-    });
-  }
+  await requestQueue.addRequest({
+    url,
+    skipNavigation: isUrlPdf(url),
+    label: url,
+  });
 
   const enqueueProcess = async (
     page: Page,
@@ -377,31 +339,40 @@ const crawlDomain = async ({
 
   let isAbortingScanNow = false;
 
-  let userDataDir = '';
-  if (userDataDirectory) {
-    userDataDir = process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '';
-  }
-
-  await initModifiedUserAgent(browser, playwrightDeviceDetailsObject);
-
   const crawler = new crawlee.PlaywrightCrawler({
     launchContext: {
       launcher: constants.launcher,
       launchOptions: getPlaywrightLaunchOptions(browser),
       // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
-      ...(process.env.CRAWLEE_HEADLESS === '
+      ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
     },
     retryOnBlocked: true,
     browserPoolOptions: {
      useFingerprints: false,
      preLaunchHooks: [
        async (_pageId, launchContext) => {
+         const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
+
+         // Ensure base exists
+         await fsp.mkdir(baseDir, { recursive: true });
+
+         // Create a unique subdir per browser
+         const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
+         await fsp.mkdir(subProfileDir, { recursive: true });
+
+         // Assign to Crawlee's launcher
+         launchContext.userDataDir = subProfileDir;
+
+         // Safely extend launchOptions
          launchContext.launchOptions = {
            ...launchContext.launchOptions,
-           bypassCSP: true,
            ignoreHTTPSErrors: true,
            ...playwrightDeviceDetailsObject,
+           ...(extraHTTPHeaders && { extraHTTPHeaders }),
          };
+
+         // Optionally log for debugging
+         // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
        },
      ],
    },
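The preLaunchHooks change above gives every browser the crawler launches its own profile folder under the scan's user-data directory, instead of all browsers sharing one profile. A standalone sketch of that idea under the same assumptions (the directory naming mirrors the hook above; nothing here is specific to Crawlee):

import * as path from 'path';
import fsp from 'fs/promises';

// Hedged sketch of the per-browser profile layout used by the hook above.
const makeSubProfileDir = async (baseDir: string): Promise<string> => {
  // Ensure the scan's base user-data directory exists.
  await fsp.mkdir(baseDir, { recursive: true });
  // One unique sub-profile per launched browser.
  const subProfileDir = path.join(
    baseDir,
    `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
  );
  await fsp.mkdir(subProfileDir, { recursive: true });
  return subProfileDir; // the hook assigns this to launchContext.userDataDir
};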
@@ -414,7 +385,7 @@ const crawlDomain = async ({
      return new Promise(resolve => {
        let timeout;
        let mutationCount = 0;
-       const MAX_MUTATIONS =
+       const MAX_MUTATIONS = 500; // stop if things never quiet down
        const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
 
        const observer = new MutationObserver(() => {
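This hunk, like the matching ones in commonCrawlerFunc.ts and crawlSitemap.ts, pins the previously unbounded MAX_MUTATIONS constant to 500 so that pages that never stop mutating cannot stall the page-stability wait. A hedged sketch of the bounded wait this constant governs, in the assumed shape of the in-page logic (the 1000 ms quiet window is an assumption, not taken from the package):

// Hedged sketch of a mutation-quiet wait bounded by MAX_MUTATIONS; runs in the page context.
const waitForDomQuiet = (): Promise<void> =>
  new Promise(resolve => {
    let mutationCount = 0;
    const MAX_MUTATIONS = 500; // stop if things never quiet down
    const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
    let quietTimer: ReturnType<typeof setTimeout> | undefined;

    const finish = (observer: MutationObserver) => {
      observer.disconnect();
      clearTimeout(quietTimer);
      resolve();
    };

    const observer = new MutationObserver(() => {
      if (++mutationCount > MAX_MUTATIONS) {
        finish(observer);
        return;
      }
      // Each burst of mutations pushes the quiet deadline back.
      clearTimeout(quietTimer);
      quietTimer = setTimeout(() => finish(observer), 1000);
    });

    observer.observe(document, { childList: true, subtree: true, attributes: true });
    setTimeout(() => finish(observer), OBSERVER_TIMEOUT); // overall cap regardless of activity
  });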
@@ -464,33 +435,10 @@ const crawlDomain = async ({
          }
        },
      ],
-     preNavigationHooks: [ async({ page, request}) => {
-       if (isBasicAuth) {
-         await page.setExtraHTTPHeaders({
-           Authorization: authHeader,
-           ...extraHTTPHeaders,
-         });
-       } else {
-         await page.setExtraHTTPHeaders({
-           ...extraHTTPHeaders,
-         });
-       }
-     }],
     requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds
     requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
       const browserContext: BrowserContext = page.context();
       try {
-        // Set basic auth header if needed
-        if (isBasicAuth) {
-          await page.setExtraHTTPHeaders({
-            Authorization: authHeader,
-          });
-          const currentUrl = new URL(request.url);
-          currentUrl.username = username;
-          currentUrl.password = password;
-          request.url = currentUrl.href;
-        }
-
        await waitForPageLoaded(page, 10000);
        let actualUrl = page.url() || request.loadedUrl || request.url;
 
@@ -652,13 +600,13 @@ const crawlDomain = async ({
          });
 
          urlsCrawled.scanned.push({
-           url:
+           url: request.url,
            pageTitle: results.pageTitle,
            actualUrl, // i.e. actualUrl
          });
 
          urlsCrawled.scannedRedirects.push({
-           fromUrl:
+           fromUrl: request.url,
            toUrl: actualUrl, // i.e. actualUrl
          });
 
@@ -671,10 +619,10 @@ const crawlDomain = async ({
          if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
            guiInfoLog(guiInfoStatusTypes.SCANNED, {
              numScanned: urlsCrawled.scanned.length,
-             urlScanned:
+             urlScanned: request.url,
            });
            urlsCrawled.scanned.push({
-             url:
+             url: request.url,
              actualUrl: request.url,
              pageTitle: results.pageTitle,
            });
@@ -695,7 +643,7 @@ const crawlDomain = async ({
          });
        }
 
-       if (followRobots) await getUrlsFromRobotsTxt(request.url, browser);
+       if (followRobots) await getUrlsFromRobotsTxt(request.url, browser, userDataDirectory, extraHTTPHeaders);
        await enqueueProcess(page, enqueueLinks, browserContext);
      } catch (e) {
        try {
package/src/crawlers/crawlIntelligentSitemap.ts

@@ -7,6 +7,7 @@ import crawlDomain from './crawlDomain.js';
 import crawlSitemap from './crawlSitemap.js';
 import { EnqueueStrategy } from 'crawlee';
 import { ViewportSettingsClass } from '../combine.js';
+import { getPlaywrightLaunchOptions } from '../constants/common.js';
 
 const crawlIntelligentSitemap = async (
   url: string,
@@ -36,9 +37,6 @@ const crawlIntelligentSitemap = async (
   let sitemapUrl;
 
   ({ dataset } = await createCrawleeSubFolders(randomToken));
-  if (!fs.existsSync(randomToken)) {
-    fs.mkdirSync(randomToken);
-  }
 
   function getHomeUrl(parsedUrl: string) {
     const urlObject = new URL(parsedUrl);
@@ -48,15 +46,21 @@ const crawlIntelligentSitemap = async (
    return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
  }
 
- async function findSitemap(link: string) {
+ async function findSitemap(link: string, userDataDirectory: string, extraHTTPHeaders: Record<string, string>) {
    const homeUrl = getHomeUrl(link);
    let sitemapLink = '';
-
-
-
-
+
+   const effectiveUserDataDirectory = process.env.CRAWLEE_HEADLESS === '1'
+     ? userDataDirectory
+     : '';
+   const context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
+     headless: process.env.CRAWLEE_HEADLESS === '1',
+     ...getPlaywrightLaunchOptions(browser),
+     ...(extraHTTPHeaders && { extraHTTPHeaders }),
    });
-
+
+   const page = await context.newPage();
+
    for (const path of sitemapPaths) {
      sitemapLink = homeUrl + path;
      if (await checkUrlExists(page, sitemapLink)) {
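findSitemap now probes the candidate sitemap paths from a page opened on a persistent Playwright context (reusing the scan's user-data directory only in headless mode). A minimal sketch of that probe loop; chromium stands in for constants.launcher, pageExists for the package's checkUrlExists helper, and the path list is illustrative:

import { chromium } from 'playwright';

// Hedged sketch of the sitemap probe shown above.
const probeSitemap = async (homeUrl: string, sitemapPaths: string[]): Promise<string> => {
  // Empty string = throwaway profile; pass the scan's userDataDirectory to reuse one.
  const context = await chromium.launchPersistentContext('', { headless: true });
  const page = await context.newPage();

  const pageExists = async (url: string): Promise<boolean> => {
    const response = await page.goto(url).catch(() => null);
    return !!response && response.ok();
  };

  let sitemapLink = '';
  for (const candidate of sitemapPaths) {
    if (await pageExists(homeUrl + candidate)) {
      sitemapLink = homeUrl + candidate;
      break;
    }
  }
  await page.close();
  await context.close().catch(() => {});
  return sitemapLink;
};

// e.g. await probeSitemap('https://example.com', ['/sitemap.xml', '/sitemap_index.xml']);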
@@ -64,7 +68,8 @@ const crawlIntelligentSitemap = async (
        break;
      }
    }
-   await
+   await page.close();
+   await context.close().catch(() => { });
    return sitemapExist ? sitemapLink : '';
  }
 
@@ -79,7 +84,7 @@ const crawlIntelligentSitemap = async (
  };
 
  try {
-   sitemapUrl = await findSitemap(url);
+   sitemapUrl = await findSitemap(url, userDataDirectory, extraHTTPHeaders);
  } catch (error) {
    consoleLogger.error(error);
  }
package/src/crawlers/crawlLocalFile.ts

@@ -13,7 +13,6 @@ import {
   isFilePath,
   convertLocalFileToPath,
   convertPathToLocalFile,
-  initModifiedUserAgent,
 } from '../constants/common.js';
 import { runPdfScan, mapPdfScanResults, doPdfScreenshots } from './pdfScanFunc.js';
 import { guiInfoLog } from '../logs.js';
|
@@ -74,9 +73,6 @@ export const crawlLocalFile = async ({
|
|
74
73
|
({ dataset } = await createCrawleeSubFolders(randomToken));
|
75
74
|
urlsCrawled = { ...constants.urlsCrawledObj };
|
76
75
|
|
77
|
-
if (!fs.existsSync(randomToken)) {
|
78
|
-
fs.mkdirSync(randomToken);
|
79
|
-
}
|
80
76
|
}
|
81
77
|
|
82
78
|
// Check if the sitemapUrl is a local file and if it exists
|
@@ -136,16 +132,6 @@ export const crawlLocalFile = async ({
    console.log(e);
  }
 
- if (basicAuthRegex.test(sitemapUrl)) {
-   isBasicAuth = true;
-   // request to basic auth URL to authenticate for browser session
-   finalLinks.push(new Request({ url: sitemapUrl, uniqueKey: `auth:${sitemapUrl}` }));
-   const finalUrl = `${sitemapUrl.split('://')[0]}://${sitemapUrl.split('@')[1]}`;
-   // obtain base URL without credentials so that subsequent URLs within the same domain can be scanned
-   finalLinks.push(new Request({ url: finalUrl }));
-   basicAuthPage = -2;
- }
-
  const uuidToPdfMapping: Record<string, string> = {}; // key and value of string type
 
  finalLinks = [...finalLinks, ...linksFromSitemap];
@@ -165,9 +151,12 @@ export const crawlLocalFile = async ({
  let shouldAbort = false;
 
  if (!isUrlPdf(request.url)) {
-
-
-
+   const effectiveUserDataDirectory = process.env.CRAWLEE_HEADLESS === '1'
+     ? userDataDirectory
+     : '';
+
+   const browserContext = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
+     headless: process.env.CRAWLEE_HEADLESS === '1',
      ...getPlaywrightLaunchOptions(browser),
      ...playwrightDeviceDetailsObject,
    });
package/src/crawlers/crawlSitemap.ts

@@ -17,15 +17,15 @@ import {
   getLinksFromSitemap,
   getPlaywrightLaunchOptions,
   isSkippedUrl,
-  urlWithoutAuth,
   waitForPageLoaded,
   isFilePath,
-  initModifiedUserAgent,
 } from '../constants/common.js';
 import { areLinksEqual, isWhitelistedContentType, isFollowStrategy } from '../utils.js';
 import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
 import { guiInfoLog } from '../logs.js';
 import { ViewportSettingsClass } from '../combine.js';
+import * as path from 'path';
+import fsp from 'fs/promises';
 
 const crawlSitemap = async ({
   sitemapUrl,
@@ -70,50 +70,19 @@ const crawlSitemap = async ({
  let dataset: crawlee.Dataset;
  let urlsCrawled: UrlsCrawled;
 
- // Boolean to omit axe scan for basic auth URL
- let isBasicAuth: boolean;
- let basicAuthPage = 0;
- let finalLinks = [];
- let authHeader = '';
-
  if (fromCrawlIntelligentSitemap) {
    dataset = datasetFromIntelligent;
    urlsCrawled = urlsCrawledFromIntelligent;
  } else {
    ({ dataset } = await createCrawleeSubFolders(randomToken));
    urlsCrawled = { ...constants.urlsCrawledObj };
-
-   if (!fs.existsSync(randomToken)) {
-     fs.mkdirSync(randomToken);
-   }
  }
 
- let parsedUrl;
- let username = '';
- let password = '';
-
  if (!crawledFromLocalFile && isFilePath(sitemapUrl)) {
    console.log('Local file crawling not supported for sitemap. Please provide a valid URL.');
    return;
  }
 
- if (isFilePath(sitemapUrl)) {
-   parsedUrl = sitemapUrl;
- } else {
-   parsedUrl = new URL(sitemapUrl);
-   if (parsedUrl.username !== '' && parsedUrl.password !== '') {
-     isBasicAuth = true;
-     username = decodeURIComponent(parsedUrl.username);
-     password = decodeURIComponent(parsedUrl.password);
-
-     // Create auth header
-     authHeader = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
-
-     parsedUrl.username = '';
-     parsedUrl.password = '';
-   }
- }
-
  const linksFromSitemap = await getLinksFromSitemap(
    sitemapUrl,
    maxRequestsPerCrawl,
@@ -121,29 +90,11 @@ const crawlSitemap = async ({
    userDataDirectory,
    userUrlInputFromIntelligent,
    fromCrawlIntelligentSitemap,
-
-   password,
+   extraHTTPHeaders,
  );
- /**
-  * Regex to match http://username:password@hostname.com
-  * utilised in scan strategy to ensure subsequent URLs within the same domain are scanned.
-  * First time scan with original `url` containing credentials is strictly to authenticate for browser session
-  * subsequent URLs are without credentials.
-  * basicAuthPage is set to -1 for basic auth URL to ensure it is not counted towards maxRequestsPerCrawl
-  */
 
  sitemapUrl = encodeURI(sitemapUrl);
 
- if (isBasicAuth) {
-   // request to basic auth URL to authenticate for browser session
-   finalLinks.push(new Request({ url: sitemapUrl, uniqueKey: `auth:${sitemapUrl}` }));
-   const finalUrl = `${sitemapUrl.split('://')[0]}://${sitemapUrl.split('@')[1]}`;
-
-   // obtain base URL without credentials so that subsequent URLs within the same domain can be scanned
-   finalLinks.push(new Request({ url: finalUrl }));
-   basicAuthPage = -2;
- }
-
  const pdfDownloads: Promise<void>[] = [];
  const uuidToPdfMapping: Record<string, string> = {};
  const isScanHtml = ['all', 'html-only'].includes(fileTypes);
@@ -151,36 +102,43 @@ const crawlSitemap = async ({
  const { playwrightDeviceDetailsObject } = viewportSettings;
  const { maxConcurrency } = constants;
 
- finalLinks = [...finalLinks, ...linksFromSitemap];
-
  const requestList = await RequestList.open({
-   sources:
+   sources: linksFromSitemap,
  });
 
- let userDataDir = '';
- if (userDataDirectory) {
-   userDataDir = process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '';
- }
-
- await initModifiedUserAgent(browser, playwrightDeviceDetailsObject);
  const crawler = new crawlee.PlaywrightCrawler({
    launchContext: {
      launcher: constants.launcher,
      launchOptions: getPlaywrightLaunchOptions(browser),
      // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
-     ...(process.env.CRAWLEE_HEADLESS === '
+     ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
    },
    retryOnBlocked: true,
    browserPoolOptions: {
      useFingerprints: false,
      preLaunchHooks: [
-       async (_pageId
+       async (_pageId, launchContext) => {
+         const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
+
+         // Ensure base exists
+         await fsp.mkdir(baseDir, { recursive: true });
+
+         // Create a unique subdir per browser
+         const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
+         await fsp.mkdir(subProfileDir, { recursive: true });
+
+         // Assign to Crawlee's launcher
+         launchContext.userDataDir = subProfileDir;
+
+         // Safely extend launchOptions
          launchContext.launchOptions = {
            ...launchContext.launchOptions,
-           bypassCSP: true,
            ignoreHTTPSErrors: true,
            ...playwrightDeviceDetailsObject,
          };
+
+         // Optionally log for debugging
+         // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
        },
      ],
    },
@@ -193,7 +151,7 @@ const crawlSitemap = async ({
      return new Promise(resolve => {
        let timeout;
        let mutationCount = 0;
-       const MAX_MUTATIONS =
+       const MAX_MUTATIONS = 500; // stop if things never quiet down
        const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
 
        const observer = new MutationObserver(() => {
@@ -252,15 +210,7 @@ const crawlSitemap = async ({
          return;
        }
 
-
-       if (isBasicAuth) {
-         await page.setExtraHTTPHeaders({
-           Authorization: authHeader,
-           ...extraHTTPHeaders,
-         });
-       } else {
-         preNavigationHooks(extraHTTPHeaders);
-       }
+       preNavigationHooks(extraHTTPHeaders);
      },
    ],
    requestHandlerTimeoutSecs: 90,
@@ -282,17 +232,6 @@ const crawlSitemap = async ({
        return;
      }
 
-     // Set basic auth header if needed
-     if (isBasicAuth) {
-       await page.setExtraHTTPHeaders({
-         Authorization: authHeader,
-       });
-       const currentUrl = new URL(request.url);
-       currentUrl.username = username;
-       currentUrl.password = password;
-       request.url = currentUrl.href;
-     }
-
      await waitForPageLoaded(page, 10000);
 
      const actualUrl = page.url() || request.loadedUrl || request.url;
@@ -341,9 +280,7 @@ const crawlSitemap = async ({
      const contentType = response?.headers?.()['content-type'] || '';
      const status = response ? response.status() : 0;
 
-     if (
-       basicAuthPage += 1;
-     } else if (isScanHtml && status < 300 && isWhitelistedContentType(contentType)) {
+     if (isScanHtml && status < 300 && isWhitelistedContentType(contentType)) {
        const isRedirected = !areLinksEqual(page.url(), request.url);
        const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
          item => (item.actualUrl || item.url) === page.url(),
@@ -382,13 +319,13 @@ const crawlSitemap = async ({
        });
 
        urlsCrawled.scanned.push({
-         url:
+         url: request.url,
          pageTitle: results.pageTitle,
          actualUrl, // i.e. actualUrl
        });
 
        urlsCrawled.scannedRedirects.push({
-         fromUrl:
+         fromUrl: request.url,
          toUrl: actualUrl,
        });
 
@@ -421,9 +358,6 @@ const crawlSitemap = async ({
      }
    },
    failedRequestHandler: async ({ request, response, error }) => {
-     if (isBasicAuth && request.url) {
-       request.url = `${request.url.split('://')[0]}://${request.url.split('@')[1]}`;
-     }
 
      // check if scanned pages have reached limit due to multi-instances of handler running
      if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
package/src/crawlers/custom/utils.ts

@@ -6,7 +6,7 @@ import path from 'path';
 import { runAxeScript } from '../commonCrawlerFunc.js';
 import { consoleLogger, guiInfoLog, silentLogger } from '../../logs.js';
 import { guiInfoStatusTypes } from '../../constants/constants.js';
-import { isSkippedUrl
+import { isSkippedUrl } from '../../constants/common.js';
 
 //! For Cypress Test
 // env to check if Cypress test is running
@@ -77,8 +77,8 @@ export const screenshotFullPage = async (page, screenshotsDir: string, screensho
    window.scrollTo(0, 0);
  });
 
- consoleLogger.info(`Screenshot page at: ${
- silentLogger.info(`Screenshot page at: ${
+ consoleLogger.info(`Screenshot page at: ${page.url()}`);
+ silentLogger.info(`Screenshot page at: ${page.url()}`);
 
  await page.screenshot({
    timeout: 5000,
@@ -116,7 +116,7 @@ export const runAxeScan = async (
  await dataset.pushData(result);
 
  urlsCrawled.scanned.push({
-   url:
+   url: page.url(),
    pageTitle: result.pageTitle,
    pageImagePath: customFlowDetails.pageImagePath,
  });