@govtechsg/oobee 0.10.58 → 0.10.62
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/DETAILS.md +1 -1
- package/README.md +1 -0
- package/package.json +3 -2
- package/src/cli.ts +46 -99
- package/src/combine.ts +18 -6
- package/src/constants/cliFunctions.ts +5 -4
- package/src/constants/common.ts +207 -295
- package/src/constants/constants.ts +65 -32
- package/src/constants/questions.ts +11 -5
- package/src/crawlers/commonCrawlerFunc.ts +11 -5
- package/src/crawlers/crawlDomain.ts +34 -86
- package/src/crawlers/crawlIntelligentSitemap.ts +18 -11
- package/src/crawlers/crawlLocalFile.ts +9 -17
- package/src/crawlers/crawlSitemap.ts +30 -96
- package/src/crawlers/custom/utils.ts +5 -5
- package/src/crawlers/pdfScanFunc.ts +3 -2
- package/src/crawlers/runCustom.ts +4 -3
- package/src/index.ts +8 -9
- package/src/logs.ts +36 -11
- package/src/mergeAxeResults.ts +37 -31
- package/src/npmIndex.ts +4 -4
- package/src/screenshotFunc/htmlScreenshotFunc.ts +4 -4
- package/src/static/ejs/partials/scripts/utils.ejs +8 -11
- package/src/utils.ts +304 -15
@@ -5,10 +5,11 @@ import { globSync } from 'glob';
|
|
5
5
|
import which from 'which';
|
6
6
|
import os from 'os';
|
7
7
|
import { spawnSync, execSync } from 'child_process';
|
8
|
-
import { chromium } from 'playwright';
|
8
|
+
import { Browser, BrowserContext, chromium } from 'playwright';
|
9
9
|
import * as Sentry from '@sentry/node';
|
10
10
|
import { consoleLogger, silentLogger } from '../logs.js';
|
11
11
|
import { PageInfo } from '../mergeAxeResults.js';
|
12
|
+
import { PlaywrightCrawler } from 'crawlee';
|
12
13
|
|
13
14
|
const filename = fileURLToPath(import.meta.url);
|
14
15
|
const dirname = path.dirname(filename);
|
@@ -136,7 +137,7 @@ export const getDefaultChromiumDataDir = () => {
|
|
136
137
|
}
|
137
138
|
return null;
|
138
139
|
} catch (error) {
|
139
|
-
|
140
|
+
consoleLogger.error(`Error in getDefaultChromiumDataDir(): ${error}`);
|
140
141
|
}
|
141
142
|
};
|
142
143
|
|
@@ -227,45 +228,68 @@ if (fs.existsSync('/.dockerenv')) {
|
|
227
228
|
launchOptionsArgs = ['--disable-gpu', '--no-sandbox', '--disable-dev-shm-usage'];
|
228
229
|
}
|
229
230
|
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
)
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
silentLogger.error(e.toString());
|
231
|
+
type ProxyInfo = { type: 'autoConfig' | 'manualProxy'; url: string } | null;
|
232
|
+
|
233
|
+
/**
 * Parses the textual output of `reg query` into a value-name → data map.
 *
 * Value lines have the shape `<name>  REG_<TYPE>  <data>`. The REG_* type token is used as
 * the anchor so that names containing spaces and data containing runs of whitespace are
 * preserved verbatim. Lines without a REG_* token (blank lines, key headers) are ignored.
 */
function parseRegQueryOutput(out: string): Record<string, string> {
  const values: Record<string, string> = {};
  for (const line of out.split(/\r?\n/)) {
    const match = /^\s*(.+?)\s+REG_[A-Z_]+(?:\s+(.*))?$/.exec(line);
    if (match) {
      const [, name, data = ''] = match;
      values[name] = data.trim();
    }
  }
  return values;
}

/**
 * Runs `reg query` for a registry key and returns its values as a name → data map.
 *
 * Best-effort by design: if the command cannot be run or exits non-zero (for example on a
 * non-Windows host or when the key does not exist), an empty object is returned instead of
 * throwing.
 *
 * @param key Registry key path. It is interpolated into a double-quoted shell argument, so it
 *            must not contain double quotes; callers should pass trusted, constant paths only.
 * @returns Map of value name to its raw data string; the value type is discarded.
 */
function queryRegKey(key: string): Record<string, string> {
  try {
    // stderr is piped (not inherited) so a failing query does not print to the user's console.
    const out = execSync(`reg query "${key}"`, { encoding: 'utf8', stdio: ['ignore', 'pipe', 'pipe'] });
    return parseRegQueryOutput(out);
  } catch {
    return {};
  }
}
|
244
249
|
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
250
|
+
/**
 * Converts a registry DWORD, as rendered in text form, into a number.
 *
 * Accepts plain decimal digits ("1", "0") and 0x-prefixed hexadecimal ("0x1", "0x0"),
 * case-insensitively and ignoring surrounding whitespace.
 *
 * @param v Raw value; it is stringified before parsing.
 * @returns The parsed number, or 0 for null/undefined, empty, or unrecognised input.
 */
function parseDwordFlag(v: unknown): number {
  if (v == null) return 0;

  const s = String(v).trim();
  if (/^0x[0-9a-f]+$/i.test(s)) return parseInt(s, 16);
  if (/^\d+$/.test(s)) return parseInt(s, 10);

  // Anything else (empty string, signed numbers, free text) is treated as "flag not set".
  return 0;
}
|
251
258
|
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
+
/**
 * Normalises a PAC (proxy auto-config) location into a full URL.
 *
 * @param u Raw PAC location, possibly padded with whitespace and possibly missing a scheme.
 * @returns The trimmed input when it already starts with `http:`, `https:` or `file:`
 *          (case-insensitive); otherwise the trimmed input prefixed with `http://`.
 *          Empty or whitespace-only input yields an empty string rather than a bare `http://`.
 */
function normalizePacUrl(u: string): string {
  const s = u.trim();
  if (s === '') return s;
  // If it lacks a scheme, assume http:// (Chrome requires a full URL)
  return /^(https?|file):/i.test(s) ? s : `http://${s}`;
}
|
264
|
+
|
265
|
+
/**
 * Detects the current user's proxy configuration from the Windows registry.
 *
 * Reads `AutoConfigURL`, `ProxyEnable` and `ProxyServer` from the per-user
 * "Internet Settings" key and resolves them in this order:
 *   1. A non-empty `AutoConfigURL` wins and is returned as an `autoConfig` entry
 *      (normalised to a full URL).
 *   2. Otherwise, a non-empty `ProxyServer` is returned as a `manualProxy` entry,
 *      but only when `ProxyEnable` parses to exactly 1.
 *
 * @returns The detected proxy, or null on non-Windows platforms and when nothing applicable
 *          is configured (including when the registry cannot be read).
 */
export const getProxy = (): ProxyInfo => {
  if (os.platform() !== 'win32') return null;

  const values = queryRegKey('HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Internet Settings');
  const pacUrlRaw = (values['AutoConfigURL'] ?? '').trim();
  const proxyEnableRaw = (values['ProxyEnable'] ?? '').trim();
  const proxyServerRaw = (values['ProxyServer'] ?? '').trim();

  // 1) PAC beats manual proxy if present
  if (pacUrlRaw) {
    return { type: 'autoConfig', url: normalizePacUrl(pacUrlRaw) };
  }

  // 2) Manual proxy only if enabled
  const enabled = parseDwordFlag(proxyEnableRaw) === 1;
  if (enabled && proxyServerRaw) {
    return { type: 'manualProxy', url: proxyServerRaw };
  }

  return null;
};
|
263
286
|
|
287
|
+
// Resolve the proxy once when this module is loaded and pass it to the browser
// through the matching command-line switch appended to launchOptionsArgs.
export const proxy = getProxy();

if (proxy?.type === 'autoConfig') {
  launchOptionsArgs.push(`--proxy-pac-url=${proxy.url}`);
} else if (proxy?.type === 'manualProxy') {
  launchOptionsArgs.push(`--proxy-server=${proxy.url}`);
}
|
271
295
|
|
@@ -405,6 +429,7 @@ const urlCheckStatuses = {
|
|
405
429
|
},
|
406
430
|
axiosTimeout: { code: 18, message: 'Axios timeout exceeded. Falling back on browser checks.' },
|
407
431
|
notALocalFile: { code: 19, message: 'Provided filepath is not a local html or sitemap file.' },
|
432
|
+
terminationRequested: { code: 15, message: 'Termination requested.' }
|
408
433
|
};
|
409
434
|
|
410
435
|
/* eslint-disable no-unused-vars */
|
@@ -452,7 +477,7 @@ const reserveFileNameKeywords = [
|
|
452
477
|
|
453
478
|
export default {
|
454
479
|
cliZipFileName: 'oobee-scan-results.zip',
|
455
|
-
exportDirectory:
|
480
|
+
exportDirectory: undefined,
|
456
481
|
maxRequestsPerCrawl,
|
457
482
|
maxConcurrency: 25,
|
458
483
|
urlsCrawledObj,
|
@@ -466,6 +491,14 @@ export default {
|
|
466
491
|
reserveFileNameKeywords,
|
467
492
|
wcagLinks,
|
468
493
|
robotsTxtUrls: null,
|
494
|
+
userDataDirectory: null, // This will be set later in the code
|
495
|
+
randomToken: null, // This will be set later in the code
|
496
|
+
// Track all active Crawlee / Playwright resources for cleanup
|
497
|
+
resources: {
|
498
|
+
crawlers: new Set<PlaywrightCrawler>(),
|
499
|
+
browserContexts: new Set<BrowserContext>(),
|
500
|
+
browsers: new Set<Browser>(),
|
501
|
+
},
|
469
502
|
};
|
470
503
|
|
471
504
|
export const rootPath = dirname;
|
@@ -1,6 +1,6 @@
|
|
1
1
|
import { Question } from 'inquirer';
|
2
2
|
import { Answers } from '../index.js';
|
3
|
-
import { getUserDataTxt, setHeadlessMode } from '../utils.js';
|
3
|
+
import { getUserDataTxt, randomThreeDigitNumberString, setHeadlessMode } from '../utils.js';
|
4
4
|
import {
|
5
5
|
checkUrl,
|
6
6
|
deleteClonedProfiles,
|
@@ -15,6 +15,7 @@ import {
|
|
15
15
|
parseHeaders,
|
16
16
|
} from './common.js';
|
17
17
|
import constants, { BrowserTypes, ScannerTypes } from './constants.js';
|
18
|
+
import { random } from 'lodash';
|
18
19
|
|
19
20
|
const userData = getUserDataTxt();
|
20
21
|
|
@@ -78,8 +79,15 @@ const startScanQuestions = [
|
|
78
79
|
process.exit(1);
|
79
80
|
}
|
80
81
|
|
82
|
+
// construct filename for scan results
|
83
|
+
const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
|
84
|
+
const domain = new URL(url).hostname;
|
85
|
+
let resultFilename: string;
|
86
|
+
const randomThreeDigitNumber = randomThreeDigitNumberString();
|
87
|
+
resultFilename = `${date}_${time}_${domain}_${randomThreeDigitNumber}`;
|
88
|
+
|
81
89
|
const statuses = constants.urlCheckStatuses;
|
82
|
-
const { browserToRun, clonedBrowserDataDir } = getBrowserToRun(BrowserTypes.CHROME);
|
90
|
+
const { browserToRun, clonedBrowserDataDir } = getBrowserToRun(BrowserTypes.CHROME, false, resultFilename);
|
83
91
|
|
84
92
|
setHeadlessMode(browserToRun, answers.headless);
|
85
93
|
|
@@ -95,11 +103,9 @@ const startScanQuestions = [
|
|
95
103
|
browserToRun,
|
96
104
|
clonedBrowserDataDir,
|
97
105
|
playwrightDeviceDetailsObject,
|
98
|
-
answers.scanner === ScannerTypes.CUSTOM,
|
99
106
|
parseHeaders(answers.header),
|
100
107
|
);
|
101
|
-
|
102
|
-
deleteClonedProfiles(browserToRun);
|
108
|
+
|
103
109
|
switch (res.status) {
|
104
110
|
case statuses.success.code:
|
105
111
|
answers.finalUrl = res.url;
|
@@ -21,6 +21,9 @@ import { getAxeConfiguration } from './custom/getAxeConfiguration.js';
|
|
21
21
|
import { flagUnlabelledClickableElements } from './custom/flagUnlabelledClickableElements.js';
|
22
22
|
import xPathToCss from './custom/xPathToCss.js';
|
23
23
|
import type { Response as PlaywrightResponse } from 'playwright';
|
24
|
+
import fs from 'fs';
|
25
|
+
import { getStoragePath } from '../utils.js';
|
26
|
+
import path from 'path';
|
24
27
|
|
25
28
|
// types
|
26
29
|
interface AxeResultsWithScreenshot extends AxeResults {
|
@@ -254,7 +257,7 @@ export const runAxeScript = async ({
|
|
254
257
|
return new Promise(resolve => {
|
255
258
|
let timeout: NodeJS.Timeout;
|
256
259
|
let mutationCount = 0;
|
257
|
-
const MAX_MUTATIONS =
|
260
|
+
const MAX_MUTATIONS = 500;
|
258
261
|
const MAX_SAME_MUTATION_LIMIT = 10;
|
259
262
|
const mutationHash: Record<string, number> = {};
|
260
263
|
|
@@ -315,9 +318,9 @@ export const runAxeScript = async ({
|
|
315
318
|
page.on('console', msg => {
|
316
319
|
const type = msg.type();
|
317
320
|
if (type === 'error') {
|
318
|
-
|
321
|
+
consoleLogger.log({ level: 'error', message: msg.text() });
|
319
322
|
} else {
|
320
|
-
|
323
|
+
consoleLogger.log({ level: 'info', message: msg.text() });
|
321
324
|
}
|
322
325
|
});
|
323
326
|
*/
|
@@ -476,8 +479,11 @@ export const runAxeScript = async ({
|
|
476
479
|
/**
 * Opens the Crawlee dataset and request queue used by a scan.
 *
 * Both storages are opened with the same identifier: the `crawlee` entry joined onto
 * whatever `getStoragePath(randomToken)` returns.
 * NOTE(review): presumably that is the per-scan storage directory — confirm against
 * `getStoragePath` in utils.
 *
 * @param randomToken Scan identifier forwarded to `getStoragePath`.
 * @returns The opened dataset and request queue.
 */
export const createCrawleeSubFolders = async (
  randomToken: string,
): Promise<{ dataset: crawlee.Dataset; requestQueue: crawlee.RequestQueue }> => {
  const crawleeDir = path.join(getStoragePath(randomToken), 'crawlee');

  // Opened one after the other, in the same order as before.
  const dataset = await crawlee.Dataset.open(crawleeDir);
  const requestQueue = await crawlee.RequestQueue.open(crawleeDir);
  return { dataset, requestQueue };
};
|
483
489
|
|
@@ -27,11 +27,9 @@ import {
|
|
27
27
|
isSkippedUrl,
|
28
28
|
isDisallowedInRobotsTxt,
|
29
29
|
getUrlsFromRobotsTxt,
|
30
|
-
urlWithoutAuth,
|
31
30
|
waitForPageLoaded,
|
32
|
-
initModifiedUserAgent,
|
33
31
|
} from '../constants/common.js';
|
34
|
-
import { areLinksEqual, isFollowStrategy } from '../utils.js';
|
32
|
+
import { areLinksEqual, isFollowStrategy, register } from '../utils.js';
|
35
33
|
import {
|
36
34
|
handlePdfDownload,
|
37
35
|
runPdfScan,
|
@@ -40,6 +38,8 @@ import {
|
|
40
38
|
} from './pdfScanFunc.js';
|
41
39
|
import { consoleLogger, guiInfoLog, silentLogger } from '../logs.js';
|
42
40
|
import { ViewportSettingsClass } from '../combine.js';
|
41
|
+
import * as path from 'path';
|
42
|
+
import fsp from 'fs/promises';
|
43
43
|
|
44
44
|
const isBlacklisted = (url: string, blacklistedPatterns: string[]) => {
|
45
45
|
if (!blacklistedPatterns) {
|
@@ -115,10 +115,6 @@ const crawlDomain = async ({
|
|
115
115
|
|
116
116
|
({ requestQueue } = await createCrawleeSubFolders(randomToken));
|
117
117
|
|
118
|
-
if (!fs.existsSync(randomToken)) {
|
119
|
-
fs.mkdirSync(randomToken);
|
120
|
-
}
|
121
|
-
|
122
118
|
const pdfDownloads: Promise<void>[] = [];
|
123
119
|
const uuidToPdfMapping: Record<string, string> = {};
|
124
120
|
const isScanHtml = ['all', 'html-only'].includes(fileTypes);
|
@@ -126,45 +122,11 @@ const crawlDomain = async ({
|
|
126
122
|
const { maxConcurrency } = constants;
|
127
123
|
const { playwrightDeviceDetailsObject } = viewportSettings;
|
128
124
|
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
// Test basic auth and add auth header if auth exist
|
136
|
-
const parsedUrl = new URL(url);
|
137
|
-
let username: string;
|
138
|
-
let password: string;
|
139
|
-
if (parsedUrl.username !== '' && parsedUrl.password !== '') {
|
140
|
-
isBasicAuth = true;
|
141
|
-
username = decodeURIComponent(parsedUrl.username);
|
142
|
-
password = decodeURIComponent(parsedUrl.password);
|
143
|
-
|
144
|
-
// Create auth header
|
145
|
-
authHeader = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
|
146
|
-
|
147
|
-
// Remove username from parsedUrl
|
148
|
-
parsedUrl.username = '';
|
149
|
-
parsedUrl.password = '';
|
150
|
-
// Send the finalUrl without credentials by setting auth header instead
|
151
|
-
const finalUrl = parsedUrl.toString();
|
152
|
-
|
153
|
-
await requestQueue.addRequest({
|
154
|
-
url: finalUrl,
|
155
|
-
skipNavigation: isUrlPdf(finalUrl),
|
156
|
-
headers: {
|
157
|
-
Authorization: authHeader,
|
158
|
-
},
|
159
|
-
label: finalUrl,
|
160
|
-
});
|
161
|
-
} else {
|
162
|
-
await requestQueue.addRequest({
|
163
|
-
url,
|
164
|
-
skipNavigation: isUrlPdf(url),
|
165
|
-
label: url,
|
166
|
-
});
|
167
|
-
}
|
125
|
+
await requestQueue.addRequest({
|
126
|
+
url,
|
127
|
+
skipNavigation: isUrlPdf(url),
|
128
|
+
label: url,
|
129
|
+
});
|
168
130
|
|
169
131
|
const enqueueProcess = async (
|
170
132
|
page: Page,
|
@@ -377,31 +339,40 @@ const crawlDomain = async ({
|
|
377
339
|
|
378
340
|
let isAbortingScanNow = false;
|
379
341
|
|
380
|
-
|
381
|
-
if (userDataDirectory) {
|
382
|
-
userDataDir = process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '';
|
383
|
-
}
|
384
|
-
|
385
|
-
await initModifiedUserAgent(browser, playwrightDeviceDetailsObject);
|
386
|
-
|
387
|
-
const crawler = new crawlee.PlaywrightCrawler({
|
342
|
+
const crawler = register(new crawlee.PlaywrightCrawler({
|
388
343
|
launchContext: {
|
389
344
|
launcher: constants.launcher,
|
390
345
|
launchOptions: getPlaywrightLaunchOptions(browser),
|
391
346
|
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
392
|
-
...(process.env.CRAWLEE_HEADLESS === '
|
347
|
+
...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
|
393
348
|
},
|
394
349
|
retryOnBlocked: true,
|
395
350
|
browserPoolOptions: {
|
396
351
|
useFingerprints: false,
|
397
352
|
preLaunchHooks: [
|
398
353
|
async (_pageId, launchContext) => {
|
354
|
+
const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
|
355
|
+
|
356
|
+
// Ensure base exists
|
357
|
+
await fsp.mkdir(baseDir, { recursive: true });
|
358
|
+
|
359
|
+
// Create a unique subdir per browser
|
360
|
+
const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
|
361
|
+
await fsp.mkdir(subProfileDir, { recursive: true });
|
362
|
+
|
363
|
+
// Assign to Crawlee's launcher
|
364
|
+
launchContext.userDataDir = subProfileDir;
|
365
|
+
|
366
|
+
// Safely extend launchOptions
|
399
367
|
launchContext.launchOptions = {
|
400
368
|
...launchContext.launchOptions,
|
401
|
-
bypassCSP: true,
|
402
369
|
ignoreHTTPSErrors: true,
|
403
370
|
...playwrightDeviceDetailsObject,
|
371
|
+
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
404
372
|
};
|
373
|
+
|
374
|
+
// Optionally log for debugging
|
375
|
+
// console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
|
405
376
|
},
|
406
377
|
],
|
407
378
|
},
|
@@ -414,7 +385,7 @@ const crawlDomain = async ({
|
|
414
385
|
return new Promise(resolve => {
|
415
386
|
let timeout;
|
416
387
|
let mutationCount = 0;
|
417
|
-
const MAX_MUTATIONS =
|
388
|
+
const MAX_MUTATIONS = 500; // stop if things never quiet down
|
418
389
|
const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
|
419
390
|
|
420
391
|
const observer = new MutationObserver(() => {
|
@@ -464,33 +435,10 @@ const crawlDomain = async ({
|
|
464
435
|
}
|
465
436
|
},
|
466
437
|
],
|
467
|
-
preNavigationHooks: [ async({ page, request}) => {
|
468
|
-
if (isBasicAuth) {
|
469
|
-
await page.setExtraHTTPHeaders({
|
470
|
-
Authorization: authHeader,
|
471
|
-
...extraHTTPHeaders,
|
472
|
-
});
|
473
|
-
} else {
|
474
|
-
await page.setExtraHTTPHeaders({
|
475
|
-
...extraHTTPHeaders,
|
476
|
-
});
|
477
|
-
}
|
478
|
-
}],
|
479
438
|
requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds
|
480
439
|
requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
|
481
440
|
const browserContext: BrowserContext = page.context();
|
482
441
|
try {
|
483
|
-
// Set basic auth header if needed
|
484
|
-
if (isBasicAuth) {
|
485
|
-
await page.setExtraHTTPHeaders({
|
486
|
-
Authorization: authHeader,
|
487
|
-
});
|
488
|
-
const currentUrl = new URL(request.url);
|
489
|
-
currentUrl.username = username;
|
490
|
-
currentUrl.password = password;
|
491
|
-
request.url = currentUrl.href;
|
492
|
-
}
|
493
|
-
|
494
442
|
await waitForPageLoaded(page, 10000);
|
495
443
|
let actualUrl = page.url() || request.loadedUrl || request.url;
|
496
444
|
|
@@ -652,13 +600,13 @@ const crawlDomain = async ({
|
|
652
600
|
});
|
653
601
|
|
654
602
|
urlsCrawled.scanned.push({
|
655
|
-
url:
|
603
|
+
url: request.url,
|
656
604
|
pageTitle: results.pageTitle,
|
657
605
|
actualUrl, // i.e. actualUrl
|
658
606
|
});
|
659
607
|
|
660
608
|
urlsCrawled.scannedRedirects.push({
|
661
|
-
fromUrl:
|
609
|
+
fromUrl: request.url,
|
662
610
|
toUrl: actualUrl, // i.e. actualUrl
|
663
611
|
});
|
664
612
|
|
@@ -671,10 +619,10 @@ const crawlDomain = async ({
|
|
671
619
|
if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
|
672
620
|
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
673
621
|
numScanned: urlsCrawled.scanned.length,
|
674
|
-
urlScanned:
|
622
|
+
urlScanned: request.url,
|
675
623
|
});
|
676
624
|
urlsCrawled.scanned.push({
|
677
|
-
url:
|
625
|
+
url: request.url,
|
678
626
|
actualUrl: request.url,
|
679
627
|
pageTitle: results.pageTitle,
|
680
628
|
});
|
@@ -695,7 +643,7 @@ const crawlDomain = async ({
|
|
695
643
|
});
|
696
644
|
}
|
697
645
|
|
698
|
-
if (followRobots) await getUrlsFromRobotsTxt(request.url, browser);
|
646
|
+
if (followRobots) await getUrlsFromRobotsTxt(request.url, browser, userDataDirectory, extraHTTPHeaders);
|
699
647
|
await enqueueProcess(page, enqueueLinks, browserContext);
|
700
648
|
} catch (e) {
|
701
649
|
try {
|
@@ -775,7 +723,7 @@ const crawlDomain = async ({
|
|
775
723
|
scaleDownStepRatio: 0.1, // Scale down slower
|
776
724
|
},
|
777
725
|
}),
|
778
|
-
});
|
726
|
+
}));
|
779
727
|
|
780
728
|
await crawler.run();
|
781
729
|
|
@@ -7,6 +7,8 @@ import crawlDomain from './crawlDomain.js';
|
|
7
7
|
import crawlSitemap from './crawlSitemap.js';
|
8
8
|
import { EnqueueStrategy } from 'crawlee';
|
9
9
|
import { ViewportSettingsClass } from '../combine.js';
|
10
|
+
import { getPlaywrightLaunchOptions } from '../constants/common.js';
|
11
|
+
import { register } from '../utils.js';
|
10
12
|
|
11
13
|
const crawlIntelligentSitemap = async (
|
12
14
|
url: string,
|
@@ -36,9 +38,6 @@ const crawlIntelligentSitemap = async (
|
|
36
38
|
let sitemapUrl;
|
37
39
|
|
38
40
|
({ dataset } = await createCrawleeSubFolders(randomToken));
|
39
|
-
if (!fs.existsSync(randomToken)) {
|
40
|
-
fs.mkdirSync(randomToken);
|
41
|
-
}
|
42
41
|
|
43
42
|
function getHomeUrl(parsedUrl: string) {
|
44
43
|
const urlObject = new URL(parsedUrl);
|
@@ -48,15 +47,22 @@ const crawlIntelligentSitemap = async (
|
|
48
47
|
return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
|
49
48
|
}
|
50
49
|
|
51
|
-
async function findSitemap(link: string) {
|
50
|
+
async function findSitemap(link: string, userDataDirectory: string, extraHTTPHeaders: Record<string, string>) {
|
52
51
|
const homeUrl = getHomeUrl(link);
|
53
52
|
let sitemapLink = '';
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
53
|
+
|
54
|
+
const effectiveUserDataDirectory = process.env.CRAWLEE_HEADLESS === '1'
|
55
|
+
? userDataDirectory
|
56
|
+
: '';
|
57
|
+
const context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
|
58
|
+
headless: process.env.CRAWLEE_HEADLESS === '1',
|
59
|
+
...getPlaywrightLaunchOptions(browser),
|
60
|
+
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
58
61
|
});
|
59
|
-
|
62
|
+
register(context);
|
63
|
+
|
64
|
+
const page = await context.newPage();
|
65
|
+
|
60
66
|
for (const path of sitemapPaths) {
|
61
67
|
sitemapLink = homeUrl + path;
|
62
68
|
if (await checkUrlExists(page, sitemapLink)) {
|
@@ -64,7 +70,8 @@ const crawlIntelligentSitemap = async (
|
|
64
70
|
break;
|
65
71
|
}
|
66
72
|
}
|
67
|
-
await
|
73
|
+
await page.close();
|
74
|
+
await context.close().catch(() => { });
|
68
75
|
return sitemapExist ? sitemapLink : '';
|
69
76
|
}
|
70
77
|
|
@@ -79,7 +86,7 @@ const crawlIntelligentSitemap = async (
|
|
79
86
|
};
|
80
87
|
|
81
88
|
try {
|
82
|
-
sitemapUrl = await findSitemap(url);
|
89
|
+
sitemapUrl = await findSitemap(url, userDataDirectory, extraHTTPHeaders);
|
83
90
|
} catch (error) {
|
84
91
|
consoleLogger.error(error);
|
85
92
|
}
|
@@ -13,11 +13,11 @@ import {
|
|
13
13
|
isFilePath,
|
14
14
|
convertLocalFileToPath,
|
15
15
|
convertPathToLocalFile,
|
16
|
-
initModifiedUserAgent,
|
17
16
|
} from '../constants/common.js';
|
18
17
|
import { runPdfScan, mapPdfScanResults, doPdfScreenshots } from './pdfScanFunc.js';
|
19
18
|
import { guiInfoLog } from '../logs.js';
|
20
19
|
import crawlSitemap from './crawlSitemap.js';
|
20
|
+
import { register } from '../utils.js';
|
21
21
|
|
22
22
|
export const crawlLocalFile = async ({
|
23
23
|
url,
|
@@ -74,9 +74,6 @@ export const crawlLocalFile = async ({
|
|
74
74
|
({ dataset } = await createCrawleeSubFolders(randomToken));
|
75
75
|
urlsCrawled = { ...constants.urlsCrawledObj };
|
76
76
|
|
77
|
-
if (!fs.existsSync(randomToken)) {
|
78
|
-
fs.mkdirSync(randomToken);
|
79
|
-
}
|
80
77
|
}
|
81
78
|
|
82
79
|
// Check if the sitemapUrl is a local file and if it exists
|
@@ -136,16 +133,6 @@ export const crawlLocalFile = async ({
|
|
136
133
|
console.log(e);
|
137
134
|
}
|
138
135
|
|
139
|
-
if (basicAuthRegex.test(sitemapUrl)) {
|
140
|
-
isBasicAuth = true;
|
141
|
-
// request to basic auth URL to authenticate for browser session
|
142
|
-
finalLinks.push(new Request({ url: sitemapUrl, uniqueKey: `auth:${sitemapUrl}` }));
|
143
|
-
const finalUrl = `${sitemapUrl.split('://')[0]}://${sitemapUrl.split('@')[1]}`;
|
144
|
-
// obtain base URL without credentials so that subsequent URLs within the same domain can be scanned
|
145
|
-
finalLinks.push(new Request({ url: finalUrl }));
|
146
|
-
basicAuthPage = -2;
|
147
|
-
}
|
148
|
-
|
149
136
|
const uuidToPdfMapping: Record<string, string> = {}; // key and value of string type
|
150
137
|
|
151
138
|
finalLinks = [...finalLinks, ...linksFromSitemap];
|
@@ -165,13 +152,18 @@ export const crawlLocalFile = async ({
|
|
165
152
|
let shouldAbort = false;
|
166
153
|
|
167
154
|
if (!isUrlPdf(request.url)) {
|
168
|
-
|
169
|
-
|
170
|
-
|
155
|
+
const effectiveUserDataDirectory = process.env.CRAWLEE_HEADLESS === '1'
|
156
|
+
? userDataDirectory
|
157
|
+
: '';
|
158
|
+
|
159
|
+
const browserContext = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
|
160
|
+
headless: process.env.CRAWLEE_HEADLESS === '1',
|
171
161
|
...getPlaywrightLaunchOptions(browser),
|
172
162
|
...playwrightDeviceDetailsObject,
|
173
163
|
});
|
174
164
|
|
165
|
+
register(browserContext);
|
166
|
+
|
175
167
|
const timeoutId = scanDuration > 0
|
176
168
|
? setTimeout(() => {
|
177
169
|
console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting local file scan.`);
|