@govtechsg/oobee 0.10.58 → 0.10.62
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/DETAILS.md +1 -1
- package/README.md +1 -0
- package/package.json +3 -2
- package/src/cli.ts +46 -99
- package/src/combine.ts +18 -6
- package/src/constants/cliFunctions.ts +5 -4
- package/src/constants/common.ts +207 -295
- package/src/constants/constants.ts +65 -32
- package/src/constants/questions.ts +11 -5
- package/src/crawlers/commonCrawlerFunc.ts +11 -5
- package/src/crawlers/crawlDomain.ts +34 -86
- package/src/crawlers/crawlIntelligentSitemap.ts +18 -11
- package/src/crawlers/crawlLocalFile.ts +9 -17
- package/src/crawlers/crawlSitemap.ts +30 -96
- package/src/crawlers/custom/utils.ts +5 -5
- package/src/crawlers/pdfScanFunc.ts +3 -2
- package/src/crawlers/runCustom.ts +4 -3
- package/src/index.ts +8 -9
- package/src/logs.ts +36 -11
- package/src/mergeAxeResults.ts +37 -31
- package/src/npmIndex.ts +4 -4
- package/src/screenshotFunc/htmlScreenshotFunc.ts +4 -4
- package/src/static/ejs/partials/scripts/utils.ejs +8 -11
- package/src/utils.ts +304 -15
package/src/constants/common.ts
CHANGED
@@ -31,7 +31,7 @@ import constants, {
|
|
31
31
|
} from './constants.js';
|
32
32
|
import { consoleLogger, silentLogger } from '../logs.js';
|
33
33
|
import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
|
34
|
-
import { randomThreeDigitNumberString } from '../utils.js';
|
34
|
+
import { cleanUpAndExit, randomThreeDigitNumberString, register } from '../utils.js';
|
35
35
|
import { Answers, Data } from '../index.js';
|
36
36
|
import { DeviceDescriptor } from '../types/types.js';
|
37
37
|
|
@@ -119,7 +119,7 @@ export const validateFilePath = (filePath: string, cliDir: string) => {
|
|
119
119
|
|
120
120
|
return absolutePath;
|
121
121
|
} catch {
|
122
|
-
throw new Error(`Please ensure path provided exists: ${absolutePath}`);
|
122
|
+
throw new Error(`Please ensure path provided exists and writable: ${absolutePath}`);
|
123
123
|
}
|
124
124
|
};
|
125
125
|
|
@@ -277,110 +277,11 @@ export const sanitizeUrlInput = (url: string): { isValid: boolean; url: string }
|
|
277
277
|
return { isValid: false, url: sanitizeUrl };
|
278
278
|
};
|
279
279
|
|
280
|
-
const requestToUrl = async (
|
281
|
-
url: string,
|
282
|
-
isCustomFlow: boolean,
|
283
|
-
extraHTTPHeaders: Record<string, string>,
|
284
|
-
) => {
|
285
|
-
// User-Agent is modified to emulate a browser to handle cases where some sites ban non browser agents, resulting in a 403 error
|
286
|
-
const res = new RES();
|
287
|
-
const parsedUrl = new URL(url);
|
288
|
-
await axios
|
289
|
-
.get(parsedUrl.href, {
|
290
|
-
headers: {
|
291
|
-
...extraHTTPHeaders,
|
292
|
-
'User-Agent': devices['Desktop Chrome HiDPI'].userAgent,
|
293
|
-
Host: parsedUrl.host,
|
294
|
-
},
|
295
|
-
auth: {
|
296
|
-
username: decodeURIComponent(parsedUrl.username),
|
297
|
-
password: decodeURIComponent(parsedUrl.password),
|
298
|
-
},
|
299
|
-
httpsAgent,
|
300
|
-
timeout: 5000,
|
301
|
-
})
|
302
|
-
.then(async response => {
|
303
|
-
let redirectUrl = response.request.res.responseUrl;
|
304
|
-
redirectUrl = new URL(redirectUrl).href;
|
305
|
-
res.status = constants.urlCheckStatuses.success.code;
|
306
|
-
let data;
|
307
|
-
if (typeof response.data === 'string' || response.data instanceof String) {
|
308
|
-
data = response.data;
|
309
|
-
} else if (typeof response.data === 'object' && response.data !== null) {
|
310
|
-
try {
|
311
|
-
data = JSON.stringify(response.data);
|
312
|
-
} catch (error) {
|
313
|
-
console.log('Error converting object to JSON:', error);
|
314
|
-
}
|
315
|
-
} else {
|
316
|
-
console.log('Unsupported data type:', typeof response.data);
|
317
|
-
}
|
318
|
-
const modifiedHTML = data.replace(/<noscript>[\s\S]*?<\/noscript>/gi, '');
|
319
|
-
|
320
|
-
const metaRefreshMatch =
|
321
|
-
/<meta\s+http-equiv="refresh"\s+content="(?:\d+;)?\s*url=(?:'([^']*)'|"([^"]*)"|([^>]*))"/i.exec(
|
322
|
-
modifiedHTML,
|
323
|
-
);
|
324
|
-
|
325
|
-
const hasMetaRefresh = metaRefreshMatch && metaRefreshMatch.length > 1;
|
326
|
-
|
327
|
-
if (redirectUrl != null && (hasMetaRefresh || !isCustomFlow)) {
|
328
|
-
res.url = redirectUrl;
|
329
|
-
} else {
|
330
|
-
res.url = url;
|
331
|
-
}
|
332
|
-
|
333
|
-
if (hasMetaRefresh) {
|
334
|
-
let urlOrRelativePath;
|
335
|
-
|
336
|
-
for (let i = 1; i < metaRefreshMatch.length; i++) {
|
337
|
-
if (metaRefreshMatch[i] !== undefined && metaRefreshMatch[i] !== null) {
|
338
|
-
urlOrRelativePath = metaRefreshMatch[i];
|
339
|
-
break; // Stop the loop once the first non-null value is found
|
340
|
-
}
|
341
|
-
}
|
342
|
-
|
343
|
-
if (urlOrRelativePath.includes('URL=')) {
|
344
|
-
res.url = urlOrRelativePath.split('URL=').pop();
|
345
|
-
} else {
|
346
|
-
const pathname = res.url.substring(0, res.url.lastIndexOf('/'));
|
347
|
-
res.url = new URL(urlOrRelativePath, pathname).toString();
|
348
|
-
}
|
349
|
-
}
|
350
|
-
|
351
|
-
res.content = response.data;
|
352
|
-
})
|
353
|
-
.catch(async error => {
|
354
|
-
if (error.code === 'ECONNABORTED' || error.code === 'ERR_FR_TOO_MANY_REDIRECTS') {
|
355
|
-
res.status = constants.urlCheckStatuses.axiosTimeout.code;
|
356
|
-
} else if (error.response) {
|
357
|
-
if (error.response.status === 401) {
|
358
|
-
// enters here if URL is protected by basic auth
|
359
|
-
res.status = constants.urlCheckStatuses.unauthorised.code;
|
360
|
-
} else {
|
361
|
-
// enters here if server responds with a status other than 2xx
|
362
|
-
// the scan should still proceed even if error codes are received, so that accessibility scans for error pages can be done too
|
363
|
-
res.status = constants.urlCheckStatuses.success.code;
|
364
|
-
}
|
365
|
-
res.url = url;
|
366
|
-
res.content = error.response.data;
|
367
|
-
return res;
|
368
|
-
} else if (error.request) {
|
369
|
-
// enters here if URL cannot be accessed
|
370
|
-
res.status = constants.urlCheckStatuses.cannotBeResolved.code;
|
371
|
-
} else {
|
372
|
-
res.status = constants.urlCheckStatuses.systemError.code;
|
373
|
-
}
|
374
|
-
});
|
375
|
-
return res;
|
376
|
-
};
|
377
|
-
|
378
280
|
const checkUrlConnectivityWithBrowser = async (
|
379
281
|
url: string,
|
380
282
|
browserToRun: string,
|
381
283
|
clonedDataDir: string,
|
382
284
|
playwrightDeviceDetailsObject: DeviceDescriptor,
|
383
|
-
isCustomFlow: boolean,
|
384
285
|
extraHTTPHeaders: Record<string, string>,
|
385
286
|
) => {
|
386
287
|
const res = new RES();
|
@@ -391,28 +292,21 @@ const checkUrlConnectivityWithBrowser = async (
|
|
391
292
|
return res;
|
392
293
|
}
|
393
294
|
|
394
|
-
let viewport = null;
|
395
|
-
let userAgent = null;
|
396
|
-
if ('viewport' in playwrightDeviceDetailsObject) viewport = playwrightDeviceDetailsObject.viewport;
|
397
|
-
if ('userAgent' in playwrightDeviceDetailsObject) userAgent = playwrightDeviceDetailsObject.userAgent;
|
398
|
-
|
399
295
|
// Ensure Accept header for non-html content fallback
|
400
296
|
extraHTTPHeaders['Accept'] ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
|
401
297
|
|
402
|
-
|
403
|
-
const browserContextLaunchOptions = {
|
404
|
-
...launchOptions,
|
405
|
-
args: [...launchOptions.args, '--headless=new'],
|
406
|
-
};
|
298
|
+
await initModifiedUserAgent(browserToRun, playwrightDeviceDetailsObject, clonedDataDir);
|
407
299
|
|
408
300
|
let browserContext;
|
409
301
|
try {
|
410
302
|
browserContext = await constants.launcher.launchPersistentContext(clonedDataDir, {
|
411
|
-
...browserContextLaunchOptions,
|
412
|
-
...(viewport && { viewport }),
|
413
|
-
...(userAgent && { userAgent }),
|
414
303
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
304
|
+
ignoreHTTPSErrors: true,
|
305
|
+
...getPlaywrightLaunchOptions(browserToRun),
|
306
|
+
...playwrightDeviceDetailsObject,
|
415
307
|
});
|
308
|
+
|
309
|
+
register(browserContext);
|
416
310
|
} catch (err) {
|
417
311
|
printMessage([`Unable to launch browser\n${err}`], messageOptions);
|
418
312
|
res.status = constants.urlCheckStatuses.browserError.code;
|
@@ -422,48 +316,77 @@ const checkUrlConnectivityWithBrowser = async (
|
|
422
316
|
try {
|
423
317
|
const page = await browserContext.newPage();
|
424
318
|
|
425
|
-
//
|
426
|
-
|
427
|
-
|
319
|
+
// STEP 1: HEAD request before actual navigation
|
320
|
+
let statusCode = 0;
|
321
|
+
let contentType = '';
|
322
|
+
let disposition = '';
|
323
|
+
|
324
|
+
try {
|
325
|
+
const headResp = await page.request.fetch(url, {
|
326
|
+
method: 'HEAD',
|
327
|
+
headers: extraHTTPHeaders,
|
328
|
+
});
|
329
|
+
|
330
|
+
statusCode = headResp.status();
|
331
|
+
contentType = headResp.headers()['content-type'] || '';
|
332
|
+
disposition = headResp.headers()['content-disposition'] || '';
|
333
|
+
|
334
|
+
// If it looks like a downloadable file, skip goto entirely
|
335
|
+
if (
|
336
|
+
contentType.includes('pdf') ||
|
337
|
+
contentType.includes('octet-stream') ||
|
338
|
+
disposition.includes('attachment')
|
339
|
+
) {
|
340
|
+
res.status = statusCode === 401
|
341
|
+
? constants.urlCheckStatuses.unauthorised.code
|
342
|
+
: constants.urlCheckStatuses.success.code;
|
343
|
+
|
344
|
+
res.httpStatus = statusCode;
|
345
|
+
res.url = url;
|
346
|
+
res.content = ''; // Don't try to render binary
|
347
|
+
|
348
|
+
await browserContext.close();
|
349
|
+
return res;
|
350
|
+
}
|
351
|
+
} catch (e) {
|
352
|
+
consoleLogger.info(`HEAD request failed: ${e.message}`);
|
353
|
+
res.status = constants.urlCheckStatuses.systemError.code;
|
354
|
+
await browserContext.close();
|
355
|
+
return res;
|
428
356
|
}
|
429
357
|
|
358
|
+
// STEP 2: Safe to proceed with navigation
|
430
359
|
const response = await page.goto(url, {
|
431
360
|
timeout: 30000,
|
432
|
-
|
361
|
+
waitUntil: 'commit', // Don't wait for full load
|
433
362
|
});
|
434
363
|
|
435
|
-
|
436
|
-
|
437
|
-
} catch {
|
438
|
-
consoleLogger.info('Unable to detect networkidle');
|
439
|
-
}
|
440
|
-
|
441
|
-
const status = response.status();
|
442
|
-
res.status = status === 401
|
364
|
+
const finalStatus = statusCode || (response?.status?.() ?? 0);
|
365
|
+
res.status = finalStatus === 401
|
443
366
|
? constants.urlCheckStatuses.unauthorised.code
|
444
367
|
: constants.urlCheckStatuses.success.code;
|
445
368
|
|
446
|
-
|
447
|
-
res.
|
369
|
+
res.httpStatus = finalStatus;
|
370
|
+
res.url = page.url();
|
448
371
|
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
// Check content type to determine how to extract content
|
453
|
-
const contentType = response.headers()['content-type'] || '';
|
454
|
-
|
455
|
-
if (contentType.includes('xml') || res.url.endsWith('.xml')) {
|
456
|
-
// Fetch raw content to avoid Playwright's HTML-wrapped <pre> behavior
|
457
|
-
const rawResponse = await requestToUrl(res.url, true, extraHTTPHeaders);
|
458
|
-
res.content = rawResponse.content;
|
372
|
+
contentType = response?.headers()?.['content-type'] || '';
|
373
|
+
if (contentType.includes('pdf') || contentType.includes('octet-stream')) {
|
374
|
+
res.content = ''; // Avoid triggering render/download
|
459
375
|
} else {
|
460
|
-
|
376
|
+
try {
|
377
|
+
await page.waitForLoadState('networkidle', { timeout: 10000 });
|
378
|
+
} catch {
|
379
|
+
consoleLogger.info('Unable to detect networkidle');
|
380
|
+
}
|
381
|
+
|
382
|
+
res.content = await page.content();
|
461
383
|
}
|
462
384
|
|
463
385
|
} catch (error) {
|
464
386
|
if (error.message.includes('net::ERR_INVALID_AUTH_CREDENTIALS')) {
|
465
387
|
res.status = constants.urlCheckStatuses.unauthorised.code;
|
466
388
|
} else {
|
389
|
+
console.log(error);
|
467
390
|
res.status = constants.urlCheckStatuses.systemError.code;
|
468
391
|
}
|
469
392
|
} finally {
|
@@ -501,7 +424,6 @@ export const checkUrl = async (
|
|
501
424
|
browser: string,
|
502
425
|
clonedDataDir: string,
|
503
426
|
playwrightDeviceDetailsObject: DeviceDescriptor,
|
504
|
-
isCustomFlow: boolean,
|
505
427
|
extraHTTPHeaders: Record<string, string>,
|
506
428
|
) => {
|
507
429
|
const res = await checkUrlConnectivityWithBrowser(
|
@@ -509,7 +431,6 @@ export const checkUrl = async (
|
|
509
431
|
browser,
|
510
432
|
clonedDataDir,
|
511
433
|
playwrightDeviceDetailsObject,
|
512
|
-
isCustomFlow,
|
513
434
|
extraHTTPHeaders,
|
514
435
|
);
|
515
436
|
|
@@ -544,7 +465,7 @@ export const parseHeaders = (header?: string): Record<string, string> => {
|
|
544
465
|
],
|
545
466
|
messageOptions,
|
546
467
|
);
|
547
|
-
|
468
|
+
cleanUpAndExit(1);
|
548
469
|
}
|
549
470
|
allHeaders[headerValuePair[0]] = headerValuePair[1]; // {"header": "value", "header2": "value2", ...}
|
550
471
|
});
|
@@ -555,18 +476,16 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
555
476
|
if (isEmptyObject(argv)) {
|
556
477
|
throw Error('No inputs should be provided');
|
557
478
|
}
|
558
|
-
|
479
|
+
let {
|
559
480
|
scanner,
|
560
481
|
headless,
|
561
482
|
url,
|
562
483
|
deviceChosen,
|
563
484
|
customDevice,
|
564
485
|
viewportWidth,
|
565
|
-
playwrightDeviceDetailsObject,
|
566
486
|
maxpages,
|
567
487
|
strategy,
|
568
488
|
isLocalFileScan,
|
569
|
-
finalUrl,
|
570
489
|
browserToRun,
|
571
490
|
nameEmail,
|
572
491
|
customFlowLabel,
|
@@ -578,32 +497,75 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
578
497
|
followRobots,
|
579
498
|
header,
|
580
499
|
safeMode,
|
500
|
+
exportDirectory,
|
581
501
|
zip,
|
582
502
|
ruleset,
|
583
503
|
generateJsonFiles,
|
584
504
|
scanDuration
|
585
505
|
} = argv;
|
586
506
|
|
507
|
+
const extraHTTPHeaders = parseHeaders(header);
|
508
|
+
|
509
|
+
// Set default username and password for basic auth
|
510
|
+
let username = '';
|
511
|
+
let password = '';
|
512
|
+
|
513
|
+
// Remove credentials from URL if not a local file scan
|
514
|
+
url = argv.isLocalFileScan
|
515
|
+
? url
|
516
|
+
: (() => {
|
517
|
+
const temp = new URL(url);
|
518
|
+
username = temp.username;
|
519
|
+
password = temp.password;
|
520
|
+
|
521
|
+
if (username !== '' || password !== '') {
|
522
|
+
extraHTTPHeaders['Authorization'] = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
|
523
|
+
}
|
524
|
+
|
525
|
+
temp.username = '';
|
526
|
+
temp.password = '';
|
527
|
+
return temp.toString();
|
528
|
+
})();
|
529
|
+
|
587
530
|
// construct filename for scan results
|
588
531
|
const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
|
589
532
|
const domain = argv.isLocalFileScan ? path.basename(argv.url) : new URL(argv.url).hostname;
|
533
|
+
|
590
534
|
const sanitisedLabel = customFlowLabel ? `_${customFlowLabel.replaceAll(' ', '_')}` : '';
|
591
535
|
let resultFilename: string;
|
592
536
|
const randomThreeDigitNumber = randomThreeDigitNumberString();
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
537
|
+
resultFilename = `${date}_${time}${sanitisedLabel}_${domain}_${randomThreeDigitNumber}`;
|
538
|
+
|
539
|
+
// Set exported directory
|
540
|
+
if (exportDirectory) {
|
541
|
+
constants.exportDirectory = path.join(exportDirectory, resultFilename);
|
597
542
|
}
|
543
|
+
|
544
|
+
// Creating the playwrightDeviceDetailObject
|
545
|
+
deviceChosen = customDevice === 'Desktop' || customDevice === 'Mobile' ? customDevice : deviceChosen;
|
546
|
+
|
547
|
+
const playwrightDeviceDetailsObject = getPlaywrightDeviceDetailsObject(
|
548
|
+
deviceChosen,
|
549
|
+
customDevice,
|
550
|
+
viewportWidth,
|
551
|
+
);
|
552
|
+
|
553
|
+
const { browserToRun: resolvedBrowser, clonedBrowserDataDir } = getBrowserToRun(browserToRun, true, resultFilename);
|
554
|
+
browserToRun = resolvedBrowser;
|
555
|
+
|
556
|
+
const resolvedUserDataDirectory = getClonedProfilesWithRandomToken(browserToRun, resultFilename);
|
598
557
|
|
599
558
|
if (followRobots) {
|
600
559
|
constants.robotsTxtUrls = {};
|
601
|
-
await getUrlsFromRobotsTxt(url, browserToRun);
|
560
|
+
await getUrlsFromRobotsTxt(url, browserToRun, resolvedUserDataDirectory, extraHTTPHeaders);
|
602
561
|
}
|
603
562
|
|
563
|
+
constants.userDataDirectory = resolvedUserDataDirectory;
|
564
|
+
constants.randomToken = resultFilename;
|
565
|
+
|
604
566
|
return {
|
605
567
|
type: scanner,
|
606
|
-
url:
|
568
|
+
url: url,
|
607
569
|
entryUrl: url,
|
608
570
|
isHeadless: headless,
|
609
571
|
deviceChosen,
|
@@ -624,8 +586,9 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
624
586
|
includeScreenshots: !(additional === 'none'),
|
625
587
|
metadata,
|
626
588
|
followRobots,
|
627
|
-
extraHTTPHeaders:
|
589
|
+
extraHTTPHeaders: extraHTTPHeaders,
|
628
590
|
safeMode,
|
591
|
+
userDataDirectory: resolvedUserDataDirectory,
|
629
592
|
zip,
|
630
593
|
ruleset,
|
631
594
|
generateJsonFiles,
|
@@ -633,7 +596,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
633
596
|
};
|
634
597
|
};
|
635
598
|
|
636
|
-
export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string): Promise<void> => {
|
599
|
+
export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string, userDataDirectory: string, extraHTTPHeaders: Record<string, string>): Promise<void> => {
|
637
600
|
if (!constants.robotsTxtUrls) return;
|
638
601
|
|
639
602
|
const domain = new URL(url).origin;
|
@@ -642,22 +605,18 @@ export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string): P
|
|
642
605
|
|
643
606
|
let robotsTxt: string;
|
644
607
|
try {
|
645
|
-
|
646
|
-
|
647
|
-
} else {
|
648
|
-
robotsTxt = await getRobotsTxtViaAxios(robotsUrl);
|
649
|
-
}
|
608
|
+
robotsTxt = await getRobotsTxtViaPlaywright(robotsUrl, browserToRun, userDataDirectory, extraHTTPHeaders);
|
609
|
+
consoleLogger.info(`Fetched robots.txt from ${robotsUrl}`);
|
650
610
|
} catch (e) {
|
651
611
|
// if robots.txt is not found, do nothing
|
612
|
+
consoleLogger.info(`Unable to fetch robots.txt from ${robotsUrl}`);
|
652
613
|
}
|
653
|
-
|
614
|
+
|
654
615
|
if (!robotsTxt) {
|
655
616
|
constants.robotsTxtUrls[domain] = {};
|
656
617
|
return;
|
657
618
|
}
|
658
|
-
|
659
|
-
console.log('Found robots.txt: ', robotsUrl);
|
660
|
-
|
619
|
+
|
661
620
|
const lines = robotsTxt.split(/\r?\n/);
|
662
621
|
let shouldCapture = false;
|
663
622
|
const disallowedUrls = [];
|
@@ -705,30 +664,32 @@ export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string): P
|
|
705
664
|
constants.robotsTxtUrls[domain] = { disallowedUrls, allowedUrls };
|
706
665
|
};
|
707
666
|
|
708
|
-
const getRobotsTxtViaPlaywright = async (robotsUrl: string, browser: string): Promise<string> => {
|
709
|
-
|
667
|
+
const getRobotsTxtViaPlaywright = async (robotsUrl: string, browser: string, userDataDirectory: string, extraHTTPHeaders: Record<string, string>): Promise<string> => {
|
668
|
+
|
669
|
+
let robotsDataDir = '';
|
670
|
+
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
671
|
+
if (process.env.CRAWLEE_HEADLESS === '1') {
|
672
|
+
// Create robots own user data directory else SingletonLock: File exists (17) with crawlDomain or crawlSitemap's own browser
|
673
|
+
const robotsDataDir = path.join(userDataDirectory, 'robots');
|
674
|
+
if (!fs.existsSync(robotsDataDir)) {
|
675
|
+
fs.mkdirSync(robotsDataDir, { recursive: true });
|
676
|
+
}
|
677
|
+
}
|
678
|
+
|
679
|
+
const browserContext = await constants.launcher.launchPersistentContext(robotsDataDir, {
|
710
680
|
...getPlaywrightLaunchOptions(browser),
|
681
|
+
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
711
682
|
});
|
712
683
|
|
684
|
+
register(browserContext);
|
685
|
+
|
713
686
|
const page = await browserContext.newPage();
|
714
|
-
await page.goto(robotsUrl, { waitUntil: 'networkidle', timeout: 30000 });
|
715
687
|
|
688
|
+
await page.goto(robotsUrl, { waitUntil: 'networkidle', timeout: 30000 });
|
716
689
|
const robotsTxt: string | null = await page.evaluate(() => document.body.textContent);
|
717
690
|
return robotsTxt;
|
718
691
|
};
|
719
692
|
|
720
|
-
const getRobotsTxtViaAxios = async (robotsUrl: string): Promise<string> => {
|
721
|
-
const instance = axios.create({
|
722
|
-
httpsAgent: new https.Agent({
|
723
|
-
rejectUnauthorized: false,
|
724
|
-
keepAlive: true,
|
725
|
-
}),
|
726
|
-
});
|
727
|
-
|
728
|
-
const robotsTxt = (await (await instance.get(robotsUrl, { timeout: 2000 })).data) as string;
|
729
|
-
return robotsTxt;
|
730
|
-
};
|
731
|
-
|
732
693
|
export const isDisallowedInRobotsTxt = (url: string): boolean => {
|
733
694
|
if (!constants.robotsTxtUrls) return;
|
734
695
|
|
@@ -760,8 +721,7 @@ export const getLinksFromSitemap = async (
|
|
760
721
|
userDataDirectory: string,
|
761
722
|
userUrlInput: string,
|
762
723
|
isIntelligent: boolean,
|
763
|
-
|
764
|
-
password: string,
|
724
|
+
extraHTTPHeaders: Record<string, string>,
|
765
725
|
) => {
|
766
726
|
const scannedSitemaps = new Set<string>();
|
767
727
|
const urls: Record<string, Request> = {}; // dictionary of requests to urls to be scanned
|
@@ -772,11 +732,6 @@ export const getLinksFromSitemap = async (
|
|
772
732
|
if (!url) return;
|
773
733
|
if (isDisallowedInRobotsTxt(url)) return;
|
774
734
|
|
775
|
-
// add basic auth credentials to the URL
|
776
|
-
username !== '' && password !== ''
|
777
|
-
? (url = addBasicAuthCredentials(url, username, password))
|
778
|
-
: url;
|
779
|
-
|
780
735
|
url = convertPathToLocalFile(url);
|
781
736
|
|
782
737
|
let request;
|
@@ -791,13 +746,6 @@ export const getLinksFromSitemap = async (
|
|
791
746
|
urls[url] = request;
|
792
747
|
};
|
793
748
|
|
794
|
-
const addBasicAuthCredentials = (url: string, username: string, password: string) => {
|
795
|
-
const urlObject = new URL(url);
|
796
|
-
urlObject.username = username;
|
797
|
-
urlObject.password = password;
|
798
|
-
return urlObject.toString();
|
799
|
-
};
|
800
|
-
|
801
749
|
const calculateCloseness = (sitemapUrl: string) => {
|
802
750
|
// Remove 'http://', 'https://', and 'www.' prefixes from the URLs
|
803
751
|
const normalizedSitemapUrl = sitemapUrl.replace(/^(https?:\/\/)?(www\.)?/, '');
|
@@ -868,16 +816,10 @@ export const getLinksFromSitemap = async (
|
|
868
816
|
finalUserDataDirectory = '';
|
869
817
|
}
|
870
818
|
|
871
|
-
const fetchUrls = async (url: string) => {
|
819
|
+
const fetchUrls = async (url: string, extraHTTPHeaders: Record<string, string>) => {
|
872
820
|
let data;
|
873
821
|
let sitemapType;
|
874
|
-
|
875
|
-
|
876
|
-
let username = '';
|
877
|
-
let password = '';
|
878
|
-
|
879
|
-
let parsedUrl;
|
880
|
-
|
822
|
+
|
881
823
|
if (scannedSitemaps.has(url)) {
|
882
824
|
// Skip processing if the sitemap has already been scanned
|
883
825
|
return;
|
@@ -893,17 +835,9 @@ export const getLinksFromSitemap = async (
|
|
893
835
|
if (!fs.existsSync(url)) {
|
894
836
|
return;
|
895
837
|
}
|
896
|
-
|
838
|
+
|
897
839
|
} else if (isValidHttpUrl(url)) {
|
898
|
-
|
899
|
-
|
900
|
-
if (parsedUrl.username !== '' && parsedUrl.password !== '') {
|
901
|
-
isBasicAuth = true;
|
902
|
-
username = decodeURIComponent(parsedUrl.username);
|
903
|
-
password = decodeURIComponent(parsedUrl.password);
|
904
|
-
parsedUrl.username = '';
|
905
|
-
parsedUrl.password = '';
|
906
|
-
}
|
840
|
+
// Do nothing, url is valid
|
907
841
|
} else {
|
908
842
|
printMessage([`Invalid Url/Filepath: ${url}`], messageOptions);
|
909
843
|
return;
|
@@ -915,12 +849,18 @@ export const getLinksFromSitemap = async (
|
|
915
849
|
{
|
916
850
|
...getPlaywrightLaunchOptions(browser),
|
917
851
|
// Not necessary to parse http_credentials as I am parsing it directly in URL
|
852
|
+
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
853
|
+
...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
|
854
|
+
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
918
855
|
},
|
919
856
|
);
|
920
857
|
|
858
|
+
register(browserContext);
|
921
859
|
const page = await browserContext.newPage();
|
860
|
+
|
922
861
|
await page.goto(url, { waitUntil: 'networkidle', timeout: 60000 });
|
923
|
-
|
862
|
+
|
863
|
+
if (await page.locator('body').count() > 0) {
|
924
864
|
data = await page.locator('body').innerText();
|
925
865
|
} else {
|
926
866
|
const urlSet = page.locator('urlset');
|
@@ -948,35 +888,14 @@ export const getLinksFromSitemap = async (
|
|
948
888
|
addToUrlList(url);
|
949
889
|
return;
|
950
890
|
}
|
951
|
-
|
952
|
-
|
953
|
-
|
954
|
-
try {
|
955
|
-
const instance = axios.create({
|
956
|
-
httpsAgent: new https.Agent({
|
957
|
-
rejectUnauthorized: false,
|
958
|
-
keepAlive: true,
|
959
|
-
}),
|
960
|
-
auth: {
|
961
|
-
username,
|
962
|
-
password,
|
963
|
-
},
|
964
|
-
});
|
965
|
-
try {
|
966
|
-
data = await (await instance.get(url, { timeout: 80000 })).data;
|
967
|
-
} catch {
|
968
|
-
return; // to skip the error
|
969
|
-
}
|
970
|
-
} catch (error) {
|
971
|
-
if (error.code === 'ECONNABORTED') {
|
972
|
-
await getDataUsingPlaywright();
|
973
|
-
}
|
974
|
-
}
|
975
|
-
}
|
891
|
+
|
892
|
+
await getDataUsingPlaywright();
|
893
|
+
|
976
894
|
} else {
|
977
895
|
url = convertLocalFileToPath(url);
|
978
896
|
data = fs.readFileSync(url, 'utf8');
|
979
897
|
}
|
898
|
+
|
980
899
|
const $ = cheerio.load(data, { xml: true });
|
981
900
|
|
982
901
|
// This case is when the document is not an XML format document
|
@@ -1012,7 +931,7 @@ export const getLinksFromSitemap = async (
|
|
1012
931
|
break;
|
1013
932
|
}
|
1014
933
|
if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) {
|
1015
|
-
await fetchUrls(childSitemapUrlText); // Recursive call for nested sitemaps
|
934
|
+
await fetchUrls(childSitemapUrlText, extraHTTPHeaders); // Recursive call for nested sitemaps
|
1016
935
|
} else {
|
1017
936
|
addToUrlList(childSitemapUrlText); // Add regular URLs to the list
|
1018
937
|
}
|
@@ -1037,7 +956,7 @@ export const getLinksFromSitemap = async (
|
|
1037
956
|
};
|
1038
957
|
|
1039
958
|
try {
|
1040
|
-
await fetchUrls(sitemapUrl);
|
959
|
+
await fetchUrls(sitemapUrl, extraHTTPHeaders);
|
1041
960
|
} catch (e) {
|
1042
961
|
consoleLogger.error(e);
|
1043
962
|
}
|
@@ -1086,20 +1005,26 @@ export const validName = (name: string) => {
|
|
1086
1005
|
* @returns object consisting of browser to run and cloned data directory
|
1087
1006
|
*/
|
1088
1007
|
export const getBrowserToRun = (
|
1089
|
-
preferredBrowser
|
1008
|
+
preferredBrowser?: BrowserTypes,
|
1090
1009
|
isCli = false,
|
1010
|
+
randomToken?: string
|
1091
1011
|
): { browserToRun: BrowserTypes; clonedBrowserDataDir: string } => {
|
1012
|
+
|
1013
|
+
if (!randomToken) {
|
1014
|
+
randomToken = '';
|
1015
|
+
}
|
1016
|
+
|
1092
1017
|
const platform = os.platform();
|
1093
1018
|
|
1094
1019
|
// Prioritise Chrome on Windows and Mac platforms if user does not specify a browser
|
1095
1020
|
if (!preferredBrowser && (os.platform() === 'win32' || os.platform() === 'darwin')) {
|
1096
1021
|
preferredBrowser = BrowserTypes.CHROME;
|
1022
|
+
} else {
|
1023
|
+
printMessage([`Preferred browser ${preferredBrowser}`], messageOptions);
|
1097
1024
|
}
|
1098
1025
|
|
1099
|
-
printMessage([`Preferred browser ${preferredBrowser}`], messageOptions);
|
1100
|
-
|
1101
1026
|
if (preferredBrowser === BrowserTypes.CHROME) {
|
1102
|
-
const chromeData = getChromeData();
|
1027
|
+
const chromeData = getChromeData(randomToken);
|
1103
1028
|
if (chromeData) return chromeData;
|
1104
1029
|
|
1105
1030
|
if (platform === 'darwin') {
|
@@ -1113,7 +1038,7 @@ export const getBrowserToRun = (
|
|
1113
1038
|
if (isCli)
|
1114
1039
|
printMessage(['Unable to use Chrome, falling back to Edge browser...'], messageOptions);
|
1115
1040
|
|
1116
|
-
const edgeData = getEdgeData();
|
1041
|
+
const edgeData = getEdgeData(randomToken);
|
1117
1042
|
if (edgeData) return edgeData;
|
1118
1043
|
|
1119
1044
|
if (isCli)
|
@@ -1125,12 +1050,12 @@ export const getBrowserToRun = (
|
|
1125
1050
|
printMessage(['Unable to use Chrome, falling back to Chromium browser...'], messageOptions);
|
1126
1051
|
}
|
1127
1052
|
} else if (preferredBrowser === BrowserTypes.EDGE) {
|
1128
|
-
const edgeData = getEdgeData();
|
1053
|
+
const edgeData = getEdgeData(randomToken);
|
1129
1054
|
if (edgeData) return edgeData;
|
1130
1055
|
|
1131
1056
|
if (isCli)
|
1132
1057
|
printMessage(['Unable to use Edge, falling back to Chrome browser...'], messageOptions);
|
1133
|
-
const chromeData = getChromeData();
|
1058
|
+
const chromeData = getChromeData(randomToken);
|
1134
1059
|
if (chromeData) return chromeData;
|
1135
1060
|
|
1136
1061
|
if (platform === 'darwin') {
|
@@ -1161,7 +1086,7 @@ export const getBrowserToRun = (
|
|
1161
1086
|
// defaults to chromium
|
1162
1087
|
return {
|
1163
1088
|
browserToRun: BrowserTypes.CHROMIUM,
|
1164
|
-
clonedBrowserDataDir: cloneChromiumProfiles(),
|
1089
|
+
clonedBrowserDataDir: cloneChromiumProfiles(randomToken),
|
1165
1090
|
};
|
1166
1091
|
};
|
1167
1092
|
|
@@ -1181,9 +1106,9 @@ export const getClonedProfilesWithRandomToken = (browser: string, randomToken: s
|
|
1181
1106
|
return cloneChromiumProfiles(randomToken);
|
1182
1107
|
};
|
1183
1108
|
|
1184
|
-
export const getChromeData = () => {
|
1109
|
+
export const getChromeData = (randomToken: string) => {
|
1185
1110
|
const browserDataDir = getDefaultChromeDataDir();
|
1186
|
-
const clonedBrowserDataDir = cloneChromeProfiles();
|
1111
|
+
const clonedBrowserDataDir = cloneChromeProfiles(randomToken);
|
1187
1112
|
if (browserDataDir && clonedBrowserDataDir) {
|
1188
1113
|
const browserToRun = BrowserTypes.CHROME;
|
1189
1114
|
return { browserToRun, clonedBrowserDataDir };
|
@@ -1191,9 +1116,9 @@ export const getChromeData = () => {
|
|
1191
1116
|
return null;
|
1192
1117
|
};
|
1193
1118
|
|
1194
|
-
export const getEdgeData = () => {
|
1119
|
+
export const getEdgeData = (randomToken: string) => {
|
1195
1120
|
const browserDataDir = getDefaultEdgeDataDir();
|
1196
|
-
const clonedBrowserDataDir = cloneEdgeProfiles();
|
1121
|
+
const clonedBrowserDataDir = cloneEdgeProfiles(randomToken);
|
1197
1122
|
if (browserDataDir && clonedBrowserDataDir) {
|
1198
1123
|
const browserToRun = BrowserTypes.EDGE;
|
1199
1124
|
return { browserToRun, clonedBrowserDataDir };
|
@@ -1397,7 +1322,7 @@ const cloneLocalStateFile = (options: GlobOptionsWithFileTypesFalse, destDir: st
|
|
1397
1322
|
* @param {string} randomToken - random token to append to the cloned directory
|
1398
1323
|
* @returns {string} cloned data directory, null if any of the sub files failed to copy
|
1399
1324
|
*/
|
1400
|
-
export const cloneChromeProfiles = (randomToken
|
1325
|
+
export const cloneChromeProfiles = (randomToken: string): string => {
|
1401
1326
|
const baseDir = getDefaultChromeDataDir();
|
1402
1327
|
|
1403
1328
|
if (!baseDir) {
|
@@ -1406,18 +1331,10 @@ export const cloneChromeProfiles = (randomToken?: string): string => {
|
|
1406
1331
|
|
1407
1332
|
let destDir;
|
1408
1333
|
|
1409
|
-
|
1410
|
-
destDir = path.join(baseDir, `oobee-${randomToken}`);
|
1411
|
-
} else {
|
1412
|
-
destDir = path.join(baseDir, 'oobee');
|
1413
|
-
}
|
1334
|
+
destDir = path.join(baseDir, `oobee-${randomToken}`);
|
1414
1335
|
|
1415
1336
|
if (fs.existsSync(destDir)) {
|
1416
|
-
if (process.env.OOBEE_VERBOSE) {
|
1417
1337
|
deleteClonedChromeProfiles(randomToken);
|
1418
|
-
} else {
|
1419
|
-
deleteClonedChromeProfiles();
|
1420
|
-
}
|
1421
1338
|
}
|
1422
1339
|
|
1423
1340
|
if (!fs.existsSync(destDir)) {
|
@@ -1435,10 +1352,13 @@ export const cloneChromeProfiles = (randomToken?: string): string => {
|
|
1435
1352
|
return destDir;
|
1436
1353
|
}
|
1437
1354
|
|
1438
|
-
|
1355
|
+
consoleLogger.error('Failed to clone Chrome profiles. You may be logged out of your accounts.');
|
1356
|
+
|
1357
|
+
// For future reference, return a null instead to halt the scan
|
1358
|
+
return destDir;
|
1439
1359
|
};
|
1440
1360
|
|
1441
|
-
export const cloneChromiumProfiles = (randomToken
|
1361
|
+
export const cloneChromiumProfiles = (randomToken: string): string => {
|
1442
1362
|
const baseDir = getDefaultChromiumDataDir();
|
1443
1363
|
|
1444
1364
|
if (!baseDir) {
|
@@ -1447,10 +1367,10 @@ export const cloneChromiumProfiles = (randomToken?: string): string => {
|
|
1447
1367
|
|
1448
1368
|
let destDir: string;
|
1449
1369
|
|
1450
|
-
|
1451
|
-
|
1452
|
-
|
1453
|
-
|
1370
|
+
destDir = path.join(baseDir, `oobee-${randomToken}`);
|
1371
|
+
|
1372
|
+
if (fs.existsSync(destDir)) {
|
1373
|
+
deleteClonedChromiumProfiles(randomToken);
|
1454
1374
|
}
|
1455
1375
|
|
1456
1376
|
if (!fs.existsSync(destDir)) {
|
@@ -1468,7 +1388,7 @@ export const cloneChromiumProfiles = (randomToken?: string): string => {
|
|
1468
1388
|
* @param {string} randomToken - random token to append to the cloned directory
|
1469
1389
|
* @returns {string} cloned data directory, null if any of the sub files failed to copy
|
1470
1390
|
*/
|
1471
|
-
export const cloneEdgeProfiles = (randomToken
|
1391
|
+
export const cloneEdgeProfiles = (randomToken: string): string => {
|
1472
1392
|
const baseDir = getDefaultEdgeDataDir();
|
1473
1393
|
|
1474
1394
|
if (!baseDir) {
|
@@ -1477,18 +1397,10 @@ export const cloneEdgeProfiles = (randomToken?: string): string => {
|
|
1477
1397
|
|
1478
1398
|
let destDir;
|
1479
1399
|
|
1480
|
-
|
1481
|
-
destDir = path.join(baseDir, `oobee-${randomToken}`);
|
1482
|
-
} else {
|
1483
|
-
destDir = path.join(baseDir, 'oobee');
|
1484
|
-
}
|
1400
|
+
destDir = path.join(baseDir, `oobee-${randomToken}`);
|
1485
1401
|
|
1486
1402
|
if (fs.existsSync(destDir)) {
|
1487
|
-
if (process.env.OOBEE_VERBOSE) {
|
1488
1403
|
deleteClonedEdgeProfiles(randomToken);
|
1489
|
-
} else {
|
1490
|
-
deleteClonedEdgeProfiles();
|
1491
|
-
}
|
1492
1404
|
}
|
1493
1405
|
|
1494
1406
|
if (!fs.existsSync(destDir)) {
|
@@ -1507,10 +1419,13 @@ export const cloneEdgeProfiles = (randomToken?: string): string => {
|
|
1507
1419
|
return destDir;
|
1508
1420
|
}
|
1509
1421
|
|
1510
|
-
|
1422
|
+
consoleLogger.error('Failed to clone Edge profiles. You may be logged out of your accounts.');
|
1423
|
+
|
1424
|
+
// For future reference, return a null instead to halt the scan
|
1425
|
+
return destDir;
|
1511
1426
|
};
|
1512
1427
|
|
1513
|
-
export const deleteClonedProfiles = (browser: string, randomToken
|
1428
|
+
export const deleteClonedProfiles = (browser: string, randomToken: string): void => {
|
1514
1429
|
if (browser === BrowserTypes.CHROME) {
|
1515
1430
|
deleteClonedChromeProfiles(randomToken);
|
1516
1431
|
} else if (browser === BrowserTypes.EDGE) {
|
@@ -1565,9 +1480,7 @@ export const deleteClonedChromeProfiles = (randomToken?: string): void => {
|
|
1565
1480
|
* @returns null
|
1566
1481
|
*/
|
1567
1482
|
export const deleteClonedEdgeProfiles = (randomToken?: string): void => {
|
1568
|
-
|
1569
|
-
return;
|
1570
|
-
}
|
1483
|
+
|
1571
1484
|
const baseDir = getDefaultEdgeDataDir();
|
1572
1485
|
|
1573
1486
|
if (!baseDir) {
|
@@ -1698,6 +1611,8 @@ export const submitFormViaPlaywright = async (
|
|
1698
1611
|
},
|
1699
1612
|
);
|
1700
1613
|
|
1614
|
+
register(browserContext);
|
1615
|
+
|
1701
1616
|
const page = await browserContext.newPage();
|
1702
1617
|
|
1703
1618
|
try {
|
@@ -1716,13 +1631,9 @@ export const submitFormViaPlaywright = async (
|
|
1716
1631
|
} finally {
|
1717
1632
|
await browserContext.close();
|
1718
1633
|
if (proxy && browserToRun === BrowserTypes.EDGE) {
|
1719
|
-
|
1720
|
-
deleteClonedEdgeProfiles();
|
1721
|
-
}
|
1634
|
+
deleteClonedEdgeProfiles(clonedDir);
|
1722
1635
|
} else if (proxy && browserToRun === BrowserTypes.CHROME) {
|
1723
|
-
|
1724
|
-
deleteClonedChromeProfiles();
|
1725
|
-
}
|
1636
|
+
deleteClonedChromeProfiles(clonedDir);
|
1726
1637
|
}
|
1727
1638
|
}
|
1728
1639
|
};
|
@@ -1781,7 +1692,9 @@ export const submitForm = async (
|
|
1781
1692
|
export async function initModifiedUserAgent(
|
1782
1693
|
browser?: string,
|
1783
1694
|
playwrightDeviceDetailsObject?: object,
|
1695
|
+
userDataDirectory?: string,
|
1784
1696
|
) {
|
1697
|
+
|
1785
1698
|
const isHeadless = process.env.CRAWLEE_HEADLESS === '1';
|
1786
1699
|
|
1787
1700
|
// If headless mode is enabled, ensure the headless flag is set.
|
@@ -1798,7 +1711,13 @@ export async function initModifiedUserAgent(
|
|
1798
1711
|
};
|
1799
1712
|
|
1800
1713
|
// Launch a temporary persistent context with an empty userDataDir to mimic your production browser setup.
|
1801
|
-
const
|
1714
|
+
const effectiveUserDataDirectory = process.env.CRAWLEE_HEADLESS === '1'
|
1715
|
+
? userDataDirectory
|
1716
|
+
: '';
|
1717
|
+
|
1718
|
+
const browserContext = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, launchOptions);
|
1719
|
+
register(browserContext);
|
1720
|
+
|
1802
1721
|
const page = await browserContext.newPage();
|
1803
1722
|
|
1804
1723
|
// Retrieve the default user agent.
|
@@ -1856,13 +1775,6 @@ export const getPlaywrightLaunchOptions = (browser?: string): LaunchOptions => {
|
|
1856
1775
|
return options;
|
1857
1776
|
};
|
1858
1777
|
|
1859
|
-
export const urlWithoutAuth = (url: string): string => {
|
1860
|
-
const parsedUrl = new URL(url);
|
1861
|
-
parsedUrl.username = '';
|
1862
|
-
parsedUrl.password = '';
|
1863
|
-
return parsedUrl.toString();
|
1864
|
-
};
|
1865
|
-
|
1866
1778
|
export const waitForPageLoaded = async (page: Page, timeout = 10000) => {
|
1867
1779
|
const OBSERVER_TIMEOUT = timeout; // Ensure observer timeout does not exceed the main timeout
|
1868
1780
|
|
@@ -1887,7 +1799,7 @@ export const waitForPageLoaded = async (page: Page, timeout = 10000) => {
|
|
1887
1799
|
|
1888
1800
|
let timeout: NodeJS.Timeout;
|
1889
1801
|
let mutationCount = 0;
|
1890
|
-
const MAX_MUTATIONS =
|
1802
|
+
const MAX_MUTATIONS = 500;
|
1891
1803
|
const mutationHash: Record<string, number> = {};
|
1892
1804
|
|
1893
1805
|
const observer = new MutationObserver(mutationsList => {
|