@govtechsg/oobee 0.10.58 → 0.10.61
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/DETAILS.md +1 -1
- package/package.json +1 -1
- package/src/cli.ts +17 -64
- package/src/combine.ts +18 -4
- package/src/constants/common.ts +193 -293
- package/src/constants/constants.ts +2 -1
- package/src/constants/questions.ts +12 -4
- package/src/crawlers/commonCrawlerFunc.ts +9 -3
- package/src/crawlers/crawlDomain.ts +31 -83
- package/src/crawlers/crawlIntelligentSitemap.ts +16 -11
- package/src/crawlers/crawlLocalFile.ts +6 -17
- package/src/crawlers/crawlSitemap.ts +27 -93
- package/src/crawlers/custom/utils.ts +4 -4
- package/src/index.ts +2 -5
- package/src/logs.ts +1 -2
- package/src/mergeAxeResults.ts +35 -30
- package/src/npmIndex.ts +4 -4
- package/src/utils.ts +56 -14
package/src/constants/common.ts
CHANGED
@@ -119,7 +119,7 @@ export const validateFilePath = (filePath: string, cliDir: string) => {
|
|
119
119
|
|
120
120
|
return absolutePath;
|
121
121
|
} catch {
|
122
|
-
throw new Error(`Please ensure path provided exists: ${absolutePath}`);
|
122
|
+
throw new Error(`Please ensure path provided exists and writable: ${absolutePath}`);
|
123
123
|
}
|
124
124
|
};
|
125
125
|
|
@@ -277,110 +277,11 @@ export const sanitizeUrlInput = (url: string): { isValid: boolean; url: string }
|
|
277
277
|
return { isValid: false, url: sanitizeUrl };
|
278
278
|
};
|
279
279
|
|
280
|
-
const requestToUrl = async (
|
281
|
-
url: string,
|
282
|
-
isCustomFlow: boolean,
|
283
|
-
extraHTTPHeaders: Record<string, string>,
|
284
|
-
) => {
|
285
|
-
// User-Agent is modified to emulate a browser to handle cases where some sites ban non browser agents, resulting in a 403 error
|
286
|
-
const res = new RES();
|
287
|
-
const parsedUrl = new URL(url);
|
288
|
-
await axios
|
289
|
-
.get(parsedUrl.href, {
|
290
|
-
headers: {
|
291
|
-
...extraHTTPHeaders,
|
292
|
-
'User-Agent': devices['Desktop Chrome HiDPI'].userAgent,
|
293
|
-
Host: parsedUrl.host,
|
294
|
-
},
|
295
|
-
auth: {
|
296
|
-
username: decodeURIComponent(parsedUrl.username),
|
297
|
-
password: decodeURIComponent(parsedUrl.password),
|
298
|
-
},
|
299
|
-
httpsAgent,
|
300
|
-
timeout: 5000,
|
301
|
-
})
|
302
|
-
.then(async response => {
|
303
|
-
let redirectUrl = response.request.res.responseUrl;
|
304
|
-
redirectUrl = new URL(redirectUrl).href;
|
305
|
-
res.status = constants.urlCheckStatuses.success.code;
|
306
|
-
let data;
|
307
|
-
if (typeof response.data === 'string' || response.data instanceof String) {
|
308
|
-
data = response.data;
|
309
|
-
} else if (typeof response.data === 'object' && response.data !== null) {
|
310
|
-
try {
|
311
|
-
data = JSON.stringify(response.data);
|
312
|
-
} catch (error) {
|
313
|
-
console.log('Error converting object to JSON:', error);
|
314
|
-
}
|
315
|
-
} else {
|
316
|
-
console.log('Unsupported data type:', typeof response.data);
|
317
|
-
}
|
318
|
-
const modifiedHTML = data.replace(/<noscript>[\s\S]*?<\/noscript>/gi, '');
|
319
|
-
|
320
|
-
const metaRefreshMatch =
|
321
|
-
/<meta\s+http-equiv="refresh"\s+content="(?:\d+;)?\s*url=(?:'([^']*)'|"([^"]*)"|([^>]*))"/i.exec(
|
322
|
-
modifiedHTML,
|
323
|
-
);
|
324
|
-
|
325
|
-
const hasMetaRefresh = metaRefreshMatch && metaRefreshMatch.length > 1;
|
326
|
-
|
327
|
-
if (redirectUrl != null && (hasMetaRefresh || !isCustomFlow)) {
|
328
|
-
res.url = redirectUrl;
|
329
|
-
} else {
|
330
|
-
res.url = url;
|
331
|
-
}
|
332
|
-
|
333
|
-
if (hasMetaRefresh) {
|
334
|
-
let urlOrRelativePath;
|
335
|
-
|
336
|
-
for (let i = 1; i < metaRefreshMatch.length; i++) {
|
337
|
-
if (metaRefreshMatch[i] !== undefined && metaRefreshMatch[i] !== null) {
|
338
|
-
urlOrRelativePath = metaRefreshMatch[i];
|
339
|
-
break; // Stop the loop once the first non-null value is found
|
340
|
-
}
|
341
|
-
}
|
342
|
-
|
343
|
-
if (urlOrRelativePath.includes('URL=')) {
|
344
|
-
res.url = urlOrRelativePath.split('URL=').pop();
|
345
|
-
} else {
|
346
|
-
const pathname = res.url.substring(0, res.url.lastIndexOf('/'));
|
347
|
-
res.url = new URL(urlOrRelativePath, pathname).toString();
|
348
|
-
}
|
349
|
-
}
|
350
|
-
|
351
|
-
res.content = response.data;
|
352
|
-
})
|
353
|
-
.catch(async error => {
|
354
|
-
if (error.code === 'ECONNABORTED' || error.code === 'ERR_FR_TOO_MANY_REDIRECTS') {
|
355
|
-
res.status = constants.urlCheckStatuses.axiosTimeout.code;
|
356
|
-
} else if (error.response) {
|
357
|
-
if (error.response.status === 401) {
|
358
|
-
// enters here if URL is protected by basic auth
|
359
|
-
res.status = constants.urlCheckStatuses.unauthorised.code;
|
360
|
-
} else {
|
361
|
-
// enters here if server responds with a status other than 2xx
|
362
|
-
// the scan should still proceed even if error codes are received, so that accessibility scans for error pages can be done too
|
363
|
-
res.status = constants.urlCheckStatuses.success.code;
|
364
|
-
}
|
365
|
-
res.url = url;
|
366
|
-
res.content = error.response.data;
|
367
|
-
return res;
|
368
|
-
} else if (error.request) {
|
369
|
-
// enters here if URL cannot be accessed
|
370
|
-
res.status = constants.urlCheckStatuses.cannotBeResolved.code;
|
371
|
-
} else {
|
372
|
-
res.status = constants.urlCheckStatuses.systemError.code;
|
373
|
-
}
|
374
|
-
});
|
375
|
-
return res;
|
376
|
-
};
|
377
|
-
|
378
280
|
const checkUrlConnectivityWithBrowser = async (
|
379
281
|
url: string,
|
380
282
|
browserToRun: string,
|
381
283
|
clonedDataDir: string,
|
382
284
|
playwrightDeviceDetailsObject: DeviceDescriptor,
|
383
|
-
isCustomFlow: boolean,
|
384
285
|
extraHTTPHeaders: Record<string, string>,
|
385
286
|
) => {
|
386
287
|
const res = new RES();
|
@@ -391,27 +292,18 @@ const checkUrlConnectivityWithBrowser = async (
|
|
391
292
|
return res;
|
392
293
|
}
|
393
294
|
|
394
|
-
let viewport = null;
|
395
|
-
let userAgent = null;
|
396
|
-
if ('viewport' in playwrightDeviceDetailsObject) viewport = playwrightDeviceDetailsObject.viewport;
|
397
|
-
if ('userAgent' in playwrightDeviceDetailsObject) userAgent = playwrightDeviceDetailsObject.userAgent;
|
398
|
-
|
399
295
|
// Ensure Accept header for non-html content fallback
|
400
296
|
extraHTTPHeaders['Accept'] ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
|
401
297
|
|
402
|
-
|
403
|
-
const browserContextLaunchOptions = {
|
404
|
-
...launchOptions,
|
405
|
-
args: [...launchOptions.args, '--headless=new'],
|
406
|
-
};
|
298
|
+
await initModifiedUserAgent(browserToRun, playwrightDeviceDetailsObject, clonedDataDir);
|
407
299
|
|
408
300
|
let browserContext;
|
409
301
|
try {
|
410
302
|
browserContext = await constants.launcher.launchPersistentContext(clonedDataDir, {
|
411
|
-
...browserContextLaunchOptions,
|
412
|
-
...(viewport && { viewport }),
|
413
|
-
...(userAgent && { userAgent }),
|
414
303
|
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
304
|
+
ignoreHTTPSErrors: true,
|
305
|
+
...getPlaywrightLaunchOptions(browserToRun),
|
306
|
+
...playwrightDeviceDetailsObject,
|
415
307
|
});
|
416
308
|
} catch (err) {
|
417
309
|
printMessage([`Unable to launch browser\n${err}`], messageOptions);
|
@@ -422,48 +314,77 @@ const checkUrlConnectivityWithBrowser = async (
|
|
422
314
|
try {
|
423
315
|
const page = await browserContext.newPage();
|
424
316
|
|
425
|
-
//
|
426
|
-
|
427
|
-
|
317
|
+
// STEP 1: HEAD request before actual navigation
|
318
|
+
let statusCode = 0;
|
319
|
+
let contentType = '';
|
320
|
+
let disposition = '';
|
321
|
+
|
322
|
+
try {
|
323
|
+
const headResp = await page.request.fetch(url, {
|
324
|
+
method: 'HEAD',
|
325
|
+
headers: extraHTTPHeaders,
|
326
|
+
});
|
327
|
+
|
328
|
+
statusCode = headResp.status();
|
329
|
+
contentType = headResp.headers()['content-type'] || '';
|
330
|
+
disposition = headResp.headers()['content-disposition'] || '';
|
331
|
+
|
332
|
+
// If it looks like a downloadable file, skip goto entirely
|
333
|
+
if (
|
334
|
+
contentType.includes('pdf') ||
|
335
|
+
contentType.includes('octet-stream') ||
|
336
|
+
disposition.includes('attachment')
|
337
|
+
) {
|
338
|
+
res.status = statusCode === 401
|
339
|
+
? constants.urlCheckStatuses.unauthorised.code
|
340
|
+
: constants.urlCheckStatuses.success.code;
|
341
|
+
|
342
|
+
res.httpStatus = statusCode;
|
343
|
+
res.url = url;
|
344
|
+
res.content = ''; // Don't try to render binary
|
345
|
+
|
346
|
+
await browserContext.close();
|
347
|
+
return res;
|
348
|
+
}
|
349
|
+
} catch (e) {
|
350
|
+
consoleLogger.info(`HEAD request failed: ${e.message}`);
|
351
|
+
res.status = constants.urlCheckStatuses.systemError.code;
|
352
|
+
await browserContext.close();
|
353
|
+
return res;
|
428
354
|
}
|
429
355
|
|
356
|
+
// STEP 2: Safe to proceed with navigation
|
430
357
|
const response = await page.goto(url, {
|
431
358
|
timeout: 30000,
|
432
|
-
|
359
|
+
waitUntil: 'commit', // Don't wait for full load
|
433
360
|
});
|
434
361
|
|
435
|
-
|
436
|
-
|
437
|
-
} catch {
|
438
|
-
consoleLogger.info('Unable to detect networkidle');
|
439
|
-
}
|
440
|
-
|
441
|
-
const status = response.status();
|
442
|
-
res.status = status === 401
|
362
|
+
const finalStatus = statusCode || (response?.status?.() ?? 0);
|
363
|
+
res.status = finalStatus === 401
|
443
364
|
? constants.urlCheckStatuses.unauthorised.code
|
444
365
|
: constants.urlCheckStatuses.success.code;
|
445
366
|
|
446
|
-
|
447
|
-
res.
|
367
|
+
res.httpStatus = finalStatus;
|
368
|
+
res.url = page.url();
|
448
369
|
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
// Check content type to determine how to extract content
|
453
|
-
const contentType = response.headers()['content-type'] || '';
|
454
|
-
|
455
|
-
if (contentType.includes('xml') || res.url.endsWith('.xml')) {
|
456
|
-
// Fetch raw content to avoid Playwright's HTML-wrapped <pre> behavior
|
457
|
-
const rawResponse = await requestToUrl(res.url, true, extraHTTPHeaders);
|
458
|
-
res.content = rawResponse.content;
|
370
|
+
contentType = response?.headers()?.['content-type'] || '';
|
371
|
+
if (contentType.includes('pdf') || contentType.includes('octet-stream')) {
|
372
|
+
res.content = ''; // Avoid triggering render/download
|
459
373
|
} else {
|
460
|
-
|
374
|
+
try {
|
375
|
+
await page.waitForLoadState('networkidle', { timeout: 10000 });
|
376
|
+
} catch {
|
377
|
+
consoleLogger.info('Unable to detect networkidle');
|
378
|
+
}
|
379
|
+
|
380
|
+
res.content = await page.content();
|
461
381
|
}
|
462
382
|
|
463
383
|
} catch (error) {
|
464
384
|
if (error.message.includes('net::ERR_INVALID_AUTH_CREDENTIALS')) {
|
465
385
|
res.status = constants.urlCheckStatuses.unauthorised.code;
|
466
386
|
} else {
|
387
|
+
console.log(error);
|
467
388
|
res.status = constants.urlCheckStatuses.systemError.code;
|
468
389
|
}
|
469
390
|
} finally {
|
@@ -501,7 +422,6 @@ export const checkUrl = async (
|
|
501
422
|
browser: string,
|
502
423
|
clonedDataDir: string,
|
503
424
|
playwrightDeviceDetailsObject: DeviceDescriptor,
|
504
|
-
isCustomFlow: boolean,
|
505
425
|
extraHTTPHeaders: Record<string, string>,
|
506
426
|
) => {
|
507
427
|
const res = await checkUrlConnectivityWithBrowser(
|
@@ -509,7 +429,6 @@ export const checkUrl = async (
|
|
509
429
|
browser,
|
510
430
|
clonedDataDir,
|
511
431
|
playwrightDeviceDetailsObject,
|
512
|
-
isCustomFlow,
|
513
432
|
extraHTTPHeaders,
|
514
433
|
);
|
515
434
|
|
@@ -555,18 +474,16 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
555
474
|
if (isEmptyObject(argv)) {
|
556
475
|
throw Error('No inputs should be provided');
|
557
476
|
}
|
558
|
-
|
477
|
+
let {
|
559
478
|
scanner,
|
560
479
|
headless,
|
561
480
|
url,
|
562
481
|
deviceChosen,
|
563
482
|
customDevice,
|
564
483
|
viewportWidth,
|
565
|
-
playwrightDeviceDetailsObject,
|
566
484
|
maxpages,
|
567
485
|
strategy,
|
568
486
|
isLocalFileScan,
|
569
|
-
finalUrl,
|
570
487
|
browserToRun,
|
571
488
|
nameEmail,
|
572
489
|
customFlowLabel,
|
@@ -578,32 +495,72 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
578
495
|
followRobots,
|
579
496
|
header,
|
580
497
|
safeMode,
|
498
|
+
exportDirectory,
|
581
499
|
zip,
|
582
500
|
ruleset,
|
583
501
|
generateJsonFiles,
|
584
502
|
scanDuration
|
585
503
|
} = argv;
|
586
504
|
|
505
|
+
const extraHTTPHeaders = parseHeaders(header);
|
506
|
+
|
507
|
+
// Set default username and password for basic auth
|
508
|
+
let username = '';
|
509
|
+
let password = '';
|
510
|
+
|
511
|
+
// Remove credentials from URL if not a local file scan
|
512
|
+
url = argv.isLocalFileScan
|
513
|
+
? url
|
514
|
+
: (() => {
|
515
|
+
const temp = new URL(url);
|
516
|
+
username = temp.username;
|
517
|
+
password = temp.password;
|
518
|
+
|
519
|
+
if (username !== '' || password !== '') {
|
520
|
+
extraHTTPHeaders['Authorization'] = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
|
521
|
+
}
|
522
|
+
|
523
|
+
temp.username = '';
|
524
|
+
temp.password = '';
|
525
|
+
return temp.toString();
|
526
|
+
})();
|
527
|
+
|
587
528
|
// construct filename for scan results
|
588
529
|
const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
|
589
530
|
const domain = argv.isLocalFileScan ? path.basename(argv.url) : new URL(argv.url).hostname;
|
531
|
+
|
590
532
|
const sanitisedLabel = customFlowLabel ? `_${customFlowLabel.replaceAll(' ', '_')}` : '';
|
591
533
|
let resultFilename: string;
|
592
534
|
const randomThreeDigitNumber = randomThreeDigitNumberString();
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
535
|
+
resultFilename = `${date}_${time}${sanitisedLabel}_${domain}_${randomThreeDigitNumber}`;
|
536
|
+
|
537
|
+
// Set exported directory
|
538
|
+
if (exportDirectory) {
|
539
|
+
constants.exportDirectory = path.join(exportDirectory, resultFilename);
|
597
540
|
}
|
541
|
+
|
542
|
+
// Creating the playwrightDeviceDetailObject
|
543
|
+
deviceChosen = customDevice === 'Desktop' || customDevice === 'Mobile' ? customDevice : deviceChosen;
|
544
|
+
|
545
|
+
const playwrightDeviceDetailsObject = getPlaywrightDeviceDetailsObject(
|
546
|
+
deviceChosen,
|
547
|
+
customDevice,
|
548
|
+
viewportWidth,
|
549
|
+
);
|
550
|
+
|
551
|
+
const { browserToRun: resolvedBrowser, clonedBrowserDataDir } = getBrowserToRun(browserToRun, true, resultFilename);
|
552
|
+
browserToRun = resolvedBrowser;
|
553
|
+
|
554
|
+
const resolvedUserDataDirectory = getClonedProfilesWithRandomToken(browserToRun, resultFilename);
|
598
555
|
|
599
556
|
if (followRobots) {
|
600
557
|
constants.robotsTxtUrls = {};
|
601
|
-
await getUrlsFromRobotsTxt(url, browserToRun);
|
558
|
+
await getUrlsFromRobotsTxt(url, browserToRun, resolvedUserDataDirectory, extraHTTPHeaders);
|
602
559
|
}
|
603
560
|
|
604
561
|
return {
|
605
562
|
type: scanner,
|
606
|
-
url:
|
563
|
+
url: url,
|
607
564
|
entryUrl: url,
|
608
565
|
isHeadless: headless,
|
609
566
|
deviceChosen,
|
@@ -624,8 +581,9 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
624
581
|
includeScreenshots: !(additional === 'none'),
|
625
582
|
metadata,
|
626
583
|
followRobots,
|
627
|
-
extraHTTPHeaders:
|
584
|
+
extraHTTPHeaders: extraHTTPHeaders,
|
628
585
|
safeMode,
|
586
|
+
userDataDirectory: resolvedUserDataDirectory,
|
629
587
|
zip,
|
630
588
|
ruleset,
|
631
589
|
generateJsonFiles,
|
@@ -633,7 +591,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
633
591
|
};
|
634
592
|
};
|
635
593
|
|
636
|
-
export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string): Promise<void> => {
|
594
|
+
export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string, userDataDirectory: string, extraHTTPHeaders: Record<string, string>): Promise<void> => {
|
637
595
|
if (!constants.robotsTxtUrls) return;
|
638
596
|
|
639
597
|
const domain = new URL(url).origin;
|
@@ -642,22 +600,18 @@ export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string): P
|
|
642
600
|
|
643
601
|
let robotsTxt: string;
|
644
602
|
try {
|
645
|
-
|
646
|
-
|
647
|
-
} else {
|
648
|
-
robotsTxt = await getRobotsTxtViaAxios(robotsUrl);
|
649
|
-
}
|
603
|
+
robotsTxt = await getRobotsTxtViaPlaywright(robotsUrl, browserToRun, userDataDirectory, extraHTTPHeaders);
|
604
|
+
consoleLogger.info(`Fetched robots.txt from ${robotsUrl}`);
|
650
605
|
} catch (e) {
|
651
606
|
// if robots.txt is not found, do nothing
|
607
|
+
consoleLogger.info(`Unable to fetch robots.txt from ${robotsUrl}`);
|
652
608
|
}
|
653
|
-
|
609
|
+
|
654
610
|
if (!robotsTxt) {
|
655
611
|
constants.robotsTxtUrls[domain] = {};
|
656
612
|
return;
|
657
613
|
}
|
658
|
-
|
659
|
-
console.log('Found robots.txt: ', robotsUrl);
|
660
|
-
|
614
|
+
|
661
615
|
const lines = robotsTxt.split(/\r?\n/);
|
662
616
|
let shouldCapture = false;
|
663
617
|
const disallowedUrls = [];
|
@@ -705,30 +659,30 @@ export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string): P
|
|
705
659
|
constants.robotsTxtUrls[domain] = { disallowedUrls, allowedUrls };
|
706
660
|
};
|
707
661
|
|
708
|
-
const getRobotsTxtViaPlaywright = async (robotsUrl: string, browser: string): Promise<string> => {
|
709
|
-
|
662
|
+
const getRobotsTxtViaPlaywright = async (robotsUrl: string, browser: string, userDataDirectory: string, extraHTTPHeaders: Record<string, string>): Promise<string> => {
|
663
|
+
|
664
|
+
let robotsDataDir = '';
|
665
|
+
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
666
|
+
if (process.env.CRAWLEE_HEADLESS === '1') {
|
667
|
+
// Create robots own user data directory else SingletonLock: File exists (17) with crawlDomain or crawlSitemap's own browser
|
668
|
+
const robotsDataDir = path.join(userDataDirectory, 'robots');
|
669
|
+
if (!fs.existsSync(robotsDataDir)) {
|
670
|
+
fs.mkdirSync(robotsDataDir, { recursive: true });
|
671
|
+
}
|
672
|
+
}
|
673
|
+
|
674
|
+
const browserContext = await constants.launcher.launchPersistentContext(robotsDataDir, {
|
710
675
|
...getPlaywrightLaunchOptions(browser),
|
676
|
+
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
711
677
|
});
|
712
678
|
|
713
679
|
const page = await browserContext.newPage();
|
714
|
-
await page.goto(robotsUrl, { waitUntil: 'networkidle', timeout: 30000 });
|
715
680
|
|
681
|
+
await page.goto(robotsUrl, { waitUntil: 'networkidle', timeout: 30000 });
|
716
682
|
const robotsTxt: string | null = await page.evaluate(() => document.body.textContent);
|
717
683
|
return robotsTxt;
|
718
684
|
};
|
719
685
|
|
720
|
-
const getRobotsTxtViaAxios = async (robotsUrl: string): Promise<string> => {
|
721
|
-
const instance = axios.create({
|
722
|
-
httpsAgent: new https.Agent({
|
723
|
-
rejectUnauthorized: false,
|
724
|
-
keepAlive: true,
|
725
|
-
}),
|
726
|
-
});
|
727
|
-
|
728
|
-
const robotsTxt = (await (await instance.get(robotsUrl, { timeout: 2000 })).data) as string;
|
729
|
-
return robotsTxt;
|
730
|
-
};
|
731
|
-
|
732
686
|
export const isDisallowedInRobotsTxt = (url: string): boolean => {
|
733
687
|
if (!constants.robotsTxtUrls) return;
|
734
688
|
|
@@ -760,8 +714,7 @@ export const getLinksFromSitemap = async (
|
|
760
714
|
userDataDirectory: string,
|
761
715
|
userUrlInput: string,
|
762
716
|
isIntelligent: boolean,
|
763
|
-
|
764
|
-
password: string,
|
717
|
+
extraHTTPHeaders: Record<string, string>,
|
765
718
|
) => {
|
766
719
|
const scannedSitemaps = new Set<string>();
|
767
720
|
const urls: Record<string, Request> = {}; // dictionary of requests to urls to be scanned
|
@@ -772,11 +725,6 @@ export const getLinksFromSitemap = async (
|
|
772
725
|
if (!url) return;
|
773
726
|
if (isDisallowedInRobotsTxt(url)) return;
|
774
727
|
|
775
|
-
// add basic auth credentials to the URL
|
776
|
-
username !== '' && password !== ''
|
777
|
-
? (url = addBasicAuthCredentials(url, username, password))
|
778
|
-
: url;
|
779
|
-
|
780
728
|
url = convertPathToLocalFile(url);
|
781
729
|
|
782
730
|
let request;
|
@@ -791,13 +739,6 @@ export const getLinksFromSitemap = async (
|
|
791
739
|
urls[url] = request;
|
792
740
|
};
|
793
741
|
|
794
|
-
const addBasicAuthCredentials = (url: string, username: string, password: string) => {
|
795
|
-
const urlObject = new URL(url);
|
796
|
-
urlObject.username = username;
|
797
|
-
urlObject.password = password;
|
798
|
-
return urlObject.toString();
|
799
|
-
};
|
800
|
-
|
801
742
|
const calculateCloseness = (sitemapUrl: string) => {
|
802
743
|
// Remove 'http://', 'https://', and 'www.' prefixes from the URLs
|
803
744
|
const normalizedSitemapUrl = sitemapUrl.replace(/^(https?:\/\/)?(www\.)?/, '');
|
@@ -868,16 +809,10 @@ export const getLinksFromSitemap = async (
|
|
868
809
|
finalUserDataDirectory = '';
|
869
810
|
}
|
870
811
|
|
871
|
-
const fetchUrls = async (url: string) => {
|
812
|
+
const fetchUrls = async (url: string, extraHTTPHeaders: Record<string, string>) => {
|
872
813
|
let data;
|
873
814
|
let sitemapType;
|
874
|
-
|
875
|
-
|
876
|
-
let username = '';
|
877
|
-
let password = '';
|
878
|
-
|
879
|
-
let parsedUrl;
|
880
|
-
|
815
|
+
|
881
816
|
if (scannedSitemaps.has(url)) {
|
882
817
|
// Skip processing if the sitemap has already been scanned
|
883
818
|
return;
|
@@ -893,17 +828,9 @@ export const getLinksFromSitemap = async (
|
|
893
828
|
if (!fs.existsSync(url)) {
|
894
829
|
return;
|
895
830
|
}
|
896
|
-
|
831
|
+
|
897
832
|
} else if (isValidHttpUrl(url)) {
|
898
|
-
|
899
|
-
|
900
|
-
if (parsedUrl.username !== '' && parsedUrl.password !== '') {
|
901
|
-
isBasicAuth = true;
|
902
|
-
username = decodeURIComponent(parsedUrl.username);
|
903
|
-
password = decodeURIComponent(parsedUrl.password);
|
904
|
-
parsedUrl.username = '';
|
905
|
-
parsedUrl.password = '';
|
906
|
-
}
|
833
|
+
// Do nothing, url is valid
|
907
834
|
} else {
|
908
835
|
printMessage([`Invalid Url/Filepath: ${url}`], messageOptions);
|
909
836
|
return;
|
@@ -915,12 +842,17 @@ export const getLinksFromSitemap = async (
|
|
915
842
|
{
|
916
843
|
...getPlaywrightLaunchOptions(browser),
|
917
844
|
// Not necessary to parse http_credentials as I am parsing it directly in URL
|
845
|
+
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
846
|
+
...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
|
847
|
+
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
918
848
|
},
|
919
849
|
);
|
920
850
|
|
921
851
|
const page = await browserContext.newPage();
|
852
|
+
|
922
853
|
await page.goto(url, { waitUntil: 'networkidle', timeout: 60000 });
|
923
|
-
|
854
|
+
|
855
|
+
if (await page.locator('body').count() > 0) {
|
924
856
|
data = await page.locator('body').innerText();
|
925
857
|
} else {
|
926
858
|
const urlSet = page.locator('urlset');
|
@@ -948,35 +880,14 @@ export const getLinksFromSitemap = async (
|
|
948
880
|
addToUrlList(url);
|
949
881
|
return;
|
950
882
|
}
|
951
|
-
|
952
|
-
|
953
|
-
|
954
|
-
try {
|
955
|
-
const instance = axios.create({
|
956
|
-
httpsAgent: new https.Agent({
|
957
|
-
rejectUnauthorized: false,
|
958
|
-
keepAlive: true,
|
959
|
-
}),
|
960
|
-
auth: {
|
961
|
-
username,
|
962
|
-
password,
|
963
|
-
},
|
964
|
-
});
|
965
|
-
try {
|
966
|
-
data = await (await instance.get(url, { timeout: 80000 })).data;
|
967
|
-
} catch {
|
968
|
-
return; // to skip the error
|
969
|
-
}
|
970
|
-
} catch (error) {
|
971
|
-
if (error.code === 'ECONNABORTED') {
|
972
|
-
await getDataUsingPlaywright();
|
973
|
-
}
|
974
|
-
}
|
975
|
-
}
|
883
|
+
|
884
|
+
await getDataUsingPlaywright();
|
885
|
+
|
976
886
|
} else {
|
977
887
|
url = convertLocalFileToPath(url);
|
978
888
|
data = fs.readFileSync(url, 'utf8');
|
979
889
|
}
|
890
|
+
|
980
891
|
const $ = cheerio.load(data, { xml: true });
|
981
892
|
|
982
893
|
// This case is when the document is not an XML format document
|
@@ -1012,7 +923,7 @@ export const getLinksFromSitemap = async (
|
|
1012
923
|
break;
|
1013
924
|
}
|
1014
925
|
if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) {
|
1015
|
-
await fetchUrls(childSitemapUrlText); // Recursive call for nested sitemaps
|
926
|
+
await fetchUrls(childSitemapUrlText, extraHTTPHeaders); // Recursive call for nested sitemaps
|
1016
927
|
} else {
|
1017
928
|
addToUrlList(childSitemapUrlText); // Add regular URLs to the list
|
1018
929
|
}
|
@@ -1037,7 +948,7 @@ export const getLinksFromSitemap = async (
|
|
1037
948
|
};
|
1038
949
|
|
1039
950
|
try {
|
1040
|
-
await fetchUrls(sitemapUrl);
|
951
|
+
await fetchUrls(sitemapUrl, extraHTTPHeaders);
|
1041
952
|
} catch (e) {
|
1042
953
|
consoleLogger.error(e);
|
1043
954
|
}
|
@@ -1086,20 +997,26 @@ export const validName = (name: string) => {
|
|
1086
997
|
* @returns object consisting of browser to run and cloned data directory
|
1087
998
|
*/
|
1088
999
|
export const getBrowserToRun = (
|
1089
|
-
preferredBrowser
|
1000
|
+
preferredBrowser?: BrowserTypes,
|
1090
1001
|
isCli = false,
|
1002
|
+
randomToken?: string
|
1091
1003
|
): { browserToRun: BrowserTypes; clonedBrowserDataDir: string } => {
|
1004
|
+
|
1005
|
+
if (!randomToken) {
|
1006
|
+
randomToken = '';
|
1007
|
+
}
|
1008
|
+
|
1092
1009
|
const platform = os.platform();
|
1093
1010
|
|
1094
1011
|
// Prioritise Chrome on Windows and Mac platforms if user does not specify a browser
|
1095
1012
|
if (!preferredBrowser && (os.platform() === 'win32' || os.platform() === 'darwin')) {
|
1096
1013
|
preferredBrowser = BrowserTypes.CHROME;
|
1014
|
+
} else {
|
1015
|
+
printMessage([`Preferred browser ${preferredBrowser}`], messageOptions);
|
1097
1016
|
}
|
1098
1017
|
|
1099
|
-
printMessage([`Preferred browser ${preferredBrowser}`], messageOptions);
|
1100
|
-
|
1101
1018
|
if (preferredBrowser === BrowserTypes.CHROME) {
|
1102
|
-
const chromeData = getChromeData();
|
1019
|
+
const chromeData = getChromeData(randomToken);
|
1103
1020
|
if (chromeData) return chromeData;
|
1104
1021
|
|
1105
1022
|
if (platform === 'darwin') {
|
@@ -1113,7 +1030,7 @@ export const getBrowserToRun = (
|
|
1113
1030
|
if (isCli)
|
1114
1031
|
printMessage(['Unable to use Chrome, falling back to Edge browser...'], messageOptions);
|
1115
1032
|
|
1116
|
-
const edgeData = getEdgeData();
|
1033
|
+
const edgeData = getEdgeData(randomToken);
|
1117
1034
|
if (edgeData) return edgeData;
|
1118
1035
|
|
1119
1036
|
if (isCli)
|
@@ -1125,12 +1042,12 @@ export const getBrowserToRun = (
|
|
1125
1042
|
printMessage(['Unable to use Chrome, falling back to Chromium browser...'], messageOptions);
|
1126
1043
|
}
|
1127
1044
|
} else if (preferredBrowser === BrowserTypes.EDGE) {
|
1128
|
-
const edgeData = getEdgeData();
|
1045
|
+
const edgeData = getEdgeData(randomToken);
|
1129
1046
|
if (edgeData) return edgeData;
|
1130
1047
|
|
1131
1048
|
if (isCli)
|
1132
1049
|
printMessage(['Unable to use Edge, falling back to Chrome browser...'], messageOptions);
|
1133
|
-
const chromeData = getChromeData();
|
1050
|
+
const chromeData = getChromeData(randomToken);
|
1134
1051
|
if (chromeData) return chromeData;
|
1135
1052
|
|
1136
1053
|
if (platform === 'darwin') {
|
@@ -1161,7 +1078,7 @@ export const getBrowserToRun = (
|
|
1161
1078
|
// defaults to chromium
|
1162
1079
|
return {
|
1163
1080
|
browserToRun: BrowserTypes.CHROMIUM,
|
1164
|
-
clonedBrowserDataDir: cloneChromiumProfiles(),
|
1081
|
+
clonedBrowserDataDir: cloneChromiumProfiles(randomToken),
|
1165
1082
|
};
|
1166
1083
|
};
|
1167
1084
|
|
@@ -1181,9 +1098,9 @@ export const getClonedProfilesWithRandomToken = (browser: string, randomToken: s
|
|
1181
1098
|
return cloneChromiumProfiles(randomToken);
|
1182
1099
|
};
|
1183
1100
|
|
1184
|
-
export const getChromeData = () => {
|
1101
|
+
export const getChromeData = (randomToken: string) => {
|
1185
1102
|
const browserDataDir = getDefaultChromeDataDir();
|
1186
|
-
const clonedBrowserDataDir = cloneChromeProfiles();
|
1103
|
+
const clonedBrowserDataDir = cloneChromeProfiles(randomToken);
|
1187
1104
|
if (browserDataDir && clonedBrowserDataDir) {
|
1188
1105
|
const browserToRun = BrowserTypes.CHROME;
|
1189
1106
|
return { browserToRun, clonedBrowserDataDir };
|
@@ -1191,9 +1108,9 @@ export const getChromeData = () => {
|
|
1191
1108
|
return null;
|
1192
1109
|
};
|
1193
1110
|
|
1194
|
-
export const getEdgeData = () => {
|
1111
|
+
export const getEdgeData = (randomToken: string) => {
|
1195
1112
|
const browserDataDir = getDefaultEdgeDataDir();
|
1196
|
-
const clonedBrowserDataDir = cloneEdgeProfiles();
|
1113
|
+
const clonedBrowserDataDir = cloneEdgeProfiles(randomToken);
|
1197
1114
|
if (browserDataDir && clonedBrowserDataDir) {
|
1198
1115
|
const browserToRun = BrowserTypes.EDGE;
|
1199
1116
|
return { browserToRun, clonedBrowserDataDir };
|
@@ -1397,7 +1314,7 @@ const cloneLocalStateFile = (options: GlobOptionsWithFileTypesFalse, destDir: st
|
|
1397
1314
|
* @param {string} randomToken - random token to append to the cloned directory
|
1398
1315
|
* @returns {string} cloned data directory, null if any of the sub files failed to copy
|
1399
1316
|
*/
|
1400
|
-
export const cloneChromeProfiles = (randomToken
|
1317
|
+
export const cloneChromeProfiles = (randomToken: string): string => {
|
1401
1318
|
const baseDir = getDefaultChromeDataDir();
|
1402
1319
|
|
1403
1320
|
if (!baseDir) {
|
@@ -1406,18 +1323,10 @@ export const cloneChromeProfiles = (randomToken?: string): string => {
|
|
1406
1323
|
|
1407
1324
|
let destDir;
|
1408
1325
|
|
1409
|
-
|
1410
|
-
destDir = path.join(baseDir, `oobee-${randomToken}`);
|
1411
|
-
} else {
|
1412
|
-
destDir = path.join(baseDir, 'oobee');
|
1413
|
-
}
|
1326
|
+
destDir = path.join(baseDir, `oobee-${randomToken}`);
|
1414
1327
|
|
1415
1328
|
if (fs.existsSync(destDir)) {
|
1416
|
-
if (process.env.OOBEE_VERBOSE) {
|
1417
1329
|
deleteClonedChromeProfiles(randomToken);
|
1418
|
-
} else {
|
1419
|
-
deleteClonedChromeProfiles();
|
1420
|
-
}
|
1421
1330
|
}
|
1422
1331
|
|
1423
1332
|
if (!fs.existsSync(destDir)) {
|
@@ -1435,10 +1344,13 @@ export const cloneChromeProfiles = (randomToken?: string): string => {
|
|
1435
1344
|
return destDir;
|
1436
1345
|
}
|
1437
1346
|
|
1438
|
-
|
1347
|
+
consoleLogger.error('Failed to clone Chrome profiles. You may be logged out of your accounts.');
|
1348
|
+
|
1349
|
+
// For future reference, return a null instead to halt the scan
|
1350
|
+
return destDir;
|
1439
1351
|
};
|
1440
1352
|
|
1441
|
-
export const cloneChromiumProfiles = (randomToken
|
1353
|
+
export const cloneChromiumProfiles = (randomToken: string): string => {
|
1442
1354
|
const baseDir = getDefaultChromiumDataDir();
|
1443
1355
|
|
1444
1356
|
if (!baseDir) {
|
@@ -1447,10 +1359,10 @@ export const cloneChromiumProfiles = (randomToken?: string): string => {
|
|
1447
1359
|
|
1448
1360
|
let destDir: string;
|
1449
1361
|
|
1450
|
-
|
1451
|
-
|
1452
|
-
|
1453
|
-
|
1362
|
+
destDir = path.join(baseDir, `oobee-${randomToken}`);
|
1363
|
+
|
1364
|
+
if (fs.existsSync(destDir)) {
|
1365
|
+
deleteClonedChromiumProfiles(randomToken);
|
1454
1366
|
}
|
1455
1367
|
|
1456
1368
|
if (!fs.existsSync(destDir)) {
|
@@ -1468,7 +1380,7 @@ export const cloneChromiumProfiles = (randomToken?: string): string => {
|
|
1468
1380
|
* @param {string} randomToken - random token to append to the cloned directory
|
1469
1381
|
* @returns {string} cloned data directory, null if any of the sub files failed to copy
|
1470
1382
|
*/
|
1471
|
-
export const cloneEdgeProfiles = (randomToken
|
1383
|
+
export const cloneEdgeProfiles = (randomToken: string): string => {
|
1472
1384
|
const baseDir = getDefaultEdgeDataDir();
|
1473
1385
|
|
1474
1386
|
if (!baseDir) {
|
@@ -1477,18 +1389,10 @@ export const cloneEdgeProfiles = (randomToken?: string): string => {
|
|
1477
1389
|
|
1478
1390
|
let destDir;
|
1479
1391
|
|
1480
|
-
|
1481
|
-
destDir = path.join(baseDir, `oobee-${randomToken}`);
|
1482
|
-
} else {
|
1483
|
-
destDir = path.join(baseDir, 'oobee');
|
1484
|
-
}
|
1392
|
+
destDir = path.join(baseDir, `oobee-${randomToken}`);
|
1485
1393
|
|
1486
1394
|
if (fs.existsSync(destDir)) {
|
1487
|
-
if (process.env.OOBEE_VERBOSE) {
|
1488
1395
|
deleteClonedEdgeProfiles(randomToken);
|
1489
|
-
} else {
|
1490
|
-
deleteClonedEdgeProfiles();
|
1491
|
-
}
|
1492
1396
|
}
|
1493
1397
|
|
1494
1398
|
if (!fs.existsSync(destDir)) {
|
@@ -1507,10 +1411,13 @@ export const cloneEdgeProfiles = (randomToken?: string): string => {
|
|
1507
1411
|
return destDir;
|
1508
1412
|
}
|
1509
1413
|
|
1510
|
-
|
1414
|
+
consoleLogger.error('Failed to clone Edge profiles. You may be logged out of your accounts.');
|
1415
|
+
|
1416
|
+
// For future reference, return a null instead to halt the scan
|
1417
|
+
return destDir;
|
1511
1418
|
};
|
1512
1419
|
|
1513
|
-
export const deleteClonedProfiles = (browser: string, randomToken
|
1420
|
+
export const deleteClonedProfiles = (browser: string, randomToken: string): void => {
|
1514
1421
|
if (browser === BrowserTypes.CHROME) {
|
1515
1422
|
deleteClonedChromeProfiles(randomToken);
|
1516
1423
|
} else if (browser === BrowserTypes.EDGE) {
|
@@ -1565,9 +1472,7 @@ export const deleteClonedChromeProfiles = (randomToken?: string): void => {
|
|
1565
1472
|
* @returns null
|
1566
1473
|
*/
|
1567
1474
|
export const deleteClonedEdgeProfiles = (randomToken?: string): void => {
|
1568
|
-
|
1569
|
-
return;
|
1570
|
-
}
|
1475
|
+
|
1571
1476
|
const baseDir = getDefaultEdgeDataDir();
|
1572
1477
|
|
1573
1478
|
if (!baseDir) {
|
@@ -1716,13 +1621,9 @@ export const submitFormViaPlaywright = async (
|
|
1716
1621
|
} finally {
|
1717
1622
|
await browserContext.close();
|
1718
1623
|
if (proxy && browserToRun === BrowserTypes.EDGE) {
|
1719
|
-
|
1720
|
-
deleteClonedEdgeProfiles();
|
1721
|
-
}
|
1624
|
+
deleteClonedEdgeProfiles(clonedDir);
|
1722
1625
|
} else if (proxy && browserToRun === BrowserTypes.CHROME) {
|
1723
|
-
|
1724
|
-
deleteClonedChromeProfiles();
|
1725
|
-
}
|
1626
|
+
deleteClonedChromeProfiles(clonedDir);
|
1726
1627
|
}
|
1727
1628
|
}
|
1728
1629
|
};
|
@@ -1781,7 +1682,9 @@ export const submitForm = async (
|
|
1781
1682
|
export async function initModifiedUserAgent(
|
1782
1683
|
browser?: string,
|
1783
1684
|
playwrightDeviceDetailsObject?: object,
|
1685
|
+
userDataDirectory?: string,
|
1784
1686
|
) {
|
1687
|
+
|
1785
1688
|
const isHeadless = process.env.CRAWLEE_HEADLESS === '1';
|
1786
1689
|
|
1787
1690
|
// If headless mode is enabled, ensure the headless flag is set.
|
@@ -1798,7 +1701,11 @@ export async function initModifiedUserAgent(
|
|
1798
1701
|
};
|
1799
1702
|
|
1800
1703
|
// Launch a temporary persistent context with an empty userDataDir to mimic your production browser setup.
|
1801
|
-
const
|
1704
|
+
const effectiveUserDataDirectory = process.env.CRAWLEE_HEADLESS === '1'
|
1705
|
+
? userDataDirectory
|
1706
|
+
: '';
|
1707
|
+
|
1708
|
+
const browserContext = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, launchOptions);
|
1802
1709
|
const page = await browserContext.newPage();
|
1803
1710
|
|
1804
1711
|
// Retrieve the default user agent.
|
@@ -1856,13 +1763,6 @@ export const getPlaywrightLaunchOptions = (browser?: string): LaunchOptions => {
|
|
1856
1763
|
return options;
|
1857
1764
|
};
|
1858
1765
|
|
1859
|
-
export const urlWithoutAuth = (url: string): string => {
|
1860
|
-
const parsedUrl = new URL(url);
|
1861
|
-
parsedUrl.username = '';
|
1862
|
-
parsedUrl.password = '';
|
1863
|
-
return parsedUrl.toString();
|
1864
|
-
};
|
1865
|
-
|
1866
1766
|
export const waitForPageLoaded = async (page: Page, timeout = 10000) => {
|
1867
1767
|
const OBSERVER_TIMEOUT = timeout; // Ensure observer timeout does not exceed the main timeout
|
1868
1768
|
|
@@ -1887,7 +1787,7 @@ export const waitForPageLoaded = async (page: Page, timeout = 10000) => {
|
|
1887
1787
|
|
1888
1788
|
let timeout: NodeJS.Timeout;
|
1889
1789
|
let mutationCount = 0;
|
1890
|
-
const MAX_MUTATIONS =
|
1790
|
+
const MAX_MUTATIONS = 500;
|
1891
1791
|
const mutationHash: Record<string, number> = {};
|
1892
1792
|
|
1893
1793
|
const observer = new MutationObserver(mutationsList => {
|