@govtechsg/oobee 0.10.58 → 0.10.62

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,7 +31,7 @@ import constants, {
31
31
  } from './constants.js';
32
32
  import { consoleLogger, silentLogger } from '../logs.js';
33
33
  import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
34
- import { randomThreeDigitNumberString } from '../utils.js';
34
+ import { cleanUpAndExit, randomThreeDigitNumberString, register } from '../utils.js';
35
35
  import { Answers, Data } from '../index.js';
36
36
  import { DeviceDescriptor } from '../types/types.js';
37
37
 
@@ -119,7 +119,7 @@ export const validateFilePath = (filePath: string, cliDir: string) => {
119
119
 
120
120
  return absolutePath;
121
121
  } catch {
122
- throw new Error(`Please ensure path provided exists: ${absolutePath}`);
122
+ throw new Error(`Please ensure path provided exists and writable: ${absolutePath}`);
123
123
  }
124
124
  };
125
125
 
@@ -277,110 +277,11 @@ export const sanitizeUrlInput = (url: string): { isValid: boolean; url: string }
277
277
  return { isValid: false, url: sanitizeUrl };
278
278
  };
279
279
 
280
- const requestToUrl = async (
281
- url: string,
282
- isCustomFlow: boolean,
283
- extraHTTPHeaders: Record<string, string>,
284
- ) => {
285
- // User-Agent is modified to emulate a browser to handle cases where some sites ban non browser agents, resulting in a 403 error
286
- const res = new RES();
287
- const parsedUrl = new URL(url);
288
- await axios
289
- .get(parsedUrl.href, {
290
- headers: {
291
- ...extraHTTPHeaders,
292
- 'User-Agent': devices['Desktop Chrome HiDPI'].userAgent,
293
- Host: parsedUrl.host,
294
- },
295
- auth: {
296
- username: decodeURIComponent(parsedUrl.username),
297
- password: decodeURIComponent(parsedUrl.password),
298
- },
299
- httpsAgent,
300
- timeout: 5000,
301
- })
302
- .then(async response => {
303
- let redirectUrl = response.request.res.responseUrl;
304
- redirectUrl = new URL(redirectUrl).href;
305
- res.status = constants.urlCheckStatuses.success.code;
306
- let data;
307
- if (typeof response.data === 'string' || response.data instanceof String) {
308
- data = response.data;
309
- } else if (typeof response.data === 'object' && response.data !== null) {
310
- try {
311
- data = JSON.stringify(response.data);
312
- } catch (error) {
313
- console.log('Error converting object to JSON:', error);
314
- }
315
- } else {
316
- console.log('Unsupported data type:', typeof response.data);
317
- }
318
- const modifiedHTML = data.replace(/<noscript>[\s\S]*?<\/noscript>/gi, '');
319
-
320
- const metaRefreshMatch =
321
- /<meta\s+http-equiv="refresh"\s+content="(?:\d+;)?\s*url=(?:'([^']*)'|"([^"]*)"|([^>]*))"/i.exec(
322
- modifiedHTML,
323
- );
324
-
325
- const hasMetaRefresh = metaRefreshMatch && metaRefreshMatch.length > 1;
326
-
327
- if (redirectUrl != null && (hasMetaRefresh || !isCustomFlow)) {
328
- res.url = redirectUrl;
329
- } else {
330
- res.url = url;
331
- }
332
-
333
- if (hasMetaRefresh) {
334
- let urlOrRelativePath;
335
-
336
- for (let i = 1; i < metaRefreshMatch.length; i++) {
337
- if (metaRefreshMatch[i] !== undefined && metaRefreshMatch[i] !== null) {
338
- urlOrRelativePath = metaRefreshMatch[i];
339
- break; // Stop the loop once the first non-null value is found
340
- }
341
- }
342
-
343
- if (urlOrRelativePath.includes('URL=')) {
344
- res.url = urlOrRelativePath.split('URL=').pop();
345
- } else {
346
- const pathname = res.url.substring(0, res.url.lastIndexOf('/'));
347
- res.url = new URL(urlOrRelativePath, pathname).toString();
348
- }
349
- }
350
-
351
- res.content = response.data;
352
- })
353
- .catch(async error => {
354
- if (error.code === 'ECONNABORTED' || error.code === 'ERR_FR_TOO_MANY_REDIRECTS') {
355
- res.status = constants.urlCheckStatuses.axiosTimeout.code;
356
- } else if (error.response) {
357
- if (error.response.status === 401) {
358
- // enters here if URL is protected by basic auth
359
- res.status = constants.urlCheckStatuses.unauthorised.code;
360
- } else {
361
- // enters here if server responds with a status other than 2xx
362
- // the scan should still proceed even if error codes are received, so that accessibility scans for error pages can be done too
363
- res.status = constants.urlCheckStatuses.success.code;
364
- }
365
- res.url = url;
366
- res.content = error.response.data;
367
- return res;
368
- } else if (error.request) {
369
- // enters here if URL cannot be accessed
370
- res.status = constants.urlCheckStatuses.cannotBeResolved.code;
371
- } else {
372
- res.status = constants.urlCheckStatuses.systemError.code;
373
- }
374
- });
375
- return res;
376
- };
377
-
378
280
  const checkUrlConnectivityWithBrowser = async (
379
281
  url: string,
380
282
  browserToRun: string,
381
283
  clonedDataDir: string,
382
284
  playwrightDeviceDetailsObject: DeviceDescriptor,
383
- isCustomFlow: boolean,
384
285
  extraHTTPHeaders: Record<string, string>,
385
286
  ) => {
386
287
  const res = new RES();
@@ -391,28 +292,21 @@ const checkUrlConnectivityWithBrowser = async (
391
292
  return res;
392
293
  }
393
294
 
394
- let viewport = null;
395
- let userAgent = null;
396
- if ('viewport' in playwrightDeviceDetailsObject) viewport = playwrightDeviceDetailsObject.viewport;
397
- if ('userAgent' in playwrightDeviceDetailsObject) userAgent = playwrightDeviceDetailsObject.userAgent;
398
-
399
295
  // Ensure Accept header for non-html content fallback
400
296
  extraHTTPHeaders['Accept'] ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
401
297
 
402
- const launchOptions = getPlaywrightLaunchOptions(browserToRun);
403
- const browserContextLaunchOptions = {
404
- ...launchOptions,
405
- args: [...launchOptions.args, '--headless=new'],
406
- };
298
+ await initModifiedUserAgent(browserToRun, playwrightDeviceDetailsObject, clonedDataDir);
407
299
 
408
300
  let browserContext;
409
301
  try {
410
302
  browserContext = await constants.launcher.launchPersistentContext(clonedDataDir, {
411
- ...browserContextLaunchOptions,
412
- ...(viewport && { viewport }),
413
- ...(userAgent && { userAgent }),
414
303
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
304
+ ignoreHTTPSErrors: true,
305
+ ...getPlaywrightLaunchOptions(browserToRun),
306
+ ...playwrightDeviceDetailsObject,
415
307
  });
308
+
309
+ register(browserContext);
416
310
  } catch (err) {
417
311
  printMessage([`Unable to launch browser\n${err}`], messageOptions);
418
312
  res.status = constants.urlCheckStatuses.browserError.code;
@@ -422,48 +316,77 @@ const checkUrlConnectivityWithBrowser = async (
422
316
  try {
423
317
  const page = await browserContext.newPage();
424
318
 
425
- // Skip Playwright for PDF (use raw request instead)
426
- if (isUrlPdf(url)) {
427
- return await requestToUrl(url, false, extraHTTPHeaders);
319
+ // STEP 1: HEAD request before actual navigation
320
+ let statusCode = 0;
321
+ let contentType = '';
322
+ let disposition = '';
323
+
324
+ try {
325
+ const headResp = await page.request.fetch(url, {
326
+ method: 'HEAD',
327
+ headers: extraHTTPHeaders,
328
+ });
329
+
330
+ statusCode = headResp.status();
331
+ contentType = headResp.headers()['content-type'] || '';
332
+ disposition = headResp.headers()['content-disposition'] || '';
333
+
334
+ // If it looks like a downloadable file, skip goto entirely
335
+ if (
336
+ contentType.includes('pdf') ||
337
+ contentType.includes('octet-stream') ||
338
+ disposition.includes('attachment')
339
+ ) {
340
+ res.status = statusCode === 401
341
+ ? constants.urlCheckStatuses.unauthorised.code
342
+ : constants.urlCheckStatuses.success.code;
343
+
344
+ res.httpStatus = statusCode;
345
+ res.url = url;
346
+ res.content = ''; // Don't try to render binary
347
+
348
+ await browserContext.close();
349
+ return res;
350
+ }
351
+ } catch (e) {
352
+ consoleLogger.info(`HEAD request failed: ${e.message}`);
353
+ res.status = constants.urlCheckStatuses.systemError.code;
354
+ await browserContext.close();
355
+ return res;
428
356
  }
429
357
 
358
+ // STEP 2: Safe to proceed with navigation
430
359
  const response = await page.goto(url, {
431
360
  timeout: 30000,
432
- ...(proxy && { waitUntil: 'commit' }),
361
+ waitUntil: 'commit', // Don't wait for full load
433
362
  });
434
363
 
435
- try {
436
- await page.waitForLoadState('networkidle', { timeout: 10000 });
437
- } catch {
438
- consoleLogger.info('Unable to detect networkidle');
439
- }
440
-
441
- const status = response.status();
442
- res.status = status === 401
364
+ const finalStatus = statusCode || (response?.status?.() ?? 0);
365
+ res.status = finalStatus === 401
443
366
  ? constants.urlCheckStatuses.unauthorised.code
444
367
  : constants.urlCheckStatuses.success.code;
445
368
 
446
- // Store the status code
447
- res.httpStatus = response?.status?.() ?? 0;
369
+ res.httpStatus = finalStatus;
370
+ res.url = page.url();
448
371
 
449
- // Store final navigated URL
450
- res.url = isCustomFlow ? url : page.url();
451
-
452
- // Check content type to determine how to extract content
453
- const contentType = response.headers()['content-type'] || '';
454
-
455
- if (contentType.includes('xml') || res.url.endsWith('.xml')) {
456
- // Fetch raw content to avoid Playwright's HTML-wrapped <pre> behavior
457
- const rawResponse = await requestToUrl(res.url, true, extraHTTPHeaders);
458
- res.content = rawResponse.content;
372
+ contentType = response?.headers()?.['content-type'] || '';
373
+ if (contentType.includes('pdf') || contentType.includes('octet-stream')) {
374
+ res.content = ''; // Avoid triggering render/download
459
375
  } else {
460
- res.content = await page.content(); // rendered DOM
376
+ try {
377
+ await page.waitForLoadState('networkidle', { timeout: 10000 });
378
+ } catch {
379
+ consoleLogger.info('Unable to detect networkidle');
380
+ }
381
+
382
+ res.content = await page.content();
461
383
  }
462
384
 
463
385
  } catch (error) {
464
386
  if (error.message.includes('net::ERR_INVALID_AUTH_CREDENTIALS')) {
465
387
  res.status = constants.urlCheckStatuses.unauthorised.code;
466
388
  } else {
389
+ console.log(error);
467
390
  res.status = constants.urlCheckStatuses.systemError.code;
468
391
  }
469
392
  } finally {
@@ -501,7 +424,6 @@ export const checkUrl = async (
501
424
  browser: string,
502
425
  clonedDataDir: string,
503
426
  playwrightDeviceDetailsObject: DeviceDescriptor,
504
- isCustomFlow: boolean,
505
427
  extraHTTPHeaders: Record<string, string>,
506
428
  ) => {
507
429
  const res = await checkUrlConnectivityWithBrowser(
@@ -509,7 +431,6 @@ export const checkUrl = async (
509
431
  browser,
510
432
  clonedDataDir,
511
433
  playwrightDeviceDetailsObject,
512
- isCustomFlow,
513
434
  extraHTTPHeaders,
514
435
  );
515
436
 
@@ -544,7 +465,7 @@ export const parseHeaders = (header?: string): Record<string, string> => {
544
465
  ],
545
466
  messageOptions,
546
467
  );
547
- process.exit(1);
468
+ cleanUpAndExit(1);
548
469
  }
549
470
  allHeaders[headerValuePair[0]] = headerValuePair[1]; // {"header": "value", "header2": "value2", ...}
550
471
  });
@@ -555,18 +476,16 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
555
476
  if (isEmptyObject(argv)) {
556
477
  throw Error('No inputs should be provided');
557
478
  }
558
- const {
479
+ let {
559
480
  scanner,
560
481
  headless,
561
482
  url,
562
483
  deviceChosen,
563
484
  customDevice,
564
485
  viewportWidth,
565
- playwrightDeviceDetailsObject,
566
486
  maxpages,
567
487
  strategy,
568
488
  isLocalFileScan,
569
- finalUrl,
570
489
  browserToRun,
571
490
  nameEmail,
572
491
  customFlowLabel,
@@ -578,32 +497,75 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
578
497
  followRobots,
579
498
  header,
580
499
  safeMode,
500
+ exportDirectory,
581
501
  zip,
582
502
  ruleset,
583
503
  generateJsonFiles,
584
504
  scanDuration
585
505
  } = argv;
586
506
 
507
+ const extraHTTPHeaders = parseHeaders(header);
508
+
509
+ // Set default username and password for basic auth
510
+ let username = '';
511
+ let password = '';
512
+
513
+ // Remove credentials from URL if not a local file scan
514
+ url = argv.isLocalFileScan
515
+ ? url
516
+ : (() => {
517
+ const temp = new URL(url);
518
+ username = temp.username;
519
+ password = temp.password;
520
+
521
+ if (username !== '' || password !== '') {
522
+ extraHTTPHeaders['Authorization'] = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
523
+ }
524
+
525
+ temp.username = '';
526
+ temp.password = '';
527
+ return temp.toString();
528
+ })();
529
+
587
530
  // construct filename for scan results
588
531
  const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
589
532
  const domain = argv.isLocalFileScan ? path.basename(argv.url) : new URL(argv.url).hostname;
533
+
590
534
  const sanitisedLabel = customFlowLabel ? `_${customFlowLabel.replaceAll(' ', '_')}` : '';
591
535
  let resultFilename: string;
592
536
  const randomThreeDigitNumber = randomThreeDigitNumberString();
593
- if (process.env.OOBEE_VERBOSE) {
594
- resultFilename = `${date}_${time}${sanitisedLabel}_${domain}_${randomThreeDigitNumber}`;
595
- } else {
596
- resultFilename = `${date}_${time}${sanitisedLabel}_${domain}`;
537
+ resultFilename = `${date}_${time}${sanitisedLabel}_${domain}_${randomThreeDigitNumber}`;
538
+
539
+ // Set exported directory
540
+ if (exportDirectory) {
541
+ constants.exportDirectory = path.join(exportDirectory, resultFilename);
597
542
  }
543
+
544
+ // Creating the playwrightDeviceDetailObject
545
+ deviceChosen = customDevice === 'Desktop' || customDevice === 'Mobile' ? customDevice : deviceChosen;
546
+
547
+ const playwrightDeviceDetailsObject = getPlaywrightDeviceDetailsObject(
548
+ deviceChosen,
549
+ customDevice,
550
+ viewportWidth,
551
+ );
552
+
553
+ const { browserToRun: resolvedBrowser, clonedBrowserDataDir } = getBrowserToRun(browserToRun, true, resultFilename);
554
+ browserToRun = resolvedBrowser;
555
+
556
+ const resolvedUserDataDirectory = getClonedProfilesWithRandomToken(browserToRun, resultFilename);
598
557
 
599
558
  if (followRobots) {
600
559
  constants.robotsTxtUrls = {};
601
- await getUrlsFromRobotsTxt(url, browserToRun);
560
+ await getUrlsFromRobotsTxt(url, browserToRun, resolvedUserDataDirectory, extraHTTPHeaders);
602
561
  }
603
562
 
563
+ constants.userDataDirectory = resolvedUserDataDirectory;
564
+ constants.randomToken = resultFilename;
565
+
604
566
  return {
605
567
  type: scanner,
606
- url: finalUrl,
568
+ url: url,
607
569
  entryUrl: url,
608
570
  isHeadless: headless,
609
571
  deviceChosen,
@@ -624,8 +586,9 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
624
586
  includeScreenshots: !(additional === 'none'),
625
587
  metadata,
626
588
  followRobots,
627
- extraHTTPHeaders: parseHeaders(header),
589
+ extraHTTPHeaders: extraHTTPHeaders,
628
590
  safeMode,
591
+ userDataDirectory: resolvedUserDataDirectory,
629
592
  zip,
630
593
  ruleset,
631
594
  generateJsonFiles,
@@ -633,7 +596,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
633
596
  };
634
597
  };
635
598
 
636
- export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string): Promise<void> => {
599
+ export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string, userDataDirectory: string, extraHTTPHeaders: Record<string, string>): Promise<void> => {
637
600
  if (!constants.robotsTxtUrls) return;
638
601
 
639
602
  const domain = new URL(url).origin;
@@ -642,22 +605,18 @@ export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string): P
642
605
 
643
606
  let robotsTxt: string;
644
607
  try {
645
- if (proxy) {
646
- robotsTxt = await getRobotsTxtViaPlaywright(robotsUrl, browserToRun);
647
- } else {
648
- robotsTxt = await getRobotsTxtViaAxios(robotsUrl);
649
- }
608
+ robotsTxt = await getRobotsTxtViaPlaywright(robotsUrl, browserToRun, userDataDirectory, extraHTTPHeaders);
609
+ consoleLogger.info(`Fetched robots.txt from ${robotsUrl}`);
650
610
  } catch (e) {
651
611
  // if robots.txt is not found, do nothing
612
+ consoleLogger.info(`Unable to fetch robots.txt from ${robotsUrl}`);
652
613
  }
653
- console.log('robotsTxt', robotsTxt);
614
+
654
615
  if (!robotsTxt) {
655
616
  constants.robotsTxtUrls[domain] = {};
656
617
  return;
657
618
  }
658
-
659
- console.log('Found robots.txt: ', robotsUrl);
660
-
619
+
661
620
  const lines = robotsTxt.split(/\r?\n/);
662
621
  let shouldCapture = false;
663
622
  const disallowedUrls = [];
@@ -705,30 +664,32 @@ export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string): P
705
664
  constants.robotsTxtUrls[domain] = { disallowedUrls, allowedUrls };
706
665
  };
707
666
 
708
- const getRobotsTxtViaPlaywright = async (robotsUrl: string, browser: string): Promise<string> => {
709
- const browserContext = await constants.launcher.launchPersistentContext('', {
667
+ const getRobotsTxtViaPlaywright = async (robotsUrl: string, browser: string, userDataDirectory: string, extraHTTPHeaders: Record<string, string>): Promise<string> => {
668
+
669
+ let robotsDataDir = '';
670
+ // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
671
+ if (process.env.CRAWLEE_HEADLESS === '1') {
672
+ // Create robots own user data directory else SingletonLock: File exists (17) with crawlDomain or crawlSitemap's own browser
673
+ const robotsDataDir = path.join(userDataDirectory, 'robots');
674
+ if (!fs.existsSync(robotsDataDir)) {
675
+ fs.mkdirSync(robotsDataDir, { recursive: true });
676
+ }
677
+ }
678
+
679
+ const browserContext = await constants.launcher.launchPersistentContext(robotsDataDir, {
710
680
  ...getPlaywrightLaunchOptions(browser),
681
+ ...(extraHTTPHeaders && { extraHTTPHeaders }),
711
682
  });
712
683
 
684
+ register(browserContext);
685
+
713
686
  const page = await browserContext.newPage();
714
- await page.goto(robotsUrl, { waitUntil: 'networkidle', timeout: 30000 });
715
687
 
688
+ await page.goto(robotsUrl, { waitUntil: 'networkidle', timeout: 30000 });
716
689
  const robotsTxt: string | null = await page.evaluate(() => document.body.textContent);
717
690
  return robotsTxt;
718
691
  };
719
692
 
720
- const getRobotsTxtViaAxios = async (robotsUrl: string): Promise<string> => {
721
- const instance = axios.create({
722
- httpsAgent: new https.Agent({
723
- rejectUnauthorized: false,
724
- keepAlive: true,
725
- }),
726
- });
727
-
728
- const robotsTxt = (await (await instance.get(robotsUrl, { timeout: 2000 })).data) as string;
729
- return robotsTxt;
730
- };
731
-
732
693
  export const isDisallowedInRobotsTxt = (url: string): boolean => {
733
694
  if (!constants.robotsTxtUrls) return;
734
695
 
@@ -760,8 +721,7 @@ export const getLinksFromSitemap = async (
760
721
  userDataDirectory: string,
761
722
  userUrlInput: string,
762
723
  isIntelligent: boolean,
763
- username: string,
764
- password: string,
724
+ extraHTTPHeaders: Record<string, string>,
765
725
  ) => {
766
726
  const scannedSitemaps = new Set<string>();
767
727
  const urls: Record<string, Request> = {}; // dictionary of requests to urls to be scanned
@@ -772,11 +732,6 @@ export const getLinksFromSitemap = async (
772
732
  if (!url) return;
773
733
  if (isDisallowedInRobotsTxt(url)) return;
774
734
 
775
- // add basic auth credentials to the URL
776
- username !== '' && password !== ''
777
- ? (url = addBasicAuthCredentials(url, username, password))
778
- : url;
779
-
780
735
  url = convertPathToLocalFile(url);
781
736
 
782
737
  let request;
@@ -791,13 +746,6 @@ export const getLinksFromSitemap = async (
791
746
  urls[url] = request;
792
747
  };
793
748
 
794
- const addBasicAuthCredentials = (url: string, username: string, password: string) => {
795
- const urlObject = new URL(url);
796
- urlObject.username = username;
797
- urlObject.password = password;
798
- return urlObject.toString();
799
- };
800
-
801
749
  const calculateCloseness = (sitemapUrl: string) => {
802
750
  // Remove 'http://', 'https://', and 'www.' prefixes from the URLs
803
751
  const normalizedSitemapUrl = sitemapUrl.replace(/^(https?:\/\/)?(www\.)?/, '');
@@ -868,16 +816,10 @@ export const getLinksFromSitemap = async (
868
816
  finalUserDataDirectory = '';
869
817
  }
870
818
 
871
- const fetchUrls = async (url: string) => {
819
+ const fetchUrls = async (url: string, extraHTTPHeaders: Record<string, string>) => {
872
820
  let data;
873
821
  let sitemapType;
874
- let isBasicAuth = false;
875
-
876
- let username = '';
877
- let password = '';
878
-
879
- let parsedUrl;
880
-
822
+
881
823
  if (scannedSitemaps.has(url)) {
882
824
  // Skip processing if the sitemap has already been scanned
883
825
  return;
@@ -893,17 +835,9 @@ export const getLinksFromSitemap = async (
893
835
  if (!fs.existsSync(url)) {
894
836
  return;
895
837
  }
896
- parsedUrl = url;
838
+
897
839
  } else if (isValidHttpUrl(url)) {
898
- parsedUrl = new URL(url);
899
-
900
- if (parsedUrl.username !== '' && parsedUrl.password !== '') {
901
- isBasicAuth = true;
902
- username = decodeURIComponent(parsedUrl.username);
903
- password = decodeURIComponent(parsedUrl.password);
904
- parsedUrl.username = '';
905
- parsedUrl.password = '';
906
- }
840
+ // Do nothing, url is valid
907
841
  } else {
908
842
  printMessage([`Invalid Url/Filepath: ${url}`], messageOptions);
909
843
  return;
@@ -915,12 +849,18 @@ export const getLinksFromSitemap = async (
915
849
  {
916
850
  ...getPlaywrightLaunchOptions(browser),
917
851
  // Not necessary to parse http_credentials as I am parsing it directly in URL
852
+ // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
853
+ ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
854
+ ...(extraHTTPHeaders && { extraHTTPHeaders }),
918
855
  },
919
856
  );
920
857
 
858
+ register(browserContext);
921
859
  const page = await browserContext.newPage();
860
+
922
861
  await page.goto(url, { waitUntil: 'networkidle', timeout: 60000 });
923
- if (constants.launcher === webkit) {
862
+
863
+ if (await page.locator('body').count() > 0) {
924
864
  data = await page.locator('body').innerText();
925
865
  } else {
926
866
  const urlSet = page.locator('urlset');
@@ -948,35 +888,14 @@ export const getLinksFromSitemap = async (
948
888
  addToUrlList(url);
949
889
  return;
950
890
  }
951
- if (proxy) {
952
- await getDataUsingPlaywright();
953
- } else {
954
- try {
955
- const instance = axios.create({
956
- httpsAgent: new https.Agent({
957
- rejectUnauthorized: false,
958
- keepAlive: true,
959
- }),
960
- auth: {
961
- username,
962
- password,
963
- },
964
- });
965
- try {
966
- data = await (await instance.get(url, { timeout: 80000 })).data;
967
- } catch {
968
- return; // to skip the error
969
- }
970
- } catch (error) {
971
- if (error.code === 'ECONNABORTED') {
972
- await getDataUsingPlaywright();
973
- }
974
- }
975
- }
891
+
892
+ await getDataUsingPlaywright();
893
+
976
894
  } else {
977
895
  url = convertLocalFileToPath(url);
978
896
  data = fs.readFileSync(url, 'utf8');
979
897
  }
898
+
980
899
  const $ = cheerio.load(data, { xml: true });
981
900
 
982
901
  // This case is when the document is not an XML format document
@@ -1012,7 +931,7 @@ export const getLinksFromSitemap = async (
1012
931
  break;
1013
932
  }
1014
933
  if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) {
1015
- await fetchUrls(childSitemapUrlText); // Recursive call for nested sitemaps
934
+ await fetchUrls(childSitemapUrlText, extraHTTPHeaders); // Recursive call for nested sitemaps
1016
935
  } else {
1017
936
  addToUrlList(childSitemapUrlText); // Add regular URLs to the list
1018
937
  }
@@ -1037,7 +956,7 @@ export const getLinksFromSitemap = async (
1037
956
  };
1038
957
 
1039
958
  try {
1040
- await fetchUrls(sitemapUrl);
959
+ await fetchUrls(sitemapUrl, extraHTTPHeaders);
1041
960
  } catch (e) {
1042
961
  consoleLogger.error(e);
1043
962
  }
@@ -1086,20 +1005,26 @@ export const validName = (name: string) => {
1086
1005
  * @returns object consisting of browser to run and cloned data directory
1087
1006
  */
1088
1007
  export const getBrowserToRun = (
1089
- preferredBrowser: BrowserTypes,
1008
+ preferredBrowser?: BrowserTypes,
1090
1009
  isCli = false,
1010
+ randomToken?: string
1091
1011
  ): { browserToRun: BrowserTypes; clonedBrowserDataDir: string } => {
1012
+
1013
+ if (!randomToken) {
1014
+ randomToken = '';
1015
+ }
1016
+
1092
1017
  const platform = os.platform();
1093
1018
 
1094
1019
  // Prioritise Chrome on Windows and Mac platforms if user does not specify a browser
1095
1020
  if (!preferredBrowser && (os.platform() === 'win32' || os.platform() === 'darwin')) {
1096
1021
  preferredBrowser = BrowserTypes.CHROME;
1022
+ } else {
1023
+ printMessage([`Preferred browser ${preferredBrowser}`], messageOptions);
1097
1024
  }
1098
1025
 
1099
- printMessage([`Preferred browser ${preferredBrowser}`], messageOptions);
1100
-
1101
1026
  if (preferredBrowser === BrowserTypes.CHROME) {
1102
- const chromeData = getChromeData();
1027
+ const chromeData = getChromeData(randomToken);
1103
1028
  if (chromeData) return chromeData;
1104
1029
 
1105
1030
  if (platform === 'darwin') {
@@ -1113,7 +1038,7 @@ export const getBrowserToRun = (
1113
1038
  if (isCli)
1114
1039
  printMessage(['Unable to use Chrome, falling back to Edge browser...'], messageOptions);
1115
1040
 
1116
- const edgeData = getEdgeData();
1041
+ const edgeData = getEdgeData(randomToken);
1117
1042
  if (edgeData) return edgeData;
1118
1043
 
1119
1044
  if (isCli)
@@ -1125,12 +1050,12 @@ export const getBrowserToRun = (
1125
1050
  printMessage(['Unable to use Chrome, falling back to Chromium browser...'], messageOptions);
1126
1051
  }
1127
1052
  } else if (preferredBrowser === BrowserTypes.EDGE) {
1128
- const edgeData = getEdgeData();
1053
+ const edgeData = getEdgeData(randomToken);
1129
1054
  if (edgeData) return edgeData;
1130
1055
 
1131
1056
  if (isCli)
1132
1057
  printMessage(['Unable to use Edge, falling back to Chrome browser...'], messageOptions);
1133
- const chromeData = getChromeData();
1058
+ const chromeData = getChromeData(randomToken);
1134
1059
  if (chromeData) return chromeData;
1135
1060
 
1136
1061
  if (platform === 'darwin') {
@@ -1161,7 +1086,7 @@ export const getBrowserToRun = (
1161
1086
  // defaults to chromium
1162
1087
  return {
1163
1088
  browserToRun: BrowserTypes.CHROMIUM,
1164
- clonedBrowserDataDir: cloneChromiumProfiles(),
1089
+ clonedBrowserDataDir: cloneChromiumProfiles(randomToken),
1165
1090
  };
1166
1091
  };
1167
1092
 
@@ -1181,9 +1106,9 @@ export const getClonedProfilesWithRandomToken = (browser: string, randomToken: s
1181
1106
  return cloneChromiumProfiles(randomToken);
1182
1107
  };
1183
1108
 
1184
- export const getChromeData = () => {
1109
+ export const getChromeData = (randomToken: string) => {
1185
1110
  const browserDataDir = getDefaultChromeDataDir();
1186
- const clonedBrowserDataDir = cloneChromeProfiles();
1111
+ const clonedBrowserDataDir = cloneChromeProfiles(randomToken);
1187
1112
  if (browserDataDir && clonedBrowserDataDir) {
1188
1113
  const browserToRun = BrowserTypes.CHROME;
1189
1114
  return { browserToRun, clonedBrowserDataDir };
@@ -1191,9 +1116,9 @@ export const getChromeData = () => {
1191
1116
  return null;
1192
1117
  };
1193
1118
 
1194
- export const getEdgeData = () => {
1119
+ export const getEdgeData = (randomToken: string) => {
1195
1120
  const browserDataDir = getDefaultEdgeDataDir();
1196
- const clonedBrowserDataDir = cloneEdgeProfiles();
1121
+ const clonedBrowserDataDir = cloneEdgeProfiles(randomToken);
1197
1122
  if (browserDataDir && clonedBrowserDataDir) {
1198
1123
  const browserToRun = BrowserTypes.EDGE;
1199
1124
  return { browserToRun, clonedBrowserDataDir };
@@ -1397,7 +1322,7 @@ const cloneLocalStateFile = (options: GlobOptionsWithFileTypesFalse, destDir: st
1397
1322
  * @param {string} randomToken - random token to append to the cloned directory
1398
1323
  * @returns {string} cloned data directory, null if any of the sub files failed to copy
1399
1324
  */
1400
- export const cloneChromeProfiles = (randomToken?: string): string => {
1325
+ export const cloneChromeProfiles = (randomToken: string): string => {
1401
1326
  const baseDir = getDefaultChromeDataDir();
1402
1327
 
1403
1328
  if (!baseDir) {
@@ -1406,18 +1331,10 @@ export const cloneChromeProfiles = (randomToken?: string): string => {
1406
1331
 
1407
1332
  let destDir;
1408
1333
 
1409
- if (randomToken) {
1410
- destDir = path.join(baseDir, `oobee-${randomToken}`);
1411
- } else {
1412
- destDir = path.join(baseDir, 'oobee');
1413
- }
1334
+ destDir = path.join(baseDir, `oobee-${randomToken}`);
1414
1335
 
1415
1336
  if (fs.existsSync(destDir)) {
1416
- if (process.env.OOBEE_VERBOSE) {
1417
1337
  deleteClonedChromeProfiles(randomToken);
1418
- } else {
1419
- deleteClonedChromeProfiles();
1420
- }
1421
1338
  }
1422
1339
 
1423
1340
  if (!fs.existsSync(destDir)) {
@@ -1435,10 +1352,13 @@ export const cloneChromeProfiles = (randomToken?: string): string => {
1435
1352
  return destDir;
1436
1353
  }
1437
1354
 
1438
- return null;
1355
+ consoleLogger.error('Failed to clone Chrome profiles. You may be logged out of your accounts.');
1356
+
1357
+ // For future reference, return a null instead to halt the scan
1358
+ return destDir;
1439
1359
  };
1440
1360
 
1441
- export const cloneChromiumProfiles = (randomToken?: string): string => {
1361
+ export const cloneChromiumProfiles = (randomToken: string): string => {
1442
1362
  const baseDir = getDefaultChromiumDataDir();
1443
1363
 
1444
1364
  if (!baseDir) {
@@ -1447,10 +1367,10 @@ export const cloneChromiumProfiles = (randomToken?: string): string => {
1447
1367
 
1448
1368
  let destDir: string;
1449
1369
 
1450
- if (randomToken) {
1451
- destDir = path.join(baseDir, `oobee-${randomToken}`);
1452
- } else {
1453
- destDir = path.join(baseDir, 'oobee');
1370
+ destDir = path.join(baseDir, `oobee-${randomToken}`);
1371
+
1372
+ if (fs.existsSync(destDir)) {
1373
+ deleteClonedChromiumProfiles(randomToken);
1454
1374
  }
1455
1375
 
1456
1376
  if (!fs.existsSync(destDir)) {
@@ -1468,7 +1388,7 @@ export const cloneChromiumProfiles = (randomToken?: string): string => {
1468
1388
  * @param {string} randomToken - random token to append to the cloned directory
1469
1389
  * @returns {string} cloned data directory, null if any of the sub files failed to copy
1470
1390
  */
1471
- export const cloneEdgeProfiles = (randomToken?: string): string => {
1391
+ export const cloneEdgeProfiles = (randomToken: string): string => {
1472
1392
  const baseDir = getDefaultEdgeDataDir();
1473
1393
 
1474
1394
  if (!baseDir) {
@@ -1477,18 +1397,10 @@ export const cloneEdgeProfiles = (randomToken?: string): string => {
1477
1397
 
1478
1398
  let destDir;
1479
1399
 
1480
- if (randomToken) {
1481
- destDir = path.join(baseDir, `oobee-${randomToken}`);
1482
- } else {
1483
- destDir = path.join(baseDir, 'oobee');
1484
- }
1400
+ destDir = path.join(baseDir, `oobee-${randomToken}`);
1485
1401
 
1486
1402
  if (fs.existsSync(destDir)) {
1487
- if (process.env.OOBEE_VERBOSE) {
1488
1403
  deleteClonedEdgeProfiles(randomToken);
1489
- } else {
1490
- deleteClonedEdgeProfiles();
1491
- }
1492
1404
  }
1493
1405
 
1494
1406
  if (!fs.existsSync(destDir)) {
@@ -1507,10 +1419,13 @@ export const cloneEdgeProfiles = (randomToken?: string): string => {
1507
1419
  return destDir;
1508
1420
  }
1509
1421
 
1510
- return null;
1422
+ consoleLogger.error('Failed to clone Edge profiles. You may be logged out of your accounts.');
1423
+
1424
+ // For future reference, return a null instead to halt the scan
1425
+ return destDir;
1511
1426
  };
1512
1427
 
1513
- export const deleteClonedProfiles = (browser: string, randomToken?: string): void => {
1428
+ export const deleteClonedProfiles = (browser: string, randomToken: string): void => {
1514
1429
  if (browser === BrowserTypes.CHROME) {
1515
1430
  deleteClonedChromeProfiles(randomToken);
1516
1431
  } else if (browser === BrowserTypes.EDGE) {
@@ -1565,9 +1480,7 @@ export const deleteClonedChromeProfiles = (randomToken?: string): void => {
1565
1480
  * @returns null
1566
1481
  */
1567
1482
  export const deleteClonedEdgeProfiles = (randomToken?: string): void => {
1568
- if (process.env.OOBEE_VERBOSE) {
1569
- return;
1570
- }
1483
+
1571
1484
  const baseDir = getDefaultEdgeDataDir();
1572
1485
 
1573
1486
  if (!baseDir) {
@@ -1698,6 +1611,8 @@ export const submitFormViaPlaywright = async (
1698
1611
  },
1699
1612
  );
1700
1613
 
1614
+ register(browserContext);
1615
+
1701
1616
  const page = await browserContext.newPage();
1702
1617
 
1703
1618
  try {
@@ -1716,13 +1631,9 @@ export const submitFormViaPlaywright = async (
1716
1631
  } finally {
1717
1632
  await browserContext.close();
1718
1633
  if (proxy && browserToRun === BrowserTypes.EDGE) {
1719
- if (!process.env.OOBEE_VERBOSE) {
1720
- deleteClonedEdgeProfiles();
1721
- }
1634
+ deleteClonedEdgeProfiles(clonedDir);
1722
1635
  } else if (proxy && browserToRun === BrowserTypes.CHROME) {
1723
- if (!process.env.OOBEE_VERBOSE) {
1724
- deleteClonedChromeProfiles();
1725
- }
1636
+ deleteClonedChromeProfiles(clonedDir);
1726
1637
  }
1727
1638
  }
1728
1639
  };
@@ -1781,7 +1692,9 @@ export const submitForm = async (
1781
1692
  export async function initModifiedUserAgent(
1782
1693
  browser?: string,
1783
1694
  playwrightDeviceDetailsObject?: object,
1695
+ userDataDirectory?: string,
1784
1696
  ) {
1697
+
1785
1698
  const isHeadless = process.env.CRAWLEE_HEADLESS === '1';
1786
1699
 
1787
1700
  // If headless mode is enabled, ensure the headless flag is set.
@@ -1798,7 +1711,13 @@ export async function initModifiedUserAgent(
1798
1711
  };
1799
1712
 
1800
1713
  // Launch a temporary persistent context with an empty userDataDir to mimic your production browser setup.
1801
- const browserContext = await constants.launcher.launchPersistentContext('', launchOptions);
1714
+ const effectiveUserDataDirectory = process.env.CRAWLEE_HEADLESS === '1'
1715
+ ? userDataDirectory
1716
+ : '';
1717
+
1718
+ const browserContext = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, launchOptions);
1719
+ register(browserContext);
1720
+
1802
1721
  const page = await browserContext.newPage();
1803
1722
 
1804
1723
  // Retrieve the default user agent.
@@ -1856,13 +1775,6 @@ export const getPlaywrightLaunchOptions = (browser?: string): LaunchOptions => {
1856
1775
  return options;
1857
1776
  };
1858
1777
 
1859
- export const urlWithoutAuth = (url: string): string => {
1860
- const parsedUrl = new URL(url);
1861
- parsedUrl.username = '';
1862
- parsedUrl.password = '';
1863
- return parsedUrl.toString();
1864
- };
1865
-
1866
1778
  export const waitForPageLoaded = async (page: Page, timeout = 10000) => {
1867
1779
  const OBSERVER_TIMEOUT = timeout; // Ensure observer timeout does not exceed the main timeout
1868
1780
 
@@ -1887,7 +1799,7 @@ export const waitForPageLoaded = async (page: Page, timeout = 10000) => {
1887
1799
 
1888
1800
  let timeout: NodeJS.Timeout;
1889
1801
  let mutationCount = 0;
1890
- const MAX_MUTATIONS = 250;
1802
+ const MAX_MUTATIONS = 500;
1891
1803
  const mutationHash: Record<string, number> = {};
1892
1804
 
1893
1805
  const observer = new MutationObserver(mutationsList => {