@govtechsg/oobee 0.10.58 → 0.10.61

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -119,7 +119,7 @@ export const validateFilePath = (filePath: string, cliDir: string) => {
119
119
 
120
120
  return absolutePath;
121
121
  } catch {
122
- throw new Error(`Please ensure path provided exists: ${absolutePath}`);
122
+ throw new Error(`Please ensure path provided exists and writable: ${absolutePath}`);
123
123
  }
124
124
  };
125
125
 
@@ -277,110 +277,11 @@ export const sanitizeUrlInput = (url: string): { isValid: boolean; url: string }
277
277
  return { isValid: false, url: sanitizeUrl };
278
278
  };
279
279
 
280
- const requestToUrl = async (
281
- url: string,
282
- isCustomFlow: boolean,
283
- extraHTTPHeaders: Record<string, string>,
284
- ) => {
285
- // User-Agent is modified to emulate a browser to handle cases where some sites ban non browser agents, resulting in a 403 error
286
- const res = new RES();
287
- const parsedUrl = new URL(url);
288
- await axios
289
- .get(parsedUrl.href, {
290
- headers: {
291
- ...extraHTTPHeaders,
292
- 'User-Agent': devices['Desktop Chrome HiDPI'].userAgent,
293
- Host: parsedUrl.host,
294
- },
295
- auth: {
296
- username: decodeURIComponent(parsedUrl.username),
297
- password: decodeURIComponent(parsedUrl.password),
298
- },
299
- httpsAgent,
300
- timeout: 5000,
301
- })
302
- .then(async response => {
303
- let redirectUrl = response.request.res.responseUrl;
304
- redirectUrl = new URL(redirectUrl).href;
305
- res.status = constants.urlCheckStatuses.success.code;
306
- let data;
307
- if (typeof response.data === 'string' || response.data instanceof String) {
308
- data = response.data;
309
- } else if (typeof response.data === 'object' && response.data !== null) {
310
- try {
311
- data = JSON.stringify(response.data);
312
- } catch (error) {
313
- console.log('Error converting object to JSON:', error);
314
- }
315
- } else {
316
- console.log('Unsupported data type:', typeof response.data);
317
- }
318
- const modifiedHTML = data.replace(/<noscript>[\s\S]*?<\/noscript>/gi, '');
319
-
320
- const metaRefreshMatch =
321
- /<meta\s+http-equiv="refresh"\s+content="(?:\d+;)?\s*url=(?:'([^']*)'|"([^"]*)"|([^>]*))"/i.exec(
322
- modifiedHTML,
323
- );
324
-
325
- const hasMetaRefresh = metaRefreshMatch && metaRefreshMatch.length > 1;
326
-
327
- if (redirectUrl != null && (hasMetaRefresh || !isCustomFlow)) {
328
- res.url = redirectUrl;
329
- } else {
330
- res.url = url;
331
- }
332
-
333
- if (hasMetaRefresh) {
334
- let urlOrRelativePath;
335
-
336
- for (let i = 1; i < metaRefreshMatch.length; i++) {
337
- if (metaRefreshMatch[i] !== undefined && metaRefreshMatch[i] !== null) {
338
- urlOrRelativePath = metaRefreshMatch[i];
339
- break; // Stop the loop once the first non-null value is found
340
- }
341
- }
342
-
343
- if (urlOrRelativePath.includes('URL=')) {
344
- res.url = urlOrRelativePath.split('URL=').pop();
345
- } else {
346
- const pathname = res.url.substring(0, res.url.lastIndexOf('/'));
347
- res.url = new URL(urlOrRelativePath, pathname).toString();
348
- }
349
- }
350
-
351
- res.content = response.data;
352
- })
353
- .catch(async error => {
354
- if (error.code === 'ECONNABORTED' || error.code === 'ERR_FR_TOO_MANY_REDIRECTS') {
355
- res.status = constants.urlCheckStatuses.axiosTimeout.code;
356
- } else if (error.response) {
357
- if (error.response.status === 401) {
358
- // enters here if URL is protected by basic auth
359
- res.status = constants.urlCheckStatuses.unauthorised.code;
360
- } else {
361
- // enters here if server responds with a status other than 2xx
362
- // the scan should still proceed even if error codes are received, so that accessibility scans for error pages can be done too
363
- res.status = constants.urlCheckStatuses.success.code;
364
- }
365
- res.url = url;
366
- res.content = error.response.data;
367
- return res;
368
- } else if (error.request) {
369
- // enters here if URL cannot be accessed
370
- res.status = constants.urlCheckStatuses.cannotBeResolved.code;
371
- } else {
372
- res.status = constants.urlCheckStatuses.systemError.code;
373
- }
374
- });
375
- return res;
376
- };
377
-
378
280
  const checkUrlConnectivityWithBrowser = async (
379
281
  url: string,
380
282
  browserToRun: string,
381
283
  clonedDataDir: string,
382
284
  playwrightDeviceDetailsObject: DeviceDescriptor,
383
- isCustomFlow: boolean,
384
285
  extraHTTPHeaders: Record<string, string>,
385
286
  ) => {
386
287
  const res = new RES();
@@ -391,27 +292,18 @@ const checkUrlConnectivityWithBrowser = async (
391
292
  return res;
392
293
  }
393
294
 
394
- let viewport = null;
395
- let userAgent = null;
396
- if ('viewport' in playwrightDeviceDetailsObject) viewport = playwrightDeviceDetailsObject.viewport;
397
- if ('userAgent' in playwrightDeviceDetailsObject) userAgent = playwrightDeviceDetailsObject.userAgent;
398
-
399
295
  // Ensure Accept header for non-html content fallback
400
296
  extraHTTPHeaders['Accept'] ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
401
297
 
402
- const launchOptions = getPlaywrightLaunchOptions(browserToRun);
403
- const browserContextLaunchOptions = {
404
- ...launchOptions,
405
- args: [...launchOptions.args, '--headless=new'],
406
- };
298
+ await initModifiedUserAgent(browserToRun, playwrightDeviceDetailsObject, clonedDataDir);
407
299
 
408
300
  let browserContext;
409
301
  try {
410
302
  browserContext = await constants.launcher.launchPersistentContext(clonedDataDir, {
411
- ...browserContextLaunchOptions,
412
- ...(viewport && { viewport }),
413
- ...(userAgent && { userAgent }),
414
303
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
304
+ ignoreHTTPSErrors: true,
305
+ ...getPlaywrightLaunchOptions(browserToRun),
306
+ ...playwrightDeviceDetailsObject,
415
307
  });
416
308
  } catch (err) {
417
309
  printMessage([`Unable to launch browser\n${err}`], messageOptions);
@@ -422,48 +314,77 @@ const checkUrlConnectivityWithBrowser = async (
422
314
  try {
423
315
  const page = await browserContext.newPage();
424
316
 
425
- // Skip Playwright for PDF (use raw request instead)
426
- if (isUrlPdf(url)) {
427
- return await requestToUrl(url, false, extraHTTPHeaders);
317
+ // STEP 1: HEAD request before actual navigation
318
+ let statusCode = 0;
319
+ let contentType = '';
320
+ let disposition = '';
321
+
322
+ try {
323
+ const headResp = await page.request.fetch(url, {
324
+ method: 'HEAD',
325
+ headers: extraHTTPHeaders,
326
+ });
327
+
328
+ statusCode = headResp.status();
329
+ contentType = headResp.headers()['content-type'] || '';
330
+ disposition = headResp.headers()['content-disposition'] || '';
331
+
332
+ // If it looks like a downloadable file, skip goto entirely
333
+ if (
334
+ contentType.includes('pdf') ||
335
+ contentType.includes('octet-stream') ||
336
+ disposition.includes('attachment')
337
+ ) {
338
+ res.status = statusCode === 401
339
+ ? constants.urlCheckStatuses.unauthorised.code
340
+ : constants.urlCheckStatuses.success.code;
341
+
342
+ res.httpStatus = statusCode;
343
+ res.url = url;
344
+ res.content = ''; // Don't try to render binary
345
+
346
+ await browserContext.close();
347
+ return res;
348
+ }
349
+ } catch (e) {
350
+ consoleLogger.info(`HEAD request failed: ${e.message}`);
351
+ res.status = constants.urlCheckStatuses.systemError.code;
352
+ await browserContext.close();
353
+ return res;
428
354
  }
429
355
 
356
+ // STEP 2: Safe to proceed with navigation
430
357
  const response = await page.goto(url, {
431
358
  timeout: 30000,
432
- ...(proxy && { waitUntil: 'commit' }),
359
+ waitUntil: 'commit', // Don't wait for full load
433
360
  });
434
361
 
435
- try {
436
- await page.waitForLoadState('networkidle', { timeout: 10000 });
437
- } catch {
438
- consoleLogger.info('Unable to detect networkidle');
439
- }
440
-
441
- const status = response.status();
442
- res.status = status === 401
362
+ const finalStatus = statusCode || (response?.status?.() ?? 0);
363
+ res.status = finalStatus === 401
443
364
  ? constants.urlCheckStatuses.unauthorised.code
444
365
  : constants.urlCheckStatuses.success.code;
445
366
 
446
- // Store the status code
447
- res.httpStatus = response?.status?.() ?? 0;
367
+ res.httpStatus = finalStatus;
368
+ res.url = page.url();
448
369
 
449
- // Store final navigated URL
450
- res.url = isCustomFlow ? url : page.url();
451
-
452
- // Check content type to determine how to extract content
453
- const contentType = response.headers()['content-type'] || '';
454
-
455
- if (contentType.includes('xml') || res.url.endsWith('.xml')) {
456
- // Fetch raw content to avoid Playwright's HTML-wrapped <pre> behavior
457
- const rawResponse = await requestToUrl(res.url, true, extraHTTPHeaders);
458
- res.content = rawResponse.content;
370
+ contentType = response?.headers()?.['content-type'] || '';
371
+ if (contentType.includes('pdf') || contentType.includes('octet-stream')) {
372
+ res.content = ''; // Avoid triggering render/download
459
373
  } else {
460
- res.content = await page.content(); // rendered DOM
374
+ try {
375
+ await page.waitForLoadState('networkidle', { timeout: 10000 });
376
+ } catch {
377
+ consoleLogger.info('Unable to detect networkidle');
378
+ }
379
+
380
+ res.content = await page.content();
461
381
  }
462
382
 
463
383
  } catch (error) {
464
384
  if (error.message.includes('net::ERR_INVALID_AUTH_CREDENTIALS')) {
465
385
  res.status = constants.urlCheckStatuses.unauthorised.code;
466
386
  } else {
387
+ console.log(error);
467
388
  res.status = constants.urlCheckStatuses.systemError.code;
468
389
  }
469
390
  } finally {
@@ -501,7 +422,6 @@ export const checkUrl = async (
501
422
  browser: string,
502
423
  clonedDataDir: string,
503
424
  playwrightDeviceDetailsObject: DeviceDescriptor,
504
- isCustomFlow: boolean,
505
425
  extraHTTPHeaders: Record<string, string>,
506
426
  ) => {
507
427
  const res = await checkUrlConnectivityWithBrowser(
@@ -509,7 +429,6 @@ export const checkUrl = async (
509
429
  browser,
510
430
  clonedDataDir,
511
431
  playwrightDeviceDetailsObject,
512
- isCustomFlow,
513
432
  extraHTTPHeaders,
514
433
  );
515
434
 
@@ -555,18 +474,16 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
555
474
  if (isEmptyObject(argv)) {
556
475
  throw Error('No inputs should be provided');
557
476
  }
558
- const {
477
+ let {
559
478
  scanner,
560
479
  headless,
561
480
  url,
562
481
  deviceChosen,
563
482
  customDevice,
564
483
  viewportWidth,
565
- playwrightDeviceDetailsObject,
566
484
  maxpages,
567
485
  strategy,
568
486
  isLocalFileScan,
569
- finalUrl,
570
487
  browserToRun,
571
488
  nameEmail,
572
489
  customFlowLabel,
@@ -578,32 +495,72 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
578
495
  followRobots,
579
496
  header,
580
497
  safeMode,
498
+ exportDirectory,
581
499
  zip,
582
500
  ruleset,
583
501
  generateJsonFiles,
584
502
  scanDuration
585
503
  } = argv;
586
504
 
505
+ const extraHTTPHeaders = parseHeaders(header);
506
+
507
+ // Set default username and password for basic auth
508
+ let username = '';
509
+ let password = '';
510
+
511
+ // Remove credentials from URL if not a local file scan
512
+ url = argv.isLocalFileScan
513
+ ? url
514
+ : (() => {
515
+ const temp = new URL(url);
516
+ username = temp.username;
517
+ password = temp.password;
518
+
519
+ if (username !== '' || password !== '') {
520
+ extraHTTPHeaders['Authorization'] = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
521
+ }
522
+
523
+ temp.username = '';
524
+ temp.password = '';
525
+ return temp.toString();
526
+ })();
527
+
587
528
  // construct filename for scan results
588
529
  const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
589
530
  const domain = argv.isLocalFileScan ? path.basename(argv.url) : new URL(argv.url).hostname;
531
+
590
532
  const sanitisedLabel = customFlowLabel ? `_${customFlowLabel.replaceAll(' ', '_')}` : '';
591
533
  let resultFilename: string;
592
534
  const randomThreeDigitNumber = randomThreeDigitNumberString();
593
- if (process.env.OOBEE_VERBOSE) {
594
- resultFilename = `${date}_${time}${sanitisedLabel}_${domain}_${randomThreeDigitNumber}`;
595
- } else {
596
- resultFilename = `${date}_${time}${sanitisedLabel}_${domain}`;
535
+ resultFilename = `${date}_${time}${sanitisedLabel}_${domain}_${randomThreeDigitNumber}`;
536
+
537
+ // Set exported directory
538
+ if (exportDirectory) {
539
+ constants.exportDirectory = path.join(exportDirectory, resultFilename);
597
540
  }
541
+
542
+ // Creating the playwrightDeviceDetailObject
543
+ deviceChosen = customDevice === 'Desktop' || customDevice === 'Mobile' ? customDevice : deviceChosen;
544
+
545
+ const playwrightDeviceDetailsObject = getPlaywrightDeviceDetailsObject(
546
+ deviceChosen,
547
+ customDevice,
548
+ viewportWidth,
549
+ );
550
+
551
+ const { browserToRun: resolvedBrowser, clonedBrowserDataDir } = getBrowserToRun(browserToRun, true, resultFilename);
552
+ browserToRun = resolvedBrowser;
553
+
554
+ const resolvedUserDataDirectory = getClonedProfilesWithRandomToken(browserToRun, resultFilename);
598
555
 
599
556
  if (followRobots) {
600
557
  constants.robotsTxtUrls = {};
601
- await getUrlsFromRobotsTxt(url, browserToRun);
558
+ await getUrlsFromRobotsTxt(url, browserToRun, resolvedUserDataDirectory, extraHTTPHeaders);
602
559
  }
603
560
 
604
561
  return {
605
562
  type: scanner,
606
- url: finalUrl,
563
+ url: url,
607
564
  entryUrl: url,
608
565
  isHeadless: headless,
609
566
  deviceChosen,
@@ -624,8 +581,9 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
624
581
  includeScreenshots: !(additional === 'none'),
625
582
  metadata,
626
583
  followRobots,
627
- extraHTTPHeaders: parseHeaders(header),
584
+ extraHTTPHeaders: extraHTTPHeaders,
628
585
  safeMode,
586
+ userDataDirectory: resolvedUserDataDirectory,
629
587
  zip,
630
588
  ruleset,
631
589
  generateJsonFiles,
@@ -633,7 +591,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
633
591
  };
634
592
  };
635
593
 
636
- export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string): Promise<void> => {
594
+ export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string, userDataDirectory: string, extraHTTPHeaders: Record<string, string>): Promise<void> => {
637
595
  if (!constants.robotsTxtUrls) return;
638
596
 
639
597
  const domain = new URL(url).origin;
@@ -642,22 +600,18 @@ export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string): P
642
600
 
643
601
  let robotsTxt: string;
644
602
  try {
645
- if (proxy) {
646
- robotsTxt = await getRobotsTxtViaPlaywright(robotsUrl, browserToRun);
647
- } else {
648
- robotsTxt = await getRobotsTxtViaAxios(robotsUrl);
649
- }
603
+ robotsTxt = await getRobotsTxtViaPlaywright(robotsUrl, browserToRun, userDataDirectory, extraHTTPHeaders);
604
+ consoleLogger.info(`Fetched robots.txt from ${robotsUrl}`);
650
605
  } catch (e) {
651
606
  // if robots.txt is not found, do nothing
607
+ consoleLogger.info(`Unable to fetch robots.txt from ${robotsUrl}`);
652
608
  }
653
- console.log('robotsTxt', robotsTxt);
609
+
654
610
  if (!robotsTxt) {
655
611
  constants.robotsTxtUrls[domain] = {};
656
612
  return;
657
613
  }
658
-
659
- console.log('Found robots.txt: ', robotsUrl);
660
-
614
+
661
615
  const lines = robotsTxt.split(/\r?\n/);
662
616
  let shouldCapture = false;
663
617
  const disallowedUrls = [];
@@ -705,30 +659,30 @@ export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string): P
705
659
  constants.robotsTxtUrls[domain] = { disallowedUrls, allowedUrls };
706
660
  };
707
661
 
708
- const getRobotsTxtViaPlaywright = async (robotsUrl: string, browser: string): Promise<string> => {
709
- const browserContext = await constants.launcher.launchPersistentContext('', {
662
+ const getRobotsTxtViaPlaywright = async (robotsUrl: string, browser: string, userDataDirectory: string, extraHTTPHeaders: Record<string, string>): Promise<string> => {
663
+
664
+ let robotsDataDir = '';
665
+ // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
666
+ if (process.env.CRAWLEE_HEADLESS === '1') {
667
+ // Create robots own user data directory else SingletonLock: File exists (17) with crawlDomain or crawlSitemap's own browser
668
+ const robotsDataDir = path.join(userDataDirectory, 'robots');
669
+ if (!fs.existsSync(robotsDataDir)) {
670
+ fs.mkdirSync(robotsDataDir, { recursive: true });
671
+ }
672
+ }
673
+
674
+ const browserContext = await constants.launcher.launchPersistentContext(robotsDataDir, {
710
675
  ...getPlaywrightLaunchOptions(browser),
676
+ ...(extraHTTPHeaders && { extraHTTPHeaders }),
711
677
  });
712
678
 
713
679
  const page = await browserContext.newPage();
714
- await page.goto(robotsUrl, { waitUntil: 'networkidle', timeout: 30000 });
715
680
 
681
+ await page.goto(robotsUrl, { waitUntil: 'networkidle', timeout: 30000 });
716
682
  const robotsTxt: string | null = await page.evaluate(() => document.body.textContent);
717
683
  return robotsTxt;
718
684
  };
719
685
 
720
- const getRobotsTxtViaAxios = async (robotsUrl: string): Promise<string> => {
721
- const instance = axios.create({
722
- httpsAgent: new https.Agent({
723
- rejectUnauthorized: false,
724
- keepAlive: true,
725
- }),
726
- });
727
-
728
- const robotsTxt = (await (await instance.get(robotsUrl, { timeout: 2000 })).data) as string;
729
- return robotsTxt;
730
- };
731
-
732
686
  export const isDisallowedInRobotsTxt = (url: string): boolean => {
733
687
  if (!constants.robotsTxtUrls) return;
734
688
 
@@ -760,8 +714,7 @@ export const getLinksFromSitemap = async (
760
714
  userDataDirectory: string,
761
715
  userUrlInput: string,
762
716
  isIntelligent: boolean,
763
- username: string,
764
- password: string,
717
+ extraHTTPHeaders: Record<string, string>,
765
718
  ) => {
766
719
  const scannedSitemaps = new Set<string>();
767
720
  const urls: Record<string, Request> = {}; // dictionary of requests to urls to be scanned
@@ -772,11 +725,6 @@ export const getLinksFromSitemap = async (
772
725
  if (!url) return;
773
726
  if (isDisallowedInRobotsTxt(url)) return;
774
727
 
775
- // add basic auth credentials to the URL
776
- username !== '' && password !== ''
777
- ? (url = addBasicAuthCredentials(url, username, password))
778
- : url;
779
-
780
728
  url = convertPathToLocalFile(url);
781
729
 
782
730
  let request;
@@ -791,13 +739,6 @@ export const getLinksFromSitemap = async (
791
739
  urls[url] = request;
792
740
  };
793
741
 
794
- const addBasicAuthCredentials = (url: string, username: string, password: string) => {
795
- const urlObject = new URL(url);
796
- urlObject.username = username;
797
- urlObject.password = password;
798
- return urlObject.toString();
799
- };
800
-
801
742
  const calculateCloseness = (sitemapUrl: string) => {
802
743
  // Remove 'http://', 'https://', and 'www.' prefixes from the URLs
803
744
  const normalizedSitemapUrl = sitemapUrl.replace(/^(https?:\/\/)?(www\.)?/, '');
@@ -868,16 +809,10 @@ export const getLinksFromSitemap = async (
868
809
  finalUserDataDirectory = '';
869
810
  }
870
811
 
871
- const fetchUrls = async (url: string) => {
812
+ const fetchUrls = async (url: string, extraHTTPHeaders: Record<string, string>) => {
872
813
  let data;
873
814
  let sitemapType;
874
- let isBasicAuth = false;
875
-
876
- let username = '';
877
- let password = '';
878
-
879
- let parsedUrl;
880
-
815
+
881
816
  if (scannedSitemaps.has(url)) {
882
817
  // Skip processing if the sitemap has already been scanned
883
818
  return;
@@ -893,17 +828,9 @@ export const getLinksFromSitemap = async (
893
828
  if (!fs.existsSync(url)) {
894
829
  return;
895
830
  }
896
- parsedUrl = url;
831
+
897
832
  } else if (isValidHttpUrl(url)) {
898
- parsedUrl = new URL(url);
899
-
900
- if (parsedUrl.username !== '' && parsedUrl.password !== '') {
901
- isBasicAuth = true;
902
- username = decodeURIComponent(parsedUrl.username);
903
- password = decodeURIComponent(parsedUrl.password);
904
- parsedUrl.username = '';
905
- parsedUrl.password = '';
906
- }
833
+ // Do nothing, url is valid
907
834
  } else {
908
835
  printMessage([`Invalid Url/Filepath: ${url}`], messageOptions);
909
836
  return;
@@ -915,12 +842,17 @@ export const getLinksFromSitemap = async (
915
842
  {
916
843
  ...getPlaywrightLaunchOptions(browser),
917
844
  // Not necessary to parse http_credentials as I am parsing it directly in URL
845
+ // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
846
+ ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
847
+ ...(extraHTTPHeaders && { extraHTTPHeaders }),
918
848
  },
919
849
  );
920
850
 
921
851
  const page = await browserContext.newPage();
852
+
922
853
  await page.goto(url, { waitUntil: 'networkidle', timeout: 60000 });
923
- if (constants.launcher === webkit) {
854
+
855
+ if (await page.locator('body').count() > 0) {
924
856
  data = await page.locator('body').innerText();
925
857
  } else {
926
858
  const urlSet = page.locator('urlset');
@@ -948,35 +880,14 @@ export const getLinksFromSitemap = async (
948
880
  addToUrlList(url);
949
881
  return;
950
882
  }
951
- if (proxy) {
952
- await getDataUsingPlaywright();
953
- } else {
954
- try {
955
- const instance = axios.create({
956
- httpsAgent: new https.Agent({
957
- rejectUnauthorized: false,
958
- keepAlive: true,
959
- }),
960
- auth: {
961
- username,
962
- password,
963
- },
964
- });
965
- try {
966
- data = await (await instance.get(url, { timeout: 80000 })).data;
967
- } catch {
968
- return; // to skip the error
969
- }
970
- } catch (error) {
971
- if (error.code === 'ECONNABORTED') {
972
- await getDataUsingPlaywright();
973
- }
974
- }
975
- }
883
+
884
+ await getDataUsingPlaywright();
885
+
976
886
  } else {
977
887
  url = convertLocalFileToPath(url);
978
888
  data = fs.readFileSync(url, 'utf8');
979
889
  }
890
+
980
891
  const $ = cheerio.load(data, { xml: true });
981
892
 
982
893
  // This case is when the document is not an XML format document
@@ -1012,7 +923,7 @@ export const getLinksFromSitemap = async (
1012
923
  break;
1013
924
  }
1014
925
  if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) {
1015
- await fetchUrls(childSitemapUrlText); // Recursive call for nested sitemaps
926
+ await fetchUrls(childSitemapUrlText, extraHTTPHeaders); // Recursive call for nested sitemaps
1016
927
  } else {
1017
928
  addToUrlList(childSitemapUrlText); // Add regular URLs to the list
1018
929
  }
@@ -1037,7 +948,7 @@ export const getLinksFromSitemap = async (
1037
948
  };
1038
949
 
1039
950
  try {
1040
- await fetchUrls(sitemapUrl);
951
+ await fetchUrls(sitemapUrl, extraHTTPHeaders);
1041
952
  } catch (e) {
1042
953
  consoleLogger.error(e);
1043
954
  }
@@ -1086,20 +997,26 @@ export const validName = (name: string) => {
1086
997
  * @returns object consisting of browser to run and cloned data directory
1087
998
  */
1088
999
  export const getBrowserToRun = (
1089
- preferredBrowser: BrowserTypes,
1000
+ preferredBrowser?: BrowserTypes,
1090
1001
  isCli = false,
1002
+ randomToken?: string
1091
1003
  ): { browserToRun: BrowserTypes; clonedBrowserDataDir: string } => {
1004
+
1005
+ if (!randomToken) {
1006
+ randomToken = '';
1007
+ }
1008
+
1092
1009
  const platform = os.platform();
1093
1010
 
1094
1011
  // Prioritise Chrome on Windows and Mac platforms if user does not specify a browser
1095
1012
  if (!preferredBrowser && (os.platform() === 'win32' || os.platform() === 'darwin')) {
1096
1013
  preferredBrowser = BrowserTypes.CHROME;
1014
+ } else {
1015
+ printMessage([`Preferred browser ${preferredBrowser}`], messageOptions);
1097
1016
  }
1098
1017
 
1099
- printMessage([`Preferred browser ${preferredBrowser}`], messageOptions);
1100
-
1101
1018
  if (preferredBrowser === BrowserTypes.CHROME) {
1102
- const chromeData = getChromeData();
1019
+ const chromeData = getChromeData(randomToken);
1103
1020
  if (chromeData) return chromeData;
1104
1021
 
1105
1022
  if (platform === 'darwin') {
@@ -1113,7 +1030,7 @@ export const getBrowserToRun = (
1113
1030
  if (isCli)
1114
1031
  printMessage(['Unable to use Chrome, falling back to Edge browser...'], messageOptions);
1115
1032
 
1116
- const edgeData = getEdgeData();
1033
+ const edgeData = getEdgeData(randomToken);
1117
1034
  if (edgeData) return edgeData;
1118
1035
 
1119
1036
  if (isCli)
@@ -1125,12 +1042,12 @@ export const getBrowserToRun = (
1125
1042
  printMessage(['Unable to use Chrome, falling back to Chromium browser...'], messageOptions);
1126
1043
  }
1127
1044
  } else if (preferredBrowser === BrowserTypes.EDGE) {
1128
- const edgeData = getEdgeData();
1045
+ const edgeData = getEdgeData(randomToken);
1129
1046
  if (edgeData) return edgeData;
1130
1047
 
1131
1048
  if (isCli)
1132
1049
  printMessage(['Unable to use Edge, falling back to Chrome browser...'], messageOptions);
1133
- const chromeData = getChromeData();
1050
+ const chromeData = getChromeData(randomToken);
1134
1051
  if (chromeData) return chromeData;
1135
1052
 
1136
1053
  if (platform === 'darwin') {
@@ -1161,7 +1078,7 @@ export const getBrowserToRun = (
1161
1078
  // defaults to chromium
1162
1079
  return {
1163
1080
  browserToRun: BrowserTypes.CHROMIUM,
1164
- clonedBrowserDataDir: cloneChromiumProfiles(),
1081
+ clonedBrowserDataDir: cloneChromiumProfiles(randomToken),
1165
1082
  };
1166
1083
  };
1167
1084
 
@@ -1181,9 +1098,9 @@ export const getClonedProfilesWithRandomToken = (browser: string, randomToken: s
1181
1098
  return cloneChromiumProfiles(randomToken);
1182
1099
  };
1183
1100
 
1184
- export const getChromeData = () => {
1101
+ export const getChromeData = (randomToken: string) => {
1185
1102
  const browserDataDir = getDefaultChromeDataDir();
1186
- const clonedBrowserDataDir = cloneChromeProfiles();
1103
+ const clonedBrowserDataDir = cloneChromeProfiles(randomToken);
1187
1104
  if (browserDataDir && clonedBrowserDataDir) {
1188
1105
  const browserToRun = BrowserTypes.CHROME;
1189
1106
  return { browserToRun, clonedBrowserDataDir };
@@ -1191,9 +1108,9 @@ export const getChromeData = () => {
1191
1108
  return null;
1192
1109
  };
1193
1110
 
1194
- export const getEdgeData = () => {
1111
+ export const getEdgeData = (randomToken: string) => {
1195
1112
  const browserDataDir = getDefaultEdgeDataDir();
1196
- const clonedBrowserDataDir = cloneEdgeProfiles();
1113
+ const clonedBrowserDataDir = cloneEdgeProfiles(randomToken);
1197
1114
  if (browserDataDir && clonedBrowserDataDir) {
1198
1115
  const browserToRun = BrowserTypes.EDGE;
1199
1116
  return { browserToRun, clonedBrowserDataDir };
@@ -1397,7 +1314,7 @@ const cloneLocalStateFile = (options: GlobOptionsWithFileTypesFalse, destDir: st
1397
1314
  * @param {string} randomToken - random token to append to the cloned directory
1398
1315
  * @returns {string} cloned data directory, null if any of the sub files failed to copy
1399
1316
  */
1400
- export const cloneChromeProfiles = (randomToken?: string): string => {
1317
+ export const cloneChromeProfiles = (randomToken: string): string => {
1401
1318
  const baseDir = getDefaultChromeDataDir();
1402
1319
 
1403
1320
  if (!baseDir) {
@@ -1406,18 +1323,10 @@ export const cloneChromeProfiles = (randomToken?: string): string => {
1406
1323
 
1407
1324
  let destDir;
1408
1325
 
1409
- if (randomToken) {
1410
- destDir = path.join(baseDir, `oobee-${randomToken}`);
1411
- } else {
1412
- destDir = path.join(baseDir, 'oobee');
1413
- }
1326
+ destDir = path.join(baseDir, `oobee-${randomToken}`);
1414
1327
 
1415
1328
  if (fs.existsSync(destDir)) {
1416
- if (process.env.OOBEE_VERBOSE) {
1417
1329
  deleteClonedChromeProfiles(randomToken);
1418
- } else {
1419
- deleteClonedChromeProfiles();
1420
- }
1421
1330
  }
1422
1331
 
1423
1332
  if (!fs.existsSync(destDir)) {
@@ -1435,10 +1344,13 @@ export const cloneChromeProfiles = (randomToken?: string): string => {
1435
1344
  return destDir;
1436
1345
  }
1437
1346
 
1438
- return null;
1347
+ consoleLogger.error('Failed to clone Chrome profiles. You may be logged out of your accounts.');
1348
+
1349
+ // For future reference, return a null instead to halt the scan
1350
+ return destDir;
1439
1351
  };
1440
1352
 
1441
- export const cloneChromiumProfiles = (randomToken?: string): string => {
1353
+ export const cloneChromiumProfiles = (randomToken: string): string => {
1442
1354
  const baseDir = getDefaultChromiumDataDir();
1443
1355
 
1444
1356
  if (!baseDir) {
@@ -1447,10 +1359,10 @@ export const cloneChromiumProfiles = (randomToken?: string): string => {
1447
1359
 
1448
1360
  let destDir: string;
1449
1361
 
1450
- if (randomToken) {
1451
- destDir = path.join(baseDir, `oobee-${randomToken}`);
1452
- } else {
1453
- destDir = path.join(baseDir, 'oobee');
1362
+ destDir = path.join(baseDir, `oobee-${randomToken}`);
1363
+
1364
+ if (fs.existsSync(destDir)) {
1365
+ deleteClonedChromiumProfiles(randomToken);
1454
1366
  }
1455
1367
 
1456
1368
  if (!fs.existsSync(destDir)) {
@@ -1468,7 +1380,7 @@ export const cloneChromiumProfiles = (randomToken?: string): string => {
1468
1380
  * @param {string} randomToken - random token to append to the cloned directory
1469
1381
  * @returns {string} cloned data directory, null if any of the sub files failed to copy
1470
1382
  */
1471
- export const cloneEdgeProfiles = (randomToken?: string): string => {
1383
+ export const cloneEdgeProfiles = (randomToken: string): string => {
1472
1384
  const baseDir = getDefaultEdgeDataDir();
1473
1385
 
1474
1386
  if (!baseDir) {
@@ -1477,18 +1389,10 @@ export const cloneEdgeProfiles = (randomToken?: string): string => {
1477
1389
 
1478
1390
  let destDir;
1479
1391
 
1480
- if (randomToken) {
1481
- destDir = path.join(baseDir, `oobee-${randomToken}`);
1482
- } else {
1483
- destDir = path.join(baseDir, 'oobee');
1484
- }
1392
+ destDir = path.join(baseDir, `oobee-${randomToken}`);
1485
1393
 
1486
1394
  if (fs.existsSync(destDir)) {
1487
- if (process.env.OOBEE_VERBOSE) {
1488
1395
  deleteClonedEdgeProfiles(randomToken);
1489
- } else {
1490
- deleteClonedEdgeProfiles();
1491
- }
1492
1396
  }
1493
1397
 
1494
1398
  if (!fs.existsSync(destDir)) {
@@ -1507,10 +1411,13 @@ export const cloneEdgeProfiles = (randomToken?: string): string => {
1507
1411
  return destDir;
1508
1412
  }
1509
1413
 
1510
- return null;
1414
+ consoleLogger.error('Failed to clone Edge profiles. You may be logged out of your accounts.');
1415
+
1416
+ // For future reference, return a null instead to halt the scan
1417
+ return destDir;
1511
1418
  };
1512
1419
 
1513
- export const deleteClonedProfiles = (browser: string, randomToken?: string): void => {
1420
+ export const deleteClonedProfiles = (browser: string, randomToken: string): void => {
1514
1421
  if (browser === BrowserTypes.CHROME) {
1515
1422
  deleteClonedChromeProfiles(randomToken);
1516
1423
  } else if (browser === BrowserTypes.EDGE) {
@@ -1565,9 +1472,7 @@ export const deleteClonedChromeProfiles = (randomToken?: string): void => {
1565
1472
  * @returns null
1566
1473
  */
1567
1474
  export const deleteClonedEdgeProfiles = (randomToken?: string): void => {
1568
- if (process.env.OOBEE_VERBOSE) {
1569
- return;
1570
- }
1475
+
1571
1476
  const baseDir = getDefaultEdgeDataDir();
1572
1477
 
1573
1478
  if (!baseDir) {
@@ -1716,13 +1621,9 @@ export const submitFormViaPlaywright = async (
1716
1621
  } finally {
1717
1622
  await browserContext.close();
1718
1623
  if (proxy && browserToRun === BrowserTypes.EDGE) {
1719
- if (!process.env.OOBEE_VERBOSE) {
1720
- deleteClonedEdgeProfiles();
1721
- }
1624
+ deleteClonedEdgeProfiles(clonedDir);
1722
1625
  } else if (proxy && browserToRun === BrowserTypes.CHROME) {
1723
- if (!process.env.OOBEE_VERBOSE) {
1724
- deleteClonedChromeProfiles();
1725
- }
1626
+ deleteClonedChromeProfiles(clonedDir);
1726
1627
  }
1727
1628
  }
1728
1629
  };
@@ -1781,7 +1682,9 @@ export const submitForm = async (
1781
1682
  export async function initModifiedUserAgent(
1782
1683
  browser?: string,
1783
1684
  playwrightDeviceDetailsObject?: object,
1685
+ userDataDirectory?: string,
1784
1686
  ) {
1687
+
1785
1688
  const isHeadless = process.env.CRAWLEE_HEADLESS === '1';
1786
1689
 
1787
1690
  // If headless mode is enabled, ensure the headless flag is set.
@@ -1798,7 +1701,11 @@ export async function initModifiedUserAgent(
1798
1701
  };
1799
1702
 
1800
1703
  // Launch a temporary persistent context with an empty userDataDir to mimic your production browser setup.
1801
- const browserContext = await constants.launcher.launchPersistentContext('', launchOptions);
1704
+ const effectiveUserDataDirectory = process.env.CRAWLEE_HEADLESS === '1'
1705
+ ? userDataDirectory
1706
+ : '';
1707
+
1708
+ const browserContext = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, launchOptions);
1802
1709
  const page = await browserContext.newPage();
1803
1710
 
1804
1711
  // Retrieve the default user agent.
@@ -1856,13 +1763,6 @@ export const getPlaywrightLaunchOptions = (browser?: string): LaunchOptions => {
1856
1763
  return options;
1857
1764
  };
1858
1765
 
1859
- export const urlWithoutAuth = (url: string): string => {
1860
- const parsedUrl = new URL(url);
1861
- parsedUrl.username = '';
1862
- parsedUrl.password = '';
1863
- return parsedUrl.toString();
1864
- };
1865
-
1866
1766
  export const waitForPageLoaded = async (page: Page, timeout = 10000) => {
1867
1767
  const OBSERVER_TIMEOUT = timeout; // Ensure observer timeout does not exceed the main timeout
1868
1768
 
@@ -1887,7 +1787,7 @@ export const waitForPageLoaded = async (page: Page, timeout = 10000) => {
1887
1787
 
1888
1788
  let timeout: NodeJS.Timeout;
1889
1789
  let mutationCount = 0;
1890
- const MAX_MUTATIONS = 250;
1790
+ const MAX_MUTATIONS = 500;
1891
1791
  const mutationHash: Record<string, number> = {};
1892
1792
 
1893
1793
  const observer = new MutationObserver(mutationsList => {