@govtechsg/oobee 0.10.58 → 0.10.61

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -452,7 +452,7 @@ const reserveFileNameKeywords = [
 
 export default {
   cliZipFileName: 'oobee-scan-results.zip',
-  exportDirectory: `${process.cwd()}`,
+  exportDirectory: undefined,
   maxRequestsPerCrawl,
   maxConcurrency: 25,
   urlsCrawledObj,
@@ -466,6 +466,7 @@ export default {
   reserveFileNameKeywords,
   wcagLinks,
   robotsTxtUrls: null,
+  userDataDirectory: null, // This will be set later in the code
 };
 
 export const rootPath = dirname;
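
Both new defaults are placeholders that get resolved at runtime rather than at module load: exportDirectory is no longer pinned to whatever directory oobee happened to be started from, and userDataDirectory is filled in once a scan knows which browser profile it will use. A hedged sketch of what that later wiring might look like — the fallback to process.cwd() and the profile path below are assumptions for illustration, not taken from the package:

```ts
import path from 'path';
import os from 'os';
import constants from './constants.js'; // the default export shown in the hunks above

// Hypothetical runtime wiring — the real assignments happen elsewhere in oobee.
constants.exportDirectory = process.cwd(); // the old default, now applied by the caller (assumption)
constants.userDataDirectory = path.join(os.tmpdir(), 'oobee', `profile-${Date.now()}`); // assumed location
```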
@@ -1,6 +1,6 @@
 import { Question } from 'inquirer';
 import { Answers } from '../index.js';
-import { getUserDataTxt, setHeadlessMode } from '../utils.js';
+import { getUserDataTxt, randomThreeDigitNumberString, setHeadlessMode } from '../utils.js';
 import {
   checkUrl,
   deleteClonedProfiles,
@@ -15,6 +15,7 @@ import {
   parseHeaders,
 } from './common.js';
 import constants, { BrowserTypes, ScannerTypes } from './constants.js';
+import { random } from 'lodash';
 
 const userData = getUserDataTxt();
 
@@ -78,8 +79,15 @@ const startScanQuestions = [
         process.exit(1);
       }
 
+      // construct filename for scan results
+      const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
+      const domain = new URL(url).hostname;
+      let resultFilename: string;
+      const randomThreeDigitNumber = randomThreeDigitNumberString();
+      resultFilename = `${date}_${time}_${domain}_${randomThreeDigitNumber}`;
+
       const statuses = constants.urlCheckStatuses;
-      const { browserToRun, clonedBrowserDataDir } = getBrowserToRun(BrowserTypes.CHROME);
+      const { browserToRun, clonedBrowserDataDir } = getBrowserToRun(BrowserTypes.CHROME, false, resultFilename);
 
       setHeadlessMode(browserToRun, answers.headless);
 
@@ -95,11 +103,11 @@ const startScanQuestions = [
         browserToRun,
         clonedBrowserDataDir,
         playwrightDeviceDetailsObject,
-        answers.scanner === ScannerTypes.CUSTOM,
         parseHeaders(answers.header),
       );
 
-      deleteClonedProfiles(browserToRun);
+      deleteClonedProfiles(browserToRun, resultFilename);
+
       switch (res.status) {
         case statuses.success.code:
           answers.finalUrl = res.url;
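
These two hunks build a per-scan result filename and thread it through getBrowserToRun and deleteClonedProfiles. The timestamp trick relies on the 'sv' (Swedish) locale, which formats dates in an ISO-like shape, so stripping '-' and ':' yields a compact date and time pair. A small sketch of the resulting name; the randomThreeDigitNumberString implementation below is a hypothetical stand-in for the real helper in utils.js:

```ts
// 'sv' locale gives e.g. '2024-05-07 14:03:09'; after stripping '-' and ':' we get '20240507 140309'.
const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');

// Hypothetical sketch of randomThreeDigitNumberString (the real helper lives in utils.js).
const randomThreeDigitNumberString = (): string =>
  Math.floor(Math.random() * 1000).toString().padStart(3, '0');

const resultFilename = `${date}_${time}_example.com_${randomThreeDigitNumberString()}`;
// e.g. 20240507_140309_example.com_417
```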
@@ -21,6 +21,9 @@ import { getAxeConfiguration } from './custom/getAxeConfiguration.js';
 import { flagUnlabelledClickableElements } from './custom/flagUnlabelledClickableElements.js';
 import xPathToCss from './custom/xPathToCss.js';
 import type { Response as PlaywrightResponse } from 'playwright';
+import fs from 'fs';
+import { getStoragePath } from '../utils.js';
+import path from 'path';
 
 // types
 interface AxeResultsWithScreenshot extends AxeResults {
@@ -254,7 +257,7 @@ export const runAxeScript = async ({
   return new Promise(resolve => {
     let timeout: NodeJS.Timeout;
     let mutationCount = 0;
-    const MAX_MUTATIONS = 250;
+    const MAX_MUTATIONS = 500;
     const MAX_SAME_MUTATION_LIMIT = 10;
     const mutationHash: Record<string, number> = {};
 
@@ -476,8 +479,11 @@ export const runAxeScript = async ({
 export const createCrawleeSubFolders = async (
   randomToken: string,
 ): Promise<{ dataset: crawlee.Dataset; requestQueue: crawlee.RequestQueue }> => {
-  const dataset = await crawlee.Dataset.open(randomToken);
-  const requestQueue = await crawlee.RequestQueue.open(randomToken);
+
+  const crawleeDir = path.join(getStoragePath(randomToken),"crawlee");
+
+  const dataset = await crawlee.Dataset.open(crawleeDir);
+  const requestQueue = await crawlee.RequestQueue.open(crawleeDir);
   return { dataset, requestQueue };
 };
 
@@ -27,9 +27,7 @@ import {
   isSkippedUrl,
   isDisallowedInRobotsTxt,
   getUrlsFromRobotsTxt,
-  urlWithoutAuth,
   waitForPageLoaded,
-  initModifiedUserAgent,
 } from '../constants/common.js';
 import { areLinksEqual, isFollowStrategy } from '../utils.js';
 import {
@@ -40,6 +38,8 @@ import {
 } from './pdfScanFunc.js';
 import { consoleLogger, guiInfoLog, silentLogger } from '../logs.js';
 import { ViewportSettingsClass } from '../combine.js';
+import * as path from 'path';
+import fsp from 'fs/promises';
 
 const isBlacklisted = (url: string, blacklistedPatterns: string[]) => {
   if (!blacklistedPatterns) {
@@ -115,10 +115,6 @@ const crawlDomain = async ({
 
   ({ requestQueue } = await createCrawleeSubFolders(randomToken));
 
-  if (!fs.existsSync(randomToken)) {
-    fs.mkdirSync(randomToken);
-  }
-
   const pdfDownloads: Promise<void>[] = [];
   const uuidToPdfMapping: Record<string, string> = {};
   const isScanHtml = ['all', 'html-only'].includes(fileTypes);
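
The ad-hoc fs.mkdirSync(randomToken) calls disappear here and in the sitemap/local-file crawlers because Crawlee's Dataset and RequestQueue are now opened under a `crawlee` subfolder of the scan's storage path (see the createCrawleeSubFolders hunk above) rather than in a folder named after the random token in the working directory. A rough sketch of the assumed layout; the base directory below is an assumption, since the real one is resolved by getStoragePath in utils.js:

```ts
import path from 'path';

// Assumption for illustration only: getStoragePath(randomToken) is treated here as
// <results root>/<randomToken>; the actual base directory comes from utils.js.
const getStoragePathExample = (randomToken: string): string =>
  path.join(process.cwd(), 'results', randomToken);

const randomToken = '20240507_140309_example.com_417'; // hypothetical scan token
const crawleeDir = path.join(getStoragePathExample(randomToken), 'crawlee');
// e.g. <cwd>/results/20240507_140309_example.com_417/crawlee — Dataset and RequestQueue
// files live here instead of a bare <cwd>/<randomToken> folder.
console.log(crawleeDir);
```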
@@ -126,45 +122,11 @@ const crawlDomain = async ({
   const { maxConcurrency } = constants;
   const { playwrightDeviceDetailsObject } = viewportSettings;
 
-  const httpsAgent = new https.Agent({ rejectUnauthorized: false });
-
-  // Boolean to omit axe scan for basic auth URL
-  let isBasicAuth = false;
-  let authHeader = '';
-
-  // Test basic auth and add auth header if auth exist
-  const parsedUrl = new URL(url);
-  let username: string;
-  let password: string;
-  if (parsedUrl.username !== '' && parsedUrl.password !== '') {
-    isBasicAuth = true;
-    username = decodeURIComponent(parsedUrl.username);
-    password = decodeURIComponent(parsedUrl.password);
-
-    // Create auth header
-    authHeader = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
-
-    // Remove username from parsedUrl
-    parsedUrl.username = '';
-    parsedUrl.password = '';
-    // Send the finalUrl without credentials by setting auth header instead
-    const finalUrl = parsedUrl.toString();
-
-    await requestQueue.addRequest({
-      url: finalUrl,
-      skipNavigation: isUrlPdf(finalUrl),
-      headers: {
-        Authorization: authHeader,
-      },
-      label: finalUrl,
-    });
-  } else {
-    await requestQueue.addRequest({
-      url,
-      skipNavigation: isUrlPdf(url),
-      label: url,
-    });
-  }
+  await requestQueue.addRequest({
+    url,
+    skipNavigation: isUrlPdf(url),
+    label: url,
+  });
 
   const enqueueProcess = async (
     page: Page,
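
With this hunk the inline `user:password@host` handling is gone from crawlDomain: the start URL is enqueued as-is and no Authorization header is synthesised from URL credentials. A hedged sketch of how the same effect could still be achieved by supplying a ready-made Basic auth header through the existing extraHTTPHeaders path; the helper and values below are illustrative, not part of oobee:

```ts
// Build an RFC 7617 Basic auth header; the value mirrors what the removed code
// used to compute from credentials embedded in the URL.
const buildBasicAuthHeader = (username: string, password: string): Record<string, string> => ({
  Authorization: `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`,
});

const extraHTTPHeaders = {
  ...buildBasicAuthHeader('scanner', 's3cret'), // hypothetical credentials
  'X-Scan-Source': 'oobee',                     // hypothetical extra header
};
```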
@@ -377,31 +339,40 @@ const crawlDomain = async ({
 
   let isAbortingScanNow = false;
 
-  let userDataDir = '';
-  if (userDataDirectory) {
-    userDataDir = process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '';
-  }
-
-  await initModifiedUserAgent(browser, playwrightDeviceDetailsObject);
-
   const crawler = new crawlee.PlaywrightCrawler({
     launchContext: {
       launcher: constants.launcher,
       launchOptions: getPlaywrightLaunchOptions(browser),
       // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
-      ...(process.env.CRAWLEE_HEADLESS === '0' && { userDataDir }),
+      ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
     },
     retryOnBlocked: true,
     browserPoolOptions: {
       useFingerprints: false,
       preLaunchHooks: [
         async (_pageId, launchContext) => {
+          const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
+
+          // Ensure base exists
+          await fsp.mkdir(baseDir, { recursive: true });
+
+          // Create a unique subdir per browser
+          const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
+          await fsp.mkdir(subProfileDir, { recursive: true });
+
+          // Assign to Crawlee's launcher
+          launchContext.userDataDir = subProfileDir;
+
+          // Safely extend launchOptions
           launchContext.launchOptions = {
             ...launchContext.launchOptions,
-            bypassCSP: true,
             ignoreHTTPSErrors: true,
             ...playwrightDeviceDetailsObject,
+            ...(extraHTTPHeaders && { extraHTTPHeaders }),
           };
+
+          // Optionally log for debugging
+          // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
         },
       ],
     },
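
Each browser launch now gets its own `profile-*` subdirectory under the scan's user data directory, so concurrent launches never share a Chrome profile. Those subdirectories are disposable; a hedged sketch of one way they could be swept up once a crawl finishes — oobee's own cleanup goes through deleteClonedProfiles, so this helper is purely illustrative:

```ts
import path from 'path';
import fsp from 'fs/promises';

// Remove the per-launch profile-* subdirectories created by the preLaunchHooks above.
const cleanUpSubProfiles = async (userDataDirectory: string): Promise<void> => {
  const entries = await fsp.readdir(userDataDirectory, { withFileTypes: true }).catch(() => []);
  await Promise.all(
    entries
      .filter(entry => entry.isDirectory() && entry.name.startsWith('profile-'))
      .map(entry => fsp.rm(path.join(userDataDirectory, entry.name), { recursive: true, force: true })),
  );
};
```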
@@ -414,7 +385,7 @@ const crawlDomain = async ({
   return new Promise(resolve => {
     let timeout;
     let mutationCount = 0;
-    const MAX_MUTATIONS = 250; // stop if things never quiet down
+    const MAX_MUTATIONS = 500; // stop if things never quiet down
     const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
 
     const observer = new MutationObserver(() => {
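
MAX_MUTATIONS is raised from 250 to 500 in every copy of this DOM-stability wait (in runAxeScript, here in crawlDomain, and again in crawlSitemap), giving busy pages more mutation events before the observer gives up early. A simplified, self-contained sketch of the pattern being tuned, intended to run inside page.evaluate; the 1-second quiet period is an illustrative value, and the in-repo version additionally tracks repeated identical mutations:

```ts
// Resolve once the DOM stays quiet briefly, or bail out after too many mutations
// or after a hard timeout — the two constants mirror the values in the diff.
const waitForDomStability = (): Promise<void> =>
  new Promise(resolve => {
    let timeout: number | undefined;
    let mutationCount = 0;
    const MAX_MUTATIONS = 500;     // stop if things never quiet down
    const OBSERVER_TIMEOUT = 5000; // hard cap on total wait

    const observer = new MutationObserver(() => {
      if (timeout !== undefined) clearTimeout(timeout);
      mutationCount += 1;
      if (mutationCount > MAX_MUTATIONS) {
        observer.disconnect();
        resolve();
        return;
      }
      // resolve after 1s without further mutations (illustrative quiet period)
      timeout = window.setTimeout(() => {
        observer.disconnect();
        resolve();
      }, 1000);
    });

    observer.observe(document.documentElement, { childList: true, subtree: true, attributes: true });

    // hard stop even if mutations keep trickling in
    window.setTimeout(() => {
      observer.disconnect();
      resolve();
    }, OBSERVER_TIMEOUT);
  });
```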
@@ -464,33 +435,10 @@ const crawlDomain = async ({
         }
       },
     ],
-    preNavigationHooks: [ async({ page, request}) => {
-      if (isBasicAuth) {
-        await page.setExtraHTTPHeaders({
-          Authorization: authHeader,
-          ...extraHTTPHeaders,
-        });
-      } else {
-        await page.setExtraHTTPHeaders({
-          ...extraHTTPHeaders,
-        });
-      }
-    }],
     requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds
     requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
       const browserContext: BrowserContext = page.context();
       try {
-        // Set basic auth header if needed
-        if (isBasicAuth) {
-          await page.setExtraHTTPHeaders({
-            Authorization: authHeader,
-          });
-          const currentUrl = new URL(request.url);
-          currentUrl.username = username;
-          currentUrl.password = password;
-          request.url = currentUrl.href;
-        }
-
         await waitForPageLoaded(page, 10000);
         let actualUrl = page.url() || request.loadedUrl || request.url;
 
@@ -652,13 +600,13 @@ const crawlDomain = async ({
         });
 
         urlsCrawled.scanned.push({
-          url: urlWithoutAuth(request.url),
+          url: request.url,
           pageTitle: results.pageTitle,
           actualUrl, // i.e. actualUrl
         });
 
         urlsCrawled.scannedRedirects.push({
-          fromUrl: urlWithoutAuth(request.url),
+          fromUrl: request.url,
           toUrl: actualUrl, // i.e. actualUrl
         });
 
@@ -671,10 +619,10 @@ const crawlDomain = async ({
         if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
           guiInfoLog(guiInfoStatusTypes.SCANNED, {
             numScanned: urlsCrawled.scanned.length,
-            urlScanned: urlWithoutAuth(request.url),
+            urlScanned: request.url,
           });
           urlsCrawled.scanned.push({
-            url: urlWithoutAuth(request.url),
+            url: request.url,
             actualUrl: request.url,
             pageTitle: results.pageTitle,
           });
@@ -695,7 +643,7 @@ const crawlDomain = async ({
         });
       }
 
-      if (followRobots) await getUrlsFromRobotsTxt(request.url, browser);
+      if (followRobots) await getUrlsFromRobotsTxt(request.url, browser, userDataDirectory, extraHTTPHeaders);
       await enqueueProcess(page, enqueueLinks, browserContext);
     } catch (e) {
       try {
@@ -7,6 +7,7 @@ import crawlDomain from './crawlDomain.js';
 import crawlSitemap from './crawlSitemap.js';
 import { EnqueueStrategy } from 'crawlee';
 import { ViewportSettingsClass } from '../combine.js';
+import { getPlaywrightLaunchOptions } from '../constants/common.js';
 
 const crawlIntelligentSitemap = async (
   url: string,
@@ -36,9 +37,6 @@ const crawlIntelligentSitemap = async (
   let sitemapUrl;
 
   ({ dataset } = await createCrawleeSubFolders(randomToken));
-  if (!fs.existsSync(randomToken)) {
-    fs.mkdirSync(randomToken);
-  }
 
   function getHomeUrl(parsedUrl: string) {
     const urlObject = new URL(parsedUrl);
@@ -48,15 +46,21 @@ const crawlIntelligentSitemap = async (
     return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
   }
 
-  async function findSitemap(link: string) {
+  async function findSitemap(link: string, userDataDirectory: string, extraHTTPHeaders: Record<string, string>) {
     const homeUrl = getHomeUrl(link);
     let sitemapLink = '';
-    const chromiumBrowser = await chromium.launch({
-      headless: false,
-      channel: 'chrome',
-      args: ['--headless=new', '--no-sandbox'],
+
+    const effectiveUserDataDirectory = process.env.CRAWLEE_HEADLESS === '1'
+      ? userDataDirectory
+      : '';
+    const context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
+      headless: process.env.CRAWLEE_HEADLESS === '1',
+      ...getPlaywrightLaunchOptions(browser),
+      ...(extraHTTPHeaders && { extraHTTPHeaders }),
     });
-    const page = await chromiumBrowser.newPage();
+
+    const page = await context.newPage();
+
     for (const path of sitemapPaths) {
       sitemapLink = homeUrl + path;
       if (await checkUrlExists(page, sitemapLink)) {
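
findSitemap now reuses the scan's configured launcher and profile directory instead of spinning up a separate `chromium.launch` with hard-coded Chrome flags. Note that Playwright treats an empty string passed to launchPersistentContext as "use a temporary profile", so the non-headless branch still gets a throwaway profile. A small hedged sketch of just that decision, with a hypothetical profile path:

```ts
import { chromium } from 'playwright';

const userDataDirectory = '/tmp/oobee-profile'; // hypothetical path for illustration
const headless = process.env.CRAWLEE_HEADLESS === '1';

// Reuse the persistent profile only in headless mode; '' tells Playwright to
// create a temporary profile for this launch instead.
const context = await chromium.launchPersistentContext(headless ? userDataDirectory : '', { headless });
const page = await context.newPage();
await page.goto('https://example.com/sitemap.xml');
await page.close();
await context.close();
```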
@@ -64,7 +68,8 @@ const crawlIntelligentSitemap = async (
         break;
       }
     }
-    await chromiumBrowser.close();
+    await page.close();
+    await context.close().catch(() => { });
     return sitemapExist ? sitemapLink : '';
   }
 
@@ -79,7 +84,7 @@ const crawlIntelligentSitemap = async (
   };
 
   try {
-    sitemapUrl = await findSitemap(url);
+    sitemapUrl = await findSitemap(url, userDataDirectory, extraHTTPHeaders);
   } catch (error) {
     consoleLogger.error(error);
   }
@@ -13,7 +13,6 @@ import {
   isFilePath,
   convertLocalFileToPath,
   convertPathToLocalFile,
-  initModifiedUserAgent,
 } from '../constants/common.js';
 import { runPdfScan, mapPdfScanResults, doPdfScreenshots } from './pdfScanFunc.js';
 import { guiInfoLog } from '../logs.js';
@@ -74,9 +73,6 @@ export const crawlLocalFile = async ({
     ({ dataset } = await createCrawleeSubFolders(randomToken));
     urlsCrawled = { ...constants.urlsCrawledObj };
 
-    if (!fs.existsSync(randomToken)) {
-      fs.mkdirSync(randomToken);
-    }
   }
 
   // Check if the sitemapUrl is a local file and if it exists
@@ -136,16 +132,6 @@ export const crawlLocalFile = async ({
     console.log(e);
   }
 
-  if (basicAuthRegex.test(sitemapUrl)) {
-    isBasicAuth = true;
-    // request to basic auth URL to authenticate for browser session
-    finalLinks.push(new Request({ url: sitemapUrl, uniqueKey: `auth:${sitemapUrl}` }));
-    const finalUrl = `${sitemapUrl.split('://')[0]}://${sitemapUrl.split('@')[1]}`;
-    // obtain base URL without credentials so that subsequent URLs within the same domain can be scanned
-    finalLinks.push(new Request({ url: finalUrl }));
-    basicAuthPage = -2;
-  }
-
   const uuidToPdfMapping: Record<string, string> = {}; // key and value of string type
 
   finalLinks = [...finalLinks, ...linksFromSitemap];
@@ -165,9 +151,12 @@ export const crawlLocalFile = async ({
   let shouldAbort = false;
 
   if (!isUrlPdf(request.url)) {
-    await initModifiedUserAgent(browser);
-    const browserContext = await constants.launcher.launchPersistentContext('', {
-      headless: false,
+    const effectiveUserDataDirectory = process.env.CRAWLEE_HEADLESS === '1'
+      ? userDataDirectory
+      : '';
+
+    const browserContext = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
+      headless: process.env.CRAWLEE_HEADLESS === '1',
       ...getPlaywrightLaunchOptions(browser),
       ...playwrightDeviceDetailsObject,
     });
@@ -17,15 +17,15 @@ import {
   getLinksFromSitemap,
   getPlaywrightLaunchOptions,
   isSkippedUrl,
-  urlWithoutAuth,
   waitForPageLoaded,
   isFilePath,
-  initModifiedUserAgent,
 } from '../constants/common.js';
 import { areLinksEqual, isWhitelistedContentType, isFollowStrategy } from '../utils.js';
 import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
 import { guiInfoLog } from '../logs.js';
 import { ViewportSettingsClass } from '../combine.js';
+import * as path from 'path';
+import fsp from 'fs/promises';
 
 const crawlSitemap = async ({
   sitemapUrl,
@@ -70,50 +70,19 @@ const crawlSitemap = async ({
   let dataset: crawlee.Dataset;
   let urlsCrawled: UrlsCrawled;
 
-  // Boolean to omit axe scan for basic auth URL
-  let isBasicAuth: boolean;
-  let basicAuthPage = 0;
-  let finalLinks = [];
-  let authHeader = '';
-
   if (fromCrawlIntelligentSitemap) {
     dataset = datasetFromIntelligent;
     urlsCrawled = urlsCrawledFromIntelligent;
   } else {
     ({ dataset } = await createCrawleeSubFolders(randomToken));
     urlsCrawled = { ...constants.urlsCrawledObj };
-
-    if (!fs.existsSync(randomToken)) {
-      fs.mkdirSync(randomToken);
-    }
   }
 
-  let parsedUrl;
-  let username = '';
-  let password = '';
-
   if (!crawledFromLocalFile && isFilePath(sitemapUrl)) {
     console.log('Local file crawling not supported for sitemap. Please provide a valid URL.');
     return;
   }
 
-  if (isFilePath(sitemapUrl)) {
-    parsedUrl = sitemapUrl;
-  } else {
-    parsedUrl = new URL(sitemapUrl);
-    if (parsedUrl.username !== '' && parsedUrl.password !== '') {
-      isBasicAuth = true;
-      username = decodeURIComponent(parsedUrl.username);
-      password = decodeURIComponent(parsedUrl.password);
-
-      // Create auth header
-      authHeader = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
-
-      parsedUrl.username = '';
-      parsedUrl.password = '';
-    }
-  }
-
   const linksFromSitemap = await getLinksFromSitemap(
     sitemapUrl,
     maxRequestsPerCrawl,
@@ -121,29 +90,11 @@ const crawlSitemap = async ({
     userDataDirectory,
     userUrlInputFromIntelligent,
     fromCrawlIntelligentSitemap,
-    username,
-    password,
+    extraHTTPHeaders,
   );
-  /**
-   * Regex to match http://username:password@hostname.com
-   * utilised in scan strategy to ensure subsequent URLs within the same domain are scanned.
-   * First time scan with original `url` containing credentials is strictly to authenticate for browser session
-   * subsequent URLs are without credentials.
-   * basicAuthPage is set to -1 for basic auth URL to ensure it is not counted towards maxRequestsPerCrawl
-   */
 
   sitemapUrl = encodeURI(sitemapUrl);
 
-  if (isBasicAuth) {
-    // request to basic auth URL to authenticate for browser session
-    finalLinks.push(new Request({ url: sitemapUrl, uniqueKey: `auth:${sitemapUrl}` }));
-    const finalUrl = `${sitemapUrl.split('://')[0]}://${sitemapUrl.split('@')[1]}`;
-
-    // obtain base URL without credentials so that subsequent URLs within the same domain can be scanned
-    finalLinks.push(new Request({ url: finalUrl }));
-    basicAuthPage = -2;
-  }
-
   const pdfDownloads: Promise<void>[] = [];
   const uuidToPdfMapping: Record<string, string> = {};
   const isScanHtml = ['all', 'html-only'].includes(fileTypes);
@@ -151,36 +102,43 @@ const crawlSitemap = async ({
   const { playwrightDeviceDetailsObject } = viewportSettings;
   const { maxConcurrency } = constants;
 
-  finalLinks = [...finalLinks, ...linksFromSitemap];
-
   const requestList = await RequestList.open({
-    sources: finalLinks,
+    sources: linksFromSitemap,
   });
 
-  let userDataDir = '';
-  if (userDataDirectory) {
-    userDataDir = process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '';
-  }
-
-  await initModifiedUserAgent(browser, playwrightDeviceDetailsObject);
   const crawler = new crawlee.PlaywrightCrawler({
     launchContext: {
       launcher: constants.launcher,
       launchOptions: getPlaywrightLaunchOptions(browser),
       // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
-      ...(process.env.CRAWLEE_HEADLESS === '0' && { userDataDir }),
+      ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
     },
     retryOnBlocked: true,
     browserPoolOptions: {
       useFingerprints: false,
       preLaunchHooks: [
-        async (_pageId: string, launchContext: LaunchContext) => {
+        async (_pageId, launchContext) => {
+          const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
+
+          // Ensure base exists
+          await fsp.mkdir(baseDir, { recursive: true });
+
+          // Create a unique subdir per browser
+          const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
+          await fsp.mkdir(subProfileDir, { recursive: true });
+
+          // Assign to Crawlee's launcher
+          launchContext.userDataDir = subProfileDir;
+
+          // Safely extend launchOptions
           launchContext.launchOptions = {
             ...launchContext.launchOptions,
-            bypassCSP: true,
             ignoreHTTPSErrors: true,
             ...playwrightDeviceDetailsObject,
           };
+
+          // Optionally log for debugging
+          // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
         },
       ],
     },
@@ -193,7 +151,7 @@ const crawlSitemap = async ({
   return new Promise(resolve => {
     let timeout;
     let mutationCount = 0;
-    const MAX_MUTATIONS = 250; // stop if things never quiet down
+    const MAX_MUTATIONS = 500; // stop if things never quiet down
     const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
 
     const observer = new MutationObserver(() => {
@@ -252,15 +210,7 @@ const crawlSitemap = async ({
           return;
         }
 
-        // Set headers if basic auth
-        if (isBasicAuth) {
-          await page.setExtraHTTPHeaders({
-            Authorization: authHeader,
-            ...extraHTTPHeaders,
-          });
-        } else {
-          preNavigationHooks(extraHTTPHeaders);
-        }
+        preNavigationHooks(extraHTTPHeaders);
       },
     ],
     requestHandlerTimeoutSecs: 90,
@@ -282,17 +232,6 @@ const crawlSitemap = async ({
         return;
       }
 
-      // Set basic auth header if needed
-      if (isBasicAuth) {
-        await page.setExtraHTTPHeaders({
-          Authorization: authHeader,
-        });
-        const currentUrl = new URL(request.url);
-        currentUrl.username = username;
-        currentUrl.password = password;
-        request.url = currentUrl.href;
-      }
-
       await waitForPageLoaded(page, 10000);
 
       const actualUrl = page.url() || request.loadedUrl || request.url;
@@ -341,9 +280,7 @@ const crawlSitemap = async ({
      const contentType = response?.headers?.()['content-type'] || '';
      const status = response ? response.status() : 0;
 
-      if (basicAuthPage < 0) {
-        basicAuthPage += 1;
-      } else if (isScanHtml && status < 300 && isWhitelistedContentType(contentType)) {
+      if (isScanHtml && status < 300 && isWhitelistedContentType(contentType)) {
        const isRedirected = !areLinksEqual(page.url(), request.url);
        const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
          item => (item.actualUrl || item.url) === page.url(),
@@ -382,13 +319,13 @@ const crawlSitemap = async ({
         });
 
         urlsCrawled.scanned.push({
-          url: urlWithoutAuth(request.url),
+          url: request.url,
           pageTitle: results.pageTitle,
           actualUrl, // i.e. actualUrl
         });
 
         urlsCrawled.scannedRedirects.push({
-          fromUrl: urlWithoutAuth(request.url),
+          fromUrl: request.url,
           toUrl: actualUrl,
         });
 
@@ -421,9 +358,6 @@ const crawlSitemap = async ({
       }
     },
     failedRequestHandler: async ({ request, response, error }) => {
-      if (isBasicAuth && request.url) {
-        request.url = `${request.url.split('://')[0]}://${request.url.split('@')[1]}`;
-      }
 
       // check if scanned pages have reached limit due to multi-instances of handler running
       if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
@@ -6,7 +6,7 @@ import path from 'path';
 import { runAxeScript } from '../commonCrawlerFunc.js';
 import { consoleLogger, guiInfoLog, silentLogger } from '../../logs.js';
 import { guiInfoStatusTypes } from '../../constants/constants.js';
-import { isSkippedUrl, urlWithoutAuth } from '../../constants/common.js';
+import { isSkippedUrl } from '../../constants/common.js';
 
 //! For Cypress Test
 // env to check if Cypress test is running
@@ -77,8 +77,8 @@ export const screenshotFullPage = async (page, screenshotsDir: string, screensho
     window.scrollTo(0, 0);
   });
 
-  consoleLogger.info(`Screenshot page at: ${urlWithoutAuth(page.url())}`);
-  silentLogger.info(`Screenshot page at: ${urlWithoutAuth(page.url())}`);
+  consoleLogger.info(`Screenshot page at: ${page.url()}`);
+  silentLogger.info(`Screenshot page at: ${page.url()}`);
 
   await page.screenshot({
     timeout: 5000,
@@ -116,7 +116,7 @@ export const runAxeScan = async (
   await dataset.pushData(result);
 
   urlsCrawled.scanned.push({
-    url: urlWithoutAuth(page.url()),
+    url: page.url(),
     pageTitle: result.pageTitle,
     pageImagePath: customFlowDetails.pageImagePath,
   });