@govtechsg/oobee 0.10.58 → 0.10.62

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,10 +5,11 @@ import { globSync } from 'glob';
  import which from 'which';
  import os from 'os';
  import { spawnSync, execSync } from 'child_process';
- import { chromium } from 'playwright';
+ import { Browser, BrowserContext, chromium } from 'playwright';
  import * as Sentry from '@sentry/node';
  import { consoleLogger, silentLogger } from '../logs.js';
  import { PageInfo } from '../mergeAxeResults.js';
+ import { PlaywrightCrawler } from 'crawlee';
 
  const filename = fileURLToPath(import.meta.url);
  const dirname = path.dirname(filename);
@@ -136,7 +137,7 @@ export const getDefaultChromiumDataDir = () => {
  }
  return null;
  } catch (error) {
- silentLogger.error(`Error in getDefaultChromiumDataDir(): ${error}`);
+ consoleLogger.error(`Error in getDefaultChromiumDataDir(): ${error}`);
  }
  };
 
@@ -227,45 +228,68 @@ if (fs.existsSync('/.dockerenv')) {
  launchOptionsArgs = ['--disable-gpu', '--no-sandbox', '--disable-dev-shm-usage'];
  }
 
- export const getProxy = (): { type: string; url: string } | null => {
- if (os.platform() === 'win32') {
- let internetSettings: string[];
- try {
- internetSettings = execSync(
- 'Get-ItemProperty -Path "Registry::HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Internet Settings"',
- { shell: 'powershell.exe' },
- )
- .toString()
- .split('\n');
- } catch (e) {
- console.log(e.toString());
- silentLogger.error(e.toString());
+ type ProxyInfo = { type: 'autoConfig' | 'manualProxy'; url: string } | null;
+
+ function queryRegKey(key: string): Record<string, string> {
+ try {
+ const out = execSync(`reg query "${key}"`, { encoding: 'utf8', stdio: ['ignore', 'pipe', 'pipe'] });
+ const values: Record<string, string> = {};
+ for (const line of out.split(/\r?\n/)) {
+ const parts = line.trim().split(/\s{2,}/);
+ if (parts.length >= 3) {
+ const [name, _type, ...rest] = parts;
+ values[name] = rest.join(' ');
+ }
  }
+ return values;
+ } catch {
+ return {};
+ }
+ }
 
- const getSettingValue = (settingName: string) =>
- internetSettings
- .find(s => s.startsWith(settingName))
- // split only once at with ':' as the delimiter
- ?.split(/:(.*)/s)[1]
- ?.trim();
+ function parseDwordFlag(v: unknown): number {
+ if (v == null) return 0;
+ const s = String(v).trim();
+ // Handles "1", "0", "0x1", "0x0"
+ if (/^0x[0-9a-f]+$/i.test(s)) return parseInt(s, 16);
+ if (/^\d+$/.test(s)) return parseInt(s, 10);
+ return 0;
+ }
 
- if (getSettingValue('AutoConfigURL')) {
- return { type: 'autoConfig', url: getSettingValue('AutoConfigURL') };
- }
- if (getSettingValue('ProxyEnable') === '1') {
- return { type: 'manualProxy', url: getSettingValue('ProxyServer') };
- }
- return null;
+ function normalizePacUrl(u: string): string {
+ const s = u.trim();
+ // If it lacks a scheme, assume http:// (Chrome requires a full URL)
+ return /^(https?|file):/i.test(s) ? s : `http://${s}`;
+ }
+
+ export const getProxy = (): ProxyInfo => {
+ if (os.platform() !== 'win32') return null;
+
+ const values = queryRegKey('HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Internet Settings');
+ const pacUrlRaw = (values['AutoConfigURL'] || '').trim();
+ const proxyEnableRaw = (values['ProxyEnable'] || '').trim();
+ const proxyServerRaw = (values['ProxyServer'] || '').trim();
+
+ // 1) PAC beats manual proxy if present
+ if (pacUrlRaw) {
+ return { type: 'autoConfig', url: normalizePacUrl(pacUrlRaw) };
+ }
+
+ // 2) Manual proxy only if enabled
+ const enabled = parseDwordFlag(proxyEnableRaw) === 1;
+ if (enabled && proxyServerRaw) {
+ return { type: 'manualProxy', url: proxyServerRaw };
  }
- // develop for mac
+
  return null;
  };
 
+ // Usage
  export const proxy = getProxy();
 
- if (proxy && proxy.type === 'autoConfig') {
+ if (proxy?.type === 'autoConfig') {
  launchOptionsArgs.push(`--proxy-pac-url=${proxy.url}`);
- } else if (proxy && proxy.type === 'manualProxy') {
+ } else if (proxy?.type === 'manualProxy') {
  launchOptionsArgs.push(`--proxy-server=${proxy.url}`);
  }
 
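For reference, the new getProxy() pipeline parses `reg query` output of roughly the following shape; the hostname and values below are illustrative only, not taken from the package:

```ts
// Illustrative `reg query` output for the Internet Settings key (values are made up):
//
//   HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Internet Settings
//       ProxyEnable    REG_DWORD    0x1
//       ProxyServer    REG_SZ       proxy.example.com:8080
//
// queryRegKey() collapses the value rows into:
//   { ProxyEnable: '0x1', ProxyServer: 'proxy.example.com:8080' }
// parseDwordFlag('0x1') === 1, so getProxy() would return
//   { type: 'manualProxy', url: 'proxy.example.com:8080' }
// and launchOptionsArgs would gain '--proxy-server=proxy.example.com:8080'.
```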
@@ -405,6 +429,7 @@ const urlCheckStatuses = {
  },
  axiosTimeout: { code: 18, message: 'Axios timeout exceeded. Falling back on browser checks.' },
  notALocalFile: { code: 19, message: 'Provided filepath is not a local html or sitemap file.' },
+ terminationRequested: { code: 15, message: 'Termination requested.' }
  };
 
  /* eslint-disable no-unused-vars */
@@ -452,7 +477,7 @@ const reserveFileNameKeywords = [
 
  export default {
  cliZipFileName: 'oobee-scan-results.zip',
- exportDirectory: `${process.cwd()}`,
+ exportDirectory: undefined,
  maxRequestsPerCrawl,
  maxConcurrency: 25,
  urlsCrawledObj,
@@ -466,6 +491,14 @@ export default {
  reserveFileNameKeywords,
  wcagLinks,
  robotsTxtUrls: null,
+ userDataDirectory: null, // This will be set later in the code
+ randomToken: null, // This will be set later in the code
+ // Track all active Crawlee / Playwright resources for cleanup
+ resources: {
+ crawlers: new Set<PlaywrightCrawler>(),
+ browserContexts: new Set<BrowserContext>(),
+ browsers: new Set<Browser>(),
+ },
  };
 
  export const rootPath = dirname;
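The crawl modules further down wrap crawlers and persistent contexts in a `register(...)` helper imported from '../utils.js'; its body is not part of this diff. A minimal sketch of what such a helper could look like, assuming it only files each resource into the matching `resources` set above for later cleanup (the import path is hypothetical):

```ts
import { PlaywrightCrawler } from 'crawlee';
import { Browser, BrowserContext } from 'playwright';
import constants from './constants/constants.js'; // hypothetical path to the constants module above

// Sketch only: file each resource into the matching tracking set so a
// termination handler can later close whatever is still open.
export function register<T extends PlaywrightCrawler | BrowserContext | Browser>(resource: T): T {
  if (resource instanceof PlaywrightCrawler) {
    constants.resources.crawlers.add(resource);
  } else if ('newContext' in resource) {
    // Playwright Browser instances expose newContext(); contexts do not.
    constants.resources.browsers.add(resource as Browser);
  } else {
    constants.resources.browserContexts.add(resource as BrowserContext);
  }
  return resource;
}
```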
@@ -1,6 +1,6 @@
  import { Question } from 'inquirer';
  import { Answers } from '../index.js';
- import { getUserDataTxt, setHeadlessMode } from '../utils.js';
+ import { getUserDataTxt, randomThreeDigitNumberString, setHeadlessMode } from '../utils.js';
  import {
  checkUrl,
  deleteClonedProfiles,
@@ -15,6 +15,7 @@ import {
  parseHeaders,
  } from './common.js';
  import constants, { BrowserTypes, ScannerTypes } from './constants.js';
+ import { random } from 'lodash';
 
  const userData = getUserDataTxt();
 
@@ -78,8 +79,15 @@ const startScanQuestions = [
  process.exit(1);
  }
 
+ // construct filename for scan results
+ const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
+ const domain = new URL(url).hostname;
+ let resultFilename: string;
+ const randomThreeDigitNumber = randomThreeDigitNumberString();
+ resultFilename = `${date}_${time}_${domain}_${randomThreeDigitNumber}`;
+
  const statuses = constants.urlCheckStatuses;
- const { browserToRun, clonedBrowserDataDir } = getBrowserToRun(BrowserTypes.CHROME);
+ const { browserToRun, clonedBrowserDataDir } = getBrowserToRun(BrowserTypes.CHROME, false, resultFilename);
 
  setHeadlessMode(browserToRun, answers.headless);
 
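The 'sv' (Swedish) locale is used here because it renders dates in an ISO-like `YYYY-MM-DD HH:MM:SS` layout, which strips down to a sortable prefix. A worked example with made-up inputs:

```ts
// new Date('2025-01-02T13:45:01').toLocaleString('sv')  -> '2025-01-02 13:45:01'
// .replaceAll(/-|:/g, '')                               -> '20250102 134501'
// .split(' ')                                           -> ['20250102', '134501']
// With url = 'https://www.example.com/docs' and a random suffix of '042':
// resultFilename === '20250102_134501_www.example.com_042'
```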
@@ -95,11 +103,9 @@ const startScanQuestions = [
  browserToRun,
  clonedBrowserDataDir,
  playwrightDeviceDetailsObject,
- answers.scanner === ScannerTypes.CUSTOM,
  parseHeaders(answers.header),
  );
-
- deleteClonedProfiles(browserToRun);
+
  switch (res.status) {
  case statuses.success.code:
  answers.finalUrl = res.url;
@@ -21,6 +21,9 @@ import { getAxeConfiguration } from './custom/getAxeConfiguration.js';
  import { flagUnlabelledClickableElements } from './custom/flagUnlabelledClickableElements.js';
  import xPathToCss from './custom/xPathToCss.js';
  import type { Response as PlaywrightResponse } from 'playwright';
+ import fs from 'fs';
+ import { getStoragePath } from '../utils.js';
+ import path from 'path';
 
  // types
  interface AxeResultsWithScreenshot extends AxeResults {
@@ -254,7 +257,7 @@ export const runAxeScript = async ({
  return new Promise(resolve => {
  let timeout: NodeJS.Timeout;
  let mutationCount = 0;
- const MAX_MUTATIONS = 250;
+ const MAX_MUTATIONS = 500;
  const MAX_SAME_MUTATION_LIMIT = 10;
  const mutationHash: Record<string, number> = {};
 
@@ -315,9 +318,9 @@ export const runAxeScript = async ({
  page.on('console', msg => {
  const type = msg.type();
  if (type === 'error') {
- silentLogger.log({ level: 'error', message: msg.text() });
+ consoleLogger.log({ level: 'error', message: msg.text() });
  } else {
- silentLogger.log({ level: 'info', message: msg.text() });
+ consoleLogger.log({ level: 'info', message: msg.text() });
  }
  });
  */
@@ -476,8 +479,11 @@ export const runAxeScript = async ({
  export const createCrawleeSubFolders = async (
  randomToken: string,
  ): Promise<{ dataset: crawlee.Dataset; requestQueue: crawlee.RequestQueue }> => {
- const dataset = await crawlee.Dataset.open(randomToken);
- const requestQueue = await crawlee.RequestQueue.open(randomToken);
+
+ const crawleeDir = path.join(getStoragePath(randomToken),"crawlee");
+
+ const dataset = await crawlee.Dataset.open(crawleeDir);
+ const requestQueue = await crawlee.RequestQueue.open(crawleeDir);
  return { dataset, requestQueue };
  };
 
@@ -27,11 +27,9 @@ import {
  isSkippedUrl,
  isDisallowedInRobotsTxt,
  getUrlsFromRobotsTxt,
- urlWithoutAuth,
  waitForPageLoaded,
- initModifiedUserAgent,
  } from '../constants/common.js';
- import { areLinksEqual, isFollowStrategy } from '../utils.js';
+ import { areLinksEqual, isFollowStrategy, register } from '../utils.js';
  import {
  handlePdfDownload,
  runPdfScan,
@@ -40,6 +38,8 @@ import {
  } from './pdfScanFunc.js';
  import { consoleLogger, guiInfoLog, silentLogger } from '../logs.js';
  import { ViewportSettingsClass } from '../combine.js';
+ import * as path from 'path';
+ import fsp from 'fs/promises';
 
  const isBlacklisted = (url: string, blacklistedPatterns: string[]) => {
  if (!blacklistedPatterns) {
@@ -115,10 +115,6 @@ const crawlDomain = async ({
 
  ({ requestQueue } = await createCrawleeSubFolders(randomToken));
 
- if (!fs.existsSync(randomToken)) {
- fs.mkdirSync(randomToken);
- }
-
  const pdfDownloads: Promise<void>[] = [];
  const uuidToPdfMapping: Record<string, string> = {};
  const isScanHtml = ['all', 'html-only'].includes(fileTypes);
@@ -126,45 +122,11 @@ const crawlDomain = async ({
  const { maxConcurrency } = constants;
  const { playwrightDeviceDetailsObject } = viewportSettings;
 
- const httpsAgent = new https.Agent({ rejectUnauthorized: false });
-
- // Boolean to omit axe scan for basic auth URL
- let isBasicAuth = false;
- let authHeader = '';
-
- // Test basic auth and add auth header if auth exist
- const parsedUrl = new URL(url);
- let username: string;
- let password: string;
- if (parsedUrl.username !== '' && parsedUrl.password !== '') {
- isBasicAuth = true;
- username = decodeURIComponent(parsedUrl.username);
- password = decodeURIComponent(parsedUrl.password);
-
- // Create auth header
- authHeader = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
-
- // Remove username from parsedUrl
- parsedUrl.username = '';
- parsedUrl.password = '';
- // Send the finalUrl without credentials by setting auth header instead
- const finalUrl = parsedUrl.toString();
-
- await requestQueue.addRequest({
- url: finalUrl,
- skipNavigation: isUrlPdf(finalUrl),
- headers: {
- Authorization: authHeader,
- },
- label: finalUrl,
- });
- } else {
- await requestQueue.addRequest({
- url,
- skipNavigation: isUrlPdf(url),
- label: url,
- });
- }
+ await requestQueue.addRequest({
+ url,
+ skipNavigation: isUrlPdf(url),
+ label: url,
+ });
 
  const enqueueProcess = async (
  page: Page,
@@ -377,31 +339,40 @@ const crawlDomain = async ({
 
  let isAbortingScanNow = false;
 
- let userDataDir = '';
- if (userDataDirectory) {
- userDataDir = process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '';
- }
-
- await initModifiedUserAgent(browser, playwrightDeviceDetailsObject);
-
- const crawler = new crawlee.PlaywrightCrawler({
+ const crawler = register(new crawlee.PlaywrightCrawler({
  launchContext: {
  launcher: constants.launcher,
  launchOptions: getPlaywrightLaunchOptions(browser),
  // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
- ...(process.env.CRAWLEE_HEADLESS === '0' && { userDataDir }),
+ ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
  },
  retryOnBlocked: true,
  browserPoolOptions: {
  useFingerprints: false,
  preLaunchHooks: [
  async (_pageId, launchContext) => {
+ const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
+
+ // Ensure base exists
+ await fsp.mkdir(baseDir, { recursive: true });
+
+ // Create a unique subdir per browser
+ const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
+ await fsp.mkdir(subProfileDir, { recursive: true });
+
+ // Assign to Crawlee's launcher
+ launchContext.userDataDir = subProfileDir;
+
+ // Safely extend launchOptions
  launchContext.launchOptions = {
  ...launchContext.launchOptions,
- bypassCSP: true,
  ignoreHTTPSErrors: true,
  ...playwrightDeviceDetailsObject,
+ ...(extraHTTPHeaders && { extraHTTPHeaders }),
  };
+
+ // Optionally log for debugging
+ // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
  },
  ],
  },
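The new pre-launch hook gives every browser in the Crawlee pool its own `profile-*` subdirectory under the scan's user-data directory, so concurrent launches never contend for the same Chromium profile lock. A sketch of the cleanup pass this layout allows (illustrative only; not part of the package):

```ts
import fsp from 'fs/promises';
import path from 'path';

// Sketch: delete the throwaway per-browser profiles once a scan has finished.
// `userDataDirectory` is the same base directory the pre-launch hook writes into.
async function cleanUpSubProfiles(userDataDirectory: string): Promise<void> {
  const entries = await fsp.readdir(userDataDirectory, { withFileTypes: true }).catch(() => []);
  await Promise.all(
    entries
      .filter(entry => entry.isDirectory() && entry.name.startsWith('profile-'))
      .map(entry => fsp.rm(path.join(userDataDirectory, entry.name), { recursive: true, force: true })),
  );
}
```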
@@ -414,7 +385,7 @@ const crawlDomain = async ({
  return new Promise(resolve => {
  let timeout;
  let mutationCount = 0;
- const MAX_MUTATIONS = 250; // stop if things never quiet down
+ const MAX_MUTATIONS = 500; // stop if things never quiet down
  const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
 
  const observer = new MutationObserver(() => {
@@ -464,33 +435,10 @@ const crawlDomain = async ({
  }
  },
  ],
- preNavigationHooks: [ async({ page, request}) => {
- if (isBasicAuth) {
- await page.setExtraHTTPHeaders({
- Authorization: authHeader,
- ...extraHTTPHeaders,
- });
- } else {
- await page.setExtraHTTPHeaders({
- ...extraHTTPHeaders,
- });
- }
- }],
  requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds
  requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
  const browserContext: BrowserContext = page.context();
  try {
- // Set basic auth header if needed
- if (isBasicAuth) {
- await page.setExtraHTTPHeaders({
- Authorization: authHeader,
- });
- const currentUrl = new URL(request.url);
- currentUrl.username = username;
- currentUrl.password = password;
- request.url = currentUrl.href;
- }
-
  await waitForPageLoaded(page, 10000);
  let actualUrl = page.url() || request.loadedUrl || request.url;
 
@@ -652,13 +600,13 @@ const crawlDomain = async ({
  });
 
  urlsCrawled.scanned.push({
- url: urlWithoutAuth(request.url),
+ url: request.url,
  pageTitle: results.pageTitle,
  actualUrl, // i.e. actualUrl
  });
 
  urlsCrawled.scannedRedirects.push({
- fromUrl: urlWithoutAuth(request.url),
+ fromUrl: request.url,
  toUrl: actualUrl, // i.e. actualUrl
  });
 
@@ -671,10 +619,10 @@ const crawlDomain = async ({
  if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
  guiInfoLog(guiInfoStatusTypes.SCANNED, {
  numScanned: urlsCrawled.scanned.length,
- urlScanned: urlWithoutAuth(request.url),
+ urlScanned: request.url,
  });
  urlsCrawled.scanned.push({
- url: urlWithoutAuth(request.url),
+ url: request.url,
  actualUrl: request.url,
  pageTitle: results.pageTitle,
  });
@@ -695,7 +643,7 @@ const crawlDomain = async ({
  });
  }
 
- if (followRobots) await getUrlsFromRobotsTxt(request.url, browser);
+ if (followRobots) await getUrlsFromRobotsTxt(request.url, browser, userDataDirectory, extraHTTPHeaders);
  await enqueueProcess(page, enqueueLinks, browserContext);
  } catch (e) {
  try {
@@ -775,7 +723,7 @@ const crawlDomain = async ({
  scaleDownStepRatio: 0.1, // Scale down slower
  },
  }),
- });
+ }));
 
  await crawler.run();
 
@@ -7,6 +7,8 @@ import crawlDomain from './crawlDomain.js';
  import crawlSitemap from './crawlSitemap.js';
  import { EnqueueStrategy } from 'crawlee';
  import { ViewportSettingsClass } from '../combine.js';
+ import { getPlaywrightLaunchOptions } from '../constants/common.js';
+ import { register } from '../utils.js';
 
  const crawlIntelligentSitemap = async (
  url: string,
@@ -36,9 +38,6 @@ const crawlIntelligentSitemap = async (
  let sitemapUrl;
 
  ({ dataset } = await createCrawleeSubFolders(randomToken));
- if (!fs.existsSync(randomToken)) {
- fs.mkdirSync(randomToken);
- }
 
  function getHomeUrl(parsedUrl: string) {
  const urlObject = new URL(parsedUrl);
@@ -48,15 +47,22 @@ const crawlIntelligentSitemap = async (
  return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
  }
 
- async function findSitemap(link: string) {
+ async function findSitemap(link: string, userDataDirectory: string, extraHTTPHeaders: Record<string, string>) {
  const homeUrl = getHomeUrl(link);
  let sitemapLink = '';
- const chromiumBrowser = await chromium.launch({
- headless: false,
- channel: 'chrome',
- args: ['--headless=new', '--no-sandbox'],
+
+ const effectiveUserDataDirectory = process.env.CRAWLEE_HEADLESS === '1'
+ ? userDataDirectory
+ : '';
+ const context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
+ headless: process.env.CRAWLEE_HEADLESS === '1',
+ ...getPlaywrightLaunchOptions(browser),
+ ...(extraHTTPHeaders && { extraHTTPHeaders }),
  });
- const page = await chromiumBrowser.newPage();
+ register(context);
+
+ const page = await context.newPage();
+
  for (const path of sitemapPaths) {
  sitemapLink = homeUrl + path;
  if (await checkUrlExists(page, sitemapLink)) {
@@ -64,7 +70,8 @@ const crawlIntelligentSitemap = async (
  break;
  }
  }
- await chromiumBrowser.close();
+ await page.close();
+ await context.close().catch(() => { });
  return sitemapExist ? sitemapLink : '';
  }
 
@@ -79,7 +86,7 @@ const crawlIntelligentSitemap = async (
  };
 
  try {
- sitemapUrl = await findSitemap(url);
+ sitemapUrl = await findSitemap(url, userDataDirectory, extraHTTPHeaders);
  } catch (error) {
  consoleLogger.error(error);
  }
@@ -13,11 +13,11 @@ import {
  isFilePath,
  convertLocalFileToPath,
  convertPathToLocalFile,
- initModifiedUserAgent,
  } from '../constants/common.js';
  import { runPdfScan, mapPdfScanResults, doPdfScreenshots } from './pdfScanFunc.js';
  import { guiInfoLog } from '../logs.js';
  import crawlSitemap from './crawlSitemap.js';
+ import { register } from '../utils.js';
 
  export const crawlLocalFile = async ({
  url,
@@ -74,9 +74,6 @@ export const crawlLocalFile = async ({
  ({ dataset } = await createCrawleeSubFolders(randomToken));
  urlsCrawled = { ...constants.urlsCrawledObj };
 
- if (!fs.existsSync(randomToken)) {
- fs.mkdirSync(randomToken);
- }
  }
 
  // Check if the sitemapUrl is a local file and if it exists
@@ -136,16 +133,6 @@ export const crawlLocalFile = async ({
  console.log(e);
  }
 
- if (basicAuthRegex.test(sitemapUrl)) {
- isBasicAuth = true;
- // request to basic auth URL to authenticate for browser session
- finalLinks.push(new Request({ url: sitemapUrl, uniqueKey: `auth:${sitemapUrl}` }));
- const finalUrl = `${sitemapUrl.split('://')[0]}://${sitemapUrl.split('@')[1]}`;
- // obtain base URL without credentials so that subsequent URLs within the same domain can be scanned
- finalLinks.push(new Request({ url: finalUrl }));
- basicAuthPage = -2;
- }
-
  const uuidToPdfMapping: Record<string, string> = {}; // key and value of string type
 
  finalLinks = [...finalLinks, ...linksFromSitemap];
@@ -165,13 +152,18 @@ export const crawlLocalFile = async ({
  let shouldAbort = false;
 
  if (!isUrlPdf(request.url)) {
- await initModifiedUserAgent(browser);
- const browserContext = await constants.launcher.launchPersistentContext('', {
- headless: false,
+ const effectiveUserDataDirectory = process.env.CRAWLEE_HEADLESS === '1'
+ ? userDataDirectory
+ : '';
+
+ const browserContext = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
+ headless: process.env.CRAWLEE_HEADLESS === '1',
  ...getPlaywrightLaunchOptions(browser),
  ...playwrightDeviceDetailsObject,
  });
 
+ register(browserContext);
+
  const timeoutId = scanDuration > 0
  ? setTimeout(() => {
  console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting local file scan.`);