@govtechsg/oobee 0.10.63 → 0.10.65

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Dockerfile CHANGED
@@ -2,9 +2,14 @@
2
2
  # Node version is v22
3
3
  FROM mcr.microsoft.com/playwright:v1.50.1-noble
4
4
 
5
- # Installation of packages for oobee and runner
6
- RUN apt-get update && apt-get install -y zip git
7
-
5
+ # Installation of packages for oobee and runner (locked versions from build log)
6
+ RUN apt-get update && apt-get install -y \
7
+ git=1:2.43.0-1ubuntu7.3 \
8
+ git-man=1:2.43.0-1ubuntu7.3 \
9
+ unzip=6.0-28ubuntu4.1 \
10
+ zip=3.0-13ubuntu0.2 \
11
+ && rm -rf /var/lib/apt/lists/*
12
+
8
13
  WORKDIR /app/oobee
9
14
 
10
15
  # Clone oobee repository
package/README.md CHANGED
@@ -88,6 +88,8 @@ verapdf --version
88
88
  | OOBEE_VALIDATE_URL| When set to `true`, validates if URLs are valid and exits. | `false` |
89
89
  | OOBEE_LOGS_PATH | When set, logs are written to this path. | |
90
90
  | WARN_LEVEL | Only used in tests. | |
91
 + | OOBEE_DISABLE_BROWSER_DOWNLOAD | Experimental flag to disable file downloads on Chrome/Chromium/Edge. Does not affect Local File scans. | |
92
 + | OOBEE_SLOWMO | Experimental flag to slow down web browser behaviour by a specified duration (in milliseconds) | |
91
93
 
92
94
  #### Environment variables used internally (Do not set)
93
95
  Do not set these environment variables or behaviour might change unexpectedly.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@govtechsg/oobee",
3
3
  "main": "dist/npmIndex.js",
4
- "version": "0.10.63",
4
+ "version": "0.10.65",
5
5
  "type": "module",
6
6
  "author": "Government Technology Agency <info@tech.gov.sg>",
7
7
  "dependencies": {
@@ -20,6 +20,7 @@
20
20
  "https": "^1.0.0",
21
21
  "inquirer": "^9.2.12",
22
22
  "jsdom": "^21.1.2",
23
+ "jszip": "^3.10.1",
23
24
  "lodash": "^4.17.21",
24
25
  "mime-types": "^2.1.35",
25
26
  "minimatch": "^9.0.3",
package/src/cli.ts CHANGED
@@ -5,7 +5,7 @@ import printMessage from 'print-message';
5
5
  import { devices } from 'playwright';
6
6
  import { fileURLToPath } from 'url';
7
7
  import path from 'path';
8
- import { cleanUp, setHeadlessMode, getVersion, getStoragePath, listenForCleanUp, cleanUpAndExit } from './utils.js';
8
+ import { setHeadlessMode, getVersion, getStoragePath, listenForCleanUp, cleanUpAndExit } from './utils.js';
9
9
  import {
10
10
  checkUrl,
11
11
  prepareData,
@@ -16,7 +16,6 @@ import {
16
16
  validateDirPath,
17
17
  validateFilePath,
18
18
  validateCustomFlowLabel,
19
- parseHeaders,
20
19
  } from './constants/common.js';
21
20
  import constants, { ScannerTypes } from './constants/constants.js';
22
21
  import { cliOptions, messageOptions } from './constants/cliFunctions.js';
package/src/combine.ts CHANGED
@@ -5,7 +5,7 @@ import crawlDomain from './crawlers/crawlDomain.js';
5
5
  import crawlLocalFile from './crawlers/crawlLocalFile.js';
6
6
  import crawlIntelligentSitemap from './crawlers/crawlIntelligentSitemap.js';
7
7
  import generateArtifacts from './mergeAxeResults.js';
8
- import { getHost, createAndUpdateResultsFolders, createDetailsAndLogs, cleanUp, cleanUpAndExit } from './utils.js';
8
+ import { getHost, createAndUpdateResultsFolders, cleanUpAndExit } from './utils.js';
9
9
  import { ScannerTypes, UrlsCrawled } from './constants/constants.js';
10
10
  import { getBlackListedPatterns, submitForm } from './constants/common.js';
11
11
  import { consoleLogger, silentLogger } from './logs.js';
@@ -218,7 +218,7 @@ const combineRun = async (details: Data, deviceToScan: string) => {
218
218
 
219
219
  scanDetails.endTime = new Date();
220
220
  scanDetails.urlsCrawled = urlsCrawledObj;
221
- await createDetailsAndLogs(randomToken);
221
+
222
222
  if (scanDetails.urlsCrawled) {
223
223
  if (scanDetails.urlsCrawled.scanned.length > 0) {
224
224
  await createAndUpdateResultsFolders(randomToken);
@@ -22,18 +22,18 @@ import constants, {
22
22
  getDefaultChromeDataDir,
23
23
  getDefaultEdgeDataDir,
24
24
  getDefaultChromiumDataDir,
25
- proxy,
26
25
  // Legacy code start - Google Sheets submission
27
26
  formDataFields,
28
27
  // Legacy code end - Google Sheets submission
29
28
  ScannerTypes,
30
29
  BrowserTypes,
31
30
  } from './constants.js';
32
- import { consoleLogger, silentLogger } from '../logs.js';
31
+ import { consoleLogger } from '../logs.js';
33
32
  import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
34
33
  import { cleanUpAndExit, randomThreeDigitNumberString, register } from '../utils.js';
35
34
  import { Answers, Data } from '../index.js';
36
35
  import { DeviceDescriptor } from '../types/types.js';
36
+ import { getProxyInfo, proxyInfoToResolution, ProxySettings } from '../proxyService.js';
37
37
 
38
38
  // validateDirPath validates a provided directory path
39
39
  // returns null if no error
@@ -304,6 +304,7 @@ const checkUrlConnectivityWithBrowser = async (
304
304
  ignoreHTTPSErrors: true,
305
305
  ...getPlaywrightLaunchOptions(browserToRun),
306
306
  ...playwrightDeviceDetailsObject,
307
+ ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
307
308
  });
308
309
 
309
310
  register(browserContext);
@@ -485,7 +486,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
485
486
  viewportWidth,
486
487
  maxpages,
487
488
  strategy,
488
- isLocalFileScan,
489
+ isLocalFileScan = false,
489
490
  browserToRun,
490
491
  nameEmail,
491
492
  customFlowLabel,
@@ -510,6 +511,10 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
510
511
  let username = '';
511
512
  let password = '';
512
513
 
514
+ if (isFilePath(url)) {
515
+ argv.isLocalFileScan = true;
516
+ }
517
+
513
518
  // Remove credentials from URL if not a local file scan
514
519
  url = argv.isLocalFileScan
515
520
  ? url
@@ -550,7 +555,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
550
555
  viewportWidth,
551
556
  );
552
557
 
553
- const { browserToRun: resolvedBrowser, clonedBrowserDataDir } = getBrowserToRun(browserToRun, true, resultFilename);
558
+ const { browserToRun: resolvedBrowser, clonedBrowserDataDir } = getBrowserToRun(resultFilename, browserToRun, true);
554
559
  browserToRun = resolvedBrowser;
555
560
 
556
561
  const resolvedUserDataDirectory = getClonedProfilesWithRandomToken(browserToRun, resultFilename);
@@ -1005,14 +1010,10 @@ export const validName = (name: string) => {
1005
1010
  * @returns object consisting of browser to run and cloned data directory
1006
1011
  */
1007
1012
  export const getBrowserToRun = (
1013
+ randomToken: string,
1008
1014
  preferredBrowser?: BrowserTypes,
1009
1015
  isCli = false,
1010
- randomToken?: string
1011
1016
  ): { browserToRun: BrowserTypes; clonedBrowserDataDir: string } => {
1012
-
1013
- if (!randomToken) {
1014
- randomToken = '';
1015
- }
1016
1017
 
1017
1018
  const platform = os.platform();
1018
1019
 
@@ -1597,15 +1598,8 @@ export const submitFormViaPlaywright = async (
1597
1598
  userDataDirectory: string,
1598
1599
  finalUrl: string,
1599
1600
  ) => {
1600
- const dirName = `clone-${Date.now()}`;
1601
- let clonedDir = null;
1602
- if (proxy && browserToRun === BrowserTypes.EDGE) {
1603
- clonedDir = cloneEdgeProfiles(dirName);
1604
- } else if (proxy && browserToRun === BrowserTypes.CHROME) {
1605
- clonedDir = cloneChromeProfiles(dirName);
1606
- }
1607
1601
  const browserContext = await constants.launcher.launchPersistentContext(
1608
- clonedDir || userDataDirectory,
1602
+ userDataDirectory,
1609
1603
  {
1610
1604
  ...getPlaywrightLaunchOptions(browserToRun),
1611
1605
  },
@@ -1618,7 +1612,7 @@ export const submitFormViaPlaywright = async (
1618
1612
  try {
1619
1613
  await page.goto(finalUrl, {
1620
1614
  timeout: 30000,
1621
- ...(proxy && { waitUntil: 'commit' }),
1615
+ waitUntil: 'commit',
1622
1616
  });
1623
1617
 
1624
1618
  try {
@@ -1630,11 +1624,6 @@ export const submitFormViaPlaywright = async (
1630
1624
  consoleLogger.error(error);
1631
1625
  } finally {
1632
1626
  await browserContext.close();
1633
- if (proxy && browserToRun === BrowserTypes.EDGE) {
1634
- deleteClonedEdgeProfiles(clonedDir);
1635
- } else if (proxy && browserToRun === BrowserTypes.CHROME) {
1636
- deleteClonedChromeProfiles(clonedDir);
1637
- }
1638
1627
  }
1639
1628
  };
1640
1629
 
@@ -1673,19 +1662,17 @@ export const submitForm = async (
1673
1662
  finalUrl += `&${formDataFields.redirectUrlField}=${scannedUrl}`;
1674
1663
  }
1675
1664
 
1676
- if (proxy) {
1677
- await submitFormViaPlaywright(browserToRun, userDataDirectory, finalUrl);
1678
- } else {
1679
- try {
1680
- await axios.get(finalUrl, { timeout: 2000 });
1681
- } catch (error) {
1682
- if (error.code === 'ECONNABORTED') {
1683
- if (browserToRun || constants.launcher === webkit) {
1684
- await submitFormViaPlaywright(browserToRun, userDataDirectory, finalUrl);
1685
- }
1665
+
1666
+ try {
1667
+ await axios.get(finalUrl, { timeout: 2000 });
1668
+ } catch (error) {
1669
+ if (error.code === 'ECONNABORTED') {
1670
+ if (browserToRun || constants.launcher === webkit) {
1671
+ await submitFormViaPlaywright(browserToRun, userDataDirectory, finalUrl);
1686
1672
  }
1687
1673
  }
1688
1674
  }
1675
+
1689
1676
  };
1690
1677
  // Legacy code end - Google Sheets submission
1691
1678
 
@@ -1736,42 +1723,61 @@ export async function initModifiedUserAgent(
1736
1723
  // console.log('Modified User Agent:', modifiedUA);
1737
1724
  }
1738
1725
 
1726
+ const cacheProxyInfo = getProxyInfo();
1727
+
1739
1728
  /**
1740
1729
  * @param {string} browser browser name ("chrome" or "edge", null for chromium, the default Playwright browser)
1741
1730
  * @returns playwright launch options object. For more details: https://playwright.dev/docs/api/class-browsertype#browser-type-launch
1742
1731
  */
1743
1732
  export const getPlaywrightLaunchOptions = (browser?: string): LaunchOptions => {
1744
- let channel: string;
1745
- if (browser) {
1746
- channel = browser;
1747
- }
1733
+ const channel = browser || undefined;
1734
+
1735
+ const resolution = proxyInfoToResolution(cacheProxyInfo);
1736
+
1737
+ // Start with your base args
1738
+ const finalArgs = [...constants.launchOptionsArgs];
1748
1739
 
1749
- // Set new headless mode as Chrome 132 does not support headless=old
1750
- // Also mute audio
1740
+ // Headless flags (unchanged)
1751
1741
  if (process.env.CRAWLEE_HEADLESS === '1') {
1752
- constants.launchOptionsArgs.push('--headless=new');
1753
- constants.launchOptionsArgs.push('--mute-audio');
1742
+ if (!finalArgs.includes('--headless=new')) finalArgs.push('--headless=new');
1743
+ if (!finalArgs.includes('--mute-audio')) finalArgs.push('--mute-audio');
1744
+ }
1745
+
1746
+ // Map resolution to Playwright options
1747
+ let proxyOpt: ProxySettings | undefined;
1748
+ switch (resolution.kind) {
1749
+ case 'manual':
1750
+ proxyOpt = resolution.settings;
1751
+ break;
1752
+ case 'pac': {
1753
+ finalArgs.push(`--proxy-pac-url=${resolution.pacUrl}`);
1754
+ if (resolution.bypass) finalArgs.push(`--proxy-bypass-list=${resolution.bypass}`);
1755
+ break;
1756
+ }
1757
+ case 'none':
1758
+ // nothing
1759
+ break;
1754
1760
  }
1755
1761
 
1756
1762
  const options: LaunchOptions = {
1757
- // Drop the --use-mock-keychain flag to allow MacOS devices
1758
- // to use the cloned cookies.
1759
1763
  ignoreDefaultArgs: ['--use-mock-keychain', '--headless'],
1760
- // necessary from Chrome 132 to use our own headless=new flag
1761
- args: constants.launchOptionsArgs,
1764
+ args: finalArgs,
1762
1765
  headless: false,
1763
- ...(channel && { channel }), // Having no channel is equivalent to "chromium"
1766
+ ...(channel && { channel }),
1767
+ ...(proxyOpt ? { proxy: proxyOpt } : {}),
1764
1768
  };
1765
1769
 
1766
- // Necessary as Chrome 132 does not support headless=old
1767
- options.headless = false;
1770
+ // SlowMo (unchanged)
1771
+ if (!options.slowMo && process.env.OOBEE_SLOWMO && Number(process.env.OOBEE_SLOWMO) >= 1) {
1772
+ options.slowMo = Number(process.env.OOBEE_SLOWMO);
1773
+ consoleLogger.info(`Enabled browser slowMo with value: ${process.env.OOBEE_SLOWMO}ms`);
1774
+ }
1768
1775
 
1769
- if (proxy) {
1770
- options.slowMo = 1000; // To ensure server-side rendered proxy page is loaded
1771
- } else if (browser === BrowserTypes.EDGE && os.platform() === 'win32') {
1772
- // edge should be in non-headless mode
1776
+ // Edge on Windows should not be headless (unchanged)
1777
+ if (browser === BrowserTypes.EDGE && os.platform() === 'win32') {
1773
1778
  options.headless = false;
1774
1779
  }
1780
+
1775
1781
  return options;
1776
1782
  };
1777
1783
 
@@ -141,14 +141,53 @@ export const getDefaultChromiumDataDir = () => {
141
141
  }
142
142
  };
143
143
 
144
- export const removeQuarantineFlag = function (searchPath: string) {
145
- if (os.platform() === 'darwin') {
146
- const execPaths = globSync(searchPath, { absolute: true, nodir: true });
147
- if (execPaths.length > 0) {
148
- execPaths.forEach(filePath => spawnSync('xattr', ['-d', 'com.apple.quarantine', filePath]));
144
+ export function removeQuarantineFlag(searchPattern: string, allowedRoot = process.cwd()) {
145
+ if (os.platform() !== 'darwin') return;
146
+
147
+ const matches = globSync(searchPattern, {
148
+ absolute: true,
149
+ nodir: true,
150
+ dot: true,
151
+ follow: false, // don't follow symlinks
152
+ });
153
+
154
+ const root = path.resolve(allowedRoot);
155
+
156
+ for (const p of matches) {
157
+ const resolved = path.resolve(p);
158
+
159
+ // Ensure the file is under the allowed root (containment check)
160
+ if (!resolved.startsWith(root + path.sep)) continue;
161
+
162
+ // lstat: skip if not a regular file or if it's a symlink
163
+ let st: fs.Stats;
164
+ try {
165
+ st = fs.lstatSync(resolved);
166
+ } catch {
167
+ continue;
168
+ }
169
+ if (!st.isFile() || st.isSymbolicLink()) continue;
170
+
171
+ // basic filename sanity: no control chars
172
+ const base = path.basename(resolved);
173
+ if (/[\x00-\x1F]/.test(base)) continue;
174
+
175
+ // Use absolute binary path and terminate options with "--"
176
+ const proc = spawnSync('/usr/bin/xattr', ['-d', 'com.apple.quarantine', '--', resolved], {
177
+ stdio: ['ignore', 'ignore', 'pipe'],
178
+ });
179
+
180
+ // Optional: inspect errors (common benign case is "No such xattr")
181
+ if (proc.status !== 0) {
182
+ const err = proc.stderr?.toString() || '';
183
+ // swallow benign errors; otherwise log if you have a logger
184
+ if (!/No such xattr/i.test(err)) {
185
+ // console.warn(`xattr failed for ${resolved}: ${err.trim()}`);
186
+ }
149
187
  }
150
188
  }
151
- };
189
+ }
190
+
152
191
 
153
192
  export const getExecutablePath = function (dir: string, file: string): string {
154
193
  let execPaths = globSync(`${dir}/${file}`, { absolute: true, nodir: true });
@@ -228,71 +267,6 @@ if (fs.existsSync('/.dockerenv')) {
228
267
  launchOptionsArgs = ['--disable-gpu', '--no-sandbox', '--disable-dev-shm-usage'];
229
268
  }
230
269
 
231
- type ProxyInfo = { type: 'autoConfig' | 'manualProxy'; url: string } | null;
232
-
233
- function queryRegKey(key: string): Record<string, string> {
234
- try {
235
- const out = execSync(`reg query "${key}"`, { encoding: 'utf8', stdio: ['ignore', 'pipe', 'pipe'] });
236
- const values: Record<string, string> = {};
237
- for (const line of out.split(/\r?\n/)) {
238
- const parts = line.trim().split(/\s{2,}/);
239
- if (parts.length >= 3) {
240
- const [name, _type, ...rest] = parts;
241
- values[name] = rest.join(' ');
242
- }
243
- }
244
- return values;
245
- } catch {
246
- return {};
247
- }
248
- }
249
-
250
- function parseDwordFlag(v: unknown): number {
251
- if (v == null) return 0;
252
- const s = String(v).trim();
253
- // Handles "1", "0", "0x1", "0x0"
254
- if (/^0x[0-9a-f]+$/i.test(s)) return parseInt(s, 16);
255
- if (/^\d+$/.test(s)) return parseInt(s, 10);
256
- return 0;
257
- }
258
-
259
- function normalizePacUrl(u: string): string {
260
- const s = u.trim();
261
- // If it lacks a scheme, assume http:// (Chrome requires a full URL)
262
- return /^(https?|file):/i.test(s) ? s : `http://${s}`;
263
- }
264
-
265
- export const getProxy = (): ProxyInfo => {
266
- if (os.platform() !== 'win32') return null;
267
-
268
- const values = queryRegKey('HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Internet Settings');
269
- const pacUrlRaw = (values['AutoConfigURL'] || '').trim();
270
- const proxyEnableRaw = (values['ProxyEnable'] || '').trim();
271
- const proxyServerRaw = (values['ProxyServer'] || '').trim();
272
-
273
- // 1) PAC beats manual proxy if present
274
- if (pacUrlRaw) {
275
- return { type: 'autoConfig', url: normalizePacUrl(pacUrlRaw) };
276
- }
277
-
278
- // 2) Manual proxy only if enabled
279
- const enabled = parseDwordFlag(proxyEnableRaw) === 1;
280
- if (enabled && proxyServerRaw) {
281
- return { type: 'manualProxy', url: proxyServerRaw };
282
- }
283
-
284
- return null;
285
- };
286
-
287
- // Usage
288
- export const proxy = getProxy();
289
-
290
- if (proxy?.type === 'autoConfig') {
291
- launchOptionsArgs.push(`--proxy-pac-url=${proxy.url}`);
292
- } else if (proxy?.type === 'manualProxy') {
293
- launchOptionsArgs.push(`--proxy-server=${proxy.url}`);
294
- }
295
-
296
270
  export const impactOrder = {
297
271
  minor: 0,
298
272
  moderate: 1,
@@ -81,13 +81,26 @@ const startScanQuestions = [
81
81
 
82
82
  // construct filename for scan results
83
83
  const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
84
- const domain = new URL(url).hostname;
84
+ let domain = '';
85
+ try {
86
+ domain = new URL(url).hostname;
87
+ } catch (error) {
88
+ // If the input is a local filepath, try to resolve it
89
+ const finalFilePath = getFileSitemap(url);
90
+ if (finalFilePath) {
91
+ answers.isLocalFileScan = true;
92
+ answers.finalUrl = finalFilePath;
93
+ return true;
94
+ }
95
+ return 'Invalid URL';
96
+ }
97
+
85
98
  let resultFilename: string;
86
99
  const randomThreeDigitNumber = randomThreeDigitNumberString();
87
100
  resultFilename = `${date}_${time}_${domain}_${randomThreeDigitNumber}`;
88
101
 
89
102
  const statuses = constants.urlCheckStatuses;
90
- const { browserToRun, clonedBrowserDataDir } = getBrowserToRun(BrowserTypes.CHROME, false, resultFilename);
103
+ const { browserToRun, clonedBrowserDataDir } = getBrowserToRun(resultFilename, BrowserTypes.CHROME, false);
91
104
 
92
105
  setHeadlessMode(browserToRun, answers.headless);
93
106
 
@@ -368,6 +368,7 @@ const crawlDomain = async ({
368
368
  ...launchContext.launchOptions,
369
369
  ignoreHTTPSErrors: true,
370
370
  ...playwrightDeviceDetailsObject,
371
+ ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
371
372
  ...(extraHTTPHeaders && { extraHTTPHeaders }),
372
373
  };
373
374
 
@@ -41,9 +41,6 @@ const crawlIntelligentSitemap = async (
41
41
 
42
42
  function getHomeUrl(parsedUrl: string) {
43
43
  const urlObject = new URL(parsedUrl);
44
- if (urlObject.username && urlObject.password) {
45
- return `${urlObject.protocol}//${urlObject.username}:${urlObject.password}@${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
46
- }
47
44
  return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
48
45
  }
49
46
 
@@ -6,6 +6,7 @@ import constants, {
6
6
  guiInfoStatusTypes,
7
7
  basicAuthRegex,
8
8
  UrlsCrawled,
9
+ STATUS_CODE_METADATA,
9
10
  } from '../constants/constants.js';
10
11
  import { ViewportSettingsClass } from '../combine.js';
11
12
  import {
@@ -17,7 +18,7 @@ import {
17
18
  import { runPdfScan, mapPdfScanResults, doPdfScreenshots } from './pdfScanFunc.js';
18
19
  import { guiInfoLog } from '../logs.js';
19
20
  import crawlSitemap from './crawlSitemap.js';
20
- import { register } from '../utils.js';
21
+ import { getPdfStoragePath, getStoragePath, register } from '../utils.js';
21
22
 
22
23
  export const crawlLocalFile = async ({
23
24
  url,
@@ -59,7 +60,7 @@ export const crawlLocalFile = async ({
59
60
  let dataset: any;
60
61
  let urlsCrawled: UrlsCrawled;
61
62
  let linksFromSitemap = [];
62
- let sitemapUrl = url;
63
+ let sitemapUrl: string;
63
64
 
64
65
  // Boolean to omit axe scan for basic auth URL
65
66
  let isBasicAuth: boolean;
@@ -76,10 +77,13 @@ export const crawlLocalFile = async ({
76
77
 
77
78
  }
78
79
 
80
+ // Checks if its in the right file format, and change it before placing into linksFromSitemap
81
+ url = convertLocalFileToPath(url);
82
+
79
83
  // Check if the sitemapUrl is a local file and if it exists
80
- if (!isFilePath(sitemapUrl) || !fs.existsSync(sitemapUrl)) {
84
+ if (!fs.existsSync(url) && !isFilePath(url)) {
81
85
  // Convert to an absolute path
82
- let normalizedPath = path.resolve(sitemapUrl);
86
+ let normalizedPath = path.resolve(url);
83
87
 
84
88
  // Normalize the path to handle different path separators
85
89
  normalizedPath = path.normalize(normalizedPath);
@@ -90,17 +94,15 @@ export const crawlLocalFile = async ({
90
94
  }
91
95
 
92
96
  // At this point, normalizedPath is a valid and existing file path
93
- sitemapUrl = normalizedPath;
97
+ url = normalizedPath;
94
98
  }
95
99
 
96
- // Checks if its in the right file format, and change it before placing into linksFromSitemap
97
- convertLocalFileToPath(sitemapUrl);
98
-
99
100
  // XML Files
100
- if (!(sitemapUrl.match(/\.xml$/i) || sitemapUrl.match(/\.txt$/i))) {
101
- linksFromSitemap = [new Request({ url: sitemapUrl })];
101
+ if (!(url.match(/\.xml$/i) || url.match(/\.txt$/i))) {
102
+ linksFromSitemap = [new Request({ url: url })];
102
103
  // Non XML file
103
104
  } else {
105
+ sitemapUrl = url;
104
106
  // Put it to crawlSitemap function to handle xml files
105
107
  const updatedUrlsCrawled = await crawlSitemap({
106
108
  sitemapUrl,
@@ -127,12 +129,6 @@ export const crawlLocalFile = async ({
127
129
  return urlsCrawled;
128
130
  }
129
131
 
130
- try {
131
- sitemapUrl = encodeURI(sitemapUrl);
132
- } catch (e) {
133
- console.log(e);
134
- }
135
-
136
132
  const uuidToPdfMapping: Record<string, string> = {}; // key and value of string type
137
133
 
138
134
  finalLinks = [...finalLinks, ...linksFromSitemap];
@@ -142,16 +138,10 @@ export const crawlLocalFile = async ({
142
138
  });
143
139
 
144
140
  const request = linksFromSitemap[0];
145
- const pdfFileName = path.basename(request.url);
146
- const trimmedUrl: string = request.url;
147
- const destinationFilePath: string = `${randomToken}/${pdfFileName}`;
148
- const data: Buffer = fs.readFileSync(trimmedUrl);
149
- fs.writeFileSync(destinationFilePath, data);
150
- uuidToPdfMapping[pdfFileName] = trimmedUrl;
151
141
 
152
142
  let shouldAbort = false;
153
143
 
154
- if (!isUrlPdf(request.url)) {
144
+ if (!isUrlPdf(url)) {
155
145
  const effectiveUserDataDirectory = process.env.CRAWLEE_HEADLESS === '1'
156
146
  ? userDataDirectory
157
147
  : '';
@@ -160,6 +150,7 @@ export const crawlLocalFile = async ({
160
150
  headless: process.env.CRAWLEE_HEADLESS === '1',
161
151
  ...getPlaywrightLaunchOptions(browser),
162
152
  ...playwrightDeviceDetailsObject,
153
+ ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
163
154
  });
164
155
 
165
156
  register(browserContext);
@@ -172,8 +163,8 @@ export const crawlLocalFile = async ({
172
163
  : null;
173
164
 
174
165
  const page = await browserContext.newPage();
175
- request.url = convertPathToLocalFile(request.url);
176
- await page.goto(request.url);
166
+ url = convertPathToLocalFile(url);
167
+ await page.goto(url);
177
168
 
178
169
  if (shouldAbort) {
179
170
  console.warn('Scan aborted due to timeout before page scan.');
@@ -184,33 +175,39 @@ export const crawlLocalFile = async ({
184
175
 
185
176
  const results = await runAxeScript({ includeScreenshots, page, randomToken });
186
177
 
187
- const actualUrl = page.url() || request.loadedUrl || request.url;
178
+ const actualUrl = page.url() || request.loadedUrl || url;
188
179
 
189
180
  guiInfoLog(guiInfoStatusTypes.SCANNED, {
190
181
  numScanned: urlsCrawled.scanned.length,
191
- urlScanned: request.url,
182
+ urlScanned: url,
192
183
  });
193
184
 
194
185
  urlsCrawled.scanned.push({
195
- url: request.url,
186
+ url: url,
196
187
  pageTitle: results.pageTitle,
197
188
  actualUrl: actualUrl, // i.e. actualUrl
198
189
  });
199
190
 
200
191
  urlsCrawled.scannedRedirects.push({
201
- fromUrl: request.url,
192
+ fromUrl: url,
202
193
  toUrl: actualUrl, // i.e. actualUrl
203
194
  });
204
195
 
205
- results.url = request.url;
196
+ results.url = url;
206
197
  results.actualUrl = actualUrl;
207
198
 
208
199
  await dataset.pushData(results);
209
200
  } else {
201
+
202
+ const pdfFileName = path.basename(url);
203
+ const destinationFilePath: string = path.join(getPdfStoragePath(randomToken), pdfFileName);
204
+ fs.copyFileSync(url, destinationFilePath);
205
+ uuidToPdfMapping[pdfFileName] = url;
206
+
210
207
  urlsCrawled.scanned.push({
211
- url: trimmedUrl,
208
+ url: url,
212
209
  pageTitle: pdfFileName,
213
- actualUrl: trimmedUrl,
210
+ actualUrl: url,
214
211
  });
215
212
 
216
213
  await runPdfScan(randomToken);
@@ -21,7 +21,7 @@ import {
21
21
  isFilePath,
22
22
  } from '../constants/common.js';
23
23
  import { areLinksEqual, isWhitelistedContentType, register } from '../utils.js';
24
- import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
24
+ import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots } from './pdfScanFunc.js';
25
25
  import { guiInfoLog } from '../logs.js';
26
26
  import { ViewportSettingsClass } from '../combine.js';
27
27
  import * as path from 'path';
@@ -135,6 +135,7 @@ const crawlSitemap = async ({
135
135
  ...launchContext.launchOptions,
136
136
  ignoreHTTPSErrors: true,
137
137
  ...playwrightDeviceDetailsObject,
138
+ ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
138
139
  };
139
140
 
140
141
  // Optionally log for debugging
@@ -412,11 +413,11 @@ const crawlSitemap = async ({
412
413
  const pdfResults = await mapPdfScanResults(randomToken, uuidToPdfMapping);
413
414
 
414
415
  // get screenshots from pdf docs
415
- // if (includeScreenshots) {
416
- // await Promise.all(pdfResults.map(
417
- // async result => await doPdfScreenshots(randomToken, result)
418
- // ));
419
- // }
416
+ if (includeScreenshots) {
417
+ await Promise.all(pdfResults.map(
418
+ async result => await doPdfScreenshots(randomToken, result)
419
+ ));
420
+ }
420
421
 
421
422
  // push results for each pdf document to key value store
422
423
  await Promise.all(pdfResults.map(result => dataset.pushData(result)));