@govtechsg/oobee 0.10.92 → 0.10.94

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/AGENTS.md +34 -0
  2. package/README.md +19 -0
  3. package/dist/cli.js +3 -2
  4. package/dist/combine.js +4 -4
  5. package/dist/constants/common.js +136 -49
  6. package/dist/crawlers/commonCrawlerFunc.js +54 -2
  7. package/dist/crawlers/crawlDomain.js +9 -2
  8. package/dist/crawlers/crawlIntelligentSitemap.js +9 -4
  9. package/dist/crawlers/crawlSitemap.js +14 -2
  10. package/dist/crawlers/custom/utils.js +22 -9
  11. package/dist/crawlers/guards/urlGuard.js +19 -1
  12. package/dist/crawlers/runCustom.js +8 -2
  13. package/dist/generateOobeeClientScanner.js +1 -1
  14. package/dist/mergeAxeResults/itemsStore.js +32 -3
  15. package/dist/static/ejs/partials/components/allIssues/CategoryBadges.ejs +3 -0
  16. package/dist/static/ejs/partials/components/allIssues/IssuesTable.ejs +3 -3
  17. package/dist/static/ejs/partials/components/header/aboutScanModal/AboutScanModal.ejs +1 -1
  18. package/dist/static/ejs/partials/components/header/aboutScanModal/ScanConfiguration.ejs +3 -3
  19. package/dist/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs +34 -27
  20. package/dist/static/ejs/partials/components/ruleModal/ruleOffcanvas.ejs +1 -0
  21. package/dist/static/ejs/partials/components/scannedPagesSegmentedTabs.ejs +7 -0
  22. package/dist/static/ejs/partials/components/wcagCoverageDetails.ejs +5 -5
  23. package/dist/static/ejs/partials/scripts/header/aboutScanModal/AboutScanModal.ejs +3 -3
  24. package/dist/static/ejs/partials/scripts/prioritiseIssues/PrioritiseIssues.ejs +21 -19
  25. package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +39 -8
  26. package/dist/static/ejs/partials/scripts/scannedPagesSegmentedTabs.ejs +11 -5
  27. package/dist/static/ejs/partials/scripts/screenshotLightbox.ejs +49 -31
  28. package/dist/static/ejs/partials/styles/header/SiteInfo.ejs +1 -1
  29. package/dist/static/ejs/partials/styles/header/aboutScanModal/ScanDetails.ejs +36 -16
  30. package/dist/static/ejs/partials/styles/prioritiseIssues/PrioritiseIssues.ejs +22 -1
  31. package/dist/static/ejs/partials/styles/styles.ejs +1 -1
  32. package/dist/static/ejs/partials/styles/wcagCompliance/WcagGaugeBar.ejs +6 -0
  33. package/dist/static/ejs/partials/styles/wcagCompliance.ejs +5 -4
  34. package/dist/static/ejs/partials/styles/wcagCoverageDetails.ejs +6 -1
  35. package/oobee-client-scanner.js +4 -4
  36. package/package.json +2 -2
  37. package/src/cli.ts +3 -2
  38. package/src/combine.ts +4 -2
  39. package/src/constants/common.ts +131 -35
  40. package/src/crawlers/commonCrawlerFunc.ts +56 -2
  41. package/src/crawlers/crawlDomain.ts +11 -1
  42. package/src/crawlers/crawlIntelligentSitemap.ts +10 -4
  43. package/src/crawlers/crawlSitemap.ts +19 -2
  44. package/src/crawlers/custom/utils.ts +26 -13
  45. package/src/crawlers/guards/urlGuard.ts +18 -1
  46. package/src/crawlers/runCustom.ts +10 -1
  47. package/src/generateOobeeClientScanner.ts +1 -1
  48. package/src/mergeAxeResults/itemsStore.ts +37 -3
  49. package/src/static/ejs/partials/components/allIssues/CategoryBadges.ejs +3 -0
  50. package/src/static/ejs/partials/components/allIssues/IssuesTable.ejs +3 -3
  51. package/src/static/ejs/partials/components/header/aboutScanModal/AboutScanModal.ejs +1 -1
  52. package/src/static/ejs/partials/components/header/aboutScanModal/ScanConfiguration.ejs +3 -3
  53. package/src/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs +34 -27
  54. package/src/static/ejs/partials/components/ruleModal/ruleOffcanvas.ejs +1 -0
  55. package/src/static/ejs/partials/components/scannedPagesSegmentedTabs.ejs +7 -0
  56. package/src/static/ejs/partials/components/wcagCoverageDetails.ejs +5 -5
  57. package/src/static/ejs/partials/scripts/header/aboutScanModal/AboutScanModal.ejs +3 -3
  58. package/src/static/ejs/partials/scripts/prioritiseIssues/PrioritiseIssues.ejs +21 -19
  59. package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +39 -8
  60. package/src/static/ejs/partials/scripts/scannedPagesSegmentedTabs.ejs +11 -5
  61. package/src/static/ejs/partials/scripts/screenshotLightbox.ejs +49 -31
  62. package/src/static/ejs/partials/styles/header/SiteInfo.ejs +1 -1
  63. package/src/static/ejs/partials/styles/header/aboutScanModal/ScanDetails.ejs +36 -16
  64. package/src/static/ejs/partials/styles/prioritiseIssues/PrioritiseIssues.ejs +22 -1
  65. package/src/static/ejs/partials/styles/styles.ejs +1 -1
  66. package/src/static/ejs/partials/styles/wcagCompliance/WcagGaugeBar.ejs +6 -0
  67. package/src/static/ejs/partials/styles/wcagCompliance.ejs +5 -4
  68. package/src/static/ejs/partials/styles/wcagCoverageDetails.ejs +6 -1
  69. package/testStaticJSScanner.html +1 -1
  70. /package/{d5e2f6a7-0279-41a3-8763-844970cdf0ba.txt → 67e8137b-1939-4253-8f11-a82bc833cfcb.txt} +0 -0
package/AGENTS.md CHANGED
@@ -79,6 +79,7 @@ All crawlers use Crawlee's `PlaywrightCrawler` with:
79
79
  - Docker detection (`/.dockerenv`): adds `--disable-gpu`, `--no-sandbox`, `--disable-dev-shm-usage`
80
80
  - Proxy support (manual, PAC, or none) via `getProxyInfo()`
81
81
  - Channel set from browser name (undefined for chromium = bundled)
82
+ - `--mute-audio` is added by default in both headless and headful modes, but must be disabled for `customFlow` by calling `getPlaywrightLaunchOptions(browser, { includeMuteAudio: false })`
82
83
 
83
84
  ### User-Agent
84
85
 
@@ -111,6 +112,10 @@ Important behaviors:
111
112
  - The crawler itself enforces `maxRequestsPerCrawl` by counting only successfully scanned pages
112
113
  - `constants.sitemapFetchedLinks` stores the total discovered count for `scanData.json` reporting
113
114
  - For sitemap indexes, child sitemaps are processed recursively
115
+ - Some sitemap XMLs include `<?xml-stylesheet ...?>` (XSL). In `getDataUsingPlaywright()`:
116
+ - Use `waitUntil: 'domcontentloaded'` (not `networkidle`) to avoid 60s timeouts caused by stylesheet/resource loading
117
+ - Prefer `response.text()` to capture raw XML before browser XSL transformation (preserves `<sitemapindex>` / `<urlset>` structure)
118
+ - Only fall back to DOM extraction when raw response text is unavailable
114
119
 
115
120
  ## Shared Mutable State
116
121
 
@@ -134,6 +139,11 @@ The `constants` default export object holds runtime state:
134
139
  | `OOBEE_SLOWMO` | Browser slowmo in ms |
135
140
  | `OOBEE_FAST_CRAWLER` | Experimental high-concurrency mode |
136
141
  | `OOBEE_DISABLE_BROWSER_DOWNLOAD` | Block browser file downloads |
142
+ | `OOBEE_TAGGED_WEBSITE` | Tag to identify the website in Sentry telemetry (overridden by `--websiteTag` CLI flag) |
143
+ | `OOBEE_SCAN_METADATA` | Overrides `entryUrl` tag in Sentry events |
144
+ | `OOBEE_SCAN_PRODUCT` | Adds `scanProduct` tag to Sentry events |
145
+ | `OOBEE_CONSECUTIVE_MAX_RETRIES` | Max consecutive HTTP failures before circuit breaker aborts crawl (default 100) |
146
+ | `OOBEE_VALIDATE_URL` | If set, exit after URL validation without scanning |
137
147
  | `HTTP_PROXY` / `HTTPS_PROXY` / `ALL_PROXY` | Proxy configuration |
138
148
  | `NO_PROXY` / `INCLUDE_PROXY` | Proxy bypass/include lists |
139
149
 
@@ -215,6 +225,20 @@ docker run oobee node dist/cli.js ...
215
225
 
216
226
  8. **Crawlee dataset** — Results are stored as numbered JSON files in `{randomToken}/datasets/default/`. Each file is one page's axe results. `generateArtifacts()` reads all of them.
217
227
 
228
+ 9. **Auth headers and CORS** — Never set `Authorization` in `extraHTTPHeaders` globally on a browser context. Playwright sends `extraHTTPHeaders` to ALL requests (including cross-origin CDNs), which triggers CORS preflight failures. Instead use `splitAuthHeaders()` from `commonCrawlerFunc.ts` to separate auth from non-auth headers:
229
+ - Non-auth headers → safe to set globally via `extraHTTPHeaders` on context/launch options
230
+ - Basic auth → set `httpCredentials` on context (Playwright auto-responds to 401 challenges, origin-aware)
231
+ - Any Authorization header → send only to same-origin requests via `addAuthRouteHandler()` (route interception) or Crawlee's `preNavigationHooks` (navigation-only)
232
+ - Credentials come from URL-embedded `user:pass@host` or `-m "Authorization Basic ..."` — both produce the same `extraHTTPHeaders.Authorization` value in `prepareData()`
233
+
234
+ 10. **Intermediate JSONL write safety + corruption tolerance** — `ItemsStore.appendPageItems()` requires strict serialization of writes per rule file to prevent interleaved corruption. It also enforces a strict text sanitization regex to filter out literal `\n` and `\r` control characters from website HTML inputs immediately after `JSON.stringify()`. This ensures no single JSON issue accidentally injects illegal implicit newline boundaries when writing to JSONL format. Maintain backward-compatible `fs.appendFile` queues over buffered WriteStreams to guarantee pipeline sync visibility. `ItemsStore.readRuleItems()` tolerates historical malformed lines via fallback skip logic.
235
+
236
+ 11. **`preNavigationHooks` and the Playwright header-rewrite warning** — `preNavigationHooks()` in `commonCrawlerFunc.ts` is always included in the crawler `preNavigationHooks` array (for both `crawlDomain` and `crawlSitemap`). The hook does two things:
237
+ - **Header rewriting**: only sets `crawlingContext.request.headers = extraHTTPHeaders` when `extraHTTPHeaders` is non-empty. Setting request headers causes Crawlee/Playwright to intercept every network request to rewrite them, which triggers `WARN Playwright Utils: Using other request methods than GET, rewriting headers and adding payloads has a high impact on performance`. This warning is expected for authenticated scans; it is suppressed for unauthenticated scans because `extraHTTPHeaders` stays empty (see pitfall 12 below).
238
+ - **Navigation wait**: always sets `gotoOptions.waitUntil = 'domcontentloaded'` and `gotoOptions.timeout = 30000` via **in-place object mutation**. Do NOT reassign the `gotoOptions` parameter (`gotoOptions = {...}`) — that only rebinds the local variable and does not propagate to Crawlee. `domcontentloaded` is used (not `networkidle`) to avoid indefinite hangs on sites with WebSockets, analytics polling, lazy-load beacons, or health-check pings that never quiet their network activity. Further page stability is handled by `waitForPageLoaded()` in each requestHandler and the DOM mutation observer in `postNavigationHooks`.
239
+
240
+ 12. **`extraHTTPHeaders` must not be mutated before being passed to crawlers** — `checkUrlConnectivityWithBrowser()` in `common.ts` needs an `Accept` header for its own connectivity check but must NOT add it to the shared `extraHTTPHeaders` object. Mutating the shared object causes crawlers to see a non-empty `extraHTTPHeaders` (at minimum `{ Accept: '...' }`), which silently triggers header rewriting and the Playwright performance warning for every unauthenticated scan. Always use a local copy: `const localHeaders = { ...extraHTTPHeaders }; localHeaders.Accept ||= '...';`.
241
+
218
242
  ## Testing Considerations
219
243
 
220
244
  When making changes, validate these areas which have well-established edge cases:
@@ -246,6 +270,16 @@ When making changes, validate these areas which have well-established edge cases
246
270
  - `document.title` must be captured at the START of `runAxeScript()`, before axe scanning or screenshot capture. Pages can close during these operations (timeout, navigation, crash). Never create a new page just to re-navigate for the title — this leaks pages.
247
271
  - The URL guard script in custom flow must be defensive against pages that close unexpectedly. All page event handlers should handle closed contexts gracefully.
248
272
 
273
+ ### URL Guard & Overlay Management in Custom Flow
274
+
275
+ `src/crawlers/guards/urlGuard.ts` — attached via `addUrlGuardScript()` in `runCustom.ts`:
276
+
277
+ - **`restoreToSafeUrl` must validate the safe URL before calling `page.goto()`**. If the entry URL is `file://` (e.g. `-u '/path/to/report.html'`), `fallbackUrl` is also `file://`. Redirecting to it fires another `framenavigated` for `file://`, which re-triggers `restoreToSafeUrl` → infinite reload loop. Always check `ALLOWED_PROTOCOLS.has(safeObj.protocol)` before navigating; if the fallback is not http/https, return without redirecting.
278
+
279
+ - **`about:` protocol must be skipped in `framenavigated`**. Chromium fires `framenavigated` for `about:blank` as a transient intermediate state during every `page.goto()` call. Intercepting it and calling `restoreToSafeUrl` → `page.goto(safeUrl)` → `about:blank` → `restoreToSafeUrl` → … creates a second infinite loop. Always `return` early when `urlObj.protocol === 'about:'`.
280
+
281
+ - **`reconcileOverlayMenu` must not remove the overlay on macOS/Windows**. On `darwin`/`win32` the custom flow runs headful. When `isOverlayAllowed` returns `false` (e.g. transient `file://` or `about:blank` URL), do **not** call `removeOverlayMenu` — the URL guard will redirect back to the safe URL momentarily. Instead, fall through to the `hasOverlay` / `addOverlayMenu` block so the overlay is (re-)injected regardless of the current URL protocol. On Linux/Docker (headless) the removal behaviour is unchanged.
282
+
249
283
  ### Proxy & Network
250
284
  - Proxy detection must handle `ALL_PROXY` on Windows. The proxy resolution logic should be tested on all platforms.
251
285
 
package/README.md CHANGED
@@ -92,6 +92,10 @@ verapdf --version
92
92
  | WARN_LEVEL | Only used in tests. | |
93
93
  | OOBEE_DISABLE_BROWSER_DOWNLOAD | Experimental flag to disable file downloads on Chrome/Chromium/Edge. Does not affect Local File scan | |
94
94
  | OOBEE_SLOWMO | Experimental flag to slow down web browser behaviour by specified duration (in milliseconds) | |
95
+ | OOBEE_TAGGED_WEBSITE | Tag to identify the website in telemetry. Can also be set via `-z, --websiteTag` CLI flag (CLI flag takes precedence). | |
96
+ | OOBEE_SCAN_METADATA | Overrides the `entryUrl` tag sent to telemetry. | |
97
+ | OOBEE_SCAN_PRODUCT | Adds a `scanProduct` tag to telemetry events. | |
98
+ | OOBEE_CONSECUTIVE_MAX_RETRIES | Max consecutive HTTP failures before the circuit breaker aborts the crawl. | `100` |
95
99
  | HTTP_PROXY | URL of the proxy server to be used for HTTP requests (e.g. `http://proxy.example.com:8080`). | |
96
100
  | HTTPS_PROXY | URL of the proxy server to be used for HTTPS requests (e.g. `https://proxy.example.com:8080`). | |
97
101
  | ALL_PROXY | URL of the proxy server to be used for all requests, typically used for SOCKS5 proxies (e.g. `socks5://proxy.example.com:1080`). Note: IPv6 direct connections may still continue even though socks5 proxy is specified due to a known issue with Chrome/Chromium. (Recommended workaround is to turn off IPv6 at host-level). | |
@@ -413,6 +417,21 @@ Examples:
413
417
  > [ -d <device> | -w <viewport_width> ]
414
418
 
415
419
  ```
420
+
421
+ ### Basic Auth
422
+
423
+ For sites behind HTTP Basic Authentication, you can provide credentials in two ways:
424
+
425
+ 1. **Embed in URL**: `npm run cli -- -u 'https://user:password@example.com' -c 3`
426
+ 2. **Use `-m` flag**: `npm run cli -- -u 'https://example.com' -c 3 -m "Authorization Basic dXNlcjpwYXNzd29yZA=="`
427
+
428
+ Both methods work across all scan types (sitemap, website, custom flow). For multiple headers, separate with `, `:
429
+ ```
430
+ -m "Authorization Basic dXNlcjpwYXNz, X-Custom-Header myvalue"
431
+ ```
432
+
433
+ > **Note:** Authorization headers are only sent to same-origin requests to avoid CORS preflight failures on cross-origin resources (e.g., CDN fonts, analytics scripts).
434
+
416
435
  ### Note on Windows PowerShell:
417
436
  You need to run the command as `npm run cli -- --` (with the extra set of `--`) as PowerShell interprets arguments differently.
418
437
 
package/dist/cli.js CHANGED
@@ -199,9 +199,10 @@ const scanInit = async (argvs) => {
199
199
  if (res.httpStatus)
200
200
  consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
201
201
  if (res.status === statuses.success.code) {
202
- // Custom flow should continue from the user-provided entry URL so auth redirects
203
- // do not replace the original domain used for overlay gating and navigation.
202
+ // Keep browser-resolved URL as entryUrl for downstream scan metadata/events
203
+ // on non-custom scans.
204
204
  if (data.type !== ScannerTypes.CUSTOM) {
205
+ data.entryUrl = res.url;
205
206
  data.url = res.url;
206
207
  }
207
208
  if (process.env.OOBEE_VALIDATE_URL) {
package/dist/combine.js CHANGED
@@ -23,7 +23,7 @@ export class ViewportSettingsClass {
23
23
  }
24
24
  const combineRun = async (details, deviceToScan) => {
25
25
  const envDetails = { ...details };
26
- const { type, url, nameEmail, randomToken, deviceChosen, customDevice, viewportWidth, playwrightDeviceDetailsObject, maxRequestsPerCrawl, browser, userDataDirectory, strategy, // Allow subdomains: if checked, = 'same-domain'
26
+ const { type, url, entryUrl, nameEmail, randomToken, deviceChosen, customDevice, viewportWidth, playwrightDeviceDetailsObject, maxRequestsPerCrawl, browser, userDataDirectory, strategy, // Allow subdomains: if checked, = 'same-domain'
27
27
  specifiedMaxConcurrency, // Slow scan mode: if checked, = '1'
28
28
  fileTypes, blacklistedPatternsFilename, includeScreenshots, // Include screenshots: if checked, = 'true'
29
29
  followRobots, // Adhere to robots.txt: if checked, = 'true'
@@ -59,8 +59,8 @@ const combineRun = async (details, deviceToScan) => {
59
59
  }
60
60
  // remove basic-auth credentials from URL
61
61
  const finalUrl = !(type === ScannerTypes.SITEMAP || type === ScannerTypes.LOCALFILE)
62
- ? new URL(url)
63
- : new URL(pathToFileURL(url));
62
+ ? new URL(entryUrl)
63
+ : new URL(pathToFileURL(entryUrl));
64
64
  // Use the string version of finalUrl to reduce logic at submitForm
65
65
  const finalUrlString = finalUrl.toString();
66
66
  const scanDetails = {
@@ -89,7 +89,7 @@ const combineRun = async (details, deviceToScan) => {
89
89
  let durationExceeded = false;
90
90
  switch (type) {
91
91
  case ScannerTypes.CUSTOM:
92
- const res = await runCustom(url, randomToken, browser, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '');
92
+ const res = await runCustom(url, randomToken, browser, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '', extraHTTPHeaders);
93
93
  urlsCrawledObj = res.urlsCrawled;
94
94
  uiCustomFlowLabel = res.customFlowLabel;
95
95
  break;
@@ -292,17 +292,30 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
292
292
  return res;
293
293
  }
294
294
  }
295
- // Ensure Accept header for non-html content fallback
296
- extraHTTPHeaders.Accept ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
295
+ // Ensure Accept header for non-html content fallback — use a local copy to avoid
296
+ // mutating the caller's extraHTTPHeaders object (which is later checked by crawlers
297
+ // to decide whether to enable preNavigationHooks header rewriting).
298
+ const localHeaders = { ...extraHTTPHeaders };
299
+ localHeaders.Accept ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
297
300
  await initModifiedUserAgent(browserToRun, playwrightDeviceDetailsObject, clonedDataDir);
298
301
  let browserContext;
299
302
  let browserInstance;
300
303
  const rawDevice = (playwrightDeviceDetailsObject || {});
301
304
  const { viewport, isMobile, hasTouch, userAgent: deviceUserAgent, ...restDevice } = rawDevice;
302
305
  const launchOptions = getPlaywrightLaunchOptions(browserToRun);
306
+ const { Authorization, ...nonAuthHeaders } = localHeaders || {};
307
+ let httpCredentials = undefined;
308
+ if (Authorization?.startsWith('Basic ')) {
309
+ const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
310
+ const colonIdx = decoded.indexOf(':');
311
+ if (colonIdx > 0) {
312
+ httpCredentials = { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
313
+ }
314
+ }
303
315
  const contextOptions = {
304
316
  ...restDevice,
305
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
317
+ ...(Object.keys(nonAuthHeaders).length > 0 && { extraHTTPHeaders: nonAuthHeaders }),
318
+ ...(httpCredentials && { httpCredentials }),
306
319
  ignoreHTTPSErrors: true,
307
320
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
308
321
  };
@@ -342,6 +355,27 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
342
355
  return res;
343
356
  }
344
357
  try {
358
+ // Only enable generic Authorization header routing interception broadly if
359
+ // a non-Basic Bearer auth string is heavily relied upon, thereby bypassing
360
+ // performance warnings inside the checkUrl phase for typical public scans
361
+ if (Object.keys(localHeaders).length > 0) {
362
+ if (Authorization && !httpCredentials) {
363
+ const entryOrigin = new URL(url).origin;
364
+ await browserContext.route('**/*', async (route, request) => {
365
+ try {
366
+ if (new URL(request.url()).origin === entryOrigin) {
367
+ await route.continue({ headers: { ...request.headers(), Authorization } });
368
+ }
369
+ else {
370
+ await route.continue();
371
+ }
372
+ }
373
+ catch {
374
+ await route.continue();
375
+ }
376
+ });
377
+ }
378
+ }
345
379
  const page = await browserContext.newPage();
346
380
  // Block native Chrome download UI
347
381
  try {
@@ -351,15 +385,6 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
351
385
  catch (e) {
352
386
  consoleLogger.info(`Unable to set download deny: ${e.message}`);
353
387
  }
354
- // OPTIMIZATION: Block heavy visual resources (Images/Fonts/CSS)
355
- // This allows the "Connectivity Check" to pass as soon as HTML is ready
356
- await page.route('**/*', (route) => {
357
- const type = route.request().resourceType();
358
- if (['image', 'media', 'font', 'stylesheet'].includes(type)) {
359
- return route.abort();
360
- }
361
- return route.continue();
362
- });
363
388
  // STEP 2: Navigate (follows server-side redirects)
364
389
  page.once('download', () => {
365
390
  res.status = constants.urlCheckStatuses.notASupportedDocument.code;
@@ -471,7 +496,7 @@ export const isSitemapContent = (content) => {
471
496
  return true;
472
497
  }
473
498
  const regexForHtml = new RegExp('<(?:!doctype html|html|head|body)+?>', 'gmi');
474
- const regexForXmlSitemap = new RegExp('<(?:urlset|feed|rss)+?.*>', 'gmi');
499
+ const regexForXmlSitemap = new RegExp('<(?:urlset|sitemapindex|feed|rss)+?.*>', 'gmi');
475
500
  if (content.match(regexForHtml) && content.match(regexForXmlSitemap)) {
476
501
  // is an XML sitemap wrapped in a HTML document
477
502
  return true;
@@ -485,7 +510,18 @@ export const isSitemapContent = (content) => {
485
510
  return false;
486
511
  };
487
512
  export const checkUrl = async (scanner, url, browser, clonedDataDir, playwrightDeviceDetailsObject, extraHTTPHeaders, fileTypes) => {
488
- const res = await checkUrlConnectivityWithBrowser(url, browser, clonedDataDir, playwrightDeviceDetailsObject, extraHTTPHeaders);
513
+ let urlToCheck = url;
514
+ if (scanner === ScannerTypes.LOCALFILE) {
515
+ if (!isFilePath(url)) {
516
+ const res = new RES();
517
+ res.status = constants.urlCheckStatuses.notALocalFile.code;
518
+ return res;
519
+ }
520
+ if (!url.toLowerCase().startsWith('file://')) {
521
+ urlToCheck = pathToFileURL(path.resolve(url)).toString();
522
+ }
523
+ }
524
+ const res = await checkUrlConnectivityWithBrowser(urlToCheck, browser, clonedDataDir, playwrightDeviceDetailsObject, extraHTTPHeaders);
489
525
  // If response is 200 (meaning no other code was set earlier)
490
526
  if (res.status === constants.urlCheckStatuses.success.code) {
491
527
  // Check if document is pdf type
@@ -532,7 +568,7 @@ export const prepareData = async (argv) => {
532
568
  if (isEmptyObject(argv)) {
533
569
  throw Error('No inputs should be provided');
534
570
  }
535
- let { scanner, headless, url, deviceChosen, customDevice, viewportWidth, maxpages, strategy, isLocalFileScan = argv.scanner === ScannerTypes.LOCALFILE, browserToRun, nameEmail, customFlowLabel, specifiedMaxConcurrency, fileTypes, blacklistedPatternsFilename, additional, metadata, followRobots, header, safeMode, exportDirectory, zip, ruleset, generateJsonFiles, scanDuration, } = argv;
571
+ let { scanner, headless, url, deviceChosen, customDevice, viewportWidth, maxpages, strategy, isLocalFileScan = argv.scanner === ScannerTypes.LOCALFILE, browserToRun, nameEmail, customFlowLabel, specifiedMaxConcurrency, fileTypes, blacklistedPatternsFilename, additional, metadata, followRobots, header, safeMode, exportDirectory, zip, ruleset, generateJsonFiles, scanDuration, finalUrl, } = argv;
536
572
  const extraHTTPHeaders = parseHeaders(header);
537
573
  // Set default username and password for basic auth
538
574
  let username = '';
@@ -558,6 +594,9 @@ export const prepareData = async (argv) => {
558
594
  temp.password = '';
559
595
  url = temp.toString();
560
596
  }
597
+ // Keep browser-resolved URL (if provided by pre-check flow) as canonical entry URL.
598
+ // For local file paths, keep using the normalized `url` value below.
599
+ const resolvedEntryUrl = finalUrl && !isFilePath(finalUrl) ? finalUrl : url;
561
600
  // construct filename for scan results
562
601
  const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
563
602
  const domain = isLocalFileScan ? path.basename(url) : new URL(url).hostname;
@@ -585,7 +624,7 @@ export const prepareData = async (argv) => {
585
624
  return {
586
625
  type: scanner,
587
626
  url,
588
- entryUrl: url,
627
+ entryUrl: resolvedEntryUrl,
589
628
  isHeadless: headless,
590
629
  deviceChosen,
591
630
  customDevice,
@@ -790,6 +829,7 @@ export const getLinksFromSitemap = async (sitemapUrl, _maxLinksCount, browser, u
790
829
  const scannedSitemaps = new Set();
791
830
  const sitemapLinkCounts = {};
792
831
  const allUrls = new Set(); // all discovered URLs (lightweight strings)
832
+ const isImageSitemapUrl = (candidateUrl) => /(^|\/)image-sitemap(?:-index)?(?:-\d+)?\.xml(?:$|[?#])/i.test(candidateUrl);
793
833
  const addToUrlList = (url) => {
794
834
  if (!url)
795
835
  return;
@@ -860,6 +900,10 @@ export const getLinksFromSitemap = async (sitemapUrl, _maxLinksCount, browser, u
860
900
  const fetchUrls = async (url, extraHTTPHeaders) => {
861
901
  let data;
862
902
  let sitemapType;
903
+ if (isImageSitemapUrl(url)) {
904
+ consoleLogger.info(`Skipping image sitemap: ${url}`);
905
+ return;
906
+ }
863
907
  if (scannedSitemaps.has(url)) {
864
908
  // Skip processing if the sitemap has already been scanned
865
909
  return;
@@ -906,27 +950,45 @@ export const getLinksFromSitemap = async (sitemapUrl, _maxLinksCount, browser, u
906
950
  });
907
951
  }
908
952
  const page = await browserContext.newPage();
909
- await page.goto(url, { waitUntil: 'networkidle', timeout: 60000 });
910
- if ((await page.locator('body').count()) > 0) {
911
- data = await page.locator('body').innerText();
912
- }
913
- else {
914
- const urlSet = page.locator('urlset');
915
- const sitemapIndex = page.locator('sitemapindex');
916
- const rss = page.locator('rss');
917
- const feed = page.locator('feed');
918
- const isRoot = async (locator) => (await locator.count()) > 0;
919
- if (await isRoot(urlSet)) {
920
- data = await urlSet.evaluate(elem => elem.outerHTML);
953
+ // Use 'domcontentloaded' instead of 'networkidle': sitemap XMLs with
954
+ // XSL stylesheet references (e.g. <?xml-stylesheet ...?>) cause the browser
955
+ // to fetch and apply the stylesheet, which may load additional resources
956
+ // (fonts, CSS, images) that prevent 'networkidle' from ever being reached.
957
+ const response = await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
958
+ // Prefer the raw response body — this gives us the original XML before
959
+ // the browser applies any XSL transformation (which would turn the XML
960
+ // into rendered HTML, losing the sitemap structure).
961
+ if (response) {
962
+ try {
963
+ data = await response.text();
921
964
  }
922
- else if (await isRoot(sitemapIndex)) {
923
- data = await sitemapIndex.evaluate(elem => elem.outerHTML);
965
+ catch {
966
+ // response.text() can fail if the body was already consumed or
967
+ // if a redirect occurred; fall through to DOM extraction below.
924
968
  }
925
- else if (await isRoot(rss)) {
926
- data = await rss.evaluate(elem => elem.outerHTML);
969
+ }
970
+ if (!data) {
971
+ if ((await page.locator('body').count()) > 0) {
972
+ data = await page.locator('body').innerText();
927
973
  }
928
- else if (await isRoot(feed)) {
929
- data = await feed.evaluate(elem => elem.outerHTML);
974
+ else {
975
+ const urlSet = page.locator('urlset');
976
+ const sitemapIndex = page.locator('sitemapindex');
977
+ const rss = page.locator('rss');
978
+ const feed = page.locator('feed');
979
+ const isRoot = async (locator) => (await locator.count()) > 0;
980
+ if (await isRoot(urlSet)) {
981
+ data = await urlSet.evaluate(elem => elem.outerHTML);
982
+ }
983
+ else if (await isRoot(sitemapIndex)) {
984
+ data = await sitemapIndex.evaluate(elem => elem.outerHTML);
985
+ }
986
+ else if (await isRoot(rss)) {
987
+ data = await rss.evaluate(elem => elem.outerHTML);
988
+ }
989
+ else if (await isRoot(feed)) {
990
+ data = await feed.evaluate(elem => elem.outerHTML);
991
+ }
930
992
  }
931
993
  }
932
994
  }
@@ -949,37 +1011,61 @@ export const getLinksFromSitemap = async (sitemapUrl, _maxLinksCount, browser, u
949
1011
  data = fs.readFileSync(url, 'utf8');
950
1012
  }
951
1013
  const $ = cheerio.load(data, { xml: true });
1014
+ const countBefore = allUrls.size;
952
1015
  // This case is when the document is not an XML format document
953
1016
  if ($(':root').length === 0) {
954
1017
  processNonStandardSitemap(data);
1018
+ const linksFromThisSitemap = allUrls.size - countBefore;
1019
+ if (linksFromThisSitemap > 0) {
1020
+ sitemapLinkCounts[url] = (sitemapLinkCounts[url] || 0) + linksFromThisSitemap;
1021
+ }
955
1022
  return;
956
1023
  }
957
1024
  // Root element
958
1025
  const root = $(':root')[0];
959
- const { xmlns } = root.attribs;
960
- const xmlFormatNamespace = '/schemas/sitemap';
961
- if (root.name === 'urlset' && xmlns.includes(xmlFormatNamespace)) {
1026
+ const hasImageNamespace = Object.values(root?.attribs ?? {}).some(attribVal => typeof attribVal === 'string' && attribVal.toLowerCase().includes('sitemap-image'));
1027
+ if (hasImageNamespace) {
1028
+ consoleLogger.info(`Skipping image sitemap: ${url}`);
1029
+ return;
1030
+ }
1031
+ const rootName = root?.name?.toLowerCase().split(':').pop() ?? '';
1032
+ const hasXmlSitemapIndexTag = /<\s*(?:[a-z0-9_-]+:)?sitemapindex\b/i.test(data);
1033
+ const hasXmlUrlsetTag = /<\s*(?:[a-z0-9_-]+:)?urlset\b/i.test(data);
1034
+ if (rootName === 'urlset') {
962
1035
  sitemapType = constants.xmlSitemapTypes.xml;
963
1036
  }
964
- else if (root.name === 'sitemapindex' && xmlns.includes(xmlFormatNamespace)) {
1037
+ else if (rootName === 'sitemapindex') {
965
1038
  sitemapType = constants.xmlSitemapTypes.xmlIndex;
966
1039
  }
967
- else if (root.name === 'rss') {
1040
+ else if (rootName === 'rss') {
968
1041
  sitemapType = constants.xmlSitemapTypes.rss;
969
1042
  }
970
- else if (root.name === 'feed') {
1043
+ else if (rootName === 'feed') {
971
1044
  sitemapType = constants.xmlSitemapTypes.atom;
972
1045
  }
1046
+ else if (hasXmlSitemapIndexTag) {
1047
+ sitemapType = constants.xmlSitemapTypes.xmlIndex;
1048
+ }
1049
+ else if (hasXmlUrlsetTag) {
1050
+ sitemapType = constants.xmlSitemapTypes.xml;
1051
+ }
973
1052
  else {
974
1053
  sitemapType = constants.xmlSitemapTypes.unknown;
975
1054
  }
976
- const countBefore = allUrls.size;
977
1055
  switch (sitemapType) {
978
1056
  case constants.xmlSitemapTypes.xmlIndex:
979
- consoleLogger.info(`This is a XML format sitemap index.`);
1057
+ consoleLogger.info(`This is a XML format sitemap index: ${url}`);
980
1058
  for (const childSitemapUrl of $('loc')) {
981
- const childSitemapUrlText = $(childSitemapUrl).text();
982
- if (childSitemapUrlText.endsWith('.xml') || childSitemapUrlText.endsWith('.txt')) {
1059
+ const childSitemapUrlText = $(childSitemapUrl).text().trim();
1060
+ if (!childSitemapUrlText) {
1061
+ continue;
1062
+ }
1063
+ const childSitemapPath = childSitemapUrlText.split(/[?#]/)[0].toLowerCase();
1064
+ if (childSitemapPath.endsWith('.xml') || childSitemapPath.endsWith('.txt')) {
1065
+ if (isImageSitemapUrl(childSitemapUrlText)) {
1066
+ consoleLogger.info(`Skipping image sitemap: ${childSitemapUrlText}`);
1067
+ continue;
1068
+ }
983
1069
  await fetchUrls(childSitemapUrlText, extraHTTPHeaders); // Recursive call for nested sitemaps
984
1070
  }
985
1071
  else {
@@ -988,19 +1074,19 @@ export const getLinksFromSitemap = async (sitemapUrl, _maxLinksCount, browser, u
988
1074
  }
989
1075
  break;
990
1076
  case constants.xmlSitemapTypes.xml:
991
- consoleLogger.info(`This is a XML format sitemap.`);
1077
+ consoleLogger.info(`This is a XML format sitemap: ${url}`);
992
1078
  await processXmlSitemap($, sitemapType, 'loc', 'lastmod', 'url');
993
1079
  break;
994
1080
  case constants.xmlSitemapTypes.rss:
995
- consoleLogger.info(`This is a RSS format sitemap.`);
1081
+ consoleLogger.info(`This is a RSS format sitemap: ${url}`);
996
1082
  await processXmlSitemap($, sitemapType, 'link', 'pubDate', 'item');
997
1083
  break;
998
1084
  case constants.xmlSitemapTypes.atom:
999
- consoleLogger.info(`This is a Atom format sitemap.`);
1085
+ consoleLogger.info(`This is a Atom format sitemap: ${url}`);
1000
1086
  await processXmlSitemap($, sitemapType, 'link', 'published', 'entry');
1001
1087
  break;
1002
1088
  default:
1003
- consoleLogger.info(`This is an unrecognised XML sitemap format.`);
1089
+ consoleLogger.info(`This is an unrecognised XML sitemap format: ${url}`);
1004
1090
  processNonStandardSitemap(data);
1005
1091
  }
1006
1092
  const linksFromThisSitemap = allUrls.size - countBefore;
@@ -1816,7 +1902,8 @@ function isValidHttpUrl(urlString) {
1816
1902
  export const isFilePath = (url) => {
1817
1903
  const driveLetterPattern = /^[A-Z]:/i;
1818
1904
  const backslashPattern = /\\/;
1819
- return (url.startsWith('/') ||
1905
+ return (url.toLowerCase().startsWith('file://') ||
1906
+ url.startsWith('/') ||
1820
1907
  driveLetterPattern.test(url) ||
1821
1908
  backslashPattern.test(url) ||
1822
1909
  url.startsWith('./') ||
@@ -898,13 +898,65 @@ export const createCrawleeSubFolders = async (randomToken) => {
898
898
/**
 * Builds the list of Crawlee pre-navigation hooks shared by the crawlers.
 *
 * The single hook returned:
 *  - merges `extraHTTPHeaders` (when non-empty) into the headers of the
 *    request about to be navigated, and
 *  - forces the navigation to wait for `domcontentloaded` with a 30 s timeout.
 *
 * @param {Record<string, string> | null | undefined} extraHTTPHeaders
 *   Extra headers to attach to every navigation request; ignored when
 *   missing or empty.
 * @returns {Array<(crawlingContext: object, gotoOptions?: object) => Promise<void>>}
 *   Hooks suitable for a crawler's `preNavigationHooks` option.
 */
export const preNavigationHooks = (extraHTTPHeaders) => {
    const hasExtraHeaders = Boolean(extraHTTPHeaders) && Object.keys(extraHTTPHeaders).length > 0;
    return [
        async (crawlingContext, gotoOptions) => {
            if (hasExtraHeaders) {
                // Build a fresh object per request: assigning the shared
                // extraHTTPHeaders object by reference would let a mutation on
                // one request leak into every other request, and would drop
                // any headers already attached to this request.
                crawlingContext.request.headers = {
                    ...crawlingContext.request.headers,
                    ...extraHTTPHeaders,
                };
            }
            // 'domcontentloaded' fires as soon as the DOM is parsed, before
            // images/stylesheets/network requests settle. Using it instead of
            // 'networkidle' avoids indefinite hangs on sites with WebSockets,
            // analytics polling, or infinite-scroll beacons that never reach
            // network idle. Further page stability is handled by
            // waitForPageLoaded() in each crawler's requestHandler and by the
            // DOM mutation observer in postNavigationHooks.
            if (gotoOptions) {
                // Mutate in place: reassigning the parameter would not be
                // visible to the caller that performs the navigation.
                gotoOptions.waitUntil = 'domcontentloaded';
                gotoOptions.timeout = 30000;
            }
        },
    ];
};
917
/**
 * Splits extraHTTPHeaders into auth and non-auth parts.
 * Auth headers (Authorization) must only be sent to same-origin requests to avoid CORS preflight failures.
 * Non-auth headers are safe to set globally on the browser context.
 *
 * The Authorization header is matched case-insensitively (HTTP header names
 * are case-insensitive); when several spellings are present the exact
 * `Authorization` key takes precedence.
 *
 * @param {Record<string, string> | null | undefined} extraHTTPHeaders
 * @returns {{
 *   authHeader: string | null,
 *   nonAuthHeaders: Record<string, string> | null,
 *   httpCredentials: { username: string, password: string } | null
 * }}
 *   `authHeader` is the Authorization value (or null), `nonAuthHeaders` is
 *   every other header (or null when there are none), and `httpCredentials`
 *   is the decoded username/password when the scheme is Basic (else null).
 */
export const splitAuthHeaders = (extraHTTPHeaders) => {
    let authHeader = null;
    const nonAuthHeaders = {};
    for (const [name, value] of Object.entries(extraHTTPHeaders || {})) {
        if (name.toLowerCase() !== 'authorization') {
            nonAuthHeaders[name] = value;
            continue;
        }
        // Never let any spelling of the auth header leak into the global set.
        if (value && (name === 'Authorization' || !authHeader)) {
            authHeader = value;
        }
    }
    return {
        authHeader,
        nonAuthHeaders: Object.keys(nonAuthHeaders).length > 0 ? nonAuthHeaders : null,
        httpCredentials: decodeBasicCredentials(authHeader),
    };
};
/**
 * Decodes a `Basic <base64(user:pass)>` Authorization value.
 * Returns null for non-string values, other schemes, or payloads without a
 * non-empty username followed by a colon.
 */
const decodeBasicCredentials = (authHeader) => {
    if (typeof authHeader !== 'string') {
        return null;
    }
    // Auth scheme names are case-insensitive, so accept "basic" as well.
    const match = /^basic\s+(.*)$/i.exec(authHeader);
    if (!match) {
        return null;
    }
    const decoded = Buffer.from(match[1], 'base64').toString();
    // Split on the FIRST colon only: the password itself may contain colons.
    const colonIdx = decoded.indexOf(':');
    if (colonIdx <= 0) {
        return null;
    }
    return { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
};
938
/**
 * Adds a route handler to a BrowserContext that sends the Authorization header
 * only to same-origin requests, preventing CORS preflight failures on cross-origin CDN resources.
 *
 * Every intercepted request is continued: same-origin requests (relative to
 * `entryUrl`) get `Authorization: authHeader` added, all others are passed
 * through untouched. Does nothing when `authHeader` is falsy.
 *
 * @param {object} context - Object exposing `route(pattern, handler)` (a Playwright BrowserContext).
 * @param {string} entryUrl - Absolute URL whose origin defines "same-origin".
 * @param {string | null | undefined} authHeader - Value for the Authorization header.
 * @returns {Promise<void>}
 * @throws {TypeError} If `entryUrl` is not a valid absolute URL.
 */
export const addAuthRouteHandler = async (context, entryUrl, authHeader) => {
    if (!authHeader) {
        return;
    }
    const entryOrigin = new URL(entryUrl).origin;
    await context.route('**/*', async (route, request) => {
        let overrides = null;
        try {
            if (new URL(request.url()).origin === entryOrigin) {
                // request.headers() reports lower-cased names; strip any
                // existing authorization entry so only one value is sent.
                const headers = Object.fromEntries(Object.entries(request.headers()).filter(([name]) => name.toLowerCase() !== 'authorization'));
                overrides = { headers: { ...headers, Authorization: authHeader } };
            }
        }
        catch {
            // Unparseable request URL: treat as cross-origin and pass through.
            overrides = null;
        }
        try {
            if (overrides) {
                await route.continue(overrides);
            }
            else {
                await route.continue();
            }
            return;
        }
        catch {
            if (!overrides) {
                // A plain continue failed (route already handled or the
                // page/context closed); retrying would only reject again.
                return;
            }
        }
        try {
            // Continuing with the injected header failed; fall back to an
            // unmodified continue so the request is not left hanging.
            await route.continue();
        }
        catch {
            // Route already handled or target closed: nothing left to do.
        }
    });
};
908
960
  export const postNavigationHooks = [
909
961
  async (_crawlingContext) => {
910
962
  guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
@@ -1,6 +1,6 @@
1
1
  import crawlee from 'crawlee';
2
2
  import { CrawlRateController } from './crawlRateController.js';
3
- import { createCrawleeSubFolders, getPreLaunchHook, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, } from './commonCrawlerFunc.js';
3
+ import { createCrawleeSubFolders, getPreLaunchHook, preNavigationHooks, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, splitAuthHeaders, } from './commonCrawlerFunc.js';
4
4
  import constants, { blackListedFileExtensions, guiInfoStatusTypes, cssQuerySelectors, STATUS_CODE_METADATA, disallowedListOfPatterns, disallowedSelectorPatterns, FileTypes, } from '../constants/constants.js';
5
5
  import { getPlaywrightLaunchOptions, isBlacklistedFileExtensions, isSkippedUrl, isDisallowedInRobotsTxt, getUrlsFromRobotsTxt, waitForPageLoaded, } from '../constants/common.js';
6
6
  import { areLinksEqual, isFollowStrategy, isSameHostname, normUrl, register } from '../utils.js';
@@ -275,6 +275,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
275
275
  };
276
276
  let isAbortingScanNow = false;
277
277
  const rateController = new CrawlRateController(maxRequestsPerCrawl, specifiedMaxConcurrency || constants.maxConcurrency);
278
+ const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
278
279
  const crawler = register(new crawlee.PlaywrightCrawler({
279
280
  launchContext: {
280
281
  launcher: constants.launcher,
@@ -293,12 +294,18 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
293
294
  ...playwrightDeviceDetailsObject,
294
295
  ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
295
296
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
296
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
297
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
298
+ ...(httpCredentials && { httpCredentials }),
297
299
  };
298
300
  },
299
301
  ],
300
302
  },
301
303
  requestQueue,
304
+ maxRequestRetries: 3,
305
+ maxSessionRotations: 1,
306
+ preNavigationHooks: [
307
+ ...preNavigationHooks(extraHTTPHeaders),
308
+ ],
302
309
  postNavigationHooks: [
303
310
  async (crawlingContext) => {
304
311
  const { page, request } = crawlingContext;
@@ -1,4 +1,4 @@
1
- import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
1
+ import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
2
2
  import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
3
3
  import { consoleLogger, guiInfoLog } from '../logs.js';
4
4
  import crawlDomain from './crawlDomain.js';
@@ -26,26 +26,31 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
26
26
  const homeUrl = getHomeUrl(link);
27
27
  let sitemapLink = '';
28
28
  const launchOptions = getPlaywrightLaunchOptions(browser);
29
+ const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
29
30
  let context;
30
31
  let browserInstance;
31
32
  if (process.env.CRAWLEE_HEADLESS === '1') {
32
33
  const effectiveUserDataDirectory = userDataDirectory || '';
33
34
  context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
34
35
  ...launchOptions,
35
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
36
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
37
+ ...(httpCredentials && { httpCredentials }),
36
38
  ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
37
39
  });
38
40
  register(context);
39
41
  }
40
42
  else {
41
- // In headful mode, avoid launchPersistentContext to prevent "Browser window not found"
42
43
  browserInstance = await constants.launcher.launch(launchOptions);
43
44
  register(browserInstance);
44
45
  context = await browserInstance.newContext({
45
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
46
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
47
+ ...(httpCredentials && { httpCredentials }),
46
48
  ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
47
49
  });
48
50
  }
51
+ if (authHeader) {
52
+ await addAuthRouteHandler(context, link, authHeader);
53
+ }
49
54
  const page = await context.newPage();
50
55
  for (const path of sitemapPaths) {
51
56
  sitemapLink = homeUrl + path;