@govtechsg/oobee 0.10.91 → 0.10.93

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/AGENTS.md +303 -0
  2. package/README.md +22 -0
  3. package/dist/cli.js +3 -0
  4. package/dist/combine.js +15 -3
  5. package/dist/constants/cliFunctions.js +7 -0
  6. package/dist/constants/common.js +149 -80
  7. package/dist/constants/constants.js +1 -0
  8. package/dist/crawlers/commonCrawlerFunc.js +136 -15
  9. package/dist/crawlers/crawlDomain.js +55 -58
  10. package/dist/crawlers/crawlIntelligentSitemap.js +21 -11
  11. package/dist/crawlers/crawlRateController.js +47 -0
  12. package/dist/crawlers/crawlSitemap.js +51 -62
  13. package/dist/crawlers/runCustom.js +8 -2
  14. package/dist/generateOobeeClientScanner.js +32 -1
  15. package/dist/mergeAxeResults/itemsStore.js +32 -3
  16. package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
  17. package/dist/mergeAxeResults.js +120 -92
  18. package/dist/npmIndex.js +1 -0
  19. package/dist/utils.js +23 -28
  20. package/oobee-client-scanner.js +35 -4
  21. package/package.json +3 -3
  22. package/src/cli.ts +4 -0
  23. package/src/combine.ts +16 -1
  24. package/src/constants/cliFunctions.ts +7 -0
  25. package/src/constants/common.ts +162 -90
  26. package/src/constants/constants.ts +1 -0
  27. package/src/crawlers/commonCrawlerFunc.ts +148 -14
  28. package/src/crawlers/crawlDomain.ts +64 -66
  29. package/src/crawlers/crawlIntelligentSitemap.ts +23 -11
  30. package/src/crawlers/crawlRateController.ts +63 -0
  31. package/src/crawlers/crawlSitemap.ts +57 -70
  32. package/src/crawlers/runCustom.ts +10 -1
  33. package/src/generateOobeeClientScanner.ts +32 -1
  34. package/src/index.ts +1 -0
  35. package/src/mergeAxeResults/itemsStore.ts +37 -3
  36. package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
  37. package/src/mergeAxeResults.ts +139 -99
  38. package/src/npmIndex.ts +1 -0
  39. package/src/utils.ts +25 -33
  40. /package/{bf04540e-0894-4d00-98ec-c1be74c6f199.txt → 7339fae5-e8ed-4b50-af13-317847620dbf.txt} +0 -0
package/AGENTS.md ADDED
@@ -0,0 +1,303 @@
1
+ # Oobee Developer Guide
2
+
3
+ > **Keep this file up to date.** When you make changes that affect architecture, crawl behavior, environment variables, or testing considerations described here, update the relevant section in the same commit.
4
+
5
+ Oobee is a web accessibility scanner that crawls websites and runs axe-core + custom checks against each page, producing HTML/PDF/CSV/JSON reports.
6
+
7
+ ## Architecture Overview
8
+
9
+ ```
10
+ User Input (CLI / npm API)
11
+
12
+ combine.ts (orchestrator)
13
+ ↓ routes by ScannerTypes
14
+ Crawler (crawlDomain / crawlSitemap / crawlIntelligentSitemap / crawlLocalFile / runCustom)
15
+ ↓ uses Crawlee PlaywrightCrawler
16
+ Page Handler (axe-core injection + custom checks)
17
+ ↓ writes per-page JSON to Crawlee dataset
18
+ generateArtifacts() in mergeAxeResults.ts
19
+ ↓ reads dataset, aggregates, renders templates
20
+ Reports (HTML, PDF, CSV, JSON, sitemap.xml)
21
+ ```
22
+
23
+ ## Entry Points
24
+
25
+ | Entry | File | Purpose |
26
+ |-------|------|---------|
27
+ | CLI | `src/cli.ts` | yargs-based CLI, calls `combineRun()` |
28
+ | Interactive CLI | `src/index.ts` | Inquirer prompts, calls `combineRun()` |
29
+ | npm API | `src/npmIndex.ts` | Programmatic `init()` for external consumers |
30
+ | Orchestrator | `src/combine.ts` | Routes scan type, manages lifecycle, calls `generateArtifacts()` |
31
+
32
+ ## Scanner Types
33
+
34
+ | Type | File | Behavior |
35
+ |------|------|----------|
36
+ | `Website` | `src/crawlers/crawlDomain.ts` | Domain crawl, discovers links from pages |
37
+ | `Sitemap` | `src/crawlers/crawlSitemap.ts` | Fetches URLs from sitemap XML |
38
+ | `Intelligent` | `src/crawlers/crawlIntelligentSitemap.ts` | Discovers sitemap via robots.txt, crawls it, then supplements with domain crawl |
39
+ | `LocalFile` | `src/crawlers/crawlLocalFile.ts` | Scans local HTML/PDF files via file:// |
40
+ | `Custom` | `src/crawlers/runCustom.ts` | User-driven flow (manual navigation in browser) |
41
+
42
+ ## Key Files
43
+
44
+ ### Constants & Configuration
45
+
46
+ - **`src/constants/constants.ts`** — Enums (`ScannerTypes`, `BrowserTypes`, `FileTypes`, `RuleFlags`), browser data dir paths, sitemap path list, WCAG mappings, shared mutable state (`robotsTxtUrls`, `sitemapFetchedLinks`, `userDataDirectory`, `launcher`)
47
+ - **`src/constants/common.ts`** — URL validation (`checkUrl`), browser launch options (`getPlaywrightLaunchOptions`), sitemap parsing (`getLinksFromSitemap`, `getSitemapsFromRobotsTxt`), robots.txt handling, browser selection (`getBrowserToRun`), user-agent initialization (`initModifiedUserAgent`)
48
+
49
+ ### Crawlers
50
+
51
+ All crawlers use Crawlee's `PlaywrightCrawler` with:
52
+ - `maxRequestsPerCrawl: Infinity` (Crawlee's internal limit disabled)
53
+ - Manual stop when `urlsCrawled.scanned.length >= maxRequestsPerCrawl` (counts only successful scans)
54
+ - `retryOnBlocked: true`
55
+ - `useFingerprints: false`
56
+
57
+ ### Report Generation
58
+
59
+ - **`src/mergeAxeResults.ts`** — Main `generateArtifacts()` function, reads Crawlee dataset, builds `allIssues` object, generates all output formats
60
+ - **`src/mergeAxeResults/`** — Sub-modules: `jsonArtifacts.ts` (JSON+base64), `writeCsv.ts`, `writeSitemap.ts`, `scanPages.ts`, `itemsStore.ts`, `types.ts`
61
+ - **`src/static/ejs/`** — EJS templates for HTML report and PDF summary
62
+
63
+ ## Browser Handling
64
+
65
+ ### Selection Priority
66
+
67
+ `getBrowserToRun()` in `common.ts` resolves the browser:
68
+ - If no preference specified: defaults to Chrome on Windows/macOS, Chromium on Linux
69
+ - Fallback chains:
70
+ - **macOS**: Chrome → webkit
71
+ - **Windows**: Chrome → Edge → error
72
+ - **Linux**: Chrome → Chromium (bundled by Playwright)
73
+ - When `chromium` is specified: uses Playwright's bundled Chromium with no channel
74
+
75
+ ### Launch Options
76
+
77
+ `getPlaywrightLaunchOptions()` builds Playwright launch config:
78
+ - Headless mode from `process.env.CRAWLEE_HEADLESS`
79
+ - Docker detection (`/.dockerenv`): adds `--disable-gpu`, `--no-sandbox`, `--disable-dev-shm-usage`
80
+ - Proxy support (manual, PAC, or none) via `getProxyInfo()`
81
+ - Channel set from browser name (undefined for chromium = bundled)
82
+ - `--mute-audio` is added by default in both headless and headful modes, but must be disabled for `customFlow` by calling `getPlaywrightLaunchOptions(browser, { includeMuteAudio: false })`
83
+
84
+ ### User-Agent
85
+
86
+ `initModifiedUserAgent()` detects the default UA, replaces `HeadlessChrome` with `Chrome`, and stores the result in `process.env.OOBEE_USER_AGENT`. This must be called before any browser context that talks to remote servers in headless mode, or bot-blocking WAFs will reject requests.
87
+
88
+ Contexts that need `userAgent: process.env.OOBEE_USER_AGENT`:
89
+ - `getRobotsTxtViaPlaywright()` — robots.txt fetching
90
+ - `findSitemap()` in `crawlIntelligentSitemap.ts` — sitemap path probing
91
+ - `getDataUsingPlaywright()` in `getLinksFromSitemap()` — sitemap XML content fetching
92
+ - `checkUrl()` — main URL validation context (already handled)
93
+ - Crawlee crawler contexts in `crawlDomain`/`crawlSitemap` — UA set via `preLaunchHooks` in `getPreLaunchHook()`
94
+
95
+ ### Headless vs Headful
96
+
97
+ - Docker/Linux: always headless (`CRAWLEE_HEADLESS=1`)
98
+ - macOS CLI: typically headful (`CRAWLEE_HEADLESS=0`) unless user opts in
99
+ - Headful mode uses ephemeral contexts (no `userDataDir`) to avoid "Browser window not found" errors
100
+ - Headless mode uses `launchPersistentContext` with cloned user data directories
101
+
102
+ ## Sitemap Discovery & Fetching
103
+
104
+ The intelligent crawl flow:
105
+ 1. `getSitemapsFromRobotsTxt()` — fetches robots.txt, extracts `Sitemap:` directives
106
+ 2. `findSitemap()` — probes hardcoded paths (`/sitemap.xml`, `/sitemap-index.xml`, etc.)
107
+ 3. `getLinksFromSitemap()` — fetches and parses sitemap XML content, returns `Request[]`
108
+
109
+ Important behaviors:
110
+ - All URLs from the sitemap are discovered and stored as strings in a `Set<string>`
111
+ - All discovered URLs are converted to `Request` objects (no truncation at this stage)
112
+ - The crawler itself enforces `maxRequestsPerCrawl` by counting only successfully scanned pages
113
+ - `constants.sitemapFetchedLinks` stores the total discovered count for `scanData.json` reporting
114
+ - For sitemap indexes, child sitemaps are processed recursively
115
+
116
+ ## Shared Mutable State
117
+
118
+ The `constants` default export object holds runtime state:
119
+ - `constants.launcher` — Playwright browser type (chromium/webkit)
120
+ - `constants.robotsTxtUrls` — Parsed robots.txt disallow/allow rules
121
+ - `constants.sitemapFetchedLinks` — Sitemap fetch diagnostics (reset per scan)
122
+ - `constants.userDataDirectory` — Current browser profile directory
123
+ - `constants.randomToken` — Current scan token
124
+ - `constants.resources` — Active Crawlee crawlers, browser contexts, browsers (for cleanup)
125
+
126
+ ## Environment Variables
127
+
128
+ ### User-Facing
129
+ | Variable | Purpose |
130
+ |----------|---------|
131
+ | `CRAWLEE_HEADLESS` | `1` = headless, `0` = headful (set by `setHeadlessMode()`) |
132
+ | `OOBEE_USER_AGENT` | Modified UA (set by `initModifiedUserAgent()`) |
133
+ | `OOBEE_VERBOSE` | Enable verbose console logging |
134
+ | `OOBEE_LOGS_PATH` | Custom log directory |
135
+ | `OOBEE_SLOWMO` | Browser slowmo in ms |
136
+ | `OOBEE_FAST_CRAWLER` | Experimental high-concurrency mode |
137
+ | `OOBEE_DISABLE_BROWSER_DOWNLOAD` | Block browser file downloads |
138
+ | `OOBEE_TAGGED_WEBSITE` | Tag to identify the website in Sentry telemetry (overridden by `--websiteTag` CLI flag) |
139
+ | `OOBEE_SCAN_METADATA` | Overrides `entryUrl` tag in Sentry events |
140
+ | `OOBEE_SCAN_PRODUCT` | Adds `scanProduct` tag to Sentry events |
141
+ | `OOBEE_CONSECUTIVE_MAX_RETRIES` | Max consecutive HTTP failures before circuit breaker aborts crawl (default 100) |
142
+ | `OOBEE_VALIDATE_URL` | If set, exit after URL validation without scanning |
143
+ | `HTTP_PROXY` / `HTTPS_PROXY` / `ALL_PROXY` | Proxy configuration |
144
+ | `NO_PROXY` / `INCLUDE_PROXY` | Proxy bypass/include lists |
145
+
146
+ ### Internal (set by code)
147
+ | Variable | Purpose |
148
+ |----------|---------|
149
+ | `CRAWLEE_STORAGE_DIR` | Crawlee dataset directory (= randomToken) |
150
+ | `CRAWLEE_LOG_LEVEL` | Set to `ERROR` |
151
+ | `CRAWLEE_SYSTEM_INFO_V2` | `1` (Windows wmic workaround) |
152
+ | `NODE_TLS_REJECT_UNAUTHORIZED` | Set to `0` for self-signed certs |
153
+
154
+ ## Platform Differences
155
+
156
+ ### Docker/Linux
157
+ - `/.dockerenv` detection adds `--no-sandbox`, `--disable-gpu`, `--disable-dev-shm-usage`
158
+ - No system Chrome/Edge — always falls back to Playwright's bundled Chromium
159
+ - `getDefaultChromeDataDir()` returns null (no Chrome profile to clone)
160
+ - `getDefaultChromiumDataDir()` creates `./Chromium Support` or falls back to `/tmp`
161
+ - Always headless in Docker
162
+ - Default UA contains `HeadlessChrome` — must be patched via `initModifiedUserAgent()`
163
+
164
+ ### macOS
165
+ - Defaults to system Chrome if available, falls back to webkit (not Chromium)
166
+ - Browser profiles at `~/Library/Application Support/Google/Chrome`
167
+ - Typically headful (non-headless)
168
+ - Logs at `~/Library/Application Support/Oobee/`
169
+
170
+ ### Windows
171
+ - Defaults to system Chrome, falls back to Edge
172
+ - Browser profiles at `%LOCALAPPDATA%/Google/Chrome/User Data`
173
+ - File locks require longer cleanup delays (5s vs 3s)
174
+ - Path separator differences in cookie profile regex
175
+ - `CRAWLEE_SYSTEM_INFO_V2=1` needed (wmic deprecation)
176
+
177
+ ## Testing
178
+
179
+ ```bash
180
+ npm test # Run Jest tests (uses --experimental-vm-modules)
181
+ npx tsc --noEmit # Type-check without emitting
182
+ npm run build # Compile TypeScript
183
+ ```
184
+
185
+ Test files: `__tests__/` directory and `src/crawlers/__tests__/`
186
+
187
+ ## Build & Run
188
+
189
+ ```bash
190
+ npm install # Install dependencies
191
+ npm run build # Compile TS → dist/
192
+ node dist/cli.js # Run CLI
193
+ ```
194
+
195
+ Docker:
196
+ ```bash
197
+ docker build -t oobee .
198
+ docker run oobee node dist/cli.js ...
199
+ ```
200
+
201
+ ## Common Pitfalls
202
+
203
+ 1. **Bot-blocking in headless mode** — Any new browser context that fetches remote content in headless mode must pass `userAgent: process.env.OOBEE_USER_AGENT`. Without this, sites with WAFs block the request.
204
+
205
+ 2. **`maxRequestsPerCrawl` semantics** — This counts *successfully scanned* pages, not total requests. The sitemap enqueues all discovered URLs; the crawler stops when enough succeed. Errored pages do not consume the budget.
206
+
207
+ 3. **Browser profile isolation** — Each scan clones browser profiles with a `randomToken` suffix. Profiles must be cleaned up after scan (`deleteClonedProfiles()`).
208
+
209
+ - If Chrome/Edge profile cloning fails (for example `EBUSY` while copying locked cookie/state files on Windows), Oobee now falls back to an empty cloned profile directory for that scan. This keeps browser launch stable, but authenticated session cookies may not be available.
210
+ - Crawlee's browser pool retires and re-launches browser instances after ~4 minutes. On Windows, reusing the same `--user-data-dir` causes Chrome exit code 21 (stale lock contention). `getPreLaunchHook()` in `commonCrawlerFunc.ts` assigns unique `_pool{N}` directories for each re-launch and performs a best-effort async clone of the base profile. Cleanup must glob `_pool*` directories alongside the base `oobee-{token}` dir.
211
+
212
+ 4. **`constants.launcher` mutation** — When webkit is the fallback, `constants.launcher` is reassigned globally. This affects all subsequent browser launches in the same process.
213
+
214
+ 5. **Headful vs headless context creation** — Headful mode must NOT use `launchPersistentContext` with custom `userDataDir` (causes "Browser window not found" crash). Use `launch()` + `newContext()` instead.
215
+
216
+ 6. **Sitemap fetch state** — `constants.sitemapFetchedLinks` accumulates across multiple `getLinksFromSitemap` calls. Must be reset to `null` at scan start.
217
+
218
+ 7. **PDF generation** — `writeSummaryPdf()` always runs headless regardless of scan mode. It loads a local `file://` URL so UA/network issues don't apply, but it needs a working browser binary.
219
+
220
+ - On Windows, summary PDF generation now retries with Edge (`msedge`) if the initial Chrome launch fails at runtime.
221
+
222
+ 8. **Crawlee dataset** — Results are stored as numbered JSON files in `{randomToken}/datasets/default/`. Each file is one page's axe results. `generateArtifacts()` reads all of them.
223
+
224
+ 9. **Auth headers and CORS** — Never set `Authorization` in `extraHTTPHeaders` globally on a browser context. Playwright sends `extraHTTPHeaders` to ALL requests (including cross-origin CDNs), which triggers CORS preflight failures. Instead use `splitAuthHeaders()` from `commonCrawlerFunc.ts` to separate auth from non-auth headers:
225
+ - Non-auth headers → safe to set globally via `extraHTTPHeaders` on context/launch options
226
+ - Basic auth → set `httpCredentials` on context (Playwright auto-responds to 401 challenges, origin-aware)
227
+ - Any Authorization header → send only to same-origin requests via `addAuthRouteHandler()` (route interception) or Crawlee's `preNavigationHooks` (navigation-only)
228
+ - Credentials come from URL-embedded `user:pass@host` or `-m "Authorization Basic ..."` — both produce the same `extraHTTPHeaders.Authorization` value in `prepareData()`
229
+
230
+ 10. **Intermediate JSONL write safety + corruption tolerance** — `ItemsStore.appendPageItems()` must strictly serialize writes per rule file so that concurrent appends cannot interleave and corrupt a line. Immediately after `JSON.stringify()`, it also applies a strict sanitization regex that filters out literal `\n` and `\r` control characters originating from website HTML inputs, so that no single JSON issue can inject an unintended line break into the JSONL file. Keep the backward-compatible `fs.appendFile` queues rather than switching to buffered WriteStreams, to guarantee that writes are synchronously visible to the rest of the pipeline. `ItemsStore.readRuleItems()` tolerates historical malformed lines by skipping them via fallback logic.
231
+
232
+ ## Testing Considerations
233
+
234
+ When making changes, validate these areas which have well-established edge cases:
235
+
236
+ ### Memory & Large Scan Handling
237
+ - Large scans (1000+ pages) can produce multi-GB JSON payloads. The report pipeline streams per-page results sequentially and writes violation items to per-rule JSONL files on disk. Only rule-level summaries (not full `pagesAffected` arrays) are embedded in report.html. Any change to report generation must be tested with 1000+ page scans.
238
+ - When writing chunked base64 data to the HTML output stream, await drain events. Silent data truncation occurs on large payloads (57MB+) without backpressure handling.
239
+ - The browser-embedded payload in report.html must remain minimal — only rule summaries with `pagesAffectedCount`, not full item arrays. Browser `JSON.parse()` cannot handle 700MB+ strings.
240
+
241
+ ### Crawlee Lifecycle & Cleanup
242
+ - Crawlee's async lock-file operations (`.json.lock` mkdir) can fire after the crawl finishes. On Windows, this triggers uncaughtException EPERM during report generation. A scoped exception handler suppresses these. The cleanup delay is 5s on Windows, 3s on others.
243
+ - The crawlee dataset folder and `tmp-items` (intermediate JSONL store) must be deleted BEFORE zipping results. `zipResults` must be the last step in `generateArtifacts()` — any cleanup or processing that removes temp files from `storagePath` must happen earlier. The dataset deletion uses an awaited delay (not fire-and-forget setTimeout) to let lingering Crawlee I/O flush.
244
+ - Errors must only be recorded in `failedRequestHandler` (after all retries exhausted), not in the `requestHandler` catch block. Crawlee retries up to 3 times, so recording in the catch block creates duplicates and false positives for URLs that succeed on retry.
245
+
246
+ ### URL & Redirect Handling
247
+ - `https://example.com` and `https://example.com/` must be treated as the same page. Use `normUrl()` (which wraps `normalizeUrl` from `@apify/utilities`) for all dedup sets.
248
+ - `www.example.com` and `example.com` must be treated as the same host. Never compare hostnames with `===` directly — use `isSameHostname()` from `src/utils.ts`, which strips the `www.` prefix. This applies to follow-strategy checks, click-discovery gating, and any other hostname comparison. Sitemaps commonly list child URLs without the `www.` prefix; browsers redirect between www/non-www variants freely.
249
+ - Pages may redirect to external domains. The crawler detects this both pre-scan (via `response.url()` after goto) and post-scan (via `page.url()` after axe completes, since JS redirects can fire during scan). Results are discarded if the page leaves its queued hostname.
250
+ - In custom flow, the entry URL should remain the user-provided URL, not the final redirected URL.
251
+
252
+ ### robots.txt Handling
253
+ - Bare paths like `/subscription/unsubscribe` must emit both the exact-path pattern AND a children glob (`/subscription/unsubscribe/**`). Query-string `?` must be escaped (minimatch treats `?` as a single-char wildcard).
254
+ - URLs found via popups, frame navigations, or interactive clicks go through `enqueueUniqueRequest` which bypasses `transformRequestFunction`. These must also be checked against robots.txt via `isDisallowedInRobotsTxt` before enqueue.
255
+
256
+ ### Local File Sitemaps
257
+ - When a local file path is used as `userUrl`, `isFollowStrategy` tries `new URL('/app/sitemaps/...')` which throws. Strategy checks must be skipped when `userUrl` is a file path. The `rule === 'all'` early-return should come before any URL parsing.
258
+
259
+ ### Page Lifecycle
260
+ - `document.title` must be captured at the START of `runAxeScript()`, before axe scanning or screenshot capture. Pages can close during these operations (timeout, navigation, crash). Never create a new page just to re-navigate for the title — this leaks pages.
261
+ - The URL guard script in custom flow must be defensive against pages that close unexpectedly. All page event handlers should handle closed contexts gracefully.
262
+
263
+ ### Proxy & Network
264
+ - Proxy detection must handle `ALL_PROXY` on Windows. The proxy resolution logic should be tested on all platforms.
265
+
266
+ ### Strategy & Filtering in Sitemap Crawls
267
+ - The `-s` (strategy) flag must be passed through to `crawlSitemap` and `getLinksFromSitemap`. For sitemap-only scans the default is `'ignore'` (all URLs); for domain/intelligent crawls it's `'same-domain'`.
268
+ - `scanDuration=0` means unlimited. Code that calculates `remainingDuration` must treat 0 as "no limit", not as "0 seconds remaining".
269
+
270
+ ### Rate Limiting, Adaptive Concurrency & CrawlRateController
271
+ - Sites with WAFs (Cloudflare, Akamai, etc.) will start returning 403/503 once too many requests arrive in rapid succession — typically after 200-300 pages.
272
+ - Both crawlers use a shared `CrawlRateController` class (`src/crawlers/crawlRateController.ts`) that provides:
273
+ 1. **Strict maxPages**: `claimSlot()` is called at the moment of success (synchronously right before `urlsCrawled.scanned.push()`), not at the top of the request handler. `abort()` is called only after claiming the last slot (`isLimitReached()` becomes true post-claim). Never abort from the top of the handler — doing so kills in-flight pages that other handlers are scanning, causing undershoot.
274
+ 2. **Circuit breaker**: After 100 consecutive HTTP 4xx/5xx failures (configurable via `OOBEE_CONSECUTIVE_MAX_RETRIES`), the crawl aborts gracefully.
275
+ 3. **Adaptive concurrency**: On each 4xx/5xx failure, concurrency is halved (floor 1). After every 10 consecutive successes, concurrency recovers by +2 toward the original value. This automatically finds the site's rate limit threshold without manual tuning.
276
+ - **Critical placement of `claimSlot()` and `abort()`**: `claimSlot()` must be synchronously right before `push()` — never at the top of the handler. `abort()` must be called only after the last slot is claimed — never from an early-exit check. Pages can be discarded mid-handler (redirect, dedup, robots.txt block), and aborting prematurely kills in-flight handlers that would have succeeded.
277
+ - Only HTTP 4xx/5xx responses trigger rate adaptation and count toward the circuit breaker — timeouts and network errors do not.
278
+ - In intelligent crawl, each phase (sitemap then domain) creates its own `CrawlRateController` instance — transitioning from sitemap to domain crawl starts fresh.
279
+ - Without the circuit breaker, a rate-limited crawl with thousands of enqueued URLs would run indefinitely, never hit the success threshold, and never generate a report.
280
+ - When enqueuing all sitemap URLs (which we do for accurate `totalLinksFetchedFromSitemaps` reporting), always ensure either a scan duration (`-d`) or the circuit breaker is in place as a safety net.
281
+
282
+ ### Axe & Custom Checks
283
+ - When axe reports color-contrast violations but cannot determine the actual colors, skip augmenting the message with contrast context (avoids crashes on null/undefined color values).
284
+ - Violation messages are enriched with live DOM context (element text, computed styles, dimensions) via `page.evaluate()` during scan. Handle cases where elements are no longer in DOM at evaluation time.
285
+ - `aria-hidden-focus` violations are re-verified against the live DOM after axe completes, to handle race conditions with JS that sets `tabindex="-1"` after `aria-hidden="true"` (common in carousel/slider libraries). The re-verification yields to the event loop before re-checking, allowing pending timers to fire. If all focusable descendants now have `tabindex < 0`, the violation is filtered out as a false positive.
286
+
287
+ ## Report Output Structure
288
+
289
+ ```
290
+ {randomToken}/
291
+ ├── datasets/default/ # Crawlee per-page JSON results
292
+ ├── report.html # Interactive HTML report
293
+ ├── summary.html → summary.pdf # PDF summary (HTML deleted after conversion)
294
+ ├── report.csv # Issue-level CSV
295
+ ├── scanData.json # Scan metadata (site, dates, type, sitemap info)
296
+ ├── scanItems.json # All issues grouped by severity
297
+ ├── scanItemsSummary.json # Summary counts
298
+ ├── scanIssuesSummary.json # Issues without page details
299
+ ├── scanPagesDetail.json # Per-page breakdown
300
+ ├── scanPagesSummary.json # Page-level summary
301
+ ├── sitemap.xml # Discovered URLs
302
+ └── screenshots/ # Violation screenshots (if enabled)
303
+ ```
package/README.md CHANGED
@@ -92,6 +92,10 @@ verapdf --version
92
92
  | WARN_LEVEL | Only used in tests. | |
93
93
  | OOBEE_DISABLE_BROWSER_DOWNLOAD | Experimental flag to disable file downloads on Chrome/Chromium/Edge. Does not affect Local File scan | |
94
94
  | OOBEE_SLOWMO | Experimental flag to slow down web browser behaviour by specified duration (in milliseconds) | |
95
+ | OOBEE_TAGGED_WEBSITE | Tag to identify the website in telemetry. Can also be set via `-z, --websiteTag` CLI flag (CLI flag takes precedence). | |
96
+ | OOBEE_SCAN_METADATA | Overrides the `entryUrl` tag sent to telemetry. | |
97
+ | OOBEE_SCAN_PRODUCT | Adds a `scanProduct` tag to telemetry events. | |
98
+ | OOBEE_CONSECUTIVE_MAX_RETRIES | Max consecutive HTTP failures before the circuit breaker aborts the crawl. | `100` |
95
99
  | HTTP_PROXY | URL of the proxy server to be used for HTTP requests (e.g. `http://proxy.example.com:8080`). | |
96
100
  | HTTPS_PROXY | URL of the proxy server to be used for HTTPS requests (e.g. `https://proxy.example.com:8080`). | |
97
101
  | ALL_PROXY | URL of the proxy server to be used for all requests, typically used for SOCKS5 proxies (e.g. `socks5://proxy.example.com:1080`). Note: IPv6 direct connections may still continue even though socks5 proxy is specified due to a known issue with Chrome/Chromium. (Recommended workaround is to turn off IPv6 at host-level). | |
@@ -400,6 +404,9 @@ Options:
400
404
  [string] [choices: "yes", "no"] [default: "no"]
401
405
  -l, --scanDuration Maximum scan duration in seconds (0 means u
402
406
  nlimited) [number] [default: 0]
407
+ -z, --websiteTag Tag to identify the website in telemetry.
408
+ Overrides OOBEE_TAGGED_WEBSITE env var.
409
+ [string]
403
410
 
404
411
  Examples:
405
412
  To scan sitemap of website:', 'npm run cli -- -c [ 1 | sitemap ] -u <url_lin
@@ -410,6 +417,21 @@ Examples:
410
417
  > [ -d <device> | -w <viewport_width> ]
411
418
 
412
419
  ```
420
+
421
+ ### Basic Auth
422
+
423
+ For sites behind HTTP Basic Authentication, you can provide credentials in two ways:
424
+
425
+ 1. **Embed in URL**: `npm run cli -- -u 'https://user:password@example.com' -c 3`
426
+ 2. **Use `-m` flag**: `npm run cli -- -u 'https://example.com' -c 3 -m "Authorization Basic dXNlcjpwYXNzd29yZA=="`
427
+
428
+ Both methods work across all scan types (sitemap, website, custom flow). For multiple headers, separate with `, `:
429
+ ```
430
+ -m "Authorization Basic dXNlcjpwYXNz, X-Custom-Header myvalue"
431
+ ```
432
+
433
+ > **Note:** Authorization headers are only sent to same-origin requests to avoid CORS preflight failures on cross-origin resources (e.g., CDN fonts, analytics scripts).
434
+
413
435
  ### Note on Windows PowerShell:
414
436
  You need to run the command as `npm run cli -- --` (with the extra set of `--`) as PowerShell interprets arguments differently.
415
437
 
package/dist/cli.js CHANGED
@@ -177,6 +177,9 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`)
177
177
  if (!options.strategy) {
178
178
  options.strategy = options.scanner === ScannerTypes.SITEMAP ? 'ignore' : 'same-domain';
179
179
  }
180
+ if (options.websiteTag) {
181
+ process.env.OOBEE_TAGGED_WEBSITE = options.websiteTag;
182
+ }
180
183
  const scanInit = async (argvs) => {
181
184
  const updatedArgvs = { ...argvs };
182
185
  // Cannot use data.browser and data.isHeadless as the connectivity check comes first before prepareData
package/dist/combine.js CHANGED
@@ -6,7 +6,7 @@ import crawlLocalFile from './crawlers/crawlLocalFile.js';
6
6
  import crawlIntelligentSitemap from './crawlers/crawlIntelligentSitemap.js';
7
7
  import generateArtifacts from './mergeAxeResults.js';
8
8
  import { getHost, createAndUpdateResultsFolders, cleanUpAndExit, getStoragePath } from './utils.js';
9
- import { ScannerTypes, UrlsCrawled } from './constants/constants.js';
9
+ import constants, { ScannerTypes, UrlsCrawled } from './constants/constants.js';
10
10
  import { getBlackListedPatterns, submitForm } from './constants/common.js';
11
11
  import { consoleLogger } from './logs.js';
12
12
  import runCustom from './crawlers/runCustom.js';
@@ -31,11 +31,23 @@ const combineRun = async (details, deviceToScan) => {
31
31
  generateJsonFiles, scanDuration, } = envDetails;
32
32
  process.env.CRAWLEE_LOG_LEVEL = 'ERROR';
33
33
  process.env.CRAWLEE_STORAGE_DIR = randomToken;
34
+ constants.sitemapFetchedLinks = null;
34
35
  if (process.env.CRAWLEE_SYSTEM_INFO_V2 === undefined) {
35
36
  // Set the environment variable to enable system info v2
36
37
  // Resolves issue with when wmic is not installed on Windows
37
38
  process.env.CRAWLEE_SYSTEM_INFO_V2 = '1';
38
39
  }
40
+ // Suppress non-fatal Crawlee ps-tree errors on Windows with non-English locales.
41
+ // The system info module tries to parse process listing headers and crashes when
42
+ // headers are in a different language (e.g. "Wo" instead of "PID").
43
+ const psTreeHandler = (err) => {
44
+ if (err.message?.includes('Unknown process listing header')) {
45
+ consoleLogger.info(`Suppressed Crawlee ps-tree locale error: ${err.message}`);
46
+ return;
47
+ }
48
+ throw err;
49
+ };
50
+ process.on('uncaughtException', psTreeHandler);
39
51
  const host = type === ScannerTypes.SITEMAP || type === ScannerTypes.LOCALFILE ? '' : getHost(url);
40
52
  let blacklistedPatterns = null;
41
53
  try {
@@ -77,7 +89,7 @@ const combineRun = async (details, deviceToScan) => {
77
89
  let durationExceeded = false;
78
90
  switch (type) {
79
91
  case ScannerTypes.CUSTOM:
80
- const res = await runCustom(url, randomToken, browser, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '');
92
+ const res = await runCustom(url, randomToken, browser, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '', extraHTTPHeaders);
81
93
  urlsCrawledObj = res.urlsCrawled;
82
94
  uiCustomFlowLabel = res.customFlowLabel;
83
95
  break;
@@ -174,7 +186,7 @@ const combineRun = async (details, deviceToScan) => {
174
186
  ];
175
187
  const basicFormHTMLSnippet = await generateArtifacts(randomToken, url, type, deviceToScan, urlsCrawledObj.scanned, pagesNotScanned, uiCustomFlowLabel && uiCustomFlowLabel.length > 0
176
188
  ? uiCustomFlowLabel
177
- : customFlowLabel || 'None', undefined, scanDetails, zip, generateJsonFiles);
189
+ : customFlowLabel || 'None', undefined, scanDetails, zip, generateJsonFiles, browser);
178
190
  const [name, email] = nameEmail.split(':');
179
191
  // Upload results to S3 if environment variables are set
180
192
  if (isS3UploadEnabled()) {
@@ -303,4 +303,11 @@ To obtain the JSON files, you need to base64-decode the file followed by gunzip.
303
303
  demandOption: false,
304
304
  coerce: val => Number(val),
305
305
  },
306
+ z: {
307
+ alias: 'websiteTag',
308
+ describe: 'Tag to identify the website in telemetry. Overrides OOBEE_TAGGED_WEBSITE env var.',
309
+ type: 'string',
310
+ requiresArg: true,
311
+ demandOption: false,
312
+ },
306
313
  };