@govtechsg/oobee 0.10.91 → 0.10.93
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +303 -0
- package/README.md +22 -0
- package/dist/cli.js +3 -0
- package/dist/combine.js +15 -3
- package/dist/constants/cliFunctions.js +7 -0
- package/dist/constants/common.js +149 -80
- package/dist/constants/constants.js +1 -0
- package/dist/crawlers/commonCrawlerFunc.js +136 -15
- package/dist/crawlers/crawlDomain.js +55 -58
- package/dist/crawlers/crawlIntelligentSitemap.js +21 -11
- package/dist/crawlers/crawlRateController.js +47 -0
- package/dist/crawlers/crawlSitemap.js +51 -62
- package/dist/crawlers/runCustom.js +8 -2
- package/dist/generateOobeeClientScanner.js +32 -1
- package/dist/mergeAxeResults/itemsStore.js +32 -3
- package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
- package/dist/mergeAxeResults.js +120 -92
- package/dist/npmIndex.js +1 -0
- package/dist/utils.js +23 -28
- package/oobee-client-scanner.js +35 -4
- package/package.json +3 -3
- package/src/cli.ts +4 -0
- package/src/combine.ts +16 -1
- package/src/constants/cliFunctions.ts +7 -0
- package/src/constants/common.ts +162 -90
- package/src/constants/constants.ts +1 -0
- package/src/crawlers/commonCrawlerFunc.ts +148 -14
- package/src/crawlers/crawlDomain.ts +64 -66
- package/src/crawlers/crawlIntelligentSitemap.ts +23 -11
- package/src/crawlers/crawlRateController.ts +63 -0
- package/src/crawlers/crawlSitemap.ts +57 -70
- package/src/crawlers/runCustom.ts +10 -1
- package/src/generateOobeeClientScanner.ts +32 -1
- package/src/index.ts +1 -0
- package/src/mergeAxeResults/itemsStore.ts +37 -3
- package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
- package/src/mergeAxeResults.ts +139 -99
- package/src/npmIndex.ts +1 -0
- package/src/utils.ts +25 -33
- /package/{bf04540e-0894-4d00-98ec-c1be74c6f199.txt → 7339fae5-e8ed-4b50-af13-317847620dbf.txt} +0 -0
package/AGENTS.md
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
# Oobee Developer Guide
|
|
2
|
+
|
|
3
|
+
> **Keep this file up to date.** When you make changes that affect architecture, crawl behavior, environment variables, or testing considerations described here, update the relevant section in the same commit.
|
|
4
|
+
|
|
5
|
+
Oobee is a web accessibility scanner that crawls websites and runs axe-core + custom checks against each page, producing HTML/PDF/CSV/JSON reports.
|
|
6
|
+
|
|
7
|
+
## Architecture Overview
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
User Input (CLI / npm API)
|
|
11
|
+
↓
|
|
12
|
+
combine.ts (orchestrator)
|
|
13
|
+
↓ routes by ScannerTypes
|
|
14
|
+
Crawler (crawlDomain / crawlSitemap / crawlIntelligentSitemap / crawlLocalFile / runCustom)
|
|
15
|
+
↓ uses Crawlee PlaywrightCrawler
|
|
16
|
+
Page Handler (axe-core injection + custom checks)
|
|
17
|
+
↓ writes per-page JSON to Crawlee dataset
|
|
18
|
+
generateArtifacts() in mergeAxeResults.ts
|
|
19
|
+
↓ reads dataset, aggregates, renders templates
|
|
20
|
+
Reports (HTML, PDF, CSV, JSON, sitemap.xml)
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Entry Points
|
|
24
|
+
|
|
25
|
+
| Entry | File | Purpose |
|
|
26
|
+
|-------|------|---------|
|
|
27
|
+
| CLI | `src/cli.ts` | yargs-based CLI, calls `combineRun()` |
|
|
28
|
+
| Interactive CLI | `src/index.ts` | Inquirer prompts, calls `combineRun()` |
|
|
29
|
+
| npm API | `src/npmIndex.ts` | Programmatic `init()` for external consumers |
|
|
30
|
+
| Orchestrator | `src/combine.ts` | Routes scan type, manages lifecycle, calls `generateArtifacts()` |
|
|
31
|
+
|
|
32
|
+
## Scanner Types
|
|
33
|
+
|
|
34
|
+
| Type | File | Behavior |
|
|
35
|
+
|------|------|----------|
|
|
36
|
+
| `Website` | `src/crawlers/crawlDomain.ts` | Domain crawl, discovers links from pages |
|
|
37
|
+
| `Sitemap` | `src/crawlers/crawlSitemap.ts` | Fetches URLs from sitemap XML |
|
|
38
|
+
| `Intelligent` | `src/crawlers/crawlIntelligentSitemap.ts` | Discovers sitemap via robots.txt, crawls it, then supplements with domain crawl |
|
|
39
|
+
| `LocalFile` | `src/crawlers/crawlLocalFile.ts` | Scans local HTML/PDF files via file:// |
|
|
40
|
+
| `Custom` | `src/crawlers/runCustom.ts` | User-driven flow (manual navigation in browser) |
|
|
41
|
+
|
|
42
|
+
## Key Files
|
|
43
|
+
|
|
44
|
+
### Constants & Configuration
|
|
45
|
+
|
|
46
|
+
- **`src/constants/constants.ts`** — Enums (`ScannerTypes`, `BrowserTypes`, `FileTypes`, `RuleFlags`), browser data dir paths, sitemap path list, WCAG mappings, shared mutable state (`robotsTxtUrls`, `sitemapFetchedLinks`, `userDataDirectory`, `launcher`)
|
|
47
|
+
- **`src/constants/common.ts`** — URL validation (`checkUrl`), browser launch options (`getPlaywrightLaunchOptions`), sitemap parsing (`getLinksFromSitemap`, `getSitemapsFromRobotsTxt`), robots.txt handling, browser selection (`getBrowserToRun`), user-agent initialization (`initModifiedUserAgent`)
|
|
48
|
+
|
|
49
|
+
### Crawlers
|
|
50
|
+
|
|
51
|
+
All crawlers use Crawlee's `PlaywrightCrawler` with:
|
|
52
|
+
- `maxRequestsPerCrawl: Infinity` (Crawlee's internal limit disabled)
|
|
53
|
+
- Manual stop when `urlsCrawled.scanned.length >= maxRequestsPerCrawl` (counts only successful scans)
|
|
54
|
+
- `retryOnBlocked: true`
|
|
55
|
+
- `useFingerprints: false`
|
|
56
|
+
|
|
57
|
+
### Report Generation
|
|
58
|
+
|
|
59
|
+
- **`src/mergeAxeResults.ts`** — Main `generateArtifacts()` function, reads Crawlee dataset, builds `allIssues` object, generates all output formats
|
|
60
|
+
- **`src/mergeAxeResults/`** — Sub-modules: `jsonArtifacts.ts` (JSON+base64), `writeCsv.ts`, `writeSitemap.ts`, `scanPages.ts`, `itemsStore.ts`, `types.ts`
|
|
61
|
+
- **`src/static/ejs/`** — EJS templates for HTML report and PDF summary
|
|
62
|
+
|
|
63
|
+
## Browser Handling
|
|
64
|
+
|
|
65
|
+
### Selection Priority
|
|
66
|
+
|
|
67
|
+
`getBrowserToRun()` in `common.ts` resolves the browser:
|
|
68
|
+
- If no preference is specified: defaults to Chrome on Windows/macOS, Chromium on Linux
|
|
69
|
+
- Fallback chains:
|
|
70
|
+
- **macOS**: Chrome → webkit
|
|
71
|
+
- **Windows**: Chrome → Edge → error
|
|
72
|
+
- **Linux**: Chrome → Chromium (bundled by Playwright)
|
|
73
|
+
- When `chromium` is specified: uses Playwright's bundled Chromium with no channel
|
|
74
|
+
|
|
75
|
+
### Launch Options
|
|
76
|
+
|
|
77
|
+
`getPlaywrightLaunchOptions()` builds Playwright launch config:
|
|
78
|
+
- Headless mode from `process.env.CRAWLEE_HEADLESS`
|
|
79
|
+
- Docker detection (`/.dockerenv`): adds `--disable-gpu`, `--no-sandbox`, `--disable-dev-shm-usage`
|
|
80
|
+
- Proxy support (manual, PAC, or none) via `getProxyInfo()`
|
|
81
|
+
- Channel set from browser name (undefined for chromium = bundled)
|
|
82
|
+
- `--mute-audio` is added by default in both headless and headful modes, but must be disabled for `customFlow` by calling `getPlaywrightLaunchOptions(browser, { includeMuteAudio: false })`
|
|
83
|
+
|
|
84
|
+
### User-Agent
|
|
85
|
+
|
|
86
|
+
`initModifiedUserAgent()` detects the default UA, replaces `HeadlessChrome` with `Chrome`, stores in `process.env.OOBEE_USER_AGENT`. This must be called before any browser context that talks to remote servers in headless mode, or bot-blocking WAFs will reject requests.
|
|
87
|
+
|
|
88
|
+
Contexts that need `userAgent: process.env.OOBEE_USER_AGENT`:
|
|
89
|
+
- `getRobotsTxtViaPlaywright()` — robots.txt fetching
|
|
90
|
+
- `findSitemap()` in `crawlIntelligentSitemap.ts` — sitemap path probing
|
|
91
|
+
- `getDataUsingPlaywright()` in `getLinksFromSitemap()` — sitemap XML content fetching
|
|
92
|
+
- `checkUrl()` — main URL validation context (already handled)
|
|
93
|
+
- Crawlee crawler contexts in `crawlDomain`/`crawlSitemap` — UA set via `preLaunchHooks` in `getPreLaunchHook()`
|
|
94
|
+
|
|
95
|
+
### Headless vs Headful
|
|
96
|
+
|
|
97
|
+
- Docker/Linux: always headless (`CRAWLEE_HEADLESS=1`)
|
|
98
|
+
- macOS CLI: typically headful (`CRAWLEE_HEADLESS=0`) unless user opts in
|
|
99
|
+
- Headful mode uses ephemeral contexts (no `userDataDir`) to avoid "Browser window not found" errors
|
|
100
|
+
- Headless mode uses `launchPersistentContext` with cloned user data directories
|
|
101
|
+
|
|
102
|
+
## Sitemap Discovery & Fetching
|
|
103
|
+
|
|
104
|
+
The intelligent crawl flow:
|
|
105
|
+
1. `getSitemapsFromRobotsTxt()` — fetches robots.txt, extracts `Sitemap:` directives
|
|
106
|
+
2. `findSitemap()` — probes hardcoded paths (`/sitemap.xml`, `/sitemap-index.xml`, etc.)
|
|
107
|
+
3. `getLinksFromSitemap()` — fetches and parses sitemap XML content, returns `Request[]`
|
|
108
|
+
|
|
109
|
+
Important behaviors:
|
|
110
|
+
- All URLs from the sitemap are discovered and stored as strings in a `Set<string>`
|
|
111
|
+
- All discovered URLs are converted to `Request` objects (no truncation at this stage)
|
|
112
|
+
- The crawler itself enforces `maxRequestsPerCrawl` by counting only successfully scanned pages
|
|
113
|
+
- `constants.sitemapFetchedLinks` stores the total discovered count for `scanData.json` reporting
|
|
114
|
+
- For sitemap indexes, child sitemaps are processed recursively
|
|
115
|
+
|
|
116
|
+
## Shared Mutable State
|
|
117
|
+
|
|
118
|
+
The `constants` default export object holds runtime state:
|
|
119
|
+
- `constants.launcher` — Playwright browser type (chromium/webkit)
|
|
120
|
+
- `constants.robotsTxtUrls` — Parsed robots.txt disallow/allow rules
|
|
121
|
+
- `constants.sitemapFetchedLinks` — Sitemap fetch diagnostics (reset per scan)
|
|
122
|
+
- `constants.userDataDirectory` — Current browser profile directory
|
|
123
|
+
- `constants.randomToken` — Current scan token
|
|
124
|
+
- `constants.resources` — Active Crawlee crawlers, browser contexts, browsers (for cleanup)
|
|
125
|
+
|
|
126
|
+
## Environment Variables
|
|
127
|
+
|
|
128
|
+
### User-Facing
|
|
129
|
+
| Variable | Purpose |
|
|
130
|
+
|----------|---------|
|
|
131
|
+
| `CRAWLEE_HEADLESS` | `1` = headless, `0` = headful (set by `setHeadlessMode()`) |
|
|
132
|
+
| `OOBEE_USER_AGENT` | Modified UA (set by `initModifiedUserAgent()`) |
|
|
133
|
+
| `OOBEE_VERBOSE` | Enable verbose console logging |
|
|
134
|
+
| `OOBEE_LOGS_PATH` | Custom log directory |
|
|
135
|
+
| `OOBEE_SLOWMO` | Browser slowmo in ms |
|
|
136
|
+
| `OOBEE_FAST_CRAWLER` | Experimental high-concurrency mode |
|
|
137
|
+
| `OOBEE_DISABLE_BROWSER_DOWNLOAD` | Block browser file downloads |
|
|
138
|
+
| `OOBEE_TAGGED_WEBSITE` | Tag to identify the website in Sentry telemetry (overridden by `--websiteTag` CLI flag) |
|
|
139
|
+
| `OOBEE_SCAN_METADATA` | Overrides `entryUrl` tag in Sentry events |
|
|
140
|
+
| `OOBEE_SCAN_PRODUCT` | Adds `scanProduct` tag to Sentry events |
|
|
141
|
+
| `OOBEE_CONSECUTIVE_MAX_RETRIES` | Max consecutive HTTP 4xx/5xx failures before circuit breaker aborts crawl (default 100) |
|
|
142
|
+
| `OOBEE_VALIDATE_URL` | If set, exit after URL validation without scanning |
|
|
143
|
+
| `HTTP_PROXY` / `HTTPS_PROXY` / `ALL_PROXY` | Proxy configuration |
|
|
144
|
+
| `NO_PROXY` / `INCLUDE_PROXY` | Proxy bypass/include lists |
|
|
145
|
+
|
|
146
|
+
### Internal (set by code)
|
|
147
|
+
| Variable | Purpose |
|
|
148
|
+
|----------|---------|
|
|
149
|
+
| `CRAWLEE_STORAGE_DIR` | Crawlee dataset directory (= randomToken) |
|
|
150
|
+
| `CRAWLEE_LOG_LEVEL` | Set to `ERROR` |
|
|
151
|
+
| `CRAWLEE_SYSTEM_INFO_V2` | `1` (Windows wmic workaround) |
|
|
152
|
+
| `NODE_TLS_REJECT_UNAUTHORIZED` | Set to `0` for self-signed certs |
|
|
153
|
+
|
|
154
|
+
## Platform Differences
|
|
155
|
+
|
|
156
|
+
### Docker/Linux
|
|
157
|
+
- `/.dockerenv` detection adds `--no-sandbox`, `--disable-gpu`, `--disable-dev-shm-usage`
|
|
158
|
+
- No system Chrome/Edge — always falls back to Playwright's bundled Chromium
|
|
159
|
+
- `getDefaultChromeDataDir()` returns null (no Chrome profile to clone)
|
|
160
|
+
- `getDefaultChromiumDataDir()` creates `./Chromium Support` or falls back to `/tmp`
|
|
161
|
+
- Always headless in Docker
|
|
162
|
+
- Default UA contains `HeadlessChrome` — must be patched via `initModifiedUserAgent()`
|
|
163
|
+
|
|
164
|
+
### macOS
|
|
165
|
+
- Defaults to system Chrome if available, falls back to webkit (not Chromium)
|
|
166
|
+
- Browser profiles at `~/Library/Application Support/Google/Chrome`
|
|
167
|
+
- Typically headful (non-headless)
|
|
168
|
+
- Logs at `~/Library/Application Support/Oobee/`
|
|
169
|
+
|
|
170
|
+
### Windows
|
|
171
|
+
- Defaults to system Chrome, falls back to Edge
|
|
172
|
+
- Browser profiles at `%LOCALAPPDATA%/Google/Chrome/User Data`
|
|
173
|
+
- File locks require longer cleanup delays (5s vs 3s)
|
|
174
|
+
- Path separator differences in cookie profile regex
|
|
175
|
+
- `CRAWLEE_SYSTEM_INFO_V2=1` needed (wmic deprecation)
|
|
176
|
+
|
|
177
|
+
## Testing
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
npm test # Run Jest tests (uses --experimental-vm-modules)
|
|
181
|
+
npx tsc --noEmit # Type-check without emitting
|
|
182
|
+
npm run build # Compile TypeScript
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
Test files: `__tests__/` directory and `src/crawlers/__tests__/`
|
|
186
|
+
|
|
187
|
+
## Build & Run
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
npm install # Install dependencies
|
|
191
|
+
npm run build # Compile TS → dist/
|
|
192
|
+
node dist/cli.js # Run CLI
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
Docker:
|
|
196
|
+
```bash
|
|
197
|
+
docker build -t oobee .
|
|
198
|
+
docker run oobee node dist/cli.js ...
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## Common Pitfalls
|
|
202
|
+
|
|
203
|
+
1. **Bot-blocking in headless mode** — Any new browser context that fetches remote content in headless mode must pass `userAgent: process.env.OOBEE_USER_AGENT`. Without this, sites with WAFs block the request.
|
|
204
|
+
|
|
205
|
+
2. **`maxRequestsPerCrawl` semantics** — This counts *successfully scanned* pages, not total requests. The sitemap enqueues all discovered URLs; the crawler stops when enough succeed. Errored pages do not consume the budget.
|
|
206
|
+
|
|
207
|
+
3. **Browser profile isolation** — Each scan clones browser profiles with a `randomToken` suffix. Profiles must be cleaned up after scan (`deleteClonedProfiles()`).
|
|
208
|
+
|
|
209
|
+
- If Chrome/Edge profile cloning fails (for example `EBUSY` while copying locked cookie/state files on Windows), Oobee now falls back to an empty cloned profile directory for that scan. This keeps browser launch stable, but authenticated session cookies may not be available.
|
|
210
|
+
- Crawlee's browser pool retires and re-launches browser instances after ~4 minutes. On Windows, reusing the same `--user-data-dir` causes Chrome exit code 21 (stale lock contention). `getPreLaunchHook()` in `commonCrawlerFunc.ts` assigns unique `_pool{N}` directories for each re-launch and performs a best-effort async clone of the base profile. Cleanup must glob `_pool*` directories alongside the base `oobee-{token}` dir.
|
|
211
|
+
|
|
212
|
+
4. **`constants.launcher` mutation** — When webkit is the fallback, `constants.launcher` is reassigned globally. This affects all subsequent browser launches in the same process.
|
|
213
|
+
|
|
214
|
+
5. **Headful vs headless context creation** — Headful mode must NOT use `launchPersistentContext` with custom `userDataDir` (causes "Browser window not found" crash). Use `launch()` + `newContext()` instead.
|
|
215
|
+
|
|
216
|
+
6. **Sitemap fetch state** — `constants.sitemapFetchedLinks` accumulates across multiple `getLinksFromSitemap` calls. Must be reset to `null` at scan start.
|
|
217
|
+
|
|
218
|
+
7. **PDF generation** — `writeSummaryPdf()` always runs headless regardless of scan mode. It loads a local `file://` URL so UA/network issues don't apply, but it needs a working browser binary.
|
|
219
|
+
|
|
220
|
+
- On Windows, summary PDF generation now retries with Edge (`msedge`) if the initial Chrome launch fails at runtime.
|
|
221
|
+
|
|
222
|
+
8. **Crawlee dataset** — Results are stored as numbered JSON files in `{randomToken}/datasets/default/`. Each file is one page's axe results. `generateArtifacts()` reads all of them.
|
|
223
|
+
|
|
224
|
+
9. **Auth headers and CORS** — Never set `Authorization` in `extraHTTPHeaders` globally on a browser context. Playwright sends `extraHTTPHeaders` to ALL requests (including cross-origin CDNs), which triggers CORS preflight failures. Instead use `splitAuthHeaders()` from `commonCrawlerFunc.ts` to separate auth from non-auth headers:
|
|
225
|
+
- Non-auth headers → safe to set globally via `extraHTTPHeaders` on context/launch options
|
|
226
|
+
- Basic auth → set `httpCredentials` on context (Playwright auto-responds to 401 challenges, origin-aware)
|
|
227
|
+
- Any Authorization header → send only to same-origin requests via `addAuthRouteHandler()` (route interception) or Crawlee's `preNavigationHooks` (navigation-only)
|
|
228
|
+
- Credentials come from URL-embedded `user:pass@host` or `-m "Authorization Basic ..."` — both produce the same `extraHTTPHeaders.Authorization` value in `prepareData()`
|
|
229
|
+
|
|
230
|
+
10. **Intermediate JSONL write safety + corruption tolerance** — `ItemsStore.appendPageItems()` requires strict serialization of writes per rule file to prevent interleaved corruption. It also enforces a strict text sanitization regex to filter out literal `\n` and `\r` control characters from website HTML inputs immediately after `JSON.stringify()`. This ensures no single JSON issue accidentally injects illegal implicit newline boundaries when writing to JSONL format. Maintain backward-compatible `fs.appendFile` queues over buffered WriteStreams to guarantee pipeline sync visibility. `ItemsStore.readRuleItems()` tolerates historical malformed lines via fallback skip logic.
|
|
231
|
+
|
|
232
|
+
## Testing Considerations
|
|
233
|
+
|
|
234
|
+
When making changes, validate these areas which have well-established edge cases:
|
|
235
|
+
|
|
236
|
+
### Memory & Large Scan Handling
|
|
237
|
+
- Large scans (1000+ pages) can produce multi-GB JSON payloads. The report pipeline streams per-page results sequentially and writes violation items to per-rule JSONL files on disk. Only rule-level summaries (not full `pagesAffected` arrays) are embedded in report.html. Any change to report generation must be tested with 1000+ page scans.
|
|
238
|
+
- When writing chunked base64 data to the HTML output stream, await drain events. Silent data truncation occurs on large payloads (57MB+) without backpressure handling.
|
|
239
|
+
- The browser-embedded payload in report.html must remain minimal — only rule summaries with `pagesAffectedCount`, not full item arrays. Browser `JSON.parse()` cannot handle 700MB+ strings.
|
|
240
|
+
|
|
241
|
+
### Crawlee Lifecycle & Cleanup
|
|
242
|
+
- Crawlee's async lock-file operations (`.json.lock` mkdir) can fire after the crawl finishes. On Windows, this triggers uncaughtException EPERM during report generation. A scoped exception handler suppresses these. The cleanup delay is 5s on Windows, 3s on others.
|
|
243
|
+
- The crawlee dataset folder and `tmp-items` (intermediate JSONL store) must be deleted BEFORE zipping results. `zipResults` must be the last step in `generateArtifacts()` — any cleanup or processing that removes temp files from `storagePath` must happen earlier. The dataset deletion uses an awaited delay (not fire-and-forget setTimeout) to let lingering Crawlee I/O flush.
|
|
244
|
+
- Errors must only be recorded in `failedRequestHandler` (after all retries exhausted), not in the `requestHandler` catch block. Crawlee retries up to 3 times, so recording in the catch block creates duplicates and false positives for URLs that succeed on retry.
|
|
245
|
+
|
|
246
|
+
### URL & Redirect Handling
|
|
247
|
+
- `https://example.com` and `https://example.com/` must be treated as the same page. Use `normUrl()` (wrapping `@apify/utilities normalizeUrl`) for all dedup sets.
|
|
248
|
+
- `www.example.com` and `example.com` must be treated as the same host. Never compare hostnames with `===` directly — use `isSameHostname()` from `src/utils.ts`, which strips the `www.` prefix. This applies to follow-strategy checks, click-discovery gating, and any other hostname comparison. Sitemaps commonly list child URLs without the `www.` prefix; browsers redirect between www/non-www variants freely.
|
|
249
|
+
- Pages may redirect to external domains. The crawler detects this both pre-scan (via `response.url()` after goto) and post-scan (via `page.url()` after axe completes, since JS redirects can fire during scan). Results are discarded if the page leaves its queued hostname.
|
|
250
|
+
- In custom flow, the entry URL should remain the user-provided URL, not the final redirected URL.
|
|
251
|
+
|
|
252
|
+
### robots.txt Handling
|
|
253
|
+
- Bare paths like `/subscription/unsubscribe` must emit both the exact-path pattern AND a children glob (`/subscription/unsubscribe/**`). Query-string `?` must be escaped (minimatch treats `?` as a single-char wildcard).
|
|
254
|
+
- URLs found via popups, frame navigations, or interactive clicks go through `enqueueUniqueRequest` which bypasses `transformRequestFunction`. These must also be checked against robots.txt via `isDisallowedInRobotsTxt` before enqueue.
|
|
255
|
+
|
|
256
|
+
### Local File Sitemaps
|
|
257
|
+
- When a local file path is used as `userUrl`, `isFollowStrategy` tries `new URL('/app/sitemaps/...')` which throws. Strategy checks must be skipped when `userUrl` is a file path. The `rule === 'all'` early-return should come before any URL parsing.
|
|
258
|
+
|
|
259
|
+
### Page Lifecycle
|
|
260
|
+
- `document.title` must be captured at the START of `runAxeScript()`, before axe scanning or screenshot capture. Pages can close during these operations (timeout, navigation, crash). Never create a new page just to re-navigate for the title — this leaks pages.
|
|
261
|
+
- The URL guard script in custom flow must be defensive against pages that close unexpectedly. All page event handlers should handle closed contexts gracefully.
|
|
262
|
+
|
|
263
|
+
### Proxy & Network
|
|
264
|
+
- Proxy detection must handle `ALL_PROXY` on Windows. The proxy resolution logic should be tested on all platforms.
|
|
265
|
+
|
|
266
|
+
### Strategy & Filtering in Sitemap Crawls
|
|
267
|
+
- The `-s` (strategy) flag must be passed through to `crawlSitemap` and `getLinksFromSitemap`. For sitemap-only scans the default is `'ignore'` (all URLs); for domain/intelligent crawls it's `'same-domain'`.
|
|
268
|
+
- `scanDuration=0` means unlimited. Code that calculates `remainingDuration` must treat 0 as "no limit", not as "0 seconds remaining".
|
|
269
|
+
|
|
270
|
+
### Rate Limiting, Adaptive Concurrency & CrawlRateController
|
|
271
|
+
- Sites with WAFs (Cloudflare, Akamai, etc.) will start returning 403/503 once too many requests arrive in a short window — typically after 200-300 pages in rapid succession.
|
|
272
|
+
- Both crawlers use a shared `CrawlRateController` class (`src/crawlers/crawlRateController.ts`) that provides:
|
|
273
|
+
1. **Strict maxPages**: `claimSlot()` is called at the moment of success (synchronously right before `urlsCrawled.scanned.push()`), not at the top of the request handler. `abort()` is called only after claiming the last slot (`isLimitReached()` becomes true post-claim). Never abort from the top of the handler — doing so kills in-flight pages that other handlers are scanning, causing undershoot.
|
|
274
|
+
2. **Circuit breaker**: After 100 consecutive HTTP 4xx/5xx failures (configurable via `OOBEE_CONSECUTIVE_MAX_RETRIES`), the crawl aborts gracefully.
|
|
275
|
+
3. **Adaptive concurrency**: On each 4xx/5xx failure, concurrency is halved (floor 1). After every 10 consecutive successes, concurrency recovers by +2 toward the original value. This automatically finds the site's rate limit threshold without manual tuning.
|
|
276
|
+
- **Critical placement of `claimSlot()` and `abort()`**: `claimSlot()` must be synchronously right before `push()` — never at the top of the handler. `abort()` must be called only after the last slot is claimed — never from an early-exit check. Pages can be discarded mid-handler (redirect, dedup, robots.txt block), and aborting prematurely kills in-flight handlers that would have succeeded.
|
|
277
|
+
- Only HTTP 4xx/5xx responses trigger rate adaptation and count toward the circuit breaker — timeouts and network errors do not.
|
|
278
|
+
- In intelligent crawl, each phase (sitemap then domain) creates its own `CrawlRateController` instance — transitioning from sitemap to domain crawl starts fresh.
|
|
279
|
+
- Without the circuit breaker, a rate-limited crawl with thousands of enqueued URLs would run indefinitely, never hit the success threshold, and never generate a report.
|
|
280
|
+
- When enqueuing all sitemap URLs (which we do for accurate `totalLinksFetchedFromSitemaps` reporting), always ensure either a scan duration (`-d`) or the circuit breaker is in place as a safety net.
|
|
281
|
+
|
|
282
|
+
### Axe & Custom Checks
|
|
283
|
+
- When axe reports color-contrast violations but cannot determine the actual colors, skip augmenting the message with contrast context (avoids crashes on null/undefined color values).
|
|
284
|
+
- Violation messages are enriched with live DOM context (element text, computed styles, dimensions) via `page.evaluate()` during scan. Handle cases where elements are no longer in DOM at evaluation time.
|
|
285
|
+
- `aria-hidden-focus` violations are re-verified against the live DOM after axe completes, to handle race conditions with JS that sets `tabindex="-1"` after `aria-hidden="true"` (common in carousel/slider libraries). The re-verification yields to the event loop before re-checking, allowing pending timers to fire. If all focusable descendants now have `tabindex < 0`, the violation is filtered out as a false positive.
|
|
286
|
+
|
|
287
|
+
## Report Output Structure
|
|
288
|
+
|
|
289
|
+
```
|
|
290
|
+
{randomToken}/
|
|
291
|
+
├── datasets/default/ # Crawlee per-page JSON results
|
|
292
|
+
├── report.html # Interactive HTML report
|
|
293
|
+
├── summary.html → summary.pdf # PDF summary (HTML deleted after conversion)
|
|
294
|
+
├── report.csv # Issue-level CSV
|
|
295
|
+
├── scanData.json # Scan metadata (site, dates, type, sitemap info)
|
|
296
|
+
├── scanItems.json # All issues grouped by severity
|
|
297
|
+
├── scanItemsSummary.json # Summary counts
|
|
298
|
+
├── scanIssuesSummary.json # Issues without page details
|
|
299
|
+
├── scanPagesDetail.json # Per-page breakdown
|
|
300
|
+
├── scanPagesSummary.json # Page-level summary
|
|
301
|
+
├── sitemap.xml # Discovered URLs
|
|
302
|
+
└── screenshots/ # Violation screenshots (if enabled)
|
|
303
|
+
```
|
package/README.md
CHANGED
|
@@ -92,6 +92,10 @@ verapdf --version
|
|
|
92
92
|
| WARN_LEVEL | Only used in tests. | |
|
|
93
93
|
| OOBEE_DISABLE_BROWSER_DOWNLOAD | Experimental flag to disable file downloads on Chrome/Chromium/Edge. Does not affect Local File scan | |
|
|
94
94
|
| OOBEE_SLOWMO | Experimental flag to slow down web browser behaviour by specified duration (in milliseconds) | |
|
|
95
|
+
| OOBEE_TAGGED_WEBSITE | Tag to identify the website in telemetry. Can also be set via `-z, --websiteTag` CLI flag (CLI flag takes precedence). | |
|
|
96
|
+
| OOBEE_SCAN_METADATA | Overrides the `entryUrl` tag sent to telemetry. | |
|
|
97
|
+
| OOBEE_SCAN_PRODUCT | Adds a `scanProduct` tag to telemetry events. | |
|
|
98
|
+
| OOBEE_CONSECUTIVE_MAX_RETRIES | Max consecutive HTTP 4xx/5xx failures before the circuit breaker aborts the crawl. | `100` |
|
|
95
99
|
| HTTP_PROXY | URL of the proxy server to be used for HTTP requests (e.g. `http://proxy.example.com:8080`). | |
|
|
96
100
|
| HTTPS_PROXY | URL of the proxy server to be used for HTTPS requests (e.g. `https://proxy.example.com:8080`). | |
|
|
97
101
|
| ALL_PROXY | URL of the proxy server to be used for all requests, typically used for SOCKS5 proxies (e.g. `socks5://proxy.example.com:1080`). Note: IPv6 direct connections may still continue even though socks5 proxy is specified due to a known issue with Chrome/Chromium. (Recommended workaround is to turn off IPv6 at host-level). | |
|
|
@@ -400,6 +404,9 @@ Options:
|
|
|
400
404
|
[string] [choices: "yes", "no"] [default: "no"]
|
|
401
405
|
-l, --scanDuration Maximum scan duration in seconds (0 means u
|
|
402
406
|
nlimited) [number] [default: 0]
|
|
407
|
+
-z, --websiteTag Tag to identify the website in telemetry.
|
|
408
|
+
Overrides OOBEE_TAGGED_WEBSITE env var.
|
|
409
|
+
[string]
|
|
403
410
|
|
|
404
411
|
Examples:
|
|
405
412
|
To scan sitemap of website:', 'npm run cli -- -c [ 1 | sitemap ] -u <url_lin
|
|
@@ -410,6 +417,21 @@ Examples:
|
|
|
410
417
|
> [ -d <device> | -w <viewport_width> ]
|
|
411
418
|
|
|
412
419
|
```
|
|
420
|
+
|
|
421
|
+
### Basic Auth
|
|
422
|
+
|
|
423
|
+
For sites behind HTTP Basic Authentication, you can provide credentials in two ways:
|
|
424
|
+
|
|
425
|
+
1. **Embed in URL**: `npm run cli -- -u 'https://user:password@example.com' -c 3`
|
|
426
|
+
2. **Use `-m` flag**: `npm run cli -- -u 'https://example.com' -c 3 -m "Authorization Basic dXNlcjpwYXNzd29yZA=="`
|
|
427
|
+
|
|
428
|
+
Both methods work across all scan types (sitemap, website, custom flow). For multiple headers, separate with `, `:
|
|
429
|
+
```
|
|
430
|
+
-m "Authorization Basic dXNlcjpwYXNz, X-Custom-Header myvalue"
|
|
431
|
+
```
|
|
432
|
+
|
|
433
|
+
> **Note:** Authorization headers are only sent to same-origin requests to avoid CORS preflight failures on cross-origin resources (e.g., CDN fonts, analytics scripts).
|
|
434
|
+
|
|
413
435
|
### Note on Windows PowerShell:
|
|
414
436
|
You need to run the command as `npm run cli -- --` (with the extra set of `--`) as PowerShell interprets arguments differently.
|
|
415
437
|
|
package/dist/cli.js
CHANGED
|
@@ -177,6 +177,9 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`)
|
|
|
177
177
|
if (!options.strategy) {
|
|
178
178
|
options.strategy = options.scanner === ScannerTypes.SITEMAP ? 'ignore' : 'same-domain';
|
|
179
179
|
}
|
|
180
|
+
if (options.websiteTag) {
|
|
181
|
+
process.env.OOBEE_TAGGED_WEBSITE = options.websiteTag;
|
|
182
|
+
}
|
|
180
183
|
const scanInit = async (argvs) => {
|
|
181
184
|
const updatedArgvs = { ...argvs };
|
|
182
185
|
// Cannot use data.browser and data.isHeadless as the connectivity check comes first before prepareData
|
package/dist/combine.js
CHANGED
|
@@ -6,7 +6,7 @@ import crawlLocalFile from './crawlers/crawlLocalFile.js';
|
|
|
6
6
|
import crawlIntelligentSitemap from './crawlers/crawlIntelligentSitemap.js';
|
|
7
7
|
import generateArtifacts from './mergeAxeResults.js';
|
|
8
8
|
import { getHost, createAndUpdateResultsFolders, cleanUpAndExit, getStoragePath } from './utils.js';
|
|
9
|
-
import { ScannerTypes, UrlsCrawled } from './constants/constants.js';
|
|
9
|
+
import constants, { ScannerTypes, UrlsCrawled } from './constants/constants.js';
|
|
10
10
|
import { getBlackListedPatterns, submitForm } from './constants/common.js';
|
|
11
11
|
import { consoleLogger } from './logs.js';
|
|
12
12
|
import runCustom from './crawlers/runCustom.js';
|
|
@@ -31,11 +31,23 @@ const combineRun = async (details, deviceToScan) => {
|
|
|
31
31
|
generateJsonFiles, scanDuration, } = envDetails;
|
|
32
32
|
process.env.CRAWLEE_LOG_LEVEL = 'ERROR';
|
|
33
33
|
process.env.CRAWLEE_STORAGE_DIR = randomToken;
|
|
34
|
+
constants.sitemapFetchedLinks = null;
|
|
34
35
|
if (process.env.CRAWLEE_SYSTEM_INFO_V2 === undefined) {
|
|
35
36
|
// Set the environment variable to enable system info v2
|
|
36
37
|
// Resolves an issue that occurs when wmic is not installed on Windows
|
|
37
38
|
process.env.CRAWLEE_SYSTEM_INFO_V2 = '1';
|
|
38
39
|
}
|
|
40
|
+
// Suppress non-fatal Crawlee ps-tree errors on Windows with non-English locales.
|
|
41
|
+
// The system info module tries to parse process listing headers and crashes when
|
|
42
|
+
// headers are in a different language (e.g. "Wo" instead of "PID").
|
|
43
|
+
const psTreeHandler = (err) => {
|
|
44
|
+
if (err.message?.includes('Unknown process listing header')) {
|
|
45
|
+
consoleLogger.info(`Suppressed Crawlee ps-tree locale error: ${err.message}`);
|
|
46
|
+
return;
|
|
47
|
+
}
|
|
48
|
+
throw err;
|
|
49
|
+
};
|
|
50
|
+
process.on('uncaughtException', psTreeHandler);
|
|
39
51
|
const host = type === ScannerTypes.SITEMAP || type === ScannerTypes.LOCALFILE ? '' : getHost(url);
|
|
40
52
|
let blacklistedPatterns = null;
|
|
41
53
|
try {
|
|
@@ -77,7 +89,7 @@ const combineRun = async (details, deviceToScan) => {
|
|
|
77
89
|
let durationExceeded = false;
|
|
78
90
|
switch (type) {
|
|
79
91
|
case ScannerTypes.CUSTOM:
|
|
80
|
-
const res = await runCustom(url, randomToken, browser, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '');
|
|
92
|
+
const res = await runCustom(url, randomToken, browser, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '', extraHTTPHeaders);
|
|
81
93
|
urlsCrawledObj = res.urlsCrawled;
|
|
82
94
|
uiCustomFlowLabel = res.customFlowLabel;
|
|
83
95
|
break;
|
|
@@ -174,7 +186,7 @@ const combineRun = async (details, deviceToScan) => {
|
|
|
174
186
|
];
|
|
175
187
|
const basicFormHTMLSnippet = await generateArtifacts(randomToken, url, type, deviceToScan, urlsCrawledObj.scanned, pagesNotScanned, uiCustomFlowLabel && uiCustomFlowLabel.length > 0
|
|
176
188
|
? uiCustomFlowLabel
|
|
177
|
-
: customFlowLabel || 'None', undefined, scanDetails, zip, generateJsonFiles);
|
|
189
|
+
: customFlowLabel || 'None', undefined, scanDetails, zip, generateJsonFiles, browser);
|
|
178
190
|
const [name, email] = nameEmail.split(':');
|
|
179
191
|
// Upload results to S3 if environment variables are set
|
|
180
192
|
if (isS3UploadEnabled()) {
|
|
@@ -303,4 +303,11 @@ To obtain the JSON files, you need to base64-decode the file followed by gunzip.
|
|
|
303
303
|
demandOption: false,
|
|
304
304
|
coerce: val => Number(val),
|
|
305
305
|
},
|
|
306
|
+
z: {
|
|
307
|
+
alias: 'websiteTag',
|
|
308
|
+
describe: 'Tag to identify the website in telemetry. Overrides OOBEE_TAGGED_WEBSITE env var.',
|
|
309
|
+
type: 'string',
|
|
310
|
+
requiresArg: true,
|
|
311
|
+
demandOption: false,
|
|
312
|
+
},
|
|
306
313
|
};
|