@govtechsg/oobee 0.10.91 → 0.10.92

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. package/AGENTS.md +289 -0
  2. package/README.md +3 -0
  3. package/dist/cli.js +3 -0
  4. package/dist/combine.js +14 -2
  5. package/dist/constants/cliFunctions.js +7 -0
  6. package/dist/constants/common.js +119 -70
  7. package/dist/constants/constants.js +1 -0
  8. package/dist/crawlers/commonCrawlerFunc.js +93 -15
  9. package/dist/crawlers/crawlDomain.js +45 -57
  10. package/dist/crawlers/crawlIntelligentSitemap.js +12 -7
  11. package/dist/crawlers/crawlRateController.js +47 -0
  12. package/dist/crawlers/crawlSitemap.js +51 -62
  13. package/dist/generateOobeeClientScanner.js +31 -0
  14. package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
  15. package/dist/mergeAxeResults.js +120 -92
  16. package/dist/npmIndex.js +1 -0
  17. package/dist/utils.js +23 -28
  18. package/oobee-client-scanner.js +33 -2
  19. package/package.json +2 -2
  20. package/src/cli.ts +4 -0
  21. package/src/combine.ts +15 -1
  22. package/src/constants/cliFunctions.ts +7 -0
  23. package/src/constants/common.ts +131 -79
  24. package/src/constants/constants.ts +1 -0
  25. package/src/crawlers/commonCrawlerFunc.ts +103 -14
  26. package/src/crawlers/crawlDomain.ts +52 -65
  27. package/src/crawlers/crawlIntelligentSitemap.ts +13 -7
  28. package/src/crawlers/crawlRateController.ts +63 -0
  29. package/src/crawlers/crawlSitemap.ts +57 -70
  30. package/src/generateOobeeClientScanner.ts +31 -0
  31. package/src/index.ts +1 -0
  32. package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
  33. package/src/mergeAxeResults.ts +139 -99
  34. package/src/npmIndex.ts +1 -0
  35. package/src/utils.ts +25 -33
  36. /package/{bf04540e-0894-4d00-98ec-c1be74c6f199.txt → d5e2f6a7-0279-41a3-8763-844970cdf0ba.txt} +0 -0
package/AGENTS.md ADDED
@@ -0,0 +1,289 @@
1
+ # Oobee Developer Guide
2
+
3
+ > **Keep this file up to date.** When you make changes that affect architecture, crawl behavior, environment variables, or testing considerations described here, update the relevant section in the same commit.
4
+
5
+ Oobee is a web accessibility scanner that crawls websites and runs axe-core + custom checks against each page, producing HTML/PDF/CSV/JSON reports.
6
+
7
+ ## Architecture Overview
8
+
9
+ ```
10
+ User Input (CLI / npm API)
11
+
12
+ combine.ts (orchestrator)
13
+ ↓ routes by ScannerTypes
14
+ Crawler (crawlDomain / crawlSitemap / crawlIntelligentSitemap / crawlLocalFile / runCustom)
15
+ ↓ uses Crawlee PlaywrightCrawler
16
+ Page Handler (axe-core injection + custom checks)
17
+ ↓ writes per-page JSON to Crawlee dataset
18
+ generateArtifacts() in mergeAxeResults.ts
19
+ ↓ reads dataset, aggregates, renders templates
20
+ Reports (HTML, PDF, CSV, JSON, sitemap.xml)
21
+ ```
22
+
23
+ ## Entry Points
24
+
25
+ | Entry | File | Purpose |
26
+ |-------|------|---------|
27
+ | CLI | `src/cli.ts` | yargs-based CLI, calls `combineRun()` |
28
+ | Interactive CLI | `src/index.ts` | Inquirer prompts, calls `combineRun()` |
29
+ | npm API | `src/npmIndex.ts` | Programmatic `init()` for external consumers |
30
+ | Orchestrator | `src/combine.ts` | Routes scan type, manages lifecycle, calls `generateArtifacts()` |
31
+
32
+ ## Scanner Types
33
+
34
+ | Type | File | Behavior |
35
+ |------|------|----------|
36
+ | `Website` | `src/crawlers/crawlDomain.ts` | Domain crawl, discovers links from pages |
37
+ | `Sitemap` | `src/crawlers/crawlSitemap.ts` | Fetches URLs from sitemap XML |
38
+ | `Intelligent` | `src/crawlers/crawlIntelligentSitemap.ts` | Discovers sitemap via robots.txt, crawls it, then supplements with domain crawl |
39
+ | `LocalFile` | `src/crawlers/crawlLocalFile.ts` | Scans local HTML/PDF files via file:// |
40
+ | `Custom` | `src/crawlers/runCustom.ts` | User-driven flow (manual navigation in browser) |
41
+
42
+ ## Key Files
43
+
44
+ ### Constants & Configuration
45
+
46
+ - **`src/constants/constants.ts`** — Enums (`ScannerTypes`, `BrowserTypes`, `FileTypes`, `RuleFlags`), browser data dir paths, sitemap path list, WCAG mappings, shared mutable state (`robotsTxtUrls`, `sitemapFetchedLinks`, `userDataDirectory`, `launcher`)
47
+ - **`src/constants/common.ts`** — URL validation (`checkUrl`), browser launch options (`getPlaywrightLaunchOptions`), sitemap parsing (`getLinksFromSitemap`, `getSitemapsFromRobotsTxt`), robots.txt handling, browser selection (`getBrowserToRun`), user-agent initialization (`initModifiedUserAgent`)
48
+
49
+ ### Crawlers
50
+
51
+ All crawlers use Crawlee's `PlaywrightCrawler` with:
52
+ - `maxRequestsPerCrawl: Infinity` (Crawlee's internal limit disabled)
53
+ - Manual stop when `urlsCrawled.scanned.length >= maxRequestsPerCrawl` (counts only successful scans)
54
+ - `retryOnBlocked: true`
55
+ - `useFingerprints: false`
56
+
57
+ ### Report Generation
58
+
59
+ - **`src/mergeAxeResults.ts`** — Main `generateArtifacts()` function, reads Crawlee dataset, builds `allIssues` object, generates all output formats
60
+ - **`src/mergeAxeResults/`** — Sub-modules: `jsonArtifacts.ts` (JSON+base64), `writeCsv.ts`, `writeSitemap.ts`, `scanPages.ts`, `itemsStore.ts`, `types.ts`
61
+ - **`src/static/ejs/`** — EJS templates for HTML report and PDF summary
62
+
63
+ ## Browser Handling
64
+
65
+ ### Selection Priority
66
+
67
+ `getBrowserToRun()` in `common.ts` resolves the browser:
68
+ - If no preference specified: defaults to Chrome on Windows/macOS, Chromium on Linux
69
+ - Fallback chains:
70
+ - **macOS**: Chrome → webkit
71
+ - **Windows**: Chrome → Edge → error
72
+ - **Linux**: Chrome → Chromium (bundled by Playwright)
73
+ - When `chromium` is specified: uses Playwright's bundled Chromium with no channel
74
+
75
+ ### Launch Options
76
+
77
+ `getPlaywrightLaunchOptions()` builds Playwright launch config:
78
+ - Headless mode from `process.env.CRAWLEE_HEADLESS`
79
+ - Docker detection (`/.dockerenv`): adds `--disable-gpu`, `--no-sandbox`, `--disable-dev-shm-usage`
80
+ - Proxy support (manual, PAC, or none) via `getProxyInfo()`
81
+ - Channel set from browser name (undefined for chromium = bundled)
82
+
83
+ ### User-Agent
84
+
85
+ `initModifiedUserAgent()` detects the default UA, replaces `HeadlessChrome` with `Chrome`, and stores the result in `process.env.OOBEE_USER_AGENT`. It must be called before creating any browser context that talks to remote servers in headless mode; otherwise bot-blocking WAFs will reject the requests.
86
+
87
+ Contexts that need `userAgent: process.env.OOBEE_USER_AGENT`:
88
+ - `getRobotsTxtViaPlaywright()` — robots.txt fetching
89
+ - `findSitemap()` in `crawlIntelligentSitemap.ts` — sitemap path probing
90
+ - `getDataUsingPlaywright()` in `getLinksFromSitemap()` — sitemap XML content fetching
91
+ - `checkUrl()` — main URL validation context (already handled)
92
+ - Crawlee crawler contexts in `crawlDomain`/`crawlSitemap` — UA set via `preLaunchHooks` in `getPreLaunchHook()`
93
+
94
+ ### Headless vs Headful
95
+
96
+ - Docker/Linux: always headless (`CRAWLEE_HEADLESS=1`)
97
+ - macOS CLI: typically headful (`CRAWLEE_HEADLESS=0`) unless the user opts in to headless mode
98
+ - Headful mode uses ephemeral contexts (no `userDataDir`) to avoid "Browser window not found" errors
99
+ - Headless mode uses `launchPersistentContext` with cloned user data directories
100
+
101
+ ## Sitemap Discovery & Fetching
102
+
103
+ The intelligent crawl flow:
104
+ 1. `getSitemapsFromRobotsTxt()` — fetches robots.txt, extracts `Sitemap:` directives
105
+ 2. `findSitemap()` — probes hardcoded paths (`/sitemap.xml`, `/sitemap-index.xml`, etc.)
106
+ 3. `getLinksFromSitemap()` — fetches and parses sitemap XML content, returns `Request[]`
107
+
108
+ Important behaviors:
109
+ - All URLs from the sitemap are discovered and stored as strings in a `Set<string>`
110
+ - All discovered URLs are converted to `Request` objects (no truncation at this stage)
111
+ - The crawler itself enforces `maxRequestsPerCrawl` by counting only successfully scanned pages
112
+ - `constants.sitemapFetchedLinks` stores the total discovered count for `scanData.json` reporting
113
+ - For sitemap indexes, child sitemaps are processed recursively
114
+
115
+ ## Shared Mutable State
116
+
117
+ The `constants` default export object holds runtime state:
118
+ - `constants.launcher` — Playwright browser type (chromium/webkit)
119
+ - `constants.robotsTxtUrls` — Parsed robots.txt disallow/allow rules
120
+ - `constants.sitemapFetchedLinks` — Sitemap fetch diagnostics (reset per scan)
121
+ - `constants.userDataDirectory` — Current browser profile directory
122
+ - `constants.randomToken` — Current scan token
123
+ - `constants.resources` — Active Crawlee crawlers, browser contexts, browsers (for cleanup)
124
+
125
+ ## Environment Variables
126
+
127
+ ### User-Facing
128
+ | Variable | Purpose |
129
+ |----------|---------|
130
+ | `CRAWLEE_HEADLESS` | `1` = headless, `0` = headful (set by `setHeadlessMode()`) |
131
+ | `OOBEE_USER_AGENT` | Modified UA (set by `initModifiedUserAgent()`) |
132
+ | `OOBEE_VERBOSE` | Enable verbose console logging |
133
+ | `OOBEE_LOGS_PATH` | Custom log directory |
134
+ | `OOBEE_SLOWMO` | Browser slowmo in ms |
135
+ | `OOBEE_FAST_CRAWLER` | Experimental high-concurrency mode |
136
+ | `OOBEE_DISABLE_BROWSER_DOWNLOAD` | Block browser file downloads |
137
+ | `HTTP_PROXY` / `HTTPS_PROXY` / `ALL_PROXY` | Proxy configuration |
138
+ | `NO_PROXY` / `INCLUDE_PROXY` | Proxy bypass/include lists |
139
+
140
+ ### Internal (set by code)
141
+ | Variable | Purpose |
142
+ |----------|---------|
143
+ | `CRAWLEE_STORAGE_DIR` | Crawlee dataset directory (= randomToken) |
144
+ | `CRAWLEE_LOG_LEVEL` | Set to `ERROR` |
145
+ | `CRAWLEE_SYSTEM_INFO_V2` | `1` (Windows wmic workaround) |
146
+ | `NODE_TLS_REJECT_UNAUTHORIZED` | Set to `0` for self-signed certs |
147
+
148
+ ## Platform Differences
149
+
150
+ ### Docker/Linux
151
+ - `/.dockerenv` detection adds `--no-sandbox`, `--disable-gpu`, `--disable-dev-shm-usage`
152
+ - No system Chrome/Edge — always falls back to Playwright's bundled Chromium
153
+ - `getDefaultChromeDataDir()` returns null (no Chrome profile to clone)
154
+ - `getDefaultChromiumDataDir()` creates `./Chromium Support` or falls back to `/tmp`
155
+ - Always headless in Docker
156
+ - Default UA contains `HeadlessChrome` — must be patched via `initModifiedUserAgent()`
157
+
158
+ ### macOS
159
+ - Defaults to system Chrome if available, falls back to webkit (not Chromium)
160
+ - Browser profiles at `~/Library/Application Support/Google/Chrome`
161
+ - Typically headful (non-headless)
162
+ - Logs at `~/Library/Application Support/Oobee/`
163
+
164
+ ### Windows
165
+ - Defaults to system Chrome, falls back to Edge
166
+ - Browser profiles at `%LOCALAPPDATA%/Google/Chrome/User Data`
167
+ - File locks require longer cleanup delays (5s vs 3s)
168
+ - Path separator differences in cookie profile regex
169
+ - `CRAWLEE_SYSTEM_INFO_V2=1` needed (wmic deprecation)
170
+
171
+ ## Testing
172
+
173
+ ```bash
174
+ npm test # Run Jest tests (uses --experimental-vm-modules)
175
+ npx tsc --noEmit # Type-check without emitting
176
+ npm run build # Compile TypeScript
177
+ ```
178
+
179
+ Test files: `__tests__/` directory and `src/crawlers/__tests__/`
180
+
181
+ ## Build & Run
182
+
183
+ ```bash
184
+ npm install # Install dependencies
185
+ npm run build # Compile TS → dist/
186
+ node dist/cli.js # Run CLI
187
+ ```
188
+
189
+ Docker:
190
+ ```bash
191
+ docker build -t oobee .
192
+ docker run oobee node dist/cli.js ...
193
+ ```
194
+
195
+ ## Common Pitfalls
196
+
197
+ 1. **Bot-blocking in headless mode** — Any new browser context that fetches remote content in headless mode must pass `userAgent: process.env.OOBEE_USER_AGENT`. Without this, sites with WAFs block the request.
198
+
199
+ 2. **`maxRequestsPerCrawl` semantics** — This counts *successfully scanned* pages, not total requests. The sitemap enqueues all discovered URLs; the crawler stops when enough succeed. Errored pages do not consume the budget.
200
+
201
+ 3. **Browser profile isolation** — Each scan clones browser profiles with a `randomToken` suffix. Profiles must be cleaned up after scan (`deleteClonedProfiles()`).
202
+
203
+ - If Chrome/Edge profile cloning fails (for example `EBUSY` while copying locked cookie/state files on Windows), Oobee now falls back to an empty cloned profile directory for that scan. This keeps browser launch stable, but authenticated session cookies may not be available.
204
+ - Crawlee's browser pool retires and re-launches browser instances after ~4 minutes. On Windows, reusing the same `--user-data-dir` causes Chrome exit code 21 (stale lock contention). `getPreLaunchHook()` in `commonCrawlerFunc.ts` assigns unique `_pool{N}` directories for each re-launch and performs a best-effort async clone of the base profile. Cleanup must glob `_pool*` directories alongside the base `oobee-{token}` dir.
205
+
206
+ 4. **`constants.launcher` mutation** — When webkit is the fallback, `constants.launcher` is reassigned globally. This affects all subsequent browser launches in the same process.
207
+
208
+ 5. **Headful vs headless context creation** — Headful mode must NOT use `launchPersistentContext` with custom `userDataDir` (causes "Browser window not found" crash). Use `launch()` + `newContext()` instead.
209
+
210
+ 6. **Sitemap fetch state** — `constants.sitemapFetchedLinks` accumulates across multiple `getLinksFromSitemap` calls. Must be reset to `null` at scan start.
211
+
212
+ 7. **PDF generation** — `writeSummaryPdf()` always runs headless regardless of scan mode. It loads a local `file://` URL so UA/network issues don't apply, but it needs a working browser binary.
213
+
214
+ - On Windows, summary PDF generation now retries with Edge (`msedge`) if the initial Chrome launch fails at runtime.
215
+
216
+ 8. **Crawlee dataset** — Results are stored as numbered JSON files in `{randomToken}/datasets/default/`. Each file is one page's axe results. `generateArtifacts()` reads all of them.
217
+
218
+ ## Testing Considerations
219
+
220
+ When making changes, validate these areas which have well-established edge cases:
221
+
222
+ ### Memory & Large Scan Handling
223
+ - Large scans (1000+ pages) can produce multi-GB JSON payloads. The report pipeline streams per-page results sequentially and writes violation items to per-rule JSONL files on disk. Only rule-level summaries (not full `pagesAffected` arrays) are embedded in report.html. Any change to report generation must be tested with 1000+ page scans.
224
+ - When writing chunked base64 data to the HTML output stream, await drain events. Silent data truncation occurs on large payloads (57MB+) without backpressure handling.
225
+ - The browser-embedded payload in report.html must remain minimal — only rule summaries with `pagesAffectedCount`, not full item arrays. Browser `JSON.parse()` cannot handle 700MB+ strings.
226
+
227
+ ### Crawlee Lifecycle & Cleanup
228
+ - Crawlee's async lock-file operations (`.json.lock` mkdir) can fire after the crawl finishes. On Windows, this triggers uncaughtException EPERM during report generation. A scoped exception handler suppresses these. The cleanup delay is 5s on Windows, 3s on others.
229
+ - The crawlee dataset folder and `tmp-items` (intermediate JSONL store) must be deleted BEFORE zipping results. `zipResults` must be the last step in `generateArtifacts()` — any cleanup or processing that removes temp files from `storagePath` must happen earlier. The dataset deletion uses an awaited delay (not fire-and-forget setTimeout) to let lingering Crawlee I/O flush.
230
+ - Errors must only be recorded in `failedRequestHandler` (after all retries exhausted), not in the `requestHandler` catch block. Crawlee retries up to 3 times, so recording in the catch block creates duplicates and false positives for URLs that succeed on retry.
231
+
232
+ ### URL & Redirect Handling
233
+ - `https://example.com` and `https://example.com/` must be treated as the same page. Use `normUrl()` (wrapping `@apify/utilities normalizeUrl`) for all dedup sets.
234
+ - `www.example.com` and `example.com` must be treated as the same host. Never compare hostnames with `===` directly — use `isSameHostname()` from `src/utils.ts`, which strips the `www.` prefix. This applies to follow-strategy checks, click-discovery gating, and any other hostname comparison. Sitemaps commonly list child URLs without the `www.` prefix; browsers redirect between www/non-www variants freely.
235
+ - Pages may redirect to external domains. The crawler detects this both pre-scan (via `response.url()` after goto) and post-scan (via `page.url()` after axe completes, since JS redirects can fire during scan). Results are discarded if the page leaves its queued hostname.
236
+ - In custom flow, the entry URL should remain the user-provided URL, not the final redirected URL.
237
+
238
+ ### robots.txt Handling
239
+ - Bare paths like `/subscription/unsubscribe` must emit both the exact-path pattern AND a children glob (`/subscription/unsubscribe/**`). Query-string `?` must be escaped (minimatch treats `?` as a single-char wildcard).
240
+ - URLs found via popups, frame navigations, or interactive clicks go through `enqueueUniqueRequest` which bypasses `transformRequestFunction`. These must also be checked against robots.txt via `isDisallowedInRobotsTxt` before enqueue.
241
+
242
+ ### Local File Sitemaps
243
+ - When a local file path is used as `userUrl`, `isFollowStrategy` tries `new URL('/app/sitemaps/...')` which throws. Strategy checks must be skipped when `userUrl` is a file path. The `rule === 'all'` early-return should come before any URL parsing.
244
+
245
+ ### Page Lifecycle
246
+ - `document.title` must be captured at the START of `runAxeScript()`, before axe scanning or screenshot capture. Pages can close during these operations (timeout, navigation, crash). Never create a new page just to re-navigate for the title — this leaks pages.
247
+ - The URL guard script in custom flow must be defensive against pages that close unexpectedly. All page event handlers should handle closed contexts gracefully.
248
+
249
+ ### Proxy & Network
250
+ - Proxy detection must handle `ALL_PROXY` on Windows. The proxy resolution logic should be tested on all platforms.
251
+
252
+ ### Strategy & Filtering in Sitemap Crawls
253
+ - The `-s` (strategy) flag must be passed through to `crawlSitemap` and `getLinksFromSitemap`. For sitemap-only scans the default is `'ignore'` (all URLs); for domain/intelligent crawls it's `'same-domain'`.
254
+ - `scanDuration=0` means unlimited. Code that calculates `remainingDuration` must treat 0 as "no limit", not as "0 seconds remaining".
255
+
256
+ ### Rate Limiting, Adaptive Concurrency & CrawlRateController
257
+ - Sites with WAFs (Cloudflare, Akamai, etc.) will start returning 403/503 once request volume crosses a threshold — typically after 200-300 pages fetched in rapid succession.
258
+ - Both crawlers use a shared `CrawlRateController` class (`src/crawlers/crawlRateController.ts`) that provides:
259
+ 1. **Strict maxPages**: `claimSlot()` is called at the moment of success (synchronously right before `urlsCrawled.scanned.push()`), not at the top of the request handler. `abort()` is called only after claiming the last slot (`isLimitReached()` becomes true post-claim). Never abort from the top of the handler — doing so kills in-flight pages that other handlers are scanning, causing undershoot.
260
+ 2. **Circuit breaker**: After 100 consecutive HTTP 4xx/5xx failures (configurable via `OOBEE_CONSECUTIVE_MAX_RETRIES`), the crawl aborts gracefully.
261
+ 3. **Adaptive concurrency**: On each 4xx/5xx failure, concurrency is halved (floor 1). After every 10 consecutive successes, concurrency recovers by +2 toward the original value. This automatically finds the site's rate limit threshold without manual tuning.
262
+ - **Critical placement of `claimSlot()` and `abort()`**: `claimSlot()` must be synchronously right before `push()` — never at the top of the handler. `abort()` must be called only after the last slot is claimed — never from an early-exit check. Pages can be discarded mid-handler (redirect, dedup, robots.txt block), and aborting prematurely kills in-flight handlers that would have succeeded.
263
+ - Only HTTP 4xx/5xx responses trigger rate adaptation and count toward the circuit breaker — timeouts and network errors do not.
264
+ - In intelligent crawl, each phase (sitemap then domain) creates its own `CrawlRateController` instance — transitioning from sitemap to domain crawl starts fresh.
265
+ - Without the circuit breaker, a rate-limited crawl with thousands of enqueued URLs would run indefinitely, never hit the success threshold, and never generate a report.
266
+ - When enqueuing all sitemap URLs (which we do for accurate `totalLinksFetchedFromSitemaps` reporting), always ensure either a scan duration (`-d`) or the circuit breaker is in place as a safety net.
267
+
268
+ ### Axe & Custom Checks
269
+ - When axe reports color-contrast violations but cannot determine the actual colors, skip augmenting the message with contrast context (avoids crashes on null/undefined color values).
270
+ - Violation messages are enriched with live DOM context (element text, computed styles, dimensions) via `page.evaluate()` during scan. Handle cases where elements are no longer in DOM at evaluation time.
271
+ - `aria-hidden-focus` violations are re-verified against the live DOM after axe completes, to handle race conditions with JS that sets `tabindex="-1"` after `aria-hidden="true"` (common in carousel/slider libraries). The re-verification yields to the event loop before re-checking, allowing pending timers to fire. If all focusable descendants now have `tabindex < 0`, the violation is filtered out as a false positive.
272
+
273
+ ## Report Output Structure
274
+
275
+ ```
276
+ {randomToken}/
277
+ ├── datasets/default/ # Crawlee per-page JSON results
278
+ ├── report.html # Interactive HTML report
279
+ ├── summary.html → summary.pdf # PDF summary (HTML deleted after conversion)
280
+ ├── report.csv # Issue-level CSV
281
+ ├── scanData.json # Scan metadata (site, dates, type, sitemap info)
282
+ ├── scanItems.json # All issues grouped by severity
283
+ ├── scanItemsSummary.json # Summary counts
284
+ ├── scanIssuesSummary.json # Issues without page details
285
+ ├── scanPagesDetail.json # Per-page breakdown
286
+ ├── scanPagesSummary.json # Page-level summary
287
+ ├── sitemap.xml # Discovered URLs
288
+ └── screenshots/ # Violation screenshots (if enabled)
289
+ ```
package/README.md CHANGED
@@ -400,6 +400,9 @@ Options:
400
400
  [string] [choices: "yes", "no"] [default: "no"]
401
401
  -l, --scanDuration Maximum scan duration in seconds (0 means u
402
402
  nlimited) [number] [default: 0]
403
+ -z, --websiteTag Tag to identify the website in telemetry.
404
+ Overrides OOBEE_TAGGED_WEBSITE env var.
405
+ [string]
403
406
 
404
407
  Examples:
405
408
  To scan sitemap of website:', 'npm run cli -- -c [ 1 | sitemap ] -u <url_lin
package/dist/cli.js CHANGED
@@ -177,6 +177,9 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`)
177
177
  if (!options.strategy) {
178
178
  options.strategy = options.scanner === ScannerTypes.SITEMAP ? 'ignore' : 'same-domain';
179
179
  }
180
+ if (options.websiteTag) {
181
+ process.env.OOBEE_TAGGED_WEBSITE = options.websiteTag;
182
+ }
180
183
  const scanInit = async (argvs) => {
181
184
  const updatedArgvs = { ...argvs };
182
185
  // Cannot use data.browser and data.isHeadless as the connectivity check comes first before prepareData
package/dist/combine.js CHANGED
@@ -6,7 +6,7 @@ import crawlLocalFile from './crawlers/crawlLocalFile.js';
6
6
  import crawlIntelligentSitemap from './crawlers/crawlIntelligentSitemap.js';
7
7
  import generateArtifacts from './mergeAxeResults.js';
8
8
  import { getHost, createAndUpdateResultsFolders, cleanUpAndExit, getStoragePath } from './utils.js';
9
- import { ScannerTypes, UrlsCrawled } from './constants/constants.js';
9
+ import constants, { ScannerTypes, UrlsCrawled } from './constants/constants.js';
10
10
  import { getBlackListedPatterns, submitForm } from './constants/common.js';
11
11
  import { consoleLogger } from './logs.js';
12
12
  import runCustom from './crawlers/runCustom.js';
@@ -31,11 +31,23 @@ const combineRun = async (details, deviceToScan) => {
31
31
  generateJsonFiles, scanDuration, } = envDetails;
32
32
  process.env.CRAWLEE_LOG_LEVEL = 'ERROR';
33
33
  process.env.CRAWLEE_STORAGE_DIR = randomToken;
34
+ constants.sitemapFetchedLinks = null;
34
35
  if (process.env.CRAWLEE_SYSTEM_INFO_V2 === undefined) {
35
36
  // Set the environment variable to enable system info v2
36
37
  // Resolves issue with when wmic is not installed on Windows
37
38
  process.env.CRAWLEE_SYSTEM_INFO_V2 = '1';
38
39
  }
40
+ // Suppress non-fatal Crawlee ps-tree errors on Windows with non-English locales.
41
+ // The system info module tries to parse process listing headers and crashes when
42
+ // headers are in a different language (e.g. "Wo" instead of "PID").
43
+ const psTreeHandler = (err) => {
44
+ if (err.message?.includes('Unknown process listing header')) {
45
+ consoleLogger.info(`Suppressed Crawlee ps-tree locale error: ${err.message}`);
46
+ return;
47
+ }
48
+ throw err;
49
+ };
50
+ process.on('uncaughtException', psTreeHandler);
39
51
  const host = type === ScannerTypes.SITEMAP || type === ScannerTypes.LOCALFILE ? '' : getHost(url);
40
52
  let blacklistedPatterns = null;
41
53
  try {
@@ -174,7 +186,7 @@ const combineRun = async (details, deviceToScan) => {
174
186
  ];
175
187
  const basicFormHTMLSnippet = await generateArtifacts(randomToken, url, type, deviceToScan, urlsCrawledObj.scanned, pagesNotScanned, uiCustomFlowLabel && uiCustomFlowLabel.length > 0
176
188
  ? uiCustomFlowLabel
177
- : customFlowLabel || 'None', undefined, scanDetails, zip, generateJsonFiles);
189
+ : customFlowLabel || 'None', undefined, scanDetails, zip, generateJsonFiles, browser);
178
190
  const [name, email] = nameEmail.split(':');
179
191
  // Upload results to S3 if environment variables are set
180
192
  if (isS3UploadEnabled()) {
@@ -303,4 +303,11 @@ To obtain the JSON files, you need to base64-decode the file followed by gunzip.
303
303
  demandOption: false,
304
304
  coerce: val => Number(val),
305
305
  },
306
+ z: {
307
+ alias: 'websiteTag',
308
+ describe: 'Tag to identify the website in telemetry. Overrides OOBEE_TAGGED_WEBSITE env var.',
309
+ type: 'string',
310
+ requiresArg: true,
311
+ demandOption: false,
312
+ },
306
313
  };