@govtechsg/oobee 0.10.92 → 0.10.94
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +34 -0
- package/README.md +19 -0
- package/dist/cli.js +3 -2
- package/dist/combine.js +4 -4
- package/dist/constants/common.js +136 -49
- package/dist/crawlers/commonCrawlerFunc.js +54 -2
- package/dist/crawlers/crawlDomain.js +9 -2
- package/dist/crawlers/crawlIntelligentSitemap.js +9 -4
- package/dist/crawlers/crawlSitemap.js +14 -2
- package/dist/crawlers/custom/utils.js +22 -9
- package/dist/crawlers/guards/urlGuard.js +19 -1
- package/dist/crawlers/runCustom.js +8 -2
- package/dist/generateOobeeClientScanner.js +1 -1
- package/dist/mergeAxeResults/itemsStore.js +32 -3
- package/dist/static/ejs/partials/components/allIssues/CategoryBadges.ejs +3 -0
- package/dist/static/ejs/partials/components/allIssues/IssuesTable.ejs +3 -3
- package/dist/static/ejs/partials/components/header/aboutScanModal/AboutScanModal.ejs +1 -1
- package/dist/static/ejs/partials/components/header/aboutScanModal/ScanConfiguration.ejs +3 -3
- package/dist/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs +34 -27
- package/dist/static/ejs/partials/components/ruleModal/ruleOffcanvas.ejs +1 -0
- package/dist/static/ejs/partials/components/scannedPagesSegmentedTabs.ejs +7 -0
- package/dist/static/ejs/partials/components/wcagCoverageDetails.ejs +5 -5
- package/dist/static/ejs/partials/scripts/header/aboutScanModal/AboutScanModal.ejs +3 -3
- package/dist/static/ejs/partials/scripts/prioritiseIssues/PrioritiseIssues.ejs +21 -19
- package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +39 -8
- package/dist/static/ejs/partials/scripts/scannedPagesSegmentedTabs.ejs +11 -5
- package/dist/static/ejs/partials/scripts/screenshotLightbox.ejs +49 -31
- package/dist/static/ejs/partials/styles/header/SiteInfo.ejs +1 -1
- package/dist/static/ejs/partials/styles/header/aboutScanModal/ScanDetails.ejs +36 -16
- package/dist/static/ejs/partials/styles/prioritiseIssues/PrioritiseIssues.ejs +22 -1
- package/dist/static/ejs/partials/styles/styles.ejs +1 -1
- package/dist/static/ejs/partials/styles/wcagCompliance/WcagGaugeBar.ejs +6 -0
- package/dist/static/ejs/partials/styles/wcagCompliance.ejs +5 -4
- package/dist/static/ejs/partials/styles/wcagCoverageDetails.ejs +6 -1
- package/oobee-client-scanner.js +4 -4
- package/package.json +2 -2
- package/src/cli.ts +3 -2
- package/src/combine.ts +4 -2
- package/src/constants/common.ts +131 -35
- package/src/crawlers/commonCrawlerFunc.ts +56 -2
- package/src/crawlers/crawlDomain.ts +11 -1
- package/src/crawlers/crawlIntelligentSitemap.ts +10 -4
- package/src/crawlers/crawlSitemap.ts +19 -2
- package/src/crawlers/custom/utils.ts +26 -13
- package/src/crawlers/guards/urlGuard.ts +18 -1
- package/src/crawlers/runCustom.ts +10 -1
- package/src/generateOobeeClientScanner.ts +1 -1
- package/src/mergeAxeResults/itemsStore.ts +37 -3
- package/src/static/ejs/partials/components/allIssues/CategoryBadges.ejs +3 -0
- package/src/static/ejs/partials/components/allIssues/IssuesTable.ejs +3 -3
- package/src/static/ejs/partials/components/header/aboutScanModal/AboutScanModal.ejs +1 -1
- package/src/static/ejs/partials/components/header/aboutScanModal/ScanConfiguration.ejs +3 -3
- package/src/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs +34 -27
- package/src/static/ejs/partials/components/ruleModal/ruleOffcanvas.ejs +1 -0
- package/src/static/ejs/partials/components/scannedPagesSegmentedTabs.ejs +7 -0
- package/src/static/ejs/partials/components/wcagCoverageDetails.ejs +5 -5
- package/src/static/ejs/partials/scripts/header/aboutScanModal/AboutScanModal.ejs +3 -3
- package/src/static/ejs/partials/scripts/prioritiseIssues/PrioritiseIssues.ejs +21 -19
- package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +39 -8
- package/src/static/ejs/partials/scripts/scannedPagesSegmentedTabs.ejs +11 -5
- package/src/static/ejs/partials/scripts/screenshotLightbox.ejs +49 -31
- package/src/static/ejs/partials/styles/header/SiteInfo.ejs +1 -1
- package/src/static/ejs/partials/styles/header/aboutScanModal/ScanDetails.ejs +36 -16
- package/src/static/ejs/partials/styles/prioritiseIssues/PrioritiseIssues.ejs +22 -1
- package/src/static/ejs/partials/styles/styles.ejs +1 -1
- package/src/static/ejs/partials/styles/wcagCompliance/WcagGaugeBar.ejs +6 -0
- package/src/static/ejs/partials/styles/wcagCompliance.ejs +5 -4
- package/src/static/ejs/partials/styles/wcagCoverageDetails.ejs +6 -1
- package/testStaticJSScanner.html +1 -1
- /package/{d5e2f6a7-0279-41a3-8763-844970cdf0ba.txt → 67e8137b-1939-4253-8f11-a82bc833cfcb.txt} +0 -0
package/AGENTS.md
CHANGED
|
@@ -79,6 +79,7 @@ All crawlers use Crawlee's `PlaywrightCrawler` with:
|
|
|
79
79
|
- Docker detection (`/.dockerenv`): adds `--disable-gpu`, `--no-sandbox`, `--disable-dev-shm-usage`
|
|
80
80
|
- Proxy support (manual, PAC, or none) via `getProxyInfo()`
|
|
81
81
|
- Channel set from browser name (undefined for chromium = bundled)
|
|
82
|
+
- `--mute-audio` is added by default in both headless and headful modes, but must be disabled for `customFlow` by calling `getPlaywrightLaunchOptions(browser, { includeMuteAudio: false })`
|
|
82
83
|
|
|
83
84
|
### User-Agent
|
|
84
85
|
|
|
@@ -111,6 +112,10 @@ Important behaviors:
|
|
|
111
112
|
- The crawler itself enforces `maxRequestsPerCrawl` by counting only successfully scanned pages
|
|
112
113
|
- `constants.sitemapFetchedLinks` stores the total discovered count for `scanData.json` reporting
|
|
113
114
|
- For sitemap indexes, child sitemaps are processed recursively
|
|
115
|
+
- Some sitemap XMLs include `<?xml-stylesheet ...?>` (XSL). In `getDataUsingPlaywright()`:
|
|
116
|
+
- Use `waitUntil: 'domcontentloaded'` (not `networkidle`) to avoid 60s timeouts caused by stylesheet/resource loading
|
|
117
|
+
- Prefer `response.text()` to capture raw XML before browser XSL transformation (preserves `<sitemapindex>` / `<urlset>` structure)
|
|
118
|
+
- Only fall back to DOM extraction when raw response text is unavailable
|
|
114
119
|
|
|
115
120
|
## Shared Mutable State
|
|
116
121
|
|
|
@@ -134,6 +139,11 @@ The `constants` default export object holds runtime state:
|
|
|
134
139
|
| `OOBEE_SLOWMO` | Browser slowmo in ms |
|
|
135
140
|
| `OOBEE_FAST_CRAWLER` | Experimental high-concurrency mode |
|
|
136
141
|
| `OOBEE_DISABLE_BROWSER_DOWNLOAD` | Block browser file downloads |
|
|
142
|
+
| `OOBEE_TAGGED_WEBSITE` | Tag to identify the website in Sentry telemetry (overridden by `--websiteTag` CLI flag) |
|
|
143
|
+
| `OOBEE_SCAN_METADATA` | Overrides `entryUrl` tag in Sentry events |
|
|
144
|
+
| `OOBEE_SCAN_PRODUCT` | Adds `scanProduct` tag to Sentry events |
|
|
145
|
+
| `OOBEE_CONSECUTIVE_MAX_RETRIES` | Max consecutive HTTP failures before circuit breaker aborts crawl (default 100) |
|
|
146
|
+
| `OOBEE_VALIDATE_URL` | If set, exit after URL validation without scanning |
|
|
137
147
|
| `HTTP_PROXY` / `HTTPS_PROXY` / `ALL_PROXY` | Proxy configuration |
|
|
138
148
|
| `NO_PROXY` / `INCLUDE_PROXY` | Proxy bypass/include lists |
|
|
139
149
|
|
|
@@ -215,6 +225,20 @@ docker run oobee node dist/cli.js ...
|
|
|
215
225
|
|
|
216
226
|
8. **Crawlee dataset** — Results are stored as numbered JSON files in `{randomToken}/datasets/default/`. Each file is one page's axe results. `generateArtifacts()` reads all of them.
|
|
217
227
|
|
|
228
|
+
9. **Auth headers and CORS** — Never set `Authorization` in `extraHTTPHeaders` globally on a browser context. Playwright sends `extraHTTPHeaders` to ALL requests (including cross-origin CDNs), which triggers CORS preflight failures. Instead use `splitAuthHeaders()` from `commonCrawlerFunc.ts` to separate auth from non-auth headers:
|
|
229
|
+
- Non-auth headers → safe to set globally via `extraHTTPHeaders` on context/launch options
|
|
230
|
+
- Basic auth → set `httpCredentials` on context (Playwright auto-responds to 401 challenges, origin-aware)
|
|
231
|
+
- Any Authorization header → send only to same-origin requests via `addAuthRouteHandler()` (route interception) or Crawlee's `preNavigationHooks` (navigation-only)
|
|
232
|
+
- Credentials come from URL-embedded `user:pass@host` or `-m "Authorization Basic ..."` — both produce the same `extraHTTPHeaders.Authorization` value in `prepareData()`
|
|
233
|
+
|
|
234
|
+
10. **Intermediate JSONL write safety + corruption tolerance** — `ItemsStore.appendPageItems()` requires strict serialization of writes per rule file to prevent interleaved corruption. It also enforces a strict text sanitization regex to filter out literal `\n` and `\r` control characters from website HTML inputs immediately after `JSON.stringify()`. This ensures no single JSON issue accidentally injects illegal implicit newline boundaries when writing to JSONL format. Maintain backward-compatible `fs.appendFile` queues over buffered WriteStreams to guarantee pipeline sync visibility. `ItemsStore.readRuleItems()` tolerates historical malformed lines via fallback skip logic.
|
|
235
|
+
|
|
236
|
+
11. **`preNavigationHooks` and the Playwright header-rewrite warning** — `preNavigationHooks()` in `commonCrawlerFunc.ts` is always included in the crawler `preNavigationHooks` array (for both `crawlDomain` and `crawlSitemap`). The hook does two things:
|
|
237
|
+
- **Header rewriting**: only sets `crawlingContext.request.headers = extraHTTPHeaders` when `extraHTTPHeaders` is non-empty. Setting request headers causes Crawlee/Playwright to intercept every network request to rewrite them, which triggers `WARN Playwright Utils: Using other request methods than GET, rewriting headers and adding payloads has a high impact on performance`. This warning is expected for authenticated scans; it is suppressed for unauthenticated scans because `extraHTTPHeaders` stays empty (see pitfall 12 below).
|
|
238
|
+
- **Navigation wait**: always sets `gotoOptions.waitUntil = 'domcontentloaded'` and `gotoOptions.timeout = 30000` via **in-place object mutation**. Do NOT reassign the `gotoOptions` parameter (`gotoOptions = {...}`) — that only rebinds the local variable and does not propagate to Crawlee. `domcontentloaded` is used (not `networkidle`) to avoid indefinite hangs on sites with WebSockets, analytics polling, lazy-load beacons, or health-check pings that never quiet their network activity. Further page stability is handled by `waitForPageLoaded()` in each requestHandler and the DOM mutation observer in `postNavigationHooks`.
|
|
239
|
+
|
|
240
|
+
12. **`extraHTTPHeaders` must not be mutated before being passed to crawlers** — `checkUrlConnectivityWithBrowser()` in `common.ts` needs an `Accept` header for its own connectivity check but must NOT add it to the shared `extraHTTPHeaders` object. Mutating the shared object causes crawlers to see a non-empty `extraHTTPHeaders` (at minimum `{ Accept: '...' }`), which silently triggers header rewriting and the Playwright performance warning for every unauthenticated scan. Always use a local copy: `const localHeaders = { ...extraHTTPHeaders }; localHeaders.Accept ||= '...';`.
|
|
241
|
+
|
|
218
242
|
## Testing Considerations
|
|
219
243
|
|
|
220
244
|
When making changes, validate these areas which have well-established edge cases:
|
|
@@ -246,6 +270,16 @@ When making changes, validate these areas which have well-established edge cases
|
|
|
246
270
|
- `document.title` must be captured at the START of `runAxeScript()`, before axe scanning or screenshot capture. Pages can close during these operations (timeout, navigation, crash). Never create a new page just to re-navigate for the title — this leaks pages.
|
|
247
271
|
- The URL guard script in custom flow must be defensive against pages that close unexpectedly. All page event handlers should handle closed contexts gracefully.
|
|
248
272
|
|
|
273
|
+
### URL Guard & Overlay Management in Custom Flow
|
|
274
|
+
|
|
275
|
+
`src/crawlers/guards/urlGuard.ts` — attached via `addUrlGuardScript()` in `runCustom.ts`:
|
|
276
|
+
|
|
277
|
+
- **`restoreToSafeUrl` must validate the safe URL before calling `page.goto()`**. If the entry URL is `file://` (e.g. `-u '/path/to/report.html'`), `fallbackUrl` is also `file://`. Redirecting to it fires another `framenavigated` for `file://`, which re-triggers `restoreToSafeUrl` → infinite reload loop. Always check `ALLOWED_PROTOCOLS.has(safeObj.protocol)` before navigating; if the fallback is not http/https, return without redirecting.
|
|
278
|
+
|
|
279
|
+
- **`about:` protocol must be skipped in `framenavigated`**. Chromium fires `framenavigated` for `about:blank` as a transient intermediate state during every `page.goto()` call. Intercepting it and calling `restoreToSafeUrl` → `page.goto(safeUrl)` → `about:blank` → `restoreToSafeUrl` → … creates a second infinite loop. Always `return` early when `urlObj.protocol === 'about:'`.
|
|
280
|
+
|
|
281
|
+
- **`reconcileOverlayMenu` must not remove the overlay on macOS/Windows**. On `darwin`/`win32` the custom flow runs headful. When `isOverlayAllowed` returns `false` (e.g. transient `file://` or `about:blank` URL), do **not** call `removeOverlayMenu` — the URL guard will redirect back to the safe URL momentarily. Instead, fall through to the `hasOverlay` / `addOverlayMenu` block so the overlay is (re-)injected regardless of the current URL protocol. On Linux/Docker (headless) the removal behaviour is unchanged.
|
|
282
|
+
|
|
249
283
|
### Proxy & Network
|
|
250
284
|
- Proxy detection must handle `ALL_PROXY` on Windows. The proxy resolution logic should be tested on all platforms.
|
|
251
285
|
|
package/README.md
CHANGED
|
@@ -92,6 +92,10 @@ verapdf --version
|
|
|
92
92
|
| WARN_LEVEL | Only used in tests. | |
|
|
93
93
|
| OOBEE_DISABLE_BROWSER_DOWNLOAD | Experimental flag to disable file downloads on Chrome/Chromium/Edge. Does not affect Local File scan | |
|
|
94
94
|
| OOBEE_SLOWMO | Experimental flag to slow down web browser behaviour by specified duration (in milliseconds) | |
|
|
95
|
+
| OOBEE_TAGGED_WEBSITE | Tag to identify the website in telemetry. Can also be set via `-z, --websiteTag` CLI flag (CLI flag takes precedence). | |
|
|
96
|
+
| OOBEE_SCAN_METADATA | Overrides the `entryUrl` tag sent to telemetry. | |
|
|
97
|
+
| OOBEE_SCAN_PRODUCT | Adds a `scanProduct` tag to telemetry events. | |
|
|
98
|
+
| OOBEE_CONSECUTIVE_MAX_RETRIES | Max consecutive HTTP failures before the circuit breaker aborts the crawl. | `100` |
|
|
95
99
|
| HTTP_PROXY | URL of the proxy server to be used for HTTP requests (e.g. `http://proxy.example.com:8080`). | |
|
|
96
100
|
| HTTPS_PROXY | URL of the proxy server to be used for HTTPS requests (e.g. `https://proxy.example.com:8080`). | |
|
|
97
101
|
| ALL_PROXY | URL of the proxy server to be used for all requests, typically used for SOCKS5 proxies (e.g. `socks5://proxy.example.com:1080`). Note: IPv6 direct connections may still continue even though socks5 proxy is specified due to a known issue with Chrome/Chromium. (Recommended workaround is to turn off IPv6 at host-level). | |
|
|
@@ -413,6 +417,21 @@ Examples:
|
|
|
413
417
|
> [ -d <device> | -w <viewport_width> ]
|
|
414
418
|
|
|
415
419
|
```
|
|
420
|
+
|
|
421
|
+
### Basic Auth
|
|
422
|
+
|
|
423
|
+
For sites behind HTTP Basic Authentication, you can provide credentials in two ways:
|
|
424
|
+
|
|
425
|
+
1. **Embed in URL**: `npm run cli -- -u 'https://user:password@example.com' -c 3`
|
|
426
|
+
2. **Use `-m` flag**: `npm run cli -- -u 'https://example.com' -c 3 -m "Authorization Basic dXNlcjpwYXNzd29yZA=="`
|
|
427
|
+
|
|
428
|
+
Both methods work across all scan types (sitemap, website, custom flow). For multiple headers, separate with `, `:
|
|
429
|
+
```
|
|
430
|
+
-m "Authorization Basic dXNlcjpwYXNz, X-Custom-Header myvalue"
|
|
431
|
+
```
|
|
432
|
+
|
|
433
|
+
> **Note:** Authorization headers are only sent to same-origin requests to avoid CORS preflight failures on cross-origin resources (e.g., CDN fonts, analytics scripts).
|
|
434
|
+
|
|
416
435
|
### Note on Windows PowerShell:
|
|
417
436
|
You need to run the command as `npm run cli -- --` (with the extra set of `--`) as PowerShell interprets arguments differently.
|
|
418
437
|
|
package/dist/cli.js
CHANGED
|
@@ -199,9 +199,10 @@ const scanInit = async (argvs) => {
|
|
|
199
199
|
if (res.httpStatus)
|
|
200
200
|
consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
|
|
201
201
|
if (res.status === statuses.success.code) {
|
|
202
|
-
//
|
|
203
|
-
//
|
|
202
|
+
// Keep browser-resolved URL as entryUrl for downstream scan metadata/events
|
|
203
|
+
// on non-custom scans.
|
|
204
204
|
if (data.type !== ScannerTypes.CUSTOM) {
|
|
205
|
+
data.entryUrl = res.url;
|
|
205
206
|
data.url = res.url;
|
|
206
207
|
}
|
|
207
208
|
if (process.env.OOBEE_VALIDATE_URL) {
|
package/dist/combine.js
CHANGED
|
@@ -23,7 +23,7 @@ export class ViewportSettingsClass {
|
|
|
23
23
|
}
|
|
24
24
|
const combineRun = async (details, deviceToScan) => {
|
|
25
25
|
const envDetails = { ...details };
|
|
26
|
-
const { type, url, nameEmail, randomToken, deviceChosen, customDevice, viewportWidth, playwrightDeviceDetailsObject, maxRequestsPerCrawl, browser, userDataDirectory, strategy, // Allow subdomains: if checked, = 'same-domain'
|
|
26
|
+
const { type, url, entryUrl, nameEmail, randomToken, deviceChosen, customDevice, viewportWidth, playwrightDeviceDetailsObject, maxRequestsPerCrawl, browser, userDataDirectory, strategy, // Allow subdomains: if checked, = 'same-domain'
|
|
27
27
|
specifiedMaxConcurrency, // Slow scan mode: if checked, = '1'
|
|
28
28
|
fileTypes, blacklistedPatternsFilename, includeScreenshots, // Include screenshots: if checked, = 'true'
|
|
29
29
|
followRobots, // Adhere to robots.txt: if checked, = 'true'
|
|
@@ -59,8 +59,8 @@ const combineRun = async (details, deviceToScan) => {
|
|
|
59
59
|
}
|
|
60
60
|
// remove basic-auth credentials from URL
|
|
61
61
|
const finalUrl = !(type === ScannerTypes.SITEMAP || type === ScannerTypes.LOCALFILE)
|
|
62
|
-
? new URL(
|
|
63
|
-
: new URL(pathToFileURL(
|
|
62
|
+
? new URL(entryUrl)
|
|
63
|
+
: new URL(pathToFileURL(entryUrl));
|
|
64
64
|
// Use the string version of finalUrl to reduce logic at submitForm
|
|
65
65
|
const finalUrlString = finalUrl.toString();
|
|
66
66
|
const scanDetails = {
|
|
@@ -89,7 +89,7 @@ const combineRun = async (details, deviceToScan) => {
|
|
|
89
89
|
let durationExceeded = false;
|
|
90
90
|
switch (type) {
|
|
91
91
|
case ScannerTypes.CUSTOM:
|
|
92
|
-
const res = await runCustom(url, randomToken, browser, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '');
|
|
92
|
+
const res = await runCustom(url, randomToken, browser, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '', extraHTTPHeaders);
|
|
93
93
|
urlsCrawledObj = res.urlsCrawled;
|
|
94
94
|
uiCustomFlowLabel = res.customFlowLabel;
|
|
95
95
|
break;
|
package/dist/constants/common.js
CHANGED
|
@@ -292,17 +292,30 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
|
|
|
292
292
|
return res;
|
|
293
293
|
}
|
|
294
294
|
}
|
|
295
|
-
// Ensure Accept header for non-html content fallback
|
|
296
|
-
extraHTTPHeaders
|
|
295
|
+
// Ensure Accept header for non-html content fallback — use a local copy to avoid
|
|
296
|
+
// mutating the caller's extraHTTPHeaders object (which is later checked by crawlers
|
|
297
|
+
// to decide whether to enable preNavigationHooks header rewriting).
|
|
298
|
+
const localHeaders = { ...extraHTTPHeaders };
|
|
299
|
+
localHeaders.Accept ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
|
|
297
300
|
await initModifiedUserAgent(browserToRun, playwrightDeviceDetailsObject, clonedDataDir);
|
|
298
301
|
let browserContext;
|
|
299
302
|
let browserInstance;
|
|
300
303
|
const rawDevice = (playwrightDeviceDetailsObject || {});
|
|
301
304
|
const { viewport, isMobile, hasTouch, userAgent: deviceUserAgent, ...restDevice } = rawDevice;
|
|
302
305
|
const launchOptions = getPlaywrightLaunchOptions(browserToRun);
|
|
306
|
+
const { Authorization, ...nonAuthHeaders } = localHeaders || {};
|
|
307
|
+
let httpCredentials = undefined;
|
|
308
|
+
if (Authorization?.startsWith('Basic ')) {
|
|
309
|
+
const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
|
|
310
|
+
const colonIdx = decoded.indexOf(':');
|
|
311
|
+
if (colonIdx > 0) {
|
|
312
|
+
httpCredentials = { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
|
|
313
|
+
}
|
|
314
|
+
}
|
|
303
315
|
const contextOptions = {
|
|
304
316
|
...restDevice,
|
|
305
|
-
...(
|
|
317
|
+
...(Object.keys(nonAuthHeaders).length > 0 && { extraHTTPHeaders: nonAuthHeaders }),
|
|
318
|
+
...(httpCredentials && { httpCredentials }),
|
|
306
319
|
ignoreHTTPSErrors: true,
|
|
307
320
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
308
321
|
};
|
|
@@ -342,6 +355,27 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
|
|
|
342
355
|
return res;
|
|
343
356
|
}
|
|
344
357
|
try {
|
|
358
|
+
// Only enable generic Authorization header routing interception broadly if
|
|
359
|
+
// a non-Basic Bearer auth string is heavily relied upon, thereby bypassing
|
|
360
|
+
// performance warnings inside the checkUrl phase for typical public scans
|
|
361
|
+
if (Object.keys(localHeaders).length > 0) {
|
|
362
|
+
if (Authorization && !httpCredentials) {
|
|
363
|
+
const entryOrigin = new URL(url).origin;
|
|
364
|
+
await browserContext.route('**/*', async (route, request) => {
|
|
365
|
+
try {
|
|
366
|
+
if (new URL(request.url()).origin === entryOrigin) {
|
|
367
|
+
await route.continue({ headers: { ...request.headers(), Authorization } });
|
|
368
|
+
}
|
|
369
|
+
else {
|
|
370
|
+
await route.continue();
|
|
371
|
+
}
|
|
372
|
+
}
|
|
373
|
+
catch {
|
|
374
|
+
await route.continue();
|
|
375
|
+
}
|
|
376
|
+
});
|
|
377
|
+
}
|
|
378
|
+
}
|
|
345
379
|
const page = await browserContext.newPage();
|
|
346
380
|
// Block native Chrome download UI
|
|
347
381
|
try {
|
|
@@ -351,15 +385,6 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
|
|
|
351
385
|
catch (e) {
|
|
352
386
|
consoleLogger.info(`Unable to set download deny: ${e.message}`);
|
|
353
387
|
}
|
|
354
|
-
// OPTIMIZATION: Block heavy visual resources (Images/Fonts/CSS)
|
|
355
|
-
// This allows the "Connectivity Check" to pass as soon as HTML is ready
|
|
356
|
-
await page.route('**/*', (route) => {
|
|
357
|
-
const type = route.request().resourceType();
|
|
358
|
-
if (['image', 'media', 'font', 'stylesheet'].includes(type)) {
|
|
359
|
-
return route.abort();
|
|
360
|
-
}
|
|
361
|
-
return route.continue();
|
|
362
|
-
});
|
|
363
388
|
// STEP 2: Navigate (follows server-side redirects)
|
|
364
389
|
page.once('download', () => {
|
|
365
390
|
res.status = constants.urlCheckStatuses.notASupportedDocument.code;
|
|
@@ -471,7 +496,7 @@ export const isSitemapContent = (content) => {
|
|
|
471
496
|
return true;
|
|
472
497
|
}
|
|
473
498
|
const regexForHtml = new RegExp('<(?:!doctype html|html|head|body)+?>', 'gmi');
|
|
474
|
-
const regexForXmlSitemap = new RegExp('<(?:urlset|feed|rss)+?.*>', 'gmi');
|
|
499
|
+
const regexForXmlSitemap = new RegExp('<(?:urlset|sitemapindex|feed|rss)+?.*>', 'gmi');
|
|
475
500
|
if (content.match(regexForHtml) && content.match(regexForXmlSitemap)) {
|
|
476
501
|
// is an XML sitemap wrapped in a HTML document
|
|
477
502
|
return true;
|
|
@@ -485,7 +510,18 @@ export const isSitemapContent = (content) => {
|
|
|
485
510
|
return false;
|
|
486
511
|
};
|
|
487
512
|
export const checkUrl = async (scanner, url, browser, clonedDataDir, playwrightDeviceDetailsObject, extraHTTPHeaders, fileTypes) => {
|
|
488
|
-
|
|
513
|
+
let urlToCheck = url;
|
|
514
|
+
if (scanner === ScannerTypes.LOCALFILE) {
|
|
515
|
+
if (!isFilePath(url)) {
|
|
516
|
+
const res = new RES();
|
|
517
|
+
res.status = constants.urlCheckStatuses.notALocalFile.code;
|
|
518
|
+
return res;
|
|
519
|
+
}
|
|
520
|
+
if (!url.toLowerCase().startsWith('file://')) {
|
|
521
|
+
urlToCheck = pathToFileURL(path.resolve(url)).toString();
|
|
522
|
+
}
|
|
523
|
+
}
|
|
524
|
+
const res = await checkUrlConnectivityWithBrowser(urlToCheck, browser, clonedDataDir, playwrightDeviceDetailsObject, extraHTTPHeaders);
|
|
489
525
|
// If response is 200 (meaning no other code was set earlier)
|
|
490
526
|
if (res.status === constants.urlCheckStatuses.success.code) {
|
|
491
527
|
// Check if document is pdf type
|
|
@@ -532,7 +568,7 @@ export const prepareData = async (argv) => {
|
|
|
532
568
|
if (isEmptyObject(argv)) {
|
|
533
569
|
throw Error('No inputs should be provided');
|
|
534
570
|
}
|
|
535
|
-
let { scanner, headless, url, deviceChosen, customDevice, viewportWidth, maxpages, strategy, isLocalFileScan = argv.scanner === ScannerTypes.LOCALFILE, browserToRun, nameEmail, customFlowLabel, specifiedMaxConcurrency, fileTypes, blacklistedPatternsFilename, additional, metadata, followRobots, header, safeMode, exportDirectory, zip, ruleset, generateJsonFiles, scanDuration, } = argv;
|
|
571
|
+
let { scanner, headless, url, deviceChosen, customDevice, viewportWidth, maxpages, strategy, isLocalFileScan = argv.scanner === ScannerTypes.LOCALFILE, browserToRun, nameEmail, customFlowLabel, specifiedMaxConcurrency, fileTypes, blacklistedPatternsFilename, additional, metadata, followRobots, header, safeMode, exportDirectory, zip, ruleset, generateJsonFiles, scanDuration, finalUrl, } = argv;
|
|
536
572
|
const extraHTTPHeaders = parseHeaders(header);
|
|
537
573
|
// Set default username and password for basic auth
|
|
538
574
|
let username = '';
|
|
@@ -558,6 +594,9 @@ export const prepareData = async (argv) => {
|
|
|
558
594
|
temp.password = '';
|
|
559
595
|
url = temp.toString();
|
|
560
596
|
}
|
|
597
|
+
// Keep browser-resolved URL (if provided by pre-check flow) as canonical entry URL.
|
|
598
|
+
// For local file paths, keep using the normalized `url` value below.
|
|
599
|
+
const resolvedEntryUrl = finalUrl && !isFilePath(finalUrl) ? finalUrl : url;
|
|
561
600
|
// construct filename for scan results
|
|
562
601
|
const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
|
|
563
602
|
const domain = isLocalFileScan ? path.basename(url) : new URL(url).hostname;
|
|
@@ -585,7 +624,7 @@ export const prepareData = async (argv) => {
|
|
|
585
624
|
return {
|
|
586
625
|
type: scanner,
|
|
587
626
|
url,
|
|
588
|
-
entryUrl:
|
|
627
|
+
entryUrl: resolvedEntryUrl,
|
|
589
628
|
isHeadless: headless,
|
|
590
629
|
deviceChosen,
|
|
591
630
|
customDevice,
|
|
@@ -790,6 +829,7 @@ export const getLinksFromSitemap = async (sitemapUrl, _maxLinksCount, browser, u
|
|
|
790
829
|
const scannedSitemaps = new Set();
|
|
791
830
|
const sitemapLinkCounts = {};
|
|
792
831
|
const allUrls = new Set(); // all discovered URLs (lightweight strings)
|
|
832
|
+
const isImageSitemapUrl = (candidateUrl) => /(^|\/)image-sitemap(?:-index)?(?:-\d+)?\.xml(?:$|[?#])/i.test(candidateUrl);
|
|
793
833
|
const addToUrlList = (url) => {
|
|
794
834
|
if (!url)
|
|
795
835
|
return;
|
|
@@ -860,6 +900,10 @@ export const getLinksFromSitemap = async (sitemapUrl, _maxLinksCount, browser, u
|
|
|
860
900
|
const fetchUrls = async (url, extraHTTPHeaders) => {
|
|
861
901
|
let data;
|
|
862
902
|
let sitemapType;
|
|
903
|
+
if (isImageSitemapUrl(url)) {
|
|
904
|
+
consoleLogger.info(`Skipping image sitemap: ${url}`);
|
|
905
|
+
return;
|
|
906
|
+
}
|
|
863
907
|
if (scannedSitemaps.has(url)) {
|
|
864
908
|
// Skip processing if the sitemap has already been scanned
|
|
865
909
|
return;
|
|
@@ -906,27 +950,45 @@ export const getLinksFromSitemap = async (sitemapUrl, _maxLinksCount, browser, u
|
|
|
906
950
|
});
|
|
907
951
|
}
|
|
908
952
|
const page = await browserContext.newPage();
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
data = await urlSet.evaluate(elem => elem.outerHTML);
|
|
953
|
+
// Use 'domcontentloaded' instead of 'networkidle' — sitemap XMLs with
|
|
954
|
+
// XSL stylesheet references (e.g. <?xml-stylesheet ...?>) cause the browser
|
|
955
|
+
// to fetch and apply the stylesheet, which may load additional resources
|
|
956
|
+
// (fonts, CSS, images) that prevent 'networkidle' from ever being reached.
|
|
957
|
+
const response = await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
|
|
958
|
+
// Prefer the raw response body — this gives us the original XML before
|
|
959
|
+
// the browser applies any XSL transformation (which would turn the XML
|
|
960
|
+
// into rendered HTML, losing the sitemap structure).
|
|
961
|
+
if (response) {
|
|
962
|
+
try {
|
|
963
|
+
data = await response.text();
|
|
921
964
|
}
|
|
922
|
-
|
|
923
|
-
|
|
965
|
+
catch {
|
|
966
|
+
// response.text() can fail if the body was already consumed or
|
|
967
|
+
// if a redirect occurred; fall through to DOM extraction below.
|
|
924
968
|
}
|
|
925
|
-
|
|
926
|
-
|
|
969
|
+
}
|
|
970
|
+
if (!data) {
|
|
971
|
+
if ((await page.locator('body').count()) > 0) {
|
|
972
|
+
data = await page.locator('body').innerText();
|
|
927
973
|
}
|
|
928
|
-
else
|
|
929
|
-
|
|
974
|
+
else {
|
|
975
|
+
const urlSet = page.locator('urlset');
|
|
976
|
+
const sitemapIndex = page.locator('sitemapindex');
|
|
977
|
+
const rss = page.locator('rss');
|
|
978
|
+
const feed = page.locator('feed');
|
|
979
|
+
const isRoot = async (locator) => (await locator.count()) > 0;
|
|
980
|
+
if (await isRoot(urlSet)) {
|
|
981
|
+
data = await urlSet.evaluate(elem => elem.outerHTML);
|
|
982
|
+
}
|
|
983
|
+
else if (await isRoot(sitemapIndex)) {
|
|
984
|
+
data = await sitemapIndex.evaluate(elem => elem.outerHTML);
|
|
985
|
+
}
|
|
986
|
+
else if (await isRoot(rss)) {
|
|
987
|
+
data = await rss.evaluate(elem => elem.outerHTML);
|
|
988
|
+
}
|
|
989
|
+
else if (await isRoot(feed)) {
|
|
990
|
+
data = await feed.evaluate(elem => elem.outerHTML);
|
|
991
|
+
}
|
|
930
992
|
}
|
|
931
993
|
}
|
|
932
994
|
}
|
|
@@ -949,37 +1011,61 @@ export const getLinksFromSitemap = async (sitemapUrl, _maxLinksCount, browser, u
|
|
|
949
1011
|
data = fs.readFileSync(url, 'utf8');
|
|
950
1012
|
}
|
|
951
1013
|
const $ = cheerio.load(data, { xml: true });
|
|
1014
|
+
const countBefore = allUrls.size;
|
|
952
1015
|
// This case is when the document is not an XML format document
|
|
953
1016
|
if ($(':root').length === 0) {
|
|
954
1017
|
processNonStandardSitemap(data);
|
|
1018
|
+
const linksFromThisSitemap = allUrls.size - countBefore;
|
|
1019
|
+
if (linksFromThisSitemap > 0) {
|
|
1020
|
+
sitemapLinkCounts[url] = (sitemapLinkCounts[url] || 0) + linksFromThisSitemap;
|
|
1021
|
+
}
|
|
955
1022
|
return;
|
|
956
1023
|
}
|
|
957
1024
|
// Root element
|
|
958
1025
|
const root = $(':root')[0];
|
|
959
|
-
const
|
|
960
|
-
|
|
961
|
-
|
|
1026
|
+
const hasImageNamespace = Object.values(root?.attribs ?? {}).some(attribVal => typeof attribVal === 'string' && attribVal.toLowerCase().includes('sitemap-image'));
|
|
1027
|
+
if (hasImageNamespace) {
|
|
1028
|
+
consoleLogger.info(`Skipping image sitemap: ${url}`);
|
|
1029
|
+
return;
|
|
1030
|
+
}
|
|
1031
|
+
const rootName = root?.name?.toLowerCase().split(':').pop() ?? '';
|
|
1032
|
+
const hasXmlSitemapIndexTag = /<\s*(?:[a-z0-9_-]+:)?sitemapindex\b/i.test(data);
|
|
1033
|
+
const hasXmlUrlsetTag = /<\s*(?:[a-z0-9_-]+:)?urlset\b/i.test(data);
|
|
1034
|
+
if (rootName === 'urlset') {
|
|
962
1035
|
sitemapType = constants.xmlSitemapTypes.xml;
|
|
963
1036
|
}
|
|
964
|
-
else if (
|
|
1037
|
+
else if (rootName === 'sitemapindex') {
|
|
965
1038
|
sitemapType = constants.xmlSitemapTypes.xmlIndex;
|
|
966
1039
|
}
|
|
967
|
-
else if (
|
|
1040
|
+
else if (rootName === 'rss') {
|
|
968
1041
|
sitemapType = constants.xmlSitemapTypes.rss;
|
|
969
1042
|
}
|
|
970
|
-
else if (
|
|
1043
|
+
else if (rootName === 'feed') {
|
|
971
1044
|
sitemapType = constants.xmlSitemapTypes.atom;
|
|
972
1045
|
}
|
|
1046
|
+
else if (hasXmlSitemapIndexTag) {
|
|
1047
|
+
sitemapType = constants.xmlSitemapTypes.xmlIndex;
|
|
1048
|
+
}
|
|
1049
|
+
else if (hasXmlUrlsetTag) {
|
|
1050
|
+
sitemapType = constants.xmlSitemapTypes.xml;
|
|
1051
|
+
}
|
|
973
1052
|
else {
|
|
974
1053
|
sitemapType = constants.xmlSitemapTypes.unknown;
|
|
975
1054
|
}
|
|
976
|
-
const countBefore = allUrls.size;
|
|
977
1055
|
switch (sitemapType) {
|
|
978
1056
|
case constants.xmlSitemapTypes.xmlIndex:
|
|
979
|
-
consoleLogger.info(`This is a XML format sitemap index
|
|
1057
|
+
consoleLogger.info(`This is a XML format sitemap index: ${url}`);
|
|
980
1058
|
for (const childSitemapUrl of $('loc')) {
|
|
981
|
-
const childSitemapUrlText = $(childSitemapUrl).text();
|
|
982
|
-
if (childSitemapUrlText
|
|
1059
|
+
const childSitemapUrlText = $(childSitemapUrl).text().trim();
|
|
1060
|
+
if (!childSitemapUrlText) {
|
|
1061
|
+
continue;
|
|
1062
|
+
}
|
|
1063
|
+
const childSitemapPath = childSitemapUrlText.split(/[?#]/)[0].toLowerCase();
|
|
1064
|
+
if (childSitemapPath.endsWith('.xml') || childSitemapPath.endsWith('.txt')) {
|
|
1065
|
+
if (isImageSitemapUrl(childSitemapUrlText)) {
|
|
1066
|
+
consoleLogger.info(`Skipping image sitemap: ${childSitemapUrlText}`);
|
|
1067
|
+
continue;
|
|
1068
|
+
}
|
|
983
1069
|
await fetchUrls(childSitemapUrlText, extraHTTPHeaders); // Recursive call for nested sitemaps
|
|
984
1070
|
}
|
|
985
1071
|
else {
|
|
@@ -988,19 +1074,19 @@ export const getLinksFromSitemap = async (sitemapUrl, _maxLinksCount, browser, u
|
|
|
988
1074
|
}
|
|
989
1075
|
break;
|
|
990
1076
|
case constants.xmlSitemapTypes.xml:
|
|
991
|
-
consoleLogger.info(`This is a XML format sitemap
|
|
1077
|
+
consoleLogger.info(`This is a XML format sitemap: ${url}`);
|
|
992
1078
|
await processXmlSitemap($, sitemapType, 'loc', 'lastmod', 'url');
|
|
993
1079
|
break;
|
|
994
1080
|
case constants.xmlSitemapTypes.rss:
|
|
995
|
-
consoleLogger.info(`This is a RSS format sitemap
|
|
1081
|
+
consoleLogger.info(`This is a RSS format sitemap: ${url}`);
|
|
996
1082
|
await processXmlSitemap($, sitemapType, 'link', 'pubDate', 'item');
|
|
997
1083
|
break;
|
|
998
1084
|
case constants.xmlSitemapTypes.atom:
|
|
999
|
-
consoleLogger.info(`This is a Atom format sitemap
|
|
1085
|
+
consoleLogger.info(`This is a Atom format sitemap: ${url}`);
|
|
1000
1086
|
await processXmlSitemap($, sitemapType, 'link', 'published', 'entry');
|
|
1001
1087
|
break;
|
|
1002
1088
|
default:
|
|
1003
|
-
consoleLogger.info(`This is an unrecognised XML sitemap format
|
|
1089
|
+
consoleLogger.info(`This is an unrecognised XML sitemap format: ${url}`);
|
|
1004
1090
|
processNonStandardSitemap(data);
|
|
1005
1091
|
}
|
|
1006
1092
|
const linksFromThisSitemap = allUrls.size - countBefore;
|
|
@@ -1816,7 +1902,8 @@ function isValidHttpUrl(urlString) {
|
|
|
1816
1902
|
export const isFilePath = (url) => {
|
|
1817
1903
|
const driveLetterPattern = /^[A-Z]:/i;
|
|
1818
1904
|
const backslashPattern = /\\/;
|
|
1819
|
-
return (url.startsWith('
|
|
1905
|
+
return (url.toLowerCase().startsWith('file://') ||
|
|
1906
|
+
url.startsWith('/') ||
|
|
1820
1907
|
driveLetterPattern.test(url) ||
|
|
1821
1908
|
backslashPattern.test(url) ||
|
|
1822
1909
|
url.startsWith('./') ||
|
|
@@ -898,13 +898,65 @@ export const createCrawleeSubFolders = async (randomToken) => {
|
|
|
898
898
|
/**
 * Builds the Crawlee pre-navigation hooks shared by the crawlers.
 *
 * @param {Record<string, string> | null | undefined} extraHTTPHeaders
 *   Extra headers to attach to each crawled request; ignored when missing or empty.
 * @returns {Array<(crawlingContext: object, gotoOptions?: object) => Promise<void>>}
 *   A one-element array holding the hook.
 */
export const preNavigationHooks = (extraHTTPHeaders) => {
    return [
        async (crawlingContext, gotoOptions) => {
            if (extraHTTPHeaders && Object.keys(extraHTTPHeaders).length > 0) {
                // Give every request its own copy: handing out the caller's object by
                // reference would let a header mutation on one request leak into the
                // shared configuration and into every other request.
                crawlingContext.request.headers = { ...extraHTTPHeaders };
            }
            // Use domcontentloaded — fires as soon as the DOM is parsed, before
            // images/stylesheets/network requests settle. This avoids indefinite
            // hangs on sites with WebSockets, analytics polling, or infinite-scroll
            // beacons that never reach networkidle. Further page stability is
            // handled by waitForPageLoaded() in each crawler's requestHandler and
            // by the DOM mutation observer in postNavigationHooks.
            if (gotoOptions) {
                gotoOptions.waitUntil = 'domcontentloaded';
                gotoOptions.timeout = 30000;
            }
        },
    ];
};
|
|
917
|
+
/**
 * Splits extraHTTPHeaders into auth and non-auth parts.
 *
 * The Authorization header must only be sent to same-origin requests to avoid
 * CORS preflight failures; the remaining headers are safe to set globally on
 * the browser context.
 *
 * HTTP header names and the auth-scheme token are case-insensitive, so keys
 * such as `authorization` and a `basic` scheme are recognised as well. When
 * several spellings are present, the exact-case `Authorization` key wins and
 * the other spellings are dropped rather than leaked into the global headers.
 *
 * @param {Record<string, string> | null | undefined} extraHTTPHeaders
 * @returns {{
 *   authHeader: string | null,
 *   nonAuthHeaders: Record<string, string> | null,
 *   httpCredentials: { username: string, password: string } | null,
 * }} `httpCredentials` is only populated for a decodable `Basic user:pass`
 *   value with a non-empty username.
 */
export const splitAuthHeaders = (extraHTTPHeaders) => {
    const headers = extraHTTPHeaders || {};
    const nonAuthHeaders = {};
    const authKeys = [];
    for (const [name, value] of Object.entries(headers)) {
        if (name.toLowerCase() === 'authorization') {
            authKeys.push(name);
        }
        else {
            nonAuthHeaders[name] = value;
        }
    }
    const authKey = authKeys.includes('Authorization') ? 'Authorization' : authKeys[0];
    const authHeader = (authKey !== undefined && headers[authKey]) || null;
    const decodeBasicCredentials = (headerValue) => {
        // Guard the type first: a non-string value has no startsWith/slice.
        if (typeof headerValue !== 'string' || !/^basic /i.test(headerValue)) {
            return null;
        }
        const decoded = Buffer.from(headerValue.slice(6).trim(), 'base64').toString();
        // Split on the FIRST colon only: the password may itself contain colons.
        const colonIdx = decoded.indexOf(':');
        if (colonIdx <= 0) {
            return null;
        }
        return { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
    };
    return {
        authHeader,
        nonAuthHeaders: Object.keys(nonAuthHeaders).length > 0 ? nonAuthHeaders : null,
        httpCredentials: decodeBasicCredentials(authHeader),
    };
};
|
|
938
|
+
/**
 * Adds a route handler to a BrowserContext that sends the Authorization header
 * only to same-origin requests, preventing CORS preflight failures on
 * cross-origin CDN resources.
 *
 * Does nothing when `authHeader` is falsy. Requests whose URL cannot be parsed
 * are treated as cross-origin and continue without the header.
 *
 * @param {object} context - Object exposing `route(pattern, handler)`.
 * @param {string} entryUrl - URL whose origin defines "same-origin".
 * @param {string | null | undefined} authHeader - Value for the Authorization header.
 * @returns {Promise<void>}
 * @throws {TypeError} When `authHeader` is set and `entryUrl` is not a valid URL.
 */
export const addAuthRouteHandler = async (context, entryUrl, authHeader) => {
    if (!authHeader) {
        return;
    }
    const entryOrigin = new URL(entryUrl).origin;
    const isSameOrigin = (request) => {
        try {
            return new URL(request.url()).origin === entryOrigin;
        }
        catch {
            return false;
        }
    };
    await context.route('**/*', async (route, request) => {
        if (!isSameOrigin(request)) {
            await route.continue();
            return;
        }
        // Drop any existing spelling of the header first: request header names may
        // arrive lower-cased, and keeping both `authorization` and `Authorization`
        // would send conflicting entries.
        const headers = {};
        for (const [name, value] of Object.entries(request.headers())) {
            if (name.toLowerCase() !== 'authorization') {
                headers[name] = value;
            }
        }
        // continue() is called exactly once per route: retrying after a failed
        // continue() only fails again and hides the original error.
        await route.continue({ headers: { ...headers, Authorization: authHeader } });
    });
};
|
|
908
960
|
export const postNavigationHooks = [
|
|
909
961
|
async (_crawlingContext) => {
|
|
910
962
|
guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import crawlee from 'crawlee';
|
|
2
2
|
import { CrawlRateController } from './crawlRateController.js';
|
|
3
|
-
import { createCrawleeSubFolders, getPreLaunchHook, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, } from './commonCrawlerFunc.js';
|
|
3
|
+
import { createCrawleeSubFolders, getPreLaunchHook, preNavigationHooks, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, splitAuthHeaders, } from './commonCrawlerFunc.js';
|
|
4
4
|
import constants, { blackListedFileExtensions, guiInfoStatusTypes, cssQuerySelectors, STATUS_CODE_METADATA, disallowedListOfPatterns, disallowedSelectorPatterns, FileTypes, } from '../constants/constants.js';
|
|
5
5
|
import { getPlaywrightLaunchOptions, isBlacklistedFileExtensions, isSkippedUrl, isDisallowedInRobotsTxt, getUrlsFromRobotsTxt, waitForPageLoaded, } from '../constants/common.js';
|
|
6
6
|
import { areLinksEqual, isFollowStrategy, isSameHostname, normUrl, register } from '../utils.js';
|
|
@@ -275,6 +275,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
275
275
|
};
|
|
276
276
|
let isAbortingScanNow = false;
|
|
277
277
|
const rateController = new CrawlRateController(maxRequestsPerCrawl, specifiedMaxConcurrency || constants.maxConcurrency);
|
|
278
|
+
const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
|
|
278
279
|
const crawler = register(new crawlee.PlaywrightCrawler({
|
|
279
280
|
launchContext: {
|
|
280
281
|
launcher: constants.launcher,
|
|
@@ -293,12 +294,18 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
293
294
|
...playwrightDeviceDetailsObject,
|
|
294
295
|
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
295
296
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
296
|
-
...(
|
|
297
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
298
|
+
...(httpCredentials && { httpCredentials }),
|
|
297
299
|
};
|
|
298
300
|
},
|
|
299
301
|
],
|
|
300
302
|
},
|
|
301
303
|
requestQueue,
|
|
304
|
+
maxRequestRetries: 3,
|
|
305
|
+
maxSessionRotations: 1,
|
|
306
|
+
preNavigationHooks: [
|
|
307
|
+
...preNavigationHooks(extraHTTPHeaders),
|
|
308
|
+
],
|
|
302
309
|
postNavigationHooks: [
|
|
303
310
|
async (crawlingContext) => {
|
|
304
311
|
const { page, request } = crawlingContext;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
|
|
1
|
+
import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
|
|
2
2
|
import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
|
|
3
3
|
import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
4
4
|
import crawlDomain from './crawlDomain.js';
|
|
@@ -26,26 +26,31 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
26
26
|
const homeUrl = getHomeUrl(link);
|
|
27
27
|
let sitemapLink = '';
|
|
28
28
|
const launchOptions = getPlaywrightLaunchOptions(browser);
|
|
29
|
+
const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
|
|
29
30
|
let context;
|
|
30
31
|
let browserInstance;
|
|
31
32
|
if (process.env.CRAWLEE_HEADLESS === '1') {
|
|
32
33
|
const effectiveUserDataDirectory = userDataDirectory || '';
|
|
33
34
|
context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
|
|
34
35
|
...launchOptions,
|
|
35
|
-
...(
|
|
36
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
37
|
+
...(httpCredentials && { httpCredentials }),
|
|
36
38
|
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
37
39
|
});
|
|
38
40
|
register(context);
|
|
39
41
|
}
|
|
40
42
|
else {
|
|
41
|
-
// In headful mode, avoid launchPersistentContext to prevent "Browser window not found"
|
|
42
43
|
browserInstance = await constants.launcher.launch(launchOptions);
|
|
43
44
|
register(browserInstance);
|
|
44
45
|
context = await browserInstance.newContext({
|
|
45
|
-
...(
|
|
46
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
47
|
+
...(httpCredentials && { httpCredentials }),
|
|
46
48
|
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
47
49
|
});
|
|
48
50
|
}
|
|
51
|
+
if (authHeader) {
|
|
52
|
+
await addAuthRouteHandler(context, link, authHeader);
|
|
53
|
+
}
|
|
49
54
|
const page = await context.newPage();
|
|
50
55
|
for (const path of sitemapPaths) {
|
|
51
56
|
sitemapLink = homeUrl + path;
|