@govtechsg/oobee 0.10.92 → 0.10.93
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +14 -0
- package/README.md +19 -0
- package/dist/combine.js +1 -1
- package/dist/constants/common.js +30 -10
- package/dist/crawlers/commonCrawlerFunc.js +43 -0
- package/dist/crawlers/crawlDomain.js +11 -2
- package/dist/crawlers/crawlIntelligentSitemap.js +9 -4
- package/dist/crawlers/runCustom.js +8 -2
- package/dist/generateOobeeClientScanner.js +1 -1
- package/dist/mergeAxeResults/itemsStore.js +32 -3
- package/oobee-client-scanner.js +4 -4
- package/package.json +2 -2
- package/src/combine.ts +1 -0
- package/src/constants/common.ts +31 -11
- package/src/crawlers/commonCrawlerFunc.ts +45 -0
- package/src/crawlers/crawlDomain.ts +12 -1
- package/src/crawlers/crawlIntelligentSitemap.ts +10 -4
- package/src/crawlers/runCustom.ts +10 -1
- package/src/generateOobeeClientScanner.ts +1 -1
- package/src/mergeAxeResults/itemsStore.ts +37 -3
- /package/{d5e2f6a7-0279-41a3-8763-844970cdf0ba.txt → 7339fae5-e8ed-4b50-af13-317847620dbf.txt} +0 -0
package/AGENTS.md
CHANGED
|
@@ -79,6 +79,7 @@ All crawlers use Crawlee's `PlaywrightCrawler` with:
|
|
|
79
79
|
- Docker detection (`/.dockerenv`): adds `--disable-gpu`, `--no-sandbox`, `--disable-dev-shm-usage`
|
|
80
80
|
- Proxy support (manual, PAC, or none) via `getProxyInfo()`
|
|
81
81
|
- Channel set from browser name (undefined for chromium = bundled)
|
|
82
|
+
- `--mute-audio` is added by default in both headless and headful modes, but must be disabled for `customFlow` by calling `getPlaywrightLaunchOptions(browser, { includeMuteAudio: false })`
|
|
82
83
|
|
|
83
84
|
### User-Agent
|
|
84
85
|
|
|
@@ -134,6 +135,11 @@ The `constants` default export object holds runtime state:
|
|
|
134
135
|
| `OOBEE_SLOWMO` | Browser slowmo in ms |
|
|
135
136
|
| `OOBEE_FAST_CRAWLER` | Experimental high-concurrency mode |
|
|
136
137
|
| `OOBEE_DISABLE_BROWSER_DOWNLOAD` | Block browser file downloads |
|
|
138
|
+
| `OOBEE_TAGGED_WEBSITE` | Tag to identify the website in Sentry telemetry (overridden by `--websiteTag` CLI flag) |
|
|
139
|
+
| `OOBEE_SCAN_METADATA` | Overrides `entryUrl` tag in Sentry events |
|
|
140
|
+
| `OOBEE_SCAN_PRODUCT` | Adds `scanProduct` tag to Sentry events |
|
|
141
|
+
| `OOBEE_CONSECUTIVE_MAX_RETRIES` | Max consecutive HTTP failures before circuit breaker aborts crawl (default 100) |
|
|
142
|
+
| `OOBEE_VALIDATE_URL` | If set, exit after URL validation without scanning |
|
|
137
143
|
| `HTTP_PROXY` / `HTTPS_PROXY` / `ALL_PROXY` | Proxy configuration |
|
|
138
144
|
| `NO_PROXY` / `INCLUDE_PROXY` | Proxy bypass/include lists |
|
|
139
145
|
|
|
@@ -215,6 +221,14 @@ docker run oobee node dist/cli.js ...
|
|
|
215
221
|
|
|
216
222
|
8. **Crawlee dataset** — Results are stored as numbered JSON files in `{randomToken}/datasets/default/`. Each file is one page's axe results. `generateArtifacts()` reads all of them.
|
|
217
223
|
|
|
224
|
+
9. **Auth headers and CORS** — Never set `Authorization` in `extraHTTPHeaders` globally on a browser context. Playwright sends `extraHTTPHeaders` to ALL requests (including cross-origin CDNs), which triggers CORS preflight failures. Instead use `splitAuthHeaders()` from `commonCrawlerFunc.ts` to separate auth from non-auth headers:
|
|
225
|
+
- Non-auth headers → safe to set globally via `extraHTTPHeaders` on context/launch options
|
|
226
|
+
- Basic auth → set `httpCredentials` on context (Playwright auto-responds to 401 challenges, origin-aware)
|
|
227
|
+
- Any Authorization header → send only to same-origin requests via `addAuthRouteHandler()` (route interception) or Crawlee's `preNavigationHooks` (navigation-only)
|
|
228
|
+
- Credentials come from URL-embedded `user:pass@host` or `-m "Authorization Basic ..."` — both produce the same `extraHTTPHeaders.Authorization` value in `prepareData()`
|
|
229
|
+
|
|
230
|
+
10. **Intermediate JSONL write safety + corruption tolerance** — `ItemsStore.appendPageItems()` serializes writes per rule file to prevent interleaved corruption; this serialization must be preserved. Immediately after `JSON.stringify()`, it also replaces any literal `\n` and `\r` characters (which can originate from website HTML) with their escaped forms, so that a single JSON entry can never introduce an extra line break when written in JSONL format. Keep the backward-compatible `fs.appendFile` queues rather than switching to buffered WriteStreams, to guarantee pipeline sync visibility. `ItemsStore.readRuleItems()` tolerates historical malformed lines by logging a warning and skipping them.
|
|
231
|
+
|
|
218
232
|
## Testing Considerations
|
|
219
233
|
|
|
220
234
|
When making changes, validate these areas which have well-established edge cases:
|
package/README.md
CHANGED
|
@@ -92,6 +92,10 @@ verapdf --version
|
|
|
92
92
|
| WARN_LEVEL | Only used in tests. | |
|
|
93
93
|
| OOBEE_DISABLE_BROWSER_DOWNLOAD | Experimental flag to disable file downloads on Chrome/Chromium/Edge. Does not affect Local File scan | |
|
|
94
94
|
| OOBEE_SLOWMO | Experimental flag to slow down web browser behaviour by specified duration (in milliseconds) | |
|
|
95
|
+
| OOBEE_TAGGED_WEBSITE | Tag to identify the website in telemetry. Can also be set via `-z, --websiteTag` CLI flag (CLI flag takes precedence). | |
|
|
96
|
+
| OOBEE_SCAN_METADATA | Overrides the `entryUrl` tag sent to telemetry. | |
|
|
97
|
+
| OOBEE_SCAN_PRODUCT | Adds a `scanProduct` tag to telemetry events. | |
|
|
98
|
+
| OOBEE_CONSECUTIVE_MAX_RETRIES | Max consecutive HTTP failures before the circuit breaker aborts the crawl. | `100` |
|
|
95
99
|
| HTTP_PROXY | URL of the proxy server to be used for HTTP requests (e.g. `http://proxy.example.com:8080`). | |
|
|
96
100
|
| HTTPS_PROXY | URL of the proxy server to be used for HTTPS requests (e.g. `https://proxy.example.com:8080`). | |
|
|
97
101
|
| ALL_PROXY | URL of the proxy server to be used for all requests, typically used for SOCKS5 proxies (e.g. `socks5://proxy.example.com:1080`). Note: IPv6 direct connections may still continue even though a SOCKS5 proxy is specified, due to a known issue with Chrome/Chromium. The recommended workaround is to turn off IPv6 at host level. | |
|
|
@@ -413,6 +417,21 @@ Examples:
|
|
|
413
417
|
> [ -d <device> | -w <viewport_width> ]
|
|
414
418
|
|
|
415
419
|
```
|
|
420
|
+
|
|
421
|
+
### Basic Auth
|
|
422
|
+
|
|
423
|
+
For sites behind HTTP Basic Authentication, you can provide credentials in two ways:
|
|
424
|
+
|
|
425
|
+
1. **Embed in URL**: `npm run cli -- -u 'https://user:password@example.com' -c 3`
|
|
426
|
+
2. **Use `-m` flag**: `npm run cli -- -u 'https://example.com' -c 3 -m "Authorization Basic dXNlcjpwYXNzd29yZA=="`
|
|
427
|
+
|
|
428
|
+
Both methods work across all scan types (sitemap, website, custom flow). For multiple headers, separate with `, `:
|
|
429
|
+
```
|
|
430
|
+
-m "Authorization Basic dXNlcjpwYXNz, X-Custom-Header myvalue"
|
|
431
|
+
```
|
|
432
|
+
|
|
433
|
+
> **Note:** Authorization headers are only sent to same-origin requests to avoid CORS preflight failures on cross-origin resources (e.g., CDN fonts, analytics scripts).
|
|
434
|
+
|
|
416
435
|
### Note on Windows PowerShell:
|
|
417
436
|
You need to run the command as `npm run cli -- --` (with the extra set of `--`) as PowerShell interprets arguments differently.
|
|
418
437
|
|
package/dist/combine.js
CHANGED
|
@@ -89,7 +89,7 @@ const combineRun = async (details, deviceToScan) => {
|
|
|
89
89
|
let durationExceeded = false;
|
|
90
90
|
switch (type) {
|
|
91
91
|
case ScannerTypes.CUSTOM:
|
|
92
|
-
const res = await runCustom(url, randomToken, browser, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '');
|
|
92
|
+
const res = await runCustom(url, randomToken, browser, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '', extraHTTPHeaders);
|
|
93
93
|
urlsCrawledObj = res.urlsCrawled;
|
|
94
94
|
uiCustomFlowLabel = res.customFlowLabel;
|
|
95
95
|
break;
|
package/dist/constants/common.js
CHANGED
|
@@ -300,9 +300,19 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
|
|
|
300
300
|
const rawDevice = (playwrightDeviceDetailsObject || {});
|
|
301
301
|
const { viewport, isMobile, hasTouch, userAgent: deviceUserAgent, ...restDevice } = rawDevice;
|
|
302
302
|
const launchOptions = getPlaywrightLaunchOptions(browserToRun);
|
|
303
|
+
const { Authorization, ...nonAuthHeaders } = extraHTTPHeaders || {};
|
|
304
|
+
let httpCredentials = undefined;
|
|
305
|
+
if (Authorization?.startsWith('Basic ')) {
|
|
306
|
+
const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
|
|
307
|
+
const colonIdx = decoded.indexOf(':');
|
|
308
|
+
if (colonIdx > 0) {
|
|
309
|
+
httpCredentials = { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
|
|
310
|
+
}
|
|
311
|
+
}
|
|
303
312
|
const contextOptions = {
|
|
304
313
|
...restDevice,
|
|
305
|
-
...(
|
|
314
|
+
...(Object.keys(nonAuthHeaders).length > 0 && { extraHTTPHeaders: nonAuthHeaders }),
|
|
315
|
+
...(httpCredentials && { httpCredentials }),
|
|
306
316
|
ignoreHTTPSErrors: true,
|
|
307
317
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
308
318
|
};
|
|
@@ -342,6 +352,25 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
|
|
|
342
352
|
return res;
|
|
343
353
|
}
|
|
344
354
|
try {
|
|
355
|
+
// Only enable generic Authorization header routing interception broadly if
|
|
356
|
+
// a non-Basic Bearer auth string is heavily relied upon, thereby bypassing
|
|
357
|
+
// performance warnings inside the checkUrl phase for typical public scans
|
|
358
|
+
if (Authorization && !httpCredentials) {
|
|
359
|
+
const entryOrigin = new URL(url).origin;
|
|
360
|
+
await browserContext.route('**/*', async (route, request) => {
|
|
361
|
+
try {
|
|
362
|
+
if (new URL(request.url()).origin === entryOrigin) {
|
|
363
|
+
await route.continue({ headers: { ...request.headers(), Authorization } });
|
|
364
|
+
}
|
|
365
|
+
else {
|
|
366
|
+
await route.continue();
|
|
367
|
+
}
|
|
368
|
+
}
|
|
369
|
+
catch {
|
|
370
|
+
await route.continue();
|
|
371
|
+
}
|
|
372
|
+
});
|
|
373
|
+
}
|
|
345
374
|
const page = await browserContext.newPage();
|
|
346
375
|
// Block native Chrome download UI
|
|
347
376
|
try {
|
|
@@ -351,15 +380,6 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
|
|
|
351
380
|
catch (e) {
|
|
352
381
|
consoleLogger.info(`Unable to set download deny: ${e.message}`);
|
|
353
382
|
}
|
|
354
|
-
// OPTIMIZATION: Block heavy visual resources (Images/Fonts/CSS)
|
|
355
|
-
// This allows the "Connectivity Check" to pass as soon as HTML is ready
|
|
356
|
-
await page.route('**/*', (route) => {
|
|
357
|
-
const type = route.request().resourceType();
|
|
358
|
-
if (['image', 'media', 'font', 'stylesheet'].includes(type)) {
|
|
359
|
-
return route.abort();
|
|
360
|
-
}
|
|
361
|
-
return route.continue();
|
|
362
|
-
});
|
|
363
383
|
// STEP 2: Navigate (follows server-side redirects)
|
|
364
384
|
page.once('download', () => {
|
|
365
385
|
res.status = constants.urlCheckStatuses.notASupportedDocument.code;
|
|
@@ -905,6 +905,49 @@ export const preNavigationHooks = (extraHTTPHeaders) => {
|
|
|
905
905
|
},
|
|
906
906
|
];
|
|
907
907
|
};
|
|
908
|
+
/**
|
|
909
|
+
* Splits extraHTTPHeaders into auth and non-auth parts.
|
|
910
|
+
* Auth headers (Authorization) must only be sent to same-origin requests to avoid CORS preflight failures.
|
|
911
|
+
* Non-auth headers are safe to set globally on the browser context.
|
|
912
|
+
*/
|
|
913
|
+
export const splitAuthHeaders = (extraHTTPHeaders) => {
|
|
914
|
+
const { Authorization, ...nonAuthHeaders } = extraHTTPHeaders || {};
|
|
915
|
+
return {
|
|
916
|
+
authHeader: Authorization || null,
|
|
917
|
+
nonAuthHeaders: Object.keys(nonAuthHeaders).length > 0 ? nonAuthHeaders : null,
|
|
918
|
+
httpCredentials: (() => {
|
|
919
|
+
if (!Authorization?.startsWith('Basic '))
|
|
920
|
+
return null;
|
|
921
|
+
const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
|
|
922
|
+
const colonIdx = decoded.indexOf(':');
|
|
923
|
+
if (colonIdx <= 0)
|
|
924
|
+
return null;
|
|
925
|
+
return { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
|
|
926
|
+
})(),
|
|
927
|
+
};
|
|
928
|
+
};
|
|
929
|
+
/**
|
|
930
|
+
* Adds a route handler to a BrowserContext that sends the Authorization header
|
|
931
|
+
* only to same-origin requests, preventing CORS preflight failures on cross-origin CDN resources.
|
|
932
|
+
*/
|
|
933
|
+
export const addAuthRouteHandler = async (context, entryUrl, authHeader) => {
|
|
934
|
+
if (!authHeader)
|
|
935
|
+
return;
|
|
936
|
+
const entryOrigin = new URL(entryUrl).origin;
|
|
937
|
+
await context.route('**/*', async (route, request) => {
|
|
938
|
+
try {
|
|
939
|
+
if (new URL(request.url()).origin === entryOrigin) {
|
|
940
|
+
await route.continue({ headers: { ...request.headers(), Authorization: authHeader } });
|
|
941
|
+
}
|
|
942
|
+
else {
|
|
943
|
+
await route.continue();
|
|
944
|
+
}
|
|
945
|
+
}
|
|
946
|
+
catch {
|
|
947
|
+
await route.continue();
|
|
948
|
+
}
|
|
949
|
+
});
|
|
950
|
+
};
|
|
908
951
|
export const postNavigationHooks = [
|
|
909
952
|
async (_crawlingContext) => {
|
|
910
953
|
guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import crawlee from 'crawlee';
|
|
2
2
|
import { CrawlRateController } from './crawlRateController.js';
|
|
3
|
-
import { createCrawleeSubFolders, getPreLaunchHook, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, } from './commonCrawlerFunc.js';
|
|
3
|
+
import { createCrawleeSubFolders, getPreLaunchHook, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, splitAuthHeaders, } from './commonCrawlerFunc.js';
|
|
4
4
|
import constants, { blackListedFileExtensions, guiInfoStatusTypes, cssQuerySelectors, STATUS_CODE_METADATA, disallowedListOfPatterns, disallowedSelectorPatterns, FileTypes, } from '../constants/constants.js';
|
|
5
5
|
import { getPlaywrightLaunchOptions, isBlacklistedFileExtensions, isSkippedUrl, isDisallowedInRobotsTxt, getUrlsFromRobotsTxt, waitForPageLoaded, } from '../constants/common.js';
|
|
6
6
|
import { areLinksEqual, isFollowStrategy, isSameHostname, normUrl, register } from '../utils.js';
|
|
@@ -275,6 +275,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
275
275
|
};
|
|
276
276
|
let isAbortingScanNow = false;
|
|
277
277
|
const rateController = new CrawlRateController(maxRequestsPerCrawl, specifiedMaxConcurrency || constants.maxConcurrency);
|
|
278
|
+
const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
|
|
278
279
|
const crawler = register(new crawlee.PlaywrightCrawler({
|
|
279
280
|
launchContext: {
|
|
280
281
|
launcher: constants.launcher,
|
|
@@ -293,12 +294,20 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
|
|
|
293
294
|
...playwrightDeviceDetailsObject,
|
|
294
295
|
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
295
296
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
296
|
-
...(
|
|
297
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
298
|
+
...(httpCredentials && { httpCredentials }),
|
|
297
299
|
};
|
|
298
300
|
},
|
|
299
301
|
],
|
|
300
302
|
},
|
|
301
303
|
requestQueue,
|
|
304
|
+
preNavigationHooks: [
|
|
305
|
+
async (crawlingContext) => {
|
|
306
|
+
if (extraHTTPHeaders) {
|
|
307
|
+
crawlingContext.request.headers = extraHTTPHeaders;
|
|
308
|
+
}
|
|
309
|
+
},
|
|
310
|
+
],
|
|
302
311
|
postNavigationHooks: [
|
|
303
312
|
async (crawlingContext) => {
|
|
304
313
|
const { page, request } = crawlingContext;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
|
|
1
|
+
import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
|
|
2
2
|
import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
|
|
3
3
|
import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
4
4
|
import crawlDomain from './crawlDomain.js';
|
|
@@ -26,26 +26,31 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
|
|
|
26
26
|
const homeUrl = getHomeUrl(link);
|
|
27
27
|
let sitemapLink = '';
|
|
28
28
|
const launchOptions = getPlaywrightLaunchOptions(browser);
|
|
29
|
+
const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
|
|
29
30
|
let context;
|
|
30
31
|
let browserInstance;
|
|
31
32
|
if (process.env.CRAWLEE_HEADLESS === '1') {
|
|
32
33
|
const effectiveUserDataDirectory = userDataDirectory || '';
|
|
33
34
|
context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
|
|
34
35
|
...launchOptions,
|
|
35
|
-
...(
|
|
36
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
37
|
+
...(httpCredentials && { httpCredentials }),
|
|
36
38
|
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
37
39
|
});
|
|
38
40
|
register(context);
|
|
39
41
|
}
|
|
40
42
|
else {
|
|
41
|
-
// In headful mode, avoid launchPersistentContext to prevent "Browser window not found"
|
|
42
43
|
browserInstance = await constants.launcher.launch(launchOptions);
|
|
43
44
|
register(browserInstance);
|
|
44
45
|
context = await browserInstance.newContext({
|
|
45
|
-
...(
|
|
46
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
47
|
+
...(httpCredentials && { httpCredentials }),
|
|
46
48
|
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
47
49
|
});
|
|
48
50
|
}
|
|
51
|
+
if (authHeader) {
|
|
52
|
+
await addAuthRouteHandler(context, link, authHeader);
|
|
53
|
+
}
|
|
49
54
|
const page = await context.newPage();
|
|
50
55
|
for (const path of sitemapPaths) {
|
|
51
56
|
sitemapLink = homeUrl + path;
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/* eslint-env browser */
|
|
2
|
-
import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
|
|
2
|
+
import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
|
|
3
3
|
import { cleanUpAndExit, register, registerSoftClose } from '../utils.js';
|
|
4
4
|
import constants, { getIntermediateScreenshotsPath, guiInfoStatusTypes, } from '../constants/constants.js';
|
|
5
5
|
import { initNewPage, log } from './custom/utils.js';
|
|
@@ -18,7 +18,7 @@ export class ProcessPageParams {
|
|
|
18
18
|
this.randomToken = randomToken;
|
|
19
19
|
}
|
|
20
20
|
}
|
|
21
|
-
const runCustom = async (url, randomToken, browserToRun, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, initialCustomFlowLabel) => {
|
|
21
|
+
const runCustom = async (url, randomToken, browserToRun, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, initialCustomFlowLabel, extraHTTPHeaders) => {
|
|
22
22
|
// checks and delete datasets path if it already exists
|
|
23
23
|
process.env.CRAWLEE_STORAGE_DIR = randomToken;
|
|
24
24
|
const urlsCrawled = { ...constants.urlsCrawledObj };
|
|
@@ -47,6 +47,7 @@ const runCustom = async (url, randomToken, browserToRun, userDataDirectory, view
|
|
|
47
47
|
...baseArgs.filter(a => !a.startsWith('--window-size') && a !== '--start-maximized'),
|
|
48
48
|
...customArgs,
|
|
49
49
|
];
|
|
50
|
+
const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
|
|
50
51
|
const context = await constants.launcher.launchPersistentContext(userDataDirectory, {
|
|
51
52
|
...baseLaunchOptions,
|
|
52
53
|
args: mergedArgs,
|
|
@@ -56,7 +57,12 @@ const runCustom = async (url, randomToken, browserToRun, userDataDirectory, view
|
|
|
56
57
|
viewport: null,
|
|
57
58
|
...(hasCustomViewport ? contextDeviceOptions : {}),
|
|
58
59
|
userAgent: process.env.OOBEE_USER_AGENT || deviceUserAgent,
|
|
60
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
61
|
+
...(httpCredentials && { httpCredentials }),
|
|
59
62
|
});
|
|
63
|
+
if (authHeader) {
|
|
64
|
+
await addAuthRouteHandler(context, url, authHeader);
|
|
65
|
+
}
|
|
60
66
|
register(context);
|
|
61
67
|
processPageParams.stopAll = async () => {
|
|
62
68
|
try {
|
|
@@ -51,7 +51,7 @@ const SENTRY_NODE_VERSION = (() => {
|
|
|
51
51
|
return _require('@sentry/node/package.json').version;
|
|
52
52
|
}
|
|
53
53
|
catch {
|
|
54
|
-
return '
|
|
54
|
+
return '10.58.0'; // safe fallback matching currently installed version
|
|
55
55
|
}
|
|
56
56
|
})();
|
|
57
57
|
// ---------------------------------------------------------------------------
|
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
import fs from 'fs-extra';
|
|
2
2
|
import path from 'path';
|
|
3
3
|
import readline from 'readline';
|
|
4
|
+
import { consoleLogger } from '../logs.js';
|
|
4
5
|
export class ItemsStore {
|
|
5
6
|
constructor(storagePath) {
|
|
6
7
|
this.ensuredDirs = new Set();
|
|
8
|
+
this.fileWriteQueues = new Map();
|
|
7
9
|
this.basePath = path.join(storagePath, 'tmp-items');
|
|
8
10
|
}
|
|
9
11
|
sanitizeRuleId(ruleId) {
|
|
@@ -22,8 +24,25 @@ export class ItemsStore {
|
|
|
22
24
|
async appendPageItems(category, ruleId, entry) {
|
|
23
25
|
await this.ensureDir(category);
|
|
24
26
|
const filePath = this.getRuleFilePath(category, ruleId);
|
|
25
|
-
|
|
26
|
-
|
|
27
|
+
let line = JSON.stringify(entry);
|
|
28
|
+
// JSON.stringify should never produce literal newlines inside strings, but HTML content
|
|
29
|
+
// from page evaluation may contain edge-case characters (e.g. unescaped control chars in
|
|
30
|
+
// non-spec-compliant innerHTML). Strip any embedded \r or \n that would break JSONL format readline parsing.
|
|
31
|
+
line = line.replace(/[\n\r]/g, (match) => {
|
|
32
|
+
if (match === '\n')
|
|
33
|
+
return '\\n';
|
|
34
|
+
if (match === '\r')
|
|
35
|
+
return '\\r';
|
|
36
|
+
return match;
|
|
37
|
+
});
|
|
38
|
+
line += '\n';
|
|
39
|
+
// Serialize writes per rule file to avoid concurrent append interleaving/truncation.
|
|
40
|
+
const previous = this.fileWriteQueues.get(filePath) ?? Promise.resolve();
|
|
41
|
+
const next = previous.then(() => fs.appendFile(filePath, line, 'utf8'));
|
|
42
|
+
this.fileWriteQueues.set(filePath, next.catch(() => {
|
|
43
|
+
// Keep queue alive for subsequent writes.
|
|
44
|
+
}));
|
|
45
|
+
await next;
|
|
27
46
|
}
|
|
28
47
|
async *readRuleItems(category, ruleId) {
|
|
29
48
|
const filePath = this.getRuleFilePath(category, ruleId);
|
|
@@ -31,10 +50,19 @@ export class ItemsStore {
|
|
|
31
50
|
return;
|
|
32
51
|
const fileStream = fs.createReadStream(filePath, { encoding: 'utf8' });
|
|
33
52
|
const rl = readline.createInterface({ input: fileStream, crlfDelay: Infinity });
|
|
53
|
+
let lineNumber = 0;
|
|
34
54
|
for await (const line of rl) {
|
|
35
|
-
|
|
55
|
+
lineNumber += 1;
|
|
56
|
+
if (!line.trim())
|
|
57
|
+
continue;
|
|
58
|
+
try {
|
|
36
59
|
yield JSON.parse(line);
|
|
37
60
|
}
|
|
61
|
+
catch (error) {
|
|
62
|
+
// Tolerate malformed/truncated JSONL lines (e.g. interrupted append) so report generation can continue.
|
|
63
|
+
const preview = line.slice(0, 200);
|
|
64
|
+
consoleLogger.warn(`Skipping malformed itemsStore JSONL line ${lineNumber} in ${filePath}: ${error.message}. Content preview: ${preview}`);
|
|
65
|
+
}
|
|
38
66
|
}
|
|
39
67
|
}
|
|
40
68
|
async readRuleItemsMap(category, ruleId) {
|
|
@@ -46,6 +74,7 @@ export class ItemsStore {
|
|
|
46
74
|
return map;
|
|
47
75
|
}
|
|
48
76
|
async cleanup() {
|
|
77
|
+
await Promise.all(this.fileWriteQueues.values());
|
|
49
78
|
await fs.rm(this.basePath, { recursive: true, force: true });
|
|
50
79
|
}
|
|
51
80
|
}
|
package/oobee-client-scanner.js
CHANGED
|
@@ -3,9 +3,9 @@
|
|
|
3
3
|
* DO NOT EDIT MANUALLY. Re-generate with: node dist/generateOobeeClientScanner.js
|
|
4
4
|
*
|
|
5
5
|
* Embedded at generation time:
|
|
6
|
-
* App version : 0.10.
|
|
6
|
+
* App version : 0.10.93
|
|
7
7
|
* Sentry DSN : (from OOBEE_SENTRY_DSN env var or constants.ts default)
|
|
8
|
-
* Sentry SDK : @sentry/browser
|
|
8
|
+
* Sentry SDK : @sentry/browser 10.58.0 (loaded from CDN at runtime)
|
|
9
9
|
*
|
|
10
10
|
* Usage:
|
|
11
11
|
* <script src="oobee-client-scanner.js"></script>
|
|
@@ -34883,8 +34883,8 @@
|
|
|
34883
34883
|
// ── Sentry browser telemetry (Sentry JS SDK, loaded from CDN) ────────────
|
|
34884
34884
|
|
|
34885
34885
|
var _oobeeSentryDsn = "https://3b8c7ee46b06f33815a1301b6713ebc3@o4509047624761344.ingest.us.sentry.io/4509327783559168";
|
|
34886
|
-
var _oobeeAppVersion = "0.10.
|
|
34887
|
-
var _oobeeSentryVersion = "
|
|
34886
|
+
var _oobeeAppVersion = "0.10.93";
|
|
34887
|
+
var _oobeeSentryVersion = "10.58.0";
|
|
34888
34888
|
var _oobeeSentryInitialized = false;
|
|
34889
34889
|
var _oobeeSentryLoadPromise = null;
|
|
34890
34890
|
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@govtechsg/oobee",
|
|
3
3
|
"main": "dist/npmIndex.js",
|
|
4
|
-
"version": "0.10.
|
|
4
|
+
"version": "0.10.93",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"author": "Government Technology Agency <info@tech.gov.sg>",
|
|
7
7
|
"bin": {
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
"@aws-sdk/client-s3": "^3.1049.0",
|
|
12
12
|
"@json2csv/node": "^7.0.3",
|
|
13
13
|
"@napi-rs/canvas": "^0.1.53",
|
|
14
|
-
"@sentry/node": "^
|
|
14
|
+
"@sentry/node": "^10.58.0",
|
|
15
15
|
"@types/aws-sdk": "^0.0.42",
|
|
16
16
|
"axe-core": "^4.11.4",
|
|
17
17
|
"axios": "^1.8.2",
|
package/src/combine.ts
CHANGED
|
@@ -154,6 +154,7 @@ const combineRun = async (details: Data, deviceToScan: string) => {
|
|
|
154
154
|
blacklistedPatterns,
|
|
155
155
|
includeScreenshots,
|
|
156
156
|
customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '',
|
|
157
|
+
extraHTTPHeaders,
|
|
157
158
|
);
|
|
158
159
|
|
|
159
160
|
urlsCrawledObj = res.urlsCrawled;
|
package/src/constants/common.ts
CHANGED
|
@@ -377,9 +377,21 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
377
377
|
} = rawDevice;
|
|
378
378
|
|
|
379
379
|
const launchOptions = getPlaywrightLaunchOptions(browserToRun);
|
|
380
|
+
|
|
381
|
+
const { Authorization, ...nonAuthHeaders } = extraHTTPHeaders || {};
|
|
382
|
+
let httpCredentials = undefined;
|
|
383
|
+
if (Authorization?.startsWith('Basic ')) {
|
|
384
|
+
const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
|
|
385
|
+
const colonIdx = decoded.indexOf(':');
|
|
386
|
+
if (colonIdx > 0) {
|
|
387
|
+
httpCredentials = { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
|
|
388
|
+
}
|
|
389
|
+
}
|
|
390
|
+
|
|
380
391
|
const contextOptions: Record<string, unknown> = {
|
|
381
392
|
...restDevice,
|
|
382
|
-
...(
|
|
393
|
+
...(Object.keys(nonAuthHeaders).length > 0 && { extraHTTPHeaders: nonAuthHeaders }),
|
|
394
|
+
...(httpCredentials && { httpCredentials }),
|
|
383
395
|
ignoreHTTPSErrors: true,
|
|
384
396
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
385
397
|
};
|
|
@@ -421,6 +433,24 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
421
433
|
}
|
|
422
434
|
|
|
423
435
|
try {
|
|
436
|
+
// Only enable generic Authorization header routing interception broadly if
|
|
437
|
+
// a non-Basic Bearer auth string is heavily relied upon, thereby bypassing
|
|
438
|
+
// performance warnings inside the checkUrl phase for typical public scans
|
|
439
|
+
if (Authorization && !httpCredentials) {
|
|
440
|
+
const entryOrigin = new URL(url).origin;
|
|
441
|
+
await browserContext.route('**/*', async (route: any, request: any) => {
|
|
442
|
+
try {
|
|
443
|
+
if (new URL(request.url()).origin === entryOrigin) {
|
|
444
|
+
await route.continue({ headers: { ...request.headers(), Authorization } });
|
|
445
|
+
} else {
|
|
446
|
+
await route.continue();
|
|
447
|
+
}
|
|
448
|
+
} catch {
|
|
449
|
+
await route.continue();
|
|
450
|
+
}
|
|
451
|
+
});
|
|
452
|
+
}
|
|
453
|
+
|
|
424
454
|
const page = await browserContext.newPage();
|
|
425
455
|
|
|
426
456
|
// Block native Chrome download UI
|
|
@@ -431,16 +461,6 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
431
461
|
consoleLogger.info(`Unable to set download deny: ${(e as Error).message}`);
|
|
432
462
|
}
|
|
433
463
|
|
|
434
|
-
// OPTIMIZATION: Block heavy visual resources (Images/Fonts/CSS)
|
|
435
|
-
// This allows the "Connectivity Check" to pass as soon as HTML is ready
|
|
436
|
-
await page.route('**/*', (route) => {
|
|
437
|
-
const type = route.request().resourceType();
|
|
438
|
-
if (['image', 'media', 'font', 'stylesheet'].includes(type)) {
|
|
439
|
-
return route.abort();
|
|
440
|
-
}
|
|
441
|
-
return route.continue();
|
|
442
|
-
});
|
|
443
|
-
|
|
444
464
|
// STEP 2: Navigate (follows server-side redirects)
|
|
445
465
|
page.once('download', () => {
|
|
446
466
|
res.status = constants.urlCheckStatuses.notASupportedDocument.code;
|
|
@@ -1153,6 +1153,51 @@ export const preNavigationHooks = (extraHTTPHeaders: Record<string, string>) =>
|
|
|
1153
1153
|
];
|
|
1154
1154
|
};
|
|
1155
1155
|
|
|
1156
|
+
/**
|
|
1157
|
+
* Splits extraHTTPHeaders into auth and non-auth parts.
|
|
1158
|
+
* Auth headers (Authorization) must only be sent to same-origin requests to avoid CORS preflight failures.
|
|
1159
|
+
* Non-auth headers are safe to set globally on the browser context.
|
|
1160
|
+
*/
|
|
1161
|
+
export const splitAuthHeaders = (extraHTTPHeaders?: Record<string, string>) => {
|
|
1162
|
+
const { Authorization, ...nonAuthHeaders } = extraHTTPHeaders || {};
|
|
1163
|
+
return {
|
|
1164
|
+
authHeader: Authorization || null,
|
|
1165
|
+
nonAuthHeaders: Object.keys(nonAuthHeaders).length > 0 ? nonAuthHeaders : null,
|
|
1166
|
+
httpCredentials: (() => {
|
|
1167
|
+
if (!Authorization?.startsWith('Basic ')) return null;
|
|
1168
|
+
const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
|
|
1169
|
+
const colonIdx = decoded.indexOf(':');
|
|
1170
|
+
if (colonIdx <= 0) return null;
|
|
1171
|
+
return { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
|
|
1172
|
+
})(),
|
|
1173
|
+
};
|
|
1174
|
+
};
|
|
1175
|
+
|
|
1176
|
+
/**
|
|
1177
|
+
* Adds a route handler to a BrowserContext that sends the Authorization header
|
|
1178
|
+
* only to same-origin requests, preventing CORS preflight failures on cross-origin CDN resources.
|
|
1179
|
+
*/
|
|
1180
|
+
export const addAuthRouteHandler = async (
|
|
1181
|
+
context: BrowserContext,
|
|
1182
|
+
entryUrl: string,
|
|
1183
|
+
authHeader: string | null
|
|
1184
|
+
) => {
|
|
1185
|
+
if (!authHeader) return;
|
|
1186
|
+
|
|
1187
|
+
const entryOrigin = new URL(entryUrl).origin;
|
|
1188
|
+
await context.route('**/*', async (route, request) => {
|
|
1189
|
+
try {
|
|
1190
|
+
if (new URL(request.url()).origin === entryOrigin) {
|
|
1191
|
+
await route.continue({ headers: { ...request.headers(), Authorization: authHeader } });
|
|
1192
|
+
} else {
|
|
1193
|
+
await route.continue();
|
|
1194
|
+
}
|
|
1195
|
+
} catch {
|
|
1196
|
+
await route.continue();
|
|
1197
|
+
}
|
|
1198
|
+
});
|
|
1199
|
+
};
|
|
1200
|
+
|
|
1156
1201
|
export const postNavigationHooks = [
|
|
1157
1202
|
async (_crawlingContext: CrawlingContext) => {
|
|
1158
1203
|
guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
|
|
@@ -9,6 +9,7 @@ import {
|
|
|
9
9
|
isUrlPdf,
|
|
10
10
|
shouldSkipClickDueToDisallowedHref,
|
|
11
11
|
shouldSkipDueToUnsupportedContent,
|
|
12
|
+
splitAuthHeaders,
|
|
12
13
|
} from './commonCrawlerFunc.js';
|
|
13
14
|
import constants, {
|
|
14
15
|
UrlsCrawled,
|
|
@@ -385,6 +386,8 @@ const crawlDomain = async ({
|
|
|
385
386
|
specifiedMaxConcurrency || constants.maxConcurrency,
|
|
386
387
|
);
|
|
387
388
|
|
|
389
|
+
const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
|
|
390
|
+
|
|
388
391
|
const crawler = register(
|
|
389
392
|
new crawlee.PlaywrightCrawler({
|
|
390
393
|
launchContext: {
|
|
@@ -404,12 +407,20 @@ const crawlDomain = async ({
|
|
|
404
407
|
...playwrightDeviceDetailsObject,
|
|
405
408
|
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
406
409
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
407
|
-
...(
|
|
410
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
411
|
+
...(httpCredentials && { httpCredentials }),
|
|
408
412
|
};
|
|
409
413
|
},
|
|
410
414
|
],
|
|
411
415
|
},
|
|
412
416
|
requestQueue,
|
|
417
|
+
preNavigationHooks: [
|
|
418
|
+
async (crawlingContext) => {
|
|
419
|
+
if (extraHTTPHeaders) {
|
|
420
|
+
crawlingContext.request.headers = extraHTTPHeaders;
|
|
421
|
+
}
|
|
422
|
+
},
|
|
423
|
+
],
|
|
413
424
|
postNavigationHooks: [
|
|
414
425
|
async crawlingContext => {
|
|
415
426
|
const { page, request } = crawlingContext;
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import fs from 'fs';
|
|
2
2
|
import { chromium, Page } from 'playwright';
|
|
3
3
|
import { EnqueueStrategy } from 'crawlee';
|
|
4
|
-
import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
|
|
4
|
+
import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
|
|
5
5
|
import constants, { FileTypes, guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
|
|
6
6
|
import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
7
7
|
import crawlDomain from './crawlDomain.js';
|
|
@@ -58,6 +58,7 @@ const crawlIntelligentSitemap = async (
|
|
|
58
58
|
let sitemapLink = '';
|
|
59
59
|
|
|
60
60
|
const launchOptions = getPlaywrightLaunchOptions(browser);
|
|
61
|
+
const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
|
|
61
62
|
let context;
|
|
62
63
|
let browserInstance;
|
|
63
64
|
|
|
@@ -65,20 +66,25 @@ const crawlIntelligentSitemap = async (
|
|
|
65
66
|
const effectiveUserDataDirectory = userDataDirectory || '';
|
|
66
67
|
context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
|
|
67
68
|
...launchOptions,
|
|
68
|
-
...(
|
|
69
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
70
|
+
...(httpCredentials && { httpCredentials }),
|
|
69
71
|
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
70
72
|
});
|
|
71
73
|
register(context);
|
|
72
74
|
} else {
|
|
73
|
-
// In headful mode, avoid launchPersistentContext to prevent "Browser window not found"
|
|
74
75
|
browserInstance = await constants.launcher.launch(launchOptions);
|
|
75
76
|
register(browserInstance as unknown as { close: () => Promise<void> });
|
|
76
77
|
context = await browserInstance.newContext({
|
|
77
|
-
...(
|
|
78
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
79
|
+
...(httpCredentials && { httpCredentials }),
|
|
78
80
|
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
79
81
|
});
|
|
80
82
|
}
|
|
81
83
|
|
|
84
|
+
if (authHeader) {
|
|
85
|
+
await addAuthRouteHandler(context, link, authHeader);
|
|
86
|
+
}
|
|
87
|
+
|
|
82
88
|
const page = await context.newPage();
|
|
83
89
|
|
|
84
90
|
for (const path of sitemapPaths) {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/* eslint-env browser */
|
|
2
|
-
import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
|
|
2
|
+
import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
|
|
3
3
|
import { cleanUpAndExit, register, registerSoftClose } from '../utils.js';
|
|
4
4
|
import constants, {
|
|
5
5
|
getIntermediateScreenshotsPath,
|
|
@@ -60,6 +60,7 @@ const runCustom = async (
|
|
|
60
60
|
blacklistedPatterns: string[] | null,
|
|
61
61
|
includeScreenshots: boolean,
|
|
62
62
|
initialCustomFlowLabel?: string,
|
|
63
|
+
extraHTTPHeaders?: Record<string, string>,
|
|
63
64
|
) => {
|
|
64
65
|
// checks and delete datasets path if it already exists
|
|
65
66
|
process.env.CRAWLEE_STORAGE_DIR = randomToken;
|
|
@@ -109,6 +110,8 @@ const runCustom = async (
|
|
|
109
110
|
...customArgs,
|
|
110
111
|
];
|
|
111
112
|
|
|
113
|
+
const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
|
|
114
|
+
|
|
112
115
|
const context = await constants.launcher.launchPersistentContext(userDataDirectory, {
|
|
113
116
|
...baseLaunchOptions,
|
|
114
117
|
args: mergedArgs,
|
|
@@ -118,8 +121,14 @@ const runCustom = async (
|
|
|
118
121
|
viewport: null,
|
|
119
122
|
...(hasCustomViewport ? contextDeviceOptions : {}),
|
|
120
123
|
userAgent: process.env.OOBEE_USER_AGENT || (deviceUserAgent as string | undefined),
|
|
124
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
125
|
+
...(httpCredentials && { httpCredentials }),
|
|
121
126
|
});
|
|
122
127
|
|
|
128
|
+
if (authHeader) {
|
|
129
|
+
await addAuthRouteHandler(context, url, authHeader);
|
|
130
|
+
}
|
|
131
|
+
|
|
123
132
|
register(context);
|
|
124
133
|
|
|
125
134
|
processPageParams.stopAll = async () => {
|
|
@@ -60,7 +60,7 @@ const SENTRY_NODE_VERSION: string = (() => {
|
|
|
60
60
|
try {
|
|
61
61
|
return _require('@sentry/node/package.json').version as string;
|
|
62
62
|
} catch {
|
|
63
|
-
return '
|
|
63
|
+
return '10.58.0'; // safe fallback matching currently installed version
|
|
64
64
|
}
|
|
65
65
|
})();
|
|
66
66
|
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import fs from 'fs-extra';
|
|
2
2
|
import path from 'path';
|
|
3
3
|
import readline from 'readline';
|
|
4
|
+
import { consoleLogger } from '../logs.js';
|
|
4
5
|
import type { ItemsInfo } from './types.js';
|
|
5
6
|
|
|
6
7
|
export interface ItemsStoreEntry {
|
|
@@ -16,6 +17,7 @@ export interface ItemsStoreEntry {
|
|
|
16
17
|
export class ItemsStore {
|
|
17
18
|
private basePath: string;
|
|
18
19
|
private ensuredDirs = new Set<string>();
|
|
20
|
+
private fileWriteQueues = new Map<string, Promise<void>>();
|
|
19
21
|
|
|
20
22
|
constructor(storagePath: string) {
|
|
21
23
|
this.basePath = path.join(storagePath, 'tmp-items');
|
|
@@ -40,8 +42,29 @@ export class ItemsStore {
|
|
|
40
42
|
async appendPageItems(category: string, ruleId: string, entry: ItemsStoreEntry): Promise<void> {
  // Appends `entry` as one JSONL record to the rule file for (category, ruleId).
  // Resolves once this record's append has completed; rejects if that append fails.
  await this.ensureDir(category);
  const filePath = this.getRuleFilePath(category, ruleId);

  // Each record must occupy exactly one physical line. JSON.stringify already escapes \n and \r
  // inside strings, but it emits U+2028 / U+2029 unescaped. NOTE(review): Node's readline may
  // treat those two characters as line terminators (version-dependent — confirm), which would
  // split a record when the file is read back line by line. All four are written as JSON escape
  // sequences, which JSON.parse decodes back to the original characters.
  const lineBreakEscapes: Record<string, string> = {
    '\n': '\\n',
    '\r': '\\r',
    '\u2028': '\\u2028',
    '\u2029': '\\u2029',
  };
  const serialized = JSON.stringify(entry).replace(
    /[\n\r\u2028\u2029]/g,
    (ch) => lineBreakEscapes[ch] ?? ch,
  );
  const line = `${serialized}\n`;

  // Serialize writes per rule file to avoid concurrent append interleaving/truncation.
  const previous = this.fileWriteQueues.get(filePath) ?? Promise.resolve();
  const next = previous.then(() => fs.appendFile(filePath, line, 'utf8'));
  // The queue stores a never-rejecting tail so one failed append does not block later writes.
  this.fileWriteQueues.set(
    filePath,
    next.catch(() => {
      // Keep queue alive for subsequent writes.
    }),
  );

  // The caller still observes this append's own failure.
  await next;
}
|
|
46
69
|
|
|
47
70
|
async *readRuleItems(category: string, ruleId: string): AsyncGenerator<ItemsStoreEntry> {
|
|
@@ -51,9 +74,19 @@ export class ItemsStore {
|
|
|
51
74
|
const fileStream = fs.createReadStream(filePath, { encoding: 'utf8' });
|
|
52
75
|
const rl = readline.createInterface({ input: fileStream, crlfDelay: Infinity });
|
|
53
76
|
|
|
77
|
+
let lineNumber = 0;
|
|
54
78
|
for await (const line of rl) {
|
|
55
|
-
|
|
79
|
+
lineNumber += 1;
|
|
80
|
+
if (!line.trim()) continue;
|
|
81
|
+
|
|
82
|
+
try {
|
|
56
83
|
yield JSON.parse(line) as ItemsStoreEntry;
|
|
84
|
+
} catch (error) {
|
|
85
|
+
// Tolerate malformed/truncated JSONL lines (e.g. interrupted append) so report generation can continue.
|
|
86
|
+
const preview = line.slice(0, 200);
|
|
87
|
+
consoleLogger.warn(
|
|
88
|
+
`Skipping malformed itemsStore JSONL line ${lineNumber} in ${filePath}: ${(error as Error).message}. Content preview: ${preview}`,
|
|
89
|
+
);
|
|
57
90
|
}
|
|
58
91
|
}
|
|
59
92
|
}
|
|
@@ -68,6 +101,7 @@ export class ItemsStore {
|
|
|
68
101
|
}
|
|
69
102
|
|
|
70
103
|
async cleanup(): Promise<void> {
  // Removes the store's on-disk directory (`basePath`) and resets in-memory bookkeeping.
  // Queued appends are awaited first so nothing is still writing while the tree is deleted;
  // the queued promises never reject, so this wait cannot fail.
  await Promise.all(this.fileWriteQueues.values());
  await fs.rm(this.basePath, { recursive: true, force: true });

  // The files behind these entries are gone. NOTE(review): ensuredDirs presumably caches
  // directories that were already created — clearing it lets them be re-created if this
  // store instance is used again after cleanup; confirm against ensureDir.
  this.fileWriteQueues.clear();
  this.ensuredDirs.clear();
}
|
|
73
107
|
}
|
/package/{d5e2f6a7-0279-41a3-8763-844970cdf0ba.txt → 7339fae5-e8ed-4b50-af13-317847620dbf.txt}
RENAMED
|
File without changes
|