@govtechsg/oobee 0.10.92 → 0.10.93

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/AGENTS.md CHANGED
@@ -79,6 +79,7 @@ All crawlers use Crawlee's `PlaywrightCrawler` with:
79
79
  - Docker detection (`/.dockerenv`): adds `--disable-gpu`, `--no-sandbox`, `--disable-dev-shm-usage`
80
80
  - Proxy support (manual, PAC, or none) via `getProxyInfo()`
81
81
  - Channel set from browser name (undefined for chromium = bundled)
82
+ - `--mute-audio` is added by default in both headless and headful modes, but must be disabled for `customFlow` by calling `getPlaywrightLaunchOptions(browser, { includeMuteAudio: false })`
82
83
 
83
84
  ### User-Agent
84
85
 
@@ -134,6 +135,11 @@ The `constants` default export object holds runtime state:
134
135
  | `OOBEE_SLOWMO` | Browser slowmo in ms |
135
136
  | `OOBEE_FAST_CRAWLER` | Experimental high-concurrency mode |
136
137
  | `OOBEE_DISABLE_BROWSER_DOWNLOAD` | Block browser file downloads |
138
+ | `OOBEE_TAGGED_WEBSITE` | Tag to identify the website in Sentry telemetry (overridden by `--websiteTag` CLI flag) |
139
+ | `OOBEE_SCAN_METADATA` | Overrides `entryUrl` tag in Sentry events |
140
+ | `OOBEE_SCAN_PRODUCT` | Adds `scanProduct` tag to Sentry events |
141
+ | `OOBEE_CONSECUTIVE_MAX_RETRIES` | Max consecutive HTTP failures before circuit breaker aborts crawl (default 100) |
142
+ | `OOBEE_VALIDATE_URL` | If set, exit after URL validation without scanning |
137
143
  | `HTTP_PROXY` / `HTTPS_PROXY` / `ALL_PROXY` | Proxy configuration |
138
144
  | `NO_PROXY` / `INCLUDE_PROXY` | Proxy bypass/include lists |
139
145
 
@@ -215,6 +221,14 @@ docker run oobee node dist/cli.js ...
215
221
 
216
222
  8. **Crawlee dataset** — Results are stored as numbered JSON files in `{randomToken}/datasets/default/`. Each file is one page's axe results. `generateArtifacts()` reads all of them.
217
223
 
224
+ 9. **Auth headers and CORS** — Never set `Authorization` in `extraHTTPHeaders` globally on a browser context. Playwright sends `extraHTTPHeaders` to ALL requests (including cross-origin CDNs), which triggers CORS preflight failures. Instead use `splitAuthHeaders()` from `commonCrawlerFunc.ts` to separate auth from non-auth headers:
225
+ - Non-auth headers → safe to set globally via `extraHTTPHeaders` on context/launch options
226
+ - Basic auth → set `httpCredentials` on context (Playwright auto-responds to 401 challenges, origin-aware)
227
+ - Any Authorization header → send only to same-origin requests via `addAuthRouteHandler()` (route interception) or Crawlee's `preNavigationHooks` (navigation-only)
228
+ - Credentials come from URL-embedded `user:pass@host` or `-m "Authorization Basic ..."` — both produce the same `extraHTTPHeaders.Authorization` value in `prepareData()`
229
+
230
+ 10. **Intermediate JSONL write safety + corruption tolerance** — `ItemsStore.appendPageItems()` serializes writes per rule file (one promise queue per file path) to prevent interleaved corruption. Immediately after `JSON.stringify()`, it also replaces any literal `\n` or `\r` character left in the serialized line (e.g. from website HTML inputs) with its escaped form (`\\n` / `\\r`), so that no single JSON entry can introduce an extra line boundary when written in JSONL format. Keep the backward-compatible `fs.appendFile` queues rather than switching to buffered WriteStreams, to guarantee pipeline sync visibility. `ItemsStore.readRuleItems()` tolerates historical malformed lines by logging a warning and skipping them.
231
+
218
232
  ## Testing Considerations
219
233
 
220
234
  When making changes, validate these areas which have well-established edge cases:
package/README.md CHANGED
@@ -92,6 +92,10 @@ verapdf --version
92
92
  | WARN_LEVEL | Only used in tests. | |
93
93
  | OOBEE_DISABLE_BROWSER_DOWNLOAD | Experimental flag to disable file downloads on Chrome/Chromium/Edge. Does not affect Local File scan | |
94
94
  | OOBEE_SLOWMO | Experimental flag to slow down web browser behaviour by specified duration (in milliseconds) | |
95
+ | OOBEE_TAGGED_WEBSITE | Tag to identify the website in telemetry. Can also be set via `-z, --websiteTag` CLI flag (CLI flag takes precedence). | |
96
+ | OOBEE_SCAN_METADATA | Overrides the `entryUrl` tag sent to telemetry. | |
97
+ | OOBEE_SCAN_PRODUCT | Adds a `scanProduct` tag to telemetry events. | |
98
+ | OOBEE_CONSECUTIVE_MAX_RETRIES | Max consecutive HTTP failures before the circuit breaker aborts the crawl. | `100` |
95
99
  | HTTP_PROXY | URL of the proxy server to be used for HTTP requests (e.g. `http://proxy.example.com:8080`). | |
96
100
  | HTTPS_PROXY | URL of the proxy server to be used for HTTPS requests (e.g. `https://proxy.example.com:8080`). | |
97
101
  | ALL_PROXY | URL of the proxy server to be used for all requests, typically used for SOCKS5 proxies (e.g. `socks5://proxy.example.com:1080`). Note: IPv6 direct connections may still continue even though a SOCKS5 proxy is specified, due to a known issue with Chrome/Chromium. (The recommended workaround is to turn off IPv6 at host level.) | |
@@ -413,6 +417,21 @@ Examples:
413
417
  > [ -d <device> | -w <viewport_width> ]
414
418
 
415
419
  ```
420
+
421
+ ### Basic Auth
422
+
423
+ For sites behind HTTP Basic Authentication, you can provide credentials in two ways:
424
+
425
+ 1. **Embed in URL**: `npm run cli -- -u 'https://user:password@example.com' -c 3`
426
+ 2. **Use `-m` flag**: `npm run cli -- -u 'https://example.com' -c 3 -m "Authorization Basic dXNlcjpwYXNzd29yZA=="`
427
+
428
+ Both methods work across all scan types (sitemap, website, custom flow). For multiple headers, separate with `, `:
429
+ ```
430
+ -m "Authorization Basic dXNlcjpwYXNz, X-Custom-Header myvalue"
431
+ ```
432
+
433
+ > **Note:** Authorization headers are only sent to same-origin requests to avoid CORS preflight failures on cross-origin resources (e.g., CDN fonts, analytics scripts).
434
+
416
435
  ### Note on Windows PowerShell:
417
436
  You need to run the command as `npm run cli -- --` (with the extra set of `--`) as PowerShell interprets arguments differently.
418
437
 
package/dist/combine.js CHANGED
@@ -89,7 +89,7 @@ const combineRun = async (details, deviceToScan) => {
89
89
  let durationExceeded = false;
90
90
  switch (type) {
91
91
  case ScannerTypes.CUSTOM:
92
- const res = await runCustom(url, randomToken, browser, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '');
92
+ const res = await runCustom(url, randomToken, browser, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '', extraHTTPHeaders);
93
93
  urlsCrawledObj = res.urlsCrawled;
94
94
  uiCustomFlowLabel = res.customFlowLabel;
95
95
  break;
@@ -300,9 +300,19 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
300
300
  const rawDevice = (playwrightDeviceDetailsObject || {});
301
301
  const { viewport, isMobile, hasTouch, userAgent: deviceUserAgent, ...restDevice } = rawDevice;
302
302
  const launchOptions = getPlaywrightLaunchOptions(browserToRun);
303
+ const { Authorization, ...nonAuthHeaders } = extraHTTPHeaders || {};
304
+ let httpCredentials = undefined;
305
+ if (Authorization?.startsWith('Basic ')) {
306
+ const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
307
+ const colonIdx = decoded.indexOf(':');
308
+ if (colonIdx > 0) {
309
+ httpCredentials = { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
310
+ }
311
+ }
303
312
  const contextOptions = {
304
313
  ...restDevice,
305
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
314
+ ...(Object.keys(nonAuthHeaders).length > 0 && { extraHTTPHeaders: nonAuthHeaders }),
315
+ ...(httpCredentials && { httpCredentials }),
306
316
  ignoreHTTPSErrors: true,
307
317
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
308
318
  };
@@ -342,6 +352,25 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
342
352
  return res;
343
353
  }
344
354
  try {
355
+ // Only enable generic Authorization header routing interception broadly if
356
+ // a non-Basic Bearer auth string is heavily relied upon, thereby bypassing
357
+ // performance warnings inside the check checkUrl phase for typical public scans
358
+ if (Authorization && !httpCredentials) {
359
+ const entryOrigin = new URL(url).origin;
360
+ await browserContext.route('**/*', async (route, request) => {
361
+ try {
362
+ if (new URL(request.url()).origin === entryOrigin) {
363
+ await route.continue({ headers: { ...request.headers(), Authorization } });
364
+ }
365
+ else {
366
+ await route.continue();
367
+ }
368
+ }
369
+ catch {
370
+ await route.continue();
371
+ }
372
+ });
373
+ }
345
374
  const page = await browserContext.newPage();
346
375
  // Block native Chrome download UI
347
376
  try {
@@ -351,15 +380,6 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
351
380
  catch (e) {
352
381
  consoleLogger.info(`Unable to set download deny: ${e.message}`);
353
382
  }
354
- // OPTIMIZATION: Block heavy visual resources (Images/Fonts/CSS)
355
- // This allows the "Connectivity Check" to pass as soon as HTML is ready
356
- await page.route('**/*', (route) => {
357
- const type = route.request().resourceType();
358
- if (['image', 'media', 'font', 'stylesheet'].includes(type)) {
359
- return route.abort();
360
- }
361
- return route.continue();
362
- });
363
383
  // STEP 2: Navigate (follows server-side redirects)
364
384
  page.once('download', () => {
365
385
  res.status = constants.urlCheckStatuses.notASupportedDocument.code;
@@ -905,6 +905,49 @@ export const preNavigationHooks = (extraHTTPHeaders) => {
905
905
  },
906
906
  ];
907
907
  };
908
+ /**
909
+ * Splits extraHTTPHeaders into auth and non-auth parts.
910
+ * Auth headers (Authorization) must only be sent to same-origin requests to avoid CORS preflight failures.
911
+ * Non-auth headers are safe to set globally on the browser context.
912
+ */
913
+ export const splitAuthHeaders = (extraHTTPHeaders) => {
914
+ const { Authorization, ...nonAuthHeaders } = extraHTTPHeaders || {};
915
+ return {
916
+ authHeader: Authorization || null,
917
+ nonAuthHeaders: Object.keys(nonAuthHeaders).length > 0 ? nonAuthHeaders : null,
918
+ httpCredentials: (() => {
919
+ if (!Authorization?.startsWith('Basic '))
920
+ return null;
921
+ const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
922
+ const colonIdx = decoded.indexOf(':');
923
+ if (colonIdx <= 0)
924
+ return null;
925
+ return { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
926
+ })(),
927
+ };
928
+ };
929
+ /**
930
+ * Adds a route handler to a BrowserContext that sends the Authorization header
931
+ * only to same-origin requests, preventing CORS preflight failures on cross-origin CDN resources.
932
+ */
933
+ export const addAuthRouteHandler = async (context, entryUrl, authHeader) => {
934
+ if (!authHeader)
935
+ return;
936
+ const entryOrigin = new URL(entryUrl).origin;
937
+ await context.route('**/*', async (route, request) => {
938
+ try {
939
+ if (new URL(request.url()).origin === entryOrigin) {
940
+ await route.continue({ headers: { ...request.headers(), Authorization: authHeader } });
941
+ }
942
+ else {
943
+ await route.continue();
944
+ }
945
+ }
946
+ catch {
947
+ await route.continue();
948
+ }
949
+ });
950
+ };
908
951
  export const postNavigationHooks = [
909
952
  async (_crawlingContext) => {
910
953
  guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
@@ -1,6 +1,6 @@
1
1
  import crawlee from 'crawlee';
2
2
  import { CrawlRateController } from './crawlRateController.js';
3
- import { createCrawleeSubFolders, getPreLaunchHook, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, } from './commonCrawlerFunc.js';
3
+ import { createCrawleeSubFolders, getPreLaunchHook, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, splitAuthHeaders, } from './commonCrawlerFunc.js';
4
4
  import constants, { blackListedFileExtensions, guiInfoStatusTypes, cssQuerySelectors, STATUS_CODE_METADATA, disallowedListOfPatterns, disallowedSelectorPatterns, FileTypes, } from '../constants/constants.js';
5
5
  import { getPlaywrightLaunchOptions, isBlacklistedFileExtensions, isSkippedUrl, isDisallowedInRobotsTxt, getUrlsFromRobotsTxt, waitForPageLoaded, } from '../constants/common.js';
6
6
  import { areLinksEqual, isFollowStrategy, isSameHostname, normUrl, register } from '../utils.js';
@@ -275,6 +275,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
275
275
  };
276
276
  let isAbortingScanNow = false;
277
277
  const rateController = new CrawlRateController(maxRequestsPerCrawl, specifiedMaxConcurrency || constants.maxConcurrency);
278
+ const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
278
279
  const crawler = register(new crawlee.PlaywrightCrawler({
279
280
  launchContext: {
280
281
  launcher: constants.launcher,
@@ -293,12 +294,20 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
293
294
  ...playwrightDeviceDetailsObject,
294
295
  ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
295
296
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
296
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
297
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
298
+ ...(httpCredentials && { httpCredentials }),
297
299
  };
298
300
  },
299
301
  ],
300
302
  },
301
303
  requestQueue,
304
+ preNavigationHooks: [
305
+ async (crawlingContext) => {
306
+ if (extraHTTPHeaders) {
307
+ crawlingContext.request.headers = extraHTTPHeaders;
308
+ }
309
+ },
310
+ ],
302
311
  postNavigationHooks: [
303
312
  async (crawlingContext) => {
304
313
  const { page, request } = crawlingContext;
@@ -1,4 +1,4 @@
1
- import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
1
+ import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
2
2
  import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
3
3
  import { consoleLogger, guiInfoLog } from '../logs.js';
4
4
  import crawlDomain from './crawlDomain.js';
@@ -26,26 +26,31 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
26
26
  const homeUrl = getHomeUrl(link);
27
27
  let sitemapLink = '';
28
28
  const launchOptions = getPlaywrightLaunchOptions(browser);
29
+ const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
29
30
  let context;
30
31
  let browserInstance;
31
32
  if (process.env.CRAWLEE_HEADLESS === '1') {
32
33
  const effectiveUserDataDirectory = userDataDirectory || '';
33
34
  context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
34
35
  ...launchOptions,
35
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
36
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
37
+ ...(httpCredentials && { httpCredentials }),
36
38
  ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
37
39
  });
38
40
  register(context);
39
41
  }
40
42
  else {
41
- // In headful mode, avoid launchPersistentContext to prevent "Browser window not found"
42
43
  browserInstance = await constants.launcher.launch(launchOptions);
43
44
  register(browserInstance);
44
45
  context = await browserInstance.newContext({
45
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
46
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
47
+ ...(httpCredentials && { httpCredentials }),
46
48
  ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
47
49
  });
48
50
  }
51
+ if (authHeader) {
52
+ await addAuthRouteHandler(context, link, authHeader);
53
+ }
49
54
  const page = await context.newPage();
50
55
  for (const path of sitemapPaths) {
51
56
  sitemapLink = homeUrl + path;
@@ -1,5 +1,5 @@
1
1
  /* eslint-env browser */
2
- import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
2
+ import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
3
3
  import { cleanUpAndExit, register, registerSoftClose } from '../utils.js';
4
4
  import constants, { getIntermediateScreenshotsPath, guiInfoStatusTypes, } from '../constants/constants.js';
5
5
  import { initNewPage, log } from './custom/utils.js';
@@ -18,7 +18,7 @@ export class ProcessPageParams {
18
18
  this.randomToken = randomToken;
19
19
  }
20
20
  }
21
- const runCustom = async (url, randomToken, browserToRun, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, initialCustomFlowLabel) => {
21
+ const runCustom = async (url, randomToken, browserToRun, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, initialCustomFlowLabel, extraHTTPHeaders) => {
22
22
  // checks and delete datasets path if it already exists
23
23
  process.env.CRAWLEE_STORAGE_DIR = randomToken;
24
24
  const urlsCrawled = { ...constants.urlsCrawledObj };
@@ -47,6 +47,7 @@ const runCustom = async (url, randomToken, browserToRun, userDataDirectory, view
47
47
  ...baseArgs.filter(a => !a.startsWith('--window-size') && a !== '--start-maximized'),
48
48
  ...customArgs,
49
49
  ];
50
+ const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
50
51
  const context = await constants.launcher.launchPersistentContext(userDataDirectory, {
51
52
  ...baseLaunchOptions,
52
53
  args: mergedArgs,
@@ -56,7 +57,12 @@ const runCustom = async (url, randomToken, browserToRun, userDataDirectory, view
56
57
  viewport: null,
57
58
  ...(hasCustomViewport ? contextDeviceOptions : {}),
58
59
  userAgent: process.env.OOBEE_USER_AGENT || deviceUserAgent,
60
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
61
+ ...(httpCredentials && { httpCredentials }),
59
62
  });
63
+ if (authHeader) {
64
+ await addAuthRouteHandler(context, url, authHeader);
65
+ }
60
66
  register(context);
61
67
  processPageParams.stopAll = async () => {
62
68
  try {
@@ -51,7 +51,7 @@ const SENTRY_NODE_VERSION = (() => {
51
51
  return _require('@sentry/node/package.json').version;
52
52
  }
53
53
  catch {
54
- return '9.47.1'; // safe fallback matching currently installed version
54
+ return '10.58.0'; // safe fallback matching currently installed version
55
55
  }
56
56
  })();
57
57
  // ---------------------------------------------------------------------------
@@ -1,9 +1,11 @@
1
1
  import fs from 'fs-extra';
2
2
  import path from 'path';
3
3
  import readline from 'readline';
4
+ import { consoleLogger } from '../logs.js';
4
5
  export class ItemsStore {
5
6
  constructor(storagePath) {
6
7
  this.ensuredDirs = new Set();
8
+ this.fileWriteQueues = new Map();
7
9
  this.basePath = path.join(storagePath, 'tmp-items');
8
10
  }
9
11
  sanitizeRuleId(ruleId) {
@@ -22,8 +24,25 @@ export class ItemsStore {
22
24
  async appendPageItems(category, ruleId, entry) {
23
25
  await this.ensureDir(category);
24
26
  const filePath = this.getRuleFilePath(category, ruleId);
25
- const line = JSON.stringify(entry) + '\n';
26
- await fs.appendFile(filePath, line, 'utf8');
27
+ let line = JSON.stringify(entry);
28
+ // JSON.stringify should never produce literal newlines inside strings, but HTML content
29
+ // from page evaluation may contain edge-case characters (e.g. unescaped control chars in
30
+ // non-spec-compliant innerHTML). Strip any embedded \r or \n that would break JSONL format readline parsing.
31
+ line = line.replace(/[\n\r]/g, (match) => {
32
+ if (match === '\n')
33
+ return '\\n';
34
+ if (match === '\r')
35
+ return '\\r';
36
+ return match;
37
+ });
38
+ line += '\n';
39
+ // Serialize writes per rule file to avoid concurrent append interleaving/truncation.
40
+ const previous = this.fileWriteQueues.get(filePath) ?? Promise.resolve();
41
+ const next = previous.then(() => fs.appendFile(filePath, line, 'utf8'));
42
+ this.fileWriteQueues.set(filePath, next.catch(() => {
43
+ // Keep queue alive for subsequent writes.
44
+ }));
45
+ await next;
27
46
  }
28
47
  async *readRuleItems(category, ruleId) {
29
48
  const filePath = this.getRuleFilePath(category, ruleId);
@@ -31,10 +50,19 @@ export class ItemsStore {
31
50
  return;
32
51
  const fileStream = fs.createReadStream(filePath, { encoding: 'utf8' });
33
52
  const rl = readline.createInterface({ input: fileStream, crlfDelay: Infinity });
53
+ let lineNumber = 0;
34
54
  for await (const line of rl) {
35
- if (line.trim()) {
55
+ lineNumber += 1;
56
+ if (!line.trim())
57
+ continue;
58
+ try {
36
59
  yield JSON.parse(line);
37
60
  }
61
+ catch (error) {
62
+ // Tolerate malformed/truncated JSONL lines (e.g. interrupted append) so report generation can continue.
63
+ const preview = line.slice(0, 200);
64
+ consoleLogger.warn(`Skipping malformed itemsStore JSONL line ${lineNumber} in ${filePath}: ${error.message}. Content preview: ${preview}`);
65
+ }
38
66
  }
39
67
  }
40
68
  async readRuleItemsMap(category, ruleId) {
@@ -46,6 +74,7 @@ export class ItemsStore {
46
74
  return map;
47
75
  }
48
76
  async cleanup() {
77
+ await Promise.all(this.fileWriteQueues.values());
49
78
  await fs.rm(this.basePath, { recursive: true, force: true });
50
79
  }
51
80
  }
@@ -3,9 +3,9 @@
3
3
  * DO NOT EDIT MANUALLY. Re-generate with: node dist/generateOobeeClientScanner.js
4
4
  *
5
5
  * Embedded at generation time:
6
- * App version : 0.10.92
6
+ * App version : 0.10.93
7
7
  * Sentry DSN : (from OOBEE_SENTRY_DSN env var or constants.ts default)
8
- * Sentry SDK : @sentry/browser 9.47.1 (loaded from CDN at runtime)
8
+ * Sentry SDK : @sentry/browser 10.58.0 (loaded from CDN at runtime)
9
9
  *
10
10
  * Usage:
11
11
  * <script src="oobee-client-scanner.js"></script>
@@ -34883,8 +34883,8 @@
34883
34883
  // ── Sentry browser telemetry (Sentry JS SDK, loaded from CDN) ────────────
34884
34884
 
34885
34885
  var _oobeeSentryDsn = "https://3b8c7ee46b06f33815a1301b6713ebc3@o4509047624761344.ingest.us.sentry.io/4509327783559168";
34886
- var _oobeeAppVersion = "0.10.92";
34887
- var _oobeeSentryVersion = "9.47.1";
34886
+ var _oobeeAppVersion = "0.10.93";
34887
+ var _oobeeSentryVersion = "10.58.0";
34888
34888
  var _oobeeSentryInitialized = false;
34889
34889
  var _oobeeSentryLoadPromise = null;
34890
34890
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@govtechsg/oobee",
3
3
  "main": "dist/npmIndex.js",
4
- "version": "0.10.92",
4
+ "version": "0.10.93",
5
5
  "type": "module",
6
6
  "author": "Government Technology Agency <info@tech.gov.sg>",
7
7
  "bin": {
@@ -11,7 +11,7 @@
11
11
  "@aws-sdk/client-s3": "^3.1049.0",
12
12
  "@json2csv/node": "^7.0.3",
13
13
  "@napi-rs/canvas": "^0.1.53",
14
- "@sentry/node": "^9.13.0",
14
+ "@sentry/node": "^10.58.0",
15
15
  "@types/aws-sdk": "^0.0.42",
16
16
  "axe-core": "^4.11.4",
17
17
  "axios": "^1.8.2",
package/src/combine.ts CHANGED
@@ -154,6 +154,7 @@ const combineRun = async (details: Data, deviceToScan: string) => {
154
154
  blacklistedPatterns,
155
155
  includeScreenshots,
156
156
  customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '',
157
+ extraHTTPHeaders,
157
158
  );
158
159
 
159
160
  urlsCrawledObj = res.urlsCrawled;
@@ -377,9 +377,21 @@ const checkUrlConnectivityWithBrowser = async (
377
377
  } = rawDevice;
378
378
 
379
379
  const launchOptions = getPlaywrightLaunchOptions(browserToRun);
380
+
381
+ const { Authorization, ...nonAuthHeaders } = extraHTTPHeaders || {};
382
+ let httpCredentials = undefined;
383
+ if (Authorization?.startsWith('Basic ')) {
384
+ const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
385
+ const colonIdx = decoded.indexOf(':');
386
+ if (colonIdx > 0) {
387
+ httpCredentials = { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
388
+ }
389
+ }
390
+
380
391
  const contextOptions: Record<string, unknown> = {
381
392
  ...restDevice,
382
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
393
+ ...(Object.keys(nonAuthHeaders).length > 0 && { extraHTTPHeaders: nonAuthHeaders }),
394
+ ...(httpCredentials && { httpCredentials }),
383
395
  ignoreHTTPSErrors: true,
384
396
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
385
397
  };
@@ -421,6 +433,24 @@ const checkUrlConnectivityWithBrowser = async (
421
433
  }
422
434
 
423
435
  try {
436
+ // Only enable generic Authorization header routing interception broadly if
437
+ // a non-Basic Bearer auth string is heavily relied upon, thereby bypassing
438
+ // performance warnings inside the check checkUrl phase for typical public scans
439
+ if (Authorization && !httpCredentials) {
440
+ const entryOrigin = new URL(url).origin;
441
+ await browserContext.route('**/*', async (route: any, request: any) => {
442
+ try {
443
+ if (new URL(request.url()).origin === entryOrigin) {
444
+ await route.continue({ headers: { ...request.headers(), Authorization } });
445
+ } else {
446
+ await route.continue();
447
+ }
448
+ } catch {
449
+ await route.continue();
450
+ }
451
+ });
452
+ }
453
+
424
454
  const page = await browserContext.newPage();
425
455
 
426
456
  // Block native Chrome download UI
@@ -431,16 +461,6 @@ const checkUrlConnectivityWithBrowser = async (
431
461
  consoleLogger.info(`Unable to set download deny: ${(e as Error).message}`);
432
462
  }
433
463
 
434
- // OPTIMIZATION: Block heavy visual resources (Images/Fonts/CSS)
435
- // This allows the "Connectivity Check" to pass as soon as HTML is ready
436
- await page.route('**/*', (route) => {
437
- const type = route.request().resourceType();
438
- if (['image', 'media', 'font', 'stylesheet'].includes(type)) {
439
- return route.abort();
440
- }
441
- return route.continue();
442
- });
443
-
444
464
  // STEP 2: Navigate (follows server-side redirects)
445
465
  page.once('download', () => {
446
466
  res.status = constants.urlCheckStatuses.notASupportedDocument.code;
@@ -1153,6 +1153,51 @@ export const preNavigationHooks = (extraHTTPHeaders: Record<string, string>) =>
1153
1153
  ];
1154
1154
  };
1155
1155
 
1156
+ /**
1157
+ * Splits extraHTTPHeaders into auth and non-auth parts.
1158
+ * Auth headers (Authorization) must only be sent to same-origin requests to avoid CORS preflight failures.
1159
+ * Non-auth headers are safe to set globally on the browser context.
1160
+ */
1161
+ export const splitAuthHeaders = (extraHTTPHeaders?: Record<string, string>) => {
1162
+ const { Authorization, ...nonAuthHeaders } = extraHTTPHeaders || {};
1163
+ return {
1164
+ authHeader: Authorization || null,
1165
+ nonAuthHeaders: Object.keys(nonAuthHeaders).length > 0 ? nonAuthHeaders : null,
1166
+ httpCredentials: (() => {
1167
+ if (!Authorization?.startsWith('Basic ')) return null;
1168
+ const decoded = Buffer.from(Authorization.slice(6), 'base64').toString();
1169
+ const colonIdx = decoded.indexOf(':');
1170
+ if (colonIdx <= 0) return null;
1171
+ return { username: decoded.slice(0, colonIdx), password: decoded.slice(colonIdx + 1) };
1172
+ })(),
1173
+ };
1174
+ };
1175
+
1176
+ /**
1177
+ * Adds a route handler to a BrowserContext that sends the Authorization header
1178
+ * only to same-origin requests, preventing CORS preflight failures on cross-origin CDN resources.
1179
+ */
1180
+ export const addAuthRouteHandler = async (
1181
+ context: BrowserContext,
1182
+ entryUrl: string,
1183
+ authHeader: string | null
1184
+ ) => {
1185
+ if (!authHeader) return;
1186
+
1187
+ const entryOrigin = new URL(entryUrl).origin;
1188
+ await context.route('**/*', async (route, request) => {
1189
+ try {
1190
+ if (new URL(request.url()).origin === entryOrigin) {
1191
+ await route.continue({ headers: { ...request.headers(), Authorization: authHeader } });
1192
+ } else {
1193
+ await route.continue();
1194
+ }
1195
+ } catch {
1196
+ await route.continue();
1197
+ }
1198
+ });
1199
+ };
1200
+
1156
1201
  export const postNavigationHooks = [
1157
1202
  async (_crawlingContext: CrawlingContext) => {
1158
1203
  guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
@@ -9,6 +9,7 @@ import {
9
9
  isUrlPdf,
10
10
  shouldSkipClickDueToDisallowedHref,
11
11
  shouldSkipDueToUnsupportedContent,
12
+ splitAuthHeaders,
12
13
  } from './commonCrawlerFunc.js';
13
14
  import constants, {
14
15
  UrlsCrawled,
@@ -385,6 +386,8 @@ const crawlDomain = async ({
385
386
  specifiedMaxConcurrency || constants.maxConcurrency,
386
387
  );
387
388
 
389
+ const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
390
+
388
391
  const crawler = register(
389
392
  new crawlee.PlaywrightCrawler({
390
393
  launchContext: {
@@ -404,12 +407,20 @@ const crawlDomain = async ({
404
407
  ...playwrightDeviceDetailsObject,
405
408
  ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
406
409
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
407
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
410
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
411
+ ...(httpCredentials && { httpCredentials }),
408
412
  };
409
413
  },
410
414
  ],
411
415
  },
412
416
  requestQueue,
417
+ preNavigationHooks: [
418
+ async (crawlingContext) => {
419
+ if (extraHTTPHeaders) {
420
+ crawlingContext.request.headers = extraHTTPHeaders;
421
+ }
422
+ },
423
+ ],
413
424
  postNavigationHooks: [
414
425
  async crawlingContext => {
415
426
  const { page, request } = crawlingContext;
@@ -1,7 +1,7 @@
1
1
  import fs from 'fs';
2
2
  import { chromium, Page } from 'playwright';
3
3
  import { EnqueueStrategy } from 'crawlee';
4
- import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
4
+ import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
5
5
  import constants, { FileTypes, guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
6
6
  import { consoleLogger, guiInfoLog } from '../logs.js';
7
7
  import crawlDomain from './crawlDomain.js';
@@ -58,6 +58,7 @@ const crawlIntelligentSitemap = async (
58
58
  let sitemapLink = '';
59
59
 
60
60
  const launchOptions = getPlaywrightLaunchOptions(browser);
61
+ const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
61
62
  let context;
62
63
  let browserInstance;
63
64
 
@@ -65,20 +66,25 @@ const crawlIntelligentSitemap = async (
65
66
  const effectiveUserDataDirectory = userDataDirectory || '';
66
67
  context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
67
68
  ...launchOptions,
68
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
69
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
70
+ ...(httpCredentials && { httpCredentials }),
69
71
  ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
70
72
  });
71
73
  register(context);
72
74
  } else {
73
- // In headful mode, avoid launchPersistentContext to prevent "Browser window not found"
74
75
  browserInstance = await constants.launcher.launch(launchOptions);
75
76
  register(browserInstance as unknown as { close: () => Promise<void> });
76
77
  context = await browserInstance.newContext({
77
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
78
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
79
+ ...(httpCredentials && { httpCredentials }),
78
80
  ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
79
81
  });
80
82
  }
81
83
 
84
+ if (authHeader) {
85
+ await addAuthRouteHandler(context, link, authHeader);
86
+ }
87
+
82
88
  const page = await context.newPage();
83
89
 
84
90
  for (const path of sitemapPaths) {
@@ -1,5 +1,5 @@
1
1
  /* eslint-env browser */
2
- import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
2
+ import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
3
3
  import { cleanUpAndExit, register, registerSoftClose } from '../utils.js';
4
4
  import constants, {
5
5
  getIntermediateScreenshotsPath,
@@ -60,6 +60,7 @@ const runCustom = async (
60
60
  blacklistedPatterns: string[] | null,
61
61
  includeScreenshots: boolean,
62
62
  initialCustomFlowLabel?: string,
63
+ extraHTTPHeaders?: Record<string, string>,
63
64
  ) => {
64
65
  // checks and delete datasets path if it already exists
65
66
  process.env.CRAWLEE_STORAGE_DIR = randomToken;
@@ -109,6 +110,8 @@ const runCustom = async (
109
110
  ...customArgs,
110
111
  ];
111
112
 
113
+ const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
114
+
112
115
  const context = await constants.launcher.launchPersistentContext(userDataDirectory, {
113
116
  ...baseLaunchOptions,
114
117
  args: mergedArgs,
@@ -118,8 +121,14 @@ const runCustom = async (
118
121
  viewport: null,
119
122
  ...(hasCustomViewport ? contextDeviceOptions : {}),
120
123
  userAgent: process.env.OOBEE_USER_AGENT || (deviceUserAgent as string | undefined),
124
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
125
+ ...(httpCredentials && { httpCredentials }),
121
126
  });
122
127
 
128
+ if (authHeader) {
129
+ await addAuthRouteHandler(context, url, authHeader);
130
+ }
131
+
123
132
  register(context);
124
133
 
125
134
  processPageParams.stopAll = async () => {
@@ -60,7 +60,7 @@ const SENTRY_NODE_VERSION: string = (() => {
60
60
  try {
61
61
  return _require('@sentry/node/package.json').version as string;
62
62
  } catch {
63
- return '9.47.1'; // safe fallback matching currently installed version
63
+ return '10.58.0'; // safe fallback matching currently installed version
64
64
  }
65
65
  })();
66
66
 
@@ -1,6 +1,7 @@
1
1
  import fs from 'fs-extra';
2
2
  import path from 'path';
3
3
  import readline from 'readline';
4
+ import { consoleLogger } from '../logs.js';
4
5
  import type { ItemsInfo } from './types.js';
5
6
 
6
7
  export interface ItemsStoreEntry {
@@ -16,6 +17,7 @@ export interface ItemsStoreEntry {
16
17
  export class ItemsStore {
17
18
  private basePath: string;
18
19
  private ensuredDirs = new Set<string>();
20
+ private fileWriteQueues = new Map<string, Promise<void>>();
19
21
 
20
22
  constructor(storagePath: string) {
21
23
  this.basePath = path.join(storagePath, 'tmp-items');
@@ -40,8 +42,29 @@ export class ItemsStore {
40
42
  async appendPageItems(category: string, ruleId: string, entry: ItemsStoreEntry): Promise<void> {
41
43
  await this.ensureDir(category);
42
44
  const filePath = this.getRuleFilePath(category, ruleId);
43
- const line = JSON.stringify(entry) + '\n';
44
- await fs.appendFile(filePath, line, 'utf8');
45
+ let line = JSON.stringify(entry);
46
+
47
+ // JSON.stringify should never produce literal newlines inside strings, but HTML content
48
+ // from page evaluation may contain edge-case characters (e.g. unescaped control chars in
49
+ // non-spec-compliant innerHTML). Escape any embedded \r or \n (as the two-character sequences \\r / \\n) so each entry stays on a single JSONL line for readline parsing.
50
+ line = line.replace(/[\n\r]/g, (match) => {
51
+ if (match === '\n') return '\\n';
52
+ if (match === '\r') return '\\r';
53
+ return match;
54
+ });
55
+ line += '\n';
56
+
57
+ // Serialize writes per rule file to avoid concurrent append interleaving/truncation.
58
+ const previous = this.fileWriteQueues.get(filePath) ?? Promise.resolve();
59
+ const next = previous.then(() => fs.appendFile(filePath, line, 'utf8'));
60
+ this.fileWriteQueues.set(
61
+ filePath,
62
+ next.catch(() => {
63
+ // Swallow the rejection only on the stored chain so later writes to this file still run; the caller still sees the error via `await next`.
64
+ }),
65
+ );
66
+
67
+ await next;
45
68
  }
46
69
 
47
70
  async *readRuleItems(category: string, ruleId: string): AsyncGenerator<ItemsStoreEntry> {
@@ -51,9 +74,19 @@ export class ItemsStore {
51
74
  const fileStream = fs.createReadStream(filePath, { encoding: 'utf8' });
52
75
  const rl = readline.createInterface({ input: fileStream, crlfDelay: Infinity });
53
76
 
77
+ let lineNumber = 0;
54
78
  for await (const line of rl) {
55
- if (line.trim()) {
79
+ lineNumber += 1;
80
+ if (!line.trim()) continue;
81
+
82
+ try {
56
83
  yield JSON.parse(line) as ItemsStoreEntry;
84
+ } catch (error) {
85
+ // Tolerate malformed/truncated JSONL lines (e.g. interrupted append) so report generation can continue.
86
+ const preview = line.slice(0, 200);
87
+ consoleLogger.warn(
88
+ `Skipping malformed itemsStore JSONL line ${lineNumber} in ${filePath}: ${(error as Error).message}. Content preview: ${preview}`,
89
+ );
57
90
  }
58
91
  }
59
92
  }
@@ -68,6 +101,7 @@ export class ItemsStore {
68
101
  }
69
102
 
70
103
  async cleanup(): Promise<void> {
104
+ await Promise.all(this.fileWriteQueues.values());
71
105
  await fs.rm(this.basePath, { recursive: true, force: true });
72
106
  }
73
107
  }