wb-browser-runtime 0.13.0 → 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -122,6 +122,8 @@ Verb arguments support two substitutions at dispatch time:
122
122
 
123
123
  Both forms are redacted in stdout summaries — only the verb name + selector make it into the log. Expanded values are also scrubbed from `verb.failed` / `slice.failed` error messages before they cross the stdio boundary.
124
124
 
125
+ **Escaping.** To emit a literal `{{ … }}` that should *not* be substituted, prefix it with a backslash: `\{{ env.X }}` is emitted as the literal text `{{ env.X }}`. Escapes are resolved in a single left-to-right pass, so the braces they produce are not re-scanned.
126
+
125
127
  **Missing-value policy.** Set `WB_SUBSTITUTION_ON_MISSING` to choose how a missing `env.X` or `artifacts.X` is handled:
126
128
 
127
129
  - `warn` (default) — log a stderr warning and substitute an empty string; the verb continues.
@@ -166,10 +168,16 @@ endpoint at session close. Recording is **off by default** — set
166
168
  | `WB_RECORDING_SCREENCAST_QUALITY` | `60` | JPEG quality (0–100). |
167
169
  | `WB_RECORDING_RRWEB` | `1` | Set `0` to skip rrweb even if recording is on. |
168
170
  | `WB_RECORDING_VIDEO` | `0` if no `ffmpeg` | Set `0` to skip video even if `ffmpeg` is present. |
171
+ | `WB_RECORDING_MASK_ALL_INPUTS` | `1` | rrweb `maskAllInputs`. Input masking is on by default for safety; set `0` to record input *values*. |
172
+ | `WB_RECORDING_MASK_TEXT_SELECTOR` | *(unset)* | CSS selector whose **text content** rrweb masks (e.g. `.ssn, .acct-balance`). |
173
+ | `WB_RECORDING_BLOCK_SELECTOR` | *(unset)* | CSS selector rrweb records as an inert placeholder (contents never captured). |
174
+ | `WB_RECORDING_IGNORE_SELECTOR` | *(unset)* | CSS selector for elements to exclude from the recording. **In this build it is applied as a block selector** (unioned with `WB_RECORDING_BLOCK_SELECTOR`): the matching element is recorded as an inert placeholder and its subtree/inputs are never captured. The vendored rrweb bundle does not support rrweb's `ignoreSelector` (which only drops input *events*), so we map this knob onto the supported, stronger `blockSelector` to honor the "drop this field" intent. |
169
175
 
170
176
  Artifacts are two parallel POSTs per session, `kind ∈ {rrweb, video}`:
171
177
 
172
- - **rrweb** — gzipped JSON (`application/json+gzip`) — `{ run_id, session, event_count, events: [...] }`. DOM mutations + input events captured from every page; defaults mask all inputs for PII.
178
+ - **rrweb** — gzipped JSON (`application/json+gzip`) — `{ run_id, session, event_count, events: [...] }`. DOM mutations + input events captured from every page.
179
+
180
+ **PII scope — read this.** `maskAllInputs` (on by default) only redacts the *values* a user types into form fields. It does **not** mask field labels, placeholders, `aria-label`s, `<option>` text, or any other rendered text, and it does not alter the recorded DOM structure. A displayed account number, balance, or name that is page text — not an input value — is captured verbatim. For those, point rrweb at the sensitive nodes with `WB_RECORDING_MASK_TEXT_SELECTOR` (mask the text) or `WB_RECORDING_BLOCK_SELECTOR` (omit the subtree). `WB_RECORDING_IGNORE_SELECTOR` is treated as an additional block selector in this build — it is unioned with `WB_RECORDING_BLOCK_SELECTOR` rather than replacing it (the vendored rrweb bundle has no `ignoreSelector` support), so a field named there is excluded from the recording entirely rather than merely having its input events dropped. When in doubt, block the region.
173
181
  - **video** — VP9 WebM (`video/webm`) — encoded from JPEG screencast frames via `ffmpeg`. Requires `ffmpeg` on `$PATH` (droplet install: `apt-get install -y ffmpeg`). If `ffmpeg` is missing the video kind silently disables and rrweb continues alone.
174
182
 
175
183
  Each POST carries headers `Authorization: Bearer <secret>`,
@@ -208,7 +216,7 @@ example, see the `browserbase-hn-upvoted-probe` runbook in the xatabase repo.
208
216
  | `assert` | `assert: <selector>` | `selector`, `text_contains`, `url_contains` |
209
217
  | `eval` | `eval: <js>` | `script` |
210
218
  | `save` | `save: <name>` | `name`, `value` (captures prior `extract`/`eval` when omitted) |
211
- | `download` | `download: <selector>` | `selector`, `path`, `timeout`, `text_fallback` (clicks + races Playwright `download` event with in-page blob/anchor capture; saves into `$WB_ARTIFACTS_DIR/<path>`) |
219
+ | `download` | `download: <selector>` | `selector`, `path`, `timeout`, `text_fallback`, `signed_url` (clicks + races Playwright `download` event, in-page blob/anchor capture, and signed-URL export capture; saves into `$WB_ARTIFACTS_DIR/<path>`) |
212
220
 
213
221
  `extract`'s `fields` entries are either a CSS selector string (returns
214
222
  `textContent`), or `{ selector, attr }` to read an attribute.
@@ -308,9 +316,55 @@ Behaviour:
308
316
  doesn't double-save.
309
317
  - Emits `slice.artifact_saved` with `source: "download"` and
310
318
  `provenance.verb_name: "download"`.
311
- - On timeout: throws with diagnostics (page URL, selector, both
319
+ - On timeout: throws with diagnostics (page URL, selector, all
312
320
  failure reasons) AND emits a `slice.download_failed` frame.
313
321
 
322
+ #### Signed-URL export capture
323
+
324
+ Some SaaS "Download" buttons never trip a Playwright `download` event or an
325
+ in-page Blob. Instead the click calls a same-origin API that returns JSON like
326
+ `{ "download_url": "https://bucket.s3.amazonaws.com/…?<signed>" }` and then
327
+ navigates to that URL — and a page-side `fetch(signedUrl)` fails because the
328
+ object store's CORS policy won't let the app origin read the bytes.
329
+
330
+ The `download:` verb adds a **third** capture racer for this: it wraps the
331
+ page's `fetch`/`XHR` around the click, inspects small same-origin JSON
332
+ responses for URL-looking fields, and when it finds one pointing at a
333
+ recognized object-store host (S3, GCS, CloudFront, Azure Blob, R2), it
334
+ downloads the bytes **from the sidecar process** (where CORS doesn't apply)
335
+ and saves them like any other artifact.
336
+
337
+ This is **on by default in `auto` mode** — it only fires when a recognized
338
+ signed host appears in a JSON response around the click, so a normal
339
+ Playwright/blob download is unaffected. Tune or disable it per verb:
340
+
341
+ ```yaml
342
+ - download:
343
+ selector: 'button:has-text("Download as xlsx")'
344
+ path: pilot-profit-loss.xlsx
345
+ timeout: 10s
346
+ signed_url:
347
+ enabled: true # true | false | auto (default auto)
348
+ hosts: # extra non-recognized hosts to accept
349
+ - pilot-report-downloads.s3.amazonaws.com
350
+ json_fields: # restrict to these response field names
351
+ - download_url
352
+ ```
353
+
354
+ - Set `signed_url: false` to turn the capture off entirely for a verb.
355
+ - In `auto` mode only recognized object-store hosts (or an explicit `hosts:`
356
+ entry) are fetched. With `enabled: true` an explicit `hosts:`/`json_fields:`
357
+ match is honored even for an unrecognized host, since you named it.
358
+ - The captured URL's **query string (where signed credentials live) is
359
+ redacted** everywhere it crosses the stdio boundary — `provenance.signed_url`
360
+ is `origin+path?<redacted>`; the full URL stays only in sidecar memory for the
361
+ fetch. Honors `WB_BROWSER_DOWNLOAD_EXTENSIONS`.
362
+ - The saved frame carries `provenance.capture: "signed_url"` plus `api_url`,
363
+ `field`, `content_type`, and `content_disposition`.
364
+ - A 403 on the signed URL (expired token) emits `slice.download_failed` with
365
+ `expired: true` and `http_status: 403` so the operator knows to shorten the
366
+ click→fetch gap.
367
+
314
368
  ## Protocol
315
369
 
316
370
  Line-framed JSON, one message per line, on stdin/stdout. `stderr` is treated as
@@ -321,9 +375,26 @@ opaque diagnostics by `wb` and printed dimmed to the user's terminal.
321
375
  ```
322
376
  wb → {"type": "hello", "wb_version": "...", "protocol": "wb-sidecar/1"}
323
377
  wb ← {"type": "ready", "runtime": "wb-browser-runtime", "version": "...",
324
- "protocol": "wb-sidecar/1", "supports": ["goto", "click", "fill", ...]}
378
+ "protocol": "wb-sidecar/1", "min_protocol": "wb-sidecar/1",
379
+ "supports": ["goto", "click", "fill", ...],
380
+ "features": ["recording", "pause", "substitution",
381
+ "substitution_escape", "download_capture",
382
+ "signed_url_download"]}
325
383
  ```
326
384
 
385
+ The `ready` frame advertises capabilities so a client can feature-detect
386
+ without a hard-coded version→capability map:
387
+
388
+ - `protocol` — the wire version this runtime speaks.
389
+ - `min_protocol` — the oldest protocol version it can still interoperate with
390
+ (equal to `protocol` until a breaking frame change ships). A client speaking
391
+ an older protocol than `min_protocol` should refuse rather than guess.
392
+ - `supports` — the per-verb list (derived from the verb registry).
393
+ - `features` — coarse capability tokens above the verb list.
394
+
395
+ `version` is read from `package.json` at boot, so it can never drift from the
396
+ published version.
397
+
327
398
  ### Slice
328
399
 
329
400
  ```
@@ -25,9 +25,7 @@
25
25
 
26
26
  import readline from "node:readline";
27
27
  import { chromium } from "playwright-core";
28
- import { readFileSync } from "node:fs";
29
28
  import { send, log } from "../lib/io.js";
30
- import { resolveInside } from "../lib/util.js";
31
29
  import { SessionManager } from "../lib/session-manager.js";
32
30
  import {
33
31
  RecordingManager,
@@ -40,9 +38,31 @@ import {
40
38
  classifyError,
41
39
  } from "../lib/failure.js";
42
40
  import { installDownloadCapture } from "../lib/download-capture.js";
41
+ import { expand, scrubSecrets } from "../lib/substitution.js";
43
42
  import { SUPPORTS, runVerb, verbName } from "../verbs/index.js";
43
+ import pkg from "../package.json" with { type: "json" };
44
+
45
+ // Read the version from package.json so the `ready` frame can never drift from
46
+ // the published version (it used to be a hand-maintained literal that fell out
47
+ // of sync). Node >=24 supports JSON import attributes natively.
48
+ const VERSION = pkg.version;
49
+
50
+ // Protocol capability advertisement. `protocol` is the wire version we speak;
51
+ // `min_protocol` is the oldest version a peer may speak and still interoperate
52
+ // (we keep it equal to `protocol` until we ship a breaking frame change).
53
+ // `features` is a coarse capability list above the per-verb `supports` array —
54
+ // a client can feature-detect without hard-coding a version→capability map.
55
+ const PROTOCOL = "wb-sidecar/1";
56
+ const MIN_PROTOCOL = "wb-sidecar/1";
57
+ const FEATURES = [
58
+ "recording", // rrweb DOM capture + CDP screencast video
59
+ "pause", // pause_for_human operator handoff
60
+ "substitution", // {{ env.X }} / {{ artifacts.X }}
61
+ "substitution_escape", // \{{ literal-brace escape
62
+ "download_capture", // passive + explicit download artifact capture
63
+ "signed_url_download", // server-side fetch of in-JSON signed export URLs
64
+ ];
44
65
 
45
- const VERSION = "0.8.0";
46
66
  const provider = getProvider();
47
67
  log(`[provider] ${provider.name}`);
48
68
 
@@ -158,108 +178,9 @@ async function ensureSession(name, { profile, restoreSession } = {}) {
158
178
  }
159
179
  });
160
180
  }
161
- // --- {{ env.X }} / {{ artifacts.X }} substitution --------------------------
162
-
163
- const ENV_RE = /\{\{\s*env\.([A-Za-z_][A-Za-z0-9_]*)\s*\}\}/g;
164
- // Artifact names are bare identifiers — no dots, no slashes. Anything more
165
- // exotic would invite path traversal once composed with WB_ARTIFACTS_DIR.
166
- const ARTIFACT_RE = /\{\{\s*artifacts\.([A-Za-z_][A-Za-z0-9_-]*)\s*\}\}/g;
167
-
168
- // Resolved once at module load. `warn` matches historical behavior
169
- // (log + empty string, runbook continues). `error` throws so a missing OTP
170
- // or env var fails the slice instead of silently sending an empty value
171
- // into a Playwright action. `empty` is the silent variant.
172
- const ON_MISSING = (() => {
173
- const raw = (process.env.WB_SUBSTITUTION_ON_MISSING || "warn")
174
- .trim()
175
- .toLowerCase();
176
- if (raw === "error" || raw === "empty" || raw === "warn") return raw;
177
- log(
178
- `[warn] WB_SUBSTITUTION_ON_MISSING=${raw} is not valid (warn|error|empty); defaulting to warn`,
179
- );
180
- return "warn";
181
- })();
182
-
183
- function handleMissingSubstitution(kind, name) {
184
- const msg = `${kind}.${name} is not set`;
185
- if (ON_MISSING === "error") {
186
- throw new Error(`substitution: ${msg}`);
187
- }
188
- if (ON_MISSING === "warn") {
189
- log(`[warn] ${msg}; substituting empty string`);
190
- }
191
- return "";
192
- }
193
-
194
- function readArtifactRaw(name) {
195
- const dir = (process.env.WB_ARTIFACTS_DIR || "").trim();
196
- if (!dir) {
197
- log(`[warn] artifacts.${name} referenced but WB_ARTIFACTS_DIR is not set`);
198
- return null;
199
- }
200
- for (const candidate of [`${name}.txt`, name]) {
201
- const full = resolveInside(dir, candidate);
202
- if (!full) continue;
203
- try {
204
- return readFileSync(full, "utf8").trimEnd();
205
- } catch {
206
- // try next candidate
207
- }
208
- }
209
- return null;
210
- }
211
-
212
- function readArtifact(name, cache) {
213
- if (cache && cache.has(name)) {
214
- const hit = cache.get(name);
215
- if (hit === null) return handleMissingSubstitution("artifacts", name);
216
- return hit;
217
- }
218
- const v = readArtifactRaw(name);
219
- if (cache) cache.set(name, v);
220
- if (v === null) return handleMissingSubstitution("artifacts", name);
221
- return v;
222
- }
223
-
224
- function expand(value, collected, artifactCache) {
225
- if (typeof value === "string") {
226
- return value
227
- .replace(ENV_RE, (_, name) => {
228
- const v = process.env[name];
229
- if (v === undefined) return handleMissingSubstitution("env", name);
230
- if (collected && v.length >= 3) collected.add(v);
231
- return v;
232
- })
233
- .replace(ARTIFACT_RE, (_, name) => {
234
- const v = readArtifact(name, artifactCache);
235
- if (collected && v && v.length >= 3) collected.add(v);
236
- return v;
237
- });
238
- }
239
- if (Array.isArray(value))
240
- return value.map((v) => expand(v, collected, artifactCache));
241
- if (value && typeof value === "object") {
242
- const out = {};
243
- for (const [k, v] of Object.entries(value))
244
- out[k] = expand(v, collected, artifactCache);
245
- return out;
246
- }
247
- return value;
248
- }
249
-
250
- // Scrub any values that came from {{ env.X }} / {{ artifacts.X }} expansion
251
- // out of error messages before they cross the stdio boundary — Playwright and
252
- // fetch errors sometimes echo their inputs (URLs, script bodies, assertion
253
- // text) and those inputs may contain credentials.
254
- function scrubSecrets(msg, secrets) {
255
- let out = String(msg == null ? "" : msg);
256
- if (!secrets) return out;
257
- for (const s of secrets) {
258
- if (!s) continue;
259
- out = out.split(s).join("«***»");
260
- }
261
- return out;
262
- }
181
+ // {{ env.X }} / {{ artifacts.X }} substitution + `\{{` escape + secret scrubbing
182
+ // live in lib/substitution.js (extracted so they're unit-testable without
183
+ // booting the sidecar).
263
184
 
264
185
  // --- Slice handler ----------------------------------------------------------
265
186
 
@@ -559,8 +480,10 @@ rl.on("line", (line) => {
559
480
  type: "ready",
560
481
  runtime: "wb-browser-runtime",
561
482
  version: VERSION,
562
- protocol: "wb-sidecar/1",
483
+ protocol: PROTOCOL,
484
+ min_protocol: MIN_PROTOCOL,
563
485
  supports: SUPPORTS,
486
+ features: FEATURES,
564
487
  });
565
488
  break;
566
489
  case "slice":
package/lib/http.js CHANGED
@@ -1,3 +1,5 @@
1
+ import dns from "node:dns";
2
+ import { isIP } from "node:net";
1
3
  import { log } from "./io.js";
2
4
 
3
5
  export async function safeText(res) {
@@ -8,6 +10,138 @@ export async function safeText(res) {
8
10
  }
9
11
  }
10
12
 
13
+ // --- Body-read timeout handoff ---------------------------------------------
14
+ //
15
+ // retryableFetch's AbortController timer normally stays armed only until fetch() resolves
16
+ // (headers received) and is then cleared in `finally`. That leaves the *body*
17
+ // read unbounded: a server can dribble bytes forever after sending headers.
18
+ //
19
+ // `keepBodyTimeout: true` is an opt-in for callers that consume the body
20
+ // themselves (the signed-URL download path). When set, on a successful (2xx)
21
+ // response we do NOT clear the timer — instead we stash the timer + controller
22
+ // in a WeakMap keyed by the Response so the caller can either:
23
+ // - releaseBodyTimeout(res): clear it once the body is fully consumed, or
24
+ // - abortBody(res): abort the in-flight body read (e.g. size cap tripped).
25
+ // If the caller never releases, the timer still fires and aborts the socket,
26
+ // so a hung body read can't wedge the process. Other callers (default
27
+ // keepBodyTimeout=false) are unaffected — their timer is cleared as before.
28
+ const bodyTimers = new WeakMap();
29
+
30
+ export function releaseBodyTimeout(res) {
31
+ const entry = bodyTimers.get(res);
32
+ if (entry) {
33
+ clearTimeout(entry.timer);
34
+ bodyTimers.delete(res);
35
+ }
36
+ }
37
+
38
+ export function abortBody(res) {
39
+ const entry = bodyTimers.get(res);
40
+ if (entry) {
41
+ try {
42
+ entry.controller.abort();
43
+ } catch {}
44
+ }
45
+ }
46
+
47
+ // Best-effort cancel/drain of a response body so a non-OK or redirect response
48
+ // doesn't leak the underlying socket while we throw or follow a redirect.
49
+ export async function drainResponseBody(res) {
50
+ try {
51
+ if (res?.body?.cancel) {
52
+ await res.body.cancel();
53
+ } else if (res?.body) {
54
+ // Fall back to consuming it if cancel() isn't available.
55
+ await res.arrayBuffer().catch(() => {});
56
+ }
57
+ } catch {}
58
+ }
59
+
60
+ // --- SSRF guard: private/loopback/link-local IP detection ------------------
61
+
62
+ function isPrivateIPv4(addr) {
63
+ const m = /^(\d+)\.(\d+)\.(\d+)\.(\d+)$/.exec(addr);
64
+ if (!m) return false;
65
+ const a = Number(m[1]);
66
+ const b = Number(m[2]);
67
+ if (a === 0) return true; // 0.0.0.0/8 (includes the unspecified address)
68
+ if (a === 127) return true; // 127.0.0.0/8 loopback
69
+ if (a === 10) return true; // 10.0.0.0/8
70
+ if (a === 172 && b >= 16 && b <= 31) return true; // 172.16.0.0/12
71
+ if (a === 192 && b === 168) return true; // 192.168.0.0/16
72
+ if (a === 169 && b === 254) return true; // 169.254.0.0/16 link-local
73
+ return false;
74
+ }
75
+
76
+ function isPrivateIPv6(addr) {
77
+ let s = String(addr).toLowerCase();
78
+ const pct = s.indexOf("%");
79
+ if (pct >= 0) s = s.slice(0, pct); // strip zone id
80
+ if (s === "::1") return true; // loopback
81
+ if (s === "::") return true; // unspecified
82
+ // IPv4-mapped / IPv4-embedded (e.g. ::ffff:127.0.0.1, ::127.0.0.1)
83
+ const v4 = /:(\d+\.\d+\.\d+\.\d+)$/.exec(s);
84
+ if (v4 && isPrivateIPv4(v4[1])) return true;
85
+ const first = s.split(":")[0];
86
+ if (/^f[cd]/.test(first)) return true; // fc00::/7 unique-local
87
+ if (/^fe[89ab]/.test(first)) return true; // fe80::/10 link-local
88
+ return false;
89
+ }
90
+
91
+ // True if a literal IP address falls in a private/loopback/link-local range.
92
+ export function isPrivateIp(addr) {
93
+ const fam = isIP(addr);
94
+ if (fam === 4) return isPrivateIPv4(addr);
95
+ if (fam === 6) return isPrivateIPv6(addr);
96
+ return false;
97
+ }
98
+
99
+ // Validate a single URL as an allowed download target. Applies the caller's
100
+ // host allowlist (the SAME check used on the initial URL — this is what makes
101
+ // redirect following safe) and, unless explicitly allowed, rejects any host
102
+ // that is a private IP literal or resolves to one (DNS rebinding / SSRF).
103
+ // Throws on rejection; resolves on success.
104
+ export async function assertAllowedTarget(
105
+ urlStr,
106
+ { validateHost = null, allowPrivateIp = false } = {},
107
+ ) {
108
+ let u;
109
+ try {
110
+ u = new URL(urlStr);
111
+ } catch {
112
+ throw new Error(`blocked target: unparseable URL`);
113
+ }
114
+ if (u.protocol !== "http:" && u.protocol !== "https:") {
115
+ throw new Error(`blocked target: unsupported scheme "${u.protocol}"`);
116
+ }
117
+ const host = u.host.toLowerCase(); // includes port — matches the picker
118
+ const hostname = u.hostname.toLowerCase();
119
+ if (validateHost && !validateHost(host)) {
120
+ throw new Error(`blocked target: host not allowed: ${host}`);
121
+ }
122
+ if (allowPrivateIp) return;
123
+ if (isIP(hostname)) {
124
+ if (isPrivateIp(hostname)) {
125
+ throw new Error(`blocked target: private/loopback IP ${hostname}`);
126
+ }
127
+ return;
128
+ }
129
+ let results;
130
+ try {
131
+ results = await dns.promises.lookup(hostname, { all: true });
132
+ } catch (e) {
133
+ // Fail closed: a host we can't resolve isn't a host we should fetch.
134
+ throw new Error(`blocked target: could not resolve ${hostname}: ${e?.message || e}`);
135
+ }
136
+ for (const r of results) {
137
+ if (isPrivateIp(r.address)) {
138
+ throw new Error(
139
+ `blocked target: ${hostname} resolves to private/loopback IP ${r.address}`,
140
+ );
141
+ }
142
+ }
143
+ }
144
+
11
145
  // Retry transient network + 5xx/429 failures with short exponential backoff.
12
146
  // Each attempt gets its own AbortController + timeout; caller-passed signals
13
147
  // are not plumbed through since we don't have a cancellation story above this
@@ -17,11 +151,15 @@ export async function safeText(res) {
17
151
  // `bodyFactory`, when set, is invoked per attempt to produce a fresh body —
18
152
  // required for streaming uploads where the previous attempt consumed the
19
153
  // stream. Takes precedence over opts.body.
154
+ //
155
+ // `keepBodyTimeout`, when set, hands the attempt's abort timer to the caller on
156
+ // a successful (2xx) response instead of clearing it, so the body-read window
157
+ // stays bounded. See releaseBodyTimeout / abortBody above.
20
158
  export async function retryableFetch(
21
159
  url,
22
160
  opts = {},
23
161
  label,
24
- { timeoutMs = 30_000, bodyFactory = null } = {},
162
+ { timeoutMs = 30_000, bodyFactory = null, keepBodyTimeout = false } = {},
25
163
  ) {
26
164
  const delays = [100, 500];
27
165
  let lastErr = null;
@@ -36,6 +174,7 @@ export async function retryableFetch(
36
174
  }
37
175
  const controller = new AbortController();
38
176
  const timer = setTimeout(() => controller.abort(), timeoutMs);
177
+ let handedOff = false;
39
178
  try {
40
179
  const fetchOpts = { ...opts, signal: controller.signal };
41
180
  if (bodyFactory) {
@@ -45,7 +184,14 @@ export async function retryableFetch(
45
184
  fetchOpts.duplex = "half";
46
185
  }
47
186
  const res = await fetch(url, fetchOpts);
48
- if (res.ok) return res;
187
+ if (res.ok) {
188
+ if (keepBodyTimeout) {
189
+ // Keep the timer armed until the caller consumes the body.
190
+ handedOff = true;
191
+ bodyTimers.set(res, { timer, controller });
192
+ }
193
+ return res;
194
+ }
49
195
  if (res.status === 429 || (res.status >= 500 && res.status < 600)) {
50
196
  lastRes = res;
51
197
  continue;
@@ -55,9 +201,60 @@ export async function retryableFetch(
55
201
  lastErr = e;
56
202
  continue;
57
203
  } finally {
58
- clearTimeout(timer);
204
+ if (!handedOff) clearTimeout(timer);
59
205
  }
60
206
  }
61
207
  if (lastRes) return lastRes;
62
208
  throw lastErr;
63
209
  }
210
+
211
+ export const MAX_DOWNLOAD_REDIRECTS = 5;
212
+
213
+ // Fetch a download target with manual redirect handling and an SSRF guard.
214
+ // Every hop (the initial URL and each Location target) is re-validated with
215
+ // assertAllowedTarget before it is fetched, so a 3xx to an unvalidated or
216
+ // private host is rejected instead of silently followed. Redirect bodies are
217
+ // drained between hops. Returns the final (non-redirect) Response; the caller
218
+ // owns the body (use keepBodyTimeout semantics: releaseBodyTimeout when done).
219
+ export async function guardedDownloadFetch(
220
+ url,
221
+ {
222
+ timeoutMs = 30_000,
223
+ validateHost = null,
224
+ allowPrivateIp = false,
225
+ maxRedirects = MAX_DOWNLOAD_REDIRECTS,
226
+ label,
227
+ } = {},
228
+ ) {
229
+ let current = url;
230
+ for (let hop = 0; ; hop++) {
231
+ await assertAllowedTarget(current, { validateHost, allowPrivateIp });
232
+ const res = await retryableFetch(
233
+ current,
234
+ { method: "GET", redirect: "manual" },
235
+ label,
236
+ { timeoutMs, keepBodyTimeout: true },
237
+ );
238
+ const status = res.status;
239
+ if (status >= 300 && status < 400) {
240
+ const loc = res.headers?.get?.("location");
241
+ if (loc) {
242
+ // A 3xx is not res.ok, so it was never handed off — its timer is
243
+ // already cleared. Drain the redirect body and re-validate the target.
244
+ await drainResponseBody(res);
245
+ if (hop >= maxRedirects) {
246
+ throw new Error(`too many redirects (> ${maxRedirects})`);
247
+ }
248
+ let next;
249
+ try {
250
+ next = new URL(loc, current).toString();
251
+ } catch {
252
+ throw new Error(`blocked target: unparseable redirect Location`);
253
+ }
254
+ current = next;
255
+ continue;
256
+ }
257
+ }
258
+ return res;
259
+ }
260
+ }
@@ -148,6 +148,44 @@ export function loadRecordingConfig() {
148
148
  kinds,
149
149
  rrwebSource,
150
150
  rrwebMaxEvents,
151
+ mask: loadMaskConfig(),
152
+ };
153
+ }
154
+
155
+ // rrweb's `maskAllInputs` only redacts input *values* — labels, placeholders,
156
+ // aria-labels, option text, and the full DOM structure are still recorded. For
157
+ // genuinely sensitive regions (a displayed SSN, an account balance) the author
158
+ // must point rrweb at the offending nodes with a CSS selector. These knobs
159
+ // expose rrweb's selector options without hard-coding them:
160
+ // WB_RECORDING_MASK_ALL_INPUTS default on; set "0" to record input values
161
+ // WB_RECORDING_MASK_TEXT_SELECTOR text content of matches → asterisks
162
+ // WB_RECORDING_BLOCK_SELECTOR matches recorded as inert placeholders
163
+ // WB_RECORDING_IGNORE_SELECTOR matches excluded from the recording
164
+ //
165
+ // Privacy note: the vendored `vendor/rrweb-record.min.js` bundle supports
166
+ // `blockSelector` but NOT `ignoreSelector` — passing `ignoreSelector` to this
167
+ // build is a silent no-op, so an operator relying on it to drop a sensitive
168
+ // field would have that value recorded verbatim. To keep the "drop this field"
169
+ // promise on the shipped binary, WB_RECORDING_IGNORE_SELECTOR is folded into the
170
+ // (supported, and strictly stronger) `blockSelector`: a blocked element is not
171
+ // recorded at all — its subtree and inputs are never captured. The env var name
172
+ // is kept for compatibility; both selectors are unioned so neither is lost.
173
+ export function loadMaskConfig() {
174
+ const sel = (name) => {
175
+ const v = (process.env[name] || "").trim();
176
+ return v || null;
177
+ };
178
+ // Comma-join the explicit block selector with the ignore selector so the
179
+ // ignore intent is honored via a mechanism the vendored bundle actually
180
+ // supports. Either, both, or neither may be set.
181
+ const blockParts = [
182
+ sel("WB_RECORDING_BLOCK_SELECTOR"),
183
+ sel("WB_RECORDING_IGNORE_SELECTOR"),
184
+ ].filter(Boolean);
185
+ return {
186
+ maskAllInputs: process.env.WB_RECORDING_MASK_ALL_INPUTS !== "0",
187
+ maskTextSelector: sel("WB_RECORDING_MASK_TEXT_SELECTOR"),
188
+ blockSelector: blockParts.length ? blockParts.join(", ") : null,
151
189
  };
152
190
  }
153
191
 
@@ -221,17 +259,30 @@ export class RecordingManager {
221
259
  for (const e of batch) pushRrweb(e);
222
260
  }
223
261
  });
262
+ // Build rrweb record options from the resolved mask config. Selector
263
+ // options are omitted entirely when unset so we don't pass `null` into
264
+ // rrweb (which would match nothing but still allocate a matcher).
265
+ const mask = cfg.mask || { maskAllInputs: true };
266
+ const recordOpts = {
267
+ maskAllInputs: mask.maskAllInputs !== false,
268
+ };
269
+ if (mask.maskTextSelector)
270
+ recordOpts.maskTextSelector = mask.maskTextSelector;
271
+ // `blockSelector` already folds in WB_RECORDING_IGNORE_SELECTOR (see
272
+ // loadMaskConfig). We never pass `ignoreSelector` — the vendored rrweb
273
+ // bundle does not support it, so it would be silently dropped.
274
+ if (mask.blockSelector) recordOpts.blockSelector = mask.blockSelector;
275
+ const recordOptsJson = JSON.stringify(recordOpts);
224
276
  const bootstrap = `
225
277
  ;(function(){
226
278
  if (window.__wbRrwebActive) return;
227
279
  window.__wbRrwebActive = true;
228
280
  window.__wbRrwebBuffer = [];
229
281
  try {
230
- rrwebRecord({
282
+ rrwebRecord(Object.assign({
231
283
  emit: function(event){ window.__wbRrwebBuffer.push(event); },
232
- sampling: { scroll: 150, media: 800, input: 'last' },
233
- maskAllInputs: true
234
- });
284
+ sampling: { scroll: 150, media: 800, input: 'last' }
285
+ }, ${recordOptsJson}));
235
286
  } catch (e) { /* rrweb unavailable on this page (e.g. chrome://) */ }
236
287
  var flush = function(){
237
288
  var buf = window.__wbRrwebBuffer;