wb-browser-runtime 0.14.0 → 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -171,13 +171,13 @@ endpoint at session close. Recording is **off by default** — set
171
171
  | `WB_RECORDING_MASK_ALL_INPUTS` | `1` | rrweb `maskAllInputs`. Set `0` to record input *values* (off by default for safety). |
172
172
  | `WB_RECORDING_MASK_TEXT_SELECTOR` | *(unset)* | CSS selector whose **text content** rrweb masks (e.g. `.ssn, .acct-balance`). |
173
173
  | `WB_RECORDING_BLOCK_SELECTOR` | *(unset)* | CSS selector rrweb records as an inert placeholder (contents never captured). |
174
- | `WB_RECORDING_IGNORE_SELECTOR` | *(unset)* | CSS selector whose input events rrweb drops entirely. |
174
+ | `WB_RECORDING_IGNORE_SELECTOR` | *(unset)* | CSS selector for elements to exclude from the recording. **In this build it is applied as a block selector** (unioned with `WB_RECORDING_BLOCK_SELECTOR`): the matching element is recorded as an inert placeholder and its subtree/inputs are never captured. The vendored rrweb bundle does not support rrweb's `ignoreSelector` (which only drops input *events*), so we map this knob onto the supported, stronger `blockSelector` to honor the "drop this field" intent. |
175
175
 
176
176
  Artifacts are two parallel POSTs per session, `kind ∈ {rrweb, video}`:
177
177
 
178
178
  - **rrweb** — gzipped JSON (`application/json+gzip`) — `{ run_id, session, event_count, events: [...] }`. DOM mutations + input events captured from every page.
179
179
 
180
- **PII scope — read this.** `maskAllInputs` (on by default) only redacts the *values* a user types into form fields. It does **not** mask field labels, placeholders, `aria-label`s, `<option>` text, or any other rendered text, and it does not alter the recorded DOM structure. A displayed account number, balance, or name that is page text — not an input value — is captured verbatim. For those, point rrweb at the sensitive nodes with `WB_RECORDING_MASK_TEXT_SELECTOR` (mask the text) or `WB_RECORDING_BLOCK_SELECTOR` (omit the subtree). When in doubt, block the region.
180
+ **PII scope — read this.** `maskAllInputs` (on by default) only redacts the *values* a user types into form fields. It does **not** mask field labels, placeholders, `aria-label`s, `<option>` text, or any other rendered text, and it does not alter the recorded DOM structure. A displayed account number, balance, or name that is page text — not an input value — is captured verbatim. For those, point rrweb at the sensitive nodes with `WB_RECORDING_MASK_TEXT_SELECTOR` (mask the text) or `WB_RECORDING_BLOCK_SELECTOR` (omit the subtree). `WB_RECORDING_IGNORE_SELECTOR` is treated as an alias for `WB_RECORDING_BLOCK_SELECTOR` in this build (the vendored rrweb bundle has no `ignoreSelector` support), so a field named there is excluded from the recording entirely rather than merely having its input events dropped. When in doubt, block the region.
181
181
  - **video** — VP9 WebM (`video/webm`) — encoded from JPEG screencast frames via `ffmpeg`. Requires `ffmpeg` on `$PATH` (droplet install: `apt-get install -y ffmpeg`). If `ffmpeg` is missing the video kind silently disables and rrweb continues alone.
182
182
 
183
183
  Each POST carries headers `Authorization: Bearer <secret>`,
package/lib/http.js CHANGED
@@ -1,3 +1,5 @@
1
+ import dns from "node:dns";
2
+ import { isIP } from "node:net";
1
3
  import { log } from "./io.js";
2
4
 
3
5
  export async function safeText(res) {
@@ -8,6 +10,138 @@ export async function safeText(res) {
8
10
  }
9
11
  }
10
12
 
13
+ // --- Body-read timeout handoff ---------------------------------------------
14
+ //
15
+ // retryableFetch's AbortController timer normally fires until fetch() resolves
16
+ // (headers received) and is then cleared in `finally`. That leaves the *body*
17
+ // read unbounded: a server can dribble bytes forever after sending headers.
18
+ //
19
+ // `keepBodyTimeout: true` is an opt-in for callers that consume the body
20
+ // themselves (the signed-URL download path). When set, on a successful (2xx)
21
+ // response we do NOT clear the timer — instead we stash the timer + controller
22
+ // in a WeakMap keyed by the Response so the caller can either:
23
+ // - releaseBodyTimeout(res): clear it once the body is fully consumed, or
24
+ // - abortBody(res): abort the in-flight body read (e.g. size cap tripped).
25
+ // If the caller never releases, the timer still fires and aborts the socket,
26
+ // so a hung body read can't wedge the process. Other callers (default
27
+ // keepBodyTimeout=false) are unaffected — their timer is cleared as before.
28
+ const bodyTimers = new WeakMap();
29
+
30
// Disarm the body-read timeout that was handed off for `res`.
//
// Looks up the { timer, controller } entry stored in `bodyTimers` for this
// Response, clears the timer and forgets the entry. A Response with no entry
// (never handed off, or already released) is a no-op, so calling this more
// than once is harmless.
export function releaseBodyTimeout(res) {
  const handoff = bodyTimers.get(res);
  if (!handoff) return;
  clearTimeout(handoff.timer);
  bodyTimers.delete(res);
}
37
+
38
// Abort the in-flight body read for `res` by aborting the AbortController that
// was stored for it in `bodyTimers`.
//
// Best-effort: a Response with no stored entry is ignored, and any exception
// thrown by abort() is swallowed. The entry itself is left in place — the timer
// is only cleared by releaseBodyTimeout.
export function abortBody(res) {
  const handoff = bodyTimers.get(res);
  if (!handoff) return;
  try {
    handoff.controller.abort();
  } catch {
    // Deliberately ignored: aborting is best-effort.
  }
}
46
+
47
+ // Best-effort cancel/drain of a response body so a non-OK or redirect response
48
+ // doesn't leak the underlying socket while we throw or follow a redirect.
49
// Best-effort release of a response body we are not going to read.
//
// Prefers `body.cancel()`; when the body has no cancel method it falls back to
// consuming the body via `res.arrayBuffer()`. Never throws: a missing response,
// a missing body, and any failure while cancelling/consuming are all ignored.
// Always resolves to undefined.
export async function drainResponseBody(res) {
  try {
    const body = res?.body;
    if (!body) return;
    if (body.cancel) {
      await body.cancel();
      return;
    }
    // No cancel() available — read the body to completion instead.
    await res.arrayBuffer().catch(() => {});
  } catch {
    // Deliberately ignored: draining is best-effort.
  }
}
59
+
60
+ // --- SSRF guard: private/loopback/link-local IP detection ------------------
61
+
62
// True if a dotted-quad IPv4 string is in a range that must never be used as a
// download target: unspecified, loopback, RFC 1918 private, link-local,
// carrier-grade NAT shared space, benchmarking space, multicast, reserved, or
// broadcast. Returns false for anything that is not four dot-separated decimal
// groups (callers are expected to have validated the literal with isIP first).
function isPrivateIPv4(addr) {
  const m = /^(\d+)\.(\d+)\.(\d+)\.(\d+)$/.exec(addr);
  if (!m) return false;
  const a = Number(m[1]);
  const b = Number(m[2]);
  if (a === 0) return true; // 0.0.0.0/8 (includes the unspecified address)
  if (a === 10) return true; // 10.0.0.0/8
  // 100.64.0.0/10 shared address space (CGNAT). Not publicly routable and used
  // for internal endpoints by some providers, so it belongs in an SSRF guard.
  if (a === 100 && b >= 64 && b <= 127) return true;
  if (a === 127) return true; // 127.0.0.0/8 loopback
  if (a === 169 && b === 254) return true; // 169.254.0.0/16 link-local
  if (a === 172 && b >= 16 && b <= 31) return true; // 172.16.0.0/12
  if (a === 192 && b === 168) return true; // 192.168.0.0/16
  if (a === 198 && (b === 18 || b === 19)) return true; // 198.18.0.0/15 benchmarking
  // 224.0.0.0/4 multicast, 240.0.0.0/4 reserved, 255.255.255.255 broadcast.
  if (a >= 224) return true;
  return false;
}
75
+
76
// Expand a lowercase, zone-less IPv6 literal into its eight 16-bit groups.
// Accepts `::` compression and a trailing dotted-quad. Returns null when the
// text cannot be expanded (the caller then relies on its textual checks only).
function expandIPv6Groups(text) {
  let s = text;
  const dotted = /^(.*:)(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(s);
  if (dotted) {
    const octets = dotted.slice(2).map(Number);
    if (octets.some((o) => o > 255)) return null;
    const hi = ((octets[0] << 8) | octets[1]).toString(16);
    const lo = ((octets[2] << 8) | octets[3]).toString(16);
    s = `${dotted[1]}${hi}:${lo}`;
  }
  const halves = s.split("::");
  if (halves.length > 2) return null;
  const compressed = halves.length === 2;
  const toGroups = (part) => (part === "" ? [] : part.split(":"));
  const head = toGroups(halves[0]);
  const tail = compressed ? toGroups(halves[1]) : [];
  const missing = 8 - head.length - tail.length;
  if (compressed ? missing < 1 : missing !== 0) return null;
  const groups = [...head, ...new Array(compressed ? missing : 0).fill("0"), ...tail];
  if (!groups.every((g) => /^[0-9a-f]{1,4}$/.test(g))) return null;
  return groups.map((g) => Number.parseInt(g, 16));
}

// True if an IPv6 literal is loopback, unspecified, unique-local, link-local,
// or embeds a private IPv4 address. A `%zone` suffix is ignored.
//
// Besides the textual checks, the address is expanded to its eight groups so
// that equivalent spellings are caught too: a spelled-out loopback/unspecified
// address (`0:0:0:0:0:0:0:1`) and the hex form of an IPv4-mapped or
// IPv4-compatible address (`::ffff:7f00:1`), which would otherwise be reported
// as public.
function isPrivateIPv6(addr) {
  let s = String(addr).toLowerCase();
  const pct = s.indexOf("%");
  if (pct >= 0) s = s.slice(0, pct); // strip zone id
  if (s === "::1") return true; // loopback
  if (s === "::") return true; // unspecified
  // IPv4-mapped / IPv4-embedded in dotted form (e.g. ::ffff:127.0.0.1)
  const v4 = /:(\d+\.\d+\.\d+\.\d+)$/.exec(s);
  if (v4 && isPrivateIPv4(v4[1])) return true;
  const first = s.split(":")[0];
  if (/^f[cd]/.test(first)) return true; // fc00::/7 unique-local
  if (/^fe[89ab]/.test(first)) return true; // fe80::/10 link-local

  const groups = expandIPv6Groups(s);
  if (groups) {
    const [g0, g1, g2, g3, g4, g5, g6, g7] = groups;
    const highBitsZero = (g0 | g1 | g2 | g3 | g4) === 0;
    // Loopback / unspecified written without `::` compression.
    if (highBitsZero && g5 === 0 && g6 === 0 && (g7 === 0 || g7 === 1)) {
      return true;
    }
    // ::ffff:a.b.c.d (mapped) or ::a.b.c.d (compatible) written in hex.
    if (highBitsZero && (g5 === 0xffff || g5 === 0)) {
      const embedded = `${g6 >> 8}.${g6 & 0xff}.${g7 >> 8}.${g7 & 0xff}`;
      if (isPrivateIPv4(embedded)) return true;
    }
  }
  return false;
}
90
+
91
+ // True if a literal IP address falls in a private/loopback/link-local range.
92
// Classify an IP literal as private/loopback/link-local.
//
// Dispatches on the family reported by node:net's isIP(): 4 goes to
// isPrivateIPv4, 6 goes to isPrivateIPv6. Anything isIP() does not recognise as
// an IP literal (family 0) is reported as not private.
export function isPrivateIp(addr) {
  switch (isIP(addr)) {
    case 4:
      return isPrivateIPv4(addr);
    case 6:
      return isPrivateIPv6(addr);
    default:
      return false;
  }
}
98
+
99
+ // Validate a single URL as an allowed download target. Applies the caller's
100
+ // host allowlist (the SAME check used on the initial URL — this is what makes
101
+ // redirect following safe) and, unless explicitly allowed, rejects any host
102
+ // that is a private IP literal or resolves to one (DNS rebinding / SSRF).
103
+ // Throws on rejection; resolves on success.
104
// Validate one URL as an allowed download target.
//
// Checks, in order:
//   1. the URL parses and uses http: or https:;
//   2. `validateHost(host)` (when supplied) accepts the lowercased host,
//      port included;
//   3. unless `allowPrivateIp` is true, the host is neither a private/loopback
//      IP literal nor a name that resolves to one.
//
// Resolves to undefined when the target is allowed; throws an Error whose
// message starts with "blocked target:" otherwise. A DNS failure is fail-closed
// and the underlying error is attached as `cause`.
export async function assertAllowedTarget(
  urlStr,
  { validateHost = null, allowPrivateIp = false } = {},
) {
  let u;
  try {
    u = new URL(urlStr);
  } catch {
    throw new Error(`blocked target: unparseable URL`);
  }
  if (u.protocol !== "http:" && u.protocol !== "https:") {
    throw new Error(`blocked target: unsupported scheme "${u.protocol}"`);
  }
  const host = u.host.toLowerCase(); // includes port — matches the picker
  const hostname = u.hostname.toLowerCase();
  if (validateHost && !validateHost(host)) {
    throw new Error(`blocked target: host not allowed: ${host}`);
  }
  if (allowPrivateIp) return;

  // URL serializes an IPv6 literal host with its brackets ("[::1]"), which
  // isIP() does not recognise. Strip them so IPv6 literals are classified as
  // IP literals instead of being handed to the resolver as a host name.
  const literal =
    hostname.startsWith("[") && hostname.endsWith("]")
      ? hostname.slice(1, -1)
      : hostname;
  const family = isIP(literal);
  if (family !== 0) {
    // URL normalizes IPv4-mapped literals to hex (::ffff:7f00:1). Everything
    // under `::` (loopback, unspecified, IPv4-mapped/compatible) is reserved
    // space and never a legitimate public download host, so reject it here
    // regardless of how the range classifier spells its checks.
    const reservedV6 = family === 6 && literal.startsWith("::");
    if (reservedV6 || isPrivateIp(literal)) {
      throw new Error(`blocked target: private/loopback IP ${literal}`);
    }
    return;
  }

  let results;
  try {
    results = await dns.promises.lookup(hostname, { all: true });
  } catch (e) {
    // Fail closed: a host we can't resolve isn't a host we should fetch.
    throw new Error(
      `blocked target: could not resolve ${hostname}: ${e?.message || e}`,
      { cause: e },
    );
  }
  if (!Array.isArray(results) || results.length === 0) {
    // Nothing to vet means nothing proven safe — also fail closed.
    throw new Error(
      `blocked target: could not resolve ${hostname}: no addresses returned`,
    );
  }
  for (const r of results) {
    if (isPrivateIp(r.address)) {
      throw new Error(
        `blocked target: ${hostname} resolves to private/loopback IP ${r.address}`,
      );
    }
  }
}
144
+
11
145
  // Retry transient network + 5xx/429 failures with short exponential backoff.
12
146
  // Each attempt gets its own AbortController + timeout; caller-passed signals
13
147
  // are not plumbed through since we don't have a cancellation story above this
@@ -17,11 +151,15 @@ export async function safeText(res) {
17
151
  // `bodyFactory`, when set, is invoked per attempt to produce a fresh body —
18
152
  // required for streaming uploads where the previous attempt consumed the
19
153
  // stream. Takes precedence over opts.body.
154
+ //
155
+ // `keepBodyTimeout`, when set, hands the attempt's abort timer to the caller on
156
+ // a successful (2xx) response instead of clearing it, so the body-read window
157
+ // stays bounded. See releaseBodyTimeout / abortBody above.
20
158
  export async function retryableFetch(
21
159
  url,
22
160
  opts = {},
23
161
  label,
24
- { timeoutMs = 30_000, bodyFactory = null } = {},
162
+ { timeoutMs = 30_000, bodyFactory = null, keepBodyTimeout = false } = {},
25
163
  ) {
26
164
  const delays = [100, 500];
27
165
  let lastErr = null;
@@ -36,6 +174,7 @@ export async function retryableFetch(
36
174
  }
37
175
  const controller = new AbortController();
38
176
  const timer = setTimeout(() => controller.abort(), timeoutMs);
177
+ let handedOff = false;
39
178
  try {
40
179
  const fetchOpts = { ...opts, signal: controller.signal };
41
180
  if (bodyFactory) {
@@ -45,7 +184,14 @@ export async function retryableFetch(
45
184
  fetchOpts.duplex = "half";
46
185
  }
47
186
  const res = await fetch(url, fetchOpts);
48
- if (res.ok) return res;
187
+ if (res.ok) {
188
+ if (keepBodyTimeout) {
189
+ // Keep the timer armed until the caller consumes the body.
190
+ handedOff = true;
191
+ bodyTimers.set(res, { timer, controller });
192
+ }
193
+ return res;
194
+ }
49
195
  if (res.status === 429 || (res.status >= 500 && res.status < 600)) {
50
196
  lastRes = res;
51
197
  continue;
@@ -55,9 +201,60 @@ export async function retryableFetch(
55
201
  lastErr = e;
56
202
  continue;
57
203
  } finally {
58
- clearTimeout(timer);
204
+ if (!handedOff) clearTimeout(timer);
59
205
  }
60
206
  }
61
207
  if (lastRes) return lastRes;
62
208
  throw lastErr;
63
209
  }
210
+
211
+ export const MAX_DOWNLOAD_REDIRECTS = 5;
212
+
213
+ // Fetch a download target with manual redirect handling and an SSRF guard.
214
+ // Every hop (the initial URL and each Location target) is re-validated with
215
+ // assertAllowedTarget before it is fetched, so a 3xx to an unvalidated or
216
+ // private host is rejected instead of silently followed. Redirect bodies are
217
+ // drained between hops. Returns the final (non-redirect) Response; the caller
218
+ // owns the body (use keepBodyTimeout semantics: releaseBodyTimeout when done).
219
// GET a download target, following redirects by hand so that every hop is
// vetted before it is requested.
//
// For the initial URL and for each redirect target:
//   - assertAllowedTarget() is awaited first (it throws to block the hop);
//   - the URL is fetched through retryableFetch with `redirect: "manual"` and
//     `keepBodyTimeout: true`.
// A 3xx response carrying a Location header is drained and its target
// (resolved against the current URL) becomes the next hop; after
// `maxRedirects` followed redirects the next one throws. Any other response —
// including a 3xx without Location — is returned to the caller, who owns its
// body.
export async function guardedDownloadFetch(
  url,
  {
    timeoutMs = 30_000,
    validateHost = null,
    allowPrivateIp = false,
    maxRedirects = MAX_DOWNLOAD_REDIRECTS,
    label,
  } = {},
) {
  let target = url;
  let hop = 0;
  while (true) {
    await assertAllowedTarget(target, { validateHost, allowPrivateIp });
    const res = await retryableFetch(
      target,
      { method: "GET", redirect: "manual" },
      label,
      { timeoutMs, keepBodyTimeout: true },
    );

    const status = res.status;
    const isRedirect = status >= 300 && status < 400;
    const location = isRedirect ? res.headers?.get?.("location") : null;
    if (!location) return res;

    // A 3xx is not res.ok, so it was never handed off — its timer is already
    // cleared. Drain the redirect body before moving on.
    await drainResponseBody(res);
    if (hop >= maxRedirects) {
      throw new Error(`too many redirects (> ${maxRedirects})`);
    }
    try {
      target = new URL(location, target).toString();
    } catch {
      throw new Error(`blocked target: unparseable redirect Location`);
    }
    hop += 1;
  }
}
@@ -160,17 +160,32 @@ export function loadRecordingConfig() {
160
160
  // WB_RECORDING_MASK_ALL_INPUTS default on; set "0" to record input values
161
161
  // WB_RECORDING_MASK_TEXT_SELECTOR text content of matches → asterisks
162
162
  // WB_RECORDING_BLOCK_SELECTOR matches recorded as inert placeholders
163
- // WB_RECORDING_IGNORE_SELECTOR matches' input events dropped entirely
163
+ // WB_RECORDING_IGNORE_SELECTOR matches excluded from the recording
164
+ //
165
+ // Privacy note: the vendored `vendor/rrweb-record.min.js` bundle supports
166
+ // `blockSelector` but NOT `ignoreSelector` — passing `ignoreSelector` to this
167
+ // build is a silent no-op, so an operator relying on it to drop a sensitive
168
+ // field would have that value recorded verbatim. To keep the "drop this field"
169
+ // promise on the shipped binary, WB_RECORDING_IGNORE_SELECTOR is folded into the
170
+ // (supported, and strictly stronger) `blockSelector`: a blocked element is not
171
+ // recorded at all — its subtree and inputs are never captured. The env var name
172
+ // is kept for compatibility; both selectors are unioned so neither is lost.
164
173
  export function loadMaskConfig() {
165
174
  const sel = (name) => {
166
175
  const v = (process.env[name] || "").trim();
167
176
  return v || null;
168
177
  };
178
+ // Comma-join the explicit block selector with the ignore selector so the
179
+ // ignore intent is honored via a mechanism the vendored bundle actually
180
+ // supports. Either, both, or neither may be set.
181
+ const blockParts = [
182
+ sel("WB_RECORDING_BLOCK_SELECTOR"),
183
+ sel("WB_RECORDING_IGNORE_SELECTOR"),
184
+ ].filter(Boolean);
169
185
  return {
170
186
  maskAllInputs: process.env.WB_RECORDING_MASK_ALL_INPUTS !== "0",
171
187
  maskTextSelector: sel("WB_RECORDING_MASK_TEXT_SELECTOR"),
172
- blockSelector: sel("WB_RECORDING_BLOCK_SELECTOR"),
173
- ignoreSelector: sel("WB_RECORDING_IGNORE_SELECTOR"),
188
+ blockSelector: blockParts.length ? blockParts.join(", ") : null,
174
189
  };
175
190
  }
176
191
 
@@ -253,9 +268,10 @@ export class RecordingManager {
253
268
  };
254
269
  if (mask.maskTextSelector)
255
270
  recordOpts.maskTextSelector = mask.maskTextSelector;
271
+ // `blockSelector` already folds in WB_RECORDING_IGNORE_SELECTOR (see
272
+ // loadMaskConfig). We never pass `ignoreSelector` — the vendored rrweb
273
+ // bundle does not support it, so it would be silently dropped.
256
274
  if (mask.blockSelector) recordOpts.blockSelector = mask.blockSelector;
257
- if (mask.ignoreSelector)
258
- recordOpts.ignoreSelector = mask.ignoreSelector;
259
275
  const recordOptsJson = JSON.stringify(recordOpts);
260
276
  const bootstrap = `
261
277
  ;(function(){
@@ -138,9 +138,10 @@ export function parseSignedConfig(raw) {
138
138
  export function pickSignedCandidate(candidates, opts = {}) {
139
139
  const hosts = opts.hosts || [];
140
140
  const jsonFields = opts.jsonFields || null;
141
- const forced = opts.enabled === true;
142
141
  for (const cand of candidates || []) {
143
142
  for (const u of cand.urls || []) {
143
+ // json_fields only *filters* which fields are inspected — it never
144
+ // bypasses the host check below.
144
145
  if (jsonFields && !jsonFields.includes(leafField(u.field)) && !jsonFields.includes(u.field))
145
146
  continue;
146
147
  let host = "";
@@ -153,12 +154,12 @@ export function pickSignedCandidate(candidates, opts = {}) {
153
154
  hosts.length > 0 &&
154
155
  hosts.some((h) => host === h || host.endsWith(`.${h}`));
155
156
  const looksSigned = isSignedHost(host);
156
- // auto: only recognized signed hosts or an explicit hosts allowlist.
157
- // forced: also honor a candidate the author selected via hosts/json_fields.
158
- const accept =
159
- hostAllowed ||
160
- looksSigned ||
161
- (forced && (hosts.length > 0 || jsonFields));
157
+ // A host match is ALWAYS required: a recognized signed host (auto mode)
158
+ // or an explicit `hosts` allowlist entry. Forced mode (`enabled: true`)
159
+ // does not relax this — it only opts the feature on; an author who needs
160
+ // an unrecognized host must name it in `hosts`. This closes the SSRF gap
161
+ // where forced + json_fields accepted an arbitrary host.
162
+ const accept = hostAllowed || looksSigned;
162
163
  if (accept) {
163
164
  return { url: u.url, field: u.field, api_url: cand.api_url || null, host };
164
165
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "wb-browser-runtime",
3
- "version": "0.14.0",
3
+ "version": "0.14.1",
4
4
  "description": "Browser sidecar runtime for wb — Playwright over CDP (Browserbase, browser-use) via the wb-sidecar/1 line-framed JSON protocol.",
5
5
  "bin": {
6
6
  "wb-browser-runtime": "bin/wb-browser-runtime.js"
package/verbs/download.js CHANGED
@@ -18,7 +18,8 @@
18
18
 
19
19
  import path from "node:path";
20
20
  import { Buffer } from "node:buffer";
21
- import { promises as fsPromises } from "node:fs";
21
+ import { promises as fsPromises, createWriteStream } from "node:fs";
22
+ import { once } from "node:events";
22
23
  import { send } from "../lib/io.js";
23
24
  import {
24
25
  uniquePathInside,
@@ -26,19 +27,57 @@ import {
26
27
  extensionAllowed,
27
28
  } from "../lib/util.js";
28
29
  import { HANDLED_MARK } from "../lib/download-capture.js";
29
- import { retryableFetch } from "../lib/http.js";
30
+ import {
31
+ guardedDownloadFetch,
32
+ releaseBodyTimeout,
33
+ abortBody,
34
+ drainResponseBody,
35
+ } from "../lib/http.js";
30
36
  import {
31
37
  SIGNED_PAGE_HOOK,
32
38
  SIGNED_POLL_SCRIPT,
33
39
  parseSignedConfig,
34
40
  pickSignedCandidate,
35
41
  redactSignedUrl,
42
+ isSignedHost,
36
43
  } from "../lib/signed-url-capture.js";
37
44
 
38
45
  const DEFAULT_TIMEOUT_MS = 10_000;
39
46
  const POLL_INTERVAL_MS = 50;
40
47
  const FALLBACK_NAME = "download.bin";
41
48
 
49
+ // Hard cap on signed-URL download size to bound memory/disk. A lying or absent
50
+ // Content-Length can't bypass it: the stream is aborted once bytes exceed it.
51
+ // Override for tests/ops via WB_MAX_DOWNLOAD_BYTES.
52
+ const MAX_SIGNED_DOWNLOAD_BYTES = 512 * 1024 * 1024; // 512 MiB
53
+
54
// Effective size cap, in bytes, for a signed-URL download.
//
// WB_MAX_DOWNLOAD_BYTES wins when it parses (via Number) to a finite value
// greater than zero; otherwise — unset, blank, non-numeric, zero, negative or
// infinite — the built-in MAX_SIGNED_DOWNLOAD_BYTES is used.
function maxDownloadBytes() {
  const override = Number(process.env.WB_MAX_DOWNLOAD_BYTES);
  if (!Number.isFinite(override) || override <= 0) {
    return MAX_SIGNED_DOWNLOAD_BYTES;
  }
  return override;
}
58
+
59
+ // Test/ops escape hatch: by default the SSRF guard rejects targets that resolve
60
+ // to private/loopback IPs. A local test server lives on 127.0.0.1, so the guard
61
+ // must be opt-out-able for those tests. Production leaves this unset.
62
// Whether WB_ALLOW_PRIVATE_DOWNLOAD_IP opts out of the private-IP block.
//
// True only when the variable, compared case-insensitively and without any
// trimming, is exactly one of "1", "true", "yes" or "on". Unset, empty and
// every other value yield false.
function privateDownloadIpAllowed() {
  const raw = String(process.env.WB_ALLOW_PRIVATE_DOWNLOAD_IP || "").toLowerCase();
  return ["1", "true", "yes", "on"].includes(raw);
}
66
+
67
+ // Build the host validator applied to the initial signed URL *and* every
68
+ // redirect hop — the same gate pickSignedCandidate uses, so a redirect can't
69
+ // escape to a host the picker would never have selected.
70
// Build a `(host) => allowed?` predicate from a signed-URL config.
//
// The returned validator rejects an empty/missing host outright. Otherwise it
// lowercases the host and accepts it when it equals, or is a dot-separated
// subdomain of, any entry in `signedCfg.hosts`; failing that, the decision is
// whatever isSignedHost() returns for the lowercased host. A missing config or
// missing `hosts` list behaves like an empty allowlist.
function makeSignedHostValidator(signedCfg) {
  const allowlist = signedCfg?.hosts || [];
  return function validateSignedHost(host) {
    if (!host) return false;
    const candidate = String(host).toLowerCase();
    if (
      allowlist.length > 0 &&
      allowlist.some(
        (entry) => candidate === entry || candidate.endsWith(`.${entry}`),
      )
    ) {
      return true;
    }
    return isSignedHost(candidate);
  };
}
80
+
42
81
  // Page-side hook that traps blob/data-URL anchor clicks the SPA performs
43
82
  // programmatically — `URL.createObjectURL(blob)` + `<a download>` + `.click()`.
44
83
  // Playwright's own `download` event normally catches these, but a handful
@@ -247,6 +286,7 @@ export default {
247
286
  page,
248
287
  ctx,
249
288
  timeout,
289
+ signedCfg,
250
290
  });
251
291
  }
252
292
 
@@ -417,6 +457,7 @@ async function saveSignedUrlDownload({
417
457
  page,
418
458
  ctx,
419
459
  timeout,
460
+ signedCfg,
420
461
  }) {
421
462
  const redacted = redactSignedUrl(signed.url);
422
463
  // Filename: explicit path: wins, else the signed URL's basename, else a
@@ -439,18 +480,8 @@ async function saveSignedUrlDownload({
439
480
  }
440
481
  await fsPromises.mkdir(artifactsDir, { recursive: true });
441
482
 
442
- // Fetch the signed URL from the sidecar (not the page) so the object store's
443
- // CORS policy doesn't block the read. The label is redacted — retry logs must
444
- // never echo signed credentials.
445
- let res;
446
- try {
447
- res = await retryableFetch(
448
- signed.url,
449
- { method: "GET" },
450
- `signed-url download (${redacted})`,
451
- { timeoutMs: timeout },
452
- );
453
- } catch (e) {
483
+ const maxBytes = maxDownloadBytes();
484
+ const failed = (extra, reason) =>
454
485
  send({
455
486
  type: "slice.download_failed",
456
487
  verb: "download",
@@ -459,58 +490,161 @@ async function saveSignedUrlDownload({
459
490
  api_url: signed.api_url,
460
491
  signed_url: redacted,
461
492
  page_url: safePageUrl(page),
462
- reason: `signed url fetch error: ${e?.message || e}`,
493
+ ...extra,
494
+ reason,
463
495
  });
496
+
497
+ // Fetch the signed URL from the sidecar (not the page) so the object store's
498
+ // CORS policy doesn't block the read. Redirects are followed *manually* with
499
+ // the same host allowlist + private-IP block applied to every hop (SSRF), and
500
+ // the body-read timeout stays armed until we finish streaming. The label is
501
+ // redacted — retry logs must never echo signed credentials.
502
+ let res;
503
+ try {
504
+ res = await guardedDownloadFetch(signed.url, {
505
+ timeoutMs: timeout,
506
+ validateHost: makeSignedHostValidator(signedCfg),
507
+ allowPrivateIp: privateDownloadIpAllowed(),
508
+ label: `signed-url download (${redacted})`,
509
+ });
510
+ } catch (e) {
511
+ failed({}, `signed url fetch error: ${e?.message || e}`);
464
512
  throw new Error(
465
513
  `download: signed URL fetch failed for ${redacted}: ${e?.message || e}`,
466
514
  );
467
515
  }
468
- if (!res.ok) {
469
- // A 403 on a pre-signed URL almost always means the token expired before
470
- // we fetched it — call that out so the operator knows to shorten the gap.
471
- const expired = res.status === 403;
516
+
517
+ try {
518
+ if (!res.ok) {
519
+ // A 403 on a pre-signed URL almost always means the token expired before
520
+ // we fetched it — call that out so the operator knows to shorten the gap.
521
+ const expired = res.status === 403;
522
+ await drainResponseBody(res); // don't leak the socket
523
+ failed(
524
+ { http_status: res.status, expired },
525
+ `signed url fetch: HTTP ${res.status}${expired ? " (likely expired)" : ""}`,
526
+ );
527
+ throw new Error(
528
+ `download: signed URL fetch returned HTTP ${res.status} for ${redacted}${expired ? " (likely expired)" : ""}`,
529
+ );
530
+ }
531
+
532
+ // Reject up front when the server *declares* an oversized body...
533
+ const clRaw = safeHeader(res, "content-length");
534
+ const cl = clRaw == null ? NaN : Number(clRaw);
535
+ if (Number.isFinite(cl) && cl > maxBytes) {
536
+ await drainResponseBody(res);
537
+ failed(
538
+ { content_length: cl, max_bytes: maxBytes },
539
+ `signed url body too large: Content-Length ${cl} > cap ${maxBytes}`,
540
+ );
541
+ throw new Error(
542
+ `download: signed URL body exceeds size cap (${cl} > ${maxBytes} bytes) for ${redacted}`,
543
+ );
544
+ }
545
+
546
+ // ...and enforce while streaming so a lying/absent Content-Length can't slip
547
+ // past. Streams to disk rather than materializing the whole body in memory.
548
+ let bytes;
549
+ try {
550
+ bytes = await streamToFileWithCap(res, target, maxBytes);
551
+ } catch (e) {
552
+ if (e && e.code === "WB_SIZE_CAP") {
553
+ failed(
554
+ { max_bytes: maxBytes },
555
+ `signed url body exceeded size cap of ${maxBytes} bytes mid-stream`,
556
+ );
557
+ throw new Error(
558
+ `download: signed URL body exceeded size cap (${maxBytes} bytes) for ${redacted}`,
559
+ );
560
+ }
561
+ failed({}, `signed url body read error: ${e?.message || e}`);
562
+ throw new Error(
563
+ `download: signed URL body read failed for ${redacted}: ${e?.message || e}`,
564
+ );
565
+ }
566
+
567
+ const contentType = safeHeader(res, "content-type");
568
+ const contentDisposition = safeHeader(res, "content-disposition");
472
569
  send({
473
- type: "slice.download_failed",
474
- verb: "download",
475
- verb_index: ctx?.index ?? null,
476
- capture: "signed_url",
477
- api_url: signed.api_url,
478
- signed_url: redacted,
479
- page_url: safePageUrl(page),
480
- http_status: res.status,
481
- expired,
482
- reason: `signed url fetch: HTTP ${res.status}${expired ? " (likely expired)" : ""}`,
570
+ type: "slice.artifact_saved",
571
+ filename: path.basename(target),
572
+ path: target,
573
+ bytes,
574
+ source: "download",
575
+ provenance: {
576
+ url: null,
577
+ signed_url: redacted,
578
+ api_url: signed.api_url,
579
+ field: signed.field,
580
+ suggested_filename: suggested,
581
+ page_url: safePageUrl(page),
582
+ verb_index: ctx?.index ?? null,
583
+ verb_name: "download",
584
+ capture: "signed_url",
585
+ content_type: contentType,
586
+ content_disposition: contentDisposition,
587
+ ts: Date.now(),
588
+ },
483
589
  });
484
- throw new Error(
485
- `download: signed URL fetch returned HTTP ${res.status} for ${redacted}${expired ? " (likely expired)" : ""}`,
590
+ return `→ ${path.basename(target)}`;
591
+ } finally {
592
+ // Body fully consumed (or we bailed) — disarm the body-read timeout.
593
+ releaseBodyTimeout(res);
594
+ }
595
+ }
596
+
597
+ // Stream a response body to disk, counting bytes and aborting the in-flight
598
+ // read once the cap is exceeded (so an absent/under-stated Content-Length can't
599
+ // OOM us). Removes the partial file on any error. Returns total bytes written.
600
// Stream a response body to `target`, enforcing a hard byte cap.
//
// res      - Response-like object; `res.body.getReader()` supplies the chunks.
//            A response without a readable body produces an empty file.
// target   - path of the file to create.
// maxBytes - cap; as soon as the running total exceeds it the body read is
//            aborted via abortBody(res) and an Error with
//            `code === "WB_SIZE_CAP"` is thrown.
//
// Returns the number of bytes written. On ANY failure (cap, network read,
// file open/write) the reader is cancelled, the write stream is torn down, the
// partial file is removed, and the original error is rethrown.
async function streamToFileWithCap(res, target, maxBytes) {
  const reader = res.body?.getReader?.();
  const ws = createWriteStream(target);

  // Listen for write-side failures from the moment the stream exists. The file
  // is opened asynchronously, so an open/write error can surface while we are
  // awaiting the network; with no listener that 'error' event would be
  // unhandled and take the process down.
  let writeError = null;
  ws.on("error", (e) => {
    if (!writeError) writeError = e;
  });

  const finishFile = () =>
    new Promise((resolve, reject) =>
      ws.end((e) => (e ? reject(e) : resolve())),
    );

  let total = 0;
  try {
    if (!reader) {
      // No body to read (e.g. 204) — write an empty file.
      await finishFile();
      return 0;
    }
    for (;;) {
      const { done, value } = await reader.read();
      // Surface a file error before touching the (now destroyed) stream again:
      // a destroyed stream never emits 'drain', so waiting on it would hang.
      if (writeError) throw writeError;
      if (done) break;
      total += value.byteLength;
      if (total > maxBytes) {
        abortBody(res); // abort the underlying socket read
        const err = new Error(`size cap exceeded`);
        err.code = "WB_SIZE_CAP";
        throw err;
      }
      if (!ws.write(Buffer.from(value))) {
        await once(ws, "drain");
      }
    }
    await finishFile();
    return total;
  } catch (e) {
    // Stop pulling from the response so the body isn't left dangling.
    if (reader) {
      try {
        await reader.cancel();
      } catch {
        // Best-effort: the body may already be errored or aborted.
      }
    }
    // Wait for the write stream to close before unlinking: the fd is opened
    // asynchronously, so unlinking eagerly can race the open and leave an empty
    // file behind. If the stream has ALREADY closed (it auto-destroys on
    // error) 'close' will never fire again, so don't wait for it.
    await new Promise((resolve) => {
      if (ws.closed) {
        resolve();
        return;
      }
      ws.once("close", resolve);
      ws.destroy();
    });
    try {
      await fsPromises.unlink(target);
    } catch {
      // Best-effort: the file may never have been created.
    }
    throw e;
  }
}
515
649
 
516
650
  async function pollForBlob(page, timeoutMs, stop) {
package/verbs/goto.js CHANGED
@@ -1,10 +1,16 @@
1
+ import { scrubSecrets } from "../lib/substitution.js";
2
+
1
3
  export default {
2
4
  name: "goto",
3
5
  primaryKey: "url",
4
- async execute(page, args) {
6
+ async execute(page, args, ctx) {
5
7
  const url = args.url ?? "";
6
8
  const waitUntil = args.wait_until ?? "domcontentloaded";
7
9
  await page.goto(url, { waitUntil, timeout: args.timeout ?? 30_000 });
8
- return `→ ${page.url()}`;
10
+ // The resolved URL can carry a substituted secret (e.g.
11
+ // ?token={{ env.TOKEN }}). Scrub any collected secret value out of the
12
+ // summary before it crosses into the verb.complete event stream — the
13
+ // same mechanism error messages use (lib/substitution.scrubSecrets).
14
+ return `→ ${scrubSecrets(page.url(), ctx?.secrets)}`;
9
15
  },
10
16
  };