wb-browser-runtime 0.14.0 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/lib/http.js +200 -3
- package/lib/recording-manager.js +21 -5
- package/lib/signed-url-capture.js +8 -7
- package/package.json +1 -1
- package/verbs/download.js +191 -57
- package/verbs/goto.js +8 -2
package/README.md
CHANGED
|
@@ -171,13 +171,13 @@ endpoint at session close. Recording is **off by default** — set
|
|
|
171
171
|
| `WB_RECORDING_MASK_ALL_INPUTS` | `1` | rrweb `maskAllInputs`: masks the values typed into inputs (on by default for safety). Set `0` to record input *values*. |
|
|
172
172
|
| `WB_RECORDING_MASK_TEXT_SELECTOR` | *(unset)* | CSS selector whose **text content** rrweb masks (e.g. `.ssn, .acct-balance`). |
|
|
173
173
|
| `WB_RECORDING_BLOCK_SELECTOR` | *(unset)* | CSS selector rrweb records as an inert placeholder (contents never captured). |
|
|
174
|
-
| `WB_RECORDING_IGNORE_SELECTOR` | *(unset)* | CSS selector
|
|
174
|
+
| `WB_RECORDING_IGNORE_SELECTOR` | *(unset)* | CSS selector for elements to exclude from the recording. **In this build it is applied as a block selector** (unioned with `WB_RECORDING_BLOCK_SELECTOR`): the matching element is recorded as an inert placeholder and its subtree/inputs are never captured. The vendored rrweb bundle does not support rrweb's `ignoreSelector` (which only drops input *events*), so we map this knob onto the supported, stronger `blockSelector` to honor the "drop this field" intent. |
|
|
175
175
|
|
|
176
176
|
Artifacts are two parallel POSTs per session, `kind ∈ {rrweb, video}`:
|
|
177
177
|
|
|
178
178
|
- **rrweb** — gzipped JSON (`application/json+gzip`) — `{ run_id, session, event_count, events: [...] }`. DOM mutations + input events captured from every page.
|
|
179
179
|
|
|
180
|
-
**PII scope — read this.** `maskAllInputs` (on by default) only redacts the *values* a user types into form fields. It does **not** mask field labels, placeholders, `aria-label`s, `<option>` text, or any other rendered text, and it does not alter the recorded DOM structure. A displayed account number, balance, or name that is page text — not an input value — is captured verbatim. For those, point rrweb at the sensitive nodes with `WB_RECORDING_MASK_TEXT_SELECTOR` (mask the text) or `WB_RECORDING_BLOCK_SELECTOR` (omit the subtree). When in doubt, block the region.
|
|
180
|
+
**PII scope — read this.** `maskAllInputs` (on by default) only redacts the *values* a user types into form fields. It does **not** mask field labels, placeholders, `aria-label`s, `<option>` text, or any other rendered text, and it does not alter the recorded DOM structure. A displayed account number, balance, or name that is page text — not an input value — is captured verbatim. For those, point rrweb at the sensitive nodes with `WB_RECORDING_MASK_TEXT_SELECTOR` (mask the text) or `WB_RECORDING_BLOCK_SELECTOR` (omit the subtree). `WB_RECORDING_IGNORE_SELECTOR` is treated as an alias for `WB_RECORDING_BLOCK_SELECTOR` in this build (the vendored rrweb bundle has no `ignoreSelector` support), so a field named there is excluded from the recording entirely rather than merely having its input events dropped. When in doubt, block the region.
|
|
181
181
|
- **video** — VP9 WebM (`video/webm`) — encoded from JPEG screencast frames via `ffmpeg`. Requires `ffmpeg` on `$PATH` (droplet install: `apt-get install -y ffmpeg`). If `ffmpeg` is missing, the video kind is silently disabled and rrweb continues alone.
|
|
182
182
|
|
|
183
183
|
Each POST carries headers `Authorization: Bearer <secret>`,
|
package/lib/http.js
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import dns from "node:dns";
|
|
2
|
+
import { isIP } from "node:net";
|
|
1
3
|
import { log } from "./io.js";
|
|
2
4
|
|
|
3
5
|
export async function safeText(res) {
|
|
@@ -8,6 +10,138 @@ export async function safeText(res) {
|
|
|
8
10
|
}
|
|
9
11
|
}
|
|
10
12
|
|
|
13
|
+
// --- Body-read timeout handoff ---------------------------------------------
|
|
14
|
+
//
|
|
15
|
+
// retryableFetch's AbortController timer normally stays armed only until fetch() resolves
|
|
16
|
+
// (headers received) and is then cleared in `finally`. That leaves the *body*
|
|
17
|
+
// read unbounded: a server can dribble bytes forever after sending headers.
|
|
18
|
+
//
|
|
19
|
+
// `keepBodyTimeout: true` is an opt-in for callers that consume the body
|
|
20
|
+
// themselves (the signed-URL download path). When set, on a successful (2xx)
|
|
21
|
+
// response we do NOT clear the timer — instead we stash the timer + controller
|
|
22
|
+
// in a WeakMap keyed by the Response so the caller can either:
|
|
23
|
+
// - releaseBodyTimeout(res): clear it once the body is fully consumed, or
|
|
24
|
+
// - abortBody(res): abort the in-flight body read (e.g. size cap tripped).
|
|
25
|
+
// If the caller never releases, the timer still fires and aborts the socket,
|
|
26
|
+
// so a hung body read can't wedge the process. Other callers (default
|
|
27
|
+
// keepBodyTimeout=false) are unaffected — their timer is cleared as before.
|
|
28
|
+
const bodyTimers = new WeakMap();
|
|
29
|
+
|
|
30
|
+
/**
 * Disarm the body-read timeout that was kept alive for `res`.
 *
 * Looks `res` up in the module-level `bodyTimers` map; when an entry exists its
 * timer is cleared and the entry is removed. Calling this for a response that
 * has no entry (or calling it twice) is a no-op.
 *
 * @param {object} res - The Response used as the `bodyTimers` key.
 * @returns {void}
 */
export function releaseBodyTimeout(res) {
  const pending = bodyTimers.get(res);
  if (!pending) return;
  clearTimeout(pending.timer);
  bodyTimers.delete(res);
}
|
|
37
|
+
|
|
38
|
+
/**
 * Abort the in-flight body read for `res`, if its abort controller was kept.
 *
 * Only signals the stored AbortController; the `bodyTimers` entry (and its
 * timer) is left in place, so `releaseBodyTimeout(res)` is still what clears
 * the timer. Errors thrown by `abort()` are deliberately ignored (best-effort).
 *
 * @param {object} res - The Response used as the `bodyTimers` key.
 * @returns {void}
 */
export function abortBody(res) {
  const pending = bodyTimers.get(res);
  if (!pending) return;
  try {
    pending.controller.abort();
  } catch {}
}
|
|
46
|
+
|
|
47
|
+
// Best-effort cancel/drain of a response body so a non-OK or redirect response
|
|
48
|
+
// doesn't leak the underlying socket while we throw or follow a redirect.
|
|
49
|
+
/**
 * Best-effort release of a response body.
 *
 * Prefers `res.body.cancel()`; when the body has no `cancel` method the body is
 * consumed via `res.arrayBuffer()` instead. A missing response or body is a
 * no-op, and every failure along the way is swallowed on purpose.
 *
 * @param {object|null|undefined} res - Response-like object (may be nullish).
 * @returns {Promise<void>}
 */
export async function drainResponseBody(res) {
  try {
    const hasCancel = Boolean(res?.body?.cancel);
    if (hasCancel) {
      await res.body.cancel();
      return;
    }
    if (!res?.body) return;
    // No cancel() available: read the body to completion instead.
    await res.arrayBuffer().catch(() => {});
  } catch {}
}
|
|
59
|
+
|
|
60
|
+
// --- SSRF guard: private/loopback/link-local IP detection ------------------
|
|
61
|
+
|
|
62
|
+
/**
 * Report whether a dotted-quad IPv4 literal is in a blocked range.
 *
 * @param {string} addr - Dotted-quad text, e.g. "10.0.0.1".
 * @returns {boolean} true for 0/8, 127/8, 10/8, 172.16/12, 192.168/16 and
 *   169.254/16; false otherwise (including text that is not a dotted quad).
 */
function isPrivateIPv4(addr) {
  const m = /^(\d+)\.(\d+)\.(\d+)\.(\d+)$/.exec(addr);
  if (!m) return false;
  const a = Number(m[1]);
  const b = Number(m[2]);
  if (a === 0) return true; // 0.0.0.0/8 (includes the unspecified address)
  if (a === 127) return true; // 127.0.0.0/8 loopback
  if (a === 10) return true; // 10.0.0.0/8
  if (a === 172 && b >= 16 && b <= 31) return true; // 172.16.0.0/12
  if (a === 192 && b === 168) return true; // 192.168.0.0/16
  if (a === 169 && b === 254) return true; // 169.254.0.0/16 link-local
  return false;
}

/**
 * Expand lowercase IPv6 text (zone id already removed) into its eight 16-bit
 * groups. Handles `::` compression and a trailing dotted-quad tail.
 *
 * @param {string} text - IPv6 address text.
 * @returns {number[]|null} Eight group values, or null when the text does not
 *   have the shape of an IPv6 address.
 */
function parseIPv6Groups(text) {
  let s = text;
  // Rewrite a trailing dotted quad ("…:127.0.0.1") as two hex groups.
  const quad = /^(.*:)(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(s);
  if (quad) {
    const o = quad.slice(2).map(Number);
    if (o.some((n) => n > 255)) return null;
    const hi = ((o[0] << 8) | o[1]).toString(16);
    const lo = ((o[2] << 8) | o[3]).toString(16);
    s = `${quad[1]}${hi}:${lo}`;
  }
  const halves = s.split("::");
  if (halves.length > 2) return null;
  const compressed = halves.length === 2;
  const toGroups = (chunk) => (chunk === "" ? [] : chunk.split(":"));
  const head = toGroups(halves[0]);
  const tail = compressed ? toGroups(halves[1]) : [];
  const missing = 8 - head.length - tail.length;
  if (compressed ? missing < 1 : missing !== 0) return null;
  const all = [...head, ...new Array(compressed ? missing : 0).fill("0"), ...tail];
  if (!all.every((g) => /^[0-9a-f]{1,4}$/.test(g))) return null;
  return all.map((g) => Number.parseInt(g, 16));
}

/**
 * Report whether an IPv6 literal is loopback, unspecified, unique-local,
 * link-local, or embeds a blocked IPv4 address.
 *
 * Embedded IPv4 is recognised both in dotted form ("::ffff:127.0.0.1") and in
 * the equivalent hex form ("::ffff:7f00:1"). The hex form matters because the
 * WHATWG URL parser serialises IPv4-mapped hosts that way, so a textual
 * dotted-quad check alone never sees them.
 *
 * @param {string} addr - IPv6 text, optionally with a "%zone" suffix.
 * @returns {boolean}
 */
function isPrivateIPv6(addr) {
  let s = String(addr).toLowerCase();
  const pct = s.indexOf("%");
  if (pct >= 0) s = s.slice(0, pct); // strip zone id
  if (s === "::1") return true; // loopback
  if (s === "::") return true; // unspecified
  // IPv4-mapped / IPv4-embedded in dotted form (e.g. ::ffff:127.0.0.1, ::127.0.0.1)
  const v4 = /:(\d+\.\d+\.\d+\.\d+)$/.exec(s);
  if (v4 && isPrivateIPv4(v4[1])) return true;
  const first = s.split(":")[0];
  if (/^f[cd]/.test(first)) return true; // fc00::/7 unique-local
  if (/^fe[89ab]/.test(first)) return true; // fe80::/10 link-local

  // Numeric pass: catches spellings the textual checks above cannot see, such
  // as "0:0:0:0:0:0:0:1" or a hex-encoded embedded IPv4 address.
  const groups = parseIPv6Groups(s);
  if (groups === null) return false;
  const [g0, g1, g2, g3, g4, g5, g6, g7] = groups;
  const zeroPrefix = g0 === 0 && g1 === 0 && g2 === 0 && g3 === 0 && g4 === 0;
  const embedsIPv4 =
    (zeroPrefix && (g5 === 0 || g5 === 0xffff)) || // ::/96 and ::ffff:0:0/96
    (g0 === 0x64 && g1 === 0xff9b && g2 === 0 && g3 === 0 && g4 === 0 && g5 === 0); // 64:ff9b::/96 (NAT64)
  if (!embedsIPv4) return false;
  return isPrivateIPv4(`${g6 >> 8}.${g6 & 0xff}.${g7 >> 8}.${g7 & 0xff}`);
}

// True if a literal IP address falls in a private/loopback/link-local range.
/**
 * @param {string} addr - Candidate IP literal (no brackets).
 * @returns {boolean} false when `addr` is not an IP literal at all.
 */
export function isPrivateIp(addr) {
  const fam = isIP(addr);
  if (fam === 4) return isPrivateIPv4(addr);
  if (fam === 6) return isPrivateIPv6(addr);
  return false;
}

// Validate a single URL as an allowed download target. Applies the caller's
// host allowlist (the SAME check used on the initial URL — this is what makes
// redirect following safe) and, unless explicitly allowed, rejects any host
// that is a private IP literal or resolves to one (DNS rebinding / SSRF).
// Throws on rejection; resolves on success.
/**
 * @param {string} urlStr - Absolute URL to validate.
 * @param {object} [options]
 * @param {((host: string) => boolean)|null} [options.validateHost] - Allowlist
 *   predicate; receives the lowercased `URL.host` (port included).
 * @param {boolean} [options.allowPrivateIp=false] - Skip the private-IP checks.
 * @returns {Promise<void>}
 * @throws {Error} "blocked target: …" when the URL is unparseable, is not
 *   http(s), fails `validateHost`, is a private IP literal, cannot be
 *   resolved, or resolves to a private address.
 */
export async function assertAllowedTarget(
  urlStr,
  { validateHost = null, allowPrivateIp = false } = {},
) {
  let u;
  try {
    u = new URL(urlStr);
  } catch {
    throw new Error(`blocked target: unparseable URL`);
  }
  if (u.protocol !== "http:" && u.protocol !== "https:") {
    throw new Error(`blocked target: unsupported scheme "${u.protocol}"`);
  }
  const host = u.host.toLowerCase(); // includes port — matches the picker
  const hostname = u.hostname.toLowerCase();
  if (validateHost && !validateHost(host)) {
    throw new Error(`blocked target: host not allowed: ${host}`);
  }
  if (allowPrivateIp) return;
  // URL.hostname keeps the square brackets on IPv6 literals ("[::1]"), but
  // isIP() and dns.lookup() only understand the bare address. Without this
  // the literal-IP branch below is never taken for IPv6.
  const address =
    hostname.startsWith("[") && hostname.endsWith("]")
      ? hostname.slice(1, -1)
      : hostname;
  if (isIP(address)) {
    if (isPrivateIp(address)) {
      throw new Error(`blocked target: private/loopback IP ${address}`);
    }
    return;
  }
  let results;
  try {
    results = await dns.promises.lookup(address, { all: true });
  } catch (e) {
    // Fail closed: a host we can't resolve isn't a host we should fetch.
    throw new Error(
      `blocked target: could not resolve ${hostname}: ${e?.message || e}`,
      { cause: e },
    );
  }
  for (const r of results) {
    if (isPrivateIp(r.address)) {
      throw new Error(
        `blocked target: ${hostname} resolves to private/loopback IP ${r.address}`,
      );
    }
  }
}
|
|
144
|
+
|
|
11
145
|
// Retry transient network + 5xx/429 failures with short exponential backoff.
|
|
12
146
|
// Each attempt gets its own AbortController + timeout; caller-passed signals
|
|
13
147
|
// are not plumbed through since we don't have a cancellation story above this
|
|
@@ -17,11 +151,15 @@ export async function safeText(res) {
|
|
|
17
151
|
// `bodyFactory`, when set, is invoked per attempt to produce a fresh body —
|
|
18
152
|
// required for streaming uploads where the previous attempt consumed the
|
|
19
153
|
// stream. Takes precedence over opts.body.
|
|
154
|
+
//
|
|
155
|
+
// `keepBodyTimeout`, when set, hands the attempt's abort timer to the caller on
|
|
156
|
+
// a successful (2xx) response instead of clearing it, so the body-read window
|
|
157
|
+
// stays bounded. See releaseBodyTimeout / abortBody above.
|
|
20
158
|
export async function retryableFetch(
|
|
21
159
|
url,
|
|
22
160
|
opts = {},
|
|
23
161
|
label,
|
|
24
|
-
{ timeoutMs = 30_000, bodyFactory = null } = {},
|
|
162
|
+
{ timeoutMs = 30_000, bodyFactory = null, keepBodyTimeout = false } = {},
|
|
25
163
|
) {
|
|
26
164
|
const delays = [100, 500];
|
|
27
165
|
let lastErr = null;
|
|
@@ -36,6 +174,7 @@ export async function retryableFetch(
|
|
|
36
174
|
}
|
|
37
175
|
const controller = new AbortController();
|
|
38
176
|
const timer = setTimeout(() => controller.abort(), timeoutMs);
|
|
177
|
+
let handedOff = false;
|
|
39
178
|
try {
|
|
40
179
|
const fetchOpts = { ...opts, signal: controller.signal };
|
|
41
180
|
if (bodyFactory) {
|
|
@@ -45,7 +184,14 @@ export async function retryableFetch(
|
|
|
45
184
|
fetchOpts.duplex = "half";
|
|
46
185
|
}
|
|
47
186
|
const res = await fetch(url, fetchOpts);
|
|
48
|
-
if (res.ok)
|
|
187
|
+
if (res.ok) {
|
|
188
|
+
if (keepBodyTimeout) {
|
|
189
|
+
// Keep the timer armed until the caller consumes the body.
|
|
190
|
+
handedOff = true;
|
|
191
|
+
bodyTimers.set(res, { timer, controller });
|
|
192
|
+
}
|
|
193
|
+
return res;
|
|
194
|
+
}
|
|
49
195
|
if (res.status === 429 || (res.status >= 500 && res.status < 600)) {
|
|
50
196
|
lastRes = res;
|
|
51
197
|
continue;
|
|
@@ -55,9 +201,60 @@ export async function retryableFetch(
|
|
|
55
201
|
lastErr = e;
|
|
56
202
|
continue;
|
|
57
203
|
} finally {
|
|
58
|
-
clearTimeout(timer);
|
|
204
|
+
if (!handedOff) clearTimeout(timer);
|
|
59
205
|
}
|
|
60
206
|
}
|
|
61
207
|
if (lastRes) return lastRes;
|
|
62
208
|
throw lastErr;
|
|
63
209
|
}
|
|
210
|
+
|
|
211
|
+
export const MAX_DOWNLOAD_REDIRECTS = 5;
|
|
212
|
+
|
|
213
|
+
// Fetch a download target with manual redirect handling and an SSRF guard.
|
|
214
|
+
// Every hop (the initial URL and each Location target) is re-validated with
|
|
215
|
+
// assertAllowedTarget before it is fetched, so a 3xx to an unvalidated or
|
|
216
|
+
// private host is rejected instead of silently followed. Redirect bodies are
|
|
217
|
+
// drained between hops. Returns the final (non-redirect) Response; the caller
|
|
218
|
+
// owns the body (use keepBodyTimeout semantics: releaseBodyTimeout when done).
|
|
219
|
+
/**
 * GET a download target, following redirects by hand.
 *
 * Each URL (the initial one and every redirect target) is passed to
 * `assertAllowedTarget` before it is requested via `retryableFetch` with
 * `redirect: "manual"` and `keepBodyTimeout: true`. A 3xx response carrying a
 * `Location` header has its body drained and is followed; any other response
 * (including a 3xx without `Location`) is returned to the caller as-is.
 *
 * @param {string} url - Initial target.
 * @param {object} [options]
 * @param {number} [options.timeoutMs=30000] - Per-request timeout handed to `retryableFetch`.
 * @param {((host: string) => boolean)|null} [options.validateHost] - Forwarded to `assertAllowedTarget`.
 * @param {boolean} [options.allowPrivateIp=false] - Forwarded to `assertAllowedTarget`.
 * @param {number} [options.maxRedirects=MAX_DOWNLOAD_REDIRECTS] - Redirects to follow before giving up.
 * @param {string} [options.label] - Label forwarded to `retryableFetch`.
 * @returns {Promise<object>} The final, non-followed response.
 * @throws {Error} When a target is rejected, the redirect limit is exceeded, or
 *   a `Location` value cannot be resolved against the current URL.
 */
export async function guardedDownloadFetch(
  url,
  {
    timeoutMs = 30_000,
    validateHost = null,
    allowPrivateIp = false,
    maxRedirects = MAX_DOWNLOAD_REDIRECTS,
    label,
  } = {},
) {
  let target = url;
  let followed = 0;
  while (true) {
    await assertAllowedTarget(target, { validateHost, allowPrivateIp });
    const response = await retryableFetch(
      target,
      { method: "GET", redirect: "manual" },
      label,
      { timeoutMs, keepBodyTimeout: true },
    );

    const isRedirect = response.status >= 300 && response.status < 400;
    const location = isRedirect ? response.headers?.get?.("location") : null;
    if (!location) return response;

    // A 3xx is not `ok`, so retryableFetch never handed its timer off; only
    // the body needs releasing before the next hop.
    await drainResponseBody(response);
    if (followed >= maxRedirects) {
      throw new Error(`too many redirects (> ${maxRedirects})`);
    }
    try {
      target = new URL(location, target).toString();
    } catch {
      throw new Error(`blocked target: unparseable redirect Location`);
    }
    followed += 1;
  }
}
|
package/lib/recording-manager.js
CHANGED
|
@@ -160,17 +160,32 @@ export function loadRecordingConfig() {
|
|
|
160
160
|
// WB_RECORDING_MASK_ALL_INPUTS default on; set "0" to record input values
|
|
161
161
|
// WB_RECORDING_MASK_TEXT_SELECTOR text content of matches → asterisks
|
|
162
162
|
// WB_RECORDING_BLOCK_SELECTOR matches recorded as inert placeholders
|
|
163
|
-
// WB_RECORDING_IGNORE_SELECTOR matches
|
|
163
|
+
// WB_RECORDING_IGNORE_SELECTOR matches excluded from the recording
|
|
164
|
+
//
|
|
165
|
+
// Privacy note: the vendored `vendor/rrweb-record.min.js` bundle supports
|
|
166
|
+
// `blockSelector` but NOT `ignoreSelector` — passing `ignoreSelector` to this
|
|
167
|
+
// build is a silent no-op, so an operator relying on it to drop a sensitive
|
|
168
|
+
// field would have that value recorded verbatim. To keep the "drop this field"
|
|
169
|
+
// promise on the shipped binary, WB_RECORDING_IGNORE_SELECTOR is folded into the
|
|
170
|
+
// (supported, and strictly stronger) `blockSelector`: a blocked element is not
|
|
171
|
+
// recorded at all — its subtree and inputs are never captured. The env var name
|
|
172
|
+
// is kept for compatibility; both selectors are unioned so neither is lost.
|
|
164
173
|
/**
 * Build the rrweb masking options from the WB_RECORDING_* environment.
 *
 * WB_RECORDING_BLOCK_SELECTOR and WB_RECORDING_IGNORE_SELECTOR are both
 * trimmed and, when non-empty, joined with ", " into a single `blockSelector`.
 * No `ignoreSelector` key is produced.
 *
 * @returns {{maskAllInputs: boolean, maskTextSelector: string|null, blockSelector: string|null}}
 *   `maskAllInputs` is false only when WB_RECORDING_MASK_ALL_INPUTS is exactly "0".
 */
export function loadMaskConfig() {
  // Trimmed env value, or null when unset/blank.
  const readSelector = (name) => (process.env[name] || "").trim() || null;

  const blocked = ["WB_RECORDING_BLOCK_SELECTOR", "WB_RECORDING_IGNORE_SELECTOR"]
    .map(readSelector)
    .filter(Boolean);

  const maskAllInputs = process.env.WB_RECORDING_MASK_ALL_INPUTS !== "0";
  const maskTextSelector = readSelector("WB_RECORDING_MASK_TEXT_SELECTOR");
  const blockSelector = blocked.length ? blocked.join(", ") : null;
  return { maskAllInputs, maskTextSelector, blockSelector };
}
|
|
176
191
|
|
|
@@ -253,9 +268,10 @@ export class RecordingManager {
|
|
|
253
268
|
};
|
|
254
269
|
if (mask.maskTextSelector)
|
|
255
270
|
recordOpts.maskTextSelector = mask.maskTextSelector;
|
|
271
|
+
// `blockSelector` already folds in WB_RECORDING_IGNORE_SELECTOR (see
|
|
272
|
+
// loadMaskConfig). We never pass `ignoreSelector` — the vendored rrweb
|
|
273
|
+
// bundle does not support it, so it would be silently dropped.
|
|
256
274
|
if (mask.blockSelector) recordOpts.blockSelector = mask.blockSelector;
|
|
257
|
-
if (mask.ignoreSelector)
|
|
258
|
-
recordOpts.ignoreSelector = mask.ignoreSelector;
|
|
259
275
|
const recordOptsJson = JSON.stringify(recordOpts);
|
|
260
276
|
const bootstrap = `
|
|
261
277
|
;(function(){
|
|
@@ -138,9 +138,10 @@ export function parseSignedConfig(raw) {
|
|
|
138
138
|
export function pickSignedCandidate(candidates, opts = {}) {
|
|
139
139
|
const hosts = opts.hosts || [];
|
|
140
140
|
const jsonFields = opts.jsonFields || null;
|
|
141
|
-
const forced = opts.enabled === true;
|
|
142
141
|
for (const cand of candidates || []) {
|
|
143
142
|
for (const u of cand.urls || []) {
|
|
143
|
+
// json_fields only *filters* which fields are inspected — it never
|
|
144
|
+
// bypasses the host check below.
|
|
144
145
|
if (jsonFields && !jsonFields.includes(leafField(u.field)) && !jsonFields.includes(u.field))
|
|
145
146
|
continue;
|
|
146
147
|
let host = "";
|
|
@@ -153,12 +154,12 @@ export function pickSignedCandidate(candidates, opts = {}) {
|
|
|
153
154
|
hosts.length > 0 &&
|
|
154
155
|
hosts.some((h) => host === h || host.endsWith(`.${h}`));
|
|
155
156
|
const looksSigned = isSignedHost(host);
|
|
156
|
-
//
|
|
157
|
-
//
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
157
|
+
// A host match is ALWAYS required: a recognized signed host (auto mode)
|
|
158
|
+
// or an explicit `hosts` allowlist entry. Forced mode (`enabled: true`)
|
|
159
|
+
// does not relax this — it only opts the feature on; an author who needs
|
|
160
|
+
// an unrecognized host must name it in `hosts`. This closes the SSRF gap
|
|
161
|
+
// where forced + json_fields accepted an arbitrary host.
|
|
162
|
+
const accept = hostAllowed || looksSigned;
|
|
162
163
|
if (accept) {
|
|
163
164
|
return { url: u.url, field: u.field, api_url: cand.api_url || null, host };
|
|
164
165
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "wb-browser-runtime",
|
|
3
|
-
"version": "0.14.
|
|
3
|
+
"version": "0.14.1",
|
|
4
4
|
"description": "Browser sidecar runtime for wb — Playwright over CDP (Browserbase, browser-use) via the wb-sidecar/1 line-framed JSON protocol.",
|
|
5
5
|
"bin": {
|
|
6
6
|
"wb-browser-runtime": "bin/wb-browser-runtime.js"
|
package/verbs/download.js
CHANGED
|
@@ -18,7 +18,8 @@
|
|
|
18
18
|
|
|
19
19
|
import path from "node:path";
|
|
20
20
|
import { Buffer } from "node:buffer";
|
|
21
|
-
import { promises as fsPromises } from "node:fs";
|
|
21
|
+
import { promises as fsPromises, createWriteStream } from "node:fs";
|
|
22
|
+
import { once } from "node:events";
|
|
22
23
|
import { send } from "../lib/io.js";
|
|
23
24
|
import {
|
|
24
25
|
uniquePathInside,
|
|
@@ -26,19 +27,57 @@ import {
|
|
|
26
27
|
extensionAllowed,
|
|
27
28
|
} from "../lib/util.js";
|
|
28
29
|
import { HANDLED_MARK } from "../lib/download-capture.js";
|
|
29
|
-
import {
|
|
30
|
+
import {
|
|
31
|
+
guardedDownloadFetch,
|
|
32
|
+
releaseBodyTimeout,
|
|
33
|
+
abortBody,
|
|
34
|
+
drainResponseBody,
|
|
35
|
+
} from "../lib/http.js";
|
|
30
36
|
import {
|
|
31
37
|
SIGNED_PAGE_HOOK,
|
|
32
38
|
SIGNED_POLL_SCRIPT,
|
|
33
39
|
parseSignedConfig,
|
|
34
40
|
pickSignedCandidate,
|
|
35
41
|
redactSignedUrl,
|
|
42
|
+
isSignedHost,
|
|
36
43
|
} from "../lib/signed-url-capture.js";
|
|
37
44
|
|
|
38
45
|
const DEFAULT_TIMEOUT_MS = 10_000;
|
|
39
46
|
const POLL_INTERVAL_MS = 50;
|
|
40
47
|
const FALLBACK_NAME = "download.bin";
|
|
41
48
|
|
|
49
|
+
// Hard cap on signed-URL download size to bound memory/disk. A lying or absent
|
|
50
|
+
// Content-Length can't bypass it: the stream is aborted once bytes exceed it.
|
|
51
|
+
// Override for tests/ops via WB_MAX_DOWNLOAD_BYTES.
|
|
52
|
+
const MAX_SIGNED_DOWNLOAD_BYTES = 512 * 1024 * 1024; // 512 MiB
|
|
53
|
+
|
|
54
|
+
/**
 * Resolve the signed-URL download size cap in bytes.
 *
 * @returns {number} `Number(WB_MAX_DOWNLOAD_BYTES)` when that is a finite value
 *   greater than zero; otherwise `MAX_SIGNED_DOWNLOAD_BYTES`.
 */
function maxDownloadBytes() {
  const override = Number(process.env.WB_MAX_DOWNLOAD_BYTES);
  if (!Number.isFinite(override) || override <= 0) {
    return MAX_SIGNED_DOWNLOAD_BYTES;
  }
  return override;
}
|
|
58
|
+
|
|
59
|
+
// Test/ops escape hatch: by default the SSRF guard rejects targets that resolve
|
|
60
|
+
// to private/loopback IPs. A local test server lives on 127.0.0.1, so the guard
|
|
61
|
+
// must be opt-out-able for those tests. Production leaves this unset.
|
|
62
|
+
/**
 * Whether WB_ALLOW_PRIVATE_DOWNLOAD_IP opts out of the private-IP download guard.
 *
 * @returns {boolean} true when the variable, compared case-insensitively, is
 *   exactly "1", "true", "yes" or "on"; false otherwise (including unset).
 */
function privateDownloadIpAllowed() {
  const flag = String(process.env.WB_ALLOW_PRIVATE_DOWNLOAD_IP || "").toLowerCase();
  return ["1", "true", "yes", "on"].includes(flag);
}
|
|
66
|
+
|
|
67
|
+
// Build the host validator applied to the initial signed URL *and* every
|
|
68
|
+
// redirect hop — the same gate pickSignedCandidate uses, so a redirect can't
|
|
69
|
+
// escape to a host the picker would never have selected.
|
|
70
|
+
/**
 * Create the host predicate used for the signed URL and each redirect hop.
 *
 * The returned function lowercases its argument and accepts it when it equals,
 * or is a dot-separated subdomain of, an entry in `signedCfg.hosts`; otherwise
 * the decision is delegated to `isSignedHost`. Empty/falsy hosts are rejected.
 *
 * @param {{hosts?: string[]}|null|undefined} signedCfg - Signed-URL config;
 *   a missing config or `hosts` list means "no explicit allowlist".
 * @returns {(host: string) => boolean} Host validator.
 */
function makeSignedHostValidator(signedCfg) {
  const allowlist = signedCfg?.hosts || [];
  const onAllowlist = (name) =>
    allowlist.some((entry) => name === entry || name.endsWith(`.${entry}`));

  return (host) => {
    if (!host) return false;
    const name = String(host).toLowerCase();
    if (allowlist.length > 0 && onAllowlist(name)) return true;
    return isSignedHost(name);
  };
}
|
|
80
|
+
|
|
42
81
|
// Page-side hook that traps blob/data-URL anchor clicks the SPA performs
|
|
43
82
|
// programmatically — `URL.createObjectURL(blob)` + `<a download>` + `.click()`.
|
|
44
83
|
// Playwright's own `download` event normally catches these, but a handful
|
|
@@ -247,6 +286,7 @@ export default {
|
|
|
247
286
|
page,
|
|
248
287
|
ctx,
|
|
249
288
|
timeout,
|
|
289
|
+
signedCfg,
|
|
250
290
|
});
|
|
251
291
|
}
|
|
252
292
|
|
|
@@ -417,6 +457,7 @@ async function saveSignedUrlDownload({
|
|
|
417
457
|
page,
|
|
418
458
|
ctx,
|
|
419
459
|
timeout,
|
|
460
|
+
signedCfg,
|
|
420
461
|
}) {
|
|
421
462
|
const redacted = redactSignedUrl(signed.url);
|
|
422
463
|
// Filename: explicit path: wins, else the signed URL's basename, else a
|
|
@@ -439,18 +480,8 @@ async function saveSignedUrlDownload({
|
|
|
439
480
|
}
|
|
440
481
|
await fsPromises.mkdir(artifactsDir, { recursive: true });
|
|
441
482
|
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
// never echo signed credentials.
|
|
445
|
-
let res;
|
|
446
|
-
try {
|
|
447
|
-
res = await retryableFetch(
|
|
448
|
-
signed.url,
|
|
449
|
-
{ method: "GET" },
|
|
450
|
-
`signed-url download (${redacted})`,
|
|
451
|
-
{ timeoutMs: timeout },
|
|
452
|
-
);
|
|
453
|
-
} catch (e) {
|
|
483
|
+
const maxBytes = maxDownloadBytes();
|
|
484
|
+
const failed = (extra, reason) =>
|
|
454
485
|
send({
|
|
455
486
|
type: "slice.download_failed",
|
|
456
487
|
verb: "download",
|
|
@@ -459,58 +490,161 @@ async function saveSignedUrlDownload({
|
|
|
459
490
|
api_url: signed.api_url,
|
|
460
491
|
signed_url: redacted,
|
|
461
492
|
page_url: safePageUrl(page),
|
|
462
|
-
|
|
493
|
+
...extra,
|
|
494
|
+
reason,
|
|
463
495
|
});
|
|
496
|
+
|
|
497
|
+
// Fetch the signed URL from the sidecar (not the page) so the object store's
|
|
498
|
+
// CORS policy doesn't block the read. Redirects are followed *manually* with
|
|
499
|
+
// the same host allowlist + private-IP block applied to every hop (SSRF), and
|
|
500
|
+
// the body-read timeout stays armed until we finish streaming. The label is
|
|
501
|
+
// redacted — retry logs must never echo signed credentials.
|
|
502
|
+
let res;
|
|
503
|
+
try {
|
|
504
|
+
res = await guardedDownloadFetch(signed.url, {
|
|
505
|
+
timeoutMs: timeout,
|
|
506
|
+
validateHost: makeSignedHostValidator(signedCfg),
|
|
507
|
+
allowPrivateIp: privateDownloadIpAllowed(),
|
|
508
|
+
label: `signed-url download (${redacted})`,
|
|
509
|
+
});
|
|
510
|
+
} catch (e) {
|
|
511
|
+
failed({}, `signed url fetch error: ${e?.message || e}`);
|
|
464
512
|
throw new Error(
|
|
465
513
|
`download: signed URL fetch failed for ${redacted}: ${e?.message || e}`,
|
|
466
514
|
);
|
|
467
515
|
}
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
516
|
+
|
|
517
|
+
try {
|
|
518
|
+
if (!res.ok) {
|
|
519
|
+
// A 403 on a pre-signed URL almost always means the token expired before
|
|
520
|
+
// we fetched it — call that out so the operator knows to shorten the gap.
|
|
521
|
+
const expired = res.status === 403;
|
|
522
|
+
await drainResponseBody(res); // don't leak the socket
|
|
523
|
+
failed(
|
|
524
|
+
{ http_status: res.status, expired },
|
|
525
|
+
`signed url fetch: HTTP ${res.status}${expired ? " (likely expired)" : ""}`,
|
|
526
|
+
);
|
|
527
|
+
throw new Error(
|
|
528
|
+
`download: signed URL fetch returned HTTP ${res.status} for ${redacted}${expired ? " (likely expired)" : ""}`,
|
|
529
|
+
);
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
// Reject up front when the server *declares* an oversized body...
|
|
533
|
+
const clRaw = safeHeader(res, "content-length");
|
|
534
|
+
const cl = clRaw == null ? NaN : Number(clRaw);
|
|
535
|
+
if (Number.isFinite(cl) && cl > maxBytes) {
|
|
536
|
+
await drainResponseBody(res);
|
|
537
|
+
failed(
|
|
538
|
+
{ content_length: cl, max_bytes: maxBytes },
|
|
539
|
+
`signed url body too large: Content-Length ${cl} > cap ${maxBytes}`,
|
|
540
|
+
);
|
|
541
|
+
throw new Error(
|
|
542
|
+
`download: signed URL body exceeds size cap (${cl} > ${maxBytes} bytes) for ${redacted}`,
|
|
543
|
+
);
|
|
544
|
+
}
|
|
545
|
+
|
|
546
|
+
// ...and enforce while streaming so a lying/absent Content-Length can't slip
|
|
547
|
+
// past. Streams to disk rather than materializing the whole body in memory.
|
|
548
|
+
let bytes;
|
|
549
|
+
try {
|
|
550
|
+
bytes = await streamToFileWithCap(res, target, maxBytes);
|
|
551
|
+
} catch (e) {
|
|
552
|
+
if (e && e.code === "WB_SIZE_CAP") {
|
|
553
|
+
failed(
|
|
554
|
+
{ max_bytes: maxBytes },
|
|
555
|
+
`signed url body exceeded size cap of ${maxBytes} bytes mid-stream`,
|
|
556
|
+
);
|
|
557
|
+
throw new Error(
|
|
558
|
+
`download: signed URL body exceeded size cap (${maxBytes} bytes) for ${redacted}`,
|
|
559
|
+
);
|
|
560
|
+
}
|
|
561
|
+
failed({}, `signed url body read error: ${e?.message || e}`);
|
|
562
|
+
throw new Error(
|
|
563
|
+
`download: signed URL body read failed for ${redacted}: ${e?.message || e}`,
|
|
564
|
+
);
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
const contentType = safeHeader(res, "content-type");
|
|
568
|
+
const contentDisposition = safeHeader(res, "content-disposition");
|
|
472
569
|
send({
|
|
473
|
-
type: "slice.
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
570
|
+
type: "slice.artifact_saved",
|
|
571
|
+
filename: path.basename(target),
|
|
572
|
+
path: target,
|
|
573
|
+
bytes,
|
|
574
|
+
source: "download",
|
|
575
|
+
provenance: {
|
|
576
|
+
url: null,
|
|
577
|
+
signed_url: redacted,
|
|
578
|
+
api_url: signed.api_url,
|
|
579
|
+
field: signed.field,
|
|
580
|
+
suggested_filename: suggested,
|
|
581
|
+
page_url: safePageUrl(page),
|
|
582
|
+
verb_index: ctx?.index ?? null,
|
|
583
|
+
verb_name: "download",
|
|
584
|
+
capture: "signed_url",
|
|
585
|
+
content_type: contentType,
|
|
586
|
+
content_disposition: contentDisposition,
|
|
587
|
+
ts: Date.now(),
|
|
588
|
+
},
|
|
483
589
|
});
|
|
484
|
-
|
|
485
|
-
|
|
590
|
+
return `→ ${path.basename(target)}`;
|
|
591
|
+
} finally {
|
|
592
|
+
// Body fully consumed (or we bailed) — disarm the body-read timeout.
|
|
593
|
+
releaseBodyTimeout(res);
|
|
594
|
+
}
|
|
595
|
+
}
|
|
596
|
+
|
|
597
|
+
// Stream a response body to disk, counting bytes and aborting the in-flight
|
|
598
|
+
// read once the cap is exceeded (so an absent/under-stated Content-Length can't
|
|
599
|
+
// OOM us). Removes the partial file on any error. Returns total bytes written.
|
|
600
|
+
/**
 * Stream `res.body` into `target`, enforcing a hard byte cap.
 *
 * On any failure the response body is released (abort + cancel), the write
 * stream is closed, the partial file is removed, and the original error is
 * rethrown.
 *
 * @param {object} res - Response whose `body` is a web ReadableStream (or
 *   null/absent, in which case an empty file is written).
 * @param {string} target - Destination file path.
 * @param {number} maxBytes - Maximum number of body bytes to accept.
 * @returns {Promise<number>} Total bytes written.
 * @throws {Error} With `code === "WB_SIZE_CAP"` when the body exceeds
 *   `maxBytes`; otherwise the underlying read or write error.
 */
async function streamToFileWithCap(res, target, maxBytes) {
  const reader = res.body?.getReader?.();
  const ws = createWriteStream(target);

  // Track stream failure/closure from the start. Without a persistent 'error'
  // listener an asynchronous open/write failure (EISDIR, EACCES, ENOSPC, …)
  // surfaces as an unhandled 'error' event and takes the process down.
  let writeError = null;
  let closed = false;
  ws.on("error", (e) => {
    writeError ??= e;
  });
  ws.once("close", () => {
    closed = true;
  });

  // Flush and close the file; rejects if the stream failed.
  const finish = () =>
    new Promise((resolve, reject) =>
      ws.end((e) => (e ? reject(e) : resolve())),
    );

  let total = 0;
  try {
    if (!reader) {
      // No body to read (e.g. 204) — write an empty file.
      await finish();
      return 0;
    }
    for (;;) {
      const { done, value } = await reader.read();
      // A write may have failed while we were waiting on the network.
      if (writeError) throw writeError;
      if (done) break;
      total += value.byteLength;
      if (total > maxBytes) {
        const err = new Error(`size cap exceeded`);
        err.code = "WB_SIZE_CAP";
        throw err; // the catch below aborts the read and cleans up
      }
      if (!ws.write(Buffer.from(value))) {
        await once(ws, "drain");
      }
    }
    await finish();
    return total;
  } catch (e) {
    // Release the response body whatever went wrong, so neither a tripped cap
    // nor a disk error leaves the socket streaming into nothing.
    abortBody(res);
    try {
      await reader?.cancel();
    } catch {
      // Best-effort: the stream may already be errored or closed.
    }
    // Await the stream's close before unlinking: createWriteStream opens its fd
    // asynchronously, so unlinking eagerly can race the (lazy) open and leave a
    // resurrected empty file behind. If the stream has ALREADY closed (it
    // auto-destroys on error) there is no further 'close' event to wait for —
    // waiting anyway would hang forever.
    if (!closed) {
      await new Promise((resolve) => {
        ws.once("close", resolve);
        ws.destroy();
      });
    }
    try {
      await fsPromises.unlink(target);
    } catch {
      // Nothing to remove (never created, or target is not a regular file).
    }
    throw e;
  }
}
|
|
515
649
|
|
|
516
650
|
async function pollForBlob(page, timeoutMs, stop) {
|
package/verbs/goto.js
CHANGED
|
@@ -1,10 +1,16 @@
|
|
|
1
|
+
import { scrubSecrets } from "../lib/substitution.js";
|
|
2
|
+
|
|
1
3
|
/**
 * `goto` verb: navigate the page to `args.url` and report where it landed.
 */
export default {
  name: "goto",
  primaryKey: "url",
  /**
   * @param {object} page - Page handle exposing `goto()` and `url()`.
   * @param {object} args - Verb arguments: `url` (default ""), `wait_until`
   *   (default "domcontentloaded"), `timeout` in ms (default 30000).
   * @param {object} [ctx] - Verb context; `ctx.secrets` is handed to `scrubSecrets`.
   * @returns {Promise<string>} Summary `→ <page url>` after `scrubSecrets`.
   */
  async execute(page, args, ctx) {
    const destination = args.url ?? "";
    const navigation = {
      waitUntil: args.wait_until ?? "domcontentloaded",
      timeout: args.timeout ?? 30_000,
    };
    await page.goto(destination, navigation);
    // The resolved URL can carry a substituted secret (e.g.
    // ?token={{ env.TOKEN }}), so it is passed through scrubSecrets before it
    // is returned as the verb summary.
    const landedAt = scrubSecrets(page.url(), ctx?.secrets);
    return `→ ${landedAt}`;
  },
};
|