wb-browser-runtime 0.13.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +75 -4
- package/bin/wb-browser-runtime.js +29 -106
- package/lib/recording-manager.js +39 -4
- package/lib/signed-url-capture.js +275 -0
- package/lib/substitution.js +128 -0
- package/package.json +1 -1
- package/verbs/download.js +205 -19
package/README.md
CHANGED
|
@@ -122,6 +122,8 @@ Verb arguments support two substitutions at dispatch time:
|
|
|
122
122
|
|
|
123
123
|
Both forms are redacted in stdout summaries — only the verb name + selector make it into the log. Expanded values are also scrubbed from `verb.failed` / `slice.failed` error messages before they cross the stdio boundary.
|
|
124
124
|
|
|
125
|
+
**Escaping.** To emit a literal `{{ … }}` that should *not* be substituted, prefix it with a backslash: `\{{ env.X }}` is emitted as the literal text `{{ env.X }}`. Escapes and substitutions are resolved in a single left-to-right pass, so the braces an escape produces are never re-scanned as a substitution.
|
|
126
|
+
|
|
125
127
|
**Missing-value policy.** Set `WB_SUBSTITUTION_ON_MISSING` to choose how a missing `env.X` or `artifacts.X` is handled:
|
|
126
128
|
|
|
127
129
|
- `warn` (default) — log a stderr warning and substitute an empty string; the verb continues.
|
|
@@ -166,10 +168,16 @@ endpoint at session close. Recording is **off by default** — set
|
|
|
166
168
|
| `WB_RECORDING_SCREENCAST_QUALITY` | `60` | JPEG quality (0–100). |
|
|
167
169
|
| `WB_RECORDING_RRWEB` | `1` | Set `0` to skip rrweb even if recording is on. |
|
|
168
170
|
| `WB_RECORDING_VIDEO` | `0` if no `ffmpeg` | Set `0` to skip video even if `ffmpeg` is present. |
|
|
171
|
+
| `WB_RECORDING_MASK_ALL_INPUTS` | `1` | rrweb `maskAllInputs`. Masking is on by default for safety; set `0` to record input *values*. |
|
|
172
|
+
| `WB_RECORDING_MASK_TEXT_SELECTOR` | *(unset)* | CSS selector whose **text content** rrweb masks (e.g. `.ssn, .acct-balance`). |
|
|
173
|
+
| `WB_RECORDING_BLOCK_SELECTOR` | *(unset)* | CSS selector rrweb records as an inert placeholder (contents never captured). |
|
|
174
|
+
| `WB_RECORDING_IGNORE_SELECTOR` | *(unset)* | CSS selector whose input events rrweb drops entirely. |
|
|
169
175
|
|
|
170
176
|
Artifacts are two parallel POSTs per session, `kind ∈ {rrweb, video}`:
|
|
171
177
|
|
|
172
|
-
- **rrweb** — gzipped JSON (`application/json+gzip`) — `{ run_id, session, event_count, events: [...] }`. DOM mutations + input events captured from every page
|
|
178
|
+
- **rrweb** — gzipped JSON (`application/json+gzip`) — `{ run_id, session, event_count, events: [...] }`. DOM mutations + input events captured from every page.
|
|
179
|
+
|
|
180
|
+
**PII scope — read this.** `maskAllInputs` (on by default) only redacts the *values* a user types into form fields. It does **not** mask field labels, placeholders, `aria-label`s, `<option>` text, or any other rendered text, and it does not alter the recorded DOM structure. A displayed account number, balance, or name that is page text — not an input value — is captured verbatim. For those, point rrweb at the sensitive nodes with `WB_RECORDING_MASK_TEXT_SELECTOR` (mask the text) or `WB_RECORDING_BLOCK_SELECTOR` (omit the subtree). When in doubt, block the region.
|
|
173
181
|
- **video** — VP9 WebM (`video/webm`) — encoded from JPEG screencast frames via `ffmpeg`. Requires `ffmpeg` on `$PATH` (droplet install: `apt-get install -y ffmpeg`). If `ffmpeg` is missing, the video kind is silently disabled and rrweb continues alone.
|
|
174
182
|
|
|
175
183
|
Each POST carries headers `Authorization: Bearer <secret>`,
|
|
@@ -208,7 +216,7 @@ example, see the `browserbase-hn-upvoted-probe` runbook in the xatabase repo.
|
|
|
208
216
|
| `assert` | `assert: <selector>` | `selector`, `text_contains`, `url_contains` |
|
|
209
217
|
| `eval` | `eval: <js>` | `script` |
|
|
210
218
|
| `save` | `save: <name>` | `name`, `value` (captures prior `extract`/`eval` when omitted) |
|
|
211
|
-
| `download` | `download: <selector>` | `selector`, `path`, `timeout`, `text_fallback` (clicks + races Playwright `download` event
|
|
219
|
+
| `download` | `download: <selector>` | `selector`, `path`, `timeout`, `text_fallback`, `signed_url` (clicks + races Playwright `download` event, in-page blob/anchor capture, and signed-URL export capture; saves into `$WB_ARTIFACTS_DIR/<path>`) |
|
|
212
220
|
|
|
213
221
|
`extract`'s `fields` entries are either a CSS selector string (returns
|
|
214
222
|
`textContent`), or `{ selector, attr }` to read an attribute.
|
|
@@ -308,9 +316,55 @@ Behaviour:
|
|
|
308
316
|
doesn't double-save.
|
|
309
317
|
- Emits `slice.artifact_saved` with `source: "download"` and
|
|
310
318
|
`provenance.verb_name: "download"`.
|
|
311
|
-
- On timeout: throws with diagnostics (page URL, selector,
|
|
319
|
+
- On timeout: throws with diagnostics (page URL, selector, all
|
|
312
320
|
failure reasons) AND emits a `slice.download_failed` frame.
|
|
313
321
|
|
|
322
|
+
#### Signed-URL export capture
|
|
323
|
+
|
|
324
|
+
Some SaaS "Download" buttons never trip a Playwright `download` event or an
|
|
325
|
+
in-page Blob. Instead the click calls a same-origin API that returns JSON like
|
|
326
|
+
`{ "download_url": "https://bucket.s3.amazonaws.com/…?<signed>" }` and then
|
|
327
|
+
navigates to that URL — and a page-side `fetch(signedUrl)` fails because the
|
|
328
|
+
object store's CORS policy won't let the app origin read the bytes.
|
|
329
|
+
|
|
330
|
+
The `download:` verb adds a **third** capture racer for this: it wraps the
|
|
331
|
+
page's `fetch`/`XHR` around the click, inspects small same-origin JSON
|
|
332
|
+
responses for URL-looking fields, and when it finds one pointing at a
|
|
333
|
+
recognized object-store host (S3, GCS, CloudFront, Azure Blob, R2), it
|
|
334
|
+
downloads the bytes **from the sidecar process** (where CORS doesn't apply)
|
|
335
|
+
and saves them like any other artifact.
|
|
336
|
+
|
|
337
|
+
This is **on by default in `auto` mode** — it only fires when a recognized
|
|
338
|
+
signed host appears in a JSON response around the click, so a normal
|
|
339
|
+
Playwright/blob download is unaffected. Tune or disable it per verb:
|
|
340
|
+
|
|
341
|
+
```yaml
|
|
342
|
+
- download:
|
|
343
|
+
selector: 'button:has-text("Download as xlsx")'
|
|
344
|
+
path: pilot-profit-loss.xlsx
|
|
345
|
+
timeout: 10s
|
|
346
|
+
signed_url:
|
|
347
|
+
enabled: true # true | false | auto (default auto)
|
|
348
|
+
hosts: # extra non-recognized hosts to accept
|
|
349
|
+
- pilot-report-downloads.s3.amazonaws.com
|
|
350
|
+
json_fields: # restrict to these response field names
|
|
351
|
+
- download_url
|
|
352
|
+
```
|
|
353
|
+
|
|
354
|
+
- Set `signed_url: false` to turn the capture off entirely for a verb.
|
|
355
|
+
- In `auto` mode only recognized object-store hosts (or an explicit `hosts:`
|
|
356
|
+
entry) are fetched. With `enabled: true` an explicit `hosts:`/`json_fields:`
|
|
357
|
+
match is honored even for an unrecognized host, since you named it.
|
|
358
|
+
- The captured URL's **query string (where signed credentials live) is
|
|
359
|
+
redacted** everywhere it crosses the stdio boundary — `provenance.signed_url`
|
|
360
|
+
is `origin+path?<redacted>`; the full URL stays only in sidecar memory for the
|
|
361
|
+
fetch. Honors `WB_BROWSER_DOWNLOAD_EXTENSIONS`.
|
|
362
|
+
- The saved frame carries `provenance.capture: "signed_url"` plus `api_url`,
|
|
363
|
+
`field`, `content_type`, and `content_disposition`.
|
|
364
|
+
- A 403 on the signed URL (expired token) emits `slice.download_failed` with
|
|
365
|
+
`expired: true` and `http_status: 403` so the operator knows to shorten the
|
|
366
|
+
click→fetch gap.
|
|
367
|
+
|
|
314
368
|
## Protocol
|
|
315
369
|
|
|
316
370
|
Line-framed JSON, one message per line, on stdin/stdout. `stderr` is treated as
|
|
@@ -321,9 +375,26 @@ opaque diagnostics by `wb` and printed dimmed to the user's terminal.
|
|
|
321
375
|
```
|
|
322
376
|
wb → {"type": "hello", "wb_version": "...", "protocol": "wb-sidecar/1"}
|
|
323
377
|
wb ← {"type": "ready", "runtime": "wb-browser-runtime", "version": "...",
|
|
324
|
-
"protocol": "wb-sidecar/1", "
|
|
378
|
+
"protocol": "wb-sidecar/1", "min_protocol": "wb-sidecar/1",
|
|
379
|
+
"supports": ["goto", "click", "fill", ...],
|
|
380
|
+
"features": ["recording", "pause", "substitution",
|
|
381
|
+
"substitution_escape", "download_capture",
|
|
382
|
+
"signed_url_download"]}
|
|
325
383
|
```
|
|
326
384
|
|
|
385
|
+
The `ready` frame advertises capabilities so a client can feature-detect
|
|
386
|
+
without a hard-coded version→capability map:
|
|
387
|
+
|
|
388
|
+
- `protocol` — the wire version this runtime speaks.
|
|
389
|
+
- `min_protocol` — the oldest protocol version it can still interoperate with
|
|
390
|
+
(equal to `protocol` until a breaking frame change ships). A client speaking
|
|
391
|
+
an older protocol than `min_protocol` should refuse rather than guess.
|
|
392
|
+
- `supports` — the per-verb list (derived from the verb registry).
|
|
393
|
+
- `features` — coarse capability tokens above the verb list.
|
|
394
|
+
|
|
395
|
+
`version` is read from `package.json` at boot, so it can never drift from the
|
|
396
|
+
published version.
|
|
397
|
+
|
|
327
398
|
### Slice
|
|
328
399
|
|
|
329
400
|
```
|
|
@@ -25,9 +25,7 @@
|
|
|
25
25
|
|
|
26
26
|
import readline from "node:readline";
|
|
27
27
|
import { chromium } from "playwright-core";
|
|
28
|
-
import { readFileSync } from "node:fs";
|
|
29
28
|
import { send, log } from "../lib/io.js";
|
|
30
|
-
import { resolveInside } from "../lib/util.js";
|
|
31
29
|
import { SessionManager } from "../lib/session-manager.js";
|
|
32
30
|
import {
|
|
33
31
|
RecordingManager,
|
|
@@ -40,9 +38,31 @@ import {
|
|
|
40
38
|
classifyError,
|
|
41
39
|
} from "../lib/failure.js";
|
|
42
40
|
import { installDownloadCapture } from "../lib/download-capture.js";
|
|
41
|
+
import { expand, scrubSecrets } from "../lib/substitution.js";
|
|
43
42
|
import { SUPPORTS, runVerb, verbName } from "../verbs/index.js";
|
|
43
|
+
import pkg from "../package.json" with { type: "json" };
|
|
44
|
+
|
|
45
|
+
// Read the version from package.json so the `ready` frame can never drift from
|
|
46
|
+
// the published version (it used to be a hand-maintained literal that fell out
|
|
47
|
+
// of sync). Node >=24 supports JSON import attributes natively.
|
|
48
|
+
const VERSION = pkg.version;
|
|
49
|
+
|
|
50
|
+
// Protocol capability advertisement. `protocol` is the wire version we speak;
|
|
51
|
+
// `min_protocol` is the oldest version a peer may speak and still interoperate
|
|
52
|
+
// (we keep it equal to `protocol` until we ship a breaking frame change).
|
|
53
|
+
// `features` is a coarse capability list above the per-verb `supports` array —
|
|
54
|
+
// a client can feature-detect without hard-coding a version→capability map.
|
|
55
|
+
const PROTOCOL = "wb-sidecar/1";
|
|
56
|
+
const MIN_PROTOCOL = "wb-sidecar/1";
|
|
57
|
+
const FEATURES = [
|
|
58
|
+
"recording", // rrweb DOM capture + CDP screencast video
|
|
59
|
+
"pause", // pause_for_human operator handoff
|
|
60
|
+
"substitution", // {{ env.X }} / {{ artifacts.X }}
|
|
61
|
+
"substitution_escape", // \{{ literal-brace escape
|
|
62
|
+
"download_capture", // passive + explicit download artifact capture
|
|
63
|
+
"signed_url_download", // server-side fetch of in-JSON signed export URLs
|
|
64
|
+
];
|
|
44
65
|
|
|
45
|
-
const VERSION = "0.8.0";
|
|
46
66
|
const provider = getProvider();
|
|
47
67
|
log(`[provider] ${provider.name}`);
|
|
48
68
|
|
|
@@ -158,108 +178,9 @@ async function ensureSession(name, { profile, restoreSession } = {}) {
|
|
|
158
178
|
}
|
|
159
179
|
});
|
|
160
180
|
}
|
|
161
|
-
//
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
// Artifact names are bare identifiers — no dots, no slashes. Anything more
|
|
165
|
-
// exotic would invite path traversal once composed with WB_ARTIFACTS_DIR.
|
|
166
|
-
const ARTIFACT_RE = /\{\{\s*artifacts\.([A-Za-z_][A-Za-z0-9_-]*)\s*\}\}/g;
|
|
167
|
-
|
|
168
|
-
// Resolved once at module load. `warn` matches historical behavior
|
|
169
|
-
// (log + empty string, runbook continues). `error` throws so a missing OTP
|
|
170
|
-
// or env var fails the slice instead of silently sending an empty value
|
|
171
|
-
// into a Playwright action. `empty` is the silent variant.
|
|
172
|
-
const ON_MISSING = (() => {
|
|
173
|
-
const raw = (process.env.WB_SUBSTITUTION_ON_MISSING || "warn")
|
|
174
|
-
.trim()
|
|
175
|
-
.toLowerCase();
|
|
176
|
-
if (raw === "error" || raw === "empty" || raw === "warn") return raw;
|
|
177
|
-
log(
|
|
178
|
-
`[warn] WB_SUBSTITUTION_ON_MISSING=${raw} is not valid (warn|error|empty); defaulting to warn`,
|
|
179
|
-
);
|
|
180
|
-
return "warn";
|
|
181
|
-
})();
|
|
182
|
-
|
|
183
|
-
function handleMissingSubstitution(kind, name) {
|
|
184
|
-
const msg = `${kind}.${name} is not set`;
|
|
185
|
-
if (ON_MISSING === "error") {
|
|
186
|
-
throw new Error(`substitution: ${msg}`);
|
|
187
|
-
}
|
|
188
|
-
if (ON_MISSING === "warn") {
|
|
189
|
-
log(`[warn] ${msg}; substituting empty string`);
|
|
190
|
-
}
|
|
191
|
-
return "";
|
|
192
|
-
}
|
|
193
|
-
|
|
194
|
-
function readArtifactRaw(name) {
|
|
195
|
-
const dir = (process.env.WB_ARTIFACTS_DIR || "").trim();
|
|
196
|
-
if (!dir) {
|
|
197
|
-
log(`[warn] artifacts.${name} referenced but WB_ARTIFACTS_DIR is not set`);
|
|
198
|
-
return null;
|
|
199
|
-
}
|
|
200
|
-
for (const candidate of [`${name}.txt`, name]) {
|
|
201
|
-
const full = resolveInside(dir, candidate);
|
|
202
|
-
if (!full) continue;
|
|
203
|
-
try {
|
|
204
|
-
return readFileSync(full, "utf8").trimEnd();
|
|
205
|
-
} catch {
|
|
206
|
-
// try next candidate
|
|
207
|
-
}
|
|
208
|
-
}
|
|
209
|
-
return null;
|
|
210
|
-
}
|
|
211
|
-
|
|
212
|
-
function readArtifact(name, cache) {
|
|
213
|
-
if (cache && cache.has(name)) {
|
|
214
|
-
const hit = cache.get(name);
|
|
215
|
-
if (hit === null) return handleMissingSubstitution("artifacts", name);
|
|
216
|
-
return hit;
|
|
217
|
-
}
|
|
218
|
-
const v = readArtifactRaw(name);
|
|
219
|
-
if (cache) cache.set(name, v);
|
|
220
|
-
if (v === null) return handleMissingSubstitution("artifacts", name);
|
|
221
|
-
return v;
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
function expand(value, collected, artifactCache) {
|
|
225
|
-
if (typeof value === "string") {
|
|
226
|
-
return value
|
|
227
|
-
.replace(ENV_RE, (_, name) => {
|
|
228
|
-
const v = process.env[name];
|
|
229
|
-
if (v === undefined) return handleMissingSubstitution("env", name);
|
|
230
|
-
if (collected && v.length >= 3) collected.add(v);
|
|
231
|
-
return v;
|
|
232
|
-
})
|
|
233
|
-
.replace(ARTIFACT_RE, (_, name) => {
|
|
234
|
-
const v = readArtifact(name, artifactCache);
|
|
235
|
-
if (collected && v && v.length >= 3) collected.add(v);
|
|
236
|
-
return v;
|
|
237
|
-
});
|
|
238
|
-
}
|
|
239
|
-
if (Array.isArray(value))
|
|
240
|
-
return value.map((v) => expand(v, collected, artifactCache));
|
|
241
|
-
if (value && typeof value === "object") {
|
|
242
|
-
const out = {};
|
|
243
|
-
for (const [k, v] of Object.entries(value))
|
|
244
|
-
out[k] = expand(v, collected, artifactCache);
|
|
245
|
-
return out;
|
|
246
|
-
}
|
|
247
|
-
return value;
|
|
248
|
-
}
|
|
249
|
-
|
|
250
|
-
// Scrub any values that came from {{ env.X }} / {{ artifacts.X }} expansion
|
|
251
|
-
// out of error messages before they cross the stdio boundary — Playwright and
|
|
252
|
-
// fetch errors sometimes echo their inputs (URLs, script bodies, assertion
|
|
253
|
-
// text) and those inputs may contain credentials.
|
|
254
|
-
function scrubSecrets(msg, secrets) {
|
|
255
|
-
let out = String(msg == null ? "" : msg);
|
|
256
|
-
if (!secrets) return out;
|
|
257
|
-
for (const s of secrets) {
|
|
258
|
-
if (!s) continue;
|
|
259
|
-
out = out.split(s).join("«***»");
|
|
260
|
-
}
|
|
261
|
-
return out;
|
|
262
|
-
}
|
|
181
|
+
// {{ env.X }} / {{ artifacts.X }} substitution + `\{{` escape + secret scrubbing
|
|
182
|
+
// live in lib/substitution.js (extracted so they're unit-testable without
|
|
183
|
+
// booting the sidecar).
|
|
263
184
|
|
|
264
185
|
// --- Slice handler ----------------------------------------------------------
|
|
265
186
|
|
|
@@ -559,8 +480,10 @@ rl.on("line", (line) => {
|
|
|
559
480
|
type: "ready",
|
|
560
481
|
runtime: "wb-browser-runtime",
|
|
561
482
|
version: VERSION,
|
|
562
|
-
protocol:
|
|
483
|
+
protocol: PROTOCOL,
|
|
484
|
+
min_protocol: MIN_PROTOCOL,
|
|
563
485
|
supports: SUPPORTS,
|
|
486
|
+
features: FEATURES,
|
|
564
487
|
});
|
|
565
488
|
break;
|
|
566
489
|
case "slice":
|
package/lib/recording-manager.js
CHANGED
|
@@ -148,6 +148,29 @@ export function loadRecordingConfig() {
|
|
|
148
148
|
kinds,
|
|
149
149
|
rrwebSource,
|
|
150
150
|
rrwebMaxEvents,
|
|
151
|
+
mask: loadMaskConfig(),
|
|
152
|
+
};
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
// rrweb's `maskAllInputs` only redacts input *values* — labels, placeholders,
|
|
156
|
+
// aria-labels, option text, and the full DOM structure are still recorded. For
|
|
157
|
+
// genuinely sensitive regions (a displayed SSN, an account balance) the author
|
|
158
|
+
// must point rrweb at the offending nodes with a CSS selector. These knobs
|
|
159
|
+
// expose rrweb's selector options without hard-coding them:
|
|
160
|
+
// WB_RECORDING_MASK_ALL_INPUTS default on; set "0" to record input values
|
|
161
|
+
// WB_RECORDING_MASK_TEXT_SELECTOR text content of matches → asterisks
|
|
162
|
+
// WB_RECORDING_BLOCK_SELECTOR matches recorded as inert placeholders
|
|
163
|
+
// WB_RECORDING_IGNORE_SELECTOR matches' input events dropped entirely
|
|
164
|
+
export function loadMaskConfig() {
|
|
165
|
+
const sel = (name) => {
|
|
166
|
+
const v = (process.env[name] || "").trim();
|
|
167
|
+
return v || null;
|
|
168
|
+
};
|
|
169
|
+
return {
|
|
170
|
+
maskAllInputs: process.env.WB_RECORDING_MASK_ALL_INPUTS !== "0",
|
|
171
|
+
maskTextSelector: sel("WB_RECORDING_MASK_TEXT_SELECTOR"),
|
|
172
|
+
blockSelector: sel("WB_RECORDING_BLOCK_SELECTOR"),
|
|
173
|
+
ignoreSelector: sel("WB_RECORDING_IGNORE_SELECTOR"),
|
|
151
174
|
};
|
|
152
175
|
}
|
|
153
176
|
|
|
@@ -221,17 +244,29 @@ export class RecordingManager {
|
|
|
221
244
|
for (const e of batch) pushRrweb(e);
|
|
222
245
|
}
|
|
223
246
|
});
|
|
247
|
+
// Build rrweb record options from the resolved mask config. Selector
|
|
248
|
+
// options are omitted entirely when unset so we don't pass `null` into
|
|
249
|
+
// rrweb (which would match nothing but still allocate a matcher).
|
|
250
|
+
const mask = cfg.mask || { maskAllInputs: true };
|
|
251
|
+
const recordOpts = {
|
|
252
|
+
maskAllInputs: mask.maskAllInputs !== false,
|
|
253
|
+
};
|
|
254
|
+
if (mask.maskTextSelector)
|
|
255
|
+
recordOpts.maskTextSelector = mask.maskTextSelector;
|
|
256
|
+
if (mask.blockSelector) recordOpts.blockSelector = mask.blockSelector;
|
|
257
|
+
if (mask.ignoreSelector)
|
|
258
|
+
recordOpts.ignoreSelector = mask.ignoreSelector;
|
|
259
|
+
const recordOptsJson = JSON.stringify(recordOpts);
|
|
224
260
|
const bootstrap = `
|
|
225
261
|
;(function(){
|
|
226
262
|
if (window.__wbRrwebActive) return;
|
|
227
263
|
window.__wbRrwebActive = true;
|
|
228
264
|
window.__wbRrwebBuffer = [];
|
|
229
265
|
try {
|
|
230
|
-
rrwebRecord({
|
|
266
|
+
rrwebRecord(Object.assign({
|
|
231
267
|
emit: function(event){ window.__wbRrwebBuffer.push(event); },
|
|
232
|
-
sampling: { scroll: 150, media: 800, input: 'last' }
|
|
233
|
-
|
|
234
|
-
});
|
|
268
|
+
sampling: { scroll: 150, media: 800, input: 'last' }
|
|
269
|
+
}, ${recordOptsJson}));
|
|
235
270
|
} catch (e) { /* rrweb unavailable on this page (e.g. chrome://) */ }
|
|
236
271
|
var flush = function(){
|
|
237
272
|
var buf = window.__wbRrwebBuffer;
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
// Signed-URL export capture.
|
|
2
|
+
//
|
|
3
|
+
// Some SaaS "Download" buttons never produce a Playwright `download` event or
|
|
4
|
+
// an in-page Blob. Instead the app calls a same-origin API that returns JSON
|
|
5
|
+
// like `{ "download_url": "https://bucket.s3.amazonaws.com/...?<signed>" }`
|
|
6
|
+
// and then navigates to that URL. A page-side `fetch(signedUrl)` usually fails
|
|
7
|
+
// because the object store's CORS policy doesn't allow the app origin to read
|
|
8
|
+
// the bytes — so the only reliable place to fetch it is the sidecar process,
|
|
9
|
+
// where CORS doesn't apply.
|
|
10
|
+
//
|
|
11
|
+
// This module supplies:
|
|
12
|
+
// - SIGNED_PAGE_HOOK / SIGNED_POLL_SCRIPT — page-side instrumentation that
|
|
13
|
+
// wraps fetch + XHR, inspects small same-origin JSON responses, and stashes
|
|
14
|
+
// any http(s) URL fields it finds on `window.__wbSignedCandidates`.
|
|
15
|
+
// - pure helpers (isSignedHost, redactSignedUrl, extractUrlFields,
|
|
16
|
+
// pickSignedCandidate, parseSignedConfig) that the `download` verb uses to
|
|
17
|
+
// decide which captured URL to fetch server-side.
|
|
18
|
+
//
|
|
19
|
+
// The bytes are downloaded by the verb via lib/http.js's retryableFetch. Signed
|
|
20
|
+
// query credentials are redacted everywhere they cross the stdio boundary; the
|
|
21
|
+
// full URL lives only in sidecar memory for the duration of the fetch.
|
|
22
|
+
|
|
23
|
+
// Object-store / CDN hosts that hand out pre-signed, credential-bearing URLs.
|
|
24
|
+
// A match means "this looks like an export download, not a normal app API call"
|
|
25
|
+
// — the gate that keeps auto-mode from grabbing arbitrary URLs that merely appear in same-origin JSON responses.
|
|
26
|
+
const SIGNED_HOST_PATTERNS = [
|
|
27
|
+
// S3: s3.amazonaws.com, s3.us-east-1.amazonaws.com, s3-us-west-2.amazonaws.com,
|
|
28
|
+
// bucket.s3.amazonaws.com, bucket.s3.us-east-1.amazonaws.com
|
|
29
|
+
/(^|\.)s3([.-][a-z0-9-]+)?\.amazonaws\.com$/i,
|
|
30
|
+
// Google Cloud Storage
|
|
31
|
+
/(^|\.)storage\.googleapis\.com$/i,
|
|
32
|
+
/(^|\.)storage\.cloud\.google\.com$/i,
|
|
33
|
+
// CloudFront
|
|
34
|
+
/(^|\.)cloudfront\.net$/i,
|
|
35
|
+
// Azure Blob Storage
|
|
36
|
+
/\.blob\.core\.windows\.net$/i,
|
|
37
|
+
// Cloudflare R2
|
|
38
|
+
/\.r2\.cloudflarestorage\.com$/i,
|
|
39
|
+
];
|
|
40
|
+
|
|
41
|
+
export function isSignedHost(host) {
|
|
42
|
+
if (!host) return false;
|
|
43
|
+
const h = String(host).toLowerCase();
|
|
44
|
+
return SIGNED_HOST_PATTERNS.some((re) => re.test(h));
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
// Drop the query string (where signed credentials live) but keep origin + path
|
|
48
|
+
// so diagnostics stay useful. Falls back to a naive split when URL parsing
|
|
49
|
+
// fails so a malformed value still can't leak its query.
|
|
50
|
+
export function redactSignedUrl(url) {
|
|
51
|
+
const s = String(url || "");
|
|
52
|
+
try {
|
|
53
|
+
const u = new URL(s);
|
|
54
|
+
return u.search ? `${u.origin}${u.pathname}?<redacted>` : `${u.origin}${u.pathname}`;
|
|
55
|
+
} catch {
|
|
56
|
+
const i = s.indexOf("?");
|
|
57
|
+
return i >= 0 ? `${s.slice(0, i)}?<redacted>` : s;
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
// Recursively collect every http(s) string value in a parsed JSON object.
|
|
62
|
+
// Bounded in depth and count so a pathological response can't blow the stack
|
|
63
|
+
// or the buffer. Returns `[{ field, url }]` with dotted field paths.
|
|
64
|
+
export function extractUrlFields(data, maxDepth = 6, maxUrls = 30) {
|
|
65
|
+
const out = [];
|
|
66
|
+
const visit = (node, fieldPath, depth) => {
|
|
67
|
+
if (depth > maxDepth || out.length >= maxUrls) return;
|
|
68
|
+
if (typeof node === "string") {
|
|
69
|
+
if (/^https?:\/\//i.test(node)) out.push({ field: fieldPath, url: node });
|
|
70
|
+
return;
|
|
71
|
+
}
|
|
72
|
+
if (Array.isArray(node)) {
|
|
73
|
+
for (let i = 0; i < node.length; i++)
|
|
74
|
+
visit(node[i], `${fieldPath}[${i}]`, depth + 1);
|
|
75
|
+
return;
|
|
76
|
+
}
|
|
77
|
+
if (node && typeof node === "object") {
|
|
78
|
+
for (const k of Object.keys(node)) {
|
|
79
|
+
visit(node[k], fieldPath ? `${fieldPath}.${k}` : k, depth + 1);
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
};
|
|
83
|
+
visit(data, "", 0);
|
|
84
|
+
return out;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
// Leaf field name from a dotted/indexed path: "data.export.download_url" →
|
|
88
|
+
// "download_url", "files[0].url" → "url". Used to match against a caller's
|
|
89
|
+
// `json_fields` allowlist.
|
|
90
|
+
function leafField(fieldPath) {
|
|
91
|
+
const last = String(fieldPath || "")
|
|
92
|
+
.split(".")
|
|
93
|
+
.pop();
|
|
94
|
+
return last.replace(/\[\d+\]$/, "");
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
// Normalize `args.signed_url` into a resolved policy.
|
|
98
|
+
// undefined / "auto" → { enabled: "auto" } (recognized hosts only)
|
|
99
|
+
// false / { enabled: false } → { enabled: false } (feature off)
|
|
100
|
+
// true → { enabled: true }
|
|
101
|
+
// { enabled, hosts, json_fields } → that, normalized
|
|
102
|
+
// In "auto" mode only recognized signed hosts (or an explicit `hosts` entry)
|
|
103
|
+
// are captured. In forced mode (`enabled: true`) an explicit `hosts` or
|
|
104
|
+
// `json_fields` match is honored even for an unrecognized host, since the
|
|
105
|
+
// author asked for it by name.
|
|
106
|
+
export function parseSignedConfig(raw) {
|
|
107
|
+
const norm = (cfg) => ({
|
|
108
|
+
enabled: cfg.enabled,
|
|
109
|
+
hosts: Array.isArray(cfg.hosts) ? cfg.hosts.map((h) => String(h).toLowerCase()) : [],
|
|
110
|
+
jsonFields: cfg.jsonFields && cfg.jsonFields.length ? cfg.jsonFields.map(String) : null,
|
|
111
|
+
});
|
|
112
|
+
if (raw === false) return { enabled: false, hosts: [], jsonFields: null };
|
|
113
|
+
if (raw === true) return norm({ enabled: true });
|
|
114
|
+
if (raw == null) return norm({ enabled: "auto" });
|
|
115
|
+
if (typeof raw === "object") {
|
|
116
|
+
const enabled =
|
|
117
|
+
raw.enabled === undefined
|
|
118
|
+
? "auto"
|
|
119
|
+
: raw.enabled === true
|
|
120
|
+
? true
|
|
121
|
+
: raw.enabled === false
|
|
122
|
+
? false
|
|
123
|
+
: "auto";
|
|
124
|
+
if (enabled === false) return { enabled: false, hosts: [], jsonFields: null };
|
|
125
|
+
const jsonFields = Array.isArray(raw.json_fields)
|
|
126
|
+
? raw.json_fields
|
|
127
|
+
: Array.isArray(raw.jsonFields)
|
|
128
|
+
? raw.jsonFields
|
|
129
|
+
: null;
|
|
130
|
+
return norm({ enabled, hosts: raw.hosts, jsonFields });
|
|
131
|
+
}
|
|
132
|
+
return norm({ enabled: "auto" });
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
// Choose the best signed-URL candidate from the page-captured list, or null.
|
|
136
|
+
// `candidates` is the shape pushed by SIGNED_PAGE_HOOK:
|
|
137
|
+
// [{ api_url, urls: [{ field, url }], ts }]
|
|
138
|
+
export function pickSignedCandidate(candidates, opts = {}) {
|
|
139
|
+
const hosts = opts.hosts || [];
|
|
140
|
+
const jsonFields = opts.jsonFields || null;
|
|
141
|
+
const forced = opts.enabled === true;
|
|
142
|
+
for (const cand of candidates || []) {
|
|
143
|
+
for (const u of cand.urls || []) {
|
|
144
|
+
if (jsonFields && !jsonFields.includes(leafField(u.field)) && !jsonFields.includes(u.field))
|
|
145
|
+
continue;
|
|
146
|
+
let host = "";
|
|
147
|
+
try {
|
|
148
|
+
host = new URL(u.url).host.toLowerCase();
|
|
149
|
+
} catch {
|
|
150
|
+
continue;
|
|
151
|
+
}
|
|
152
|
+
const hostAllowed =
|
|
153
|
+
hosts.length > 0 &&
|
|
154
|
+
hosts.some((h) => host === h || host.endsWith(`.${h}`));
|
|
155
|
+
const looksSigned = isSignedHost(host);
|
|
156
|
+
// auto: only recognized signed hosts or an explicit hosts allowlist.
|
|
157
|
+
// forced: also honor a candidate the author selected via hosts/json_fields.
|
|
158
|
+
const accept =
|
|
159
|
+
hostAllowed ||
|
|
160
|
+
looksSigned ||
|
|
161
|
+
(forced && (hosts.length > 0 || jsonFields));
|
|
162
|
+
if (accept) {
|
|
163
|
+
return { url: u.url, field: u.field, api_url: cand.api_url || null, host };
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
return null;
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
// Page-side hook installed BEFORE the click. Wraps fetch + XHR to inspect small
|
|
171
|
+
// same-origin JSON responses and stash any http(s) URL fields. Idempotent and
|
|
172
|
+
// fail-open — any error in the wrapper falls through to the original call so the
|
|
173
|
+
// app keeps working. Mirrors the blob hook's "never uninstall" contract.
|
|
174
|
+
export const SIGNED_PAGE_HOOK = `(() => {
|
|
175
|
+
if (window.__wbSignedInstalled) return;
|
|
176
|
+
window.__wbSignedInstalled = true;
|
|
177
|
+
window.__wbSignedCandidates = [];
|
|
178
|
+
var MAX_BODY = 64 * 1024;
|
|
179
|
+
var MAX_CAND = 50;
|
|
180
|
+
|
|
181
|
+
var sameOrigin = function(u){
|
|
182
|
+
try { return new URL(u, location.href).origin === location.origin; }
|
|
183
|
+
catch (e) { return false; }
|
|
184
|
+
};
|
|
185
|
+
|
|
186
|
+
var collect = function(apiUrl, text){
|
|
187
|
+
try {
|
|
188
|
+
if (!text || text.length > MAX_BODY) return;
|
|
189
|
+
var data;
|
|
190
|
+
try { data = JSON.parse(text); } catch (e) { return; }
|
|
191
|
+
var urls = [];
|
|
192
|
+
var visit = function(node, fp, depth){
|
|
193
|
+
if (depth > 6 || urls.length >= 30) return;
|
|
194
|
+
if (typeof node === 'string') {
|
|
195
|
+
if (/^https?:\\/\\//i.test(node)) urls.push({ field: fp, url: node });
|
|
196
|
+
return;
|
|
197
|
+
}
|
|
198
|
+
if (Array.isArray(node)) {
|
|
199
|
+
for (var i = 0; i < node.length; i++) visit(node[i], fp + '[' + i + ']', depth + 1);
|
|
200
|
+
return;
|
|
201
|
+
}
|
|
202
|
+
if (node && typeof node === 'object') {
|
|
203
|
+
for (var k in node) {
|
|
204
|
+
if (Object.prototype.hasOwnProperty.call(node, k)) {
|
|
205
|
+
visit(node[k], fp ? fp + '.' + k : k, depth + 1);
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
};
|
|
210
|
+
visit(data, '', 0);
|
|
211
|
+
if (urls.length && window.__wbSignedCandidates.length < MAX_CAND) {
|
|
212
|
+
window.__wbSignedCandidates.push({ api_url: apiUrl, urls: urls, ts: Date.now() });
|
|
213
|
+
}
|
|
214
|
+
} catch (e) {}
|
|
215
|
+
};
|
|
216
|
+
|
|
217
|
+
var origFetch = window.fetch;
|
|
218
|
+
if (typeof origFetch === 'function') {
|
|
219
|
+
window.fetch = function(){
|
|
220
|
+
var args = arguments;
|
|
221
|
+
var reqUrl = '';
|
|
222
|
+
try { reqUrl = (typeof args[0] === 'string') ? args[0] : (args[0] && args[0].url) || ''; } catch (e) {}
|
|
223
|
+
var p = origFetch.apply(this, args);
|
|
224
|
+
try {
|
|
225
|
+
if (sameOrigin(reqUrl) && p && typeof p.then === 'function') {
|
|
226
|
+
p.then(function(res){
|
|
227
|
+
try {
|
|
228
|
+
var ct = (res && res.headers && res.headers.get && res.headers.get('content-type')) || '';
|
|
229
|
+
if (/json|text/i.test(ct) || ct === '') {
|
|
230
|
+
res.clone().text().then(function(t){ collect(reqUrl, t); }).catch(function(){});
|
|
231
|
+
}
|
|
232
|
+
} catch (e) {}
|
|
233
|
+
return res;
|
|
234
|
+
}).catch(function(){});
|
|
235
|
+
}
|
|
236
|
+
} catch (e) {}
|
|
237
|
+
return p;
|
|
238
|
+
};
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
var XHR = window.XMLHttpRequest;
|
|
242
|
+
if (XHR && XHR.prototype) {
|
|
243
|
+
var origOpen = XHR.prototype.open;
|
|
244
|
+
var origSend = XHR.prototype.send;
|
|
245
|
+
XHR.prototype.open = function(method, url){
|
|
246
|
+
try { this.__wbUrl = url; } catch (e) {}
|
|
247
|
+
return origOpen.apply(this, arguments);
|
|
248
|
+
};
|
|
249
|
+
XHR.prototype.send = function(){
|
|
250
|
+
try {
|
|
251
|
+
var self = this;
|
|
252
|
+
this.addEventListener('load', function(){
|
|
253
|
+
try {
|
|
254
|
+
if (sameOrigin(self.__wbUrl)) {
|
|
255
|
+
var rt = '';
|
|
256
|
+
try {
|
|
257
|
+
if (self.responseType === '' || self.responseType === 'text') rt = self.responseText;
|
|
258
|
+
} catch (e) {}
|
|
259
|
+
if (rt) collect(self.__wbUrl, rt);
|
|
260
|
+
}
|
|
261
|
+
} catch (e) {}
|
|
262
|
+
});
|
|
263
|
+
} catch (e) {}
|
|
264
|
+
return origSend.apply(this, arguments);
|
|
265
|
+
};
|
|
266
|
+
}
|
|
267
|
+
})()`;
|
|
268
|
+
|
|
269
|
+
// Page-side poll script (a self-invoking arrow function in source form).
// It performs a read-and-clear of the candidate buffer: the current contents
// of `window.__wbSignedCandidates` are returned (or an empty array when the
// buffer does not exist yet) and the buffer is reset to `[]`, so successive
// polls only see new responses (matches the blob hook's read-and-clear
// contract).
export const SIGNED_POLL_SCRIPT = `(() => {
var c = window.__wbSignedCandidates || [];
window.__wbSignedCandidates = [];
return c;
})()`;
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
// Verb-argument substitution: {{ env.X }} / {{ artifacts.X }} expansion plus a
|
|
2
|
+
// `\{{` escape for literal template braces. Extracted from the entry point so
|
|
3
|
+
// it's unit-testable without booting the sidecar.
|
|
4
|
+
//
|
|
5
|
+
// {{ env.NAME }} → process.env.NAME
|
|
6
|
+
// {{ artifacts.NAME }} → contents of $WB_ARTIFACTS_DIR/NAME.txt (or .../NAME)
|
|
7
|
+
// \{{ → literal "{{" (escape; braces are NOT re-scanned)
|
|
8
|
+
//
|
|
9
|
+
// `expand(value, collected, artifactCache)` walks strings/arrays/objects.
|
|
10
|
+
// Resolved secret-ish values (≥3 chars) are added to `collected` so the caller
|
|
11
|
+
// can scrub them out of error messages with `scrubSecrets`.
|
|
12
|
+
|
|
13
|
+
import { readFileSync } from "node:fs";
|
|
14
|
+
import { log } from "./io.js";
|
|
15
|
+
import { resolveInside } from "./util.js";
|
|
16
|
+
|
|
17
|
+
// One combined pattern, scanned left-to-right in a single pass so the escape
// branch consumes the braces before either substitution branch can see them.
// Alternation order matters: the escape must come first.
//   \{{                   → no capture group (escape)
//   {{ env.NAME }}        → group 1
//   {{ artifacts.NAME }}  → group 2
// Whitespace inside the braces is optional on both sides of the reference.
// Env names are letters, digits and underscores, not starting with a digit.
// Artifact names follow the same rule but may also contain `-` after the first
// character. They are bare identifiers — no dots, no slashes — so a name can't
// compose with WB_ARTIFACTS_DIR into a path-traversal read.
// The `g` flag makes `String.prototype.replace` visit every occurrence.
const SUBST_RE =
  /\\\{\{|\{\{\s*env\.([A-Za-z_][A-Za-z0-9_]*)\s*\}\}|\{\{\s*artifacts\.([A-Za-z_][A-Za-z0-9_-]*)\s*\}\}/g;
|
|
27
|
+
|
|
28
|
+
// Latch: the "invalid policy" warning is logged at most once per process.
let warnedInvalidPolicy = false;

// Read WB_SUBSTITUTION_ON_MISSING on every call (cheap) so the result always
// reflects the current environment. The value is trimmed and lower-cased
// before it is compared.
//   warn  — historical behavior: log, substitute an empty string, continue.
//   error — throw, so a missing OTP fails the slice instead of silently
//           sending an empty value into a Playwright action.
//   empty — like warn, but silent.
// An unset or empty variable means "warn". Any other value also falls back to
// "warn", after a one-time warning.
function resolveOnMissing() {
  const configured = process.env.WB_SUBSTITUTION_ON_MISSING || "warn";
  const policy = configured.trim().toLowerCase();
  switch (policy) {
    case "error":
    case "empty":
    case "warn":
      return policy;
    default:
      break;
  }
  if (warnedInvalidPolicy) return "warn";
  warnedInvalidPolicy = true;
  log(
    `[warn] WB_SUBSTITUTION_ON_MISSING=${policy} is not valid (warn|error|empty); defaulting to warn`,
  );
  return "warn";
}
|
|
46
|
+
|
|
47
|
+
// Apply the missing-value policy to an unresolved `{{ kind.name }}` reference.
//
//   kind — "env" or "artifacts" (used only to build the message)
//   name — the referenced variable / artifact name
//
// Returns the empty string for the `warn` and `empty` policies (after logging
// a warning for `warn`). Throws an Error for the `error` policy.
function handleMissingSubstitution(kind, name) {
  const msg = `${kind}.${name} is not set`;
  // Resolve once: the policy is re-read from the environment on every call,
  // so both branches below must decide on the same snapshot.
  const policy = resolveOnMissing();
  if (policy === "error") {
    throw new Error(`substitution: ${msg}`);
  }
  if (policy === "warn") {
    log(`[warn] ${msg}; substituting empty string`);
  }
  return "";
}
|
|
57
|
+
|
|
58
|
+
// Look up an artifact's text under $WB_ARTIFACTS_DIR.
//
// Tries `<name>.txt` first, then `<name>`. The first file that can be read
// wins, and its contents are returned with trailing whitespace removed.
// Returns null when WB_ARTIFACTS_DIR is unset/blank (after logging a warning),
// when `resolveInside` yields nothing for a candidate, or when neither
// candidate can be read.
function readArtifactRaw(name) {
  const artifactsDir = (process.env.WB_ARTIFACTS_DIR || "").trim();
  if (artifactsDir === "") {
    log(`[warn] artifacts.${name} referenced but WB_ARTIFACTS_DIR is not set`);
    return null;
  }

  // Resolve and read one candidate; null means "move on to the next one".
  const tryRead = (fileName) => {
    const resolved = resolveInside(artifactsDir, fileName);
    if (!resolved) return null;
    try {
      return readFileSync(resolved, "utf8").trimEnd();
    } catch {
      return null;
    }
  };

  // `??` keeps an empty-but-readable file ("") as a valid hit and evaluates
  // the second candidate only when the first one produced null.
  return tryRead(`${name}.txt`) ?? tryRead(name);
}
|
|
75
|
+
|
|
76
|
+
// Cached artifact lookup.
//
// When `cache` is provided it is consulted first and populated after a fresh
// read. A miss is stored as null, so a missing artifact is not re-read from
// disk, but the missing-value policy still runs on every reference to it.
// Returns the artifact text, or whatever `handleMissingSubstitution` returns
// when the artifact is missing (that call throws under the `error` policy).
function readArtifact(name, cache) {
  let contents;
  if (cache && cache.has(name)) {
    contents = cache.get(name);
  } else {
    contents = readArtifactRaw(name);
    if (cache) cache.set(name, contents);
  }
  return contents === null
    ? handleMissingSubstitution("artifacts", name)
    : contents;
}
|
|
87
|
+
|
|
88
|
+
// Recursively expand `{{ env.X }}` / `{{ artifacts.X }}` references in `value`.
//
//   value         — string, array, object, or any other value. Strings are
//                   substituted; arrays and objects are walked and rebuilt;
//                   everything else is returned unchanged.
//   collected     — optional Set. Every resolved value of at least 3
//                   characters is added so the caller can scrub it from error
//                   messages later.
//   artifactCache — optional Map passed through to `readArtifact`.
//
// Returns a new value of the same shape; the input is not mutated. May throw
// when a reference is missing and the missing-value policy is `error`.
export function expand(value, collected, artifactCache) {
  if (typeof value === "string") {
    return value.replace(SUBST_RE, (m, envName, artName) => {
      // Escape branch: `\{{` → literal `{{`. No capture group, so both
      // envName and artName are undefined here.
      if (envName === undefined && artName === undefined) return "{{";
      if (envName !== undefined) {
        const v = process.env[envName];
        if (v === undefined) return handleMissingSubstitution("env", envName);
        if (collected && v.length >= 3) collected.add(v);
        return v;
      }
      const v = readArtifact(artName, artifactCache);
      if (collected && v && v.length >= 3) collected.add(v);
      return v;
    });
  }
  if (Array.isArray(value)) {
    return value.map((v) => expand(v, collected, artifactCache));
  }
  if (value && typeof value === "object") {
    // Build with Object.fromEntries rather than `out[k] = …`: plain assignment
    // treats an own "__proto__" key (which JSON.parse can produce) as a
    // prototype write, silently dropping the key and swapping the result's
    // prototype. fromEntries always defines own data properties.
    return Object.fromEntries(
      Object.entries(value).map(([k, v]) => [
        k,
        expand(v, collected, artifactCache),
      ]),
    );
  }
  return value;
}
|
|
115
|
+
|
|
116
|
+
// Scrub any values that came from {{ env.X }} / {{ artifacts.X }} expansion out
// of error messages before they cross the stdio boundary — Playwright and fetch
// errors sometimes echo their inputs (URLs, script bodies, assertion text) and
// those inputs may contain credentials.
//
//   msg     — anything; null/undefined become "", everything else is
//             stringified.
//   secrets — optional iterable of secret values (typically the Set filled by
//             `expand`). Falsy entries are ignored.
//
// Returns the message with every occurrence of every secret replaced by
// «***».
export function scrubSecrets(msg, secrets) {
  let out = String(msg == null ? "" : msg);
  if (!secrets) return out;
  // Longest first: if one secret is a prefix/substring of another (e.g. "abc"
  // and "abcdef"), replacing the short one first would break up the long one
  // and leave its remainder ("def") in the output.
  const ordered = [...secrets]
    .filter(Boolean)
    .sort((a, b) => String(b).length - String(a).length);
  for (const s of ordered) {
    out = out.split(s).join("«***»");
  }
  return out;
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "wb-browser-runtime",
|
|
3
|
-
"version": "0.13.0",
|
|
3
|
+
"version": "0.14.0",
|
|
4
4
|
"description": "Browser sidecar runtime for wb — Playwright over CDP (Browserbase, browser-use) via the wb-sidecar/1 line-framed JSON protocol.",
|
|
5
5
|
"bin": {
|
|
6
6
|
"wb-browser-runtime": "bin/wb-browser-runtime.js"
|
package/verbs/download.js
CHANGED
|
@@ -26,6 +26,14 @@ import {
|
|
|
26
26
|
extensionAllowed,
|
|
27
27
|
} from "../lib/util.js";
|
|
28
28
|
import { HANDLED_MARK } from "../lib/download-capture.js";
|
|
29
|
+
import { retryableFetch } from "../lib/http.js";
|
|
30
|
+
import {
|
|
31
|
+
SIGNED_PAGE_HOOK,
|
|
32
|
+
SIGNED_POLL_SCRIPT,
|
|
33
|
+
parseSignedConfig,
|
|
34
|
+
pickSignedCandidate,
|
|
35
|
+
redactSignedUrl,
|
|
36
|
+
} from "../lib/signed-url-capture.js";
|
|
29
37
|
|
|
30
38
|
const DEFAULT_TIMEOUT_MS = 10_000;
|
|
31
39
|
const POLL_INTERVAL_MS = 50;
|
|
@@ -114,15 +122,24 @@ export default {
|
|
|
114
122
|
const allowlist = parseExtensionAllowlist(
|
|
115
123
|
process.env.WB_BROWSER_DOWNLOAD_EXTENSIONS,
|
|
116
124
|
);
|
|
125
|
+
const signedCfg = parseSignedConfig(args.signed_url);
|
|
126
|
+
const signedEnabled = signedCfg.enabled !== false;
|
|
117
127
|
|
|
118
128
|
// 1) Inject the page-side blob/anchor capture hook BEFORE the click so a
|
|
119
129
|
// synchronously-dispatched anchor.click() inside the SPA's handler is
|
|
120
130
|
// observed. Best-effort: a frame mid-navigation can reject evaluate;
|
|
121
131
|
// the Playwright `download` event still works and is the primary
|
|
122
|
-
// signal anyway.
|
|
132
|
+
// signal anyway. When signed-URL capture is enabled, install its
|
|
133
|
+
// fetch/XHR response hook in the same pre-click window so the API call
|
|
134
|
+
// the click triggers is observed from the start.
|
|
123
135
|
try {
|
|
124
136
|
await page.evaluate(PAGE_HOOK);
|
|
125
137
|
} catch {}
|
|
138
|
+
if (signedEnabled) {
|
|
139
|
+
try {
|
|
140
|
+
await page.evaluate(SIGNED_PAGE_HOOK);
|
|
141
|
+
} catch {}
|
|
142
|
+
}
|
|
126
143
|
|
|
127
144
|
// 2) Claim ownership of the next download synchronously — prepended to
|
|
128
145
|
// BrowserContext listeners so it runs before lib/download-capture.js's
|
|
@@ -148,16 +165,25 @@ export default {
|
|
|
148
165
|
}
|
|
149
166
|
}
|
|
150
167
|
|
|
168
|
+
// Shared cancellation token: once the race has a winner, the losing
|
|
169
|
+
// pollers stop on their next tick instead of spinning page.evaluate against
|
|
170
|
+
// a (possibly navigating/closing) page for the rest of the timeout window.
|
|
171
|
+
const stop = { done: false };
|
|
172
|
+
|
|
151
173
|
try {
|
|
152
|
-
// 3) Race the
|
|
153
|
-
//
|
|
154
|
-
//
|
|
174
|
+
// 3) Race the capture sources against the click. The download event AND
|
|
175
|
+
// the click run concurrently — Playwright's standard pattern, since
|
|
176
|
+
// the click can resolve before or after the download fires.
|
|
155
177
|
const downloadPromise = page
|
|
156
178
|
.waitForEvent("download", { timeout })
|
|
157
179
|
.then((d) => ({ kind: "playwright", download: d }))
|
|
158
180
|
.catch((e) => ({ kind: "playwright_failed", error: e }));
|
|
159
181
|
|
|
160
|
-
const blobPromise = pollForBlob(page, timeout);
|
|
182
|
+
const blobPromise = pollForBlob(page, timeout, stop);
|
|
183
|
+
|
|
184
|
+
const signedPromise = signedEnabled
|
|
185
|
+
? pollForSignedUrl(page, timeout, signedCfg, stop)
|
|
186
|
+
: null;
|
|
161
187
|
|
|
162
188
|
let clickError = null;
|
|
163
189
|
const clickPromise = (async () => {
|
|
@@ -181,7 +207,11 @@ export default {
|
|
|
181
207
|
}
|
|
182
208
|
})();
|
|
183
209
|
|
|
184
|
-
const winner = await raceCaptures(
|
|
210
|
+
const winner = await raceCaptures(
|
|
211
|
+
[downloadPromise, blobPromise, signedPromise].filter(Boolean),
|
|
212
|
+
);
|
|
213
|
+
// Winner decided (success or all-failed) — release the losing pollers.
|
|
214
|
+
stop.done = true;
|
|
185
215
|
// Wait for the click to settle so we surface its error (if any) over
|
|
186
216
|
// a generic "no file captured" — a click that never landed is the
|
|
187
217
|
// more actionable failure.
|
|
@@ -208,6 +238,17 @@ export default {
|
|
|
208
238
|
ctx,
|
|
209
239
|
});
|
|
210
240
|
}
|
|
241
|
+
if (winner.success && winner.kind === "signed_url") {
|
|
242
|
+
return await saveSignedUrlDownload({
|
|
243
|
+
signed: winner.signed,
|
|
244
|
+
artifactsDir,
|
|
245
|
+
allowlist,
|
|
246
|
+
explicitPath,
|
|
247
|
+
page,
|
|
248
|
+
ctx,
|
|
249
|
+
timeout,
|
|
250
|
+
});
|
|
251
|
+
}
|
|
211
252
|
|
|
212
253
|
// No capture won — emit structured failure diagnostics.
|
|
213
254
|
const reasons = winner.failures
|
|
@@ -217,6 +258,9 @@ export default {
|
|
|
217
258
|
}
|
|
218
259
|
if (f.kind === "blob_failed") return `blob hook: ${f.error}`;
|
|
219
260
|
if (f.kind === "blob_timeout") return `blob hook: no capture within ${timeout}ms`;
|
|
261
|
+
if (f.kind === "signed_failed") return `signed url: ${f.error}`;
|
|
262
|
+
if (f.kind === "signed_timeout")
|
|
263
|
+
return `signed url: no signed file URL seen within ${timeout}ms`;
|
|
220
264
|
return f.kind;
|
|
221
265
|
})
|
|
222
266
|
.join("; ");
|
|
@@ -233,6 +277,9 @@ export default {
|
|
|
233
277
|
`download: no file captured within ${timeout}ms after clicking ${args.selector} (page=${safePageUrl(page) || "?"}). ${reasons}`,
|
|
234
278
|
);
|
|
235
279
|
} finally {
|
|
280
|
+
// Backstop: ensure pollers are released on any exit path (thrown
|
|
281
|
+
// click/save error, extension rejection, etc.).
|
|
282
|
+
stop.done = true;
|
|
236
283
|
if (attached && browserContext && typeof browserContext.off === "function") {
|
|
237
284
|
try {
|
|
238
285
|
browserContext.off("download", claim);
|
|
@@ -333,13 +380,17 @@ async function saveBlobDownload({
|
|
|
333
380
|
return `→ ${path.basename(target)}`;
|
|
334
381
|
}
|
|
335
382
|
|
|
336
|
-
// Race
|
|
337
|
-
// before we declare failure, so the diagnostics frame can list every
|
|
338
|
-
// the verb didn't see a file. (Promise.race would shortcut on a fast
|
|
339
|
-
// and discard
|
|
340
|
-
|
|
383
|
+
// Race N capture promises. First to report success wins. Every source must
|
|
384
|
+
// report before we declare failure, so the diagnostics frame can list every
|
|
385
|
+
// reason the verb didn't see a file. (Promise.race would shortcut on a fast
|
|
386
|
+
// failure and discard a slower success.) Each promise resolves to an object
|
|
387
|
+
// whose `kind` names the source: a success kind ("playwright" | "blob" |
|
|
388
|
+
// "signed_url") or a failure kind ("*_failed" | "*_timeout").
|
|
389
|
+
const SUCCESS_KINDS = new Set(["playwright", "blob", "signed_url"]);
|
|
390
|
+
|
|
391
|
+
function raceCaptures(promises) {
|
|
341
392
|
return new Promise((resolve) => {
|
|
342
|
-
let outstanding =
|
|
393
|
+
let outstanding = promises.length;
|
|
343
394
|
const failures = [];
|
|
344
395
|
const finish = (settled) => {
|
|
345
396
|
if (settled.success) {
|
|
@@ -349,20 +400,123 @@ function raceCaptures(downloadPromise, blobPromise) {
|
|
|
349
400
|
failures.push(settled);
|
|
350
401
|
if (--outstanding === 0) resolve({ success: false, failures });
|
|
351
402
|
};
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
403
|
+
for (const pr of promises) {
|
|
404
|
+
pr.then((r) => {
|
|
405
|
+
if (SUCCESS_KINDS.has(r.kind)) finish({ success: true, ...r });
|
|
406
|
+
else finish({ success: false, ...r });
|
|
407
|
+
});
|
|
408
|
+
}
|
|
409
|
+
});
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
// Fetch the file behind a captured signed URL and save it under the artifacts
// directory.
//
//   signed       — picked candidate; `url`, `api_url` and `field` are read.
//   artifactsDir — directory the file is written into (created if missing).
//   allowlist    — extension allowlist handed to `extensionAllowed`.
//   explicitPath — caller-chosen filename; wins over the URL-derived name.
//   page, ctx    — used only for diagnostics (page URL, verb index).
//   timeout      — passed to `retryableFetch` as `timeoutMs`.
//
// Emits `slice.artifact_saved` on success and `slice.download_failed` when the
// fetch throws or returns a non-OK status. Returns `→ <saved basename>`.
// Throws when the extension is rejected, the target resolves outside the
// artifacts directory, or the fetch fails. The raw signed URL is never placed
// in a frame or an error message — only its redacted form is.
async function saveSignedUrlDownload({
  signed,
  artifactsDir,
  allowlist,
  explicitPath,
  page,
  ctx,
  timeout,
}) {
  const redacted = redactSignedUrl(signed.url);
  // Some fetch/retry errors echo the request URL verbatim; swap any such echo
  // for the redacted form before the text leaves this function.
  const scrubUrl = (text) => {
    const s = String(text);
    return signed.url ? s.split(signed.url).join(redacted) : s;
  };

  // Filename: explicit path: wins, else the signed URL's basename, else a
  // generic fallback. (S3 keys usually end in the real filename.)
  let nameFromUrl = "";
  try {
    const rawName = path.basename(new URL(signed.url).pathname) || "";
    try {
      // URL.pathname is percent-encoded ("my%20report.pdf"); decode it so the
      // saved file carries the real name. Re-basename in case the decoded
      // text contains a separator (%2F).
      nameFromUrl = path.basename(decodeURIComponent(rawName));
    } catch {
      // Malformed escape sequence — keep the encoded name as-is.
      nameFromUrl = rawName;
    }
  } catch {}
  const suggested = explicitPath || (nameFromUrl.trim() ? nameFromUrl : FALLBACK_NAME);
  if (!extensionAllowed(suggested, allowlist)) {
    throw new Error(
      `download: file "${suggested}" rejected by WB_BROWSER_DOWNLOAD_EXTENSIONS`,
    );
  }
  const target = uniquePathInside(artifactsDir, suggested);
  if (!target) {
    throw new Error(
      `download: refusing to save "${suggested}" — resolves outside $WB_ARTIFACTS_DIR`,
    );
  }
  await fsPromises.mkdir(artifactsDir, { recursive: true });

  // Fields shared by both failure frames below.
  const failureBase = {
    type: "slice.download_failed",
    verb: "download",
    verb_index: ctx?.index ?? null,
    capture: "signed_url",
    api_url: signed.api_url,
    signed_url: redacted,
    page_url: safePageUrl(page),
  };

  // Fetch the signed URL from the sidecar (not the page) so the object store's
  // CORS policy doesn't block the read. The label is redacted — retry logs must
  // never echo signed credentials.
  let res;
  try {
    res = await retryableFetch(
      signed.url,
      { method: "GET" },
      `signed-url download (${redacted})`,
      { timeoutMs: timeout },
    );
  } catch (e) {
    const detail = scrubUrl(e?.message || e);
    send({
      ...failureBase,
      reason: `signed url fetch error: ${detail}`,
    });
    throw new Error(
      `download: signed URL fetch failed for ${redacted}: ${detail}`,
    );
  }
  if (!res.ok) {
    // A 403 on a pre-signed URL almost always means the token expired before
    // we fetched it — call that out so the operator knows to shorten the gap.
    const expired = res.status === 403;
    const expiredNote = expired ? " (likely expired)" : "";
    send({
      ...failureBase,
      http_status: res.status,
      expired,
      reason: `signed url fetch: HTTP ${res.status}${expiredNote}`,
    });
    throw new Error(
      `download: signed URL fetch returned HTTP ${res.status} for ${redacted}${expiredNote}`,
    );
  }
  const buf = Buffer.from(await res.arrayBuffer());
  await fsPromises.writeFile(target, buf);
  const contentType = safeHeader(res, "content-type");
  const contentDisposition = safeHeader(res, "content-disposition");
  send({
    type: "slice.artifact_saved",
    filename: path.basename(target),
    path: target,
    bytes: buf.length,
    source: "download",
    provenance: {
      url: null,
      signed_url: redacted,
      api_url: signed.api_url,
      field: signed.field,
      suggested_filename: suggested,
      page_url: safePageUrl(page),
      verb_index: ctx?.index ?? null,
      verb_name: "download",
      capture: "signed_url",
      content_type: contentType,
      content_disposition: contentDisposition,
      ts: Date.now(),
    },
  });
  return `→ ${path.basename(target)}`;
}
|
|
362
515
|
|
|
363
|
-
async function pollForBlob(page, timeoutMs) {
|
|
516
|
+
async function pollForBlob(page, timeoutMs, stop) {
|
|
364
517
|
const deadline = Date.now() + timeoutMs;
|
|
365
518
|
while (true) {
|
|
519
|
+
if (stop?.done) return { kind: "blob_timeout" };
|
|
366
520
|
let result;
|
|
367
521
|
try {
|
|
368
522
|
result = await page.evaluate(POLL_SCRIPT);
|
|
@@ -376,6 +530,38 @@ async function pollForBlob(page, timeoutMs) {
|
|
|
376
530
|
}
|
|
377
531
|
}
|
|
378
532
|
|
|
533
|
+
// Watch the page-side signed-URL candidate buffer until `pickSignedCandidate`
// accepts one of the drained candidates, the shared `stop` token is set, or
// the deadline passes. The file bytes are NOT fetched here — the winner is
// fetched by saveSignedUrlDownload, outside the page, so CORS doesn't apply.
//
// Resolves to `{ kind: "signed_url", signed }` on a match and to
// `{ kind: "signed_timeout" }` otherwise. A failing page.evaluate is treated
// as "nothing seen this tick" and polling continues.
async function pollForSignedUrl(page, timeoutMs, signedCfg, stop) {
  const expiresAt = Date.now() + timeoutMs;

  // Drain the page buffer; any evaluate failure or non-array result → [].
  const drain = async () => {
    try {
      const batch = await page.evaluate(SIGNED_POLL_SCRIPT);
      return Array.isArray(batch) ? batch : [];
    } catch {
      return [];
    }
  };
  const pause = () =>
    new Promise((wake) => setTimeout(wake, POLL_INTERVAL_MS));

  for (;;) {
    if (stop?.done) return { kind: "signed_timeout" };

    const batch = await drain();
    if (batch.length > 0) {
      const match = pickSignedCandidate(batch, signedCfg);
      if (match) return { kind: "signed_url", signed: match };
    }

    if (Date.now() >= expiresAt) return { kind: "signed_timeout" };
    await pause();
  }
}
|
|
556
|
+
|
|
557
|
+
// Best-effort header read. Returns the value of `res.headers.get(name)`, or
// null when the response has no `headers`, `headers` has no `get`, the value
// is missing/empty, or anything along the way throws.
function safeHeader(res, name) {
  let value;
  try {
    const headers = res.headers;
    const getter = headers == null ? undefined : headers.get;
    value = getter == null ? undefined : getter.call(headers, name);
  } catch {
    return null;
  }
  return value || null;
}
|
|
564
|
+
|
|
379
565
|
function safePageUrl(page) {
|
|
380
566
|
try {
|
|
381
567
|
return page.url();
|