wb-browser-runtime 0.13.0 → 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,276 @@
1
+ // Signed-URL export capture.
2
+ //
3
+ // Some SaaS "Download" buttons never produce a Playwright `download` event or
4
+ // an in-page Blob. Instead the app calls a same-origin API that returns JSON
5
+ // like `{ "download_url": "https://bucket.s3.amazonaws.com/...?<signed>" }`
6
+ // and then navigates to that URL. A page-side `fetch(signedUrl)` usually fails
7
+ // because the object store's CORS policy doesn't allow the app origin to read
8
+ // the bytes — so the only reliable place to fetch it is the sidecar process,
9
+ // where CORS doesn't apply.
10
+ //
11
+ // This module supplies:
12
+ // - SIGNED_PAGE_HOOK / SIGNED_POLL_SCRIPT — page-side instrumentation that
13
+ // wraps fetch + XHR, inspects small same-origin JSON responses, and stashes
14
+ // any http(s) URL fields it finds on `window.__wbSignedCandidates`.
15
+ // - pure helpers (isSignedHost, redactSignedUrl, extractUrlFields,
16
+ // pickSignedCandidate, parseSignedConfig) that the `download` verb uses to
17
+ // decide which captured URL to fetch server-side.
18
+ //
19
+ // The bytes are downloaded by the verb via lib/http.js's retryableFetch. Signed
20
+ // query credentials are redacted everywhere they cross the stdio boundary; the
21
+ // full URL lives only in sidecar memory for the duration of the fetch.
22
+
23
// Object-store / CDN hosts that hand out pre-signed, credential-bearing URLs.
// A match means "this looks like an export download, not a normal app API call"
// — the gate that keeps auto-mode from grabbing arbitrary same-origin URLs.
// Every pattern is anchored at the end of the host so a look-alike such as
// `bucket.s3.amazonaws.com.evil.example` cannot match.
const SIGNED_HOST_PATTERNS = [
  // S3: s3.amazonaws.com, s3.us-east-1.amazonaws.com, s3-us-west-2.amazonaws.com,
  // bucket.s3.amazonaws.com, bucket.s3.us-east-1.amazonaws.com, and the
  // dual-stack form bucket.s3.dualstack.us-east-1.amazonaws.com.
  /(^|\.)s3(\.dualstack)?([.-][a-z0-9-]+)?\.amazonaws\.com$/i,
  // Google Cloud Storage
  /(^|\.)storage\.googleapis\.com$/i,
  /(^|\.)storage\.cloud\.google\.com$/i,
  // CloudFront
  /(^|\.)cloudfront\.net$/i,
  // Azure Blob Storage
  /\.blob\.core\.windows\.net$/i,
  // Cloudflare R2
  /\.r2\.cloudflarestorage\.com$/i,
];

// Report whether `host` belongs to a recognized object store / CDN.
//   host — a hostname, optionally with a `:port` suffix (i.e. either
//          `URL.hostname` or `URL.host`); falsy values return false.
// Returns true when any SIGNED_HOST_PATTERNS entry matches.
export function isSignedHost(host) {
  if (!host) return false;
  const h = String(host)
    .toLowerCase()
    // `URL.host` carries a non-default port; the patterns are `$`-anchored
    // on the domain, so the port has to go before matching.
    .replace(/:\d+$/, "")
    // A fully-qualified name may end in the DNS root dot.
    .replace(/\.$/, "");
  return SIGNED_HOST_PATTERNS.some((re) => re.test(h));
}
46
+
47
// Drop the query string (where signed credentials live) but keep origin + path
// so diagnostics stay useful. The fragment is dropped as well.
//   url — any value; it is stringified, and null/undefined/"" become "".
// Returns the redacted string: `<origin><path>?<redacted>` when a query was
// present, `<origin><path>` otherwise.
// When URL parsing fails (relative or malformed input) the value is cut at the
// first `?` or `#`, so neither a query nor a fragment can leak and both
// branches agree on what is removed.
export function redactSignedUrl(url) {
  const s = String(url || "");
  try {
    const u = new URL(s);
    return u.search ? `${u.origin}${u.pathname}?<redacted>` : `${u.origin}${u.pathname}`;
  } catch {
    const cut = s.search(/[?#]/);
    if (cut < 0) return s;
    const head = s.slice(0, cut);
    // Only advertise a redaction when a query was actually there; a bare
    // fragment is removed silently, matching the parsed branch.
    return s[cut] === "?" ? `${head}?<redacted>` : head;
  }
}
60
+
61
// Walk a parsed JSON value depth-first and gather every string that starts
// with http:// or https:// (case-insensitive).
//   data     — the parsed JSON value (object, array, string, or anything else)
//   maxDepth — nodes nested deeper than this are ignored (the root is depth 0)
//   maxUrls  — collection stops once this many URLs have been gathered
// Returns `[{ field, url }]`, where `field` is a dotted path with `[i]` for
// array positions ("" for a root-level string).
export function extractUrlFields(data, maxDepth = 6, maxUrls = 30) {
  const found = [];

  function walk(value, path, level) {
    const exhausted = level > maxDepth || found.length >= maxUrls;
    if (exhausted) return;

    if (typeof value === "string") {
      if (/^https?:\/\//i.test(value)) {
        found.push({ field: path, url: value });
      }
      return;
    }

    if (Array.isArray(value)) {
      for (const [idx, item] of value.entries()) {
        walk(item, `${path}[${idx}]`, level + 1);
      }
      return;
    }

    if (value === null || typeof value !== "object") return;

    for (const [key, child] of Object.entries(value)) {
      const childPath = path ? `${path}.${key}` : key;
      walk(child, childPath, level + 1);
    }
  }

  walk(data, "", 0);
  return found;
}
86
+
87
// Leaf field name from a dotted/indexed path: "data.export.download_url" →
// "download_url", "files[0].url" → "url", "matrix[0][1]" → "matrix". Used to
// match against a caller's `json_fields` allowlist.
//   fieldPath — a path in the `a.b[0].c` form; falsy values are treated as "".
// Returns the last dotted segment with every trailing `[n]` index removed, so
// URLs inside nested arrays still resolve to the owning field's name.
function leafField(fieldPath) {
  const last = String(fieldPath || "")
    .split(".")
    .pop();
  return last.replace(/(\[\d+\])+$/, "");
}
96
+
97
// Normalize `args.signed_url` into a resolved policy.
//   undefined / "auto"              → { enabled: "auto" }  (recognized hosts only)
//   false / { enabled: false }      → { enabled: false }   (feature off)
//   true                            → { enabled: true }
//   { enabled, hosts, json_fields } → that, normalized
// Any other value (including an unrecognized `enabled`) resolves to "auto".
//
// Returns `{ enabled, hosts, jsonFields }`:
//   hosts      — lower-cased, trimmed host names; blank entries are dropped.
//                A blank entry (for example one produced by a template that
//                expanded to "") must never reach a suffix match, where "."
//                plus an empty string would match any host ending in a dot.
//   jsonFields — array of field names (`json_fields` or `jsonFields` on the
//                input), or null when none were given.
export function parseSignedConfig(raw) {
  const off = () => ({ enabled: false, hosts: [], jsonFields: null });
  const norm = (cfg) => ({
    enabled: cfg.enabled,
    hosts: Array.isArray(cfg.hosts)
      ? cfg.hosts.map((h) => String(h).trim().toLowerCase()).filter((h) => h !== "")
      : [],
    jsonFields: cfg.jsonFields && cfg.jsonFields.length ? cfg.jsonFields.map(String) : null,
  });

  if (raw === false) return off();
  if (raw === true) return norm({ enabled: true });
  if (raw == null || typeof raw !== "object") return norm({ enabled: "auto" });
  if (raw.enabled === false) return off();

  // Only a literal `true` forces the feature on; everything else is "auto".
  const enabled = raw.enabled === true ? true : "auto";
  // The snake_case spelling wins when both are present.
  let jsonFields = null;
  if (Array.isArray(raw.json_fields)) jsonFields = raw.json_fields;
  else if (Array.isArray(raw.jsonFields)) jsonFields = raw.jsonFields;
  return norm({ enabled, hosts: raw.hosts, jsonFields });
}
134
+
135
// Choose the best signed-URL candidate from the page-captured list, or null.
// `candidates` is the shape pushed by SIGNED_PAGE_HOOK:
//   [{ api_url, urls: [{ field, url }], ts }]
// `opts.hosts` is an explicit host allowlist; `opts.jsonFields` restricts which
// JSON fields are considered (matched on the leaf name or the full path).
// Returns `{ url, field, api_url, host }` for the first acceptable URL, where
// `host` is `URL.host` (lower-cased, including any non-default port).
//
// The candidate list comes from a page-writable global, so malformed entries
// are skipped rather than allowed to throw.
export function pickSignedCandidate(candidates, opts = {}) {
  // A blank allowlist entry would turn the suffix test below into
  // `endsWith(".")`, which accepts any trailing-dot host — drop such entries.
  const hosts = (opts.hosts || []).filter((h) => typeof h === "string" && h !== "");
  const jsonFields = opts.jsonFields || null;
  const matchesEntry = (name, entry) => name === entry || name.endsWith(`.${entry}`);

  for (const cand of Array.isArray(candidates) ? candidates : []) {
    const urls = cand && Array.isArray(cand.urls) ? cand.urls : [];
    for (const u of urls) {
      if (!u || typeof u.url !== "string") continue;
      // json_fields only *filters* which fields are inspected — it never
      // bypasses the host check below.
      if (jsonFields && !jsonFields.includes(leafField(u.field)) && !jsonFields.includes(u.field))
        continue;
      let parsed;
      try {
        parsed = new URL(u.url);
      } catch {
        continue;
      }
      // Only http(s) downloads are supported.
      if (parsed.protocol !== "http:" && parsed.protocol !== "https:") continue;
      const host = parsed.host.toLowerCase();
      // `host` includes a non-default port, so `hosts: ["minio.internal"]`
      // would never match `minio.internal:9000`. Match on the bare hostname
      // too; entries written as `name:port` keep working via `host`.
      const hostname = parsed.hostname.toLowerCase();
      const hostAllowed = hosts.some((h) => matchesEntry(host, h) || matchesEntry(hostname, h));
      // A host match is ALWAYS required: a recognized signed host (auto mode)
      // or an explicit `hosts` allowlist entry. Forced mode (`enabled: true`)
      // does not relax this — it only opts the feature on; an author who needs
      // an unrecognized host must name it in `hosts`.
      if (hostAllowed || isSignedHost(hostname)) {
        return { url: u.url, field: u.field, api_url: cand.api_url || null, host };
      }
    }
  }
  return null;
}
170
+
171
// Page-side hook installed BEFORE the click. Wraps fetch + XHR to inspect small
// same-origin JSON responses and stash any http(s) URL fields on
// `window.__wbSignedCandidates` as `{ api_url, urls: [{ field, url }], ts }`.
// Idempotent (guarded by `window.__wbSignedInstalled`) and fail-open — any
// error in the wrapper falls through to the original call so the app keeps
// working. The hook is never uninstalled.
//
// Limits enforced in the page: bodies over 64 KiB are ignored, at most 30 URLs
// are taken per response (nesting depth ≤ 6), and at most 50 candidates are
// buffered between polls.
//
// fetch specifics:
//   - The request URL is derived from a string, a Request (`.url`), or any
//     other input by stringifying it (e.g. a URL object). Without that, a URL
//     object produced an empty request URL, which resolves to the page URL
//     and was therefore treated as same-origin.
//   - `text/event-stream` responses are skipped: cloning and reading a
//     never-ending stream would buffer it indefinitely.
//   - Responses that declare a Content-Length above the body limit are skipped
//     before they are read.
// The script is plain ES5-style on purpose and contains no comments, so it is
// safe to inject as-is.
export const SIGNED_PAGE_HOOK = `(() => {
  if (window.__wbSignedInstalled) return;
  window.__wbSignedInstalled = true;
  window.__wbSignedCandidates = [];
  var MAX_BODY = 64 * 1024;
  var MAX_CAND = 50;

  var sameOrigin = function(u){
    try { return new URL(u, location.href).origin === location.origin; }
    catch (e) { return false; }
  };

  var requestUrl = function(input){
    try {
      if (typeof input === 'string') return input;
      if (input && typeof input.url === 'string') return input.url;
      if (input != null) return String(input);
    } catch (e) {}
    return '';
  };

  var inspectable = function(res){
    if (!res || !res.headers || !res.headers.get) return false;
    var ct = res.headers.get('content-type') || '';
    if (/event-stream/i.test(ct)) return false;
    if (!(/json|text/i.test(ct) || ct === '')) return false;
    var len = Number(res.headers.get('content-length'));
    return !(len > MAX_BODY);
  };

  var collect = function(apiUrl, text){
    try {
      if (!text || text.length > MAX_BODY) return;
      var data;
      try { data = JSON.parse(text); } catch (e) { return; }
      var urls = [];
      var visit = function(node, fp, depth){
        if (depth > 6 || urls.length >= 30) return;
        if (typeof node === 'string') {
          if (/^https?:\\/\\//i.test(node)) urls.push({ field: fp, url: node });
          return;
        }
        if (Array.isArray(node)) {
          for (var i = 0; i < node.length; i++) visit(node[i], fp + '[' + i + ']', depth + 1);
          return;
        }
        if (node && typeof node === 'object') {
          for (var k in node) {
            if (Object.prototype.hasOwnProperty.call(node, k)) {
              visit(node[k], fp ? fp + '.' + k : k, depth + 1);
            }
          }
        }
      };
      visit(data, '', 0);
      if (urls.length && window.__wbSignedCandidates.length < MAX_CAND) {
        window.__wbSignedCandidates.push({ api_url: apiUrl, urls: urls, ts: Date.now() });
      }
    } catch (e) {}
  };

  var origFetch = window.fetch;
  if (typeof origFetch === 'function') {
    window.fetch = function(){
      var args = arguments;
      var reqUrl = requestUrl(args[0]);
      var p = origFetch.apply(this, args);
      try {
        if (sameOrigin(reqUrl) && p && typeof p.then === 'function') {
          p.then(function(res){
            try {
              if (inspectable(res)) {
                res.clone().text().then(function(t){ collect(reqUrl, t); }).catch(function(){});
              }
            } catch (e) {}
            return res;
          }).catch(function(){});
        }
      } catch (e) {}
      return p;
    };
  }

  var XHR = window.XMLHttpRequest;
  if (XHR && XHR.prototype) {
    var origOpen = XHR.prototype.open;
    var origSend = XHR.prototype.send;
    XHR.prototype.open = function(method, url){
      try { this.__wbUrl = url; } catch (e) {}
      return origOpen.apply(this, arguments);
    };
    XHR.prototype.send = function(){
      try {
        var self = this;
        this.addEventListener('load', function(){
          try {
            if (sameOrigin(self.__wbUrl)) {
              var rt = '';
              try {
                if (self.responseType === '' || self.responseType === 'text') rt = self.responseText;
              } catch (e) {}
              if (rt) collect(self.__wbUrl, rt);
            }
          } catch (e) {}
        });
      } catch (e) {}
      return origSend.apply(this, arguments);
    };
  }
})()`;
269
+
270
// Read-and-clear of the candidate buffer so successive polls only see new
// responses. Evaluates to the buffered candidate array and leaves an empty
// array behind. The global is writable by the page, so anything that is not an
// array is reported as "no candidates" instead of being handed to the caller,
// which iterates the result.
export const SIGNED_POLL_SCRIPT = `(() => {
  var c = window.__wbSignedCandidates;
  window.__wbSignedCandidates = [];
  return Array.isArray(c) ? c : [];
})()`;
@@ -0,0 +1,128 @@
1
+ // Verb-argument substitution: {{ env.X }} / {{ artifacts.X }} expansion plus a
2
+ // `\{{` escape for literal template braces. Extracted from the entry point so
3
+ // it's unit-testable without booting the sidecar.
4
+ //
5
+ // {{ env.NAME }} → process.env.NAME
6
+ // {{ artifacts.NAME }} → contents of $WB_ARTIFACTS_DIR/NAME.txt (or .../NAME)
7
+ // \{{ → literal "{{" (escape; braces are NOT re-scanned)
8
+ //
9
+ // `expand(value, collected, artifactCache)` walks strings/arrays/objects.
10
+ // Resolved secret-ish values (≥3 chars) are added to `collected` so the caller
11
+ // can scrub them out of error messages with `scrubSecrets`.
12
+
13
+ import { readFileSync } from "node:fs";
14
+ import { log } from "./io.js";
15
+ import { resolveInside } from "./util.js";
16
+
17
// A single global pattern assembled from three alternatives. It is scanned
// left-to-right in one pass, and the escape alternative is listed first so it
// consumes the braces before either substitution alternative can see them.
//   \{{                  → escape, no capture group
//   {{ env.NAME }}       → NAME in group 1
//   {{ artifacts.NAME }} → NAME in group 2
// Artifact names are bare identifiers (hyphens allowed, no dots or slashes),
// so a name can't compose with WB_ARTIFACTS_DIR into a path-traversal read.
const SUBST_RE = new RegExp(
  [
    String.raw`\\\{\{`,
    String.raw`\{\{\s*env\.([A-Za-z_][A-Za-z0-9_]*)\s*\}\}`,
    String.raw`\{\{\s*artifacts\.([A-Za-z_][A-Za-z0-9_-]*)\s*\}\}`,
  ].join("|"),
  "g",
);
27
+
28
// Set once the invalid-policy warning has been logged, so it appears only once.
let warnedInvalidPolicy = false;

// Read WB_SUBSTITUTION_ON_MISSING on every call so the result follows the
// current environment. The value is trimmed and lower-cased; unset or empty
// means "warn".
//   "warn"  — log and substitute an empty string
//   "error" — the caller throws, so a missing value fails instead of sending
//             an empty string onward
//   "empty" — substitute an empty string silently
// Any other value logs a one-time warning and resolves to "warn".
function resolveOnMissing() {
  const configured = process.env.WB_SUBSTITUTION_ON_MISSING || "warn";
  const policy = configured.trim().toLowerCase();
  if (["error", "empty", "warn"].includes(policy)) {
    return policy;
  }
  if (warnedInvalidPolicy) {
    return "warn";
  }
  warnedInvalidPolicy = true;
  log(
    `[warn] WB_SUBSTITUTION_ON_MISSING=${policy} is not valid (warn|error|empty); defaulting to warn`,
  );
  return "warn";
}
46
+
47
// Apply the missing-value policy to an unresolved `{{ kind.name }}` reference.
//   kind — "env" or "artifacts" (used only in the message)
//   name — the referenced variable / artifact name
// Returns "" under the "warn" policy (after logging) and under "empty"
// (silently); throws an Error under "error".
function handleMissingSubstitution(kind, name) {
  const msg = `${kind}.${name} is not set`;
  const policy = resolveOnMissing();
  switch (policy) {
    case "error":
      throw new Error(`substitution: ${msg}`);
    case "warn":
      log(`[warn] ${msg}; substituting empty string`);
      return "";
    default:
      return "";
  }
}
57
+
58
// Load an artifact's text from WB_ARTIFACTS_DIR, trying `<name>.txt` first and
// then `<name>`.
//   name — artifact name as written in the template
// Returns the file contents as UTF-8 with trailing whitespace removed, or null
// when the directory is not configured (a warning is logged) or neither file
// can be read. A candidate for which `resolveInside` yields a falsy path is
// skipped.
function readArtifactRaw(name) {
  const artifactsDir = (process.env.WB_ARTIFACTS_DIR || "").trim();
  if (artifactsDir === "") {
    log(`[warn] artifacts.${name} referenced but WB_ARTIFACTS_DIR is not set`);
    return null;
  }
  const fileNames = [`${name}.txt`, name];
  for (const fileName of fileNames) {
    const resolved = resolveInside(artifactsDir, fileName);
    if (resolved) {
      try {
        const text = readFileSync(resolved, "utf8");
        return text.trimEnd();
      } catch {
        // unreadable — fall through to the next candidate
      }
    }
  }
  return null;
}
75
+
76
// Resolve an artifact through an optional per-run cache.
//   name  — artifact name
//   cache — optional Map of name → contents (null records "not found")
// A cache miss reads the artifact and stores the outcome, including null, so
// the lookup is not repeated. A missing artifact is routed through the
// missing-value policy on every call, cached or not.
function readArtifact(name, cache) {
  let content;
  if (cache && cache.has(name)) {
    content = cache.get(name);
  } else {
    content = readArtifactRaw(name);
    if (cache) cache.set(name, content);
  }
  return content === null ? handleMissingSubstitution("artifacts", name) : content;
}
87
+
88
// Expand `{{ env.X }}` / `{{ artifacts.X }}` references (and the `\{{` escape)
// throughout a value.
//   value         — string, array, plain object, or any other value
//   collected     — optional Set; resolved values of 3+ characters are added
//                   so the caller can scrub them from error output
//   artifactCache — optional Map used to look each artifact up once
// Strings are substituted; arrays and objects are copied with every element /
// property value expanded recursively (keys are left as-is); anything else is
// returned unchanged. Missing values follow the missing-value policy, which
// may throw.
export function expand(value, collected, artifactCache) {
  if (typeof value === "string") {
    return value.replace(SUBST_RE, (m, envName, artName) => {
      // Escape branch: `\{{` → literal `{{`. No capture group, so both
      // envName and artName are undefined here.
      if (envName === undefined && artName === undefined) return "{{";
      if (envName !== undefined) {
        const v = process.env[envName];
        if (v === undefined) return handleMissingSubstitution("env", envName);
        if (collected && v.length >= 3) collected.add(v);
        return v;
      }
      const v = readArtifact(artName, artifactCache);
      if (collected && v && v.length >= 3) collected.add(v);
      return v;
    });
  }
  if (Array.isArray(value)) {
    return value.map((v) => expand(v, collected, artifactCache));
  }
  if (value && typeof value === "object") {
    // Object.fromEntries defines each key as an own data property. Plain
    // assignment (`out[k] = …`) would treat an own "__proto__" key — which
    // JSON.parse can produce — as a prototype change: the key would vanish
    // from the copy and its contents would show up as inherited properties.
    return Object.fromEntries(
      Object.entries(value).map(([k, v]) => [k, expand(v, collected, artifactCache)]),
    );
  }
  return value;
}
115
+
116
// Scrub any values that came from {{ env.X }} / {{ artifacts.X }} expansion out
// of error messages before they cross the stdio boundary — Playwright and fetch
// errors sometimes echo their inputs (URLs, script bodies, assertion text) and
// those inputs may contain credentials.
//   msg     — anything; null/undefined become "", other values are stringified
//   secrets — iterable (Set or array) of values to mask, or a falsy value
// Returns `msg` with every occurrence of every secret replaced by "«***»".
// Longer secrets are masked first: if "abc" were replaced before "abcdef", the
// longer value would no longer be found and its tail ("def") would leak.
export function scrubSecrets(msg, secrets) {
  let out = String(msg == null ? "" : msg);
  if (!secrets) return out;
  const ordered = [...secrets]
    .filter(Boolean)
    .map(String)
    .sort((a, b) => b.length - a.length);
  for (const s of ordered) {
    out = out.split(s).join("«***»");
  }
  return out;
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "wb-browser-runtime",
3
- "version": "0.13.0",
3
+ "version": "0.14.1",
4
4
  "description": "Browser sidecar runtime for wb — Playwright over CDP (Browserbase, browser-use) via the wb-sidecar/1 line-framed JSON protocol.",
5
5
  "bin": {
6
6
  "wb-browser-runtime": "bin/wb-browser-runtime.js"