wb-browser-runtime 0.13.0 → 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/verbs/download.js CHANGED
@@ -18,7 +18,8 @@
18
18
 
19
19
  import path from "node:path";
20
20
  import { Buffer } from "node:buffer";
21
- import { promises as fsPromises } from "node:fs";
21
+ import { promises as fsPromises, createWriteStream } from "node:fs";
22
+ import { once } from "node:events";
22
23
  import { send } from "../lib/io.js";
23
24
  import {
24
25
  uniquePathInside,
@@ -26,11 +27,57 @@ import {
26
27
  extensionAllowed,
27
28
  } from "../lib/util.js";
28
29
  import { HANDLED_MARK } from "../lib/download-capture.js";
30
+ import {
31
+ guardedDownloadFetch,
32
+ releaseBodyTimeout,
33
+ abortBody,
34
+ drainResponseBody,
35
+ } from "../lib/http.js";
36
+ import {
37
+ SIGNED_PAGE_HOOK,
38
+ SIGNED_POLL_SCRIPT,
39
+ parseSignedConfig,
40
+ pickSignedCandidate,
41
+ redactSignedUrl,
42
+ isSignedHost,
43
+ } from "../lib/signed-url-capture.js";
29
44
 
30
45
  const DEFAULT_TIMEOUT_MS = 10_000;
31
46
  const POLL_INTERVAL_MS = 50;
32
47
  const FALLBACK_NAME = "download.bin";
33
48
 
49
// Upper bound on the size of a signed-URL download, limiting memory/disk use.
// The cap is also enforced while streaming, so a missing or understated
// Content-Length cannot get around it. Tests/ops may override the value through
// the WB_MAX_DOWNLOAD_BYTES environment variable.
const MAX_SIGNED_DOWNLOAD_BYTES = 512 * 1024 * 1024; // 512 MiB

/**
 * Resolve the effective download size cap.
 *
 * @returns {number} WB_MAX_DOWNLOAD_BYTES when it parses to a finite, positive
 *   number; otherwise MAX_SIGNED_DOWNLOAD_BYTES.
 */
function maxDownloadBytes() {
  const override = Number(process.env.WB_MAX_DOWNLOAD_BYTES);
  // Unset, non-numeric, zero, negative and infinite values all fall back.
  if (!Number.isFinite(override) || override <= 0) {
    return MAX_SIGNED_DOWNLOAD_BYTES;
  }
  return override;
}
58
+
59
// Escape hatch for tests/ops. The SSRF guard normally refuses targets that
// resolve to private or loopback addresses, but a local test server listens on
// 127.0.0.1, so those tests need a way to switch the check off. Production is
// expected to leave WB_ALLOW_PRIVATE_DOWNLOAD_IP unset.
/**
 * @returns {boolean} true only when WB_ALLOW_PRIVATE_DOWNLOAD_IP is one of
 *   "1", "true", "yes" or "on" (case-insensitive, no trimming).
 */
function privateDownloadIpAllowed() {
  const raw = process.env.WB_ALLOW_PRIVATE_DOWNLOAD_IP || "";
  const truthyFlags = ["1", "true", "yes", "on"];
  return truthyFlags.includes(String(raw).toLowerCase());
}
66
+
67
// Build the host validator applied to the initial signed URL *and* every
// redirect hop, so a redirect cannot escape to a host outside the configured
// allowlist / the built-in signed-host check.
/**
 * @param {{ hosts?: unknown } | null | undefined} signedCfg Parsed signed-URL
 *   config; only its `hosts` list is read here.
 * @returns {(host: string) => boolean} Predicate that is true when `host`
 *   equals an allowlisted host, is a subdomain of one, or satisfies
 *   `isSignedHost`.
 */
function makeSignedHostValidator(signedCfg) {
  // Normalize the allowlist once. The candidate host is lower-cased below, so
  // the entries must be too or a mixed-case entry could never match. Empty
  // entries are dropped: "" would otherwise match any host ending in ".".
  // A non-array `hosts` value is treated as "no allowlist" instead of throwing.
  const configured = Array.isArray(signedCfg?.hosts) ? signedCfg.hosts : [];
  const hosts = configured
    .map((entry) => String(entry ?? "").trim().toLowerCase())
    .filter((entry) => entry.length > 0);

  return (host) => {
    if (!host) return false;
    const h = String(host).toLowerCase();
    // Exact match or a true subdomain; the leading "." prevents
    // "evilexample.com" from matching "example.com".
    const hostAllowed = hosts.some((x) => h === x || h.endsWith(`.${x}`));
    return hostAllowed || isSignedHost(h);
  };
}
80
+
34
81
  // Page-side hook that traps blob/data-URL anchor clicks the SPA performs
35
82
  // programmatically — `URL.createObjectURL(blob)` + `<a download>` + `.click()`.
36
83
  // Playwright's own `download` event normally catches these, but a handful
@@ -114,15 +161,24 @@ export default {
114
161
  const allowlist = parseExtensionAllowlist(
115
162
  process.env.WB_BROWSER_DOWNLOAD_EXTENSIONS,
116
163
  );
164
+ const signedCfg = parseSignedConfig(args.signed_url);
165
+ const signedEnabled = signedCfg.enabled !== false;
117
166
 
118
167
  // 1) Inject the page-side blob/anchor capture hook BEFORE the click so a
119
168
  // synchronously-dispatched anchor.click() inside the SPA's handler is
120
169
  // observed. Best-effort: a frame mid-navigation can reject evaluate;
121
170
  // the Playwright `download` event still works and is the primary
122
- // signal anyway.
171
+ // signal anyway. When signed-URL capture is enabled, install its
172
+ // fetch/XHR response hook in the same pre-click window so the API call
173
+ // the click triggers is observed from the start.
123
174
  try {
124
175
  await page.evaluate(PAGE_HOOK);
125
176
  } catch {}
177
+ if (signedEnabled) {
178
+ try {
179
+ await page.evaluate(SIGNED_PAGE_HOOK);
180
+ } catch {}
181
+ }
126
182
 
127
183
  // 2) Claim ownership of the next download synchronously — prepended to
128
184
  // BrowserContext listeners so it runs before lib/download-capture.js's
@@ -148,16 +204,25 @@ export default {
148
204
  }
149
205
  }
150
206
 
207
+ // Shared cancellation token: once the race has a winner, the losing
208
+ // pollers stop on their next tick instead of spinning page.evaluate against
209
+ // a (possibly navigating/closing) page for the rest of the timeout window.
210
+ const stop = { done: false };
211
+
151
212
  try {
152
- // 3) Race the two capture sources against the click. The download event
153
- // AND the click run concurrently — Playwright's standard pattern,
154
- // since the click can resolve before or after the download fires.
213
+ // 3) Race the capture sources against the click. The download event AND
214
+ // the click run concurrently — Playwright's standard pattern, since
215
+ // the click can resolve before or after the download fires.
155
216
  const downloadPromise = page
156
217
  .waitForEvent("download", { timeout })
157
218
  .then((d) => ({ kind: "playwright", download: d }))
158
219
  .catch((e) => ({ kind: "playwright_failed", error: e }));
159
220
 
160
- const blobPromise = pollForBlob(page, timeout);
221
+ const blobPromise = pollForBlob(page, timeout, stop);
222
+
223
+ const signedPromise = signedEnabled
224
+ ? pollForSignedUrl(page, timeout, signedCfg, stop)
225
+ : null;
161
226
 
162
227
  let clickError = null;
163
228
  const clickPromise = (async () => {
@@ -181,7 +246,11 @@ export default {
181
246
  }
182
247
  })();
183
248
 
184
- const winner = await raceCaptures(downloadPromise, blobPromise);
249
+ const winner = await raceCaptures(
250
+ [downloadPromise, blobPromise, signedPromise].filter(Boolean),
251
+ );
252
+ // Winner decided (success or all-failed) — release the losing pollers.
253
+ stop.done = true;
185
254
  // Wait for the click to settle so we surface its error (if any) over
186
255
  // a generic "no file captured" — a click that never landed is the
187
256
  // more actionable failure.
@@ -208,6 +277,18 @@ export default {
208
277
  ctx,
209
278
  });
210
279
  }
280
+ if (winner.success && winner.kind === "signed_url") {
281
+ return await saveSignedUrlDownload({
282
+ signed: winner.signed,
283
+ artifactsDir,
284
+ allowlist,
285
+ explicitPath,
286
+ page,
287
+ ctx,
288
+ timeout,
289
+ signedCfg,
290
+ });
291
+ }
211
292
 
212
293
  // No capture won — emit structured failure diagnostics.
213
294
  const reasons = winner.failures
@@ -217,6 +298,9 @@ export default {
217
298
  }
218
299
  if (f.kind === "blob_failed") return `blob hook: ${f.error}`;
219
300
  if (f.kind === "blob_timeout") return `blob hook: no capture within ${timeout}ms`;
301
+ if (f.kind === "signed_failed") return `signed url: ${f.error}`;
302
+ if (f.kind === "signed_timeout")
303
+ return `signed url: no signed file URL seen within ${timeout}ms`;
220
304
  return f.kind;
221
305
  })
222
306
  .join("; ");
@@ -233,6 +317,9 @@ export default {
233
317
  `download: no file captured within ${timeout}ms after clicking ${args.selector} (page=${safePageUrl(page) || "?"}). ${reasons}`,
234
318
  );
235
319
  } finally {
320
+ // Backstop: ensure pollers are released on any exit path (thrown
321
+ // click/save error, extension rejection, etc.).
322
+ stop.done = true;
236
323
  if (attached && browserContext && typeof browserContext.off === "function") {
237
324
  try {
238
325
  browserContext.off("download", claim);
@@ -333,13 +420,17 @@ async function saveBlobDownload({
333
420
  return `→ ${path.basename(target)}`;
334
421
  }
335
422
 
336
- // Race two capture promises. First to report success wins. Both must report
337
- // before we declare failure, so the diagnostics frame can list every reason
338
- // the verb didn't see a file. (Promise.race would shortcut on a fast failure
339
- // and discard the slower success.)
340
- function raceCaptures(downloadPromise, blobPromise) {
423
+ // Race N capture promises. First to report success wins. Every source must
424
+ // report before we declare failure, so the diagnostics frame can list every
425
+ // reason the verb didn't see a file. (Promise.race would shortcut on a fast
426
+ // failure and discard a slower success.) Each promise resolves to an object
427
+ // whose `kind` names the source: a success kind ("playwright" | "blob" |
428
+ // "signed_url") or a failure kind ("*_failed" | "*_timeout").
429
+ const SUCCESS_KINDS = new Set(["playwright", "blob", "signed_url"]);
430
+
431
+ function raceCaptures(promises) {
341
432
  return new Promise((resolve) => {
342
- let outstanding = 2;
433
+ let outstanding = promises.length;
343
434
  const failures = [];
344
435
  const finish = (settled) => {
345
436
  if (settled.success) {
@@ -349,20 +440,217 @@ function raceCaptures(downloadPromise, blobPromise) {
349
440
  failures.push(settled);
350
441
  if (--outstanding === 0) resolve({ success: false, failures });
351
442
  };
352
- downloadPromise.then((r) => {
353
- if (r.kind === "playwright") finish({ success: true, ...r });
354
- else finish({ success: false, ...r });
443
+ for (const pr of promises) {
444
+ pr.then((r) => {
445
+ if (SUCCESS_KINDS.has(r.kind)) finish({ success: true, ...r });
446
+ else finish({ success: false, ...r });
447
+ });
448
+ }
449
+ });
450
+ }
451
+
452
+ async function saveSignedUrlDownload({
453
+ signed,
454
+ artifactsDir,
455
+ allowlist,
456
+ explicitPath,
457
+ page,
458
+ ctx,
459
+ timeout,
460
+ signedCfg,
461
+ }) {
462
+ const redacted = redactSignedUrl(signed.url);
463
+ // Filename: explicit path: wins, else the signed URL's basename, else a
464
+ // generic fallback. (S3 keys usually end in the real filename.)
465
+ let nameFromUrl = "";
466
+ try {
467
+ nameFromUrl = path.basename(new URL(signed.url).pathname) || "";
468
+ } catch {}
469
+ const suggested = explicitPath || (nameFromUrl.trim() ? nameFromUrl : FALLBACK_NAME);
470
+ if (!extensionAllowed(suggested, allowlist)) {
471
+ throw new Error(
472
+ `download: file "${suggested}" rejected by WB_BROWSER_DOWNLOAD_EXTENSIONS`,
473
+ );
474
+ }
475
+ const target = uniquePathInside(artifactsDir, suggested);
476
+ if (!target) {
477
+ throw new Error(
478
+ `download: refusing to save "${suggested}" — resolves outside $WB_ARTIFACTS_DIR`,
479
+ );
480
+ }
481
+ await fsPromises.mkdir(artifactsDir, { recursive: true });
482
+
483
+ const maxBytes = maxDownloadBytes();
484
+ const failed = (extra, reason) =>
485
+ send({
486
+ type: "slice.download_failed",
487
+ verb: "download",
488
+ verb_index: ctx?.index ?? null,
489
+ capture: "signed_url",
490
+ api_url: signed.api_url,
491
+ signed_url: redacted,
492
+ page_url: safePageUrl(page),
493
+ ...extra,
494
+ reason,
355
495
  });
356
- blobPromise.then((r) => {
357
- if (r.kind === "blob") finish({ success: true, ...r });
358
- else finish({ success: false, ...r });
496
+
497
+ // Fetch the signed URL from the sidecar (not the page) so the object store's
498
+ // CORS policy doesn't block the read. Redirects are followed *manually* with
499
+ // the same host allowlist + private-IP block applied to every hop (SSRF), and
500
+ // the body-read timeout stays armed until we finish streaming. The label is
501
+ // redacted — retry logs must never echo signed credentials.
502
+ let res;
503
+ try {
504
+ res = await guardedDownloadFetch(signed.url, {
505
+ timeoutMs: timeout,
506
+ validateHost: makeSignedHostValidator(signedCfg),
507
+ allowPrivateIp: privateDownloadIpAllowed(),
508
+ label: `signed-url download (${redacted})`,
359
509
  });
360
- });
510
+ } catch (e) {
511
+ failed({}, `signed url fetch error: ${e?.message || e}`);
512
+ throw new Error(
513
+ `download: signed URL fetch failed for ${redacted}: ${e?.message || e}`,
514
+ );
515
+ }
516
+
517
+ try {
518
+ if (!res.ok) {
519
+ // A 403 on a pre-signed URL almost always means the token expired before
520
+ // we fetched it — call that out so the operator knows to shorten the gap.
521
+ const expired = res.status === 403;
522
+ await drainResponseBody(res); // don't leak the socket
523
+ failed(
524
+ { http_status: res.status, expired },
525
+ `signed url fetch: HTTP ${res.status}${expired ? " (likely expired)" : ""}`,
526
+ );
527
+ throw new Error(
528
+ `download: signed URL fetch returned HTTP ${res.status} for ${redacted}${expired ? " (likely expired)" : ""}`,
529
+ );
530
+ }
531
+
532
+ // Reject up front when the server *declares* an oversized body...
533
+ const clRaw = safeHeader(res, "content-length");
534
+ const cl = clRaw == null ? NaN : Number(clRaw);
535
+ if (Number.isFinite(cl) && cl > maxBytes) {
536
+ await drainResponseBody(res);
537
+ failed(
538
+ { content_length: cl, max_bytes: maxBytes },
539
+ `signed url body too large: Content-Length ${cl} > cap ${maxBytes}`,
540
+ );
541
+ throw new Error(
542
+ `download: signed URL body exceeds size cap (${cl} > ${maxBytes} bytes) for ${redacted}`,
543
+ );
544
+ }
545
+
546
+ // ...and enforce while streaming so a lying/absent Content-Length can't slip
547
+ // past. Streams to disk rather than materializing the whole body in memory.
548
+ let bytes;
549
+ try {
550
+ bytes = await streamToFileWithCap(res, target, maxBytes);
551
+ } catch (e) {
552
+ if (e && e.code === "WB_SIZE_CAP") {
553
+ failed(
554
+ { max_bytes: maxBytes },
555
+ `signed url body exceeded size cap of ${maxBytes} bytes mid-stream`,
556
+ );
557
+ throw new Error(
558
+ `download: signed URL body exceeded size cap (${maxBytes} bytes) for ${redacted}`,
559
+ );
560
+ }
561
+ failed({}, `signed url body read error: ${e?.message || e}`);
562
+ throw new Error(
563
+ `download: signed URL body read failed for ${redacted}: ${e?.message || e}`,
564
+ );
565
+ }
566
+
567
+ const contentType = safeHeader(res, "content-type");
568
+ const contentDisposition = safeHeader(res, "content-disposition");
569
+ send({
570
+ type: "slice.artifact_saved",
571
+ filename: path.basename(target),
572
+ path: target,
573
+ bytes,
574
+ source: "download",
575
+ provenance: {
576
+ url: null,
577
+ signed_url: redacted,
578
+ api_url: signed.api_url,
579
+ field: signed.field,
580
+ suggested_filename: suggested,
581
+ page_url: safePageUrl(page),
582
+ verb_index: ctx?.index ?? null,
583
+ verb_name: "download",
584
+ capture: "signed_url",
585
+ content_type: contentType,
586
+ content_disposition: contentDisposition,
587
+ ts: Date.now(),
588
+ },
589
+ });
590
+ return `→ ${path.basename(target)}`;
591
+ } finally {
592
+ // Body fully consumed (or we bailed) — disarm the body-read timeout.
593
+ releaseBodyTimeout(res);
594
+ }
361
595
  }
362
596
 
363
- async function pollForBlob(page, timeoutMs) {
597
// Stream a response body to disk, counting bytes and aborting the in-flight
// read once the cap is exceeded (so an absent/under-stated Content-Length can't
// OOM us). Removes the partial file on any error. Returns total bytes written.
/**
 * @param {{ body?: { getReader?: () => any } | null }} res Fetch-style response
 *   whose body exposes a stream reader; a missing body yields an empty file.
 * @param {string} target Absolute path of the file to create/truncate.
 * @param {number} maxBytes Maximum number of body bytes accepted.
 * @returns {Promise<number>} Total bytes written to `target`.
 * @throws {Error} With `code === "WB_SIZE_CAP"` when the body exceeds
 *   `maxBytes`; otherwise the underlying read/open/write error is rethrown.
 */
async function streamToFileWithCap(res, target, maxBytes) {
  const reader = res.body?.getReader?.();

  // Best-effort release of the response body. cancel() can reject when the
  // stream has already errored; there is nothing further to do in that case.
  const cancelReader = async () => {
    if (!reader) return;
    try {
      await reader.cancel();
    } catch {}
  };

  // Open the file with an awaited FileHandle rather than a lazily-opened write
  // stream: open/write failures surface as ordinary rejections (no unhandled
  // 'error' event, no waiting on a 'close' that may already have fired), and
  // there is no open-vs-unlink race during cleanup.
  let handle;
  try {
    handle = await fsPromises.open(target, "w");
  } catch (e) {
    // Nothing of ours is on disk yet, so only the body needs releasing.
    await cancelReader();
    throw e;
  }

  let total = 0;
  try {
    if (reader) {
      for (;;) {
        const { done, value } = await reader.read();
        if (done) break;
        total += value.byteLength;
        if (total > maxBytes) {
          abortBody(res); // abort the underlying socket read
          const err = new Error(`size cap exceeded`);
          err.code = "WB_SIZE_CAP";
          throw err;
        }
        // View over the chunk (no copy); loop because a single write() call is
        // not guaranteed to flush the whole buffer.
        const chunk = Buffer.from(value.buffer, value.byteOffset, value.byteLength);
        let offset = 0;
        while (offset < chunk.length) {
          const { bytesWritten } = await handle.write(
            chunk,
            offset,
            chunk.length - offset,
          );
          offset += bytesWritten;
        }
      }
    }
    // No reader (e.g. 204) falls through here and leaves an empty file.
    await handle.close();
    return total;
  } catch (e) {
    await cancelReader();
    try {
      await handle.close();
    } catch {}
    try {
      await fsPromises.unlink(target);
    } catch {}
    throw e;
  }
}
649
+
650
+ async function pollForBlob(page, timeoutMs, stop) {
364
651
  const deadline = Date.now() + timeoutMs;
365
652
  while (true) {
653
+ if (stop?.done) return { kind: "blob_timeout" };
366
654
  let result;
367
655
  try {
368
656
  result = await page.evaluate(POLL_SCRIPT);
@@ -376,6 +664,38 @@ async function pollForBlob(page, timeoutMs) {
376
664
  }
377
665
  }
378
666
 
667
// Repeatedly read the page-side signed-URL candidate buffer until one candidate
// satisfies the configured policy, the shared stop token is set, or the
// deadline elapses. Only the candidate is returned here; fetching the bytes is
// left to the caller. Never throws: a failing page.evaluate is treated as
// "nothing yet" and polling continues.
/**
 * @param {{ evaluate: Function }} page Page handle used to run the poll script.
 * @param {number} timeoutMs Polling window in milliseconds.
 * @param {object} signedCfg Policy passed through to pickSignedCandidate.
 * @param {{ done: boolean } | undefined} stop Shared cancellation token.
 * @returns {Promise<{kind: "signed_url", signed: object} | {kind: "signed_timeout"}>}
 */
async function pollForSignedUrl(page, timeoutMs, signedCfg, stop) {
  const deadline = Date.now() + timeoutMs;
  const sleep = (ms) => new Promise((wake) => setTimeout(wake, ms));

  for (;;) {
    // A winner elsewhere in the race releases this poller on its next tick.
    if (stop?.done) return { kind: "signed_timeout" };

    const candidates = await page.evaluate(SIGNED_POLL_SCRIPT).catch(() => null);

    if (Array.isArray(candidates) && candidates.length) {
      const chosen = pickSignedCandidate(candidates, signedCfg);
      if (chosen) return { kind: "signed_url", signed: chosen };
    }

    // Deadline is checked after the attempt, so at least one poll always runs.
    if (Date.now() >= deadline) return { kind: "signed_timeout" };
    await sleep(POLL_INTERVAL_MS);
  }
}
690
+
691
/**
 * Read a response header without ever throwing.
 *
 * @param {{ headers?: { get?: (name: string) => unknown } }} res Response-like object.
 * @param {string} name Header name passed to `headers.get`.
 * @returns {*} The header value when it is truthy; `null` when the header is
 *   missing, empty, or the lookup throws (including a null/undefined `res`).
 */
function safeHeader(res, name) {
  let value;
  try {
    value = res.headers?.get?.(name);
  } catch {
    return null;
  }
  return value ? value : null;
}
698
+
379
699
  function safePageUrl(page) {
380
700
  try {
381
701
  return page.url();
package/verbs/goto.js CHANGED
@@ -1,10 +1,16 @@
1
+ import { scrubSecrets } from "../lib/substitution.js";
2
+
1
3
/**
 * `goto` verb: navigate the page to `args.url` and report where it landed.
 *
 * Reads `args.url` (default ""), `args.wait_until` (default
 * "domcontentloaded") and `args.timeout` (default 30000 ms).
 */
const gotoVerb = {
  name: "goto",
  primaryKey: "url",
  async execute(page, args, ctx) {
    const destination = args.url ?? "";
    const navigationOptions = {
      waitUntil: args.wait_until ?? "domcontentloaded",
      timeout: args.timeout ?? 30_000,
    };
    await page.goto(destination, navigationOptions);
    // The final URL may embed a substituted secret (for example
    // ?token={{ env.TOKEN }}), so pass it through scrubSecrets with the
    // context's secrets before it becomes the verb's summary string.
    const landedUrl = scrubSecrets(page.url(), ctx?.secrets);
    return `→ ${landedUrl}`;
  },
};

export default gotoVerb;
  };