wb-browser-runtime 0.12.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,596 @@
1
+ // download — explicit "click and capture" verb.
2
+ //
3
+ // The passive listener in lib/download-capture.js already saves any file the
4
+ // browser downloads, but it has no say over the filename and announces a
5
+ // `slice.artifact_saved` frame asynchronously after `saveAs` resolves. Some
6
+ // runbooks want stronger guarantees:
7
+ // - "the file lands at exactly $WB_ARTIFACTS_DIR/<path>"
8
+ // - "if it doesn't appear within ~10s, fail the slice with diagnostics"
9
+ // - works for SPAs that build the file in-page via fetch/XHR + Blob and
10
+ // don't always trip Playwright's `download` event reliably
11
+ //
12
+ // This verb installs capture hooks BEFORE clicking, races
13
+ // `page.waitForEvent("download")` against an in-page blob/anchor capture
14
+ // hook, and either saves the bytes itself (blob path) or hands the
15
+ // Playwright Download to `saveAs` (download path). Whichever path wins,
16
+ // the verb sets HANDLED_MARK on the Download (when applicable) so the
17
+ // passive listener doesn't double-save.
18
+
19
+ import path from "node:path";
20
+ import { Buffer } from "node:buffer";
21
+ import { promises as fsPromises } from "node:fs";
22
+ import { send } from "../lib/io.js";
23
+ import {
24
+ uniquePathInside,
25
+ parseExtensionAllowlist,
26
+ extensionAllowed,
27
+ } from "../lib/util.js";
28
+ import { HANDLED_MARK } from "../lib/download-capture.js";
29
+ import { retryableFetch } from "../lib/http.js";
30
+ import {
31
+ SIGNED_PAGE_HOOK,
32
+ SIGNED_POLL_SCRIPT,
33
+ parseSignedConfig,
34
+ pickSignedCandidate,
35
+ redactSignedUrl,
36
+ } from "../lib/signed-url-capture.js";
37
+
38
// Capture window in milliseconds, used when the verb's `timeout` argument is
// null or undefined.
const DEFAULT_TIMEOUT_MS = 10_000;
// Delay in milliseconds between two page-side polls in the poll loops of this
// module.
const POLL_INTERVAL_MS = 50;
// Filename used when neither an explicit path nor a suggested/derived name is
// available.
const FALLBACK_NAME = "download.bin";
41
+
42
// Page-side hook that traps blob/data-URL anchor clicks the SPA performs
// programmatically — `URL.createObjectURL(blob)` + `<a download>` + `.click()`.
// It wraps `HTMLAnchorElement.prototype.click`: when the clicked anchor has a
// `download` attribute and its href starts with `blob:` or `data:`, the hook
// re-fetches that URL in-page, base64-encodes the bytes, and stashes
// `{ filename, bytes, mimeType }` on `window.__wbDownload` for the Node side
// to poll. If the re-fetch or encoding throws, `{ error }` is stored instead.
// The original click always runs afterwards, whether or not a capture started.
//
// NOTE(review): only `anchor.click()` calls are intercepted by this script.
// Downloads started via `window.open(blobUrl)` or `window.location = blobUrl`
// are not wrapped here — confirm whether another mechanism covers them.
//
// Idempotent: re-installing on each verb invocation is a no-op after the
// first (guarded by `window.__wbDownloadInstalled`). We never uninstall —
// leaves the page in a slightly altered state but the wrapped click is
// functionally equivalent to the original.
const PAGE_HOOK = `(() => {
if (window.__wbDownloadInstalled) return;
window.__wbDownloadInstalled = true;
window.__wbDownload = null;

const captureBlob = async (target, filename, mime) => {
try {
let blob;
if (typeof target === "string") {
const resp = await fetch(target);
blob = await resp.blob();
} else {
blob = target;
}
const buf = await blob.arrayBuffer();
const bin = new Uint8Array(buf);
let s = "";
for (let i = 0; i < bin.length; i++) s += String.fromCharCode(bin[i]);
window.__wbDownload = {
filename: filename || "download.bin",
bytes: btoa(s),
mimeType: mime || blob.type || "application/octet-stream",
};
} catch (e) {
window.__wbDownload = { error: String((e && e.message) || e) };
}
};

const origClick = HTMLAnchorElement.prototype.click;
HTMLAnchorElement.prototype.click = function () {
try {
const href = this.getAttribute("href") || this.href || "";
const hasDownload = this.hasAttribute("download");
if (hasDownload && (href.startsWith("blob:") || href.startsWith("data:"))) {
const fname = this.getAttribute("download") || this.download || "";
captureBlob(href, fname);
}
} catch {}
return origClick.apply(this, arguments);
};
})()`;
94
+
95
// Read-and-clear of `window.__wbDownload`, evaluated in the page by the blob
// poll loop. Returning the value AND nulling it lets the page hook capture
// multiple downloads across separate verb calls without leaking state from a
// prior call into the next poll. Because the slot is cleared on every read,
// whatever the script returns must be consumed by the caller — it will not be
// seen again on the next poll.
const POLL_SCRIPT = `(() => {
const v = window.__wbDownload;
window.__wbDownload = null;
return v;
})()`;
103
+
104
// Verb definition for `download`: clicks a selector and saves the file that
// click produces into $WB_ARTIFACTS_DIR.
export default {
  name: "download",
  primaryKey: "selector",
  /**
   * Click `args.selector` and capture the resulting file through whichever
   * source reports first: Playwright's `download` event, the in-page blob
   * hook, or (unless disabled) the signed-URL hook.
   *
   * Arguments read from `args`:
   *   - `selector` (required) — element to click.
   *   - `timeout` — milliseconds; falls back to DEFAULT_TIMEOUT_MS when
   *     null/undefined. Used for the click and for every capture source.
   *   - `path` — optional non-blank string used as the filename instead of the
   *     suggested/derived one.
   *   - `text_fallback` — optional text to click when the primary click times
   *     out.
   *   - `signed_url` — passed to `parseSignedConfig`; signed-URL capture runs
   *     unless the parsed config has `enabled === false`.
   *
   * `ctx` is only read for `ctx?.index` (reported as `verb_index`).
   *
   * Returns the value of the save helper that handled the winning capture.
   * Throws when $WB_ARTIFACTS_DIR is unset, when `selector` is missing, when
   * the click failed, when a save helper throws, or when no source captured a
   * file (after sending a `slice.download_failed` frame).
   */
  async execute(page, args, ctx) {
    const artifactsDir = (process.env.WB_ARTIFACTS_DIR || "").trim();
    if (!artifactsDir) {
      throw new Error(
        "download: $WB_ARTIFACTS_DIR is not set — run this workbook via `wb run` (wb exports the dir for you)",
      );
    }
    if (!args.selector) {
      throw new Error("download: `selector` is required");
    }
    // NOTE(review): `timeout` is used as given — no type or range check here.
    const timeout = args.timeout ?? DEFAULT_TIMEOUT_MS;
    // A blank or non-string `path` is treated as "not provided".
    const explicitPath =
      typeof args.path === "string" && args.path.trim()
        ? args.path.trim()
        : null;
    const allowlist = parseExtensionAllowlist(
      process.env.WB_BROWSER_DOWNLOAD_EXTENSIONS,
    );
    const signedCfg = parseSignedConfig(args.signed_url);
    // Signed-URL capture is on unless the config explicitly says `false`.
    const signedEnabled = signedCfg.enabled !== false;

    // 1) Inject the page-side blob/anchor capture hook BEFORE the click so a
    // synchronously-dispatched anchor.click() inside the SPA's handler is
    // observed. Best-effort: a frame mid-navigation can reject evaluate;
    // the Playwright `download` event still works and is the primary
    // signal anyway. When signed-URL capture is enabled, install its
    // fetch/XHR response hook in the same pre-click window so the API call
    // the click triggers is observed from the start.
    try {
      await page.evaluate(PAGE_HOOK);
    } catch {}
    if (signedEnabled) {
      try {
        await page.evaluate(SIGNED_PAGE_HOOK);
      } catch {}
    }

    // 2) Claim ownership of the next download synchronously — prepended to
    // BrowserContext listeners so it runs before lib/download-capture.js's
    // passive listener has a chance to start its async capture chain. The
    // HANDLED_MARK tells the passive listener to bail.
    // The listener is registered on the browser context, so it marks every
    // `download` event emitted there while it is attached.
    const claim = (download) => {
      try {
        download[HANDLED_MARK] = true;
      } catch {}
    };
    const browserContext = safeContext(page);
    // Tracks whether `claim` was registered, so `finally` knows to remove it.
    let attached = false;
    if (browserContext) {
      if (typeof browserContext.prependListener === "function") {
        browserContext.prependListener("download", claim);
        attached = true;
      } else if (typeof browserContext.on === "function") {
        // Fallback: append. Race window is tiny (passive listener checks
        // HANDLED_MARK before its first await), but ordering isn't
        // guaranteed without prependListener.
        browserContext.on("download", claim);
        attached = true;
      }
    }

    // Shared cancellation token: once the race has a winner, the losing
    // pollers stop on their next tick instead of spinning page.evaluate against
    // a (possibly navigating/closing) page for the rest of the timeout window.
    const stop = { done: false };

    try {
      // 3) Race the capture sources against the click. The download event AND
      // the click run concurrently — Playwright's standard pattern, since
      // the click can resolve before or after the download fires.
      // Each source resolves (never rejects) to an object tagged with `kind`.
      const downloadPromise = page
        .waitForEvent("download", { timeout })
        .then((d) => ({ kind: "playwright", download: d }))
        .catch((e) => ({ kind: "playwright_failed", error: e }));

      const blobPromise = pollForBlob(page, timeout, stop);

      const signedPromise = signedEnabled
        ? pollForSignedUrl(page, timeout, signedCfg, stop)
        : null;

      // The click runs in the background; its error is recorded rather than
      // thrown so the capture race below is not interrupted.
      let clickError = null;
      const clickPromise = (async () => {
        try {
          await page.click(args.selector, { timeout });
        } catch (err) {
          const isTimeout = err && err.name === "TimeoutError";
          // NOTE(review): this fallback click only starts after the primary
          // click has timed out, i.e. roughly `timeout` ms after the capture
          // sources above were started with that same `timeout`. Their
          // windows may already have expired by the time the fallback click
          // lands — confirm a fallback click can still yield a captured file.
          if (isTimeout && args.text_fallback) {
            try {
              await page
                .getByText(args.text_fallback, { exact: false })
                .first()
                .click({ timeout });
              return;
            } catch {
              // Report the primary click's error, not the fallback's.
              clickError = err;
              return;
            }
          }
          clickError = err;
        }
      })();

      // `signedPromise` is null when signed-URL capture is disabled.
      const winner = await raceCaptures(
        [downloadPromise, blobPromise, signedPromise].filter(Boolean),
      );
      // Winner decided (success or all-failed) — release the losing pollers.
      stop.done = true;
      // Wait for the click to settle so we surface its error (if any) over
      // a generic "no file captured" — a click that never landed is the
      // more actionable failure.
      // A recorded click error is thrown even if a capture source succeeded.
      await clickPromise;
      if (clickError) throw clickError;

      if (winner.success && winner.kind === "playwright") {
        return await savePlaywrightDownload({
          download: winner.download,
          artifactsDir,
          allowlist,
          explicitPath,
          page,
          ctx,
        });
      }
      if (winner.success && winner.kind === "blob") {
        return await saveBlobDownload({
          blob: winner.blob,
          artifactsDir,
          allowlist,
          explicitPath,
          page,
          ctx,
        });
      }
      if (winner.success && winner.kind === "signed_url") {
        return await saveSignedUrlDownload({
          signed: winner.signed,
          artifactsDir,
          allowlist,
          explicitPath,
          page,
          ctx,
          timeout,
        });
      }

      // No capture won — emit structured failure diagnostics.
      // One human-readable reason per failed source, joined with "; ".
      const reasons = winner.failures
        .map((f) => {
          if (f.kind === "playwright_failed") {
            return `playwright download: ${f.error?.message || f.error}`;
          }
          if (f.kind === "blob_failed") return `blob hook: ${f.error}`;
          if (f.kind === "blob_timeout") return `blob hook: no capture within ${timeout}ms`;
          if (f.kind === "signed_failed") return `signed url: ${f.error}`;
          if (f.kind === "signed_timeout")
            return `signed url: no signed file URL seen within ${timeout}ms`;
          // Unknown failure kinds are reported by their bare `kind`.
          return f.kind;
        })
        .join("; ");
      send({
        type: "slice.download_failed",
        verb: "download",
        verb_index: ctx?.index ?? null,
        selector: args.selector,
        timeout_ms: timeout,
        page_url: safePageUrl(page),
        reason: reasons,
      });
      throw new Error(
        `download: no file captured within ${timeout}ms after clicking ${args.selector} (page=${safePageUrl(page) || "?"}). ${reasons}`,
      );
    } finally {
      // Backstop: ensure pollers are released on any exit path (thrown
      // click/save error, extension rejection, etc.).
      stop.done = true;
      // Remove the claim listener so later downloads are not marked handled.
      if (attached && browserContext && typeof browserContext.off === "function") {
        try {
          browserContext.off("download", claim);
        } catch {}
      }
    }
  },
};
291
+
292
/**
 * Save a Playwright `Download` into the artifacts directory and announce it
 * with a `slice.artifact_saved` frame.
 *
 * The filename is `explicitPath` when given, otherwise the download's
 * suggested filename (with the module's fallback name as last resort).
 *
 * @param {object} opts
 * @param {object} opts.download - Playwright Download (`saveAs`, `cancel`).
 * @param {string} opts.artifactsDir - Root directory files must stay inside.
 * @param {*} opts.allowlist - Parsed extension allowlist, passed through to
 *   `extensionAllowed`.
 * @param {string|null} opts.explicitPath - Caller-chosen relative path, or null.
 * @param {object} opts.page - Page, used only to report its URL.
 * @param {object} [opts.ctx] - Verb context; only `ctx.index` is read.
 * @returns {Promise<string>} `→ <basename of the saved file>`.
 * @throws {Error} When the extension is rejected (the download is cancelled
 *   first) or the target resolves outside the artifacts directory.
 */
async function savePlaywrightDownload({
  download,
  artifactsDir,
  allowlist,
  explicitPath,
  page,
  ctx,
}) {
  const suggested = explicitPath || safeSuggestedFilename(download);
  const sourceUrl = safeUrl(download);
  if (!extensionAllowed(suggested, allowlist)) {
    try {
      await download.cancel();
    } catch {
      // Best-effort: a failed cancel must not mask the rejection below.
    }
    throw new Error(
      `download: file "${suggested}" rejected by WB_BROWSER_DOWNLOAD_EXTENSIONS`,
    );
  }
  const target = uniquePathInside(artifactsDir, suggested);
  if (!target) {
    throw new Error(
      `download: refusing to save "${suggested}" — resolves outside $WB_ARTIFACTS_DIR`,
    );
  }
  // Create the target's own parent, not just the artifacts root, so an
  // explicit path with subdirectories ("reports/q1.csv") does not depend on
  // `saveAs` creating intermediate directories. `recursive` also creates the
  // artifacts root itself when it is missing.
  await fsPromises.mkdir(path.dirname(target), { recursive: true });
  await download.saveAs(target);
  // Size is informational; report null when the file cannot be stat'ed.
  let bytes = null;
  try {
    bytes = (await fsPromises.stat(target)).size;
  } catch {
    bytes = null;
  }
  send({
    type: "slice.artifact_saved",
    filename: path.basename(target),
    path: target,
    bytes,
    source: "download",
    provenance: {
      url: sourceUrl,
      suggested_filename: suggested,
      page_url: safePageUrl(page),
      verb_index: ctx?.index ?? null,
      verb_name: "download",
      ts: Date.now(),
    },
  });
  return `→ ${path.basename(target)}`;
}
339
+
340
/**
 * Write bytes captured by the in-page blob hook into the artifacts directory
 * and announce them with a `slice.artifact_saved` frame.
 *
 * The filename is `explicitPath` when given, else the filename reported by the
 * hook, else the module's fallback name.
 *
 * @param {object} opts
 * @param {{filename?: string, bytes: string, mimeType?: string}} opts.blob -
 *   Capture record; `bytes` is base64-encoded.
 * @param {string} opts.artifactsDir - Root directory files must stay inside.
 * @param {*} opts.allowlist - Parsed extension allowlist, passed through to
 *   `extensionAllowed`.
 * @param {string|null} opts.explicitPath - Caller-chosen relative path, or null.
 * @param {object} opts.page - Page, used only to report its URL.
 * @param {object} [opts.ctx] - Verb context; only `ctx.index` is read.
 * @returns {Promise<string>} `→ <basename of the saved file>`.
 * @throws {Error} When the extension is rejected or the target resolves
 *   outside the artifacts directory.
 */
async function saveBlobDownload({
  blob,
  artifactsDir,
  allowlist,
  explicitPath,
  page,
  ctx,
}) {
  const suggested = explicitPath || blob.filename || FALLBACK_NAME;
  if (!extensionAllowed(suggested, allowlist)) {
    throw new Error(
      `download: file "${suggested}" rejected by WB_BROWSER_DOWNLOAD_EXTENSIONS`,
    );
  }
  const target = uniquePathInside(artifactsDir, suggested);
  if (!target) {
    throw new Error(
      `download: refusing to save "${suggested}" — resolves outside $WB_ARTIFACTS_DIR`,
    );
  }
  const buf = Buffer.from(blob.bytes, "base64");
  // `writeFile` does not create missing directories. Create the target's own
  // parent (not just the artifacts root) so an explicit path with
  // subdirectories does not fail with ENOENT. `recursive` also creates the
  // artifacts root itself when it is missing.
  await fsPromises.mkdir(path.dirname(target), { recursive: true });
  await fsPromises.writeFile(target, buf);
  send({
    type: "slice.artifact_saved",
    filename: path.basename(target),
    path: target,
    bytes: buf.length,
    source: "download",
    provenance: {
      url: null,
      suggested_filename: suggested,
      page_url: safePageUrl(page),
      verb_index: ctx?.index ?? null,
      verb_name: "download",
      mime_type: blob.mimeType || null,
      capture: "blob",
      ts: Date.now(),
    },
  });
  return `→ ${path.basename(target)}`;
}
382
+
383
// Race N capture promises. First to report success wins. Every source must
// report before we declare failure, so the diagnostics frame can list every
// reason the verb didn't see a file. (Promise.race would shortcut on a fast
// failure and discard a slower success.) Each promise resolves to an object
// whose `kind` names the source: a success kind ("playwright" | "blob" |
// "signed_url") or a failure kind ("*_failed" | "*_timeout").
const SUCCESS_KINDS = new Set(["playwright", "blob", "signed_url"]);

/**
 * Resolve with the first successful capture, or with the full list of
 * failures once every source has reported.
 *
 * The returned promise never rejects and always settles:
 *   - an empty `promises` list resolves immediately as an all-failed result;
 *   - a source that rejects is recorded as a failure of kind
 *     "capture_rejected" (with the rejection reason on `error`) instead of
 *     leaving the race waiting forever for a report that will never come;
 *   - a source that resolves to a nullish value counts as a failure.
 *
 * @param {Array<Promise<{kind: string}>>} promises - Capture sources.
 * @returns {Promise<object>} On success `{ success: true, ...result }`;
 *   otherwise `{ success: false, failures }`, where each failure is
 *   `{ success: false, ...result }`.
 */
function raceCaptures(promises) {
  return new Promise((resolve) => {
    const failures = [];
    let outstanding = promises.length;
    if (outstanding === 0) {
      // Nothing to wait for — without this the promise would never resolve.
      resolve({ success: false, failures });
      return;
    }
    const finish = (settled) => {
      if (settled.success) {
        // Later calls to resolve() are no-ops, so the first success wins.
        resolve(settled);
        return;
      }
      failures.push(settled);
      if (--outstanding === 0) resolve({ success: false, failures });
    };
    for (const pr of promises) {
      // Promise.resolve() also accepts plain values and foreign thenables.
      Promise.resolve(pr).then(
        (r) => finish({ success: SUCCESS_KINDS.has(r?.kind), ...r }),
        (error) => finish({ success: false, kind: "capture_rejected", error }),
      );
    }
  });
}
411
+
412
/**
 * Fetch a captured pre-signed URL from the Node side, write the body into the
 * artifacts directory, and announce it with a `slice.artifact_saved` frame.
 *
 * Filename: the explicit `path:` wins, else the (percent-decoded) basename of
 * the signed URL's pathname, else the module's fallback name.
 *
 * On a fetch error or a non-OK HTTP status, a `slice.download_failed` frame is
 * sent before throwing. Only the redacted URL is ever placed in frames, labels
 * and error messages.
 *
 * @param {object} opts
 * @param {{url: string, api_url?: string, field?: string}} opts.signed -
 *   Picked signed-URL candidate.
 * @param {string} opts.artifactsDir - Root directory files must stay inside.
 * @param {*} opts.allowlist - Parsed extension allowlist, passed through to
 *   `extensionAllowed`.
 * @param {string|null} opts.explicitPath - Caller-chosen relative path, or null.
 * @param {object} opts.page - Page, used only to report its URL.
 * @param {object} [opts.ctx] - Verb context; only `ctx.index` is read.
 * @param {number} opts.timeout - Passed to `retryableFetch` as `timeoutMs`.
 * @returns {Promise<string>} `→ <basename of the saved file>`.
 * @throws {Error} When the extension is rejected, the target resolves outside
 *   the artifacts directory, the fetch throws (original error on `cause`), or
 *   the response is not OK.
 */
async function saveSignedUrlDownload({
  signed,
  artifactsDir,
  allowlist,
  explicitPath,
  page,
  ctx,
  timeout,
}) {
  const redacted = redactSignedUrl(signed.url);
  // (S3 keys usually end in the real filename.)
  let nameFromUrl = "";
  try {
    const rawName = path.basename(new URL(signed.url).pathname) || "";
    nameFromUrl = rawName;
    try {
      // URL pathnames are percent-encoded; decode so "my%20report.pdf" is
      // saved as "my report.pdf". basename() is re-applied because decoding
      // can surface an encoded separator (%2F).
      nameFromUrl = path.basename(decodeURIComponent(rawName));
    } catch {
      // Malformed escape sequence — keep the raw (encoded) name.
      nameFromUrl = rawName;
    }
  } catch {
    // Unparseable URL — fall through to the fallback name.
    nameFromUrl = "";
  }
  const suggested = explicitPath || (nameFromUrl.trim() ? nameFromUrl : FALLBACK_NAME);
  if (!extensionAllowed(suggested, allowlist)) {
    throw new Error(
      `download: file "${suggested}" rejected by WB_BROWSER_DOWNLOAD_EXTENSIONS`,
    );
  }
  const target = uniquePathInside(artifactsDir, suggested);
  if (!target) {
    throw new Error(
      `download: refusing to save "${suggested}" — resolves outside $WB_ARTIFACTS_DIR`,
    );
  }
  // `writeFile` does not create missing directories. Create the target's own
  // parent (not just the artifacts root) so an explicit path with
  // subdirectories does not fail with ENOENT.
  await fsPromises.mkdir(path.dirname(target), { recursive: true });

  // Fetch the signed URL from the sidecar (not the page) so the object store's
  // CORS policy doesn't block the read. The label is redacted — retry logs must
  // never echo signed credentials.
  let res;
  try {
    res = await retryableFetch(
      signed.url,
      { method: "GET" },
      `signed-url download (${redacted})`,
      { timeoutMs: timeout },
    );
  } catch (e) {
    send({
      type: "slice.download_failed",
      verb: "download",
      verb_index: ctx?.index ?? null,
      capture: "signed_url",
      api_url: signed.api_url,
      signed_url: redacted,
      page_url: safePageUrl(page),
      reason: `signed url fetch error: ${e?.message || e}`,
    });
    // Keep the original error reachable for debugging via `cause`.
    throw new Error(
      `download: signed URL fetch failed for ${redacted}: ${e?.message || e}`,
      { cause: e },
    );
  }
  if (!res.ok) {
    // A 403 on a pre-signed URL almost always means the token expired before
    // we fetched it — call that out so the operator knows to shorten the gap.
    const expired = res.status === 403;
    send({
      type: "slice.download_failed",
      verb: "download",
      verb_index: ctx?.index ?? null,
      capture: "signed_url",
      api_url: signed.api_url,
      signed_url: redacted,
      page_url: safePageUrl(page),
      http_status: res.status,
      expired,
      reason: `signed url fetch: HTTP ${res.status}${expired ? " (likely expired)" : ""}`,
    });
    throw new Error(
      `download: signed URL fetch returned HTTP ${res.status} for ${redacted}${expired ? " (likely expired)" : ""}`,
    );
  }
  const buf = Buffer.from(await res.arrayBuffer());
  await fsPromises.writeFile(target, buf);
  const contentType = safeHeader(res, "content-type");
  const contentDisposition = safeHeader(res, "content-disposition");
  send({
    type: "slice.artifact_saved",
    filename: path.basename(target),
    path: target,
    bytes: buf.length,
    source: "download",
    provenance: {
      url: null,
      signed_url: redacted,
      api_url: signed.api_url,
      field: signed.field,
      suggested_filename: suggested,
      page_url: safePageUrl(page),
      verb_index: ctx?.index ?? null,
      verb_name: "download",
      capture: "signed_url",
      content_type: contentType,
      content_disposition: contentDisposition,
      ts: Date.now(),
    },
  });
  return `→ ${path.basename(target)}`;
}
515
+
516
/**
 * Poll the page for a capture stashed by the in-page blob hook.
 *
 * Each iteration evaluates the read-and-clear poll script, so a record
 * returned by the page is gone from the page afterwards and must be handled
 * on the spot. Never throws: `page.evaluate` errors count as "nothing yet".
 *
 * @param {object} page - Object exposing `evaluate(script)`.
 * @param {number} timeoutMs - Polling window in milliseconds.
 * @param {{done: boolean}} [stop] - Shared cancellation token; once `done` is
 *   true the loop exits at the start of its next iteration.
 * @returns {Promise<object>} `{ kind: "blob", blob }` on capture,
 *   `{ kind: "blob_failed", error }` when the hook reported an error, or
 *   `{ kind: "blob_timeout" }` on deadline or cancellation.
 */
async function pollForBlob(page, timeoutMs, stop) {
  const deadline = Date.now() + timeoutMs;
  while (true) {
    if (stop?.done) return { kind: "blob_timeout" };
    let result;
    try {
      result = await page.evaluate(POLL_SCRIPT);
    } catch {
      // Page may be navigating or closing — treat as "no capture yet".
      result = null;
    }
    // Test the type, not truthiness: a zero-byte file base64-encodes to "",
    // which is falsy. Because the poll script clears the slot on read, a
    // truthiness check would silently drop that capture and end in a timeout.
    if (result && typeof result.bytes === "string") {
      return { kind: "blob", blob: result };
    }
    if (result && result.error) return { kind: "blob_failed", error: result.error };
    if (Date.now() >= deadline) return { kind: "blob_timeout" };
    await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS));
  }
}
532
+
533
// Repeatedly read the page-side signed-URL candidate list and hand it to
// `pickSignedCandidate` until that returns a match, the deadline passes, or
// the shared `stop` token is flipped. Only the candidate is returned — its
// bytes are fetched later, on the Node side, by saveSignedUrlDownload, so CORS
// doesn't apply. Never throws: a failing page evaluate just means "nothing
// yet, keep polling".
async function pollForSignedUrl(page, timeoutMs, signedCfg, stop) {
  const expiresAt = Date.now() + timeoutMs;

  // One read of the candidate buffer; any evaluate error yields null.
  const readCandidates = async () => {
    try {
      return await page.evaluate(SIGNED_POLL_SCRIPT);
    } catch {
      return null;
    }
  };
  const pause = () =>
    new Promise((wake) => setTimeout(wake, POLL_INTERVAL_MS));

  while (!stop?.done) {
    const candidates = await readCandidates();
    const hasCandidates = Array.isArray(candidates) && candidates.length > 0;
    const match = hasCandidates
      ? pickSignedCandidate(candidates, signedCfg)
      : null;
    if (match) {
      return { kind: "signed_url", signed: match };
    }
    if (Date.now() >= expiresAt) {
      break;
    }
    await pause();
  }
  return { kind: "signed_timeout" };
}
556
+
557
// Look up response header `name` via `res.headers.get`, yielding null when the
// header is absent or empty, when `headers`/`get` is missing, or when the
// lookup throws.
function safeHeader(res, name) {
  let value;
  try {
    value = res.headers?.get?.(name);
  } catch {
    return null;
  }
  return value || null;
}
564
+
565
// Return `page.url()`, or null if the call throws.
function safePageUrl(page) {
  let current;
  try {
    current = page.url();
  } catch {
    current = null;
  }
  return current;
}
572
+
573
// Return `page.context()`, or null if the call throws.
function safeContext(page) {
  let owner;
  try {
    owner = page.context();
  } catch {
    owner = null;
  }
  return owner;
}
580
+
581
// Return `download.suggestedFilename()` unchanged when it is a non-blank
// value; otherwise (empty, whitespace-only, or the call/trim throws) return
// FALLBACK_NAME.
function safeSuggestedFilename(download) {
  try {
    const proposed = download.suggestedFilename();
    const isBlank = !proposed || !proposed.trim();
    if (isBlank) {
      return FALLBACK_NAME;
    }
    return proposed;
  } catch {
    return FALLBACK_NAME;
  }
}
589
+
590
// Return `download.url()`, or null if the call throws.
function safeUrl(download) {
  let source;
  try {
    source = download.url();
  } catch {
    source = null;
  }
  return source;
}
package/verbs/index.js CHANGED
@@ -22,6 +22,7 @@ import saveVerb from "./save.js";
22
22
  import pauseForHumanVerb from "./pause_for_human.js";
23
23
  import waitForDropVerb from "./wait_for_drop.js";
24
24
  import announceArtifactVerb from "./announce_artifact.js";
25
+ import downloadVerb from "./download.js";
25
26
 
26
27
  const VERBS = [
27
28
  gotoVerb,
@@ -38,6 +39,7 @@ const VERBS = [
38
39
  pauseForHumanVerb,
39
40
  waitForDropVerb,
40
41
  announceArtifactVerb,
42
+ downloadVerb,
41
43
  ];
42
44
 
43
45
  export const VERB_REGISTRY = Object.fromEntries(VERBS.map((v) => [v.name, v]));