genkitx-playwright 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/tools.ts ADDED
@@ -0,0 +1,759 @@
1
+ import { MessageData, Part, z } from 'genkit';
2
+ import { tool } from 'genkit/beta';
3
+ import type { Locator, Page } from 'playwright';
4
+ import { BrowserSession } from './browser.js';
5
+
6
+
7
// Tool names grouped by category. The groups are concatenated, in this order,
// into the public ALL_TOOL_NAMES tuple below.
const NAVIGATION_TOOL_NAMES = [
  'browser_navigate',
  'browser_navigate_back',
  'browser_navigate_forward',
] as const;

const OBSERVATION_TOOL_NAMES = [
  'browser_snapshot',
  'browser_screenshot',
  'browser_get_console_logs',
  'browser_get_network_requests',
] as const;

const INTERACTION_TOOL_NAMES = [
  'browser_click',
  'browser_type',
  'browser_fill',
  'browser_press_key',
  'browser_hover',
  'browser_select_option',
  'browser_scroll',
  'browser_drag',
  'browser_file_upload',
] as const;

const SYNCHRONIZATION_TOOL_NAMES = ['browser_wait_for'] as const;

const TAB_TOOL_NAMES = [
  'browser_tab_list',
  'browser_tab_new',
  'browser_tab_select',
  'browser_tab_close',
] as const;

const DIALOG_TOOL_NAMES = ['browser_handle_dialog'] as const;

// The JavaScript "escape hatch".
const ESCAPE_HATCH_TOOL_NAMES = ['browser_evaluate'] as const;

/**
 * Every (unprefixed) tool name this middleware knows how to inject, ordered
 * by category: navigation, observation, interaction, synchronization, tabs,
 * dialogs, and finally the evaluate escape hatch.
 */
export const ALL_TOOL_NAMES = [
  ...NAVIGATION_TOOL_NAMES,
  ...OBSERVATION_TOOL_NAMES,
  ...INTERACTION_TOOL_NAMES,
  ...SYNCHRONIZATION_TOOL_NAMES,
  ...TAB_TOOL_NAMES,
  ...DIALOG_TOOL_NAMES,
  ...ESCAPE_HATCH_TOOL_NAMES,
] as const;

/** Union of the string literals listed in {@link ALL_TOOL_NAMES}. */
export type PlaywrightToolName = (typeof ALL_TOOL_NAMES)[number];
42
+
43
/**
 * Tools treated as page-mutating. `createPlaywrightTools` removes every name
 * in this list from the enabled set when `readOnly` is requested.
 *
 * NOTE(review): `browser_evaluate` is not listed here even though arbitrary
 * JavaScript can change page state; it is gated only by `allowEvaluate`.
 * Confirm that is intended for read-only sessions.
 */
const WRITE_TOOLS: Array<PlaywrightToolName> = [
  // pointer / keyboard input
  'browser_click',
  'browser_type',
  'browser_fill',
  'browser_press_key',
  'browser_hover',
  // form controls, scrolling and drag & drop
  'browser_select_option',
  'browser_scroll',
  'browser_drag',
  'browser_file_upload',
  // dialogs
  'browser_handle_dialog',
];
56
+
57
/** Options accepted by `createPlaywrightTools`. */
export interface ToolFactoryOptions {
  /** Browser session the tools operate on. */
  session: BrowserSession;

  /**
   * Outgoing queue of conversation messages. Media-producing tools (such as
   * screenshots) append `user` messages to it, and the middleware drains it
   * into the request on the next turn. That way the media reaches the model
   * as real multimodal parts rather than as a JSON blob inside a tool
   * response.
   */
  messageQueue?: MessageData[];

  /** String prepended to the name of every created tool. */
  toolNamePrefix?: string;

  /** When true, tools that mutate page state are not created. */
  readOnly?: boolean;

  /** When true, the `browser_evaluate` JS escape-hatch tool is created. */
  allowEvaluate?: boolean;

  /** Optional allow-list of tool names (without prefix) to create. */
  tools?: PlaywrightToolName[];

  /** Hostnames navigation is restricted to (suffix match). */
  allowedDomains?: string[];

  /** Hostnames navigation is refused for (suffix match). */
  blockedDomains?: string[];
}
81
+
82
/**
 * Decides whether navigating to `url` is permitted by the domain policy.
 *
 * Matching is by hostname suffix: a policy entry `example.com` matches
 * `example.com` itself and any subdomain such as `docs.example.com`, but not
 * `notexample.com`. Both the URL hostname and the policy entries are
 * normalized (lower-cased, trailing dots removed) before comparison, so the
 * fully-qualified spelling `https://example.com./` cannot be used to slip
 * past a block-list entry and mixed-case entries still match.
 *
 * @param url - The navigation target.
 * @param allowed - If non-empty, only hostnames matching one of these entries
 *   are permitted.
 * @param blocked - Hostnames matching one of these entries are always
 *   refused; the block-list takes precedence over the allow-list.
 * @returns `true` when navigation is permitted. URLs that cannot be parsed as
 *   absolute URLs are permitted here so that Playwright reports the error.
 */
function hostnameAllowed(
  url: string,
  allowed?: string[],
  blocked?: string[]
): boolean {
  // "Example.COM." and "example.com" name the same host.
  const normalize = (value: string) =>
    value.trim().toLowerCase().replace(/\.+$/, '');

  let host: string;
  try {
    host = normalize(new URL(url).hostname);
  } catch {
    // Relative or invalid URLs are allowed; Playwright will surface errors.
    return true;
  }

  const matches = (entry: string) => {
    const domain = normalize(entry);
    return host === domain || host.endsWith(`.${domain}`);
  };

  if (blocked?.some(matches)) return false;
  if (allowed && allowed.length > 0) return allowed.some(matches);
  return true;
}
100
+
101
/**
 * Builds the Playwright locator for a snapshot ref such as "e12". The ref is
 * looked up through the `aria-ref` selector engine, which `browser_snapshot`
 * populates.
 */
function locatorForRef(page: Page, ref: string): Locator {
  const selector = 'aria-ref=' + ref;
  return page.locator(selector);
}
108
+
109
/**
 * Returns an accessibility snapshot of the page for the model to read.
 *
 * The preferred form is the "ai" mode of `ariaSnapshot`, in which elements
 * carry `[ref=eN]` ids that interaction tools resolve through the `aria-ref=`
 * selector engine (see `locatorForRef`). This is the same mechanism the
 * official Playwright MCP server relies on.
 *
 * If that call fails for any reason, a plain ARIA snapshot of `<body>` is
 * returned instead. It carries no refs, so interaction tools cannot resolve
 * elements from it, but the page structure remains readable.
 */
async function snapshotForAI(page: Page): Promise<string> {
  type AiSnapshot = (opts: { mode: 'ai' }) => Promise<string>;

  let annotated: string;
  try {
    // Invoked as a method on `page` so that `this` stays bound.
    annotated = await (page.ariaSnapshot as AiSnapshot)({ mode: 'ai' });
  } catch {
    const body = page.locator('body');
    return body.ariaSnapshot();
  }
  return annotated;
}
128
+
129
/**
 * Creates the set of Playwright browser tools for a single generation.
 *
 * The enabled set starts from `opts.tools` (or every name in
 * `ALL_TOOL_NAMES`), then drops every `WRITE_TOOLS` entry when
 * `opts.readOnly` is set and drops `browser_evaluate` unless
 * `opts.allowEvaluate` is set. Each remaining tool is registered under
 * `opts.toolNamePrefix` + its base name.
 *
 * @param opts - Session, naming, filtering and navigation-policy options.
 * @returns The created tool actions, in declaration order.
 */
export function createPlaywrightTools(opts: ToolFactoryOptions) {
  const { session } = opts;
  const prefix = opts.toolNamePrefix ?? '';
  const name = (n: PlaywrightToolName) => `${prefix}${n}`;

  // Determine which tools are enabled.
  const enabled = new Set<PlaywrightToolName>(opts.tools ?? ALL_TOOL_NAMES);
  if (opts.readOnly) {
    for (const t of WRITE_TOOLS) enabled.delete(t);
  }
  if (!opts.allowEvaluate) {
    enabled.delete('browser_evaluate');
  }

  const refSchema = z
    .string()
    .describe('Element ref from the latest browser_snapshot, e.g. "e12".');

  /**
   * Enqueues parts as a `user` message for the middleware to inject into the
   * conversation history. Appends to the trailing queued `user` message when
   * possible so consecutive media parts coalesce into one message.
   *
   * @returns `true` if the parts were queued, `false` when no message queue
   *   was configured and the parts were dropped.
   */
  const enqueueParts = (parts: Part[], toolName: string): boolean => {
    const queue = opts.messageQueue;
    if (!queue) return false;
    const last = queue[queue.length - 1];
    if (last && last.role === 'user') {
      last.content.push(...parts);
    } else {
      queue.push({
        role: 'user',
        content: parts,
        metadata: { playwrightMiddlewareTool: toolName },
      });
    }
    return true;
  };

  /** Throws if the navigation policy in `opts` forbids `url`. */
  const assertNavigationAllowed = (url: string) => {
    if (!hostnameAllowed(url, opts.allowedDomains, opts.blockedDomains)) {
      throw new Error(`Navigation to "${url}" is not allowed by policy.`);
    }
  };

  const tools: any[] = [];

  // Factories are lazy so that disabled tools are never constructed.
  const add = (toolName: PlaywrightToolName, factory: () => any) => {
    if (enabled.has(toolName)) tools.push(factory());
  };

  // ---- Navigation ----------------------------------------------------------

  add('browser_navigate', () =>
    tool(
      {
        name: name('browser_navigate'),
        description: 'Navigate the current page to a URL.',
        inputSchema: z.object({
          url: z.string().describe('Absolute URL to navigate to.'),
        }),
        outputSchema: z.string(),
      },
      async ({ url }) => {
        assertNavigationAllowed(url);
        const page = await session.getPage();
        await page.goto(url, { waitUntil: 'domcontentloaded' });
        return `Navigated to ${page.url()} (title: "${await page.title()}").`;
      }
    )
  );

  add('browser_navigate_back', () =>
    tool(
      {
        name: name('browser_navigate_back'),
        description: 'Go back to the previous page in history.',
        inputSchema: z.object({}),
        outputSchema: z.string(),
      },
      async () => {
        const page = await session.getPage();
        await page.goBack({ waitUntil: 'domcontentloaded' });
        return `Went back to ${page.url()}.`;
      }
    )
  );

  add('browser_navigate_forward', () =>
    tool(
      {
        name: name('browser_navigate_forward'),
        description: 'Go forward to the next page in history.',
        inputSchema: z.object({}),
        outputSchema: z.string(),
      },
      async () => {
        const page = await session.getPage();
        await page.goForward({ waitUntil: 'domcontentloaded' });
        return `Went forward to ${page.url()}.`;
      }
    )
  );

  // ---- Observation ---------------------------------------------------------

  add('browser_snapshot', () =>
    tool(
      {
        name: name('browser_snapshot'),
        description:
          'Capture an accessibility snapshot of the current page. Each ' +
          'interactive element is tagged with a [ref=eN] id you can pass to ' +
          'interaction tools (click, type, etc.). Prefer this over screenshots ' +
          'for deciding what to interact with.',
        inputSchema: z.object({}),
        outputSchema: z.object({
          url: z.string(),
          title: z.string(),
          snapshot: z.string(),
        }),
      },
      async () => {
        const page = await session.getPage();
        return {
          url: page.url(),
          title: await page.title(),
          snapshot: await snapshotForAI(page),
        };
      }
    )
  );

  add('browser_screenshot', () =>
    tool(
      {
        name: name('browser_screenshot'),
        description:
          'Take a screenshot of the current page. The image is added to the ' +
          'conversation so you can view it on your next turn.',
        inputSchema: z.object({
          fullPage: z
            .boolean()
            .optional()
            .describe('Capture the full scrollable page. Defaults to false.'),
        }),
        outputSchema: z.string(),
      },
      async ({ fullPage }) => {
        const page = await session.getPage();
        const buffer = await page.screenshot({
          fullPage: fullPage ?? false,
          type: 'png',
        });
        const dataUri = `data:image/png;base64,${buffer.toString('base64')}`;

        // Enqueue the image as a real multimodal `user` message part so the
        // model actually "sees" it, instead of receiving a JSON blob in the
        // tool response. The middleware drains this queue into the request.
        const attached = enqueueParts(
          [
            { text: `\n\nbrowser_screenshot result image/png (${page.url()})` },
            { media: { url: dataUri, contentType: 'image/png' } },
          ],
          name('browser_screenshot')
        );

        // Do not promise an image that was dropped because no queue exists.
        return attached
          ? 'Screenshot captured. The image is shown below.'
          : 'Screenshot captured, but it could not be attached to the ' +
              'conversation because no message queue is configured.';
      }
    )
  );

  add('browser_get_console_logs', () =>
    tool(
      {
        name: name('browser_get_console_logs'),
        description: 'Return console messages collected from the page.',
        inputSchema: z.object({}),
        outputSchema: z.array(
          z.object({ type: z.string(), text: z.string() })
        ),
      },
      async () => session.consoleLogs
    )
  );

  add('browser_get_network_requests', () =>
    tool(
      {
        name: name('browser_get_network_requests'),
        description:
          'List network requests made by the current page (URL, method, ' +
          'and response status when available).',
        inputSchema: z.object({}),
        outputSchema: z.array(
          z.object({
            url: z.string(),
            method: z.string(),
            status: z.number().optional(),
          })
        ),
      },
      async () => {
        const page = await session.getPage();
        // Best-effort: use the page's performance entries for a snapshot of
        // resources. Live request interception is intentionally avoided to keep
        // the session lightweight.
        const entries = await page.evaluate(() => {
          const list = performance.getEntriesByType(
            'resource'
          ) as PerformanceResourceTiming[];
          return list.map((e) => {
            // `responseStatus` is only exposed by newer browsers, and is 0
            // when the status is not available to the page.
            const reported = (
              e as PerformanceResourceTiming & { responseStatus?: number }
            ).responseStatus;
            return {
              url: e.name,
              // Resource Timing does not expose the HTTP method; 'GET' is a
              // placeholder, not an observed value.
              method: 'GET',
              status:
                typeof reported === 'number' && reported > 0
                  ? reported
                  : undefined,
            };
          });
        });
        return entries;
      }
    )
  );

  // ---- Interaction ---------------------------------------------------------

  add('browser_click', () =>
    tool(
      {
        name: name('browser_click'),
        description: 'Click an element identified by its snapshot ref.',
        inputSchema: z.object({
          ref: refSchema,
          doubleClick: z
            .boolean()
            .optional()
            .describe('Perform a double click instead of a single click.'),
        }),
        outputSchema: z.string(),
      },
      async ({ ref, doubleClick }) => {
        const page = await session.getPage();
        const locator = locatorForRef(page, ref);
        if (doubleClick) {
          await locator.dblclick();
        } else {
          await locator.click();
        }
        return `Clicked element ${ref}.`;
      }
    )
  );

  add('browser_type', () =>
    tool(
      {
        name: name('browser_type'),
        description:
          'Type text into an element by ref, character by character ' +
          '(does not clear existing content).',
        inputSchema: z.object({
          ref: refSchema,
          text: z.string().describe('Text to type.'),
          submit: z
            .boolean()
            .optional()
            .describe('Press Enter after typing.'),
        }),
        outputSchema: z.string(),
      },
      async ({ ref, text, submit }) => {
        const page = await session.getPage();
        const locator = locatorForRef(page, ref);
        await locator.pressSequentially(text);
        if (submit) await locator.press('Enter');
        return `Typed into element ${ref}.`;
      }
    )
  );

  add('browser_fill', () =>
    tool(
      {
        name: name('browser_fill'),
        description:
          'Set the value of an input/textarea by ref, clearing it first.',
        inputSchema: z.object({
          ref: refSchema,
          value: z.string().describe('Value to set.'),
        }),
        outputSchema: z.string(),
      },
      async ({ ref, value }) => {
        const page = await session.getPage();
        await locatorForRef(page, ref).fill(value);
        return `Filled element ${ref}.`;
      }
    )
  );

  add('browser_press_key', () =>
    tool(
      {
        name: name('browser_press_key'),
        description:
          'Press a keyboard key (e.g. "Enter", "Tab", "ArrowDown", "a"). ' +
          'Optionally targets an element by ref.',
        inputSchema: z.object({
          key: z.string().describe('Key to press.'),
          ref: refSchema.optional(),
        }),
        outputSchema: z.string(),
      },
      async ({ key, ref }) => {
        const page = await session.getPage();
        if (ref) {
          await locatorForRef(page, ref).press(key);
        } else {
          await page.keyboard.press(key);
        }
        return `Pressed key "${key}".`;
      }
    )
  );

  add('browser_hover', () =>
    tool(
      {
        name: name('browser_hover'),
        description: 'Hover the mouse over an element by ref.',
        inputSchema: z.object({ ref: refSchema }),
        outputSchema: z.string(),
      },
      async ({ ref }) => {
        const page = await session.getPage();
        await locatorForRef(page, ref).hover();
        return `Hovered element ${ref}.`;
      }
    )
  );

  add('browser_select_option', () =>
    tool(
      {
        name: name('browser_select_option'),
        description: 'Select option(s) in a <select> element by ref.',
        inputSchema: z.object({
          ref: refSchema,
          values: z
            .array(z.string())
            .describe('Option values or labels to select.'),
        }),
        outputSchema: z.string(),
      },
      async ({ ref, values }) => {
        const page = await session.getPage();
        await locatorForRef(page, ref).selectOption(values);
        return `Selected ${values.length} option(s) in ${ref}.`;
      }
    )
  );

  add('browser_scroll', () =>
    tool(
      {
        name: name('browser_scroll'),
        description:
          'Scroll the page up or down, or scroll a specific element into view.',
        inputSchema: z.object({
          direction: z
            .enum(['up', 'down'])
            .optional()
            .describe('Direction to scroll the window. Defaults to "down".'),
          ref: refSchema
            .optional()
            .describe('If provided, scroll this element into view instead.'),
        }),
        outputSchema: z.string(),
      },
      async ({ direction, ref }) => {
        const page = await session.getPage();
        if (ref) {
          await locatorForRef(page, ref).scrollIntoViewIfNeeded();
          return `Scrolled element ${ref} into view.`;
        }
        const dir = direction ?? 'down';
        // Scroll by exactly one viewport height in the requested direction.
        await page.evaluate((d) => {
          window.scrollBy(0, d === 'down' ? window.innerHeight : -window.innerHeight);
        }, dir);
        return `Scrolled ${dir}.`;
      }
    )
  );

  add('browser_drag', () =>
    tool(
      {
        name: name('browser_drag'),
        description: 'Drag one element onto another, by their refs.',
        inputSchema: z.object({
          startRef: refSchema.describe('Ref of the element to drag.'),
          endRef: refSchema.describe('Ref of the drop target.'),
        }),
        outputSchema: z.string(),
      },
      async ({ startRef, endRef }) => {
        const page = await session.getPage();
        await locatorForRef(page, startRef).dragTo(locatorForRef(page, endRef));
        return `Dragged ${startRef} onto ${endRef}.`;
      }
    )
  );

  add('browser_file_upload', () =>
    tool(
      {
        name: name('browser_file_upload'),
        description:
          'Set files on a file input element by ref (provide local file paths).',
        inputSchema: z.object({
          ref: refSchema,
          paths: z.array(z.string()).describe('Local file paths to upload.'),
        }),
        outputSchema: z.string(),
      },
      async ({ ref, paths }) => {
        const page = await session.getPage();
        await locatorForRef(page, ref).setInputFiles(paths);
        return `Uploaded ${paths.length} file(s) to ${ref}.`;
      }
    )
  );

  // ---- Synchronization -----------------------------------------------------

  add('browser_wait_for', () =>
    tool(
      {
        name: name('browser_wait_for'),
        description:
          'Wait for text to appear/disappear on the page, or for a fixed time.',
        inputSchema: z.object({
          text: z.string().optional().describe('Text to wait to appear.'),
          textGone: z
            .string()
            .optional()
            .describe('Text to wait to disappear.'),
          time: z
            .number()
            .optional()
            .describe('Seconds to wait. Capped at 30.'),
        }),
        outputSchema: z.string(),
      },
      async ({ text, textGone, time }) => {
        const page = await session.getPage();
        // Precedence: `text`, then `textGone`, then the fixed delay.
        if (text) {
          await page.getByText(text).first().waitFor({ state: 'visible' });
          return `Text "${text}" appeared.`;
        }
        if (textGone) {
          await page.getByText(textGone).first().waitFor({ state: 'hidden' });
          return `Text "${textGone}" disappeared.`;
        }
        // Clamp to [0, 30] so a negative value cannot produce a negative
        // timeout or a nonsensical "Waited -5s." report.
        const seconds = Math.min(Math.max(time ?? 1, 0), 30);
        await page.waitForTimeout(seconds * 1000);
        return `Waited ${seconds}s.`;
      }
    )
  );

  // ---- Tabs ----------------------------------------------------------------

  add('browser_tab_list', () =>
    tool(
      {
        name: name('browser_tab_list'),
        description: 'List open tabs with their index, URL, and title.',
        inputSchema: z.object({}),
        outputSchema: z.array(
          z.object({
            index: z.number(),
            url: z.string(),
            title: z.string(),
          })
        ),
      },
      async () => {
        const pages = await session.getPages();
        return Promise.all(
          pages.map(async (p, index) => ({
            index,
            url: p.url(),
            title: await p.title(),
          }))
        );
      }
    )
  );

  add('browser_tab_new', () =>
    tool(
      {
        name: name('browser_tab_new'),
        description: 'Open a new tab, optionally navigating it to a URL.',
        inputSchema: z.object({
          url: z.string().optional().describe('URL to open in the new tab.'),
        }),
        outputSchema: z.string(),
      },
      async ({ url }) => {
        // Check the policy BEFORE opening the tab, so a rejected URL does not
        // leave an orphaned blank tab behind.
        if (url) assertNavigationAllowed(url);
        const page = await session.newPage();
        if (url) {
          await page.goto(url, { waitUntil: 'domcontentloaded' });
        }
        const pages = await session.getPages();
        return `Opened tab ${pages.length - 1}${url ? ` at ${url}` : ''}.`;
      }
    )
  );

  add('browser_tab_select', () =>
    tool(
      {
        name: name('browser_tab_select'),
        description: 'Bring a tab to the front by its index.',
        inputSchema: z.object({
          index: z.number().describe('Tab index from browser_tab_list.'),
        }),
        outputSchema: z.string(),
      },
      async ({ index }) => {
        const pages = await session.getPages();
        const page = pages[index];
        if (!page) throw new Error(`No tab at index ${index}.`);
        await page.bringToFront();
        return `Selected tab ${index}.`;
      }
    )
  );

  add('browser_tab_close', () =>
    tool(
      {
        name: name('browser_tab_close'),
        description:
          'Close a tab by index (defaults to the current/last tab).',
        inputSchema: z.object({
          index: z.number().optional().describe('Tab index to close.'),
        }),
        outputSchema: z.string(),
      },
      async ({ index }) => {
        const pages = await session.getPages();
        const page = index === undefined ? pages[pages.length - 1] : pages[index];
        if (!page) {
          throw new Error(
            index === undefined
              ? 'No open tab to close.'
              : `No tab at index ${index}.`
          );
        }
        await page.close();
        // `pages` was captured before closing, so its length still yields the
        // index of the tab that was just closed.
        return `Closed tab ${index ?? pages.length - 1}.`;
      }
    )
  );

  // ---- Dialogs -------------------------------------------------------------

  add('browser_handle_dialog', () =>
    tool(
      {
        name: name('browser_handle_dialog'),
        description:
          'Accept or dismiss a pending JavaScript dialog (alert/confirm/prompt).',
        inputSchema: z.object({
          accept: z.boolean().describe('Accept (true) or dismiss (false).'),
          promptText: z
            .string()
            .optional()
            .describe('Text to enter for prompt dialogs.'),
        }),
        outputSchema: z.string(),
      },
      async ({ accept, promptText }) => {
        const dialog = session.activeDialog;
        if (!dialog) return 'No pending dialog to handle.';
        if (accept) {
          await dialog.accept(promptText);
        } else {
          await dialog.dismiss();
        }
        // Clear both dialog fields on the session once the dialog is handled.
        session.activeDialog = undefined;
        session.pendingDialog = undefined;
        return `Dialog ${accept ? 'accepted' : 'dismissed'}.`;
      }
    )
  );

  // ---- Escape hatch --------------------------------------------------------

  add('browser_evaluate', () =>
    tool(
      {
        name: name('browser_evaluate'),
        description:
          'Evaluate arbitrary JavaScript in the page context and return the ' +
          'JSON-serializable result. Use only when other tools are insufficient.',
        inputSchema: z.object({
          script: z
            .string()
            .describe(
              'A JS expression or function body, e.g. "document.title".'
            ),
        }),
        outputSchema: z.any(),
      },
      async ({ script }) => {
        const page = await session.getPage();
        // SECURITY: this runs model-supplied code in the page. It is only
        // registered when `allowEvaluate` is set.
        const result = await page.evaluate((code) => {
          // Wraps `body` in an async function so `await` is usable. The
          // newline keeps a trailing `//` comment from swallowing the wrapper.
          const compile = (body: string) =>
            // eslint-disable-next-line no-new-func
            new Function(`return (async () => { ${body}\n })()`) as () => unknown;

          let run: () => unknown;
          try {
            // First try the script as a single expression, so that inputs
            // like `document.title;`, `"a;b".length` or an IIFE containing
            // `return` yield their value. Compilation does not execute it.
            run = compile(`return (${code.replace(/[;\s]+$/, '')}\n);`);
          } catch {
            // Not a valid expression: treat it as a function body, which
            // must `return` explicitly to produce a value.
            run = compile(code);
          }
          return run();
        }, script);
        // Normalize `undefined` to `null` for the tool response.
        return result ?? null;
      }
    )
  );

  return tools;
}