@stevederico/dotbot 0.19.0 → 0.20.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/tools/browser.js CHANGED
@@ -1,26 +1,19 @@
1
- // agent/browser.js
1
+ // tools/browser.js
2
2
  // Headless browser automation tools for the DotBot agent.
3
3
  // Provides 7 tools: navigate, read_page, click, type, screenshot, extract, close.
4
4
  // Uses a singleton Chromium instance with per-user browser contexts (isolated cookies/storage).
5
+ // Zero npm dependencies - uses Chrome DevTools Protocol directly.
5
6
 
6
- // Lazy-load playwright to avoid hard dependency at module evaluation time.
7
- // Consumers that don't use browser tools won't need playwright installed.
8
- let _chromium = null;
9
- async function getChromium() {
10
- if (!_chromium) {
11
- const pw = await import("playwright");
12
- _chromium = pw.chromium;
13
- }
14
- return _chromium;
15
- }
16
- import { writeFile, mkdir, readdir, unlink, stat } from "node:fs/promises";
7
+ import { CDPClient } from '../core/cdp.js';
8
+ import { launchBrowser, createBrowserContext, closeBrowserContext, killBrowser } from '../core/browser-launcher.js';
9
+ import { writeFile, mkdir, readdir, unlink, stat } from 'node:fs/promises';
17
10
 
18
11
  // ── Constants ──
19
12
 
20
13
  const MAX_CONTEXTS = 10;
21
14
  const IDLE_TIMEOUT_MS = 5 * 60 * 1000;
22
15
  const NAV_TIMEOUT_MS = 30_000;
23
- const SCREENSHOT_DIR = "/tmp/dotbot_screenshots";
16
+ const SCREENSHOT_DIR = '/tmp/dotbot_screenshots';
24
17
  const MAX_CONTENT_CHARS = 8000;
25
18
  const MAX_SCREENSHOTS_PER_USER = 20;
26
19
  const SCREENSHOT_TTL_MS = 60 * 60 * 1000; // 1 hour
@@ -38,30 +31,30 @@ const STALE_SCREENSHOT_MS = 24 * 60 * 60 * 1000; // 24 hours
38
31
  function validateUrl(url) {
39
32
  try {
40
33
  const parsed = new URL(url);
41
- if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
42
- return { valid: false, error: "Only http and https URLs are allowed" };
34
+ if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
35
+ return { valid: false, error: 'Only http and https URLs are allowed' };
43
36
  }
44
37
  const hostname = parsed.hostname;
45
38
  if (
46
- hostname === "localhost" ||
47
- hostname.startsWith("127.") ||
48
- hostname.startsWith("192.168.") ||
49
- hostname.startsWith("10.") ||
50
- hostname.startsWith("172.16.") ||
51
- hostname.startsWith("172.17.") ||
52
- hostname.startsWith("172.18.") ||
53
- hostname.startsWith("172.19.") ||
54
- hostname.startsWith("172.2") ||
55
- hostname.startsWith("172.30.") ||
56
- hostname.startsWith("172.31.") ||
57
- hostname === "0.0.0.0" ||
58
- hostname === "[::1]"
39
+ hostname === 'localhost' ||
40
+ hostname.startsWith('127.') ||
41
+ hostname.startsWith('192.168.') ||
42
+ hostname.startsWith('10.') ||
43
+ hostname.startsWith('172.16.') ||
44
+ hostname.startsWith('172.17.') ||
45
+ hostname.startsWith('172.18.') ||
46
+ hostname.startsWith('172.19.') ||
47
+ hostname.startsWith('172.2') ||
48
+ hostname.startsWith('172.30.') ||
49
+ hostname.startsWith('172.31.') ||
50
+ hostname === '0.0.0.0' ||
51
+ hostname === '[::1]'
59
52
  ) {
60
- return { valid: false, error: "Private/local URLs are not allowed" };
53
+ return { valid: false, error: 'Private/local URLs are not allowed' };
61
54
  }
62
55
  return { valid: true };
63
56
  } catch {
64
- return { valid: false, error: "Invalid URL" };
57
+ return { valid: false, error: 'Invalid URL' };
65
58
  }
66
59
  }
67
60
 
@@ -73,39 +66,38 @@ function validateUrl(url) {
73
66
  */
74
67
  class BrowserSessionManager {
75
68
  constructor() {
76
- /** @type {import('playwright').Browser|null} */
69
+ /** @type {{ process: ChildProcess, port: number, wsUrl: string }|null} */
77
70
  this.browser = null;
78
- /** @type {Map<string, { context: import('playwright').BrowserContext, page: import('playwright').Page, lastUsed: number, idleTimer: NodeJS.Timeout }>} */
71
+ /** @type {Map<string, { cdp: CDPClient, targetWsUrl: string, lastUsed: number, idleTimer: NodeJS.Timeout }>} */
79
72
  this.contexts = new Map();
80
73
  }
81
74
 
82
75
  /**
83
76
  * Launch the shared Chromium instance if not already running.
84
- * @returns {Promise<import('playwright').Browser>}
77
+ * @returns {Promise<{ process: ChildProcess, port: number, wsUrl: string }>}
85
78
  */
86
79
  async ensureBrowser() {
87
- if (!this.browser || !this.browser.isConnected()) {
88
- const chromium = await getChromium();
89
- this.browser = await chromium.launch({ headless: true });
90
- console.log("[browser] Chromium launched");
80
+ if (!this.browser || this.browser.process.killed) {
81
+ this.browser = await launchBrowser();
82
+ console.log('[browser] Chromium launched');
91
83
  }
92
84
  return this.browser;
93
85
  }
94
86
 
95
87
  /**
96
- * Get or create a browser context + page for a user.
88
+ * Get or create a browser context + CDP client for a user.
97
89
  * Resets idle timer on each access. Evicts LRU context if at capacity.
98
90
  *
99
91
  * @param {string} userID - User identifier for context isolation
100
- * @returns {Promise<import('playwright').Page>} The user's page
92
+ * @returns {Promise<CDPClient>} The user's CDP client
101
93
  */
102
- async getPage(userID) {
94
+ async getClient(userID) {
103
95
  const existing = this.contexts.get(userID);
104
96
  if (existing) {
105
97
  existing.lastUsed = Date.now();
106
98
  clearTimeout(existing.idleTimer);
107
99
  existing.idleTimer = setTimeout(() => this.closeContext(userID), IDLE_TIMEOUT_MS);
108
- return existing.page;
100
+ return existing.cdp;
109
101
  }
110
102
 
111
103
  // Evict LRU if at capacity
@@ -122,15 +114,24 @@ class BrowserSessionManager {
122
114
  }
123
115
 
124
116
  const browser = await this.ensureBrowser();
125
- const context = await browser.newContext({
126
- userAgent: "DotBot/1.0 (Headless Browser)",
117
+
118
+ // Create new tab/context
119
+ const targetWsUrl = await createBrowserContext(browser.wsUrl);
120
+
121
+ // Connect CDP client to the new target
122
+ const cdp = new CDPClient(targetWsUrl);
123
+ await cdp.connect();
124
+
125
+ // Set viewport and user agent
126
+ await cdp.setViewport(1280, 720);
127
+ await cdp.send('Network.setUserAgentOverride', {
128
+ userAgent: 'DotBot/1.0 (Headless Browser)'
127
129
  });
128
- const page = await context.newPage();
129
- page.setDefaultNavigationTimeout(NAV_TIMEOUT_MS);
130
130
 
131
131
  const idleTimer = setTimeout(() => this.closeContext(userID), IDLE_TIMEOUT_MS);
132
- this.contexts.set(userID, { context, page, lastUsed: Date.now(), idleTimer });
133
- return page;
132
+ this.contexts.set(userID, { cdp, targetWsUrl, lastUsed: Date.now(), idleTimer });
133
+
134
+ return cdp;
134
135
  }
135
136
 
136
137
  /**
@@ -140,13 +141,17 @@ class BrowserSessionManager {
140
141
  async closeContext(userID) {
141
142
  const entry = this.contexts.get(userID);
142
143
  if (!entry) return;
144
+
143
145
  clearTimeout(entry.idleTimer);
144
146
  this.contexts.delete(userID);
147
+
145
148
  try {
146
- await entry.context.close();
149
+ entry.cdp.close();
150
+ await closeBrowserContext(entry.targetWsUrl);
147
151
  } catch {
148
152
  // Context may already be closed
149
153
  }
154
+
150
155
  console.log(`[browser] context closed for user ${userID}`);
151
156
  }
152
157
 
@@ -159,12 +164,12 @@ class BrowserSessionManager {
159
164
  }
160
165
  if (this.browser) {
161
166
  try {
162
- await this.browser.close();
167
+ killBrowser(this.browser.process);
163
168
  } catch {
164
169
  // Browser may already be closed
165
170
  }
166
171
  this.browser = null;
167
- console.log("[browser] Chromium closed");
172
+ console.log('[browser] Chromium closed');
168
173
  }
169
174
  }
170
175
  }
@@ -183,7 +188,7 @@ export const sessionManager = new BrowserSessionManager();
183
188
  async function pruneScreenshots(userID) {
184
189
  try {
185
190
  const files = await readdir(SCREENSHOT_DIR);
186
- const userFiles = files.filter(f => f.startsWith(`${userID}_`) && f.endsWith(".png"));
191
+ const userFiles = files.filter(f => f.startsWith(`${userID}_`) && f.endsWith('.png'));
187
192
  if (userFiles.length === 0) return;
188
193
 
189
194
  const now = Date.now();
@@ -231,7 +236,7 @@ export async function cleanupStaleScreenshots() {
231
236
  let deleted = 0;
232
237
 
233
238
  for (const name of files) {
234
- if (!name.endsWith(".png")) continue;
239
+ if (!name.endsWith('.png')) continue;
235
240
  const path = `${SCREENSHOT_DIR}/${name}`;
236
241
  const s = await stat(path).catch(() => null);
237
242
  if (s && now - s.mtimeMs > STALE_SCREENSHOT_MS) {
@@ -248,6 +253,48 @@ export async function cleanupStaleScreenshots() {
248
253
  }
249
254
  }
250
255
 
256
+ // ── Helpers ──
257
+
258
+ /**
259
+ * Build a structured summary of interactive elements on the page via DOM evaluation.
260
+ *
261
+ * @param {CDPClient} cdp - CDP client instance
262
+ * @returns {Promise<string>} Formatted element tree
263
+ */
264
+ async function getPageStructure(cdp) {
265
+ return await cdp.evaluate(`
266
+ (() => {
267
+ const INTERACTIVE = 'a,button,input,select,textarea,[role=button],[role=link],[role=tab],[role=menuitem]';
268
+ const lines = [];
269
+ const els = document.querySelectorAll(INTERACTIVE);
270
+ for (const el of els) {
271
+ if (el.offsetParent === null && el.tagName !== 'INPUT') continue;
272
+ const tag = el.tagName.toLowerCase();
273
+ const role = el.getAttribute('role') || tag;
274
+ const name =
275
+ el.getAttribute('aria-label') ||
276
+ el.innerText?.slice(0, 60).replace(/\\n/g, ' ').trim() ||
277
+ el.getAttribute('placeholder') ||
278
+ el.getAttribute('name') ||
279
+ '';
280
+ const type = el.getAttribute('type') || '';
281
+ const href = el.getAttribute('href') || '';
282
+ let line = '[' + role + ']';
283
+ if (name) line += ' "' + name + '"';
284
+ if (type) line += ' type=' + type;
285
+ if (href) line += ' href="' + href.slice(0, 80) + '"';
286
+ lines.push(line);
287
+ }
288
+ const headings = document.querySelectorAll('h1,h2,h3');
289
+ for (const h of headings) {
290
+ const text = h.innerText?.trim();
291
+ if (text) lines.push('[' + h.tagName.toLowerCase() + '] "' + text.slice(0, 80) + '"');
292
+ }
293
+ return lines.length > 0 ? lines.join('\\n') : 'No interactive elements found.';
294
+ })()
295
+ `);
296
+ }
297
+
251
298
  // ── Tool Definitions ──
252
299
 
253
300
  /**
@@ -258,377 +305,425 @@ export async function cleanupStaleScreenshots() {
258
305
  */
259
306
  export function createBrowserTools(screenshotUrlPattern = (filename) => `/api/agent/screenshots/${filename}`) {
260
307
  return [
261
- {
262
- name: "browser_navigate",
263
- description:
264
- "Navigate a headless browser to a URL and return the page title and text content. PREFERRED tool for reading web pages — renders JavaScript so it works on dynamic sites (live scores, SPAs, dashboards). Use this instead of calling web_search multiple times.",
265
- parameters: {
266
- type: "object",
267
- properties: {
268
- url: {
269
- type: "string",
270
- description: "The URL to navigate to (must be http or https)",
308
+ {
309
+ name: 'browser_navigate',
310
+ description:
311
+ 'Navigate a headless browser to a URL and return the page title and text content. PREFERRED tool for reading web pages — renders JavaScript so it works on dynamic sites (live scores, SPAs, dashboards). Use this instead of calling web_search multiple times.',
312
+ parameters: {
313
+ type: 'object',
314
+ properties: {
315
+ url: {
316
+ type: 'string',
317
+ description: 'The URL to navigate to (must be http or https)'
318
+ }
271
319
  },
320
+ required: ['url']
272
321
  },
273
- required: ["url"],
274
- },
275
- execute: async (input, signal, context) => {
276
- const check = validateUrl(input.url);
277
- if (!check.valid) return `Error: ${check.error}`;
322
+ execute: async (input, signal, context) => {
323
+ const check = validateUrl(input.url);
324
+ if (!check.valid) return `Error: ${check.error}`;
278
325
 
279
- try {
280
- const page = await sessionManager.getPage(context.userID);
281
- await page.goto(input.url, { waitUntil: "domcontentloaded" });
282
- const title = await page.title();
283
- let text = await page.innerText("body").catch(() => "");
284
- if (text.length > MAX_CONTENT_CHARS) {
285
- text = text.slice(0, MAX_CONTENT_CHARS) + `\n\n... [truncated, ${text.length} chars total]`;
326
+ try {
327
+ const cdp = await sessionManager.getClient(context.userID);
328
+ await cdp.navigate(input.url);
329
+
330
+ // Wait for load event then network idle for SPAs
331
+ await cdp.waitForLoad(NAV_TIMEOUT_MS).catch(() => {});
332
+ await cdp.waitForNetworkIdle({ timeout: 5000, idleTime: 300 });
333
+
334
+ const title = await cdp.getTitle();
335
+ const currentUrl = await cdp.getUrl();
336
+ let text = await cdp.getBodyText();
337
+
338
+ if (text.length > MAX_CONTENT_CHARS) {
339
+ text = text.slice(0, MAX_CONTENT_CHARS) + `\n\n... [truncated, ${text.length} chars total]`;
340
+ }
341
+
342
+ return JSON.stringify({
343
+ action: 'browser_update',
344
+ url: currentUrl,
345
+ title,
346
+ content: text
347
+ });
348
+ } catch (err) {
349
+ return `Error navigating to ${input.url}: ${err.message}`;
286
350
  }
287
- return JSON.stringify({
288
- action: "browser_update",
289
- url: page.url(),
290
- title,
291
- content: text,
292
- });
293
- } catch (err) {
294
- return `Error navigating to ${input.url}: ${err.message}`;
295
351
  }
296
352
  },
297
- },
298
-
299
- {
300
- name: "browser_read_page",
301
- description:
302
- "Read the current page content or a specific section. Use 'text' mode for readable text, 'accessibility' mode for a structured element tree (useful before clicking or typing).",
303
- parameters: {
304
- type: "object",
305
- properties: {
306
- mode: {
307
- type: "string",
308
- description: "'text' for page text content, 'accessibility' for element tree. Default: 'text'",
309
- },
310
- selector: {
311
- type: "string",
312
- description: "Optional CSS selector to scope reading to a specific element",
313
- },
314
- },
315
- },
316
- execute: async (input, signal, context) => {
317
- try {
318
- const page = await sessionManager.getPage(context.userID);
319
- const currentUrl = page.url();
320
- if (currentUrl === "about:blank") return "No page loaded. Use browser_navigate first.";
321
-
322
- if (input.mode === "accessibility") {
323
- const tree = await getPageStructure(page);
324
- if (!tree) return "No page structure available.";
325
- if (tree.length > MAX_CONTENT_CHARS) {
326
- return `Page: ${currentUrl}\n\n${tree.slice(0, MAX_CONTENT_CHARS)}\n... [truncated]`;
353
+
354
+ {
355
+ name: 'browser_read_page',
356
+ description:
357
+ "Read the current page content or a specific section. Use 'text' mode for readable text, 'accessibility' mode for a structured element tree (useful before clicking or typing).",
358
+ parameters: {
359
+ type: 'object',
360
+ properties: {
361
+ mode: {
362
+ type: 'string',
363
+ description: "'text' for page text content, 'accessibility' for element tree. Default: 'text'"
364
+ },
365
+ selector: {
366
+ type: 'string',
367
+ description: 'Optional CSS selector to scope reading to a specific element'
327
368
  }
328
- return `Page: ${currentUrl}\n\n${tree}`;
329
369
  }
370
+ },
371
+ execute: async (input, signal, context) => {
372
+ try {
373
+ const cdp = await sessionManager.getClient(context.userID);
374
+ const currentUrl = await cdp.getUrl();
375
+
376
+ if (currentUrl === 'about:blank') {
377
+ return 'No page loaded. Use browser_navigate first.';
378
+ }
379
+
380
+ if (input.mode === 'accessibility') {
381
+ const tree = await getPageStructure(cdp);
382
+ if (!tree) return 'No page structure available.';
383
+ if (tree.length > MAX_CONTENT_CHARS) {
384
+ return `Page: ${currentUrl}\n\n${tree.slice(0, MAX_CONTENT_CHARS)}\n... [truncated]`;
385
+ }
386
+ return `Page: ${currentUrl}\n\n${tree}`;
387
+ }
388
+
389
+ // Default: text mode
390
+ let text;
391
+ if (input.selector) {
392
+ text = await cdp.getText(input.selector);
393
+ } else {
394
+ text = await cdp.getBodyText();
395
+ }
396
+
397
+ if (!text) {
398
+ return `No text content found${input.selector ? ` for selector "${input.selector}"` : ''}.`;
399
+ }
400
+
401
+ if (text.length > MAX_CONTENT_CHARS) {
402
+ text = text.slice(0, MAX_CONTENT_CHARS) + `\n\n... [truncated, ${text.length} chars total]`;
403
+ }
330
404
 
331
- // Default: text mode
332
- const target = input.selector ? page.locator(input.selector).first() : page.locator("body");
333
- let text = await target.innerText().catch(() => "");
334
- if (!text) return `No text content found${input.selector ? ` for selector "${input.selector}"` : ""}.`;
335
- if (text.length > MAX_CONTENT_CHARS) {
336
- text = text.slice(0, MAX_CONTENT_CHARS) + `\n\n... [truncated, ${text.length} chars total]`;
405
+ return `Page: ${currentUrl}\n\n${text}`;
406
+ } catch (err) {
407
+ return `Error reading page: ${err.message}`;
337
408
  }
338
- return `Page: ${currentUrl}\n\n${text}`;
339
- } catch (err) {
340
- return `Error reading page: ${err.message}`;
341
409
  }
342
410
  },
343
- },
344
-
345
- {
346
- name: "browser_click",
347
- description:
348
- "Click an element on the current page by CSS selector or visible text. Use browser_read_page with 'accessibility' mode first to find the right selector or text.",
349
- parameters: {
350
- type: "object",
351
- properties: {
352
- selector: {
353
- type: "string",
354
- description: "CSS selector of the element to click (e.g. 'button.submit', '#login-btn')",
355
- },
356
- text: {
357
- type: "string",
358
- description: "Visible text of the element to click (e.g. 'Sign In', 'Next'). Used if selector is not provided.",
359
- },
411
+
412
+ {
413
+ name: 'browser_click',
414
+ description:
415
+ "Click an element on the current page by CSS selector or visible text. Use browser_read_page with 'accessibility' mode first to find the right selector or text.",
416
+ parameters: {
417
+ type: 'object',
418
+ properties: {
419
+ selector: {
420
+ type: 'string',
421
+ description: "CSS selector of the element to click (e.g. 'button.submit', '#login-btn')"
422
+ },
423
+ text: {
424
+ type: 'string',
425
+ description: "Visible text of the element to click (e.g. 'Sign In', 'Next'). Used if selector is not provided."
426
+ }
427
+ }
360
428
  },
361
- },
362
- execute: async (input, signal, context) => {
363
- if (!input.selector && !input.text) return "Error: provide either 'selector' or 'text' to identify the element.";
429
+ execute: async (input, signal, context) => {
430
+ if (!input.selector && !input.text) {
431
+ return "Error: provide either 'selector' or 'text' to identify the element.";
432
+ }
364
433
 
365
- try {
366
- const page = await sessionManager.getPage(context.userID);
367
- if (page.url() === "about:blank") return "No page loaded. Use browser_navigate first.";
434
+ try {
435
+ const cdp = await sessionManager.getClient(context.userID);
436
+ const currentUrl = await cdp.getUrl();
368
437
 
369
- if (input.selector) {
370
- await page.locator(input.selector).first().click({ timeout: 5000 });
371
- } else {
372
- await page.getByText(input.text, { exact: false }).first().click({ timeout: 5000 });
373
- }
438
+ if (currentUrl === 'about:blank') {
439
+ return 'No page loaded. Use browser_navigate first.';
440
+ }
374
441
 
375
- // Wait briefly for navigation or dynamic content
376
- await page.waitForLoadState("domcontentloaded", { timeout: 5000 }).catch(() => {});
377
- const title = await page.title();
378
- return JSON.stringify({
379
- action: "browser_update",
380
- url: page.url(),
381
- title,
382
- clicked: input.selector || input.text,
383
- });
384
- } catch (err) {
385
- return `Error clicking element: ${err.message}`;
442
+ // Use retry helpers for more reliable clicking
443
+ if (input.selector) {
444
+ await cdp.clickWithRetry(input.selector);
445
+ } else {
446
+ // For text-based clicking, use retry wrapper
447
+ await cdp.retry(async () => {
448
+ const el = await cdp.getByText(input.text);
449
+ if (!el) throw new Error(`Element with text "${input.text}" not found`);
450
+ await cdp.click(el.x, el.y);
451
+ });
452
+ }
453
+
454
+ // Wait for network to settle after click
455
+ await cdp.waitForNetworkIdle({ timeout: 5000, idleTime: 300 });
456
+
457
+ const title = await cdp.getTitle();
458
+ const newUrl = await cdp.getUrl();
459
+
460
+ return JSON.stringify({
461
+ action: 'browser_update',
462
+ url: newUrl,
463
+ title,
464
+ clicked: input.selector || input.text
465
+ });
466
+ } catch (err) {
467
+ return `Error clicking element: ${err.message}`;
468
+ }
386
469
  }
387
470
  },
388
- },
389
-
390
- {
391
- name: "browser_type",
392
- description:
393
- "Type text into an input field on the current page. Finds the field by CSS selector, label, or placeholder text.",
394
- parameters: {
395
- type: "object",
396
- properties: {
397
- selector: {
398
- type: "string",
399
- description: "CSS selector of the input (e.g. 'input[name=email]', '#search')",
400
- },
401
- label: {
402
- type: "string",
403
- description: "Label text of the input field. Used if selector is not provided.",
404
- },
405
- placeholder: {
406
- type: "string",
407
- description: "Placeholder text of the input field. Used if selector and label are not provided.",
408
- },
409
- text: {
410
- type: "string",
411
- description: "Text to type into the field",
412
- },
413
- submit: {
414
- type: "boolean",
415
- description: "Press Enter after typing (to submit a form). Default: false",
471
+
472
+ {
473
+ name: 'browser_type',
474
+ description:
475
+ 'Type text into an input field on the current page. Finds the field by CSS selector, label, or placeholder text.',
476
+ parameters: {
477
+ type: 'object',
478
+ properties: {
479
+ selector: {
480
+ type: 'string',
481
+ description: "CSS selector of the input (e.g. 'input[name=email]', '#search')"
482
+ },
483
+ label: {
484
+ type: 'string',
485
+ description: 'Label text of the input field. Used if selector is not provided.'
486
+ },
487
+ placeholder: {
488
+ type: 'string',
489
+ description: 'Placeholder text of the input field. Used if selector and label are not provided.'
490
+ },
491
+ text: {
492
+ type: 'string',
493
+ description: 'Text to type into the field'
494
+ },
495
+ submit: {
496
+ type: 'boolean',
497
+ description: 'Press Enter after typing (to submit a form). Default: false'
498
+ }
416
499
  },
500
+ required: ['text']
417
501
  },
418
- required: ["text"],
419
- },
420
- execute: async (input, signal, context) => {
421
- try {
422
- const page = await sessionManager.getPage(context.userID);
423
- if (page.url() === "about:blank") return "No page loaded. Use browser_navigate first.";
424
-
425
- let locator;
426
- if (input.selector) {
427
- locator = page.locator(input.selector).first();
428
- } else if (input.label) {
429
- locator = page.getByLabel(input.label).first();
430
- } else if (input.placeholder) {
431
- locator = page.getByPlaceholder(input.placeholder).first();
432
- } else {
433
- // Fallback: first visible input
434
- locator = page.locator("input:visible, textarea:visible").first();
435
- }
502
+ execute: async (input, signal, context) => {
503
+ try {
504
+ const cdp = await sessionManager.getClient(context.userID);
505
+ const currentUrl = await cdp.getUrl();
436
506
 
437
- await locator.fill(input.text, { timeout: 5000 });
507
+ if (currentUrl === 'about:blank') {
508
+ return 'No page loaded. Use browser_navigate first.';
509
+ }
438
510
 
439
- if (input.submit) {
440
- await locator.press("Enter");
441
- await page.waitForLoadState("domcontentloaded", { timeout: 5000 }).catch(() => {});
442
- }
511
+ // Determine selector to use
512
+ let selector;
513
+ if (input.selector) {
514
+ selector = input.selector;
515
+ } else if (input.label) {
516
+ // Find input by label
517
+ const labelSelector = await cdp.evaluate(`
518
+ (() => {
519
+ const label = Array.from(document.querySelectorAll('label')).find(l =>
520
+ l.textContent.toLowerCase().includes('${input.label.toLowerCase().replace(/'/g, "\\'")}')
521
+ );
522
+ if (label && label.htmlFor) {
523
+ return '#' + label.htmlFor;
524
+ }
525
+ const input = label?.querySelector('input,textarea');
526
+ if (input) {
527
+ return input.tagName.toLowerCase() + (input.id ? '#' + input.id : '[name="' + input.name + '"]');
528
+ }
529
+ return null;
530
+ })()
531
+ `);
532
+ if (!labelSelector) {
533
+ return `No input found with label "${input.label}"`;
534
+ }
535
+ selector = labelSelector;
536
+ } else if (input.placeholder) {
537
+ selector = `input[placeholder*="${input.placeholder}"],textarea[placeholder*="${input.placeholder}"]`;
538
+ } else {
539
+ // Fallback: first visible input
540
+ selector = 'input:not([type=hidden]),textarea';
541
+ }
542
+
543
+ await cdp.fillWithRetry(selector, input.text);
544
+
545
+ if (input.submit) {
546
+ await cdp.press('Enter');
547
+ await cdp.waitForNetworkIdle({ timeout: 5000, idleTime: 300 });
548
+ }
443
549
 
444
- return JSON.stringify({
445
- action: "browser_update",
446
- url: page.url(),
447
- title: await page.title(),
448
- typed: input.text.slice(0, 50),
449
- submitted: input.submit || false,
450
- });
451
- } catch (err) {
452
- return `Error typing into field: ${err.message}`;
550
+ const title = await cdp.getTitle();
551
+ const newUrl = await cdp.getUrl();
552
+
553
+ return JSON.stringify({
554
+ action: 'browser_update',
555
+ url: newUrl,
556
+ title,
557
+ typed: input.text.slice(0, 50),
558
+ submitted: input.submit || false
559
+ });
560
+ } catch (err) {
561
+ return `Error typing into field: ${err.message}`;
562
+ }
453
563
  }
454
564
  },
455
- },
456
-
457
- {
458
- name: "browser_screenshot",
459
- description:
460
- "Take a screenshot of the current page and save it as a PNG. Returns an accessibility summary of the page and the screenshot URL.",
461
- parameters: {
462
- type: "object",
463
- properties: {
464
- full_page: {
465
- type: "boolean",
466
- description: "Capture the full scrollable page instead of just the viewport. Default: false",
467
- },
468
- selector: {
469
- type: "string",
470
- description: "CSS selector to screenshot a specific element instead of the whole page",
471
- },
472
- },
473
- },
474
- execute: async (input, signal, context) => {
475
- try {
476
- const page = await sessionManager.getPage(context.userID);
477
- if (page.url() === "about:blank") return "No page loaded. Use browser_navigate first.";
478
-
479
- await mkdir(SCREENSHOT_DIR, { recursive: true });
480
- const filename = `${context.userID}_${Date.now()}.png`;
481
- const filepath = `${SCREENSHOT_DIR}/${filename}`;
482
-
483
- const opts = { path: filepath, type: "png" };
484
- if (input.selector) {
485
- await page.locator(input.selector).first().screenshot(opts);
486
- } else {
487
- opts.fullPage = input.full_page || false;
488
- await page.screenshot(opts);
489
- }
490
565
 
491
- // Prune old screenshots (best-effort, non-blocking)
492
- pruneScreenshots(context.userID).catch(() => {});
493
-
494
- // Build page summary for the agent LLM
495
- const title = await page.title();
496
- const screenshotUrl = screenshotUrlPattern(filename);
497
- let pageSummary = `Page: ${title} (${page.url()})`;
498
- const tree = await getPageStructure(page).catch(() => null);
499
- if (tree) {
500
- const trimmed = tree.length > 2000 ? tree.slice(0, 2000) + "\n... [truncated]" : tree;
501
- pageSummary += `\n\nPage structure:\n${trimmed}`;
566
+ {
567
+ name: 'browser_screenshot',
568
+ description:
569
+ 'Take a screenshot of the current page and save it as a PNG. Returns an accessibility summary of the page and the screenshot URL.',
570
+ parameters: {
571
+ type: 'object',
572
+ properties: {
573
+ full_page: {
574
+ type: 'boolean',
575
+ description: 'Capture the full scrollable page instead of just the viewport. Default: false'
576
+ },
577
+ selector: {
578
+ type: 'string',
579
+ description: 'CSS selector to screenshot a specific element instead of the whole page'
580
+ }
502
581
  }
503
- // Log to activity so Photos app can list the screenshot
504
- if (context?.databaseManager) {
505
- try {
506
- await context.databaseManager.logAgentActivity(
507
- context.dbConfig.dbType, context.dbConfig.db, context.dbConfig.connectionString,
508
- context.userID, { type: "image_generation", prompt: `Screenshot: ${title}`, url: screenshotUrl, source: "browser" }
509
- );
510
- } catch { /* best effort */ }
582
+ },
583
+ execute: async (input, signal, context) => {
584
+ try {
585
+ const cdp = await sessionManager.getClient(context.userID);
586
+ const currentUrl = await cdp.getUrl();
587
+
588
+ if (currentUrl === 'about:blank') {
589
+ return 'No page loaded. Use browser_navigate first.';
590
+ }
591
+
592
+ await mkdir(SCREENSHOT_DIR, { recursive: true });
593
+ const filename = `${context.userID}_${Date.now()}.png`;
594
+ const filepath = `${SCREENSHOT_DIR}/${filename}`;
595
+
596
+ let buffer;
597
+ if (input.selector) {
598
+ buffer = await cdp.screenshotElement(input.selector);
599
+ } else {
600
+ buffer = await cdp.screenshot({ fullPage: input.full_page || false });
601
+ }
602
+
603
+ await writeFile(filepath, buffer);
604
+
605
+ // Prune old screenshots (best-effort, non-blocking)
606
+ pruneScreenshots(context.userID).catch(() => {});
607
+
608
+ // Build page summary for the agent LLM
609
+ const title = await cdp.getTitle();
610
+ const screenshotUrl = screenshotUrlPattern(filename);
611
+ let pageSummary = `Page: ${title} (${currentUrl})`;
612
+
613
+ const tree = await getPageStructure(cdp).catch(() => null);
614
+ if (tree) {
615
+ const trimmed = tree.length > 2000 ? tree.slice(0, 2000) + '\n... [truncated]' : tree;
616
+ pageSummary += `\n\nPage structure:\n${trimmed}`;
617
+ }
618
+
619
+ // Log to activity so Photos app can list the screenshot
620
+ if (context?.databaseManager) {
621
+ try {
622
+ await context.databaseManager.logAgentActivity(
623
+ context.dbConfig.dbType,
624
+ context.dbConfig.db,
625
+ context.dbConfig.connectionString,
626
+ context.userID,
627
+ { type: 'image_generation', prompt: `Screenshot: ${title}`, url: screenshotUrl, source: 'browser' }
628
+ );
629
+ } catch {
630
+ /* best effort */
631
+ }
632
+ }
633
+
634
+ // Return image JSON so frontend renders the screenshot inline
635
+ return JSON.stringify({ type: 'image', url: screenshotUrl, prompt: pageSummary });
636
+ } catch (err) {
637
+ return `Error taking screenshot: ${err.message}`;
511
638
  }
512
- // Return image JSON so frontend renders the screenshot inline
513
- return JSON.stringify({ type: "image", url: screenshotUrl, prompt: pageSummary });
514
- } catch (err) {
515
- return `Error taking screenshot: ${err.message}`;
516
639
  }
517
640
  },
518
- },
519
-
520
- {
521
- name: "browser_extract",
522
- description:
523
- "Extract structured data from the current page using CSS selectors. Returns an array of objects with the requested fields.",
524
- parameters: {
525
- type: "object",
526
- properties: {
527
- selector: {
528
- type: "string",
529
- description: "CSS selector for the repeating container elements (e.g. '.product-card', 'tr.result')",
530
- },
531
- fields: {
532
- type: "object",
533
- description: "Map of field names to CSS selectors relative to each container (e.g. { \"title\": \"h3\", \"price\": \".price\" })",
534
- },
535
- limit: {
536
- type: "number",
537
- description: "Max number of items to extract. Default: 20",
641
+
642
+ {
643
+ name: 'browser_extract',
644
+ description:
645
+ 'Extract structured data from the current page using CSS selectors. Returns an array of objects with the requested fields.',
646
+ parameters: {
647
+ type: 'object',
648
+ properties: {
649
+ selector: {
650
+ type: 'string',
651
+ description: "CSS selector for the repeating container elements (e.g. '.product-card', 'tr.result')"
652
+ },
653
+ fields: {
654
+ type: 'object',
655
+ description:
656
+ 'Map of field names to CSS selectors relative to each container (e.g. { "title": "h3", "price": ".price" })'
657
+ },
658
+ limit: {
659
+ type: 'number',
660
+ description: 'Max number of items to extract. Default: 20'
661
+ }
538
662
  },
663
+ required: ['selector', 'fields']
539
664
  },
540
- required: ["selector", "fields"],
541
- },
542
- execute: async (input, signal, context) => {
543
- try {
544
- const page = await sessionManager.getPage(context.userID);
545
- if (page.url() === "about:blank") return "No page loaded. Use browser_navigate first.";
546
-
547
- const limit = input.limit || 20;
548
- const containers = page.locator(input.selector);
549
- const count = Math.min(await containers.count(), limit);
550
-
551
- if (count === 0) return `No elements found matching "${input.selector}".`;
552
-
553
- const results = [];
554
- for (let i = 0; i < count; i++) {
555
- const container = containers.nth(i);
556
- const item = {};
557
- for (const [fieldName, fieldSelector] of Object.entries(input.fields)) {
558
- const el = container.locator(fieldSelector).first();
559
- item[fieldName] = await el.innerText().catch(() => "");
665
+ execute: async (input, signal, context) => {
666
+ try {
667
+ const cdp = await sessionManager.getClient(context.userID);
668
+ const currentUrl = await cdp.getUrl();
669
+
670
+ if (currentUrl === 'about:blank') {
671
+ return 'No page loaded. Use browser_navigate first.';
672
+ }
673
+
674
+ const limit = input.limit || 20;
675
+ const fieldsJson = JSON.stringify(input.fields);
676
+
677
+ const results = await cdp.evaluate(`
678
+ (() => {
679
+ const containers = document.querySelectorAll('${input.selector.replace(/'/g, "\\'")}');
680
+ const fields = ${fieldsJson};
681
+ const limit = ${limit};
682
+ const results = [];
683
+
684
+ for (let i = 0; i < Math.min(containers.length, limit); i++) {
685
+ const container = containers[i];
686
+ const item = {};
687
+ for (const [fieldName, fieldSelector] of Object.entries(fields)) {
688
+ const el = container.querySelector(fieldSelector);
689
+ item[fieldName] = el?.innerText || '';
690
+ }
691
+ results.push(item);
692
+ }
693
+
694
+ return results;
695
+ })()
696
+ `);
697
+
698
+ if (!results || results.length === 0) {
699
+ return `No elements found matching "${input.selector}".`;
560
700
  }
561
- results.push(item);
562
- }
563
701
 
564
- const json = JSON.stringify(results, null, 2);
565
- if (json.length > MAX_CONTENT_CHARS) {
566
- return json.slice(0, MAX_CONTENT_CHARS) + "\n... [truncated]";
702
+ const json = JSON.stringify(results, null, 2);
703
+ if (json.length > MAX_CONTENT_CHARS) {
704
+ return json.slice(0, MAX_CONTENT_CHARS) + '\n... [truncated]';
705
+ }
706
+ return json;
707
+ } catch (err) {
708
+ return `Error extracting data: ${err.message}`;
567
709
  }
568
- return json;
569
- } catch (err) {
570
- return `Error extracting data: ${err.message}`;
571
710
  }
572
711
  },
573
- },
574
-
575
- {
576
- name: "browser_close",
577
- description:
578
- "Close the current browser session. Use this when you're done browsing to free resources.",
579
- parameters: {
580
- type: "object",
581
- properties: {},
582
- },
583
- execute: async (input, signal, context) => {
584
- await sessionManager.closeContext(context.userID);
585
- return JSON.stringify({ action: "browser_closed" });
586
- },
587
- },
712
+
713
+ { // Tool definition: closes the browser context associated with context.userID.
714
+ name: 'browser_close',
715
+ description: "Close the current browser session. Use this when you're done browsing to free resources.",
716
+ parameters: {
717
+ type: 'object',
718
+ properties: {} // the tool takes no arguments
719
+ },
720
+ execute: async (input, signal, context) => { // `input` and `signal` are unused; there is no try/catch, so a rejection from closeContext propagates to the caller
721
+ await sessionManager.closeContext(context.userID);
722
+ return JSON.stringify({ action: 'browser_closed' }); // fixed JSON acknowledgement string
723
+ }
724
+ }
588
725
  ];
589
726
  }
590
727
 
591
728
  // Export default tools with default screenshot pattern
592
729
  export const browserTools = createBrowserTools();
593
-
594
- // ── Helpers ──
595
-
596
- /**
597
- * Build a structured summary of interactive elements on the page via DOM evaluation.
598
- * Replaces the deprecated page.accessibility.snapshot() API.
599
- *
600
- * @param {import('playwright').Page} page - Playwright page instance
601
- * @returns {Promise<string>} Formatted element tree
602
- */
603
- async function getPageStructure(page) {
604
- return await page.evaluate(() => {
605
- const INTERACTIVE = "a,button,input,select,textarea,[role=button],[role=link],[role=tab],[role=menuitem]";
606
- const lines = [];
607
- const els = document.querySelectorAll(INTERACTIVE);
608
- for (const el of els) {
609
- if (el.offsetParent === null && el.tagName !== "INPUT") continue; // skip hidden
610
- const tag = el.tagName.toLowerCase();
611
- const role = el.getAttribute("role") || tag;
612
- const name =
613
- el.getAttribute("aria-label") ||
614
- el.innerText?.slice(0, 60).replace(/\n/g, " ").trim() ||
615
- el.getAttribute("placeholder") ||
616
- el.getAttribute("name") ||
617
- "";
618
- const type = el.getAttribute("type") || "";
619
- const href = el.getAttribute("href") || "";
620
- let line = `[${role}]`;
621
- if (name) line += ` "${name}"`;
622
- if (type) line += ` type=${type}`;
623
- if (href) line += ` href="${href.slice(0, 80)}"`;
624
- lines.push(line);
625
- }
626
- // Also include headings for page structure
627
- const headings = document.querySelectorAll("h1,h2,h3");
628
- for (const h of headings) {
629
- const text = h.innerText?.trim();
630
- if (text) lines.push(`[${h.tagName.toLowerCase()}] "${text.slice(0, 80)}"`);
631
- }
632
- return lines.length > 0 ? lines.join("\n") : "No interactive elements found.";
633
- });
634
- }