@poncho-ai/browser 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- > @poncho-ai/browser@0.4.0 build /home/runner/work/poncho-ai/poncho-ai/packages/browser
2
+ > @poncho-ai/browser@0.4.0 build /Users/cesar/Dev/latitude/poncho-ai/packages/browser
3
3
  > tsup src/index.ts --format esm --dts
4
4
 
5
5
  CLI Building entry: src/index.ts
@@ -8,7 +8,7 @@
8
8
  CLI Target: es2022
9
9
  ESM Build start
10
10
  ESM dist/index.js 34.91 KB
11
- ESM ⚡️ Build success in 62ms
11
+ ESM ⚡️ Build success in 218ms
12
12
  DTS Build start
13
- DTS ⚡️ Build success in 4777ms
13
+ DTS ⚡️ Build success in 5616ms
14
14
  DTS dist/index.d.ts 12.44 KB
@@ -0,0 +1,12 @@
1
+
2
+ > @poncho-ai/browser@0.3.0 test /Users/cesar/Dev/latitude/poncho-ai/packages/browser
3
+ > vitest --passWithNoTests
4
+
5
+
6
+  RUN  v1.6.1 /Users/cesar/Dev/latitude/poncho-ai/packages/browser
7
+
8
+ include: **/*.{test,spec}.?(c|m)[jt]s?(x)
9
+ exclude: **/node_modules/**, **/dist/**, **/cypress/**, **/.{idea,git,cache,output,temp}/**, **/{karma,rollup,webpack,vite,vitest,jest,ava,babel,nyc,cypress,tsup,build,eslint,prettier}.config.*
10
+ watch exclude: **/node_modules/**, **/dist/**
11
+ No test files found, exiting with code 0
12
+
package/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # @poncho-ai/browser
2
2
 
3
+ ## 0.5.0
4
+
5
+ ### Minor Changes
6
+
7
+ - [`540c8e6`](https://github.com/cesr/poncho-ai/commit/540c8e6d895a95c2f215deb4af219069543371d9) Thanks [@cesr](https://github.com/cesr)! - Add `browser_click_text` and `browser_execute_js` tools for interacting with elements that don't appear in the accessibility snapshot (e.g. styled divs acting as buttons). Also force new-tab navigations (`window.open`, `target="_blank"`) to stay in the current tab so agents don't lose context.
8
+
3
9
  ## 0.4.0
4
10
 
5
11
  ### Minor Changes
package/dist/index.d.ts CHANGED
@@ -90,6 +90,12 @@ declare class BrowserSession {
90
90
  * Only needs to be called once per browser launch.
91
91
  */
92
92
  private installContextStealth;
93
+ /**
94
+ * Force all new-tab navigations (window.open, target="_blank") to open
95
+ * in the current tab instead. Agents operate on a single tab at a time
96
+ * and can't see or interact with popups.
97
+ */
98
+ private installSameTabScript;
93
99
  /**
94
100
  * Override the user-agent via CDP on the current page target.
95
101
  * CDP Network.setUserAgentOverride is per-target, so call per-tab.
@@ -122,6 +128,8 @@ declare class BrowserSession {
122
128
  title: string;
123
129
  }>;
124
130
  scroll(conversationId: string, direction: "up" | "down", amount?: number): Promise<void>;
131
+ clickText(conversationId: string, text: string, exact?: boolean): Promise<void>;
132
+ executeJs(conversationId: string, script: string): Promise<unknown>;
125
133
  closeTab(conversationId: string): Promise<void>;
126
134
  navigate(conversationId: string, action: string): Promise<void>;
127
135
  startScreencast(conversationId: string, options?: ScreencastOptions): Promise<void>;
package/dist/index.js CHANGED
@@ -186,6 +186,46 @@ async function getBrowserManagerCtor() {
186
186
  return BrowserManagerCtor;
187
187
  }
188
188
  var MAX_TABS = 8;
189
+ var SAME_TAB_INIT_SCRIPT = `
190
+ (() => {
191
+ // Override window.open to navigate in-place
192
+ try {
193
+ const origOpen = window.open;
194
+ window.open = function(url, target, features) {
195
+ if (url) {
196
+ location.href = url;
197
+ return window;
198
+ }
199
+ return origOpen.call(this, url, target, features);
200
+ };
201
+ } catch {}
202
+
203
+ // Rewrite target="_blank" on existing and future links
204
+ try {
205
+ const rewrite = (el) => {
206
+ if (el.tagName === 'A' && el.target === '_blank') {
207
+ el.target = '_self';
208
+ }
209
+ };
210
+ // Catch links already in the DOM
211
+ document.addEventListener('DOMContentLoaded', () => {
212
+ document.querySelectorAll('a[target="_blank"]').forEach(rewrite);
213
+ });
214
+ // Catch dynamically added links
215
+ new MutationObserver((mutations) => {
216
+ for (const m of mutations) {
217
+ for (const node of m.addedNodes) {
218
+ if (node.nodeType !== 1) continue;
219
+ rewrite(node);
220
+ if (node.querySelectorAll) {
221
+ node.querySelectorAll('a[target="_blank"]').forEach(rewrite);
222
+ }
223
+ }
224
+ }
225
+ }).observe(document.documentElement, { childList: true, subtree: true });
226
+ } catch {}
227
+ })();
228
+ `;
189
229
  var BrowserSession = class {
190
230
  config;
191
231
  sessionId;
@@ -264,6 +304,20 @@ var BrowserSession = class {
264
304
  console.warn("[poncho][browser] Failed to install stealth init script:", err?.message ?? err);
265
305
  }
266
306
  }
307
+ /**
308
+ * Force all new-tab navigations (window.open, target="_blank") to open
309
+ * in the current tab instead. Agents operate on a single tab at a time
310
+ * and can't see or interact with popups.
311
+ */
312
+ async installSameTabScript(mgr) {
313
+ const ctx = mgr.getContext();
314
+ if (!ctx) return;
315
+ try {
316
+ await ctx.addInitScript({ content: SAME_TAB_INIT_SCRIPT });
317
+ } catch (err) {
318
+ console.warn("[poncho][browser] Failed to install same-tab init script:", err?.message ?? err);
319
+ }
320
+ }
267
321
  /**
268
322
  * Override the user-agent via CDP on the current page target.
269
323
  * CDP Network.setUserAgentOverride is per-target, so call per-tab.
@@ -310,6 +364,7 @@ var BrowserSession = class {
310
364
  if (this.stealthEnabled) {
311
365
  await this.installContextStealth(mgr);
312
366
  }
367
+ await this.installSameTabScript(mgr);
313
368
  try {
314
369
  const cdp = await mgr.getCDPSession();
315
370
  await cdp.send("Debugger.disable");
@@ -558,6 +613,30 @@ var BrowserSession = class {
558
613
  this.unlock();
559
614
  }
560
615
  }
616
+ async clickText(conversationId, text, exact) {
617
+ await this.lock();
618
+ try {
619
+ const mgr = await this.ensureManager();
620
+ const tab = await this.switchToConversation(mgr, conversationId);
621
+ const selector = exact ? `text="${text}"` : `text=${text}`;
622
+ const locator = mgr.getLocator(selector);
623
+ await locator.click();
624
+ tab.url = mgr.getPage().url();
625
+ } finally {
626
+ this.unlock();
627
+ }
628
+ }
629
+ async executeJs(conversationId, script) {
630
+ await this.lock();
631
+ try {
632
+ const mgr = await this.ensureManager();
633
+ await this.switchToConversation(mgr, conversationId);
634
+ const page = mgr.getPage();
635
+ return await page.evaluate(script);
636
+ } finally {
637
+ this.unlock();
638
+ }
639
+ }
561
640
  async closeTab(conversationId) {
562
641
  await this.lock();
563
642
  try {
@@ -907,6 +986,53 @@ function createBrowserTools(getSession, getConversationId) {
907
986
  return { clicked: ref };
908
987
  }
909
988
  },
989
+ {
990
+ name: "browser_click_text",
991
+ description: "Click the first visible element on the page that contains the given text. Use this when an element doesn't appear in the snapshot \u2014 e.g. styled divs acting as buttons. By default matches substring (case-insensitive); set exact=true for exact text match.",
992
+ inputSchema: {
993
+ type: "object",
994
+ properties: {
995
+ text: {
996
+ type: "string",
997
+ description: "The visible text of the element to click"
998
+ },
999
+ exact: {
1000
+ type: "boolean",
1001
+ description: "If true, match the exact full text (case-sensitive). Default: false (substring, case-insensitive)."
1002
+ }
1003
+ },
1004
+ required: ["text"]
1005
+ },
1006
+ handler: async (input) => {
1007
+ const session = getSession();
1008
+ const text = String(input.text ?? "");
1009
+ if (!text) throw new Error("text is required");
1010
+ const exact = input.exact === true;
1011
+ await session.clickText(getConversationId(), text, exact);
1012
+ return { clicked: text, exact };
1013
+ }
1014
+ },
1015
+ {
1016
+ name: "browser_execute_js",
1017
+ description: "Execute JavaScript in the current page context and return the result. Use this to inspect or interact with the DOM when snapshot refs aren't available \u2014 e.g. finding elements by text content, getting bounding boxes, or clicking elements by selector. The script is evaluated via page.evaluate(); return a value to get it back.",
1018
+ inputSchema: {
1019
+ type: "object",
1020
+ properties: {
1021
+ script: {
1022
+ type: "string",
1023
+ description: "JavaScript code to evaluate in the page. Use a return statement or expression to get a result back."
1024
+ }
1025
+ },
1026
+ required: ["script"]
1027
+ },
1028
+ handler: async (input) => {
1029
+ const session = getSession();
1030
+ const script = String(input.script ?? "");
1031
+ if (!script) throw new Error("script is required");
1032
+ const result = await session.executeJs(getConversationId(), script);
1033
+ return { result: result ?? null };
1034
+ }
1035
+ },
910
1036
  {
911
1037
  name: "browser_type",
912
1038
  description: "Type text into a form field identified by its ref from the last snapshot. This clears the field first, then types the new value.",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@poncho-ai/browser",
3
- "version": "0.4.0",
3
+ "version": "0.5.0",
4
4
  "description": "Browser automation for Poncho agents, powered by agent-browser",
5
5
  "repository": {
6
6
  "type": "git",
package/src/session.ts CHANGED
@@ -62,6 +62,51 @@ async function getBrowserManagerCtor(): Promise<new () => BrowserManagerInstance
62
62
 
63
63
  const MAX_TABS = 8;
64
64
 
65
+ /**
66
+ * Init script that forces new-tab navigations (window.open, target="_blank")
67
+ * to open in the current tab. Runs before page scripts on every navigation.
68
+ */
69
+ const SAME_TAB_INIT_SCRIPT = `
70
+ (() => {
71
+ // Override window.open to navigate in-place
72
+ try {
73
+ const origOpen = window.open;
74
+ window.open = function(url, target, features) {
75
+ if (url) {
76
+ location.href = url;
77
+ return window;
78
+ }
79
+ return origOpen.call(this, url, target, features);
80
+ };
81
+ } catch {}
82
+
83
+ // Rewrite target="_blank" on existing and future links
84
+ try {
85
+ const rewrite = (el) => {
86
+ if (el.tagName === 'A' && el.target === '_blank') {
87
+ el.target = '_self';
88
+ }
89
+ };
90
+ // Catch links already in the DOM
91
+ document.addEventListener('DOMContentLoaded', () => {
92
+ document.querySelectorAll('a[target="_blank"]').forEach(rewrite);
93
+ });
94
+ // Catch dynamically added links
95
+ new MutationObserver((mutations) => {
96
+ for (const m of mutations) {
97
+ for (const node of m.addedNodes) {
98
+ if (node.nodeType !== 1) continue;
99
+ rewrite(node);
100
+ if (node.querySelectorAll) {
101
+ node.querySelectorAll('a[target="_blank"]').forEach(rewrite);
102
+ }
103
+ }
104
+ }
105
+ }).observe(document.documentElement, { childList: true, subtree: true });
106
+ } catch {}
107
+ })();
108
+ `;
109
+
65
110
  // Per-conversation tab state
66
111
  interface ConversationTab {
67
112
  tabIndex: number;
@@ -163,6 +208,21 @@ export class BrowserSession {
163
208
  }
164
209
  }
165
210
 
211
+ /**
212
+ * Force all new-tab navigations (window.open, target="_blank") to open
213
+ * in the current tab instead. Agents operate on a single tab at a time
214
+ * and can't see or interact with popups.
215
+ */
216
+ private async installSameTabScript(mgr: BrowserManagerInstance): Promise<void> {
217
+ const ctx = mgr.getContext();
218
+ if (!ctx) return;
219
+ try {
220
+ await ctx.addInitScript({ content: SAME_TAB_INIT_SCRIPT });
221
+ } catch (err) {
222
+ console.warn("[poncho][browser] Failed to install same-tab init script:", (err as Error)?.message ?? err);
223
+ }
224
+ }
225
+
166
226
  /**
167
227
  * Override the user-agent via CDP on the current page target.
168
228
  * CDP Network.setUserAgentOverride is per-target, so call per-tab.
@@ -219,6 +279,9 @@ export class BrowserSession {
219
279
  await this.installContextStealth(mgr);
220
280
  }
221
281
 
282
+ // Redirect new-tab navigations into the current tab
283
+ await this.installSameTabScript(mgr);
284
+
222
285
  try {
223
286
  const cdp = await mgr.getCDPSession();
224
287
  await cdp.send("Debugger.disable");
@@ -475,6 +538,32 @@ export class BrowserSession {
475
538
  }
476
539
  }
477
540
 
541
+ async clickText(conversationId: string, text: string, exact?: boolean): Promise<void> {
542
+ await this.lock();
543
+ try {
544
+ const mgr = await this.ensureManager();
545
+ const tab = await this.switchToConversation(mgr, conversationId);
546
+ const selector = exact ? `text="${text}"` : `text=${text}`;
547
+ const locator = mgr.getLocator(selector);
548
+ await locator.click();
549
+ tab.url = mgr.getPage().url();
550
+ } finally {
551
+ this.unlock();
552
+ }
553
+ }
554
+
555
+ async executeJs(conversationId: string, script: string): Promise<unknown> {
556
+ await this.lock();
557
+ try {
558
+ const mgr = await this.ensureManager();
559
+ await this.switchToConversation(mgr, conversationId);
560
+ const page = mgr.getPage();
561
+ return await page.evaluate(script);
562
+ } finally {
563
+ this.unlock();
564
+ }
565
+ }
566
+
478
567
  async closeTab(conversationId: string): Promise<void> {
479
568
  await this.lock();
480
569
  try {
package/src/tools.ts CHANGED
@@ -73,6 +73,62 @@ export function createBrowserTools(
73
73
  return { clicked: ref };
74
74
  },
75
75
  },
76
+ {
77
+ name: "browser_click_text",
78
+ description:
79
+ "Click the first visible element on the page that contains the given text. " +
80
+ "Use this when an element doesn't appear in the snapshot — e.g. styled divs acting as buttons. " +
81
+ "By default matches substring (case-insensitive); set exact=true for exact text match.",
82
+ inputSchema: {
83
+ type: "object",
84
+ properties: {
85
+ text: {
86
+ type: "string",
87
+ description: "The visible text of the element to click",
88
+ },
89
+ exact: {
90
+ type: "boolean",
91
+ description:
92
+ "If true, match the exact full text (case-sensitive). Default: false (substring, case-insensitive).",
93
+ },
94
+ },
95
+ required: ["text"],
96
+ },
97
+ handler: async (input: BrowserToolInput) => {
98
+ const session = getSession();
99
+ const text = String(input.text ?? "");
100
+ if (!text) throw new Error("text is required");
101
+ const exact = input.exact === true;
102
+ await session.clickText(getConversationId(), text, exact);
103
+ return { clicked: text, exact };
104
+ },
105
+ },
106
+ {
107
+ name: "browser_execute_js",
108
+ description:
109
+ "Execute JavaScript in the current page context and return the result. " +
110
+ "Use this to inspect or interact with the DOM when snapshot refs aren't available — " +
111
+ "e.g. finding elements by text content, getting bounding boxes, or clicking elements by selector. " +
112
+ "The script is evaluated via page.evaluate(); return a value to get it back.",
113
+ inputSchema: {
114
+ type: "object",
115
+ properties: {
116
+ script: {
117
+ type: "string",
118
+ description:
119
+ "JavaScript code to evaluate in the page. Use a return statement or expression to get a result back.",
120
+ },
121
+ },
122
+ required: ["script"],
123
+ },
124
+ handler: async (input: BrowserToolInput) => {
125
+ const session = getSession();
126
+ const script = String(input.script ?? "");
127
+ if (!script) throw new Error("script is required");
128
+ const result = await session.executeJs(getConversationId(), script);
129
+ return { result: result ?? null };
130
+ },
131
+ },
76
132
  {
77
133
  name: "browser_type",
78
134
  description: