meshagent-computers 0.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of meshagent-computers might be problematic. Click here for more details.

@@ -0,0 +1,7 @@
1
+ from .computer import Computer
2
+ from .browserbase import BrowserbaseBrowser
3
+ from .local_playwright import LocalPlaywrightComputer
4
+ from .docker import DockerComputer
5
+ from .scrapybara import ScrapybaraBrowser, ScrapybaraUbuntu
6
+ from .operator import Operator
7
+ from .agent import ComputerAgent
@@ -0,0 +1,225 @@
1
+ from meshagent.openai import OpenAIResponsesAdapter
2
+ from meshagent.agents import LLMAdapter, AgentChatContext
3
+ from meshagent.tools import Tool, Toolkit, ToolContext
4
+ from meshagent.agents.prompt import PromptAgent
5
+ from meshagent.computers import Computer, Operator
6
+ from meshagent.agents.chat import ChatBot, ChatThreadContext
7
+ from meshagent.api import RemoteParticipant, FileResponse
8
+ from meshagent.api.messaging import RawOutputs
9
+
10
+ from typing import Optional
11
+ import base64
12
+ import json
13
+ import logging
14
+
15
# Install a default root handler (no-op when the root logger already has one)
# so INFO records from this module are emitted without extra configuration.
logging.basicConfig()

# Module logger, named "computer", reporting at INFO and above.
logger = logging.getLogger("computer")
logger.setLevel(logging.INFO)
18
+
19
+ class ComputerAgent[ComputerType:Computer, OperatorType:Operator](ChatBot):
20
+ def __init__(self, *, name,
21
+ title=None,
22
+ description=None,
23
+ requires=None,
24
+ labels = None,
25
+ computer_cls: ComputerType,
26
+ operator_cls: OperatorType,
27
+ rules: Optional[list[str]] = None,
28
+ llm_adapter: Optional[LLMAdapter] = None,
29
+ toolkits: list[Toolkit] = None
30
+ ):
31
+
32
+ if rules == None:
33
+ rules=[
34
+ "if asked to go to a URL, you MUST use the goto function to go to the url if it is available",
35
+ "after going directly to a URL, the screen will change so you should take a look at it to know what to do next"
36
+ ]
37
+ super().__init__(
38
+ name=name,
39
+ title=title,
40
+ description=description,
41
+ requires=requires,
42
+ labels=labels,
43
+ llm_adapter=llm_adapter,
44
+ toolkits=toolkits,
45
+ rules=rules
46
+ )
47
+ self.computer_cls = computer_cls
48
+ self.operator_cls = operator_cls
49
+
50
+
51
+ async def init_thread_context(self, *, thread_context: ChatThreadContext):
52
+
53
+ operator : Operator = self.operator_cls()
54
+ computer : Computer = self.computer_cls()
55
+ started = False
56
+
57
+ class ComputerTool(Tool):
58
+ def __init__(self, *, operator: Operator, computer: Computer, title = "computer_call", description = "handle computer calls from computer use preview", rules = [], thumbnail_url = None, defs = None):
59
+ super().__init__(
60
+ name="computer_call",
61
+ # TODO: give a correct schema
62
+ input_schema={
63
+ "additionalProperties" : False,
64
+ "type" : "object",
65
+ "required" : [],
66
+ "properties" : {}
67
+ },
68
+ title=title,
69
+ description=description,
70
+ rules=rules,
71
+ thumbnail_url=thumbnail_url,
72
+ defs=defs,
73
+
74
+ )
75
+ self.computer = computer
76
+
77
+
78
+ @property
79
+ def options(self):
80
+ return {
81
+ "type": "computer-preview",
82
+ "display_width": self.computer.dimensions[0],
83
+ "display_height": self.computer.dimensions[1],
84
+ "environment": self.computer.environment,
85
+ }
86
+
87
+ async def execute(self, context: ToolContext, *, arguments):
88
+
89
+ nonlocal started
90
+ if started == False:
91
+ await self.computer.__aenter__()
92
+ started = True
93
+
94
+ for participant in thread_context.participants:
95
+ await context.room.messaging.send_message(
96
+ to=participant,
97
+ type="computer_use",
98
+ message={
99
+ "arguments" : arguments
100
+ }
101
+ )
102
+
103
+ outputs = await operator.play(computer=self.computer, item=arguments)
104
+ for output in outputs:
105
+ if output["type"] == "computer_call_output":
106
+ if output["output"] != None:
107
+ if output["output"]["type"] == "input_image":
108
+
109
+ b64 : str = output["output"]["image_url"]
110
+ image_data_b64 = b64.split(",", 1)
111
+
112
+ image_bytes = base64.b64decode(image_data_b64[1])
113
+
114
+ for participant in thread_context.participants:
115
+ context.room.messaging.send_message_nowait(
116
+ to=participant,
117
+ type="computer_screen",
118
+ message={
119
+ },
120
+ attachment=image_bytes
121
+ )
122
+
123
+ nonlocal computer_toolkit
124
+ if len(computer_toolkit.tools) == 1:
125
+ # HACK: after looking at the page, add the other tools,
126
+ # if we add these first then the computer-use-preview mode fails if it calls them before using the computer
127
+ computer_toolkit.tools.extend([
128
+ ScreenshotTool(computer=computer),
129
+ GotoURL(computer=computer),
130
+ ])
131
+ return RawOutputs(outputs=outputs)
132
+
133
+ class ScreenshotTool(Tool):
134
+ def __init__(self, computer: Computer):
135
+ self.computer = computer
136
+
137
+ super().__init__(
138
+ name="screenshot",
139
+ # TODO: give a correct schema
140
+ input_schema={
141
+ "additionalProperties" : False,
142
+ "type" : "object",
143
+ "required" : ["full_page","save_path"],
144
+ "properties" : {
145
+ "full_page" : {
146
+ "type" : "boolean"
147
+ },
148
+ "save_path" : {
149
+ "type" : "string",
150
+ "description" : "a file path to save the screenshot to (should end with .png)"
151
+ }
152
+ }
153
+ },
154
+ description="take a screenshot of the current page",
155
+ )
156
+
157
+
158
+ async def execute(self, context: ToolContext, save_path: str, full_page: bool):
159
+ nonlocal started
160
+ if started == False:
161
+ await self.computer.__aenter__()
162
+ started = True
163
+
164
+ screenshot_bytes = await self.computer.screenshot_bytes(full_page=full_page)
165
+ handle = await context.room.storage.open(path=save_path, overwrite=True)
166
+ await context.room.storage.write(handle=handle, data=screenshot_bytes)
167
+ await context.room.storage.close(handle=handle)
168
+
169
+ return f"saved screenshot to {save_path}"
170
+
171
+ class GotoURL(Tool):
172
+ def __init__(self, computer: Computer):
173
+ self.computer = computer
174
+
175
+ super().__init__(
176
+ name="goto",
177
+ description="goes to a specific URL. Make sure it starts with http:// or https://",
178
+ # TODO: give a correct schema
179
+ input_schema={
180
+ "additionalProperties" : False,
181
+ "type" : "object",
182
+ "required" : ["url"],
183
+ "properties" : {
184
+ "url" : {
185
+ "type" : "string",
186
+ "description": "Fully qualified URL to navigate to.",
187
+ }
188
+ }
189
+ },
190
+ )
191
+
192
+
193
+ async def execute(self, context: ToolContext, url: str):
194
+ nonlocal started
195
+ if started == False:
196
+ await self.computer.__aenter__()
197
+ started = True
198
+
199
+ if url.startswith("https://") == False and url.startswith("http://") == False:
200
+ url = "https://"+url
201
+
202
+ await self.computer.goto(url)
203
+
204
+ # send an updated screen out
205
+ for participant in thread_context.participants:
206
+ context.room.messaging.send_message_nowait(
207
+ to=participant,
208
+ type="computer_screen",
209
+ message={
210
+ },
211
+ attachment = await self.computer.screenshot_bytes(full_page=False)
212
+ )
213
+
214
+ computer_tool = ComputerTool(computer=computer, operator=operator)
215
+
216
+ computer_toolkit = Toolkit(name="meshagent.openai.computer", tools=[
217
+ computer_tool
218
+ ])
219
+
220
+ thread_context.toolkits = [
221
+ computer_toolkit,
222
+ *thread_context.toolkits
223
+ ]
224
+
225
+
@@ -0,0 +1,179 @@
1
+ import time
2
+ import base64
3
+ from typing import List, Dict, Literal
4
+ from playwright.async_api import async_playwright, Browser, Page, Route, Request
5
+ from meshagent.computers.utils import check_blocklisted_url
6
+
7
# Optional: key mapping if your model uses "CUA" style keys.
# Keys are lower-cased CUA names; values are the names passed to Playwright's
# keyboard API. Names missing from this table are passed through unchanged
# by the code that performs the lookup.
CUA_KEY_TO_PLAYWRIGHT_KEY = {
    # "Divide" is not a key name Playwright knows; the physical key is "Slash".
    "/": "Slash",
    "\\": "Backslash",
    "alt": "Alt",
    "arrowdown": "ArrowDown",
    "arrowleft": "ArrowLeft",
    "arrowright": "ArrowRight",
    "arrowup": "ArrowUp",
    "backspace": "Backspace",
    "capslock": "CapsLock",
    "cmd": "Meta",
    "ctrl": "Control",
    "delete": "Delete",
    "end": "End",
    "enter": "Enter",
    "esc": "Escape",
    "home": "Home",
    "insert": "Insert",
    "option": "Alt",
    "pagedown": "PageDown",
    "pageup": "PageUp",
    "shift": "Shift",
    "space": " ",
    "super": "Meta",
    "tab": "Tab",
    "win": "Meta",
}
35
+
36
+
37
+ class BasePlaywrightComputer:
38
+ """
39
+ Abstract base for Playwright-based computers:
40
+
41
+ - Subclasses override `_get_browser_and_page()` to do local or remote connection,
42
+ returning (Browser, Page).
43
+ - This base class handles context creation (`__enter__`/`__exit__`),
44
+ plus standard "Computer" actions like click, scroll, etc.
45
+ - We also have extra browser actions: `goto(url)` and `back()`.
46
+ """
47
+
48
+ environment: Literal["browser"] = "browser"
49
+ dimensions = (1024, 768)
50
+
51
+ def __init__(self):
52
+ self._playwright = None
53
+ self._browser: Browser | None = None
54
+ self._page: Page | None = None
55
+
56
+ async def __aenter__(self):
57
+ # Start Playwright and call the subclass hook for getting browser/page
58
+ self._context = async_playwright()
59
+ self._playwright = await self._context.__aenter__()
60
+ self._browser, self._page = await self._get_browser_and_page()
61
+
62
+ # Set up network interception to flag URLs matching domains in BLOCKED_DOMAINS
63
+ async def handle_route(route: Route, request: Request):
64
+
65
+ url = request.url
66
+ if check_blocklisted_url(url):
67
+ print(f"Flagging blocked domain: {url}")
68
+ await route.abort()
69
+ else:
70
+ await route.continue_()
71
+
72
+ await self._page.route("**/*", handle_route)
73
+
74
+ return self
75
+
76
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
77
+ if self._browser:
78
+ await self._browser.close()
79
+ if self._playwright:
80
+ await self._context.__aexit__(exc_type, exc_val, exc_tb)
81
+
82
+ def get_current_url(self) -> str:
83
+ if self._page == None:
84
+ return "about:blank"
85
+
86
+ async def ensure_page(self):
87
+ # After a timeout, we might loose our browser
88
+ if self._page == None or self._browser.is_connected == False:
89
+ self._browser, self._page = await self._get_browser_and_page()
90
+
91
+ # --- Common "Computer" actions ---
92
+
93
+
94
+ async def screenshot_bytes(self, full_page: bool = False) -> bytes:
95
+ await self.ensure_page()
96
+ png_bytes = await self._page.screenshot(full_page=full_page)
97
+ return png_bytes
98
+
99
+ async def screenshot(self, full_page: bool = False) -> str:
100
+ await self.ensure_page()
101
+ png_bytes = await self.screenshot_bytes(full_page=full_page)
102
+ return base64.b64encode(png_bytes).decode("utf-8")
103
+
104
+ async def click(self, x: int, y: int, button: str = "left") -> None:
105
+ await self.ensure_page()
106
+ match button:
107
+ case "back":
108
+ await self.back()
109
+ case "forward":
110
+ await self.forward()
111
+ case "wheel":
112
+ await self._page.mouse.wheel(x, y)
113
+ case _:
114
+ button_mapping = {"left": "left", "right": "right"}
115
+ button_type = button_mapping.get(button, "left")
116
+ await self._page.mouse.click(x, y, button=button_type)
117
+
118
+ async def double_click(self, x: int, y: int) -> None:
119
+ await self.ensure_page()
120
+ await self._page.mouse.dblclick(x, y)
121
+
122
+ async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
123
+ await self.ensure_page()
124
+ await self._page.mouse.move(x, y)
125
+ await self._page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})")
126
+
127
+ async def type(self, text: str) -> None:
128
+ await self.ensure_page()
129
+ await self._page.keyboard.type(text)
130
+
131
+ async def wait(self, ms: int = 1000) -> None:
132
+ await self.ensure_page()
133
+ time.sleep(ms / 1000)
134
+
135
+ async def move(self, x: int, y: int) -> None:
136
+ await self.ensure_page()
137
+ await self._page.mouse.move(x, y)
138
+
139
+ async def keypress(self, keys: List[str]) -> None:
140
+ await self.ensure_page()
141
+ for key in keys:
142
+ mapped_key = CUA_KEY_TO_PLAYWRIGHT_KEY.get(key.lower(), key)
143
+ await self._page.keyboard.press(mapped_key)
144
+
145
+ async def drag(self, path: List[Dict[str, int]]) -> None:
146
+ await self.ensure_page()
147
+ if not path:
148
+ return
149
+
150
+ await self._page.mouse.move(path[0]["x"], path[0]["y"])
151
+ await self._page.mouse.down()
152
+ for point in path[1:]:
153
+ await self._page.mouse.move(point["x"], point["y"])
154
+ await self._page.mouse.up()
155
+
156
+ async def get_current_url(self) -> str:
157
+ await self.ensure_page()
158
+ return self._page.url
159
+
160
+ # --- Extra browser-oriented actions ---
161
+ async def goto(self, url: str) -> None:
162
+ await self.ensure_page()
163
+ try:
164
+ return await self._page.goto(url)
165
+ except Exception as e:
166
+ print(f"Error navigating to {url}: {e}")
167
+
168
+ async def back(self) -> None:
169
+ await self.ensure_page()
170
+ return await self._page.go_back()
171
+
172
+ async def forward(self) -> None:
173
+ await self.ensure_page()
174
+ return await self._page.go_forward()
175
+
176
+ # --- Subclass hook ---
177
+ async def _get_browser_and_page(self) -> tuple[Browser, Page]:
178
+ """Subclasses must implement, returning (Browser, Page)."""
179
+ raise NotImplementedError
@@ -0,0 +1,197 @@
1
+ import os
2
+ from typing import Tuple, Dict, List, Union, Optional
3
+ from playwright.async_api import Browser, Page, BrowserContext, Error as PlaywrightError
4
+ from .base_playwright import BasePlaywrightComputer
5
+ from browserbase import AsyncBrowserbase
6
+ from dotenv import load_dotenv
7
+ import base64
8
+
9
+ load_dotenv()
10
+
11
+
12
class BrowserbaseBrowser(BasePlaywrightComputer):
    """
    Browserbase is a headless browser platform that offers a remote browser API. You can use it to control thousands of browsers from anywhere.
    You can find more information about Browserbase at https://www.browserbase.com/computer-use or view our OpenAI CUA Quickstart at https://docs.browserbase.com/integrations/openai-cua/introduction.

    IMPORTANT: This Browserbase computer requires the use of the `goto` tool defined in playwright_with_custom_functions.py.
    Make sure to include this tool in your configuration when using the Browserbase computer.
    """

    def __init__(
        self,
        width: int = 1024,
        height: int = 768,
        region: str = "us-west-2",
        proxy: bool = False,
        virtual_mouse: bool = True,
        ad_blocker: bool = False,
    ):
        """
        Initialize the Browserbase instance. Additional configuration options for features such as persistent cookies, ad blockers, file downloads and more can be found in the Browserbase API documentation: https://docs.browserbase.com/reference/api/create-a-session

        The API key and project id are read from the BROWSERBASE_API_KEY and
        BROWSERBASE_PROJECT_ID environment variables.

        Args:
            width (int): The width of the browser viewport. Default is 1024.
            height (int): The height of the browser viewport. Default is 768.
            region (str): The region for the Browserbase session. Default is "us-west-2". Pick a region close to you for better performance. https://docs.browserbase.com/guides/multi-region
            proxy (bool): Whether to use a proxy for the session. Default is False. Turn on proxies if your browsing is frequently interrupted. https://docs.browserbase.com/features/proxies
            virtual_mouse (bool): Whether to enable the virtual mouse cursor. Default is True.
            ad_blocker (bool): Whether to enable the built-in ad blocker. Default is False.
        """
        super().__init__()
        self.bb = AsyncBrowserbase(api_key=os.getenv("BROWSERBASE_API_KEY"))
        self.project_id = os.getenv("BROWSERBASE_PROJECT_ID")
        self.session = None
        self.dimensions = (width, height)
        self.region = region
        self.proxy = proxy
        self.virtual_mouse = virtual_mouse
        self.ad_blocker = ad_blocker

    async def _get_browser_and_page(self) -> Tuple[Browser, Page]:
        """
        Create a Browserbase session and connect to it.

        Returns:
            Tuple[Browser, Page]: A tuple containing the connected browser and page objects.
        """
        # Create a session on Browserbase with specified parameters
        width, height = self.dimensions
        session_params = {
            "project_id": self.project_id,
            "browser_settings": {
                "viewport": {"width": width, "height": height},
                "blockAds": self.ad_blocker,
            },
            "region": self.region,
            "proxies": self.proxy,
        }
        self.session = await self.bb.sessions.create(**session_params)

        # Print the live session URL
        print(
            f"Watch and control this browser live at https://www.browserbase.com/sessions/{self.session.id}"
        )

        # Connect to the remote session
        browser = await self._playwright.chromium.connect_over_cdp(
            self.session.connect_url,
            timeout=60000
        )
        context = browser.contexts[0]

        # Add event listeners for page creation and closure
        context.on("page", self._handle_new_page)

        # Only add the init script if virtual_mouse is True
        if self.virtual_mouse:
            await context.add_init_script("""
            // Only run in the top frame
            if (window.self === window.top) {
                function initCursor() {
                    const CURSOR_ID = '__cursor__';

                    // Check if cursor element already exists
                    if (document.getElementById(CURSOR_ID)) return;

                    const cursor = document.createElement('div');
                    cursor.id = CURSOR_ID;
                    Object.assign(cursor.style, {
                        position: 'fixed',
                        top: '0px',
                        left: '0px',
                        width: '20px',
                        height: '20px',
                        backgroundImage: 'url("data:image/svg+xml;utf8,<svg xmlns=\\'http://www.w3.org/2000/svg\\' viewBox=\\'0 0 24 24\\' fill=\\'black\\' stroke=\\'white\\' stroke-width=\\'1\\' stroke-linejoin=\\'round\\' stroke-linecap=\\'round\\'><polygon points=\\'2,2 2,22 8,16 14,22 17,19 11,13 20,13\\'/></svg>")',
                        backgroundSize: 'cover',
                        pointerEvents: 'none',
                        zIndex: '99999',
                        transform: 'translate(-2px, -2px)',
                    });

                    document.body.appendChild(cursor);

                    document.addEventListener("mousemove", (e) => {
                        cursor.style.top = e.clientY + "px";
                        cursor.style.left = e.clientX + "px";
                    });
                }

                // Use requestAnimationFrame for early execution
                requestAnimationFrame(function checkBody() {
                    if (document.body) {
                        initCursor();
                    } else {
                        requestAnimationFrame(checkBody);
                    }
                });
            }
            """)

        page = context.pages[0]
        page.on("close", self._handle_page_close)

        await page.goto("https://google.com")

        return browser, page

    async def _handle_new_page(self, page: Page):
        """Handle the creation of a new page by making it the active page."""
        print("New page created")
        # NOTE(review): request routes installed on the previous page are not
        # visible on this one - confirm whether the new page needs the same
        # URL filtering applied.
        self._page = page
        page.on("close", self._handle_page_close)

    async def _handle_page_close(self, page: Page):
        """Handle the closure of a page.

        When the active page closes, fall back to the most recently listed
        page of the first browser context, or to no page at all.
        """
        print("Page closed")
        if self._page == page:
            remaining_pages = self._browser.contexts[0].pages
            if remaining_pages:
                self._page = remaining_pages[-1]
            else:
                print("Warning: All pages have been closed.")
                self._page = None

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """
        Clean up resources when exiting the context manager.

        Each resource is released even if releasing an earlier one fails.

        Args:
            exc_type: The type of the exception that caused the context to be exited.
            exc_val: The exception instance that caused the context to be exited.
            exc_tb: A traceback object encapsulating the call stack at the point where the exception occurred.
        """
        try:
            if self._page:
                await self._page.close()
        finally:
            try:
                if self._browser:
                    await self._browser.close()
            finally:
                if self._playwright:
                    await self._playwright.stop()

        if self.session:
            print(
                f"Session completed. View replay at https://browserbase.com/sessions/{self.session.id}"
            )

    async def screenshot(self, full_page: bool = False) -> str:
        """
        Capture a screenshot of the current viewport using CDP.

        Args:
            full_page (bool): Accepted for compatibility with the base class.
                When True the standard Playwright screenshot is used instead
                of CDP, because the CDP call below captures the viewport only.

        Returns:
            str: A base64 encoded string of the screenshot.
        """
        await self.ensure_page()

        if full_page:
            return await super().screenshot(full_page=True)

        try:
            # Get CDP session from the page
            cdp_session = await self._page.context.new_cdp_session(self._page)
            try:
                # Capture screenshot using CDP
                result = await cdp_session.send("Page.captureScreenshot", {
                    "format": "png",
                    "fromSurface": True
                })
            finally:
                # One session is opened per screenshot; detach it so sessions
                # do not pile up on the page.
                await cdp_session.detach()

            return result['data']
        except PlaywrightError as error:
            print(f"CDP screenshot failed, falling back to standard screenshot: {error}")
            return await super().screenshot()
@@ -0,0 +1,36 @@
1
+ from typing import Protocol, List, Literal, Dict
2
+
3
+
4
class Computer(Protocol):
    """Defines the 'shape' (methods/properties) our loop expects.

    Implementations are asynchronous context managers. The default
    ``__aenter__`` / ``__aexit__`` below are inherited by classes that
    subclass this protocol explicitly.
    """

    @property
    def environment(self) -> Literal["windows", "mac", "linux", "browser"]: ...

    @property
    def dimensions(self) -> tuple[int, int]: ...

    async def screenshot(self) -> str: ...

    async def click(self, x: int, y: int, button: str = "left") -> None: ...

    async def double_click(self, x: int, y: int) -> None: ...

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: ...

    async def type(self, text: str) -> None: ...

    async def wait(self, ms: int = 1000) -> None: ...

    async def move(self, x: int, y: int) -> None: ...

    async def keypress(self, keys: List[str]) -> None: ...

    async def drag(self, path: List[Dict[str, int]]) -> None: ...

    # An instance method: without `self` it could not be called on an instance.
    async def get_current_url(self) -> str: ...

    async def __aenter__(self) -> 'Computer':
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        # Must not return a truthy value: that would make `async with`
        # silently swallow any exception raised inside the block.
        return None