vibesurf 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of vibesurf might be problematic.
- vibe_surf/__init__.py +12 -0
- vibe_surf/_version.py +34 -0
- vibe_surf/agents/__init__.py +0 -0
- vibe_surf/agents/browser_use_agent.py +1106 -0
- vibe_surf/agents/prompts/__init__.py +1 -0
- vibe_surf/agents/prompts/vibe_surf_prompt.py +176 -0
- vibe_surf/agents/report_writer_agent.py +360 -0
- vibe_surf/agents/vibe_surf_agent.py +1632 -0
- vibe_surf/backend/__init__.py +0 -0
- vibe_surf/backend/api/__init__.py +3 -0
- vibe_surf/backend/api/activity.py +243 -0
- vibe_surf/backend/api/config.py +740 -0
- vibe_surf/backend/api/files.py +322 -0
- vibe_surf/backend/api/models.py +257 -0
- vibe_surf/backend/api/task.py +300 -0
- vibe_surf/backend/database/__init__.py +13 -0
- vibe_surf/backend/database/manager.py +129 -0
- vibe_surf/backend/database/models.py +164 -0
- vibe_surf/backend/database/queries.py +922 -0
- vibe_surf/backend/database/schemas.py +100 -0
- vibe_surf/backend/llm_config.py +182 -0
- vibe_surf/backend/main.py +137 -0
- vibe_surf/backend/migrations/__init__.py +16 -0
- vibe_surf/backend/migrations/init_db.py +303 -0
- vibe_surf/backend/migrations/seed_data.py +236 -0
- vibe_surf/backend/shared_state.py +601 -0
- vibe_surf/backend/utils/__init__.py +7 -0
- vibe_surf/backend/utils/encryption.py +164 -0
- vibe_surf/backend/utils/llm_factory.py +225 -0
- vibe_surf/browser/__init__.py +8 -0
- vibe_surf/browser/agen_browser_profile.py +130 -0
- vibe_surf/browser/agent_browser_session.py +416 -0
- vibe_surf/browser/browser_manager.py +296 -0
- vibe_surf/browser/utils.py +790 -0
- vibe_surf/browser/watchdogs/__init__.py +0 -0
- vibe_surf/browser/watchdogs/action_watchdog.py +291 -0
- vibe_surf/browser/watchdogs/dom_watchdog.py +954 -0
- vibe_surf/chrome_extension/background.js +558 -0
- vibe_surf/chrome_extension/config.js +48 -0
- vibe_surf/chrome_extension/content.js +284 -0
- vibe_surf/chrome_extension/dev-reload.js +47 -0
- vibe_surf/chrome_extension/icons/convert-svg.js +33 -0
- vibe_surf/chrome_extension/icons/logo-preview.html +187 -0
- vibe_surf/chrome_extension/icons/logo.png +0 -0
- vibe_surf/chrome_extension/manifest.json +53 -0
- vibe_surf/chrome_extension/popup.html +134 -0
- vibe_surf/chrome_extension/scripts/api-client.js +473 -0
- vibe_surf/chrome_extension/scripts/main.js +491 -0
- vibe_surf/chrome_extension/scripts/markdown-it.min.js +3 -0
- vibe_surf/chrome_extension/scripts/session-manager.js +599 -0
- vibe_surf/chrome_extension/scripts/ui-manager.js +3687 -0
- vibe_surf/chrome_extension/sidepanel.html +347 -0
- vibe_surf/chrome_extension/styles/animations.css +471 -0
- vibe_surf/chrome_extension/styles/components.css +670 -0
- vibe_surf/chrome_extension/styles/main.css +2307 -0
- vibe_surf/chrome_extension/styles/settings.css +1100 -0
- vibe_surf/cli.py +357 -0
- vibe_surf/controller/__init__.py +0 -0
- vibe_surf/controller/file_system.py +53 -0
- vibe_surf/controller/mcp_client.py +68 -0
- vibe_surf/controller/vibesurf_controller.py +616 -0
- vibe_surf/controller/views.py +37 -0
- vibe_surf/llm/__init__.py +21 -0
- vibe_surf/llm/openai_compatible.py +237 -0
- vibesurf-0.1.0.dist-info/METADATA +97 -0
- vibesurf-0.1.0.dist-info/RECORD +70 -0
- vibesurf-0.1.0.dist-info/WHEEL +5 -0
- vibesurf-0.1.0.dist-info/entry_points.txt +2 -0
- vibesurf-0.1.0.dist-info/licenses/LICENSE +201 -0
- vibesurf-0.1.0.dist-info/top_level.txt +1 -0
vibe_surf/controller/vibesurf_controller.py
@@ -0,0 +1,616 @@

```python
import pdb
import os
import asyncio
import json
import enum
import base64
import mimetypes

from typing import Optional, Type, Callable, Dict, Any, Union, Awaitable, TypeVar
from pydantic import BaseModel
from browser_use.controller.service import Controller
import logging
from browser_use.agent.views import ActionModel, ActionResult
from browser_use.utils import time_execution_sync
from browser_use.filesystem.file_system import FileSystem
from browser_use.browser import BrowserSession
from browser_use.browser.events import UploadFileEvent
from browser_use.observability import observe_debug
from browser_use.controller.views import (
    ClickElementAction,
    CloseTabAction,
    DoneAction,
    GetDropdownOptionsAction,
    GoToUrlAction,
    InputTextAction,
    NoParamsAction,
    ScrollAction,
    SearchGoogleAction,
    SelectDropdownOptionAction,
    SendKeysAction,
    StructuredOutputAction,
    SwitchTabAction,
    UploadFileAction,
)
from browser_use.llm.base import BaseChatModel
from browser_use.llm.messages import UserMessage, ContentPartTextParam, ContentPartImageParam, ImageURL
from browser_use.dom.service import EnhancedDOMTreeNode
from browser_use.browser.views import BrowserError
from browser_use.mcp.client import MCPClient


from vibe_surf.browser.agent_browser_session import AgentBrowserSession
from vibe_surf.controller.views import HoverAction, ExtractionAction, FileExtractionAction
from vibe_surf.controller.mcp_client import VibeSurfMCPClient

logger = logging.getLogger(__name__)

Context = TypeVar('Context')

T = TypeVar('T', bound=BaseModel)


class VibeSurfController(Controller):
    def __init__(self,
                 exclude_actions: list[str] = [],
                 output_model: type[T] | None = None,
                 display_files_in_done_text: bool = True,
                 mcp_server_config: Optional[Dict[str, Any]] = None
                 ):
        super().__init__(exclude_actions=exclude_actions, output_model=output_model,
                         display_files_in_done_text=display_files_in_done_text)
        self._register_browser_actions()
        self.mcp_server_config = mcp_server_config
        self.mcp_clients = {}

    def _register_browser_actions(self):
        """Register custom browser actions"""

        @self.registry.action(
            'Hover over an element',
            param_model=HoverAction,
        )
        async def hover_element(params: HoverAction, browser_session: AgentBrowserSession):
            """Hovers over the element specified by its index from the cached selector map or by XPath."""
            try:
                if params.xpath:
                    # Find element by XPath using CDP
                    cdp_session = await browser_session.get_or_create_cdp_session()
                    result = await cdp_session.cdp_client.send.Runtime.evaluate(
                        params={
                            'expression': f"""
                                (() => {{
                                    const element = document.evaluate('{params.xpath}', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
                                    if (element) {{
                                        const rect = element.getBoundingClientRect();
                                        return {{found: true, x: rect.x + rect.width/2, y: rect.y + rect.height/2}};
                                    }}
                                    return {{found: false}};
                                }})()
                            """,
                            'returnByValue': True,
                        },
                        session_id=cdp_session.session_id,
                    )
                    element_info = result.get('result', {}).get('value', {})
                    if not element_info.get('found'):
                        raise Exception(f'Failed to locate element with XPath {params.xpath}')
                    x, y = element_info['x'], element_info['y']

                elif params.selector:
                    # Find element by CSS selector using CDP
                    cdp_session = await browser_session.get_or_create_cdp_session()
                    result = await cdp_session.cdp_client.send.Runtime.evaluate(
                        params={
                            'expression': f"""
                                (() => {{
                                    const element = document.querySelector('{params.selector}');
                                    if (element) {{
                                        const rect = element.getBoundingClientRect();
                                        return {{found: true, x: rect.x + rect.width/2, y: rect.y + rect.height/2}};
                                    }}
                                    return {{found: false}};
                                }})()
                            """,
                            'returnByValue': True,
                        },
                        session_id=cdp_session.session_id,
                    )
                    element_info = result.get('result', {}).get('value', {})
                    if not element_info.get('found'):
                        raise Exception(f'Failed to locate element with CSS Selector {params.selector}')
                    x, y = element_info['x'], element_info['y']

                elif params.index is not None:
                    # Use index to locate the element
                    selector_map = await browser_session.get_selector_map()
                    if params.index not in selector_map:
                        raise Exception(
                            f'Element index {params.index} does not exist - retry or use alternative actions')
                    element_node = selector_map[params.index]

                    # Get element position
                    if not element_node.absolute_position:
                        raise Exception(f'Element at index {params.index} has no position information')

                    x = element_node.absolute_position.x + element_node.absolute_position.width / 2
                    y = element_node.absolute_position.y + element_node.absolute_position.height / 2

                else:
                    raise Exception('Either index, xpath, or selector must be provided')

                # Perform hover using CDP mouse events
                cdp_session = await browser_session.get_or_create_cdp_session()

                # Move mouse to the element position
                await cdp_session.cdp_client.send.Input.dispatchMouseEvent(
                    params={
                        'type': 'mouseMoved',
                        'x': x,
                        'y': y,
                    },
                    session_id=cdp_session.session_id,
                )

                # Wait a bit for hover state to trigger
                await asyncio.sleep(0.1)

                msg = (
                    f'🖱️ Hovered over element at index {params.index}'
                    if params.index is not None
                    else f'🖱️ Hovered over element with XPath {params.xpath}'
                    if params.xpath
                    else f'🖱️ Hovered over element with selector {params.selector}'
                )
                return ActionResult(extracted_content=msg, include_in_memory=True)

            except Exception as e:
                error_msg = f'❌ Failed to hover over element: {str(e)}'
                return ActionResult(error=error_msg)

        # =======================
        # NAVIGATION ACTIONS
        # =======================

        @self.registry.action(
            'Search the query in Google, the query should be a search query like humans search in Google, concrete and not vague or super long.',
            param_model=SearchGoogleAction,
        )
        async def search_google(params: SearchGoogleAction, browser_session: AgentBrowserSession):
            search_url = f'https://www.google.com/search?q={params.query}&udm=14'

            try:
                # Use AgentBrowserSession's direct navigation method
                await browser_session.navigate_to_url(search_url, new_tab=False)
                memory = f"Searched Google for '{params.query}'"
                msg = f'🔍 {memory}'
                logger.info(msg)
                return ActionResult(extracted_content=memory, include_in_memory=True, long_term_memory=memory)
            except Exception as e:
                logger.error(f'Failed to search Google: {e}')
                return ActionResult(error=f'Failed to search Google for "{params.query}": {str(e)}')

        @self.registry.action(
            'Navigate to URL, set new_tab=True to open in new tab, False to navigate in current tab',
            param_model=GoToUrlAction
        )
        async def go_to_url(params: GoToUrlAction, browser_session: AgentBrowserSession):
            try:
                # Use AgentBrowserSession's direct navigation method
                await browser_session.navigate_to_url(params.url, new_tab=params.new_tab)

                if params.new_tab:
                    memory = f'Opened new tab with URL {params.url}'
                    msg = f'🔗 Opened new tab with url {params.url}'
                else:
                    memory = f'Navigated to {params.url}'
                    msg = f'🔗 {memory}'

                logger.info(msg)
                return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=memory)
            except Exception as e:
                logger.error(f'❌ Navigation failed: {str(e)}')
                return ActionResult(error=f'Navigation failed: {str(e)}')

        @self.registry.action(
            'Go back',
        )
        async def go_back(browser_session: AgentBrowserSession):
            try:
                cdp_session = await browser_session.get_or_create_cdp_session()
                history = await cdp_session.cdp_client.send.Page.getNavigationHistory(session_id=cdp_session.session_id)
                current_index = history['currentIndex']
                entries = history['entries']

                # Check if we can go back
                if current_index <= 0:
                    memory = msg = '⚠️ Cannot go back - no previous entry in history'
                    logger.info(msg)
                    return ActionResult(extracted_content=memory)

                # Navigate to the previous entry
                previous_entry_id = entries[current_index - 1]['id']
                await cdp_session.cdp_client.send.Page.navigateToHistoryEntry(
                    params={'entryId': previous_entry_id}, session_id=cdp_session.session_id
                )

                # Wait for navigation
                await asyncio.sleep(0.5)
                memory = 'Navigated back'
                msg = f'🔙 {memory}'
                logger.info(msg)
                return ActionResult(extracted_content=memory)
            except Exception as e:
                logger.error(f'Failed to go back: {str(e)}')
                return ActionResult(error=f'Failed to go back: {str(e)}')

        @self.registry.action(
            'Switch tab',
            param_model=SwitchTabAction
        )
        async def switch_tab(params: SwitchTabAction, browser_session: AgentBrowserSession):
            try:
                if params.tab_id:
                    target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
                elif params.url:
                    target_id = await browser_session.get_target_id_from_url(params.url)
                else:
                    target_id = await browser_session.get_most_recently_opened_target_id()

                # Switch to target using CDP
                await browser_session.get_or_create_cdp_session(target_id, focus=True)

                memory = f'Switched to Tab with ID {target_id[-4:]}'
                logger.info(f'🔄 {memory}')
                return ActionResult(extracted_content=memory, include_in_memory=True, long_term_memory=memory)
            except Exception as e:
                logger.error(f'Failed to switch tab: {str(e)}')
                return ActionResult(error=f'Failed to switch to tab {params.tab_id or params.url}: {str(e)}')

        @self.registry.action(
            """Extract structured, semantic data (e.g. product description, price, all information about XYZ) from the current webpage based on a textual query.
This tool takes the entire markdown of the page and extracts the query from it.
Set extract_links=True ONLY if your query requires extracting links/URLs from the page.
Only use this for specific queries for information retrieval from the page. Don't use this to get interactive elements - the tool does not see HTML elements, only the markdown.
Note: Extracting from the same page will yield the same results unless more content is loaded (e.g., through scrolling for dynamic content, or new page is loaded) - so one extraction per page state is sufficient. If you want to scrape a listing of many elements always first scroll a lot until the page end to load everything and then call this tool in the end.
If you called extract_structured_data in the last step and the result was not good (e.g. because of antispam protection), use the current browser state and scrolling to get the information, dont call extract_structured_data again.
""",
            param_model=ExtractionAction
        )
        async def extract_structured_data(
            params: ExtractionAction,
            browser_session: AgentBrowserSession,
            page_extraction_llm: BaseChatModel,
            file_system: FileSystem,
        ):
            try:
                # Use AgentBrowserSession's direct method to get HTML content
                target_id = None
                if params.tab_id:
                    target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
                page_html = await browser_session.get_html_content(target_id)

                # Simple markdown conversion
                import re
                import markdownify

                if params.extract_links:
                    content = markdownify.markdownify(page_html, heading_style='ATX', bullets='-')
                else:
                    content = markdownify.markdownify(page_html, heading_style='ATX', bullets='-', strip=['a'])
                    # Remove all markdown links and images, keep only the text
                    content = re.sub(r'!\[.*?\]\([^)]*\)', '', content, flags=re.MULTILINE | re.DOTALL)  # Remove images
                    content = re.sub(
                        r'\[([^\]]*)\]\([^)]*\)', r'\1', content, flags=re.MULTILINE | re.DOTALL
                    )  # Convert [text](url) -> text

                # Remove weird positioning artifacts
                content = re.sub(r'❓\s*\[\d+\]\s*\w+.*?Position:.*?Size:.*?\n?', '', content,
                                 flags=re.MULTILINE | re.DOTALL)
                content = re.sub(r'Primary: UNKNOWN\n\nNo specific evidence found', '', content,
                                 flags=re.MULTILINE | re.DOTALL)
                content = re.sub(r'UNKNOWN CONFIDENCE', '', content, flags=re.MULTILINE | re.DOTALL)
                content = re.sub(r'!\[\]\(\)', '', content, flags=re.MULTILINE | re.DOTALL)

                # Simple truncation to 30k characters
                if len(content) > 30000:
                    content = content[:30000] + '\n\n... [Content truncated at 30k characters] ...'

                # Simple prompt
                prompt = f"""Extract the requested information from this webpage content.

Query: {params.query}

Webpage Content:
{content}

Provide the extracted information in a clear, structured format."""

                from browser_use.llm.messages import UserMessage

                response = await asyncio.wait_for(
                    page_extraction_llm.ainvoke([UserMessage(content=prompt)]),
                    timeout=120.0,
                )

                extracted_content = f'Query: {params.query}\nExtracted Content:\n{response.completion}'

                # Simple memory handling
                if len(extracted_content) < 1000:
                    memory = extracted_content
                    include_extracted_content_only_once = False
                else:
                    save_result = await file_system.save_extracted_content(extracted_content)
                    current_url = await browser_session.get_current_page_url()
                    memory = (
                        f'Extracted content from {current_url} for query: {params.query}\nContent saved to file system: {save_result}'
                    )
                    include_extracted_content_only_once = True

                logger.info(f'📄 {memory}')
                return ActionResult(
                    extracted_content=extracted_content,
                    include_extracted_content_only_once=include_extracted_content_only_once,
                    long_term_memory=memory,
                )
            except Exception as e:
                logger.debug(f'Error extracting content: {e}')
                raise RuntimeError(str(e))

        @self.registry.action('Read file_name from file system. If this is a file not in Current workspace dir or with a absolute path, Set external_file=True.')
        async def read_file(file_name: str, external_file: bool, file_system: FileSystem):
            result = await file_system.read_file(file_name, external_file=external_file)

            MAX_MEMORY_SIZE = 1000
            if len(result) > MAX_MEMORY_SIZE:
                lines = result.splitlines()
                display = ''
                lines_count = 0
                for line in lines:
                    if len(display) + len(line) < MAX_MEMORY_SIZE:
                        display += line + '\n'
                        lines_count += 1
                    else:
                        break
                remaining_lines = len(lines) - lines_count
                memory = f'{display}{remaining_lines} more lines...' if remaining_lines > 0 else display
            else:
                memory = result
            logger.info(f'💾 {memory}')
            return ActionResult(
                extracted_content=result,
                include_in_memory=True,
                long_term_memory=memory,
                include_extracted_content_only_once=True,
            )

        @self.registry.action(
            'Extract content from a file. Support image files, pdf and more.',
            param_model=FileExtractionAction,
        )
        async def extract_content_from_file(
            params: FileExtractionAction,
            page_extraction_llm: BaseChatModel,
            file_system: FileSystem,
        ):
            try:
                # Get file path
                file_path = params.file_path

                # Check if file exists
                if not os.path.exists(file_path):
                    raise Exception(f'File not found: {file_path}')

                # Determine if file is an image based on MIME type
                mime_type, _ = mimetypes.guess_type(file_path)
                is_image = mime_type and mime_type.startswith('image/')

                if is_image:
                    # Handle image files with LLM vision
                    try:
                        # Read image file and encode to base64
                        with open(file_path, 'rb') as image_file:
                            image_data = image_file.read()
                        image_base64 = base64.b64encode(image_data).decode('utf-8')

                        # Create content parts similar to the user's example
                        content_parts: list[ContentPartTextParam | ContentPartImageParam] = [
                            ContentPartTextParam(text=f"Query: {params.query}")
                        ]

                        # Add the image
                        content_parts.append(
                            ContentPartImageParam(
                                image_url=ImageURL(
                                    url=f'data:{mime_type};base64,{image_base64}',
                                    media_type=mime_type,
                                    detail='high',
                                ),
                            )
                        )

                        # Create user message and invoke LLM
                        user_message = UserMessage(content=content_parts, cache=True)
                        response = await asyncio.wait_for(
                            page_extraction_llm.ainvoke([user_message]),
                            timeout=120.0,
                        )

                        extracted_content = f'File: {file_path}\nQuery: {params.query}\nExtracted Content:\n{response.completion}'

                    except Exception as e:
                        raise Exception(f'Failed to process image file {file_path}: {str(e)}')

                else:
                    # Handle non-image files by reading content
                    try:
                        file_content = await file_system.read_file(file_path, external_file=True)

                        # Create a simple prompt for text extraction
                        prompt = f"""Extract the requested information from this file content.

Query: {params.query}

File: {file_path}
File Content:
{file_content}

Provide the extracted information in a clear, structured format."""

                        response = await asyncio.wait_for(
                            page_extraction_llm.ainvoke([UserMessage(content=prompt)]),
                            timeout=120.0,
                        )

                        extracted_content = f'File: {file_path}\nQuery: {params.query}\nExtracted Content:\n{response.completion}'

                    except Exception as e:
                        raise Exception(f'Failed to read file {file_path}: {str(e)}')

                # Handle memory storage
                if len(extracted_content) < 1000:
                    memory = extracted_content
                    include_extracted_content_only_once = False
                else:
                    save_result = await file_system.save_extracted_content(extracted_content)
                    memory = (
                        f'Extracted content from file {file_path} for query: {params.query}\nContent saved to file system: {save_result}'
                    )
                    include_extracted_content_only_once = True

                logger.info(f'📄 Extracted content from file: {file_path}')
                return ActionResult(
                    extracted_content=extracted_content,
                    include_extracted_content_only_once=include_extracted_content_only_once,
                    long_term_memory=memory,
                )

            except Exception as e:
                logger.debug(f'Error extracting content from file: {e}')
                raise RuntimeError(str(e))

    async def register_mcp_clients(self, mcp_server_config: Optional[Dict[str, Any]] = None):
        self.mcp_server_config = mcp_server_config or self.mcp_server_config
        if self.mcp_server_config:
            await self.unregister_mcp_clients()
            await self.register_mcp_tools()

    async def register_mcp_tools(self):
        """
        Register the MCP tools used by this controller.
        """
        if not self.mcp_server_config:
            return

        # Handle both formats: with or without "mcpServers" key
        mcp_servers = self.mcp_server_config.get('mcpServers', self.mcp_server_config)

        if not mcp_servers:
            return

        for server_name, server_config in mcp_servers.items():
            try:
                logger.info(f'Connecting to MCP server: {server_name}')

                # Create MCP client
                client = VibeSurfMCPClient(
                    server_name=server_name,
                    command=server_config['command'],
                    args=server_config['args'],
                    env=server_config.get('env', None)
                )

                # Connect to the MCP server
                await client.connect(timeout=200)

                # Register tools to controller with prefix
                prefix = f"mcp.{server_name}."
                await client.register_to_controller(
                    controller=self,
                    prefix=prefix
                )

                # Store client for later cleanup
                self.mcp_clients[server_name] = client

                logger.info(f'Successfully registered MCP server: {server_name} with prefix: {prefix}')

            except Exception as e:
                logger.error(f'Failed to register MCP server {server_name}: {str(e)}')
                # Continue with other servers even if one fails

    async def unregister_mcp_clients(self):
        """
        Unregister and disconnect all MCP clients.
        """
        # Disconnect all MCP clients
        for server_name, client in self.mcp_clients.items():
            try:
                logger.info(f'Disconnecting MCP server: {server_name}')
                await client.disconnect()
            except Exception as e:
                logger.error(f'Failed to disconnect MCP server {server_name}: {str(e)}')

        # Remove MCP tools from registry
        try:
            # Get all registered actions
            actions_to_remove = []
            for action_name in list(self.registry.registry.actions.keys()):
                if action_name.startswith('mcp.'):
                    actions_to_remove.append(action_name)

            # Remove MCP actions from registry
            for action_name in actions_to_remove:
                if action_name in self.registry.registry.actions:
                    del self.registry.registry.actions[action_name]
                    logger.info(f'Removed MCP action: {action_name}')

        except Exception as e:
            logger.error(f'Failed to remove MCP actions from registry: {str(e)}')

        # Clear the clients dictionary
        self.mcp_clients.clear()
        logger.info('All MCP clients unregistered and disconnected')

    @observe_debug(ignore_input=True, ignore_output=True, name='act')
    @time_execution_sync('--act')
    async def act(
        self,
        action: ActionModel,
        browser_session: BrowserSession | None = None,
        #
        page_extraction_llm: BaseChatModel | None = None,
        sensitive_data: dict[str, str | dict[str, str]] | None = None,
        available_file_paths: list[str] | None = None,
        file_system: FileSystem | None = None,
        #
        context: Context | None = None,
    ) -> ActionResult:
        """Execute an action"""

        for action_name, params in action.model_dump(exclude_unset=True).items():
            if params is not None:
                try:
                    result = await self.registry.execute_action(
                        action_name=action_name,
                        params=params,
                        browser_session=browser_session,
                        page_extraction_llm=page_extraction_llm,
                        file_system=file_system,
                        sensitive_data=sensitive_data,
                        available_file_paths=available_file_paths,
                        context=context,
                    )
                except Exception as e:
                    result = ActionResult(error=str(e))

                if isinstance(result, str):
                    return ActionResult(extracted_content=result)
                elif isinstance(result, ActionResult):
                    return result
                elif result is None:
                    return ActionResult()
                else:
                    raise ValueError(f'Invalid action result type: {type(result)} of {result}')
        return ActionResult()
```
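The controller above wires MCP servers into the browser_use action registry at runtime. Below is a minimal driver sketch, not part of the package, inferred from the constructor and the `register_mcp_clients`/`unregister_mcp_clients` signatures in this diff; the server name, command, and path are hypothetical placeholders.

```python
import asyncio

from vibe_surf.controller.vibesurf_controller import VibeSurfController


async def main():
    controller = VibeSurfController(
        mcp_server_config={
            'mcpServers': {
                # Hypothetical server entry; 'command', 'args', and 'env' are the
                # keys register_mcp_tools() reads from each entry.
                'filesystem': {
                    'command': 'npx',
                    'args': ['-y', '@modelcontextprotocol/server-filesystem', '/tmp'],
                    'env': None,
                },
            }
        }
    )
    # Connects every configured server and exposes its tools as
    # 'mcp.<server_name>.<tool>' actions in the registry.
    await controller.register_mcp_clients()
    try:
        pass  # hand the controller to an agent; actions run through controller.act(...)
    finally:
        await controller.unregister_mcp_clients()


asyncio.run(main())
```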
vibe_surf/controller/views.py
@@ -0,0 +1,37 @@

```python
from typing import Generic, TypeVar
from pydantic import BaseModel, ConfigDict, Field


class HoverAction(BaseModel):
    """Parameters for hover action"""
    index: int | None = None
    xpath: str | None = None
    selector: str | None = None


class ExtractionAction(BaseModel):
    query: str = Field(
        default="summary this page",
        description='Extraction goal',
    )
    extract_links: bool | None = Field(
        default=False,
        description='Whether to extract links',
    )
    tab_id: str | None = Field(
        default=None,
        min_length=4,
        max_length=4,
        description='exact 4 character Tab ID of the tab for extraction',
    )  # last 4 chars of TargetID


class FileExtractionAction(BaseModel):
    """Parameters for file content extraction action"""
    file_path: str = Field(
        description='Path to the file to extract content from',
    )
    query: str = Field(
        default="Extract and summarize the content from this file",
        description='Query or instruction for content extraction',
    )
```
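These pydantic models are what the registry validates LLM-proposed parameters against. A short usage sketch follows; it is assumed code, not from the package, and the concrete values are hypothetical.

```python
from vibe_surf.controller.views import ExtractionAction, FileExtractionAction, HoverAction

# Hover targets are alternatives: the hover_element action above expects
# exactly one of index / xpath / selector to be set.
hover = HoverAction(selector='#submit-btn')

# tab_id is constrained to exactly 4 characters (the last 4 chars of a CDP
# TargetID); '3f2a' here is a hypothetical value, and 3 or 5 characters
# would fail pydantic validation.
extract = ExtractionAction(query='list all product prices', extract_links=True, tab_id='3f2a')

# query falls back to its default instruction when omitted.
file_extract = FileExtractionAction(file_path='/tmp/report.pdf')
print(file_extract.query)  # "Extract and summarize the content from this file"
```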
vibe_surf/llm/__init__.py
@@ -0,0 +1,21 @@

```python
"""
Vibe Surf LLM implementations.

This module provides LLM implementations for vibe_surf, including:
- ChatOpenAICompatible: OpenAI-compatible implementation with Gemini schema fix support

Example usage:
    from vibe_surf.llm import ChatOpenAICompatible

    # Using with Azure OpenAI for Gemini models
    llm = ChatOpenAICompatible(
        model="gemini-2.5-pro",
        base_url="https://your-endpoint.openai.azure.com/",
        api_key="your-api-key",
        temperature=0,
    )
"""

from vibe_surf.llm.openai_compatible import ChatOpenAICompatible

__all__ = ['ChatOpenAICompatible']
```