vibesurf 0.1.22__py3-none-any.whl → 0.1.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of vibesurf might be problematic. Click here for more details.
- vibe_surf/_version.py +2 -2
- vibe_surf/agents/prompts/vibe_surf_prompt.py +10 -0
- vibe_surf/agents/vibe_surf_agent.py +13 -2
- vibe_surf/backend/api/agent.py +38 -0
- vibe_surf/backend/api/task.py +1 -1
- vibe_surf/backend/main.py +2 -0
- vibe_surf/browser/agent_browser_session.py +5 -5
- vibe_surf/chrome_extension/scripts/api-client.js +5 -0
- vibe_surf/chrome_extension/scripts/main.js +1 -1
- vibe_surf/chrome_extension/scripts/ui-manager.js +397 -20
- vibe_surf/chrome_extension/sidepanel.html +13 -1
- vibe_surf/chrome_extension/styles/input.css +115 -0
- vibe_surf/tools/browser_use_tools.py +0 -90
- vibe_surf/tools/vibesurf_registry.py +52 -0
- vibe_surf/tools/vibesurf_tools.py +954 -5
- vibe_surf/tools/views.py +60 -0
- {vibesurf-0.1.22.dist-info → vibesurf-0.1.23.dist-info}/METADATA +1 -1
- {vibesurf-0.1.22.dist-info → vibesurf-0.1.23.dist-info}/RECORD +22 -20
- {vibesurf-0.1.22.dist-info → vibesurf-0.1.23.dist-info}/WHEEL +0 -0
- {vibesurf-0.1.22.dist-info → vibesurf-0.1.23.dist-info}/entry_points.txt +0 -0
- {vibesurf-0.1.22.dist-info → vibesurf-0.1.23.dist-info}/licenses/LICENSE +0 -0
- {vibesurf-0.1.22.dist-info → vibesurf-0.1.23.dist-info}/top_level.txt +0 -0
|
@@ -8,8 +8,10 @@ import json
|
|
|
8
8
|
import enum
|
|
9
9
|
import base64
|
|
10
10
|
import mimetypes
|
|
11
|
+
from json_repair import repair_json
|
|
11
12
|
from datetime import datetime
|
|
12
13
|
from typing import Optional, Type, Callable, Dict, Any, Union, Awaitable, TypeVar
|
|
14
|
+
from pathvalidate import sanitize_filename
|
|
13
15
|
from pydantic import BaseModel
|
|
14
16
|
from browser_use.tools.service import Controller, Tools, handle_browser_error
|
|
15
17
|
import logging
|
|
@@ -18,18 +20,21 @@ from browser_use.utils import time_execution_sync
|
|
|
18
20
|
from browser_use.filesystem.file_system import FileSystem
|
|
19
21
|
from browser_use.browser import BrowserSession
|
|
20
22
|
from browser_use.llm.base import BaseChatModel
|
|
21
|
-
from browser_use.llm.messages import UserMessage, ContentPartTextParam, ContentPartImageParam, ImageURL
|
|
23
|
+
from browser_use.llm.messages import UserMessage, ContentPartTextParam, ContentPartImageParam, ImageURL, \
|
|
24
|
+
AssistantMessage
|
|
22
25
|
from browser_use.dom.service import EnhancedDOMTreeNode
|
|
23
26
|
from browser_use.browser.views import BrowserError
|
|
24
27
|
from browser_use.mcp.client import MCPClient
|
|
25
|
-
|
|
28
|
+
from browser_use.tools.views import NoParamsAction
|
|
26
29
|
from vibe_surf.browser.agent_browser_session import AgentBrowserSession
|
|
27
30
|
from vibe_surf.tools.views import HoverAction, ExtractionAction, FileExtractionAction, BrowserUseAgentExecution, \
|
|
28
|
-
ReportWriterTask, TodoGenerateAction, TodoModifyAction, VibeSurfDoneAction
|
|
31
|
+
ReportWriterTask, TodoGenerateAction, TodoModifyAction, VibeSurfDoneAction, SkillSearchAction, SkillCrawlAction, \
|
|
32
|
+
SkillSummaryAction, SkillTakeScreenshotAction, SkillDeepResearchAction, SkillCodeAction
|
|
29
33
|
from vibe_surf.tools.mcp_client import CustomMCPClient
|
|
30
34
|
from vibe_surf.tools.file_system import CustomFileSystem
|
|
31
35
|
from vibe_surf.browser.browser_manager import BrowserManager
|
|
32
|
-
|
|
36
|
+
from vibe_surf.tools.vibesurf_registry import VibeSurfRegistry
|
|
37
|
+
from bs4 import BeautifulSoup
|
|
33
38
|
from vibe_surf.logger import get_logger
|
|
34
39
|
|
|
35
40
|
logger = get_logger(__name__)
|
|
@@ -39,17 +44,961 @@ Context = TypeVar('Context')
|
|
|
39
44
|
T = TypeVar('T', bound=BaseModel)
|
|
40
45
|
|
|
41
46
|
|
|
47
|
+
def clean_html_basic(page_html_content, max_text_length=100):
    """Strip noise from raw HTML so it is compact enough to feed to an LLM.

    Removes <script>/<style> elements and HTML comments, collapses and
    truncates long text nodes to *max_text_length* characters, and drops all
    tag attributes except a small allow-list (plus any data-*/aria-* ones).

    Args:
        page_html_content: Raw HTML markup as a string.
        max_text_length: Maximum characters kept per text node before an
            ellipsis is appended.

    Returns:
        The cleaned HTML serialized back to a string.
    """
    from bs4 import Comment

    soup = BeautifulSoup(page_html_content, 'html.parser')

    # Script/style content is never useful text for the LLM - drop it entirely.
    for element in soup(["script", "style"]):
        element.decompose()

    # Remove HTML comments. BUGFIX: use the modern find_all(string=...) API;
    # findAll(text=...) is the deprecated BeautifulSoup 3-era spelling.
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Collapse whitespace runs and truncate overly long text nodes.
    for text_node in soup.find_all(string=True):
        if text_node.parent.name not in ['script', 'style']:
            clean_text = ' '.join(text_node.split())

            if len(clean_text) > max_text_length:
                clean_text = clean_text[:max_text_length].rstrip() + "..."

            if clean_text != text_node:
                text_node.replace_with(clean_text)

    # Attributes worth keeping for structure/semantics; everything else is noise.
    important_attrs = ['id', 'class', 'name', 'role', 'type',
                       'colspan', 'rowspan', 'headers', 'scope',
                       'href', 'src', 'alt', 'title']

    for tag in soup.find_all():
        attrs_to_keep = {}
        for attr in list(tag.attrs.keys()):
            if (attr in important_attrs or
                    attr.startswith('data-') or
                    attr.startswith('aria-')):
                attrs_to_keep[attr] = tag.attrs[attr]
        tag.attrs = attrs_to_keep

    return str(soup)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def get_sibling_position(node: EnhancedDOMTreeNode) -> int:
    """Return the 1-based position of *node* among same-tag siblings.

    Mirrors XPath-style indexing: counts how many earlier siblings share the
    node's tag name and returns that count plus one. A node without a parent
    is treated as the first (and only) element.
    """
    parent = node.parent_node
    if not parent:
        return 1

    same_tag_before = 0
    for sibling in parent.children:
        if sibling == node:
            # Only siblings *before* this node count toward its position.
            break
        if sibling.tag_name == node.tag_name:
            same_tag_before += 1

    return same_tag_before + 1
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def extract_css_hints(node: EnhancedDOMTreeNode) -> dict:
    """Build a dict of CSS-selector fragments that can identify *node*.

    Produces an ``#id`` selector when an id exists, a chained class selector
    limited to the first three classes, and ``[attr='value']`` selectors for a
    few high-signal attributes.
    """
    attrs = node.attributes
    hints = {}

    if "id" in attrs:
        hints["id"] = f"#{attrs['id']}"

    if "class" in attrs:
        class_names = attrs["class"].split()
        hints["class"] = f".{'.'.join(class_names[:3])}"  # Limit class count

    # Attribute selector hints
    for attr_name in ["name", "data-testid", "type"]:
        if attr_name in attrs:
            hints[f"attr_{attr_name}"] = f"[{attr_name}='{attrs[attr_name]}']"

    return hints
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def convert_selector_map_for_llm(selector_map) -> dict:
    """
    Convert complex selector_map to simplified format suitable for LLM understanding and JS code writing
    """
    simplified_elements = []

    for element_index, node in selector_map.items():
        # Only visible nodes that carry an interactive element index matter.
        if not (node.is_visible and node.element_index is not None):
            continue

        parent = node.parent_node
        simplified_elements.append({
            "tag": node.tag_name,
            "text": node.get_meaningful_text_for_llm()[:200],  # Limit text length

            # Selector information - most needed for JS code
            "selectors": {
                "xpath": node.xpath,
                "css_hints": extract_css_hints(node),  # Extract id, class etc
            },

            # Element semantics
            "role": node.ax_node.role if node.ax_node else None,
            "type": node.attributes.get("type"),
            "aria_label": node.attributes.get("aria-label"),

            # Key attributes
            "attributes": {k: v for k, v in node.attributes.items()
                           if k in ["id", "class", "name", "href", "src", "value", "placeholder", "data-testid"]},

            # Interactivity
            "is_clickable": node.snapshot_node.is_clickable if node.snapshot_node else False,
            "is_input": node.tag_name.lower() in ["input", "textarea", "select"],

            # Structure information
            "parent_tag": parent.tag_name if parent else None,
            "position_info": f"{node.tag_name}[{get_sibling_position(node)}]"
        })

    return {
        "page_elements": simplified_elements,
        "total_elements": len(simplified_elements)
    }
|
|
162
|
+
|
|
163
|
+
|
|
42
164
|
class VibeSurfTools:
|
|
43
165
|
def __init__(self, exclude_actions: list[str] = [], mcp_server_config: Optional[Dict[str, Any]] = None):
|
|
44
|
-
self.registry =
|
|
166
|
+
self.registry = VibeSurfRegistry(exclude_actions)
|
|
45
167
|
self._register_file_actions()
|
|
46
168
|
self._register_browser_use_agent()
|
|
47
169
|
self._register_report_writer_agent()
|
|
48
170
|
self._register_todo_actions()
|
|
49
171
|
self._register_done_action()
|
|
172
|
+
self._register_skills()
|
|
50
173
|
self.mcp_server_config = mcp_server_config
|
|
51
174
|
self.mcp_clients: Dict[str, MCPClient] = {}
|
|
52
175
|
|
|
176
|
+
def _register_skills(self):
|
|
177
|
+
@self.registry.action(
|
|
178
|
+
'Skill: Advanced parallel search - analyze user intent and generate 5 different search tasks, perform parallel Google searches, and return top 10 most relevant results',
|
|
179
|
+
param_model=SkillSearchAction,
|
|
180
|
+
)
|
|
181
|
+
async def skill_search(
|
|
182
|
+
params: SkillSearchAction,
|
|
183
|
+
browser_manager: BrowserManager,
|
|
184
|
+
page_extraction_llm: BaseChatModel
|
|
185
|
+
):
|
|
186
|
+
"""
|
|
187
|
+
Skill: Advanced parallel search with LLM-generated search strategies
|
|
188
|
+
"""
|
|
189
|
+
llm = page_extraction_llm
|
|
190
|
+
agent_ids = []
|
|
191
|
+
try:
|
|
192
|
+
if not llm:
|
|
193
|
+
raise RuntimeError("LLM is required for skill_search")
|
|
194
|
+
|
|
195
|
+
# Step 1: Use LLM to analyze user intent and generate different search tasks
|
|
196
|
+
from datetime import datetime
|
|
197
|
+
analysis_prompt = f"""
|
|
198
|
+
Analyze the user query and generate 5 different Google search strategies to comprehensively find relevant information.
|
|
199
|
+
|
|
200
|
+
Current Time: {datetime.now().isoformat()}
|
|
201
|
+
|
|
202
|
+
User Query: "{params.query}"
|
|
203
|
+
|
|
204
|
+
Generate 5 different search queries that approach this topic from different angles. Each search should be:
|
|
205
|
+
1. Specific and concrete (good for Google search)
|
|
206
|
+
2. Different from the others (different perspectives/aspects)
|
|
207
|
+
3. Likely to return valuable, unique information
|
|
208
|
+
|
|
209
|
+
Return your response as a JSON array of 5 search query strings.
|
|
210
|
+
Example format: ["query 1", "query 2", "query 3", "query 4", "query 5"]
|
|
211
|
+
"""
|
|
212
|
+
|
|
213
|
+
from browser_use.llm.messages import SystemMessage, UserMessage
|
|
214
|
+
response = await llm.ainvoke([
|
|
215
|
+
SystemMessage(content="You are an expert at generating comprehensive search strategies."),
|
|
216
|
+
UserMessage(content=analysis_prompt)
|
|
217
|
+
])
|
|
218
|
+
|
|
219
|
+
# Parse the search queries
|
|
220
|
+
import json
|
|
221
|
+
try:
|
|
222
|
+
search_queries = json.loads(response.completion.strip())
|
|
223
|
+
if not isinstance(search_queries, list):
|
|
224
|
+
raise ValueError("Invalid search queries format")
|
|
225
|
+
search_queries = search_queries[:5]
|
|
226
|
+
except (json.JSONDecodeError, ValueError):
|
|
227
|
+
# Fallback to simple queries if parsing fails
|
|
228
|
+
try:
|
|
229
|
+
from json_repair import repair_json
|
|
230
|
+
search_queries = repair_json(response.completion.strip())
|
|
231
|
+
except Exception as e:
|
|
232
|
+
search_queries = [
|
|
233
|
+
params.query,
|
|
234
|
+
f"{params.query} guide",
|
|
235
|
+
f"{params.query} best practices",
|
|
236
|
+
f"{params.query} examples",
|
|
237
|
+
f"{params.query} latest news"
|
|
238
|
+
]
|
|
239
|
+
|
|
240
|
+
# Step 2: Create browser sessions for parallel searching
|
|
241
|
+
register_sessions = []
|
|
242
|
+
|
|
243
|
+
for i, query in enumerate(search_queries):
|
|
244
|
+
agent_id = f"search_agent_{i + 1:03d}"
|
|
245
|
+
register_sessions.append(
|
|
246
|
+
browser_manager.register_agent(agent_id, target_id=None)
|
|
247
|
+
)
|
|
248
|
+
agent_ids.append(agent_id)
|
|
249
|
+
|
|
250
|
+
agent_browser_sessions = await asyncio.gather(*register_sessions)
|
|
251
|
+
|
|
252
|
+
# Step 3: Perform parallel Google searches
|
|
253
|
+
search_tasks = []
|
|
254
|
+
for i, (browser_session, query) in enumerate(zip(agent_browser_sessions, search_queries)):
|
|
255
|
+
search_tasks.append(self._perform_google_search(browser_session, query, llm))
|
|
256
|
+
|
|
257
|
+
search_results = await asyncio.gather(*search_tasks, return_exceptions=True)
|
|
258
|
+
|
|
259
|
+
# Step 4: Aggregate and filter results
|
|
260
|
+
all_results = []
|
|
261
|
+
for i, result in enumerate(search_results):
|
|
262
|
+
if isinstance(result, Exception):
|
|
263
|
+
logger.error(f"Search task {i + 1} failed: {result}")
|
|
264
|
+
continue
|
|
265
|
+
if result:
|
|
266
|
+
all_results.extend(result)
|
|
267
|
+
|
|
268
|
+
# Step 5: Use LLM to deduplicate and rank top 10 results
|
|
269
|
+
if all_results:
|
|
270
|
+
ranking_prompt = f"""
|
|
271
|
+
Given these search results for the query "{params.query}", please:
|
|
272
|
+
1. Remove duplicates (same or very similar content)
|
|
273
|
+
2. Rank by relevance and value to the user
|
|
274
|
+
3. Select the TOP 10 most relevant and valuable results
|
|
275
|
+
|
|
276
|
+
Search Results:
|
|
277
|
+
{json.dumps(all_results, indent=2)}
|
|
278
|
+
|
|
279
|
+
Return the top 10 results as a JSON array, with each result containing:
|
|
280
|
+
- title: string
|
|
281
|
+
- url: string
|
|
282
|
+
- summary: string (brief description of why this result is valuable)
|
|
283
|
+
|
|
284
|
+
Format: [{{"title": "...", "url": "...", "summary": "..."}}, ...]
|
|
285
|
+
"""
|
|
286
|
+
|
|
287
|
+
ranking_response = await llm.ainvoke([
|
|
288
|
+
SystemMessage(
|
|
289
|
+
content="You are an expert at evaluating and ranking search results for relevance and value."),
|
|
290
|
+
UserMessage(content=ranking_prompt)
|
|
291
|
+
])
|
|
292
|
+
|
|
293
|
+
try:
|
|
294
|
+
top_results = json.loads(ranking_response.completion.strip())
|
|
295
|
+
if not isinstance(top_results, list):
|
|
296
|
+
raise ValueError("Invalid ranking results format")
|
|
297
|
+
except (json.JSONDecodeError, ValueError):
|
|
298
|
+
# Fallback to first 10 results if ranking fails
|
|
299
|
+
top_results = all_results[:10]
|
|
300
|
+
else:
|
|
301
|
+
top_results = []
|
|
302
|
+
|
|
303
|
+
# Format results for display
|
|
304
|
+
if top_results:
|
|
305
|
+
results_text = f"🔍 Advanced Search Results for '{params.query}':\n\n"
|
|
306
|
+
for i, result in enumerate(top_results[:10]):
|
|
307
|
+
title = result.get('title', 'Unknown Title')
|
|
308
|
+
url = result.get('url', 'No URL')
|
|
309
|
+
summary = result.get('summary', 'No summary available')
|
|
310
|
+
results_text += f"{i}. **{title}**\n URL: {url}\n Summary: {summary}\n\n"
|
|
311
|
+
else:
|
|
312
|
+
results_text = f"No results found for query: {params.query}"
|
|
313
|
+
|
|
314
|
+
logger.info(f'🔍 Skill Search completed for: {params.query}')
|
|
315
|
+
return ActionResult(
|
|
316
|
+
extracted_content=results_text,
|
|
317
|
+
include_extracted_content_only_once=True,
|
|
318
|
+
long_term_memory=f'Advanced search completed for: {params.query}, found {len(top_results)} relevant results',
|
|
319
|
+
)
|
|
320
|
+
|
|
321
|
+
except Exception as e:
|
|
322
|
+
logger.error(f'❌ Skill Search failed: {e}')
|
|
323
|
+
return ActionResult(error=f'Skill search failed: {str(e)}')
|
|
324
|
+
finally:
|
|
325
|
+
for i, agent_id in enumerate(agent_ids):
|
|
326
|
+
await browser_manager.unregister_agent(agent_id, close_tabs=True)
|
|
327
|
+
|
|
328
|
+
@self.registry.action(
|
|
329
|
+
'Skill: Crawl a web page and extract structured information from a webpage with optional tab selection',
|
|
330
|
+
param_model=SkillCrawlAction,
|
|
331
|
+
)
|
|
332
|
+
async def skill_crawl(
|
|
333
|
+
params: SkillCrawlAction,
|
|
334
|
+
browser_manager: BrowserManager,
|
|
335
|
+
page_extraction_llm: BaseChatModel
|
|
336
|
+
):
|
|
337
|
+
"""
|
|
338
|
+
Skill: Extract structured content from current or specified webpage
|
|
339
|
+
"""
|
|
340
|
+
llm = page_extraction_llm
|
|
341
|
+
try:
|
|
342
|
+
if not llm:
|
|
343
|
+
raise RuntimeError("LLM is required for skill_crawl")
|
|
344
|
+
|
|
345
|
+
# Get browser session
|
|
346
|
+
browser_session = browser_manager.main_browser_session
|
|
347
|
+
|
|
348
|
+
# If tab_id is provided, switch to that tab
|
|
349
|
+
if params.tab_id:
|
|
350
|
+
target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
|
|
351
|
+
await browser_session.get_or_create_cdp_session(target_id, focus=True)
|
|
352
|
+
|
|
353
|
+
# Extract structured content using the existing method
|
|
354
|
+
extracted_content = await self._extract_structured_content(
|
|
355
|
+
browser_session, params.query, llm
|
|
356
|
+
)
|
|
357
|
+
|
|
358
|
+
current_url = await browser_session.get_current_page_url()
|
|
359
|
+
result_text = f'### URL:{current_url}\n\n{extracted_content}'
|
|
360
|
+
|
|
361
|
+
# Handle memory storage
|
|
362
|
+
MAX_MEMORY_LENGTH = 1000
|
|
363
|
+
if len(result_text) < MAX_MEMORY_LENGTH:
|
|
364
|
+
memory = result_text
|
|
365
|
+
include_extracted_content_only_once = False
|
|
366
|
+
else:
|
|
367
|
+
memory = f'Extracted structured content from {current_url} for query: {params.query}'
|
|
368
|
+
include_extracted_content_only_once = True
|
|
369
|
+
|
|
370
|
+
logger.info(f'📄 Skill Crawl completed for: {current_url}')
|
|
371
|
+
return ActionResult(
|
|
372
|
+
extracted_content=result_text,
|
|
373
|
+
include_extracted_content_only_once=include_extracted_content_only_once,
|
|
374
|
+
long_term_memory=memory,
|
|
375
|
+
)
|
|
376
|
+
|
|
377
|
+
except Exception as e:
|
|
378
|
+
logger.error(f'❌ Skill Crawl failed: {e}')
|
|
379
|
+
return ActionResult(error=f'Skill crawl failed: {str(e)}')
|
|
380
|
+
|
|
381
|
+
@self.registry.action(
|
|
382
|
+
'Skill: Summarize webpage content with optional tab selection',
|
|
383
|
+
param_model=SkillSummaryAction,
|
|
384
|
+
)
|
|
385
|
+
async def skill_summary(
|
|
386
|
+
params: SkillSummaryAction,
|
|
387
|
+
browser_manager: BrowserManager,
|
|
388
|
+
page_extraction_llm: BaseChatModel
|
|
389
|
+
):
|
|
390
|
+
"""
|
|
391
|
+
Skill: Summarize webpage content using LLM
|
|
392
|
+
"""
|
|
393
|
+
llm = page_extraction_llm
|
|
394
|
+
try:
|
|
395
|
+
if not llm:
|
|
396
|
+
raise RuntimeError("LLM is required for skill_summary")
|
|
397
|
+
|
|
398
|
+
# Get browser session
|
|
399
|
+
browser_session = browser_manager.main_browser_session
|
|
400
|
+
|
|
401
|
+
# If tab_id is provided, switch to that tab
|
|
402
|
+
if params.tab_id:
|
|
403
|
+
target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
|
|
404
|
+
await browser_session.get_or_create_cdp_session(target_id, focus=True)
|
|
405
|
+
|
|
406
|
+
# Extract and summarize content
|
|
407
|
+
summary = await self._extract_structured_content(
|
|
408
|
+
browser_session, "Provide a comprehensive summary of this webpage", llm
|
|
409
|
+
)
|
|
410
|
+
|
|
411
|
+
current_url = await browser_session.get_current_page_url()
|
|
412
|
+
result_text = f'📝 Summary of {current_url}:\n\n{summary}'
|
|
413
|
+
|
|
414
|
+
# Handle memory storage
|
|
415
|
+
MAX_MEMORY_LENGTH = 1000
|
|
416
|
+
if len(result_text) < MAX_MEMORY_LENGTH:
|
|
417
|
+
memory = result_text
|
|
418
|
+
include_extracted_content_only_once = False
|
|
419
|
+
else:
|
|
420
|
+
memory = f'Summarized webpage: {current_url}'
|
|
421
|
+
include_extracted_content_only_once = True
|
|
422
|
+
|
|
423
|
+
logger.info(f'📝 Skill Summary completed for: {current_url}')
|
|
424
|
+
return ActionResult(
|
|
425
|
+
extracted_content=result_text,
|
|
426
|
+
include_extracted_content_only_once=include_extracted_content_only_once,
|
|
427
|
+
long_term_memory=memory,
|
|
428
|
+
)
|
|
429
|
+
|
|
430
|
+
except Exception as e:
|
|
431
|
+
logger.error(f'❌ Skill Summary failed: {e}')
|
|
432
|
+
return ActionResult(error=f'Skill summary failed: {str(e)}')
|
|
433
|
+
|
|
434
|
+
@self.registry.action(
|
|
435
|
+
'Skill: Take screenshot of current page or specified tab',
|
|
436
|
+
param_model=SkillTakeScreenshotAction,
|
|
437
|
+
)
|
|
438
|
+
async def skill_screenshot(
|
|
439
|
+
params: SkillTakeScreenshotAction,
|
|
440
|
+
browser_manager: BrowserManager,
|
|
441
|
+
file_system: CustomFileSystem
|
|
442
|
+
):
|
|
443
|
+
"""
|
|
444
|
+
Skill: Take screenshot with optional tab selection
|
|
445
|
+
"""
|
|
446
|
+
try:
|
|
447
|
+
# Get browser session
|
|
448
|
+
browser_session = browser_manager.main_browser_session
|
|
449
|
+
|
|
450
|
+
# If tab_id is provided, switch to that tab
|
|
451
|
+
if params.tab_id:
|
|
452
|
+
target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
|
|
453
|
+
await browser_session.get_or_create_cdp_session(target_id, focus=True)
|
|
454
|
+
|
|
455
|
+
# Take screenshot using browser session
|
|
456
|
+
screenshot = await browser_session.take_screenshot()
|
|
457
|
+
|
|
458
|
+
# Generate timestamp for filename
|
|
459
|
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
460
|
+
|
|
461
|
+
# Get file system directory path (Path type)
|
|
462
|
+
fs_dir = file_system.get_dir()
|
|
463
|
+
|
|
464
|
+
# Create screenshots directory if it doesn't exist
|
|
465
|
+
screenshots_dir = fs_dir / "screenshots"
|
|
466
|
+
screenshots_dir.mkdir(exist_ok=True)
|
|
467
|
+
|
|
468
|
+
# Save screenshot to file system
|
|
469
|
+
page_title = await browser_session.get_current_page_title()
|
|
470
|
+
from pathvalidate import sanitize_filename
|
|
471
|
+
page_title = sanitize_filename(page_title)
|
|
472
|
+
filename = f"{page_title}-{timestamp}.png"
|
|
473
|
+
filepath = screenshots_dir / filename
|
|
474
|
+
|
|
475
|
+
with open(filepath, "wb") as f:
|
|
476
|
+
f.write(base64.b64decode(screenshot))
|
|
477
|
+
|
|
478
|
+
msg = f'📸 Screenshot saved to path: {str(filepath.relative_to(fs_dir))}'
|
|
479
|
+
logger.info(msg)
|
|
480
|
+
return ActionResult(
|
|
481
|
+
extracted_content=msg,
|
|
482
|
+
include_in_memory=True,
|
|
483
|
+
long_term_memory=f'Screenshot saved to {str(filepath.relative_to(fs_dir))}',
|
|
484
|
+
)
|
|
485
|
+
|
|
486
|
+
except Exception as e:
|
|
487
|
+
error_msg = f'❌ Failed to take screenshot: {str(e)}'
|
|
488
|
+
logger.error(error_msg)
|
|
489
|
+
return ActionResult(error=error_msg)
|
|
490
|
+
|
|
491
|
+
@self.registry.action(
|
|
492
|
+
'Skill: Execute JavaScript code on webpage with optional tab selection - accepts functional requirements, code prompts, or code snippets that will be processed by LLM to generate proper executable JavaScript',
|
|
493
|
+
param_model=SkillCodeAction,
|
|
494
|
+
)
|
|
495
|
+
async def skill_code(
|
|
496
|
+
params: SkillCodeAction,
|
|
497
|
+
browser_manager: BrowserManager,
|
|
498
|
+
page_extraction_llm: BaseChatModel,
|
|
499
|
+
):
|
|
500
|
+
"""
|
|
501
|
+
Skill: Generate and execute JavaScript code from functional requirements or code prompts with iterative retry logic
|
|
502
|
+
"""
|
|
503
|
+
MAX_ITERATIONS = 5
|
|
504
|
+
|
|
505
|
+
try:
|
|
506
|
+
if not page_extraction_llm:
|
|
507
|
+
raise RuntimeError("LLM is required for skill_code")
|
|
508
|
+
|
|
509
|
+
# Get browser session
|
|
510
|
+
browser_session = browser_manager.main_browser_session
|
|
511
|
+
|
|
512
|
+
# If tab_id is provided, switch to that tab
|
|
513
|
+
if params.tab_id:
|
|
514
|
+
target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
|
|
515
|
+
await browser_session.get_or_create_cdp_session(target_id, focus=True)
|
|
516
|
+
|
|
517
|
+
# Get browser state and convert for LLM
|
|
518
|
+
# browser_state = await browser_session.get_browser_state_summary()
|
|
519
|
+
# web_page_description = browser_state.dom_state.llm_representation()
|
|
520
|
+
|
|
521
|
+
page_html_content = await browser_session.get_html_content()
|
|
522
|
+
web_page_html = clean_html_basic(page_html_content)
|
|
523
|
+
if len(web_page_html) > 30000:
|
|
524
|
+
web_page_html = web_page_html[:24000] + "..." + web_page_html[-6000:]
|
|
525
|
+
|
|
526
|
+
# Get current page URL for context
|
|
527
|
+
current_url = await browser_session.get_current_page_url()
|
|
528
|
+
|
|
529
|
+
# Create base system prompt for JavaScript code generation
|
|
530
|
+
base_system_prompt = """You are an expert JavaScript developer specializing in browser automation and DOM manipulation.
|
|
531
|
+
|
|
532
|
+
You will be given a functional requirement or code prompt, along with the current page's DOM structure information.
|
|
533
|
+
Your task is to generate valid, executable JavaScript code that accomplishes the specified requirement.
|
|
534
|
+
|
|
535
|
+
IMPORTANT GUIDELINES:
|
|
536
|
+
This JavaScript code gets executed with Runtime.evaluate and 'returnByValue': True, 'awaitPromise': True
|
|
537
|
+
|
|
538
|
+
SYNTAX RULES - FAILURE TO FOLLOW CAUSES "Uncaught at line 0" ERRORS:
|
|
539
|
+
- ALWAYS wrap your code in IIFE: (function(){ ... })() or (async function(){ ... })() for async code
|
|
540
|
+
- ALWAYS add try-catch blocks to prevent execution errors
|
|
541
|
+
- ALWAYS use proper semicolons and valid JavaScript syntax
|
|
542
|
+
- NEVER write multiline code without proper IIFE wrapping
|
|
543
|
+
- ALWAYS validate elements exist before accessing them
|
|
544
|
+
|
|
545
|
+
EXAMPLES:
|
|
546
|
+
Use this tool when other tools do not work on the first try as expected or when a more general tool is needed, e.g. for filling a form all at once, hovering, dragging, extracting only links, extracting content from the page, press and hold, hovering, clicking on coordinates, zooming, use this if the user provides custom selectors which you can otherwise not interact with ....
|
|
547
|
+
You can also use it to explore the website.
|
|
548
|
+
- Write code to solve problems you could not solve with other tools.
|
|
549
|
+
- Don't write comments in here, no human reads that.
|
|
550
|
+
- Write only valid js code.
|
|
551
|
+
- use this to e.g. extract + filter links, convert the page to json into the format you need etc...
|
|
552
|
+
|
|
553
|
+
- limit the output otherwise your context will explode
|
|
554
|
+
- think if you deal with special elements like iframes / shadow roots etc
|
|
555
|
+
- Adopt your strategy for React Native Web, React, Angular, Vue, MUI pages etc.
|
|
556
|
+
- e.g. with synthetic events, keyboard simulation, shadow DOM, etc.
|
|
557
|
+
|
|
558
|
+
PROPER SYNTAX EXAMPLES:
|
|
559
|
+
CORRECT: (function(){ try { const el = document.querySelector('#id'); return el ? el.value : 'not found'; } catch(e) { return 'Error: ' + e.message; } })()
|
|
560
|
+
CORRECT: (async function(){ try { await new Promise(r => setTimeout(r, 100)); return 'done'; } catch(e) { return 'Error: ' + e.message; } })()
|
|
561
|
+
|
|
562
|
+
WRONG: const el = document.querySelector('#id'); el ? el.value : '';
|
|
563
|
+
WRONG: document.querySelector('#id').value
|
|
564
|
+
WRONG: Multiline code without IIFE wrapping
|
|
565
|
+
|
|
566
|
+
SHADOW DOM ACCESS EXAMPLE:
|
|
567
|
+
(function(){
|
|
568
|
+
try {
|
|
569
|
+
const hosts = document.querySelectorAll('*');
|
|
570
|
+
for (let host of hosts) {
|
|
571
|
+
if (host.shadowRoot) {
|
|
572
|
+
const el = host.shadowRoot.querySelector('#target');
|
|
573
|
+
if (el) return el.textContent;
|
|
574
|
+
}
|
|
575
|
+
}
|
|
576
|
+
return 'Not found';
|
|
577
|
+
} catch(e) {
|
|
578
|
+
return 'Error: ' + e.message;
|
|
579
|
+
}
|
|
580
|
+
})()
|
|
581
|
+
|
|
582
|
+
## Return values:
|
|
583
|
+
- Async functions (with await, promises, timeouts) are automatically handled
|
|
584
|
+
- Returns strings, numbers, booleans, and serialized objects/arrays
|
|
585
|
+
- Use JSON.stringify() for complex objects: JSON.stringify(Array.from(document.querySelectorAll('a')).map(el => el.textContent.trim()))
|
|
586
|
+
|
|
587
|
+
OUTPUT FORMAT:
|
|
588
|
+
Return ONLY the JavaScript code, no explanations or markdown formatting."""
|
|
589
|
+
|
|
590
|
+
# Initialize message history for iterative prompting
|
|
591
|
+
from browser_use.llm.messages import SystemMessage, UserMessage
|
|
592
|
+
message_history = [SystemMessage(content=base_system_prompt)]
|
|
593
|
+
|
|
594
|
+
# Initial user prompt
|
|
595
|
+
initial_user_prompt = f"""Current Page URL: {current_url}
|
|
596
|
+
|
|
597
|
+
USER REQUIREMENT: {params.code_requirement}
|
|
598
|
+
|
|
599
|
+
Web Page Html Content:
|
|
600
|
+
{web_page_html}
|
|
601
|
+
|
|
602
|
+
Generate JavaScript code to fulfill the requirement:"""
|
|
603
|
+
|
|
604
|
+
message_history.append(UserMessage(content=initial_user_prompt))
|
|
605
|
+
|
|
606
|
+
# Get CDP session for JavaScript execution
|
|
607
|
+
cdp_session = await browser_session.get_or_create_cdp_session()
|
|
608
|
+
|
|
609
|
+
# Iterative code generation and execution
|
|
610
|
+
for iteration in range(1, MAX_ITERATIONS + 1):
|
|
611
|
+
try:
|
|
612
|
+
logger.info(f'🔄 Skill Code iteration {iteration}/{MAX_ITERATIONS}')
|
|
613
|
+
|
|
614
|
+
# Generate JavaScript code using LLM with message history
|
|
615
|
+
response = await asyncio.wait_for(
|
|
616
|
+
page_extraction_llm.ainvoke(message_history),
|
|
617
|
+
timeout=60.0,
|
|
618
|
+
)
|
|
619
|
+
|
|
620
|
+
generated_js_code = response.completion.strip()
|
|
621
|
+
message_history.append(AssistantMessage(content=generated_js_code))
|
|
622
|
+
|
|
623
|
+
# Clean up the generated code (remove markdown if present)
|
|
624
|
+
if generated_js_code.startswith('```javascript'):
|
|
625
|
+
generated_js_code = generated_js_code.replace('```javascript', '').replace('```',
|
|
626
|
+
'').strip()
|
|
627
|
+
elif generated_js_code.startswith('```js'):
|
|
628
|
+
generated_js_code = generated_js_code.replace('```js', '').replace('```', '').strip()
|
|
629
|
+
elif generated_js_code.startswith('```'):
|
|
630
|
+
generated_js_code = generated_js_code.replace('```', '').strip()
|
|
631
|
+
|
|
632
|
+
# Execute the generated JavaScript code
|
|
633
|
+
try:
|
|
634
|
+
logger.info(generated_js_code)
|
|
635
|
+
# Always use awaitPromise=True - it's ignored for non-promises
|
|
636
|
+
result = await cdp_session.cdp_client.send.Runtime.evaluate(
|
|
637
|
+
params={'expression': generated_js_code, 'returnByValue': True, 'awaitPromise': True},
|
|
638
|
+
session_id=cdp_session.session_id,
|
|
639
|
+
)
|
|
640
|
+
|
|
641
|
+
logger.info(result)
|
|
642
|
+
# Check for JavaScript execution errors
|
|
643
|
+
if result.get('exceptionDetails'):
|
|
644
|
+
exception = result['exceptionDetails']
|
|
645
|
+
error_msg = f'JavaScript execution error: {exception.get("text", "Unknown error")}'
|
|
646
|
+
if 'lineNumber' in exception:
|
|
647
|
+
error_msg += f' at line {exception["lineNumber"]}'
|
|
648
|
+
|
|
649
|
+
# Add error feedback to message history for next iteration
|
|
650
|
+
if iteration < MAX_ITERATIONS:
|
|
651
|
+
error_feedback = f"""The previous JavaScript code failed with error:
|
|
652
|
+
{error_msg}
|
|
653
|
+
|
|
654
|
+
Please fix the error and generate corrected JavaScript code:"""
|
|
655
|
+
message_history.append(UserMessage(content=error_feedback))
|
|
656
|
+
continue # Try next iteration
|
|
657
|
+
else:
|
|
658
|
+
# Final iteration, return error
|
|
659
|
+
msg = f'Requirement: {params.code_requirement}\n\nFinal Generated Code (Iteration {iteration}): {generated_js_code}\n\nError: {error_msg}'
|
|
660
|
+
logger.info(msg)
|
|
661
|
+
return ActionResult(error=msg)
|
|
662
|
+
|
|
663
|
+
# Get the result data
|
|
664
|
+
result_data = result.get('result', {})
|
|
665
|
+
|
|
666
|
+
# Check for wasThrown flag (backup error detection)
|
|
667
|
+
if result_data.get('wasThrown'):
|
|
668
|
+
error_msg = 'JavaScript execution failed (wasThrown=true)'
|
|
669
|
+
|
|
670
|
+
# Add error feedback to message history for next iteration
|
|
671
|
+
if iteration < MAX_ITERATIONS:
|
|
672
|
+
error_feedback = f"""The previous JavaScript code failed with error:
|
|
673
|
+
{error_msg}
|
|
674
|
+
|
|
675
|
+
Please fix the error and generate corrected JavaScript code:"""
|
|
676
|
+
message_history.append(UserMessage(content=error_feedback))
|
|
677
|
+
continue # Try next iteration
|
|
678
|
+
else:
|
|
679
|
+
# Final iteration, return error
|
|
680
|
+
msg = f'Requirement: {params.code_requirement}\n\nFinal Generated Code (Iteration {iteration}): {generated_js_code}\n\nError: {error_msg}'
|
|
681
|
+
logger.info(msg)
|
|
682
|
+
return ActionResult(error=msg)
|
|
683
|
+
|
|
684
|
+
# Get the actual value
|
|
685
|
+
value = result_data.get('value')
|
|
686
|
+
|
|
687
|
+
# Handle different value types
|
|
688
|
+
if value is None:
|
|
689
|
+
# Could be legitimate null/undefined result
|
|
690
|
+
result_text = str(value) if 'value' in result_data else 'undefined'
|
|
691
|
+
elif isinstance(value, (dict, list)):
|
|
692
|
+
# Complex objects - should be serialized by returnByValue
|
|
693
|
+
try:
|
|
694
|
+
result_text = json.dumps(value, ensure_ascii=False)
|
|
695
|
+
except (TypeError, ValueError):
|
|
696
|
+
# Fallback for non-serializable objects
|
|
697
|
+
result_text = str(value)
|
|
698
|
+
else:
|
|
699
|
+
# Primitive values (string, number, boolean)
|
|
700
|
+
result_text = str(value)
|
|
701
|
+
|
|
702
|
+
# Check if result is empty or meaningless
|
|
703
|
+
if (not result_text or
|
|
704
|
+
result_text.strip() in ['', 'null', 'undefined', '[]', '{}'] or
|
|
705
|
+
len(result_text.strip()) == 0):
|
|
706
|
+
|
|
707
|
+
# Add empty result feedback to message history for next iteration
|
|
708
|
+
if iteration < MAX_ITERATIONS:
|
|
709
|
+
empty_feedback = f"""The previous JavaScript code executed successfully but returned empty/meaningless result:
|
|
710
|
+
Result: {result_text}
|
|
711
|
+
|
|
712
|
+
The result is empty or not useful. Please generate improved JavaScript code that returns meaningful data:"""
|
|
713
|
+
message_history.append(UserMessage(content=empty_feedback))
|
|
714
|
+
continue # Try next iteration
|
|
715
|
+
else:
|
|
716
|
+
# Final iteration, return empty result with warning
|
|
717
|
+
msg = f'Requirement: {params.code_requirement}\n\nFinal Generated Code (Iteration {iteration}): {generated_js_code}\n\nWarning: Empty or meaningless result: {result_text}'
|
|
718
|
+
logger.info(msg)
|
|
719
|
+
return ActionResult(
|
|
720
|
+
extracted_content=msg,
|
|
721
|
+
long_term_memory=f'Generated JavaScript code (iteration {iteration}) for requirement: {params.code_requirement} - Empty result warning',
|
|
722
|
+
)
|
|
723
|
+
|
|
724
|
+
# Apply length limit with better truncation
|
|
725
|
+
if len(result_text) > 30000:
|
|
726
|
+
result_text = result_text[:30000] + '\n... [Truncated after 30000 characters]'
|
|
727
|
+
|
|
728
|
+
# Success! Return the result
|
|
729
|
+
msg = f'Requirement: {params.code_requirement}\n\nGenerated Code (Iteration {iteration}): \n```javascript\n{generated_js_code}\n```\nResult: {result_text}'
|
|
730
|
+
logger.info(f'✅ Skill Code succeeded on iteration {iteration}')
|
|
731
|
+
|
|
732
|
+
return ActionResult(
|
|
733
|
+
extracted_content=msg,
|
|
734
|
+
long_term_memory=f'Generated and executed JavaScript code (iteration {iteration}) for requirement: {params.code_requirement}',
|
|
735
|
+
)
|
|
736
|
+
|
|
737
|
+
except Exception as e:
|
|
738
|
+
# CDP communication or other system errors
|
|
739
|
+
error_msg = f'Failed to execute JavaScript: {type(e).__name__}: {e}'
|
|
740
|
+
|
|
741
|
+
# Add system error feedback to message history for next iteration
|
|
742
|
+
if iteration < MAX_ITERATIONS:
|
|
743
|
+
system_error_feedback = f"""The previous JavaScript code failed to execute due to system error:
|
|
744
|
+
{error_msg}
|
|
745
|
+
|
|
746
|
+
Please generate alternative JavaScript code that avoids this system error:"""
|
|
747
|
+
message_history.append(UserMessage(content=system_error_feedback))
|
|
748
|
+
continue # Try next iteration
|
|
749
|
+
else:
|
|
750
|
+
# Final iteration, return system error
|
|
751
|
+
error_msg = f'Requirement: {params.code_requirement}\n\nFinal Generated Code (Iteration {iteration}): {generated_js_code}\n\nError: {error_msg}'
|
|
752
|
+
logger.info(error_msg)
|
|
753
|
+
return ActionResult(error=error_msg)
|
|
754
|
+
|
|
755
|
+
except Exception as e:
|
|
756
|
+
# LLM generation error
|
|
757
|
+
logger.error(f'❌ LLM generation failed on iteration {iteration}: {e}')
|
|
758
|
+
if iteration == MAX_ITERATIONS:
|
|
759
|
+
return ActionResult(
|
|
760
|
+
error=f'LLM generation failed after {MAX_ITERATIONS} iterations: {str(e)}')
|
|
761
|
+
continue # Try next iteration with same message history
|
|
762
|
+
|
|
763
|
+
# Should not reach here, but just in case
|
|
764
|
+
return ActionResult(error=f'Skill code failed after {MAX_ITERATIONS} iterations')
|
|
765
|
+
|
|
766
|
+
except Exception as e:
|
|
767
|
+
logger.error(f'❌ Skill Code failed: {e}')
|
|
768
|
+
return ActionResult(error=f'Skill code failed: {str(e)}')
|
|
769
|
+
|
|
770
|
+
        @self.registry.action(
            'Skill: Deep research mode - Only return the guideline for deep research. Please follow the guideline to do real deep research actions.',
            param_model=NoParamsAction,
        )
        async def skill_deep_research(
            _: NoParamsAction,
        ):
            """
            Skill: Deep research mode activation

            Performs no browsing itself: it only returns a static research
            guideline as the action result.  The calling agent is expected to
            follow the returned checklist (TODO list, systematic multi-source
            research, comprehensive report, traceability).
            """
            # NOTE: the f-string below contains no placeholders; the prefix is
            # inert but kept to leave the code unchanged.
            research_prompt = f"""
🔬 **DEEP RESEARCH GUIDELINE**

To proceed with comprehensive research, please:

1. **Set up a detailed TODO list** for this research project that includes:
   - Background research and context gathering
   - Key questions to investigate
   - Multiple source verification
   - Data collection and analysis steps
   - Report generation with proper citations

2. **Conduct systematic research** following these principles:
   - Use multiple search strategies and sources
   - Verify information across different platforms
   - Document all sources with URLs for citation
   - Take notes and screenshots of key findings
   - Organize findings by themes or categories

3. **Generate a comprehensive report** that includes:
   - Executive summary
   - Detailed findings with analysis
   - Proper citations and source references
   - Supporting evidence (screenshots, quotes)
   - Conclusions and recommendations
   - Areas for further investigation

4. **Maintain research traceability** by:
   - Recording all search queries used
   - Saving important URLs and sources
   - Including direct quotes with attribution
   - Documenting methodology and approach

This deep research mode ensures thorough, traceable, and well-documented investigation of your topic with proper academic rigor and source citation.
"""

            # include_extracted_content_only_once keeps this large guideline from
            # being re-injected into the agent's context on every later step.
            return ActionResult(
                extracted_content=research_prompt,
                include_extracted_content_only_once=True,
            )
|
|
820
|
+
|
|
821
|
+
async def _perform_google_search(self, browser_session, query: str, llm: BaseChatModel):
|
|
822
|
+
"""Helper method to perform Google search and extract top 5 results"""
|
|
823
|
+
try:
|
|
824
|
+
# Navigate to Google search
|
|
825
|
+
search_url = f'https://www.google.com/search?q={query}&udm=14'
|
|
826
|
+
await browser_session.navigate_to_url(search_url, new_tab=False)
|
|
827
|
+
|
|
828
|
+
# Wait a moment for page to load
|
|
829
|
+
await asyncio.sleep(1)
|
|
830
|
+
|
|
831
|
+
# Extract structured content
|
|
832
|
+
extraction_query = f"""
|
|
833
|
+
Extract the top 5 search results from this Google search page. For each result, provide:
|
|
834
|
+
- title: The clickable title/headline
|
|
835
|
+
- url: The website URL
|
|
836
|
+
- summary: A brief description of what this result contains
|
|
837
|
+
|
|
838
|
+
Return results as a JSON array: [{{"title": "...", "url": "...", "summary": "..."}}, ...]
|
|
839
|
+
"""
|
|
840
|
+
|
|
841
|
+
results_text = await self._extract_structured_content(browser_session, extraction_query, llm)
|
|
842
|
+
|
|
843
|
+
# Try to parse JSON results
|
|
844
|
+
import json
|
|
845
|
+
try:
|
|
846
|
+
results = json.loads(results_text.strip())
|
|
847
|
+
if isinstance(results, list):
|
|
848
|
+
return results[:5] # Ensure max 5 results
|
|
849
|
+
except (json.JSONDecodeError, ValueError):
|
|
850
|
+
try:
|
|
851
|
+
results = repair_json(results_text.strip())
|
|
852
|
+
if isinstance(results, list):
|
|
853
|
+
return results[:5] # Ensure max 5 results
|
|
854
|
+
except Exception as e:
|
|
855
|
+
logger.warning(f"Failed to parse JSON from search results: {results_text}")
|
|
856
|
+
|
|
857
|
+
# Fallback: return raw text as single result
|
|
858
|
+
current_url = await browser_session.get_current_page_url()
|
|
859
|
+
return [{
|
|
860
|
+
"title": f"Search results for: {query}",
|
|
861
|
+
"url": current_url,
|
|
862
|
+
"summary": results_text[:200] + "..." if len(results_text) > 200 else results_text
|
|
863
|
+
}]
|
|
864
|
+
|
|
865
|
+
except Exception as e:
|
|
866
|
+
logger.error(f"Google search failed for query '{query}': {e}")
|
|
867
|
+
return []
|
|
868
|
+
|
|
869
|
+
async def _extract_structured_content(self, browser_session, query: str, llm: BaseChatModel):
|
|
870
|
+
"""Helper method to extract structured content from current page"""
|
|
871
|
+
MAX_CHAR_LIMIT = 30000
|
|
872
|
+
|
|
873
|
+
# Extract clean markdown using the existing method
|
|
874
|
+
try:
|
|
875
|
+
content, content_stats = await self.extract_clean_markdown(browser_session, extract_links=False)
|
|
876
|
+
except Exception as e:
|
|
877
|
+
raise RuntimeError(f'Could not extract clean markdown: {type(e).__name__}')
|
|
878
|
+
|
|
879
|
+
# Smart truncation with context preservation
|
|
880
|
+
if len(content) > MAX_CHAR_LIMIT:
|
|
881
|
+
# Try to truncate at a natural break point
|
|
882
|
+
truncate_at = MAX_CHAR_LIMIT
|
|
883
|
+
paragraph_break = content.rfind('\n\n', MAX_CHAR_LIMIT - 500, MAX_CHAR_LIMIT)
|
|
884
|
+
if paragraph_break > 0:
|
|
885
|
+
truncate_at = paragraph_break
|
|
886
|
+
else:
|
|
887
|
+
sentence_break = content.rfind('.', MAX_CHAR_LIMIT - 200, MAX_CHAR_LIMIT)
|
|
888
|
+
if sentence_break > 0:
|
|
889
|
+
truncate_at = sentence_break + 1
|
|
890
|
+
content = content[:truncate_at]
|
|
891
|
+
|
|
892
|
+
system_prompt = """
|
|
893
|
+
You are an expert at extracting data from the markdown of a webpage.
|
|
894
|
+
|
|
895
|
+
<input>
|
|
896
|
+
You will be given a query and the markdown of a webpage that has been filtered to remove noise and advertising content.
|
|
897
|
+
</input>
|
|
898
|
+
|
|
899
|
+
<instructions>
|
|
900
|
+
- You are tasked to extract information from the webpage that is relevant to the query.
|
|
901
|
+
- You should ONLY use the information available in the webpage to answer the query. Do not make up information or provide guess from your own knowledge.
|
|
902
|
+
- If the information relevant to the query is not available in the page, your response should mention that.
|
|
903
|
+
- If the query asks for all items, products, etc., make sure to directly list all of them.
|
|
904
|
+
</instructions>
|
|
905
|
+
|
|
906
|
+
<output>
|
|
907
|
+
- Your output should present ALL the information relevant to the query in a concise way.
|
|
908
|
+
- Do not answer in conversational format - directly output the relevant information or that the information is unavailable.
|
|
909
|
+
</output>
|
|
910
|
+
""".strip()
|
|
911
|
+
|
|
912
|
+
prompt = f'<query>\n{query}\n</query>\n\n<webpage_content>\n{content}\n</webpage_content>'
|
|
913
|
+
|
|
914
|
+
try:
|
|
915
|
+
from browser_use.llm.messages import SystemMessage, UserMessage
|
|
916
|
+
response = await asyncio.wait_for(
|
|
917
|
+
llm.ainvoke([SystemMessage(content=system_prompt), UserMessage(content=prompt)]),
|
|
918
|
+
timeout=120.0,
|
|
919
|
+
)
|
|
920
|
+
return response.completion
|
|
921
|
+
except Exception as e:
|
|
922
|
+
logger.debug(f'Error extracting content: {e}')
|
|
923
|
+
raise RuntimeError(str(e))
|
|
924
|
+
|
|
925
|
+
async def extract_clean_markdown(
|
|
926
|
+
self, browser_session: BrowserSession, extract_links: bool = True
|
|
927
|
+
) -> tuple[str, dict[str, Any]]:
|
|
928
|
+
"""Extract clean markdown from the current page."""
|
|
929
|
+
import re
|
|
930
|
+
|
|
931
|
+
# Get HTML content from current page
|
|
932
|
+
cdp_session = await browser_session.get_or_create_cdp_session()
|
|
933
|
+
try:
|
|
934
|
+
body_id = await cdp_session.cdp_client.send.DOM.getDocument(session_id=cdp_session.session_id)
|
|
935
|
+
page_html_result = await cdp_session.cdp_client.send.DOM.getOuterHTML(
|
|
936
|
+
params={'backendNodeId': body_id['root']['backendNodeId']}, session_id=cdp_session.session_id
|
|
937
|
+
)
|
|
938
|
+
page_html = page_html_result['outerHTML']
|
|
939
|
+
current_url = await browser_session.get_current_page_url()
|
|
940
|
+
except Exception as e:
|
|
941
|
+
raise RuntimeError(f"Couldn't extract page content: {e}")
|
|
942
|
+
|
|
943
|
+
original_html_length = len(page_html)
|
|
944
|
+
|
|
945
|
+
# Use html2text for clean markdown conversion
|
|
946
|
+
import html2text
|
|
947
|
+
|
|
948
|
+
h = html2text.HTML2Text()
|
|
949
|
+
h.ignore_links = not extract_links
|
|
950
|
+
h.ignore_images = True
|
|
951
|
+
h.ignore_emphasis = False
|
|
952
|
+
h.body_width = 0 # Don't wrap lines
|
|
953
|
+
h.unicode_snob = True
|
|
954
|
+
h.skip_internal_links = True
|
|
955
|
+
content = h.handle(page_html)
|
|
956
|
+
|
|
957
|
+
initial_markdown_length = len(content)
|
|
958
|
+
|
|
959
|
+
# Minimal cleanup - html2text already does most of the work
|
|
960
|
+
content = re.sub(r'%[0-9A-Fa-f]{2}', '', content) # Remove any remaining URL encoding
|
|
961
|
+
|
|
962
|
+
# Apply light preprocessing to clean up excessive whitespace
|
|
963
|
+
content, chars_filtered = self._preprocess_markdown_content(content)
|
|
964
|
+
|
|
965
|
+
final_filtered_length = len(content)
|
|
966
|
+
|
|
967
|
+
# Content statistics
|
|
968
|
+
stats = {
|
|
969
|
+
'url': current_url,
|
|
970
|
+
'original_html_chars': original_html_length,
|
|
971
|
+
'initial_markdown_chars': initial_markdown_length,
|
|
972
|
+
'filtered_chars_removed': chars_filtered,
|
|
973
|
+
'final_filtered_chars': final_filtered_length,
|
|
974
|
+
}
|
|
975
|
+
|
|
976
|
+
return content, stats
|
|
977
|
+
|
|
978
|
+
def _preprocess_markdown_content(self, content: str, max_newlines: int = 3) -> tuple[str, int]:
|
|
979
|
+
"""Light preprocessing of html2text output - minimal cleanup since html2text is already clean."""
|
|
980
|
+
import re
|
|
981
|
+
|
|
982
|
+
original_length = len(content)
|
|
983
|
+
|
|
984
|
+
# Compress consecutive newlines (4+ newlines become max_newlines)
|
|
985
|
+
content = re.sub(r'\n{4,}', '\n' * max_newlines, content)
|
|
986
|
+
|
|
987
|
+
# Remove lines that are only whitespace or very short (likely artifacts)
|
|
988
|
+
lines = content.split('\n')
|
|
989
|
+
filtered_lines = []
|
|
990
|
+
for line in lines:
|
|
991
|
+
stripped = line.strip()
|
|
992
|
+
# Keep lines with substantial content (html2text output is already clean)
|
|
993
|
+
if len(stripped) > 2:
|
|
994
|
+
filtered_lines.append(line)
|
|
995
|
+
|
|
996
|
+
content = '\n'.join(filtered_lines)
|
|
997
|
+
content = content.strip()
|
|
998
|
+
|
|
999
|
+
chars_filtered = original_length - len(content)
|
|
1000
|
+
return content, chars_filtered
|
|
1001
|
+
|
|
53
1002
|
def _register_browser_use_agent(self):
|
|
54
1003
|
@self.registry.action(
|
|
55
1004
|
'Execute browser_use agent tasks. Supports both single task execution (list length=1) and '
|