vibesurf 0.1.22__py3-none-any.whl → 0.1.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of vibesurf might be problematic. Click here for more details.

@@ -8,8 +8,10 @@ import json
8
8
  import enum
9
9
  import base64
10
10
  import mimetypes
11
+ from json_repair import repair_json
11
12
  from datetime import datetime
12
13
  from typing import Optional, Type, Callable, Dict, Any, Union, Awaitable, TypeVar
14
+ from pathvalidate import sanitize_filename
13
15
  from pydantic import BaseModel
14
16
  from browser_use.tools.service import Controller, Tools, handle_browser_error
15
17
  import logging
@@ -18,18 +20,21 @@ from browser_use.utils import time_execution_sync
18
20
  from browser_use.filesystem.file_system import FileSystem
19
21
  from browser_use.browser import BrowserSession
20
22
  from browser_use.llm.base import BaseChatModel
21
- from browser_use.llm.messages import UserMessage, ContentPartTextParam, ContentPartImageParam, ImageURL
23
+ from browser_use.llm.messages import UserMessage, ContentPartTextParam, ContentPartImageParam, ImageURL, \
24
+ AssistantMessage
22
25
  from browser_use.dom.service import EnhancedDOMTreeNode
23
26
  from browser_use.browser.views import BrowserError
24
27
  from browser_use.mcp.client import MCPClient
25
-
28
+ from browser_use.tools.views import NoParamsAction
26
29
  from vibe_surf.browser.agent_browser_session import AgentBrowserSession
27
30
  from vibe_surf.tools.views import HoverAction, ExtractionAction, FileExtractionAction, BrowserUseAgentExecution, \
28
- ReportWriterTask, TodoGenerateAction, TodoModifyAction, VibeSurfDoneAction
31
+ ReportWriterTask, TodoGenerateAction, TodoModifyAction, VibeSurfDoneAction, SkillSearchAction, SkillCrawlAction, \
32
+ SkillSummaryAction, SkillTakeScreenshotAction, SkillDeepResearchAction, SkillCodeAction
29
33
  from vibe_surf.tools.mcp_client import CustomMCPClient
30
34
  from vibe_surf.tools.file_system import CustomFileSystem
31
35
  from vibe_surf.browser.browser_manager import BrowserManager
32
-
36
+ from vibe_surf.tools.vibesurf_registry import VibeSurfRegistry
37
+ from bs4 import BeautifulSoup
33
38
  from vibe_surf.logger import get_logger
34
39
 
35
40
  logger = get_logger(__name__)
@@ -39,17 +44,961 @@ Context = TypeVar('Context')
39
44
  T = TypeVar('T', bound=BaseModel)
40
45
 
41
46
 
47
def clean_html_basic(page_html_content, max_text_length=100):
    """Reduce a raw HTML document to a compact, LLM-friendly form.

    Drops <script>/<style> elements and HTML comments, collapses whitespace
    and truncates long text nodes, and strips every attribute except a
    whitelist of semantically useful ones (plus data-* / aria-*).

    Args:
        page_html_content: Raw HTML string.
        max_text_length: Maximum characters kept per text node before an
            ellipsis is appended.

    Returns:
        The cleaned HTML serialized back to a string.
    """
    soup = BeautifulSoup(page_html_content, 'html.parser')

    # Remove executable/styling noise entirely.
    for script in soup(["script", "style"]):
        script.decompose()

    # Strip HTML comments. find_all(string=...) replaces the deprecated
    # findAll(text=...) spelling (the text= argument is deprecated and the
    # camelCase method removed in newer BeautifulSoup releases).
    from bs4 import Comment
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()

    # Normalize whitespace and truncate overly long text nodes.
    for text_node in soup.find_all(string=True):
        if text_node.parent.name not in ['script', 'style']:
            clean_text = ' '.join(text_node.split())

            if len(clean_text) > max_text_length:
                clean_text = clean_text[:max_text_length].rstrip() + "..."

            # NavigableString subclasses str, so this comparison only
            # rewrites nodes that actually changed.
            if clean_text != text_node:
                text_node.replace_with(clean_text)

    # Attributes worth keeping for structure/semantics.
    important_attrs = ['id', 'class', 'name', 'role', 'type',
                       'colspan', 'rowspan', 'headers', 'scope',
                       'href', 'src', 'alt', 'title']

    for tag in soup.find_all():
        attrs_to_keep = {}
        for attr in list(tag.attrs.keys()):
            if (attr in important_attrs or
                    attr.startswith('data-') or
                    attr.startswith('aria-')):
                attrs_to_keep[attr] = tag.attrs[attr]
        tag.attrs = attrs_to_keep

    return str(soup)
82
+
83
+
84
def get_sibling_position(node: EnhancedDOMTreeNode) -> int:
    """Return the 1-based position of *node* among same-tag siblings.

    Counts how many earlier children of the same parent share the node's
    tag name; a node with no parent is position 1 by definition.
    """
    parent = node.parent_node
    if not parent:
        return 1

    rank = 1
    for sibling in parent.children:
        if sibling == node:
            # Stop once we reach the node itself; only earlier siblings count.
            break
        if sibling.tag_name == node.tag_name:
            rank += 1

    return rank
100
+
101
+
102
def extract_css_hints(node: EnhancedDOMTreeNode) -> dict:
    """Extract CSS selector construction hints for an element.

    Builds a small dict of ready-made selector fragments (id, class and
    attribute selectors) that an LLM can combine into a working selector.

    Args:
        node: DOM node whose attributes are inspected.

    Returns:
        dict mapping hint names ("id", "class", "attr_<name>") to selector
        fragments; empty when the node carries no useful attributes.
    """
    hints = {}

    if "id" in node.attributes:
        hints["id"] = f"#{node.attributes['id']}"

    if "class" in node.attributes:
        classes = node.attributes["class"].split()
        # Guard against class="" / whitespace-only values, which previously
        # produced the meaningless selector "." here.
        if classes:
            hints["class"] = f".{'.'.join(classes[:3])}"  # Limit class count

    # Attribute selector hints
    for attr in ["name", "data-testid", "type"]:
        if attr in node.attributes:
            hints[f"attr_{attr}"] = f"[{attr}='{node.attributes[attr]}']"

    return hints
119
+
120
+
121
def convert_selector_map_for_llm(selector_map) -> dict:
    """
    Convert complex selector_map to simplified format suitable for LLM understanding and JS code writing
    """
    elements = []

    for _, node in selector_map.items():
        # Skip anything that is not a visible interactive element.
        if not (node.is_visible and node.element_index is not None):
            continue

        snapshot = node.snapshot_node
        parent = node.parent_node

        info = {
            "tag": node.tag_name,
            "text": node.get_meaningful_text_for_llm()[:200],  # Limit text length

            # Selector information - most needed for JS code
            "selectors": {
                "xpath": node.xpath,
                "css_hints": extract_css_hints(node),  # Extract id, class etc
            },

            # Element semantics
            "role": node.ax_node.role if node.ax_node else None,
            "type": node.attributes.get("type"),
            "aria_label": node.attributes.get("aria-label"),

            # Key attributes
            "attributes": {key: val for key, val in node.attributes.items()
                           if key in ["id", "class", "name", "href", "src", "value", "placeholder", "data-testid"]},

            # Interactivity
            "is_clickable": snapshot.is_clickable if snapshot else False,
            "is_input": node.tag_name.lower() in ["input", "textarea", "select"],

            # Structure information
            "parent_tag": parent.tag_name if parent else None,
            "position_info": f"{node.tag_name}[{get_sibling_position(node)}]"
        }
        elements.append(info)

    return {
        "page_elements": elements,
        "total_elements": len(elements)
    }
162
+
163
+
42
164
  class VibeSurfTools:
43
165
  def __init__(self, exclude_actions: list[str] = [], mcp_server_config: Optional[Dict[str, Any]] = None):
44
- self.registry = Registry(exclude_actions)
166
+ self.registry = VibeSurfRegistry(exclude_actions)
45
167
  self._register_file_actions()
46
168
  self._register_browser_use_agent()
47
169
  self._register_report_writer_agent()
48
170
  self._register_todo_actions()
49
171
  self._register_done_action()
172
+ self._register_skills()
50
173
  self.mcp_server_config = mcp_server_config
51
174
  self.mcp_clients: Dict[str, MCPClient] = {}
52
175
 
176
+ def _register_skills(self):
177
+ @self.registry.action(
178
+ 'Skill: Advanced parallel search - analyze user intent and generate 5 different search tasks, perform parallel Google searches, and return top 10 most relevant results',
179
+ param_model=SkillSearchAction,
180
+ )
181
+ async def skill_search(
182
+ params: SkillSearchAction,
183
+ browser_manager: BrowserManager,
184
+ page_extraction_llm: BaseChatModel
185
+ ):
186
+ """
187
+ Skill: Advanced parallel search with LLM-generated search strategies
188
+ """
189
+ llm = page_extraction_llm
190
+ agent_ids = []
191
+ try:
192
+ if not llm:
193
+ raise RuntimeError("LLM is required for skill_search")
194
+
195
+ # Step 1: Use LLM to analyze user intent and generate different search tasks
196
+ from datetime import datetime
197
+ analysis_prompt = f"""
198
+ Analyze the user query and generate 5 different Google search strategies to comprehensively find relevant information.
199
+
200
+ Current Time: {datetime.now().isoformat()}
201
+
202
+ User Query: "{params.query}"
203
+
204
+ Generate 5 different search queries that approach this topic from different angles. Each search should be:
205
+ 1. Specific and concrete (good for Google search)
206
+ 2. Different from the others (different perspectives/aspects)
207
+ 3. Likely to return valuable, unique information
208
+
209
+ Return your response as a JSON array of 5 search query strings.
210
+ Example format: ["query 1", "query 2", "query 3", "query 4", "query 5"]
211
+ """
212
+
213
+ from browser_use.llm.messages import SystemMessage, UserMessage
214
+ response = await llm.ainvoke([
215
+ SystemMessage(content="You are an expert at generating comprehensive search strategies."),
216
+ UserMessage(content=analysis_prompt)
217
+ ])
218
+
219
+ # Parse the search queries
220
+ import json
221
+ try:
222
+ search_queries = json.loads(response.completion.strip())
223
+ if not isinstance(search_queries, list):
224
+ raise ValueError("Invalid search queries format")
225
+ search_queries = search_queries[:5]
226
+ except (json.JSONDecodeError, ValueError):
227
+ # Fallback to simple queries if parsing fails
228
+ try:
229
+ from json_repair import repair_json
230
+ search_queries = repair_json(response.completion.strip())
231
+ except Exception as e:
232
+ search_queries = [
233
+ params.query,
234
+ f"{params.query} guide",
235
+ f"{params.query} best practices",
236
+ f"{params.query} examples",
237
+ f"{params.query} latest news"
238
+ ]
239
+
240
+ # Step 2: Create browser sessions for parallel searching
241
+ register_sessions = []
242
+
243
+ for i, query in enumerate(search_queries):
244
+ agent_id = f"search_agent_{i + 1:03d}"
245
+ register_sessions.append(
246
+ browser_manager.register_agent(agent_id, target_id=None)
247
+ )
248
+ agent_ids.append(agent_id)
249
+
250
+ agent_browser_sessions = await asyncio.gather(*register_sessions)
251
+
252
+ # Step 3: Perform parallel Google searches
253
+ search_tasks = []
254
+ for i, (browser_session, query) in enumerate(zip(agent_browser_sessions, search_queries)):
255
+ search_tasks.append(self._perform_google_search(browser_session, query, llm))
256
+
257
+ search_results = await asyncio.gather(*search_tasks, return_exceptions=True)
258
+
259
+ # Step 4: Aggregate and filter results
260
+ all_results = []
261
+ for i, result in enumerate(search_results):
262
+ if isinstance(result, Exception):
263
+ logger.error(f"Search task {i + 1} failed: {result}")
264
+ continue
265
+ if result:
266
+ all_results.extend(result)
267
+
268
+ # Step 5: Use LLM to deduplicate and rank top 10 results
269
+ if all_results:
270
+ ranking_prompt = f"""
271
+ Given these search results for the query "{params.query}", please:
272
+ 1. Remove duplicates (same or very similar content)
273
+ 2. Rank by relevance and value to the user
274
+ 3. Select the TOP 10 most relevant and valuable results
275
+
276
+ Search Results:
277
+ {json.dumps(all_results, indent=2)}
278
+
279
+ Return the top 10 results as a JSON array, with each result containing:
280
+ - title: string
281
+ - url: string
282
+ - summary: string (brief description of why this result is valuable)
283
+
284
+ Format: [{{"title": "...", "url": "...", "summary": "..."}}, ...]
285
+ """
286
+
287
+ ranking_response = await llm.ainvoke([
288
+ SystemMessage(
289
+ content="You are an expert at evaluating and ranking search results for relevance and value."),
290
+ UserMessage(content=ranking_prompt)
291
+ ])
292
+
293
+ try:
294
+ top_results = json.loads(ranking_response.completion.strip())
295
+ if not isinstance(top_results, list):
296
+ raise ValueError("Invalid ranking results format")
297
+ except (json.JSONDecodeError, ValueError):
298
+ # Fallback to first 10 results if ranking fails
299
+ top_results = all_results[:10]
300
+ else:
301
+ top_results = []
302
+
303
+ # Format results for display
304
+ if top_results:
305
+ results_text = f"🔍 Advanced Search Results for '{params.query}':\n\n"
306
+ for i, result in enumerate(top_results[:10]):
307
+ title = result.get('title', 'Unknown Title')
308
+ url = result.get('url', 'No URL')
309
+ summary = result.get('summary', 'No summary available')
310
+ results_text += f"{i}. **{title}**\n URL: {url}\n Summary: {summary}\n\n"
311
+ else:
312
+ results_text = f"No results found for query: {params.query}"
313
+
314
+ logger.info(f'🔍 Skill Search completed for: {params.query}')
315
+ return ActionResult(
316
+ extracted_content=results_text,
317
+ include_extracted_content_only_once=True,
318
+ long_term_memory=f'Advanced search completed for: {params.query}, found {len(top_results)} relevant results',
319
+ )
320
+
321
+ except Exception as e:
322
+ logger.error(f'❌ Skill Search failed: {e}')
323
+ return ActionResult(error=f'Skill search failed: {str(e)}')
324
+ finally:
325
+ for i, agent_id in enumerate(agent_ids):
326
+ await browser_manager.unregister_agent(agent_id, close_tabs=True)
327
+
328
        @self.registry.action(
            'Skill: Crawl a web page and extract structured information from a webpage with optional tab selection',
            param_model=SkillCrawlAction,
        )
        async def skill_crawl(
                params: SkillCrawlAction,
                browser_manager: BrowserManager,
                page_extraction_llm: BaseChatModel
        ):
            """
            Skill: Extract structured content from current or specified webpage.

            Uses the shared _extract_structured_content helper with the caller's
            query, then returns the result prefixed with the page URL. Long
            results are kept out of long-term memory and only surfaced once.
            """
            llm = page_extraction_llm
            try:
                if not llm:
                    raise RuntimeError("LLM is required for skill_crawl")

                # Get browser session
                browser_session = browser_manager.main_browser_session

                # If tab_id is provided, switch to that tab
                if params.tab_id:
                    target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
                    # focus=True brings the tab to the foreground before extraction.
                    await browser_session.get_or_create_cdp_session(target_id, focus=True)

                # Extract structured content using the existing method
                extracted_content = await self._extract_structured_content(
                    browser_session, params.query, llm
                )

                current_url = await browser_session.get_current_page_url()
                result_text = f'### URL:{current_url}\n\n{extracted_content}'

                # Handle memory storage: short results go verbatim into long-term
                # memory; long ones are summarized and only emitted once.
                MAX_MEMORY_LENGTH = 1000
                if len(result_text) < MAX_MEMORY_LENGTH:
                    memory = result_text
                    include_extracted_content_only_once = False
                else:
                    memory = f'Extracted structured content from {current_url} for query: {params.query}'
                    include_extracted_content_only_once = True

                logger.info(f'📄 Skill Crawl completed for: {current_url}')
                return ActionResult(
                    extracted_content=result_text,
                    include_extracted_content_only_once=include_extracted_content_only_once,
                    long_term_memory=memory,
                )

            except Exception as e:
                # Any failure is reported as an ActionResult error rather than raised.
                logger.error(f'❌ Skill Crawl failed: {e}')
                return ActionResult(error=f'Skill crawl failed: {str(e)}')
380
+
381
        @self.registry.action(
            'Skill: Summarize webpage content with optional tab selection',
            param_model=SkillSummaryAction,
        )
        async def skill_summary(
                params: SkillSummaryAction,
                browser_manager: BrowserManager,
                page_extraction_llm: BaseChatModel
        ):
            """
            Skill: Summarize webpage content using LLM.

            Same flow as skill_crawl but with a fixed "comprehensive summary"
            query instead of a caller-provided one.
            """
            llm = page_extraction_llm
            try:
                if not llm:
                    raise RuntimeError("LLM is required for skill_summary")

                # Get browser session
                browser_session = browser_manager.main_browser_session

                # If tab_id is provided, switch to that tab
                if params.tab_id:
                    target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
                    await browser_session.get_or_create_cdp_session(target_id, focus=True)

                # Extract and summarize content
                summary = await self._extract_structured_content(
                    browser_session, "Provide a comprehensive summary of this webpage", llm
                )

                current_url = await browser_session.get_current_page_url()
                result_text = f'📝 Summary of {current_url}:\n\n{summary}'

                # Handle memory storage: short summaries are kept verbatim in
                # long-term memory; long ones are referenced and emitted once.
                MAX_MEMORY_LENGTH = 1000
                if len(result_text) < MAX_MEMORY_LENGTH:
                    memory = result_text
                    include_extracted_content_only_once = False
                else:
                    memory = f'Summarized webpage: {current_url}'
                    include_extracted_content_only_once = True

                logger.info(f'📝 Skill Summary completed for: {current_url}')
                return ActionResult(
                    extracted_content=result_text,
                    include_extracted_content_only_once=include_extracted_content_only_once,
                    long_term_memory=memory,
                )

            except Exception as e:
                # Failures surface as ActionResult errors rather than exceptions.
                logger.error(f'❌ Skill Summary failed: {e}')
                return ActionResult(error=f'Skill summary failed: {str(e)}')
433
+
434
+ @self.registry.action(
435
+ 'Skill: Take screenshot of current page or specified tab',
436
+ param_model=SkillTakeScreenshotAction,
437
+ )
438
+ async def skill_screenshot(
439
+ params: SkillTakeScreenshotAction,
440
+ browser_manager: BrowserManager,
441
+ file_system: CustomFileSystem
442
+ ):
443
+ """
444
+ Skill: Take screenshot with optional tab selection
445
+ """
446
+ try:
447
+ # Get browser session
448
+ browser_session = browser_manager.main_browser_session
449
+
450
+ # If tab_id is provided, switch to that tab
451
+ if params.tab_id:
452
+ target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
453
+ await browser_session.get_or_create_cdp_session(target_id, focus=True)
454
+
455
+ # Take screenshot using browser session
456
+ screenshot = await browser_session.take_screenshot()
457
+
458
+ # Generate timestamp for filename
459
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
460
+
461
+ # Get file system directory path (Path type)
462
+ fs_dir = file_system.get_dir()
463
+
464
+ # Create screenshots directory if it doesn't exist
465
+ screenshots_dir = fs_dir / "screenshots"
466
+ screenshots_dir.mkdir(exist_ok=True)
467
+
468
+ # Save screenshot to file system
469
+ page_title = await browser_session.get_current_page_title()
470
+ from pathvalidate import sanitize_filename
471
+ page_title = sanitize_filename(page_title)
472
+ filename = f"{page_title}-{timestamp}.png"
473
+ filepath = screenshots_dir / filename
474
+
475
+ with open(filepath, "wb") as f:
476
+ f.write(base64.b64decode(screenshot))
477
+
478
+ msg = f'📸 Screenshot saved to path: {str(filepath.relative_to(fs_dir))}'
479
+ logger.info(msg)
480
+ return ActionResult(
481
+ extracted_content=msg,
482
+ include_in_memory=True,
483
+ long_term_memory=f'Screenshot saved to {str(filepath.relative_to(fs_dir))}',
484
+ )
485
+
486
+ except Exception as e:
487
+ error_msg = f'❌ Failed to take screenshot: {str(e)}'
488
+ logger.error(error_msg)
489
+ return ActionResult(error=error_msg)
490
+
491
        @self.registry.action(
            'Skill: Execute JavaScript code on webpage with optional tab selection - accepts functional requirements, code prompts, or code snippets that will be processed by LLM to generate proper executable JavaScript',
            param_model=SkillCodeAction,
        )
        async def skill_code(
                params: SkillCodeAction,
                browser_manager: BrowserManager,
                page_extraction_llm: BaseChatModel,
        ):
            """
            Skill: Generate and execute JavaScript code from functional requirements or code prompts with iterative retry logic.

            Up to MAX_ITERATIONS LLM round-trips: each iteration generates JS,
            executes it via CDP Runtime.evaluate, and on error/empty result
            feeds the failure back into the message history for a retry.
            """
            MAX_ITERATIONS = 5

            try:
                if not page_extraction_llm:
                    raise RuntimeError("LLM is required for skill_code")

                # Get browser session
                browser_session = browser_manager.main_browser_session

                # If tab_id is provided, switch to that tab
                if params.tab_id:
                    target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
                    await browser_session.get_or_create_cdp_session(target_id, focus=True)

                # Get browser state and convert for LLM
                # browser_state = await browser_session.get_browser_state_summary()
                # web_page_description = browser_state.dom_state.llm_representation()

                page_html_content = await browser_session.get_html_content()
                web_page_html = clean_html_basic(page_html_content)
                # Keep the prompt within context limits: keep head and tail of the page.
                if len(web_page_html) > 30000:
                    web_page_html = web_page_html[:24000] + "..." + web_page_html[-6000:]

                # Get current page URL for context
                current_url = await browser_session.get_current_page_url()

                # Create base system prompt for JavaScript code generation
                base_system_prompt = """You are an expert JavaScript developer specializing in browser automation and DOM manipulation.

You will be given a functional requirement or code prompt, along with the current page's DOM structure information.
Your task is to generate valid, executable JavaScript code that accomplishes the specified requirement.

IMPORTANT GUIDELINES:
This JavaScript code gets executed with Runtime.evaluate and 'returnByValue': True, 'awaitPromise': True

SYNTAX RULES - FAILURE TO FOLLOW CAUSES "Uncaught at line 0" ERRORS:
- ALWAYS wrap your code in IIFE: (function(){ ... })() or (async function(){ ... })() for async code
- ALWAYS add try-catch blocks to prevent execution errors
- ALWAYS use proper semicolons and valid JavaScript syntax
- NEVER write multiline code without proper IIFE wrapping
- ALWAYS validate elements exist before accessing them

EXAMPLES:
Use this tool when other tools do not work on the first try as expected or when a more general tool is needed, e.g. for filling a form all at once, hovering, dragging, extracting only links, extracting content from the page, press and hold, hovering, clicking on coordinates, zooming, use this if the user provides custom selectors which you can otherwise not interact with ....
You can also use it to explore the website.
- Write code to solve problems you could not solve with other tools.
- Don't write comments in here, no human reads that.
- Write only valid js code.
- use this to e.g. extract + filter links, convert the page to json into the format you need etc...

- limit the output otherwise your context will explode
- think if you deal with special elements like iframes / shadow roots etc
- Adopt your strategy for React Native Web, React, Angular, Vue, MUI pages etc.
- e.g. with synthetic events, keyboard simulation, shadow DOM, etc.

PROPER SYNTAX EXAMPLES:
CORRECT: (function(){ try { const el = document.querySelector('#id'); return el ? el.value : 'not found'; } catch(e) { return 'Error: ' + e.message; } })()
CORRECT: (async function(){ try { await new Promise(r => setTimeout(r, 100)); return 'done'; } catch(e) { return 'Error: ' + e.message; } })()

WRONG: const el = document.querySelector('#id'); el ? el.value : '';
WRONG: document.querySelector('#id').value
WRONG: Multiline code without IIFE wrapping

SHADOW DOM ACCESS EXAMPLE:
(function(){
  try {
    const hosts = document.querySelectorAll('*');
    for (let host of hosts) {
      if (host.shadowRoot) {
        const el = host.shadowRoot.querySelector('#target');
        if (el) return el.textContent;
      }
    }
    return 'Not found';
  } catch(e) {
    return 'Error: ' + e.message;
  }
})()

## Return values:
- Async functions (with await, promises, timeouts) are automatically handled
- Returns strings, numbers, booleans, and serialized objects/arrays
- Use JSON.stringify() for complex objects: JSON.stringify(Array.from(document.querySelectorAll('a')).map(el => el.textContent.trim()))

OUTPUT FORMAT:
Return ONLY the JavaScript code, no explanations or markdown formatting."""

                # Initialize message history for iterative prompting
                from browser_use.llm.messages import SystemMessage, UserMessage
                message_history = [SystemMessage(content=base_system_prompt)]

                # Initial user prompt
                initial_user_prompt = f"""Current Page URL: {current_url}

USER REQUIREMENT: {params.code_requirement}

Web Page Html Content:
{web_page_html}

Generate JavaScript code to fulfill the requirement:"""

                message_history.append(UserMessage(content=initial_user_prompt))

                # Get CDP session for JavaScript execution
                cdp_session = await browser_session.get_or_create_cdp_session()

                # Iterative code generation and execution
                for iteration in range(1, MAX_ITERATIONS + 1):
                    try:
                        logger.info(f'🔄 Skill Code iteration {iteration}/{MAX_ITERATIONS}')

                        # Generate JavaScript code using LLM with message history
                        response = await asyncio.wait_for(
                            page_extraction_llm.ainvoke(message_history),
                            timeout=60.0,
                        )

                        generated_js_code = response.completion.strip()
                        # NOTE(review): the raw (possibly markdown-fenced) reply is
                        # appended to history before fences are stripped below.
                        message_history.append(AssistantMessage(content=generated_js_code))

                        # Clean up the generated code (remove markdown if present)
                        if generated_js_code.startswith('```javascript'):
                            generated_js_code = generated_js_code.replace('```javascript', '').replace('```',
                                                                                                       '').strip()
                        elif generated_js_code.startswith('```js'):
                            generated_js_code = generated_js_code.replace('```js', '').replace('```', '').strip()
                        elif generated_js_code.startswith('```'):
                            generated_js_code = generated_js_code.replace('```', '').strip()

                        # Execute the generated JavaScript code
                        try:
                            logger.info(generated_js_code)
                            # Always use awaitPromise=True - it's ignored for non-promises
                            result = await cdp_session.cdp_client.send.Runtime.evaluate(
                                params={'expression': generated_js_code, 'returnByValue': True, 'awaitPromise': True},
                                session_id=cdp_session.session_id,
                            )

                            logger.info(result)
                            # Check for JavaScript execution errors
                            if result.get('exceptionDetails'):
                                exception = result['exceptionDetails']
                                error_msg = f'JavaScript execution error: {exception.get("text", "Unknown error")}'
                                if 'lineNumber' in exception:
                                    error_msg += f' at line {exception["lineNumber"]}'

                                # Add error feedback to message history for next iteration
                                if iteration < MAX_ITERATIONS:
                                    error_feedback = f"""The previous JavaScript code failed with error:
{error_msg}

Please fix the error and generate corrected JavaScript code:"""
                                    message_history.append(UserMessage(content=error_feedback))
                                    continue  # Try next iteration
                                else:
                                    # Final iteration, return error
                                    msg = f'Requirement: {params.code_requirement}\n\nFinal Generated Code (Iteration {iteration}): {generated_js_code}\n\nError: {error_msg}'
                                    logger.info(msg)
                                    return ActionResult(error=msg)

                            # Get the result data
                            result_data = result.get('result', {})

                            # Check for wasThrown flag (backup error detection)
                            if result_data.get('wasThrown'):
                                error_msg = 'JavaScript execution failed (wasThrown=true)'

                                # Add error feedback to message history for next iteration
                                if iteration < MAX_ITERATIONS:
                                    error_feedback = f"""The previous JavaScript code failed with error:
{error_msg}

Please fix the error and generate corrected JavaScript code:"""
                                    message_history.append(UserMessage(content=error_feedback))
                                    continue  # Try next iteration
                                else:
                                    # Final iteration, return error
                                    msg = f'Requirement: {params.code_requirement}\n\nFinal Generated Code (Iteration {iteration}): {generated_js_code}\n\nError: {error_msg}'
                                    logger.info(msg)
                                    return ActionResult(error=msg)

                            # Get the actual value
                            value = result_data.get('value')

                            # Handle different value types
                            if value is None:
                                # Could be legitimate null/undefined result
                                result_text = str(value) if 'value' in result_data else 'undefined'
                            elif isinstance(value, (dict, list)):
                                # Complex objects - should be serialized by returnByValue
                                try:
                                    result_text = json.dumps(value, ensure_ascii=False)
                                except (TypeError, ValueError):
                                    # Fallback for non-serializable objects
                                    result_text = str(value)
                            else:
                                # Primitive values (string, number, boolean)
                                result_text = str(value)

                            # Check if result is empty or meaningless
                            if (not result_text or
                                    result_text.strip() in ['', 'null', 'undefined', '[]', '{}'] or
                                    len(result_text.strip()) == 0):

                                # Add empty result feedback to message history for next iteration
                                if iteration < MAX_ITERATIONS:
                                    empty_feedback = f"""The previous JavaScript code executed successfully but returned empty/meaningless result:
Result: {result_text}

The result is empty or not useful. Please generate improved JavaScript code that returns meaningful data:"""
                                    message_history.append(UserMessage(content=empty_feedback))
                                    continue  # Try next iteration
                                else:
                                    # Final iteration, return empty result with warning
                                    msg = f'Requirement: {params.code_requirement}\n\nFinal Generated Code (Iteration {iteration}): {generated_js_code}\n\nWarning: Empty or meaningless result: {result_text}'
                                    logger.info(msg)
                                    return ActionResult(
                                        extracted_content=msg,
                                        long_term_memory=f'Generated JavaScript code (iteration {iteration}) for requirement: {params.code_requirement} - Empty result warning',
                                    )

                            # Apply length limit with better truncation
                            if len(result_text) > 30000:
                                result_text = result_text[:30000] + '\n... [Truncated after 30000 characters]'

                            # Success! Return the result
                            msg = f'Requirement: {params.code_requirement}\n\nGenerated Code (Iteration {iteration}): \n```javascript\n{generated_js_code}\n```\nResult: {result_text}'
                            logger.info(f'✅ Skill Code succeeded on iteration {iteration}')

                            return ActionResult(
                                extracted_content=msg,
                                long_term_memory=f'Generated and executed JavaScript code (iteration {iteration}) for requirement: {params.code_requirement}',
                            )

                        except Exception as e:
                            # CDP communication or other system errors
                            error_msg = f'Failed to execute JavaScript: {type(e).__name__}: {e}'

                            # Add system error feedback to message history for next iteration
                            if iteration < MAX_ITERATIONS:
                                system_error_feedback = f"""The previous JavaScript code failed to execute due to system error:
{error_msg}

Please generate alternative JavaScript code that avoids this system error:"""
                                message_history.append(UserMessage(content=system_error_feedback))
                                continue  # Try next iteration
                            else:
                                # Final iteration, return system error
                                error_msg = f'Requirement: {params.code_requirement}\n\nFinal Generated Code (Iteration {iteration}): {generated_js_code}\n\nError: {error_msg}'
                                logger.info(error_msg)
                                return ActionResult(error=error_msg)

                    except Exception as e:
                        # LLM generation error
                        logger.error(f'❌ LLM generation failed on iteration {iteration}: {e}')
                        if iteration == MAX_ITERATIONS:
                            return ActionResult(
                                error=f'LLM generation failed after {MAX_ITERATIONS} iterations: {str(e)}')
                        continue  # Try next iteration with same message history

                # Should not reach here, but just in case
                return ActionResult(error=f'Skill code failed after {MAX_ITERATIONS} iterations')

            except Exception as e:
                logger.error(f'❌ Skill Code failed: {e}')
                return ActionResult(error=f'Skill code failed: {str(e)}')
769
+
770
@self.registry.action(
    'Skill: Deep research mode - Only return the guideline for deep research. Please follow the guideline to do real deep research actions.',
    param_model=NoParamsAction,
)
async def skill_deep_research(
    _: NoParamsAction,
):
    """Skill: return the static deep-research guideline text (performs no browser actions itself)."""
    # Plain literal — nothing is interpolated, so no f-string is needed.
    guideline = """
    🔬 **DEEP RESEARCH GUIDELINE**

    To proceed with comprehensive research, please:

    1. **Set up a detailed TODO list** for this research project that includes:
       - Background research and context gathering
       - Key questions to investigate
       - Multiple source verification
       - Data collection and analysis steps
       - Report generation with proper citations

    2. **Conduct systematic research** following these principles:
       - Use multiple search strategies and sources
       - Verify information across different platforms
       - Document all sources with URLs for citation
       - Take notes and screenshots of key findings
       - Organize findings by themes or categories

    3. **Generate a comprehensive report** that includes:
       - Executive summary
       - Detailed findings with analysis
       - Proper citations and source references
       - Supporting evidence (screenshots, quotes)
       - Conclusions and recommendations
       - Areas for further investigation

    4. **Maintain research traceability** by:
       - Recording all search queries used
       - Saving important URLs and sources
       - Including direct quotes with attribution
       - Documenting methodology and approach

    This deep research mode ensures thorough, traceable, and well-documented investigation of your topic with proper academic rigor and source citation.
    """

    return ActionResult(
        extracted_content=guideline,
        include_extracted_content_only_once=True,
    )
820
+
821
async def _perform_google_search(self, browser_session, query: str, llm: BaseChatModel):
    """Perform a Google search for ``query`` and return up to 5 structured results.

    Navigates the session to a Google results page, asks the LLM to extract the
    top hits as a JSON array, and falls back to a single raw-text pseudo-result
    when parsing fails entirely.

    Args:
        browser_session: Active browser session used for navigation/extraction.
        query: Search terms to submit to Google.
        llm: Chat model used for structured content extraction.

    Returns:
        list[dict]: up to 5 ``{"title", "url", "summary"}`` dicts; an empty
        list when navigation or extraction raises.
    """
    try:
        # udm=14 requests Google's plain "web" results view (fewer widgets).
        search_url = f'https://www.google.com/search?q={query}&udm=14'
        await browser_session.navigate_to_url(search_url, new_tab=False)

        # Give the results page a moment to render before extraction.
        await asyncio.sleep(1)

        # Instructions for the LLM structured-extraction pass.
        extraction_query = f"""
        Extract the top 5 search results from this Google search page. For each result, provide:
        - title: The clickable title/headline
        - url: The website URL
        - summary: A brief description of what this result contains

        Return results as a JSON array: [{{"title": "...", "url": "...", "summary": "..."}}, ...]
        """

        results_text = await self._extract_structured_content(browser_session, extraction_query, llm)

        # Try strict JSON first, then attempt repair of malformed output.
        # (Module-level `json` import is used; the former local re-import shadowed it.)
        try:
            results = json.loads(results_text.strip())
            if isinstance(results, list):
                return results[:5]  # Ensure max 5 results
        except (json.JSONDecodeError, ValueError):
            try:
                # BUG FIX: repair_json() returns a repaired JSON *string* by
                # default, which could never satisfy isinstance(..., list) —
                # return_objects=True makes it return parsed Python objects.
                results = repair_json(results_text.strip(), return_objects=True)
                if isinstance(results, list):
                    return results[:5]  # Ensure max 5 results
            except Exception:
                logger.warning(f"Failed to parse JSON from search results: {results_text}")

        # Fallback: return raw text as single result
        current_url = await browser_session.get_current_page_url()
        return [{
            "title": f"Search results for: {query}",
            "url": current_url,
            "summary": results_text[:200] + "..." if len(results_text) > 200 else results_text
        }]

    except Exception as e:
        logger.error(f"Google search failed for query '{query}': {e}")
        return []
868
+
869
async def _extract_structured_content(self, browser_session, query: str, llm: BaseChatModel):
    """Answer ``query`` from the current page by converting it to markdown and asking the LLM.

    Raises:
        RuntimeError: when markdown extraction or the LLM call fails.
    """
    MAX_CHAR_LIMIT = 30000

    # Convert the live page into noise-reduced markdown.
    try:
        content, _stats = await self.extract_clean_markdown(browser_session, extract_links=False)
    except Exception as e:
        raise RuntimeError(f'Could not extract clean markdown: {type(e).__name__}')

    # When over budget, cut at a paragraph break near the limit if possible,
    # otherwise at a sentence end, otherwise hard-truncate.
    if len(content) > MAX_CHAR_LIMIT:
        cut = MAX_CHAR_LIMIT
        para_break = content.rfind('\n\n', MAX_CHAR_LIMIT - 500, MAX_CHAR_LIMIT)
        if para_break > 0:
            cut = para_break
        else:
            sentence_end = content.rfind('.', MAX_CHAR_LIMIT - 200, MAX_CHAR_LIMIT)
            if sentence_end > 0:
                cut = sentence_end + 1
        content = content[:cut]

    system_prompt = """
You are an expert at extracting data from the markdown of a webpage.

<input>
You will be given a query and the markdown of a webpage that has been filtered to remove noise and advertising content.
</input>

<instructions>
- You are tasked to extract information from the webpage that is relevant to the query.
- You should ONLY use the information available in the webpage to answer the query. Do not make up information or provide guess from your own knowledge.
- If the information relevant to the query is not available in the page, your response should mention that.
- If the query asks for all items, products, etc., make sure to directly list all of them.
</instructions>

<output>
- Your output should present ALL the information relevant to the query in a concise way.
- Do not answer in conversational format - directly output the relevant information or that the information is unavailable.
</output>
""".strip()

    prompt = f'<query>\n{query}\n</query>\n\n<webpage_content>\n{content}\n</webpage_content>'

    # Single LLM round-trip, bounded at two minutes.
    try:
        from browser_use.llm.messages import SystemMessage, UserMessage
        messages = [SystemMessage(content=system_prompt), UserMessage(content=prompt)]
        response = await asyncio.wait_for(llm.ainvoke(messages), timeout=120.0)
        return response.completion
    except Exception as e:
        logger.debug(f'Error extracting content: {e}')
        raise RuntimeError(str(e))
924
+
925
async def extract_clean_markdown(
    self, browser_session: BrowserSession, extract_links: bool = True
) -> tuple[str, dict[str, Any]]:
    """Convert the current page's HTML into cleaned markdown.

    Args:
        browser_session: Session whose active page is converted.
        extract_links: Keep hyperlinks in the markdown when True.

    Returns:
        ``(markdown, stats)`` where stats records character counts at each
        processing stage plus the page URL.

    Raises:
        RuntimeError: when the page HTML cannot be fetched over CDP.
    """
    import re

    # Pull the full document HTML (and URL) through the CDP session.
    cdp_session = await browser_session.get_or_create_cdp_session()
    try:
        doc = await cdp_session.cdp_client.send.DOM.getDocument(session_id=cdp_session.session_id)
        outer = await cdp_session.cdp_client.send.DOM.getOuterHTML(
            params={'backendNodeId': doc['root']['backendNodeId']}, session_id=cdp_session.session_id
        )
        page_html = outer['outerHTML']
        current_url = await browser_session.get_current_page_url()
    except Exception as e:
        raise RuntimeError(f"Couldn't extract page content: {e}")

    original_html_length = len(page_html)

    # html2text does the heavy lifting of HTML -> markdown conversion.
    import html2text

    converter = html2text.HTML2Text()
    converter.ignore_links = not extract_links
    converter.ignore_images = True
    converter.ignore_emphasis = False
    converter.body_width = 0  # Don't wrap lines
    converter.unicode_snob = True
    converter.skip_internal_links = True
    content = converter.handle(page_html)

    initial_markdown_length = len(content)

    # Strip any leftover percent-encoded sequences, then lightly clean whitespace.
    content = re.sub(r'%[0-9A-Fa-f]{2}', '', content)
    content, chars_filtered = self._preprocess_markdown_content(content)

    stats = {
        'url': current_url,
        'original_html_chars': original_html_length,
        'initial_markdown_chars': initial_markdown_length,
        'filtered_chars_removed': chars_filtered,
        'final_filtered_chars': len(content),
    }

    return content, stats
977
+
978
+ def _preprocess_markdown_content(self, content: str, max_newlines: int = 3) -> tuple[str, int]:
979
+ """Light preprocessing of html2text output - minimal cleanup since html2text is already clean."""
980
+ import re
981
+
982
+ original_length = len(content)
983
+
984
+ # Compress consecutive newlines (4+ newlines become max_newlines)
985
+ content = re.sub(r'\n{4,}', '\n' * max_newlines, content)
986
+
987
+ # Remove lines that are only whitespace or very short (likely artifacts)
988
+ lines = content.split('\n')
989
+ filtered_lines = []
990
+ for line in lines:
991
+ stripped = line.strip()
992
+ # Keep lines with substantial content (html2text output is already clean)
993
+ if len(stripped) > 2:
994
+ filtered_lines.append(line)
995
+
996
+ content = '\n'.join(filtered_lines)
997
+ content = content.strip()
998
+
999
+ chars_filtered = original_length - len(content)
1000
+ return content, chars_filtered
1001
+
53
1002
  def _register_browser_use_agent(self):
54
1003
  @self.registry.action(
55
1004
  'Execute browser_use agent tasks. Supports both single task execution (list length=1) and '