vibesurf 0.1.22__py3-none-any.whl → 0.1.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic; see the registry's advisory for details.

@@ -8,8 +8,12 @@ import json
 import enum
 import base64
 import mimetypes
+import yfinance as yf
+import pprint
+from json_repair import repair_json
 from datetime import datetime
 from typing import Optional, Type, Callable, Dict, Any, Union, Awaitable, TypeVar
+from pathvalidate import sanitize_filename
 from pydantic import BaseModel
 from browser_use.tools.service import Controller, Tools, handle_browser_error
 import logging
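The new third-party imports here are small utilities rather than framework pieces: yfinance backs the finance skill, json_repair salvages malformed JSON emitted by LLMs, and pathvalidate makes page titles safe to use as file names. A minimal sketch of the latter two, assuming the published json_repair and pathvalidate PyPI packages (printed outputs are approximate):

```python
# Sketch of the two new text utilities pulled in by this release
# (assumes the json_repair and pathvalidate PyPI packages are installed).
import json
from json_repair import repair_json
from pathvalidate import sanitize_filename

# repair_json fixes malformed JSON such as trailing commas; by default it
# returns a string, so callers still need json.loads on the result.
fixed = repair_json('["query 1", "query 2",]')
print(json.loads(fixed))  # roughly: ['query 1', 'query 2']

# sanitize_filename strips characters that are illegal in file names,
# which is how the new screenshot skill derives names from page titles.
print(sanitize_filename('Report: Q3/Q4 <draft>.png'))  # roughly: 'Report Q3Q4 draft.png'
```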
@@ -18,18 +22,22 @@ from browser_use.utils import time_execution_sync
 from browser_use.filesystem.file_system import FileSystem
 from browser_use.browser import BrowserSession
 from browser_use.llm.base import BaseChatModel
-from browser_use.llm.messages import UserMessage, ContentPartTextParam, ContentPartImageParam, ImageURL
+from browser_use.llm.messages import UserMessage, ContentPartTextParam, ContentPartImageParam, ImageURL, \
+    AssistantMessage
 from browser_use.dom.service import EnhancedDOMTreeNode
 from browser_use.browser.views import BrowserError
 from browser_use.mcp.client import MCPClient
-
+from browser_use.tools.views import NoParamsAction
 from vibe_surf.browser.agent_browser_session import AgentBrowserSession
 from vibe_surf.tools.views import HoverAction, ExtractionAction, FileExtractionAction, BrowserUseAgentExecution, \
-    ReportWriterTask, TodoGenerateAction, TodoModifyAction, VibeSurfDoneAction
+    ReportWriterTask, TodoGenerateAction, TodoModifyAction, VibeSurfDoneAction, SkillSearchAction, SkillCrawlAction, \
+    SkillSummaryAction, SkillTakeScreenshotAction, SkillDeepResearchAction, SkillCodeAction, SkillFinanceAction
+from vibe_surf.tools.finance_tools import FinanceDataRetriever, FinanceMarkdownFormatter, FinanceMethod
 from vibe_surf.tools.mcp_client import CustomMCPClient
 from vibe_surf.tools.file_system import CustomFileSystem
 from vibe_surf.browser.browser_manager import BrowserManager
-
+from vibe_surf.tools.vibesurf_registry import VibeSurfRegistry
+from bs4 import BeautifulSoup
 from vibe_surf.logger import get_logger
 
 logger = get_logger(__name__)
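The BeautifulSoup import feeds the new clean_html_basic helper in the next hunk, which shrinks a page down to a whitelist of structurally useful attributes before handing it to an LLM. A toy version of that idea, assuming the beautifulsoup4 package (output shown is approximate):

```python
# Toy version of the attribute-whitelist cleanup done by clean_html_basic
# in the next hunk (assumes the beautifulsoup4 PyPI package).
from bs4 import BeautifulSoup

KEEP = {'id', 'class', 'href'}
soup = BeautifulSoup(
    '<div id="main" class="a b" onclick="track()" style="color:red">'
    '<a href="/x" rel="noopener">link</a></div>',
    'html.parser',
)
for tag in soup.find_all():
    # Drop every attribute that is not structurally useful for an LLM.
    tag.attrs = {k: v for k, v in tag.attrs.items() if k in KEEP}
print(soup)  # roughly: <div class="a b" id="main"><a href="/x">link</a></div>
```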
@@ -39,17 +47,1047 @@ Context = TypeVar('Context')
 T = TypeVar('T', bound=BaseModel)
 
 
+def clean_html_basic(page_html_content, max_text_length=100):
+    soup = BeautifulSoup(page_html_content, 'html.parser')
+
+    for script in soup(["script", "style"]):
+        script.decompose()
+
+    from bs4 import Comment
+    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
+    for comment in comments:
+        comment.extract()
+
+    for text_node in soup.find_all(string=True):
+        if text_node.parent.name not in ['script', 'style']:
+            clean_text = ' '.join(text_node.split())
+
+            if len(clean_text) > max_text_length:
+                clean_text = clean_text[:max_text_length].rstrip() + "..."
+
+            if clean_text != text_node:
+                text_node.replace_with(clean_text)
+
+    important_attrs = ['id', 'class', 'name', 'role', 'type',
+                       'colspan', 'rowspan', 'headers', 'scope',
+                       'href', 'src', 'alt', 'title']
+
+    for tag in soup.find_all():
+        attrs_to_keep = {}
+        for attr in list(tag.attrs.keys()):
+            if (attr in important_attrs or
+                    attr.startswith('data-') or
+                    attr.startswith('aria-')):
+                attrs_to_keep[attr] = tag.attrs[attr]
+        tag.attrs = attrs_to_keep
+
+    return str(soup)
+
+
+def get_sibling_position(node: EnhancedDOMTreeNode) -> int:
+    """Get the position of node among its siblings with the same tag"""
+    if not node.parent_node:
+        return 1
+
+    tag_name = node.tag_name
+    position = 1
+
+    # Find siblings with same tag name before this node
+    for sibling in node.parent_node.children:
+        if sibling == node:
+            break
+        if sibling.tag_name == tag_name:
+            position += 1
+
+    return position
+
+
+def extract_css_hints(node: EnhancedDOMTreeNode) -> dict:
+    """Extract CSS selector construction hints"""
+    hints = {}
+
+    if "id" in node.attributes:
+        hints["id"] = f"#{node.attributes['id']}"
+
+    if "class" in node.attributes:
+        classes = node.attributes["class"].split()
+        hints["class"] = f".{'.'.join(classes[:3])}"  # Limit class count
+
+    # Attribute selector hints
+    for attr in ["name", "data-testid", "type"]:
+        if attr in node.attributes:
+            hints[f"attr_{attr}"] = f"[{attr}='{node.attributes[attr]}']"
+
+    return hints
+
+
+def convert_selector_map_for_llm(selector_map) -> dict:
+    """
+    Convert complex selector_map to simplified format suitable for LLM understanding and JS code writing
+    """
+    simplified_elements = []
+
+    for element_index, node in selector_map.items():
+        if node.is_visible and node.element_index is not None:  # Only include visible interactive elements
+            element_info = {
+                "tag": node.tag_name,
+                "text": node.get_meaningful_text_for_llm()[:200],  # Limit text length
+
+                # Selector information - most needed for JS code
+                "selectors": {
+                    "xpath": node.xpath,
+                    "css_hints": extract_css_hints(node),  # Extract id, class etc
+                },
+
+                # Element semantics
+                "role": node.ax_node.role if node.ax_node else None,
+                "type": node.attributes.get("type"),
+                "aria_label": node.attributes.get("aria-label"),
+
+                # Key attributes
+                "attributes": {k: v for k, v in node.attributes.items()
+                               if k in ["id", "class", "name", "href", "src", "value", "placeholder", "data-testid"]},
+
+                # Interactivity
+                "is_clickable": node.snapshot_node.is_clickable if node.snapshot_node else False,
+                "is_input": node.tag_name.lower() in ["input", "textarea", "select"],
+
+                # Structure information
+                "parent_tag": node.parent_node.tag_name if node.parent_node else None,
+                "position_info": f"{node.tag_name}[{get_sibling_position(node)}]"
+            }
+            simplified_elements.append(element_info)
+
+    return {
+        "page_elements": simplified_elements,
+        "total_elements": len(simplified_elements)
+    }
+
+
 class VibeSurfTools:
     def __init__(self, exclude_actions: list[str] = [], mcp_server_config: Optional[Dict[str, Any]] = None):
-        self.registry = Registry(exclude_actions)
+        self.registry = VibeSurfRegistry(exclude_actions)
         self._register_file_actions()
         self._register_browser_use_agent()
         self._register_report_writer_agent()
         self._register_todo_actions()
         self._register_done_action()
+        self._register_skills()
         self.mcp_server_config = mcp_server_config
         self.mcp_clients: Dict[str, MCPClient] = {}
 
+    def _register_skills(self):
+        @self.registry.action(
+            'Skill: Advanced parallel search - analyze user intent and generate 5 different search tasks, perform parallel Google searches, and return top 10 most relevant results',
+            param_model=SkillSearchAction,
+        )
+        async def skill_search(
+                params: SkillSearchAction,
+                browser_manager: BrowserManager,
+                page_extraction_llm: BaseChatModel
+        ):
+            """
+            Skill: Advanced parallel search with LLM-generated search strategies
+            """
+            llm = page_extraction_llm
+            agent_ids = []
+            try:
+                if not llm:
+                    raise RuntimeError("LLM is required for skill_search")
+
+                # Step 1: Use LLM to analyze user intent and generate different search tasks
+                from datetime import datetime
+                analysis_prompt = f"""
+                Analyze the user query and generate 5 different Google search strategies to comprehensively find relevant information.
+
+                Current Time: {datetime.now().isoformat()}
+
+                User Query: "{params.query}"
+
+                Generate 5 different search queries that approach this topic from different angles. Each search should be:
+                1. Specific and concrete (good for Google search)
+                2. Different from the others (different perspectives/aspects)
+                3. Likely to return valuable, unique information
+
+                Return your response as a JSON array of 5 search query strings.
+                Example format: ["query 1", "query 2", "query 3", "query 4", "query 5"]
+                """
+
+                from browser_use.llm.messages import SystemMessage, UserMessage
+                response = await llm.ainvoke([
+                    SystemMessage(content="You are an expert at generating comprehensive search strategies."),
+                    UserMessage(content=analysis_prompt)
+                ])
+
+                # Parse the search queries
+                import json
+                try:
+                    search_queries = json.loads(response.completion.strip())
+                    if not isinstance(search_queries, list):
+                        raise ValueError("Invalid search queries format")
+                    search_queries = search_queries[:5]
+                except (json.JSONDecodeError, ValueError):
+                    # Fallback to simple queries if parsing fails
+                    try:
+                        from json_repair import repair_json
+                        search_queries = repair_json(response.completion.strip())
+                    except Exception as e:
+                        search_queries = [
+                            params.query,
+                            f"{params.query} guide",
+                            f"{params.query} best practices",
+                            f"{params.query} examples",
+                            f"{params.query} latest news"
+                        ]
+
+                # Step 2: Create browser sessions for parallel searching
+                register_sessions = []
+
+                for i, query in enumerate(search_queries):
+                    agent_id = f"search_agent_{i + 1:03d}"
+                    register_sessions.append(
+                        browser_manager.register_agent(agent_id, target_id=None)
+                    )
+                    agent_ids.append(agent_id)
+
+                agent_browser_sessions = await asyncio.gather(*register_sessions)
+
+                # Step 3: Perform parallel Google searches
+                search_tasks = []
+                for i, (browser_session, query) in enumerate(zip(agent_browser_sessions, search_queries)):
+                    search_tasks.append(self._perform_google_search(browser_session, query, llm))
+
+                search_results = await asyncio.gather(*search_tasks, return_exceptions=True)
+
+                # Step 4: Aggregate and filter results
+                all_results = []
+                for i, result in enumerate(search_results):
+                    if isinstance(result, Exception):
+                        logger.error(f"Search task {i + 1} failed: {result}")
+                        continue
+                    if result:
+                        all_results.extend(result)
+
+                # Step 5: Use LLM to deduplicate and rank top 10 results
+                if all_results:
+                    ranking_prompt = f"""
+                    Given these search results for the query "{params.query}", please:
+                    1. Remove duplicates (same or very similar content)
+                    2. Rank by relevance and value to the user
+                    3. Select the TOP 10 most relevant and valuable results
+
+                    Search Results:
+                    {json.dumps(all_results, indent=2)}
+
+                    Return the top 10 results as a JSON array, with each result containing:
+                    - title: string
+                    - url: string
+                    - summary: string (brief description of why this result is valuable)
+
+                    Format: [{{"title": "...", "url": "...", "summary": "..."}}, ...]
+                    """
+
+                    ranking_response = await llm.ainvoke([
+                        SystemMessage(
+                            content="You are an expert at evaluating and ranking search results for relevance and value."),
+                        UserMessage(content=ranking_prompt)
+                    ])
+
+                    try:
+                        top_results = json.loads(ranking_response.completion.strip())
+                        if not isinstance(top_results, list):
+                            raise ValueError("Invalid ranking results format")
+                    except (json.JSONDecodeError, ValueError):
+                        # Fallback to first 10 results if ranking fails
+                        top_results = all_results[:10]
+                else:
+                    top_results = []
+
+                # Format results for display
+                if top_results:
+                    results_text = f"🔍 Advanced Search Results for '{params.query}':\n\n"
+                    for i, result in enumerate(top_results[:10]):
+                        title = result.get('title', 'Unknown Title')
+                        url = result.get('url', 'No URL')
+                        summary = result.get('summary', 'No summary available')
+                        results_text += f"{i}. **{title}**\n URL: {url}\n Summary: {summary}\n\n"
+                else:
+                    results_text = f"No results found for query: {params.query}"
+
+                logger.info(f'🔍 Skill Search completed for: {params.query}')
+                return ActionResult(
+                    extracted_content=results_text,
+                    include_extracted_content_only_once=True,
+                    long_term_memory=f'Advanced search completed for: {params.query}, found {len(top_results)} relevant results',
+                )
+
+            except Exception as e:
+                logger.error(f'❌ Skill Search failed: {e}')
+                return ActionResult(error=f'Skill search failed: {str(e)}')
+            finally:
+                for i, agent_id in enumerate(agent_ids):
+                    await browser_manager.unregister_agent(agent_id, close_tabs=True)
+
+        @self.registry.action(
+            'Skill: Crawl a web page and extract structured information from a webpage with optional tab selection',
+            param_model=SkillCrawlAction,
+        )
+        async def skill_crawl(
+                params: SkillCrawlAction,
+                browser_manager: BrowserManager,
+                page_extraction_llm: BaseChatModel
+        ):
+            """
+            Skill: Extract structured content from current or specified webpage
+            """
+            llm = page_extraction_llm
+            try:
+                if not llm:
+                    raise RuntimeError("LLM is required for skill_crawl")
+
+                # Get browser session
+                browser_session = browser_manager.main_browser_session
+
+                # If tab_id is provided, switch to that tab
+                if params.tab_id:
+                    target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
+                    await browser_session.get_or_create_cdp_session(target_id, focus=True)
+
+                # Extract structured content using the existing method
+                extracted_content = await self._extract_structured_content(
+                    browser_session, params.query, llm
+                )
+
+                current_url = await browser_session.get_current_page_url()
+                result_text = f'### URL:{current_url}\n\n{extracted_content}'
+
+                # Handle memory storage
+                MAX_MEMORY_LENGTH = 1000
+                if len(result_text) < MAX_MEMORY_LENGTH:
+                    memory = result_text
+                    include_extracted_content_only_once = False
+                else:
+                    memory = f'Extracted structured content from {current_url} for query: {params.query}'
+                    include_extracted_content_only_once = True
+
+                logger.info(f'📄 Skill Crawl completed for: {current_url}')
+                return ActionResult(
+                    extracted_content=result_text,
+                    include_extracted_content_only_once=include_extracted_content_only_once,
+                    long_term_memory=memory,
+                )
+
+            except Exception as e:
+                logger.error(f'❌ Skill Crawl failed: {e}')
+                return ActionResult(error=f'Skill crawl failed: {str(e)}')
+
+        @self.registry.action(
+            'Skill: Summarize webpage content with optional tab selection',
+            param_model=SkillSummaryAction,
+        )
+        async def skill_summary(
+                params: SkillSummaryAction,
+                browser_manager: BrowserManager,
+                page_extraction_llm: BaseChatModel
+        ):
+            """
+            Skill: Summarize webpage content using LLM
+            """
+            llm = page_extraction_llm
+            try:
+                if not llm:
+                    raise RuntimeError("LLM is required for skill_summary")
+
+                # Get browser session
+                browser_session = browser_manager.main_browser_session
+
+                # If tab_id is provided, switch to that tab
+                if params.tab_id:
+                    target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
+                    await browser_session.get_or_create_cdp_session(target_id, focus=True)
+
+                # Extract and summarize content
+                summary = await self._extract_structured_content(
+                    browser_session, "Provide a comprehensive summary of this webpage", llm
+                )
+
+                current_url = await browser_session.get_current_page_url()
+                result_text = f'📝 Summary of {current_url}:\n\n{summary}'
+
+                # Handle memory storage
+                MAX_MEMORY_LENGTH = 1000
+                if len(result_text) < MAX_MEMORY_LENGTH:
+                    memory = result_text
+                    include_extracted_content_only_once = False
+                else:
+                    memory = f'Summarized webpage: {current_url}'
+                    include_extracted_content_only_once = True
+
+                logger.info(f'📝 Skill Summary completed for: {current_url}')
+                return ActionResult(
+                    extracted_content=result_text,
+                    include_extracted_content_only_once=include_extracted_content_only_once,
+                    long_term_memory=memory,
+                )
+
+            except Exception as e:
+                logger.error(f'❌ Skill Summary failed: {e}')
+                return ActionResult(error=f'Skill summary failed: {str(e)}')
+
+        @self.registry.action(
+            'Skill: Take screenshot of current page or specified tab',
+            param_model=SkillTakeScreenshotAction,
+        )
+        async def skill_screenshot(
+                params: SkillTakeScreenshotAction,
+                browser_manager: BrowserManager,
+                file_system: CustomFileSystem
+        ):
+            """
+            Skill: Take screenshot with optional tab selection
+            """
+            try:
+                # Get browser session
+                browser_session = browser_manager.main_browser_session
+
+                # If tab_id is provided, switch to that tab
+                if params.tab_id:
+                    target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
+                    await browser_session.get_or_create_cdp_session(target_id, focus=True)
+
+                # Take screenshot using browser session
+                screenshot = await browser_session.take_screenshot()
+
+                # Generate timestamp for filename
+                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+                # Get file system directory path (Path type)
+                fs_dir = file_system.get_dir()
+
+                # Create screenshots directory if it doesn't exist
+                screenshots_dir = fs_dir / "screenshots"
+                screenshots_dir.mkdir(exist_ok=True)
+
+                # Save screenshot to file system
+                page_title = await browser_session.get_current_page_title()
+                from pathvalidate import sanitize_filename
+                page_title = sanitize_filename(page_title)
+                filename = f"{page_title}-{timestamp}.png"
+                filepath = screenshots_dir / filename
+
+                with open(filepath, "wb") as f:
+                    f.write(base64.b64decode(screenshot))
+
+                msg = f'📸 Screenshot saved to path: {str(filepath.relative_to(fs_dir))}'
+                logger.info(msg)
+                return ActionResult(
+                    extracted_content=msg,
+                    include_in_memory=True,
+                    long_term_memory=f'Screenshot saved to {str(filepath.relative_to(fs_dir))}',
+                )
+
+            except Exception as e:
+                error_msg = f'❌ Failed to take screenshot: {str(e)}'
+                logger.error(error_msg)
+                return ActionResult(error=error_msg)
+
+        @self.registry.action(
+            'Skill: Execute JavaScript code on webpage with optional tab selection - accepts functional requirements, code prompts, or code snippets that will be processed by LLM to generate proper executable JavaScript',
+            param_model=SkillCodeAction,
+        )
+        async def skill_code(
+                params: SkillCodeAction,
+                browser_manager: BrowserManager,
+                page_extraction_llm: BaseChatModel,
+        ):
+            """
+            Skill: Generate and execute JavaScript code from functional requirements or code prompts with iterative retry logic
+            """
+            MAX_ITERATIONS = 5
+
+            try:
+                if not page_extraction_llm:
+                    raise RuntimeError("LLM is required for skill_code")
+
+                # Get browser session
+                browser_session = browser_manager.main_browser_session
+
+                # If tab_id is provided, switch to that tab
+                if params.tab_id:
+                    target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)
+                    await browser_session.get_or_create_cdp_session(target_id, focus=True)
+
+                # Get browser state and convert for LLM
+                # browser_state = await browser_session.get_browser_state_summary()
+                # web_page_description = browser_state.dom_state.llm_representation()
+
+                page_html_content = await browser_session.get_html_content()
+                web_page_html = clean_html_basic(page_html_content)
+                if len(web_page_html) > 30000:
+                    web_page_html = web_page_html[:24000] + "..." + web_page_html[-6000:]
+
+                # Get current page URL for context
+                current_url = await browser_session.get_current_page_url()
+
+                # Create base system prompt for JavaScript code generation
+                base_system_prompt = """You are an expert JavaScript developer specializing in browser automation and DOM manipulation.
+
+You will be given a functional requirement or code prompt, along with the current page's DOM structure information.
+Your task is to generate valid, executable JavaScript code that accomplishes the specified requirement.
+
+IMPORTANT GUIDELINES:
+This JavaScript code gets executed with Runtime.evaluate and 'returnByValue': True, 'awaitPromise': True
+
+SYNTAX RULES - FAILURE TO FOLLOW CAUSES "Uncaught at line 0" ERRORS:
+- ALWAYS wrap your code in IIFE: (function(){ ... })() or (async function(){ ... })() for async code
+- ALWAYS add try-catch blocks to prevent execution errors
+- ALWAYS use proper semicolons and valid JavaScript syntax
+- NEVER write multiline code without proper IIFE wrapping
+- ALWAYS validate elements exist before accessing them
+
+EXAMPLES:
+Use this tool when other tools do not work on the first try as expected or when a more general tool is needed, e.g. for filling a form all at once, hovering, dragging, extracting only links, extracting content from the page, press and hold, hovering, clicking on coordinates, zooming, use this if the user provides custom selectors which you can otherwise not interact with ....
+You can also use it to explore the website.
+- Write code to solve problems you could not solve with other tools.
+- Don't write comments in here, no human reads that.
+- Write only valid js code.
+- use this to e.g. extract + filter links, convert the page to json into the format you need etc...
+
+- limit the output otherwise your context will explode
+- think if you deal with special elements like iframes / shadow roots etc
+- Adopt your strategy for React Native Web, React, Angular, Vue, MUI pages etc.
+- e.g. with synthetic events, keyboard simulation, shadow DOM, etc.
+
+PROPER SYNTAX EXAMPLES:
+CORRECT: (function(){ try { const el = document.querySelector('#id'); return el ? el.value : 'not found'; } catch(e) { return 'Error: ' + e.message; } })()
+CORRECT: (async function(){ try { await new Promise(r => setTimeout(r, 100)); return 'done'; } catch(e) { return 'Error: ' + e.message; } })()
+
+WRONG: const el = document.querySelector('#id'); el ? el.value : '';
+WRONG: document.querySelector('#id').value
+WRONG: Multiline code without IIFE wrapping
+
+SHADOW DOM ACCESS EXAMPLE:
+(function(){
+    try {
+        const hosts = document.querySelectorAll('*');
+        for (let host of hosts) {
+            if (host.shadowRoot) {
+                const el = host.shadowRoot.querySelector('#target');
+                if (el) return el.textContent;
+            }
+        }
+        return 'Not found';
+    } catch(e) {
+        return 'Error: ' + e.message;
+    }
+})()
+
+## Return values:
+- Async functions (with await, promises, timeouts) are automatically handled
+- Returns strings, numbers, booleans, and serialized objects/arrays
+- Use JSON.stringify() for complex objects: JSON.stringify(Array.from(document.querySelectorAll('a')).map(el => el.textContent.trim()))
+
+OUTPUT FORMAT:
+Return ONLY the JavaScript code, no explanations or markdown formatting."""
+
+                # Initialize message history for iterative prompting
+                from browser_use.llm.messages import SystemMessage, UserMessage
+                message_history = [SystemMessage(content=base_system_prompt)]
+
+                # Initial user prompt
+                initial_user_prompt = f"""Current Page URL: {current_url}
+
+USER REQUIREMENT: {params.code_requirement}
+
+Web Page Html Content:
+{web_page_html}
+
+Generate JavaScript code to fulfill the requirement:"""
+
+                message_history.append(UserMessage(content=initial_user_prompt))
+
+                # Get CDP session for JavaScript execution
+                cdp_session = await browser_session.get_or_create_cdp_session()
+
+                # Iterative code generation and execution
+                for iteration in range(1, MAX_ITERATIONS + 1):
+                    try:
+                        logger.info(f'🔄 Skill Code iteration {iteration}/{MAX_ITERATIONS}')
+
+                        # Generate JavaScript code using LLM with message history
+                        response = await asyncio.wait_for(
+                            page_extraction_llm.ainvoke(message_history),
+                            timeout=60.0,
+                        )
+
+                        generated_js_code = response.completion.strip()
+                        message_history.append(AssistantMessage(content=generated_js_code))
+
+                        # Clean up the generated code (remove markdown if present)
+                        if generated_js_code.startswith('```javascript'):
+                            generated_js_code = generated_js_code.replace('```javascript', '').replace('```',
+                                                                                                       '').strip()
+                        elif generated_js_code.startswith('```js'):
+                            generated_js_code = generated_js_code.replace('```js', '').replace('```', '').strip()
+                        elif generated_js_code.startswith('```'):
+                            generated_js_code = generated_js_code.replace('```', '').strip()
+
+                        # Execute the generated JavaScript code
+                        try:
+                            logger.info(generated_js_code)
+                            # Always use awaitPromise=True - it's ignored for non-promises
+                            result = await cdp_session.cdp_client.send.Runtime.evaluate(
+                                params={'expression': generated_js_code, 'returnByValue': True, 'awaitPromise': True},
+                                session_id=cdp_session.session_id,
+                            )
+
+                            logger.info(result)
+                            # Check for JavaScript execution errors
+                            if result.get('exceptionDetails'):
+                                exception = result['exceptionDetails']
+                                error_msg = f'JavaScript execution error: {exception.get("text", "Unknown error")}'
+                                if 'lineNumber' in exception:
+                                    error_msg += f' at line {exception["lineNumber"]}'
+
+                                # Add error feedback to message history for next iteration
+                                if iteration < MAX_ITERATIONS:
+                                    error_feedback = f"""The previous JavaScript code failed with error:
+{error_msg}
+
+Please fix the error and generate corrected JavaScript code:"""
+                                    message_history.append(UserMessage(content=error_feedback))
+                                    continue  # Try next iteration
+                                else:
+                                    # Final iteration, return error
+                                    msg = f'Requirement: {params.code_requirement}\n\nFinal Generated Code (Iteration {iteration}): {generated_js_code}\n\nError: {error_msg}'
+                                    logger.info(msg)
+                                    return ActionResult(error=msg)
+
+                            # Get the result data
+                            result_data = result.get('result', {})
+
+                            # Check for wasThrown flag (backup error detection)
+                            if result_data.get('wasThrown'):
+                                error_msg = 'JavaScript execution failed (wasThrown=true)'
+
+                                # Add error feedback to message history for next iteration
+                                if iteration < MAX_ITERATIONS:
+                                    error_feedback = f"""The previous JavaScript code failed with error:
+{error_msg}
+
+Please fix the error and generate corrected JavaScript code:"""
+                                    message_history.append(UserMessage(content=error_feedback))
+                                    continue  # Try next iteration
+                                else:
+                                    # Final iteration, return error
+                                    msg = f'Requirement: {params.code_requirement}\n\nFinal Generated Code (Iteration {iteration}): {generated_js_code}\n\nError: {error_msg}'
+                                    logger.info(msg)
+                                    return ActionResult(error=msg)
+
+                            # Get the actual value
+                            value = result_data.get('value')
+
+                            # Handle different value types
+                            if value is None:
+                                # Could be legitimate null/undefined result
+                                result_text = str(value) if 'value' in result_data else 'undefined'
+                            elif isinstance(value, (dict, list)):
+                                # Complex objects - should be serialized by returnByValue
+                                try:
+                                    result_text = json.dumps(value, ensure_ascii=False)
+                                except (TypeError, ValueError):
+                                    # Fallback for non-serializable objects
+                                    result_text = str(value)
+                            else:
+                                # Primitive values (string, number, boolean)
+                                result_text = str(value)
+
+                            # Check if result is empty or meaningless
+                            if (not result_text or
+                                    result_text.strip() in ['', 'null', 'undefined', '[]', '{}'] or
+                                    len(result_text.strip()) == 0):
+
+                                # Add empty result feedback to message history for next iteration
+                                if iteration < MAX_ITERATIONS:
+                                    empty_feedback = f"""The previous JavaScript code executed successfully but returned empty/meaningless result:
+Result: {result_text}
+
+The result is empty or not useful. Please generate improved JavaScript code that returns meaningful data:"""
+                                    message_history.append(UserMessage(content=empty_feedback))
+                                    continue  # Try next iteration
+                                else:
+                                    # Final iteration, return empty result with warning
+                                    msg = f'Requirement: {params.code_requirement}\n\nFinal Generated Code (Iteration {iteration}): {generated_js_code}\n\nWarning: Empty or meaningless result: {result_text}'
+                                    logger.info(msg)
+                                    return ActionResult(
+                                        extracted_content=msg,
+                                        long_term_memory=f'Generated JavaScript code (iteration {iteration}) for requirement: {params.code_requirement} - Empty result warning',
+                                    )
+
+                            # Apply length limit with better truncation
+                            if len(result_text) > 30000:
+                                result_text = result_text[:30000] + '\n... [Truncated after 30000 characters]'
+
+                            # Success! Return the result
+                            msg = f'Requirement: {params.code_requirement}\n\nGenerated Code (Iteration {iteration}): \n```javascript\n{generated_js_code}\n```\nResult: {result_text}'
+                            logger.info(f'✅ Skill Code succeeded on iteration {iteration}')
+
+                            return ActionResult(
+                                extracted_content=msg,
+                                long_term_memory=f'Generated and executed JavaScript code (iteration {iteration}) for requirement: {params.code_requirement}',
+                            )
+
+                        except Exception as e:
+                            # CDP communication or other system errors
+                            error_msg = f'Failed to execute JavaScript: {type(e).__name__}: {e}'
+
+                            # Add system error feedback to message history for next iteration
+                            if iteration < MAX_ITERATIONS:
+                                system_error_feedback = f"""The previous JavaScript code failed to execute due to system error:
+{error_msg}
+
+Please generate alternative JavaScript code that avoids this system error:"""
+                                message_history.append(UserMessage(content=system_error_feedback))
+                                continue  # Try next iteration
+                            else:
+                                # Final iteration, return system error
+                                error_msg = f'Requirement: {params.code_requirement}\n\nFinal Generated Code (Iteration {iteration}): {generated_js_code}\n\nError: {error_msg}'
+                                logger.info(error_msg)
+                                return ActionResult(error=error_msg)
+
+                    except Exception as e:
+                        # LLM generation error
+                        logger.error(f'❌ LLM generation failed on iteration {iteration}: {e}')
+                        if iteration == MAX_ITERATIONS:
+                            return ActionResult(
+                                error=f'LLM generation failed after {MAX_ITERATIONS} iterations: {str(e)}')
+                        continue  # Try next iteration with same message history
+
+                # Should not reach here, but just in case
+                return ActionResult(error=f'Skill code failed after {MAX_ITERATIONS} iterations')
+
+            except Exception as e:
+                logger.error(f'❌ Skill Code failed: {e}')
+                return ActionResult(error=f'Skill code failed: {str(e)}')
+
+        @self.registry.action(
+            'Skill: Deep research mode - Only return the guideline for deep research. Please follow the guideline to do real deep research actions.',
+            param_model=NoParamsAction,
+        )
+        async def skill_deep_research(
+                _: NoParamsAction,
+        ):
+            """
+            Skill: Deep research mode activation
+            """
+            research_prompt = f"""
+            🔬 **DEEP RESEARCH GUIDELINE**
+
+            To proceed with comprehensive research, please:
+
+            1. **Set up a detailed TODO list** for this research project that includes:
+               - Background research and context gathering
+               - Key questions to investigate
+               - Multiple source verification
+               - Data collection and analysis steps
+               - Report generation with proper citations
+
+            2. **Conduct systematic research** following these principles:
+               - Use multiple search strategies and sources
+               - Verify information across different platforms
+               - Document all sources with URLs for citation
+               - Take notes and screenshots of key findings
+               - Organize findings by themes or categories
+
+            3. **Generate a comprehensive report** that includes:
+               - Executive summary
+               - Detailed findings with analysis
+               - Proper citations and source references
+               - Supporting evidence (screenshots, quotes)
+               - Conclusions and recommendations
+               - Areas for further investigation
+
+            4. **Maintain research traceability** by:
+               - Recording all search queries used
+               - Saving important URLs and sources
+               - Including direct quotes with attribution
+               - Documenting methodology and approach
+
+            This deep research mode ensures thorough, traceable, and well-documented investigation of your topic with proper academic rigor and source citation.
+            """
+
+            return ActionResult(
+                extracted_content=research_prompt,
+                include_extracted_content_only_once=True,
+            )
+
+        @self.registry.action(
+            'Skill: Get comprehensive financial data for stocks - retrieve company information, historical prices, news, earnings, dividends, analyst recommendations and other financial data using Yahoo Finance. Available methods include: get_info (company info), get_history (price history), get_news (latest news), get_dividends (dividend history), get_earnings (earnings data), get_recommendations (analyst recommendations), get_balance_sheet (balance sheet data), get_income_stmt (income statement), get_cashflow (cash flow statement), get_fast_info (quick stats), get_institutional_holders (institutional ownership), get_major_holders (major shareholders), get_sustainability (ESG data), get_upgrades_downgrades (analyst upgrades/downgrades), and more. If no methods specified, defaults to get_info.',
+            param_model=SkillFinanceAction,
+        )
+        async def skill_finance(
+                params: SkillFinanceAction,
+        ):
+            """
+            Skill: Get comprehensive financial data using Yahoo Finance
+
+            Available methods include:
+            - get_info: Company information including sector, industry, market cap, business summary
+            - get_history: Historical stock prices and volume data over time periods
+            - get_news: Latest news articles about the company
+            - get_dividends: Historical dividend payments and yield data
+            - get_earnings: Quarterly and annual earnings data and growth trends
+            - get_recommendations: Analyst recommendations, price targets, and ratings
+            - get_balance_sheet: Company balance sheet data (assets, liabilities, equity)
+            - get_income_stmt: Income statement data (revenue, expenses, profit)
+            - get_cashflow: Cash flow statement data (operating, investing, financing)
+            - get_fast_info: Quick statistics like current price, volume, market cap
+            - get_institutional_holders: Institutional ownership and holdings data
+            - get_major_holders: Major shareholders and insider ownership percentages
+            - get_sustainability: ESG (Environmental, Social, Governance) scores and data
+            - get_upgrades_downgrades: Recent analyst upgrades and downgrades
+            - get_splits: Historical stock splits and stock split dates
+            - get_actions: Corporate actions including dividends and splits
+            - get_sec_filings: Recent SEC filings and regulatory documents
+            - get_calendar: Upcoming earnings dates and events
+            - get_mutualfund_holders: Mutual fund ownership data
+            - get_insider_purchases: Recent insider buying activity
+            - get_insider_transactions: All insider trading transactions
+            - get_shares: Outstanding shares and float data
+            """
+            try:
+                # Default to get_info if no methods specified
+                methods = params.methods if params.methods else [FinanceMethod.GET_INFO]
+
+                # Convert string methods to FinanceMethod enum if needed
+                if methods and isinstance(methods[0], str):
+                    try:
+                        methods = [FinanceMethod(method) for method in methods]
+                    except ValueError as e:
+                        available_methods = [method.value for method in FinanceMethod]
+                        return ActionResult(
+                            error=f'Invalid method in {methods}. Available methods: {available_methods}'
+                        )
+
+                # Create data retriever with symbol
+                retriever = FinanceDataRetriever(params.symbol)
+
+                # Convert FinanceMethod enum values to strings for the retriever
+                method_strings = [method.value for method in methods]
+
+                # Retrieve financial data
+                financial_data = retriever.get_finance_data(
+                    methods=method_strings,
+                    period=getattr(params, 'period', '1y'),
+                    start_date=getattr(params, 'start_date', None),
+                    end_date=getattr(params, 'end_date', None),
+                    interval=getattr(params, 'interval', '1d'),
+                    num_news=getattr(params, 'num_news', 5)
+                )
+
+                # Format as markdown using the static method
+                markdown_content = FinanceMarkdownFormatter.format_finance_data(
+                    symbol=params.symbol,
+                    results=financial_data,
+                    methods=method_strings
+                )
+
+                method_names = [method.value for method in methods]
+                logger.info(f'💹 Comprehensive finance data retrieved for {params.symbol} with methods: {method_names}')
+
+                return ActionResult(
+                    extracted_content=markdown_content,
+                    include_extracted_content_only_once=True,
+                    long_term_memory=f'Retrieved comprehensive financial data for {params.symbol} using methods: {", ".join(method_names)}',
+                )
+
+            except Exception as e:
+                error_msg = f'❌ Failed to retrieve financial data for {params.symbol}: {str(e)}'
+                logger.error(error_msg)
+                return ActionResult(error=error_msg)
+
+
+    async def _perform_google_search(self, browser_session, query: str, llm: BaseChatModel):
+        """Helper method to perform Google search and extract top 5 results"""
+        try:
+            # Navigate to Google search
+            search_url = f'https://www.google.com/search?q={query}&udm=14'
+            await browser_session.navigate_to_url(search_url, new_tab=False)
+
+            # Wait a moment for page to load
+            await asyncio.sleep(1)
+
+            # Extract structured content
+            extraction_query = f"""
+            Extract the top 5 search results from this Google search page. For each result, provide:
+            - title: The clickable title/headline
+            - url: The website URL
+            - summary: A brief description of what this result contains
+
+            Return results as a JSON array: [{{"title": "...", "url": "...", "summary": "..."}}, ...]
+            """
+
+            results_text = await self._extract_structured_content(browser_session, extraction_query, llm)
+
+            # Try to parse JSON results
+            import json
+            try:
+                results = json.loads(results_text.strip())
+                if isinstance(results, list):
+                    return results[:5]  # Ensure max 5 results
+            except (json.JSONDecodeError, ValueError):
+                try:
+                    results = repair_json(results_text.strip())
+                    if isinstance(results, list):
+                        return results[:5]  # Ensure max 5 results
+                except Exception as e:
+                    logger.warning(f"Failed to parse JSON from search results: {results_text}")
+
+            # Fallback: return raw text as single result
+            current_url = await browser_session.get_current_page_url()
+            return [{
+                "title": f"Search results for: {query}",
+                "url": current_url,
+                "summary": results_text[:200] + "..." if len(results_text) > 200 else results_text
+            }]
+
+        except Exception as e:
+            logger.error(f"Google search failed for query '{query}': {e}")
+            return []
+
+    async def _extract_structured_content(self, browser_session, query: str, llm: BaseChatModel):
+        """Helper method to extract structured content from current page"""
+        MAX_CHAR_LIMIT = 30000
+
+        # Extract clean markdown using the existing method
+        try:
+            content, content_stats = await self.extract_clean_markdown(browser_session, extract_links=False)
+        except Exception as e:
+            raise RuntimeError(f'Could not extract clean markdown: {type(e).__name__}')
+
+        # Smart truncation with context preservation
+        if len(content) > MAX_CHAR_LIMIT:
+            # Try to truncate at a natural break point
+            truncate_at = MAX_CHAR_LIMIT
+            paragraph_break = content.rfind('\n\n', MAX_CHAR_LIMIT - 500, MAX_CHAR_LIMIT)
+            if paragraph_break > 0:
+                truncate_at = paragraph_break
+            else:
+                sentence_break = content.rfind('.', MAX_CHAR_LIMIT - 200, MAX_CHAR_LIMIT)
+                if sentence_break > 0:
+                    truncate_at = sentence_break + 1
+            content = content[:truncate_at]
+
+        system_prompt = """
+        You are an expert at extracting data from the markdown of a webpage.
+
+        <input>
+        You will be given a query and the markdown of a webpage that has been filtered to remove noise and advertising content.
+        </input>
+
+        <instructions>
+        - You are tasked to extract information from the webpage that is relevant to the query.
+        - You should ONLY use the information available in the webpage to answer the query. Do not make up information or provide guess from your own knowledge.
+        - If the information relevant to the query is not available in the page, your response should mention that.
+        - If the query asks for all items, products, etc., make sure to directly list all of them.
+        </instructions>
+
+        <output>
+        - Your output should present ALL the information relevant to the query in a concise way.
+        - Do not answer in conversational format - directly output the relevant information or that the information is unavailable.
+        </output>
+        """.strip()
+
+        prompt = f'<query>\n{query}\n</query>\n\n<webpage_content>\n{content}\n</webpage_content>'
+
+        try:
+            from browser_use.llm.messages import SystemMessage, UserMessage
+            response = await asyncio.wait_for(
+                llm.ainvoke([SystemMessage(content=system_prompt), UserMessage(content=prompt)]),
+                timeout=120.0,
+            )
+            return response.completion
+        except Exception as e:
+            logger.debug(f'Error extracting content: {e}')
+            raise RuntimeError(str(e))
+
+    async def extract_clean_markdown(
+            self, browser_session: BrowserSession, extract_links: bool = True
+    ) -> tuple[str, dict[str, Any]]:
+        """Extract clean markdown from the current page."""
+        import re
+
+        # Get HTML content from current page
+        cdp_session = await browser_session.get_or_create_cdp_session()
+        try:
+            body_id = await cdp_session.cdp_client.send.DOM.getDocument(session_id=cdp_session.session_id)
+            page_html_result = await cdp_session.cdp_client.send.DOM.getOuterHTML(
+                params={'backendNodeId': body_id['root']['backendNodeId']}, session_id=cdp_session.session_id
+            )
+            page_html = page_html_result['outerHTML']
+            current_url = await browser_session.get_current_page_url()
+        except Exception as e:
+            raise RuntimeError(f"Couldn't extract page content: {e}")
+
+        original_html_length = len(page_html)
+
+        # Use html2text for clean markdown conversion
+        import html2text
+
+        h = html2text.HTML2Text()
+        h.ignore_links = not extract_links
+        h.ignore_images = True
+        h.ignore_emphasis = False
+        h.body_width = 0  # Don't wrap lines
+        h.unicode_snob = True
+        h.skip_internal_links = True
+        content = h.handle(page_html)
+
+        initial_markdown_length = len(content)
+
+        # Minimal cleanup - html2text already does most of the work
+        content = re.sub(r'%[0-9A-Fa-f]{2}', '', content)  # Remove any remaining URL encoding
+
+        # Apply light preprocessing to clean up excessive whitespace
+        content, chars_filtered = self._preprocess_markdown_content(content)
+
+        final_filtered_length = len(content)
+
+        # Content statistics
+        stats = {
+            'url': current_url,
+            'original_html_chars': original_html_length,
+            'initial_markdown_chars': initial_markdown_length,
+            'filtered_chars_removed': chars_filtered,
+            'final_filtered_chars': final_filtered_length,
+        }
+
+        return content, stats
+
+    def _preprocess_markdown_content(self, content: str, max_newlines: int = 3) -> tuple[str, int]:
+        """Light preprocessing of html2text output - minimal cleanup since html2text is already clean."""
+        import re
+
+        original_length = len(content)
+
+        # Compress consecutive newlines (4+ newlines become max_newlines)
+        content = re.sub(r'\n{4,}', '\n' * max_newlines, content)
+
+        # Remove lines that are only whitespace or very short (likely artifacts)
+        lines = content.split('\n')
+        filtered_lines = []
+        for line in lines:
+            stripped = line.strip()
+            # Keep lines with substantial content (html2text output is already clean)
+            if len(stripped) > 2:
+                filtered_lines.append(line)
+
+        content = '\n'.join(filtered_lines)
+        content = content.strip()
+
+        chars_filtered = original_length - len(content)
+        return content, chars_filtered
+
     def _register_browser_use_agent(self):
         @self.registry.action(
             'Execute browser_use agent tasks. Supports both single task execution (list length=1) and '
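The extract_clean_markdown method above leans entirely on html2text for the HTML-to-markdown step; everything after it is just regex and line filtering. The relevant configuration in isolation, assuming the html2text PyPI package (output shown is approximate):

```python
# The html2text configuration used by extract_clean_markdown, in isolation
# (assumes the html2text PyPI package is installed).
import html2text

h = html2text.HTML2Text()
h.ignore_links = True        # extract_clean_markdown passes extract_links=False here
h.ignore_images = True
h.ignore_emphasis = False
h.body_width = 0             # don't hard-wrap lines
h.unicode_snob = True
h.skip_internal_links = True

markdown = h.handle('<h1>Title</h1><p>Some <em>body</em> text.</p>')
print(markdown)  # roughly: "# Title\n\nSome _body_ text."
```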
@@ -430,7 +1468,7 @@ class VibeSurfTools:
         async def write_file(
                 file_path: str,
                 content: str,
-                file_system: FileSystem,
+                file_system: CustomFileSystem,
                 append: bool = False,
                 trailing_newline: bool = True,
                 leading_newline: bool = False,
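For context on the data source behind skill_finance: FinanceDataRetriever and FinanceMarkdownFormatter are vibe_surf's own wrappers (their implementation is not part of this diff), but methods such as get_info, get_history, and get_news map onto the yfinance Ticker API imported at the top of the file. A rough sketch of the underlying calls, assuming the yfinance PyPI package and network access:

```python
# Rough sketch of the yfinance calls that get_info / get_history / get_news
# ultimately wrap (assumes the yfinance PyPI package and network access;
# FinanceDataRetriever itself is not shown in this diff).
import yfinance as yf

ticker = yf.Ticker("AAPL")  # symbol is illustrative
info = ticker.info          # company profile: sector, market cap, summary, ...
history = ticker.history(period="1y", interval="1d")  # OHLCV DataFrame
news = ticker.news[:5]      # recent news items, mirroring num_news=5

print(info.get("sector"), len(history), len(news))
```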