vibesurf 0.1.24__py3-none-any.whl → 0.1.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of vibesurf might be problematic.

vibe_surf/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.1.24'
-__version_tuple__ = version_tuple = (0, 1, 24)
+__version__ = version = '0.1.26'
+__version_tuple__ = version_tuple = (0, 1, 26)
 
 __commit_id__ = commit_id = None
vibe_surf/backend/api/task.py CHANGED
@@ -104,7 +104,7 @@ async def submit_task(
     logger.info("Using default empty MCP server configuration")
 
     # DEBUG: Log the type and content of mcp_server_config
-    logger.info(f"mcp_server_config type: {type(mcp_server_config)}, value: {mcp_server_config}")
+    logger.debug(f"mcp_server_config type: {type(mcp_server_config)}, value: {mcp_server_config}")
 
     # Create initial task record in database
     from ..database.queries import TaskQueries
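Demoting these config dumps from info to debug means they disappear under the usual INFO threshold instead of being printed on every task submission. A minimal stdlib sketch of the effect (plain `logging`, standing in for VibeSurf's own `get_logger`):

import logging

logging.basicConfig(level=logging.INFO)  # typical default threshold
logger = logging.getLogger("vibe_surf.backend.api.task")

logger.info("visible at INFO")
logger.debug("suppressed at INFO")

logging.getLogger().setLevel(logging.DEBUG)
logger.debug("visible once the level is lowered to DEBUG")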
vibe_surf/backend/database/queries.py CHANGED
@@ -486,13 +486,13 @@ class TaskQueries:
            return existing_task
        else:
            # DEBUG: Log the type and content of mcp_server_config before saving
-            logger.info(
+            logger.debug(
                f"Creating task with mcp_server_config type: {type(mcp_server_config)}, value: {mcp_server_config}")
 
            # Serialize mcp_server_config to JSON string if it's a dict
            if isinstance(mcp_server_config, dict):
                mcp_server_config_json = json.dumps(mcp_server_config)
-                logger.info(f"Converted dict to JSON string: {mcp_server_config_json}")
+                logger.debug(f"Converted dict to JSON string: {mcp_server_config_json}")
            else:
                mcp_server_config_json = mcp_server_config
vibe_surf/backend/utils/llm_factory.py CHANGED
@@ -58,7 +58,7 @@ def create_llm_from_profile(llm_profile) -> BaseChatModel:
        "deepseek": ["temperature"],
        "aws_bedrock": ["temperature"],
        "anthropic_bedrock": ["temperature"],
-        "openai_compatible": ["temperature"]
+        "openai_compatible": ["temperature", "max_tokens"]
    }
 
    # Build common parameters based on provider support
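The new entry lets openai_compatible profiles pass max_tokens through to the model. A table like this is typically consumed by filtering optional kwargs before constructing the chat model; a minimal sketch of that pattern (`build_kwargs` is a hypothetical helper, not VibeSurf's actual code):

provider_params = {
    "deepseek": ["temperature"],
    "aws_bedrock": ["temperature"],
    "anthropic_bedrock": ["temperature"],
    "openai_compatible": ["temperature", "max_tokens"],
}

def build_kwargs(provider: str, **requested) -> dict:
    # Keep only the options this provider is known to accept
    allowed = provider_params.get(provider, [])
    return {k: v for k, v in requested.items() if k in allowed and v is not None}

print(build_kwargs("openai_compatible", temperature=0.2, max_tokens=4096))
# {'temperature': 0.2, 'max_tokens': 4096}
print(build_kwargs("deepseek", temperature=0.2, max_tokens=4096))
# {'temperature': 0.2}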
vibe_surf/cli.py CHANGED
@@ -325,7 +325,7 @@ def start_backend(port: int) -> None:
        console.print("[yellow]📝 Press Ctrl+C to stop the server[/yellow]\n")
 
        # Run the server
-        uvicorn.run(app, host="127.0.0.1", port=port, log_level="info")
+        uvicorn.run(app, host="127.0.0.1", port=port, log_level="error")
 
    except KeyboardInterrupt:
        console.print("\n[yellow]🛑 Server stopped by user[/yellow]")
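Dropping the server log level from "info" to "error" silences uvicorn's startup banner and per-request access lines; only errors still reach the console. For reference, a standalone sketch with the same effect that also disables access logging explicitly (standard uvicorn options, shown outside VibeSurf's CLI):

import uvicorn
from fastapi import FastAPI

app = FastAPI()

if __name__ == "__main__":
    # Valid log levels: critical, error, warning, info, debug, trace
    uvicorn.run(app, host="127.0.0.1", port=8000, log_level="error", access_log=False)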
vibe_surf/llm/openai_compatible.py CHANGED
@@ -76,7 +76,7 @@ class ChatOpenAICompatible(ChatOpenAI):
    The class automatically detects the model type and applies appropriate fixes.
    """
 
-    max_completion_tokens: int | None = 16000
+    max_completion_tokens: int | None = 8192
 
    def _is_gemini_model(self) -> bool:
        """Check if the current model is a Gemini model."""
@@ -337,7 +337,6 @@ class ChatOpenAICompatible(ChatOpenAI):
        try:
            parsed = output_format.model_validate_json(output_content)
        except Exception as e:
-            pdb.set_trace()
            repair_content = repair_json(output_content)
            parsed = output_format.model_validate_json(repair_content)
vibe_surf/tools/finance_tools.py CHANGED
@@ -8,6 +8,8 @@ from typing import Dict, List, Any, Optional, Union
 from datetime import datetime, timedelta
 import yfinance as yf
 import pandas as pd
+from datetime import datetime
+
 from vibe_surf.logger import get_logger
 
 logger = get_logger(__name__)
@@ -445,33 +447,58 @@ class FinanceMarkdownFormatter:
            return "No news available.\n"
 
        markdown = f"**Total News Articles:** {len(news)}\n\n"
-        pdb.set_trace()
        for i, article in enumerate(news, 1):
            if isinstance(article, dict):
-                # Try different possible field names for title
-                title = (article.get('title') or
-                         article.get('headline') or
-                         article.get('summary') or
+                # Handle new yfinance news structure with nested 'content'
+                content = article.get('content', article)  # Fallback to article itself for backwards compatibility
+
+                # Extract title
+                title = (content.get('title') or
+                         content.get('headline') or
+                         content.get('summary') or
+                         article.get('title') or  # Fallback to old format
                         'No title available')
 
-                # Try different possible field names for link/URL
-                link = (article.get('link') or
-                        article.get('url') or
-                        article.get('guid') or '')
+                # Extract content type if available
+                content_type = content.get('contentType', '')
+                type_emoji = "🎥" if content_type == "VIDEO" else "📰"
 
-                # Try different possible field names for publisher
-                publisher = (article.get('publisher') or
-                             article.get('source') or
-                             article.get('author') or
-                             'Unknown')
+                # Extract link/URL - try new nested structure first
+                link = ''
+                if 'canonicalUrl' in content and isinstance(content['canonicalUrl'], dict):
+                    link = content['canonicalUrl'].get('url', '')
+                elif 'clickThroughUrl' in content and isinstance(content['clickThroughUrl'], dict):
+                    link = content['clickThroughUrl'].get('url', '')
+                else:
+                    # Fallback to old format
+                    link = (content.get('link') or
+                            content.get('url') or
+                            content.get('guid') or
+                            article.get('link') or '')
+
+                # Extract publisher - try new nested structure first
+                publisher = 'Unknown'
+                if 'provider' in content and isinstance(content['provider'], dict):
+                    publisher = content['provider'].get('displayName', 'Unknown')
+                else:
+                    # Fallback to old format
+                    publisher = (content.get('publisher') or
+                                 content.get('source') or
+                                 content.get('author') or
+                                 article.get('publisher') or
+                                 'Unknown')
 
-                # Try different possible field names for timestamp
-                publish_time = (article.get('providerPublishTime') or
-                                article.get('timestamp') or
-                                article.get('pubDate') or
-                                article.get('published') or '')
+                # Extract publication time
+                publish_time = (content.get('pubDate') or
+                                content.get('providerPublishTime') or
+                                content.get('timestamp') or
+                                content.get('published') or
+                                article.get('providerPublishTime') or '')
 
-                markdown += f"### {i}. {title}\n"
+                # Format the article
+                markdown += f"### {type_emoji} {i}. {title}\n"
+                if content_type:
+                    markdown += f"**Type:** {content_type}\n"
                markdown += f"**Publisher:** {publisher}\n"
 
                if publish_time:
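This rewrite tracks the schema change in recent yfinance releases, where each item of `Ticker.news` nests its payload under a `content` key instead of exposing flat fields. Roughly, an item now looks like the sketch below (illustrative values, not captured output; exact keys vary by release):

article = {
    "id": "uuid-like-string",
    "content": {
        "title": "Example headline",
        "contentType": "STORY",             # or "VIDEO"
        "summary": "Short teaser text",
        "pubDate": "2025-01-15T14:30:00Z",  # ISO 8601 string, not a Unix timestamp
        "provider": {"displayName": "Example Wire"},
        "canonicalUrl": {"url": "https://example.com/story"},
        "clickThroughUrl": {"url": "https://finance.yahoo.com/news/example"},
    },
}

content = article.get("content", article)  # the same fallback the formatter uses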
@@ -481,11 +508,16 @@ class FinanceMarkdownFormatter:
                            dt = datetime.fromtimestamp(publish_time)
                            markdown += f"**Published:** {dt.strftime('%Y-%m-%d %H:%M')}\n"
                        elif isinstance(publish_time, str):
-                            # Try to parse string timestamp
+                            # Try to parse ISO format first (new format)
                            try:
-                                publish_time_int = int(float(publish_time))
-                                dt = datetime.fromtimestamp(publish_time_int)
-                                markdown += f"**Published:** {dt.strftime('%Y-%m-%d %H:%M')}\n"
+                                if publish_time.endswith('Z'):
+                                    dt = datetime.fromisoformat(publish_time.replace('Z', '+00:00'))
+                                    markdown += f"**Published:** {dt.strftime('%Y-%m-%d %H:%M UTC')}\n"
+                                else:
+                                    # Try to parse as Unix timestamp
+                                    publish_time_int = int(float(publish_time))
+                                    dt = datetime.fromtimestamp(publish_time_int)
+                                    markdown += f"**Published:** {dt.strftime('%Y-%m-%d %H:%M')}\n"
                            except:
                                markdown += f"**Published:** {publish_time}\n"
                    except Exception as e:
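The `replace('Z', '+00:00')` shim exists because `datetime.fromisoformat` only accepts a trailing `Z` starting with Python 3.11; on 3.10 and earlier it raises `ValueError`. A self-contained check:

from datetime import datetime

ts = "2025-01-15T14:30:00Z"

# Portable across Python versions (3.11+ would also parse ts directly)
dt = datetime.fromisoformat(ts.replace("Z", "+00:00"))
print(dt.strftime("%Y-%m-%d %H:%M UTC"))  # 2025-01-15 14:30 UTC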
@@ -496,14 +528,25 @@ class FinanceMarkdownFormatter:
                    markdown += f"**Link:** {link}\n"
 
                # Add summary or description if available
-                summary = (article.get('summary') or
-                           article.get('description') or
-                           article.get('snippet') or '')
+                summary = (content.get('summary') or
+                           content.get('description') or
+                           content.get('snippet') or
+                           article.get('summary') or '')
                if summary and summary != title:
+                    # Clean HTML tags from description if present
+                    import re
+                    clean_summary = re.sub(r'<[^>]+>', '', summary)
+                    clean_summary = re.sub(r'\s+', ' ', clean_summary).strip()
+
                    # Limit summary length
-                    if len(summary) > 200:
-                        summary = summary[:200] + "..."
-                    markdown += f"**Summary:** {summary}\n"
+                    if len(clean_summary) > 300:
+                        clean_summary = clean_summary[:300] + "..."
+                    markdown += f"**Summary:** {clean_summary}\n"
+
+                # Add metadata if available
+                if 'metadata' in content and isinstance(content['metadata'], dict):
+                    if content['metadata'].get('editorsPick'):
+                        markdown += f"**Editor's Pick:** ✅\n"
 
                markdown += "\n"
 
@@ -514,10 +557,10 @@ class FinanceMarkdownFormatter:
        """Format dividend data as markdown"""
        if dividends.empty:
            return "No dividend data available.\n"
-
+
        markdown = f"**Total Dividends Recorded:** {len(dividends)}\n"
        markdown += f"**Date Range:** {dividends.index.min().strftime('%Y-%m-%d')} to {dividends.index.max().strftime('%Y-%m-%d')}\n\n"
-
+
        # Recent dividends (last 10)
        recent_dividends = dividends.tail(10)
        markdown += "### 💰 Recent Dividends\n\n"
vibe_surf/tools/vibesurf_tools.py CHANGED
@@ -196,6 +196,7 @@ class VibeSurfTools:
            raise RuntimeError("LLM is required for skill_search")
 
        # Step 1: Use LLM to analyze user intent and generate different search tasks
+        query_num = 6
        from datetime import datetime
        analysis_prompt = f"""
 Analyze the user query and generate 5 different Google search strategies to comprehensively find relevant information.
@@ -204,13 +205,13 @@ Current Time: {datetime.now().isoformat()}
 
 User Query: "{params.query}"
 
-Generate 5 different search queries that approach this topic from different angles. Each search should be:
+Generate {query_num} different search queries that approach this topic from different angles. Each search should be:
 1. Specific and concrete (good for Google search)
 2. Different from the others (different perspectives/aspects)
 3. Likely to return valuable, unique information
 
-Return your response as a JSON array of 5 search query strings.
-Example format: ["query 1", "query 2", "query 3", "query 4", "query 5"]
+Return your response as a JSON array of {query_num} search query strings.
+Example format: ["query 1", "query 2", "query 3", "query 4", "query 5", "query 6"]
 """
 
        from browser_use.llm.messages import SystemMessage, UserMessage
@@ -225,12 +226,14 @@ Example format: ["query 1", "query 2", "query 3", "query 4", "query 5"]
            search_queries = json.loads(response.completion.strip())
            if not isinstance(search_queries, list):
                raise ValueError("Invalid search queries format")
-            search_queries = search_queries[:5]
+            search_queries = search_queries[:query_num]
        except (json.JSONDecodeError, ValueError):
            # Fallback to simple queries if parsing fails
            try:
                from json_repair import repair_json
-                search_queries = repair_json(response.completion.strip())
+                search_queries_s = repair_json(response.completion.strip())
+                search_queries = json.loads(search_queries_s)
+                search_queries = search_queries[:query_num]
            except Exception as e:
                search_queries = [
                    params.query,
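The two added lines fix a real bug: `repair_json` returns the repaired JSON as a string by default, so the old code assigned a `str` to `search_queries` where a list was expected. A minimal demonstration of the library's behavior:

import json
from json_repair import repair_json

broken = '["query 1", "query 2", "query 3"'  # e.g. truncated LLM output

repaired = repair_json(broken)   # '["query 1", "query 2", "query 3"]' (still a str)
queries = json.loads(repaired)   # now an actual list
assert isinstance(queries, list)

# json_repair can also parse in one step:
queries = repair_json(broken, return_objects=True)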
@@ -243,7 +246,7 @@ Example format: ["query 1", "query 2", "query 3", "query 4", "query 5"]
        # Step 2: Create browser sessions for parallel searching
        register_sessions = []
 
-        for i, query in enumerate(search_queries):
+        for i, query in enumerate(search_queries[:query_num]):
            agent_id = f"search_agent_{i + 1:03d}"
            register_sessions.append(
                browser_manager.register_agent(agent_id, target_id=None)
@@ -258,7 +261,6 @@ Example format: ["query 1", "query 2", "query 3", "query 4", "query 5"]
            search_tasks.append(self._perform_google_search(browser_session, query, llm))
 
        search_results = await asyncio.gather(*search_tasks, return_exceptions=True)
-
        # Step 4: Aggregate and filter results
        all_results = []
        for i, result in enumerate(search_results):
@@ -268,38 +270,74 @@ Example format: ["query 1", "query 2", "query 3", "query 4", "query 5"]
            if result:
                all_results.extend(result)
 
-        # Step 5: Use LLM to deduplicate and rank top 10 results
-        if all_results:
+        # Step 4.5: Rule-based deduplication to reduce LLM processing load
+        # if all_results:
+        #     deduplicated_results = self._rule_based_deduplication(all_results)
+        #     logger.info(f"Rule-based deduplication: {len(all_results)} -> {len(deduplicated_results)} results")
+        # else:
+        #     deduplicated_results = []
+
+        # Step 5: Use LLM only for final ranking and selection (much smaller dataset now)
+        if all_results and len(all_results) > 10:
+            # Only use LLM if we have more than 10 results to rank
+            # Create indexed results for LLM prompt
+            indexed_results = []
+            for i, result in enumerate(all_results):
+                indexed_results.append({
+                    "index": i,
+                    "title": result.get('title', 'Unknown Title'),
+                    "url": result.get('url', 'No URL'),
+                    "summary": result.get('summary', 'No summary available')
+                })
+
            ranking_prompt = f"""
-Given these search results for the query "{params.query}", please:
-1. Remove duplicates (same or very similar content)
-2. Rank by relevance and value to the user
-3. Select the TOP 10 most relevant and valuable results
+Rank these search results for the query "{params.query}" by relevance and value.
+Select the TOP 10 most relevant and valuable results.
 
-Search Results:
-{json.dumps(all_results, indent=2)}
+Search Results ({len(indexed_results)} total):
+{json.dumps(indexed_results, indent=2, ensure_ascii=False)}
 
-Return the top 10 results as a JSON array, with each result containing:
-- title: string
-- url: string
-- summary: string (brief description of why this result is valuable)
+Return ONLY the indices of the top 10 results as a JSON array of numbers.
+For example: [0, 5, 2, 8, 1, 9, 3, 7, 4, 6]
 
-Format: [{{"title": "...", "url": "...", "summary": "..."}}, ...]
+Format: [index1, index2, index3, ...]
 """
 
            ranking_response = await llm.ainvoke([
                SystemMessage(
-                    content="You are an expert at evaluating and ranking search results for relevance and value."),
+                    content="You are an expert at ranking search results for relevance and value. Return only the indices of the top results."),
                UserMessage(content=ranking_prompt)
            ])
 
            try:
-                top_results = json.loads(ranking_response.completion.strip())
-                if not isinstance(top_results, list):
+                selected_indices = json.loads(ranking_response.completion.strip())
+                if not isinstance(selected_indices, list):
                    raise ValueError("Invalid ranking results format")
+                # Ensure indices are valid and limit to 10
+                valid_indices = [i for i in selected_indices if isinstance(i, int) and 0 <= i < len(all_results)][:10]
+                if valid_indices:
+                    top_results = [all_results[i] for i in valid_indices]
+                else:
+                    top_results = all_results[:10]
            except (json.JSONDecodeError, ValueError):
-                # Fallback to first 10 results if ranking fails
-                top_results = all_results[:10]
+                try:
+                    selected_indices_s = repair_json(ranking_response.completion.strip())
+                    selected_indices = json.loads(selected_indices_s)
+                    if isinstance(selected_indices, list):
+                        valid_indices = [i for i in selected_indices if isinstance(i, int) and 0 <= i < len(all_results)][:10]
+                        if valid_indices:
+                            top_results = [all_results[i] for i in valid_indices]
+                        else:
+                            top_results = all_results[:10]
+                    else:
+                        top_results = all_results[:10]
+                except Exception:
+                    # Fallback to first 10 results
+                    top_results = all_results[:10]
+        elif all_results:
+            # If we have 10 or fewer results, skip LLM ranking
+            top_results = all_results[:10]
+            logger.info(f"Skipping LLM ranking for {len(all_results)} results (≤10)")
        else:
            top_results = []
@@ -694,7 +732,7 @@ Please fix the error and generate corrected JavaScript code:"""
                elif isinstance(value, (dict, list)):
                    # Complex objects - should be serialized by returnByValue
                    try:
-                        result_text = json.dumps(value, ensure_ascii=False)
+                        result_text = json.dumps(value, ensure_ascii=False, indent=2)
                    except (TypeError, ValueError):
                        # Fallback for non-serializable objects
                        result_text = str(value)
@@ -729,7 +767,7 @@ The result is empty or not useful. Please generate improved JavaScript code that
                    result_text = result_text[:30000] + '\n... [Truncated after 30000 characters]'
 
                # Success! Return the result
-                msg = f'Requirement: {params.code_requirement}\n\nGenerated Code (Iteration {iteration}): \n```javascript\n{generated_js_code}\n```\nResult: {result_text}'
+                msg = f'Generated Code (Iteration {iteration}): \n```javascript\n{generated_js_code}\n```\nResult:\n```json\n {result_text}\n```\n'
                logger.info(f'✅ Skill Code succeeded on iteration {iteration}')
 
                return ActionResult(
@@ -907,19 +945,164 @@ Please generate alternative JavaScript code that avoids this system error:"""
            return ActionResult(error=error_msg)
 
 
+    async def _extract_google_results_rule_based(self, browser_session):
+        """Rule-based extraction of Google search results using JavaScript"""
+        try:
+            cdp_session = await browser_session.get_or_create_cdp_session()
+
+            # JavaScript code to extract Google search results using DOM selectors
+            js_extraction_code = """
+            (function() {
+                try {
+                    const results = [];
+
+                    // Multiple selector strategies for different Google layouts
+                    const selectors = [
+                        'div[data-sokoban-container] div[data-sokoban-feature]', // Standard results
+                        'div.g:not(.g-blk)', // Classic results container
+                        '.tF2Cxc', // Modern result container
+                        'div[data-ved] h3', // Result titles
+                    ];
+
+                    let resultElements = [];
+
+                    // Try each selector until we find results
+                    for (const selector of selectors) {
+                        const elements = document.querySelectorAll(selector);
+                        if (elements.length > 0) {
+                            resultElements = Array.from(elements).slice(0, 10); // Get up to 10 results
+                            break;
+                        }
+                    }
+
+                    // If no results found with specific selectors, try broader search
+                    if (resultElements.length === 0) {
+                        // Look for any divs containing h3 elements (likely search results)
+                        const h3Elements = document.querySelectorAll('h3');
+                        resultElements = Array.from(h3Elements)
+                            .map(h3 => h3.closest('div'))
+                            .filter(div => div && div.querySelector('a[href]'))
+                            .slice(0, 10);
+                    }
+
+                    for (let i = 0; i < Math.min(resultElements.length, 10); i++) {
+                        const element = resultElements[i];
+
+                        // Extract title
+                        let title = '';
+                        const titleSelectors = ['h3', '[role="heading"]', 'a > span', '.LC20lb'];
+                        for (const sel of titleSelectors) {
+                            const titleEl = element.querySelector(sel);
+                            if (titleEl && titleEl.textContent.trim()) {
+                                title = titleEl.textContent.trim();
+                                break;
+                            }
+                        }
+
+                        // Extract URL
+                        let url = '';
+                        const linkSelectors = ['a[href^="http"]', 'a[href^="/url?q="]', 'a[href]'];
+                        for (const sel of linkSelectors) {
+                            const linkEl = element.querySelector(sel);
+                            if (linkEl && linkEl.href) {
+                                url = linkEl.href;
+                                // Clean Google redirect URLs
+                                if (url.includes('/url?q=')) {
+                                    const urlMatch = url.match(/[?&]q=([^&]*)/);
+                                    if (urlMatch) {
+                                        url = decodeURIComponent(urlMatch[1]);
+                                    }
+                                }
+                                break;
+                            }
+                        }
+
+                        // Extract summary/description
+                        let summary = '';
+                        const summarySelectors = [
+                            '.VwiC3b', // Description text
+                            '.yXK7lf', // Snippet text
+                            '[data-content-feature="1"] span',
+                            '.s', // Classic description
+                            'span:not(:has(a))'
+                        ];
+                        for (const sel of summarySelectors) {
+                            const summaryEl = element.querySelector(sel);
+                            if (summaryEl && summaryEl.textContent.trim() && summaryEl.textContent.length > 10) {
+                                summary = summaryEl.textContent.trim();
+                                break;
+                            }
+                        }
+
+                        // Only add if we have at least title or URL
+                        if (title || url) {
+                            results.push({
+                                title: title || 'No title',
+                                url: url || 'No URL',
+                                summary: summary || 'No description available'
+                            });
+                        }
+                    }
+
+                    return JSON.stringify(results);
+
+                } catch (e) {
+                    return JSON.stringify([{
+                        title: 'Error extracting results',
+                        url: window.location.href,
+                        summary: 'JavaScript extraction failed: ' + e.message
+                    }]);
+                }
+            })()
+            """
+
+            # Execute JavaScript to extract results
+            result = await cdp_session.cdp_client.send.Runtime.evaluate(
+                params={'expression': js_extraction_code, 'returnByValue': True, 'awaitPromise': True},
+                session_id=cdp_session.session_id,
+            )
+
+            if result.get('exceptionDetails'):
+                logger.warning(f"JavaScript extraction failed: {result['exceptionDetails']}")
+                return []
+
+            result_data = result.get('result', {})
+            value = result_data.get('value', '[]')
+
+            try:
+                extracted_results = json.loads(value)
+                return extracted_results if isinstance(extracted_results, list) else []
+            except (json.JSONDecodeError, ValueError):
+                logger.warning(f"Failed to parse extraction results: {value}")
+                return []
+
+        except Exception as e:
+            logger.error(f"Rule-based extraction failed: {e}")
+            return []
+
    async def _perform_google_search(self, browser_session, query: str, llm: BaseChatModel):
-        """Helper method to perform Google search and extract top 5 results"""
+        """Helper method to perform Google search and extract top 5 results using rule-based extraction"""
        try:
            # Navigate to Google search
            search_url = f'https://www.google.com/search?q={query}&udm=14'
            await browser_session.navigate_to_url(search_url, new_tab=False)
 
            # Wait a moment for page to load
-            await asyncio.sleep(1)
-
-            # Extract structured content
+            await asyncio.sleep(2)
+
+            # Use rule-based extraction first (much faster than LLM)
+            search_ret_len = 10
+            results = await self._extract_google_results_rule_based(browser_session)
+            if results and len(results) > 0:
+                # Rule-based extraction succeeded
+                logger.debug(f"Rule-based extraction found {len(results)} results for query: {query}")
+                return results[:search_ret_len]  # Return top 6 results
+
+            # Fallback to LLM extraction if rule-based fails
+            logger.warning(f"Rule-based extraction failed for query '{query}', falling back to LLM")
+
            extraction_query = f"""
-Extract the top 5 search results from this Google search page. For each result, provide:
+Extract the top {search_ret_len} search results from this Google search page. For each result, provide:
 - title: The clickable title/headline
 - url: The website URL
 - summary: A brief description of what this result contains
@@ -930,18 +1113,17 @@ Return results as a JSON array: [{{"title": "...", "url": "...", "summary": "...
            results_text = await self._extract_structured_content(browser_session, extraction_query, llm)
 
            # Try to parse JSON results
-            import json
            try:
                results = json.loads(results_text.strip())
                if isinstance(results, list):
-                    return results[:5]  # Ensure max 5 results
+                    return results[:search_ret_len]  # Ensure max 5 results
            except (json.JSONDecodeError, ValueError):
                try:
                    results = repair_json(results_text.strip())
                    if isinstance(results, list):
-                        return results[:5]  # Ensure max 5 results
+                        return results[:search_ret_len]  # Ensure max 5 results
                except Exception as e:
-                    logger.warning(f"Failed to parse JSON from search results: {results_text}")
+                    logger.warning(f"Failed to parse JSON from LLM search results: {results_text}")
 
            # Fallback: return raw text as single result
            current_url = await browser_session.get_current_page_url()
@@ -955,6 +1137,74 @@ Return results as a JSON array: [{{"title": "...", "url": "...", "summary": "...
            logger.error(f"Google search failed for query '{query}': {e}")
            return []
 
+    def _rule_based_deduplication(self, results):
+        """Rule-based deduplication to reduce dataset before LLM processing"""
+        if not results:
+            return []
+
+        deduplicated = []
+        seen_urls = set()
+        seen_titles = set()
+
+        for result in results:
+            url = result.get('url', '').strip()
+            title = result.get('title', '').strip().lower()
+
+            # Skip results with missing essential data
+            if not url or not title or url == 'No URL' or title == 'no title':
+                continue
+
+            # Normalize URL for comparison (remove fragments, query params for deduplication)
+            normalized_url = url.split('#')[0].split('?')[0].lower()
+
+            # Check for duplicate URLs
+            if normalized_url in seen_urls:
+                continue
+
+            # Check for very similar titles (basic similarity)
+            title_normalized = ''.join(c for c in title if c.isalnum()).lower()
+            if len(title_normalized) > 10:  # Only check titles with substantial content
+                similar_found = False
+                for seen_title in seen_titles:
+                    # Simple similarity check: if 80% of characters match
+                    if len(title_normalized) > 0 and len(seen_title) > 0:
+                        common_chars = sum(1 for c in title_normalized if c in seen_title)
+                        similarity = common_chars / max(len(title_normalized), len(seen_title))
+                        if similarity > 0.8:
+                            similar_found = True
+                            break
+
+                if similar_found:
+                    continue
+
+            # Add to deduplicated results
+            seen_urls.add(normalized_url)
+            seen_titles.add(title_normalized)
+            deduplicated.append(result)
+
+        # Sort by relevance indicators (prioritize results with longer summaries, non-generic titles)
+        def relevance_score(result):
+            score = 0
+            title = result.get('title', '')
+            summary = result.get('summary', '')
+
+            # Longer summaries are typically more informative
+            score += min(len(summary), 200) / 10
+
+            # Non-generic titles score higher
+            generic_terms = ['search results', 'no title', 'error', 'loading']
+            if not any(term in title.lower() for term in generic_terms):
+                score += 10
+
+            # Prefer results with actual descriptions
+            if summary and summary != 'No description available' and len(summary) > 20:
+                score += 5
+
+            return score
+
+        deduplicated.sort(key=relevance_score, reverse=True)
+        return deduplicated
+
    async def _extract_structured_content(self, browser_session, query: str, llm: BaseChatModel):
        """Helper method to extract structured content from current page"""
        MAX_CHAR_LIMIT = 30000
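Note that `_rule_based_deduplication` lands in this release with its only call site shown in this diff (the "Step 4.5" block above) still commented out. Its URL rule treats pages as duplicates once query strings and fragments are stripped; a quick standalone check of that normalization (synthetic data):

results = [
    {"title": "Rust Book", "url": "https://doc.rust-lang.org/book/?utm=x", "summary": "Official guide"},
    {"title": "Rust Book", "url": "https://doc.rust-lang.org/book/#intro", "summary": "Official guide"},
]

# Both entries normalize to the same key, so the second would be skipped
keys = {r["url"].split('#')[0].split('?')[0].lower() for r in results}
assert keys == {"https://doc.rust-lang.org/book/"}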
vibesurf-0.1.24.dist-info/METADATA → vibesurf-0.1.26.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vibesurf
-Version: 0.1.24
+Version: 0.1.26
 Summary: VibeSurf: A powerful browser assistant for vibe surfing
 Author: Shao Warm
 License: Apache-2.0
vibesurf-0.1.24.dist-info/RECORD → vibesurf-0.1.26.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
 vibe_surf/__init__.py,sha256=WtduuMFGauMD_9dpk4fnRnLTAP6ka9Lfu0feAFNzLfo,339
-vibe_surf/_version.py,sha256=IV4a2R7tlzuACf6FAyPEbprLKNroeE-n_UPSKi1QJSc,706
-vibe_surf/cli.py,sha256=pbep2dBeQqralZ8AggkH4h2nayBarbdN8lhZxo35gNU,16689
+vibe_surf/_version.py,sha256=Y9o7KiJWiG6n9XbSpMICgNgajFRbL4an-gN1BQc-jwM,706
+vibe_surf/cli.py,sha256=KAmUBsXfS-NkMp3ITxzNXwtFeKVmXJUDZiWqLcIC0BI,16690
 vibe_surf/common.py,sha256=_WWMxen5wFwzUjEShn3yDVC1OBFUiJ6Vccadi6tuG6w,1215
 vibe_surf/logger.py,sha256=k53MFA96QX6t9OfcOf1Zws8PP0OOqjVJfhUD3Do9lKw,3043
 vibe_surf/agents/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -23,12 +23,12 @@ vibe_surf/backend/api/browser.py,sha256=NXedyZG3NIVRIx5O7d9mHwVWX-Q4_KsX5mSgfKt8
 vibe_surf/backend/api/config.py,sha256=vKY6ZnKZeazQP9qqUEiQvP9HoPtJbAzETORuPWZomGw,27272
 vibe_surf/backend/api/files.py,sha256=kJMG9MWECKXwGh64Q6xvAzNjeZGcLhIEnn65HiMZHKE,11762
 vibe_surf/backend/api/models.py,sha256=n_bu8vavvO8bIKA1WUAbaGPFeZKeamMJelDWU3DlFJc,10533
-vibe_surf/backend/api/task.py,sha256=vpQMOn6YBuD_16jzfUajUvBYaydC0jj8Ny3WOJDVuck,14359
+vibe_surf/backend/api/task.py,sha256=CYx8FNN04XM8cH9BjAuMb-E7GUTxi4pl0OsT_1KnDBQ,14360
 vibe_surf/backend/api/voices.py,sha256=YfPCqnR7EAYh2nfMRIpB0xEo6_giTtxrcSeobU3HQHg,17098
 vibe_surf/backend/database/__init__.py,sha256=XhmcscnhgMhUyXML7m4SnuQIqkFpyY_zJ0D3yYa2RqQ,239
 vibe_surf/backend/database/manager.py,sha256=Okmr6yG2aycmatONRMyRbHe6l53RkFIPeMxxPSD3ycY,11884
 vibe_surf/backend/database/models.py,sha256=Z5_RqGyD4ER5bsrYjc2iso9yPo7zfAqxNeVDGtZqotw,8887
-vibe_surf/backend/database/queries.py,sha256=0-RKjbHY3G5Y5_QrTtvl-nHs0KPlygmwm0ZOdbsvINY,41155
+vibe_surf/backend/database/queries.py,sha256=6SsAxTr-ocQ189xQ5m0L3BsgUdkGtmt2TcrXP-JIbrw,41157
 vibe_surf/backend/database/schemas.py,sha256=OPnpRKwYG1Cu8geJ6pajiEDF8x8mRestXnAfI4Gy18w,3402
 vibe_surf/backend/database/migrations/v001_initial_schema.sql,sha256=MC2fa1WHUEhHhdOTxz0qB4RI7JdGRpiGXZ77ytl3LRQ,4345
 vibe_surf/backend/database/migrations/v002_add_agent_mode.sql,sha256=jKnW28HsphUeU9kudEx9QaLnUh8swmmOt-hFsZJay24,251
@@ -36,7 +36,7 @@ vibe_surf/backend/database/migrations/v003_fix_task_status_case.sql,sha256=npzRg
 vibe_surf/backend/database/migrations/v004_add_voice_profiles.sql,sha256=-9arjQBF-OxvFIOwkEl7JJJRDTS_nJ8GNX3T7bJgVq0,1321
 vibe_surf/backend/utils/__init__.py,sha256=V8leMFp7apAglUAoCHPZrNNcRHthSLYIudIJE5qwjb0,184
 vibe_surf/backend/utils/encryption.py,sha256=CjLNh_n0Luhfa-6BB-icfzkiiDqj5b4Gu6MADU3p2eM,3754
-vibe_surf/backend/utils/llm_factory.py,sha256=KF84YYgPaOF0_1P_IF0cAtY1kua0D-8gEP2NoSu2UZM,9033
+vibe_surf/backend/utils/llm_factory.py,sha256=XIJYc9Lh_L2vbwlAe96PrjptlzJtLOjCGNdHEx6fThk,9047
 vibe_surf/browser/__init__.py,sha256=_UToO2fZfSCrfjOcxhn4Qq7ZLbYeyPuUUEmqIva-Yv8,325
 vibe_surf/browser/agen_browser_profile.py,sha256=J06hCBJSJ-zAFVM9yDFz8UpmiLuFyWke1EMekpU45eo,5871
 vibe_surf/browser/agent_browser_session.py,sha256=xV0nHo_TCb7b7QYhIee4cLzH-1rqJswYwH7GEwyQmqc,33980
@@ -85,20 +85,20 @@ vibe_surf/chrome_extension/styles/settings-responsive.css,sha256=jLE0yG15n2aI6_6
 vibe_surf/chrome_extension/styles/settings-utilities.css,sha256=3PuQS2857kg83d5erLbLdo_7J95-qV-qyNWS5M-w1oQ,505
 vibe_surf/chrome_extension/styles/variables.css,sha256=enjyhsa0PeU3b-3uiXa-VkV-1-h2-Ai3m4KpmC2k0rY,2984
 vibe_surf/llm/__init__.py,sha256=_vDVPo6STf343p1SgMQrF5023hicAx0g83pK2Gbk4Ek,601
-vibe_surf/llm/openai_compatible.py,sha256=7e0XC-Mtz8MmgQZHH8tx8H_VXB6MLvMhDy1qKbESmVo,16149
+vibe_surf/llm/openai_compatible.py,sha256=i0a5OLaL6QIlacVyctOG09vKr3KOi8T8Izp1v7xkD5I,16112
 vibe_surf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vibe_surf/tools/browser_use_tools.py,sha256=tacxKUJL6uOt04f52_iIw1cs-FT-mBgIPmAsIc4Hww0,23730
 vibe_surf/tools/file_system.py,sha256=Tw_6J5QjCahQ3fd26CXziF1zPvRxhYM0889oK4bDhlU,19304
-vibe_surf/tools/finance_tools.py,sha256=pwPSBb0HwCDTdKZNAS5NPE8-rM1Nz57foj9XyKgQmI4,24803
+vibe_surf/tools/finance_tools.py,sha256=E8rmblp57e_cp0tFbdZ7BY3_upNlk4Whk0bYc_SFCJE,27284
 vibe_surf/tools/mcp_client.py,sha256=OeCoTgyx4MoY7JxXndK6pGHIoyFOhf5r7XCbx25y1Ec,2446
 vibe_surf/tools/report_writer_tools.py,sha256=2CyTTXOahTKZo7XwyWDDhJ--1mRA0uTtUWxu_DACAY0,776
 vibe_surf/tools/vibesurf_registry.py,sha256=Z-8d9BrJl3RFMEK0Tw1Q5xNHX2kZGsnIGCTBZ3RM-pw,2159
-vibe_surf/tools/vibesurf_tools.py,sha256=KMf9J_GDo9MbjBruv6-aHi5srR2pvlvW3uegihAMRIc,79994
+vibe_surf/tools/vibesurf_tools.py,sha256=O8y1noWyY8y-j8I7vF4oOaVDybINNXiNXWNwGJJ5xsM,91500
 vibe_surf/tools/views.py,sha256=AEAPzML-lqWJ7dBMjXTl7o-rk4hp5PGaPRqLyilJUl8,7789
 vibe_surf/tools/voice_asr.py,sha256=AJG0yq_Jq-j8ulDlbPhVFfK1jch9_ASesis73iki9II,4702
-vibesurf-0.1.24.dist-info/licenses/LICENSE,sha256=czn6QYya0-jhLnStD9JqnMS-hwP5wRByipkrGTvoXLI,11355
-vibesurf-0.1.24.dist-info/METADATA,sha256=Ck-enMQ77f9ekeLQG9xzNGX3mOuDhqIXiXdA3_Zcq4I,5190
-vibesurf-0.1.24.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-vibesurf-0.1.24.dist-info/entry_points.txt,sha256=UxqpvMocL-PR33S6vLF2OmXn-kVzM-DneMeZeHcPMM8,48
-vibesurf-0.1.24.dist-info/top_level.txt,sha256=VPZGHqSb6EEqcJ4ZX6bHIuWfon5f6HXl3c7BYpbRqnY,10
-vibesurf-0.1.24.dist-info/RECORD,,
+vibesurf-0.1.26.dist-info/licenses/LICENSE,sha256=czn6QYya0-jhLnStD9JqnMS-hwP5wRByipkrGTvoXLI,11355
+vibesurf-0.1.26.dist-info/METADATA,sha256=8Bdh3-15Hl0KbmB0ghk9KNQiveDnVGnhzZ-4Dv0MVjc,5190
+vibesurf-0.1.26.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+vibesurf-0.1.26.dist-info/entry_points.txt,sha256=UxqpvMocL-PR33S6vLF2OmXn-kVzM-DneMeZeHcPMM8,48
+vibesurf-0.1.26.dist-info/top_level.txt,sha256=VPZGHqSb6EEqcJ4ZX6bHIuWfon5f6HXl3c7BYpbRqnY,10
+vibesurf-0.1.26.dist-info/RECORD,,