vibesurf 0.1.24__py3-none-any.whl → 0.1.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of vibesurf might be problematic.
- vibe_surf/_version.py +2 -2
- vibe_surf/backend/api/task.py +1 -1
- vibe_surf/backend/database/queries.py +2 -2
- vibe_surf/backend/utils/llm_factory.py +1 -1
- vibe_surf/cli.py +1 -1
- vibe_surf/llm/openai_compatible.py +1 -2
- vibe_surf/tools/finance_tools.py +75 -32
- vibe_surf/tools/vibesurf_tools.py +286 -36
- {vibesurf-0.1.24.dist-info → vibesurf-0.1.26.dist-info}/METADATA +1 -1
- {vibesurf-0.1.24.dist-info → vibesurf-0.1.26.dist-info}/RECORD +14 -14
- {vibesurf-0.1.24.dist-info → vibesurf-0.1.26.dist-info}/WHEEL +0 -0
- {vibesurf-0.1.24.dist-info → vibesurf-0.1.26.dist-info}/entry_points.txt +0 -0
- {vibesurf-0.1.24.dist-info → vibesurf-0.1.26.dist-info}/licenses/LICENSE +0 -0
- {vibesurf-0.1.24.dist-info → vibesurf-0.1.26.dist-info}/top_level.txt +0 -0
vibe_surf/_version.py
CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.1.24'
-__version_tuple__ = version_tuple = (0, 1, 24)
+__version__ = version = '0.1.26'
+__version_tuple__ = version_tuple = (0, 1, 26)
 
 __commit_id__ = commit_id = None
vibe_surf/backend/api/task.py
CHANGED
@@ -104,7 +104,7 @@ async def submit_task(
         logger.info("Using default empty MCP server configuration")
 
     # DEBUG: Log the type and content of mcp_server_config
-    logger.
+    logger.debug(f"mcp_server_config type: {type(mcp_server_config)}, value: {mcp_server_config}")
 
     # Create initial task record in database
     from ..database.queries import TaskQueries
vibe_surf/backend/database/queries.py
CHANGED
@@ -486,13 +486,13 @@ class TaskQueries:
             return existing_task
         else:
             # DEBUG: Log the type and content of mcp_server_config before saving
-            logger.
+            logger.debug(
                 f"Creating task with mcp_server_config type: {type(mcp_server_config)}, value: {mcp_server_config}")
 
             # Serialize mcp_server_config to JSON string if it's a dict
             if isinstance(mcp_server_config, dict):
                 mcp_server_config_json = json.dumps(mcp_server_config)
-                logger.
+                logger.debug(f"Converted dict to JSON string: {mcp_server_config_json}")
             else:
                 mcp_server_config_json = mcp_server_config
vibe_surf/backend/utils/llm_factory.py
CHANGED
@@ -58,7 +58,7 @@ def create_llm_from_profile(llm_profile) -> BaseChatModel:
         "deepseek": ["temperature"],
         "aws_bedrock": ["temperature"],
         "anthropic_bedrock": ["temperature"],
-        "openai_compatible": ["temperature"]
+        "openai_compatible": ["temperature", "max_tokens"]
     }
 
     # Build common parameters based on provider support
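
The hunk above adds "max_tokens" to the optional parameters allowed for the openai_compatible provider. A minimal sketch of how such a support table can gate which kwargs reach a model constructor (the function name and profile shape here are illustrative, not the package's actual API):

```python
# Hypothetical sketch: forward only the optional kwargs a provider supports.
PROVIDER_OPTIONAL_PARAMS = {
    "deepseek": ["temperature"],
    "aws_bedrock": ["temperature"],
    "anthropic_bedrock": ["temperature"],
    "openai_compatible": ["temperature", "max_tokens"],
}

def build_llm_kwargs(provider: str, profile: dict) -> dict:
    """Keep only the optional params this provider supports and the profile actually sets."""
    allowed = PROVIDER_OPTIONAL_PARAMS.get(provider, [])
    return {k: profile[k] for k in allowed if profile.get(k) is not None}

# Example: max_tokens now passes through for openai_compatible profiles.
print(build_llm_kwargs("openai_compatible", {"temperature": 0.2, "max_tokens": 4096}))
# -> {'temperature': 0.2, 'max_tokens': 4096}
```
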
vibe_surf/cli.py
CHANGED
@@ -325,7 +325,7 @@ def start_backend(port: int) -> None:
         console.print("[yellow]📝 Press Ctrl+C to stop the server[/yellow]\n")
 
         # Run the server
-        uvicorn.run(app, host="127.0.0.1", port=port, log_level="
+        uvicorn.run(app, host="127.0.0.1", port=port, log_level="error")
 
     except KeyboardInterrupt:
         console.print("\n[yellow]🛑 Server stopped by user[/yellow]")
vibe_surf/llm/openai_compatible.py
CHANGED
@@ -76,7 +76,7 @@ class ChatOpenAICompatible(ChatOpenAI):
     The class automatically detects the model type and applies appropriate fixes.
     """
 
-    max_completion_tokens: int | None =
+    max_completion_tokens: int | None = 8192
 
     def _is_gemini_model(self) -> bool:
         """Check if the current model is a Gemini model."""
@@ -337,7 +337,6 @@ class ChatOpenAICompatible(ChatOpenAI):
         try:
             parsed = output_format.model_validate_json(output_content)
         except Exception as e:
-            pdb.set_trace()
             repair_content = repair_json(output_content)
             parsed = output_format.model_validate_json(repair_content)
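
The second hunk removes a leftover pdb.set_trace() from the structured-output path, so a malformed completion now goes straight to the json_repair fallback. A self-contained sketch of that parse-then-repair pattern, with an invented pydantic schema standing in for the real output_format:

```python
from json_repair import repair_json
from pydantic import BaseModel

class Answer(BaseModel):  # illustrative schema, not from the package
    title: str
    score: int

def parse_structured(raw: str) -> Answer:
    """Validate LLM output as JSON; on failure, repair the JSON and retry once."""
    try:
        return Answer.model_validate_json(raw)
    except Exception:
        return Answer.model_validate_json(repair_json(raw))

# A trailing comma fails strict validation; repair_json fixes it first.
print(parse_structured('{"title": "ok", "score": 1,}'))
# -> title='ok' score=1
```
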
vibe_surf/tools/finance_tools.py
CHANGED
@@ -8,6 +8,8 @@ from typing import Dict, List, Any, Optional, Union
 from datetime import datetime, timedelta
 import yfinance as yf
 import pandas as pd
+from datetime import datetime
+
 from vibe_surf.logger import get_logger
 
 logger = get_logger(__name__)
@@ -445,33 +447,58 @@ class FinanceMarkdownFormatter:
             return "No news available.\n"
 
         markdown = f"**Total News Articles:** {len(news)}\n\n"
-        pdb.set_trace()
         for i, article in enumerate(news, 1):
             if isinstance(article, dict):
-                #
-
-
-
+                # Handle new yfinance news structure with nested 'content'
+                content = article.get('content', article)  # Fallback to article itself for backwards compatibility
+
+                # Extract title
+                title = (content.get('title') or
+                         content.get('headline') or
+                         content.get('summary') or
+                         article.get('title') or  # Fallback to old format
                          'No title available')
 
-                #
-
-
-                         article.get('guid') or '')
+                # Extract content type if available
+                content_type = content.get('contentType', '')
+                type_emoji = "🎥" if content_type == "VIDEO" else "📰"
 
-                #
-
-
-
-
+                # Extract link/URL - try new nested structure first
+                link = ''
+                if 'canonicalUrl' in content and isinstance(content['canonicalUrl'], dict):
+                    link = content['canonicalUrl'].get('url', '')
+                elif 'clickThroughUrl' in content and isinstance(content['clickThroughUrl'], dict):
+                    link = content['clickThroughUrl'].get('url', '')
+                else:
+                    # Fallback to old format
+                    link = (content.get('link') or
+                            content.get('url') or
+                            content.get('guid') or
+                            article.get('link') or '')
+
+                # Extract publisher - try new nested structure first
+                publisher = 'Unknown'
+                if 'provider' in content and isinstance(content['provider'], dict):
+                    publisher = content['provider'].get('displayName', 'Unknown')
+                else:
+                    # Fallback to old format
+                    publisher = (content.get('publisher') or
+                                 content.get('source') or
+                                 content.get('author') or
+                                 article.get('publisher') or
+                                 'Unknown')
 
-                #
-                publish_time = (
-
-
+                # Extract publication time
+                publish_time = (content.get('pubDate') or
+                                content.get('providerPublishTime') or
+                                content.get('timestamp') or
+                                content.get('published') or
+                                article.get('providerPublishTime') or '')
 
-
+                # Format the article
+                markdown += f"### {type_emoji} {i}. {title}\n"
+                if content_type:
+                    markdown += f"**Type:** {content_type}\n"
                 markdown += f"**Publisher:** {publisher}\n"
 
                 if publish_time:
@@ -481,11 +508,16 @@ class FinanceMarkdownFormatter:
                             dt = datetime.fromtimestamp(publish_time)
                             markdown += f"**Published:** {dt.strftime('%Y-%m-%d %H:%M')}\n"
                         elif isinstance(publish_time, str):
-                            # Try to parse
+                            # Try to parse ISO format first (new format)
                             try:
-
-
-
+                                if publish_time.endswith('Z'):
+                                    dt = datetime.fromisoformat(publish_time.replace('Z', '+00:00'))
+                                    markdown += f"**Published:** {dt.strftime('%Y-%m-%d %H:%M UTC')}\n"
+                                else:
+                                    # Try to parse as Unix timestamp
+                                    publish_time_int = int(float(publish_time))
+                                    dt = datetime.fromtimestamp(publish_time_int)
+                                    markdown += f"**Published:** {dt.strftime('%Y-%m-%d %H:%M')}\n"
                             except:
                                 markdown += f"**Published:** {publish_time}\n"
                     except Exception as e:
@@ -496,14 +528,25 @@ class FinanceMarkdownFormatter:
                     markdown += f"**Link:** {link}\n"
 
                 # Add summary or description if available
-                summary = (
-
-
+                summary = (content.get('summary') or
+                           content.get('description') or
+                           content.get('snippet') or
+                           article.get('summary') or '')
                 if summary and summary != title:
+                    # Clean HTML tags from description if present
+                    import re
+                    clean_summary = re.sub(r'<[^>]+>', '', summary)
+                    clean_summary = re.sub(r'\s+', ' ', clean_summary).strip()
+
                     # Limit summary length
-                    if len(
-
-                    markdown += f"**Summary:** {
+                    if len(clean_summary) > 300:
+                        clean_summary = clean_summary[:300] + "..."
+                    markdown += f"**Summary:** {clean_summary}\n"
+
+                # Add metadata if available
+                if 'metadata' in content and isinstance(content['metadata'], dict):
+                    if content['metadata'].get('editorsPick'):
+                        markdown += f"**Editor's Pick:** ✅\n"
 
                 markdown += "\n"
 
@@ -514,10 +557,10 @@ class FinanceMarkdownFormatter:
         """Format dividend data as markdown"""
         if dividends.empty:
             return "No dividend data available.\n"
-
+
         markdown = f"**Total Dividends Recorded:** {len(dividends)}\n"
         markdown += f"**Date Range:** {dividends.index.min().strftime('%Y-%m-%d')} to {dividends.index.max().strftime('%Y-%m-%d')}\n\n"
-
+
         # Recent dividends (last 10)
         recent_dividends = dividends.tail(10)
         markdown += "### 💰 Recent Dividends\n\n"
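
The news formatter now accepts both publish-time shapes yfinance emits: epoch seconds in the legacy providerPublishTime field and ISO-8601 strings (often with a trailing 'Z') in the newer pubDate field. A standalone sketch of that dual parse, mirroring the hunk above:

```python
from datetime import datetime

def parse_publish_time(publish_time) -> str:
    """Render either an epoch-seconds value or an ISO-8601 string as a display date."""
    if isinstance(publish_time, (int, float)):
        return datetime.fromtimestamp(publish_time).strftime('%Y-%m-%d %H:%M')
    if isinstance(publish_time, str):
        try:
            if publish_time.endswith('Z'):
                # fromisoformat() before Python 3.11 rejects 'Z', so swap in an explicit offset
                dt = datetime.fromisoformat(publish_time.replace('Z', '+00:00'))
                return dt.strftime('%Y-%m-%d %H:%M UTC')
            # Otherwise treat the string as epoch seconds
            return datetime.fromtimestamp(int(float(publish_time))).strftime('%Y-%m-%d %H:%M')
        except (ValueError, OSError):
            return publish_time  # fall back to the raw string
    return ''

print(parse_publish_time('2024-05-01T12:30:00Z'))  # -> 2024-05-01 12:30 UTC
print(parse_publish_time(1714566600))              # epoch seconds
```
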
vibe_surf/tools/vibesurf_tools.py
CHANGED
@@ -196,6 +196,7 @@ class VibeSurfTools:
             raise RuntimeError("LLM is required for skill_search")
 
         # Step 1: Use LLM to analyze user intent and generate different search tasks
+        query_num = 6
         from datetime import datetime
         analysis_prompt = f"""
 Analyze the user query and generate 5 different Google search strategies to comprehensively find relevant information.
@@ -204,13 +205,13 @@ Current Time: {datetime.now().isoformat()}
 
 User Query: "{params.query}"
 
-Generate
+Generate {query_num} different search queries that approach this topic from different angles. Each search should be:
 1. Specific and concrete (good for Google search)
 2. Different from the others (different perspectives/aspects)
 3. Likely to return valuable, unique information
 
-Return your response as a JSON array of
-Example format: ["query 1", "query 2", "query 3", "query 4", "query 5"]
+Return your response as a JSON array of {query_num} search query strings.
+Example format: ["query 1", "query 2", "query 3", "query 4", "query 5", "query 6"]
 """
 
         from browser_use.llm.messages import SystemMessage, UserMessage
@@ -225,12 +226,14 @@ Example format: ["query 1", "query 2", "query 3", "query 4", "query 5"]
             search_queries = json.loads(response.completion.strip())
             if not isinstance(search_queries, list):
                 raise ValueError("Invalid search queries format")
-            search_queries = search_queries[:
+            search_queries = search_queries[:query_num]
         except (json.JSONDecodeError, ValueError):
             # Fallback to simple queries if parsing fails
             try:
                 from json_repair import repair_json
-
+                search_queries_s = repair_json(response.completion.strip())
+                search_queries = json.loads(search_queries_s)
+                search_queries = search_queries[:query_num]
             except Exception as e:
                 search_queries = [
                     params.query,
@@ -243,7 +246,7 @@ Example format: ["query 1", "query 2", "query 3", "query 4", "query 5"]
         # Step 2: Create browser sessions for parallel searching
         register_sessions = []
 
-        for i, query in enumerate(search_queries):
+        for i, query in enumerate(search_queries[:query_num]):
             agent_id = f"search_agent_{i + 1:03d}"
             register_sessions.append(
                 browser_manager.register_agent(agent_id, target_id=None)
@@ -258,7 +261,6 @@ Example format: ["query 1", "query 2", "query 3", "query 4", "query 5"]
             search_tasks.append(self._perform_google_search(browser_session, query, llm))
 
         search_results = await asyncio.gather(*search_tasks, return_exceptions=True)
-
         # Step 4: Aggregate and filter results
         all_results = []
         for i, result in enumerate(search_results):
@@ -268,38 +270,74 @@ Example format: ["query 1", "query 2", "query 3", "query 4", "query 5"]
             if result:
                 all_results.extend(result)
 
-        # Step 5:
-        if all_results:
+        # Step 4.5: Rule-based deduplication to reduce LLM processing load
+        # if all_results:
+        #     deduplicated_results = self._rule_based_deduplication(all_results)
+        #     logger.info(f"Rule-based deduplication: {len(all_results)} -> {len(deduplicated_results)} results")
+        # else:
+        #     deduplicated_results = []
+
+        # Step 5: Use LLM only for final ranking and selection (much smaller dataset now)
+        if all_results and len(all_results) > 10:
+            # Only use LLM if we have more than 10 results to rank
+            # Create indexed results for LLM prompt
+            indexed_results = []
+            for i, result in enumerate(all_results):
+                indexed_results.append({
+                    "index": i,
+                    "title": result.get('title', 'Unknown Title'),
+                    "url": result.get('url', 'No URL'),
+                    "summary": result.get('summary', 'No summary available')
+                })
+
             ranking_prompt = f"""
-
-
-2. Rank by relevance and value to the user
-3. Select the TOP 10 most relevant and valuable results
+Rank these search results for the query "{params.query}" by relevance and value.
+Select the TOP 10 most relevant and valuable results.
 
-Search Results:
-{json.dumps(
+Search Results ({len(indexed_results)} total):
+{json.dumps(indexed_results, indent=2, ensure_ascii=False)}
 
-Return the top 10 results as a JSON array
-
-- url: string
-- summary: string (brief description of why this result is valuable)
+Return ONLY the indices of the top 10 results as a JSON array of numbers.
+For example: [0, 5, 2, 8, 1, 9, 3, 7, 4, 6]
 
-Format: [
+Format: [index1, index2, index3, ...]
 """
 
             ranking_response = await llm.ainvoke([
                 SystemMessage(
-                    content="You are an expert at
+                    content="You are an expert at ranking search results for relevance and value. Return only the indices of the top results."),
                 UserMessage(content=ranking_prompt)
             ])
 
             try:
-
-                if not isinstance(
+                selected_indices = json.loads(ranking_response.completion.strip())
+                if not isinstance(selected_indices, list):
                     raise ValueError("Invalid ranking results format")
+                # Ensure indices are valid and limit to 10
+                valid_indices = [i for i in selected_indices if isinstance(i, int) and 0 <= i < len(all_results)][:10]
+                if valid_indices:
+                    top_results = [all_results[i] for i in valid_indices]
+                else:
+                    top_results = all_results[:10]
             except (json.JSONDecodeError, ValueError):
-
-
+                try:
+                    selected_indices_s = repair_json(ranking_response.completion.strip())
+                    selected_indices = json.loads(selected_indices_s)
+                    if isinstance(selected_indices, list):
+                        valid_indices = [i for i in selected_indices if isinstance(i, int) and 0 <= i < len(all_results)][:10]
+                        if valid_indices:
+                            top_results = [all_results[i] for i in valid_indices]
+                        else:
+                            top_results = all_results[:10]
+                    else:
+                        top_results = all_results[:10]
+                except Exception:
+                    # Fallback to first 10 results
+                    top_results = all_results[:10]
+        elif all_results:
+            # If we have 10 or fewer results, skip LLM ranking
+            top_results = all_results[:10]
+            logger.info(f"Skipping LLM ranking for {len(all_results)} results (≤10)")
         else:
             top_results = []
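
Having the ranking LLM return bare indices instead of full result objects keeps its response small and makes validation mechanical. A toy run of the same guard used above, standalone outside the class:

```python
# Toy data standing in for the aggregated search results.
all_results = [{"title": f"result {i}"} for i in range(12)]

# Suppose the LLM answered with some invalid entries mixed in.
selected_indices = [3, 0, 99, "7", 5, 3]

# Same guard as the hunk: keep only in-range ints, cap at 10, duplicates allowed.
valid_indices = [i for i in selected_indices
                 if isinstance(i, int) and 0 <= i < len(all_results)][:10]
top_results = [all_results[i] for i in valid_indices] if valid_indices else all_results[:10]

print(valid_indices)                      # -> [3, 0, 5, 3]  (99 out of range, "7" not an int)
print([r["title"] for r in top_results])  # -> ['result 3', 'result 0', 'result 5', 'result 3']
```
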
@@ -694,7 +732,7 @@ Please fix the error and generate corrected JavaScript code:"""
                     elif isinstance(value, (dict, list)):
                         # Complex objects - should be serialized by returnByValue
                         try:
-                            result_text = json.dumps(value, ensure_ascii=False)
+                            result_text = json.dumps(value, ensure_ascii=False, indent=2)
                         except (TypeError, ValueError):
                             # Fallback for non-serializable objects
                             result_text = str(value)
@@ -729,7 +767,7 @@ The result is empty or not useful. Please generate improved JavaScript code that
                     result_text = result_text[:30000] + '\n... [Truncated after 30000 characters]'
 
                 # Success! Return the result
-                msg = f'
+                msg = f'Generated Code (Iteration {iteration}): \n```javascript\n{generated_js_code}\n```\nResult:\n```json\n {result_text}\n```\n'
                 logger.info(f'✅ Skill Code succeeded on iteration {iteration}')
 
                 return ActionResult(
@@ -907,19 +945,164 @@ Please generate alternative JavaScript code that avoids this system error:"""
             return ActionResult(error=error_msg)
 
 
+    async def _extract_google_results_rule_based(self, browser_session):
+        """Rule-based extraction of Google search results using JavaScript"""
+        try:
+            cdp_session = await browser_session.get_or_create_cdp_session()
+
+            # JavaScript code to extract Google search results using DOM selectors
+            js_extraction_code = """
+            (function() {
+                try {
+                    const results = [];
+
+                    // Multiple selector strategies for different Google layouts
+                    const selectors = [
+                        'div[data-sokoban-container] div[data-sokoban-feature]', // Standard results
+                        'div.g:not(.g-blk)', // Classic results container
+                        '.tF2Cxc', // Modern result container
+                        'div[data-ved] h3', // Result titles
+                    ];
+
+                    let resultElements = [];
+
+                    // Try each selector until we find results
+                    for (const selector of selectors) {
+                        const elements = document.querySelectorAll(selector);
+                        if (elements.length > 0) {
+                            resultElements = Array.from(elements).slice(0, 10); // Get up to 10 results
+                            break;
+                        }
+                    }
+
+                    // If no results found with specific selectors, try broader search
+                    if (resultElements.length === 0) {
+                        // Look for any divs containing h3 elements (likely search results)
+                        const h3Elements = document.querySelectorAll('h3');
+                        resultElements = Array.from(h3Elements)
+                            .map(h3 => h3.closest('div'))
+                            .filter(div => div && div.querySelector('a[href]'))
+                            .slice(0, 10);
+                    }
+
+                    for (let i = 0; i < Math.min(resultElements.length, 10); i++) {
+                        const element = resultElements[i];
+
+                        // Extract title
+                        let title = '';
+                        const titleSelectors = ['h3', '[role="heading"]', 'a > span', '.LC20lb'];
+                        for (const sel of titleSelectors) {
+                            const titleEl = element.querySelector(sel);
+                            if (titleEl && titleEl.textContent.trim()) {
+                                title = titleEl.textContent.trim();
+                                break;
+                            }
+                        }
+
+                        // Extract URL
+                        let url = '';
+                        const linkSelectors = ['a[href^="http"]', 'a[href^="/url?q="]', 'a[href]'];
+                        for (const sel of linkSelectors) {
+                            const linkEl = element.querySelector(sel);
+                            if (linkEl && linkEl.href) {
+                                url = linkEl.href;
+                                // Clean Google redirect URLs
+                                if (url.includes('/url?q=')) {
+                                    const urlMatch = url.match(/[?&]q=([^&]*)/);
+                                    if (urlMatch) {
+                                        url = decodeURIComponent(urlMatch[1]);
+                                    }
+                                }
+                                break;
+                            }
+                        }
+
+                        // Extract summary/description
+                        let summary = '';
+                        const summarySelectors = [
+                            '.VwiC3b', // Description text
+                            '.yXK7lf', // Snippet text
+                            '[data-content-feature="1"] span',
+                            '.s', // Classic description
+                            'span:not(:has(a))'
+                        ];
+                        for (const sel of summarySelectors) {
+                            const summaryEl = element.querySelector(sel);
+                            if (summaryEl && summaryEl.textContent.trim() && summaryEl.textContent.length > 10) {
+                                summary = summaryEl.textContent.trim();
+                                break;
+                            }
+                        }
+
+                        // Only add if we have at least title or URL
+                        if (title || url) {
+                            results.push({
+                                title: title || 'No title',
+                                url: url || 'No URL',
+                                summary: summary || 'No description available'
+                            });
+                        }
+                    }
+
+                    return JSON.stringify(results);
+
+                } catch (e) {
+                    return JSON.stringify([{
+                        title: 'Error extracting results',
+                        url: window.location.href,
+                        summary: 'JavaScript extraction failed: ' + e.message
+                    }]);
+                }
+            })()
+            """
+
+            # Execute JavaScript to extract results
+            result = await cdp_session.cdp_client.send.Runtime.evaluate(
+                params={'expression': js_extraction_code, 'returnByValue': True, 'awaitPromise': True},
+                session_id=cdp_session.session_id,
+            )
+
+            if result.get('exceptionDetails'):
+                logger.warning(f"JavaScript extraction failed: {result['exceptionDetails']}")
+                return []
+
+            result_data = result.get('result', {})
+            value = result_data.get('value', '[]')
+
+            try:
+                extracted_results = json.loads(value)
+                return extracted_results if isinstance(extracted_results, list) else []
+            except (json.JSONDecodeError, ValueError):
+                logger.warning(f"Failed to parse extraction results: {value}")
+                return []
+
+        except Exception as e:
+            logger.error(f"Rule-based extraction failed: {e}")
+            return []
+
     async def _perform_google_search(self, browser_session, query: str, llm: BaseChatModel):
-        """Helper method to perform Google search and extract top 5 results"""
+        """Helper method to perform Google search and extract top 5 results using rule-based extraction"""
         try:
             # Navigate to Google search
             search_url = f'https://www.google.com/search?q={query}&udm=14'
             await browser_session.navigate_to_url(search_url, new_tab=False)
 
             # Wait a moment for page to load
-            await asyncio.sleep(
-
-            #
+            await asyncio.sleep(2)
+
+            # Use rule-based extraction first (much faster than LLM)
+            search_ret_len = 10
+            results = await self._extract_google_results_rule_based(browser_session)
+            if results and len(results) > 0:
+                # Rule-based extraction succeeded
+                logger.debug(f"Rule-based extraction found {len(results)} results for query: {query}")
+                return results[:search_ret_len]  # Return top 6 results
+
+            # Fallback to LLM extraction if rule-based fails
+            logger.warning(f"Rule-based extraction failed for query '{query}', falling back to LLM")
+
             extraction_query = f"""
-Extract the top
+Extract the top {search_ret_len} search results from this Google search page. For each result, provide:
 - title: The clickable title/headline
 - url: The website URL
 - summary: A brief description of what this result contains
@@ -930,18 +1113,17 @@ Return results as a JSON array: [{{"title": "...", "url": "...", "summary": "...
             results_text = await self._extract_structured_content(browser_session, extraction_query, llm)
 
             # Try to parse JSON results
-            import json
             try:
                 results = json.loads(results_text.strip())
                 if isinstance(results, list):
-                    return results[:
+                    return results[:search_ret_len]  # Ensure max 5 results
             except (json.JSONDecodeError, ValueError):
                 try:
                     results = repair_json(results_text.strip())
                     if isinstance(results, list):
-                        return results[:
+                        return results[:search_ret_len]  # Ensure max 5 results
                 except Exception as e:
-                    logger.warning(f"Failed to parse JSON from search results: {results_text}")
+                    logger.warning(f"Failed to parse JSON from LLM search results: {results_text}")
 
             # Fallback: return raw text as single result
             current_url = await browser_session.get_current_page_url()
@@ -955,6 +1137,74 @@ Return results as a JSON array: [{{"title": "...", "url": "...", "summary": "...
             logger.error(f"Google search failed for query '{query}': {e}")
             return []
 
+    def _rule_based_deduplication(self, results):
+        """Rule-based deduplication to reduce dataset before LLM processing"""
+        if not results:
+            return []
+
+        deduplicated = []
+        seen_urls = set()
+        seen_titles = set()
+
+        for result in results:
+            url = result.get('url', '').strip()
+            title = result.get('title', '').strip().lower()
+
+            # Skip results with missing essential data
+            if not url or not title or url == 'No URL' or title == 'no title':
+                continue
+
+            # Normalize URL for comparison (remove fragments, query params for deduplication)
+            normalized_url = url.split('#')[0].split('?')[0].lower()
+
+            # Check for duplicate URLs
+            if normalized_url in seen_urls:
+                continue
+
+            # Check for very similar titles (basic similarity)
+            title_normalized = ''.join(c for c in title if c.isalnum()).lower()
+            if len(title_normalized) > 10:  # Only check titles with substantial content
+                similar_found = False
+                for seen_title in seen_titles:
+                    # Simple similarity check: if 80% of characters match
+                    if len(title_normalized) > 0 and len(seen_title) > 0:
+                        common_chars = sum(1 for c in title_normalized if c in seen_title)
+                        similarity = common_chars / max(len(title_normalized), len(seen_title))
+                        if similarity > 0.8:
+                            similar_found = True
+                            break
+
+                if similar_found:
+                    continue
+
+            # Add to deduplicated results
+            seen_urls.add(normalized_url)
+            seen_titles.add(title_normalized)
+            deduplicated.append(result)
+
+        # Sort by relevance indicators (prioritize results with longer summaries, non-generic titles)
+        def relevance_score(result):
+            score = 0
+            title = result.get('title', '')
+            summary = result.get('summary', '')
+
+            # Longer summaries are typically more informative
+            score += min(len(summary), 200) / 10
+
+            # Non-generic titles score higher
+            generic_terms = ['search results', 'no title', 'error', 'loading']
+            if not any(term in title.lower() for term in generic_terms):
+                score += 10
+
+            # Prefer results with actual descriptions
+            if summary and summary != 'No description available' and len(summary) > 20:
+                score += 5
+
+            return score
+
+        deduplicated.sort(key=relevance_score, reverse=True)
+        return deduplicated
+
     async def _extract_structured_content(self, browser_session, query: str, llm: BaseChatModel):
         """Helper method to extract structured content from current page"""
         MAX_CHAR_LIMIT = 30000
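
The title-similarity test in _rule_based_deduplication is deliberately cheap: it counts how many characters of one normalized title appear anywhere in the other, so it is order-insensitive and can over-merge titles drawn from the same alphabet, an accepted trade-off for a pre-LLM filter. A standalone check of the metric on sample titles:

```python
def title_similarity(a: str, b: str) -> float:
    """Character-overlap ratio used by the dedup pass above (order-insensitive)."""
    a_norm = ''.join(c for c in a.lower() if c.isalnum())
    b_norm = ''.join(c for c in b.lower() if c.isalnum())
    if not a_norm or not b_norm:
        return 0.0
    common = sum(1 for c in a_norm if c in b_norm)
    return common / max(len(a_norm), len(b_norm))

# Near-duplicate headlines clear the 0.8 bar; unrelated ones fall below it.
print(title_similarity("OpenAI releases new model", "OpenAI releases a new model"))  # ~0.96
print(title_similarity("OpenAI releases new model", "Quarterly dividend announced"))  # ~0.65
```
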
{vibesurf-0.1.24.dist-info → vibesurf-0.1.26.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
 vibe_surf/__init__.py,sha256=WtduuMFGauMD_9dpk4fnRnLTAP6ka9Lfu0feAFNzLfo,339
-vibe_surf/_version.py,sha256=
-vibe_surf/cli.py,sha256=
+vibe_surf/_version.py,sha256=Y9o7KiJWiG6n9XbSpMICgNgajFRbL4an-gN1BQc-jwM,706
+vibe_surf/cli.py,sha256=KAmUBsXfS-NkMp3ITxzNXwtFeKVmXJUDZiWqLcIC0BI,16690
 vibe_surf/common.py,sha256=_WWMxen5wFwzUjEShn3yDVC1OBFUiJ6Vccadi6tuG6w,1215
 vibe_surf/logger.py,sha256=k53MFA96QX6t9OfcOf1Zws8PP0OOqjVJfhUD3Do9lKw,3043
 vibe_surf/agents/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -23,12 +23,12 @@ vibe_surf/backend/api/browser.py,sha256=NXedyZG3NIVRIx5O7d9mHwVWX-Q4_KsX5mSgfKt8
 vibe_surf/backend/api/config.py,sha256=vKY6ZnKZeazQP9qqUEiQvP9HoPtJbAzETORuPWZomGw,27272
 vibe_surf/backend/api/files.py,sha256=kJMG9MWECKXwGh64Q6xvAzNjeZGcLhIEnn65HiMZHKE,11762
 vibe_surf/backend/api/models.py,sha256=n_bu8vavvO8bIKA1WUAbaGPFeZKeamMJelDWU3DlFJc,10533
-vibe_surf/backend/api/task.py,sha256=
+vibe_surf/backend/api/task.py,sha256=CYx8FNN04XM8cH9BjAuMb-E7GUTxi4pl0OsT_1KnDBQ,14360
 vibe_surf/backend/api/voices.py,sha256=YfPCqnR7EAYh2nfMRIpB0xEo6_giTtxrcSeobU3HQHg,17098
 vibe_surf/backend/database/__init__.py,sha256=XhmcscnhgMhUyXML7m4SnuQIqkFpyY_zJ0D3yYa2RqQ,239
 vibe_surf/backend/database/manager.py,sha256=Okmr6yG2aycmatONRMyRbHe6l53RkFIPeMxxPSD3ycY,11884
 vibe_surf/backend/database/models.py,sha256=Z5_RqGyD4ER5bsrYjc2iso9yPo7zfAqxNeVDGtZqotw,8887
-vibe_surf/backend/database/queries.py,sha256=
+vibe_surf/backend/database/queries.py,sha256=6SsAxTr-ocQ189xQ5m0L3BsgUdkGtmt2TcrXP-JIbrw,41157
 vibe_surf/backend/database/schemas.py,sha256=OPnpRKwYG1Cu8geJ6pajiEDF8x8mRestXnAfI4Gy18w,3402
 vibe_surf/backend/database/migrations/v001_initial_schema.sql,sha256=MC2fa1WHUEhHhdOTxz0qB4RI7JdGRpiGXZ77ytl3LRQ,4345
 vibe_surf/backend/database/migrations/v002_add_agent_mode.sql,sha256=jKnW28HsphUeU9kudEx9QaLnUh8swmmOt-hFsZJay24,251
@@ -36,7 +36,7 @@ vibe_surf/backend/database/migrations/v003_fix_task_status_case.sql,sha256=npzRg
 vibe_surf/backend/database/migrations/v004_add_voice_profiles.sql,sha256=-9arjQBF-OxvFIOwkEl7JJJRDTS_nJ8GNX3T7bJgVq0,1321
 vibe_surf/backend/utils/__init__.py,sha256=V8leMFp7apAglUAoCHPZrNNcRHthSLYIudIJE5qwjb0,184
 vibe_surf/backend/utils/encryption.py,sha256=CjLNh_n0Luhfa-6BB-icfzkiiDqj5b4Gu6MADU3p2eM,3754
-vibe_surf/backend/utils/llm_factory.py,sha256=
+vibe_surf/backend/utils/llm_factory.py,sha256=XIJYc9Lh_L2vbwlAe96PrjptlzJtLOjCGNdHEx6fThk,9047
 vibe_surf/browser/__init__.py,sha256=_UToO2fZfSCrfjOcxhn4Qq7ZLbYeyPuUUEmqIva-Yv8,325
 vibe_surf/browser/agen_browser_profile.py,sha256=J06hCBJSJ-zAFVM9yDFz8UpmiLuFyWke1EMekpU45eo,5871
 vibe_surf/browser/agent_browser_session.py,sha256=xV0nHo_TCb7b7QYhIee4cLzH-1rqJswYwH7GEwyQmqc,33980
@@ -85,20 +85,20 @@ vibe_surf/chrome_extension/styles/settings-responsive.css,sha256=jLE0yG15n2aI6_6
 vibe_surf/chrome_extension/styles/settings-utilities.css,sha256=3PuQS2857kg83d5erLbLdo_7J95-qV-qyNWS5M-w1oQ,505
 vibe_surf/chrome_extension/styles/variables.css,sha256=enjyhsa0PeU3b-3uiXa-VkV-1-h2-Ai3m4KpmC2k0rY,2984
 vibe_surf/llm/__init__.py,sha256=_vDVPo6STf343p1SgMQrF5023hicAx0g83pK2Gbk4Ek,601
-vibe_surf/llm/openai_compatible.py,sha256=
+vibe_surf/llm/openai_compatible.py,sha256=i0a5OLaL6QIlacVyctOG09vKr3KOi8T8Izp1v7xkD5I,16112
 vibe_surf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vibe_surf/tools/browser_use_tools.py,sha256=tacxKUJL6uOt04f52_iIw1cs-FT-mBgIPmAsIc4Hww0,23730
 vibe_surf/tools/file_system.py,sha256=Tw_6J5QjCahQ3fd26CXziF1zPvRxhYM0889oK4bDhlU,19304
-vibe_surf/tools/finance_tools.py,sha256=
+vibe_surf/tools/finance_tools.py,sha256=E8rmblp57e_cp0tFbdZ7BY3_upNlk4Whk0bYc_SFCJE,27284
 vibe_surf/tools/mcp_client.py,sha256=OeCoTgyx4MoY7JxXndK6pGHIoyFOhf5r7XCbx25y1Ec,2446
 vibe_surf/tools/report_writer_tools.py,sha256=2CyTTXOahTKZo7XwyWDDhJ--1mRA0uTtUWxu_DACAY0,776
 vibe_surf/tools/vibesurf_registry.py,sha256=Z-8d9BrJl3RFMEK0Tw1Q5xNHX2kZGsnIGCTBZ3RM-pw,2159
-vibe_surf/tools/vibesurf_tools.py,sha256=
+vibe_surf/tools/vibesurf_tools.py,sha256=O8y1noWyY8y-j8I7vF4oOaVDybINNXiNXWNwGJJ5xsM,91500
 vibe_surf/tools/views.py,sha256=AEAPzML-lqWJ7dBMjXTl7o-rk4hp5PGaPRqLyilJUl8,7789
 vibe_surf/tools/voice_asr.py,sha256=AJG0yq_Jq-j8ulDlbPhVFfK1jch9_ASesis73iki9II,4702
-vibesurf-0.1.24.dist-info/licenses/LICENSE,sha256=
-vibesurf-0.1.24.dist-info/METADATA,sha256=
-vibesurf-0.1.24.dist-info/WHEEL,sha256=
-vibesurf-0.1.24.dist-info/entry_points.txt,sha256=
-vibesurf-0.1.24.dist-info/top_level.txt,sha256=
-vibesurf-0.1.24.dist-info/RECORD,,
+vibesurf-0.1.26.dist-info/licenses/LICENSE,sha256=czn6QYya0-jhLnStD9JqnMS-hwP5wRByipkrGTvoXLI,11355
+vibesurf-0.1.26.dist-info/METADATA,sha256=8Bdh3-15Hl0KbmB0ghk9KNQiveDnVGnhzZ-4Dv0MVjc,5190
+vibesurf-0.1.26.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+vibesurf-0.1.26.dist-info/entry_points.txt,sha256=UxqpvMocL-PR33S6vLF2OmXn-kVzM-DneMeZeHcPMM8,48
+vibesurf-0.1.26.dist-info/top_level.txt,sha256=VPZGHqSb6EEqcJ4ZX6bHIuWfon5f6HXl3c7BYpbRqnY,10
+vibesurf-0.1.26.dist-info/RECORD,,
{vibesurf-0.1.24.dist-info → vibesurf-0.1.26.dist-info}/WHEEL
File without changes
{vibesurf-0.1.24.dist-info → vibesurf-0.1.26.dist-info}/entry_points.txt
File without changes
{vibesurf-0.1.24.dist-info → vibesurf-0.1.26.dist-info}/licenses/LICENSE
File without changes
{vibesurf-0.1.24.dist-info → vibesurf-0.1.26.dist-info}/top_level.txt
File without changes