skydeckai-code 0.1.27__py3-none-any.whl → 0.1.28__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between the two versions.
- aidd/tools/__init__.py +3 -1
- aidd/tools/web_tools.py +592 -2
- {skydeckai_code-0.1.27.dist-info → skydeckai_code-0.1.28.dist-info}/METADATA +53 -4
- {skydeckai_code-0.1.27.dist-info → skydeckai_code-0.1.28.dist-info}/RECORD +7 -7
- {skydeckai_code-0.1.27.dist-info → skydeckai_code-0.1.28.dist-info}/WHEEL +0 -0
- {skydeckai_code-0.1.27.dist-info → skydeckai_code-0.1.28.dist-info}/entry_points.txt +0 -0
- {skydeckai_code-0.1.27.dist-info → skydeckai_code-0.1.28.dist-info}/licenses/LICENSE +0 -0
aidd/tools/__init__.py
CHANGED
@@ -77,7 +77,7 @@ from .screenshot_tool import (
     handle_capture_screenshot,
 )
 from .system_tools import get_system_info_tool, handle_get_system_info
-from .web_tools import web_fetch_tool, handle_web_fetch
+from .web_tools import web_fetch_tool, handle_web_fetch, web_search_tool, handle_web_search

 # Export all tools definitions
 TOOL_DEFINITIONS = [
@@ -125,6 +125,7 @@ TOOL_DEFINITIONS = [
     read_image_file_tool(),
     # Web tools
     web_fetch_tool(),
+    web_search_tool(),
 ]

 # Export all handlers
@@ -173,4 +174,5 @@ TOOL_HANDLERS = {
     "read_image_file": handle_read_image_file,
     # Web handlers
     "web_fetch": handle_web_fetch,
+    "web_search": handle_web_search,
 }
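The registration change is mechanical but worth spelling out: each entry in TOOL_DEFINITIONS is a schema dict whose "name" must match a key in TOOL_HANDLERS, and the server dispatches on that name. A minimal sketch of the pattern; `dispatch` here is a hypothetical helper for illustration (the real server wiring lives in aidd/server.py and may differ):

```python
# Hypothetical dispatcher sketch (not part of the package): TOOL_HANDLERS
# maps each tool's "name" to an async handler that takes an arguments dict
# and returns a list of MCP content objects.
from aidd.tools import TOOL_HANDLERS

async def dispatch(tool_name: str, arguments: dict):
    handler = TOOL_HANDLERS.get(tool_name)
    if handler is None:
        raise ValueError(f"Unknown tool: {tool_name}")
    return await handler(arguments)
```

After this change, "web_search" resolves to handle_web_search exactly the way "web_fetch" already resolves to handle_web_fetch.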
aidd/tools/web_tools.py
CHANGED
@@ -1,9 +1,12 @@
 import os
-import
-
+import random
+import time
+from typing import List
 from urllib.parse import urlparse

+import requests
 from mcp.types import TextContent
+
 from .state import state


@@ -17,6 +20,7 @@ def web_fetch_tool():
         "WHEN NOT TO USE: When you need to interact with complex websites requiring authentication "
         "or session management, when the data needs to be processed in a specific format not supported, "
         "or when you need to make authenticated API calls with OAuth. "
+        "TIP: Use 'web_search' first to find relevant URLs, then use this tool to fetch detailed content. "
         "RETURNS: The content of the URL as text. For HTML pages, returns the raw HTML content. "
         "For JSON endpoints, returns the JSON content as a string. Successful response includes HTTP "
         "status code. Failed requests include error details. Maximum request size enforced for safety.",
@@ -183,3 +187,589 @@ async def handle_web_fetch(arguments: dict) -> List[TextContent]:
     except Exception as e:
         # Handle other errors
         raise ValueError(f"Error processing content from {url}: {str(e)}")
+
+
+def web_search_tool():
+    return {
+        "name": "web_search",
+        "description": "Performs a web search and returns the search results. "
+        "WHEN TO USE: When you need to find information on the web, get up-to-date data, "
+        "or research a topic. This provides more current information than your training data. "
+        "WHEN NOT TO USE: For queries requiring complex authentication, accessing private data, "
+        "or when you want to browse interactive websites. "
+        "TIP: For best results, use this tool to find relevant URLs, then use 'web_fetch' to get the full content of specific pages. "
+        "RETURNS: A list of search results including titles, URLs, and snippets for each result.",
+        "inputSchema": {
+            "type": "object",
+            "properties": {
+                "query": {
+                    "type": "string",
+                    "description": "The search query to send to search engine. Be specific to get better results. "
+                    "Example: 'latest python release features' or 'climate change statistics 2023'."
+                },
+                "num_results": {
+                    "type": "integer",
+                    "description": "Number of search results to return. Maximum is 20 to prevent abuse.",
+                    "default": 10
+                },
+                "convert_html_to_markdown": {
+                    "type": "boolean",
+                    "description": "If true, search result snippets will be converted from HTML to markdown "
+                    "for better readability.",
+                    "default": True
+                },
+                "search_engine": {
+                    "type": "string",
+                    "description": "Specifies which search engine to use. Options: 'auto' (tries all in sequence), "
+                    "'bing', or 'duckduckgo'. Some engines may block automated requests.",
+                    "enum": ["auto", "bing", "duckduckgo"],
+                    "default": "auto"
+                }
+            },
+            "required": ["query"]
+        }
+    }
+
+
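The inputSchema above is plain JSON Schema, and only "query" is required. As a rough sketch of a direct call under those assumptions (the handler is async and returns a list of TextContent, so the rendered markdown sits on `.text`):

```python
import asyncio

from aidd.tools.web_tools import handle_web_search

# Arguments shaped by the schema above; everything except "query" is
# optional and shown here at its declared default.
arguments = {
    "query": "latest python release features",
    "num_results": 10,                  # the handler additionally caps this at 20
    "convert_html_to_markdown": True,
    "search_engine": "auto",            # or "bing" / "duckduckgo"
}

results = asyncio.run(handle_web_search(arguments))
print(results[0].text)  # markdown: "# Web Search Results" followed by entries
```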
+def _process_ddg_url(url):
+    """Process DuckDuckGo URLs to get the actual target URL."""
+    try:
+        import urllib.parse
+        url_parts = urllib.parse.urlparse(url)
+
+        # Case 1: Traditional uddg parameter format
+        if 'uddg' in url_parts.query:
+            query_parts = urllib.parse.parse_qs(url_parts.query)
+            extracted_url = query_parts.get('uddg', [''])[0]
+            if extracted_url:
+                return extracted_url
+
+        # Case 2: Advertising/redirect y.js format
+        elif 'y.js' in url_parts.path:
+            query_parts = urllib.parse.parse_qs(url_parts.query)
+            # Try ad_domain first
+            if 'ad_domain' in query_parts and query_parts['ad_domain'][0]:
+                return f"https://{query_parts['ad_domain'][0]}"
+            # Then try du parameter
+            elif 'du' in query_parts and query_parts['du'][0]:
+                return query_parts['du'][0]
+            # Try other known parameters
+            for param in ['u', 'l']:
+                if param in query_parts and query_parts[param][0]:
+                    return query_parts[param][0]
+
+        # Case 3: Direct URL
+        elif url.startswith('http'):
+            return url
+
+    except Exception as e:
+        print(f"Error processing DuckDuckGo URL: {str(e)}")
+
+    # Default to original URL if all else fails
+    return url
+
+
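_process_ddg_url exists because DuckDuckGo's HTML endpoint rarely links to results directly; it wraps them in redirect URLs. The example URLs below are illustrative shapes for the three branches, not captured traffic:

```python
# Case 1: organic results carry the target in a percent-encoded 'uddg'
# parameter; urllib.parse.parse_qs decodes it.
_process_ddg_url("https://duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fdocs")
# -> "https://example.com/docs"

# Case 2: ad/redirect links go through y.js; the domain is rebuilt from 'ad_domain'.
_process_ddg_url("https://duckduckgo.com/y.js?ad_domain=example.com&rut=abc")
# -> "https://example.com"

# Case 3: anything that is already a direct http(s) URL passes through
# unchanged, as does any URL the parser cannot unwrap.
_process_ddg_url("https://example.com/direct")
# -> "https://example.com/direct"
```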
+def _process_bing_url(url):
+    """Process Bing URLs to get the actual target URL."""
+    try:
+        import urllib.parse
+        parsed_url = urllib.parse.urlparse(url)
+
+        # Check if it's a Bing redirect URL
+        if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/ck/a':
+            # Try to extract the actual URL from Bing's redirect
+            query_dict = urllib.parse.parse_qs(parsed_url.query)
+            if 'u' in query_dict:
+                # Bing stores the actual URL in the 'u' parameter, often base64 encoded
+                import base64
+                try:
+                    # Try to decode if it's base64
+                    real_url = base64.b64decode(query_dict['u'][0]).decode('utf-8')
+                    return real_url
+                except Exception:
+                    # If not base64, just use it directly
+                    return query_dict['u'][0]
+
+            # Try other known redirect parameters
+            for param in ['purl', 'r']:
+                if param in query_dict:
+                    return query_dict[param][0]
+
+    except Exception as e:
+        print(f"Error processing Bing URL: {str(e)}")
+
+    # Default to original URL if all else fails
+    return url
+
+
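_process_bing_url handles Bing's /ck/a redirect links, where the destination travels base64-encoded in the 'u' query parameter. A self-contained sketch with a synthetic redirect (real Bing links vary and are not always plain base64, which is what the inner except covers):

```python
import base64

# Build a synthetic /ck/a redirect carrying a base64-encoded destination.
encoded = base64.b64encode(b"https://example.com/article").decode("ascii")
bing_url = f"https://www.bing.com/ck/a?p=abc&u={encoded}"

_process_bing_url(bing_url)
# -> "https://example.com/article" (decoded from the 'u' parameter)

# A non-base64 'u' value trips the inner except and is returned verbatim;
# any other URL falls through to the final `return url`.
```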
+async def handle_web_search(arguments: dict) -> List[TextContent]:
+    """Handle performing a web search using direct HTML scraping with anti-detection measures."""
+    query = arguments.get("query")
+    num_results = min(arguments.get("num_results", 10), 20)  # Cap at 20 results max
+    convert_html_to_markdown = arguments.get("convert_html_to_markdown", True)
+    search_engine = arguments.get("search_engine", "auto").lower()
+    engine_warning = None
+
+    if not query:
+        raise ValueError("Search query must be provided")
+
+    # Validate search engine parameter
+    valid_engines = ["auto", "bing", "duckduckgo"]
+    if search_engine not in valid_engines:
+        if search_engine == "google":
+            engine_warning = "Warning: Google search engine is no longer supported due to blocking automated requests. Falling back to 'auto' mode."
+        else:
+            engine_warning = f"Warning: Unsupported search engine '{search_engine}'. Valid options are: {', '.join(valid_engines)}. Falling back to 'auto' mode."
+        print(engine_warning)
+        search_engine = "auto"  # Default to auto if invalid
+
+    # Create a list of common user agents to rotate through
+    user_agents = [
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
+    ]
+
+    # Use a random user agent
+    user_agent = random.choice(user_agents)
+
+    # Set up params for the request
+    params = {
+        "q": query,
+        "num": num_results + 5,  # Request a few more results than needed
+        "hl": "en",  # Language hint
+        "gl": "us",  # Geolocation hint (helps avoid redirect to country-specific sites)
+    }
+
+    # Set up headers to more closely mimic a real browser
+    headers = {
+        "User-Agent": user_agent,
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.9",
+        "Accept-Encoding": "gzip, deflate",
+        "Referer": "https://www.skydeck.ai/",
+        "Connection": "keep-alive",
+        "Cache-Control": "max-age=0",
+        "Upgrade-Insecure-Requests": "1",
+        "Sec-Fetch-Dest": "document",
+        "Sec-Fetch-Mode": "navigate",
+        "Sec-Fetch-Site": "same-origin",
+        "Sec-Fetch-User": "?1",
+    }
+
+    # Define search engines configurations
+    search_engines = [
+        {
+            "name": "DuckDuckGo HTML",
+            "id": "duckduckgo",
+            "url": "https://html.duckduckgo.com/html/",
+            "params": {"q": query},
+            "headers": {
+                "User-Agent": user_agent,
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.9",
+                "Accept-Encoding": "gzip, deflate",
+                "Referer": "https://duckduckgo.com/",
+                "Connection": "keep-alive",
+                "Upgrade-Insecure-Requests": "1"
+            },
+            "result_selector": [
+                ".web-result",
+                ".result:not(.result--ad)",
+                ".results_links:not(.result--ad)",
+                ".result"
+            ],
+            "title_selector": [
+                ".result__title",
+                ".result__a",
+                "h2",
+                ".result__title a"
+            ],
+            "link_selector": [
+                "a.result__a",
+                "a.result__url",
+                ".result__title a",
+                "a[href^='http']"
+            ],
+            "snippet_selector": [
+                ".result__snippet",
+                ".result__snippet p",
+                ".result__desc",
+                ".result__body",
+                ".snippet"
+            ]
+        },
+        {
+            "name": "Bing",
+            "id": "bing",
+            "url": "https://www.bing.com/search",
+            "params": {"q": query, "count": num_results},
+            "headers": {
+                "User-Agent": user_agent,
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.9",
+                "Accept-Encoding": "gzip, deflate",
+                "Referer": "https://www.bing.com/",
+                "Connection": "keep-alive",
+                "Upgrade-Insecure-Requests": "1"
+            },
+            "result_selector": [
+                ".b_algo",
+                "li.b_algo",
+                ".b_results > li:not(.b_ad)",
+                "ol#b_results > li"
+            ],
+            "title_selector": [
+                "h2",
+                ".b_title",
+                "h2 a",
+                "a"
+            ],
+            "link_selector": [
+                "h2 a",
+                "a.tilk",
+                "cite",
+                ".b_attribution > cite",
+                "a[href^='http']"
+            ],
+            "snippet_selector": [
+                ".b_caption p",
+                ".b_snippet",
+                ".b_richcard",
+                ".b_caption",
+                ".b_algoSlug"
+            ]
+        }
+    ]
+
+    # Filter engines based on user preference
+    if search_engine != "auto":
+        filtered_engines = [engine for engine in search_engines if engine["id"] == search_engine]
+        if filtered_engines:
+            search_engines = filtered_engines
+        # If no matching engine found, keep the original list (fallback to auto)
+
+    # Track URLs we've already seen to prevent duplicates
+    seen_urls = set()
+
+    # Try each search engine until one works
+    for engine in search_engines:
+        try:
+            print(f"Trying search with {engine['name']}...")
+
+            # Add a small delay to avoid rate limiting
+            time.sleep(random.uniform(0.5, 1.5))
+
+            # Make the request
+            response = requests.get(
+                engine["url"],
+                params=engine["params"],
+                headers=engine["headers"],
+                timeout=15
+            )
+
+            # Check if the response was successful
+            if response.status_code == 200:
+                # Parse the HTML response
+                try:
+                    from bs4 import BeautifulSoup
+                    soup = BeautifulSoup(response.text, 'html.parser')
+                    search_results = []
+
+                    # Special handling for DuckDuckGo which uses different URL structure
+                    is_ddg = engine["name"] == "DuckDuckGo HTML"
+
+                    # Convert single selector to list for consistent handling
+                    result_selectors = engine["result_selector"]
+                    if isinstance(result_selectors, str):
+                        result_selectors = [result_selectors]
+
+                    # Try each result selector until we find results
+                    result_elements = []
+                    for selector in result_selectors:
+                        result_elements = soup.select(selector)
+                        if result_elements:
+                            print(f"Found {len(result_elements)} results with selector '{selector}'")
+                            break
+
+                    print(f"Found {len(result_elements)} potential results with {engine['name']}")
+
+                    for result in result_elements:
+                        if len(search_results) >= num_results:
+                            break
+
+                        # Try all title selectors
+                        title_selectors = engine["title_selector"]
+                        if isinstance(title_selectors, str):
+                            title_selectors = [title_selectors]
+
+                        title_element = None
+                        for selector in title_selectors:
+                            title_element = result.select_one(selector)
+                            if title_element:
+                                break
+
+                        # Try all link selectors
+                        link_selectors = engine["link_selector"]
+                        if isinstance(link_selectors, str):
+                            link_selectors = [link_selectors]
+
+                        link_element = None
+                        for selector in link_selectors:
+                            link_element = result.select_one(selector)
+                            if link_element and 'href' in link_element.attrs:
+                                break
+
+                        # Try all snippet selectors
+                        snippet_selectors = engine["snippet_selector"]
+                        if isinstance(snippet_selectors, str):
+                            snippet_selectors = [snippet_selectors]
+
+                        snippet_element = None
+                        for selector in snippet_selectors:
+                            snippet_element = result.select_one(selector)
+                            if snippet_element:
+                                break
+
+                        # If we couldn't find link or title, try looking for any anchor tag with text
+                        if not link_element and not title_element:
+                            for anchor in result.find_all('a', href=True):
+                                if anchor.text.strip() and len(anchor.text.strip()) > 3:
+                                    link_element = anchor
+                                    title_element = anchor
+                                    break
+
+                        if title_element and link_element and 'href' in link_element.attrs:
+                            # Process URL
+                            url = link_element['href']
+
+                            # Process URL based on search engine
+                            if is_ddg:
+                                url = _process_ddg_url(url)
+                            elif engine["id"] == "bing":
+                                url = _process_bing_url(url)
+
+                            # Skip duplicate URLs
+                            canonical_url = url.split('?')[0].rstrip('/')  # Remove query params and trailing slash for comparison
+                            if canonical_url in seen_urls:
+                                continue
+                            seen_urls.add(canonical_url)
+
+                            # Ensure URL is valid
+                            if not url or not url.startswith('http'):
+                                continue
+
+                            # Get title and snippet
+                            title = title_element.text.strip()
+                            snippet = snippet_element.text.strip() if snippet_element else "No description available"
+
+                            # Add to results if we have valid data
+                            if title:
+                                search_results.append({
+                                    "title": title,
+                                    "link": url,
+                                    "snippet": snippet
+                                })
+
+                    # If we found results, format and return them
+                    if search_results:
+                        print(f"Success! Found {len(search_results)} results with {engine['name']}")
+                        return _format_search_results(query, search_results, convert_html_to_markdown, engine["name"], engine_warning)
+
+                except Exception as parse_error:
+                    print(f"Error parsing {engine['name']} results: {str(parse_error)}")
+                    # Continue to the next engine
+            else:
+                print(f"{engine['name']} returned status code: {response.status_code}")
+
+        except Exception as e:
+            print(f"Error with {engine['name']}: {str(e)}")
+            # Continue to the next engine
+
+    # If all engines fail, try a last-resort approach: extract any links from the last response
+    try:
+        if 'response' in locals() and response.status_code == 200:
+            from bs4 import BeautifulSoup
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            print("Attempting emergency link extraction...")
+            emergency_results = []
+
+            # Look for common result containers first
+            potential_containers = [
+                # Common search result containers
+                soup.select("div.g, div.b_algo, .result, .web-result, .results_links, li[data-bm], div[data-hveid]"),
+                # Any div with title-like content
+                soup.select("div:has(h1), div:has(h2), div:has(h3), div:has(h4)"),
+                # Main content areas
+                soup.select("main, #main, #content, .content, #results, .results"),
+                # Fallback to any link with reasonable text
+                soup.select("a[href^='http']")
+            ]
+
+            # Process each container type in order until we find enough results
+            for container_set in potential_containers:
+                if container_set and len(emergency_results) < num_results:
+                    for container in container_set:
+                        # For containers, look for links inside
+                        if container.name != 'a':
+                            links = container.select("a[href^='http']") or []
+                            # Process each link in the container
+                            for link in links:
+                                url = link.get('href', '')
+                                title = link.text.strip()
+
+                                # Skip navigation links or empty links
+                                if not url or not title or len(title) < 5:
+                                    continue
+
+                                # Skip search engine internal links
+                                if any(s in url for s in ['google.com/search', 'bing.com/search', 'duckduckgo.com']):
+                                    continue
+
+                                # Skip duplicate URLs
+                                canonical_url = url.split('?')[0].rstrip('/')
+                                if canonical_url in seen_urls:
+                                    continue
+                                seen_urls.add(canonical_url)
+
+                                # Process URL based on domain
+                                if 'bing.com' in url:
+                                    url = _process_bing_url(url)
+                                elif 'duckduckgo.com' in url:
+                                    url = _process_ddg_url(url)
+
+                                # Find snippet text near the link if possible
+                                snippet = "No description available"
+                                # Try to get snippet from surrounding paragraph or div
+                                parent = link.parent
+                                if parent:
+                                    # Look for sibling paragraphs or divs
+                                    sibling = parent.find_next_sibling(['p', 'div', 'span'])
+                                    if sibling and sibling.text.strip():
+                                        snippet = sibling.text.strip()
+                                    # Or try parent's text excluding the link text
+                                    elif parent.name in ['p', 'div', 'span'] and len(parent.text) > len(title):
+                                        snippet_text = parent.text.replace(title, '').strip()
+                                        if snippet_text:
+                                            snippet = snippet_text
+
+                                emergency_results.append({
+                                    "title": title,
+                                    "link": url,
+                                    "snippet": snippet
+                                })
+
+                                if len(emergency_results) >= num_results:
+                                    break
+                        else:
+                            # Process direct link
+                            url = container.get('href', '')
+                            title = container.text.strip()
+
+                            # Skip invalid links
+                            if not url or not title or len(title) < 5:
+                                continue
+
+                            # Skip search engine internal links
+                            if any(s in url for s in ['google.com/search', 'bing.com/search', 'duckduckgo.com']):
+                                continue
+
+                            # Skip duplicate URLs
+                            canonical_url = url.split('?')[0].rstrip('/')
+                            if canonical_url in seen_urls:
+                                continue
+                            seen_urls.add(canonical_url)
+
+                            emergency_results.append({
+                                "title": title,
+                                "link": url,
+                                "snippet": "No description available"
+                            })
+
+                            if len(emergency_results) >= num_results:
+                                break
+
+                    if len(emergency_results) >= num_results:
+                        break
+
+            if emergency_results:
+                print(f"Found {len(emergency_results)} emergency results by extracting links")
+                return _format_search_results(query, emergency_results, convert_html_to_markdown, "Emergency Links", engine_warning)
+    except Exception as e:
+        print(f"Error in emergency link extraction: {str(e)}")
+
+    # If all search methods fail, provide helpful fallback information
+    print("All search methods failed, providing search fallback")
+    return _provide_search_fallback(query, engine_warning)
+
+
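The most load-bearing idea in handle_web_search is that every selector is an ordered fallback list, because engines reshuffle their markup constantly. Distilled into a standalone sketch (`select_first` is a hypothetical helper, not a function in the module):

```python
from bs4 import BeautifulSoup

def select_first(tag, selectors):
    """Return the first element matched by an ordered list of CSS selectors."""
    for selector in selectors:
        element = tag.select_one(selector)
        if element is not None:
            return element
    return None

html = '<li class="b_algo"><h2><a href="https://example.com">Example title</a></h2></li>'
result = BeautifulSoup(html, "html.parser").select_one(".b_algo")
title = select_first(result, ["h2", ".b_title", "h2 a", "a"])  # Bing title_selector order
print(title.get_text(strip=True))  # -> Example title
```

The same shape is applied three times per result (title, link, snippet) and once more at the result-container level, which is why a markup change in one engine usually degrades to a later selector instead of failing outright.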
+def _format_search_results(query: str, search_results: list, convert_html_to_markdown: bool, engine_name: str = None, engine_warning: str = None) -> List[TextContent]:
+    """Format search results into markdown."""
+    formatted_results = ["# Web Search Results\n\n"]
+    formatted_results.append(f"**Query:** {query}\n\n")
+
+    if engine_warning:
+        formatted_results.append(f"**{engine_warning}**\n\n")
+
+    if engine_name:
+        formatted_results.append(f"**Source:** {engine_name}\n\n")
+
+    for i, item in enumerate(search_results, 1):
+        title = item.get("title", "No title")
+        link = item.get("link", "")
+        snippet = item.get("snippet", "No description available")
+
+        # Convert HTML in snippet to markdown if requested
+        if convert_html_to_markdown:
+            try:
+                import html2text
+                h = html2text.HTML2Text()
+                h.ignore_links = False
+                h.ignore_images = True
+                h.body_width = 0  # Don't wrap text
+
+                # Remove HTML tags from title and snippet
+                title = h.handle(title) if '<' in title else title
+                snippet = h.handle(snippet) if '<' in snippet else snippet
+            except ImportError:
+                # Continue without conversion if html2text is not available
+                # Just strip basic HTML tags as a fallback
+                import re
+                title = re.sub(r'<[^>]*>', '', title)
+                snippet = re.sub(r'<[^>]*>', '', snippet)
+
+        formatted_results.append(f"## {i}. {title}\n")
+        formatted_results.append(f"**URL:** {link}\n\n")
+        formatted_results.append(f"{snippet}\n\n---\n\n")
+
+    return [TextContent(
+        type="text",
+        text="".join(formatted_results)
+    )]
+
+
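Snippet cleanup in _format_search_results invokes html2text only when a '<' is present, with a regex strip as the ImportError fallback. Roughly what the html2text path produces on a synthetic snippet (html2text is declared in Requires-Dist, so the fallback should be rare):

```python
import html2text

h = html2text.HTML2Text()
h.ignore_links = False   # keep anchors as markdown links
h.ignore_images = True   # drop images from snippets
h.body_width = 0         # no hard wrapping inside snippets

snippet = 'Python <b>3.x</b> adds <a href="https://docs.python.org">new features</a>'
print(h.handle(snippet).strip())
# -> Python **3.x** adds [new features](https://docs.python.org)
```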
+def _provide_search_fallback(query: str, engine_warning: str = None) -> List[TextContent]:
+    """Provide a useful fallback when search fails."""
+    # Create a helpful response with suggestions for alternative approaches
+    formatted_results = ["# Web Search Results\n\n"]
+    formatted_results.append(f"**Query:** {query}\n\n")
+
+    if engine_warning:
+        formatted_results.append(f"**{engine_warning}**\n\n")
+
+    formatted_results.append("I couldn't retrieve search results at this time.\n\n")
+
+    # Add explanation about limitations
+    formatted_results.append("## Why search might be unavailable\n\n")
+    formatted_results.append("Web search APIs often have restrictions on automated access, which can cause searches to fail. When this happens, it's better to:\n\n")
+    formatted_results.append("1. Try a different search engine (Bing or DuckDuckGo which are more reliable for automated access)\n")
+    formatted_results.append("2. Visit specific authoritative sites directly\n")
+    formatted_results.append("3. Try the search again later, or with different terms\n")
+
+    return [TextContent(
+        type="text",
+        text="".join(formatted_results)
+    )]
{skydeckai_code-0.1.27.dist-info → skydeckai_code-0.1.28.dist-info}/METADATA
CHANGED
@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: skydeckai-code
-Version: 0.1.27
-Summary: This MCP server provides a comprehensive set of tools for AI-driven Development workflows including file operations, code analysis, multi-language execution, Git operations, web content fetching, code content searching, and system information retrieval.
+Version: 0.1.28
+Summary: This MCP server provides a comprehensive set of tools for AI-driven Development workflows including file operations, code analysis, multi-language execution, Git operations, web content fetching with HTML-to-markdown conversion, multi-engine web search, code content searching, and system information retrieval.
 Project-URL: Homepage, https://github.com/skydeckai/skydeckai-code
 Project-URL: Repository, https://github.com/skydeckai/skydeckai-code
 Project-URL: Documentation, https://github.com/skydeckai/skydeckai-code/blob/main/README.md
@@ -10,6 +10,7 @@ License: Apache 2.0
 License-File: LICENSE
 Keywords: ai,aidd,code,code-analysis,development,mcp
 Requires-Python: >=3.11
+Requires-Dist: beautifulsoup4>=4.13.3
 Requires-Dist: gitpython>=3.1.44
 Requires-Dist: html2text>=2025.4.15
 Requires-Dist: mcp>=1.6.0
@@ -35,7 +36,7 @@ Description-Content-Type: text/markdown

 # SkyDeckAI Code

-An MCP server that provides a comprehensive set of tools for AI-driven development workflows. Features include file system operations, code analysis using tree-sitter for multiple programming languages, Git operations, code execution, web content fetching, code content searching, and system information retrieval. Designed to enhance AI's capability to assist in software development tasks by providing direct access to both local and remote resources.
+An MCP server that provides a comprehensive set of tools for AI-driven development workflows. Features include file system operations, code analysis using tree-sitter for multiple programming languages, Git operations, code execution, web content fetching with HTML-to-markdown conversion, multi-engine web search, code content searching, and system information retrieval. Designed to enhance AI's capability to assist in software development tasks by providing direct access to both local and remote resources.

 # Formerly Known As MCP-Server-AIDD

@@ -98,7 +99,8 @@ If you're using SkyDeck AI Helper app, you can search for "SkyDeckAI Code" and i
 - Code content searching with regex pattern matching
 - Multi-language code execution with safety measures
 - Git operations (status, diff, commit, branch management, cloning)
-- Web content fetching from APIs and websites
+- Web content fetching from APIs and websites with HTML-to-markdown conversion
+- Multi-engine web search with reliable fallback mechanisms
 - Batch operations for parallel and serial tool execution
 - Security controls with configurable workspace boundaries
 - Screenshot and screen context tools
@@ -571,6 +573,53 @@ skydeckai-code-cli --tool web_fetch --args '{
 }'
 ```

+#### web_search
+
+Performs a robust web search using multiple search engines and returns concise, relevant results.
+
+```json
+{
+  "query": "latest python release features",
+  "num_results": 8,
+  "convert_html_to_markdown": true,
+  "search_engine": "bing"
+}
+```
+
+**Parameters:**
+| Parameter | Type | Required | Description |
+|-----------|---------|----------|---------------------------------------|
+| query | string | Yes | The search query to process. Be specific for better results. |
+| num_results | integer | No | Maximum number of search results to return (default: 10, max: 20) |
+| convert_html_to_markdown | boolean | No | When true, content will be converted from HTML to markdown for better readability (default: true) |
+| search_engine | string | No | Specifies which search engine to use: "auto" (default), "bing", or "duckduckgo" |
+
+**Returns:**
+A list of search results formatted in markdown, including titles, URLs, and snippets for each result. Results are deduplicated and organized hierarchically for easy reading.
+
+This tool uses a multi-engine approach that tries different search engines with various parsing strategies to ensure reliable results. You can specify a preferred engine, but some engines may block automated access, in which case the tool will fall back to alternative engines when "auto" is selected.
+
+**Example Usage:**
+
+```bash
+# Search with default settings (auto engine selection)
+skydeckai-code-cli --tool web_search --args '{
+  "query": "latest python release features"
+}'
+
+# Try DuckDuckGo if you want alternative results
+skydeckai-code-cli --tool web_search --args '{
+  "query": "machine learning frameworks comparison",
+  "search_engine": "duckduckgo"
+}'
+
+# Use Bing for reliable results
+skydeckai-code-cli --tool web_search --args '{
+  "query": "best programming practices 2023",
+  "search_engine": "bing"
+}'
+```
+
 ### Utility Tools

 #### batch_tools
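Taken together with the TIP lines added to both tool descriptions, the intended workflow is search-then-fetch. A hedged sketch chaining the two handlers directly; the web_fetch argument names beyond what this diff shows are assumptions:

```python
import asyncio

from aidd.tools.web_tools import handle_web_fetch, handle_web_search

async def research(topic: str) -> str:
    # Step 1: find candidate URLs (markdown list of titles/URLs/snippets).
    hits = await handle_web_search({"query": topic, "num_results": 5})
    print(hits[0].text)
    # Step 2: fetch one of the returned URLs in full. The "url" key is an
    # assumption based on web_fetch's description, not shown in this diff.
    page = await handle_web_fetch({"url": "https://example.com/chosen-result"})
    return page[0].text

asyncio.run(research("latest python release features"))
```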
{skydeckai_code-0.1.27.dist-info → skydeckai_code-0.1.28.dist-info}/RECORD
CHANGED
@@ -1,7 +1,7 @@
 aidd/__init__.py,sha256=c9HBWxWruCxoAqLCJqltylAwz_7xmaK3g8DKViJZs0Q,222
 aidd/cli.py,sha256=cLtaQJmMBfr7fHkd0dyJqpDrVTIwybL48PotniWGrFM,5031
 aidd/server.py,sha256=kPRyWeWkMCZjabelC65XTmzZG7yw8htMJKSfnUcKnb0,1575
-aidd/tools/__init__.py,sha256=
+aidd/tools/__init__.py,sha256=Oyl9YzMB1SRT_skxMcWHPP7ScTjfBSgT3N4ctGxHMAI,5310
 aidd/tools/base.py,sha256=wHSAaGGYWM8ECmoYd7KEcmjsZRWesNQFf3zMjCKGMcc,380
 aidd/tools/code_analysis.py,sha256=fDpm2o_If5PsngXzHN2-ezSkPVT0ZxivLuzmHrOAmVU,33188
 aidd/tools/code_execution.py,sha256=dIPxHBtclsetDZY4jGlSBrw_t-7VlIVrK8mflnZ6c4w,13176
@@ -17,9 +17,9 @@ aidd/tools/path_tools.py,sha256=RGoOhqP69eHJzM8tEgn_5-GRaR0gp25fd0XZIJ_RnQE,4045
 aidd/tools/screenshot_tool.py,sha256=NMO5B4UG8qfMEOMRd2YoOjtwz_oQ2y1UAGU22jV1yGU,46337
 aidd/tools/state.py,sha256=RWSw0Jfsui8FqC0xsI7Ik07tAg35hRwLHa5xGBVbiI4,1493
 aidd/tools/system_tools.py,sha256=H4_qveKC2HA7SIbi-j4vxA0W4jYh2wfu9A6ni5wkZyA,7249
-aidd/tools/web_tools.py,sha256=
-skydeckai_code-0.1.27.dist-info/METADATA,sha256=
-skydeckai_code-0.1.27.dist-info/WHEEL,sha256=
-skydeckai_code-0.1.27.dist-info/entry_points.txt,sha256=
-skydeckai_code-0.1.27.dist-info/licenses/LICENSE,sha256=
-skydeckai_code-0.1.27.dist-info/RECORD,,
+aidd/tools/web_tools.py,sha256=gdsj2DEVYb_oYChItK5I1ugt2w25U7IAa5kEw9q6MVg,35534
+skydeckai_code-0.1.28.dist-info/METADATA,sha256=TxHt3pJB_Cjo9u49Y-tBgS7-oii2v_rMx-6bT5VohyU,31300
+skydeckai_code-0.1.28.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+skydeckai_code-0.1.28.dist-info/entry_points.txt,sha256=cT-IHh3_ioGLk3kwIeqj1X6Li1dnJinX9qKWUl7nOLg,80
+skydeckai_code-0.1.28.dist-info/licenses/LICENSE,sha256=uHse04vmI6ZjW7TblegFl30X-sDyyF0-QvH8ItPca3c,10865
+skydeckai_code-0.1.28.dist-info/RECORD,,
{skydeckai_code-0.1.27.dist-info → skydeckai_code-0.1.28.dist-info}/WHEEL
File without changes
{skydeckai_code-0.1.27.dist-info → skydeckai_code-0.1.28.dist-info}/entry_points.txt
File without changes
{skydeckai_code-0.1.27.dist-info → skydeckai_code-0.1.28.dist-info}/licenses/LICENSE
File without changes