wet_mcp-1.0.0-py3-none-any.whl
- wet_mcp/__init__.py +6 -0
- wet_mcp/__main__.py +6 -0
- wet_mcp/config.py +32 -0
- wet_mcp/docker_manager.py +146 -0
- wet_mcp/docs/__init__.py +1 -0
- wet_mcp/docs/help.md +55 -0
- wet_mcp/docs/media.md +58 -0
- wet_mcp/docs/web.md +73 -0
- wet_mcp/searxng_settings.yml +30 -0
- wet_mcp/server.py +172 -0
- wet_mcp/setup.py +94 -0
- wet_mcp/sources/__init__.py +1 -0
- wet_mcp/sources/crawler.py +296 -0
- wet_mcp/sources/searxng.py +72 -0
- wet_mcp-1.0.0.dist-info/METADATA +182 -0
- wet_mcp-1.0.0.dist-info/RECORD +19 -0
- wet_mcp-1.0.0.dist-info/WHEEL +4 -0
- wet_mcp-1.0.0.dist-info/entry_points.txt +2 -0
- wet_mcp-1.0.0.dist-info/licenses/LICENSE +21 -0
wet_mcp/__init__.py
ADDED
wet_mcp/__main__.py
ADDED
wet_mcp/config.py
ADDED
@@ -0,0 +1,32 @@
"""Configuration settings for WET MCP Server."""

from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    """WET MCP Server configuration."""

    # SearXNG
    searxng_url: str = "http://localhost:8080"
    searxng_timeout: int = 30

    # Crawler
    crawler_headless: bool = True
    crawler_timeout: int = 60

    # Docker Management
    wet_auto_docker: bool = True
    wet_container_name: str = "wet-searxng"
    wet_searxng_image: str = "searxng/searxng:latest"
    wet_searxng_port: int = 8080

    # Media
    download_dir: str = "~/.wet-mcp/downloads"

    # Logging
    log_level: str = "INFO"

    model_config = {"env_prefix": "", "case_sensitive": False}


settings = Settings()
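Because `Settings` uses an empty `env_prefix` and case-insensitive matching, each field above can be overridden by an environment variable of the same name. A minimal sketch of that override path, separate from the package itself; the values are only illustrative:

```python
# Not part of the package: demonstrate env-var overrides picked up by pydantic-settings.
import os

os.environ["WET_SEARXNG_PORT"] = "8888"  # overrides Settings.wet_searxng_port
os.environ["LOG_LEVEL"] = "DEBUG"        # overrides Settings.log_level

from wet_mcp.config import Settings

settings = Settings()
print(settings.wet_searxng_port)  # 8888
print(settings.log_level)         # DEBUG
```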
wet_mcp/docker_manager.py
ADDED
@@ -0,0 +1,146 @@
"""Docker container management for SearXNG."""

import socket
from importlib.resources import files
from pathlib import Path

from loguru import logger

from wet_mcp.config import settings


def _find_available_port(start_port: int, max_tries: int = 10) -> int:
    """Find an available port starting from start_port."""
    for offset in range(max_tries):
        port = start_port + offset
        try:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.bind(("localhost", port))
                return port
        except OSError:
            continue
    # Fallback to original port
    return start_port


def _get_settings_path() -> Path:
    """Get path to SearXNG settings file.

    Copies bundled settings.yml to user config directory for Docker mounting.
    Uses ~/.wet-mcp/ which is typically shared with Docker.
    """
    config_dir = Path.home() / ".wet-mcp"
    config_dir.mkdir(parents=True, exist_ok=True)

    settings_file = config_dir / "searxng_settings.yml"

    # Copy bundled settings if not exists
    if not settings_file.exists():
        bundled = files("wet_mcp").joinpath("searxng_settings.yml")
        settings_file.write_text(bundled.read_text())
        logger.debug(f"Copied SearXNG settings to: {settings_file}")

    return settings_file


def ensure_searxng() -> str:
    """Start SearXNG container if not running. Returns URL.

    This function handles:
    - Automatic container creation if it doesn't exist
    - Port conflict resolution (tries next available port)
    - SearXNG configuration for JSON API format via settings.yml mount
    - Graceful fallback to external SearXNG URL if Docker unavailable
    """
    if not settings.wet_auto_docker:
        logger.info("Auto Docker disabled, using external SearXNG")
        return settings.searxng_url

    try:
        from python_on_whales import DockerException, docker
    except ImportError:
        logger.warning("python-on-whales not installed, using external SearXNG")
        return settings.searxng_url

    container_name = settings.wet_container_name
    image = settings.wet_searxng_image
    preferred_port = settings.wet_searxng_port

    try:
        if docker.container.exists(container_name):
            container = docker.container.inspect(container_name)
            if container.state.running:
                logger.debug(f"SearXNG container already running: {container_name}")
                # Extract port from running container
                ports = container.network_settings.ports
                if ports and "8080/tcp" in ports and ports["8080/tcp"]:
                    port = int(ports["8080/tcp"][0].get("HostPort", preferred_port))
                else:
                    port = preferred_port
            else:
                logger.info(f"Starting stopped container: {container_name}")
                docker.container.start(container_name)
                port = preferred_port
        else:
            # Find available port to avoid conflicts
            port = _find_available_port(preferred_port)
            if port != preferred_port:
                logger.info(f"Port {preferred_port} in use, using {port}")

            # Get settings file path
            settings_path = _get_settings_path()

            logger.info(f"Starting SearXNG container: {container_name}")
            docker.run(
                image,
                name=container_name,
                detach=True,
                publish=[(port, 8080)],
                volumes=[(str(settings_path), "/etc/searxng/settings.yml", "ro")],
                envs={
                    "SEARXNG_SECRET": "wet-internal",
                },
            )
            logger.info(f"SearXNG container started on port {port}")

        return f"http://localhost:{port}"

    except DockerException as e:
        logger.warning(f"Docker not available: {e}")
        logger.warning("Falling back to external SearXNG URL")
        return settings.searxng_url
    except Exception as e:
        logger.error(f"Failed to start SearXNG: {e}")
        return settings.searxng_url


def stop_searxng() -> None:
    """Stop SearXNG container if running."""
    if not settings.wet_auto_docker:
        return

    try:
        from python_on_whales import docker

        container_name = settings.wet_container_name
        if docker.container.exists(container_name):
            logger.info(f"Stopping container: {container_name}")
            docker.container.stop(container_name)
    except Exception as e:
        logger.debug(f"Failed to stop container: {e}")


def remove_searxng() -> None:
    """Stop and remove SearXNG container."""
    if not settings.wet_auto_docker:
        return

    try:
        from python_on_whales import docker

        container_name = settings.wet_container_name
        if docker.container.exists(container_name):
            logger.info(f"Removing container: {container_name}")
            docker.container.remove(container_name, force=True)
    except Exception as e:
        logger.debug(f"Failed to remove container: {e}")
wet_mcp/docs/__init__.py
ADDED
@@ -0,0 +1 @@
"""Docs package for WET MCP Server."""
wet_mcp/docs/help.md
ADDED
@@ -0,0 +1,55 @@
# WET MCP Server - Help

Welcome to **WET** (Web ExTract) MCP Server - an open-source alternative to Tavily.

## Available Tools

| Tool | Description |
|:-----|:------------|
| `web` | Web search, content extraction, crawling, site mapping |
| `media` | Media discovery (images, videos, audio) and download |
| `help` | Get full documentation for any tool |

## Quick Reference

### web tool

```json
// Search the web
{"action": "search", "query": "your search query"}

// Extract content from URLs
{"action": "extract", "urls": ["https://example.com"]}

// Crawl multiple pages
{"action": "crawl", "urls": ["https://docs.example.com"], "depth": 2}

// Map site structure
{"action": "map", "urls": ["https://example.com"]}
```

### media tool

```json
// List media on a page
{"action": "list", "url": "https://example.com"}

// Download specific files
{"action": "download", "media_urls": ["https://example.com/image.png"]}
```

## Getting Full Documentation

Call `help` with the tool name:

```json
{"tool_name": "web"}    // Web tool documentation
{"tool_name": "media"}  // Media tool documentation
```

## Features

- **Auto-setup**: First run automatically installs Playwright and configures SearXNG
- **Anti-bot bypass**: Stealth mode works with Cloudflare, Medium, LinkedIn, etc.
- **Multimodal**: Extract and download images, videos, audio files
- **Deep crawling**: Follow links to specified depth with page limits
wet_mcp/docs/media.md
ADDED
@@ -0,0 +1,58 @@
# media Tool Documentation

Media discovery and download from web pages.

## Actions

### list
Scan a page and return media URLs with metadata.

**Parameters:**
- `url` (required): Page URL to scan
- `media_type`: Type of media - images, videos, audio, files, all (default: all)
- `max_items`: Maximum items per type (default: 10)

**Example:**
```json
{"action": "list", "url": "https://example.com/gallery", "media_type": "images"}
```

**Returns:**
```json
{
  "images": [
    {"src": "https://...", "alt": "...", "width": 800, "height": 600}
  ],
  "videos": [],
  "audio": []
}
```

---

### download
Download specific media files to local storage.

**Parameters:**
- `media_urls` (required): List of media URLs to download
- `output_dir`: Output directory (default: ~/.wet-mcp/downloads)

**Example:**
```json
{"action": "download", "media_urls": ["https://example.com/image.jpg"]}
```

**Returns:**
```json
[
  {"url": "...", "path": "/path/to/file.jpg", "size": 12345}
]
```

---

## Workflow

1. Use `list` to discover media on a page
2. Review the results (optionally have AI analyze)
3. Use `download` to save specific files locally
wet_mcp/docs/web.md
ADDED
@@ -0,0 +1,73 @@
# web Tool Documentation

Web operations: search, extract, crawl, map.

## Actions

### search
Web search via SearXNG metasearch engine.

**Parameters:**
- `query` (required): Search query string
- `categories`: Search category - general, images, videos, files (default: general)
- `max_results`: Maximum results to return (default: 10)

**Example:**
```json
{"action": "search", "query": "python web scraping tutorial", "max_results": 5}
```

---

### extract
Get clean content from one or more URLs.

**Parameters:**
- `urls` (required): List of URLs to extract
- `format`: Output format - markdown, text, html (default: markdown)
- `stealth`: Enable stealth mode to bypass anti-bot (default: true)

**Example:**
```json
{"action": "extract", "urls": ["https://example.com/article"]}
```

---

### crawl
Deep crawl starting from root URLs.

**Parameters:**
- `urls` (required): List of root URLs to crawl from
- `depth`: How many levels deep to crawl (default: 2)
- `max_pages`: Maximum pages to crawl (default: 20)
- `format`: Output format (default: markdown)
- `stealth`: Enable stealth mode (default: true)

**Example:**
```json
{"action": "crawl", "urls": ["https://docs.example.com"], "depth": 3}
```

---

### map
Discover site structure without extracting content.

**Parameters:**
- `urls` (required): List of root URLs
- `depth`: Discovery depth (default: 2)
- `max_pages`: Maximum URLs to discover (default: 50)

**Example:**
```json
{"action": "map", "urls": ["https://example.com"]}
```

---

## Anti-Bot Features

The `stealth` parameter enables:
- Stealth mode: Masks navigator.webdriver, emulates plugins
- Undetected browser: For advanced detection (Cloudflare, Datadome)
wet_mcp/searxng_settings.yml
ADDED
@@ -0,0 +1,30 @@
# SearXNG settings for WET MCP Server
# Mounted into container at /etc/searxng/settings.yml

use_default_settings: true

server:
  secret_key: "wet-mcp-internal-secret"
  limiter: false
  method: "GET"

search:
  formats:
    - html
    - json

# Disable rate limiting for local use
outgoing:
  request_timeout: 10.0
  max_request_timeout: 30.0

# Engines to enable
engines:
  - name: google
    disabled: false
  - name: bing
    disabled: false
  - name: duckduckgo
    disabled: false
  - name: brave
    disabled: false
wet_mcp/server.py
ADDED
@@ -0,0 +1,172 @@
"""WET MCP Server - Main server definition."""

import sys
from importlib.resources import files

from loguru import logger
from mcp.server.fastmcp import FastMCP

from wet_mcp.config import settings
from wet_mcp.docker_manager import ensure_searxng

# Configure logging
logger.remove()
logger.add(sys.stderr, level=settings.log_level)

# Initialize MCP server
mcp = FastMCP(
    name="wet",
    instructions="Web ExTract MCP Server - search, extract, crawl, map with SearXNG",
)

# Store SearXNG URL after initialization
_searxng_url: str | None = None


def _get_searxng_url() -> str:
    """Get SearXNG URL, initializing container if needed."""
    global _searxng_url
    if _searxng_url is None:
        _searxng_url = ensure_searxng()
    return _searxng_url


@mcp.tool()
async def web(
    action: str,
    query: str | None = None,
    urls: list[str] | None = None,
    categories: str = "general",
    max_results: int = 10,
    depth: int = 2,
    max_pages: int = 20,
    format: str = "markdown",
    stealth: bool = True,
) -> str:
    """Web operations: search, extract, crawl, map.
    - search: Web search via SearXNG (requires query)
    - extract: Get clean content from URLs
    - crawl: Deep crawl from root URL
    - map: Discover site structure
    Use `help` tool for full documentation.
    """
    from wet_mcp.sources.crawler import crawl, extract, sitemap
    from wet_mcp.sources.searxng import search as searxng_search

    match action:
        case "search":
            if not query:
                return "Error: query is required for search action"
            searxng_url = _get_searxng_url()
            return await searxng_search(
                searxng_url=searxng_url,
                query=query,
                categories=categories,
                max_results=max_results,
            )

        case "extract":
            if not urls:
                return "Error: urls is required for extract action"
            return await extract(
                urls=urls,
                format=format,
                stealth=stealth,
            )

        case "crawl":
            if not urls:
                return "Error: urls is required for crawl action"
            return await crawl(
                urls=urls,
                depth=depth,
                max_pages=max_pages,
                format=format,
                stealth=stealth,
            )

        case "map":
            if not urls:
                return "Error: urls is required for map action"
            return await sitemap(
                urls=urls,
                depth=depth,
                max_pages=max_pages,
            )

        case _:
            return f"Error: Unknown action '{action}'. Valid actions: search, extract, crawl, map"


@mcp.tool()
async def media(
    action: str,
    url: str | None = None,
    media_type: str = "all",
    media_urls: list[str] | None = None,
    output_dir: str | None = None,
    max_items: int = 10,
) -> str:
    """Media discovery and download.
    - list: Scan page, return URLs + metadata
    - download: Download specific files to local
    MCP client decides whether to analyze media.
    Use `help` tool for full documentation.
    """
    from wet_mcp.sources.crawler import download_media, list_media

    match action:
        case "list":
            if not url:
                return "Error: url is required for list action"
            return await list_media(
                url=url,
                media_type=media_type,
                max_items=max_items,
            )

        case "download":
            if not media_urls:
                return "Error: media_urls is required for download action"
            return await download_media(
                media_urls=media_urls,
                output_dir=output_dir or settings.download_dir,
            )

        case _:
            return f"Error: Unknown action '{action}'. Valid actions: list, download"


@mcp.tool()
async def help(tool_name: str = "web") -> str:
    """Get full documentation for a tool.
    Use when compressed descriptions are insufficient.
    """
    try:
        doc_file = files("wet_mcp.docs").joinpath(f"{tool_name}.md")
        return doc_file.read_text()
    except FileNotFoundError:
        return f"Error: No documentation found for tool '{tool_name}'"
    except Exception as e:
        return f"Error loading documentation: {e}"


def main() -> None:
    """Run the MCP server."""
    from wet_mcp.setup import run_auto_setup

    logger.info("Starting WET MCP Server...")

    # Run auto-setup on first start (installs Playwright, etc.)
    run_auto_setup()

    # Initialize SearXNG container
    searxng_url = _get_searxng_url()
    logger.info(f"SearXNG URL: {searxng_url}")

    # Run MCP server
    mcp.run()


if __name__ == "__main__":
    main()
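`main()` can also be invoked programmatically instead of through the `wet-mcp` console script. A minimal sketch, assuming an already-running external SearXNG instance so Docker is not required; `mcp.run()` with no arguments serves MCP over stdio:

```python
# Not part of the package: run the server in-process with an external SearXNG.
import os

os.environ["WET_AUTO_DOCKER"] = "false"              # skip the managed container
os.environ["SEARXNG_URL"] = "http://localhost:8080"  # assumed external instance

from wet_mcp.server import main

main()  # first-run setup, SearXNG URL resolution, then stdio MCP serving
```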
wet_mcp/setup.py
ADDED
@@ -0,0 +1,94 @@
"""Auto-setup utilities for WET MCP Server.

This module handles automatic first-run setup:
- Install Playwright browsers (chromium)
- Verify Docker availability
- Create configuration directories

Setup runs automatically on first server start.
"""

import subprocess
import sys
from pathlib import Path

from loguru import logger

# Marker file to track if setup has been run
SETUP_MARKER = Path.home() / ".wet-mcp" / ".setup-complete"


def needs_setup() -> bool:
    """Check if setup needs to run."""
    return not SETUP_MARKER.exists()


def run_auto_setup() -> bool:
    """Run automatic setup on first start.

    Returns:
        True if setup succeeded or was already done, False on failure.
    """
    if not needs_setup():
        logger.debug("Setup already complete, skipping")
        return True

    logger.info("First run detected, running auto-setup...")

    success = True

    # Step 1: Create config directory
    config_dir = Path.home() / ".wet-mcp"
    config_dir.mkdir(parents=True, exist_ok=True)
    logger.debug(f"Created config directory: {config_dir}")

    # Step 2: Install Playwright chromium (required for Crawl4AI)
    logger.info("Installing Playwright chromium browser...")
    try:
        result = subprocess.run(
            [sys.executable, "-m", "playwright", "install", "chromium"],
            capture_output=True,
            text=True,
            timeout=300,
        )
        if result.returncode == 0:
            logger.info("Playwright chromium installed successfully")
        else:
            logger.warning(f"Playwright install warning: {result.stderr[:200]}")
            # Don't fail - might already be installed
    except subprocess.TimeoutExpired:
        logger.error("Playwright installation timed out")
        success = False
    except FileNotFoundError:
        logger.warning("Playwright command not found, some features may not work")

    # Step 3: Verify Docker (optional, for SearXNG)
    try:
        result = subprocess.run(
            ["docker", "version", "--format", "{{.Server.Version}}"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if result.returncode == 0:
            logger.debug(f"Docker available: v{result.stdout.strip()}")
        else:
            logger.info("Docker not running, will use external SearXNG URL if configured")
    except FileNotFoundError:
        logger.info("Docker not installed, will use external SearXNG URL if configured")
    except subprocess.TimeoutExpired:
        logger.debug("Docker check timed out")

    # Mark setup as complete
    if success:
        SETUP_MARKER.touch()
        logger.info("Auto-setup complete!")

    return success


def reset_setup() -> None:
    """Reset setup marker to force re-run on next start."""
    if SETUP_MARKER.exists():
        SETUP_MARKER.unlink()
        logger.info("Setup marker removed, will re-run on next start")
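A minimal sketch, not shipped with the package, of forcing the first-run setup to execute again using the helpers above (for example after clearing the Playwright browser cache):

```python
# Not part of the package: reset the marker and re-run auto-setup.
from wet_mcp.setup import needs_setup, reset_setup, run_auto_setup

reset_setup()         # removes ~/.wet-mcp/.setup-complete if present
assert needs_setup()  # marker is gone, so setup is due again
ok = run_auto_setup() # reinstalls Playwright chromium, re-checks Docker
print("setup ok:", ok)
```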
wet_mcp/sources/__init__.py
ADDED
@@ -0,0 +1 @@
"""Sources package for WET MCP Server."""
wet_mcp/sources/crawler.py
ADDED
@@ -0,0 +1,296 @@
"""Crawl4AI integration for web crawling and extraction."""

import json
from pathlib import Path

from loguru import logger


async def extract(
    urls: list[str],
    format: str = "markdown",
    stealth: bool = True,
) -> str:
    """Extract content from URLs.

    Args:
        urls: List of URLs to extract
        format: Output format (markdown, text, html)
        stealth: Enable stealth mode

    Returns:
        JSON string with extracted content
    """
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

    logger.info(f"Extracting content from {len(urls)} URLs")

    browser_config = BrowserConfig(
        headless=True,
        enable_stealth=stealth,
    )

    results = []

    async with AsyncWebCrawler(config=browser_config) as crawler:
        for url in urls:
            try:
                result = await crawler.arun(
                    url,
                    config=CrawlerRunConfig(),
                )

                if result.success:
                    content = result.markdown if format == "markdown" else result.cleaned_html
                    results.append({
                        "url": url,
                        "title": result.metadata.get("title", ""),
                        "content": content,
                        "links": {
                            "internal": result.links.get("internal", [])[:20],
                            "external": result.links.get("external", [])[:20],
                        },
                    })
                else:
                    results.append({
                        "url": url,
                        "error": result.error_message or "Failed to extract",
                    })

            except Exception as e:
                logger.error(f"Error extracting {url}: {e}")
                results.append({
                    "url": url,
                    "error": str(e),
                })

    logger.info(f"Extracted {len(results)} pages")
    return json.dumps(results, ensure_ascii=False, indent=2)


async def crawl(
    urls: list[str],
    depth: int = 2,
    max_pages: int = 20,
    format: str = "markdown",
    stealth: bool = True,
) -> str:
    """Deep crawl from root URLs.

    Args:
        urls: List of root URLs
        depth: Crawl depth
        max_pages: Maximum pages to crawl
        format: Output format
        stealth: Enable stealth mode

    Returns:
        JSON string with crawled content
    """
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

    logger.info(f"Crawling {len(urls)} URLs with depth={depth}")

    browser_config = BrowserConfig(
        headless=True,
        enable_stealth=stealth,
    )

    all_results = []
    visited = set()

    async with AsyncWebCrawler(config=browser_config) as crawler:
        for root_url in urls:
            to_crawl = [(root_url, 0)]

            while to_crawl and len(all_results) < max_pages:
                url, current_depth = to_crawl.pop(0)

                if url in visited or current_depth > depth:
                    continue

                visited.add(url)

                try:
                    result = await crawler.arun(
                        url,
                        config=CrawlerRunConfig(),
                    )

                    if result.success:
                        content = result.markdown if format == "markdown" else result.cleaned_html
                        all_results.append({
                            "url": url,
                            "depth": current_depth,
                            "title": result.metadata.get("title", ""),
                            "content": content[:5000],  # Limit content size
                        })

                        # Add internal links for next depth
                        if current_depth < depth:
                            internal_links = result.links.get("internal", [])
                            for link_item in internal_links[:10]:
                                # Crawl4AI returns dicts with 'href' key
                                link_url = link_item.get("href", "") if isinstance(link_item, dict) else link_item
                                if link_url and link_url not in visited:
                                    to_crawl.append((link_url, current_depth + 1))

                except Exception as e:
                    logger.error(f"Error crawling {url}: {e}")

    logger.info(f"Crawled {len(all_results)} pages")
    return json.dumps(all_results, ensure_ascii=False, indent=2)


async def sitemap(
    urls: list[str],
    depth: int = 2,
    max_pages: int = 50,
) -> str:
    """Discover site structure.

    Args:
        urls: List of root URLs
        depth: Discovery depth
        max_pages: Maximum pages to discover

    Returns:
        JSON string with discovered URLs
    """
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

    logger.info(f"Mapping {len(urls)} URLs")

    browser_config = BrowserConfig(headless=True)

    all_urls = []
    visited = set()

    async with AsyncWebCrawler(config=browser_config) as crawler:
        for root_url in urls:
            to_visit = [(root_url, 0)]
            site_urls = []

            while to_visit and len(site_urls) < max_pages:
                url, current_depth = to_visit.pop(0)

                if url in visited or current_depth > depth:
                    continue

                visited.add(url)
                site_urls.append({"url": url, "depth": current_depth})

                try:
                    result = await crawler.arun(
                        url,
                        config=CrawlerRunConfig(),
                    )

                    if result.success and current_depth < depth:
                        for link in result.links.get("internal", [])[:20]:
                            if link not in visited:
                                to_visit.append((link, current_depth + 1))

                except Exception as e:
                    logger.debug(f"Error mapping {url}: {e}")

            all_urls.extend(site_urls)

    logger.info(f"Mapped {len(all_urls)} URLs")
    return json.dumps(all_urls, ensure_ascii=False, indent=2)


async def list_media(
    url: str,
    media_type: str = "all",
    max_items: int = 10,
) -> str:
    """List media from a page.

    Args:
        url: Page URL to scan
        media_type: Type of media (images, videos, audio, files, all)
        max_items: Maximum items to return

    Returns:
        JSON string with media list
    """
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

    logger.info(f"Listing media from: {url}")

    browser_config = BrowserConfig(headless=True)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url,
            config=CrawlerRunConfig(),
        )

        if not result.success:
            return json.dumps({"error": result.error_message or "Failed to load page"})

        media = result.media or {}

        output = {}

        if media_type in ("images", "all"):
            output["images"] = media.get("images", [])[:max_items]
        if media_type in ("videos", "all"):
            output["videos"] = media.get("videos", [])[:max_items]
        if media_type in ("audio", "all"):
            # Crawl4AI uses 'audios' (plural)
            output["audio"] = media.get("audios", [])[:max_items]

        logger.info(f"Found media: {sum(len(v) for v in output.values())} items")
        return json.dumps(output, ensure_ascii=False, indent=2)


async def download_media(
    media_urls: list[str],
    output_dir: str,
) -> str:
    """Download media files.

    Args:
        media_urls: List of media URLs to download
        output_dir: Output directory

    Returns:
        JSON string with download results
    """
    import httpx

    logger.info(f"Downloading {len(media_urls)} media files")

    output_path = Path(output_dir).expanduser()
    output_path.mkdir(parents=True, exist_ok=True)

    results = []

    async with httpx.AsyncClient(timeout=60) as client:
        for url in media_urls:
            try:
                response = await client.get(url)
                response.raise_for_status()

                filename = url.split("/")[-1].split("?")[0] or "download"
                filepath = output_path / filename

                filepath.write_bytes(response.content)

                results.append({
                    "url": url,
                    "path": str(filepath),
                    "size": len(response.content),
                })

            except Exception as e:
                logger.error(f"Error downloading {url}: {e}")
                results.append({
                    "url": url,
                    "error": str(e),
                })

    logger.info(f"Downloaded {len([r for r in results if 'path' in r])} files")
    return json.dumps(results, ensure_ascii=False, indent=2)
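Because every coroutine above returns a JSON string, the module can be exercised directly with `asyncio`, outside of any MCP client. A minimal sketch; the example URL is only illustrative:

```python
# Not part of the package: drive extract() and list_media() from a plain script
# and parse their JSON return values.
import asyncio
import json

from wet_mcp.sources.crawler import extract, list_media


async def demo() -> None:
    pages = json.loads(await extract(["https://example.com"], format="markdown"))
    for page in pages:
        print(page["url"], page.get("title", ""), "error" in page)

    media = json.loads(await list_media("https://example.com", media_type="images"))
    print(len(media.get("images", [])), "images found")


asyncio.run(demo())
```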
wet_mcp/sources/searxng.py
ADDED
@@ -0,0 +1,72 @@
"""SearXNG search integration."""

import json

import httpx
from loguru import logger


async def search(
    searxng_url: str,
    query: str,
    categories: str = "general",
    max_results: int = 10,
) -> str:
    """Search via SearXNG API.

    Args:
        searxng_url: SearXNG instance URL
        query: Search query
        categories: Search category (general, images, videos, files)
        max_results: Maximum number of results

    Returns:
        JSON string with search results
    """
    logger.info(f"Searching SearXNG: {query}")

    params = {
        "q": query,
        "format": "json",
        "categories": categories,
    }

    try:
        async with httpx.AsyncClient(timeout=30) as client:
            response = await client.get(
                f"{searxng_url}/search",
                params=params,
            )
            response.raise_for_status()
            data = response.json()

        results = data.get("results", [])[:max_results]

        # Format results
        formatted = []
        for r in results:
            formatted.append({
                "url": r.get("url", ""),
                "title": r.get("title", ""),
                "snippet": r.get("content", ""),
                "source": r.get("engine", ""),
            })

        output = {
            "results": formatted,
            "total": len(formatted),
            "query": query,
        }

        logger.info(f"Found {len(formatted)} results for: {query}")
        return json.dumps(output, ensure_ascii=False, indent=2)

    except httpx.HTTPStatusError as e:
        logger.error(f"SearXNG HTTP error: {e}")
        return json.dumps({"error": f"HTTP error: {e.response.status_code}"})
    except httpx.RequestError as e:
        logger.error(f"SearXNG request error: {e}")
        return json.dumps({"error": f"Request error: {e}"})
    except Exception as e:
        logger.error(f"SearXNG error: {e}")
        return json.dumps({"error": str(e)})
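A minimal sketch of calling `search()` directly; the URL assumes the managed container (or an external instance) is listening on localhost:8080:

```python
# Not part of the package: run one query and print the formatted results.
import asyncio
import json

from wet_mcp.sources.searxng import search


async def demo() -> None:
    raw = await search(
        searxng_url="http://localhost:8080",  # assumed local SearXNG with JSON enabled
        query="model context protocol",
        categories="general",
        max_results=5,
    )
    data = json.loads(raw)
    for item in data.get("results", []):
        print(item["title"], "-", item["url"])


asyncio.run(demo())
```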
wet_mcp-1.0.0.dist-info/METADATA
ADDED
@@ -0,0 +1,182 @@
Metadata-Version: 2.4
Name: wet-mcp
Version: 1.0.0
Summary: Open-source MCP Server alternative to Tavily - Web search, extract, crawl with SearXNG
Project-URL: Homepage, https://github.com/n24q02m/wet-mcp
Project-URL: Repository, https://github.com/n24q02m/wet-mcp.git
Project-URL: Issues, https://github.com/n24q02m/wet-mcp/issues
Author-email: n24q02m <quangminh2422004@gmail.com>
License: MIT
License-File: LICENSE
Keywords: crawl4ai,mcp,searxng,tavily-alternative,web-scraping
Classifier: Development Status :: 3 - Alpha
Classifier: Environment :: Console
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Internet :: WWW/HTTP
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: ==3.13.*
Requires-Dist: crawl4ai>=0.8.0
Requires-Dist: httpx>=0.27.0
Requires-Dist: loguru>=0.7.0
Requires-Dist: mcp[cli]>=1.0.0
Requires-Dist: pydantic-settings>=2.0.0
Requires-Dist: pydantic>=2.0.0
Requires-Dist: python-on-whales>=0.73.0
Description-Content-Type: text/markdown

# WET - Web ExTract MCP Server

[](https://badge.fury.io/py/wet-mcp)
[](https://opensource.org/licenses/MIT)

> **Open-source MCP Server replacing Tavily for web scraping & multimodal extraction**

Zero-install experience: just `uvx wet-mcp` - it automatically sets up and manages the SearXNG container.

## Features

| Feature | Description |
|:--------|:------------|
| **Web Search** | Search via SearXNG (metasearch: Google, Bing, DuckDuckGo, Brave) |
| **Content Extract** | Extract clean content (Markdown/Text/HTML) |
| **Deep Crawl** | Follow sub-pages from a root URL with depth control |
| **Site Map** | Discover a website's URL structure |
| **Media** | List and download images, videos, audio files |
| **Anti-bot** | Stealth mode bypasses Cloudflare, Medium, LinkedIn, Twitter |

## Quick Start

### Prerequisites

- Docker daemon running (for SearXNG)
- Python 3.13+ (or use uvx)

### MCP Client Configuration

**Claude Desktop / Cursor / Windsurf / Antigravity:**

```json
{
  "mcpServers": {
    "wet": {
      "command": "uvx",
      "args": ["wet-mcp"]
    }
  }
}
```

**That's it!** The first time the MCP client calls wet-mcp, it:
1. Automatically installs Playwright chromium
2. Automatically pulls the SearXNG Docker image
3. Starts the `wet-searxng` container
4. Runs the MCP server

### Without uvx

```bash
pip install wet-mcp
wet-mcp
```

## Tools

| Tool | Actions | Description |
|:-----|:--------|:------------|
| `web` | search, extract, crawl, map | Web operations |
| `media` | list, download | Media discovery & download |
| `help` | - | Full documentation |

### Examples

```python
# Search
{"action": "search", "query": "python web scraping", "max_results": 10}

# Extract content
{"action": "extract", "urls": ["https://example.com"]}

# Crawl with depth
{"action": "crawl", "urls": ["https://docs.python.org"], "depth": 2}

# Map site structure
{"action": "map", "urls": ["https://example.com"]}

# List media
{"action": "list", "url": "https://github.com/python/cpython"}

# Download media
{"action": "download", "media_urls": ["https://example.com/image.png"]}
```

## Tech Stack

| Component | Technology |
|:----------|:-----------|
| Language | Python 3.13 |
| MCP Framework | FastMCP |
| Web Search | SearXNG (auto-managed Docker) |
| Web Crawling | Crawl4AI |
| Docker Management | python-on-whales |

## How It Works

```
┌─────────────────────────────────────────────────────────┐
│                       MCP Client                        │
│               (Claude, Cursor, Windsurf)                │
└─────────────────────┬───────────────────────────────────┘
                      │ MCP Protocol
                      ▼
┌─────────────────────────────────────────────────────────┐
│                     WET MCP Server                      │
│  ┌──────────┐  ┌──────────┐  ┌──────────────────────┐   │
│  │   web    │  │  media   │  │         help         │   │
│  │ (search, │  │  (list,  │  │ (full documentation) │   │
│  │ extract, │  │ download)│  └──────────────────────┘   │
│  │  crawl,  │  └────┬─────┘                             │
│  │   map)   │       │                                   │
│  └────┬─────┘       │                                   │
│       │             │                                   │
│       ▼             ▼                                   │
│  ┌──────────┐  ┌────────────┐                           │
│  │ SearXNG  │  │  Crawl4AI  │                           │
│  │ (Docker) │  │(Playwright)│                           │
│  └──────────┘  └────────────┘                           │
└─────────────────────────────────────────────────────────┘
```

## Configuration

Environment variables:

| Variable | Default | Description |
|:---------|:--------|:------------|
| `WET_AUTO_DOCKER` | `true` | Auto-manage SearXNG container |
| `WET_SEARXNG_PORT` | `8080` | SearXNG container port |
| `SEARXNG_URL` | `http://localhost:8080` | External SearXNG URL |
| `LOG_LEVEL` | `INFO` | Logging level |

## Container Management

```bash
# View SearXNG logs
docker logs wet-searxng

# Stop SearXNG
docker stop wet-searxng

# Remove container (will be recreated on next run)
docker rm wet-searxng

# Reset auto-setup (forces Playwright re-install)
rm ~/.wet-mcp/.setup-complete
```

## License

MIT License
wet_mcp-1.0.0.dist-info/RECORD
ADDED
@@ -0,0 +1,19 @@
wet_mcp/__init__.py,sha256=I6YcMMy-wD5AY49rICs4lkd3tZQ4-XWkPpvoj7KwsKs,152
wet_mcp/__main__.py,sha256=fjPYOwPnE1LDxkSbFZH0NjH8WLx8F2NjsO8vD-ujVbw,106
wet_mcp/config.py,sha256=9ppAQSMmoNN5vAZHxSwMAjyeA7-3k3Nm6ANRUVMikBU,716
wet_mcp/docker_manager.py,sha256=mFuzPO_9WjdTK8h7bqZ9VJ9NfW1Dkb7t5UAbTb_6GaE,5106
wet_mcp/searxng_settings.yml,sha256=mB-AgqDGzoEG5xF5gHfPtH3s3TRyt3wV9FYC3b6wdIY,524
wet_mcp/server.py,sha256=wmlZuAv6WvTrf-8LKYfNPvSI8vXk8Y0R5ZMuMPenNk4,4851
wet_mcp/setup.py,sha256=8leo5QG_gRgVnpHhapOohbrsFoET4WaZ-cLEEc4Tvvw,2938
wet_mcp/docs/__init__.py,sha256=JRSAzxxW76HaoAy6cf2lArX0abAhab3DiJtEwK4CYjc,39
wet_mcp/docs/help.md,sha256=K91uveU6hM_tWBR9iqgn6_g0rt9h1h2zyEA3RMefeks,1428
wet_mcp/docs/media.md,sha256=dktqg67x5yJaMXkHB2ShR9m3YaC2y56Ya82XKCVnl5k,1144
wet_mcp/docs/web.md,sha256=Ts7hBo_TEBcfT10iscag9-PxAKUkhhiaywricUTj3to,1691
wet_mcp/sources/__init__.py,sha256=NzIac1ha0nZR93Uivsq0GqBxdJrfm0q83IQAPeip4I4,42
wet_mcp/sources/crawler.py,sha256=TBdH0raBoVVSohAHPv4J1DaaFbnjIkhLIf3SyI17tn8,9049
wet_mcp/sources/searxng.py,sha256=Mg3WHy1z4OqvEJAOnn674S7ejCxBaROKBoBJJLulOxQ,1979
wet_mcp-1.0.0.dist-info/METADATA,sha256=722R2KzbPcjtH_5zeeuM1nNaDDXy6Zo4OooZ6bjPC3c,6532
wet_mcp-1.0.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
wet_mcp-1.0.0.dist-info/entry_points.txt,sha256=MvjtmQDh--zOPfnE-21Q861RFRLkE1xDbcTGAgURT_Y,41
wet_mcp-1.0.0.dist-info/licenses/LICENSE,sha256=d7xQ6sRyeGus6gnvwgqiQtSY7XdFw0Jd0w5-Co_xHnk,1064
wet_mcp-1.0.0.dist-info/RECORD,,
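Each entry above is `path,sha256=<urlsafe-base64 digest without padding>,size`, per the wheel RECORD format. A hypothetical verification snippet (not part of the package) for one installed file:

```python
# Not part of the package: recompute the RECORD-style digest for one installed
# file and compare it with the sha256= value listed above.
import base64
import hashlib
from importlib.resources import files

data = files("wet_mcp").joinpath("config.py").read_bytes()
digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
print(digest)  # should match the RECORD entry for wet_mcp/config.py
```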
wet_mcp-1.0.0.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026 n24q02m

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.