wet_mcp-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wet_mcp/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """WET MCP Server - Web ExTract for AI Agents."""
2
+
3
+ from wet_mcp.server import main, mcp
4
+
5
+ __version__ = "0.1.0"
6
+ __all__ = ["mcp", "main", "__version__"]
wet_mcp/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ """WET MCP Server entry point."""
2
+
3
+ from wet_mcp.server import main
4
+
5
+ if __name__ == "__main__":
6
+ main()
wet_mcp/config.py ADDED
@@ -0,0 +1,32 @@
1
+ """Configuration settings for WET MCP Server."""
2
+
3
+ from pydantic_settings import BaseSettings
4
+
5
+
6
+ class Settings(BaseSettings):
7
+ """WET MCP Server configuration."""
8
+
9
+ # SearXNG
10
+ searxng_url: str = "http://localhost:8080"
11
+ searxng_timeout: int = 30
12
+
13
+ # Crawler
14
+ crawler_headless: bool = True
15
+ crawler_timeout: int = 60
16
+
17
+ # Docker Management
18
+ wet_auto_docker: bool = True
19
+ wet_container_name: str = "wet-searxng"
20
+ wet_searxng_image: str = "searxng/searxng:latest"
21
+ wet_searxng_port: int = 8080
22
+
23
+ # Media
24
+ download_dir: str = "~/.wet-mcp/downloads"
25
+
26
+ # Logging
27
+ log_level: str = "INFO"
28
+
29
+ model_config = {"env_prefix": "", "case_sensitive": False}
30
+
31
+
32
+ settings = Settings()
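For illustration, a minimal sketch of how these fields map to environment variables, assuming the package is installed. With an empty `env_prefix` and `case_sensitive` off, pydantic-settings matches each field name case-insensitively:

```python
import os

from wet_mcp.config import Settings

# WET_SEARXNG_PORT maps to the wet_searxng_port field and overrides the 8080 default.
os.environ["WET_SEARXNG_PORT"] = "8888"
print(Settings().wet_searxng_port)  # 8888
```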
wet_mcp/docker_manager.py ADDED
@@ -0,0 +1,146 @@
1
+ """Docker container management for SearXNG."""
2
+
3
+ import socket
4
+ from importlib.resources import files
5
+ from pathlib import Path
6
+
7
+ from loguru import logger
8
+
9
+ from wet_mcp.config import settings
10
+
11
+
12
+ def _find_available_port(start_port: int, max_tries: int = 10) -> int:
13
+ """Find an available port starting from start_port."""
14
+ for offset in range(max_tries):
15
+ port = start_port + offset
16
+ try:
17
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
18
+ s.bind(("localhost", port))
19
+ return port
20
+ except OSError:
21
+ continue
22
+ # Fallback to original port
23
+ return start_port
24
+
25
+
26
+ def _get_settings_path() -> Path:
27
+ """Get path to SearXNG settings file.
28
+
29
+ Copies bundled settings.yml to user config directory for Docker mounting.
30
+ Uses ~/.wet-mcp/ which is typically shared with Docker.
31
+ """
32
+ config_dir = Path.home() / ".wet-mcp"
33
+ config_dir.mkdir(parents=True, exist_ok=True)
34
+
35
+ settings_file = config_dir / "searxng_settings.yml"
36
+
37
+ # Copy bundled settings if not exists
38
+ if not settings_file.exists():
39
+ bundled = files("wet_mcp").joinpath("searxng_settings.yml")
40
+ settings_file.write_text(bundled.read_text())
41
+ logger.debug(f"Copied SearXNG settings to: {settings_file}")
42
+
43
+ return settings_file
44
+
45
+
46
+ def ensure_searxng() -> str:
47
+ """Start SearXNG container if not running. Returns URL.
48
+
49
+ This function handles:
50
+ - Automatic container creation if it doesn't exist
51
+ - Port conflict resolution (tries next available port)
52
+ - SearXNG configuration for JSON API format via settings.yml mount
53
+ - Graceful fallback to external SearXNG URL if Docker unavailable
54
+ """
55
+ if not settings.wet_auto_docker:
56
+ logger.info("Auto Docker disabled, using external SearXNG")
57
+ return settings.searxng_url
58
+
59
+ try:
60
+ from python_on_whales import DockerException, docker
61
+ except ImportError:
62
+ logger.warning("python-on-whales not installed, using external SearXNG")
63
+ return settings.searxng_url
64
+
65
+ container_name = settings.wet_container_name
66
+ image = settings.wet_searxng_image
67
+ preferred_port = settings.wet_searxng_port
68
+
69
+ try:
70
+ if docker.container.exists(container_name):
71
+ container = docker.container.inspect(container_name)
72
+ if container.state.running:
73
+ logger.debug(f"SearXNG container already running: {container_name}")
74
+ # Extract port from running container
75
+ ports = container.network_settings.ports
76
+ if ports and "8080/tcp" in ports and ports["8080/tcp"]:
77
+ port = int(ports["8080/tcp"][0].get("HostPort", preferred_port))
78
+ else:
79
+ port = preferred_port
80
+ else:
81
+ logger.info(f"Starting stopped container: {container_name}")
82
+ docker.container.start(container_name)
83
+ port = preferred_port
84
+ else:
85
+ # Find available port to avoid conflicts
86
+ port = _find_available_port(preferred_port)
87
+ if port != preferred_port:
88
+ logger.info(f"Port {preferred_port} in use, using {port}")
89
+
90
+ # Get settings file path
91
+ settings_path = _get_settings_path()
92
+
93
+ logger.info(f"Starting SearXNG container: {container_name}")
94
+ docker.run(
95
+ image,
96
+ name=container_name,
97
+ detach=True,
98
+ publish=[(port, 8080)],
99
+ volumes=[(str(settings_path), "/etc/searxng/settings.yml", "ro")],
100
+ envs={
101
+ "SEARXNG_SECRET": "wet-internal",
102
+ },
103
+ )
104
+ logger.info(f"SearXNG container started on port {port}")
105
+
106
+ return f"http://localhost:{port}"
107
+
108
+ except DockerException as e:
109
+ logger.warning(f"Docker not available: {e}")
110
+ logger.warning("Falling back to external SearXNG URL")
111
+ return settings.searxng_url
112
+ except Exception as e:
113
+ logger.error(f"Failed to start SearXNG: {e}")
114
+ return settings.searxng_url
115
+
116
+
117
+ def stop_searxng() -> None:
118
+ """Stop SearXNG container if running."""
119
+ if not settings.wet_auto_docker:
120
+ return
121
+
122
+ try:
123
+ from python_on_whales import docker
124
+
125
+ container_name = settings.wet_container_name
126
+ if docker.container.exists(container_name):
127
+ logger.info(f"Stopping container: {container_name}")
128
+ docker.container.stop(container_name)
129
+ except Exception as e:
130
+ logger.debug(f"Failed to stop container: {e}")
131
+
132
+
133
+ def remove_searxng() -> None:
134
+ """Stop and remove SearXNG container."""
135
+ if not settings.wet_auto_docker:
136
+ return
137
+
138
+ try:
139
+ from python_on_whales import docker
140
+
141
+ container_name = settings.wet_container_name
142
+ if docker.container.exists(container_name):
143
+ logger.info(f"Removing container: {container_name}")
144
+ docker.container.remove(container_name, force=True)
145
+ except Exception as e:
146
+ logger.debug(f"Failed to remove container: {e}")
wet_mcp/docs/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Docs package for WET MCP Server."""
wet_mcp/docs/help.md ADDED
@@ -0,0 +1,55 @@
1
+ # WET MCP Server - Help
2
+
3
+ Welcome to **WET** (Web ExTract) MCP Server - an open-source alternative to Tavily.
4
+
5
+ ## Available Tools
6
+
7
+ | Tool | Description |
8
+ |:-----|:------------|
9
+ | `web` | Web search, content extraction, crawling, site mapping |
10
+ | `media` | Media discovery (images, videos, audio) and download |
11
+ | `help` | Get full documentation for any tool |
12
+
13
+ ## Quick Reference
14
+
15
+ ### web tool
16
+
17
+ ```json
18
+ // Search the web
19
+ {"action": "search", "query": "your search query"}
20
+
21
+ // Extract content from URLs
22
+ {"action": "extract", "urls": ["https://example.com"]}
23
+
24
+ // Crawl multiple pages
25
+ {"action": "crawl", "urls": ["https://docs.example.com"], "depth": 2}
26
+
27
+ // Map site structure
28
+ {"action": "map", "urls": ["https://example.com"]}
29
+ ```
30
+
31
+ ### media tool
32
+
33
+ ```json
34
+ // List media on a page
35
+ {"action": "list", "url": "https://example.com"}
36
+
37
+ // Download specific files
38
+ {"action": "download", "media_urls": ["https://example.com/image.png"]}
39
+ ```
40
+
41
+ ## Getting Full Documentation
42
+
43
+ Call `help` with the tool name:
44
+
45
+ ```json
46
+ {"tool_name": "web"} // Web tool documentation
47
+ {"tool_name": "media"} // Media tool documentation
48
+ ```
49
+
50
+ ## Features
51
+
52
+ - **Auto-setup**: First run automatically installs Playwright and configures SearXNG
53
+ - **Anti-bot bypass**: Stealth mode works with Cloudflare, Medium, LinkedIn, etc.
54
+ - **Multimodal**: Extract and download images, videos, audio files
55
+ - **Deep crawling**: Follow links to specified depth with page limits
wet_mcp/docs/media.md ADDED
@@ -0,0 +1,58 @@
1
+ # media Tool Documentation
2
+
3
+ Media discovery and download from web pages.
4
+
5
+ ## Actions
6
+
7
+ ### list
8
+ Scan a page and return media URLs with metadata.
9
+
10
+ **Parameters:**
11
+ - `url` (required): Page URL to scan
12
+ - `media_type`: Type of media - images, videos, audio, files, all (default: all)
13
+ - `max_items`: Maximum items per type (default: 10)
14
+
15
+ **Example:**
16
+ ```json
17
+ {"action": "list", "url": "https://example.com/gallery", "media_type": "images"}
18
+ ```
19
+
20
+ **Returns:**
21
+ ```json
22
+ {
23
+ "images": [
24
+ {"src": "https://...", "alt": "...", "width": 800, "height": 600}
25
+ ],
26
+ "videos": [],
27
+ "audio": []
28
+ }
29
+ ```
30
+
31
+ ---
32
+
33
+ ### download
34
+ Download specific media files to local storage.
35
+
36
+ **Parameters:**
37
+ - `media_urls` (required): List of media URLs to download
38
+ - `output_dir`: Output directory (default: ~/.wet-mcp/downloads)
39
+
40
+ **Example:**
41
+ ```json
42
+ {"action": "download", "media_urls": ["https://example.com/image.jpg"]}
43
+ ```
44
+
45
+ **Returns:**
46
+ ```json
47
+ [
48
+ {"url": "...", "path": "/path/to/file.jpg", "size": 12345}
49
+ ]
50
+ ```
51
+
52
+ ---
53
+
54
+ ## Workflow
55
+
56
+ 1. Use `list` to discover media on a page
57
+ 2. Review the results (optionally have AI analyze)
58
+ 3. Use `download` to save specific files locally
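For illustration, a sketch of that workflow driving the package's internal helpers directly; the page URL is a placeholder, and an MCP client would normally go through the `media` tool instead:

```python
import asyncio
import json

from wet_mcp.sources.crawler import download_media, list_media


async def grab_first_image(page_url: str) -> None:
    # Step 1: discover image URLs and metadata on the page.
    media = json.loads(await list_media(url=page_url, media_type="images"))
    images = media.get("images", [])
    if not images:
        print("no images found")
        return
    # Step 3 (after reviewing the list): download only the file we want.
    report = await download_media(
        media_urls=[images[0]["src"]],
        output_dir="~/.wet-mcp/downloads",
    )
    print(report)


asyncio.run(grab_first_image("https://example.com/gallery"))
```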
wet_mcp/docs/web.md ADDED
@@ -0,0 +1,73 @@
1
+ # web Tool Documentation
2
+
3
+ Web operations: search, extract, crawl, map.
4
+
5
+ ## Actions
6
+
7
+ ### search
8
+ Web search via SearXNG metasearch engine.
9
+
10
+ **Parameters:**
11
+ - `query` (required): Search query string
12
+ - `categories`: Search category - general, images, videos, files (default: general)
13
+ - `max_results`: Maximum results to return (default: 10)
14
+
15
+ **Example:**
16
+ ```json
17
+ {"action": "search", "query": "python web scraping tutorial", "max_results": 5}
18
+ ```
19
+
20
+ ---
21
+
22
+ ### extract
23
+ Get clean content from one or more URLs.
24
+
25
+ **Parameters:**
26
+ - `urls` (required): List of URLs to extract
27
+ - `format`: Output format - markdown, text, html (default: markdown)
28
+ - `stealth`: Enable stealth mode to bypass anti-bot (default: true)
29
+
30
+ **Example:**
31
+ ```json
32
+ {"action": "extract", "urls": ["https://example.com/article"]}
33
+ ```
34
+
35
+ ---
36
+
37
+ ### crawl
38
+ Deep crawl starting from root URLs.
39
+
40
+ **Parameters:**
41
+ - `urls` (required): List of root URLs to crawl from
42
+ - `depth`: How many levels deep to crawl (default: 2)
43
+ - `max_pages`: Maximum pages to crawl (default: 20)
44
+ - `format`: Output format (default: markdown)
45
+ - `stealth`: Enable stealth mode (default: true)
46
+
47
+ **Example:**
48
+ ```json
49
+ {"action": "crawl", "urls": ["https://docs.example.com"], "depth": 3}
50
+ ```
51
+
52
+ ---
53
+
54
+ ### map
55
+ Discover site structure without extracting content.
56
+
57
+ **Parameters:**
58
+ - `urls` (required): List of root URLs
59
+ - `depth`: Discovery depth (default: 2)
60
+ - `max_pages`: Maximum URLs to discover (default: 50)
61
+
62
+ **Example:**
63
+ ```json
64
+ {"action": "map", "urls": ["https://example.com"]}
65
+ ```
66
+
67
+ ---
68
+
69
+ ## Anti-Bot Features
70
+
71
+ The `stealth` parameter enables:
72
+ - Stealth mode: Masks navigator.webdriver, emulates plugins
73
+ - Undetected browser: for sites with advanced bot detection (Cloudflare, DataDome)
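As a sketch of how this is surfaced, the same flag can be passed when calling the extraction helper directly (the URL is a placeholder; `stealth=True` is the default):

```python
import asyncio

from wet_mcp.sources.crawler import extract

# Disable the anti-bot measures for sites that do not need them.
result = asyncio.run(extract(urls=["https://example.com/article"], stealth=False))
print(result)
```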
wet_mcp/searxng_settings.yml ADDED
@@ -0,0 +1,30 @@
1
+ # SearXNG settings for WET MCP Server
2
+ # Mounted into container at /etc/searxng/settings.yml
3
+
4
+ use_default_settings: true
5
+
6
+ server:
7
+ secret_key: "wet-mcp-internal-secret"
8
+ limiter: false
9
+ method: "GET"
10
+
11
+ search:
12
+ formats:
13
+ - html
14
+ - json
15
+
16
+ # Disable rate limiting for local use
17
+ outgoing:
18
+ request_timeout: 10.0
19
+ max_request_timeout: 30.0
20
+
21
+ # Engines to enable
22
+ engines:
23
+ - name: google
24
+ disabled: false
25
+ - name: bing
26
+ disabled: false
27
+ - name: duckduckgo
28
+ disabled: false
29
+ - name: brave
30
+ disabled: false
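The `json` entry under `search.formats` is what the search integration relies on. A quick check, assuming the `wet-searxng` container is already running on the default port (SearXNG typically rejects `format=json` requests when that format is not enabled):

```python
import httpx

# Expect HTTP 200 and a JSON body with a "results" list.
resp = httpx.get(
    "http://localhost:8080/search",
    params={"q": "test", "format": "json"},
)
resp.raise_for_status()
print(len(resp.json().get("results", [])))
```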
wet_mcp/server.py ADDED
@@ -0,0 +1,172 @@
1
+ """WET MCP Server - Main server definition."""
2
+
3
+ import sys
4
+ from importlib.resources import files
5
+
6
+ from loguru import logger
7
+ from mcp.server.fastmcp import FastMCP
8
+
9
+ from wet_mcp.config import settings
10
+ from wet_mcp.docker_manager import ensure_searxng
11
+
12
+ # Configure logging
13
+ logger.remove()
14
+ logger.add(sys.stderr, level=settings.log_level)
15
+
16
+ # Initialize MCP server
17
+ mcp = FastMCP(
18
+ name="wet",
19
+ instructions="Web ExTract MCP Server - search, extract, crawl, map with SearXNG",
20
+ )
21
+
22
+ # Store SearXNG URL after initialization
23
+ _searxng_url: str | None = None
24
+
25
+
26
+ def _get_searxng_url() -> str:
27
+ """Get SearXNG URL, initializing container if needed."""
28
+ global _searxng_url
29
+ if _searxng_url is None:
30
+ _searxng_url = ensure_searxng()
31
+ return _searxng_url
32
+
33
+
34
+ @mcp.tool()
35
+ async def web(
36
+ action: str,
37
+ query: str | None = None,
38
+ urls: list[str] | None = None,
39
+ categories: str = "general",
40
+ max_results: int = 10,
41
+ depth: int = 2,
42
+ max_pages: int = 20,
43
+ format: str = "markdown",
44
+ stealth: bool = True,
45
+ ) -> str:
46
+ """Web operations: search, extract, crawl, map.
47
+ - search: Web search via SearXNG (requires query)
48
+ - extract: Get clean content from URLs
49
+ - crawl: Deep crawl from root URL
50
+ - map: Discover site structure
51
+ Use `help` tool for full documentation.
52
+ """
53
+ from wet_mcp.sources.crawler import crawl, extract, sitemap
54
+ from wet_mcp.sources.searxng import search as searxng_search
55
+
56
+ match action:
57
+ case "search":
58
+ if not query:
59
+ return "Error: query is required for search action"
60
+ searxng_url = _get_searxng_url()
61
+ return await searxng_search(
62
+ searxng_url=searxng_url,
63
+ query=query,
64
+ categories=categories,
65
+ max_results=max_results,
66
+ )
67
+
68
+ case "extract":
69
+ if not urls:
70
+ return "Error: urls is required for extract action"
71
+ return await extract(
72
+ urls=urls,
73
+ format=format,
74
+ stealth=stealth,
75
+ )
76
+
77
+ case "crawl":
78
+ if not urls:
79
+ return "Error: urls is required for crawl action"
80
+ return await crawl(
81
+ urls=urls,
82
+ depth=depth,
83
+ max_pages=max_pages,
84
+ format=format,
85
+ stealth=stealth,
86
+ )
87
+
88
+ case "map":
89
+ if not urls:
90
+ return "Error: urls is required for map action"
91
+ return await sitemap(
92
+ urls=urls,
93
+ depth=depth,
94
+ max_pages=max_pages,
95
+ )
96
+
97
+ case _:
98
+ return f"Error: Unknown action '{action}'. Valid actions: search, extract, crawl, map"
99
+
100
+
101
+ @mcp.tool()
102
+ async def media(
103
+ action: str,
104
+ url: str | None = None,
105
+ media_type: str = "all",
106
+ media_urls: list[str] | None = None,
107
+ output_dir: str | None = None,
108
+ max_items: int = 10,
109
+ ) -> str:
110
+ """Media discovery and download.
111
+ - list: Scan page, return URLs + metadata
112
+ - download: Download specific files to local
113
+ MCP client decides whether to analyze media.
114
+ Use `help` tool for full documentation.
115
+ """
116
+ from wet_mcp.sources.crawler import download_media, list_media
117
+
118
+ match action:
119
+ case "list":
120
+ if not url:
121
+ return "Error: url is required for list action"
122
+ return await list_media(
123
+ url=url,
124
+ media_type=media_type,
125
+ max_items=max_items,
126
+ )
127
+
128
+ case "download":
129
+ if not media_urls:
130
+ return "Error: media_urls is required for download action"
131
+ return await download_media(
132
+ media_urls=media_urls,
133
+ output_dir=output_dir or settings.download_dir,
134
+ )
135
+
136
+ case _:
137
+ return f"Error: Unknown action '{action}'. Valid actions: list, download"
138
+
139
+
140
+ @mcp.tool()
141
+ async def help(tool_name: str = "web") -> str:
142
+ """Get full documentation for a tool.
143
+ Use when compressed descriptions are insufficient.
144
+ """
145
+ try:
146
+ doc_file = files("wet_mcp.docs").joinpath(f"{tool_name}.md")
147
+ return doc_file.read_text()
148
+ except FileNotFoundError:
149
+ return f"Error: No documentation found for tool '{tool_name}'"
150
+ except Exception as e:
151
+ return f"Error loading documentation: {e}"
152
+
153
+
154
+ def main() -> None:
155
+ """Run the MCP server."""
156
+ from wet_mcp.setup import run_auto_setup
157
+
158
+ logger.info("Starting WET MCP Server...")
159
+
160
+ # Run auto-setup on first start (installs Playwright, etc.)
161
+ run_auto_setup()
162
+
163
+ # Initialize SearXNG container
164
+ searxng_url = _get_searxng_url()
165
+ logger.info(f"SearXNG URL: {searxng_url}")
166
+
167
+ # Run MCP server
168
+ mcp.run()
169
+
170
+
171
+ if __name__ == "__main__":
172
+ main()
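For reference, a sketch of exercising this server end-to-end with the MCP Python SDK's stdio client, assuming `uvx` is available (`mcp.run()` defaults to the stdio transport):

```python
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def demo() -> None:
    # Launch wet-mcp as a subprocess and talk to it over stdio.
    params = StdioServerParameters(command="uvx", args=["wet-mcp"])
    async with stdio_client(params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            result = await session.call_tool(
                "web", {"action": "search", "query": "python web scraping"}
            )
            print(result.content)


asyncio.run(demo())
```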
wet_mcp/setup.py ADDED
@@ -0,0 +1,94 @@
1
+ """Auto-setup utilities for WET MCP Server.
2
+
3
+ This module handles automatic first-run setup:
4
+ - Install Playwright browsers (chromium)
5
+ - Verify Docker availability
6
+ - Create configuration directories
7
+
8
+ Setup runs automatically on first server start.
9
+ """
10
+
11
+ import subprocess
12
+ import sys
13
+ from pathlib import Path
14
+
15
+ from loguru import logger
16
+
17
+ # Marker file to track if setup has been run
18
+ SETUP_MARKER = Path.home() / ".wet-mcp" / ".setup-complete"
19
+
20
+
21
+ def needs_setup() -> bool:
22
+ """Check if setup needs to run."""
23
+ return not SETUP_MARKER.exists()
24
+
25
+
26
+ def run_auto_setup() -> bool:
27
+ """Run automatic setup on first start.
28
+
29
+ Returns:
30
+ True if setup succeeded or was already done, False on failure.
31
+ """
32
+ if not needs_setup():
33
+ logger.debug("Setup already complete, skipping")
34
+ return True
35
+
36
+ logger.info("First run detected, running auto-setup...")
37
+
38
+ success = True
39
+
40
+ # Step 1: Create config directory
41
+ config_dir = Path.home() / ".wet-mcp"
42
+ config_dir.mkdir(parents=True, exist_ok=True)
43
+ logger.debug(f"Created config directory: {config_dir}")
44
+
45
+ # Step 2: Install Playwright chromium (required for Crawl4AI)
46
+ logger.info("Installing Playwright chromium browser...")
47
+ try:
48
+ result = subprocess.run(
49
+ [sys.executable, "-m", "playwright", "install", "chromium"],
50
+ capture_output=True,
51
+ text=True,
52
+ timeout=300,
53
+ )
54
+ if result.returncode == 0:
55
+ logger.info("Playwright chromium installed successfully")
56
+ else:
57
+ logger.warning(f"Playwright install warning: {result.stderr[:200]}")
58
+ # Don't fail - might already be installed
59
+ except subprocess.TimeoutExpired:
60
+ logger.error("Playwright installation timed out")
61
+ success = False
62
+ except FileNotFoundError:
63
+ logger.warning("Playwright command not found, some features may not work")
64
+
65
+ # Step 3: Verify Docker (optional, for SearXNG)
66
+ try:
67
+ result = subprocess.run(
68
+ ["docker", "version", "--format", "{{.Server.Version}}"],
69
+ capture_output=True,
70
+ text=True,
71
+ timeout=10,
72
+ )
73
+ if result.returncode == 0:
74
+ logger.debug(f"Docker available: v{result.stdout.strip()}")
75
+ else:
76
+ logger.info("Docker not running, will use external SearXNG URL if configured")
77
+ except FileNotFoundError:
78
+ logger.info("Docker not installed, will use external SearXNG URL if configured")
79
+ except subprocess.TimeoutExpired:
80
+ logger.debug("Docker check timed out")
81
+
82
+ # Mark setup as complete
83
+ if success:
84
+ SETUP_MARKER.touch()
85
+ logger.info("Auto-setup complete!")
86
+
87
+ return success
88
+
89
+
90
+ def reset_setup() -> None:
91
+ """Reset setup marker to force re-run on next start."""
92
+ if SETUP_MARKER.exists():
93
+ SETUP_MARKER.unlink()
94
+ logger.info("Setup marker removed, will re-run on next start")
wet_mcp/sources/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Sources package for WET MCP Server."""
wet_mcp/sources/crawler.py ADDED
@@ -0,0 +1,296 @@
1
+ """Crawl4AI integration for web crawling and extraction."""
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ from loguru import logger
7
+
8
+
9
+ async def extract(
10
+ urls: list[str],
11
+ format: str = "markdown",
12
+ stealth: bool = True,
13
+ ) -> str:
14
+ """Extract content from URLs.
15
+
16
+ Args:
17
+ urls: List of URLs to extract
18
+ format: Output format (markdown, text, html)
19
+ stealth: Enable stealth mode
20
+
21
+ Returns:
22
+ JSON string with extracted content
23
+ """
24
+ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
25
+
26
+ logger.info(f"Extracting content from {len(urls)} URLs")
27
+
28
+ browser_config = BrowserConfig(
29
+ headless=True,
30
+ enable_stealth=stealth,
31
+ )
32
+
33
+ results = []
34
+
35
+ async with AsyncWebCrawler(config=browser_config) as crawler:
36
+ for url in urls:
37
+ try:
38
+ result = await crawler.arun(
39
+ url,
40
+ config=CrawlerRunConfig(),
41
+ )
42
+
43
+ if result.success:
44
+ content = result.markdown if format == "markdown" else result.cleaned_html
45
+ results.append({
46
+ "url": url,
47
+ "title": result.metadata.get("title", ""),
48
+ "content": content,
49
+ "links": {
50
+ "internal": result.links.get("internal", [])[:20],
51
+ "external": result.links.get("external", [])[:20],
52
+ },
53
+ })
54
+ else:
55
+ results.append({
56
+ "url": url,
57
+ "error": result.error_message or "Failed to extract",
58
+ })
59
+
60
+ except Exception as e:
61
+ logger.error(f"Error extracting {url}: {e}")
62
+ results.append({
63
+ "url": url,
64
+ "error": str(e),
65
+ })
66
+
67
+ logger.info(f"Extracted {len(results)} pages")
68
+ return json.dumps(results, ensure_ascii=False, indent=2)
69
+
70
+
71
+ async def crawl(
72
+ urls: list[str],
73
+ depth: int = 2,
74
+ max_pages: int = 20,
75
+ format: str = "markdown",
76
+ stealth: bool = True,
77
+ ) -> str:
78
+ """Deep crawl from root URLs.
79
+
80
+ Args:
81
+ urls: List of root URLs
82
+ depth: Crawl depth
83
+ max_pages: Maximum pages to crawl
84
+ format: Output format
85
+ stealth: Enable stealth mode
86
+
87
+ Returns:
88
+ JSON string with crawled content
89
+ """
90
+ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
91
+
92
+ logger.info(f"Crawling {len(urls)} URLs with depth={depth}")
93
+
94
+ browser_config = BrowserConfig(
95
+ headless=True,
96
+ enable_stealth=stealth,
97
+ )
98
+
99
+ all_results = []
100
+ visited = set()
101
+
102
+ async with AsyncWebCrawler(config=browser_config) as crawler:
103
+ for root_url in urls:
104
+ to_crawl = [(root_url, 0)]
105
+
106
+ while to_crawl and len(all_results) < max_pages:
107
+ url, current_depth = to_crawl.pop(0)
108
+
109
+ if url in visited or current_depth > depth:
110
+ continue
111
+
112
+ visited.add(url)
113
+
114
+ try:
115
+ result = await crawler.arun(
116
+ url,
117
+ config=CrawlerRunConfig(),
118
+ )
119
+
120
+ if result.success:
121
+ content = result.markdown if format == "markdown" else result.cleaned_html
122
+ all_results.append({
123
+ "url": url,
124
+ "depth": current_depth,
125
+ "title": result.metadata.get("title", ""),
126
+ "content": content[:5000], # Limit content size
127
+ })
128
+
129
+ # Add internal links for next depth
130
+ if current_depth < depth:
131
+ internal_links = result.links.get("internal", [])
132
+ for link_item in internal_links[:10]:
133
+ # Crawl4AI returns dicts with 'href' key
134
+ link_url = link_item.get("href", "") if isinstance(link_item, dict) else link_item
135
+ if link_url and link_url not in visited:
136
+ to_crawl.append((link_url, current_depth + 1))
137
+
138
+ except Exception as e:
139
+ logger.error(f"Error crawling {url}: {e}")
140
+
141
+ logger.info(f"Crawled {len(all_results)} pages")
142
+ return json.dumps(all_results, ensure_ascii=False, indent=2)
143
+
144
+
145
+ async def sitemap(
146
+ urls: list[str],
147
+ depth: int = 2,
148
+ max_pages: int = 50,
149
+ ) -> str:
150
+ """Discover site structure.
151
+
152
+ Args:
153
+ urls: List of root URLs
154
+ depth: Discovery depth
155
+ max_pages: Maximum pages to discover
156
+
157
+ Returns:
158
+ JSON string with discovered URLs
159
+ """
160
+ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
161
+
162
+ logger.info(f"Mapping {len(urls)} URLs")
163
+
164
+ browser_config = BrowserConfig(headless=True)
165
+
166
+ all_urls = []
167
+ visited = set()
168
+
169
+ async with AsyncWebCrawler(config=browser_config) as crawler:
170
+ for root_url in urls:
171
+ to_visit = [(root_url, 0)]
172
+ site_urls = []
173
+
174
+ while to_visit and len(site_urls) < max_pages:
175
+ url, current_depth = to_visit.pop(0)
176
+
177
+ if url in visited or current_depth > depth:
178
+ continue
179
+
180
+ visited.add(url)
181
+ site_urls.append({"url": url, "depth": current_depth})
182
+
183
+ try:
184
+ result = await crawler.arun(
185
+ url,
186
+ config=CrawlerRunConfig(),
187
+ )
188
+
189
+ if result.success and current_depth < depth:
190
+ for link in result.links.get("internal", [])[:20]:
191
+ if link not in visited:
192
+ to_visit.append((link, current_depth + 1))
193
+
194
+ except Exception as e:
195
+ logger.debug(f"Error mapping {url}: {e}")
196
+
197
+ all_urls.extend(site_urls)
198
+
199
+ logger.info(f"Mapped {len(all_urls)} URLs")
200
+ return json.dumps(all_urls, ensure_ascii=False, indent=2)
201
+
202
+
203
+ async def list_media(
204
+ url: str,
205
+ media_type: str = "all",
206
+ max_items: int = 10,
207
+ ) -> str:
208
+ """List media from a page.
209
+
210
+ Args:
211
+ url: Page URL to scan
212
+ media_type: Type of media (images, videos, audio, files, all)
213
+ max_items: Maximum items to return
214
+
215
+ Returns:
216
+ JSON string with media list
217
+ """
218
+ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
219
+
220
+ logger.info(f"Listing media from: {url}")
221
+
222
+ browser_config = BrowserConfig(headless=True)
223
+
224
+ async with AsyncWebCrawler(config=browser_config) as crawler:
225
+ result = await crawler.arun(
226
+ url,
227
+ config=CrawlerRunConfig(),
228
+ )
229
+
230
+ if not result.success:
231
+ return json.dumps({"error": result.error_message or "Failed to load page"})
232
+
233
+ media = result.media or {}
234
+
235
+ output = {}
236
+
237
+ if media_type in ("images", "all"):
238
+ output["images"] = media.get("images", [])[:max_items]
239
+ if media_type in ("videos", "all"):
240
+ output["videos"] = media.get("videos", [])[:max_items]
241
+ if media_type in ("audio", "all"):
242
+ # Crawl4AI uses 'audios' (plural)
243
+ output["audio"] = media.get("audios", [])[:max_items]
244
+
245
+ logger.info(f"Found media: {sum(len(v) for v in output.values())} items")
246
+ return json.dumps(output, ensure_ascii=False, indent=2)
247
+
248
+
249
+ async def download_media(
250
+ media_urls: list[str],
251
+ output_dir: str,
252
+ ) -> str:
253
+ """Download media files.
254
+
255
+ Args:
256
+ media_urls: List of media URLs to download
257
+ output_dir: Output directory
258
+
259
+ Returns:
260
+ JSON string with download results
261
+ """
262
+ import httpx
263
+
264
+ logger.info(f"Downloading {len(media_urls)} media files")
265
+
266
+ output_path = Path(output_dir).expanduser()
267
+ output_path.mkdir(parents=True, exist_ok=True)
268
+
269
+ results = []
270
+
271
+ async with httpx.AsyncClient(timeout=60) as client:
272
+ for url in media_urls:
273
+ try:
274
+ response = await client.get(url)
275
+ response.raise_for_status()
276
+
277
+ filename = url.split("/")[-1].split("?")[0] or "download"
278
+ filepath = output_path / filename
279
+
280
+ filepath.write_bytes(response.content)
281
+
282
+ results.append({
283
+ "url": url,
284
+ "path": str(filepath),
285
+ "size": len(response.content),
286
+ })
287
+
288
+ except Exception as e:
289
+ logger.error(f"Error downloading {url}: {e}")
290
+ results.append({
291
+ "url": url,
292
+ "error": str(e),
293
+ })
294
+
295
+ logger.info(f"Downloaded {len([r for r in results if 'path' in r])} files")
296
+ return json.dumps(results, ensure_ascii=False, indent=2)
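For illustration, a sketch of calling the crawl helper directly; the root URL is a placeholder, and each returned entry carries the URL, depth, title, and truncated content described above:

```python
import asyncio
import json

from wet_mcp.sources.crawler import crawl

pages = json.loads(
    asyncio.run(crawl(urls=["https://docs.example.com"], depth=1, max_pages=5))
)
for page in pages:
    print(page["depth"], page["url"], page.get("title", ""))
```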
wet_mcp/sources/searxng.py ADDED
@@ -0,0 +1,72 @@
1
+ """SearXNG search integration."""
2
+
3
+ import json
4
+
5
+ import httpx
6
+ from loguru import logger
7
+
8
+
9
+ async def search(
10
+ searxng_url: str,
11
+ query: str,
12
+ categories: str = "general",
13
+ max_results: int = 10,
14
+ ) -> str:
15
+ """Search via SearXNG API.
16
+
17
+ Args:
18
+ searxng_url: SearXNG instance URL
19
+ query: Search query
20
+ categories: Search category (general, images, videos, files)
21
+ max_results: Maximum number of results
22
+
23
+ Returns:
24
+ JSON string with search results
25
+ """
26
+ logger.info(f"Searching SearXNG: {query}")
27
+
28
+ params = {
29
+ "q": query,
30
+ "format": "json",
31
+ "categories": categories,
32
+ }
33
+
34
+ try:
35
+ async with httpx.AsyncClient(timeout=30) as client:
36
+ response = await client.get(
37
+ f"{searxng_url}/search",
38
+ params=params,
39
+ )
40
+ response.raise_for_status()
41
+ data = response.json()
42
+
43
+ results = data.get("results", [])[:max_results]
44
+
45
+ # Format results
46
+ formatted = []
47
+ for r in results:
48
+ formatted.append({
49
+ "url": r.get("url", ""),
50
+ "title": r.get("title", ""),
51
+ "snippet": r.get("content", ""),
52
+ "source": r.get("engine", ""),
53
+ })
54
+
55
+ output = {
56
+ "results": formatted,
57
+ "total": len(formatted),
58
+ "query": query,
59
+ }
60
+
61
+ logger.info(f"Found {len(formatted)} results for: {query}")
62
+ return json.dumps(output, ensure_ascii=False, indent=2)
63
+
64
+ except httpx.HTTPStatusError as e:
65
+ logger.error(f"SearXNG HTTP error: {e}")
66
+ return json.dumps({"error": f"HTTP error: {e.response.status_code}"})
67
+ except httpx.RequestError as e:
68
+ logger.error(f"SearXNG request error: {e}")
69
+ return json.dumps({"error": f"Request error: {e}"})
70
+ except Exception as e:
71
+ logger.error(f"SearXNG error: {e}")
72
+ return json.dumps({"error": str(e)})
wet_mcp-1.0.0.dist-info/METADATA ADDED
@@ -0,0 +1,182 @@
1
+ Metadata-Version: 2.4
2
+ Name: wet-mcp
3
+ Version: 1.0.0
4
+ Summary: Open-source MCP Server alternative to Tavily - web search, extract, and crawl with SearXNG
5
+ Project-URL: Homepage, https://github.com/n24q02m/wet-mcp
6
+ Project-URL: Repository, https://github.com/n24q02m/wet-mcp.git
7
+ Project-URL: Issues, https://github.com/n24q02m/wet-mcp/issues
8
+ Author-email: n24q02m <quangminh2422004@gmail.com>
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: crawl4ai,mcp,searxng,tavily-alternative,web-scraping
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Internet :: WWW/HTTP
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Requires-Python: ==3.13.*
22
+ Requires-Dist: crawl4ai>=0.8.0
23
+ Requires-Dist: httpx>=0.27.0
24
+ Requires-Dist: loguru>=0.7.0
25
+ Requires-Dist: mcp[cli]>=1.0.0
26
+ Requires-Dist: pydantic-settings>=2.0.0
27
+ Requires-Dist: pydantic>=2.0.0
28
+ Requires-Dist: python-on-whales>=0.73.0
29
+ Description-Content-Type: text/markdown
30
+
31
+ # WET - Web ExTract MCP Server
32
+
33
+ [![PyPI version](https://badge.fury.io/py/wet-mcp.svg)](https://badge.fury.io/py/wet-mcp)
34
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
35
+
36
+ > **Open-source MCP Server: a Tavily alternative for web scraping & multimodal extraction**
37
+
38
+ Zero-install experience: just run `uvx wet-mcp` - it automatically sets up and manages the SearXNG container.
39
+
40
+ ## Features
41
+
42
+ | Feature | Description |
43
+ |:--------|:------------|
44
+ | **Web Search** | Search via SearXNG (metasearch: Google, Bing, DuckDuckGo, Brave) |
+ | **Content Extract** | Extract clean content (Markdown/Text/HTML) |
+ | **Deep Crawl** | Crawl child pages from a root URL with depth control |
+ | **Site Map** | Discover a website's URL structure |
+ | **Media** | List and download images, videos, and audio files |
+ | **Anti-bot** | Stealth mode bypasses Cloudflare, Medium, LinkedIn, Twitter |
50
+
51
+ ## Quick Start
52
+
53
+ ### Prerequisites
54
+
55
+ - Docker daemon running (for SearXNG)
56
+ - Python 3.13 (or use uvx)
57
+
58
+ ### MCP Client Configuration
59
+
60
+ **Claude Desktop / Cursor / Windsurf / Antigravity:**
61
+
62
+ ```json
63
+ {
64
+ "mcpServers": {
65
+ "wet": {
66
+ "command": "uvx",
67
+ "args": ["wet-mcp"]
68
+ }
69
+ }
70
+ }
71
+ ```
72
+
73
+ **That's it!** When the MCP client launches wet-mcp for the first time, it will:
+ 1. Automatically install Playwright Chromium
+ 2. Automatically pull the SearXNG Docker image
+ 3. Start the `wet-searxng` container
+ 4. Run the MCP server
78
+
79
+ ### Without uvx
80
+
81
+ ```bash
82
+ pip install wet-mcp
83
+ wet-mcp
84
+ ```
85
+
86
+ ## Tools
87
+
88
+ | Tool | Actions | Description |
89
+ |:-----|:--------|:------------|
90
+ | `web` | search, extract, crawl, map | Web operations |
91
+ | `media` | list, download | Media discovery & download |
92
+ | `help` | - | Full documentation |
93
+
94
+ ### Examples
95
+
96
+ ```python
97
+ # Search
98
+ {"action": "search", "query": "python web scraping", "max_results": 10}
99
+
100
+ # Extract content
101
+ {"action": "extract", "urls": ["https://example.com"]}
102
+
103
+ # Crawl with depth
104
+ {"action": "crawl", "urls": ["https://docs.python.org"], "depth": 2}
105
+
106
+ # Map site structure
107
+ {"action": "map", "urls": ["https://example.com"]}
108
+
109
+ # List media
110
+ {"action": "list", "url": "https://github.com/python/cpython"}
111
+
112
+ # Download media
113
+ {"action": "download", "media_urls": ["https://example.com/image.png"]}
114
+ ```
115
+
116
+ ## Tech Stack
117
+
118
+ | Component | Technology |
119
+ |:----------|:-----------|
120
+ | Language | Python 3.13 |
121
+ | MCP Framework | FastMCP |
122
+ | Web Search | SearXNG (auto-managed Docker) |
123
+ | Web Crawling | Crawl4AI |
124
+ | Docker Management | python-on-whales |
125
+
126
+ ## How It Works
127
+
128
+ ```
129
+ ┌─────────────────────────────────────────────────────────┐
130
+ │ MCP Client │
131
+ │ (Claude, Cursor, Windsurf) │
132
+ └─────────────────────┬───────────────────────────────────┘
133
+ │ MCP Protocol
134
+
135
+ ┌─────────────────────────────────────────────────────────┐
136
+ │ WET MCP Server │
137
+ │ ┌──────────┐ ┌──────────┐ ┌──────────────────────┐ │
138
+ │ │ web │ │ media │ │ help │ │
139
+ │ │ (search, │ │ (list, │ │ (full documentation)│ │
140
+ │ │ extract, │ │ download)│ └──────────────────────┘ │
141
+ │ │ crawl, │ └────┬─────┘ │
142
+ │ │ map) │ │ │
143
+ │ └────┬─────┘ │ │
144
+ │ │ │ │
145
+ │ ▼ ▼ │
146
+ │ ┌──────────┐ ┌──────────┐ │
147
+ │ │ SearXNG │ │ Crawl4AI │ │
148
+ │ │ (Docker) │ │(Playwright)│ │
149
+ │ └──────────┘ └──────────┘ │
150
+ └─────────────────────────────────────────────────────────┘
151
+ ```
152
+
153
+ ## Configuration
154
+
155
+ Environment variables:
156
+
157
+ | Variable | Default | Description |
158
+ |:---------|:--------|:------------|
159
+ | `WET_AUTO_DOCKER` | `true` | Auto-manage SearXNG container |
160
+ | `WET_SEARXNG_PORT` | `8080` | SearXNG container port |
161
+ | `SEARXNG_URL` | `http://localhost:8080` | External SearXNG URL |
162
+ | `LOG_LEVEL` | `INFO` | Logging level |
163
+
164
+ ## Container Management
165
+
166
+ ```bash
167
+ # View SearXNG logs
168
+ docker logs wet-searxng
169
+
170
+ # Stop SearXNG
171
+ docker stop wet-searxng
172
+
173
+ # Remove container (will be recreated on next run)
174
+ docker rm wet-searxng
175
+
176
+ # Reset auto-setup (forces re-install Playwright)
177
+ rm ~/.wet-mcp/.setup-complete
178
+ ```
179
+
180
+ ## License
181
+
182
+ MIT License
wet_mcp-1.0.0.dist-info/RECORD ADDED
@@ -0,0 +1,19 @@
1
+ wet_mcp/__init__.py,sha256=I6YcMMy-wD5AY49rICs4lkd3tZQ4-XWkPpvoj7KwsKs,152
2
+ wet_mcp/__main__.py,sha256=fjPYOwPnE1LDxkSbFZH0NjH8WLx8F2NjsO8vD-ujVbw,106
3
+ wet_mcp/config.py,sha256=9ppAQSMmoNN5vAZHxSwMAjyeA7-3k3Nm6ANRUVMikBU,716
4
+ wet_mcp/docker_manager.py,sha256=mFuzPO_9WjdTK8h7bqZ9VJ9NfW1Dkb7t5UAbTb_6GaE,5106
5
+ wet_mcp/searxng_settings.yml,sha256=mB-AgqDGzoEG5xF5gHfPtH3s3TRyt3wV9FYC3b6wdIY,524
6
+ wet_mcp/server.py,sha256=wmlZuAv6WvTrf-8LKYfNPvSI8vXk8Y0R5ZMuMPenNk4,4851
7
+ wet_mcp/setup.py,sha256=8leo5QG_gRgVnpHhapOohbrsFoET4WaZ-cLEEc4Tvvw,2938
8
+ wet_mcp/docs/__init__.py,sha256=JRSAzxxW76HaoAy6cf2lArX0abAhab3DiJtEwK4CYjc,39
9
+ wet_mcp/docs/help.md,sha256=K91uveU6hM_tWBR9iqgn6_g0rt9h1h2zyEA3RMefeks,1428
10
+ wet_mcp/docs/media.md,sha256=dktqg67x5yJaMXkHB2ShR9m3YaC2y56Ya82XKCVnl5k,1144
11
+ wet_mcp/docs/web.md,sha256=Ts7hBo_TEBcfT10iscag9-PxAKUkhhiaywricUTj3to,1691
12
+ wet_mcp/sources/__init__.py,sha256=NzIac1ha0nZR93Uivsq0GqBxdJrfm0q83IQAPeip4I4,42
13
+ wet_mcp/sources/crawler.py,sha256=TBdH0raBoVVSohAHPv4J1DaaFbnjIkhLIf3SyI17tn8,9049
14
+ wet_mcp/sources/searxng.py,sha256=Mg3WHy1z4OqvEJAOnn674S7ejCxBaROKBoBJJLulOxQ,1979
15
+ wet_mcp-1.0.0.dist-info/METADATA,sha256=722R2KzbPcjtH_5zeeuM1nNaDDXy6Zo4OooZ6bjPC3c,6532
16
+ wet_mcp-1.0.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
17
+ wet_mcp-1.0.0.dist-info/entry_points.txt,sha256=MvjtmQDh--zOPfnE-21Q861RFRLkE1xDbcTGAgURT_Y,41
18
+ wet_mcp-1.0.0.dist-info/licenses/LICENSE,sha256=d7xQ6sRyeGus6gnvwgqiQtSY7XdFw0Jd0w5-Co_xHnk,1064
19
+ wet_mcp-1.0.0.dist-info/RECORD,,
wet_mcp-1.0.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
wet_mcp-1.0.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ wet-mcp = wet_mcp:main
wet_mcp-1.0.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 n24q02m
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.