mcpwebprobe 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcpwebprobe-0.1.0/PKG-INFO +97 -0
- mcpwebprobe-0.1.0/README.md +77 -0
- mcpwebprobe-0.1.0/pyproject.toml +39 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/__init__.py +24 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/api.py +109 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/config.py +262 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/engine/__init__.py +5 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/engine/registry.py +35 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/engine/search_service.py +160 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/engines/__init__.py +23 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/engines/baidu.py +105 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/engines/bing.py +190 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/engines/brave.py +105 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/engines/csdn.py +73 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/engines/duckduckgo.py +124 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/engines/exa.py +97 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/engines/fetch_csdn.py +12 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/engines/fetch_juejin.py +76 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/engines/fetch_linuxdo.py +59 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/engines/github.py +77 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/engines/juejin.py +114 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/engines/linuxdo.py +44 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/engines/startpage.py +192 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/logging.py +38 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/main.py +161 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/server.py +107 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/types.py +11 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/utils/browser_cookies.py +17 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/utils/cookies.py +273 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/utils/csdn.py +361 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/utils/duckduckgo.py +227 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/utils/http_client.py +363 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/utils/playwright.py +196 -0
- mcpwebprobe-0.1.0/src/mcpwebprobe/utils/urls.py +117 -0
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: mcpwebprobe
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python port of open-webSearch.
|
|
5
|
+
Classifier: Development Status :: 4 - Beta
|
|
6
|
+
Classifier: Intended Audience :: Developers
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
10
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
11
|
+
Requires-Dist: beautifulsoup4>=4.12.2
|
|
12
|
+
Requires-Dist: httpx>=0.28.1
|
|
13
|
+
Requires-Dist: modelcontextprotocol>=1.0.1
|
|
14
|
+
Requires-Dist: pytest>=9.0.3,<9.1
|
|
15
|
+
Requires-Dist: pytest-asyncio>=0.22.0
|
|
16
|
+
Requires-Python: >=3.14
|
|
17
|
+
Project-URL: Homepage, https://github.com/Ddilibe/webprobe
|
|
18
|
+
Project-URL: Issues, https://github.com/Ddilibe/webprobe
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# WebProbe (Python port of open-webSearch)
|
|
22
|
+
|
|
23
|
+
This project replicates the core search and fetch capabilities of [Aas-ee/open-webSearch](https://github.com/Aas-ee/open-webSearch) using Python. It exposes a CLI that can:
|
|
24
|
+
|
|
25
|
+
- Search multiple engines (Bing, DuckDuckGo, Baidu, Brave, Exa, Startpage, CSDN, Juejin, Linux.do)
|
|
26
|
+
- Fetch full-length articles from CSDN, Linux.do, and Juejin
|
|
27
|
+
- Download GitHub `README.*` files without hitting the API
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
python -m pip install --upgrade pip
|
|
33
|
+
pip install -e .
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Package usage
|
|
37
|
+
|
|
38
|
+
Install the project locally and consume the `mcpwebprobe` package directly:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from mcpwebprobe import WebProbeServer, search, fetch_csdn
|
|
42
|
+
|
|
43
|
+
print(search("visible web", limit=5))
|
|
44
|
+
print(fetch_csdn("https://blog.csdn.net/example/article/details/xxxxx"))
|
|
45
|
+
|
|
46
|
+
# Start the bundled HTTP server (serves /search and /fetch?kind=csdn)
|
|
47
|
+
server = WebProbeServer(host="0.0.0.0", port=3210)
|
|
48
|
+
try:
|
|
49
|
+
server.serve_forever()
|
|
50
|
+
finally:
|
|
51
|
+
server.shutdown()
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
The HTTP server exposes `/search?query=...&limit=...&engines=...` and `/fetch?kind=<csdn|linuxdo|juejin|github>&url=...`.
|
|
55
|
+
|
|
56
|
+
## CLI
|
|
57
|
+
|
|
58
|
+
Run `python main.py --help` to see available commands. Key subcommands:
|
|
59
|
+
|
|
60
|
+
### `search`
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
python main.py search "open websearch" --limit 12 --engines bing,duckduckgo
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Article fetchers
|
|
67
|
+
|
|
68
|
+
Each fetcher prints JSON or plain text:
|
|
69
|
+
|
|
70
|
+
- `python main.py fetch-csdn <url>`
|
|
71
|
+
- `python main.py fetch-linuxdo <url>`
|
|
72
|
+
- `python main.py fetch-juejin <url>`
|
|
73
|
+
- `python main.py fetch-github <repo-url>`
|
|
74
|
+
|
|
75
|
+
## Configuration
|
|
76
|
+
|
|
77
|
+
Environment variables mirror the TypeScript version:
|
|
78
|
+
|
|
79
|
+
| Variable | Default | Description |
|
|
80
|
+
| --- | --- | --- |
|
|
81
|
+
| `DEFAULT_SEARCH_ENGINE` | `bing` | Default search engine |
|
|
82
|
+
| `ALLOWED_SEARCH_ENGINES` | (empty) | Comma-separated whitelist |
|
|
83
|
+
| `USE_PROXY` / `PROXY_URL` | `false` / `http://127.0.0.1:7890` | HTTP proxy for requests |
|
|
84
|
+
|
|
85
|
+
Set `USE_PROXY=true` to route all HTTP traffic through `PROXY_URL`.
|
|
86
|
+
|
|
87
|
+
## Architecture
|
|
88
|
+
|
|
89
|
+
- `src/engine/search_service.py` orchestrates multi-engine searches with distribution logic.
|
|
90
|
+
- `src/engines/*` implement individual search/fetch adapters for each provider.
|
|
91
|
+
- `src/utils/` contains HTTP helpers, Playwright bridges for future browser fallbacks, and shared fetch logic for CSDN articles.
|
|
92
|
+
|
|
93
|
+
## Next steps
|
|
94
|
+
|
|
95
|
+
1. Wire this CLI into an MCP server similar to the TypeScript runtime.
|
|
96
|
+
2. Add Playwright-backed fallbacks for blocked search pages and protected articles.
|
|
97
|
+
3. Extend fetchers with generic web extraction (`fetch_web_content`) as in the original repo.
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# WebProbe (Python port of open-webSearch)
|
|
2
|
+
|
|
3
|
+
This project replicates the core search and fetch capabilities of [Aas-ee/open-webSearch](https://github.com/Aas-ee/open-webSearch) using Python. It exposes a CLI that can:
|
|
4
|
+
|
|
5
|
+
- Search multiple engines (Bing, DuckDuckGo, Baidu, Brave, Exa, Startpage, CSDN, Juejin, Linux.do)
|
|
6
|
+
- Fetch full-length articles from CSDN, Linux.do, and Juejin
|
|
7
|
+
- Download GitHub `README.*` files without hitting the API
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
python -m pip install --upgrade pip
|
|
13
|
+
pip install -e .
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Package usage
|
|
17
|
+
|
|
18
|
+
Install the project locally and consume the `mcpwebprobe` package directly:
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
from mcpwebprobe import WebProbeServer, search, fetch_csdn
|
|
22
|
+
|
|
23
|
+
print(search("visible web", limit=5))
|
|
24
|
+
print(fetch_csdn("https://blog.csdn.net/example/article/details/xxxxx"))
|
|
25
|
+
|
|
26
|
+
# Start the bundled HTTP server (serves /search and /fetch?kind=csdn)
|
|
27
|
+
server = WebProbeServer(host="0.0.0.0", port=3210)
|
|
28
|
+
try:
|
|
29
|
+
server.serve_forever()
|
|
30
|
+
finally:
|
|
31
|
+
server.shutdown()
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
The HTTP server exposes `/search?query=...&limit=...&engines=...` and `/fetch?kind=<csdn|linuxdo|juejin|github>&url=...`.
|
|
35
|
+
|
|
36
|
+
## CLI
|
|
37
|
+
|
|
38
|
+
Run `python main.py --help` to see available commands. Key subcommands:
|
|
39
|
+
|
|
40
|
+
### `search`
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
python main.py search "open websearch" --limit 12 --engines bing,duckduckgo
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Article fetchers
|
|
47
|
+
|
|
48
|
+
Each fetcher prints JSON or plain text:
|
|
49
|
+
|
|
50
|
+
- `python main.py fetch-csdn <url>`
|
|
51
|
+
- `python main.py fetch-linuxdo <url>`
|
|
52
|
+
- `python main.py fetch-juejin <url>`
|
|
53
|
+
- `python main.py fetch-github <repo-url>`
|
|
54
|
+
|
|
55
|
+
## Configuration
|
|
56
|
+
|
|
57
|
+
Environment variables mirror the TypeScript version:
|
|
58
|
+
|
|
59
|
+
| Variable | Default | Description |
|
|
60
|
+
| --- | --- | --- |
|
|
61
|
+
| `DEFAULT_SEARCH_ENGINE` | `bing` | Default search engine |
|
|
62
|
+
| `ALLOWED_SEARCH_ENGINES` | (empty) | Comma-separated whitelist |
|
|
63
|
+
| `USE_PROXY` / `PROXY_URL` | `false` / `http://127.0.0.1:7890` | HTTP proxy for requests |
|
|
64
|
+
|
|
65
|
+
Set `USE_PROXY=true` to route all HTTP traffic through `PROXY_URL`.
|
|
66
|
+
|
|
67
|
+
## Architecture
|
|
68
|
+
|
|
69
|
+
- `src/engine/search_service.py` orchestrates multi-engine searches with distribution logic.
|
|
70
|
+
- `src/engines/*` implement individual search/fetch adapters for each provider.
|
|
71
|
+
- `src/utils/` contains HTTP helpers, Playwright bridges for future browser fallbacks, and shared fetch logic for CSDN articles.
|
|
72
|
+
|
|
73
|
+
## Next steps
|
|
74
|
+
|
|
75
|
+
1. Wire this CLI into an MCP server similar to the TypeScript runtime.
|
|
76
|
+
2. Add Playwright-backed fallbacks for blocked search pages and protected articles.
|
|
77
|
+
3. Extend fetchers with generic web extraction (`fetch_web_content`) as in the original repo.
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "mcpwebprobe"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Python port of open-webSearch."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.14"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"beautifulsoup4>=4.12.2",
|
|
9
|
+
"httpx>=0.28.1",
|
|
10
|
+
"modelcontextprotocol>=1.0.1",
|
|
11
|
+
"pytest>=9.0.3,<9.1",
|
|
12
|
+
"pytest-asyncio>=0.22.0",
|
|
13
|
+
]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Programming Language :: Python :: 3.14",
|
|
20
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
[project.scripts]
|
|
25
|
+
webprobe = "mcpwebprobe:main"
|
|
26
|
+
|
|
27
|
+
[build-system]
|
|
28
|
+
requires = ["uv_build>=0.9.10,<0.10.0"]
|
|
29
|
+
build-backend = "uv_build"
|
|
30
|
+
|
|
31
|
+
[dependency-groups]
|
|
32
|
+
dev = [
|
|
33
|
+
"build>=1.4.3",
|
|
34
|
+
"twine>=6.2.0",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.urls]
|
|
38
|
+
Homepage = "https://github.com/Ddilibe/webprobe"
|
|
39
|
+
Issues = "https://github.com/Ddilibe/webprobe"
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Convenience exports for the webprobe package."""
|
|
2
|
+
|
|
3
|
+
from .api import (
|
|
4
|
+
fetch_csdn,
|
|
5
|
+
fetch_github,
|
|
6
|
+
fetch_juejin,
|
|
7
|
+
fetch_linuxdo,
|
|
8
|
+
search,
|
|
9
|
+
)
|
|
10
|
+
from .logging import configure_logging, get_logger
|
|
11
|
+
from .main import main
|
|
12
|
+
from .server import WebProbeServer
|
|
13
|
+
|
|
14
|
+
# Explicit public API of the package: the search entry point, the per-site
# article fetchers, the bundled HTTP server, logging helpers, and the CLI main.
__all__ = [
    "search",
    "fetch_csdn",
    "fetch_linuxdo",
    "fetch_juejin",
    "fetch_github",
    "WebProbeServer",
    "configure_logging",
    "get_logger",
    "main",
]
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
from typing import Iterable, List, Optional, Sequence
|
|
4
|
+
|
|
5
|
+
from mcpwebprobe.config import config
|
|
6
|
+
from mcpwebprobe.engine.registry import SEARCH_SERVICE
|
|
7
|
+
from mcpwebprobe.engine.search_service import (
|
|
8
|
+
normalize_engine_name,
|
|
9
|
+
resolve_requested_engines,
|
|
10
|
+
SearchExecutionResult,
|
|
11
|
+
)
|
|
12
|
+
from mcpwebprobe.engines.fetch_csdn import fetch_csdn_article
|
|
13
|
+
from mcpwebprobe.engines.fetch_juejin import fetch_juejin_article
|
|
14
|
+
from mcpwebprobe.engines.fetch_linuxdo import fetch_linuxdo_article
|
|
15
|
+
from mcpwebprobe.engines.github import fetch_github_readme
|
|
16
|
+
from mcpwebprobe.logging import get_logger
|
|
17
|
+
|
|
18
|
+
logger = get_logger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _normalize_requested_engines(engines: Optional[Sequence[str]]) -> List[str]:
    """Map caller-supplied engine names through ``normalize_engine_name``.

    Entries that normalize to a falsy value are dropped.  ``None`` or an
    empty sequence yields an empty list.
    """
    if not engines:
        return []
    candidates = (normalize_engine_name(name) for name in engines)
    return [name for name in candidates if name]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _serialize_search(result: SearchExecutionResult) -> dict:
    """Convert a ``SearchExecutionResult`` into a JSON-serializable dict.

    Keys use the camelCase naming expected by API consumers
    (``totalResults``, ``partialFailures``).
    """
    failures = [
        {"engine": failure.engine, "code": failure.code, "message": failure.message}
        for failure in result.partial_failures
    ]
    entries = [
        {
            "title": entry.title,
            "url": entry.url,
            "description": entry.description,
            "source": entry.source,
            "engine": entry.engine,
        }
        for entry in result.results
    ]
    return {
        "query": result.query,
        "engines": result.engines,
        "totalResults": result.total_results,
        "partialFailures": failures,
        "results": entries,
    }
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def search(
    query: str,
    limit: int = 10,
    engines: Optional[Iterable[str]] = None,
) -> dict:
    """Run a search across the configured engines.

    Args:
        query: Non-empty search string.
        limit: Maximum number of results; must be between 1 and 50 inclusive.
        engines: Optional engine names; when omitted or empty, the configured
            default engine is used.

    Returns:
        A JSON-serializable dict (see ``_serialize_search``).

    Raises:
        ValueError: If ``query`` is blank or ``limit`` is out of range.
    """
    if not query or not query.strip():
        raise ValueError("query is required")
    if limit < 1 or limit > 50:
        raise ValueError("limit must be between 1 and 50")

    wanted = _normalize_requested_engines(engines)
    if not wanted:
        wanted = [config.default_search_engine]
    allowed = [normalize_engine_name(name) for name in config.allowed_search_engines]
    selected = resolve_requested_engines(wanted, allowed, config.default_search_engine)

    # The service layer is async; drive it to completion from this sync API.
    outcome = asyncio.run(
        SEARCH_SERVICE.execute(query=query, engines=selected, limit=limit)
    )
    logger.debug(
        "Ran search for query=%s limit=%s engines=%s", query, limit, selected
    )
    return _serialize_search(outcome)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def fetch_csdn(url: str) -> dict:
    """Fetch a CSDN article at ``url`` and return it as a dict."""
    logger.debug("Fetching CSDN article %s", url)
    article = fetch_csdn_article(url)
    return asyncio.run(article)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def fetch_linuxdo(url: str) -> dict:
    """Fetch a linux.do topic at ``url`` and return it as a dict."""
    logger.debug("Fetching linux.do topic %s", url)
    topic = fetch_linuxdo_article(url)
    return asyncio.run(topic)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def fetch_juejin(url: str) -> dict:
    """Fetch a Juejin article at ``url`` and return it as a dict."""
    logger.debug("Fetching Juejin article %s", url)
    article = fetch_juejin_article(url)
    return asyncio.run(article)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def fetch_github(url: str) -> Optional[str]:
    """Fetch a GitHub repo's README text; None when it cannot be retrieved."""
    logger.debug("Fetching GitHub README %s", url)
    readme = fetch_github_readme(url)
    return asyncio.run(readme)
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Optional, List, Literal, Union
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from urllib.parse import quote
|
|
5
|
+
|
|
6
|
+
from mcpwebprobe.logging import get_logger
|
|
7
|
+
|
|
8
|
+
# Type aliases for better type hints.
# Canonical lowercase names of every supported engine; must stay in sync
# with VALID_SEARCH_ENGINES below.
SearchEngine = Literal[
    "bing",
    "duckduckgo",
    "exa",
    "brave",
    "baidu",
    "csdn",
    "linuxdo",
    "juejin",
    "startpage",
]
# How searches are executed: plain HTTP request, request with Playwright
# fallback, or Playwright only (mirrors VALID_SEARCH_MODES below).
SearchMode = Literal["request", "auto", "playwright"]
# Which Playwright distribution to load (mirrors VALID_PLAYWRIGHT_PACKAGES).
PlaywrightPackage = Literal["auto", "playwright", "playwright-core"]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class AppConfig:
    """Runtime configuration assembled from environment variables.

    A single module-level instance (``config``) is built below and then
    normalized in place by the validation statements that follow it.
    """

    # Search engine configuration
    default_search_engine: SearchEngine
    # List of allowed search engines (if empty, all engines are available)
    allowed_search_engines: List[str]
    # Search mode: request only, auto request then fallback, or force Playwright
    # Currently only affects Bing.
    search_mode: SearchMode
    # Proxy configuration
    proxy_url: Optional[str]
    use_proxy: bool
    fetch_web_allow_insecure_tls: bool
    # Playwright configuration
    playwright_package: PlaywrightPackage
    playwright_module_path: Optional[str]
    playwright_executable_path: Optional[str]
    playwright_ws_endpoint: Optional[str]
    playwright_cdp_endpoint: Optional[str]
    playwright_headless: bool
    playwright_navigation_timeout_ms: int
    # CORS configuration
    enable_cors: bool
    cors_origin: str
    # Server configuration (determined by MODE env var: 'both', 'http', or 'stdio')
    enable_http_server: bool
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def read_optional_env(name: str) -> Optional[str]:
    """Return the stripped value of environment variable ``name``, or None.

    A missing variable and a value that is empty after stripping whitespace
    both yield None.
    """
    raw = os.environ.get(name)
    if raw is None:
        return None
    stripped = raw.strip()
    return stripped or None
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def parse_allowed_search_engines(env_value: Optional[str]) -> List[str]:
    """Parse a comma-separated list of allowed search engines.

    Args:
        env_value: Raw value of ``ALLOWED_SEARCH_ENGINES``, or None.

    Returns:
        Engine names with surrounding whitespace removed.  Empty segments
        (e.g. from ``"bing,,brave"`` or a trailing comma) are discarded so
        they do not later trigger spurious invalid-engine warnings.  An
        unset or empty value yields an empty list, meaning "no restriction".
    """
    if not env_value:
        return []
    # Drop segments that are empty after stripping; previously these came
    # through as "" entries.
    return [cleaned for part in env_value.split(",") if (cleaned := part.strip())]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# Valid search engines list (canonical lowercase names; must stay in sync
# with the SearchEngine Literal above)
VALID_SEARCH_ENGINES = [
    "bing",
    "duckduckgo",
    "exa",
    "brave",
    "baidu",
    "csdn",
    "linuxdo",
    "juejin",
    "startpage",
]
# Accepted SEARCH_MODE values (must match the SearchMode Literal)
VALID_SEARCH_MODES = ["request", "auto", "playwright"]
# Accepted PLAYWRIGHT_PACKAGE values (must match the PlaywrightPackage Literal)
VALID_PLAYWRIGHT_PACKAGES = ["auto", "playwright", "playwright-core"]
# When set to "true", suppresses the informational startup log block below
QUIET_STARTUP_LOGS = os.environ.get("OPEN_WEBSEARCH_QUIET_STARTUP") == "true"
|
|
83
|
+
|
|
84
|
+
def _read_navigation_timeout_ms() -> int:
    """Parse PLAYWRIGHT_NAVIGATION_TIMEOUT_MS without crashing at import time.

    Returns the parsed integer, 20000 when the variable is unset, or 0 for a
    non-numeric value.  0 fails the ``> 0`` validation check further down,
    which logs a warning and applies the 20000ms fallback.  The previous
    bare ``int(...)`` call raised ValueError on malformed input and aborted
    module import before the fallback validation could run.
    """
    raw = os.environ.get("PLAYWRIGHT_NAVIGATION_TIMEOUT_MS")
    if raw is None:
        return 20000
    try:
        return int(raw)
    except ValueError:
        return 0


# Read from environment variables or use defaults
config = AppConfig(
    # Search engine configuration (validated against VALID_SEARCH_ENGINES below)
    default_search_engine=os.environ.get("DEFAULT_SEARCH_ENGINE", "bing"),  # type: ignore
    # Parse comma-separated list of allowed search engines
    allowed_search_engines=parse_allowed_search_engines(
        os.environ.get("ALLOWED_SEARCH_ENGINES")
    ),
    search_mode=os.environ.get("SEARCH_MODE", "auto"),  # type: ignore
    # Proxy configuration
    proxy_url=os.environ.get("PROXY_URL", "http://127.0.0.1:7890"),
    use_proxy=os.environ.get("USE_PROXY") == "true",
    fetch_web_allow_insecure_tls=os.environ.get("FETCH_WEB_INSECURE_TLS") == "true",
    playwright_package=os.environ.get("PLAYWRIGHT_PACKAGE", "auto"),  # type: ignore
    playwright_module_path=read_optional_env("PLAYWRIGHT_MODULE_PATH"),
    playwright_executable_path=read_optional_env("PLAYWRIGHT_EXECUTABLE_PATH"),
    playwright_ws_endpoint=read_optional_env("PLAYWRIGHT_WS_ENDPOINT"),
    playwright_cdp_endpoint=read_optional_env("PLAYWRIGHT_CDP_ENDPOINT"),
    # Headless unless explicitly set to "false"
    playwright_headless=os.environ.get("PLAYWRIGHT_HEADLESS") != "false",
    playwright_navigation_timeout_ms=_read_navigation_timeout_ms(),
    # CORS configuration
    enable_cors=os.environ.get("ENABLE_CORS") == "true",
    cors_origin=os.environ.get("CORS_ORIGIN", "*"),
    # Server configuration - determined by MODE environment variable
    # Modes: 'both' (default), 'http', 'stdio'
    enable_http_server=(
        (os.environ.get("MODE", "both") in ["both", "http"])
        if os.environ.get("MODE")
        else True
    ),
)
|
|
117
|
+
|
|
118
|
+
logger = get_logger(__name__)

# The blocks below normalize the module-level `config` in place, replacing
# any invalid environment-supplied value with a safe default and warning
# about it, so the rest of the application can trust `config` unconditionally.

# Validate default search engine
if config.default_search_engine not in VALID_SEARCH_ENGINES:
    logger.warning(
        'Invalid DEFAULT_SEARCH_ENGINE: "%s", falling back to "bing"',
        config.default_search_engine,
    )
    config.default_search_engine = "bing"

# Validate search mode
if config.search_mode not in VALID_SEARCH_MODES:
    logger.warning(
        'Invalid SEARCH_MODE: "%s", falling back to "auto"',
        config.search_mode,
    )
    config.search_mode = "auto"

# Validate Playwright package selection
if config.playwright_package not in VALID_PLAYWRIGHT_PACKAGES:
    logger.warning(
        'Invalid PLAYWRIGHT_PACKAGE: "%s", falling back to "auto"',
        config.playwright_package,
    )
    config.playwright_package = "auto"

# Navigation timeout must be a positive number; anything else falls back.
if not (
    isinstance(config.playwright_navigation_timeout_ms, (int, float))
    and config.playwright_navigation_timeout_ms > 0
):
    logger.warning(
        'Invalid PLAYWRIGHT_NAVIGATION_TIMEOUT_MS: "%s", falling back to 20000',
        os.environ.get("PLAYWRIGHT_NAVIGATION_TIMEOUT_MS"),
    )
    config.playwright_navigation_timeout_ms = 20000

# Warn about conflicting remote-browser endpoint settings (ws wins).
if config.playwright_ws_endpoint and config.playwright_cdp_endpoint:
    logger.warning(
        "Both PLAYWRIGHT_WS_ENDPOINT and PLAYWRIGHT_CDP_ENDPOINT are set, PLAYWRIGHT_WS_ENDPOINT will take precedence"
    )

# A local executable path is meaningless when a remote endpoint is used.
if (
    config.playwright_ws_endpoint or config.playwright_cdp_endpoint
) and config.playwright_executable_path:
    logger.warning(
        "PLAYWRIGHT_EXECUTABLE_PATH is ignored when connecting to a remote browser endpoint"
    )

# Validate allowed search engines
if config.allowed_search_engines:
    # Filter out invalid engines
    invalid_engines = [
        engine
        for engine in config.allowed_search_engines
        if engine not in VALID_SEARCH_ENGINES
    ]
    if invalid_engines:
        logger.warning(
            "Invalid search engines detected and will be ignored: %s",
            ", ".join(invalid_engines),
        )

    config.allowed_search_engines = [
        engine
        for engine in config.allowed_search_engines
        if engine in VALID_SEARCH_ENGINES
    ]

    # If all engines were invalid, don't restrict (allow all engines)
    if not config.allowed_search_engines:
        logger.warning(
            "No valid search engines specified in the allowed list, all engines will be available"
        )
    # Check if default engine is in the allowed list
    elif config.default_search_engine not in config.allowed_search_engines:
        logger.warning(
            'Default search engine "%s" is not in the allowed engines list',
            config.default_search_engine,
        )
        # Update the default engine to the first allowed engine
        config.default_search_engine = config.allowed_search_engines[0]  # type: ignore
        logger.info(
            'Default search engine updated to "%s"',
            config.default_search_engine,
        )
|
|
201
|
+
|
|
202
|
+
# One-time informational dump of the effective configuration, skipped when
# OPEN_WEBSEARCH_QUIET_STARTUP=true (see QUIET_STARTUP_LOGS above).
if not QUIET_STARTUP_LOGS:
    # Log configuration
    logger.info("🔍 Default search engine: %s", config.default_search_engine)
    if config.allowed_search_engines:
        logger.info("🔍 Allowed search engines: %s", ", ".join(config.allowed_search_engines))
    else:
        logger.info("🔍 No search engine restrictions, all available engines can be used")
    logger.info(
        "🔍 Search mode: %s (currently only affects Bing)",
        config.search_mode.upper(),
    )

    if config.use_proxy:
        logger.info("🌐 Using proxy: %s", config.proxy_url)
    else:
        logger.info("🌐 No proxy configured (set USE_PROXY=true to enable)")

    if config.fetch_web_allow_insecure_tls:
        logger.warning(
            "⚠️ fetchWebContent TLS verification is disabled (FETCH_WEB_INSECURE_TLS=true)"
        )
    else:
        logger.info("🔐 fetchWebContent TLS verification is enabled")

    logger.info("🧭 Playwright client source: %s", config.playwright_package)
    if config.playwright_module_path:
        logger.info(
            "🧭 Playwright module path override: %s", config.playwright_module_path
        )
    # Only one browser connection strategy is reported: ws endpoint first,
    # then cdp endpoint, then a local executable path.
    if config.playwright_ws_endpoint:
        logger.info(
            "🧭 Playwright remote endpoint (ws): %s", config.playwright_ws_endpoint
        )
    elif config.playwright_cdp_endpoint:
        logger.info(
            "🧭 Playwright remote endpoint (cdp): %s", config.playwright_cdp_endpoint
        )
    elif config.playwright_executable_path:
        logger.info(
            "🧭 Playwright executable path: %s", config.playwright_executable_path
        )
    logger.info("🧭 Playwright headless: %s", config.playwright_headless)
    logger.info(
        "🧭 Playwright navigation timeout: %sms",
        config.playwright_navigation_timeout_ms,
    )

    # Determine server mode from config
    mode = os.environ.get("MODE") or ("both" if config.enable_http_server else "stdio")
    logger.info("🖥️ Server mode: %s", mode.upper())

    if config.enable_http_server:
        if config.enable_cors:
            logger.info("🔒 CORS enabled with origin: %s", config.cors_origin)
        else:
            logger.info("🔒 CORS disabled (set ENABLE_CORS=true to enable)")
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def get_proxy_url() -> Optional[str]:
    """Return the configured proxy URL when proxying is enabled, else None.

    The URL must be returned verbatim: the previous implementation passed it
    through ``urllib.parse.quote``, which percent-encodes the scheme/port
    colons ("http%3A//127.0.0.1%3A7890") and produces an address no HTTP
    client can use as a proxy.
    """
    if config.use_proxy and config.proxy_url:
        return config.proxy_url
    return None
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Registry of supported search engines."""
|
|
2
|
+
|
|
3
|
+
from typing import Dict
|
|
4
|
+
|
|
5
|
+
from mcpwebprobe.engine.search_service import (
|
|
6
|
+
SearchEngineExecutor,
|
|
7
|
+
SearchExecutionResult,
|
|
8
|
+
SearchService,
|
|
9
|
+
SUPPORTED_SEARCH_ENGINES,
|
|
10
|
+
)
|
|
11
|
+
from mcpwebprobe.engines.baidu import search_baidu
|
|
12
|
+
from mcpwebprobe.engines.bing import search_bing
|
|
13
|
+
from mcpwebprobe.engines.brave import search_brave
|
|
14
|
+
from mcpwebprobe.engines.csdn import search_csdn
|
|
15
|
+
from mcpwebprobe.engines.duckduckgo import search_duckduckgo
|
|
16
|
+
from mcpwebprobe.engines.exa import search_exa
|
|
17
|
+
from mcpwebprobe.engines.juejin import search_juejin
|
|
18
|
+
from mcpwebprobe.engines.linuxdo import search_linuxdo
|
|
19
|
+
from mcpwebprobe.engines.startpage import search_startpage
|
|
20
|
+
|
|
21
|
+
# Alias: maps a canonical engine name to its search executor callable.
EngineMap = Dict[str, SearchEngineExecutor]

# Canonical lowercase engine names mapped to the adapter implementing each
# provider (see mcpwebprobe.engines.*).
ENGINE_MAP: EngineMap = {
    "baidu": search_baidu,
    "bing": search_bing,
    "brave": search_brave,
    "csdn": search_csdn,
    "duckduckgo": search_duckduckgo,
    "exa": search_exa,
    "juejin": search_juejin,
    "linuxdo": search_linuxdo,
    "startpage": search_startpage,
}

# Shared module-level service instance wrapping the full engine map.
SEARCH_SERVICE = SearchService(ENGINE_MAP)
|