mcpwebprobe 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. mcpwebprobe-0.1.0/PKG-INFO +97 -0
  2. mcpwebprobe-0.1.0/README.md +77 -0
  3. mcpwebprobe-0.1.0/pyproject.toml +39 -0
  4. mcpwebprobe-0.1.0/src/mcpwebprobe/__init__.py +24 -0
  5. mcpwebprobe-0.1.0/src/mcpwebprobe/api.py +109 -0
  6. mcpwebprobe-0.1.0/src/mcpwebprobe/config.py +262 -0
  7. mcpwebprobe-0.1.0/src/mcpwebprobe/engine/__init__.py +5 -0
  8. mcpwebprobe-0.1.0/src/mcpwebprobe/engine/registry.py +35 -0
  9. mcpwebprobe-0.1.0/src/mcpwebprobe/engine/search_service.py +160 -0
  10. mcpwebprobe-0.1.0/src/mcpwebprobe/engines/__init__.py +23 -0
  11. mcpwebprobe-0.1.0/src/mcpwebprobe/engines/baidu.py +105 -0
  12. mcpwebprobe-0.1.0/src/mcpwebprobe/engines/bing.py +190 -0
  13. mcpwebprobe-0.1.0/src/mcpwebprobe/engines/brave.py +105 -0
  14. mcpwebprobe-0.1.0/src/mcpwebprobe/engines/csdn.py +73 -0
  15. mcpwebprobe-0.1.0/src/mcpwebprobe/engines/duckduckgo.py +124 -0
  16. mcpwebprobe-0.1.0/src/mcpwebprobe/engines/exa.py +97 -0
  17. mcpwebprobe-0.1.0/src/mcpwebprobe/engines/fetch_csdn.py +12 -0
  18. mcpwebprobe-0.1.0/src/mcpwebprobe/engines/fetch_juejin.py +76 -0
  19. mcpwebprobe-0.1.0/src/mcpwebprobe/engines/fetch_linuxdo.py +59 -0
  20. mcpwebprobe-0.1.0/src/mcpwebprobe/engines/github.py +77 -0
  21. mcpwebprobe-0.1.0/src/mcpwebprobe/engines/juejin.py +114 -0
  22. mcpwebprobe-0.1.0/src/mcpwebprobe/engines/linuxdo.py +44 -0
  23. mcpwebprobe-0.1.0/src/mcpwebprobe/engines/startpage.py +192 -0
  24. mcpwebprobe-0.1.0/src/mcpwebprobe/logging.py +38 -0
  25. mcpwebprobe-0.1.0/src/mcpwebprobe/main.py +161 -0
  26. mcpwebprobe-0.1.0/src/mcpwebprobe/server.py +107 -0
  27. mcpwebprobe-0.1.0/src/mcpwebprobe/types.py +11 -0
  28. mcpwebprobe-0.1.0/src/mcpwebprobe/utils/browser_cookies.py +17 -0
  29. mcpwebprobe-0.1.0/src/mcpwebprobe/utils/cookies.py +273 -0
  30. mcpwebprobe-0.1.0/src/mcpwebprobe/utils/csdn.py +361 -0
  31. mcpwebprobe-0.1.0/src/mcpwebprobe/utils/duckduckgo.py +227 -0
  32. mcpwebprobe-0.1.0/src/mcpwebprobe/utils/http_client.py +363 -0
  33. mcpwebprobe-0.1.0/src/mcpwebprobe/utils/playwright.py +196 -0
  34. mcpwebprobe-0.1.0/src/mcpwebprobe/utils/urls.py +117 -0
@@ -0,0 +1,97 @@
1
+ Metadata-Version: 2.3
2
+ Name: mcpwebprobe
3
+ Version: 0.1.0
4
+ Summary: Python port of open-webSearch.
5
+ Classifier: Development Status :: 4 - Beta
6
+ Classifier: Intended Audience :: Developers
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: Operating System :: OS Independent
9
+ Classifier: Programming Language :: Python :: 3.14
10
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
11
+ Requires-Dist: beautifulsoup4>=4.12.2
12
+ Requires-Dist: httpx>=0.28.1
13
+ Requires-Dist: modelcontextprotocol>=1.0.1
14
+ Requires-Dist: pytest>=9.0.3,<9.1
15
+ Requires-Dist: pytest-asyncio>=0.22.0
16
+ Requires-Python: >=3.14
17
+ Project-URL: Homepage, https://github.com/Ddilibe/webprobe
18
+ Project-URL: Issues, https://github.com/Ddilibe/webprobe
19
+ Description-Content-Type: text/markdown
20
+
21
+ # WebProbe (Python port of open-webSearch)
22
+
23
+ This project replicates the core search and fetch capabilities of [Aas-ee/open-webSearch](https://github.com/Aas-ee/open-webSearch) using Python. It exposes a CLI that can:
24
+
25
+ - Search multiple engines (Bing, DuckDuckGo, Baidu, Brave, Exa, Startpage, CSDN, Juejin, Linux.do)
26
+ - Fetch full-length articles from CSDN, Linux.do, and Juejin
27
+ - Download GitHub `README.*` files without hitting the API
28
+
29
+ ## Installation
30
+
31
+ ```bash
32
+ python -m pip install --upgrade pip
33
+ pip install -e .
34
+ ```
35
+
36
+ ## Package usage
37
+
38
+ Install the project locally and consume the `mcpwebprobe` package directly:
39
+
40
+ ```python
41
+ from mcpwebprobe import WebProbeServer, search, fetch_csdn
42
+
43
+ print(search("visible web", limit=5))
44
+ print(fetch_csdn("https://blog.csdn.net/example/article/details/xxxxx"))
45
+
46
+ # Start the bundled HTTP server (serves /search and /fetch?kind=csdn)
47
+ server = WebProbeServer(host="0.0.0.0", port=3210)
48
+ try:
49
+ server.serve_forever()
50
+ finally:
51
+ server.shutdown()
52
+ ```
53
+
54
+ The HTTP server exposes `/search?query=...&limit=...&engines=...` and `/fetch?kind=<csdn|linuxdo|juejin|github>&url=...`.
55
+
56
+ ## CLI
57
+
58
+ Run `python main.py --help` to see available commands. Key subcommands:
59
+
60
+ ### `search`
61
+
62
+ ```bash
63
+ python main.py search "open websearch" --limit 12 --engines bing,duckduckgo
64
+ ```
65
+
66
+ ### Article fetchers
67
+
68
+ Each fetcher prints JSON or plain text:
69
+
70
+ - `python main.py fetch-csdn <url>`
71
+ - `python main.py fetch-linuxdo <url>`
72
+ - `python main.py fetch-juejin <url>`
73
+ - `python main.py fetch-github <repo-url>`
74
+
75
+ ## Configuration
76
+
77
+ Environment variables mirror the TypeScript version:
78
+
79
+ | Variable | Default | Description |
80
+ | --- | --- | --- |
81
+ | `DEFAULT_SEARCH_ENGINE` | `bing` | Default search engine |
82
+ | `ALLOWED_SEARCH_ENGINES` | (empty) | Comma-separated whitelist |
83
+ | `USE_PROXY` / `PROXY_URL` | `false` / `http://127.0.0.1:7890` | HTTP proxy for requests |
84
+
85
+ Set `USE_PROXY=true` to route all HTTP traffic through `PROXY_URL`.
86
+
87
+ ## Architecture
88
+
89
+ - `src/engine/search_service.py` orchestrates multi-engine searches with distribution logic.
90
+ - `src/engines/*` implement individual search/fetch adapters for each provider.
91
+ - `src/utils/` contains HTTP helpers, Playwright bridges for future browser fallbacks, and shared fetch logic for CSDN articles.
92
+
93
+ ## Next steps
94
+
95
+ 1. Wire this CLI into an MCP server similar to the TypeScript runtime.
96
+ 2. Add Playwright-backed fallbacks for blocked search pages and protected articles.
97
+ 3. Extend fetchers with generic web extraction (`fetch_web_content`) as in the original repo.
@@ -0,0 +1,77 @@
1
+ # WebProbe (Python port of open-webSearch)
2
+
3
+ This project replicates the core search and fetch capabilities of [Aas-ee/open-webSearch](https://github.com/Aas-ee/open-webSearch) using Python. It exposes a CLI that can:
4
+
5
+ - Search multiple engines (Bing, DuckDuckGo, Baidu, Brave, Exa, Startpage, CSDN, Juejin, Linux.do)
6
+ - Fetch full-length articles from CSDN, Linux.do, and Juejin
7
+ - Download GitHub `README.*` files without hitting the API
8
+
9
+ ## Installation
10
+
11
+ ```bash
12
+ python -m pip install --upgrade pip
13
+ pip install -e .
14
+ ```
15
+
16
+ ## Package usage
17
+
18
+ Install the project locally and consume the `mcpwebprobe` package directly:
19
+
20
+ ```python
21
+ from mcpwebprobe import WebProbeServer, search, fetch_csdn
22
+
23
+ print(search("visible web", limit=5))
24
+ print(fetch_csdn("https://blog.csdn.net/example/article/details/xxxxx"))
25
+
26
+ # Start the bundled HTTP server (serves /search and /fetch?kind=csdn)
27
+ server = WebProbeServer(host="0.0.0.0", port=3210)
28
+ try:
29
+ server.serve_forever()
30
+ finally:
31
+ server.shutdown()
32
+ ```
33
+
34
+ The HTTP server exposes `/search?query=...&limit=...&engines=...` and `/fetch?kind=<csdn|linuxdo|juejin|github>&url=...`.
35
+
36
+ ## CLI
37
+
38
+ Run `python main.py --help` to see available commands. Key subcommands:
39
+
40
+ ### `search`
41
+
42
+ ```bash
43
+ python main.py search "open websearch" --limit 12 --engines bing,duckduckgo
44
+ ```
45
+
46
+ ### Article fetchers
47
+
48
+ Each fetcher prints JSON or plain text:
49
+
50
+ - `python main.py fetch-csdn <url>`
51
+ - `python main.py fetch-linuxdo <url>`
52
+ - `python main.py fetch-juejin <url>`
53
+ - `python main.py fetch-github <repo-url>`
54
+
55
+ ## Configuration
56
+
57
+ Environment variables mirror the TypeScript version:
58
+
59
+ | Variable | Default | Description |
60
+ | --- | --- | --- |
61
+ | `DEFAULT_SEARCH_ENGINE` | `bing` | Default search engine |
62
+ | `ALLOWED_SEARCH_ENGINES` | (empty) | Comma-separated whitelist |
63
+ | `USE_PROXY` / `PROXY_URL` | `false` / `http://127.0.0.1:7890` | HTTP proxy for requests |
64
+
65
+ Set `USE_PROXY=true` to route all HTTP traffic through `PROXY_URL`.
66
+
67
+ ## Architecture
68
+
69
+ - `src/engine/search_service.py` orchestrates multi-engine searches with distribution logic.
70
+ - `src/engines/*` implement individual search/fetch adapters for each provider.
71
+ - `src/utils/` contains HTTP helpers, Playwright bridges for future browser fallbacks, and shared fetch logic for CSDN articles.
72
+
73
+ ## Next steps
74
+
75
+ 1. Wire this CLI into an MCP server similar to the TypeScript runtime.
76
+ 2. Add Playwright-backed fallbacks for blocked search pages and protected articles.
77
+ 3. Extend fetchers with generic web extraction (`fetch_web_content`) as in the original repo.
@@ -0,0 +1,39 @@
1
+ [project]
2
+ name = "mcpwebprobe"
3
+ version = "0.1.0"
4
+ description = "Python port of open-webSearch."
5
+ readme = "README.md"
6
+ requires-python = ">=3.14"
7
+ dependencies = [
8
+ "beautifulsoup4>=4.12.2",
9
+ "httpx>=0.28.1",
10
+ "modelcontextprotocol>=1.0.1",
11
+ "pytest>=9.0.3,<9.1",
12
+ "pytest-asyncio>=0.22.0",
13
+ ]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Operating System :: OS Independent",
19
+ "Programming Language :: Python :: 3.14",
20
+ "Topic :: Software Development :: Libraries :: Python Modules",
21
+ ]
22
+
23
+
24
+ [project.scripts]
25
+ webprobe = "mcpwebprobe:main"
26
+
27
+ [build-system]
28
+ requires = ["uv_build>=0.9.10,<0.10.0"]
29
+ build-backend = "uv_build"
30
+
31
+ [dependency-groups]
32
+ dev = [
33
+ "build>=1.4.3",
34
+ "twine>=6.2.0",
35
+ ]
36
+
37
+ [project.urls]
38
+ Homepage = "https://github.com/Ddilibe/webprobe"
39
+ Issues = "https://github.com/Ddilibe/webprobe"
@@ -0,0 +1,24 @@
1
"""Convenience exports for the webprobe package."""

# Re-export the public API at package level so callers can write
# ``from mcpwebprobe import search`` instead of reaching into submodules.
from .api import (
    fetch_csdn,
    fetch_github,
    fetch_juejin,
    fetch_linuxdo,
    search,
)
from .logging import configure_logging, get_logger
from .main import main
from .server import WebProbeServer

# Explicit public surface of the package.
__all__ = [
    "search",
    "fetch_csdn",
    "fetch_linuxdo",
    "fetch_juejin",
    "fetch_github",
    "WebProbeServer",
    "configure_logging",
    "get_logger",
    "main",
]
@@ -0,0 +1,109 @@
1
+ import asyncio
2
+ import json
3
+ from typing import Iterable, List, Optional, Sequence
4
+
5
+ from mcpwebprobe.config import config
6
+ from mcpwebprobe.engine.registry import SEARCH_SERVICE
7
+ from mcpwebprobe.engine.search_service import (
8
+ normalize_engine_name,
9
+ resolve_requested_engines,
10
+ SearchExecutionResult,
11
+ )
12
+ from mcpwebprobe.engines.fetch_csdn import fetch_csdn_article
13
+ from mcpwebprobe.engines.fetch_juejin import fetch_juejin_article
14
+ from mcpwebprobe.engines.fetch_linuxdo import fetch_linuxdo_article
15
+ from mcpwebprobe.engines.github import fetch_github_readme
16
+ from mcpwebprobe.logging import get_logger
17
+
18
+ logger = get_logger(__name__)
19
+
20
+
21
def _normalize_requested_engines(engines: Optional[Sequence[str]]) -> List[str]:
    """Normalize caller-supplied engine names, dropping unrecognized ones.

    Each name is passed through ``normalize_engine_name``; falsy results
    (names the normalizer rejects) are discarded.
    """
    if not engines:
        return []
    candidates = (normalize_engine_name(engine) for engine in engines)
    return [name for name in candidates if name]
30
+
31
+
32
def _serialize_search(result: SearchExecutionResult) -> dict:
    """Convert a ``SearchExecutionResult`` into a JSON-friendly dict."""
    failures = [
        {"engine": failure.engine, "code": failure.code, "message": failure.message}
        for failure in result.partial_failures
    ]
    entries = [
        {
            "title": item.title,
            "url": item.url,
            "description": item.description,
            "source": item.source,
            "engine": item.engine,
        }
        for item in result.results
    ]
    return {
        "query": result.query,
        "engines": result.engines,
        "totalResults": result.total_results,
        "partialFailures": failures,
        "results": entries,
    }
56
+
57
+
58
def search(
    query: str,
    limit: int = 10,
    engines: Optional[Iterable[str]] = None,
) -> dict:
    """Run a search across the configured engines.

    Args:
        query: Non-empty search string.
        limit: Maximum number of results, within [1, 50].
        engines: Optional engine names; falls back to the configured default.

    Returns:
        A JSON-friendly dict (see ``_serialize_search``).

    Raises:
        ValueError: If ``query`` is blank or ``limit`` is out of range.
    """
    if not query or not query.strip():
        raise ValueError("query is required")

    if not 1 <= limit <= 50:
        raise ValueError("limit must be between 1 and 50")

    requested = _normalize_requested_engines(engines)
    allowed = [normalize_engine_name(engine) for engine in config.allowed_search_engines]
    fallback = config.default_search_engine
    resolved = resolve_requested_engines(requested or [fallback], allowed, fallback)

    # Drive the async search service from this synchronous entry point.
    execution = SEARCH_SERVICE.execute(query=query, engines=resolved, limit=limit)
    result = asyncio.run(execution)
    logger.debug(
        "Ran search for query=%s limit=%s engines=%s", query, limit, resolved
    )

    return _serialize_search(result)
90
+
91
+
92
def fetch_csdn(url: str) -> dict:
    """Synchronously fetch a CSDN article as a structured dict."""
    logger.debug("Fetching CSDN article %s", url)
    article_coro = fetch_csdn_article(url)
    return asyncio.run(article_coro)
95
+
96
+
97
def fetch_linuxdo(url: str) -> dict:
    """Synchronously fetch a linux.do topic as a structured dict."""
    logger.debug("Fetching linux.do topic %s", url)
    topic_coro = fetch_linuxdo_article(url)
    return asyncio.run(topic_coro)
100
+
101
+
102
def fetch_juejin(url: str) -> dict:
    """Synchronously fetch a Juejin article as a structured dict."""
    logger.debug("Fetching Juejin article %s", url)
    article_coro = fetch_juejin_article(url)
    return asyncio.run(article_coro)
105
+
106
+
107
def fetch_github(url: str) -> Optional[str]:
    """Synchronously fetch a repository README from GitHub, if present."""
    logger.debug("Fetching GitHub README %s", url)
    readme_coro = fetch_github_readme(url)
    return asyncio.run(readme_coro)
@@ -0,0 +1,262 @@
1
+ import os
2
+ from typing import Optional, List, Literal, Union
3
+ from dataclasses import dataclass
4
+ from urllib.parse import quote
5
+
6
+ from mcpwebprobe.logging import get_logger
7
+
8
+ # Type aliases for better type hints
9
+ SearchEngine = Literal[
10
+ "bing",
11
+ "duckduckgo",
12
+ "exa",
13
+ "brave",
14
+ "baidu",
15
+ "csdn",
16
+ "linuxdo",
17
+ "juejin",
18
+ "startpage",
19
+ ]
20
+ SearchMode = Literal["request", "auto", "playwright"]
21
+ PlaywrightPackage = Literal["auto", "playwright", "playwright-core"]
22
+
23
+
24
@dataclass
class AppConfig:
    """Runtime configuration resolved once from environment variables."""

    # Search engine configuration
    # Engine used when a caller does not request a specific one.
    default_search_engine: SearchEngine
    # List of allowed search engines (if empty, all engines are available)
    allowed_search_engines: List[str]
    # Search mode: request only, auto request then fallback, or force Playwright
    # Currently only affects Bing.
    search_mode: SearchMode
    # Proxy configuration
    proxy_url: Optional[str]
    use_proxy: bool
    # When True, web-content fetching skips TLS certificate verification.
    fetch_web_allow_insecure_tls: bool
    # Playwright configuration
    playwright_package: PlaywrightPackage
    playwright_module_path: Optional[str]
    playwright_executable_path: Optional[str]
    # ws endpoint takes precedence over cdp when both are set (warned below).
    playwright_ws_endpoint: Optional[str]
    playwright_cdp_endpoint: Optional[str]
    playwright_headless: bool
    playwright_navigation_timeout_ms: int
    # CORS configuration
    enable_cors: bool
    cors_origin: str
    # Server configuration (determined by MODE env var: 'both', 'http', or 'stdio')
    enable_http_server: bool
50
+
51
+
52
def read_optional_env(name: str) -> Optional[str]:
    """Return the stripped value of env var *name*, or None if unset/blank."""
    raw = os.environ.get(name)
    if raw is None:
        return None
    stripped = raw.strip()
    return stripped if stripped else None
59
+
60
+
61
def parse_allowed_search_engines(env_value: Optional[str]) -> List[str]:
    """Parse a comma-separated allow-list of search engine names.

    Surrounding whitespace is stripped from each entry, and empty segments
    (e.g. from trailing or doubled commas) are dropped so they are not later
    reported as "invalid" engines by the validation step.

    Args:
        env_value: Raw value of ``ALLOWED_SEARCH_ENGINES`` or ``None``.

    Returns:
        List of engine names; an empty list means "no restriction".
    """
    if not env_value:
        return []
    return [entry.strip() for entry in env_value.split(",") if entry.strip()]
66
+
67
+
68
# Valid search engines list
VALID_SEARCH_ENGINES = [
    "bing",
    "duckduckgo",
    "exa",
    "brave",
    "baidu",
    "csdn",
    "linuxdo",
    "juejin",
    "startpage",
]
VALID_SEARCH_MODES = ["request", "auto", "playwright"]
VALID_PLAYWRIGHT_PACKAGES = ["auto", "playwright", "playwright-core"]
# When "true", the startup configuration summary below is suppressed.
QUIET_STARTUP_LOGS = os.environ.get("OPEN_WEBSEARCH_QUIET_STARTUP") == "true"

# Read from environment variables or use defaults
config = AppConfig(
    # Search engine configuration
    default_search_engine=os.environ.get("DEFAULT_SEARCH_ENGINE", "bing"),  # type: ignore
    # Parse comma-separated list of allowed search engines
    allowed_search_engines=parse_allowed_search_engines(
        os.environ.get("ALLOWED_SEARCH_ENGINES")
    ),
    search_mode=os.environ.get("SEARCH_MODE", "auto"),  # type: ignore
    # Proxy configuration
    proxy_url=os.environ.get("PROXY_URL", "http://127.0.0.1:7890"),
    use_proxy=os.environ.get("USE_PROXY") == "true",
    fetch_web_allow_insecure_tls=os.environ.get("FETCH_WEB_INSECURE_TLS") == "true",
    playwright_package=os.environ.get("PLAYWRIGHT_PACKAGE", "auto"),  # type: ignore
    playwright_module_path=read_optional_env("PLAYWRIGHT_MODULE_PATH"),
    playwright_executable_path=read_optional_env("PLAYWRIGHT_EXECUTABLE_PATH"),
    playwright_ws_endpoint=read_optional_env("PLAYWRIGHT_WS_ENDPOINT"),
    playwright_cdp_endpoint=read_optional_env("PLAYWRIGHT_CDP_ENDPOINT"),
    # Headless unless PLAYWRIGHT_HEADLESS is explicitly "false".
    playwright_headless=os.environ.get("PLAYWRIGHT_HEADLESS") != "false",
    # NOTE(review): int() raises ValueError on a non-numeric env value before
    # the positive-value fallback below can apply — confirm that is intended.
    playwright_navigation_timeout_ms=int(
        os.environ.get("PLAYWRIGHT_NAVIGATION_TIMEOUT_MS", 20000)
    ),
    # CORS configuration
    enable_cors=os.environ.get("ENABLE_CORS") == "true",
    cors_origin=os.environ.get("CORS_ORIGIN", "*"),
    # Server configuration - determined by MODE environment variable
    # Modes: 'both' (default), 'http', 'stdio'
    enable_http_server=(
        (os.environ.get("MODE", "both") in ["both", "http"])
        if os.environ.get("MODE")
        else True
    ),
)
117
+
118
logger = get_logger(__name__)

# Validate default search engine
if config.default_search_engine not in VALID_SEARCH_ENGINES:
    logger.warning(
        'Invalid DEFAULT_SEARCH_ENGINE: "%s", falling back to "bing"',
        config.default_search_engine,
    )
    config.default_search_engine = "bing"

# Validate search mode
if config.search_mode not in VALID_SEARCH_MODES:
    logger.warning(
        'Invalid SEARCH_MODE: "%s", falling back to "auto"',
        config.search_mode,
    )
    config.search_mode = "auto"

# Validate Playwright package selection
if config.playwright_package not in VALID_PLAYWRIGHT_PACKAGES:
    logger.warning(
        'Invalid PLAYWRIGHT_PACKAGE: "%s", falling back to "auto"',
        config.playwright_package,
    )
    config.playwright_package = "auto"

# int() above guarantees an int here, so this effectively guards
# zero/negative timeout values.
if not (
    isinstance(config.playwright_navigation_timeout_ms, (int, float))
    and config.playwright_navigation_timeout_ms > 0
):
    logger.warning(
        'Invalid PLAYWRIGHT_NAVIGATION_TIMEOUT_MS: "%s", falling back to 20000',
        os.environ.get("PLAYWRIGHT_NAVIGATION_TIMEOUT_MS"),
    )
    config.playwright_navigation_timeout_ms = 20000

# Warn about conflicting/ignored remote-browser settings.
if config.playwright_ws_endpoint and config.playwright_cdp_endpoint:
    logger.warning(
        "Both PLAYWRIGHT_WS_ENDPOINT and PLAYWRIGHT_CDP_ENDPOINT are set, PLAYWRIGHT_WS_ENDPOINT will take precedence"
    )

if (
    config.playwright_ws_endpoint or config.playwright_cdp_endpoint
) and config.playwright_executable_path:
    logger.warning(
        "PLAYWRIGHT_EXECUTABLE_PATH is ignored when connecting to a remote browser endpoint"
    )

# Validate allowed search engines
if config.allowed_search_engines:
    # Filter out invalid engines
    invalid_engines = [
        engine
        for engine in config.allowed_search_engines
        if engine not in VALID_SEARCH_ENGINES
    ]
    if invalid_engines:
        logger.warning(
            "Invalid search engines detected and will be ignored: %s",
            ", ".join(invalid_engines),
        )

    config.allowed_search_engines = [
        engine
        for engine in config.allowed_search_engines
        if engine in VALID_SEARCH_ENGINES
    ]

    # If all engines were invalid, don't restrict (allow all engines)
    if not config.allowed_search_engines:
        logger.warning(
            "No valid search engines specified in the allowed list, all engines will be available"
        )
    # Check if default engine is in the allowed list
    elif config.default_search_engine not in config.allowed_search_engines:
        logger.warning(
            'Default search engine "%s" is not in the allowed engines list',
            config.default_search_engine,
        )
        # Update the default engine to the first allowed engine
        config.default_search_engine = config.allowed_search_engines[0]  # type: ignore
        logger.info(
            'Default search engine updated to "%s"',
            config.default_search_engine,
        )
201
+
202
# One-time startup summary; suppressed when OPEN_WEBSEARCH_QUIET_STARTUP=true.
if not QUIET_STARTUP_LOGS:
    # Log configuration
    logger.info("🔍 Default search engine: %s", config.default_search_engine)
    if config.allowed_search_engines:
        logger.info("🔍 Allowed search engines: %s", ", ".join(config.allowed_search_engines))
    else:
        logger.info("🔍 No search engine restrictions, all available engines can be used")
    logger.info(
        "🔍 Search mode: %s (currently only affects Bing)",
        config.search_mode.upper(),
    )

    if config.use_proxy:
        logger.info("🌐 Using proxy: %s", config.proxy_url)
    else:
        logger.info("🌐 No proxy configured (set USE_PROXY=true to enable)")

    if config.fetch_web_allow_insecure_tls:
        logger.warning(
            "⚠️ fetchWebContent TLS verification is disabled (FETCH_WEB_INSECURE_TLS=true)"
        )
    else:
        logger.info("🔐 fetchWebContent TLS verification is enabled")

    logger.info("🧭 Playwright client source: %s", config.playwright_package)
    if config.playwright_module_path:
        logger.info(
            "🧭 Playwright module path override: %s", config.playwright_module_path
        )
    # Only one browser target is reported: ws endpoint, then cdp endpoint,
    # then a local executable path.
    if config.playwright_ws_endpoint:
        logger.info(
            "🧭 Playwright remote endpoint (ws): %s", config.playwright_ws_endpoint
        )
    elif config.playwright_cdp_endpoint:
        logger.info(
            "🧭 Playwright remote endpoint (cdp): %s", config.playwright_cdp_endpoint
        )
    elif config.playwright_executable_path:
        logger.info(
            "🧭 Playwright executable path: %s", config.playwright_executable_path
        )
    logger.info("🧭 Playwright headless: %s", config.playwright_headless)
    logger.info(
        "🧭 Playwright navigation timeout: %sms",
        config.playwright_navigation_timeout_ms,
    )

    # Determine server mode from config
    mode = os.environ.get("MODE") or ("both" if config.enable_http_server else "stdio")
    logger.info("🖥️ Server mode: %s", mode.upper())

    if config.enable_http_server:
        if config.enable_cors:
            logger.info("🔒 CORS enabled with origin: %s", config.cors_origin)
        else:
            logger.info("🔒 CORS disabled (set ENABLE_CORS=true to enable)")
258
+
259
+
260
def get_proxy_url() -> Optional[str]:
    """Return the configured proxy URL when proxying is enabled, else None.

    The URL is returned verbatim. The previous implementation passed it
    through ``urllib.parse.quote``, which percent-encodes the scheme and
    port separators (``http://host:port`` -> ``http%3A//host%3Aport``,
    since ``quote``'s default ``safe`` is only ``"/"``), producing an
    unusable proxy address.
    """
    if config.use_proxy and config.proxy_url:
        return config.proxy_url
    return None
@@ -0,0 +1,5 @@
1
"""Core engine helpers."""

# Re-export the search-service primitives so callers can import them from
# the ``engine`` package directly.
from .search_service import SearchExecutionFailure, SearchExecutionResult, SearchService

__all__ = ["SearchExecutionFailure", "SearchExecutionResult", "SearchService"]
@@ -0,0 +1,35 @@
1
+ """Registry of supported search engines."""
2
+
3
+ from typing import Dict
4
+
5
+ from mcpwebprobe.engine.search_service import (
6
+ SearchEngineExecutor,
7
+ SearchExecutionResult,
8
+ SearchService,
9
+ SUPPORTED_SEARCH_ENGINES,
10
+ )
11
+ from mcpwebprobe.engines.baidu import search_baidu
12
+ from mcpwebprobe.engines.bing import search_bing
13
+ from mcpwebprobe.engines.brave import search_brave
14
+ from mcpwebprobe.engines.csdn import search_csdn
15
+ from mcpwebprobe.engines.duckduckgo import search_duckduckgo
16
+ from mcpwebprobe.engines.exa import search_exa
17
+ from mcpwebprobe.engines.juejin import search_juejin
18
+ from mcpwebprobe.engines.linuxdo import search_linuxdo
19
+ from mcpwebprobe.engines.startpage import search_startpage
20
+
21
# Maps an engine name to the callable that executes a search against it.
EngineMap = Dict[str, SearchEngineExecutor]

# Registry of every available search adapter, keyed by engine name.
ENGINE_MAP: EngineMap = {
    "baidu": search_baidu,
    "bing": search_bing,
    "brave": search_brave,
    "csdn": search_csdn,
    "duckduckgo": search_duckduckgo,
    "exa": search_exa,
    "juejin": search_juejin,
    "linuxdo": search_linuxdo,
    "startpage": search_startpage,
}

# Shared service instance built over the full registry.
SEARCH_SERVICE = SearchService(ENGINE_MAP)