mcpwebprobe 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ """Convenience exports for the webprobe package."""
2
+
3
+ from .api import (
4
+ fetch_csdn,
5
+ fetch_github,
6
+ fetch_juejin,
7
+ fetch_linuxdo,
8
+ search,
9
+ )
10
+ from .logging import configure_logging, get_logger
11
+ from .main import main
12
+ from .server import WebProbeServer
13
+
14
+ __all__ = [
15
+ "search",
16
+ "fetch_csdn",
17
+ "fetch_linuxdo",
18
+ "fetch_juejin",
19
+ "fetch_github",
20
+ "WebProbeServer",
21
+ "configure_logging",
22
+ "get_logger",
23
+ "main",
24
+ ]
mcpwebprobe/api.py ADDED
@@ -0,0 +1,109 @@
1
+ import asyncio
2
+ import json
3
+ from typing import Iterable, List, Optional, Sequence
4
+
5
+ from mcpwebprobe.config import config
6
+ from mcpwebprobe.engine.registry import SEARCH_SERVICE
7
+ from mcpwebprobe.engine.search_service import (
8
+ normalize_engine_name,
9
+ resolve_requested_engines,
10
+ SearchExecutionResult,
11
+ )
12
+ from mcpwebprobe.engines.fetch_csdn import fetch_csdn_article
13
+ from mcpwebprobe.engines.fetch_juejin import fetch_juejin_article
14
+ from mcpwebprobe.engines.fetch_linuxdo import fetch_linuxdo_article
15
+ from mcpwebprobe.engines.github import fetch_github_readme
16
+ from mcpwebprobe.logging import get_logger
17
+
18
+ logger = get_logger(__name__)
19
+
20
+
21
def _normalize_requested_engines(engines: Optional[Sequence[str]]) -> List[str]:
    """Normalize caller-supplied engine names to their canonical form.

    Args:
        engines: Engine names or aliases. ``None`` or an empty value yields an
            empty list. A single string is accepted as well and is treated as
            a comma-separated list of names (``"bing"`` or ``"bing, ddg"``)
            rather than being iterated character by character.

    Returns:
        The result of ``normalize_engine_name`` for each entry, in input
        order, with entries that normalize to an empty string dropped.
    """
    if not engines:
        return []

    # A bare string is itself iterable; without this guard "bing" would be
    # processed as the four "engines" "b", "i", "n", "g".
    if isinstance(engines, str):
        engines = engines.split(",")

    return [
        normalized
        for normalized in (normalize_engine_name(engine) for engine in engines)
        if normalized
    ]
30
+
31
+
32
def _serialize_search(result: SearchExecutionResult) -> dict:
    """Flatten a search execution result into a plain dict.

    The returned mapping uses camelCase keys (``totalResults``,
    ``partialFailures``) and contains only the attributes copied below from
    each failure and each result entry.
    """
    failures = []
    for failure in result.partial_failures:
        failures.append(
            {
                "engine": failure.engine,
                "code": failure.code,
                "message": failure.message,
            }
        )

    hits = []
    for hit in result.results:
        hits.append(
            {
                "title": hit.title,
                "url": hit.url,
                "description": hit.description,
                "source": hit.source,
                "engine": hit.engine,
            }
        )

    payload = {
        "query": result.query,
        "engines": result.engines,
        "totalResults": result.total_results,
        "partialFailures": failures,
        "results": hits,
    }
    return payload
56
+
57
+
58
def search(
    query: str,
    limit: int = 10,
    engines: Optional[Iterable[str]] = None,
) -> dict:
    """Run a blocking search across the requested (or default) engines.

    Args:
        query: Search text; must contain at least one non-whitespace character.
        limit: Total number of results wanted, between 1 and 50 inclusive.
        engines: Optional engine names or aliases. When none remain after
            normalization, the configured default engine is requested instead.

    Returns:
        The dict produced by ``_serialize_search`` for the execution result.

    Raises:
        ValueError: If ``query`` is blank or ``limit`` is outside 1..50.
    """
    has_text = bool(query) and bool(query.strip())
    if not has_text:
        raise ValueError("query is required")

    in_range = 1 <= limit <= 50
    if not in_range:
        raise ValueError("limit must be between 1 and 50")

    default_engine = config.default_search_engine
    candidates = _normalize_requested_engines(engines)
    if not candidates:
        candidates = [default_engine]

    # The allowed list is normalized the same way as the requested names so
    # the two can be compared directly.
    allowed = [normalize_engine_name(name) for name in config.allowed_search_engines]
    resolved = resolve_requested_engines(candidates, allowed, default_engine)

    # The service is async; drive it to completion on a fresh event loop.
    pending = SEARCH_SERVICE.execute(query=query, engines=resolved, limit=limit)
    result = asyncio.run(pending)
    logger.debug(
        "Ran search for query=%s limit=%s engines=%s", query, limit, resolved
    )

    return _serialize_search(result)
90
+
91
+
92
def fetch_csdn(url: str) -> dict:
    """Synchronously fetch a CSDN article.

    Logs the URL at debug level, then runs the ``fetch_csdn_article``
    coroutine to completion with ``asyncio.run`` and returns its result.
    """
    logger.debug("Fetching CSDN article %s", url)
    article = asyncio.run(fetch_csdn_article(url))
    return article
95
+
96
+
97
def fetch_linuxdo(url: str) -> dict:
    """Synchronously fetch a linux.do topic.

    Logs the URL at debug level, then runs the ``fetch_linuxdo_article``
    coroutine to completion with ``asyncio.run`` and returns its result.
    """
    logger.debug("Fetching linux.do topic %s", url)
    topic = asyncio.run(fetch_linuxdo_article(url))
    return topic
100
+
101
+
102
def fetch_juejin(url: str) -> dict:
    """Synchronously fetch a Juejin article.

    Logs the URL at debug level, then runs the ``fetch_juejin_article``
    coroutine to completion with ``asyncio.run`` and returns its result.
    """
    logger.debug("Fetching Juejin article %s", url)
    article = asyncio.run(fetch_juejin_article(url))
    return article
105
+
106
+
107
def fetch_github(url: str) -> Optional[str]:
    """Synchronously fetch a GitHub README.

    Logs the URL at debug level, then runs the ``fetch_github_readme``
    coroutine to completion with ``asyncio.run`` and returns its result,
    which per the annotation may be ``None``.
    """
    logger.debug("Fetching GitHub README %s", url)
    readme = asyncio.run(fetch_github_readme(url))
    return readme
mcpwebprobe/config.py ADDED
@@ -0,0 +1,262 @@
1
+ import os
2
+ from typing import Optional, List, Literal, Union
3
+ from dataclasses import dataclass
4
+ from urllib.parse import quote
5
+
6
+ from mcpwebprobe.logging import get_logger
7
+
8
+ # Type aliases for better type hints
9
+ SearchEngine = Literal[
10
+ "bing",
11
+ "duckduckgo",
12
+ "exa",
13
+ "brave",
14
+ "baidu",
15
+ "csdn",
16
+ "linuxdo",
17
+ "juejin",
18
+ "startpage",
19
+ ]
20
+ SearchMode = Literal["request", "auto", "playwright"]
21
+ PlaywrightPackage = Literal["auto", "playwright", "playwright-core"]
22
+
23
+
24
@dataclass
class AppConfig:
    """Runtime settings for the package.

    The module-level ``config`` instance is built from environment variables
    and then validated/adjusted in place, so fields are mutable.
    """

    # Search engine configuration
    default_search_engine: SearchEngine
    # List of allowed search engines (if empty, all engines are available)
    allowed_search_engines: List[str]
    # Search mode: request only, auto request then fallback, or force Playwright
    # Currently only affects Bing.
    search_mode: SearchMode
    # Proxy configuration
    # proxy_url is only handed out by get_proxy_url() when use_proxy is True.
    proxy_url: Optional[str]
    use_proxy: bool
    # When True, TLS verification for fetchWebContent is disabled.
    fetch_web_allow_insecure_tls: bool
    # Playwright configuration
    playwright_package: PlaywrightPackage
    playwright_module_path: Optional[str]
    playwright_executable_path: Optional[str]
    # Remote browser endpoints; the ws endpoint takes precedence when both are set.
    playwright_ws_endpoint: Optional[str]
    playwright_cdp_endpoint: Optional[str]
    playwright_headless: bool
    # Navigation timeout in milliseconds; must be positive.
    playwright_navigation_timeout_ms: int
    # CORS configuration
    enable_cors: bool
    cors_origin: str
    # Server configuration (determined by MODE env var: 'both', 'http', or 'stdio')
    enable_http_server: bool
50
+
51
+
52
def read_optional_env(name: str) -> Optional[str]:
    """Return the whitespace-stripped value of environment variable *name*.

    Yields ``None`` when the variable is unset or contains only whitespace.
    """
    raw = os.environ.get(name)
    if raw is None:
        return None

    stripped = raw.strip()
    return stripped or None
59
+
60
+
61
def parse_allowed_search_engines(env_value: Optional[str]) -> List[str]:
    """Parse a comma-separated list of allowed search engines.

    Args:
        env_value: Raw setting such as ``"bing, duckduckgo"``; may be ``None``
            or empty.

    Returns:
        The entries in input order with surrounding whitespace removed.
        Blank entries (from doubled or trailing commas, or whitespace-only
        items) are dropped so they are not later treated as engine names.
        ``None`` or an empty string yields an empty list.
    """
    if not env_value:
        return []

    stripped_entries = (entry.strip() for entry in env_value.split(","))
    return [entry for entry in stripped_entries if entry]
66
+
67
+
68
+ # Valid search engines list
69
+ VALID_SEARCH_ENGINES = [
70
+ "bing",
71
+ "duckduckgo",
72
+ "exa",
73
+ "brave",
74
+ "baidu",
75
+ "csdn",
76
+ "linuxdo",
77
+ "juejin",
78
+ "startpage",
79
+ ]
80
+ VALID_SEARCH_MODES = ["request", "auto", "playwright"]
81
+ VALID_PLAYWRIGHT_PACKAGES = ["auto", "playwright", "playwright-core"]
82
+ QUIET_STARTUP_LOGS = os.environ.get("OPEN_WEBSEARCH_QUIET_STARTUP") == "true"
83
+
84
def _read_int_env(name: str, fallback: int) -> int:
    """Read environment variable *name* as an integer without raising.

    Returns ``fallback`` when the variable is unset. A value that ``int()``
    cannot parse yields 0 instead of propagating ``ValueError``, so a
    malformed setting cannot abort the import of this module; the
    positivity check applied to the config value afterwards rejects the 0
    and restores the default.
    """
    raw = os.environ.get(name)
    if raw is None:
        return fallback
    try:
        return int(raw)
    except ValueError:
        return 0


# Read from environment variables or use defaults
config = AppConfig(
    # Search engine configuration
    default_search_engine=os.environ.get("DEFAULT_SEARCH_ENGINE", "bing"),  # type: ignore
    # Parse comma-separated list of allowed search engines
    allowed_search_engines=parse_allowed_search_engines(
        os.environ.get("ALLOWED_SEARCH_ENGINES")
    ),
    search_mode=os.environ.get("SEARCH_MODE", "auto"),  # type: ignore
    # Proxy configuration
    proxy_url=os.environ.get("PROXY_URL", "http://127.0.0.1:7890"),
    use_proxy=os.environ.get("USE_PROXY") == "true",
    fetch_web_allow_insecure_tls=os.environ.get("FETCH_WEB_INSECURE_TLS") == "true",
    playwright_package=os.environ.get("PLAYWRIGHT_PACKAGE", "auto"),  # type: ignore
    playwright_module_path=read_optional_env("PLAYWRIGHT_MODULE_PATH"),
    playwright_executable_path=read_optional_env("PLAYWRIGHT_EXECUTABLE_PATH"),
    playwright_ws_endpoint=read_optional_env("PLAYWRIGHT_WS_ENDPOINT"),
    playwright_cdp_endpoint=read_optional_env("PLAYWRIGHT_CDP_ENDPOINT"),
    # Headless unless explicitly disabled with PLAYWRIGHT_HEADLESS=false
    playwright_headless=os.environ.get("PLAYWRIGHT_HEADLESS") != "false",
    playwright_navigation_timeout_ms=_read_int_env(
        "PLAYWRIGHT_NAVIGATION_TIMEOUT_MS", 20000
    ),
    # CORS configuration
    enable_cors=os.environ.get("ENABLE_CORS") == "true",
    cors_origin=os.environ.get("CORS_ORIGIN", "*"),
    # Server configuration - determined by MODE environment variable
    # Modes: 'both' (default), 'http', 'stdio'
    # An unset or empty MODE counts as 'both', which enables the HTTP server.
    enable_http_server=(os.environ.get("MODE") or "both") in ("both", "http"),
)
117
+
118
logger = get_logger(__name__)

# Validate default search engine
if config.default_search_engine not in VALID_SEARCH_ENGINES:
    logger.warning(
        'Invalid DEFAULT_SEARCH_ENGINE: "%s", falling back to "bing"',
        config.default_search_engine,
    )
    config.default_search_engine = "bing"

# Validate search mode
if config.search_mode not in VALID_SEARCH_MODES:
    logger.warning(
        'Invalid SEARCH_MODE: "%s", falling back to "auto"',
        config.search_mode,
    )
    config.search_mode = "auto"

# Validate Playwright package selection
if config.playwright_package not in VALID_PLAYWRIGHT_PACKAGES:
    logger.warning(
        'Invalid PLAYWRIGHT_PACKAGE: "%s", falling back to "auto"',
        config.playwright_package,
    )
    config.playwright_package = "auto"

# Reject non-numeric or non-positive navigation timeouts; the warning shows
# the raw environment value rather than the parsed one.
if not (
    isinstance(config.playwright_navigation_timeout_ms, (int, float))
    and config.playwright_navigation_timeout_ms > 0
):
    logger.warning(
        'Invalid PLAYWRIGHT_NAVIGATION_TIMEOUT_MS: "%s", falling back to 20000',
        os.environ.get("PLAYWRIGHT_NAVIGATION_TIMEOUT_MS"),
    )
    config.playwright_navigation_timeout_ms = 20000

# Warn only; neither endpoint is cleared here.
if config.playwright_ws_endpoint and config.playwright_cdp_endpoint:
    logger.warning(
        "Both PLAYWRIGHT_WS_ENDPOINT and PLAYWRIGHT_CDP_ENDPOINT are set, PLAYWRIGHT_WS_ENDPOINT will take precedence"
    )

# Warn only; the executable path stays set on the config object.
if (
    config.playwright_ws_endpoint or config.playwright_cdp_endpoint
) and config.playwright_executable_path:
    logger.warning(
        "PLAYWRIGHT_EXECUTABLE_PATH is ignored when connecting to a remote browser endpoint"
    )

# Validate allowed search engines
if config.allowed_search_engines:
    # Filter out invalid engines
    invalid_engines = [
        engine
        for engine in config.allowed_search_engines
        if engine not in VALID_SEARCH_ENGINES
    ]
    if invalid_engines:
        logger.warning(
            "Invalid search engines detected and will be ignored: %s",
            ", ".join(invalid_engines),
        )

    # Keep only the recognized engines, preserving their original order.
    config.allowed_search_engines = [
        engine
        for engine in config.allowed_search_engines
        if engine in VALID_SEARCH_ENGINES
    ]

    # If all engines were invalid, don't restrict (allow all engines)
    if not config.allowed_search_engines:
        logger.warning(
            "No valid search engines specified in the allowed list, all engines will be available"
        )
    # Check if default engine is in the allowed list
    elif config.default_search_engine not in config.allowed_search_engines:
        logger.warning(
            'Default search engine "%s" is not in the allowed engines list',
            config.default_search_engine,
        )
        # Update the default engine to the first allowed engine
        config.default_search_engine = config.allowed_search_engines[0]  # type: ignore
        logger.info(
            'Default search engine updated to "%s"',
            config.default_search_engine,
        )
201
+
202
+ if not QUIET_STARTUP_LOGS:
203
+ # Log configuration
204
+ logger.info("🔍 Default search engine: %s", config.default_search_engine)
205
+ if config.allowed_search_engines:
206
+ logger.info("🔍 Allowed search engines: %s", ", ".join(config.allowed_search_engines))
207
+ else:
208
+ logger.info("🔍 No search engine restrictions, all available engines can be used")
209
+ logger.info(
210
+ "🔍 Search mode: %s (currently only affects Bing)",
211
+ config.search_mode.upper(),
212
+ )
213
+
214
+ if config.use_proxy:
215
+ logger.info("🌐 Using proxy: %s", config.proxy_url)
216
+ else:
217
+ logger.info("🌐 No proxy configured (set USE_PROXY=true to enable)")
218
+
219
+ if config.fetch_web_allow_insecure_tls:
220
+ logger.warning(
221
+ "⚠️ fetchWebContent TLS verification is disabled (FETCH_WEB_INSECURE_TLS=true)"
222
+ )
223
+ else:
224
+ logger.info("🔐 fetchWebContent TLS verification is enabled")
225
+
226
+ logger.info("🧭 Playwright client source: %s", config.playwright_package)
227
+ if config.playwright_module_path:
228
+ logger.info(
229
+ "🧭 Playwright module path override: %s", config.playwright_module_path
230
+ )
231
+ if config.playwright_ws_endpoint:
232
+ logger.info(
233
+ "🧭 Playwright remote endpoint (ws): %s", config.playwright_ws_endpoint
234
+ )
235
+ elif config.playwright_cdp_endpoint:
236
+ logger.info(
237
+ "🧭 Playwright remote endpoint (cdp): %s", config.playwright_cdp_endpoint
238
+ )
239
+ elif config.playwright_executable_path:
240
+ logger.info(
241
+ "🧭 Playwright executable path: %s", config.playwright_executable_path
242
+ )
243
+ logger.info("🧭 Playwright headless: %s", config.playwright_headless)
244
+ logger.info(
245
+ "🧭 Playwright navigation timeout: %sms",
246
+ config.playwright_navigation_timeout_ms,
247
+ )
248
+
249
+ # Determine server mode from config
250
+ mode = os.environ.get("MODE") or ("both" if config.enable_http_server else "stdio")
251
+ logger.info("🖥️ Server mode: %s", mode.upper())
252
+
253
+ if config.enable_http_server:
254
+ if config.enable_cors:
255
+ logger.info("🔒 CORS enabled with origin: %s", config.cors_origin)
256
+ else:
257
+ logger.info("🔒 CORS disabled (set ENABLE_CORS=true to enable)")
258
+
259
+
260
def get_proxy_url() -> Optional[str]:
    """Return the configured proxy URL when the proxy is enabled.

    Returns:
        ``config.proxy_url`` with unsafe characters percent-encoded, or
        ``None`` when ``config.use_proxy`` is false or no URL is set.
    """
    if not (config.use_proxy and config.proxy_url):
        return None

    # quote() defaults to safe="/", which would also escape the ":" after the
    # scheme and before the port ("http%3A//127.0.0.1%3A7890"). Keep the
    # characters that give a URL its structure, plus "%" so credentials that
    # are already percent-encoded are not encoded a second time.
    url_safe_chars = ":/?#[]@!$&'()*+,;=%"
    return quote(config.proxy_url, safe=url_safe_chars)
@@ -0,0 +1,5 @@
1
+ """Core engine helpers."""
2
+
3
+ from .search_service import SearchExecutionFailure, SearchExecutionResult, SearchService
4
+
5
+ __all__ = ["SearchExecutionFailure", "SearchExecutionResult", "SearchService"]
@@ -0,0 +1,35 @@
1
+ """Registry of supported search engines."""
2
+
3
+ from typing import Dict
4
+
5
+ from mcpwebprobe.engine.search_service import (
6
+ SearchEngineExecutor,
7
+ SearchExecutionResult,
8
+ SearchService,
9
+ SUPPORTED_SEARCH_ENGINES,
10
+ )
11
+ from mcpwebprobe.engines.baidu import search_baidu
12
+ from mcpwebprobe.engines.bing import search_bing
13
+ from mcpwebprobe.engines.brave import search_brave
14
+ from mcpwebprobe.engines.csdn import search_csdn
15
+ from mcpwebprobe.engines.duckduckgo import search_duckduckgo
16
+ from mcpwebprobe.engines.exa import search_exa
17
+ from mcpwebprobe.engines.juejin import search_juejin
18
+ from mcpwebprobe.engines.linuxdo import search_linuxdo
19
+ from mcpwebprobe.engines.startpage import search_startpage
20
+
21
# Mapping type: engine name -> async search executor callable.
EngineMap = Dict[str, SearchEngineExecutor]

# One executor per supported engine, keyed by the engine's canonical name.
ENGINE_MAP: EngineMap = {
    "baidu": search_baidu,
    "bing": search_bing,
    "brave": search_brave,
    "csdn": search_csdn,
    "duckduckgo": search_duckduckgo,
    "exa": search_exa,
    "juejin": search_juejin,
    "linuxdo": search_linuxdo,
    "startpage": search_startpage,
}

# Module-level service instance built over ENGINE_MAP.
SEARCH_SERVICE = SearchService(ENGINE_MAP)
@@ -0,0 +1,160 @@
1
+ """Search execution helpers for multi-engine queries."""
2
+
3
+ import asyncio
4
+ import re
5
+ from dataclasses import dataclass
6
+ from typing import (
7
+ Awaitable,
8
+ Callable,
9
+ Dict,
10
+ List,
11
+ MutableMapping,
12
+ Sequence,
13
+ )
14
+
15
+ from mcpwebprobe.types import SearchResult
16
+
17
+ SearchEngineExecutor = Callable[[str, int], Awaitable[List[SearchResult]]]
18
+ SearchEngineExecutorMap = Dict[str, SearchEngineExecutor]
19
+
20
+ SUPPORTED_SEARCH_ENGINES = [
21
+ "baidu",
22
+ "bing",
23
+ "linuxdo",
24
+ "csdn",
25
+ "duckduckgo",
26
+ "exa",
27
+ "brave",
28
+ "juejin",
29
+ "startpage",
30
+ ]
31
+
32
+ SEARCH_ENGINE_SYNONYMS = {
33
+ "bd": "baidu",
34
+ "ddg": "duckduckgo",
35
+ "sp": "startpage",
36
+ }
37
+
38
+
39
def normalize_engine_name(engine: str) -> str:
    """Map an engine identifier to its canonical name.

    The input is trimmed and lower-cased, then compared in a compact form
    with whitespace, dots, underscores and hyphens removed. A known synonym
    resolves to its target, a supported name is returned in compact form,
    and anything else comes back merely trimmed and lower-cased.
    """
    lowered = engine.strip().lower()
    key = re.sub(r"[\s._-]+", "", lowered)

    canonical = SEARCH_ENGINE_SYNONYMS.get(key)
    if canonical is not None:
        return canonical

    return key if key in SUPPORTED_SEARCH_ENGINES else lowered
51
+
52
+
53
def distribute_limit(total_limit: int, engine_count: int) -> List[int]:
    """Split ``total_limit`` into ``engine_count`` near-equal shares.

    Earlier positions receive the leftover units, so shares differ by at
    most one. A non-positive ``engine_count`` yields an empty list.
    """
    if engine_count <= 0:
        return []

    quotient, leftover = divmod(total_limit, engine_count)

    allocation = []
    for position in range(engine_count):
        bonus = 1 if position < leftover else 0
        allocation.append(quotient + bonus)
    return allocation
62
+
63
+
64
def resolve_requested_engines(
    requested: Sequence[str], allowed: Sequence[str], default_engine: str
) -> List[str]:
    """Pick the engines to query from the requested and allowed lists.

    With nothing requested, only the default engine is returned. An empty
    ``allowed`` list means no restriction, so the request passes through as
    a new list. Otherwise the request is narrowed to allowed names, falling
    back to the default engine if none remain.
    """
    if not requested:
        return [default_engine]

    if not allowed:
        return list(requested)

    permitted = []
    for name in requested:
        if name in allowed:
            permitted.append(name)

    if permitted:
        return permitted
    return [default_engine]
76
+
77
+
78
@dataclass
class SearchExecutionFailure:
    """Record of one engine that could not contribute results to a search."""

    # Name of the engine that failed.
    engine: str
    # Short machine-readable category; SearchService.execute emits
    # "unsupported_engine" and "engine_error".
    code: str
    # Human-readable description of the failure.
    message: str
83
+
84
+
85
@dataclass
class SearchExecutionResult:
    """Aggregate outcome of one multi-engine search request."""

    # The query that was executed.
    query: str
    # Engines the search was requested for.
    engines: List[str]
    # Number of entries in ``results``.
    total_results: int
    # Combined results from all engines.
    results: List[SearchResult]
    # Engines that failed or were unsupported.
    partial_failures: List[SearchExecutionFailure]
92
+
93
+
94
class SearchService:
    """Executor for multi-engine search requests.

    Holds a mapping from engine name to an async executor callable and fans
    one query out to the requested engines concurrently, collecting
    per-engine failures instead of aborting the whole search.
    """

    def __init__(self, engine_map: SearchEngineExecutorMap):
        """Store ``engine_map`` (by reference) as the executor lookup table."""
        self.engine_map: MutableMapping[str, SearchEngineExecutor] = engine_map

    async def execute(
        self,
        *,
        query: str,
        engines: List[str],
        limit: int,
    ) -> SearchExecutionResult:
        """Run ``query`` against ``engines`` and merge the results.

        Args:
            query: Search text; surrounding whitespace is stripped.
            engines: Engine names to query, looked up in ``engine_map``.
            limit: Total number of results wanted across all engines.

        Returns:
            A ``SearchExecutionResult`` whose results are concatenated in
            engine order and cut to ``limit``. Engines missing from the map
            and engines whose executor raised are listed in
            ``partial_failures``.

        Raises:
            ValueError: If the stripped query is empty, ``limit`` is not
                positive, or ``engines`` is empty.
        """
        clean_query = query.strip()
        if not clean_query:
            raise ValueError("Search query must not be empty")

        if limit <= 0:
            raise ValueError("Limit must be greater than zero")

        if not engines:
            raise ValueError("At least one search engine is required")

        # NOTE: when limit < len(engines) the trailing engines are handed a
        # per-engine limit of 0 and are still invoked.
        limits = distribute_limit(limit, len(engines))
        partial_failures: List[SearchExecutionFailure] = []
        tasks: List[Awaitable[List[SearchResult]]] = []

        for engine, engine_limit in zip(engines, limits):
            executor = self.engine_map.get(engine)
            if executor is None:
                partial_failures.append(
                    SearchExecutionFailure(
                        engine=engine,
                        code="unsupported_engine",
                        message=f"Unsupported search engine: {engine}",
                    )
                )
                continue

            # Loop variables are bound as defaults so each coroutine keeps
            # the values from its own iteration.
            async def _run(executor=executor, engine=engine, engine_limit=engine_limit):
                try:
                    return await executor(clean_query, engine_limit)
                except Exception as error:  # noqa: BLE001
                    # Deliberate best-effort boundary: one failing engine
                    # must not sink the others. Exceptions raised without
                    # arguments stringify to "", so fall back to the class
                    # name to keep the failure message informative.
                    partial_failures.append(
                        SearchExecutionFailure(
                            engine=engine,
                            code="engine_error",
                            message=str(error) or type(error).__name__,
                        )
                    )
                    return []

            tasks.append(_run())

        gathered_results: List[SearchResult] = []
        if tasks:
            # gather() returns results in task order, i.e. engine order.
            for chunk in await asyncio.gather(*tasks):
                gathered_results.extend(chunk)

        trimmed_results = gathered_results[:limit]
        return SearchExecutionResult(
            query=clean_query,
            engines=engines,
            total_results=len(trimmed_results),
            results=trimmed_results,
            partial_failures=partial_failures,
        )
@@ -0,0 +1,23 @@
1
+ """Search engine implementations."""
2
+
3
+ from .baidu import search_baidu
4
+ from .bing import search_bing
5
+ from .brave import search_brave
6
+ from .csdn import search_csdn
7
+ from .duckduckgo import search_duckduckgo
8
+ from .exa import search_exa
9
+ from .juejin import search_juejin
10
+ from .linuxdo import search_linuxdo
11
+ from .startpage import search_startpage
12
+
13
+ __all__ = [
14
+ "search_baidu",
15
+ "search_bing",
16
+ "search_brave",
17
+ "search_csdn",
18
+ "search_duckduckgo",
19
+ "search_exa",
20
+ "search_juejin",
21
+ "search_linuxdo",
22
+ "search_startpage",
23
+ ]