PyPI - mcpwebprobe - Versions diffs - 0.1.0__py3-none-any.whl - Mend

mcpwebprobe 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

mcpwebprobe/__init__.py +24 -0
mcpwebprobe/api.py +109 -0
mcpwebprobe/config.py +262 -0
mcpwebprobe/engine/__init__.py +5 -0
mcpwebprobe/engine/registry.py +35 -0
mcpwebprobe/engine/search_service.py +160 -0
mcpwebprobe/engines/__init__.py +23 -0
mcpwebprobe/engines/baidu.py +105 -0
mcpwebprobe/engines/bing.py +190 -0
mcpwebprobe/engines/brave.py +105 -0
mcpwebprobe/engines/csdn.py +73 -0
mcpwebprobe/engines/duckduckgo.py +124 -0
mcpwebprobe/engines/exa.py +97 -0
mcpwebprobe/engines/fetch_csdn.py +12 -0
mcpwebprobe/engines/fetch_juejin.py +76 -0
mcpwebprobe/engines/fetch_linuxdo.py +59 -0
mcpwebprobe/engines/github.py +77 -0
mcpwebprobe/engines/juejin.py +114 -0
mcpwebprobe/engines/linuxdo.py +44 -0
mcpwebprobe/engines/startpage.py +192 -0
mcpwebprobe/logging.py +38 -0
mcpwebprobe/main.py +161 -0
mcpwebprobe/server.py +107 -0
mcpwebprobe/types.py +11 -0
mcpwebprobe/utils/browser_cookies.py +17 -0
mcpwebprobe/utils/cookies.py +273 -0
mcpwebprobe/utils/csdn.py +361 -0
mcpwebprobe/utils/duckduckgo.py +227 -0
mcpwebprobe/utils/http_client.py +363 -0
mcpwebprobe/utils/playwright.py +196 -0
mcpwebprobe/utils/urls.py +117 -0
mcpwebprobe-0.1.0.dist-info/METADATA +97 -0
mcpwebprobe-0.1.0.dist-info/RECORD +35 -0
mcpwebprobe-0.1.0.dist-info/WHEEL +4 -0
mcpwebprobe-0.1.0.dist-info/entry_points.txt +3 -0

mcpwebprobe/__init__.py ADDED Viewed

@@ -0,0 +1,24 @@
+"""Convenience exports for the webprobe package."""
+from .api import (
+    fetch_csdn,
+    fetch_github,
+    fetch_juejin,
+    fetch_linuxdo,
+    search,
+)
+from .logging import configure_logging, get_logger
+from .main import main
+from .server import WebProbeServer
+__all__ = [
+    "search",
+    "fetch_csdn",
+    "fetch_linuxdo",
+    "fetch_juejin",
+    "fetch_github",
+    "WebProbeServer",
+    "configure_logging",
+    "get_logger",
+    "main",
+]

mcpwebprobe/api.py ADDED Viewed

@@ -0,0 +1,109 @@
+import asyncio
+import json
+from typing import Iterable, List, Optional, Sequence
+from mcpwebprobe.config import config
+from mcpwebprobe.engine.registry import SEARCH_SERVICE
+from mcpwebprobe.engine.search_service import (
+    normalize_engine_name,
+    resolve_requested_engines,
+    SearchExecutionResult,
+)
+from mcpwebprobe.engines.fetch_csdn import fetch_csdn_article
+from mcpwebprobe.engines.fetch_juejin import fetch_juejin_article
+from mcpwebprobe.engines.fetch_linuxdo import fetch_linuxdo_article
+from mcpwebprobe.engines.github import fetch_github_readme
+from mcpwebprobe.logging import get_logger
+logger = get_logger(__name__)
+def _normalize_requested_engines(engines: Optional[Sequence[str]]) -> List[str]:
+    """Return canonical engine names for a caller-supplied engine selection.
+
+    Each entry is passed through ``normalize_engine_name``; entries that
+    normalize to an empty string are dropped, and a name that appears more
+    than once (directly or via a synonym) is kept only at its first position
+    so the same engine is not queried twice. A bare string is treated as a
+    comma-separated list instead of being iterated character by character.
+
+    Returns an empty list when *engines* is None or empty.
+    """
+    if not engines:
+        return []
+    if isinstance(engines, str):
+        # A str is itself a Sequence[str]; split it rather than walk its chars.
+        engines = engines.split(",")
+    requested: List[str] = []
+    for engine in engines:
+        normalized = normalize_engine_name(engine)
+        if normalized and normalized not in requested:
+            requested.append(normalized)
+    return requested
+def _serialize_search(result: SearchExecutionResult) -> dict:
+    """Flatten a search execution result into a plain dict with camelCase keys."""
+    failures = []
+    for failure in result.partial_failures:
+        failures.append(
+            {
+                "engine": failure.engine,
+                "code": failure.code,
+                "message": failure.message,
+            }
+        )
+    hits = []
+    for hit in result.results:
+        hits.append(
+            {
+                "title": hit.title,
+                "url": hit.url,
+                "description": hit.description,
+                "source": hit.source,
+                "engine": hit.engine,
+            }
+        )
+    payload = {
+        "query": result.query,
+        "engines": result.engines,
+        "totalResults": result.total_results,
+        "partialFailures": failures,
+        "results": hits,
+    }
+    return payload
+def search(
+    query: str,
+    limit: int = 10,
+    engines: Optional[Iterable[str]] = None,
+) -> dict:
+    """
+    Run a search across the configured engines.
+
+    The requested engine names are normalized and filtered against the
+    configured allow-list; when none are requested the configured default
+    engine is used. The coroutine is driven with ``asyncio.run``, so this
+    function must not be called from a thread that already runs an event loop.
+
+    Raises ValueError when *query* is blank or *limit* is outside 1..50.
+    """
+    if not (query and query.strip()):
+        raise ValueError("query is required")
+    if not (1 <= limit <= 50):
+        raise ValueError("limit must be between 1 and 50")
+    candidates = _normalize_requested_engines(engines) or [
+        config.default_search_engine
+    ]
+    allowed = [normalize_engine_name(name) for name in config.allowed_search_engines]
+    resolved = resolve_requested_engines(
+        candidates, allowed, config.default_search_engine
+    )
+    pending = SEARCH_SERVICE.execute(query=query, engines=resolved, limit=limit)
+    outcome = asyncio.run(pending)
+    logger.debug(
+        "Ran search for query=%s limit=%s engines=%s", query, limit, resolved
+    )
+    return _serialize_search(outcome)
+def fetch_csdn(url: str) -> dict:
+    """Blocking wrapper that runs ``fetch_csdn_article(url)`` to completion."""
+    logger.debug("Fetching CSDN article %s", url)
+    article = asyncio.run(fetch_csdn_article(url))
+    return article
+def fetch_linuxdo(url: str) -> dict:
+    """Blocking wrapper that runs ``fetch_linuxdo_article(url)`` to completion."""
+    logger.debug("Fetching linux.do topic %s", url)
+    topic = asyncio.run(fetch_linuxdo_article(url))
+    return topic
+def fetch_juejin(url: str) -> dict:
+    """Blocking wrapper that runs ``fetch_juejin_article(url)`` to completion."""
+    logger.debug("Fetching Juejin article %s", url)
+    article = asyncio.run(fetch_juejin_article(url))
+    return article
+def fetch_github(url: str) -> Optional[str]:
+    """Blocking wrapper that runs ``fetch_github_readme(url)`` to completion."""
+    logger.debug("Fetching GitHub README %s", url)
+    readme = asyncio.run(fetch_github_readme(url))
+    return readme

mcpwebprobe/config.py ADDED Viewed

@@ -0,0 +1,262 @@
+import os
+from typing import Optional, List, Literal, Union
+from dataclasses import dataclass
+from urllib.parse import quote
+from mcpwebprobe.logging import get_logger
+# Type aliases for better type hints
+# Canonical identifiers of the search engines a configuration may name.
+SearchEngine = Literal[
+    "bing",
+    "duckduckgo",
+    "exa",
+    "brave",
+    "baidu",
+    "csdn",
+    "linuxdo",
+    "juejin",
+    "startpage",
+]
+# Accepted values for the SEARCH_MODE setting.
+SearchMode = Literal["request", "auto", "playwright"]
+# Accepted values for the PLAYWRIGHT_PACKAGE setting.
+PlaywrightPackage = Literal["auto", "playwright", "playwright-core"]
+@dataclass
+class AppConfig:
+    """Runtime settings for the package.
+
+    The module-level ``config`` instance is built from environment variables
+    and then adjusted in place by the validation code that follows it.
+    """
+    # Search engine configuration
+    default_search_engine: SearchEngine
+    # List of allowed search engines (if empty, all engines are available)
+    allowed_search_engines: List[str]
+    # Search mode: request only, auto request then fallback, or force Playwright
+    # Currently only affects Bing.
+    search_mode: SearchMode
+    # Proxy configuration
+    proxy_url: Optional[str]
+    use_proxy: bool
+    # When True, TLS verification is disabled for fetchWebContent (see startup log).
+    fetch_web_allow_insecure_tls: bool
+    # Playwright configuration
+    playwright_package: PlaywrightPackage
+    playwright_module_path: Optional[str]
+    playwright_executable_path: Optional[str]
+    # Remote browser endpoints; the ws endpoint takes precedence when both are set.
+    playwright_ws_endpoint: Optional[str]
+    playwright_cdp_endpoint: Optional[str]
+    playwright_headless: bool
+    playwright_navigation_timeout_ms: int
+    # CORS configuration
+    enable_cors: bool
+    cors_origin: str
+    # Server configuration (determined by MODE env var: 'both', 'http', or 'stdio')
+    enable_http_server: bool
+def read_optional_env(name: str) -> Optional[str]:
+    """Return the stripped value of environment variable *name*, or None if unset or blank."""
+    raw = os.environ.get(name)
+    if raw is None:
+        return None
+    trimmed = raw.strip()
+    return trimmed or None
+def parse_allowed_search_engines(env_value: Optional[str]) -> List[str]:
+    """Parse a comma-separated list of allowed search engines.
+
+    Entries are stripped and lower-cased, and blank entries (for example from
+    a trailing comma or ``"bing, ,exa"``) are discarded so they are not later
+    reported as invalid engine names. Returns an empty list when *env_value*
+    is None or empty.
+    """
+    if not env_value:
+        return []
+    names = (part.strip().lower() for part in env_value.split(","))
+    return [name for name in names if name]
+# Valid search engines list
+# Runtime counterpart of the SearchEngine Literal; keep both lists in sync.
+VALID_SEARCH_ENGINES = [
+    "bing",
+    "duckduckgo",
+    "exa",
+    "brave",
+    "baidu",
+    "csdn",
+    "linuxdo",
+    "juejin",
+    "startpage",
+]
+# Runtime counterparts of the SearchMode and PlaywrightPackage Literals.
+VALID_SEARCH_MODES = ["request", "auto", "playwright"]
+VALID_PLAYWRIGHT_PACKAGES = ["auto", "playwright", "playwright-core"]
+# True only when OPEN_WEBSEARCH_QUIET_STARTUP is exactly "true"; it suppresses
+# the configuration summary logged at import time.
+QUIET_STARTUP_LOGS = os.environ.get("OPEN_WEBSEARCH_QUIET_STARTUP") == "true"
+def _read_int_env(name: str, default: int) -> int:
+    """Read an integer environment variable without raising on bad input.
+
+    Returns *default* when the variable is unset. An unparsable value yields
+    0, which the positive-timeout validation further down in this module
+    rejects with a warning and replaces by the fallback, instead of aborting
+    the import with ValueError.
+    """
+    raw = os.environ.get(name)
+    if raw is None:
+        return default
+    try:
+        return int(raw)
+    except ValueError:
+        return 0
+# Read from environment variables or use defaults
+config = AppConfig(
+    # Search engine configuration
+    default_search_engine=os.environ.get("DEFAULT_SEARCH_ENGINE", "bing"),  # type: ignore
+    # Parse comma-separated list of allowed search engines
+    allowed_search_engines=parse_allowed_search_engines(
+        os.environ.get("ALLOWED_SEARCH_ENGINES")
+    ),
+    search_mode=os.environ.get("SEARCH_MODE", "auto"),  # type: ignore
+    # Proxy configuration
+    proxy_url=os.environ.get("PROXY_URL", "http://127.0.0.1:7890"),
+    # Boolean flags are enabled only by the exact string "true".
+    use_proxy=os.environ.get("USE_PROXY") == "true",
+    fetch_web_allow_insecure_tls=os.environ.get("FETCH_WEB_INSECURE_TLS") == "true",
+    playwright_package=os.environ.get("PLAYWRIGHT_PACKAGE", "auto"),  # type: ignore
+    playwright_module_path=read_optional_env("PLAYWRIGHT_MODULE_PATH"),
+    playwright_executable_path=read_optional_env("PLAYWRIGHT_EXECUTABLE_PATH"),
+    playwright_ws_endpoint=read_optional_env("PLAYWRIGHT_WS_ENDPOINT"),
+    playwright_cdp_endpoint=read_optional_env("PLAYWRIGHT_CDP_ENDPOINT"),
+    # Headless unless explicitly disabled with the exact string "false".
+    playwright_headless=os.environ.get("PLAYWRIGHT_HEADLESS") != "false",
+    # Parsed defensively: a malformed value must not crash the import.
+    playwright_navigation_timeout_ms=_read_int_env(
+        "PLAYWRIGHT_NAVIGATION_TIMEOUT_MS", 20000
+    ),
+    # CORS configuration
+    enable_cors=os.environ.get("ENABLE_CORS") == "true",
+    cors_origin=os.environ.get("CORS_ORIGIN", "*"),
+    # Server configuration - determined by MODE environment variable
+    # Modes: 'both' (default), 'http', 'stdio'
+    # An unset or empty MODE counts as 'both'.
+    enable_http_server=(os.environ.get("MODE") or "both") in ("both", "http"),
+)
+# Import-time validation: each invalid setting is logged and replaced by its
+# default so the module-level ``config`` is usable afterwards.
+logger = get_logger(__name__)
+# Validate default search engine
+if config.default_search_engine not in VALID_SEARCH_ENGINES:
+    logger.warning(
+        'Invalid DEFAULT_SEARCH_ENGINE: "%s", falling back to "bing"',
+        config.default_search_engine,
+    )
+    config.default_search_engine = "bing"
+# Validate search mode
+if config.search_mode not in VALID_SEARCH_MODES:
+    logger.warning(
+        'Invalid SEARCH_MODE: "%s", falling back to "auto"',
+        config.search_mode,
+    )
+    config.search_mode = "auto"
+# Validate Playwright package selection
+if config.playwright_package not in VALID_PLAYWRIGHT_PACKAGES:
+    logger.warning(
+        'Invalid PLAYWRIGHT_PACKAGE: "%s", falling back to "auto"',
+        config.playwright_package,
+    )
+    config.playwright_package = "auto"
+# The navigation timeout must be a positive number; the warning reports the
+# raw environment value rather than the parsed one.
+if not (
+    isinstance(config.playwright_navigation_timeout_ms, (int, float))
+    and config.playwright_navigation_timeout_ms > 0
+):
+    logger.warning(
+        'Invalid PLAYWRIGHT_NAVIGATION_TIMEOUT_MS: "%s", falling back to 20000',
+        os.environ.get("PLAYWRIGHT_NAVIGATION_TIMEOUT_MS"),
+    )
+    config.playwright_navigation_timeout_ms = 20000
+# Conflicting Playwright endpoint settings are only reported, not changed.
+if config.playwright_ws_endpoint and config.playwright_cdp_endpoint:
+    logger.warning(
+        "Both PLAYWRIGHT_WS_ENDPOINT and PLAYWRIGHT_CDP_ENDPOINT are set, PLAYWRIGHT_WS_ENDPOINT will take precedence"
+    )
+if (
+    config.playwright_ws_endpoint or config.playwright_cdp_endpoint
+) and config.playwright_executable_path:
+    logger.warning(
+        "PLAYWRIGHT_EXECUTABLE_PATH is ignored when connecting to a remote browser endpoint"
+    )
+# Validate allowed search engines
+if config.allowed_search_engines:
+    # Filter out invalid engines
+    invalid_engines = [
+        engine
+        for engine in config.allowed_search_engines
+        if engine not in VALID_SEARCH_ENGINES
+    ]
+    if invalid_engines:
+        logger.warning(
+            "Invalid search engines detected and will be ignored: %s",
+            ", ".join(invalid_engines),
+        )
+    config.allowed_search_engines = [
+        engine
+        for engine in config.allowed_search_engines
+        if engine in VALID_SEARCH_ENGINES
+    ]
+    # If all engines were invalid, don't restrict (allow all engines)
+    if not config.allowed_search_engines:
+        logger.warning(
+            "No valid search engines specified in the allowed list, all engines will be available"
+        )
+    # Check if default engine is in the allowed list
+    elif config.default_search_engine not in config.allowed_search_engines:
+        logger.warning(
+            'Default search engine "%s" is not in the allowed engines list',
+            config.default_search_engine,
+        )
+        # Update the default engine to the first allowed engine
+        config.default_search_engine = config.allowed_search_engines[0]  # type: ignore
+        logger.info(
+            'Default search engine updated to "%s"',
+            config.default_search_engine,
+        )
+# Log a summary of the effective configuration at import time unless the
+# quiet-startup flag is set.
+if not QUIET_STARTUP_LOGS:
+    # Log configuration
+    logger.info("🔍 Default search engine: %s", config.default_search_engine)
+    if config.allowed_search_engines:
+        logger.info("🔍 Allowed search engines: %s", ", ".join(config.allowed_search_engines))
+    else:
+        logger.info("🔍 No search engine restrictions, all available engines can be used")
+    logger.info(
+        "🔍 Search mode: %s (currently only affects Bing)",
+        config.search_mode.upper(),
+    )
+    # Proxy and TLS settings
+    if config.use_proxy:
+        logger.info("🌐 Using proxy: %s", config.proxy_url)
+    else:
+        logger.info("🌐 No proxy configured (set USE_PROXY=true to enable)")
+    if config.fetch_web_allow_insecure_tls:
+        logger.warning(
+            "⚠️ fetchWebContent TLS verification is disabled (FETCH_WEB_INSECURE_TLS=true)"
+        )
+    else:
+        logger.info("🔐 fetchWebContent TLS verification is enabled")
+    # Playwright settings
+    logger.info("🧭 Playwright client source: %s", config.playwright_package)
+    if config.playwright_module_path:
+        logger.info(
+            "🧭 Playwright module path override: %s", config.playwright_module_path
+        )
+    # Only one browser source is reported: ws endpoint, then cdp endpoint,
+    # then a local executable path.
+    if config.playwright_ws_endpoint:
+        logger.info(
+            "🧭 Playwright remote endpoint (ws): %s", config.playwright_ws_endpoint
+        )
+    elif config.playwright_cdp_endpoint:
+        logger.info(
+            "🧭 Playwright remote endpoint (cdp): %s", config.playwright_cdp_endpoint
+        )
+    elif config.playwright_executable_path:
+        logger.info(
+            "🧭 Playwright executable path: %s", config.playwright_executable_path
+        )
+    logger.info("🧭 Playwright headless: %s", config.playwright_headless)
+    logger.info(
+        "🧭 Playwright navigation timeout: %sms",
+        config.playwright_navigation_timeout_ms,
+    )
+    # Determine server mode from config
+    # The raw MODE value is shown when set; otherwise it is derived from
+    # enable_http_server.
+    mode = os.environ.get("MODE") or ("both" if config.enable_http_server else "stdio")
+    logger.info("🖥️ Server mode: %s", mode.upper())
+    if config.enable_http_server:
+        if config.enable_cors:
+            logger.info("🔒 CORS enabled with origin: %s", config.cors_origin)
+        else:
+            logger.info("🔒 CORS disabled (set ENABLE_CORS=true to enable)")
+def get_proxy_url() -> Optional[str]:
+    """Helper function to get the proxy URL if proxy is enabled.
+
+    Returns None when proxying is disabled or no proxy URL is configured.
+    Otherwise the URL is percent-encoded with every RFC 3986 reserved
+    character, and any existing ``%`` escape, left untouched. The default
+    ``quote`` safe set (``/`` only) would also escape ``:``, turning
+    ``http://127.0.0.1:7890`` into ``http%3A//127.0.0.1%3A7890``, which is
+    not a usable proxy address.
+    """
+    if not (config.use_proxy and config.proxy_url):
+        return None
+    # Keep URL delimiters and "%" literal so scheme, credentials, host and
+    # port stay intact; only characters illegal in a URL are escaped.
+    return quote(config.proxy_url, safe=":/?#[]@!$&'()*+,;=%")

mcpwebprobe/engine/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Core engine helpers."""
+from .search_service import SearchExecutionFailure, SearchExecutionResult, SearchService
+__all__ = ["SearchExecutionFailure", "SearchExecutionResult", "SearchService"]

mcpwebprobe/engine/registry.py ADDED Viewed

@@ -0,0 +1,35 @@
+"""Registry of supported search engines."""
+from typing import Dict
+from mcpwebprobe.engine.search_service import (
+    SearchEngineExecutor,
+    SearchExecutionResult,
+    SearchService,
+    SUPPORTED_SEARCH_ENGINES,
+)
+from mcpwebprobe.engines.baidu import search_baidu
+from mcpwebprobe.engines.bing import search_bing
+from mcpwebprobe.engines.brave import search_brave
+from mcpwebprobe.engines.csdn import search_csdn
+from mcpwebprobe.engines.duckduckgo import search_duckduckgo
+from mcpwebprobe.engines.exa import search_exa
+from mcpwebprobe.engines.juejin import search_juejin
+from mcpwebprobe.engines.linuxdo import search_linuxdo
+from mcpwebprobe.engines.startpage import search_startpage
+# Maps a canonical engine name to the coroutine function that runs the search.
+EngineMap = Dict[str, SearchEngineExecutor]
+ENGINE_MAP: EngineMap = {
+    "baidu": search_baidu,
+    "bing": search_bing,
+    "brave": search_brave,
+    "csdn": search_csdn,
+    "duckduckgo": search_duckduckgo,
+    "exa": search_exa,
+    "juejin": search_juejin,
+    "linuxdo": search_linuxdo,
+    "startpage": search_startpage,
+}
+# Shared service instance, created at import time and backed by ENGINE_MAP.
+SEARCH_SERVICE = SearchService(ENGINE_MAP)

mcpwebprobe/engine/search_service.py ADDED Viewed

@@ -0,0 +1,160 @@
+"""Search execution helpers for multi-engine queries."""
+import asyncio
+import re
+from dataclasses import dataclass
+from typing import (
+    Awaitable,
+    Callable,
+    Dict,
+    List,
+    MutableMapping,
+    Sequence,
+)
+from mcpwebprobe.types import SearchResult
+# An executor is called with (query, limit) and awaits to a list of results.
+SearchEngineExecutor = Callable[[str, int], Awaitable[List[SearchResult]]]
+SearchEngineExecutorMap = Dict[str, SearchEngineExecutor]
+# Canonical engine names recognized by normalize_engine_name.
+SUPPORTED_SEARCH_ENGINES = [
+    "baidu",
+    "bing",
+    "linuxdo",
+    "csdn",
+    "duckduckgo",
+    "exa",
+    "brave",
+    "juejin",
+    "startpage",
+]
+# Short aliases mapped to their canonical engine names.
+SEARCH_ENGINE_SYNONYMS = {
+    "bd": "baidu",
+    "ddg": "duckduckgo",
+    "sp": "startpage",
+}
+def normalize_engine_name(engine: str) -> str:
+    """Map an engine identifier to its canonical name.
+
+    The input is stripped and lower-cased, then whitespace, dots, underscores
+    and hyphens are removed before matching against the synonym table and the
+    supported-engine list. Unrecognized input is returned stripped and
+    lower-cased, with its separators intact.
+    """
+    lowered = engine.strip().lower()
+    squeezed = re.sub(r"[\s._-]+", "", lowered)
+    alias_target = SEARCH_ENGINE_SYNONYMS.get(squeezed)
+    if alias_target is not None:
+        return alias_target
+    return squeezed if squeezed in SUPPORTED_SEARCH_ENGINES else lowered
+def distribute_limit(total_limit: int, engine_count: int) -> List[int]:
+    """Split *total_limit* into *engine_count* near-equal shares.
+
+    The first ``total_limit % engine_count`` shares receive one extra result.
+    Returns an empty list when *engine_count* is zero or negative.
+    """
+    if engine_count <= 0:
+        return []
+    share, extra = divmod(total_limit, engine_count)
+    return [share + 1 if slot < extra else share for slot in range(engine_count)]
+def resolve_requested_engines(
+    requested: Sequence[str], allowed: Sequence[str], default_engine: str
+) -> List[str]:
+    """Pick the engines to query from the requested and allowed lists.
+
+    With nothing requested, or when none of the requested engines is allowed,
+    the result is ``[default_engine]``. An empty *allowed* list means no
+    restriction, so the requested engines are returned as given.
+    """
+    if not requested:
+        return [default_engine]
+    if allowed:
+        permitted = [name for name in requested if name in allowed]
+        return permitted or [default_engine]
+    return list(requested)
+@dataclass
+class SearchExecutionFailure:
+    """Record of one engine that could not contribute results to a search."""
+    # Name of the engine that failed.
+    engine: str
+    # Failure category; SearchService uses "unsupported_engine" and "engine_error".
+    code: str
+    # Human-readable description of the failure.
+    message: str
+@dataclass
+class SearchExecutionResult:
+    """Combined outcome of a multi-engine search."""
+    # The query after surrounding whitespace was stripped.
+    query: str
+    # The engines that were asked for, including any that failed.
+    engines: List[str]
+    # Number of entries in ``results``.
+    total_results: int
+    # Merged results from all engines, trimmed to the requested limit.
+    results: List[SearchResult]
+    # Engines that were unsupported or raised while searching.
+    partial_failures: List[SearchExecutionFailure]
+class SearchService:
+    """Runs one query against several engines concurrently and merges the results."""
+    def __init__(self, engine_map: SearchEngineExecutorMap):
+        # Engine name -> executor coroutine function.
+        self.engine_map: MutableMapping[str, SearchEngineExecutor] = engine_map
+    async def execute(
+        self,
+        *,
+        query: str,
+        engines: List[str],
+        limit: int,
+    ) -> SearchExecutionResult:
+        """Query every engine in *engines* and return the merged outcome.
+
+        *limit* is split across the engines with ``distribute_limit``.
+        Unknown engines and engines that raise are reported in
+        ``partial_failures`` instead of failing the whole search.
+
+        Raises ValueError for a blank query, a non-positive limit, or an
+        empty engine list.
+        """
+        clean_query = query.strip()
+        if not clean_query:
+            raise ValueError("Search query must not be empty")
+        if limit <= 0:
+            raise ValueError("Limit must be greater than zero")
+        if not engines:
+            raise ValueError("At least one search engine is required")
+        failures: List[SearchExecutionFailure] = []
+        async def _guarded(engine_name, run_engine, quota):
+            # Turn an engine exception into a recorded failure plus no results.
+            try:
+                return await run_engine(clean_query, quota)
+            except Exception as error:  # noqa: BLE001
+                failures.append(
+                    SearchExecutionFailure(
+                        engine=engine_name,
+                        code="engine_error",
+                        message=str(error),
+                    )
+                )
+                return []
+        pending: List[Awaitable[List[SearchResult]]] = []
+        quotas = distribute_limit(limit, len(engines))
+        for engine_name, quota in zip(engines, quotas):
+            run_engine = self.engine_map.get(engine_name)
+            if run_engine is None:
+                failures.append(
+                    SearchExecutionFailure(
+                        engine=engine_name,
+                        code="unsupported_engine",
+                        message=f"Unsupported search engine: {engine_name}",
+                    )
+                )
+                continue
+            pending.append(_guarded(engine_name, run_engine, quota))
+        merged: List[SearchResult] = []
+        if pending:
+            # gather returns results in submission order, so merged results
+            # follow the order of *engines*.
+            for batch in await asyncio.gather(*pending):
+                merged.extend(batch)
+        top_results = merged[:limit]
+        return SearchExecutionResult(
+            query=clean_query,
+            engines=engines,
+            total_results=len(top_results),
+            results=top_results,
+            partial_failures=failures,
+        )

mcpwebprobe/engines/__init__.py ADDED Viewed

@@ -0,0 +1,23 @@
+"""Search engine implementations."""
+from .baidu import search_baidu
+from .bing import search_bing
+from .brave import search_brave
+from .csdn import search_csdn
+from .duckduckgo import search_duckduckgo
+from .exa import search_exa
+from .juejin import search_juejin
+from .linuxdo import search_linuxdo
+from .startpage import search_startpage
+__all__ = [
+    "search_baidu",
+    "search_bing",
+    "search_brave",
+    "search_csdn",
+    "search_duckduckgo",
+    "search_exa",
+    "search_juejin",
+    "search_linuxdo",
+    "search_startpage",
+]