PyPI - ddgs-mcp-server - Versions diffs - 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl - Mend

ddgs-mcp-server 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

ddgs_mcp_server/server.py CHANGED Viewed

@@ -1,7 +1,11 @@
 import json
 import logging
-from typing import Optional, Literal
+import asyncio
+from typing import Optional
+import httpx
+import trafilatura
 from mcp.server import Server
 import mcp.types as types
 from ddgs import DDGS
@@ -13,12 +17,94 @@ logger = logging.getLogger("ddgs-mcp")
 # MCP Server
 server = Server("ddgs-mcp-server")
+# --- Content Extraction Utilities ---
+async def fetch_page_content(
+    url: str,
+    timeout: int = 10,
+    max_length: int = 50000
+) -> Optional[str]:
+    """
+    Fetch and extract main text content from a URL using trafilatura.
+    Args:
+        url: The URL to fetch content from
+        timeout: Request timeout in seconds
+        max_length: Maximum characters to return
+    Returns:
+        Extracted text content or None on failure
+    """
+    try:
+        async with httpx.AsyncClient(
+            timeout=timeout,
+            follow_redirects=True,
+            verify=True
+        ) as client:
+            response = await client.get(url, headers={
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.5",
+            })
+            if response.status_code == 200:
+                downloaded = response.text
+                # Extract main content using trafilatura
+                extracted = trafilatura.extract(
+                    downloaded,
+                    include_links=False,
+                    include_images=False,
+                    include_comments=False,
+                    favor_precision=True
+                )
+                if extracted:
+                    return extracted[:max_length]
+    except httpx.TimeoutException:
+        logger.warning(f"Timeout fetching {url}")
+    except httpx.HTTPError as e:
+        logger.warning(f"HTTP error fetching {url}: {e}")
+    except Exception as e:
+        logger.warning(f"Failed to fetch {url}: {e}")
+    return None
+async def enrich_results_with_content(
+    results: list,
+    max_concurrent: int = 5,
+    max_length: int = 50000
+) -> list:
+    """
+    Fetch full content for all search results concurrently.
+    Args:
+        results: List of search result dictionaries
+        max_concurrent: Maximum concurrent requests
+        max_length: Maximum content length per page
+    Returns:
+        Results list with 'full_content' field added
+    """
+    semaphore = asyncio.Semaphore(max_concurrent)
+    async def fetch_with_semaphore(result: dict) -> dict:
+        async with semaphore:
+            url = result.get("href")
+            if url:
+                content = await fetch_page_content(url, max_length=max_length)
+                result["full_content"] = content if content else "[Content extraction failed or blocked]"
+            return result
+    tasks = [fetch_with_semaphore(r.copy()) for r in results]
+    return await asyncio.gather(*tasks)
+# --- MCP Tool Definitions ---
 @server.list_tools()
 async def list_tools() -> list[types.Tool]:
     return [
         types.Tool(
             name="search_text",
-            description="Perform a metasearch using various backends (DuckDuckGo, Google, Bing, etc.). Use this to find APIs, libraries, developer tools, and general information.",
+            description="Perform a metasearch using various backends (DuckDuckGo, Google, Bing, etc.). Use this to find APIs, libraries, developer tools, and general information. Optionally fetch full page content for complete context.",
             inputSchema={
                 "type": "object",
                 "properties": {
@@ -32,7 +118,17 @@ async def list_tools() -> list[types.Tool]:
                     "region": {"type": "string", "default": "us-en", "description": "e.g., us-en, uk-en"},
                     "safesearch": {"type": "string", "enum": ["on", "moderate", "off"], "default": "moderate"},
                     "timelimit": {"type": "string", "enum": ["d", "w", "m", "y"], "default": None},
-                    "max_results": {"type": "integer", "default": 10}
+                    "max_results": {"type": "integer", "default": 10},
+                    "fetch_full_content": {
+                        "type": "boolean",
+                        "default": False,
+                        "description": "If true, fetches and returns the full text content of each result page. This provides complete context but adds latency."
+                    },
+                    "max_content_length": {
+                        "type": "integer",
+                        "default": 50000,
+                        "description": "Maximum characters of content to fetch per page (only used if fetch_full_content is true)."
+                    }
                 },
                 "required": ["query"]
             }
@@ -54,6 +150,7 @@ async def list_tools() -> list[types.Tool]:
         )
     ]
 @server.call_tool()
 async def call_tool(name: str, arguments: dict) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
     logger.info(f"Calling tool: {name} with args: {arguments}")
@@ -68,6 +165,10 @@ async def call_tool(name: str, arguments: dict) -> list[types.TextContent | type
     timelimit = arguments.get("timelimit")
     max_results = arguments.get("max_results", 10)
+    # New parameters for full content extraction
+    fetch_full_content = arguments.get("fetch_full_content", False)
+    max_content_length = arguments.get("max_content_length", 50000)
     try:
         with DDGS() as ddgs:
             results = []
@@ -80,6 +181,19 @@ async def call_tool(name: str, arguments: dict) -> list[types.TextContent | type
                     max_results=max_results,
                     backend=backend
                 )
+                # Convert generator to list for manipulation
+                results = list(results) if results else []
+                # Enrich with full content if requested
+                if fetch_full_content and results:
+                    logger.info(f"Fetching full content for {len(results)} results...")
+                    results = await enrich_results_with_content(
+                        results,
+                        max_length=max_content_length
+                    )
+                    logger.info("Full content extraction complete")
             elif name == "search_news":
                 results = ddgs.news(
                     query=query,
@@ -88,8 +202,9 @@ async def call_tool(name: str, arguments: dict) -> list[types.TextContent | type
                     timelimit=timelimit,
                     max_results=max_results
                 )
+                results = list(results) if results else []
-            return [types.TextContent(type="text", text=json.dumps(results, indent=2))]
+            return [types.TextContent(type="text", text=json.dumps(results, indent=2, ensure_ascii=False))]
     except Exception as e:
         logger.error(f"Error executing {name}: {e}")

{ddgs_mcp_server-0.4.1.dist-info → ddgs_mcp_server-0.5.1.dist-info}/METADATA RENAMED Viewed

@@ -1,11 +1,13 @@
 Metadata-Version: 2.4
 Name: ddgs-mcp-server
-Version: 0.4.1
-Summary: DuckDuckGo Search MCP Server
+Version: 0.5.1
+Summary: DuckDuckGo Search MCP Server with full page content extraction
 License-File: LICENSE
 Requires-Python: >=3.10
 Requires-Dist: ddgs>=9.10.0
+Requires-Dist: httpx>=0.27.0
 Requires-Dist: mcp>=1.0.0
+Requires-Dist: trafilatura>=2.0.0
 Description-Content-Type: text/markdown
 # DDGS MCP Server
@@ -14,9 +16,54 @@ A Model Context Protocol (MCP) server that provides DuckDuckGo Search capabiliti
 ## Features
-- **search_text**: advanced metasearch using `bing`, `brave`, `duckduckgo`, `google`, `mojeek`, `yahoo`, `yandex`, `wikipedia`.
+- **search_text**: Advanced metasearch using `bing`, `brave`, `duckduckgo`, `google`, `mojeek`, `yahoo`, `yandex`, `wikipedia`.
+  - **Full Content Extraction**: Optionally fetch complete page content (not just snippets) for comprehensive context.
 - **search_news**: Find latest updates, releases, and tech news.
+## Full Content Extraction
+For coding agents that need complete context from search results, enable full page content fetching:
+### Usage
+```json
+{
+  "query": "python async programming tutorial",
+  "fetch_full_content": true,
+  "max_content_length": 50000,
+  "max_results": 5
+}
+```
+### Parameters
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `fetch_full_content` | boolean | `false` | Enable full page content extraction |
+| `max_content_length` | integer | `50000` | Maximum characters per page (when `fetch_full_content` is true) |
+### Response Structure
+When `fetch_full_content` is enabled, each result includes a `full_content` field:
+```json
+[
+  {
+    "title": "Python Async Programming Guide",
+    "href": "https://example.com/python-async",
+    "body": "Brief snippet from search results...",
+    "full_content": "Complete extracted article text with all paragraphs, code examples, and detailed explanations..."
+  }
+]
+```
+### Performance Notes
+- Content extraction adds ~1-3 seconds latency per page
+- Up to 5 pages are fetched concurrently to minimize total time
+- Failed fetches return `[Content extraction failed or blocked]` without breaking the search
+- Uses [Trafilatura](https://trafilatura.readthedocs.io/) for high-quality text extraction
 ## Installation & Usage

ddgs_mcp_server-0.5.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,8 @@
+ddgs_mcp_server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ddgs_mcp_server/main.py,sha256=hqJl7UoGQoL9a-2hX24srZYFGdatheJfgkn5wz5Od70,492
+ddgs_mcp_server/server.py,sha256=1I7mG-Dw1UV_xQtj-Ow_3ttHAr3HF3z6SSfR-Mi0ts8,8269
+ddgs_mcp_server-0.5.1.dist-info/METADATA,sha256=FqvHalHQ737L415fvpyWXauJEiO7Li7AQbBNTur4lQc,3757
+ddgs_mcp_server-0.5.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ddgs_mcp_server-0.5.1.dist-info/entry_points.txt,sha256=8YvtzhkNDMvAy2CdIx8VppBFjiBSJ56JtLX-v8SUHGc,62
+ddgs_mcp_server-0.5.1.dist-info/licenses/LICENSE,sha256=vLPKcNOa4dGBRPq4I_mIBKyVSbIlzrOdinbwXFeKb88,1091
+ddgs_mcp_server-0.5.1.dist-info/RECORD,,

ddgs_mcp_server-0.4.1.dist-info/RECORD DELETED Viewed

@@ -1,8 +0,0 @@
-ddgs_mcp_server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ddgs_mcp_server/main.py,sha256=hqJl7UoGQoL9a-2hX24srZYFGdatheJfgkn5wz5Od70,492
-ddgs_mcp_server/server.py,sha256=IEsDiPsw2ciIVnmoOBYnOuVd_fCXmOXfNapQTBpO9wc,3919
-ddgs_mcp_server-0.4.1.dist-info/METADATA,sha256=bylRyAQpkCOFQilQt-rMLuP1j4eOxLMZjQsZMyVqyyE,2227
-ddgs_mcp_server-0.4.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-ddgs_mcp_server-0.4.1.dist-info/entry_points.txt,sha256=8YvtzhkNDMvAy2CdIx8VppBFjiBSJ56JtLX-v8SUHGc,62
-ddgs_mcp_server-0.4.1.dist-info/licenses/LICENSE,sha256=vLPKcNOa4dGBRPq4I_mIBKyVSbIlzrOdinbwXFeKb88,1091
-ddgs_mcp_server-0.4.1.dist-info/RECORD,,

{ddgs_mcp_server-0.4.1.dist-info → ddgs_mcp_server-0.5.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{ddgs_mcp_server-0.4.1.dist-info → ddgs_mcp_server-0.5.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{ddgs_mcp_server-0.4.1.dist-info → ddgs_mcp_server-0.5.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

ddgs-mcp-server 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

ddgs-mcp-server 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl