PyPI - docpull - Versions diffs - 2.5.1__tar.gz → 3.0.1__tar.gz - Mend

docpull 2.5.1__tar.gz → 3.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87) hide show

{docpull-2.5.1/src/docpull.egg-info → docpull-3.0.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docpull
-Version: 2.5.1
+Version: 3.0.1
 Summary: Pull documentation from the web and convert to clean markdown
 Author-email: Zachary Roth <support@raintree.technology>
 Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -43,9 +43,12 @@ Requires-Dist: html2text>=2020.1.16
 Requires-Dist: defusedxml>=0.7.1
 Requires-Dist: extruct>=0.15.0
 Requires-Dist: aiohttp>=3.9.0
+Requires-Dist: idna>=3.15
+Requires-Dist: regex>=2024.11.6
 Requires-Dist: rich>=13.0.0
 Requires-Dist: pyyaml>=6.0
 Requires-Dist: pydantic>=2.0
+Requires-Dist: urllib3>=2.7.0
 Provides-Extra: proxy
 Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
 Provides-Extra: normalize
@@ -56,6 +59,8 @@ Provides-Extra: tokens
 Requires-Dist: tiktoken>=0.7.0; extra == "tokens"
 Provides-Extra: mcp
 Requires-Dist: mcp>=1.0.0; extra == "mcp"
+Requires-Dist: python-multipart>=0.0.27; extra == "mcp"
+Requires-Dist: starlette>=1.0.1; extra == "mcp"
 Provides-Extra: llm
 Requires-Dist: tiktoken>=0.7.0; extra == "llm"
 Provides-Extra: all
@@ -64,11 +69,12 @@ Requires-Dist: url-normalize>=1.4.0; extra == "all"
 Requires-Dist: trafilatura>=1.12.0; extra == "all"
 Requires-Dist: tiktoken>=0.7.0; extra == "all"
 Requires-Dist: mcp>=1.0.0; extra == "all"
+Requires-Dist: python-multipart>=0.0.27; extra == "all"
+Requires-Dist: starlette>=1.0.1; extra == "all"
 Provides-Extra: dev
 Requires-Dist: pytest>=7.0.0; extra == "dev"
 Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
-Requires-Dist: black>=23.0.0; extra == "dev"
 Requires-Dist: mypy>=1.0.0; extra == "dev"
 Requires-Dist: ruff>=0.1.0; extra == "dev"
 Requires-Dist: bandit>=1.7.0; extra == "dev"
@@ -280,6 +286,17 @@ sources:
     maxPages: 200
 ```
+### About the `mcp/` directory in this repo
+The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
+server backed by PostgreSQL with pgvector for semantic search. It is not
+the Python MCP server shipped in the `docpull` package described above
+— that one is the right choice for almost every user and is installed
+with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
+own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
+unless you specifically need pgvector-backed semantic search, ignore it
+and use `docpull mcp`.
 ## Output
 Markdown files with YAML frontmatter:
@@ -376,6 +393,7 @@ docpull URL --preview-urls    # List URLs without fetching
 - [PyPI](https://pypi.org/project/docpull/)
 - [GitHub](https://github.com/raintree-technology/docpull)
 - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
+- [Metrics](https://github.com/raintree-technology/docpull/blob/main/METRICS.md) — auto-refreshed daily (PyPI downloads, plugin installs via clone count, traffic)
 ## License

{docpull-2.5.1 → docpull-3.0.1}/README.md RENAMED Viewed

@@ -198,6 +198,17 @@ sources:
     maxPages: 200
 ```
+### About the `mcp/` directory in this repo
+The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
+server backed by PostgreSQL with pgvector for semantic search. It is not
+the Python MCP server shipped in the `docpull` package described above
+— that one is the right choice for almost every user and is installed
+with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
+own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
+unless you specifically need pgvector-backed semantic search, ignore it
+and use `docpull mcp`.
 ## Output
 Markdown files with YAML frontmatter:
@@ -294,6 +305,7 @@ docpull URL --preview-urls    # List URLs without fetching
 - [PyPI](https://pypi.org/project/docpull/)
 - [GitHub](https://github.com/raintree-technology/docpull)
 - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
+- [Metrics](https://github.com/raintree-technology/docpull/blob/main/METRICS.md) — auto-refreshed daily (PyPI downloads, plugin installs via clone count, traffic)
 ## License

{docpull-2.5.1 → docpull-3.0.1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "docpull"
-version = "2.5.1"
+version = "3.0.1"
 dynamic = []
 description = "Pull documentation from the web and convert to clean markdown"
 readme = {file = "README.md", content-type = "text/markdown"}
@@ -67,9 +67,12 @@ dependencies = [
     "defusedxml>=0.7.1",
     "extruct>=0.15.0",
     "aiohttp>=3.9.0",
+    "idna>=3.15",
+    "regex>=2024.11.6",
     "rich>=13.0.0",
     "pyyaml>=6.0",
     "pydantic>=2.0",
+    "urllib3>=2.7.0",
 ]
 [project.optional-dependencies]
@@ -87,6 +90,8 @@ tokens = [
 ]
 mcp = [
     "mcp>=1.0.0",
+    "python-multipart>=0.0.27",
+    "starlette>=1.0.1",
 ]
 llm = [
     "tiktoken>=0.7.0",
@@ -97,12 +102,13 @@ all = [
     "trafilatura>=1.12.0",
     "tiktoken>=0.7.0",
     "mcp>=1.0.0",
+    "python-multipart>=0.0.27",
+    "starlette>=1.0.1",
 ]
 dev = [
     "pytest>=7.0.0",
     "pytest-cov>=4.0.0",
     "pytest-asyncio>=0.21.0",
-    "black>=23.0.0",
     "mypy>=1.0.0",
     "ruff>=0.1.0",
     "bandit>=1.7.0",
@@ -132,10 +138,6 @@ include = ["docpull*"]
 [tool.setuptools.package-data]
 docpull = ["py.typed"]
-[tool.black]
-line-length = 110
-target-version = ["py310", "py311", "py312", "py313", "py314"]
 [tool.ruff]
 line-length = 110
 target-version = "py310"
@@ -175,10 +177,22 @@ module = "docpull.models.*"
 disallow_any_unimported = false
 warn_return_any = false
-[[tool.mypy.overrides]]
-module = "tests.*"
-disallow_untyped_defs = false
-disallow_any_unimported = false
+[tool.bandit]
+# Policy: every entry in `skips` MUST have a one-line justification
+# above it explaining what bandit found, why it's a false positive
+# *for this codebase*, and (if narrow) why a `# nosec BXXX  # reason`
+# annotation at the call site would have been worse. Bandit skips
+# silence findings repo-wide, so the bar to add one is higher than
+# silencing a single line. If a new skip is unavoidable, add it here
+# in PR review, not as a drive-by.
+#
+# B101 (assert_used) — flags every `assert x is not None` we use for
+# type narrowing. Bandit's concern is that assertions vanish under
+# `python -O`. docpull is a CLI / SDK, never invoked with -O, and the
+# narrowing asserts are not load-bearing safety checks. Skipping the
+# rule globally keeps the existing idiom without 8+ inline `# nosec`
+# annotations in fetcher.py / pipeline/steps/convert.py.
+skips = ["B101"]
 [tool.pytest.ini_options]
 minversion = "7.0"

{docpull-2.5.1 → docpull-3.0.1}/src/docpull/__init__.py RENAMED Viewed

@@ -14,7 +14,7 @@ Usage:
             print(event)
 """
-__version__ = "2.5.1"
+__version__ = "3.0.0"
 from .cache import CacheManager, StreamingDeduplicator
 from .conversion.chunking import Chunk, TokenCounter, chunk_markdown

{docpull-2.5.1 → docpull-3.0.1}/src/docpull/cache/manager.py RENAMED Viewed

@@ -5,10 +5,12 @@ from __future__ import annotations
 import hashlib
 import json
 import logging
-from datetime import datetime, timedelta
+from datetime import timedelta
 from pathlib import Path
 from typing import TypedDict
+from ..time_utils import parse_persisted_datetime, utc_now, utc_now_iso
 logger = logging.getLogger(__name__)
 # Default TTL for cache entries (30 days)
@@ -257,7 +259,7 @@ class CacheManager:
         self.manifest[url] = {
             "checksum": self.compute_checksum(content),
             "file_path": str(file_path),
-            "fetched_at": datetime.now().isoformat(),
+            "fetched_at": utc_now_iso(),
             "size": len(content),
         }
@@ -314,7 +316,7 @@ class CacheManager:
         Note:
             Changes are batched. Call flush() to persist to disk.
         """
-        self._state.last_run = datetime.now().isoformat()
+        self._state.last_run = utc_now_iso()
         self._state_dirty = True
     def clear_state(self) -> None:
@@ -354,18 +356,18 @@ class CacheManager:
         if ttl is None:
             return 0
-        cutoff = datetime.now() - timedelta(days=ttl)
+        cutoff = utc_now() - timedelta(days=ttl)
         to_remove = []
         for url, entry in self.manifest.items():
             fetched_at = entry.get("fetched_at")
             if fetched_at:
                 try:
-                    entry_time = datetime.fromisoformat(fetched_at)
+                    entry_time = parse_persisted_datetime(fetched_at)
                     if entry_time < cutoff:
                         to_remove.append(url)
-                except ValueError:
-                    pass  # Invalid date format, skip
+                except ValueError as err:
+                    logger.warning("Invalid cache timestamp for %s: %s", url, err)
         for url in to_remove:
             del self.manifest[url]
@@ -413,7 +415,7 @@ class CacheManager:
         """
         data: DiscoveredUrlsState = {
             "start_url": start_url,
-            "discovered_at": datetime.now().isoformat(),
+            "discovered_at": utc_now_iso(),
             "urls": urls,
         }
         try:

{docpull-2.5.1 → docpull-3.0.1}/src/docpull/cli.py RENAMED Viewed

@@ -13,12 +13,10 @@ if "--doctor" in sys.argv:
     output_dir = None
     if "--output-dir" in sys.argv or "-o" in sys.argv:
-        try:
-            flag_idx = sys.argv.index("--output-dir") if "--output-dir" in sys.argv else sys.argv.index("-o")
-            if flag_idx + 1 < len(sys.argv):
-                output_dir = Path(sys.argv[flag_idx + 1])
-        except (ValueError, IndexError):
-            pass
+        flag = "--output-dir" if "--output-dir" in sys.argv else "-o"
+        flag_idx = sys.argv.index(flag)
+        if flag_idx + 1 < len(sys.argv):
+            output_dir = Path(sys.argv[flag_idx + 1])
     sys.exit(run_doctor(output_dir=output_dir))
 # Verify core dependencies
@@ -562,8 +560,7 @@ def run_fetcher(args: argparse.Namespace) -> int:
                         n_chunks = len(ctx.chunks) if ctx.chunks else 0
                         extra = f" ({n_chunks} chunks)" if n_chunks else ""
                         console.print(
-                            f"[green]Saved:[/green] {ctx.output_path} "
-                            f"[{ctx.source_type or 'generic'}]{extra}"
+                            f"[green]Saved:[/green] {ctx.output_path} [{ctx.source_type or 'generic'}]{extra}"
                         )
                     return 0

{docpull-2.5.1 → docpull-3.0.1}/src/docpull/conversion/special_cases.py RENAMED Viewed

@@ -246,7 +246,8 @@ def _describe_type(schema: Any, spec: dict[str, Any]) -> str:
     if not isinstance(schema, dict):
         return "?"
     if "$ref" in schema:
-        return schema["$ref"].rsplit("/", 1)[-1]
+        ref: str = schema["$ref"]
+        return ref.rsplit("/", 1)[-1]
     for key in ("oneOf", "anyOf", "allOf"):
         if isinstance(schema.get(key), list) and schema[key]:
             seen: list[str] = []
@@ -319,7 +320,8 @@ class OpenApiExtractor:
             return None
         try:
             data = json.loads(text)
-        except json.JSONDecodeError:
+        except json.JSONDecodeError as err:
+            logger.debug("OpenAPI extractor skipped %s: JSON parse failed: %s", url, err)
             return None
         if not isinstance(data, dict):
             return None
@@ -349,9 +351,7 @@ class OpenApiExtractor:
             for method, op in ops.items():
                 if method.lower() not in _HTTP_METHODS or not isinstance(op, dict):
                     continue
-                self._render_operation(
-                    lines, path, method, op, shared_params, data
-                )
+                self._render_operation(lines, path, method, op, shared_params, data)
         return SpecialCaseResult(
             markdown="\n".join(lines).strip() + "\n",
@@ -410,9 +410,7 @@ class OpenApiExtractor:
                 lines.append(bullet)
             lines.append("")
-    def _render_request_body(
-        self, lines: list[str], body: Any, spec: dict[str, Any]
-    ) -> None:
+    def _render_request_body(self, lines: list[str], body: Any, spec: dict[str, Any]) -> None:
         if not isinstance(body, dict):
             return
         if "$ref" in body:
@@ -455,9 +453,7 @@ class OpenApiExtractor:
             lines.append(f"- body: {_describe_type(schema, spec)}")
         lines.append("")
-    def _render_responses(
-        self, lines: list[str], responses: Any, spec: dict[str, Any]
-    ) -> None:
+    def _render_responses(self, lines: list[str], responses: Any, spec: dict[str, Any]) -> None:
         if not isinstance(responses, dict) or not responses:
             return
         lines.append("**Responses:**")
@@ -535,11 +531,7 @@ class MdxSourceExtractor:
         for pattern in self._EDIT_PATTERNS:
             match = pattern.search(text)
             if match:
-                raw_url = (
-                    match.group(1)
-                    .replace("/blob/", "/raw/")
-                    .replace("/edit/", "/raw/")
-                )
+                raw_url = match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
                 # Return None so downstream runs, but attach hint via a cache
                 # mechanism. Simpler: return None always; step reads the URL
                 # if needed by re-running the regex.
@@ -567,9 +559,7 @@ def find_mdx_source_url(html: bytes) -> str | None:
     for pattern in MdxSourceExtractor._EDIT_PATTERNS:
         match = pattern.search(text)
         if match:
-            return (
-                match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
-            )
+            return match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
     return None
@@ -580,8 +570,8 @@ def looks_like_spa(html: bytes, min_body_ratio: float = 0.05) -> bool:
     """Heuristic: does this HTML appear to be a JS-only SPA?
     True when the non-script body text is very small relative to the overall
-    page size and the page contains script tags. Not perfect, but good enough
-    to warn an agent before it consumes empty Markdown.
+    page size and the page contains script tags. This is a conservative signal
+    for warning an agent before it consumes empty Markdown.
     """
     if len(html) < 500:
         return False
@@ -589,7 +579,8 @@ def looks_like_spa(html: bytes, min_body_ratio: float = 0.05) -> bool:
         return False
     try:
         soup = _soup(html)
-    except Exception:  # noqa: BLE001
+    except Exception as err:  # noqa: BLE001
+        logger.debug("SPA heuristic skipped malformed HTML: %s", err)
         return False
     # Remove scripts/styles before measuring.
     for tag in soup(["script", "style", "noscript"]):

{docpull-2.5.1 → docpull-3.0.1}/src/docpull/core/fetcher.py RENAMED Viewed

@@ -265,9 +265,7 @@ class Fetcher:
         # built-in 50 MB ceiling.
         max_content_size_kw: dict[str, int] = {}
         if self.config.content_filter.max_file_size is not None:
-            max_content_size_kw["max_content_size"] = int(
-                self.config.content_filter.max_file_size
-            )
+            max_content_size_kw["max_content_size"] = int(self.config.content_filter.max_file_size)
         self._http_client = AsyncHttpClient(
             rate_limiter=self._rate_limiter,
             max_retries=self.config.network.max_retries,
@@ -509,11 +507,7 @@ class Fetcher:
         steps = self._pipeline.steps
         if not save:
-            steps = [
-                s
-                for s in steps
-                if s.name not in {"save", "save_json", "save_ndjson", "save_sqlite"}
-            ]
+            steps = [s for s in steps if s.name not in {"save", "save_json", "save_ndjson", "save_sqlite"}]
         pipeline = type(self._pipeline)(steps=steps)
         ctx = await pipeline.execute(url, output_path)
         if ctx.error:
@@ -531,8 +525,8 @@ class Fetcher:
         """
         Compute output path for a URL using the configured naming strategy.
-        - ``full`` / ``flat`` / ``short``: a single flattened filename
-          (URL path joined with underscores).
+        - ``full``: a single flattened filename (URL path joined with
+          underscores).
         - ``hierarchical``: URL path preserved as nested directories,
           terminating in ``<segment>.md`` or ``index.md`` for trailing
           slashes. The leaf is `_validate_output_path`-safe — every segment
@@ -545,7 +539,6 @@ class Fetcher:
             parts = _url_to_path_parts(url, self.config.url)
             return output_dir.joinpath(*parts)
-        # full / flat / short: aliased to full until 3.0
         filename = _url_to_filename(url, self.config.url)
         return output_dir / filename
@@ -638,9 +631,7 @@ class Fetcher:
         )
         discovered: list[str] = []
-        async for url in self._discoverer.discover(
-            start_url, max_urls=self.config.crawl.max_pages
-        ):
+        async for url in self._discoverer.discover(start_url, max_urls=self.config.crawl.max_pages):
             discovered.append(url)
             if self._cancelled:
                 yield FetchEvent(
@@ -756,9 +747,7 @@ class Fetcher:
                 )
             )
             try:
-                async for url in discoverer.discover(
-                    start_url, max_urls=self.config.crawl.max_pages
-                ):
+                async for url in discoverer.discover(start_url, max_urls=self.config.crawl.max_pages):
                     if self._cancelled:
                         break
                     await url_queue.put(url)
@@ -770,14 +759,10 @@ class Fetcher:
                         and self._cache_manager
                         and len(discovered_for_resume) % 200 == 0
                     ):
-                        self._cache_manager.save_discovered_urls(
-                            list(discovered_for_resume), start_url
-                        )
+                        self._cache_manager.save_discovered_urls(list(discovered_for_resume), start_url)
             finally:
                 if self.config.cache.enabled and self._cache_manager:
-                    self._cache_manager.save_discovered_urls(
-                        discovered_for_resume, start_url
-                    )
+                    self._cache_manager.save_discovered_urls(discovered_for_resume, start_url)
                 self._stats.urls_discovered = len(discovered_for_resume)
                 await event_queue.put(
                     FetchEvent(
@@ -810,6 +795,7 @@ class Fetcher:
                     continue
                 local_events: list[FetchEvent] = []
                 # Bind the per-iteration list as a default arg so ruff B023
                 # is happy. Closure is consumed synchronously by execute()
                 # before the next iteration anyway, so capture order is safe.
@@ -936,9 +922,7 @@ def fetch_one(url: str, **kwargs: object) -> PageContext:
     """
     try:
         asyncio.get_running_loop()
-        raise RuntimeError(
-            "fetch_one() called from async context. Use Fetcher.fetch_one() instead."
-        )
+        raise RuntimeError("fetch_one() called from async context. Use Fetcher.fetch_one() instead.")
     except RuntimeError as exc:
         if "no running event loop" not in str(exc).lower():
             raise

{docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/filters.py RENAMED Viewed

@@ -29,19 +29,20 @@ def normalize_url(url: str) -> str:
     Returns:
         Normalized URL string
     """
-    # Use url_normalize library if available
+    # Use url_normalize library if available for case / percent-encoding
+    # cleanup. It does NOT strip fragments, so we always do that ourselves
+    # below — keeping behavior consistent whether the optional dep is
+    # installed or not.
     if URL_NORMALIZE_AVAILABLE:
         try:
-            result: str = url_normalize(url)
-            return result
+            normalized = url_normalize(url)
+            if normalized:
+                url = normalized
         except ValueError:
             logger.debug("url_normalize rejected URL during normalization", exc_info=True)
-    # Basic normalization
     parsed = urlparse(url)
-    # Remove fragment
-    normalized = urlunparse(
+    return urlunparse(
         (
             parsed.scheme.lower(),
             parsed.netloc.lower(),
@@ -52,8 +53,6 @@ def normalize_url(url: str) -> str:
         )
     )
-    return normalized
 class PatternFilter:
     """

{docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/link_extractors/enhanced.py RENAMED Viewed

@@ -295,7 +295,8 @@ class EnhancedLinkExtractor:
         try:
             absolute_url = urljoin(base_url, href)
-        except Exception:
+        except Exception as err:
+            logger.debug("Could not resolve href %r against %s: %s", href, base_url, err)
             return None
         # Validate it's a proper URL

{docpull-2.5.1 → docpull-3.0.1}/src/docpull/discovery/link_extractors/static.py RENAMED Viewed

@@ -148,7 +148,8 @@ class StaticLinkExtractor:
         """
         try:
             absolute_url = urljoin(base_url, href)
-        except Exception:
+        except Exception as err:
+            logger.debug("Could not resolve href %r against %s: %s", href, base_url, err)
             return None
         # Remove fragment

{docpull-2.5.1 → docpull-3.0.1}/src/docpull/http/client.py RENAMED Viewed

@@ -12,7 +12,7 @@ from types import TracebackType
 from urllib.parse import urljoin, urlparse
 import aiohttp
-from aiohttp.abc import AbstractResolver
+from aiohttp.abc import AbstractResolver, ResolveResult
 from ..security.url_validator import UrlValidator
 from .protocols import HttpResponse
@@ -45,14 +45,14 @@ class _ValidatedResolver(AbstractResolver):
         self,
         host: str,
         port: int = 0,
-        family: int = socket.AF_UNSPEC,
-    ) -> list[dict[str, object]]:
+        family: socket.AddressFamily = socket.AF_UNSPEC,
+    ) -> list[ResolveResult]:
         try:
             addresses = self._url_validator.resolve_allowed_addresses(host)
         except ValueError as err:
             raise OSError(str(err)) from err
-        results: list[dict[str, object]] = []
+        results: list[ResolveResult] = []
         for address in addresses:
             ip = ipaddress.ip_address(address)
             entry_family = socket.AF_INET6 if ip.version == 6 else socket.AF_INET
@@ -60,14 +60,14 @@ class _ValidatedResolver(AbstractResolver):
                 continue
             results.append(
-                {
-                    "hostname": host,
-                    "host": address,
-                    "port": port,
-                    "family": entry_family,
-                    "proto": socket.IPPROTO_TCP,
-                    "flags": socket.AI_NUMERICHOST,
-                }
+                ResolveResult(
+                    hostname=host,
+                    host=address,
+                    port=port,
+                    family=entry_family,
+                    proto=socket.IPPROTO_TCP,
+                    flags=socket.AI_NUMERICHOST,
+                )
             )
         if not results:
@@ -236,20 +236,21 @@ class AsyncHttpClient:
     async def __aenter__(self) -> AsyncHttpClient:
         """Enter async context and create session."""
-        connector_kwargs: dict[str, object] = {
-            "limit": 100,  # Total connection limit
-            "limit_per_host": 10,  # Per-host connection limit
-            "ttl_dns_cache": 300,  # DNS cache TTL
-        }
+        resolver: AbstractResolver | None = None
         if self._url_validator is not None and self._proxy is None:
-            connector_kwargs["resolver"] = _ValidatedResolver(self._url_validator)
+            resolver = _ValidatedResolver(self._url_validator)
         elif self._proxy is not None and self._url_validator is not None:
             logger.warning(
                 "Proxy mode: DNS-pinning resolver is not active. "
                 "URL validation still runs pre-flight, but the proxy resolves DNS independently."
             )
-        connector = aiohttp.TCPConnector(**connector_kwargs)
+        connector = aiohttp.TCPConnector(
+            limit=100,
+            limit_per_host=10,
+            ttl_dns_cache=300,
+            resolver=resolver,
+        )
         self._session = aiohttp.ClientSession(
             connector=connector,
             headers={"User-Agent": self._user_agent},

{docpull-2.5.1 → docpull-3.0.1}/src/docpull/mcp/server.py RENAMED Viewed

@@ -215,8 +215,7 @@ async def _run_stdio() -> int:
         from mcp.types import CallToolResult, TextContent, Tool, ToolAnnotations
     except ImportError:
         print(
-            "docpull mcp requires the 'mcp' package. Install with: "
-            "pip install docpull[mcp]",
+            "docpull mcp requires the 'mcp' package. Install with: pip install docpull[mcp]",
             file=sys.stderr,
         )
         return 1
@@ -590,7 +589,10 @@ async def _run_stdio() -> int:
         #     isError=False), and
         # (b) errors on tools with an outputSchema don't fail the validator
         #     for "missing structured content."
-        content = [TextContent(type="text", text=result.text)]
+        # `content` is typed `list[TextContent | ImageContent | ...]` on the SDK
+        # side; list invariance means we have to widen the local annotation
+        # explicitly even though TextContent is one of the valid variants.
+        content: list[Any] = [TextContent(type="text", text=result.text)]
         return CallToolResult(
             content=content,
             structuredContent=result.data if not result.is_error else None,

docpull 2.5.1__tar.gz → 3.0.1__tar.gz

docpull 2.5.1__tar.gz → 3.0.1__tar.gz