PyPI - docpull - Versions diffs - 2.5.1__tar.gz → 3.0.0__tar.gz - Mend

docpull-2.5.1.tar.gz → docpull-3.0.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85)

{docpull-2.5.1/src/docpull.egg-info → docpull-3.0.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docpull
-Version: 2.5.1
+Version: 3.0.0
 Summary: Pull documentation from the web and convert to clean markdown
 Author-email: Zachary Roth <support@raintree.technology>
 Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -68,7 +68,6 @@ Provides-Extra: dev
 Requires-Dist: pytest>=7.0.0; extra == "dev"
 Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
-Requires-Dist: black>=23.0.0; extra == "dev"
 Requires-Dist: mypy>=1.0.0; extra == "dev"
 Requires-Dist: ruff>=0.1.0; extra == "dev"
 Requires-Dist: bandit>=1.7.0; extra == "dev"
@@ -280,6 +279,17 @@ sources:
     maxPages: 200
 ```
+### About the `mcp/` directory in this repo
+The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
+server backed by PostgreSQL with pgvector for semantic search. It is not
+the Python MCP server shipped in the `docpull` package described above
+— that one is the right choice for almost every user and is installed
+with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
+own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
+unless you specifically need pgvector-backed semantic search, ignore it
+and use `docpull mcp`.
 ## Output
 Markdown files with YAML frontmatter:
@@ -376,6 +386,7 @@ docpull URL --preview-urls    # List URLs without fetching
 - [PyPI](https://pypi.org/project/docpull/)
 - [GitHub](https://github.com/raintree-technology/docpull)
 - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
+- [Metrics](https://github.com/raintree-technology/docpull/blob/main/METRICS.md) — auto-refreshed daily (PyPI downloads, plugin installs via clone count, traffic)
 ## License

{docpull-2.5.1 → docpull-3.0.0}/README.md RENAMED Viewed

@@ -198,6 +198,17 @@ sources:
     maxPages: 200
 ```
+### About the `mcp/` directory in this repo
+The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
+server backed by PostgreSQL with pgvector for semantic search. It is not
+the Python MCP server shipped in the `docpull` package described above
+— that one is the right choice for almost every user and is installed
+with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
+own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
+unless you specifically need pgvector-backed semantic search, ignore it
+and use `docpull mcp`.
 ## Output
 Markdown files with YAML frontmatter:
@@ -294,6 +305,7 @@ docpull URL --preview-urls    # List URLs without fetching
 - [PyPI](https://pypi.org/project/docpull/)
 - [GitHub](https://github.com/raintree-technology/docpull)
 - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
+- [Metrics](https://github.com/raintree-technology/docpull/blob/main/METRICS.md) — auto-refreshed daily (PyPI downloads, plugin installs via clone count, traffic)
 ## License

{docpull-2.5.1 → docpull-3.0.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "docpull"
-version = "2.5.1"
+version = "3.0.0"
 dynamic = []
 description = "Pull documentation from the web and convert to clean markdown"
 readme = {file = "README.md", content-type = "text/markdown"}
@@ -102,7 +102,6 @@ dev = [
     "pytest>=7.0.0",
     "pytest-cov>=4.0.0",
     "pytest-asyncio>=0.21.0",
-    "black>=23.0.0",
     "mypy>=1.0.0",
     "ruff>=0.1.0",
     "bandit>=1.7.0",
@@ -132,10 +131,6 @@ include = ["docpull*"]
 [tool.setuptools.package-data]
 docpull = ["py.typed"]
-[tool.black]
-line-length = 110
-target-version = ["py310", "py311", "py312", "py313", "py314"]
 [tool.ruff]
 line-length = 110
 target-version = "py310"

{docpull-2.5.1 → docpull-3.0.0}/src/docpull/__init__.py RENAMED Viewed

@@ -14,7 +14,7 @@ Usage:
             print(event)
 """
-__version__ = "2.5.1"
+__version__ = "3.0.0"
 from .cache import CacheManager, StreamingDeduplicator
 from .conversion.chunking import Chunk, TokenCounter, chunk_markdown

{docpull-2.5.1 → docpull-3.0.0}/src/docpull/cli.py RENAMED Viewed

@@ -562,8 +562,7 @@ def run_fetcher(args: argparse.Namespace) -> int:
                         n_chunks = len(ctx.chunks) if ctx.chunks else 0
                         extra = f" ({n_chunks} chunks)" if n_chunks else ""
                         console.print(
-                            f"[green]Saved:[/green] {ctx.output_path} "
-                            f"[{ctx.source_type or 'generic'}]{extra}"
+                            f"[green]Saved:[/green] {ctx.output_path} [{ctx.source_type or 'generic'}]{extra}"
                         )
                     return 0

{docpull-2.5.1 → docpull-3.0.0}/src/docpull/conversion/special_cases.py RENAMED Viewed

@@ -246,7 +246,8 @@ def _describe_type(schema: Any, spec: dict[str, Any]) -> str:
     if not isinstance(schema, dict):
         return "?"
     if "$ref" in schema:
-        return schema["$ref"].rsplit("/", 1)[-1]
+        ref: str = schema["$ref"]
+        return ref.rsplit("/", 1)[-1]
     for key in ("oneOf", "anyOf", "allOf"):
         if isinstance(schema.get(key), list) and schema[key]:
             seen: list[str] = []
@@ -349,9 +350,7 @@ class OpenApiExtractor:
             for method, op in ops.items():
                 if method.lower() not in _HTTP_METHODS or not isinstance(op, dict):
                     continue
-                self._render_operation(
-                    lines, path, method, op, shared_params, data
-                )
+                self._render_operation(lines, path, method, op, shared_params, data)
         return SpecialCaseResult(
             markdown="\n".join(lines).strip() + "\n",
@@ -410,9 +409,7 @@ class OpenApiExtractor:
                 lines.append(bullet)
             lines.append("")
-    def _render_request_body(
-        self, lines: list[str], body: Any, spec: dict[str, Any]
-    ) -> None:
+    def _render_request_body(self, lines: list[str], body: Any, spec: dict[str, Any]) -> None:
         if not isinstance(body, dict):
             return
         if "$ref" in body:
@@ -455,9 +452,7 @@ class OpenApiExtractor:
             lines.append(f"- body: {_describe_type(schema, spec)}")
         lines.append("")
-    def _render_responses(
-        self, lines: list[str], responses: Any, spec: dict[str, Any]
-    ) -> None:
+    def _render_responses(self, lines: list[str], responses: Any, spec: dict[str, Any]) -> None:
         if not isinstance(responses, dict) or not responses:
             return
         lines.append("**Responses:**")
@@ -535,11 +530,7 @@ class MdxSourceExtractor:
         for pattern in self._EDIT_PATTERNS:
             match = pattern.search(text)
             if match:
-                raw_url = (
-                    match.group(1)
-                    .replace("/blob/", "/raw/")
-                    .replace("/edit/", "/raw/")
-                )
+                raw_url = match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
                 # Return None so downstream runs, but attach hint via a cache
                 # mechanism. Simpler: return None always; step reads the URL
                 # if needed by re-running the regex.
@@ -567,9 +558,7 @@ def find_mdx_source_url(html: bytes) -> str | None:
     for pattern in MdxSourceExtractor._EDIT_PATTERNS:
         match = pattern.search(text)
         if match:
-            return (
-                match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
-            )
+            return match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
     return None

{docpull-2.5.1 → docpull-3.0.0}/src/docpull/core/fetcher.py RENAMED Viewed

@@ -265,9 +265,7 @@ class Fetcher:
         # built-in 50 MB ceiling.
         max_content_size_kw: dict[str, int] = {}
         if self.config.content_filter.max_file_size is not None:
-            max_content_size_kw["max_content_size"] = int(
-                self.config.content_filter.max_file_size
-            )
+            max_content_size_kw["max_content_size"] = int(self.config.content_filter.max_file_size)
         self._http_client = AsyncHttpClient(
             rate_limiter=self._rate_limiter,
             max_retries=self.config.network.max_retries,
@@ -509,11 +507,7 @@ class Fetcher:
         steps = self._pipeline.steps
         if not save:
-            steps = [
-                s
-                for s in steps
-                if s.name not in {"save", "save_json", "save_ndjson", "save_sqlite"}
-            ]
+            steps = [s for s in steps if s.name not in {"save", "save_json", "save_ndjson", "save_sqlite"}]
         pipeline = type(self._pipeline)(steps=steps)
         ctx = await pipeline.execute(url, output_path)
         if ctx.error:
@@ -531,8 +525,8 @@ class Fetcher:
         """
         Compute output path for a URL using the configured naming strategy.
-        - ``full`` / ``flat`` / ``short``: a single flattened filename
-          (URL path joined with underscores).
+        - ``full``: a single flattened filename (URL path joined with
+          underscores).
         - ``hierarchical``: URL path preserved as nested directories,
           terminating in ``<segment>.md`` or ``index.md`` for trailing
           slashes. The leaf is `_validate_output_path`-safe — every segment
@@ -545,7 +539,6 @@ class Fetcher:
             parts = _url_to_path_parts(url, self.config.url)
             return output_dir.joinpath(*parts)
-        # full / flat / short: aliased to full until 3.0
         filename = _url_to_filename(url, self.config.url)
         return output_dir / filename
@@ -638,9 +631,7 @@ class Fetcher:
         )
         discovered: list[str] = []
-        async for url in self._discoverer.discover(
-            start_url, max_urls=self.config.crawl.max_pages
-        ):
+        async for url in self._discoverer.discover(start_url, max_urls=self.config.crawl.max_pages):
             discovered.append(url)
             if self._cancelled:
                 yield FetchEvent(
@@ -756,9 +747,7 @@ class Fetcher:
                 )
             )
             try:
-                async for url in discoverer.discover(
-                    start_url, max_urls=self.config.crawl.max_pages
-                ):
+                async for url in discoverer.discover(start_url, max_urls=self.config.crawl.max_pages):
                     if self._cancelled:
                         break
                     await url_queue.put(url)
@@ -770,14 +759,10 @@ class Fetcher:
                         and self._cache_manager
                         and len(discovered_for_resume) % 200 == 0
                     ):
-                        self._cache_manager.save_discovered_urls(
-                            list(discovered_for_resume), start_url
-                        )
+                        self._cache_manager.save_discovered_urls(list(discovered_for_resume), start_url)
             finally:
                 if self.config.cache.enabled and self._cache_manager:
-                    self._cache_manager.save_discovered_urls(
-                        discovered_for_resume, start_url
-                    )
+                    self._cache_manager.save_discovered_urls(discovered_for_resume, start_url)
                 self._stats.urls_discovered = len(discovered_for_resume)
                 await event_queue.put(
                     FetchEvent(
@@ -810,6 +795,7 @@ class Fetcher:
                     continue
                 local_events: list[FetchEvent] = []
                 # Bind the per-iteration list as a default arg so ruff B023
                 # is happy. Closure is consumed synchronously by execute()
                 # before the next iteration anyway, so capture order is safe.
@@ -936,9 +922,7 @@ def fetch_one(url: str, **kwargs: object) -> PageContext:
     """
     try:
         asyncio.get_running_loop()
-        raise RuntimeError(
-            "fetch_one() called from async context. Use Fetcher.fetch_one() instead."
-        )
+        raise RuntimeError("fetch_one() called from async context. Use Fetcher.fetch_one() instead.")
     except RuntimeError as exc:
         if "no running event loop" not in str(exc).lower():
             raise

{docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/filters.py RENAMED Viewed

@@ -29,19 +29,20 @@ def normalize_url(url: str) -> str:
     Returns:
         Normalized URL string
     """
-    # Use url_normalize library if available
+    # Use url_normalize library if available for case / percent-encoding
+    # cleanup. It does NOT strip fragments, so we always do that ourselves
+    # below — keeping behavior consistent whether the optional dep is
+    # installed or not.
     if URL_NORMALIZE_AVAILABLE:
         try:
-            result: str = url_normalize(url)
-            return result
+            normalized = url_normalize(url)
+            if normalized:
+                url = normalized
         except ValueError:
             logger.debug("url_normalize rejected URL during normalization", exc_info=True)
-    # Basic normalization
     parsed = urlparse(url)
-    # Remove fragment
-    normalized = urlunparse(
+    return urlunparse(
         (
             parsed.scheme.lower(),
             parsed.netloc.lower(),
@@ -52,8 +53,6 @@ def normalize_url(url: str) -> str:
         )
     )
-    return normalized
 class PatternFilter:
     """

{docpull-2.5.1 → docpull-3.0.0}/src/docpull/http/client.py RENAMED Viewed

@@ -12,7 +12,7 @@ from types import TracebackType
 from urllib.parse import urljoin, urlparse
 import aiohttp
-from aiohttp.abc import AbstractResolver
+from aiohttp.abc import AbstractResolver, ResolveResult
 from ..security.url_validator import UrlValidator
 from .protocols import HttpResponse
@@ -45,14 +45,14 @@ class _ValidatedResolver(AbstractResolver):
         self,
         host: str,
         port: int = 0,
-        family: int = socket.AF_UNSPEC,
-    ) -> list[dict[str, object]]:
+        family: socket.AddressFamily = socket.AF_UNSPEC,
+    ) -> list[ResolveResult]:
         try:
             addresses = self._url_validator.resolve_allowed_addresses(host)
         except ValueError as err:
             raise OSError(str(err)) from err
-        results: list[dict[str, object]] = []
+        results: list[ResolveResult] = []
         for address in addresses:
             ip = ipaddress.ip_address(address)
             entry_family = socket.AF_INET6 if ip.version == 6 else socket.AF_INET
@@ -60,14 +60,14 @@ class _ValidatedResolver(AbstractResolver):
                 continue
             results.append(
-                {
-                    "hostname": host,
-                    "host": address,
-                    "port": port,
-                    "family": entry_family,
-                    "proto": socket.IPPROTO_TCP,
-                    "flags": socket.AI_NUMERICHOST,
-                }
+                ResolveResult(
+                    hostname=host,
+                    host=address,
+                    port=port,
+                    family=entry_family,
+                    proto=socket.IPPROTO_TCP,
+                    flags=socket.AI_NUMERICHOST,
+                )
             )
         if not results:
@@ -236,20 +236,21 @@ class AsyncHttpClient:
     async def __aenter__(self) -> AsyncHttpClient:
         """Enter async context and create session."""
-        connector_kwargs: dict[str, object] = {
-            "limit": 100,  # Total connection limit
-            "limit_per_host": 10,  # Per-host connection limit
-            "ttl_dns_cache": 300,  # DNS cache TTL
-        }
+        resolver: AbstractResolver | None = None
         if self._url_validator is not None and self._proxy is None:
-            connector_kwargs["resolver"] = _ValidatedResolver(self._url_validator)
+            resolver = _ValidatedResolver(self._url_validator)
         elif self._proxy is not None and self._url_validator is not None:
             logger.warning(
                 "Proxy mode: DNS-pinning resolver is not active. "
                 "URL validation still runs pre-flight, but the proxy resolves DNS independently."
             )
-        connector = aiohttp.TCPConnector(**connector_kwargs)
+        connector = aiohttp.TCPConnector(
+            limit=100,
+            limit_per_host=10,
+            ttl_dns_cache=300,
+            resolver=resolver,
+        )
         self._session = aiohttp.ClientSession(
             connector=connector,
             headers={"User-Agent": self._user_agent},

{docpull-2.5.1 → docpull-3.0.0}/src/docpull/mcp/server.py RENAMED Viewed

@@ -215,8 +215,7 @@ async def _run_stdio() -> int:
         from mcp.types import CallToolResult, TextContent, Tool, ToolAnnotations
     except ImportError:
         print(
-            "docpull mcp requires the 'mcp' package. Install with: "
-            "pip install docpull[mcp]",
+            "docpull mcp requires the 'mcp' package. Install with: pip install docpull[mcp]",
             file=sys.stderr,
         )
         return 1
@@ -590,7 +589,10 @@ async def _run_stdio() -> int:
         #     isError=False), and
         # (b) errors on tools with an outputSchema don't fail the validator
         #     for "missing structured content."
-        content = [TextContent(type="text", text=result.text)]
+        # `content` is typed `list[TextContent | ImageContent | ...]` on the SDK
+        # side; list invariance means we have to widen the local annotation
+        # explicitly even though TextContent is one of the valid variants.
+        content: list[Any] = [TextContent(type="text", text=result.text)]
         return CallToolResult(
             content=content,
             structuredContent=result.data if not result.is_error else None,

{docpull-2.5.1 → docpull-3.0.0}/src/docpull/mcp/tools.py RENAMED Viewed

@@ -26,7 +26,7 @@ from typing import Any
 import yaml
 from ..core.fetcher import Fetcher
-from ..models.config import DocpullConfig, ProfileName
+from ..models.config import CrawlConfig, DocpullConfig, OutputConfig, ProfileName
 from ..security.url_validator import UrlValidator
 from .sources import (
     _URL_SCHEME_RE,
@@ -195,16 +195,10 @@ async def ensure_docs(
     target_dir = _source_dir(docs_dir, source)
     meta_path = _meta_path(docs_dir, source)
-    if (
-        not force
-        and _cache_fresh(meta_path)
-        and target_dir.exists()
-        and any(target_dir.rglob("*.md"))
-    ):
+    if not force and _cache_fresh(meta_path) and target_dir.exists() and any(target_dir.rglob("*.md")):
         files = list(target_dir.rglob("*.md"))
         return ToolResult(
-            f"Cached: {source} ({len(files)} files at {target_dir}). "
-            "Call with force=true to refresh.",
+            f"Cached: {source} ({len(files)} files at {target_dir}). Call with force=true to refresh.",
             data={
                 "source": source,
                 "cached": True,
@@ -216,8 +210,8 @@ async def ensure_docs(
     config = DocpullConfig(
         url=resolved.url,
         profile=profile_enum,
-        crawl={"max_pages": resolved.max_pages} if resolved.max_pages else {},
-        output={"directory": target_dir},
+        crawl=CrawlConfig(max_pages=resolved.max_pages) if resolved.max_pages else CrawlConfig(),
+        output=OutputConfig(directory=target_dir),
     )
     fetched = 0
     crashed = False
@@ -264,13 +258,11 @@ async def fetch_url(url: str, *, max_tokens: int | None = None) -> ToolResult:
     if not validation.is_valid:
         return ToolResult(f"URL rejected: {validation.rejection_reason}", is_error=True)
-    output_kwargs: dict[str, Any] = {}
-    if max_tokens:
-        output_kwargs["max_tokens_per_file"] = max_tokens
+    output_cfg = OutputConfig(max_tokens_per_file=max_tokens) if max_tokens else OutputConfig()
     config = DocpullConfig(
         url=url,
         profile=ProfileName.CUSTOM,
-        output=output_kwargs or None,
+        output=output_cfg,
     )
     async with Fetcher(config) as fetcher:
         ctx = await fetcher.fetch_one(url, save=False)
@@ -288,10 +280,7 @@ async def fetch_url(url: str, *, max_tokens: int | None = None) -> ToolResult:
         ]
         body = "\n\n".join(parts)
     chunks_meta = f" _chunks: {len(ctx.chunks)}_" if ctx.chunks else ""
-    header = (
-        f"# {ctx.title or url}\n"
-        f"_source: {url}_ _type: {ctx.source_type or 'generic'}_{chunks_meta}\n\n"
-    )
+    header = f"# {ctx.title or url}\n_source: {url}_ _type: {ctx.source_type or 'generic'}_{chunks_meta}\n\n"
     return ToolResult(header + body)
@@ -471,16 +460,9 @@ def grep_docs(
             matches: list[tuple[int, list[str], str, list[str]]] = []
             for idx, line in enumerate(lines):
                 if regex.search(line):
-                    before = (
-                        [lines[i].rstrip() for i in range(max(0, idx - context), idx)]
-                        if context
-                        else []
-                    )
+                    before = [lines[i].rstrip() for i in range(max(0, idx - context), idx)] if context else []
                     after = (
-                        [
-                            lines[i].rstrip()
-                            for i in range(idx + 1, min(len(lines), idx + 1 + context))
-                        ]
+                        [lines[i].rstrip() for i in range(idx + 1, min(len(lines), idx + 1 + context))]
                         if context
                         else []
                     )
@@ -532,9 +514,7 @@ def grep_docs(
             for off, line in enumerate(after, start=1):
                 chunk.append(f"  {lineno + off:>4}- {line}")
             block_lines.append("\n".join(chunk))
-            rendered_matches.append(
-                {"lineno": lineno, "before": before, "line": hit, "after": after}
-            )
+            rendered_matches.append({"lineno": lineno, "before": before, "line": hit, "after": after})
             rendered += 1
         blocks.append("\n\n".join(block_lines))
         files_payload.append(
@@ -710,28 +690,19 @@ def add_source(
         )
     validation = _ADD_SOURCE_VALIDATOR.validate(url)
     if not validation.is_valid:
-        return ToolResult(
-            f"URL rejected: {validation.rejection_reason}", is_error=True
-        )
+        return ToolResult(f"URL rejected: {validation.rejection_reason}", is_error=True)
     if description is not None and len(description) > MAX_DESCRIPTION_LEN:
-        return ToolResult(
-            f"Description too long (>{MAX_DESCRIPTION_LEN} chars).", is_error=True
-        )
+        return ToolResult(f"Description too long (>{MAX_DESCRIPTION_LEN} chars).", is_error=True)
     if category is not None and category not in ALLOWED_USER_CATEGORIES:
         valid = ", ".join(sorted(ALLOWED_USER_CATEGORIES))
-        return ToolResult(
-            f"Unknown category '{category}'. Valid: {valid}", is_error=True
-        )
+        return ToolResult(f"Unknown category '{category}'. Valid: {valid}", is_error=True)
     if max_pages is not None and (max_pages < 1 or max_pages > 100_000):
-        return ToolResult(
-            "max_pages must be between 1 and 100000.", is_error=True
-        )
+        return ToolResult("max_pages must be between 1 and 100000.", is_error=True)
     is_builtin = name in BUILTIN_SOURCES
     if is_builtin and not force:
         return ToolResult(
-            f"'{name}' is a builtin source. Pass force=true to shadow it with a "
-            "user override.",
+            f"'{name}' is a builtin source. Pass force=true to shadow it with a user override.",
             is_error=True,
         )

docpull 2.5.1__tar.gz → 3.0.0__tar.gz

docpull-2.5.1.tar.gz → docpull-3.0.0.tar.gz