PyPI - docpull - Versions diffs - 3.0.2__tar.gz → 4.0.0__tar.gz - Mend

docpull 3.0.2__tar.gz → 4.0.0__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (90) hide show

{docpull-3.0.2/src/docpull.egg-info → docpull-4.0.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docpull
-Version: 3.0.2
+Version: 4.0.0
 Summary: Pull documentation from the web and convert to clean markdown
 Author-email: Zachary Roth <support@raintree.technology>
 Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -42,7 +42,7 @@ Requires-Dist: beautifulsoup4>=4.12.0
 Requires-Dist: html2text>=2020.1.16
 Requires-Dist: defusedxml>=0.7.1
 Requires-Dist: extruct>=0.15.0
-Requires-Dist: aiohttp>=3.9.0
+Requires-Dist: aiohttp>=3.14.0
 Requires-Dist: idna>=3.15
 Requires-Dist: regex>=2024.11.6
 Requires-Dist: rich>=13.0.0
@@ -59,6 +59,7 @@ Provides-Extra: tokens
 Requires-Dist: tiktoken>=0.7.0; extra == "tokens"
 Provides-Extra: mcp
 Requires-Dist: mcp>=1.0.0; extra == "mcp"
+Requires-Dist: pyjwt>=2.13.0; extra == "mcp"
 Requires-Dist: python-multipart>=0.0.27; extra == "mcp"
 Requires-Dist: starlette>=1.0.1; extra == "mcp"
 Provides-Extra: llm
@@ -69,6 +70,7 @@ Requires-Dist: url-normalize>=1.4.0; extra == "all"
 Requires-Dist: trafilatura>=1.12.0; extra == "all"
 Requires-Dist: tiktoken>=0.7.0; extra == "all"
 Requires-Dist: mcp>=1.0.0; extra == "all"
+Requires-Dist: pyjwt>=2.13.0; extra == "all"
 Requires-Dist: python-multipart>=0.0.27; extra == "all"
 Requires-Dist: starlette>=1.0.1; extra == "all"
 Provides-Extra: dev

{docpull-3.0.2 → docpull-4.0.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "docpull"
-version = "3.0.2"
+version = "4.0.0"
 dynamic = []
 description = "Pull documentation from the web and convert to clean markdown"
 readme = {file = "README.md", content-type = "text/markdown"}
@@ -66,7 +66,7 @@ dependencies = [
     "html2text>=2020.1.16",
     "defusedxml>=0.7.1",
     "extruct>=0.15.0",
-    "aiohttp>=3.9.0",
+    "aiohttp>=3.14.0",  # 3.14.0 fixes CVE-2026-34993 and CVE-2026-47265
     "idna>=3.15",
     "regex>=2024.11.6",
     "rich>=13.0.0",
@@ -90,6 +90,7 @@ tokens = [
 ]
 mcp = [
     "mcp>=1.0.0",
+    "pyjwt>=2.13.0",
     "python-multipart>=0.0.27",
     "starlette>=1.0.1",
 ]
@@ -102,6 +103,7 @@ all = [
     "trafilatura>=1.12.0",
     "tiktoken>=0.7.0",
     "mcp>=1.0.0",
+    "pyjwt>=2.13.0",
     "python-multipart>=0.0.27",
     "starlette>=1.0.1",
 ]

{docpull-3.0.2 → docpull-4.0.0}/src/docpull/__init__.py RENAMED Viewed

@@ -14,7 +14,7 @@ Usage:
             print(event)
 """
-__version__ = "3.0.2"
+__version__ = "4.0.0"
 from .cache import CacheManager, StreamingDeduplicator
 from .conversion.chunking import Chunk, TokenCounter, chunk_markdown

{docpull-3.0.2 → docpull-4.0.0}/src/docpull/cache/manager.py RENAMED Viewed

@@ -197,45 +197,6 @@ class CacheManager:
             content = content.encode("utf-8")
         return hashlib.sha256(content).hexdigest()
-    def has_changed(
-        self,
-        url: str,
-        content: str | None = None,
-        etag: str | None = None,
-        last_modified: str | None = None,
-    ) -> bool:
-        """Check if content has changed since last fetch.
-        Args:
-            url: URL to check
-            content: Current content (for checksum comparison)
-            etag: HTTP ETag header
-            last_modified: HTTP Last-Modified header
-        Returns:
-            True if content has changed or is new
-        """
-        if url not in self.manifest:
-            return True  # New URL
-        cached = self.manifest[url]
-        # Check ETag first (most reliable)
-        if etag and "etag" in cached:
-            return bool(etag != cached["etag"])
-        # Check Last-Modified
-        if last_modified and "last_modified" in cached:
-            return bool(last_modified != cached["last_modified"])
-        # Check content checksum
-        if content and "checksum" in cached:
-            current_checksum = self.compute_checksum(content)
-            return bool(current_checksum != cached["checksum"])
-        # Can't determine, assume changed
-        return True
     def update_cache(
         self,
         url: str,
@@ -302,14 +263,6 @@ class CacheManager:
         """
         return self._state.fetched_urls.copy()
-    def get_failed_urls(self) -> set[str]:
-        """Get set of URLs that failed to fetch.
-        Returns:
-            Set of failed URLs (copy to prevent mutation)
-        """
-        return self._state.failed_urls.copy()
     def start_session(self) -> None:
         """Start a new fetch session.
@@ -319,30 +272,6 @@ class CacheManager:
         self._state.last_run = utc_now_iso()
         self._state_dirty = True
-    def clear_state(self) -> None:
-        """Clear incremental state (for fresh start).
-        Note:
-            This immediately flushes to disk.
-        """
-        self._state = _InternalState()
-        self._state_dirty = True
-        self.flush()
-        logger.info("Cleared incremental state")
-    def get_cache_stats(self) -> dict[str, str | int | None]:
-        """Get cache statistics.
-        Returns:
-            Dict with cache stats
-        """
-        return {
-            "cached_urls": len(self.manifest),
-            "fetched_urls": len(self._state.fetched_urls),
-            "failed_urls": len(self._state.failed_urls),
-            "last_run": self._state.last_run,
-        }
     def evict_expired(self, ttl_days: int | None = None) -> int:
         """Remove cache entries older than TTL.
@@ -378,28 +307,6 @@ class CacheManager:
         return len(to_remove)
-    def is_fetched(self, url: str) -> bool:
-        """Check if URL has been fetched (O(1) lookup).
-        Args:
-            url: URL to check
-        Returns:
-            True if URL was successfully fetched
-        """
-        return url in self._state.fetched_urls
-    def is_failed(self, url: str) -> bool:
-        """Check if URL has failed (O(1) lookup).
-        Args:
-            url: URL to check
-        Returns:
-            True if URL failed to fetch
-        """
-        return url in self._state.failed_urls
     # Resume capability methods
     def save_discovered_urls(self, urls: list[str], start_url: str) -> None:
@@ -483,22 +390,3 @@ class CacheManager:
                 logger.info("Cleared discovered URLs file")
             except Exception as e:
                 logger.warning(f"Could not clear discovered URLs file: {e}")
-    def has_resume_data(self, start_url: str) -> bool:
-        """Check if there is resume data available for the given URL.
-        Args:
-            start_url: The starting URL to check
-        Returns:
-            True if resume data exists and matches the start URL
-        """
-        if not self.discovered_urls_file.exists():
-            return False
-        try:
-            with open(self.discovered_urls_file, encoding="utf-8") as f:
-                data: DiscoveredUrlsState = json.load(f)
-            return data.get("start_url") == start_url
-        except Exception:
-            return False

{docpull-3.0.2 → docpull-4.0.0}/src/docpull/cache/streaming_dedup.py RENAMED Viewed

@@ -90,23 +90,6 @@ class StreamingDeduplicator:
             self._seen[content_hash] = url
             return (True, None)
-    async def is_duplicate(self, content: str | bytes) -> bool:
-        """
-        Check if content has been seen before (read-only).
-        Unlike check_and_register, this doesn't register the content.
-        Useful for checking without committing to save.
-        Args:
-            content: The content to check (str or bytes)
-        Returns:
-            True if content has been seen before
-        """
-        content_hash = self.compute_hash(content)
-        async with self._lock:
-            return content_hash in self._seen
     def get_stats(self) -> dict:
         """
         Get deduplication statistics.

{docpull-3.0.2 → docpull-4.0.0}/src/docpull/conversion/markdown.py RENAMED Viewed

@@ -215,6 +215,17 @@ class FrontmatterBuilder:
         )
     """
+    @staticmethod
+    def _inline(value: Any) -> str:
+        """Collapse CR/LF/NUL so an interpolated value stays on its own YAML line.
+        Page-supplied metadata (JSON-LD ``keywords``, OpenGraph ``article:tag``,
+        etc.) flows into frontmatter. Without this, a newline in a tag/keyword
+        would break out of the list item and inject attacker-chosen top-level
+        keys (e.g. ``draft: true``) into the document frontmatter.
+        """
+        return str(value).replace("\r", " ").replace("\n", " ").replace("\x00", " ")
     def build(
         self,
         title: str | None = None,
@@ -238,28 +249,32 @@ class FrontmatterBuilder:
         if title:
             # Escape quotes in title
-            safe_title = title.replace('"', '\\"')
+            safe_title = self._inline(title).replace('"', '\\"')
             lines.append(f'title: "{safe_title}"')
         if url:
-            lines.append(f"source: {url}")
+            lines.append(f"source: {self._inline(url)}")
         if description:
             # Escape quotes and truncate long descriptions
-            safe_desc = description[:500].replace('"', '\\"')
+            safe_desc = self._inline(description[:500]).replace('"', '\\"')
             lines.append(f'description: "{safe_desc}"')
         for key, value in extra_fields.items():
             if value is not None:
                 if isinstance(value, str):
-                    safe_value = value.replace('"', '\\"')
+                    safe_value = self._inline(value).replace('"', '\\"')
                     lines.append(f'{key}: "{safe_value}"')
                 elif isinstance(value, (list, tuple)):
                     lines.append(f"{key}:")
                     for item in value:
-                        lines.append(f"  - {item}")
+                        # Quote + escape each item so a hostile tag/keyword (from
+                        # page JSON-LD / OpenGraph) stays a single YAML string and
+                        # cannot inject new keys or produce malformed frontmatter.
+                        safe_item = self._inline(item).replace('"', '\\"')
+                        lines.append(f'  - "{safe_item}"')
                 else:
-                    lines.append(f"{key}: {value}")
+                    lines.append(f"{key}: {self._inline(value)}")
         lines.append("---")
         return "\n".join(lines) + "\n\n"

docpull-4.0.0/src/docpull/discovery/_fetch.py ADDED Viewed

@@ -0,0 +1,33 @@
+"""Shared HTML fetch helper for link-discovery components."""
+from __future__ import annotations
+import logging
+from ..http.protocols import HttpClient
+logger = logging.getLogger(__name__)
+async def fetch_html(client: HttpClient, url: str) -> bytes | None:
+    """Fetch ``url`` and return its body iff it is a successful HTML response.
+    Returns ``None`` on network error, non-200 status, or a non-HTML content
+    type. Shared by the crawler and the static/enhanced link extractors so the
+    fetch and content-type gate stay identical across all three.
+    """
+    try:
+        response = await client.get(url, timeout=30.0)
+        if response.status_code != 200:
+            return None
+        content_type = response.content_type.lower()
+        if "text/html" not in content_type and "application/xhtml" not in content_type:
+            return None
+        return response.content
+    except Exception as e:
+        logger.debug(f"Failed to fetch {url}: {e}")
+        return None

{docpull-3.0.2 → docpull-4.0.0}/src/docpull/discovery/crawler.py RENAMED Viewed

@@ -13,6 +13,7 @@ from bs4 import BeautifulSoup
 from ..http.protocols import HttpClient
 from ..security.robots import RobotsChecker
 from ..security.url_validator import UrlValidator
+from ._fetch import fetch_html
 from .filters import DomainFilter, PatternFilter, SeenUrlTracker
 if TYPE_CHECKING:
@@ -114,33 +115,6 @@ class LinkCrawler:
         return links
-    async def _fetch_page(self, url: str) -> bytes | None:
-        """
-        Fetch a page for link extraction.
-        Args:
-            url: URL to fetch
-        Returns:
-            HTML content as bytes, or None if fetch failed
-        """
-        try:
-            response = await self._client.get(url, timeout=30.0)
-            if response.status_code != 200:
-                return None
-            # Only process HTML content
-            content_type = response.content_type.lower()
-            if "text/html" not in content_type and "application/xhtml" not in content_type:
-                return None
-            return response.content
-        except Exception as e:
-            logger.debug(f"Failed to fetch {url}: {e}")
-            return None
     def _should_crawl(self, url: str) -> bool:
         """
         Check if a URL should be crawled.
@@ -225,7 +199,7 @@ class LinkCrawler:
                 links = await self._link_extractor.extract_links(current_url)
             else:
                 # Built-in extraction with separate fetch
-                html = await self._fetch_page(current_url)
+                html = await fetch_html(self._client, current_url)
                 if html is None:
                     continue
                 links = self._extract_links(html, current_url)

{docpull-3.0.2 → docpull-4.0.0}/src/docpull/discovery/link_extractors/enhanced.py RENAMED Viewed

@@ -10,6 +10,7 @@ from urllib.parse import urljoin, urlparse
 from bs4 import BeautifulSoup
 from ...http.protocols import HttpClient
+from .._fetch import fetch_html
 logger = logging.getLogger(__name__)
@@ -104,7 +105,7 @@ class EnhancedLinkExtractor:
             List of absolute URLs found on the page
         """
         if content is None:
-            content = await self._fetch_content(url)
+            content = await fetch_html(self._client, url)
             if content is None:
                 return []
@@ -137,32 +138,6 @@ class EnhancedLinkExtractor:
         return list(links)
-    async def _fetch_content(self, url: str) -> bytes | None:
-        """
-        Fetch page content for link extraction.
-        Args:
-            url: URL to fetch
-        Returns:
-            HTML content as bytes, or None if fetch failed
-        """
-        try:
-            response = await self._client.get(url, timeout=30.0)
-            if response.status_code != 200:
-                return None
-            content_type = response.content_type.lower()
-            if "text/html" not in content_type and "application/xhtml" not in content_type:
-                return None
-            return response.content
-        except Exception as e:
-            logger.debug(f"Failed to fetch {url}: {e}")
-            return None
     def _extract_standard_links(self, soup: BeautifulSoup, base_url: str) -> list[str]:
         """Extract links from standard <a href> tags."""
         links = []

{docpull-3.0.2 → docpull-4.0.0}/src/docpull/discovery/link_extractors/static.py RENAMED Viewed

@@ -8,6 +8,7 @@ from urllib.parse import urljoin, urlparse
 from bs4 import BeautifulSoup
 from ...http.protocols import HttpClient
+from .._fetch import fetch_html
 logger = logging.getLogger(__name__)
@@ -56,39 +57,12 @@ class StaticLinkExtractor:
             List of absolute URLs found on the page
         """
         if content is None:
-            content = await self._fetch_content(url)
+            content = await fetch_html(self._client, url)
             if content is None:
                 return []
         return self._parse_links(content, url)
-    async def _fetch_content(self, url: str) -> bytes | None:
-        """
-        Fetch page content for link extraction.
-        Args:
-            url: URL to fetch
-        Returns:
-            HTML content as bytes, or None if fetch failed
-        """
-        try:
-            response = await self._client.get(url, timeout=30.0)
-            if response.status_code != 200:
-                return None
-            # Only process HTML content
-            content_type = response.content_type.lower()
-            if "text/html" not in content_type and "application/xhtml" not in content_type:
-                return None
-            return response.content
-        except Exception as e:
-            logger.debug(f"Failed to fetch {url}: {e}")
-            return None
     def _parse_links(self, html: bytes, base_url: str) -> list[str]:
         """
         Parse links from HTML content.

{docpull-3.0.2 → docpull-4.0.0}/src/docpull/discovery/sitemap.py RENAMED Viewed

@@ -7,6 +7,7 @@ from collections.abc import AsyncIterator
 from urllib.parse import urlparse
 from defusedxml import ElementTree
+from defusedxml.common import DefusedXmlException
 from ..http.protocols import HttpClient
 from ..security.robots import RobotsChecker
@@ -160,7 +161,7 @@ class SitemapDiscoverer:
         try:
             root = ElementTree.fromstring(content)
-        except ElementTree.ParseError as e:
+        except (ElementTree.ParseError, DefusedXmlException) as e:
             logger.warning(f"Failed to parse sitemap XML: {e}")
             return page_urls, sitemap_urls

{docpull-3.0.2 → docpull-4.0.0}/src/docpull/http/client.py RENAMED Viewed

@@ -234,6 +234,38 @@ class AsyncHttpClient:
         return {key: value for key, value in headers.items() if key.lower() not in self.SENSITIVE_HEADERS}
+    def _next_redirect(
+        self,
+        response: aiohttp.ClientResponse,
+        current_url: str,
+        current_headers: dict[str, str],
+        redirect_count: int,
+        original_url: str,
+    ) -> tuple[str, dict[str, str], int] | None:
+        """Re-validate and follow one redirect hop, shared by GET and HEAD.
+        Returns the updated ``(url, headers, redirect_count)`` when ``response``
+        is a redirect, or ``None`` when it is not. Raises ``ValueError`` once
+        ``MAX_REDIRECTS`` is exceeded. Centralising this keeps GET and HEAD on
+        identical redirect/SSRF re-validation.
+        """
+        location = response.headers.get("Location")
+        if response.status in self.REDIRECT_STATUS_CODES and location:
+            if redirect_count >= self.MAX_REDIRECTS:
+                raise ValueError(f"Too many redirects while fetching {original_url}")
+            redirect_url = self._resolve_redirect_url(current_url, location)
+            new_headers = self._headers_for_url(
+                self._headers_for_redirect(
+                    current_headers,
+                    current_url,
+                    redirect_url,
+                ),
+                redirect_url,
+            )
+            return redirect_url, new_headers, redirect_count + 1
+        return None
     async def __aenter__(self) -> AsyncHttpClient:
         """Enter async context and create session."""
         resolver: AbstractResolver | None = None
@@ -382,22 +414,11 @@ class AsyncHttpClient:
                             allow_redirects=False,
                         ) as response,
                     ):
-                        location = response.headers.get("Location")
-                        if response.status in self.REDIRECT_STATUS_CODES and location:
-                            if redirect_count >= self.MAX_REDIRECTS:
-                                raise ValueError(f"Too many redirects while fetching {url}")
-                            redirect_url = self._resolve_redirect_url(current_url, location)
-                            current_headers = self._headers_for_url(
-                                self._headers_for_redirect(
-                                    current_headers,
-                                    current_url,
-                                    redirect_url,
-                                ),
-                                redirect_url,
-                            )
-                            current_url = redirect_url
-                            redirect_count += 1
+                        redirect = self._next_redirect(
+                            response, current_url, current_headers, redirect_count, url
+                        )
+                        if redirect is not None:
+                            current_url, current_headers, redirect_count = redirect
                             continue
                         if response.status in self.RETRYABLE_STATUS_CODES:
@@ -504,22 +525,9 @@ class AsyncHttpClient:
                     allow_redirects=False,
                 ) as response,
             ):
-                location = response.headers.get("Location")
-                if response.status in self.REDIRECT_STATUS_CODES and location:
-                    if redirect_count >= self.MAX_REDIRECTS:
-                        raise ValueError(f"Too many redirects while fetching {url}")
-                    redirect_url = self._resolve_redirect_url(current_url, location)
-                    current_headers = self._headers_for_url(
-                        self._headers_for_redirect(
-                            current_headers,
-                            current_url,
-                            redirect_url,
-                        ),
-                        redirect_url,
-                    )
-                    current_url = redirect_url
-                    redirect_count += 1
+                redirect = self._next_redirect(response, current_url, current_headers, redirect_count, url)
+                if redirect is not None:
+                    current_url, current_headers, redirect_count = redirect
                     continue
                 return HttpResponse(
@@ -529,17 +537,3 @@ class AsyncHttpClient:
                     headers=dict(response.headers),
                     url=str(response.url),
                 )
-    def decode_content(self, response: HttpResponse) -> str:
-        """
-        Decode response content to string.
-        Convenience method that uses intelligent encoding detection.
-        Args:
-            response: HttpResponse to decode
-        Returns:
-            Decoded string content
-        """
-        return self._decode_content(response.content, response.content_type)

{docpull-3.0.2 → docpull-4.0.0}/src/docpull/mcp/tools.py RENAMED Viewed

@@ -452,12 +452,22 @@ def grep_docs(
     for root in roots:
         if not root.exists() or not root.is_dir():
             continue
+        resolved_root = root.resolve()
         for file in root.rglob("*.md"):
             if time.monotonic() > deadline:
                 timed_out = True
                 break
+            if file.is_symlink():
+                logger.debug("skip symlinked doc file: %s", file)
+                continue
+            resolved_file = file.resolve()
+            try:
+                resolved_file.relative_to(resolved_root)
+            except ValueError:
+                logger.debug("skip doc file outside library root: %s", file)
+                continue
             try:
-                lines = file.read_text(errors="replace").splitlines()
+                lines = resolved_file.read_text(errors="replace").splitlines()
             except OSError as err:
                 logger.debug("skip %s: %s", file, err)
                 continue

{docpull-3.0.2 → docpull-4.0.0}/src/docpull/models/config.py RENAMED Viewed

@@ -396,8 +396,3 @@ class DocpullConfig(BaseModel):
         data = yaml.safe_load(yaml_str)
         return cls.model_validate(data)
-    @classmethod
-    def from_yaml_file(cls, path: Path) -> DocpullConfig:
-        """Load config from YAML file."""
-        return cls.from_yaml(path.read_text())

{docpull-3.0.2 → docpull-4.0.0}/src/docpull/pipeline/base.py RENAMED Viewed

@@ -181,16 +181,3 @@ class FetchPipeline:
                 break
         return ctx
-    def add_step(self, step: FetchStep) -> FetchPipeline:
-        """
-        Add a step to the pipeline (fluent API).
-        Args:
-            step: The step to add
-        Returns:
-            Self for chaining
-        """
-        self.steps.append(step)
-        return self

docpull 3.0.2__tar.gz → 4.0.0__tar.gz

docpull 3.0.2__tar.gz → 4.0.0__tar.gz