PyPI - docpull - Versions diffs - 2.0.0__tar.gz → 2.2.0__tar.gz - Mend

docpull 2.0.0__tar.gz → 2.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69) hide show

{docpull-2.0.0/src/docpull.egg-info → docpull-2.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docpull
-Version: 2.0.0
+Version: 2.2.0
 Summary: Pull documentation from the web and convert to clean markdown
 Author-email: Zachary Roth <support@raintree.technology>
 Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -36,7 +36,7 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Programming Language :: Python :: 3.14
 Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Typing :: Typed
-Requires-Python: >=3.9
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests>=2.31.0
@@ -81,6 +81,7 @@ Dynamic: license-file
 [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
 [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
+[![Downloads](https://pepy.tech/badge/docpull)](https://pepy.tech/project/docpull)
 [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
 ## Install

{docpull-2.0.0 → docpull-2.2.0}/README.md RENAMED Viewed

@@ -4,6 +4,7 @@
 [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
 [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
+[![Downloads](https://pepy.tech/badge/docpull)](https://pepy.tech/project/docpull)
 [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
 ## Install

{docpull-2.0.0 → docpull-2.2.0}/pyproject.toml RENAMED Viewed

@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "docpull"
-version = "2.0.0"
+version = "2.2.0"
 dynamic = []
 description = "Pull documentation from the web and convert to clean markdown"
 readme = {file = "README.md", content-type = "text/markdown"}
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 license = "MIT"
 license-files = ["LICENSE"]
 authors = [
@@ -137,7 +137,7 @@ select = ["E", "F", "W", "I", "N", "UP", "B", "A", "C4", "SIM"]
 ignore = ["A003"]  # Allow 'type' and 'format' as field names in data models
 [tool.mypy]
-python_version = "3.9"
+python_version = "3.10"
 warn_return_any = true
 warn_unused_configs = true
 disallow_untyped_defs = true

{docpull-2.0.0 → docpull-2.2.0}/src/docpull/__init__.py RENAMED Viewed

@@ -14,7 +14,7 @@ Usage:
             print(event)
 """
-__version__ = "2.0.0"
+__version__ = "2.2.0"
 from .cache import CacheManager, StreamingDeduplicator
 from .core.fetcher import Fetcher, fetch_blocking

{docpull-2.0.0 → docpull-2.2.0}/src/docpull/cache/manager.py RENAMED Viewed

@@ -1,11 +1,13 @@
 """Cache management for update detection and incremental fetching."""
+from __future__ import annotations
 import hashlib
 import json
 import logging
 from datetime import datetime, timedelta
 from pathlib import Path
-from typing import Optional, TypedDict, Union
+from typing import TypedDict
 logger = logging.getLogger(__name__)
@@ -29,7 +31,15 @@ class CacheState(TypedDict, total=False):
     fetched_urls: list[str]
     failed_urls: list[str]
-    last_run: Optional[str]
+    last_run: str | None
+class DiscoveredUrlsState(TypedDict, total=False):
+    """Type for discovered URLs persistence (for resume capability)."""
+    start_url: str
+    discovered_at: str
+    urls: list[str]
 class _InternalState:
@@ -38,10 +48,10 @@ class _InternalState:
     def __init__(self) -> None:
         self.fetched_urls: set[str] = set()
         self.failed_urls: set[str] = set()
-        self.last_run: Optional[str] = None
+        self.last_run: str | None = None
     @classmethod
-    def from_cache_state(cls, state: CacheState) -> "_InternalState":
+    def from_cache_state(cls, state: CacheState) -> _InternalState:
         """Create internal state from serialized CacheState."""
         internal = cls()
         internal.fetched_urls = set(state.get("fetched_urls", []))
@@ -68,7 +78,7 @@ class CacheManager:
     - Consistent hashing: Uses bytes input for SHA-256 computation
     """
-    def __init__(self, cache_dir: Path, ttl_days: Optional[int] = None):
+    def __init__(self, cache_dir: Path, ttl_days: int | None = None):
         """Initialize cache manager.
         Args:
@@ -81,6 +91,7 @@ class CacheManager:
         self.manifest_file = self.cache_dir / "manifest.json"
         self.state_file = self.cache_dir / "state.json"
+        self.discovered_urls_file = self.cache_dir / "discovered_urls.json"
         self.manifest: dict[str, ManifestEntry] = self._load_manifest()
         self._state: _InternalState = _InternalState.from_cache_state(self._load_state())
@@ -157,21 +168,21 @@ class CacheManager:
         self._save_manifest()
         self._save_state()
-    def __enter__(self) -> "CacheManager":
+    def __enter__(self) -> CacheManager:
         """Context manager entry."""
         return self
     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc_val: Optional[BaseException],
-        exc_tb: Optional[object],
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: object | None,
     ) -> None:
         """Context manager exit - auto-flush on exit."""
         self.flush()
     @staticmethod
-    def compute_checksum(content: Union[str, bytes]) -> str:
+    def compute_checksum(content: str | bytes) -> str:
         """Compute SHA-256 checksum of content.
         Args:
@@ -187,9 +198,9 @@ class CacheManager:
     def has_changed(
         self,
         url: str,
-        content: Optional[str] = None,
-        etag: Optional[str] = None,
-        last_modified: Optional[str] = None,
+        content: str | None = None,
+        etag: str | None = None,
+        last_modified: str | None = None,
     ) -> bool:
         """Check if content has changed since last fetch.
@@ -226,10 +237,10 @@ class CacheManager:
     def update_cache(
         self,
         url: str,
-        content: Union[str, bytes],
+        content: str | bytes,
         file_path: Path,
-        etag: Optional[str] = None,
-        last_modified: Optional[str] = None,
+        etag: str | None = None,
+        last_modified: str | None = None,
     ) -> None:
         """Update cache entry for a URL.
@@ -317,7 +328,7 @@ class CacheManager:
         self.flush()
         logger.info("Cleared incremental state")
-    def get_cache_stats(self) -> dict[str, Union[str, int, None]]:
+    def get_cache_stats(self) -> dict[str, str | int | None]:
         """Get cache statistics.
         Returns:
@@ -330,7 +341,7 @@ class CacheManager:
             "last_run": self._state.last_run,
         }
-    def evict_expired(self, ttl_days: Optional[int] = None) -> int:
+    def evict_expired(self, ttl_days: int | None = None) -> int:
         """Remove cache entries older than TTL.
         Args:
@@ -386,3 +397,106 @@ class CacheManager:
             True if URL failed to fetch
         """
         return url in self._state.failed_urls
+    # Resume capability methods
+    def save_discovered_urls(self, urls: list[str], start_url: str) -> None:
+        """Save discovered URLs for resume capability.
+        Args:
+            urls: List of discovered URLs
+            start_url: The starting URL for this crawl
+        Note:
+            This is written immediately (not batched) to ensure
+            URLs are persisted before fetching begins.
+        """
+        data: DiscoveredUrlsState = {
+            "start_url": start_url,
+            "discovered_at": datetime.now().isoformat(),
+            "urls": urls,
+        }
+        try:
+            with open(self.discovered_urls_file, "w", encoding="utf-8") as f:
+                json.dump(data, f, indent=2, ensure_ascii=False)
+            logger.info(f"Saved {len(urls)} discovered URLs for resume capability")
+        except Exception as e:
+            logger.error(f"Could not save discovered URLs: {e}")
+    def load_discovered_urls(self, start_url: str) -> list[str] | None:
+        """Load previously discovered URLs if they match the start URL.
+        Args:
+            start_url: The starting URL to match
+        Returns:
+            List of discovered URLs if found and matching, None otherwise
+        """
+        if not self.discovered_urls_file.exists():
+            return None
+        try:
+            with open(self.discovered_urls_file, encoding="utf-8") as f:
+                data: DiscoveredUrlsState = json.load(f)
+            if data.get("start_url") != start_url:
+                logger.info("Discovered URLs file exists but start_url doesn't match")
+                return None
+            urls = data.get("urls", [])
+            logger.info(f"Loaded {len(urls)} discovered URLs from previous run")
+            return urls
+        except Exception as e:
+            logger.warning(f"Could not load discovered URLs: {e}")
+            return None
+    def get_pending_urls(self, start_url: str) -> list[str] | None:
+        """Get URLs that were discovered but not yet fetched.
+        Args:
+            start_url: The starting URL to match
+        Returns:
+            List of pending URLs, or None if no resume data available
+        """
+        discovered = self.load_discovered_urls(start_url)
+        if discovered is None:
+            return None
+        # Filter out already-fetched URLs
+        fetched = self.get_fetched_urls()
+        pending = [url for url in discovered if url not in fetched]
+        logger.info(f"Found {len(pending)} pending URLs (out of {len(discovered)} discovered)")
+        return pending
+    def clear_discovered_urls(self) -> None:
+        """Clear discovered URLs file (called on successful completion).
+        This should be called after a successful fetch to clean up
+        the resume state.
+        """
+        if self.discovered_urls_file.exists():
+            try:
+                self.discovered_urls_file.unlink()
+                logger.info("Cleared discovered URLs file")
+            except Exception as e:
+                logger.warning(f"Could not clear discovered URLs file: {e}")
+    def has_resume_data(self, start_url: str) -> bool:
+        """Check if there is resume data available for the given URL.
+        Args:
+            start_url: The starting URL to check
+        Returns:
+            True if resume data exists and matches the start URL
+        """
+        if not self.discovered_urls_file.exists():
+            return False
+        try:
+            with open(self.discovered_urls_file, encoding="utf-8") as f:
+                data: DiscoveredUrlsState = json.load(f)
+            return data.get("start_url") == start_url
+        except Exception:
+            return False

{docpull-2.0.0 → docpull-2.2.0}/src/docpull/cache/streaming_dedup.py RENAMED Viewed

@@ -1,8 +1,9 @@
 """Streaming deduplication for real-time duplicate detection during fetch."""
+from __future__ import annotations
 import asyncio
 import hashlib
-from typing import Optional, Union
 class StreamingDeduplicator:
@@ -38,7 +39,7 @@ class StreamingDeduplicator:
         self._duplicates_found: int = 0
     @staticmethod
-    def compute_hash(content: Union[str, bytes]) -> str:
+    def compute_hash(content: str | bytes) -> str:
         """
         Compute SHA-256 hash of content.
@@ -59,8 +60,8 @@ class StreamingDeduplicator:
     async def check_and_register(
         self,
         url: str,
-        content: Union[str, bytes],
-    ) -> tuple[bool, Optional[str]]:
+        content: str | bytes,
+    ) -> tuple[bool, str | None]:
         """
         Check if content is a duplicate and register if new.
@@ -89,7 +90,7 @@ class StreamingDeduplicator:
             self._seen[content_hash] = url
             return (True, None)
-    async def is_duplicate(self, content: Union[str, bytes]) -> bool:
+    async def is_duplicate(self, content: str | bytes) -> bool:
         """
         Check if content has been seen before (read-only).

{docpull-2.0.0 → docpull-2.2.0}/src/docpull/cli.py RENAMED Viewed

@@ -1,10 +1,11 @@
 """Command-line interface for docpull."""
+from __future__ import annotations
 import argparse
 import asyncio
 import sys
 from pathlib import Path
-from typing import Optional
 # Check if --doctor flag is present before checking dependencies
 if "--doctor" in sys.argv:
@@ -43,7 +44,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
 from . import __version__
 from .core.fetcher import Fetcher
 from .models.config import DocpullConfig, ProfileName
-from .models.events import EventType
+from .models.events import EventType, SkipReason
 def create_parser() -> argparse.ArgumentParser:
@@ -106,6 +107,13 @@ Examples:
         default=None,
         help="Output directory (default: ./docs)",
     )
+    parser.add_argument(
+        "--format",
+        "-f",
+        choices=["markdown", "json", "sqlite"],
+        default="markdown",
+        help="Output format (default: markdown)",
+    )
     # Crawl settings
     crawl_group = parser.add_argument_group("crawl settings")
@@ -153,6 +161,11 @@ Examples:
         dest="javascript",
         help="Enable JavaScript rendering (requires Playwright)",
     )
+    crawl_group.add_argument(
+        "--adaptive-rate-limit",
+        action="store_true",
+        help="Automatically adjust rate limits based on server responses",
+    )
     # Content filtering
     filter_group = parser.add_argument_group("content filtering")
@@ -188,6 +201,33 @@ Examples:
         help="Maximum retry attempts",
     )
+    # Authentication settings
+    auth_group = parser.add_argument_group("authentication")
+    auth_group.add_argument(
+        "--auth-bearer",
+        type=str,
+        metavar="TOKEN",
+        help="Bearer token for authentication",
+    )
+    auth_group.add_argument(
+        "--auth-basic",
+        type=str,
+        metavar="USER:PASS",
+        help="Basic auth credentials (username:password)",
+    )
+    auth_group.add_argument(
+        "--auth-cookie",
+        type=str,
+        metavar="COOKIE",
+        help="Cookie string for authentication",
+    )
+    auth_group.add_argument(
+        "--auth-header",
+        nargs=2,
+        metavar=("NAME", "VALUE"),
+        help="Custom auth header (name value)",
+    )
     # Cache settings
     cache_group = parser.add_argument_group("cache settings")
     cache_group.add_argument(
@@ -214,6 +254,11 @@ Examples:
         action="store_true",
         help="Re-fetch pages even if unchanged",
     )
+    cache_group.add_argument(
+        "--resume",
+        action="store_true",
+        help="Resume from previous interrupted run (requires --cache)",
+    )
     # Output control
     output_group = parser.add_argument_group("output control")
@@ -222,6 +267,11 @@ Examples:
         action="store_true",
         help="Show what would be fetched without downloading",
     )
+    output_group.add_argument(
+        "--preview-urls",
+        action="store_true",
+        help="List discovered URLs without fetching",
+    )
     output_group.add_argument(
         "--verbose",
         "-v",
@@ -262,8 +312,13 @@ def run_fetcher(args: argparse.Namespace) -> int:
     }
     # Output settings
+    output_kwargs: dict = {}
     if args.output_dir:
-        config_kwargs["output"] = {"directory": args.output_dir}
+        output_kwargs["directory"] = args.output_dir
+    if args.format:
+        output_kwargs["format"] = args.format
+    if output_kwargs:
+        config_kwargs["output"] = output_kwargs
     # Crawl settings
     crawl_kwargs: dict = {}
@@ -277,6 +332,8 @@ def run_fetcher(args: argparse.Namespace) -> int:
         crawl_kwargs["rate_limit"] = args.rate_limit
     if args.javascript:
         crawl_kwargs["javascript"] = True
+    if args.adaptive_rate_limit:
+        crawl_kwargs["adaptive_rate_limit"] = True
     if args.include_paths:
         crawl_kwargs["include_paths"] = args.include_paths
     if args.exclude_paths:
@@ -304,9 +361,33 @@ def run_fetcher(args: argparse.Namespace) -> int:
     if network_kwargs:
         config_kwargs["network"] = network_kwargs
+    # Authentication settings
+    auth_kwargs: dict = {}
+    if args.auth_bearer:
+        auth_kwargs["type"] = "bearer"
+        auth_kwargs["token"] = args.auth_bearer
+    elif args.auth_basic:
+        auth_kwargs["type"] = "basic"
+        if ":" in args.auth_basic:
+            username, password = args.auth_basic.split(":", 1)
+            auth_kwargs["username"] = username
+            auth_kwargs["password"] = password
+        else:
+            console.print("[red]Error:[/red] --auth-basic requires format username:password")
+            return 1
+    elif args.auth_cookie:
+        auth_kwargs["type"] = "cookie"
+        auth_kwargs["cookie"] = args.auth_cookie
+    elif args.auth_header:
+        auth_kwargs["type"] = "header"
+        auth_kwargs["header_name"] = args.auth_header[0]
+        auth_kwargs["header_value"] = args.auth_header[1]
+    if auth_kwargs:
+        config_kwargs["auth"] = auth_kwargs
     # Cache settings
     cache_kwargs: dict = {}
-    if args.cache:
+    if args.cache or args.resume:
         cache_kwargs["enabled"] = True
     if args.cache_dir:
         cache_kwargs["directory"] = args.cache_dir
@@ -314,6 +395,8 @@ def run_fetcher(args: argparse.Namespace) -> int:
         cache_kwargs["ttl_days"] = args.cache_ttl
     if args.no_skip_unchanged:
         cache_kwargs["skip_unchanged"] = False
+    if args.resume:
+        cache_kwargs["resume"] = True
     if cache_kwargs:
         config_kwargs["cache"] = cache_kwargs
@@ -338,9 +421,23 @@ def run_fetcher(args: argparse.Namespace) -> int:
         try:
             async with Fetcher(config) as fetcher:
+                # Handle --preview-urls mode
+                if args.preview_urls:
+                    urls = await fetcher.discover()
+                    console.print(f"[bold]Discovered {len(urls)} URLs:[/bold]")
+                    for url in urls:
+                        console.print(f"  {url}")
+                    return 0
+                # Track skip reasons for summary
+                from collections import defaultdict
+                skip_counts: dict[SkipReason, int] = defaultdict(int)
                 if args.quiet:
-                    async for _ in fetcher.run():
-                        pass
+                    async for event in fetcher.run():
+                        if event.type == EventType.FETCH_SKIPPED and event.skip_reason:
+                            skip_counts[event.skip_reason] += 1
                 else:
                     with Progress(
                         SpinnerColumn(),
@@ -353,6 +450,10 @@ def run_fetcher(args: argparse.Namespace) -> int:
                         async for event in fetcher.run():
                             if event.type == EventType.STARTED:
                                 progress.update(task, description=f"[cyan]{event.message}")
+                            elif event.type == EventType.RESUMED:
+                                progress.update(
+                                    task, description=f"[yellow]Resuming with {event.total} pending URLs"
+                                )
                             elif event.type == EventType.DISCOVERY_STARTED:
                                 progress.update(task, description="[cyan]Discovering URLs...")
                             elif event.type == EventType.DISCOVERY_COMPLETE:
@@ -362,6 +463,12 @@ def run_fetcher(args: argparse.Namespace) -> int:
                                     task,
                                     description=f"[cyan]Fetching {event.current}/{event.total}: {event.url}",
                                 )
+                            elif event.type == EventType.FETCH_SKIPPED:
+                                if event.skip_reason:
+                                    skip_counts[event.skip_reason] += 1
+                                if args.verbose:
+                                    reason = event.skip_reason.value if event.skip_reason else "unknown"
+                                    console.print(f"[dim]Skipped: {event.url} ({reason})[/dim]")
                             elif event.type == EventType.FETCH_FAILED:
                                 console.print(f"[red]Failed:[/red] {event.url} - {event.error}")
                             elif event.type == EventType.COMPLETED:
@@ -378,6 +485,13 @@ def run_fetcher(args: argparse.Namespace) -> int:
                     console.print(f"  Pages failed: {stats.pages_failed}")
                     console.print(f"  Duration: {stats.duration_seconds:.1f}s")
+                    # Print skip reason summary if there were skips
+                    if skip_counts:
+                        console.print()
+                        console.print("[bold]Skip Summary:[/bold]")
+                        for reason, count in sorted(skip_counts.items(), key=lambda x: -x[1]):
+                            console.print(f"  {reason.value}: {count}")
                 return 0 if stats.pages_failed == 0 else 1
         except Exception as e:
@@ -391,7 +505,7 @@ def run_fetcher(args: argparse.Namespace) -> int:
     return asyncio.run(run())
-def main(argv: Optional[list[str]] = None) -> int:
+def main(argv: list[str] | None = None) -> int:
     """Main entry point."""
     parser = create_parser()
     args = parser.parse_args(argv)

{docpull-2.0.0 → docpull-2.2.0}/src/docpull/concurrency/browser_pool.py RENAMED Viewed

@@ -64,7 +64,7 @@ class BrowserContextPool:
         """
         if not PLAYWRIGHT_AVAILABLE:
             raise ImportError(
-                "Playwright is required for JavaScript rendering. " "Install with: pip install docpull[js]"
+                "Playwright is required for JavaScript rendering. Install with: pip install docpull[js]"
             )
         self._max_contexts = max_contexts
@@ -280,8 +280,7 @@ class BrowserFetcher:
                 if response is None or response.status >= 400:
                     logger.warning(
-                        f"Browser fetch failed for {url}: "
-                        f"status={response.status if response else 'None'}"
+                        f"Browser fetch failed for {url}: status={response.status if response else 'None'}"
                     )
                     return None

{docpull-2.0.0 → docpull-2.2.0}/src/docpull/conversion/extractor.py RENAMED Viewed

@@ -1,8 +1,9 @@
 """Main content extraction from HTML pages."""
+from __future__ import annotations
 import logging
 import re
-from typing import Optional
 from urllib.parse import urljoin, urlparse
 from bs4 import BeautifulSoup, Tag
@@ -103,8 +104,8 @@ class MainContentExtractor:
     def __init__(
         self,
-        content_selectors: Optional[list[str]] = None,
-        remove_selectors: Optional[list[str]] = None,
+        content_selectors: list[str] | None = None,
+        remove_selectors: list[str] | None = None,
         preserve_images: bool = True,
         preserve_code_blocks: bool = True,
     ):
@@ -146,7 +147,7 @@ class MainContentExtractor:
             text = html.decode("utf-8", errors="replace")
         return BeautifulSoup(text, "html.parser")
-    def _find_main_content(self, soup: BeautifulSoup) -> Optional[Tag]:
+    def _find_main_content(self, soup: BeautifulSoup) -> Tag | None:
         """Find the main content element using selectors."""
         for selector in self._content_selectors:
             element = soup.select_one(selector)

{docpull-2.0.0 → docpull-2.2.0}/src/docpull/conversion/markdown.py RENAMED Viewed

@@ -1,8 +1,10 @@
 """HTML to Markdown conversion."""
+from __future__ import annotations
 import logging
 import re
-from typing import Any, Optional
+from typing import Any
 from urllib.parse import urljoin
 import html2text
@@ -153,9 +155,9 @@ class FrontmatterBuilder:
     def build(
         self,
-        title: Optional[str] = None,
-        url: Optional[str] = None,
-        description: Optional[str] = None,
+        title: str | None = None,
+        url: str | None = None,
+        description: str | None = None,
         **extra_fields: Any,
     ) -> str:
         """

docpull 2.0.0__tar.gz → 2.2.0__tar.gz

docpull 2.0.0__tar.gz → 2.2.0__tar.gz