PyPI - docpull - Versions diffs - 1.0.1__py3-none-any.whl - Mend

docpull 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

docpull/__init__.py +29 -0
docpull/__main__.py +6 -0
docpull/cli.py +440 -0
docpull/config.py +199 -0
docpull/fetchers/__init__.py +23 -0
docpull/fetchers/async_fetcher.py +322 -0
docpull/fetchers/base.py +450 -0
docpull/fetchers/bun.py +59 -0
docpull/fetchers/d3.py +211 -0
docpull/fetchers/generic.py +255 -0
docpull/fetchers/generic_async.py +282 -0
docpull/fetchers/nextjs.py +50 -0
docpull/fetchers/parallel_base.py +93 -0
docpull/fetchers/plaid.py +92 -0
docpull/fetchers/react.py +59 -0
docpull/fetchers/stripe.py +60 -0
docpull/fetchers/tailwind.py +59 -0
docpull/fetchers/turborepo.py +57 -0
docpull/profiles/__init__.py +70 -0
docpull/profiles/base.py +64 -0
docpull/profiles/bun.py +14 -0
docpull/profiles/d3.py +17 -0
docpull/profiles/nextjs.py +15 -0
docpull/profiles/plaid.py +16 -0
docpull/profiles/react.py +14 -0
docpull/profiles/stripe.py +14 -0
docpull/profiles/tailwind.py +14 -0
docpull/profiles/turborepo.py +14 -0
docpull/py.typed +0 -0
docpull/utils/__init__.py +6 -0
docpull/utils/file_utils.py +97 -0
docpull/utils/logging_config.py +54 -0
docpull-1.0.1.dist-info/METADATA +440 -0
docpull-1.0.1.dist-info/RECORD +38 -0
docpull-1.0.1.dist-info/WHEEL +5 -0
docpull-1.0.1.dist-info/entry_points.txt +2 -0
docpull-1.0.1.dist-info/licenses/LICENSE +21 -0
docpull-1.0.1.dist-info/top_level.txt +1 -0

docpull/fetchers/tailwind.py ADDED Viewed

@@ -0,0 +1,59 @@
+"""Tailwind CSS documentation fetcher."""
+import logging
+from pathlib import Path
+from typing import Optional
+from .parallel_base import ParallelFetcher
+class TailwindFetcher(ParallelFetcher):
+    """Fetcher for Tailwind CSS documentation."""
+    def __init__(
+        self,
+        output_dir: Path,
+        rate_limit: float = 0.2,
+        skip_existing: bool = True,
+        logger: Optional[logging.Logger] = None,
+        max_workers: int = 15,
+    ) -> None:
+        """
+        Initialize Tailwind fetcher.
+        Args:
+            output_dir: Directory to save documentation
+            rate_limit: Seconds between requests
+            skip_existing: Skip existing files
+            logger: Logger instance
+            max_workers: Number of concurrent workers
+        """
+        super().__init__(output_dir, rate_limit, skip_existing, logger, max_workers)
+        self.sitemap_url = "https://tailwindcss.com/sitemap.xml"
+        self.base_url = "https://tailwindcss.com/"
+    def fetch(self) -> None:
+        """Fetch all Tailwind CSS documentation."""
+        self.logger.info("Fetching Tailwind CSS documentation")
+        urls = self.fetch_sitemap(self.sitemap_url)
+        if not urls:
+            self.logger.error("No URLs found in Tailwind sitemap")
+            return
+        doc_urls = self.filter_urls(
+            urls, include_patterns=["/docs/"], exclude_patterns=["/blog/", "/resources/", "/showcase/"]
+        )
+        self.logger.info(f"Found {len(doc_urls)} documentation URLs")
+        url_output_pairs = []
+        for url in doc_urls:
+            filepath = self.create_output_path(url, self.base_url, "tailwind", strip_prefix="docs")
+            url_output_pairs.append((url, filepath))
+        self.fetch_urls_parallel(url_output_pairs)
+        self.logger.info("Tailwind CSS documentation fetch complete")
+        self.print_stats()

docpull/fetchers/turborepo.py ADDED Viewed

@@ -0,0 +1,57 @@
+"""Turborepo documentation fetcher."""
+import logging
+from pathlib import Path
+from typing import Optional
+from .parallel_base import ParallelFetcher
+class TurborepoFetcher(ParallelFetcher):
+    """Fetcher for Turborepo documentation."""
+    def __init__(
+        self,
+        output_dir: Path,
+        rate_limit: float = 0.2,
+        skip_existing: bool = True,
+        logger: Optional[logging.Logger] = None,
+        max_workers: int = 15,
+    ) -> None:
+        """
+        Initialize Turborepo fetcher.
+        Args:
+            output_dir: Directory to save documentation
+            rate_limit: Seconds between requests
+            skip_existing: Skip existing files
+            logger: Logger instance
+            max_workers: Number of concurrent workers
+        """
+        super().__init__(output_dir, rate_limit, skip_existing, logger, max_workers)
+        self.sitemap_url = "https://turborepo.com/sitemap.xml"
+        self.base_url = "https://turborepo.com/"
+    def fetch(self) -> None:
+        """Fetch all Turborepo documentation."""
+        self.logger.info("Fetching Turborepo documentation")
+        urls = self.fetch_sitemap(self.sitemap_url)
+        if not urls:
+            self.logger.error("No URLs found in Turborepo sitemap")
+            return
+        doc_urls = self.filter_urls(urls, include_patterns=["/docs/"], exclude_patterns=["/blog/"])
+        self.logger.info(f"Found {len(doc_urls)} documentation URLs")
+        url_output_pairs = []
+        for url in doc_urls:
+            filepath = self.create_output_path(url, self.base_url, "turborepo", strip_prefix="docs")
+            url_output_pairs.append((url, filepath))
+        self.fetch_urls_parallel(url_output_pairs)
+        self.logger.info("Turborepo documentation fetch complete")
+        self.print_stats()

docpull/profiles/__init__.py ADDED Viewed

@@ -0,0 +1,70 @@
+"""Site profiles for documentation scraping."""
+from typing import Optional
+from .base import SiteProfile
+from .bun import BUN_PROFILE
+from .d3 import D3_PROFILE
+from .nextjs import NEXTJS_PROFILE
+from .plaid import PLAID_PROFILE
+from .react import REACT_PROFILE
+from .stripe import STRIPE_PROFILE
+from .tailwind import TAILWIND_PROFILE
+from .turborepo import TURBOREPO_PROFILE
+# Registry of all available profiles, keyed by lower-case profile name.
+# get_profile_by_name() looks names up here after lower-casing them, and
+# get_profile_for_url() walks the values in insertion order and returns the
+# first profile whose matches_url() accepts the URL.
+PROFILES = {
+    "stripe": STRIPE_PROFILE,
+    "plaid": PLAID_PROFILE,
+    "nextjs": NEXTJS_PROFILE,
+    "react": REACT_PROFILE,
+    "tailwind": TAILWIND_PROFILE,
+    "bun": BUN_PROFILE,
+    "d3": D3_PROFILE,
+    "turborepo": TURBOREPO_PROFILE,
+}
+def get_profile_for_url(url: str) -> Optional[SiteProfile]:
+    """
+    Find a matching profile for a given URL.
+    Args:
+        url: URL to match against profiles
+    Returns:
+        Matching SiteProfile or None if no match
+    """
+    for profile in PROFILES.values():
+        if profile.matches_url(url):
+            return profile
+    return None
+def get_profile_by_name(name: str) -> Optional[SiteProfile]:
+    """
+    Get a profile by name.
+    Args:
+        name: Profile name (e.g., 'stripe', 'plaid')
+    Returns:
+        SiteProfile or None if not found
+    """
+    return PROFILES.get(name.lower())
+# Public API of the profiles package: the base class, the registry and its
+# two lookup helpers, and each individual profile constant.
+__all__ = [
+    "SiteProfile",
+    "PROFILES",
+    "get_profile_for_url",
+    "get_profile_by_name",
+    "STRIPE_PROFILE",
+    "PLAID_PROFILE",
+    "NEXTJS_PROFILE",
+    "REACT_PROFILE",
+    "TAILWIND_PROFILE",
+    "BUN_PROFILE",
+    "D3_PROFILE",
+    "TURBOREPO_PROFILE",
+]

docpull/profiles/base.py ADDED Viewed

@@ -0,0 +1,64 @@
+"""Base class for site profiles."""
+from dataclasses import dataclass, field
+from typing import Optional
+@dataclass
+class SiteProfile:
+    """
+    Configuration profile for a documentation site.
+    Defines site-specific settings for optimal scraping.
+    """
+    # Identification
+    name: str
+    domains: set[str]  # Domains this profile applies to
+    # URL discovery
+    sitemap_url: Optional[str] = None
+    base_url: Optional[str] = None
+    start_urls: list[str] = field(default_factory=list)  # Alternative to sitemap
+    # URL filtering
+    include_patterns: list[str] = field(default_factory=list)
+    exclude_patterns: list[str] = field(default_factory=list)
+    # Content extraction
+    content_selectors: list[str] = field(default_factory=lambda: ["main", "article", ".content"])
+    remove_selectors: list[str] = field(
+        default_factory=lambda: ["script", "style", "nav", "footer", "header"]
+    )
+    # File organization
+    output_subdir: Optional[str] = None  # Subdirectory name (defaults to name)
+    strip_prefix: Optional[str] = None  # URL prefix to remove from paths
+    # Rate limiting
+    rate_limit: float = 0.5  # Seconds between requests
+    # Crawling behavior
+    max_depth: int = 5  # Maximum link depth from start URLs
+    max_pages: Optional[int] = None  # Maximum pages to fetch
+    follow_links: bool = False  # Whether to follow links (vs sitemap only)
+    def __post_init__(self) -> None:
+        """Set defaults after initialization."""
+        if self.output_subdir is None:
+            self.output_subdir = self.name
+    def matches_url(self, url: str) -> bool:
+        """
+        Check if this profile matches a given URL.
+        Args:
+            url: URL to check
+        Returns:
+            True if profile matches, False otherwise
+        """
+        from urllib.parse import urlparse
+        parsed = urlparse(url)
+        return parsed.netloc in self.domains

docpull/profiles/bun.py ADDED Viewed

@@ -0,0 +1,14 @@
+"""Bun documentation profile."""
+from .base import SiteProfile
+# Profile for the Bun docs at bun.sh. start_urls and follow_links are not set,
+# so the SiteProfile defaults apply (empty start_urls, follow_links=False).
+BUN_PROFILE = SiteProfile(
+    name="bun",
+    domains={"bun.sh"},
+    sitemap_url="https://bun.sh/sitemap.xml",
+    base_url="https://bun.sh/",
+    include_patterns=["/docs/"],
+    output_subdir="bun",
+    strip_prefix="docs",
+    rate_limit=0.2,
+)

docpull/profiles/d3.py ADDED Viewed

@@ -0,0 +1,17 @@
+"""D3.js documentation profile."""
+from .base import SiteProfile
+# Note: D3 fetcher uses devdocs.io API, not sitemap
+# This profile is for generic URL scraping of D3 docs
+# No sitemap_url is set; discovery starts from start_urls with follow_links=True,
+# limited to max_depth=3.
+# NOTE(review): "devdocs.io" is listed in domains, so SiteProfile.matches_url()
+# accepts any devdocs.io URL for this profile, while base_url and start_urls
+# point only at d3js.org — confirm this is intended.
+D3_PROFILE = SiteProfile(
+    name="d3",
+    domains={"d3js.org", "devdocs.io"},
+    base_url="https://d3js.org/",
+    start_urls=["https://d3js.org/getting-started"],
+    include_patterns=["/getting-started", "/d3-"],
+    output_subdir="d3",
+    rate_limit=0.5,
+    follow_links=True,
+    max_depth=3,
+)

docpull/profiles/nextjs.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""Next.js documentation profile."""
+from .base import SiteProfile
+# Profile for the Next.js docs at nextjs.org.
+# Note that output_subdir is "next", which differs from the profile name "nextjs".
+NEXTJS_PROFILE = SiteProfile(
+    name="nextjs",
+    domains={"nextjs.org"},
+    sitemap_url="https://nextjs.org/sitemap.xml",
+    base_url="https://nextjs.org/",
+    include_patterns=["/docs/"],
+    exclude_patterns=["/blog/", "/showcase/", "/conf/", "/learn/"],
+    output_subdir="next",
+    strip_prefix="docs",
+    rate_limit=0.5,
+)

docpull/profiles/plaid.py ADDED Viewed

@@ -0,0 +1,16 @@
+"""Plaid documentation profile."""
+from .base import SiteProfile
+# Profile for the Plaid docs at plaid.com. Unlike the sitemap-only profiles it
+# sets both a sitemap_url and start_urls and enables follow_links.
+# No strip_prefix is given, so the SiteProfile default (None) applies.
+PLAID_PROFILE = SiteProfile(
+    name="plaid",
+    domains={"plaid.com"},
+    sitemap_url="https://plaid.com/sitemap.xml",
+    base_url="https://plaid.com/",
+    start_urls=["https://plaid.com/docs/"],
+    include_patterns=["/docs/", "/api/"],
+    exclude_patterns=["/blog/", "/resources/", "/company/", "/customers/"],
+    output_subdir="plaid",
+    rate_limit=0.5,
+    follow_links=True,  # Crawls links from start_urls in addition to sitemap
+)

docpull/profiles/react.py ADDED Viewed

@@ -0,0 +1,14 @@
+"""React documentation profile."""
+from .base import SiteProfile
+# Profile for the React docs at react.dev, covering the /reference/ and /learn/
+# sections. No strip_prefix is given, so the SiteProfile default (None) applies.
+REACT_PROFILE = SiteProfile(
+    name="react",
+    domains={"react.dev"},
+    sitemap_url="https://react.dev/sitemap.xml",
+    base_url="https://react.dev/",
+    include_patterns=["/reference/", "/learn/"],
+    exclude_patterns=["/blog/", "/community/"],
+    output_subdir="react",
+    rate_limit=0.2,  # React docs can handle faster requests
+)

docpull/profiles/stripe.py ADDED Viewed

@@ -0,0 +1,14 @@
+"""Stripe documentation profile."""
+from .base import SiteProfile
+# Profile for the Stripe docs. Both "docs.stripe.com" and "stripe.com" select
+# this profile, but the sitemap and base URL are on docs.stripe.com only.
+# NOTE(review): include_patterns holds a full URL prefix here, whereas the other
+# profiles use path fragments such as "/docs/" — confirm the URL filter
+# treats both forms the same way.
+STRIPE_PROFILE = SiteProfile(
+    name="stripe",
+    domains={"docs.stripe.com", "stripe.com"},
+    sitemap_url="https://docs.stripe.com/sitemap.xml",
+    base_url="https://docs.stripe.com/",
+    include_patterns=["https://docs.stripe.com/"],
+    exclude_patterns=["/changelog/", "/upgrades/"],
+    output_subdir="stripe",
+    rate_limit=0.5,
+)

docpull/profiles/tailwind.py ADDED Viewed

@@ -0,0 +1,14 @@
+"""Tailwind CSS documentation profile."""
+from .base import SiteProfile
+# Profile for the Tailwind CSS docs at tailwindcss.com.
+# NOTE(review): TailwindFetcher.fetch() additionally excludes "/blog/",
+# "/resources/" and "/showcase/"; this profile sets no exclude_patterns —
+# confirm whether the two should agree.
+TAILWIND_PROFILE = SiteProfile(
+    name="tailwind",
+    domains={"tailwindcss.com"},
+    sitemap_url="https://tailwindcss.com/sitemap.xml",
+    base_url="https://tailwindcss.com/",
+    include_patterns=["/docs/"],
+    output_subdir="tailwind",
+    strip_prefix="docs",
+    rate_limit=0.2,
+)

docpull/profiles/turborepo.py ADDED Viewed

@@ -0,0 +1,14 @@
+"""Turborepo documentation profile."""
+from .base import SiteProfile
+# Profile for the Turborepo docs; URLs on either turborepo.com or turbo.build
+# select this profile.
+# NOTE(review): TurborepoFetcher uses https://turborepo.com/sitemap.xml and
+# https://turborepo.com/ as its sitemap and base URL, while this profile points
+# at https://turbo.build/repo/ — confirm which host is current and align them.
+TURBOREPO_PROFILE = SiteProfile(
+    name="turborepo",
+    domains={"turborepo.com", "turbo.build"},
+    sitemap_url="https://turbo.build/repo/sitemap.xml",
+    base_url="https://turbo.build/repo/",
+    include_patterns=["/docs/"],
+    output_subdir="turborepo",
+    strip_prefix="docs",
+    rate_limit=0.2,
+)

docpull/py.typed ADDED Viewed

File without changes

docpull/utils/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Utility functions for docpull."""
+from .file_utils import clean_filename, ensure_dir, validate_output_path
+from .logging_config import setup_logging
+# Public API re-exported from the file_utils and logging_config submodules.
+__all__ = ["clean_filename", "ensure_dir", "setup_logging", "validate_output_path"]

docpull/utils/file_utils.py ADDED Viewed

@@ -0,0 +1,97 @@
+import re
+from pathlib import Path
+from typing import Union
+def clean_filename(url: str, base_url: str) -> str:
+    """
+    Clean and sanitize a URL to create a safe filename.
+    Args:
+        url: The URL to convert to a filename
+        base_url: The base URL to remove from the path
+    Returns:
+        A sanitized filename ending in .md
+    Raises:
+        TypeError: If url or base_url are not strings
+        ValueError: If url or base_url are empty
+    """
+    if not isinstance(url, str):
+        raise TypeError(f"url must be a string, got {type(url).__name__}")
+    if not isinstance(base_url, str):
+        raise TypeError(f"base_url must be a string, got {type(base_url).__name__}")
+    if not url:
+        raise ValueError("url cannot be empty")
+    if not base_url:
+        raise ValueError("base_url cannot be empty")
+    path = url.replace(base_url, "").strip("/")
+    filename = path.replace("/", "-")
+    filename = re.sub(r"[^\w\-.]", "-", filename)
+    filename = re.sub(r"-+", "-", filename)
+    filename = filename.strip("-")
+    if not filename or filename in (".", ".."):
+        filename = "index"
+    if len(filename) > 200:
+        # Hash the overflow to maintain uniqueness and prevent collisions
+        import hashlib
+        overflow = filename[180:]
+        hash_suffix = hashlib.sha256(overflow.encode()).hexdigest()[:12]
+        filename = filename[:180] + "-" + hash_suffix
+    if not filename.endswith(".md"):
+        filename += ".md"
+    return filename
+def ensure_dir(path: Union[str, Path]) -> Path:
+    """
+    Ensure a directory exists, creating it if necessary.
+    Args:
+        path: The directory path to create
+    Returns:
+        The resolved Path object
+    Raises:
+        OSError: If directory creation fails
+    """
+    path = Path(path).resolve()
+    path.mkdir(parents=True, exist_ok=True)
+    return path
+def validate_output_path(output_path: Path, base_dir: Path) -> Path:
+    """
+    Validate that an output path is within the base directory.
+    Prevents path traversal attacks by ensuring the output path
+    doesn't escape the base directory.
+    Args:
+        output_path: The path to validate
+        base_dir: The base directory to check against
+    Returns:
+        The resolved output path if valid
+    Raises:
+        ValueError: If path traversal is detected
+    """
+    resolved_output = output_path.resolve()
+    resolved_base = base_dir.resolve()
+    try:
+        resolved_output.relative_to(resolved_base)
+    except ValueError as e:
+        raise ValueError(f"Path traversal detected: {output_path} is outside {base_dir}") from e
+    return resolved_output

docpull/utils/logging_config.py ADDED Viewed

@@ -0,0 +1,54 @@
+import logging
+import sys
+from typing import Optional
+def setup_logging(
+    level: str = "INFO",
+    log_file: Optional[str] = None,
+    format_string: Optional[str] = None,
+    force: bool = False,
+) -> logging.Logger:
+    """
+    Set up logging configuration for docpull.
+    Args:
+        level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
+        log_file: Optional file path for logging output
+        format_string: Optional custom format string for log messages
+        force: If True, reconfigure even if handlers exist
+    Returns:
+        Configured logger instance
+    Raises:
+        AttributeError: If invalid logging level is provided
+    """
+    if format_string is None:
+        format_string = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    numeric_level = getattr(logging, level.upper(), logging.INFO)
+    logger = logging.getLogger("docpull")
+    logger.setLevel(numeric_level)
+    # Only clear and reconfigure if forced or no handlers exist
+    if force or not logger.handlers:
+        logger.handlers.clear()
+        console_handler = logging.StreamHandler(sys.stdout)
+        console_handler.setLevel(numeric_level)
+        console_formatter = logging.Formatter(format_string)
+        console_handler.setFormatter(console_formatter)
+        logger.addHandler(console_handler)
+        if log_file:
+            file_handler = logging.FileHandler(log_file)
+            file_handler.setLevel(numeric_level)
+            file_formatter = logging.Formatter(format_string)
+            file_handler.setFormatter(file_formatter)
+            logger.addHandler(file_handler)
+    # Prevent propagation to root logger to avoid duplicate logs
+    logger.propagate = False
+    return logger