docpull 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,59 @@
1
+ """Tailwind CSS documentation fetcher."""
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ from .parallel_base import ParallelFetcher
8
+
9
+
10
class TailwindFetcher(ParallelFetcher):
    """Documentation fetcher targeting the Tailwind CSS website."""

    def __init__(
        self,
        output_dir: Path,
        rate_limit: float = 0.2,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
        max_workers: int = 15,
    ) -> None:
        """
        Set up the fetcher and record the Tailwind site locations.

        All arguments are forwarded positionally to ``ParallelFetcher``.

        Args:
            output_dir: Directory to save documentation
            rate_limit: Seconds between requests
            skip_existing: Skip existing files
            logger: Logger instance
            max_workers: Number of concurrent workers
        """
        super().__init__(output_dir, rate_limit, skip_existing, logger, max_workers)
        self.sitemap_url = "https://tailwindcss.com/sitemap.xml"
        self.base_url = "https://tailwindcss.com/"

    def fetch(self) -> None:
        """
        Download every Tailwind CSS documentation page listed in the sitemap.

        Steps: read the sitemap, keep the URLs selected by the include and
        exclude patterns, compute an output path per URL, hand the
        (url, path) pairs to the parallel fetch routine, then print stats.
        Logs an error and returns early when the sitemap yields no URLs.
        """
        self.logger.info("Fetching Tailwind CSS documentation")

        sitemap_urls = self.fetch_sitemap(self.sitemap_url)
        if not sitemap_urls:
            self.logger.error("No URLs found in Tailwind sitemap")
            return

        doc_urls = self.filter_urls(
            sitemap_urls,
            include_patterns=["/docs/"],
            exclude_patterns=["/blog/", "/resources/", "/showcase/"],
        )
        self.logger.info(f"Found {len(doc_urls)} documentation URLs")

        # Pair each URL with the file it will be written to.
        url_output_pairs = [
            (
                doc_url,
                self.create_output_path(doc_url, self.base_url, "tailwind", strip_prefix="docs"),
            )
            for doc_url in doc_urls
        ]
        self.fetch_urls_parallel(url_output_pairs)

        self.logger.info("Tailwind CSS documentation fetch complete")
        self.print_stats()
@@ -0,0 +1,57 @@
1
+ """Turborepo documentation fetcher."""
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ from .parallel_base import ParallelFetcher
8
+
9
+
10
class TurborepoFetcher(ParallelFetcher):
    """Documentation fetcher targeting the Turborepo website."""

    def __init__(
        self,
        output_dir: Path,
        rate_limit: float = 0.2,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
        max_workers: int = 15,
    ) -> None:
        """
        Set up the fetcher and record the Turborepo site locations.

        All arguments are forwarded positionally to ``ParallelFetcher``.

        Args:
            output_dir: Directory to save documentation
            rate_limit: Seconds between requests
            skip_existing: Skip existing files
            logger: Logger instance
            max_workers: Number of concurrent workers
        """
        super().__init__(output_dir, rate_limit, skip_existing, logger, max_workers)
        self.sitemap_url = "https://turborepo.com/sitemap.xml"
        self.base_url = "https://turborepo.com/"

    def fetch(self) -> None:
        """
        Download every Turborepo documentation page listed in the sitemap.

        Steps: read the sitemap, keep the URLs selected by the include and
        exclude patterns, compute an output path per URL, hand the
        (url, path) pairs to the parallel fetch routine, then print stats.
        Logs an error and returns early when the sitemap yields no URLs.
        """
        self.logger.info("Fetching Turborepo documentation")

        sitemap_urls = self.fetch_sitemap(self.sitemap_url)
        if not sitemap_urls:
            self.logger.error("No URLs found in Turborepo sitemap")
            return

        doc_urls = self.filter_urls(
            sitemap_urls, include_patterns=["/docs/"], exclude_patterns=["/blog/"]
        )
        self.logger.info(f"Found {len(doc_urls)} documentation URLs")

        # Pair each URL with the file it will be written to.
        url_output_pairs = [
            (
                doc_url,
                self.create_output_path(doc_url, self.base_url, "turborepo", strip_prefix="docs"),
            )
            for doc_url in doc_urls
        ]
        self.fetch_urls_parallel(url_output_pairs)

        self.logger.info("Turborepo documentation fetch complete")
        self.print_stats()
@@ -0,0 +1,70 @@
1
+ """Site profiles for documentation scraping."""
2
+
3
+ from typing import Optional
4
+
5
+ from .base import SiteProfile
6
+ from .bun import BUN_PROFILE
7
+ from .d3 import D3_PROFILE
8
+ from .nextjs import NEXTJS_PROFILE
9
+ from .plaid import PLAID_PROFILE
10
+ from .react import REACT_PROFILE
11
+ from .stripe import STRIPE_PROFILE
12
+ from .tailwind import TAILWIND_PROFILE
13
+ from .turborepo import TURBOREPO_PROFILE
14
+
15
# Name -> profile lookup table for every bundled site profile.
# get_profile_for_url() walks this mapping in insertion order and returns
# the first match, so the order of the entries below is significant.
PROFILES = {
    "stripe": STRIPE_PROFILE,
    "plaid": PLAID_PROFILE,
    "nextjs": NEXTJS_PROFILE,
    "react": REACT_PROFILE,
    "tailwind": TAILWIND_PROFILE,
    "bun": BUN_PROFILE,
    "d3": D3_PROFILE,
    "turborepo": TURBOREPO_PROFILE,
}
26
+
27
+
28
def get_profile_for_url(url: str) -> Optional[SiteProfile]:
    """
    Return the first registered profile whose ``matches_url`` accepts ``url``.

    Profiles are tried in the iteration order of ``PROFILES``.

    Args:
        url: URL to match against profiles

    Returns:
        Matching SiteProfile or None if no match
    """
    candidates = (candidate for candidate in PROFILES.values() if candidate.matches_url(url))
    return next(candidates, None)
42
+
43
+
44
def get_profile_by_name(name: str) -> Optional[SiteProfile]:
    """
    Look up a registered profile by its (case-insensitive) name.

    The name is lowercased before the lookup in ``PROFILES``.

    Args:
        name: Profile name (e.g., 'stripe', 'plaid')

    Returns:
        SiteProfile or None if not found
    """
    registry_key = name.lower()
    return PROFILES.get(registry_key)
55
+
56
+
57
# Public API of the profiles package.
__all__ = [
    # Base type, registry and lookup helpers
    "SiteProfile",
    "PROFILES",
    "get_profile_for_url",
    "get_profile_by_name",
    # Bundled profiles
    "STRIPE_PROFILE",
    "PLAID_PROFILE",
    "NEXTJS_PROFILE",
    "REACT_PROFILE",
    "TAILWIND_PROFILE",
    "BUN_PROFILE",
    "D3_PROFILE",
    "TURBOREPO_PROFILE",
]
@@ -0,0 +1,64 @@
1
+ """Base class for site profiles."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Optional
5
+
6
+
7
@dataclass
class SiteProfile:
    """
    Configuration profile for a documentation site.

    Defines site-specific settings for optimal scraping: which hosts the
    profile applies to, where to discover URLs, how to filter them, which
    parts of a page to keep, and how to lay out and pace the download.
    """

    # Identification
    name: str
    domains: set[str]  # Hosts this profile applies to (compared exactly)

    # URL discovery
    sitemap_url: Optional[str] = None
    base_url: Optional[str] = None
    start_urls: list[str] = field(default_factory=list)  # Alternative to sitemap

    # URL filtering
    include_patterns: list[str] = field(default_factory=list)
    exclude_patterns: list[str] = field(default_factory=list)

    # Content extraction
    content_selectors: list[str] = field(default_factory=lambda: ["main", "article", ".content"])
    remove_selectors: list[str] = field(
        default_factory=lambda: ["script", "style", "nav", "footer", "header"]
    )

    # File organization
    output_subdir: Optional[str] = None  # Subdirectory name (defaults to name)
    strip_prefix: Optional[str] = None  # URL prefix to remove from paths

    # Rate limiting
    rate_limit: float = 0.5  # Seconds between requests

    # Crawling behavior
    max_depth: int = 5  # Maximum link depth from start URLs
    max_pages: Optional[int] = None  # Maximum pages to fetch
    follow_links: bool = False  # Whether to follow links (vs sitemap only)

    def __post_init__(self) -> None:
        """Fall back to the profile name when no ``output_subdir`` was given."""
        if self.output_subdir is None:
            self.output_subdir = self.name

    def matches_url(self, url: str) -> bool:
        """
        Check if this profile matches a given URL.

        A URL matches when its network location is one of ``domains``. The
        raw ``netloc`` is tried first (so entries that include a port keep
        working); otherwise the bare host name is compared case-insensitively,
        which lets ``https://BUN.SH/`` and ``https://bun.sh:443/`` match a
        ``"bun.sh"`` entry. Sub-domains are not matched implicitly.

        Args:
            url: URL to check

        Returns:
            True if profile matches, False otherwise (including for URLs
            without a scheme, which have no network location)
        """
        from urllib.parse import urlparse

        parsed = urlparse(url)
        if parsed.netloc in self.domains:
            return True

        # ``hostname`` strips userinfo/port and is already lowercased.
        host = parsed.hostname
        if host is None:
            return False
        return host in {domain.lower() for domain in self.domains}
@@ -0,0 +1,14 @@
1
+ """Bun documentation profile."""
2
+
3
+ from .base import SiteProfile
4
+
5
# Bun: sitemap-driven profile restricted to URLs containing "/docs/";
# the leading "docs" path segment is stripped from output paths.
BUN_PROFILE = SiteProfile(
    name="bun",
    domains={"bun.sh"},
    sitemap_url="https://bun.sh/sitemap.xml",
    base_url="https://bun.sh/",
    include_patterns=["/docs/"],
    output_subdir="bun",
    strip_prefix="docs",
    rate_limit=0.2,
)
docpull/profiles/d3.py ADDED
@@ -0,0 +1,17 @@
1
+ """D3.js documentation profile."""
2
+
3
+ from .base import SiteProfile
4
+
5
# Note: D3 fetcher uses devdocs.io API, not sitemap
# This profile is for generic URL scraping of D3 docs
# No sitemap is configured: discovery starts at start_urls and follows
# links (follow_links=True) up to max_depth.
# NOTE(review): "devdocs.io" is listed as a domain, so any devdocs.io URL
# resolves to this profile — confirm that is intended.
D3_PROFILE = SiteProfile(
    name="d3",
    domains={"d3js.org", "devdocs.io"},
    base_url="https://d3js.org/",
    start_urls=["https://d3js.org/getting-started"],
    include_patterns=["/getting-started", "/d3-"],
    output_subdir="d3",
    rate_limit=0.5,
    follow_links=True,
    max_depth=3,
)
@@ -0,0 +1,15 @@
1
+ """Next.js documentation profile."""
2
+
3
+ from .base import SiteProfile
4
+
5
# Next.js: sitemap-driven profile restricted to "/docs/" URLs, skipping
# blog, showcase, conf and learn sections. Output goes to the "next"
# subdirectory (not "nextjs").
NEXTJS_PROFILE = SiteProfile(
    name="nextjs",
    domains={"nextjs.org"},
    sitemap_url="https://nextjs.org/sitemap.xml",
    base_url="https://nextjs.org/",
    include_patterns=["/docs/"],
    exclude_patterns=["/blog/", "/showcase/", "/conf/", "/learn/"],
    output_subdir="next",
    strip_prefix="docs",
    rate_limit=0.5,
)
@@ -0,0 +1,16 @@
1
+ """Plaid documentation profile."""
2
+
3
+ from .base import SiteProfile
4
+
5
# Plaid: keeps "/docs/" and "/api/" URLs and skips marketing sections.
PLAID_PROFILE = SiteProfile(
    name="plaid",
    domains={"plaid.com"},
    sitemap_url="https://plaid.com/sitemap.xml",
    base_url="https://plaid.com/",
    start_urls=["https://plaid.com/docs/"],
    include_patterns=["/docs/", "/api/"],
    exclude_patterns=["/blog/", "/resources/", "/company/", "/customers/"],
    output_subdir="plaid",
    rate_limit=0.5,
    follow_links=True,  # Crawls links from start_urls in addition to sitemap
)
@@ -0,0 +1,14 @@
1
+ """React documentation profile."""
2
+
3
+ from .base import SiteProfile
4
+
5
# React: sitemap-driven profile covering the "/reference/" and "/learn/"
# sections, skipping blog and community pages.
REACT_PROFILE = SiteProfile(
    name="react",
    domains={"react.dev"},
    sitemap_url="https://react.dev/sitemap.xml",
    base_url="https://react.dev/",
    include_patterns=["/reference/", "/learn/"],
    exclude_patterns=["/blog/", "/community/"],
    output_subdir="react",
    rate_limit=0.2,  # React docs can handle faster requests
)
@@ -0,0 +1,14 @@
1
+ """Stripe documentation profile."""
2
+
3
+ from .base import SiteProfile
4
+
5
# Stripe: sitemap-driven profile. The include pattern is the full docs
# origin rather than a path fragment; changelog and upgrade pages are skipped.
STRIPE_PROFILE = SiteProfile(
    name="stripe",
    domains={"docs.stripe.com", "stripe.com"},
    sitemap_url="https://docs.stripe.com/sitemap.xml",
    base_url="https://docs.stripe.com/",
    include_patterns=["https://docs.stripe.com/"],
    exclude_patterns=["/changelog/", "/upgrades/"],
    output_subdir="stripe",
    rate_limit=0.5,
)
@@ -0,0 +1,14 @@
1
+ """Tailwind CSS documentation profile."""
2
+
3
+ from .base import SiteProfile
4
+
5
# Tailwind CSS: sitemap-driven profile restricted to URLs containing
# "/docs/"; the leading "docs" path segment is stripped from output paths.
TAILWIND_PROFILE = SiteProfile(
    name="tailwind",
    domains={"tailwindcss.com"},
    sitemap_url="https://tailwindcss.com/sitemap.xml",
    base_url="https://tailwindcss.com/",
    include_patterns=["/docs/"],
    output_subdir="tailwind",
    strip_prefix="docs",
    rate_limit=0.2,
)
@@ -0,0 +1,14 @@
1
+ """Turborepo documentation profile."""
2
+
3
+ from .base import SiteProfile
4
+
5
# Turborepo: sitemap-driven profile restricted to URLs containing "/docs/".
# The sitemap and base URL use turborepo.com so that this profile agrees
# with TurborepoFetcher, which reads https://turborepo.com/sitemap.xml.
# "turbo.build" stays in ``domains`` so URLs on that host still resolve to
# this profile.
TURBOREPO_PROFILE = SiteProfile(
    name="turborepo",
    domains={"turborepo.com", "turbo.build"},
    sitemap_url="https://turborepo.com/sitemap.xml",
    base_url="https://turborepo.com/",
    include_patterns=["/docs/"],
    output_subdir="turborepo",
    strip_prefix="docs",
    rate_limit=0.2,
)
docpull/py.typed ADDED
File without changes
@@ -0,0 +1,6 @@
1
+ """Utility functions for docpull."""
2
+
3
+ from .file_utils import clean_filename, ensure_dir, validate_output_path
4
+ from .logging_config import setup_logging
5
+
6
+ __all__ = ["clean_filename", "ensure_dir", "setup_logging", "validate_output_path"]
@@ -0,0 +1,97 @@
1
+ import re
2
+ from pathlib import Path
3
+ from typing import Union
4
+
5
+
6
def clean_filename(url: str, base_url: str) -> str:
    """
    Clean and sanitize a URL to create a safe filename.

    ``base_url`` is removed only when it is a leading prefix of ``url``;
    later occurrences inside the path or query are kept, so two different
    URLs are not collapsed into the same filename. Path separators and any
    character other than word characters, ``-`` and ``.`` become ``-``,
    runs of ``-`` are collapsed, and names longer than 200 characters are
    shortened to 180 characters plus a 12-hex-digit SHA-256 digest of the
    removed tail.

    Args:
        url: The URL to convert to a filename
        base_url: The base URL to remove from the start of ``url``

    Returns:
        A sanitized filename ending in .md (``index.md`` when nothing is left)

    Raises:
        TypeError: If url or base_url are not strings
        ValueError: If url or base_url are empty
    """
    if not isinstance(url, str):
        raise TypeError(f"url must be a string, got {type(url).__name__}")
    if not isinstance(base_url, str):
        raise TypeError(f"base_url must be a string, got {type(base_url).__name__}")

    if not url:
        raise ValueError("url cannot be empty")
    if not base_url:
        raise ValueError("base_url cannot be empty")

    # Strip base_url as a prefix only; str.replace would also delete any
    # later occurrence (e.g. a URL embedded in the path or query string).
    path = url[len(base_url):] if url.startswith(base_url) else url
    path = path.strip("/")

    filename = path.replace("/", "-")
    filename = re.sub(r"[^\w\-.]", "-", filename)
    filename = re.sub(r"-+", "-", filename)
    filename = filename.strip("-")

    # Never produce an empty name or a directory reference.
    if not filename or filename in (".", ".."):
        filename = "index"

    if len(filename) > 200:
        # Hash the overflow to maintain uniqueness and prevent collisions
        import hashlib

        overflow = filename[180:]
        hash_suffix = hashlib.sha256(overflow.encode()).hexdigest()[:12]
        filename = filename[:180] + "-" + hash_suffix

    if not filename.endswith(".md"):
        filename += ".md"

    return filename
52
+
53
+
54
def ensure_dir(path: Union[str, Path]) -> Path:
    """
    Create ``path`` (and any missing parents) if it does not exist yet.

    Args:
        path: The directory path to create

    Returns:
        The resolved Path object for the directory

    Raises:
        OSError: If directory creation fails
    """
    directory = Path(path).resolve()
    # exist_ok makes the call a no-op when the directory is already there.
    directory.mkdir(parents=True, exist_ok=True)
    return directory
70
+
71
+
72
def validate_output_path(output_path: Path, base_dir: Path) -> Path:
    """
    Confirm that ``output_path`` lies inside ``base_dir``.

    Both paths are resolved first, so ``..`` segments and symlinks cannot
    be used to escape the base directory (path traversal protection).

    Args:
        output_path: The path to validate
        base_dir: The base directory to check against

    Returns:
        The resolved output path if valid

    Raises:
        ValueError: If path traversal is detected
    """
    target = output_path.resolve()
    root = base_dir.resolve()

    try:
        # relative_to raises ValueError when target is not under root.
        target.relative_to(root)
    except ValueError as exc:
        raise ValueError(f"Path traversal detected: {output_path} is outside {base_dir}") from exc
    else:
        return target
@@ -0,0 +1,54 @@
1
+ import logging
2
+ import sys
3
+ from typing import Optional
4
+
5
+
6
def setup_logging(
    level: str = "INFO",
    log_file: Optional[str] = None,
    format_string: Optional[str] = None,
    force: bool = False,
) -> logging.Logger:
    """
    Set up logging configuration for docpull.

    Configures the ``"docpull"`` logger with a stdout handler and, when
    ``log_file`` is given, a file handler. Handlers are only (re)built when
    ``force`` is true or the logger has none yet; otherwise just the logger
    level is updated and ``log_file``/``format_string`` are ignored.

    Args:
        level: Logging level name (DEBUG, INFO, WARNING, ERROR, CRITICAL),
            case-insensitive. Unknown names fall back to INFO.
        log_file: Optional file path for logging output
        format_string: Optional custom format string for log messages
        force: If True, reconfigure even if handlers exist

    Returns:
        Configured logger instance

    Raises:
        OSError: If ``log_file`` cannot be opened
    """
    if format_string is None:
        format_string = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

    # getattr can hit non-level attributes (e.g. logging.BASIC_FORMAT is a
    # str); accept only real numeric levels and fall back to INFO otherwise.
    numeric_level = getattr(logging, level.upper(), None)
    if not isinstance(numeric_level, int):
        numeric_level = logging.INFO

    logger = logging.getLogger("docpull")
    logger.setLevel(numeric_level)

    # Only clear and reconfigure if forced or no handlers exist
    if force or not logger.handlers:
        # Close handlers as they are removed so file handles are released.
        for old_handler in list(logger.handlers):
            logger.removeHandler(old_handler)
            old_handler.close()

        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(numeric_level)
        console_handler.setFormatter(logging.Formatter(format_string))
        logger.addHandler(console_handler)

        if log_file:
            file_handler = logging.FileHandler(log_file)
            file_handler.setLevel(numeric_level)
            file_handler.setFormatter(logging.Formatter(format_string))
            logger.addHandler(file_handler)

    # Prevent propagation to root logger to avoid duplicate logs
    logger.propagate = False

    return logger