docpull 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docpull/__init__.py +29 -0
- docpull/__main__.py +6 -0
- docpull/cli.py +440 -0
- docpull/config.py +199 -0
- docpull/fetchers/__init__.py +23 -0
- docpull/fetchers/async_fetcher.py +322 -0
- docpull/fetchers/base.py +450 -0
- docpull/fetchers/bun.py +59 -0
- docpull/fetchers/d3.py +211 -0
- docpull/fetchers/generic.py +255 -0
- docpull/fetchers/generic_async.py +282 -0
- docpull/fetchers/nextjs.py +50 -0
- docpull/fetchers/parallel_base.py +93 -0
- docpull/fetchers/plaid.py +92 -0
- docpull/fetchers/react.py +59 -0
- docpull/fetchers/stripe.py +60 -0
- docpull/fetchers/tailwind.py +59 -0
- docpull/fetchers/turborepo.py +57 -0
- docpull/profiles/__init__.py +70 -0
- docpull/profiles/base.py +64 -0
- docpull/profiles/bun.py +14 -0
- docpull/profiles/d3.py +17 -0
- docpull/profiles/nextjs.py +15 -0
- docpull/profiles/plaid.py +16 -0
- docpull/profiles/react.py +14 -0
- docpull/profiles/stripe.py +14 -0
- docpull/profiles/tailwind.py +14 -0
- docpull/profiles/turborepo.py +14 -0
- docpull/py.typed +0 -0
- docpull/utils/__init__.py +6 -0
- docpull/utils/file_utils.py +97 -0
- docpull/utils/logging_config.py +54 -0
- docpull-1.0.1.dist-info/METADATA +440 -0
- docpull-1.0.1.dist-info/RECORD +38 -0
- docpull-1.0.1.dist-info/WHEEL +5 -0
- docpull-1.0.1.dist-info/entry_points.txt +2 -0
- docpull-1.0.1.dist-info/licenses/LICENSE +21 -0
- docpull-1.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Tailwind CSS documentation fetcher."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from .parallel_base import ParallelFetcher
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TailwindFetcher(ParallelFetcher):
    """Sitemap-driven fetcher for the Tailwind CSS documentation site.

    All crawling primitives (sitemap loading, URL filtering, output path
    creation, parallel download, statistics) are inherited from
    ``ParallelFetcher``; this class only supplies the Tailwind-specific
    URLs and filter patterns.
    """

    def __init__(
        self,
        output_dir: Path,
        rate_limit: float = 0.2,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
        max_workers: int = 15,
    ) -> None:
        """
        Configure the fetcher and record the Tailwind sitemap/base URLs.

        Args:
            output_dir: Directory to save documentation
            rate_limit: Seconds between requests
            skip_existing: Skip existing files
            logger: Logger instance
            max_workers: Number of concurrent workers
        """
        super().__init__(output_dir, rate_limit, skip_existing, logger, max_workers)
        self.base_url = "https://tailwindcss.com/"
        self.sitemap_url = "https://tailwindcss.com/sitemap.xml"

    def fetch(self) -> None:
        """Download every Tailwind CSS documentation page listed in the sitemap.

        Logs an error and returns early when the sitemap yields no URLs.
        """
        self.logger.info("Fetching Tailwind CSS documentation")

        sitemap_urls = self.fetch_sitemap(self.sitemap_url)
        if not sitemap_urls:
            self.logger.error("No URLs found in Tailwind sitemap")
            return

        # Keep only documentation pages; marketing sections are excluded.
        doc_urls = self.filter_urls(
            sitemap_urls,
            include_patterns=["/docs/"],
            exclude_patterns=["/blog/", "/resources/", "/showcase/"],
        )
        self.logger.info(f"Found {len(doc_urls)} documentation URLs")

        # Pair each URL with the file it should be written to.
        url_output_pairs = [
            (url, self.create_output_path(url, self.base_url, "tailwind", strip_prefix="docs"))
            for url in doc_urls
        ]
        self.fetch_urls_parallel(url_output_pairs)

        self.logger.info("Tailwind CSS documentation fetch complete")
        self.print_stats()
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Turborepo documentation fetcher."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from .parallel_base import ParallelFetcher
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TurborepoFetcher(ParallelFetcher):
    """Sitemap-driven fetcher for the Turborepo documentation site.

    All crawling primitives (sitemap loading, URL filtering, output path
    creation, parallel download, statistics) are inherited from
    ``ParallelFetcher``; this class only supplies the Turborepo-specific
    URLs and filter patterns.
    """

    def __init__(
        self,
        output_dir: Path,
        rate_limit: float = 0.2,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
        max_workers: int = 15,
    ) -> None:
        """
        Configure the fetcher and record the Turborepo sitemap/base URLs.

        Args:
            output_dir: Directory to save documentation
            rate_limit: Seconds between requests
            skip_existing: Skip existing files
            logger: Logger instance
            max_workers: Number of concurrent workers
        """
        super().__init__(output_dir, rate_limit, skip_existing, logger, max_workers)
        self.base_url = "https://turborepo.com/"
        self.sitemap_url = "https://turborepo.com/sitemap.xml"

    def fetch(self) -> None:
        """Download every Turborepo documentation page listed in the sitemap.

        Logs an error and returns early when the sitemap yields no URLs.
        """
        self.logger.info("Fetching Turborepo documentation")

        sitemap_urls = self.fetch_sitemap(self.sitemap_url)
        if not sitemap_urls:
            self.logger.error("No URLs found in Turborepo sitemap")
            return

        # Keep only documentation pages; blog posts are excluded.
        doc_urls = self.filter_urls(
            sitemap_urls, include_patterns=["/docs/"], exclude_patterns=["/blog/"]
        )
        self.logger.info(f"Found {len(doc_urls)} documentation URLs")

        # Pair each URL with the file it should be written to.
        url_output_pairs = [
            (url, self.create_output_path(url, self.base_url, "turborepo", strip_prefix="docs"))
            for url in doc_urls
        ]
        self.fetch_urls_parallel(url_output_pairs)

        self.logger.info("Turborepo documentation fetch complete")
        self.print_stats()
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Site profiles for documentation scraping."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from .base import SiteProfile
|
|
6
|
+
from .bun import BUN_PROFILE
|
|
7
|
+
from .d3 import D3_PROFILE
|
|
8
|
+
from .nextjs import NEXTJS_PROFILE
|
|
9
|
+
from .plaid import PLAID_PROFILE
|
|
10
|
+
from .react import REACT_PROFILE
|
|
11
|
+
from .stripe import STRIPE_PROFILE
|
|
12
|
+
from .tailwind import TAILWIND_PROFILE
|
|
13
|
+
from .turborepo import TURBOREPO_PROFILE
|
|
14
|
+
|
|
15
|
+
# Registry of all available profiles
|
|
16
|
+
# Maps the lowercase profile name to its SiteProfile. Insertion order is the
# order in which get_profile_for_url() tries the profiles.
PROFILES: dict[str, SiteProfile] = {
    "stripe": STRIPE_PROFILE,
    "plaid": PLAID_PROFILE,
    "nextjs": NEXTJS_PROFILE,
    "react": REACT_PROFILE,
    "tailwind": TAILWIND_PROFILE,
    "bun": BUN_PROFILE,
    "d3": D3_PROFILE,
    "turborepo": TURBOREPO_PROFILE,
}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_profile_for_url(url: str) -> Optional[SiteProfile]:
    """
    Return the first registered profile whose ``matches_url`` accepts ``url``.

    Profiles are tried in the insertion order of ``PROFILES``.

    Args:
        url: URL to match against profiles

    Returns:
        Matching SiteProfile or None if no match
    """
    matching = (candidate for candidate in PROFILES.values() if candidate.matches_url(url))
    return next(matching, None)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def get_profile_by_name(name: str) -> Optional[SiteProfile]:
    """
    Look up a registered profile by its (case-insensitive) name.

    Args:
        name: Profile name (e.g., 'stripe', 'plaid')

    Returns:
        SiteProfile or None if not found
    """
    # Registry keys are lowercase, so normalise the requested name first.
    lookup_key = name.lower()
    return PROFILES.get(lookup_key)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
__all__ = [
    # Profile type, registry and lookup helpers
    "SiteProfile",
    "PROFILES",
    "get_profile_for_url",
    "get_profile_by_name",
    # Built-in profiles
    "STRIPE_PROFILE",
    "PLAID_PROFILE",
    "NEXTJS_PROFILE",
    "REACT_PROFILE",
    "TAILWIND_PROFILE",
    "BUN_PROFILE",
    "D3_PROFILE",
    "TURBOREPO_PROFILE",
]
|
docpull/profiles/base.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Base class for site profiles."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class SiteProfile:
    """
    Configuration profile for a documentation site.

    Defines site-specific settings for optimal scraping.
    """

    # Identification
    name: str
    domains: set[str]  # Domains this profile applies to

    # URL discovery
    sitemap_url: Optional[str] = None
    base_url: Optional[str] = None
    start_urls: list[str] = field(default_factory=list)  # Alternative to sitemap

    # URL filtering
    include_patterns: list[str] = field(default_factory=list)
    exclude_patterns: list[str] = field(default_factory=list)

    # Content extraction
    content_selectors: list[str] = field(default_factory=lambda: ["main", "article", ".content"])
    remove_selectors: list[str] = field(
        default_factory=lambda: ["script", "style", "nav", "footer", "header"]
    )

    # File organization
    output_subdir: Optional[str] = None  # Subdirectory name (defaults to name)
    strip_prefix: Optional[str] = None  # URL prefix to remove from paths

    # Rate limiting
    rate_limit: float = 0.5  # Seconds between requests

    # Crawling behavior
    max_depth: int = 5  # Maximum link depth from start URLs
    max_pages: Optional[int] = None  # Maximum pages to fetch
    follow_links: bool = False  # Whether to follow links (vs sitemap only)

    def __post_init__(self) -> None:
        """Default ``output_subdir`` to the profile name when it was not given."""
        if self.output_subdir is None:
            self.output_subdir = self.name

    def matches_url(self, url: str) -> bool:
        """
        Check if this profile matches a given URL.

        The URL matches when either its raw network location or its bare host
        name is one of ``domains``. The host name has any port and
        ``user:password@`` prefix removed and is lowercased, so
        ``https://bun.sh:443/docs`` and ``https://BUN.SH/docs`` both match a
        profile registered for ``bun.sh``.

        Args:
            url: URL to check

        Returns:
            True if profile matches, False otherwise
        """
        from urllib.parse import urlparse

        parsed = urlparse(url)

        # Exact network-location match keeps entries such as "localhost:8000" working.
        if parsed.netloc in self.domains:
            return True

        # hostname is None when the URL has no network location (e.g. no scheme).
        host = parsed.hostname
        return host is not None and host in self.domains
|
docpull/profiles/bun.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Bun documentation profile."""
|
|
2
|
+
|
|
3
|
+
from .base import SiteProfile
|
|
4
|
+
|
|
5
|
+
BUN_PROFILE = SiteProfile(
    # Identity and output location
    name="bun",
    output_subdir="bun",
    # Where to discover pages
    domains={"bun.sh"},
    base_url="https://bun.sh/",
    sitemap_url="https://bun.sh/sitemap.xml",
    # Only documentation pages; "docs" is dropped from the saved paths
    include_patterns=["/docs/"],
    strip_prefix="docs",
    rate_limit=0.2,
)
|
docpull/profiles/d3.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""D3.js documentation profile."""
|
|
2
|
+
|
|
3
|
+
from .base import SiteProfile
|
|
4
|
+
|
|
5
|
+
# Note: D3 fetcher uses devdocs.io API, not sitemap
|
|
6
|
+
# This profile is for generic URL scraping of D3 docs
|
|
7
|
+
D3_PROFILE = SiteProfile(
    # Identity and output location
    name="d3",
    output_subdir="d3",
    # No sitemap is configured: discovery starts from start_urls and follows links
    domains={"d3js.org", "devdocs.io"},
    base_url="https://d3js.org/",
    start_urls=["https://d3js.org/getting-started"],
    follow_links=True,
    max_depth=3,
    # URL filtering
    include_patterns=["/getting-started", "/d3-"],
    rate_limit=0.5,
)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Next.js documentation profile."""
|
|
2
|
+
|
|
3
|
+
from .base import SiteProfile
|
|
4
|
+
|
|
5
|
+
NEXTJS_PROFILE = SiteProfile(
    # Identity; note the output directory is "next", not the profile name
    name="nextjs",
    output_subdir="next",
    # Where to discover pages
    domains={"nextjs.org"},
    base_url="https://nextjs.org/",
    sitemap_url="https://nextjs.org/sitemap.xml",
    # URL filtering; "docs" is dropped from the saved paths
    include_patterns=["/docs/"],
    exclude_patterns=["/blog/", "/showcase/", "/conf/", "/learn/"],
    strip_prefix="docs",
    rate_limit=0.5,
)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Plaid documentation profile."""
|
|
2
|
+
|
|
3
|
+
from .base import SiteProfile
|
|
4
|
+
|
|
5
|
+
PLAID_PROFILE = SiteProfile(
    # Identity and output location
    name="plaid",
    output_subdir="plaid",
    # Where to discover pages
    domains={"plaid.com"},
    base_url="https://plaid.com/",
    sitemap_url="https://plaid.com/sitemap.xml",
    start_urls=["https://plaid.com/docs/"],
    follow_links=True,  # Crawls links from start_urls in addition to sitemap
    # URL filtering
    include_patterns=["/docs/", "/api/"],
    exclude_patterns=["/blog/", "/resources/", "/company/", "/customers/"],
    rate_limit=0.5,
)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""React documentation profile."""
|
|
2
|
+
|
|
3
|
+
from .base import SiteProfile
|
|
4
|
+
|
|
5
|
+
REACT_PROFILE = SiteProfile(
    # Identity and output location
    name="react",
    output_subdir="react",
    # Where to discover pages
    domains={"react.dev"},
    base_url="https://react.dev/",
    sitemap_url="https://react.dev/sitemap.xml",
    # URL filtering
    include_patterns=["/reference/", "/learn/"],
    exclude_patterns=["/blog/", "/community/"],
    rate_limit=0.2,  # React docs can handle faster requests
)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Stripe documentation profile."""
|
|
2
|
+
|
|
3
|
+
from .base import SiteProfile
|
|
4
|
+
|
|
5
|
+
STRIPE_PROFILE = SiteProfile(
    # Identity and output location
    name="stripe",
    output_subdir="stripe",
    # Where to discover pages
    domains={"docs.stripe.com", "stripe.com"},
    base_url="https://docs.stripe.com/",
    sitemap_url="https://docs.stripe.com/sitemap.xml",
    # URL filtering: the include pattern is the full docs origin, not a path
    include_patterns=["https://docs.stripe.com/"],
    exclude_patterns=["/changelog/", "/upgrades/"],
    rate_limit=0.5,
)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Tailwind CSS documentation profile."""
|
|
2
|
+
|
|
3
|
+
from .base import SiteProfile
|
|
4
|
+
|
|
5
|
+
TAILWIND_PROFILE = SiteProfile(
    # Identity and output location
    name="tailwind",
    output_subdir="tailwind",
    # Where to discover pages
    domains={"tailwindcss.com"},
    base_url="https://tailwindcss.com/",
    sitemap_url="https://tailwindcss.com/sitemap.xml",
    # Only documentation pages; "docs" is dropped from the saved paths
    include_patterns=["/docs/"],
    strip_prefix="docs",
    rate_limit=0.2,
)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Turborepo documentation profile."""
|
|
2
|
+
|
|
3
|
+
from .base import SiteProfile
|
|
4
|
+
|
|
5
|
+
# NOTE(review): sitemap_url/base_url previously pointed at
# https://turbo.build/repo/, while TurborepoFetcher in this package uses
# https://turborepo.com/. They are aligned with the fetcher here so both code
# paths discover and name pages the same way; "turbo.build" stays in `domains`
# so older links still select this profile. Confirm against the live site.
TURBOREPO_PROFILE = SiteProfile(
    name="turborepo",
    domains={"turborepo.com", "turbo.build"},
    sitemap_url="https://turborepo.com/sitemap.xml",
    base_url="https://turborepo.com/",
    include_patterns=["/docs/"],
    output_subdir="turborepo",
    strip_prefix="docs",
    rate_limit=0.2,
)
|
docpull/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Union
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def clean_filename(url: str, base_url: str) -> str:
    """
    Clean and sanitize a URL to create a safe filename.

    ``base_url`` is removed only when it is a leading prefix of ``url``;
    later occurrences (for example inside a query string) are kept so that
    distinct URLs do not collapse into the same name.

    Args:
        url: The URL to convert to a filename
        base_url: The base URL to remove from the start of ``url``

    Returns:
        A sanitized filename ending in .md

    Raises:
        TypeError: If url or base_url are not strings
        ValueError: If url or base_url are empty
    """
    if not isinstance(url, str):
        raise TypeError(f"url must be a string, got {type(url).__name__}")
    if not isinstance(base_url, str):
        raise TypeError(f"base_url must be a string, got {type(base_url).__name__}")

    if not url:
        raise ValueError("url cannot be empty")
    if not base_url:
        raise ValueError("base_url cannot be empty")

    # Strip the base only as a prefix; str.replace would also delete it mid-URL.
    remainder = url[len(base_url):] if url.startswith(base_url) else url
    path = remainder.strip("/")

    # Flatten the path, replace unsafe characters, collapse runs of dashes.
    filename = path.replace("/", "-")
    filename = re.sub(r"[^\w\-.]", "-", filename)
    filename = re.sub(r"-+", "-", filename)
    filename = filename.strip("-")

    # Nothing usable left (or a directory reference): fall back to "index".
    if not filename or filename in (".", ".."):
        filename = "index"

    if len(filename) > 200:
        # Hash the overflow to maintain uniqueness and prevent collisions
        import hashlib

        overflow = filename[180:]
        hash_suffix = hashlib.sha256(overflow.encode()).hexdigest()[:12]
        filename = filename[:180] + "-" + hash_suffix

    if not filename.endswith(".md"):
        filename += ".md"

    return filename
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def ensure_dir(path: Union[str, Path]) -> Path:
    """
    Create ``path`` (and any missing parents) if needed and return it resolved.

    An already existing directory is not an error.

    Args:
        path: The directory path to create

    Returns:
        The resolved Path object

    Raises:
        OSError: If directory creation fails
    """
    directory = Path(path).resolve()
    directory.mkdir(parents=True, exist_ok=True)
    return directory
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def validate_output_path(output_path: Path, base_dir: Path) -> Path:
    """
    Confirm that ``output_path`` resolves to a location inside ``base_dir``.

    Both paths are resolved before comparison, so a path that leaves the base
    directory through ``..`` components is rejected.

    Args:
        output_path: The path to validate
        base_dir: The base directory to check against

    Returns:
        The resolved output path if valid

    Raises:
        ValueError: If path traversal is detected
    """
    base = base_dir.resolve()
    candidate = output_path.resolve()

    try:
        # relative_to raises ValueError when candidate is not under base.
        candidate.relative_to(base)
    except ValueError as exc:
        raise ValueError(f"Path traversal detected: {output_path} is outside {base_dir}") from exc
    else:
        return candidate
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import sys
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def setup_logging(
    level: str = "INFO",
    log_file: Optional[str] = None,
    format_string: Optional[str] = None,
    force: bool = False,
) -> logging.Logger:
    """
    Set up logging configuration for docpull.

    The "docpull" logger's level is always updated. Handlers are (re)built
    only when ``force`` is true or the logger has no handlers yet; otherwise
    the existing handlers are left untouched and ``log_file`` and
    ``format_string`` are ignored.

    Args:
        level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL).
            An unrecognised name falls back to INFO.
        log_file: Optional file path for logging output
        format_string: Optional custom format string for log messages
        force: If True, reconfigure even if handlers exist

    Returns:
        Configured logger instance

    Raises:
        OSError: If ``log_file`` cannot be opened
    """
    if format_string is None:
        format_string = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

    numeric_level = getattr(logging, level.upper(), logging.INFO)
    # The logging module also exposes non-level uppercase attributes
    # (e.g. BASIC_FORMAT); treat those like any other unknown level.
    if not isinstance(numeric_level, int):
        numeric_level = logging.INFO

    logger = logging.getLogger("docpull")
    logger.setLevel(numeric_level)

    # Only clear and reconfigure if forced or no handlers exist
    if force or not logger.handlers:
        # Close replaced handlers so a previous log file handle is released.
        for stale_handler in list(logger.handlers):
            logger.removeHandler(stale_handler)
            stale_handler.close()

        formatter = logging.Formatter(format_string)

        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(numeric_level)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

        if log_file:
            file_handler = logging.FileHandler(log_file)
            file_handler.setLevel(numeric_level)
            file_handler.setFormatter(formatter)
            logger.addHandler(file_handler)

    # Prevent propagation to root logger to avoid duplicate logs
    logger.propagate = False

    return logger
|