docpull 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,450 @@
1
+ import ipaddress
2
+ import logging
3
+ import re
4
+ import time
5
+ from abc import ABC, abstractmethod
6
+ from pathlib import Path
7
+ from typing import Optional, TypedDict
8
+ from urllib.parse import urlparse
9
+
10
+ import html2text
11
+ import requests
12
+ from bs4 import BeautifulSoup
13
+ from defusedxml import ElementTree
14
+
15
+ from ..utils.file_utils import clean_filename, ensure_dir, validate_output_path
16
+
17
# Fail fast at import time: building a throwaway document proves that
# BeautifulSoup can use the "html.parser" backend this module relies on.
try:
    BeautifulSoup("<html></html>", "html.parser")
except Exception as e:
    # Surface the problem as an ImportError, keeping the original cause chained.
    raise ImportError(f"html.parser not available for BeautifulSoup: {e}") from e
23
+
24
+
25
class FetcherStats(TypedDict):
    """Typed mapping of the per-run counters kept by a documentation fetcher."""

    # Number of pages fetched and written to disk.
    fetched: int
    # Number of pages skipped because their output file already existed.
    skipped: int
    # Number of URLs that failed validation or fetching.
    errors: int
31
+
32
+
33
class BaseFetcher(ABC):
    """
    Abstract base class for documentation fetchers.

    Provides common functionality for fetching, validating, and converting
    documentation from various sources to markdown format. Subclasses must
    implement ``fetch``.
    """

    MAX_CONTENT_SIZE = 50 * 1024 * 1024  # 50 MB
    MAX_REDIRECTS = 5
    MAX_DOWNLOAD_TIME = 300  # 5 minutes
    MAX_SITEMAP_DEPTH = 10  # cap on nested sitemap indexes
    ALLOWED_SCHEMES = {"https"}
    ALLOWED_CONTENT_TYPES = {
        "text/html",
        "application/xhtml+xml",
        "text/xml",
        "application/xml",
        "application/atom+xml",
        "application/rss+xml",
    }

    # Every failure placeholder returned by fetch_page_content starts with
    # this prefix; successful pages start with "---" frontmatter instead.
    _ERROR_PREFIX = "# Error\n\n"

    def __init__(
        self,
        output_dir: Path,
        rate_limit: float = 0.5,
        user_agent: Optional[str] = None,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
        allowed_domains: Optional[set[str]] = None,
    ) -> None:
        """
        Set up the HTTP session, HTML-to-markdown converter and counters.

        Args:
            output_dir: Root directory for saved files (resolved to absolute)
            rate_limit: Seconds to sleep after each processed URL
            user_agent: User-Agent header; a browser-like default if None
            skip_existing: Skip URLs whose output file already exists
            logger: Logger to use; defaults to the "docpull" logger
            allowed_domains: Optional allowlist matched against the URL netloc
        """
        self.output_dir = Path(output_dir).resolve()
        self.rate_limit = rate_limit
        self.skip_existing = skip_existing
        self.logger = logger or logging.getLogger("docpull")
        self.allowed_domains = allowed_domains

        # NOTE(review): one HTML2Text instance is shared by every call on this
        # fetcher; confirm it is safe if subclasses convert pages concurrently.
        self.h2t = html2text.HTML2Text()
        self.h2t.ignore_links = False
        self.h2t.ignore_images = False
        self.h2t.ignore_emphasis = False
        self.h2t.body_width = 0

        self.session = requests.Session()
        self.session.max_redirects = self.MAX_REDIRECTS

        # Configure custom adapter to validate redirect URLs
        from typing import Any, Callable

        from requests.adapters import HTTPAdapter
        from requests.models import PreparedRequest, Response

        class SafeHTTPAdapter(HTTPAdapter):
            """Adapter that validates every outgoing URL, redirects included."""

            def __init__(self, validator_func: Callable[[str], bool], *args: Any, **kwargs: Any) -> None:
                self.validator_func = validator_func
                super().__init__(*args, **kwargs)

            def send(  # type: ignore[override]
                self, request: PreparedRequest, **kwargs: Any
            ) -> Response:
                if request.url is None:
                    raise ValueError("Request URL is None")
                if not self.validator_func(request.url):
                    raise ValueError(f"Redirect to unsafe URL blocked: {request.url}")
                return super().send(request, **kwargs)

        adapter = SafeHTTPAdapter(self.validate_url)
        self.session.mount("https://", adapter)
        self.session.mount("http://", adapter)

        if user_agent is None:
            user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
        self.session.headers.update({"User-Agent": user_agent})
        self.stats: FetcherStats = {
            "fetched": 0,
            "skipped": 0,
            "errors": 0,
        }

    def validate_url(self, url: str) -> bool:
        """
        Validate URL for security and allowed schemes.

        Rejects non-HTTPS schemes, URLs without a host, hosts outside the
        optional allowlist, localhost / ``.internal`` / ``.local`` names and
        literal IP addresses in private, loopback, link-local, reserved,
        multicast or unspecified ranges. Host names are not resolved, so a
        public name that points at an internal address is not detected here.

        Args:
            url: URL to validate

        Returns:
            True if URL is safe to fetch, False otherwise
        """
        try:
            parsed = urlparse(url)
            if parsed.scheme not in self.ALLOWED_SCHEMES:
                self.logger.warning("Rejected non-HTTPS URL")
                return False
            if not parsed.netloc:
                self.logger.warning("Rejected URL with no domain")
                return False

            if self.allowed_domains is not None and parsed.netloc not in self.allowed_domains:
                self.logger.warning(f"Rejected domain not in allowlist: {parsed.netloc}")
                return False

            # Use the parsed hostname instead of splitting netloc on ":".
            # Splitting mis-handles userinfo ("user@127.0.0.1") and bracketed
            # IPv6 literals ("[::1]"), which let internal addresses through.
            hostname = (parsed.hostname or "").rstrip(".")
            if not hostname:
                self.logger.warning("Rejected URL with no domain")
                return False

            # Check for localhost
            if hostname in ("localhost", "localhost.localdomain"):
                self.logger.warning("Rejected localhost URL")
                return False

            # Check for internal domain suffixes
            if hostname.endswith((".internal", ".local")):
                self.logger.warning("Rejected internal domain")
                return False

            # Try to parse as IP address
            try:
                ip = ipaddress.ip_address(hostname)
            except ValueError:
                # Not an IP address, it's a domain name - this is fine
                return True

            if (
                ip.is_private
                or ip.is_loopback
                or ip.is_link_local
                or ip.is_reserved
                or ip.is_multicast
                or ip.is_unspecified
            ):
                self.logger.warning(f"Rejected private/internal IP: {hostname}")
                return False

            return True
        except Exception:
            # Any parsing failure means the URL is not safe to fetch.
            self.logger.warning("Invalid URL format")
            return False

    def _declared_size_too_large(self, content_length: Optional[str]) -> bool:
        """Return True if a well-formed Content-Length exceeds MAX_CONTENT_SIZE."""
        if not content_length or not content_length.strip().isdigit():
            # Missing or malformed header: unknown size, the streaming cap applies.
            return False
        return int(content_length) > self.MAX_CONTENT_SIZE

    def _read_body(self, response: "requests.Response") -> tuple[bytes, Optional[str]]:
        """
        Stream a response body while enforcing the size and time limits.

        Returns:
            ``(data, None)`` on success, or ``(partial_data, reason)`` where
            reason is ``"size"`` or ``"timeout"`` when a limit was exceeded.
        """
        # bytearray grows in place; repeated bytes concatenation is quadratic.
        buffer = bytearray()
        started = time.monotonic()
        for chunk in response.iter_content(chunk_size=8192):
            buffer.extend(chunk)
            if len(buffer) > self.MAX_CONTENT_SIZE:
                return bytes(buffer), "size"
            if time.monotonic() - started > self.MAX_DOWNLOAD_TIME:
                return bytes(buffer), "timeout"
        return bytes(buffer), None

    @staticmethod
    def _loc_texts(root, parent_tag: str) -> list[str]:
        """Collect ``<loc>`` texts under ``parent_tag``, namespaced first, then bare."""
        namespace = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"}
        found = [
            elem.text.strip()
            for elem in root.findall(f".//ns:{parent_tag}/ns:loc", namespace)
            if elem.text and elem.text.strip()
        ]
        if not found:
            found = [
                elem.text.strip()
                for elem in root.findall(f".//{parent_tag}/loc")
                if elem.text and elem.text.strip()
            ]
        return found

    def fetch_sitemap(self, url: str) -> list[str]:
        """
        Fetch a sitemap and return every page URL it lists.

        Sub-sitemaps referenced by a sitemap index are followed depth-first.
        Each sitemap URL is fetched at most once and nesting is capped at
        MAX_SITEMAP_DEPTH, so cyclic indexes cannot recurse forever.

        Args:
            url: URL of the sitemap or sitemap index

        Returns:
            List of page URLs; empty if the sitemap is invalid or unreachable
        """
        return self._collect_sitemap_urls(url, set(), 0)

    def _collect_sitemap_urls(self, url: str, visited: set[str], depth: int) -> list[str]:
        """Fetch one sitemap and recurse into its sub-sitemaps."""
        if url in visited:
            self.logger.warning(f"Skipping already visited sitemap: {url}")
            return []
        if depth > self.MAX_SITEMAP_DEPTH:
            self.logger.warning(f"Sitemap nesting too deep, skipping: {url}")
            return []
        visited.add(url)

        self.logger.info(f"Fetching sitemap: {url}")
        if not self.validate_url(url):
            return []

        try:
            # The context manager releases the streamed connection on every path.
            with self.session.get(url, timeout=30, stream=True) as response:
                response.raise_for_status()

                content_length = response.headers.get("content-length")
                if self._declared_size_too_large(content_length):
                    self.logger.error(f"Sitemap too large: {content_length} bytes")
                    return []

                content, failure = self._read_body(response)

            if failure == "size":
                self.logger.error(f"Sitemap exceeded size limit: {len(content)} bytes")
                return []
            if failure == "timeout":
                self.logger.error(f"Sitemap download timeout exceeded: {url}")
                return []

            try:
                # Parse XML (limited by MAX_CONTENT_SIZE for security)
                root = ElementTree.fromstring(content)
            except ElementTree.ParseError as e:
                self.logger.error(f"XML parsing error (possible XXE/bomb): {e}")
                return []

            urls = self._loc_texts(root, "url")

            for sitemap_url in self._loc_texts(root, "sitemap"):
                self.logger.info(f"Found sub-sitemap: {sitemap_url}")
                urls.extend(self._collect_sitemap_urls(sitemap_url, visited, depth + 1))

            self.logger.info(f"Found {len(urls)} URLs in sitemap")
            return urls

        except Exception as e:
            # Best effort: a broken sitemap yields no URLs instead of aborting the run.
            self.logger.error(f"Error fetching sitemap {url}: {e}")
            return []

    def filter_urls(
        self, urls: list[str], include_patterns: list[str], exclude_patterns: Optional[list[str]] = None
    ) -> list[str]:
        """
        Filter URLs based on include and exclude patterns.

        Patterns are plain substrings, not regular expressions.

        Args:
            urls: List of URLs to filter
            include_patterns: Substrings; a URL must contain at least one
            exclude_patterns: Substrings; a URL must contain none of them

        Returns:
            Filtered list of URLs, in input order
        """
        exclude_patterns = exclude_patterns or []
        filtered = [
            url
            for url in urls
            if any(pattern in url for pattern in include_patterns)
            and not any(ex_pattern in url for ex_pattern in exclude_patterns)
        ]

        self.logger.info(f"Filtered to {len(filtered)} URLs")
        return filtered

    def categorize_urls(self, urls: list[str], base_url: str) -> dict[str, list[str]]:
        """
        Categorize URLs by their first path segment.

        Args:
            urls: List of URLs to categorize
            base_url: Base URL to strip from paths

        Returns:
            Dictionary mapping category names to lists of URLs; URLs with
            nothing left after removing base_url are omitted
        """
        categories: dict[str, list[str]] = {}

        for url in urls:
            path = url.replace(base_url, "").strip("/")
            if not path:
                continue
            # str.split always yields at least one element for a non-empty path.
            category = path.split("/")[0]
            categories.setdefault(category, []).append(url)

        return categories

    def create_output_path(
        self, url: str, base_url: str, output_subdir: str, strip_prefix: Optional[str] = None
    ) -> Path:
        """
        Create standardized output path for a documentation URL.

        Args:
            url: The URL to process
            base_url: Base URL to strip from the path
            output_subdir: Subdirectory name (e.g., 'react', 'nextjs')
            strip_prefix: Optional prefix to remove (e.g., 'docs')

        Returns:
            Path object for where to save the content
        """
        # Remove base URL and create path structure
        path = url.replace(base_url, "").strip("/")
        parts = path.split("/")

        # Remove prefix if specified
        if strip_prefix and parts and parts[0] == strip_prefix:
            parts = parts[1:]

        # Create directory structure: all but the last segment become folders
        if len(parts) >= 2:
            category_dir = self.output_dir / output_subdir / "/".join(parts[:-1])
        elif len(parts) == 1:
            category_dir = self.output_dir / output_subdir
        else:
            # Reached only when the single segment was removed as strip_prefix.
            category_dir = self.output_dir / output_subdir / "other"

        # Generate filename
        filename = clean_filename(url, base_url)
        return category_dir / filename

    def validate_content_type(self, content_type: str) -> bool:
        """
        Validate HTTP content type header.

        Args:
            content_type: Content-Type header value

        Returns:
            True if content type is allowed or absent, False otherwise
        """
        if not content_type:
            # A missing header is accepted.
            return True
        # Drop parameters such as "; charset=utf-8" before comparing.
        media_type = content_type.lower().split(";")[0].strip()
        return media_type in self.ALLOWED_CONTENT_TYPES

    def _error_page(self, message: str) -> str:
        """Count one error and return the markdown placeholder for it."""
        self.stats["errors"] += 1
        return f"{self._ERROR_PREFIX}{message}"

    def fetch_page_content(self, url: str) -> str:
        """
        Fetch and convert a webpage to markdown.

        Every failure path increments ``stats["errors"]`` and returns a
        placeholder that starts with ``"# Error"``.

        Args:
            url: URL to fetch

        Returns:
            Markdown content with frontmatter, or error message
        """
        if not self.validate_url(url):
            return self._error_page("Invalid URL")

        try:
            self.logger.debug(f"Fetching: {url}")
            # The context manager releases the streamed connection on every path.
            with self.session.get(url, timeout=30, stream=True) as response:
                response.raise_for_status()

                content_type = response.headers.get("content-type", "")
                if not self.validate_content_type(content_type):
                    self.logger.warning(f"Invalid content-type: {content_type}")
                    return self._error_page("Invalid content type")

                if self._declared_size_too_large(response.headers.get("content-length")):
                    return self._error_page("Content too large")

                content, failure = self._read_body(response)

            if failure == "size":
                return self._error_page("Content size limit exceeded")
            if failure == "timeout":
                return self._error_page("Download timeout exceeded")

            soup = BeautifulSoup(content, "html.parser")

            # Remove page chrome and non-content elements before conversion.
            for element in soup(["script", "style", "nav", "footer", "header"]):
                element.decompose()

            main_content = (
                soup.find("main")
                or soup.find("article")
                or soup.find(class_=re.compile(r"content|documentation|docs"))
                or soup.find("body")
            )
            if not main_content:
                return self._error_page(f"Could not find main content for {url}")

            markdown = self.h2t.handle(str(main_content))
            frontmatter = f"---\nurl: {url}\nfetched: {time.strftime('%Y-%m-%d')}\n---\n\n"
            return frontmatter + markdown.strip()

        except Exception as e:
            self.logger.error(f"Error fetching {url}: {e}")
            self.stats["errors"] += 1
            return f"# Error\n\nFailed to fetch {url}\n\nError: {str(e)}"

    def save_content(self, content: str, filepath: Path) -> None:
        """
        Save content to a file after validation.

        Args:
            content: The content to write
            filepath: Path where content should be saved
        """
        validated_path = validate_output_path(filepath, self.output_dir)
        ensure_dir(validated_path.parent)
        with open(validated_path, "w", encoding="utf-8") as f:
            f.write(content)

    def process_url(self, url: str, output_path: Path) -> bool:
        """
        Process a single URL: fetch, convert, and save.

        Failed fetches are not written to disk. Otherwise, with
        ``skip_existing`` enabled, a transient failure would be skipped as
        "already exists" on every later run.

        Args:
            url: URL to process
            output_path: Path where content should be saved

        Returns:
            True if the page was fetched and saved, False otherwise
        """
        if not self.validate_url(url):
            self.logger.warning(f"Skipping invalid URL: {url}")
            self.stats["errors"] += 1
            return False

        try:
            validated_path = validate_output_path(output_path, self.output_dir)
        except ValueError as e:
            self.logger.error(f"Path validation failed: {e}")
            self.stats["errors"] += 1
            return False

        if self.skip_existing and validated_path.exists():
            self.logger.debug(f"Skipping (already exists): {validated_path}")
            self.stats["skipped"] += 1
            return False

        content = self.fetch_page_content(url)

        if content.startswith(self._ERROR_PREFIX):
            # fetch_page_content already counted the error.
            self.logger.warning(f"Not saving failed fetch: {url}")
            # A request was attempted, so still honor the rate limit.
            time.sleep(self.rate_limit)
            return False

        self.save_content(content, validated_path)

        self.logger.info(f"Saved: {validated_path}")
        self.stats["fetched"] += 1
        time.sleep(self.rate_limit)

        return True

    @abstractmethod
    def fetch(self) -> None:
        """Fetch all documentation for this source."""
        pass

    def print_stats(self) -> None:
        """Print fetching statistics to log."""
        self.logger.info("Fetching Statistics:")
        self.logger.info(f" Fetched: {self.stats['fetched']}")
        self.logger.info(f" Skipped: {self.stats['skipped']}")
        self.logger.info(f" Errors: {self.stats['errors']}")
        total = self.stats["fetched"] + self.stats["skipped"] + self.stats["errors"]
        self.logger.info(f" Total: {total}")
@@ -0,0 +1,59 @@
1
+ """Bun documentation fetcher."""
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ from .parallel_base import ParallelFetcher
8
+
9
+
10
class BunFetcher(ParallelFetcher):
    """Documentation fetcher for Bun, driven by the bun.sh sitemap."""

    def __init__(
        self,
        output_dir: Path,
        rate_limit: float = 0.2,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
        max_workers: int = 15,
    ) -> None:
        """
        Configure the fetcher and record the Bun sitemap and base URLs.

        Args:
            output_dir: Directory to save documentation
            rate_limit: Seconds between requests
            skip_existing: Skip existing files
            logger: Logger instance
            max_workers: Number of concurrent workers
        """
        super().__init__(output_dir, rate_limit, skip_existing, logger, max_workers)
        self.sitemap_url = "https://bun.sh/sitemap.xml"
        self.base_url = "https://bun.sh/"

    def fetch(self) -> None:
        """Download every Bun documentation page listed in the sitemap."""
        self.logger.info("Fetching Bun documentation")

        sitemap_entries = self.fetch_sitemap(self.sitemap_url)
        if not sitemap_entries:
            self.logger.error("No URLs found in Bun sitemap")
            return

        # Keep only /docs/ pages; blog posts and guides are excluded.
        doc_urls = self.filter_urls(
            sitemap_entries,
            include_patterns=["/docs/"],
            exclude_patterns=["/blog/", "/guides/"],
        )
        self.logger.info(f"Found {len(doc_urls)} documentation URLs")

        # Pair each URL with its destination under "bun", dropping the
        # leading "docs" path segment.
        url_output_pairs = [
            (doc_url, self.create_output_path(doc_url, self.base_url, "bun", strip_prefix="docs"))
            for doc_url in doc_urls
        ]
        self.fetch_urls_parallel(url_output_pairs)

        self.logger.info("Bun documentation fetch complete")
        self.print_stats()