docpull 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docpull/fetchers/d3.py ADDED
@@ -0,0 +1,211 @@
1
+ """D3.js documentation fetcher from DevDocs.io."""
2
+
3
+ import logging
4
+ import re
5
+ import time
6
+ from pathlib import Path
7
+ from typing import Any, Optional, cast
8
+
9
+ from .parallel_base import ParallelFetcher
10
+
11
+
12
class D3DevDocsFetcher(ParallelFetcher):
    """Fetcher for D3.js documentation from DevDocs.io.

    The fetcher downloads the DevDocs index for one D3 version, converts each
    entry's HTML page to markdown, and writes one file per index entry under
    ``<output_dir>/d3/<type>/``. Downloads run concurrently in a thread pool.
    """

    def __init__(
        self,
        output_dir: Path,
        rate_limit: float = 0.05,  # DevDocs can handle faster requests
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
        max_workers: int = 20,  # More workers for DevDocs
        version: str = "7",  # D3 version
    ) -> None:
        """
        Initialize D3 DevDocs fetcher.

        Args:
            output_dir: Directory to save documentation
            rate_limit: Seconds between requests (per worker)
            skip_existing: Skip existing files
            logger: Logger instance
            max_workers: Number of concurrent workers
            version: D3 version (default: 7 for latest)
        """
        super().__init__(output_dir, rate_limit, skip_existing, logger, max_workers)
        self.version = version
        self.doc_slug = f"d3~{version}"
        self.base_url = "https://devdocs.io/"
        self.api_base = "https://documents.devdocs.io/"
        self.index_url = f"{self.api_base}{self.doc_slug}/index.json"

    def fetch_index(self) -> list[dict[Any, Any]]:
        """
        Fetch the documentation index from DevDocs API.

        Returns:
            List of entry dictionaries, or an empty list when the index could
            not be downloaded or parsed (the error is logged, not raised).
        """
        self.logger.info(f"Fetching D3 v{self.version} index from DevDocs")

        try:
            response = self.session.get(self.index_url, timeout=30)
            response.raise_for_status()

            data = response.json()
            entries = cast(list[dict[Any, Any]], data.get("entries", []))

            self.logger.info(f"Found {len(entries)} D3 documentation entries")
            return entries

        except Exception as e:
            # Best-effort: callers treat an empty list as "nothing to fetch".
            self.logger.error(f"Error fetching index: {e}")
            return []

    def _download_entry(self, entry: dict) -> str:
        """
        Download one entry and return its markdown, raising on any failure.

        Args:
            entry: Entry dictionary with 'path' key

        Returns:
            Markdown content with frontmatter

        Raises:
            KeyError: If the entry has no 'path' key
            Exception: Whatever the HTTP session or the HTML conversion raises
        """
        path = entry["path"]

        # A '#fragment' in the path is an in-page anchor. Appending ".html"
        # after it would put the suffix inside the fragment, so the request
        # is built from the page part only.
        page = path.split("#", 1)[0]
        url = f"{self.api_base}{self.doc_slug}/{page}.html"

        response = self.session.get(url, timeout=30)
        response.raise_for_status()

        # DevDocs returns HTML, convert to markdown
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(response.content, "html.parser")
        markdown = self.h2t.handle(str(soup))

        frontmatter = "\n".join(
            [
                "---",
                f"name: {entry.get('name', '')}",
                f"type: {entry.get('type', '')}",
                f"path: {path}",
                f"source: DevDocs.io - D3.js v{self.version}",
                f"url: {self.base_url}{self.doc_slug}/{path}",
                f"fetched: {time.strftime('%Y-%m-%d')}",
                "---",
                "",
                "",
            ]
        )
        return frontmatter + markdown.strip()

    def fetch_entry_content(self, entry: dict) -> str:
        """
        Fetch content for a single entry from DevDocs.

        Failures are logged and counted in ``self.stats["errors"]``; instead of
        raising, a short markdown error report is returned.

        Args:
            entry: Entry dictionary with 'path' key

        Returns:
            Markdown content with frontmatter, or an "# Error" document when
            the download or conversion failed
        """
        path = entry["path"]

        try:
            return self._download_entry(entry)
        except Exception as e:
            self.logger.error(f"Error fetching {path}: {e}")
            self.stats["errors"] += 1
            return f"# Error\n\nFailed to fetch {path}\n\nError: {str(e)}"

    def process_entry(self, entry_data: tuple[dict, Path]) -> tuple[bool, str]:
        """
        Process a single entry.

        A failed download is reported as ``(False, name)`` and nothing is
        written, so a later run with ``skip_existing`` retries the entry
        instead of skipping over a saved error page.

        Args:
            entry_data: Tuple of (entry, output_path)

        Returns:
            Tuple of (success, entry_name)
        """
        entry, output_path = entry_data
        name = entry.get("name", "")

        # Skip if exists
        if self.skip_existing and output_path.exists():
            self.logger.debug(f"Skipping (already exists): {name}")
            self.stats["skipped"] += 1
            return (True, name)

        try:
            content = self._download_entry(entry)
        except Exception as e:
            self.logger.error(f"Error fetching {entry.get('path', name)}: {e}")
            self.stats["errors"] += 1
            # Rate limiting applies to failed requests as well.
            time.sleep(self.rate_limit)
            return (False, name)

        self.save_content(content, output_path)
        self.stats["fetched"] += 1

        # Rate limiting
        time.sleep(self.rate_limit)

        return (True, name)

    @staticmethod
    def _entry_type(entry: dict) -> str:
        """Return the entry's type, using "Other" when it is missing or empty."""
        return str(entry.get("type") or "Other")

    @staticmethod
    def _type_dir_name(entry_type: str) -> str:
        """Turn an entry type into a single safe directory name."""
        # Anything that is not a word character or '-' becomes '-', which also
        # removes path separators and dots (prevents path traversal).
        type_dir = entry_type.lower().replace(" ", "-")
        type_dir = re.sub(r"[^\w\-]", "-", type_dir)
        type_dir = type_dir.strip("-").strip(".")
        return type_dir or "other"

    def fetch(self) -> None:
        """Fetch all D3.js documentation from DevDocs.

        Downloads the index, maps every entry to an output file under
        ``<output_dir>/d3/<type>/`` and processes the entries in a thread
        pool. A failure in one entry is logged and counted; it does not stop
        the remaining entries.
        """
        self.logger.info(f"Fetching D3.js v{self.version} documentation from DevDocs")

        entries = self.fetch_index()

        if not entries:
            self.logger.error("No entries found")
            return

        # Group by type for organization
        by_type: dict[str, list[dict[Any, Any]]] = {}
        for entry in entries:
            by_type.setdefault(self._entry_type(entry), []).append(entry)

        self.logger.info(f"Found {len(by_type)} categories:")
        for cat, cat_entries in sorted(by_type.items(), key=lambda x: len(x[1]), reverse=True):
            self.logger.info(f"  {cat}: {len(cat_entries)} entries")

        from concurrent.futures import ThreadPoolExecutor, as_completed

        from ..utils.file_utils import validate_output_path

        # Prepare (entry, output_path) work items
        work_items: list[tuple[dict[Any, Any], Path]] = []
        for entry in entries:
            path = entry.get("path")
            if not isinstance(path, str) or not path:
                self.logger.error(f"Skipping entry without a path: {entry.get('name', 'unknown')}")
                continue

            type_dir = self._type_dir_name(self._entry_type(entry))

            # Generate filename from path
            filename = path.replace("/", "-") + ".md"
            output_path = self.output_dir / "d3" / type_dir / filename

            # Validate output path to prevent path traversal
            try:
                output_path = validate_output_path(output_path, self.output_dir)
            except ValueError as e:
                self.logger.error(f"Invalid output path for entry {entry.get('name', 'unknown')}: {e}")
                continue

            work_items.append((entry, output_path))

        self.logger.info(f"Fetching with {self.max_workers} concurrent workers")
        start_time = time.time()

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = {executor.submit(self.process_entry, item): item[0] for item in work_items}

            total = len(futures)

            for completed, future in enumerate(as_completed(futures), start=1):
                try:
                    future.result()
                except Exception as e:
                    # e.g. a write error in one worker must not abort the run
                    failed = futures[future]
                    self.logger.error(f"Error processing {failed.get('name', 'unknown')}: {e}")
                    self.stats["errors"] += 1

                if completed % 50 == 0 or completed == total:
                    elapsed = time.time() - start_time
                    rate = completed / elapsed if elapsed > 0 else 0
                    self.logger.info(
                        f"[{completed}/{total}] " f"({completed*100//total}%) " f"- {rate:.1f} docs/sec"
                    )

        elapsed = time.time() - start_time
        # The clock may not have advanced when there was (almost) no work.
        overall_rate = total / elapsed if elapsed > 0 else 0.0
        self.logger.info(f"Completed in {elapsed:.1f}s ({overall_rate:.1f} docs/sec)")

        self.logger.info("D3.js documentation fetch complete")
        self.print_stats()
@@ -0,0 +1,255 @@
1
+ """Generic documentation fetcher that works with any URL."""
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Optional
6
+ from urllib.parse import urljoin, urlparse
7
+
8
+ from bs4 import BeautifulSoup
9
+
10
+ from ..profiles import SiteProfile, get_profile_by_name, get_profile_for_url
11
+ from .base import BaseFetcher
12
+
13
+
14
class GenericFetcher(BaseFetcher):
    """
    Generic fetcher that can scrape documentation from any URL.

    Supports both profile-based (optimized) and generic scraping modes.
    URLs are collected from a sitemap, from the profile's start URLs and,
    when needed, by crawling links; each collected URL is then handed to
    ``process_url``.
    """

    def __init__(
        self,
        url_or_profile: str,
        output_dir: Path,
        profile: Optional[SiteProfile] = None,
        rate_limit: float = 0.5,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
        max_pages: Optional[int] = None,
        max_depth: int = 5,
    ) -> None:
        """
        Initialize generic fetcher.

        Args:
            url_or_profile: URL to scrape or profile name (e.g., 'stripe')
            output_dir: Directory to save documentation
            profile: Optional SiteProfile to use (overrides auto-detection)
            rate_limit: Seconds between requests (replaced by the profile's
                rate limit when a profile is in use)
            skip_existing: Skip existing files
            logger: Logger instance
            max_pages: Maximum pages to fetch
            max_depth: Maximum crawl depth

        Raises:
            ValueError: If ``url_or_profile`` is not a URL and names no known
                profile, or the named profile has no start URL
        """
        super().__init__(output_dir, rate_limit, skip_existing=skip_existing, logger=logger)

        # Determine if input is a URL or profile name
        if url_or_profile.startswith(("http://", "https://")):
            self.start_url = url_or_profile
            # Try to auto-detect profile
            if profile is None:
                profile = get_profile_for_url(url_or_profile)
                if profile:
                    self.logger.info(f"Auto-detected profile: {profile.name}")
        else:
            # Treat as profile name
            profile = get_profile_by_name(url_or_profile)
            if profile is None:
                raise ValueError(f"Unknown profile: {url_or_profile}")
            start_url_candidate = profile.base_url or (profile.start_urls[0] if profile.start_urls else None)
            if not start_url_candidate:
                raise ValueError(f"Profile {url_or_profile} has no start URL")
            self.start_url = start_url_candidate
            self.logger.info(f"Using profile: {profile.name}")

        self.profile = profile
        self.max_pages = max_pages
        self.max_depth = max_depth

        # Set defaults from profile if available
        if profile:
            self.rate_limit = profile.rate_limit
            self.sitemap_url = profile.sitemap_url
            self.base_url = profile.base_url or self._extract_base_url(self.start_url)
            self.include_patterns = profile.include_patterns
            self.exclude_patterns = profile.exclude_patterns
            self.output_subdir = profile.output_subdir or urlparse(self.start_url).netloc.replace(".", "_")
            self.strip_prefix = profile.strip_prefix
            self.follow_links = profile.follow_links
        else:
            # Generic mode - infer from URL (probes the site for a sitemap)
            self.sitemap_url = self._guess_sitemap_url(self.start_url)
            self.base_url = self._extract_base_url(self.start_url)
            self.include_patterns = [self.base_url]
            self.exclude_patterns = []
            self.output_subdir = urlparse(self.start_url).netloc.replace(".", "_")
            self.strip_prefix = None
            self.follow_links = False

    def _extract_base_url(self, url: str) -> str:
        """Extract base URL (``scheme://host/``) from a full URL."""
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}/"

    def _guess_sitemap_url(self, url: str) -> Optional[str]:
        """
        Guess sitemap URL for a given domain.

        Sends a HEAD request to a few common sitemap locations and returns the
        first one answering with status 200.

        Args:
            url: URL to guess sitemap for

        Returns:
            Guessed sitemap URL or None
        """
        base = self._extract_base_url(url)
        common_paths = ["sitemap.xml", "sitemap_index.xml", "docs/sitemap.xml"]

        for path in common_paths:
            sitemap_url = urljoin(base, path)
            try:
                self.logger.debug(f"Trying sitemap: {sitemap_url}")
                response = self.session.head(sitemap_url, timeout=10)
                if response.status_code == 200:
                    self.logger.info(f"Found sitemap: {sitemap_url}")
                    return sitemap_url
            except Exception as e:
                # Probing is best-effort; record why a candidate was skipped.
                self.logger.debug(f"Sitemap probe failed for {sitemap_url}: {e}")
                continue

        return None

    def _crawl_links(self, start_urls: set[str], max_depth: int = 5) -> set[str]:
        """
        Crawl links from start URLs.

        The crawl is breadth-first, so every URL is first reached at its
        smallest depth and the result does not depend on set iteration order.

        Args:
            start_urls: URLs to start crawling from
            max_depth: Maximum depth to crawl

        Returns:
            Set of discovered URLs
        """
        from collections import deque

        # With no include patterns the crawl stays on the site itself rather
        # than matching nothing (or following links to arbitrary hosts).
        include_patterns = self.include_patterns or [self.base_url]
        exclude_patterns = self.exclude_patterns or []

        discovered: set[str] = set()
        # URLs that have been queued at least once; prevents re-queueing.
        seen: set[str] = set(start_urls)
        queue = deque((url, 0) for url in sorted(start_urls))

        while queue:
            url, depth = queue.popleft()

            if depth > max_depth:
                continue

            if not self.validate_url(url):
                continue

            discovered.add(url)

            # Pages at the depth limit are recorded but not expanded.
            if depth >= max_depth:
                continue

            try:
                self.logger.debug(f"Crawling: {url} (depth {depth})")
                response = self.session.get(url, timeout=30)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, "html.parser")

                for link in soup.find_all("a", href=True):
                    href = link["href"]
                    if not isinstance(href, str):
                        continue

                    # Resolve relative URLs
                    absolute_url = urljoin(url, href)

                    # Remove fragments and query params
                    absolute_url = absolute_url.split("#")[0].split("?")[0]

                    # Check if URL matches patterns
                    if not any(pattern in absolute_url for pattern in include_patterns):
                        continue

                    if any(pattern in absolute_url for pattern in exclude_patterns):
                        continue

                    if absolute_url not in seen:
                        seen.add(absolute_url)
                        queue.append((absolute_url, depth + 1))

            except Exception as e:
                self.logger.debug(f"Error crawling {url}: {e}")
                continue

        return discovered

    def fetch(self) -> None:
        """Fetch documentation using profile or generic scraping.

        URL sources, in order: the sitemap (if one is configured), the
        profile's start URLs, and a link crawl. The crawl runs when the
        profile asks for it or when the other sources produced nothing.
        """
        self.logger.info(f"Fetching documentation from {self.start_url}")

        urls: set[str] = set()

        # Try sitemap first
        if self.sitemap_url:
            sitemap_urls = self.fetch_sitemap(self.sitemap_url)
            if sitemap_urls:
                urls.update(sitemap_urls)
                self.logger.info(f"Found {len(sitemap_urls)} URLs in sitemap")

        # Add start URLs if using profile
        if self.profile and self.profile.start_urls:
            urls.update(self.profile.start_urls)

        # Crawl links if needed. An empty or unreadable sitemap must not
        # prevent the crawl fallback.
        if self.follow_links or not urls:
            start_urls = {self.start_url}
            if self.profile and self.profile.start_urls:
                start_urls.update(self.profile.start_urls)

            self.logger.info(f"Crawling links from {len(start_urls)} start URL(s)")
            crawled_urls = self._crawl_links(start_urls, self.max_depth)
            urls.update(crawled_urls)
            self.logger.info(f"Discovered {len(crawled_urls)} URLs via crawling")

        if not urls:
            self.logger.error("No URLs found to fetch")
            return

        # Apply filters (an empty pattern list disables that filter)
        include_patterns = self.include_patterns or []
        exclude_patterns = self.exclude_patterns or []
        if include_patterns or exclude_patterns:
            urls = {
                url
                for url in urls
                if (not include_patterns or any(pattern in url for pattern in include_patterns))
                and not any(pattern in url for pattern in exclude_patterns)
            }

        urls_list = sorted(urls)

        # Apply max_pages limit
        if self.max_pages:
            urls_list = urls_list[: self.max_pages]
            self.logger.info(f"Limited to {self.max_pages} pages")

        self.logger.info(f"Processing {len(urls_list)} URLs")

        # Process each URL
        total = len(urls_list)
        for idx, url in enumerate(urls_list, 1):
            self.logger.info(f"[{idx}/{total}] Processing: {url}")

            if self.profile and self.base_url:
                filepath = self.create_output_path(url, self.base_url, self.output_subdir, self.strip_prefix)
            else:
                # Generic path creation: flatten the URL path into one filename
                parsed = urlparse(url)
                path = parsed.path.strip("/")
                if not path:
                    path = "index"
                filepath = self.output_dir / self.output_subdir / f"{path.replace('/', '_')}.md"

            self.process_url(url, filepath)

        self.logger.info("Fetch complete")
        self.print_stats()