docpull 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docpull/__init__.py +29 -0
- docpull/__main__.py +6 -0
- docpull/cli.py +440 -0
- docpull/config.py +199 -0
- docpull/fetchers/__init__.py +23 -0
- docpull/fetchers/async_fetcher.py +322 -0
- docpull/fetchers/base.py +450 -0
- docpull/fetchers/bun.py +59 -0
- docpull/fetchers/d3.py +211 -0
- docpull/fetchers/generic.py +255 -0
- docpull/fetchers/generic_async.py +282 -0
- docpull/fetchers/nextjs.py +50 -0
- docpull/fetchers/parallel_base.py +93 -0
- docpull/fetchers/plaid.py +92 -0
- docpull/fetchers/react.py +59 -0
- docpull/fetchers/stripe.py +60 -0
- docpull/fetchers/tailwind.py +59 -0
- docpull/fetchers/turborepo.py +57 -0
- docpull/profiles/__init__.py +70 -0
- docpull/profiles/base.py +64 -0
- docpull/profiles/bun.py +14 -0
- docpull/profiles/d3.py +17 -0
- docpull/profiles/nextjs.py +15 -0
- docpull/profiles/plaid.py +16 -0
- docpull/profiles/react.py +14 -0
- docpull/profiles/stripe.py +14 -0
- docpull/profiles/tailwind.py +14 -0
- docpull/profiles/turborepo.py +14 -0
- docpull/py.typed +0 -0
- docpull/utils/__init__.py +6 -0
- docpull/utils/file_utils.py +97 -0
- docpull/utils/logging_config.py +54 -0
- docpull-1.0.1.dist-info/METADATA +440 -0
- docpull-1.0.1.dist-info/RECORD +38 -0
- docpull-1.0.1.dist-info/WHEEL +5 -0
- docpull-1.0.1.dist-info/entry_points.txt +2 -0
- docpull-1.0.1.dist-info/licenses/LICENSE +21 -0
- docpull-1.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
"""Async generic fetcher with progress bars and JS support."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Optional
|
|
7
|
+
from urllib.parse import urljoin, urlparse
|
|
8
|
+
|
|
9
|
+
from bs4 import BeautifulSoup
|
|
10
|
+
from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
|
|
11
|
+
|
|
12
|
+
from ..profiles import SiteProfile, get_profile_by_name, get_profile_for_url
|
|
13
|
+
from .async_fetcher import PLAYWRIGHT_AVAILABLE, AsyncFetcher
|
|
14
|
+
from .base import BaseFetcher
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class GenericAsyncFetcher(BaseFetcher):
    """
    Async generic fetcher with progress bars and optional JS rendering.

    URL discovery (sitemap lookup and link crawling) runs synchronously
    through ``self.session``; page downloads are delegated to
    ``AsyncFetcher`` and run concurrently.

    Features:
    - Async/parallel fetching
    - Progress bars with rich
    - Optional JavaScript rendering
    - All security features from BaseFetcher
    """

    def __init__(
        self,
        url_or_profile: str,
        output_dir: Path,
        profile: Optional[SiteProfile] = None,
        rate_limit: float = 0.5,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
        max_pages: Optional[int] = None,
        max_depth: int = 5,
        max_concurrent: int = 10,
        use_js: bool = False,
        show_progress: bool = True,
    ) -> None:
        """
        Initialize async generic fetcher.

        Args:
            url_or_profile: URL to scrape or profile name
            output_dir: Directory to save documentation
            profile: Optional SiteProfile (only consulted when
                ``url_or_profile`` is a URL; a profile name always wins)
            rate_limit: Seconds between requests (replaced by the
                profile's ``rate_limit`` when a profile is in effect)
            skip_existing: Skip existing files
            logger: Logger instance
            max_pages: Maximum pages to fetch
            max_depth: Maximum crawl depth
            max_concurrent: Maximum concurrent requests
            use_js: Enable JavaScript rendering (requires playwright)
            show_progress: Show progress bars

        Raises:
            ValueError: If ``url_or_profile`` is neither a URL nor a known
                profile name, or the named profile has no start URL.
        """
        super().__init__(output_dir, rate_limit, skip_existing=skip_existing, logger=logger)

        # Determine if input is a URL or profile name
        if url_or_profile.startswith(("http://", "https://")):
            self.start_url = url_or_profile
            if profile is None:
                profile = get_profile_for_url(url_or_profile)
                if profile:
                    self.logger.info(f"Auto-detected profile: {profile.name}")
        else:
            profile = get_profile_by_name(url_or_profile)
            if profile is None:
                raise ValueError(f"Unknown profile: {url_or_profile}")
            start_url_candidate = profile.base_url or (profile.start_urls[0] if profile.start_urls else None)
            if not start_url_candidate:
                raise ValueError(f"Profile {url_or_profile} has no start URL")
            self.start_url = start_url_candidate
            self.logger.info(f"Using profile: {profile.name}")

        self.profile = profile
        self.max_pages = max_pages
        self.max_depth = max_depth
        self.max_concurrent = max_concurrent
        self.use_js = use_js
        self.show_progress = show_progress

        # Set defaults from profile
        if profile:
            self.rate_limit = profile.rate_limit
            self.sitemap_url = profile.sitemap_url
            self.base_url = profile.base_url or self._extract_base_url(self.start_url)
            self.include_patterns = profile.include_patterns
            self.exclude_patterns = profile.exclude_patterns
            self.output_subdir = profile.output_subdir or urlparse(self.start_url).netloc.replace(".", "_")
            self.strip_prefix = profile.strip_prefix
            self.follow_links = profile.follow_links
        else:
            # No profile: probe for a sitemap and restrict to the start URL's origin.
            self.sitemap_url = self._guess_sitemap_url(self.start_url)
            self.base_url = self._extract_base_url(self.start_url)
            self.include_patterns = [self.base_url]
            self.exclude_patterns = []
            self.output_subdir = urlparse(self.start_url).netloc.replace(".", "_")
            self.strip_prefix = None
            self.follow_links = False

        if use_js and not PLAYWRIGHT_AVAILABLE:
            self.logger.warning("Playwright not installed. JS rendering disabled.")
            self.logger.warning(
                "Install with: pip install 'docpull[js]' && python -m playwright install chromium"
            )
            self.use_js = False

    def _extract_base_url(self, url: str) -> str:
        """Return ``scheme://netloc/`` for ``url``."""
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}/"

    def _guess_sitemap_url(self, url: str) -> Optional[str]:
        """
        Guess sitemap URL for a given domain.

        Issues a HEAD request for a few common sitemap locations and
        returns the first one answering with status 200, or ``None``.
        """
        base = self._extract_base_url(url)
        common_paths = ["sitemap.xml", "sitemap_index.xml", "docs/sitemap.xml"]

        for path in common_paths:
            sitemap_url = urljoin(base, path)
            try:
                response = self.session.head(sitemap_url, timeout=10)
            except Exception as exc:
                # Best-effort probe: a failing candidate must not abort the
                # search, but leave a trace for troubleshooting.
                self.logger.debug(f"Sitemap probe failed for {sitemap_url}: {exc}")
                continue
            if response.status_code == 200:
                self.logger.info(f"Found sitemap: {sitemap_url}")
                return sitemap_url
        return None

    def _crawl_links(self, start_urls: set[str], max_depth: int = 5) -> set[str]:
        """
        Crawl links from start URLs (sync version for discovery).

        The crawl is breadth-first so each URL is handled at its shallowest
        depth; a page that is reachable both at the depth limit and at a
        shallower level is therefore still expanded.

        Args:
            start_urls: URLs to start from (depth 0)
            max_depth: Pages at this depth are recorded but not expanded

        Returns:
            Every URL that passed ``validate_url`` within the depth limit.
        """
        from collections import deque

        discovered: set[str] = set()
        # ``queued`` holds everything ever enqueued so a URL is queued once.
        queued: set[str] = set(start_urls)
        to_visit = deque((url, 0) for url in sorted(start_urls))

        while to_visit:
            url, depth = to_visit.popleft()

            if depth > max_depth:
                continue

            if not self.validate_url(url):
                continue

            discovered.add(url)

            if depth >= max_depth:
                continue

            try:
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, "html.parser")

                for link in soup.find_all("a", href=True):
                    href = link["href"]
                    if not isinstance(href, str):
                        continue

                    absolute_url = urljoin(url, href)
                    # Drop fragment and query so variants collapse to one page.
                    absolute_url = absolute_url.split("#")[0].split("?")[0]

                    # An empty include list means "no restriction", matching
                    # the filter applied in fetch_async.
                    if self.include_patterns and not any(
                        pattern in absolute_url for pattern in self.include_patterns
                    ):
                        continue
                    if self.exclude_patterns and any(
                        pattern in absolute_url for pattern in self.exclude_patterns
                    ):
                        continue

                    if absolute_url not in queued:
                        queued.add(absolute_url)
                        to_visit.append((absolute_url, depth + 1))

            except Exception as exc:
                # A page that cannot be fetched or parsed is skipped, not fatal.
                self.logger.debug(f"Failed to crawl {url}: {exc}")
                continue

        return discovered

    def fetch(self) -> None:
        """Fetch documentation (sync wrapper for async method)."""
        asyncio.run(self.fetch_async())

    async def fetch_async(self) -> None:
        """
        Fetch documentation asynchronously with progress bars.

        Steps: discover URLs (sitemap, profile start URLs, optional crawl),
        apply include/exclude patterns, cap at ``max_pages``, map each URL
        to an output path, then download through ``AsyncFetcher``.
        """
        self.logger.info(f"Fetching documentation from {self.start_url}")

        urls: set[str] = set()

        # Discover URLs (the spinner is suppressed when show_progress is off)
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            transient=True,
            disable=not self.show_progress,
        ) as progress:
            task = progress.add_task("Discovering URLs...", total=None)

            # Try sitemap
            if self.sitemap_url:
                sitemap_urls = self.fetch_sitemap(self.sitemap_url)
                if sitemap_urls:
                    urls.update(sitemap_urls)
                    progress.update(task, description=f"Found {len(sitemap_urls)} URLs in sitemap")

            # Add start URLs
            if self.profile and self.profile.start_urls:
                urls.update(self.profile.start_urls)

            # Crawl links if needed
            if self.follow_links or (not urls and not self.sitemap_url):
                start_urls = {self.start_url}
                if self.profile and self.profile.start_urls:
                    start_urls.update(self.profile.start_urls)

                progress.update(task, description=f"Crawling links from {len(start_urls)} URL(s)...")
                crawled_urls = self._crawl_links(start_urls, self.max_depth)
                urls.update(crawled_urls)
                progress.update(task, description=f"Discovered {len(crawled_urls)} URLs via crawling")

        if not urls:
            self.logger.error("No URLs found to fetch")
            return

        # Apply filters (an empty pattern list disables that filter)
        if self.include_patterns or self.exclude_patterns:
            urls = {
                url
                for url in urls
                if not (self.include_patterns and not any(p in url for p in self.include_patterns))
                and not (self.exclude_patterns and any(p in url for p in self.exclude_patterns))
            }

        # Sorted so the max_pages cut-off is deterministic.
        urls_list = sorted(urls)

        # Apply max_pages limit (None or 0 means unlimited)
        if self.max_pages:
            urls_list = urls_list[: self.max_pages]

        self.logger.info(f"Processing {len(urls_list)} URLs")

        # Prepare URL/path pairs
        url_output_pairs = []
        for url in urls_list:
            if self.profile and self.base_url:
                filepath = self.create_output_path(url, self.base_url, self.output_subdir, self.strip_prefix)
            else:
                # No profile: flatten the URL path into a single file name.
                path = urlparse(url).path.strip("/")
                if not path:
                    path = "index"
                filepath = self.output_dir / self.output_subdir / f"{path.replace('/', '_')}.md"
            url_output_pairs.append((url, filepath))

        # Fetch URLs with progress bar
        async with AsyncFetcher(
            base_fetcher=self,
            max_concurrent=self.max_concurrent,
            use_js=self.use_js,
        ) as async_fetcher:
            if self.show_progress:
                with Progress(
                    SpinnerColumn(),
                    TextColumn("[progress.description]{task.description}"),
                    BarColumn(),
                    TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
                    TimeElapsedColumn(),
                ) as progress:
                    total = len(url_output_pairs)
                    task = progress.add_task(f"Fetching {total} pages...", total=total)

                    # Fetch in batches so the bar can advance between them.
                    # Clamp the step: range() rejects 0 and a negative step
                    # would silently skip every URL.
                    batch_size = max(1, self.max_concurrent)
                    for i in range(0, total, batch_size):
                        batch = url_output_pairs[i : i + batch_size]
                        await async_fetcher.fetch_urls_parallel(batch)
                        progress.update(task, completed=min(i + batch_size, total))
            else:
                # Fetch without progress
                await async_fetcher.fetch_urls_parallel(url_output_pairs)

        self.logger.info("Fetch complete")
        self.print_stats()
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Next.js documentation fetcher."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from .base import BaseFetcher
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class NextJSFetcher(BaseFetcher):
    """Fetcher that downloads the ``/docs/`` pages listed in the Next.js sitemap."""

    def __init__(
        self,
        output_dir: Path,
        rate_limit: float = 0.5,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
    ) -> None:
        """
        Initialize the Next.js fetcher.

        Args:
            output_dir: Directory to save documentation
            rate_limit: Seconds between requests
            skip_existing: Skip existing files
            logger: Logger instance
        """
        super().__init__(output_dir, rate_limit, skip_existing=skip_existing, logger=logger)
        self.sitemap_url = "https://nextjs.org/sitemap.xml"
        self.base_url = "https://nextjs.org/"

    def fetch(self) -> None:
        """Read the sitemap, keep documentation URLs and process them one by one."""
        self.logger.info("Fetching Next.js documentation")

        sitemap_entries = self.fetch_sitemap(self.sitemap_url)
        if not sitemap_entries:
            self.logger.error("No URLs found in Next.js sitemap")
            return

        # Keep documentation pages only; marketing/learning sections are dropped.
        wanted = ["/docs/"]
        unwanted = ["/blog/", "/showcase/", "/conf/", "/learn/"]
        doc_urls = self.filter_urls(sitemap_entries, include_patterns=wanted, exclude_patterns=unwanted)
        self.logger.info(f"Found {len(doc_urls)} documentation URLs")

        # Category summary, largest category first (logging only).
        categories = self.categorize_urls(doc_urls, self.base_url)
        self.logger.info(f"Found {len(categories)} categories:")
        ranked = sorted(categories.items(), key=lambda entry: len(entry[1]), reverse=True)
        for category_name, members in ranked:
            self.logger.info(f" {category_name}: {len(members)} pages")

        page_count = len(doc_urls)
        for position, page_url in enumerate(doc_urls, start=1):
            self.logger.info(f"[{position}/{page_count}] Processing Next.js documentation")
            destination = self.create_output_path(page_url, self.base_url, "next", strip_prefix="docs")
            self.process_url(page_url, destination)

        self.logger.info("Next.js documentation fetch complete")
        self.print_stats()
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Parallel/concurrent base fetcher for faster downloads."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import time
|
|
5
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
from .base import BaseFetcher
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ParallelFetcher(BaseFetcher):
    """
    Enhanced fetcher with parallel/concurrent downloads.

    Uses ThreadPoolExecutor for concurrent HTTP requests, so several
    ``process_url`` calls are in flight at once.
    """

    def __init__(
        self,
        output_dir: Path,
        rate_limit: float = 0.5,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
        max_workers: int = 10,
    ) -> None:
        """
        Initialize parallel fetcher.

        Args:
            output_dir: Directory to save documentation
            rate_limit: Seconds between requests (per worker)
            skip_existing: Skip existing files
            logger: Logger instance
            max_workers: Number of concurrent workers (default: 10)
        """
        # The third positional argument is passed as None; its meaning is
        # defined by BaseFetcher.__init__ (not visible here).
        super().__init__(output_dir, rate_limit, None, skip_existing, logger)
        self.max_workers = max_workers

    def process_url_with_metadata(self, url_data: tuple[str, Path]) -> tuple[bool, str]:
        """
        Process a single URL with metadata.

        Any exception raised by ``process_url`` is logged, counted in
        ``self.stats["errors"]`` and reported as a failure instead of
        propagating to the worker pool.

        Args:
            url_data: Tuple of (url, output_path)

        Returns:
            Tuple of (success, url)
        """
        url, output_path = url_data
        try:
            success = self.process_url(url, output_path)
            return (success, url)
        except Exception as e:
            self.logger.error(f"Error processing {url}: {e}")
            # NOTE(review): this increment is not synchronized across worker
            # threads; confirm whether BaseFetcher guards ``stats``.
            self.stats["errors"] += 1
            return (False, url)

    def fetch_urls_parallel(self, url_output_pairs: list[tuple[str, Path]]) -> None:
        """
        Fetch URLs in parallel.

        Logs progress every 10 completed URLs and a final throughput
        summary. An empty list is handled without error.

        Args:
            url_output_pairs: List of (url, output_path) tuples
        """
        total = len(url_output_pairs)
        self.logger.info(f"Fetching {total} URLs with {self.max_workers} workers...")

        # Monotonic clock: elapsed time must not be affected by wall-clock
        # adjustments.
        start_time = time.monotonic()

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all tasks
            futures = [
                executor.submit(self.process_url_with_metadata, url_data)
                for url_data in url_output_pairs
            ]

            # Process as they complete
            for completed, future in enumerate(as_completed(futures), start=1):
                # Surface unexpected worker failures; the (success, url)
                # payload itself is not needed here.
                future.result()

                if completed % 10 == 0 or completed == total:
                    elapsed = time.monotonic() - start_time
                    rate = completed / elapsed if elapsed > 0 else 0
                    self.logger.info(
                        f"Progress: {completed}/{total} "
                        f"({completed*100//total}%) "
                        f"- {rate:.1f} docs/sec"
                    )

        elapsed = time.monotonic() - start_time
        # Guard the division: elapsed can be 0 for empty input or on
        # platforms with a coarse clock.
        average = total / elapsed if elapsed > 0 else 0.0
        self.logger.info(f"Completed in {elapsed:.1f}s " f"({average:.1f} docs/sec average)")
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Plaid documentation fetcher."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from bs4 import BeautifulSoup
|
|
8
|
+
|
|
9
|
+
from ..utils.file_utils import clean_filename
|
|
10
|
+
from .base import BaseFetcher
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class PlaidFetcher(BaseFetcher):
    """Fetcher for Plaid documentation (docs index page plus sitemap)."""

    def __init__(
        self,
        output_dir: Path,
        rate_limit: float = 0.5,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
    ) -> None:
        """
        Initialize the Plaid fetcher.

        Args:
            output_dir: Directory to save documentation
            rate_limit: Seconds between requests
            skip_existing: Skip existing files
            logger: Logger instance
        """
        super().__init__(output_dir, rate_limit, skip_existing=skip_existing, logger=logger)
        self.sitemap_url = "https://plaid.com/sitemap.xml"
        self.docs_url = "https://plaid.com/docs/"
        self.base_url = "https://plaid.com/"

    def fetch(self) -> None:
        """
        Collect Plaid documentation URLs and process each one.

        URLs come from two sources: links on the docs index page and the
        sitemap. Pages are written under ``plaid/api-reference`` when the
        URL contains ``/api/``, otherwise under ``plaid/guides``.
        """
        self.logger.info("Fetching Plaid documentation")

        doc_urls: set[str] = set()

        self.logger.info(f"Fetching Plaid docs index from {self.docs_url}")

        try:
            response = self.session.get(self.docs_url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            for link in soup.find_all("a", href=True):
                href = link["href"]
                if not isinstance(href, str):
                    continue

                # Site-relative documentation links become absolute URLs.
                if href.startswith(("/docs/", "/api/")):
                    href = "https://plaid.com" + href

                # Validate URL before adding
                if not self.validate_url(href):
                    continue

                # NOTE(review): substring match also accepts hosts that merely
                # end in "plaid.com"; confirm validate_url restricts the host.
                if "plaid.com/docs/" in href or "plaid.com/api/" in href:
                    href = href.split("#")[0].split("?")[0]
                    doc_urls.add(href)

        except Exception as e:
            # The index page is only one of two sources; carry on with the sitemap.
            self.logger.error(f"Error fetching Plaid docs index: {e}")

        sitemap_urls = self.fetch_sitemap(self.sitemap_url)

        excluded_sections = ["/blog/", "/resources/", "/company/", "/customers/"]
        # ``or ()`` tolerates a falsy result from fetch_sitemap.
        for url in sitemap_urls or ():
            if ("/docs/" in url or "/api/" in url) and not any(x in url for x in excluded_sections):
                doc_urls.add(url.split("#")[0].split("?")[0])

        doc_urls_list = sorted(doc_urls)

        self.logger.info(f"Found {len(doc_urls_list)} Plaid documentation URLs")

        total = len(doc_urls_list)
        for idx, url in enumerate(doc_urls_list, 1):
            self.logger.info(f"[{idx}/{total}] Processing Plaid documentation")

            # Take the path after the matched section marker. Stripping a fixed
            # "https://plaid.com/<section>/" prefix left the full URL untouched
            # for e.g. ".../docs/api/..." or "www." hosts, which then produced
            # a directory named "https:".
            if "/api/" in url:
                path = url.partition("/api/")[2].strip("/")
                category_dir = self.output_dir / "plaid" / "api-reference"
            elif "/docs/" in url:
                path = url.partition("/docs/")[2].strip("/")
                category_dir = self.output_dir / "plaid" / "guides"
            else:
                path = ""
                category_dir = self.output_dir / "plaid" / "other"

            # Nested pages are grouped by their first path segment.
            if "/" in path:
                parts = path.split("/")
                category_dir = category_dir / parts[0]

            filename = clean_filename(url, self.base_url)
            filepath = category_dir / filename
            self.process_url(url, filepath)

        self.logger.info("Plaid documentation fetch complete")
        self.print_stats()
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""React documentation fetcher."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from .parallel_base import ParallelFetcher
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ReactFetcher(ParallelFetcher):
    """Parallel fetcher for the reference and learn sections of react.dev."""

    def __init__(
        self,
        output_dir: Path,
        rate_limit: float = 0.2,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
        max_workers: int = 15,
    ) -> None:
        """
        Set up the React fetcher.

        Args:
            output_dir: Directory to save documentation
            rate_limit: Seconds between requests
            skip_existing: Skip existing files
            logger: Logger instance
            max_workers: Number of concurrent workers
        """
        super().__init__(output_dir, rate_limit, skip_existing, logger, max_workers)
        self.sitemap_url = "https://react.dev/sitemap.xml"
        self.base_url = "https://react.dev/"

    def fetch(self) -> None:
        """Download every React documentation page found in the sitemap."""
        self.logger.info("Fetching React documentation")

        sitemap_entries = self.fetch_sitemap(self.sitemap_url)
        if not sitemap_entries:
            self.logger.error("No URLs found in React sitemap")
            return

        wanted = ["/reference/", "/learn/"]
        unwanted = ["/blog/", "/community/"]
        doc_urls = self.filter_urls(sitemap_entries, include_patterns=wanted, exclude_patterns=unwanted)
        self.logger.info(f"Found {len(doc_urls)} documentation URLs")

        # Pair every URL with its destination, then hand the batch to the pool.
        url_output_pairs = [
            (page_url, self.create_output_path(page_url, self.base_url, "react"))
            for page_url in doc_urls
        ]
        self.fetch_urls_parallel(url_output_pairs)

        self.logger.info("React documentation fetch complete")
        self.print_stats()
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Stripe documentation fetcher."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from ..utils.file_utils import clean_filename
|
|
8
|
+
from .base import BaseFetcher
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class StripeFetcher(BaseFetcher):
    """Fetcher for the Stripe documentation listed in the docs.stripe.com sitemap."""

    def __init__(
        self,
        output_dir: Path,
        rate_limit: float = 0.5,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
    ) -> None:
        """
        Initialize the Stripe fetcher.

        Args:
            output_dir: Directory to save documentation
            rate_limit: Seconds between requests
            skip_existing: Skip existing files
            logger: Logger instance
        """
        super().__init__(output_dir, rate_limit, skip_existing=skip_existing, logger=logger)
        self.sitemap_url = "https://docs.stripe.com/sitemap.xml"
        self.base_url = "https://docs.stripe.com/"

    def fetch(self) -> None:
        """
        Fetch all Stripe documentation pages from the sitemap.

        Pages are stored under ``stripe/<first>/<second>`` using the first
        two URL path segments, ``stripe/<first>`` for single-segment paths,
        and ``stripe/other`` when the URL has no path segments.
        """
        self.logger.info("Fetching Stripe documentation")

        urls = self.fetch_sitemap(self.sitemap_url)

        if not urls:
            self.logger.error("No URLs found in Stripe sitemap")
            return

        exclude_patterns = ["/changelog/", "/upgrades/"]
        urls = self.filter_urls(urls, [self.base_url], exclude_patterns)

        categories = self.categorize_urls(urls, self.base_url)

        # Category summary, largest category first (logging only).
        self.logger.info(f"Found {len(categories)} categories:")
        for cat, cat_urls in sorted(categories.items(), key=lambda x: len(x[1]), reverse=True):
            self.logger.info(f" {cat}: {len(cat_urls)} pages")

        total = len(urls)
        for idx, url in enumerate(urls, 1):
            self.logger.info(f"[{idx}/{total}] Processing Stripe documentation")

            path = url.replace(self.base_url, "").strip("/")
            # Drop empty segments: "".split("/") yields [""], which made the
            # "other" fallback below unreachable for the site root.
            parts = [segment for segment in path.split("/") if segment]

            if len(parts) >= 2:
                category_dir = self.output_dir / "stripe" / parts[0] / parts[1]
            elif len(parts) == 1:
                category_dir = self.output_dir / "stripe" / parts[0]
            else:
                category_dir = self.output_dir / "stripe" / "other"

            filename = clean_filename(url, self.base_url)
            filepath = category_dir / filename
            self.process_url(url, filepath)

        self.logger.info("Stripe documentation fetch complete")
        self.print_stats()
|