docpull 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,282 @@
1
+ """Async generic fetcher with progress bars and JS support."""
2
+
3
+ import asyncio
4
+ import logging
5
+ from pathlib import Path
6
+ from typing import Optional
7
+ from urllib.parse import urljoin, urlparse
8
+
9
+ from bs4 import BeautifulSoup
10
+ from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
11
+
12
+ from ..profiles import SiteProfile, get_profile_by_name, get_profile_for_url
13
+ from .async_fetcher import PLAYWRIGHT_AVAILABLE, AsyncFetcher
14
+ from .base import BaseFetcher
15
+
16
+
17
class GenericAsyncFetcher(BaseFetcher):
    """
    Async generic fetcher with progress bars and optional JS rendering.

    Features:
    - Concurrent page fetching through AsyncFetcher
    - Progress bars with rich
    - Optional JavaScript rendering (only when Playwright is available)
    - URL discovery via sitemap, profile start URLs and/or link crawling
    """

    def __init__(
        self,
        url_or_profile: str,
        output_dir: Path,
        profile: Optional[SiteProfile] = None,
        rate_limit: float = 0.5,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
        max_pages: Optional[int] = None,
        max_depth: int = 5,
        max_concurrent: int = 10,
        use_js: bool = False,
        show_progress: bool = True,
    ) -> None:
        """
        Initialize async generic fetcher.

        Args:
            url_or_profile: URL to scrape (must start with http:// or https://)
                or the name of a registered profile
            output_dir: Directory to save documentation
            profile: Optional SiteProfile; when omitted and a URL is given, a
                matching profile is looked up automatically
            rate_limit: Seconds between requests (replaced by the profile's
                value when a profile is in effect)
            skip_existing: Skip existing files
            logger: Logger instance
            max_pages: Maximum pages to fetch
            max_depth: Maximum crawl depth
            max_concurrent: Maximum concurrent requests
            use_js: Enable JavaScript rendering (requires playwright)
            show_progress: Show progress bars

        Raises:
            ValueError: If ``url_or_profile`` is not a URL and does not name a
                known profile, or the named profile has no start URL.
        """
        super().__init__(output_dir, rate_limit, skip_existing=skip_existing, logger=logger)

        # Determine if input is a URL or profile name
        if url_or_profile.startswith(("http://", "https://")):
            self.start_url = url_or_profile
            if profile is None:
                profile = get_profile_for_url(url_or_profile)
                if profile:
                    self.logger.info(f"Auto-detected profile: {profile.name}")
        else:
            profile = get_profile_by_name(url_or_profile)
            if profile is None:
                raise ValueError(f"Unknown profile: {url_or_profile}")
            start_url_candidate = profile.base_url or (profile.start_urls[0] if profile.start_urls else None)
            if not start_url_candidate:
                raise ValueError(f"Profile {url_or_profile} has no start URL")
            self.start_url = start_url_candidate
            self.logger.info(f"Using profile: {profile.name}")

        self.profile = profile
        self.max_pages = max_pages
        self.max_depth = max_depth
        self.max_concurrent = max_concurrent
        self.use_js = use_js
        self.show_progress = show_progress

        if profile:
            # A profile takes precedence over the constructor's rate_limit.
            self.rate_limit = profile.rate_limit
            self.sitemap_url = profile.sitemap_url
            self.base_url = profile.base_url or self._extract_base_url(self.start_url)
            self.include_patterns = profile.include_patterns
            self.exclude_patterns = profile.exclude_patterns
            self.output_subdir = profile.output_subdir or urlparse(self.start_url).netloc.replace(".", "_")
            self.strip_prefix = profile.strip_prefix
            self.follow_links = profile.follow_links
        else:
            # No profile: probe for a sitemap and restrict crawling to the start URL's origin.
            self.sitemap_url = self._guess_sitemap_url(self.start_url)
            self.base_url = self._extract_base_url(self.start_url)
            self.include_patterns = [self.base_url]
            self.exclude_patterns = []
            self.output_subdir = urlparse(self.start_url).netloc.replace(".", "_")
            self.strip_prefix = None
            self.follow_links = False

        if use_js and not PLAYWRIGHT_AVAILABLE:
            self.logger.warning("Playwright not installed. JS rendering disabled.")
            self.logger.warning(
                "Install with: pip install 'docpull[js]' && python -m playwright install chromium"
            )
            self.use_js = False

    def _extract_base_url(self, url: str) -> str:
        """Return ``scheme://netloc/`` for the given URL."""
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}/"

    def _guess_sitemap_url(self, url: str) -> Optional[str]:
        """
        Probe a few conventional sitemap locations with HEAD requests.

        Args:
            url: Any URL on the target site; only its scheme and host are used.

        Returns:
            The first candidate that answers with HTTP 200, or None if none do.
        """
        base = self._extract_base_url(url)
        common_paths = ["sitemap.xml", "sitemap_index.xml", "docs/sitemap.xml"]

        for path in common_paths:
            sitemap_url = urljoin(base, path)
            try:
                response = self.session.head(sitemap_url, timeout=10)
            except Exception as exc:
                # Best effort: a failing probe just means "try the next candidate".
                self.logger.debug(f"Sitemap probe failed for {sitemap_url}: {exc}")
                continue
            if response.status_code == 200:
                self.logger.info(f"Found sitemap: {sitemap_url}")
                return sitemap_url
        return None

    def _crawl_links(self, start_urls: set[str], max_depth: int = 5) -> set[str]:
        """
        Crawl links breadth-first from the start URLs (sync version for discovery).

        URLs are processed in FIFO order so every page is expanded at the
        shallowest depth at which it is reachable, which makes the result
        independent of set iteration order.

        Args:
            start_urls: URLs to begin crawling from (depth 0).
            max_depth: Pages at this depth are recorded but their links are
                not followed; deeper pages are ignored.

        Returns:
            Every URL that passed ``validate_url`` and was reached within
            ``max_depth``.
        """
        # Local import so this method does not depend on the module's import block.
        from collections import deque

        discovered: set[str] = set()
        queued: set[str] = set(start_urls)
        queue = deque((url, 0) for url in sorted(start_urls))

        while queue:
            url, depth = queue.popleft()

            if depth > max_depth:
                continue

            if not self.validate_url(url):
                continue

            discovered.add(url)

            # Pages at the depth limit are kept but not expanded.
            if depth >= max_depth:
                continue

            try:
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, "html.parser")

                for link in soup.find_all("a", href=True):
                    href = link["href"]
                    if not isinstance(href, str):
                        continue

                    try:
                        absolute_url = urljoin(url, href)
                    except ValueError:
                        # A malformed href must not discard the page's other links.
                        continue
                    # Drop fragment and query string so variants of one page collapse.
                    absolute_url = absolute_url.split("#")[0].split("?")[0]

                    if absolute_url in queued:
                        continue
                    if not any(pattern in absolute_url for pattern in self.include_patterns):
                        continue
                    if any(pattern in absolute_url for pattern in self.exclude_patterns):
                        continue

                    queued.add(absolute_url)
                    queue.append((absolute_url, depth + 1))

            except Exception as exc:
                # Best effort: an unreachable or unparsable page is skipped, not fatal.
                self.logger.debug(f"Could not crawl links from {url}: {exc}")

        return discovered

    def fetch(self) -> None:
        """Fetch documentation (sync wrapper that runs ``fetch_async`` in a new event loop)."""
        asyncio.run(self.fetch_async())

    async def fetch_async(self) -> None:
        """
        Fetch documentation asynchronously with progress bars.

        Steps: discover URLs (sitemap, profile start URLs, optional crawl),
        apply include/exclude patterns, apply ``max_pages``, map each URL to
        an output path, then download through ``AsyncFetcher``.
        """
        self.logger.info(f"Fetching documentation from {self.start_url}")

        urls: set[str] = set()

        # Discover URLs. The spinner honours show_progress like the download bar does.
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            transient=True,
            disable=not self.show_progress,
        ) as progress:
            task = progress.add_task("Discovering URLs...", total=None)

            # Try sitemap
            if self.sitemap_url:
                sitemap_urls = self.fetch_sitemap(self.sitemap_url)
                if sitemap_urls:
                    urls.update(sitemap_urls)
                    progress.update(task, description=f"Found {len(sitemap_urls)} URLs in sitemap")

            # Add start URLs
            if self.profile and self.profile.start_urls:
                urls.update(self.profile.start_urls)

            # Crawl links if the profile asks for it, or as a fallback when nothing else exists
            if self.follow_links or (not urls and not self.sitemap_url):
                start_urls = {self.start_url}
                if self.profile and self.profile.start_urls:
                    start_urls.update(self.profile.start_urls)

                progress.update(task, description=f"Crawling links from {len(start_urls)} URL(s)...")
                crawled_urls = self._crawl_links(start_urls, self.max_depth)
                urls.update(crawled_urls)
                progress.update(task, description=f"Discovered {len(crawled_urls)} URLs via crawling")

        if not urls:
            self.logger.error("No URLs found to fetch")
            return

        # Apply filters (an empty/None pattern list means "no filter of that kind")
        if self.include_patterns or self.exclude_patterns:
            filtered_urls = []
            for url in urls:
                if self.include_patterns and not any(pattern in url for pattern in self.include_patterns):
                    continue
                if self.exclude_patterns and any(pattern in url for pattern in self.exclude_patterns):
                    continue
                filtered_urls.append(url)
            urls = set(filtered_urls)

        if not urls:
            # Avoid starting the async fetcher when there is nothing to download.
            self.logger.warning("No URLs left to fetch after applying include/exclude patterns")
            return

        # Sorted so that the max_pages cut-off is deterministic.
        urls_list = sorted(urls)

        # Apply max_pages limit
        if self.max_pages:
            urls_list = urls_list[: self.max_pages]

        self.logger.info(f"Processing {len(urls_list)} URLs")

        # Prepare URL/path pairs
        url_output_pairs = []
        for url in urls_list:
            if self.profile and self.base_url:
                filepath = self.create_output_path(url, self.base_url, self.output_subdir, self.strip_prefix)
            else:
                # Without a profile, flatten the URL path into a single file name.
                parsed = urlparse(url)
                path = parsed.path.strip("/")
                if not path:
                    path = "index"
                filepath = self.output_dir / self.output_subdir / f"{path.replace('/', '_')}.md"
            url_output_pairs.append((url, filepath))

        # Fetch URLs with progress bar
        async with AsyncFetcher(
            base_fetcher=self,
            max_concurrent=self.max_concurrent,
            use_js=self.use_js,
        ) as async_fetcher:
            if self.show_progress:
                with Progress(
                    SpinnerColumn(),
                    TextColumn("[progress.description]{task.description}"),
                    BarColumn(),
                    TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
                    TimeElapsedColumn(),
                ) as progress:
                    task = progress.add_task(
                        f"Fetching {len(url_output_pairs)} pages...", total=len(url_output_pairs)
                    )

                    # Fetch in batches so the bar can advance between batches
                    for i in range(0, len(url_output_pairs), self.max_concurrent):
                        batch = url_output_pairs[i : i + self.max_concurrent]
                        await async_fetcher.fetch_urls_parallel(batch)
                        progress.update(task, completed=min(i + self.max_concurrent, len(url_output_pairs)))
            else:
                # Fetch without progress
                await async_fetcher.fetch_urls_parallel(url_output_pairs)

        self.logger.info("Fetch complete")
        self.print_stats()
@@ -0,0 +1,50 @@
1
+ """Next.js documentation fetcher."""
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ from .base import BaseFetcher
8
+
9
+
10
class NextJSFetcher(BaseFetcher):
    """Fetcher for the Next.js documentation, driven by the nextjs.org sitemap."""

    def __init__(
        self,
        output_dir: Path,
        rate_limit: float = 0.5,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
    ) -> None:
        """
        Initialize the Next.js fetcher.

        Args:
            output_dir: Directory to save documentation
            rate_limit: Seconds between requests
            skip_existing: Skip existing files
            logger: Logger instance
        """
        super().__init__(output_dir, rate_limit, skip_existing=skip_existing, logger=logger)
        self.base_url = "https://nextjs.org/"
        self.sitemap_url = "https://nextjs.org/sitemap.xml"

    def fetch(self) -> None:
        """Read the sitemap, keep the ``/docs/`` pages, and process them one by one."""
        self.logger.info("Fetching Next.js documentation")

        sitemap_entries = self.fetch_sitemap(self.sitemap_url)
        if not sitemap_entries:
            self.logger.error("No URLs found in Next.js sitemap")
            return

        wanted = ["/docs/"]
        unwanted = ["/blog/", "/showcase/", "/conf/", "/learn/"]
        doc_urls = self.filter_urls(sitemap_entries, include_patterns=wanted, exclude_patterns=unwanted)
        self.logger.info(f"Found {len(doc_urls)} documentation URLs")

        # Log a per-category summary, largest category first.
        by_category = self.categorize_urls(doc_urls, self.base_url)
        self.logger.info(f"Found {len(by_category)} categories:")
        ranked = sorted(by_category.items(), key=lambda entry: len(entry[1]), reverse=True)
        for category_name, members in ranked:
            self.logger.info(f" {category_name}: {len(members)} pages")

        page_count = len(doc_urls)
        for position, page_url in enumerate(doc_urls, start=1):
            self.logger.info(f"[{position}/{page_count}] Processing Next.js documentation")
            target = self.create_output_path(page_url, self.base_url, "next", strip_prefix="docs")
            self.process_url(page_url, target)

        self.logger.info("Next.js documentation fetch complete")
        self.print_stats()
@@ -0,0 +1,93 @@
1
+ """Parallel/concurrent base fetcher for faster downloads."""
2
+
3
+ import logging
4
+ import time
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ from .base import BaseFetcher
10
+
11
+
12
class ParallelFetcher(BaseFetcher):
    """
    Enhanced fetcher with parallel/concurrent downloads.

    Uses ThreadPoolExecutor to run ``process_url`` for several URLs at once.
    """

    def __init__(
        self,
        output_dir: Path,
        rate_limit: float = 0.5,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
        max_workers: int = 10,
    ) -> None:
        """
        Initialize parallel fetcher.

        Args:
            output_dir: Directory to save documentation
            rate_limit: Seconds between requests (per worker)
            skip_existing: Skip existing files
            logger: Logger instance
            max_workers: Number of concurrent workers (default: 10)
        """
        # NOTE(review): the third positional argument is passed as None; its
        # meaning is defined by BaseFetcher.__init__ — confirm against base.py.
        super().__init__(output_dir, rate_limit, None, skip_existing, logger)
        self.max_workers = max_workers

    def process_url_with_metadata(self, url_data: tuple[str, Path]) -> tuple[bool, str]:
        """
        Process a single URL, converting any exception into a failed result.

        Args:
            url_data: Tuple of (url, output_path)

        Returns:
            Tuple of (success, url)
        """
        url, output_path = url_data
        try:
            success = self.process_url(url, output_path)
            return (success, url)
        except Exception as e:
            # Worker-level boundary: one bad URL must not abort the whole batch.
            self.logger.error(f"Error processing {url}: {e}")
            self.stats["errors"] += 1
            return (False, url)

    def fetch_urls_parallel(self, url_output_pairs: list[tuple[str, Path]]) -> None:
        """
        Fetch URLs in parallel and log throughput.

        Progress is logged every 10 completed URLs and once at the end.

        Args:
            url_output_pairs: List of (url, output_path) tuples
        """
        total = len(url_output_pairs)
        self.logger.info(f"Fetching {total} URLs with {self.max_workers} workers...")

        # Monotonic clock: wall-clock adjustments must not skew the measured duration.
        start_time = time.monotonic()

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all tasks
            futures = [
                executor.submit(self.process_url_with_metadata, url_data)
                for url_data in url_output_pairs
            ]

            # Process as they complete
            for completed, future in enumerate(as_completed(futures), start=1):
                # Surface anything the worker did not convert into a result.
                future.result()

                if completed % 10 == 0 or completed == total:
                    elapsed = time.monotonic() - start_time
                    rate = completed / elapsed if elapsed > 0 else 0
                    self.logger.info(
                        f"Progress: {completed}/{total} "
                        f"({completed*100//total}%) "
                        f"- {rate:.1f} docs/sec"
                    )

        elapsed = time.monotonic() - start_time
        # Guard the division: elapsed can be 0.0 for an empty batch on a coarse clock.
        average = total / elapsed if elapsed > 0 else 0.0
        self.logger.info(f"Completed in {elapsed:.1f}s " f"({average:.1f} docs/sec average)")
@@ -0,0 +1,92 @@
1
+ """Plaid documentation fetcher."""
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ from bs4 import BeautifulSoup
8
+
9
+ from ..utils.file_utils import clean_filename
10
+ from .base import BaseFetcher
11
+
12
+
13
class PlaidFetcher(BaseFetcher):
    """Fetcher for Plaid documentation, combining the docs index page and the sitemap."""

    def __init__(
        self,
        output_dir: Path,
        rate_limit: float = 0.5,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
    ) -> None:
        """
        Initialize the Plaid fetcher.

        Args:
            output_dir: Directory to save documentation
            rate_limit: Seconds between requests
            skip_existing: Skip existing files
            logger: Logger instance
        """
        super().__init__(output_dir, rate_limit, skip_existing=skip_existing, logger=logger)
        self.sitemap_url = "https://plaid.com/sitemap.xml"
        self.docs_url = "https://plaid.com/docs/"
        self.base_url = "https://plaid.com/"

    def _plaid_category_dir(self, url: str) -> Path:
        """
        Return the output directory for a Plaid URL.

        URLs containing ``/api/`` go under ``plaid/api-reference``, URLs
        containing ``/docs/`` under ``plaid/guides``, anything else under
        ``plaid/other``. When the path after the marker has more than one
        segment, the first segment becomes a sub-directory.
        """
        plaid_root = self.output_dir / "plaid"

        if "/api/" in url:
            category_dir = plaid_root / "api-reference"
            marker = "/api/"
        elif "/docs/" in url:
            category_dir = plaid_root / "guides"
            marker = "/docs/"
        else:
            return plaid_root / "other"

        # Take the text after the marker rather than stripping a fixed
        # "https://plaid.com/<marker>" prefix: URLs such as
        # https://plaid.com/docs/api/... do not start with that prefix, and the
        # old prefix replace left the full URL in place ("https:" sub-directory).
        path = url.partition(marker)[2].strip("/")

        if "/" in path:
            category_dir = category_dir / path.split("/", 1)[0]
        return category_dir

    def fetch(self) -> None:
        """Collect doc/API URLs from the docs index and the sitemap, then process each one."""
        self.logger.info("Fetching Plaid documentation")

        doc_urls: set[str] = set()

        self.logger.info(f"Fetching Plaid docs index from {self.docs_url}")

        try:
            response = self.session.get(self.docs_url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            for link in soup.find_all("a", href=True):
                href = link["href"]
                if not isinstance(href, str):
                    continue

                # Site-relative links are made absolute before validation.
                if href.startswith(("/docs/", "/api/")):
                    href = "https://plaid.com" + href

                # Validate URL before adding
                if not self.validate_url(href):
                    continue

                if "plaid.com/docs/" in href or "plaid.com/api/" in href:
                    # Drop fragment and query string so variants of one page collapse.
                    href = href.split("#")[0].split("?")[0]
                    doc_urls.add(href)

        except Exception as e:
            # The index page is only one of two sources; keep going with the sitemap.
            self.logger.error(f"Error fetching Plaid docs index: {e}")

        sitemap_urls = self.fetch_sitemap(self.sitemap_url)

        excluded_sections = ["/blog/", "/resources/", "/company/", "/customers/"]
        # "or ()" keeps the loop safe if the sitemap yielded nothing.
        for url in sitemap_urls or ():
            if (
                "/docs/" in url or "/api/" in url
            ) and not any(x in url for x in excluded_sections):
                doc_urls.add(url.split("#")[0].split("?")[0])

        doc_urls_list = sorted(doc_urls)

        self.logger.info(f"Found {len(doc_urls_list)} Plaid documentation URLs")

        total = len(doc_urls_list)
        for idx, url in enumerate(doc_urls_list, 1):
            self.logger.info(f"[{idx}/{total}] Processing Plaid documentation")

            category_dir = self._plaid_category_dir(url)
            filename = clean_filename(url, self.base_url)
            filepath = category_dir / filename
            self.process_url(url, filepath)

        self.logger.info("Plaid documentation fetch complete")
        self.print_stats()
@@ -0,0 +1,59 @@
1
+ """React documentation fetcher."""
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ from .parallel_base import ParallelFetcher
8
+
9
+
10
class ReactFetcher(ParallelFetcher):
    """Fetcher that downloads the React documentation using parallel workers."""

    def __init__(
        self,
        output_dir: Path,
        rate_limit: float = 0.2,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
        max_workers: int = 15,
    ) -> None:
        """
        Set up the React fetcher.

        Args:
            output_dir: Directory to save documentation
            rate_limit: Seconds between requests
            skip_existing: Skip existing files
            logger: Logger instance
            max_workers: Number of concurrent workers
        """
        super().__init__(
            output_dir,
            rate_limit,
            skip_existing=skip_existing,
            logger=logger,
            max_workers=max_workers,
        )
        self.base_url = "https://react.dev/"
        self.sitemap_url = "https://react.dev/sitemap.xml"

    def fetch(self) -> None:
        """Download every ``/reference/`` and ``/learn/`` page listed in the React sitemap."""
        self.logger.info("Fetching React documentation")

        sitemap_entries = self.fetch_sitemap(self.sitemap_url)
        if not sitemap_entries:
            self.logger.error("No URLs found in React sitemap")
            return

        wanted = ["/reference/", "/learn/"]
        unwanted = ["/blog/", "/community/"]
        doc_urls = self.filter_urls(sitemap_entries, include_patterns=wanted, exclude_patterns=unwanted)
        self.logger.info(f"Found {len(doc_urls)} documentation URLs")

        # Pair every page with its destination file, then fan out to the worker pool.
        url_output_pairs = [
            (page_url, self.create_output_path(page_url, self.base_url, "react"))
            for page_url in doc_urls
        ]
        self.fetch_urls_parallel(url_output_pairs)

        self.logger.info("React documentation fetch complete")
        self.print_stats()
@@ -0,0 +1,60 @@
1
+ """Stripe documentation fetcher."""
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ from ..utils.file_utils import clean_filename
8
+ from .base import BaseFetcher
9
+
10
+
11
class StripeFetcher(BaseFetcher):
    """Fetcher for Stripe documentation, driven by the docs.stripe.com sitemap."""

    def __init__(
        self,
        output_dir: Path,
        rate_limit: float = 0.5,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
    ) -> None:
        """
        Initialize the Stripe fetcher.

        Args:
            output_dir: Directory to save documentation
            rate_limit: Seconds between requests
            skip_existing: Skip existing files
            logger: Logger instance
        """
        super().__init__(output_dir, rate_limit, skip_existing=skip_existing, logger=logger)
        self.sitemap_url = "https://docs.stripe.com/sitemap.xml"
        self.base_url = "https://docs.stripe.com/"

    def fetch(self) -> None:
        """
        Read the sitemap, drop changelog/upgrade pages, and process each URL.

        Output files are grouped under ``stripe/<first>/<second>`` using the
        first two path segments of the URL, ``stripe/<first>`` when there is
        only one segment, and ``stripe/other`` when the URL has no path.
        """
        self.logger.info("Fetching Stripe documentation")

        urls = self.fetch_sitemap(self.sitemap_url)

        if not urls:
            self.logger.error("No URLs found in Stripe sitemap")
            return

        exclude_patterns = ["/changelog/", "/upgrades/"]
        urls = self.filter_urls(urls, [self.base_url], exclude_patterns)

        categories = self.categorize_urls(urls, self.base_url)

        # Per-category summary, largest category first.
        self.logger.info(f"Found {len(categories)} categories:")
        for cat, cat_urls in sorted(categories.items(), key=lambda x: len(x[1]), reverse=True):
            self.logger.info(f" {cat}: {len(cat_urls)} pages")

        stripe_root = self.output_dir / "stripe"
        total = len(urls)
        for idx, url in enumerate(urls, 1):
            self.logger.info(f"[{idx}/{total}] Processing Stripe documentation")

            path = url.replace(self.base_url, "").strip("/")
            # str.split never returns an empty list ("" -> [""]), so drop empty
            # segments; otherwise the "other" fallback below can never be reached.
            parts = [segment for segment in path.split("/") if segment]

            if len(parts) >= 2:
                category_dir = stripe_root / parts[0] / parts[1]
            elif parts:
                category_dir = stripe_root / parts[0]
            else:
                category_dir = stripe_root / "other"

            filename = clean_filename(url, self.base_url)
            filepath = category_dir / filename
            self.process_url(url, filepath)

        self.logger.info("Stripe documentation fetch complete")
        self.print_stats()
+ self.print_stats()