docpull 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docpull/__init__.py +29 -0
- docpull/__main__.py +6 -0
- docpull/cli.py +440 -0
- docpull/config.py +199 -0
- docpull/fetchers/__init__.py +23 -0
- docpull/fetchers/async_fetcher.py +322 -0
- docpull/fetchers/base.py +450 -0
- docpull/fetchers/bun.py +59 -0
- docpull/fetchers/d3.py +211 -0
- docpull/fetchers/generic.py +255 -0
- docpull/fetchers/generic_async.py +282 -0
- docpull/fetchers/nextjs.py +50 -0
- docpull/fetchers/parallel_base.py +93 -0
- docpull/fetchers/plaid.py +92 -0
- docpull/fetchers/react.py +59 -0
- docpull/fetchers/stripe.py +60 -0
- docpull/fetchers/tailwind.py +59 -0
- docpull/fetchers/turborepo.py +57 -0
- docpull/profiles/__init__.py +70 -0
- docpull/profiles/base.py +64 -0
- docpull/profiles/bun.py +14 -0
- docpull/profiles/d3.py +17 -0
- docpull/profiles/nextjs.py +15 -0
- docpull/profiles/plaid.py +16 -0
- docpull/profiles/react.py +14 -0
- docpull/profiles/stripe.py +14 -0
- docpull/profiles/tailwind.py +14 -0
- docpull/profiles/turborepo.py +14 -0
- docpull/py.typed +0 -0
- docpull/utils/__init__.py +6 -0
- docpull/utils/file_utils.py +97 -0
- docpull/utils/logging_config.py +54 -0
- docpull-1.0.1.dist-info/METADATA +440 -0
- docpull-1.0.1.dist-info/RECORD +38 -0
- docpull-1.0.1.dist-info/WHEEL +5 -0
- docpull-1.0.1.dist-info/entry_points.txt +2 -0
- docpull-1.0.1.dist-info/licenses/LICENSE +21 -0
- docpull-1.0.1.dist-info/top_level.txt +1 -0
docpull/fetchers/d3.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
"""D3.js documentation fetcher from DevDocs.io."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import re
|
|
5
|
+
import time
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Optional, cast
|
|
8
|
+
|
|
9
|
+
from .parallel_base import ParallelFetcher
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class D3DevDocsFetcher(ParallelFetcher):
    """Fetcher for D3.js documentation from DevDocs.io.

    Downloads the DevDocs index for one D3 version, fetches every entry's
    HTML page with a thread pool, converts it to Markdown and writes it to
    ``<output_dir>/d3/<type>/<path>.md``.

    This class uses ``self.session``, ``self.h2t``, ``self.stats``,
    ``self.logger``, ``self.save_content`` and ``self.print_stats`` without
    defining them; they are presumably provided by ``ParallelFetcher`` or
    one of its bases (not visible here).

    NOTE(review): ``self.stats[...] += 1`` is executed from worker threads in
    ``process_entry``. A read-modify-write on a dict item is not atomic, so
    counters may be slightly off under contention unless the base class
    guards them — confirm.
    """

    def __init__(
        self,
        output_dir: Path,
        rate_limit: float = 0.05,  # DevDocs can handle faster requests
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
        max_workers: int = 20,  # More workers for DevDocs
        version: str = "7",  # D3 version
    ) -> None:
        """
        Initialize D3 DevDocs fetcher.

        Args:
            output_dir: Directory to save documentation
            rate_limit: Seconds between requests (per worker)
            skip_existing: Skip existing files
            logger: Logger instance
            max_workers: Number of concurrent workers
            version: D3 version (default: 7 for latest)
        """
        super().__init__(output_dir, rate_limit, skip_existing, logger, max_workers)
        self.version = version
        # DevDocs addresses a documentation set as "<name>~<version>".
        self.doc_slug = f"d3~{version}"
        self.base_url = "https://devdocs.io/"
        self.api_base = "https://documents.devdocs.io/"
        self.index_url = f"{self.api_base}{self.doc_slug}/index.json"

    def fetch_index(self) -> list[dict[Any, Any]]:
        """
        Fetch the documentation index from DevDocs API.

        Returns:
            List of entry dictionaries; an empty list when the index cannot
            be downloaded or parsed (the error is logged, not raised).
        """
        self.logger.info(f"Fetching D3 v{self.version} index from DevDocs")

        try:
            response = self.session.get(self.index_url, timeout=30)
            response.raise_for_status()

            data = response.json()
            # Keep only mapping entries: every later step calls .get() on them.
            raw_entries = data.get("entries", [])
            entries = cast(
                list[dict[Any, Any]],
                [item for item in raw_entries if isinstance(item, dict)],
            )

            self.logger.info(f"Found {len(entries)} D3 documentation entries")
            return entries

        except Exception as e:
            self.logger.error(f"Error fetching index: {e}")
            return []

    @staticmethod
    def _type_directory(entry_type: Any) -> str:
        """Turn an entry type into a single safe directory name."""
        slug = str(entry_type).lower().replace(" ", "-")
        # Anything outside [A-Za-z0-9_-] (including '/' and '.') becomes '-',
        # so the result can never introduce extra path components.
        slug = re.sub(r"[^\w\-]", "-", slug)
        slug = slug.strip("-").strip(".")
        return slug or "other"

    def _render_entry(self, entry: dict) -> str:
        """
        Download one entry and return its Markdown with frontmatter.

        Unlike ``fetch_entry_content`` this lets every failure propagate so
        the caller can decide what to do with it.

        Raises:
            KeyError: If the entry has no 'path' key.
            Exception: Whatever the HTTP session or the converters raise.
        """
        path = entry["path"]
        # A '#...' suffix is a URL fragment and is never sent to the server,
        # so ".html" must be appended to the page part, not after the anchor.
        # Paths without '#' are unaffected.
        page = str(path).partition("#")[0]
        url = f"{self.api_base}{self.doc_slug}/{page}.html"

        response = self.session.get(url, timeout=30)
        response.raise_for_status()

        # DevDocs returns HTML, convert to markdown
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(response.content, "html.parser")
        markdown = self.h2t.handle(str(soup))

        frontmatter_lines = [
            "---",
            f"name: {entry.get('name', '')}",
            f"type: {entry.get('type', '')}",
            f"path: {path}",
            f"source: DevDocs.io - D3.js v{self.version}",
            f"url: {self.base_url}d3~{self.version}/{path}",
            f"fetched: {time.strftime('%Y-%m-%d')}",
            "---",
        ]
        # Frontmatter is followed by exactly one blank line.
        return "\n".join(frontmatter_lines) + "\n\n" + markdown.strip()

    def fetch_entry_content(self, entry: dict) -> str:
        """
        Fetch content for a single entry from DevDocs.

        Never raises: on failure the error is logged, ``stats["errors"]`` is
        incremented and a short "# Error" Markdown document is returned.

        Args:
            entry: Entry dictionary with 'path' key

        Returns:
            Markdown content with frontmatter, or an error document
        """
        path = entry.get("path", "")
        try:
            return self._render_entry(entry)
        except Exception as e:
            self.logger.error(f"Error fetching {path}: {e}")
            self.stats["errors"] += 1
            return f"# Error\n\nFailed to fetch {path}\n\nError: {str(e)}"

    def process_entry(self, entry_data: tuple[dict, Path]) -> tuple[bool, str]:
        """
        Process a single entry: skip, or download and save it.

        Args:
            entry_data: Tuple of (entry, output_path)

        Returns:
            Tuple of (success, entry_name). ``success`` is False when the
            download failed; nothing is written in that case.
        """
        entry, output_path = entry_data
        name = str(entry.get("name") or entry.get("path") or "unknown")

        # Skip if exists
        if self.skip_existing and output_path.exists():
            self.logger.debug(f"Skipping (already exists): {name}")
            self.stats["skipped"] += 1
            return (True, name)

        # Fetch content
        try:
            content = self._render_entry(entry)
        except Exception as e:
            self.logger.error(f"Error fetching {entry.get('path', name)}: {e}")
            self.stats["errors"] += 1
            # Deliberately do not save an error placeholder: it would be
            # counted as fetched and, with skip_existing, never retried.
            time.sleep(self.rate_limit)
            return (False, name)

        # Save
        self.save_content(content, output_path)
        self.stats["fetched"] += 1

        # Rate limiting
        time.sleep(self.rate_limit)

        return (True, name)

    def fetch(self) -> None:
        """Fetch all D3.js documentation from DevDocs."""
        self.logger.info(f"Fetching D3.js v{self.version} documentation from DevDocs")

        entries = self.fetch_index()

        if not entries:
            self.logger.error("No entries found")
            return

        # Group by type for organization (a missing/null type counts as "Other")
        by_type: dict[str, list[dict[Any, Any]]] = {}
        for entry in entries:
            by_type.setdefault(entry.get("type") or "Other", []).append(entry)

        self.logger.info(f"Found {len(by_type)} categories:")
        for cat, cat_entries in sorted(by_type.items(), key=lambda x: len(x[1]), reverse=True):
            self.logger.info(f" {cat}: {len(cat_entries)} entries")

        from ..utils.file_utils import validate_output_path

        # Prepare (entry, output_path) jobs
        jobs: list[tuple[dict[Any, Any], Path]] = []
        for entry in entries:
            path = entry.get("path")
            if not path:
                self.logger.error(f"Skipping entry without a path: {entry.get('name', 'unknown')}")
                continue

            # Organize by type; the directory name is sanitized to prevent path traversal
            type_dir = self._type_directory(entry.get("type") or "Other")

            # Generate filename from path
            filename = str(path).replace("/", "-") + ".md"

            output_path = self.output_dir / "d3" / type_dir / filename

            # Validate output path to prevent path traversal
            try:
                output_path = validate_output_path(output_path, self.output_dir)
            except ValueError as e:
                self.logger.error(f"Invalid output path for entry {entry.get('name', 'unknown')}: {e}")
                continue

            jobs.append((entry, output_path))

        self.logger.info(f"Fetching with {self.max_workers} concurrent workers")
        # Monotonic clock: elapsed time cannot go negative if the wall clock is adjusted.
        start_time = time.monotonic()

        from concurrent.futures import ThreadPoolExecutor, as_completed

        total = len(jobs)

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = {executor.submit(self.process_entry, job): job[0] for job in jobs}

            for completed, future in enumerate(as_completed(futures), start=1):
                try:
                    future.result()
                except Exception as e:
                    # result() re-raises whatever the worker raised (e.g. a
                    # failed write); one bad entry must not abort the rest.
                    failed_entry = futures[future]
                    self.logger.error(
                        f"Unhandled error processing {failed_entry.get('name', 'unknown')}: {e}"
                    )
                    self.stats["errors"] += 1

                if completed % 50 == 0 or completed == total:
                    elapsed = time.monotonic() - start_time
                    rate = completed / elapsed if elapsed > 0 else 0
                    self.logger.info(
                        f"[{completed}/{total}] " f"({completed*100//total}%) " f"- {rate:.1f} docs/sec"
                    )

        elapsed = time.monotonic() - start_time
        # Guard the division: a run where everything is skipped can finish
        # within the clock's resolution.
        overall_rate = total / elapsed if elapsed > 0 else 0.0
        self.logger.info(f"Completed in {elapsed:.1f}s ({overall_rate:.1f} docs/sec)")

        self.logger.info("D3.js documentation fetch complete")
        self.print_stats()
|
|
docpull/fetchers/generic.py
ADDED
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
"""Generic documentation fetcher that works with any URL."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
from urllib.parse import urljoin, urlparse
|
|
7
|
+
|
|
8
|
+
from bs4 import BeautifulSoup
|
|
9
|
+
|
|
10
|
+
from ..profiles import SiteProfile, get_profile_by_name, get_profile_for_url
|
|
11
|
+
from .base import BaseFetcher
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class GenericFetcher(BaseFetcher):
    """
    Generic fetcher that can scrape documentation from any URL.

    Supports both profile-based (optimized) and generic scraping modes.

    This class uses ``self.session``, ``self.logger``, ``self.output_dir``,
    ``validate_url``, ``fetch_sitemap``, ``create_output_path``,
    ``process_url`` and ``print_stats`` without defining them; they are
    presumably provided by ``BaseFetcher`` (not visible here).
    """

    def __init__(
        self,
        url_or_profile: str,
        output_dir: Path,
        profile: Optional[SiteProfile] = None,
        rate_limit: float = 0.5,
        skip_existing: bool = True,
        logger: Optional[logging.Logger] = None,
        max_pages: Optional[int] = None,
        max_depth: int = 5,
    ) -> None:
        """
        Initialize generic fetcher.

        Note that in generic mode (no profile) construction probes the site
        for a sitemap, i.e. it performs network requests.

        Args:
            url_or_profile: URL to scrape or profile name (e.g., 'stripe')
            output_dir: Directory to save documentation
            profile: Optional SiteProfile to use (overrides auto-detection).
                Ignored when ``url_or_profile`` is a profile name.
            rate_limit: Seconds between requests. When a profile is in
                effect, the profile's own rate limit replaces this value.
            skip_existing: Skip existing files
            logger: Logger instance
            max_pages: Maximum pages to fetch
            max_depth: Maximum crawl depth

        Raises:
            ValueError: If the profile name is unknown or the profile has
                neither a base URL nor any start URL.
        """
        super().__init__(output_dir, rate_limit, skip_existing=skip_existing, logger=logger)

        # Determine if input is a URL or profile name
        if url_or_profile.startswith(("http://", "https://")):
            self.start_url = url_or_profile
            # Try to auto-detect profile
            if profile is None:
                profile = get_profile_for_url(url_or_profile)
                if profile:
                    self.logger.info(f"Auto-detected profile: {profile.name}")
        else:
            # Treat as profile name
            profile = get_profile_by_name(url_or_profile)
            if profile is None:
                raise ValueError(f"Unknown profile: {url_or_profile}")
            start_url_candidate = profile.base_url or (profile.start_urls[0] if profile.start_urls else None)
            if not start_url_candidate:
                raise ValueError(f"Profile {url_or_profile} has no start URL")
            self.start_url = start_url_candidate
            self.logger.info(f"Using profile: {profile.name}")

        self.profile = profile
        self.max_pages = max_pages
        self.max_depth = max_depth

        # Fallback output directory name derived from the host, e.g. "docs_example_com".
        default_subdir = urlparse(self.start_url).netloc.replace(".", "_")

        if profile:
            # Set defaults from profile
            self.rate_limit = profile.rate_limit
            self.sitemap_url = profile.sitemap_url
            self.base_url = profile.base_url or self._extract_base_url(self.start_url)
            self.include_patterns = profile.include_patterns
            self.exclude_patterns = profile.exclude_patterns
            self.output_subdir = profile.output_subdir or default_subdir
            self.strip_prefix = profile.strip_prefix
            self.follow_links = profile.follow_links
        else:
            # Generic mode - infer from URL
            self.sitemap_url = self._guess_sitemap_url(self.start_url)
            self.base_url = self._extract_base_url(self.start_url)
            self.include_patterns = [self.base_url]
            self.exclude_patterns = []
            self.output_subdir = default_subdir
            self.strip_prefix = None
            self.follow_links = False

    def _extract_base_url(self, url: str) -> str:
        """Return ``scheme://host/`` for a full URL."""
        parts = urlparse(url)
        return f"{parts.scheme}://{parts.netloc}/"

    def _guess_sitemap_url(self, url: str) -> Optional[str]:
        """
        Guess sitemap URL for a given domain.

        Sends a HEAD request to a few conventional locations and returns the
        first one answering with status 200.

        Args:
            url: URL to guess sitemap for

        Returns:
            Guessed sitemap URL or None
        """
        base = self._extract_base_url(url)

        for candidate in ("sitemap.xml", "sitemap_index.xml", "docs/sitemap.xml"):
            sitemap_url = urljoin(base, candidate)
            try:
                self.logger.debug(f"Trying sitemap: {sitemap_url}")
                response = self.session.head(sitemap_url, timeout=10)
            except Exception as e:
                # Best effort: an unreachable candidate just means "try the
                # next one", but leave a trace for debugging.
                self.logger.debug(f"Sitemap probe failed for {sitemap_url}: {e}")
                continue
            if response.status_code == 200:
                self.logger.info(f"Found sitemap: {sitemap_url}")
                return sitemap_url

        return None

    def _crawl_links(self, start_urls: set[str], max_depth: int = 5) -> set[str]:
        """
        Crawl links from start URLs.

        The crawl is breadth-first, so every URL is first reached at its
        smallest depth and is expanded whenever that depth is below
        ``max_depth``.

        Args:
            start_urls: URLs to start crawling from
            max_depth: Maximum depth to crawl; pages at exactly this depth
                are recorded but their links are not followed

        Returns:
            Set of discovered URLs (including valid start URLs)
        """
        from collections import deque

        discovered: set[str] = set()
        # URLs already queued; marking on enqueue keeps the queue free of
        # duplicates and is safe because FIFO order reaches the shallowest
        # occurrence first.
        seen: set[str] = set(start_urls)
        queue: deque[tuple[str, int]] = deque((start, 0) for start in sorted(start_urls))

        while queue:
            url, depth = queue.popleft()

            # Only reachable when max_depth is negative.
            if depth > max_depth:
                continue

            if not self.validate_url(url):
                continue

            discovered.add(url)

            if depth >= max_depth:
                continue

            try:
                self.logger.debug(f"Crawling: {url} (depth {depth})")
                response = self.session.get(url, timeout=30)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, "html.parser")

                for link in soup.find_all("a", href=True):
                    href = link["href"]
                    if not isinstance(href, str):
                        continue

                    # Resolve relative URLs, then remove fragments and query params
                    absolute_url = urljoin(url, href).split("#")[0].split("?")[0]

                    if absolute_url in seen:
                        continue

                    # Check if URL matches patterns. An empty include list
                    # admits no links here, which keeps the crawl on-site.
                    if not any(pattern in absolute_url for pattern in self.include_patterns):
                        continue

                    if any(pattern in absolute_url for pattern in self.exclude_patterns):
                        continue

                    seen.add(absolute_url)
                    queue.append((absolute_url, depth + 1))

            except Exception as e:
                self.logger.debug(f"Error crawling {url}: {e}")
                continue

        return discovered

    def _passes_filters(self, url: str) -> bool:
        """Return True if the URL satisfies the include/exclude substring filters."""
        if self.include_patterns and not any(pattern in url for pattern in self.include_patterns):
            return False
        if self.exclude_patterns and any(pattern in url for pattern in self.exclude_patterns):
            return False
        return True

    def fetch(self) -> None:
        """
        Fetch documentation using profile or generic scraping.

        URLs are collected from the sitemap and the profile's start URLs,
        optionally extended by crawling, then filtered, sorted, truncated to
        ``max_pages`` and processed one by one.
        """
        self.logger.info(f"Fetching documentation from {self.start_url}")

        urls: set[str] = set()

        # Try sitemap first
        if self.sitemap_url:
            sitemap_urls = self.fetch_sitemap(self.sitemap_url)
            if sitemap_urls:
                urls.update(sitemap_urls)
                self.logger.info(f"Found {len(sitemap_urls)} URLs in sitemap")

        # Add start URLs if using profile
        if self.profile and self.profile.start_urls:
            urls.update(self.profile.start_urls)

        # Crawl links if requested, or as a fallback whenever nothing was
        # found so far - including the case where a sitemap URL is known but
        # yielded no URLs, which would otherwise fetch nothing at all.
        if self.follow_links or not urls:
            start_urls = {self.start_url}
            if self.profile and self.profile.start_urls:
                start_urls.update(self.profile.start_urls)

            self.logger.info(f"Crawling links from {len(start_urls)} start URL(s)")
            crawled_urls = self._crawl_links(start_urls, self.max_depth)
            urls.update(crawled_urls)
            self.logger.info(f"Discovered {len(crawled_urls)} URLs via crawling")

        if not urls:
            self.logger.error("No URLs found to fetch")
            return

        # Apply filters, then sort so runs are deterministic
        urls_list = sorted(candidate for candidate in urls if self._passes_filters(candidate))

        # Apply max_pages limit (None or 0 means unlimited)
        if self.max_pages:
            urls_list = urls_list[: self.max_pages]
            self.logger.info(f"Limited to {self.max_pages} pages")

        self.logger.info(f"Processing {len(urls_list)} URLs")

        # Process each URL
        total = len(urls_list)
        for idx, url in enumerate(urls_list, 1):
            self.logger.info(f"[{idx}/{total}] Processing: {url}")

            if self.profile and self.base_url:
                filepath = self.create_output_path(url, self.base_url, self.output_subdir, self.strip_prefix)
            else:
                # Generic path creation: flatten the URL path into one file name
                path = urlparse(url).path.strip("/") or "index"
                filepath = self.output_dir / self.output_subdir / f"{path.replace('/', '_')}.md"

            self.process_url(url, filepath)

        self.logger.info("Fetch complete")
        self.print_stats()
|