docpull 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docpull/__init__.py +29 -0
- docpull/__main__.py +6 -0
- docpull/cli.py +440 -0
- docpull/config.py +199 -0
- docpull/fetchers/__init__.py +23 -0
- docpull/fetchers/async_fetcher.py +322 -0
- docpull/fetchers/base.py +450 -0
- docpull/fetchers/bun.py +59 -0
- docpull/fetchers/d3.py +211 -0
- docpull/fetchers/generic.py +255 -0
- docpull/fetchers/generic_async.py +282 -0
- docpull/fetchers/nextjs.py +50 -0
- docpull/fetchers/parallel_base.py +93 -0
- docpull/fetchers/plaid.py +92 -0
- docpull/fetchers/react.py +59 -0
- docpull/fetchers/stripe.py +60 -0
- docpull/fetchers/tailwind.py +59 -0
- docpull/fetchers/turborepo.py +57 -0
- docpull/profiles/__init__.py +70 -0
- docpull/profiles/base.py +64 -0
- docpull/profiles/bun.py +14 -0
- docpull/profiles/d3.py +17 -0
- docpull/profiles/nextjs.py +15 -0
- docpull/profiles/plaid.py +16 -0
- docpull/profiles/react.py +14 -0
- docpull/profiles/stripe.py +14 -0
- docpull/profiles/tailwind.py +14 -0
- docpull/profiles/turborepo.py +14 -0
- docpull/py.typed +0 -0
- docpull/utils/__init__.py +6 -0
- docpull/utils/file_utils.py +97 -0
- docpull/utils/logging_config.py +54 -0
- docpull-1.0.1.dist-info/METADATA +440 -0
- docpull-1.0.1.dist-info/RECORD +38 -0
- docpull-1.0.1.dist-info/WHEEL +5 -0
- docpull-1.0.1.dist-info/entry_points.txt +2 -0
- docpull-1.0.1.dist-info/licenses/LICENSE +21 -0
- docpull-1.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
"""Async fetcher with JavaScript rendering support."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import time
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING, Any, List, Optional, Tuple
|
|
7
|
+
|
|
8
|
+
import aiohttp
|
|
9
|
+
from bs4 import BeautifulSoup
|
|
10
|
+
|
|
11
|
+
from ..utils.file_utils import ensure_dir, validate_output_path
|
|
12
|
+
from .base import BaseFetcher
|
|
13
|
+
|
|
14
|
+
# Optional Playwright support
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from playwright.async_api import Browser, Page, Playwright
|
|
17
|
+
|
|
18
|
+
try:
|
|
19
|
+
from playwright.async_api import async_playwright
|
|
20
|
+
|
|
21
|
+
PLAYWRIGHT_AVAILABLE = True
|
|
22
|
+
except ImportError:
|
|
23
|
+
PLAYWRIGHT_AVAILABLE = False
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class AsyncFetcher:
    """
    Async fetcher with optional JavaScript rendering support.

    Pages are fetched either over plain HTTP (aiohttp) or through a
    Playwright Chromium browser, converted to Markdown and written to disk.

    Safeguards implemented by this class:
    - Every URL is checked with ``base_fetcher.validate_url`` before use
    - Concurrency is bounded by an asyncio semaphore
    - An optional delay (``base_fetcher.rate_limit``) is applied after each
      fetch attempt while the concurrency slot is still held
    - Timeouts: 30 s per HTTP request, MAX_JS_RENDER_TIME per page render
    - HTTP bodies larger than MAX_CONTENT_SIZE are rejected

    NOTE(review): the browser is launched with ``--no-sandbox`` and
    ``--disable-web-security`` (see ``__aenter__``), so Chromium's own
    sandbox and same-origin policy are NOT active for rendered pages.
    """

    MAX_CONTENT_SIZE = 50 * 1024 * 1024  # 50 MB
    MAX_DOWNLOAD_TIME = 300  # 5 minutes (not referenced by the methods of this class)
    MAX_JS_RENDER_TIME = 30  # 30 seconds for JS rendering
    MAX_CONCURRENT = 10  # Max concurrent requests (not referenced by the methods of this class)

    def __init__(
        self,
        base_fetcher: "BaseFetcher",
        max_concurrent: int = 10,
        use_js: bool = False,
        headless: bool = True,
    ) -> None:
        """
        Initialize async fetcher.

        Args:
            base_fetcher: BaseFetcher instance for URL validation and settings
            max_concurrent: Maximum concurrent requests (must be >= 1)
            use_js: Enable JavaScript rendering with Playwright
            headless: Run browser in headless mode

        Raises:
            ValueError: If max_concurrent is smaller than 1
        """
        # A semaphore with value 0 would make every fetch wait forever.
        if max_concurrent < 1:
            raise ValueError(f"max_concurrent must be >= 1, got {max_concurrent}")

        self.base_fetcher = base_fetcher
        self.logger = base_fetcher.logger
        self.max_concurrent = max_concurrent
        self.use_js = use_js
        self.headless = headless

        # Async-safe rate limiting
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.rate_limit_delay = base_fetcher.rate_limit

        # Browser instance (if using JS)
        self.browser: Optional["Browser"] = None
        self.playwright: Optional["Playwright"] = None

        if use_js and not PLAYWRIGHT_AVAILABLE:
            self.logger.warning("Playwright not installed. Install with: pip install docpull[js]")
            self.logger.warning("Falling back to non-JS mode")
            self.use_js = False

    async def __aenter__(self) -> "AsyncFetcher":
        """Start Playwright and launch Chromium when JS rendering is enabled."""
        if self.use_js and PLAYWRIGHT_AVAILABLE:
            self.playwright = await async_playwright().start()
            try:
                self.browser = await self.playwright.chromium.launch(
                    headless=self.headless,
                    args=[
                        "--disable-dev-shm-usage",  # Prevent memory issues
                        # NOTE(review): the next two flags turn off the
                        # Chromium sandbox; confirm they are still required.
                        "--no-sandbox",  # Required for some environments
                        "--disable-setuid-sandbox",
                        # NOTE(review): disables the same-origin policy for
                        # every rendered page; confirm this is intended.
                        "--disable-web-security",  # For CORS, but still validate URLs
                    ],
                )
            except BaseException:
                # __aexit__ is not called when __aenter__ raises, so the
                # driver has to be stopped here to avoid leaking it.
                await self.playwright.stop()
                self.playwright = None
                raise
            self.logger.info("Browser launched for JavaScript rendering")
        return self

    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Close the browser and stop Playwright, even if closing fails."""
        browser, playwright = self.browser, self.playwright
        # Drop the references first so a closed browser is never reused.
        self.browser = None
        self.playwright = None
        try:
            if browser:
                await browser.close()
        finally:
            if playwright:
                await playwright.stop()
                self.logger.info("Browser closed")

    async def fetch_with_js(self, url: str) -> str:
        """
        Fetch page content with JavaScript rendering.

        Args:
            url: URL to fetch

        Returns:
            Rendered HTML content

        Raises:
            RuntimeError: If the browser has not been started
            ValueError: If the URL fails validation

        Security measures:
        - URL validation before fetch
        - Timeout limits
        - Blocks certain resource types (images, fonts, media) to speed up
        """
        if not self.browser:
            raise RuntimeError("Browser not initialized. Use async context manager.")

        if not self.base_fetcher.validate_url(url):
            raise ValueError(f"Invalid URL: {url}")

        user_agent = self.base_fetcher.session.headers.get("User-Agent")
        if isinstance(user_agent, bytes):
            user_agent = user_agent.decode("utf-8")

        blocked_types = ("image", "font", "media")

        # Block unnecessary resources to speed up loading
        async def route_handler(route: Any) -> None:
            if route.request.resource_type in blocked_types:
                await route.abort()
            else:
                await route.continue_()

        context = await self.browser.new_context(
            user_agent=user_agent,
            viewport={"width": 1920, "height": 1080},
        )
        try:
            page = await context.new_page()
            await page.route("**/*", route_handler)

            # Navigate with timeout (Playwright expects milliseconds)
            await page.goto(
                url,
                wait_until="networkidle",
                timeout=self.MAX_JS_RENDER_TIME * 1000,
            )

            # Get rendered HTML
            return await page.content()
        except Exception as e:
            self.logger.error(f"JS rendering error for {url}: {e}")
            raise
        finally:
            # Closing the context also closes the pages that belong to it,
            # so this single call cleans up even when new_page() failed.
            await context.close()

    @staticmethod
    def _decode_body(raw: bytes, charset: Optional[str]) -> str:
        """Decode *raw* using the declared charset, falling back to UTF-8."""
        try:
            return raw.decode(charset or "utf-8", errors="ignore")
        except LookupError:
            # The server declared an encoding Python does not know.
            return raw.decode("utf-8", errors="ignore")

    async def fetch_without_js(self, session: "aiohttp.ClientSession", url: str) -> str:
        """
        Fetch page content without JavaScript (faster).

        Args:
            session: aiohttp ClientSession
            url: URL to fetch

        Returns:
            HTML content

        Raises:
            ValueError: If the URL or content type is rejected, or the body
                exceeds MAX_CONTENT_SIZE
        """
        if not self.base_fetcher.validate_url(url):
            raise ValueError(f"Invalid URL: {url}")

        try:
            # NOTE(review): only the initial URL is validated; aiohttp follows
            # redirects by default - confirm redirect targets are acceptable.
            async with session.get(
                url,
                timeout=aiohttp.ClientTimeout(total=30),
                headers=self.base_fetcher.session.headers,
            ) as response:
                response.raise_for_status()

                # Validate content type
                content_type = response.headers.get("Content-Type", "")
                if not self.base_fetcher.validate_content_type(content_type):
                    raise ValueError(f"Invalid content type: {content_type}")

                # Check the declared size; a malformed header is ignored
                # because the streaming check below still enforces the limit.
                content_length = response.headers.get("Content-Length")
                if content_length:
                    try:
                        declared_size = int(content_length)
                    except ValueError:
                        declared_size = None
                    if declared_size is not None and declared_size > self.MAX_CONTENT_SIZE:
                        raise ValueError(f"Content too large: {content_length} bytes")

                # Read with size limit (bytearray avoids quadratic copying)
                body = bytearray()
                async for chunk in response.content.iter_chunked(8192):
                    body += chunk
                    if len(body) > self.MAX_CONTENT_SIZE:
                        raise ValueError("Content size limit exceeded")

                return self._decode_body(bytes(body), response.charset)

        except Exception as e:
            self.logger.error(f"HTTP fetch error for {url}: {e}")
            raise

    def _html_to_markdown(self, html_content: str, url: str) -> str:
        """Extract the main content of *html_content* and render it as Markdown."""
        import re

        soup = BeautifulSoup(html_content, "html.parser")

        # Remove unwanted elements
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        # Find main content, from most to least specific container
        main_content = (
            soup.find("main")
            or soup.find("article")
            or soup.find(class_=re.compile(r"content|documentation|docs"))
            or soup.find("body")
        )
        if not main_content:
            return f"# Error\n\nCould not find main content for {url}"

        # Convert to markdown
        markdown = self.base_fetcher.h2t.handle(str(main_content))
        frontmatter = f"---\nurl: {url}\nfetched: {time.strftime('%Y-%m-%d')}\n---\n\n"
        return frontmatter + markdown.strip()

    async def fetch_url(
        self,
        session: Optional["aiohttp.ClientSession"],
        url: str,
        output_path: Path,
    ) -> bool:
        """
        Fetch single URL with rate limiting and save to file.

        Args:
            session: aiohttp session (None if using JS)
            url: URL to fetch
            output_path: Where to save content

        Returns:
            True if the page was fetched and saved, False if it was skipped
            or an error occurred (errors are logged and counted in
            ``base_fetcher.stats``, not raised)
        """
        async with self.semaphore:  # Limit concurrency
            if not self.base_fetcher.validate_url(url):
                self.logger.warning(f"Skipping invalid URL: {url}")
                self.base_fetcher.stats["errors"] += 1
                return False

            try:
                validated_path = validate_output_path(output_path, self.base_fetcher.output_dir)
            except ValueError as e:
                self.logger.error(f"Path validation failed: {e}")
                self.base_fetcher.stats["errors"] += 1
                return False

            # Skip if exists
            if self.base_fetcher.skip_existing and validated_path.exists():
                self.logger.debug(f"Skipping (already exists): {validated_path}")
                self.base_fetcher.stats["skipped"] += 1
                return False

            try:
                # Fetch content
                if self.use_js:
                    html_content = await self.fetch_with_js(url)
                else:
                    if session is None:
                        raise RuntimeError("Session is required for non-JS fetching")
                    html_content = await self.fetch_without_js(session, url)

                content = self._html_to_markdown(html_content, url)

                # Save content
                ensure_dir(validated_path.parent)
                await asyncio.to_thread(validated_path.write_text, content, encoding="utf-8")

                self.logger.info(f"Saved: {validated_path}")
                self.base_fetcher.stats["fetched"] += 1
                succeeded = True
            except Exception as e:
                self.logger.error(f"Error fetching {url}: {e}")
                self.base_fetcher.stats["errors"] += 1
                succeeded = False

            # Rate limiting: applied after every attempt, failed ones
            # included, so a struggling server is not hit without pause.
            if self.rate_limit_delay > 0:
                await asyncio.sleep(self.rate_limit_delay)

            return succeeded

    async def _gather_fetches(
        self,
        session: Optional["aiohttp.ClientSession"],
        url_output_pairs: list[tuple[str, Path]],
    ) -> None:
        """Run fetch_url for every pair and report exceptions that escaped it."""
        results = await asyncio.gather(
            *(self.fetch_url(session, url, output_path) for url, output_path in url_output_pairs),
            return_exceptions=True,
        )
        for (url, _), result in zip(url_output_pairs, results):
            if isinstance(result, BaseException):
                # fetch_url handles ordinary errors itself; anything that
                # reaches this point would otherwise vanish silently.
                self.logger.error(f"Unhandled error fetching {url}: {result!r}")
                self.base_fetcher.stats["errors"] += 1

    async def fetch_urls_parallel(
        self,
        url_output_pairs: list[tuple[str, Path]],
    ) -> None:
        """
        Fetch multiple URLs in parallel.

        Args:
            url_output_pairs: List of (url, output_path) tuples
        """
        pairs = list(url_output_pairs)
        if not pairs:
            return

        if self.use_js:
            # JS mode - use browser, no session needed
            await self._gather_fetches(None, pairs)
        else:
            # Non-JS mode - use aiohttp session
            async with aiohttp.ClientSession() as session:
                await self._gather_fetches(session, pairs)
|