docpull 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,322 @@
1
+ """Async fetcher with JavaScript rendering support."""
2
+
3
+ import asyncio
4
+ import time
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Any, List, Optional, Tuple
7
+
8
+ import aiohttp
9
+ from bs4 import BeautifulSoup
10
+
11
+ from ..utils.file_utils import ensure_dir, validate_output_path
12
+ from .base import BaseFetcher
13
+
14
+ # Optional Playwright support
15
+ if TYPE_CHECKING:
16
+ from playwright.async_api import Browser, Page, Playwright
17
+
18
+ try:
19
+ from playwright.async_api import async_playwright
20
+
21
+ PLAYWRIGHT_AVAILABLE = True
22
+ except ImportError:
23
+ PLAYWRIGHT_AVAILABLE = False
24
+
25
+
26
class AsyncFetcher:
    """
    Async fetcher with optional JavaScript rendering support.

    Wraps a ``BaseFetcher`` and reuses its logger, URL and content-type
    validation, HTTP headers, HTML-to-Markdown converter (``h2t``), output
    settings and ``stats`` counters, while doing the network I/O with aiohttp
    (plain HTTP) or Playwright (JavaScript rendering).

    Safeguards implemented in this class:
    - URLs are checked with ``base_fetcher.validate_url`` before each fetch
    - Concurrency is capped with a semaphore, followed by a per-request delay
    - Timeouts for both HTTP requests and browser navigation
    - Response size limit for plain HTTP downloads
    - Output paths are checked with ``validate_output_path``

    NOTE(review): the browser is launched with ``--no-sandbox`` and
    ``--disable-web-security``, which weaken Chromium's own isolation. Only
    the top-level URL is validated in JS mode; confirm these flags are really
    required for the target environments.
    """

    MAX_CONTENT_SIZE = 50 * 1024 * 1024  # 50 MB
    MAX_DOWNLOAD_TIME = 300  # 5 minutes
    MAX_JS_RENDER_TIME = 30  # 30 seconds for JS rendering
    MAX_CONCURRENT = 10  # Max concurrent requests
    HTTP_TIMEOUT = 30  # Total seconds allowed for one plain HTTP request
    READ_CHUNK_SIZE = 8192  # Bytes read per chunk while streaming a response

    # Resource types aborted during JS rendering to speed up page loads.
    BLOCKED_RESOURCE_TYPES = frozenset({"image", "font", "media"})

    def __init__(
        self,
        base_fetcher: BaseFetcher,
        max_concurrent: int = 10,
        use_js: bool = False,
        headless: bool = True,
    ) -> None:
        """
        Initialize async fetcher.

        Args:
            base_fetcher: BaseFetcher instance for URL validation and settings
            max_concurrent: Maximum concurrent requests (must be >= 1)
            use_js: Enable JavaScript rendering with Playwright
            headless: Run browser in headless mode

        Raises:
            ValueError: If max_concurrent is smaller than 1.
        """
        # A semaphore created with 0 permits would make every fetch wait forever.
        if max_concurrent < 1:
            raise ValueError(f"max_concurrent must be >= 1, got {max_concurrent}")

        self.base_fetcher = base_fetcher
        self.logger = base_fetcher.logger
        self.max_concurrent = max_concurrent
        self.use_js = use_js
        self.headless = headless

        # Async-safe rate limiting
        # NOTE(review): on Python < 3.10 asyncio primitives bind to the event
        # loop that is current at construction time - confirm this object is
        # always created inside the loop that later runs the fetches.
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.rate_limit_delay = base_fetcher.rate_limit

        # Browser instance (if using JS)
        self.browser: Optional["Browser"] = None
        self.playwright: Optional["Playwright"] = None

        if use_js and not PLAYWRIGHT_AVAILABLE:
            self.logger.warning("Playwright not installed. Install with: pip install docpull[js]")
            self.logger.warning("Falling back to non-JS mode")
            self.use_js = False

    async def __aenter__(self) -> "AsyncFetcher":
        """Async context manager entry: start Playwright and Chromium in JS mode."""
        if self.use_js and PLAYWRIGHT_AVAILABLE:
            self.playwright = await async_playwright().start()
            try:
                self.browser = await self.playwright.chromium.launch(
                    headless=self.headless,
                    args=[
                        "--disable-dev-shm-usage",  # Prevent memory issues
                        # NOTE(review): the next three flags reduce browser
                        # isolation; keep only if the environment needs them.
                        "--no-sandbox",  # Required for some environments
                        "--disable-setuid-sandbox",
                        "--disable-web-security",  # For CORS, but still validate URLs
                    ],
                )
            except BaseException:
                # __aexit__ is not called when __aenter__ raises, so the
                # Playwright driver has to be stopped here to avoid a leak.
                await self.playwright.stop()
                self.playwright = None
                raise
            self.logger.info("Browser launched for JavaScript rendering")
        return self

    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Async context manager exit: close the browser and stop Playwright."""
        browser, playwright = self.browser, self.playwright
        # Drop the handles first so later calls fail with the clear
        # "Browser not initialized" error instead of using a closed browser.
        self.browser = None
        self.playwright = None
        try:
            if browser:
                await browser.close()
        finally:
            # Stop the driver even when closing the browser failed.
            if playwright:
                await playwright.stop()
                self.logger.info("Browser closed")

    async def fetch_with_js(self, url: str) -> str:
        """
        Fetch page content with JavaScript rendering.

        Args:
            url: URL to fetch

        Returns:
            Rendered HTML content

        Raises:
            RuntimeError: If the browser has not been started.
            ValueError: If the URL fails validation.

        Security measures:
        - URL validation before fetch
        - Timeout limits
        - Blocks certain resource types (images, fonts, media) to speed up
        """
        if not self.browser:
            raise RuntimeError("Browser not initialized. Use async context manager.")

        if not self.base_fetcher.validate_url(url):
            raise ValueError(f"Invalid URL: {url}")

        user_agent = self.base_fetcher.session.headers.get("User-Agent")
        if isinstance(user_agent, bytes):
            user_agent = user_agent.decode("utf-8")

        context = await self.browser.new_context(
            user_agent=user_agent,
            viewport={"width": 1920, "height": 1080},
        )

        try:
            page = await context.new_page()

            # Block unnecessary resources to speed up loading
            async def route_handler(route: Any) -> None:
                if route.request.resource_type in self.BLOCKED_RESOURCE_TYPES:
                    await route.abort()
                else:
                    await route.continue_()

            await page.route("**/*", route_handler)

            # Navigate with timeout (Playwright expects milliseconds)
            await page.goto(
                url,
                wait_until="networkidle",
                timeout=self.MAX_JS_RENDER_TIME * 1000,
            )

            # Get rendered HTML
            return await page.content()

        except Exception as e:
            self.logger.error(f"JS rendering error for {url}: {e}")
            raise
        finally:
            # Closing the context also closes every page that belongs to it,
            # so cleanup happens even if page creation itself failed.
            await context.close()

    async def fetch_without_js(self, session: aiohttp.ClientSession, url: str) -> str:
        """
        Fetch page content without JavaScript (faster).

        Args:
            session: aiohttp ClientSession
            url: URL to fetch

        Returns:
            HTML content decoded as UTF-8 (undecodable bytes are dropped)

        Raises:
            ValueError: If the URL or content type is rejected, or the body
                exceeds MAX_CONTENT_SIZE.
        """
        if not self.base_fetcher.validate_url(url):
            raise ValueError(f"Invalid URL: {url}")

        try:
            # NOTE(review): redirects are followed by the HTTP client and the
            # final URL is not re-validated here - confirm that is acceptable.
            async with session.get(
                url,
                timeout=aiohttp.ClientTimeout(total=self.HTTP_TIMEOUT),
                headers=self.base_fetcher.session.headers,
            ) as response:
                response.raise_for_status()

                # Validate content type
                content_type = response.headers.get("Content-Type", "")
                if not self.base_fetcher.validate_content_type(content_type):
                    raise ValueError(f"Invalid content type: {content_type}")

                # Check the declared size before downloading anything
                content_length = response.headers.get("Content-Length")
                if content_length and int(content_length) > self.MAX_CONTENT_SIZE:
                    raise ValueError(f"Content too large: {content_length} bytes")

                # Read with size limit; the header may be absent or wrong, so
                # the limit is enforced again while streaming. A bytearray
                # grows in place instead of copying the body on every chunk.
                body = bytearray()
                async for chunk in response.content.iter_chunked(self.READ_CHUNK_SIZE):
                    body += chunk
                    if len(body) > self.MAX_CONTENT_SIZE:
                        raise ValueError("Content size limit exceeded")

                return body.decode("utf-8", errors="ignore")

        except Exception as e:
            self.logger.error(f"HTTP fetch error for {url}: {e}")
            raise

    def _render_markdown(self, html_content: str, url: str) -> str:
        """Convert fetched HTML into the Markdown document that gets saved."""
        import re  # kept function-scoped, as in the original implementation

        soup = BeautifulSoup(html_content, "html.parser")

        # Remove unwanted elements
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        # Find main content, from most to least specific container
        main_content = (
            soup.find("main")
            or soup.find("article")
            or soup.find(class_=re.compile(r"content|documentation|docs"))
            or soup.find("body")
        )

        if not main_content:
            return f"# Error\n\nCould not find main content for {url}"

        # Convert to markdown and prepend the frontmatter header
        markdown = self.base_fetcher.h2t.handle(str(main_content))
        frontmatter = f"---\nurl: {url}\nfetched: {time.strftime('%Y-%m-%d')}\n---\n\n"
        return frontmatter + markdown.strip()

    async def _fetch_and_save(
        self,
        session: Optional[aiohttp.ClientSession],
        url: str,
        validated_path: Path,
    ) -> bool:
        """Fetch one URL, convert it and write it to an already validated path."""
        try:
            # Fetch content
            if self.use_js:
                html_content = await self.fetch_with_js(url)
            else:
                if session is None:
                    raise RuntimeError("Session is required for non-JS fetching")
                html_content = await self.fetch_without_js(session, url)

            content = self._render_markdown(html_content, url)

            # Save content; the write runs in a worker thread
            ensure_dir(validated_path.parent)
            await asyncio.to_thread(validated_path.write_text, content, encoding="utf-8")

            self.logger.info(f"Saved: {validated_path}")
            self.base_fetcher.stats["fetched"] += 1
            return True

        except Exception as e:
            self.logger.error(f"Error fetching {url}: {e}")
            self.base_fetcher.stats["errors"] += 1
            return False

    async def fetch_url(
        self,
        session: Optional[aiohttp.ClientSession],
        url: str,
        output_path: Path,
    ) -> bool:
        """
        Fetch single URL with rate limiting and save to file.

        Failures are logged and counted in ``base_fetcher.stats`` rather than
        raised.

        Args:
            session: aiohttp session (None if using JS)
            url: URL to fetch
            output_path: Where to save content

        Returns:
            True if successful, False otherwise (including skipped files)
        """
        async with self.semaphore:  # Limit concurrency
            if not self.base_fetcher.validate_url(url):
                self.logger.warning(f"Skipping invalid URL: {url}")
                self.base_fetcher.stats["errors"] += 1
                return False

            try:
                validated_path = validate_output_path(output_path, self.base_fetcher.output_dir)
            except ValueError as e:
                self.logger.error(f"Path validation failed: {e}")
                self.base_fetcher.stats["errors"] += 1
                return False

            # Skip if exists
            if self.base_fetcher.skip_existing and validated_path.exists():
                self.logger.debug(f"Skipping (already exists): {validated_path}")
                self.base_fetcher.stats["skipped"] += 1
                return False

            saved = await self._fetch_and_save(session, url, validated_path)

            # Rate limiting: wait after failed attempts too, so a failing
            # server is not hit again without any delay.
            if self.rate_limit_delay > 0:
                await asyncio.sleep(self.rate_limit_delay)

            return saved

    async def fetch_urls_parallel(
        self,
        url_output_pairs: list[tuple[str, Path]],
    ) -> None:
        """
        Fetch multiple URLs in parallel.

        Concurrency is bounded by the semaphore inside ``fetch_url``;
        per-URL results are not returned.

        Args:
            url_output_pairs: List of (url, output_path) tuples
        """
        if not url_output_pairs:
            # Nothing to do; avoid opening an HTTP session for no work.
            return

        if self.use_js:
            # JS mode - use browser, no session needed
            tasks = [self.fetch_url(None, url, output_path) for url, output_path in url_output_pairs]
            await asyncio.gather(*tasks, return_exceptions=True)
            return

        # Non-JS mode - share one aiohttp session across all requests
        async with aiohttp.ClientSession() as session:
            tasks = [self.fetch_url(session, url, output_path) for url, output_path in url_output_pairs]
            await asyncio.gather(*tasks, return_exceptions=True)