iflow-mcp_anton-prosterity-documentation-search-enhanced 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. documentation_search_enhanced/__init__.py +14 -0
  2. documentation_search_enhanced/__main__.py +6 -0
  3. documentation_search_enhanced/config.json +1674 -0
  4. documentation_search_enhanced/config_manager.py +233 -0
  5. documentation_search_enhanced/config_validator.py +79 -0
  6. documentation_search_enhanced/content_enhancer.py +578 -0
  7. documentation_search_enhanced/docker_manager.py +87 -0
  8. documentation_search_enhanced/logger.py +179 -0
  9. documentation_search_enhanced/main.py +2170 -0
  10. documentation_search_enhanced/project_generator.py +260 -0
  11. documentation_search_enhanced/project_scanner.py +85 -0
  12. documentation_search_enhanced/reranker.py +230 -0
  13. documentation_search_enhanced/site_index_builder.py +274 -0
  14. documentation_search_enhanced/site_index_downloader.py +222 -0
  15. documentation_search_enhanced/site_search.py +1325 -0
  16. documentation_search_enhanced/smart_search.py +473 -0
  17. documentation_search_enhanced/snyk_integration.py +657 -0
  18. documentation_search_enhanced/vector_search.py +303 -0
  19. documentation_search_enhanced/version_resolver.py +189 -0
  20. documentation_search_enhanced/vulnerability_scanner.py +545 -0
  21. documentation_search_enhanced/web_scraper.py +117 -0
  22. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/METADATA +195 -0
  23. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/RECORD +26 -0
  24. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/WHEEL +4 -0
  25. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/entry_points.txt +2 -0
  26. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/licenses/LICENSE +21 -0
documentation_search_enhanced/vulnerability_scanner.py
@@ -0,0 +1,545 @@
+ """
+ Vulnerability scanner for documentation-search-enhanced MCP server.
+ Integrates with OSINT sources to check library security vulnerabilities.
+ """
+
+ import asyncio
+ import sys
+ from dataclasses import dataclass
+ from datetime import datetime, timedelta
+ from enum import Enum
+ from typing import Any, Dict, List, Optional
+
+ import httpx
+
+
+ class SeverityLevel(Enum):
+     """Vulnerability severity levels"""
+
+     CRITICAL = "critical"
+     HIGH = "high"
+     MEDIUM = "medium"
+     LOW = "low"
+     INFO = "info"
+
+
+ @dataclass
+ class Vulnerability:
+     """Represents a security vulnerability"""
+
+     id: str
+     title: str
+     description: str
+     severity: SeverityLevel
+     cvss_score: Optional[float]
+     cve_id: Optional[str]
+     affected_versions: List[str]
+     fixed_version: Optional[str]
+     published_date: str
+     source: str  # "osv", "github", "safety", "snyk"
+     references: List[str]
+
+     def to_dict(self) -> Dict[str, Any]:
+         return {
+             "id": self.id,
+             "title": self.title,
+             "description": (
+                 self.description[:200] + "..."
+                 if len(self.description) > 200
+                 else self.description
+             ),
+             "severity": self.severity.value,
+             "cvss_score": self.cvss_score,
+             "cve_id": self.cve_id,
+             "affected_versions": self.affected_versions,
+             "fixed_version": self.fixed_version,
+             "published_date": self.published_date,
+             "source": self.source,
+             "references": self.references[:3],  # Limit references
+         }
+
+
+ @dataclass
+ class SecurityReport:
+     """Comprehensive security report for a library"""
+
+     library_name: str
+     ecosystem: str  # "pypi", "npm", "maven", etc.
+     scan_date: str
+     total_vulnerabilities: int
+     critical_count: int
+     high_count: int
+     medium_count: int
+     low_count: int
+     security_score: float  # 0-100, higher is better
+     recommendations: List[str]
+     vulnerabilities: List[Vulnerability]
+     latest_secure_version: Optional[str]
+
+     def to_dict(self) -> Dict[str, Any]:
+         return {
+             "library_name": self.library_name,
+             "ecosystem": self.ecosystem,
+             "scan_date": self.scan_date,
+             "summary": {
+                 "total_vulnerabilities": self.total_vulnerabilities,
+                 "critical": self.critical_count,
+                 "high": self.high_count,
+                 "medium": self.medium_count,
+                 "low": self.low_count,
+                 "security_score": self.security_score,
+             },
+             "latest_secure_version": self.latest_secure_version,
+             "recommendations": self.recommendations,
+             "vulnerabilities": [vuln.to_dict() for vuln in self.vulnerabilities],
+         }
+
+
+ class VulnerabilityScanner:
+     """Main vulnerability scanner class"""
+
+     def __init__(self):
+         self.cache = {}
+         self.cache_ttl = timedelta(hours=6)  # Cache for 6 hours
+         self.timeout = httpx.Timeout(30.0)
+
+         # API endpoints
+         self.osv_api = "https://api.osv.dev"
+         self.github_api = "https://api.github.com"
+         self.cve_api = "https://cve.circl.lu/api"
+
+     async def scan_library(
+         self, library_name: str, ecosystem: str = "PyPI"
+     ) -> SecurityReport:
+         """
+         Comprehensive vulnerability scan for a library
+
+         Args:
+             library_name: Name of the library (e.g., "fastapi", "react")
+             ecosystem: Package ecosystem ("PyPI", "npm", "Maven", etc.)
+
+         Returns:
+             SecurityReport with vulnerability details
+         """
+         cache_key = f"{library_name}_{ecosystem}"
+
+         # Check cache first
+         if self._is_cached(cache_key):
+             return self.cache[cache_key]["data"]
+
+         vulnerabilities = []
+
+         # Scan multiple sources in parallel
+         scan_tasks = [
+             self._scan_osv(library_name, ecosystem),
+             self._scan_github_advisories(library_name, ecosystem),
+             (
+                 self._scan_safety_db(library_name)
+                 if ecosystem.lower() == "pypi"
+                 else self._empty_scan()
+             ),
+         ]
+
+         try:
+             results = await asyncio.gather(*scan_tasks, return_exceptions=True)
+
+             for result in results:
+                 if isinstance(result, list):
+                     vulnerabilities.extend(result)
+                 elif isinstance(result, Exception):
+                     print(f"Scan error: {result}", file=sys.stderr)
+
+         except Exception as e:
+             print(f"Vulnerability scan failed for {library_name}: {e}", file=sys.stderr)
+
+         # Generate security report
+         report = self._generate_security_report(
+             library_name, ecosystem, vulnerabilities
+         )
+
+         # Cache the result
+         self._cache_result(cache_key, report)
+
+         return report
+
+     async def _scan_osv(self, library_name: str, ecosystem: str) -> List[Vulnerability]:
+         """Scan OSV (Open Source Vulnerabilities) database"""
+         vulnerabilities = []
+
+         try:
+             async with httpx.AsyncClient(timeout=self.timeout) as client:
+                 # OSV API query
+                 query_data = {"package": {"name": library_name, "ecosystem": ecosystem}}
+
+                 response = await client.post(
+                     f"{self.osv_api}/v1/query", json=query_data
+                 )
+
+                 if response.status_code == 200:
+                     data = response.json()
+
+                     for vuln_data in data.get("vulns", []):
+                         vulnerability = self._parse_osv_vulnerability(vuln_data)
+                         if vulnerability:
+                             vulnerabilities.append(vulnerability)
+
+         except Exception as e:
+             print(f"OSV scan error for {library_name}: {e}", file=sys.stderr)
+
+         return vulnerabilities
+
+     async def _scan_github_advisories(
+         self, library_name: str, ecosystem: str
+     ) -> List[Vulnerability]:
+         """Scan GitHub Security Advisories"""
+         vulnerabilities = []
+
+         try:
+             async with httpx.AsyncClient(timeout=self.timeout) as client:
+                 # GitHub GraphQL API would be more comprehensive, but REST API is simpler
+                 search_query = f"type:security-advisories {library_name}"
+
+                 response = await client.get(
+                     f"{self.github_api}/search/repositories",
+                     params={"q": search_query, "per_page": 10},
+                     headers={"Accept": "application/vnd.github+json"},
+                 )
+
+                 if response.status_code == 200:
+                     data = response.json()
+
+                     # This is a simplified implementation
+                     # In production, you'd use GitHub's Security Advisory API
+                     for item in data.get("items", []):
+                         if library_name.lower() in item.get("full_name", "").lower():
+                             vuln = Vulnerability(
+                                 id=f"GHSA-{item['id']}",
+                                 title=f"GitHub Advisory for {library_name}",
+                                 # Repos may have a null description; fall back so
+                                 # Vulnerability.description stays a str.
+                                 description=item.get("description")
+                                 or "Security advisory found",
+                                 severity=SeverityLevel.MEDIUM,  # Default severity
+                                 cvss_score=None,
+                                 cve_id=None,
+                                 affected_versions=["unknown"],
+                                 fixed_version=None,
+                                 published_date=item.get("created_at", ""),
+                                 source="github",
+                                 references=[item.get("html_url", "")],
+                             )
+                             vulnerabilities.append(vuln)
+
+         except Exception as e:
+             print(
+                 f"GitHub Advisory scan error for {library_name}: {e}", file=sys.stderr
+             )
+
+         return vulnerabilities
+
+     async def _scan_safety_db(self, library_name: str) -> List[Vulnerability]:
+         """Scan Python Safety Database (for PyPI packages)"""
+         vulnerabilities = []
+
+         try:
+             # Using Safety CLI database approach
+             # In a real implementation, you might use their API or local database
+             async with httpx.AsyncClient(timeout=self.timeout) as client:
+                 # PyPA Safety Database (simplified example)
+                 response = await client.get(
+                     f"https://pypi.org/pypi/{library_name}/json"
+                 )
+
+                 if response.status_code == 200:
+                     data = response.json()
+
+                     # Check for known vulnerable versions
+                     # This is a placeholder - real implementation would check Safety DB
+                     info = data.get("info", {})
+                     # PyPI can return a null description; coalesce before .lower()
+                     if "security" in (info.get("description") or "").lower():
+                         vuln = Vulnerability(
+                             id=f"PYSA-{library_name}",
+                             title=f"Potential security issue in {library_name}",
+                             description="Security-related keywords found in package description",
+                             severity=SeverityLevel.INFO,
+                             cvss_score=None,
+                             cve_id=None,
+                             affected_versions=["unknown"],
+                             fixed_version=None,
+                             published_date=datetime.now().isoformat(),
+                             source="safety",
+                             references=[f"https://pypi.org/project/{library_name}/"],
+                         )
+                         vulnerabilities.append(vuln)
+
+         except Exception as e:
+             print(f"Safety DB scan error for {library_name}: {e}", file=sys.stderr)
+
+         return vulnerabilities
+
+     async def _empty_scan(self) -> List[Vulnerability]:
+         """Empty scan for unsupported ecosystems"""
+         return []
+
+     def _parse_osv_vulnerability(
+         self, vuln_data: Dict[str, Any]
+     ) -> Optional[Vulnerability]:
+         """Parse OSV vulnerability data"""
+         try:
+             # Extract severity
+             severity = SeverityLevel.MEDIUM  # Default
+             cvss_score = None
+
+             if "severity" in vuln_data:
+                 severity_info = vuln_data["severity"]
+                 if isinstance(severity_info, list) and severity_info:
+                     severity_data = severity_info[0]
+                     score = severity_data.get("score")
+                     # OSV usually reports "score" as a CVSS vector string
+                     # (e.g. "CVSS:3.1/AV:N/..."), not a number, so guard the
+                     # conversion instead of letting it abort the whole parse.
+                     if score:
+                         try:
+                             cvss_score = float(score)
+                         except (TypeError, ValueError):
+                             cvss_score = None
+                     if cvss_score is not None:
+                         if cvss_score >= 9.0:
+                             severity = SeverityLevel.CRITICAL
+                         elif cvss_score >= 7.0:
+                             severity = SeverityLevel.HIGH
+                         elif cvss_score >= 4.0:
+                             severity = SeverityLevel.MEDIUM
+                         else:
+                             severity = SeverityLevel.LOW
+
+             # Extract affected versions
+             affected_versions = []
+             for affected in vuln_data.get("affected", []):
+                 ranges = affected.get("ranges", [])
+                 for range_info in ranges:
+                     events = range_info.get("events", [])
+                     for event in events:
+                         if "introduced" in event:
+                             affected_versions.append(f">={event['introduced']}")
+                         elif "fixed" in event:
+                             affected_versions.append(f"<{event['fixed']}")
+
+             # Extract references
+             references = []
+             for ref in vuln_data.get("references", []):
+                 if "url" in ref:
+                     references.append(ref["url"])
+
+             return Vulnerability(
+                 id=vuln_data.get("id", ""),
+                 title=vuln_data.get("summary", ""),
+                 description=vuln_data.get("details", ""),
+                 severity=severity,
+                 cvss_score=cvss_score,
+                 cve_id=self._extract_cve_id(vuln_data),
+                 affected_versions=affected_versions,
+                 fixed_version=self._extract_fixed_version(vuln_data),
+                 published_date=vuln_data.get("published", ""),
+                 source="osv",
+                 references=references,
+             )
+
+         except Exception as e:
+             print(f"Error parsing OSV vulnerability: {e}", file=sys.stderr)
+             return None
+
+     def _extract_cve_id(self, vuln_data: Dict[str, Any]) -> Optional[str]:
+         """Extract CVE ID from vulnerability data"""
+         aliases = vuln_data.get("aliases", [])
+         for alias in aliases:
+             if alias.startswith("CVE-"):
+                 return alias
+         return None
+
+     def _extract_fixed_version(self, vuln_data: Dict[str, Any]) -> Optional[str]:
+         """Extract fixed version from vulnerability data"""
+         for affected in vuln_data.get("affected", []):
+             ranges = affected.get("ranges", [])
+             for range_info in ranges:
+                 events = range_info.get("events", [])
+                 for event in events:
+                     if "fixed" in event:
+                         return event["fixed"]
+         return None
+
+     def _generate_security_report(
+         self, library_name: str, ecosystem: str, vulnerabilities: List[Vulnerability]
+     ) -> SecurityReport:
+         """Generate comprehensive security report"""
+
+         # Count vulnerabilities by severity
+         critical_count = sum(
+             1 for v in vulnerabilities if v.severity == SeverityLevel.CRITICAL
+         )
+         high_count = sum(1 for v in vulnerabilities if v.severity == SeverityLevel.HIGH)
+         medium_count = sum(
+             1 for v in vulnerabilities if v.severity == SeverityLevel.MEDIUM
+         )
+         low_count = sum(1 for v in vulnerabilities if v.severity == SeverityLevel.LOW)
+
+         # Calculate security score (0-100, higher is better)
+         security_score = self._calculate_security_score(
+             critical_count, high_count, medium_count, low_count
+         )
+
+         # Generate recommendations
+         recommendations = self._generate_recommendations(
+             library_name, vulnerabilities, security_score
+         )
+
+         # Find latest secure version (placeholder)
+         latest_secure_version = self._find_latest_secure_version(vulnerabilities)
+
+         return SecurityReport(
+             library_name=library_name,
+             ecosystem=ecosystem,
+             scan_date=datetime.now().isoformat(),
+             total_vulnerabilities=len(vulnerabilities),
+             critical_count=critical_count,
+             high_count=high_count,
+             medium_count=medium_count,
+             low_count=low_count,
+             security_score=security_score,
+             recommendations=recommendations,
+             vulnerabilities=vulnerabilities[:10],  # Limit to top 10
+             latest_secure_version=latest_secure_version,
+         )
+
+     def _calculate_security_score(
+         self, critical: int, high: int, medium: int, low: int
+     ) -> float:
+         """Calculate security score based on vulnerability counts"""
+         # Start with perfect score
+         score = 100.0
+
+         # Deduct points based on severity
+         score -= critical * 25  # Critical: -25 points each
+         score -= high * 15  # High: -15 points each
+         score -= medium * 5  # Medium: -5 points each
+         score -= low * 1  # Low: -1 point each
+
+         # Ensure score doesn't go below 0
+         return max(0.0, score)
+
+     def _generate_recommendations(
+         self,
+         library_name: str,
+         vulnerabilities: List[Vulnerability],
+         security_score: float,
+     ) -> List[str]:
+         """Generate security recommendations"""
+         recommendations = []
+
+         if security_score < 50:
+             recommendations.append(
+                 "🚨 High security risk - Consider alternative libraries"
+             )
+         elif security_score < 70:
+             recommendations.append("⚠️ Moderate security risk - Monitor for updates")
+         elif security_score < 90:
+             recommendations.append("✅ Generally secure - Keep updated")
+         else:
+             recommendations.append("🛡️ Excellent security record")
+
+         # Specific recommendations based on vulnerabilities
+         critical_vulns = [
+             v for v in vulnerabilities if v.severity == SeverityLevel.CRITICAL
+         ]
+         if critical_vulns:
+             recommendations.append(
+                 "🔥 Update immediately - Critical vulnerabilities found"
+             )
+
+         fixed_versions = [v.fixed_version for v in vulnerabilities if v.fixed_version]
+         if fixed_versions:
+             # Note: max() on version strings is lexicographic, not semantic
+             # versioning - the same simplification as _find_latest_secure_version
+             latest_fix = max(fixed_versions)
+             recommendations.append(f"📦 Update to version {latest_fix} or later")
+
+         if len(vulnerabilities) > 5:
+             recommendations.append(
+                 "📊 Many vulnerabilities found - Consider security audit"
+             )
+
+         return recommendations[:5]  # Limit recommendations
+
+     def _find_latest_secure_version(
+         self, vulnerabilities: List[Vulnerability]
+     ) -> Optional[str]:
+         """Find the latest secure version"""
+         fixed_versions = [v.fixed_version for v in vulnerabilities if v.fixed_version]
+         if fixed_versions:
+             # This is simplified - real implementation would use proper version comparison
+             return max(fixed_versions)
+         return None
+
+     def _is_cached(self, cache_key: str) -> bool:
+         """Check if result is cached and still valid"""
+         if cache_key not in self.cache:
+             return False
+
+         cached_time = self.cache[cache_key]["timestamp"]
+         return datetime.now() - cached_time < self.cache_ttl
+
+     def _cache_result(self, cache_key: str, result: SecurityReport) -> None:
+         """Cache scan result"""
+         self.cache[cache_key] = {"data": result, "timestamp": datetime.now()}
+
+         # Simple cache cleanup - remove old entries
+         if len(self.cache) > 100:
+             oldest_key = min(
+                 self.cache.keys(), key=lambda k: self.cache[k]["timestamp"]
+             )
+             del self.cache[oldest_key]
+
+
+ class SecurityIntegration:
+     """Integration layer for security features"""
+
+     def __init__(self, scanner: VulnerabilityScanner):
+         self.scanner = scanner
+
+     async def get_security_score(
+         self, library_name: str, ecosystem: str = "PyPI"
+     ) -> float:
+         """Get security score for a library (0-100, higher is better)"""
+         try:
+             report = await self.scanner.scan_library(library_name, ecosystem)
+             return report.security_score
+         except Exception:
+             return 50.0  # Default neutral score
+
+     async def is_library_secure(
+         self, library_name: str, ecosystem: str = "PyPI", threshold: float = 70.0
+     ) -> bool:
+         """Check if library meets security threshold"""
+         score = await self.get_security_score(library_name, ecosystem)
+         return score >= threshold
+
+     async def get_security_summary(
+         self, library_name: str, ecosystem: str = "PyPI"
+     ) -> Dict[str, Any]:
+         """Get concise security summary"""
+         try:
+             report = await self.scanner.scan_library(library_name, ecosystem)
+             return {
+                 "library": library_name,
+                 "security_score": report.security_score,
+                 "total_vulnerabilities": report.total_vulnerabilities,
+                 "critical_vulnerabilities": report.critical_count,
+                 "status": "secure" if report.security_score >= 70 else "at_risk",
+                 "primary_recommendation": (
+                     report.recommendations[0]
+                     if report.recommendations
+                     else "No specific recommendations"
+                 ),
+             }
+         except Exception as e:
+             return {
+                 "library": library_name,
+                 "security_score": 50.0,
+                 "error": str(e),
+                 "status": "unknown",
+             }
+
+
+ # Global instances
+ vulnerability_scanner = VulnerabilityScanner()
+ security_integration = SecurityIntegration(vulnerability_scanner)
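
For orientation, here is a minimal usage sketch of the module-level instances exported above; the library name "fastapi" is just the docstring's own example. The arithmetic mirrors _calculate_security_score: starting from 100, each critical finding costs 25 points, each high 15, each medium 5, and each low 1, so one critical plus two medium findings would score 100 - 25 - 2 * 5 = 65, below the default 70.0 threshold of is_library_secure.

import asyncio

from documentation_search_enhanced.vulnerability_scanner import (
    security_integration,
    vulnerability_scanner,
)


async def main() -> None:
    # Full report: severity counts, 0-100 score, capped recommendation list.
    report = await vulnerability_scanner.scan_library("fastapi", ecosystem="PyPI")
    print(report.to_dict()["summary"])

    # Gate on the default 70.0 threshold, then fetch the short summary.
    if not await security_integration.is_library_secure("fastapi"):
        print(await security_integration.get_security_summary("fastapi"))


asyncio.run(main())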
documentation_search_enhanced/web_scraper.py
@@ -0,0 +1,117 @@
+ #!/usr/bin/env python3
+ """
+ An advanced web scraper using Playwright to handle dynamic, JS-heavy sites.
+ """
+
+ import sys
+ from typing import Optional
+
+ from bs4 import BeautifulSoup
+ import httpx
+ from playwright.async_api import Browser, async_playwright
+
+
+ class PlaywrightScraper:
+     """A web scraper that uses a real browser to render pages."""
+
+     _browser: Optional[Browser] = None
+     _playwright = None
+     _disabled_reason: Optional[str] = None
+
+     async def _get_browser(self) -> Browser:
+         """Initializes and returns a shared browser instance."""
+         if self._disabled_reason:
+             raise RuntimeError(self._disabled_reason)
+         if self._browser is None or not self._browser.is_connected():
+             try:
+                 self._playwright = await async_playwright().start()
+                 self._browser = await self._playwright.chromium.launch()
+             except Exception as e:
+                 self._disabled_reason = f"Playwright disabled: {e}"
+                 if self._playwright:
+                     try:
+                         await self._playwright.stop()
+                     except Exception:
+                         pass
+                 self._playwright = None
+                 self._browser = None
+                 raise
+         return self._browser
+
+     async def scrape_url(self, url: str) -> str:
+         """
+         Scrapes a URL using Playwright, returning the clean, readable text content.
+
+         This method can handle dynamic content, as it waits for the page
+         to fully load and can execute scripts if needed.
+         """
+         page = None
+
+         try:
+             if self._disabled_reason:
+                 return await self._scrape_url_fallback(url)
+
+             browser = await self._get_browser()
+             page = await browser.new_page()
+
+             # Navigate to the page and wait for it to be fully loaded
+             await page.goto(url, wait_until="networkidle", timeout=60000)
+
+             # Scroll to the bottom to trigger lazy-loaded content
+             await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+             await page.wait_for_timeout(1000)  # Wait for any new content to load
+
+             # Get the page content after JavaScript has rendered
+             html_content = await page.content()
+
+             # Use BeautifulSoup to parse and clean the final HTML
+             soup = BeautifulSoup(html_content, "html.parser")
+
+             # Remove non-content elements
+             for element in soup(
+                 ["script", "style", "nav", "footer", "header", "aside"]
+             ):
+                 element.decompose()
+
+             # Get clean text
+             text = soup.get_text(separator=" ", strip=True)
+             return text
+
+         except Exception as e:
+             print(f"Failed to scrape {url}: {e}", file=sys.stderr)
+             return await self._scrape_url_fallback(url)
+         finally:
+             if page is not None:
+                 await page.close()
+
+     async def _scrape_url_fallback(self, url: str) -> str:
+         """Fallback fetcher when Playwright cannot launch (e.g., sandboxed environments)."""
+         headers = {"User-Agent": "docs-app/1.0"}
+         try:
+             async with httpx.AsyncClient(
+                 timeout=httpx.Timeout(15.0, read=30.0),
+                 follow_redirects=True,
+                 headers=headers,
+             ) as client:
+                 response = await client.get(url)
+                 response.raise_for_status()
+
+                 soup = BeautifulSoup(response.text, "html.parser")
+                 for element in soup(
+                     ["script", "style", "nav", "footer", "header", "aside"]
+                 ):
+                     element.decompose()
+                 return soup.get_text(separator=" ", strip=True)
+         except Exception as e:
+             print(f"Fallback fetch failed for {url}: {e}", file=sys.stderr)
+             return f"Error: Could not retrieve content from {url}."
+
+     async def close(self):
+         """Closes the browser instance."""
+         if self._browser and self._browser.is_connected():
+             await self._browser.close()
+         if self._playwright:
+             await self._playwright.stop()
+
+
+ scraper = PlaywrightScraper()
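
As with the scanner, the module exports a shared scraper instance. A minimal sketch of the intended call pattern follows (the URL is illustrative): scrape_url renders the page in headless Chromium and transparently falls back to a plain httpx fetch when Playwright cannot launch, and close() releases the shared browser when the caller is done.

import asyncio

from documentation_search_enhanced.web_scraper import scraper


async def main() -> None:
    # Returns the page's readable text with scripts, nav, and chrome stripped.
    text = await scraper.scrape_url("https://fastapi.tiangolo.com/")
    print(text[:500])

    # Shut down the shared Chromium instance and the Playwright driver.
    await scraper.close()


asyncio.run(main())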