iflow-mcp_anton-prosterity-documentation-search-enhanced 1.9.0 (py3-none-any.whl)
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- documentation_search_enhanced/__init__.py +14 -0
- documentation_search_enhanced/__main__.py +6 -0
- documentation_search_enhanced/config.json +1674 -0
- documentation_search_enhanced/config_manager.py +233 -0
- documentation_search_enhanced/config_validator.py +79 -0
- documentation_search_enhanced/content_enhancer.py +578 -0
- documentation_search_enhanced/docker_manager.py +87 -0
- documentation_search_enhanced/logger.py +179 -0
- documentation_search_enhanced/main.py +2170 -0
- documentation_search_enhanced/project_generator.py +260 -0
- documentation_search_enhanced/project_scanner.py +85 -0
- documentation_search_enhanced/reranker.py +230 -0
- documentation_search_enhanced/site_index_builder.py +274 -0
- documentation_search_enhanced/site_index_downloader.py +222 -0
- documentation_search_enhanced/site_search.py +1325 -0
- documentation_search_enhanced/smart_search.py +473 -0
- documentation_search_enhanced/snyk_integration.py +657 -0
- documentation_search_enhanced/vector_search.py +303 -0
- documentation_search_enhanced/version_resolver.py +189 -0
- documentation_search_enhanced/vulnerability_scanner.py +545 -0
- documentation_search_enhanced/web_scraper.py +117 -0
- iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/METADATA +195 -0
- iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/RECORD +26 -0
- iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/WHEEL +4 -0
- iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/entry_points.txt +2 -0
- iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/licenses/LICENSE +21 -0
documentation_search_enhanced/vulnerability_scanner.py
@@ -0,0 +1,545 @@
"""
Vulnerability scanner for documentation-search-enhanced MCP server.
Integrates with OSINT sources to check library security vulnerabilities.
"""

import asyncio
import sys
from dataclasses import dataclass
from datetime import datetime, timedelta
from enum import Enum
from typing import Any, Dict, List, Optional

import httpx


class SeverityLevel(Enum):
    """Vulnerability severity levels"""

    CRITICAL = "critical"
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
    INFO = "info"


@dataclass
class Vulnerability:
    """Represents a security vulnerability"""

    id: str
    title: str
    description: str
    severity: SeverityLevel
    cvss_score: Optional[float]
    cve_id: Optional[str]
    affected_versions: List[str]
    fixed_version: Optional[str]
    published_date: str
    source: str  # "osv", "github", "safety", "snyk"
    references: List[str]

    def to_dict(self) -> Dict[str, Any]:
        return {
            "id": self.id,
            "title": self.title,
            "description": (
                self.description[:200] + "..."
                if len(self.description) > 200
                else self.description
            ),
            "severity": self.severity.value,
            "cvss_score": self.cvss_score,
            "cve_id": self.cve_id,
            "affected_versions": self.affected_versions,
            "fixed_version": self.fixed_version,
            "published_date": self.published_date,
            "source": self.source,
            "references": self.references[:3],  # Limit references
        }


@dataclass
class SecurityReport:
    """Comprehensive security report for a library"""

    library_name: str
    ecosystem: str  # "pypi", "npm", "maven", etc.
    scan_date: str
    total_vulnerabilities: int
    critical_count: int
    high_count: int
    medium_count: int
    low_count: int
    security_score: float  # 0-100, higher is better
    recommendations: List[str]
    vulnerabilities: List[Vulnerability]
    latest_secure_version: Optional[str]

    def to_dict(self) -> Dict[str, Any]:
        return {
            "library_name": self.library_name,
            "ecosystem": self.ecosystem,
            "scan_date": self.scan_date,
            "summary": {
                "total_vulnerabilities": self.total_vulnerabilities,
                "critical": self.critical_count,
                "high": self.high_count,
                "medium": self.medium_count,
                "low": self.low_count,
                "security_score": self.security_score,
            },
            "latest_secure_version": self.latest_secure_version,
            "recommendations": self.recommendations,
            "vulnerabilities": [vuln.to_dict() for vuln in self.vulnerabilities],
        }


class VulnerabilityScanner:
    """Main vulnerability scanner class"""

    def __init__(self):
        self.cache = {}
        self.cache_ttl = timedelta(hours=6)  # Cache for 6 hours
        self.timeout = httpx.Timeout(30.0)

        # API endpoints
        self.osv_api = "https://api.osv.dev"
        self.github_api = "https://api.github.com"
        self.cve_api = "https://cve.circl.lu/api"

    async def scan_library(
        self, library_name: str, ecosystem: str = "PyPI"
    ) -> SecurityReport:
        """
        Comprehensive vulnerability scan for a library

        Args:
            library_name: Name of the library (e.g., "fastapi", "react")
            ecosystem: Package ecosystem ("PyPI", "npm", "Maven", etc.)

        Returns:
            SecurityReport with vulnerability details
        """
        cache_key = f"{library_name}_{ecosystem}"

        # Check cache first
        if self._is_cached(cache_key):
            return self.cache[cache_key]["data"]

        vulnerabilities = []

        # Scan multiple sources in parallel
        scan_tasks = [
            self._scan_osv(library_name, ecosystem),
            self._scan_github_advisories(library_name, ecosystem),
            (
                self._scan_safety_db(library_name)
                if ecosystem.lower() == "pypi"
                else self._empty_scan()
            ),
        ]

        try:
            results = await asyncio.gather(*scan_tasks, return_exceptions=True)

            for result in results:
                if isinstance(result, list):
                    vulnerabilities.extend(result)
                elif isinstance(result, Exception):
                    print(f"Scan error: {result}", file=sys.stderr)

        except Exception as e:
            print(f"Vulnerability scan failed for {library_name}: {e}", file=sys.stderr)

        # Generate security report
        report = self._generate_security_report(
            library_name, ecosystem, vulnerabilities
        )

        # Cache the result
        self._cache_result(cache_key, report)

        return report

    async def _scan_osv(self, library_name: str, ecosystem: str) -> List[Vulnerability]:
        """Scan OSV (Open Source Vulnerabilities) database"""
        vulnerabilities = []

        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                # OSV API query
                query_data = {"package": {"name": library_name, "ecosystem": ecosystem}}

                response = await client.post(
                    f"{self.osv_api}/v1/query", json=query_data
                )

                if response.status_code == 200:
                    data = response.json()

                    for vuln_data in data.get("vulns", []):
                        vulnerability = self._parse_osv_vulnerability(vuln_data)
                        if vulnerability:
                            vulnerabilities.append(vulnerability)

        except Exception as e:
            print(f"OSV scan error for {library_name}: {e}", file=sys.stderr)

        return vulnerabilities

    async def _scan_github_advisories(
        self, library_name: str, ecosystem: str
    ) -> List[Vulnerability]:
        """Scan GitHub Security Advisories"""
        vulnerabilities = []

        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                # GitHub GraphQL API would be more comprehensive, but REST API is simpler
                search_query = f"type:security-advisories {library_name}"

                response = await client.get(
                    f"{self.github_api}/search/repositories",
                    params={"q": search_query, "per_page": 10},
                    headers={"Accept": "application/vnd.github+json"},
                )

                if response.status_code == 200:
                    data = response.json()

                    # This is a simplified implementation
                    # In production, you'd use GitHub's Security Advisory API
                    for item in data.get("items", []):
                        if library_name.lower() in item.get("full_name", "").lower():
                            vuln = Vulnerability(
                                id=f"GHSA-{item['id']}",
                                title=f"GitHub Advisory for {library_name}",
                                description=item.get(
                                    "description", "Security advisory found"
                                ),
                                severity=SeverityLevel.MEDIUM,  # Default severity
                                cvss_score=None,
                                cve_id=None,
                                affected_versions=["unknown"],
                                fixed_version=None,
                                published_date=item.get("created_at", ""),
                                source="github",
                                references=[item.get("html_url", "")],
                            )
                            vulnerabilities.append(vuln)

        except Exception as e:
            print(
                f"GitHub Advisory scan error for {library_name}: {e}", file=sys.stderr
            )

        return vulnerabilities

    async def _scan_safety_db(self, library_name: str) -> List[Vulnerability]:
        """Scan Python Safety Database (for PyPI packages)"""
        vulnerabilities = []

        try:
            # Using Safety CLI database approach
            # In a real implementation, you might use their API or local database
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                # PyPA Safety Database (simplified example)
                response = await client.get(
                    f"https://pypi.org/pypi/{library_name}/json"
                )

                if response.status_code == 200:
                    data = response.json()

                    # Check for known vulnerable versions
                    # This is a placeholder - real implementation would check Safety DB
                    info = data.get("info", {})
                    if "security" in info.get("description", "").lower():
                        vuln = Vulnerability(
                            id=f"PYSA-{library_name}",
                            title=f"Potential security issue in {library_name}",
                            description="Security-related keywords found in package description",
                            severity=SeverityLevel.INFO,
                            cvss_score=None,
                            cve_id=None,
                            affected_versions=["unknown"],
                            fixed_version=None,
                            published_date=datetime.now().isoformat(),
                            source="safety",
                            references=[f"https://pypi.org/project/{library_name}/"],
                        )
                        vulnerabilities.append(vuln)

        except Exception as e:
            print(f"Safety DB scan error for {library_name}: {e}", file=sys.stderr)

        return vulnerabilities

    async def _empty_scan(self) -> List[Vulnerability]:
        """Empty scan for unsupported ecosystems"""
        return []

    def _parse_osv_vulnerability(
        self, vuln_data: Dict[str, Any]
    ) -> Optional[Vulnerability]:
        """Parse OSV vulnerability data"""
        try:
            # Extract severity
            severity = SeverityLevel.MEDIUM  # Default
            cvss_score = None

            if "severity" in vuln_data:
                severity_info = vuln_data["severity"]
                if isinstance(severity_info, list) and severity_info:
                    severity_data = severity_info[0]
                    score = severity_data.get("score")
                    if score:
                        cvss_score = float(score)
                        if cvss_score >= 9.0:
                            severity = SeverityLevel.CRITICAL
                        elif cvss_score >= 7.0:
                            severity = SeverityLevel.HIGH
                        elif cvss_score >= 4.0:
                            severity = SeverityLevel.MEDIUM
                        else:
                            severity = SeverityLevel.LOW

            # Extract affected versions
            affected_versions = []
            for affected in vuln_data.get("affected", []):
                ranges = affected.get("ranges", [])
                for range_info in ranges:
                    events = range_info.get("events", [])
                    for event in events:
                        if "introduced" in event:
                            affected_versions.append(f">={event['introduced']}")
                        elif "fixed" in event:
                            affected_versions.append(f"<{event['fixed']}")

            # Extract references
            references = []
            for ref in vuln_data.get("references", []):
                if "url" in ref:
                    references.append(ref["url"])

            return Vulnerability(
                id=vuln_data.get("id", ""),
                title=vuln_data.get("summary", ""),
                description=vuln_data.get("details", ""),
                severity=severity,
                cvss_score=cvss_score,
                cve_id=self._extract_cve_id(vuln_data),
                affected_versions=affected_versions,
                fixed_version=self._extract_fixed_version(vuln_data),
                published_date=vuln_data.get("published", ""),
                source="osv",
                references=references,
            )

        except Exception as e:
            print(f"Error parsing OSV vulnerability: {e}", file=sys.stderr)
            return None

    def _extract_cve_id(self, vuln_data: Dict[str, Any]) -> Optional[str]:
        """Extract CVE ID from vulnerability data"""
        aliases = vuln_data.get("aliases", [])
        for alias in aliases:
            if alias.startswith("CVE-"):
                return alias
        return None

    def _extract_fixed_version(self, vuln_data: Dict[str, Any]) -> Optional[str]:
        """Extract fixed version from vulnerability data"""
        for affected in vuln_data.get("affected", []):
            ranges = affected.get("ranges", [])
            for range_info in ranges:
                events = range_info.get("events", [])
                for event in events:
                    if "fixed" in event:
                        return event["fixed"]
        return None

    def _generate_security_report(
        self, library_name: str, ecosystem: str, vulnerabilities: List[Vulnerability]
    ) -> SecurityReport:
        """Generate comprehensive security report"""

        # Count vulnerabilities by severity
        critical_count = sum(
            1 for v in vulnerabilities if v.severity == SeverityLevel.CRITICAL
        )
        high_count = sum(1 for v in vulnerabilities if v.severity == SeverityLevel.HIGH)
        medium_count = sum(
            1 for v in vulnerabilities if v.severity == SeverityLevel.MEDIUM
        )
        low_count = sum(1 for v in vulnerabilities if v.severity == SeverityLevel.LOW)

        # Calculate security score (0-100, higher is better)
        security_score = self._calculate_security_score(
            critical_count, high_count, medium_count, low_count
        )

        # Generate recommendations
        recommendations = self._generate_recommendations(
            library_name, vulnerabilities, security_score
        )

        # Find latest secure version (placeholder)
        latest_secure_version = self._find_latest_secure_version(vulnerabilities)

        return SecurityReport(
            library_name=library_name,
            ecosystem=ecosystem,
            scan_date=datetime.now().isoformat(),
            total_vulnerabilities=len(vulnerabilities),
            critical_count=critical_count,
            high_count=high_count,
            medium_count=medium_count,
            low_count=low_count,
            security_score=security_score,
            recommendations=recommendations,
            vulnerabilities=vulnerabilities[:10],  # Limit to top 10
            latest_secure_version=latest_secure_version,
        )

    def _calculate_security_score(
        self, critical: int, high: int, medium: int, low: int
    ) -> float:
        """Calculate security score based on vulnerability counts"""
        # Start with perfect score
        score = 100.0

        # Deduct points based on severity
        score -= critical * 25  # Critical: -25 points each
        score -= high * 15  # High: -15 points each
        score -= medium * 5  # Medium: -5 points each
        score -= low * 1  # Low: -1 point each

        # Ensure score doesn't go below 0
        return max(0.0, score)

    def _generate_recommendations(
        self,
        library_name: str,
        vulnerabilities: List[Vulnerability],
        security_score: float,
    ) -> List[str]:
        """Generate security recommendations"""
        recommendations = []

        if security_score < 50:
            recommendations.append(
                "🚨 High security risk - Consider alternative libraries"
            )
        elif security_score < 70:
            recommendations.append("⚠️ Moderate security risk - Monitor for updates")
        elif security_score < 90:
            recommendations.append("✅ Generally secure - Keep updated")
        else:
            recommendations.append("🛡️ Excellent security record")

        # Specific recommendations based on vulnerabilities
        critical_vulns = [
            v for v in vulnerabilities if v.severity == SeverityLevel.CRITICAL
        ]
        if critical_vulns:
            recommendations.append(
                "🔥 Update immediately - Critical vulnerabilities found"
            )

        fixed_versions = [v.fixed_version for v in vulnerabilities if v.fixed_version]
        if fixed_versions:
            latest_fix = max(fixed_versions)
            recommendations.append(f"📦 Update to version {latest_fix} or later")

        if len(vulnerabilities) > 5:
            recommendations.append(
                "📊 Many vulnerabilities found - Consider security audit"
            )

        return recommendations[:5]  # Limit recommendations

    def _find_latest_secure_version(
        self, vulnerabilities: List[Vulnerability]
    ) -> Optional[str]:
        """Find the latest secure version"""
        fixed_versions = [v.fixed_version for v in vulnerabilities if v.fixed_version]
        if fixed_versions:
            # This is simplified - real implementation would use proper version comparison
            return max(fixed_versions)
        return None

    def _is_cached(self, cache_key: str) -> bool:
        """Check if result is cached and still valid"""
        if cache_key not in self.cache:
            return False

        cached_time = self.cache[cache_key]["timestamp"]
        return datetime.now() - cached_time < self.cache_ttl

    def _cache_result(self, cache_key: str, result: SecurityReport) -> None:
        """Cache scan result"""
        self.cache[cache_key] = {"data": result, "timestamp": datetime.now()}

        # Simple cache cleanup - remove old entries
        if len(self.cache) > 100:
            oldest_key = min(
                self.cache.keys(), key=lambda k: self.cache[k]["timestamp"]
            )
            del self.cache[oldest_key]


class SecurityIntegration:
    """Integration layer for security features"""

    def __init__(self, scanner: VulnerabilityScanner):
        self.scanner = scanner

    async def get_security_score(
        self, library_name: str, ecosystem: str = "PyPI"
    ) -> float:
        """Get security score for a library (0-100, higher is better)"""
        try:
            report = await self.scanner.scan_library(library_name, ecosystem)
            return report.security_score
        except Exception:
            return 50.0  # Default neutral score

    async def is_library_secure(
        self, library_name: str, ecosystem: str = "PyPI", threshold: float = 70.0
    ) -> bool:
        """Check if library meets security threshold"""
        score = await self.get_security_score(library_name, ecosystem)
        return score >= threshold

    async def get_security_summary(
        self, library_name: str, ecosystem: str = "PyPI"
    ) -> Dict[str, Any]:
        """Get concise security summary"""
        try:
            report = await self.scanner.scan_library(library_name, ecosystem)
            return {
                "library": library_name,
                "security_score": report.security_score,
                "total_vulnerabilities": report.total_vulnerabilities,
                "critical_vulnerabilities": report.critical_count,
                "status": "secure" if report.security_score >= 70 else "at_risk",
                "primary_recommendation": (
                    report.recommendations[0]
                    if report.recommendations
                    else "No specific recommendations"
                ),
            }
        except Exception as e:
            return {
                "library": library_name,
                "security_score": 50.0,
                "error": str(e),
                "status": "unknown",
            }


# Global instances
vulnerability_scanner = VulnerabilityScanner()
security_integration = SecurityIntegration(vulnerability_scanner)
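For orientation, a minimal usage sketch (an editorial illustration, not part of the diff): it drives the module-level vulnerability_scanner and security_integration instances defined above. The import path is assumed from the package layout in this wheel, and "fastapi" is only an example library name.

import asyncio

from documentation_search_enhanced.vulnerability_scanner import (
    security_integration,
    vulnerability_scanner,
)


async def main() -> None:
    # Full report: the OSV, GitHub, and PyPI probes run concurrently,
    # and the result is cached in-process for 6 hours.
    report = await vulnerability_scanner.scan_library("fastapi", ecosystem="PyPI")
    print(report.to_dict()["summary"])

    # Condensed view built on top of the same scanner.
    summary = await security_integration.get_security_summary("fastapi")
    print(summary["security_score"], summary["status"])


asyncio.run(main())

Note that the score is a fixed deduction model (25/15/5/1 points per critical/high/medium/low finding), so a single critical finding alone drops a library to 75: below the 90 cutoff for an "excellent" rating, but still above the default 70.0 threshold used by is_library_secure.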
documentation_search_enhanced/web_scraper.py
@@ -0,0 +1,117 @@
#!/usr/bin/env python3
"""
An advanced web scraper using Playwright to handle dynamic, JS-heavy sites.
"""

import sys
from typing import Optional

from bs4 import BeautifulSoup
import httpx
from playwright.async_api import Browser, async_playwright


class PlaywrightScraper:
    """A web scraper that uses a real browser to render pages."""

    _browser: Optional[Browser] = None
    _playwright = None
    _disabled_reason: Optional[str] = None

    async def _get_browser(self) -> Browser:
        """Initializes and returns a shared browser instance."""
        if self._disabled_reason:
            raise RuntimeError(self._disabled_reason)
        if self._browser is None or not self._browser.is_connected():
            try:
                self._playwright = await async_playwright().start()
                self._browser = await self._playwright.chromium.launch()
            except Exception as e:
                self._disabled_reason = f"Playwright disabled: {e}"
                if self._playwright:
                    try:
                        await self._playwright.stop()
                    except Exception:
                        pass
                self._playwright = None
                self._browser = None
                raise
        return self._browser

    async def scrape_url(self, url: str) -> str:
        """
        Scrapes a URL using Playwright, returning the clean, readable text content.

        This method can handle dynamic content, as it waits for the page
        to fully load and can execute scripts if needed.
        """
        page = None

        try:
            if self._disabled_reason:
                return await self._scrape_url_fallback(url)

            browser = await self._get_browser()
            page = await browser.new_page()

            # Navigate to the page and wait for it to be fully loaded
            await page.goto(url, wait_until="networkidle", timeout=60000)

            # Scroll to the bottom to trigger lazy-loaded content
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await page.wait_for_timeout(1000)  # Wait for any new content to load

            # Get the page content after JavaScript has rendered
            html_content = await page.content()

            # Use BeautifulSoup to parse and clean the final HTML
            soup = BeautifulSoup(html_content, "html.parser")

            # Remove non-content elements
            for element in soup(
                ["script", "style", "nav", "footer", "header", "aside"]
            ):
                element.decompose()

            # Get clean text
            text = soup.get_text(separator=" ", strip=True)
            return text

        except Exception as e:
            print(f"Failed to scrape {url}: {e}", file=sys.stderr)
            return await self._scrape_url_fallback(url)
        finally:
            if page is not None:
                await page.close()

    async def _scrape_url_fallback(self, url: str) -> str:
        """Fallback fetcher when Playwright cannot launch (e.g., sandboxed environments)."""
        headers = {"User-Agent": "docs-app/1.0"}
        try:
            async with httpx.AsyncClient(
                timeout=httpx.Timeout(15.0, read=30.0),
                follow_redirects=True,
                headers=headers,
            ) as client:
                response = await client.get(url)
                response.raise_for_status()

                soup = BeautifulSoup(response.text, "html.parser")
                for element in soup(
                    ["script", "style", "nav", "footer", "header", "aside"]
                ):
                    element.decompose()
                return soup.get_text(separator=" ", strip=True)
        except Exception as e:
            print(f"Fallback fetch failed for {url}: {e}", file=sys.stderr)
            return f"Error: Could not retrieve content from {url}."

    async def close(self):
        """Closes the browser instance."""
        if self._browser and self._browser.is_connected():
            await self._browser.close()
        if self._playwright:
            await self._playwright.stop()


scraper = PlaywrightScraper()
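Again for orientation, a matching sketch for the scraper global (an editorial illustration; the import path is assumed, and a Chromium build must already be installed for Playwright, e.g. via "playwright install chromium", otherwise the httpx fallback handles the request):

import asyncio

from documentation_search_enhanced.web_scraper import scraper


async def main() -> None:
    # Renders the page in headless Chromium; if the browser cannot launch,
    # scrape_url falls back to a plain httpx GET of the same URL.
    text = await scraper.scrape_url("https://docs.python.org/3/")
    print(text[:300])

    # Release the shared browser and the Playwright driver when done.
    await scraper.close()


asyncio.run(main())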