zen-ai-pentest 2.0.0__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- agents/__init__.py +28 -0
- agents/agent_base.py +239 -0
- agents/agent_orchestrator.py +346 -0
- agents/analysis_agent.py +225 -0
- agents/cli.py +258 -0
- agents/exploit_agent.py +224 -0
- agents/integration.py +211 -0
- agents/post_scan_agent.py +937 -0
- agents/react_agent.py +384 -0
- agents/react_agent_enhanced.py +616 -0
- agents/react_agent_vm.py +298 -0
- agents/research_agent.py +176 -0
- api/__init__.py +11 -0
- api/auth.py +123 -0
- api/main.py +1027 -0
- api/schemas.py +357 -0
- api/websocket.py +97 -0
- autonomous/__init__.py +122 -0
- autonomous/agent.py +253 -0
- autonomous/agent_loop.py +1370 -0
- autonomous/exploit_validator.py +1537 -0
- autonomous/memory.py +448 -0
- autonomous/react.py +339 -0
- autonomous/tool_executor.py +488 -0
- backends/__init__.py +16 -0
- backends/chatgpt_direct.py +133 -0
- backends/claude_direct.py +130 -0
- backends/duckduckgo.py +138 -0
- backends/openrouter.py +120 -0
- benchmarks/__init__.py +149 -0
- benchmarks/benchmark_engine.py +904 -0
- benchmarks/ci_benchmark.py +785 -0
- benchmarks/comparison.py +729 -0
- benchmarks/metrics.py +553 -0
- benchmarks/run_benchmarks.py +809 -0
- ci_cd/__init__.py +2 -0
- core/__init__.py +17 -0
- core/async_pool.py +282 -0
- core/asyncio_fix.py +222 -0
- core/cache.py +472 -0
- core/container.py +277 -0
- core/database.py +114 -0
- core/input_validator.py +353 -0
- core/models.py +288 -0
- core/orchestrator.py +611 -0
- core/plugin_manager.py +571 -0
- core/rate_limiter.py +405 -0
- core/secure_config.py +328 -0
- core/shield_integration.py +296 -0
- modules/__init__.py +46 -0
- modules/cve_database.py +362 -0
- modules/exploit_assist.py +330 -0
- modules/nuclei_integration.py +480 -0
- modules/osint.py +604 -0
- modules/protonvpn.py +554 -0
- modules/recon.py +165 -0
- modules/sql_injection_db.py +826 -0
- modules/tool_orchestrator.py +498 -0
- modules/vuln_scanner.py +292 -0
- modules/wordlist_generator.py +566 -0
- risk_engine/__init__.py +99 -0
- risk_engine/business_impact.py +267 -0
- risk_engine/business_impact_calculator.py +563 -0
- risk_engine/cvss.py +156 -0
- risk_engine/epss.py +190 -0
- risk_engine/example_usage.py +294 -0
- risk_engine/false_positive_engine.py +1073 -0
- risk_engine/scorer.py +304 -0
- web_ui/backend/main.py +471 -0
- zen_ai_pentest-2.0.0.dist-info/METADATA +795 -0
- zen_ai_pentest-2.0.0.dist-info/RECORD +75 -0
- zen_ai_pentest-2.0.0.dist-info/WHEEL +5 -0
- zen_ai_pentest-2.0.0.dist-info/entry_points.txt +2 -0
- zen_ai_pentest-2.0.0.dist-info/licenses/LICENSE +21 -0
- zen_ai_pentest-2.0.0.dist-info/top_level.txt +10 -0
benchmarks/metrics.py
ADDED
"""
Zen-AI-Pentest Benchmark Metrics Module

Comprehensive metrics collection for security testing benchmarks.
Provides statistical analysis and performance tracking.
"""

from dataclasses import dataclass, field
from typing import Dict, List, Optional, Any
from datetime import datetime
from enum import Enum
import json
import math
import statistics
from collections import defaultdict


class SeverityLevel(Enum):
    """Severity levels for vulnerabilities."""
    CRITICAL = "critical"
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
    INFO = "info"


class FindingType(Enum):
    """Types of security findings."""
    SQL_INJECTION = "sql_injection"
    XSS = "xss"
    CSRF = "csrf"
    SSRF = "ssrf"
    COMMAND_INJECTION = "command_injection"
    PATH_TRAVERSAL = "path_traversal"
    INSECURE_DESERIALIZATION = "insecure_deserialization"
    AUTH_BYPASS = "auth_bypass"
    INFO_DISCLOSURE = "info_disclosure"
    MISCONFIGURATION = "misconfiguration"
    OUTDATED_COMPONENT = "outdated_component"
    WEAK_CRYPTO = "weak_cryptography"


@dataclass
class FindingMetrics:
    """Metrics for individual findings."""
    finding_type: FindingType
    severity: SeverityLevel
    confidence: float  # 0.0 - 1.0
    exploitability: float  # 0.0 - 1.0
    detection_time_ms: int
    verification_time_ms: Optional[int] = None
    false_positive: Optional[bool] = None
    verified: bool = False
    exploited: bool = False
    cve_id: Optional[str] = None
    cvss_score: Optional[float] = None


@dataclass
class TokenUsage:
    """Track API token consumption."""
    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0
    cost_usd: float = 0.0
    model: str = ""

    def __post_init__(self):
        self.total_tokens = self.prompt_tokens + self.completion_tokens


@dataclass
class PerformanceMetrics:
    """Performance-related metrics."""
    scan_start_time: datetime = field(default_factory=datetime.utcnow)
    scan_end_time: Optional[datetime] = None
    total_duration_ms: int = 0
    mean_time_to_detect_ms: float = 0.0
    detection_rate_per_minute: float = 0.0
    memory_peak_mb: float = 0.0
    cpu_usage_avg: float = 0.0
    network_requests: int = 0
    network_errors: int = 0

    @property
    def duration_seconds(self) -> float:
        """Get duration in seconds."""
        if self.scan_end_time:
            return (self.scan_end_time - self.scan_start_time).total_seconds()
        return 0.0


@dataclass
class ClassificationMetrics:
    """
    Binary classification metrics for security findings.

    True Positives: Correctly identified vulnerabilities
    False Positives: Incorrectly flagged as vulnerabilities
    True Negatives: Correctly identified as safe
    False Negatives: Missed vulnerabilities
    """
    true_positives: int = 0
    false_positives: int = 0
    true_negatives: int = 0
    false_negatives: int = 0

    @property
    def precision(self) -> float:
        """Precision = TP / (TP + FP)"""
        denominator = self.true_positives + self.false_positives
        if denominator == 0:
            return 0.0
        return self.true_positives / denominator

    @property
    def recall(self) -> float:
        """Recall = TP / (TP + FN)"""
        denominator = self.true_positives + self.false_negatives
        if denominator == 0:
            return 0.0
        return self.true_positives / denominator

    @property
    def specificity(self) -> float:
        """Specificity = TN / (TN + FP)"""
        denominator = self.true_negatives + self.false_positives
        if denominator == 0:
            return 0.0
        return self.true_negatives / denominator

    @property
    def f1_score(self) -> float:
        """F1-Score = 2 * (Precision * Recall) / (Precision + Recall)"""
        precision = self.precision
        recall = self.recall
        if precision + recall == 0:
            return 0.0
        return 2 * (precision * recall) / (precision + recall)

    @property
    def f2_score(self) -> float:
        """F2-Score weights recall higher than precision."""
        precision = self.precision
        recall = self.recall
        if precision + recall == 0:
            return 0.0
        return 5 * (precision * recall) / ((4 * precision) + recall)

    @property
    def accuracy(self) -> float:
        """Accuracy = (TP + TN) / (TP + TN + FP + FN)"""
        total = (self.true_positives + self.true_negatives +
                 self.false_positives + self.false_negatives)
        if total == 0:
            return 0.0
        return (self.true_positives + self.true_negatives) / total

    @property
    def balanced_accuracy(self) -> float:
        """Balanced accuracy for imbalanced datasets."""
        sensitivity = self.recall
        specificity = self.specificity
        return (sensitivity + specificity) / 2

    @property
    def matthews_correlation(self) -> float:
        """Matthews Correlation Coefficient (-1 to +1)."""
        tp, tn, fp, fn = (self.true_positives, self.true_negatives,
                          self.false_positives, self.false_negatives)

        denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        if denominator == 0:
            return 0.0
        return ((tp * tn) - (fp * fn)) / denominator


@dataclass
class CoverageMetrics:
    """Coverage metrics for security testing."""
    total_endpoints: int = 0
    scanned_endpoints: int = 0
    total_parameters: int = 0
    tested_parameters: int = 0
    total_attack_vectors: int = 0
    tested_attack_vectors: int = 0
    owasp_categories_covered: List[str] = field(default_factory=list)

    @property
    def endpoint_coverage(self) -> float:
        """Percentage of endpoints scanned."""
        if self.total_endpoints == 0:
            return 0.0
        return (self.scanned_endpoints / self.total_endpoints) * 100

    @property
    def parameter_coverage(self) -> float:
        """Percentage of parameters tested."""
        if self.total_parameters == 0:
            return 0.0
        return (self.tested_parameters / self.total_parameters) * 100

    @property
    def attack_vector_coverage(self) -> float:
        """Percentage of attack vectors tested."""
        if self.total_attack_vectors == 0:
            return 0.0
        return (self.tested_attack_vectors / self.total_attack_vectors) * 100

    @property
    def owasp_coverage(self) -> float:
        """Percentage of OWASP Top 10 (2021) categories covered."""
        owasp_top10 = [
            "A01:2021-Broken Access Control",
            "A02:2021-Cryptographic Failures",
            "A03:2021-Injection",
            "A04:2021-Insecure Design",
            "A05:2021-Security Misconfiguration",
            "A06:2021-Vulnerable and Outdated Components",
            "A07:2021-Identification and Authentication Failures",
            "A08:2021-Software and Data Integrity Failures",
            "A09:2021-Security Logging and Monitoring Failures",
            "A10:2021-Server-Side Request Forgery (SSRF)"
        ]
        return (len(self.owasp_categories_covered) / len(owasp_top10)) * 100


@dataclass
class ExploitMetrics:
    """Metrics for exploit attempts."""
    total_exploits_attempted: int = 0
    successful_exploits: int = 0
    failed_exploits: int = 0
    blocked_exploits: int = 0
    exploit_types: Dict[str, int] = field(default_factory=dict)

    @property
    def success_rate(self) -> float:
        """Percentage of successful exploits."""
        if self.total_exploits_attempted == 0:
            return 0.0
        return (self.successful_exploits / self.total_exploits_attempted) * 100

    @property
    def safety_score(self) -> float:
        """Safety score based on controlled exploitation."""
        if self.total_exploits_attempted == 0:
            return 100.0
        # Percentage of attempts that ran without being blocked.
        return ((self.total_exploits_attempted - self.blocked_exploits) /
                self.total_exploits_attempted) * 100


@dataclass
class BenchmarkMetrics:
    """Complete benchmark metrics container."""
    # Identification
    benchmark_id: str = ""
    scenario_name: str = ""
    tool_version: str = ""
    timestamp: datetime = field(default_factory=datetime.utcnow)

    # Component metrics
    classification: ClassificationMetrics = field(default_factory=ClassificationMetrics)
    coverage: CoverageMetrics = field(default_factory=CoverageMetrics)
    performance: PerformanceMetrics = field(default_factory=PerformanceMetrics)
    exploit: ExploitMetrics = field(default_factory=ExploitMetrics)
    token_usage: TokenUsage = field(default_factory=TokenUsage)

    # Findings
    findings: List[FindingMetrics] = field(default_factory=list)

    # Raw data
    raw_output: Dict[str, Any] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)

    def calculate_aggregate_scores(self) -> Dict[str, float]:
        """Calculate aggregate benchmark scores."""
        scores = {
            "accuracy": self.classification.accuracy,
            "precision": self.classification.precision,
            "recall": self.classification.recall,
            "f1_score": self.classification.f1_score,
            "coverage": (
                self.coverage.endpoint_coverage +
                self.coverage.parameter_coverage +
                self.coverage.attack_vector_coverage
            ) / 3,
            "speed": self._calculate_speed_score(),
            "efficiency": self._calculate_efficiency_score(),
            "exploit_success": self.exploit.success_rate,
        }
        # Derive the overall score from the component scores computed above;
        # calling calculate_aggregate_scores() from within the overall
        # calculation would recurse infinitely.
        scores["overall"] = self._calculate_overall_score(scores)
        return scores

    def _calculate_speed_score(self) -> float:
        """Calculate speed score (0-100)."""
        duration = self.performance.duration_seconds
        if duration == 0:
            return 0.0
        # Faster is better, with diminishing returns
        return min(100.0, 1000.0 / max(duration, 10))

    def _calculate_efficiency_score(self) -> float:
        """Calculate efficiency score based on findings per token."""
        tokens = self.token_usage.total_tokens
        if tokens == 0:
            return 0.0
        findings = len(self.findings)
        return min(100.0, (findings / tokens) * 10000)

    def _calculate_overall_score(self, scores: Dict[str, float]) -> float:
        """Calculate overall benchmark score as a weighted sum."""
        weights = {
            "accuracy": 0.20,
            "precision": 0.15,
            "recall": 0.20,
            "f1_score": 0.15,
            "coverage": 0.15,
            "speed": 0.10,
            "efficiency": 0.05
        }

        return sum(scores.get(k, 0) * w for k, w in weights.items())

    def get_severity_distribution(self) -> Dict[str, int]:
        """Get distribution of findings by severity."""
        distribution = defaultdict(int)
        for finding in self.findings:
            distribution[finding.severity.value] += 1
        return dict(distribution)

    def get_finding_type_distribution(self) -> Dict[str, int]:
        """Get distribution of findings by type."""
        distribution = defaultdict(int)
        for finding in self.findings:
            distribution[finding.finding_type.value] += 1
        return dict(distribution)

    def to_dict(self) -> Dict[str, Any]:
        """Convert metrics to dictionary."""
        return {
            "benchmark_id": self.benchmark_id,
            "scenario_name": self.scenario_name,
            "tool_version": self.tool_version,
            "timestamp": self.timestamp.isoformat(),
            "classification": {
                "true_positives": self.classification.true_positives,
                "false_positives": self.classification.false_positives,
                "true_negatives": self.classification.true_negatives,
                "false_negatives": self.classification.false_negatives,
                "precision": self.classification.precision,
                "recall": self.classification.recall,
                "f1_score": self.classification.f1_score,
                "accuracy": self.classification.accuracy
            },
            "coverage": {
                "endpoint_coverage": self.coverage.endpoint_coverage,
                "parameter_coverage": self.coverage.parameter_coverage,
                "attack_vector_coverage": self.coverage.attack_vector_coverage,
                "owasp_coverage": self.coverage.owasp_coverage
            },
            "performance": {
                "duration_seconds": self.performance.duration_seconds,
                "mean_time_to_detect_ms": self.performance.mean_time_to_detect_ms,
                "memory_peak_mb": self.performance.memory_peak_mb,
                "cpu_usage_avg": self.performance.cpu_usage_avg
            },
            "exploit": {
                "success_rate": self.exploit.success_rate,
                "safety_score": self.exploit.safety_score,
                "total_attempts": self.exploit.total_exploits_attempted
            },
            "token_usage": {
                "prompt_tokens": self.token_usage.prompt_tokens,
                "completion_tokens": self.token_usage.completion_tokens,
                "total_tokens": self.token_usage.total_tokens,
                "cost_usd": self.token_usage.cost_usd
            },
            "findings_count": len(self.findings),
            "severity_distribution": self.get_severity_distribution(),
            "finding_type_distribution": self.get_finding_type_distribution(),
            "aggregate_scores": self.calculate_aggregate_scores()
        }

    def to_json(self, indent: int = 2) -> str:
        """Convert metrics to JSON string."""
        return json.dumps(self.to_dict(), indent=indent, default=str)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "BenchmarkMetrics":
        """Create metrics from dictionary."""
        metrics = cls(
            benchmark_id=data.get("benchmark_id", ""),
            scenario_name=data.get("scenario_name", ""),
            tool_version=data.get("tool_version", ""),
            timestamp=datetime.fromisoformat(
                data.get("timestamp", datetime.utcnow().isoformat()))
        )

        # Parse classification
        if "classification" in data:
            c = data["classification"]
            metrics.classification = ClassificationMetrics(
                true_positives=c.get("true_positives", 0),
                false_positives=c.get("false_positives", 0),
                true_negatives=c.get("true_negatives", 0),
                false_negatives=c.get("false_negatives", 0)
            )

        # Coverage percentages are derived properties of CoverageMetrics; the
        # raw endpoint/parameter counts are not serialized, so the serialized
        # percentages are kept in metadata rather than passed to the
        # CoverageMetrics constructor (which has no such fields).
        if "coverage" in data:
            metrics.metadata["coverage"] = data["coverage"]

        # Parse performance
        if "performance" in data:
            p = data["performance"]
            metrics.performance = PerformanceMetrics(
                total_duration_ms=int(p.get("duration_seconds", 0) * 1000),
                mean_time_to_detect_ms=p.get("mean_time_to_detect_ms", 0),
                memory_peak_mb=p.get("memory_peak_mb", 0),
                cpu_usage_avg=p.get("cpu_usage_avg", 0)
            )

        return metrics


class MetricsAggregator:
    """Aggregate metrics across multiple benchmark runs."""

    def __init__(self):
        self.metrics: List[BenchmarkMetrics] = []

    def add(self, metrics: BenchmarkMetrics) -> None:
        """Add metrics to aggregation."""
        self.metrics.append(metrics)

    def get_average_scores(self) -> Dict[str, float]:
        """Get average scores across all runs."""
        if not self.metrics:
            return {}

        all_scores = [m.calculate_aggregate_scores() for m in self.metrics]
        keys = all_scores[0].keys()

        return {
            key: statistics.mean(s.get(key, 0) for s in all_scores)
            for key in keys
        }

    def get_statistics(self) -> Dict[str, Dict[str, float]]:
        """Get statistical analysis of all metrics."""
        if not self.metrics:
            return {}

        scores = [m.calculate_aggregate_scores() for m in self.metrics]
        result = {}

        for key in scores[0].keys():
            values = [s[key] for s in scores]
            result[key] = {
                "mean": statistics.mean(values),
                "median": statistics.median(values),
                "stdev": statistics.stdev(values) if len(values) > 1 else 0.0,
                "min": min(values),
                "max": max(values)
            }

        return result

    def get_trend(self, metric_key: str = "overall") -> str:
        """Determine trend direction for a metric."""
        if len(self.metrics) < 2:
            return "insufficient_data"

        scores = [m.calculate_aggregate_scores().get(metric_key, 0)
                  for m in self.metrics]

        # Simple linear regression over (run index, score)
        n = len(scores)
        x_mean = (n - 1) / 2
        y_mean = statistics.mean(scores)

        numerator = sum((i - x_mean) * (scores[i] - y_mean) for i in range(n))
        denominator = sum((i - x_mean) ** 2 for i in range(n))

        if denominator == 0:
            return "stable"

        slope = numerator / denominator

        if slope > 0.5:
            return "improving"
        elif slope < -0.5:
            return "degrading"
        return "stable"


# Convenience functions
def calculate_confidence_interval(
    values: List[float],
    confidence: float = 0.95
) -> tuple:
    """Calculate a normal-approximation confidence interval for the mean."""
    if len(values) < 2:
        return (0.0, 0.0)

    mean = statistics.mean(values)
    stdev = statistics.stdev(values)

    # Two-sided z-score for the requested confidence level
    # (about 1.96 for 95%, 2.576 for 99%).
    z_score = statistics.NormalDist().inv_cdf(0.5 + confidence / 2)

    margin = z_score * (stdev / math.sqrt(len(values)))
    return (mean - margin, mean + margin)


def compare_metrics(
    baseline: BenchmarkMetrics,
    current: BenchmarkMetrics
) -> Dict[str, Any]:
    """Compare two benchmark runs and return differences."""
    baseline_scores = baseline.calculate_aggregate_scores()
    current_scores = current.calculate_aggregate_scores()

    comparison = {}
    for key in baseline_scores.keys():
        old_val = baseline_scores[key]
        new_val = current_scores.get(key, 0)

        if old_val != 0:
            pct_change = ((new_val - old_val) / old_val) * 100
        else:
            pct_change = 100.0 if new_val > 0 else 0.0

        comparison[key] = {
            "baseline": old_val,
            "current": new_val,
            "absolute_change": new_val - old_val,
            "percent_change": pct_change,
            "improved": new_val > old_val
        }

    return comparison