zen-ai-pentest 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. agents/__init__.py +28 -0
  2. agents/agent_base.py +239 -0
  3. agents/agent_orchestrator.py +346 -0
  4. agents/analysis_agent.py +225 -0
  5. agents/cli.py +258 -0
  6. agents/exploit_agent.py +224 -0
  7. agents/integration.py +211 -0
  8. agents/post_scan_agent.py +937 -0
  9. agents/react_agent.py +384 -0
  10. agents/react_agent_enhanced.py +616 -0
  11. agents/react_agent_vm.py +298 -0
  12. agents/research_agent.py +176 -0
  13. api/__init__.py +11 -0
  14. api/auth.py +123 -0
  15. api/main.py +1027 -0
  16. api/schemas.py +357 -0
  17. api/websocket.py +97 -0
  18. autonomous/__init__.py +122 -0
  19. autonomous/agent.py +253 -0
  20. autonomous/agent_loop.py +1370 -0
  21. autonomous/exploit_validator.py +1537 -0
  22. autonomous/memory.py +448 -0
  23. autonomous/react.py +339 -0
  24. autonomous/tool_executor.py +488 -0
  25. backends/__init__.py +16 -0
  26. backends/chatgpt_direct.py +133 -0
  27. backends/claude_direct.py +130 -0
  28. backends/duckduckgo.py +138 -0
  29. backends/openrouter.py +120 -0
  30. benchmarks/__init__.py +149 -0
  31. benchmarks/benchmark_engine.py +904 -0
  32. benchmarks/ci_benchmark.py +785 -0
  33. benchmarks/comparison.py +729 -0
  34. benchmarks/metrics.py +553 -0
  35. benchmarks/run_benchmarks.py +809 -0
  36. ci_cd/__init__.py +2 -0
  37. core/__init__.py +17 -0
  38. core/async_pool.py +282 -0
  39. core/asyncio_fix.py +222 -0
  40. core/cache.py +472 -0
  41. core/container.py +277 -0
  42. core/database.py +114 -0
  43. core/input_validator.py +353 -0
  44. core/models.py +288 -0
  45. core/orchestrator.py +611 -0
  46. core/plugin_manager.py +571 -0
  47. core/rate_limiter.py +405 -0
  48. core/secure_config.py +328 -0
  49. core/shield_integration.py +296 -0
  50. modules/__init__.py +46 -0
  51. modules/cve_database.py +362 -0
  52. modules/exploit_assist.py +330 -0
  53. modules/nuclei_integration.py +480 -0
  54. modules/osint.py +604 -0
  55. modules/protonvpn.py +554 -0
  56. modules/recon.py +165 -0
  57. modules/sql_injection_db.py +826 -0
  58. modules/tool_orchestrator.py +498 -0
  59. modules/vuln_scanner.py +292 -0
  60. modules/wordlist_generator.py +566 -0
  61. risk_engine/__init__.py +99 -0
  62. risk_engine/business_impact.py +267 -0
  63. risk_engine/business_impact_calculator.py +563 -0
  64. risk_engine/cvss.py +156 -0
  65. risk_engine/epss.py +190 -0
  66. risk_engine/example_usage.py +294 -0
  67. risk_engine/false_positive_engine.py +1073 -0
  68. risk_engine/scorer.py +304 -0
  69. web_ui/backend/main.py +471 -0
  70. zen_ai_pentest-2.0.0.dist-info/METADATA +795 -0
  71. zen_ai_pentest-2.0.0.dist-info/RECORD +75 -0
  72. zen_ai_pentest-2.0.0.dist-info/WHEEL +5 -0
  73. zen_ai_pentest-2.0.0.dist-info/entry_points.txt +2 -0
  74. zen_ai_pentest-2.0.0.dist-info/licenses/LICENSE +21 -0
  75. zen_ai_pentest-2.0.0.dist-info/top_level.txt +10 -0
benchmarks/metrics.py ADDED
@@ -0,0 +1,553 @@
1
+ """
2
+ Zen-AI-Pentest Benchmark Metrics Module
3
+
4
+ Comprehensive metrics collection for security testing benchmarks.
5
+ Provides statistical analysis and performance tracking.
6
+ """
7
+
8
+ from dataclasses import dataclass, field
9
+ from typing import Dict, List, Optional, Any
10
+ from datetime import datetime, timedelta
11
+ from enum import Enum
12
+ import json
13
+ import math
14
+ import statistics
15
+ from collections import defaultdict
16
+
17
+
18
class SeverityLevel(Enum):
    """Severity levels for vulnerabilities.

    The values are the lowercase strings used as keys in serialized
    reports (e.g. severity distributions), ordered here from most to
    least severe.
    """
    CRITICAL = "critical"
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
    INFO = "info"
25
+
26
+
27
class FindingType(Enum):
    """Types of security findings.

    The values are the snake_case strings used in serialized reports
    (e.g. finding-type distributions).
    """
    SQL_INJECTION = "sql_injection"
    XSS = "xss"
    CSRF = "csrf"
    SSRF = "ssrf"
    COMMAND_INJECTION = "command_injection"
    PATH_TRAVERSAL = "path_traversal"
    INSECURE_DESERIALIZATION = "insecure_deserialization"
    AUTH_BYPASS = "auth_bypass"
    INFO_DISCLOSURE = "info_disclosure"
    MISCONFIGURATION = "misconfiguration"
    OUTDATED_COMPONENT = "outdated_component"
    # NOTE: serialized value is "weak_cryptography", not "weak_crypto" —
    # the member name and value intentionally differ here.
    WEAK_CRYPTO = "weak_cryptography"
41
+
42
+
43
@dataclass
class FindingMetrics:
    """Metrics for individual findings.

    Records what was found, how confident the scanner is, and the
    detection/verification timings used by the benchmark aggregations.
    """
    finding_type: FindingType
    severity: SeverityLevel
    confidence: float  # 0.0 - 1.0
    exploitability: float  # 0.0 - 1.0
    detection_time_ms: int
    verification_time_ms: Optional[int] = None  # None when never verified
    false_positive: Optional[bool] = None  # None = not yet triaged
    verified: bool = False
    exploited: bool = False
    cve_id: Optional[str] = None  # e.g. "CVE-2024-12345", when known
    cvss_score: Optional[float] = None  # presumably a CVSS base score — TODO confirm scale
57
+
58
+
59
@dataclass
class TokenUsage:
    """Track API token consumption."""
    prompt_tokens: int = 0
    completion_tokens: int = 0
    # Always recomputed in __post_init__; any value passed by the caller
    # for total_tokens is overwritten.
    total_tokens: int = 0
    cost_usd: float = 0.0
    model: str = ""

    def __post_init__(self):
        # Derive the total so callers only need to supply the two parts.
        self.total_tokens = self.prompt_tokens + self.completion_tokens
70
+
71
+
72
@dataclass
class PerformanceMetrics:
    """Performance-related metrics for a single scan run."""
    # NOTE(review): datetime.utcnow() produces naive timestamps; callers
    # must set scan_end_time with the same convention.
    scan_start_time: datetime = field(default_factory=datetime.utcnow)
    scan_end_time: Optional[datetime] = None
    total_duration_ms: int = 0
    mean_time_to_detect_ms: float = 0.0
    detection_rate_per_minute: float = 0.0
    memory_peak_mb: float = 0.0
    cpu_usage_avg: float = 0.0
    network_requests: int = 0
    network_errors: int = 0

    @property
    def duration_seconds(self) -> float:
        """Elapsed scan time in seconds; 0.0 while the scan has no end time."""
        if self.scan_end_time is None:
            return 0.0
        elapsed = self.scan_end_time - self.scan_start_time
        return elapsed.total_seconds()
91
+
92
+
93
@dataclass
class ClassificationMetrics:
    """
    Binary classification metrics for security findings.

    True Positives: Correctly identified vulnerabilities
    False Positives: Incorrectly flagged as vulnerabilities
    True Negatives: Correctly identified as safe
    False Negatives: Missed vulnerabilities
    """
    true_positives: int = 0
    false_positives: int = 0
    true_negatives: int = 0
    false_negatives: int = 0

    @staticmethod
    def _ratio(numerator: float, denominator: float) -> float:
        """Return numerator/denominator, or 0.0 when the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    @property
    def precision(self) -> float:
        """Precision = TP / (TP + FP)"""
        return self._ratio(self.true_positives,
                           self.true_positives + self.false_positives)

    @property
    def recall(self) -> float:
        """Recall = TP / (TP + FN)"""
        return self._ratio(self.true_positives,
                           self.true_positives + self.false_negatives)

    @property
    def specificity(self) -> float:
        """Specificity = TN / (TN + FP)"""
        return self._ratio(self.true_negatives,
                           self.true_negatives + self.false_positives)

    @property
    def f1_score(self) -> float:
        """F1-Score = 2 * (Precision * Recall) / (Precision + Recall)"""
        p, r = self.precision, self.recall
        # If p + r is 0 then the numerator is 0 as well; _ratio yields 0.0.
        return self._ratio(2 * (p * r), p + r)

    @property
    def f2_score(self) -> float:
        """F2-Score weights recall higher than precision."""
        p, r = self.precision, self.recall
        # 4p + r can only be zero when both p and r are zero.
        return self._ratio(5 * (p * r), (4 * p) + r)

    @property
    def accuracy(self) -> float:
        """Accuracy = (TP + TN) / (TP + TN + FP + FN)"""
        correct = self.true_positives + self.true_negatives
        total = correct + self.false_positives + self.false_negatives
        return self._ratio(correct, total)

    @property
    def balanced_accuracy(self) -> float:
        """Balanced accuracy (mean of sensitivity and specificity) for
        imbalanced datasets."""
        return (self.recall + self.specificity) / 2

    @property
    def matthews_correlation(self) -> float:
        """Matthews Correlation Coefficient (-1 to +1)."""
        tp = self.true_positives
        tn = self.true_negatives
        fp = self.false_positives
        fn = self.false_negatives
        denom = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        return self._ratio((tp * tn) - (fp * fn), denom)
176
+
177
+
178
@dataclass
class CoverageMetrics:
    """Coverage metrics for security testing.

    Stores raw counts; the percentage views are derived properties.
    """
    total_endpoints: int = 0
    scanned_endpoints: int = 0
    total_parameters: int = 0
    tested_parameters: int = 0
    total_attack_vectors: int = 0
    tested_attack_vectors: int = 0
    owasp_categories_covered: List[str] = field(default_factory=list)

    # OWASP Top 10 (2021) reference list. Unannotated on purpose so the
    # dataclass machinery treats it as a class constant, not a field;
    # previously this list was rebuilt on every owasp_coverage call and
    # guarded by a dead `if not owasp_top10` check (a non-empty literal
    # is never falsy).
    _OWASP_TOP10 = (
        "A01:2021-Broken Access Control",
        "A02:2021-Cryptographic Failures",
        "A03:2021-Injection",
        "A04:2021-Insecure Design",
        "A05:2021-Security Misconfiguration",
        "A06:2021-Vulnerable and Outdated Components",
        "A07:2021-Identification and Authentication Failures",
        "A08:2021-Software and Data Integrity Failures",
        "A09:2021-Security Logging and Monitoring Failures",
        "A10:2021-Server-Side Request Forgery (SSRF)",
    )

    @property
    def endpoint_coverage(self) -> float:
        """Percentage of endpoints scanned (0.0 when no endpoints known)."""
        if self.total_endpoints == 0:
            return 0.0
        return (self.scanned_endpoints / self.total_endpoints) * 100

    @property
    def parameter_coverage(self) -> float:
        """Percentage of parameters tested (0.0 when no parameters known)."""
        if self.total_parameters == 0:
            return 0.0
        return (self.tested_parameters / self.total_parameters) * 100

    @property
    def attack_vector_coverage(self) -> float:
        """Percentage of attack vectors tested (0.0 when none known)."""
        if self.total_attack_vectors == 0:
            return 0.0
        return (self.tested_attack_vectors / self.total_attack_vectors) * 100

    @property
    def owasp_coverage(self) -> float:
        """Percentage of OWASP Top 10 (2021) categories covered.

        Assumes owasp_categories_covered holds distinct Top 10 entries;
        duplicates or non-Top-10 strings would inflate the percentage.
        """
        return (len(self.owasp_categories_covered) / len(self._OWASP_TOP10)) * 100
229
+
230
+
231
@dataclass
class ExploitMetrics:
    """Metrics for exploit attempts."""
    total_exploits_attempted: int = 0
    successful_exploits: int = 0
    failed_exploits: int = 0
    blocked_exploits: int = 0
    exploit_types: Dict[str, int] = field(default_factory=dict)

    @property
    def success_rate(self) -> float:
        """Percentage of attempted exploits that succeeded."""
        attempts = self.total_exploits_attempted
        if not attempts:
            return 0.0
        return (self.successful_exploits / attempts) * 100

    @property
    def safety_score(self) -> float:
        """Safety score based on controlled exploitation (100.0 when no
        exploits were attempted)."""
        attempts = self.total_exploits_attempted
        if not attempts:
            return 100.0
        # NOTE(review): as written this rewards a LOW blocked count — the
        # score drops as more exploits are blocked — while the original
        # comment claimed "fewer unexpected successes" is better. Formula
        # preserved as-is; confirm the intended semantics with the owner.
        return ((attempts - self.blocked_exploits) / attempts) * 100
255
+
256
+
257
@dataclass
class BenchmarkMetrics:
    """Complete benchmark metrics container for one benchmark run."""
    # Identification
    benchmark_id: str = ""
    scenario_name: str = ""
    tool_version: str = ""
    timestamp: datetime = field(default_factory=datetime.utcnow)

    # Component metrics
    classification: ClassificationMetrics = field(default_factory=ClassificationMetrics)
    coverage: CoverageMetrics = field(default_factory=CoverageMetrics)
    performance: PerformanceMetrics = field(default_factory=PerformanceMetrics)
    exploit: ExploitMetrics = field(default_factory=ExploitMetrics)
    token_usage: TokenUsage = field(default_factory=TokenUsage)

    # Findings
    findings: List[FindingMetrics] = field(default_factory=list)

    # Raw data
    raw_output: Dict[str, Any] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)

    def _component_scores(self) -> Dict[str, float]:
        """Compute the individual benchmark scores (everything but "overall").

        Split out so the overall score can be derived from these components
        without recursion. BUG FIX: previously calculate_aggregate_scores()
        called _calculate_overall_score(), which in turn called
        calculate_aggregate_scores() again — unbounded mutual recursion that
        made calculate_aggregate_scores(), to_dict() and to_json() raise
        RecursionError.
        """
        return {
            "accuracy": self.classification.accuracy,
            "precision": self.classification.precision,
            "recall": self.classification.recall,
            "f1_score": self.classification.f1_score,
            "coverage": (
                self.coverage.endpoint_coverage +
                self.coverage.parameter_coverage +
                self.coverage.attack_vector_coverage
            ) / 3,
            "speed": self._calculate_speed_score(),
            "efficiency": self._calculate_efficiency_score(),
            "exploit_success": self.exploit.success_rate,
        }

    def calculate_aggregate_scores(self) -> Dict[str, float]:
        """Calculate aggregate benchmark scores, including "overall"."""
        scores = self._component_scores()
        scores["overall"] = self._calculate_overall_score()
        return scores

    def _calculate_speed_score(self) -> float:
        """Calculate speed score (0-100); faster is better, capped below
        10 seconds, and 0.0 when no duration was recorded."""
        duration = self.performance.duration_seconds
        if duration == 0:
            return 0.0
        return min(100.0, 1000.0 / max(duration, 10))

    def _calculate_efficiency_score(self) -> float:
        """Calculate efficiency score based on findings per token used."""
        tokens = self.token_usage.total_tokens
        if tokens == 0:
            return 0.0
        findings = len(self.findings)
        return min(100.0, (findings / tokens) * 10000)

    def _calculate_overall_score(self) -> float:
        """Weighted combination of the component scores (weights sum to 1.0;
        exploit_success intentionally carries no weight)."""
        scores = self._component_scores()
        weights = {
            "accuracy": 0.20,
            "precision": 0.15,
            "recall": 0.20,
            "f1_score": 0.15,
            "coverage": 0.15,
            "speed": 0.10,
            "efficiency": 0.05
        }
        return sum(scores.get(k, 0) * w for k, w in weights.items())

    def get_severity_distribution(self) -> Dict[str, int]:
        """Get distribution of findings by severity value."""
        distribution = defaultdict(int)
        for finding in self.findings:
            distribution[finding.severity.value] += 1
        return dict(distribution)

    def get_finding_type_distribution(self) -> Dict[str, int]:
        """Get distribution of findings by finding-type value."""
        distribution = defaultdict(int)
        for finding in self.findings:
            distribution[finding.finding_type.value] += 1
        return dict(distribution)

    def to_dict(self) -> Dict[str, Any]:
        """Convert metrics to a JSON-serializable dictionary."""
        return {
            "benchmark_id": self.benchmark_id,
            "scenario_name": self.scenario_name,
            "tool_version": self.tool_version,
            "timestamp": self.timestamp.isoformat(),
            "classification": {
                "true_positives": self.classification.true_positives,
                "false_positives": self.classification.false_positives,
                "true_negatives": self.classification.true_negatives,
                "false_negatives": self.classification.false_negatives,
                "precision": self.classification.precision,
                "recall": self.classification.recall,
                "f1_score": self.classification.f1_score,
                "accuracy": self.classification.accuracy
            },
            "coverage": {
                "endpoint_coverage": self.coverage.endpoint_coverage,
                "parameter_coverage": self.coverage.parameter_coverage,
                "attack_vector_coverage": self.coverage.attack_vector_coverage,
                "owasp_coverage": self.coverage.owasp_coverage
            },
            "performance": {
                "duration_seconds": self.performance.duration_seconds,
                "mean_time_to_detect_ms": self.performance.mean_time_to_detect_ms,
                "memory_peak_mb": self.performance.memory_peak_mb,
                "cpu_usage_avg": self.performance.cpu_usage_avg
            },
            "exploit": {
                "success_rate": self.exploit.success_rate,
                "safety_score": self.exploit.safety_score,
                "total_attempts": self.exploit.total_exploits_attempted
            },
            "token_usage": {
                "prompt_tokens": self.token_usage.prompt_tokens,
                "completion_tokens": self.token_usage.completion_tokens,
                "total_tokens": self.token_usage.total_tokens,
                "cost_usd": self.token_usage.cost_usd
            },
            "findings_count": len(self.findings),
            "severity_distribution": self.get_severity_distribution(),
            "finding_type_distribution": self.get_finding_type_distribution(),
            "aggregate_scores": self.calculate_aggregate_scores()
        }

    def to_json(self, indent: int = 2) -> str:
        """Convert metrics to a JSON string."""
        return json.dumps(self.to_dict(), indent=indent, default=str)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "BenchmarkMetrics":
        """Create metrics from a dictionary (lossy inverse of to_dict)."""
        metrics = cls(
            benchmark_id=data.get("benchmark_id", ""),
            scenario_name=data.get("scenario_name", ""),
            tool_version=data.get("tool_version", ""),
            timestamp=datetime.fromisoformat(data.get("timestamp", datetime.utcnow().isoformat()))
        )

        # Parse classification (stored as raw counts, restored exactly)
        if "classification" in data:
            c = data["classification"]
            metrics.classification = ClassificationMetrics(
                true_positives=c.get("true_positives", 0),
                false_positives=c.get("false_positives", 0),
                true_negatives=c.get("true_negatives", 0),
                false_negatives=c.get("false_negatives", 0)
            )

        # Parse coverage. BUG FIX: CoverageMetrics stores raw counts and
        # derives percentages as read-only properties, but to_dict() only
        # serializes the percentages; the old code passed those property
        # names as constructor kwargs and raised TypeError. Map each
        # percentage back onto a base of 100 units (rounded — lossy).
        if "coverage" in data:
            cov = data["coverage"]
            covered = max(0, min(10, round(cov.get("owasp_coverage", 0) / 10)))
            metrics.coverage = CoverageMetrics(
                total_endpoints=100,
                scanned_endpoints=round(cov.get("endpoint_coverage", 0)),
                total_parameters=100,
                tested_parameters=round(cov.get("parameter_coverage", 0)),
                total_attack_vectors=100,
                tested_attack_vectors=round(cov.get("attack_vector_coverage", 0)),
                # Placeholder names: only the count survives serialization.
                owasp_categories_covered=[f"owasp_category_{i + 1}" for i in range(covered)],
            )

        # Parse performance (these names are real init fields)
        if "performance" in data:
            p = data["performance"]
            metrics.performance = PerformanceMetrics(
                total_duration_ms=p.get("duration_seconds", 0) * 1000,
                mean_time_to_detect_ms=p.get("mean_time_to_detect_ms", 0),
                memory_peak_mb=p.get("memory_peak_mb", 0),
                cpu_usage_avg=p.get("cpu_usage_avg", 0)
            )

        return metrics
435
+
436
+
437
class MetricsAggregator:
    """Aggregate metrics across multiple benchmark runs."""

    def __init__(self):
        # Runs in insertion (chronological) order; get_trend relies on this.
        self.metrics: List[BenchmarkMetrics] = []

    def add(self, metrics: BenchmarkMetrics) -> None:
        """Add one benchmark run to the aggregation."""
        self.metrics.append(metrics)

    def get_average_scores(self) -> Dict[str, float]:
        """Average each aggregate score across all runs ({} when empty).

        Score keys are taken from the first run; runs missing a key
        contribute 0 to that key's average.
        """
        if not self.metrics:
            return {}
        per_run = [m.calculate_aggregate_scores() for m in self.metrics]
        averages: Dict[str, float] = {}
        for key in per_run[0]:
            averages[key] = statistics.mean(run.get(key, 0) for run in per_run)
        return averages

    def get_statistics(self) -> Dict[str, Dict[str, float]]:
        """Per-score mean/median/stdev/min/max across all runs ({} when empty)."""
        if not self.metrics:
            return {}
        per_run = [m.calculate_aggregate_scores() for m in self.metrics]
        summary: Dict[str, Dict[str, float]] = {}
        for key in per_run[0]:
            samples = [run[key] for run in per_run]
            # stdev needs at least two samples.
            spread = statistics.stdev(samples) if len(samples) > 1 else 0.0
            summary[key] = {
                "mean": statistics.mean(samples),
                "median": statistics.median(samples),
                "stdev": spread,
                "min": min(samples),
                "max": max(samples),
            }
        return summary

    def get_trend(self, metric_key: str = "overall") -> str:
        """Classify the trend of one metric over run order.

        Fits a least-squares line over (run index, score) and buckets the
        slope: > 0.5 -> "improving", < -0.5 -> "degrading", else "stable".
        Returns "insufficient_data" with fewer than two runs.
        """
        if len(self.metrics) < 2:
            return "insufficient_data"

        series = [m.calculate_aggregate_scores().get(metric_key, 0)
                  for m in self.metrics]

        count = len(series)
        x_bar = (count - 1) / 2
        y_bar = statistics.mean(series)

        covariance = sum((i - x_bar) * (y - y_bar) for i, y in enumerate(series))
        variance = sum((i - x_bar) ** 2 for i in range(count))
        if variance == 0:
            return "stable"

        slope = covariance / variance
        if slope > 0.5:
            return "improving"
        if slope < -0.5:
            return "degrading"
        return "stable"
506
+
507
+
508
+ # Convenience functions
509
def calculate_confidence_interval(
    values: List[float],
    confidence: float = 0.95
) -> tuple:
    """Calculate a normal-approximation confidence interval for *values*.

    Args:
        values: Sample values; fewer than 2 yields the degenerate (0.0, 0.0).
        confidence: Two-sided confidence level in (0, 1), default 0.95.

    Returns:
        (lower, upper) bounds around the sample mean.
    """
    if len(values) < 2:
        return (0.0, 0.0)

    mean = statistics.mean(values)
    stdev = statistics.stdev(values)

    # BUG FIX: the old code used `1.96 if confidence == 0.95 else 2.576`,
    # so every level other than 0.95 (e.g. 0.90) silently got the 99%
    # critical value. Compute the two-sided z critical value exactly.
    z_score = statistics.NormalDist().inv_cdf((1.0 + confidence) / 2.0)

    margin = z_score * (stdev / math.sqrt(len(values)))
    return (mean - margin, mean + margin)
525
+
526
+
527
def compare_metrics(
    baseline: BenchmarkMetrics,
    current: BenchmarkMetrics
) -> Dict[str, Any]:
    """Compare two benchmark runs and return per-score differences.

    For every aggregate score present in the baseline run, reports the
    baseline value, the current value (0 when missing), the absolute and
    percent change, and an "improved" flag (higher is treated as better).
    A zero baseline maps to +100% when the current value is positive,
    else 0%.
    """
    before = baseline.calculate_aggregate_scores()
    after = current.calculate_aggregate_scores()

    report: Dict[str, Any] = {}
    for key, prev in before.items():
        now = after.get(key, 0)
        delta = now - prev

        if prev == 0:
            pct_change = 100.0 if now > 0 else 0.0
        else:
            pct_change = (delta / prev) * 100

        report[key] = {
            "baseline": prev,
            "current": now,
            "absolute_change": delta,
            "percent_change": pct_change,
            "improved": now > prev
        }

    return report