zen_ai_pentest-2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agents/__init__.py +28 -0
- agents/agent_base.py +239 -0
- agents/agent_orchestrator.py +346 -0
- agents/analysis_agent.py +225 -0
- agents/cli.py +258 -0
- agents/exploit_agent.py +224 -0
- agents/integration.py +211 -0
- agents/post_scan_agent.py +937 -0
- agents/react_agent.py +384 -0
- agents/react_agent_enhanced.py +616 -0
- agents/react_agent_vm.py +298 -0
- agents/research_agent.py +176 -0
- api/__init__.py +11 -0
- api/auth.py +123 -0
- api/main.py +1027 -0
- api/schemas.py +357 -0
- api/websocket.py +97 -0
- autonomous/__init__.py +122 -0
- autonomous/agent.py +253 -0
- autonomous/agent_loop.py +1370 -0
- autonomous/exploit_validator.py +1537 -0
- autonomous/memory.py +448 -0
- autonomous/react.py +339 -0
- autonomous/tool_executor.py +488 -0
- backends/__init__.py +16 -0
- backends/chatgpt_direct.py +133 -0
- backends/claude_direct.py +130 -0
- backends/duckduckgo.py +138 -0
- backends/openrouter.py +120 -0
- benchmarks/__init__.py +149 -0
- benchmarks/benchmark_engine.py +904 -0
- benchmarks/ci_benchmark.py +785 -0
- benchmarks/comparison.py +729 -0
- benchmarks/metrics.py +553 -0
- benchmarks/run_benchmarks.py +809 -0
- ci_cd/__init__.py +2 -0
- core/__init__.py +17 -0
- core/async_pool.py +282 -0
- core/asyncio_fix.py +222 -0
- core/cache.py +472 -0
- core/container.py +277 -0
- core/database.py +114 -0
- core/input_validator.py +353 -0
- core/models.py +288 -0
- core/orchestrator.py +611 -0
- core/plugin_manager.py +571 -0
- core/rate_limiter.py +405 -0
- core/secure_config.py +328 -0
- core/shield_integration.py +296 -0
- modules/__init__.py +46 -0
- modules/cve_database.py +362 -0
- modules/exploit_assist.py +330 -0
- modules/nuclei_integration.py +480 -0
- modules/osint.py +604 -0
- modules/protonvpn.py +554 -0
- modules/recon.py +165 -0
- modules/sql_injection_db.py +826 -0
- modules/tool_orchestrator.py +498 -0
- modules/vuln_scanner.py +292 -0
- modules/wordlist_generator.py +566 -0
- risk_engine/__init__.py +99 -0
- risk_engine/business_impact.py +267 -0
- risk_engine/business_impact_calculator.py +563 -0
- risk_engine/cvss.py +156 -0
- risk_engine/epss.py +190 -0
- risk_engine/example_usage.py +294 -0
- risk_engine/false_positive_engine.py +1073 -0
- risk_engine/scorer.py +304 -0
- web_ui/backend/main.py +471 -0
- zen_ai_pentest-2.0.0.dist-info/METADATA +795 -0
- zen_ai_pentest-2.0.0.dist-info/RECORD +75 -0
- zen_ai_pentest-2.0.0.dist-info/WHEEL +5 -0
- zen_ai_pentest-2.0.0.dist-info/entry_points.txt +2 -0
- zen_ai_pentest-2.0.0.dist-info/licenses/LICENSE +21 -0
- zen_ai_pentest-2.0.0.dist-info/top_level.txt +10 -0
benchmarks/run_benchmarks.py
@@ -0,0 +1,809 @@
"""
Zen-AI-Pentest Benchmark CLI Runner

Command-line interface for running benchmarks, viewing results,
and generating comparison reports.
"""

import asyncio
import json
import logging
import sys
from datetime import datetime  # required for the default benchmark name in run_benchmark()
from pathlib import Path
from typing import Optional, List

# Rich for beautiful CLI output
try:
    from rich.console import Console
    from rich.table import Table
    from rich.panel import Panel
    from rich.progress import Progress, TaskID
    from rich import box
    from rich.tree import Tree
    from rich.layout import Layout
    from rich.syntax import Syntax
    RICH_AVAILABLE = True
except ImportError:
    RICH_AVAILABLE = False

# Matplotlib for charts
try:
    import matplotlib.pyplot as plt
    import matplotlib
    matplotlib.use('Agg')  # Non-interactive backend
    MATPLOTLIB_AVAILABLE = True
except ImportError:
    MATPLOTLIB_AVAILABLE = False

from .benchmark_engine import (
    BenchmarkEngine, BenchmarkConfig, BenchmarkReport,
    BenchmarkStatus
)
from .scenarios import (
    list_all_scenarios, get_scenario, create_benchmark_suite,
    ScenarioType, DifficultyLevel, ALL_SCENARIOS
)
from .comparison import ComparisonFramework
from .ci_benchmark import CIBenchmarkRunner, CIConfig

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Console for output
console = Console() if RICH_AVAILABLE else None


class BenchmarkCLI:
    """Command-line interface for benchmark operations."""

    def __init__(self, output_dir: str = "benchmark_results"):
        self.engine = BenchmarkEngine(output_dir=output_dir)
        self.output_dir = Path(output_dir)

        if RICH_AVAILABLE:
            console.print(
                Panel.fit(
                    "[bold blue]Zen-AI-Pentest Benchmark Framework[/bold blue]\n"
                    "[dim]Security Testing Performance Evaluation[/dim]",
                    border_style="blue"
                )
            )

    def list_scenarios(
        self,
        scenario_type: Optional[str] = None,
        difficulty: Optional[str] = None,
        tag: Optional[str] = None
    ) -> None:
        """List available benchmark scenarios."""

        scenarios = list_all_scenarios()

        # Apply filters
        if scenario_type:
            scenarios = [s for s in scenarios if s.get("type") == scenario_type]
        if difficulty:
            scenarios = [s for s in scenarios if s.get("difficulty") == difficulty]
        if tag:
            scenarios = [s for s in scenarios if tag in s.get("tags", [])]

        if RICH_AVAILABLE:
            table = Table(
                title="Available Benchmark Scenarios",
                box=box.ROUNDED
            )
            table.add_column("ID", style="cyan", no_wrap=True)
            table.add_column("Name", style="green")
            table.add_column("Type", style="blue")
            table.add_column("Difficulty", style="yellow")
            table.add_column("Duration", justify="right")
            table.add_column("Vulns", justify="right")

            for s in scenarios:
                diff_color = {
                    "easy": "green",
                    "medium": "yellow",
                    "hard": "red",
                    "expert": "magenta"
                }.get(s.get("difficulty", ""), "white")

                table.add_row(
                    s.get("id", ""),
                    s.get("name", ""),
                    s.get("type", ""),
                    f"[{diff_color}]{s.get('difficulty', '')}[/{diff_color}]",
                    f"{s.get('estimated_duration_minutes', 0)}m",
                    str(s.get('expected_vulnerabilities_count', 0))
                )

            console.print(table)
            console.print(f"\n[dim]Total: {len(scenarios)} scenarios[/dim]")
        else:
            # Plain text output
            print("\nAvailable Benchmark Scenarios:")
            print("-" * 100)
            print(f"{'ID':<25} {'Name':<30} {'Type':<12} {'Difficulty':<10} {'Duration':<10}")
            print("-" * 100)

            for s in scenarios:
                print(
                    f"{s.get('id', ''):<25} "
                    f"{s.get('name', '')[:28]:<30} "
                    f"{s.get('type', ''):<12} "
                    f"{s.get('difficulty', ''):<10} "
                    f"{s.get('estimated_duration_minutes', 0)}m"
                )

            print(f"\nTotal: {len(scenarios)} scenarios")

    async def run_benchmark(
        self,
        scenarios: Optional[List[str]] = None,
        scenario_type: Optional[str] = None,
        difficulty: Optional[str] = None,
        tags: Optional[List[str]] = None,
        name: Optional[str] = None,
        concurrent: int = 1,
        timeout: int = 3600,
        compare: bool = False,
        competitors: Optional[List[str]] = None
    ) -> BenchmarkReport:
        """Run benchmark with specified configuration."""

        # Build configuration
        config = BenchmarkConfig(
            benchmark_name=name or f"benchmark_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
            scenarios=scenarios or [],
            tags=tags,
            max_concurrent=concurrent,
            timeout_per_scenario=timeout,
            enable_competitor_comparison=compare,
            competitors=competitors or []
        )

        # Add type filter
        if scenario_type:
            try:
                config.scenario_types = [ScenarioType(scenario_type)]
            except ValueError:
                logger.warning(f"Unknown scenario type: {scenario_type}")

        # Add difficulty filter
        if difficulty:
            try:
                config.difficulty_levels = [DifficultyLevel(difficulty)]
            except ValueError:
                logger.warning(f"Unknown difficulty: {difficulty}")

        if RICH_AVAILABLE:
            console.print(f"\n[bold]Starting Benchmark:[/bold] {config.benchmark_name}")
            console.print(f"[dim]ID: {config.benchmark_id}[/dim]\n")
        else:
            print(f"\nStarting Benchmark: {config.benchmark_name}")
            print(f"ID: {config.benchmark_id}\n")

        # Run benchmark
        report = await self.engine.run_benchmark(config)

        # Display results
        self._display_results(report)

        return report

    def _display_results(self, report: BenchmarkReport) -> None:
        """Display benchmark results."""

        if RICH_AVAILABLE:
            console.print("\n[bold]Benchmark Results:[/bold]\n")

            # Summary panel
            summary_text = (
                f"[bold]Duration:[/bold] {report.duration_seconds:.1f}s\n"
                f"[bold]Scenarios:[/bold] {len(report.scenario_results)}\n"
                f"[bold]Passed:[/bold] {report.scenarios_passed} ✅\n"
                f"[bold]Failed:[/bold] {report.scenarios_failed} ❌\n"
                f"[bold]Success Rate:[/bold] {report.success_rate:.1f}%"
            )
            console.print(Panel(summary_text, title="Summary", border_style="green"))

            # Results table
            table = Table(title="Scenario Results", box=box.ROUNDED)
            table.add_column("Scenario", style="cyan")
            table.add_column("Status", style="bold")
            table.add_column("Duration", justify="right")
            table.add_column("Precision", justify="right")
            table.add_column("Recall", justify="right")
            table.add_column("F1-Score", justify="right")
            table.add_column("Overall", justify="right")

            for result in report.scenario_results:
                status_color = {
                    BenchmarkStatus.COMPLETED: "green",
                    BenchmarkStatus.FAILED: "red",
                    BenchmarkStatus.TIMEOUT: "yellow",
                    BenchmarkStatus.CANCELLED: "magenta"
                }.get(result.status, "white")

                if result.metrics:
                    scores = result.metrics.calculate_aggregate_scores()
                    table.add_row(
                        result.scenario_id,
                        f"[{status_color}]{result.status.name}[/{status_color}]",
                        f"{result.duration_seconds:.1f}s",
                        f"{scores.get('precision', 0):.3f}",
                        f"{scores.get('recall', 0):.3f}",
                        f"{scores.get('f1_score', 0):.3f}",
                        f"[bold]{scores.get('overall', 0):.3f}[/bold]"
                    )
                else:
                    table.add_row(
                        result.scenario_id,
                        f"[{status_color}]{result.status.name}[/{status_color}]",
                        f"{result.duration_seconds:.1f}s",
                        "N/A", "N/A", "N/A", "N/A"
                    )

            console.print(table)

            # Aggregate metrics
            if report.aggregate_metrics:
                console.print("\n[bold]Aggregate Metrics:[/bold]")
                metrics_text = ""
                for key, value in report.aggregate_metrics.items():
                    if isinstance(value, float):
                        metrics_text += f"[bold]{key}:[/bold] {value:.3f}\n"
                    else:
                        metrics_text += f"[bold]{key}:[/bold] {value}\n"
                console.print(Panel(metrics_text, border_style="blue"))

            # Output location
            output_path = self.output_dir / report.benchmark_id
            console.print(f"\n[dim]Results saved to: {output_path}[/dim]")

        else:
            # Plain text output
            print("\n" + "="*60)
            print("BENCHMARK RESULTS")
            print("="*60)
            print(f"Duration: {report.duration_seconds:.1f}s")
            print(f"Scenarios: {len(report.scenario_results)}")
            print(f"Passed: {report.scenarios_passed}")
            print(f"Failed: {report.scenarios_failed}")
            print(f"Success Rate: {report.success_rate:.1f}%")
            print("-"*60)

            for result in report.scenario_results:
                print(f"\n{result.scenario_id}:")
                print(f"  Status: {result.status.name}")
                print(f"  Duration: {result.duration_seconds:.1f}s")

                if result.metrics:
                    scores = result.metrics.calculate_aggregate_scores()
                    print(f"  Precision: {scores.get('precision', 0):.3f}")
                    print(f"  Recall: {scores.get('recall', 0):.3f}")
                    print(f"  F1-Score: {scores.get('f1_score', 0):.3f}")
                    print(f"  Overall: {scores.get('overall', 0):.3f}")

    def view_report(self, benchmark_id: str) -> None:
        """View a specific benchmark report."""

        report_path = self.output_dir / benchmark_id / "report.json"

        if not report_path.exists():
            if RICH_AVAILABLE:
                console.print(f"[red]Report not found: {benchmark_id}[/red]")
            else:
                print(f"Report not found: {benchmark_id}")
            return

        try:
            with open(report_path) as f:
                data = json.load(f)

            if RICH_AVAILABLE:
                console.print(f"\n[bold]Benchmark Report:[/bold] {benchmark_id}")

                # Display as tree
                tree = Tree(f"[bold]{data.get('benchmark_name', benchmark_id)}[/bold]")

                summary = data.get('summary', {})
                summary_branch = tree.add("[blue]Summary[/blue]")
                summary_branch.add(f"Total: {summary.get('total_scenarios', 0)}")
                summary_branch.add(f"Passed: {summary.get('passed', 0)}")
                summary_branch.add(f"Failed: {summary.get('failed', 0)}")
                summary_branch.add(f"Success Rate: {summary.get('success_rate', 0):.1f}%")

                if 'aggregate_metrics' in data and data['aggregate_metrics']:
                    metrics_branch = tree.add("[green]Aggregate Metrics[/green]")
                    for key, value in data['aggregate_metrics'].items():
                        if isinstance(value, float):
                            metrics_branch.add(f"{key}: {value:.3f}")
                        else:
                            metrics_branch.add(f"{key}: {value}")

                console.print(tree)
            else:
                print(json.dumps(data, indent=2))

        except Exception as e:
            if RICH_AVAILABLE:
                console.print(f"[red]Error loading report: {e}[/red]")
            else:
                print(f"Error loading report: {e}")

    def list_history(self, limit: int = 10) -> None:
        """List benchmark history."""

        history = self.engine.get_benchmark_history(limit=limit)

        if not history:
            if RICH_AVAILABLE:
                console.print("[yellow]No benchmark history found[/yellow]")
            else:
                print("No benchmark history found")
            return

        if RICH_AVAILABLE:
            table = Table(title="Benchmark History", box=box.ROUNDED)
            table.add_column("ID", style="cyan", no_wrap=True)
            table.add_column("Date", style="dim")
            table.add_column("Scenarios", justify="right")
            table.add_column("Success Rate", justify="right")
            table.add_column("F1-Score", justify="right")

            for entry in history:
                metrics = entry.get('aggregate_metrics', {})
                table.add_row(
                    entry.get('benchmark_id', '')[:12],
                    entry.get('timestamp', '')[:19],
                    str(len(entry.get('scenarios', []))),
                    f"{entry.get('success_rate', 0):.1f}%",
                    f"{metrics.get('avg_f1_score', 0):.3f}"
                )

            console.print(table)
        else:
            print("\nBenchmark History:")
            print("-" * 80)
            for entry in history:
                metrics = entry.get('aggregate_metrics', {})
                print(
                    f"{entry.get('benchmark_id', '')[:12]} | "
                    f"{entry.get('timestamp', '')[:19]} | "
                    f"Scenarios: {len(entry.get('scenarios', [])):<3} | "
                    f"Success: {entry.get('success_rate', 0):.1f}% | "
                    f"F1: {metrics.get('avg_f1_score', 0):.3f}"
                )

    def compare_reports(
        self,
        benchmark_id1: str,
        benchmark_id2: str
    ) -> None:
        """Compare two benchmark reports."""

        path1 = self.output_dir / benchmark_id1 / "report.json"
        path2 = self.output_dir / benchmark_id2 / "report.json"

        if not path1.exists() or not path2.exists():
            if RICH_AVAILABLE:
                console.print("[red]One or both reports not found[/red]")
            else:
                print("One or both reports not found")
            return

        try:
            with open(path1) as f:
                data1 = json.load(f)
            with open(path2) as f:
                data2 = json.load(f)

            metrics1 = data1.get('aggregate_metrics', {})
            metrics2 = data2.get('aggregate_metrics', {})

            if RICH_AVAILABLE:
                console.print(f"\n[bold]Comparing Benchmarks:[/bold]")
                console.print(f"  [cyan]{benchmark_id1}[/cyan] vs [cyan]{benchmark_id2}[/cyan]\n")

                table = Table(box=box.ROUNDED)
                table.add_column("Metric", style="bold")
                table.add_column(benchmark_id1[:15], justify="right")
                table.add_column(benchmark_id2[:15], justify="right")
                table.add_column("Change", justify="right")

                for key in metrics1.keys():
                    if key in metrics2:
                        val1 = metrics1[key]
                        val2 = metrics2[key]

                        if isinstance(val1, float) and isinstance(val2, float):
                            change = val2 - val1
                            change_pct = (change / val1 * 100) if val1 != 0 else 0

                            change_color = "green" if change > 0 else "red"
                            change_str = f"{change:+.3f} ({change_pct:+.1f}%)"

                            table.add_row(
                                key,
                                f"{val1:.3f}",
                                f"{val2:.3f}",
                                f"[{change_color}]{change_str}[/{change_color}]"
                            )

                console.print(table)
            else:
                print(f"\nComparing: {benchmark_id1} vs {benchmark_id2}")
                print("-" * 60)
                for key in metrics1.keys():
                    if key in metrics2:
                        val1 = metrics1[key]
                        val2 = metrics2[key]
                        if isinstance(val1, float):
                            change = val2 - val1
                            print(f"{key}: {val1:.3f} → {val2:.3f} ({change:+.3f})")

        except Exception as e:
            if RICH_AVAILABLE:
                console.print(f"[red]Error comparing reports: {e}[/red]")
            else:
                print(f"Error comparing reports: {e}")

    def generate_chart(self, benchmark_id: Optional[str] = None) -> None:
        """Generate visualization charts."""

        if not MATPLOTLIB_AVAILABLE:
            if RICH_AVAILABLE:
                console.print("[yellow]Matplotlib not available. Install with: pip install matplotlib[/yellow]")
            else:
                print("Matplotlib not available")
            return

        if benchmark_id:
            # Single benchmark chart
            report_path = self.output_dir / benchmark_id / "report.json"
            if not report_path.exists():
                if RICH_AVAILABLE:
                    console.print(f"[red]Report not found: {benchmark_id}[/red]")
                else:
                    print(f"Report not found: {benchmark_id}")
                return

            with open(report_path) as f:
                data = json.load(f)

            self._create_benchmark_chart(data, benchmark_id)
        else:
            # Historical trend chart
            history = self.engine.get_benchmark_history(limit=20)
            if not history:
                if RICH_AVAILABLE:
                    console.print("[yellow]No history available for trend chart[/yellow]")
                else:
                    print("No history available")
                return

            self._create_trend_chart(history)

    def _create_benchmark_chart(self, data: dict, benchmark_id: str) -> None:
        """Create chart for single benchmark."""

        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
        fig.suptitle(f'Benchmark Results: {benchmark_id}', fontsize=14, fontweight='bold')

        scenario_results = data.get('scenario_results', [])

        # 1. Scores by scenario (only results that carry metrics)
        ax1 = axes[0, 0]
        scenarios = [r['scenario_id'] for r in scenario_results if r.get('metrics')]
        scores_data = []

        for r in scenario_results:
            if r.get('metrics'):
                scores = r['metrics'].get('aggregate_scores', {})
                scores_data.append({
                    'precision': scores.get('precision', 0),
                    'recall': scores.get('recall', 0),
                    'f1': scores.get('f1_score', 0)
                })

        if scenarios and scores_data:
            x = range(len(scenarios))
            width = 0.25

            ax1.bar([i - width for i in x], [s['precision'] for s in scores_data],
                    width, label='Precision', alpha=0.8)
            ax1.bar(x, [s['recall'] for s in scores_data],
                    width, label='Recall', alpha=0.8)
            ax1.bar([i + width for i in x], [s['f1'] for s in scores_data],
                    width, label='F1-Score', alpha=0.8)

            ax1.set_xlabel('Scenario')
            ax1.set_ylabel('Score')
            ax1.set_title('Scores by Scenario')
            ax1.set_xticks(x)
            ax1.set_xticklabels(scenarios, rotation=45, ha='right')
            ax1.legend()
            ax1.set_ylim(0, 1)

        # 2. Duration by scenario
        ax2 = axes[0, 1]
        # Plot every result here; `scenarios` above only lists results that have
        # metrics, so its length can differ from `durations`.
        all_scenario_ids = [r['scenario_id'] for r in scenario_results]
        durations = [r['duration_seconds'] for r in scenario_results]
        statuses = [r['status'] for r in scenario_results]
        colors = ['green' if s == 'COMPLETED' else 'red' for s in statuses]

        ax2.barh(all_scenario_ids, durations, color=colors, alpha=0.7)
        ax2.set_xlabel('Duration (seconds)')
        ax2.set_title('Scenario Duration')

        # 3. Severity distribution
        ax3 = axes[1, 0]
        all_severities = {}
        for r in scenario_results:
            if r.get('metrics'):
                sev_dist = r['metrics'].get('severity_distribution', {})
                for sev, count in sev_dist.items():
                    all_severities[sev] = all_severities.get(sev, 0) + count

        if all_severities:
            colors_sev = {'critical': '#d32f2f', 'high': '#f57c00',
                          'medium': '#fbc02d', 'low': '#388e3c', 'info': '#1976d2'}
            sev_colors = [colors_sev.get(s, '#757575') for s in all_severities.keys()]
            ax3.pie(list(all_severities.values()), labels=list(all_severities.keys()),
                    colors=sev_colors, autopct='%1.1f%%')
            ax3.set_title('Findings by Severity')

        # 4. Aggregate metrics
        ax4 = axes[1, 1]
        metrics = data.get('aggregate_metrics', {})
        metric_names = []
        metric_values = []

        for key in ['avg_precision', 'avg_recall', 'avg_f1_score', 'avg_accuracy']:
            if key in metrics:
                metric_names.append(key.replace('avg_', '').title())
                metric_values.append(metrics[key])

        if metric_values:
            bars = ax4.bar(metric_names, metric_values, color='steelblue', alpha=0.7)
            ax4.set_ylabel('Score')
            ax4.set_title('Aggregate Metrics')
            ax4.set_ylim(0, 1)

            # Add value labels on bars
            for bar, val in zip(bars, metric_values):
                ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                         f'{val:.3f}', ha='center', va='bottom', fontsize=9)

        plt.tight_layout()

        output_path = self.output_dir / benchmark_id / "chart.png"
        plt.savefig(output_path, dpi=150, bbox_inches='tight')
        plt.close()

        if RICH_AVAILABLE:
            console.print(f"[green]Chart saved:[/green] {output_path}")
        else:
            print(f"Chart saved: {output_path}")

    def _create_trend_chart(self, history: list) -> None:
        """Create historical trend chart."""

        fig, ax = plt.subplots(figsize=(12, 6))

        dates = [h.get('timestamp', '')[:10] for h in history]

        metrics_to_plot = {
            'F1-Score': [h.get('aggregate_metrics', {}).get('avg_f1_score', 0) for h in history],
            'Precision': [h.get('aggregate_metrics', {}).get('avg_precision', 0) for h in history],
            'Recall': [h.get('aggregate_metrics', {}).get('avg_recall', 0) for h in history],
            'Accuracy': [h.get('aggregate_metrics', {}).get('avg_accuracy', 0) for h in history],
        }

        for label, values in metrics_to_plot.items():
            ax.plot(dates, values, marker='o', label=label, linewidth=2)

        ax.set_xlabel('Date')
        ax.set_ylabel('Score')
        ax.set_title('Benchmark Performance Trends')
        ax.legend()
        ax.grid(True, alpha=0.3)
        ax.set_ylim(0, 1)

        plt.xticks(rotation=45)
        plt.tight_layout()

        output_path = self.output_dir / "trend_chart.png"
        plt.savefig(output_path, dpi=150, bbox_inches='tight')
        plt.close()

        if RICH_AVAILABLE:
            console.print(f"[green]Trend chart saved:[/green] {output_path}")
        else:
            print(f"Trend chart saved: {output_path}")

    async def run_ci(
        self,
        benchmark_type: str = "quick",
        output_format: str = "all",
        fail_on_gate: bool = False,
        fail_on_regression: bool = False
    ) -> None:
        """Run CI/CD benchmark pipeline."""

        config = CIConfig(
            output_format=output_format,
            fail_on_gate_failure=fail_on_gate,
            fail_on_critical_regression=fail_on_regression
        )

        runner = CIBenchmarkRunner(
            engine=self.engine,
            config=config,
            output_dir=str(self.output_dir)
        )

        result = await runner.run_ci_pipeline(benchmark_type)

        if RICH_AVAILABLE:
            if result["should_fail"]:
                console.print(f"\n[red]❌ Build Failed: {result['fail_reason']}[/red]")
            else:
                console.print("\n[green]✅ All Checks Passed[/green]")

            console.print(f"\n[bold]Results:[/bold]")
            console.print(f"  Success Rate: {result['success_rate']:.1f}%")
            console.print(f"  Gates Passed: {result['gates_passed']}/{result['gates_total']}")
            console.print(f"  Regressions: {result['regressions']}")
        else:
            if result["should_fail"]:
                print(f"\nBuild Failed: {result['fail_reason']}")
            else:
                print("\nAll Checks Passed")
            print(f"Success Rate: {result['success_rate']:.1f}%")

        if result["should_fail"]:
            sys.exit(1)


def main():
    """Main CLI entry point."""
    import argparse
    from datetime import datetime

    parser = argparse.ArgumentParser(
        description="Zen-AI-Pentest Benchmark Framework",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # List all scenarios
  %(prog)s list

  # Run quick benchmark
  %(prog)s run --scenarios dvwa juice-shop

  # Run full benchmark suite
  %(prog)s run --all

  # Run by difficulty
  %(prog)s run --difficulty easy

  # View report
  %(prog)s view <benchmark-id>

  # Compare two reports
  %(prog)s compare <id1> <id2>

  # Generate charts
  %(prog)s chart --benchmark <id>

  # Run CI pipeline
  %(prog)s ci --type quick
"""
    )

    subparsers = parser.add_subparsers(dest="command", help="Commands")

    # List command
    list_parser = subparsers.add_parser("list", help="List available scenarios")
    list_parser.add_argument("--type", help="Filter by scenario type")
    list_parser.add_argument("--difficulty", help="Filter by difficulty")
    list_parser.add_argument("--tag", help="Filter by tag")

    # Run command
    run_parser = subparsers.add_parser("run", help="Run benchmark")
    run_parser.add_argument("--scenarios", nargs="+", help="Specific scenarios to run")
    run_parser.add_argument("--all", action="store_true", help="Run all scenarios")
    run_parser.add_argument("--type", help="Filter by scenario type")
    run_parser.add_argument("--difficulty", help="Filter by difficulty")
    run_parser.add_argument("--tags", nargs="+", help="Filter by tags")
    run_parser.add_argument("--name", help="Benchmark name")
    run_parser.add_argument("--concurrent", type=int, default=1, help="Max concurrent scenarios")
    run_parser.add_argument("--timeout", type=int, default=3600, help="Timeout per scenario (seconds)")
    run_parser.add_argument("--compare", action="store_true", help="Compare with competitors")
    run_parser.add_argument("--competitors", nargs="+", help="Competitors to compare")

    # View command
    view_parser = subparsers.add_parser("view", help="View benchmark report")
    view_parser.add_argument("benchmark_id", help="Benchmark ID")

    # History command
    history_parser = subparsers.add_parser("history", help="View benchmark history")
    history_parser.add_argument("--limit", type=int, default=10, help="Number of entries")

    # Compare command
    compare_parser = subparsers.add_parser("compare", help="Compare two benchmarks")
    compare_parser.add_argument("benchmark_id1", help="First benchmark ID")
    compare_parser.add_argument("benchmark_id2", help="Second benchmark ID")

    # Chart command
    chart_parser = subparsers.add_parser("chart", help="Generate visualization charts")
    chart_parser.add_argument("--benchmark", help="Benchmark ID (or omit for trend)")

    # CI command
    ci_parser = subparsers.add_parser("ci", help="Run CI/CD benchmark pipeline")
    ci_parser.add_argument("--type", choices=["quick", "full"], default="quick")
    ci_parser.add_argument("--format", choices=["json", "junit", "markdown", "all"], default="all")
    ci_parser.add_argument("--fail-on-gate", action="store_true", help="Fail on gate failure")
    ci_parser.add_argument("--fail-on-regression", action="store_true", help="Fail on regression")

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    cli = BenchmarkCLI()

    if args.command == "list":
        cli.list_scenarios(
            scenario_type=args.type,
            difficulty=args.difficulty,
            tag=args.tag
        )

    elif args.command == "run":
        scenarios = None
        if args.all:
            scenarios = list(ALL_SCENARIOS.keys())
        elif args.scenarios:
            scenarios = args.scenarios

        asyncio.run(cli.run_benchmark(
            scenarios=scenarios,
            scenario_type=args.type,
            difficulty=args.difficulty,
            tags=args.tags,
            name=args.name,
            concurrent=args.concurrent,
            timeout=args.timeout,
            compare=args.compare,
            competitors=args.competitors
        ))

    elif args.command == "view":
        cli.view_report(args.benchmark_id)

    elif args.command == "history":
        cli.list_history(limit=args.limit)

    elif args.command == "compare":
        cli.compare_reports(args.benchmark_id1, args.benchmark_id2)

    elif args.command == "chart":
        cli.generate_chart(args.benchmark)

    elif args.command == "ci":
        asyncio.run(cli.run_ci(
            benchmark_type=args.type,
            output_format=args.format,
            fail_on_gate=args.fail_on_gate,
            fail_on_regression=args.fail_on_regression
        ))


if __name__ == "__main__":
    main()
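For orientation, the BenchmarkCLI class in this file can also be driven programmatically rather than through the argparse entry point in main(). The sketch below is illustrative only: it assumes the wheel's top-level benchmarks package is importable (so the relative imports of benchmark_engine, scenarios, comparison and ci_benchmark resolve), and that a scenario id such as "dvwa" (taken from the help epilog above) actually exists in ALL_SCENARIOS; the output directory and benchmark name are placeholders, not values shipped with the package.

import asyncio

from benchmarks.run_benchmarks import BenchmarkCLI

# Instantiate the CLI wrapper; this also constructs the underlying BenchmarkEngine.
cli = BenchmarkCLI(output_dir="benchmark_results")

# Print the scenario catalogue (rich table if rich is installed, plain text otherwise).
cli.list_scenarios(difficulty="easy")

# Run a single scenario and keep the returned BenchmarkReport (assumes "dvwa" is a valid id).
report = asyncio.run(cli.run_benchmark(scenarios=["dvwa"], name="smoke_test", timeout=1800))

# Render the four-panel chart for that run (requires matplotlib).
cli.generate_chart(report.benchmark_id)

The same flow is exposed on the command line through the list, run, and chart subcommands that main() registers, for example: run --scenarios dvwa --name smoke_test.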