zen-ai-pentest 2.0.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. agents/__init__.py +28 -0
  2. agents/agent_base.py +239 -0
  3. agents/agent_orchestrator.py +346 -0
  4. agents/analysis_agent.py +225 -0
  5. agents/cli.py +258 -0
  6. agents/exploit_agent.py +224 -0
  7. agents/integration.py +211 -0
  8. agents/post_scan_agent.py +937 -0
  9. agents/react_agent.py +384 -0
  10. agents/react_agent_enhanced.py +616 -0
  11. agents/react_agent_vm.py +298 -0
  12. agents/research_agent.py +176 -0
  13. api/__init__.py +11 -0
  14. api/auth.py +123 -0
  15. api/main.py +1027 -0
  16. api/schemas.py +357 -0
  17. api/websocket.py +97 -0
  18. autonomous/__init__.py +122 -0
  19. autonomous/agent.py +253 -0
  20. autonomous/agent_loop.py +1370 -0
  21. autonomous/exploit_validator.py +1537 -0
  22. autonomous/memory.py +448 -0
  23. autonomous/react.py +339 -0
  24. autonomous/tool_executor.py +488 -0
  25. backends/__init__.py +16 -0
  26. backends/chatgpt_direct.py +133 -0
  27. backends/claude_direct.py +130 -0
  28. backends/duckduckgo.py +138 -0
  29. backends/openrouter.py +120 -0
  30. benchmarks/__init__.py +149 -0
  31. benchmarks/benchmark_engine.py +904 -0
  32. benchmarks/ci_benchmark.py +785 -0
  33. benchmarks/comparison.py +729 -0
  34. benchmarks/metrics.py +553 -0
  35. benchmarks/run_benchmarks.py +809 -0
  36. ci_cd/__init__.py +2 -0
  37. core/__init__.py +17 -0
  38. core/async_pool.py +282 -0
  39. core/asyncio_fix.py +222 -0
  40. core/cache.py +472 -0
  41. core/container.py +277 -0
  42. core/database.py +114 -0
  43. core/input_validator.py +353 -0
  44. core/models.py +288 -0
  45. core/orchestrator.py +611 -0
  46. core/plugin_manager.py +571 -0
  47. core/rate_limiter.py +405 -0
  48. core/secure_config.py +328 -0
  49. core/shield_integration.py +296 -0
  50. modules/__init__.py +46 -0
  51. modules/cve_database.py +362 -0
  52. modules/exploit_assist.py +330 -0
  53. modules/nuclei_integration.py +480 -0
  54. modules/osint.py +604 -0
  55. modules/protonvpn.py +554 -0
  56. modules/recon.py +165 -0
  57. modules/sql_injection_db.py +826 -0
  58. modules/tool_orchestrator.py +498 -0
  59. modules/vuln_scanner.py +292 -0
  60. modules/wordlist_generator.py +566 -0
  61. risk_engine/__init__.py +99 -0
  62. risk_engine/business_impact.py +267 -0
  63. risk_engine/business_impact_calculator.py +563 -0
  64. risk_engine/cvss.py +156 -0
  65. risk_engine/epss.py +190 -0
  66. risk_engine/example_usage.py +294 -0
  67. risk_engine/false_positive_engine.py +1073 -0
  68. risk_engine/scorer.py +304 -0
  69. web_ui/backend/main.py +471 -0
  70. zen_ai_pentest-2.0.0.dist-info/METADATA +795 -0
  71. zen_ai_pentest-2.0.0.dist-info/RECORD +75 -0
  72. zen_ai_pentest-2.0.0.dist-info/WHEEL +5 -0
  73. zen_ai_pentest-2.0.0.dist-info/entry_points.txt +2 -0
  74. zen_ai_pentest-2.0.0.dist-info/licenses/LICENSE +21 -0
  75. zen_ai_pentest-2.0.0.dist-info/top_level.txt +10 -0
benchmarks/run_benchmarks.py
@@ -0,0 +1,809 @@
+ """
+ Zen-AI-Pentest Benchmark CLI Runner
+
+ Command-line interface for running benchmarks, viewing results,
+ and generating comparison reports.
+ """
+
+ import asyncio
+ import json
+ import logging
+ import sys
+ from datetime import datetime  # needed for default benchmark names in run_benchmark()
+ from pathlib import Path
+ from typing import Optional, List
+
+ # Rich for beautiful CLI output
+ try:
+     from rich.console import Console
+     from rich.table import Table
+     from rich.panel import Panel
+     from rich.progress import Progress, TaskID
+     from rich import box
+     from rich.tree import Tree
+     from rich.layout import Layout
+     from rich.syntax import Syntax
+     RICH_AVAILABLE = True
+ except ImportError:
+     RICH_AVAILABLE = False
+
+ # Matplotlib for charts
+ try:
+     import matplotlib.pyplot as plt
+     import matplotlib
+     matplotlib.use('Agg')  # Non-interactive backend
+     MATPLOTLIB_AVAILABLE = True
+ except ImportError:
+     MATPLOTLIB_AVAILABLE = False
+
+ from .benchmark_engine import (
+     BenchmarkEngine, BenchmarkConfig, BenchmarkReport,
+     BenchmarkStatus
+ )
+ from .scenarios import (
+     list_all_scenarios, get_scenario, create_benchmark_suite,
+     ScenarioType, DifficultyLevel, ALL_SCENARIOS
+ )
+ from .comparison import ComparisonFramework
+ from .ci_benchmark import CIBenchmarkRunner, CIConfig
+
+ # Setup logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+ # Console for output
+ console = Console() if RICH_AVAILABLE else None
+
+
+ class BenchmarkCLI:
+     """Command-line interface for benchmark operations."""
+
+     def __init__(self, output_dir: str = "benchmark_results"):
+         self.engine = BenchmarkEngine(output_dir=output_dir)
+         self.output_dir = Path(output_dir)
+
+         if RICH_AVAILABLE:
+             console.print(
+                 Panel.fit(
+                     "[bold blue]Zen-AI-Pentest Benchmark Framework[/bold blue]\n"
+                     "[dim]Security Testing Performance Evaluation[/dim]",
+                     border_style="blue"
+                 )
+             )
+
+     def list_scenarios(
+         self,
+         scenario_type: Optional[str] = None,
+         difficulty: Optional[str] = None,
+         tag: Optional[str] = None
+     ) -> None:
+         """List available benchmark scenarios."""
+
+         scenarios = list_all_scenarios()
+
+         # Apply filters
+         if scenario_type:
+             scenarios = [s for s in scenarios if s.get("type") == scenario_type]
+         if difficulty:
+             scenarios = [s for s in scenarios if s.get("difficulty") == difficulty]
+         if tag:
+             scenarios = [s for s in scenarios if tag in s.get("tags", [])]
+
+         if RICH_AVAILABLE:
+             table = Table(
+                 title="Available Benchmark Scenarios",
+                 box=box.ROUNDED
+             )
+             table.add_column("ID", style="cyan", no_wrap=True)
+             table.add_column("Name", style="green")
+             table.add_column("Type", style="blue")
+             table.add_column("Difficulty", style="yellow")
+             table.add_column("Duration", justify="right")
+             table.add_column("Vulns", justify="right")
+
+             for s in scenarios:
+                 diff_color = {
+                     "easy": "green",
+                     "medium": "yellow",
+                     "hard": "red",
+                     "expert": "magenta"
+                 }.get(s.get("difficulty", ""), "white")
+
+                 table.add_row(
+                     s.get("id", ""),
+                     s.get("name", ""),
+                     s.get("type", ""),
+                     f"[{diff_color}]{s.get('difficulty', '')}[/{diff_color}]",
+                     f"{s.get('estimated_duration_minutes', 0)}m",
+                     str(s.get('expected_vulnerabilities_count', 0))
+                 )
+
+             console.print(table)
+             console.print(f"\n[dim]Total: {len(scenarios)} scenarios[/dim]")
+         else:
+             # Plain text output
+             print("\nAvailable Benchmark Scenarios:")
+             print("-" * 100)
+             print(f"{'ID':<25} {'Name':<30} {'Type':<12} {'Difficulty':<10} {'Duration':<10}")
+             print("-" * 100)
+
+             for s in scenarios:
+                 print(
+                     f"{s.get('id', ''):<25} "
+                     f"{s.get('name', '')[:28]:<30} "
+                     f"{s.get('type', ''):<12} "
+                     f"{s.get('difficulty', ''):<10} "
+                     f"{s.get('estimated_duration_minutes', 0)}m"
+                 )
+
+             print(f"\nTotal: {len(scenarios)} scenarios")
+
+     async def run_benchmark(
+         self,
+         scenarios: Optional[List[str]] = None,
+         scenario_type: Optional[str] = None,
+         difficulty: Optional[str] = None,
+         tags: Optional[List[str]] = None,
+         name: Optional[str] = None,
+         concurrent: int = 1,
+         timeout: int = 3600,
+         compare: bool = False,
+         competitors: Optional[List[str]] = None
+     ) -> BenchmarkReport:
+         """Run benchmark with specified configuration."""
+
+         # Build configuration
+         config = BenchmarkConfig(
+             benchmark_name=name or f"benchmark_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
+             scenarios=scenarios or [],
+             tags=tags,
+             max_concurrent=concurrent,
+             timeout_per_scenario=timeout,
+             enable_competitor_comparison=compare,
+             competitors=competitors or []
+         )
+
+         # Add type filter
+         if scenario_type:
+             try:
+                 config.scenario_types = [ScenarioType(scenario_type)]
+             except ValueError:
+                 logger.warning(f"Unknown scenario type: {scenario_type}")
+
+         # Add difficulty filter
+         if difficulty:
+             try:
+                 config.difficulty_levels = [DifficultyLevel(difficulty)]
+             except ValueError:
+                 logger.warning(f"Unknown difficulty: {difficulty}")
+
+         if RICH_AVAILABLE:
+             console.print(f"\n[bold]Starting Benchmark:[/bold] {config.benchmark_name}")
+             console.print(f"[dim]ID: {config.benchmark_id}[/dim]\n")
+         else:
+             print(f"\nStarting Benchmark: {config.benchmark_name}")
+             print(f"ID: {config.benchmark_id}\n")
+
+         # Run benchmark
+         report = await self.engine.run_benchmark(config)
+
+         # Display results
+         self._display_results(report)
+
+         return report
+
+     def _display_results(self, report: BenchmarkReport) -> None:
+         """Display benchmark results."""
+
+         if RICH_AVAILABLE:
+             console.print("\n[bold]Benchmark Results:[/bold]\n")
+
+             # Summary panel
+             summary_text = (
+                 f"[bold]Duration:[/bold] {report.duration_seconds:.1f}s\n"
+                 f"[bold]Scenarios:[/bold] {len(report.scenario_results)}\n"
+                 f"[bold]Passed:[/bold] {report.scenarios_passed} ✅\n"
+                 f"[bold]Failed:[/bold] {report.scenarios_failed} ❌\n"
+                 f"[bold]Success Rate:[/bold] {report.success_rate:.1f}%"
+             )
+             console.print(Panel(summary_text, title="Summary", border_style="green"))
+
+             # Results table
+             table = Table(title="Scenario Results", box=box.ROUNDED)
+             table.add_column("Scenario", style="cyan")
+             table.add_column("Status", style="bold")
+             table.add_column("Duration", justify="right")
+             table.add_column("Precision", justify="right")
+             table.add_column("Recall", justify="right")
+             table.add_column("F1-Score", justify="right")
+             table.add_column("Overall", justify="right")
+
+             for result in report.scenario_results:
+                 status_color = {
+                     BenchmarkStatus.COMPLETED: "green",
+                     BenchmarkStatus.FAILED: "red",
+                     BenchmarkStatus.TIMEOUT: "yellow",
+                     BenchmarkStatus.CANCELLED: "magenta"
+                 }.get(result.status, "white")
+
+                 if result.metrics:
+                     scores = result.metrics.calculate_aggregate_scores()
+                     table.add_row(
+                         result.scenario_id,
+                         f"[{status_color}]{result.status.name}[/{status_color}]",
+                         f"{result.duration_seconds:.1f}s",
+                         f"{scores.get('precision', 0):.3f}",
+                         f"{scores.get('recall', 0):.3f}",
+                         f"{scores.get('f1_score', 0):.3f}",
+                         f"[bold]{scores.get('overall', 0):.3f}[/bold]"
+                     )
+                 else:
+                     table.add_row(
+                         result.scenario_id,
+                         f"[{status_color}]{result.status.name}[/{status_color}]",
+                         f"{result.duration_seconds:.1f}s",
+                         "N/A", "N/A", "N/A", "N/A"
+                     )
+
+             console.print(table)
+
+             # Aggregate metrics
+             if report.aggregate_metrics:
+                 console.print("\n[bold]Aggregate Metrics:[/bold]")
+                 metrics_text = ""
+                 for key, value in report.aggregate_metrics.items():
+                     if isinstance(value, float):
+                         metrics_text += f"[bold]{key}:[/bold] {value:.3f}\n"
+                     else:
+                         metrics_text += f"[bold]{key}:[/bold] {value}\n"
+                 console.print(Panel(metrics_text, border_style="blue"))
+
+             # Output location
+             output_path = self.output_dir / report.benchmark_id
+             console.print(f"\n[dim]Results saved to: {output_path}[/dim]")
+
+         else:
+             # Plain text output
+             print("\n" + "="*60)
+             print("BENCHMARK RESULTS")
+             print("="*60)
+             print(f"Duration: {report.duration_seconds:.1f}s")
+             print(f"Scenarios: {len(report.scenario_results)}")
+             print(f"Passed: {report.scenarios_passed}")
+             print(f"Failed: {report.scenarios_failed}")
+             print(f"Success Rate: {report.success_rate:.1f}%")
+             print("-"*60)
+
+             for result in report.scenario_results:
+                 print(f"\n{result.scenario_id}:")
+                 print(f" Status: {result.status.name}")
+                 print(f" Duration: {result.duration_seconds:.1f}s")
+
+                 if result.metrics:
+                     scores = result.metrics.calculate_aggregate_scores()
+                     print(f" Precision: {scores.get('precision', 0):.3f}")
+                     print(f" Recall: {scores.get('recall', 0):.3f}")
+                     print(f" F1-Score: {scores.get('f1_score', 0):.3f}")
+                     print(f" Overall: {scores.get('overall', 0):.3f}")
+
+     def view_report(self, benchmark_id: str) -> None:
+         """View a specific benchmark report."""
+
+         report_path = self.output_dir / benchmark_id / "report.json"
+
+         if not report_path.exists():
+             if RICH_AVAILABLE:
+                 console.print(f"[red]Report not found: {benchmark_id}[/red]")
+             else:
+                 print(f"Report not found: {benchmark_id}")
+             return
+
+         try:
+             with open(report_path) as f:
+                 data = json.load(f)
+
+             if RICH_AVAILABLE:
+                 console.print(f"\n[bold]Benchmark Report:[/bold] {benchmark_id}")
+
+                 # Display as tree
+                 tree = Tree(f"[bold]{data.get('benchmark_name', benchmark_id)}[/bold]")
+
+                 summary = data.get('summary', {})
+                 summary_branch = tree.add("[blue]Summary[/blue]")
+                 summary_branch.add(f"Total: {summary.get('total_scenarios', 0)}")
+                 summary_branch.add(f"Passed: {summary.get('passed', 0)}")
+                 summary_branch.add(f"Failed: {summary.get('failed', 0)}")
+                 summary_branch.add(f"Success Rate: {summary.get('success_rate', 0):.1f}%")
+
+                 if 'aggregate_metrics' in data and data['aggregate_metrics']:
+                     metrics_branch = tree.add("[green]Aggregate Metrics[/green]")
+                     for key, value in data['aggregate_metrics'].items():
+                         if isinstance(value, float):
+                             metrics_branch.add(f"{key}: {value:.3f}")
+                         else:
+                             metrics_branch.add(f"{key}: {value}")
+
+                 console.print(tree)
+             else:
+                 print(json.dumps(data, indent=2))
+
+         except Exception as e:
+             if RICH_AVAILABLE:
+                 console.print(f"[red]Error loading report: {e}[/red]")
+             else:
+                 print(f"Error loading report: {e}")
+
+     def list_history(self, limit: int = 10) -> None:
+         """List benchmark history."""
+
+         history = self.engine.get_benchmark_history(limit=limit)
+
+         if not history:
+             if RICH_AVAILABLE:
+                 console.print("[yellow]No benchmark history found[/yellow]")
+             else:
+                 print("No benchmark history found")
+             return
+
+         if RICH_AVAILABLE:
+             table = Table(title="Benchmark History", box=box.ROUNDED)
+             table.add_column("ID", style="cyan", no_wrap=True)
+             table.add_column("Date", style="dim")
+             table.add_column("Scenarios", justify="right")
+             table.add_column("Success Rate", justify="right")
+             table.add_column("F1-Score", justify="right")
+
+             for entry in history:
+                 metrics = entry.get('aggregate_metrics', {})
+                 table.add_row(
+                     entry.get('benchmark_id', '')[:12],
+                     entry.get('timestamp', '')[:19],
+                     str(len(entry.get('scenarios', []))),
+                     f"{entry.get('success_rate', 0):.1f}%",
+                     f"{metrics.get('avg_f1_score', 0):.3f}"
+                 )
+
+             console.print(table)
+         else:
+             print("\nBenchmark History:")
+             print("-" * 80)
+             for entry in history:
+                 metrics = entry.get('aggregate_metrics', {})
+                 print(
+                     f"{entry.get('benchmark_id', '')[:12]} | "
+                     f"{entry.get('timestamp', '')[:19]} | "
+                     f"Scenarios: {len(entry.get('scenarios', [])):<3} | "
+                     f"Success: {entry.get('success_rate', 0):.1f}% | "
+                     f"F1: {metrics.get('avg_f1_score', 0):.3f}"
+                 )
+
+     def compare_reports(
+         self,
+         benchmark_id1: str,
+         benchmark_id2: str
+     ) -> None:
+         """Compare two benchmark reports."""
+
+         path1 = self.output_dir / benchmark_id1 / "report.json"
+         path2 = self.output_dir / benchmark_id2 / "report.json"
+
+         if not path1.exists() or not path2.exists():
+             if RICH_AVAILABLE:
+                 console.print("[red]One or both reports not found[/red]")
+             else:
+                 print("One or both reports not found")
+             return
+
+         try:
+             with open(path1) as f:
+                 data1 = json.load(f)
+             with open(path2) as f:
+                 data2 = json.load(f)
+
+             metrics1 = data1.get('aggregate_metrics', {})
+             metrics2 = data2.get('aggregate_metrics', {})
+
+             if RICH_AVAILABLE:
+                 console.print(f"\n[bold]Comparing Benchmarks:[/bold]")
+                 console.print(f" [cyan]{benchmark_id1}[/cyan] vs [cyan]{benchmark_id2}[/cyan]\n")
+
+                 table = Table(box=box.ROUNDED)
+                 table.add_column("Metric", style="bold")
+                 table.add_column(benchmark_id1[:15], justify="right")
+                 table.add_column(benchmark_id2[:15], justify="right")
+                 table.add_column("Change", justify="right")
+
+                 for key in metrics1.keys():
+                     if key in metrics2:
+                         val1 = metrics1[key]
+                         val2 = metrics2[key]
+
+                         if isinstance(val1, float) and isinstance(val2, float):
+                             change = val2 - val1
+                             change_pct = (change / val1 * 100) if val1 != 0 else 0
+
+                             change_color = "green" if change > 0 else "red"
+                             change_str = f"{change:+.3f} ({change_pct:+.1f}%)"
+
+                             table.add_row(
+                                 key,
+                                 f"{val1:.3f}",
+                                 f"{val2:.3f}",
+                                 f"[{change_color}]{change_str}[/{change_color}]"
+                             )
+
+                 console.print(table)
+             else:
+                 print(f"\nComparing: {benchmark_id1} vs {benchmark_id2}")
+                 print("-" * 60)
+                 for key in metrics1.keys():
+                     if key in metrics2:
+                         val1 = metrics1[key]
+                         val2 = metrics2[key]
+                         if isinstance(val1, float):
+                             change = val2 - val1
+                             print(f"{key}: {val1:.3f} → {val2:.3f} ({change:+.3f})")
+
+         except Exception as e:
+             if RICH_AVAILABLE:
+                 console.print(f"[red]Error comparing reports: {e}[/red]")
+             else:
+                 print(f"Error comparing reports: {e}")
+
+     def generate_chart(self, benchmark_id: Optional[str] = None) -> None:
+         """Generate visualization charts."""
+
+         if not MATPLOTLIB_AVAILABLE:
+             if RICH_AVAILABLE:
+                 console.print("[yellow]Matplotlib not available. Install with: pip install matplotlib[/yellow]")
+             else:
+                 print("Matplotlib not available")
+             return
+
+         if benchmark_id:
+             # Single benchmark chart
+             report_path = self.output_dir / benchmark_id / "report.json"
+             if not report_path.exists():
+                 if RICH_AVAILABLE:
+                     console.print(f"[red]Report not found: {benchmark_id}[/red]")
+                 else:
+                     print(f"Report not found: {benchmark_id}")
+                 return
+
+             with open(report_path) as f:
+                 data = json.load(f)
+
+             self._create_benchmark_chart(data, benchmark_id)
+         else:
+             # Historical trend chart
+             history = self.engine.get_benchmark_history(limit=20)
+             if not history:
+                 if RICH_AVAILABLE:
+                     console.print("[yellow]No history available for trend chart[/yellow]")
+                 else:
+                     print("No history available")
+                 return
+
+             self._create_trend_chart(history)
+
+     def _create_benchmark_chart(self, data: dict, benchmark_id: str) -> None:
+         """Create chart for single benchmark."""
+
+         fig, axes = plt.subplots(2, 2, figsize=(12, 10))
+         fig.suptitle(f'Benchmark Results: {benchmark_id}', fontsize=14, fontweight='bold')
+
+         scenario_results = data.get('scenario_results', [])
+
+         # 1. Scores by scenario
+         ax1 = axes[0, 0]
+         scenarios = [r['scenario_id'] for r in scenario_results if r.get('metrics')]
+         scores_data = []
+
+         for r in scenario_results:
+             if r.get('metrics'):
+                 scores = r['metrics'].get('aggregate_scores', {})
+                 scores_data.append({
+                     'precision': scores.get('precision', 0),
+                     'recall': scores.get('recall', 0),
+                     'f1': scores.get('f1_score', 0)
+                 })
+
+         if scenarios and scores_data:
+             x = range(len(scenarios))
+             width = 0.25
+
+             ax1.bar([i - width for i in x], [s['precision'] for s in scores_data],
+                     width, label='Precision', alpha=0.8)
+             ax1.bar(x, [s['recall'] for s in scores_data],
+                     width, label='Recall', alpha=0.8)
+             ax1.bar([i + width for i in x], [s['f1'] for s in scores_data],
+                     width, label='F1-Score', alpha=0.8)
+
+             ax1.set_xlabel('Scenario')
+             ax1.set_ylabel('Score')
+             ax1.set_title('Scores by Scenario')
+             ax1.set_xticks(x)
+             ax1.set_xticklabels(scenarios, rotation=45, ha='right')
+             ax1.legend()
+             ax1.set_ylim(0, 1)
+
+         # 2. Duration by scenario
+         ax2 = axes[0, 1]
+         # Use IDs for every result here so labels line up with durations/statuses,
+         # which are collected for all results (not only those with metrics).
+         all_scenario_ids = [r['scenario_id'] for r in scenario_results]
+         durations = [r['duration_seconds'] for r in scenario_results]
+         statuses = [r['status'] for r in scenario_results]
+         colors = ['green' if s == 'COMPLETED' else 'red' for s in statuses]
+
+         ax2.barh(all_scenario_ids, durations, color=colors, alpha=0.7)
+         ax2.set_xlabel('Duration (seconds)')
+         ax2.set_title('Scenario Duration')
+
+         # 3. Severity distribution
+         ax3 = axes[1, 0]
+         all_severities = {}
+         for r in scenario_results:
+             if r.get('metrics'):
+                 sev_dist = r['metrics'].get('severity_distribution', {})
+                 for sev, count in sev_dist.items():
+                     all_severities[sev] = all_severities.get(sev, 0) + count
+
+         if all_severities:
+             colors_sev = {'critical': '#d32f2f', 'high': '#f57c00',
+                           'medium': '#fbc02d', 'low': '#388e3c', 'info': '#1976d2'}
+             sev_colors = [colors_sev.get(s, '#757575') for s in all_severities.keys()]
+             # list() so matplotlib receives sized sequences rather than dict views
+             ax3.pie(list(all_severities.values()), labels=list(all_severities.keys()),
+                     colors=sev_colors, autopct='%1.1f%%')
+             ax3.set_title('Findings by Severity')
+
+         # 4. Aggregate metrics
+         ax4 = axes[1, 1]
+         metrics = data.get('aggregate_metrics', {})
+         metric_names = []
+         metric_values = []
+
+         for key in ['avg_precision', 'avg_recall', 'avg_f1_score', 'avg_accuracy']:
+             if key in metrics:
+                 metric_names.append(key.replace('avg_', '').title())
+                 metric_values.append(metrics[key])
+
+         if metric_values:
+             bars = ax4.bar(metric_names, metric_values, color='steelblue', alpha=0.7)
+             ax4.set_ylabel('Score')
+             ax4.set_title('Aggregate Metrics')
+             ax4.set_ylim(0, 1)
+
+             # Add value labels on bars
+             for bar, val in zip(bars, metric_values):
+                 ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
+                          f'{val:.3f}', ha='center', va='bottom', fontsize=9)
+
+         plt.tight_layout()
+
+         output_path = self.output_dir / benchmark_id / "chart.png"
+         plt.savefig(output_path, dpi=150, bbox_inches='tight')
+         plt.close()
+
+         if RICH_AVAILABLE:
+             console.print(f"[green]Chart saved:[/green] {output_path}")
+         else:
+             print(f"Chart saved: {output_path}")
+
+     def _create_trend_chart(self, history: list) -> None:
+         """Create historical trend chart."""
+
+         fig, ax = plt.subplots(figsize=(12, 6))
+
+         dates = [h.get('timestamp', '')[:10] for h in history]
+
+         metrics_to_plot = {
+             'F1-Score': [h.get('aggregate_metrics', {}).get('avg_f1_score', 0) for h in history],
+             'Precision': [h.get('aggregate_metrics', {}).get('avg_precision', 0) for h in history],
+             'Recall': [h.get('aggregate_metrics', {}).get('avg_recall', 0) for h in history],
+             'Accuracy': [h.get('aggregate_metrics', {}).get('avg_accuracy', 0) for h in history],
+         }
+
+         for label, values in metrics_to_plot.items():
+             ax.plot(dates, values, marker='o', label=label, linewidth=2)
+
+         ax.set_xlabel('Date')
+         ax.set_ylabel('Score')
+         ax.set_title('Benchmark Performance Trends')
+         ax.legend()
+         ax.grid(True, alpha=0.3)
+         ax.set_ylim(0, 1)
+
+         plt.xticks(rotation=45)
+         plt.tight_layout()
+
+         output_path = self.output_dir / "trend_chart.png"
+         plt.savefig(output_path, dpi=150, bbox_inches='tight')
+         plt.close()
+
+         if RICH_AVAILABLE:
+             console.print(f"[green]Trend chart saved:[/green] {output_path}")
+         else:
+             print(f"Trend chart saved: {output_path}")
+
+     async def run_ci(
+         self,
+         benchmark_type: str = "quick",
+         output_format: str = "all",
+         fail_on_gate: bool = False,
+         fail_on_regression: bool = False
+     ) -> None:
+         """Run CI/CD benchmark pipeline."""
+
+         config = CIConfig(
+             output_format=output_format,
+             fail_on_gate_failure=fail_on_gate,
+             fail_on_critical_regression=fail_on_regression
+         )
+
+         runner = CIBenchmarkRunner(
+             engine=self.engine,
+             config=config,
+             output_dir=str(self.output_dir)
+         )
+
+         result = await runner.run_ci_pipeline(benchmark_type)
+
+         if RICH_AVAILABLE:
+             if result["should_fail"]:
+                 console.print(f"\n[red]❌ Build Failed: {result['fail_reason']}[/red]")
+             else:
+                 console.print("\n[green]✅ All Checks Passed[/green]")
+
+             console.print(f"\n[bold]Results:[/bold]")
+             console.print(f" Success Rate: {result['success_rate']:.1f}%")
+             console.print(f" Gates Passed: {result['gates_passed']}/{result['gates_total']}")
+             console.print(f" Regressions: {result['regressions']}")
+         else:
+             if result["should_fail"]:
+                 print(f"\nBuild Failed: {result['fail_reason']}")
+             else:
+                 print("\nAll Checks Passed")
+             print(f"Success Rate: {result['success_rate']:.1f}%")
+
+         if result["should_fail"]:
+             sys.exit(1)
+
+
+ def main():
+     """Main CLI entry point."""
+     import argparse
+
+     parser = argparse.ArgumentParser(
+         description="Zen-AI-Pentest Benchmark Framework",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   # List all scenarios
+   %(prog)s list
+
+   # Run quick benchmark
+   %(prog)s run --scenarios dvwa juice-shop
+
+   # Run full benchmark suite
+   %(prog)s run --all
+
+   # Run by difficulty
+   %(prog)s run --difficulty easy
+
+   # View report
+   %(prog)s view <benchmark-id>
+
+   # Compare two reports
+   %(prog)s compare <id1> <id2>
+
+   # Generate charts
+   %(prog)s chart --benchmark <id>
+
+   # Run CI pipeline
+   %(prog)s ci --type quick
+ """
+     )
+
+     subparsers = parser.add_subparsers(dest="command", help="Commands")
+
+     # List command
+     list_parser = subparsers.add_parser("list", help="List available scenarios")
+     list_parser.add_argument("--type", help="Filter by scenario type")
+     list_parser.add_argument("--difficulty", help="Filter by difficulty")
+     list_parser.add_argument("--tag", help="Filter by tag")
+
+     # Run command
+     run_parser = subparsers.add_parser("run", help="Run benchmark")
+     run_parser.add_argument("--scenarios", nargs="+", help="Specific scenarios to run")
+     run_parser.add_argument("--all", action="store_true", help="Run all scenarios")
+     run_parser.add_argument("--type", help="Filter by scenario type")
+     run_parser.add_argument("--difficulty", help="Filter by difficulty")
+     run_parser.add_argument("--tags", nargs="+", help="Filter by tags")
+     run_parser.add_argument("--name", help="Benchmark name")
+     run_parser.add_argument("--concurrent", type=int, default=1, help="Max concurrent scenarios")
+     run_parser.add_argument("--timeout", type=int, default=3600, help="Timeout per scenario (seconds)")
+     run_parser.add_argument("--compare", action="store_true", help="Compare with competitors")
+     run_parser.add_argument("--competitors", nargs="+", help="Competitors to compare")
+
+     # View command
+     view_parser = subparsers.add_parser("view", help="View benchmark report")
+     view_parser.add_argument("benchmark_id", help="Benchmark ID")
+
+     # History command
+     history_parser = subparsers.add_parser("history", help="View benchmark history")
+     history_parser.add_argument("--limit", type=int, default=10, help="Number of entries")
+
+     # Compare command
+     compare_parser = subparsers.add_parser("compare", help="Compare two benchmarks")
+     compare_parser.add_argument("benchmark_id1", help="First benchmark ID")
+     compare_parser.add_argument("benchmark_id2", help="Second benchmark ID")
+
+     # Chart command
+     chart_parser = subparsers.add_parser("chart", help="Generate visualization charts")
+     chart_parser.add_argument("--benchmark", help="Benchmark ID (or omit for trend)")
+
+     # CI command
+     ci_parser = subparsers.add_parser("ci", help="Run CI/CD benchmark pipeline")
+     ci_parser.add_argument("--type", choices=["quick", "full"], default="quick")
+     ci_parser.add_argument("--format", choices=["json", "junit", "markdown", "all"], default="all")
+     ci_parser.add_argument("--fail-on-gate", action="store_true", help="Fail on gate failure")
+     ci_parser.add_argument("--fail-on-regression", action="store_true", help="Fail on regression")
+
+     args = parser.parse_args()
+
+     if not args.command:
+         parser.print_help()
+         return
+
+     cli = BenchmarkCLI()
+
+     if args.command == "list":
+         cli.list_scenarios(
+             scenario_type=args.type,
+             difficulty=args.difficulty,
+             tag=args.tag
+         )
+
+     elif args.command == "run":
+         scenarios = None
+         if args.all:
+             scenarios = list(ALL_SCENARIOS.keys())
+         elif args.scenarios:
+             scenarios = args.scenarios
+
+         asyncio.run(cli.run_benchmark(
+             scenarios=scenarios,
+             scenario_type=args.type,
+             difficulty=args.difficulty,
+             tags=args.tags,
+             name=args.name,
+             concurrent=args.concurrent,
+             timeout=args.timeout,
+             compare=args.compare,
+             competitors=args.competitors
+         ))
+
+     elif args.command == "view":
+         cli.view_report(args.benchmark_id)
+
+     elif args.command == "history":
+         cli.list_history(limit=args.limit)
+
+     elif args.command == "compare":
+         cli.compare_reports(args.benchmark_id1, args.benchmark_id2)
+
+     elif args.command == "chart":
+         cli.generate_chart(args.benchmark)
+
+     elif args.command == "ci":
+         asyncio.run(cli.run_ci(
+             benchmark_type=args.type,
+             output_format=args.format,
+             fail_on_gate=args.fail_on_gate,
+             fail_on_regression=args.fail_on_regression
+         ))
+
+
+ if __name__ == "__main__":
+     main()
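
For orientation, a minimal usage sketch of the CLI added in this file. It assumes the wheel is installed so that benchmarks is importable as the top-level package listed above; the "dvwa" scenario ID is taken from the argparse epilog and is illustrative only (real IDs come from list_scenarios()).

import asyncio
from benchmarks.run_benchmarks import BenchmarkCLI

# Programmatic equivalents of the `list`, `run`, and `history` subcommands.
cli = BenchmarkCLI(output_dir="benchmark_results")
cli.list_scenarios(difficulty="easy")
report = asyncio.run(cli.run_benchmark(scenarios=["dvwa"], name="smoke-test", timeout=1800))
cli.list_history(limit=5)

The same flows are reachable from a shell via module invocation, e.g. python -m benchmarks.run_benchmarks list --difficulty easy, since the script uses package-relative imports and guards main() behind if __name__ == "__main__".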