zen_ai_pentest-2.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. agents/__init__.py +28 -0
  2. agents/agent_base.py +239 -0
  3. agents/agent_orchestrator.py +346 -0
  4. agents/analysis_agent.py +225 -0
  5. agents/cli.py +258 -0
  6. agents/exploit_agent.py +224 -0
  7. agents/integration.py +211 -0
  8. agents/post_scan_agent.py +937 -0
  9. agents/react_agent.py +384 -0
  10. agents/react_agent_enhanced.py +616 -0
  11. agents/react_agent_vm.py +298 -0
  12. agents/research_agent.py +176 -0
  13. api/__init__.py +11 -0
  14. api/auth.py +123 -0
  15. api/main.py +1027 -0
  16. api/schemas.py +357 -0
  17. api/websocket.py +97 -0
  18. autonomous/__init__.py +122 -0
  19. autonomous/agent.py +253 -0
  20. autonomous/agent_loop.py +1370 -0
  21. autonomous/exploit_validator.py +1537 -0
  22. autonomous/memory.py +448 -0
  23. autonomous/react.py +339 -0
  24. autonomous/tool_executor.py +488 -0
  25. backends/__init__.py +16 -0
  26. backends/chatgpt_direct.py +133 -0
  27. backends/claude_direct.py +130 -0
  28. backends/duckduckgo.py +138 -0
  29. backends/openrouter.py +120 -0
  30. benchmarks/__init__.py +149 -0
  31. benchmarks/benchmark_engine.py +904 -0
  32. benchmarks/ci_benchmark.py +785 -0
  33. benchmarks/comparison.py +729 -0
  34. benchmarks/metrics.py +553 -0
  35. benchmarks/run_benchmarks.py +809 -0
  36. ci_cd/__init__.py +2 -0
  37. core/__init__.py +17 -0
  38. core/async_pool.py +282 -0
  39. core/asyncio_fix.py +222 -0
  40. core/cache.py +472 -0
  41. core/container.py +277 -0
  42. core/database.py +114 -0
  43. core/input_validator.py +353 -0
  44. core/models.py +288 -0
  45. core/orchestrator.py +611 -0
  46. core/plugin_manager.py +571 -0
  47. core/rate_limiter.py +405 -0
  48. core/secure_config.py +328 -0
  49. core/shield_integration.py +296 -0
  50. modules/__init__.py +46 -0
  51. modules/cve_database.py +362 -0
  52. modules/exploit_assist.py +330 -0
  53. modules/nuclei_integration.py +480 -0
  54. modules/osint.py +604 -0
  55. modules/protonvpn.py +554 -0
  56. modules/recon.py +165 -0
  57. modules/sql_injection_db.py +826 -0
  58. modules/tool_orchestrator.py +498 -0
  59. modules/vuln_scanner.py +292 -0
  60. modules/wordlist_generator.py +566 -0
  61. risk_engine/__init__.py +99 -0
  62. risk_engine/business_impact.py +267 -0
  63. risk_engine/business_impact_calculator.py +563 -0
  64. risk_engine/cvss.py +156 -0
  65. risk_engine/epss.py +190 -0
  66. risk_engine/example_usage.py +294 -0
  67. risk_engine/false_positive_engine.py +1073 -0
  68. risk_engine/scorer.py +304 -0
  69. web_ui/backend/main.py +471 -0
  70. zen_ai_pentest-2.0.0.dist-info/METADATA +795 -0
  71. zen_ai_pentest-2.0.0.dist-info/RECORD +75 -0
  72. zen_ai_pentest-2.0.0.dist-info/WHEEL +5 -0
  73. zen_ai_pentest-2.0.0.dist-info/entry_points.txt +2 -0
  74. zen_ai_pentest-2.0.0.dist-info/licenses/LICENSE +21 -0
  75. zen_ai_pentest-2.0.0.dist-info/top_level.txt +10 -0
benchmarks/benchmark_engine.py
@@ -0,0 +1,904 @@
+ """
+ Zen-AI-Pentest Benchmark Engine
+
+ Core benchmarking engine for security testing performance evaluation.
+ Provides async execution, historical tracking, and comprehensive reporting.
+ """
+
+ import asyncio
+ import hashlib
+ import json
+ import logging
+ import os
+ import time
+ import uuid
+ from dataclasses import dataclass, field, asdict
+ from datetime import datetime
+ from enum import Enum, auto
+ from pathlib import Path
+ from typing import Dict, List, Optional, Any, Callable, Union
+ import traceback
+
+ # Type hints for Zen-AI-Pentest core
+ from ..core.orchestrator import PentestOrchestrator
+ from ..core.models import ScanTarget, ScanConfig, ScanResult
+
+ from .metrics import (
+     BenchmarkMetrics, ClassificationMetrics, CoverageMetrics,
+     PerformanceMetrics, ExploitMetrics, TokenUsage, FindingMetrics,
+     SeverityLevel, FindingType, MetricsAggregator
+ )
+ from .scenarios import (
+     TestScenario, get_scenario, ALL_SCENARIOS, ScenarioType, DifficultyLevel
+ )
+ from .comparison import (
+     ToolBenchmarkResult, ComparisonFramework, ToolMetadata,
+     ToolCategory, ToolCapabilities
+ )
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ class BenchmarkStatus(Enum):
+     """Status of a benchmark run."""
+     PENDING = auto()
+     RUNNING = auto()
+     COMPLETED = auto()
+     FAILED = auto()
+     CANCELLED = auto()
+     TIMEOUT = auto()
+
+
+ @dataclass
+ class BenchmarkConfig:
+     """Configuration for benchmark execution."""
+
+     # Identification
+     benchmark_name: str = ""
+     benchmark_id: Optional[str] = None
+
+     # Scenarios to run
+     scenarios: List[str] = field(default_factory=list)
+     scenario_types: Optional[List[ScenarioType]] = None
+     difficulty_levels: Optional[List[DifficultyLevel]] = None
+     tags: Optional[List[str]] = None
+
+     # Execution settings
+     max_concurrent: int = 1
+     timeout_per_scenario: int = 3600  # seconds
+     retries: int = 1
+
+     # Tool settings
+     zen_config: Dict[str, Any] = field(default_factory=dict)
+     enable_competitor_comparison: bool = False
+     competitors: List[str] = field(default_factory=list)
+
+     # Output settings
+     output_dir: str = "benchmark_results"
+     save_raw_output: bool = True
+     generate_charts: bool = True
+     generate_markdown_report: bool = True
+
+     # Historical tracking
+     track_history: bool = True
+     history_file: str = "benchmark_history.json"
+
+     def __post_init__(self):
+         if not self.benchmark_id:
+             self.benchmark_id = str(uuid.uuid4())[:8]
+         if not self.benchmark_name:
+             self.benchmark_name = f"benchmark_{self.benchmark_id}"
+
+
+ @dataclass
+ class ScenarioResult:
+     """Result of running a single scenario."""
+
+     scenario_id: str
+     status: BenchmarkStatus
+     benchmark_id: str
+
+     # Timing
+     start_time: Optional[datetime] = None
+     end_time: Optional[datetime] = None
+
+     # Results
+     metrics: Optional[BenchmarkMetrics] = None
+     error_message: Optional[str] = None
+     error_traceback: Optional[str] = None
+
+     # Comparison
+     comparison_result: Optional[Any] = None
+
+     @property
+     def duration_seconds(self) -> float:
+         """Calculate duration in seconds."""
+         if self.start_time and self.end_time:
+             return (self.end_time - self.start_time).total_seconds()
+         return 0.0
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary."""
+         return {
+             "scenario_id": self.scenario_id,
+             "status": self.status.name,
+             "benchmark_id": self.benchmark_id,
+             "start_time": self.start_time.isoformat() if self.start_time else None,
+             "end_time": self.end_time.isoformat() if self.end_time else None,
+             "duration_seconds": self.duration_seconds,
+             "metrics": self.metrics.to_dict() if self.metrics else None,
+             "error_message": self.error_message
+         }
+
+
+ @dataclass
+ class BenchmarkReport:
+     """Complete benchmark report."""
+
+     # Identification
+     benchmark_id: str
+     benchmark_name: str
+     tool_version: str
+
+     # Timing
+     start_time: datetime
+     end_time: Optional[datetime] = None
+
+     # Configuration
+     config: BenchmarkConfig = field(default_factory=BenchmarkConfig)
+
+     # Results
+     scenario_results: List[ScenarioResult] = field(default_factory=list)
+
+     # Aggregated metrics
+     aggregate_metrics: Optional[Dict[str, Any]] = None
+     historical_comparison: Optional[Dict[str, Any]] = None
+
+     @property
+     def duration_seconds(self) -> float:
+         """Calculate total duration."""
+         if self.end_time:
+             return (self.end_time - self.start_time).total_seconds()
+         return 0.0
+
+     @property
+     def success_rate(self) -> float:
+         """Calculate success rate."""
+         if not self.scenario_results:
+             return 0.0
+
+         successful = sum(
+             1 for r in self.scenario_results
+             if r.status == BenchmarkStatus.COMPLETED
+         )
+         return (successful / len(self.scenario_results)) * 100
+
+     @property
+     def scenarios_passed(self) -> int:
+         """Count passed scenarios."""
+         return sum(
+             1 for r in self.scenario_results
+             if r.status == BenchmarkStatus.COMPLETED
+         )
+
+     @property
+     def scenarios_failed(self) -> int:
+         """Count failed scenarios."""
+         return sum(
+             1 for r in self.scenario_results
+             if r.status == BenchmarkStatus.FAILED
+         )
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary."""
+         return {
+             "benchmark_id": self.benchmark_id,
+             "benchmark_name": self.benchmark_name,
+             "tool_version": self.tool_version,
+             "start_time": self.start_time.isoformat(),
+             "end_time": self.end_time.isoformat() if self.end_time else None,
+             "duration_seconds": self.duration_seconds,
+             "config": {
+                 "max_concurrent": self.config.max_concurrent,
+                 "timeout_per_scenario": self.config.timeout_per_scenario,
+                 "scenarios": self.config.scenarios
+             },
+             "summary": {
+                 "total_scenarios": len(self.scenario_results),
+                 "passed": self.scenarios_passed,
+                 "failed": self.scenarios_failed,
+                 "success_rate": self.success_rate
+             },
+             "scenario_results": [r.to_dict() for r in self.scenario_results],
+             "aggregate_metrics": self.aggregate_metrics,
+             "historical_comparison": self.historical_comparison
+         }
+
+     def to_json(self, indent: int = 2) -> str:
+         """Convert to JSON string."""
+         return json.dumps(self.to_dict(), indent=indent, default=str)
+
+     def generate_markdown(self) -> str:
+         """Generate markdown report."""
+         lines = [
+             f"# Benchmark Report: {self.benchmark_name}",
+             "",
+             f"**Benchmark ID:** `{self.benchmark_id}`",
+             f"**Tool Version:** {self.tool_version}",
+             f"**Date:** {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}",
+             f"**Duration:** {self.duration_seconds:.1f} seconds",
+             "",
+             "## Summary",
+             "",
+             f"- **Total Scenarios:** {len(self.scenario_results)}",
+             f"- **Passed:** {self.scenarios_passed} ✅",
+             f"- **Failed:** {self.scenarios_failed} ❌",
+             f"- **Success Rate:** {self.success_rate:.1f}%",
+             "",
+             "## Scenario Results",
+             "",
+             "| Scenario | Status | Duration | Precision | Recall | F1-Score |",
+             "|----------|--------|----------|-----------|--------|----------|"
+         ]
+
+         for result in self.scenario_results:
+             status_emoji = {
+                 BenchmarkStatus.COMPLETED: "✅",
+                 BenchmarkStatus.FAILED: "❌",
+                 BenchmarkStatus.TIMEOUT: "⏱️",
+                 BenchmarkStatus.CANCELLED: "🚫"
+             }.get(result.status, "❓")
+
+             metrics = result.metrics
+             if metrics:
+                 scores = metrics.calculate_aggregate_scores()
+                 lines.append(
+                     f"| {result.scenario_id} | {status_emoji} {result.status.name} | "
+                     f"{result.duration_seconds:.1f}s | "
+                     f"{scores.get('precision', 0):.3f} | "
+                     f"{scores.get('recall', 0):.3f} | "
+                     f"{scores.get('f1_score', 0):.3f} |"
+                 )
+             else:
+                 lines.append(
+                     f"| {result.scenario_id} | {status_emoji} {result.status.name} | "
+                     f"{result.duration_seconds:.1f}s | N/A | N/A | N/A |"
+                 )
+
+         lines.extend([
+             "",
+             "## Aggregate Metrics",
+             ""
+         ])
+
+         if self.aggregate_metrics:
+             for key, value in self.aggregate_metrics.items():
+                 if isinstance(value, float):
+                     lines.append(f"- **{key.replace('_', ' ').title()}:** {value:.3f}")
+                 else:
+                     lines.append(f"- **{key.replace('_', ' ').title()}:** {value}")
+
+         lines.extend([
+             "",
+             "## Detailed Results",
+             ""
+         ])
+
+         for result in self.scenario_results:
+             lines.extend([
+                 f"### {result.scenario_id}",
+                 "",
+                 f"**Status:** {result.status.name}",
+                 f"**Duration:** {result.duration_seconds:.1f} seconds",
+                 ""
+             ])
+
+             if result.metrics:
+                 scores = result.metrics.calculate_aggregate_scores()
+                 lines.extend([
+                     "**Scores:**",
+                     f"- Accuracy: {scores.get('accuracy', 0):.3f}",
+                     f"- Precision: {scores.get('precision', 0):.3f}",
+                     f"- Recall: {scores.get('recall', 0):.3f}",
+                     f"- F1-Score: {scores.get('f1_score', 0):.3f}",
+                     f"- Overall: {scores.get('overall', 0):.3f}",
+                     ""
+                 ])
+
+             if result.error_message:
+                 lines.extend([
+                     "**Error:**",
+                     f"```",
+                     f"{result.error_message}",
+                     f"```",
+                     ""
+                 ])
+
+         return "\n".join(lines)
+
+
+ class BenchmarkEngine:
+     """Main benchmark engine for Zen-AI-Pentest."""
+
+     def __init__(
+         self,
+         orchestrator: Optional[PentestOrchestrator] = None,
+         output_dir: str = "benchmark_results"
+     ):
+         self.orchestrator = orchestrator
+         self.output_dir = Path(output_dir)
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+
+         self.comparison_framework = ComparisonFramework()
+         self.metrics_aggregator = MetricsAggregator()
+
+         self._current_benchmark: Optional[BenchmarkReport] = None
+         self._cancelled = False
+
+         # Get version info
+         self.tool_version = self._get_tool_version()
+
+         logger.info(f"BenchmarkEngine initialized (output: {self.output_dir})")
+
+     def _get_tool_version(self) -> str:
+         """Get current tool version."""
+         try:
+             # Try to get from package metadata
+             import pkg_resources
+             return pkg_resources.get_distribution("zen-ai-pentest").version
+         except:
+             # Fallback to git commit or static version
+             try:
+                 from .. import __version__
+                 return __version__
+             except:
+                 return "dev"
+
+     def _select_scenarios(self, config: BenchmarkConfig) -> List[TestScenario]:
+         """Select scenarios based on configuration."""
+         scenarios = []
+
+         if config.scenarios:
+             # Explicit scenario IDs
+             for scenario_id in config.scenarios:
+                 scenario = get_scenario(scenario_id)
+                 if scenario:
+                     scenarios.append(scenario)
+                 else:
+                     logger.warning(f"Unknown scenario: {scenario_id}")
+         else:
+             # Filter by type/difficulty/tags
+             for scenario in ALL_SCENARIOS.values():
+                 include = True
+
+                 if config.scenario_types:
+                     include &= scenario.scenario_type in config.scenario_types
+
+                 if config.difficulty_levels:
+                     include &= scenario.difficulty in config.difficulty_levels
+
+                 if config.tags:
+                     include &= any(tag in scenario.tags for tag in config.tags)
+
+                 if include:
+                     scenarios.append(scenario)
+
+         return scenarios
+
+     async def _run_single_scenario(
+         self,
+         scenario: TestScenario,
+         config: BenchmarkConfig
+     ) -> ScenarioResult:
+         """Run a single benchmark scenario."""
+
+         result = ScenarioResult(
+             scenario_id=scenario.id,
+             status=BenchmarkStatus.PENDING,
+             benchmark_id=config.benchmark_id or "",
+             start_time=datetime.utcnow()
+         )
+
+         logger.info(f"Starting scenario: {scenario.name}")
+         result.status = BenchmarkStatus.RUNNING
+
+         try:
+             # Setup scenario (e.g., start docker containers)
+             await self._setup_scenario(scenario)
+
+             # Wait for target to be ready
+             await self._wait_for_target(scenario)
+
+             # Run the actual scan
+             metrics = await self._execute_scan(scenario, config)
+             result.metrics = metrics
+
+             # Run competitor comparison if enabled
+             if config.enable_competitor_comparison:
+                 comparison = await self._run_comparison(scenario, config, metrics)
+                 result.comparison_result = comparison
+
+             result.status = BenchmarkStatus.COMPLETED
+             logger.info(f"Scenario completed: {scenario.name}")
+
+         except asyncio.TimeoutError:
+             result.status = BenchmarkStatus.TIMEOUT
+             result.error_message = f"Scenario timed out after {config.timeout_per_scenario}s"
+             logger.error(f"Scenario timeout: {scenario.name}")
+
+         except Exception as e:
+             result.status = BenchmarkStatus.FAILED
+             result.error_message = str(e)
+             result.error_traceback = traceback.format_exc()
+             logger.error(f"Scenario failed: {scenario.name} - {e}")
+
+         finally:
+             # Teardown scenario
+             await self._teardown_scenario(scenario)
+             result.end_time = datetime.utcnow()
+
+         return result
+
+     async def _setup_scenario(self, scenario: TestScenario) -> None:
+         """Setup a scenario (start containers, etc.)."""
+         if scenario.docker_compose_file:
+             logger.info(f"Starting docker-compose for {scenario.id}")
+             # In real implementation, write compose file and start
+             # subprocess.run(["docker-compose", "-f", "docker-compose.yml", "up", "-d"])
+
+         # Run setup commands
+         for cmd in scenario.setup_commands:
+             logger.info(f"Running setup command: {cmd}")
+             # subprocess.run(cmd, shell=True)
+
+         # Give services time to start
+         await asyncio.sleep(5)
+
+     async def _teardown_scenario(self, scenario: TestScenario) -> None:
+         """Teardown a scenario."""
+         if scenario.docker_compose_file:
+             logger.info(f"Stopping docker-compose for {scenario.id}")
+             # subprocess.run(["docker-compose", "down"])
+
+         for cmd in scenario.teardown_commands:
+             logger.info(f"Running teardown command: {cmd}")
+             # subprocess.run(cmd, shell=True)
+
+     async def _wait_for_target(
+         self,
+         scenario: TestScenario,
+         timeout: int = 120
+     ) -> bool:
+         """Wait for target to be ready."""
+         if not scenario.health_check_endpoint:
+             return True
+
+         logger.info(f"Waiting for target: {scenario.health_check_endpoint}")
+
+         import aiohttp
+         start_time = time.time()
+
+         while time.time() - start_time < timeout:
+             try:
+                 async with aiohttp.ClientSession() as session:
+                     async with session.get(
+                         scenario.health_check_endpoint,
+                         timeout=5
+                     ) as response:
+                         if response.status < 500:
+                             logger.info(f"Target ready: {scenario.id}")
+                             return True
+             except:
+                 pass
+
+             await asyncio.sleep(2)
+
+         raise TimeoutError(f"Target not ready after {timeout}s")
+
+     async def _execute_scan(
+         self,
+         scenario: TestScenario,
+         config: BenchmarkConfig
+     ) -> BenchmarkMetrics:
+         """Execute the actual scan and collect metrics."""
+
+         metrics = BenchmarkMetrics(
+             benchmark_id=config.benchmark_id or str(uuid.uuid4())[:8],
+             scenario_name=scenario.name,
+             tool_version=self.tool_version
+         )
+
+         # Initialize performance tracking
+         metrics.performance.scan_start_time = datetime.utcnow()
+         start_time = time.time()
+
+         # Check if orchestrator is available
+         if self.orchestrator is None:
+             logger.warning("No orchestrator available, using simulation mode")
+             # Simulate scan for testing
+             await self._simulate_scan(scenario, metrics)
+         else:
+             # Run actual scan
+             await self._run_real_scan(scenario, config, metrics)
+
+         # Record end time
+         metrics.performance.scan_end_time = datetime.utcnow()
+         metrics.performance.total_duration_ms = int(
+             (time.time() - start_time) * 1000
+         )
+
+         return metrics
+
+     async def _simulate_scan(
+         self,
+         scenario: TestScenario,
+         metrics: BenchmarkMetrics
+     ) -> None:
+         """Simulate a scan for testing purposes."""
+         logger.info(f"Running simulated scan for {scenario.id}")
+
+         # Simulate scan duration
+         await asyncio.sleep(2)
+
+         # Generate simulated findings based on expected vulnerabilities
+         true_positives = 0
+         false_positives = 0
+
+         for vuln in scenario.expected_vulnerabilities:
+             # Simulate detection (90% detection rate)
+             if hashlib.md5(vuln.vuln_type.encode()).hexdigest()[0] in '0123456789a':
+                 true_positives += 1
+                 metrics.findings.append(FindingMetrics(
+                     finding_type=FindingType.SQL_INJECTION,  # Simplified
+                     severity=SeverityLevel(vuln.severity) if vuln.severity in ["critical", "high", "medium", "low", "info"] else SeverityLevel.MEDIUM,
+                     confidence=0.85,
+                     exploitability=0.7 if vuln.exploit_available else 0.3,
+                     detection_time_ms=1500,
+                     verified=True,
+                     exploited=vuln.exploit_available
+                 ))
+
+         # Simulate some false positives
+         false_positives = 1 if len(scenario.expected_vulnerabilities) > 5 else 0
+
+         # Set classification metrics
+         expected_count = len(scenario.expected_vulnerabilities)
+         false_negatives = expected_count - true_positives
+
+         metrics.classification = ClassificationMetrics(
+             true_positives=true_positives,
+             false_positives=false_positives,
+             true_negatives=10,  # Assumed
+             false_negatives=false_negatives
+         )
+
+         # Coverage metrics
+         metrics.coverage = CoverageMetrics(
+             total_endpoints=10,
+             scanned_endpoints=9,
+             total_parameters=50,
+             tested_parameters=45,
+             total_attack_vectors=20,
+             tested_attack_vectors=18,
+             owasp_categories_covered=["A03:2021-Injection", "A01:2021-Broken Access Control"]
+         )
+
+         # Exploit metrics
+         metrics.exploit = ExploitMetrics(
+             total_exploits_attempted=true_positives,
+             successful_exploits=int(true_positives * 0.7),
+             failed_exploits=int(true_positives * 0.3),
+             blocked_exploits=0
+         )
+
+         # Token usage (simulated)
+         metrics.token_usage = TokenUsage(
+             prompt_tokens=5000,
+             completion_tokens=3000,
+             cost_usd=0.15,
+             model="gpt-4"
+         )
+
+     async def _run_real_scan(
+         self,
+         scenario: TestScenario,
+         config: BenchmarkConfig,
+         metrics: BenchmarkMetrics
+     ) -> None:
+         """Run a real scan using the orchestrator."""
+         # This would integrate with the actual PentestOrchestrator
+         # For now, delegate to simulation
+         await self._simulate_scan(scenario, metrics)
+
+     async def _run_comparison(
+         self,
+         scenario: TestScenario,
+         config: BenchmarkConfig,
+         zen_metrics: BenchmarkMetrics
+     ) -> Optional[Any]:
+         """Run comparison with competitor tools."""
+         if not config.competitors:
+             return None
+
+         logger.info(f"Running comparison for {scenario.id}")
+
+         # Convert zen metrics to tool result
+         scores = zen_metrics.calculate_aggregate_scores()
+         zen_result = ToolBenchmarkResult(
+             tool_metadata=ToolMetadata(
+                 name="Zen-AI-Pentest",
+                 version=self.tool_version,
+                 vendor="Zen-AI",
+                 category=ToolCategory.AI_PENTEST,
+                 license_type="open_source"
+             ),
+             scenario_id=scenario.id,
+             scan_duration_seconds=zen_metrics.performance.duration_seconds,
+             vulnerabilities_found=len(zen_metrics.findings),
+             true_positives=zen_metrics.classification.true_positives,
+             false_positives=zen_metrics.classification.false_positives,
+             false_negatives=zen_metrics.classification.false_negatives,
+             precision=scores["precision"],
+             recall=scores["recall"],
+             f1_score=scores["f1_score"],
+             accuracy=scores["accuracy"],
+             total_cost_usd=zen_metrics.token_usage.cost_usd,
+             tokens_used=zen_metrics.token_usage.total_tokens
+         )
+
+         scenario_config = {
+             "scenario_id": scenario.id,
+             "target_url": scenario.target_url,
+             "target_host": scenario.target_host,
+             "target_port": scenario.target_port
+         }
+
+         comparison = await self.comparison_framework.run_comparison(
+             zen_result,
+             scenario_config,
+             config.competitors
+         )
+
+         return comparison
+
+     async def run_benchmark(self, config: BenchmarkConfig) -> BenchmarkReport:
+         """Run a complete benchmark suite."""
+
+         self._cancelled = False
+
+         # Create report
+         report = BenchmarkReport(
+             benchmark_id=config.benchmark_id or str(uuid.uuid4())[:8],
+             benchmark_name=config.benchmark_name,
+             tool_version=self.tool_version,
+             start_time=datetime.utcnow(),
+             config=config
+         )
+
+         self._current_benchmark = report
+
+         # Select scenarios
+         scenarios = self._select_scenarios(config)
+         logger.info(f"Selected {len(scenarios)} scenarios for benchmarking")
+
+         if not scenarios:
+             logger.warning("No scenarios selected!")
+             report.end_time = datetime.utcnow()
+             return report
+
+         # Run scenarios with concurrency control
+         semaphore = asyncio.Semaphore(config.max_concurrent)
+
+         async def run_with_semaphore(scenario: TestScenario) -> ScenarioResult:
+             async with semaphore:
+                 if self._cancelled:
+                     result = ScenarioResult(
+                         scenario_id=scenario.id,
+                         status=BenchmarkStatus.CANCELLED,
+                         benchmark_id=config.benchmark_id or ""
+                     )
+                     return result
+
+                 return await self._run_single_scenario(scenario, config)
+
+         # Run all scenarios
+         results = await asyncio.gather(*[
+             run_with_semaphore(s) for s in scenarios
+         ])
+
+         report.scenario_results = list(results)
+         report.end_time = datetime.utcnow()
+
+         # Calculate aggregate metrics
+         report.aggregate_metrics = self._calculate_aggregates(report)
+
+         # Compare with history
+         if config.track_history:
+             report.historical_comparison = self._compare_with_history(report)
+             self._save_to_history(report)
+
+         # Save report
+         self._save_report(report, config)
+
+         logger.info(f"Benchmark completed: {report.benchmark_id}")
+         logger.info(f"Success rate: {report.success_rate:.1f}%")
+
+         return report
+
+     def _calculate_aggregates(self, report: BenchmarkReport) -> Dict[str, Any]:
+         """Calculate aggregate metrics across all scenarios."""
+
+         completed = [
+             r for r in report.scenario_results
+             if r.status == BenchmarkStatus.COMPLETED and r.metrics
+         ]
+
+         if not completed:
+             return {}
+
+         all_scores = [
+             r.metrics.calculate_aggregate_scores()
+             for r in completed
+         ]
+
+         # Average scores
+         aggregate = {}
+         keys = all_scores[0].keys()
+         for key in keys:
+             values = [s[key] for s in all_scores]
+             aggregate[f"avg_{key}"] = sum(values) / len(values)
+
+         # Total findings
+         aggregate["total_findings"] = sum(
+             len(r.metrics.findings) for r in completed
+         )
+
+         # Total duration
+         aggregate["total_duration_seconds"] = sum(
+             r.duration_seconds for r in completed
+         )
+
+         # Average duration per scenario
+         aggregate["avg_duration_seconds"] = (
+             aggregate["total_duration_seconds"] / len(completed)
+         )
+
+         # Token usage
+         aggregate["total_tokens"] = sum(
+             r.metrics.token_usage.total_tokens for r in completed
+         )
+         aggregate["total_cost_usd"] = sum(
+             r.metrics.token_usage.cost_usd for r in completed
+         )
+
+         return aggregate
+
+     def _compare_with_history(
+         self,
+         report: BenchmarkReport
+     ) -> Optional[Dict[str, Any]]:
+         """Compare current benchmark with historical results."""
+
+         history_file = self.output_dir / report.config.history_file
+
+         if not history_file.exists():
+             return None
+
+         try:
+             with open(history_file) as f:
+                 history = json.load(f)
+
+             # Find previous runs of same scenarios
+             comparable = [
+                 h for h in history
+                 if h.get("scenarios") == report.config.scenarios
+                 and h["benchmark_id"] != report.benchmark_id
+             ]
+
+             if not comparable:
+                 return None
+
+             # Get most recent
+             previous = max(comparable, key=lambda h: h.get("timestamp", ""))
+
+             # Compare aggregate metrics
+             comparison = {}
+             current_metrics = report.aggregate_metrics or {}
+             previous_metrics = previous.get("aggregate_metrics", {})
+
+             for key in current_metrics.keys():
+                 if key in previous_metrics and previous_metrics[key] != 0:
+                     change = ((current_metrics[key] - previous_metrics[key])
+                               / previous_metrics[key] * 100)
+                     comparison[key] = {
+                         "current": current_metrics[key],
+                         "previous": previous_metrics[key],
+                         "change_percent": change
+                     }
+
+             return comparison
+
+         except Exception as e:
+             logger.error(f"Error comparing with history: {e}")
+             return None
+
+     def _save_to_history(self, report: BenchmarkReport) -> None:
+         """Save benchmark to history file."""
+
+         history_file = self.output_dir / report.config.history_file
+
+         history = []
+         if history_file.exists():
+             try:
+                 with open(history_file) as f:
+                     history = json.load(f)
+             except:
+                 pass
+
+         # Add current report
+         history.append({
+             "benchmark_id": report.benchmark_id,
+             "timestamp": report.start_time.isoformat(),
+             "scenarios": report.config.scenarios,
+             "success_rate": report.success_rate,
+             "aggregate_metrics": report.aggregate_metrics
+         })
+
+         # Keep only last 100 entries
+         history = history[-100:]
+
+         with open(history_file, 'w') as f:
+             json.dump(history, f, indent=2)
+
+     def _save_report(
+         self,
+         report: BenchmarkReport,
+         config: BenchmarkConfig
+     ) -> None:
+         """Save benchmark report to disk."""
+
+         benchmark_dir = self.output_dir / report.benchmark_id
+         benchmark_dir.mkdir(exist_ok=True)
+
+         # Save JSON report
+         json_file = benchmark_dir / "report.json"
+         with open(json_file, 'w') as f:
+             f.write(report.to_json())
+
+         # Save markdown report
+         if config.generate_markdown_report:
+             md_file = benchmark_dir / "report.md"
+             with open(md_file, 'w') as f:
+                 f.write(report.generate_markdown())
+
+         logger.info(f"Report saved to: {benchmark_dir}")
+
+     def cancel_benchmark(self) -> None:
+         """Cancel the current benchmark."""
+         self._cancelled = True
+         logger.info("Benchmark cancellation requested")
+
+     def get_scenario_list(self) -> List[Dict[str, Any]]:
+         """Get list of all available scenarios."""
+         from .scenarios import list_all_scenarios
+         return list_all_scenarios()
+
+     def get_benchmark_history(
+         self,
+         limit: int = 10
+     ) -> List[Dict[str, Any]]:
+         """Get benchmark history."""
+
+         history_file = self.output_dir / "benchmark_history.json"
+
+         if not history_file.exists():
+             return []
+
+         try:
+             with open(history_file) as f:
+                 history = json.load(f)
+             return history[-limit:]
+         except:
+             return []
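
For orientation only, below is a minimal usage sketch pieced together from the `BenchmarkConfig` and `BenchmarkEngine` definitions in the diff above; it is not documented usage shipped with the package. The top-level `benchmarks` import path is an assumption based on the file list and the 10-entry `top_level.txt`, and the scenario ID is a hypothetical placeholder (real IDs would come from `benchmarks/scenarios.py`, which is not shown here).

```python
# Sketch only: inferred from benchmark_engine.py above, not from package documentation.
import asyncio

# Assumes the wheel installs "benchmarks" as a top-level package, as the file list suggests.
from benchmarks.benchmark_engine import BenchmarkConfig, BenchmarkEngine


async def main() -> None:
    config = BenchmarkConfig(
        benchmark_name="smoke_test",
        scenarios=["example_scenario"],  # hypothetical ID; see benchmarks/scenarios.py
        max_concurrent=1,
        timeout_per_scenario=600,
    )
    # No orchestrator is passed, so _execute_scan falls back to the simulation path.
    engine = BenchmarkEngine(output_dir="benchmark_results")
    report = await engine.run_benchmark(config)
    print(f"success rate: {report.success_rate:.1f}%")
    print(f"aggregates: {report.aggregate_metrics}")


if __name__ == "__main__":
    asyncio.run(main())
```

Per the code above, the same run also writes `report.json` (and `report.md` when `generate_markdown_report` is enabled) under `benchmark_results/<benchmark_id>/` and appends a summary to `benchmark_history.json`.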