zen-ai-pentest 2.2.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
benchmarks/__init__.py CHANGED
@@ -1,149 +1,19 @@
  """
- Zen-AI-Pentest Benchmarking & Testing Framework
+ Zen-AI-Pentest Benchmarks Package

- Comprehensive benchmark suite for evaluating security testing performance.
+ Performance benchmarking suite for measuring scan speed,
+ agent decision time, and API response times.
  """

- __version__ = "1.0.0"
- __author__ = "Zen-AI-Pentest Team"
-
- # Main components
- from .benchmark_engine import (
-     BenchmarkEngine,
-     BenchmarkConfig,
-     BenchmarkReport,
-     BenchmarkStatus,
-     ScenarioResult
- )
-
- from .metrics import (
-     BenchmarkMetrics,
-     ClassificationMetrics,
-     CoverageMetrics,
-     PerformanceMetrics,
-     ExploitMetrics,
-     TokenUsage,
-     FindingMetrics,
-     SeverityLevel,
-     FindingType,
-     MetricsAggregator,
-     compare_metrics,
-     calculate_confidence_interval
- )
-
- from .scenarios import (
-     TestScenario,
-     ScenarioType,
-     DifficultyLevel,
-     VulnerabilityProfile,
-     get_scenario,
-     get_scenarios_by_type,
-     get_scenarios_by_difficulty,
-     get_scenarios_by_tag,
-     list_all_scenarios,
-     create_benchmark_suite,
-     ALL_SCENARIOS,
-     # Pre-defined scenarios
-     OWASP_JUICE_SHOP,
-     DVWA_SCENARIO,
-     METASPLOITABLE2_SCENARIO,
-     METASPLOITABLE3_SCENARIO,
-     WEBGOAT_SCENARIO,
-     HTB_STARTING_POINT_TIER1,
-     THM_OWASP_TOP10
- )
-
- from .comparison import (
-     ComparisonFramework,
-     ComparisonResult,
-     CompetitorTool,
-     ToolMetadata,
-     ToolCapabilities,
-     ToolCategory,
-     ToolBenchmarkResult,
-     PentestGPTCompetitor,
-     AutoPentestDRLCompetitor,
-     PENTESTGPT_METADATA,
-     AUTOPENTEST_METADATA,
-     NESSUS_METADATA,
-     OPENVAS_METADATA,
-     BURP_SUITE_METADATA,
-     OWASP_ZAP_METADATA,
-     NIKTO_METADATA,
-     NUCLEI_METADATA,
-     SQLMAP_METADATA
- )
-
- from .ci_benchmark import (
-     CIBenchmarkRunner,
-     CIConfig,
-     PerformanceGate,
-     GateResult,
-     RegressionCheck,
-     RegressionSeverity
- )
+ from .scan_performance import ScanPerformanceBenchmark, measure_scan_speed
+ from .agent_performance import AgentPerformanceBenchmark, measure_agent_decision_time
+ from .api_performance import APIPerformanceBenchmark, measure_api_response_time

  __all__ = [
-     # Engine
-     "BenchmarkEngine",
-     "BenchmarkConfig",
-     "BenchmarkReport",
-     "BenchmarkStatus",
-     "ScenarioResult",
-
-     # Metrics
-     "BenchmarkMetrics",
-     "ClassificationMetrics",
-     "CoverageMetrics",
-     "PerformanceMetrics",
-     "ExploitMetrics",
-     "TokenUsage",
-     "FindingMetrics",
-     "SeverityLevel",
-     "FindingType",
-     "MetricsAggregator",
-
-     # Scenarios
-     "TestScenario",
-     "ScenarioType",
-     "DifficultyLevel",
-     "VulnerabilityProfile",
-     "get_scenario",
-     "get_scenarios_by_type",
-     "get_scenarios_by_difficulty",
-     "get_scenarios_by_tag",
-     "list_all_scenarios",
-     "create_benchmark_suite",
-
-     # Comparison
-     "ComparisonFramework",
-     "ComparisonResult",
-     "CompetitorTool",
-     "ToolMetadata",
-     "ToolCapabilities",
-     "ToolCategory",
-     "ToolBenchmarkResult",
-
-     # CI/CD
-     "CIBenchmarkRunner",
-     "CIConfig",
-     "PerformanceGate",
-     "GateResult",
-     "RegressionCheck",
-     "RegressionSeverity",
+     "ScanPerformanceBenchmark",
+     "AgentPerformanceBenchmark",
+     "APIPerformanceBenchmark",
+     "measure_scan_speed",
+     "measure_agent_decision_time",
+     "measure_api_response_time",
  ]
-
-
- def get_version() -> str:
-     """Get framework version."""
-     return __version__
-
-
- def get_available_scenarios() -> list:
-     """Get list of all available scenario IDs."""
-     return list(ALL_SCENARIOS.keys())
-
-
- def create_default_engine(output_dir: str = "benchmark_results") -> BenchmarkEngine:
-     """Create a benchmark engine with default configuration."""
-     return BenchmarkEngine(output_dir=output_dir)
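
For orientation, a minimal sketch of the new 2.3.0 import surface defined above, assuming the package is importable as benchmarks; only the agent_performance module appears later in this diff, the scan and API modules are not shown here:

    from benchmarks import (
        ScanPerformanceBenchmark,
        AgentPerformanceBenchmark,
        APIPerformanceBenchmark,
        measure_scan_speed,
        measure_agent_decision_time,
        measure_api_response_time,
    )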
benchmarks/agent_performance.py ADDED
@@ -0,0 +1,374 @@
+ """
+ Agent Performance Benchmarks
+
+ Measures agent decision-making time, ReAct loop iterations, and tool selection speed.
+ """
+
+ import asyncio
+ import time
+ from typing import List, Dict, Any, Optional, Callable
+ from dataclasses import dataclass
+ from enum import Enum
+
+ import sys
+ from pathlib import Path
+ sys.path.insert(0, str(Path(__file__).parent.parent))
+
+ from modules.benchmark import (
+     BenchmarkRunner, BenchmarkResult, BenchmarkCategory,
+     TimingMetrics
+ )
+
+
+ class AgentTaskType(Enum):
+     """Types of agent tasks for benchmarking."""
+     RECONNAISSANCE = "reconnaissance"
+     VULNERABILITY_ANALYSIS = "vulnerability_analysis"
+     EXPLOIT_SELECTION = "exploit_selection"
+     REPORT_GENERATION = "report_generation"
+
+
+ @dataclass
+ class AgentBenchmarkConfig:
+     """Configuration for agent benchmarks."""
+     iterations: int = 10
+     task_type: AgentTaskType = AgentTaskType.VULNERABILITY_ANALYSIS
+     complexity: str = "medium"  # simple, medium, complex
+     max_react_iterations: int = 5
+
+
+ class AgentPerformanceBenchmark:
+     """Benchmark suite for AI agent performance measurement."""
+
+     def __init__(self, output_dir: str = "benchmark_results"):
+         self.runner = BenchmarkRunner(output_dir=output_dir)
+         self.config = AgentBenchmarkConfig()
+
+     def _get_test_task(self, task_type: AgentTaskType, complexity: str) -> Dict[str, Any]:
+         """Generate test tasks for benchmarking."""
+         tasks = {
+             AgentTaskType.RECONNAISSANCE: {
+                 "target": "example.com",
+                 "scope": "subdomains",
+                 "description": "Discover subdomains"
+             },
+             AgentTaskType.VULNERABILITY_ANALYSIS: {
+                 "target": "192.168.1.1",
+                 "scan_type": "full",
+                 "description": "Analyze vulnerabilities"
+             },
+             AgentTaskType.EXPLOIT_SELECTION: {
+                 "finding": {"type": "sqli", "severity": "high"},
+                 "target_info": {"os": "linux", "services": ["apache", "mysql"]},
+                 "description": "Select appropriate exploit"
+             },
+             AgentTaskType.REPORT_GENERATION: {
+                 "findings_count": 10 if complexity == "simple" else 50 if complexity == "medium" else 100,
+                 "format": "pdf",
+                 "description": "Generate security report"
+             }
+         }
+         return tasks.get(task_type, tasks[AgentTaskType.VULNERABILITY_ANALYSIS])
+
+     async def benchmark_decision_time(self, config: Optional[AgentBenchmarkConfig] = None) -> BenchmarkResult:
+         """
+         Benchmark agent decision-making time.
+
+         Returns:
+             BenchmarkResult with decision timing metrics
+         """
+         cfg = config or self.config
+         task = self._get_test_task(cfg.task_type, cfg.complexity)
+
+         async def make_decision():
+             # Simulate agent decision-making process
+             # This would integrate with actual agent in production
+             base_delay = 0.5 if cfg.complexity == "simple" else 1.0 if cfg.complexity == "medium" else 2.0
+             await asyncio.sleep(base_delay + (hash(str(task)) % 100) / 1000)
+             return {"decision": "scan", "confidence": 0.85}
+
+         result = await self.runner.run_benchmark(
+             name="agent_decision_time",
+             category=BenchmarkCategory.AGENT,
+             description=f"Agent decision time for {cfg.task_type.value} ({cfg.complexity})",
+             benchmark_func=make_decision,
+             iterations=cfg.iterations,
+             monitor_resources=True
+         )
+
+         result.custom_metrics["task_type"] = cfg.task_type.value
+         result.custom_metrics["complexity"] = cfg.complexity
+         result.custom_metrics["decisions_per_second"] = (
+             cfg.iterations / result.timing.duration_seconds
+             if result.timing.duration_seconds > 0 else 0
+         )
+
+         return result
+
+     async def benchmark_react_loop(self, config: Optional[AgentBenchmarkConfig] = None) -> BenchmarkResult:
+         """
+         Benchmark ReAct loop performance.
+
+         Returns:
+             BenchmarkResult with ReAct loop metrics
+         """
+         cfg = config or AgentBenchmarkConfig(
+             iterations=5,
+             max_react_iterations=5
+         )
+
+         async def run_react_loop():
+             # Simulate ReAct loop iterations
+             for iteration in range(cfg.max_react_iterations):
+                 # Thought
+                 await asyncio.sleep(0.2)
+                 # Action
+                 await asyncio.sleep(0.3)
+                 # Observation
+                 await asyncio.sleep(0.1)
+
+             return {"iterations": cfg.max_react_iterations, "completed": True}
+
+         result = await self.runner.run_benchmark(
+             name="react_loop_performance",
+             category=BenchmarkCategory.AGENT,
+             description=f"ReAct loop with {cfg.max_react_iterations} iterations",
+             benchmark_func=run_react_loop,
+             iterations=cfg.iterations,
+             monitor_resources=True
+         )
+
+         # Calculate per-iteration metrics
+         avg_per_iteration = result.timing.avg_ms / cfg.max_react_iterations
+
+         result.custom_metrics["max_iterations"] = cfg.max_react_iterations
+         result.custom_metrics["ms_per_iteration"] = avg_per_iteration
+         result.custom_metrics["iterations_per_second"] = (
+             1000 / avg_per_iteration if avg_per_iteration > 0 else 0
+         )
+
+         return result
+
+     async def benchmark_tool_selection(self, config: Optional[AgentBenchmarkConfig] = None) -> BenchmarkResult:
+         """
+         Benchmark tool selection speed.
+
+         Returns:
+             BenchmarkResult with tool selection metrics
+         """
+         cfg = config or self.config
+
+         test_scenarios = [
+             {"task": "port_scan", "tools_available": 5},
+             {"task": "web_enum", "tools_available": 8},
+             {"task": "vuln_check", "tools_available": 12},
+             {"task": "exploit", "tools_available": 6},
+         ]
+
+         async def select_tools():
+             for scenario in test_scenarios:
+                 # Simulate tool selection decision
+                 await asyncio.sleep(0.15 * scenario["tools_available"])
+             return {"selections": len(test_scenarios)}
+
+         result = await self.runner.run_benchmark(
+             name="tool_selection_speed",
+             category=BenchmarkCategory.AGENT,
+             description="AI tool selection performance",
+             benchmark_func=select_tools,
+             iterations=cfg.iterations,
+             monitor_resources=True
+         )
+
+         result.custom_metrics["scenarios_tested"] = len(test_scenarios)
+         result.custom_metrics["selections_per_second"] = (
+             (len(test_scenarios) * cfg.iterations) / result.timing.duration_seconds
+             if result.timing.duration_seconds > 0 else 0
+         )
+
+         return result
+
+     async def benchmark_memory_usage(self, config: Optional[AgentBenchmarkConfig] = None) -> BenchmarkResult:
+         """
+         Benchmark agent memory usage during operations.
+
+         Returns:
+             BenchmarkResult with memory metrics
+         """
+         cfg = config or AgentBenchmarkConfig(complexity="complex")
+
+         async def agent_operation():
+             # Simulate agent operation that builds context
+             context_size = 100 if cfg.complexity == "simple" else 500 if cfg.complexity == "medium" else 1000
+
+             context = []
+             for i in range(context_size):
+                 context.append({
+                     "step": i,
+                     "observation": f"Observation {i}",
+                     "action": f"Action {i}",
+                     "result": f"Result data for step {i}" * 10
+                 })
+                 if i % 100 == 0:
+                     await asyncio.sleep(0.01)
+
+             return {"context_size": len(context)}
+
+         result = await self.runner.run_benchmark(
+             name="agent_memory_usage",
+             category=BenchmarkCategory.AGENT,
+             description=f"Agent memory usage ({cfg.complexity} context)",
+             benchmark_func=agent_operation,
+             iterations=3,
+             monitor_resources=True
+         )
+
+         result.custom_metrics["complexity"] = cfg.complexity
+         result.custom_metrics["memory_per_100_context"] = (
+             result.memory.peak_mb / 10 if cfg.complexity == "complex" else
+             result.memory.peak_mb / 5 if cfg.complexity == "medium" else
+             result.memory.peak_mb
+         )
+
+         return result
+
+     async def benchmark_reasoning_quality(self, config: Optional[AgentBenchmarkConfig] = None) -> BenchmarkResult:
+         """
+         Benchmark reasoning quality vs time tradeoff.
+
+         Returns:
+             BenchmarkResult with reasoning metrics
+         """
+         cfg = config or AgentBenchmarkConfig()
+
+         reasoning_depths = ["fast", "balanced", "thorough"]
+
+         async def reasoning_benchmark():
+             for depth in reasoning_depths:
+                 delay = {"fast": 0.5, "balanced": 1.0, "thorough": 2.0}[depth]
+                 await asyncio.sleep(delay)
+             return {"depths_tested": reasoning_depths}
+
+         result = await self.runner.run_benchmark(
+             name="reasoning_quality_time",
+             category=BenchmarkCategory.AGENT,
+             description="Reasoning quality vs time tradeoff",
+             benchmark_func=reasoning_benchmark,
+             iterations=cfg.iterations,
+             monitor_resources=True
+         )
+
+         result.custom_metrics["reasoning_depths"] = reasoning_depths
+         result.custom_metrics["avg_time_per_depth"] = (
+             result.timing.avg_ms / len(reasoning_depths)
+         )
+
+         return result
+
+     async def run_all(self) -> List[BenchmarkResult]:
+         """Run all agent performance benchmarks."""
+         results = []
+
+         print("Running agent performance benchmarks...")
+
+         benchmarks = [
+             ("Decision Time", self.benchmark_decision_time),
+             ("ReAct Loop", self.benchmark_react_loop),
+             ("Tool Selection", self.benchmark_tool_selection),
+             ("Memory Usage", self.benchmark_memory_usage),
+             ("Reasoning Quality", self.benchmark_reasoning_quality),
+         ]
+
+         for name, benchmark_func in benchmarks:
+             print(f" Running: {name}...")
+             try:
+                 result = await benchmark_func()
+                 results.append(result)
+                 self.runner.save_result(result)
+                 print(f" ✓ {name}: {result.timing.avg_ms:.2f}ms avg")
+             except Exception as e:
+                 print(f" ✗ {name} failed: {e}")
+
+         # Save combined results
+         self.runner.save_all_results("agent_benchmarks.json")
+
+         return results
+
+     def get_summary(self) -> Dict[str, Any]:
+         """Get summary of agent benchmark results."""
+         return self.runner.get_summary()
+
+
+ # Convenience function
+ async def measure_agent_decision_time(
+     decision_func: Callable,
+     iterations: int = 10,
+     output_dir: str = "benchmark_results"
+ ) -> BenchmarkResult:
+     """
+     Quick function to measure agent decision time.
+
+     Args:
+         decision_func: Async function for agent decision
+         iterations: Number of iterations
+         output_dir: Directory for results
+
+     Returns:
+         BenchmarkResult with timing metrics
+     """
+     runner = BenchmarkRunner(output_dir=output_dir)
+
+     result = await runner.run_benchmark(
+         name="agent_decision",
+         category=BenchmarkCategory.AGENT,
+         description="Agent decision time measurement",
+         benchmark_func=decision_func,
+         iterations=iterations,
+         monitor_resources=True
+     )
+
+     runner.save_result(result)
+     return result
332
+
333
+
334
+ # CLI interface
335
+ if __name__ == "__main__":
336
+ import argparse
337
+
338
+ parser = argparse.ArgumentParser(description="Agent Performance Benchmarks")
339
+ parser.add_argument("--output", default="benchmark_results", help="Output directory")
340
+ parser.add_argument("--iterations", type=int, default=10, help="Number of iterations")
341
+ parser.add_argument("--complexity", default="medium",
342
+ choices=["simple", "medium", "complex"],
343
+ help="Task complexity")
344
+
345
+ args = parser.parse_args()
346
+
347
+ async def main():
348
+ config = AgentBenchmarkConfig(
349
+ iterations=args.iterations,
350
+ complexity=args.complexity
351
+ )
352
+
353
+ benchmark = AgentPerformanceBenchmark(output_dir=args.output)
354
+ results = await benchmark.run_all()
355
+
356
+ print("\n" + "="*60)
357
+ print("AGENT PERFORMANCE BENCHMARK RESULTS")
358
+ print("="*60)
359
+
360
+ for result in results:
361
+ print(f"\n{result.name}:")
362
+ print(f" Avg Time: {result.timing.avg_ms:.2f}ms")
363
+ print(f" P95: {result.timing.p95_ms:.2f}ms")
364
+ print(f" Peak Memory: {result.memory.peak_mb:.2f} MB")
365
+
366
+ if result.custom_metrics:
367
+ print(" Custom Metrics:")
368
+ for key, value in result.custom_metrics.items():
369
+ if isinstance(value, float):
370
+ print(f" {key}: {value:.3f}")
371
+ else:
372
+ print(f" {key}: {value}")
373
+
374
+ asyncio.run(main())
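
A minimal usage sketch for the module above, based only on the code shown in this diff; the benchmarks.agent_performance module path and the behavior of the modules.benchmark runner are assumptions, not verified against the installed wheel:

    import asyncio
    from benchmarks.agent_performance import (
        AgentPerformanceBenchmark,
        measure_agent_decision_time,
    )

    async def demo():
        # Run the full agent suite and print the runner's summary.
        suite = AgentPerformanceBenchmark(output_dir="benchmark_results")
        await suite.run_all()
        print(suite.get_summary())

        # Or time a single custom decision coroutine.
        async def my_decision():
            await asyncio.sleep(0.1)  # stand-in for a real agent decision
            return {"decision": "scan"}

        result = await measure_agent_decision_time(my_decision, iterations=5)
        print(result.timing.avg_ms)

    asyncio.run(demo())

The same suite can also be run from a shell via the module's CLI shown above, e.g. python -m benchmarks.agent_performance --iterations 5 --complexity simple (invocation path assumed from the import in __init__.py).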