zen-ai-pentest 2.2.0-py3-none-any.whl → 2.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- api/auth.py +61 -7
- api/csrf_protection.py +286 -0
- api/main.py +77 -11
- api/rate_limiter.py +317 -0
- api/rate_limiter_v2.py +586 -0
- autonomous/ki_analysis_agent.py +1033 -0
- benchmarks/__init__.py +12 -142
- benchmarks/agent_performance.py +374 -0
- benchmarks/api_performance.py +479 -0
- benchmarks/scan_performance.py +272 -0
- modules/agent_coordinator.py +255 -0
- modules/api_key_manager.py +501 -0
- modules/benchmark.py +706 -0
- modules/cve_updater.py +303 -0
- modules/false_positive_filter.py +149 -0
- modules/output_formats.py +1088 -0
- modules/risk_scoring.py +206 -0
- {zen_ai_pentest-2.2.0.dist-info → zen_ai_pentest-2.3.0.dist-info}/METADATA +134 -289
- {zen_ai_pentest-2.2.0.dist-info → zen_ai_pentest-2.3.0.dist-info}/RECORD +23 -9
- {zen_ai_pentest-2.2.0.dist-info → zen_ai_pentest-2.3.0.dist-info}/WHEEL +0 -0
- {zen_ai_pentest-2.2.0.dist-info → zen_ai_pentest-2.3.0.dist-info}/entry_points.txt +0 -0
- {zen_ai_pentest-2.2.0.dist-info → zen_ai_pentest-2.3.0.dist-info}/licenses/LICENSE +0 -0
- {zen_ai_pentest-2.2.0.dist-info → zen_ai_pentest-2.3.0.dist-info}/top_level.txt +0 -0
benchmarks/__init__.py
CHANGED
@@ -1,149 +1,19 @@
 """
-Zen-AI-Pentest
+Zen-AI-Pentest Benchmarks Package
 
-
+Performance benchmarking suite for measuring scan speed,
+agent decision time, and API response times.
 """
 
-
-
-
-# Main components
-from .benchmark_engine import (
-    BenchmarkEngine,
-    BenchmarkConfig,
-    BenchmarkReport,
-    BenchmarkStatus,
-    ScenarioResult
-)
-
-from .metrics import (
-    BenchmarkMetrics,
-    ClassificationMetrics,
-    CoverageMetrics,
-    PerformanceMetrics,
-    ExploitMetrics,
-    TokenUsage,
-    FindingMetrics,
-    SeverityLevel,
-    FindingType,
-    MetricsAggregator,
-    compare_metrics,
-    calculate_confidence_interval
-)
-
-from .scenarios import (
-    TestScenario,
-    ScenarioType,
-    DifficultyLevel,
-    VulnerabilityProfile,
-    get_scenario,
-    get_scenarios_by_type,
-    get_scenarios_by_difficulty,
-    get_scenarios_by_tag,
-    list_all_scenarios,
-    create_benchmark_suite,
-    ALL_SCENARIOS,
-    # Pre-defined scenarios
-    OWASP_JUICE_SHOP,
-    DVWA_SCENARIO,
-    METASPLOITABLE2_SCENARIO,
-    METASPLOITABLE3_SCENARIO,
-    WEBGOAT_SCENARIO,
-    HTB_STARTING_POINT_TIER1,
-    THM_OWASP_TOP10
-)
-
-from .comparison import (
-    ComparisonFramework,
-    ComparisonResult,
-    CompetitorTool,
-    ToolMetadata,
-    ToolCapabilities,
-    ToolCategory,
-    ToolBenchmarkResult,
-    PentestGPTCompetitor,
-    AutoPentestDRLCompetitor,
-    PENTESTGPT_METADATA,
-    AUTOPENTEST_METADATA,
-    NESSUS_METADATA,
-    OPENVAS_METADATA,
-    BURP_SUITE_METADATA,
-    OWASP_ZAP_METADATA,
-    NIKTO_METADATA,
-    NUCLEI_METADATA,
-    SQLMAP_METADATA
-)
-
-from .ci_benchmark import (
-    CIBenchmarkRunner,
-    CIConfig,
-    PerformanceGate,
-    GateResult,
-    RegressionCheck,
-    RegressionSeverity
-)
+from .scan_performance import ScanPerformanceBenchmark, measure_scan_speed
+from .agent_performance import AgentPerformanceBenchmark, measure_agent_decision_time
+from .api_performance import APIPerformanceBenchmark, measure_api_response_time
 
 __all__ = [
-
-    "
-    "
-    "
-    "
-    "
-
-    # Metrics
-    "BenchmarkMetrics",
-    "ClassificationMetrics",
-    "CoverageMetrics",
-    "PerformanceMetrics",
-    "ExploitMetrics",
-    "TokenUsage",
-    "FindingMetrics",
-    "SeverityLevel",
-    "FindingType",
-    "MetricsAggregator",
-
-    # Scenarios
-    "TestScenario",
-    "ScenarioType",
-    "DifficultyLevel",
-    "VulnerabilityProfile",
-    "get_scenario",
-    "get_scenarios_by_type",
-    "get_scenarios_by_difficulty",
-    "get_scenarios_by_tag",
-    "list_all_scenarios",
-    "create_benchmark_suite",
-
-    # Comparison
-    "ComparisonFramework",
-    "ComparisonResult",
-    "CompetitorTool",
-    "ToolMetadata",
-    "ToolCapabilities",
-    "ToolCategory",
-    "ToolBenchmarkResult",
-
-    # CI/CD
-    "CIBenchmarkRunner",
-    "CIConfig",
-    "PerformanceGate",
-    "GateResult",
-    "RegressionCheck",
-    "RegressionSeverity",
+    "ScanPerformanceBenchmark",
+    "AgentPerformanceBenchmark",
+    "APIPerformanceBenchmark",
+    "measure_scan_speed",
+    "measure_agent_decision_time",
+    "measure_api_response_time",
 ]
-
-
-def get_version() -> str:
-    """Get framework version."""
-    return __version__
-
-
-def get_available_scenarios() -> list:
-    """Get list of all available scenario IDs."""
-    return list(ALL_SCENARIOS.keys())
-
-
-def create_default_engine(output_dir: str = "benchmark_results") -> BenchmarkEngine:
-    """Create a benchmark engine with default configuration."""
-    return BenchmarkEngine(output_dir=output_dir)
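
The rewritten `__init__.py` narrows the public surface from the old engine/metrics/scenario/comparison exports to three benchmark suites plus three convenience helpers. As a point of reference, a minimal import sketch against 2.3.0 follows; it assumes only that the wheel's top-level `benchmarks` package is importable, and the names are exactly the ones re-exported in the diff above.

```python
# Sketch of the 2.3.0 public surface of the benchmarks package.
# Imports written against 2.2.0 (e.g. `from benchmarks import BenchmarkEngine`)
# would now raise ImportError.
from benchmarks import (
    ScanPerformanceBenchmark,
    AgentPerformanceBenchmark,
    APIPerformanceBenchmark,
    measure_scan_speed,
    measure_agent_decision_time,
    measure_api_response_time,
)
```

Flattening the package to these re-exports also means importing `benchmarks` no longer pulls in the removed engine, scenario, and comparison machinery.
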
benchmarks/agent_performance.py
ADDED
@@ -0,0 +1,374 @@
+"""
+Agent Performance Benchmarks
+
+Measures agent decision-making time, ReAct loop iterations, and tool selection speed.
+"""
+
+import asyncio
+import time
+from typing import List, Dict, Any, Optional, Callable
+from dataclasses import dataclass
+from enum import Enum
+
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from modules.benchmark import (
+    BenchmarkRunner, BenchmarkResult, BenchmarkCategory,
+    TimingMetrics
+)
+
+
+class AgentTaskType(Enum):
+    """Types of agent tasks for benchmarking."""
+    RECONNAISSANCE = "reconnaissance"
+    VULNERABILITY_ANALYSIS = "vulnerability_analysis"
+    EXPLOIT_SELECTION = "exploit_selection"
+    REPORT_GENERATION = "report_generation"
+
+
+@dataclass
+class AgentBenchmarkConfig:
+    """Configuration for agent benchmarks."""
+    iterations: int = 10
+    task_type: AgentTaskType = AgentTaskType.VULNERABILITY_ANALYSIS
+    complexity: str = "medium"  # simple, medium, complex
+    max_react_iterations: int = 5
+
+
+class AgentPerformanceBenchmark:
+    """Benchmark suite for AI agent performance measurement."""
+
+    def __init__(self, output_dir: str = "benchmark_results"):
+        self.runner = BenchmarkRunner(output_dir=output_dir)
+        self.config = AgentBenchmarkConfig()
+
+    def _get_test_task(self, task_type: AgentTaskType, complexity: str) -> Dict[str, Any]:
+        """Generate test tasks for benchmarking."""
+        tasks = {
+            AgentTaskType.RECONNAISSANCE: {
+                "target": "example.com",
+                "scope": "subdomains",
+                "description": "Discover subdomains"
+            },
+            AgentTaskType.VULNERABILITY_ANALYSIS: {
+                "target": "192.168.1.1",
+                "scan_type": "full",
+                "description": "Analyze vulnerabilities"
+            },
+            AgentTaskType.EXPLOIT_SELECTION: {
+                "finding": {"type": "sqli", "severity": "high"},
+                "target_info": {"os": "linux", "services": ["apache", "mysql"]},
+                "description": "Select appropriate exploit"
+            },
+            AgentTaskType.REPORT_GENERATION: {
+                "findings_count": 10 if complexity == "simple" else 50 if complexity == "medium" else 100,
+                "format": "pdf",
+                "description": "Generate security report"
+            }
+        }
+        return tasks.get(task_type, tasks[AgentTaskType.VULNERABILITY_ANALYSIS])
+
+    async def benchmark_decision_time(self, config: Optional[AgentBenchmarkConfig] = None) -> BenchmarkResult:
+        """
+        Benchmark agent decision-making time.
+
+        Returns:
+            BenchmarkResult with decision timing metrics
+        """
+        cfg = config or self.config
+        task = self._get_test_task(cfg.task_type, cfg.complexity)
+
+        async def make_decision():
+            # Simulate agent decision-making process
+            # This would integrate with actual agent in production
+            base_delay = 0.5 if cfg.complexity == "simple" else 1.0 if cfg.complexity == "medium" else 2.0
+            await asyncio.sleep(base_delay + (hash(str(task)) % 100) / 1000)
+            return {"decision": "scan", "confidence": 0.85}
+
+        result = await self.runner.run_benchmark(
+            name="agent_decision_time",
+            category=BenchmarkCategory.AGENT,
+            description=f"Agent decision time for {cfg.task_type.value} ({cfg.complexity})",
+            benchmark_func=make_decision,
+            iterations=cfg.iterations,
+            monitor_resources=True
+        )
+
+        result.custom_metrics["task_type"] = cfg.task_type.value
+        result.custom_metrics["complexity"] = cfg.complexity
+        result.custom_metrics["decisions_per_second"] = (
+            cfg.iterations / result.timing.duration_seconds
+            if result.timing.duration_seconds > 0 else 0
+        )
+
+        return result
+
+    async def benchmark_react_loop(self, config: Optional[AgentBenchmarkConfig] = None) -> BenchmarkResult:
+        """
+        Benchmark ReAct loop performance.
+
+        Returns:
+            BenchmarkResult with ReAct loop metrics
+        """
+        cfg = config or AgentBenchmarkConfig(
+            iterations=5,
+            max_react_iterations=5
+        )
+
+        async def run_react_loop():
+            # Simulate ReAct loop iterations
+            for iteration in range(cfg.max_react_iterations):
+                # Thought
+                await asyncio.sleep(0.2)
+                # Action
+                await asyncio.sleep(0.3)
+                # Observation
+                await asyncio.sleep(0.1)
+
+            return {"iterations": cfg.max_react_iterations, "completed": True}
+
+        result = await self.runner.run_benchmark(
+            name="react_loop_performance",
+            category=BenchmarkCategory.AGENT,
+            description=f"ReAct loop with {cfg.max_react_iterations} iterations",
+            benchmark_func=run_react_loop,
+            iterations=cfg.iterations,
+            monitor_resources=True
+        )
+
+        # Calculate per-iteration metrics
+        avg_per_iteration = result.timing.avg_ms / cfg.max_react_iterations
+
+        result.custom_metrics["max_iterations"] = cfg.max_react_iterations
+        result.custom_metrics["ms_per_iteration"] = avg_per_iteration
+        result.custom_metrics["iterations_per_second"] = (
+            1000 / avg_per_iteration if avg_per_iteration > 0 else 0
+        )
+
+        return result
+
+    async def benchmark_tool_selection(self, config: Optional[AgentBenchmarkConfig] = None) -> BenchmarkResult:
+        """
+        Benchmark tool selection speed.
+
+        Returns:
+            BenchmarkResult with tool selection metrics
+        """
+        cfg = config or self.config
+
+        test_scenarios = [
+            {"task": "port_scan", "tools_available": 5},
+            {"task": "web_enum", "tools_available": 8},
+            {"task": "vuln_check", "tools_available": 12},
+            {"task": "exploit", "tools_available": 6},
+        ]
+
+        async def select_tools():
+            for scenario in test_scenarios:
+                # Simulate tool selection decision
+                await asyncio.sleep(0.15 * scenario["tools_available"])
+            return {"selections": len(test_scenarios)}
+
+        result = await self.runner.run_benchmark(
+            name="tool_selection_speed",
+            category=BenchmarkCategory.AGENT,
+            description="AI tool selection performance",
+            benchmark_func=select_tools,
+            iterations=cfg.iterations,
+            monitor_resources=True
+        )
+
+        result.custom_metrics["scenarios_tested"] = len(test_scenarios)
+        result.custom_metrics["selections_per_second"] = (
+            (len(test_scenarios) * cfg.iterations) / result.timing.duration_seconds
+            if result.timing.duration_seconds > 0 else 0
+        )
+
+        return result
+
+    async def benchmark_memory_usage(self, config: Optional[AgentBenchmarkConfig] = None) -> BenchmarkResult:
+        """
+        Benchmark agent memory usage during operations.
+
+        Returns:
+            BenchmarkResult with memory metrics
+        """
+        cfg = config or AgentBenchmarkConfig(complexity="complex")
+
+        async def agent_operation():
+            # Simulate agent operation that builds context
+            context_size = 100 if cfg.complexity == "simple" else 500 if cfg.complexity == "medium" else 1000
+
+            context = []
+            for i in range(context_size):
+                context.append({
+                    "step": i,
+                    "observation": f"Observation {i}",
+                    "action": f"Action {i}",
+                    "result": f"Result data for step {i}" * 10
+                })
+                if i % 100 == 0:
+                    await asyncio.sleep(0.01)
+
+            return {"context_size": len(context)}
+
+        result = await self.runner.run_benchmark(
+            name="agent_memory_usage",
+            category=BenchmarkCategory.AGENT,
+            description=f"Agent memory usage ({cfg.complexity} context)",
+            benchmark_func=agent_operation,
+            iterations=3,
+            monitor_resources=True
+        )
+
+        result.custom_metrics["complexity"] = cfg.complexity
+        result.custom_metrics["memory_per_100_context"] = (
+            result.memory.peak_mb / 10 if cfg.complexity == "complex" else
+            result.memory.peak_mb / 5 if cfg.complexity == "medium" else
+            result.memory.peak_mb
+        )
+
+        return result
+
+    async def benchmark_reasoning_quality(self, config: Optional[AgentBenchmarkConfig] = None) -> BenchmarkResult:
+        """
+        Benchmark reasoning quality vs time tradeoff.
+
+        Returns:
+            BenchmarkResult with reasoning metrics
+        """
+        cfg = config or AgentBenchmarkConfig()
+
+        reasoning_depths = ["fast", "balanced", "thorough"]
+
+        async def reasoning_benchmark():
+            for depth in reasoning_depths:
+                delay = {"fast": 0.5, "balanced": 1.0, "thorough": 2.0}[depth]
+                await asyncio.sleep(delay)
+            return {"depths_tested": reasoning_depths}
+
+        result = await self.runner.run_benchmark(
+            name="reasoning_quality_time",
+            category=BenchmarkCategory.AGENT,
+            description="Reasoning quality vs time tradeoff",
+            benchmark_func=reasoning_benchmark,
+            iterations=cfg.iterations,
+            monitor_resources=True
+        )
+
+        result.custom_metrics["reasoning_depths"] = reasoning_depths
+        result.custom_metrics["avg_time_per_depth"] = (
+            result.timing.avg_ms / len(reasoning_depths)
+        )
+
+        return result
+
+    async def run_all(self) -> List[BenchmarkResult]:
+        """Run all agent performance benchmarks."""
+        results = []
+
+        print("Running agent performance benchmarks...")
+
+        benchmarks = [
+            ("Decision Time", self.benchmark_decision_time),
+            ("ReAct Loop", self.benchmark_react_loop),
+            ("Tool Selection", self.benchmark_tool_selection),
+            ("Memory Usage", self.benchmark_memory_usage),
+            ("Reasoning Quality", self.benchmark_reasoning_quality),
+        ]
+
+        for name, benchmark_func in benchmarks:
+            print(f"  Running: {name}...")
+            try:
+                result = await benchmark_func()
+                results.append(result)
+                self.runner.save_result(result)
+                print(f"  ✓ {name}: {result.timing.avg_ms:.2f}ms avg")
+            except Exception as e:
+                print(f"  ✗ {name} failed: {e}")
+
+        # Save combined results
+        self.runner.save_all_results("agent_benchmarks.json")
+
+        return results
+
+    def get_summary(self) -> Dict[str, Any]:
+        """Get summary of agent benchmark results."""
+        return self.runner.get_summary()
+
+
+# Convenience function
+async def measure_agent_decision_time(
+    decision_func: Callable,
+    iterations: int = 10,
+    output_dir: str = "benchmark_results"
+) -> BenchmarkResult:
+    """
+    Quick function to measure agent decision time.
+
+    Args:
+        decision_func: Async function for agent decision
+        iterations: Number of iterations
+        output_dir: Directory for results
+
+    Returns:
+        BenchmarkResult with timing metrics
+    """
+    runner = BenchmarkRunner(output_dir=output_dir)
+
+    result = await runner.run_benchmark(
+        name="agent_decision",
+        category=BenchmarkCategory.AGENT,
+        description="Agent decision time measurement",
+        benchmark_func=decision_func,
+        iterations=iterations,
+        monitor_resources=True
+    )
+
+    runner.save_result(result)
+    return result
+
+
+# CLI interface
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Agent Performance Benchmarks")
+    parser.add_argument("--output", default="benchmark_results", help="Output directory")
+    parser.add_argument("--iterations", type=int, default=10, help="Number of iterations")
+    parser.add_argument("--complexity", default="medium",
+                        choices=["simple", "medium", "complex"],
+                        help="Task complexity")
+
+    args = parser.parse_args()
+
+    async def main():
+        config = AgentBenchmarkConfig(
+            iterations=args.iterations,
+            complexity=args.complexity
+        )
+
+        benchmark = AgentPerformanceBenchmark(output_dir=args.output)
+        results = await benchmark.run_all()
+
+        print("\n" + "="*60)
+        print("AGENT PERFORMANCE BENCHMARK RESULTS")
+        print("="*60)
+
+        for result in results:
+            print(f"\n{result.name}:")
+            print(f"  Avg Time: {result.timing.avg_ms:.2f}ms")
+            print(f"  P95: {result.timing.p95_ms:.2f}ms")
+            print(f"  Peak Memory: {result.memory.peak_mb:.2f} MB")
+
+            if result.custom_metrics:
+                print("  Custom Metrics:")
+                for key, value in result.custom_metrics.items():
+                    if isinstance(value, float):
+                        print(f"    {key}: {value:.3f}")
+                    else:
+                        print(f"    {key}: {value}")
+
+    asyncio.run(main())
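
For orientation, a short usage sketch of the new module follows. It assumes the package and its `modules.benchmark` dependency are installed and importable; `fake_decision` is a hypothetical stand-in for a real agent coroutine, and the attribute accesses (`result.name`, `result.timing.avg_ms`) mirror the CLI output code in the diff above.

```python
import asyncio

from benchmarks.agent_performance import (
    AgentPerformanceBenchmark,
    measure_agent_decision_time,
)


async def fake_decision():
    # Hypothetical stand-in for a real agent decision coroutine.
    await asyncio.sleep(0.1)
    return {"decision": "scan", "confidence": 0.9}


async def main():
    # Run the full suite: decision time, ReAct loop, tool selection,
    # memory usage, and reasoning quality.
    suite = AgentPerformanceBenchmark(output_dir="benchmark_results")
    for result in await suite.run_all():
        print(f"{result.name}: {result.timing.avg_ms:.2f} ms avg")

    # Time a custom decision coroutine via the convenience helper.
    single = await measure_agent_decision_time(fake_decision, iterations=5)
    print(f"{single.name}: {single.timing.avg_ms:.2f} ms avg")


if __name__ == "__main__":
    asyncio.run(main())
```

Both paths persist results through `BenchmarkRunner.save_result`; `run_all()` additionally writes the combined `agent_benchmarks.json` to the output directory.
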