zen-ai-pentest 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agents/__init__.py +28 -0
- agents/agent_base.py +239 -0
- agents/agent_orchestrator.py +346 -0
- agents/analysis_agent.py +225 -0
- agents/cli.py +258 -0
- agents/exploit_agent.py +224 -0
- agents/integration.py +211 -0
- agents/post_scan_agent.py +937 -0
- agents/react_agent.py +384 -0
- agents/react_agent_enhanced.py +616 -0
- agents/react_agent_vm.py +298 -0
- agents/research_agent.py +176 -0
- api/__init__.py +11 -0
- api/auth.py +123 -0
- api/main.py +1027 -0
- api/schemas.py +357 -0
- api/websocket.py +97 -0
- autonomous/__init__.py +122 -0
- autonomous/agent.py +253 -0
- autonomous/agent_loop.py +1370 -0
- autonomous/exploit_validator.py +1537 -0
- autonomous/memory.py +448 -0
- autonomous/react.py +339 -0
- autonomous/tool_executor.py +488 -0
- backends/__init__.py +16 -0
- backends/chatgpt_direct.py +133 -0
- backends/claude_direct.py +130 -0
- backends/duckduckgo.py +138 -0
- backends/openrouter.py +120 -0
- benchmarks/__init__.py +149 -0
- benchmarks/benchmark_engine.py +904 -0
- benchmarks/ci_benchmark.py +785 -0
- benchmarks/comparison.py +729 -0
- benchmarks/metrics.py +553 -0
- benchmarks/run_benchmarks.py +809 -0
- ci_cd/__init__.py +2 -0
- core/__init__.py +17 -0
- core/async_pool.py +282 -0
- core/asyncio_fix.py +222 -0
- core/cache.py +472 -0
- core/container.py +277 -0
- core/database.py +114 -0
- core/input_validator.py +353 -0
- core/models.py +288 -0
- core/orchestrator.py +611 -0
- core/plugin_manager.py +571 -0
- core/rate_limiter.py +405 -0
- core/secure_config.py +328 -0
- core/shield_integration.py +296 -0
- modules/__init__.py +46 -0
- modules/cve_database.py +362 -0
- modules/exploit_assist.py +330 -0
- modules/nuclei_integration.py +480 -0
- modules/osint.py +604 -0
- modules/protonvpn.py +554 -0
- modules/recon.py +165 -0
- modules/sql_injection_db.py +826 -0
- modules/tool_orchestrator.py +498 -0
- modules/vuln_scanner.py +292 -0
- modules/wordlist_generator.py +566 -0
- risk_engine/__init__.py +99 -0
- risk_engine/business_impact.py +267 -0
- risk_engine/business_impact_calculator.py +563 -0
- risk_engine/cvss.py +156 -0
- risk_engine/epss.py +190 -0
- risk_engine/example_usage.py +294 -0
- risk_engine/false_positive_engine.py +1073 -0
- risk_engine/scorer.py +304 -0
- web_ui/backend/main.py +471 -0
- zen_ai_pentest-2.0.0.dist-info/METADATA +795 -0
- zen_ai_pentest-2.0.0.dist-info/RECORD +75 -0
- zen_ai_pentest-2.0.0.dist-info/WHEEL +5 -0
- zen_ai_pentest-2.0.0.dist-info/entry_points.txt +2 -0
- zen_ai_pentest-2.0.0.dist-info/licenses/LICENSE +21 -0
- zen_ai_pentest-2.0.0.dist-info/top_level.txt +10 -0
benchmarks/ci_benchmark.py
@@ -0,0 +1,785 @@
"""
Zen-AI-Pentest CI/CD Benchmark Integration

Continuous Integration benchmarking for automated performance tracking,
regression detection, and quality gates.
"""

import asyncio
import json
import logging
import os
import sys
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple
import xml.etree.ElementTree as ET

from .benchmark_engine import (
    BenchmarkEngine, BenchmarkConfig, BenchmarkReport,
    BenchmarkStatus
)
from .metrics import BenchmarkMetrics, MetricsAggregator

logger = logging.getLogger(__name__)


class RegressionSeverity(Enum):
    """Severity levels for performance regressions."""
    NONE = "none"
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"


@dataclass
class PerformanceGate:
    """Performance gate configuration."""

    name: str
    metric: str  # e.g., "precision", "recall", "f1_score"
    threshold: float
    comparison: str = "min"  # "min" or "max"

    def check(self, value: float) -> bool:
        """Check if value passes the gate."""
        if self.comparison == "min":
            return value >= self.threshold
        else:
            return value <= self.threshold


@dataclass
class GateResult:
    """Result of a performance gate check."""

    gate: PerformanceGate
    actual_value: float
    passed: bool
    message: str = ""


@dataclass
class RegressionCheck:
    """Result of regression check."""

    metric: str
    baseline_value: float
    current_value: float
    change_percent: float
    severity: RegressionSeverity
    message: str = ""


@dataclass
class CIConfig:
    """Configuration for CI benchmark runs."""

    # Trigger conditions
    run_on_pr: bool = True
    run_on_release: bool = True
    run_on_schedule: bool = False
    schedule_cron: str = "0 0 * * 0"  # Weekly

    # Scenarios
    quick_scenarios: List[str] = field(
        default_factory=lambda: ["dvwa", "juice-shop"]
    )
    full_scenarios: List[str] = field(
        default_factory=lambda: list(ALL_SCENARIOS.keys())
    )

    # Performance gates
    performance_gates: List[PerformanceGate] = field(default_factory=lambda: [
        PerformanceGate("precision_min", "precision", 0.70),
        PerformanceGate("recall_min", "recall", 0.65),
        PerformanceGate("f1_min", "f1_score", 0.67),
        PerformanceGate("accuracy_min", "accuracy", 0.75),
    ])

    # Regression detection
    enable_regression_detection: bool = True
    regression_threshold_low: float = -5.0  # -5%
    regression_threshold_medium: float = -10.0  # -10%
    regression_threshold_high: float = -20.0  # -20%
    regression_threshold_critical: float = -30.0  # -30%

    # Trend analysis
    enable_trend_analysis: bool = True
    trend_lookback_runs: int = 5
    trend_significance_threshold: float = 2.0  # Standard deviations

    # Output
    output_format: str = "all"  # "json", "junit", "markdown", "all"
    fail_on_gate_failure: bool = True
    fail_on_critical_regression: bool = True
    comment_on_pr: bool = True

    # Notifications
    notify_on_failure: bool = True
    notify_on_regression: bool = True
    slack_webhook: Optional[str] = None
    email_on_failure: Optional[str] = None


class CIBenchmarkRunner:
    """CI/CD benchmark runner for automated testing."""

    def __init__(
        self,
        engine: Optional[BenchmarkEngine] = None,
        config: Optional[CIConfig] = None,
        output_dir: str = "ci_benchmark_results"
    ):
        self.engine = engine or BenchmarkEngine(output_dir=output_dir)
        self.config = config or CIConfig()
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.results: List[BenchmarkReport] = []
        self.gate_results: List[GateResult] = []
        self.regressions: List[RegressionCheck] = []

        logger.info("CIBenchmarkRunner initialized")

    async def run_quick_benchmark(self) -> BenchmarkReport:
        """Run quick benchmark for PR validation."""
        logger.info("Running quick benchmark suite")

        config = BenchmarkConfig(
            benchmark_name="ci-quick-benchmark",
            scenarios=self.config.quick_scenarios,
            max_concurrent=2,
            timeout_per_scenario=1800,
            generate_markdown_report=True,
            track_history=True
        )

        report = await self.engine.run_benchmark(config)
        self.results.append(report)

        return report

    async def run_full_benchmark(self) -> BenchmarkReport:
        """Run full benchmark suite for releases."""
        logger.info("Running full benchmark suite")

        config = BenchmarkConfig(
            benchmark_name="ci-full-benchmark",
            scenarios=self.config.full_scenarios,
            max_concurrent=1,  # Sequential for stability
            timeout_per_scenario=3600,
            generate_markdown_report=True,
            track_history=True
        )

        report = await self.engine.run_benchmark(config)
        self.results.append(report)

        return report

    def check_performance_gates(
        self,
        report: BenchmarkReport
    ) -> List[GateResult]:
        """Check if benchmark passes all performance gates."""

        logger.info("Checking performance gates")
        self.gate_results = []

        if not report.aggregate_metrics:
            logger.warning("No aggregate metrics available")
            return []

        for gate in self.config.performance_gates:
            # Get metric value
            metric_key = f"avg_{gate.metric}"
            value = report.aggregate_metrics.get(metric_key, 0)

            passed = gate.check(value)

            result = GateResult(
                gate=gate,
                actual_value=value,
                passed=passed,
                message=(
                    f"✅ {gate.name}: {value:.3f} >= {gate.threshold}"
                    if passed else
                    f"❌ {gate.name}: {value:.3f} < {gate.threshold}"
                )
            )

            self.gate_results.append(result)

            if passed:
                logger.info(result.message)
            else:
                logger.warning(result.message)

        return self.gate_results

    def detect_regressions(
        self,
        report: BenchmarkReport
    ) -> List[RegressionCheck]:
        """Detect performance regressions compared to baseline."""

        logger.info("Detecting regressions")
        self.regressions = []

        if not self.config.enable_regression_detection:
            return []

        # Get baseline (previous successful run)
        baseline = self._get_baseline_report(report)
        if not baseline:
            logger.info("No baseline found for comparison")
            return []

        # Compare metrics
        if not report.aggregate_metrics or not baseline.aggregate_metrics:
            return []

        for key in report.aggregate_metrics.keys():
            if not key.startswith("avg_"):
                continue

            current = report.aggregate_metrics[key]
            baseline_val = baseline.aggregate_metrics.get(key)

            if baseline_val is None or baseline_val == 0:
                continue

            change_pct = ((current - baseline_val) / baseline_val) * 100

            # Determine severity
            if change_pct < self.config.regression_threshold_critical:
                severity = RegressionSeverity.CRITICAL
            elif change_pct < self.config.regression_threshold_high:
                severity = RegressionSeverity.HIGH
            elif change_pct < self.config.regression_threshold_medium:
                severity = RegressionSeverity.MEDIUM
            elif change_pct < self.config.regression_threshold_low:
                severity = RegressionSeverity.LOW
            else:
                continue  # No significant regression

            regression = RegressionCheck(
                metric=key,
                baseline_value=baseline_val,
                current_value=current,
                change_percent=change_pct,
                severity=severity,
                message=f"{key}: {baseline_val:.3f} → {current:.3f} ({change_pct:+.1f}%)"
            )

            self.regressions.append(regression)
            logger.warning(f"Regression detected: {regression.message}")

        return self.regressions

    def _get_baseline_report(
        self,
        current: BenchmarkReport
    ) -> Optional[BenchmarkReport]:
        """Get baseline report for comparison."""

        history = self.engine.get_benchmark_history(limit=10)

        # Find most recent successful run with same scenarios
        for entry in reversed(history):
            if (entry.get("benchmark_id") != current.benchmark_id and
                    entry.get("scenarios") == current.config.scenarios and
                    entry.get("success_rate", 0) > 50):

                # Load full report if available
                report_path = (
                    self.output_dir / entry["benchmark_id"] / "report.json"
                )
                if report_path.exists():
                    try:
                        with open(report_path) as f:
                            data = json.load(f)
                            # Reconstruct report (simplified)
                            return current  # Placeholder
                    except Exception:
                        pass

        return None

    def analyze_trends(self) -> Dict[str, Any]:
        """Analyze performance trends over time."""

        if not self.config.enable_trend_analysis:
            return {}

        logger.info("Analyzing performance trends")

        history = self.engine.get_benchmark_history(
            limit=self.config.trend_lookback_runs
        )

        if len(history) < 3:
            logger.info("Not enough history for trend analysis")
            return {}

        trends = {}

        # Analyze key metrics
        metrics_to_track = [
            "avg_precision", "avg_recall", "avg_f1_score",
            "avg_accuracy", "success_rate"
        ]

        for metric in metrics_to_track:
            values = [
                h.get("aggregate_metrics", {}).get(metric)
                for h in history
                if h.get("aggregate_metrics", {}).get(metric) is not None
            ]

            if len(values) < 3:
                continue

            # Simple trend detection
            first_half = values[:len(values)//2]
            second_half = values[len(values)//2:]

            if not first_half or not second_half:
                continue

            first_avg = sum(first_half) / len(first_half)
            second_avg = sum(second_half) / len(second_half)

            change = second_avg - first_avg

            trends[metric] = {
                "direction": "improving" if change > 0 else "degrading",
                "change": change,
                "current_avg": second_avg,
                "previous_avg": first_avg
            }

        return trends

    def should_fail_build(self) -> Tuple[bool, str]:
        """Determine if the build should fail based on results."""

        reasons = []

        # Check gate failures
        if self.config.fail_on_gate_failure:
            failed_gates = [g for g in self.gate_results if not g.passed]
            if failed_gates:
                reasons.append(
                    f"{len(failed_gates)} performance gate(s) failed"
                )

        # Check critical regressions
        if self.config.fail_on_critical_regression:
            critical = [
                r for r in self.regressions
                if r.severity == RegressionSeverity.CRITICAL
            ]
            if critical:
                reasons.append(
                    f"{len(critical)} critical regression(s) detected"
                )

        if reasons:
            return True, "; ".join(reasons)

        return False, "All checks passed"

    def generate_junit_xml(self, report: BenchmarkReport) -> str:
        """Generate JUnit XML for CI integration."""

        root = ET.Element("testsuites")
        suite = ET.SubElement(
            root,
            "testsuite",
            name="Zen-AI-Pentest Benchmark",
            tests=str(len(report.scenario_results)),
            failures=str(report.scenarios_failed),
            time=str(report.duration_seconds)
        )

        # Add scenario tests
        for scenario_result in report.scenario_results:
            testcase = ET.SubElement(
                suite,
                "testcase",
                name=scenario_result.scenario_id,
                time=str(scenario_result.duration_seconds)
            )

            if scenario_result.status != BenchmarkStatus.COMPLETED:
                failure = ET.SubElement(testcase, "failure")
                failure.text = (
                    scenario_result.error_message or
                    f"Scenario failed with status: {scenario_result.status.name}"
                )

            elif scenario_result.metrics:
                # Add metrics as system output
                sys_out = ET.SubElement(testcase, "system-out")
                scores = scenario_result.metrics.calculate_aggregate_scores()
                sys_out.text = json.dumps(scores, indent=2)

        # Add performance gates as test cases
        gate_suite = ET.SubElement(
            root,
            "testsuite",
            name="Performance Gates",
            tests=str(len(self.gate_results)),
            failures=str(sum(1 for g in self.gate_results if not g.passed))
        )

        for gate_result in self.gate_results:
            testcase = ET.SubElement(
                gate_suite,
                "testcase",
                name=gate_result.gate.name
            )

            if not gate_result.passed:
                failure = ET.SubElement(testcase, "failure")
                failure.text = gate_result.message

        # Add regression checks
        if self.regressions:
            reg_suite = ET.SubElement(
                root,
                "testsuite",
                name="Regression Checks",
                tests=str(len(self.regressions))
            )

            for reg in self.regressions:
                testcase = ET.SubElement(
                    reg_suite,
                    "testcase",
                    name=f"regression_{reg.metric}"
                )

                if reg.severity in [RegressionSeverity.CRITICAL, RegressionSeverity.HIGH]:
                    failure = ET.SubElement(testcase, "failure")
                    failure.text = reg.message

        return ET.tostring(root, encoding="unicode")

    def generate_summary_markdown(
        self,
        report: BenchmarkReport
    ) -> str:
        """Generate summary markdown for PR comments."""

        lines = [
            "## 🔒 Zen-AI-Pentest Benchmark Results",
            "",
            f"**Benchmark ID:** `{report.benchmark_id}`",
            f"**Duration:** {report.duration_seconds:.1f}s",
            f"**Success Rate:** {report.success_rate:.1f}%",
            "",
            "### Performance Gates",
            ""
        ]

        for gate_result in self.gate_results:
            emoji = "✅" if gate_result.passed else "❌"
            lines.append(
                f"{emoji} **{gate_result.gate.name}:** "
                f"{gate_result.actual_value:.3f} "
                f"(threshold: {gate_result.gate.threshold})"
            )

        lines.append("")

        # Aggregate scores
        if report.aggregate_metrics:
            lines.extend([
                "### Aggregate Scores",
                "",
                f"- **Precision:** {report.aggregate_metrics.get('avg_precision', 0):.3f}",
                f"- **Recall:** {report.aggregate_metrics.get('avg_recall', 0):.3f}",
                f"- **F1-Score:** {report.aggregate_metrics.get('avg_f1_score', 0):.3f}",
                f"- **Accuracy:** {report.aggregate_metrics.get('avg_accuracy', 0):.3f}",
                ""
            ])

        # Regressions
        if self.regressions:
            lines.extend([
                "### ⚠️ Regressions Detected",
                ""
            ])

            for reg in self.regressions:
                emoji = {
                    RegressionSeverity.CRITICAL: "🔴",
                    RegressionSeverity.HIGH: "🟠",
                    RegressionSeverity.MEDIUM: "🟡",
                    RegressionSeverity.LOW: "⚪"
                }.get(reg.severity, "⚪")

                lines.append(f"{emoji} {reg.message}")

            lines.append("")

        # Build status
        should_fail, reason = self.should_fail_build()
        if should_fail:
            lines.extend([
                "### ❌ Build Status: FAILED",
                f"",
                f"**Reason:** {reason}",
                ""
            ])
        else:
            lines.extend([
                "### ✅ Build Status: PASSED",
                ""
            ])

        return "\n".join(lines)

    async def run_ci_pipeline(
        self,
        benchmark_type: str = "quick"
    ) -> Dict[str, Any]:
        """Run complete CI pipeline."""

        logger.info(f"Starting CI pipeline ({benchmark_type})")

        # Run benchmark
        if benchmark_type == "quick":
            report = await self.run_quick_benchmark()
        else:
            report = await self.run_full_benchmark()

        # Check gates
        self.check_performance_gates(report)

        # Detect regressions
        self.detect_regressions(report)

        # Analyze trends
        trends = self.analyze_trends()

        # Determine build status
        should_fail, fail_reason = self.should_fail_build()

        # Generate outputs
        outputs = {}

        if self.config.output_format in ["junit", "all"]:
            outputs["junit_xml"] = self.generate_junit_xml(report)

        if self.config.output_format in ["markdown", "all"]:
            outputs["markdown_summary"] = self.generate_summary_markdown(report)

        if self.config.output_format in ["json", "all"]:
            outputs["json_report"] = report.to_dict()

        # Save outputs
        self._save_ci_outputs(report, outputs)

        result = {
            "benchmark_id": report.benchmark_id,
            "success_rate": report.success_rate,
            "gates_passed": sum(1 for g in self.gate_results if g.passed),
            "gates_total": len(self.gate_results),
            "regressions": len(self.regressions),
            "critical_regressions": sum(
                1 for r in self.regressions
                if r.severity == RegressionSeverity.CRITICAL
            ),
            "should_fail": should_fail,
            "fail_reason": fail_reason if should_fail else None,
            "trends": trends,
            "outputs": outputs
        }

        logger.info(f"CI pipeline completed: {result}")

        return result

    def _save_ci_outputs(
        self,
        report: BenchmarkReport,
        outputs: Dict[str, Any]
    ) -> None:
        """Save CI output files."""

        ci_dir = self.output_dir / "ci_outputs"
        ci_dir.mkdir(exist_ok=True)

        if "junit_xml" in outputs:
            with open(ci_dir / "benchmark-junit.xml", 'w') as f:
                f.write(outputs["junit_xml"])

        if "markdown_summary" in outputs:
            with open(ci_dir / "benchmark-summary.md", 'w') as f:
                f.write(outputs["markdown_summary"])

        if "json_report" in outputs:
            with open(ci_dir / "benchmark-report.json", 'w') as f:
                json.dump(outputs["json_report"], f, indent=2)

    def generate_github_actions_workflow(self) -> str:
        """Generate GitHub Actions workflow file."""

        workflow = """name: Benchmark

on:
  pull_request:
    branches: [ main, develop ]
  push:
    branches: [ main ]
    tags: [ 'v*' ]
  schedule:
    # Run weekly on Sunday at 00:00
    - cron: '0 0 * * 0'

jobs:
  quick-benchmark:
    if: github.event_name == 'pull_request'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          pip install -e .
          pip install -r benchmarks/requirements.txt

      - name: Run quick benchmark
        run: |
          python -m benchmarks.ci_benchmark --type quick --output ci

      - name: Upload results
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: ci_benchmark_results/ci_outputs/

      - name: Comment PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const summary = fs.readFileSync('ci_benchmark_results/ci_outputs/benchmark-summary.md', 'utf8');
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: summary
            });

  full-benchmark:
    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          pip install -e .
          pip install -r benchmarks/requirements.txt

      - name: Run full benchmark
        run: |
          python -m benchmarks.ci_benchmark --type full --output ci

      - name: Upload results
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results-full
          path: ci_benchmark_results/

      - name: Update release notes
        uses: softprops/action-gh-release@v1
        with:
          files: ci_benchmark_results/ci_outputs/benchmark-summary.md
"""

        return workflow


# Import ALL_SCENARIOS for default config
from .scenarios import ALL_SCENARIOS


async def main():
    """CLI entry point for CI benchmark."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Zen-AI-Pentest CI Benchmark Runner"
    )
    parser.add_argument(
        "--type",
        choices=["quick", "full"],
        default="quick",
        help="Type of benchmark to run"
    )
    parser.add_argument(
        "--output",
        default="ci_benchmark_results",
        help="Output directory"
    )
    parser.add_argument(
        "--format",
        choices=["json", "junit", "markdown", "all"],
        default="all",
        help="Output format"
    )
    parser.add_argument(
        "--fail-on-gate",
        action="store_true",
        help="Fail on gate failure"
    )
    parser.add_argument(
        "--fail-on-regression",
        action="store_true",
        help="Fail on critical regression"
    )

    args = parser.parse_args()

    # Create config
    config = CIConfig(
        output_format=args.format,
        fail_on_gate_failure=args.fail_on_gate,
        fail_on_critical_regression=args.fail_on_regression
    )

    # Run pipeline
    runner = CIBenchmarkRunner(config=config, output_dir=args.output)
    result = await runner.run_ci_pipeline(args.type)

    # Exit with appropriate code
    if result["should_fail"]:
        print(f"Build failed: {result['fail_reason']}")
        sys.exit(1)
    else:
        print("All checks passed!")
        sys.exit(0)


if __name__ == "__main__":
    asyncio.run(main())
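
For readers evaluating this module, the sketch below shows one way the API above could be driven from a custom CI script instead of the bundled `main()` CLI. It is illustrative only, not part of the published wheel: it assumes the wheel is installed so that `benchmarks.ci_benchmark` is importable (matching the `python -m benchmarks.ci_benchmark` invocation in the generated workflow), and the scenario IDs and gate thresholds shown are example values, not project recommendations.

# Illustrative usage sketch (not part of the published package).
# Assumes zen-ai-pentest 2.0.0 is installed and that the scenario IDs below
# exist in benchmarks.scenarios; the 0.75 precision gate is an arbitrary example.
import asyncio

from benchmarks.ci_benchmark import CIBenchmarkRunner, CIConfig, PerformanceGate


async def run_pr_check() -> int:
    config = CIConfig(
        quick_scenarios=["dvwa", "juice-shop"],
        performance_gates=[
            # Example: slightly stricter than the default 0.70 precision gate
            PerformanceGate("precision_min", "precision", 0.75),
            PerformanceGate("f1_min", "f1_score", 0.67),
        ],
        output_format="junit",          # write only ci_outputs/benchmark-junit.xml
        fail_on_gate_failure=True,
        fail_on_critical_regression=True,
    )

    runner = CIBenchmarkRunner(config=config, output_dir="ci_benchmark_results")
    result = await runner.run_ci_pipeline(benchmark_type="quick")

    # Mirror the module's own convention: a non-zero exit code fails the CI job.
    return 1 if result["should_fail"] else 0


if __name__ == "__main__":
    raise SystemExit(asyncio.run(run_pr_check()))

Driving run_ci_pipeline directly like this keeps gate thresholds under code review in the calling script, at the cost of bypassing the --fail-on-gate and --fail-on-regression switches that the packaged CLI exposes.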