zen-ai-pentest 2.0.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. agents/__init__.py +28 -0
  2. agents/agent_base.py +239 -0
  3. agents/agent_orchestrator.py +346 -0
  4. agents/analysis_agent.py +225 -0
  5. agents/cli.py +258 -0
  6. agents/exploit_agent.py +224 -0
  7. agents/integration.py +211 -0
  8. agents/post_scan_agent.py +937 -0
  9. agents/react_agent.py +384 -0
  10. agents/react_agent_enhanced.py +616 -0
  11. agents/react_agent_vm.py +298 -0
  12. agents/research_agent.py +176 -0
  13. api/__init__.py +11 -0
  14. api/auth.py +123 -0
  15. api/main.py +1027 -0
  16. api/schemas.py +357 -0
  17. api/websocket.py +97 -0
  18. autonomous/__init__.py +122 -0
  19. autonomous/agent.py +253 -0
  20. autonomous/agent_loop.py +1370 -0
  21. autonomous/exploit_validator.py +1537 -0
  22. autonomous/memory.py +448 -0
  23. autonomous/react.py +339 -0
  24. autonomous/tool_executor.py +488 -0
  25. backends/__init__.py +16 -0
  26. backends/chatgpt_direct.py +133 -0
  27. backends/claude_direct.py +130 -0
  28. backends/duckduckgo.py +138 -0
  29. backends/openrouter.py +120 -0
  30. benchmarks/__init__.py +149 -0
  31. benchmarks/benchmark_engine.py +904 -0
  32. benchmarks/ci_benchmark.py +785 -0
  33. benchmarks/comparison.py +729 -0
  34. benchmarks/metrics.py +553 -0
  35. benchmarks/run_benchmarks.py +809 -0
  36. ci_cd/__init__.py +2 -0
  37. core/__init__.py +17 -0
  38. core/async_pool.py +282 -0
  39. core/asyncio_fix.py +222 -0
  40. core/cache.py +472 -0
  41. core/container.py +277 -0
  42. core/database.py +114 -0
  43. core/input_validator.py +353 -0
  44. core/models.py +288 -0
  45. core/orchestrator.py +611 -0
  46. core/plugin_manager.py +571 -0
  47. core/rate_limiter.py +405 -0
  48. core/secure_config.py +328 -0
  49. core/shield_integration.py +296 -0
  50. modules/__init__.py +46 -0
  51. modules/cve_database.py +362 -0
  52. modules/exploit_assist.py +330 -0
  53. modules/nuclei_integration.py +480 -0
  54. modules/osint.py +604 -0
  55. modules/protonvpn.py +554 -0
  56. modules/recon.py +165 -0
  57. modules/sql_injection_db.py +826 -0
  58. modules/tool_orchestrator.py +498 -0
  59. modules/vuln_scanner.py +292 -0
  60. modules/wordlist_generator.py +566 -0
  61. risk_engine/__init__.py +99 -0
  62. risk_engine/business_impact.py +267 -0
  63. risk_engine/business_impact_calculator.py +563 -0
  64. risk_engine/cvss.py +156 -0
  65. risk_engine/epss.py +190 -0
  66. risk_engine/example_usage.py +294 -0
  67. risk_engine/false_positive_engine.py +1073 -0
  68. risk_engine/scorer.py +304 -0
  69. web_ui/backend/main.py +471 -0
  70. zen_ai_pentest-2.0.0.dist-info/METADATA +795 -0
  71. zen_ai_pentest-2.0.0.dist-info/RECORD +75 -0
  72. zen_ai_pentest-2.0.0.dist-info/WHEEL +5 -0
  73. zen_ai_pentest-2.0.0.dist-info/entry_points.txt +2 -0
  74. zen_ai_pentest-2.0.0.dist-info/licenses/LICENSE +21 -0
  75. zen_ai_pentest-2.0.0.dist-info/top_level.txt +10 -0
@@ -0,0 +1,785 @@
"""
Zen-AI-Pentest CI/CD Benchmark Integration

Continuous Integration benchmarking for automated performance tracking,
regression detection, and quality gates.
"""

import asyncio
import json
import logging
import os
import sys
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple
import xml.etree.ElementTree as ET

from .benchmark_engine import (
    BenchmarkEngine, BenchmarkConfig, BenchmarkReport,
    BenchmarkStatus
)
from .metrics import BenchmarkMetrics, MetricsAggregator

logger = logging.getLogger(__name__)


class RegressionSeverity(Enum):
    """Severity levels for performance regressions."""
    NONE = "none"
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"


@dataclass
class PerformanceGate:
    """Performance gate configuration."""

    name: str
    metric: str  # e.g., "precision", "recall", "f1_score"
    threshold: float
    comparison: str = "min"  # "min" or "max"

    def check(self, value: float) -> bool:
        """Check if value passes the gate."""
        if self.comparison == "min":
            return value >= self.threshold
        else:
            return value <= self.threshold


@dataclass
class GateResult:
    """Result of a performance gate check."""

    gate: PerformanceGate
    actual_value: float
    passed: bool
    message: str = ""


@dataclass
class RegressionCheck:
    """Result of regression check."""

    metric: str
    baseline_value: float
    current_value: float
    change_percent: float
    severity: RegressionSeverity
    message: str = ""


@dataclass
class CIConfig:
    """Configuration for CI benchmark runs."""

    # Trigger conditions
    run_on_pr: bool = True
    run_on_release: bool = True
    run_on_schedule: bool = False
    schedule_cron: str = "0 0 * * 0"  # Weekly

    # Scenarios
    quick_scenarios: List[str] = field(
        default_factory=lambda: ["dvwa", "juice-shop"]
    )
    full_scenarios: List[str] = field(
        default_factory=lambda: list(ALL_SCENARIOS.keys())
    )

    # Performance gates
    performance_gates: List[PerformanceGate] = field(default_factory=lambda: [
        PerformanceGate("precision_min", "precision", 0.70),
        PerformanceGate("recall_min", "recall", 0.65),
        PerformanceGate("f1_min", "f1_score", 0.67),
        PerformanceGate("accuracy_min", "accuracy", 0.75),
    ])

    # Regression detection
    enable_regression_detection: bool = True
    regression_threshold_low: float = -5.0  # -5%
    regression_threshold_medium: float = -10.0  # -10%
    regression_threshold_high: float = -20.0  # -20%
    regression_threshold_critical: float = -30.0  # -30%

    # Trend analysis
    enable_trend_analysis: bool = True
    trend_lookback_runs: int = 5
    trend_significance_threshold: float = 2.0  # Standard deviations

    # Output
    output_format: str = "all"  # "json", "junit", "markdown", "all"
    fail_on_gate_failure: bool = True
    fail_on_critical_regression: bool = True
    comment_on_pr: bool = True

    # Notifications
    notify_on_failure: bool = True
    notify_on_regression: bool = True
    slack_webhook: Optional[str] = None
    email_on_failure: Optional[str] = None


class CIBenchmarkRunner:
    """CI/CD benchmark runner for automated testing."""

    def __init__(
        self,
        engine: Optional[BenchmarkEngine] = None,
        config: Optional[CIConfig] = None,
        output_dir: str = "ci_benchmark_results"
    ):
        self.engine = engine or BenchmarkEngine(output_dir=output_dir)
        self.config = config or CIConfig()
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.results: List[BenchmarkReport] = []
        self.gate_results: List[GateResult] = []
        self.regressions: List[RegressionCheck] = []

        logger.info("CIBenchmarkRunner initialized")

    async def run_quick_benchmark(self) -> BenchmarkReport:
        """Run quick benchmark for PR validation."""
        logger.info("Running quick benchmark suite")

        config = BenchmarkConfig(
            benchmark_name="ci-quick-benchmark",
            scenarios=self.config.quick_scenarios,
            max_concurrent=2,
            timeout_per_scenario=1800,
            generate_markdown_report=True,
            track_history=True
        )

        report = await self.engine.run_benchmark(config)
        self.results.append(report)

        return report

    async def run_full_benchmark(self) -> BenchmarkReport:
        """Run full benchmark suite for releases."""
        logger.info("Running full benchmark suite")

        config = BenchmarkConfig(
            benchmark_name="ci-full-benchmark",
            scenarios=self.config.full_scenarios,
            max_concurrent=1,  # Sequential for stability
            timeout_per_scenario=3600,
            generate_markdown_report=True,
            track_history=True
        )

        report = await self.engine.run_benchmark(config)
        self.results.append(report)

        return report

    def check_performance_gates(
        self,
        report: BenchmarkReport
    ) -> List[GateResult]:
        """Check if benchmark passes all performance gates."""

        logger.info("Checking performance gates")
        self.gate_results = []

        if not report.aggregate_metrics:
            logger.warning("No aggregate metrics available")
            return []

        for gate in self.config.performance_gates:
            # Get metric value
            metric_key = f"avg_{gate.metric}"
            value = report.aggregate_metrics.get(metric_key, 0)

            passed = gate.check(value)

            result = GateResult(
                gate=gate,
                actual_value=value,
                passed=passed,
                message=(
                    f"✅ {gate.name}: {value:.3f} >= {gate.threshold}"
                    if passed else
                    f"❌ {gate.name}: {value:.3f} < {gate.threshold}"
                )
            )

            self.gate_results.append(result)

            if passed:
                logger.info(result.message)
            else:
                logger.warning(result.message)

        return self.gate_results

    def detect_regressions(
        self,
        report: BenchmarkReport
    ) -> List[RegressionCheck]:
        """Detect performance regressions compared to baseline."""

        logger.info("Detecting regressions")
        self.regressions = []

        if not self.config.enable_regression_detection:
            return []

        # Get baseline (previous successful run)
        baseline = self._get_baseline_report(report)
        if not baseline:
            logger.info("No baseline found for comparison")
            return []

        # Compare metrics
        if not report.aggregate_metrics or not baseline.aggregate_metrics:
            return []

        for key in report.aggregate_metrics.keys():
            if not key.startswith("avg_"):
                continue

            current = report.aggregate_metrics[key]
            baseline_val = baseline.aggregate_metrics.get(key)

            if baseline_val is None or baseline_val == 0:
                continue

            change_pct = ((current - baseline_val) / baseline_val) * 100

            # Determine severity
            if change_pct < self.config.regression_threshold_critical:
                severity = RegressionSeverity.CRITICAL
            elif change_pct < self.config.regression_threshold_high:
                severity = RegressionSeverity.HIGH
            elif change_pct < self.config.regression_threshold_medium:
                severity = RegressionSeverity.MEDIUM
            elif change_pct < self.config.regression_threshold_low:
                severity = RegressionSeverity.LOW
            else:
                continue  # No significant regression

            regression = RegressionCheck(
                metric=key,
                baseline_value=baseline_val,
                current_value=current,
                change_percent=change_pct,
                severity=severity,
                message=f"{key}: {baseline_val:.3f} → {current:.3f} ({change_pct:+.1f}%)"
            )

            self.regressions.append(regression)
            logger.warning(f"Regression detected: {regression.message}")

        return self.regressions

    def _get_baseline_report(
        self,
        current: BenchmarkReport
    ) -> Optional[BenchmarkReport]:
        """Get baseline report for comparison."""

        history = self.engine.get_benchmark_history(limit=10)

        # Find most recent successful run with same scenarios
        for entry in reversed(history):
            if (entry.get("benchmark_id") != current.benchmark_id and
                    entry.get("scenarios") == current.config.scenarios and
                    entry.get("success_rate", 0) > 50):

                # Load full report if available
                report_path = (
                    self.output_dir / entry["benchmark_id"] / "report.json"
                )
                if report_path.exists():
                    try:
                        with open(report_path) as f:
                            data = json.load(f)
                        # Reconstruct report (simplified)
                        return current  # Placeholder
                    except Exception:
                        pass

        return None

    def analyze_trends(self) -> Dict[str, Any]:
        """Analyze performance trends over time."""

        if not self.config.enable_trend_analysis:
            return {}

        logger.info("Analyzing performance trends")

        history = self.engine.get_benchmark_history(
            limit=self.config.trend_lookback_runs
        )

        if len(history) < 3:
            logger.info("Not enough history for trend analysis")
            return {}

        trends = {}

        # Analyze key metrics
        metrics_to_track = [
            "avg_precision", "avg_recall", "avg_f1_score",
            "avg_accuracy", "success_rate"
        ]

        for metric in metrics_to_track:
            values = [
                h.get("aggregate_metrics", {}).get(metric)
                for h in history
                if h.get("aggregate_metrics", {}).get(metric) is not None
            ]

            if len(values) < 3:
                continue

            # Simple trend detection
            first_half = values[:len(values)//2]
            second_half = values[len(values)//2:]

            if not first_half or not second_half:
                continue

            first_avg = sum(first_half) / len(first_half)
            second_avg = sum(second_half) / len(second_half)

            change = second_avg - first_avg

            trends[metric] = {
                "direction": "improving" if change > 0 else "degrading",
                "change": change,
                "current_avg": second_avg,
                "previous_avg": first_avg
            }

        return trends

    def should_fail_build(self) -> Tuple[bool, str]:
        """Determine if the build should fail based on results."""

        reasons = []

        # Check gate failures
        if self.config.fail_on_gate_failure:
            failed_gates = [g for g in self.gate_results if not g.passed]
            if failed_gates:
                reasons.append(
                    f"{len(failed_gates)} performance gate(s) failed"
                )

        # Check critical regressions
        if self.config.fail_on_critical_regression:
            critical = [
                r for r in self.regressions
                if r.severity == RegressionSeverity.CRITICAL
            ]
            if critical:
                reasons.append(
                    f"{len(critical)} critical regression(s) detected"
                )

        if reasons:
            return True, "; ".join(reasons)

        return False, "All checks passed"

    def generate_junit_xml(self, report: BenchmarkReport) -> str:
        """Generate JUnit XML for CI integration."""

        root = ET.Element("testsuites")
        suite = ET.SubElement(
            root,
            "testsuite",
            name="Zen-AI-Pentest Benchmark",
            tests=str(len(report.scenario_results)),
            failures=str(report.scenarios_failed),
            time=str(report.duration_seconds)
        )

        # Add scenario tests
        for scenario_result in report.scenario_results:
            testcase = ET.SubElement(
                suite,
                "testcase",
                name=scenario_result.scenario_id,
                time=str(scenario_result.duration_seconds)
            )

            if scenario_result.status != BenchmarkStatus.COMPLETED:
                failure = ET.SubElement(testcase, "failure")
                failure.text = (
                    scenario_result.error_message or
                    f"Scenario failed with status: {scenario_result.status.name}"
                )

            elif scenario_result.metrics:
                # Add metrics as system output
                sys_out = ET.SubElement(testcase, "system-out")
                scores = scenario_result.metrics.calculate_aggregate_scores()
                sys_out.text = json.dumps(scores, indent=2)

        # Add performance gates as test cases
        gate_suite = ET.SubElement(
            root,
            "testsuite",
            name="Performance Gates",
            tests=str(len(self.gate_results)),
            failures=str(sum(1 for g in self.gate_results if not g.passed))
        )

        for gate_result in self.gate_results:
            testcase = ET.SubElement(
                gate_suite,
                "testcase",
                name=gate_result.gate.name
            )

            if not gate_result.passed:
                failure = ET.SubElement(testcase, "failure")
                failure.text = gate_result.message

        # Add regression checks
        if self.regressions:
            reg_suite = ET.SubElement(
                root,
                "testsuite",
                name="Regression Checks",
                tests=str(len(self.regressions))
            )

            for reg in self.regressions:
                testcase = ET.SubElement(
                    reg_suite,
                    "testcase",
                    name=f"regression_{reg.metric}"
                )

                if reg.severity in [RegressionSeverity.CRITICAL, RegressionSeverity.HIGH]:
                    failure = ET.SubElement(testcase, "failure")
                    failure.text = reg.message

        return ET.tostring(root, encoding="unicode")

    def generate_summary_markdown(
        self,
        report: BenchmarkReport
    ) -> str:
        """Generate summary markdown for PR comments."""

        lines = [
            "## 🔒 Zen-AI-Pentest Benchmark Results",
            "",
            f"**Benchmark ID:** `{report.benchmark_id}`",
            f"**Duration:** {report.duration_seconds:.1f}s",
            f"**Success Rate:** {report.success_rate:.1f}%",
            "",
            "### Performance Gates",
            ""
        ]

        for gate_result in self.gate_results:
            emoji = "✅" if gate_result.passed else "❌"
            lines.append(
                f"{emoji} **{gate_result.gate.name}:** "
                f"{gate_result.actual_value:.3f} "
                f"(threshold: {gate_result.gate.threshold})"
            )

        lines.append("")

        # Aggregate scores
        if report.aggregate_metrics:
            lines.extend([
                "### Aggregate Scores",
                "",
                f"- **Precision:** {report.aggregate_metrics.get('avg_precision', 0):.3f}",
                f"- **Recall:** {report.aggregate_metrics.get('avg_recall', 0):.3f}",
                f"- **F1-Score:** {report.aggregate_metrics.get('avg_f1_score', 0):.3f}",
                f"- **Accuracy:** {report.aggregate_metrics.get('avg_accuracy', 0):.3f}",
                ""
            ])

        # Regressions
        if self.regressions:
            lines.extend([
                "### ⚠️ Regressions Detected",
                ""
            ])

            for reg in self.regressions:
                emoji = {
                    RegressionSeverity.CRITICAL: "🔴",
                    RegressionSeverity.HIGH: "🟠",
                    RegressionSeverity.MEDIUM: "🟡",
                    RegressionSeverity.LOW: "⚪"
                }.get(reg.severity, "⚪")

                lines.append(f"{emoji} {reg.message}")

            lines.append("")

        # Build status
        should_fail, reason = self.should_fail_build()
        if should_fail:
            lines.extend([
                "### ❌ Build Status: FAILED",
                "",
                f"**Reason:** {reason}",
                ""
            ])
        else:
            lines.extend([
                "### ✅ Build Status: PASSED",
                ""
            ])

        return "\n".join(lines)

    async def run_ci_pipeline(
        self,
        benchmark_type: str = "quick"
    ) -> Dict[str, Any]:
        """Run complete CI pipeline."""

        logger.info(f"Starting CI pipeline ({benchmark_type})")

        # Run benchmark
        if benchmark_type == "quick":
            report = await self.run_quick_benchmark()
        else:
            report = await self.run_full_benchmark()

        # Check gates
        self.check_performance_gates(report)

        # Detect regressions
        self.detect_regressions(report)

        # Analyze trends
        trends = self.analyze_trends()

        # Determine build status
        should_fail, fail_reason = self.should_fail_build()

        # Generate outputs
        outputs = {}

        if self.config.output_format in ["junit", "all"]:
            outputs["junit_xml"] = self.generate_junit_xml(report)

        if self.config.output_format in ["markdown", "all"]:
            outputs["markdown_summary"] = self.generate_summary_markdown(report)

        if self.config.output_format in ["json", "all"]:
            outputs["json_report"] = report.to_dict()

        # Save outputs
        self._save_ci_outputs(report, outputs)

        result = {
            "benchmark_id": report.benchmark_id,
            "success_rate": report.success_rate,
            "gates_passed": sum(1 for g in self.gate_results if g.passed),
            "gates_total": len(self.gate_results),
            "regressions": len(self.regressions),
            "critical_regressions": sum(
                1 for r in self.regressions
                if r.severity == RegressionSeverity.CRITICAL
            ),
            "should_fail": should_fail,
            "fail_reason": fail_reason if should_fail else None,
            "trends": trends,
            "outputs": outputs
        }

        logger.info(f"CI pipeline completed: {result}")

        return result

    def _save_ci_outputs(
        self,
        report: BenchmarkReport,
        outputs: Dict[str, Any]
    ) -> None:
        """Save CI output files."""

        ci_dir = self.output_dir / "ci_outputs"
        ci_dir.mkdir(exist_ok=True)

        if "junit_xml" in outputs:
            with open(ci_dir / "benchmark-junit.xml", 'w') as f:
                f.write(outputs["junit_xml"])

        if "markdown_summary" in outputs:
            with open(ci_dir / "benchmark-summary.md", 'w') as f:
                f.write(outputs["markdown_summary"])

        if "json_report" in outputs:
            with open(ci_dir / "benchmark-report.json", 'w') as f:
                json.dump(outputs["json_report"], f, indent=2)

    def generate_github_actions_workflow(self) -> str:
        """Generate GitHub Actions workflow file."""

        workflow = """name: Benchmark

on:
  pull_request:
    branches: [ main, develop ]
  push:
    branches: [ main ]
    tags: [ 'v*' ]
  schedule:
    # Run weekly on Sunday at 00:00
    - cron: '0 0 * * 0'

jobs:
  quick-benchmark:
    if: github.event_name == 'pull_request'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          pip install -e .
          pip install -r benchmarks/requirements.txt

      - name: Run quick benchmark
        run: |
          python -m benchmarks.ci_benchmark --type quick --output ci_benchmark_results

      - name: Upload results
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: ci_benchmark_results/ci_outputs/

      - name: Comment PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const summary = fs.readFileSync('ci_benchmark_results/ci_outputs/benchmark-summary.md', 'utf8');
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: summary
            });

  full-benchmark:
    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          pip install -e .
          pip install -r benchmarks/requirements.txt

      - name: Run full benchmark
        run: |
          python -m benchmarks.ci_benchmark --type full --output ci_benchmark_results

      - name: Upload results
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results-full
          path: ci_benchmark_results/

      - name: Update release notes
        uses: softprops/action-gh-release@v1
        with:
          files: ci_benchmark_results/ci_outputs/benchmark-summary.md
"""

        return workflow


# Import ALL_SCENARIOS for default config
from .scenarios import ALL_SCENARIOS


async def main():
    """CLI entry point for CI benchmark."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Zen-AI-Pentest CI Benchmark Runner"
    )
    parser.add_argument(
        "--type",
        choices=["quick", "full"],
        default="quick",
        help="Type of benchmark to run"
    )
    parser.add_argument(
        "--output",
        default="ci_benchmark_results",
        help="Output directory"
    )
    parser.add_argument(
        "--format",
        choices=["json", "junit", "markdown", "all"],
        default="all",
        help="Output format"
    )
    parser.add_argument(
        "--fail-on-gate",
        action="store_true",
        help="Fail on gate failure"
    )
    parser.add_argument(
        "--fail-on-regression",
        action="store_true",
        help="Fail on critical regression"
    )

    args = parser.parse_args()

    # Create config
    config = CIConfig(
        output_format=args.format,
        fail_on_gate_failure=args.fail_on_gate,
        fail_on_critical_regression=args.fail_on_regression
    )

    # Run pipeline
    runner = CIBenchmarkRunner(config=config, output_dir=args.output)
    result = await runner.run_ci_pipeline(args.type)

    # Exit with appropriate code
    if result["should_fail"]:
        print(f"Build failed: {result['fail_reason']}")
        sys.exit(1)
    else:
        print("All checks passed!")
        sys.exit(0)


if __name__ == "__main__":
    asyncio.run(main())
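
For reference, a minimal usage sketch of the CI runner above, assuming the module is importable as benchmarks.ci_benchmark (the same path the generated workflow invokes via python -m benchmarks.ci_benchmark). The scenario list, gate thresholds, and output directory below are illustrative choices, not package defaults beyond what the diff shows.

    import asyncio

    from benchmarks.ci_benchmark import CIBenchmarkRunner, CIConfig, PerformanceGate


    async def demo() -> None:
        # Hypothetical configuration: one quick scenario and two of the stock gates.
        config = CIConfig(
            quick_scenarios=["dvwa"],
            performance_gates=[
                PerformanceGate("precision_min", "precision", 0.70),
                PerformanceGate("recall_min", "recall", 0.65),
            ],
            output_format="all",
            fail_on_gate_failure=True,
        )
        runner = CIBenchmarkRunner(config=config, output_dir="ci_benchmark_results")
        result = await runner.run_ci_pipeline("quick")
        # result mirrors what the CLI reports: gate counts, regressions, build status.
        print(f"{result['gates_passed']}/{result['gates_total']} gates passed")


    asyncio.run(demo())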