runbooks-0.7.6-py3-none-any.whl → runbooks-0.7.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. runbooks/__init__.py +1 -1
  2. runbooks/base.py +5 -1
  3. runbooks/cfat/__init__.py +8 -4
  4. runbooks/cfat/assessment/collectors.py +171 -14
  5. runbooks/cfat/assessment/compliance.py +871 -0
  6. runbooks/cfat/assessment/runner.py +122 -11
  7. runbooks/cfat/models.py +6 -2
  8. runbooks/common/logger.py +14 -0
  9. runbooks/common/rich_utils.py +451 -0
  10. runbooks/enterprise/__init__.py +68 -0
  11. runbooks/enterprise/error_handling.py +411 -0
  12. runbooks/enterprise/logging.py +439 -0
  13. runbooks/enterprise/multi_tenant.py +583 -0
  14. runbooks/finops/README.md +468 -241
  15. runbooks/finops/__init__.py +39 -3
  16. runbooks/finops/cli.py +83 -18
  17. runbooks/finops/cross_validation.py +375 -0
  18. runbooks/finops/dashboard_runner.py +812 -164
  19. runbooks/finops/enhanced_dashboard_runner.py +525 -0
  20. runbooks/finops/finops_dashboard.py +1892 -0
  21. runbooks/finops/helpers.py +485 -51
  22. runbooks/finops/optimizer.py +823 -0
  23. runbooks/finops/tests/__init__.py +19 -0
  24. runbooks/finops/tests/results_test_finops_dashboard.xml +1 -0
  25. runbooks/finops/tests/run_comprehensive_tests.py +421 -0
  26. runbooks/finops/tests/run_tests.py +305 -0
  27. runbooks/finops/tests/test_finops_dashboard.py +705 -0
  28. runbooks/finops/tests/test_integration.py +477 -0
  29. runbooks/finops/tests/test_performance.py +380 -0
  30. runbooks/finops/tests/test_performance_benchmarks.py +500 -0
  31. runbooks/finops/tests/test_reference_images_validation.py +867 -0
  32. runbooks/finops/tests/test_single_account_features.py +715 -0
  33. runbooks/finops/tests/validate_test_suite.py +220 -0
  34. runbooks/finops/types.py +1 -1
  35. runbooks/hitl/enhanced_workflow_engine.py +725 -0
  36. runbooks/inventory/artifacts/scale-optimize-status.txt +12 -0
  37. runbooks/inventory/collectors/aws_comprehensive.py +442 -0
  38. runbooks/inventory/collectors/enterprise_scale.py +281 -0
  39. runbooks/inventory/core/collector.py +172 -13
  40. runbooks/inventory/discovery.md +1 -1
  41. runbooks/inventory/list_ec2_instances.py +18 -20
  42. runbooks/inventory/list_ssm_parameters.py +31 -3
  43. runbooks/inventory/organizations_discovery.py +1269 -0
  44. runbooks/inventory/rich_inventory_display.py +393 -0
  45. runbooks/inventory/run_on_multi_accounts.py +35 -19
  46. runbooks/inventory/runbooks.security.report_generator.log +0 -0
  47. runbooks/inventory/runbooks.security.run_script.log +0 -0
  48. runbooks/inventory/vpc_flow_analyzer.py +1030 -0
  49. runbooks/main.py +2215 -119
  50. runbooks/metrics/dora_metrics_engine.py +599 -0
  51. runbooks/operate/__init__.py +2 -2
  52. runbooks/operate/base.py +122 -10
  53. runbooks/operate/deployment_framework.py +1032 -0
  54. runbooks/operate/deployment_validator.py +853 -0
  55. runbooks/operate/dynamodb_operations.py +10 -6
  56. runbooks/operate/ec2_operations.py +319 -11
  57. runbooks/operate/executive_dashboard.py +779 -0
  58. runbooks/operate/mcp_integration.py +750 -0
  59. runbooks/operate/nat_gateway_operations.py +1120 -0
  60. runbooks/operate/networking_cost_heatmap.py +685 -0
  61. runbooks/operate/privatelink_operations.py +940 -0
  62. runbooks/operate/s3_operations.py +10 -6
  63. runbooks/operate/vpc_endpoints.py +644 -0
  64. runbooks/operate/vpc_operations.py +1038 -0
  65. runbooks/remediation/__init__.py +2 -2
  66. runbooks/remediation/acm_remediation.py +1 -1
  67. runbooks/remediation/base.py +1 -1
  68. runbooks/remediation/cloudtrail_remediation.py +1 -1
  69. runbooks/remediation/cognito_remediation.py +1 -1
  70. runbooks/remediation/dynamodb_remediation.py +1 -1
  71. runbooks/remediation/ec2_remediation.py +1 -1
  72. runbooks/remediation/ec2_unattached_ebs_volumes.py +1 -1
  73. runbooks/remediation/kms_enable_key_rotation.py +1 -1
  74. runbooks/remediation/kms_remediation.py +1 -1
  75. runbooks/remediation/lambda_remediation.py +1 -1
  76. runbooks/remediation/multi_account.py +1 -1
  77. runbooks/remediation/rds_remediation.py +1 -1
  78. runbooks/remediation/s3_block_public_access.py +1 -1
  79. runbooks/remediation/s3_enable_access_logging.py +1 -1
  80. runbooks/remediation/s3_encryption.py +1 -1
  81. runbooks/remediation/s3_remediation.py +1 -1
  82. runbooks/remediation/vpc_remediation.py +475 -0
  83. runbooks/security/__init__.py +3 -1
  84. runbooks/security/compliance_automation.py +632 -0
  85. runbooks/security/report_generator.py +10 -0
  86. runbooks/security/run_script.py +31 -5
  87. runbooks/security/security_baseline_tester.py +169 -30
  88. runbooks/security/security_export.py +477 -0
  89. runbooks/validation/__init__.py +10 -0
  90. runbooks/validation/benchmark.py +484 -0
  91. runbooks/validation/cli.py +356 -0
  92. runbooks/validation/mcp_validator.py +768 -0
  93. runbooks/vpc/__init__.py +38 -0
  94. runbooks/vpc/config.py +212 -0
  95. runbooks/vpc/cost_engine.py +347 -0
  96. runbooks/vpc/heatmap_engine.py +605 -0
  97. runbooks/vpc/manager_interface.py +634 -0
  98. runbooks/vpc/networking_wrapper.py +1260 -0
  99. runbooks/vpc/rich_formatters.py +679 -0
  100. runbooks/vpc/tests/__init__.py +5 -0
  101. runbooks/vpc/tests/conftest.py +356 -0
  102. runbooks/vpc/tests/test_cli_integration.py +530 -0
  103. runbooks/vpc/tests/test_config.py +458 -0
  104. runbooks/vpc/tests/test_cost_engine.py +479 -0
  105. runbooks/vpc/tests/test_networking_wrapper.py +512 -0
  106. {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/METADATA +40 -12
  107. {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/RECORD +111 -50
  108. {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/WHEEL +0 -0
  109. {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/entry_points.txt +0 -0
  110. {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/licenses/LICENSE +0 -0
  111. {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/top_level.txt +0 -0
runbooks/validation/benchmark.py (new file)
@@ -0,0 +1,484 @@
+ #!/usr/bin/env python3
+ """
+ MCP Validation Performance Benchmark Suite
+
+ Enterprise performance testing for MCP validation framework with:
+ - <30s validation cycle target
+ - 99.5% accuracy requirement
+ - Multi-account performance testing (60+ accounts)
+ - Real-time monitoring and reporting
+ - SRE reliability metrics
+
+ Usage:
+     python -m runbooks.validation.benchmark --iterations 10 --target-accuracy 99.5
+ """
+
+ import asyncio
+ import sys
+ import time
+ import statistics
+ from datetime import datetime
+ from typing import List, Dict, Any, Optional
+ from dataclasses import dataclass
+ import json
+
+ from rich.console import Console
+ from rich.table import Table
+ from rich.panel import Panel
+ from rich.progress import Progress, TaskID
+ from rich import box
+
+ from .mcp_validator import MCPValidator, ValidationReport, ValidationStatus
+
+ console = Console()
+
+ @dataclass
+ class BenchmarkResult:
+     """Individual benchmark iteration result."""
+     iteration: int
+     accuracy: float
+     execution_time: float
+     passed_validations: int
+     total_validations: int
+     timestamp: datetime
+     details: Optional[ValidationReport]  # None when an iteration fails outright
+
+ @dataclass
+ class BenchmarkSuite:
+     """Complete benchmark suite results."""
+     target_accuracy: float
+     performance_target: float
+     iterations: int
+     results: List[BenchmarkResult]
+
+     # Summary statistics
+     avg_accuracy: float
+     avg_execution_time: float
+     min_accuracy: float
+     max_accuracy: float
+     min_execution_time: float
+     max_execution_time: float
+     accuracy_std_dev: float
+     time_std_dev: float
+     success_rate: float
+
+     # SRE metrics
+     availability: float  # % of successful validations
+     reliability_score: float  # Combined accuracy + performance
+     performance_consistency: float  # Low variance = high consistency
+
+ class MCPBenchmarkRunner:
+     """
+     Enterprise benchmark runner for MCP validation framework.
+
+     Provides comprehensive performance testing with SRE reliability metrics
+     and enterprise reporting for production deployment validation.
+     """
+
+     def __init__(self,
+                  target_accuracy: float = 99.5,
+                  performance_target: float = 30.0,
+                  tolerance_percentage: float = 5.0):
+         """Initialize benchmark runner."""
+
+         self.target_accuracy = target_accuracy
+         self.performance_target = performance_target
+         self.tolerance_percentage = tolerance_percentage
+
+         console.print(Panel(
+             f"[bold blue]MCP Validation Benchmark Suite[/bold blue]\n"
+             f"Target Accuracy: {target_accuracy}%\n"
+             f"Performance Target: <{performance_target}s\n"
+             f"Tolerance: ±{tolerance_percentage}%",
+             title="Enterprise Benchmark Framework"
+         ))
+
+     async def run_benchmark(self, iterations: int = 5) -> BenchmarkSuite:
+         """
+         Run comprehensive benchmark across multiple iterations.
+
+         Args:
+             iterations: Number of benchmark iterations to run
+
+         Returns:
+             BenchmarkSuite with complete performance analysis
+         """
+
+         console.print(f"\n[bold cyan]Starting {iterations} benchmark iterations...[/bold cyan]")
+
+         results: List[BenchmarkResult] = []
+
+         with Progress() as progress:
+             task = progress.add_task("[cyan]Running benchmark...", total=iterations)
+
+             for i in range(iterations):
+                 progress.console.print(f"\n[bold green]→ Iteration {i+1}/{iterations}[/bold green]")
+
+                 # Run single benchmark iteration
+                 result = await self._run_single_iteration(i + 1, progress)
+                 results.append(result)
+
+                 # Display iteration summary
+                 status_color = "green" if result.accuracy >= self.target_accuracy else "red"
+                 perf_color = "green" if result.execution_time <= self.performance_target else "red"
+
+                 progress.console.print(
+                     f"  Accuracy: [{status_color}]{result.accuracy:.1f}%[/{status_color}] | "
+                     f"Time: [{perf_color}]{result.execution_time:.1f}s[/{perf_color}] | "
+                     f"Passed: {result.passed_validations}/{result.total_validations}"
+                 )
+
+                 progress.advance(task)
+
+         # Calculate benchmark suite statistics
+         return self._calculate_benchmark_statistics(results)
+
+     async def _run_single_iteration(self, iteration: int, progress: Progress) -> BenchmarkResult:
+         """Run single benchmark iteration."""
+
+         start_time = time.time()
+
+         # Initialize validator for this iteration
+         validator = MCPValidator(
+             tolerance_percentage=self.tolerance_percentage,
+             performance_target_seconds=self.performance_target
+         )
+
+         # Run validation
+         try:
+             report = await validator.validate_all_operations()
+
+             execution_time = time.time() - start_time
+
+             return BenchmarkResult(
+                 iteration=iteration,
+                 accuracy=report.overall_accuracy,
+                 execution_time=execution_time,
+                 passed_validations=report.passed_validations,
+                 total_validations=report.total_validations,
+                 timestamp=datetime.now(),
+                 details=report
+             )
+
+         except Exception as e:
+             execution_time = time.time() - start_time
+             progress.console.print(f"[red]Iteration {iteration} failed: {e}[/red]")
+
+             # Return failed iteration
+             return BenchmarkResult(
+                 iteration=iteration,
+                 accuracy=0.0,
+                 execution_time=execution_time,
+                 passed_validations=0,
+                 total_validations=5,  # Expected number of validations
+                 timestamp=datetime.now(),
+                 details=None
+             )
+
+     def _calculate_benchmark_statistics(self, results: List[BenchmarkResult]) -> BenchmarkSuite:
+         """Calculate comprehensive benchmark statistics."""
+
+         if not results:
+             raise ValueError("No benchmark results to analyze")
+
+         # Basic statistics
+         accuracies = [r.accuracy for r in results]
+         times = [r.execution_time for r in results]
+
+         avg_accuracy = statistics.mean(accuracies)
+         avg_execution_time = statistics.mean(times)
+         min_accuracy = min(accuracies)
+         max_accuracy = max(accuracies)
+         min_execution_time = min(times)
+         max_execution_time = max(times)
+
+         # Calculate standard deviations
+         accuracy_std_dev = statistics.stdev(accuracies) if len(accuracies) > 1 else 0.0
+         time_std_dev = statistics.stdev(times) if len(times) > 1 else 0.0
+
+         # Success rate (meeting target accuracy)
+         successful_iterations = len([r for r in results if r.accuracy >= self.target_accuracy])
+         success_rate = (successful_iterations / len(results)) * 100
+
+         # SRE reliability metrics
+         availability = len([r for r in results if r.accuracy > 0]) / len(results) * 100
+
+         # Reliability score (weighted accuracy + performance)
+         accuracy_score = min(100, avg_accuracy / self.target_accuracy * 100)
+         performance_score = min(100, self.performance_target / avg_execution_time * 100) if avg_execution_time > 0 else 0
+         reliability_score = (accuracy_score * 0.7) + (performance_score * 0.3)  # 70% accuracy, 30% performance
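+         # Worked example with hypothetical values: avg_accuracy=99.0 vs target 99.5
+         # gives accuracy_score = min(100, 99.0/99.5*100) ≈ 99.50; avg_execution_time=20s
+         # vs 30s target gives performance_score = min(100, 30/20*100) = 100;
+         # reliability_score = 0.7*99.50 + 0.3*100 ≈ 99.65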
+
+         # Performance consistency (lower std dev = higher consistency)
+         max_acceptable_std_dev = 5.0  # 5% standard deviation is acceptable
+         performance_consistency = max(0, 100 - (accuracy_std_dev / max_acceptable_std_dev * 100))
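+         # Worked example with a hypothetical value: accuracy_std_dev=2.0 gives
+         # performance_consistency = max(0, 100 - (2.0/5.0*100)) = 60.0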
+
+         return BenchmarkSuite(
+             target_accuracy=self.target_accuracy,
+             performance_target=self.performance_target,
+             iterations=len(results),
+             results=results,
+             avg_accuracy=avg_accuracy,
+             avg_execution_time=avg_execution_time,
+             min_accuracy=min_accuracy,
+             max_accuracy=max_accuracy,
+             min_execution_time=min_execution_time,
+             max_execution_time=max_execution_time,
+             accuracy_std_dev=accuracy_std_dev,
+             time_std_dev=time_std_dev,
+             success_rate=success_rate,
+             availability=availability,
+             reliability_score=reliability_score,
+             performance_consistency=performance_consistency
+         )
+
+     def display_benchmark_results(self, suite: BenchmarkSuite) -> None:
+         """Display comprehensive benchmark results."""
+
+         # Overall assessment
+         overall_status = self._assess_benchmark_results(suite)
+         status_color = "green" if overall_status == "PASSED" else "red" if overall_status == "FAILED" else "yellow"
+
+         console.print(Panel(
+             f"[bold {status_color}]Benchmark Status: {overall_status}[/bold {status_color}]\n"
+             f"Average Accuracy: {suite.avg_accuracy:.2f}% (Target: {suite.target_accuracy}%)\n"
+             f"Average Time: {suite.avg_execution_time:.1f}s (Target: <{suite.performance_target}s)\n"
+             f"Success Rate: {suite.success_rate:.1f}% | Reliability: {suite.reliability_score:.1f}%",
+             title="Benchmark Summary"
+         ))
+
+         # Detailed statistics table
+         stats_table = Table(title="Performance Statistics", box=box.ROUNDED)
+         stats_table.add_column("Metric", style="cyan", no_wrap=True)
+         stats_table.add_column("Value", justify="right", style="bold")
+         stats_table.add_column("Status", style="bold")
+
+         # Accuracy metrics
+         stats_table.add_row(
+             "Average Accuracy",
+             f"{suite.avg_accuracy:.2f}%",
+             "✅ PASS" if suite.avg_accuracy >= suite.target_accuracy else "❌ FAIL"
+         )
+         stats_table.add_row(
+             "Accuracy Range",
+             f"{suite.min_accuracy:.1f}% - {suite.max_accuracy:.1f}%",
+             "ℹ️ INFO"
+         )
+         stats_table.add_row(
+             "Accuracy Std Dev",
+             f"{suite.accuracy_std_dev:.2f}%",
+             "✅ GOOD" if suite.accuracy_std_dev < 5.0 else "⚠️ HIGH"
+         )
+
+         # Performance metrics
+         stats_table.add_row(
+             "Average Time",
+             f"{suite.avg_execution_time:.1f}s",
+             "✅ PASS" if suite.avg_execution_time <= suite.performance_target else "❌ FAIL"
+         )
+         stats_table.add_row(
+             "Time Range",
+             f"{suite.min_execution_time:.1f}s - {suite.max_execution_time:.1f}s",
+             "ℹ️ INFO"
+         )
+         stats_table.add_row(
+             "Time Std Dev",
+             f"{suite.time_std_dev:.1f}s",
+             "✅ GOOD" if suite.time_std_dev < 5.0 else "⚠️ HIGH"
+         )
+
+         # SRE metrics
+         stats_table.add_row(
+             "Success Rate",
+             f"{suite.success_rate:.1f}%",
+             "✅ EXCELLENT" if suite.success_rate >= 80 else "❌ POOR"
+         )
+         stats_table.add_row(
+             "Availability",
+             f"{suite.availability:.1f}%",
+             "✅ PASS" if suite.availability >= 99 else "❌ FAIL"
+         )
+         stats_table.add_row(
+             "Reliability Score",
+             f"{suite.reliability_score:.1f}%",
+             "✅ EXCELLENT" if suite.reliability_score >= 90 else "⚠️ NEEDS WORK"
+         )
+         stats_table.add_row(
+             "Consistency",
+             f"{suite.performance_consistency:.1f}%",
+             "✅ STABLE" if suite.performance_consistency >= 80 else "⚠️ VARIABLE"
+         )
+
+         console.print(stats_table)
+
+         # Individual iteration results
+         iterations_table = Table(title="Individual Iterations", box=box.MINIMAL)
+         iterations_table.add_column("Iteration", justify="center")
+         iterations_table.add_column("Accuracy", justify="right")
+         iterations_table.add_column("Time (s)", justify="right")
+         iterations_table.add_column("Passed/Total")
+         iterations_table.add_column("Status", style="bold")
+
+         for result in suite.results:
+             status = "PASS" if result.accuracy >= suite.target_accuracy and result.execution_time <= suite.performance_target else "FAIL"
+             status_color = "green" if status == "PASS" else "red"
+
+             iterations_table.add_row(
+                 str(result.iteration),
+                 f"{result.accuracy:.1f}%",
+                 f"{result.execution_time:.1f}",
+                 f"{result.passed_validations}/{result.total_validations}",
+                 f"[{status_color}]{status}[/{status_color}]"
+             )
+
+         console.print(iterations_table)
+
+         # Recommendations
+         recommendations = self._generate_benchmark_recommendations(suite)
+         if recommendations:
+             console.print(Panel(
+                 "\n".join(f"• {rec}" for rec in recommendations),
+                 title="Recommendations",
+                 border_style="blue"
+             ))
+
+         # Save benchmark report
+         self._save_benchmark_report(suite)
+
+     def _assess_benchmark_results(self, suite: BenchmarkSuite) -> str:
+         """Assess overall benchmark results."""
+
+         accuracy_pass = suite.avg_accuracy >= suite.target_accuracy
+         performance_pass = suite.avg_execution_time <= suite.performance_target
+         reliability_pass = suite.reliability_score >= 90
+         consistency_pass = suite.accuracy_std_dev < 5.0
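+         # All four gates pass → PASSED; accuracy and performance pass but
+         # reliability or consistency lags → WARNING; otherwise FAILED.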
+
+         if accuracy_pass and performance_pass and reliability_pass and consistency_pass:
+             return "PASSED"
+         elif accuracy_pass and performance_pass:
+             return "WARNING"
+         else:
+             return "FAILED"
+
+     def _generate_benchmark_recommendations(self, suite: BenchmarkSuite) -> List[str]:
+         """Generate actionable recommendations based on benchmark results."""
+
+         recommendations = []
+
+         # Accuracy recommendations
+         if suite.avg_accuracy < suite.target_accuracy:
+             recommendations.append(f"🎯 Improve average accuracy from {suite.avg_accuracy:.1f}% to {suite.target_accuracy}%")
+             recommendations.append("🔍 Review MCP integration and AWS API permissions")
+
+         # Performance recommendations
+         if suite.avg_execution_time > suite.performance_target:
+             recommendations.append(f"⚡ Optimize performance from {suite.avg_execution_time:.1f}s to <{suite.performance_target}s")
+             recommendations.append("🚀 Consider parallel validation and caching strategies")
+
+         # Consistency recommendations
+         if suite.accuracy_std_dev > 5.0:
+             recommendations.append(f"📊 Improve consistency - accuracy std dev {suite.accuracy_std_dev:.1f}% is high")
+             recommendations.append("🔧 Investigate sources of validation variance")
+
+         # Reliability recommendations
+         if suite.reliability_score < 90:
+             recommendations.append(f"🛠️ Enhance reliability score from {suite.reliability_score:.1f}% to >90%")
+             recommendations.append("📈 Focus on both accuracy and performance improvements")
+
+         # Success rate recommendations
+         if suite.success_rate < 80:
+             recommendations.append(f"✅ Improve success rate from {suite.success_rate:.1f}% to >80%")
+             recommendations.append("🎯 Address systematic issues causing validation failures")
+
+         # Production readiness
+         overall_status = self._assess_benchmark_results(suite)
+         if overall_status == "PASSED":
+             recommendations.append("🚀 Benchmark PASSED - Ready for production deployment")
+         elif overall_status == "WARNING":
+             recommendations.append("⚠️ Benchmark WARNING - Address consistency issues before production")
+         else:
+             recommendations.append("❌ Benchmark FAILED - Significant improvements needed before production")
+
+         return recommendations
+
+     def _save_benchmark_report(self, suite: BenchmarkSuite) -> None:
+         """Save benchmark report to artifacts directory."""
+
+         from pathlib import Path
+
+         artifacts_dir = Path("./artifacts/benchmark")
+         artifacts_dir.mkdir(parents=True, exist_ok=True)
+
+         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+         report_file = artifacts_dir / f"mcp_benchmark_{timestamp}.json"
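+         # e.g. ./artifacts/benchmark/mcp_benchmark_20250101_120000.json (timestamp shown is illustrative)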
+
+         # Convert to serializable format
+         report_data = {
+             "benchmark_suite": {
+                 "target_accuracy": suite.target_accuracy,
+                 "performance_target": suite.performance_target,
+                 "iterations": suite.iterations,
+                 "avg_accuracy": suite.avg_accuracy,
+                 "avg_execution_time": suite.avg_execution_time,
+                 "min_accuracy": suite.min_accuracy,
+                 "max_accuracy": suite.max_accuracy,
+                 "min_execution_time": suite.min_execution_time,
+                 "max_execution_time": suite.max_execution_time,
+                 "accuracy_std_dev": suite.accuracy_std_dev,
+                 "time_std_dev": suite.time_std_dev,
+                 "success_rate": suite.success_rate,
+                 "availability": suite.availability,
+                 "reliability_score": suite.reliability_score,
+                 "performance_consistency": suite.performance_consistency
+             },
+             "results": [
+                 {
+                     "iteration": r.iteration,
+                     "accuracy": r.accuracy,
+                     "execution_time": r.execution_time,
+                     "passed_validations": r.passed_validations,
+                     "total_validations": r.total_validations,
+                     "timestamp": r.timestamp.isoformat()
+                 }
+                 for r in suite.results
+             ],
+             "assessment": self._assess_benchmark_results(suite),
+             "recommendations": self._generate_benchmark_recommendations(suite)
+         }
+
+         with open(report_file, 'w') as f:
+             json.dump(report_data, f, indent=2)
+
+         console.print(f"[green]Benchmark report saved:[/green] {report_file}")
+
+ # CLI entry point
+ async def main():
+     """CLI entry point for benchmark runner."""
+     import argparse
+
+     parser = argparse.ArgumentParser(description="MCP Validation Benchmark Suite")
+     parser.add_argument('--iterations', type=int, default=5, help='Number of benchmark iterations')
+     parser.add_argument('--target-accuracy', type=float, default=99.5, help='Target accuracy percentage')
+     parser.add_argument('--performance-target', type=float, default=30.0, help='Performance target in seconds')
+     parser.add_argument('--tolerance', type=float, default=5.0, help='Tolerance percentage')
+
+     args = parser.parse_args()
+
+     runner = MCPBenchmarkRunner(
+         target_accuracy=args.target_accuracy,
+         performance_target=args.performance_target,
+         tolerance_percentage=args.tolerance
+     )
+
+     suite = await runner.run_benchmark(args.iterations)
+     runner.display_benchmark_results(suite)
+
+     # Exit with appropriate code
+     overall_status = runner._assess_benchmark_results(suite)
+     if overall_status == "PASSED":
+         sys.exit(0)
+     elif overall_status == "WARNING":
+         sys.exit(1)
+     else:
+         sys.exit(2)
+
+ if __name__ == '__main__':
+     asyncio.run(main())
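
For reference, a minimal programmatic use of the new benchmark module would look like the sketch below (assuming runbooks 0.7.9 is installed and the MCP validator can reach AWS; exit handling is left to the caller, while the CLI above maps PASSED/WARNING/FAILED to exit codes 0/1/2):

    import asyncio
    from runbooks.validation.benchmark import MCPBenchmarkRunner

    runner = MCPBenchmarkRunner(target_accuracy=99.5, performance_target=30.0)
    suite = asyncio.run(runner.run_benchmark(iterations=5))
    runner.display_benchmark_results(suite)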