runbooks 0.7.7__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. runbooks/__init__.py +1 -1
  2. runbooks/base.py +2 -2
  3. runbooks/cfat/README.md +12 -1
  4. runbooks/cfat/__init__.py +8 -4
  5. runbooks/cfat/assessment/collectors.py +171 -14
  6. runbooks/cfat/assessment/compliance.py +546 -522
  7. runbooks/cfat/assessment/runner.py +129 -10
  8. runbooks/cfat/models.py +6 -2
  9. runbooks/common/__init__.py +152 -0
  10. runbooks/common/accuracy_validator.py +1039 -0
  11. runbooks/common/context_logger.py +440 -0
  12. runbooks/common/cross_module_integration.py +594 -0
  13. runbooks/common/enhanced_exception_handler.py +1108 -0
  14. runbooks/common/enterprise_audit_integration.py +634 -0
  15. runbooks/common/logger.py +14 -0
  16. runbooks/common/mcp_integration.py +539 -0
  17. runbooks/common/performance_monitor.py +387 -0
  18. runbooks/common/profile_utils.py +216 -0
  19. runbooks/common/rich_utils.py +622 -0
  20. runbooks/enterprise/__init__.py +68 -0
  21. runbooks/enterprise/error_handling.py +411 -0
  22. runbooks/enterprise/logging.py +439 -0
  23. runbooks/enterprise/multi_tenant.py +583 -0
  24. runbooks/feedback/user_feedback_collector.py +440 -0
  25. runbooks/finops/README.md +129 -14
  26. runbooks/finops/__init__.py +22 -3
  27. runbooks/finops/account_resolver.py +279 -0
  28. runbooks/finops/accuracy_cross_validator.py +638 -0
  29. runbooks/finops/aws_client.py +721 -36
  30. runbooks/finops/budget_integration.py +313 -0
  31. runbooks/finops/cli.py +90 -33
  32. runbooks/finops/cost_processor.py +211 -37
  33. runbooks/finops/dashboard_router.py +900 -0
  34. runbooks/finops/dashboard_runner.py +1334 -399
  35. runbooks/finops/embedded_mcp_validator.py +288 -0
  36. runbooks/finops/enhanced_dashboard_runner.py +526 -0
  37. runbooks/finops/enhanced_progress.py +327 -0
  38. runbooks/finops/enhanced_trend_visualization.py +423 -0
  39. runbooks/finops/finops_dashboard.py +41 -0
  40. runbooks/finops/helpers.py +639 -323
  41. runbooks/finops/iam_guidance.py +400 -0
  42. runbooks/finops/markdown_exporter.py +466 -0
  43. runbooks/finops/multi_dashboard.py +1502 -0
  44. runbooks/finops/optimizer.py +396 -395
  45. runbooks/finops/profile_processor.py +2 -2
  46. runbooks/finops/runbooks.inventory.organizations_discovery.log +0 -0
  47. runbooks/finops/runbooks.security.report_generator.log +0 -0
  48. runbooks/finops/runbooks.security.run_script.log +0 -0
  49. runbooks/finops/runbooks.security.security_export.log +0 -0
  50. runbooks/finops/service_mapping.py +195 -0
  51. runbooks/finops/single_dashboard.py +710 -0
  52. runbooks/finops/tests/__init__.py +19 -0
  53. runbooks/finops/tests/results_test_finops_dashboard.xml +1 -0
  54. runbooks/finops/tests/run_comprehensive_tests.py +421 -0
  55. runbooks/finops/tests/run_tests.py +305 -0
  56. runbooks/finops/tests/test_finops_dashboard.py +705 -0
  57. runbooks/finops/tests/test_integration.py +477 -0
  58. runbooks/finops/tests/test_performance.py +380 -0
  59. runbooks/finops/tests/test_performance_benchmarks.py +500 -0
  60. runbooks/finops/tests/test_reference_images_validation.py +867 -0
  61. runbooks/finops/tests/test_single_account_features.py +715 -0
  62. runbooks/finops/tests/validate_test_suite.py +220 -0
  63. runbooks/finops/types.py +1 -1
  64. runbooks/hitl/enhanced_workflow_engine.py +725 -0
  65. runbooks/inventory/README.md +12 -1
  66. runbooks/inventory/artifacts/scale-optimize-status.txt +12 -0
  67. runbooks/inventory/collectors/aws_comprehensive.py +192 -185
  68. runbooks/inventory/collectors/enterprise_scale.py +281 -0
  69. runbooks/inventory/core/collector.py +299 -12
  70. runbooks/inventory/list_ec2_instances.py +21 -20
  71. runbooks/inventory/list_ssm_parameters.py +31 -3
  72. runbooks/inventory/organizations_discovery.py +1315 -0
  73. runbooks/inventory/rich_inventory_display.py +360 -0
  74. runbooks/inventory/run_on_multi_accounts.py +32 -16
  75. runbooks/inventory/runbooks.security.report_generator.log +0 -0
  76. runbooks/inventory/runbooks.security.run_script.log +0 -0
  77. runbooks/inventory/vpc_flow_analyzer.py +1030 -0
  78. runbooks/main.py +4171 -1615
  79. runbooks/metrics/dora_metrics_engine.py +1293 -0
  80. runbooks/monitoring/performance_monitor.py +433 -0
  81. runbooks/operate/README.md +394 -0
  82. runbooks/operate/__init__.py +2 -2
  83. runbooks/operate/base.py +291 -11
  84. runbooks/operate/deployment_framework.py +1032 -0
  85. runbooks/operate/deployment_validator.py +853 -0
  86. runbooks/operate/dynamodb_operations.py +10 -6
  87. runbooks/operate/ec2_operations.py +321 -11
  88. runbooks/operate/executive_dashboard.py +779 -0
  89. runbooks/operate/mcp_integration.py +750 -0
  90. runbooks/operate/nat_gateway_operations.py +1120 -0
  91. runbooks/operate/networking_cost_heatmap.py +685 -0
  92. runbooks/operate/privatelink_operations.py +940 -0
  93. runbooks/operate/s3_operations.py +10 -6
  94. runbooks/operate/vpc_endpoints.py +644 -0
  95. runbooks/operate/vpc_operations.py +1038 -0
  96. runbooks/remediation/README.md +489 -13
  97. runbooks/remediation/__init__.py +2 -2
  98. runbooks/remediation/acm_remediation.py +1 -1
  99. runbooks/remediation/base.py +1 -1
  100. runbooks/remediation/cloudtrail_remediation.py +1 -1
  101. runbooks/remediation/cognito_remediation.py +1 -1
  102. runbooks/remediation/commons.py +8 -4
  103. runbooks/remediation/dynamodb_remediation.py +1 -1
  104. runbooks/remediation/ec2_remediation.py +1 -1
  105. runbooks/remediation/ec2_unattached_ebs_volumes.py +1 -1
  106. runbooks/remediation/kms_enable_key_rotation.py +1 -1
  107. runbooks/remediation/kms_remediation.py +1 -1
  108. runbooks/remediation/lambda_remediation.py +1 -1
  109. runbooks/remediation/multi_account.py +1 -1
  110. runbooks/remediation/rds_remediation.py +1 -1
  111. runbooks/remediation/s3_block_public_access.py +1 -1
  112. runbooks/remediation/s3_enable_access_logging.py +1 -1
  113. runbooks/remediation/s3_encryption.py +1 -1
  114. runbooks/remediation/s3_remediation.py +1 -1
  115. runbooks/remediation/vpc_remediation.py +475 -0
  116. runbooks/security/ENTERPRISE_SECURITY_FRAMEWORK.md +506 -0
  117. runbooks/security/README.md +12 -1
  118. runbooks/security/__init__.py +166 -33
  119. runbooks/security/compliance_automation.py +634 -0
  120. runbooks/security/compliance_automation_engine.py +1021 -0
  121. runbooks/security/enterprise_security_framework.py +931 -0
  122. runbooks/security/enterprise_security_policies.json +293 -0
  123. runbooks/security/integration_test_enterprise_security.py +879 -0
  124. runbooks/security/module_security_integrator.py +641 -0
  125. runbooks/security/report_generator.py +10 -0
  126. runbooks/security/run_script.py +27 -5
  127. runbooks/security/security_baseline_tester.py +153 -27
  128. runbooks/security/security_export.py +456 -0
  129. runbooks/sre/README.md +472 -0
  130. runbooks/sre/__init__.py +33 -0
  131. runbooks/sre/mcp_reliability_engine.py +1049 -0
  132. runbooks/sre/performance_optimization_engine.py +1032 -0
  133. runbooks/sre/reliability_monitoring_framework.py +1011 -0
  134. runbooks/validation/__init__.py +10 -0
  135. runbooks/validation/benchmark.py +489 -0
  136. runbooks/validation/cli.py +368 -0
  137. runbooks/validation/mcp_validator.py +797 -0
  138. runbooks/vpc/README.md +478 -0
  139. runbooks/vpc/__init__.py +38 -0
  140. runbooks/vpc/config.py +212 -0
  141. runbooks/vpc/cost_engine.py +347 -0
  142. runbooks/vpc/heatmap_engine.py +605 -0
  143. runbooks/vpc/manager_interface.py +649 -0
  144. runbooks/vpc/networking_wrapper.py +1289 -0
  145. runbooks/vpc/rich_formatters.py +693 -0
  146. runbooks/vpc/tests/__init__.py +5 -0
  147. runbooks/vpc/tests/conftest.py +356 -0
  148. runbooks/vpc/tests/test_cli_integration.py +530 -0
  149. runbooks/vpc/tests/test_config.py +458 -0
  150. runbooks/vpc/tests/test_cost_engine.py +479 -0
  151. runbooks/vpc/tests/test_networking_wrapper.py +512 -0
  152. {runbooks-0.7.7.dist-info → runbooks-0.9.0.dist-info}/METADATA +175 -65
  153. {runbooks-0.7.7.dist-info → runbooks-0.9.0.dist-info}/RECORD +157 -60
  154. {runbooks-0.7.7.dist-info → runbooks-0.9.0.dist-info}/entry_points.txt +1 -1
  155. {runbooks-0.7.7.dist-info → runbooks-0.9.0.dist-info}/WHEEL +0 -0
  156. {runbooks-0.7.7.dist-info → runbooks-0.9.0.dist-info}/licenses/LICENSE +0 -0
  157. {runbooks-0.7.7.dist-info → runbooks-0.9.0.dist-info}/top_level.txt +0 -0
runbooks/validation/__init__.py
@@ -0,0 +1,10 @@
+ """
+ Enterprise MCP Validation Module
+
+ Provides comprehensive validation between runbooks outputs and MCP server results
+ for enterprise AWS operations with 99.5% accuracy target.
+ """
+
+ from .mcp_validator import MCPValidator, ValidationReport, ValidationResult, ValidationStatus
+
+ __all__ = ["MCPValidator", "ValidationResult", "ValidationReport", "ValidationStatus"]
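The hunk above is the new `runbooks/validation/__init__.py` (+10); the hunk that follows is `runbooks/validation/benchmark.py` (+489). For orientation, here is a minimal usage sketch of the exported API. It is not part of the diff: the constructor keywords and report fields are inferred from how benchmark.py below calls them, not from separate documentation.

    import asyncio

    from runbooks.validation import MCPValidator

    async def check_accuracy():
        # Keyword arguments mirror the MCPValidator(...) call in benchmark.py below.
        validator = MCPValidator(tolerance_percentage=5.0, performance_target_seconds=30.0)
        report = await validator.validate_all_operations()
        # ValidationReport fields used by the benchmark runner below.
        print(f"{report.overall_accuracy:.1f}% accuracy "
              f"({report.passed_validations}/{report.total_validations} validations passed)")

    asyncio.run(check_accuracy())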
runbooks/validation/benchmark.py
@@ -0,0 +1,489 @@
+ #!/usr/bin/env python3
+ """
+ MCP Validation Performance Benchmark Suite
+
+ Enterprise performance testing for MCP validation framework with:
+ - <30s validation cycle target
+ - 99.5% accuracy requirement
+ - Multi-account performance testing (60+ accounts)
+ - Real-time monitoring and reporting
+ - SRE reliability metrics
+
+ Usage:
+     python -m runbooks.validation.benchmark --iterations 10 --target-accuracy 99.5
+ """
+
+ import asyncio
+ import json
+ import statistics
+ import time
+ from dataclasses import dataclass
+ from datetime import datetime
+ from typing import Any, Dict, List
+
+ from rich import box
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.progress import Progress, TaskID
+ from rich.table import Table
+
+ from .mcp_validator import MCPValidator, ValidationReport, ValidationStatus
+
+ console = Console()
+
+
+ @dataclass
+ class BenchmarkResult:
+     """Individual benchmark iteration result."""
+
+     iteration: int
+     accuracy: float
+     execution_time: float
+     passed_validations: int
+     total_validations: int
+     timestamp: datetime
+     details: ValidationReport
+
+
+ @dataclass
+ class BenchmarkSuite:
+     """Complete benchmark suite results."""
+
+     target_accuracy: float
+     performance_target: float
+     iterations: int
+     results: List[BenchmarkResult]
+
+     # Summary statistics
+     avg_accuracy: float
+     avg_execution_time: float
+     min_accuracy: float
+     max_accuracy: float
+     min_execution_time: float
+     max_execution_time: float
+     accuracy_std_dev: float
+     time_std_dev: float
+     success_rate: float
+
+     # SRE metrics
+     availability: float  # % of successful validations
+     reliability_score: float  # Combined accuracy + performance
+     performance_consistency: float  # Low variance = high consistency
+
+
+ class MCPBenchmarkRunner:
+     """
+     Enterprise benchmark runner for MCP validation framework.
+
+     Provides comprehensive performance testing with SRE reliability metrics
+     and enterprise reporting for production deployment validation.
+     """
+
+     def __init__(
+         self, target_accuracy: float = 99.5, performance_target: float = 30.0, tolerance_percentage: float = 5.0
+     ):
+         """Initialize benchmark runner."""
+
+         self.target_accuracy = target_accuracy
+         self.performance_target = performance_target
+         self.tolerance_percentage = tolerance_percentage
+
+         console.print(
+             Panel(
+                 f"[bold blue]MCP Validation Benchmark Suite[/bold blue]\n"
+                 f"Target Accuracy: {target_accuracy}%\n"
+                 f"Performance Target: <{performance_target}s\n"
+                 f"Tolerance: ±{tolerance_percentage}%",
+                 title="Enterprise Benchmark Framework",
+             )
+         )
+
+     async def run_benchmark(self, iterations: int = 5) -> BenchmarkSuite:
+         """
+         Run comprehensive benchmark across multiple iterations.
+
+         Args:
+             iterations: Number of benchmark iterations to run
+
+         Returns:
+             BenchmarkSuite with complete performance analysis
+         """
+
+         console.print(f"\n[bold cyan]Starting {iterations} benchmark iterations...[/bold cyan]")
+
+         results: List[BenchmarkResult] = []
+
+         with Progress() as progress:
+             task = progress.add_task("[cyan]Running benchmark...", total=iterations)
+
+             for i in range(iterations):
+                 progress.console.print(f"\n[bold green]→ Iteration {i + 1}/{iterations}[/bold green]")
+
+                 # Run single benchmark iteration
+                 result = await self._run_single_iteration(i + 1, progress)
+                 results.append(result)
+
+                 # Display iteration summary
+                 status_color = "green" if result.accuracy >= self.target_accuracy else "red"
+                 perf_color = "green" if result.execution_time <= self.performance_target else "red"
+
+                 progress.console.print(
+                     f" Accuracy: [{status_color}]{result.accuracy:.1f}%[/{status_color}] | "
+                     f"Time: [{perf_color}]{result.execution_time:.1f}s[/{perf_color}] | "
+                     f"Passed: {result.passed_validations}/{result.total_validations}"
+                 )
+
+                 progress.advance(task)
+
+         # Calculate benchmark suite statistics
+         return self._calculate_benchmark_statistics(results)
+
+     async def _run_single_iteration(self, iteration: int, progress: Progress) -> BenchmarkResult:
+         """Run single benchmark iteration."""
+
+         start_time = time.time()
+
+         # Initialize validator for this iteration
+         validator = MCPValidator(
+             tolerance_percentage=self.tolerance_percentage, performance_target_seconds=self.performance_target
+         )
+
+         # Run validation
+         try:
+             report = await validator.validate_all_operations()
+
+             execution_time = time.time() - start_time
+
+             return BenchmarkResult(
+                 iteration=iteration,
+                 accuracy=report.overall_accuracy,
+                 execution_time=execution_time,
+                 passed_validations=report.passed_validations,
+                 total_validations=report.total_validations,
+                 timestamp=datetime.now(),
+                 details=report,
+             )
+
+         except Exception as e:
+             execution_time = time.time() - start_time
+             progress.console.print(f"[red]Iteration {iteration} failed: {e}[/red]")
+
+             # Return failed iteration
+             return BenchmarkResult(
+                 iteration=iteration,
+                 accuracy=0.0,
+                 execution_time=execution_time,
+                 passed_validations=0,
+                 total_validations=5,  # Expected number of validations
+                 timestamp=datetime.now(),
+                 details=None,
+             )
+
+     def _calculate_benchmark_statistics(self, results: List[BenchmarkResult]) -> BenchmarkSuite:
+         """Calculate comprehensive benchmark statistics."""
+
+         if not results:
+             raise ValueError("No benchmark results to analyze")
+
+         # Basic statistics
+         accuracies = [r.accuracy for r in results]
+         times = [r.execution_time for r in results]
+
+         avg_accuracy = statistics.mean(accuracies)
+         avg_execution_time = statistics.mean(times)
+         min_accuracy = min(accuracies)
+         max_accuracy = max(accuracies)
+         min_execution_time = min(times)
+         max_execution_time = max(times)
+
+         # Calculate standard deviations
+         accuracy_std_dev = statistics.stdev(accuracies) if len(accuracies) > 1 else 0.0
+         time_std_dev = statistics.stdev(times) if len(times) > 1 else 0.0
+
+         # Success rate (meeting target accuracy)
+         successful_iterations = len([r for r in results if r.accuracy >= self.target_accuracy])
+         success_rate = (successful_iterations / len(results)) * 100
+
+         # SRE reliability metrics
+         availability = len([r for r in results if r.accuracy > 0]) / len(results) * 100
+
+         # Reliability score (weighted accuracy + performance)
+         accuracy_score = min(100, avg_accuracy / self.target_accuracy * 100)
+         performance_score = (
+             min(100, self.performance_target / avg_execution_time * 100) if avg_execution_time > 0 else 0
+         )
+         reliability_score = (accuracy_score * 0.7) + (performance_score * 0.3)  # 70% accuracy, 30% performance
+
+         # Performance consistency (lower std dev = higher consistency)
+         max_acceptable_std_dev = 5.0  # 5% standard deviation is acceptable
+         performance_consistency = max(0, 100 - (accuracy_std_dev / max_acceptable_std_dev * 100))
+
+         return BenchmarkSuite(
+             target_accuracy=self.target_accuracy,
+             performance_target=self.performance_target,
+             iterations=len(results),
+             results=results,
+             avg_accuracy=avg_accuracy,
+             avg_execution_time=avg_execution_time,
+             min_accuracy=min_accuracy,
+             max_accuracy=max_accuracy,
+             min_execution_time=min_execution_time,
+             max_execution_time=max_execution_time,
+             accuracy_std_dev=accuracy_std_dev,
+             time_std_dev=time_std_dev,
+             success_rate=success_rate,
+             availability=availability,
+             reliability_score=reliability_score,
+             performance_consistency=performance_consistency,
+         )
+
+     def display_benchmark_results(self, suite: BenchmarkSuite) -> None:
+         """Display comprehensive benchmark results."""
+
+         # Overall assessment
+         overall_status = self._assess_benchmark_results(suite)
+         status_color = "green" if overall_status == "PASSED" else "red" if overall_status == "FAILED" else "yellow"
+
+         console.print(
+             Panel(
+                 f"[bold {status_color}]Benchmark Status: {overall_status}[/bold {status_color}]\n"
+                 f"Average Accuracy: {suite.avg_accuracy:.2f}% (Target: {suite.target_accuracy}%)\n"
+                 f"Average Time: {suite.avg_execution_time:.1f}s (Target: <{suite.performance_target}s)\n"
+                 f"Success Rate: {suite.success_rate:.1f}% | Reliability: {suite.reliability_score:.1f}%",
+                 title="Benchmark Summary",
+             )
+         )
+
+         # Detailed statistics table
+         stats_table = Table(title="Performance Statistics", box=box.ROUNDED)
+         stats_table.add_column("Metric", style="cyan", no_wrap=True)
+         stats_table.add_column("Value", justify="right", style="bold")
+         stats_table.add_column("Status", style="bold")
+
+         # Accuracy metrics
+         stats_table.add_row(
+             "Average Accuracy",
+             f"{suite.avg_accuracy:.2f}%",
+             "✅ PASS" if suite.avg_accuracy >= suite.target_accuracy else "❌ FAIL",
+         )
+         stats_table.add_row("Accuracy Range", f"{suite.min_accuracy:.1f}% - {suite.max_accuracy:.1f}%", "ℹ️ INFO")
+         stats_table.add_row(
+             "Accuracy Std Dev",
+             f"{suite.accuracy_std_dev:.2f}%",
+             "✅ GOOD" if suite.accuracy_std_dev < 5.0 else "⚠️ HIGH",
+         )
+
+         # Performance metrics
+         stats_table.add_row(
+             "Average Time",
+             f"{suite.avg_execution_time:.1f}s",
+             "✅ PASS" if suite.avg_execution_time <= suite.performance_target else "❌ FAIL",
+         )
+         stats_table.add_row(
+             "Time Range", f"{suite.min_execution_time:.1f}s - {suite.max_execution_time:.1f}s", "ℹ️ INFO"
+         )
+         stats_table.add_row(
+             "Time Std Dev", f"{suite.time_std_dev:.1f}s", "✅ GOOD" if suite.time_std_dev < 5.0 else "⚠️ HIGH"
+         )
+
+         # SRE metrics
+         stats_table.add_row(
+             "Success Rate", f"{suite.success_rate:.1f}%", "✅ EXCELLENT" if suite.success_rate >= 80 else "❌ POOR"
+         )
+         stats_table.add_row(
+             "Availability", f"{suite.availability:.1f}%", "✅ PASS" if suite.availability >= 99 else "❌ FAIL"
+         )
+         stats_table.add_row(
+             "Reliability Score",
+             f"{suite.reliability_score:.1f}%",
+             "✅ EXCELLENT" if suite.reliability_score >= 90 else "⚠️ NEEDS WORK",
+         )
+         stats_table.add_row(
+             "Consistency",
+             f"{suite.performance_consistency:.1f}%",
+             "✅ STABLE" if suite.performance_consistency >= 80 else "⚠️ VARIABLE",
+         )
+
+         console.print(stats_table)
+
+         # Individual iteration results
+         iterations_table = Table(title="Individual Iterations", box=box.MINIMAL)
+         iterations_table.add_column("Iteration", justify="center")
+         iterations_table.add_column("Accuracy", justify="right")
+         iterations_table.add_column("Time (s)", justify="right")
+         iterations_table.add_column("Passed/Total")
+         iterations_table.add_column("Status", style="bold")
+
+         for result in suite.results:
+             status_color = "green" if result.accuracy >= suite.target_accuracy else "red"
+             status = (
+                 "PASS"
+                 if result.accuracy >= suite.target_accuracy and result.execution_time <= suite.performance_target
+                 else "FAIL"
+             )
+
+             iterations_table.add_row(
+                 str(result.iteration),
+                 f"{result.accuracy:.1f}%",
+                 f"{result.execution_time:.1f}",
+                 f"{result.passed_validations}/{result.total_validations}",
+                 f"[{status_color}]{status}[/{status_color}]",
+             )
+
+         console.print(iterations_table)
+
+         # Recommendations
+         recommendations = self._generate_benchmark_recommendations(suite)
+         if recommendations:
+             console.print(
+                 Panel("\n".join(f"• {rec}" for rec in recommendations), title="Recommendations", border_style="blue")
+             )
+
+         # Save benchmark report
+         self._save_benchmark_report(suite)
+
+     def _assess_benchmark_results(self, suite: BenchmarkSuite) -> str:
+         """Assess overall benchmark results."""
+
+         accuracy_pass = suite.avg_accuracy >= suite.target_accuracy
+         performance_pass = suite.avg_execution_time <= suite.performance_target
+         reliability_pass = suite.reliability_score >= 90
+         consistency_pass = suite.accuracy_std_dev < 5.0
+
+         if accuracy_pass and performance_pass and reliability_pass:
+             return "PASSED"
+         elif accuracy_pass and performance_pass:
+             return "WARNING"
+         else:
+             return "FAILED"
+
+     def _generate_benchmark_recommendations(self, suite: BenchmarkSuite) -> List[str]:
+         """Generate actionable recommendations based on benchmark results."""
+
+         recommendations = []
+
+         # Accuracy recommendations
+         if suite.avg_accuracy < suite.target_accuracy:
+             recommendations.append(
+                 f"🎯 Improve average accuracy from {suite.avg_accuracy:.1f}% to {suite.target_accuracy}%"
+             )
+             recommendations.append("🔍 Review MCP integration and AWS API permissions")
+
+         # Performance recommendations
+         if suite.avg_execution_time > suite.performance_target:
+             recommendations.append(
+                 f"⚡ Optimize performance from {suite.avg_execution_time:.1f}s to <{suite.performance_target}s"
+             )
+             recommendations.append("🚀 Consider parallel validation and caching strategies")
+
+         # Consistency recommendations
+         if suite.accuracy_std_dev > 5.0:
+             recommendations.append(f"📊 Improve consistency - accuracy std dev {suite.accuracy_std_dev:.1f}% is high")
+             recommendations.append("🔧 Investigate sources of validation variance")
+
+         # Reliability recommendations
+         if suite.reliability_score < 90:
+             recommendations.append(f"🛠️ Enhance reliability score from {suite.reliability_score:.1f}% to >90%")
+             recommendations.append("📈 Focus on both accuracy and performance improvements")
+
+         # Success rate recommendations
+         if suite.success_rate < 80:
+             recommendations.append(f"✅ Improve success rate from {suite.success_rate:.1f}% to >80%")
+             recommendations.append("🎯 Address systematic issues causing validation failures")
+
+         # Production readiness
+         overall_status = self._assess_benchmark_results(suite)
+         if overall_status == "PASSED":
+             recommendations.append("🚀 Benchmark PASSED - Ready for production deployment")
+         elif overall_status == "WARNING":
+             recommendations.append("⚠️ Benchmark WARNING - Address consistency issues before production")
+         else:
+             recommendations.append("❌ Benchmark FAILED - Significant improvements needed before production")
+
+         return recommendations
+
+     def _save_benchmark_report(self, suite: BenchmarkSuite) -> None:
+         """Save benchmark report to artifacts directory."""
+
+         from pathlib import Path
+
+         artifacts_dir = Path("./artifacts/benchmark")
+         artifacts_dir.mkdir(parents=True, exist_ok=True)
+
+         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+         report_file = artifacts_dir / f"mcp_benchmark_{timestamp}.json"
+
+         # Convert to serializable format
+         report_data = {
+             "benchmark_suite": {
+                 "target_accuracy": suite.target_accuracy,
+                 "performance_target": suite.performance_target,
+                 "iterations": suite.iterations,
+                 "avg_accuracy": suite.avg_accuracy,
+                 "avg_execution_time": suite.avg_execution_time,
+                 "min_accuracy": suite.min_accuracy,
+                 "max_accuracy": suite.max_accuracy,
+                 "min_execution_time": suite.min_execution_time,
+                 "max_execution_time": suite.max_execution_time,
+                 "accuracy_std_dev": suite.accuracy_std_dev,
+                 "time_std_dev": suite.time_std_dev,
+                 "success_rate": suite.success_rate,
+                 "availability": suite.availability,
+                 "reliability_score": suite.reliability_score,
+                 "performance_consistency": suite.performance_consistency,
+             },
+             "results": [
+                 {
+                     "iteration": r.iteration,
+                     "accuracy": r.accuracy,
+                     "execution_time": r.execution_time,
+                     "passed_validations": r.passed_validations,
+                     "total_validations": r.total_validations,
+                     "timestamp": r.timestamp.isoformat(),
+                 }
+                 for r in suite.results
+             ],
+             "assessment": self._assess_benchmark_results(suite),
+             "recommendations": self._generate_benchmark_recommendations(suite),
+         }
+
+         with open(report_file, "w") as f:
+             json.dump(report_data, f, indent=2)
+
+         console.print(f"[green]Benchmark report saved:[/green] {report_file}")
+
+
+ # CLI entry point
+ async def main():
+     """CLI entry point for benchmark runner."""
+     import argparse
+
+     parser = argparse.ArgumentParser(description="MCP Validation Benchmark Suite")
+     parser.add_argument("--iterations", type=int, default=5, help="Number of benchmark iterations")
+     parser.add_argument("--target-accuracy", type=float, default=99.5, help="Target accuracy percentage")
+     parser.add_argument("--performance-target", type=float, default=30.0, help="Performance target in seconds")
+     parser.add_argument("--tolerance", type=float, default=5.0, help="Tolerance percentage")
+
+     args = parser.parse_args()
+
+     runner = MCPBenchmarkRunner(
+         target_accuracy=args.target_accuracy,
+         performance_target=args.performance_target,
+         tolerance_percentage=args.tolerance,
+     )
+
+     suite = await runner.run_benchmark(args.iterations)
+     runner.display_benchmark_results(suite)
+
+     # Exit with appropriate code
+     overall_status = runner._assess_benchmark_results(suite)
+     if overall_status == "PASSED":
+         exit(0)
+     elif overall_status == "WARNING":
+         exit(1)
+     else:
+         exit(2)
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
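To make the scoring in this hunk concrete, a short worked example with hypothetical measurements; the caps, weights, and thresholds mirror `_calculate_benchmark_statistics` and `_assess_benchmark_results` above.

    # Hypothetical benchmark averages (illustrative only, not from the diff).
    target_accuracy = 99.5       # %
    performance_target = 30.0    # seconds
    avg_accuracy = 98.2          # measured average accuracy, %
    avg_execution_time = 24.0    # measured average runtime, seconds

    accuracy_score = min(100, avg_accuracy / target_accuracy * 100)              # ~98.69
    performance_score = min(100, performance_target / avg_execution_time * 100)  # 125, capped at 100
    reliability_score = accuracy_score * 0.7 + performance_score * 0.3           # ~99.09

    # reliability_score >= 90 passes, but avg_accuracy < target_accuracy,
    # so _assess_benchmark_results would still return "FAILED" for this run.
    print(f"reliability: {reliability_score:.2f}%")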