runbooks-0.7.6-py3-none-any.whl → runbooks-0.7.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- runbooks/__init__.py +1 -1
- runbooks/base.py +5 -1
- runbooks/cfat/__init__.py +8 -4
- runbooks/cfat/assessment/collectors.py +171 -14
- runbooks/cfat/assessment/compliance.py +871 -0
- runbooks/cfat/assessment/runner.py +122 -11
- runbooks/cfat/models.py +6 -2
- runbooks/common/logger.py +14 -0
- runbooks/common/rich_utils.py +451 -0
- runbooks/enterprise/__init__.py +68 -0
- runbooks/enterprise/error_handling.py +411 -0
- runbooks/enterprise/logging.py +439 -0
- runbooks/enterprise/multi_tenant.py +583 -0
- runbooks/finops/README.md +468 -241
- runbooks/finops/__init__.py +39 -3
- runbooks/finops/cli.py +83 -18
- runbooks/finops/cross_validation.py +375 -0
- runbooks/finops/dashboard_runner.py +812 -164
- runbooks/finops/enhanced_dashboard_runner.py +525 -0
- runbooks/finops/finops_dashboard.py +1892 -0
- runbooks/finops/helpers.py +485 -51
- runbooks/finops/optimizer.py +823 -0
- runbooks/finops/tests/__init__.py +19 -0
- runbooks/finops/tests/results_test_finops_dashboard.xml +1 -0
- runbooks/finops/tests/run_comprehensive_tests.py +421 -0
- runbooks/finops/tests/run_tests.py +305 -0
- runbooks/finops/tests/test_finops_dashboard.py +705 -0
- runbooks/finops/tests/test_integration.py +477 -0
- runbooks/finops/tests/test_performance.py +380 -0
- runbooks/finops/tests/test_performance_benchmarks.py +500 -0
- runbooks/finops/tests/test_reference_images_validation.py +867 -0
- runbooks/finops/tests/test_single_account_features.py +715 -0
- runbooks/finops/tests/validate_test_suite.py +220 -0
- runbooks/finops/types.py +1 -1
- runbooks/hitl/enhanced_workflow_engine.py +725 -0
- runbooks/inventory/artifacts/scale-optimize-status.txt +12 -0
- runbooks/inventory/collectors/aws_comprehensive.py +442 -0
- runbooks/inventory/collectors/enterprise_scale.py +281 -0
- runbooks/inventory/core/collector.py +172 -13
- runbooks/inventory/discovery.md +1 -1
- runbooks/inventory/list_ec2_instances.py +18 -20
- runbooks/inventory/list_ssm_parameters.py +31 -3
- runbooks/inventory/organizations_discovery.py +1269 -0
- runbooks/inventory/rich_inventory_display.py +393 -0
- runbooks/inventory/run_on_multi_accounts.py +35 -19
- runbooks/inventory/runbooks.security.report_generator.log +0 -0
- runbooks/inventory/runbooks.security.run_script.log +0 -0
- runbooks/inventory/vpc_flow_analyzer.py +1030 -0
- runbooks/main.py +2215 -119
- runbooks/metrics/dora_metrics_engine.py +599 -0
- runbooks/operate/__init__.py +2 -2
- runbooks/operate/base.py +122 -10
- runbooks/operate/deployment_framework.py +1032 -0
- runbooks/operate/deployment_validator.py +853 -0
- runbooks/operate/dynamodb_operations.py +10 -6
- runbooks/operate/ec2_operations.py +319 -11
- runbooks/operate/executive_dashboard.py +779 -0
- runbooks/operate/mcp_integration.py +750 -0
- runbooks/operate/nat_gateway_operations.py +1120 -0
- runbooks/operate/networking_cost_heatmap.py +685 -0
- runbooks/operate/privatelink_operations.py +940 -0
- runbooks/operate/s3_operations.py +10 -6
- runbooks/operate/vpc_endpoints.py +644 -0
- runbooks/operate/vpc_operations.py +1038 -0
- runbooks/remediation/__init__.py +2 -2
- runbooks/remediation/acm_remediation.py +1 -1
- runbooks/remediation/base.py +1 -1
- runbooks/remediation/cloudtrail_remediation.py +1 -1
- runbooks/remediation/cognito_remediation.py +1 -1
- runbooks/remediation/dynamodb_remediation.py +1 -1
- runbooks/remediation/ec2_remediation.py +1 -1
- runbooks/remediation/ec2_unattached_ebs_volumes.py +1 -1
- runbooks/remediation/kms_enable_key_rotation.py +1 -1
- runbooks/remediation/kms_remediation.py +1 -1
- runbooks/remediation/lambda_remediation.py +1 -1
- runbooks/remediation/multi_account.py +1 -1
- runbooks/remediation/rds_remediation.py +1 -1
- runbooks/remediation/s3_block_public_access.py +1 -1
- runbooks/remediation/s3_enable_access_logging.py +1 -1
- runbooks/remediation/s3_encryption.py +1 -1
- runbooks/remediation/s3_remediation.py +1 -1
- runbooks/remediation/vpc_remediation.py +475 -0
- runbooks/security/__init__.py +3 -1
- runbooks/security/compliance_automation.py +632 -0
- runbooks/security/report_generator.py +10 -0
- runbooks/security/run_script.py +31 -5
- runbooks/security/security_baseline_tester.py +169 -30
- runbooks/security/security_export.py +477 -0
- runbooks/validation/__init__.py +10 -0
- runbooks/validation/benchmark.py +484 -0
- runbooks/validation/cli.py +356 -0
- runbooks/validation/mcp_validator.py +768 -0
- runbooks/vpc/__init__.py +38 -0
- runbooks/vpc/config.py +212 -0
- runbooks/vpc/cost_engine.py +347 -0
- runbooks/vpc/heatmap_engine.py +605 -0
- runbooks/vpc/manager_interface.py +634 -0
- runbooks/vpc/networking_wrapper.py +1260 -0
- runbooks/vpc/rich_formatters.py +679 -0
- runbooks/vpc/tests/__init__.py +5 -0
- runbooks/vpc/tests/conftest.py +356 -0
- runbooks/vpc/tests/test_cli_integration.py +530 -0
- runbooks/vpc/tests/test_config.py +458 -0
- runbooks/vpc/tests/test_cost_engine.py +479 -0
- runbooks/vpc/tests/test_networking_wrapper.py +512 -0
- {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/METADATA +40 -12
- {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/RECORD +111 -50
- {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/WHEEL +0 -0
- {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/entry_points.txt +0 -0
- {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/licenses/LICENSE +0 -0
- {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/top_level.txt +0 -0
runbooks/validation/benchmark.py (new file)
@@ -0,0 +1,484 @@
#!/usr/bin/env python3
"""
MCP Validation Performance Benchmark Suite

Enterprise performance testing for MCP validation framework with:
- <30s validation cycle target
- 99.5% accuracy requirement
- Multi-account performance testing (60+ accounts)
- Real-time monitoring and reporting
- SRE reliability metrics

Usage:
    python -m runbooks.validation.benchmark --iterations 10 --target-accuracy 99.5
"""

import asyncio
import time
import statistics
from datetime import datetime
from typing import List, Dict, Any
from dataclasses import dataclass
import json

from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.progress import Progress, TaskID
from rich import box

from .mcp_validator import MCPValidator, ValidationReport, ValidationStatus

console = Console()

@dataclass
class BenchmarkResult:
    """Individual benchmark iteration result."""
    iteration: int
    accuracy: float
    execution_time: float
    passed_validations: int
    total_validations: int
    timestamp: datetime
    details: ValidationReport

@dataclass
class BenchmarkSuite:
    """Complete benchmark suite results."""
    target_accuracy: float
    performance_target: float
    iterations: int
    results: List[BenchmarkResult]

    # Summary statistics
    avg_accuracy: float
    avg_execution_time: float
    min_accuracy: float
    max_accuracy: float
    min_execution_time: float
    max_execution_time: float
    accuracy_std_dev: float
    time_std_dev: float
    success_rate: float

    # SRE metrics
    availability: float  # % of successful validations
    reliability_score: float  # Combined accuracy + performance
    performance_consistency: float  # Low variance = high consistency

class MCPBenchmarkRunner:
    """
    Enterprise benchmark runner for MCP validation framework.

    Provides comprehensive performance testing with SRE reliability metrics
    and enterprise reporting for production deployment validation.
    """

    def __init__(self,
                 target_accuracy: float = 99.5,
                 performance_target: float = 30.0,
                 tolerance_percentage: float = 5.0):
        """Initialize benchmark runner."""

        self.target_accuracy = target_accuracy
        self.performance_target = performance_target
        self.tolerance_percentage = tolerance_percentage

        console.print(Panel(
            f"[bold blue]MCP Validation Benchmark Suite[/bold blue]\n"
            f"Target Accuracy: {target_accuracy}%\n"
            f"Performance Target: <{performance_target}s\n"
            f"Tolerance: ±{tolerance_percentage}%",
            title="Enterprise Benchmark Framework"
        ))

    async def run_benchmark(self, iterations: int = 5) -> BenchmarkSuite:
        """
        Run comprehensive benchmark across multiple iterations.

        Args:
            iterations: Number of benchmark iterations to run

        Returns:
            BenchmarkSuite with complete performance analysis
        """

        console.print(f"\n[bold cyan]Starting {iterations} benchmark iterations...[/bold cyan]")

        results: List[BenchmarkResult] = []

        with Progress() as progress:
            task = progress.add_task("[cyan]Running benchmark...", total=iterations)

            for i in range(iterations):
                progress.console.print(f"\n[bold green]→ Iteration {i+1}/{iterations}[/bold green]")

                # Run single benchmark iteration
                result = await self._run_single_iteration(i + 1, progress)
                results.append(result)

                # Display iteration summary
                status_color = "green" if result.accuracy >= self.target_accuracy else "red"
                perf_color = "green" if result.execution_time <= self.performance_target else "red"

                progress.console.print(
                    f" Accuracy: [{status_color}]{result.accuracy:.1f}%[/{status_color}] | "
                    f"Time: [{perf_color}]{result.execution_time:.1f}s[/{perf_color}] | "
                    f"Passed: {result.passed_validations}/{result.total_validations}"
                )

                progress.advance(task)

        # Calculate benchmark suite statistics
        return self._calculate_benchmark_statistics(results)

    async def _run_single_iteration(self, iteration: int, progress: Progress) -> BenchmarkResult:
        """Run single benchmark iteration."""

        start_time = time.time()

        # Initialize validator for this iteration
        validator = MCPValidator(
            tolerance_percentage=self.tolerance_percentage,
            performance_target_seconds=self.performance_target
        )

        # Run validation
        try:
            report = await validator.validate_all_operations()

            execution_time = time.time() - start_time

            return BenchmarkResult(
                iteration=iteration,
                accuracy=report.overall_accuracy,
                execution_time=execution_time,
                passed_validations=report.passed_validations,
                total_validations=report.total_validations,
                timestamp=datetime.now(),
                details=report
            )

        except Exception as e:
            execution_time = time.time() - start_time
            progress.console.print(f"[red]Iteration {iteration} failed: {e}[/red]")

            # Return failed iteration
            return BenchmarkResult(
                iteration=iteration,
                accuracy=0.0,
                execution_time=execution_time,
                passed_validations=0,
                total_validations=5,  # Expected number of validations
                timestamp=datetime.now(),
                details=None
            )

    def _calculate_benchmark_statistics(self, results: List[BenchmarkResult]) -> BenchmarkSuite:
        """Calculate comprehensive benchmark statistics."""

        if not results:
            raise ValueError("No benchmark results to analyze")

        # Basic statistics
        accuracies = [r.accuracy for r in results]
        times = [r.execution_time for r in results]

        avg_accuracy = statistics.mean(accuracies)
        avg_execution_time = statistics.mean(times)
        min_accuracy = min(accuracies)
        max_accuracy = max(accuracies)
        min_execution_time = min(times)
        max_execution_time = max(times)

        # Calculate standard deviations
        accuracy_std_dev = statistics.stdev(accuracies) if len(accuracies) > 1 else 0.0
        time_std_dev = statistics.stdev(times) if len(times) > 1 else 0.0

        # Success rate (meeting target accuracy)
        successful_iterations = len([r for r in results if r.accuracy >= self.target_accuracy])
        success_rate = (successful_iterations / len(results)) * 100

        # SRE reliability metrics
        availability = len([r for r in results if r.accuracy > 0]) / len(results) * 100

        # Reliability score (weighted accuracy + performance)
        accuracy_score = min(100, avg_accuracy / self.target_accuracy * 100)
        performance_score = min(100, self.performance_target / avg_execution_time * 100) if avg_execution_time > 0 else 0
        reliability_score = (accuracy_score * 0.7) + (performance_score * 0.3)  # 70% accuracy, 30% performance

        # Performance consistency (lower std dev = higher consistency)
        max_acceptable_std_dev = 5.0  # 5% standard deviation is acceptable
        performance_consistency = max(0, 100 - (accuracy_std_dev / max_acceptable_std_dev * 100))

        return BenchmarkSuite(
            target_accuracy=self.target_accuracy,
            performance_target=self.performance_target,
            iterations=len(results),
            results=results,
            avg_accuracy=avg_accuracy,
            avg_execution_time=avg_execution_time,
            min_accuracy=min_accuracy,
            max_accuracy=max_accuracy,
            min_execution_time=min_execution_time,
            max_execution_time=max_execution_time,
            accuracy_std_dev=accuracy_std_dev,
            time_std_dev=time_std_dev,
            success_rate=success_rate,
            availability=availability,
            reliability_score=reliability_score,
            performance_consistency=performance_consistency
        )

    def display_benchmark_results(self, suite: BenchmarkSuite) -> None:
        """Display comprehensive benchmark results."""

        # Overall assessment
        overall_status = self._assess_benchmark_results(suite)
        status_color = "green" if overall_status == "PASSED" else "red" if overall_status == "FAILED" else "yellow"

        console.print(Panel(
            f"[bold {status_color}]Benchmark Status: {overall_status}[/bold {status_color}]\n"
            f"Average Accuracy: {suite.avg_accuracy:.2f}% (Target: {suite.target_accuracy}%)\n"
            f"Average Time: {suite.avg_execution_time:.1f}s (Target: <{suite.performance_target}s)\n"
            f"Success Rate: {suite.success_rate:.1f}% | Reliability: {suite.reliability_score:.1f}%",
            title="Benchmark Summary"
        ))

        # Detailed statistics table
        stats_table = Table(title="Performance Statistics", box=box.ROUNDED)
        stats_table.add_column("Metric", style="cyan", no_wrap=True)
        stats_table.add_column("Value", justify="right", style="bold")
        stats_table.add_column("Status", style="bold")

        # Accuracy metrics
        stats_table.add_row(
            "Average Accuracy",
            f"{suite.avg_accuracy:.2f}%",
            "✅ PASS" if suite.avg_accuracy >= suite.target_accuracy else "❌ FAIL"
        )
        stats_table.add_row(
            "Accuracy Range",
            f"{suite.min_accuracy:.1f}% - {suite.max_accuracy:.1f}%",
            "ℹ️ INFO"
        )
        stats_table.add_row(
            "Accuracy Std Dev",
            f"{suite.accuracy_std_dev:.2f}%",
            "✅ GOOD" if suite.accuracy_std_dev < 5.0 else "⚠️ HIGH"
        )

        # Performance metrics
        stats_table.add_row(
            "Average Time",
            f"{suite.avg_execution_time:.1f}s",
            "✅ PASS" if suite.avg_execution_time <= suite.performance_target else "❌ FAIL"
        )
        stats_table.add_row(
            "Time Range",
            f"{suite.min_execution_time:.1f}s - {suite.max_execution_time:.1f}s",
            "ℹ️ INFO"
        )
        stats_table.add_row(
            "Time Std Dev",
            f"{suite.time_std_dev:.1f}s",
            "✅ GOOD" if suite.time_std_dev < 5.0 else "⚠️ HIGH"
        )

        # SRE metrics
        stats_table.add_row(
            "Success Rate",
            f"{suite.success_rate:.1f}%",
            "✅ EXCELLENT" if suite.success_rate >= 80 else "❌ POOR"
        )
        stats_table.add_row(
            "Availability",
            f"{suite.availability:.1f}%",
            "✅ PASS" if suite.availability >= 99 else "❌ FAIL"
        )
        stats_table.add_row(
            "Reliability Score",
            f"{suite.reliability_score:.1f}%",
            "✅ EXCELLENT" if suite.reliability_score >= 90 else "⚠️ NEEDS WORK"
        )
        stats_table.add_row(
            "Consistency",
            f"{suite.performance_consistency:.1f}%",
            "✅ STABLE" if suite.performance_consistency >= 80 else "⚠️ VARIABLE"
        )

        console.print(stats_table)

        # Individual iteration results
        iterations_table = Table(title="Individual Iterations", box=box.MINIMAL)
        iterations_table.add_column("Iteration", justify="center")
        iterations_table.add_column("Accuracy", justify="right")
        iterations_table.add_column("Time (s)", justify="right")
        iterations_table.add_column("Passed/Total")
        iterations_table.add_column("Status", style="bold")

        for result in suite.results:
            status_color = "green" if result.accuracy >= suite.target_accuracy else "red"
            status = "PASS" if result.accuracy >= suite.target_accuracy and result.execution_time <= suite.performance_target else "FAIL"

            iterations_table.add_row(
                str(result.iteration),
                f"{result.accuracy:.1f}%",
                f"{result.execution_time:.1f}",
                f"{result.passed_validations}/{result.total_validations}",
                f"[{status_color}]{status}[/{status_color}]"
            )

        console.print(iterations_table)

        # Recommendations
        recommendations = self._generate_benchmark_recommendations(suite)
        if recommendations:
            console.print(Panel(
                "\n".join(f"• {rec}" for rec in recommendations),
                title="Recommendations",
                border_style="blue"
            ))

        # Save benchmark report
        self._save_benchmark_report(suite)

    def _assess_benchmark_results(self, suite: BenchmarkSuite) -> str:
        """Assess overall benchmark results."""

        accuracy_pass = suite.avg_accuracy >= suite.target_accuracy
        performance_pass = suite.avg_execution_time <= suite.performance_target
        reliability_pass = suite.reliability_score >= 90
        consistency_pass = suite.accuracy_std_dev < 5.0

        if accuracy_pass and performance_pass and reliability_pass:
            return "PASSED"
        elif accuracy_pass and performance_pass:
            return "WARNING"
        else:
            return "FAILED"

    def _generate_benchmark_recommendations(self, suite: BenchmarkSuite) -> List[str]:
        """Generate actionable recommendations based on benchmark results."""

        recommendations = []

        # Accuracy recommendations
        if suite.avg_accuracy < suite.target_accuracy:
            recommendations.append(f"🎯 Improve average accuracy from {suite.avg_accuracy:.1f}% to {suite.target_accuracy}%")
            recommendations.append("🔍 Review MCP integration and AWS API permissions")

        # Performance recommendations
        if suite.avg_execution_time > suite.performance_target:
            recommendations.append(f"⚡ Optimize performance from {suite.avg_execution_time:.1f}s to <{suite.performance_target}s")
            recommendations.append("🚀 Consider parallel validation and caching strategies")

        # Consistency recommendations
        if suite.accuracy_std_dev > 5.0:
            recommendations.append(f"📊 Improve consistency - accuracy std dev {suite.accuracy_std_dev:.1f}% is high")
            recommendations.append("🔧 Investigate sources of validation variance")

        # Reliability recommendations
        if suite.reliability_score < 90:
            recommendations.append(f"🛠️ Enhance reliability score from {suite.reliability_score:.1f}% to >90%")
            recommendations.append("📈 Focus on both accuracy and performance improvements")

        # Success rate recommendations
        if suite.success_rate < 80:
            recommendations.append(f"✅ Improve success rate from {suite.success_rate:.1f}% to >80%")
            recommendations.append("🎯 Address systematic issues causing validation failures")

        # Production readiness
        overall_status = self._assess_benchmark_results(suite)
        if overall_status == "PASSED":
            recommendations.append("🚀 Benchmark PASSED - Ready for production deployment")
        elif overall_status == "WARNING":
            recommendations.append("⚠️ Benchmark WARNING - Address consistency issues before production")
        else:
            recommendations.append("❌ Benchmark FAILED - Significant improvements needed before production")

        return recommendations

    def _save_benchmark_report(self, suite: BenchmarkSuite) -> None:
        """Save benchmark report to artifacts directory."""

        from pathlib import Path

        artifacts_dir = Path("./artifacts/benchmark")
        artifacts_dir.mkdir(parents=True, exist_ok=True)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_file = artifacts_dir / f"mcp_benchmark_{timestamp}.json"

        # Convert to serializable format
        report_data = {
            "benchmark_suite": {
                "target_accuracy": suite.target_accuracy,
                "performance_target": suite.performance_target,
                "iterations": suite.iterations,
                "avg_accuracy": suite.avg_accuracy,
                "avg_execution_time": suite.avg_execution_time,
                "min_accuracy": suite.min_accuracy,
                "max_accuracy": suite.max_accuracy,
                "min_execution_time": suite.min_execution_time,
                "max_execution_time": suite.max_execution_time,
                "accuracy_std_dev": suite.accuracy_std_dev,
                "time_std_dev": suite.time_std_dev,
                "success_rate": suite.success_rate,
                "availability": suite.availability,
                "reliability_score": suite.reliability_score,
                "performance_consistency": suite.performance_consistency
            },
            "results": [
                {
                    "iteration": r.iteration,
                    "accuracy": r.accuracy,
                    "execution_time": r.execution_time,
                    "passed_validations": r.passed_validations,
                    "total_validations": r.total_validations,
                    "timestamp": r.timestamp.isoformat()
                }
                for r in suite.results
            ],
            "assessment": self._assess_benchmark_results(suite),
            "recommendations": self._generate_benchmark_recommendations(suite)
        }

        with open(report_file, 'w') as f:
            json.dump(report_data, f, indent=2)

        console.print(f"[green]Benchmark report saved:[/green] {report_file}")

# CLI entry point
async def main():
    """CLI entry point for benchmark runner."""
    import argparse

    parser = argparse.ArgumentParser(description="MCP Validation Benchmark Suite")
    parser.add_argument('--iterations', type=int, default=5, help='Number of benchmark iterations')
    parser.add_argument('--target-accuracy', type=float, default=99.5, help='Target accuracy percentage')
    parser.add_argument('--performance-target', type=float, default=30.0, help='Performance target in seconds')
    parser.add_argument('--tolerance', type=float, default=5.0, help='Tolerance percentage')

    args = parser.parse_args()

    runner = MCPBenchmarkRunner(
        target_accuracy=args.target_accuracy,
        performance_target=args.performance_target,
        tolerance_percentage=args.tolerance
    )

    suite = await runner.run_benchmark(args.iterations)
    runner.display_benchmark_results(suite)

    # Exit with appropriate code
    overall_status = runner._assess_benchmark_results(suite)
    if overall_status == "PASSED":
        exit(0)
    elif overall_status == "WARNING":
        exit(1)
    else:
        exit(2)

if __name__ == '__main__':
    asyncio.run(main())
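For orientation, the sketch below (not part of the published diff) shows one way the new MCPBenchmarkRunner from runbooks/validation/benchmark.py could be driven programmatically rather than through `python -m runbooks.validation.benchmark`. The import path, constructor arguments, and result fields mirror the code above; the wrapper function name and printed summary line are illustrative only, and running it assumes the runbooks 0.7.9 wheel is installed and MCPValidator can reach the AWS environment it validates against.

# Hypothetical usage sketch; assumes runbooks 0.7.9 is installed and the
# MCP validation framework is configured for the target accounts.
import asyncio

from runbooks.validation.benchmark import MCPBenchmarkRunner


async def run_quick_benchmark() -> None:
    # Constructor arguments match the defaults defined in the new module.
    runner = MCPBenchmarkRunner(
        target_accuracy=99.5,       # accuracy target in percent
        performance_target=30.0,    # per-iteration time budget in seconds
        tolerance_percentage=5.0,   # variance tolerance passed to MCPValidator
    )

    # Run a short benchmark and render the Rich summary tables.
    suite = await runner.run_benchmark(iterations=3)
    runner.display_benchmark_results(suite)

    # Suite fields such as avg_accuracy and reliability_score are available
    # for further processing (e.g., CI gating).
    print(f"avg accuracy: {suite.avg_accuracy:.2f}%, reliability: {suite.reliability_score:.1f}%")


if __name__ == "__main__":
    asyncio.run(run_quick_benchmark())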