runbooks-0.7.9-py3-none-any.whl → runbooks-0.9.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- runbooks/__init__.py +1 -1
- runbooks/cfat/README.md +12 -1
- runbooks/cfat/__init__.py +1 -1
- runbooks/cfat/assessment/compliance.py +4 -1
- runbooks/cfat/assessment/runner.py +42 -34
- runbooks/cfat/models.py +1 -1
- runbooks/cloudops/__init__.py +123 -0
- runbooks/cloudops/base.py +385 -0
- runbooks/cloudops/cost_optimizer.py +811 -0
- runbooks/cloudops/infrastructure_optimizer.py +29 -0
- runbooks/cloudops/interfaces.py +828 -0
- runbooks/cloudops/lifecycle_manager.py +29 -0
- runbooks/cloudops/mcp_cost_validation.py +678 -0
- runbooks/cloudops/models.py +251 -0
- runbooks/cloudops/monitoring_automation.py +29 -0
- runbooks/cloudops/notebook_framework.py +676 -0
- runbooks/cloudops/security_enforcer.py +449 -0
- runbooks/common/__init__.py +152 -0
- runbooks/common/accuracy_validator.py +1039 -0
- runbooks/common/context_logger.py +440 -0
- runbooks/common/cross_module_integration.py +594 -0
- runbooks/common/enhanced_exception_handler.py +1108 -0
- runbooks/common/enterprise_audit_integration.py +634 -0
- runbooks/common/mcp_cost_explorer_integration.py +900 -0
- runbooks/common/mcp_integration.py +548 -0
- runbooks/common/performance_monitor.py +387 -0
- runbooks/common/profile_utils.py +216 -0
- runbooks/common/rich_utils.py +172 -1
- runbooks/feedback/user_feedback_collector.py +440 -0
- runbooks/finops/README.md +377 -458
- runbooks/finops/__init__.py +4 -21
- runbooks/finops/account_resolver.py +279 -0
- runbooks/finops/accuracy_cross_validator.py +638 -0
- runbooks/finops/aws_client.py +721 -36
- runbooks/finops/budget_integration.py +313 -0
- runbooks/finops/cli.py +59 -5
- runbooks/finops/cost_optimizer.py +1340 -0
- runbooks/finops/cost_processor.py +211 -37
- runbooks/finops/dashboard_router.py +900 -0
- runbooks/finops/dashboard_runner.py +990 -232
- runbooks/finops/embedded_mcp_validator.py +288 -0
- runbooks/finops/enhanced_dashboard_runner.py +8 -7
- runbooks/finops/enhanced_progress.py +327 -0
- runbooks/finops/enhanced_trend_visualization.py +423 -0
- runbooks/finops/finops_dashboard.py +184 -1829
- runbooks/finops/helpers.py +509 -196
- runbooks/finops/iam_guidance.py +400 -0
- runbooks/finops/markdown_exporter.py +466 -0
- runbooks/finops/multi_dashboard.py +1502 -0
- runbooks/finops/optimizer.py +15 -15
- runbooks/finops/profile_processor.py +2 -2
- runbooks/finops/runbooks.inventory.organizations_discovery.log +0 -0
- runbooks/finops/runbooks.security.report_generator.log +0 -0
- runbooks/finops/runbooks.security.run_script.log +0 -0
- runbooks/finops/runbooks.security.security_export.log +0 -0
- runbooks/finops/schemas.py +589 -0
- runbooks/finops/service_mapping.py +195 -0
- runbooks/finops/single_dashboard.py +710 -0
- runbooks/finops/tests/test_reference_images_validation.py +1 -1
- runbooks/inventory/README.md +12 -1
- runbooks/inventory/core/collector.py +157 -29
- runbooks/inventory/list_ec2_instances.py +9 -6
- runbooks/inventory/list_ssm_parameters.py +10 -10
- runbooks/inventory/organizations_discovery.py +210 -164
- runbooks/inventory/rich_inventory_display.py +74 -107
- runbooks/inventory/run_on_multi_accounts.py +13 -13
- runbooks/inventory/runbooks.inventory.organizations_discovery.log +0 -0
- runbooks/inventory/runbooks.security.security_export.log +0 -0
- runbooks/main.py +1371 -240
- runbooks/metrics/dora_metrics_engine.py +711 -17
- runbooks/monitoring/performance_monitor.py +433 -0
- runbooks/operate/README.md +394 -0
- runbooks/operate/base.py +215 -47
- runbooks/operate/ec2_operations.py +435 -5
- runbooks/operate/iam_operations.py +598 -3
- runbooks/operate/privatelink_operations.py +1 -1
- runbooks/operate/rds_operations.py +508 -0
- runbooks/operate/s3_operations.py +508 -0
- runbooks/operate/vpc_endpoints.py +1 -1
- runbooks/remediation/README.md +489 -13
- runbooks/remediation/base.py +5 -3
- runbooks/remediation/commons.py +8 -4
- runbooks/security/ENTERPRISE_SECURITY_FRAMEWORK.md +506 -0
- runbooks/security/README.md +12 -1
- runbooks/security/__init__.py +265 -33
- runbooks/security/cloudops_automation_security_validator.py +1164 -0
- runbooks/security/compliance_automation.py +12 -10
- runbooks/security/compliance_automation_engine.py +1021 -0
- runbooks/security/enterprise_security_framework.py +930 -0
- runbooks/security/enterprise_security_policies.json +293 -0
- runbooks/security/executive_security_dashboard.py +1247 -0
- runbooks/security/integration_test_enterprise_security.py +879 -0
- runbooks/security/module_security_integrator.py +641 -0
- runbooks/security/multi_account_security_controls.py +2254 -0
- runbooks/security/real_time_security_monitor.py +1196 -0
- runbooks/security/report_generator.py +1 -1
- runbooks/security/run_script.py +4 -8
- runbooks/security/security_baseline_tester.py +39 -52
- runbooks/security/security_export.py +99 -120
- runbooks/sre/README.md +472 -0
- runbooks/sre/__init__.py +33 -0
- runbooks/sre/mcp_reliability_engine.py +1049 -0
- runbooks/sre/performance_optimization_engine.py +1032 -0
- runbooks/sre/production_monitoring_framework.py +584 -0
- runbooks/sre/reliability_monitoring_framework.py +1011 -0
- runbooks/validation/__init__.py +2 -2
- runbooks/validation/benchmark.py +154 -149
- runbooks/validation/cli.py +159 -147
- runbooks/validation/mcp_validator.py +291 -248
- runbooks/vpc/README.md +478 -0
- runbooks/vpc/__init__.py +2 -2
- runbooks/vpc/manager_interface.py +366 -351
- runbooks/vpc/networking_wrapper.py +68 -36
- runbooks/vpc/rich_formatters.py +22 -8
- runbooks-0.9.1.dist-info/METADATA +308 -0
- {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/RECORD +120 -59
- {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/entry_points.txt +1 -1
- runbooks/finops/cross_validation.py +0 -375
- runbooks-0.7.9.dist-info/METADATA +0 -636
- {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/WHEEL +0 -0
- {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/licenses/LICENSE +0 -0
- {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/top_level.txt +0 -0
runbooks/validation/__init__.py
CHANGED
@@ -5,6 +5,6 @@ Provides comprehensive validation between runbooks outputs and MCP server results
 for enterprise AWS operations with 99.5% accuracy target.
 """
 
-from .mcp_validator import MCPValidator,
+from .mcp_validator import MCPValidator, ValidationReport, ValidationResult, ValidationStatus
 
-__all__ = [
+__all__ = ["MCPValidator", "ValidationResult", "ValidationReport", "ValidationStatus"]
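With the widened export list above, the package appears to re-export the validator types at the top level. A minimal, hedged usage sketch (the constructor keyword arguments and report attributes are taken from the benchmark.py diff that follows; treat the exact signatures as assumptions):

```python
# Sketch only: names come from the __all__ shown above; constructor kwargs and
# report attributes mirror their usage in the benchmark.py diff below.
import asyncio

from runbooks.validation import MCPValidator


async def check_accuracy() -> None:
    validator = MCPValidator(tolerance_percentage=5.0, performance_target_seconds=30.0)
    report = await validator.validate_all_operations()
    print(f"{report.overall_accuracy:.1f}% ({report.passed_validations}/{report.total_validations} passed)")


asyncio.run(check_accuracy())
```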
runbooks/validation/benchmark.py
CHANGED
@@ -14,26 +14,28 @@ Usage:
 """
 
 import asyncio
-import
+import json
 import statistics
-
-from typing import List, Dict, Any
+import time
 from dataclasses import dataclass
-import
+from datetime import datetime
+from typing import Any, Dict, List
 
+from rich import box
 from rich.console import Console
-from rich.table import Table
 from rich.panel import Panel
 from rich.progress import Progress, TaskID
-from rich import
+from rich.table import Table
 
 from .mcp_validator import MCPValidator, ValidationReport, ValidationStatus
 
 console = Console()
 
+
 @dataclass
 class BenchmarkResult:
     """Individual benchmark iteration result."""
+
     iteration: int
     accuracy: float
     execution_time: float
@@ -42,14 +44,16 @@ class BenchmarkResult:
     timestamp: datetime
     details: ValidationReport
 
+
 @dataclass
 class BenchmarkSuite:
     """Complete benchmark suite results."""
+
     target_accuracy: float
     performance_target: float
     iterations: int
     results: List[BenchmarkResult]
-
+
     # Summary statistics
     avg_accuracy: float
     avg_execution_time: float
@@ -60,95 +64,96 @@ class BenchmarkSuite:
     accuracy_std_dev: float
     time_std_dev: float
     success_rate: float
-
+
     # SRE metrics
     availability: float  # % of successful validations
     reliability_score: float  # Combined accuracy + performance
     performance_consistency: float  # Low variance = high consistency
 
+
 class MCPBenchmarkRunner:
     """
     Enterprise benchmark runner for MCP validation framework.
-
+
     Provides comprehensive performance testing with SRE reliability metrics
     and enterprise reporting for production deployment validation.
     """
-
-    def __init__(
-
-
-        tolerance_percentage: float = 5.0):
+
+    def __init__(
+        self, target_accuracy: float = 99.5, performance_target: float = 30.0, tolerance_percentage: float = 5.0
+    ):
         """Initialize benchmark runner."""
-
+
         self.target_accuracy = target_accuracy
         self.performance_target = performance_target
         self.tolerance_percentage = tolerance_percentage
-
-        console.print(
-
-
-
-
-
-
-
+
+        console.print(
+            Panel(
+                f"[bold blue]MCP Validation Benchmark Suite[/bold blue]\n"
+                f"Target Accuracy: {target_accuracy}%\n"
+                f"Performance Target: <{performance_target}s\n"
+                f"Tolerance: ±{tolerance_percentage}%",
+                title="Enterprise Benchmark Framework",
+            )
+        )
+
     async def run_benchmark(self, iterations: int = 5) -> BenchmarkSuite:
         """
         Run comprehensive benchmark across multiple iterations.
-
+
         Args:
             iterations: Number of benchmark iterations to run
-
+
         Returns:
             BenchmarkSuite with complete performance analysis
         """
-
+
         console.print(f"\n[bold cyan]Starting {iterations} benchmark iterations...[/bold cyan]")
-
+
         results: List[BenchmarkResult] = []
-
+
         with Progress() as progress:
             task = progress.add_task("[cyan]Running benchmark...", total=iterations)
-
+
             for i in range(iterations):
-                progress.console.print(f"\n[bold green]→ Iteration {i+1}/{iterations}[/bold green]")
-
+                progress.console.print(f"\n[bold green]→ Iteration {i + 1}/{iterations}[/bold green]")
+
                 # Run single benchmark iteration
                 result = await self._run_single_iteration(i + 1, progress)
                 results.append(result)
-
+
                 # Display iteration summary
                 status_color = "green" if result.accuracy >= self.target_accuracy else "red"
                 perf_color = "green" if result.execution_time <= self.performance_target else "red"
-
+
                 progress.console.print(
                     f" Accuracy: [{status_color}]{result.accuracy:.1f}%[/{status_color}] | "
                     f"Time: [{perf_color}]{result.execution_time:.1f}s[/{perf_color}] | "
                    f"Passed: {result.passed_validations}/{result.total_validations}"
                 )
-
+
                 progress.advance(task)
-
+
         # Calculate benchmark suite statistics
         return self._calculate_benchmark_statistics(results)
-
+
     async def _run_single_iteration(self, iteration: int, progress: Progress) -> BenchmarkResult:
         """Run single benchmark iteration."""
-
+
         start_time = time.time()
-
+
         # Initialize validator for this iteration
         validator = MCPValidator(
-            tolerance_percentage=self.tolerance_percentage,
-            performance_target_seconds=self.performance_target
+            tolerance_percentage=self.tolerance_percentage, performance_target_seconds=self.performance_target
        )
-
+
         # Run validation
         try:
             report = await validator.validate_all_operations()
-
+
             execution_time = time.time() - start_time
-
+
             return BenchmarkResult(
                 iteration=iteration,
                 accuracy=report.overall_accuracy,
@@ -156,13 +161,13 @@ class MCPBenchmarkRunner:
                 passed_validations=report.passed_validations,
                 total_validations=report.total_validations,
                 timestamp=datetime.now(),
-                details=report
+                details=report,
             )
-
+
         except Exception as e:
             execution_time = time.time() - start_time
             progress.console.print(f"[red]Iteration {iteration} failed: {e}[/red]")
-
+
             # Return failed iteration
             return BenchmarkResult(
                 iteration=iteration,
@@ -171,46 +176,48 @@ class MCPBenchmarkRunner:
                 passed_validations=0,
                 total_validations=5,  # Expected number of validations
                 timestamp=datetime.now(),
-                details=None
+                details=None,
             )
-
+
     def _calculate_benchmark_statistics(self, results: List[BenchmarkResult]) -> BenchmarkSuite:
         """Calculate comprehensive benchmark statistics."""
-
+
         if not results:
             raise ValueError("No benchmark results to analyze")
-
+
         # Basic statistics
         accuracies = [r.accuracy for r in results]
         times = [r.execution_time for r in results]
-
+
         avg_accuracy = statistics.mean(accuracies)
         avg_execution_time = statistics.mean(times)
         min_accuracy = min(accuracies)
         max_accuracy = max(accuracies)
         min_execution_time = min(times)
         max_execution_time = max(times)
-
+
         # Calculate standard deviations
         accuracy_std_dev = statistics.stdev(accuracies) if len(accuracies) > 1 else 0.0
         time_std_dev = statistics.stdev(times) if len(times) > 1 else 0.0
-
+
         # Success rate (meeting target accuracy)
         successful_iterations = len([r for r in results if r.accuracy >= self.target_accuracy])
         success_rate = (successful_iterations / len(results)) * 100
-
+
         # SRE reliability metrics
         availability = len([r for r in results if r.accuracy > 0]) / len(results) * 100
-
+
         # Reliability score (weighted accuracy + performance)
         accuracy_score = min(100, avg_accuracy / self.target_accuracy * 100)
-        performance_score =
+        performance_score = (
+            min(100, self.performance_target / avg_execution_time * 100) if avg_execution_time > 0 else 0
+        )
         reliability_score = (accuracy_score * 0.7) + (performance_score * 0.3)  # 70% accuracy, 30% performance
-
+
         # Performance consistency (lower std dev = higher consistency)
         max_acceptable_std_dev = 5.0  # 5% standard deviation is acceptable
         performance_consistency = max(0, 100 - (accuracy_std_dev / max_acceptable_std_dev * 100))
-
+
         return BenchmarkSuite(
             target_accuracy=self.target_accuracy,
             performance_target=self.performance_target,
@@ -227,88 +234,78 @@ class MCPBenchmarkRunner:
             success_rate=success_rate,
             availability=availability,
             reliability_score=reliability_score,
-            performance_consistency=performance_consistency
+            performance_consistency=performance_consistency,
         )
-
+
     def display_benchmark_results(self, suite: BenchmarkSuite) -> None:
         """Display comprehensive benchmark results."""
-
+
         # Overall assessment
         overall_status = self._assess_benchmark_results(suite)
         status_color = "green" if overall_status == "PASSED" else "red" if overall_status == "FAILED" else "yellow"
-
-        console.print(
-
-
-
-
-
-
-
+
+        console.print(
+            Panel(
+                f"[bold {status_color}]Benchmark Status: {overall_status}[/bold {status_color}]\n"
+                f"Average Accuracy: {suite.avg_accuracy:.2f}% (Target: {suite.target_accuracy}%)\n"
+                f"Average Time: {suite.avg_execution_time:.1f}s (Target: <{suite.performance_target}s)\n"
+                f"Success Rate: {suite.success_rate:.1f}% | Reliability: {suite.reliability_score:.1f}%",
+                title="Benchmark Summary",
+            )
+        )
+
         # Detailed statistics table
         stats_table = Table(title="Performance Statistics", box=box.ROUNDED)
         stats_table.add_column("Metric", style="cyan", no_wrap=True)
         stats_table.add_column("Value", justify="right", style="bold")
         stats_table.add_column("Status", style="bold")
-
+
         # Accuracy metrics
         stats_table.add_row(
-            "Average Accuracy",
+            "Average Accuracy",
             f"{suite.avg_accuracy:.2f}%",
-            "✅ PASS" if suite.avg_accuracy >= suite.target_accuracy else "❌ FAIL"
-        )
-        stats_table.add_row(
-            "Accuracy Range",
-            f"{suite.min_accuracy:.1f}% - {suite.max_accuracy:.1f}%",
-            "ℹ️ INFO"
+            "✅ PASS" if suite.avg_accuracy >= suite.target_accuracy else "❌ FAIL",
         )
+        stats_table.add_row("Accuracy Range", f"{suite.min_accuracy:.1f}% - {suite.max_accuracy:.1f}%", "ℹ️ INFO")
         stats_table.add_row(
             "Accuracy Std Dev",
             f"{suite.accuracy_std_dev:.2f}%",
-            "✅ GOOD" if suite.accuracy_std_dev < 5.0 else "⚠️ HIGH"
+            "✅ GOOD" if suite.accuracy_std_dev < 5.0 else "⚠️ HIGH",
         )
-
+
         # Performance metrics
         stats_table.add_row(
             "Average Time",
             f"{suite.avg_execution_time:.1f}s",
-            "✅ PASS" if suite.avg_execution_time <= suite.performance_target else "❌ FAIL"
+            "✅ PASS" if suite.avg_execution_time <= suite.performance_target else "❌ FAIL",
         )
         stats_table.add_row(
-            "Time Range",
-            f"{suite.min_execution_time:.1f}s - {suite.max_execution_time:.1f}s",
-            "ℹ️ INFO"
+            "Time Range", f"{suite.min_execution_time:.1f}s - {suite.max_execution_time:.1f}s", "ℹ️ INFO"
         )
         stats_table.add_row(
-            "Time Std Dev",
-            f"{suite.time_std_dev:.1f}s",
-            "✅ GOOD" if suite.time_std_dev < 5.0 else "⚠️ HIGH"
+            "Time Std Dev", f"{suite.time_std_dev:.1f}s", "✅ GOOD" if suite.time_std_dev < 5.0 else "⚠️ HIGH"
         )
-
+
         # SRE metrics
         stats_table.add_row(
-            "Success Rate",
-            f"{suite.success_rate:.1f}%",
-            "✅ EXCELLENT" if suite.success_rate >= 80 else "❌ POOR"
+            "Success Rate", f"{suite.success_rate:.1f}%", "✅ EXCELLENT" if suite.success_rate >= 80 else "❌ POOR"
        )
         stats_table.add_row(
-            "Availability",
-            f"{suite.availability:.1f}%",
-            "✅ PASS" if suite.availability >= 99 else "❌ FAIL"
+            "Availability", f"{suite.availability:.1f}%", "✅ PASS" if suite.availability >= 99 else "❌ FAIL"
        )
         stats_table.add_row(
             "Reliability Score",
             f"{suite.reliability_score:.1f}%",
-            "✅ EXCELLENT" if suite.reliability_score >= 90 else "⚠️ NEEDS WORK"
+            "✅ EXCELLENT" if suite.reliability_score >= 90 else "⚠️ NEEDS WORK",
         )
         stats_table.add_row(
             "Consistency",
             f"{suite.performance_consistency:.1f}%",
-            "✅ STABLE" if suite.performance_consistency >= 80 else "⚠️ VARIABLE"
+            "✅ STABLE" if suite.performance_consistency >= 80 else "⚠️ VARIABLE",
         )
-
+
         console.print(stats_table)
-
+
         # Individual iteration results
         iterations_table = Table(title="Individual Iterations", box=box.MINIMAL)
         iterations_table.add_column("Iteration", justify="center")
@@ -316,78 +313,84 @@ class MCPBenchmarkRunner:
         iterations_table.add_column("Time (s)", justify="right")
         iterations_table.add_column("Passed/Total")
         iterations_table.add_column("Status", style="bold")
-
+
         for result in suite.results:
             status_color = "green" if result.accuracy >= suite.target_accuracy else "red"
-            status =
-
+            status = (
+                "PASS"
+                if result.accuracy >= suite.target_accuracy and result.execution_time <= suite.performance_target
+                else "FAIL"
+            )
+
             iterations_table.add_row(
                 str(result.iteration),
                 f"{result.accuracy:.1f}%",
                 f"{result.execution_time:.1f}",
                 f"{result.passed_validations}/{result.total_validations}",
-                f"[{status_color}]{status}[/{status_color}]"
+                f"[{status_color}]{status}[/{status_color}]",
             )
-
+
         console.print(iterations_table)
-
+
         # Recommendations
         recommendations = self._generate_benchmark_recommendations(suite)
         if recommendations:
-            console.print(
-                "\n".join(f"• {rec}" for rec in recommendations),
-
-
-            ))
-
+            console.print(
+                Panel("\n".join(f"• {rec}" for rec in recommendations), title="Recommendations", border_style="blue")
+            )
+
         # Save benchmark report
         self._save_benchmark_report(suite)
-
+
     def _assess_benchmark_results(self, suite: BenchmarkSuite) -> str:
         """Assess overall benchmark results."""
-
+
         accuracy_pass = suite.avg_accuracy >= suite.target_accuracy
         performance_pass = suite.avg_execution_time <= suite.performance_target
         reliability_pass = suite.reliability_score >= 90
         consistency_pass = suite.accuracy_std_dev < 5.0
-
+
         if accuracy_pass and performance_pass and reliability_pass:
             return "PASSED"
         elif accuracy_pass and performance_pass:
             return "WARNING"
         else:
             return "FAILED"
-
+
     def _generate_benchmark_recommendations(self, suite: BenchmarkSuite) -> List[str]:
         """Generate actionable recommendations based on benchmark results."""
-
+
         recommendations = []
-
+
         # Accuracy recommendations
         if suite.avg_accuracy < suite.target_accuracy:
-            recommendations.append(
+            recommendations.append(
+                f"🎯 Improve average accuracy from {suite.avg_accuracy:.1f}% to {suite.target_accuracy}%"
+            )
             recommendations.append("🔍 Review MCP integration and AWS API permissions")
-
-        # Performance recommendations
+
+        # Performance recommendations
         if suite.avg_execution_time > suite.performance_target:
-            recommendations.append(
+            recommendations.append(
+                f"⚡ Optimize performance from {suite.avg_execution_time:.1f}s to <{suite.performance_target}s"
+            )
             recommendations.append("🚀 Consider parallel validation and caching strategies")
-
+
         # Consistency recommendations
         if suite.accuracy_std_dev > 5.0:
             recommendations.append(f"📊 Improve consistency - accuracy std dev {suite.accuracy_std_dev:.1f}% is high")
             recommendations.append("🔧 Investigate sources of validation variance")
-
+
         # Reliability recommendations
         if suite.reliability_score < 90:
             recommendations.append(f"🛠️ Enhance reliability score from {suite.reliability_score:.1f}% to >90%")
             recommendations.append("📈 Focus on both accuracy and performance improvements")
-
+
         # Success rate recommendations
         if suite.success_rate < 80:
             recommendations.append(f"✅ Improve success rate from {suite.success_rate:.1f}% to >80%")
             recommendations.append("🎯 Address systematic issues causing validation failures")
-
+
         # Production readiness
         overall_status = self._assess_benchmark_results(suite)
         if overall_status == "PASSED":
@@ -396,20 +399,20 @@ class MCPBenchmarkRunner:
             recommendations.append("⚠️ Benchmark WARNING - Address consistency issues before production")
         else:
             recommendations.append("❌ Benchmark FAILED - Significant improvements needed before production")
-
+
         return recommendations
-
+
     def _save_benchmark_report(self, suite: BenchmarkSuite) -> None:
         """Save benchmark report to artifacts directory."""
-
+
         from pathlib import Path
-
+
         artifacts_dir = Path("./artifacts/benchmark")
         artifacts_dir.mkdir(parents=True, exist_ok=True)
-
+
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         report_file = artifacts_dir / f"mcp_benchmark_{timestamp}.json"
-
+
         # Convert to serializable format
         report_data = {
             "benchmark_suite": {
@@ -427,7 +430,7 @@
                 "success_rate": suite.success_rate,
                 "availability": suite.availability,
                 "reliability_score": suite.reliability_score,
-                "performance_consistency": suite.performance_consistency
+                "performance_consistency": suite.performance_consistency,
             },
             "results": [
                 {
@@ -436,41 +439,42 @@
                     "execution_time": r.execution_time,
                     "passed_validations": r.passed_validations,
                     "total_validations": r.total_validations,
-                    "timestamp": r.timestamp.isoformat()
+                    "timestamp": r.timestamp.isoformat(),
                 }
                 for r in suite.results
             ],
             "assessment": self._assess_benchmark_results(suite),
-            "recommendations": self._generate_benchmark_recommendations(suite)
+            "recommendations": self._generate_benchmark_recommendations(suite),
         }
-
-        with open(report_file,
+
+        with open(report_file, "w") as f:
             json.dump(report_data, f, indent=2)
-
+
         console.print(f"[green]Benchmark report saved:[/green] {report_file}")
 
+
 # CLI entry point
 async def main():
     """CLI entry point for benchmark runner."""
     import argparse
-
+
     parser = argparse.ArgumentParser(description="MCP Validation Benchmark Suite")
-    parser.add_argument(
-    parser.add_argument(
-    parser.add_argument(
-    parser.add_argument(
-
+    parser.add_argument("--iterations", type=int, default=5, help="Number of benchmark iterations")
+    parser.add_argument("--target-accuracy", type=float, default=99.5, help="Target accuracy percentage")
+    parser.add_argument("--performance-target", type=float, default=30.0, help="Performance target in seconds")
+    parser.add_argument("--tolerance", type=float, default=5.0, help="Tolerance percentage")
+
     args = parser.parse_args()
-
+
     runner = MCPBenchmarkRunner(
         target_accuracy=args.target_accuracy,
         performance_target=args.performance_target,
-        tolerance_percentage=args.tolerance
+        tolerance_percentage=args.tolerance,
    )
-
+
     suite = await runner.run_benchmark(args.iterations)
     runner.display_benchmark_results(suite)
-
+
     # Exit with appropriate code
     overall_status = runner._assess_benchmark_results(suite)
     if overall_status == "PASSED":
@@ -480,5 +484,6 @@ async def main():
     else:
         exit(2)
 
-
-
+
+if __name__ == "__main__":
+    asyncio.run(main())
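The new `__main__` guard means the benchmark can presumably be run as a script, and the runner class can also be driven programmatically. A rough sketch, assuming the import path follows the file location runbooks/validation/benchmark.py and reusing the argparse defaults shown above:

```python
# Sketch only: class, methods, and defaults mirror the diff above; the import
# path is an assumption based on the file's location in the package.
import asyncio

from runbooks.validation.benchmark import MCPBenchmarkRunner


async def run_suite() -> None:
    runner = MCPBenchmarkRunner(target_accuracy=99.5, performance_target=30.0, tolerance_percentage=5.0)
    suite = await runner.run_benchmark(iterations=5)
    runner.display_benchmark_results(suite)


asyncio.run(run_suite())
```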