runbooks-1.1.4-py3-none-any.whl → runbooks-1.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- runbooks/__init__.py +31 -2
- runbooks/__init___optimized.py +18 -4
- runbooks/_platform/__init__.py +1 -5
- runbooks/_platform/core/runbooks_wrapper.py +141 -138
- runbooks/aws2/accuracy_validator.py +812 -0
- runbooks/base.py +7 -0
- runbooks/cfat/assessment/compliance.py +1 -1
- runbooks/cfat/assessment/runner.py +1 -0
- runbooks/cfat/cloud_foundations_assessment.py +227 -239
- runbooks/cli/__init__.py +1 -1
- runbooks/cli/commands/cfat.py +64 -23
- runbooks/cli/commands/finops.py +1005 -54
- runbooks/cli/commands/inventory.py +138 -35
- runbooks/cli/commands/operate.py +9 -36
- runbooks/cli/commands/security.py +42 -18
- runbooks/cli/commands/validation.py +432 -18
- runbooks/cli/commands/vpc.py +81 -17
- runbooks/cli/registry.py +22 -10
- runbooks/cloudops/__init__.py +20 -27
- runbooks/cloudops/base.py +96 -107
- runbooks/cloudops/cost_optimizer.py +544 -542
- runbooks/cloudops/infrastructure_optimizer.py +5 -4
- runbooks/cloudops/interfaces.py +224 -225
- runbooks/cloudops/lifecycle_manager.py +5 -4
- runbooks/cloudops/mcp_cost_validation.py +252 -235
- runbooks/cloudops/models.py +78 -53
- runbooks/cloudops/monitoring_automation.py +5 -4
- runbooks/cloudops/notebook_framework.py +177 -213
- runbooks/cloudops/security_enforcer.py +125 -159
- runbooks/common/accuracy_validator.py +11 -0
- runbooks/common/aws_pricing.py +349 -326
- runbooks/common/aws_pricing_api.py +211 -212
- runbooks/common/aws_profile_manager.py +40 -36
- runbooks/common/aws_utils.py +74 -79
- runbooks/common/business_logic.py +126 -104
- runbooks/common/cli_decorators.py +36 -60
- runbooks/common/comprehensive_cost_explorer_integration.py +455 -463
- runbooks/common/cross_account_manager.py +197 -204
- runbooks/common/date_utils.py +27 -39
- runbooks/common/decorators.py +29 -19
- runbooks/common/dry_run_examples.py +173 -208
- runbooks/common/dry_run_framework.py +157 -155
- runbooks/common/enhanced_exception_handler.py +15 -4
- runbooks/common/enhanced_logging_example.py +50 -64
- runbooks/common/enhanced_logging_integration_example.py +65 -37
- runbooks/common/env_utils.py +16 -16
- runbooks/common/error_handling.py +40 -38
- runbooks/common/lazy_loader.py +41 -23
- runbooks/common/logging_integration_helper.py +79 -86
- runbooks/common/mcp_cost_explorer_integration.py +476 -493
- runbooks/common/mcp_integration.py +63 -74
- runbooks/common/memory_optimization.py +140 -118
- runbooks/common/module_cli_base.py +37 -58
- runbooks/common/organizations_client.py +175 -193
- runbooks/common/patterns.py +23 -25
- runbooks/common/performance_monitoring.py +67 -71
- runbooks/common/performance_optimization_engine.py +283 -274
- runbooks/common/profile_utils.py +111 -37
- runbooks/common/rich_utils.py +201 -141
- runbooks/common/sre_performance_suite.py +177 -186
- runbooks/enterprise/__init__.py +1 -1
- runbooks/enterprise/logging.py +144 -106
- runbooks/enterprise/security.py +187 -204
- runbooks/enterprise/validation.py +43 -56
- runbooks/finops/__init__.py +26 -30
- runbooks/finops/account_resolver.py +1 -1
- runbooks/finops/advanced_optimization_engine.py +980 -0
- runbooks/finops/automation_core.py +268 -231
- runbooks/finops/business_case_config.py +184 -179
- runbooks/finops/cli.py +660 -139
- runbooks/finops/commvault_ec2_analysis.py +157 -164
- runbooks/finops/compute_cost_optimizer.py +336 -320
- runbooks/finops/config.py +20 -20
- runbooks/finops/cost_optimizer.py +484 -618
- runbooks/finops/cost_processor.py +332 -214
- runbooks/finops/dashboard_runner.py +1006 -172
- runbooks/finops/ebs_cost_optimizer.py +991 -657
- runbooks/finops/elastic_ip_optimizer.py +317 -257
- runbooks/finops/enhanced_mcp_integration.py +340 -0
- runbooks/finops/enhanced_progress.py +32 -29
- runbooks/finops/enhanced_trend_visualization.py +3 -2
- runbooks/finops/enterprise_wrappers.py +223 -285
- runbooks/finops/executive_export.py +203 -160
- runbooks/finops/helpers.py +130 -288
- runbooks/finops/iam_guidance.py +1 -1
- runbooks/finops/infrastructure/__init__.py +80 -0
- runbooks/finops/infrastructure/commands.py +506 -0
- runbooks/finops/infrastructure/load_balancer_optimizer.py +866 -0
- runbooks/finops/infrastructure/vpc_endpoint_optimizer.py +832 -0
- runbooks/finops/markdown_exporter.py +337 -174
- runbooks/finops/mcp_validator.py +1952 -0
- runbooks/finops/nat_gateway_optimizer.py +1512 -481
- runbooks/finops/network_cost_optimizer.py +657 -587
- runbooks/finops/notebook_utils.py +226 -188
- runbooks/finops/optimization_engine.py +1136 -0
- runbooks/finops/optimizer.py +19 -23
- runbooks/finops/rds_snapshot_optimizer.py +367 -411
- runbooks/finops/reservation_optimizer.py +427 -363
- runbooks/finops/scenario_cli_integration.py +64 -65
- runbooks/finops/scenarios.py +1277 -438
- runbooks/finops/schemas.py +218 -182
- runbooks/finops/snapshot_manager.py +2289 -0
- runbooks/finops/types.py +3 -3
- runbooks/finops/validation_framework.py +259 -265
- runbooks/finops/vpc_cleanup_exporter.py +189 -144
- runbooks/finops/vpc_cleanup_optimizer.py +591 -573
- runbooks/finops/workspaces_analyzer.py +171 -182
- runbooks/integration/__init__.py +89 -0
- runbooks/integration/mcp_integration.py +1920 -0
- runbooks/inventory/CLAUDE.md +816 -0
- runbooks/inventory/__init__.py +2 -2
- runbooks/inventory/cloud_foundations_integration.py +144 -149
- runbooks/inventory/collectors/aws_comprehensive.py +1 -1
- runbooks/inventory/collectors/aws_networking.py +109 -99
- runbooks/inventory/collectors/base.py +4 -0
- runbooks/inventory/core/collector.py +495 -313
- runbooks/inventory/drift_detection_cli.py +69 -96
- runbooks/inventory/inventory_mcp_cli.py +48 -46
- runbooks/inventory/list_rds_snapshots_aggregator.py +192 -208
- runbooks/inventory/mcp_inventory_validator.py +549 -465
- runbooks/inventory/mcp_vpc_validator.py +359 -442
- runbooks/inventory/organizations_discovery.py +55 -51
- runbooks/inventory/rich_inventory_display.py +33 -32
- runbooks/inventory/unified_validation_engine.py +278 -251
- runbooks/inventory/vpc_analyzer.py +732 -695
- runbooks/inventory/vpc_architecture_validator.py +293 -348
- runbooks/inventory/vpc_dependency_analyzer.py +382 -378
- runbooks/inventory/vpc_flow_analyzer.py +1 -1
- runbooks/main.py +49 -34
- runbooks/main_final.py +91 -60
- runbooks/main_minimal.py +22 -10
- runbooks/main_optimized.py +131 -100
- runbooks/main_ultra_minimal.py +7 -2
- runbooks/mcp/__init__.py +36 -0
- runbooks/mcp/integration.py +679 -0
- runbooks/monitoring/performance_monitor.py +9 -4
- runbooks/operate/dynamodb_operations.py +3 -1
- runbooks/operate/ec2_operations.py +145 -137
- runbooks/operate/iam_operations.py +146 -152
- runbooks/operate/networking_cost_heatmap.py +29 -8
- runbooks/operate/rds_operations.py +223 -254
- runbooks/operate/s3_operations.py +107 -118
- runbooks/operate/vpc_operations.py +646 -616
- runbooks/remediation/base.py +1 -1
- runbooks/remediation/commons.py +10 -7
- runbooks/remediation/commvault_ec2_analysis.py +70 -66
- runbooks/remediation/ec2_unattached_ebs_volumes.py +1 -0
- runbooks/remediation/multi_account.py +24 -21
- runbooks/remediation/rds_snapshot_list.py +86 -60
- runbooks/remediation/remediation_cli.py +92 -146
- runbooks/remediation/universal_account_discovery.py +83 -79
- runbooks/remediation/workspaces_list.py +46 -41
- runbooks/security/__init__.py +19 -0
- runbooks/security/assessment_runner.py +1150 -0
- runbooks/security/baseline_checker.py +812 -0
- runbooks/security/cloudops_automation_security_validator.py +509 -535
- runbooks/security/compliance_automation_engine.py +17 -17
- runbooks/security/config/__init__.py +2 -2
- runbooks/security/config/compliance_config.py +50 -50
- runbooks/security/config_template_generator.py +63 -76
- runbooks/security/enterprise_security_framework.py +1 -1
- runbooks/security/executive_security_dashboard.py +519 -508
- runbooks/security/multi_account_security_controls.py +959 -1210
- runbooks/security/real_time_security_monitor.py +422 -444
- runbooks/security/security_baseline_tester.py +1 -1
- runbooks/security/security_cli.py +143 -112
- runbooks/security/test_2way_validation.py +439 -0
- runbooks/security/two_way_validation_framework.py +852 -0
- runbooks/sre/production_monitoring_framework.py +167 -177
- runbooks/tdd/__init__.py +15 -0
- runbooks/tdd/cli.py +1071 -0
- runbooks/utils/__init__.py +14 -17
- runbooks/utils/logger.py +7 -2
- runbooks/utils/version_validator.py +50 -47
- runbooks/validation/__init__.py +6 -6
- runbooks/validation/cli.py +9 -3
- runbooks/validation/comprehensive_2way_validator.py +745 -704
- runbooks/validation/mcp_validator.py +906 -228
- runbooks/validation/terraform_citations_validator.py +104 -115
- runbooks/validation/terraform_drift_detector.py +447 -451
- runbooks/vpc/README.md +617 -0
- runbooks/vpc/__init__.py +8 -1
- runbooks/vpc/analyzer.py +577 -0
- runbooks/vpc/cleanup_wrapper.py +476 -413
- runbooks/vpc/cli_cloudtrail_commands.py +339 -0
- runbooks/vpc/cli_mcp_validation_commands.py +480 -0
- runbooks/vpc/cloudtrail_audit_integration.py +717 -0
- runbooks/vpc/config.py +92 -97
- runbooks/vpc/cost_engine.py +411 -148
- runbooks/vpc/cost_explorer_integration.py +553 -0
- runbooks/vpc/cross_account_session.py +101 -106
- runbooks/vpc/enhanced_mcp_validation.py +917 -0
- runbooks/vpc/eni_gate_validator.py +961 -0
- runbooks/vpc/heatmap_engine.py +185 -160
- runbooks/vpc/mcp_no_eni_validator.py +680 -639
- runbooks/vpc/nat_gateway_optimizer.py +358 -0
- runbooks/vpc/networking_wrapper.py +15 -8
- runbooks/vpc/pdca_remediation_planner.py +528 -0
- runbooks/vpc/performance_optimized_analyzer.py +219 -231
- runbooks/vpc/runbooks_adapter.py +1167 -241
- runbooks/vpc/tdd_red_phase_stubs.py +601 -0
- runbooks/vpc/test_data_loader.py +358 -0
- runbooks/vpc/tests/conftest.py +314 -4
- runbooks/vpc/tests/test_cleanup_framework.py +1022 -0
- runbooks/vpc/tests/test_cost_engine.py +0 -2
- runbooks/vpc/topology_generator.py +326 -0
- runbooks/vpc/unified_scenarios.py +1297 -1124
- runbooks/vpc/vpc_cleanup_integration.py +1943 -1115
- runbooks-1.1.5.dist-info/METADATA +328 -0
- {runbooks-1.1.4.dist-info → runbooks-1.1.5.dist-info}/RECORD +214 -193
- runbooks/finops/README.md +0 -414
- runbooks/finops/accuracy_cross_validator.py +0 -647
- runbooks/finops/business_cases.py +0 -950
- runbooks/finops/dashboard_router.py +0 -922
- runbooks/finops/ebs_optimizer.py +0 -973
- runbooks/finops/embedded_mcp_validator.py +0 -1629
- runbooks/finops/enhanced_dashboard_runner.py +0 -527
- runbooks/finops/finops_dashboard.py +0 -584
- runbooks/finops/finops_scenarios.py +0 -1218
- runbooks/finops/legacy_migration.py +0 -730
- runbooks/finops/multi_dashboard.py +0 -1519
- runbooks/finops/single_dashboard.py +0 -1113
- runbooks/finops/unlimited_scenarios.py +0 -393
- runbooks-1.1.4.dist-info/METADATA +0 -800
- {runbooks-1.1.4.dist-info → runbooks-1.1.5.dist-info}/WHEEL +0 -0
- {runbooks-1.1.4.dist-info → runbooks-1.1.5.dist-info}/entry_points.txt +0 -0
- {runbooks-1.1.4.dist-info → runbooks-1.1.5.dist-info}/licenses/LICENSE +0 -0
- {runbooks-1.1.4.dist-info → runbooks-1.1.5.dist-info}/top_level.txt +0 -0
@@ -50,7 +50,7 @@ from runbooks.common.rich_utils import (
|
|
50
50
|
|
51
51
|
class AlertSeverity(Enum):
|
52
52
|
"""Alert severity levels for monitoring framework."""
|
53
|
-
|
53
|
+
|
54
54
|
INFO = "INFO"
|
55
55
|
WARNING = "WARNING"
|
56
56
|
CRITICAL = "CRITICAL"
|
@@ -59,7 +59,7 @@ class AlertSeverity(Enum):
|
|
59
59
|
|
60
60
|
class OperationStatus(Enum):
|
61
61
|
"""Operation status for monitoring."""
|
62
|
-
|
62
|
+
|
63
63
|
HEALTHY = "HEALTHY"
|
64
64
|
DEGRADED = "DEGRADED"
|
65
65
|
UNHEALTHY = "UNHEALTHY"
|
@@ -69,7 +69,7 @@ class OperationStatus(Enum):
|
|
69
69
|
@dataclass
|
70
70
|
class SLATarget:
|
71
71
|
"""SLA target definition with thresholds."""
|
72
|
-
|
72
|
+
|
73
73
|
name: str
|
74
74
|
target_value: float
|
75
75
|
warning_threshold: float
|
@@ -81,7 +81,7 @@ class SLATarget:
|
|
81
81
|
@dataclass
|
82
82
|
class MonitoringMetric:
|
83
83
|
"""Individual monitoring metric result."""
|
84
|
-
|
84
|
+
|
85
85
|
metric_name: str
|
86
86
|
current_value: float
|
87
87
|
target_value: float
|
@@ -93,7 +93,7 @@ class MonitoringMetric:
|
|
93
93
|
@dataclass
|
94
94
|
class AlertEvent:
|
95
95
|
"""Alert event structure."""
|
96
|
-
|
96
|
+
|
97
97
|
alert_id: str
|
98
98
|
severity: AlertSeverity
|
99
99
|
metric_name: str
|
@@ -107,201 +107,195 @@ class AlertEvent:
|
|
107
107
|
class ProductionMonitoringFramework:
|
108
108
|
"""
|
109
109
|
Enterprise production monitoring framework for CloudOps operations.
|
110
|
-
|
110
|
+
|
111
111
|
Monitors SLA compliance, performance metrics, and operational health
|
112
112
|
across 61-account enterprise environment.
|
113
113
|
"""
|
114
|
-
|
114
|
+
|
115
115
|
def __init__(self, console_instance: Optional[Console] = None):
|
116
116
|
"""
|
117
117
|
Initialize production monitoring framework.
|
118
|
-
|
118
|
+
|
119
119
|
Args:
|
120
120
|
console_instance: Rich console for output
|
121
121
|
"""
|
122
122
|
self.console = console_instance or console
|
123
123
|
self.start_time = time.time()
|
124
|
-
|
124
|
+
|
125
125
|
# SLA targets for enterprise operations
|
126
126
|
self.sla_targets = {
|
127
|
-
|
128
|
-
name=
|
127
|
+
"availability": SLATarget(
|
128
|
+
name="availability",
|
129
129
|
target_value=99.9,
|
130
130
|
warning_threshold=99.5,
|
131
131
|
critical_threshold=99.0,
|
132
|
-
unit=
|
133
|
-
description=
|
132
|
+
unit="%",
|
133
|
+
description="System availability percentage",
|
134
134
|
),
|
135
|
-
|
136
|
-
name=
|
135
|
+
"latency_p95": SLATarget(
|
136
|
+
name="latency_p95",
|
137
137
|
target_value=30.0,
|
138
138
|
warning_threshold=45.0,
|
139
139
|
critical_threshold=60.0,
|
140
|
-
unit=
|
141
|
-
description=
|
140
|
+
unit="seconds",
|
141
|
+
description="95th percentile operation latency",
|
142
142
|
),
|
143
|
-
|
144
|
-
name=
|
143
|
+
"success_rate": SLATarget(
|
144
|
+
name="success_rate",
|
145
145
|
target_value=95.0,
|
146
146
|
warning_threshold=90.0,
|
147
147
|
critical_threshold=85.0,
|
148
|
-
unit=
|
149
|
-
description=
|
148
|
+
unit="%",
|
149
|
+
description="Operation success rate",
|
150
150
|
),
|
151
|
-
|
152
|
-
name=
|
151
|
+
"error_budget": SLATarget(
|
152
|
+
name="error_budget",
|
153
153
|
target_value=0.1,
|
154
154
|
warning_threshold=0.05,
|
155
155
|
critical_threshold=0.01,
|
156
|
-
unit=
|
157
|
-
description=
|
158
|
-
)
|
156
|
+
unit="%",
|
157
|
+
description="Monthly error budget remaining",
|
158
|
+
),
|
159
159
|
}
|
160
|
-
|
160
|
+
|
161
161
|
# Monitoring state
|
162
162
|
self.active_alerts = []
|
163
163
|
self.metrics_history = []
|
164
164
|
self.circuit_breaker_state = {}
|
165
165
|
self.monitoring_active = False
|
166
|
-
|
166
|
+
|
167
167
|
# Performance tracking
|
168
168
|
self.operation_metrics = {
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
169
|
+
"total_operations": 0,
|
170
|
+
"successful_operations": 0,
|
171
|
+
"failed_operations": 0,
|
172
|
+
"average_latency": 0.0,
|
173
|
+
"p95_latency": 0.0,
|
174
174
|
}
|
175
|
-
|
175
|
+
|
176
176
|
async def start_monitoring(self, interval_seconds: int = 60) -> None:
|
177
177
|
"""
|
178
178
|
Start continuous monitoring loop.
|
179
|
-
|
179
|
+
|
180
180
|
Args:
|
181
181
|
interval_seconds: Monitoring interval in seconds
|
182
182
|
"""
|
183
183
|
self.monitoring_active = True
|
184
|
-
|
184
|
+
|
185
185
|
print_success("🚀 Production monitoring framework started")
|
186
|
-
|
186
|
+
|
187
187
|
with Live(self._create_monitoring_dashboard(), refresh_per_second=1, console=self.console) as live:
|
188
188
|
while self.monitoring_active:
|
189
189
|
try:
|
190
190
|
# Collect current metrics
|
191
191
|
current_metrics = await self._collect_current_metrics()
|
192
|
-
|
192
|
+
|
193
193
|
# Evaluate SLA compliance
|
194
194
|
sla_violations = self._evaluate_sla_compliance(current_metrics)
|
195
|
-
|
195
|
+
|
196
196
|
# Process alerts
|
197
197
|
await self._process_alerts(sla_violations)
|
198
|
-
|
198
|
+
|
199
199
|
# Update circuit breaker states
|
200
200
|
self._update_circuit_breakers(current_metrics)
|
201
|
-
|
201
|
+
|
202
202
|
# Update dashboard
|
203
203
|
live.update(self._create_monitoring_dashboard())
|
204
|
-
|
204
|
+
|
205
205
|
# Store metrics history
|
206
|
-
self.metrics_history.append({
|
207
|
-
|
208
|
-
'metrics': current_metrics
|
209
|
-
})
|
210
|
-
|
206
|
+
self.metrics_history.append({"timestamp": datetime.now(), "metrics": current_metrics})
|
207
|
+
|
211
208
|
# Clean old history (keep 24 hours)
|
212
209
|
self._cleanup_metrics_history()
|
213
|
-
|
210
|
+
|
214
211
|
await asyncio.sleep(interval_seconds)
|
215
|
-
|
212
|
+
|
216
213
|
except Exception as e:
|
217
214
|
print_error(f"Monitoring loop error: {str(e)}")
|
218
215
|
await asyncio.sleep(5) # Short retry interval
|
219
|
-
|
216
|
+
|
220
217
|
async def stop_monitoring(self) -> None:
|
221
218
|
"""Stop the monitoring framework gracefully."""
|
222
219
|
self.monitoring_active = False
|
223
220
|
print_info("📊 Production monitoring framework stopped")
|
224
|
-
|
221
|
+
|
225
222
|
async def _collect_current_metrics(self) -> Dict[str, MonitoringMetric]:
|
226
223
|
"""
|
227
224
|
Collect current operational metrics.
|
228
|
-
|
225
|
+
|
229
226
|
Returns:
|
230
227
|
Dictionary of current metrics
|
231
228
|
"""
|
232
229
|
current_metrics = {}
|
233
|
-
|
230
|
+
|
234
231
|
# Calculate availability (based on successful operations)
|
235
|
-
total_ops = max(self.operation_metrics[
|
236
|
-
success_ops = self.operation_metrics[
|
232
|
+
total_ops = max(self.operation_metrics["total_operations"], 1)
|
233
|
+
success_ops = self.operation_metrics["successful_operations"]
|
237
234
|
availability = (success_ops / total_ops) * 100
|
238
|
-
|
239
|
-
current_metrics[
|
240
|
-
metric_name=
|
235
|
+
|
236
|
+
current_metrics["availability"] = MonitoringMetric(
|
237
|
+
metric_name="availability",
|
241
238
|
current_value=availability,
|
242
|
-
target_value=self.sla_targets[
|
243
|
-
status=self._determine_status(
|
239
|
+
target_value=self.sla_targets["availability"].target_value,
|
240
|
+
status=self._determine_status("availability", availability),
|
244
241
|
timestamp=datetime.now(),
|
245
242
|
details={
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
}
|
243
|
+
"total_operations": total_ops,
|
244
|
+
"successful_operations": success_ops,
|
245
|
+
"failed_operations": self.operation_metrics["failed_operations"],
|
246
|
+
},
|
250
247
|
)
|
251
|
-
|
248
|
+
|
252
249
|
# P95 latency monitoring
|
253
|
-
p95_latency = self.operation_metrics[
|
254
|
-
current_metrics[
|
255
|
-
metric_name=
|
250
|
+
p95_latency = self.operation_metrics["p95_latency"]
|
251
|
+
current_metrics["latency_p95"] = MonitoringMetric(
|
252
|
+
metric_name="latency_p95",
|
256
253
|
current_value=p95_latency,
|
257
|
-
target_value=self.sla_targets[
|
258
|
-
status=self._determine_status(
|
254
|
+
target_value=self.sla_targets["latency_p95"].target_value,
|
255
|
+
status=self._determine_status("latency_p95", p95_latency),
|
259
256
|
timestamp=datetime.now(),
|
260
|
-
details={
|
261
|
-
'average_latency': self.operation_metrics['average_latency'],
|
262
|
-
'p95_latency': p95_latency
|
263
|
-
}
|
257
|
+
details={"average_latency": self.operation_metrics["average_latency"], "p95_latency": p95_latency},
|
264
258
|
)
|
265
|
-
|
259
|
+
|
266
260
|
# Success rate monitoring
|
267
261
|
success_rate = (success_ops / total_ops) * 100
|
268
|
-
current_metrics[
|
269
|
-
metric_name=
|
262
|
+
current_metrics["success_rate"] = MonitoringMetric(
|
263
|
+
metric_name="success_rate",
|
270
264
|
current_value=success_rate,
|
271
|
-
target_value=self.sla_targets[
|
272
|
-
status=self._determine_status(
|
265
|
+
target_value=self.sla_targets["success_rate"].target_value,
|
266
|
+
status=self._determine_status("success_rate", success_rate),
|
273
267
|
timestamp=datetime.now(),
|
274
|
-
details={
|
268
|
+
details={"success_percentage": success_rate},
|
275
269
|
)
|
276
|
-
|
270
|
+
|
277
271
|
# Error budget monitoring (simplified calculation)
|
278
|
-
error_budget = max(0.0, 1.0 - (self.operation_metrics[
|
279
|
-
current_metrics[
|
280
|
-
metric_name=
|
272
|
+
error_budget = max(0.0, 1.0 - (self.operation_metrics["failed_operations"] / total_ops)) * 100
|
273
|
+
current_metrics["error_budget"] = MonitoringMetric(
|
274
|
+
metric_name="error_budget",
|
281
275
|
current_value=error_budget,
|
282
|
-
target_value=self.sla_targets[
|
283
|
-
status=self._determine_status(
|
276
|
+
target_value=self.sla_targets["error_budget"].target_value,
|
277
|
+
status=self._determine_status("error_budget", error_budget),
|
284
278
|
timestamp=datetime.now(),
|
285
|
-
details={
|
279
|
+
details={"error_budget_remaining": error_budget},
|
286
280
|
)
|
287
|
-
|
281
|
+
|
288
282
|
return current_metrics
|
289
|
-
|
283
|
+
|
290
284
|
def _determine_status(self, metric_name: str, current_value: float) -> OperationStatus:
|
291
285
|
"""
|
292
286
|
Determine operation status based on current value and thresholds.
|
293
|
-
|
287
|
+
|
294
288
|
Args:
|
295
289
|
metric_name: Name of the metric
|
296
290
|
current_value: Current metric value
|
297
|
-
|
291
|
+
|
298
292
|
Returns:
|
299
293
|
OperationStatus enum value
|
300
294
|
"""
|
301
295
|
sla = self.sla_targets[metric_name]
|
302
|
-
|
296
|
+
|
303
297
|
# For latency, higher is worse
|
304
|
-
if metric_name ==
|
298
|
+
if metric_name == "latency_p95":
|
305
299
|
if current_value <= sla.target_value:
|
306
300
|
return OperationStatus.HEALTHY
|
307
301
|
elif current_value <= sla.warning_threshold:
|
@@ -310,7 +304,7 @@ class ProductionMonitoringFramework:
|
|
310
304
|
return OperationStatus.UNHEALTHY
|
311
305
|
else:
|
312
306
|
return OperationStatus.CRITICAL
|
313
|
-
|
307
|
+
|
314
308
|
# For other metrics, lower is worse
|
315
309
|
else:
|
316
310
|
if current_value >= sla.target_value:
|
@@ -321,29 +315,29 @@ class ProductionMonitoringFramework:
|
|
321
315
|
return OperationStatus.UNHEALTHY
|
322
316
|
else:
|
323
317
|
return OperationStatus.CRITICAL
|
324
|
-
|
318
|
+
|
325
319
|
def _evaluate_sla_compliance(self, current_metrics: Dict[str, MonitoringMetric]) -> List[MonitoringMetric]:
|
326
320
|
"""
|
327
321
|
Evaluate SLA compliance and identify violations.
|
328
|
-
|
322
|
+
|
329
323
|
Args:
|
330
324
|
current_metrics: Current metric values
|
331
|
-
|
325
|
+
|
332
326
|
Returns:
|
333
327
|
List of metrics that violate SLA thresholds
|
334
328
|
"""
|
335
329
|
violations = []
|
336
|
-
|
330
|
+
|
337
331
|
for metric in current_metrics.values():
|
338
332
|
if metric.status in [OperationStatus.UNHEALTHY, OperationStatus.CRITICAL]:
|
339
333
|
violations.append(metric)
|
340
|
-
|
334
|
+
|
341
335
|
return violations
|
342
|
-
|
336
|
+
|
343
337
|
async def _process_alerts(self, violations: List[MonitoringMetric]) -> None:
|
344
338
|
"""
|
345
339
|
Process SLA violations and generate alerts.
|
346
|
-
|
340
|
+
|
347
341
|
Args:
|
348
342
|
violations: List of metric violations
|
349
343
|
"""
|
@@ -351,23 +345,25 @@ class ProductionMonitoringFramework:
|
|
351
345
|
# Create alert event
|
352
346
|
alert = AlertEvent(
|
353
347
|
alert_id=f"SLA-{violation.metric_name}-{int(time.time())}",
|
354
|
-
severity=AlertSeverity.CRITICAL
|
348
|
+
severity=AlertSeverity.CRITICAL
|
349
|
+
if violation.status == OperationStatus.CRITICAL
|
350
|
+
else AlertSeverity.WARNING,
|
355
351
|
metric_name=violation.metric_name,
|
356
352
|
current_value=violation.current_value,
|
357
353
|
threshold_value=self.sla_targets[violation.metric_name].critical_threshold,
|
358
354
|
message=f"SLA violation detected for {violation.metric_name}: {violation.current_value:.2f}{self.sla_targets[violation.metric_name].unit}",
|
359
|
-
timestamp=datetime.now()
|
355
|
+
timestamp=datetime.now(),
|
360
356
|
)
|
361
|
-
|
357
|
+
|
362
358
|
# Add to active alerts if not already present
|
363
359
|
if not any(a.metric_name == alert.metric_name and not a.resolved for a in self.active_alerts):
|
364
360
|
self.active_alerts.append(alert)
|
365
361
|
await self._send_alert(alert)
|
366
|
-
|
362
|
+
|
367
363
|
async def _send_alert(self, alert: AlertEvent) -> None:
|
368
364
|
"""
|
369
365
|
Send alert notification (placeholder for integration with alerting systems).
|
370
|
-
|
366
|
+
|
371
367
|
Args:
|
372
368
|
alert: Alert event to send
|
373
369
|
"""
|
@@ -376,32 +372,32 @@ class ProductionMonitoringFramework:
|
|
376
372
|
# - PagerDuty/OpsGenie
|
377
373
|
# - Email notifications
|
378
374
|
# - ServiceNow incidents
|
379
|
-
|
375
|
+
|
380
376
|
if alert.severity == AlertSeverity.CRITICAL:
|
381
377
|
print_error(f"🚨 CRITICAL ALERT: {alert.message}")
|
382
378
|
else:
|
383
379
|
print_warning(f"⚠️ WARNING ALERT: {alert.message}")
|
384
|
-
|
380
|
+
|
385
381
|
def _update_circuit_breakers(self, current_metrics: Dict[str, MonitoringMetric]) -> None:
|
386
382
|
"""
|
387
383
|
Update circuit breaker states based on current metrics.
|
388
|
-
|
384
|
+
|
389
385
|
Args:
|
390
386
|
current_metrics: Current metric values
|
391
387
|
"""
|
392
388
|
for metric_name, metric in current_metrics.items():
|
393
389
|
if metric.status == OperationStatus.CRITICAL:
|
394
|
-
self.circuit_breaker_state[metric_name] =
|
390
|
+
self.circuit_breaker_state[metric_name] = "OPEN"
|
395
391
|
elif metric.status == OperationStatus.HEALTHY:
|
396
|
-
self.circuit_breaker_state[metric_name] =
|
392
|
+
self.circuit_breaker_state[metric_name] = "CLOSED"
|
397
393
|
else:
|
398
394
|
# Keep current state for degraded/unhealthy
|
399
395
|
pass
|
400
|
-
|
396
|
+
|
401
397
|
def _create_monitoring_dashboard(self) -> Panel:
|
402
398
|
"""
|
403
399
|
Create Rich dashboard for monitoring display.
|
404
|
-
|
400
|
+
|
405
401
|
Returns:
|
406
402
|
Rich Panel with monitoring dashboard
|
407
403
|
"""
|
@@ -411,136 +407,130 @@ class ProductionMonitoringFramework:
|
|
411
407
|
metrics_table.add_column("Current", style="yellow")
|
412
408
|
metrics_table.add_column("Target", style="green")
|
413
409
|
metrics_table.add_column("Status", style="blue")
|
414
|
-
|
410
|
+
|
415
411
|
for sla_name, sla in self.sla_targets.items():
|
416
412
|
# Get current value from operation metrics
|
417
|
-
if sla_name ==
|
418
|
-
total = max(self.operation_metrics[
|
419
|
-
current = (self.operation_metrics[
|
420
|
-
elif sla_name ==
|
421
|
-
current = self.operation_metrics[
|
422
|
-
elif sla_name ==
|
423
|
-
total = max(self.operation_metrics[
|
424
|
-
current = (self.operation_metrics[
|
413
|
+
if sla_name == "availability":
|
414
|
+
total = max(self.operation_metrics["total_operations"], 1)
|
415
|
+
current = (self.operation_metrics["successful_operations"] / total) * 100
|
416
|
+
elif sla_name == "latency_p95":
|
417
|
+
current = self.operation_metrics["p95_latency"]
|
418
|
+
elif sla_name == "success_rate":
|
419
|
+
total = max(self.operation_metrics["total_operations"], 1)
|
420
|
+
current = (self.operation_metrics["successful_operations"] / total) * 100
|
425
421
|
else: # error_budget
|
426
422
|
current = 0.1 # Placeholder calculation
|
427
|
-
|
423
|
+
|
428
424
|
status = self._determine_status(sla_name, current)
|
429
425
|
status_color = {
|
430
426
|
OperationStatus.HEALTHY: "[green]HEALTHY[/green]",
|
431
427
|
OperationStatus.DEGRADED: "[yellow]DEGRADED[/yellow]",
|
432
428
|
OperationStatus.UNHEALTHY: "[red]UNHEALTHY[/red]",
|
433
|
-
OperationStatus.CRITICAL: "[red bold]CRITICAL[/red bold]"
|
429
|
+
OperationStatus.CRITICAL: "[red bold]CRITICAL[/red bold]",
|
434
430
|
}[status]
|
435
|
-
|
431
|
+
|
436
432
|
metrics_table.add_row(
|
437
|
-
sla.description,
|
438
|
-
f"{current:.2f}{sla.unit}",
|
439
|
-
f"{sla.target_value:.2f}{sla.unit}",
|
440
|
-
status_color
|
433
|
+
sla.description, f"{current:.2f}{sla.unit}", f"{sla.target_value:.2f}{sla.unit}", status_color
|
441
434
|
)
|
442
|
-
|
435
|
+
|
443
436
|
# Active alerts table
|
444
437
|
alerts_table = Table(title="🚨 Active Alerts")
|
445
438
|
alerts_table.add_column("Severity", style="red")
|
446
439
|
alerts_table.add_column("Metric", style="cyan")
|
447
440
|
alerts_table.add_column("Message", style="yellow")
|
448
441
|
alerts_table.add_column("Time", style="blue")
|
449
|
-
|
442
|
+
|
450
443
|
active_alerts = [a for a in self.active_alerts if not a.resolved][-5:] # Show last 5
|
451
444
|
for alert in active_alerts:
|
452
445
|
alerts_table.add_row(
|
453
446
|
alert.severity.value,
|
454
447
|
alert.metric_name,
|
455
448
|
alert.message[:50] + "..." if len(alert.message) > 50 else alert.message,
|
456
|
-
alert.timestamp.strftime("%H:%M:%S")
|
449
|
+
alert.timestamp.strftime("%H:%M:%S"),
|
457
450
|
)
|
458
|
-
|
451
|
+
|
459
452
|
if not active_alerts:
|
460
453
|
alerts_table.add_row("None", "All systems operational", "No active alerts", "")
|
461
|
-
|
454
|
+
|
462
455
|
# Create dashboard layout
|
463
456
|
dashboard_content = f"""
|
464
457
|
[bold blue]CloudOps Production Monitoring Dashboard[/bold blue]
|
465
458
|
|
466
|
-
📊 Operations: {self.operation_metrics[
|
467
|
-
✅ Success: {self.operation_metrics[
|
468
|
-
❌ Failed: {self.operation_metrics[
|
469
|
-
⏱️ Avg Latency: {self.operation_metrics[
|
459
|
+
📊 Operations: {self.operation_metrics["total_operations"]} total
|
460
|
+
✅ Success: {self.operation_metrics["successful_operations"]}
|
461
|
+
❌ Failed: {self.operation_metrics["failed_operations"]}
|
462
|
+
⏱️ Avg Latency: {self.operation_metrics["average_latency"]:.2f}s
|
470
463
|
|
471
464
|
{metrics_table}
|
472
465
|
|
473
466
|
{alerts_table}
|
474
467
|
|
475
|
-
🔧 Circuit Breakers: {len([k for k, v in self.circuit_breaker_state.items() if v ==
|
468
|
+
🔧 Circuit Breakers: {len([k for k, v in self.circuit_breaker_state.items() if v == "OPEN"])} OPEN
|
476
469
|
⚡ Uptime: {time.time() - self.start_time:.0f}s
|
477
470
|
"""
|
478
|
-
|
471
|
+
|
479
472
|
return create_panel(dashboard_content, title="Enterprise SRE Monitoring")
|
480
|
-
|
473
|
+
|
481
474
|
def _cleanup_metrics_history(self) -> None:
|
482
475
|
"""Clean up old metrics history to prevent memory leaks."""
|
483
476
|
cutoff_time = datetime.now() - timedelta(hours=24)
|
484
|
-
self.metrics_history = [
|
485
|
-
|
486
|
-
if entry['timestamp'] > cutoff_time
|
487
|
-
]
|
488
|
-
|
477
|
+
self.metrics_history = [entry for entry in self.metrics_history if entry["timestamp"] > cutoff_time]
|
478
|
+
|
489
479
|
# Public interface for recording operations
|
490
480
|
def record_operation_start(self, operation_name: str) -> str:
|
491
481
|
"""
|
492
482
|
Record the start of an operation for monitoring.
|
493
|
-
|
483
|
+
|
494
484
|
Args:
|
495
485
|
operation_name: Name of the operation
|
496
|
-
|
486
|
+
|
497
487
|
Returns:
|
498
488
|
Operation tracking ID
|
499
489
|
"""
|
500
490
|
operation_id = f"{operation_name}-{int(time.time())}"
|
501
|
-
self.operation_metrics[
|
491
|
+
self.operation_metrics["total_operations"] += 1
|
502
492
|
return operation_id
|
503
|
-
|
493
|
+
|
504
494
|
def record_operation_success(self, operation_id: str, latency: float) -> None:
|
505
495
|
"""
|
506
496
|
Record successful operation completion.
|
507
|
-
|
497
|
+
|
508
498
|
Args:
|
509
499
|
operation_id: Operation tracking ID
|
510
500
|
latency: Operation latency in seconds
|
511
501
|
"""
|
512
|
-
self.operation_metrics[
|
513
|
-
|
502
|
+
self.operation_metrics["successful_operations"] += 1
|
503
|
+
|
514
504
|
# Update latency metrics (simplified calculation)
|
515
|
-
total_ops = self.operation_metrics[
|
516
|
-
current_avg = self.operation_metrics[
|
505
|
+
total_ops = self.operation_metrics["total_operations"]
|
506
|
+
current_avg = self.operation_metrics["average_latency"]
|
517
507
|
new_avg = ((current_avg * (total_ops - 1)) + latency) / total_ops
|
518
|
-
self.operation_metrics[
|
519
|
-
|
508
|
+
self.operation_metrics["average_latency"] = new_avg
|
509
|
+
|
520
510
|
# Simplified P95 calculation (use 95% of max latency seen)
|
521
|
-
self.operation_metrics[
|
522
|
-
|
511
|
+
self.operation_metrics["p95_latency"] = max(self.operation_metrics["p95_latency"], latency * 0.95)
|
512
|
+
|
523
513
|
def record_operation_failure(self, operation_id: str, error: str) -> None:
|
524
514
|
"""
|
525
515
|
Record failed operation.
|
526
|
-
|
516
|
+
|
527
517
|
Args:
|
528
518
|
operation_id: Operation tracking ID
|
529
519
|
error: Error message
|
530
520
|
"""
|
531
|
-
self.operation_metrics[
|
532
|
-
|
521
|
+
self.operation_metrics["failed_operations"] += 1
|
522
|
+
|
533
523
|
def is_circuit_breaker_open(self, metric_name: str) -> bool:
|
534
524
|
"""
|
535
525
|
Check if circuit breaker is open for a specific metric.
|
536
|
-
|
526
|
+
|
537
527
|
Args:
|
538
528
|
metric_name: Name of the metric to check
|
539
|
-
|
529
|
+
|
540
530
|
Returns:
|
541
531
|
True if circuit breaker is open
|
542
532
|
"""
|
543
|
-
return self.circuit_breaker_state.get(metric_name) ==
|
533
|
+
return self.circuit_breaker_state.get(metric_name) == "OPEN"
|
544
534
|
|
545
535
|
|
546
536
|
# Export public interface
|
@@ -557,28 +547,28 @@ __all__ = [
|
|
557
547
|
# CLI interface for running monitoring
|
558
548
|
if __name__ == "__main__":
|
559
549
|
import argparse
|
560
|
-
|
550
|
+
|
561
551
|
parser = argparse.ArgumentParser(description="CloudOps Production Monitoring Framework")
|
562
552
|
parser.add_argument("--interval", type=int, default=60, help="Monitoring interval in seconds")
|
563
553
|
parser.add_argument("--demo", action="store_true", help="Run in demo mode with simulated metrics")
|
564
|
-
|
554
|
+
|
565
555
|
args = parser.parse_args()
|
566
|
-
|
556
|
+
|
567
557
|
async def main():
|
568
558
|
monitoring = ProductionMonitoringFramework()
|
569
|
-
|
559
|
+
|
570
560
|
if args.demo:
|
571
561
|
# Simulate some operations for demo
|
572
|
-
monitoring.operation_metrics[
|
573
|
-
monitoring.operation_metrics[
|
574
|
-
monitoring.operation_metrics[
|
575
|
-
monitoring.operation_metrics[
|
576
|
-
monitoring.operation_metrics[
|
577
|
-
|
562
|
+
monitoring.operation_metrics["total_operations"] = 1000
|
563
|
+
monitoring.operation_metrics["successful_operations"] = 950
|
564
|
+
monitoring.operation_metrics["failed_operations"] = 50
|
565
|
+
monitoring.operation_metrics["average_latency"] = 15.5
|
566
|
+
monitoring.operation_metrics["p95_latency"] = 28.2
|
567
|
+
|
578
568
|
await monitoring.start_monitoring(args.interval)
|
579
|
-
|
569
|
+
|
580
570
|
# Run the monitoring framework
|
581
571
|
try:
|
582
572
|
asyncio.run(main())
|
583
573
|
except KeyboardInterrupt:
|
584
|
-
console.print("\n[yellow]Monitoring framework stopped by user[/yellow]")
|
574
|
+
console.print("\n[yellow]Monitoring framework stopped by user[/yellow]")
|