runbooks 1.1.3__py3-none-any.whl → 1.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. runbooks/__init__.py +31 -2
  2. runbooks/__init___optimized.py +18 -4
  3. runbooks/_platform/__init__.py +1 -5
  4. runbooks/_platform/core/runbooks_wrapper.py +141 -138
  5. runbooks/aws2/accuracy_validator.py +812 -0
  6. runbooks/base.py +7 -0
  7. runbooks/cfat/WEIGHT_CONFIG_README.md +1 -1
  8. runbooks/cfat/assessment/compliance.py +8 -8
  9. runbooks/cfat/assessment/runner.py +1 -0
  10. runbooks/cfat/cloud_foundations_assessment.py +227 -239
  11. runbooks/cfat/models.py +6 -2
  12. runbooks/cfat/tests/__init__.py +6 -1
  13. runbooks/cli/__init__.py +13 -0
  14. runbooks/cli/commands/cfat.py +274 -0
  15. runbooks/cli/commands/finops.py +1164 -0
  16. runbooks/cli/commands/inventory.py +379 -0
  17. runbooks/cli/commands/operate.py +239 -0
  18. runbooks/cli/commands/security.py +248 -0
  19. runbooks/cli/commands/validation.py +825 -0
  20. runbooks/cli/commands/vpc.py +310 -0
  21. runbooks/cli/registry.py +107 -0
  22. runbooks/cloudops/__init__.py +23 -30
  23. runbooks/cloudops/base.py +96 -107
  24. runbooks/cloudops/cost_optimizer.py +549 -547
  25. runbooks/cloudops/infrastructure_optimizer.py +5 -4
  26. runbooks/cloudops/interfaces.py +226 -227
  27. runbooks/cloudops/lifecycle_manager.py +5 -4
  28. runbooks/cloudops/mcp_cost_validation.py +252 -235
  29. runbooks/cloudops/models.py +78 -53
  30. runbooks/cloudops/monitoring_automation.py +5 -4
  31. runbooks/cloudops/notebook_framework.py +179 -215
  32. runbooks/cloudops/security_enforcer.py +125 -159
  33. runbooks/common/accuracy_validator.py +11 -0
  34. runbooks/common/aws_pricing.py +349 -326
  35. runbooks/common/aws_pricing_api.py +211 -212
  36. runbooks/common/aws_profile_manager.py +341 -0
  37. runbooks/common/aws_utils.py +75 -80
  38. runbooks/common/business_logic.py +127 -105
  39. runbooks/common/cli_decorators.py +36 -60
  40. runbooks/common/comprehensive_cost_explorer_integration.py +456 -464
  41. runbooks/common/cross_account_manager.py +198 -205
  42. runbooks/common/date_utils.py +27 -39
  43. runbooks/common/decorators.py +235 -0
  44. runbooks/common/dry_run_examples.py +173 -208
  45. runbooks/common/dry_run_framework.py +157 -155
  46. runbooks/common/enhanced_exception_handler.py +15 -4
  47. runbooks/common/enhanced_logging_example.py +50 -64
  48. runbooks/common/enhanced_logging_integration_example.py +65 -37
  49. runbooks/common/env_utils.py +16 -16
  50. runbooks/common/error_handling.py +40 -38
  51. runbooks/common/lazy_loader.py +41 -23
  52. runbooks/common/logging_integration_helper.py +79 -86
  53. runbooks/common/mcp_cost_explorer_integration.py +478 -495
  54. runbooks/common/mcp_integration.py +63 -74
  55. runbooks/common/memory_optimization.py +140 -118
  56. runbooks/common/module_cli_base.py +37 -58
  57. runbooks/common/organizations_client.py +176 -194
  58. runbooks/common/patterns.py +204 -0
  59. runbooks/common/performance_monitoring.py +67 -71
  60. runbooks/common/performance_optimization_engine.py +283 -274
  61. runbooks/common/profile_utils.py +248 -39
  62. runbooks/common/rich_utils.py +643 -92
  63. runbooks/common/sre_performance_suite.py +177 -186
  64. runbooks/enterprise/__init__.py +1 -1
  65. runbooks/enterprise/logging.py +144 -106
  66. runbooks/enterprise/security.py +187 -204
  67. runbooks/enterprise/validation.py +43 -56
  68. runbooks/finops/__init__.py +29 -33
  69. runbooks/finops/account_resolver.py +1 -1
  70. runbooks/finops/advanced_optimization_engine.py +980 -0
  71. runbooks/finops/automation_core.py +268 -231
  72. runbooks/finops/business_case_config.py +184 -179
  73. runbooks/finops/cli.py +660 -139
  74. runbooks/finops/commvault_ec2_analysis.py +157 -164
  75. runbooks/finops/compute_cost_optimizer.py +336 -320
  76. runbooks/finops/config.py +20 -20
  77. runbooks/finops/cost_optimizer.py +488 -622
  78. runbooks/finops/cost_processor.py +332 -214
  79. runbooks/finops/dashboard_runner.py +1006 -172
  80. runbooks/finops/ebs_cost_optimizer.py +991 -657
  81. runbooks/finops/elastic_ip_optimizer.py +317 -257
  82. runbooks/finops/enhanced_mcp_integration.py +340 -0
  83. runbooks/finops/enhanced_progress.py +40 -37
  84. runbooks/finops/enhanced_trend_visualization.py +3 -2
  85. runbooks/finops/enterprise_wrappers.py +230 -292
  86. runbooks/finops/executive_export.py +203 -160
  87. runbooks/finops/helpers.py +130 -288
  88. runbooks/finops/iam_guidance.py +1 -1
  89. runbooks/finops/infrastructure/__init__.py +80 -0
  90. runbooks/finops/infrastructure/commands.py +506 -0
  91. runbooks/finops/infrastructure/load_balancer_optimizer.py +866 -0
  92. runbooks/finops/infrastructure/vpc_endpoint_optimizer.py +832 -0
  93. runbooks/finops/markdown_exporter.py +338 -175
  94. runbooks/finops/mcp_validator.py +1952 -0
  95. runbooks/finops/nat_gateway_optimizer.py +1513 -482
  96. runbooks/finops/network_cost_optimizer.py +657 -587
  97. runbooks/finops/notebook_utils.py +226 -188
  98. runbooks/finops/optimization_engine.py +1136 -0
  99. runbooks/finops/optimizer.py +25 -29
  100. runbooks/finops/rds_snapshot_optimizer.py +367 -411
  101. runbooks/finops/reservation_optimizer.py +427 -363
  102. runbooks/finops/scenario_cli_integration.py +77 -78
  103. runbooks/finops/scenarios.py +1278 -439
  104. runbooks/finops/schemas.py +218 -182
  105. runbooks/finops/snapshot_manager.py +2289 -0
  106. runbooks/finops/tests/test_finops_dashboard.py +3 -3
  107. runbooks/finops/tests/test_reference_images_validation.py +2 -2
  108. runbooks/finops/tests/test_single_account_features.py +17 -17
  109. runbooks/finops/tests/validate_test_suite.py +1 -1
  110. runbooks/finops/types.py +3 -3
  111. runbooks/finops/validation_framework.py +263 -269
  112. runbooks/finops/vpc_cleanup_exporter.py +191 -146
  113. runbooks/finops/vpc_cleanup_optimizer.py +593 -575
  114. runbooks/finops/workspaces_analyzer.py +171 -182
  115. runbooks/hitl/enhanced_workflow_engine.py +1 -1
  116. runbooks/integration/__init__.py +89 -0
  117. runbooks/integration/mcp_integration.py +1920 -0
  118. runbooks/inventory/CLAUDE.md +816 -0
  119. runbooks/inventory/README.md +3 -3
  120. runbooks/inventory/Tests/common_test_data.py +30 -30
  121. runbooks/inventory/__init__.py +2 -2
  122. runbooks/inventory/cloud_foundations_integration.py +144 -149
  123. runbooks/inventory/collectors/aws_comprehensive.py +28 -11
  124. runbooks/inventory/collectors/aws_networking.py +111 -101
  125. runbooks/inventory/collectors/base.py +4 -0
  126. runbooks/inventory/core/collector.py +495 -313
  127. runbooks/inventory/discovery.md +2 -2
  128. runbooks/inventory/drift_detection_cli.py +69 -96
  129. runbooks/inventory/find_ec2_security_groups.py +1 -1
  130. runbooks/inventory/inventory_mcp_cli.py +48 -46
  131. runbooks/inventory/list_rds_snapshots_aggregator.py +192 -208
  132. runbooks/inventory/mcp_inventory_validator.py +549 -465
  133. runbooks/inventory/mcp_vpc_validator.py +359 -442
  134. runbooks/inventory/organizations_discovery.py +56 -52
  135. runbooks/inventory/rich_inventory_display.py +33 -32
  136. runbooks/inventory/unified_validation_engine.py +278 -251
  137. runbooks/inventory/vpc_analyzer.py +733 -696
  138. runbooks/inventory/vpc_architecture_validator.py +293 -348
  139. runbooks/inventory/vpc_dependency_analyzer.py +382 -378
  140. runbooks/inventory/vpc_flow_analyzer.py +3 -3
  141. runbooks/main.py +152 -9147
  142. runbooks/main_final.py +91 -60
  143. runbooks/main_minimal.py +22 -10
  144. runbooks/main_optimized.py +131 -100
  145. runbooks/main_ultra_minimal.py +7 -2
  146. runbooks/mcp/__init__.py +36 -0
  147. runbooks/mcp/integration.py +679 -0
  148. runbooks/metrics/dora_metrics_engine.py +2 -2
  149. runbooks/monitoring/performance_monitor.py +9 -4
  150. runbooks/operate/dynamodb_operations.py +3 -1
  151. runbooks/operate/ec2_operations.py +145 -137
  152. runbooks/operate/iam_operations.py +146 -152
  153. runbooks/operate/mcp_integration.py +1 -1
  154. runbooks/operate/networking_cost_heatmap.py +33 -10
  155. runbooks/operate/privatelink_operations.py +1 -1
  156. runbooks/operate/rds_operations.py +223 -254
  157. runbooks/operate/s3_operations.py +107 -118
  158. runbooks/operate/vpc_endpoints.py +1 -1
  159. runbooks/operate/vpc_operations.py +648 -618
  160. runbooks/remediation/base.py +1 -1
  161. runbooks/remediation/commons.py +10 -7
  162. runbooks/remediation/commvault_ec2_analysis.py +71 -67
  163. runbooks/remediation/ec2_unattached_ebs_volumes.py +1 -0
  164. runbooks/remediation/multi_account.py +24 -21
  165. runbooks/remediation/rds_snapshot_list.py +91 -65
  166. runbooks/remediation/remediation_cli.py +92 -146
  167. runbooks/remediation/universal_account_discovery.py +83 -79
  168. runbooks/remediation/workspaces_list.py +49 -44
  169. runbooks/security/__init__.py +19 -0
  170. runbooks/security/assessment_runner.py +1150 -0
  171. runbooks/security/baseline_checker.py +812 -0
  172. runbooks/security/cloudops_automation_security_validator.py +509 -535
  173. runbooks/security/compliance_automation_engine.py +17 -17
  174. runbooks/security/config/__init__.py +2 -2
  175. runbooks/security/config/compliance_config.py +50 -50
  176. runbooks/security/config_template_generator.py +63 -76
  177. runbooks/security/enterprise_security_framework.py +1 -1
  178. runbooks/security/executive_security_dashboard.py +519 -508
  179. runbooks/security/integration_test_enterprise_security.py +5 -3
  180. runbooks/security/multi_account_security_controls.py +959 -1210
  181. runbooks/security/real_time_security_monitor.py +422 -444
  182. runbooks/security/run_script.py +1 -1
  183. runbooks/security/security_baseline_tester.py +1 -1
  184. runbooks/security/security_cli.py +143 -112
  185. runbooks/security/test_2way_validation.py +439 -0
  186. runbooks/security/two_way_validation_framework.py +852 -0
  187. runbooks/sre/mcp_reliability_engine.py +6 -6
  188. runbooks/sre/production_monitoring_framework.py +167 -177
  189. runbooks/tdd/__init__.py +15 -0
  190. runbooks/tdd/cli.py +1071 -0
  191. runbooks/utils/__init__.py +14 -17
  192. runbooks/utils/logger.py +7 -2
  193. runbooks/utils/version_validator.py +51 -48
  194. runbooks/validation/__init__.py +6 -6
  195. runbooks/validation/cli.py +9 -3
  196. runbooks/validation/comprehensive_2way_validator.py +754 -708
  197. runbooks/validation/mcp_validator.py +906 -228
  198. runbooks/validation/terraform_citations_validator.py +104 -115
  199. runbooks/validation/terraform_drift_detector.py +447 -451
  200. runbooks/vpc/README.md +617 -0
  201. runbooks/vpc/__init__.py +8 -1
  202. runbooks/vpc/analyzer.py +577 -0
  203. runbooks/vpc/cleanup_wrapper.py +476 -413
  204. runbooks/vpc/cli_cloudtrail_commands.py +339 -0
  205. runbooks/vpc/cli_mcp_validation_commands.py +480 -0
  206. runbooks/vpc/cloudtrail_audit_integration.py +717 -0
  207. runbooks/vpc/config.py +92 -97
  208. runbooks/vpc/cost_engine.py +411 -148
  209. runbooks/vpc/cost_explorer_integration.py +553 -0
  210. runbooks/vpc/cross_account_session.py +101 -106
  211. runbooks/vpc/enhanced_mcp_validation.py +917 -0
  212. runbooks/vpc/eni_gate_validator.py +961 -0
  213. runbooks/vpc/heatmap_engine.py +190 -162
  214. runbooks/vpc/mcp_no_eni_validator.py +681 -640
  215. runbooks/vpc/nat_gateway_optimizer.py +358 -0
  216. runbooks/vpc/networking_wrapper.py +15 -8
  217. runbooks/vpc/pdca_remediation_planner.py +528 -0
  218. runbooks/vpc/performance_optimized_analyzer.py +219 -231
  219. runbooks/vpc/runbooks_adapter.py +1167 -241
  220. runbooks/vpc/tdd_red_phase_stubs.py +601 -0
  221. runbooks/vpc/test_data_loader.py +358 -0
  222. runbooks/vpc/tests/conftest.py +314 -4
  223. runbooks/vpc/tests/test_cleanup_framework.py +1022 -0
  224. runbooks/vpc/tests/test_cost_engine.py +0 -2
  225. runbooks/vpc/topology_generator.py +326 -0
  226. runbooks/vpc/unified_scenarios.py +1302 -1129
  227. runbooks/vpc/vpc_cleanup_integration.py +1943 -1115
  228. runbooks-1.1.5.dist-info/METADATA +328 -0
  229. {runbooks-1.1.3.dist-info → runbooks-1.1.5.dist-info}/RECORD +233 -200
  230. runbooks/finops/README.md +0 -414
  231. runbooks/finops/accuracy_cross_validator.py +0 -647
  232. runbooks/finops/business_cases.py +0 -950
  233. runbooks/finops/dashboard_router.py +0 -922
  234. runbooks/finops/ebs_optimizer.py +0 -956
  235. runbooks/finops/embedded_mcp_validator.py +0 -1629
  236. runbooks/finops/enhanced_dashboard_runner.py +0 -527
  237. runbooks/finops/finops_dashboard.py +0 -584
  238. runbooks/finops/finops_scenarios.py +0 -1218
  239. runbooks/finops/legacy_migration.py +0 -730
  240. runbooks/finops/multi_dashboard.py +0 -1519
  241. runbooks/finops/single_dashboard.py +0 -1113
  242. runbooks/finops/unlimited_scenarios.py +0 -393
  243. runbooks-1.1.3.dist-info/METADATA +0 -799
  244. {runbooks-1.1.3.dist-info → runbooks-1.1.5.dist-info}/WHEEL +0 -0
  245. {runbooks-1.1.3.dist-info → runbooks-1.1.5.dist-info}/entry_points.txt +0 -0
  246. {runbooks-1.1.3.dist-info → runbooks-1.1.5.dist-info}/licenses/LICENSE +0 -0
  247. {runbooks-1.1.3.dist-info → runbooks-1.1.5.dist-info}/top_level.txt +0 -0
@@ -50,7 +50,7 @@ from runbooks.common.rich_utils import (
50
50
 
51
51
  class AlertSeverity(Enum):
52
52
  """Alert severity levels for monitoring framework."""
53
-
53
+
54
54
  INFO = "INFO"
55
55
  WARNING = "WARNING"
56
56
  CRITICAL = "CRITICAL"
@@ -59,7 +59,7 @@ class AlertSeverity(Enum):
59
59
 
60
60
  class OperationStatus(Enum):
61
61
  """Operation status for monitoring."""
62
-
62
+
63
63
  HEALTHY = "HEALTHY"
64
64
  DEGRADED = "DEGRADED"
65
65
  UNHEALTHY = "UNHEALTHY"
@@ -69,7 +69,7 @@ class OperationStatus(Enum):
69
69
  @dataclass
70
70
  class SLATarget:
71
71
  """SLA target definition with thresholds."""
72
-
72
+
73
73
  name: str
74
74
  target_value: float
75
75
  warning_threshold: float
@@ -81,7 +81,7 @@ class SLATarget:
81
81
  @dataclass
82
82
  class MonitoringMetric:
83
83
  """Individual monitoring metric result."""
84
-
84
+
85
85
  metric_name: str
86
86
  current_value: float
87
87
  target_value: float
@@ -93,7 +93,7 @@ class MonitoringMetric:
93
93
  @dataclass
94
94
  class AlertEvent:
95
95
  """Alert event structure."""
96
-
96
+
97
97
  alert_id: str
98
98
  severity: AlertSeverity
99
99
  metric_name: str
@@ -107,201 +107,195 @@ class AlertEvent:
107
107
  class ProductionMonitoringFramework:
108
108
  """
109
109
  Enterprise production monitoring framework for CloudOps operations.
110
-
110
+
111
111
  Monitors SLA compliance, performance metrics, and operational health
112
112
  across 61-account enterprise environment.
113
113
  """
114
-
114
+
115
115
  def __init__(self, console_instance: Optional[Console] = None):
116
116
  """
117
117
  Initialize production monitoring framework.
118
-
118
+
119
119
  Args:
120
120
  console_instance: Rich console for output
121
121
  """
122
122
  self.console = console_instance or console
123
123
  self.start_time = time.time()
124
-
124
+
125
125
  # SLA targets for enterprise operations
126
126
  self.sla_targets = {
127
- 'availability': SLATarget(
128
- name='availability',
127
+ "availability": SLATarget(
128
+ name="availability",
129
129
  target_value=99.9,
130
130
  warning_threshold=99.5,
131
131
  critical_threshold=99.0,
132
- unit='%',
133
- description='System availability percentage'
132
+ unit="%",
133
+ description="System availability percentage",
134
134
  ),
135
- 'latency_p95': SLATarget(
136
- name='latency_p95',
135
+ "latency_p95": SLATarget(
136
+ name="latency_p95",
137
137
  target_value=30.0,
138
138
  warning_threshold=45.0,
139
139
  critical_threshold=60.0,
140
- unit='seconds',
141
- description='95th percentile operation latency'
140
+ unit="seconds",
141
+ description="95th percentile operation latency",
142
142
  ),
143
- 'success_rate': SLATarget(
144
- name='success_rate',
143
+ "success_rate": SLATarget(
144
+ name="success_rate",
145
145
  target_value=95.0,
146
146
  warning_threshold=90.0,
147
147
  critical_threshold=85.0,
148
- unit='%',
149
- description='Operation success rate'
148
+ unit="%",
149
+ description="Operation success rate",
150
150
  ),
151
- 'error_budget': SLATarget(
152
- name='error_budget',
151
+ "error_budget": SLATarget(
152
+ name="error_budget",
153
153
  target_value=0.1,
154
154
  warning_threshold=0.05,
155
155
  critical_threshold=0.01,
156
- unit='%',
157
- description='Monthly error budget remaining'
158
- )
156
+ unit="%",
157
+ description="Monthly error budget remaining",
158
+ ),
159
159
  }
160
-
160
+
161
161
  # Monitoring state
162
162
  self.active_alerts = []
163
163
  self.metrics_history = []
164
164
  self.circuit_breaker_state = {}
165
165
  self.monitoring_active = False
166
-
166
+
167
167
  # Performance tracking
168
168
  self.operation_metrics = {
169
- 'total_operations': 0,
170
- 'successful_operations': 0,
171
- 'failed_operations': 0,
172
- 'average_latency': 0.0,
173
- 'p95_latency': 0.0
169
+ "total_operations": 0,
170
+ "successful_operations": 0,
171
+ "failed_operations": 0,
172
+ "average_latency": 0.0,
173
+ "p95_latency": 0.0,
174
174
  }
175
-
175
+
176
176
  async def start_monitoring(self, interval_seconds: int = 60) -> None:
177
177
  """
178
178
  Start continuous monitoring loop.
179
-
179
+
180
180
  Args:
181
181
  interval_seconds: Monitoring interval in seconds
182
182
  """
183
183
  self.monitoring_active = True
184
-
184
+
185
185
  print_success("🚀 Production monitoring framework started")
186
-
186
+
187
187
  with Live(self._create_monitoring_dashboard(), refresh_per_second=1, console=self.console) as live:
188
188
  while self.monitoring_active:
189
189
  try:
190
190
  # Collect current metrics
191
191
  current_metrics = await self._collect_current_metrics()
192
-
192
+
193
193
  # Evaluate SLA compliance
194
194
  sla_violations = self._evaluate_sla_compliance(current_metrics)
195
-
195
+
196
196
  # Process alerts
197
197
  await self._process_alerts(sla_violations)
198
-
198
+
199
199
  # Update circuit breaker states
200
200
  self._update_circuit_breakers(current_metrics)
201
-
201
+
202
202
  # Update dashboard
203
203
  live.update(self._create_monitoring_dashboard())
204
-
204
+
205
205
  # Store metrics history
206
- self.metrics_history.append({
207
- 'timestamp': datetime.now(),
208
- 'metrics': current_metrics
209
- })
210
-
206
+ self.metrics_history.append({"timestamp": datetime.now(), "metrics": current_metrics})
207
+
211
208
  # Clean old history (keep 24 hours)
212
209
  self._cleanup_metrics_history()
213
-
210
+
214
211
  await asyncio.sleep(interval_seconds)
215
-
212
+
216
213
  except Exception as e:
217
214
  print_error(f"Monitoring loop error: {str(e)}")
218
215
  await asyncio.sleep(5) # Short retry interval
219
-
216
+
220
217
  async def stop_monitoring(self) -> None:
221
218
  """Stop the monitoring framework gracefully."""
222
219
  self.monitoring_active = False
223
220
  print_info("📊 Production monitoring framework stopped")
224
-
221
+
225
222
  async def _collect_current_metrics(self) -> Dict[str, MonitoringMetric]:
226
223
  """
227
224
  Collect current operational metrics.
228
-
225
+
229
226
  Returns:
230
227
  Dictionary of current metrics
231
228
  """
232
229
  current_metrics = {}
233
-
230
+
234
231
  # Calculate availability (based on successful operations)
235
- total_ops = max(self.operation_metrics['total_operations'], 1)
236
- success_ops = self.operation_metrics['successful_operations']
232
+ total_ops = max(self.operation_metrics["total_operations"], 1)
233
+ success_ops = self.operation_metrics["successful_operations"]
237
234
  availability = (success_ops / total_ops) * 100
238
-
239
- current_metrics['availability'] = MonitoringMetric(
240
- metric_name='availability',
235
+
236
+ current_metrics["availability"] = MonitoringMetric(
237
+ metric_name="availability",
241
238
  current_value=availability,
242
- target_value=self.sla_targets['availability'].target_value,
243
- status=self._determine_status('availability', availability),
239
+ target_value=self.sla_targets["availability"].target_value,
240
+ status=self._determine_status("availability", availability),
244
241
  timestamp=datetime.now(),
245
242
  details={
246
- 'total_operations': total_ops,
247
- 'successful_operations': success_ops,
248
- 'failed_operations': self.operation_metrics['failed_operations']
249
- }
243
+ "total_operations": total_ops,
244
+ "successful_operations": success_ops,
245
+ "failed_operations": self.operation_metrics["failed_operations"],
246
+ },
250
247
  )
251
-
248
+
252
249
  # P95 latency monitoring
253
- p95_latency = self.operation_metrics['p95_latency']
254
- current_metrics['latency_p95'] = MonitoringMetric(
255
- metric_name='latency_p95',
250
+ p95_latency = self.operation_metrics["p95_latency"]
251
+ current_metrics["latency_p95"] = MonitoringMetric(
252
+ metric_name="latency_p95",
256
253
  current_value=p95_latency,
257
- target_value=self.sla_targets['latency_p95'].target_value,
258
- status=self._determine_status('latency_p95', p95_latency),
254
+ target_value=self.sla_targets["latency_p95"].target_value,
255
+ status=self._determine_status("latency_p95", p95_latency),
259
256
  timestamp=datetime.now(),
260
- details={
261
- 'average_latency': self.operation_metrics['average_latency'],
262
- 'p95_latency': p95_latency
263
- }
257
+ details={"average_latency": self.operation_metrics["average_latency"], "p95_latency": p95_latency},
264
258
  )
265
-
259
+
266
260
  # Success rate monitoring
267
261
  success_rate = (success_ops / total_ops) * 100
268
- current_metrics['success_rate'] = MonitoringMetric(
269
- metric_name='success_rate',
262
+ current_metrics["success_rate"] = MonitoringMetric(
263
+ metric_name="success_rate",
270
264
  current_value=success_rate,
271
- target_value=self.sla_targets['success_rate'].target_value,
272
- status=self._determine_status('success_rate', success_rate),
265
+ target_value=self.sla_targets["success_rate"].target_value,
266
+ status=self._determine_status("success_rate", success_rate),
273
267
  timestamp=datetime.now(),
274
- details={'success_percentage': success_rate}
268
+ details={"success_percentage": success_rate},
275
269
  )
276
-
270
+
277
271
  # Error budget monitoring (simplified calculation)
278
- error_budget = max(0.0, 1.0 - (self.operation_metrics['failed_operations'] / total_ops)) * 100
279
- current_metrics['error_budget'] = MonitoringMetric(
280
- metric_name='error_budget',
272
+ error_budget = max(0.0, 1.0 - (self.operation_metrics["failed_operations"] / total_ops)) * 100
273
+ current_metrics["error_budget"] = MonitoringMetric(
274
+ metric_name="error_budget",
281
275
  current_value=error_budget,
282
- target_value=self.sla_targets['error_budget'].target_value,
283
- status=self._determine_status('error_budget', error_budget),
276
+ target_value=self.sla_targets["error_budget"].target_value,
277
+ status=self._determine_status("error_budget", error_budget),
284
278
  timestamp=datetime.now(),
285
- details={'error_budget_remaining': error_budget}
279
+ details={"error_budget_remaining": error_budget},
286
280
  )
287
-
281
+
288
282
  return current_metrics
289
-
283
+
290
284
  def _determine_status(self, metric_name: str, current_value: float) -> OperationStatus:
291
285
  """
292
286
  Determine operation status based on current value and thresholds.
293
-
287
+
294
288
  Args:
295
289
  metric_name: Name of the metric
296
290
  current_value: Current metric value
297
-
291
+
298
292
  Returns:
299
293
  OperationStatus enum value
300
294
  """
301
295
  sla = self.sla_targets[metric_name]
302
-
296
+
303
297
  # For latency, higher is worse
304
- if metric_name == 'latency_p95':
298
+ if metric_name == "latency_p95":
305
299
  if current_value <= sla.target_value:
306
300
  return OperationStatus.HEALTHY
307
301
  elif current_value <= sla.warning_threshold:
@@ -310,7 +304,7 @@ class ProductionMonitoringFramework:
310
304
  return OperationStatus.UNHEALTHY
311
305
  else:
312
306
  return OperationStatus.CRITICAL
313
-
307
+
314
308
  # For other metrics, lower is worse
315
309
  else:
316
310
  if current_value >= sla.target_value:
@@ -321,29 +315,29 @@ class ProductionMonitoringFramework:
321
315
  return OperationStatus.UNHEALTHY
322
316
  else:
323
317
  return OperationStatus.CRITICAL
324
-
318
+
325
319
  def _evaluate_sla_compliance(self, current_metrics: Dict[str, MonitoringMetric]) -> List[MonitoringMetric]:
326
320
  """
327
321
  Evaluate SLA compliance and identify violations.
328
-
322
+
329
323
  Args:
330
324
  current_metrics: Current metric values
331
-
325
+
332
326
  Returns:
333
327
  List of metrics that violate SLA thresholds
334
328
  """
335
329
  violations = []
336
-
330
+
337
331
  for metric in current_metrics.values():
338
332
  if metric.status in [OperationStatus.UNHEALTHY, OperationStatus.CRITICAL]:
339
333
  violations.append(metric)
340
-
334
+
341
335
  return violations
342
-
336
+
343
337
  async def _process_alerts(self, violations: List[MonitoringMetric]) -> None:
344
338
  """
345
339
  Process SLA violations and generate alerts.
346
-
340
+
347
341
  Args:
348
342
  violations: List of metric violations
349
343
  """
@@ -351,23 +345,25 @@ class ProductionMonitoringFramework:
351
345
  # Create alert event
352
346
  alert = AlertEvent(
353
347
  alert_id=f"SLA-{violation.metric_name}-{int(time.time())}",
354
- severity=AlertSeverity.CRITICAL if violation.status == OperationStatus.CRITICAL else AlertSeverity.WARNING,
348
+ severity=AlertSeverity.CRITICAL
349
+ if violation.status == OperationStatus.CRITICAL
350
+ else AlertSeverity.WARNING,
355
351
  metric_name=violation.metric_name,
356
352
  current_value=violation.current_value,
357
353
  threshold_value=self.sla_targets[violation.metric_name].critical_threshold,
358
354
  message=f"SLA violation detected for {violation.metric_name}: {violation.current_value:.2f}{self.sla_targets[violation.metric_name].unit}",
359
- timestamp=datetime.now()
355
+ timestamp=datetime.now(),
360
356
  )
361
-
357
+
362
358
  # Add to active alerts if not already present
363
359
  if not any(a.metric_name == alert.metric_name and not a.resolved for a in self.active_alerts):
364
360
  self.active_alerts.append(alert)
365
361
  await self._send_alert(alert)
366
-
362
+
367
363
  async def _send_alert(self, alert: AlertEvent) -> None:
368
364
  """
369
365
  Send alert notification (placeholder for integration with alerting systems).
370
-
366
+
371
367
  Args:
372
368
  alert: Alert event to send
373
369
  """
@@ -376,32 +372,32 @@ class ProductionMonitoringFramework:
376
372
  # - PagerDuty/OpsGenie
377
373
  # - Email notifications
378
374
  # - ServiceNow incidents
379
-
375
+
380
376
  if alert.severity == AlertSeverity.CRITICAL:
381
377
  print_error(f"🚨 CRITICAL ALERT: {alert.message}")
382
378
  else:
383
379
  print_warning(f"⚠️ WARNING ALERT: {alert.message}")
384
-
380
+
385
381
  def _update_circuit_breakers(self, current_metrics: Dict[str, MonitoringMetric]) -> None:
386
382
  """
387
383
  Update circuit breaker states based on current metrics.
388
-
384
+
389
385
  Args:
390
386
  current_metrics: Current metric values
391
387
  """
392
388
  for metric_name, metric in current_metrics.items():
393
389
  if metric.status == OperationStatus.CRITICAL:
394
- self.circuit_breaker_state[metric_name] = 'OPEN'
390
+ self.circuit_breaker_state[metric_name] = "OPEN"
395
391
  elif metric.status == OperationStatus.HEALTHY:
396
- self.circuit_breaker_state[metric_name] = 'CLOSED'
392
+ self.circuit_breaker_state[metric_name] = "CLOSED"
397
393
  else:
398
394
  # Keep current state for degraded/unhealthy
399
395
  pass
400
-
396
+
401
397
  def _create_monitoring_dashboard(self) -> Panel:
402
398
  """
403
399
  Create Rich dashboard for monitoring display.
404
-
400
+
405
401
  Returns:
406
402
  Rich Panel with monitoring dashboard
407
403
  """
@@ -411,136 +407,130 @@ class ProductionMonitoringFramework:
411
407
  metrics_table.add_column("Current", style="yellow")
412
408
  metrics_table.add_column("Target", style="green")
413
409
  metrics_table.add_column("Status", style="blue")
414
-
410
+
415
411
  for sla_name, sla in self.sla_targets.items():
416
412
  # Get current value from operation metrics
417
- if sla_name == 'availability':
418
- total = max(self.operation_metrics['total_operations'], 1)
419
- current = (self.operation_metrics['successful_operations'] / total) * 100
420
- elif sla_name == 'latency_p95':
421
- current = self.operation_metrics['p95_latency']
422
- elif sla_name == 'success_rate':
423
- total = max(self.operation_metrics['total_operations'], 1)
424
- current = (self.operation_metrics['successful_operations'] / total) * 100
413
+ if sla_name == "availability":
414
+ total = max(self.operation_metrics["total_operations"], 1)
415
+ current = (self.operation_metrics["successful_operations"] / total) * 100
416
+ elif sla_name == "latency_p95":
417
+ current = self.operation_metrics["p95_latency"]
418
+ elif sla_name == "success_rate":
419
+ total = max(self.operation_metrics["total_operations"], 1)
420
+ current = (self.operation_metrics["successful_operations"] / total) * 100
425
421
  else: # error_budget
426
422
  current = 0.1 # Placeholder calculation
427
-
423
+
428
424
  status = self._determine_status(sla_name, current)
429
425
  status_color = {
430
426
  OperationStatus.HEALTHY: "[green]HEALTHY[/green]",
431
427
  OperationStatus.DEGRADED: "[yellow]DEGRADED[/yellow]",
432
428
  OperationStatus.UNHEALTHY: "[red]UNHEALTHY[/red]",
433
- OperationStatus.CRITICAL: "[red bold]CRITICAL[/red bold]"
429
+ OperationStatus.CRITICAL: "[red bold]CRITICAL[/red bold]",
434
430
  }[status]
435
-
431
+
436
432
  metrics_table.add_row(
437
- sla.description,
438
- f"{current:.2f}{sla.unit}",
439
- f"{sla.target_value:.2f}{sla.unit}",
440
- status_color
433
+ sla.description, f"{current:.2f}{sla.unit}", f"{sla.target_value:.2f}{sla.unit}", status_color
441
434
  )
442
-
435
+
443
436
  # Active alerts table
444
437
  alerts_table = Table(title="🚨 Active Alerts")
445
438
  alerts_table.add_column("Severity", style="red")
446
439
  alerts_table.add_column("Metric", style="cyan")
447
440
  alerts_table.add_column("Message", style="yellow")
448
441
  alerts_table.add_column("Time", style="blue")
449
-
442
+
450
443
  active_alerts = [a for a in self.active_alerts if not a.resolved][-5:] # Show last 5
451
444
  for alert in active_alerts:
452
445
  alerts_table.add_row(
453
446
  alert.severity.value,
454
447
  alert.metric_name,
455
448
  alert.message[:50] + "..." if len(alert.message) > 50 else alert.message,
456
- alert.timestamp.strftime("%H:%M:%S")
449
+ alert.timestamp.strftime("%H:%M:%S"),
457
450
  )
458
-
451
+
459
452
  if not active_alerts:
460
453
  alerts_table.add_row("None", "All systems operational", "No active alerts", "")
461
-
454
+
462
455
  # Create dashboard layout
463
456
  dashboard_content = f"""
464
457
  [bold blue]CloudOps Production Monitoring Dashboard[/bold blue]
465
458
 
466
- 📊 Operations: {self.operation_metrics['total_operations']} total
467
- ✅ Success: {self.operation_metrics['successful_operations']}
468
- ❌ Failed: {self.operation_metrics['failed_operations']}
469
- ⏱️ Avg Latency: {self.operation_metrics['average_latency']:.2f}s
459
+ 📊 Operations: {self.operation_metrics["total_operations"]} total
460
+ ✅ Success: {self.operation_metrics["successful_operations"]}
461
+ ❌ Failed: {self.operation_metrics["failed_operations"]}
462
+ ⏱️ Avg Latency: {self.operation_metrics["average_latency"]:.2f}s
470
463
 
471
464
  {metrics_table}
472
465
 
473
466
  {alerts_table}
474
467
 
475
- 🔧 Circuit Breakers: {len([k for k, v in self.circuit_breaker_state.items() if v == 'OPEN'])} OPEN
468
+ 🔧 Circuit Breakers: {len([k for k, v in self.circuit_breaker_state.items() if v == "OPEN"])} OPEN
476
469
  ⚡ Uptime: {time.time() - self.start_time:.0f}s
477
470
  """
478
-
471
+
479
472
  return create_panel(dashboard_content, title="Enterprise SRE Monitoring")
480
-
473
+
481
474
  def _cleanup_metrics_history(self) -> None:
482
475
  """Clean up old metrics history to prevent memory leaks."""
483
476
  cutoff_time = datetime.now() - timedelta(hours=24)
484
- self.metrics_history = [
485
- entry for entry in self.metrics_history
486
- if entry['timestamp'] > cutoff_time
487
- ]
488
-
477
+ self.metrics_history = [entry for entry in self.metrics_history if entry["timestamp"] > cutoff_time]
478
+
489
479
  # Public interface for recording operations
490
480
  def record_operation_start(self, operation_name: str) -> str:
491
481
  """
492
482
  Record the start of an operation for monitoring.
493
-
483
+
494
484
  Args:
495
485
  operation_name: Name of the operation
496
-
486
+
497
487
  Returns:
498
488
  Operation tracking ID
499
489
  """
500
490
  operation_id = f"{operation_name}-{int(time.time())}"
501
- self.operation_metrics['total_operations'] += 1
491
+ self.operation_metrics["total_operations"] += 1
502
492
  return operation_id
503
-
493
+
504
494
  def record_operation_success(self, operation_id: str, latency: float) -> None:
505
495
  """
506
496
  Record successful operation completion.
507
-
497
+
508
498
  Args:
509
499
  operation_id: Operation tracking ID
510
500
  latency: Operation latency in seconds
511
501
  """
512
- self.operation_metrics['successful_operations'] += 1
513
-
502
+ self.operation_metrics["successful_operations"] += 1
503
+
514
504
  # Update latency metrics (simplified calculation)
515
- total_ops = self.operation_metrics['total_operations']
516
- current_avg = self.operation_metrics['average_latency']
505
+ total_ops = self.operation_metrics["total_operations"]
506
+ current_avg = self.operation_metrics["average_latency"]
517
507
  new_avg = ((current_avg * (total_ops - 1)) + latency) / total_ops
518
- self.operation_metrics['average_latency'] = new_avg
519
-
508
+ self.operation_metrics["average_latency"] = new_avg
509
+
520
510
  # Simplified P95 calculation (use 95% of max latency seen)
521
- self.operation_metrics['p95_latency'] = max(self.operation_metrics['p95_latency'], latency * 0.95)
522
-
511
+ self.operation_metrics["p95_latency"] = max(self.operation_metrics["p95_latency"], latency * 0.95)
512
+
523
513
  def record_operation_failure(self, operation_id: str, error: str) -> None:
524
514
  """
525
515
  Record failed operation.
526
-
516
+
527
517
  Args:
528
518
  operation_id: Operation tracking ID
529
519
  error: Error message
530
520
  """
531
- self.operation_metrics['failed_operations'] += 1
532
-
521
+ self.operation_metrics["failed_operations"] += 1
522
+
533
523
  def is_circuit_breaker_open(self, metric_name: str) -> bool:
534
524
  """
535
525
  Check if circuit breaker is open for a specific metric.
536
-
526
+
537
527
  Args:
538
528
  metric_name: Name of the metric to check
539
-
529
+
540
530
  Returns:
541
531
  True if circuit breaker is open
542
532
  """
543
- return self.circuit_breaker_state.get(metric_name) == 'OPEN'
533
+ return self.circuit_breaker_state.get(metric_name) == "OPEN"
544
534
 
545
535
 
546
536
  # Export public interface
@@ -557,28 +547,28 @@ __all__ = [
557
547
  # CLI interface for running monitoring
558
548
  if __name__ == "__main__":
559
549
  import argparse
560
-
550
+
561
551
  parser = argparse.ArgumentParser(description="CloudOps Production Monitoring Framework")
562
552
  parser.add_argument("--interval", type=int, default=60, help="Monitoring interval in seconds")
563
553
  parser.add_argument("--demo", action="store_true", help="Run in demo mode with simulated metrics")
564
-
554
+
565
555
  args = parser.parse_args()
566
-
556
+
567
557
  async def main():
568
558
  monitoring = ProductionMonitoringFramework()
569
-
559
+
570
560
  if args.demo:
571
561
  # Simulate some operations for demo
572
- monitoring.operation_metrics['total_operations'] = 1000
573
- monitoring.operation_metrics['successful_operations'] = 950
574
- monitoring.operation_metrics['failed_operations'] = 50
575
- monitoring.operation_metrics['average_latency'] = 15.5
576
- monitoring.operation_metrics['p95_latency'] = 28.2
577
-
562
+ monitoring.operation_metrics["total_operations"] = 1000
563
+ monitoring.operation_metrics["successful_operations"] = 950
564
+ monitoring.operation_metrics["failed_operations"] = 50
565
+ monitoring.operation_metrics["average_latency"] = 15.5
566
+ monitoring.operation_metrics["p95_latency"] = 28.2
567
+
578
568
  await monitoring.start_monitoring(args.interval)
579
-
569
+
580
570
  # Run the monitoring framework
581
571
  try:
582
572
  asyncio.run(main())
583
573
  except KeyboardInterrupt:
584
- console.print("\n[yellow]Monitoring framework stopped by user[/yellow]")
574
+ console.print("\n[yellow]Monitoring framework stopped by user[/yellow]")