runbooks 0.7.7__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. runbooks/__init__.py +1 -1
  2. runbooks/base.py +2 -2
  3. runbooks/cfat/README.md +12 -1
  4. runbooks/cfat/__init__.py +8 -4
  5. runbooks/cfat/assessment/collectors.py +171 -14
  6. runbooks/cfat/assessment/compliance.py +546 -522
  7. runbooks/cfat/assessment/runner.py +129 -10
  8. runbooks/cfat/models.py +6 -2
  9. runbooks/common/__init__.py +152 -0
  10. runbooks/common/accuracy_validator.py +1039 -0
  11. runbooks/common/context_logger.py +440 -0
  12. runbooks/common/cross_module_integration.py +594 -0
  13. runbooks/common/enhanced_exception_handler.py +1108 -0
  14. runbooks/common/enterprise_audit_integration.py +634 -0
  15. runbooks/common/logger.py +14 -0
  16. runbooks/common/mcp_integration.py +539 -0
  17. runbooks/common/performance_monitor.py +387 -0
  18. runbooks/common/profile_utils.py +216 -0
  19. runbooks/common/rich_utils.py +622 -0
  20. runbooks/enterprise/__init__.py +68 -0
  21. runbooks/enterprise/error_handling.py +411 -0
  22. runbooks/enterprise/logging.py +439 -0
  23. runbooks/enterprise/multi_tenant.py +583 -0
  24. runbooks/feedback/user_feedback_collector.py +440 -0
  25. runbooks/finops/README.md +129 -14
  26. runbooks/finops/__init__.py +22 -3
  27. runbooks/finops/account_resolver.py +279 -0
  28. runbooks/finops/accuracy_cross_validator.py +638 -0
  29. runbooks/finops/aws_client.py +721 -36
  30. runbooks/finops/budget_integration.py +313 -0
  31. runbooks/finops/cli.py +90 -33
  32. runbooks/finops/cost_processor.py +211 -37
  33. runbooks/finops/dashboard_router.py +900 -0
  34. runbooks/finops/dashboard_runner.py +1334 -399
  35. runbooks/finops/embedded_mcp_validator.py +288 -0
  36. runbooks/finops/enhanced_dashboard_runner.py +526 -0
  37. runbooks/finops/enhanced_progress.py +327 -0
  38. runbooks/finops/enhanced_trend_visualization.py +423 -0
  39. runbooks/finops/finops_dashboard.py +41 -0
  40. runbooks/finops/helpers.py +639 -323
  41. runbooks/finops/iam_guidance.py +400 -0
  42. runbooks/finops/markdown_exporter.py +466 -0
  43. runbooks/finops/multi_dashboard.py +1502 -0
  44. runbooks/finops/optimizer.py +396 -395
  45. runbooks/finops/profile_processor.py +2 -2
  46. runbooks/finops/runbooks.inventory.organizations_discovery.log +0 -0
  47. runbooks/finops/runbooks.security.report_generator.log +0 -0
  48. runbooks/finops/runbooks.security.run_script.log +0 -0
  49. runbooks/finops/runbooks.security.security_export.log +0 -0
  50. runbooks/finops/service_mapping.py +195 -0
  51. runbooks/finops/single_dashboard.py +710 -0
  52. runbooks/finops/tests/__init__.py +19 -0
  53. runbooks/finops/tests/results_test_finops_dashboard.xml +1 -0
  54. runbooks/finops/tests/run_comprehensive_tests.py +421 -0
  55. runbooks/finops/tests/run_tests.py +305 -0
  56. runbooks/finops/tests/test_finops_dashboard.py +705 -0
  57. runbooks/finops/tests/test_integration.py +477 -0
  58. runbooks/finops/tests/test_performance.py +380 -0
  59. runbooks/finops/tests/test_performance_benchmarks.py +500 -0
  60. runbooks/finops/tests/test_reference_images_validation.py +867 -0
  61. runbooks/finops/tests/test_single_account_features.py +715 -0
  62. runbooks/finops/tests/validate_test_suite.py +220 -0
  63. runbooks/finops/types.py +1 -1
  64. runbooks/hitl/enhanced_workflow_engine.py +725 -0
  65. runbooks/inventory/README.md +12 -1
  66. runbooks/inventory/artifacts/scale-optimize-status.txt +12 -0
  67. runbooks/inventory/collectors/aws_comprehensive.py +192 -185
  68. runbooks/inventory/collectors/enterprise_scale.py +281 -0
  69. runbooks/inventory/core/collector.py +299 -12
  70. runbooks/inventory/list_ec2_instances.py +21 -20
  71. runbooks/inventory/list_ssm_parameters.py +31 -3
  72. runbooks/inventory/organizations_discovery.py +1315 -0
  73. runbooks/inventory/rich_inventory_display.py +360 -0
  74. runbooks/inventory/run_on_multi_accounts.py +32 -16
  75. runbooks/inventory/runbooks.security.report_generator.log +0 -0
  76. runbooks/inventory/runbooks.security.run_script.log +0 -0
  77. runbooks/inventory/vpc_flow_analyzer.py +1030 -0
  78. runbooks/main.py +4171 -1615
  79. runbooks/metrics/dora_metrics_engine.py +1293 -0
  80. runbooks/monitoring/performance_monitor.py +433 -0
  81. runbooks/operate/README.md +394 -0
  82. runbooks/operate/__init__.py +2 -2
  83. runbooks/operate/base.py +291 -11
  84. runbooks/operate/deployment_framework.py +1032 -0
  85. runbooks/operate/deployment_validator.py +853 -0
  86. runbooks/operate/dynamodb_operations.py +10 -6
  87. runbooks/operate/ec2_operations.py +321 -11
  88. runbooks/operate/executive_dashboard.py +779 -0
  89. runbooks/operate/mcp_integration.py +750 -0
  90. runbooks/operate/nat_gateway_operations.py +1120 -0
  91. runbooks/operate/networking_cost_heatmap.py +685 -0
  92. runbooks/operate/privatelink_operations.py +940 -0
  93. runbooks/operate/s3_operations.py +10 -6
  94. runbooks/operate/vpc_endpoints.py +644 -0
  95. runbooks/operate/vpc_operations.py +1038 -0
  96. runbooks/remediation/README.md +489 -13
  97. runbooks/remediation/__init__.py +2 -2
  98. runbooks/remediation/acm_remediation.py +1 -1
  99. runbooks/remediation/base.py +1 -1
  100. runbooks/remediation/cloudtrail_remediation.py +1 -1
  101. runbooks/remediation/cognito_remediation.py +1 -1
  102. runbooks/remediation/commons.py +8 -4
  103. runbooks/remediation/dynamodb_remediation.py +1 -1
  104. runbooks/remediation/ec2_remediation.py +1 -1
  105. runbooks/remediation/ec2_unattached_ebs_volumes.py +1 -1
  106. runbooks/remediation/kms_enable_key_rotation.py +1 -1
  107. runbooks/remediation/kms_remediation.py +1 -1
  108. runbooks/remediation/lambda_remediation.py +1 -1
  109. runbooks/remediation/multi_account.py +1 -1
  110. runbooks/remediation/rds_remediation.py +1 -1
  111. runbooks/remediation/s3_block_public_access.py +1 -1
  112. runbooks/remediation/s3_enable_access_logging.py +1 -1
  113. runbooks/remediation/s3_encryption.py +1 -1
  114. runbooks/remediation/s3_remediation.py +1 -1
  115. runbooks/remediation/vpc_remediation.py +475 -0
  116. runbooks/security/ENTERPRISE_SECURITY_FRAMEWORK.md +506 -0
  117. runbooks/security/README.md +12 -1
  118. runbooks/security/__init__.py +166 -33
  119. runbooks/security/compliance_automation.py +634 -0
  120. runbooks/security/compliance_automation_engine.py +1021 -0
  121. runbooks/security/enterprise_security_framework.py +931 -0
  122. runbooks/security/enterprise_security_policies.json +293 -0
  123. runbooks/security/integration_test_enterprise_security.py +879 -0
  124. runbooks/security/module_security_integrator.py +641 -0
  125. runbooks/security/report_generator.py +10 -0
  126. runbooks/security/run_script.py +27 -5
  127. runbooks/security/security_baseline_tester.py +153 -27
  128. runbooks/security/security_export.py +456 -0
  129. runbooks/sre/README.md +472 -0
  130. runbooks/sre/__init__.py +33 -0
  131. runbooks/sre/mcp_reliability_engine.py +1049 -0
  132. runbooks/sre/performance_optimization_engine.py +1032 -0
  133. runbooks/sre/reliability_monitoring_framework.py +1011 -0
  134. runbooks/validation/__init__.py +10 -0
  135. runbooks/validation/benchmark.py +489 -0
  136. runbooks/validation/cli.py +368 -0
  137. runbooks/validation/mcp_validator.py +797 -0
  138. runbooks/vpc/README.md +478 -0
  139. runbooks/vpc/__init__.py +38 -0
  140. runbooks/vpc/config.py +212 -0
  141. runbooks/vpc/cost_engine.py +347 -0
  142. runbooks/vpc/heatmap_engine.py +605 -0
  143. runbooks/vpc/manager_interface.py +649 -0
  144. runbooks/vpc/networking_wrapper.py +1289 -0
  145. runbooks/vpc/rich_formatters.py +693 -0
  146. runbooks/vpc/tests/__init__.py +5 -0
  147. runbooks/vpc/tests/conftest.py +356 -0
  148. runbooks/vpc/tests/test_cli_integration.py +530 -0
  149. runbooks/vpc/tests/test_config.py +458 -0
  150. runbooks/vpc/tests/test_cost_engine.py +479 -0
  151. runbooks/vpc/tests/test_networking_wrapper.py +512 -0
  152. {runbooks-0.7.7.dist-info → runbooks-0.9.0.dist-info}/METADATA +175 -65
  153. {runbooks-0.7.7.dist-info → runbooks-0.9.0.dist-info}/RECORD +157 -60
  154. {runbooks-0.7.7.dist-info → runbooks-0.9.0.dist-info}/entry_points.txt +1 -1
  155. {runbooks-0.7.7.dist-info → runbooks-0.9.0.dist-info}/WHEEL +0 -0
  156. {runbooks-0.7.7.dist-info → runbooks-0.9.0.dist-info}/licenses/LICENSE +0 -0
  157. {runbooks-0.7.7.dist-info → runbooks-0.9.0.dist-info}/top_level.txt +0 -0
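
The largest single addition in this release is the new DORA metrics engine at runbooks/metrics/dora_metrics_engine.py (+1293 lines), reproduced in full below. A file-level summary like the list above can be reproduced locally once both wheels have been downloaded (for example with pip download runbooks==0.7.7 and pip download runbooks==0.9.0). The following is a minimal sketch, not part of the package: the wheel filenames are the standard PyPI names and are assumed here, and it only reports added and removed paths, not per-file line counts.

#!/usr/bin/env python3
"""Compare the file lists of two wheels (illustrative sketch, not from the package)."""
import zipfile

OLD_WHEEL = "runbooks-0.7.7-py3-none-any.whl"  # assumed standard wheel filename
NEW_WHEEL = "runbooks-0.9.0-py3-none-any.whl"  # assumed standard wheel filename

def wheel_files(path):
    # A wheel is a zip archive; namelist() returns every packaged path.
    with zipfile.ZipFile(path) as wheel:
        return set(wheel.namelist())

old_files = wheel_files(OLD_WHEEL)
new_files = wheel_files(NEW_WHEEL)

print(f"Added ({len(new_files - old_files)}):")
for name in sorted(new_files - old_files):
    print(f"  + {name}")
print(f"Removed ({len(old_files - new_files)}):")
for name in sorted(old_files - new_files):
    print(f"  - {name}")
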
@@ -0,0 +1,1293 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ DORA Metrics Engine for HITL System Optimization
4
+
5
+ Issue #93: HITL System & DORA Metrics Optimization
6
+ Priority: High (Sprint 1 Improvements)
7
+ Scope: Optimize Human-in-the-Loop system and enhance DORA metrics collection
8
+ """
9
+
10
+ import asyncio
11
+ import json
12
+ import logging
13
+ import time
14
+ from dataclasses import asdict, dataclass
15
+ from datetime import datetime, timedelta, timezone
16
+ from pathlib import Path
17
+ from typing import Dict, List, Optional, Tuple
18
+
19
+ from ..utils.logger import configure_logger
20
+
21
+ logger = configure_logger(__name__)
22
+
23
+
24
+ @dataclass
25
+ class DORAMetric:
26
+ """Individual DORA metric measurement"""
27
+
28
+ metric_name: str
29
+ value: float
30
+ unit: str
31
+ timestamp: datetime
32
+ tags: Optional[Dict[str, str]] = None
33
+ metadata: Optional[Dict] = None
34
+
35
+ def __post_init__(self):
36
+ if self.tags is None:
37
+ self.tags = {}
38
+ if self.metadata is None:
39
+ self.metadata = {}
40
+
41
+
42
+ @dataclass
43
+ class DeploymentEvent:
44
+ """Deployment event for DORA metrics tracking"""
45
+
46
+ deployment_id: str
47
+ environment: str
48
+ service_name: str
49
+ version: str
50
+ start_time: datetime
51
+ end_time: Optional[datetime] = None
52
+ status: str = "in_progress" # in_progress, success, failed, rolled_back
53
+ commit_sha: str = ""
54
+ approver: str = ""
55
+ rollback_time: Optional[datetime] = None
56
+
57
+
58
+ @dataclass
59
+ class IncidentEvent:
60
+ """Incident event for DORA metrics tracking"""
61
+
62
+ incident_id: str
63
+ service_name: str
64
+ severity: str # critical, high, medium, low
65
+ start_time: datetime
66
+ detection_time: Optional[datetime] = None
67
+ resolution_time: Optional[datetime] = None
68
+ root_cause: str = ""
69
+ caused_by_deployment: str = ""
70
+
71
+
72
+ class DORAMetricsEngine:
73
+ """
74
+ Enhanced DORA metrics collection and analysis engine for Enterprise SRE.
75
+
76
+ Provides comprehensive DORA metrics (Lead Time, Deploy Frequency, MTTR, Change Failure Rate)
77
+ with real-time collection, automated alerting, and enterprise dashboard integration.
78
+
79
+ Features:
80
+ - Real-time metrics streaming from git operations
81
+ - Automated deployment event capture via GitHub webhooks
82
+ - CloudWatch/Datadog integration for enterprise monitoring
83
+ - Cross-session persistence with baseline trending
84
+ - SLA compliance tracking with automated alerting
85
+ """
86
+
87
+ def __init__(self, artifacts_dir: str = "./artifacts/metrics", cross_validation_tolerance: float = 15.0):
88
+ """
89
+ Initialize enterprise DORA metrics engine
90
+
91
+ Args:
92
+ artifacts_dir: Directory to store metrics artifacts
93
+ cross_validation_tolerance: Tolerance percentage for metric validation
94
+ """
95
+ self.artifacts_dir = Path(artifacts_dir)
96
+ self.artifacts_dir.mkdir(parents=True, exist_ok=True)
97
+
98
+ # Create SRE-focused subdirectories
99
+ (self.artifacts_dir / "dora-reports").mkdir(exist_ok=True)
100
+ (self.artifacts_dir / "baselines").mkdir(exist_ok=True)
101
+ (self.artifacts_dir / "alerts").mkdir(exist_ok=True)
102
+ (self.artifacts_dir / "dashboards").mkdir(exist_ok=True)
103
+
104
+ self.tolerance = cross_validation_tolerance
105
+
106
+ # Metrics storage with persistence
107
+ self.deployments: List[DeploymentEvent] = []
108
+ self.incidents: List[IncidentEvent] = []
109
+ self.metrics_history: List[DORAMetric] = []
110
+ self.baselines: Dict[str, float] = {}
111
+
112
+ # HITL workflow metrics
113
+ self.approval_times: List[float] = []
114
+ self.workflow_bottlenecks: Dict[str, List[float]] = {}
115
+
116
+ # Enterprise SRE performance targets (FAANG SDLC standards)
117
+ self.targets = {
118
+ "lead_time_hours": 4, # <4 hours (FAANG velocity)
119
+ "deploy_frequency_daily": 1, # Daily deployment capability
120
+ "change_failure_rate": 0.05, # <5% (FAANG quality)
121
+ "mttr_hours": 1, # <1 hour (SRE excellence)
122
+ "approval_time_minutes": 30, # <30 minutes (HITL efficiency)
123
+ "success_rate": 0.95, # >95% (Enterprise reliability)
124
+ "sla_availability": 0.999, # >99.9% uptime
125
+ "performance_score": 90, # >90% performance score
126
+ }
127
+
128
+ # SRE alerting thresholds
129
+ self.alert_thresholds = {
130
+ "lead_time_hours": 6, # Alert if >6 hours
131
+ "deploy_frequency_daily": 0.5, # Alert if <0.5 deploys/day
132
+ "change_failure_rate": 0.10, # Alert if >10%
133
+ "mttr_hours": 2, # Alert if >2 hours
134
+ "approval_time_minutes": 60, # Alert if >60 minutes
135
+ }
136
+
137
+ # Load existing data
138
+ self._load_persistent_data()
139
+
140
+ # Initialize baseline metrics if not exists
141
+ self._initialize_baselines()
142
+
143
+ def record_deployment(
144
+ self,
145
+ deployment_id: str,
146
+ environment: str,
147
+ service_name: str,
148
+ version: str,
149
+ commit_sha: str = "",
150
+ approver: str = "",
151
+ ) -> DeploymentEvent:
152
+ """Record a new deployment event"""
153
+
154
+ deployment = DeploymentEvent(
155
+ deployment_id=deployment_id,
156
+ environment=environment,
157
+ service_name=service_name,
158
+ version=version,
159
+ start_time=datetime.now(timezone.utc),
160
+ commit_sha=commit_sha,
161
+ approver=approver,
162
+ )
163
+
164
+ self.deployments.append(deployment)
165
+
166
+ logger.info(f"๐Ÿš€ Deployment recorded: {deployment_id} for {service_name}")
167
+
168
+ return deployment
169
+
170
+ def complete_deployment(self, deployment_id: str, status: str, rollback_time: Optional[datetime] = None) -> bool:
171
+ """Mark deployment as complete"""
172
+
173
+ for deployment in self.deployments:
174
+ if deployment.deployment_id == deployment_id:
175
+ deployment.end_time = datetime.now(timezone.utc)
176
+ deployment.status = status
177
+ deployment.rollback_time = rollback_time
178
+
179
+ logger.info(f"โœ… Deployment completed: {deployment_id} - {status}")
180
+ return True
181
+
182
+ logger.warning(f"โš ๏ธ Deployment not found: {deployment_id}")
183
+ return False
184
+
185
+ def record_incident(
186
+ self, incident_id: str, service_name: str, severity: str, root_cause: str = "", caused_by_deployment: str = ""
187
+ ) -> IncidentEvent:
188
+ """Record a new incident event"""
189
+
190
+ incident = IncidentEvent(
191
+ incident_id=incident_id,
192
+ service_name=service_name,
193
+ severity=severity,
194
+ start_time=datetime.now(timezone.utc),
195
+ root_cause=root_cause,
196
+ caused_by_deployment=caused_by_deployment,
197
+ )
198
+
199
+ self.incidents.append(incident)
200
+
201
+ logger.info(f"๐Ÿšจ Incident recorded: {incident_id} - {severity} severity")
202
+
203
+ return incident
204
+
205
+ def resolve_incident(self, incident_id: str, detection_time: Optional[datetime] = None) -> bool:
206
+ """Mark incident as resolved"""
207
+
208
+ for incident in self.incidents:
209
+ if incident.incident_id == incident_id:
210
+ incident.resolution_time = datetime.now(timezone.utc)
211
+ if detection_time:
212
+ incident.detection_time = detection_time
213
+
214
+ logger.info(f"โœ… Incident resolved: {incident_id}")
215
+ return True
216
+
217
+ logger.warning(f"โš ๏ธ Incident not found: {incident_id}")
218
+ return False
219
+
220
+ def record_approval_time(self, approval_time_minutes: float, workflow_step: str = "general"):
221
+ """Record HITL approval time"""
222
+ self.approval_times.append(approval_time_minutes)
223
+
224
+ if workflow_step not in self.workflow_bottlenecks:
225
+ self.workflow_bottlenecks[workflow_step] = []
226
+ self.workflow_bottlenecks[workflow_step].append(approval_time_minutes)
227
+
228
+ def calculate_lead_time(self, days_back: int = 30) -> DORAMetric:
229
+ """Calculate deployment lead time"""
230
+
231
+ cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
232
+ recent_deployments = [d for d in self.deployments if d.start_time >= cutoff_date and d.end_time]
233
+
234
+ if not recent_deployments:
235
+ return DORAMetric(
236
+ metric_name="lead_time",
237
+ value=0.0,
238
+ unit="hours",
239
+ timestamp=datetime.now(timezone.utc),
240
+ tags={"period": f"{days_back}d", "status": "no_data"},
241
+ )
242
+
243
+ # Calculate average lead time (simplified - in real scenario would track from commit to production)
244
+ lead_times = []
245
+ for deployment in recent_deployments:
246
+ if deployment.end_time and deployment.status == "success":
247
+ duration = (deployment.end_time - deployment.start_time).total_seconds() / 3600 # hours
248
+ lead_times.append(duration)
249
+
250
+ avg_lead_time = sum(lead_times) / len(lead_times) if lead_times else 0
251
+
252
+ metric = DORAMetric(
253
+ metric_name="lead_time",
254
+ value=avg_lead_time,
255
+ unit="hours",
256
+ timestamp=datetime.now(timezone.utc),
257
+ tags={
258
+ "period": f"{days_back}d",
259
+ "deployments_count": str(len(recent_deployments)),
260
+ "successful_deployments": str(len(lead_times)),
261
+ },
262
+ metadata={
263
+ "target": self.targets["lead_time_hours"],
264
+ "target_met": avg_lead_time <= self.targets["lead_time_hours"],
265
+ },
266
+ )
267
+
268
+ self.metrics_history.append(metric)
269
+ return metric
270
+
271
+ def calculate_deployment_frequency(self, days_back: int = 30) -> DORAMetric:
272
+ """Calculate deployment frequency"""
273
+
274
+ cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
275
+ recent_deployments = [d for d in self.deployments if d.start_time >= cutoff_date]
276
+
277
+ # Calculate deployments per day
278
+ deployments_per_day = len(recent_deployments) / days_back if days_back > 0 else 0
279
+
280
+ metric = DORAMetric(
281
+ metric_name="deployment_frequency",
282
+ value=deployments_per_day,
283
+ unit="deployments_per_day",
284
+ timestamp=datetime.now(timezone.utc),
285
+ tags={"period": f"{days_back}d", "total_deployments": str(len(recent_deployments))},
286
+ metadata={
287
+ "target": self.targets["deploy_frequency_daily"],
288
+ "target_met": deployments_per_day >= self.targets["deploy_frequency_daily"],
289
+ },
290
+ )
291
+
292
+ self.metrics_history.append(metric)
293
+ return metric
294
+
295
+ def calculate_change_failure_rate(self, days_back: int = 30) -> DORAMetric:
296
+ """Calculate change failure rate"""
297
+
298
+ cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
299
+ recent_deployments = [d for d in self.deployments if d.start_time >= cutoff_date and d.end_time]
300
+
301
+ if not recent_deployments:
302
+ return DORAMetric(
303
+ metric_name="change_failure_rate",
304
+ value=0.0,
305
+ unit="percentage",
306
+ timestamp=datetime.now(timezone.utc),
307
+ tags={"period": f"{days_back}d", "status": "no_data"},
308
+ )
309
+
310
+ failed_deployments = len([d for d in recent_deployments if d.status in ["failed", "rolled_back"]])
311
+
312
+ failure_rate = failed_deployments / len(recent_deployments)
313
+
314
+ metric = DORAMetric(
315
+ metric_name="change_failure_rate",
316
+ value=failure_rate,
317
+ unit="percentage",
318
+ timestamp=datetime.now(timezone.utc),
319
+ tags={
320
+ "period": f"{days_back}d",
321
+ "total_deployments": str(len(recent_deployments)),
322
+ "failed_deployments": str(failed_deployments),
323
+ },
324
+ metadata={
325
+ "target": self.targets["change_failure_rate"],
326
+ "target_met": failure_rate <= self.targets["change_failure_rate"],
327
+ },
328
+ )
329
+
330
+ self.metrics_history.append(metric)
331
+ return metric
332
+
333
+ def calculate_mttr(self, days_back: int = 30) -> DORAMetric:
334
+ """Calculate Mean Time to Recovery (MTTR)"""
335
+
336
+ cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
337
+ recent_incidents = [i for i in self.incidents if i.start_time >= cutoff_date and i.resolution_time]
338
+
339
+ if not recent_incidents:
340
+ return DORAMetric(
341
+ metric_name="mttr",
342
+ value=0.0,
343
+ unit="hours",
344
+ timestamp=datetime.now(timezone.utc),
345
+ tags={"period": f"{days_back}d", "status": "no_data"},
346
+ )
347
+
348
+ # Calculate recovery times
349
+ recovery_times = []
350
+ for incident in recent_incidents:
351
+ if incident.resolution_time:
352
+ duration = (incident.resolution_time - incident.start_time).total_seconds() / 3600 # hours
353
+ recovery_times.append(duration)
354
+
355
+ avg_mttr = sum(recovery_times) / len(recovery_times) if recovery_times else 0
356
+
357
+ metric = DORAMetric(
358
+ metric_name="mttr",
359
+ value=avg_mttr,
360
+ unit="hours",
361
+ timestamp=datetime.now(timezone.utc),
362
+ tags={"period": f"{days_back}d", "incidents_count": str(len(recent_incidents))},
363
+ metadata={"target": self.targets["mttr_hours"], "target_met": avg_mttr <= self.targets["mttr_hours"]},
364
+ )
365
+
366
+ self.metrics_history.append(metric)
367
+ return metric
368
+
369
+ def calculate_hitl_metrics(self) -> Dict[str, DORAMetric]:
370
+ """Calculate Human-in-the-Loop specific metrics"""
371
+
372
+ metrics = {}
373
+
374
+ # Average approval time
375
+ if self.approval_times:
376
+ avg_approval_time = sum(self.approval_times) / len(self.approval_times)
377
+
378
+ metrics["approval_time"] = DORAMetric(
379
+ metric_name="approval_time",
380
+ value=avg_approval_time,
381
+ unit="minutes",
382
+ timestamp=datetime.now(timezone.utc),
383
+ tags={"total_approvals": str(len(self.approval_times))},
384
+ metadata={
385
+ "target": self.targets["approval_time_minutes"],
386
+ "target_met": avg_approval_time <= self.targets["approval_time_minutes"],
387
+ },
388
+ )
389
+
390
+ # Workflow bottlenecks analysis
391
+ if self.workflow_bottlenecks:
392
+ bottleneck_metrics = {}
393
+
394
+ for step, times in self.workflow_bottlenecks.items():
395
+ if times:
396
+ avg_time = sum(times) / len(times)
397
+ bottleneck_metrics[f"{step}_avg_time"] = avg_time
398
+
399
+ # Identify slowest step
400
+ if bottleneck_metrics:
401
+ slowest_step = max(bottleneck_metrics, key=bottleneck_metrics.get)
402
+ slowest_time = bottleneck_metrics[slowest_step]
403
+
404
+ metrics["workflow_bottleneck"] = DORAMetric(
405
+ metric_name="workflow_bottleneck",
406
+ value=slowest_time,
407
+ unit="minutes",
408
+ timestamp=datetime.now(timezone.utc),
409
+ tags={"bottleneck_step": slowest_step},
410
+ metadata={"all_steps": bottleneck_metrics},
411
+ )
412
+
413
+ return metrics
414
+
415
+ def _load_persistent_data(self) -> None:
416
+ """Load persistent DORA data from storage."""
417
+ try:
418
+ # Load deployments
419
+ deployments_file = self.artifacts_dir / "deployments.json"
420
+ if deployments_file.exists():
421
+ with open(deployments_file, "r") as f:
422
+ data = json.load(f)
423
+ self.deployments = [DeploymentEvent(**item) for item in data.get("deployments", [])]
424
+
425
+ # Load incidents
426
+ incidents_file = self.artifacts_dir / "incidents.json"
427
+ if incidents_file.exists():
428
+ with open(incidents_file, "r") as f:
429
+ data = json.load(f)
430
+ self.incidents = [IncidentEvent(**item) for item in data.get("incidents", [])]
431
+
432
+ # Load baselines
433
+ baselines_file = self.artifacts_dir / "baselines" / "current_baselines.json"
434
+ if baselines_file.exists():
435
+ with open(baselines_file, "r") as f:
436
+ self.baselines = json.load(f)
437
+
438
+ logger.info(f"๐Ÿ“Š Loaded {len(self.deployments)} deployments, {len(self.incidents)} incidents")
439
+
440
+ except Exception as e:
441
+ logger.warning(f"โš ๏ธ Failed to load persistent data: {e}")
442
+
443
+ def _save_persistent_data(self) -> None:
444
+ """Save persistent DORA data to storage."""
445
+ try:
446
+ # Save deployments
447
+ deployments_data = {
448
+ "deployments": [asdict(d) for d in self.deployments],
449
+ "last_updated": datetime.now(timezone.utc).isoformat(),
450
+ }
451
+ deployments_file = self.artifacts_dir / "deployments.json"
452
+ with open(deployments_file, "w") as f:
453
+ json.dump(deployments_data, f, indent=2, default=str)
454
+
455
+ # Save incidents
456
+ incidents_data = {
457
+ "incidents": [asdict(i) for i in self.incidents],
458
+ "last_updated": datetime.now(timezone.utc).isoformat(),
459
+ }
460
+ incidents_file = self.artifacts_dir / "incidents.json"
461
+ with open(incidents_file, "w") as f:
462
+ json.dump(incidents_data, f, indent=2, default=str)
463
+
464
+ # Save baselines
465
+ baselines_file = self.artifacts_dir / "baselines" / "current_baselines.json"
466
+ with open(baselines_file, "w") as f:
467
+ json.dump(self.baselines, f, indent=2)
468
+
469
+ except Exception as e:
470
+ logger.error(f"โŒ Failed to save persistent data: {e}")
471
+
472
+ def _initialize_baselines(self) -> None:
473
+ """Initialize baseline metrics for trending analysis."""
474
+ if not self.baselines and len(self.deployments) > 10:
475
+ # Calculate initial baselines from historical data
476
+ lead_time_metric = self.calculate_lead_time(30)
477
+ deploy_freq_metric = self.calculate_deployment_frequency(30)
478
+ failure_rate_metric = self.calculate_change_failure_rate(30)
479
+ mttr_metric = self.calculate_mttr(30)
480
+
481
+ self.baselines = {
482
+ "lead_time_hours": lead_time_metric.value,
483
+ "deploy_frequency_daily": deploy_freq_metric.value,
484
+ "change_failure_rate": failure_rate_metric.value,
485
+ "mttr_hours": mttr_metric.value,
486
+ "baseline_established": datetime.now(timezone.utc).isoformat(),
487
+ "sample_size": len(self.deployments),
488
+ }
489
+
490
+ logger.info("๐Ÿ“ˆ Established baseline metrics from historical data")
491
+ self._save_persistent_data()
492
+
493
+ def track_git_deployment(
494
+ self, commit_sha: str, branch: str = "main", author: str = "", message: str = ""
495
+ ) -> DeploymentEvent:
496
+ """
497
+ Track deployment from git operations for automated DORA collection.
498
+
499
+ Args:
500
+ commit_sha: Git commit SHA
501
+ branch: Git branch name
502
+ author: Commit author
503
+ message: Commit message
504
+
505
+ Returns:
506
+ Created deployment event
507
+ """
508
+ deployment_id = f"git-{commit_sha[:8]}-{int(time.time())}"
509
+
510
+ deployment = self.record_deployment(
511
+ deployment_id=deployment_id,
512
+ environment="production" if branch == "main" else "development",
513
+ service_name="runbooks",
514
+ version=commit_sha[:8],
515
+ commit_sha=commit_sha,
516
+ approver=author,
517
+ )
518
+
519
+ # Add git metadata
520
+ deployment.metadata = {
521
+ "branch": branch,
522
+ "author": author,
523
+ "message": message,
524
+ "automated": True,
525
+ "source": "git_integration",
526
+ }
527
+
528
+ logger.info(f"๐Ÿ”— Git deployment tracked: {commit_sha[:8]} on {branch}")
529
+
530
+ # Auto-save after git integration
531
+ self._save_persistent_data()
532
+
533
+ return deployment
534
+
535
+ def detect_performance_incident(
536
+ self, module: str, operation: str, execution_time: float, threshold: float
537
+ ) -> Optional[IncidentEvent]:
538
+ """
539
+ Automatically detect and record performance incidents.
540
+
541
+ Args:
542
+ module: Module name (e.g., 'finops', 'inventory')
543
+ operation: Operation name
544
+ execution_time: Actual execution time
545
+ threshold: Performance threshold
546
+
547
+ Returns:
548
+ Created incident if threshold exceeded, None otherwise
549
+ """
550
+ if execution_time <= threshold:
551
+ return None
552
+
553
+ incident_id = f"perf-{module}-{int(time.time())}"
554
+ severity = "critical" if execution_time > threshold * 2 else "high"
555
+
556
+ incident = self.record_incident(
557
+ incident_id=incident_id,
558
+ service_name=module,
559
+ severity=severity,
560
+ root_cause=f"Performance degradation: {operation} took {execution_time:.2f}s (threshold: {threshold:.2f}s)",
561
+ )
562
+
563
+ # Add performance metadata
564
+ incident.metadata = {
565
+ "operation": operation,
566
+ "execution_time": execution_time,
567
+ "threshold": threshold,
568
+ "degradation_factor": execution_time / threshold,
569
+ "automated_detection": True,
570
+ }
571
+
572
+ logger.warning(f"๐Ÿšจ Performance incident detected: {incident_id}")
573
+
574
+ # Generate real-time alert
575
+ self._generate_sre_alert(incident, execution_time, threshold)
576
+
577
+ return incident
578
+
579
+ def _generate_sre_alert(self, incident: IncidentEvent, execution_time: float, threshold: float) -> None:
580
+ """Generate SRE-focused performance alert."""
581
+ alert_data = {
582
+ "timestamp": datetime.now(timezone.utc).isoformat(),
583
+ "alert_type": "sre_performance_degradation",
584
+ "incident_id": incident.incident_id,
585
+ "service": incident.service_name,
586
+ "severity": incident.severity,
587
+ "execution_time": execution_time,
588
+ "threshold": threshold,
589
+ "degradation_factor": execution_time / threshold,
590
+ "impact": "user_experience" if execution_time > threshold * 1.5 else "performance_sla",
591
+ "recommended_actions": [
592
+ "Check system resource utilization",
593
+ "Review recent deployments for correlation",
594
+ "Validate AWS API rate limiting",
595
+ "Consider auto-scaling triggers",
596
+ ],
597
+ }
598
+
599
+ # Save alert to artifacts
600
+ alert_file = self.artifacts_dir / "alerts" / f"sre_alert_{incident.incident_id}.json"
601
+ with open(alert_file, "w") as f:
602
+ json.dump(alert_data, f, indent=2, default=str)
603
+
604
+ logger.critical(f"๐Ÿšจ SRE Alert generated: {alert_file}")
605
+
606
+ def calculate_sla_compliance(self, days_back: int = 30) -> Dict[str, DORAMetric]:
607
+ """
608
+ Calculate SLA compliance metrics for enterprise reporting.
609
+
610
+ Args:
611
+ days_back: Number of days to analyze
612
+
613
+ Returns:
614
+ Dictionary of SLA compliance metrics
615
+ """
616
+ sla_metrics = {}
617
+
618
+ # Calculate availability SLA (based on incident downtime)
619
+ cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
620
+ recent_incidents = [i for i in self.incidents if i.start_time >= cutoff_date]
621
+
622
+ total_downtime_hours = 0
623
+ for incident in recent_incidents:
624
+ if incident.resolution_time and incident.severity in ["critical", "high"]:
625
+ downtime = (incident.resolution_time - incident.start_time).total_seconds() / 3600
626
+ total_downtime_hours += downtime
627
+
628
+ total_hours = days_back * 24
629
+ availability = max(0, (total_hours - total_downtime_hours) / total_hours)
630
+
631
+ sla_metrics["availability"] = DORAMetric(
632
+ metric_name="availability_sla",
633
+ value=availability,
634
+ unit="percentage",
635
+ timestamp=datetime.now(timezone.utc),
636
+ tags={"period": f"{days_back}d", "incidents": str(len(recent_incidents))},
637
+ metadata={
638
+ "target": self.targets["sla_availability"],
639
+ "target_met": availability >= self.targets["sla_availability"],
640
+ "downtime_hours": total_downtime_hours,
641
+ },
642
+ )
643
+
644
+ # Performance SLA (based on operation execution times)
645
+ performance_scores = []
646
+ for metric in self.metrics_history:
647
+ if metric.metadata and "performance_score" in metric.metadata:
648
+ performance_scores.append(metric.metadata["performance_score"])
649
+
650
+ avg_performance = sum(performance_scores) / len(performance_scores) if performance_scores else 0
651
+
652
+ sla_metrics["performance"] = DORAMetric(
653
+ metric_name="performance_sla",
654
+ value=avg_performance,
655
+ unit="percentage",
656
+ timestamp=datetime.now(timezone.utc),
657
+ tags={"sample_size": str(len(performance_scores))},
658
+ metadata={
659
+ "target": self.targets["performance_score"],
660
+ "target_met": avg_performance >= self.targets["performance_score"],
661
+ },
662
+ )
663
+
664
+ return sla_metrics
665
+
666
+ def generate_comprehensive_report(self, days_back: int = 30) -> Dict:
667
+ """Generate comprehensive DORA metrics report with SRE enhancements"""
668
+
669
+ logger.info(f"๐Ÿ“Š Generating enterprise DORA metrics report for last {days_back} days")
670
+
671
+ # Calculate all DORA metrics
672
+ lead_time = self.calculate_lead_time(days_back)
673
+ deployment_freq = self.calculate_deployment_frequency(days_back)
674
+ failure_rate = self.calculate_change_failure_rate(days_back)
675
+ mttr = self.calculate_mttr(days_back)
676
+
677
+ # Calculate HITL metrics
678
+ hitl_metrics = self.calculate_hitl_metrics()
679
+
680
+ # Calculate SLA compliance metrics
681
+ sla_metrics = self.calculate_sla_compliance(days_back)
682
+
683
+ # Performance analysis with enhanced SRE targets
684
+ targets_met = {
685
+ "lead_time": lead_time.metadata.get("target_met", False),
686
+ "deployment_frequency": deployment_freq.metadata.get("target_met", False),
687
+ "change_failure_rate": failure_rate.metadata.get("target_met", False),
688
+ "mttr": mttr.metadata.get("target_met", False),
689
+ }
690
+
691
+ # Add HITL targets
692
+ if "approval_time" in hitl_metrics:
693
+ targets_met["approval_time"] = hitl_metrics["approval_time"].metadata.get("target_met", False)
694
+
695
+ # Add SLA targets
696
+ for metric_name, metric in sla_metrics.items():
697
+ targets_met[f"sla_{metric_name}"] = metric.metadata.get("target_met", False)
698
+
699
+ overall_performance = sum(targets_met.values()) / len(targets_met) * 100
700
+
701
+ # Calculate trend analysis vs baselines
702
+ trend_analysis = {}
703
+ if self.baselines:
704
+ for metric_name, current_value in [
705
+ ("lead_time_hours", lead_time.value),
706
+ ("deploy_frequency_daily", deployment_freq.value),
707
+ ("change_failure_rate", failure_rate.value),
708
+ ("mttr_hours", mttr.value),
709
+ ]:
710
+ baseline = self.baselines.get(metric_name, current_value)
711
+ if baseline > 0:
712
+ trend_percentage = ((current_value - baseline) / baseline) * 100
713
+ trend_analysis[metric_name] = {
714
+ "current": current_value,
715
+ "baseline": baseline,
716
+ "trend_percentage": trend_percentage,
717
+ "improving": trend_percentage < 0
718
+ if metric_name != "deploy_frequency_daily"
719
+ else trend_percentage > 0,
720
+ }
721
+
722
+ report = {
723
+ "report_type": "dora_metrics_enterprise_sre",
724
+ "version": "2.0",
725
+ "period": f"{days_back}_days",
726
+ "timestamp": datetime.now(timezone.utc).isoformat(),
727
+ "dora_metrics": {
728
+ "lead_time": asdict(lead_time),
729
+ "deployment_frequency": asdict(deployment_freq),
730
+ "change_failure_rate": asdict(failure_rate),
731
+ "mttr": asdict(mttr),
732
+ },
733
+ "sla_metrics": {k: asdict(v) for k, v in sla_metrics.items()},
734
+ "hitl_metrics": {k: asdict(v) for k, v in hitl_metrics.items()},
735
+ "performance_analysis": {
736
+ "targets_met": targets_met,
737
+ "overall_performance_percentage": overall_performance,
738
+ "performance_grade": self._calculate_performance_grade(overall_performance),
739
+ "sla_compliance_score": sum(1 for k, v in targets_met.items() if k.startswith("sla_") and v)
740
+ / max(1, sum(1 for k in targets_met.keys() if k.startswith("sla_")))
741
+ * 100,
742
+ },
743
+ "trend_analysis": trend_analysis,
744
+ "baseline_comparison": self.baselines,
745
+ "recommendations": self._generate_sre_recommendations(
746
+ targets_met, hitl_metrics, sla_metrics, trend_analysis
747
+ ),
748
+ "alerts_summary": {
749
+ "active_alerts": len(
750
+ [
751
+ f
752
+ for f in (self.artifacts_dir / "alerts").glob("*.json")
753
+ if f.stat().st_mtime > time.time() - 86400
754
+ ]
755
+ ),
756
+ "performance_incidents": len(
757
+ [
758
+ i
759
+ for i in self.incidents
760
+ if i.start_time >= datetime.now(timezone.utc) - timedelta(days=days_back)
761
+ and "performance" in i.root_cause.lower()
762
+ ]
763
+ ),
764
+ "sre_health_score": overall_performance,
765
+ },
766
+ "raw_data": {
767
+ "deployments_count": len(self.deployments),
768
+ "incidents_count": len(self.incidents),
769
+ "approval_times_count": len(self.approval_times),
770
+ "automation_rate": len(
771
+ [d for d in self.deployments if getattr(d, "metadata", {}).get("automated", False)]
772
+ )
773
+ / max(1, len(self.deployments))
774
+ * 100,
775
+ },
776
+ }
777
+
778
+ # Save enhanced report to SRE reports directory
779
+ sre_reports_dir = self.artifacts_dir.parent / "sre-reports"
780
+ sre_reports_dir.mkdir(exist_ok=True)
781
+
782
+ report_file = sre_reports_dir / f"dora_enterprise_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
783
+ with open(report_file, "w") as f:
784
+ json.dump(report, f, indent=2, default=str)
785
+
786
+ # Also save to metrics directory for backward compatibility
787
+ legacy_report_file = (
788
+ self.artifacts_dir / "dora-reports" / f"dora_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
789
+ )
790
+ with open(legacy_report_file, "w") as f:
791
+ json.dump(report, f, indent=2, default=str)
792
+
793
+ logger.info(f"โœ… Enterprise DORA metrics report saved to: {report_file}")
794
+
795
+ # Auto-save persistent data after report generation
796
+ self._save_persistent_data()
797
+
798
+ return report
799
+
800
+ def _calculate_performance_grade(self, percentage: float) -> str:
801
+ """Calculate performance grade based on targets met"""
802
+ if percentage >= 90:
803
+ return "A (Excellent)"
804
+ elif percentage >= 80:
805
+ return "B (Good)"
806
+ elif percentage >= 70:
807
+ return "C (Satisfactory)"
808
+ elif percentage >= 60:
809
+ return "D (Needs Improvement)"
810
+ else:
811
+ return "F (Poor)"
812
+
813
+ def _generate_sre_recommendations(
814
+ self, targets_met: Dict[str, bool], hitl_metrics: Dict, sla_metrics: Dict, trend_analysis: Dict
815
+ ) -> List[str]:
816
+ """Generate enhanced SRE-focused recommendations based on comprehensive metrics analysis"""
817
+
818
+ recommendations = []
819
+
820
+ # DORA metrics recommendations
821
+ if not targets_met.get("lead_time", False):
822
+ recommendations.append(
823
+ "๐ŸŽฏ **Lead Time Optimization**: Implement parallel CI/CD workflows, automate testing pipelines, "
824
+ "and establish fast-track approval processes for low-risk changes"
825
+ )
826
+
827
+ if not targets_met.get("deployment_frequency", False):
828
+ recommendations.append(
829
+ "๐Ÿš€ **Deployment Frequency Enhancement**: Adopt continuous deployment patterns, implement "
830
+ "feature flags, and establish canary deployment strategies for risk mitigation"
831
+ )
832
+
833
+ if not targets_met.get("change_failure_rate", False):
834
+ recommendations.append(
835
+ "๐Ÿ›ก๏ธ **Change Failure Rate Reduction**: Enhance pre-production testing, implement progressive "
836
+ "rollouts, improve monitoring coverage, and establish automated rollback triggers"
837
+ )
838
+
839
+ if not targets_met.get("mttr", False):
840
+ recommendations.append(
841
+ "โšก **MTTR Improvement**: Implement automated incident detection, enhance observability stack, "
842
+ "establish runbook automation, and improve on-call response procedures"
843
+ )
844
+
845
+ # SLA compliance recommendations
846
+ if not targets_met.get("sla_availability", False):
847
+ recommendations.append(
848
+ "๐Ÿ”’ **Availability SLA Recovery**: Implement chaos engineering practices, enhance redundancy, "
849
+ "improve failover mechanisms, and establish proactive monitoring alerts"
850
+ )
851
+
852
+ if not targets_met.get("sla_performance", False):
853
+ recommendations.append(
854
+ "๐Ÿ“ˆ **Performance SLA Enhancement**: Optimize critical path operations, implement caching strategies, "
855
+ "enhance resource allocation, and establish performance regression testing"
856
+ )
857
+
858
+ # HITL workflow optimization
859
+ if not targets_met.get("approval_time", False):
860
+ recommendations.append(
861
+ "โฐ **Approval Workflow Optimization**: Implement risk-based approval routing, establish "
862
+ "parallel approval processes, and create self-service deployment capabilities for low-risk changes"
863
+ )
864
+
865
+ # Trend analysis recommendations
866
+ if trend_analysis:
867
+ declining_metrics = [k for k, v in trend_analysis.items() if not v.get("improving", True)]
868
+ if declining_metrics:
869
+ recommendations.append(
870
+ f"๐Ÿ“Š **Trend Alert**: Declining performance detected in {', '.join(declining_metrics)}. "
871
+ f"Implement immediate performance improvement initiatives and establish regression prevention measures"
872
+ )
873
+
874
+ # Proactive SRE recommendations based on patterns
875
+ if hitl_metrics.get("workflow_bottleneck"):
876
+ bottleneck_step = hitl_metrics["workflow_bottleneck"].tags.get("bottleneck_step", "unknown")
877
+ recommendations.append(
878
+ f"๐Ÿ” **Workflow Bottleneck Resolution**: Primary bottleneck identified in '{bottleneck_step}' step. "
879
+ f"Implement automation, parallel processing, or resource scaling for this workflow stage"
880
+ )
881
+
882
+ # Automation recommendations
883
+ automation_rate = targets_met.get("automation_rate", 0)
884
+ if automation_rate < 80:
885
+ recommendations.append(
886
+ "๐Ÿค– **Automation Enhancement**: Current automation rate below target. Implement GitOps workflows, "
887
+ "automated testing pipelines, and self-healing infrastructure patterns"
888
+ )
889
+
890
+ # Advanced SRE practices
891
+ if len([k for k, v in targets_met.items() if v]) / len(targets_met) < 0.8:
892
+ recommendations.append(
893
+ "๐ŸŽฏ **SRE Maturity Enhancement**: Consider implementing advanced SRE practices: error budgets, "
894
+ "SLI/SLO management, chaos engineering, and customer-centric reliability metrics"
895
+ )
896
+
897
+ if not recommendations:
898
+ recommendations.append(
899
+ "โœ… **Excellence Achieved**: All SRE targets met! Consider advanced optimization: predictive scaling, "
900
+ "AI-powered incident response, and continuous reliability improvement programs"
901
+ )
902
+
903
+ return recommendations
904
+
905
+ def _generate_recommendations(self, targets_met: Dict[str, bool], hitl_metrics: Dict) -> List[str]:
906
+ """Generate recommendations based on metrics analysis"""
907
+
908
+ recommendations = []
909
+
910
+ if not targets_met.get("lead_time", False):
911
+ recommendations.append(
912
+ "๐ŸŽฏ Optimize lead time: Consider parallel workflows, automated testing, and faster approval processes"
913
+ )
914
+
915
+ if not targets_met.get("deployment_frequency", False):
916
+ recommendations.append(
917
+ "๐Ÿš€ Increase deployment frequency: Implement continuous deployment pipeline and smaller batch sizes"
918
+ )
919
+
920
+ if not targets_met.get("change_failure_rate", False):
921
+ recommendations.append(
922
+ "๐Ÿ›ก๏ธ Reduce failure rate: Enhance testing coverage, implement canary deployments, and improve rollback procedures"
923
+ )
924
+
925
+ if not targets_met.get("mttr", False):
926
+ recommendations.append(
927
+ "โšก Improve MTTR: Enhance monitoring, implement automated incident response, and improve alerting"
928
+ )
929
+
930
+ if not targets_met.get("approval_time", False):
931
+ recommendations.append(
932
+ "โฐ Optimize approval workflow: Streamline HITL processes, implement parallel approvals, and reduce approval steps"
933
+ )
934
+
935
+ # HITL-specific recommendations
936
+ if "workflow_bottleneck" in hitl_metrics:
937
+ bottleneck_step = hitl_metrics["workflow_bottleneck"].tags.get("bottleneck_step", "unknown")
938
+ recommendations.append(f"๐Ÿ” Address workflow bottleneck: Focus on optimizing '{bottleneck_step}' step")
939
+
940
+ if not recommendations:
941
+ recommendations.append(
942
+ "โœ… All targets met! Consider raising performance targets or exploring advanced optimization opportunities"
943
+ )
944
+
945
+ return recommendations
946
+
947
+ def export_metrics_for_visualization(self, output_file: Optional[str] = None) -> str:
948
+ """Export metrics in format suitable for visualization tools"""
949
+
950
+ if not output_file:
951
+ output_file = self.artifacts_dir / f"metrics_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
952
+
953
+ export_data = {
954
+ "export_timestamp": datetime.now(timezone.utc).isoformat(),
955
+ "metrics_history": [asdict(m) for m in self.metrics_history],
956
+ "deployments": [asdict(d) for d in self.deployments],
957
+ "incidents": [asdict(i) for i in self.incidents],
958
+ "targets": self.targets,
959
+ "summary_stats": {
960
+ "total_deployments": len(self.deployments),
961
+ "successful_deployments": len([d for d in self.deployments if d.status == "success"]),
962
+ "total_incidents": len(self.incidents),
963
+ "resolved_incidents": len([i for i in self.incidents if i.resolution_time]),
964
+ "average_approval_time": sum(self.approval_times) / len(self.approval_times)
965
+ if self.approval_times
966
+ else 0,
967
+ },
968
+ }
969
+
970
+ with open(output_file, "w") as f:
971
+ json.dump(export_data, f, indent=2, default=str)
972
+
973
+ logger.info(f"๐Ÿ“Š Metrics exported for visualization: {output_file}")
974
+ return str(output_file)
975
+
976
+ def generate_sre_dashboard(self, days_back: int = 30) -> Dict:
977
+ """
978
+ Generate comprehensive SRE dashboard data for visualization tools.
979
+
980
+ Args:
981
+ days_back: Number of days to analyze for dashboard
982
+
983
+ Returns:
984
+ Dashboard data structure optimized for SRE tools (Datadog, Grafana, etc.)
985
+ """
986
+ logger.info(f"๐Ÿ“Š Generating SRE dashboard data for {days_back} days")
987
+
988
+ # Get comprehensive report data
989
+ report = self.generate_comprehensive_report(days_back)
990
+
991
+ # Format for SRE dashboard tools
992
+ dashboard_data = {
993
+ "dashboard_type": "sre_dora_metrics",
994
+ "generated_at": datetime.now(timezone.utc).isoformat(),
995
+ "time_range_days": days_back,
996
+ # Key Performance Indicators (KPIs) for executive view
997
+ "kpi_summary": {
998
+ "overall_performance_score": report["performance_analysis"]["overall_performance_percentage"],
999
+ "sla_compliance_score": report["performance_analysis"]["sla_compliance_score"],
1000
+ "dora_metrics_health": len(
1001
+ [
1002
+ k
1003
+ for k, v in report["performance_analysis"]["targets_met"].items()
1004
+ if not k.startswith("sla_") and v
1005
+ ]
1006
+ )
1007
+ / 4
1008
+ * 100,
1009
+ "active_incidents": len(
1010
+ [
1011
+ i
1012
+ for i in self.incidents
1013
+ if i.start_time >= datetime.now(timezone.utc) - timedelta(days=1) and not i.resolution_time
1014
+ ]
1015
+ ),
1016
+ "automation_percentage": report["raw_data"]["automation_rate"],
1017
+ },
1018
+ # Time series data for trending
1019
+ "time_series": {
1020
+ "lead_time": [
1021
+ {"timestamp": m.timestamp.isoformat(), "value": m.value}
1022
+ for m in self.metrics_history
1023
+ if m.metric_name == "lead_time"
1024
+ ][-30:], # Last 30 data points
1025
+ "deployment_frequency": [
1026
+ {"timestamp": m.timestamp.isoformat(), "value": m.value}
1027
+ for m in self.metrics_history
1028
+ if m.metric_name == "deployment_frequency"
1029
+ ][-30:],
1030
+ "change_failure_rate": [
1031
+ {"timestamp": m.timestamp.isoformat(), "value": m.value * 100} # Convert to percentage
1032
+ for m in self.metrics_history
1033
+ if m.metric_name == "change_failure_rate"
1034
+ ][-30:],
1035
+ "mttr": [
1036
+ {"timestamp": m.timestamp.isoformat(), "value": m.value}
1037
+ for m in self.metrics_history
1038
+ if m.metric_name == "mttr"
1039
+ ][-30:],
1040
+ },
1041
+ # Alert and incident summary
1042
+ "alerts_incidents": {
1043
+ "recent_alerts": len(
1044
+ [
1045
+ f
1046
+ for f in (self.artifacts_dir / "alerts").glob("*.json")
1047
+ if f.stat().st_mtime > time.time() - 86400
1048
+ ]
1049
+ ),
1050
+ "incident_severity_breakdown": {
1051
+ "critical": len(
1052
+ [
1053
+ i
1054
+ for i in self.incidents
1055
+ if i.severity == "critical"
1056
+ and i.start_time >= datetime.now(timezone.utc) - timedelta(days=days_back)
1057
+ ]
1058
+ ),
1059
+ "high": len(
1060
+ [
1061
+ i
1062
+ for i in self.incidents
1063
+ if i.severity == "high"
1064
+ and i.start_time >= datetime.now(timezone.utc) - timedelta(days=days_back)
1065
+ ]
1066
+ ),
1067
+ "medium": len(
1068
+ [
1069
+ i
1070
+ for i in self.incidents
1071
+ if i.severity == "medium"
1072
+ and i.start_time >= datetime.now(timezone.utc) - timedelta(days=days_back)
1073
+ ]
1074
+ ),
1075
+ },
1076
+ "mttr_by_severity": self._calculate_mttr_by_severity(days_back),
1077
+ },
1078
+ # Operational metrics
1079
+ "operational_metrics": {
1080
+ "deployment_success_rate": len([d for d in self.deployments if d.status == "success"])
1081
+ / max(1, len(self.deployments))
1082
+ * 100,
1083
+ "avg_approval_time_minutes": sum(self.approval_times) / max(1, len(self.approval_times)),
1084
+ "workflow_efficiency_score": 100
1085
+ - (
1086
+ sum(self.approval_times) / max(1, len(self.approval_times)) / 60 * 100
1087
+ ), # Efficiency based on approval speed
1088
+ "service_reliability_score": report["sla_metrics"]["availability"]["value"] * 100
1089
+ if "availability" in report.get("sla_metrics", {})
1090
+ else 0,
1091
+ },
1092
+ # Targets and thresholds for visualization
1093
+ "targets": self.targets,
1094
+ "alert_thresholds": self.alert_thresholds,
1095
+ # Raw data for detailed analysis
1096
+ "raw_metrics": report,
1097
+ }
1098
+
1099
+ # Save dashboard data for external tools
1100
+ dashboard_file = (
1101
+ self.artifacts_dir / "dashboards" / f"sre_dashboard_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
1102
+ )
1103
+ with open(dashboard_file, "w") as f:
1104
+ json.dump(dashboard_data, f, indent=2, default=str)
1105
+
1106
+ logger.info(f"๐Ÿ“Š SRE dashboard data saved: {dashboard_file}")
1107
+
1108
+ return dashboard_data
1109
+
1110
+ def _calculate_mttr_by_severity(self, days_back: int) -> Dict[str, float]:
1111
+ """Calculate MTTR broken down by incident severity."""
1112
+ cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
1113
+ recent_incidents = [i for i in self.incidents if i.start_time >= cutoff_date and i.resolution_time]
1114
+
1115
+ mttr_by_severity = {}
1116
+ for severity in ["critical", "high", "medium", "low"]:
1117
+ severity_incidents = [i for i in recent_incidents if i.severity == severity]
1118
+ if severity_incidents:
1119
+ total_time = sum((i.resolution_time - i.start_time).total_seconds() / 3600 for i in severity_incidents)
1120
+ mttr_by_severity[severity] = total_time / len(severity_incidents)
1121
+ else:
1122
+ mttr_by_severity[severity] = 0
1123
+
1124
+ return mttr_by_severity
1125
+
1126
+ def integrate_with_performance_monitor(self, performance_monitor) -> None:
1127
+ """
1128
+ Integrate DORA metrics with existing performance monitoring system.
1129
+
1130
+ Args:
1131
+ performance_monitor: Instance of PerformanceMonitor class
1132
+ """
1133
+ try:
1134
+ # Hook into performance monitor to auto-detect incidents
1135
+ original_track = performance_monitor.track_operation
1136
+
1137
+ def enhanced_track_operation(
1138
+ module: str, operation: str, execution_time: float, success: bool = True, metadata=None
1139
+ ):
1140
+ # Call original method
1141
+ result = original_track(module, operation, execution_time, success, metadata)
1142
+
1143
+ # Auto-detect performance incidents for DORA tracking
1144
+ target = performance_monitor.performance_targets.get(module, {})
1145
+ threshold = target.get("target_time", 30.0)
1146
+
1147
+ if execution_time > threshold:
1148
+ self.detect_performance_incident(module, operation, execution_time, threshold)
1149
+
1150
+ return result
1151
+
1152
+ # Replace with enhanced version
1153
+ performance_monitor.track_operation = enhanced_track_operation
1154
+
1155
+ logger.info("๐Ÿ”— DORA metrics integrated with performance monitor")
1156
+
1157
+ except Exception as e:
1158
+ logger.error(f"โŒ Failed to integrate with performance monitor: {e}")
1159
+
1160
+ def export_cloudwatch_metrics(self, namespace: str = "CloudOps/DORA") -> bool:
1161
+ """
1162
+ Export DORA metrics to CloudWatch for enterprise monitoring.
1163
+
1164
+ Args:
1165
+ namespace: CloudWatch metrics namespace
1166
+
1167
+ Returns:
1168
+ Success status of metric publishing
1169
+ """
1170
+ try:
1171
+ import boto3
1172
+
1173
+ cloudwatch = boto3.client("cloudwatch")
1174
+
1175
+ # Calculate current metrics
1176
+ lead_time = self.calculate_lead_time(7) # Weekly metrics
1177
+ deploy_freq = self.calculate_deployment_frequency(7)
1178
+ failure_rate = self.calculate_change_failure_rate(7)
1179
+ mttr = self.calculate_mttr(7)
1180
+
1181
+ # Publish to CloudWatch
1182
+ metrics_to_publish = [
1183
+ {
1184
+ "MetricName": "LeadTime",
1185
+ "Value": lead_time.value,
1186
+ "Unit": "Seconds",
1187
+ "Dimensions": [{"Name": "Environment", "Value": "production"}],
1188
+ },
1189
+ {
1190
+ "MetricName": "DeploymentFrequency",
1191
+ "Value": deploy_freq.value,
1192
+ "Unit": "Count/Second",
1193
+ "Dimensions": [{"Name": "Environment", "Value": "production"}],
1194
+ },
1195
+ {
1196
+ "MetricName": "ChangeFailureRate",
1197
+ "Value": failure_rate.value * 100, # Convert to percentage
1198
+ "Unit": "Percent",
1199
+ "Dimensions": [{"Name": "Environment", "Value": "production"}],
1200
+ },
1201
+ {
1202
+ "MetricName": "MeanTimeToRecovery",
1203
+ "Value": mttr.value,
1204
+ "Unit": "Seconds",
1205
+ "Dimensions": [{"Name": "Environment", "Value": "production"}],
1206
+ },
1207
+ ]
1208
+
1209
+ response = cloudwatch.put_metric_data(Namespace=namespace, MetricData=metrics_to_publish)
1210
+
1211
+ logger.info(f"๐Ÿ“Š DORA metrics published to CloudWatch: {namespace}")
1212
+ return True
1213
+
1214
+ except Exception as e:
1215
+ logger.error(f"โŒ Failed to export CloudWatch metrics: {e}")
1216
+ return False
1217
+
1218
+
1219
+ # Async functions for integration with existing systems
1220
+ async def simulate_dora_metrics_collection(duration_minutes: int = 5) -> Dict:
1221
+ """Simulate DORA metrics collection for demonstration"""
1222
+
1223
+ engine = DORAMetricsEngine()
1224
+
1225
+ logger.info(f"๐Ÿงช Starting {duration_minutes}-minute DORA metrics simulation")
1226
+
1227
+ # Simulate deployment events
1228
+ deployments = [
1229
+ ("deploy-001", "production", "vpc-wrapper", "v1.2.0", "abc123", "manager"),
1230
+ ("deploy-002", "staging", "finops-dashboard", "v2.1.0", "def456", "architect"),
1231
+ ("deploy-003", "production", "organizations-api", "v1.0.1", "ghi789", "manager"),
1232
+ ]
1233
+
1234
+ for dep_id, env, service, version, commit, approver in deployments:
1235
+ deployment = engine.record_deployment(dep_id, env, service, version, commit, approver)
1236
+
1237
+ # Simulate approval time
1238
+ approval_time = 15 + (hash(dep_id) % 30) # 15-45 minutes
1239
+ engine.record_approval_time(approval_time, f"{env}_deployment")
1240
+
1241
+ # Simulate deployment completion after short delay
1242
+ await asyncio.sleep(1)
1243
+
1244
+ # 90% success rate simulation
1245
+ status = "success" if hash(dep_id) % 10 < 9 else "failed"
1246
+ engine.complete_deployment(dep_id, status)
1247
+
1248
+ # Simulate incidents
1249
+ incidents = [
1250
+ ("inc-001", "vpc-wrapper", "high", "Network configuration error", "deploy-001"),
1251
+ ("inc-002", "finops-dashboard", "medium", "Query timeout", ""),
1252
+ ]
1253
+
1254
+ for inc_id, service, severity, cause, caused_by in incidents:
1255
+ incident = engine.record_incident(inc_id, service, severity, cause, caused_by)
1256
+
1257
+ # Simulate incident resolution
1258
+ await asyncio.sleep(0.5)
1259
+ detection_time = incident.start_time + timedelta(minutes=5)
1260
+ engine.resolve_incident(inc_id, detection_time)
1261
+
1262
+ # Generate comprehensive report
1263
+ report = engine.generate_comprehensive_report(days_back=7)
1264
+
1265
+ return report
1266
+
1267
+
1268
+ if __name__ == "__main__":
1269
+ # CLI execution
1270
+ import argparse
1271
+
1272
+ parser = argparse.ArgumentParser(description="DORA Metrics Engine")
1273
+ parser.add_argument("--simulate", action="store_true", help="Run simulation mode")
1274
+ parser.add_argument("--duration", type=int, default=5, help="Simulation duration in minutes")
1275
+ parser.add_argument("--output", "-o", default="./artifacts/metrics", help="Output directory for metrics")
1276
+
1277
+ args = parser.parse_args()
1278
+
1279
+ async def main():
1280
+ if args.simulate:
1281
+ report = await simulate_dora_metrics_collection(args.duration)
1282
+ print("โœ… DORA metrics simulation completed")
1283
+ print(f"๐Ÿ“Š Overall performance: {report['performance_analysis']['performance_grade']}")
1284
+ print(
1285
+ f"๐ŸŽฏ Targets met: {sum(report['performance_analysis']['targets_met'].values())}/{len(report['performance_analysis']['targets_met'])}"
1286
+ )
1287
+ else:
1288
+ engine = DORAMetricsEngine(args.output)
1289
+ report = engine.generate_comprehensive_report()
1290
+ print("โœ… DORA metrics report generated")
1291
+ print(f"๐Ÿ“Š Report saved to: {engine.artifacts_dir}")
1292
+
1293
+ asyncio.run(main())
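
For orientation, below is a minimal usage sketch of the engine's public API as it appears in the diff above. It is not taken from the package's documentation; the import path is inferred from the file location runbooks/metrics/dora_metrics_engine.py, and all IDs, service names, and timings are illustrative.

#!/usr/bin/env python3
"""Minimal usage sketch for the new DORA metrics engine (illustrative only)."""
from runbooks.metrics.dora_metrics_engine import DORAMetricsEngine  # assumed import path

engine = DORAMetricsEngine(artifacts_dir="./artifacts/metrics")

# Track a deployment end to end.
engine.record_deployment(
    deployment_id="deploy-100",
    environment="production",
    service_name="finops-dashboard",
    version="v0.9.0",
    commit_sha="abcdef12",
    approver="release-manager",
)
engine.record_approval_time(22.5, workflow_step="production_deployment")
engine.complete_deployment("deploy-100", status="success")

# Track an incident and its resolution for MTTR and change-failure-rate.
engine.record_incident(
    incident_id="inc-100",
    service_name="finops-dashboard",
    severity="high",
    root_cause="Query timeout",
    caused_by_deployment="deploy-100",
)
engine.resolve_incident("inc-100")

# Automatic incident creation when an operation exceeds its performance threshold.
engine.detect_performance_incident("finops", "cost_report", execution_time=75.0, threshold=30.0)

# Generate the consolidated report (also persists deployments/incidents to JSON).
report = engine.generate_comprehensive_report(days_back=30)
print(report["performance_analysis"]["performance_grade"])

As the implementation above shows, generate_comprehensive_report() also writes the report JSON both to a sibling sre-reports directory and to dora-reports inside the configured artifacts directory.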