runbooks 0.7.7__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- runbooks/__init__.py +1 -1
- runbooks/base.py +2 -2
- runbooks/cfat/README.md +12 -1
- runbooks/cfat/__init__.py +8 -4
- runbooks/cfat/assessment/collectors.py +171 -14
- runbooks/cfat/assessment/compliance.py +546 -522
- runbooks/cfat/assessment/runner.py +129 -10
- runbooks/cfat/models.py +6 -2
- runbooks/common/__init__.py +152 -0
- runbooks/common/accuracy_validator.py +1039 -0
- runbooks/common/context_logger.py +440 -0
- runbooks/common/cross_module_integration.py +594 -0
- runbooks/common/enhanced_exception_handler.py +1108 -0
- runbooks/common/enterprise_audit_integration.py +634 -0
- runbooks/common/logger.py +14 -0
- runbooks/common/mcp_integration.py +539 -0
- runbooks/common/performance_monitor.py +387 -0
- runbooks/common/profile_utils.py +216 -0
- runbooks/common/rich_utils.py +622 -0
- runbooks/enterprise/__init__.py +68 -0
- runbooks/enterprise/error_handling.py +411 -0
- runbooks/enterprise/logging.py +439 -0
- runbooks/enterprise/multi_tenant.py +583 -0
- runbooks/feedback/user_feedback_collector.py +440 -0
- runbooks/finops/README.md +129 -14
- runbooks/finops/__init__.py +22 -3
- runbooks/finops/account_resolver.py +279 -0
- runbooks/finops/accuracy_cross_validator.py +638 -0
- runbooks/finops/aws_client.py +721 -36
- runbooks/finops/budget_integration.py +313 -0
- runbooks/finops/cli.py +90 -33
- runbooks/finops/cost_processor.py +211 -37
- runbooks/finops/dashboard_router.py +900 -0
- runbooks/finops/dashboard_runner.py +1334 -399
- runbooks/finops/embedded_mcp_validator.py +288 -0
- runbooks/finops/enhanced_dashboard_runner.py +526 -0
- runbooks/finops/enhanced_progress.py +327 -0
- runbooks/finops/enhanced_trend_visualization.py +423 -0
- runbooks/finops/finops_dashboard.py +41 -0
- runbooks/finops/helpers.py +639 -323
- runbooks/finops/iam_guidance.py +400 -0
- runbooks/finops/markdown_exporter.py +466 -0
- runbooks/finops/multi_dashboard.py +1502 -0
- runbooks/finops/optimizer.py +396 -395
- runbooks/finops/profile_processor.py +2 -2
- runbooks/finops/runbooks.inventory.organizations_discovery.log +0 -0
- runbooks/finops/runbooks.security.report_generator.log +0 -0
- runbooks/finops/runbooks.security.run_script.log +0 -0
- runbooks/finops/runbooks.security.security_export.log +0 -0
- runbooks/finops/service_mapping.py +195 -0
- runbooks/finops/single_dashboard.py +710 -0
- runbooks/finops/tests/__init__.py +19 -0
- runbooks/finops/tests/results_test_finops_dashboard.xml +1 -0
- runbooks/finops/tests/run_comprehensive_tests.py +421 -0
- runbooks/finops/tests/run_tests.py +305 -0
- runbooks/finops/tests/test_finops_dashboard.py +705 -0
- runbooks/finops/tests/test_integration.py +477 -0
- runbooks/finops/tests/test_performance.py +380 -0
- runbooks/finops/tests/test_performance_benchmarks.py +500 -0
- runbooks/finops/tests/test_reference_images_validation.py +867 -0
- runbooks/finops/tests/test_single_account_features.py +715 -0
- runbooks/finops/tests/validate_test_suite.py +220 -0
- runbooks/finops/types.py +1 -1
- runbooks/hitl/enhanced_workflow_engine.py +725 -0
- runbooks/inventory/README.md +12 -1
- runbooks/inventory/artifacts/scale-optimize-status.txt +12 -0
- runbooks/inventory/collectors/aws_comprehensive.py +192 -185
- runbooks/inventory/collectors/enterprise_scale.py +281 -0
- runbooks/inventory/core/collector.py +299 -12
- runbooks/inventory/list_ec2_instances.py +21 -20
- runbooks/inventory/list_ssm_parameters.py +31 -3
- runbooks/inventory/organizations_discovery.py +1315 -0
- runbooks/inventory/rich_inventory_display.py +360 -0
- runbooks/inventory/run_on_multi_accounts.py +32 -16
- runbooks/inventory/runbooks.security.report_generator.log +0 -0
- runbooks/inventory/runbooks.security.run_script.log +0 -0
- runbooks/inventory/vpc_flow_analyzer.py +1030 -0
- runbooks/main.py +4171 -1615
- runbooks/metrics/dora_metrics_engine.py +1293 -0
- runbooks/monitoring/performance_monitor.py +433 -0
- runbooks/operate/README.md +394 -0
- runbooks/operate/__init__.py +2 -2
- runbooks/operate/base.py +291 -11
- runbooks/operate/deployment_framework.py +1032 -0
- runbooks/operate/deployment_validator.py +853 -0
- runbooks/operate/dynamodb_operations.py +10 -6
- runbooks/operate/ec2_operations.py +321 -11
- runbooks/operate/executive_dashboard.py +779 -0
- runbooks/operate/mcp_integration.py +750 -0
- runbooks/operate/nat_gateway_operations.py +1120 -0
- runbooks/operate/networking_cost_heatmap.py +685 -0
- runbooks/operate/privatelink_operations.py +940 -0
- runbooks/operate/s3_operations.py +10 -6
- runbooks/operate/vpc_endpoints.py +644 -0
- runbooks/operate/vpc_operations.py +1038 -0
- runbooks/remediation/README.md +489 -13
- runbooks/remediation/__init__.py +2 -2
- runbooks/remediation/acm_remediation.py +1 -1
- runbooks/remediation/base.py +1 -1
- runbooks/remediation/cloudtrail_remediation.py +1 -1
- runbooks/remediation/cognito_remediation.py +1 -1
- runbooks/remediation/commons.py +8 -4
- runbooks/remediation/dynamodb_remediation.py +1 -1
- runbooks/remediation/ec2_remediation.py +1 -1
- runbooks/remediation/ec2_unattached_ebs_volumes.py +1 -1
- runbooks/remediation/kms_enable_key_rotation.py +1 -1
- runbooks/remediation/kms_remediation.py +1 -1
- runbooks/remediation/lambda_remediation.py +1 -1
- runbooks/remediation/multi_account.py +1 -1
- runbooks/remediation/rds_remediation.py +1 -1
- runbooks/remediation/s3_block_public_access.py +1 -1
- runbooks/remediation/s3_enable_access_logging.py +1 -1
- runbooks/remediation/s3_encryption.py +1 -1
- runbooks/remediation/s3_remediation.py +1 -1
- runbooks/remediation/vpc_remediation.py +475 -0
- runbooks/security/ENTERPRISE_SECURITY_FRAMEWORK.md +506 -0
- runbooks/security/README.md +12 -1
- runbooks/security/__init__.py +166 -33
- runbooks/security/compliance_automation.py +634 -0
- runbooks/security/compliance_automation_engine.py +1021 -0
- runbooks/security/enterprise_security_framework.py +931 -0
- runbooks/security/enterprise_security_policies.json +293 -0
- runbooks/security/integration_test_enterprise_security.py +879 -0
- runbooks/security/module_security_integrator.py +641 -0
- runbooks/security/report_generator.py +10 -0
- runbooks/security/run_script.py +27 -5
- runbooks/security/security_baseline_tester.py +153 -27
- runbooks/security/security_export.py +456 -0
- runbooks/sre/README.md +472 -0
- runbooks/sre/__init__.py +33 -0
- runbooks/sre/mcp_reliability_engine.py +1049 -0
- runbooks/sre/performance_optimization_engine.py +1032 -0
- runbooks/sre/reliability_monitoring_framework.py +1011 -0
- runbooks/validation/__init__.py +10 -0
- runbooks/validation/benchmark.py +489 -0
- runbooks/validation/cli.py +368 -0
- runbooks/validation/mcp_validator.py +797 -0
- runbooks/vpc/README.md +478 -0
- runbooks/vpc/__init__.py +38 -0
- runbooks/vpc/config.py +212 -0
- runbooks/vpc/cost_engine.py +347 -0
- runbooks/vpc/heatmap_engine.py +605 -0
- runbooks/vpc/manager_interface.py +649 -0
- runbooks/vpc/networking_wrapper.py +1289 -0
- runbooks/vpc/rich_formatters.py +693 -0
- runbooks/vpc/tests/__init__.py +5 -0
- runbooks/vpc/tests/conftest.py +356 -0
- runbooks/vpc/tests/test_cli_integration.py +530 -0
- runbooks/vpc/tests/test_config.py +458 -0
- runbooks/vpc/tests/test_cost_engine.py +479 -0
- runbooks/vpc/tests/test_networking_wrapper.py +512 -0
- {runbooks-0.7.7.dist-info → runbooks-0.9.0.dist-info}/METADATA +175 -65
- {runbooks-0.7.7.dist-info → runbooks-0.9.0.dist-info}/RECORD +157 -60
- {runbooks-0.7.7.dist-info → runbooks-0.9.0.dist-info}/entry_points.txt +1 -1
- {runbooks-0.7.7.dist-info → runbooks-0.9.0.dist-info}/WHEEL +0 -0
- {runbooks-0.7.7.dist-info → runbooks-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {runbooks-0.7.7.dist-info → runbooks-0.9.0.dist-info}/top_level.txt +0 -0
runbooks/metrics/dora_metrics_engine.py (new file)
@@ -0,0 +1,1293 @@
#!/usr/bin/env python3
"""
DORA Metrics Engine for HITL System Optimization

Issue #93: HITL System & DORA Metrics Optimization
Priority: High (Sprint 1 Improvements)
Scope: Optimize Human-in-the-Loop system and enhance DORA metrics collection
"""

import asyncio
import json
import logging
import time
from dataclasses import asdict, dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from ..utils.logger import configure_logger

logger = configure_logger(__name__)


@dataclass
class DORAMetric:
    """Individual DORA metric measurement"""

    metric_name: str
    value: float
    unit: str
    timestamp: datetime
    tags: Dict[str, str] = None
    metadata: Dict = None

    def __post_init__(self):
        if self.tags is None:
            self.tags = {}
        if self.metadata is None:
            self.metadata = {}


@dataclass
class DeploymentEvent:
    """Deployment event for DORA metrics tracking"""

    deployment_id: str
    environment: str
    service_name: str
    version: str
    start_time: datetime
    end_time: Optional[datetime] = None
    status: str = "in_progress"  # in_progress, success, failed, rolled_back
    commit_sha: str = ""
    approver: str = ""
    rollback_time: Optional[datetime] = None


@dataclass
class IncidentEvent:
    """Incident event for DORA metrics tracking"""

    incident_id: str
    service_name: str
    severity: str  # critical, high, medium, low
    start_time: datetime
    detection_time: Optional[datetime] = None
    resolution_time: Optional[datetime] = None
    root_cause: str = ""
    caused_by_deployment: str = ""


class DORAMetricsEngine:
    """
    Enhanced DORA metrics collection and analysis engine for Enterprise SRE.

    Provides comprehensive DORA metrics (Lead Time, Deploy Frequency, MTTR, Change Failure Rate)
    with real-time collection, automated alerting, and enterprise dashboard integration.

    Features:
    - Real-time metrics streaming from git operations
    - Automated deployment event capture via GitHub webhooks
    - CloudWatch/Datadog integration for enterprise monitoring
    - Cross-session persistence with baseline trending
    - SLA compliance tracking with automated alerting
    """

    def __init__(self, artifacts_dir: str = "./artifacts/metrics", cross_validation_tolerance: float = 15.0):
        """
        Initialize enterprise DORA metrics engine

        Args:
            artifacts_dir: Directory to store metrics artifacts
            cross_validation_tolerance: Tolerance percentage for metric validation
        """
        self.artifacts_dir = Path(artifacts_dir)
        self.artifacts_dir.mkdir(parents=True, exist_ok=True)

        # Create SRE-focused subdirectories
        (self.artifacts_dir / "dora-reports").mkdir(exist_ok=True)
        (self.artifacts_dir / "baselines").mkdir(exist_ok=True)
        (self.artifacts_dir / "alerts").mkdir(exist_ok=True)
        (self.artifacts_dir / "dashboards").mkdir(exist_ok=True)

        self.tolerance = cross_validation_tolerance

        # Metrics storage with persistence
        self.deployments: List[DeploymentEvent] = []
        self.incidents: List[IncidentEvent] = []
        self.metrics_history: List[DORAMetric] = []
        self.baselines: Dict[str, float] = {}

        # HITL workflow metrics
        self.approval_times: List[float] = []
        self.workflow_bottlenecks: Dict[str, List[float]] = {}

        # Enterprise SRE performance targets (FAANG SDLC standards)
        self.targets = {
            "lead_time_hours": 4,  # <4 hours (FAANG velocity)
            "deploy_frequency_daily": 1,  # Daily deployment capability
            "change_failure_rate": 0.05,  # <5% (FAANG quality)
            "mttr_hours": 1,  # <1 hour (SRE excellence)
            "approval_time_minutes": 30,  # <30 minutes (HITL efficiency)
            "success_rate": 0.95,  # >95% (Enterprise reliability)
            "sla_availability": 0.999,  # >99.9% uptime
            "performance_score": 90,  # >90% performance score
        }

        # SRE alerting thresholds
        self.alert_thresholds = {
            "lead_time_hours": 6,  # Alert if >6 hours
            "deploy_frequency_daily": 0.5,  # Alert if <0.5 deploys/day
            "change_failure_rate": 0.10,  # Alert if >10%
            "mttr_hours": 2,  # Alert if >2 hours
            "approval_time_minutes": 60,  # Alert if >60 minutes
        }

        # Load existing data
        self._load_persistent_data()

        # Initialize baseline metrics if not exists
        self._initialize_baselines()

    def record_deployment(
        self,
        deployment_id: str,
        environment: str,
        service_name: str,
        version: str,
        commit_sha: str = "",
        approver: str = "",
    ) -> DeploymentEvent:
        """Record a new deployment event"""

        deployment = DeploymentEvent(
            deployment_id=deployment_id,
            environment=environment,
            service_name=service_name,
            version=version,
            start_time=datetime.now(timezone.utc),
            commit_sha=commit_sha,
            approver=approver,
        )

        self.deployments.append(deployment)

        logger.info(f"Deployment recorded: {deployment_id} for {service_name}")

        return deployment

    def complete_deployment(self, deployment_id: str, status: str, rollback_time: Optional[datetime] = None) -> bool:
        """Mark deployment as complete"""

        for deployment in self.deployments:
            if deployment.deployment_id == deployment_id:
                deployment.end_time = datetime.now(timezone.utc)
                deployment.status = status
                deployment.rollback_time = rollback_time

                logger.info(f"Deployment completed: {deployment_id} - {status}")
                return True

        logger.warning(f"Deployment not found: {deployment_id}")
        return False

    def record_incident(
        self, incident_id: str, service_name: str, severity: str, root_cause: str = "", caused_by_deployment: str = ""
    ) -> IncidentEvent:
        """Record a new incident event"""

        incident = IncidentEvent(
            incident_id=incident_id,
            service_name=service_name,
            severity=severity,
            start_time=datetime.now(timezone.utc),
            root_cause=root_cause,
            caused_by_deployment=caused_by_deployment,
        )

        self.incidents.append(incident)

        logger.info(f"Incident recorded: {incident_id} - {severity} severity")

        return incident

    def resolve_incident(self, incident_id: str, detection_time: Optional[datetime] = None) -> bool:
        """Mark incident as resolved"""

        for incident in self.incidents:
            if incident.incident_id == incident_id:
                incident.resolution_time = datetime.now(timezone.utc)
                if detection_time:
                    incident.detection_time = detection_time

                logger.info(f"Incident resolved: {incident_id}")
                return True

        logger.warning(f"Incident not found: {incident_id}")
        return False

    def record_approval_time(self, approval_time_minutes: float, workflow_step: str = "general"):
        """Record HITL approval time"""
        self.approval_times.append(approval_time_minutes)

        if workflow_step not in self.workflow_bottlenecks:
            self.workflow_bottlenecks[workflow_step] = []
        self.workflow_bottlenecks[workflow_step].append(approval_time_minutes)

    def calculate_lead_time(self, days_back: int = 30) -> DORAMetric:
        """Calculate deployment lead time"""

        cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
        recent_deployments = [d for d in self.deployments if d.start_time >= cutoff_date and d.end_time]

        if not recent_deployments:
            return DORAMetric(
                metric_name="lead_time",
                value=0.0,
                unit="hours",
                timestamp=datetime.now(timezone.utc),
                tags={"period": f"{days_back}d", "status": "no_data"},
            )

        # Calculate average lead time (simplified - in real scenario would track from commit to production)
        lead_times = []
        for deployment in recent_deployments:
            if deployment.end_time and deployment.status == "success":
                duration = (deployment.end_time - deployment.start_time).total_seconds() / 3600  # hours
                lead_times.append(duration)

        avg_lead_time = sum(lead_times) / len(lead_times) if lead_times else 0

        metric = DORAMetric(
            metric_name="lead_time",
            value=avg_lead_time,
            unit="hours",
            timestamp=datetime.now(timezone.utc),
            tags={
                "period": f"{days_back}d",
                "deployments_count": str(len(recent_deployments)),
                "successful_deployments": str(len(lead_times)),
            },
            metadata={
                "target": self.targets["lead_time_hours"],
                "target_met": avg_lead_time <= self.targets["lead_time_hours"],
            },
        )

        self.metrics_history.append(metric)
        return metric

    def calculate_deployment_frequency(self, days_back: int = 30) -> DORAMetric:
        """Calculate deployment frequency"""

        cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
        recent_deployments = [d for d in self.deployments if d.start_time >= cutoff_date]

        # Calculate deployments per day
        deployments_per_day = len(recent_deployments) / days_back if days_back > 0 else 0

        metric = DORAMetric(
            metric_name="deployment_frequency",
            value=deployments_per_day,
            unit="deployments_per_day",
            timestamp=datetime.now(timezone.utc),
            tags={"period": f"{days_back}d", "total_deployments": str(len(recent_deployments))},
            metadata={
                "target": self.targets["deploy_frequency_daily"],
                "target_met": deployments_per_day >= self.targets["deploy_frequency_daily"],
            },
        )

        self.metrics_history.append(metric)
        return metric

    def calculate_change_failure_rate(self, days_back: int = 30) -> DORAMetric:
        """Calculate change failure rate"""

        cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
        recent_deployments = [d for d in self.deployments if d.start_time >= cutoff_date and d.end_time]

        if not recent_deployments:
            return DORAMetric(
                metric_name="change_failure_rate",
                value=0.0,
                unit="percentage",
                timestamp=datetime.now(timezone.utc),
                tags={"period": f"{days_back}d", "status": "no_data"},
            )

        failed_deployments = len([d for d in recent_deployments if d.status in ["failed", "rolled_back"]])

        failure_rate = failed_deployments / len(recent_deployments)

        metric = DORAMetric(
            metric_name="change_failure_rate",
            value=failure_rate,
            unit="percentage",
            timestamp=datetime.now(timezone.utc),
            tags={
                "period": f"{days_back}d",
                "total_deployments": str(len(recent_deployments)),
                "failed_deployments": str(failed_deployments),
            },
            metadata={
                "target": self.targets["change_failure_rate"],
                "target_met": failure_rate <= self.targets["change_failure_rate"],
            },
        )

        self.metrics_history.append(metric)
        return metric

    def calculate_mttr(self, days_back: int = 30) -> DORAMetric:
        """Calculate Mean Time to Recovery (MTTR)"""

        cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
        recent_incidents = [i for i in self.incidents if i.start_time >= cutoff_date and i.resolution_time]

        if not recent_incidents:
            return DORAMetric(
                metric_name="mttr",
                value=0.0,
                unit="hours",
                timestamp=datetime.now(timezone.utc),
                tags={"period": f"{days_back}d", "status": "no_data"},
            )

        # Calculate recovery times
        recovery_times = []
        for incident in recent_incidents:
            if incident.resolution_time:
                duration = (incident.resolution_time - incident.start_time).total_seconds() / 3600  # hours
                recovery_times.append(duration)

        avg_mttr = sum(recovery_times) / len(recovery_times) if recovery_times else 0

        metric = DORAMetric(
            metric_name="mttr",
            value=avg_mttr,
            unit="hours",
            timestamp=datetime.now(timezone.utc),
            tags={"period": f"{days_back}d", "incidents_count": str(len(recent_incidents))},
            metadata={"target": self.targets["mttr_hours"], "target_met": avg_mttr <= self.targets["mttr_hours"]},
        )

        self.metrics_history.append(metric)
        return metric

    def calculate_hitl_metrics(self) -> Dict[str, DORAMetric]:
        """Calculate Human-in-the-Loop specific metrics"""

        metrics = {}

        # Average approval time
        if self.approval_times:
            avg_approval_time = sum(self.approval_times) / len(self.approval_times)

            metrics["approval_time"] = DORAMetric(
                metric_name="approval_time",
                value=avg_approval_time,
                unit="minutes",
                timestamp=datetime.now(timezone.utc),
                tags={"total_approvals": str(len(self.approval_times))},
                metadata={
                    "target": self.targets["approval_time_minutes"],
                    "target_met": avg_approval_time <= self.targets["approval_time_minutes"],
                },
            )

        # Workflow bottlenecks analysis
        if self.workflow_bottlenecks:
            bottleneck_metrics = {}

            for step, times in self.workflow_bottlenecks.items():
                if times:
                    avg_time = sum(times) / len(times)
                    bottleneck_metrics[f"{step}_avg_time"] = avg_time

            # Identify slowest step
            if bottleneck_metrics:
                slowest_step = max(bottleneck_metrics, key=bottleneck_metrics.get)
                slowest_time = bottleneck_metrics[slowest_step]

                metrics["workflow_bottleneck"] = DORAMetric(
                    metric_name="workflow_bottleneck",
                    value=slowest_time,
                    unit="minutes",
                    timestamp=datetime.now(timezone.utc),
                    tags={"bottleneck_step": slowest_step},
                    metadata={"all_steps": bottleneck_metrics},
                )

        return metrics

    def _load_persistent_data(self) -> None:
        """Load persistent DORA data from storage."""
        try:
            # Load deployments
            deployments_file = self.artifacts_dir / "deployments.json"
            if deployments_file.exists():
                with open(deployments_file, "r") as f:
                    data = json.load(f)
                    self.deployments = [DeploymentEvent(**item) for item in data.get("deployments", [])]

            # Load incidents
            incidents_file = self.artifacts_dir / "incidents.json"
            if incidents_file.exists():
                with open(incidents_file, "r") as f:
                    data = json.load(f)
                    self.incidents = [IncidentEvent(**item) for item in data.get("incidents", [])]

            # Load baselines
            baselines_file = self.artifacts_dir / "baselines" / "current_baselines.json"
            if baselines_file.exists():
                with open(baselines_file, "r") as f:
                    self.baselines = json.load(f)

            logger.info(f"Loaded {len(self.deployments)} deployments, {len(self.incidents)} incidents")

        except Exception as e:
            logger.warning(f"Failed to load persistent data: {e}")

    def _save_persistent_data(self) -> None:
        """Save persistent DORA data to storage."""
        try:
            # Save deployments
            deployments_data = {
                "deployments": [asdict(d) for d in self.deployments],
                "last_updated": datetime.now(timezone.utc).isoformat(),
            }
            deployments_file = self.artifacts_dir / "deployments.json"
            with open(deployments_file, "w") as f:
                json.dump(deployments_data, f, indent=2, default=str)

            # Save incidents
            incidents_data = {
                "incidents": [asdict(i) for i in self.incidents],
                "last_updated": datetime.now(timezone.utc).isoformat(),
            }
            incidents_file = self.artifacts_dir / "incidents.json"
            with open(incidents_file, "w") as f:
                json.dump(incidents_data, f, indent=2, default=str)

            # Save baselines
            baselines_file = self.artifacts_dir / "baselines" / "current_baselines.json"
            with open(baselines_file, "w") as f:
                json.dump(self.baselines, f, indent=2)

        except Exception as e:
            logger.error(f"Failed to save persistent data: {e}")

    def _initialize_baselines(self) -> None:
        """Initialize baseline metrics for trending analysis."""
        if not self.baselines and len(self.deployments) > 10:
            # Calculate initial baselines from historical data
            lead_time_metric = self.calculate_lead_time(30)
            deploy_freq_metric = self.calculate_deployment_frequency(30)
            failure_rate_metric = self.calculate_change_failure_rate(30)
            mttr_metric = self.calculate_mttr(30)

            self.baselines = {
                "lead_time_hours": lead_time_metric.value,
                "deploy_frequency_daily": deploy_freq_metric.value,
                "change_failure_rate": failure_rate_metric.value,
                "mttr_hours": mttr_metric.value,
                "baseline_established": datetime.now(timezone.utc).isoformat(),
                "sample_size": len(self.deployments),
            }

            logger.info("Established baseline metrics from historical data")
            self._save_persistent_data()

    def track_git_deployment(
        self, commit_sha: str, branch: str = "main", author: str = "", message: str = ""
    ) -> DeploymentEvent:
        """
        Track deployment from git operations for automated DORA collection.

        Args:
            commit_sha: Git commit SHA
            branch: Git branch name
            author: Commit author
            message: Commit message

        Returns:
            Created deployment event
        """
        deployment_id = f"git-{commit_sha[:8]}-{int(time.time())}"

        deployment = self.record_deployment(
            deployment_id=deployment_id,
            environment="production" if branch == "main" else "development",
            service_name="runbooks",
            version=commit_sha[:8],
            commit_sha=commit_sha,
            approver=author,
        )

        # Add git metadata
        deployment.metadata = {
            "branch": branch,
            "author": author,
            "message": message,
            "automated": True,
            "source": "git_integration",
        }

        logger.info(f"Git deployment tracked: {commit_sha[:8]} on {branch}")

        # Auto-save after git integration
        self._save_persistent_data()

        return deployment

    def detect_performance_incident(
        self, module: str, operation: str, execution_time: float, threshold: float
    ) -> Optional[IncidentEvent]:
        """
        Automatically detect and record performance incidents.

        Args:
            module: Module name (e.g., 'finops', 'inventory')
            operation: Operation name
            execution_time: Actual execution time
            threshold: Performance threshold

        Returns:
            Created incident if threshold exceeded, None otherwise
        """
        if execution_time <= threshold:
            return None

        incident_id = f"perf-{module}-{int(time.time())}"
        severity = "critical" if execution_time > threshold * 2 else "high"

        incident = self.record_incident(
            incident_id=incident_id,
            service_name=module,
            severity=severity,
            root_cause=f"Performance degradation: {operation} took {execution_time:.2f}s (threshold: {threshold:.2f}s)",
        )

        # Add performance metadata
        incident.metadata = {
            "operation": operation,
            "execution_time": execution_time,
            "threshold": threshold,
            "degradation_factor": execution_time / threshold,
            "automated_detection": True,
        }

        logger.warning(f"Performance incident detected: {incident_id}")

        # Generate real-time alert
        self._generate_sre_alert(incident, execution_time, threshold)

        return incident

    def _generate_sre_alert(self, incident: IncidentEvent, execution_time: float, threshold: float) -> None:
        """Generate SRE-focused performance alert."""
        alert_data = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "alert_type": "sre_performance_degradation",
            "incident_id": incident.incident_id,
            "service": incident.service_name,
            "severity": incident.severity,
            "execution_time": execution_time,
            "threshold": threshold,
            "degradation_factor": execution_time / threshold,
            "impact": "user_experience" if execution_time > threshold * 1.5 else "performance_sla",
            "recommended_actions": [
                "Check system resource utilization",
                "Review recent deployments for correlation",
                "Validate AWS API rate limiting",
                "Consider auto-scaling triggers",
            ],
        }

        # Save alert to artifacts
        alert_file = self.artifacts_dir / "alerts" / f"sre_alert_{incident.incident_id}.json"
        with open(alert_file, "w") as f:
            json.dump(alert_data, f, indent=2, default=str)

        logger.critical(f"SRE Alert generated: {alert_file}")

    def calculate_sla_compliance(self, days_back: int = 30) -> Dict[str, DORAMetric]:
        """
        Calculate SLA compliance metrics for enterprise reporting.

        Args:
            days_back: Number of days to analyze

        Returns:
            Dictionary of SLA compliance metrics
        """
        sla_metrics = {}

        # Calculate availability SLA (based on incident downtime)
        cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
        recent_incidents = [i for i in self.incidents if i.start_time >= cutoff_date]

        total_downtime_hours = 0
        for incident in recent_incidents:
            if incident.resolution_time and incident.severity in ["critical", "high"]:
                downtime = (incident.resolution_time - incident.start_time).total_seconds() / 3600
                total_downtime_hours += downtime

        total_hours = days_back * 24
        availability = max(0, (total_hours - total_downtime_hours) / total_hours)

        sla_metrics["availability"] = DORAMetric(
            metric_name="availability_sla",
            value=availability,
            unit="percentage",
            timestamp=datetime.now(timezone.utc),
            tags={"period": f"{days_back}d", "incidents": str(len(recent_incidents))},
            metadata={
                "target": self.targets["sla_availability"],
                "target_met": availability >= self.targets["sla_availability"],
                "downtime_hours": total_downtime_hours,
            },
        )

        # Performance SLA (based on operation execution times)
        performance_scores = []
        for metric in self.metrics_history:
            if metric.metadata and "performance_score" in metric.metadata:
                performance_scores.append(metric.metadata["performance_score"])

        avg_performance = sum(performance_scores) / len(performance_scores) if performance_scores else 0

        sla_metrics["performance"] = DORAMetric(
            metric_name="performance_sla",
            value=avg_performance,
            unit="percentage",
            timestamp=datetime.now(timezone.utc),
            tags={"sample_size": str(len(performance_scores))},
            metadata={
                "target": self.targets["performance_score"],
                "target_met": avg_performance >= self.targets["performance_score"],
            },
        )

        return sla_metrics

    def generate_comprehensive_report(self, days_back: int = 30) -> Dict:
        """Generate comprehensive DORA metrics report with SRE enhancements"""

        logger.info(f"Generating enterprise DORA metrics report for last {days_back} days")

        # Calculate all DORA metrics
        lead_time = self.calculate_lead_time(days_back)
        deployment_freq = self.calculate_deployment_frequency(days_back)
        failure_rate = self.calculate_change_failure_rate(days_back)
        mttr = self.calculate_mttr(days_back)

        # Calculate HITL metrics
        hitl_metrics = self.calculate_hitl_metrics()

        # Calculate SLA compliance metrics
        sla_metrics = self.calculate_sla_compliance(days_back)

        # Performance analysis with enhanced SRE targets
        targets_met = {
            "lead_time": lead_time.metadata.get("target_met", False),
            "deployment_frequency": deployment_freq.metadata.get("target_met", False),
            "change_failure_rate": failure_rate.metadata.get("target_met", False),
            "mttr": mttr.metadata.get("target_met", False),
        }

        # Add HITL targets
        if "approval_time" in hitl_metrics:
            targets_met["approval_time"] = hitl_metrics["approval_time"].metadata.get("target_met", False)

        # Add SLA targets
        for metric_name, metric in sla_metrics.items():
            targets_met[f"sla_{metric_name}"] = metric.metadata.get("target_met", False)

        overall_performance = sum(targets_met.values()) / len(targets_met) * 100

        # Calculate trend analysis vs baselines
        trend_analysis = {}
        if self.baselines:
            for metric_name, current_value in [
                ("lead_time_hours", lead_time.value),
                ("deploy_frequency_daily", deployment_freq.value),
                ("change_failure_rate", failure_rate.value),
                ("mttr_hours", mttr.value),
            ]:
                baseline = self.baselines.get(metric_name, current_value)
                if baseline > 0:
                    trend_percentage = ((current_value - baseline) / baseline) * 100
                    trend_analysis[metric_name] = {
                        "current": current_value,
                        "baseline": baseline,
                        "trend_percentage": trend_percentage,
                        "improving": trend_percentage < 0
                        if metric_name != "deploy_frequency_daily"
                        else trend_percentage > 0,
                    }

        report = {
            "report_type": "dora_metrics_enterprise_sre",
            "version": "2.0",
            "period": f"{days_back}_days",
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "dora_metrics": {
                "lead_time": asdict(lead_time),
                "deployment_frequency": asdict(deployment_freq),
                "change_failure_rate": asdict(failure_rate),
                "mttr": asdict(mttr),
            },
            "sla_metrics": {k: asdict(v) for k, v in sla_metrics.items()},
            "hitl_metrics": {k: asdict(v) for k, v in hitl_metrics.items()},
            "performance_analysis": {
                "targets_met": targets_met,
                "overall_performance_percentage": overall_performance,
                "performance_grade": self._calculate_performance_grade(overall_performance),
                "sla_compliance_score": sum(1 for k, v in targets_met.items() if k.startswith("sla_") and v)
                / max(1, sum(1 for k in targets_met.keys() if k.startswith("sla_")))
                * 100,
            },
            "trend_analysis": trend_analysis,
            "baseline_comparison": self.baselines,
            "recommendations": self._generate_sre_recommendations(
                targets_met, hitl_metrics, sla_metrics, trend_analysis
            ),
            "alerts_summary": {
                "active_alerts": len(
                    [
                        f
                        for f in (self.artifacts_dir / "alerts").glob("*.json")
                        if f.stat().st_mtime > time.time() - 86400
                    ]
                ),
                "performance_incidents": len(
                    [
                        i
                        for i in self.incidents
                        if i.start_time >= datetime.now(timezone.utc) - timedelta(days=days_back)
                        and "performance" in i.root_cause.lower()
                    ]
                ),
                "sre_health_score": overall_performance,
            },
            "raw_data": {
                "deployments_count": len(self.deployments),
                "incidents_count": len(self.incidents),
                "approval_times_count": len(self.approval_times),
                "automation_rate": len(
                    [d for d in self.deployments if getattr(d, "metadata", {}).get("automated", False)]
                )
                / max(1, len(self.deployments))
                * 100,
            },
        }

        # Save enhanced report to SRE reports directory
        sre_reports_dir = self.artifacts_dir.parent / "sre-reports"
        sre_reports_dir.mkdir(exist_ok=True)

        report_file = sre_reports_dir / f"dora_enterprise_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(report_file, "w") as f:
            json.dump(report, f, indent=2, default=str)

        # Also save to metrics directory for backward compatibility
        legacy_report_file = (
            self.artifacts_dir / "dora-reports" / f"dora_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        )
        with open(legacy_report_file, "w") as f:
            json.dump(report, f, indent=2, default=str)

        logger.info(f"Enterprise DORA metrics report saved to: {report_file}")

        # Auto-save persistent data after report generation
        self._save_persistent_data()

        return report

    def _calculate_performance_grade(self, percentage: float) -> str:
        """Calculate performance grade based on targets met"""
        if percentage >= 90:
            return "A (Excellent)"
        elif percentage >= 80:
            return "B (Good)"
        elif percentage >= 70:
            return "C (Satisfactory)"
        elif percentage >= 60:
            return "D (Needs Improvement)"
        else:
            return "F (Poor)"

    def _generate_sre_recommendations(
        self, targets_met: Dict[str, bool], hitl_metrics: Dict, sla_metrics: Dict, trend_analysis: Dict
    ) -> List[str]:
        """Generate enhanced SRE-focused recommendations based on comprehensive metrics analysis"""

        recommendations = []

        # DORA metrics recommendations
        if not targets_met.get("lead_time", False):
            recommendations.append(
                "**Lead Time Optimization**: Implement parallel CI/CD workflows, automate testing pipelines, "
                "and establish fast-track approval processes for low-risk changes"
            )

        if not targets_met.get("deployment_frequency", False):
            recommendations.append(
                "**Deployment Frequency Enhancement**: Adopt continuous deployment patterns, implement "
                "feature flags, and establish canary deployment strategies for risk mitigation"
            )

        if not targets_met.get("change_failure_rate", False):
            recommendations.append(
                "**Change Failure Rate Reduction**: Enhance pre-production testing, implement progressive "
                "rollouts, improve monitoring coverage, and establish automated rollback triggers"
            )

        if not targets_met.get("mttr", False):
            recommendations.append(
                "**MTTR Improvement**: Implement automated incident detection, enhance observability stack, "
                "establish runbook automation, and improve on-call response procedures"
            )

        # SLA compliance recommendations
        if not targets_met.get("sla_availability", False):
            recommendations.append(
                "**Availability SLA Recovery**: Implement chaos engineering practices, enhance redundancy, "
                "improve failover mechanisms, and establish proactive monitoring alerts"
            )

        if not targets_met.get("sla_performance", False):
            recommendations.append(
                "**Performance SLA Enhancement**: Optimize critical path operations, implement caching strategies, "
                "enhance resource allocation, and establish performance regression testing"
            )

        # HITL workflow optimization
        if not targets_met.get("approval_time", False):
            recommendations.append(
                "**Approval Workflow Optimization**: Implement risk-based approval routing, establish "
                "parallel approval processes, and create self-service deployment capabilities for low-risk changes"
            )

        # Trend analysis recommendations
        if trend_analysis:
            declining_metrics = [k for k, v in trend_analysis.items() if not v.get("improving", True)]
            if declining_metrics:
                recommendations.append(
                    f"**Trend Alert**: Declining performance detected in {', '.join(declining_metrics)}. "
                    f"Implement immediate performance improvement initiatives and establish regression prevention measures"
                )

        # Proactive SRE recommendations based on patterns
        if hitl_metrics.get("workflow_bottleneck"):
            bottleneck_step = hitl_metrics["workflow_bottleneck"].tags.get("bottleneck_step", "unknown")
            recommendations.append(
                f"**Workflow Bottleneck Resolution**: Primary bottleneck identified in '{bottleneck_step}' step. "
                f"Implement automation, parallel processing, or resource scaling for this workflow stage"
            )

        # Automation recommendations
        automation_rate = targets_met.get("automation_rate", 0)
        if automation_rate < 80:
            recommendations.append(
                "**Automation Enhancement**: Current automation rate below target. Implement GitOps workflows, "
                "automated testing pipelines, and self-healing infrastructure patterns"
            )

        # Advanced SRE practices
        if len([k for k, v in targets_met.items() if v]) / len(targets_met) < 0.8:
            recommendations.append(
                "**SRE Maturity Enhancement**: Consider implementing advanced SRE practices: error budgets, "
                "SLI/SLO management, chaos engineering, and customer-centric reliability metrics"
            )

        if not recommendations:
            recommendations.append(
                "**Excellence Achieved**: All SRE targets met! Consider advanced optimization: predictive scaling, "
                "AI-powered incident response, and continuous reliability improvement programs"
            )

        return recommendations

    def _generate_recommendations(self, targets_met: Dict[str, bool], hitl_metrics: Dict) -> List[str]:
        """Generate recommendations based on metrics analysis"""

        recommendations = []

        if not targets_met.get("lead_time", False):
            recommendations.append(
                "Optimize lead time: Consider parallel workflows, automated testing, and faster approval processes"
            )

        if not targets_met.get("deployment_frequency", False):
            recommendations.append(
                "Increase deployment frequency: Implement continuous deployment pipeline and smaller batch sizes"
            )

        if not targets_met.get("change_failure_rate", False):
            recommendations.append(
                "Reduce failure rate: Enhance testing coverage, implement canary deployments, and improve rollback procedures"
            )

        if not targets_met.get("mttr", False):
            recommendations.append(
                "Improve MTTR: Enhance monitoring, implement automated incident response, and improve alerting"
            )

        if not targets_met.get("approval_time", False):
            recommendations.append(
                "Optimize approval workflow: Streamline HITL processes, implement parallel approvals, and reduce approval steps"
            )

        # HITL-specific recommendations
        if "workflow_bottleneck" in hitl_metrics:
            bottleneck_step = hitl_metrics["workflow_bottleneck"].tags.get("bottleneck_step", "unknown")
            recommendations.append(f"Address workflow bottleneck: Focus on optimizing '{bottleneck_step}' step")

        if not recommendations:
            recommendations.append(
                "All targets met! Consider raising performance targets or exploring advanced optimization opportunities"
            )

        return recommendations

    def export_metrics_for_visualization(self, output_file: Optional[str] = None) -> str:
        """Export metrics in format suitable for visualization tools"""

        if not output_file:
            output_file = self.artifacts_dir / f"metrics_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

        export_data = {
            "export_timestamp": datetime.now(timezone.utc).isoformat(),
            "metrics_history": [asdict(m) for m in self.metrics_history],
            "deployments": [asdict(d) for d in self.deployments],
            "incidents": [asdict(i) for i in self.incidents],
            "targets": self.targets,
            "summary_stats": {
                "total_deployments": len(self.deployments),
                "successful_deployments": len([d for d in self.deployments if d.status == "success"]),
                "total_incidents": len(self.incidents),
                "resolved_incidents": len([i for i in self.incidents if i.resolution_time]),
                "average_approval_time": sum(self.approval_times) / len(self.approval_times)
                if self.approval_times
                else 0,
            },
        }

        with open(output_file, "w") as f:
            json.dump(export_data, f, indent=2, default=str)

        logger.info(f"Metrics exported for visualization: {output_file}")
        return str(output_file)

    def generate_sre_dashboard(self, days_back: int = 30) -> Dict:
        """
        Generate comprehensive SRE dashboard data for visualization tools.

        Args:
            days_back: Number of days to analyze for dashboard

        Returns:
            Dashboard data structure optimized for SRE tools (Datadog, Grafana, etc.)
        """
        logger.info(f"Generating SRE dashboard data for {days_back} days")

        # Get comprehensive report data
        report = self.generate_comprehensive_report(days_back)

        # Format for SRE dashboard tools
        dashboard_data = {
            "dashboard_type": "sre_dora_metrics",
            "generated_at": datetime.now(timezone.utc).isoformat(),
            "time_range_days": days_back,
            # Key Performance Indicators (KPIs) for executive view
            "kpi_summary": {
                "overall_performance_score": report["performance_analysis"]["overall_performance_percentage"],
                "sla_compliance_score": report["performance_analysis"]["sla_compliance_score"],
                "dora_metrics_health": len(
                    [
                        k
                        for k, v in report["performance_analysis"]["targets_met"].items()
                        if not k.startswith("sla_") and v
                    ]
                )
                / 4
                * 100,
                "active_incidents": len(
                    [
                        i
                        for i in self.incidents
                        if i.start_time >= datetime.now(timezone.utc) - timedelta(days=1) and not i.resolution_time
                    ]
                ),
                "automation_percentage": report["raw_data"]["automation_rate"],
            },
            # Time series data for trending
            "time_series": {
                "lead_time": [
                    {"timestamp": m.timestamp.isoformat(), "value": m.value}
                    for m in self.metrics_history
                    if m.metric_name == "lead_time"
                ][-30:],  # Last 30 data points
                "deployment_frequency": [
                    {"timestamp": m.timestamp.isoformat(), "value": m.value}
                    for m in self.metrics_history
                    if m.metric_name == "deployment_frequency"
                ][-30:],
                "change_failure_rate": [
                    {"timestamp": m.timestamp.isoformat(), "value": m.value * 100}  # Convert to percentage
                    for m in self.metrics_history
                    if m.metric_name == "change_failure_rate"
                ][-30:],
                "mttr": [
                    {"timestamp": m.timestamp.isoformat(), "value": m.value}
                    for m in self.metrics_history
                    if m.metric_name == "mttr"
                ][-30:],
            },
            # Alert and incident summary
            "alerts_incidents": {
                "recent_alerts": len(
                    [
                        f
                        for f in (self.artifacts_dir / "alerts").glob("*.json")
                        if f.stat().st_mtime > time.time() - 86400
                    ]
                ),
                "incident_severity_breakdown": {
                    "critical": len(
                        [
                            i
                            for i in self.incidents
                            if i.severity == "critical"
                            and i.start_time >= datetime.now(timezone.utc) - timedelta(days=days_back)
                        ]
                    ),
                    "high": len(
                        [
                            i
                            for i in self.incidents
                            if i.severity == "high"
                            and i.start_time >= datetime.now(timezone.utc) - timedelta(days=days_back)
                        ]
                    ),
                    "medium": len(
                        [
                            i
                            for i in self.incidents
                            if i.severity == "medium"
                            and i.start_time >= datetime.now(timezone.utc) - timedelta(days=days_back)
                        ]
                    ),
                },
                "mttr_by_severity": self._calculate_mttr_by_severity(days_back),
            },
            # Operational metrics
            "operational_metrics": {
                "deployment_success_rate": len([d for d in self.deployments if d.status == "success"])
                / max(1, len(self.deployments))
                * 100,
                "avg_approval_time_minutes": sum(self.approval_times) / max(1, len(self.approval_times)),
                "workflow_efficiency_score": 100
                - (
                    sum(self.approval_times) / max(1, len(self.approval_times)) / 60 * 100
                ),  # Efficiency based on approval speed
                "service_reliability_score": report["sla_metrics"]["availability"]["value"] * 100
                if "availability" in report.get("sla_metrics", {})
                else 0,
            },
            # Targets and thresholds for visualization
            "targets": self.targets,
            "alert_thresholds": self.alert_thresholds,
            # Raw data for detailed analysis
            "raw_metrics": report,
        }

        # Save dashboard data for external tools
        dashboard_file = (
            self.artifacts_dir / "dashboards" / f"sre_dashboard_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        )
        with open(dashboard_file, "w") as f:
            json.dump(dashboard_data, f, indent=2, default=str)

        logger.info(f"SRE dashboard data saved: {dashboard_file}")

        return dashboard_data

    def _calculate_mttr_by_severity(self, days_back: int) -> Dict[str, float]:
        """Calculate MTTR broken down by incident severity."""
        cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
        recent_incidents = [i for i in self.incidents if i.start_time >= cutoff_date and i.resolution_time]

        mttr_by_severity = {}
        for severity in ["critical", "high", "medium", "low"]:
            severity_incidents = [i for i in recent_incidents if i.severity == severity]
            if severity_incidents:
                total_time = sum((i.resolution_time - i.start_time).total_seconds() / 3600 for i in severity_incidents)
                mttr_by_severity[severity] = total_time / len(severity_incidents)
            else:
                mttr_by_severity[severity] = 0

        return mttr_by_severity

    def integrate_with_performance_monitor(self, performance_monitor) -> None:
        """
        Integrate DORA metrics with existing performance monitoring system.

        Args:
            performance_monitor: Instance of PerformanceMonitor class
        """
        try:
            # Hook into performance monitor to auto-detect incidents
            original_track = performance_monitor.track_operation

            def enhanced_track_operation(
                module: str, operation: str, execution_time: float, success: bool = True, metadata=None
            ):
                # Call original method
                result = original_track(module, operation, execution_time, success, metadata)

                # Auto-detect performance incidents for DORA tracking
                target = performance_monitor.performance_targets.get(module, {})
                threshold = target.get("target_time", 30.0)

                if execution_time > threshold:
                    self.detect_performance_incident(module, operation, execution_time, threshold)

                return result

            # Replace with enhanced version
            performance_monitor.track_operation = enhanced_track_operation

            logger.info("DORA metrics integrated with performance monitor")

        except Exception as e:
            logger.error(f"Failed to integrate with performance monitor: {e}")

    def export_cloudwatch_metrics(self, namespace: str = "CloudOps/DORA") -> bool:
        """
        Export DORA metrics to CloudWatch for enterprise monitoring.

        Args:
            namespace: CloudWatch metrics namespace

        Returns:
            Success status of metric publishing
        """
        try:
            import boto3

            cloudwatch = boto3.client("cloudwatch")

            # Calculate current metrics
            lead_time = self.calculate_lead_time(7)  # Weekly metrics
            deploy_freq = self.calculate_deployment_frequency(7)
            failure_rate = self.calculate_change_failure_rate(7)
            mttr = self.calculate_mttr(7)

            # Publish to CloudWatch
            metrics_to_publish = [
                {
                    "MetricName": "LeadTime",
                    "Value": lead_time.value,
                    "Unit": "Seconds",
                    "Dimensions": [{"Name": "Environment", "Value": "production"}],
                },
                {
                    "MetricName": "DeploymentFrequency",
                    "Value": deploy_freq.value,
                    "Unit": "Count/Second",
                    "Dimensions": [{"Name": "Environment", "Value": "production"}],
                },
                {
                    "MetricName": "ChangeFailureRate",
                    "Value": failure_rate.value * 100,  # Convert to percentage
                    "Unit": "Percent",
                    "Dimensions": [{"Name": "Environment", "Value": "production"}],
                },
                {
                    "MetricName": "MeanTimeToRecovery",
                    "Value": mttr.value,
                    "Unit": "Seconds",
                    "Dimensions": [{"Name": "Environment", "Value": "production"}],
                },
            ]

            response = cloudwatch.put_metric_data(Namespace=namespace, MetricData=metrics_to_publish)

            logger.info(f"DORA metrics published to CloudWatch: {namespace}")
            return True

        except Exception as e:
            logger.error(f"Failed to export CloudWatch metrics: {e}")
            return False


# Async functions for integration with existing systems
async def simulate_dora_metrics_collection(duration_minutes: int = 5) -> Dict:
    """Simulate DORA metrics collection for demonstration"""

    engine = DORAMetricsEngine()

    logger.info(f"Starting {duration_minutes}-minute DORA metrics simulation")

    # Simulate deployment events
    deployments = [
        ("deploy-001", "production", "vpc-wrapper", "v1.2.0", "abc123", "manager"),
        ("deploy-002", "staging", "finops-dashboard", "v2.1.0", "def456", "architect"),
        ("deploy-003", "production", "organizations-api", "v1.0.1", "ghi789", "manager"),
    ]

    for dep_id, env, service, version, commit, approver in deployments:
        deployment = engine.record_deployment(dep_id, env, service, version, commit, approver)

        # Simulate approval time
        approval_time = 15 + (hash(dep_id) % 30)  # 15-45 minutes
        engine.record_approval_time(approval_time, f"{env}_deployment")

        # Simulate deployment completion after short delay
        await asyncio.sleep(1)

        # 90% success rate simulation
        status = "success" if hash(dep_id) % 10 < 9 else "failed"
        engine.complete_deployment(dep_id, status)

    # Simulate incidents
    incidents = [
        ("inc-001", "vpc-wrapper", "high", "Network configuration error", "deploy-001"),
        ("inc-002", "finops-dashboard", "medium", "Query timeout", ""),
    ]

    for inc_id, service, severity, cause, caused_by in incidents:
        incident = engine.record_incident(inc_id, service, severity, cause, caused_by)

        # Simulate incident resolution
        await asyncio.sleep(0.5)
        detection_time = incident.start_time + timedelta(minutes=5)
        engine.resolve_incident(inc_id, detection_time)

    # Generate comprehensive report
    report = engine.generate_comprehensive_report(days_back=7)

    return report


if __name__ == "__main__":
    # CLI execution
    import argparse

    parser = argparse.ArgumentParser(description="DORA Metrics Engine")
    parser.add_argument("--simulate", action="store_true", help="Run simulation mode")
    parser.add_argument("--duration", type=int, default=5, help="Simulation duration in minutes")
    parser.add_argument("--output", "-o", default="./artifacts/metrics", help="Output directory for metrics")

    args = parser.parse_args()

    async def main():
        if args.simulate:
            report = await simulate_dora_metrics_collection(args.duration)
            print("DORA metrics simulation completed")
            print(f"Overall performance: {report['performance_analysis']['performance_grade']}")
            print(
                f"Targets met: {sum(report['performance_analysis']['targets_met'].values())}/{len(report['performance_analysis']['targets_met'])}"
            )
        else:
            engine = DORAMetricsEngine(args.output)
            report = engine.generate_comprehensive_report()
            print("DORA metrics report generated")
            print(f"Report saved to: {engine.artifacts_dir}")

    asyncio.run(main())
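For orientation, a minimal usage sketch of the DORAMetricsEngine API added in this release follows. It is illustrative only and not part of the package diff: it assumes runbooks 0.9.0 is installed so the module imports as runbooks.metrics.dora_metrics_engine (the relative import of configure_logger requires running inside the installed package), and all identifiers such as "deploy-042" and "inc-007" are made-up example values.

# Hypothetical usage sketch of the DORA metrics engine shipped in 0.9.0.
from runbooks.metrics.dora_metrics_engine import DORAMetricsEngine

# JSON artifacts (deployments, incidents, reports, alerts) land under this directory.
engine = DORAMetricsEngine(artifacts_dir="./artifacts/metrics")

# Record a deployment, its HITL approval time, and mark it complete.
engine.record_deployment(
    deployment_id="deploy-042",
    environment="production",
    service_name="finops-dashboard",
    version="v2.1.0",
    commit_sha="abc1234",
    approver="release-manager",
)
engine.record_approval_time(22.5, workflow_step="production_deployment")
engine.complete_deployment("deploy-042", status="success")

# Record and resolve an incident so MTTR and SLA calculations have data.
engine.record_incident("inc-007", "finops-dashboard", severity="high", root_cause="Query timeout")
engine.resolve_incident("inc-007")

# Generate the enterprise report; it is also written to ./artifacts/sre-reports/.
report = engine.generate_comprehensive_report(days_back=7)
print(report["performance_analysis"]["performance_grade"])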