runbooks 0.7.6-py3-none-any.whl → 0.7.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- runbooks/__init__.py +1 -1
- runbooks/base.py +5 -1
- runbooks/cfat/__init__.py +8 -4
- runbooks/cfat/assessment/collectors.py +171 -14
- runbooks/cfat/assessment/compliance.py +871 -0
- runbooks/cfat/assessment/runner.py +122 -11
- runbooks/cfat/models.py +6 -2
- runbooks/common/logger.py +14 -0
- runbooks/common/rich_utils.py +451 -0
- runbooks/enterprise/__init__.py +68 -0
- runbooks/enterprise/error_handling.py +411 -0
- runbooks/enterprise/logging.py +439 -0
- runbooks/enterprise/multi_tenant.py +583 -0
- runbooks/finops/README.md +468 -241
- runbooks/finops/__init__.py +39 -3
- runbooks/finops/cli.py +83 -18
- runbooks/finops/cross_validation.py +375 -0
- runbooks/finops/dashboard_runner.py +812 -164
- runbooks/finops/enhanced_dashboard_runner.py +525 -0
- runbooks/finops/finops_dashboard.py +1892 -0
- runbooks/finops/helpers.py +485 -51
- runbooks/finops/optimizer.py +823 -0
- runbooks/finops/tests/__init__.py +19 -0
- runbooks/finops/tests/results_test_finops_dashboard.xml +1 -0
- runbooks/finops/tests/run_comprehensive_tests.py +421 -0
- runbooks/finops/tests/run_tests.py +305 -0
- runbooks/finops/tests/test_finops_dashboard.py +705 -0
- runbooks/finops/tests/test_integration.py +477 -0
- runbooks/finops/tests/test_performance.py +380 -0
- runbooks/finops/tests/test_performance_benchmarks.py +500 -0
- runbooks/finops/tests/test_reference_images_validation.py +867 -0
- runbooks/finops/tests/test_single_account_features.py +715 -0
- runbooks/finops/tests/validate_test_suite.py +220 -0
- runbooks/finops/types.py +1 -1
- runbooks/hitl/enhanced_workflow_engine.py +725 -0
- runbooks/inventory/artifacts/scale-optimize-status.txt +12 -0
- runbooks/inventory/collectors/aws_comprehensive.py +442 -0
- runbooks/inventory/collectors/enterprise_scale.py +281 -0
- runbooks/inventory/core/collector.py +172 -13
- runbooks/inventory/discovery.md +1 -1
- runbooks/inventory/list_ec2_instances.py +18 -20
- runbooks/inventory/list_ssm_parameters.py +31 -3
- runbooks/inventory/organizations_discovery.py +1269 -0
- runbooks/inventory/rich_inventory_display.py +393 -0
- runbooks/inventory/run_on_multi_accounts.py +35 -19
- runbooks/inventory/runbooks.security.report_generator.log +0 -0
- runbooks/inventory/runbooks.security.run_script.log +0 -0
- runbooks/inventory/vpc_flow_analyzer.py +1030 -0
- runbooks/main.py +2215 -119
- runbooks/metrics/dora_metrics_engine.py +599 -0
- runbooks/operate/__init__.py +2 -2
- runbooks/operate/base.py +122 -10
- runbooks/operate/deployment_framework.py +1032 -0
- runbooks/operate/deployment_validator.py +853 -0
- runbooks/operate/dynamodb_operations.py +10 -6
- runbooks/operate/ec2_operations.py +319 -11
- runbooks/operate/executive_dashboard.py +779 -0
- runbooks/operate/mcp_integration.py +750 -0
- runbooks/operate/nat_gateway_operations.py +1120 -0
- runbooks/operate/networking_cost_heatmap.py +685 -0
- runbooks/operate/privatelink_operations.py +940 -0
- runbooks/operate/s3_operations.py +10 -6
- runbooks/operate/vpc_endpoints.py +644 -0
- runbooks/operate/vpc_operations.py +1038 -0
- runbooks/remediation/__init__.py +2 -2
- runbooks/remediation/acm_remediation.py +1 -1
- runbooks/remediation/base.py +1 -1
- runbooks/remediation/cloudtrail_remediation.py +1 -1
- runbooks/remediation/cognito_remediation.py +1 -1
- runbooks/remediation/dynamodb_remediation.py +1 -1
- runbooks/remediation/ec2_remediation.py +1 -1
- runbooks/remediation/ec2_unattached_ebs_volumes.py +1 -1
- runbooks/remediation/kms_enable_key_rotation.py +1 -1
- runbooks/remediation/kms_remediation.py +1 -1
- runbooks/remediation/lambda_remediation.py +1 -1
- runbooks/remediation/multi_account.py +1 -1
- runbooks/remediation/rds_remediation.py +1 -1
- runbooks/remediation/s3_block_public_access.py +1 -1
- runbooks/remediation/s3_enable_access_logging.py +1 -1
- runbooks/remediation/s3_encryption.py +1 -1
- runbooks/remediation/s3_remediation.py +1 -1
- runbooks/remediation/vpc_remediation.py +475 -0
- runbooks/security/__init__.py +3 -1
- runbooks/security/compliance_automation.py +632 -0
- runbooks/security/report_generator.py +10 -0
- runbooks/security/run_script.py +31 -5
- runbooks/security/security_baseline_tester.py +169 -30
- runbooks/security/security_export.py +477 -0
- runbooks/validation/__init__.py +10 -0
- runbooks/validation/benchmark.py +484 -0
- runbooks/validation/cli.py +356 -0
- runbooks/validation/mcp_validator.py +768 -0
- runbooks/vpc/__init__.py +38 -0
- runbooks/vpc/config.py +212 -0
- runbooks/vpc/cost_engine.py +347 -0
- runbooks/vpc/heatmap_engine.py +605 -0
- runbooks/vpc/manager_interface.py +634 -0
- runbooks/vpc/networking_wrapper.py +1260 -0
- runbooks/vpc/rich_formatters.py +679 -0
- runbooks/vpc/tests/__init__.py +5 -0
- runbooks/vpc/tests/conftest.py +356 -0
- runbooks/vpc/tests/test_cli_integration.py +530 -0
- runbooks/vpc/tests/test_config.py +458 -0
- runbooks/vpc/tests/test_cost_engine.py +479 -0
- runbooks/vpc/tests/test_networking_wrapper.py +512 -0
- {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/METADATA +40 -12
- {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/RECORD +111 -50
- {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/WHEEL +0 -0
- {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/entry_points.txt +0 -0
- {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/licenses/LICENSE +0 -0
- {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/top_level.txt +0 -0
runbooks/metrics/dora_metrics_engine.py
ADDED
@@ -0,0 +1,599 @@
#!/usr/bin/env python3
"""
DORA Metrics Engine for HITL System Optimization

Issue #93: HITL System & DORA Metrics Optimization
Priority: High (Sprint 1 Improvements)
Scope: Optimize Human-in-the-Loop system and enhance DORA metrics collection
"""

import asyncio
import json
import logging
import time
from dataclasses import asdict, dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from ..utils.logger import configure_logger

logger = configure_logger(__name__)


@dataclass
class DORAMetric:
    """Individual DORA metric measurement"""

    metric_name: str
    value: float
    unit: str
    timestamp: datetime
    tags: Dict[str, str] = None
    metadata: Dict = None

    def __post_init__(self):
        if self.tags is None:
            self.tags = {}
        if self.metadata is None:
            self.metadata = {}


@dataclass
class DeploymentEvent:
    """Deployment event for DORA metrics tracking"""

    deployment_id: str
    environment: str
    service_name: str
    version: str
    start_time: datetime
    end_time: Optional[datetime] = None
    status: str = "in_progress"  # in_progress, success, failed, rolled_back
    commit_sha: str = ""
    approver: str = ""
    rollback_time: Optional[datetime] = None


@dataclass
class IncidentEvent:
    """Incident event for DORA metrics tracking"""

    incident_id: str
    service_name: str
    severity: str  # critical, high, medium, low
    start_time: datetime
    detection_time: Optional[datetime] = None
    resolution_time: Optional[datetime] = None
    root_cause: str = ""
    caused_by_deployment: str = ""


class DORAMetricsEngine:
    """Enhanced DORA metrics collection and analysis engine"""

    def __init__(self, artifacts_dir: str = "./artifacts/metrics", cross_validation_tolerance: float = 15.0):
        """
        Initialize DORA metrics engine

        Args:
            artifacts_dir: Directory to store metrics artifacts
            cross_validation_tolerance: Tolerance percentage for metric validation
        """
        self.artifacts_dir = Path(artifacts_dir)
        self.artifacts_dir.mkdir(parents=True, exist_ok=True)

        self.tolerance = cross_validation_tolerance

        # Metrics storage
        self.deployments: List[DeploymentEvent] = []
        self.incidents: List[IncidentEvent] = []
        self.metrics_history: List[DORAMetric] = []

        # HITL workflow metrics
        self.approval_times: List[float] = []
        self.workflow_bottlenecks: Dict[str, List[float]] = {}

        # Performance targets from CLAUDE.md
        self.targets = {
            "lead_time_hours": 4,  # <4 hours
            "deploy_frequency_daily": 1,  # Daily deployment capability
            "change_failure_rate": 0.05,  # <5%
            "mttr_hours": 1,  # <1 hour
            "approval_time_minutes": 30,  # <30 minutes
            "success_rate": 0.95,  # >95%
        }

    def record_deployment(
        self,
        deployment_id: str,
        environment: str,
        service_name: str,
        version: str,
        commit_sha: str = "",
        approver: str = "",
    ) -> DeploymentEvent:
        """Record a new deployment event"""

        deployment = DeploymentEvent(
            deployment_id=deployment_id,
            environment=environment,
            service_name=service_name,
            version=version,
            start_time=datetime.now(timezone.utc),
            commit_sha=commit_sha,
            approver=approver,
        )

        self.deployments.append(deployment)

        logger.info(f"🚀 Deployment recorded: {deployment_id} for {service_name}")

        return deployment

    def complete_deployment(self, deployment_id: str, status: str, rollback_time: Optional[datetime] = None) -> bool:
        """Mark deployment as complete"""

        for deployment in self.deployments:
            if deployment.deployment_id == deployment_id:
                deployment.end_time = datetime.now(timezone.utc)
                deployment.status = status
                deployment.rollback_time = rollback_time

                logger.info(f"✅ Deployment completed: {deployment_id} - {status}")
                return True

        logger.warning(f"⚠️ Deployment not found: {deployment_id}")
        return False

    def record_incident(
        self, incident_id: str, service_name: str, severity: str, root_cause: str = "", caused_by_deployment: str = ""
    ) -> IncidentEvent:
        """Record a new incident event"""

        incident = IncidentEvent(
            incident_id=incident_id,
            service_name=service_name,
            severity=severity,
            start_time=datetime.now(timezone.utc),
            root_cause=root_cause,
            caused_by_deployment=caused_by_deployment,
        )

        self.incidents.append(incident)

        logger.info(f"🚨 Incident recorded: {incident_id} - {severity} severity")

        return incident

    def resolve_incident(self, incident_id: str, detection_time: Optional[datetime] = None) -> bool:
        """Mark incident as resolved"""

        for incident in self.incidents:
            if incident.incident_id == incident_id:
                incident.resolution_time = datetime.now(timezone.utc)
                if detection_time:
                    incident.detection_time = detection_time

                logger.info(f"✅ Incident resolved: {incident_id}")
                return True

        logger.warning(f"⚠️ Incident not found: {incident_id}")
        return False

    def record_approval_time(self, approval_time_minutes: float, workflow_step: str = "general"):
        """Record HITL approval time"""
        self.approval_times.append(approval_time_minutes)

        if workflow_step not in self.workflow_bottlenecks:
            self.workflow_bottlenecks[workflow_step] = []
        self.workflow_bottlenecks[workflow_step].append(approval_time_minutes)

    def calculate_lead_time(self, days_back: int = 30) -> DORAMetric:
        """Calculate deployment lead time"""

        cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
        recent_deployments = [d for d in self.deployments if d.start_time >= cutoff_date and d.end_time]

        if not recent_deployments:
            return DORAMetric(
                metric_name="lead_time",
                value=0.0,
                unit="hours",
                timestamp=datetime.now(timezone.utc),
                tags={"period": f"{days_back}d", "status": "no_data"},
            )

        # Calculate average lead time (simplified - in real scenario would track from commit to production)
        lead_times = []
        for deployment in recent_deployments:
            if deployment.end_time and deployment.status == "success":
                duration = (deployment.end_time - deployment.start_time).total_seconds() / 3600  # hours
                lead_times.append(duration)

        avg_lead_time = sum(lead_times) / len(lead_times) if lead_times else 0

        metric = DORAMetric(
            metric_name="lead_time",
            value=avg_lead_time,
            unit="hours",
            timestamp=datetime.now(timezone.utc),
            tags={
                "period": f"{days_back}d",
                "deployments_count": str(len(recent_deployments)),
                "successful_deployments": str(len(lead_times)),
            },
            metadata={
                "target": self.targets["lead_time_hours"],
                "target_met": avg_lead_time <= self.targets["lead_time_hours"],
            },
        )

        self.metrics_history.append(metric)
        return metric

    def calculate_deployment_frequency(self, days_back: int = 30) -> DORAMetric:
        """Calculate deployment frequency"""

        cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
        recent_deployments = [d for d in self.deployments if d.start_time >= cutoff_date]

        # Calculate deployments per day
        deployments_per_day = len(recent_deployments) / days_back if days_back > 0 else 0

        metric = DORAMetric(
            metric_name="deployment_frequency",
            value=deployments_per_day,
            unit="deployments_per_day",
            timestamp=datetime.now(timezone.utc),
            tags={"period": f"{days_back}d", "total_deployments": str(len(recent_deployments))},
            metadata={
                "target": self.targets["deploy_frequency_daily"],
                "target_met": deployments_per_day >= self.targets["deploy_frequency_daily"],
            },
        )

        self.metrics_history.append(metric)
        return metric

    def calculate_change_failure_rate(self, days_back: int = 30) -> DORAMetric:
        """Calculate change failure rate"""

        cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
        recent_deployments = [d for d in self.deployments if d.start_time >= cutoff_date and d.end_time]

        if not recent_deployments:
            return DORAMetric(
                metric_name="change_failure_rate",
                value=0.0,
                unit="percentage",
                timestamp=datetime.now(timezone.utc),
                tags={"period": f"{days_back}d", "status": "no_data"},
            )

        failed_deployments = len([d for d in recent_deployments if d.status in ["failed", "rolled_back"]])

        failure_rate = failed_deployments / len(recent_deployments)

        metric = DORAMetric(
            metric_name="change_failure_rate",
            value=failure_rate,
            unit="percentage",
            timestamp=datetime.now(timezone.utc),
            tags={
                "period": f"{days_back}d",
                "total_deployments": str(len(recent_deployments)),
                "failed_deployments": str(failed_deployments),
            },
            metadata={
                "target": self.targets["change_failure_rate"],
                "target_met": failure_rate <= self.targets["change_failure_rate"],
            },
        )

        self.metrics_history.append(metric)
        return metric

    def calculate_mttr(self, days_back: int = 30) -> DORAMetric:
        """Calculate Mean Time to Recovery (MTTR)"""

        cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
        recent_incidents = [i for i in self.incidents if i.start_time >= cutoff_date and i.resolution_time]

        if not recent_incidents:
            return DORAMetric(
                metric_name="mttr",
                value=0.0,
                unit="hours",
                timestamp=datetime.now(timezone.utc),
                tags={"period": f"{days_back}d", "status": "no_data"},
            )

        # Calculate recovery times
        recovery_times = []
        for incident in recent_incidents:
            if incident.resolution_time:
                duration = (incident.resolution_time - incident.start_time).total_seconds() / 3600  # hours
                recovery_times.append(duration)

        avg_mttr = sum(recovery_times) / len(recovery_times) if recovery_times else 0

        metric = DORAMetric(
            metric_name="mttr",
            value=avg_mttr,
            unit="hours",
            timestamp=datetime.now(timezone.utc),
            tags={"period": f"{days_back}d", "incidents_count": str(len(recent_incidents))},
            metadata={"target": self.targets["mttr_hours"], "target_met": avg_mttr <= self.targets["mttr_hours"]},
        )

        self.metrics_history.append(metric)
        return metric

    def calculate_hitl_metrics(self) -> Dict[str, DORAMetric]:
        """Calculate Human-in-the-Loop specific metrics"""

        metrics = {}

        # Average approval time
        if self.approval_times:
            avg_approval_time = sum(self.approval_times) / len(self.approval_times)

            metrics["approval_time"] = DORAMetric(
                metric_name="approval_time",
                value=avg_approval_time,
                unit="minutes",
                timestamp=datetime.now(timezone.utc),
                tags={"total_approvals": str(len(self.approval_times))},
                metadata={
                    "target": self.targets["approval_time_minutes"],
                    "target_met": avg_approval_time <= self.targets["approval_time_minutes"],
                },
            )

        # Workflow bottlenecks analysis
        if self.workflow_bottlenecks:
            bottleneck_metrics = {}

            for step, times in self.workflow_bottlenecks.items():
                if times:
                    avg_time = sum(times) / len(times)
                    bottleneck_metrics[f"{step}_avg_time"] = avg_time

            # Identify slowest step
            if bottleneck_metrics:
                slowest_step = max(bottleneck_metrics, key=bottleneck_metrics.get)
                slowest_time = bottleneck_metrics[slowest_step]

                metrics["workflow_bottleneck"] = DORAMetric(
                    metric_name="workflow_bottleneck",
                    value=slowest_time,
                    unit="minutes",
                    timestamp=datetime.now(timezone.utc),
                    tags={"bottleneck_step": slowest_step},
                    metadata={"all_steps": bottleneck_metrics},
                )

        return metrics

    def generate_comprehensive_report(self, days_back: int = 30) -> Dict:
        """Generate comprehensive DORA metrics report"""

        logger.info(f"📊 Generating DORA metrics report for last {days_back} days")

        # Calculate all DORA metrics
        lead_time = self.calculate_lead_time(days_back)
        deployment_freq = self.calculate_deployment_frequency(days_back)
        failure_rate = self.calculate_change_failure_rate(days_back)
        mttr = self.calculate_mttr(days_back)

        # Calculate HITL metrics
        hitl_metrics = self.calculate_hitl_metrics()

        # Performance analysis
        targets_met = {
            "lead_time": lead_time.metadata.get("target_met", False),
            "deployment_frequency": deployment_freq.metadata.get("target_met", False),
            "change_failure_rate": failure_rate.metadata.get("target_met", False),
            "mttr": mttr.metadata.get("target_met", False),
        }

        # Add HITL targets
        if "approval_time" in hitl_metrics:
            targets_met["approval_time"] = hitl_metrics["approval_time"].metadata.get("target_met", False)

        overall_performance = sum(targets_met.values()) / len(targets_met) * 100

        report = {
            "report_type": "dora_metrics_comprehensive",
            "period": f"{days_back}_days",
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "dora_metrics": {
                "lead_time": asdict(lead_time),
                "deployment_frequency": asdict(deployment_freq),
                "change_failure_rate": asdict(failure_rate),
                "mttr": asdict(mttr),
            },
            "hitl_metrics": {k: asdict(v) for k, v in hitl_metrics.items()},
            "performance_analysis": {
                "targets_met": targets_met,
                "overall_performance_percentage": overall_performance,
                "performance_grade": self._calculate_performance_grade(overall_performance),
            },
            "recommendations": self._generate_recommendations(targets_met, hitl_metrics),
            "raw_data": {
                "deployments_count": len(self.deployments),
                "incidents_count": len(self.incidents),
                "approval_times_count": len(self.approval_times),
            },
        }

        # Save report
        report_file = self.artifacts_dir / f"dora_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(report_file, "w") as f:
            json.dump(report, f, indent=2, default=str)

        logger.info(f"✅ DORA metrics report saved to: {report_file}")

        return report

    def _calculate_performance_grade(self, percentage: float) -> str:
        """Calculate performance grade based on targets met"""
        if percentage >= 90:
            return "A (Excellent)"
        elif percentage >= 80:
            return "B (Good)"
        elif percentage >= 70:
            return "C (Satisfactory)"
        elif percentage >= 60:
            return "D (Needs Improvement)"
        else:
            return "F (Poor)"

    def _generate_recommendations(self, targets_met: Dict[str, bool], hitl_metrics: Dict) -> List[str]:
        """Generate recommendations based on metrics analysis"""

        recommendations = []

        if not targets_met.get("lead_time", False):
            recommendations.append(
                "🎯 Optimize lead time: Consider parallel workflows, automated testing, and faster approval processes"
            )

        if not targets_met.get("deployment_frequency", False):
            recommendations.append(
                "🚀 Increase deployment frequency: Implement continuous deployment pipeline and smaller batch sizes"
            )

        if not targets_met.get("change_failure_rate", False):
            recommendations.append(
                "🛡️ Reduce failure rate: Enhance testing coverage, implement canary deployments, and improve rollback procedures"
            )

        if not targets_met.get("mttr", False):
            recommendations.append(
                "⚡ Improve MTTR: Enhance monitoring, implement automated incident response, and improve alerting"
            )

        if not targets_met.get("approval_time", False):
            recommendations.append(
                "⏰ Optimize approval workflow: Streamline HITL processes, implement parallel approvals, and reduce approval steps"
            )

        # HITL-specific recommendations
        if "workflow_bottleneck" in hitl_metrics:
            bottleneck_step = hitl_metrics["workflow_bottleneck"].tags.get("bottleneck_step", "unknown")
            recommendations.append(f"🔍 Address workflow bottleneck: Focus on optimizing '{bottleneck_step}' step")

        if not recommendations:
            recommendations.append(
                "✅ All targets met! Consider raising performance targets or exploring advanced optimization opportunities"
            )

        return recommendations

    def export_metrics_for_visualization(self, output_file: Optional[str] = None) -> str:
        """Export metrics in format suitable for visualization tools"""

        if not output_file:
            output_file = self.artifacts_dir / f"metrics_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

        export_data = {
            "export_timestamp": datetime.now(timezone.utc).isoformat(),
            "metrics_history": [asdict(m) for m in self.metrics_history],
            "deployments": [asdict(d) for d in self.deployments],
            "incidents": [asdict(i) for i in self.incidents],
            "targets": self.targets,
            "summary_stats": {
                "total_deployments": len(self.deployments),
                "successful_deployments": len([d for d in self.deployments if d.status == "success"]),
                "total_incidents": len(self.incidents),
                "resolved_incidents": len([i for i in self.incidents if i.resolution_time]),
                "average_approval_time": sum(self.approval_times) / len(self.approval_times)
                if self.approval_times
                else 0,
            },
        }

        with open(output_file, "w") as f:
            json.dump(export_data, f, indent=2, default=str)

        logger.info(f"📊 Metrics exported for visualization: {output_file}")
        return str(output_file)


# Async functions for integration with existing systems
async def simulate_dora_metrics_collection(duration_minutes: int = 5) -> Dict:
    """Simulate DORA metrics collection for demonstration"""

    engine = DORAMetricsEngine()

    logger.info(f"🧪 Starting {duration_minutes}-minute DORA metrics simulation")

    # Simulate deployment events
    deployments = [
        ("deploy-001", "production", "vpc-wrapper", "v1.2.0", "abc123", "manager"),
        ("deploy-002", "staging", "finops-dashboard", "v2.1.0", "def456", "architect"),
        ("deploy-003", "production", "organizations-api", "v1.0.1", "ghi789", "manager"),
    ]

    for dep_id, env, service, version, commit, approver in deployments:
        deployment = engine.record_deployment(dep_id, env, service, version, commit, approver)

        # Simulate approval time
        approval_time = 15 + (hash(dep_id) % 30)  # 15-45 minutes
        engine.record_approval_time(approval_time, f"{env}_deployment")

        # Simulate deployment completion after short delay
        await asyncio.sleep(1)

        # 90% success rate simulation
        status = "success" if hash(dep_id) % 10 < 9 else "failed"
        engine.complete_deployment(dep_id, status)

    # Simulate incidents
    incidents = [
        ("inc-001", "vpc-wrapper", "high", "Network configuration error", "deploy-001"),
        ("inc-002", "finops-dashboard", "medium", "Query timeout", ""),
    ]

    for inc_id, service, severity, cause, caused_by in incidents:
        incident = engine.record_incident(inc_id, service, severity, cause, caused_by)

        # Simulate incident resolution
        await asyncio.sleep(0.5)
        detection_time = incident.start_time + timedelta(minutes=5)
        engine.resolve_incident(inc_id, detection_time)

    # Generate comprehensive report
    report = engine.generate_comprehensive_report(days_back=7)

    return report


if __name__ == "__main__":
    # CLI execution
    import argparse

    parser = argparse.ArgumentParser(description="DORA Metrics Engine")
    parser.add_argument("--simulate", action="store_true", help="Run simulation mode")
    parser.add_argument("--duration", type=int, default=5, help="Simulation duration in minutes")
    parser.add_argument("--output", "-o", default="./artifacts/metrics", help="Output directory for metrics")

    args = parser.parse_args()

    async def main():
        if args.simulate:
            report = await simulate_dora_metrics_collection(args.duration)
            print("✅ DORA metrics simulation completed")
            print(f"📊 Overall performance: {report['performance_analysis']['performance_grade']}")
            print(
                f"🎯 Targets met: {sum(report['performance_analysis']['targets_met'].values())}/{len(report['performance_analysis']['targets_met'])}"
            )
        else:
            engine = DORAMetricsEngine(args.output)
            report = engine.generate_comprehensive_report()
            print("✅ DORA metrics report generated")
            print(f"📊 Report saved to: {engine.artifacts_dir}")

    asyncio.run(main())
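For orientation, here is a minimal usage sketch of the engine added above, as it might be driven from a separate script. The import path follows the package layout in the file list; the deployment IDs, service names, and timings are illustrative assumptions, not part of the released wheel, and the snippet assumes the module's relative imports resolve in your environment.

# Hypothetical driver script; identifiers and values below are made up for illustration.
from runbooks.metrics.dora_metrics_engine import DORAMetricsEngine

engine = DORAMetricsEngine(artifacts_dir="./artifacts/metrics")

# Track one deployment through its lifecycle, including HITL approval latency.
engine.record_deployment("deploy-100", "production", "finops-dashboard", "v2.2.0", approver="manager")
engine.record_approval_time(22.5, workflow_step="production_deployment")
engine.complete_deployment("deploy-100", "success")

# Roll the four DORA metrics plus HITL metrics into a JSON report under artifacts_dir.
report = engine.generate_comprehensive_report(days_back=30)
print(report["performance_analysis"]["performance_grade"])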
runbooks/operate/__init__.py
CHANGED
@@ -169,7 +169,7 @@ runbooks operate iam update-roles-cross-accounts --role-name deployment-role
 - **Platform Teams**: Self-service infrastructure capabilities
 - **Security Teams**: Compliance automation and policy enforcement
 
-Version: 0.7.
+Version: 0.7.8 - Enterprise Production Ready
 Compatibility: AWS SDK v3, Python 3.8+, Multi-deployment ready
 """
 
@@ -183,7 +183,7 @@ from runbooks.operate.s3_operations import S3Operations
 from runbooks.operate.tagging_operations import TaggingOperations
 
 # Version info
-__version__ = "0.7.
+__version__ = "0.7.8"
 __author__ = "CloudOps Runbooks Team"
 
 # Public API exports