runbooks-0.7.6-py3-none-any.whl → runbooks-0.7.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. runbooks/__init__.py +1 -1
  2. runbooks/base.py +5 -1
  3. runbooks/cfat/__init__.py +8 -4
  4. runbooks/cfat/assessment/collectors.py +171 -14
  5. runbooks/cfat/assessment/compliance.py +871 -0
  6. runbooks/cfat/assessment/runner.py +122 -11
  7. runbooks/cfat/models.py +6 -2
  8. runbooks/common/logger.py +14 -0
  9. runbooks/common/rich_utils.py +451 -0
  10. runbooks/enterprise/__init__.py +68 -0
  11. runbooks/enterprise/error_handling.py +411 -0
  12. runbooks/enterprise/logging.py +439 -0
  13. runbooks/enterprise/multi_tenant.py +583 -0
  14. runbooks/finops/README.md +468 -241
  15. runbooks/finops/__init__.py +39 -3
  16. runbooks/finops/cli.py +83 -18
  17. runbooks/finops/cross_validation.py +375 -0
  18. runbooks/finops/dashboard_runner.py +812 -164
  19. runbooks/finops/enhanced_dashboard_runner.py +525 -0
  20. runbooks/finops/finops_dashboard.py +1892 -0
  21. runbooks/finops/helpers.py +485 -51
  22. runbooks/finops/optimizer.py +823 -0
  23. runbooks/finops/tests/__init__.py +19 -0
  24. runbooks/finops/tests/results_test_finops_dashboard.xml +1 -0
  25. runbooks/finops/tests/run_comprehensive_tests.py +421 -0
  26. runbooks/finops/tests/run_tests.py +305 -0
  27. runbooks/finops/tests/test_finops_dashboard.py +705 -0
  28. runbooks/finops/tests/test_integration.py +477 -0
  29. runbooks/finops/tests/test_performance.py +380 -0
  30. runbooks/finops/tests/test_performance_benchmarks.py +500 -0
  31. runbooks/finops/tests/test_reference_images_validation.py +867 -0
  32. runbooks/finops/tests/test_single_account_features.py +715 -0
  33. runbooks/finops/tests/validate_test_suite.py +220 -0
  34. runbooks/finops/types.py +1 -1
  35. runbooks/hitl/enhanced_workflow_engine.py +725 -0
  36. runbooks/inventory/artifacts/scale-optimize-status.txt +12 -0
  37. runbooks/inventory/collectors/aws_comprehensive.py +442 -0
  38. runbooks/inventory/collectors/enterprise_scale.py +281 -0
  39. runbooks/inventory/core/collector.py +172 -13
  40. runbooks/inventory/discovery.md +1 -1
  41. runbooks/inventory/list_ec2_instances.py +18 -20
  42. runbooks/inventory/list_ssm_parameters.py +31 -3
  43. runbooks/inventory/organizations_discovery.py +1269 -0
  44. runbooks/inventory/rich_inventory_display.py +393 -0
  45. runbooks/inventory/run_on_multi_accounts.py +35 -19
  46. runbooks/inventory/runbooks.security.report_generator.log +0 -0
  47. runbooks/inventory/runbooks.security.run_script.log +0 -0
  48. runbooks/inventory/vpc_flow_analyzer.py +1030 -0
  49. runbooks/main.py +2215 -119
  50. runbooks/metrics/dora_metrics_engine.py +599 -0
  51. runbooks/operate/__init__.py +2 -2
  52. runbooks/operate/base.py +122 -10
  53. runbooks/operate/deployment_framework.py +1032 -0
  54. runbooks/operate/deployment_validator.py +853 -0
  55. runbooks/operate/dynamodb_operations.py +10 -6
  56. runbooks/operate/ec2_operations.py +319 -11
  57. runbooks/operate/executive_dashboard.py +779 -0
  58. runbooks/operate/mcp_integration.py +750 -0
  59. runbooks/operate/nat_gateway_operations.py +1120 -0
  60. runbooks/operate/networking_cost_heatmap.py +685 -0
  61. runbooks/operate/privatelink_operations.py +940 -0
  62. runbooks/operate/s3_operations.py +10 -6
  63. runbooks/operate/vpc_endpoints.py +644 -0
  64. runbooks/operate/vpc_operations.py +1038 -0
  65. runbooks/remediation/__init__.py +2 -2
  66. runbooks/remediation/acm_remediation.py +1 -1
  67. runbooks/remediation/base.py +1 -1
  68. runbooks/remediation/cloudtrail_remediation.py +1 -1
  69. runbooks/remediation/cognito_remediation.py +1 -1
  70. runbooks/remediation/dynamodb_remediation.py +1 -1
  71. runbooks/remediation/ec2_remediation.py +1 -1
  72. runbooks/remediation/ec2_unattached_ebs_volumes.py +1 -1
  73. runbooks/remediation/kms_enable_key_rotation.py +1 -1
  74. runbooks/remediation/kms_remediation.py +1 -1
  75. runbooks/remediation/lambda_remediation.py +1 -1
  76. runbooks/remediation/multi_account.py +1 -1
  77. runbooks/remediation/rds_remediation.py +1 -1
  78. runbooks/remediation/s3_block_public_access.py +1 -1
  79. runbooks/remediation/s3_enable_access_logging.py +1 -1
  80. runbooks/remediation/s3_encryption.py +1 -1
  81. runbooks/remediation/s3_remediation.py +1 -1
  82. runbooks/remediation/vpc_remediation.py +475 -0
  83. runbooks/security/__init__.py +3 -1
  84. runbooks/security/compliance_automation.py +632 -0
  85. runbooks/security/report_generator.py +10 -0
  86. runbooks/security/run_script.py +31 -5
  87. runbooks/security/security_baseline_tester.py +169 -30
  88. runbooks/security/security_export.py +477 -0
  89. runbooks/validation/__init__.py +10 -0
  90. runbooks/validation/benchmark.py +484 -0
  91. runbooks/validation/cli.py +356 -0
  92. runbooks/validation/mcp_validator.py +768 -0
  93. runbooks/vpc/__init__.py +38 -0
  94. runbooks/vpc/config.py +212 -0
  95. runbooks/vpc/cost_engine.py +347 -0
  96. runbooks/vpc/heatmap_engine.py +605 -0
  97. runbooks/vpc/manager_interface.py +634 -0
  98. runbooks/vpc/networking_wrapper.py +1260 -0
  99. runbooks/vpc/rich_formatters.py +679 -0
  100. runbooks/vpc/tests/__init__.py +5 -0
  101. runbooks/vpc/tests/conftest.py +356 -0
  102. runbooks/vpc/tests/test_cli_integration.py +530 -0
  103. runbooks/vpc/tests/test_config.py +458 -0
  104. runbooks/vpc/tests/test_cost_engine.py +479 -0
  105. runbooks/vpc/tests/test_networking_wrapper.py +512 -0
  106. {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/METADATA +40 -12
  107. {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/RECORD +111 -50
  108. {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/WHEEL +0 -0
  109. {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/entry_points.txt +0 -0
  110. {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/licenses/LICENSE +0 -0
  111. {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/top_level.txt +0 -0
runbooks/metrics/dora_metrics_engine.py
@@ -0,0 +1,599 @@
+ #!/usr/bin/env python3
+ """
+ DORA Metrics Engine for HITL System Optimization
+
+ Issue #93: HITL System & DORA Metrics Optimization
+ Priority: High (Sprint 1 Improvements)
+ Scope: Optimize Human-in-the-Loop system and enhance DORA metrics collection
+ """
+
+ import asyncio
+ import json
+ import logging
+ import time
+ from dataclasses import asdict, dataclass
+ from datetime import datetime, timedelta, timezone
+ from pathlib import Path
+ from typing import Dict, List, Optional, Tuple
+
+ from ..utils.logger import configure_logger
+
+ logger = configure_logger(__name__)
+
+
+ @dataclass
+ class DORAMetric:
+     """Individual DORA metric measurement"""
+
+     metric_name: str
+     value: float
+     unit: str
+     timestamp: datetime
+     tags: Dict[str, str] = None
+     metadata: Dict = None
+
+     def __post_init__(self):
+         if self.tags is None:
+             self.tags = {}
+         if self.metadata is None:
+             self.metadata = {}
+
+
+ @dataclass
+ class DeploymentEvent:
+     """Deployment event for DORA metrics tracking"""
+
+     deployment_id: str
+     environment: str
+     service_name: str
+     version: str
+     start_time: datetime
+     end_time: Optional[datetime] = None
+     status: str = "in_progress"  # in_progress, success, failed, rolled_back
+     commit_sha: str = ""
+     approver: str = ""
+     rollback_time: Optional[datetime] = None
+
+
+ @dataclass
+ class IncidentEvent:
+     """Incident event for DORA metrics tracking"""
+
+     incident_id: str
+     service_name: str
+     severity: str  # critical, high, medium, low
+     start_time: datetime
+     detection_time: Optional[datetime] = None
+     resolution_time: Optional[datetime] = None
+     root_cause: str = ""
+     caused_by_deployment: str = ""
+
+
+ class DORAMetricsEngine:
+     """Enhanced DORA metrics collection and analysis engine"""
+
+     def __init__(self, artifacts_dir: str = "./artifacts/metrics", cross_validation_tolerance: float = 15.0):
+         """
+         Initialize DORA metrics engine
+
+         Args:
+             artifacts_dir: Directory to store metrics artifacts
+             cross_validation_tolerance: Tolerance percentage for metric validation
+         """
+         self.artifacts_dir = Path(artifacts_dir)
+         self.artifacts_dir.mkdir(parents=True, exist_ok=True)
+
+         self.tolerance = cross_validation_tolerance
+
+         # Metrics storage
+         self.deployments: List[DeploymentEvent] = []
+         self.incidents: List[IncidentEvent] = []
+         self.metrics_history: List[DORAMetric] = []
+
+         # HITL workflow metrics
+         self.approval_times: List[float] = []
+         self.workflow_bottlenecks: Dict[str, List[float]] = {}
+
+         # Performance targets from CLAUDE.md
+         self.targets = {
+             "lead_time_hours": 4,  # <4 hours
+             "deploy_frequency_daily": 1,  # Daily deployment capability
+             "change_failure_rate": 0.05,  # <5%
+             "mttr_hours": 1,  # <1 hour
+             "approval_time_minutes": 30,  # <30 minutes
+             "success_rate": 0.95,  # >95%
+         }
+
+     def record_deployment(
+         self,
+         deployment_id: str,
+         environment: str,
+         service_name: str,
+         version: str,
+         commit_sha: str = "",
+         approver: str = "",
+     ) -> DeploymentEvent:
+         """Record a new deployment event"""
+
+         deployment = DeploymentEvent(
+             deployment_id=deployment_id,
+             environment=environment,
+             service_name=service_name,
+             version=version,
+             start_time=datetime.now(timezone.utc),
+             commit_sha=commit_sha,
+             approver=approver,
+         )
+
+         self.deployments.append(deployment)
+
+         logger.info(f"🚀 Deployment recorded: {deployment_id} for {service_name}")
+
+         return deployment
+
+     def complete_deployment(self, deployment_id: str, status: str, rollback_time: Optional[datetime] = None) -> bool:
+         """Mark deployment as complete"""
+
+         for deployment in self.deployments:
+             if deployment.deployment_id == deployment_id:
+                 deployment.end_time = datetime.now(timezone.utc)
+                 deployment.status = status
+                 deployment.rollback_time = rollback_time
+
+                 logger.info(f"✅ Deployment completed: {deployment_id} - {status}")
+                 return True
+
+         logger.warning(f"⚠️ Deployment not found: {deployment_id}")
+         return False
+
+     def record_incident(
+         self, incident_id: str, service_name: str, severity: str, root_cause: str = "", caused_by_deployment: str = ""
+     ) -> IncidentEvent:
+         """Record a new incident event"""
+
+         incident = IncidentEvent(
+             incident_id=incident_id,
+             service_name=service_name,
+             severity=severity,
+             start_time=datetime.now(timezone.utc),
+             root_cause=root_cause,
+             caused_by_deployment=caused_by_deployment,
+         )
+
+         self.incidents.append(incident)
+
+         logger.info(f"🚨 Incident recorded: {incident_id} - {severity} severity")
+
+         return incident
+
+     def resolve_incident(self, incident_id: str, detection_time: Optional[datetime] = None) -> bool:
+         """Mark incident as resolved"""
+
+         for incident in self.incidents:
+             if incident.incident_id == incident_id:
+                 incident.resolution_time = datetime.now(timezone.utc)
+                 if detection_time:
+                     incident.detection_time = detection_time
+
+                 logger.info(f"✅ Incident resolved: {incident_id}")
+                 return True
+
+         logger.warning(f"⚠️ Incident not found: {incident_id}")
+         return False
+
+     def record_approval_time(self, approval_time_minutes: float, workflow_step: str = "general"):
+         """Record HITL approval time"""
+         self.approval_times.append(approval_time_minutes)
+
+         if workflow_step not in self.workflow_bottlenecks:
+             self.workflow_bottlenecks[workflow_step] = []
+         self.workflow_bottlenecks[workflow_step].append(approval_time_minutes)
+
+     def calculate_lead_time(self, days_back: int = 30) -> DORAMetric:
+         """Calculate deployment lead time"""
+
+         cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
+         recent_deployments = [d for d in self.deployments if d.start_time >= cutoff_date and d.end_time]
+
+         if not recent_deployments:
+             return DORAMetric(
+                 metric_name="lead_time",
+                 value=0.0,
+                 unit="hours",
+                 timestamp=datetime.now(timezone.utc),
+                 tags={"period": f"{days_back}d", "status": "no_data"},
+             )
+
+         # Calculate average lead time (simplified - in real scenario would track from commit to production)
+         lead_times = []
+         for deployment in recent_deployments:
+             if deployment.end_time and deployment.status == "success":
+                 duration = (deployment.end_time - deployment.start_time).total_seconds() / 3600  # hours
+                 lead_times.append(duration)
+
+         avg_lead_time = sum(lead_times) / len(lead_times) if lead_times else 0
+
+         metric = DORAMetric(
+             metric_name="lead_time",
+             value=avg_lead_time,
+             unit="hours",
+             timestamp=datetime.now(timezone.utc),
+             tags={
+                 "period": f"{days_back}d",
+                 "deployments_count": str(len(recent_deployments)),
+                 "successful_deployments": str(len(lead_times)),
+             },
+             metadata={
+                 "target": self.targets["lead_time_hours"],
+                 "target_met": avg_lead_time <= self.targets["lead_time_hours"],
+             },
+         )
+
+         self.metrics_history.append(metric)
+         return metric
+
+     def calculate_deployment_frequency(self, days_back: int = 30) -> DORAMetric:
+         """Calculate deployment frequency"""
+
+         cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
+         recent_deployments = [d for d in self.deployments if d.start_time >= cutoff_date]
+
+         # Calculate deployments per day
+         deployments_per_day = len(recent_deployments) / days_back if days_back > 0 else 0
+
+         metric = DORAMetric(
+             metric_name="deployment_frequency",
+             value=deployments_per_day,
+             unit="deployments_per_day",
+             timestamp=datetime.now(timezone.utc),
+             tags={"period": f"{days_back}d", "total_deployments": str(len(recent_deployments))},
+             metadata={
+                 "target": self.targets["deploy_frequency_daily"],
+                 "target_met": deployments_per_day >= self.targets["deploy_frequency_daily"],
+             },
+         )
+
+         self.metrics_history.append(metric)
+         return metric
+
+     def calculate_change_failure_rate(self, days_back: int = 30) -> DORAMetric:
+         """Calculate change failure rate"""
+
+         cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
+         recent_deployments = [d for d in self.deployments if d.start_time >= cutoff_date and d.end_time]
+
+         if not recent_deployments:
+             return DORAMetric(
+                 metric_name="change_failure_rate",
+                 value=0.0,
+                 unit="percentage",
+                 timestamp=datetime.now(timezone.utc),
+                 tags={"period": f"{days_back}d", "status": "no_data"},
+             )
+
+         failed_deployments = len([d for d in recent_deployments if d.status in ["failed", "rolled_back"]])
+
+         failure_rate = failed_deployments / len(recent_deployments)
+
+         metric = DORAMetric(
+             metric_name="change_failure_rate",
+             value=failure_rate,
+             unit="percentage",
+             timestamp=datetime.now(timezone.utc),
+             tags={
+                 "period": f"{days_back}d",
+                 "total_deployments": str(len(recent_deployments)),
+                 "failed_deployments": str(failed_deployments),
+             },
+             metadata={
+                 "target": self.targets["change_failure_rate"],
+                 "target_met": failure_rate <= self.targets["change_failure_rate"],
+             },
+         )
+
+         self.metrics_history.append(metric)
+         return metric
+
+     def calculate_mttr(self, days_back: int = 30) -> DORAMetric:
+         """Calculate Mean Time to Recovery (MTTR)"""
+
+         cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
+         recent_incidents = [i for i in self.incidents if i.start_time >= cutoff_date and i.resolution_time]
+
+         if not recent_incidents:
+             return DORAMetric(
+                 metric_name="mttr",
+                 value=0.0,
+                 unit="hours",
+                 timestamp=datetime.now(timezone.utc),
+                 tags={"period": f"{days_back}d", "status": "no_data"},
+             )
+
+         # Calculate recovery times
+         recovery_times = []
+         for incident in recent_incidents:
+             if incident.resolution_time:
+                 duration = (incident.resolution_time - incident.start_time).total_seconds() / 3600  # hours
+                 recovery_times.append(duration)
+
+         avg_mttr = sum(recovery_times) / len(recovery_times) if recovery_times else 0
+
+         metric = DORAMetric(
+             metric_name="mttr",
+             value=avg_mttr,
+             unit="hours",
+             timestamp=datetime.now(timezone.utc),
+             tags={"period": f"{days_back}d", "incidents_count": str(len(recent_incidents))},
+             metadata={"target": self.targets["mttr_hours"], "target_met": avg_mttr <= self.targets["mttr_hours"]},
+         )
+
+         self.metrics_history.append(metric)
+         return metric
+
+     def calculate_hitl_metrics(self) -> Dict[str, DORAMetric]:
+         """Calculate Human-in-the-Loop specific metrics"""
+
+         metrics = {}
+
+         # Average approval time
+         if self.approval_times:
+             avg_approval_time = sum(self.approval_times) / len(self.approval_times)
+
+             metrics["approval_time"] = DORAMetric(
+                 metric_name="approval_time",
+                 value=avg_approval_time,
+                 unit="minutes",
+                 timestamp=datetime.now(timezone.utc),
+                 tags={"total_approvals": str(len(self.approval_times))},
+                 metadata={
+                     "target": self.targets["approval_time_minutes"],
+                     "target_met": avg_approval_time <= self.targets["approval_time_minutes"],
+                 },
+             )
+
+         # Workflow bottlenecks analysis
+         if self.workflow_bottlenecks:
+             bottleneck_metrics = {}
+
+             for step, times in self.workflow_bottlenecks.items():
+                 if times:
+                     avg_time = sum(times) / len(times)
+                     bottleneck_metrics[f"{step}_avg_time"] = avg_time
+
+             # Identify slowest step
+             if bottleneck_metrics:
+                 slowest_step = max(bottleneck_metrics, key=bottleneck_metrics.get)
+                 slowest_time = bottleneck_metrics[slowest_step]
+
+                 metrics["workflow_bottleneck"] = DORAMetric(
+                     metric_name="workflow_bottleneck",
+                     value=slowest_time,
+                     unit="minutes",
+                     timestamp=datetime.now(timezone.utc),
+                     tags={"bottleneck_step": slowest_step},
+                     metadata={"all_steps": bottleneck_metrics},
+                 )
+
+         return metrics
+
+     def generate_comprehensive_report(self, days_back: int = 30) -> Dict:
+         """Generate comprehensive DORA metrics report"""
+
+         logger.info(f"📊 Generating DORA metrics report for last {days_back} days")
+
+         # Calculate all DORA metrics
+         lead_time = self.calculate_lead_time(days_back)
+         deployment_freq = self.calculate_deployment_frequency(days_back)
+         failure_rate = self.calculate_change_failure_rate(days_back)
+         mttr = self.calculate_mttr(days_back)
+
+         # Calculate HITL metrics
+         hitl_metrics = self.calculate_hitl_metrics()
+
+         # Performance analysis
+         targets_met = {
+             "lead_time": lead_time.metadata.get("target_met", False),
+             "deployment_frequency": deployment_freq.metadata.get("target_met", False),
+             "change_failure_rate": failure_rate.metadata.get("target_met", False),
+             "mttr": mttr.metadata.get("target_met", False),
+         }
+
+         # Add HITL targets
+         if "approval_time" in hitl_metrics:
+             targets_met["approval_time"] = hitl_metrics["approval_time"].metadata.get("target_met", False)
+
+         overall_performance = sum(targets_met.values()) / len(targets_met) * 100
+
+         report = {
+             "report_type": "dora_metrics_comprehensive",
+             "period": f"{days_back}_days",
+             "timestamp": datetime.now(timezone.utc).isoformat(),
+             "dora_metrics": {
+                 "lead_time": asdict(lead_time),
+                 "deployment_frequency": asdict(deployment_freq),
+                 "change_failure_rate": asdict(failure_rate),
+                 "mttr": asdict(mttr),
+             },
+             "hitl_metrics": {k: asdict(v) for k, v in hitl_metrics.items()},
+             "performance_analysis": {
+                 "targets_met": targets_met,
+                 "overall_performance_percentage": overall_performance,
+                 "performance_grade": self._calculate_performance_grade(overall_performance),
+             },
+             "recommendations": self._generate_recommendations(targets_met, hitl_metrics),
+             "raw_data": {
+                 "deployments_count": len(self.deployments),
+                 "incidents_count": len(self.incidents),
+                 "approval_times_count": len(self.approval_times),
+             },
+         }
+
+         # Save report
+         report_file = self.artifacts_dir / f"dora_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+         with open(report_file, "w") as f:
+             json.dump(report, f, indent=2, default=str)
+
+         logger.info(f"✅ DORA metrics report saved to: {report_file}")
+
+         return report
+
+     def _calculate_performance_grade(self, percentage: float) -> str:
+         """Calculate performance grade based on targets met"""
+         if percentage >= 90:
+             return "A (Excellent)"
+         elif percentage >= 80:
+             return "B (Good)"
+         elif percentage >= 70:
+             return "C (Satisfactory)"
+         elif percentage >= 60:
+             return "D (Needs Improvement)"
+         else:
+             return "F (Poor)"
+
+     def _generate_recommendations(self, targets_met: Dict[str, bool], hitl_metrics: Dict) -> List[str]:
+         """Generate recommendations based on metrics analysis"""
+
+         recommendations = []
+
+         if not targets_met.get("lead_time", False):
+             recommendations.append(
+                 "🎯 Optimize lead time: Consider parallel workflows, automated testing, and faster approval processes"
+             )
+
+         if not targets_met.get("deployment_frequency", False):
+             recommendations.append(
+                 "🚀 Increase deployment frequency: Implement continuous deployment pipeline and smaller batch sizes"
+             )
+
+         if not targets_met.get("change_failure_rate", False):
+             recommendations.append(
+                 "🛡️ Reduce failure rate: Enhance testing coverage, implement canary deployments, and improve rollback procedures"
+             )
+
+         if not targets_met.get("mttr", False):
+             recommendations.append(
+                 "⚡ Improve MTTR: Enhance monitoring, implement automated incident response, and improve alerting"
+             )
+
+         if not targets_met.get("approval_time", False):
+             recommendations.append(
+                 "⏰ Optimize approval workflow: Streamline HITL processes, implement parallel approvals, and reduce approval steps"
+             )
+
+         # HITL-specific recommendations
+         if "workflow_bottleneck" in hitl_metrics:
+             bottleneck_step = hitl_metrics["workflow_bottleneck"].tags.get("bottleneck_step", "unknown")
+             recommendations.append(f"🔍 Address workflow bottleneck: Focus on optimizing '{bottleneck_step}' step")
+
+         if not recommendations:
+             recommendations.append(
+                 "✅ All targets met! Consider raising performance targets or exploring advanced optimization opportunities"
+             )
+
+         return recommendations
+
+     def export_metrics_for_visualization(self, output_file: Optional[str] = None) -> str:
+         """Export metrics in format suitable for visualization tools"""
+
+         if not output_file:
+             output_file = self.artifacts_dir / f"metrics_export_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+
+         export_data = {
+             "export_timestamp": datetime.now(timezone.utc).isoformat(),
+             "metrics_history": [asdict(m) for m in self.metrics_history],
+             "deployments": [asdict(d) for d in self.deployments],
+             "incidents": [asdict(i) for i in self.incidents],
+             "targets": self.targets,
+             "summary_stats": {
+                 "total_deployments": len(self.deployments),
+                 "successful_deployments": len([d for d in self.deployments if d.status == "success"]),
+                 "total_incidents": len(self.incidents),
+                 "resolved_incidents": len([i for i in self.incidents if i.resolution_time]),
+                 "average_approval_time": sum(self.approval_times) / len(self.approval_times)
+                 if self.approval_times
+                 else 0,
+             },
+         }
+
+         with open(output_file, "w") as f:
+             json.dump(export_data, f, indent=2, default=str)
+
+         logger.info(f"📊 Metrics exported for visualization: {output_file}")
+         return str(output_file)
+
+
+ # Async functions for integration with existing systems
+ async def simulate_dora_metrics_collection(duration_minutes: int = 5) -> Dict:
+     """Simulate DORA metrics collection for demonstration"""
+
+     engine = DORAMetricsEngine()
+
+     logger.info(f"🧪 Starting {duration_minutes}-minute DORA metrics simulation")
+
+     # Simulate deployment events
+     deployments = [
+         ("deploy-001", "production", "vpc-wrapper", "v1.2.0", "abc123", "manager"),
+         ("deploy-002", "staging", "finops-dashboard", "v2.1.0", "def456", "architect"),
+         ("deploy-003", "production", "organizations-api", "v1.0.1", "ghi789", "manager"),
+     ]
+
+     for dep_id, env, service, version, commit, approver in deployments:
+         deployment = engine.record_deployment(dep_id, env, service, version, commit, approver)
+
+         # Simulate approval time
+         approval_time = 15 + (hash(dep_id) % 30)  # 15-45 minutes
+         engine.record_approval_time(approval_time, f"{env}_deployment")
+
+         # Simulate deployment completion after short delay
+         await asyncio.sleep(1)
+
+         # 90% success rate simulation
+         status = "success" if hash(dep_id) % 10 < 9 else "failed"
+         engine.complete_deployment(dep_id, status)
+
+     # Simulate incidents
+     incidents = [
+         ("inc-001", "vpc-wrapper", "high", "Network configuration error", "deploy-001"),
+         ("inc-002", "finops-dashboard", "medium", "Query timeout", ""),
+     ]
+
+     for inc_id, service, severity, cause, caused_by in incidents:
+         incident = engine.record_incident(inc_id, service, severity, cause, caused_by)
+
+         # Simulate incident resolution
+         await asyncio.sleep(0.5)
+         detection_time = incident.start_time + timedelta(minutes=5)
+         engine.resolve_incident(inc_id, detection_time)
+
+     # Generate comprehensive report
+     report = engine.generate_comprehensive_report(days_back=7)
+
+     return report
+
+
+ if __name__ == "__main__":
+     # CLI execution
+     import argparse
+
+     parser = argparse.ArgumentParser(description="DORA Metrics Engine")
+     parser.add_argument("--simulate", action="store_true", help="Run simulation mode")
+     parser.add_argument("--duration", type=int, default=5, help="Simulation duration in minutes")
+     parser.add_argument("--output", "-o", default="./artifacts/metrics", help="Output directory for metrics")
+
+     args = parser.parse_args()
+
+     async def main():
+         if args.simulate:
+             report = await simulate_dora_metrics_collection(args.duration)
+             print("✅ DORA metrics simulation completed")
+             print(f"📊 Overall performance: {report['performance_analysis']['performance_grade']}")
+             print(
+                 f"🎯 Targets met: {sum(report['performance_analysis']['targets_met'].values())}/{len(report['performance_analysis']['targets_met'])}"
+             )
+         else:
+             engine = DORAMetricsEngine(args.output)
+             report = engine.generate_comprehensive_report()
+             print("✅ DORA metrics report generated")
+             print(f"📊 Report saved to: {engine.artifacts_dir}")
+
+     asyncio.run(main())
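The module above is the largest single addition in this release (entry 50 in the file list, +599 lines). Its public surface is small: record deployments, incidents, and HITL approval times, then ask for the four DORA metrics and a consolidated report. A minimal sketch of driving it, assuming the import path `runbooks.metrics.dora_metrics_engine` implied by the file listing (verify against the installed wheel):

```python
# Illustrative sketch only: exercises the API shown in the diff above.
from runbooks.metrics.dora_metrics_engine import DORAMetricsEngine

engine = DORAMetricsEngine(artifacts_dir="./artifacts/metrics")

# Record a deployment, then mark it complete so it counts toward
# lead time and change failure rate.
engine.record_deployment(
    deployment_id="deploy-100",
    environment="production",
    service_name="finops-dashboard",
    version="v2.1.0",
    commit_sha="abc123",
    approver="manager",
)
engine.complete_deployment("deploy-100", status="success")

# HITL approval latency feeds the approval_time metric and the
# workflow bottleneck analysis.
engine.record_approval_time(22.5, workflow_step="production_deployment")

# Writes dora_report_<timestamp>.json under artifacts_dir and returns the dict.
report = engine.generate_comprehensive_report(days_back=7)
print(report["performance_analysis"]["performance_grade"])
```

The `__main__` block also exposes a simulation mode, so `python -m runbooks.metrics.dora_metrics_engine --simulate` should exercise the same API end to end, provided the module's `..utils.logger` import resolves in the installed package.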
runbooks/operate/__init__.py
@@ -169,7 +169,7 @@ runbooks operate iam update-roles-cross-accounts --role-name deployment-role
  - **Platform Teams**: Self-service infrastructure capabilities
  - **Security Teams**: Compliance automation and policy enforcement
 
- Version: 0.7.6 - Enterprise Production Ready
+ Version: 0.7.8 - Enterprise Production Ready
  Compatibility: AWS SDK v3, Python 3.8+, Multi-deployment ready
  """
 
@@ -183,7 +183,7 @@ from runbooks.operate.s3_operations import S3Operations
  from runbooks.operate.tagging_operations import TaggingOperations
 
  # Version info
- __version__ = "0.7.6"
+ __version__ = "0.7.8"
  __author__ = "CloudOps Runbooks Team"
 
  # Public API exports
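These two hunks match the +2/-2 entry for `runbooks/operate/__init__.py` in the file list. Note that the wheel metadata moves to 0.7.9 while this module's hard-coded version string only reaches 0.7.8, so the two values can disagree on an installed copy. A quick way to see both (illustrative; assumes the hunks do belong to `runbooks/operate/__init__.py`):

```python
# Compare the wheel's dist metadata with the hard-coded module constant.
from importlib.metadata import version  # stdlib on Python 3.8+

print(version("runbooks"))  # dist metadata: "0.7.9" for this wheel

import runbooks.operate  # assumption: the hunks above are its __init__.py

print(runbooks.operate.__version__)  # module constant: "0.7.8" per the diff
```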