runbooks 0.7.9__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- runbooks/__init__.py +1 -1
- runbooks/cfat/README.md +12 -1
- runbooks/cfat/__init__.py +1 -1
- runbooks/cfat/assessment/compliance.py +4 -1
- runbooks/cfat/assessment/runner.py +42 -34
- runbooks/cfat/models.py +1 -1
- runbooks/cloudops/__init__.py +123 -0
- runbooks/cloudops/base.py +385 -0
- runbooks/cloudops/cost_optimizer.py +811 -0
- runbooks/cloudops/infrastructure_optimizer.py +29 -0
- runbooks/cloudops/interfaces.py +828 -0
- runbooks/cloudops/lifecycle_manager.py +29 -0
- runbooks/cloudops/mcp_cost_validation.py +678 -0
- runbooks/cloudops/models.py +251 -0
- runbooks/cloudops/monitoring_automation.py +29 -0
- runbooks/cloudops/notebook_framework.py +676 -0
- runbooks/cloudops/security_enforcer.py +449 -0
- runbooks/common/__init__.py +152 -0
- runbooks/common/accuracy_validator.py +1039 -0
- runbooks/common/context_logger.py +440 -0
- runbooks/common/cross_module_integration.py +594 -0
- runbooks/common/enhanced_exception_handler.py +1108 -0
- runbooks/common/enterprise_audit_integration.py +634 -0
- runbooks/common/mcp_cost_explorer_integration.py +900 -0
- runbooks/common/mcp_integration.py +548 -0
- runbooks/common/performance_monitor.py +387 -0
- runbooks/common/profile_utils.py +216 -0
- runbooks/common/rich_utils.py +172 -1
- runbooks/feedback/user_feedback_collector.py +440 -0
- runbooks/finops/README.md +377 -458
- runbooks/finops/__init__.py +4 -21
- runbooks/finops/account_resolver.py +279 -0
- runbooks/finops/accuracy_cross_validator.py +638 -0
- runbooks/finops/aws_client.py +721 -36
- runbooks/finops/budget_integration.py +313 -0
- runbooks/finops/cli.py +59 -5
- runbooks/finops/cost_optimizer.py +1340 -0
- runbooks/finops/cost_processor.py +211 -37
- runbooks/finops/dashboard_router.py +900 -0
- runbooks/finops/dashboard_runner.py +990 -232
- runbooks/finops/embedded_mcp_validator.py +288 -0
- runbooks/finops/enhanced_dashboard_runner.py +8 -7
- runbooks/finops/enhanced_progress.py +327 -0
- runbooks/finops/enhanced_trend_visualization.py +423 -0
- runbooks/finops/finops_dashboard.py +184 -1829
- runbooks/finops/helpers.py +509 -196
- runbooks/finops/iam_guidance.py +400 -0
- runbooks/finops/markdown_exporter.py +466 -0
- runbooks/finops/multi_dashboard.py +1502 -0
- runbooks/finops/optimizer.py +15 -15
- runbooks/finops/profile_processor.py +2 -2
- runbooks/finops/runbooks.inventory.organizations_discovery.log +0 -0
- runbooks/finops/runbooks.security.report_generator.log +0 -0
- runbooks/finops/runbooks.security.run_script.log +0 -0
- runbooks/finops/runbooks.security.security_export.log +0 -0
- runbooks/finops/schemas.py +589 -0
- runbooks/finops/service_mapping.py +195 -0
- runbooks/finops/single_dashboard.py +710 -0
- runbooks/finops/tests/test_reference_images_validation.py +1 -1
- runbooks/inventory/README.md +12 -1
- runbooks/inventory/core/collector.py +157 -29
- runbooks/inventory/list_ec2_instances.py +9 -6
- runbooks/inventory/list_ssm_parameters.py +10 -10
- runbooks/inventory/organizations_discovery.py +210 -164
- runbooks/inventory/rich_inventory_display.py +74 -107
- runbooks/inventory/run_on_multi_accounts.py +13 -13
- runbooks/inventory/runbooks.inventory.organizations_discovery.log +0 -0
- runbooks/inventory/runbooks.security.security_export.log +0 -0
- runbooks/main.py +1371 -240
- runbooks/metrics/dora_metrics_engine.py +711 -17
- runbooks/monitoring/performance_monitor.py +433 -0
- runbooks/operate/README.md +394 -0
- runbooks/operate/base.py +215 -47
- runbooks/operate/ec2_operations.py +435 -5
- runbooks/operate/iam_operations.py +598 -3
- runbooks/operate/privatelink_operations.py +1 -1
- runbooks/operate/rds_operations.py +508 -0
- runbooks/operate/s3_operations.py +508 -0
- runbooks/operate/vpc_endpoints.py +1 -1
- runbooks/remediation/README.md +489 -13
- runbooks/remediation/base.py +5 -3
- runbooks/remediation/commons.py +8 -4
- runbooks/security/ENTERPRISE_SECURITY_FRAMEWORK.md +506 -0
- runbooks/security/README.md +12 -1
- runbooks/security/__init__.py +265 -33
- runbooks/security/cloudops_automation_security_validator.py +1164 -0
- runbooks/security/compliance_automation.py +12 -10
- runbooks/security/compliance_automation_engine.py +1021 -0
- runbooks/security/enterprise_security_framework.py +930 -0
- runbooks/security/enterprise_security_policies.json +293 -0
- runbooks/security/executive_security_dashboard.py +1247 -0
- runbooks/security/integration_test_enterprise_security.py +879 -0
- runbooks/security/module_security_integrator.py +641 -0
- runbooks/security/multi_account_security_controls.py +2254 -0
- runbooks/security/real_time_security_monitor.py +1196 -0
- runbooks/security/report_generator.py +1 -1
- runbooks/security/run_script.py +4 -8
- runbooks/security/security_baseline_tester.py +39 -52
- runbooks/security/security_export.py +99 -120
- runbooks/sre/README.md +472 -0
- runbooks/sre/__init__.py +33 -0
- runbooks/sre/mcp_reliability_engine.py +1049 -0
- runbooks/sre/performance_optimization_engine.py +1032 -0
- runbooks/sre/production_monitoring_framework.py +584 -0
- runbooks/sre/reliability_monitoring_framework.py +1011 -0
- runbooks/validation/__init__.py +2 -2
- runbooks/validation/benchmark.py +154 -149
- runbooks/validation/cli.py +159 -147
- runbooks/validation/mcp_validator.py +291 -248
- runbooks/vpc/README.md +478 -0
- runbooks/vpc/__init__.py +2 -2
- runbooks/vpc/manager_interface.py +366 -351
- runbooks/vpc/networking_wrapper.py +68 -36
- runbooks/vpc/rich_formatters.py +22 -8
- runbooks-0.9.1.dist-info/METADATA +308 -0
- {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/RECORD +120 -59
- {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/entry_points.txt +1 -1
- runbooks/finops/cross_validation.py +0 -375
- runbooks-0.7.9.dist-info/METADATA +0 -636
- {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/WHEEL +0 -0
- {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/licenses/LICENSE +0 -0
- {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/top_level.txt +0 -0
runbooks/metrics/dora_metrics_engine.py
@@ -70,11 +70,23 @@ class IncidentEvent:
 
 
 class DORAMetricsEngine:
-    """
+    """
+    Enhanced DORA metrics collection and analysis engine for Enterprise SRE.
+
+    Provides comprehensive DORA metrics (Lead Time, Deploy Frequency, MTTR, Change Failure Rate)
+    with real-time collection, automated alerting, and enterprise dashboard integration.
+
+    Features:
+    - Real-time metrics streaming from git operations
+    - Automated deployment event capture via GitHub webhooks
+    - CloudWatch/Datadog integration for enterprise monitoring
+    - Cross-session persistence with baseline trending
+    - SLA compliance tracking with automated alerting
+    """
 
     def __init__(self, artifacts_dir: str = "./artifacts/metrics", cross_validation_tolerance: float = 15.0):
         """
-        Initialize DORA metrics engine
+        Initialize enterprise DORA metrics engine
 
         Args:
             artifacts_dir: Directory to store metrics artifacts
@@ -83,27 +95,51 @@ class DORAMetricsEngine:
         self.artifacts_dir = Path(artifacts_dir)
         self.artifacts_dir.mkdir(parents=True, exist_ok=True)
 
+        # Create SRE-focused subdirectories
+        (self.artifacts_dir / "dora-reports").mkdir(exist_ok=True)
+        (self.artifacts_dir / "baselines").mkdir(exist_ok=True)
+        (self.artifacts_dir / "alerts").mkdir(exist_ok=True)
+        (self.artifacts_dir / "dashboards").mkdir(exist_ok=True)
+
         self.tolerance = cross_validation_tolerance
 
-        # Metrics storage
+        # Metrics storage with persistence
         self.deployments: List[DeploymentEvent] = []
         self.incidents: List[IncidentEvent] = []
         self.metrics_history: List[DORAMetric] = []
+        self.baselines: Dict[str, float] = {}
 
         # HITL workflow metrics
         self.approval_times: List[float] = []
         self.workflow_bottlenecks: Dict[str, List[float]] = {}
 
-        #
+        # Enterprise SRE performance targets (FAANG SDLC standards)
         self.targets = {
-            "lead_time_hours": 4,  # <4 hours
+            "lead_time_hours": 4,  # <4 hours (FAANG velocity)
             "deploy_frequency_daily": 1,  # Daily deployment capability
-            "change_failure_rate": 0.05,  # <5%
-            "mttr_hours": 1,  # <1 hour
-            "approval_time_minutes": 30,  # <30 minutes
-            "success_rate": 0.95,  # >95%
+            "change_failure_rate": 0.05,  # <5% (FAANG quality)
+            "mttr_hours": 1,  # <1 hour (SRE excellence)
+            "approval_time_minutes": 30,  # <30 minutes (HITL efficiency)
+            "success_rate": 0.95,  # >95% (Enterprise reliability)
+            "sla_availability": 0.999,  # >99.9% uptime
+            "performance_score": 90,  # >90% performance score
         }
 
+        # SRE alerting thresholds
+        self.alert_thresholds = {
+            "lead_time_hours": 6,  # Alert if >6 hours
+            "deploy_frequency_daily": 0.5,  # Alert if <0.5 deploys/day
+            "change_failure_rate": 0.10,  # Alert if >10%
+            "mttr_hours": 2,  # Alert if >2 hours
+            "approval_time_minutes": 60,  # Alert if >60 minutes
+        }
+
+        # Load existing data
+        self._load_persistent_data()
+
+        # Initialize baseline metrics if not exists
+        self._initialize_baselines()
+
     def record_deployment(
         self,
         deployment_id: str,
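The rewritten `__init__` pairs `self.targets` (goal values) with looser `self.alert_thresholds` that trip alerts before a target is formally missed. Note that the comparison direction flips for deployment frequency, where a lower value is the bad case. A minimal sketch (not from the package) of how such threshold gating can work:

```python
# Minimal sketch: gating alerts on the alert_thresholds added above.
# `breaches_threshold` is a hypothetical helper, not a package function.

ALERT_THRESHOLDS = {
    "lead_time_hours": 6,            # alert if above
    "deploy_frequency_daily": 0.5,   # alert if below (higher is better)
    "change_failure_rate": 0.10,     # alert if above
    "mttr_hours": 2,                 # alert if above
}

# Metrics where a larger value is better, so the comparison flips.
HIGHER_IS_BETTER = {"deploy_frequency_daily"}

def breaches_threshold(name: str, value: float) -> bool:
    """Return True when `value` crosses the alerting threshold for `name`."""
    limit = ALERT_THRESHOLDS[name]
    if name in HIGHER_IS_BETTER:
        return value < limit
    return value > limit

assert breaches_threshold("lead_time_hours", 7.5)          # too slow
assert breaches_threshold("deploy_frequency_daily", 0.2)   # too infrequent
assert not breaches_threshold("mttr_hours", 1.5)           # within bounds
```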
@@ -376,10 +412,261 @@ class DORAMetricsEngine:
 
         return metrics
 
+    def _load_persistent_data(self) -> None:
+        """Load persistent DORA data from storage."""
+        try:
+            # Load deployments
+            deployments_file = self.artifacts_dir / "deployments.json"
+            if deployments_file.exists():
+                with open(deployments_file, "r") as f:
+                    data = json.load(f)
+                    self.deployments = [DeploymentEvent(**item) for item in data.get("deployments", [])]
+
+            # Load incidents
+            incidents_file = self.artifacts_dir / "incidents.json"
+            if incidents_file.exists():
+                with open(incidents_file, "r") as f:
+                    data = json.load(f)
+                    self.incidents = [IncidentEvent(**item) for item in data.get("incidents", [])]
+
+            # Load baselines
+            baselines_file = self.artifacts_dir / "baselines" / "current_baselines.json"
+            if baselines_file.exists():
+                with open(baselines_file, "r") as f:
+                    self.baselines = json.load(f)
+
+            logger.info(f"📊 Loaded {len(self.deployments)} deployments, {len(self.incidents)} incidents")
+
+        except Exception as e:
+            logger.warning(f"⚠️ Failed to load persistent data: {e}")
+
+    def _save_persistent_data(self) -> None:
+        """Save persistent DORA data to storage."""
+        try:
+            # Save deployments
+            deployments_data = {
+                "deployments": [asdict(d) for d in self.deployments],
+                "last_updated": datetime.now(timezone.utc).isoformat(),
+            }
+            deployments_file = self.artifacts_dir / "deployments.json"
+            with open(deployments_file, "w") as f:
+                json.dump(deployments_data, f, indent=2, default=str)
+
+            # Save incidents
+            incidents_data = {
+                "incidents": [asdict(i) for i in self.incidents],
+                "last_updated": datetime.now(timezone.utc).isoformat(),
+            }
+            incidents_file = self.artifacts_dir / "incidents.json"
+            with open(incidents_file, "w") as f:
+                json.dump(incidents_data, f, indent=2, default=str)
+
+            # Save baselines
+            baselines_file = self.artifacts_dir / "baselines" / "current_baselines.json"
+            with open(baselines_file, "w") as f:
+                json.dump(self.baselines, f, indent=2)
+
+        except Exception as e:
+            logger.error(f"❌ Failed to save persistent data: {e}")
+
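The persistence pair relies on the event types being dataclasses: `asdict()` flattens them for JSON, `DeploymentEvent(**item)` rebuilds them on load, and `default=str` stringifies datetimes on the way out. A self-contained sketch of that round-trip, using a stand-in `Event` dataclass rather than the package's types:

```python
# Sketch of the asdict()/**kwargs JSON round-trip the persistence methods use.
# `Event` is a stand-in, not the package's DeploymentEvent.
import json
from dataclasses import asdict, dataclass
from datetime import datetime, timezone

@dataclass
class Event:
    event_id: str
    timestamp: datetime

events = [Event("deploy-1", datetime.now(timezone.utc))]

# Save: default=str turns the datetime into an ISO-like string.
blob = json.dumps({"events": [asdict(e) for e in events]}, default=str)

# Load: field names map straight back through **kwargs...
restored = [Event(**item) for item in json.loads(blob)["events"]]

# ...but the timestamp comes back as a plain string unless re-parsed.
assert isinstance(restored[0].timestamp, str)
```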
+    def _initialize_baselines(self) -> None:
+        """Initialize baseline metrics for trending analysis."""
+        if not self.baselines and len(self.deployments) > 10:
+            # Calculate initial baselines from historical data
+            lead_time_metric = self.calculate_lead_time(30)
+            deploy_freq_metric = self.calculate_deployment_frequency(30)
+            failure_rate_metric = self.calculate_change_failure_rate(30)
+            mttr_metric = self.calculate_mttr(30)
+
+            self.baselines = {
+                "lead_time_hours": lead_time_metric.value,
+                "deploy_frequency_daily": deploy_freq_metric.value,
+                "change_failure_rate": failure_rate_metric.value,
+                "mttr_hours": mttr_metric.value,
+                "baseline_established": datetime.now(timezone.utc).isoformat(),
+                "sample_size": len(self.deployments),
+            }
+
+            logger.info("📈 Established baseline metrics from historical data")
+            self._save_persistent_data()
+
+    def track_git_deployment(
+        self, commit_sha: str, branch: str = "main", author: str = "", message: str = ""
+    ) -> DeploymentEvent:
+        """
+        Track deployment from git operations for automated DORA collection.
+
+        Args:
+            commit_sha: Git commit SHA
+            branch: Git branch name
+            author: Commit author
+            message: Commit message
+
+        Returns:
+            Created deployment event
+        """
+        deployment_id = f"git-{commit_sha[:8]}-{int(time.time())}"
+
+        deployment = self.record_deployment(
+            deployment_id=deployment_id,
+            environment="production" if branch == "main" else "development",
+            service_name="runbooks",
+            version=commit_sha[:8],
+            commit_sha=commit_sha,
+            approver=author,
+        )
+
+        # Add git metadata
+        deployment.metadata = {
+            "branch": branch,
+            "author": author,
+            "message": message,
+            "automated": True,
+            "source": "git_integration",
+        }
+
+        logger.info(f"🔗 Git deployment tracked: {commit_sha[:8]} on {branch}")
+
+        # Auto-save after git integration
+        self._save_persistent_data()
+
+        return deployment
+
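`track_git_deployment` expects the caller to supply the commit metadata. A hypothetical caller (not shipped with the package) could pull those fields from the local checkout:

```python
# Hypothetical caller for track_git_deployment(): feed it the current HEAD.
# Assumes a git checkout; the engine calls are shown commented because they
# require the installed package.
import subprocess

def git(*args: str) -> str:
    return subprocess.check_output(["git", *args], text=True).strip()

commit_sha = git("rev-parse", "HEAD")
branch = git("rev-parse", "--abbrev-ref", "HEAD")
author = git("log", "-1", "--format=%an")
message = git("log", "-1", "--format=%s")

# engine = DORAMetricsEngine()
# deployment = engine.track_git_deployment(commit_sha, branch, author, message)
# The environment is derived inside: "production" for main, else "development".
```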
+    def detect_performance_incident(
+        self, module: str, operation: str, execution_time: float, threshold: float
+    ) -> Optional[IncidentEvent]:
+        """
+        Automatically detect and record performance incidents.
+
+        Args:
+            module: Module name (e.g., 'finops', 'inventory')
+            operation: Operation name
+            execution_time: Actual execution time
+            threshold: Performance threshold
+
+        Returns:
+            Created incident if threshold exceeded, None otherwise
+        """
+        if execution_time <= threshold:
+            return None
+
+        incident_id = f"perf-{module}-{int(time.time())}"
+        severity = "critical" if execution_time > threshold * 2 else "high"
+
+        incident = self.record_incident(
+            incident_id=incident_id,
+            service_name=module,
+            severity=severity,
+            root_cause=f"Performance degradation: {operation} took {execution_time:.2f}s (threshold: {threshold:.2f}s)",
+        )
+
+        # Add performance metadata
+        incident.metadata = {
+            "operation": operation,
+            "execution_time": execution_time,
+            "threshold": threshold,
+            "degradation_factor": execution_time / threshold,
+            "automated_detection": True,
+        }
+
+        logger.warning(f"🚨 Performance incident detected: {incident_id}")
+
+        # Generate real-time alert
+        self._generate_sre_alert(incident, execution_time, threshold)
+
+        return incident
+
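The severity rule embedded above is worth isolating: no incident at or below the threshold, "high" above it, and "critical" once execution time exceeds twice the threshold. A minimal restatement:

```python
# The severity rule used by detect_performance_incident, isolated.
from typing import Optional

def classify(execution_time: float, threshold: float) -> Optional[str]:
    if execution_time <= threshold:
        return None
    return "critical" if execution_time > threshold * 2 else "high"

assert classify(25.0, 30.0) is None
assert classify(45.0, 30.0) == "high"      # degradation_factor 1.5
assert classify(75.0, 30.0) == "critical"  # degradation_factor 2.5
```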
+    def _generate_sre_alert(self, incident: IncidentEvent, execution_time: float, threshold: float) -> None:
+        """Generate SRE-focused performance alert."""
+        alert_data = {
+            "timestamp": datetime.now(timezone.utc).isoformat(),
+            "alert_type": "sre_performance_degradation",
+            "incident_id": incident.incident_id,
+            "service": incident.service_name,
+            "severity": incident.severity,
+            "execution_time": execution_time,
+            "threshold": threshold,
+            "degradation_factor": execution_time / threshold,
+            "impact": "user_experience" if execution_time > threshold * 1.5 else "performance_sla",
+            "recommended_actions": [
+                "Check system resource utilization",
+                "Review recent deployments for correlation",
+                "Validate AWS API rate limiting",
+                "Consider auto-scaling triggers",
+            ],
+        }
+
+        # Save alert to artifacts
+        alert_file = self.artifacts_dir / "alerts" / f"sre_alert_{incident.incident_id}.json"
+        with open(alert_file, "w") as f:
+            json.dump(alert_data, f, indent=2, default=str)
+
+        logger.critical(f"🚨 SRE Alert generated: {alert_file}")
+
+    def calculate_sla_compliance(self, days_back: int = 30) -> Dict[str, DORAMetric]:
+        """
+        Calculate SLA compliance metrics for enterprise reporting.
+
+        Args:
+            days_back: Number of days to analyze
+
+        Returns:
+            Dictionary of SLA compliance metrics
+        """
+        sla_metrics = {}
+
+        # Calculate availability SLA (based on incident downtime)
+        cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
+        recent_incidents = [i for i in self.incidents if i.start_time >= cutoff_date]
+
+        total_downtime_hours = 0
+        for incident in recent_incidents:
+            if incident.resolution_time and incident.severity in ["critical", "high"]:
+                downtime = (incident.resolution_time - incident.start_time).total_seconds() / 3600
+                total_downtime_hours += downtime
+
+        total_hours = days_back * 24
+        availability = max(0, (total_hours - total_downtime_hours) / total_hours)
+
+        sla_metrics["availability"] = DORAMetric(
+            metric_name="availability_sla",
+            value=availability,
+            unit="percentage",
+            timestamp=datetime.now(timezone.utc),
+            tags={"period": f"{days_back}d", "incidents": str(len(recent_incidents))},
+            metadata={
+                "target": self.targets["sla_availability"],
+                "target_met": availability >= self.targets["sla_availability"],
+                "downtime_hours": total_downtime_hours,
+            },
+        )
+
+        # Performance SLA (based on operation execution times)
+        performance_scores = []
+        for metric in self.metrics_history:
+            if metric.metadata and "performance_score" in metric.metadata:
+                performance_scores.append(metric.metadata["performance_score"])
+
+        avg_performance = sum(performance_scores) / len(performance_scores) if performance_scores else 0
+
+        sla_metrics["performance"] = DORAMetric(
+            metric_name="performance_sla",
+            value=avg_performance,
+            unit="percentage",
+            timestamp=datetime.now(timezone.utc),
+            tags={"sample_size": str(len(performance_scores))},
+            metadata={
+                "target": self.targets["performance_score"],
+                "target_met": avg_performance >= self.targets["performance_score"],
+            },
+        )
+
+        return sla_metrics
+
     def generate_comprehensive_report(self, days_back: int = 30) -> Dict:
-        """Generate comprehensive DORA metrics report"""
+        """Generate comprehensive DORA metrics report with SRE enhancements"""
 
-        logger.info(f"📊 Generating DORA metrics report for last {days_back} days")
+        logger.info(f"📊 Generating enterprise DORA metrics report for last {days_back} days")
 
         # Calculate all DORA metrics
         lead_time = self.calculate_lead_time(days_back)
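For scale, the availability formula in `calculate_sla_compliance` turns a 30-day window into 720 hours, so the 99.9% target leaves a downtime budget of roughly 43 minutes. A worked example with one resolved 4-hour critical incident:

```python
# Worked example of the availability formula above: a 30-day window is
# 720 hours, so a single 4-hour critical outage yields ~99.44% availability,
# below the 0.999 (three nines) target, which allows only ~43 minutes.
days_back = 30
total_hours = days_back * 24          # 720
total_downtime_hours = 4.0            # one resolved critical incident

availability = max(0, (total_hours - total_downtime_hours) / total_hours)
print(f"{availability:.4%}")          # 99.4444%
print(availability >= 0.999)          # False -> target_met is False
```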
@@ -390,7 +677,10 @@ class DORAMetricsEngine:
         # Calculate HITL metrics
         hitl_metrics = self.calculate_hitl_metrics()
 
-        #
+        # Calculate SLA compliance metrics
+        sla_metrics = self.calculate_sla_compliance(days_back)
+
+        # Performance analysis with enhanced SRE targets
         targets_met = {
             "lead_time": lead_time.metadata.get("target_met", False),
             "deployment_frequency": deployment_freq.metadata.get("target_met", False),
@@ -402,10 +692,36 @@ class DORAMetricsEngine:
         if "approval_time" in hitl_metrics:
             targets_met["approval_time"] = hitl_metrics["approval_time"].metadata.get("target_met", False)
 
+        # Add SLA targets
+        for metric_name, metric in sla_metrics.items():
+            targets_met[f"sla_{metric_name}"] = metric.metadata.get("target_met", False)
+
         overall_performance = sum(targets_met.values()) / len(targets_met) * 100
 
+        # Calculate trend analysis vs baselines
+        trend_analysis = {}
+        if self.baselines:
+            for metric_name, current_value in [
+                ("lead_time_hours", lead_time.value),
+                ("deploy_frequency_daily", deployment_freq.value),
+                ("change_failure_rate", failure_rate.value),
+                ("mttr_hours", mttr.value),
+            ]:
+                baseline = self.baselines.get(metric_name, current_value)
+                if baseline > 0:
+                    trend_percentage = ((current_value - baseline) / baseline) * 100
+                    trend_analysis[metric_name] = {
+                        "current": current_value,
+                        "baseline": baseline,
+                        "trend_percentage": trend_percentage,
+                        "improving": trend_percentage < 0
+                        if metric_name != "deploy_frequency_daily"
+                        else trend_percentage > 0,
+                    }
+
         report = {
-            "report_type": "
+            "report_type": "dora_metrics_enterprise_sre",
+            "version": "2.0",
             "period": f"{days_back}_days",
             "timestamp": datetime.now(timezone.utc).isoformat(),
             "dora_metrics": {
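The trend computation treats a negative percentage as an improvement for every metric except deployment frequency, where higher is better. Two worked cases:

```python
# Worked example of the trend computation above. Lead time fell from a
# 4.0h baseline to 3.0h: the trend is -25%, which counts as improving.
current, baseline = 3.0, 4.0
trend_percentage = ((current - baseline) / baseline) * 100
print(trend_percentage)  # -25.0

# For deploy_frequency_daily the sign flips: 1.2 deploys/day against a
# 1.0 baseline is +20%, and the positive direction is the improving one.
freq_trend = ((1.2 - 1.0) / 1.0) * 100
print(round(freq_trend, 1))  # 20.0
```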
@@ -414,26 +730,70 @@ class DORAMetricsEngine:
                 "change_failure_rate": asdict(failure_rate),
                 "mttr": asdict(mttr),
             },
+            "sla_metrics": {k: asdict(v) for k, v in sla_metrics.items()},
             "hitl_metrics": {k: asdict(v) for k, v in hitl_metrics.items()},
             "performance_analysis": {
                 "targets_met": targets_met,
                 "overall_performance_percentage": overall_performance,
                 "performance_grade": self._calculate_performance_grade(overall_performance),
+                "sla_compliance_score": sum(1 for k, v in targets_met.items() if k.startswith("sla_") and v)
+                / max(1, sum(1 for k in targets_met.keys() if k.startswith("sla_")))
+                * 100,
+            },
+            "trend_analysis": trend_analysis,
+            "baseline_comparison": self.baselines,
+            "recommendations": self._generate_sre_recommendations(
+                targets_met, hitl_metrics, sla_metrics, trend_analysis
+            ),
+            "alerts_summary": {
+                "active_alerts": len(
+                    [
+                        f
+                        for f in (self.artifacts_dir / "alerts").glob("*.json")
+                        if f.stat().st_mtime > time.time() - 86400
+                    ]
+                ),
+                "performance_incidents": len(
+                    [
+                        i
+                        for i in self.incidents
+                        if i.start_time >= datetime.now(timezone.utc) - timedelta(days=days_back)
+                        and "performance" in i.root_cause.lower()
+                    ]
+                ),
+                "sre_health_score": overall_performance,
             },
-            "recommendations": self._generate_recommendations(targets_met, hitl_metrics),
             "raw_data": {
                 "deployments_count": len(self.deployments),
                 "incidents_count": len(self.incidents),
                 "approval_times_count": len(self.approval_times),
+                "automation_rate": len(
+                    [d for d in self.deployments if getattr(d, "metadata", {}).get("automated", False)]
+                )
+                / max(1, len(self.deployments))
+                * 100,
             },
         }
 
-        # Save report
-
+        # Save enhanced report to SRE reports directory
+        sre_reports_dir = self.artifacts_dir.parent / "sre-reports"
+        sre_reports_dir.mkdir(exist_ok=True)
+
+        report_file = sre_reports_dir / f"dora_enterprise_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
         with open(report_file, "w") as f:
             json.dump(report, f, indent=2, default=str)
 
-
+        # Also save to metrics directory for backward compatibility
+        legacy_report_file = (
+            self.artifacts_dir / "dora-reports" / f"dora_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+        )
+        with open(legacy_report_file, "w") as f:
+            json.dump(report, f, indent=2, default=str)
+
+        logger.info(f"✅ Enterprise DORA metrics report saved to: {report_file}")
+
+        # Auto-save persistent data after report generation
+        self._save_persistent_data()
 
         return report
 
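Both `sla_compliance_score` and `automation_rate` guard their denominators with `max(1, …)`, so a freshly initialized engine reports 0 rather than raising `ZeroDivisionError`. The pattern in isolation:

```python
# The max(1, ...)-guarded ratio pattern used for sla_compliance_score and
# automation_rate above: an empty data set degrades to 0.0 instead of raising.
deployments: list[dict] = []  # stand-in for self.deployments

automated = [d for d in deployments if d.get("automated", False)]
print(len(automated) / max(1, len(deployments)) * 100)  # 0.0, no error

deployments = [{"automated": True}, {"automated": False}]
automated = [d for d in deployments if d.get("automated", False)]
print(len(automated) / max(1, len(deployments)) * 100)  # 50.0
```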
@@ -450,6 +810,98 @@ class DORAMetricsEngine:
         else:
             return "F (Poor)"
 
+    def _generate_sre_recommendations(
+        self, targets_met: Dict[str, bool], hitl_metrics: Dict, sla_metrics: Dict, trend_analysis: Dict
+    ) -> List[str]:
+        """Generate enhanced SRE-focused recommendations based on comprehensive metrics analysis"""
+
+        recommendations = []
+
+        # DORA metrics recommendations
+        if not targets_met.get("lead_time", False):
+            recommendations.append(
+                "🎯 **Lead Time Optimization**: Implement parallel CI/CD workflows, automate testing pipelines, "
+                "and establish fast-track approval processes for low-risk changes"
+            )
+
+        if not targets_met.get("deployment_frequency", False):
+            recommendations.append(
+                "🚀 **Deployment Frequency Enhancement**: Adopt continuous deployment patterns, implement "
+                "feature flags, and establish canary deployment strategies for risk mitigation"
+            )
+
+        if not targets_met.get("change_failure_rate", False):
+            recommendations.append(
+                "🛡️ **Change Failure Rate Reduction**: Enhance pre-production testing, implement progressive "
+                "rollouts, improve monitoring coverage, and establish automated rollback triggers"
+            )
+
+        if not targets_met.get("mttr", False):
+            recommendations.append(
+                "⚡ **MTTR Improvement**: Implement automated incident detection, enhance observability stack, "
+                "establish runbook automation, and improve on-call response procedures"
+            )
+
+        # SLA compliance recommendations
+        if not targets_met.get("sla_availability", False):
+            recommendations.append(
+                "🔒 **Availability SLA Recovery**: Implement chaos engineering practices, enhance redundancy, "
+                "improve failover mechanisms, and establish proactive monitoring alerts"
+            )
+
+        if not targets_met.get("sla_performance", False):
+            recommendations.append(
+                "📈 **Performance SLA Enhancement**: Optimize critical path operations, implement caching strategies, "
+                "enhance resource allocation, and establish performance regression testing"
+            )
+
+        # HITL workflow optimization
+        if not targets_met.get("approval_time", False):
+            recommendations.append(
+                "⏰ **Approval Workflow Optimization**: Implement risk-based approval routing, establish "
+                "parallel approval processes, and create self-service deployment capabilities for low-risk changes"
+            )
+
+        # Trend analysis recommendations
+        if trend_analysis:
+            declining_metrics = [k for k, v in trend_analysis.items() if not v.get("improving", True)]
+            if declining_metrics:
+                recommendations.append(
+                    f"📊 **Trend Alert**: Declining performance detected in {', '.join(declining_metrics)}. "
+                    f"Implement immediate performance improvement initiatives and establish regression prevention measures"
+                )
+
+        # Proactive SRE recommendations based on patterns
+        if hitl_metrics.get("workflow_bottleneck"):
+            bottleneck_step = hitl_metrics["workflow_bottleneck"].tags.get("bottleneck_step", "unknown")
+            recommendations.append(
+                f"🔍 **Workflow Bottleneck Resolution**: Primary bottleneck identified in '{bottleneck_step}' step. "
+                f"Implement automation, parallel processing, or resource scaling for this workflow stage"
+            )
+
+        # Automation recommendations
+        automation_rate = targets_met.get("automation_rate", 0)
+        if automation_rate < 80:
+            recommendations.append(
+                "🤖 **Automation Enhancement**: Current automation rate below target. Implement GitOps workflows, "
+                "automated testing pipelines, and self-healing infrastructure patterns"
+            )
+
+        # Advanced SRE practices
+        if len([k for k, v in targets_met.items() if v]) / len(targets_met) < 0.8:
+            recommendations.append(
+                "🎯 **SRE Maturity Enhancement**: Consider implementing advanced SRE practices: error budgets, "
+                "SLI/SLO management, chaos engineering, and customer-centric reliability metrics"
+            )
+
+        if not recommendations:
+            recommendations.append(
+                "✅ **Excellence Achieved**: All SRE targets met! Consider advanced optimization: predictive scaling, "
+                "AI-powered incident response, and continuous reliability improvement programs"
+            )
+
+        return recommendations
+
     def _generate_recommendations(self, targets_met: Dict[str, bool], hitl_metrics: Dict) -> List[str]:
         """Generate recommendations based on metrics analysis"""
 
@@ -521,6 +973,248 @@ class DORAMetricsEngine:
         logger.info(f"📊 Metrics exported for visualization: {output_file}")
         return str(output_file)
 
+    def generate_sre_dashboard(self, days_back: int = 30) -> Dict:
+        """
+        Generate comprehensive SRE dashboard data for visualization tools.
+
+        Args:
+            days_back: Number of days to analyze for dashboard
+
+        Returns:
+            Dashboard data structure optimized for SRE tools (Datadog, Grafana, etc.)
+        """
+        logger.info(f"📊 Generating SRE dashboard data for {days_back} days")
+
+        # Get comprehensive report data
+        report = self.generate_comprehensive_report(days_back)
+
+        # Format for SRE dashboard tools
+        dashboard_data = {
+            "dashboard_type": "sre_dora_metrics",
+            "generated_at": datetime.now(timezone.utc).isoformat(),
+            "time_range_days": days_back,
+            # Key Performance Indicators (KPIs) for executive view
+            "kpi_summary": {
+                "overall_performance_score": report["performance_analysis"]["overall_performance_percentage"],
+                "sla_compliance_score": report["performance_analysis"]["sla_compliance_score"],
+                "dora_metrics_health": len(
+                    [
+                        k
+                        for k, v in report["performance_analysis"]["targets_met"].items()
+                        if not k.startswith("sla_") and v
+                    ]
+                )
+                / 4
+                * 100,
+                "active_incidents": len(
+                    [
+                        i
+                        for i in self.incidents
+                        if i.start_time >= datetime.now(timezone.utc) - timedelta(days=1) and not i.resolution_time
+                    ]
+                ),
+                "automation_percentage": report["raw_data"]["automation_rate"],
+            },
+            # Time series data for trending
+            "time_series": {
+                "lead_time": [
+                    {"timestamp": m.timestamp.isoformat(), "value": m.value}
+                    for m in self.metrics_history
+                    if m.metric_name == "lead_time"
+                ][-30:],  # Last 30 data points
+                "deployment_frequency": [
+                    {"timestamp": m.timestamp.isoformat(), "value": m.value}
+                    for m in self.metrics_history
+                    if m.metric_name == "deployment_frequency"
+                ][-30:],
+                "change_failure_rate": [
+                    {"timestamp": m.timestamp.isoformat(), "value": m.value * 100}  # Convert to percentage
+                    for m in self.metrics_history
+                    if m.metric_name == "change_failure_rate"
+                ][-30:],
+                "mttr": [
+                    {"timestamp": m.timestamp.isoformat(), "value": m.value}
+                    for m in self.metrics_history
+                    if m.metric_name == "mttr"
+                ][-30:],
+            },
+            # Alert and incident summary
+            "alerts_incidents": {
+                "recent_alerts": len(
+                    [
+                        f
+                        for f in (self.artifacts_dir / "alerts").glob("*.json")
+                        if f.stat().st_mtime > time.time() - 86400
+                    ]
+                ),
+                "incident_severity_breakdown": {
+                    "critical": len(
+                        [
+                            i
+                            for i in self.incidents
+                            if i.severity == "critical"
+                            and i.start_time >= datetime.now(timezone.utc) - timedelta(days=days_back)
+                        ]
+                    ),
+                    "high": len(
+                        [
+                            i
+                            for i in self.incidents
+                            if i.severity == "high"
+                            and i.start_time >= datetime.now(timezone.utc) - timedelta(days=days_back)
+                        ]
+                    ),
+                    "medium": len(
+                        [
+                            i
+                            for i in self.incidents
+                            if i.severity == "medium"
+                            and i.start_time >= datetime.now(timezone.utc) - timedelta(days=days_back)
+                        ]
+                    ),
+                },
+                "mttr_by_severity": self._calculate_mttr_by_severity(days_back),
+            },
+            # Operational metrics
+            "operational_metrics": {
+                "deployment_success_rate": len([d for d in self.deployments if d.status == "success"])
+                / max(1, len(self.deployments))
+                * 100,
+                "avg_approval_time_minutes": sum(self.approval_times) / max(1, len(self.approval_times)),
+                "workflow_efficiency_score": 100
+                - (
+                    sum(self.approval_times) / max(1, len(self.approval_times)) / 60 * 100
+                ),  # Efficiency based on approval speed
+                "service_reliability_score": report["sla_metrics"]["availability"]["value"] * 100
+                if "availability" in report.get("sla_metrics", {})
+                else 0,
+            },
+            # Targets and thresholds for visualization
+            "targets": self.targets,
+            "alert_thresholds": self.alert_thresholds,
+            # Raw data for detailed analysis
+            "raw_metrics": report,
+        }
+
+        # Save dashboard data for external tools
+        dashboard_file = (
+            self.artifacts_dir / "dashboards" / f"sre_dashboard_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+        )
+        with open(dashboard_file, "w") as f:
+            json.dump(dashboard_data, f, indent=2, default=str)
+
+        logger.info(f"📊 SRE dashboard data saved: {dashboard_file}")
+
+        return dashboard_data
+
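`dora_metrics_health` in the KPI summary counts how many of the four core DORA targets are met, filtering out the `sla_`-prefixed entries, then scales to a percentage. Isolated:

```python
# The dora_metrics_health computation above, isolated: count the met
# non-SLA targets and scale by the four core DORA metrics.
targets_met = {
    "lead_time": True,
    "deployment_frequency": True,
    "change_failure_rate": False,
    "mttr": True,
    "sla_availability": False,  # excluded by the startswith("sla_") filter
}
core_met = [k for k, v in targets_met.items() if not k.startswith("sla_") and v]
print(len(core_met) / 4 * 100)  # 75.0
```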
+    def _calculate_mttr_by_severity(self, days_back: int) -> Dict[str, float]:
+        """Calculate MTTR broken down by incident severity."""
+        cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
+        recent_incidents = [i for i in self.incidents if i.start_time >= cutoff_date and i.resolution_time]
+
+        mttr_by_severity = {}
+        for severity in ["critical", "high", "medium", "low"]:
+            severity_incidents = [i for i in recent_incidents if i.severity == severity]
+            if severity_incidents:
+                total_time = sum((i.resolution_time - i.start_time).total_seconds() / 3600 for i in severity_incidents)
+                mttr_by_severity[severity] = total_time / len(severity_incidents)
+            else:
+                mttr_by_severity[severity] = 0
+
+        return mttr_by_severity
+
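The per-severity MTTR is a plain average of resolved incidents' wall-clock hours; unresolved incidents are filtered out upstream. A worked example: two resolved critical incidents of 1 and 3 hours average to 2.0:

```python
# Worked example of the per-severity MTTR average above.
from datetime import datetime, timedelta, timezone

start = datetime.now(timezone.utc)
durations = [timedelta(hours=1), timedelta(hours=3)]
resolutions = [start + d for d in durations]

total_time = sum((end - start).total_seconds() / 3600 for end in resolutions)
print(total_time / len(resolutions))  # 2.0
```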
+    def integrate_with_performance_monitor(self, performance_monitor) -> None:
+        """
+        Integrate DORA metrics with existing performance monitoring system.
+
+        Args:
+            performance_monitor: Instance of PerformanceMonitor class
+        """
+        try:
+            # Hook into performance monitor to auto-detect incidents
+            original_track = performance_monitor.track_operation
+
+            def enhanced_track_operation(
+                module: str, operation: str, execution_time: float, success: bool = True, metadata=None
+            ):
+                # Call original method
+                result = original_track(module, operation, execution_time, success, metadata)
+
+                # Auto-detect performance incidents for DORA tracking
+                target = performance_monitor.performance_targets.get(module, {})
+                threshold = target.get("target_time", 30.0)
+
+                if execution_time > threshold:
+                    self.detect_performance_incident(module, operation, execution_time, threshold)
+
+                return result
+
+            # Replace with enhanced version
+            performance_monitor.track_operation = enhanced_track_operation
+
+            logger.info("🔗 DORA metrics integrated with performance monitor")
+
+        except Exception as e:
+            logger.error(f"❌ Failed to integrate with performance monitor: {e}")
+
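The integration works by rebinding `performance_monitor.track_operation` to a closure that calls the original bound method and then applies the threshold check. A generic sketch of that wrap-and-rebind pattern, with hypothetical names:

```python
# Generic form of the rebinding used above: wrap a bound method so every
# call is observed, then assign the wrapper back. `Monitor` is hypothetical.
import functools

class Monitor:
    def track_operation(self, module: str, op: str, seconds: float) -> str:
        return f"{module}.{op}: {seconds:.2f}s"

def observe(monitor: Monitor, on_slow, threshold: float = 30.0) -> None:
    original = monitor.track_operation  # bound method, keeps `self`

    @functools.wraps(original)
    def wrapper(module: str, op: str, seconds: float) -> str:
        result = original(module, op, seconds)
        if seconds > threshold:
            on_slow(module, op, seconds)  # fires after the original call
        return result

    monitor.track_operation = wrapper  # instance attribute shadows the class method

m = Monitor()
observe(m, on_slow=lambda *a: print("slow:", a))
m.track_operation("finops", "report", 45.0)  # prints slow: ('finops', 'report', 45.0)
```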
+    def export_cloudwatch_metrics(self, namespace: str = "CloudOps/DORA") -> bool:
+        """
+        Export DORA metrics to CloudWatch for enterprise monitoring.
+
+        Args:
+            namespace: CloudWatch metrics namespace
+
+        Returns:
+            Success status of metric publishing
+        """
+        try:
+            import boto3
+
+            cloudwatch = boto3.client("cloudwatch")
+
+            # Calculate current metrics
+            lead_time = self.calculate_lead_time(7)  # Weekly metrics
+            deploy_freq = self.calculate_deployment_frequency(7)
+            failure_rate = self.calculate_change_failure_rate(7)
+            mttr = self.calculate_mttr(7)
+
+            # Publish to CloudWatch
+            metrics_to_publish = [
+                {
+                    "MetricName": "LeadTime",
+                    "Value": lead_time.value,
+                    "Unit": "Seconds",
+                    "Dimensions": [{"Name": "Environment", "Value": "production"}],
+                },
+                {
+                    "MetricName": "DeploymentFrequency",
+                    "Value": deploy_freq.value,
+                    "Unit": "Count/Second",
+                    "Dimensions": [{"Name": "Environment", "Value": "production"}],
+                },
+                {
+                    "MetricName": "ChangeFailureRate",
+                    "Value": failure_rate.value * 100,  # Convert to percentage
+                    "Unit": "Percent",
+                    "Dimensions": [{"Name": "Environment", "Value": "production"}],
+                },
+                {
+                    "MetricName": "MeanTimeToRecovery",
+                    "Value": mttr.value,
+                    "Unit": "Seconds",
+                    "Dimensions": [{"Name": "Environment", "Value": "production"}],
+                },
+            ]
+
+            response = cloudwatch.put_metric_data(Namespace=namespace, MetricData=metrics_to_publish)
+
+            logger.info(f"📊 DORA metrics published to CloudWatch: {namespace}")
+            return True
+
+        except Exception as e:
+            logger.error(f"❌ Failed to export CloudWatch metrics: {e}")
+            return False
+
 
 # Async functions for integration with existing systems
 async def simulate_dora_metrics_collection(duration_minutes: int = 5) -> Dict: