runbooks-0.7.9-py3-none-any.whl → runbooks-0.9.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. runbooks/__init__.py +1 -1
  2. runbooks/cfat/README.md +12 -1
  3. runbooks/cfat/__init__.py +1 -1
  4. runbooks/cfat/assessment/compliance.py +4 -1
  5. runbooks/cfat/assessment/runner.py +42 -34
  6. runbooks/cfat/models.py +1 -1
  7. runbooks/cloudops/__init__.py +123 -0
  8. runbooks/cloudops/base.py +385 -0
  9. runbooks/cloudops/cost_optimizer.py +811 -0
  10. runbooks/cloudops/infrastructure_optimizer.py +29 -0
  11. runbooks/cloudops/interfaces.py +828 -0
  12. runbooks/cloudops/lifecycle_manager.py +29 -0
  13. runbooks/cloudops/mcp_cost_validation.py +678 -0
  14. runbooks/cloudops/models.py +251 -0
  15. runbooks/cloudops/monitoring_automation.py +29 -0
  16. runbooks/cloudops/notebook_framework.py +676 -0
  17. runbooks/cloudops/security_enforcer.py +449 -0
  18. runbooks/common/__init__.py +152 -0
  19. runbooks/common/accuracy_validator.py +1039 -0
  20. runbooks/common/context_logger.py +440 -0
  21. runbooks/common/cross_module_integration.py +594 -0
  22. runbooks/common/enhanced_exception_handler.py +1108 -0
  23. runbooks/common/enterprise_audit_integration.py +634 -0
  24. runbooks/common/mcp_cost_explorer_integration.py +900 -0
  25. runbooks/common/mcp_integration.py +548 -0
  26. runbooks/common/performance_monitor.py +387 -0
  27. runbooks/common/profile_utils.py +216 -0
  28. runbooks/common/rich_utils.py +172 -1
  29. runbooks/feedback/user_feedback_collector.py +440 -0
  30. runbooks/finops/README.md +377 -458
  31. runbooks/finops/__init__.py +4 -21
  32. runbooks/finops/account_resolver.py +279 -0
  33. runbooks/finops/accuracy_cross_validator.py +638 -0
  34. runbooks/finops/aws_client.py +721 -36
  35. runbooks/finops/budget_integration.py +313 -0
  36. runbooks/finops/cli.py +59 -5
  37. runbooks/finops/cost_optimizer.py +1340 -0
  38. runbooks/finops/cost_processor.py +211 -37
  39. runbooks/finops/dashboard_router.py +900 -0
  40. runbooks/finops/dashboard_runner.py +990 -232
  41. runbooks/finops/embedded_mcp_validator.py +288 -0
  42. runbooks/finops/enhanced_dashboard_runner.py +8 -7
  43. runbooks/finops/enhanced_progress.py +327 -0
  44. runbooks/finops/enhanced_trend_visualization.py +423 -0
  45. runbooks/finops/finops_dashboard.py +184 -1829
  46. runbooks/finops/helpers.py +509 -196
  47. runbooks/finops/iam_guidance.py +400 -0
  48. runbooks/finops/markdown_exporter.py +466 -0
  49. runbooks/finops/multi_dashboard.py +1502 -0
  50. runbooks/finops/optimizer.py +15 -15
  51. runbooks/finops/profile_processor.py +2 -2
  52. runbooks/finops/runbooks.inventory.organizations_discovery.log +0 -0
  53. runbooks/finops/runbooks.security.report_generator.log +0 -0
  54. runbooks/finops/runbooks.security.run_script.log +0 -0
  55. runbooks/finops/runbooks.security.security_export.log +0 -0
  56. runbooks/finops/schemas.py +589 -0
  57. runbooks/finops/service_mapping.py +195 -0
  58. runbooks/finops/single_dashboard.py +710 -0
  59. runbooks/finops/tests/test_reference_images_validation.py +1 -1
  60. runbooks/inventory/README.md +12 -1
  61. runbooks/inventory/core/collector.py +157 -29
  62. runbooks/inventory/list_ec2_instances.py +9 -6
  63. runbooks/inventory/list_ssm_parameters.py +10 -10
  64. runbooks/inventory/organizations_discovery.py +210 -164
  65. runbooks/inventory/rich_inventory_display.py +74 -107
  66. runbooks/inventory/run_on_multi_accounts.py +13 -13
  67. runbooks/inventory/runbooks.inventory.organizations_discovery.log +0 -0
  68. runbooks/inventory/runbooks.security.security_export.log +0 -0
  69. runbooks/main.py +1371 -240
  70. runbooks/metrics/dora_metrics_engine.py +711 -17
  71. runbooks/monitoring/performance_monitor.py +433 -0
  72. runbooks/operate/README.md +394 -0
  73. runbooks/operate/base.py +215 -47
  74. runbooks/operate/ec2_operations.py +435 -5
  75. runbooks/operate/iam_operations.py +598 -3
  76. runbooks/operate/privatelink_operations.py +1 -1
  77. runbooks/operate/rds_operations.py +508 -0
  78. runbooks/operate/s3_operations.py +508 -0
  79. runbooks/operate/vpc_endpoints.py +1 -1
  80. runbooks/remediation/README.md +489 -13
  81. runbooks/remediation/base.py +5 -3
  82. runbooks/remediation/commons.py +8 -4
  83. runbooks/security/ENTERPRISE_SECURITY_FRAMEWORK.md +506 -0
  84. runbooks/security/README.md +12 -1
  85. runbooks/security/__init__.py +265 -33
  86. runbooks/security/cloudops_automation_security_validator.py +1164 -0
  87. runbooks/security/compliance_automation.py +12 -10
  88. runbooks/security/compliance_automation_engine.py +1021 -0
  89. runbooks/security/enterprise_security_framework.py +930 -0
  90. runbooks/security/enterprise_security_policies.json +293 -0
  91. runbooks/security/executive_security_dashboard.py +1247 -0
  92. runbooks/security/integration_test_enterprise_security.py +879 -0
  93. runbooks/security/module_security_integrator.py +641 -0
  94. runbooks/security/multi_account_security_controls.py +2254 -0
  95. runbooks/security/real_time_security_monitor.py +1196 -0
  96. runbooks/security/report_generator.py +1 -1
  97. runbooks/security/run_script.py +4 -8
  98. runbooks/security/security_baseline_tester.py +39 -52
  99. runbooks/security/security_export.py +99 -120
  100. runbooks/sre/README.md +472 -0
  101. runbooks/sre/__init__.py +33 -0
  102. runbooks/sre/mcp_reliability_engine.py +1049 -0
  103. runbooks/sre/performance_optimization_engine.py +1032 -0
  104. runbooks/sre/production_monitoring_framework.py +584 -0
  105. runbooks/sre/reliability_monitoring_framework.py +1011 -0
  106. runbooks/validation/__init__.py +2 -2
  107. runbooks/validation/benchmark.py +154 -149
  108. runbooks/validation/cli.py +159 -147
  109. runbooks/validation/mcp_validator.py +291 -248
  110. runbooks/vpc/README.md +478 -0
  111. runbooks/vpc/__init__.py +2 -2
  112. runbooks/vpc/manager_interface.py +366 -351
  113. runbooks/vpc/networking_wrapper.py +68 -36
  114. runbooks/vpc/rich_formatters.py +22 -8
  115. runbooks-0.9.1.dist-info/METADATA +308 -0
  116. {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/RECORD +120 -59
  117. {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/entry_points.txt +1 -1
  118. runbooks/finops/cross_validation.py +0 -375
  119. runbooks-0.7.9.dist-info/METADATA +0 -636
  120. {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/WHEEL +0 -0
  121. {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/licenses/LICENSE +0 -0
  122. {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/top_level.txt +0 -0
@@ -70,11 +70,23 @@ class IncidentEvent:
 
 
 class DORAMetricsEngine:
-    """Enhanced DORA metrics collection and analysis engine"""
+    """
+    Enhanced DORA metrics collection and analysis engine for Enterprise SRE.
+
+    Provides comprehensive DORA metrics (Lead Time, Deploy Frequency, MTTR, Change Failure Rate)
+    with real-time collection, automated alerting, and enterprise dashboard integration.
+
+    Features:
+    - Real-time metrics streaming from git operations
+    - Automated deployment event capture via GitHub webhooks
+    - CloudWatch/Datadog integration for enterprise monitoring
+    - Cross-session persistence with baseline trending
+    - SLA compliance tracking with automated alerting
+    """
 
     def __init__(self, artifacts_dir: str = "./artifacts/metrics", cross_validation_tolerance: float = 15.0):
         """
-        Initialize DORA metrics engine
+        Initialize enterprise DORA metrics engine
 
         Args:
             artifacts_dir: Directory to store metrics artifacts
@@ -83,27 +95,51 @@ class DORAMetricsEngine:
         self.artifacts_dir = Path(artifacts_dir)
         self.artifacts_dir.mkdir(parents=True, exist_ok=True)
 
+        # Create SRE-focused subdirectories
+        (self.artifacts_dir / "dora-reports").mkdir(exist_ok=True)
+        (self.artifacts_dir / "baselines").mkdir(exist_ok=True)
+        (self.artifacts_dir / "alerts").mkdir(exist_ok=True)
+        (self.artifacts_dir / "dashboards").mkdir(exist_ok=True)
+
         self.tolerance = cross_validation_tolerance
 
-        # Metrics storage
+        # Metrics storage with persistence
         self.deployments: List[DeploymentEvent] = []
         self.incidents: List[IncidentEvent] = []
         self.metrics_history: List[DORAMetric] = []
+        self.baselines: Dict[str, float] = {}
 
         # HITL workflow metrics
         self.approval_times: List[float] = []
         self.workflow_bottlenecks: Dict[str, List[float]] = {}
 
-        # Performance targets from CLAUDE.md
+        # Enterprise SRE performance targets (FAANG SDLC standards)
         self.targets = {
-            "lead_time_hours": 4,  # <4 hours
+            "lead_time_hours": 4,  # <4 hours (FAANG velocity)
             "deploy_frequency_daily": 1,  # Daily deployment capability
-            "change_failure_rate": 0.05,  # <5%
-            "mttr_hours": 1,  # <1 hour
-            "approval_time_minutes": 30,  # <30 minutes
-            "success_rate": 0.95,  # >95%
+            "change_failure_rate": 0.05,  # <5% (FAANG quality)
+            "mttr_hours": 1,  # <1 hour (SRE excellence)
+            "approval_time_minutes": 30,  # <30 minutes (HITL efficiency)
+            "success_rate": 0.95,  # >95% (Enterprise reliability)
+            "sla_availability": 0.999,  # >99.9% uptime
+            "performance_score": 90,  # >90% performance score
         }
 
+        # SRE alerting thresholds
+        self.alert_thresholds = {
+            "lead_time_hours": 6,  # Alert if >6 hours
+            "deploy_frequency_daily": 0.5,  # Alert if <0.5 deploys/day
+            "change_failure_rate": 0.10,  # Alert if >10%
+            "mttr_hours": 2,  # Alert if >2 hours
+            "approval_time_minutes": 60,  # Alert if >60 minutes
+        }
+
+        # Load existing data
+        self._load_persistent_data()
+
+        # Initialize baseline metrics if not exists
+        self._initialize_baselines()
+
     def record_deployment(
         self,
         deployment_id: str,
@@ -376,10 +412,261 @@ class DORAMetricsEngine:
 
         return metrics
 
+    def _load_persistent_data(self) -> None:
+        """Load persistent DORA data from storage."""
+        try:
+            # Load deployments
+            deployments_file = self.artifacts_dir / "deployments.json"
+            if deployments_file.exists():
+                with open(deployments_file, "r") as f:
+                    data = json.load(f)
+                    self.deployments = [DeploymentEvent(**item) for item in data.get("deployments", [])]
+
+            # Load incidents
+            incidents_file = self.artifacts_dir / "incidents.json"
+            if incidents_file.exists():
+                with open(incidents_file, "r") as f:
+                    data = json.load(f)
+                    self.incidents = [IncidentEvent(**item) for item in data.get("incidents", [])]
+
+            # Load baselines
+            baselines_file = self.artifacts_dir / "baselines" / "current_baselines.json"
+            if baselines_file.exists():
+                with open(baselines_file, "r") as f:
+                    self.baselines = json.load(f)
+
+            logger.info(f"📊 Loaded {len(self.deployments)} deployments, {len(self.incidents)} incidents")
+
+        except Exception as e:
+            logger.warning(f"⚠️ Failed to load persistent data: {e}")
+
+    def _save_persistent_data(self) -> None:
+        """Save persistent DORA data to storage."""
+        try:
+            # Save deployments
+            deployments_data = {
+                "deployments": [asdict(d) for d in self.deployments],
+                "last_updated": datetime.now(timezone.utc).isoformat(),
+            }
+            deployments_file = self.artifacts_dir / "deployments.json"
+            with open(deployments_file, "w") as f:
+                json.dump(deployments_data, f, indent=2, default=str)
+
+            # Save incidents
+            incidents_data = {
+                "incidents": [asdict(i) for i in self.incidents],
+                "last_updated": datetime.now(timezone.utc).isoformat(),
+            }
+            incidents_file = self.artifacts_dir / "incidents.json"
+            with open(incidents_file, "w") as f:
+                json.dump(incidents_data, f, indent=2, default=str)
+
+            # Save baselines
+            baselines_file = self.artifacts_dir / "baselines" / "current_baselines.json"
+            with open(baselines_file, "w") as f:
+                json.dump(self.baselines, f, indent=2)
+
+        except Exception as e:
+            logger.error(f"❌ Failed to save persistent data: {e}")
+
+    def _initialize_baselines(self) -> None:
+        """Initialize baseline metrics for trending analysis."""
+        if not self.baselines and len(self.deployments) > 10:
+            # Calculate initial baselines from historical data
+            lead_time_metric = self.calculate_lead_time(30)
+            deploy_freq_metric = self.calculate_deployment_frequency(30)
+            failure_rate_metric = self.calculate_change_failure_rate(30)
+            mttr_metric = self.calculate_mttr(30)
+
+            self.baselines = {
+                "lead_time_hours": lead_time_metric.value,
+                "deploy_frequency_daily": deploy_freq_metric.value,
+                "change_failure_rate": failure_rate_metric.value,
+                "mttr_hours": mttr_metric.value,
+                "baseline_established": datetime.now(timezone.utc).isoformat(),
+                "sample_size": len(self.deployments),
+            }
+
+            logger.info("📈 Established baseline metrics from historical data")
+            self._save_persistent_data()
+
+    def track_git_deployment(
+        self, commit_sha: str, branch: str = "main", author: str = "", message: str = ""
+    ) -> DeploymentEvent:
+        """
+        Track deployment from git operations for automated DORA collection.
+
+        Args:
+            commit_sha: Git commit SHA
+            branch: Git branch name
+            author: Commit author
+            message: Commit message
+
+        Returns:
+            Created deployment event
+        """
+        deployment_id = f"git-{commit_sha[:8]}-{int(time.time())}"
+
+        deployment = self.record_deployment(
+            deployment_id=deployment_id,
+            environment="production" if branch == "main" else "development",
+            service_name="runbooks",
+            version=commit_sha[:8],
+            commit_sha=commit_sha,
+            approver=author,
+        )
+
+        # Add git metadata
+        deployment.metadata = {
+            "branch": branch,
+            "author": author,
+            "message": message,
+            "automated": True,
+            "source": "git_integration",
+        }
+
+        logger.info(f"🔗 Git deployment tracked: {commit_sha[:8]} on {branch}")
+
+        # Auto-save after git integration
+        self._save_persistent_data()
+
+        return deployment
+
+    def detect_performance_incident(
+        self, module: str, operation: str, execution_time: float, threshold: float
+    ) -> Optional[IncidentEvent]:
+        """
+        Automatically detect and record performance incidents.
+
+        Args:
+            module: Module name (e.g., 'finops', 'inventory')
+            operation: Operation name
+            execution_time: Actual execution time
+            threshold: Performance threshold
+
+        Returns:
+            Created incident if threshold exceeded, None otherwise
+        """
+        if execution_time <= threshold:
+            return None
+
+        incident_id = f"perf-{module}-{int(time.time())}"
+        severity = "critical" if execution_time > threshold * 2 else "high"
+
+        incident = self.record_incident(
+            incident_id=incident_id,
+            service_name=module,
+            severity=severity,
+            root_cause=f"Performance degradation: {operation} took {execution_time:.2f}s (threshold: {threshold:.2f}s)",
+        )
+
+        # Add performance metadata
+        incident.metadata = {
+            "operation": operation,
+            "execution_time": execution_time,
+            "threshold": threshold,
+            "degradation_factor": execution_time / threshold,
+            "automated_detection": True,
+        }
+
+        logger.warning(f"🚨 Performance incident detected: {incident_id}")
+
+        # Generate real-time alert
+        self._generate_sre_alert(incident, execution_time, threshold)
+
+        return incident
+
+    def _generate_sre_alert(self, incident: IncidentEvent, execution_time: float, threshold: float) -> None:
+        """Generate SRE-focused performance alert."""
+        alert_data = {
+            "timestamp": datetime.now(timezone.utc).isoformat(),
+            "alert_type": "sre_performance_degradation",
+            "incident_id": incident.incident_id,
+            "service": incident.service_name,
+            "severity": incident.severity,
+            "execution_time": execution_time,
+            "threshold": threshold,
+            "degradation_factor": execution_time / threshold,
+            "impact": "user_experience" if execution_time > threshold * 1.5 else "performance_sla",
+            "recommended_actions": [
+                "Check system resource utilization",
+                "Review recent deployments for correlation",
+                "Validate AWS API rate limiting",
+                "Consider auto-scaling triggers",
+            ],
+        }
+
+        # Save alert to artifacts
+        alert_file = self.artifacts_dir / "alerts" / f"sre_alert_{incident.incident_id}.json"
+        with open(alert_file, "w") as f:
+            json.dump(alert_data, f, indent=2, default=str)
+
+        logger.critical(f"🚨 SRE Alert generated: {alert_file}")
+
+    def calculate_sla_compliance(self, days_back: int = 30) -> Dict[str, DORAMetric]:
+        """
+        Calculate SLA compliance metrics for enterprise reporting.
+
+        Args:
+            days_back: Number of days to analyze
+
+        Returns:
+            Dictionary of SLA compliance metrics
+        """
+        sla_metrics = {}
+
+        # Calculate availability SLA (based on incident downtime)
+        cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
+        recent_incidents = [i for i in self.incidents if i.start_time >= cutoff_date]
+
+        total_downtime_hours = 0
+        for incident in recent_incidents:
+            if incident.resolution_time and incident.severity in ["critical", "high"]:
+                downtime = (incident.resolution_time - incident.start_time).total_seconds() / 3600
+                total_downtime_hours += downtime
+
+        total_hours = days_back * 24
+        availability = max(0, (total_hours - total_downtime_hours) / total_hours)
+
+        sla_metrics["availability"] = DORAMetric(
+            metric_name="availability_sla",
+            value=availability,
+            unit="percentage",
+            timestamp=datetime.now(timezone.utc),
+            tags={"period": f"{days_back}d", "incidents": str(len(recent_incidents))},
+            metadata={
+                "target": self.targets["sla_availability"],
+                "target_met": availability >= self.targets["sla_availability"],
+                "downtime_hours": total_downtime_hours,
+            },
+        )
+
+        # Performance SLA (based on operation execution times)
+        performance_scores = []
+        for metric in self.metrics_history:
+            if metric.metadata and "performance_score" in metric.metadata:
+                performance_scores.append(metric.metadata["performance_score"])
+
+        avg_performance = sum(performance_scores) / len(performance_scores) if performance_scores else 0
+
+        sla_metrics["performance"] = DORAMetric(
+            metric_name="performance_sla",
+            value=avg_performance,
+            unit="percentage",
+            timestamp=datetime.now(timezone.utc),
+            tags={"sample_size": str(len(performance_scores))},
+            metadata={
+                "target": self.targets["performance_score"],
+                "target_met": avg_performance >= self.targets["performance_score"],
+            },
+        )
+
+        return sla_metrics
+
     def generate_comprehensive_report(self, days_back: int = 30) -> Dict:
-        """Generate comprehensive DORA metrics report"""
+        """Generate comprehensive DORA metrics report with SRE enhancements"""
 
-        logger.info(f"📊 Generating DORA metrics report for last {days_back} days")
+        logger.info(f"📊 Generating enterprise DORA metrics report for last {days_back} days")
 
         # Calculate all DORA metrics
         lead_time = self.calculate_lead_time(days_back)
@@ -390,7 +677,10 @@ class DORAMetricsEngine:
         # Calculate HITL metrics
         hitl_metrics = self.calculate_hitl_metrics()
 
-        # Performance analysis
+        # Calculate SLA compliance metrics
+        sla_metrics = self.calculate_sla_compliance(days_back)
+
+        # Performance analysis with enhanced SRE targets
         targets_met = {
             "lead_time": lead_time.metadata.get("target_met", False),
             "deployment_frequency": deployment_freq.metadata.get("target_met", False),
@@ -402,10 +692,36 @@ class DORAMetricsEngine:
         if "approval_time" in hitl_metrics:
             targets_met["approval_time"] = hitl_metrics["approval_time"].metadata.get("target_met", False)
 
+        # Add SLA targets
+        for metric_name, metric in sla_metrics.items():
+            targets_met[f"sla_{metric_name}"] = metric.metadata.get("target_met", False)
+
         overall_performance = sum(targets_met.values()) / len(targets_met) * 100
 
+        # Calculate trend analysis vs baselines
+        trend_analysis = {}
+        if self.baselines:
+            for metric_name, current_value in [
+                ("lead_time_hours", lead_time.value),
+                ("deploy_frequency_daily", deployment_freq.value),
+                ("change_failure_rate", failure_rate.value),
+                ("mttr_hours", mttr.value),
+            ]:
+                baseline = self.baselines.get(metric_name, current_value)
+                if baseline > 0:
+                    trend_percentage = ((current_value - baseline) / baseline) * 100
+                    trend_analysis[metric_name] = {
+                        "current": current_value,
+                        "baseline": baseline,
+                        "trend_percentage": trend_percentage,
+                        "improving": trend_percentage < 0
+                        if metric_name != "deploy_frequency_daily"
+                        else trend_percentage > 0,
+                    }
+
         report = {
-            "report_type": "dora_metrics_comprehensive",
+            "report_type": "dora_metrics_enterprise_sre",
+            "version": "2.0",
             "period": f"{days_back}_days",
             "timestamp": datetime.now(timezone.utc).isoformat(),
             "dora_metrics": {
@@ -414,26 +730,70 @@ class DORAMetricsEngine:
                 "change_failure_rate": asdict(failure_rate),
                 "mttr": asdict(mttr),
             },
+            "sla_metrics": {k: asdict(v) for k, v in sla_metrics.items()},
             "hitl_metrics": {k: asdict(v) for k, v in hitl_metrics.items()},
             "performance_analysis": {
                 "targets_met": targets_met,
                 "overall_performance_percentage": overall_performance,
                 "performance_grade": self._calculate_performance_grade(overall_performance),
+                "sla_compliance_score": sum(1 for k, v in targets_met.items() if k.startswith("sla_") and v)
+                / max(1, sum(1 for k in targets_met.keys() if k.startswith("sla_")))
+                * 100,
+            },
+            "trend_analysis": trend_analysis,
+            "baseline_comparison": self.baselines,
+            "recommendations": self._generate_sre_recommendations(
+                targets_met, hitl_metrics, sla_metrics, trend_analysis
+            ),
+            "alerts_summary": {
+                "active_alerts": len(
+                    [
+                        f
+                        for f in (self.artifacts_dir / "alerts").glob("*.json")
+                        if f.stat().st_mtime > time.time() - 86400
+                    ]
+                ),
+                "performance_incidents": len(
+                    [
+                        i
+                        for i in self.incidents
+                        if i.start_time >= datetime.now(timezone.utc) - timedelta(days=days_back)
+                        and "performance" in i.root_cause.lower()
+                    ]
+                ),
+                "sre_health_score": overall_performance,
             },
-            "recommendations": self._generate_recommendations(targets_met, hitl_metrics),
             "raw_data": {
                 "deployments_count": len(self.deployments),
                 "incidents_count": len(self.incidents),
                 "approval_times_count": len(self.approval_times),
+                "automation_rate": len(
+                    [d for d in self.deployments if getattr(d, "metadata", {}).get("automated", False)]
+                )
+                / max(1, len(self.deployments))
+                * 100,
             },
         }
 
-        # Save report
-        report_file = self.artifacts_dir / f"dora_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+        # Save enhanced report to SRE reports directory
+        sre_reports_dir = self.artifacts_dir.parent / "sre-reports"
+        sre_reports_dir.mkdir(exist_ok=True)
+
+        report_file = sre_reports_dir / f"dora_enterprise_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
         with open(report_file, "w") as f:
             json.dump(report, f, indent=2, default=str)
 
-        logger.info(f"✅ DORA metrics report saved to: {report_file}")
+        # Also save to metrics directory for backward compatibility
+        legacy_report_file = (
+            self.artifacts_dir / "dora-reports" / f"dora_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+        )
+        with open(legacy_report_file, "w") as f:
+            json.dump(report, f, indent=2, default=str)
+
+        logger.info(f"✅ Enterprise DORA metrics report saved to: {report_file}")
+
+        # Auto-save persistent data after report generation
+        self._save_persistent_data()
 
         return report
 
@@ -450,6 +810,98 @@ class DORAMetricsEngine:
         else:
             return "F (Poor)"
 
+    def _generate_sre_recommendations(
+        self, targets_met: Dict[str, bool], hitl_metrics: Dict, sla_metrics: Dict, trend_analysis: Dict
+    ) -> List[str]:
+        """Generate enhanced SRE-focused recommendations based on comprehensive metrics analysis"""
+
+        recommendations = []
+
+        # DORA metrics recommendations
+        if not targets_met.get("lead_time", False):
+            recommendations.append(
+                "🎯 **Lead Time Optimization**: Implement parallel CI/CD workflows, automate testing pipelines, "
+                "and establish fast-track approval processes for low-risk changes"
+            )
+
+        if not targets_met.get("deployment_frequency", False):
+            recommendations.append(
+                "🚀 **Deployment Frequency Enhancement**: Adopt continuous deployment patterns, implement "
+                "feature flags, and establish canary deployment strategies for risk mitigation"
+            )
+
+        if not targets_met.get("change_failure_rate", False):
+            recommendations.append(
+                "🛡️ **Change Failure Rate Reduction**: Enhance pre-production testing, implement progressive "
+                "rollouts, improve monitoring coverage, and establish automated rollback triggers"
+            )
+
+        if not targets_met.get("mttr", False):
+            recommendations.append(
+                "⚡ **MTTR Improvement**: Implement automated incident detection, enhance observability stack, "
+                "establish runbook automation, and improve on-call response procedures"
+            )
+
+        # SLA compliance recommendations
+        if not targets_met.get("sla_availability", False):
+            recommendations.append(
+                "🔒 **Availability SLA Recovery**: Implement chaos engineering practices, enhance redundancy, "
+                "improve failover mechanisms, and establish proactive monitoring alerts"
+            )
+
+        if not targets_met.get("sla_performance", False):
+            recommendations.append(
+                "📈 **Performance SLA Enhancement**: Optimize critical path operations, implement caching strategies, "
+                "enhance resource allocation, and establish performance regression testing"
+            )
+
+        # HITL workflow optimization
+        if not targets_met.get("approval_time", False):
+            recommendations.append(
+                "⏰ **Approval Workflow Optimization**: Implement risk-based approval routing, establish "
+                "parallel approval processes, and create self-service deployment capabilities for low-risk changes"
+            )
+
+        # Trend analysis recommendations
+        if trend_analysis:
+            declining_metrics = [k for k, v in trend_analysis.items() if not v.get("improving", True)]
+            if declining_metrics:
+                recommendations.append(
+                    f"📊 **Trend Alert**: Declining performance detected in {', '.join(declining_metrics)}. "
+                    f"Implement immediate performance improvement initiatives and establish regression prevention measures"
+                )
+
+        # Proactive SRE recommendations based on patterns
+        if hitl_metrics.get("workflow_bottleneck"):
+            bottleneck_step = hitl_metrics["workflow_bottleneck"].tags.get("bottleneck_step", "unknown")
+            recommendations.append(
+                f"🔍 **Workflow Bottleneck Resolution**: Primary bottleneck identified in '{bottleneck_step}' step. "
+                f"Implement automation, parallel processing, or resource scaling for this workflow stage"
+            )
+
+        # Automation recommendations
+        automation_rate = targets_met.get("automation_rate", 0)
+        if automation_rate < 80:
+            recommendations.append(
+                "🤖 **Automation Enhancement**: Current automation rate below target. Implement GitOps workflows, "
+                "automated testing pipelines, and self-healing infrastructure patterns"
+            )
+
+        # Advanced SRE practices
+        if len([k for k, v in targets_met.items() if v]) / len(targets_met) < 0.8:
+            recommendations.append(
+                "🎯 **SRE Maturity Enhancement**: Consider implementing advanced SRE practices: error budgets, "
+                "SLI/SLO management, chaos engineering, and customer-centric reliability metrics"
+            )
+
+        if not recommendations:
+            recommendations.append(
+                "✅ **Excellence Achieved**: All SRE targets met! Consider advanced optimization: predictive scaling, "
+                "AI-powered incident response, and continuous reliability improvement programs"
+            )
+
+        return recommendations
+
     def _generate_recommendations(self, targets_met: Dict[str, bool], hitl_metrics: Dict) -> List[str]:
         """Generate recommendations based on metrics analysis"""
 
@@ -521,6 +973,248 @@ class DORAMetricsEngine:
         logger.info(f"📊 Metrics exported for visualization: {output_file}")
         return str(output_file)
 
+    def generate_sre_dashboard(self, days_back: int = 30) -> Dict:
+        """
+        Generate comprehensive SRE dashboard data for visualization tools.
+
+        Args:
+            days_back: Number of days to analyze for dashboard
+
+        Returns:
+            Dashboard data structure optimized for SRE tools (Datadog, Grafana, etc.)
+        """
+        logger.info(f"📊 Generating SRE dashboard data for {days_back} days")
+
+        # Get comprehensive report data
+        report = self.generate_comprehensive_report(days_back)
+
+        # Format for SRE dashboard tools
+        dashboard_data = {
+            "dashboard_type": "sre_dora_metrics",
+            "generated_at": datetime.now(timezone.utc).isoformat(),
+            "time_range_days": days_back,
+            # Key Performance Indicators (KPIs) for executive view
+            "kpi_summary": {
+                "overall_performance_score": report["performance_analysis"]["overall_performance_percentage"],
+                "sla_compliance_score": report["performance_analysis"]["sla_compliance_score"],
+                "dora_metrics_health": len(
+                    [
+                        k
+                        for k, v in report["performance_analysis"]["targets_met"].items()
+                        if not k.startswith("sla_") and v
+                    ]
+                )
+                / 4
+                * 100,
+                "active_incidents": len(
+                    [
+                        i
+                        for i in self.incidents
+                        if i.start_time >= datetime.now(timezone.utc) - timedelta(days=1) and not i.resolution_time
+                    ]
+                ),
+                "automation_percentage": report["raw_data"]["automation_rate"],
+            },
+            # Time series data for trending
+            "time_series": {
+                "lead_time": [
+                    {"timestamp": m.timestamp.isoformat(), "value": m.value}
+                    for m in self.metrics_history
+                    if m.metric_name == "lead_time"
+                ][-30:],  # Last 30 data points
+                "deployment_frequency": [
+                    {"timestamp": m.timestamp.isoformat(), "value": m.value}
+                    for m in self.metrics_history
+                    if m.metric_name == "deployment_frequency"
+                ][-30:],
+                "change_failure_rate": [
+                    {"timestamp": m.timestamp.isoformat(), "value": m.value * 100}  # Convert to percentage
+                    for m in self.metrics_history
+                    if m.metric_name == "change_failure_rate"
+                ][-30:],
+                "mttr": [
+                    {"timestamp": m.timestamp.isoformat(), "value": m.value}
+                    for m in self.metrics_history
+                    if m.metric_name == "mttr"
+                ][-30:],
+            },
+            # Alert and incident summary
+            "alerts_incidents": {
+                "recent_alerts": len(
+                    [
+                        f
+                        for f in (self.artifacts_dir / "alerts").glob("*.json")
+                        if f.stat().st_mtime > time.time() - 86400
+                    ]
+                ),
+                "incident_severity_breakdown": {
+                    "critical": len(
+                        [
+                            i
+                            for i in self.incidents
+                            if i.severity == "critical"
+                            and i.start_time >= datetime.now(timezone.utc) - timedelta(days=days_back)
+                        ]
+                    ),
+                    "high": len(
+                        [
+                            i
+                            for i in self.incidents
+                            if i.severity == "high"
+                            and i.start_time >= datetime.now(timezone.utc) - timedelta(days=days_back)
+                        ]
+                    ),
+                    "medium": len(
+                        [
+                            i
+                            for i in self.incidents
+                            if i.severity == "medium"
+                            and i.start_time >= datetime.now(timezone.utc) - timedelta(days=days_back)
+                        ]
+                    ),
+                },
+                "mttr_by_severity": self._calculate_mttr_by_severity(days_back),
+            },
+            # Operational metrics
+            "operational_metrics": {
+                "deployment_success_rate": len([d for d in self.deployments if d.status == "success"])
+                / max(1, len(self.deployments))
+                * 100,
+                "avg_approval_time_minutes": sum(self.approval_times) / max(1, len(self.approval_times)),
+                "workflow_efficiency_score": 100
+                - (
+                    sum(self.approval_times) / max(1, len(self.approval_times)) / 60 * 100
+                ),  # Efficiency based on approval speed
+                "service_reliability_score": report["sla_metrics"]["availability"]["value"] * 100
+                if "availability" in report.get("sla_metrics", {})
+                else 0,
+            },
+            # Targets and thresholds for visualization
+            "targets": self.targets,
+            "alert_thresholds": self.alert_thresholds,
+            # Raw data for detailed analysis
+            "raw_metrics": report,
+        }
+
+        # Save dashboard data for external tools
+        dashboard_file = (
+            self.artifacts_dir / "dashboards" / f"sre_dashboard_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+        )
+        with open(dashboard_file, "w") as f:
+            json.dump(dashboard_data, f, indent=2, default=str)
+
+        logger.info(f"📊 SRE dashboard data saved: {dashboard_file}")
+
+        return dashboard_data
+
+    def _calculate_mttr_by_severity(self, days_back: int) -> Dict[str, float]:
+        """Calculate MTTR broken down by incident severity."""
+        cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)
+        recent_incidents = [i for i in self.incidents if i.start_time >= cutoff_date and i.resolution_time]
+
+        mttr_by_severity = {}
+        for severity in ["critical", "high", "medium", "low"]:
+            severity_incidents = [i for i in recent_incidents if i.severity == severity]
+            if severity_incidents:
+                total_time = sum((i.resolution_time - i.start_time).total_seconds() / 3600 for i in severity_incidents)
+                mttr_by_severity[severity] = total_time / len(severity_incidents)
+            else:
+                mttr_by_severity[severity] = 0
+
+        return mttr_by_severity
+
+    def integrate_with_performance_monitor(self, performance_monitor) -> None:
+        """
+        Integrate DORA metrics with existing performance monitoring system.
+
+        Args:
+            performance_monitor: Instance of PerformanceMonitor class
+        """
+        try:
+            # Hook into performance monitor to auto-detect incidents
+            original_track = performance_monitor.track_operation
+
+            def enhanced_track_operation(
+                module: str, operation: str, execution_time: float, success: bool = True, metadata=None
+            ):
+                # Call original method
+                result = original_track(module, operation, execution_time, success, metadata)
+
+                # Auto-detect performance incidents for DORA tracking
+                target = performance_monitor.performance_targets.get(module, {})
+                threshold = target.get("target_time", 30.0)
+
+                if execution_time > threshold:
+                    self.detect_performance_incident(module, operation, execution_time, threshold)
+
+                return result
+
+            # Replace with enhanced version
+            performance_monitor.track_operation = enhanced_track_operation
+
+            logger.info("🔗 DORA metrics integrated with performance monitor")
+
+        except Exception as e:
+            logger.error(f"❌ Failed to integrate with performance monitor: {e}")
+
+    def export_cloudwatch_metrics(self, namespace: str = "CloudOps/DORA") -> bool:
+        """
+        Export DORA metrics to CloudWatch for enterprise monitoring.
+
+        Args:
+            namespace: CloudWatch metrics namespace
+
+        Returns:
+            Success status of metric publishing
+        """
+        try:
+            import boto3
+
+            cloudwatch = boto3.client("cloudwatch")
+
+            # Calculate current metrics
+            lead_time = self.calculate_lead_time(7)  # Weekly metrics
+            deploy_freq = self.calculate_deployment_frequency(7)
+            failure_rate = self.calculate_change_failure_rate(7)
+            mttr = self.calculate_mttr(7)
+
+            # Publish to CloudWatch
+            metrics_to_publish = [
+                {
+                    "MetricName": "LeadTime",
+                    "Value": lead_time.value,
+                    "Unit": "Seconds",
+                    "Dimensions": [{"Name": "Environment", "Value": "production"}],
+                },
+                {
+                    "MetricName": "DeploymentFrequency",
+                    "Value": deploy_freq.value,
+                    "Unit": "Count/Second",
+                    "Dimensions": [{"Name": "Environment", "Value": "production"}],
+                },
+                {
+                    "MetricName": "ChangeFailureRate",
+                    "Value": failure_rate.value * 100,  # Convert to percentage
+                    "Unit": "Percent",
+                    "Dimensions": [{"Name": "Environment", "Value": "production"}],
+                },
+                {
+                    "MetricName": "MeanTimeToRecovery",
+                    "Value": mttr.value,
+                    "Unit": "Seconds",
+                    "Dimensions": [{"Name": "Environment", "Value": "production"}],
+                },
+            ]
+
+            response = cloudwatch.put_metric_data(Namespace=namespace, MetricData=metrics_to_publish)
+
+            logger.info(f"📊 DORA metrics published to CloudWatch: {namespace}")
+            return True
+
+        except Exception as e:
+            logger.error(f"❌ Failed to export CloudWatch metrics: {e}")
+            return False
+
 
 # Async functions for integration with existing systems
 async def simulate_dora_metrics_collection(duration_minutes: int = 5) -> Dict:
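
The diff above adds several new public entry points to DORAMetricsEngine (track_git_deployment, detect_performance_incident, generate_comprehensive_report, generate_sre_dashboard, export_cloudwatch_metrics). The following is a minimal usage sketch against the 0.9.1 signatures shown in this diff; the commit SHA, author, operation name, and threshold values are illustrative only, and the CloudWatch export assumes boto3 credentials are already configured:

    from runbooks.metrics.dora_metrics_engine import DORAMetricsEngine

    # Deployments, incidents, and baselines persist under artifacts_dir
    engine = DORAMetricsEngine(artifacts_dir="./artifacts/metrics")

    # Record a deployment from a git commit (branch "main" maps to "production")
    engine.track_git_deployment(
        commit_sha="0123456789abcdef",  # illustrative SHA
        branch="main",
        author="dev@example.com",  # illustrative author
        message="release 0.9.1",
    )

    # Auto-record an incident when an operation exceeds its threshold (values illustrative)
    engine.detect_performance_incident(
        module="finops", operation="dashboard_render", execution_time=75.0, threshold=30.0
    )

    # DORA + SLA + trend report, written under ./artifacts/sre-reports/
    report = engine.generate_comprehensive_report(days_back=30)
    print(report["performance_analysis"]["performance_grade"])

    # Optional: publish weekly DORA metrics to CloudWatch
    engine.export_cloudwatch_metrics(namespace="CloudOps/DORA")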