runbooks 0.7.9__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- runbooks/__init__.py +1 -1
- runbooks/cfat/README.md +12 -1
- runbooks/cfat/__init__.py +1 -1
- runbooks/cfat/assessment/compliance.py +4 -1
- runbooks/cfat/assessment/runner.py +42 -34
- runbooks/cfat/models.py +1 -1
- runbooks/cloudops/__init__.py +123 -0
- runbooks/cloudops/base.py +385 -0
- runbooks/cloudops/cost_optimizer.py +811 -0
- runbooks/cloudops/infrastructure_optimizer.py +29 -0
- runbooks/cloudops/interfaces.py +828 -0
- runbooks/cloudops/lifecycle_manager.py +29 -0
- runbooks/cloudops/mcp_cost_validation.py +678 -0
- runbooks/cloudops/models.py +251 -0
- runbooks/cloudops/monitoring_automation.py +29 -0
- runbooks/cloudops/notebook_framework.py +676 -0
- runbooks/cloudops/security_enforcer.py +449 -0
- runbooks/common/__init__.py +152 -0
- runbooks/common/accuracy_validator.py +1039 -0
- runbooks/common/context_logger.py +440 -0
- runbooks/common/cross_module_integration.py +594 -0
- runbooks/common/enhanced_exception_handler.py +1108 -0
- runbooks/common/enterprise_audit_integration.py +634 -0
- runbooks/common/mcp_cost_explorer_integration.py +900 -0
- runbooks/common/mcp_integration.py +548 -0
- runbooks/common/performance_monitor.py +387 -0
- runbooks/common/profile_utils.py +216 -0
- runbooks/common/rich_utils.py +172 -1
- runbooks/feedback/user_feedback_collector.py +440 -0
- runbooks/finops/README.md +377 -458
- runbooks/finops/__init__.py +4 -21
- runbooks/finops/account_resolver.py +279 -0
- runbooks/finops/accuracy_cross_validator.py +638 -0
- runbooks/finops/aws_client.py +721 -36
- runbooks/finops/budget_integration.py +313 -0
- runbooks/finops/cli.py +59 -5
- runbooks/finops/cost_optimizer.py +1340 -0
- runbooks/finops/cost_processor.py +211 -37
- runbooks/finops/dashboard_router.py +900 -0
- runbooks/finops/dashboard_runner.py +990 -232
- runbooks/finops/embedded_mcp_validator.py +288 -0
- runbooks/finops/enhanced_dashboard_runner.py +8 -7
- runbooks/finops/enhanced_progress.py +327 -0
- runbooks/finops/enhanced_trend_visualization.py +423 -0
- runbooks/finops/finops_dashboard.py +184 -1829
- runbooks/finops/helpers.py +509 -196
- runbooks/finops/iam_guidance.py +400 -0
- runbooks/finops/markdown_exporter.py +466 -0
- runbooks/finops/multi_dashboard.py +1502 -0
- runbooks/finops/optimizer.py +15 -15
- runbooks/finops/profile_processor.py +2 -2
- runbooks/finops/runbooks.inventory.organizations_discovery.log +0 -0
- runbooks/finops/runbooks.security.report_generator.log +0 -0
- runbooks/finops/runbooks.security.run_script.log +0 -0
- runbooks/finops/runbooks.security.security_export.log +0 -0
- runbooks/finops/schemas.py +589 -0
- runbooks/finops/service_mapping.py +195 -0
- runbooks/finops/single_dashboard.py +710 -0
- runbooks/finops/tests/test_reference_images_validation.py +1 -1
- runbooks/inventory/README.md +12 -1
- runbooks/inventory/core/collector.py +157 -29
- runbooks/inventory/list_ec2_instances.py +9 -6
- runbooks/inventory/list_ssm_parameters.py +10 -10
- runbooks/inventory/organizations_discovery.py +210 -164
- runbooks/inventory/rich_inventory_display.py +74 -107
- runbooks/inventory/run_on_multi_accounts.py +13 -13
- runbooks/inventory/runbooks.inventory.organizations_discovery.log +0 -0
- runbooks/inventory/runbooks.security.security_export.log +0 -0
- runbooks/main.py +1371 -240
- runbooks/metrics/dora_metrics_engine.py +711 -17
- runbooks/monitoring/performance_monitor.py +433 -0
- runbooks/operate/README.md +394 -0
- runbooks/operate/base.py +215 -47
- runbooks/operate/ec2_operations.py +435 -5
- runbooks/operate/iam_operations.py +598 -3
- runbooks/operate/privatelink_operations.py +1 -1
- runbooks/operate/rds_operations.py +508 -0
- runbooks/operate/s3_operations.py +508 -0
- runbooks/operate/vpc_endpoints.py +1 -1
- runbooks/remediation/README.md +489 -13
- runbooks/remediation/base.py +5 -3
- runbooks/remediation/commons.py +8 -4
- runbooks/security/ENTERPRISE_SECURITY_FRAMEWORK.md +506 -0
- runbooks/security/README.md +12 -1
- runbooks/security/__init__.py +265 -33
- runbooks/security/cloudops_automation_security_validator.py +1164 -0
- runbooks/security/compliance_automation.py +12 -10
- runbooks/security/compliance_automation_engine.py +1021 -0
- runbooks/security/enterprise_security_framework.py +930 -0
- runbooks/security/enterprise_security_policies.json +293 -0
- runbooks/security/executive_security_dashboard.py +1247 -0
- runbooks/security/integration_test_enterprise_security.py +879 -0
- runbooks/security/module_security_integrator.py +641 -0
- runbooks/security/multi_account_security_controls.py +2254 -0
- runbooks/security/real_time_security_monitor.py +1196 -0
- runbooks/security/report_generator.py +1 -1
- runbooks/security/run_script.py +4 -8
- runbooks/security/security_baseline_tester.py +39 -52
- runbooks/security/security_export.py +99 -120
- runbooks/sre/README.md +472 -0
- runbooks/sre/__init__.py +33 -0
- runbooks/sre/mcp_reliability_engine.py +1049 -0
- runbooks/sre/performance_optimization_engine.py +1032 -0
- runbooks/sre/production_monitoring_framework.py +584 -0
- runbooks/sre/reliability_monitoring_framework.py +1011 -0
- runbooks/validation/__init__.py +2 -2
- runbooks/validation/benchmark.py +154 -149
- runbooks/validation/cli.py +159 -147
- runbooks/validation/mcp_validator.py +291 -248
- runbooks/vpc/README.md +478 -0
- runbooks/vpc/__init__.py +2 -2
- runbooks/vpc/manager_interface.py +366 -351
- runbooks/vpc/networking_wrapper.py +68 -36
- runbooks/vpc/rich_formatters.py +22 -8
- runbooks-0.9.1.dist-info/METADATA +308 -0
- {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/RECORD +120 -59
- {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/entry_points.txt +1 -1
- runbooks/finops/cross_validation.py +0 -375
- runbooks-0.7.9.dist-info/METADATA +0 -636
- {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/WHEEL +0 -0
- {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/licenses/LICENSE +0 -0
- {runbooks-0.7.9.dist-info → runbooks-0.9.1.dist-info}/top_level.txt +0 -0
runbooks/sre/README.md
ADDED
@@ -0,0 +1,472 @@
|
|
1
|
+
# AWS SRE Automation & Reliability Engineering (CLI)
|
2
|
+
|
3
|
+
The AWS SRE Automation module is an enterprise-grade Site Reliability Engineering toolkit for AWS environments. Built with the Rich library and advanced MCP (Model Context Protocol) integration, it provides comprehensive reliability monitoring, automated incident response, and performance optimization capabilities.
|
4
|
+
|
5
|
+
## 📈 `*sre-runbooks*.md` Enterprise Rollout
|
6
|
+
|
7
|
+
Following the proven success patterns established in FinOps (**99/100 manager score**):
|
8
|
+
|
9
|
+
### **Rollout Strategy**: Progressive `*-runbooks*.md` standardization
|
10
|
+
- **Phase 4**: SRE rollout with `*sre-runbooks*.md` framework ✅
|
11
|
+
- **Integration**: MCP reliability engine with real-time monitoring
|
12
|
+
- **DORA Metrics**: Enterprise-grade DevOps performance measurement
|
13
|
+
|
14
|
+
## Why AWS SRE Automation?
|
15
|
+
|
16
|
+
Site Reliability Engineering requires sophisticated automation, monitoring, and incident response capabilities. The SRE Automation CLI provides enterprise-grade reliability tools designed for SRE teams, DevOps engineers, and platform engineers managing large-scale AWS environments.
|
17
|
+
|
18
|
+
Key capabilities include:
|
19
|
+
- **DORA Metrics Collection**: Lead Time, Deployment Frequency, MTTR, Change Failure Rate
|
20
|
+
- **MCP Reliability Engine**: Advanced Model Context Protocol integration for intelligent monitoring
|
21
|
+
- **Automated Incident Response**: AI-powered incident detection and automated remediation
|
22
|
+
- **Performance Monitoring**: Real-time system health and performance tracking
|
23
|
+
- **Chaos Engineering**: Controlled failure injection and resilience testing
|
24
|
+
|
25
|
+
## Table of Contents
|
26
|
+
|
27
|
+
- [Features](#features)
|
28
|
+
- [Prerequisites](#prerequisites)
|
29
|
+
- [Installation](#installation)
|
30
|
+
- [AWS CLI Profile Setup](#aws-cli-profile-setup)
|
31
|
+
- [Command Line Usage](#command-line-usage)
|
32
|
+
- [Options](#command-line-options)
|
33
|
+
- [Examples](#examples)
|
34
|
+
- [SRE Operations](#sre-operations)
|
35
|
+
- [DORA Metrics Collection](#dora-metrics-collection)
|
36
|
+
- [MCP Reliability Engine](#mcp-reliability-engine)
|
37
|
+
- [Incident Response Automation](#incident-response-automation)
|
38
|
+
- [Performance Monitoring](#performance-monitoring)
|
39
|
+
- [Chaos Engineering](#chaos-engineering)
|
40
|
+
- [Configuration](#configuration)
|
41
|
+
- [Export Formats](#export-formats)
|
42
|
+
- [Contributing](#contributing)
|
43
|
+
- [License](#license)
|
44
|
+
|
45
|
+
---
|
46
|
+
|
47
|
+
## Features
|
48
|
+
|
49
|
+
- **DORA Metrics Implementation**:
|
50
|
+
- Lead Time measurement and tracking
|
51
|
+
- Deployment Frequency monitoring
|
52
|
+
- Mean Time To Recovery (MTTR) calculation
|
53
|
+
- Change Failure Rate analysis
|
54
|
+
- Historical trending and benchmarking
|
55
|
+
- **MCP Reliability Engine**:
|
56
|
+
- Intelligent system monitoring using Model Context Protocol
|
57
|
+
- AI-powered anomaly detection
|
58
|
+
- Predictive failure analysis
|
59
|
+
- Automated remediation recommendations
|
60
|
+
- **Incident Response Automation**:
|
61
|
+
- Automated incident detection and classification
|
62
|
+
- Escalation path management
|
63
|
+
- Post-incident review automation
|
64
|
+
- Runbook execution and validation
|
65
|
+
- **Performance Monitoring**:
|
66
|
+
- Real-time system health dashboards
|
67
|
+
- Application performance monitoring
|
68
|
+
- Infrastructure utilization tracking
|
69
|
+
- Cost-performance optimization
|
70
|
+
- **Chaos Engineering**:
|
71
|
+
- Controlled failure injection
|
72
|
+
- Resilience testing automation
|
73
|
+
- Failure scenario simulation
|
74
|
+
- Recovery validation
|
75
|
+
- **Enterprise Integration**:
|
76
|
+
- PagerDuty and ServiceNow integration
|
77
|
+
- Slack and Teams notifications
|
78
|
+
- Jira and Confluence automation
|
79
|
+
- Custom webhook support
|
80
|
+
- **Rich Terminal UI**: Beautiful console output with real-time metrics and charts
|
81
|
+
|
82
|
+
---
|
83
|
+
|
84
|
+
## Prerequisites
|
85
|
+
|
86
|
+
- **Python 3.8 or later**: Ensure you have the required Python version installed
|
87
|
+
- **AWS CLI configured with named profiles**: Set up your AWS CLI profiles for seamless integration
|
88
|
+
- **AWS credentials with permissions**:
|
89
|
+
- `cloudwatch:*` (for metrics collection and monitoring)
|
90
|
+
- `logs:*` (for log analysis and aggregation)
|
91
|
+
- `events:*` (for event-driven automation)
|
92
|
+
- `lambda:*` (for serverless automation functions)
|
93
|
+
- `sns:*` (for notification management)
|
94
|
+
- `sts:GetCallerIdentity` (for identity validation)
|
95
|
+
|
96
|
+
---
|
97
|
+
|
98
|
+
## Installation
|
99
|
+
|
100
|
+
### Option 1: Using uv (Fast Python Package Installer)
|
101
|
+
```bash
|
102
|
+
# Install runbooks with SRE automation
|
103
|
+
uv pip install runbooks
|
104
|
+
```
|
105
|
+
|
106
|
+
### Option 2: Using pip
|
107
|
+
```bash
|
108
|
+
# Install runbooks package
|
109
|
+
pip install runbooks
|
110
|
+
```
|
111
|
+
|
112
|
+
---
|
113
|
+
|
114
|
+
## Command Line Usage
|
115
|
+
|
116
|
+
Run SRE operations using `runbooks sre` followed by an operation and its options:
|
117
|
+
|
118
|
+
```bash
|
119
|
+
runbooks sre [operation] [options]
|
120
|
+
```
|
121
|
+
|
122
|
+
### Command Line Options
|
123
|
+
|
124
|
+
| Flag | Description |
|
125
|
+
|---|---|
|
126
|
+
| `--profile`, `-p` | AWS profile to use for operations |
|
127
|
+
| `--region`, `-r` | AWS region to monitor (default: us-east-1) |
|
128
|
+
| `--all-regions` | Monitor across all available regions |
|
129
|
+
| `--time-range` | Time range for metrics: 1h, 6h, 24h, 7d, 30d |
|
130
|
+
| `--output-format` | Output format: table, json, csv, html |
|
131
|
+
| `--dashboard` | Launch interactive dashboard |
|
132
|
+
| `--real-time` | Enable real-time monitoring mode |
|
133
|
+
| `--mcp-enabled` | Enable MCP reliability engine |
|
134
|
+
|
135
|
+
### Examples
|
136
|
+
|
137
|
+
```bash
|
138
|
+
# DORA metrics collection
|
139
|
+
runbooks sre dora --time-range 30d --profile production
|
140
|
+
|
141
|
+
# MCP reliability engine monitoring
|
142
|
+
runbooks sre monitor --mcp-enabled --dashboard --profile production
|
143
|
+
|
144
|
+
# Incident response automation
|
145
|
+
runbooks sre incident respond --incident-id INC-12345 --profile production
|
146
|
+
|
147
|
+
# Performance monitoring dashboard
|
148
|
+
runbooks sre performance --dashboard --real-time --profile production
|
149
|
+
|
150
|
+
# Chaos engineering experiment
|
151
|
+
runbooks sre chaos --experiment network-partition --duration 300s --profile staging
|
152
|
+
```
|
153
|
+
|
154
|
+
---
|
155
|
+
|
156
|
+
## SRE Operations
|
157
|
+
|
158
|
+
### DORA Metrics Collection
|
159
|
+
|
160
|
+
**Comprehensive DORA Metrics**:
|
161
|
+
```bash
|
162
|
+
# Collect all DORA metrics
|
163
|
+
runbooks sre dora --metrics all --time-range 30d --profile production
|
164
|
+
|
165
|
+
# Lead Time analysis
|
166
|
+
runbooks sre dora --metrics lead-time --time-range 7d --profile production
|
167
|
+
|
168
|
+
# Deployment frequency tracking
|
169
|
+
runbooks sre dora --metrics deployment-frequency --profile production
|
170
|
+
|
171
|
+
# MTTR calculation
|
172
|
+
runbooks sre dora --metrics mttr --time-range 90d --profile production
|
173
|
+
```
|
174
|
+
|
175
|
+
**Expected DORA Output**:
|
176
|
+
```
|
177
|
+
╭─ DORA Metrics Summary (Last 30 Days) ─╮
|
178
|
+
│ │
|
179
|
+
│ 🚀 Lead Time: 2.4 hours │
|
180
|
+
│ Target: <4 hours ✅ │
|
181
|
+
│ │
|
182
|
+
│ 📊 Deployment Frequency: 12.3/day │
|
183
|
+
│ Target: Daily ✅ │
|
184
|
+
│ │
|
185
|
+
│ ⚡ MTTR: 47 minutes │
|
186
|
+
│ Target: <1 hour ✅ │
|
187
|
+
│ │
|
188
|
+
│ ❌ Change Failure Rate: 3.2% │
|
189
|
+
│ Target: <5% ✅ │
|
190
|
+
│ │
|
191
|
+
│ 🏆 Overall DORA Score: Elite (95/100) │
|
192
|
+
╰────────────────────────────────────────╯
|
193
|
+
```
|
194
|
+
|
195
|
+
### MCP Reliability Engine
|
196
|
+
|
197
|
+
**Intelligent Monitoring with MCP**:
|
198
|
+
```bash
|
199
|
+
# Enable MCP reliability engine
|
200
|
+
runbooks sre mcp-engine start --profile production
|
201
|
+
|
202
|
+
# AI-powered anomaly detection
|
203
|
+
runbooks sre mcp-engine analyze --anomaly-detection --profile production
|
204
|
+
|
205
|
+
# Predictive failure analysis
|
206
|
+
runbooks sre mcp-engine predict --lookback 7d --forecast 24h --profile production
|
207
|
+
|
208
|
+
# Automated remediation suggestions
|
209
|
+
runbooks sre mcp-engine remediate --incident-type high-cpu --profile production
|
210
|
+
```
|
211
|
+
|
212
|
+
**MCP Engine Output**:
|
213
|
+
```
|
214
|
+
╭─ MCP Reliability Engine Status ─╮
|
215
|
+
│ │
|
216
|
+
│ 🧠 AI Analysis: Active │
|
217
|
+
│ 📈 Anomalies Detected: 3 │
|
218
|
+
│ ⚠️ Predictions: 2 warnings │
|
219
|
+
│ 🔧 Auto-Remediation: Enabled │
|
220
|
+
│ │
|
221
|
+
│ 🎯 Current Reliability Score: │
|
222
|
+
│ 97.8% (Target: 99.9%) │
|
223
|
+
│ │
|
224
|
+
│ 🚨 Recent Alerts: │
|
225
|
+
│ • High CPU: web-server-01 │
|
226
|
+
│ • Memory leak: api-service-03 │
|
227
|
+
│ • Disk usage: db-server-02 │
|
228
|
+
╰──────────────────────────────────╯
|
229
|
+
```
|
230
|
+
|
231
|
+
### Incident Response Automation
|
232
|
+
|
233
|
+
**Automated Incident Management**:
|
234
|
+
```bash
|
235
|
+
# Detect and classify incidents
|
236
|
+
runbooks sre incident detect --auto-classify --profile production
|
237
|
+
|
238
|
+
# Automated response execution
|
239
|
+
runbooks sre incident respond --incident-id INC-12345 --auto-remediate --profile production
|
240
|
+
|
241
|
+
# Post-incident review automation
|
242
|
+
runbooks sre incident review --incident-id INC-12345 --generate-report --profile production
|
243
|
+
|
244
|
+
# Runbook execution
|
245
|
+
runbooks sre runbook execute --runbook-id rb-high-cpu-response --profile production
|
246
|
+
```
|
247
|
+
|
248
|
+
### Performance Monitoring
|
249
|
+
|
250
|
+
**Real-Time Performance Dashboard**:
|
251
|
+
```bash
|
252
|
+
# Launch performance dashboard
|
253
|
+
runbooks sre performance --dashboard --real-time --profile production
|
254
|
+
|
255
|
+
# Application performance monitoring
|
256
|
+
runbooks sre monitor --application web-app --profile production
|
257
|
+
|
258
|
+
# Infrastructure utilization
|
259
|
+
runbooks sre monitor --infrastructure --include-costs --profile production
|
260
|
+
|
261
|
+
# Custom metrics collection
|
262
|
+
runbooks sre monitor --custom-metrics config.yaml --profile production
|
263
|
+
```
|
264
|
+
|
265
|
+
### Chaos Engineering
|
266
|
+
|
267
|
+
**Controlled Failure Testing**:
|
268
|
+
```bash
|
269
|
+
# Network partition experiment
|
270
|
+
runbooks sre chaos --experiment network-partition --target web-tier --duration 300s --profile staging
|
271
|
+
|
272
|
+
# CPU stress testing
|
273
|
+
runbooks sre chaos --experiment cpu-stress --intensity 80% --duration 600s --profile staging
|
274
|
+
|
275
|
+
# Memory exhaustion test
|
276
|
+
runbooks sre chaos --experiment memory-leak --rate 10MB/s --duration 300s --profile staging
|
277
|
+
|
278
|
+
# Service dependency failure
|
279
|
+
runbooks sre chaos --experiment service-failure --target payment-service --profile staging
|
280
|
+
```
|
281
|
+
|
282
|
+
---
|
283
|
+
|
284
|
+
## Configuration
|
285
|
+
|
286
|
+
### SRE Configuration File
|
287
|
+
|
288
|
+
Create an `sre_config.toml` file:
|
289
|
+
|
290
|
+
```toml
|
291
|
+
# sre_config.toml
|
292
|
+
[dora_metrics]
|
293
|
+
lead_time_target = "4h"
|
294
|
+
deployment_frequency_target = "daily"
|
295
|
+
mttr_target = "1h"
|
296
|
+
change_failure_rate_target = "5%"
|
297
|
+
|
298
|
+
[mcp_engine]
|
299
|
+
enabled = true
|
300
|
+
anomaly_threshold = 0.95
|
301
|
+
prediction_window = "24h"
|
302
|
+
auto_remediation = true
|
303
|
+
|
304
|
+
[monitoring]
|
305
|
+
dashboard_refresh = "30s"
|
306
|
+
alert_threshold = "95th_percentile"
|
307
|
+
notification_channels = ["slack", "pagerduty", "email"]
|
308
|
+
|
309
|
+
[chaos_engineering]
|
310
|
+
enabled_environments = ["staging", "pre-prod"]
|
311
|
+
max_blast_radius = "10%"
|
312
|
+
safety_checks = true
|
313
|
+
|
314
|
+
[integrations]
|
315
|
+
pagerduty_api_key = "${PAGERDUTY_API_KEY}"
|
316
|
+
slack_webhook = "${SLACK_WEBHOOK_URL}"
|
317
|
+
jira_url = "${JIRA_BASE_URL}"
|
318
|
+
|
319
|
+
[profiles]
|
320
|
+
production = "sre-prod-profile"
|
321
|
+
staging = "sre-staging-profile"
|
322
|
+
```
|
323
|
+
|
324
|
+
**Using the Configuration File**:
|
325
|
+
```bash
|
326
|
+
runbooks sre --config sre_config.toml dora --metrics all
|
327
|
+
```
|
328
|
+
|
329
|
+
---
|
330
|
+
|
331
|
+
## Export Formats
|
332
|
+
|
333
|
+
### JSON Output Format
|
334
|
+
|
335
|
+
```bash
|
336
|
+
runbooks sre dora --output-format json --output-file dora_metrics.json --profile production
|
337
|
+
```
|
338
|
+
|
339
|
+
```json
|
340
|
+
{
|
341
|
+
"dora_metrics": {
|
342
|
+
"timestamp": "2024-01-15T10:30:00Z",
|
343
|
+
"time_range": "30d",
|
344
|
+
"lead_time": {
|
345
|
+
"value": 2.4,
|
346
|
+
"unit": "hours",
|
347
|
+
"target": 4,
|
348
|
+
"status": "meeting_target"
|
349
|
+
},
|
350
|
+
"deployment_frequency": {
|
351
|
+
"value": 12.3,
|
352
|
+
"unit": "per_day",
|
353
|
+
"target": "daily",
|
354
|
+
"status": "exceeding_target"
|
355
|
+
},
|
356
|
+
"mttr": {
|
357
|
+
"value": 47,
|
358
|
+
"unit": "minutes",
|
359
|
+
"target": 60,
|
360
|
+
"status": "meeting_target"
|
361
|
+
},
|
362
|
+
"change_failure_rate": {
|
363
|
+
"value": 3.2,
|
364
|
+
"unit": "percent",
|
365
|
+
"target": 5,
|
366
|
+
"status": "meeting_target"
|
367
|
+
}
|
368
|
+
}
|
369
|
+
}
|
370
|
+
```
|
371
|
+
|
372
|
+
### HTML Dashboard Export
|
373
|
+
|
374
|
+
```bash
|
375
|
+
runbooks sre dashboard --export-html --output-file sre_dashboard.html --profile production
|
376
|
+
```
|
377
|
+
|
378
|
+
---
|
379
|
+
|
380
|
+
## Enterprise Integration
|
381
|
+
|
382
|
+
### PagerDuty Integration
|
383
|
+
|
384
|
+
```bash
|
385
|
+
# Configure PagerDuty integration
|
386
|
+
runbooks sre configure --service pagerduty --api-key $PAGERDUTY_API_KEY
|
387
|
+
|
388
|
+
# Automated incident creation
|
389
|
+
runbooks sre incident create --severity critical --title "High CPU Alert" --service web-app
|
390
|
+
```
|
391
|
+
|
392
|
+
### Slack Notifications
|
393
|
+
|
394
|
+
```bash
|
395
|
+
# Configure Slack notifications
|
396
|
+
runbooks sre configure --service slack --webhook $SLACK_WEBHOOK_URL
|
397
|
+
|
398
|
+
# Send status updates
|
399
|
+
runbooks sre notify --channel "#sre-alerts" --message "DORA metrics updated"
|
400
|
+
```
|
401
|
+
|
402
|
+
### ServiceNow Integration
|
403
|
+
|
404
|
+
```bash
|
405
|
+
# ServiceNow incident management
|
406
|
+
runbooks sre incident create --platform servicenow --category performance --priority high
|
407
|
+
```
|
408
|
+
|
409
|
+
---
|
410
|
+
|
411
|
+
## Advanced MCP Features
|
412
|
+
|
413
|
+
### MCP Reliability Health Checker
|
414
|
+
|
415
|
+
The MCP reliability engine includes a comprehensive health checking system:
|
416
|
+
|
417
|
+
```bash
|
418
|
+
# Run MCP health checks
|
419
|
+
runbooks sre mcp-health-check --profile production
|
420
|
+
|
421
|
+
# Deep system analysis
|
422
|
+
runbooks sre mcp-analyze --deep-scan --profile production
|
423
|
+
|
424
|
+
# Generate reliability report
|
425
|
+
runbooks sre mcp-report --comprehensive --output reliability_report.html
|
426
|
+
```
|
427
|
+
|
428
|
+
### AI-Powered Remediation
|
429
|
+
|
430
|
+
```bash
|
431
|
+
# Get AI remediation suggestions
|
432
|
+
runbooks sre ai-remediate --issue high-latency --context "web application" --profile production
|
433
|
+
|
434
|
+
# Execute automated fixes
|
435
|
+
runbooks sre ai-remediate --auto-execute --confirm --profile production
|
436
|
+
```
|
437
|
+
|
438
|
+
---
|
439
|
+
|
440
|
+
## Contributing
|
441
|
+
|
442
|
+
We welcome contributions! Please see our [Contributing Guidelines](../../../CONTRIBUTING.md) for details.
|
443
|
+
|
444
|
+
### Development Setup
|
445
|
+
```bash
|
446
|
+
git clone https://github.com/1xOps/CloudOps-Runbooks.git
|
447
|
+
cd CloudOps-Runbooks
|
448
|
+
uv sync --all-extras
|
449
|
+
uv run python -m runbooks sre --help
|
450
|
+
```
|
451
|
+
|
452
|
+
### Running Tests
|
453
|
+
```bash
|
454
|
+
uv run pytest tests/sre/ -v
|
455
|
+
```
|
456
|
+
|
457
|
+
---
|
458
|
+
|
459
|
+
## License
|
460
|
+
|
461
|
+
This project is licensed under the Apache License 2.0 - see the [LICENSE](../../../LICENSE) file for details.
|
462
|
+
|
463
|
+
---
|
464
|
+
|
465
|
+
## Enterprise Support
|
466
|
+
|
467
|
+
For enterprise support, professional services, and custom SRE integrations:
|
468
|
+
- **Email**: [info@oceansoft.io](mailto:info@oceansoft.io)
|
469
|
+
- **GitHub**: [CloudOps Runbooks Issues](https://github.com/1xOps/CloudOps-Runbooks/issues)
|
470
|
+
- **Documentation**: [Enterprise SRE Documentation](https://docs.cloudops-runbooks.io/sre)
|
471
|
+
|
472
|
+
Let's build reliable, automated systems together. 🚀
|
runbooks/sre/__init__.py
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
SRE (Site Reliability Engineering) Module for CloudOps-Runbooks
|
4
|
+
|
5
|
+
This module provides enterprise-grade SRE automation capabilities including:
|
6
|
+
- Infrastructure monitoring and alerting
|
7
|
+
- Automated incident response and recovery
|
8
|
+
- Performance optimization and capacity planning
|
9
|
+
- Reliability engineering and chaos testing
|
10
|
+
- MCP integration reliability and health monitoring
|
11
|
+
|
12
|
+
Components:
|
13
|
+
- mcp_reliability_engine: Enterprise MCP reliability automation
|
14
|
+
- incident_response: Automated incident detection and response
|
15
|
+
- performance_monitoring: Real-time performance tracking
|
16
|
+
- chaos_engineering: Resilience testing framework
|
17
|
+
"""
|
18
|
+
|
19
|
+
from .mcp_reliability_engine import (
|
20
|
+
MCPConnectionPool,
|
21
|
+
MCPHealthCheck,
|
22
|
+
MCPReliabilityEngine,
|
23
|
+
run_mcp_reliability_suite,
|
24
|
+
)
|
25
|
+
|
26
|
+
__version__ = "1.0.0"
|
27
|
+
|
28
|
+
__all__ = [
|
29
|
+
"MCPReliabilityEngine",
|
30
|
+
"MCPConnectionPool",
|
31
|
+
"MCPHealthCheck",
|
32
|
+
"run_mcp_reliability_suite",
|
33
|
+
]
|