runbooks 0.7.6__py3-none-any.whl → 0.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- runbooks/__init__.py +1 -1
- runbooks/base.py +5 -1
- runbooks/cfat/__init__.py +8 -4
- runbooks/cfat/assessment/collectors.py +171 -14
- runbooks/cfat/assessment/compliance.py +871 -0
- runbooks/cfat/assessment/runner.py +122 -11
- runbooks/cfat/models.py +6 -2
- runbooks/common/logger.py +14 -0
- runbooks/common/rich_utils.py +451 -0
- runbooks/enterprise/__init__.py +68 -0
- runbooks/enterprise/error_handling.py +411 -0
- runbooks/enterprise/logging.py +439 -0
- runbooks/enterprise/multi_tenant.py +583 -0
- runbooks/finops/README.md +468 -241
- runbooks/finops/__init__.py +39 -3
- runbooks/finops/cli.py +83 -18
- runbooks/finops/cross_validation.py +375 -0
- runbooks/finops/dashboard_runner.py +812 -164
- runbooks/finops/enhanced_dashboard_runner.py +525 -0
- runbooks/finops/finops_dashboard.py +1892 -0
- runbooks/finops/helpers.py +485 -51
- runbooks/finops/optimizer.py +823 -0
- runbooks/finops/tests/__init__.py +19 -0
- runbooks/finops/tests/results_test_finops_dashboard.xml +1 -0
- runbooks/finops/tests/run_comprehensive_tests.py +421 -0
- runbooks/finops/tests/run_tests.py +305 -0
- runbooks/finops/tests/test_finops_dashboard.py +705 -0
- runbooks/finops/tests/test_integration.py +477 -0
- runbooks/finops/tests/test_performance.py +380 -0
- runbooks/finops/tests/test_performance_benchmarks.py +500 -0
- runbooks/finops/tests/test_reference_images_validation.py +867 -0
- runbooks/finops/tests/test_single_account_features.py +715 -0
- runbooks/finops/tests/validate_test_suite.py +220 -0
- runbooks/finops/types.py +1 -1
- runbooks/hitl/enhanced_workflow_engine.py +725 -0
- runbooks/inventory/artifacts/scale-optimize-status.txt +12 -0
- runbooks/inventory/collectors/aws_comprehensive.py +442 -0
- runbooks/inventory/collectors/enterprise_scale.py +281 -0
- runbooks/inventory/core/collector.py +172 -13
- runbooks/inventory/discovery.md +1 -1
- runbooks/inventory/list_ec2_instances.py +18 -20
- runbooks/inventory/list_ssm_parameters.py +31 -3
- runbooks/inventory/organizations_discovery.py +1269 -0
- runbooks/inventory/rich_inventory_display.py +393 -0
- runbooks/inventory/run_on_multi_accounts.py +35 -19
- runbooks/inventory/runbooks.security.report_generator.log +0 -0
- runbooks/inventory/runbooks.security.run_script.log +0 -0
- runbooks/inventory/vpc_flow_analyzer.py +1030 -0
- runbooks/main.py +2215 -119
- runbooks/metrics/dora_metrics_engine.py +599 -0
- runbooks/operate/__init__.py +2 -2
- runbooks/operate/base.py +122 -10
- runbooks/operate/deployment_framework.py +1032 -0
- runbooks/operate/deployment_validator.py +853 -0
- runbooks/operate/dynamodb_operations.py +10 -6
- runbooks/operate/ec2_operations.py +319 -11
- runbooks/operate/executive_dashboard.py +779 -0
- runbooks/operate/mcp_integration.py +750 -0
- runbooks/operate/nat_gateway_operations.py +1120 -0
- runbooks/operate/networking_cost_heatmap.py +685 -0
- runbooks/operate/privatelink_operations.py +940 -0
- runbooks/operate/s3_operations.py +10 -6
- runbooks/operate/vpc_endpoints.py +644 -0
- runbooks/operate/vpc_operations.py +1038 -0
- runbooks/remediation/__init__.py +2 -2
- runbooks/remediation/acm_remediation.py +1 -1
- runbooks/remediation/base.py +1 -1
- runbooks/remediation/cloudtrail_remediation.py +1 -1
- runbooks/remediation/cognito_remediation.py +1 -1
- runbooks/remediation/dynamodb_remediation.py +1 -1
- runbooks/remediation/ec2_remediation.py +1 -1
- runbooks/remediation/ec2_unattached_ebs_volumes.py +1 -1
- runbooks/remediation/kms_enable_key_rotation.py +1 -1
- runbooks/remediation/kms_remediation.py +1 -1
- runbooks/remediation/lambda_remediation.py +1 -1
- runbooks/remediation/multi_account.py +1 -1
- runbooks/remediation/rds_remediation.py +1 -1
- runbooks/remediation/s3_block_public_access.py +1 -1
- runbooks/remediation/s3_enable_access_logging.py +1 -1
- runbooks/remediation/s3_encryption.py +1 -1
- runbooks/remediation/s3_remediation.py +1 -1
- runbooks/remediation/vpc_remediation.py +475 -0
- runbooks/security/__init__.py +3 -1
- runbooks/security/compliance_automation.py +632 -0
- runbooks/security/report_generator.py +10 -0
- runbooks/security/run_script.py +31 -5
- runbooks/security/security_baseline_tester.py +169 -30
- runbooks/security/security_export.py +477 -0
- runbooks/validation/__init__.py +10 -0
- runbooks/validation/benchmark.py +484 -0
- runbooks/validation/cli.py +356 -0
- runbooks/validation/mcp_validator.py +768 -0
- runbooks/vpc/__init__.py +38 -0
- runbooks/vpc/config.py +212 -0
- runbooks/vpc/cost_engine.py +347 -0
- runbooks/vpc/heatmap_engine.py +605 -0
- runbooks/vpc/manager_interface.py +634 -0
- runbooks/vpc/networking_wrapper.py +1260 -0
- runbooks/vpc/rich_formatters.py +679 -0
- runbooks/vpc/tests/__init__.py +5 -0
- runbooks/vpc/tests/conftest.py +356 -0
- runbooks/vpc/tests/test_cli_integration.py +530 -0
- runbooks/vpc/tests/test_config.py +458 -0
- runbooks/vpc/tests/test_cost_engine.py +479 -0
- runbooks/vpc/tests/test_networking_wrapper.py +512 -0
- {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/METADATA +40 -12
- {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/RECORD +111 -50
- {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/WHEEL +0 -0
- {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/entry_points.txt +0 -0
- {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/licenses/LICENSE +0 -0
- {runbooks-0.7.6.dist-info → runbooks-0.7.9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1032 @@
|
|
1
|
+
"""
|
2
|
+
Production Deployment Framework for AWS Networking Cost Optimization
|
3
|
+
Terminal 5: Deploy Agent - Enterprise Security-as-Code Implementation
|
4
|
+
|
5
|
+
Comprehensive production deployment framework with enterprise-grade safety controls,
|
6
|
+
monitoring, alerting, and rollback procedures for AWS networking cost optimization.
|
7
|
+
|
8
|
+
Features:
|
9
|
+
- Default DRY-RUN mode for all operations
|
10
|
+
- Management approval gates for cost impact >$1000
|
11
|
+
- Comprehensive rollback procedures with automated recovery
|
12
|
+
- Zero-downtime deployment approach with canary strategy
|
13
|
+
- Real-time monitoring with alerting on execution failures
|
14
|
+
- MCP server integration for production validation
|
15
|
+
- Executive dashboard deployment with ROI tracking
|
16
|
+
|
17
|
+
Production Safety Requirements:
|
18
|
+
- All destructive operations default to dry-run mode
|
19
|
+
- Cost impact validation with approval workflows
|
20
|
+
- Automated rollback on performance degradation
|
21
|
+
- Comprehensive audit trails and compliance tracking
|
22
|
+
- Multi-profile AWS integration with proper RBAC
|
23
|
+
"""
|
24
|
+
|
25
|
+
import asyncio
|
26
|
+
import json
|
27
|
+
import time
|
28
|
+
from concurrent.futures import ThreadPoolExecutor
|
29
|
+
from dataclasses import dataclass, field
|
30
|
+
from datetime import datetime, timedelta
|
31
|
+
from enum import Enum
|
32
|
+
from pathlib import Path
|
33
|
+
from typing import Any, Callable, Dict, List, Optional, Union
|
34
|
+
|
35
|
+
import boto3
|
36
|
+
from botocore.exceptions import ClientError
|
37
|
+
from loguru import logger
|
38
|
+
|
39
|
+
from runbooks.common.rich_utils import RichConsole
|
40
|
+
from runbooks.operate.base import BaseOperation, OperationContext, OperationResult, OperationStatus
|
41
|
+
from runbooks.operate.vpc_operations import VPCOperations
|
42
|
+
|
43
|
+
|
44
|
+
class DeploymentStrategy(Enum):
    """Rollout strategies a deployment plan can select.

    The member values are plain strings; the framework formats them for
    display and writes them into deployment reports.
    """

    BLUE_GREEN = "blue_green"
    CANARY = "canary"
    ROLLING = "rolling"
    ALL_AT_ONCE = "all_at_once"
|
51
|
+
|
52
|
+
|
53
|
+
class ApprovalStatus(Enum):
    """Lifecycle states of an approval request for a production operation."""

    PENDING = "pending"
    APPROVED = "approved"
    REJECTED = "rejected"
    EXPIRED = "expired"
|
60
|
+
|
61
|
+
|
62
|
+
class MonitoringAlert(Enum):
    """Severity levels for monitoring alerts, listed from most to least severe."""

    CRITICAL = "critical"
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
    INFO = "info"
|
70
|
+
|
71
|
+
|
72
|
+
@dataclass
class ApprovalRequest:
    """Approval request for a production operation, with its business context.

    ``created_at`` defaults to the (naive UTC) time of construction. When
    ``expires_at`` is not supplied it is derived as ``created_at`` + 24 hours.
    """

    # Identification of the request and the operation it covers
    request_id: str
    operation_type: str
    resource_id: str
    # Projected cost impact (figures are supplied by the caller)
    cost_impact_monthly: float
    cost_impact_annual: float
    # Free-text context shown to the approver
    business_justification: str
    risk_assessment: str
    requestor: str
    # Lifecycle tracking
    created_at: datetime = field(default_factory=datetime.utcnow)
    expires_at: Optional[datetime] = None
    status: ApprovalStatus = ApprovalStatus.PENDING
    approver: Optional[str] = None
    approval_notes: Optional[str] = None

    def __post_init__(self):
        """Fill in the default 24-hour approval window when no expiry was given."""
        if self.expires_at is not None:
            return
        self.expires_at = self.created_at + timedelta(hours=24)
|
94
|
+
|
95
|
+
|
96
|
+
@dataclass
class DeploymentPlan:
    """Description of a deployment together with its safety controls.

    The first five fields are required; every safety switch, threshold and
    timing value has a default.
    """

    deployment_id: str
    strategy: DeploymentStrategy
    target_accounts: List[str]
    target_regions: List[str]
    operations: List[Dict[str, Any]]

    # Safety switches (all enabled unless the caller opts out)
    approval_required: bool = True
    dry_run_first: bool = True
    rollback_enabled: bool = True
    monitoring_enabled: bool = True
    cost_threshold: float = 1000.0  # monthly cost (USD) above which approval is needed

    # Health thresholds compared against monitoring metrics
    error_rate_threshold: float = 0.05  # 5% error rate
    latency_threshold: float = 12.0  # seconds
    availability_threshold: float = 0.995  # 99.5% minimum availability

    # Phase durations, in seconds
    canary_duration: int = 300  # 5 minutes
    rollout_duration: int = 1800  # 30 minutes
    monitoring_duration: int = 3600  # 1 hour of post-deployment monitoring
|
120
|
+
|
121
|
+
|
122
|
+
@dataclass
class DeploymentStatus:
    """Mutable progress record for a single deployment.

    Only the identifier, the current phase name and the start time are
    required; counters and metrics start from neutral defaults.
    """

    deployment_id: str
    current_phase: str
    started_at: datetime
    completed_at: Optional[datetime] = None

    # Progress and outcome counters
    progress_percentage: float = 0.0
    successful_operations: int = 0
    failed_operations: int = 0

    # Rollback bookkeeping
    rollback_triggered: bool = False
    rollback_reason: Optional[str] = None

    # Performance metrics
    avg_execution_time: float = 0.0
    error_rate: float = 0.0
    availability_score: float = 1.0
|
140
|
+
|
141
|
+
|
142
|
+
class ProductionDeploymentFramework(BaseOperation):
|
143
|
+
"""
|
144
|
+
Enterprise Production Deployment Framework
|
145
|
+
|
146
|
+
Terminal 5: Deploy Agent implementation with comprehensive safety controls,
|
147
|
+
monitoring, rollback procedures, and compliance tracking for AWS networking
|
148
|
+
cost optimization campaigns.
|
149
|
+
|
150
|
+
Core Features:
|
151
|
+
- Multi-stage deployment with approval gates
|
152
|
+
- Real-time performance monitoring and alerting
|
153
|
+
- Automated rollback on performance degradation
|
154
|
+
- Comprehensive audit trails and compliance tracking
|
155
|
+
- MCP server integration for validation
|
156
|
+
- Executive dashboard and ROI tracking
|
157
|
+
"""
|
158
|
+
|
159
|
+
service_name = "deployment-framework"
|
160
|
+
supported_operations = {
|
161
|
+
"deploy_optimization_campaign",
|
162
|
+
"validate_deployment_plan",
|
163
|
+
"execute_canary_deployment",
|
164
|
+
"monitor_deployment_health",
|
165
|
+
"trigger_rollback",
|
166
|
+
"generate_deployment_report",
|
167
|
+
"setup_monitoring_alerts",
|
168
|
+
"create_approval_request",
|
169
|
+
"process_approval_workflow",
|
170
|
+
}
|
171
|
+
requires_confirmation = True
|
172
|
+
|
173
|
+
def __init__(self, profile: Optional[str] = None, region: Optional[str] = None, dry_run: bool = True):
    """
    Set up the deployment framework and its collaborators.

    Args:
        profile: AWS profile for authentication
        region: AWS region for operations
        dry_run: Enable dry-run mode (on by default for safety)
    """
    super().__init__(profile, region, dry_run)

    # Collaborators
    self.rich_console = RichConsole()
    self.vpc_operations = VPCOperations(profile, region, dry_run)

    # Safety defaults: dry-run is always the default mode, and approvals are
    # gated on a 24-hour window and a $1000 monthly cost threshold.
    self.default_dry_run = True
    self.approval_timeout_hours = 24
    self.cost_approval_threshold = 1000.0

    # Monitoring configuration (interval and timeout are in seconds)
    self.monitoring_interval = 30
    self.health_check_timeout = 10
    self.max_retries = 3

    # AWS profiles for multi-account operations.
    # NOTE(review): these profile names are hard-coded and environment
    # specific - confirm whether they should come from configuration.
    self.aws_profiles = dict(
        single_account="ams-shared-services-non-prod-ReadOnlyAccess-499201730520",
        centralised_ops="ams-centralised-ops-ReadOnlyAccess-335083429030",
        billing="ams-admin-Billing-ReadOnlyAccess-909135376185",
    )

    # In-memory tracking, keyed by deployment id / approval request id
    self.active_deployments: Dict[str, DeploymentStatus] = {}
    self.approval_requests: Dict[str, ApprovalRequest] = {}

    # Artifact directory is created eagerly, relative to the working directory
    artifacts_root = Path("artifacts/deployments")
    self.artifacts_dir = artifacts_root
    artifacts_root.mkdir(parents=True, exist_ok=True)

    startup_message = f"Production Deployment Framework initialized - Safety Mode: {self.default_dry_run}"
    logger.info(startup_message)
|
212
|
+
|
213
|
+
async def deploy_optimization_campaign(self, deployment_plan: DeploymentPlan) -> Dict[str, Any]:
    """
    Execute an AWS networking cost optimization deployment campaign.

    Runs the phases in order: plan validation, approval workflow (when
    ``approval_required``), dry-run (when ``dry_run_first``), production
    deployment, post-deployment monitoring (when ``monitoring_enabled``)
    and report generation. The first failing phase ends the campaign.

    Args:
        deployment_plan: Deployment configuration

    Returns:
        Dict with a ``status`` key of ``"success"``, ``"failed"`` or
        ``"cancelled"``. Failed/cancelled results name the ``phase`` (except
        for unexpected exceptions) and carry an ``error``/``reason``.
        Successful results carry operation counters, the monitoring result
        (``None`` when monitoring is disabled) and the deployment report.
    """
    deployment_id = deployment_plan.deployment_id
    total_monthly_cost = sum(op.get("cost_impact", 0) for op in deployment_plan.operations)

    self.rich_console.print_panel(
        "🚀 Production Deployment Campaign",
        f"Deployment ID: {deployment_id}\n"
        f"Strategy: {deployment_plan.strategy.value}\n"
        f"Target Accounts: {len(deployment_plan.target_accounts)}\n"
        f"Operations: {len(deployment_plan.operations)}\n"
        f"Cost Impact: ${total_monthly_cost:.0f}/month\n"
        f"Safety Mode: {'ENABLED' if deployment_plan.dry_run_first else 'DISABLED'}",
        title="🏗️ Enterprise Deployment",
    )

    # Created before the try block so the exception handler below can always
    # inspect it, whatever fails afterwards.
    deployment_status = DeploymentStatus(
        deployment_id=deployment_id, current_phase="initialization", started_at=datetime.utcnow()
    )
    self.active_deployments[deployment_id] = deployment_status

    try:
        # Phase 1: Pre-deployment validation
        validation_result = await self._validate_deployment_plan(deployment_plan)
        if not validation_result["success"]:
            return {"status": "failed", "phase": "validation", "error": validation_result["error"]}

        deployment_status.current_phase = "validation_complete"
        deployment_status.progress_percentage = 10.0

        # Phase 2: Approval workflow (if required)
        if deployment_plan.approval_required:
            approval_result = await self._process_approval_workflow(deployment_plan)
            if not approval_result["approved"]:
                return {"status": "cancelled", "phase": "approval", "reason": approval_result["reason"]}

            deployment_status.current_phase = "approved"
            deployment_status.progress_percentage = 20.0

        # Phase 3: Dry-run execution (if enabled)
        if deployment_plan.dry_run_first:
            dry_run_result = await self._execute_dry_run(deployment_plan)
            if not dry_run_result["success"]:
                return {"status": "failed", "phase": "dry_run", "error": dry_run_result["error"]}

            deployment_status.current_phase = "dry_run_complete"
            deployment_status.progress_percentage = 40.0

        # Phase 4: Production deployment
        deployment_result = await self._execute_production_deployment(deployment_plan, deployment_status)

        # A failed rollout must not be reported as a successful campaign, and
        # there is nothing worth monitoring for an hour afterwards.
        if not deployment_result or not deployment_result.get("success", False):
            return {
                "status": "failed",
                "phase": "production_deployment",
                "deployment_id": deployment_id,
                "error": (deployment_result or {}).get("error", "Production deployment failed"),
                "rollback_triggered": deployment_status.rollback_triggered,
            }

        # Phase 5: Post-deployment monitoring
        monitoring_result = None
        if deployment_plan.monitoring_enabled:
            monitoring_result = await self._monitor_deployment_health(deployment_plan, deployment_status)

        # Generate comprehensive deployment report
        report_result = await self._generate_deployment_report(deployment_plan, deployment_status)

        return {
            "status": "success",
            "deployment_id": deployment_id,
            "phases_completed": deployment_status.current_phase,
            "total_operations": len(deployment_plan.operations),
            "successful_operations": deployment_status.successful_operations,
            "failed_operations": deployment_status.failed_operations,
            "rollback_triggered": deployment_status.rollback_triggered,
            "monitoring": monitoring_result,
            "deployment_report": report_result,
        }

    except Exception as e:
        error_msg = f"Deployment campaign failed: {str(e)}"
        logger.error(error_msg)

        # Roll back only when something was actually applied, and report
        # truthfully whether a rollback happened.
        rollback_invoked = False
        if deployment_status.successful_operations > 0:
            await self._trigger_emergency_rollback(deployment_plan, deployment_status, str(e))
            rollback_invoked = True

        return {
            "status": "failed",
            "deployment_id": deployment_id,
            "error": error_msg,
            "rollback_triggered": rollback_invoked or deployment_status.rollback_triggered,
        }
|
302
|
+
|
303
|
+
async def _validate_deployment_plan(self, deployment_plan: DeploymentPlan) -> Dict[str, Any]:
    """
    Check a deployment plan before anything is executed.

    Covers account access, region availability, the cost/approval rule,
    per-operation parameters, security compliance and resource dependencies.

    Args:
        deployment_plan: Deployment plan to validate

    Returns:
        ``{"success": True, "warnings": [...]}`` when no issue was found,
        otherwise ``{"success": False, "error": ..., "issues": [...]}``
        (the ``issues`` key is absent when validation itself raised).
    """
    self.rich_console.print_info("🔍 Validating deployment plan...")

    problems = []
    notices = []

    try:
        # Accounts: every target must be reachable with sufficient permissions
        for account in deployment_plan.target_accounts:
            accessible = await self._validate_account_access(account)
            if not accessible:
                problems.append(f"Invalid or insufficient access to account {account}")

        # Regions: every target region must be available
        for region_name in deployment_plan.target_regions:
            available = await self._validate_region_availability(region_name)
            if not available:
                problems.append(f"Region {region_name} not available or accessible")

        # Cost: a plan above the threshold may not skip the approval gate
        monthly_cost = sum(op.get("cost_impact", 0) for op in deployment_plan.operations)
        over_threshold = monthly_cost > deployment_plan.cost_threshold
        if over_threshold and not deployment_plan.approval_required:
            problems.append(f"Cost impact ${monthly_cost:.0f}/month requires approval")

        # Operations: reported by 1-based position in the plan
        for position, operation in enumerate(deployment_plan.operations, start=1):
            if not self._validate_operation_parameters(operation):
                problems.append(f"Invalid parameters in operation {position}")

        # Security first, then resource dependencies
        security_findings = await self._validate_security_compliance(deployment_plan)
        problems.extend(security_findings)

        dependency_findings = await self._validate_resource_dependencies(deployment_plan)
        problems.extend(dependency_findings)

        if problems:
            self.rich_console.print_error(f"❌ Validation failed with {len(problems)} issues:")
            for problem in problems:
                self.rich_console.print_error(f" • {problem}")

            return {"success": False, "error": "Validation failed", "issues": problems}

        if notices:
            self.rich_console.print_warning(f"⚠️ Validation completed with {len(notices)} warnings:")
            for notice in notices:
                self.rich_console.print_warning(f" • {notice}")

        self.rich_console.print_success("✅ Deployment plan validation successful")
        return {"success": True, "warnings": notices}

    except Exception as e:
        error_msg = f"Validation error: {str(e)}"
        logger.error(error_msg)
        return {"success": False, "error": error_msg}
|
367
|
+
|
368
|
+
async def _process_approval_workflow(self, deployment_plan: DeploymentPlan) -> Dict[str, Any]:
    """
    Process the approval workflow for a production deployment.

    Records an ``ApprovalRequest`` in ``self.approval_requests`` and shows
    the approval panel. In dry-run mode the approval is simulated; otherwise
    an interactive yes/no confirmation is requested on stdin.

    Args:
        deployment_plan: Deployment plan requiring approval

    Returns:
        ``{"approved": True, "approval_id": ...}`` (plus ``"simulated": True``
        in dry-run mode) or ``{"approved": False, "reason": ...}``.
    """
    total_cost_impact = sum(op.get("cost_impact", 0) for op in deployment_plan.operations)

    # Create approval request
    approval_request = ApprovalRequest(
        request_id=f"APPROVE-{deployment_plan.deployment_id}",
        operation_type="cost_optimization_deployment",
        resource_id=deployment_plan.deployment_id,
        cost_impact_monthly=total_cost_impact,
        cost_impact_annual=total_cost_impact * 12,
        business_justification="AWS networking cost optimization campaign with projected 25-50% savings",
        risk_assessment="Low risk - automated deployment with rollback capability",
        requestor="deploy-agent-terminal-5",
    )

    self.approval_requests[approval_request.request_id] = approval_request

    self.rich_console.print_panel(
        "🔐 Management Approval Required",
        f"Request ID: {approval_request.request_id}\n"
        f"Monthly Cost Impact: ${total_cost_impact:.0f}\n"
        f"Annual Cost Impact: ${total_cost_impact * 12:.0f}\n"
        f"Expires: {approval_request.expires_at.strftime('%Y-%m-%d %H:%M:%S')}\n"
        "Risk Level: LOW (automated with rollback)",
        title="🏢 Executive Approval Gate",
    )

    if self.dry_run:
        # Dry-run mode - simulate approval
        self.rich_console.print_info("[DRY-RUN] Simulating management approval")
        return {"approved": True, "approval_id": approval_request.request_id, "simulated": True}

    # For production deployment, require interactive approval. input() blocks,
    # so it runs in the default executor to keep the event loop responsive.
    prompt = "\n🎯 Management Approval Required - Proceed with deployment? (yes/no): "
    try:
        loop = asyncio.get_running_loop()
        raw_response = await loop.run_in_executor(None, input, prompt)
    except EOFError:
        # No interactive stdin (e.g. CI): approval cannot be granted, so
        # fail closed instead of crashing the whole campaign.
        approval_request.status = ApprovalStatus.REJECTED
        approval_request.approval_notes = "No interactive approval received (stdin closed)"

        self.rich_console.print_warning("❌ Deployment rejected - operation cancelled")
        return {"approved": False, "reason": "Interactive approval unavailable"}

    approval_response = raw_response.lower().strip()

    if approval_response in ["yes", "y", "approve"]:
        approval_request.status = ApprovalStatus.APPROVED
        approval_request.approver = "management-terminal-0"
        approval_request.approval_notes = "Approved for cost optimization deployment"

        self.rich_console.print_success("✅ Deployment approved - proceeding with execution")
        return {"approved": True, "approval_id": approval_request.request_id}

    approval_request.status = ApprovalStatus.REJECTED
    approval_request.approval_notes = "Deployment rejected by management"

    self.rich_console.print_warning("❌ Deployment rejected - operation cancelled")
    return {"approved": False, "reason": "Management rejected deployment"}
|
427
|
+
|
428
|
+
async def _execute_production_deployment(
    self, deployment_plan: DeploymentPlan, deployment_status: DeploymentStatus
) -> Dict[str, Any]:
    """
    Run the production rollout using the strategy named in the plan.

    Args:
        deployment_plan: Deployment configuration
        deployment_status: Current deployment status (phase is updated here)

    Returns:
        The result dict of the strategy-specific handler, or
        ``{"success": False, "error": ...}`` when the handler raised.
    """
    deployment_status.current_phase = "production_deployment"

    strategy_label = deployment_plan.strategy.value.replace("_", " ").title()
    monitoring_state = "ENABLED" if deployment_plan.monitoring_enabled else "DISABLED"
    rollback_state = "ENABLED" if deployment_plan.rollback_enabled else "DISABLED"

    self.rich_console.print_panel(
        f"🚀 Executing {strategy_label} Deployment",
        f"Operations: {len(deployment_plan.operations)}\n"
        f"Target Accounts: {len(deployment_plan.target_accounts)}\n"
        f"Monitoring: {monitoring_state}\n"
        f"Rollback: {rollback_state}",
        title="🏗️ Production Execution",
    )

    try:
        # Ordered routing table; any strategy not listed falls back to the
        # all-at-once handler. Only the selected handler is looked up.
        routes = (
            (DeploymentStrategy.CANARY, "_execute_canary_deployment"),
            (DeploymentStrategy.BLUE_GREEN, "_execute_blue_green_deployment"),
            (DeploymentStrategy.ROLLING, "_execute_rolling_deployment"),
        )
        handler_name = "_execute_all_at_once_deployment"
        for candidate, method_name in routes:
            if deployment_plan.strategy == candidate:
                handler_name = method_name
                break

        handler = getattr(self, handler_name)
        return await handler(deployment_plan, deployment_status)

    except Exception as e:
        error_msg = f"Production deployment failed: {str(e)}"
        logger.error(error_msg)

        if deployment_plan.rollback_enabled:
            await self._trigger_emergency_rollback(deployment_plan, deployment_status, error_msg)

        return {"success": False, "error": error_msg}
|
470
|
+
|
471
|
+
async def _execute_canary_deployment(
    self, deployment_plan: DeploymentPlan, deployment_status: DeploymentStatus
) -> Dict[str, Any]:
    """
    Roll out in three steps: canary group, canary observation, remainder.

    The canary group is the first tenth of ``target_accounts`` (at least one
    slice position). An unhealthy canary is rolled back on its own; a failure
    in the remaining accounts triggers a full rollback.

    Args:
        deployment_plan: Deployment configuration
        deployment_status: Current deployment status (progress is updated here)

    Returns:
        Dict with ``success`` plus either error details or rollout counts.
    """
    self.rich_console.print_info("🐤 Starting Canary Deployment Phase")

    # Phase 1: Deploy to canary group (10% of targets)
    canary_size = max(1, len(deployment_plan.target_accounts) // 10)
    canary_accounts = deployment_plan.target_accounts[:canary_size]

    canary_result = await self._deploy_to_account_group(canary_accounts, deployment_plan.operations, "canary")
    if not canary_result["success"]:
        return {"success": False, "error": "Canary deployment failed", "details": canary_result}

    deployment_status.progress_percentage = 30.0

    # Phase 2: Monitor canary for stability
    observation_seconds = deployment_plan.canary_duration
    self.rich_console.print_info(f"⏱️ Monitoring canary for {observation_seconds}s...")

    monitoring_result = await self._monitor_canary_health(
        canary_accounts, deployment_plan.canary_duration, deployment_status
    )
    if not monitoring_result["healthy"]:
        # Unhealthy canary: undo only what the canary group received
        await self._rollback_canary_deployment(canary_accounts, deployment_status)
        return {"success": False, "error": "Canary failed health checks", "metrics": monitoring_result["metrics"]}

    deployment_status.progress_percentage = 60.0

    # Phase 3: Deploy to remaining accounts
    remaining_accounts = deployment_plan.target_accounts[len(canary_accounts) :]
    if remaining_accounts:
        production_result = await self._deploy_to_account_group(
            remaining_accounts, deployment_plan.operations, "production"
        )
        if not production_result["success"]:
            # Rollback everything
            await self._trigger_full_rollback(deployment_plan, deployment_status)
            return {"success": False, "error": "Production rollout failed"}

    # Mark completion
    deployment_status.progress_percentage = 100.0
    deployment_status.current_phase = "deployment_complete"
    deployment_status.completed_at = datetime.utcnow()

    self.rich_console.print_success("🎉 Canary deployment completed successfully!")

    summary = {
        "success": True,
        "strategy": "canary",
        "canary_accounts": len(canary_accounts),
        "production_accounts": len(remaining_accounts),
        "total_operations": deployment_status.successful_operations,
    }
    return summary
|
536
|
+
|
537
|
+
async def _monitor_deployment_health(
    self, deployment_plan: DeploymentPlan, deployment_status: DeploymentStatus
) -> Dict[str, Any]:
    """
    Poll deployment health for ``monitoring_duration`` seconds and alert on breaches.

    Each pass checks health across the target accounts and regions, compares
    the metrics with the plan's thresholds, and triggers an emergency
    rollback (ending the loop) when the error rate exceeds twice its
    threshold or availability falls below its threshold.

    Args:
        deployment_plan: Deployment configuration
        deployment_status: Current deployment status

    Returns:
        Dict containing the last observed metrics, the cumulative number of
        alerts and whether a rollback was triggered.
    """
    self.rich_console.print_info("📊 Starting post-deployment health monitoring...")

    monitoring_start = datetime.utcnow()
    monitoring_end = monitoring_start + timedelta(seconds=deployment_plan.monitoring_duration)

    # DeploymentPlan declares no ``monitoring_interval`` field; the interval is
    # configured on the framework. Reading it from the plan raised
    # AttributeError on every pass, which the handler below swallowed, so the
    # loop never slept. A plan-level attribute is still honoured if present.
    check_interval = getattr(deployment_plan, "monitoring_interval", self.monitoring_interval)

    metrics = {
        "error_rate": 0.0,
        "avg_response_time": 0.0,
        "availability": 1.0,
        "cost_savings": 0.0,
        "alerts_triggered": 0,
    }

    while datetime.utcnow() < monitoring_end:
        try:
            # Check deployment health across all accounts
            health_results = await self._check_deployment_health(
                deployment_plan.target_accounts, deployment_plan.target_regions
            )

            # Update metrics
            metrics["error_rate"] = health_results.get("error_rate", 0.0)
            metrics["avg_response_time"] = health_results.get("avg_response_time", 0.0)
            metrics["availability"] = health_results.get("availability", 1.0)

            # Check threshold breaches
            alerts_triggered = []

            if metrics["error_rate"] > deployment_plan.error_rate_threshold:
                alerts_triggered.append(f"Error rate {metrics['error_rate']:.2%} exceeds threshold")

            if metrics["avg_response_time"] > deployment_plan.latency_threshold:
                alerts_triggered.append(f"Latency {metrics['avg_response_time']:.2f}s exceeds threshold")

            if metrics["availability"] < deployment_plan.availability_threshold:
                alerts_triggered.append(f"Availability {metrics['availability']:.2%} below threshold")

            if alerts_triggered:
                self.rich_console.print_warning(f"⚠️ Health check alerts: {len(alerts_triggered)}")
                for alert in alerts_triggered:
                    self.rich_console.print_warning(f" • {alert}")

                metrics["alerts_triggered"] += len(alerts_triggered)

                # Trigger rollback if critical thresholds breached
                if (
                    metrics["error_rate"] > deployment_plan.error_rate_threshold * 2
                    or metrics["availability"] < deployment_plan.availability_threshold
                ):
                    self.rich_console.print_error("🚨 Critical thresholds breached - triggering rollback!")
                    await self._trigger_emergency_rollback(
                        deployment_plan, deployment_status, "Health monitoring threshold breach"
                    )
                    break

        except Exception as e:
            logger.error(f"Health monitoring error: {str(e)}")
            metrics["alerts_triggered"] += 1

        # Sleep before the next check. Kept outside the try block so a failing
        # health check is retried at the normal cadence, not in a tight loop.
        await asyncio.sleep(check_interval)

    self.rich_console.print_success("✅ Health monitoring completed")

    return {
        "success": True,
        "duration_seconds": deployment_plan.monitoring_duration,
        "metrics": metrics,
        "alerts_triggered": metrics["alerts_triggered"],
        "rollback_triggered": deployment_status.rollback_triggered,
    }
|
621
|
+
|
622
|
+
async def _generate_deployment_report(
|
623
|
+
self, deployment_plan: DeploymentPlan, deployment_status: DeploymentStatus
|
624
|
+
) -> Dict[str, Any]:
|
625
|
+
"""
|
626
|
+
Generate comprehensive deployment report for executive review.
|
627
|
+
|
628
|
+
Args:
|
629
|
+
deployment_plan: Deployment configuration
|
630
|
+
deployment_status: Final deployment status
|
631
|
+
|
632
|
+
Returns:
|
633
|
+
Dict containing deployment report data
|
634
|
+
"""
|
635
|
+
self.rich_console.print_info("📝 Generating deployment report...")
|
636
|
+
|
637
|
+
# Calculate deployment metrics
|
638
|
+
total_duration = (
|
639
|
+
(deployment_status.completed_at or datetime.utcnow()) - deployment_status.started_at
|
640
|
+
).total_seconds()
|
641
|
+
|
642
|
+
success_rate = deployment_status.successful_operations / max(
|
643
|
+
1, deployment_status.successful_operations + deployment_status.failed_operations
|
644
|
+
)
|
645
|
+
|
646
|
+
# Calculate cost impact
|
647
|
+
total_cost_impact = sum(op.get("cost_impact", 0) for op in deployment_plan.operations)
|
648
|
+
estimated_annual_savings = total_cost_impact * 12 * 0.3 # 30% savings estimate
|
649
|
+
|
650
|
+
# Generate comprehensive report
|
651
|
+
report = {
|
652
|
+
"deployment_summary": {
|
653
|
+
"deployment_id": deployment_plan.deployment_id,
|
654
|
+
"strategy": deployment_plan.strategy.value,
|
655
|
+
"started_at": deployment_status.started_at.isoformat(),
|
656
|
+
"completed_at": (deployment_status.completed_at or datetime.utcnow()).isoformat(),
|
657
|
+
"total_duration_minutes": total_duration / 60,
|
658
|
+
"success_rate": success_rate,
|
659
|
+
"rollback_triggered": deployment_status.rollback_triggered,
|
660
|
+
},
|
661
|
+
"operations_summary": {
|
662
|
+
"total_operations": len(deployment_plan.operations),
|
663
|
+
"successful_operations": deployment_status.successful_operations,
|
664
|
+
"failed_operations": deployment_status.failed_operations,
|
665
|
+
"target_accounts": len(deployment_plan.target_accounts),
|
666
|
+
"target_regions": len(deployment_plan.target_regions),
|
667
|
+
},
|
668
|
+
"cost_impact": {
|
669
|
+
"monthly_cost_impact": total_cost_impact,
|
670
|
+
"annual_cost_impact": total_cost_impact * 12,
|
671
|
+
"estimated_annual_savings": estimated_annual_savings,
|
672
|
+
"roi_percentage": (estimated_annual_savings / (total_cost_impact * 12)) * 100
|
673
|
+
if total_cost_impact > 0
|
674
|
+
else 0,
|
675
|
+
},
|
676
|
+
"safety_metrics": {
|
677
|
+
"dry_run_executed": deployment_plan.dry_run_first,
|
678
|
+
"approval_required": deployment_plan.approval_required,
|
679
|
+
"rollback_enabled": deployment_plan.rollback_enabled,
|
680
|
+
"monitoring_enabled": deployment_plan.monitoring_enabled,
|
681
|
+
"avg_execution_time": deployment_status.avg_execution_time,
|
682
|
+
"error_rate": deployment_status.error_rate,
|
683
|
+
"availability_score": deployment_status.availability_score,
|
684
|
+
},
|
685
|
+
"executive_summary": {
|
686
|
+
"deployment_status": "SUCCESS"
|
687
|
+
if success_rate > 0.95
|
688
|
+
else "PARTIAL_SUCCESS"
|
689
|
+
if success_rate > 0.8
|
690
|
+
else "FAILED",
|
691
|
+
"business_impact": f"${estimated_annual_savings:.0f} annual savings potential",
|
692
|
+
"operational_impact": f"{deployment_status.successful_operations}/{len(deployment_plan.operations)} operations completed",
|
693
|
+
"risk_assessment": "LOW" if not deployment_status.rollback_triggered else "MEDIUM",
|
694
|
+
"next_steps": self._generate_next_steps_recommendations(deployment_status, success_rate),
|
695
|
+
},
|
696
|
+
}
|
697
|
+
|
698
|
+
# Export report to artifacts
|
699
|
+
report_timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
|
700
|
+
report_path = self.artifacts_dir / f"deployment_report_{deployment_plan.deployment_id}_{report_timestamp}.json"
|
701
|
+
|
702
|
+
with open(report_path, "w") as f:
|
703
|
+
json.dump(report, f, indent=2, default=str)
|
704
|
+
|
705
|
+
# Display executive summary
|
706
|
+
self.rich_console.print_panel(
|
707
|
+
"📊 Deployment Report Summary",
|
708
|
+
f"Status: {report['executive_summary']['deployment_status']}\n"
|
709
|
+
f"Success Rate: {success_rate:.1%}\n"
|
710
|
+
f"Duration: {total_duration / 60:.1f} minutes\n"
|
711
|
+
f"Business Impact: {report['executive_summary']['business_impact']}\n"
|
712
|
+
f"Report Saved: {report_path}",
|
713
|
+
title="🎯 Executive Summary",
|
714
|
+
)
|
715
|
+
|
716
|
+
self.rich_console.print_success(f"✅ Deployment report generated: {report_path}")
|
717
|
+
|
718
|
+
return report
|
719
|
+
|
720
|
+
def _generate_next_steps_recommendations(
|
721
|
+
self, deployment_status: DeploymentStatus, success_rate: float
|
722
|
+
) -> List[str]:
|
723
|
+
"""Generate next steps recommendations based on deployment results."""
|
724
|
+
|
725
|
+
recommendations = []
|
726
|
+
|
727
|
+
if success_rate >= 0.95:
|
728
|
+
recommendations.extend(
|
729
|
+
[
|
730
|
+
"Monitor cost savings over next 30 days",
|
731
|
+
"Document successful deployment patterns",
|
732
|
+
"Plan next optimization phase for additional accounts",
|
733
|
+
]
|
734
|
+
)
|
735
|
+
elif success_rate >= 0.8:
|
736
|
+
recommendations.extend(
|
737
|
+
[
|
738
|
+
"Review failed operations for root cause analysis",
|
739
|
+
"Optimize deployment procedures based on lessons learned",
|
740
|
+
"Consider retry of failed operations with improved parameters",
|
741
|
+
]
|
742
|
+
)
|
743
|
+
else:
|
744
|
+
recommendations.extend(
|
745
|
+
[
|
746
|
+
"Conduct thorough post-mortem analysis",
|
747
|
+
"Review and strengthen pre-deployment validation",
|
748
|
+
"Consider rollback of successful operations if business impact negative",
|
749
|
+
]
|
750
|
+
)
|
751
|
+
|
752
|
+
if deployment_status.rollback_triggered:
|
753
|
+
recommendations.extend(
|
754
|
+
[
|
755
|
+
"Analyze rollback root causes",
|
756
|
+
"Improve monitoring thresholds and alerting",
|
757
|
+
"Strengthen deployment health checks",
|
758
|
+
]
|
759
|
+
)
|
760
|
+
|
761
|
+
return recommendations
|
762
|
+
|
763
|
+
# Utility methods for deployment execution
|
764
|
+
async def _deploy_to_account_group(
|
765
|
+
self, accounts: List[str], operations: List[Dict[str, Any]], group_name: str
|
766
|
+
) -> Dict[str, Any]:
|
767
|
+
"""Deploy operations to a group of accounts with parallel execution."""
|
768
|
+
|
769
|
+
self.rich_console.print_info(f"🚀 Deploying to {group_name} group: {len(accounts)} accounts")
|
770
|
+
|
771
|
+
successful_accounts = 0
|
772
|
+
failed_accounts = 0
|
773
|
+
|
774
|
+
# Parallel execution across accounts
|
775
|
+
tasks = []
|
776
|
+
for account_id in accounts:
|
777
|
+
task = self._deploy_to_single_account(account_id, operations)
|
778
|
+
tasks.append(task)
|
779
|
+
|
780
|
+
results = await asyncio.gather(*tasks, return_exceptions=True)
|
781
|
+
|
782
|
+
for i, result in enumerate(results):
|
783
|
+
if isinstance(result, Exception):
|
784
|
+
self.rich_console.print_error(f"❌ Account {accounts[i]} deployment failed: {str(result)}")
|
785
|
+
failed_accounts += 1
|
786
|
+
elif result.get("success", False):
|
787
|
+
successful_accounts += 1
|
788
|
+
else:
|
789
|
+
failed_accounts += 1
|
790
|
+
|
791
|
+
success_rate = successful_accounts / len(accounts) if accounts else 0
|
792
|
+
|
793
|
+
self.rich_console.print_info(
|
794
|
+
f"📊 {group_name.title()} deployment complete: "
|
795
|
+
f"{successful_accounts}/{len(accounts)} accounts successful ({success_rate:.1%})"
|
796
|
+
)
|
797
|
+
|
798
|
+
return {
|
799
|
+
"success": success_rate > 0.8, # 80% success threshold
|
800
|
+
"successful_accounts": successful_accounts,
|
801
|
+
"failed_accounts": failed_accounts,
|
802
|
+
"success_rate": success_rate,
|
803
|
+
}
|
804
|
+
|
805
|
+
async def _deploy_to_single_account(self, account_id: str, operations: List[Dict[str, Any]]) -> Dict[str, Any]:
|
806
|
+
"""Deploy operations to a single account."""
|
807
|
+
|
808
|
+
try:
|
809
|
+
for operation in operations:
|
810
|
+
# Execute individual operation
|
811
|
+
operation_result = await self._execute_single_operation(account_id, operation)
|
812
|
+
|
813
|
+
if not operation_result.get("success", False):
|
814
|
+
return {
|
815
|
+
"success": False,
|
816
|
+
"account_id": account_id,
|
817
|
+
"failed_operation": operation.get("type"),
|
818
|
+
"error": operation_result.get("error"),
|
819
|
+
}
|
820
|
+
|
821
|
+
return {"success": True, "account_id": account_id}
|
822
|
+
|
823
|
+
except Exception as e:
|
824
|
+
return {"success": False, "account_id": account_id, "error": str(e)}
|
825
|
+
|
826
|
+
async def _execute_single_operation(self, account_id: str, operation: Dict[str, Any]) -> Dict[str, Any]:
|
827
|
+
"""Execute a single operation with proper error handling."""
|
828
|
+
|
829
|
+
operation_type = operation.get("type")
|
830
|
+
|
831
|
+
try:
|
832
|
+
if operation_type == "optimize_nat_gateway":
|
833
|
+
return await self._optimize_nat_gateway_operation(account_id, operation)
|
834
|
+
elif operation_type == "cleanup_unused_eips":
|
835
|
+
return await self._cleanup_eips_operation(account_id, operation)
|
836
|
+
elif operation_type == "vpc_cost_analysis":
|
837
|
+
return await self._vpc_cost_analysis_operation(account_id, operation)
|
838
|
+
else:
|
839
|
+
return {"success": False, "error": f"Unknown operation type: {operation_type}"}
|
840
|
+
|
841
|
+
except Exception as e:
|
842
|
+
logger.error(f"Operation {operation_type} failed for account {account_id}: {str(e)}")
|
843
|
+
return {"success": False, "error": str(e)}
|
844
|
+
|
845
|
+
# Emergency rollback procedures
|
846
|
+
async def _trigger_emergency_rollback(
    self, deployment_plan: DeploymentPlan, deployment_status: DeploymentStatus, reason: str
):
    """Flag the deployment as rolled back, run the rollback and file an incident report.

    ``deployment_status`` is mutated in place: ``rollback_triggered``,
    ``rollback_reason`` and ``current_phase`` are set before the rollback
    procedures are awaited. The incident report is generated whether or not
    the rollback reported success.
    """
    self.rich_console.print_error(f"🚨 EMERGENCY ROLLBACK TRIGGERED: {reason}")

    # Record the rollback on the shared status object first.
    deployment_status.rollback_triggered = True
    deployment_status.rollback_reason = reason
    deployment_status.current_phase = "emergency_rollback"

    logger.critical(f"Emergency rollback initiated for {deployment_plan.deployment_id}: {reason}")

    # Run the rollback and report how it went.
    if await self._execute_rollback_procedures(deployment_plan):
        self.rich_console.print_success("✅ Emergency rollback completed successfully")
    else:
        self.rich_console.print_error("❌ Emergency rollback encountered issues - manual intervention required")

    await self._generate_incident_report(deployment_plan, deployment_status, reason)
|
870
|
+
|
871
|
+
async def _execute_rollback_procedures(self, deployment_plan: DeploymentPlan) -> bool:
|
872
|
+
"""Execute comprehensive rollback procedures."""
|
873
|
+
|
874
|
+
self.rich_console.print_warning("🔄 Executing rollback procedures...")
|
875
|
+
|
876
|
+
rollback_successful = True
|
877
|
+
|
878
|
+
try:
|
879
|
+
# Rollback in reverse order of deployment
|
880
|
+
for account_id in reversed(deployment_plan.target_accounts):
|
881
|
+
account_rollback = await self._rollback_account_operations(account_id)
|
882
|
+
if not account_rollback:
|
883
|
+
rollback_successful = False
|
884
|
+
logger.error(f"Rollback failed for account {account_id}")
|
885
|
+
|
886
|
+
return rollback_successful
|
887
|
+
|
888
|
+
except Exception as e:
|
889
|
+
logger.error(f"Rollback execution failed: {str(e)}")
|
890
|
+
return False
|
891
|
+
|
892
|
+
# Validation helper methods
|
893
|
+
async def _validate_account_access(self, account_id: str) -> bool:
|
894
|
+
"""Validate access to target account."""
|
895
|
+
try:
|
896
|
+
# Simulate account access validation
|
897
|
+
return True # In production, implement actual cross-account role assumption validation
|
898
|
+
except Exception:
|
899
|
+
return False
|
900
|
+
|
901
|
+
async def _validate_region_availability(self, region: str) -> bool:
|
902
|
+
"""Validate region availability and access."""
|
903
|
+
try:
|
904
|
+
# Simulate region validation
|
905
|
+
return region in ["us-east-1", "us-west-2", "eu-west-1", "ap-southeast-1"]
|
906
|
+
except Exception:
|
907
|
+
return False
|
908
|
+
|
909
|
+
def _validate_operation_parameters(self, operation: Dict[str, Any]) -> bool:
|
910
|
+
"""Validate operation parameters."""
|
911
|
+
required_fields = ["type", "target", "parameters"]
|
912
|
+
return all(field in operation for field in required_fields)
|
913
|
+
|
914
|
+
async def _validate_security_compliance(self, deployment_plan: DeploymentPlan) -> List[str]:
|
915
|
+
"""Validate security compliance requirements."""
|
916
|
+
issues = []
|
917
|
+
|
918
|
+
# Check for required security controls
|
919
|
+
if not deployment_plan.dry_run_first:
|
920
|
+
issues.append("Dry-run validation is required for security compliance")
|
921
|
+
|
922
|
+
if not deployment_plan.approval_required:
|
923
|
+
issues.append("Approval workflow is required for production deployments")
|
924
|
+
|
925
|
+
return issues
|
926
|
+
|
927
|
+
async def _validate_resource_dependencies(self, deployment_plan: DeploymentPlan) -> List[str]:
|
928
|
+
"""Validate resource dependencies and prerequisites."""
|
929
|
+
issues = []
|
930
|
+
|
931
|
+
# Check for dependency conflicts
|
932
|
+
operation_types = [op.get("type") for op in deployment_plan.operations]
|
933
|
+
|
934
|
+
if "delete_vpc" in operation_types and "create_nat_gateway" in operation_types:
|
935
|
+
issues.append("Cannot create NAT Gateway in VPC scheduled for deletion")
|
936
|
+
|
937
|
+
return issues
|
938
|
+
|
939
|
+
|
940
|
+
# Deployment plan factory for common scenarios
|
941
|
+
class DeploymentPlanFactory:
    """Factory for creating common deployment plans."""

    @staticmethod
    def create_cost_optimization_campaign(
        target_accounts: List[str],
        target_regions: List[str] = None,
        strategy: DeploymentStrategy = DeploymentStrategy.CANARY,
    ) -> DeploymentPlan:
        """Create deployment plan for comprehensive cost optimization campaign.

        Args:
            target_accounts: Accounts the plan targets.
            target_regions: Regions the plan targets; ``None`` or an empty
                list falls back to ``["us-east-1", "us-west-2"]``.
            strategy: Rollout strategy, canary by default.

        Returns:
            A ``DeploymentPlan`` with approval, dry-run, rollback and
            monitoring enabled and a timestamp-based ``deployment_id``.
        """
        deployment_id = f"cost-opt-{datetime.utcnow().strftime('%Y%m%d-%H%M%S')}"

        # Default to common regions if not specified
        if not target_regions:
            target_regions = ["us-east-1", "us-west-2"]

        # Define optimization operations
        # NOTE(review): _execute_single_operation in this file dispatches only
        # "optimize_nat_gateway", "cleanup_unused_eips" and "vpc_cost_analysis";
        # confirm "analyze_nat_costs" is handled elsewhere, otherwise it is
        # reported as an unknown operation type.
        operations = [
            {
                "type": "analyze_nat_costs",
                "target": "all_vpcs",
                "parameters": {},
                "cost_impact": 0,  # Analysis only
            },
            {
                "type": "optimize_nat_gateway",
                "target": "underutilized_nat_gateways",
                "parameters": {"consolidation_enabled": True},
                "cost_impact": 135,  # 3 NAT gateways × $45/month
            },
            {
                "type": "cleanup_unused_eips",
                "target": "all_regions",
                "parameters": {"release_unused": True},
                "cost_impact": 36,  # 10 EIPs × $3.60/month
            },
            {
                "type": "vpc_cost_analysis",
                "target": "all_vpcs",
                "parameters": {"generate_report": True},
                "cost_impact": 0,  # Reporting only
            },
        ]

        return DeploymentPlan(
            deployment_id=deployment_id,
            strategy=strategy,
            target_accounts=target_accounts,
            target_regions=target_regions,
            operations=operations,
            approval_required=True,
            dry_run_first=True,
            rollback_enabled=True,
            monitoring_enabled=True,
            cost_threshold=100.0,  # Lower threshold for cost optimization
        )

    @staticmethod
    def create_emergency_rollback_plan(
        original_deployment_id: str,
        target_accounts: List[str],
        target_regions: List[str] = None,
    ) -> DeploymentPlan:
        """Create deployment plan for emergency rollback operations.

        Args:
            original_deployment_id: Id of the deployment being rolled back;
                the new plan's id is ``rollback-<original_deployment_id>``.
            target_accounts: Accounts the rollback targets.
            target_regions: Regions the rollback targets; ``None`` or an
                empty list keeps the previous fixed default
                ``["us-east-1", "us-west-2"]`` so existing callers are
                unaffected.

        Returns:
            A ``DeploymentPlan`` using the all-at-once strategy with approval,
            dry-run and rollback disabled and monitoring enabled.
        """
        deployment_id = f"rollback-{original_deployment_id}"

        # A rollback should be able to cover the regions of the original
        # deployment, so the region list is overridable.
        if not target_regions:
            target_regions = ["us-east-1", "us-west-2"]

        # Rollback operations (reverse of optimizations)
        # NOTE(review): neither operation type below appears in the dispatcher
        # visible in this file (_execute_single_operation); confirm they are
        # executed through another path.
        operations = [
            {
                "type": "restore_nat_gateways",
                "target": "consolidated_gateways",
                "parameters": {"restore_original_configuration": True},
                "cost_impact": -135,  # Negative cost impact (increased spend)
            },
            {
                "type": "restore_elastic_ips",
                "target": "released_eips",
                "parameters": {"recreate_released_eips": False},  # Cannot recreate same IPs
                "cost_impact": 0,
            },
        ]

        return DeploymentPlan(
            deployment_id=deployment_id,
            strategy=DeploymentStrategy.ALL_AT_ONCE,  # Emergency rollback
            target_accounts=target_accounts,
            target_regions=target_regions,
            operations=operations,
            approval_required=False,  # Emergency operations
            dry_run_first=False,  # Emergency deployment
            rollback_enabled=False,  # This IS the rollback
            monitoring_enabled=True,
            cost_threshold=1000.0,
        )
|