runbooks 1.1.4-py3-none-any.whl → 1.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. runbooks/__init__.py +31 -2
  2. runbooks/__init___optimized.py +18 -4
  3. runbooks/_platform/__init__.py +1 -5
  4. runbooks/_platform/core/runbooks_wrapper.py +141 -138
  5. runbooks/aws2/accuracy_validator.py +812 -0
  6. runbooks/base.py +7 -0
  7. runbooks/cfat/assessment/compliance.py +1 -1
  8. runbooks/cfat/assessment/runner.py +1 -0
  9. runbooks/cfat/cloud_foundations_assessment.py +227 -239
  10. runbooks/cli/__init__.py +1 -1
  11. runbooks/cli/commands/cfat.py +64 -23
  12. runbooks/cli/commands/finops.py +1005 -54
  13. runbooks/cli/commands/inventory.py +138 -35
  14. runbooks/cli/commands/operate.py +9 -36
  15. runbooks/cli/commands/security.py +42 -18
  16. runbooks/cli/commands/validation.py +432 -18
  17. runbooks/cli/commands/vpc.py +81 -17
  18. runbooks/cli/registry.py +22 -10
  19. runbooks/cloudops/__init__.py +20 -27
  20. runbooks/cloudops/base.py +96 -107
  21. runbooks/cloudops/cost_optimizer.py +544 -542
  22. runbooks/cloudops/infrastructure_optimizer.py +5 -4
  23. runbooks/cloudops/interfaces.py +224 -225
  24. runbooks/cloudops/lifecycle_manager.py +5 -4
  25. runbooks/cloudops/mcp_cost_validation.py +252 -235
  26. runbooks/cloudops/models.py +78 -53
  27. runbooks/cloudops/monitoring_automation.py +5 -4
  28. runbooks/cloudops/notebook_framework.py +177 -213
  29. runbooks/cloudops/security_enforcer.py +125 -159
  30. runbooks/common/accuracy_validator.py +11 -0
  31. runbooks/common/aws_pricing.py +349 -326
  32. runbooks/common/aws_pricing_api.py +211 -212
  33. runbooks/common/aws_profile_manager.py +40 -36
  34. runbooks/common/aws_utils.py +74 -79
  35. runbooks/common/business_logic.py +126 -104
  36. runbooks/common/cli_decorators.py +36 -60
  37. runbooks/common/comprehensive_cost_explorer_integration.py +455 -463
  38. runbooks/common/cross_account_manager.py +197 -204
  39. runbooks/common/date_utils.py +27 -39
  40. runbooks/common/decorators.py +29 -19
  41. runbooks/common/dry_run_examples.py +173 -208
  42. runbooks/common/dry_run_framework.py +157 -155
  43. runbooks/common/enhanced_exception_handler.py +15 -4
  44. runbooks/common/enhanced_logging_example.py +50 -64
  45. runbooks/common/enhanced_logging_integration_example.py +65 -37
  46. runbooks/common/env_utils.py +16 -16
  47. runbooks/common/error_handling.py +40 -38
  48. runbooks/common/lazy_loader.py +41 -23
  49. runbooks/common/logging_integration_helper.py +79 -86
  50. runbooks/common/mcp_cost_explorer_integration.py +476 -493
  51. runbooks/common/mcp_integration.py +63 -74
  52. runbooks/common/memory_optimization.py +140 -118
  53. runbooks/common/module_cli_base.py +37 -58
  54. runbooks/common/organizations_client.py +175 -193
  55. runbooks/common/patterns.py +23 -25
  56. runbooks/common/performance_monitoring.py +67 -71
  57. runbooks/common/performance_optimization_engine.py +283 -274
  58. runbooks/common/profile_utils.py +111 -37
  59. runbooks/common/rich_utils.py +201 -141
  60. runbooks/common/sre_performance_suite.py +177 -186
  61. runbooks/enterprise/__init__.py +1 -1
  62. runbooks/enterprise/logging.py +144 -106
  63. runbooks/enterprise/security.py +187 -204
  64. runbooks/enterprise/validation.py +43 -56
  65. runbooks/finops/__init__.py +26 -30
  66. runbooks/finops/account_resolver.py +1 -1
  67. runbooks/finops/advanced_optimization_engine.py +980 -0
  68. runbooks/finops/automation_core.py +268 -231
  69. runbooks/finops/business_case_config.py +184 -179
  70. runbooks/finops/cli.py +660 -139
  71. runbooks/finops/commvault_ec2_analysis.py +157 -164
  72. runbooks/finops/compute_cost_optimizer.py +336 -320
  73. runbooks/finops/config.py +20 -20
  74. runbooks/finops/cost_optimizer.py +484 -618
  75. runbooks/finops/cost_processor.py +332 -214
  76. runbooks/finops/dashboard_runner.py +1006 -172
  77. runbooks/finops/ebs_cost_optimizer.py +991 -657
  78. runbooks/finops/elastic_ip_optimizer.py +317 -257
  79. runbooks/finops/enhanced_mcp_integration.py +340 -0
  80. runbooks/finops/enhanced_progress.py +32 -29
  81. runbooks/finops/enhanced_trend_visualization.py +3 -2
  82. runbooks/finops/enterprise_wrappers.py +223 -285
  83. runbooks/finops/executive_export.py +203 -160
  84. runbooks/finops/helpers.py +130 -288
  85. runbooks/finops/iam_guidance.py +1 -1
  86. runbooks/finops/infrastructure/__init__.py +80 -0
  87. runbooks/finops/infrastructure/commands.py +506 -0
  88. runbooks/finops/infrastructure/load_balancer_optimizer.py +866 -0
  89. runbooks/finops/infrastructure/vpc_endpoint_optimizer.py +832 -0
  90. runbooks/finops/markdown_exporter.py +337 -174
  91. runbooks/finops/mcp_validator.py +1952 -0
  92. runbooks/finops/nat_gateway_optimizer.py +1512 -481
  93. runbooks/finops/network_cost_optimizer.py +657 -587
  94. runbooks/finops/notebook_utils.py +226 -188
  95. runbooks/finops/optimization_engine.py +1136 -0
  96. runbooks/finops/optimizer.py +19 -23
  97. runbooks/finops/rds_snapshot_optimizer.py +367 -411
  98. runbooks/finops/reservation_optimizer.py +427 -363
  99. runbooks/finops/scenario_cli_integration.py +64 -65
  100. runbooks/finops/scenarios.py +1277 -438
  101. runbooks/finops/schemas.py +218 -182
  102. runbooks/finops/snapshot_manager.py +2289 -0
  103. runbooks/finops/types.py +3 -3
  104. runbooks/finops/validation_framework.py +259 -265
  105. runbooks/finops/vpc_cleanup_exporter.py +189 -144
  106. runbooks/finops/vpc_cleanup_optimizer.py +591 -573
  107. runbooks/finops/workspaces_analyzer.py +171 -182
  108. runbooks/integration/__init__.py +89 -0
  109. runbooks/integration/mcp_integration.py +1920 -0
  110. runbooks/inventory/CLAUDE.md +816 -0
  111. runbooks/inventory/__init__.py +2 -2
  112. runbooks/inventory/cloud_foundations_integration.py +144 -149
  113. runbooks/inventory/collectors/aws_comprehensive.py +1 -1
  114. runbooks/inventory/collectors/aws_networking.py +109 -99
  115. runbooks/inventory/collectors/base.py +4 -0
  116. runbooks/inventory/core/collector.py +495 -313
  117. runbooks/inventory/drift_detection_cli.py +69 -96
  118. runbooks/inventory/inventory_mcp_cli.py +48 -46
  119. runbooks/inventory/list_rds_snapshots_aggregator.py +192 -208
  120. runbooks/inventory/mcp_inventory_validator.py +549 -465
  121. runbooks/inventory/mcp_vpc_validator.py +359 -442
  122. runbooks/inventory/organizations_discovery.py +55 -51
  123. runbooks/inventory/rich_inventory_display.py +33 -32
  124. runbooks/inventory/unified_validation_engine.py +278 -251
  125. runbooks/inventory/vpc_analyzer.py +732 -695
  126. runbooks/inventory/vpc_architecture_validator.py +293 -348
  127. runbooks/inventory/vpc_dependency_analyzer.py +382 -378
  128. runbooks/inventory/vpc_flow_analyzer.py +1 -1
  129. runbooks/main.py +49 -34
  130. runbooks/main_final.py +91 -60
  131. runbooks/main_minimal.py +22 -10
  132. runbooks/main_optimized.py +131 -100
  133. runbooks/main_ultra_minimal.py +7 -2
  134. runbooks/mcp/__init__.py +36 -0
  135. runbooks/mcp/integration.py +679 -0
  136. runbooks/monitoring/performance_monitor.py +9 -4
  137. runbooks/operate/dynamodb_operations.py +3 -1
  138. runbooks/operate/ec2_operations.py +145 -137
  139. runbooks/operate/iam_operations.py +146 -152
  140. runbooks/operate/networking_cost_heatmap.py +29 -8
  141. runbooks/operate/rds_operations.py +223 -254
  142. runbooks/operate/s3_operations.py +107 -118
  143. runbooks/operate/vpc_operations.py +646 -616
  144. runbooks/remediation/base.py +1 -1
  145. runbooks/remediation/commons.py +10 -7
  146. runbooks/remediation/commvault_ec2_analysis.py +70 -66
  147. runbooks/remediation/ec2_unattached_ebs_volumes.py +1 -0
  148. runbooks/remediation/multi_account.py +24 -21
  149. runbooks/remediation/rds_snapshot_list.py +86 -60
  150. runbooks/remediation/remediation_cli.py +92 -146
  151. runbooks/remediation/universal_account_discovery.py +83 -79
  152. runbooks/remediation/workspaces_list.py +46 -41
  153. runbooks/security/__init__.py +19 -0
  154. runbooks/security/assessment_runner.py +1150 -0
  155. runbooks/security/baseline_checker.py +812 -0
  156. runbooks/security/cloudops_automation_security_validator.py +509 -535
  157. runbooks/security/compliance_automation_engine.py +17 -17
  158. runbooks/security/config/__init__.py +2 -2
  159. runbooks/security/config/compliance_config.py +50 -50
  160. runbooks/security/config_template_generator.py +63 -76
  161. runbooks/security/enterprise_security_framework.py +1 -1
  162. runbooks/security/executive_security_dashboard.py +519 -508
  163. runbooks/security/multi_account_security_controls.py +959 -1210
  164. runbooks/security/real_time_security_monitor.py +422 -444
  165. runbooks/security/security_baseline_tester.py +1 -1
  166. runbooks/security/security_cli.py +143 -112
  167. runbooks/security/test_2way_validation.py +439 -0
  168. runbooks/security/two_way_validation_framework.py +852 -0
  169. runbooks/sre/production_monitoring_framework.py +167 -177
  170. runbooks/tdd/__init__.py +15 -0
  171. runbooks/tdd/cli.py +1071 -0
  172. runbooks/utils/__init__.py +14 -17
  173. runbooks/utils/logger.py +7 -2
  174. runbooks/utils/version_validator.py +50 -47
  175. runbooks/validation/__init__.py +6 -6
  176. runbooks/validation/cli.py +9 -3
  177. runbooks/validation/comprehensive_2way_validator.py +745 -704
  178. runbooks/validation/mcp_validator.py +906 -228
  179. runbooks/validation/terraform_citations_validator.py +104 -115
  180. runbooks/validation/terraform_drift_detector.py +447 -451
  181. runbooks/vpc/README.md +617 -0
  182. runbooks/vpc/__init__.py +8 -1
  183. runbooks/vpc/analyzer.py +577 -0
  184. runbooks/vpc/cleanup_wrapper.py +476 -413
  185. runbooks/vpc/cli_cloudtrail_commands.py +339 -0
  186. runbooks/vpc/cli_mcp_validation_commands.py +480 -0
  187. runbooks/vpc/cloudtrail_audit_integration.py +717 -0
  188. runbooks/vpc/config.py +92 -97
  189. runbooks/vpc/cost_engine.py +411 -148
  190. runbooks/vpc/cost_explorer_integration.py +553 -0
  191. runbooks/vpc/cross_account_session.py +101 -106
  192. runbooks/vpc/enhanced_mcp_validation.py +917 -0
  193. runbooks/vpc/eni_gate_validator.py +961 -0
  194. runbooks/vpc/heatmap_engine.py +185 -160
  195. runbooks/vpc/mcp_no_eni_validator.py +680 -639
  196. runbooks/vpc/nat_gateway_optimizer.py +358 -0
  197. runbooks/vpc/networking_wrapper.py +15 -8
  198. runbooks/vpc/pdca_remediation_planner.py +528 -0
  199. runbooks/vpc/performance_optimized_analyzer.py +219 -231
  200. runbooks/vpc/runbooks_adapter.py +1167 -241
  201. runbooks/vpc/tdd_red_phase_stubs.py +601 -0
  202. runbooks/vpc/test_data_loader.py +358 -0
  203. runbooks/vpc/tests/conftest.py +314 -4
  204. runbooks/vpc/tests/test_cleanup_framework.py +1022 -0
  205. runbooks/vpc/tests/test_cost_engine.py +0 -2
  206. runbooks/vpc/topology_generator.py +326 -0
  207. runbooks/vpc/unified_scenarios.py +1297 -1124
  208. runbooks/vpc/vpc_cleanup_integration.py +1943 -1115
  209. runbooks-1.1.5.dist-info/METADATA +328 -0
  210. {runbooks-1.1.4.dist-info → runbooks-1.1.5.dist-info}/RECORD +214 -193
  211. runbooks/finops/README.md +0 -414
  212. runbooks/finops/accuracy_cross_validator.py +0 -647
  213. runbooks/finops/business_cases.py +0 -950
  214. runbooks/finops/dashboard_router.py +0 -922
  215. runbooks/finops/ebs_optimizer.py +0 -973
  216. runbooks/finops/embedded_mcp_validator.py +0 -1629
  217. runbooks/finops/enhanced_dashboard_runner.py +0 -527
  218. runbooks/finops/finops_dashboard.py +0 -584
  219. runbooks/finops/finops_scenarios.py +0 -1218
  220. runbooks/finops/legacy_migration.py +0 -730
  221. runbooks/finops/multi_dashboard.py +0 -1519
  222. runbooks/finops/single_dashboard.py +0 -1113
  223. runbooks/finops/unlimited_scenarios.py +0 -393
  224. runbooks-1.1.4.dist-info/METADATA +0 -800
  225. {runbooks-1.1.4.dist-info → runbooks-1.1.5.dist-info}/WHEEL +0 -0
  226. {runbooks-1.1.4.dist-info → runbooks-1.1.5.dist-info}/entry_points.txt +0 -0
  227. {runbooks-1.1.4.dist-info → runbooks-1.1.5.dist-info}/licenses/LICENSE +0 -0
  228. {runbooks-1.1.4.dist-info → runbooks-1.1.5.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,13 @@
 #!/usr/bin/env python3
 """
 Performance Optimization Engine for CloudOps-Runbooks - Phase 2 Enhanced
-
+
 🎯 SRE Automation Specialist Implementation
 Following proven systematic delegation patterns for production reliability optimization.
 
 Key Focus Areas (From PDCA Analysis):
 1. Organization Discovery Performance: 52.3s -> <30s target
-2. VPC Analysis Timeout Issues: Optimize network operations
+2. VPC Analysis Timeout Issues: Optimize network operations
 3. Memory Usage Optimization: Address large-scale operation issues (6.6GB → <500MB)
 4. Multi-Account Scaling: 200+ account enterprise support with concurrent processing
 5. Reliability Enhancements: >99.9% operation success rate with circuit breaker patterns
@@ -49,7 +49,7 @@ from rich.progress import (
     BarColumn,
     TimeElapsedColumn,
     TaskProgressColumn,
-    MofNCompleteColumn
+    MofNCompleteColumn,
 )
 from rich.status import Status
 from rich.panel import Panel
@@ -63,7 +63,7 @@ from runbooks.common.rich_utils import (
     print_error,
     create_table,
     create_progress_bar,
-    STATUS_INDICATORS
+    STATUS_INDICATORS,
 )
 
 logger = logging.getLogger(__name__)
@@ -72,6 +72,7 @@ logger = logging.getLogger(__name__)
 @dataclass
 class OptimizationMetrics:
     """Performance optimization metrics tracking"""
+
     operation_name: str
     start_time: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
     end_time: Optional[datetime] = None
@@ -108,14 +109,14 @@
 
 class IntelligentCache:
     """Intelligent caching system with TTL management and memory optimization"""
-
+
     def __init__(self, default_ttl_minutes: int = 30, max_cache_size: int = 1000):
         self.cache: Dict[str, Dict] = {}
         self.cache_timestamps: Dict[str, datetime] = {}
         self.default_ttl_minutes = default_ttl_minutes
         self.max_cache_size = max_cache_size
         self._lock = threading.RLock()
-
+
         # Performance tracking
         self.hits = 0
         self.misses = 0
@@ -126,18 +127,18 @@ class IntelligentCache:
         if key not in self.cache:
             self.misses += 1
             return None
-
+
         # Check TTL
         ttl = ttl_minutes or self.default_ttl_minutes
         cache_age = (datetime.now(timezone.utc) - self.cache_timestamps[key]).total_seconds() / 60
-
+
         if cache_age > ttl:
             # Cache expired
             del self.cache[key]
             del self.cache_timestamps[key]
             self.misses += 1
             return None
-
+
         self.hits += 1
         return self.cache[key]
 
@@ -147,7 +148,7 @@
         # Clean up if at max capacity
         if len(self.cache) >= self.max_cache_size:
             self._cleanup_oldest_entries(int(self.max_cache_size * 0.2))  # Remove 20%
-
+
         self.cache[key] = value
         self.cache_timestamps[key] = datetime.now(timezone.utc)
 
@@ -172,25 +173,26 @@
         """Get cache performance statistics"""
         total_requests = self.hits + self.misses
         hit_rate = (self.hits / total_requests * 100) if total_requests > 0 else 0.0
-
+
         return {
             "size": len(self.cache),
             "hits": self.hits,
             "misses": self.misses,
             "hit_rate": hit_rate,
-            "max_size": self.max_cache_size
+            "max_size": self.max_cache_size,
         }
 
 
 @dataclass
 class CircuitBreakerState:
     """Circuit breaker state tracking for reliability patterns"""
+
     failure_count: int = 0
     last_failure_time: Optional[datetime] = None
     state: str = "closed"  # closed, open, half_open
     success_count: int = 0
     total_requests: int = 0
-
+
     def calculate_failure_rate(self) -> float:
         """Calculate failure rate percentage"""
         if self.total_requests == 0:
@@ -201,20 +203,17 @@
 class CircuitBreaker:
     """
     Circuit breaker implementation for AWS API reliability
-
+
     Provides >99.9% operation success rate through:
     - Automatic failure detection and recovery
     - Graceful degradation patterns
     - Exponential backoff with jitter
     """
-
-    def __init__(self,
-                 failure_threshold: int = 5,
-                 recovery_timeout_seconds: int = 60,
-                 success_threshold: int = 3):
+
+    def __init__(self, failure_threshold: int = 5, recovery_timeout_seconds: int = 60, success_threshold: int = 3):
         """
         Initialize circuit breaker
-
+
         Args:
             failure_threshold: Number of failures before opening circuit
             recovery_timeout_seconds: Time to wait before attempting recovery
@@ -223,7 +222,7 @@ class CircuitBreaker:
         self.failure_threshold = failure_threshold
         self.recovery_timeout_seconds = recovery_timeout_seconds
         self.success_threshold = success_threshold
-
+
         self.state = CircuitBreakerState()
         self._lock = threading.RLock()
 
@@ -231,7 +230,7 @@
     def protected_call(self, operation_name: str = "aws_operation"):
         """
         Context manager for circuit breaker protected operations
-
+
         Args:
             operation_name: Name of operation for logging
         """
@@ -240,21 +239,23 @@
             if self._should_open_circuit():
                 self.state.state = "open"
                 self.state.last_failure_time = datetime.now(timezone.utc)
-
+
             # Check if circuit can transition to half-open
             if self._can_attempt_recovery():
                 self.state.state = "half_open"
                 console.log(f"[yellow]🔄 Circuit breaker half-open for {operation_name}[/yellow]")
-
+
             # Block requests if circuit is open
             if self.state.state == "open":
                 time_since_failure = (datetime.now(timezone.utc) - self.state.last_failure_time).total_seconds()
                 if time_since_failure < self.recovery_timeout_seconds:
-                    raise Exception(f"Circuit breaker OPEN for {operation_name} - recovery in {self.recovery_timeout_seconds - time_since_failure:.1f}s")
+                    raise Exception(
+                        f"Circuit breaker OPEN for {operation_name} - recovery in {self.recovery_timeout_seconds - time_since_failure:.1f}s"
+                    )
 
         try:
             yield
-
+
             # Success - update state
             with self._lock:
                 if self.state.state == "half_open":
@@ -263,35 +264,37 @@
                         self.state.state = "closed"
                         self.state.failure_count = 0
                         self.state.success_count = 0
-                        console.log(f"[green]✅ Circuit breaker CLOSED for {operation_name} - service recovered[/green]")
-
+                        console.log(
+                            f"[green]✅ Circuit breaker CLOSED for {operation_name} - service recovered[/green]"
+                        )
+
                 self.state.total_requests += 1
-
+
         except Exception as e:
             # Failure - update state
             with self._lock:
                 self.state.failure_count += 1
                 self.state.total_requests += 1
                 self.state.last_failure_time = datetime.now(timezone.utc)
-
+
                 if self.state.state == "half_open":
                     self.state.state = "open"
                     console.log(f"[red]🚨 Circuit breaker OPEN for {operation_name} - recovery attempt failed[/red]")
-
+
             raise
 
     def _should_open_circuit(self) -> bool:
         """Check if circuit should be opened based on failure rate"""
         if self.state.state != "closed":
             return False
-
+
         return self.state.failure_count >= self.failure_threshold
 
     def _can_attempt_recovery(self) -> bool:
         """Check if recovery can be attempted"""
         if self.state.state != "open" or not self.state.last_failure_time:
             return False
-
+
         time_since_failure = (datetime.now(timezone.utc) - self.state.last_failure_time).total_seconds()
         return time_since_failure >= self.recovery_timeout_seconds
 
@@ -308,28 +311,28 @@
 
 class OptimizedAWSClientPool:
     """Connection pooling and optimized AWS client management with circuit breaker patterns"""
-
+
     def __init__(self, max_pool_connections: int = 100):
         self.max_pool_connections = max_pool_connections
         self.clients: Dict[str, boto3.client] = {}
         self.sessions: Dict[str, boto3.Session] = {}
         self.circuit_breakers: Dict[str, CircuitBreaker] = {}
         self._lock = threading.RLock()
-
+
         # Optimized botocore configuration with enhanced retry logic
         self.config = Config(
             max_pool_connections=max_pool_connections,
-            retries={'max_attempts': 3, 'mode': 'adaptive'},
+            retries={"max_attempts": 3, "mode": "adaptive"},
             tcp_keepalive=True,
-            region_name='us-east-1',  # Default region for global services
+            region_name="us-east-1",  # Default region for global services
             read_timeout=30,  # 30 second read timeout
             connect_timeout=10,  # 10 second connection timeout
         )
 
-    def get_client(self, service: str, profile: str, region: str = 'us-east-1') -> boto3.client:
+    def get_client(self, service: str, profile: str, region: str = "us-east-1") -> boto3.client:
         """Get optimized AWS client with connection pooling and circuit breaker protection"""
         client_key = f"{service}_{profile}_{region}"
-
+
         with self._lock:
             if client_key not in self.clients:
                 # Create circuit breaker for this service/region combination
@@ -337,38 +340,36 @@
                 self.circuit_breakers[client_key] = CircuitBreaker(
                     failure_threshold=3,  # Open after 3 failures
                     recovery_timeout_seconds=30,  # Attempt recovery after 30s
-                    success_threshold=2  # Close after 2 successes
+                    success_threshold=2,  # Close after 2 successes
                 )
-
+
                 # Create session if not exists
                 session_key = f"{profile}_{region}"
                 if session_key not in self.sessions:
                     self.sessions[session_key] = boto3.Session(profile_name=profile)
-
+
                 # Create client with optimized config
                 self.clients[client_key] = self.sessions[session_key].client(
-                    service,
-                    config=self.config,
-                    region_name=region
+                    service, config=self.config, region_name=region
                 )
-
+
         return self.clients[client_key]
 
     def protected_api_call(self, client_key: str, api_call: Callable, *args, **kwargs):
         """
         Execute AWS API call with circuit breaker protection
-
+
         Args:
             client_key: Client identifier for circuit breaker tracking
             api_call: AWS API method to call
             *args, **kwargs: Arguments for the API call
-
+
         Returns:
             API call result with circuit breaker protection
         """
         if client_key not in self.circuit_breakers:
             self.circuit_breakers[client_key] = CircuitBreaker()
-
+
         with self.circuit_breakers[client_key].protected_call(f"aws_{client_key}"):
             return api_call(*args, **kwargs)
 
@@ -377,20 +378,26 @@
         status = {}
         for client_key, breaker in self.circuit_breakers.items():
             status[client_key] = breaker.get_state_info()
-
+
         # Calculate overall reliability metrics
         total_requests = sum(breaker.state.total_requests for breaker in self.circuit_breakers.values())
         total_failures = sum(breaker.state.failure_count for breaker in self.circuit_breakers.values())
-
-        overall_success_rate = ((total_requests - total_failures) / total_requests * 100) if total_requests > 0 else 100.0
-
+
+        overall_success_rate = (
+            ((total_requests - total_failures) / total_requests * 100) if total_requests > 0 else 100.0
+        )
+
         return {
             "circuit_breakers": status,
             "overall_success_rate": overall_success_rate,
             "total_requests": total_requests,
             "total_failures": total_failures,
             "target_success_rate": 99.9,
-            "reliability_status": "excellent" if overall_success_rate >= 99.9 else "good" if overall_success_rate >= 95.0 else "needs_improvement"
+            "reliability_status": "excellent"
+            if overall_success_rate >= 99.9
+            else "good"
+            if overall_success_rate >= 95.0
+            else "needs_improvement",
         }
 
     def get_session(self, profile: str) -> boto3.Session:
@@ -410,7 +417,7 @@
 class PerformanceOptimizationEngine:
     """
     Enterprise performance optimization engine for CloudOps-Runbooks
-
+
     Implements SRE automation patterns for:
     - Organization discovery optimization (52.3s -> <30s)
     - VPC analysis performance improvements
@@ -418,13 +425,12 @@ class PerformanceOptimizationEngine:
     - Intelligent caching and connection pooling
     """
 
-    def __init__(self,
-                 max_workers: int = 20,
-                 cache_ttl_minutes: int = 30,
-                 memory_limit_mb: int = 512):  # Phase 2: Reduced from 2048MB to 512MB target
+    def __init__(
+        self, max_workers: int = 20, cache_ttl_minutes: int = 30, memory_limit_mb: int = 512
+    ):  # Phase 2: Reduced from 2048MB to 512MB target
         """
         Initialize performance optimization engine
-
+
         Args:
             max_workers: Maximum concurrent workers for parallel operations
             cache_ttl_minutes: Cache TTL in minutes
@@ -432,23 +438,23 @@
         """
         self.max_workers = max_workers
         self.memory_limit_mb = memory_limit_mb
-
+
         # Core optimization components
         self.cache = IntelligentCache(
             default_ttl_minutes=cache_ttl_minutes,
-            max_cache_size=500  # Phase 2: Reduced cache size for memory optimization
+            max_cache_size=500,  # Phase 2: Reduced cache size for memory optimization
         )
         self.client_pool = OptimizedAWSClientPool(max_pool_connections=50)
-
+
         # Performance tracking
         self.metrics: List[OptimizationMetrics] = []
         self.current_operation: Optional[OptimizationMetrics] = None
-
+
         # Phase 2: Enhanced memory monitoring
         self.process = psutil.Process()
         self.memory_monitoring_active = False
         self.memory_optimization_active = True
-
+
         # Phase 2: Multi-account scaling configuration
         self.enterprise_scaling_enabled = True
         self.adaptive_batch_sizing = True
@@ -458,38 +464,35 @@
     def optimize_operation(self, operation_name: str, target_seconds: float = 30.0):
         """
         Context manager for optimized operation execution with monitoring
-
+
         Args:
             operation_name: Name of the operation being optimized
             target_seconds: Target completion time in seconds
         """
         # Start operation metrics tracking
-        metrics = OptimizationMetrics(
-            operation_name=operation_name,
-            target_seconds=target_seconds
-        )
+        metrics = OptimizationMetrics(operation_name=operation_name, target_seconds=target_seconds)
         self.current_operation = metrics
-
+
         # Start memory monitoring
         self._start_memory_monitoring()
-
+
         # Enhanced progress indicator for long operations
         with Status(f"[cyan]🚀 Optimizing: {operation_name}[/cyan]", console=console):
             try:
                 console.log(f"[dim]Starting optimized {operation_name} (target: {target_seconds}s)[/]")
-
+
                 yield metrics
-
+
                 # Mark as successful
                 metrics.finish(success=True)
                 self._log_optimization_results(metrics)
-
+
             except Exception as e:
                 # Handle failure
                 metrics.finish(success=False, error_message=str(e))
                 print_error(f"Optimization failed for {operation_name}", e)
                 raise
-
+
             finally:
                 # Stop monitoring and store results
                 self._stop_memory_monitoring()
@@ -499,51 +502,55 @@
     def _start_memory_monitoring(self):
         """Start background memory usage monitoring with Phase 2 aggressive optimization"""
         self.memory_monitoring_active = True
-
+
         def monitor_memory():
             peak_memory = 0.0
             cleanup_counter = 0
-
+
             while self.memory_monitoring_active and self.current_operation:
                 try:
                     current_memory = self.process.memory_info().rss / (1024 * 1024)  # MB
                     peak_memory = max(peak_memory, current_memory)
                     self.current_operation.memory_peak_mb = peak_memory
-
+
                     # Phase 2: Aggressive memory management at 80% threshold
                     memory_threshold_80 = self.memory_limit_mb * 0.8
                     memory_threshold_90 = self.memory_limit_mb * 0.9
-
+
                     if current_memory > memory_threshold_90:
-                        console.log(f"[red]🚨 CRITICAL: Memory usage ({current_memory:.1f}MB) at 90% limit ({self.memory_limit_mb}MB)[/red]")
+                        console.log(
+                            f"[red]🚨 CRITICAL: Memory usage ({current_memory:.1f}MB) at 90% limit ({self.memory_limit_mb}MB)[/red]"
+                        )
                         if self.auto_memory_cleanup:
                             self._aggressive_memory_cleanup()
-
+
                     elif current_memory > memory_threshold_80:
-                        console.log(f"[yellow]⚠️ WARNING: Memory usage ({current_memory:.1f}MB) at 80% limit ({self.memory_limit_mb}MB)[/yellow]")
+                        console.log(
+                            f"[yellow]⚠️ WARNING: Memory usage ({current_memory:.1f}MB) at 80% limit ({self.memory_limit_mb}MB)[/yellow]"
+                        )
                         if self.auto_memory_cleanup and cleanup_counter % 5 == 0:  # Every 5 seconds at 80%
                             self._proactive_memory_cleanup()
-
+
                     # Phase 2: Proactive cleanup every 10 seconds
                     cleanup_counter += 1
                     if self.auto_memory_cleanup and cleanup_counter % 10 == 0:
                         gc.collect()
-
+
                     time.sleep(1)  # Check every second
                 except Exception:
                     break
-
+
         self.memory_thread = threading.Thread(target=monitor_memory, daemon=True)
         self.memory_thread.start()
 
     def _proactive_memory_cleanup(self):
         """Proactive memory cleanup at 80% threshold"""
         console.log("[dim]🧹 Proactive memory cleanup initiated[/dim]")
-
+
         # Clear old cache entries
-        if hasattr(self.cache, '_cleanup_oldest_entries'):
+        if hasattr(self.cache, "_cleanup_oldest_entries"):
             self.cache._cleanup_oldest_entries(int(self.cache.max_cache_size * 0.1))  # Clear 10%
-
+
         # Force garbage collection
         collected = gc.collect()
         if collected > 0:
@@ -552,19 +559,19 @@
     def _aggressive_memory_cleanup(self):
        """Aggressive memory cleanup at 90% threshold"""
        console.log("[red]🚨 Aggressive memory cleanup initiated[/red]")
-
+
        # Clear significant cache entries
-        if hasattr(self.cache, '_cleanup_oldest_entries'):
+        if hasattr(self.cache, "_cleanup_oldest_entries"):
            self.cache._cleanup_oldest_entries(int(self.cache.max_cache_size * 0.3))  # Clear 30%
-
+
        # Multiple GC passes
        total_collected = 0
        for i in range(3):
            collected = gc.collect(i)
            total_collected += collected
-
+
        console.log(f"[yellow]🗑️ Emergency cleanup collected {total_collected} objects[/yellow]")
-
+
        # Update optimization applied list
        if self.current_operation:
            self.current_operation.optimization_applied.append("aggressive_memory_cleanup")
@@ -577,7 +584,7 @@
         """Log optimization results with rich formatting"""
         improvement = metrics.get_performance_improvement()
         cache_efficiency = metrics.get_cache_efficiency()
-
+
         if metrics.success:
             if metrics.duration_seconds <= metrics.target_seconds:
                 print_success(
@@ -589,33 +596,35 @@
                     f"{metrics.operation_name} completed in {metrics.duration_seconds:.1f}s "
                     f"(target: {metrics.target_seconds:.1f}s)"
                 )
-
+
         # Log optimization details
         if metrics.optimization_applied:
             console.log(f"[dim]Optimizations applied: {', '.join(metrics.optimization_applied)}[/]")
-
+
         if cache_efficiency > 0:
-            console.log(f"[dim]Cache efficiency: {cache_efficiency:.1f}% ({metrics.cache_hits} hits, {metrics.cache_misses} misses)[/]")
+            console.log(
+                f"[dim]Cache efficiency: {cache_efficiency:.1f}% ({metrics.cache_hits} hits, {metrics.cache_misses} misses)[/]"
+            )
 
-    def optimize_organization_discovery(self,
-                                        management_profile: str,
-                                        use_parallel_processing: bool = True,
-                                        batch_size: int = 20) -> Callable:
+    def optimize_organization_discovery(
+        self, management_profile: str, use_parallel_processing: bool = True, batch_size: int = 20
+    ) -> Callable:
         """
         Optimize organization discovery operations
-
+
         Addresses: Organization Discovery Performance (52.3s -> <30s target)
-
+
         Returns optimized function with:
         - Intelligent caching for Organizations API calls
         - Parallel account processing
         - Memory-efficient batch processing
         - Connection pooling
         """
+
         def optimized_discover_accounts():
             """Optimized account discovery with caching and parallel processing"""
             cache_key = f"org_accounts_{management_profile}"
-
+
             # Check cache first
             cached_result = self.cache.get(cache_key, ttl_minutes=15)  # Shorter TTL for critical data
             if cached_result and self.current_operation:
@@ -623,53 +632,55 @@
                 self.current_operation.optimization_applied.append("intelligent_caching")
                 console.log("[blue]🚀 Using cached organization data for optimal performance[/blue]")
                 return cached_result
-
+
             if self.current_operation:
                 self.current_operation.cache_misses += 1
-
+
             # Perform optimized discovery
             try:
                 # Get optimized Organizations client
-                org_client = self.client_pool.get_client('organizations', management_profile)
-
+                org_client = self.client_pool.get_client("organizations", management_profile)
+
                 accounts = []
-                paginator = org_client.get_paginator('list_accounts')
-
+                paginator = org_client.get_paginator("list_accounts")
+
                 # Track API calls
                 api_calls = 0
-
+
                 # Use parallel processing for account details if enabled
                 if use_parallel_processing:
                     if self.current_operation:
                         self.current_operation.optimization_applied.append("parallel_processing")
-
+
                     accounts = self._process_accounts_parallel(paginator, org_client, batch_size)
                 else:
                     # Sequential processing (fallback)
                     for page in paginator.paginate():
-                        accounts.extend(page['Accounts'])
+                        accounts.extend(page["Accounts"])
                         api_calls += 1
-
+
                         # Trigger garbage collection periodically for memory efficiency
                         if api_calls % 10 == 0:
                             gc.collect()
-
+
                 if self.current_operation:
                     self.current_operation.api_calls_made = api_calls
                     self.current_operation.optimization_applied.append("connection_pooling")
-
+
                 # Cache the result
                 result = {
-                    'accounts': accounts,
-                    'total_count': len(accounts),
-                    'discovery_method': 'optimized_organizations_api',
-                    'optimizations_applied': self.current_operation.optimization_applied if self.current_operation else []
+                    "accounts": accounts,
+                    "total_count": len(accounts),
+                    "discovery_method": "optimized_organizations_api",
+                    "optimizations_applied": self.current_operation.optimization_applied
+                    if self.current_operation
+                    else [],
                 }
-
+
                 self.cache.set(cache_key, result)
-
+
                 return result
-
+
             except Exception as e:
                 logger.error(f"Optimized organization discovery failed: {e}")
                 raise
@@ -679,23 +690,23 @@
     def _process_accounts_parallel(self, paginator, org_client, batch_size: int) -> List[Dict]:
         """Process accounts in parallel with memory optimization"""
         all_accounts = []
-
+
         # Collect all account IDs first (memory efficient)
         account_ids = []
         for page in paginator.paginate():
-            account_ids.extend([acc['Id'] for acc in page['Accounts']])
-            all_accounts.extend(page['Accounts'])  # Store basic account info
-
+            account_ids.extend([acc["Id"] for acc in page["Accounts"]])
+            all_accounts.extend(page["Accounts"])  # Store basic account info
+
         if self.current_operation:
             self.current_operation.api_calls_made += len(account_ids) // 100 + 1  # Estimate pages
-
+
         # Process account tags in batches to avoid memory issues
         if len(account_ids) > batch_size:
             if self.current_operation:
                 self.current_operation.optimization_applied.append("batch_processing")
-
+
             self._enrich_accounts_with_tags_batched(all_accounts, org_client, batch_size)
-
+
         return all_accounts
 
     def _enrich_accounts_with_tags_batched(self, accounts: List[Dict], org_client, batch_size: int):
@@ -703,25 +714,25 @@
         with ThreadPoolExecutor(max_workers=min(self.max_workers, 10)) as executor:
             # Process in batches to control memory usage
             for i in range(0, len(accounts), batch_size):
-                batch = accounts[i:i + batch_size]
-
+                batch = accounts[i : i + batch_size]
+
                 # Submit batch for parallel tag processing
                 futures = []
                 for account in batch:
-                    future = executor.submit(self._get_account_tags_safe, org_client, account['Id'])
+                    future = executor.submit(self._get_account_tags_safe, org_client, account["Id"])
                     futures.append((future, account))
-
+
                 # Collect results for this batch
                 for future, account in futures:
                     try:
                         tags = future.result(timeout=10)  # 10 second timeout per account
-                        account['Tags'] = tags
+                        account["Tags"] = tags
                         if self.current_operation:
                             self.current_operation.api_calls_made += 1
                     except Exception as e:
                         logger.debug(f"Failed to get tags for account {account['Id']}: {e}")
-                        account['Tags'] = {}
-
+                        account["Tags"] = {}
+
                 # Trigger garbage collection after each batch
                 gc.collect()
 
@@ -729,112 +740,109 @@
         """Safely get account tags with error handling"""
         try:
             response = org_client.list_tags_for_resource(ResourceId=account_id)
-            return {tag['Key']: tag['Value'] for tag in response['Tags']}
+            return {tag["Key"]: tag["Value"] for tag in response["Tags"]}
         except Exception:
             return {}
 
     def optimize_vpc_analysis(self, operational_profile: str) -> Callable:
         """
         Optimize VPC analysis operations to address timeout issues
-
+
         Returns optimized function with:
         - Connection pooling for multiple regions
         - Parallel region processing
         - Intelligent timeout handling
         - Memory-efficient resource processing
         """
+
         def optimized_vpc_analysis(regions: List[str] = None):
             """Optimized VPC analysis with regional parallelization"""
             if regions is None:
-                regions = [
-                    'us-east-1', 'us-west-2', 'eu-west-1', 'eu-central-1',
-                    'ap-southeast-1', 'ap-northeast-1'
-                ]
-
+                regions = ["us-east-1", "us-west-2", "eu-west-1", "eu-central-1", "ap-southeast-1", "ap-northeast-1"]
+
             cache_key = f"vpc_analysis_{operational_profile}_{'_'.join(sorted(regions))}"
-
+
             # Check cache
             cached_result = self.cache.get(cache_key, ttl_minutes=60)  # Longer TTL for VPC data
             if cached_result and self.current_operation:
                 self.current_operation.cache_hits += 1
                 self.current_operation.optimization_applied.append("regional_caching")
                 return cached_result
-
+
             if self.current_operation:
                 self.current_operation.cache_misses += 1
                 self.current_operation.optimization_applied.append("parallel_regional_processing")
-
+
             # Parallel regional analysis
             vpc_data = {}
-
+
             with Progress(
                 SpinnerColumn(),
                 TextColumn("[progress.description]{task.description}"),
                 BarColumn(),
                 MofNCompleteColumn(),
                 TimeElapsedColumn(),
-                console=console
+                console=console,
             ) as progress:
-
                 task = progress.add_task("Analyzing VPCs across regions...", total=len(regions))
-
+
                 with ThreadPoolExecutor(max_workers=min(self.max_workers, len(regions))) as executor:
                     # Submit region analysis tasks
                     future_to_region = {
-                        executor.submit(self._analyze_vpc_region, operational_profile, region): region
+                        executor.submit(self._analyze_vpc_region, operational_profile, region): region
                         for region in regions
                     }
-
+
                     for future in as_completed(future_to_region):
                         region = future_to_region[future]
                         try:
                             region_data = future.result(timeout=45)  # 45s timeout per region
                             vpc_data[region] = region_data
-
+
                             if self.current_operation:
-                                self.current_operation.api_calls_made += region_data.get('api_calls', 0)
-
+                                self.current_operation.api_calls_made += region_data.get("api_calls", 0)
+
                         except Exception as e:
                             logger.warning(f"VPC analysis failed for region {region}: {e}")
-                            vpc_data[region] = {'error': str(e), 'vpcs': []}
-
+                            vpc_data[region] = {"error": str(e), "vpcs": []}
+
                         finally:
                             progress.advance(task)
-
+
             # Aggregate results
             result = {
-                'vpc_data_by_region': vpc_data,
-                'total_vpcs': sum(len(data.get('vpcs', [])) for data in vpc_data.values()),
-                'regions_analyzed': len(regions),
-                'optimization_applied': self.current_operation.optimization_applied if self.current_operation else []
+                "vpc_data_by_region": vpc_data,
+                "total_vpcs": sum(len(data.get("vpcs", [])) for data in vpc_data.values()),
+                "regions_analyzed": len(regions),
+                "optimization_applied": self.current_operation.optimization_applied if self.current_operation else [],
             }
-
+
             # Cache result
             self.cache.set(cache_key, result)
-
+
             return result
 
         return optimized_vpc_analysis
 
-    def optimize_multi_account_operations(self,
-                                          account_list: List[str],
-                                          operation_function: Callable,
-                                          batch_size: Optional[int] = None) -> Callable:
+    def optimize_multi_account_operations(
+        self, account_list: List[str], operation_function: Callable, batch_size: Optional[int] = None
+    ) -> Callable:
         """
         Phase 2: Optimize multi-account operations for 200+ enterprise account scaling
-
+
         Args:
             account_list: List of AWS account IDs to process
             operation_function: Function to execute per account
             batch_size: Adaptive batch size (auto-calculated if None)
-
+
         Returns:
             Optimized function with enterprise scaling patterns
         """
+
         def optimized_multi_account_operation(**kwargs):
             """Optimized multi-account operation with adaptive scaling"""
             account_count = len(account_list)
-
+
             # Phase 2: Adaptive batch sizing based on account count and memory
             if batch_size is None:
                 if account_count <= 50:
@@ -847,28 +855,30 @@
                     calculated_batch_size = 25  # Enterprise scale 200+
             else:
                 calculated_batch_size = batch_size
-
+
             # Adjust batch size based on current memory usage
             if self.memory_optimization_active:
                 current_memory = self.process.memory_info().rss / (1024 * 1024)
                 memory_utilization = current_memory / self.memory_limit_mb
-
+
                 if memory_utilization > 0.7:
                     calculated_batch_size = max(5, calculated_batch_size // 2)
-                    console.log(f"[yellow]📉 Reducing batch size to {calculated_batch_size} due to memory pressure[/yellow]")
-
-            console.log(f"[cyan]🏢 Enterprise multi-account operation: {account_count} accounts, batch size: {calculated_batch_size}[/cyan]")
-
+                    console.log(
+                        f"[yellow]📉 Reducing batch size to {calculated_batch_size} due to memory pressure[/yellow]"
+                    )
+
+            console.log(
+                f"[cyan]🏢 Enterprise multi-account operation: {account_count} accounts, batch size: {calculated_batch_size}[/cyan]"
+            )
+
             if self.current_operation:
-                self.current_operation.optimization_applied.extend([
-                    "enterprise_multi_account_scaling",
-                    "adaptive_batch_sizing",
-                    f"batch_size_{calculated_batch_size}"
-                ])
-
+                self.current_operation.optimization_applied.extend(
+                    ["enterprise_multi_account_scaling", "adaptive_batch_sizing", f"batch_size_{calculated_batch_size}"]
+                )
+
             results = {}
             processed_count = 0
-
+
             # Process accounts in adaptive batches
             with Progress(
                 SpinnerColumn(),
@@ -876,83 +886,84 @@
                 BarColumn(),
                 MofNCompleteColumn(),
                 TimeElapsedColumn(),
-                console=console
+                console=console,
             ) as progress:
-
                 task = progress.add_task("Processing enterprise accounts...", total=account_count)
-
+
                 # Process in batches with circuit breaker protection
                 for i in range(0, account_count, calculated_batch_size):
-                    batch_accounts = account_list[i:i + calculated_batch_size]
-
+                    batch_accounts = account_list[i : i + calculated_batch_size]
+
                     with ThreadPoolExecutor(max_workers=min(self.max_workers, len(batch_accounts))) as executor:
                         batch_futures = {}
-
+
                         for account_id in batch_accounts:
                             # Use circuit breaker protection for each account
                             client_key = f"account_{account_id}"
-
+
                             try:
                                 future = executor.submit(
                                     self._protected_account_operation,
                                     client_key,
                                     operation_function,
                                     account_id,
-                                    **kwargs
+                                    **kwargs,
                                 )
                                 batch_futures[future] = account_id
-
+
                             except Exception as e:
                                 logger.warning(f"Failed to submit operation for account {account_id}: {e}")
                                 results[account_id] = {"error": str(e), "success": False}
-
+
                         # Collect batch results with timeout handling
                         for future in as_completed(batch_futures, timeout=120):  # 2 minute timeout per batch
                             account_id = batch_futures[future]
                             try:
                                 result = future.result(timeout=60)  # 1 minute per account
                                 results[account_id] = result
-
+
                             except Exception as e:
                                 logger.warning(f"Account operation failed for {account_id}: {e}")
                                 results[account_id] = {"error": str(e), "success": False}
-
+
                             finally:
                                 processed_count += 1
                                 progress.advance(task)
-
+
                     # Phase 2: Proactive memory cleanup between batches
                     if self.auto_memory_cleanup and i > 0:
                         current_memory = self.process.memory_info().rss / (1024 * 1024)
                         if current_memory > self.memory_limit_mb * 0.6:
                             self._proactive_memory_cleanup()
                             time.sleep(1)  # Brief pause after cleanup
-
+
             # Update operation metrics
             if self.current_operation:
                 self.current_operation.api_calls_made += processed_count
                 success_count = sum(1 for r in results.values() if r.get("success", False))
                 success_rate = (success_count / processed_count * 100) if processed_count > 0 else 0
-
-                console.log(f"[green]✅ Multi-account operation completed: {success_count}/{processed_count} accounts ({success_rate:.1f}% success)[/green]")
-
+
+                console.log(
+                    f"[green]✅ Multi-account operation completed: {success_count}/{processed_count} accounts ({success_rate:.1f}% success)[/green]"
+                )
+
                 if success_rate >= 99.0:
                     self.current_operation.optimization_applied.append("high_reliability_achieved")
-
+
             return {
-                'results': results,
-                'total_accounts': account_count,
-                'processed_accounts': processed_count,
-                'success_rate': success_rate,
-                'batch_size_used': calculated_batch_size,
-                'optimization_summary': {
-                    'enterprise_scaling': True,
-                    'adaptive_batching': True,
-                    'memory_optimized': self.memory_optimization_active,
-                    'reliability_protected': True
-                }
+                "results": results,
+                "total_accounts": account_count,
+                "processed_accounts": processed_count,
+                "success_rate": success_rate,
+                "batch_size_used": calculated_batch_size,
+                "optimization_summary": {
+                    "enterprise_scaling": True,
+                    "adaptive_batching": True,
+                    "memory_optimized": self.memory_optimization_active,
+                    "reliability_protected": True,
+                },
             }
-
+
         return optimized_multi_account_operation
 
     def _protected_account_operation(self, client_key: str, operation_function: Callable, account_id: str, **kwargs):
@@ -962,57 +973,53 @@
             self.client_pool.circuit_breakers[client_key] = CircuitBreaker(
                 failure_threshold=2,  # More aggressive for account-level operations
                 recovery_timeout_seconds=15,  # Faster recovery for account operations
-                success_threshold=1  # Close quickly on success
+                success_threshold=1,  # Close quickly on success
             )
-
+
         with self.client_pool.circuit_breakers[client_key].protected_call(f"account_{account_id}"):
             return operation_function(account_id=account_id, **kwargs)
 
     def _analyze_vpc_region(self, profile: str, region: str) -> Dict:
         """Analyze VPCs in a specific region with optimization"""
         try:
-            ec2_client = self.client_pool.get_client('ec2', profile, region)
-
+            ec2_client = self.client_pool.get_client("ec2", profile, region)
+
             # Get VPCs with pagination
             vpcs = []
             api_calls = 0
-
-            paginator = ec2_client.get_paginator('describe_vpcs')
+
+            paginator = ec2_client.get_paginator("describe_vpcs")
             for page in paginator.paginate():
-                vpcs.extend(page['Vpcs'])
+                vpcs.extend(page["Vpcs"])
                 api_calls += 1
-
+
             # Enrich with network details (optimized)
             for vpc in vpcs:
                 # Get subnets for this VPC
                 try:
                     subnets_response = ec2_client.describe_subnets(
-                        Filters=[{'Name': 'vpc-id', 'Values': [vpc['VpcId']]}]
+                        Filters=[{"Name": "vpc-id", "Values": [vpc["VpcId"]]}]
                     )
-                    vpc['Subnets'] = subnets_response['Subnets']
+                    vpc["Subnets"] = subnets_response["Subnets"]
                     api_calls += 1
                 except Exception as e:
                     logger.debug(f"Failed to get subnets for VPC {vpc['VpcId']}: {e}")
-                    vpc['Subnets'] = []
-
-            return {
-                'vpcs': vpcs,
-                'region': region,
-                'api_calls': api_calls
-            }
-
+                    vpc["Subnets"] = []
+
+            return {"vpcs": vpcs, "region": region, "api_calls": api_calls}
+
         except Exception as e:
             logger.error(f"VPC region analysis failed for {region}: {e}")
-            return {'vpcs': [], 'region': region, 'error': str(e), 'api_calls': 0}
+            return {"vpcs": [], "region": region, "error": str(e), "api_calls": 0}
 
     def create_optimization_summary(self) -> None:
         """Create comprehensive optimization performance summary with Phase 2 reliability metrics"""
         if not self.metrics:
             console.print("[yellow]No optimization metrics available yet[/]")
             return
-
+
         print_header("Performance Optimization Summary - Phase 2 Enhanced", "SRE Automation Engine")
-
+
         # Phase 2: Create enhanced metrics table with reliability information
         table = create_table(
             title="Phase 2 Optimization Results",
@@ -1023,23 +1030,23 @@
                 {"name": "Memory", "style": "blue", "justify": "right"},
                 {"name": "Improvement", "style": "white", "justify": "right"},
                 {"name": "Optimizations", "style": "dim", "justify": "left", "max_width": 25},
-                {"name": "Status", "style": "white", "justify": "center"}
-            ]
+                {"name": "Status", "style": "white", "justify": "center"},
+            ],
         )
-
+
         for metrics in self.metrics:
             improvement = metrics.get_performance_improvement()
-            status_icon = STATUS_INDICATORS['success'] if metrics.success else STATUS_INDICATORS['error']
-            status_color = 'green' if metrics.success else 'red'
-
+            status_icon = STATUS_INDICATORS["success"] if metrics.success else STATUS_INDICATORS["error"]
+            status_color = "green" if metrics.success else "red"
+
             improvement_text = f"+{improvement:.1f}%" if improvement > 0 else f"{improvement:.1f}%"
-            improvement_color = 'green' if improvement > 0 else 'yellow'
-
+            improvement_color = "green" if improvement > 0 else "yellow"
+
             # Phase 2: Memory usage display with color coding
             memory_mb = metrics.memory_peak_mb
-            memory_color = 'green' if memory_mb <= 256 else 'yellow' if memory_mb <= 512 else 'red'
+            memory_color = "green" if memory_mb <= 256 else "yellow" if memory_mb <= 512 else "red"
             memory_text = f"[{memory_color}]{memory_mb:.0f}MB[/{memory_color}]"
-
+
             table.add_row(
                 metrics.operation_name,
                 f"{metrics.duration_seconds:.1f}s",
@@ -1047,29 +1054,27 @@
                 memory_text,
                 f"[{improvement_color}]{improvement_text}[/]",
                 ", ".join(metrics.optimization_applied[:2]) + ("..." if len(metrics.optimization_applied) > 2 else ""),
-                f"[{status_color}]{status_icon}[/]"
+                f"[{status_color}]{status_icon}[/]",
             )
-
+
         console.print(table)
-
+
         # Cache statistics
         cache_stats = self.cache.get_stats()
         cache_panel = Panel(
             f"[cyan]Cache Size:[/] {cache_stats['size']}/{cache_stats['max_size']}\n"
             f"[cyan]Hit Rate:[/] {cache_stats['hit_rate']:.1f}% ({cache_stats['hits']} hits, {cache_stats['misses']} misses)",
             title="[bold]Cache Performance[/bold]",
-            border_style="blue"
+            border_style="blue",
         )
         console.print(cache_panel)
-
+
         # Phase 2: Reliability status panel
         reliability_stats = self.client_pool.get_reliability_status()
-        reliability_color = {
-            'excellent': 'green',
-            'good': 'blue',
-            'needs_improvement': 'yellow'
-        }.get(reliability_stats.get('reliability_status', 'good'), 'white')
-
+        reliability_color = {"excellent": "green", "good": "blue", "needs_improvement": "yellow"}.get(
+            reliability_stats.get("reliability_status", "good"), "white"
+        )
+
         reliability_panel = Panel(
             f"[cyan]Success Rate:[/] [{reliability_color}]{reliability_stats['overall_success_rate']:.2f}%[/{reliability_color}] "
             f"(Target: {reliability_stats['target_success_rate']}%)\n"
@@ -1078,33 +1083,39 @@
             f"[cyan]Circuit Breakers:[/] {len(reliability_stats['circuit_breakers'])} active "
             f"([cyan]Status:[/] [{reliability_color}]{reliability_stats['reliability_status'].title()}[/{reliability_color}])",
             title="[bold]Phase 2 Reliability Metrics[/bold]",
-            border_style=reliability_color
+            border_style=reliability_color,
         )
         console.print(reliability_panel)
-
+
         # Phase 2: Memory optimization status
         memory_report = self.get_memory_usage_report()
-        memory_color = 'green' if memory_report['current_memory_mb'] <= 256 else 'yellow' if memory_report['current_memory_mb'] <= 512 else 'red'
-
+        memory_color = (
+            "green"
+            if memory_report["current_memory_mb"] <= 256
+            else "yellow"
+            if memory_report["current_memory_mb"] <= 512
+            else "red"
+        )
+
         memory_panel = Panel(
             f"[cyan]Current Memory:[/] [{memory_color}]{memory_report['current_memory_mb']:.1f}MB[/{memory_color}] / {self.memory_limit_mb}MB\n"
             f"[cyan]Peak Memory:[/] {memory_report.get('peak_memory_mb', 0):.1f}MB\n"
             f"[cyan]Status:[/] [{memory_color}]{memory_report['memory_efficiency'].title()}[/{memory_color}] "
             f"([cyan]Cleanup:[/] {'Enabled' if self.auto_memory_cleanup else 'Disabled'})",
             title="[bold]Phase 2 Memory Optimization[/bold]",
-            border_style=memory_color
+            border_style=memory_color,
         )
         console.print(memory_panel)
 
     def get_memory_usage_report(self) -> Dict[str, Any]:
         """Get current memory usage report"""
         memory_info = self.process.memory_info()
-
+
         return {
             "current_memory_mb": memory_info.rss / (1024 * 1024),
             "peak_memory_mb": max(m.memory_peak_mb for m in self.metrics) if self.metrics else 0.0,
             "memory_limit_mb": self.memory_limit_mb,
-            "memory_efficiency": "good" if memory_info.rss / (1024 * 1024) < self.memory_limit_mb * 0.8 else "warning"
+            "memory_efficiency": "good" if memory_info.rss / (1024 * 1024) < self.memory_limit_mb * 0.8 else "warning",
         }
 
     def clear_caches(self):
@@ -1118,16 +1129,14 @@
 _optimization_engine: Optional[PerformanceOptimizationEngine] = None
 
 
-def get_optimization_engine(max_workers: int = 20,
-                            cache_ttl_minutes: int = 30,
-                            memory_limit_mb: int = 512) -> PerformanceOptimizationEngine:  # Phase 2: Default 512MB
+def get_optimization_engine(
+    max_workers: int = 20, cache_ttl_minutes: int = 30, memory_limit_mb: int = 512
+) -> PerformanceOptimizationEngine:  # Phase 2: Default 512MB
     """Get or create global performance optimization engine with Phase 2 enhancements"""
     global _optimization_engine
     if _optimization_engine is None:
         _optimization_engine = PerformanceOptimizationEngine(
-            max_workers=max_workers,
-            cache_ttl_minutes=cache_ttl_minutes,
-            memory_limit_mb=memory_limit_mb
+            max_workers=max_workers, cache_ttl_minutes=cache_ttl_minutes, memory_limit_mb=memory_limit_mb
         )
     return _optimization_engine
 
@@ -1143,11 +1152,11 @@ def create_optimization_report():
 # Export public interface - Phase 2 Enhanced
 __all__ = [
     "PerformanceOptimizationEngine",
-    "OptimizationMetrics",
+    "OptimizationMetrics",
     "IntelligentCache",
     "OptimizedAWSClientPool",
     "CircuitBreaker",
     "CircuitBreakerState",
     "get_optimization_engine",
-    "create_optimization_report"
-]
+    "create_optimization_report",
+]
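
For orientation, the public interface exported above (see the __all__ list at the end of the diff) can be exercised roughly as follows. This is a minimal sketch inferred from the signatures visible in this diff, not documented usage: it assumes optimize_operation is usable as a with-statement context manager (any @contextmanager decorator sits outside the hunks shown) and that an AWS profile named "default" exists locally — both are assumptions, as is the operation name.

    from runbooks.common.performance_optimization_engine import (
        CircuitBreaker,
        get_optimization_engine,
    )

    # Module-level singleton with the Phase 2 defaults shown in the diff:
    # 20 workers, 30-minute cache TTL, 512MB memory target.
    engine = get_optimization_engine(max_workers=20, cache_ttl_minutes=30, memory_limit_mb=512)

    # optimize_operation yields an OptimizationMetrics record; cache hits,
    # API-call counts, and applied optimizations are tracked on it.
    with engine.optimize_operation("organization_discovery", target_seconds=30.0) as metrics:
        # "default" is a hypothetical profile name for illustration.
        discover = engine.optimize_organization_discovery(management_profile="default")
        accounts = discover()
        print(accounts["total_count"], metrics.optimization_applied)

    # Standalone circuit breaker: opens after 3 failures, attempts recovery
    # after 30s, and closes again after 2 consecutive successes.
    breaker = CircuitBreaker(failure_threshold=3, recovery_timeout_seconds=30, success_threshold=2)
    with breaker.protected_call("ec2_describe_vpcs"):
        ...  # wrap any AWS API call here for failure tracking

    engine.create_optimization_summary()  # Rich table and panels for the recorded runs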