aws-cis-controls-assessment 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aws_cis_assessment/__init__.py +11 -0
- aws_cis_assessment/cli/__init__.py +3 -0
- aws_cis_assessment/cli/examples.py +274 -0
- aws_cis_assessment/cli/main.py +1259 -0
- aws_cis_assessment/cli/utils.py +356 -0
- aws_cis_assessment/config/__init__.py +1 -0
- aws_cis_assessment/config/config_loader.py +328 -0
- aws_cis_assessment/config/rules/cis_controls_ig1.yaml +590 -0
- aws_cis_assessment/config/rules/cis_controls_ig2.yaml +412 -0
- aws_cis_assessment/config/rules/cis_controls_ig3.yaml +100 -0
- aws_cis_assessment/controls/__init__.py +1 -0
- aws_cis_assessment/controls/base_control.py +400 -0
- aws_cis_assessment/controls/ig1/__init__.py +239 -0
- aws_cis_assessment/controls/ig1/control_1_1.py +586 -0
- aws_cis_assessment/controls/ig1/control_2_2.py +231 -0
- aws_cis_assessment/controls/ig1/control_3_3.py +718 -0
- aws_cis_assessment/controls/ig1/control_3_4.py +235 -0
- aws_cis_assessment/controls/ig1/control_4_1.py +461 -0
- aws_cis_assessment/controls/ig1/control_access_keys.py +310 -0
- aws_cis_assessment/controls/ig1/control_advanced_security.py +512 -0
- aws_cis_assessment/controls/ig1/control_backup_recovery.py +510 -0
- aws_cis_assessment/controls/ig1/control_cloudtrail_logging.py +197 -0
- aws_cis_assessment/controls/ig1/control_critical_security.py +422 -0
- aws_cis_assessment/controls/ig1/control_data_protection.py +898 -0
- aws_cis_assessment/controls/ig1/control_iam_advanced.py +573 -0
- aws_cis_assessment/controls/ig1/control_iam_governance.py +493 -0
- aws_cis_assessment/controls/ig1/control_iam_policies.py +383 -0
- aws_cis_assessment/controls/ig1/control_instance_optimization.py +100 -0
- aws_cis_assessment/controls/ig1/control_network_enhancements.py +203 -0
- aws_cis_assessment/controls/ig1/control_network_security.py +672 -0
- aws_cis_assessment/controls/ig1/control_s3_enhancements.py +173 -0
- aws_cis_assessment/controls/ig1/control_s3_security.py +422 -0
- aws_cis_assessment/controls/ig1/control_vpc_security.py +235 -0
- aws_cis_assessment/controls/ig2/__init__.py +172 -0
- aws_cis_assessment/controls/ig2/control_3_10.py +698 -0
- aws_cis_assessment/controls/ig2/control_3_11.py +1330 -0
- aws_cis_assessment/controls/ig2/control_5_2.py +393 -0
- aws_cis_assessment/controls/ig2/control_advanced_encryption.py +355 -0
- aws_cis_assessment/controls/ig2/control_codebuild_security.py +263 -0
- aws_cis_assessment/controls/ig2/control_encryption_rest.py +382 -0
- aws_cis_assessment/controls/ig2/control_encryption_transit.py +382 -0
- aws_cis_assessment/controls/ig2/control_network_ha.py +467 -0
- aws_cis_assessment/controls/ig2/control_remaining_encryption.py +426 -0
- aws_cis_assessment/controls/ig2/control_remaining_rules.py +363 -0
- aws_cis_assessment/controls/ig2/control_service_logging.py +402 -0
- aws_cis_assessment/controls/ig3/__init__.py +49 -0
- aws_cis_assessment/controls/ig3/control_12_8.py +395 -0
- aws_cis_assessment/controls/ig3/control_13_1.py +467 -0
- aws_cis_assessment/controls/ig3/control_3_14.py +523 -0
- aws_cis_assessment/controls/ig3/control_7_1.py +359 -0
- aws_cis_assessment/core/__init__.py +1 -0
- aws_cis_assessment/core/accuracy_validator.py +425 -0
- aws_cis_assessment/core/assessment_engine.py +1266 -0
- aws_cis_assessment/core/audit_trail.py +491 -0
- aws_cis_assessment/core/aws_client_factory.py +313 -0
- aws_cis_assessment/core/error_handler.py +607 -0
- aws_cis_assessment/core/models.py +166 -0
- aws_cis_assessment/core/scoring_engine.py +459 -0
- aws_cis_assessment/reporters/__init__.py +8 -0
- aws_cis_assessment/reporters/base_reporter.py +454 -0
- aws_cis_assessment/reporters/csv_reporter.py +835 -0
- aws_cis_assessment/reporters/html_reporter.py +2162 -0
- aws_cis_assessment/reporters/json_reporter.py +561 -0
- aws_cis_controls_assessment-1.0.3.dist-info/METADATA +248 -0
- aws_cis_controls_assessment-1.0.3.dist-info/RECORD +77 -0
- aws_cis_controls_assessment-1.0.3.dist-info/WHEEL +5 -0
- aws_cis_controls_assessment-1.0.3.dist-info/entry_points.txt +2 -0
- aws_cis_controls_assessment-1.0.3.dist-info/licenses/LICENSE +21 -0
- aws_cis_controls_assessment-1.0.3.dist-info/top_level.txt +2 -0
- docs/README.md +94 -0
- docs/assessment-logic.md +766 -0
- docs/cli-reference.md +698 -0
- docs/config-rule-mappings.md +393 -0
- docs/developer-guide.md +858 -0
- docs/installation.md +299 -0
- docs/troubleshooting.md +634 -0
- docs/user-guide.md +487 -0
|
@@ -0,0 +1,607 @@
|
|
|
1
|
+
"""Comprehensive error handling system for AWS CIS Assessment tool."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import traceback
|
|
5
|
+
import time
|
|
6
|
+
from typing import Dict, List, Optional, Any, Callable, Union
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from datetime import datetime, timedelta
|
|
9
|
+
from enum import Enum
|
|
10
|
+
from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError
|
|
11
|
+
from botocore.exceptions import EndpointConnectionError, ConnectTimeoutError, ReadTimeoutError
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ErrorSeverity(Enum):
    """Severity level attached to a recorded error.

    Members are declared from CRITICAL down to INFO; each member's value
    is its own name as an upper-case string.
    """

    CRITICAL = "CRITICAL"
    HIGH = "HIGH"
    MEDIUM = "MEDIUM"
    LOW = "LOW"
    INFO = "INFO"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ErrorCategory(Enum):
    """Classification bucket assigned to a recorded error.

    Each member's value is its own name as an upper-case string.
    UNKNOWN is the member used when nothing more specific applies.
    """

    CREDENTIAL = "CREDENTIAL"
    PERMISSION = "PERMISSION"
    SERVICE_UNAVAILABLE = "SERVICE_UNAVAILABLE"
    NETWORK = "NETWORK"
    THROTTLING = "THROTTLING"
    CONFIGURATION = "CONFIGURATION"
    RESOURCE_NOT_FOUND = "RESOURCE_NOT_FOUND"
    VALIDATION = "VALIDATION"
    UNKNOWN = "UNKNOWN"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class ErrorContext:
    """Describes where an error happened.

    Every field is optional: string fields default to the empty string and
    ``additional_context`` defaults to a fresh empty dict per instance.
    """

    # AWS service name and region the failing call targeted.
    service_name: str = ""
    region: str = ""
    # Resource involved in the failing call, if any.
    resource_type: str = ""
    resource_id: str = ""
    # Name of the operation that was being performed.
    operation: str = ""
    # presumably the CIS control / AWS Config rule being evaluated - TODO confirm
    control_id: str = ""
    config_rule_name: str = ""
    # Free-form extra details supplied by the caller.
    additional_context: Dict[str, Any] = field(default_factory=dict)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class ErrorRecord:
    """Detailed error record for tracking and reporting.

    When ``error_id`` is empty (or otherwise falsy) an identifier of the
    form ``ERR_<unix-seconds>_<4 digits>`` is generated after construction.
    """

    error_id: str
    timestamp: datetime
    severity: ErrorSeverity
    category: ErrorCategory
    message: str
    context: ErrorContext
    exception_type: str = ""
    stack_trace: str = ""
    recovery_attempted: bool = False
    recovery_successful: bool = False
    retry_count: int = 0
    troubleshooting_guidance: List[str] = field(default_factory=list)

    def __post_init__(self):
        """Generate error ID if not provided."""
        if self.error_id:
            return

        # Local import keeps this block self-contained.
        import zlib

        # The built-in hash() of a str is salted per interpreter run, so it
        # would give the same message a different suffix in every process.
        # CRC32 is stable across runs, which lets IDs be correlated between
        # separate executions and their logs.
        digest = zlib.crc32(str(self.message).encode("utf-8", "replace")) % 10000
        self.error_id = f"ERR_{int(time.time())}_{digest:04d}"
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class ErrorHandler:
|
|
74
|
+
"""Comprehensive error handling with graceful degradation and recovery."""
|
|
75
|
+
|
|
76
|
+
def __init__(self, max_retries: int = 3, base_delay: float = 1.0,
             enable_audit_trail: bool = True):
    """Set up retry settings, tracking state and recovery strategies.

    Args:
        max_retries: Maximum retry attempts for recoverable errors
        base_delay: Base delay used when computing retry back-off
        enable_audit_trail: Whether handled errors are kept in
            ``error_records``
    """
    # Caller-supplied settings.
    self.enable_audit_trail = enable_audit_trail
    self.base_delay = base_delay
    self.max_retries = max_retries

    # Mutable tracking state, all empty at start.
    self.retry_counts: Dict[str, int] = {}
    self.service_availability: Dict[str, Dict[str, bool]] = {}
    self.error_records: List[ErrorRecord] = []

    # Category -> handler invoked by handle_error when an operation to
    # retry is supplied. Categories not listed here have no strategy.
    strategies: Dict[ErrorCategory, Callable] = {}
    strategies[ErrorCategory.THROTTLING] = self._handle_throttling_error
    strategies[ErrorCategory.NETWORK] = self._handle_network_error
    strategies[ErrorCategory.SERVICE_UNAVAILABLE] = self._handle_service_unavailable_error
    strategies[ErrorCategory.PERMISSION] = self._handle_permission_error
    strategies[ErrorCategory.CREDENTIAL] = self._handle_credential_error
    self.recovery_strategies = strategies

    logger.info("ErrorHandler initialized with comprehensive error handling")
|
|
104
|
+
|
|
105
|
+
def handle_error(self, exception: Exception, context: ErrorContext,
                 operation: Optional[Callable] = None) -> Optional[Any]:
    """Record an error and, when possible, try to recover from it.

    The exception is classified, given a severity, turned into an
    ``ErrorRecord`` and logged. The record is appended to
    ``self.error_records`` when ``enable_audit_trail`` is set. If a recovery
    strategy is registered for the category and ``operation`` is truthy,
    the strategy is invoked so it can retry the operation.

    Args:
        exception: The exception that occurred
        context: Context information about the error
        operation: Optional zero-argument callable to retry

    Returns:
        The non-None value produced by the recovery strategy, or None when
        no recovery was attempted or the recovery did not succeed
    """
    # Classify the error
    category = self._classify_error(exception)
    severity = self._determine_severity(exception, category)

    # Create and log the error record
    error_record = self._create_error_record(
        exception, context, category, severity
    )
    self._log_error(error_record)

    # Store error record if audit trail is enabled
    if self.enable_audit_trail:
        self.error_records.append(error_record)

    # Attempt recovery if strategy exists
    if category in self.recovery_strategies and operation:
        # Flag the attempt before running the strategy. Previously the flag
        # was only set on success or on an exception, so a strategy that
        # ran and returned None was never counted as an attempt and the
        # recovery statistics were overstated.
        error_record.recovery_attempted = True
        try:
            result = self.recovery_strategies[category](
                exception, context, operation, error_record
            )
            # A None result cannot be told apart from a failed recovery,
            # so only non-None results count as success.
            if result is not None:
                error_record.recovery_successful = True
                logger.info(f"Successfully recovered from error: {error_record.error_id}")
                return result
        except Exception as recovery_error:
            logger.error(f"Recovery failed for error {error_record.error_id}: {recovery_error}")
            error_record.recovery_successful = False

    # Update service availability tracking
    self._update_service_availability(context, category)

    return None
|
|
153
|
+
|
|
154
|
+
def _classify_error(self, exception: Exception) -> ErrorCategory:
    """Classify error into appropriate category.

    Checks are applied in this order: botocore credential exceptions, the
    error code of a ``ClientError`` (matched against the tables below),
    botocore connection/timeout exceptions, then built-in
    ``ValueError``/``TypeError``/``KeyError``. A ``ClientError`` whose code
    is not in any table falls through to the later checks.

    Args:
        exception: The exception to classify

    Returns:
        ErrorCategory enum value (``UNKNOWN`` when nothing matches)
    """
    if isinstance(exception, (NoCredentialsError, PartialCredentialsError)):
        return ErrorCategory.CREDENTIAL

    if isinstance(exception, ClientError):
        error_code = exception.response.get('Error', {}).get('Code', '')

        # (codes, category) pairs; the code sets are disjoint, so the
        # order of the rows does not affect the result.
        code_table = (
            # Expired or invalid credentials reported by the service itself
            (('ExpiredToken', 'ExpiredTokenException', 'InvalidClientTokenId',
              'UnrecognizedClientException', 'InvalidAccessKeyId', 'AuthFailure'),
             ErrorCategory.CREDENTIAL),
            # Permission errors
            (('AccessDenied', 'AccessDeniedException', 'UnauthorizedOperation',
              'Forbidden'),
             ErrorCategory.PERMISSION),
            # Throttling errors
            (('Throttling', 'RequestLimitExceeded', 'TooManyRequestsException',
              'ThrottlingException', 'ProvisionedThroughputExceededException',
              'ThrottledException', 'RequestThrottledException',
              'RequestThrottled', 'SlowDown', 'EC2ThrottledException'),
             ErrorCategory.THROTTLING),
            # Service unavailable errors
            (('ServiceUnavailable', 'InternalError', 'InternalFailure'),
             ErrorCategory.SERVICE_UNAVAILABLE),
            # Resource not found errors
            (('ResourceNotFound', 'NoSuchBucket', 'NoSuchKey',
              'InvalidInstanceID.NotFound'),
             ErrorCategory.RESOURCE_NOT_FOUND),
            # Validation errors
            (('ValidationException', 'InvalidParameterValue',
              'MalformedPolicyDocument'),
             ErrorCategory.VALIDATION),
        )
        for codes, matched_category in code_table:
            if error_code in codes:
                return matched_category

    # Network errors
    if isinstance(exception, (EndpointConnectionError, ConnectTimeoutError, ReadTimeoutError)):
        return ErrorCategory.NETWORK

    # Configuration errors
    if isinstance(exception, (ValueError, TypeError, KeyError)):
        return ErrorCategory.CONFIGURATION

    return ErrorCategory.UNKNOWN
|
|
199
|
+
|
|
200
|
+
def _determine_severity(self, exception: Exception, category: ErrorCategory) -> ErrorSeverity:
    """Map an error category to a severity level.

    Only ``category`` influences the result; ``exception`` is accepted but
    not inspected.

    Args:
        exception: The exception
        category: Error category

    Returns:
        ErrorSeverity enum value; MEDIUM for any category not listed below
    """
    severity_tiers = (
        # Critical errors that prevent assessment
        ((ErrorCategory.CREDENTIAL, ErrorCategory.CONFIGURATION), ErrorSeverity.CRITICAL),
        # Permission and service issues
        ((ErrorCategory.PERMISSION, ErrorCategory.SERVICE_UNAVAILABLE), ErrorSeverity.HIGH),
        # Throttling and network issues
        ((ErrorCategory.THROTTLING, ErrorCategory.NETWORK), ErrorSeverity.MEDIUM),
        # Resource not found (expected in some cases)
        ((ErrorCategory.RESOURCE_NOT_FOUND,), ErrorSeverity.LOW),
    )

    for categories, level in severity_tiers:
        if category in categories:
            return level

    return ErrorSeverity.MEDIUM
|
|
227
|
+
|
|
228
|
+
def _create_error_record(self, exception: Exception, context: ErrorContext,
                         category: ErrorCategory, severity: ErrorSeverity) -> ErrorRecord:
    """Create detailed error record.

    The record's message is ``str(exception)``, its timestamp is
    ``datetime.now()`` and its ``error_id`` is left empty so that
    ``ErrorRecord.__post_init__`` generates one.

    Args:
        exception: The exception
        context: Error context
        category: Error category
        severity: Error severity

    Returns:
        ErrorRecord object
    """
    # Generate troubleshooting guidance
    troubleshooting_guidance = self._generate_troubleshooting_guidance(
        exception, category, context
    )

    # Format the traceback of the exception that was passed in.
    # traceback.format_exc() formats whatever exception is currently being
    # handled, which can be a different exception, or none at all
    # ("NoneType: None") when this is called outside an except block.
    stack_trace = "".join(
        traceback.format_exception(type(exception), exception, exception.__traceback__)
    )

    return ErrorRecord(
        error_id="",  # Will be generated in __post_init__
        timestamp=datetime.now(),
        severity=severity,
        category=category,
        message=str(exception),
        context=context,
        exception_type=type(exception).__name__,
        stack_trace=stack_trace,
        troubleshooting_guidance=troubleshooting_guidance
    )
|
|
257
|
+
|
|
258
|
+
def _generate_troubleshooting_guidance(self, exception: Exception,
                                       category: ErrorCategory,
                                       context: ErrorContext) -> List[str]:
    """Build the list of troubleshooting hints for an error category.

    The hints depend on ``category``; some of them interpolate fields of
    ``context``. ``exception`` is accepted but not inspected.

    Args:
        exception: The exception
        category: Error category
        context: Error context

    Returns:
        New list of troubleshooting steps; a generic list is returned for
        categories without dedicated guidance
    """
    if category == ErrorCategory.CREDENTIAL:
        return [
            "Verify AWS credentials are properly configured",
            "Check AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables",
            "Ensure IAM user/role has necessary permissions",
            "Verify credentials haven't expired (for temporary credentials)",
            "Try running 'aws sts get-caller-identity' to test credentials",
        ]

    if category == ErrorCategory.PERMISSION:
        return [
            f"Grant necessary permissions for {context.service_name} service",
            f"Check IAM policies for {context.operation} operation",
            "Review AWS Config rule required permissions documentation",
            "Consider using AWS managed policies for CIS assessments",
            f"Verify permissions in region {context.region}",
        ]

    if category == ErrorCategory.SERVICE_UNAVAILABLE:
        return [
            f"Check AWS service health dashboard for {context.service_name}",
            f"Verify {context.service_name} is available in region {context.region}",
            "Wait and retry the operation",
            "Consider assessing other regions while service recovers",
            "Check for any ongoing AWS maintenance windows",
        ]

    if category == ErrorCategory.THROTTLING:
        return [
            "Reduce API call frequency",
            "Implement exponential backoff with jitter",
            "Consider using fewer parallel workers",
            "Request service limit increase if needed",
            "Spread assessment across multiple time periods",
        ]

    if category == ErrorCategory.NETWORK:
        return [
            "Check internet connectivity",
            "Verify DNS resolution for AWS endpoints",
            "Check firewall and proxy settings",
            "Try different AWS region endpoints",
            "Verify VPC endpoints if running in private subnet",
        ]

    if category == ErrorCategory.CONFIGURATION:
        return [
            "Review configuration file syntax",
            "Validate all required parameters are provided",
            "Check for typos in configuration values",
            "Ensure configuration matches expected schema",
            "Review example configurations in documentation",
        ]

    if category == ErrorCategory.RESOURCE_NOT_FOUND:
        return [
            f"Resource {context.resource_id} may not exist in {context.region}",
            "This may be expected if no resources of this type exist",
            "Verify resource ID format is correct",
            "Check if resource exists in different region",
            "Review resource naming conventions",
        ]

    # Generic fallback for every other category.
    return [
        "Review error message for specific details",
        "Check AWS documentation for the affected service",
        "Verify all prerequisites are met",
        "Consider filing AWS support case if issue persists",
        "Check for known issues in AWS forums",
    ]
|
|
346
|
+
|
|
347
|
+
def _handle_throttling_error(self, exception: Exception, context: ErrorContext,
                             operation: Callable, error_record: ErrorRecord) -> Optional[Any]:
    """Handle throttling errors with exponential backoff.

    Retries ``operation`` until it succeeds, fails with a non-throttling
    error, or the per-key retry budget (``max_retries``) is used up. The
    wait before each attempt is ``base_delay * 2 ** retries`` plus the
    fractional part of the current time as jitter.

    Args:
        exception: The throttling exception
        context: Error context
        operation: Operation to retry
        error_record: Error record for tracking

    Returns:
        Result of successful retry or None
    """
    retry_key = f"{context.service_name}_{context.region}_{context.operation}"
    attempts = self.retry_counts.get(retry_key, 0)

    while attempts < self.max_retries:
        # Calculate delay with exponential backoff and jitter
        delay = self.base_delay * (2 ** attempts) + (time.time() % 1)

        logger.info(f"Throttling detected, retrying in {delay:.2f} seconds "
                    f"(attempt {attempts + 1}/{self.max_retries})")

        time.sleep(delay)

        # Update retry count before attempting
        attempts += 1
        self.retry_counts[retry_key] = attempts
        error_record.retry_count = attempts

        try:
            result = operation()
        except Exception as retry_exception:
            logger.warning(f"Retry failed for {retry_key}: {retry_exception}")
            # Keep retrying only while the failure is still a throttling
            # error. Using the shared classifier keeps this check in step
            # with the codes that route errors to this handler; the earlier
            # hard-coded list omitted 'ThrottlingException' and
            # 'ProvisionedThroughputExceededException', so those got a
            # single retry.
            if self._classify_error(retry_exception) != ErrorCategory.THROTTLING:
                return None
            continue

        # Reset retry count on success
        self.retry_counts[retry_key] = 0
        return result

    # NOTE(review): the counter is left at its maximum here, so later
    # throttling errors for the same key are not retried until the count is
    # reset by a success elsewhere or by clear_error_history() - confirm
    # this is intended.
    logger.warning(f"Max retries exceeded for throttling error: {retry_key}")
    return None
|
|
394
|
+
|
|
395
|
+
def _handle_network_error(self, exception: Exception, context: ErrorContext,
                          operation: Callable, error_record: ErrorRecord) -> Optional[Any]:
    """Retry an operation once after a network failure.

    The retry budget is tracked per service/region in ``retry_counts``.
    One attempt is made per call, after sleeping
    ``base_delay * 1.5 ** retries`` seconds, capped at 10 seconds.

    Args:
        exception: The network exception
        context: Error context
        operation: Operation to retry
        error_record: Error record for tracking

    Returns:
        Result of successful retry or None
    """
    key = f"network_{context.service_name}_{context.region}"
    used = self.retry_counts.get(key, 0)

    # Budget exhausted: give up without sleeping or calling the operation.
    if not used < self.max_retries:
        logger.warning(f"Max network retries exceeded: {key}")
        return None

    # Gentler growth factor than the throttling back-off, capped at 10s.
    wait_seconds = self.base_delay * (1.5 ** used)
    if wait_seconds > 10.0:
        wait_seconds = 10.0

    logger.info(f"Network error detected, retrying in {wait_seconds:.2f} seconds")
    time.sleep(wait_seconds)

    try:
        attempt_number = used + 1
        self.retry_counts[key] = attempt_number
        error_record.retry_count = attempt_number

        outcome = operation()

        # Success: the budget for this key starts over.
        self.retry_counts[key] = 0
        return outcome
    except Exception as retry_exception:
        logger.warning(f"Network retry failed: {retry_exception}")
        return None
|
|
432
|
+
|
|
433
|
+
def _handle_service_unavailable_error(self, exception: Exception, context: ErrorContext,
                                      operation: Callable, error_record: ErrorRecord) -> Optional[Any]:
    """Record that a service is unavailable in a region; no retry is made.

    Args:
        exception: The service unavailable exception
        context: Error context
        operation: Operation to retry (not called here)
        error_record: Error record for tracking (not modified here)

    Returns:
        None (graceful degradation)
    """
    # Flag the service/region pair as unavailable, creating the per-service
    # mapping on first use.
    region_flags = self.service_availability.setdefault(context.service_name, {})
    region_flags[context.region] = False

    notice = (f"Service {context.service_name} unavailable in {context.region}, "
              "continuing with other services")
    logger.warning(notice)

    return None
|
|
456
|
+
|
|
457
|
+
def _handle_permission_error(self, exception: Exception, context: ErrorContext,
                             operation: Callable, error_record: ErrorRecord) -> Optional[Any]:
    """Log a permission failure and skip the assessment; no retry is made.

    Args:
        exception: The permission exception
        context: Error context
        operation: Operation to retry (not called here)
        error_record: Error record for tracking (not modified here)

    Returns:
        None (graceful degradation)
    """
    notice = (f"Permission denied for {context.service_name} in {context.region}, "
              "skipping this assessment")
    logger.warning(notice)

    return None
|
|
474
|
+
|
|
475
|
+
def _handle_credential_error(self, exception: Exception, context: ErrorContext,
                             operation: Callable, error_record: ErrorRecord) -> Optional[Any]:
    """Log a credential failure at critical level; recovery is not possible.

    None of the arguments are inspected.

    Args:
        exception: The credential exception
        context: Error context
        operation: Operation to retry (not called here)
        error_record: Error record for tracking (not modified here)

    Returns:
        None (cannot recover from credential errors)
    """
    notice = "Credential error detected - assessment cannot continue"
    logger.critical(notice)
    return None
|
|
490
|
+
|
|
491
|
+
def _update_service_availability(self, context: ErrorContext, category: ErrorCategory):
    """Mark a service/region pair unavailable for selected categories.

    Only SERVICE_UNAVAILABLE and PERMISSION errors change the tracking
    table; every other category leaves it untouched.

    Args:
        context: Error context
        category: Error category
    """
    if category not in [ErrorCategory.SERVICE_UNAVAILABLE, ErrorCategory.PERMISSION]:
        return

    # Create the per-service mapping on first use, then flag the region.
    region_flags = self.service_availability.setdefault(context.service_name, {})
    region_flags[context.region] = False
|
|
503
|
+
|
|
504
|
+
def _log_error(self, error_record: ErrorRecord):
    """Emit one log line for the record at a level matching its severity.

    CRITICAL maps to ``logger.critical``, HIGH to ``logger.error``, MEDIUM
    to ``logger.warning`` and everything else to ``logger.info``.

    Args:
        error_record: Error record to log
    """
    text = (f"[{error_record.error_id}] {error_record.category.value}: "
            f"{error_record.message}")

    # Service (and region, when set) are appended only if a service name
    # is present.
    if error_record.context.service_name:
        location = f" (Service: {error_record.context.service_name}"
        if error_record.context.region:
            location += f", Region: {error_record.context.region}"
        text += location + ")"

    level_emitters = (
        (ErrorSeverity.CRITICAL, logger.critical),
        (ErrorSeverity.HIGH, logger.error),
        (ErrorSeverity.MEDIUM, logger.warning),
    )
    for level, emit in level_emitters:
        if error_record.severity == level:
            emit(text)
            return

    logger.info(text)
|
|
527
|
+
|
|
528
|
+
def is_service_available(self, service_name: str, region: str) -> bool:
    """Report whether a service/region pair is still considered available.

    A pair counts as available unless it has been recorded in
    ``service_availability``.

    Args:
        service_name: AWS service name
        region: AWS region

    Returns:
        The recorded flag for the pair, or True when nothing is recorded
    """
    if service_name not in self.service_availability:
        return True

    region_flags = self.service_availability[service_name]
    return region_flags.get(region, True)
|
|
539
|
+
|
|
540
|
+
def get_error_summary(self) -> Dict[str, Any]:
    """Summarize the recorded errors.

    Returns:
        Dictionary that always holds ``total_errors`` and
        ``unavailable_services`` (a shallow copy of the availability
        table). When at least one error is recorded it also holds
        per-category and per-severity counts, the number of recovery
        attempts and successes, and ``recovery_rate`` as a percentage
        (0 when no recovery was attempted).
    """
    records = self.error_records
    summary = {
        "total_errors": len(records),
        "unavailable_services": dict(self.service_availability)
    }

    if not records:
        return summary

    # Single pass: tally categories, severities and recovery flags.
    by_category = {}
    by_severity = {}
    attempted = 0
    recovered = 0
    for record in records:
        category_name = record.category.value
        severity_name = record.severity.value
        by_category[category_name] = by_category.get(category_name, 0) + 1
        by_severity[severity_name] = by_severity.get(severity_name, 0) + 1
        if record.recovery_attempted:
            attempted += 1
        if record.recovery_successful:
            recovered += 1

    # Guard against division by zero when nothing was attempted.
    if attempted > 0:
        rate = recovered / attempted * 100
    else:
        rate = 0

    summary["errors_by_category"] = by_category
    summary["errors_by_severity"] = by_severity
    summary["recovery_attempts"] = attempted
    summary["successful_recoveries"] = recovered
    summary["recovery_rate"] = rate

    return summary
|
|
576
|
+
|
|
577
|
+
def get_troubleshooting_report(self) -> List[Dict[str, Any]]:
    """Build a troubleshooting report from the recorded errors.

    Only records whose severity is CRITICAL or HIGH are included, in the
    order they were recorded.

    Returns:
        List of error details with troubleshooting guidance
    """
    reportable = [ErrorSeverity.CRITICAL, ErrorSeverity.HIGH]

    return [
        {
            "error_id": record.error_id,
            "timestamp": record.timestamp.isoformat(),
            "severity": record.severity.value,
            "category": record.category.value,
            "message": record.message,
            "service": record.context.service_name,
            "region": record.context.region,
            "troubleshooting_steps": record.troubleshooting_guidance,
            "recovery_attempted": record.recovery_attempted,
            "recovery_successful": record.recovery_successful,
        }
        for record in self.error_records
        if record.severity in reportable
    ]
|
|
601
|
+
|
|
602
|
+
def clear_error_history(self):
    """Empty the error records, availability table and retry counters.

    The containers are cleared in place, so existing references to them
    see the emptied state.
    """
    for store in (self.error_records, self.service_availability, self.retry_counts):
        store.clear()
    logger.info("Error history cleared")
|