kailash 0.9.15__py3-none-any.whl → 0.9.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +4 -3
- kailash/middleware/database/base_models.py +7 -1
- kailash/migration/__init__.py +30 -0
- kailash/migration/cli.py +340 -0
- kailash/migration/compatibility_checker.py +662 -0
- kailash/migration/configuration_validator.py +837 -0
- kailash/migration/documentation_generator.py +1828 -0
- kailash/migration/examples/__init__.py +5 -0
- kailash/migration/examples/complete_migration_example.py +692 -0
- kailash/migration/migration_assistant.py +715 -0
- kailash/migration/performance_comparator.py +760 -0
- kailash/migration/regression_detector.py +1141 -0
- kailash/migration/tests/__init__.py +6 -0
- kailash/migration/tests/test_compatibility_checker.py +403 -0
- kailash/migration/tests/test_integration.py +463 -0
- kailash/migration/tests/test_migration_assistant.py +397 -0
- kailash/migration/tests/test_performance_comparator.py +433 -0
- kailash/monitoring/__init__.py +29 -2
- kailash/monitoring/asyncsql_metrics.py +275 -0
- kailash/nodes/data/async_sql.py +1828 -33
- kailash/runtime/local.py +1255 -8
- kailash/runtime/monitoring/__init__.py +1 -0
- kailash/runtime/monitoring/runtime_monitor.py +780 -0
- kailash/runtime/resource_manager.py +3033 -0
- kailash/sdk_exceptions.py +21 -0
- kailash/workflow/cyclic_runner.py +18 -2
- {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/METADATA +1 -1
- {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/RECORD +33 -14
- {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/WHEEL +0 -0
- {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/entry_points.txt +0 -0
- {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/licenses/NOTICE +0 -0
- {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/top_level.txt +0 -0
kailash/runtime/resource_manager.py
@@ -0,0 +1,3033 @@
"""Runtime resource management and coordination.

This module provides resource coordination, connection pool management,
and runtime lifecycle management for the enhanced LocalRuntime with
persistent mode support.

Components:
- ResourceCoordinator: Cross-runtime resource coordination
- ConnectionPoolManager: Connection pool sharing and lifecycle
- RuntimeLifecycleManager: Runtime startup/shutdown coordination
"""

import asyncio
import gc
import hashlib
import logging
import random
import re
import threading
import time
import uuid
from abc import ABC, abstractmethod
from collections import defaultdict, deque
from dataclasses import dataclass, field
from datetime import UTC, datetime, timedelta
from enum import Enum
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union

import psutil

from kailash.sdk_exceptions import CircuitBreakerOpenError, ResourceLimitExceededError

logger = logging.getLogger(__name__)


class ResourceCoordinator:
    """Coordinates resources across multiple runtime instances."""

    def __init__(self, runtime_id: str, enable_coordination: bool = True):
        """Initialize resource coordinator.

        Args:
            runtime_id: Unique identifier for this runtime instance
            enable_coordination: Whether to enable cross-runtime coordination
        """
        self.runtime_id = runtime_id
        self.enable_coordination = enable_coordination

        # Resource tracking
        self._shared_resources: Dict[str, Any] = {}
        self._resource_configs: Dict[str, Dict] = {}
        self._resource_references: Dict[str, int] = defaultdict(int)
        self._registered_runtimes: Dict[str, Dict] = {}

        # Thread safety
        self._coordination_lock = threading.RLock()

        # Async operations tracking
        self._async_operations: Dict[str, asyncio.Task] = {}

        logger.info(f"ResourceCoordinator initialized for runtime {runtime_id}")

    def register_runtime(self, runtime_id: str, config: Dict[str, Any]) -> None:
        """Register a runtime instance for coordination.

        Args:
            runtime_id: Runtime instance identifier
            config: Runtime configuration for coordination
        """
        with self._coordination_lock:
            self._registered_runtimes[runtime_id] = {
                "config": config,
                "registered_at": datetime.now(UTC),
                "last_seen": datetime.now(UTC),
            }

        logger.info(f"Registered runtime {runtime_id} for coordination")

    def allocate_shared_resource(
        self, resource_type: str, resource_config: Dict[str, Any]
    ) -> str:
        """Allocate a shared resource with reference counting.

        Args:
            resource_type: Type of resource (e.g., 'connection_pool')
            resource_config: Configuration for the resource

        Returns:
            Resource ID for future reference
        """
        with self._coordination_lock:
            # Generate resource ID based on type and config
            config_str = str(sorted(resource_config.items()))
            resource_id = (
                f"{resource_type}_{hashlib.md5(config_str.encode()).hexdigest()[:8]}"
            )

            if resource_id not in self._shared_resources:
                # Create new resource
                self._shared_resources[resource_id] = {
                    "type": resource_type,
                    "config": resource_config,
                    "created_at": datetime.now(UTC),
                    "created_by": self.runtime_id,
                    "instance": None,  # To be set by specific managers
                }
                self._resource_configs[resource_id] = resource_config

            # Increment reference count
            self._resource_references[resource_id] += 1

            logger.debug(
                f"Allocated shared resource {resource_id}, refs: {self._resource_references[resource_id]}"
            )
            return resource_id

    def get_shared_resource(self, resource_id: str) -> Optional[Dict[str, Any]]:
        """Get shared resource by ID.

        Args:
            resource_id: Resource identifier

        Returns:
            Resource info or None if not found
        """
        with self._coordination_lock:
            return self._shared_resources.get(resource_id)

    def add_resource_reference(self, resource_id: str) -> None:
        """Add reference to shared resource.

        Args:
            resource_id: Resource identifier
        """
        with self._coordination_lock:
            if resource_id in self._shared_resources:
                self._resource_references[resource_id] += 1

    def remove_resource_reference(self, resource_id: str) -> None:
        """Remove reference to shared resource.

        Args:
            resource_id: Resource identifier
        """
        with self._coordination_lock:
            if resource_id in self._resource_references:
                self._resource_references[resource_id] -= 1

                # Clean up if no references
                if self._resource_references[resource_id] <= 0:
                    self._cleanup_resource(resource_id)

    def get_resource_reference_count(self, resource_id: str) -> int:
        """Get reference count for resource.

        Args:
            resource_id: Resource identifier

        Returns:
            Current reference count
        """
        with self._coordination_lock:
            return self._resource_references.get(resource_id, 0)

    def _cleanup_resource(self, resource_id: str) -> None:
        """Clean up resource when no references remain.

        Args:
            resource_id: Resource identifier
        """
        if resource_id in self._shared_resources:
            resource = self._shared_resources[resource_id]
            logger.info(
                f"Cleaning up shared resource {resource_id} (type: {resource['type']})"
            )

            # Remove from tracking
            del self._shared_resources[resource_id]
            del self._resource_references[resource_id]
            if resource_id in self._resource_configs:
                del self._resource_configs[resource_id]

    async def coordinate_async_operation(self, operation_name: str) -> None:
        """Coordinate async operation across runtimes.

        Args:
            operation_name: Name of the operation being coordinated
        """
        if not hasattr(self, "_async_operations"):
            self._async_operations = {}

        # Track operation
        self._async_operations[operation_name] = {
            "started_at": datetime.now(UTC),
            "runtime_id": self.runtime_id,
        }

    def get_coordination_status(self) -> Dict[str, Any]:
        """Get current coordination status.

        Returns:
            Status information including resources and runtimes
        """
        with self._coordination_lock:
            return {
                "runtime_id": self.runtime_id,
                "enable_coordination": self.enable_coordination,
                "shared_resources": len(self._shared_resources),
                "registered_runtimes": len(self._registered_runtimes),
                "total_references": sum(self._resource_references.values()),
            }
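
A minimal usage sketch of the reference-counting flow above (not part of the diff; the import path is assumed from the file list):

    from kailash.runtime.resource_manager import ResourceCoordinator

    coordinator = ResourceCoordinator(runtime_id="runtime-a")

    # Identical configs hash to the same resource ID via md5 of the sorted items.
    rid1 = coordinator.allocate_shared_resource("connection_pool", {"host": "db1", "pool_size": 10})
    rid2 = coordinator.allocate_shared_resource("connection_pool", {"host": "db1", "pool_size": 10})
    assert rid1 == rid2
    assert coordinator.get_resource_reference_count(rid1) == 2

    # Dropping the last reference triggers _cleanup_resource().
    coordinator.remove_resource_reference(rid1)
    coordinator.remove_resource_reference(rid1)
    assert coordinator.get_resource_reference_count(rid1) == 0
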
class ConnectionPoolManager:
    """Manages connection pools with sharing and lifecycle support."""

    def __init__(
        self,
        max_pools: int = 20,
        default_pool_size: int = 10,
        pool_timeout: int = 30,
        enable_sharing: bool = True,
        enable_health_monitoring: bool = True,
        pool_ttl: int = 3600,
    ):
        """Initialize connection pool manager.

        Args:
            max_pools: Maximum number of pools to maintain
            default_pool_size: Default size for new pools
            pool_timeout: Default timeout for pool operations
            enable_sharing: Enable pool sharing across runtimes
            enable_health_monitoring: Enable health monitoring
            pool_ttl: Time-to-live for unused pools in seconds
        """
        self.max_pools = max_pools
        self.default_pool_size = default_pool_size
        self.pool_timeout = pool_timeout
        self.enable_sharing = enable_sharing
        self.enable_health_monitoring = enable_health_monitoring
        self.pool_ttl = pool_ttl

        # Pool tracking
        self._pools: Dict[str, Any] = {}
        self._pool_configs: Dict[str, Dict] = {}
        self._pool_health: Dict[str, Dict] = {}
        self._pool_usage: Dict[str, Dict] = {}
        self._pool_runtimes: Dict[str, Set[str]] = defaultdict(set)

        # Lock for thread safety
        self._lock = threading.RLock()

        logger.info(f"ConnectionPoolManager initialized (max_pools={max_pools})")

    async def create_pool(self, pool_name: str, pool_config: Dict[str, Any]) -> Any:
        """Create a new connection pool.

        Args:
            pool_name: Name for the pool
            pool_config: Pool configuration

        Returns:
            Pool instance

        Raises:
            ResourceLimitExceededError: If max_pools limit exceeded
        """
        with self._lock:
            if len(self._pools) >= self.max_pools:
                raise ResourceLimitExceededError(
                    f"Maximum pools limit ({self.max_pools}) exceeded"
                )

            if pool_name in self._pools:
                return self._pools[pool_name]

            # Create appropriate pool based on database type
            database_type = pool_config.get("database_type", "").lower()

            if database_type == "sqlite":
                # For SQLite, create a simple connection object
                import aiosqlite

                connection_string = pool_config.get("database_url", ":memory:")
                pool = {
                    "database_type": "sqlite",
                    "connection_string": connection_string,
                    "aiosqlite": aiosqlite,
                }
            elif database_type == "postgresql":
                # Create real PostgreSQL connection pool using asyncpg
                pool = await self._create_postgresql_pool(pool_config)
            elif database_type == "mysql":
                # Create real MySQL connection pool using aiomysql
                pool = await self._create_mysql_pool(pool_config)
            else:
                # Fail fast for unsupported database types - no production mock fallbacks
                supported_types = ["postgresql", "mysql", "sqlite"]
                raise ValueError(
                    f"Unsupported database type '{database_type}'. "
                    f"Supported types: {supported_types}. "
                    f"Configuration error in pool '{pool_name}'"
                )

            self._pools[pool_name] = pool
            self._pool_configs[pool_name] = pool_config.copy()
            self._pool_usage[pool_name] = {
                "created_at": datetime.now(UTC),
                "last_used": datetime.now(UTC),
                "use_count": 0,
            }

            if self.enable_health_monitoring:
                self._pool_health[pool_name] = {
                    "status": "healthy",
                    "active_connections": 0,
                    "total_connections": pool_config.get(
                        "pool_size", self.default_pool_size
                    ),
                    "last_check": datetime.now(UTC),
                }

            logger.info(f"Created connection pool '{pool_name}'")
            return pool

    async def get_or_create_pool(
        self, pool_name: str, pool_config: Dict[str, Any]
    ) -> Any:
        """Get existing pool or create new one.

        Args:
            pool_name: Name for the pool
            pool_config: Pool configuration

        Returns:
            Pool instance
        """
        with self._lock:
            if pool_name in self._pools:
                # Update usage
                self._pool_usage[pool_name]["last_used"] = datetime.now(UTC)
                self._pool_usage[pool_name]["use_count"] += 1
                return self._pools[pool_name]

        return await self.create_pool(pool_name, pool_config)

    async def create_shared_pool(
        self, pool_name: str, pool_config: Dict[str, Any], runtime_id: str
    ) -> Any:
        """Create a shared pool for cross-runtime use.

        Args:
            pool_name: Name for the pool
            pool_config: Pool configuration
            runtime_id: Runtime requesting the pool

        Returns:
            Pool instance
        """
        if not self.enable_sharing:
            return await self.create_pool(pool_name, pool_config)

        with self._lock:
            pool = await self.get_or_create_pool(pool_name, pool_config)
            self._pool_runtimes[pool_name].add(runtime_id)

        logger.info(f"Shared pool '{pool_name}' with runtime {runtime_id}")
        return pool

    async def get_shared_pool(self, pool_name: str, runtime_id: str) -> Optional[Any]:
        """Get shared pool for runtime.

        Args:
            pool_name: Name of the pool
            runtime_id: Runtime requesting the pool

        Returns:
            Pool instance or None if not found
        """
        with self._lock:
            if pool_name in self._pools and self.enable_sharing:
                self._pool_runtimes[pool_name].add(runtime_id)
                return self._pools[pool_name]
            return None

    def get_pool_runtime_count(self, pool_name: str) -> int:
        """Get number of runtimes using a pool.

        Args:
            pool_name: Name of the pool

        Returns:
            Number of runtimes using the pool
        """
        with self._lock:
            return len(self._pool_runtimes.get(pool_name, set()))

    def get_pool_health(self, pool_name: str) -> Dict[str, Any]:
        """Get health status for a pool.

        Args:
            pool_name: Name of the pool

        Returns:
            Health status dictionary
        """
        with self._lock:
            if pool_name in self._pool_health:
                return self._pool_health[pool_name].copy()

            return {
                "status": "unknown",
                "active_connections": 0,
                "total_connections": 0,
                "last_check": None,
            }

    def is_pool_active(self, pool_name: str) -> bool:
        """Check if pool is active.

        Args:
            pool_name: Name of the pool

        Returns:
            True if pool is active
        """
        with self._lock:
            return pool_name in self._pools

    async def close_pool(self, pool_name: str) -> None:
        """Close and remove a pool with proper error handling and race condition protection.

        Args:
            pool_name: Name of the pool to close
        """
        # Get pool reference under lock but don't hold lock during async operations
        with self._lock:
            if pool_name not in self._pools:
                logger.debug(f"Pool '{pool_name}' not found for closure")
                return

            pool = self._pools[pool_name]
            # Remove from pools immediately to prevent race conditions
            del self._pools[pool_name]

        # Close pool outside lock to prevent deadlock
        close_error = None
        try:
            if isinstance(pool, RuntimeManagedPool):
                await pool._runtime_close()
            elif hasattr(pool, "close"):
                await pool.close()
            logger.info(f"Successfully closed connection pool '{pool_name}'")
        except Exception as e:
            close_error = e
            logger.error(f"Failed to close pool '{pool_name}': {e}")

        # Always clean up tracking dictionaries - even if close failed
        with self._lock:
            # Remove from all tracking structures
            self._pool_configs.pop(pool_name, None)
            self._pool_usage.pop(pool_name, None)
            self._pool_health.pop(pool_name, None)
            self._pool_runtimes.pop(pool_name, None)

        # Re-raise close error after cleanup
        if close_error:
            raise close_error

    async def cleanup_unused_pools(self) -> int:
        """Clean up unused pools past TTL.

        Returns:
            Number of pools cleaned up
        """
        cleaned_count = 0
        current_time = datetime.now(UTC)

        # Identify pools to cleanup while holding lock
        with self._lock:
            pools_to_cleanup = []

            for pool_name, usage in self._pool_usage.items():
                if (current_time - usage["last_used"]).total_seconds() > self.pool_ttl:
                    pools_to_cleanup.append(pool_name)

        # Close pools outside the lock to avoid async deadlock
        for pool_name in pools_to_cleanup:
            await self.close_pool(pool_name)
            cleaned_count += 1

        if cleaned_count > 0:
            logger.info(f"Cleaned up {cleaned_count} unused connection pools")

        return cleaned_count

    async def _create_postgresql_pool(self, pool_config: Dict[str, Any]) -> Any:
        """Create a real PostgreSQL connection pool using asyncpg."""
        try:
            import asyncpg
        except ImportError:
            raise ImportError(
                "asyncpg not installed. Install with: pip install asyncpg"
            )

        # Extract connection parameters
        connection_string = pool_config.get("connection_string") or pool_config.get(
            "database_url"
        )
        if not connection_string:
            # Build connection string from individual parameters
            host = pool_config.get("host", "localhost")
            port = pool_config.get("port", 5432)
            database = pool_config.get("database", "postgres")
            user = pool_config.get("user", "postgres")
            password = pool_config.get("password", "")
            connection_string = (
                f"postgresql://{user}:{password}@{host}:{port}/{database}"
            )

        # Extract pool size settings
        min_size = pool_config.get("min_pool_size", 1)
        max_size = pool_config.get(
            "pool_size", pool_config.get("max_pool_size", self.default_pool_size)
        )

        # Create asyncpg pool
        pool = await asyncpg.create_pool(
            connection_string, min_size=min_size, max_size=max_size, command_timeout=60
        )

        logger.info(
            f"Created PostgreSQL connection pool with {min_size}-{max_size} connections"
        )

        # Validate pool before wrapping
        if not await self._validate_pool(pool, "postgresql"):
            await pool.close()  # Clean up failed pool
            raise RuntimeError(
                f"PostgreSQL pool validation failed for connection: {connection_string}"
            )

        # Wrap pool to prevent premature closure by node-level cleanup
        return RuntimeManagedPool(pool)

    async def _create_mysql_pool(self, pool_config: Dict[str, Any]) -> Any:
        """Create a real MySQL connection pool using aiomysql."""
        try:
            import aiomysql
        except ImportError:
            raise ImportError(
                "aiomysql not installed. Install with: pip install aiomysql"
            )

        # Extract connection parameters
        host = pool_config.get("host", "localhost")
        port = pool_config.get("port", 3306)
        user = pool_config.get("user", "root")
        password = pool_config.get("password", "")
        database = pool_config.get("database", "")

        # Extract pool size settings
        minsize = pool_config.get("min_pool_size", 1)
        maxsize = pool_config.get(
            "pool_size", pool_config.get("max_pool_size", self.default_pool_size)
        )

        # Create aiomysql pool
        pool = await aiomysql.create_pool(
            host=host,
            port=port,
            user=user,
            password=password,
            db=database,
            minsize=minsize,
            maxsize=maxsize,
            autocommit=True,
        )

        logger.info(
            f"Created MySQL connection pool with {minsize}-{maxsize} connections"
        )

        # Validate pool before wrapping
        if not await self._validate_pool(pool, "mysql"):
            await pool.close()  # Clean up failed pool
            raise RuntimeError(
                f"MySQL pool validation failed for connection: {host}:{port}"
            )

        # Wrap pool to prevent premature closure by node-level cleanup
        return RuntimeManagedPool(pool)

    async def _validate_pool(self, pool: Any, database_type: str) -> bool:
        """Validate that a pool actually works before returning it.

        Args:
            pool: The database pool to validate
            database_type: Type of database (postgresql, mysql, sqlite)

        Returns:
            True if pool is functional, False otherwise
        """
        try:
            if database_type == "postgresql":
                async with pool.acquire() as conn:
                    await conn.fetchrow("SELECT 1 as test_connection")
                logger.debug("PostgreSQL pool validation successful")
            elif database_type == "mysql":
                async with pool.acquire() as conn:
                    async with conn.cursor() as cursor:
                        await cursor.execute("SELECT 1 as test_connection")
                        await cursor.fetchone()
                logger.debug("MySQL pool validation successful")
            elif database_type == "sqlite":
                # SQLite validation would be different since it uses dict format
                logger.debug("SQLite pool validation skipped (not a real pool)")
            else:
                logger.warning(f"Unknown database type for validation: {database_type}")
                return False
            return True
        except Exception as e:
            logger.error(f"Pool validation failed for {database_type}: {e}")
            return False
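
A sketch of the pool lifecycle above, using the SQLite branch so no database server is needed (not part of the diff; assumes aiosqlite is installed, since create_pool imports it):

    import asyncio

    from kailash.runtime.resource_manager import ConnectionPoolManager

    async def main():
        manager = ConnectionPoolManager(max_pools=5, pool_ttl=600)

        # First call creates the pool; the second reuses it and bumps use_count.
        pool = await manager.get_or_create_pool(
            "analytics", {"database_type": "sqlite", "database_url": ":memory:"}
        )
        same = await manager.get_or_create_pool(
            "analytics", {"database_type": "sqlite", "database_url": ":memory:"}
        )
        assert same is pool and manager.is_pool_active("analytics")

        # Sharing records which runtimes use the pool.
        await manager.create_shared_pool(
            "analytics", {"database_type": "sqlite", "database_url": ":memory:"}, "runtime-b"
        )
        assert manager.get_pool_runtime_count("analytics") == 1

        # Pools idle longer than pool_ttl are closed by the TTL sweep.
        await manager.cleanup_unused_pools()

    asyncio.run(main())
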
class RuntimeManagedPool:
    """Wrapper for database pools managed by runtime to prevent external closure."""

    def __init__(self, underlying_pool):
        """Initialize with the real pool instance."""
        self._underlying_pool = underlying_pool
        self._is_runtime_managed = True
        self._pool_type = type(underlying_pool).__name__

        # Pre-validate essential attributes exist to fail fast
        required_attrs = ["acquire"]
        for attr in required_attrs:
            if not hasattr(underlying_pool, attr):
                raise ValueError(
                    f"Invalid pool type '{self._pool_type}': missing required attribute '{attr}'. "
                    f"Pool must implement acquire() method for database operations."
                )

        logger.debug(f"Created RuntimeManagedPool wrapping {self._pool_type}")

    def __getattr__(self, name):
        """Delegate all attributes to the underlying pool except close()."""
        if name == "close":
            # Prevent external closure - only runtime can close
            return self._no_close
        try:
            return getattr(self._underlying_pool, name)
        except AttributeError as e:
            # Provide clearer error messages for debugging
            raise AttributeError(
                f"RuntimeManagedPool({self._pool_type}): {e}. "
                f"The underlying {self._pool_type} pool does not support attribute '{name}'"
            ) from e

    async def _no_close(self):
        """No-op close method to prevent external closure."""
        logger.debug(f"Ignored attempt to close runtime-managed {self._pool_type} pool")
        pass

    async def _runtime_close(self):
        """Internal method for runtime to actually close the pool."""
        try:
            if hasattr(self._underlying_pool, "close"):
                await self._underlying_pool.close()
                logger.debug(f"Successfully closed underlying {self._pool_type} pool")
            else:
                logger.warning(
                    f"Underlying {self._pool_type} pool has no close() method"
                )
        except Exception as e:
            logger.error(f"Error closing underlying {self._pool_type} pool: {e}")
            raise
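
The wrapper's one behavioral change is that close() becomes a no-op for callers, so node-level cleanup cannot tear down a pool the runtime still owns; only _runtime_close() reaches the real close(). A small demonstration against a hypothetical stand-in pool (not part of the diff):

    import asyncio

    from kailash.runtime.resource_manager import RuntimeManagedPool

    class FakePool:
        """Stand-in exposing the acquire() attribute the wrapper requires."""

        def __init__(self):
            self.closed = False

        def acquire(self):
            ...

        async def close(self):
            self.closed = True

    async def main():
        inner = FakePool()
        managed = RuntimeManagedPool(inner)

        await managed.close()  # __getattr__ routes this to _no_close()
        assert inner.closed is False

        await managed._runtime_close()  # runtime-only path actually closes
        assert inner.closed is True

    asyncio.run(main())
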
class MockConnectionPool:
    """Mock connection pool for testing."""

    def __init__(self, config: Dict[str, Any], pool_size: int):
        self.config = config
        self.pool_size = pool_size
        self.created_at = datetime.now(UTC)

    async def close(self):
        """Close the mock pool."""
        pass


class RuntimeLifecycleManager:
    """Manages runtime lifecycle operations."""

    def __init__(self, runtime_id: str):
        """Initialize runtime lifecycle manager.

        Args:
            runtime_id: Unique runtime identifier
        """
        self.runtime_id = runtime_id
        self._is_started = False
        self._shutdown_hooks: List[Callable] = []
        self._startup_hooks: List[Callable] = []

    async def startup(self) -> None:
        """Execute startup sequence."""
        if self._is_started:
            return

        logger.info(f"Starting runtime lifecycle for {self.runtime_id}")

        # Execute startup hooks
        for hook in self._startup_hooks:
            try:
                if asyncio.iscoroutinefunction(hook):
                    await hook()
                else:
                    hook()
            except Exception as e:
                logger.error(f"Startup hook failed: {e}")

        self._is_started = True

    async def shutdown(self, timeout: int = 30) -> None:
        """Execute shutdown sequence.

        Args:
            timeout: Maximum time to wait for shutdown
        """
        if not self._is_started:
            return

        logger.info(f"Shutting down runtime lifecycle for {self.runtime_id}")

        # Execute shutdown hooks with timeout
        try:
            await asyncio.wait_for(self._execute_shutdown_hooks(), timeout=timeout)
        except asyncio.TimeoutError:
            logger.warning(
                f"Shutdown timeout after {timeout}s for runtime {self.runtime_id}"
            )

        self._is_started = False

    async def _execute_shutdown_hooks(self) -> None:
        """Execute all shutdown hooks."""
        for hook in reversed(self._shutdown_hooks):  # Reverse order for cleanup
            try:
                if asyncio.iscoroutinefunction(hook):
                    await hook()
                else:
                    hook()
            except Exception as e:
                logger.error(f"Shutdown hook failed: {e}")

    def add_startup_hook(self, hook: Callable) -> None:
        """Add startup hook.

        Args:
            hook: Function to call during startup
        """
        self._startup_hooks.append(hook)

    def add_shutdown_hook(self, hook: Callable) -> None:
        """Add shutdown hook.

        Args:
            hook: Function to call during shutdown
        """
        self._shutdown_hooks.append(hook)

    @property
    def is_started(self) -> bool:
        """Check if runtime is started."""
        return self._is_started
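
Hooks run in registration order at startup and in reverse order at shutdown, and the iscoroutinefunction checks above let sync and async callables mix freely. A sketch (not part of the diff):

    import asyncio

    from kailash.runtime.resource_manager import RuntimeLifecycleManager

    async def open_resources():
        print("startup: opening resources")

    def flush_logs():
        print("shutdown: flushing logs")

    async def main():
        lifecycle = RuntimeLifecycleManager(runtime_id="runtime-a")
        lifecycle.add_startup_hook(open_resources)  # async hook
        lifecycle.add_shutdown_hook(flush_logs)  # sync hook

        await lifecycle.startup()
        assert lifecycle.is_started

        await lifecycle.shutdown(timeout=10)
        assert not lifecycle.is_started

    asyncio.run(main())
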
class CircuitBreakerState(Enum):
    """Circuit breaker states."""

    CLOSED = "closed"  # Normal operation
    OPEN = "open"  # Failing, blocking requests
    HALF_OPEN = "half_open"  # Testing if service recovered


class CircuitBreaker:
    """Circuit breaker pattern for resilience and fault tolerance.

    Prevents cascading failures by temporarily blocking requests to failing services.
    """

    def __init__(
        self,
        name: str,
        failure_threshold: int = 5,
        timeout_seconds: int = 60,
        expected_exception: type = Exception,
        recovery_threshold: int = 3,
    ):
        """Initialize circuit breaker.

        Args:
            name: Name of the circuit breaker for logging
            failure_threshold: Number of failures before opening circuit
            timeout_seconds: Time to wait before attempting recovery
            expected_exception: Exception type that triggers the circuit breaker
            recovery_threshold: Number of successes needed to close circuit from half-open
        """
        self.name = name
        self.failure_threshold = failure_threshold
        self.timeout_seconds = timeout_seconds
        self.expected_exception = expected_exception
        self.recovery_threshold = recovery_threshold

        self._state = CircuitBreakerState.CLOSED
        self._failure_count = 0
        self._last_failure_time: Optional[float] = None
        self._success_count = 0

        # Thread safety
        self._lock = threading.RLock()

        logger.info(f"Circuit breaker '{name}' initialized")

    async def call(self, func: Callable, *args, **kwargs) -> Any:
        """Call function with circuit breaker protection.

        Args:
            func: Function to call (sync or async)
            *args: Function arguments
            **kwargs: Function keyword arguments

        Returns:
            Function result

        Raises:
            CircuitBreakerOpenError: If circuit is open
        """
        with self._lock:
            if self._state == CircuitBreakerState.OPEN:
                if self._should_attempt_reset():
                    self._state = CircuitBreakerState.HALF_OPEN
                    logger.info(f"Circuit breaker '{self.name}' moved to HALF_OPEN")
                else:
                    raise CircuitBreakerOpenError(
                        f"Circuit breaker '{self.name}' is OPEN"
                    )

        try:
            # Call function (handle both sync and async)
            if asyncio.iscoroutinefunction(func):
                result = await func(*args, **kwargs)
            else:
                result = func(*args, **kwargs)

            # Success - update state
            self._on_success()
            return result

        except self.expected_exception as e:
            self._on_failure()
            raise e

    def _should_attempt_reset(self) -> bool:
        """Check if circuit should attempt reset."""
        if self._last_failure_time is None:
            return False
        return (time.time() - self._last_failure_time) >= self.timeout_seconds

    def _on_success(self) -> None:
        """Handle successful call."""
        with self._lock:
            if self._state == CircuitBreakerState.HALF_OPEN:
                self._success_count += 1
                if self._success_count >= self.recovery_threshold:
                    self._reset()
                    logger.info(f"Circuit breaker '{self.name}' CLOSED after recovery")
            elif self._state == CircuitBreakerState.CLOSED:
                self._reset()  # Reset failure count on success

    def _on_failure(self) -> None:
        """Handle failed call."""
        with self._lock:
            self._failure_count += 1
            self._last_failure_time = time.time()

            if self._failure_count >= self.failure_threshold:
                self._state = CircuitBreakerState.OPEN
                logger.warning(
                    f"Circuit breaker '{self.name}' OPENED after {self._failure_count} failures"
                )

    def _reset(self) -> None:
        """Reset circuit breaker to closed state."""
        self._state = CircuitBreakerState.CLOSED
        self._failure_count = 0
        self._success_count = 0
        self._last_failure_time = None

    def get_state(self) -> Dict[str, Any]:
        """Get current circuit breaker state.

        Returns:
            State information dictionary
        """
        with self._lock:
            return {
                "name": self.name,
                "state": self._state.value,
                "failure_count": self._failure_count,
                "success_count": self._success_count,
                "last_failure_time": self._last_failure_time,
                "failure_threshold": self.failure_threshold,
                "timeout_seconds": self.timeout_seconds,
            }

    def force_open(self) -> None:
        """Force circuit breaker to open state."""
        with self._lock:
            self._state = CircuitBreakerState.OPEN
            self._failure_count = self.failure_threshold
            self._last_failure_time = time.time()
            logger.warning(f"Circuit breaker '{self.name}' forced OPEN")

    def force_close(self) -> None:
        """Force circuit breaker to closed state."""
        with self._lock:
            self._reset()
            logger.info(f"Circuit breaker '{self.name}' forced CLOSED")


# CircuitBreakerOpenError now imported from sdk_exceptions
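
Exercising the state machine above: failure_threshold consecutive failures open the circuit, after which calls fail fast with CircuitBreakerOpenError until timeout_seconds pass and a HALF_OPEN probe is allowed. A sketch with a hypothetical flaky callable (not part of the diff):

    import asyncio

    from kailash.runtime.resource_manager import CircuitBreaker
    from kailash.sdk_exceptions import CircuitBreakerOpenError

    async def flaky():
        raise ConnectionError("backend unavailable")

    async def main():
        breaker = CircuitBreaker("db", failure_threshold=2, timeout_seconds=30)

        for _ in range(2):  # reach the failure threshold
            try:
                await breaker.call(flaky)
            except ConnectionError:
                pass

        assert breaker.get_state()["state"] == "open"

        try:
            await breaker.call(flaky)  # rejected without touching the backend
        except CircuitBreakerOpenError as exc:
            print(exc)

    asyncio.run(main())
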
class RetryPolicy:
    """Retry policy with exponential backoff and jitter.

    Provides configurable retry behavior for transient failures.
    """

    def __init__(
        self,
        max_attempts: int = 3,
        base_delay: float = 1.0,
        max_delay: float = 60.0,
        exponential_base: float = 2.0,
        jitter: bool = True,
        retriable_exceptions: tuple = (Exception,),
    ):
        """Initialize retry policy.

        Args:
            max_attempts: Maximum number of attempts
            base_delay: Base delay in seconds
            max_delay: Maximum delay in seconds
            exponential_base: Base for exponential backoff
            jitter: Whether to add jitter to delays
            retriable_exceptions: Exception types that should trigger retry
        """
        self.max_attempts = max_attempts
        self.base_delay = base_delay
        self.max_delay = max_delay
        self.exponential_base = exponential_base
        self.jitter = jitter
        self.retriable_exceptions = retriable_exceptions

        logger.info(f"RetryPolicy initialized (max_attempts={max_attempts})")

    async def call(self, func: Callable, *args, **kwargs) -> Any:
        """Call function with retry policy.

        Args:
            func: Function to call (sync or async)
            *args: Function arguments
            **kwargs: Function keyword arguments

        Returns:
            Function result

        Raises:
            Last exception if all retries fail
        """
        last_exception = None

        for attempt in range(1, self.max_attempts + 1):
            try:
                # Call function (handle both sync and async)
                if asyncio.iscoroutinefunction(func):
                    result = await func(*args, **kwargs)
                else:
                    result = func(*args, **kwargs)

                if attempt > 1:
                    logger.info(f"Retry succeeded on attempt {attempt}")

                return result

            except self.retriable_exceptions as e:
                last_exception = e

                if attempt < self.max_attempts:
                    delay = self._calculate_delay(attempt)
                    logger.warning(
                        f"Attempt {attempt} failed, retrying in {delay:.2f}s: {e}"
                    )
                    await asyncio.sleep(delay)
                else:
                    logger.error(f"All {self.max_attempts} attempts failed")

        raise last_exception

    def _calculate_delay(self, attempt: int) -> float:
        """Calculate delay for given attempt.

        Args:
            attempt: Current attempt number (1-based)

        Returns:
            Delay in seconds
        """
        delay = self.base_delay * (self.exponential_base ** (attempt - 1))
        delay = min(delay, self.max_delay)

        if self.jitter:
            import random

            # Add up to 25% jitter
            jitter_amount = delay * 0.25 * random.random()
            delay += jitter_amount

        return delay
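
With the defaults above, _calculate_delay produces 1s, 2s, 4s, ... capped at max_delay, plus up to 25% jitter on top. A sketch of the policy recovering from a transient failure (not part of the diff):

    import asyncio

    from kailash.runtime.resource_manager import RetryPolicy

    attempts = {"n": 0}

    async def transient():
        attempts["n"] += 1
        if attempts["n"] < 3:
            raise TimeoutError("temporary outage")
        return "ok"

    async def main():
        policy = RetryPolicy(
            max_attempts=5,
            base_delay=0.1,  # keep the demo fast
            retriable_exceptions=(TimeoutError,),
        )
        result = await policy.call(transient)
        print(result, "after", attempts["n"], "attempts")  # ok after 3 attempts

    asyncio.run(main())
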
# Resource Limit Enforcement Components
# Note: gc and psutil are imported at the top of the file


class EnforcementPolicy(Enum):
    """Resource limit enforcement policies."""

    STRICT = "strict"  # Immediately reject when limits exceeded
    WARN = "warn"  # Log warnings but allow execution
    ADAPTIVE = "adaptive"  # Graceful degradation based on resource pressure


class DegradationStrategy(Enum):
    """Resource degradation strategies when limits are exceeded."""

    QUEUE = "queue"  # Queue requests when resources exhausted
    REJECT = "reject"  # Immediately reject when resources exhausted
    DEFER = "defer"  # Delay execution when resources exhausted


@dataclass
class ResourceCheckResult:
    """Result of resource limit check."""

    can_proceed: bool
    resource_type: str
    current_usage: float
    limit: float
    usage_percentage: float
    message: str


@dataclass
class ResourceMetrics:
    """Comprehensive resource usage metrics."""

    timestamp: datetime
    memory_usage_mb: float
    memory_usage_percent: float
    cpu_usage_percent: float
    active_connections: int
    peak_memory_mb: float
    peak_cpu_percent: float


class MemoryLimitExceededError(ResourceLimitExceededError):
    """Memory limit exceeded error."""

    def __init__(self, current_mb: float, limit_mb: float):
        super().__init__(
            f"Memory limit exceeded: {current_mb:.1f}MB > {limit_mb:.1f}MB"
        )
        self.current_mb = current_mb
        self.limit_mb = limit_mb


class ConnectionLimitExceededError(ResourceLimitExceededError):
    """Connection limit exceeded error."""

    def __init__(self, current_connections: int, max_connections: int):
        super().__init__(
            f"Connection limit exceeded: {current_connections} > {max_connections}"
        )
        self.current_connections = current_connections
        self.max_connections = max_connections


class CPULimitExceededError(ResourceLimitExceededError):
    """CPU limit exceeded error."""

    def __init__(self, current_percent: float, limit_percent: float):
        super().__init__(
            f"CPU limit exceeded: {current_percent:.1f}% > {limit_percent:.1f}%"
        )
        self.current_percent = current_percent
        self.limit_percent = limit_percent


class ResourceLimitEnforcer:
|
1116
|
+
"""Comprehensive resource limit enforcement for LocalRuntime.
|
1117
|
+
|
1118
|
+
Provides memory, connection, and CPU limit enforcement with configurable
|
1119
|
+
policies and graceful degradation strategies. Thread-safe for concurrent
|
1120
|
+
workflow execution.
|
1121
|
+
|
1122
|
+
Features:
|
1123
|
+
- Memory limit enforcement with real-time monitoring
|
1124
|
+
- Connection pool limit enforcement
|
1125
|
+
- CPU usage monitoring and throttling
|
1126
|
+
- Configurable enforcement policies (strict, warn, adaptive)
|
1127
|
+
- Graceful degradation strategies (queue, reject, defer)
|
1128
|
+
- Thread-safe operations
|
1129
|
+
- Real-time metrics and alerting
|
1130
|
+
"""
|
1131
|
+
|
1132
|
+
def __init__(
|
1133
|
+
self,
|
1134
|
+
max_memory_mb: Optional[int] = None,
|
1135
|
+
max_connections: Optional[int] = None,
|
1136
|
+
max_cpu_percent: Optional[float] = None,
|
1137
|
+
enforcement_policy: Union[str, EnforcementPolicy] = EnforcementPolicy.ADAPTIVE,
|
1138
|
+
degradation_strategy: Union[
|
1139
|
+
str, DegradationStrategy
|
1140
|
+
] = DegradationStrategy.DEFER,
|
1141
|
+
monitoring_interval: float = 1.0,
|
1142
|
+
enable_alerts: bool = True,
|
1143
|
+
memory_alert_threshold: float = 0.8,
|
1144
|
+
cpu_alert_threshold: float = 0.7,
|
1145
|
+
connection_alert_threshold: float = 0.9,
|
1146
|
+
enable_metrics_history: bool = True,
|
1147
|
+
metrics_history_size: int = 1000,
|
1148
|
+
):
|
1149
|
+
"""Initialize ResourceLimitEnforcer.
|
1150
|
+
|
1151
|
+
Args:
|
1152
|
+
max_memory_mb: Maximum memory usage in MB (None = no limit)
|
1153
|
+
max_connections: Maximum concurrent connections (None = no limit)
|
1154
|
+
max_cpu_percent: Maximum CPU usage percentage (None = no limit)
|
1155
|
+
enforcement_policy: How to enforce limits (strict/warn/adaptive)
|
1156
|
+
degradation_strategy: How to handle resource exhaustion
|
1157
|
+
monitoring_interval: Resource monitoring interval in seconds
|
1158
|
+
enable_alerts: Enable resource usage alerts
|
1159
|
+
memory_alert_threshold: Memory alert threshold (0.0-1.0)
|
1160
|
+
cpu_alert_threshold: CPU alert threshold (0.0-1.0)
|
1161
|
+
connection_alert_threshold: Connection alert threshold (0.0-1.0)
|
1162
|
+
enable_metrics_history: Enable metrics history tracking
|
1163
|
+
metrics_history_size: Maximum metrics history entries
|
1164
|
+
"""
|
1165
|
+
# Validate parameters
|
1166
|
+
if max_memory_mb is not None and max_memory_mb <= 0:
|
1167
|
+
raise ValueError("max_memory_mb must be positive")
|
1168
|
+
if max_connections is not None and max_connections <= 0:
|
1169
|
+
raise ValueError("max_connections must be positive")
|
1170
|
+
if max_cpu_percent is not None and (
|
1171
|
+
max_cpu_percent <= 0 or max_cpu_percent > 100
|
1172
|
+
):
|
1173
|
+
raise ValueError("max_cpu_percent must be between 0 and 100")
|
1174
|
+
if monitoring_interval <= 0:
|
1175
|
+
raise ValueError("monitoring_interval must be positive")
|
1176
|
+
|
1177
|
+
self.max_memory_mb = max_memory_mb
|
1178
|
+
self.max_connections = max_connections
|
1179
|
+
self.max_cpu_percent = max_cpu_percent
|
1180
|
+
|
1181
|
+
# Convert string policies to enums
|
1182
|
+
if isinstance(enforcement_policy, str):
|
1183
|
+
enforcement_policy = EnforcementPolicy(enforcement_policy)
|
1184
|
+
if isinstance(degradation_strategy, str):
|
1185
|
+
degradation_strategy = DegradationStrategy(degradation_strategy)
|
1186
|
+
|
1187
|
+
self.enforcement_policy = enforcement_policy
|
1188
|
+
self.degradation_strategy = degradation_strategy
|
1189
|
+
self.monitoring_interval = monitoring_interval
|
1190
|
+
self.enable_alerts = enable_alerts
|
1191
|
+
|
1192
|
+
# Alert thresholds
|
1193
|
+
self.memory_alert_threshold = memory_alert_threshold
|
1194
|
+
self.cpu_alert_threshold = cpu_alert_threshold
|
1195
|
+
self.connection_alert_threshold = connection_alert_threshold
|
1196
|
+
|
1197
|
+
# Metrics and history
|
1198
|
+
self.enable_metrics_history = enable_metrics_history
|
1199
|
+
self.metrics_history_size = metrics_history_size
|
1200
|
+
self.metrics_history: deque = deque(maxlen=metrics_history_size)
|
1201
|
+
|
1202
|
+
# Resource tracking
|
1203
|
+
self.active_connections: Set[str] = set()
|
1204
|
+
self.connection_queue: deque = deque()
|
1205
|
+
self.peak_memory_mb = 0.0
|
1206
|
+
self.peak_cpu_percent = 0.0
|
1207
|
+
|
1208
|
+
# Thread safety
|
1209
|
+
self._lock = threading.RLock()
|
1210
|
+
self._monitoring_task: Optional[asyncio.Task] = None
|
1211
|
+
self._is_monitoring = False
|
1212
|
+
|
1213
|
+
# Performance tracking
|
1214
|
+
self.enforcement_start_time = time.time()
|
1215
|
+
|
1216
|
+
logger.info(
|
1217
|
+
f"ResourceLimitEnforcer initialized: "
|
1218
|
+
f"memory={max_memory_mb}MB, connections={max_connections}, "
|
1219
|
+
f"cpu={max_cpu_percent}%, policy={enforcement_policy.value}"
|
1220
|
+
)
|
1221
|
+
|
1222
|
+
def check_memory_limits(self) -> ResourceCheckResult:
|
1223
|
+
"""Check if current memory usage is within limits.
|
1224
|
+
|
1225
|
+
Returns:
|
1226
|
+
ResourceCheckResult indicating if execution can proceed
|
1227
|
+
"""
|
1228
|
+
if self.max_memory_mb is None:
|
1229
|
+
return ResourceCheckResult(
|
1230
|
+
can_proceed=True,
|
1231
|
+
resource_type="memory",
|
1232
|
+
current_usage=0,
|
1233
|
+
limit=0,
|
1234
|
+
usage_percentage=0,
|
1235
|
+
message="No memory limit configured",
|
1236
|
+
)
|
1237
|
+
|
1238
|
+
# Get current memory usage
|
1239
|
+
# Get current process memory usage, not system-wide
|
1240
|
+
process = psutil.Process()
|
1241
|
+
memory_info = process.memory_info()
|
1242
|
+
current_mb = memory_info.rss / (1024 * 1024) # RSS is resident set size
|
1243
|
+
usage_percentage = current_mb / self.max_memory_mb
|
1244
|
+
|
1245
|
+
# Update peak tracking
|
1246
|
+
with self._lock:
|
1247
|
+
self.peak_memory_mb = max(self.peak_memory_mb, current_mb)
|
1248
|
+
|
1249
|
+
# Check if over limit
|
1250
|
+
if current_mb > self.max_memory_mb:
|
1251
|
+
return ResourceCheckResult(
|
1252
|
+
can_proceed=False,
|
1253
|
+
resource_type="memory",
|
1254
|
+
current_usage=current_mb,
|
1255
|
+
limit=self.max_memory_mb,
|
1256
|
+
usage_percentage=usage_percentage,
|
1257
|
+
message=f"Memory usage {current_mb:.1f}MB exceeds limit {self.max_memory_mb}MB",
|
1258
|
+
)
|
1259
|
+
|
1260
|
+
# Check alert threshold
|
1261
|
+
if self.enable_alerts and usage_percentage > self.memory_alert_threshold:
|
1262
|
+
logger.warning(
|
1263
|
+
f"Memory usage alert: {current_mb:.1f}MB ({usage_percentage:.1%}) "
|
1264
|
+
f"exceeds threshold {self.memory_alert_threshold:.1%}"
|
1265
|
+
)
|
1266
|
+
|
1267
|
+
return ResourceCheckResult(
|
1268
|
+
can_proceed=True,
|
1269
|
+
resource_type="memory",
|
1270
|
+
current_usage=current_mb,
|
1271
|
+
limit=self.max_memory_mb,
|
1272
|
+
usage_percentage=usage_percentage,
|
1273
|
+
message=f"Memory usage {current_mb:.1f}MB within limit",
|
1274
|
+
)
|
1275
|
+
|
1276
|
+
def check_cpu_limits(self) -> ResourceCheckResult:
|
1277
|
+
"""Check if current CPU usage is within limits.
|
1278
|
+
|
1279
|
+
Returns:
|
1280
|
+
ResourceCheckResult indicating if execution can proceed
|
1281
|
+
"""
|
1282
|
+
if self.max_cpu_percent is None:
|
1283
|
+
return ResourceCheckResult(
|
1284
|
+
can_proceed=True,
|
1285
|
+
resource_type="cpu",
|
1286
|
+
current_usage=0,
|
1287
|
+
limit=0,
|
1288
|
+
usage_percentage=0,
|
1289
|
+
message="No CPU limit configured",
|
1290
|
+
)
|
1291
|
+
|
1292
|
+
# Get current CPU usage
|
1293
|
+
cpu_percent = psutil.cpu_percent(interval=0.1)
|
1294
|
+
usage_percentage = cpu_percent / self.max_cpu_percent
|
1295
|
+
|
1296
|
+
# Update peak tracking
|
1297
|
+
with self._lock:
|
1298
|
+
self.peak_cpu_percent = max(self.peak_cpu_percent, cpu_percent)
|
1299
|
+
|
1300
|
+
# Check if over limit
|
1301
|
+
if cpu_percent > self.max_cpu_percent:
|
1302
|
+
return ResourceCheckResult(
|
1303
|
+
can_proceed=False,
|
1304
|
+
resource_type="cpu",
|
1305
|
+
current_usage=cpu_percent,
|
1306
|
+
limit=self.max_cpu_percent,
|
1307
|
+
usage_percentage=usage_percentage,
|
1308
|
+
message=f"CPU usage {cpu_percent:.1f}% exceeds limit {self.max_cpu_percent:.1f}%",
|
1309
|
+
)
|
1310
|
+
|
1311
|
+
# Check alert threshold
|
1312
|
+
if self.enable_alerts and usage_percentage > self.cpu_alert_threshold:
|
1313
|
+
logger.warning(
|
1314
|
+
f"CPU usage alert: {cpu_percent:.1f}% "
|
1315
|
+
f"exceeds threshold {self.cpu_alert_threshold:.1%}"
|
1316
|
+
)
|
1317
|
+
|
1318
|
+
return ResourceCheckResult(
|
1319
|
+
can_proceed=True,
|
1320
|
+
resource_type="cpu",
|
1321
|
+
current_usage=cpu_percent,
|
1322
|
+
limit=self.max_cpu_percent,
|
1323
|
+
usage_percentage=usage_percentage,
|
1324
|
+
message=f"CPU usage {cpu_percent:.1f}% within limit",
|
1325
|
+
)
|
1326
|
+
|
1327
|
+
def request_connection(self, connection_id: str) -> Dict[str, Any]:
|
1328
|
+
"""Request a new connection within limits.
|
1329
|
+
|
1330
|
+
Args:
|
1331
|
+
connection_id: Unique identifier for the connection
|
1332
|
+
|
1333
|
+
Returns:
|
1334
|
+
Dict with granted status and connection info
|
1335
|
+
|
1336
|
+
Raises:
|
1337
|
+
ConnectionLimitExceededError: If connection limit exceeded
|
1338
|
+
"""
|
1339
|
+
with self._lock:
|
1340
|
+
current_count = len(self.active_connections)
|
1341
|
+
|
1342
|
+
if self.max_connections is None:
|
1343
|
+
self.active_connections.add(connection_id)
|
1344
|
+
return {
|
1345
|
+
"granted": True,
|
1346
|
+
"connection_id": connection_id,
|
1347
|
+
"active_count": len(self.active_connections),
|
1348
|
+
}
|
1349
|
+
|
1350
|
+
# Check if over limit
|
1351
|
+
if current_count >= self.max_connections:
|
1352
|
+
if self.enforcement_policy == EnforcementPolicy.STRICT:
|
1353
|
+
raise ConnectionLimitExceededError(
|
1354
|
+
current_count, self.max_connections
|
1355
|
+
)
|
1356
|
+
elif self.enforcement_policy == EnforcementPolicy.WARN:
|
1357
|
+
logger.warning(
|
1358
|
+
f"Connection limit warning: {current_count} >= {self.max_connections}"
|
1359
|
+
)
|
1360
|
+
self.active_connections.add(connection_id)
|
1361
|
+
return {
|
1362
|
+
"granted": True,
|
1363
|
+
"connection_id": connection_id,
|
1364
|
+
"active_count": len(self.active_connections),
|
1365
|
+
"warning": "Connection limit exceeded but allowed by policy",
|
1366
|
+
}
|
1367
|
+
elif self.enforcement_policy == EnforcementPolicy.ADAPTIVE:
|
1368
|
+
# Handle based on degradation strategy
|
1369
|
+
if self.degradation_strategy == DegradationStrategy.QUEUE:
|
1370
|
+
self.connection_queue.append(connection_id)
|
1371
|
+
return {
|
1372
|
+
"granted": False,
|
1373
|
+
"connection_id": connection_id,
|
1374
|
+
"queued": True,
|
1375
|
+
"queue_position": len(self.connection_queue),
|
1376
|
+
}
|
1377
|
+
elif self.degradation_strategy == DegradationStrategy.REJECT:
|
1378
|
+
raise ConnectionLimitExceededError(
|
1379
|
+
current_count, self.max_connections
|
1380
|
+
)
|
1381
|
+
elif self.degradation_strategy == DegradationStrategy.DEFER:
|
1382
|
+
# Return deferred status - caller should retry later
|
1383
|
+
return {
|
1384
|
+
"granted": False,
|
1385
|
+
"connection_id": connection_id,
|
1386
|
+
"deferred": True,
|
1387
|
+
"retry_after": self.monitoring_interval,
|
1388
|
+
}
|
1389
|
+
|
1390
|
+
# Check alert threshold
|
1391
|
+
usage_percentage = current_count / self.max_connections
|
1392
|
+
if (
|
1393
|
+
self.enable_alerts
|
1394
|
+
and usage_percentage > self.connection_alert_threshold
|
1395
|
+
):
|
1396
|
+
logger.warning(
|
1397
|
+
f"Connection usage alert: {current_count}/{self.max_connections} "
|
1398
|
+
f"({usage_percentage:.1%}) exceeds threshold {self.connection_alert_threshold:.1%}"
|
1399
|
+
)
|
1400
|
+
|
1401
|
+
# Grant connection
|
1402
|
+
self.active_connections.add(connection_id)
|
1403
|
+
return {
|
1404
|
+
"granted": True,
|
1405
|
+
"connection_id": connection_id,
|
1406
|
+
"active_count": len(self.active_connections),
|
1407
|
+
}
|
1408
|
+
|
1409
|
+
def release_connection(self, connection_id: str) -> None:
|
1410
|
+
"""Release a connection and process any queued requests.
|
1411
|
+
|
1412
|
+
Args:
|
1413
|
+
connection_id: Connection to release
|
1414
|
+
"""
|
1415
|
+
with self._lock:
|
1416
|
+
if connection_id in self.active_connections:
|
1417
|
+
self.active_connections.remove(connection_id)
|
1418
|
+
|
1419
|
+
# Process queued connections if using queue strategy
|
1420
|
+
if (
|
1421
|
+
self.connection_queue
|
1422
|
+
and self.degradation_strategy == DegradationStrategy.QUEUE
|
1423
|
+
):
|
1424
|
+
next_connection_id = self.connection_queue.popleft()
|
1425
|
+
self.active_connections.add(next_connection_id)
|
1426
|
+
logger.info(f"Processed queued connection: {next_connection_id}")
|
1427
|
+
|
1428
|
+
def get_active_connection_count(self) -> int:
|
1429
|
+
"""Get current active connection count.
|
1430
|
+
|
1431
|
+
Returns:
|
1432
|
+
Number of active connections
|
1433
|
+
"""
|
1434
|
+
with self._lock:
|
1435
|
+
return len(self.active_connections)
|
1436
|
+
|
1437
|
+
def check_all_limits(self) -> Dict[str, ResourceCheckResult]:
|
1438
|
+
"""Check all configured resource limits.
|
1439
|
+
|
1440
|
+
Returns:
|
1441
|
+
Dict mapping resource types to check results
|
1442
|
+
"""
|
1443
|
+
results = {}
|
1444
|
+
|
1445
|
+
# Check memory limits
|
1446
|
+
results["memory"] = self.check_memory_limits()
|
1447
|
+
|
1448
|
+
# Check CPU limits
|
1449
|
+
results["cpu"] = self.check_cpu_limits()
|
1450
|
+
|
1451
|
+
# Check connection limits
|
1452
|
+
with self._lock:
|
1453
|
+
current_connections = len(self.active_connections)
|
1454
|
+
|
1455
|
+
if self.max_connections is not None:
|
1456
|
+
usage_percentage = current_connections / self.max_connections
|
1457
|
+
can_proceed = current_connections < self.max_connections
|
1458
|
+
|
1459
|
+
if not can_proceed and self.enforcement_policy == EnforcementPolicy.WARN:
|
1460
|
+
can_proceed = True
|
1461
|
+
|
1462
|
+
results["connections"] = ResourceCheckResult(
|
1463
|
+
can_proceed=can_proceed,
|
1464
|
+
resource_type="connections",
|
1465
|
+
current_usage=current_connections,
|
1466
|
+
limit=self.max_connections,
|
1467
|
+
usage_percentage=usage_percentage,
|
1468
|
+
message=f"Active connections: {current_connections}/{self.max_connections}",
|
1469
|
+
)
|
1470
|
+
else:
|
1471
|
+
results["connections"] = ResourceCheckResult(
|
1472
|
+
can_proceed=True,
|
1473
|
+
resource_type="connections",
|
1474
|
+
current_usage=current_connections,
|
1475
|
+
limit=0,
|
1476
|
+
usage_percentage=0,
|
1477
|
+
message="No connection limit configured",
|
1478
|
+
)
|
1479
|
+
|
1480
|
+
return results
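
# A sketch of gating new work on check_all_limits(); `enforcer` is an
# instance of this class, and the ResourceCheckResult fields are those
# populated above.
def can_schedule(enforcer) -> bool:
    blocked = [
        result
        for result in enforcer.check_all_limits().values()
        if not result.can_proceed
    ]
    for result in blocked:
        logger.warning("Blocked by %s: %s", result.resource_type, result.message)
    return not blocked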
|
1481
|
+
|
1482
|
+
def enforce_memory_limits(self) -> None:
|
1483
|
+
"""Enforce memory limits based on policy.
|
1484
|
+
|
1485
|
+
Raises:
|
1486
|
+
MemoryLimitExceededError: If memory limit exceeded and policy is strict
|
1487
|
+
"""
|
1488
|
+
result = self.check_memory_limits()
|
1489
|
+
|
1490
|
+
if not result.can_proceed:
|
1491
|
+
if self.enforcement_policy == EnforcementPolicy.STRICT:
|
1492
|
+
raise MemoryLimitExceededError(result.current_usage, result.limit)
|
1493
|
+
elif self.enforcement_policy == EnforcementPolicy.WARN:
|
1494
|
+
logger.warning(f"Memory limit exceeded: {result.message}")
|
1495
|
+
elif self.enforcement_policy == EnforcementPolicy.ADAPTIVE:
|
1496
|
+
# Trigger garbage collection to try to free memory
|
1497
|
+
logger.warning(
|
1498
|
+
f"Memory limit exceeded, triggering garbage collection: {result.message}"
|
1499
|
+
)
|
1500
|
+
gc.collect()
|
1501
|
+
|
1502
|
+
# Re-check after GC
|
1503
|
+
recheck_result = self.check_memory_limits()
|
1504
|
+
if not recheck_result.can_proceed:
|
1505
|
+
if self.degradation_strategy == DegradationStrategy.REJECT:
|
1506
|
+
raise MemoryLimitExceededError(
|
1507
|
+
recheck_result.current_usage, recheck_result.limit
|
1508
|
+
)
|
1509
|
+
else:
|
1510
|
+
logger.warning(
|
1511
|
+
f"Memory limit still exceeded after GC: {recheck_result.message}"
|
1512
|
+
)
|
1513
|
+
|
1514
|
+
def enforce_cpu_limits(self) -> None:
|
1515
|
+
"""Enforce CPU limits based on policy.
|
1516
|
+
|
1517
|
+
Raises:
|
1518
|
+
CPULimitExceededError: If CPU limit exceeded and policy is strict
|
1519
|
+
"""
|
1520
|
+
result = self.check_cpu_limits()
|
1521
|
+
|
1522
|
+
if not result.can_proceed:
|
1523
|
+
if self.enforcement_policy == EnforcementPolicy.STRICT:
|
1524
|
+
raise CPULimitExceededError(result.current_usage, result.limit)
|
1525
|
+
elif self.enforcement_policy == EnforcementPolicy.WARN:
|
1526
|
+
logger.warning(f"CPU limit exceeded: {result.message}")
|
1527
|
+
elif self.enforcement_policy == EnforcementPolicy.ADAPTIVE:
|
1528
|
+
# Adaptive CPU throttling - introduce delays
|
1529
|
+
throttle_delay = min(1.0, (result.usage_percentage - 1.0) * 2.0)
|
1530
|
+
if throttle_delay > 0:
|
1531
|
+
logger.warning(f"CPU throttling: sleeping {throttle_delay:.2f}s")
|
1532
|
+
time.sleep(throttle_delay)
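
# Worked example of the throttle formula above: at 120% of the CPU limit,
# usage_percentage is 1.2, so the sleep is min(1.0, (1.2 - 1.0) * 2.0) = 0.4s;
# from 150% upward the delay saturates at the 1.0-second cap.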
|
1533
|
+
|
1534
|
+
def get_resource_metrics(self) -> Dict[str, Any]:
|
1535
|
+
"""Get current resource usage metrics.
|
1536
|
+
|
1537
|
+
Returns:
|
1538
|
+
Dict containing comprehensive resource metrics
|
1539
|
+
"""
|
1540
|
+
# Get current process metrics, not system-wide
|
1541
|
+
process = psutil.Process()
|
1542
|
+
memory_info = process.memory_info()
|
1543
|
+
cpu_percent = process.cpu_percent()
|
1544
|
+
|
1545
|
+
with self._lock:
|
1546
|
+
current_memory_mb = memory_info.rss / (1024 * 1024)
|
1547
|
+
memory_usage_percent = (
|
1548
|
+
(current_memory_mb / self.max_memory_mb * 100)
|
1549
|
+
if self.max_memory_mb
|
1550
|
+
else 0
|
1551
|
+
)
|
1552
|
+
|
1553
|
+
metrics = {
|
1554
|
+
"timestamp": datetime.now(UTC),
|
1555
|
+
"memory_usage_mb": current_memory_mb,
|
1556
|
+
"memory_usage_percent": memory_usage_percent,
|
1557
|
+
"cpu_usage_percent": cpu_percent,
|
1558
|
+
"active_connections": len(self.active_connections),
|
1559
|
+
"peak_memory_mb": self.peak_memory_mb,
|
1560
|
+
"peak_cpu_percent": self.peak_cpu_percent,
|
1561
|
+
"max_memory_mb": self.max_memory_mb,
|
1562
|
+
"max_connections": self.max_connections,
|
1563
|
+
"max_cpu_percent": self.max_cpu_percent,
|
1564
|
+
"enforcement_policy": self.enforcement_policy.value,
|
1565
|
+
"degradation_strategy": self.degradation_strategy.value,
|
1566
|
+
"uptime_seconds": time.time() - self.enforcement_start_time,
|
1567
|
+
}
|
1568
|
+
|
1569
|
+
# Add to history if enabled
|
1570
|
+
if self.enable_metrics_history:
|
1571
|
+
self.metrics_history.append(
|
1572
|
+
ResourceMetrics(
|
1573
|
+
timestamp=metrics["timestamp"],
|
1574
|
+
memory_usage_mb=metrics["memory_usage_mb"],
|
1575
|
+
memory_usage_percent=metrics["memory_usage_percent"],
|
1576
|
+
cpu_usage_percent=metrics["cpu_usage_percent"],
|
1577
|
+
active_connections=metrics["active_connections"],
|
1578
|
+
peak_memory_mb=metrics["peak_memory_mb"],
|
1579
|
+
peak_cpu_percent=metrics["peak_cpu_percent"],
|
1580
|
+
)
|
1581
|
+
)
|
1582
|
+
|
1583
|
+
return metrics
|
1584
|
+
|
1585
|
+
def get_metrics_history(
|
1586
|
+
self, duration_seconds: Optional[int] = None
|
1587
|
+
) -> List[ResourceMetrics]:
|
1588
|
+
"""Get resource metrics history.
|
1589
|
+
|
1590
|
+
Args:
|
1591
|
+
duration_seconds: Only return metrics from last N seconds (None = all)
|
1592
|
+
|
1593
|
+
Returns:
|
1594
|
+
List of ResourceMetrics from history
|
1595
|
+
"""
|
1596
|
+
if not self.enable_metrics_history:
|
1597
|
+
return []
|
1598
|
+
|
1599
|
+
with self._lock:
|
1600
|
+
if duration_seconds is None:
|
1601
|
+
return list(self.metrics_history)
|
1602
|
+
|
1603
|
+
# Filter by duration
|
1604
|
+
cutoff_time = datetime.now(UTC) - timedelta(seconds=duration_seconds)
|
1605
|
+
return [
|
1606
|
+
metrics
|
1607
|
+
for metrics in self.metrics_history
|
1608
|
+
if metrics.timestamp >= cutoff_time
|
1609
|
+
]
|
1610
|
+
|
1611
|
+
async def start_monitoring(self) -> None:
|
1612
|
+
"""Start asynchronous resource monitoring."""
|
1613
|
+
if self._is_monitoring:
|
1614
|
+
return
|
1615
|
+
|
1616
|
+
self._is_monitoring = True
|
1617
|
+
self._monitoring_task = asyncio.create_task(self._monitoring_loop())
|
1618
|
+
logger.info("Resource monitoring started")
|
1619
|
+
|
1620
|
+
async def stop_monitoring(self) -> None:
|
1621
|
+
"""Stop asynchronous resource monitoring."""
|
1622
|
+
if not self._is_monitoring:
|
1623
|
+
return
|
1624
|
+
|
1625
|
+
self._is_monitoring = False
|
1626
|
+
if self._monitoring_task:
|
1627
|
+
self._monitoring_task.cancel()
|
1628
|
+
try:
|
1629
|
+
await self._monitoring_task
|
1630
|
+
except asyncio.CancelledError:
|
1631
|
+
pass
|
1632
|
+
logger.info("Resource monitoring stopped")
|
1633
|
+
|
1634
|
+
async def _monitoring_loop(self) -> None:
|
1635
|
+
"""Internal monitoring loop."""
|
1636
|
+
while self._is_monitoring:
|
1637
|
+
try:
|
1638
|
+
# Collect metrics
|
1639
|
+
self.get_resource_metrics()
|
1640
|
+
|
1641
|
+
# Check for limit violations
|
1642
|
+
results = self.check_all_limits()
|
1643
|
+
|
1644
|
+
# Log warnings for violations
|
1645
|
+
for resource_type, result in results.items():
|
1646
|
+
if not result.can_proceed and self.enable_alerts:
|
1647
|
+
logger.warning(f"Resource limit violation: {result.message}")
|
1648
|
+
|
1649
|
+
await asyncio.sleep(self.monitoring_interval)
|
1650
|
+
|
1651
|
+
except Exception as e:
|
1652
|
+
logger.error(f"Error in resource monitoring loop: {e}")
|
1653
|
+
await asyncio.sleep(self.monitoring_interval)
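
# Running the monitor for an application's lifetime; a sketch assuming an
# already-configured enforcer. `serve_requests` is a placeholder coroutine
# standing in for the real workload.
async def run_app(enforcer):
    await enforcer.start_monitoring()
    try:
        await serve_requests()  # placeholder for the real workload
    finally:
        await enforcer.stop_monitoring()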
|
1654
|
+
|
1655
|
+
|
1656
|
+
# Comprehensive Retry Policy Engine Implementation
|
1657
|
+
|
1658
|
+
|
1659
|
+
class RetryPolicyMode(Enum):
|
1660
|
+
"""Retry policy operation modes."""
|
1661
|
+
|
1662
|
+
STRICT = "strict" # Fail fast on non-retriable exceptions
|
1663
|
+
PERMISSIVE = "permissive" # Allow retries for more exception types
|
1664
|
+
ADAPTIVE = "adaptive" # Learn and adapt retry behavior
|
1665
|
+
CIRCUIT_AWARE = "circuit_aware" # Coordinate with circuit breakers
|
1666
|
+
|
1667
|
+
|
1668
|
+
@dataclass
|
1669
|
+
class RetryAttempt:
|
1670
|
+
"""Record of a single retry attempt."""
|
1671
|
+
|
1672
|
+
timestamp: datetime
|
1673
|
+
exception_type: Type[Exception]
|
1674
|
+
attempt_number: int
|
1675
|
+
delay_used: float
|
1676
|
+
success: bool
|
1677
|
+
execution_time: float
|
1678
|
+
error_message: str = ""
|
1679
|
+
|
1680
|
+
|
1681
|
+
@dataclass
|
1682
|
+
class RetryResult:
|
1683
|
+
"""Result of retry policy execution."""
|
1684
|
+
|
1685
|
+
success: bool
|
1686
|
+
value: Any = None
|
1687
|
+
total_attempts: int = 0
|
1688
|
+
total_time: float = 0.0
|
1689
|
+
final_exception: Optional[Exception] = None
|
1690
|
+
attempts: List[RetryAttempt] = field(default_factory=list)
|
1691
|
+
|
1692
|
+
|
1693
|
+
class RetryStrategy(ABC):
|
1694
|
+
"""Abstract base class for retry strategies."""
|
1695
|
+
|
1696
|
+
def __init__(self, name: str, max_attempts: int = 3):
|
1697
|
+
"""Initialize retry strategy.
|
1698
|
+
|
1699
|
+
Args:
|
1700
|
+
name: Strategy name for identification
|
1701
|
+
max_attempts: Maximum number of retry attempts
|
1702
|
+
"""
|
1703
|
+
self.name = name
|
1704
|
+
self.max_attempts = max_attempts
|
1705
|
+
|
1706
|
+
@abstractmethod
|
1707
|
+
def calculate_delay(self, attempt: int) -> float:
|
1708
|
+
"""Calculate delay for the given attempt number.
|
1709
|
+
|
1710
|
+
Args:
|
1711
|
+
attempt: Current attempt number (1-based)
|
1712
|
+
|
1713
|
+
Returns:
|
1714
|
+
Delay in seconds
|
1715
|
+
"""
|
1716
|
+
pass
|
1717
|
+
|
1718
|
+
def should_retry(self, exception: Exception, attempt: int) -> bool:
|
1719
|
+
"""Determine if the operation should be retried.
|
1720
|
+
|
1721
|
+
Args:
|
1722
|
+
exception: Exception that occurred
|
1723
|
+
attempt: Current attempt number
|
1724
|
+
|
1725
|
+
Returns:
|
1726
|
+
True if should retry, False otherwise
|
1727
|
+
"""
|
1728
|
+
# Default implementation - retry for most exceptions except system ones
|
1729
|
+
non_retriable = (KeyboardInterrupt, SystemExit, SystemError)
|
1730
|
+
return not isinstance(exception, non_retriable)
|
1731
|
+
|
1732
|
+
def get_config(self) -> Dict[str, Any]:
|
1733
|
+
"""Get strategy configuration for serialization.
|
1734
|
+
|
1735
|
+
Returns:
|
1736
|
+
Configuration dictionary
|
1737
|
+
"""
|
1738
|
+
return {"strategy_type": self.name, "max_attempts": self.max_attempts}
|
1739
|
+
|
1740
|
+
|
1741
|
+
class ExponentialBackoffStrategy(RetryStrategy):
|
1742
|
+
"""Exponential backoff retry strategy with jitter."""
|
1743
|
+
|
1744
|
+
def __init__(
|
1745
|
+
self,
|
1746
|
+
max_attempts: int = 3,
|
1747
|
+
base_delay: float = 1.0,
|
1748
|
+
max_delay: float = 60.0,
|
1749
|
+
multiplier: float = 2.0,
|
1750
|
+
jitter: bool = True,
|
1751
|
+
):
|
1752
|
+
"""Initialize exponential backoff strategy.
|
1753
|
+
|
1754
|
+
Args:
|
1755
|
+
max_attempts: Maximum number of attempts
|
1756
|
+
base_delay: Base delay in seconds
|
1757
|
+
max_delay: Maximum delay in seconds
|
1758
|
+
multiplier: Exponential multiplier
|
1759
|
+
jitter: Whether to add jitter to delays
|
1760
|
+
"""
|
1761
|
+
super().__init__("exponential_backoff", max_attempts)
|
1762
|
+
self.base_delay = base_delay
|
1763
|
+
self.max_delay = max_delay
|
1764
|
+
self.multiplier = multiplier
|
1765
|
+
self.jitter = jitter
|
1766
|
+
|
1767
|
+
def calculate_delay(self, attempt: int) -> float:
|
1768
|
+
"""Calculate exponential backoff delay with optional jitter."""
|
1769
|
+
delay = self.base_delay * (self.multiplier ** (attempt - 1))
|
1770
|
+
delay = min(delay, self.max_delay)
|
1771
|
+
|
1772
|
+
if self.jitter:
|
1773
|
+
# Add up to 25% jitter
|
1774
|
+
jitter_amount = delay * 0.25 * random.random()
|
1775
|
+
delay += jitter_amount
|
1776
|
+
|
1777
|
+
return delay
|
1778
|
+
|
1779
|
+
def get_config(self) -> Dict[str, Any]:
|
1780
|
+
"""Get exponential backoff configuration."""
|
1781
|
+
config = super().get_config()
|
1782
|
+
config.update(
|
1783
|
+
{
|
1784
|
+
"base_delay": self.base_delay,
|
1785
|
+
"max_delay": self.max_delay,
|
1786
|
+
"multiplier": self.multiplier,
|
1787
|
+
"jitter": self.jitter,
|
1788
|
+
}
|
1789
|
+
)
|
1790
|
+
return config
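
# Delay growth without jitter: base_delay=1.0 and multiplier=2.0 yield
# 1s, 2s, 4s for attempts 1-3 (capped at max_delay); enabling jitter adds
# up to 25% on top of each value.
strategy = ExponentialBackoffStrategy(base_delay=1.0, multiplier=2.0, jitter=False)
assert [strategy.calculate_delay(n) for n in (1, 2, 3)] == [1.0, 2.0, 4.0]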
|
1791
|
+
|
1792
|
+
|
1793
|
+
class LinearBackoffStrategy(RetryStrategy):
|
1794
|
+
"""Linear backoff retry strategy with optional jitter."""
|
1795
|
+
|
1796
|
+
def __init__(
|
1797
|
+
self,
|
1798
|
+
max_attempts: int = 3,
|
1799
|
+
base_delay: float = 1.0,
|
1800
|
+
max_delay: float = 30.0,
|
1801
|
+
increment: float = 1.0,
|
1802
|
+
jitter: bool = True,
|
1803
|
+
):
|
1804
|
+
"""Initialize linear backoff strategy.
|
1805
|
+
|
1806
|
+
Args:
|
1807
|
+
max_attempts: Maximum number of attempts
|
1808
|
+
base_delay: Base delay in seconds
|
1809
|
+
max_delay: Maximum delay in seconds
|
1810
|
+
increment: Linear increment per attempt
|
1811
|
+
jitter: Whether to add jitter to delays
|
1812
|
+
"""
|
1813
|
+
super().__init__("linear_backoff", max_attempts)
|
1814
|
+
self.base_delay = base_delay
|
1815
|
+
self.max_delay = max_delay
|
1816
|
+
self.increment = increment
|
1817
|
+
self.jitter = jitter
|
1818
|
+
|
1819
|
+
def calculate_delay(self, attempt: int) -> float:
|
1820
|
+
"""Calculate linear backoff delay with optional jitter."""
|
1821
|
+
delay = self.base_delay + ((attempt - 1) * self.increment)
|
1822
|
+
delay = min(delay, self.max_delay)
|
1823
|
+
|
1824
|
+
if self.jitter:
|
1825
|
+
# Add up to 25% jitter
|
1826
|
+
jitter_amount = delay * 0.25 * random.random()
|
1827
|
+
delay += jitter_amount
|
1828
|
+
|
1829
|
+
return delay
|
1830
|
+
|
1831
|
+
def get_config(self) -> Dict[str, Any]:
|
1832
|
+
"""Get linear backoff configuration."""
|
1833
|
+
config = super().get_config()
|
1834
|
+
config.update(
|
1835
|
+
{
|
1836
|
+
"base_delay": self.base_delay,
|
1837
|
+
"max_delay": self.max_delay,
|
1838
|
+
"increment": self.increment,
|
1839
|
+
"jitter": self.jitter,
|
1840
|
+
}
|
1841
|
+
)
|
1842
|
+
return config
|
1843
|
+
|
1844
|
+
|
1845
|
+
class FixedDelayStrategy(RetryStrategy):
|
1846
|
+
"""Fixed delay retry strategy with optional jitter."""
|
1847
|
+
|
1848
|
+
def __init__(self, max_attempts: int = 3, delay: float = 1.0, jitter: bool = True):
|
1849
|
+
"""Initialize fixed delay strategy.
|
1850
|
+
|
1851
|
+
Args:
|
1852
|
+
max_attempts: Maximum number of attempts
|
1853
|
+
delay: Fixed delay in seconds
|
1854
|
+
jitter: Whether to add jitter to delays
|
1855
|
+
"""
|
1856
|
+
super().__init__("fixed_delay", max_attempts)
|
1857
|
+
self.delay = delay
|
1858
|
+
self.jitter = jitter
|
1859
|
+
|
1860
|
+
def calculate_delay(self, attempt: int) -> float:
|
1861
|
+
"""Calculate fixed delay with optional jitter."""
|
1862
|
+
delay = self.delay
|
1863
|
+
|
1864
|
+
if self.jitter:
|
1865
|
+
# Add up to 25% jitter
|
1866
|
+
jitter_amount = delay * 0.25 * random.random()
|
1867
|
+
delay += jitter_amount
|
1868
|
+
|
1869
|
+
return delay
|
1870
|
+
|
1871
|
+
def get_config(self) -> Dict[str, Any]:
|
1872
|
+
"""Get fixed delay configuration."""
|
1873
|
+
config = super().get_config()
|
1874
|
+
config.update({"delay": self.delay, "jitter": self.jitter})
|
1875
|
+
return config
|
1876
|
+
|
1877
|
+
|
1878
|
+
class AdaptiveRetryStrategy(RetryStrategy):
|
1879
|
+
"""Adaptive retry strategy that learns from historical success/failure patterns."""
|
1880
|
+
|
1881
|
+
def __init__(
|
1882
|
+
self,
|
1883
|
+
max_attempts: int = 3,
|
1884
|
+
initial_delay: float = 1.0,
|
1885
|
+
min_delay: float = 0.1,
|
1886
|
+
max_delay: float = 30.0,
|
1887
|
+
learning_rate: float = 0.1,
|
1888
|
+
history_size: int = 1000,
|
1889
|
+
):
|
1890
|
+
"""Initialize adaptive retry strategy.
|
1891
|
+
|
1892
|
+
Args:
|
1893
|
+
max_attempts: Maximum number of attempts
|
1894
|
+
initial_delay: Initial delay for new exception types
|
1895
|
+
min_delay: Minimum delay bound
|
1896
|
+
max_delay: Maximum delay bound
|
1897
|
+
learning_rate: How quickly to adapt (0.0-1.0)
|
1898
|
+
history_size: Maximum number of attempts to remember
|
1899
|
+
"""
|
1900
|
+
super().__init__("adaptive_retry", max_attempts)
|
1901
|
+
self.initial_delay = initial_delay
|
1902
|
+
self.min_delay = min_delay
|
1903
|
+
self.max_delay = max_delay
|
1904
|
+
self.learning_rate = learning_rate
|
1905
|
+
self.history_size = history_size
|
1906
|
+
|
1907
|
+
# Learning data structures
|
1908
|
+
self.attempt_history: deque = deque(maxlen=history_size)
|
1909
|
+
self.exception_delays: Dict[Type[Exception], float] = {}
|
1910
|
+
self.success_rates: Dict[Type[Exception], Tuple[int, int]] = defaultdict(
|
1911
|
+
lambda: (0, 0)
|
1912
|
+
)
|
1913
|
+
|
1914
|
+
# Thread safety for learning data
|
1915
|
+
self._learning_lock = threading.RLock()
|
1916
|
+
|
1917
|
+
def calculate_delay(
|
1918
|
+
self, attempt: int, exception_type: Type[Exception] = Exception
|
1919
|
+
) -> float:
|
1920
|
+
"""Calculate adaptive delay based on learned patterns."""
|
1921
|
+
with self._learning_lock:
|
1922
|
+
if exception_type in self.exception_delays:
|
1923
|
+
base_delay = self.exception_delays[exception_type]
|
1924
|
+
else:
|
1925
|
+
base_delay = self.initial_delay
|
1926
|
+
|
1927
|
+
# Apply attempt multiplier with learned adjustments
|
1928
|
+
delay = base_delay * (1.2 ** (attempt - 1))
|
1929
|
+
return max(self.min_delay, min(delay, self.max_delay))
|
1930
|
+
|
1931
|
+
def get_recommended_delay(
|
1932
|
+
self, exception_type: Type[Exception], attempt: int
|
1933
|
+
) -> float:
|
1934
|
+
"""Get recommended delay for specific exception type and attempt."""
|
1935
|
+
return self.calculate_delay(attempt, exception_type)
|
1936
|
+
|
1937
|
+
def record_attempt_result(
|
1938
|
+
self,
|
1939
|
+
exception_type: Type[Exception],
|
1940
|
+
attempt: int,
|
1941
|
+
delay_used: float,
|
1942
|
+
success: bool,
|
1943
|
+
execution_time: float = 0.0,
|
1944
|
+
) -> None:
|
1945
|
+
"""Record the result of an attempt for learning.
|
1946
|
+
|
1947
|
+
Args:
|
1948
|
+
exception_type: Type of exception that occurred
|
1949
|
+
attempt: Attempt number
|
1950
|
+
delay_used: Delay that was used
|
1951
|
+
success: Whether the attempt succeeded
|
1952
|
+
execution_time: How long the operation took
|
1953
|
+
"""
|
1954
|
+
with self._learning_lock:
|
1955
|
+
# Record in history
|
1956
|
+
self.attempt_history.append(
|
1957
|
+
{
|
1958
|
+
"exception_type": exception_type,
|
1959
|
+
"attempt": attempt,
|
1960
|
+
"delay_used": delay_used,
|
1961
|
+
"success": success,
|
1962
|
+
"execution_time": execution_time,
|
1963
|
+
"timestamp": datetime.now(UTC),
|
1964
|
+
}
|
1965
|
+
)
|
1966
|
+
|
1967
|
+
# Update success rates
|
1968
|
+
successes, failures = self.success_rates[exception_type]
|
1969
|
+
if success:
|
1970
|
+
successes += 1
|
1971
|
+
else:
|
1972
|
+
failures += 1
|
1973
|
+
self.success_rates[exception_type] = (successes, failures)
|
1974
|
+
|
1975
|
+
# Adapt delay based on result
|
1976
|
+
current_delay = self.exception_delays.get(
|
1977
|
+
exception_type, self.initial_delay
|
1978
|
+
)
|
1979
|
+
|
1980
|
+
if success:
|
1981
|
+
# Successful retry - reduce delay slightly
|
1982
|
+
new_delay = current_delay * (1.0 - self.learning_rate * 0.5)
|
1983
|
+
else:
|
1984
|
+
# Failed retry - increase delay
|
1985
|
+
new_delay = current_delay * (1.0 + self.learning_rate)
|
1986
|
+
|
1987
|
+
# Apply bounds
|
1988
|
+
new_delay = max(self.min_delay, min(new_delay, self.max_delay))
|
1989
|
+
self.exception_delays[exception_type] = new_delay
|
1990
|
+
|
1991
|
+
logger.debug(
|
1992
|
+
f"Adaptive retry learned: {exception_type.__name__} delay "
|
1993
|
+
f"{current_delay:.2f}s -> {new_delay:.2f}s (success: {success})"
|
1994
|
+
)
|
1995
|
+
|
1996
|
+
def get_learning_stats(self) -> Dict[str, Any]:
|
1997
|
+
"""Get statistics about learned patterns.
|
1998
|
+
|
1999
|
+
Returns:
|
2000
|
+
Dictionary containing learning statistics
|
2001
|
+
"""
|
2002
|
+
with self._learning_lock:
|
2003
|
+
return {
|
2004
|
+
"total_attempts": len(self.attempt_history),
|
2005
|
+
"unique_exceptions": len(self.exception_delays),
|
2006
|
+
"learned_delays": {
|
2007
|
+
exc_type.__name__: delay
|
2008
|
+
for exc_type, delay in self.exception_delays.items()
|
2009
|
+
},
|
2010
|
+
"success_rates": {
|
2011
|
+
exc_type.__name__: (
|
2012
|
+
successes / (successes + failures)
|
2013
|
+
if (successes + failures) > 0
|
2014
|
+
else 0.0
|
2015
|
+
)
|
2016
|
+
for exc_type, (successes, failures) in self.success_rates.items()
|
2017
|
+
},
|
2018
|
+
}
|
2019
|
+
|
2020
|
+
def get_config(self) -> Dict[str, Any]:
|
2021
|
+
"""Get adaptive strategy configuration."""
|
2022
|
+
config = super().get_config()
|
2023
|
+
config.update(
|
2024
|
+
{
|
2025
|
+
"initial_delay": self.initial_delay,
|
2026
|
+
"min_delay": self.min_delay,
|
2027
|
+
"max_delay": self.max_delay,
|
2028
|
+
"learning_rate": self.learning_rate,
|
2029
|
+
"history_size": self.history_size,
|
2030
|
+
}
|
2031
|
+
)
|
2032
|
+
return config
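
# The learning rule in record_attempt_result, worked through: with
# learning_rate=0.1 a failed retry scales the learned delay by 1.1 and a
# successful one by 0.95, always clamped to [min_delay, max_delay].
strategy = AdaptiveRetryStrategy(initial_delay=1.0, learning_rate=0.1)
strategy.record_attempt_result(TimeoutError, attempt=1, delay_used=1.0, success=False)
assert abs(strategy.exception_delays[TimeoutError] - 1.1) < 1e-9
strategy.record_attempt_result(TimeoutError, attempt=2, delay_used=1.1, success=True)
assert abs(strategy.exception_delays[TimeoutError] - 1.045) < 1e-9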
|
2033
|
+
|
2034
|
+
|
2035
|
+
class ExceptionClassifier:
|
2036
|
+
"""Smart exception classification for retry decisions."""
|
2037
|
+
|
2038
|
+
def __init__(self):
|
2039
|
+
"""Initialize exception classifier with built-in rules."""
|
2040
|
+
# Built-in retriable exceptions (network, temporary failures)
|
2041
|
+
self.retriable_exceptions: Set[Type[Exception]] = {
|
2042
|
+
ConnectionError,
|
2043
|
+
TimeoutError,
|
2044
|
+
OSError, # Network-related OS errors
|
2045
|
+
RuntimeError, # General runtime issues
|
2046
|
+
ValueError, # Often temporary data issues
|
2047
|
+
}
|
2048
|
+
|
2049
|
+
# Built-in non-retriable exceptions (system, user, permanent)
|
2050
|
+
self.non_retriable_exceptions: Set[Type[Exception]] = {
|
2051
|
+
KeyboardInterrupt,
|
2052
|
+
SystemExit,
|
2053
|
+
SystemError,
|
2054
|
+
MemoryError,
|
2055
|
+
RecursionError,
|
2056
|
+
SyntaxError,
|
2057
|
+
TypeError, # Usually indicates programming errors
|
2058
|
+
AttributeError, # Usually permanent
|
2059
|
+
ImportError, # Usually permanent
|
2060
|
+
}
|
2061
|
+
|
2062
|
+
# Pattern-based rules (regex patterns to match exception messages)
|
2063
|
+
self.retriable_patterns: List[Tuple[re.Pattern, bool]] = (
|
2064
|
+
[]
|
2065
|
+
) # (pattern, case_sensitive)
|
2066
|
+
self.non_retriable_patterns: List[Tuple[re.Pattern, bool]] = []
|
2067
|
+
|
2068
|
+
# Lock for thread safety
|
2069
|
+
self._lock = threading.RLock()
|
2070
|
+
|
2071
|
+
logger.info("ExceptionClassifier initialized with built-in rules")
|
2072
|
+
|
2073
|
+
def is_retriable(self, exception: Exception) -> bool:
|
2074
|
+
"""Determine if an exception is retriable.
|
2075
|
+
|
2076
|
+
Args:
|
2077
|
+
exception: Exception to classify
|
2078
|
+
|
2079
|
+
Returns:
|
2080
|
+
True if the exception is retriable, False otherwise
|
2081
|
+
"""
|
2082
|
+
with self._lock:
|
2083
|
+
exception_type = type(exception)
|
2084
|
+
exception_message = str(exception)
|
2085
|
+
|
2086
|
+
# Check non-retriable patterns first (higher priority)
|
2087
|
+
for pattern, case_sensitive in self.non_retriable_patterns:
|
2088
|
+
if pattern.search(exception_message):
|
2089
|
+
logger.debug(
|
2090
|
+
f"Exception '{exception_message}' matched non-retriable pattern"
|
2091
|
+
)
|
2092
|
+
return False
|
2093
|
+
|
2094
|
+
# Check non-retriable exception types
|
2095
|
+
for non_retriable_type in self.non_retriable_exceptions:
|
2096
|
+
if issubclass(exception_type, non_retriable_type):
|
2097
|
+
logger.debug(
|
2098
|
+
f"Exception type {exception_type.__name__} is non-retriable"
|
2099
|
+
)
|
2100
|
+
return False
|
2101
|
+
|
2102
|
+
# Check retriable patterns
|
2103
|
+
for pattern, case_sensitive in self.retriable_patterns:
|
2104
|
+
if pattern.search(exception_message):
|
2105
|
+
logger.debug(
|
2106
|
+
f"Exception '{exception_message}' matched retriable pattern"
|
2107
|
+
)
|
2108
|
+
return True
|
2109
|
+
|
2110
|
+
# Check retriable exception types
|
2111
|
+
for retriable_type in self.retriable_exceptions:
|
2112
|
+
if issubclass(exception_type, retriable_type):
|
2113
|
+
logger.debug(
|
2114
|
+
f"Exception type {exception_type.__name__} is retriable"
|
2115
|
+
)
|
2116
|
+
return True
|
2117
|
+
|
2118
|
+
# Default to non-retriable for unknown exceptions
|
2119
|
+
logger.debug(
|
2120
|
+
f"Exception type {exception_type.__name__} not classified, defaulting to non-retriable"
|
2121
|
+
)
|
2122
|
+
return False
|
2123
|
+
|
2124
|
+
def add_retriable_exception(self, exception_type: Type[Exception]) -> None:
|
2125
|
+
"""Add an exception type to retriable list.
|
2126
|
+
|
2127
|
+
Args:
|
2128
|
+
exception_type: Exception type to mark as retriable
|
2129
|
+
"""
|
2130
|
+
with self._lock:
|
2131
|
+
self.retriable_exceptions.add(exception_type)
|
2132
|
+
# Remove from non-retriable if present
|
2133
|
+
self.non_retriable_exceptions.discard(exception_type)
|
2134
|
+
|
2135
|
+
logger.info(f"Added {exception_type.__name__} to retriable exceptions")
|
2136
|
+
|
2137
|
+
def add_non_retriable_exception(self, exception_type: Type[Exception]) -> None:
|
2138
|
+
"""Add an exception type to non-retriable list.
|
2139
|
+
|
2140
|
+
Args:
|
2141
|
+
exception_type: Exception type to mark as non-retriable
|
2142
|
+
"""
|
2143
|
+
with self._lock:
|
2144
|
+
self.non_retriable_exceptions.add(exception_type)
|
2145
|
+
# Remove from retriable if present
|
2146
|
+
self.retriable_exceptions.discard(exception_type)
|
2147
|
+
|
2148
|
+
logger.info(f"Added {exception_type.__name__} to non-retriable exceptions")
|
2149
|
+
|
2150
|
+
def add_retriable_pattern(self, pattern: str, case_sensitive: bool = True) -> None:
|
2151
|
+
"""Add a regex pattern for retriable exceptions.
|
2152
|
+
|
2153
|
+
Args:
|
2154
|
+
pattern: Regex pattern to match exception messages
|
2155
|
+
case_sensitive: Whether the pattern matching is case-sensitive
|
2156
|
+
"""
|
2157
|
+
with self._lock:
|
2158
|
+
flags = 0 if case_sensitive else re.IGNORECASE
|
2159
|
+
compiled_pattern = re.compile(pattern, flags)
|
2160
|
+
self.retriable_patterns.append((compiled_pattern, case_sensitive))
|
2161
|
+
|
2162
|
+
logger.info(
|
2163
|
+
f"Added retriable pattern: {pattern} (case_sensitive: {case_sensitive})"
|
2164
|
+
)
|
2165
|
+
|
2166
|
+
def add_non_retriable_pattern(
|
2167
|
+
self, pattern: str, case_sensitive: bool = True
|
2168
|
+
) -> None:
|
2169
|
+
"""Add a regex pattern for non-retriable exceptions.
|
2170
|
+
|
2171
|
+
Args:
|
2172
|
+
pattern: Regex pattern to match exception messages
|
2173
|
+
case_sensitive: Whether the pattern matching is case-sensitive
|
2174
|
+
"""
|
2175
|
+
with self._lock:
|
2176
|
+
flags = 0 if case_sensitive else re.IGNORECASE
|
2177
|
+
compiled_pattern = re.compile(pattern, flags)
|
2178
|
+
self.non_retriable_patterns.append((compiled_pattern, case_sensitive))
|
2179
|
+
|
2180
|
+
logger.info(
|
2181
|
+
f"Added non-retriable pattern: {pattern} (case_sensitive: {case_sensitive})"
|
2182
|
+
)
|
2183
|
+
|
2184
|
+
def get_classification_rules(self) -> Dict[str, Any]:
|
2185
|
+
"""Get current classification rules.
|
2186
|
+
|
2187
|
+
Returns:
|
2188
|
+
Dictionary containing all classification rules
|
2189
|
+
"""
|
2190
|
+
with self._lock:
|
2191
|
+
return {
|
2192
|
+
"retriable_exceptions": [
|
2193
|
+
exc.__name__ for exc in self.retriable_exceptions
|
2194
|
+
],
|
2195
|
+
"non_retriable_exceptions": [
|
2196
|
+
exc.__name__ for exc in self.non_retriable_exceptions
|
2197
|
+
],
|
2198
|
+
"retriable_patterns": [
|
2199
|
+
(p.pattern, cs) for p, cs in self.retriable_patterns
|
2200
|
+
],
|
2201
|
+
"non_retriable_patterns": [
|
2202
|
+
(p.pattern, cs) for p, cs in self.non_retriable_patterns
|
2203
|
+
],
|
2204
|
+
}
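
# Message patterns outrank type rules, so a specific failure mode can be
# pinned as non-retriable even when its exception type is retriable by
# default (ValueError is in the built-in retriable set above).
classifier = ExceptionClassifier()
classifier.add_non_retriable_pattern(r"quota exceeded", case_sensitive=False)
assert classifier.is_retriable(ValueError("transient parse hiccup"))
assert not classifier.is_retriable(ValueError("Quota exceeded for project"))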
|
2205
|
+
|
2206
|
+
|
2207
|
+
class RetryMetrics:
|
2208
|
+
"""Comprehensive retry metrics collection and analysis."""
|
2209
|
+
|
2210
|
+
def __init__(self):
|
2211
|
+
"""Initialize retry metrics collector."""
|
2212
|
+
self.total_attempts = 0
|
2213
|
+
self.total_successes = 0
|
2214
|
+
self.total_failures = 0
|
2215
|
+
self.attempt_history: List[RetryAttempt] = []
|
2216
|
+
|
2217
|
+
# Performance metrics
|
2218
|
+
self.total_delay_time = 0.0
|
2219
|
+
self.total_execution_time = 0.0
|
2220
|
+
|
2221
|
+
# Exception tracking
|
2222
|
+
self.exception_counts: Dict[str, int] = defaultdict(int)
|
2223
|
+
|
2224
|
+
# Thread safety
|
2225
|
+
self._lock = threading.RLock()
|
2226
|
+
|
2227
|
+
def record_attempt(self, attempt: RetryAttempt) -> None:
|
2228
|
+
"""Record a retry attempt.
|
2229
|
+
|
2230
|
+
Args:
|
2231
|
+
attempt: RetryAttempt object with attempt details
|
2232
|
+
"""
|
2233
|
+
with self._lock:
|
2234
|
+
self.attempt_history.append(attempt)
|
2235
|
+
self.total_attempts += 1
|
2236
|
+
|
2237
|
+
if attempt.success:
|
2238
|
+
self.total_successes += 1
|
2239
|
+
else:
|
2240
|
+
self.total_failures += 1
|
2241
|
+
|
2242
|
+
self.total_delay_time += attempt.delay_used
|
2243
|
+
self.total_execution_time += attempt.execution_time
|
2244
|
+
self.exception_counts[attempt.exception_type.__name__] += 1
|
2245
|
+
|
2246
|
+
@property
|
2247
|
+
def success_rate(self) -> float:
|
2248
|
+
"""Calculate overall success rate."""
|
2249
|
+
if self.total_attempts == 0:
|
2250
|
+
return 0.0
|
2251
|
+
return self.total_successes / self.total_attempts
|
2252
|
+
|
2253
|
+
@property
|
2254
|
+
def average_delay(self) -> float:
|
2255
|
+
"""Calculate average delay between attempts."""
|
2256
|
+
if self.total_attempts == 0:
|
2257
|
+
return 0.0
|
2258
|
+
return self.total_delay_time / self.total_attempts
|
2259
|
+
|
2260
|
+
@property
|
2261
|
+
def average_execution_time(self) -> float:
|
2262
|
+
"""Calculate average execution time per attempt."""
|
2263
|
+
if self.total_attempts == 0:
|
2264
|
+
return 0.0
|
2265
|
+
return self.total_execution_time / self.total_attempts
|
2266
|
+
|
2267
|
+
def get_exception_breakdown(self) -> Dict[str, int]:
|
2268
|
+
"""Get breakdown of exceptions by type.
|
2269
|
+
|
2270
|
+
Returns:
|
2271
|
+
Dictionary mapping exception names to counts
|
2272
|
+
"""
|
2273
|
+
with self._lock:
|
2274
|
+
return dict(self.exception_counts)
|
2275
|
+
|
2276
|
+
def get_attempt_timeline(self) -> List[Dict[str, Any]]:
|
2277
|
+
"""Get chronological timeline of attempts.
|
2278
|
+
|
2279
|
+
Returns:
|
2280
|
+
List of attempt dictionaries sorted by timestamp
|
2281
|
+
"""
|
2282
|
+
with self._lock:
|
2283
|
+
timeline = []
|
2284
|
+
for attempt in sorted(self.attempt_history, key=lambda a: a.timestamp):
|
2285
|
+
timeline.append(
|
2286
|
+
{
|
2287
|
+
"timestamp": attempt.timestamp,
|
2288
|
+
"attempt_number": attempt.attempt_number,
|
2289
|
+
"exception_type": attempt.exception_type.__name__,
|
2290
|
+
"delay_used": attempt.delay_used,
|
2291
|
+
"success": attempt.success,
|
2292
|
+
"execution_time": attempt.execution_time,
|
2293
|
+
"error_message": attempt.error_message,
|
2294
|
+
}
|
2295
|
+
)
|
2296
|
+
return timeline
|
2297
|
+
|
2298
|
+
def get_summary_stats(self) -> Dict[str, Any]:
|
2299
|
+
"""Get comprehensive summary statistics.
|
2300
|
+
|
2301
|
+
Returns:
|
2302
|
+
Dictionary containing all metrics
|
2303
|
+
"""
|
2304
|
+
with self._lock:
|
2305
|
+
return {
|
2306
|
+
"total_attempts": self.total_attempts,
|
2307
|
+
"total_successes": self.total_successes,
|
2308
|
+
"total_failures": self.total_failures,
|
2309
|
+
"success_rate": self.success_rate,
|
2310
|
+
"average_delay": self.average_delay,
|
2311
|
+
"average_execution_time": self.average_execution_time,
|
2312
|
+
"total_delay_time": self.total_delay_time,
|
2313
|
+
"total_execution_time": self.total_execution_time,
|
2314
|
+
"unique_exceptions": len(self.exception_counts),
|
2315
|
+
"most_common_exception": (
|
2316
|
+
max(self.exception_counts.items(), key=lambda x: x[1])[0]
|
2317
|
+
if self.exception_counts
|
2318
|
+
else None
|
2319
|
+
),
|
2320
|
+
}
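
# Feeding the collector directly; RetryPolicyEngine does this automatically
# when analytics are enabled.
metrics = RetryMetrics()
metrics.record_attempt(
    RetryAttempt(
        timestamp=datetime.now(UTC),
        exception_type=TimeoutError,
        attempt_number=1,
        delay_used=0.5,
        success=False,
        execution_time=0.02,
        error_message="read timed out",
    )
)
assert metrics.success_rate == 0.0
assert metrics.get_exception_breakdown() == {"TimeoutError": 1}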
|
2321
|
+
|
2322
|
+
|
2323
|
+
@dataclass
|
2324
|
+
class RetryAnalytics:
|
2325
|
+
"""Advanced retry analytics and reporting."""
|
2326
|
+
|
2327
|
+
total_retry_sessions: int = 0
|
2328
|
+
total_attempts: int = 0
|
2329
|
+
total_successes: int = 0
|
2330
|
+
average_attempts_per_session: float = 0.0
|
2331
|
+
most_common_exceptions: List[Tuple[str, int]] = field(default_factory=list)
|
2332
|
+
|
2333
|
+
def __post_init__(self):
|
2334
|
+
"""Initialize analytics collections."""
|
2335
|
+
self.session_data: List[Dict[str, Any]] = []
|
2336
|
+
self.exception_frequencies: Dict[str, int] = defaultdict(int)
|
2337
|
+
self.strategy_performance: Dict[str, Dict[str, Any]] = defaultdict(
|
2338
|
+
lambda: {
|
2339
|
+
"total_uses": 0,
|
2340
|
+
"total_successes": 0,
|
2341
|
+
"total_attempts": 0,
|
2342
|
+
"total_time": 0.0,
|
2343
|
+
"success_rate": 0.0,
|
2344
|
+
"average_attempts": 0.0,
|
2345
|
+
"average_time": 0.0,
|
2346
|
+
}
|
2347
|
+
)
|
2348
|
+
self.time_series_data: Dict[str, List[Tuple[datetime, float]]] = defaultdict(
|
2349
|
+
list
|
2350
|
+
)
|
2351
|
+
self.enable_time_series = False
|
2352
|
+
self._lock = threading.RLock()
|
2353
|
+
|
2354
|
+
def record_session(
|
2355
|
+
self,
|
2356
|
+
session_id: str,
|
2357
|
+
attempts: int,
|
2358
|
+
success: bool,
|
2359
|
+
total_time: float,
|
2360
|
+
strategy_name: str,
|
2361
|
+
) -> None:
|
2362
|
+
"""Record a retry session.
|
2363
|
+
|
2364
|
+
Args:
|
2365
|
+
session_id: Unique session identifier
|
2366
|
+
attempts: Number of attempts made
|
2367
|
+
success: Whether the session ultimately succeeded
|
2368
|
+
total_time: Total time spent on retries
|
2369
|
+
strategy_name: Name of retry strategy used
|
2370
|
+
"""
|
2371
|
+
with self._lock:
|
2372
|
+
self.session_data.append(
|
2373
|
+
{
|
2374
|
+
"session_id": session_id,
|
2375
|
+
"attempts": attempts,
|
2376
|
+
"success": success,
|
2377
|
+
"total_time": total_time,
|
2378
|
+
"strategy_name": strategy_name,
|
2379
|
+
"timestamp": datetime.now(UTC),
|
2380
|
+
}
|
2381
|
+
)
|
2382
|
+
|
2383
|
+
self.total_retry_sessions += 1
|
2384
|
+
self.total_attempts += attempts
|
2385
|
+
if success:
|
2386
|
+
self.total_successes += 1
|
2387
|
+
|
2388
|
+
# Update running average
|
2389
|
+
self.average_attempts_per_session = (
|
2390
|
+
self.total_attempts / self.total_retry_sessions
|
2391
|
+
)
|
2392
|
+
|
2393
|
+
def record_exception(self, exception_type: Type[Exception]) -> None:
|
2394
|
+
"""Record an exception occurrence.
|
2395
|
+
|
2396
|
+
Args:
|
2397
|
+
exception_type: Type of exception that occurred
|
2398
|
+
"""
|
2399
|
+
with self._lock:
|
2400
|
+
self.exception_frequencies[exception_type.__name__] += 1
|
2401
|
+
# Update most common exceptions (top 10)
|
2402
|
+
self.most_common_exceptions = sorted(
|
2403
|
+
self.exception_frequencies.items(), key=lambda x: x[1], reverse=True
|
2404
|
+
)[:10]
|
2405
|
+
|
2406
|
+
def record_strategy_performance(
|
2407
|
+
self, strategy_name: str, attempts: int, success: bool, total_time: float
|
2408
|
+
) -> None:
|
2409
|
+
"""Record performance data for a retry strategy.
|
2410
|
+
|
2411
|
+
Args:
|
2412
|
+
strategy_name: Name of the retry strategy
|
2413
|
+
attempts: Number of attempts made
|
2414
|
+
success: Whether the strategy succeeded
|
2415
|
+
total_time: Total time taken
|
2416
|
+
"""
|
2417
|
+
with self._lock:
|
2418
|
+
perf = self.strategy_performance[strategy_name]
|
2419
|
+
perf["total_uses"] += 1
|
2420
|
+
perf["total_attempts"] += attempts
|
2421
|
+
perf["total_time"] += total_time
|
2422
|
+
|
2423
|
+
if success:
|
2424
|
+
perf["total_successes"] += 1
|
2425
|
+
|
2426
|
+
# Update calculated metrics
|
2427
|
+
perf["success_rate"] = perf["total_successes"] / perf["total_uses"]
|
2428
|
+
perf["average_attempts"] = perf["total_attempts"] / perf["total_uses"]
|
2429
|
+
perf["average_time"] = perf["total_time"] / perf["total_uses"]
|
2430
|
+
|
2431
|
+
def get_strategy_performance(self, strategy_name: str) -> Dict[str, Any]:
|
2432
|
+
"""Get performance metrics for a specific strategy.
|
2433
|
+
|
2434
|
+
Args:
|
2435
|
+
strategy_name: Name of the strategy
|
2436
|
+
|
2437
|
+
Returns:
|
2438
|
+
Performance metrics dictionary
|
2439
|
+
"""
|
2440
|
+
with self._lock:
|
2441
|
+
return dict(self.strategy_performance.get(strategy_name, {}))
|
2442
|
+
|
2443
|
+
def record_time_series_point(
|
2444
|
+
self, timestamp: datetime, metric: str, value: float
|
2445
|
+
) -> None:
|
2446
|
+
"""Record a time series data point.
|
2447
|
+
|
2448
|
+
Args:
|
2449
|
+
timestamp: When the data point was recorded
|
2450
|
+
metric: Name of the metric
|
2451
|
+
value: Metric value
|
2452
|
+
"""
|
2453
|
+
if self.enable_time_series:
|
2454
|
+
with self._lock:
|
2455
|
+
self.time_series_data[metric].append((timestamp, value))
|
2456
|
+
# Keep only last 1000 points per metric
|
2457
|
+
if len(self.time_series_data[metric]) > 1000:
|
2458
|
+
self.time_series_data[metric] = self.time_series_data[metric][
|
2459
|
+
-1000:
|
2460
|
+
]
|
2461
|
+
|
2462
|
+
def get_time_series(self, metric: str) -> List[Tuple[datetime, float]]:
|
2463
|
+
"""Get time series data for a metric.
|
2464
|
+
|
2465
|
+
Args:
|
2466
|
+
metric: Name of the metric
|
2467
|
+
|
2468
|
+
Returns:
|
2469
|
+
List of (timestamp, value) tuples
|
2470
|
+
"""
|
2471
|
+
with self._lock:
|
2472
|
+
return list(self.time_series_data.get(metric, []))
|
2473
|
+
|
2474
|
+
def generate_report(self) -> Dict[str, Any]:
|
2475
|
+
"""Generate comprehensive analytics report.
|
2476
|
+
|
2477
|
+
Returns:
|
2478
|
+
Complete analytics report
|
2479
|
+
"""
|
2480
|
+
with self._lock:
|
2481
|
+
report = {
|
2482
|
+
"generated_at": datetime.now(UTC),
|
2483
|
+
"total_sessions": self.total_retry_sessions,
|
2484
|
+
"total_attempts": self.total_attempts,
|
2485
|
+
"total_successes": self.total_successes,
|
2486
|
+
"success_rate": (
|
2487
|
+
self.total_successes / self.total_retry_sessions
|
2488
|
+
if self.total_retry_sessions > 0
|
2489
|
+
else 0.0
|
2490
|
+
),
|
2491
|
+
"average_attempts": self.average_attempts_per_session,
|
2492
|
+
"most_common_exceptions": self.most_common_exceptions,
|
2493
|
+
"strategy_performance": dict(self.strategy_performance),
|
2494
|
+
"recommendations": self._generate_recommendations(),
|
2495
|
+
}
|
2496
|
+
return report
|
2497
|
+
|
2498
|
+
def _generate_recommendations(self) -> List[str]:
|
2499
|
+
"""Generate recommendations based on analytics.
|
2500
|
+
|
2501
|
+
Returns:
|
2502
|
+
List of recommendation strings
|
2503
|
+
"""
|
2504
|
+
recommendations = []
|
2505
|
+
|
2506
|
+
# Success rate recommendations
|
2507
|
+
if self.total_retry_sessions > 10:
|
2508
|
+
success_rate = self.total_successes / self.total_retry_sessions
|
2509
|
+
if success_rate < 0.5:
|
2510
|
+
recommendations.append(
|
2511
|
+
"Low success rate detected. Consider reviewing exception handling and retry strategies."
|
2512
|
+
)
|
2513
|
+
elif success_rate > 0.95:
|
2514
|
+
recommendations.append(
|
2515
|
+
"High success rate achieved. Current retry configuration appears optimal."
|
2516
|
+
)
|
2517
|
+
|
2518
|
+
# Strategy performance recommendations
|
2519
|
+
if len(self.strategy_performance) > 1:
|
2520
|
+
best_strategy = max(
|
2521
|
+
self.strategy_performance.items(), key=lambda x: x[1]["success_rate"]
|
2522
|
+
)
|
2523
|
+
recommendations.append(
|
2524
|
+
f"Strategy '{best_strategy[0]}' shows best performance with "
|
2525
|
+
f"{best_strategy[1]['success_rate']:.1%} success rate."
|
2526
|
+
)
|
2527
|
+
|
2528
|
+
# Exception pattern recommendations
|
2529
|
+
if self.most_common_exceptions:
|
2530
|
+
most_common = self.most_common_exceptions[0]
|
2531
|
+
recommendations.append(
|
2532
|
+
f"Most common exception: {most_common[0]} ({most_common[1]} occurrences). "
|
2533
|
+
f"Consider targeted handling for this exception type."
|
2534
|
+
)
|
2535
|
+
|
2536
|
+
return recommendations
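
# Pulling a report after recording a session; the keys follow
# generate_report above.
analytics = RetryAnalytics()
analytics.record_session(
    "s-1", attempts=2, success=True, total_time=1.8,
    strategy_name="exponential_backoff",
)
report = analytics.generate_report()
assert report["success_rate"] == 1.0 and report["total_attempts"] == 2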
|
2537
|
+
|
2538
|
+
|
2539
|
+
class RetryPolicyEngine:
|
2540
|
+
"""Comprehensive retry policy engine with pluggable strategies and enterprise integration."""
|
2541
|
+
|
2542
|
+
def __init__(
|
2543
|
+
self,
|
2544
|
+
default_strategy: Optional[RetryStrategy] = None,
|
2545
|
+
exception_classifier: Optional[ExceptionClassifier] = None,
|
2546
|
+
enable_analytics: bool = True,
|
2547
|
+
enable_circuit_breaker_coordination: bool = False,
|
2548
|
+
enable_resource_limit_coordination: bool = False,
|
2549
|
+
circuit_breaker: Optional["CircuitBreaker"] = None,
|
2550
|
+
resource_limit_enforcer: Optional["ResourceLimitEnforcer"] = None,
|
2551
|
+
mode: RetryPolicyMode = RetryPolicyMode.ADAPTIVE,
|
2552
|
+
):
|
2553
|
+
"""Initialize retry policy engine.
|
2554
|
+
|
2555
|
+
Args:
|
2556
|
+
default_strategy: Default retry strategy to use
|
2557
|
+
exception_classifier: Exception classification system
|
2558
|
+
enable_analytics: Enable analytics and metrics collection
|
2559
|
+
enable_circuit_breaker_coordination: Coordinate with circuit breakers
|
2560
|
+
enable_resource_limit_coordination: Coordinate with resource limits
|
2561
|
+
circuit_breaker: CircuitBreaker instance for coordination
|
2562
|
+
resource_limit_enforcer: ResourceLimitEnforcer instance for coordination
|
2563
|
+
mode: Retry policy operation mode
|
2564
|
+
"""
|
2565
|
+
# Initialize default strategy if not provided
|
2566
|
+
if default_strategy is None:
|
2567
|
+
default_strategy = ExponentialBackoffStrategy()
|
2568
|
+
|
2569
|
+
self.default_strategy = default_strategy
|
2570
|
+
self.exception_classifier = exception_classifier or ExceptionClassifier()
|
2571
|
+
self.enable_analytics = enable_analytics
|
2572
|
+
self.enable_circuit_breaker_coordination = enable_circuit_breaker_coordination
|
2573
|
+
self.enable_resource_limit_coordination = enable_resource_limit_coordination
|
2574
|
+
self.circuit_breaker = circuit_breaker
|
2575
|
+
self.resource_limit_enforcer = resource_limit_enforcer
|
2576
|
+
self.mode = mode
|
2577
|
+
|
2578
|
+
# Strategy registry
|
2579
|
+
self.strategies: Dict[str, RetryStrategy] = {
|
2580
|
+
"exponential_backoff": ExponentialBackoffStrategy(),
|
2581
|
+
"linear_backoff": LinearBackoffStrategy(),
|
2582
|
+
"fixed_delay": FixedDelayStrategy(),
|
2583
|
+
"adaptive_retry": AdaptiveRetryStrategy(),
|
2584
|
+
}
|
2585
|
+
|
2586
|
+
# Exception-specific strategies
|
2587
|
+
self.exception_strategies: Dict[Type[Exception], RetryStrategy] = {}
|
2588
|
+
|
2589
|
+
# Metrics and analytics
|
2590
|
+
self.metrics = RetryMetrics() if enable_analytics else None
|
2591
|
+
self.analytics = RetryAnalytics() if enable_analytics else None
|
2592
|
+
|
2593
|
+
# Strategy effectiveness tracking
|
2594
|
+
self.strategy_effectiveness: Dict[str, Dict[str, Any]] = defaultdict(
|
2595
|
+
lambda: {"uses": 0, "successes": 0, "total_attempts": 0, "total_time": 0.0}
|
2596
|
+
)
|
2597
|
+
|
2598
|
+
# Thread safety
|
2599
|
+
self._lock = threading.RLock()
|
2600
|
+
|
2601
|
+
logger.info(f"RetryPolicyEngine initialized with mode: {mode.value}")
|
2602
|
+
|
2603
|
+
def register_strategy(self, name: str, strategy: RetryStrategy) -> None:
|
2604
|
+
"""Register a custom retry strategy.
|
2605
|
+
|
2606
|
+
Args:
|
2607
|
+
name: Strategy name for identification
|
2608
|
+
strategy: RetryStrategy instance
|
2609
|
+
"""
|
2610
|
+
with self._lock:
|
2611
|
+
self.strategies[name] = strategy
|
2612
|
+
logger.info(f"Registered retry strategy: {name}")
|
2613
|
+
|
2614
|
+
def register_strategy_for_exception(
|
2615
|
+
self, exception_type: Type[Exception], strategy: RetryStrategy
|
2616
|
+
) -> None:
|
2617
|
+
"""Register strategy for specific exception type.
|
2618
|
+
|
2619
|
+
Args:
|
2620
|
+
exception_type: Exception type to handle
|
2621
|
+
strategy: RetryStrategy to use for this exception type
|
2622
|
+
"""
|
2623
|
+
with self._lock:
|
2624
|
+
self.exception_strategies[exception_type] = strategy
|
2625
|
+
logger.info(
|
2626
|
+
f"Registered strategy for {exception_type.__name__}: {strategy.name}"
|
2627
|
+
)
|
2628
|
+
|
2629
|
+
def select_strategy(
|
2630
|
+
self, strategy_name: Optional[str] = None, exception: Optional[Exception] = None
|
2631
|
+
) -> RetryStrategy:
|
2632
|
+
"""Select appropriate retry strategy.
|
2633
|
+
|
2634
|
+
Args:
|
2635
|
+
strategy_name: Explicit strategy name to use
|
2636
|
+
exception: Exception that occurred (for strategy selection)
|
2637
|
+
|
2638
|
+
Returns:
|
2639
|
+
Selected RetryStrategy instance
|
2640
|
+
"""
|
2641
|
+
with self._lock:
|
2642
|
+
# Explicit strategy selection
|
2643
|
+
if strategy_name and strategy_name in self.strategies:
|
2644
|
+
return self.strategies[strategy_name]
|
2645
|
+
|
2646
|
+
# Exception-specific strategy selection
|
2647
|
+
if exception:
|
2648
|
+
exception_type = type(exception)
|
2649
|
+
for exc_type, strategy in self.exception_strategies.items():
|
2650
|
+
if issubclass(exception_type, exc_type):
|
2651
|
+
return strategy
|
2652
|
+
|
2653
|
+
# Default strategy
|
2654
|
+
return self.default_strategy
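
# Routing sketch: timeouts get short fixed delays, everything else falls
# through to the default exponential backoff.
engine = RetryPolicyEngine()
engine.register_strategy_for_exception(
    TimeoutError, FixedDelayStrategy(max_attempts=5, delay=0.2)
)
assert engine.select_strategy(exception=TimeoutError("slow")).name == "fixed_delay"
assert engine.select_strategy().name == "exponential_backoff"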
|
2655
|
+
|
2656
|
+
async def execute_with_retry(
|
2657
|
+
self,
|
2658
|
+
func: Callable,
|
2659
|
+
*args,
|
2660
|
+
strategy_name: Optional[str] = None,
|
2661
|
+
timeout: Optional[float] = None,
|
2662
|
+
**kwargs,
|
2663
|
+
) -> RetryResult:
|
2664
|
+
"""Execute function with retry policy.
|
2665
|
+
|
2666
|
+
Args:
|
2667
|
+
func: Function to execute (sync or async)
|
2668
|
+
*args: Function arguments
|
2669
|
+
strategy_name: Specific strategy to use
|
2670
|
+
timeout: Overall timeout for all attempts
|
2671
|
+
**kwargs: Function keyword arguments
|
2672
|
+
|
2673
|
+
Returns:
|
2674
|
+
RetryResult with execution details
|
2675
|
+
"""
|
2676
|
+
session_id = str(uuid.uuid4())
|
2677
|
+
start_time = time.time()
|
2678
|
+
attempts = []
|
2679
|
+
last_exception = None
|
2680
|
+
|
2681
|
+
# Initial strategy selection (may be updated based on exceptions)
|
2682
|
+
current_strategy = self.select_strategy(strategy_name)
|
2683
|
+
|
2684
|
+
logger.debug(
|
2685
|
+
f"Starting retry session {session_id} with strategy: {current_strategy.name}"
|
2686
|
+
)
|
2687
|
+
|
2688
|
+
for attempt_num in range(1, current_strategy.max_attempts + 1):
|
2689
|
+
# Check timeout
|
2690
|
+
if timeout and (time.time() - start_time) >= timeout:
|
2691
|
+
logger.warning(f"Retry session {session_id} timed out after {timeout}s")
|
2692
|
+
break
|
2693
|
+
|
2694
|
+
# Check resource limits if enabled
|
2695
|
+
if self.enable_resource_limit_coordination and self.resource_limit_enforcer:
|
2696
|
+
try:
|
2697
|
+
limits_check = self.resource_limit_enforcer.check_all_limits()
|
2698
|
+
for resource_type, result in limits_check.items():
|
2699
|
+
if not result.can_proceed:
|
2700
|
+
logger.warning(
|
2701
|
+
f"Resource limit prevents retry: {result.message}"
|
2702
|
+
)
|
2703
|
+
return RetryResult(
|
2704
|
+
success=False,
|
2705
|
+
total_attempts=attempt_num,
|
2706
|
+
total_time=time.time() - start_time,
|
2707
|
+
final_exception=ResourceLimitExceededError(
|
2708
|
+
result.message
|
2709
|
+
),
|
2710
|
+
attempts=attempts,
|
2711
|
+
)
|
2712
|
+
except Exception as e:
|
2713
|
+
logger.error(f"Error checking resource limits: {e}")
|
2714
|
+
|
2715
|
+
# Check circuit breaker if enabled
|
2716
|
+
if self.enable_circuit_breaker_coordination and self.circuit_breaker:
|
2717
|
+
try:
|
2718
|
+
# Execute through circuit breaker
|
2719
|
+
attempt_start = time.time()
|
2720
|
+
# Both sync and async callables go through the same awaited
# CircuitBreaker.call, so no branch on iscoroutinefunction is needed.
result = await self.circuit_breaker.call(func, *args, **kwargs)
|
2724
|
+
attempt_time = time.time() - attempt_start
|
2725
|
+
|
2726
|
+
# Success
|
2727
|
+
attempt = RetryAttempt(
|
2728
|
+
timestamp=datetime.now(UTC),
|
2729
|
+
exception_type=type(None),
|
2730
|
+
attempt_number=attempt_num,
|
2731
|
+
delay_used=0.0,
|
2732
|
+
success=True,
|
2733
|
+
execution_time=attempt_time,
|
2734
|
+
)
|
2735
|
+
attempts.append(attempt)
|
2736
|
+
|
2737
|
+
# Record metrics
|
2738
|
+
if self.metrics:
|
2739
|
+
self.metrics.record_attempt(attempt)
|
2740
|
+
|
2741
|
+
# Record strategy effectiveness
|
2742
|
+
self.record_strategy_effectiveness(
|
2743
|
+
current_strategy, attempt_num, True, time.time() - start_time
|
2744
|
+
)
|
2745
|
+
|
2746
|
+
total_time = time.time() - start_time
|
2747
|
+
logger.info(
|
2748
|
+
f"Retry session {session_id} succeeded on attempt {attempt_num}"
|
2749
|
+
)
|
2750
|
+
|
2751
|
+
return RetryResult(
|
2752
|
+
success=True,
|
2753
|
+
value=result,
|
2754
|
+
total_attempts=attempt_num,
|
2755
|
+
total_time=total_time,
|
2756
|
+
attempts=attempts,
|
2757
|
+
)
|
2758
|
+
|
2759
|
+
except CircuitBreakerOpenError as e:
|
2760
|
+
# Circuit breaker is open, fail immediately
|
2761
|
+
logger.warning(
|
2762
|
+
f"Circuit breaker open, failing retry session {session_id}"
|
2763
|
+
)
|
2764
|
+
return RetryResult(
|
2765
|
+
success=False,
|
2766
|
+
total_attempts=attempt_num,
|
2767
|
+
total_time=time.time() - start_time,
|
2768
|
+
final_exception=e,
|
2769
|
+
attempts=attempts,
|
2770
|
+
)
|
2771
|
+
|
2772
|
+
except Exception as e:
|
2773
|
+
last_exception = e
# Mirror the non-circuit-breaker path below: attempt_time is read
# when the failed attempt is recorded, so it must be set here too.
attempt_time = time.time() - attempt_start
|
2774
|
+
else:
|
2775
|
+
# Execute without circuit breaker
|
2776
|
+
try:
|
2777
|
+
attempt_start = time.time()
|
2778
|
+
if asyncio.iscoroutinefunction(func):
|
2779
|
+
result = await func(*args, **kwargs)
|
2780
|
+
else:
|
2781
|
+
result = func(*args, **kwargs)
|
2782
|
+
attempt_time = time.time() - attempt_start
|
2783
|
+
|
2784
|
+
# Success
|
2785
|
+
attempt = RetryAttempt(
|
2786
|
+
timestamp=datetime.now(UTC),
|
2787
|
+
exception_type=type(None),
|
2788
|
+
attempt_number=attempt_num,
|
2789
|
+
delay_used=0.0,
|
2790
|
+
success=True,
|
2791
|
+
execution_time=attempt_time,
|
2792
|
+
)
|
2793
|
+
attempts.append(attempt)
|
2794
|
+
|
2795
|
+
# Record metrics
|
2796
|
+
if self.metrics:
|
2797
|
+
self.metrics.record_attempt(attempt)
|
2798
|
+
|
2799
|
+
# Record strategy effectiveness
|
2800
|
+
self.record_strategy_effectiveness(
|
2801
|
+
current_strategy, attempt_num, True, time.time() - start_time
|
2802
|
+
)
|
2803
|
+
|
2804
|
+
total_time = time.time() - start_time
|
2805
|
+
logger.info(
|
2806
|
+
f"Retry session {session_id} succeeded on attempt {attempt_num}"
|
2807
|
+
)
|
2808
|
+
|
2809
|
+
return RetryResult(
|
2810
|
+
success=True,
|
2811
|
+
value=result,
|
2812
|
+
total_attempts=attempt_num,
|
2813
|
+
total_time=total_time,
|
2814
|
+
attempts=attempts,
|
2815
|
+
)
|
2816
|
+
|
2817
|
+
except Exception as e:
|
2818
|
+
last_exception = e
|
2819
|
+
attempt_time = time.time() - attempt_start
|
2820
|
+
|
2821
|
+
# Handle exception
|
2822
|
+
if last_exception:
|
2823
|
+
# Update strategy selection based on exception
|
2824
|
+
exception_specific_strategy = self.select_strategy(
|
2825
|
+
exception=last_exception
|
2826
|
+
)
|
2827
|
+
if exception_specific_strategy != current_strategy:
|
2828
|
+
logger.debug(
|
2829
|
+
f"Switching strategy from {current_strategy.name} to "
|
2830
|
+
f"{exception_specific_strategy.name} for {type(last_exception).__name__}"
|
2831
|
+
)
|
2832
|
+
current_strategy = exception_specific_strategy
|
2833
|
+
|
2834
|
+
# Check if exception is retriable
|
2835
|
+
if not self.exception_classifier.is_retriable(last_exception):
|
2836
|
+
logger.info(
|
2837
|
+
f"Non-retriable exception in session {session_id}: "
|
2838
|
+
f"{type(last_exception).__name__}: {last_exception}"
|
2839
|
+
)
|
2840
|
+
|
2841
|
+
# Record non-retriable attempt
|
2842
|
+
attempt = RetryAttempt(
|
2843
|
+
timestamp=datetime.now(UTC),
|
2844
|
+
exception_type=type(last_exception),
|
2845
|
+
attempt_number=attempt_num,
|
2846
|
+
delay_used=0.0,
|
2847
|
+
success=False,
|
2848
|
+
execution_time=attempt_time,
|
2849
|
+
error_message=str(last_exception),
|
2850
|
+
)
|
2851
|
+
attempts.append(attempt)
|
2852
|
+
|
2853
|
+
if self.metrics:
|
2854
|
+
self.metrics.record_attempt(attempt)
|
2855
|
+
|
2856
|
+
return RetryResult(
|
2857
|
+
success=False,
|
2858
|
+
total_attempts=attempt_num,
|
2859
|
+
total_time=time.time() - start_time,
|
2860
|
+
final_exception=last_exception,
|
2861
|
+
attempts=attempts,
|
2862
|
+
)
|
2863
|
+
|
2864
|
+
# Calculate delay for next attempt
|
2865
|
+
if attempt_num < current_strategy.max_attempts:
|
2866
|
+
delay = current_strategy.calculate_delay(attempt_num + 1)
|
2867
|
+
|
2868
|
+
# Record failed attempt
|
2869
|
+
attempt = RetryAttempt(
|
2870
|
+
timestamp=datetime.now(UTC),
|
2871
|
+
exception_type=type(last_exception),
|
2872
|
+
attempt_number=attempt_num,
|
2873
|
+
delay_used=delay,
|
2874
|
+
success=False,
|
2875
|
+
execution_time=attempt_time,
|
2876
|
+
error_message=str(last_exception),
|
2877
|
+
)
|
2878
|
+
attempts.append(attempt)
|
2879
|
+
|
2880
|
+
if self.metrics:
|
2881
|
+
self.metrics.record_attempt(attempt)
|
2882
|
+
|
2883
|
+
# Record learning data for adaptive strategies
|
2884
|
+
if isinstance(current_strategy, AdaptiveRetryStrategy):
|
2885
|
+
current_strategy.record_attempt_result(
|
2886
|
+
type(last_exception),
|
2887
|
+
attempt_num,
|
2888
|
+
delay,
|
2889
|
+
False,
|
2890
|
+
attempt_time,
|
2891
|
+
)
|
2892
|
+
|
2893
|
+
logger.warning(
|
2894
|
+
f"Attempt {attempt_num} failed in session {session_id}, "
|
2895
|
+
f"retrying in {delay:.2f}s: {type(last_exception).__name__}: {last_exception}"
|
2896
|
+
)
|
2897
|
+
|
2898
|
+
# Wait before retry
|
2899
|
+
await asyncio.sleep(delay)
|
2900
|
+
else:
|
2901
|
+
# Record final failed attempt
|
2902
|
+
attempt = RetryAttempt(
|
2903
|
+
timestamp=datetime.now(UTC),
|
2904
|
+
exception_type=type(last_exception),
|
2905
|
+
attempt_number=attempt_num,
|
2906
|
+
delay_used=0.0,
|
2907
|
+
success=False,
|
2908
|
+
execution_time=attempt_time,
|
2909
|
+
error_message=str(last_exception),
|
2910
|
+
)
|
2911
|
+
attempts.append(attempt)
|
2912
|
+
|
2913
|
+
if self.metrics:
|
2914
|
+
self.metrics.record_attempt(attempt)
|
2915
|
+
|
2916
|
+
# All attempts failed
|
2917
|
+
total_time = time.time() - start_time
|
2918
|
+
logger.error(
|
2919
|
+
f"Retry session {session_id} failed after {current_strategy.max_attempts} attempts "
|
2920
|
+
f"in {total_time:.2f}s"
|
2921
|
+
)
|
2922
|
+
|
2923
|
+
# Record strategy effectiveness
|
2924
|
+
self.record_strategy_effectiveness(
|
2925
|
+
current_strategy, len(attempts), False, total_time
|
2926
|
+
)
|
2927
|
+
|
2928
|
+
# Record analytics
|
2929
|
+
if self.analytics:
|
2930
|
+
self.analytics.record_session(
|
2931
|
+
session_id,
|
2932
|
+
len(attempts),
|
2933
|
+
False,
|
2934
|
+
total_time,
|
2935
|
+
current_strategy.name,
|
2936
|
+
)
|
2937
|
+
if last_exception:
|
2938
|
+
self.analytics.record_exception(type(last_exception))
|
2939
|
+
|
2940
|
+
return RetryResult(
|
2941
|
+
success=False,
|
2942
|
+
total_attempts=len(attempts),  # may be fewer than max_attempts if the timeout fired
|
2943
|
+
total_time=total_time,
|
2944
|
+
final_exception=last_exception,
|
2945
|
+
attempts=attempts,
|
2946
|
+
)
|
2947
|
+
|
2948
|
+
def record_strategy_effectiveness(
|
2949
|
+
self, strategy: RetryStrategy, attempts: int, success: bool, total_time: float
|
2950
|
+
) -> None:
|
2951
|
+
"""Record effectiveness data for a strategy.
|
2952
|
+
|
2953
|
+
Args:
|
2954
|
+
strategy: Strategy that was used
|
2955
|
+
attempts: Number of attempts made
|
2956
|
+
success: Whether the strategy succeeded
|
2957
|
+
total_time: Total time taken
|
2958
|
+
"""
|
2959
|
+
with self._lock:
|
2960
|
+
effectiveness = self.strategy_effectiveness[strategy.name]
|
2961
|
+
effectiveness["uses"] += 1
|
2962
|
+
effectiveness["total_attempts"] += attempts
|
2963
|
+
effectiveness["total_time"] += total_time
|
2964
|
+
|
2965
|
+
if success:
|
2966
|
+
effectiveness["successes"] += 1
|
2967
|
+
|
2968
|
+
def get_strategy_effectiveness(self) -> Dict[str, Dict[str, Any]]:
|
2969
|
+
"""Get effectiveness statistics for all strategies.
|
2970
|
+
|
2971
|
+
Returns:
|
2972
|
+
Dictionary mapping strategy names to effectiveness stats
|
2973
|
+
"""
|
2974
|
+
with self._lock:
|
2975
|
+
result = {}
|
2976
|
+
for name, data in self.strategy_effectiveness.items():
|
2977
|
+
if data["uses"] > 0:
|
2978
|
+
result[name] = {
|
2979
|
+
"uses": data["uses"],
|
2980
|
+
"success_rate": data["successes"] / data["uses"],
|
2981
|
+
"average_attempts": data["total_attempts"] / data["uses"],
|
2982
|
+
"average_time": data["total_time"] / data["uses"],
|
2983
|
+
}
|
2984
|
+
return result
|
2985
|
+
|
2986
|
+
def get_analytics(self) -> Optional[RetryAnalytics]:
|
2987
|
+
"""Get current analytics data.
|
2988
|
+
|
2989
|
+
Returns:
|
2990
|
+
RetryAnalytics instance or None if analytics disabled
|
2991
|
+
"""
|
2992
|
+
return self.analytics
|
2993
|
+
|
2994
|
+
def get_metrics_summary(self) -> Optional[Dict[str, Any]]:
|
2995
|
+
"""Get metrics summary.
|
2996
|
+
|
2997
|
+
Returns:
|
2998
|
+
Metrics summary dictionary or None if metrics disabled
|
2999
|
+
"""
|
3000
|
+
if self.metrics:
|
3001
|
+
return self.metrics.get_summary_stats()
|
3002
|
+
return None
|
3003
|
+
|
3004
|
+
def reset_metrics(self) -> None:
|
3005
|
+
"""Reset all metrics and analytics data."""
|
3006
|
+
if self.metrics:
|
3007
|
+
self.metrics = RetryMetrics()
|
3008
|
+
if self.analytics:
|
3009
|
+
self.analytics = RetryAnalytics()
|
3010
|
+
with self._lock:
|
3011
|
+
self.strategy_effectiveness.clear()
|
3012
|
+
logger.info("Retry policy metrics reset")
|
3013
|
+
|
3014
|
+
def get_configuration(self) -> Dict[str, Any]:
|
3015
|
+
"""Get current retry policy configuration.
|
3016
|
+
|
3017
|
+
Returns:
|
3018
|
+
Configuration dictionary
|
3019
|
+
"""
|
3020
|
+
with self._lock:
|
3021
|
+
return {
|
3022
|
+
"default_strategy": self.default_strategy.get_config(),
|
3023
|
+
"mode": self.mode.value,
|
3024
|
+
"enable_analytics": self.enable_analytics,
|
3025
|
+
"enable_circuit_breaker_coordination": self.enable_circuit_breaker_coordination,
|
3026
|
+
"enable_resource_limit_coordination": self.enable_resource_limit_coordination,
|
3027
|
+
"registered_strategies": list(self.strategies.keys()),
|
3028
|
+
"exception_specific_strategies": {
|
3029
|
+
exc_type.__name__: strategy.name
|
3030
|
+
for exc_type, strategy in self.exception_strategies.items()
|
3031
|
+
},
|
3032
|
+
"classification_rules": self.exception_classifier.get_classification_rules(),
|
3033
|
+
}
|