kailash 0.9.15__py3-none-any.whl → 0.9.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. kailash/__init__.py +4 -3
  2. kailash/middleware/database/base_models.py +7 -1
  3. kailash/migration/__init__.py +30 -0
  4. kailash/migration/cli.py +340 -0
  5. kailash/migration/compatibility_checker.py +662 -0
  6. kailash/migration/configuration_validator.py +837 -0
  7. kailash/migration/documentation_generator.py +1828 -0
  8. kailash/migration/examples/__init__.py +5 -0
  9. kailash/migration/examples/complete_migration_example.py +692 -0
  10. kailash/migration/migration_assistant.py +715 -0
  11. kailash/migration/performance_comparator.py +760 -0
  12. kailash/migration/regression_detector.py +1141 -0
  13. kailash/migration/tests/__init__.py +6 -0
  14. kailash/migration/tests/test_compatibility_checker.py +403 -0
  15. kailash/migration/tests/test_integration.py +463 -0
  16. kailash/migration/tests/test_migration_assistant.py +397 -0
  17. kailash/migration/tests/test_performance_comparator.py +433 -0
  18. kailash/monitoring/__init__.py +29 -2
  19. kailash/monitoring/asyncsql_metrics.py +275 -0
  20. kailash/nodes/data/async_sql.py +1828 -33
  21. kailash/runtime/local.py +1255 -8
  22. kailash/runtime/monitoring/__init__.py +1 -0
  23. kailash/runtime/monitoring/runtime_monitor.py +780 -0
  24. kailash/runtime/resource_manager.py +3033 -0
  25. kailash/sdk_exceptions.py +21 -0
  26. kailash/workflow/cyclic_runner.py +18 -2
  27. {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/METADATA +1 -1
  28. {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/RECORD +33 -14
  29. {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/WHEEL +0 -0
  30. {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/entry_points.txt +0 -0
  31. {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/licenses/LICENSE +0 -0
  32. {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/licenses/NOTICE +0 -0
  33. {kailash-0.9.15.dist-info → kailash-0.9.17.dist-info}/top_level.txt +0 -0
kailash/runtime/resource_manager.py
@@ -0,0 +1,3033 @@
+ """Runtime resource management and coordination.
+
+ This module provides resource coordination, connection pool management,
+ and runtime lifecycle management for the enhanced LocalRuntime with
+ persistent mode support.
+
+ Components:
+ - ResourceCoordinator: Cross-runtime resource coordination
+ - ConnectionPoolManager: Connection pool sharing and lifecycle
+ - RuntimeLifecycleManager: Runtime startup/shutdown coordination
+ """
+
+ import asyncio
+ import gc
+ import hashlib
+ import logging
+ import random
+ import re
+ import threading
+ import time
+ import uuid
+ from abc import ABC, abstractmethod
+ from collections import defaultdict, deque
+ from dataclasses import dataclass, field
+ from datetime import UTC, datetime, timedelta
+ from enum import Enum
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+
+ import psutil
+
+ from kailash.sdk_exceptions import CircuitBreakerOpenError, ResourceLimitExceededError
+
+ logger = logging.getLogger(__name__)
+
+
+ class ResourceCoordinator:
+     """Coordinates resources across multiple runtime instances."""
+
+     def __init__(self, runtime_id: str, enable_coordination: bool = True):
+         """Initialize resource coordinator.
+
+         Args:
+             runtime_id: Unique identifier for this runtime instance
+             enable_coordination: Whether to enable cross-runtime coordination
+         """
+         self.runtime_id = runtime_id
+         self.enable_coordination = enable_coordination
+
+         # Resource tracking
+         self._shared_resources: Dict[str, Any] = {}
+         self._resource_configs: Dict[str, Dict] = {}
+         self._resource_references: Dict[str, int] = defaultdict(int)
+         self._registered_runtimes: Dict[str, Dict] = {}
+
+         # Thread safety
+         self._coordination_lock = threading.RLock()
+
+         # Async operations tracking (operation name -> metadata)
+         self._async_operations: Dict[str, Dict[str, Any]] = {}
+
+         logger.info(f"ResourceCoordinator initialized for runtime {runtime_id}")
+
+     def register_runtime(self, runtime_id: str, config: Dict[str, Any]) -> None:
+         """Register a runtime instance for coordination.
+
+         Args:
+             runtime_id: Runtime instance identifier
+             config: Runtime configuration for coordination
+         """
+         with self._coordination_lock:
+             self._registered_runtimes[runtime_id] = {
+                 "config": config,
+                 "registered_at": datetime.now(UTC),
+                 "last_seen": datetime.now(UTC),
+             }
+
+         logger.info(f"Registered runtime {runtime_id} for coordination")
+
+     def allocate_shared_resource(
+         self, resource_type: str, resource_config: Dict[str, Any]
+     ) -> str:
+         """Allocate a shared resource with reference counting.
+
+         Args:
+             resource_type: Type of resource (e.g., 'connection_pool')
+             resource_config: Configuration for the resource
+
+         Returns:
+             Resource ID for future reference
+         """
+         with self._coordination_lock:
+             # Generate resource ID based on type and config
+             config_str = str(sorted(resource_config.items()))
+             resource_id = (
+                 f"{resource_type}_{hashlib.md5(config_str.encode()).hexdigest()[:8]}"
+             )
+
+             if resource_id not in self._shared_resources:
+                 # Create new resource
+                 self._shared_resources[resource_id] = {
+                     "type": resource_type,
+                     "config": resource_config,
+                     "created_at": datetime.now(UTC),
+                     "created_by": self.runtime_id,
+                     "instance": None,  # To be set by specific managers
+                 }
+                 self._resource_configs[resource_id] = resource_config
+
+             # Increment reference count
+             self._resource_references[resource_id] += 1
+
+             logger.debug(
+                 f"Allocated shared resource {resource_id}, refs: {self._resource_references[resource_id]}"
+             )
+             return resource_id
+
+     def get_shared_resource(self, resource_id: str) -> Optional[Dict[str, Any]]:
+         """Get shared resource by ID.
+
+         Args:
+             resource_id: Resource identifier
+
+         Returns:
+             Resource info or None if not found
+         """
+         with self._coordination_lock:
+             return self._shared_resources.get(resource_id)
+
+     def add_resource_reference(self, resource_id: str) -> None:
+         """Add reference to shared resource.
+
+         Args:
+             resource_id: Resource identifier
+         """
+         with self._coordination_lock:
+             if resource_id in self._shared_resources:
+                 self._resource_references[resource_id] += 1
+
+     def remove_resource_reference(self, resource_id: str) -> None:
+         """Remove reference to shared resource.
+
+         Args:
+             resource_id: Resource identifier
+         """
+         with self._coordination_lock:
+             if resource_id in self._resource_references:
+                 self._resource_references[resource_id] -= 1
+
+                 # Clean up if no references
+                 if self._resource_references[resource_id] <= 0:
+                     self._cleanup_resource(resource_id)
+
+     def get_resource_reference_count(self, resource_id: str) -> int:
+         """Get reference count for resource.
+
+         Args:
+             resource_id: Resource identifier
+
+         Returns:
+             Current reference count
+         """
+         with self._coordination_lock:
+             return self._resource_references.get(resource_id, 0)
+
+     def _cleanup_resource(self, resource_id: str) -> None:
+         """Clean up resource when no references remain.
+
+         Args:
+             resource_id: Resource identifier
+         """
+         if resource_id in self._shared_resources:
+             resource = self._shared_resources[resource_id]
+             logger.info(
+                 f"Cleaning up shared resource {resource_id} (type: {resource['type']})"
+             )
+
+             # Remove from tracking
+             del self._shared_resources[resource_id]
+             del self._resource_references[resource_id]
+             if resource_id in self._resource_configs:
+                 del self._resource_configs[resource_id]
+
+     async def coordinate_async_operation(self, operation_name: str) -> None:
+         """Coordinate async operation across runtimes.
+
+         Args:
+             operation_name: Name of the operation being coordinated
+         """
+         # Track operation metadata for this runtime
+         self._async_operations[operation_name] = {
+             "started_at": datetime.now(UTC),
+             "runtime_id": self.runtime_id,
+         }
+
+     def get_coordination_status(self) -> Dict[str, Any]:
+         """Get current coordination status.
+
+         Returns:
+             Status information including resources and runtimes
+         """
+         with self._coordination_lock:
+             return {
+                 "runtime_id": self.runtime_id,
+                 "enable_coordination": self.enable_coordination,
+                 "shared_resources": len(self._shared_resources),
+                 "registered_runtimes": len(self._registered_runtimes),
+                 "total_references": sum(self._resource_references.values()),
+             }
+
+
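For orientation, a minimal sketch of the reference-counting flow above; the runtime IDs and pool config are illustrative:

import asyncio
from kailash.runtime.resource_manager import ResourceCoordinator

coordinator = ResourceCoordinator(runtime_id="runtime-a")
coordinator.register_runtime("runtime-b", {"persistent": True})

# Two allocations with the same type and config hash to the same resource ID,
# so the second call only bumps the reference count.
rid1 = coordinator.allocate_shared_resource("connection_pool", {"pool_size": 10})
rid2 = coordinator.allocate_shared_resource("connection_pool", {"pool_size": 10})
assert rid1 == rid2
assert coordinator.get_resource_reference_count(rid1) == 2

# Releasing the last reference triggers _cleanup_resource() internally.
coordinator.remove_resource_reference(rid1)
coordinator.remove_resource_reference(rid1)
assert coordinator.get_resource_reference_count(rid1) == 0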
+ class ConnectionPoolManager:
+     """Manages connection pools with sharing and lifecycle support."""
+
+     def __init__(
+         self,
+         max_pools: int = 20,
+         default_pool_size: int = 10,
+         pool_timeout: int = 30,
+         enable_sharing: bool = True,
+         enable_health_monitoring: bool = True,
+         pool_ttl: int = 3600,
+     ):
+         """Initialize connection pool manager.
+
+         Args:
+             max_pools: Maximum number of pools to maintain
+             default_pool_size: Default size for new pools
+             pool_timeout: Default timeout for pool operations
+             enable_sharing: Enable pool sharing across runtimes
+             enable_health_monitoring: Enable health monitoring
+             pool_ttl: Time-to-live for unused pools in seconds
+         """
+         self.max_pools = max_pools
+         self.default_pool_size = default_pool_size
+         self.pool_timeout = pool_timeout
+         self.enable_sharing = enable_sharing
+         self.enable_health_monitoring = enable_health_monitoring
+         self.pool_ttl = pool_ttl
+
+         # Pool tracking
+         self._pools: Dict[str, Any] = {}
+         self._pool_configs: Dict[str, Dict] = {}
+         self._pool_health: Dict[str, Dict] = {}
+         self._pool_usage: Dict[str, Dict] = {}
+         self._pool_runtimes: Dict[str, Set[str]] = defaultdict(set)
+
+         # Lock for thread safety
+         self._lock = threading.RLock()
+
+         logger.info(f"ConnectionPoolManager initialized (max_pools={max_pools})")
+
+     async def create_pool(self, pool_name: str, pool_config: Dict[str, Any]) -> Any:
+         """Create a new connection pool.
+
+         Args:
+             pool_name: Name for the pool
+             pool_config: Pool configuration
+
+         Returns:
+             Pool instance
+
+         Raises:
+             ResourceLimitExceededError: If max_pools limit exceeded
+         """
+         with self._lock:
+             if len(self._pools) >= self.max_pools:
+                 raise ResourceLimitExceededError(
+                     f"Maximum pools limit ({self.max_pools}) exceeded"
+                 )
+
+             if pool_name in self._pools:
+                 return self._pools[pool_name]
+
+             # Create appropriate pool based on database type
+             database_type = pool_config.get("database_type", "").lower()
+
+             if database_type == "sqlite":
+                 # For SQLite, create a simple connection object
+                 import aiosqlite
+
+                 connection_string = pool_config.get("database_url", ":memory:")
+                 pool = {
+                     "database_type": "sqlite",
+                     "connection_string": connection_string,
+                     "aiosqlite": aiosqlite,
+                 }
+             elif database_type == "postgresql":
+                 # Create real PostgreSQL connection pool using asyncpg
+                 pool = await self._create_postgresql_pool(pool_config)
+             elif database_type == "mysql":
+                 # Create real MySQL connection pool using aiomysql
+                 pool = await self._create_mysql_pool(pool_config)
+             else:
+                 # Fail fast for unsupported database types - no production mock fallbacks
+                 supported_types = ["postgresql", "mysql", "sqlite"]
+                 raise ValueError(
+                     f"Unsupported database type '{database_type}'. "
+                     f"Supported types: {supported_types}. "
+                     f"Configuration error in pool '{pool_name}'"
+                 )
+
+             self._pools[pool_name] = pool
+             self._pool_configs[pool_name] = pool_config.copy()
+             self._pool_usage[pool_name] = {
+                 "created_at": datetime.now(UTC),
+                 "last_used": datetime.now(UTC),
+                 "use_count": 0,
+             }
+
+             if self.enable_health_monitoring:
+                 self._pool_health[pool_name] = {
+                     "status": "healthy",
+                     "active_connections": 0,
+                     "total_connections": pool_config.get(
+                         "pool_size", self.default_pool_size
+                     ),
+                     "last_check": datetime.now(UTC),
+                 }
+
+             logger.info(f"Created connection pool '{pool_name}'")
+             return pool
+
+     async def get_or_create_pool(
+         self, pool_name: str, pool_config: Dict[str, Any]
+     ) -> Any:
+         """Get existing pool or create new one.
+
+         Args:
+             pool_name: Name for the pool
+             pool_config: Pool configuration
+
+         Returns:
+             Pool instance
+         """
+         with self._lock:
+             if pool_name in self._pools:
+                 # Update usage
+                 self._pool_usage[pool_name]["last_used"] = datetime.now(UTC)
+                 self._pool_usage[pool_name]["use_count"] += 1
+                 return self._pools[pool_name]
+
+         return await self.create_pool(pool_name, pool_config)
+
+     async def create_shared_pool(
+         self, pool_name: str, pool_config: Dict[str, Any], runtime_id: str
+     ) -> Any:
+         """Create a shared pool for cross-runtime use.
+
+         Args:
+             pool_name: Name for the pool
+             pool_config: Pool configuration
+             runtime_id: Runtime requesting the pool
+
+         Returns:
+             Pool instance
+         """
+         if not self.enable_sharing:
+             return await self.create_pool(pool_name, pool_config)
+
+         with self._lock:
+             pool = await self.get_or_create_pool(pool_name, pool_config)
+             self._pool_runtimes[pool_name].add(runtime_id)
+
+         logger.info(f"Shared pool '{pool_name}' with runtime {runtime_id}")
+         return pool
+
+     async def get_shared_pool(self, pool_name: str, runtime_id: str) -> Optional[Any]:
+         """Get shared pool for runtime.
+
+         Args:
+             pool_name: Name of the pool
+             runtime_id: Runtime requesting the pool
+
+         Returns:
+             Pool instance or None if not found
+         """
+         with self._lock:
+             if pool_name in self._pools and self.enable_sharing:
+                 self._pool_runtimes[pool_name].add(runtime_id)
+                 return self._pools[pool_name]
+             return None
+
+     def get_pool_runtime_count(self, pool_name: str) -> int:
+         """Get number of runtimes using a pool.
+
+         Args:
+             pool_name: Name of the pool
+
+         Returns:
+             Number of runtimes using the pool
+         """
+         with self._lock:
+             return len(self._pool_runtimes.get(pool_name, set()))
+
+     def get_pool_health(self, pool_name: str) -> Dict[str, Any]:
+         """Get health status for a pool.
+
+         Args:
+             pool_name: Name of the pool
+
+         Returns:
+             Health status dictionary
+         """
+         with self._lock:
+             if pool_name in self._pool_health:
+                 return self._pool_health[pool_name].copy()
+
+             return {
+                 "status": "unknown",
+                 "active_connections": 0,
+                 "total_connections": 0,
+                 "last_check": None,
+             }
+
+     def is_pool_active(self, pool_name: str) -> bool:
+         """Check if pool is active.
+
+         Args:
+             pool_name: Name of the pool
+
+         Returns:
+             True if pool is active
+         """
+         with self._lock:
+             return pool_name in self._pools
+
+     async def close_pool(self, pool_name: str) -> None:
+         """Close and remove a pool with proper error handling and race condition protection.
+
+         Args:
+             pool_name: Name of the pool to close
+         """
+         # Get pool reference under lock but don't hold lock during async operations
+         with self._lock:
+             if pool_name not in self._pools:
+                 logger.debug(f"Pool '{pool_name}' not found for closure")
+                 return
+
+             pool = self._pools[pool_name]
+             # Remove from pools immediately to prevent race conditions
+             del self._pools[pool_name]
+
+         # Close pool outside lock to prevent deadlock
+         close_error = None
+         try:
+             if isinstance(pool, RuntimeManagedPool):
+                 await pool._runtime_close()
+             elif hasattr(pool, "close"):
+                 await pool.close()
+             logger.info(f"Successfully closed connection pool '{pool_name}'")
+         except Exception as e:
+             close_error = e
+             logger.error(f"Failed to close pool '{pool_name}': {e}")
+
+         # Always clean up tracking dictionaries - even if close failed
+         with self._lock:
+             # Remove from all tracking structures
+             self._pool_configs.pop(pool_name, None)
+             self._pool_usage.pop(pool_name, None)
+             self._pool_health.pop(pool_name, None)
+             self._pool_runtimes.pop(pool_name, None)
+
+         # Re-raise close error after cleanup
+         if close_error:
+             raise close_error
+
+     async def cleanup_unused_pools(self) -> int:
+         """Clean up unused pools past TTL.
+
+         Returns:
+             Number of pools cleaned up
+         """
+         cleaned_count = 0
+         current_time = datetime.now(UTC)
+
+         # Identify pools to cleanup while holding lock
+         with self._lock:
+             pools_to_cleanup = []
+
+             for pool_name, usage in self._pool_usage.items():
+                 if (current_time - usage["last_used"]).total_seconds() > self.pool_ttl:
+                     pools_to_cleanup.append(pool_name)
+
+         # Close pools outside the lock to avoid async deadlock
+         for pool_name in pools_to_cleanup:
+             await self.close_pool(pool_name)
+             cleaned_count += 1
+
+         if cleaned_count > 0:
+             logger.info(f"Cleaned up {cleaned_count} unused connection pools")
+
+         return cleaned_count
+
+     async def _create_postgresql_pool(self, pool_config: Dict[str, Any]) -> Any:
+         """Create a real PostgreSQL connection pool using asyncpg."""
+         try:
+             import asyncpg
+         except ImportError:
+             raise ImportError(
+                 "asyncpg not installed. Install with: pip install asyncpg"
+             )
+
+         # Extract connection parameters
+         connection_string = pool_config.get("connection_string") or pool_config.get(
+             "database_url"
+         )
+         if not connection_string:
+             # Build connection string from individual parameters
+             host = pool_config.get("host", "localhost")
+             port = pool_config.get("port", 5432)
+             database = pool_config.get("database", "postgres")
+             user = pool_config.get("user", "postgres")
+             password = pool_config.get("password", "")
+             connection_string = (
+                 f"postgresql://{user}:{password}@{host}:{port}/{database}"
+             )
+
+         # Extract pool size settings
+         min_size = pool_config.get("min_pool_size", 1)
+         max_size = pool_config.get(
+             "pool_size", pool_config.get("max_pool_size", self.default_pool_size)
+         )
+
+         # Create asyncpg pool
+         pool = await asyncpg.create_pool(
+             connection_string, min_size=min_size, max_size=max_size, command_timeout=60
+         )
+
+         logger.info(
+             f"Created PostgreSQL connection pool with {min_size}-{max_size} connections"
+         )
+
+         # Validate pool before wrapping
+         if not await self._validate_pool(pool, "postgresql"):
+             await pool.close()  # Clean up failed pool
+             raise RuntimeError(
+                 f"PostgreSQL pool validation failed for connection: {connection_string}"
+             )
+
+         # Wrap pool to prevent premature closure by node-level cleanup
+         return RuntimeManagedPool(pool)
+
+     async def _create_mysql_pool(self, pool_config: Dict[str, Any]) -> Any:
+         """Create a real MySQL connection pool using aiomysql."""
+         try:
+             import aiomysql
+         except ImportError:
+             raise ImportError(
+                 "aiomysql not installed. Install with: pip install aiomysql"
+             )
+
+         # Extract connection parameters
+         host = pool_config.get("host", "localhost")
+         port = pool_config.get("port", 3306)
+         user = pool_config.get("user", "root")
+         password = pool_config.get("password", "")
+         database = pool_config.get("database", "")
+
+         # Extract pool size settings
+         minsize = pool_config.get("min_pool_size", 1)
+         maxsize = pool_config.get(
+             "pool_size", pool_config.get("max_pool_size", self.default_pool_size)
+         )
+
+         # Create aiomysql pool
+         pool = await aiomysql.create_pool(
+             host=host,
+             port=port,
+             user=user,
+             password=password,
+             db=database,
+             minsize=minsize,
+             maxsize=maxsize,
+             autocommit=True,
+         )
+
+         logger.info(
+             f"Created MySQL connection pool with {minsize}-{maxsize} connections"
+         )
+
+         # Validate pool before wrapping
+         if not await self._validate_pool(pool, "mysql"):
+             # Clean up failed pool; aiomysql's close() is synchronous and
+             # wait_closed() completes the shutdown
+             pool.close()
+             await pool.wait_closed()
+             raise RuntimeError(
+                 f"MySQL pool validation failed for connection: {host}:{port}"
+             )
+
+         # Wrap pool to prevent premature closure by node-level cleanup
+         return RuntimeManagedPool(pool)
+
+     async def _validate_pool(self, pool: Any, database_type: str) -> bool:
+         """Validate that a pool actually works before returning it.
+
+         Args:
+             pool: The database pool to validate
+             database_type: Type of database (postgresql, mysql, sqlite)
+
+         Returns:
+             True if pool is functional, False otherwise
+         """
+         try:
+             if database_type == "postgresql":
+                 async with pool.acquire() as conn:
+                     await conn.fetchrow("SELECT 1 as test_connection")
+                 logger.debug("PostgreSQL pool validation successful")
+             elif database_type == "mysql":
+                 async with pool.acquire() as conn:
+                     async with conn.cursor() as cursor:
+                         await cursor.execute("SELECT 1 as test_connection")
+                         await cursor.fetchone()
+                 logger.debug("MySQL pool validation successful")
+             elif database_type == "sqlite":
+                 # SQLite validation would be different since it uses dict format
+                 logger.debug("SQLite pool validation skipped (not a real pool)")
+             else:
+                 logger.warning(f"Unknown database type for validation: {database_type}")
+                 return False
+             return True
+         except Exception as e:
+             logger.error(f"Pool validation failed for {database_type}: {e}")
+             return False
+
+
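A minimal sharing sketch against the manager above; the SQLite config keeps it self-contained (aiosqlite must be installed), and the pool and runtime names are illustrative:

import asyncio
from kailash.runtime.resource_manager import ConnectionPoolManager

async def main():
    manager = ConnectionPoolManager(max_pools=5)
    config = {"database_type": "sqlite", "database_url": ":memory:"}

    # The first runtime creates the pool; the second one reuses it.
    pool_a = await manager.create_shared_pool("analytics", config, runtime_id="rt-1")
    pool_b = await manager.get_shared_pool("analytics", runtime_id="rt-2")
    assert pool_a is pool_b
    assert manager.get_pool_runtime_count("analytics") == 2

    await manager.close_pool("analytics")
    assert not manager.is_pool_active("analytics")

asyncio.run(main())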
+ class RuntimeManagedPool:
+     """Wrapper for database pools managed by runtime to prevent external closure."""
+
+     def __init__(self, underlying_pool):
+         """Initialize with the real pool instance."""
+         self._underlying_pool = underlying_pool
+         self._is_runtime_managed = True
+         self._pool_type = type(underlying_pool).__name__
+
+         # Pre-validate essential attributes exist to fail fast
+         required_attrs = ["acquire"]
+         for attr in required_attrs:
+             if not hasattr(underlying_pool, attr):
+                 raise ValueError(
+                     f"Invalid pool type '{self._pool_type}': missing required attribute '{attr}'. "
+                     f"Pool must implement acquire() method for database operations."
+                 )
+
+         logger.debug(f"Created RuntimeManagedPool wrapping {self._pool_type}")
+
+     def __getattr__(self, name):
+         """Delegate all attributes to the underlying pool except close()."""
+         if name == "close":
+             # Prevent external closure - only runtime can close
+             return self._no_close
+         try:
+             return getattr(self._underlying_pool, name)
+         except AttributeError as e:
+             # Provide clearer error messages for debugging
+             raise AttributeError(
+                 f"RuntimeManagedPool({self._pool_type}): {e}. "
+                 f"The underlying {self._pool_type} pool does not support attribute '{name}'"
+             ) from e
+
+     async def _no_close(self):
+         """No-op close method to prevent external closure."""
+         logger.debug(f"Ignored attempt to close runtime-managed {self._pool_type} pool")
+
+     async def _runtime_close(self):
+         """Internal method for runtime to actually close the pool."""
+         try:
+             if hasattr(self._underlying_pool, "close"):
+                 # asyncpg's Pool.close() is a coroutine, while aiomysql's close()
+                 # is synchronous and must be followed by wait_closed()
+                 result = self._underlying_pool.close()
+                 if asyncio.iscoroutine(result):
+                     await result
+                 if hasattr(self._underlying_pool, "wait_closed"):
+                     await self._underlying_pool.wait_closed()
+                 logger.debug(f"Successfully closed underlying {self._pool_type} pool")
+             else:
+                 logger.warning(
+                     f"Underlying {self._pool_type} pool has no close() method"
+                 )
+         except Exception as e:
+             logger.error(f"Error closing underlying {self._pool_type} pool: {e}")
+             raise
+
+
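A short sketch of the closure guard; DummyPool is a hypothetical stand-in for an asyncpg or aiomysql pool:

import asyncio
from kailash.runtime.resource_manager import RuntimeManagedPool

class DummyPool:
    """Stand-in pool exposing the acquire()/close() surface the wrapper expects."""
    closed = False

    def acquire(self):  # required by RuntimeManagedPool's fail-fast validation
        raise NotImplementedError

    async def close(self):
        self.closed = True

async def main():
    pool = RuntimeManagedPool(DummyPool())
    await pool.close()            # intercepted: no-op, underlying pool stays open
    assert not pool._underlying_pool.closed
    await pool._runtime_close()   # only the runtime-level manager really closes it
    assert pool._underlying_pool.closed

asyncio.run(main())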
+ class MockConnectionPool:
+     """Mock connection pool for testing."""
+
+     def __init__(self, config: Dict[str, Any], pool_size: int):
+         self.config = config
+         self.pool_size = pool_size
+         self.created_at = datetime.now(UTC)
+
+     async def close(self):
+         """Close the mock pool."""
+         pass
+
+
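The mock pool is a plain test double; a hypothetical pytest-style check:

import asyncio
from datetime import UTC, datetime
from kailash.runtime.resource_manager import MockConnectionPool

def test_mock_pool_records_config():
    pool = MockConnectionPool({"database_type": "postgresql"}, pool_size=5)
    assert pool.pool_size == 5
    assert pool.created_at <= datetime.now(UTC)
    asyncio.run(pool.close())  # close() is a no-op on the mock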
+ class RuntimeLifecycleManager:
+     """Manages runtime lifecycle operations."""
+
+     def __init__(self, runtime_id: str):
+         """Initialize runtime lifecycle manager.
+
+         Args:
+             runtime_id: Unique runtime identifier
+         """
+         self.runtime_id = runtime_id
+         self._is_started = False
+         self._shutdown_hooks: List[Callable] = []
+         self._startup_hooks: List[Callable] = []
+
+     async def startup(self) -> None:
+         """Execute startup sequence."""
+         if self._is_started:
+             return
+
+         logger.info(f"Starting runtime lifecycle for {self.runtime_id}")
+
+         # Execute startup hooks
+         for hook in self._startup_hooks:
+             try:
+                 if asyncio.iscoroutinefunction(hook):
+                     await hook()
+                 else:
+                     hook()
+             except Exception as e:
+                 logger.error(f"Startup hook failed: {e}")
+
+         self._is_started = True
+
+     async def shutdown(self, timeout: int = 30) -> None:
+         """Execute shutdown sequence.
+
+         Args:
+             timeout: Maximum time to wait for shutdown
+         """
+         if not self._is_started:
+             return
+
+         logger.info(f"Shutting down runtime lifecycle for {self.runtime_id}")
+
+         # Execute shutdown hooks with timeout
+         try:
+             await asyncio.wait_for(self._execute_shutdown_hooks(), timeout=timeout)
+         except asyncio.TimeoutError:
+             logger.warning(
+                 f"Shutdown timeout after {timeout}s for runtime {self.runtime_id}"
+             )
+
+         self._is_started = False
+
+     async def _execute_shutdown_hooks(self) -> None:
+         """Execute all shutdown hooks."""
+         for hook in reversed(self._shutdown_hooks):  # Reverse order for cleanup
+             try:
+                 if asyncio.iscoroutinefunction(hook):
+                     await hook()
+                 else:
+                     hook()
+             except Exception as e:
+                 logger.error(f"Shutdown hook failed: {e}")
+
+     def add_startup_hook(self, hook: Callable) -> None:
+         """Add startup hook.
+
+         Args:
+             hook: Function to call during startup
+         """
+         self._startup_hooks.append(hook)
+
+     def add_shutdown_hook(self, hook: Callable) -> None:
+         """Add shutdown hook.
+
+         Args:
+             hook: Function to call during shutdown
+         """
+         self._shutdown_hooks.append(hook)
+
+     @property
+     def is_started(self) -> bool:
+         """Check if runtime is started."""
+         return self._is_started
+
+
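A minimal sketch of hook registration and the reversed shutdown order (hook bodies are placeholders):

import asyncio
from kailash.runtime.resource_manager import RuntimeLifecycleManager

async def main():
    lifecycle = RuntimeLifecycleManager(runtime_id="rt-1")
    lifecycle.add_startup_hook(lambda: print("warm caches"))

    async def drain_pools():
        print("drain pools")

    lifecycle.add_shutdown_hook(drain_pools)
    lifecycle.add_shutdown_hook(lambda: print("flush metrics"))

    await lifecycle.startup()
    assert lifecycle.is_started
    # Shutdown hooks run last-registered-first:
    # prints "flush metrics", then "drain pools".
    await lifecycle.shutdown(timeout=10)

asyncio.run(main())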
+ class CircuitBreakerState(Enum):
+     """Circuit breaker states."""
+
+     CLOSED = "closed"  # Normal operation
+     OPEN = "open"  # Failing, blocking requests
+     HALF_OPEN = "half_open"  # Testing if service recovered
+
+
+ class CircuitBreaker:
+     """Circuit breaker pattern for resilience and fault tolerance.
+
+     Prevents cascading failures by temporarily blocking requests to failing services.
+     """
+
+     def __init__(
+         self,
+         name: str,
+         failure_threshold: int = 5,
+         timeout_seconds: int = 60,
+         expected_exception: type = Exception,
+         recovery_threshold: int = 3,
+     ):
+         """Initialize circuit breaker.
+
+         Args:
+             name: Name of the circuit breaker for logging
+             failure_threshold: Number of failures before opening circuit
+             timeout_seconds: Time to wait before attempting recovery
+             expected_exception: Exception type that triggers the circuit breaker
+             recovery_threshold: Number of successes needed to close circuit from half-open
+         """
+         self.name = name
+         self.failure_threshold = failure_threshold
+         self.timeout_seconds = timeout_seconds
+         self.expected_exception = expected_exception
+         self.recovery_threshold = recovery_threshold
+
+         self._state = CircuitBreakerState.CLOSED
+         self._failure_count = 0
+         self._last_failure_time: Optional[float] = None
+         self._success_count = 0
+
+         # Thread safety
+         self._lock = threading.RLock()
+
+         logger.info(f"Circuit breaker '{name}' initialized")
+
+     async def call(self, func: Callable, *args, **kwargs) -> Any:
+         """Call function with circuit breaker protection.
+
+         Args:
+             func: Function to call (sync or async)
+             *args: Function arguments
+             **kwargs: Function keyword arguments
+
+         Returns:
+             Function result
+
+         Raises:
+             CircuitBreakerOpenError: If circuit is open
+         """
+         with self._lock:
+             if self._state == CircuitBreakerState.OPEN:
+                 if self._should_attempt_reset():
+                     self._state = CircuitBreakerState.HALF_OPEN
+                     logger.info(f"Circuit breaker '{self.name}' moved to HALF_OPEN")
+                 else:
+                     raise CircuitBreakerOpenError(
+                         f"Circuit breaker '{self.name}' is OPEN"
+                     )
+
+         try:
+             # Call function (handle both sync and async)
+             if asyncio.iscoroutinefunction(func):
+                 result = await func(*args, **kwargs)
+             else:
+                 result = func(*args, **kwargs)
+
+             # Success - update state
+             self._on_success()
+             return result
+
+         except self.expected_exception as e:
+             self._on_failure()
+             raise e
+
+     def _should_attempt_reset(self) -> bool:
+         """Check if circuit should attempt reset."""
+         if self._last_failure_time is None:
+             return False
+         return (time.time() - self._last_failure_time) >= self.timeout_seconds
+
+     def _on_success(self) -> None:
+         """Handle successful call."""
+         with self._lock:
+             if self._state == CircuitBreakerState.HALF_OPEN:
+                 self._success_count += 1
+                 if self._success_count >= self.recovery_threshold:
+                     self._reset()
+                     logger.info(f"Circuit breaker '{self.name}' CLOSED after recovery")
+             elif self._state == CircuitBreakerState.CLOSED:
+                 self._reset()  # Reset failure count on success
+
+     def _on_failure(self) -> None:
+         """Handle failed call."""
+         with self._lock:
+             self._failure_count += 1
+             self._last_failure_time = time.time()
+
+             if self._failure_count >= self.failure_threshold:
+                 self._state = CircuitBreakerState.OPEN
+                 logger.warning(
+                     f"Circuit breaker '{self.name}' OPENED after {self._failure_count} failures"
+                 )
+
+     def _reset(self) -> None:
+         """Reset circuit breaker to closed state."""
+         self._state = CircuitBreakerState.CLOSED
+         self._failure_count = 0
+         self._success_count = 0
+         self._last_failure_time = None
+
+     def get_state(self) -> Dict[str, Any]:
+         """Get current circuit breaker state.
+
+         Returns:
+             State information dictionary
+         """
+         with self._lock:
+             return {
+                 "name": self.name,
+                 "state": self._state.value,
+                 "failure_count": self._failure_count,
+                 "success_count": self._success_count,
+                 "last_failure_time": self._last_failure_time,
+                 "failure_threshold": self.failure_threshold,
+                 "timeout_seconds": self.timeout_seconds,
+             }
+
+     def force_open(self) -> None:
+         """Force circuit breaker to open state."""
+         with self._lock:
+             self._state = CircuitBreakerState.OPEN
+             self._failure_count = self.failure_threshold
+             self._last_failure_time = time.time()
+             logger.warning(f"Circuit breaker '{self.name}' forced OPEN")
+
+     def force_close(self) -> None:
+         """Force circuit breaker to closed state."""
+         with self._lock:
+             self._reset()
+             logger.info(f"Circuit breaker '{self.name}' forced CLOSED")
+
+
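For orientation, a minimal sketch of the CLOSED to OPEN to HALF_OPEN transitions driven by the class above (flaky_call is hypothetical):

import asyncio
from kailash.runtime.resource_manager import CircuitBreaker
from kailash.sdk_exceptions import CircuitBreakerOpenError

async def main():
    breaker = CircuitBreaker("db", failure_threshold=2, timeout_seconds=1)

    async def flaky_call():
        raise ConnectionError("db unreachable")

    # Two failures trip the breaker from CLOSED to OPEN.
    for _ in range(2):
        try:
            await breaker.call(flaky_call)
        except ConnectionError:
            pass
    assert breaker.get_state()["state"] == "open"

    # While OPEN, calls are rejected without touching the service.
    try:
        await breaker.call(flaky_call)
    except CircuitBreakerOpenError:
        print("blocked by open circuit")

    # After timeout_seconds the next call probes in HALF_OPEN state;
    # each success counts toward recovery_threshold.
    await asyncio.sleep(1.1)
    await breaker.call(lambda: "ok")

asyncio.run(main())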
+ # CircuitBreakerOpenError now imported from sdk_exceptions
+
+
+ class RetryPolicy:
+     """Retry policy with exponential backoff and jitter.
+
+     Provides configurable retry behavior for transient failures.
+     """
+
+     def __init__(
+         self,
+         max_attempts: int = 3,
+         base_delay: float = 1.0,
+         max_delay: float = 60.0,
+         exponential_base: float = 2.0,
+         jitter: bool = True,
+         retriable_exceptions: tuple = (Exception,),
+     ):
+         """Initialize retry policy.
+
+         Args:
+             max_attempts: Maximum number of attempts
+             base_delay: Base delay in seconds
+             max_delay: Maximum delay in seconds
+             exponential_base: Base for exponential backoff
+             jitter: Whether to add jitter to delays
+             retriable_exceptions: Exception types that should trigger retry
+         """
+         self.max_attempts = max_attempts
+         self.base_delay = base_delay
+         self.max_delay = max_delay
+         self.exponential_base = exponential_base
+         self.jitter = jitter
+         self.retriable_exceptions = retriable_exceptions
+
+         logger.info(f"RetryPolicy initialized (max_attempts={max_attempts})")
+
+     async def call(self, func: Callable, *args, **kwargs) -> Any:
+         """Call function with retry policy.
+
+         Args:
+             func: Function to call (sync or async)
+             *args: Function arguments
+             **kwargs: Function keyword arguments
+
+         Returns:
+             Function result
+
+         Raises:
+             Last exception if all retries fail
+         """
+         last_exception = None
+
+         for attempt in range(1, self.max_attempts + 1):
+             try:
+                 # Call function (handle both sync and async)
+                 if asyncio.iscoroutinefunction(func):
+                     result = await func(*args, **kwargs)
+                 else:
+                     result = func(*args, **kwargs)
+
+                 if attempt > 1:
+                     logger.info(f"Retry succeeded on attempt {attempt}")
+
+                 return result
+
+             except self.retriable_exceptions as e:
+                 last_exception = e
+
+                 if attempt < self.max_attempts:
+                     delay = self._calculate_delay(attempt)
+                     logger.warning(
+                         f"Attempt {attempt} failed, retrying in {delay:.2f}s: {e}"
+                     )
+                     await asyncio.sleep(delay)
+                 else:
+                     logger.error(f"All {self.max_attempts} attempts failed")
+
+         raise last_exception
+
+     def _calculate_delay(self, attempt: int) -> float:
+         """Calculate delay for given attempt.
+
+         Args:
+             attempt: Current attempt number (1-based)
+
+         Returns:
+             Delay in seconds
+         """
+         delay = self.base_delay * (self.exponential_base ** (attempt - 1))
+         delay = min(delay, self.max_delay)
+
+         if self.jitter:
+             # Add up to 25% jitter (random is imported at module level)
+             jitter_amount = delay * 0.25 * random.random()
+             delay += jitter_amount
+
+         return delay
+
+
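A minimal usage sketch: with base_delay=1.0 and exponential_base=2.0, the pre-jitter delays are 1s, 2s, 4s for attempts 1-3, each capped at max_delay (fetch_row is hypothetical):

import asyncio
from kailash.runtime.resource_manager import RetryPolicy

async def main():
    policy = RetryPolicy(
        max_attempts=3,
        base_delay=1.0,
        exponential_base=2.0,
        retriable_exceptions=(TimeoutError, ConnectionError),  # retry only transient errors
    )

    attempts = {"n": 0}

    async def fetch_row():
        attempts["n"] += 1
        if attempts["n"] < 3:
            raise TimeoutError("transient timeout")
        return {"id": 1}

    row = await policy.call(fetch_row)  # sleeps ~1s, then ~2s, then succeeds
    print(row)

asyncio.run(main())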
+ # Resource Limit Enforcement Components
+ # Note: gc and psutil are imported at the top of the file
+
+
+ class EnforcementPolicy(Enum):
+     """Resource limit enforcement policies."""
+
+     STRICT = "strict"  # Immediately reject when limits exceeded
+     WARN = "warn"  # Log warnings but allow execution
+     ADAPTIVE = "adaptive"  # Graceful degradation based on resource pressure
+
+
+ class DegradationStrategy(Enum):
+     """Resource degradation strategies when limits are exceeded."""
+
+     QUEUE = "queue"  # Queue requests when resources exhausted
+     REJECT = "reject"  # Immediately reject when resources exhausted
+     DEFER = "defer"  # Delay execution when resources exhausted
+
+
+ @dataclass
+ class ResourceCheckResult:
+     """Result of resource limit check."""
+
+     can_proceed: bool
+     resource_type: str
+     current_usage: float
+     limit: float
+     usage_percentage: float
+     message: str
+
+
+ @dataclass
+ class ResourceMetrics:
+     """Comprehensive resource usage metrics."""
+
+     timestamp: datetime
+     memory_usage_mb: float
+     memory_usage_percent: float
+     cpu_usage_percent: float
+     active_connections: int
+     peak_memory_mb: float
+     peak_cpu_percent: float
+
+
+ class MemoryLimitExceededError(ResourceLimitExceededError):
+     """Memory limit exceeded error."""
+
+     def __init__(self, current_mb: float, limit_mb: float):
+         super().__init__(
+             f"Memory limit exceeded: {current_mb:.1f}MB > {limit_mb:.1f}MB"
+         )
+         self.current_mb = current_mb
+         self.limit_mb = limit_mb
+
+
+ class ConnectionLimitExceededError(ResourceLimitExceededError):
+     """Connection limit exceeded error."""
+
+     def __init__(self, current_connections: int, max_connections: int):
+         super().__init__(
+             f"Connection limit exceeded: {current_connections} > {max_connections}"
+         )
+         self.current_connections = current_connections
+         self.max_connections = max_connections
+
+
+ class CPULimitExceededError(ResourceLimitExceededError):
+     """CPU limit exceeded error."""
+
+     def __init__(self, current_percent: float, limit_percent: float):
+         super().__init__(
+             f"CPU limit exceeded: {current_percent:.1f}% > {limit_percent:.1f}%"
+         )
+         self.current_percent = current_percent
+         self.limit_percent = limit_percent
+
+
+ class ResourceLimitEnforcer:
+     """Comprehensive resource limit enforcement for LocalRuntime.
+
+     Provides memory, connection, and CPU limit enforcement with configurable
+     policies and graceful degradation strategies. Thread-safe for concurrent
+     workflow execution.
+
+     Features:
+     - Memory limit enforcement with real-time monitoring
+     - Connection pool limit enforcement
+     - CPU usage monitoring and throttling
+     - Configurable enforcement policies (strict, warn, adaptive)
+     - Graceful degradation strategies (queue, reject, defer)
+     - Thread-safe operations
+     - Real-time metrics and alerting
+     """
+
+     def __init__(
+         self,
+         max_memory_mb: Optional[int] = None,
+         max_connections: Optional[int] = None,
+         max_cpu_percent: Optional[float] = None,
+         enforcement_policy: Union[str, EnforcementPolicy] = EnforcementPolicy.ADAPTIVE,
+         degradation_strategy: Union[
+             str, DegradationStrategy
+         ] = DegradationStrategy.DEFER,
+         monitoring_interval: float = 1.0,
+         enable_alerts: bool = True,
+         memory_alert_threshold: float = 0.8,
+         cpu_alert_threshold: float = 0.7,
+         connection_alert_threshold: float = 0.9,
+         enable_metrics_history: bool = True,
+         metrics_history_size: int = 1000,
+     ):
+         """Initialize ResourceLimitEnforcer.
+
+         Args:
+             max_memory_mb: Maximum memory usage in MB (None = no limit)
+             max_connections: Maximum concurrent connections (None = no limit)
+             max_cpu_percent: Maximum CPU usage percentage (None = no limit)
+             enforcement_policy: How to enforce limits (strict/warn/adaptive)
+             degradation_strategy: How to handle resource exhaustion
+             monitoring_interval: Resource monitoring interval in seconds
+             enable_alerts: Enable resource usage alerts
+             memory_alert_threshold: Memory alert threshold (0.0-1.0)
+             cpu_alert_threshold: CPU alert threshold (0.0-1.0)
+             connection_alert_threshold: Connection alert threshold (0.0-1.0)
+             enable_metrics_history: Enable metrics history tracking
+             metrics_history_size: Maximum metrics history entries
+         """
+         # Validate parameters
+         if max_memory_mb is not None and max_memory_mb <= 0:
+             raise ValueError("max_memory_mb must be positive")
+         if max_connections is not None and max_connections <= 0:
+             raise ValueError("max_connections must be positive")
+         if max_cpu_percent is not None and (
+             max_cpu_percent <= 0 or max_cpu_percent > 100
+         ):
+             raise ValueError("max_cpu_percent must be between 0 and 100")
+         if monitoring_interval <= 0:
+             raise ValueError("monitoring_interval must be positive")
+
+         self.max_memory_mb = max_memory_mb
+         self.max_connections = max_connections
+         self.max_cpu_percent = max_cpu_percent
+
+         # Convert string policies to enums
+         if isinstance(enforcement_policy, str):
+             enforcement_policy = EnforcementPolicy(enforcement_policy)
+         if isinstance(degradation_strategy, str):
+             degradation_strategy = DegradationStrategy(degradation_strategy)
+
+         self.enforcement_policy = enforcement_policy
+         self.degradation_strategy = degradation_strategy
+         self.monitoring_interval = monitoring_interval
+         self.enable_alerts = enable_alerts
+
+         # Alert thresholds
+         self.memory_alert_threshold = memory_alert_threshold
+         self.cpu_alert_threshold = cpu_alert_threshold
+         self.connection_alert_threshold = connection_alert_threshold
+
+         # Metrics and history
+         self.enable_metrics_history = enable_metrics_history
+         self.metrics_history_size = metrics_history_size
+         self.metrics_history: deque = deque(maxlen=metrics_history_size)
+
+         # Resource tracking
+         self.active_connections: Set[str] = set()
+         self.connection_queue: deque = deque()
+         self.peak_memory_mb = 0.0
+         self.peak_cpu_percent = 0.0
+
+         # Thread safety
+         self._lock = threading.RLock()
+         self._monitoring_task: Optional[asyncio.Task] = None
+         self._is_monitoring = False
+
+         # Performance tracking
+         self.enforcement_start_time = time.time()
+
+         logger.info(
+             f"ResourceLimitEnforcer initialized: "
+             f"memory={max_memory_mb}MB, connections={max_connections}, "
+             f"cpu={max_cpu_percent}%, policy={enforcement_policy.value}"
+         )
+
+     def check_memory_limits(self) -> ResourceCheckResult:
+         """Check if current memory usage is within limits.
+
+         Returns:
+             ResourceCheckResult indicating if execution can proceed
+         """
+         if self.max_memory_mb is None:
+             return ResourceCheckResult(
+                 can_proceed=True,
+                 resource_type="memory",
+                 current_usage=0,
+                 limit=0,
+                 usage_percentage=0,
+                 message="No memory limit configured",
+             )
+
+         # Get current process memory usage, not system-wide
+         process = psutil.Process()
+         memory_info = process.memory_info()
+         current_mb = memory_info.rss / (1024 * 1024)  # RSS is resident set size
+         usage_percentage = current_mb / self.max_memory_mb
+
+         # Update peak tracking
+         with self._lock:
+             self.peak_memory_mb = max(self.peak_memory_mb, current_mb)
+
+         # Check if over limit
+         if current_mb > self.max_memory_mb:
+             return ResourceCheckResult(
+                 can_proceed=False,
+                 resource_type="memory",
+                 current_usage=current_mb,
+                 limit=self.max_memory_mb,
+                 usage_percentage=usage_percentage,
+                 message=f"Memory usage {current_mb:.1f}MB exceeds limit {self.max_memory_mb}MB",
+             )
+
+         # Check alert threshold
+         if self.enable_alerts and usage_percentage > self.memory_alert_threshold:
+             logger.warning(
+                 f"Memory usage alert: {current_mb:.1f}MB ({usage_percentage:.1%}) "
+                 f"exceeds threshold {self.memory_alert_threshold:.1%}"
+             )
+
+         return ResourceCheckResult(
+             can_proceed=True,
+             resource_type="memory",
+             current_usage=current_mb,
+             limit=self.max_memory_mb,
+             usage_percentage=usage_percentage,
+             message=f"Memory usage {current_mb:.1f}MB within limit",
+         )
+
+     def check_cpu_limits(self) -> ResourceCheckResult:
+         """Check if current CPU usage is within limits.
+
+         Returns:
+             ResourceCheckResult indicating if execution can proceed
+         """
+         if self.max_cpu_percent is None:
+             return ResourceCheckResult(
+                 can_proceed=True,
+                 resource_type="cpu",
+                 current_usage=0,
+                 limit=0,
+                 usage_percentage=0,
+                 message="No CPU limit configured",
+             )
+
+         # Get current CPU usage
+         cpu_percent = psutil.cpu_percent(interval=0.1)
+         usage_percentage = cpu_percent / self.max_cpu_percent
+
+         # Update peak tracking
+         with self._lock:
+             self.peak_cpu_percent = max(self.peak_cpu_percent, cpu_percent)
+
+         # Check if over limit
+         if cpu_percent > self.max_cpu_percent:
+             return ResourceCheckResult(
+                 can_proceed=False,
+                 resource_type="cpu",
+                 current_usage=cpu_percent,
+                 limit=self.max_cpu_percent,
+                 usage_percentage=usage_percentage,
+                 message=f"CPU usage {cpu_percent:.1f}% exceeds limit {self.max_cpu_percent:.1f}%",
+             )
+
+         # Check alert threshold
+         if self.enable_alerts and usage_percentage > self.cpu_alert_threshold:
+             logger.warning(
+                 f"CPU usage alert: {cpu_percent:.1f}% "
+                 f"exceeds threshold {self.cpu_alert_threshold:.1%}"
+             )
+
+         return ResourceCheckResult(
+             can_proceed=True,
+             resource_type="cpu",
+             current_usage=cpu_percent,
+             limit=self.max_cpu_percent,
+             usage_percentage=usage_percentage,
+             message=f"CPU usage {cpu_percent:.1f}% within limit",
+         )
+
+     def request_connection(self, connection_id: str) -> Dict[str, Any]:
+         """Request a new connection within limits.
+
+         Args:
+             connection_id: Unique identifier for the connection
+
+         Returns:
+             Dict with granted status and connection info
+
+         Raises:
+             ConnectionLimitExceededError: If connection limit exceeded
+         """
+         with self._lock:
+             current_count = len(self.active_connections)
+
+             if self.max_connections is None:
+                 self.active_connections.add(connection_id)
+                 return {
+                     "granted": True,
+                     "connection_id": connection_id,
+                     "active_count": len(self.active_connections),
+                 }
+
+             # Check if over limit
+             if current_count >= self.max_connections:
+                 if self.enforcement_policy == EnforcementPolicy.STRICT:
+                     raise ConnectionLimitExceededError(
+                         current_count, self.max_connections
+                     )
+                 elif self.enforcement_policy == EnforcementPolicy.WARN:
+                     logger.warning(
+                         f"Connection limit warning: {current_count} >= {self.max_connections}"
+                     )
+                     self.active_connections.add(connection_id)
+                     return {
+                         "granted": True,
+                         "connection_id": connection_id,
+                         "active_count": len(self.active_connections),
+                         "warning": "Connection limit exceeded but allowed by policy",
+                     }
+                 elif self.enforcement_policy == EnforcementPolicy.ADAPTIVE:
+                     # Handle based on degradation strategy
+                     if self.degradation_strategy == DegradationStrategy.QUEUE:
+                         self.connection_queue.append(connection_id)
+                         return {
+                             "granted": False,
+                             "connection_id": connection_id,
+                             "queued": True,
+                             "queue_position": len(self.connection_queue),
+                         }
+                     elif self.degradation_strategy == DegradationStrategy.REJECT:
+                         raise ConnectionLimitExceededError(
+                             current_count, self.max_connections
+                         )
+                     elif self.degradation_strategy == DegradationStrategy.DEFER:
+                         # Return deferred status - caller should retry later
+                         return {
+                             "granted": False,
+                             "connection_id": connection_id,
+                             "deferred": True,
+                             "retry_after": self.monitoring_interval,
+                         }
+
+             # Check alert threshold
+             usage_percentage = current_count / self.max_connections
+             if (
+                 self.enable_alerts
+                 and usage_percentage > self.connection_alert_threshold
+             ):
+                 logger.warning(
+                     f"Connection usage alert: {current_count}/{self.max_connections} "
+                     f"({usage_percentage:.1%}) exceeds threshold {self.connection_alert_threshold:.1%}"
+                 )
+
+             # Grant connection
+             self.active_connections.add(connection_id)
+             return {
+                 "granted": True,
+                 "connection_id": connection_id,
+                 "active_count": len(self.active_connections),
+             }
+
+     def release_connection(self, connection_id: str) -> None:
+         """Release a connection and process any queued requests.
+
+         Args:
+             connection_id: Connection to release
+         """
+         with self._lock:
+             if connection_id in self.active_connections:
+                 self.active_connections.remove(connection_id)
+
+                 # Process queued connections if using queue strategy
+                 if (
+                     self.connection_queue
+                     and self.degradation_strategy == DegradationStrategy.QUEUE
+                 ):
+                     next_connection_id = self.connection_queue.popleft()
+                     self.active_connections.add(next_connection_id)
+                     logger.info(f"Processed queued connection: {next_connection_id}")
+
+     def get_active_connection_count(self) -> int:
+         """Get current active connection count.
+
+         Returns:
+             Number of active connections
+         """
+         with self._lock:
+             return len(self.active_connections)
+
+     def check_all_limits(self) -> Dict[str, ResourceCheckResult]:
+         """Check all configured resource limits.
+
+         Returns:
+             Dict mapping resource types to check results
+         """
+         results = {}
+
+         # Check memory limits
+         results["memory"] = self.check_memory_limits()
+
+         # Check CPU limits
+         results["cpu"] = self.check_cpu_limits()
+
+         # Check connection limits
+         with self._lock:
+             current_connections = len(self.active_connections)
+
+         if self.max_connections is not None:
+             usage_percentage = current_connections / self.max_connections
+             can_proceed = current_connections < self.max_connections
+
+             if not can_proceed and self.enforcement_policy == EnforcementPolicy.WARN:
+                 can_proceed = True
+
+             results["connections"] = ResourceCheckResult(
+                 can_proceed=can_proceed,
+                 resource_type="connections",
+                 current_usage=current_connections,
+                 limit=self.max_connections,
+                 usage_percentage=usage_percentage,
+                 message=f"Active connections: {current_connections}/{self.max_connections}",
+             )
+         else:
+             results["connections"] = ResourceCheckResult(
+                 can_proceed=True,
+                 resource_type="connections",
+                 current_usage=current_connections,
+                 limit=0,
+                 usage_percentage=0,
+                 message="No connection limit configured",
+             )
+
+         return results
+
+     def enforce_memory_limits(self) -> None:
+         """Enforce memory limits based on policy.
+
+         Raises:
+             MemoryLimitExceededError: If memory limit exceeded and policy is strict
+         """
+         result = self.check_memory_limits()
+
+         if not result.can_proceed:
+             if self.enforcement_policy == EnforcementPolicy.STRICT:
+                 raise MemoryLimitExceededError(result.current_usage, result.limit)
+             elif self.enforcement_policy == EnforcementPolicy.WARN:
+                 logger.warning(f"Memory limit exceeded: {result.message}")
+             elif self.enforcement_policy == EnforcementPolicy.ADAPTIVE:
+                 # Trigger garbage collection to try to free memory
+                 logger.warning(
+                     f"Memory limit exceeded, triggering garbage collection: {result.message}"
+                 )
+                 gc.collect()
+
+                 # Re-check after GC
+                 recheck_result = self.check_memory_limits()
+                 if not recheck_result.can_proceed:
+                     if self.degradation_strategy == DegradationStrategy.REJECT:
+                         raise MemoryLimitExceededError(
+                             recheck_result.current_usage, recheck_result.limit
+                         )
+                     else:
+                         logger.warning(
+                             f"Memory limit still exceeded after GC: {recheck_result.message}"
+                         )
+
+     def enforce_cpu_limits(self) -> None:
+         """Enforce CPU limits based on policy.
+
+         Raises:
+             CPULimitExceededError: If CPU limit exceeded and policy is strict
+         """
+         result = self.check_cpu_limits()
+
+         if not result.can_proceed:
+             if self.enforcement_policy == EnforcementPolicy.STRICT:
+                 raise CPULimitExceededError(result.current_usage, result.limit)
+             elif self.enforcement_policy == EnforcementPolicy.WARN:
+                 logger.warning(f"CPU limit exceeded: {result.message}")
+             elif self.enforcement_policy == EnforcementPolicy.ADAPTIVE:
+                 # Adaptive CPU throttling - introduce delays
+                 throttle_delay = min(1.0, (result.usage_percentage - 1.0) * 2.0)
+                 if throttle_delay > 0:
+                     logger.warning(f"CPU throttling: sleeping {throttle_delay:.2f}s")
+                     time.sleep(throttle_delay)
+
+     def get_resource_metrics(self) -> Dict[str, Any]:
+         """Get current resource usage metrics.
+
+         Returns:
+             Dict containing comprehensive resource metrics
+         """
+         # Get current process metrics, not system-wide
+         process = psutil.Process()
+         memory_info = process.memory_info()
+         cpu_percent = process.cpu_percent()
+
+         with self._lock:
+             current_memory_mb = memory_info.rss / (1024 * 1024)
+             memory_usage_percent = (
+                 (current_memory_mb / self.max_memory_mb * 100)
+                 if self.max_memory_mb
+                 else 0
+             )
+
+             metrics = {
+                 "timestamp": datetime.now(UTC),
+                 "memory_usage_mb": current_memory_mb,
+                 "memory_usage_percent": memory_usage_percent,
+                 "cpu_usage_percent": cpu_percent,
+                 "active_connections": len(self.active_connections),
+                 "peak_memory_mb": self.peak_memory_mb,
+                 "peak_cpu_percent": self.peak_cpu_percent,
+                 "max_memory_mb": self.max_memory_mb,
+                 "max_connections": self.max_connections,
+                 "max_cpu_percent": self.max_cpu_percent,
+                 "enforcement_policy": self.enforcement_policy.value,
+                 "degradation_strategy": self.degradation_strategy.value,
+                 "uptime_seconds": time.time() - self.enforcement_start_time,
+             }
+
+             # Add to history if enabled
+             if self.enable_metrics_history:
+                 self.metrics_history.append(
+                     ResourceMetrics(
+                         timestamp=metrics["timestamp"],
+                         memory_usage_mb=metrics["memory_usage_mb"],
+                         memory_usage_percent=metrics["memory_usage_percent"],
+                         cpu_usage_percent=metrics["cpu_usage_percent"],
+                         active_connections=metrics["active_connections"],
+                         peak_memory_mb=metrics["peak_memory_mb"],
+                         peak_cpu_percent=metrics["peak_cpu_percent"],
+                     )
+                 )
+
+             return metrics
+
+     def get_metrics_history(
+         self, duration_seconds: Optional[int] = None
+     ) -> List[ResourceMetrics]:
+         """Get resource metrics history.
+
+         Args:
+             duration_seconds: Only return metrics from last N seconds (None = all)
+
+         Returns:
+             List of ResourceMetrics from history
+         """
+         if not self.enable_metrics_history:
+             return []
+
+         with self._lock:
+             if duration_seconds is None:
+                 return list(self.metrics_history)
+
+             # Filter by duration
+             cutoff_time = datetime.now(UTC) - timedelta(seconds=duration_seconds)
+             return [
+                 metrics
+                 for metrics in self.metrics_history
+                 if metrics.timestamp >= cutoff_time
+             ]
+
+     async def start_monitoring(self) -> None:
+         """Start asynchronous resource monitoring."""
+         if self._is_monitoring:
+             return
+
+         self._is_monitoring = True
+         self._monitoring_task = asyncio.create_task(self._monitoring_loop())
+         logger.info("Resource monitoring started")
+
+     async def stop_monitoring(self) -> None:
+         """Stop asynchronous resource monitoring."""
+         if not self._is_monitoring:
+             return
+
+         self._is_monitoring = False
+         if self._monitoring_task:
+             self._monitoring_task.cancel()
+             try:
+                 await self._monitoring_task
+             except asyncio.CancelledError:
+                 pass
+         logger.info("Resource monitoring stopped")
+
+     async def _monitoring_loop(self) -> None:
+         """Internal monitoring loop."""
+         while self._is_monitoring:
+             try:
+                 # Collect metrics
+                 self.get_resource_metrics()
+
+                 # Check for limit violations
+                 results = self.check_all_limits()
+
+                 # Log warnings for violations
+                 for resource_type, result in results.items():
+                     if not result.can_proceed and self.enable_alerts:
+                         logger.warning(f"Resource limit violation: {result.message}")
+
+                 await asyncio.sleep(self.monitoring_interval)
+
+             except Exception as e:
+                 logger.error(f"Error in resource monitoring loop: {e}")
+                 await asyncio.sleep(self.monitoring_interval)
+
+
1656
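For orientation, a minimal sketch of driving the monitoring API above. The `ResourceLimitEnforcer` name is taken from the forward references later in this file, but the constructor keywords are assumptions inferred from the attributes read in `get_resource_metrics`, not confirmed by this diff:

    import asyncio

    async def main():
        # Hypothetical construction; the real signature is defined earlier in this module.
        enforcer = ResourceLimitEnforcer(
            max_memory_mb=512, max_connections=100, max_cpu_percent=80.0
        )
        await enforcer.start_monitoring()
        try:
            await asyncio.sleep(5)  # let the loop collect a few samples
            print(enforcer.get_resource_metrics()["memory_usage_mb"])
            print(len(enforcer.get_metrics_history(duration_seconds=60)))
        finally:
            await enforcer.stop_monitoring()

    asyncio.run(main())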
+ # Comprehensive Retry Policy Engine Implementation
+
+
+ class RetryPolicyMode(Enum):
+     """Retry policy operation modes."""
+
+     STRICT = "strict"  # Fail fast on non-retriable exceptions
+     PERMISSIVE = "permissive"  # Allow retries for more exception types
+     ADAPTIVE = "adaptive"  # Learn and adapt retry behavior
+     CIRCUIT_AWARE = "circuit_aware"  # Coordinate with circuit breakers
+
+
+ @dataclass
+ class RetryAttempt:
+     """Record of a single retry attempt."""
+
+     timestamp: datetime
+     exception_type: Type[Exception]
+     attempt_number: int
+     delay_used: float
+     success: bool
+     execution_time: float
+     error_message: str = ""
+
+
+ @dataclass
+ class RetryResult:
+     """Result of retry policy execution."""
+
+     success: bool
+     value: Any = None
+     total_attempts: int = 0
+     total_time: float = 0.0
+     final_exception: Optional[Exception] = None
+     attempts: List[RetryAttempt] = field(default_factory=list)
+
+
+ class RetryStrategy(ABC):
+     """Abstract base class for retry strategies."""
+
+     def __init__(self, name: str, max_attempts: int = 3):
+         """Initialize retry strategy.
+
+         Args:
+             name: Strategy name for identification
+             max_attempts: Maximum number of retry attempts
+         """
+         self.name = name
+         self.max_attempts = max_attempts
+
+     @abstractmethod
+     def calculate_delay(self, attempt: int) -> float:
+         """Calculate delay for the given attempt number.
+
+         Args:
+             attempt: Current attempt number (1-based)
+
+         Returns:
+             Delay in seconds
+         """
+         pass
+
+     def should_retry(self, exception: Exception, attempt: int) -> bool:
+         """Determine if the operation should be retried.
+
+         Args:
+             exception: Exception that occurred
+             attempt: Current attempt number
+
+         Returns:
+             True if should retry, False otherwise
+         """
+         # Default implementation - retry for most exceptions except system ones
+         non_retriable = (KeyboardInterrupt, SystemExit, SystemError)
+         return not isinstance(exception, non_retriable)
+
+     def get_config(self) -> Dict[str, Any]:
+         """Get strategy configuration for serialization.
+
+         Returns:
+             Configuration dictionary
+         """
+         return {"strategy_type": self.name, "max_attempts": self.max_attempts}
+
+
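To make the contract concrete, a hedged sketch of a custom strategy built on this ABC; the Fibonacci schedule is invented here for illustration and is not part of the package:

    class FibonacciBackoffStrategy(RetryStrategy):
        """Illustrative only: delays follow the Fibonacci sequence, capped at max_delay."""

        def __init__(self, max_attempts: int = 5, base_delay: float = 1.0, max_delay: float = 30.0):
            super().__init__("fibonacci_backoff", max_attempts)
            self.base_delay = base_delay
            self.max_delay = max_delay

        def calculate_delay(self, attempt: int) -> float:
            # fib(1)=1, fib(2)=1, fib(3)=2, ... so delays grow 1, 1, 2, 3, 5 x base_delay
            a, b = 1, 1
            for _ in range(attempt - 1):
                a, b = b, a + b
            return min(self.base_delay * a, self.max_delay)

Such a subclass inherits should_retry and get_config, so only the delay schedule needs to be supplied.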
+ class ExponentialBackoffStrategy(RetryStrategy):
+     """Exponential backoff retry strategy with jitter."""
+
+     def __init__(
+         self,
+         max_attempts: int = 3,
+         base_delay: float = 1.0,
+         max_delay: float = 60.0,
+         multiplier: float = 2.0,
+         jitter: bool = True,
+     ):
+         """Initialize exponential backoff strategy.
+
+         Args:
+             max_attempts: Maximum number of attempts
+             base_delay: Base delay in seconds
+             max_delay: Maximum delay in seconds
+             multiplier: Exponential multiplier
+             jitter: Whether to add jitter to delays
+         """
+         super().__init__("exponential_backoff", max_attempts)
+         self.base_delay = base_delay
+         self.max_delay = max_delay
+         self.multiplier = multiplier
+         self.jitter = jitter
+
+     def calculate_delay(self, attempt: int) -> float:
+         """Calculate exponential backoff delay with optional jitter."""
+         delay = self.base_delay * (self.multiplier ** (attempt - 1))
+         delay = min(delay, self.max_delay)
+
+         if self.jitter:
+             # Add up to 25% jitter
+             jitter_amount = delay * 0.25 * random.random()
+             delay += jitter_amount
+
+         return delay
+
+     def get_config(self) -> Dict[str, Any]:
+         """Get exponential backoff configuration."""
+         config = super().get_config()
+         config.update(
+             {
+                 "base_delay": self.base_delay,
+                 "max_delay": self.max_delay,
+                 "multiplier": self.multiplier,
+                 "jitter": self.jitter,
+             }
+         )
+         return config
+
+
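As a quick sanity check of the formula above: with jitter disabled, the delay is exactly base_delay * multiplier**(attempt - 1), capped at max_delay.

    s = ExponentialBackoffStrategy(base_delay=1.0, multiplier=2.0, max_delay=60.0, jitter=False)
    assert [s.calculate_delay(n) for n in (1, 2, 3, 4)] == [1.0, 2.0, 4.0, 8.0]
    # With jitter=True each value gains up to 25% extra, which de-synchronizes retry storms.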
+ class LinearBackoffStrategy(RetryStrategy):
+     """Linear backoff retry strategy with optional jitter."""
+
+     def __init__(
+         self,
+         max_attempts: int = 3,
+         base_delay: float = 1.0,
+         max_delay: float = 30.0,
+         increment: float = 1.0,
+         jitter: bool = True,
+     ):
+         """Initialize linear backoff strategy.
+
+         Args:
+             max_attempts: Maximum number of attempts
+             base_delay: Base delay in seconds
+             max_delay: Maximum delay in seconds
+             increment: Linear increment per attempt
+             jitter: Whether to add jitter to delays
+         """
+         super().__init__("linear_backoff", max_attempts)
+         self.base_delay = base_delay
+         self.max_delay = max_delay
+         self.increment = increment
+         self.jitter = jitter
+
+     def calculate_delay(self, attempt: int) -> float:
+         """Calculate linear backoff delay with optional jitter."""
+         delay = self.base_delay + ((attempt - 1) * self.increment)
+         delay = min(delay, self.max_delay)
+
+         if self.jitter:
+             # Add up to 25% jitter
+             jitter_amount = delay * 0.25 * random.random()
+             delay += jitter_amount
+
+         return delay
+
+     def get_config(self) -> Dict[str, Any]:
+         """Get linear backoff configuration."""
+         config = super().get_config()
+         config.update(
+             {
+                 "base_delay": self.base_delay,
+                 "max_delay": self.max_delay,
+                 "increment": self.increment,
+                 "jitter": self.jitter,
+             }
+         )
+         return config
+
+
+ class FixedDelayStrategy(RetryStrategy):
+     """Fixed delay retry strategy with optional jitter."""
+
+     def __init__(self, max_attempts: int = 3, delay: float = 1.0, jitter: bool = True):
+         """Initialize fixed delay strategy.
+
+         Args:
+             max_attempts: Maximum number of attempts
+             delay: Fixed delay in seconds
+             jitter: Whether to add jitter to delays
+         """
+         super().__init__("fixed_delay", max_attempts)
+         self.delay = delay
+         self.jitter = jitter
+
+     def calculate_delay(self, attempt: int) -> float:
+         """Calculate fixed delay with optional jitter."""
+         delay = self.delay
+
+         if self.jitter:
+             # Add up to 25% jitter
+             jitter_amount = delay * 0.25 * random.random()
+             delay += jitter_amount
+
+         return delay
+
+     def get_config(self) -> Dict[str, Any]:
+         """Get fixed delay configuration."""
+         config = super().get_config()
+         config.update({"delay": self.delay, "jitter": self.jitter})
+         return config
+
+
+ class AdaptiveRetryStrategy(RetryStrategy):
+     """Adaptive retry strategy that learns from historical success/failure patterns."""
+
+     def __init__(
+         self,
+         max_attempts: int = 3,
+         initial_delay: float = 1.0,
+         min_delay: float = 0.1,
+         max_delay: float = 30.0,
+         learning_rate: float = 0.1,
+         history_size: int = 1000,
+     ):
+         """Initialize adaptive retry strategy.
+
+         Args:
+             max_attempts: Maximum number of attempts
+             initial_delay: Initial delay for new exception types
+             min_delay: Minimum delay bound
+             max_delay: Maximum delay bound
+             learning_rate: How quickly to adapt (0.0-1.0)
+             history_size: Maximum number of attempts to remember
+         """
+         super().__init__("adaptive_retry", max_attempts)
+         self.initial_delay = initial_delay
+         self.min_delay = min_delay
+         self.max_delay = max_delay
+         self.learning_rate = learning_rate
+         self.history_size = history_size
+
+         # Learning data structures
+         self.attempt_history: deque = deque(maxlen=history_size)
+         self.exception_delays: Dict[Type[Exception], float] = {}
+         self.success_rates: Dict[Type[Exception], Tuple[int, int]] = defaultdict(
+             lambda: (0, 0)
+         )
+
+         # Thread safety for learning data
+         self._learning_lock = threading.RLock()
+
+     def calculate_delay(
+         self, attempt: int, exception_type: Type[Exception] = Exception
+     ) -> float:
+         """Calculate adaptive delay based on learned patterns."""
+         with self._learning_lock:
+             if exception_type in self.exception_delays:
+                 base_delay = self.exception_delays[exception_type]
+             else:
+                 base_delay = self.initial_delay
+
+             # Apply attempt multiplier with learned adjustments
+             delay = base_delay * (1.2 ** (attempt - 1))
+             return max(self.min_delay, min(delay, self.max_delay))
+
+     def get_recommended_delay(
+         self, exception_type: Type[Exception], attempt: int
+     ) -> float:
+         """Get recommended delay for specific exception type and attempt."""
+         return self.calculate_delay(attempt, exception_type)
+
+     def record_attempt_result(
+         self,
+         exception_type: Type[Exception],
+         attempt: int,
+         delay_used: float,
+         success: bool,
+         execution_time: float = 0.0,
+     ) -> None:
+         """Record the result of an attempt for learning.
+
+         Args:
+             exception_type: Type of exception that occurred
+             attempt: Attempt number
+             delay_used: Delay that was used
+             success: Whether the attempt succeeded
+             execution_time: How long the operation took
+         """
+         with self._learning_lock:
+             # Record in history
+             self.attempt_history.append(
+                 {
+                     "exception_type": exception_type,
+                     "attempt": attempt,
+                     "delay_used": delay_used,
+                     "success": success,
+                     "execution_time": execution_time,
+                     "timestamp": datetime.now(UTC),
+                 }
+             )
+
+             # Update success rates
+             successes, failures = self.success_rates[exception_type]
+             if success:
+                 successes += 1
+             else:
+                 failures += 1
+             self.success_rates[exception_type] = (successes, failures)
+
+             # Adapt delay based on result
+             current_delay = self.exception_delays.get(
+                 exception_type, self.initial_delay
+             )
+
+             if success:
+                 # Successful retry - reduce delay slightly
+                 new_delay = current_delay * (1.0 - self.learning_rate * 0.5)
+             else:
+                 # Failed retry - increase delay
+                 new_delay = current_delay * (1.0 + self.learning_rate)
+
+             # Apply bounds
+             new_delay = max(self.min_delay, min(new_delay, self.max_delay))
+             self.exception_delays[exception_type] = new_delay
+
+             logger.debug(
+                 f"Adaptive retry learned: {exception_type.__name__} delay "
+                 f"{current_delay:.2f}s -> {new_delay:.2f}s (success: {success})"
+             )
+
+     def get_learning_stats(self) -> Dict[str, Any]:
+         """Get statistics about learned patterns.
+
+         Returns:
+             Dictionary containing learning statistics
+         """
+         with self._learning_lock:
+             return {
+                 "total_attempts": len(self.attempt_history),
+                 "unique_exceptions": len(self.exception_delays),
+                 "learned_delays": {
+                     exc_type.__name__: delay
+                     for exc_type, delay in self.exception_delays.items()
+                 },
+                 "success_rates": {
+                     exc_type.__name__: (
+                         successes / (successes + failures)
+                         if (successes + failures) > 0
+                         else 0.0
+                     )
+                     for exc_type, (successes, failures) in self.success_rates.items()
+                 },
+             }
+
+     def get_config(self) -> Dict[str, Any]:
+         """Get adaptive strategy configuration."""
+         config = super().get_config()
+         config.update(
+             {
+                 "initial_delay": self.initial_delay,
+                 "min_delay": self.min_delay,
+                 "max_delay": self.max_delay,
+                 "learning_rate": self.learning_rate,
+                 "history_size": self.history_size,
+             }
+         )
+         return config
+
+
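A short sketch of the feedback loop this class implements: with learning_rate=0.1, each failure multiplies the learned per-exception delay by 1.1 and each success by 0.95, clamped to [min_delay, max_delay]. The call values below are arbitrary illustrations:

    adaptive = AdaptiveRetryStrategy(initial_delay=1.0, learning_rate=0.1)

    adaptive.record_attempt_result(ConnectionError, attempt=1, delay_used=1.0, success=False)
    adaptive.record_attempt_result(ConnectionError, attempt=2, delay_used=1.1, success=False)
    adaptive.record_attempt_result(ConnectionError, attempt=3, delay_used=1.21, success=True)

    # Learned delay: 1.0 * 1.1 * 1.1 * 0.95 ~= 1.15
    print(adaptive.get_learning_stats()["learned_delays"])
    print(adaptive.get_recommended_delay(ConnectionError, attempt=2))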
+ class ExceptionClassifier:
+     """Smart exception classification for retry decisions."""
+
+     def __init__(self):
+         """Initialize exception classifier with built-in rules."""
+         # Built-in retriable exceptions (network, temporary failures)
+         self.retriable_exceptions: Set[Type[Exception]] = {
+             ConnectionError,
+             TimeoutError,
+             OSError,  # Network-related OS errors
+             RuntimeError,  # General runtime issues
+             ValueError,  # Often temporary data issues
+         }
+
+         # Built-in non-retriable exceptions (system, user, permanent)
+         self.non_retriable_exceptions: Set[Type[Exception]] = {
+             KeyboardInterrupt,
+             SystemExit,
+             SystemError,
+             MemoryError,
+             RecursionError,
+             SyntaxError,
+             TypeError,  # Usually indicates programming errors
+             AttributeError,  # Usually permanent
+             ImportError,  # Usually permanent
+         }
+
+         # Pattern-based rules (regex patterns to match exception messages)
+         self.retriable_patterns: List[Tuple[re.Pattern, bool]] = []  # (pattern, case_sensitive)
+         self.non_retriable_patterns: List[Tuple[re.Pattern, bool]] = []
+
+         # Lock for thread safety
+         self._lock = threading.RLock()
+
+         logger.info("ExceptionClassifier initialized with built-in rules")
+
+     def is_retriable(self, exception: Exception) -> bool:
+         """Determine if an exception is retriable.
+
+         Args:
+             exception: Exception to classify
+
+         Returns:
+             True if the exception is retriable, False otherwise
+         """
+         with self._lock:
+             exception_type = type(exception)
+             exception_message = str(exception)
+
+             # Check non-retriable patterns first (higher priority)
+             for pattern, case_sensitive in self.non_retriable_patterns:
+                 if pattern.search(exception_message):
+                     logger.debug(
+                         f"Exception '{exception_message}' matched non-retriable pattern"
+                     )
+                     return False
+
+             # Check non-retriable exception types
+             for non_retriable_type in self.non_retriable_exceptions:
+                 if issubclass(exception_type, non_retriable_type):
+                     logger.debug(
+                         f"Exception type {exception_type.__name__} is non-retriable"
+                     )
+                     return False
+
+             # Check retriable patterns
+             for pattern, case_sensitive in self.retriable_patterns:
+                 if pattern.search(exception_message):
+                     logger.debug(
+                         f"Exception '{exception_message}' matched retriable pattern"
+                     )
+                     return True
+
+             # Check retriable exception types
+             for retriable_type in self.retriable_exceptions:
+                 if issubclass(exception_type, retriable_type):
+                     logger.debug(
+                         f"Exception type {exception_type.__name__} is retriable"
+                     )
+                     return True
+
+             # Default to non-retriable for unknown exceptions
+             logger.debug(
+                 f"Exception type {exception_type.__name__} not classified, defaulting to non-retriable"
+             )
+             return False
+
+     def add_retriable_exception(self, exception_type: Type[Exception]) -> None:
+         """Add an exception type to retriable list.
+
+         Args:
+             exception_type: Exception type to mark as retriable
+         """
+         with self._lock:
+             self.retriable_exceptions.add(exception_type)
+             # Remove from non-retriable if present
+             self.non_retriable_exceptions.discard(exception_type)
+
+         logger.info(f"Added {exception_type.__name__} to retriable exceptions")
+
+     def add_non_retriable_exception(self, exception_type: Type[Exception]) -> None:
+         """Add an exception type to non-retriable list.
+
+         Args:
+             exception_type: Exception type to mark as non-retriable
+         """
+         with self._lock:
+             self.non_retriable_exceptions.add(exception_type)
+             # Remove from retriable if present
+             self.retriable_exceptions.discard(exception_type)
+
+         logger.info(f"Added {exception_type.__name__} to non-retriable exceptions")
+
+     def add_retriable_pattern(self, pattern: str, case_sensitive: bool = True) -> None:
+         """Add a regex pattern for retriable exceptions.
+
+         Args:
+             pattern: Regex pattern to match exception messages
+             case_sensitive: Whether the pattern matching is case-sensitive
+         """
+         with self._lock:
+             flags = 0 if case_sensitive else re.IGNORECASE
+             compiled_pattern = re.compile(pattern, flags)
+             self.retriable_patterns.append((compiled_pattern, case_sensitive))
+
+         logger.info(
+             f"Added retriable pattern: {pattern} (case_sensitive: {case_sensitive})"
+         )
+
+     def add_non_retriable_pattern(
+         self, pattern: str, case_sensitive: bool = True
+     ) -> None:
+         """Add a regex pattern for non-retriable exceptions.
+
+         Args:
+             pattern: Regex pattern to match exception messages
+             case_sensitive: Whether the pattern matching is case-sensitive
+         """
+         with self._lock:
+             flags = 0 if case_sensitive else re.IGNORECASE
+             compiled_pattern = re.compile(pattern, flags)
+             self.non_retriable_patterns.append((compiled_pattern, case_sensitive))
+
+         logger.info(
+             f"Added non-retriable pattern: {pattern} (case_sensitive: {case_sensitive})"
+         )
+
+     def get_classification_rules(self) -> Dict[str, Any]:
+         """Get current classification rules.
+
+         Returns:
+             Dictionary containing all classification rules
+         """
+         with self._lock:
+             return {
+                 "retriable_exceptions": [
+                     exc.__name__ for exc in self.retriable_exceptions
+                 ],
+                 "non_retriable_exceptions": [
+                     exc.__name__ for exc in self.non_retriable_exceptions
+                 ],
+                 "retriable_patterns": [
+                     (p.pattern, cs) for p, cs in self.retriable_patterns
+                 ],
+                 "non_retriable_patterns": [
+                     (p.pattern, cs) for p, cs in self.non_retriable_patterns
+                 ],
+             }
+
+
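A usage sketch of the classifier; the regex is an example, not shipped configuration. Note the precedence encoded in is_retriable: an explicit non-retriable registration overrides an inherited retriable type (PermissionError subclasses OSError, which is retriable by default):

    classifier = ExceptionClassifier()
    classifier.add_retriable_pattern(r"connection reset|too many connections", case_sensitive=False)
    classifier.add_non_retriable_exception(PermissionError)

    assert classifier.is_retriable(ConnectionError("connection refused"))     # built-in retriable type
    assert classifier.is_retriable(Exception("Connection reset by peer"))     # unclassified type, pattern decides
    assert not classifier.is_retriable(PermissionError("denied"))             # explicit override wins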
+ class RetryMetrics:
+     """Comprehensive retry metrics collection and analysis."""
+
+     def __init__(self):
+         """Initialize retry metrics collector."""
+         self.total_attempts = 0
+         self.total_successes = 0
+         self.total_failures = 0
+         self.attempt_history: List[RetryAttempt] = []
+
+         # Performance metrics
+         self.total_delay_time = 0.0
+         self.total_execution_time = 0.0
+
+         # Exception tracking
+         self.exception_counts: Dict[str, int] = defaultdict(int)
+
+         # Thread safety
+         self._lock = threading.RLock()
+
+     def record_attempt(self, attempt: RetryAttempt) -> None:
+         """Record a retry attempt.
+
+         Args:
+             attempt: RetryAttempt object with attempt details
+         """
+         with self._lock:
+             self.attempt_history.append(attempt)
+             self.total_attempts += 1
+
+             if attempt.success:
+                 self.total_successes += 1
+             else:
+                 self.total_failures += 1
+
+             self.total_delay_time += attempt.delay_used
+             self.total_execution_time += attempt.execution_time
+             self.exception_counts[attempt.exception_type.__name__] += 1
+
+     @property
+     def success_rate(self) -> float:
+         """Calculate overall success rate."""
+         if self.total_attempts == 0:
+             return 0.0
+         return self.total_successes / self.total_attempts
+
+     @property
+     def average_delay(self) -> float:
+         """Calculate average delay between attempts."""
+         if self.total_attempts == 0:
+             return 0.0
+         return self.total_delay_time / self.total_attempts
+
+     @property
+     def average_execution_time(self) -> float:
+         """Calculate average execution time per attempt."""
+         if self.total_attempts == 0:
+             return 0.0
+         return self.total_execution_time / self.total_attempts
+
+     def get_exception_breakdown(self) -> Dict[str, int]:
+         """Get breakdown of exceptions by type.
+
+         Returns:
+             Dictionary mapping exception names to counts
+         """
+         with self._lock:
+             return dict(self.exception_counts)
+
+     def get_attempt_timeline(self) -> List[Dict[str, Any]]:
+         """Get chronological timeline of attempts.
+
+         Returns:
+             List of attempt dictionaries sorted by timestamp
+         """
+         with self._lock:
+             timeline = []
+             for attempt in sorted(self.attempt_history, key=lambda a: a.timestamp):
+                 timeline.append(
+                     {
+                         "timestamp": attempt.timestamp,
+                         "attempt_number": attempt.attempt_number,
+                         "exception_type": attempt.exception_type.__name__,
+                         "delay_used": attempt.delay_used,
+                         "success": attempt.success,
+                         "execution_time": attempt.execution_time,
+                         "error_message": attempt.error_message,
+                     }
+                 )
+             return timeline
+
+     def get_summary_stats(self) -> Dict[str, Any]:
+         """Get comprehensive summary statistics.
+
+         Returns:
+             Dictionary containing all metrics
+         """
+         with self._lock:
+             return {
+                 "total_attempts": self.total_attempts,
+                 "total_successes": self.total_successes,
+                 "total_failures": self.total_failures,
+                 "success_rate": self.success_rate,
+                 "average_delay": self.average_delay,
+                 "average_execution_time": self.average_execution_time,
+                 "total_delay_time": self.total_delay_time,
+                 "total_execution_time": self.total_execution_time,
+                 "unique_exceptions": len(self.exception_counts),
+                 "most_common_exception": (
+                     max(self.exception_counts.items(), key=lambda x: x[1])[0]
+                     if self.exception_counts
+                     else None
+                 ),
+             }
+
+
+ @dataclass
+ class RetryAnalytics:
+     """Advanced retry analytics and reporting."""
+
+     total_retry_sessions: int = 0
+     total_attempts: int = 0
+     total_successes: int = 0
+     average_attempts_per_session: float = 0.0
+     most_common_exceptions: List[Tuple[str, int]] = field(default_factory=list)
+
+     def __post_init__(self):
+         """Initialize analytics collections."""
+         self.session_data: List[Dict[str, Any]] = []
+         self.exception_frequencies: Dict[str, int] = defaultdict(int)
+         self.strategy_performance: Dict[str, Dict[str, Any]] = defaultdict(
+             lambda: {
+                 "total_uses": 0,
+                 "total_successes": 0,
+                 "total_attempts": 0,
+                 "total_time": 0.0,
+                 "success_rate": 0.0,
+                 "average_attempts": 0.0,
+                 "average_time": 0.0,
+             }
+         )
+         self.time_series_data: Dict[str, List[Tuple[datetime, float]]] = defaultdict(
+             list
+         )
+         self.enable_time_series = False
+         self._lock = threading.RLock()
+
+     def record_session(
+         self,
+         session_id: str,
+         attempts: int,
+         success: bool,
+         total_time: float,
+         strategy_name: str,
+     ) -> None:
+         """Record a retry session.
+
+         Args:
+             session_id: Unique session identifier
+             attempts: Number of attempts made
+             success: Whether the session ultimately succeeded
+             total_time: Total time spent on retries
+             strategy_name: Name of retry strategy used
+         """
+         with self._lock:
+             self.session_data.append(
+                 {
+                     "session_id": session_id,
+                     "attempts": attempts,
+                     "success": success,
+                     "total_time": total_time,
+                     "strategy_name": strategy_name,
+                     "timestamp": datetime.now(UTC),
+                 }
+             )
+
+             self.total_retry_sessions += 1
+             self.total_attempts += attempts
+             if success:
+                 self.total_successes += 1
+
+             # Update running average
+             self.average_attempts_per_session = (
+                 self.total_attempts / self.total_retry_sessions
+             )
+
+     def record_exception(self, exception_type: Type[Exception]) -> None:
+         """Record an exception occurrence.
+
+         Args:
+             exception_type: Type of exception that occurred
+         """
+         with self._lock:
+             self.exception_frequencies[exception_type.__name__] += 1
+             # Update most common exceptions (top 10)
+             self.most_common_exceptions = sorted(
+                 self.exception_frequencies.items(), key=lambda x: x[1], reverse=True
+             )[:10]
+
+     def record_strategy_performance(
+         self, strategy_name: str, attempts: int, success: bool, total_time: float
+     ) -> None:
+         """Record performance data for a retry strategy.
+
+         Args:
+             strategy_name: Name of the retry strategy
+             attempts: Number of attempts made
+             success: Whether the strategy succeeded
+             total_time: Total time taken
+         """
+         with self._lock:
+             perf = self.strategy_performance[strategy_name]
+             perf["total_uses"] += 1
+             perf["total_attempts"] += attempts
+             perf["total_time"] += total_time
+
+             if success:
+                 perf["total_successes"] += 1
+
+             # Update calculated metrics
+             perf["success_rate"] = perf["total_successes"] / perf["total_uses"]
+             perf["average_attempts"] = perf["total_attempts"] / perf["total_uses"]
+             perf["average_time"] = perf["total_time"] / perf["total_uses"]
+
+     def get_strategy_performance(self, strategy_name: str) -> Dict[str, Any]:
+         """Get performance metrics for a specific strategy.
+
+         Args:
+             strategy_name: Name of the strategy
+
+         Returns:
+             Performance metrics dictionary
+         """
+         with self._lock:
+             return dict(self.strategy_performance.get(strategy_name, {}))
+
+     def record_time_series_point(
+         self, timestamp: datetime, metric: str, value: float
+     ) -> None:
+         """Record a time series data point.
+
+         Args:
+             timestamp: When the data point was recorded
+             metric: Name of the metric
+             value: Metric value
+         """
+         if self.enable_time_series:
+             with self._lock:
+                 self.time_series_data[metric].append((timestamp, value))
+                 # Keep only last 1000 points per metric
+                 if len(self.time_series_data[metric]) > 1000:
+                     self.time_series_data[metric] = self.time_series_data[metric][-1000:]
+
+     def get_time_series(self, metric: str) -> List[Tuple[datetime, float]]:
+         """Get time series data for a metric.
+
+         Args:
+             metric: Name of the metric
+
+         Returns:
+             List of (timestamp, value) tuples
+         """
+         with self._lock:
+             return list(self.time_series_data.get(metric, []))
+
+     def generate_report(self) -> Dict[str, Any]:
+         """Generate comprehensive analytics report.
+
+         Returns:
+             Complete analytics report
+         """
+         with self._lock:
+             report = {
+                 "generated_at": datetime.now(UTC),
+                 "total_sessions": self.total_retry_sessions,
+                 "total_attempts": self.total_attempts,
+                 "total_successes": self.total_successes,
+                 "success_rate": (
+                     self.total_successes / self.total_retry_sessions
+                     if self.total_retry_sessions > 0
+                     else 0.0
+                 ),
+                 "average_attempts": self.average_attempts_per_session,
+                 "most_common_exceptions": self.most_common_exceptions,
+                 "strategy_performance": dict(self.strategy_performance),
+                 "recommendations": self._generate_recommendations(),
+             }
+             return report
+
+     def _generate_recommendations(self) -> List[str]:
+         """Generate recommendations based on analytics.
+
+         Returns:
+             List of recommendation strings
+         """
+         recommendations = []
+
+         # Success rate recommendations
+         if self.total_retry_sessions > 10:
+             success_rate = self.total_successes / self.total_retry_sessions
+             if success_rate < 0.5:
+                 recommendations.append(
+                     "Low success rate detected. Consider reviewing exception handling and retry strategies."
+                 )
+             elif success_rate > 0.95:
+                 recommendations.append(
+                     "High success rate achieved. Current retry configuration appears optimal."
+                 )
+
+         # Strategy performance recommendations
+         if len(self.strategy_performance) > 1:
+             best_strategy = max(
+                 self.strategy_performance.items(), key=lambda x: x[1]["success_rate"]
+             )
+             recommendations.append(
+                 f"Strategy '{best_strategy[0]}' shows best performance with "
+                 f"{best_strategy[1]['success_rate']:.1%} success rate."
+             )
+
+         # Exception pattern recommendations
+         if self.most_common_exceptions:
+             most_common = self.most_common_exceptions[0]
+             recommendations.append(
+                 f"Most common exception: {most_common[0]} ({most_common[1]} occurrences). "
+                 f"Consider targeted handling for this exception type."
+             )
+
+         return recommendations
+
+
+ class RetryPolicyEngine:
+     """Comprehensive retry policy engine with pluggable strategies and enterprise integration."""
+
+     def __init__(
+         self,
+         default_strategy: Optional[RetryStrategy] = None,
+         exception_classifier: Optional[ExceptionClassifier] = None,
+         enable_analytics: bool = True,
+         enable_circuit_breaker_coordination: bool = False,
+         enable_resource_limit_coordination: bool = False,
+         circuit_breaker: Optional["CircuitBreaker"] = None,
+         resource_limit_enforcer: Optional["ResourceLimitEnforcer"] = None,
+         mode: RetryPolicyMode = RetryPolicyMode.ADAPTIVE,
+     ):
+         """Initialize retry policy engine.
+
+         Args:
+             default_strategy: Default retry strategy to use
+             exception_classifier: Exception classification system
+             enable_analytics: Enable analytics and metrics collection
+             enable_circuit_breaker_coordination: Coordinate with circuit breakers
+             enable_resource_limit_coordination: Coordinate with resource limits
+             circuit_breaker: CircuitBreaker instance for coordination
+             resource_limit_enforcer: ResourceLimitEnforcer instance for coordination
+             mode: Retry policy operation mode
+         """
+         # Initialize default strategy if not provided
+         if default_strategy is None:
+             default_strategy = ExponentialBackoffStrategy()
+
+         self.default_strategy = default_strategy
+         self.exception_classifier = exception_classifier or ExceptionClassifier()
+         self.enable_analytics = enable_analytics
+         self.enable_circuit_breaker_coordination = enable_circuit_breaker_coordination
+         self.enable_resource_limit_coordination = enable_resource_limit_coordination
+         self.circuit_breaker = circuit_breaker
+         self.resource_limit_enforcer = resource_limit_enforcer
+         self.mode = mode
+
+         # Strategy registry
+         self.strategies: Dict[str, RetryStrategy] = {
+             "exponential_backoff": ExponentialBackoffStrategy(),
+             "linear_backoff": LinearBackoffStrategy(),
+             "fixed_delay": FixedDelayStrategy(),
+             "adaptive_retry": AdaptiveRetryStrategy(),
+         }
+
+         # Exception-specific strategies
+         self.exception_strategies: Dict[Type[Exception], RetryStrategy] = {}
+
+         # Metrics and analytics
+         self.metrics = RetryMetrics() if enable_analytics else None
+         self.analytics = RetryAnalytics() if enable_analytics else None
+
+         # Strategy effectiveness tracking
+         self.strategy_effectiveness: Dict[str, Dict[str, Any]] = defaultdict(
+             lambda: {"uses": 0, "successes": 0, "total_attempts": 0, "total_time": 0.0}
+         )
+
+         # Thread safety
+         self._lock = threading.RLock()
+
+         logger.info(f"RetryPolicyEngine initialized with mode: {mode.value}")
+
+     def register_strategy(self, name: str, strategy: RetryStrategy) -> None:
+         """Register a custom retry strategy.
+
+         Args:
+             name: Strategy name for identification
+             strategy: RetryStrategy instance
+         """
+         with self._lock:
+             self.strategies[name] = strategy
+             logger.info(f"Registered retry strategy: {name}")
+
+     def register_strategy_for_exception(
+         self, exception_type: Type[Exception], strategy: RetryStrategy
+     ) -> None:
+         """Register strategy for specific exception type.
+
+         Args:
+             exception_type: Exception type to handle
+             strategy: RetryStrategy to use for this exception type
+         """
+         with self._lock:
+             self.exception_strategies[exception_type] = strategy
+             logger.info(
+                 f"Registered strategy for {exception_type.__name__}: {strategy.name}"
+             )
+
+     def select_strategy(
+         self, strategy_name: Optional[str] = None, exception: Optional[Exception] = None
+     ) -> RetryStrategy:
+         """Select appropriate retry strategy.
+
+         Args:
+             strategy_name: Explicit strategy name to use
+             exception: Exception that occurred (for strategy selection)
+
+         Returns:
+             Selected RetryStrategy instance
+         """
+         with self._lock:
+             # Explicit strategy selection
+             if strategy_name and strategy_name in self.strategies:
+                 return self.strategies[strategy_name]
+
+             # Exception-specific strategy selection
+             if exception:
+                 exception_type = type(exception)
+                 for exc_type, strategy in self.exception_strategies.items():
+                     if issubclass(exception_type, exc_type):
+                         return strategy
+
+             # Default strategy
+             return self.default_strategy
+
+     async def execute_with_retry(
+         self,
+         func: Callable,
+         *args,
+         strategy_name: Optional[str] = None,
+         timeout: Optional[float] = None,
+         **kwargs,
+     ) -> RetryResult:
+         """Execute function with retry policy.
+
+         Args:
+             func: Function to execute (sync or async)
+             *args: Function arguments
+             strategy_name: Specific strategy to use
+             timeout: Overall timeout for all attempts
+             **kwargs: Function keyword arguments
+
+         Returns:
+             RetryResult with execution details
+         """
+         session_id = str(uuid.uuid4())
+         start_time = time.time()
+         attempts = []
+         last_exception = None
+
+         # Initial strategy selection (may be updated based on exceptions)
+         current_strategy = self.select_strategy(strategy_name)
+
+         logger.debug(
+             f"Starting retry session {session_id} with strategy: {current_strategy.name}"
+         )
+
+         for attempt_num in range(1, current_strategy.max_attempts + 1):
+             # Check timeout
+             if timeout and (time.time() - start_time) >= timeout:
+                 logger.warning(f"Retry session {session_id} timed out after {timeout}s")
+                 break
+
+             # Check resource limits if enabled
+             if self.enable_resource_limit_coordination and self.resource_limit_enforcer:
+                 try:
+                     limits_check = self.resource_limit_enforcer.check_all_limits()
+                     for resource_type, result in limits_check.items():
+                         if not result.can_proceed:
+                             logger.warning(
+                                 f"Resource limit prevents retry: {result.message}"
+                             )
+                             return RetryResult(
+                                 success=False,
+                                 total_attempts=attempt_num,
+                                 total_time=time.time() - start_time,
+                                 final_exception=ResourceLimitExceededError(
+                                     result.message
+                                 ),
+                                 attempts=attempts,
+                             )
+                 except Exception as e:
+                     logger.error(f"Error checking resource limits: {e}")
+
+             # Check circuit breaker if enabled
+             if self.enable_circuit_breaker_coordination and self.circuit_breaker:
+                 try:
+                     # Execute through the circuit breaker, which accepts both
+                     # sync and async callables
+                     attempt_start = time.time()
+                     result = await self.circuit_breaker.call(func, *args, **kwargs)
+                     attempt_time = time.time() - attempt_start
+
+                     # Success
+                     attempt = RetryAttempt(
+                         timestamp=datetime.now(UTC),
+                         exception_type=type(None),
+                         attempt_number=attempt_num,
+                         delay_used=0.0,
+                         success=True,
+                         execution_time=attempt_time,
+                     )
+                     attempts.append(attempt)
+
+                     # Record metrics
+                     if self.metrics:
+                         self.metrics.record_attempt(attempt)
+
+                     # Record strategy effectiveness
+                     self.record_strategy_effectiveness(
+                         current_strategy, attempt_num, True, time.time() - start_time
+                     )
+
+                     total_time = time.time() - start_time
+                     logger.info(
+                         f"Retry session {session_id} succeeded on attempt {attempt_num}"
+                     )
+
+                     # Record analytics so successful sessions also count toward success rate
+                     if self.analytics:
+                         self.analytics.record_session(
+                             session_id, attempt_num, True, total_time, current_strategy.name
+                         )
+
+                     return RetryResult(
+                         success=True,
+                         value=result,
+                         total_attempts=attempt_num,
+                         total_time=total_time,
+                         attempts=attempts,
+                     )
+
+                 except CircuitBreakerOpenError as e:
+                     # Circuit breaker is open, fail immediately
+                     logger.warning(
+                         f"Circuit breaker open, failing retry session {session_id}"
+                     )
+                     return RetryResult(
+                         success=False,
+                         total_attempts=attempt_num,
+                         total_time=time.time() - start_time,
+                         final_exception=e,
+                         attempts=attempts,
+                     )
+
+                 except Exception as e:
+                     last_exception = e
+                     attempt_time = time.time() - attempt_start
+             else:
+                 # Execute without circuit breaker
+                 try:
+                     attempt_start = time.time()
+                     if asyncio.iscoroutinefunction(func):
+                         result = await func(*args, **kwargs)
+                     else:
+                         result = func(*args, **kwargs)
+                     attempt_time = time.time() - attempt_start
+
+                     # Success
+                     attempt = RetryAttempt(
+                         timestamp=datetime.now(UTC),
+                         exception_type=type(None),
+                         attempt_number=attempt_num,
+                         delay_used=0.0,
+                         success=True,
+                         execution_time=attempt_time,
+                     )
+                     attempts.append(attempt)
+
+                     # Record metrics
+                     if self.metrics:
+                         self.metrics.record_attempt(attempt)
+
+                     # Record strategy effectiveness
+                     self.record_strategy_effectiveness(
+                         current_strategy, attempt_num, True, time.time() - start_time
+                     )
+
+                     total_time = time.time() - start_time
+                     logger.info(
+                         f"Retry session {session_id} succeeded on attempt {attempt_num}"
+                     )
+
+                     # Record analytics so successful sessions also count toward success rate
+                     if self.analytics:
+                         self.analytics.record_session(
+                             session_id, attempt_num, True, total_time, current_strategy.name
+                         )
+
+                     return RetryResult(
+                         success=True,
+                         value=result,
+                         total_attempts=attempt_num,
+                         total_time=total_time,
+                         attempts=attempts,
+                     )
+
+                 except Exception as e:
+                     last_exception = e
+                     attempt_time = time.time() - attempt_start
+
+             # Handle exception
+             if last_exception:
+                 # Update strategy selection based on exception
+                 exception_specific_strategy = self.select_strategy(
+                     exception=last_exception
+                 )
+                 if exception_specific_strategy != current_strategy:
+                     logger.debug(
+                         f"Switching strategy from {current_strategy.name} to "
+                         f"{exception_specific_strategy.name} for {type(last_exception).__name__}"
+                     )
+                     current_strategy = exception_specific_strategy
+
+                 # Check if exception is retriable
+                 if not self.exception_classifier.is_retriable(last_exception):
+                     logger.info(
+                         f"Non-retriable exception in session {session_id}: "
+                         f"{type(last_exception).__name__}: {last_exception}"
+                     )
+
+                     # Record non-retriable attempt
+                     attempt = RetryAttempt(
+                         timestamp=datetime.now(UTC),
+                         exception_type=type(last_exception),
+                         attempt_number=attempt_num,
+                         delay_used=0.0,
+                         success=False,
+                         execution_time=attempt_time,
+                         error_message=str(last_exception),
+                     )
+                     attempts.append(attempt)
+
+                     if self.metrics:
+                         self.metrics.record_attempt(attempt)
+
+                     return RetryResult(
+                         success=False,
+                         total_attempts=attempt_num,
+                         total_time=time.time() - start_time,
+                         final_exception=last_exception,
+                         attempts=attempts,
+                     )
+
+                 # Calculate delay for next attempt
+                 if attempt_num < current_strategy.max_attempts:
+                     delay = current_strategy.calculate_delay(attempt_num + 1)
+
+                     # Record failed attempt
+                     attempt = RetryAttempt(
+                         timestamp=datetime.now(UTC),
+                         exception_type=type(last_exception),
+                         attempt_number=attempt_num,
+                         delay_used=delay,
+                         success=False,
+                         execution_time=attempt_time,
+                         error_message=str(last_exception),
+                     )
+                     attempts.append(attempt)
+
+                     if self.metrics:
+                         self.metrics.record_attempt(attempt)
+
+                     # Record learning data for adaptive strategies
+                     if isinstance(current_strategy, AdaptiveRetryStrategy):
+                         current_strategy.record_attempt_result(
+                             type(last_exception),
+                             attempt_num,
+                             delay,
+                             False,
+                             attempt_time,
+                         )
+
+                     logger.warning(
+                         f"Attempt {attempt_num} failed in session {session_id}, "
+                         f"retrying in {delay:.2f}s: {type(last_exception).__name__}: {last_exception}"
+                     )
+
+                     # Wait before retry
+                     await asyncio.sleep(delay)
+                 else:
+                     # Record final failed attempt
+                     attempt = RetryAttempt(
+                         timestamp=datetime.now(UTC),
+                         exception_type=type(last_exception),
+                         attempt_number=attempt_num,
+                         delay_used=0.0,
+                         success=False,
+                         execution_time=attempt_time,
+                         error_message=str(last_exception),
+                     )
+                     attempts.append(attempt)
+
+                     if self.metrics:
+                         self.metrics.record_attempt(attempt)
+
+         # All attempts failed
+         total_time = time.time() - start_time
+         logger.error(
+             f"Retry session {session_id} failed after {current_strategy.max_attempts} attempts "
+             f"in {total_time:.2f}s"
+         )
+
+         # Record strategy effectiveness
+         self.record_strategy_effectiveness(
+             current_strategy, current_strategy.max_attempts, False, total_time
+         )
+
+         # Record analytics
+         if self.analytics:
+             self.analytics.record_session(
+                 session_id,
+                 current_strategy.max_attempts,
+                 False,
+                 total_time,
+                 current_strategy.name,
+             )
+             if last_exception:
+                 self.analytics.record_exception(type(last_exception))
+
+         return RetryResult(
+             success=False,
+             total_attempts=current_strategy.max_attempts,
+             total_time=total_time,
+             final_exception=last_exception,
+             attempts=attempts,
+         )
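Putting the engine together, a hedged end-to-end sketch using only the APIs defined above; the flaky coroutine, thresholds, and call counter are invented for illustration:

    import asyncio

    engine = RetryPolicyEngine(
        default_strategy=ExponentialBackoffStrategy(max_attempts=4, base_delay=0.5)
    )

    calls = {"n": 0}

    async def flaky_fetch():
        calls["n"] += 1
        if calls["n"] < 3:
            raise ConnectionError("transient network blip")  # retriable by the default rules
        return "payload"

    result = asyncio.run(engine.execute_with_retry(flaky_fetch))
    assert result.success and result.value == "payload" and result.total_attempts == 3
    print(engine.get_metrics_summary())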
+
+     def record_strategy_effectiveness(
+         self, strategy: RetryStrategy, attempts: int, success: bool, total_time: float
+     ) -> None:
+         """Record effectiveness data for a strategy.
+
+         Args:
+             strategy: Strategy that was used
+             attempts: Number of attempts made
+             success: Whether the strategy succeeded
+             total_time: Total time taken
+         """
+         with self._lock:
+             effectiveness = self.strategy_effectiveness[strategy.name]
+             effectiveness["uses"] += 1
+             effectiveness["total_attempts"] += attempts
+             effectiveness["total_time"] += total_time
+
+             if success:
+                 effectiveness["successes"] += 1
+
+     def get_strategy_effectiveness(self) -> Dict[str, Dict[str, Any]]:
+         """Get effectiveness statistics for all strategies.
+
+         Returns:
+             Dictionary mapping strategy names to effectiveness stats
+         """
+         with self._lock:
+             result = {}
+             for name, data in self.strategy_effectiveness.items():
+                 if data["uses"] > 0:
+                     result[name] = {
+                         "uses": data["uses"],
+                         "success_rate": data["successes"] / data["uses"],
+                         "average_attempts": data["total_attempts"] / data["uses"],
+                         "average_time": data["total_time"] / data["uses"],
+                     }
+             return result
+
+     def get_analytics(self) -> Optional[RetryAnalytics]:
+         """Get current analytics data.
+
+         Returns:
+             RetryAnalytics instance or None if analytics disabled
+         """
+         return self.analytics
+
+     def get_metrics_summary(self) -> Optional[Dict[str, Any]]:
+         """Get metrics summary.
+
+         Returns:
+             Metrics summary dictionary or None if metrics disabled
+         """
+         if self.metrics:
+             return self.metrics.get_summary_stats()
+         return None
+
+     def reset_metrics(self) -> None:
+         """Reset all metrics and analytics data."""
+         if self.metrics:
+             self.metrics = RetryMetrics()
+         if self.analytics:
+             self.analytics = RetryAnalytics()
+         with self._lock:
+             self.strategy_effectiveness.clear()
+         logger.info("Retry policy metrics reset")
+
+     def get_configuration(self) -> Dict[str, Any]:
+         """Get current retry policy configuration.
+
+         Returns:
+             Configuration dictionary
+         """
+         with self._lock:
+             return {
+                 "default_strategy": self.default_strategy.get_config(),
+                 "mode": self.mode.value,
+                 "enable_analytics": self.enable_analytics,
+                 "enable_circuit_breaker_coordination": self.enable_circuit_breaker_coordination,
+                 "enable_resource_limit_coordination": self.enable_resource_limit_coordination,
+                 "registered_strategies": list(self.strategies.keys()),
+                 "exception_specific_strategies": {
+                     exc_type.__name__: strategy.name
+                     for exc_type, strategy in self.exception_strategies.items()
+                 },
+                 "classification_rules": self.exception_classifier.get_classification_rules(),
+             }
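Finally, a sketch of wiring strategies into the engine; the FibonacciBackoffStrategy reused here is the illustrative subclass from the earlier sketch, not shipped code:

    engine = RetryPolicyEngine()

    # Route timeouts through a patient fixed delay; everything else keeps the default.
    engine.register_strategy("patient_fixed", FixedDelayStrategy(max_attempts=5, delay=2.0))
    engine.register_strategy_for_exception(TimeoutError, engine.strategies["patient_fixed"])
    engine.register_strategy("fibonacci", FibonacciBackoffStrategy())

    print(engine.get_configuration()["exception_specific_strategies"])  # {'TimeoutError': 'fixed_delay'}
    print(engine.get_strategy_effectiveness())  # empty until sessions have run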