tetra-rp 0.6.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- tetra_rp/__init__.py +109 -19
- tetra_rp/cli/commands/__init__.py +1 -0
- tetra_rp/cli/commands/apps.py +143 -0
- tetra_rp/cli/commands/build.py +1082 -0
- tetra_rp/cli/commands/build_utils/__init__.py +1 -0
- tetra_rp/cli/commands/build_utils/handler_generator.py +176 -0
- tetra_rp/cli/commands/build_utils/lb_handler_generator.py +309 -0
- tetra_rp/cli/commands/build_utils/manifest.py +430 -0
- tetra_rp/cli/commands/build_utils/mothership_handler_generator.py +75 -0
- tetra_rp/cli/commands/build_utils/scanner.py +596 -0
- tetra_rp/cli/commands/deploy.py +580 -0
- tetra_rp/cli/commands/init.py +123 -0
- tetra_rp/cli/commands/resource.py +108 -0
- tetra_rp/cli/commands/run.py +296 -0
- tetra_rp/cli/commands/test_mothership.py +458 -0
- tetra_rp/cli/commands/undeploy.py +533 -0
- tetra_rp/cli/main.py +97 -0
- tetra_rp/cli/utils/__init__.py +1 -0
- tetra_rp/cli/utils/app.py +15 -0
- tetra_rp/cli/utils/conda.py +127 -0
- tetra_rp/cli/utils/deployment.py +530 -0
- tetra_rp/cli/utils/ignore.py +143 -0
- tetra_rp/cli/utils/skeleton.py +184 -0
- tetra_rp/cli/utils/skeleton_template/.env.example +4 -0
- tetra_rp/cli/utils/skeleton_template/.flashignore +40 -0
- tetra_rp/cli/utils/skeleton_template/.gitignore +44 -0
- tetra_rp/cli/utils/skeleton_template/README.md +263 -0
- tetra_rp/cli/utils/skeleton_template/main.py +44 -0
- tetra_rp/cli/utils/skeleton_template/mothership.py +55 -0
- tetra_rp/cli/utils/skeleton_template/pyproject.toml +58 -0
- tetra_rp/cli/utils/skeleton_template/requirements.txt +1 -0
- tetra_rp/cli/utils/skeleton_template/workers/__init__.py +0 -0
- tetra_rp/cli/utils/skeleton_template/workers/cpu/__init__.py +19 -0
- tetra_rp/cli/utils/skeleton_template/workers/cpu/endpoint.py +36 -0
- tetra_rp/cli/utils/skeleton_template/workers/gpu/__init__.py +19 -0
- tetra_rp/cli/utils/skeleton_template/workers/gpu/endpoint.py +61 -0
- tetra_rp/client.py +136 -33
- tetra_rp/config.py +29 -0
- tetra_rp/core/api/runpod.py +591 -39
- tetra_rp/core/deployment.py +232 -0
- tetra_rp/core/discovery.py +425 -0
- tetra_rp/core/exceptions.py +50 -0
- tetra_rp/core/resources/__init__.py +27 -9
- tetra_rp/core/resources/app.py +738 -0
- tetra_rp/core/resources/base.py +139 -4
- tetra_rp/core/resources/constants.py +21 -0
- tetra_rp/core/resources/cpu.py +115 -13
- tetra_rp/core/resources/gpu.py +182 -16
- tetra_rp/core/resources/live_serverless.py +153 -16
- tetra_rp/core/resources/load_balancer_sls_resource.py +440 -0
- tetra_rp/core/resources/network_volume.py +126 -31
- tetra_rp/core/resources/resource_manager.py +436 -35
- tetra_rp/core/resources/serverless.py +537 -120
- tetra_rp/core/resources/serverless_cpu.py +201 -0
- tetra_rp/core/resources/template.py +1 -59
- tetra_rp/core/utils/constants.py +10 -0
- tetra_rp/core/utils/file_lock.py +260 -0
- tetra_rp/core/utils/http.py +67 -0
- tetra_rp/core/utils/lru_cache.py +75 -0
- tetra_rp/core/utils/singleton.py +36 -1
- tetra_rp/core/validation.py +44 -0
- tetra_rp/execute_class.py +301 -0
- tetra_rp/protos/remote_execution.py +98 -9
- tetra_rp/runtime/__init__.py +1 -0
- tetra_rp/runtime/circuit_breaker.py +274 -0
- tetra_rp/runtime/config.py +12 -0
- tetra_rp/runtime/exceptions.py +49 -0
- tetra_rp/runtime/generic_handler.py +206 -0
- tetra_rp/runtime/lb_handler.py +189 -0
- tetra_rp/runtime/load_balancer.py +160 -0
- tetra_rp/runtime/manifest_fetcher.py +192 -0
- tetra_rp/runtime/metrics.py +325 -0
- tetra_rp/runtime/models.py +73 -0
- tetra_rp/runtime/mothership_provisioner.py +512 -0
- tetra_rp/runtime/production_wrapper.py +266 -0
- tetra_rp/runtime/reliability_config.py +149 -0
- tetra_rp/runtime/retry_manager.py +118 -0
- tetra_rp/runtime/serialization.py +124 -0
- tetra_rp/runtime/service_registry.py +346 -0
- tetra_rp/runtime/state_manager_client.py +248 -0
- tetra_rp/stubs/live_serverless.py +35 -17
- tetra_rp/stubs/load_balancer_sls.py +357 -0
- tetra_rp/stubs/registry.py +145 -19
- {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/METADATA +398 -60
- tetra_rp-0.24.0.dist-info/RECORD +99 -0
- {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/WHEEL +1 -1
- tetra_rp-0.24.0.dist-info/entry_points.txt +2 -0
- tetra_rp/core/pool/cluster_manager.py +0 -177
- tetra_rp/core/pool/dataclass.py +0 -18
- tetra_rp/core/pool/ex.py +0 -38
- tetra_rp/core/pool/job.py +0 -22
- tetra_rp/core/pool/worker.py +0 -19
- tetra_rp/core/resources/utils.py +0 -50
- tetra_rp/core/utils/json.py +0 -33
- tetra_rp-0.6.0.dist-info/RECORD +0 -39
- /tetra_rp/{core/pool → cli}/__init__.py +0 -0
- {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/top_level.txt +0 -0
tetra_rp/runtime/production_wrapper.py (new file)
@@ -0,0 +1,266 @@
+"""Production wrapper for cross-endpoint function routing."""
+
+import logging
+from typing import Any, Callable, Dict, Optional
+
+from tetra_rp.core.resources.serverless import ServerlessResource
+
+from .exceptions import RemoteExecutionError
+from .serialization import serialize_args, serialize_kwargs
+from .service_registry import ServiceRegistry
+
+logger = logging.getLogger(__name__)
+
+
+class ProductionWrapper:
+    """Wrapper that routes function execution between endpoints.
+
+    Intercepts stub execution and determines if the call is local (execute
+    directly) or remote (call via HTTP to another endpoint).
+    """
+
+    def __init__(self, service_registry: ServiceRegistry):
+        """Initialize production wrapper.
+
+        Args:
+            service_registry: Service registry for routing decisions.
+        """
+        self.service_registry = service_registry
+
+    async def wrap_function_execution(
+        self,
+        original_stub_func: Callable,
+        func: Callable,
+        dependencies: Optional[list],
+        system_dependencies: Optional[list],
+        accelerate_downloads: bool,
+        *args: Any,
+        **kwargs: Any,
+    ) -> Any:
+        """Route function execution to local or remote endpoint.
+
+        Args:
+            original_stub_func: The original stubbed_resource function.
+            func: The decorated function being called.
+            dependencies: Pip dependencies (for local execution).
+            system_dependencies: System dependencies (for local execution).
+            accelerate_downloads: Download acceleration flag (for local).
+            *args: Function positional arguments.
+            **kwargs: Function keyword arguments.
+
+        Returns:
+            Function execution result.
+
+        Raises:
+            Exception: If execution fails.
+        """
+        function_name = func.__name__
+
+        # Ensure manifest is loaded
+        await self.service_registry._ensure_manifest_loaded()
+
+        # Determine routing
+        try:
+            resource = await self.service_registry.get_resource_for_function(
+                function_name
+            )
+        except ValueError as e:
+            # Function not in manifest, execute locally
+            logger.debug(
+                f"Function {function_name} not in manifest: {e}, executing locally"
+            )
+            return await original_stub_func(
+                func,
+                dependencies,
+                system_dependencies,
+                accelerate_downloads,
+                *args,
+                **kwargs,
+            )
+
+        # Local execution
+        if resource is None:
+            logger.debug(f"Executing local function: {function_name}")
+            return await original_stub_func(
+                func,
+                dependencies,
+                system_dependencies,
+                accelerate_downloads,
+                *args,
+                **kwargs,
+            )
+
+        # Remote execution
+        logger.debug(f"Routing function {function_name} to remote endpoint")
+        return await self._execute_remote(
+            resource,
+            function_name,
+            args,
+            kwargs,
+            execution_type="function",
+        )
+
+    async def wrap_class_method_execution(
+        self,
+        original_method_func: Callable,
+        request: Any,
+    ) -> Any:
+        """Route class method execution to local or remote endpoint.
+
+        Args:
+            original_method_func: The original execute_class_method function.
+            request: FunctionRequest containing class and method info.
+
+        Returns:
+            Method execution result.
+
+        Raises:
+            Exception: If execution fails.
+        """
+        # Ensure manifest is loaded
+        await self.service_registry._ensure_manifest_loaded()
+
+        class_name = getattr(request, "class_name", None)
+
+        if not class_name:
+            # No class name, execute locally
+            return await original_method_func(request)
+
+        # Determine routing
+        try:
+            resource = await self.service_registry.get_resource_for_function(class_name)
+        except ValueError:
+            # Class not in manifest, execute locally
+            logger.debug(f"Class {class_name} not in manifest, executing locally")
+            return await original_method_func(request)
+
+        # Local execution
+        if resource is None:
+            logger.debug(f"Executing local class method: {class_name}")
+            return await original_method_func(request)
+
+        # Remote execution
+        logger.debug(f"Routing class {class_name} to remote endpoint")
+
+        # Convert FunctionRequest to dict payload
+        payload = self._build_class_payload(request)
+        return await self._execute_remote(
+            resource,
+            class_name,
+            (),
+            payload.get("input", {}),
+            execution_type="class",
+        )
+
+    async def _execute_remote(
+        self,
+        resource: ServerlessResource,
+        function_name: str,
+        args: tuple,
+        kwargs: dict,
+        execution_type: str = "function",
+    ) -> Any:
+        """Execute function on remote endpoint.
+
+        Args:
+            resource: ServerlessResource with endpoint ID set.
+            function_name: Name of function/class to execute.
+            args: Positional arguments.
+            kwargs: Keyword arguments.
+            execution_type: "function" or "class".
+
+        Returns:
+            Execution result.
+
+        Raises:
+            RemoteExecutionError: If remote execution fails.
+        """
+        # Serialize arguments
+        serialized_args = serialize_args(args)
+        serialized_kwargs = serialize_kwargs(kwargs)
+
+        # Build payload matching RunPod format
+        payload = {
+            "input": {
+                "function_name": function_name,
+                "execution_type": execution_type,
+                "args": serialized_args,
+                "kwargs": serialized_kwargs,
+            }
+        }
+
+        # Execute via ServerlessResource
+        result = await resource.run_sync(payload)
+
+        # Handle response
+        if result.error:
+            raise RemoteExecutionError(
+                f"Remote execution of {function_name} failed: {result.error}"
+            )
+
+        return result.output
+
+    def _build_class_payload(self, request: Any) -> Dict[str, Any]:
+        """Build payload from FunctionRequest for class execution.
+
+        Args:
+            request: FunctionRequest object.
+
+        Returns:
+            RunPod-format payload dict.
+        """
+        # Extract request data - handle both dict and object access patterns
+        if isinstance(request, dict):
+            data = request
+        else:
+            data = (
+                request.model_dump(exclude_none=True)
+                if hasattr(request, "model_dump")
+                else {}
+            )
+
+        # Extract class execution data
+        payload = {
+            "input": {
+                "function_name": data.get("class_name"),
+                "execution_type": "class",
+                "args": data.get("args", []),
+                "kwargs": data.get("kwargs", {}),
+                "method_name": data.get("method_name"),
+            }
+        }
+
+        return payload
+
+
+# Singleton instance management
+_wrapper_instance: Optional[ProductionWrapper] = None
+
+
+def create_production_wrapper(
+    service_registry: Optional[ServiceRegistry] = None,
+) -> ProductionWrapper:
+    """Create or get singleton ProductionWrapper instance.
+
+    Args:
+        service_registry: Service registry. Creates if not provided.
+
+    Returns:
+        ProductionWrapper instance.
+    """
+    global _wrapper_instance
+
+    if _wrapper_instance is None:
+        # Create components if not provided
+        if service_registry is None:
+            service_registry = ServiceRegistry()
+
+        _wrapper_instance = ProductionWrapper(service_registry)
+
+    return _wrapper_instance
+
+
+def reset_wrapper() -> None:
+    """Reset singleton wrapper (mainly for testing)."""
+    global _wrapper_instance
+    _wrapper_instance = None
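For context, a minimal usage sketch of the routing above. `create_production_wrapper` and the `wrap_function_execution` signature come from the hunk; the stub callable and the decorated function (`local_stub`, `infer`) are hypothetical stand-ins, and whether the call actually goes remote depends on the deployment manifest that the ServiceRegistry loads in your environment.

import asyncio

from tetra_rp.runtime.production_wrapper import create_production_wrapper


async def local_stub(func, dependencies, system_dependencies, accelerate_downloads, *args, **kwargs):
    # Hypothetical stand-in for the original stubbed_resource execution path.
    return func(*args, **kwargs)


def infer(prompt: str) -> str:
    # Hypothetical decorated function that may live on another endpoint.
    return f"echo: {prompt}"


async def main():
    wrapper = create_production_wrapper()  # builds a default ServiceRegistry internally
    # Executes infer locally via local_stub unless the manifest maps "infer"
    # to a remote ServerlessResource, in which case it is called over HTTP.
    result = await wrapper.wrap_function_execution(
        local_stub, infer, None, None, False, "hello"
    )
    print(result)


asyncio.run(main())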
tetra_rp/runtime/reliability_config.py (new file)
@@ -0,0 +1,149 @@
+"""Centralized configuration for reliability features."""
+
+import os
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Optional
+
+
+class LoadBalancerStrategy(Enum):
+    """Load balancing strategies for endpoint selection."""
+
+    ROUND_ROBIN = "round_robin"
+    LEAST_CONNECTIONS = "least_connections"
+    RANDOM = "random"
+
+
+@dataclass
+class CircuitBreakerConfig:
+    """Configuration for circuit breaker behavior."""
+
+    enabled: bool = True
+    failure_threshold: int = 5
+    success_threshold: int = 2
+    timeout_seconds: int = 60
+    window_size: int = 10
+
+
+@dataclass
+class LoadBalancerConfig:
+    """Configuration for load balancer behavior."""
+
+    enabled: bool = False
+    strategy: LoadBalancerStrategy = LoadBalancerStrategy.ROUND_ROBIN
+
+
+@dataclass
+class RetryConfig:
+    """Configuration for retry behavior with exponential backoff."""
+
+    enabled: bool = True
+    max_attempts: int = 3
+    base_delay: float = 0.5
+    max_delay: float = 10.0
+    jitter: float = 0.2
+    retryable_exceptions: tuple = field(
+        default_factory=lambda: (TimeoutError, ConnectionError)
+    )
+    retryable_status_codes: set = field(
+        default_factory=lambda: {408, 429, 500, 502, 503, 504}
+    )
+
+
+@dataclass
+class MetricsConfig:
+    """Configuration for metrics collection."""
+
+    enabled: bool = True
+    namespace: str = "tetra.metrics"
+
+
+@dataclass
+class ReliabilityConfig:
+    """Centralized reliability features configuration."""
+
+    circuit_breaker: CircuitBreakerConfig = field(default_factory=CircuitBreakerConfig)
+    load_balancer: LoadBalancerConfig = field(default_factory=LoadBalancerConfig)
+    retry: RetryConfig = field(default_factory=RetryConfig)
+    metrics: MetricsConfig = field(default_factory=MetricsConfig)
+
+    @classmethod
+    def from_env(cls) -> "ReliabilityConfig":
+        """Load configuration from environment variables.
+
+        Environment variables:
+        - TETRA_CIRCUIT_BREAKER_ENABLED: Enable circuit breaker (default: true)
+        - TETRA_CB_FAILURE_THRESHOLD: Failures before opening (default: 5)
+        - TETRA_CB_SUCCESS_THRESHOLD: Successes to close (default: 2)
+        - TETRA_CB_TIMEOUT_SECONDS: Time before half-open (default: 60)
+        - TETRA_LOAD_BALANCER_ENABLED: Enable load balancer (default: false)
+        - TETRA_LB_STRATEGY: Load balancer strategy (default: round_robin)
+        - TETRA_RETRY_ENABLED: Enable retry (default: true)
+        - TETRA_RETRY_MAX_ATTEMPTS: Max retry attempts (default: 3)
+        - TETRA_RETRY_BASE_DELAY: Base delay for backoff (default: 0.5)
+        - TETRA_METRICS_ENABLED: Enable metrics (default: true)
+
+        Returns:
+            ReliabilityConfig initialized from environment variables.
+        """
+        circuit_breaker = CircuitBreakerConfig(
+            enabled=os.getenv("TETRA_CIRCUIT_BREAKER_ENABLED", "true").lower()
+            == "true",
+            failure_threshold=int(os.getenv("TETRA_CB_FAILURE_THRESHOLD", "5")),
+            success_threshold=int(os.getenv("TETRA_CB_SUCCESS_THRESHOLD", "2")),
+            timeout_seconds=int(os.getenv("TETRA_CB_TIMEOUT_SECONDS", "60")),
+        )
+
+        strategy_str = os.getenv("TETRA_LB_STRATEGY", "round_robin").lower()
+        try:
+            strategy = LoadBalancerStrategy(strategy_str)
+        except ValueError:
+            strategy = LoadBalancerStrategy.ROUND_ROBIN
+
+        load_balancer = LoadBalancerConfig(
+            enabled=os.getenv("TETRA_LOAD_BALANCER_ENABLED", "false").lower() == "true",
+            strategy=strategy,
+        )
+
+        retry = RetryConfig(
+            enabled=os.getenv("TETRA_RETRY_ENABLED", "true").lower() == "true",
+            max_attempts=int(os.getenv("TETRA_RETRY_MAX_ATTEMPTS", "3")),
+            base_delay=float(os.getenv("TETRA_RETRY_BASE_DELAY", "0.5")),
+        )
+
+        metrics = MetricsConfig(
+            enabled=os.getenv("TETRA_METRICS_ENABLED", "true").lower() == "true",
+        )
+
+        return cls(
+            circuit_breaker=circuit_breaker,
+            load_balancer=load_balancer,
+            retry=retry,
+            metrics=metrics,
+        )
+
+
+# Global default configuration
+_config: Optional[ReliabilityConfig] = None
+
+
+def get_reliability_config() -> ReliabilityConfig:
+    """Get global reliability configuration (lazy-loaded).
+
+    Returns:
+        ReliabilityConfig instance initialized from environment.
+    """
+    global _config
+    if _config is None:
+        _config = ReliabilityConfig.from_env()
+    return _config
+
+
+def set_reliability_config(config: ReliabilityConfig) -> None:
+    """Set global reliability configuration (for testing).
+
+    Args:
+        config: ReliabilityConfig to set as global.
+    """
+    global _config
+    _config = config
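A short sketch of how the environment-driven configuration above is typically consumed; the variable values are illustrative, and only the names documented in from_env are used.

import os

from tetra_rp.runtime.reliability_config import (
    ReliabilityConfig,
    get_reliability_config,
    set_reliability_config,
)

# Override defaults before the first lookup; unset variables keep their defaults.
os.environ["TETRA_RETRY_MAX_ATTEMPTS"] = "5"
os.environ["TETRA_LB_STRATEGY"] = "random"

config = get_reliability_config()  # lazily built from the environment on first call
assert config.retry.max_attempts == 5
assert config.load_balancer.strategy.value == "random"
assert config.circuit_breaker.failure_threshold == 5  # unchanged default

# Tests can bypass the environment entirely by installing an explicit config.
set_reliability_config(ReliabilityConfig())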
tetra_rp/runtime/retry_manager.py (new file)
@@ -0,0 +1,118 @@
+"""Retry logic with exponential backoff for failed remote calls."""
+
+import asyncio
+import logging
+from typing import Any, Callable, Optional, Set, Tuple, Type
+
+from tetra_rp.core.utils.backoff import get_backoff_delay
+
+logger = logging.getLogger(__name__)
+
+
+class RetryExhaustedError(Exception):
+    """Raised when max retry attempts are exceeded."""
+
+    pass
+
+
+async def retry_with_backoff(
+    func: Callable[..., Any],
+    max_attempts: int = 3,
+    base_delay: float = 0.5,
+    max_delay: float = 10.0,
+    jitter: float = 0.2,
+    retryable_exceptions: Optional[Tuple[Type[Exception], ...]] = None,
+    retryable_status_codes: Optional[Set[int]] = None,
+    circuit_breaker: Optional[Any] = None,
+    *args: Any,
+    **kwargs: Any,
+) -> Any:
+    """Execute async function with retry and exponential backoff.
+
+    Args:
+        func: Async function to execute
+        max_attempts: Maximum number of attempts (default: 3)
+        base_delay: Base delay between retries in seconds (default: 0.5)
+        max_delay: Maximum delay between retries (default: 10.0)
+        jitter: Jitter factor (0.0-1.0) to add randomness (default: 0.2)
+        retryable_exceptions: Tuple of exception types to retry on
+            (default: (asyncio.TimeoutError, ConnectionError))
+        retryable_status_codes: Set of HTTP status codes to retry on
+            (default: {408, 429, 500, 502, 503, 504})
+        circuit_breaker: Optional circuit breaker to check before retry
+        *args: Positional arguments for func
+        **kwargs: Keyword arguments for func
+
+    Returns:
+        Result from successful function call
+
+    Raises:
+        RetryExhaustedError: If max attempts exceeded
+        Exception: If non-retryable exception occurs
+    """
+    if retryable_exceptions is None:
+        retryable_exceptions = (asyncio.TimeoutError, ConnectionError)
+
+    if retryable_status_codes is None:
+        retryable_status_codes = {408, 429, 500, 502, 503, 504}
+
+    last_exception: Optional[Exception] = None
+
+    for attempt in range(max_attempts):
+        try:
+            # Check circuit breaker before attempting
+            if circuit_breaker is not None:
+                from tetra_rp.runtime.circuit_breaker import CircuitState
+
+                if circuit_breaker.get_state() == CircuitState.OPEN:
+                    raise RuntimeError(
+                        f"Circuit breaker OPEN, skipping retry attempt {attempt + 1}"
+                    )
+
+            result = await func(*args, **kwargs)
+
+            # Log success on retry
+            if attempt > 0:
+                logger.info(f"Retry succeeded on attempt {attempt + 1}/{max_attempts}")
+
+            return result
+
+        except Exception as e:
+            last_exception = e
+
+            # Check if exception is retryable
+            if not isinstance(e, retryable_exceptions):
+                logger.debug(
+                    f"Non-retryable exception in {func.__name__}: {type(e).__name__}"
+                )
+                raise
+
+            # Check for retryable status codes (if exception has status_code)
+            if hasattr(e, "status_code"):
+                if e.status_code not in retryable_status_codes:  # type: ignore
+                    logger.debug(
+                        f"Non-retryable status code {e.status_code} in {func.__name__}"
+                    )
+                    raise
+
+            # If this is the last attempt, don't retry
+            if attempt >= max_attempts - 1:
+                logger.warning(
+                    f"Max retries ({max_attempts}) exhausted for {func.__name__}"
+                )
+                raise RetryExhaustedError(
+                    f"Failed after {max_attempts} attempts: {e}"
+                ) from e
+
+            # Calculate delay with exponential backoff and jitter
+            delay = get_backoff_delay(attempt, base_delay, max_delay, jitter=jitter)
+            logger.debug(
+                f"Retry {attempt + 1}/{max_attempts} for {func.__name__} "
+                f"after {delay:.2f}s"
+            )
+            await asyncio.sleep(delay)
+
+    # Should never reach here, but handle edge case
+    if last_exception:
+        raise last_exception
+    raise RetryExhaustedError(f"Failed after {max_attempts} attempts")
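To illustrate the retry loop above, a minimal sketch with a hypothetical flaky coroutine; extra keyword arguments (here endpoint) are forwarded to the wrapped function via **kwargs, and ConnectionError is retryable by default.

import asyncio
import random

from tetra_rp.runtime.retry_manager import RetryExhaustedError, retry_with_backoff


async def flaky_call(endpoint: str) -> str:
    # Hypothetical remote call that fails transiently about half the time.
    if random.random() < 0.5:
        raise ConnectionError("transient network failure")
    return f"ok from {endpoint}"


async def main():
    try:
        result = await retry_with_backoff(
            flaky_call,
            max_attempts=4,
            base_delay=0.1,
            endpoint="https://example.invalid/run",  # forwarded to flaky_call
        )
        print(result)
    except RetryExhaustedError as exc:
        print(f"gave up after all attempts: {exc}")


asyncio.run(main())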
tetra_rp/runtime/serialization.py (new file)
@@ -0,0 +1,124 @@
+"""Shared serialization utilities for cloudpickle + base64 encoding."""
+
+import base64
+from typing import Any, Dict, List
+
+import cloudpickle
+
+from .exceptions import SerializationError
+
+
+def serialize_arg(arg: Any) -> str:
+    """Serialize single argument with cloudpickle + base64.
+
+    Args:
+        arg: Argument to serialize.
+
+    Returns:
+        Base64-encoded cloudpickle serialized string.
+
+    Raises:
+        SerializationError: If serialization fails.
+    """
+    try:
+        return base64.b64encode(cloudpickle.dumps(arg)).decode("utf-8")
+    except Exception as e:
+        raise SerializationError(f"Failed to serialize argument: {e}") from e
+
+
+def serialize_args(args: tuple) -> List[str]:
+    """Serialize positional arguments.
+
+    Args:
+        args: Tuple of arguments to serialize.
+
+    Returns:
+        List of base64-encoded serialized arguments.
+
+    Raises:
+        SerializationError: If serialization fails.
+    """
+    try:
+        return [serialize_arg(arg) for arg in args]
+    except SerializationError:
+        raise
+    except Exception as e:
+        raise SerializationError(f"Failed to serialize args: {e}") from e
+
+
+def serialize_kwargs(kwargs: dict) -> Dict[str, str]:
+    """Serialize keyword arguments.
+
+    Args:
+        kwargs: Dictionary of keyword arguments.
+
+    Returns:
+        Dictionary with base64-encoded serialized values.
+
+    Raises:
+        SerializationError: If serialization fails.
+    """
+    try:
+        return {k: serialize_arg(v) for k, v in kwargs.items()}
+    except SerializationError:
+        raise
+    except Exception as e:
+        raise SerializationError(f"Failed to serialize kwargs: {e}") from e
+
+
+def deserialize_arg(arg_b64: str) -> Any:
+    """Deserialize single base64-encoded cloudpickle argument.
+
+    Args:
+        arg_b64: Base64-encoded serialized argument.
+
+    Returns:
+        Deserialized argument.
+
+    Raises:
+        SerializationError: If deserialization fails.
+    """
+    try:
+        return cloudpickle.loads(base64.b64decode(arg_b64))
+    except Exception as e:
+        raise SerializationError(f"Failed to deserialize argument: {e}") from e
+
+
+def deserialize_args(args_b64: List[str]) -> List[Any]:
+    """Deserialize list of base64-encoded arguments.
+
+    Args:
+        args_b64: List of base64-encoded serialized arguments.
+
+    Returns:
+        List of deserialized arguments.
+
+    Raises:
+        SerializationError: If deserialization fails.
+    """
+    try:
+        return [deserialize_arg(arg) for arg in args_b64]
+    except SerializationError:
+        raise
+    except Exception as e:
+        raise SerializationError(f"Failed to deserialize args: {e}") from e
+
+
+def deserialize_kwargs(kwargs_b64: Dict[str, str]) -> Dict[str, Any]:
+    """Deserialize dict of base64-encoded keyword arguments.
+
+    Args:
+        kwargs_b64: Dictionary with base64-encoded serialized values.
+
+    Returns:
+        Dictionary with deserialized values.
+
+    Raises:
+        SerializationError: If deserialization fails.
+    """
+    try:
+        return {k: deserialize_arg(v) for k, v in kwargs_b64.items()}
+    except SerializationError:
+        raise
+    except Exception as e:
+        raise SerializationError(f"Failed to deserialize kwargs: {e}") from e
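A round-trip sketch of the helpers above, mirroring the "args"/"kwargs" fields that _execute_remote places in the RunPod-style payload; the example values are arbitrary.

from tetra_rp.runtime.serialization import (
    deserialize_args,
    deserialize_kwargs,
    serialize_args,
    serialize_kwargs,
)

# Caller side: arbitrary Python objects become base64-encoded cloudpickle strings.
encoded_args = serialize_args((42, {"nested": [1, 2, 3]}))
encoded_kwargs = serialize_kwargs({"mode": "fast", "threshold": 0.75})

# Worker side: the handler decodes them back into live objects.
assert deserialize_args(encoded_args) == [42, {"nested": [1, 2, 3]}]
assert deserialize_kwargs(encoded_kwargs) == {"mode": "fast", "threshold": 0.75}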