resilient-sdk-core 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- resilient_sdk_core-0.1.0/PKG-INFO +187 -0
- resilient_sdk_core-0.1.0/README.md +162 -0
- resilient_sdk_core-0.1.0/pyproject.toml +37 -0
- resilient_sdk_core-0.1.0/resilient/__init__.py +3 -0
- resilient_sdk_core-0.1.0/resilient/circuit.py +87 -0
- resilient_sdk_core-0.1.0/resilient/classifier.py +81 -0
- resilient_sdk_core-0.1.0/resilient/config.py +30 -0
- resilient_sdk_core-0.1.0/resilient/decorator.py +104 -0
- resilient_sdk_core-0.1.0/resilient/store.py +96 -0
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: resilient-sdk-core
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Adaptive retry SDK with persistent failure memory
|
|
5
|
+
Keywords: retry,resilience,observability,backoff,circuit-breaker
|
|
6
|
+
Author: Abhishek Dasgupta
|
|
7
|
+
Author-email: abhishek_dasgupta@zykrr.com
|
|
8
|
+
Requires-Python: >=3.10,<4.0
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
19
|
+
Requires-Dist: asyncpg (>=0.29,<0.30)
|
|
20
|
+
Requires-Dist: tomli (>=2.0,<3.0) ; python_version < "3.11"
|
|
21
|
+
Project-URL: Homepage, https://github.com/abhishekgit03/resilient-sdk
|
|
22
|
+
Project-URL: Repository, https://github.com/abhishekgit03/resilient-sdk
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
|
|
25
|
+
# resilient-sdk
|
|
26
|
+
|
|
27
|
+
Adaptive retry SDK + CLI for production Python applications.
|
|
28
|
+
|
|
29
|
+
Every developer writes ad-hoc retry logic - fixed attempts, guessed backoff values, no observability. **resilient-sdk** replaces that with a zero-config decorator that learns from your failure history and surfaces actionable insights via a CLI.
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from resilient import retry
|
|
33
|
+
|
|
34
|
+
@retry.auto
|
|
35
|
+
def call_openai(prompt: str):
|
|
36
|
+
return openai.chat.completions.create(...)
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
$ resilient report
|
|
41
|
+
$ resilient explain openai
|
|
42
|
+
$ resilient anomalies
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## How it works
|
|
48
|
+
|
|
49
|
+
- **`@retry.auto`** - wraps any function (sync or async), classifies exceptions automatically, and applies exponential backoff with jitter
|
|
50
|
+
- **Postgres persistence** - every retry event is written to `resilient.events`, multi-pod safe
|
|
51
|
+
- **CLI** - queries that data and gives you plain-English reports powered by Gemini
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Installation
|
|
56
|
+
|
|
57
|
+
### Python SDK
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install resilient-sdk-core
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Requires Python 3.10+ and a Postgres database.
|
|
64
|
+
|
|
65
|
+
### CLI
|
|
66
|
+
|
|
67
|
+
Download the binary from [GitHub Releases](https://github.com/abhishekgit03/resilient-sdk/releases) or install with Go:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
go install github.com/abhishekgit03/resilient-sdk/cli@latest
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## Setup
|
|
76
|
+
|
|
77
|
+
### 1. Configure
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
resilient init --dsn postgresql://user:pass@host/dbname --gemini-key AIza...
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
This writes `~/.resilient/config.toml`. The SDK reads the same file.
|
|
84
|
+
|
|
85
|
+
### 2. Use the decorator
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from resilient import retry
|
|
89
|
+
|
|
90
|
+
# Works with any external call - HTTP, DB, queue
|
|
91
|
+
@retry.auto
|
|
92
|
+
def call_stripe():
|
|
93
|
+
return stripe.PaymentIntent.create(...)
|
|
94
|
+
|
|
95
|
+
@retry.auto
|
|
96
|
+
async def call_openai(prompt: str):
|
|
97
|
+
return await openai.chat.completions.create(...)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### 3. Optional - Circuit Breaker
|
|
101
|
+
|
|
102
|
+
Pair with the circuit breaker to stop retrying a service that's fully down:
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from resilient import retry
|
|
106
|
+
from resilient.circuit import CircuitBreaker
|
|
107
|
+
|
|
108
|
+
cb = CircuitBreaker(failure_threshold=5, recovery_timeout=30)
|
|
109
|
+
|
|
110
|
+
@retry.auto
|
|
111
|
+
@cb.protect
|
|
112
|
+
def call_openai(prompt: str):
|
|
113
|
+
...
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## CLI Commands
|
|
119
|
+
|
|
120
|
+
| Command | Description |
|
|
121
|
+
|---|---|
|
|
122
|
+
| `resilient init --dsn <dsn> --gemini-key <key>` | One-time setup |
|
|
123
|
+
| `resilient report` | Failure summary, last 24h |
|
|
124
|
+
| `resilient report --app openai --last 7d` | Scoped report |
|
|
125
|
+
| `resilient explain <service>` | AI-powered analysis |
|
|
126
|
+
| `resilient explain <service> --last 7d` | Scoped explanation |
|
|
127
|
+
| `resilient anomalies` | Services that spiked vs yesterday |
|
|
128
|
+
| `resilient top` | Worst offenders in the last hour |
|
|
129
|
+
|
|
130
|
+
### Example output
|
|
131
|
+
|
|
132
|
+
```
|
|
133
|
+
$ resilient explain openai
|
|
134
|
+
|
|
135
|
+
Analysing openai (last 7d)...
|
|
136
|
+
|
|
137
|
+
OpenAI calls are failing at 4.2% over the last 7 days, up from 1.1% the week
|
|
138
|
+
before. Failures cluster between 14:00–16:00 UTC. The rate_limit errors suggest
|
|
139
|
+
you are retrying inside the same rate-limit window. Recommendation: add a 60s
|
|
140
|
+
cooldown after 3 consecutive 429s and consider request batching during peak hours.
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## Error Classification
|
|
146
|
+
|
|
147
|
+
The SDK classifies exceptions automatically - no configuration needed.
|
|
148
|
+
|
|
149
|
+
| HTTP Status | Error Type | Strategy |
|
|
150
|
+
|---|---|---|
|
|
151
|
+
| 429 | `rate_limit` | Exponential backoff + jitter, 5 attempts |
|
|
152
|
+
| 500/502/503/504 | `server_error` | Backoff + jitter, 4 attempts |
|
|
153
|
+
| 400/401/403/404/422 | `client_fault` | No retry - fail immediately |
|
|
154
|
+
| Timeout / connection exceptions | `transient` | Short backoff + jitter, 3 attempts |
|
|
155
|
+
|
|
156
|
+
Works with any HTTP library (`httpx`, `requests`, `aiohttp`) without importing them.
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## Database Schema
|
|
161
|
+
|
|
162
|
+
Auto-created on first run:
|
|
163
|
+
|
|
164
|
+
```sql
|
|
165
|
+
resilient.events -- one row per retry attempt
|
|
166
|
+
resilient.stats -- aggregated windows (populated by CLI queries)
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
Compatible with any existing Postgres instance. Uses a dedicated `resilient` schema to avoid conflicts.
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
## Tech Stack
|
|
174
|
+
|
|
175
|
+
| Layer | Technology |
|
|
176
|
+
|---|---|
|
|
177
|
+
| SDK | Python + Poetry |
|
|
178
|
+
| CLI | Go + Cobra |
|
|
179
|
+
| Storage | PostgreSQL |
|
|
180
|
+
| AI | Gemini 2.5 Flash |
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## License
|
|
185
|
+
|
|
186
|
+
MIT
|
|
187
|
+
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# resilient-sdk
|
|
2
|
+
|
|
3
|
+
Adaptive retry SDK + CLI for production Python applications.
|
|
4
|
+
|
|
5
|
+
Every developer writes ad-hoc retry logic - fixed attempts, guessed backoff values, no observability. **resilient-sdk** replaces that with a zero-config decorator that learns from your failure history and surfaces actionable insights via a CLI.
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
from resilient import retry
|
|
9
|
+
|
|
10
|
+
@retry.auto
|
|
11
|
+
def call_openai(prompt: str):
|
|
12
|
+
return openai.chat.completions.create(...)
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
$ resilient report
|
|
17
|
+
$ resilient explain openai
|
|
18
|
+
$ resilient anomalies
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## How it works
|
|
24
|
+
|
|
25
|
+
- **`@retry.auto`** - wraps any function (sync or async), classifies exceptions automatically, and applies exponential backoff with jitter
|
|
26
|
+
- **Postgres persistence** - every retry event is written to `resilient.events`, multi-pod safe
|
|
27
|
+
- **CLI** - queries that data and gives you plain-English reports powered by Gemini
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
|
|
33
|
+
### Python SDK
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install resilient-sdk-core
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Requires Python 3.10+ and a Postgres database.
|
|
40
|
+
|
|
41
|
+
### CLI
|
|
42
|
+
|
|
43
|
+
Download the binary from [GitHub Releases](https://github.com/abhishekgit03/resilient-sdk/releases) or install with Go:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
go install github.com/abhishekgit03/resilient-sdk/cli@latest
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## Setup
|
|
52
|
+
|
|
53
|
+
### 1. Configure
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
resilient init --dsn postgresql://user:pass@host/dbname --gemini-key AIza...
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
This writes `~/.resilient/config.toml`. The SDK reads the same file.
|
|
60
|
+
|
|
61
|
+
### 2. Use the decorator
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
from resilient import retry
|
|
65
|
+
|
|
66
|
+
# Works with any external call - HTTP, DB, queue
|
|
67
|
+
@retry.auto
|
|
68
|
+
def call_stripe():
|
|
69
|
+
return stripe.PaymentIntent.create(...)
|
|
70
|
+
|
|
71
|
+
@retry.auto
|
|
72
|
+
async def call_openai(prompt: str):
|
|
73
|
+
return await openai.chat.completions.create(...)
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### 3. Optional - Circuit Breaker
|
|
77
|
+
|
|
78
|
+
Pair with the circuit breaker to stop retrying a service that's fully down:
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from resilient import retry
|
|
82
|
+
from resilient.circuit import CircuitBreaker
|
|
83
|
+
|
|
84
|
+
cb = CircuitBreaker(failure_threshold=5, recovery_timeout=30)
|
|
85
|
+
|
|
86
|
+
@retry.auto
|
|
87
|
+
@cb.protect
|
|
88
|
+
def call_openai(prompt: str):
|
|
89
|
+
...
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## CLI Commands
|
|
95
|
+
|
|
96
|
+
| Command | Description |
|
|
97
|
+
|---|---|
|
|
98
|
+
| `resilient init --dsn <dsn> --gemini-key <key>` | One-time setup |
|
|
99
|
+
| `resilient report` | Failure summary, last 24h |
|
|
100
|
+
| `resilient report --app openai --last 7d` | Scoped report |
|
|
101
|
+
| `resilient explain <service>` | AI-powered analysis |
|
|
102
|
+
| `resilient explain <service> --last 7d` | Scoped explanation |
|
|
103
|
+
| `resilient anomalies` | Services that spiked vs yesterday |
|
|
104
|
+
| `resilient top` | Worst offenders in the last hour |
|
|
105
|
+
|
|
106
|
+
### Example output
|
|
107
|
+
|
|
108
|
+
```
|
|
109
|
+
$ resilient explain openai
|
|
110
|
+
|
|
111
|
+
Analysing openai (last 7d)...
|
|
112
|
+
|
|
113
|
+
OpenAI calls are failing at 4.2% over the last 7 days, up from 1.1% the week
|
|
114
|
+
before. Failures cluster between 14:00–16:00 UTC. The rate_limit errors suggest
|
|
115
|
+
you are retrying inside the same rate-limit window. Recommendation: add a 60s
|
|
116
|
+
cooldown after 3 consecutive 429s and consider request batching during peak hours.
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## Error Classification
|
|
122
|
+
|
|
123
|
+
The SDK classifies exceptions automatically - no configuration needed.
|
|
124
|
+
|
|
125
|
+
| HTTP Status | Error Type | Strategy |
|
|
126
|
+
|---|---|---|
|
|
127
|
+
| 429 | `rate_limit` | Exponential backoff + jitter, 5 attempts |
|
|
128
|
+
| 500/502/503/504 | `server_error` | Backoff + jitter, 4 attempts |
|
|
129
|
+
| 400/401/403/404/422 | `client_fault` | No retry - fail immediately |
|
|
130
|
+
| Timeout / connection exceptions | `transient` | Short backoff + jitter, 3 attempts |
|
|
131
|
+
|
|
132
|
+
Works with any HTTP library (`httpx`, `requests`, `aiohttp`) without importing them.
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## Database Schema
|
|
137
|
+
|
|
138
|
+
Auto-created on first run:
|
|
139
|
+
|
|
140
|
+
```sql
|
|
141
|
+
resilient.events -- one row per retry attempt
|
|
142
|
+
resilient.stats -- aggregated windows (populated by CLI queries)
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Compatible with any existing Postgres instance. Uses a dedicated `resilient` schema to avoid conflicts.
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Tech Stack
|
|
150
|
+
|
|
151
|
+
| Layer | Technology |
|
|
152
|
+
|---|---|
|
|
153
|
+
| SDK | Python + Poetry |
|
|
154
|
+
| CLI | Go + Cobra |
|
|
155
|
+
| Storage | PostgreSQL |
|
|
156
|
+
| AI | Gemini 2.5 Flash |
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## License
|
|
161
|
+
|
|
162
|
+
MIT
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "resilient-sdk-core"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Adaptive retry SDK with persistent failure memory"
|
|
5
|
+
authors = ["Abhishek Dasgupta <abhishek_dasgupta@zykrr.com>"]
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
homepage = "https://github.com/abhishekgit03/resilient-sdk"
|
|
8
|
+
repository = "https://github.com/abhishekgit03/resilient-sdk"
|
|
9
|
+
keywords = ["retry", "resilience", "observability", "backoff", "circuit-breaker"]
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Development Status :: 3 - Alpha",
|
|
12
|
+
"Intended Audience :: Developers",
|
|
13
|
+
"Topic :: Software Development :: Libraries",
|
|
14
|
+
"License :: OSI Approved :: MIT License",
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Programming Language :: Python :: 3.10",
|
|
17
|
+
"Programming Language :: Python :: 3.11",
|
|
18
|
+
"Programming Language :: Python :: 3.12",
|
|
19
|
+
]
|
|
20
|
+
packages = [{ include = "resilient" }]
|
|
21
|
+
|
|
22
|
+
[tool.poetry.dependencies]
|
|
23
|
+
python = "^3.10"
|
|
24
|
+
asyncpg = "^0.29"
|
|
25
|
+
tomli = { version = "^2.0", python = "<3.11" } # built-in in 3.11+
|
|
26
|
+
|
|
27
|
+
[tool.poetry.group.dev.dependencies]
|
|
28
|
+
pytest = "^8.0"
|
|
29
|
+
pytest-asyncio = "^0.23"
|
|
30
|
+
|
|
31
|
+
[build-system]
|
|
32
|
+
requires = ["poetry-core"]
|
|
33
|
+
build-backend = "poetry.core.masonry.api"
|
|
34
|
+
|
|
35
|
+
[tool.pytest.ini_options]
|
|
36
|
+
testpaths = ["tests"]
|
|
37
|
+
asyncio_mode = "auto"
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import threading
|
|
2
|
+
import time
|
|
3
|
+
from enum import Enum
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class State(Enum):
    """Lifecycle states of a :class:`CircuitBreaker`."""

    CLOSED = "closed"        # normal — calls go through
    OPEN = "open"            # tripped — fail immediately
    HALF_OPEN = "half_open"  # testing recovery — one call allowed


class CircuitBreakerOpen(Exception):
    """Raised when a call is blocked by an open circuit breaker.

    Attributes:
        service: The name the breaker reported (the protected function's name).
    """

    def __init__(self, service: str):
        super().__init__(f"Circuit breaker open for '{service}' — service appears to be down")
        self.service = service


class CircuitBreaker:
    """Reject calls to a failing dependency until a cooldown has elapsed.

    Usage:
        cb = CircuitBreaker(failure_threshold=5, recovery_timeout=30)

        @retry.auto
        @cb.protect
        def call_openai(...): ...

    Args:
        failure_threshold: Number of failures (without a success in between)
            that opens the circuit.
        recovery_timeout: Seconds the circuit stays open before a single
            probe call is admitted (half-open).
    """

    def __init__(
        self,
        failure_threshold: int = 5,      # failures before opening
        recovery_timeout: float = 30.0,  # seconds to wait before half-open
    ):
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout

        self._state = State.CLOSED
        self._failure_count = 0
        self._opened_at: float | None = None
        # True while the one half-open probe call is still running.
        self._probe_in_flight = False
        self._lock = threading.Lock()

    @property
    def state(self) -> State:
        """Current state; an OPEN circuit whose cooldown elapsed reads as HALF_OPEN."""
        with self._lock:
            return self._get_state()

    def _get_state(self) -> State:
        """Return the state, promoting OPEN to HALF_OPEN after the cooldown.

        Callers must hold ``self._lock``.
        """
        if self._state == State.OPEN:
            if time.monotonic() - self._opened_at >= self.recovery_timeout:
                # Cooldown elapsed — move to half-open to test recovery
                self._state = State.HALF_OPEN
        return self._state

    def _admit(self, name: str) -> bool:
        """Decide whether a call may proceed.

        Returns:
            True when the admitted call is the half-open probe, else False.

        Raises:
            CircuitBreakerOpen: The circuit is open, or it is half-open and
                the single probe call is already running.
        """
        with self._lock:
            state = self._get_state()
            if state == State.OPEN:
                raise CircuitBreakerOpen(name)
            if state == State.HALF_OPEN:
                if self._probe_in_flight:
                    raise CircuitBreakerOpen(name)
                self._probe_in_flight = True
                return True
            return False

    def _release_probe(self) -> None:
        """Free the probe slot without recording a success or a failure."""
        with self._lock:
            self._probe_in_flight = False

    def protect(self, fn):
        """Decorator that wraps a function with circuit breaker protection.

        Plain functions and coroutine functions are both supported; for a
        coroutine function the outcome is recorded when the awaited call
        finishes, not when the coroutine object is created.
        """
        import functools
        import inspect

        name = getattr(fn, "__name__", repr(fn))

        if inspect.iscoroutinefunction(fn):

            @functools.wraps(fn)
            async def async_inner(*args, **kwargs):
                is_probe = self._admit(name)
                try:
                    result = await fn(*args, **kwargs)
                except Exception:
                    self._on_failure()
                    raise
                except BaseException:
                    # Cancellation / interpreter shutdown says nothing about
                    # the service; just hand the probe slot back.
                    if is_probe:
                        self._release_probe()
                    raise
                self._on_success()
                return result

            return async_inner

        @functools.wraps(fn)
        def inner(*args, **kwargs):
            is_probe = self._admit(name)
            try:
                result = fn(*args, **kwargs)
            except Exception:
                self._on_failure()
                raise
            except BaseException:
                if is_probe:
                    self._release_probe()
                raise
            self._on_success()
            return result

        return inner

    def _on_success(self):
        """Record a successful call: reset the counter and close the circuit."""
        with self._lock:
            self._failure_count = 0
            self._state = State.CLOSED
            self._probe_in_flight = False

    def _on_failure(self):
        """Record a failed call; open the circuit on threshold or failed probe."""
        with self._lock:
            self._failure_count += 1
            self._probe_in_flight = False
            if self._state == State.HALF_OPEN or self._failure_count >= self.failure_threshold:
                self._state = State.OPEN
                self._opened_at = time.monotonic()
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
@dataclass
|
|
5
|
+
class RetryStrategy:
|
|
6
|
+
error_type: str
|
|
7
|
+
should_retry: bool
|
|
8
|
+
base_delay: float
|
|
9
|
+
max_attempts: int
|
|
10
|
+
use_jitter: bool
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
_STRATEGIES: dict[str, RetryStrategy] = {
|
|
14
|
+
"rate_limit": RetryStrategy(
|
|
15
|
+
error_type="rate_limit",
|
|
16
|
+
should_retry=True,
|
|
17
|
+
base_delay=2.0,
|
|
18
|
+
max_attempts=5,
|
|
19
|
+
use_jitter=True,
|
|
20
|
+
),
|
|
21
|
+
"server_error": RetryStrategy(
|
|
22
|
+
error_type="server_error",
|
|
23
|
+
should_retry=True,
|
|
24
|
+
base_delay=1.0,
|
|
25
|
+
max_attempts=4,
|
|
26
|
+
use_jitter=True,
|
|
27
|
+
),
|
|
28
|
+
"transient": RetryStrategy(
|
|
29
|
+
error_type="transient",
|
|
30
|
+
should_retry=True,
|
|
31
|
+
base_delay=0.5,
|
|
32
|
+
max_attempts=3,
|
|
33
|
+
use_jitter=True,
|
|
34
|
+
),
|
|
35
|
+
"client_fault": RetryStrategy(
|
|
36
|
+
error_type="client_fault",
|
|
37
|
+
should_retry=False,
|
|
38
|
+
base_delay=0.0,
|
|
39
|
+
max_attempts=1,
|
|
40
|
+
use_jitter=False,
|
|
41
|
+
),
|
|
42
|
+
"unknown": RetryStrategy(
|
|
43
|
+
error_type="unknown",
|
|
44
|
+
should_retry=True,
|
|
45
|
+
base_delay=1.0,
|
|
46
|
+
max_attempts=3,
|
|
47
|
+
use_jitter=True,
|
|
48
|
+
),
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def classify(exc: Exception) -> RetryStrategy:
|
|
53
|
+
"""Map an exception to a retry strategy."""
|
|
54
|
+
|
|
55
|
+
status = _extract_status_code(exc)
|
|
56
|
+
if status is not None:
|
|
57
|
+
if status == 429:
|
|
58
|
+
return _STRATEGIES["rate_limit"]
|
|
59
|
+
if status in (500, 502, 503, 504):
|
|
60
|
+
return _STRATEGIES["server_error"]
|
|
61
|
+
if status in (400, 401, 403, 404, 422):
|
|
62
|
+
return _STRATEGIES["client_fault"]
|
|
63
|
+
|
|
64
|
+
type_name = type(exc).__name__.lower()
|
|
65
|
+
if any(kw in type_name for kw in ("timeout", "timedout", "timed_out")):
|
|
66
|
+
return _STRATEGIES["transient"]
|
|
67
|
+
if any(kw in type_name for kw in ("connection", "network", "connect")):
|
|
68
|
+
return _STRATEGIES["transient"]
|
|
69
|
+
|
|
70
|
+
return _STRATEGIES["unknown"]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _extract_status_code(exc: Exception) -> int | None:
|
|
74
|
+
"""Try common attribute names used by different HTTP libraries."""
|
|
75
|
+
for attr in ("status_code", "status", "code", "response"):
|
|
76
|
+
val = getattr(exc, attr, None)
|
|
77
|
+
if isinstance(val, int):
|
|
78
|
+
return val
|
|
79
|
+
if hasattr(val, "status_code"):
|
|
80
|
+
return val.status_code
|
|
81
|
+
return None
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
if sys.version_info >= (3, 11):
|
|
6
|
+
import tomllib
|
|
7
|
+
else:
|
|
8
|
+
import tomli as tomllib
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
_config: dict | None = None
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_config() -> dict:
|
|
15
|
+
global _config
|
|
16
|
+
if _config is not None:
|
|
17
|
+
return _config
|
|
18
|
+
|
|
19
|
+
config_path = Path(os.getenv("RESILIENT_CONFIG", Path.home() / ".resilient" / "config.toml"))
|
|
20
|
+
|
|
21
|
+
if config_path.exists():
|
|
22
|
+
with open(config_path, "rb") as f:
|
|
23
|
+
_config = tomllib.load(f)
|
|
24
|
+
else:
|
|
25
|
+
_config = {}
|
|
26
|
+
|
|
27
|
+
if dsn := os.getenv("RESILIENT_DSN"):
|
|
28
|
+
_config["dsn"] = dsn
|
|
29
|
+
|
|
30
|
+
return _config
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import asyncio
import functools
import inspect
import random
import threading
import time
from typing import Any, Callable

from .classifier import RetryStrategy, classify
from .store import record_event
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class RetryManager:
    """Entry point for the retry decorator.

    Usage:
        @retry.auto
        def call_openai(...): ...

        @retry.auto
        async def call_stripe(...): ...
    """

    def auto(self, fn: Callable) -> Callable:
        """Wrap ``fn`` with the async or sync retry wrapper, depending on its kind."""
        wrap = _async_wrapper if inspect.iscoroutinefunction(fn) else _sync_wrapper
        return wrap(fn)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
#sync wrapper
|
|
29
|
+
def _sync_wrapper(fn: Callable) -> Callable:
    """Wrap a synchronous callable with retry, classification and event recording.

    Every attempt is timed and recorded. On failure the exception is
    classified; it is re-raised when the strategy forbids retrying or the
    attempt count reached the strategy's ``max_attempts``, otherwise the
    wrapper sleeps for the backoff delay and tries again.

    Only the call to ``fn`` sits inside the ``try``: recording a success can
    therefore never be mistaken for a failure of ``fn`` and trigger a second
    execution of a call that already succeeded.
    """

    @functools.wraps(fn)
    def inner(*args: Any, **kwargs: Any) -> Any:
        # Service label is the top-level package of the decorated function.
        service = fn.__module__.split(".")[0]
        attempt = 0

        while True:
            attempt += 1
            t0 = time.monotonic()
            try:
                result = fn(*args, **kwargs)
            except Exception as exc:
                duration_ms = int((time.monotonic() - t0) * 1000)
                strategy = classify(exc)

                _fire_and_forget(record_event(service, fn.__name__, attempt, "failure", strategy.error_type, duration_ms))

                if not strategy.should_retry or attempt >= strategy.max_attempts:
                    raise

                time.sleep(_backoff(strategy, attempt))
            else:
                duration_ms = int((time.monotonic() - t0) * 1000)
                _fire_and_forget(record_event(service, fn.__name__, attempt, "success", None, duration_ms))
                return result

    return inner
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
#async wrapper
|
|
60
|
+
def _async_wrapper(fn: Callable) -> Callable:
    """Wrap a coroutine function with retry, classification and event recording.

    Every attempt is timed and its event is awaited. On failure the exception
    is classified; it is re-raised when the strategy forbids retrying or the
    attempt count reached the strategy's ``max_attempts``, otherwise the
    wrapper awaits the backoff delay and tries again.

    Only the awaited call to ``fn`` sits inside the ``try``: recording a
    success can therefore never be mistaken for a failure of ``fn`` and
    trigger a second execution of a call that already succeeded.
    """

    @functools.wraps(fn)
    async def inner(*args: Any, **kwargs: Any) -> Any:
        # Service label is the top-level package of the decorated function.
        service = fn.__module__.split(".")[0]
        attempt = 0

        while True:
            attempt += 1
            t0 = time.monotonic()
            try:
                result = await fn(*args, **kwargs)
            except Exception as exc:
                duration_ms = int((time.monotonic() - t0) * 1000)
                strategy = classify(exc)

                await record_event(service, fn.__name__, attempt, "failure", strategy.error_type, duration_ms)

                if not strategy.should_retry or attempt >= strategy.max_attempts:
                    raise

                await asyncio.sleep(_backoff(strategy, attempt))
            else:
                duration_ms = int((time.monotonic() - t0) * 1000)
                await record_event(service, fn.__name__, attempt, "success", None, duration_ms)
                return result

    return inner
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _backoff(strategy: RetryStrategy, attempt: int) -> float:
|
|
91
|
+
"""Exponential backoff: base * 2^attempt, plus jitter up to base seconds."""
|
|
92
|
+
delay = strategy.base_delay * (2 ** (attempt - 1))
|
|
93
|
+
if strategy.use_jitter:
|
|
94
|
+
delay += random.uniform(0, strategy.base_delay)
|
|
95
|
+
return delay
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _fire_and_forget(coro: Any) -> None:
|
|
99
|
+
"""Run an async coroutine from sync context without blocking."""
|
|
100
|
+
try:
|
|
101
|
+
loop = asyncio.get_running_loop()
|
|
102
|
+
loop.create_task(coro)
|
|
103
|
+
except RuntimeError:
|
|
104
|
+
asyncio.run(coro)
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
import asyncpg
|
|
7
|
+
|
|
8
|
+
from .config import get_config
|
|
9
|
+
|
|
10
|
+
# Module logger; record_event reports swallowed database errors through it at DEBUG level.
logger = logging.getLogger(__name__)

# Shared asyncpg pool, created lazily by _get_pool(); None until then.
_pool: asyncpg.Pool | None = None
# Held by _get_pool() while it creates the pool, so concurrent first callers build only one.
_pool_lock = asyncio.Lock()
|
|
14
|
+
|
|
15
|
+
_SCHEMA_SQL = """
|
|
16
|
+
CREATE SCHEMA IF NOT EXISTS resilient;
|
|
17
|
+
|
|
18
|
+
CREATE TABLE IF NOT EXISTS resilient.events (
|
|
19
|
+
id BIGSERIAL PRIMARY KEY,
|
|
20
|
+
service TEXT NOT NULL,
|
|
21
|
+
fn TEXT NOT NULL,
|
|
22
|
+
attempt INT NOT NULL,
|
|
23
|
+
status TEXT NOT NULL, -- 'success' | 'failure'
|
|
24
|
+
error_type TEXT, -- NULL on success
|
|
25
|
+
duration_ms INT NOT NULL,
|
|
26
|
+
ts TIMESTAMPTZ NOT NULL DEFAULT now()
|
|
27
|
+
);
|
|
28
|
+
|
|
29
|
+
CREATE INDEX IF NOT EXISTS idx_events_service_ts
|
|
30
|
+
ON resilient.events (service, ts DESC);
|
|
31
|
+
|
|
32
|
+
CREATE TABLE IF NOT EXISTS resilient.stats (
|
|
33
|
+
id BIGSERIAL PRIMARY KEY,
|
|
34
|
+
service TEXT NOT NULL,
|
|
35
|
+
window TEXT NOT NULL, -- e.g. '1h', '24h', '7d'
|
|
36
|
+
failure_rate FLOAT NOT NULL,
|
|
37
|
+
p95_latency INT, -- ms
|
|
38
|
+
peak_hour INT, -- 0-23 UTC
|
|
39
|
+
computed_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
|
40
|
+
);
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
async def _get_pool() -> asyncpg.Pool:
    """Return the shared pool, creating it (and the schema) on first call.

    The pool is published in ``_pool`` only after the schema bootstrap has
    been attempted, so callers taking the lock-free fast path never insert
    before the bootstrap ran. A failing bootstrap is logged as a warning and
    the pool is kept: the tables may already exist even when this role is not
    allowed to run the DDL.

    Raises:
        RuntimeError: No DSN is configured.
    """
    global _pool
    if _pool is not None:
        return _pool

    async with _pool_lock:
        # Re-check: another task may have created the pool while we waited.
        if _pool is not None:
            return _pool

        # .get(): a missing key must reach the explicit error below, not KeyError.
        dsn = get_config().get("dsn")
        if not dsn:
            raise RuntimeError(
                "resilient-sdk: no DSN configured. "
                "Set RESILIENT_DSN env var or add 'dsn' to config.toml."
            )

        pool = await asyncpg.create_pool(dsn=dsn, min_size=2, max_size=10)
        try:
            async with pool.acquire() as conn:
                await conn.execute(_SCHEMA_SQL)
        except Exception:
            logger.warning("resilient: schema bootstrap failed", exc_info=True)
        except BaseException:
            # Cancelled / shutting down: do not leak the unpublished pool.
            pool.terminate()
            raise

        _pool = pool
        return _pool
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
async def record_event(
    service: str,
    fn: str,
    attempt: int,
    status: str,
    error_type: Optional[str],
    duration_ms: int,
) -> None:
    """Insert one attempt into ``resilient.events``.

    The row is stamped with the current UTC time. Any exception raised while
    obtaining the pool or executing the insert is logged at DEBUG level and
    swallowed, so telemetry problems never reach the decorated function.
    """
    insert_sql = """
                INSERT INTO resilient.events
                    (service, fn, attempt, status, error_type, duration_ms, ts)
                VALUES ($1, $2, $3, $4, $5, $6, $7)
                """
    try:
        pool = await _get_pool()
        async with pool.acquire() as conn:
            row = (
                service,
                fn,
                attempt,
                status,
                error_type,
                duration_ms,
                datetime.now(timezone.utc),
            )
            await conn.execute(insert_sql, *row)
    except Exception:
        # DB being down must never crash the decorated function
        logger.debug("resilient: failed to record event", exc_info=True)
|