nullrun 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nullrun/__init__.py +282 -0
- nullrun/__version__.py +4 -0
- nullrun/actions.py +455 -0
- nullrun/breaker/__init__.py +27 -0
- nullrun/breaker/circuit_breaker.py +402 -0
- nullrun/breaker/exceptions.py +319 -0
- nullrun/context.py +208 -0
- nullrun/decorators.py +649 -0
- nullrun/instrumentation/__init__.py +23 -0
- nullrun/instrumentation/_safe_patch.py +99 -0
- nullrun/instrumentation/auto.py +1095 -0
- nullrun/instrumentation/auto_requests.py +257 -0
- nullrun/instrumentation/autogen.py +163 -0
- nullrun/instrumentation/crewai.py +140 -0
- nullrun/instrumentation/langgraph.py +412 -0
- nullrun/instrumentation/llama_index.py +110 -0
- nullrun/observability.py +160 -0
- nullrun/py.typed +0 -0
- nullrun/runtime.py +1806 -0
- nullrun/toolbox/__init__.py +20 -0
- nullrun/toolbox/langgraph.py +94 -0
- nullrun/tracing.py +155 -0
- nullrun/transport.py +1509 -0
- nullrun/transport_websocket.py +627 -0
- nullrun-0.4.0.dist-info/METADATA +194 -0
- nullrun-0.4.0.dist-info/RECORD +28 -0
- nullrun-0.4.0.dist-info/WHEEL +4 -0
- nullrun-0.4.0.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,402 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Circuit breaker implementation for NullRun SDK.
|
|
3
|
+
|
|
4
|
+
Provides a proper three-state circuit breaker (CLOSED/OPEN/HALF_OPEN).
|
|
5
|
+
Supports distributed state sharing via Redis for multi-worker deployments.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
import logging
|
|
10
|
+
import random
|
|
11
|
+
import threading
|
|
12
|
+
import time
|
|
13
|
+
from collections.abc import Callable
|
|
14
|
+
from enum import Enum
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
from nullrun.breaker.exceptions import BreakerTransportError
|
|
21
|
+
from nullrun.observability import metrics
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class CBState(Enum):
    """Lifecycle states of a circuit breaker; each value is its lowercase label."""

    # Calls flow through normally.
    CLOSED = "closed"
    # Calls are rejected until the recovery timeout elapses.
    OPEN = "open"
    # A limited number of probe calls are let through to test recovery.
    HALF_OPEN = "half_open"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class CircuitBreakerMetrics:
    """Metrics for circuit breaker observability.

    A plain bag of counters; every field starts at zero and is mutated
    directly by whoever owns the instance.
    """

    def __init__(self):
        # Number of transitions into each state.
        self.circuit_open_count = self.circuit_half_open_count = self.circuit_closed_count = 0

        # Outcomes of calls routed through the breaker.
        self.total_failure_count = self.total_success_count = 0

        # Accumulated time spent in HALF_OPEN (seconds) and the number of
        # samples, kept separately so an average can be derived.
        self.half_open_duration_sum = 0.0
        self.half_open_duration_count = 0

        # Fallback activation tally.
        self.fallback_activations = 0
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class CircuitBreaker:
    """
    Full-featured circuit breaker with three states.

    CLOSED -> (failures >= threshold) -> OPEN
    OPEN -> (timeout elapsed) -> HALF_OPEN
    HALF_OPEN -> (success) -> CLOSED
    HALF_OPEN -> (failure) -> OPEN

    Supports distributed state sharing via Redis for multi-worker deployments.
    When one worker opens the circuit, all workers see it via Redis.

    Local state is guarded by a single ``threading.Lock`` on both the sync
    and the async paths, so mixed usage of one instance stays consistent.
    """

    # Upper bound, in seconds, of the random delay ``call`` applies once the
    # recovery timeout has elapsed. Class-level so it can be overridden.
    _MAX_RECOVERY_JITTER = 5.0

    def __init__(
        self,
        failure_threshold: int = 5,
        recovery_timeout: float = 30.0,
        half_open_max_calls: int = 1,
        redis_client: Any | None = None,
        name: str = "default",
    ):
        """
        Create a breaker in the CLOSED state.

        Args:
            failure_threshold: Failure count at which the circuit opens.
            recovery_timeout: Seconds after the last failure before an OPEN
                circuit moves to HALF_OPEN.
            half_open_max_calls: Probe calls admitted while HALF_OPEN.
            redis_client: Optional client used to share state between
                workers; it must provide ``get``, ``setex`` and ``delete``.
                ``None`` disables sharing.
            name: Breaker name, embedded in the Redis key prefix.
        """
        self._failure_threshold = failure_threshold
        self._recovery_timeout = recovery_timeout
        self._half_open_max_calls = half_open_max_calls

        # Redis-based distributed state sharing
        self._redis_client = redis_client
        self._redis_key_prefix = f"cb:{name}:"
        self._state_ttl = 60  # seconds - state expires if not refreshed

        self._state = CBState.CLOSED
        self._failure_count = 0
        self._last_failure_time: float | None = None
        self._opened_at: float | None = None  # Track when circuit last opened
        self._half_open_calls = 0
        self._half_open_start: float | None = None  # Track half-open entry time
        self._lock = threading.Lock()
        self._async_lock: asyncio.Lock | None = None  # Lazily created

        # Metrics
        self._metrics = CircuitBreakerMetrics()
        self.total_failures = 0
        self.total_opens = 0
        self.total_successes = 0

    def _get_async_lock(self) -> asyncio.Lock:
        """
        Get or create async lock. Must be called from async context.

        Kept for backward compatibility only: the breaker's own async
        handlers use the thread lock, which also excludes the sync paths.
        """
        if self._async_lock is None:
            self._async_lock = asyncio.Lock()
        return self._async_lock

    # =============================================================================
    # Redis-based distributed state sharing
    # =============================================================================

    def _state_key(self) -> str:
        """Return the Redis key under which the shared state is stored."""
        return f"{self._redis_key_prefix}state"

    @staticmethod
    def _normalize_state(raw: Any) -> Any:
        """
        Turn a raw Redis reply into a comparable value.

        Empty replies become ``None`` and ``bytes`` replies are decoded, so
        comparisons against ``"OPEN"`` / ``"HALF_OPEN"`` / ``"CLOSED"`` work
        whether or not the client decodes responses itself. Other values are
        returned unchanged.
        """
        if not raw:
            return None
        if isinstance(raw, (bytes, bytearray)):
            return bytes(raw).decode("utf-8", errors="replace")
        return raw

    def _check_global_state(self) -> str | None:
        """
        Read the shared circuit state from Redis.

        Returns the stored state as text (this class publishes 'OPEN' and
        'HALF_OPEN'), or None when there is no Redis client, no stored
        state, or the read fails (the failure is logged, not raised).
        """
        if not self._redis_client:
            return None
        try:
            return self._normalize_state(self._redis_client.get(self._state_key()))
        except Exception as e:
            logger.warning("Redis state check failed: %s", e)
            return None

    def _check_global_recovered(self) -> bool:
        """
        Check if another instance recovered the circuit (closed it in Redis).

        Returns True only if the stored state is 'CLOSED'; returns False
        without Redis or when the read fails (the failure is logged).
        """
        if not self._redis_client:
            return False
        try:
            stored = self._normalize_state(self._redis_client.get(self._state_key()))
            return stored == "CLOSED"
        except Exception as e:
            logger.warning("Redis recovery check failed: %s", e)
            return False

    def _publish_state(self, label: str) -> None:
        """Best-effort write of ``label`` to Redis with the state TTL."""
        if not self._redis_client:
            return
        try:
            self._redis_client.setex(self._state_key(), self._state_ttl, label)
        except Exception as e:
            logger.warning("Redis publish %s state failed: %s", label, e)

    def _publish_open_state(self) -> None:
        """Publish OPEN state to Redis with TTL."""
        self._publish_state("OPEN")

    def _publish_half_open_state(self) -> None:
        """Publish HALF_OPEN state to Redis with TTL."""
        self._publish_state("HALF_OPEN")

    def _clear_global_state(self) -> None:
        """Clear global state from Redis when circuit closes."""
        if not self._redis_client:
            return
        try:
            self._redis_client.delete(self._state_key())
        except Exception as e:
            logger.warning("Redis clear state failed: %s", e)

    # =============================================================================
    # State transition helpers
    # =============================================================================

    def _global_state_allows_call(self) -> bool:
        """
        Check if global Redis state allows this call to proceed.

        If Redis says OPEN, reject immediately.
        If Redis says HALF_OPEN, allow up to half_open_max_calls.
        If Redis says CLOSED, sync local state to CLOSED and allow.
        With no state, or an unrecognised value, allow the call and leave
        local state untouched so the local logic decides.
        """
        global_state = self._check_global_state()
        if global_state is None:
            return True  # No global state, local logic applies

        if global_state == "OPEN":
            return False  # Another instance has it open

        if global_state == "HALF_OPEN":
            # Allow if we haven't exhausted our half-open attempts
            with self._lock:
                return self._half_open_calls < self._half_open_max_calls

        if global_state == "CLOSED":
            # Another instance recovered, sync local
            with self._lock:
                self._state = CBState.CLOSED
                self._failure_count = 0
            return True

        # Unknown payload: never reset the local breaker based on a value
        # we cannot interpret.
        return True

    def _on_state_change(self, old_state: CBState, new_state: CBState) -> None:
        """Record state transition metrics."""
        if new_state == CBState.OPEN:
            metrics.inc_transport("circuit_open_count")
            # Sprint 3 follow-up (B24): also bump the
            # ``circuit_breaker_opens`` global counter on
            # ``TransportMetrics`` (was 0-call). This is the
            # cross-CB-instance counter -- the operator alerts
            # on its rate, not on the per-CB ``circuit_open_count``.
            metrics.inc_transport("circuit_breaker_opens")
            self._metrics.circuit_open_count += 1
        elif new_state == CBState.HALF_OPEN:
            metrics.inc_transport("circuit_half_open_count")
            self._metrics.circuit_half_open_count += 1
        elif new_state == CBState.CLOSED and old_state != CBState.CLOSED:
            metrics.inc_transport("circuit_closed_count")
            self._metrics.circuit_closed_count += 1

    def _on_half_open(self) -> None:
        """Record half-open state entry."""
        self._half_open_start = time.monotonic()

    def _on_closed(self) -> None:
        """Record circuit closure and half-open duration."""
        # ``is not None`` rather than truthiness: 0.0 is a valid timestamp.
        if self._half_open_start is not None:
            duration = time.monotonic() - self._half_open_start
            self._metrics.half_open_duration_sum += duration
            self._metrics.half_open_duration_count += 1
            self._half_open_start = None

    @property
    def state(self) -> CBState:
        """
        Current state, moving OPEN -> HALF_OPEN once the timeout has elapsed.

        Phase 0.3.1: the lock is held for the whole transition so
        concurrent threads do not race into HALF_OPEN, and HALF_OPEN is
        published to Redis so other workers see it via
        ``_check_global_state``.
        """
        with self._lock:
            if self._state == CBState.OPEN:
                if (
                    self._last_failure_time is not None
                    and time.monotonic() - self._last_failure_time >= self._recovery_timeout
                ):
                    old_state = self._state
                    self._state = CBState.HALF_OPEN
                    self._half_open_calls = 0
                    self._on_state_change(old_state, self._state)
                    self._on_half_open()
                    # Publish the new state so other workers see
                    # HALF_OPEN in Redis and respect
                    # _half_open_max_calls.
                    self._publish_half_open_state()
            return self._state

    def call(self, func: Callable[..., Any], *args, **kwargs) -> Any:
        """
        Execute func through circuit breaker. Supports both sync and async functions.

        Admission checks run synchronously. For a coroutine function the
        un-awaited coroutine from ``_call_async`` is returned; otherwise
        the result of ``func`` is returned.

        Raises:
            BreakerTransportError: If the global or local state is OPEN, or
                the HALF_OPEN probe budget is exhausted.
            Exception: Whatever ``func`` raises, after it is recorded as a
                failure.

        NOTE(review): the jitter below uses ``time.sleep`` before dispatch,
        so it also blocks when ``call`` is invoked from an event-loop
        thread. A probe that ends with a ``BaseException`` that is not an
        ``Exception`` (e.g. ``asyncio.CancelledError``) records neither
        success nor failure, so its HALF_OPEN slot stays consumed.
        """

        # Check global Redis state first - reject if another instance has it open
        if not self._global_state_allows_call():
            raise BreakerTransportError(
                f"Circuit breaker OPEN (global) -- service unavailable. "
                f"Retry in {self._recovery_timeout:.0f}s"
            )

        # Add jitter before transitioning from OPEN to HALF_OPEN to prevent
        # thundering herd. Phase 8: capped at 5s (was 30s), because the
        # delay blocks the caller's thread.
        with self._lock:
            opened_at = self._opened_at if self._state == CBState.OPEN else None
        if opened_at is not None and time.monotonic() - opened_at >= self._recovery_timeout:
            time.sleep(random.uniform(0, self._MAX_RECOVERY_JITTER))

        state = self.state

        if state == CBState.OPEN:
            raise BreakerTransportError(
                f"Circuit breaker OPEN -- service unavailable. "
                f"Retry in {self._recovery_timeout:.0f}s"
            )

        if state == CBState.HALF_OPEN:
            with self._lock:
                if self._half_open_calls >= self._half_open_max_calls:
                    raise BreakerTransportError("Circuit breaker HALF_OPEN -- waiting")
                self._half_open_calls += 1

        # Check if func is a coroutine function (async)
        import inspect

        if inspect.iscoroutinefunction(func):
            return self._call_async(func, *args, **kwargs)
        return self._call_sync(func, *args, **kwargs)

    def _call_sync(self, func: Callable[..., Any], *args, **kwargs) -> Any:
        """Execute sync func through circuit breaker, recording the outcome."""
        try:
            result = func(*args, **kwargs)
            self._on_success()
            return result
        except Exception:
            self._on_failure()
            raise

    async def _call_async(self, func: Callable[..., Any], *args, **kwargs) -> Any:
        """Execute async func through circuit breaker, recording the outcome."""
        try:
            result = await func(*args, **kwargs)
            await self._on_success_async()
            return result
        except Exception:
            await self._on_failure_async()
            raise

    def _on_success(self) -> None:
        """Record a successful call and close the circuit."""
        with self._lock:
            # Read the previous state under the lock so the transition
            # metrics cannot be computed from a stale value.
            old_state = self._state
            self._state = CBState.CLOSED
            self._failure_count = 0
            # The probe budget belongs to the HALF_OPEN episode that just
            # ended; ``_global_state_allows_call`` reads it regardless of
            # local state, so a stale value must not linger.
            self._half_open_calls = 0
            self.total_successes += 1
            self._metrics.total_success_count += 1
            self._on_state_change(old_state, CBState.CLOSED)
            self._on_closed()

        # Update Redis - clear OPEN state since we recovered
        if self._redis_client:
            self._clear_global_state()

    def _on_failure(self) -> None:
        """Record a failed call, opening the circuit when warranted."""
        with self._lock:
            old_state = self._state
            self._failure_count += 1
            self._last_failure_time = time.monotonic()
            self.total_failures += 1
            self._metrics.total_failure_count += 1
            # A failed HALF_OPEN probe re-opens the circuit by contract;
            # do not rely on a leftover failure count for that.
            probe_failed = old_state == CBState.HALF_OPEN
            if probe_failed or self._failure_count >= self._failure_threshold:
                if old_state != CBState.OPEN:
                    self.total_opens += 1
                    self._on_state_change(old_state, CBState.OPEN)
                self._state = CBState.OPEN
                self._opened_at = time.monotonic()
            is_open = self._state == CBState.OPEN

        # Publish OPEN state to Redis so other workers see it. Done outside
        # the lock; while OPEN, each failure republishes and renews the TTL.
        if self._redis_client and is_open:
            self._publish_open_state()

    async def _on_success_async(self) -> None:
        """
        Async-safe success handler.

        Delegates to ``_on_success`` so the sync and async paths share one
        lock. The critical section never awaits; the Redis call that
        follows is synchronous.
        """
        self._on_success()

    async def _on_failure_async(self) -> None:
        """
        Async-safe failure handler.

        Delegates to ``_on_failure`` so the sync and async paths share one
        lock.
        """
        self._on_failure()

    def get_metrics(self) -> dict:
        """
        Return a snapshot of the breaker's counters.

        Reading ``state`` may itself trigger the OPEN -> HALF_OPEN
        transition. ``avg_half_open_duration`` is 0 until at least one
        HALF_OPEN period has been closed.
        """
        current = self.state
        with self._lock:
            stats = self._metrics
            samples = stats.half_open_duration_count
            return {
                "state": current.value,
                "failure_count": self._failure_count,
                "total_failures": self.total_failures,
                "total_opens": self.total_opens,
                "total_successes": self.total_successes,
                "circuit_open_count": stats.circuit_open_count,
                "circuit_half_open_count": stats.circuit_half_open_count,
                "circuit_closed_count": stats.circuit_closed_count,
                "fallback_activations": stats.fallback_activations,
                "avg_half_open_duration": (
                    stats.half_open_duration_sum / samples if samples > 0 else 0
                ),
            }
|