nullrun 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,402 @@
1
+ """
2
+ Circuit breaker implementation for NullRun SDK.
3
+
4
+ Provides a proper three-state circuit breaker (CLOSED/OPEN/HALF_OPEN).
5
+ Supports distributed state sharing via Redis for multi-worker deployments.
6
+ """
7
+
8
+ import asyncio
9
+ import logging
10
+ import random
11
+ import threading
12
+ import time
13
+ from collections.abc import Callable
14
+ from enum import Enum
15
+ from typing import Any
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ from nullrun.breaker.exceptions import BreakerTransportError
21
+ from nullrun.observability import metrics
22
+
23
+
24
class CBState(Enum):
    """The three states a circuit breaker can be in."""

    # Calls flow through normally.
    CLOSED = "closed"
    # Calls are rejected without being attempted.
    OPEN = "open"
    # A limited number of probe calls is let through.
    HALF_OPEN = "half_open"
28
+
29
+
30
class CircuitBreakerMetrics:
    """Mutable bag of counters describing one circuit breaker's history.

    Every counter starts at zero; the owning breaker increments them.
    """

    def __init__(self):
        # How many times each state has been entered.
        self.circuit_open_count = 0
        self.circuit_half_open_count = 0
        self.circuit_closed_count = 0

        # Outcomes of the calls that were actually executed.
        self.total_failure_count = 0
        self.total_success_count = 0

        # Sum (seconds) and sample count, kept separately so an average
        # can be derived on demand.
        self.half_open_duration_sum = 0.0
        self.half_open_duration_count = 0

        # Number of fallback activations.
        self.fallback_activations = 0
42
+
43
+
44
+ class CircuitBreaker:
45
+ """
46
+ Full-featured circuit breaker with three states.
47
+
48
+ CLOSED -> (failures >= threshold) -> OPEN
49
+ OPEN -> (timeout elapsed) -> HALF_OPEN
50
+ HALF_OPEN -> (success) -> CLOSED
51
+ HALF_OPEN -> (failure) -> OPEN
52
+
53
+ Supports distributed state sharing via Redis for multi-worker deployments.
54
+ When one worker opens the circuit, all workers see it via Redis.
55
+ """
56
+
57
+ def __init__(
58
+ self,
59
+ failure_threshold: int = 5,
60
+ recovery_timeout: float = 30.0,
61
+ half_open_max_calls: int = 1,
62
+ redis_client: Any | None = None,
63
+ name: str = "default",
64
+ ):
65
+ self._failure_threshold = failure_threshold
66
+ self._recovery_timeout = recovery_timeout
67
+ self._half_open_max_calls = half_open_max_calls
68
+
69
+ # Redis-based distributed state sharing
70
+ self._redis_client = redis_client
71
+ self._redis_key_prefix = f"cb:{name}:"
72
+ self._state_ttl = 60 # seconds - state expires if not refreshed
73
+
74
+ self._state = CBState.CLOSED
75
+ self._failure_count = 0
76
+ self._last_failure_time: float | None = None
77
+ self._opened_at: float | None = None # Track when circuit last opened
78
+ self._half_open_calls = 0
79
+ self._half_open_start: float | None = None # Track half-open entry time
80
+ self._lock = threading.Lock()
81
+ self._async_lock: asyncio.Lock | None = None # Lazily created
82
+
83
+ # Metrics
84
+ self._metrics = CircuitBreakerMetrics()
85
+ self.total_failures = 0
86
+ self.total_opens = 0
87
+ self.total_successes = 0
88
+
89
+ def _get_async_lock(self) -> asyncio.Lock:
90
+ """Get or create async lock. Must be called from async context."""
91
+ if self._async_lock is None:
92
+ self._async_lock = asyncio.Lock()
93
+ return self._async_lock
94
+
95
+ # =============================================================================
96
+ # Redis-based distributed state sharing
97
+ # =============================================================================
98
+
99
+ def _check_global_state(self) -> str | None:
100
+ """
101
+ Check if any instance has the circuit open in Redis.
102
+
103
+ Returns 'OPEN', 'HALF_OPEN', 'CLOSED', or None if no global state exists.
104
+ """
105
+ if not self._redis_client:
106
+ return None
107
+ try:
108
+ key = f"{self._redis_key_prefix}state"
109
+ state = self._redis_client.get(key)
110
+ return state if state else None
111
+ except Exception as e:
112
+ logger.warning(f"Redis state check failed: {e}")
113
+ return None
114
+
115
+ def _check_global_recovered(self) -> bool:
116
+ """
117
+ Check if another instance recovered the circuit (closed it in Redis).
118
+
119
+ Returns True if another instance closed the circuit.
120
+ """
121
+ if not self._redis_client:
122
+ return False
123
+ try:
124
+ key = f"{self._redis_key_prefix}state"
125
+ state = self._redis_client.get(key)
126
+ return state == "CLOSED"
127
+ except Exception as e:
128
+ logger.warning(f"Redis recovery check failed: {e}")
129
+ return False
130
+
131
+ def _publish_open_state(self) -> None:
132
+ """Publish OPEN state to Redis with TTL."""
133
+ if not self._redis_client:
134
+ return
135
+ try:
136
+ key = f"{self._redis_key_prefix}state"
137
+ self._redis_client.setex(key, self._state_ttl, "OPEN")
138
+ except Exception as e:
139
+ logger.warning(f"Redis publish OPEN state failed: {e}")
140
+
141
+ def _publish_half_open_state(self) -> None:
142
+ """Publish HALF_OPEN state to Redis with TTL."""
143
+ if not self._redis_client:
144
+ return
145
+ try:
146
+ key = f"{self._redis_key_prefix}state"
147
+ self._redis_client.setex(key, self._state_ttl, "HALF_OPEN")
148
+ except Exception as e:
149
+ logger.warning(f"Redis publish HALF_OPEN state failed: {e}")
150
+
151
+ def _clear_global_state(self) -> None:
152
+ """Clear global state from Redis when circuit closes."""
153
+ if not self._redis_client:
154
+ return
155
+ try:
156
+ key = f"{self._redis_key_prefix}state"
157
+ self._redis_client.delete(key)
158
+ except Exception as e:
159
+ logger.warning(f"Redis clear state failed: {e}")
160
+
161
+ # =============================================================================
162
+ # State transition helpers
163
+ # =============================================================================
164
+
165
+ def _global_state_allows_call(self) -> bool:
166
+ """
167
+ Check if global Redis state allows this call to proceed.
168
+
169
+ If Redis says OPEN, reject immediately.
170
+ If Redis says HALF_OPEN, allow up to half_open_max_calls.
171
+ If Redis says CLOSED or no state, allow the call.
172
+ """
173
+ global_state = self._check_global_state()
174
+ if global_state is None:
175
+ return True # No global state, local logic applies
176
+
177
+ if global_state == "OPEN":
178
+ return False # Another instance has it open
179
+
180
+ if global_state == "HALF_OPEN":
181
+ # Allow if we haven't exhausted our half-open attempts
182
+ with self._lock:
183
+ if self._half_open_calls >= self._half_open_max_calls:
184
+ return False
185
+ return True
186
+
187
+ # global_state == "CLOSED" - another instance recovered, sync local
188
+ with self._lock:
189
+ self._state = CBState.CLOSED
190
+ self._failure_count = 0
191
+ return True
192
+
193
+ def _on_state_change(self, old_state: CBState, new_state: CBState) -> None:
194
+ """Record state transition metrics."""
195
+ if new_state == CBState.OPEN:
196
+ metrics.inc_transport("circuit_open_count")
197
+ # Sprint 3 follow-up (B24): also bump the
198
+ # ``circuit_breaker_opens`` global counter on
199
+ # ``TransportMetrics`` (was 0-call). This is the
200
+ # cross-CB-instance counter — the operator alerts
201
+ # on its rate, not on the per-CB ``circuit_open_count``.
202
+ metrics.inc_transport("circuit_breaker_opens")
203
+ self._metrics.circuit_open_count += 1
204
+ elif new_state == CBState.HALF_OPEN:
205
+ metrics.inc_transport("circuit_half_open_count")
206
+ self._metrics.circuit_half_open_count += 1
207
+ elif new_state == CBState.CLOSED and old_state != CBState.CLOSED:
208
+ metrics.inc_transport("circuit_closed_count")
209
+ self._metrics.circuit_closed_count += 1
210
+
211
+ def _on_half_open(self) -> None:
212
+ """Record half-open state entry."""
213
+ self._half_open_start = time.monotonic()
214
+
215
+ def _on_closed(self) -> None:
216
+ """Record circuit closure and half-open duration."""
217
+ if self._half_open_start:
218
+ duration = time.monotonic() - self._half_open_start
219
+ self._metrics.half_open_duration_sum += duration
220
+ self._metrics.half_open_duration_count += 1
221
+ self._half_open_start = None
222
+
223
+ @property
224
+ def state(self) -> CBState:
225
+ # Phase 0.3.1: hold the lock for the whole transition so
226
+ # concurrent threads do not race into HALF_OPEN. The
227
+ # previous version only held the lock for the dict read,
228
+ # which let two workers independently decide they should
229
+ # both probe in HALF_OPEN at the same wall-clock moment.
230
+ # The fix also publishes HALF_OPEN to Redis (was defined
231
+ # but never called) so other workers see the state via
232
+ # ``_check_global_state`` instead of falling back to
233
+ # PERMISSIVE.
234
+ with self._lock:
235
+ if self._state == CBState.OPEN:
236
+ if (
237
+ self._last_failure_time is not None
238
+ and time.monotonic() - self._last_failure_time >= self._recovery_timeout
239
+ ):
240
+ old_state = self._state
241
+ self._state = CBState.HALF_OPEN
242
+ self._half_open_calls = 0
243
+ self._on_state_change(old_state, self._state)
244
+ self._on_half_open()
245
+ # Publish the new state so other workers see
246
+ # HALF_OPEN in Redis and respect
247
+ # _half_open_max_calls (instead of treating
248
+ # the local probe as fresh and sending
249
+ # uncapped traffic).
250
+ self._publish_half_open_state()
251
+ return self._state
252
+
253
+ def call(self, func: Callable[..., Any], *args, **kwargs) -> Any:
254
+ """Execute func through circuit breaker. Supports both sync and async functions."""
255
+
256
+ # Check global Redis state first - reject if another instance has it open
257
+ if not self._global_state_allows_call():
258
+ raise BreakerTransportError(
259
+ f"Circuit breaker OPEN (global) -- service unavailable. "
260
+ f"Retry in {self._recovery_timeout:.0f}s"
261
+ )
262
+
263
+ # Add jitter before transitioning from OPEN to HALF_OPEN to prevent thundering herd
264
+ if self._state == CBState.OPEN and self._opened_at is not None:
265
+ time_in_open = time.monotonic() - self._opened_at
266
+ if time_in_open >= self._recovery_timeout:
267
+ # Add random jitter (0-30 seconds) to prevent thundering herd
268
+ # Phase 8: cap at 5s (was 30s). The previous value
269
+ # blocked the caller's thread for up to 30s on
270
+ # every OPEN->HALF_OPEN transition. 5s is plenty
271
+ # to spread reconnects across workers.
272
+ jitter = random.uniform(0, 5.0)
273
+ time.sleep(jitter)
274
+
275
+ state = self.state
276
+
277
+ if state == CBState.OPEN:
278
+ raise BreakerTransportError(
279
+ f"Circuit breaker OPEN -- service unavailable. "
280
+ f"Retry in {self._recovery_timeout:.0f}s"
281
+ )
282
+
283
+ if state == CBState.HALF_OPEN:
284
+ with self._lock:
285
+ if self._half_open_calls >= self._half_open_max_calls:
286
+ raise BreakerTransportError("Circuit breaker HALF_OPEN -- waiting")
287
+ self._half_open_calls += 1
288
+
289
+ # Check if func is a coroutine function (async)
290
+ import inspect
291
+ if inspect.iscoroutinefunction(func):
292
+ return self._call_async(func, *args, **kwargs)
293
+ else:
294
+ return self._call_sync(func, *args, **kwargs)
295
+
296
+ def _call_sync(self, func: Callable[..., Any], *args, **kwargs) -> Any:
297
+ """Execute sync func through circuit breaker."""
298
+ try:
299
+ result = func(*args, **kwargs)
300
+ self._on_success()
301
+ return result
302
+ except Exception:
303
+ self._on_failure()
304
+ raise
305
+
306
+ async def _call_async(self, func: Callable[..., Any], *args, **kwargs) -> Any:
307
+ """Execute async func through circuit breaker."""
308
+ try:
309
+ result = await func(*args, **kwargs)
310
+ await self._on_success_async()
311
+ return result
312
+ except Exception:
313
+ await self._on_failure_async()
314
+ raise
315
+
316
+ def _on_success(self) -> None:
317
+ old_state = self._state
318
+ with self._lock:
319
+ self._state = CBState.CLOSED
320
+ self._failure_count = 0
321
+ self.total_successes += 1
322
+ self._metrics.total_success_count += 1
323
+
324
+ self._on_state_change(old_state, CBState.CLOSED)
325
+ self._on_closed()
326
+
327
+ # Update Redis - clear OPEN state since we recovered
328
+ if self._redis_client:
329
+ self._clear_global_state()
330
+
331
+ def _on_failure(self) -> None:
332
+ old_state = self._state
333
+ with self._lock:
334
+ self._failure_count += 1
335
+ self._last_failure_time = time.monotonic()
336
+ self.total_failures += 1
337
+ self._metrics.total_failure_count += 1
338
+ if self._failure_count >= self._failure_threshold:
339
+ if old_state != CBState.OPEN:
340
+ self.total_opens += 1
341
+ self._on_state_change(old_state, CBState.OPEN)
342
+ self._state = CBState.OPEN
343
+ self._opened_at = time.monotonic()
344
+
345
+ # Publish OPEN state to Redis so other workers see it
346
+ if self._redis_client and self._state == CBState.OPEN:
347
+ self._publish_open_state()
348
+
349
+ async def _on_success_async(self) -> None:
350
+ """Async-safe success handler."""
351
+ old_state = self._state
352
+ async_lock = self._get_async_lock()
353
+ async with async_lock:
354
+ self._state = CBState.CLOSED
355
+ self._failure_count = 0
356
+ self.total_successes += 1
357
+ self._metrics.total_success_count += 1
358
+
359
+ self._on_state_change(old_state, CBState.CLOSED)
360
+ self._on_closed()
361
+
362
+ # Update Redis - clear OPEN state since we recovered
363
+ if self._redis_client:
364
+ self._clear_global_state()
365
+
366
+ async def _on_failure_async(self) -> None:
367
+ """Async-safe failure handler."""
368
+ old_state = self._state
369
+ async_lock = self._get_async_lock()
370
+ async with async_lock:
371
+ self._failure_count += 1
372
+ self._last_failure_time = time.monotonic()
373
+ self.total_failures += 1
374
+ self._metrics.total_failure_count += 1
375
+ if self._failure_count >= self._failure_threshold:
376
+ if old_state != CBState.OPEN:
377
+ self.total_opens += 1
378
+ self._on_state_change(old_state, CBState.OPEN)
379
+ self._state = CBState.OPEN
380
+ self._opened_at = time.monotonic()
381
+
382
+ # Publish OPEN state to Redis so other workers see it
383
+ if self._redis_client and self._state == CBState.OPEN:
384
+ self._publish_open_state()
385
+
386
+ def get_metrics(self) -> dict:
387
+ return {
388
+ "state": self.state.value,
389
+ "failure_count": self._failure_count,
390
+ "total_failures": self.total_failures,
391
+ "total_opens": self.total_opens,
392
+ "total_successes": self.total_successes,
393
+ "circuit_open_count": self._metrics.circuit_open_count,
394
+ "circuit_half_open_count": self._metrics.circuit_half_open_count,
395
+ "circuit_closed_count": self._metrics.circuit_closed_count,
396
+ "fallback_activations": self._metrics.fallback_activations,
397
+ "avg_half_open_duration": (
398
+ self._metrics.half_open_duration_sum /
399
+ self._metrics.half_open_duration_count
400
+ if self._metrics.half_open_duration_count > 0 else 0
401
+ ),
402
+ }