@zap-js/server 0.0.2 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +18 -14
- package/package.json +4 -9
- package/src/bin/zap.rs +0 -154
- package/src/config.rs +0 -253
- package/src/connection_pool.rs +0 -404
- package/src/error.rs +0 -380
- package/src/handler.rs +0 -89
- package/src/ipc.js +0 -10
- package/src/ipc.rs +0 -499
- package/src/lib.rs +0 -433
- package/src/metrics.rs +0 -264
- package/src/proxy.rs +0 -436
- package/src/reliability.rs +0 -917
- package/src/request.rs +0 -60
- package/src/request_id.rs +0 -97
- package/src/response.rs +0 -182
- package/src/rpc.js +0 -14
- package/src/server.rs +0 -597
- package/src/static.rs +0 -572
- package/src/types.js +0 -21
- package/src/utils.rs +0 -18
- package/src/websocket.rs +0 -429
package/src/reliability.rs
DELETED
@@ -1,917 +0,0 @@
-//! Reliability Module for ZapJS
-//!
-//! Production-grade resilience patterns for IPC communication:
-//! - Exponential backoff with jitter for retries
-//! - Circuit breaker for cascading failure prevention
-//! - Enhanced health checks with readiness/liveness probes
-//!
-//! ## Retry Strategy
-//! Uses exponential backoff with full jitter:
-//! - Base delay: 100ms
-//! - Max delay: 10s
-//! - Max retries: 3 (configurable)
-//! - Formula: min(max_delay, base_delay * 2^attempt) * random(0, 1)
-//!
-//! ## Circuit Breaker States
-//! - CLOSED: Normal operation, requests flow through
-//! - OPEN: Too many failures, requests fail immediately
-//! - HALF_OPEN: Testing if service recovered
-//!
-//! ## Health Check Types
-//! - `/health/live`: Is the process alive? (liveness probe)
-//! - `/health/ready`: Can it handle requests? (readiness probe)
-
-use crate::connection_pool::ConnectionPool;
-use crate::error::{ZapError, ZapResult};
-use crate::ipc::IpcMessage;
-use std::sync::atomic::{AtomicU64, Ordering};
-use std::sync::Arc;
-use std::time::{Duration, Instant};
-use tokio::sync::RwLock;
-use tracing::{debug, error, info, warn};
-
-// ============================================================================
-// Retry Configuration
-// ============================================================================
-
-/// Default base delay for exponential backoff (100ms)
-const DEFAULT_BASE_DELAY_MS: u64 = 100;
-
-/// Maximum delay cap for exponential backoff (10 seconds)
-const DEFAULT_MAX_DELAY_MS: u64 = 10_000;
-
-/// Default maximum number of retry attempts
-const DEFAULT_MAX_RETRIES: usize = 3;
-
-/// Retry configuration with exponential backoff
-#[derive(Debug, Clone)]
-pub struct RetryConfig {
-    /// Base delay for exponential backoff
-    pub base_delay: Duration,
-    /// Maximum delay cap
-    pub max_delay: Duration,
-    /// Maximum number of retry attempts (0 = no retries)
-    pub max_retries: usize,
-    /// Enable jitter to prevent thundering herd
-    pub use_jitter: bool,
-}
-
-impl Default for RetryConfig {
-    fn default() -> Self {
-        Self {
-            base_delay: Duration::from_millis(DEFAULT_BASE_DELAY_MS),
-            max_delay: Duration::from_millis(DEFAULT_MAX_DELAY_MS),
-            max_retries: DEFAULT_MAX_RETRIES,
-            use_jitter: true,
-        }
-    }
-}
-
-impl RetryConfig {
-    /// Create a new retry configuration
-    pub fn new() -> Self {
-        Self::default()
-    }
-
-    /// Set base delay
-    pub fn base_delay(mut self, delay: Duration) -> Self {
-        self.base_delay = delay;
-        self
-    }
-
-    /// Set maximum delay cap
-    pub fn max_delay(mut self, delay: Duration) -> Self {
-        self.max_delay = delay;
-        self
-    }
-
-    /// Set maximum retries
-    pub fn max_retries(mut self, retries: usize) -> Self {
-        self.max_retries = retries;
-        self
-    }
-
-    /// Enable or disable jitter
-    pub fn jitter(mut self, enable: bool) -> Self {
-        self.use_jitter = enable;
-        self
-    }
-
-    /// Calculate delay for a given attempt (0-indexed)
-    pub fn delay_for_attempt(&self, attempt: usize) -> Duration {
-        // Exponential backoff: base_delay * 2^attempt
-        let exp_delay_ms = self.base_delay.as_millis() as u64 * (1u64 << attempt.min(10));
-        let capped_delay_ms = exp_delay_ms.min(self.max_delay.as_millis() as u64);
-
-        if self.use_jitter {
-            // Full jitter: random value between 0 and calculated delay
-            let jitter = fastrand::u64(0..=capped_delay_ms);
-            Duration::from_millis(jitter)
-        } else {
-            Duration::from_millis(capped_delay_ms)
-        }
-    }
-}
-
-// ============================================================================
-// Circuit Breaker
-// ============================================================================
-
-/// Circuit breaker states
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum CircuitState {
-    /// Normal operation - requests flow through
-    Closed,
-    /// Too many failures - requests fail immediately
-    Open,
-    /// Testing recovery - allow limited requests
-    HalfOpen,
-}
-
-impl std::fmt::Display for CircuitState {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            CircuitState::Closed => write!(f, "CLOSED"),
-            CircuitState::Open => write!(f, "OPEN"),
-            CircuitState::HalfOpen => write!(f, "HALF_OPEN"),
-        }
-    }
-}
-
-/// Circuit breaker configuration
-#[derive(Debug, Clone)]
-pub struct CircuitBreakerConfig {
-    /// Number of failures before opening circuit
-    pub failure_threshold: usize,
-    /// Time to wait before transitioning from OPEN to HALF_OPEN
-    pub reset_timeout: Duration,
-    /// Number of successes in HALF_OPEN to close circuit
-    pub success_threshold: usize,
-    /// Time window for counting failures
-    pub failure_window: Duration,
-}
-
-impl Default for CircuitBreakerConfig {
-    fn default() -> Self {
-        Self {
-            failure_threshold: 5,
-            reset_timeout: Duration::from_secs(30),
-            success_threshold: 3,
-            failure_window: Duration::from_secs(60),
-        }
-    }
-}
-
-impl CircuitBreakerConfig {
-    /// Create new circuit breaker config
-    pub fn new() -> Self {
-        Self::default()
-    }
-
-    /// Set failure threshold
-    pub fn failure_threshold(mut self, threshold: usize) -> Self {
-        self.failure_threshold = threshold;
-        self
-    }
-
-    /// Set reset timeout
-    pub fn reset_timeout(mut self, timeout: Duration) -> Self {
-        self.reset_timeout = timeout;
-        self
-    }
-
-    /// Set success threshold for half-open state
-    pub fn success_threshold(mut self, threshold: usize) -> Self {
-        self.success_threshold = threshold;
-        self
-    }
-
-    /// Set failure counting window
-    pub fn failure_window(mut self, window: Duration) -> Self {
-        self.failure_window = window;
-        self
-    }
-}
-
-/// Circuit breaker internal state
-struct CircuitBreakerState {
-    state: CircuitState,
-    failure_count: usize,
-    success_count: usize,
-    last_failure_time: Option<Instant>,
-    opened_at: Option<Instant>,
-}
-
-/// Circuit breaker for protecting against cascading failures
-pub struct CircuitBreaker {
-    config: CircuitBreakerConfig,
-    state: RwLock<CircuitBreakerState>,
-    /// Total failures (for metrics)
-    total_failures: AtomicU64,
-    /// Total successes (for metrics)
-    total_successes: AtomicU64,
-    /// Times circuit opened (for metrics)
-    times_opened: AtomicU64,
-}
-
-impl CircuitBreaker {
-    /// Create a new circuit breaker with default configuration
-    pub fn new() -> Self {
-        Self::with_config(CircuitBreakerConfig::default())
-    }
-
-    /// Create a new circuit breaker with custom configuration
-    pub fn with_config(config: CircuitBreakerConfig) -> Self {
-        Self {
-            config,
-            state: RwLock::new(CircuitBreakerState {
-                state: CircuitState::Closed,
-                failure_count: 0,
-                success_count: 0,
-                last_failure_time: None,
-                opened_at: None,
-            }),
-            total_failures: AtomicU64::new(0),
-            total_successes: AtomicU64::new(0),
-            times_opened: AtomicU64::new(0),
-        }
-    }
-
-    /// Check if a request is allowed to proceed
-    pub async fn allow_request(&self) -> bool {
-        let mut state = self.state.write().await;
-
-        match state.state {
-            CircuitState::Closed => true,
-            CircuitState::Open => {
-                // Check if reset timeout has elapsed
-                if let Some(opened_at) = state.opened_at {
-                    if opened_at.elapsed() >= self.config.reset_timeout {
-                        info!("Circuit breaker transitioning from OPEN to HALF_OPEN");
-                        state.state = CircuitState::HalfOpen;
-                        state.success_count = 0;
-                        true
-                    } else {
-                        false
-                    }
-                } else {
-                    false
-                }
-            }
-            CircuitState::HalfOpen => {
-                // Allow request in half-open state
-                true
-            }
-        }
-    }
-
-    /// Record a successful request
-    pub async fn record_success(&self) {
-        self.total_successes.fetch_add(1, Ordering::Relaxed);
-
-        let mut state = self.state.write().await;
-
-        match state.state {
-            CircuitState::HalfOpen => {
-                state.success_count += 1;
-                if state.success_count >= self.config.success_threshold {
-                    info!(
-                        "Circuit breaker closing after {} successes in HALF_OPEN",
-                        state.success_count
-                    );
-                    state.state = CircuitState::Closed;
-                    state.failure_count = 0;
-                    state.success_count = 0;
-                    state.opened_at = None;
-                }
-            }
-            CircuitState::Closed => {
-                // Reset failure count on success (sliding window behavior)
-                if let Some(last_failure) = state.last_failure_time {
-                    if last_failure.elapsed() > self.config.failure_window {
-                        state.failure_count = 0;
-                    }
-                }
-            }
-            CircuitState::Open => {
-                // Shouldn't happen, but handle gracefully
-            }
-        }
-    }
-
-    /// Record a failed request
-    pub async fn record_failure(&self) {
-        self.total_failures.fetch_add(1, Ordering::Relaxed);
-
-        let mut state = self.state.write().await;
-
-        match state.state {
-            CircuitState::Closed => {
-                // Check if we should reset the failure window
-                if let Some(last_failure) = state.last_failure_time {
-                    if last_failure.elapsed() > self.config.failure_window {
-                        state.failure_count = 0;
-                    }
-                }
-
-                state.failure_count += 1;
-                state.last_failure_time = Some(Instant::now());
-
-                if state.failure_count >= self.config.failure_threshold {
-                    warn!(
-                        "Circuit breaker OPENING after {} failures",
-                        state.failure_count
-                    );
-                    state.state = CircuitState::Open;
-                    state.opened_at = Some(Instant::now());
-                    self.times_opened.fetch_add(1, Ordering::Relaxed);
-                }
-            }
-            CircuitState::HalfOpen => {
-                // Any failure in half-open immediately re-opens
-                warn!("Circuit breaker re-opening from HALF_OPEN after failure");
-                state.state = CircuitState::Open;
-                state.opened_at = Some(Instant::now());
-                state.success_count = 0;
-                self.times_opened.fetch_add(1, Ordering::Relaxed);
-            }
-            CircuitState::Open => {
-                // Already open, update failure time
-                state.last_failure_time = Some(Instant::now());
-            }
-        }
-    }
-
-    /// Get current circuit state
-    pub async fn state(&self) -> CircuitState {
-        self.state.read().await.state
-    }
-
-    /// Get circuit breaker statistics
-    pub async fn stats(&self) -> CircuitBreakerStats {
-        let state = self.state.read().await;
-        CircuitBreakerStats {
-            state: state.state,
-            failure_count: state.failure_count,
-            success_count: state.success_count,
-            total_failures: self.total_failures.load(Ordering::Relaxed),
-            total_successes: self.total_successes.load(Ordering::Relaxed),
-            times_opened: self.times_opened.load(Ordering::Relaxed),
-        }
-    }
-
-    /// Force the circuit to a specific state (for testing/admin)
-    pub async fn force_state(&self, new_state: CircuitState) {
-        let mut state = self.state.write().await;
-        info!("Force-setting circuit breaker to {}", new_state);
-        state.state = new_state;
-        if new_state == CircuitState::Open {
-            state.opened_at = Some(Instant::now());
-        } else {
-            state.opened_at = None;
-        }
-        state.failure_count = 0;
-        state.success_count = 0;
-    }
-}
-
-impl Default for CircuitBreaker {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-/// Circuit breaker statistics
-#[derive(Debug, Clone)]
-pub struct CircuitBreakerStats {
-    pub state: CircuitState,
-    pub failure_count: usize,
-    pub success_count: usize,
-    pub total_failures: u64,
-    pub total_successes: u64,
-    pub times_opened: u64,
-}
-
-// ============================================================================
-// Resilient IPC Client
-// ============================================================================
-
-/// Resilient IPC client with retry and circuit breaker
-pub struct ResilientIpc {
-    pool: Arc<ConnectionPool>,
-    retry_config: RetryConfig,
-    circuit_breaker: Arc<CircuitBreaker>,
-}
-
-impl ResilientIpc {
-    /// Create a new resilient IPC client
-    pub fn new(pool: Arc<ConnectionPool>) -> Self {
-        Self {
-            pool,
-            retry_config: RetryConfig::default(),
-            circuit_breaker: Arc::new(CircuitBreaker::new()),
-        }
-    }
-
-    /// Create with custom configurations
-    pub fn with_config(
-        pool: Arc<ConnectionPool>,
-        retry_config: RetryConfig,
-        circuit_config: CircuitBreakerConfig,
-    ) -> Self {
-        Self {
-            pool,
-            retry_config,
-            circuit_breaker: Arc::new(CircuitBreaker::with_config(circuit_config)),
-        }
-    }
-
-    /// Send a message with retry and circuit breaker protection
-    pub async fn send_recv(&self, message: IpcMessage) -> ZapResult<IpcMessage> {
-        // Check circuit breaker first
-        if !self.circuit_breaker.allow_request().await {
-            let state = self.circuit_breaker.state().await;
-            warn!("Circuit breaker is {}, rejecting request", state);
-            return Err(ZapError::ipc(format!(
-                "Circuit breaker is {}, service unavailable",
-                state
-            )));
-        }
-
-        let mut last_error: Option<ZapError> = None;
-
-        // Attempt with retries
-        for attempt in 0..=self.retry_config.max_retries {
-            if attempt > 0 {
-                // Calculate delay with exponential backoff
-                let delay = self.retry_config.delay_for_attempt(attempt - 1);
-                debug!(
-                    "Retry attempt {}/{} after {:?} delay",
-                    attempt, self.retry_config.max_retries, delay
-                );
-                tokio::time::sleep(delay).await;
-            }
-
-            match self.pool.send_recv(message.clone()).await {
-                Ok(response) => {
-                    // Check for error responses from TypeScript
-                    if let IpcMessage::Error { code, message: _, .. } = &response {
-                        // Handler errors shouldn't trigger circuit breaker
-                        // (they're application-level, not infrastructure)
-                        if code != "HANDLER_ERROR" && code != "VALIDATION_ERROR" {
-                            self.circuit_breaker.record_failure().await;
-                        } else {
-                            self.circuit_breaker.record_success().await;
-                        }
-                        return Ok(response);
-                    }
-
-                    self.circuit_breaker.record_success().await;
-                    return Ok(response);
-                }
-                Err(e) => {
-                    warn!("IPC request failed (attempt {}): {}", attempt + 1, e);
-                    last_error = Some(e);
-
-                    // Don't retry on certain errors
-                    if let Some(ref err) = last_error {
-                        if is_non_retryable_error(err) {
-                            break;
-                        }
-                    }
-                }
-            }
-        }
-
-        // All retries exhausted
-        self.circuit_breaker.record_failure().await;
-        error!(
-            "IPC request failed after {} attempts",
-            self.retry_config.max_retries + 1
-        );
-
-        Err(last_error.unwrap_or_else(|| ZapError::ipc("Unknown IPC error")))
-    }
-
-    /// Get the circuit breaker for monitoring
-    pub fn circuit_breaker(&self) -> &Arc<CircuitBreaker> {
-        &self.circuit_breaker
-    }
-
-    /// Get circuit breaker statistics
-    pub async fn circuit_stats(&self) -> CircuitBreakerStats {
-        self.circuit_breaker.stats().await
-    }
-}
-
-/// Check if an error is non-retryable (e.g., validation errors)
-fn is_non_retryable_error(error: &ZapError) -> bool {
-    match error {
-        ZapError::Validation { .. } => true,
-        ZapError::Unauthorized { .. } => true,
-        ZapError::Forbidden { .. } => true,
-        ZapError::RateLimited { .. } => true,
-        _ => false,
-    }
-}
-
-// ============================================================================
-// Enhanced Health Checks
-// ============================================================================
-
-/// Health check status
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum HealthStatus {
-    Healthy,
-    Degraded,
-    Unhealthy,
-}
-
-impl std::fmt::Display for HealthStatus {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            HealthStatus::Healthy => write!(f, "healthy"),
-            HealthStatus::Degraded => write!(f, "degraded"),
-            HealthStatus::Unhealthy => write!(f, "unhealthy"),
-        }
-    }
-}
-
-/// Component health information
-#[derive(Debug, Clone)]
-pub struct ComponentHealth {
-    pub name: String,
-    pub status: HealthStatus,
-    pub message: Option<String>,
-    pub latency_ms: Option<u64>,
-}
-
-/// Complete health check response
-#[derive(Debug, Clone)]
-pub struct HealthCheckResponse {
-    /// Overall status
-    pub status: HealthStatus,
-    /// Individual component health
-    pub components: Vec<ComponentHealth>,
-    /// Server version
-    pub version: String,
-    /// Server uptime in seconds
-    pub uptime_secs: u64,
-}
-
-impl HealthCheckResponse {
-    /// Convert to JSON
-    pub fn to_json(&self) -> String {
-        let components_json: Vec<String> = self
-            .components
-            .iter()
-            .map(|c| {
-                let msg = c
-                    .message
-                    .as_ref()
-                    .map(|m| format!(r#","message":"{}""#, m))
-                    .unwrap_or_default();
-                let latency = c
-                    .latency_ms
-                    .map(|l| format!(r#","latency_ms":{}"#, l))
-                    .unwrap_or_default();
-                format!(
-                    r#"{{"name":"{}","status":"{}"{}{}}}"#,
-                    c.name, c.status, msg, latency
-                )
-            })
-            .collect();
-
-        format!(
-            r#"{{"status":"{}","version":"{}","uptime_secs":{},"components":[{}]}}"#,
-            self.status,
-            self.version,
-            self.uptime_secs,
-            components_json.join(",")
-        )
-    }
-}
-
-/// Health checker for the ZapJS server
-pub struct HealthChecker {
-    start_time: Instant,
-    version: String,
-    pool: Option<Arc<ConnectionPool>>,
-    circuit_breaker: Option<Arc<CircuitBreaker>>,
-}
-
-impl HealthChecker {
-    /// Create a new health checker
-    pub fn new(version: String) -> Self {
-        Self {
-            start_time: Instant::now(),
-            version,
-            pool: None,
-            circuit_breaker: None,
-        }
-    }
-
-    /// Set the connection pool to monitor
-    pub fn with_pool(mut self, pool: Arc<ConnectionPool>) -> Self {
-        self.pool = Some(pool);
-        self
-    }
-
-    /// Set the circuit breaker to monitor
-    pub fn with_circuit_breaker(mut self, cb: Arc<CircuitBreaker>) -> Self {
-        self.circuit_breaker = Some(cb);
-        self
-    }
-
-    /// Liveness probe: Is the process alive?
-    /// This should always return true if the server can respond at all.
-    pub fn liveness(&self) -> HealthCheckResponse {
-        HealthCheckResponse {
-            status: HealthStatus::Healthy,
-            components: vec![ComponentHealth {
-                name: "process".to_string(),
-                status: HealthStatus::Healthy,
-                message: Some("Server is running".to_string()),
-                latency_ms: None,
-            }],
-            version: self.version.clone(),
-            uptime_secs: self.start_time.elapsed().as_secs(),
-        }
-    }
-
-    /// Readiness probe: Can the server handle requests?
-    /// Checks connection pool and circuit breaker state.
-    pub async fn readiness(&self) -> HealthCheckResponse {
-        let mut components = Vec::new();
-        let mut overall_status = HealthStatus::Healthy;
-
-        // Check connection pool
-        if let Some(pool) = &self.pool {
-            let start = Instant::now();
-            let (healthy, total) = pool.health_check().await;
-            let latency = start.elapsed().as_millis() as u64;
-
-            let pool_status = if healthy == total {
-                HealthStatus::Healthy
-            } else if healthy > 0 {
-                overall_status = HealthStatus::Degraded;
-                HealthStatus::Degraded
-            } else {
-                overall_status = HealthStatus::Unhealthy;
-                HealthStatus::Unhealthy
-            };
-
-            components.push(ComponentHealth {
-                name: "connection_pool".to_string(),
-                status: pool_status,
-                message: Some(format!("{}/{} connections healthy", healthy, total)),
-                latency_ms: Some(latency),
-            });
-        }
-
-        // Check circuit breaker
-        if let Some(cb) = &self.circuit_breaker {
-            let state = cb.state().await;
-            let cb_status = match state {
-                CircuitState::Closed => HealthStatus::Healthy,
-                CircuitState::HalfOpen => {
-                    if overall_status == HealthStatus::Healthy {
-                        overall_status = HealthStatus::Degraded;
-                    }
-                    HealthStatus::Degraded
-                }
-                CircuitState::Open => {
-                    overall_status = HealthStatus::Unhealthy;
-                    HealthStatus::Unhealthy
-                }
-            };
-
-            components.push(ComponentHealth {
-                name: "circuit_breaker".to_string(),
-                status: cb_status,
-                message: Some(format!("Circuit is {}", state)),
-                latency_ms: None,
-            });
-        }
-
-        // If no components configured, assume healthy
-        if components.is_empty() {
-            components.push(ComponentHealth {
-                name: "server".to_string(),
-                status: HealthStatus::Healthy,
-                message: Some("No components configured".to_string()),
-                latency_ms: None,
-            });
-        }
-
-        HealthCheckResponse {
-            status: overall_status,
-            components,
-            version: self.version.clone(),
-            uptime_secs: self.start_time.elapsed().as_secs(),
-        }
-    }
-
-    /// Get uptime in seconds
-    pub fn uptime_secs(&self) -> u64 {
-        self.start_time.elapsed().as_secs()
-    }
-}
-
-// ============================================================================
-// Tests
-// ============================================================================
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_retry_config_default() {
-        let config = RetryConfig::default();
-        assert_eq!(config.max_retries, DEFAULT_MAX_RETRIES);
-        assert_eq!(config.base_delay, Duration::from_millis(DEFAULT_BASE_DELAY_MS));
-        assert!(config.use_jitter);
-    }
-
-    #[test]
-    fn test_retry_config_builder() {
-        let config = RetryConfig::new()
-            .max_retries(5)
-            .base_delay(Duration::from_millis(200))
-            .max_delay(Duration::from_secs(5))
-            .jitter(false);
-
-        assert_eq!(config.max_retries, 5);
-        assert_eq!(config.base_delay, Duration::from_millis(200));
-        assert_eq!(config.max_delay, Duration::from_secs(5));
-        assert!(!config.use_jitter);
-    }
-
-    #[test]
-    fn test_exponential_backoff_without_jitter() {
-        let config = RetryConfig::new()
-            .base_delay(Duration::from_millis(100))
-            .max_delay(Duration::from_secs(10))
-            .jitter(false);
-
-        // 100ms * 2^0 = 100ms
-        assert_eq!(config.delay_for_attempt(0), Duration::from_millis(100));
-        // 100ms * 2^1 = 200ms
-        assert_eq!(config.delay_for_attempt(1), Duration::from_millis(200));
-        // 100ms * 2^2 = 400ms
-        assert_eq!(config.delay_for_attempt(2), Duration::from_millis(400));
-        // 100ms * 2^3 = 800ms
-        assert_eq!(config.delay_for_attempt(3), Duration::from_millis(800));
-    }
-
-    #[test]
-    fn test_exponential_backoff_with_cap() {
-        let config = RetryConfig::new()
-            .base_delay(Duration::from_millis(1000))
-            .max_delay(Duration::from_millis(5000))
-            .jitter(false);
-
-        // 1000ms * 2^0 = 1000ms
-        assert_eq!(config.delay_for_attempt(0), Duration::from_millis(1000));
-        // 1000ms * 2^1 = 2000ms
-        assert_eq!(config.delay_for_attempt(1), Duration::from_millis(2000));
-        // 1000ms * 2^2 = 4000ms
-        assert_eq!(config.delay_for_attempt(2), Duration::from_millis(4000));
-        // 1000ms * 2^3 = 8000ms, capped to 5000ms
-        assert_eq!(config.delay_for_attempt(3), Duration::from_millis(5000));
-    }
-
-    #[test]
-    fn test_circuit_breaker_config_default() {
-        let config = CircuitBreakerConfig::default();
-        assert_eq!(config.failure_threshold, 5);
-        assert_eq!(config.reset_timeout, Duration::from_secs(30));
-        assert_eq!(config.success_threshold, 3);
-    }
-
-    #[test]
-    fn test_circuit_breaker_config_builder() {
-        let config = CircuitBreakerConfig::new()
-            .failure_threshold(10)
-            .reset_timeout(Duration::from_secs(60))
-            .success_threshold(5);
-
-        assert_eq!(config.failure_threshold, 10);
-        assert_eq!(config.reset_timeout, Duration::from_secs(60));
-        assert_eq!(config.success_threshold, 5);
-    }
-
-    #[tokio::test]
-    async fn test_circuit_breaker_initial_state() {
-        let cb = CircuitBreaker::new();
-        assert_eq!(cb.state().await, CircuitState::Closed);
-        assert!(cb.allow_request().await);
-    }
-
-    #[tokio::test]
-    async fn test_circuit_breaker_opens_after_failures() {
-        let config = CircuitBreakerConfig::new()
-            .failure_threshold(3)
-            .failure_window(Duration::from_secs(60));
-        let cb = CircuitBreaker::with_config(config);
-
-        // Record 3 failures
-        cb.record_failure().await;
-        assert_eq!(cb.state().await, CircuitState::Closed);
-        cb.record_failure().await;
-        assert_eq!(cb.state().await, CircuitState::Closed);
-        cb.record_failure().await;
-        assert_eq!(cb.state().await, CircuitState::Open);
-
-        // Should reject requests
-        assert!(!cb.allow_request().await);
-    }
-
-    #[tokio::test]
-    async fn test_circuit_breaker_half_open_recovery() {
-        let config = CircuitBreakerConfig::new()
-            .failure_threshold(2)
-            .success_threshold(2)
-            .reset_timeout(Duration::from_millis(10));
-        let cb = CircuitBreaker::with_config(config);
-
-        // Open the circuit
-        cb.record_failure().await;
-        cb.record_failure().await;
-        assert_eq!(cb.state().await, CircuitState::Open);
-
-        // Wait for reset timeout
-        tokio::time::sleep(Duration::from_millis(15)).await;
-
-        // Should transition to half-open
-        assert!(cb.allow_request().await);
-        assert_eq!(cb.state().await, CircuitState::HalfOpen);
-
-        // Record successes to close
-        cb.record_success().await;
-        assert_eq!(cb.state().await, CircuitState::HalfOpen);
-        cb.record_success().await;
-        assert_eq!(cb.state().await, CircuitState::Closed);
-    }
-
-    #[tokio::test]
-    async fn test_circuit_breaker_stats() {
-        let cb = CircuitBreaker::new();
-
-        cb.record_success().await;
-        cb.record_success().await;
-        cb.record_failure().await;
-
-        let stats = cb.stats().await;
-        assert_eq!(stats.total_successes, 2);
-        assert_eq!(stats.total_failures, 1);
-        assert_eq!(stats.state, CircuitState::Closed);
-    }
-
-    #[test]
-    fn test_health_status_display() {
-        assert_eq!(format!("{}", HealthStatus::Healthy), "healthy");
-        assert_eq!(format!("{}", HealthStatus::Degraded), "degraded");
-        assert_eq!(format!("{}", HealthStatus::Unhealthy), "unhealthy");
-    }
-
-    #[test]
-    fn test_health_checker_liveness() {
-        let checker = HealthChecker::new("1.0.0".to_string());
-        let response = checker.liveness();
-
-        assert_eq!(response.status, HealthStatus::Healthy);
-        assert_eq!(response.version, "1.0.0");
-        assert!(!response.components.is_empty());
-    }
-
-    #[tokio::test]
-    async fn test_health_checker_readiness_no_components() {
-        let checker = HealthChecker::new("1.0.0".to_string());
-        let response = checker.readiness().await;
-
-        assert_eq!(response.status, HealthStatus::Healthy);
-    }
-
-    #[test]
-    fn test_health_response_json() {
-        let response = HealthCheckResponse {
-            status: HealthStatus::Healthy,
-            components: vec![ComponentHealth {
-                name: "test".to_string(),
-                status: HealthStatus::Healthy,
-                message: Some("OK".to_string()),
-                latency_ms: Some(5),
-            }],
-            version: "1.0.0".to_string(),
-            uptime_secs: 100,
-        };
-
-        let json = response.to_json();
-        assert!(json.contains(r#""status":"healthy""#));
-        assert!(json.contains(r#""version":"1.0.0""#));
-        assert!(json.contains(r#""uptime_secs":100"#));
-        assert!(json.contains(r#""name":"test""#));
-    }
-}
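For context when reviewing the removal: the sketch below shows how the deleted module's public builders composed, assembled only from the signatures visible in the diff above (RetryConfig, CircuitBreakerConfig, ResilientIpc). It is an illustrative reconstruction, not code shipped in either package version; the helper name send_with_resilience is hypothetical, and the pool and message values are assumed to come from the also-removed connection_pool.rs and ipc.rs.

// Illustrative only: composed from the public API of the deleted reliability.rs.
// ConnectionPool, IpcMessage, ZapResult come from other modules removed in the same release.
use std::sync::Arc;
use std::time::Duration;

async fn send_with_resilience(
    pool: Arc<ConnectionPool>,
    message: IpcMessage,
) -> ZapResult<IpcMessage> {
    // Full-jitter backoff per the module docs: delay_for_attempt(2) draws a random
    // duration in [0, min(10s, 100ms * 2^2)] = [0, 400ms].
    let retry = RetryConfig::new()
        .base_delay(Duration::from_millis(100))
        .max_delay(Duration::from_secs(10))
        .max_retries(3)
        .jitter(true);

    // Circuit breaker: open after 5 failures within a 60s window, probe again
    // (HALF_OPEN) after 30s, close after 3 consecutive successes.
    let circuit = CircuitBreakerConfig::new()
        .failure_threshold(5)
        .failure_window(Duration::from_secs(60))
        .reset_timeout(Duration::from_secs(30))
        .success_threshold(3);

    // ResilientIpc wires both around the pooled IPC transport.
    let ipc = ResilientIpc::with_config(pool, retry, circuit);
    ipc.send_recv(message).await
}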