proactiveguard 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- proactiveguard-0.1.0/PKG-INFO +208 -0
- proactiveguard-0.1.0/README.md +187 -0
- proactiveguard-0.1.0/README_PACKAGE.md +164 -0
- proactiveguard-0.1.0/proactiveguard/__init__.py +477 -0
- proactiveguard-0.1.0/proactiveguard/collectors/__init__.py +19 -0
- proactiveguard-0.1.0/proactiveguard/collectors/etcd.py +209 -0
- proactiveguard-0.1.0/proactiveguard/collectors/prometheus.py +231 -0
- proactiveguard-0.1.0/proactiveguard/engine.py +363 -0
- proactiveguard-0.1.0/proactiveguard/exceptions.py +42 -0
- proactiveguard-0.1.0/proactiveguard/types.py +88 -0
- proactiveguard-0.1.0/proactiveguard/weights/etcd_raft_v1.pt +0 -0
- proactiveguard-0.1.0/proactiveguard.egg-info/PKG-INFO +208 -0
- proactiveguard-0.1.0/proactiveguard.egg-info/SOURCES.txt +20 -0
- proactiveguard-0.1.0/proactiveguard.egg-info/dependency_links.txt +1 -0
- proactiveguard-0.1.0/proactiveguard.egg-info/requires.txt +24 -0
- proactiveguard-0.1.0/proactiveguard.egg-info/top_level.txt +1 -0
- proactiveguard-0.1.0/pyproject.toml +104 -0
- proactiveguard-0.1.0/setup.cfg +4 -0
- proactiveguard-0.1.0/tests/test_neural.py +0 -0
- proactiveguard-0.1.0/tests/test_proactiveguard.py +271 -0
- proactiveguard-0.1.0/tests/test_raft.py +0 -0
- proactiveguard-0.1.0/tests/test_simulation.py +0 -0
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: proactiveguard
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Predictive failure detection for distributed consensus systems (etcd, Raft, CockroachDB)
|
|
5
|
+
Author: ProactiveGuard Authors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Prakhar998/neural-consensus
|
|
8
|
+
Project-URL: Documentation, https://github.com/Prakhar998/neural-consensus#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/Prakhar998/neural-consensus
|
|
10
|
+
Project-URL: Issues, https://github.com/Prakhar998/neural-consensus/issues
|
|
11
|
+
Keywords: distributed-systems,failure-detection,machine-learning,etcd,raft,consensus,predictive,monitoring
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: System Administrators
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: System :: Monitoring
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
Requires-Dist: torch>=2.0.0
|
|
25
|
+
Requires-Dist: numpy>=1.24.0
|
|
26
|
+
Requires-Dist: requests>=2.28.0
|
|
27
|
+
Provides-Extra: etcd
|
|
28
|
+
Requires-Dist: etcd3>=0.12.0; extra == "etcd"
|
|
29
|
+
Provides-Extra: train
|
|
30
|
+
Requires-Dist: scikit-learn>=1.3.0; extra == "train"
|
|
31
|
+
Requires-Dist: pandas>=2.0.0; extra == "train"
|
|
32
|
+
Requires-Dist: tqdm>=4.65.0; extra == "train"
|
|
33
|
+
Requires-Dist: loguru>=0.7.0; extra == "train"
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: pytest>=7.4.0; extra == "dev"
|
|
36
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
37
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
38
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
39
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
40
|
+
Requires-Dist: torch>=2.0.0; extra == "dev"
|
|
41
|
+
Requires-Dist: numpy>=1.24.0; extra == "dev"
|
|
42
|
+
Provides-Extra: all
|
|
43
|
+
Requires-Dist: proactiveguard[etcd,train]; extra == "all"
|
|
44
|
+
|
|
45
|
+
# ProactiveGuard
|
|
46
|
+
|
|
47
|
+
**Predictive failure detection for distributed consensus systems.**
|
|
48
|
+
|
|
49
|
+
ProactiveGuard uses deep learning to predict node failures in etcd, Raft, and CockroachDB clusters *before* they happen — giving you 5–30 seconds to act rather than reacting after the fact.
|
|
50
|
+
|
|
51
|
+
[![PyPI version](https://badge.fury.io/py/proactiveguard.svg)](https://badge.fury.io/py/proactiveguard)
|
|
52
|
+
[![Python 3.9+](https://img.shields.io/badge/python-3.9%2B-blue.svg)](https://www.python.org/downloads/)
|
|
53
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Why ProactiveGuard?
|
|
58
|
+
|
|
59
|
+
| | Timeout-based | Phi Accrual | **ProactiveGuard** |
|
|
60
|
+
|---|---|---|---|
|
|
61
|
+
| Detects crash failures | After timeout | After timeout | **Before failure** |
|
|
62
|
+
| Detects slow/byzantine | No | No | **Yes** |
|
|
63
|
+
| Recall on etcd simulation | — | 98% | **100%** |
|
|
64
|
+
| Predict time-to-failure | No | No | **Yes (5–30s ahead)** |
|
|
65
|
+
|
|
66
|
+
## Installation
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
pip install proactiveguard
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
With real etcd cluster support:
|
|
73
|
+
```bash
|
|
74
|
+
pip install "proactiveguard[etcd]"
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Quick Start
|
|
78
|
+
|
|
79
|
+
### Pre-trained model (etcd / Raft clusters)
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from proactiveguard import ProactiveGuard
|
|
83
|
+
from proactiveguard.collectors import PrometheusCollector
|
|
84
|
+
|
|
85
|
+
pg = ProactiveGuard.from_pretrained("etcd-raft-v1")
|
|
86
|
+
|
|
87
|
+
collector = PrometheusCollector("http://prometheus:9090")
|
|
88
|
+
|
|
89
|
+
for obs in collector.stream():
|
|
90
|
+
result = pg.observe(obs.node_id, obs)
|
|
91
|
+
if result and result.is_pre_failure:
|
|
92
|
+
print(f"Warning: {result.node_id} — {result.status}")
|
|
93
|
+
print(f" Estimated time to failure: {result.time_to_failure:.0f}s")
|
|
94
|
+
print(f" Failure type: {result.failure_type}")
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Train on your own data
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
import numpy as np
|
|
101
|
+
from proactiveguard import ProactiveGuard
|
|
102
|
+
from proactiveguard.engine import PREDICTION_HORIZONS
|
|
103
|
+
|
|
104
|
+
pg = ProactiveGuard(window_size=50)
|
|
105
|
+
|
|
106
|
+
# X: (n_samples, window_size, n_features)
|
|
107
|
+
# y: integer class labels from PREDICTION_HORIZONS
|
|
108
|
+
X_train = np.random.randn(1000, 50, 32).astype("float32")
|
|
109
|
+
y_train = np.random.randint(0, 9, size=1000)
|
|
110
|
+
|
|
111
|
+
pg.fit(X_train, y_train, epochs=50)
|
|
112
|
+
|
|
113
|
+
# Predict
|
|
114
|
+
labels = pg.predict(X_test) # → ['healthy', 'degraded_10s', ...]
|
|
115
|
+
probs = pg.predict_proba(X_test) # → (n, 9) probability array
|
|
116
|
+
|
|
117
|
+
# Save / load
|
|
118
|
+
pg.save("my_model.pt")
|
|
119
|
+
pg = ProactiveGuard.load("my_model.pt")
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Feed raw metrics dict
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
pg = ProactiveGuard.from_pretrained("etcd-raft-v1")
|
|
126
|
+
|
|
127
|
+
# Call observe() once per polling interval for each node
|
|
128
|
+
result = pg.observe("etcd-0", {
|
|
129
|
+
"heartbeat_latency_ms": 45.0,
|
|
130
|
+
"missed_heartbeats": 2,
|
|
131
|
+
"response_rate": 0.8,
|
|
132
|
+
"messages_dropped": 3,
|
|
133
|
+
"term": 5,
|
|
134
|
+
"commit_index": 1042,
|
|
135
|
+
"is_leader": False,
|
|
136
|
+
})
|
|
137
|
+
|
|
138
|
+
if result: # None until window_size observations collected
|
|
139
|
+
print(result)
|
|
140
|
+
# PredictionResult(node='etcd-0', status='degraded_10s', risk=0.82, confidence=0.91)
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## Prediction Classes
|
|
144
|
+
|
|
145
|
+
| Class | Meaning |
|
|
146
|
+
|-------|---------|
|
|
147
|
+
| `healthy` | Node operating normally |
|
|
148
|
+
| `degraded_30s` | Predicted failure in ~30 seconds |
|
|
149
|
+
| `degraded_20s` | Predicted failure in ~20 seconds |
|
|
150
|
+
| `degraded_10s` | Predicted failure in ~10 seconds |
|
|
151
|
+
| `degraded_5s` | Predicted failure in ~5 seconds |
|
|
152
|
+
| `failed_crash` | Node has crashed / stopped responding |
|
|
153
|
+
| `failed_slow` | Node is severely degraded (slow) |
|
|
154
|
+
| `failed_byzantine` | Node exhibiting Byzantine behaviour |
|
|
155
|
+
| `failed_partition` | Node is network-partitioned |
|
|
156
|
+
|
|
157
|
+
## Collectors
|
|
158
|
+
|
|
159
|
+
### Prometheus (recommended)
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
from proactiveguard.collectors import PrometheusCollector
|
|
163
|
+
|
|
164
|
+
collector = PrometheusCollector(
|
|
165
|
+
prometheus_url="http://prometheus:9090",
|
|
166
|
+
node_selector={"job": "etcd"},
|
|
167
|
+
interval_s=5.0,
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
for obs in collector.stream():
|
|
171
|
+
pg.observe(obs.node_id, obs)
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
### Direct etcd gRPC
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
from proactiveguard.collectors import EtcdCollector # requires pip install proactiveguard[etcd]
|
|
178
|
+
|
|
179
|
+
collector = EtcdCollector(
|
|
180
|
+
endpoints=["http://etcd-0:2379", "http://etcd-1:2379", "http://etcd-2:2379"],
|
|
181
|
+
interval_s=1.0,
|
|
182
|
+
)
|
|
183
|
+
|
|
184
|
+
for obs in collector.stream():
|
|
185
|
+
pg.observe(obs.node_id, obs)
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
## Model Architecture
|
|
189
|
+
|
|
190
|
+
- **Input**: 50-step sliding window × 32 features per step
|
|
191
|
+
- **CNN branch**: 1-D ResNet with Squeeze-and-Excitation attention
|
|
192
|
+
- **LSTM branch**: Bidirectional 2-layer LSTM
|
|
193
|
+
- **Attention branch**: 4-head multi-head self-attention
|
|
194
|
+
- **Fusion**: Concat → MLP → latent representation
|
|
195
|
+
- **Output heads**: 9-class classification + time-to-failure regression + confidence estimation
|
|
196
|
+
|
|
197
|
+
## Research
|
|
198
|
+
|
|
199
|
+
ProactiveGuard is based on peer-reviewed research submitted to the *Journal of Supercomputing*.
|
|
200
|
+
|
|
201
|
+
Experimental results:
|
|
202
|
+
- **Google Cluster traces**: 100% recall
|
|
203
|
+
- **Backblaze hard-drive failures**: 100% recall
|
|
204
|
+
- **etcd/Raft simulation**: 100% recall vs. 98% for Phi Accrual
|
|
205
|
+
|
|
206
|
+
## License
|
|
207
|
+
|
|
208
|
+
MIT License. See [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
# Neural Consensus
|
|
2
|
+
|
|
3
|
+
**Neural Fault Detection with Transfer Learning for Distributed Consensus**
|
|
4
|
+
|
|
5
|
+
A research implementation exploring whether neural networks can detect and classify node failures faster and more accurately than traditional timeout-based methods in distributed consensus systems.
|
|
6
|
+
|
|
7
|
+
## 🎯 Research Question
|
|
8
|
+
|
|
9
|
+
> Can a neural network with transfer learning capabilities detect and classify node failures faster and more accurately than traditional timeout-based methods, while generalizing across different network deployments?
|
|
10
|
+
|
|
11
|
+
## 🔬 Key Innovations
|
|
12
|
+
|
|
13
|
+
1. **Predictive Failure Detection** — Detect failures *before* they happen using learned patterns
|
|
14
|
+
2. **Failure Classification** — Distinguish crash vs Byzantine vs partition vs slowdown
|
|
15
|
+
3. **Transfer Learning** — Train on one deployment, transfer to another with minimal fine-tuning
|
|
16
|
+
|
|
17
|
+
## 📁 Project Structure
|
|
18
|
+
```
|
|
19
|
+
neural-consensus/
|
|
20
|
+
├── simulation/ # Network simulation environment
|
|
21
|
+
│ ├── clock.py # Discrete event simulation clock
|
|
22
|
+
│ ├── network.py # Message passing with delays/loss/partitions
|
|
23
|
+
│ ├── node.py # Base node with failure injection
|
|
24
|
+
│ └── failures.py # Failure injection strategies
|
|
25
|
+
│
|
|
26
|
+
├── protocols/raft/ # Raft consensus implementation
|
|
27
|
+
│ ├── messages.py # Raft message types (Vote, AppendEntries, etc.)
|
|
28
|
+
│ ├── state.py # Raft state management
|
|
29
|
+
│ └── node.py # Complete Raft node
|
|
30
|
+
│
|
|
31
|
+
├── neural/ # Neural network components
|
|
32
|
+
│ ├── features.py # Feature extraction from observations
|
|
33
|
+
│ ├── encoder.py # LSTM autoencoder
|
|
34
|
+
│ ├── classifier.py # Failure classification head
|
|
35
|
+
│ ├── detector.py # Neural failure detector
|
|
36
|
+
│ ├── training.py # Training loop
|
|
37
|
+
│ └── transfer.py # Transfer learning utilities
|
|
38
|
+
│
|
|
39
|
+
├── data/ # Data collection
|
|
40
|
+
│ ├── collector.py # Observation collector
|
|
41
|
+
│ └── labeler.py # Auto-labeling
|
|
42
|
+
│
|
|
43
|
+
├── experiments/ # Experiment scripts
|
|
44
|
+
├── configs/ # Configuration files
|
|
45
|
+
├── models/ # Saved models
|
|
46
|
+
├── results/ # Experiment results
|
|
47
|
+
└── tests/ # Unit tests
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## 🚀 Quick Start
|
|
51
|
+
|
|
52
|
+
### Installation
|
|
53
|
+
```bash
|
|
54
|
+
# Clone and enter directory
|
|
55
|
+
cd neural-consensus
|
|
56
|
+
|
|
57
|
+
# Create virtual environment
|
|
58
|
+
python -m venv venv
|
|
59
|
+
source venv/bin/activate # Windows: venv\Scripts\activate
|
|
60
|
+
|
|
61
|
+
# Install dependencies
|
|
62
|
+
pip install -r requirements.txt
|
|
63
|
+
|
|
64
|
+
# Run tests
|
|
65
|
+
python test_all.py
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Train the Neural Detector
|
|
69
|
+
```bash
|
|
70
|
+
python train_detector.py
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Run Experiments
|
|
74
|
+
```bash
|
|
75
|
+
python run_experiments.py
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## 🧠 Neural Architecture
|
|
79
|
+
```
|
|
80
|
+
Input: [20 observations × 16 features]
|
|
81
|
+
↓
|
|
82
|
+
┌───────────────┐
|
|
83
|
+
│ LSTM Encoder │ (64 units, 2 layers)
|
|
84
|
+
└───────────────┘
|
|
85
|
+
↓
|
|
86
|
+
[32-dim latent space]
|
|
87
|
+
↓
|
|
88
|
+
┌────────┴────────┐
|
|
89
|
+
↓ ↓
|
|
90
|
+
┌─────────┐ ┌──────────────┐
|
|
91
|
+
│ Decoder │ │ Classifier │
|
|
92
|
+
└─────────┘ └──────────────┘
|
|
93
|
+
↓ ↓
|
|
94
|
+
Reconstruction Failure Type
|
|
95
|
+
Error Prediction
|
|
96
|
+
(anomaly score)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Failure Classes
|
|
100
|
+
|
|
101
|
+
| Class | Description |
|
|
102
|
+
|-------|-------------|
|
|
103
|
+
| 0 - Healthy | Normal operation |
|
|
104
|
+
| 1 - Pre-failure | About to fail (within 5s) |
|
|
105
|
+
| 2 - Crashed | Node has stopped |
|
|
106
|
+
| 3 - Byzantine | Malicious behavior |
|
|
107
|
+
| 4 - Partitioned | Network split |
|
|
108
|
+
| 5 - Slow | Degraded performance |
|
|
109
|
+
|
|
110
|
+
### Features (16 per observation)
|
|
111
|
+
|
|
112
|
+
- Latency: mean, std, trend, jitter
|
|
113
|
+
- Messages: rate, drop rate
|
|
114
|
+
- Heartbeats: regularity, missed count
|
|
115
|
+
- Response: rate, time
|
|
116
|
+
- Raft: term freshness, log/commit progress, leader status
|
|
117
|
+
- Composite: health score
|
|
118
|
+
|
|
119
|
+
## 📊 Experiments
|
|
120
|
+
|
|
121
|
+
### 1. Detection Speed
|
|
122
|
+
Compare time-to-detection between neural and timeout-based approaches.
|
|
123
|
+
|
|
124
|
+
### 2. False Positive Rate
|
|
125
|
+
Measure false alarms under various network conditions.
|
|
126
|
+
|
|
127
|
+
### 3. Classification Accuracy
|
|
128
|
+
Evaluate failure type classification with confusion matrix.
|
|
129
|
+
|
|
130
|
+
### 4. Transfer Learning
|
|
131
|
+
Test model transfer across different network deployments.
|
|
132
|
+
|
|
133
|
+
### 5. End-to-End Performance
|
|
134
|
+
Measure impact on consensus throughput, latency, and availability.
|
|
135
|
+
|
|
136
|
+
## 🔧 Configuration
|
|
137
|
+
|
|
138
|
+
See `configs/default.yaml` for all options:
|
|
139
|
+
```yaml
|
|
140
|
+
neural_detector:
|
|
141
|
+
window_size: 20
|
|
142
|
+
encoder:
|
|
143
|
+
hidden_size: 64
|
|
144
|
+
latent_size: 32
|
|
145
|
+
classifier:
|
|
146
|
+
hidden_sizes: [64, 32]
|
|
147
|
+
num_classes: 6
|
|
148
|
+
training:
|
|
149
|
+
epochs: 100
|
|
150
|
+
learning_rate: 0.001
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## 📈 Results
|
|
154
|
+
|
|
155
|
+
After training, results are saved to `results/`:
|
|
156
|
+
- `training_history.png` — Loss curves
|
|
157
|
+
- `confusion_matrix.png` — Classification performance
|
|
158
|
+
- `detection_latency.png` — Time to detect failures
|
|
159
|
+
- `transfer_performance.png` — Transfer learning results
|
|
160
|
+
|
|
161
|
+
## 🔮 Blockchain Applications
|
|
162
|
+
|
|
163
|
+
This research directly applies to:
|
|
164
|
+
- **Proof of Stake** validator monitoring (Ethereum, Solana)
|
|
165
|
+
- **BFT chains** (Cosmos/Tendermint, BNB Chain)
|
|
166
|
+
- **Layer 2** sequencer monitoring
|
|
167
|
+
- **Cross-chain bridges** validator security
|
|
168
|
+
|
|
169
|
+
## 📚 References
|
|
170
|
+
|
|
171
|
+
1. Ongaro & Ousterhout. "In Search of an Understandable Consensus Algorithm" (Raft)
|
|
172
|
+
2. Castro & Liskov. "Practical Byzantine Fault Tolerance"
|
|
173
|
+
3. Kleppmann. "Designing Data-Intensive Applications"
|
|
174
|
+
4. Chandra & Toueg. "Unreliable Failure Detectors for Reliable Distributed Systems"
|
|
175
|
+
|
|
176
|
+
## 📄 License
|
|
177
|
+
|
|
178
|
+
MIT License
|
|
179
|
+
|
|
180
|
+
## 📖 Citation
|
|
181
|
+
```bibtex
|
|
182
|
+
@article{neural-consensus-2025,
|
|
183
|
+
title={Neural Fault Detection with Transfer Learning for Distributed Consensus},
|
|
184
|
+
author={Prakhar},
|
|
185
|
+
year={2025}
|
|
186
|
+
}
|
|
187
|
+
```
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
# ProactiveGuard
|
|
2
|
+
|
|
3
|
+
**Predictive failure detection for distributed consensus systems.**
|
|
4
|
+
|
|
5
|
+
ProactiveGuard uses deep learning to predict node failures in etcd, Raft, and CockroachDB clusters *before* they happen — giving you 5–30 seconds to act rather than reacting after the fact.
|
|
6
|
+
|
|
7
|
+
[![PyPI version](https://badge.fury.io/py/proactiveguard.svg)](https://badge.fury.io/py/proactiveguard)
|
|
8
|
+
[![Python 3.9+](https://img.shields.io/badge/python-3.9%2B-blue.svg)](https://www.python.org/downloads/)
|
|
9
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Why ProactiveGuard?
|
|
14
|
+
|
|
15
|
+
| | Timeout-based | Phi Accrual | **ProactiveGuard** |
|
|
16
|
+
|---|---|---|---|
|
|
17
|
+
| Detects crash failures | After timeout | After timeout | **Before failure** |
|
|
18
|
+
| Detects slow/byzantine | No | No | **Yes** |
|
|
19
|
+
| Recall on etcd simulation | — | 98% | **100%** |
|
|
20
|
+
| Predict time-to-failure | No | No | **Yes (5–30s ahead)** |
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install proactiveguard
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
With real etcd cluster support:
|
|
29
|
+
```bash
|
|
30
|
+
pip install "proactiveguard[etcd]"
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Quick Start
|
|
34
|
+
|
|
35
|
+
### Pre-trained model (etcd / Raft clusters)
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from proactiveguard import ProactiveGuard
|
|
39
|
+
from proactiveguard.collectors import PrometheusCollector
|
|
40
|
+
|
|
41
|
+
pg = ProactiveGuard.from_pretrained("etcd-raft-v1")
|
|
42
|
+
|
|
43
|
+
collector = PrometheusCollector("http://prometheus:9090")
|
|
44
|
+
|
|
45
|
+
for obs in collector.stream():
|
|
46
|
+
result = pg.observe(obs.node_id, obs)
|
|
47
|
+
if result and result.is_pre_failure:
|
|
48
|
+
print(f"Warning: {result.node_id} — {result.status}")
|
|
49
|
+
print(f" Estimated time to failure: {result.time_to_failure:.0f}s")
|
|
50
|
+
print(f" Failure type: {result.failure_type}")
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Train on your own data
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
import numpy as np
|
|
57
|
+
from proactiveguard import ProactiveGuard
|
|
58
|
+
from proactiveguard.engine import PREDICTION_HORIZONS
|
|
59
|
+
|
|
60
|
+
pg = ProactiveGuard(window_size=50)
|
|
61
|
+
|
|
62
|
+
# X: (n_samples, window_size, n_features)
|
|
63
|
+
# y: integer class labels from PREDICTION_HORIZONS
|
|
64
|
+
X_train = np.random.randn(1000, 50, 32).astype("float32")
|
|
65
|
+
y_train = np.random.randint(0, 9, size=1000)
|
|
66
|
+
|
|
67
|
+
pg.fit(X_train, y_train, epochs=50)
|
|
68
|
+
|
|
69
|
+
# Predict
|
|
70
|
+
labels = pg.predict(X_test) # → ['healthy', 'degraded_10s', ...]
|
|
71
|
+
probs = pg.predict_proba(X_test) # → (n, 9) probability array
|
|
72
|
+
|
|
73
|
+
# Save / load
|
|
74
|
+
pg.save("my_model.pt")
|
|
75
|
+
pg = ProactiveGuard.load("my_model.pt")
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Feed raw metrics dict
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
pg = ProactiveGuard.from_pretrained("etcd-raft-v1")
|
|
82
|
+
|
|
83
|
+
# Call observe() once per polling interval for each node
|
|
84
|
+
result = pg.observe("etcd-0", {
|
|
85
|
+
"heartbeat_latency_ms": 45.0,
|
|
86
|
+
"missed_heartbeats": 2,
|
|
87
|
+
"response_rate": 0.8,
|
|
88
|
+
"messages_dropped": 3,
|
|
89
|
+
"term": 5,
|
|
90
|
+
"commit_index": 1042,
|
|
91
|
+
"is_leader": False,
|
|
92
|
+
})
|
|
93
|
+
|
|
94
|
+
if result: # None until window_size observations collected
|
|
95
|
+
print(result)
|
|
96
|
+
# PredictionResult(node='etcd-0', status='degraded_10s', risk=0.82, confidence=0.91)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Prediction Classes
|
|
100
|
+
|
|
101
|
+
| Class | Meaning |
|
|
102
|
+
|-------|---------|
|
|
103
|
+
| `healthy` | Node operating normally |
|
|
104
|
+
| `degraded_30s` | Predicted failure in ~30 seconds |
|
|
105
|
+
| `degraded_20s` | Predicted failure in ~20 seconds |
|
|
106
|
+
| `degraded_10s` | Predicted failure in ~10 seconds |
|
|
107
|
+
| `degraded_5s` | Predicted failure in ~5 seconds |
|
|
108
|
+
| `failed_crash` | Node has crashed / stopped responding |
|
|
109
|
+
| `failed_slow` | Node is severely degraded (slow) |
|
|
110
|
+
| `failed_byzantine` | Node exhibiting Byzantine behaviour |
|
|
111
|
+
| `failed_partition` | Node is network-partitioned |
|
|
112
|
+
|
|
113
|
+
## Collectors
|
|
114
|
+
|
|
115
|
+
### Prometheus (recommended)
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from proactiveguard.collectors import PrometheusCollector
|
|
119
|
+
|
|
120
|
+
collector = PrometheusCollector(
|
|
121
|
+
prometheus_url="http://prometheus:9090",
|
|
122
|
+
node_selector={"job": "etcd"},
|
|
123
|
+
interval_s=5.0,
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
for obs in collector.stream():
|
|
127
|
+
pg.observe(obs.node_id, obs)
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Direct etcd gRPC
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
from proactiveguard.collectors import EtcdCollector # requires pip install proactiveguard[etcd]
|
|
134
|
+
|
|
135
|
+
collector = EtcdCollector(
|
|
136
|
+
endpoints=["http://etcd-0:2379", "http://etcd-1:2379", "http://etcd-2:2379"],
|
|
137
|
+
interval_s=1.0,
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
for obs in collector.stream():
|
|
141
|
+
pg.observe(obs.node_id, obs)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Model Architecture
|
|
145
|
+
|
|
146
|
+
- **Input**: 50-step sliding window × 32 features per step
|
|
147
|
+
- **CNN branch**: 1-D ResNet with Squeeze-and-Excitation attention
|
|
148
|
+
- **LSTM branch**: Bidirectional 2-layer LSTM
|
|
149
|
+
- **Attention branch**: 4-head multi-head self-attention
|
|
150
|
+
- **Fusion**: Concat → MLP → latent representation
|
|
151
|
+
- **Output heads**: 9-class classification + time-to-failure regression + confidence estimation
|
|
152
|
+
|
|
153
|
+
## Research
|
|
154
|
+
|
|
155
|
+
ProactiveGuard is based on peer-reviewed research submitted to the *Journal of Supercomputing*.
|
|
156
|
+
|
|
157
|
+
Experimental results:
|
|
158
|
+
- **Google Cluster traces**: 100% recall
|
|
159
|
+
- **Backblaze hard-drive failures**: 100% recall
|
|
160
|
+
- **etcd/Raft simulation**: 100% recall vs. 98% for Phi Accrual
|
|
161
|
+
|
|
162
|
+
## License
|
|
163
|
+
|
|
164
|
+
MIT License. See [LICENSE](LICENSE) for details.
|