@pioneer-platform/blockbook 8.32.0 → 8.32.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/FAIL_FAST_IMPLEMENTATION.md +696 -0
- package/SUMMARY.md +313 -0
- package/WEBSOCKET_REGRESSION_ANALYSIS.md +438 -0
- package/lib/BlockbookWebSocket.d.ts +22 -0
- package/lib/BlockbookWebSocket.js +109 -21
- package/lib/index.d.ts +1 -91
- package/lib/index.js +31 -4
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,17 @@
|
|
|
1
1
|
# @pioneer-platform/blockbook
|
|
2
2
|
|
|
3
|
+
## 8.32.2
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- 163653b: chore: fix(swap): Fix critical swap amount bug causing 0 ETH transfers
|
|
8
|
+
|
|
9
|
+
## 8.32.1
|
|
10
|
+
|
|
11
|
+
### Patch Changes
|
|
12
|
+
|
|
13
|
+
- fix(swap): Fix critical swap amount bug causing 0 ETH transfers
|
|
14
|
+
|
|
3
15
|
## 8.32.0
|
|
4
16
|
|
|
5
17
|
### Minor Changes
|
|
@@ -0,0 +1,696 @@
|
|
|
1
|
+
# Fail-Fast Implementation Guide
|
|
2
|
+
|
|
3
|
+
## Purpose
|
|
4
|
+
|
|
5
|
+
Implement robust health monitoring and fail-fast mechanisms to detect when the BlockbookWebSocket event system is broken, and signal failure to container orchestration layers immediately.
|
|
6
|
+
|
|
7
|
+
## Core Philosophy
|
|
8
|
+
|
|
9
|
+
**Silent failures are catastrophic failures.** If we can't receive events, we MUST:
|
|
10
|
+
1. Detect the failure quickly (within minutes, not hours)
|
|
11
|
+
2. Log clear diagnostic information
|
|
12
|
+
3. Signal unhealthy status to orchestration layer
|
|
13
|
+
4. Exit with non-zero exit code to trigger container restart
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## Implementation Layers
|
|
18
|
+
|
|
19
|
+
### Layer 1: BlockbookWebSocket Class Health Tracking
|
|
20
|
+
|
|
21
|
+
Add health monitoring directly to `BlockbookWebSocket.ts`:
|
|
22
|
+
|
|
23
|
+
```typescript
|
|
24
|
+
export class BlockbookWebSocket extends EventEmitter {
|
|
25
|
+
// ... existing fields ...
|
|
26
|
+
|
|
27
|
+
// Health tracking
|
|
28
|
+
private lastEventTime?: number;
|
|
29
|
+
private healthCheckInterval?: NodeJS.Timeout;
|
|
30
|
+
private eventCounts = {
|
|
31
|
+
blocks: 0,
|
|
32
|
+
transactions: 0,
|
|
33
|
+
total: 0
|
|
34
|
+
};
|
|
35
|
+
|
|
36
|
+
// Configuration
|
|
37
|
+
private readonly HEALTH_CHECK_INTERVAL = 60000; // 1 minute
|
|
38
|
+
private readonly EVENT_TIMEOUT = 300000; // 5 minutes
|
|
39
|
+
|
|
40
|
+
constructor(url: string) {
|
|
41
|
+
super();
|
|
42
|
+
this.url = url;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
async connect(): Promise<void> {
|
|
46
|
+
// ... existing connection code ...
|
|
47
|
+
|
|
48
|
+
this.ws.once('open', () => {
|
|
49
|
+
this.connected = true;
|
|
50
|
+
this.reconnectAttempts = 0;
|
|
51
|
+
this.emit('connected', this.url);
|
|
52
|
+
this.startPing();
|
|
53
|
+
this.startHealthMonitoring(); // ← NEW: Start health checks
|
|
54
|
+
resolve();
|
|
55
|
+
});
|
|
56
|
+
|
|
57
|
+
// ... rest of connection code ...
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
private handleMessage(message: BlockbookMessage): void {
|
|
61
|
+
const { id, data } = message;
|
|
62
|
+
|
|
63
|
+
if (!id) {
|
|
64
|
+
this.emit('error', new Error('Received message without ID'));
|
|
65
|
+
return;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
// Check if this is a response to a pending request
|
|
69
|
+
const pending = this.pendingRequests.get(id);
|
|
70
|
+
if (pending) {
|
|
71
|
+
this.pendingRequests.delete(id);
|
|
72
|
+
|
|
73
|
+
if (data?.error) {
|
|
74
|
+
pending.reject(new Error(data.error.message || 'Request failed'));
|
|
75
|
+
} else {
|
|
76
|
+
pending.resolve(data);
|
|
77
|
+
}
|
|
78
|
+
return;
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
// Check if this is a subscription notification
|
|
82
|
+
const subscription = this.subscriptions.get(id);
|
|
83
|
+
if (subscription) {
|
|
84
|
+
// ← NEW: Track event reception
|
|
85
|
+
this.lastEventTime = Date.now();
|
|
86
|
+
this.eventCounts.total++;
|
|
87
|
+
|
|
88
|
+
// Track by type
|
|
89
|
+
if (subscription.method === 'subscribeNewBlock') {
|
|
90
|
+
this.eventCounts.blocks++;
|
|
91
|
+
} else if (subscription.method === 'subscribeAddresses') {
|
|
92
|
+
this.eventCounts.transactions++;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
if (data?.error) {
|
|
96
|
+
this.emit('error', new Error(`Subscription error for ${subscription.method}: ${data.error.message}`));
|
|
97
|
+
} else {
|
|
98
|
+
try {
|
|
99
|
+
const result = subscription.callback(data);
|
|
100
|
+
if (result instanceof Promise) {
|
|
101
|
+
result.catch((error) => {
|
|
102
|
+
this.emit('error', new Error(`Subscription callback error: ${error}`));
|
|
103
|
+
});
|
|
104
|
+
}
|
|
105
|
+
} catch (error) {
|
|
106
|
+
this.emit('error', new Error(`Subscription callback error: ${error}`));
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
return;
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
// Unknown message
|
|
113
|
+
this.emit('unknown-message', message);
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
private startHealthMonitoring(): void {
|
|
117
|
+
this.stopHealthMonitoring();
|
|
118
|
+
|
|
119
|
+
this.healthCheckInterval = setInterval(() => {
|
|
120
|
+
this.checkHealth();
|
|
121
|
+
}, this.HEALTH_CHECK_INTERVAL);
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
private stopHealthMonitoring(): void {
|
|
125
|
+
if (this.healthCheckInterval) {
|
|
126
|
+
clearInterval(this.healthCheckInterval);
|
|
127
|
+
this.healthCheckInterval = undefined;
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
private checkHealth(): void {
|
|
132
|
+
const now = Date.now();
|
|
133
|
+
const hasActiveSubscriptions = this.subscriptions.size > 0;
|
|
134
|
+
|
|
135
|
+
if (!hasActiveSubscriptions) {
|
|
136
|
+
// No subscriptions, nothing to monitor
|
|
137
|
+
return;
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
const timeSinceLastEvent = this.lastEventTime
|
|
141
|
+
? now - this.lastEventTime
|
|
142
|
+
: Infinity;
|
|
143
|
+
|
|
144
|
+
const healthStatus = {
|
|
145
|
+
connected: this.connected,
|
|
146
|
+
subscriptions: this.subscriptions.size,
|
|
147
|
+
lastEventTime: this.lastEventTime,
|
|
148
|
+
timeSinceLastEvent,
|
|
149
|
+
eventCounts: { ...this.eventCounts },
|
|
150
|
+
url: this.url
|
|
151
|
+
};
|
|
152
|
+
|
|
153
|
+
// Emit health status for monitoring
|
|
154
|
+
this.emit('health-check', healthStatus);
|
|
155
|
+
|
|
156
|
+
// FAIL FAST: No events received within timeout
|
|
157
|
+
if (timeSinceLastEvent > this.EVENT_TIMEOUT) {
|
|
158
|
+
const errorMessage = `CRITICAL: No events received in ${this.EVENT_TIMEOUT / 1000}s despite ${this.subscriptions.size} active subscriptions`;
|
|
159
|
+
|
|
160
|
+
this.emit('health-failure', {
|
|
161
|
+
...healthStatus,
|
|
162
|
+
error: errorMessage,
|
|
163
|
+
severity: 'critical'
|
|
164
|
+
});
|
|
165
|
+
|
|
166
|
+
// Log to stderr for container log collection
|
|
167
|
+
console.error(`[BlockbookWebSocket] ${errorMessage}`);
|
|
168
|
+
console.error(`[BlockbookWebSocket] Health Status:`, JSON.stringify(healthStatus, null, 2));
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
getHealthStatus(): {
|
|
173
|
+
connected: boolean;
|
|
174
|
+
subscriptions: number;
|
|
175
|
+
lastEventTime?: number;
|
|
176
|
+
timeSinceLastEvent: number;
|
|
177
|
+
eventCounts: typeof this.eventCounts;
|
|
178
|
+
healthy: boolean;
|
|
179
|
+
} {
|
|
180
|
+
const now = Date.now();
|
|
181
|
+
const timeSinceLastEvent = this.lastEventTime
|
|
182
|
+
? now - this.lastEventTime
|
|
183
|
+
: Infinity;
|
|
184
|
+
|
|
185
|
+
const hasActiveSubscriptions = this.subscriptions.size > 0;
|
|
186
|
+
const healthy = !hasActiveSubscriptions || timeSinceLastEvent < this.EVENT_TIMEOUT;
|
|
187
|
+
|
|
188
|
+
return {
|
|
189
|
+
connected: this.connected,
|
|
190
|
+
subscriptions: this.subscriptions.size,
|
|
191
|
+
lastEventTime: this.lastEventTime,
|
|
192
|
+
timeSinceLastEvent,
|
|
193
|
+
eventCounts: { ...this.eventCounts },
|
|
194
|
+
healthy
|
|
195
|
+
};
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
async disconnect(): Promise<void> {
|
|
199
|
+
if (!this.ws || !this.connected) {
|
|
200
|
+
return;
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
this.reconnecting = false; // Prevent auto-reconnect
|
|
204
|
+
this.stopHealthMonitoring(); // ← NEW: Stop health checks
|
|
205
|
+
|
|
206
|
+
return new Promise((resolve) => {
|
|
207
|
+
this.ws!.once('close', () => resolve());
|
|
208
|
+
this.ws!.close(1000, 'Normal closure');
|
|
209
|
+
});
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
---
|
|
215
|
+
|
|
216
|
+
### Layer 2: Watchtower Service Health Endpoint
|
|
217
|
+
|
|
218
|
+
Add health check endpoint to watchtower service that aggregates all websocket health statuses:
|
|
219
|
+
|
|
220
|
+
```typescript
|
|
221
|
+
// services/pioneer-watchtower/src/health.ts
|
|
222
|
+
|
|
223
|
+
import { network } from './blockbook-integration';
|
|
224
|
+
|
|
225
|
+
export interface WatchtowerHealth {
|
|
226
|
+
status: 'healthy' | 'degraded' | 'failing';
|
|
227
|
+
timestamp: number;
|
|
228
|
+
chains: {
|
|
229
|
+
[symbol: string]: {
|
|
230
|
+
connected: boolean;
|
|
231
|
+
subscriptions: number;
|
|
232
|
+
timeSinceLastEvent: number;
|
|
233
|
+
eventCounts: {
|
|
234
|
+
blocks: number;
|
|
235
|
+
transactions: number;
|
|
236
|
+
total: number;
|
|
237
|
+
};
|
|
238
|
+
healthy: boolean;
|
|
239
|
+
};
|
|
240
|
+
};
|
|
241
|
+
overall: {
|
|
242
|
+
totalChains: number;
|
|
243
|
+
healthyChains: number;
|
|
244
|
+
degradedChains: number;
|
|
245
|
+
failingChains: number;
|
|
246
|
+
};
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
export function getWatchtowerHealth(): WatchtowerHealth {
|
|
250
|
+
const sockets = network.getBlockbookSockets();
|
|
251
|
+
const chains: WatchtowerHealth['chains'] = {};
|
|
252
|
+
|
|
253
|
+
let healthyCount = 0;
|
|
254
|
+
let degradedCount = 0;
|
|
255
|
+
let failingCount = 0;
|
|
256
|
+
|
|
257
|
+
for (const [symbol, socket] of Object.entries(sockets)) {
|
|
258
|
+
const health = socket.getHealthStatus();
|
|
259
|
+
chains[symbol] = health;
|
|
260
|
+
|
|
261
|
+
if (health.healthy) {
|
|
262
|
+
healthyCount++;
|
|
263
|
+
} else if (health.timeSinceLastEvent < 600000) { // 10 minutes
|
|
264
|
+
degradedCount++;
|
|
265
|
+
} else {
|
|
266
|
+
failingCount++;
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
const totalChains = Object.keys(sockets).length;
|
|
271
|
+
|
|
272
|
+
let status: 'healthy' | 'degraded' | 'failing' = 'healthy';
|
|
273
|
+
if (failingCount > 0) {
|
|
274
|
+
status = 'failing';
|
|
275
|
+
} else if (degradedCount > 0) {
|
|
276
|
+
status = 'degraded';
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
return {
|
|
280
|
+
status,
|
|
281
|
+
timestamp: Date.now(),
|
|
282
|
+
chains,
|
|
283
|
+
overall: {
|
|
284
|
+
totalChains,
|
|
285
|
+
healthyChains: healthyCount,
|
|
286
|
+
degradedChains: degradedCount,
|
|
287
|
+
failingChains: failingCount
|
|
288
|
+
}
|
|
289
|
+
};
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
// HTTP endpoint
|
|
293
|
+
app.get('/health', (req, res) => {
|
|
294
|
+
const health = getWatchtowerHealth();
|
|
295
|
+
|
|
296
|
+
const statusCode = health.status === 'healthy' ? 200 :
|
|
297
|
+
health.status === 'degraded' ? 200 :
|
|
298
|
+
503; // Service Unavailable for failing
|
|
299
|
+
|
|
300
|
+
res.status(statusCode).json(health);
|
|
301
|
+
});
|
|
302
|
+
|
|
303
|
+
// Kubernetes-style liveness probe (simple boolean)
|
|
304
|
+
app.get('/healthz', (req, res) => {
|
|
305
|
+
const health = getWatchtowerHealth();
|
|
306
|
+
|
|
307
|
+
if (health.status === 'failing') {
|
|
308
|
+
res.status(503).send('FAILING');
|
|
309
|
+
} else {
|
|
310
|
+
res.status(200).send('OK');
|
|
311
|
+
}
|
|
312
|
+
});
|
|
313
|
+
|
|
314
|
+
// Kubernetes-style readiness probe (ready to accept traffic)
|
|
315
|
+
app.get('/readyz', (req, res) => {
|
|
316
|
+
const health = getWatchtowerHealth();
|
|
317
|
+
|
|
318
|
+
// Only ready if at least 80% of chains are healthy
|
|
319
|
+
const healthyPercent = health.overall.healthyChains / health.overall.totalChains;
|
|
320
|
+
|
|
321
|
+
if (healthyPercent >= 0.8) {
|
|
322
|
+
res.status(200).send('READY');
|
|
323
|
+
} else {
|
|
324
|
+
res.status(503).send('NOT_READY');
|
|
325
|
+
}
|
|
326
|
+
});
|
|
327
|
+
```
|
|
328
|
+
|
|
329
|
+
---
|
|
330
|
+
|
|
331
|
+
### Layer 3: Automatic Fail-Fast Exit
|
|
332
|
+
|
|
333
|
+
Automatically exit with error code when event system is critically broken:
|
|
334
|
+
|
|
335
|
+
```typescript
|
|
336
|
+
// services/pioneer-watchtower/src/fail-fast.ts
|
|
337
|
+
|
|
338
|
+
import { network } from './blockbook-integration';
|
|
339
|
+
|
|
340
|
+
const CRITICAL_TIMEOUT = 600000; // 10 minutes
|
|
341
|
+
const CHECK_INTERVAL = 60000; // Check every minute
|
|
342
|
+
|
|
343
|
+
export function enableFailFast(): void {
|
|
344
|
+
setInterval(() => {
|
|
345
|
+
const sockets = network.getBlockbookSockets();
|
|
346
|
+
|
|
347
|
+
for (const [symbol, socket] of Object.entries(sockets)) {
|
|
348
|
+
const health = socket.getHealthStatus();
|
|
349
|
+
|
|
350
|
+
// If we have subscriptions but no events in 10 minutes, FAIL HARD
|
|
351
|
+
if (health.subscriptions > 0 && health.timeSinceLastEvent > CRITICAL_TIMEOUT) {
|
|
352
|
+
console.error(`\n${'='.repeat(60)}`);
|
|
353
|
+
console.error(`🚨 CRITICAL: ${symbol} event system is BROKEN`);
|
|
354
|
+
console.error(`${'='.repeat(60)}`);
|
|
355
|
+
console.error(`Subscriptions: ${health.subscriptions}`);
|
|
356
|
+
console.error(`Last Event: ${health.lastEventTime ? new Date(health.lastEventTime).toISOString() : 'NEVER'}`);
|
|
357
|
+
console.error(`Time Since Last Event: ${Math.floor(health.timeSinceLastEvent / 1000)}s`);
|
|
358
|
+
console.error(`Event Counts:`, health.eventCounts);
|
|
359
|
+
console.error(`${'='.repeat(60)}`);
|
|
360
|
+
console.error(`\n❌ Exiting with code 1 to signal container orchestration`);
|
|
361
|
+
console.error(`Container will be restarted by orchestrator\n`);
|
|
362
|
+
|
|
363
|
+
process.exit(1);
|
|
364
|
+
}
|
|
365
|
+
}
|
|
366
|
+
}, CHECK_INTERVAL);
|
|
367
|
+
|
|
368
|
+
console.log(`✅ Fail-fast monitoring enabled (${CRITICAL_TIMEOUT/1000}s timeout, ${CHECK_INTERVAL/1000}s check interval)`);
|
|
369
|
+
}
|
|
370
|
+
|
|
371
|
+
// Listen for health-failure events from individual sockets
|
|
372
|
+
export function monitorSocketHealth(): void {
|
|
373
|
+
const sockets = network.getBlockbookSockets();
|
|
374
|
+
|
|
375
|
+
for (const [symbol, socket] of Object.entries(sockets)) {
|
|
376
|
+
socket.on('health-failure', (status) => {
|
|
377
|
+
console.error(`\n⚠️ Health failure detected for ${symbol}:`, status);
|
|
378
|
+
|
|
379
|
+
if (status.severity === 'critical') {
|
|
380
|
+
console.error(`🚨 CRITICAL health failure for ${symbol} - considering fail-fast exit`);
|
|
381
|
+
}
|
|
382
|
+
});
|
|
383
|
+
|
|
384
|
+
socket.on('health-check', (status) => {
|
|
385
|
+
// Log health checks at debug level
|
|
386
|
+
if (process.env.DEBUG_HEALTH) {
|
|
387
|
+
console.log(`[${symbol}] Health:`, {
|
|
388
|
+
subscriptions: status.subscriptions,
|
|
389
|
+
timeSinceLastEvent: Math.floor(status.timeSinceLastEvent / 1000),
|
|
390
|
+
events: status.eventCounts.total
|
|
391
|
+
});
|
|
392
|
+
}
|
|
393
|
+
});
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
```
|
|
397
|
+
|
|
398
|
+
---
|
|
399
|
+
|
|
400
|
+
### Layer 4: Container Integration
|
|
401
|
+
|
|
402
|
+
#### Docker Compose Health Check
|
|
403
|
+
|
|
404
|
+
```yaml
|
|
405
|
+
services:
|
|
406
|
+
pioneer-watchtower:
|
|
407
|
+
image: pioneer-watchtower:latest
|
|
408
|
+
healthcheck:
|
|
409
|
+
test: ["CMD", "curl", "-f", "http://localhost:3000/healthz"]
|
|
410
|
+
interval: 30s
|
|
411
|
+
timeout: 10s
|
|
412
|
+
retries: 3
|
|
413
|
+
start_period: 60s
|
|
414
|
+
restart: unless-stopped
|
|
415
|
+
```
|
|
416
|
+
|
|
417
|
+
#### Kubernetes Probes
|
|
418
|
+
|
|
419
|
+
```yaml
|
|
420
|
+
apiVersion: v1
|
|
421
|
+
kind: Pod
|
|
422
|
+
metadata:
|
|
423
|
+
name: pioneer-watchtower
|
|
424
|
+
spec:
|
|
425
|
+
containers:
|
|
426
|
+
- name: watchtower
|
|
427
|
+
image: pioneer-watchtower:latest
|
|
428
|
+
ports:
|
|
429
|
+
- containerPort: 3000
|
|
430
|
+
livenessProbe:
|
|
431
|
+
httpGet:
|
|
432
|
+
path: /healthz
|
|
433
|
+
port: 3000
|
|
434
|
+
initialDelaySeconds: 60
|
|
435
|
+
periodSeconds: 30
|
|
436
|
+
timeoutSeconds: 10
|
|
437
|
+
failureThreshold: 3
|
|
438
|
+
readinessProbe:
|
|
439
|
+
httpGet:
|
|
440
|
+
path: /readyz
|
|
441
|
+
port: 3000
|
|
442
|
+
initialDelaySeconds: 30
|
|
443
|
+
periodSeconds: 10
|
|
444
|
+
timeoutSeconds: 5
|
|
445
|
+
failureThreshold: 2
|
|
446
|
+
```
|
|
447
|
+
|
|
448
|
+
---
|
|
449
|
+
|
|
450
|
+
## Monitoring and Alerting
|
|
451
|
+
|
|
452
|
+
### Prometheus Metrics
|
|
453
|
+
|
|
454
|
+
```typescript
|
|
455
|
+
// services/pioneer-watchtower/src/metrics.ts
|
|
456
|
+
|
|
457
|
+
import { network } from './blockbook-integration';
|
|
458
|
+
import { register, Gauge, Counter } from 'prom-client';
|
|
459
|
+
|
|
460
|
+
// Metrics
|
|
461
|
+
const subscriptionsGauge = new Gauge({
|
|
462
|
+
name: 'blockbook_subscriptions_active',
|
|
463
|
+
help: 'Number of active websocket subscriptions',
|
|
464
|
+
labelNames: ['chain']
|
|
465
|
+
});
|
|
466
|
+
|
|
467
|
+
const eventCounter = new Counter({
|
|
468
|
+
name: 'blockbook_events_total',
|
|
469
|
+
help: 'Total number of events received',
|
|
470
|
+
labelNames: ['chain', 'type'] // type: block, transaction
|
|
471
|
+
});
|
|
472
|
+
|
|
473
|
+
const lastEventGauge = new Gauge({
|
|
474
|
+
name: 'blockbook_last_event_seconds',
|
|
475
|
+
help: 'Seconds since last event received',
|
|
476
|
+
labelNames: ['chain']
|
|
477
|
+
});
|
|
478
|
+
|
|
479
|
+
const healthGauge = new Gauge({
|
|
480
|
+
name: 'blockbook_health_status',
|
|
481
|
+
help: 'Health status (1 = healthy, 0 = unhealthy)',
|
|
482
|
+
labelNames: ['chain']
|
|
483
|
+
});
|
|
484
|
+
|
|
485
|
+
// Update metrics periodically
|
|
486
|
+
setInterval(() => {
|
|
487
|
+
const sockets = network.getBlockbookSockets();
|
|
488
|
+
|
|
489
|
+
for (const [symbol, socket] of Object.entries(sockets)) {
|
|
490
|
+
const health = socket.getHealthStatus();
|
|
491
|
+
|
|
492
|
+
subscriptionsGauge.set({ chain: symbol }, health.subscriptions);
|
|
493
|
+
lastEventGauge.set({ chain: symbol }, health.timeSinceLastEvent / 1000);
|
|
494
|
+
healthGauge.set({ chain: symbol }, health.healthy ? 1 : 0);
|
|
495
|
+
|
|
496
|
+
// Event counters are incremented as events occur
|
|
497
|
+
}
|
|
498
|
+
}, 10000); // Update every 10 seconds
|
|
499
|
+
|
|
500
|
+
// Expose metrics endpoint
|
|
501
|
+
app.get('/metrics', async (req, res) => {
|
|
502
|
+
res.set('Content-Type', register.contentType);
|
|
503
|
+
res.end(await register.metrics());
|
|
504
|
+
});
|
|
505
|
+
```
|
|
506
|
+
|
|
507
|
+
### Alert Rules (Prometheus Alertmanager)
|
|
508
|
+
|
|
509
|
+
```yaml
|
|
510
|
+
groups:
|
|
511
|
+
- name: blockbook_events
|
|
512
|
+
interval: 30s
|
|
513
|
+
rules:
|
|
514
|
+
- alert: BlockbookNoEvents
|
|
515
|
+
expr: blockbook_last_event_seconds > 300
|
|
516
|
+
for: 2m
|
|
517
|
+
labels:
|
|
518
|
+
severity: critical
|
|
519
|
+
annotations:
|
|
520
|
+
summary: "Blockbook events not received for {{ $labels.chain }}"
|
|
521
|
+
description: "No events received for {{ $labels.chain }} in over 5 minutes despite active subscriptions"
|
|
522
|
+
|
|
523
|
+
- alert: BlockbookUnhealthy
|
|
524
|
+
expr: blockbook_health_status == 0
|
|
525
|
+
for: 5m
|
|
526
|
+
labels:
|
|
527
|
+
severity: warning
|
|
528
|
+
annotations:
|
|
529
|
+
summary: "Blockbook websocket unhealthy for {{ $labels.chain }}"
|
|
530
|
+
description: "Health check failing for {{ $labels.chain }} chain"
|
|
531
|
+
|
|
532
|
+
- alert: BlockbookSubscriptionsZero
|
|
533
|
+
expr: blockbook_subscriptions_active == 0
|
|
534
|
+
for: 10m
|
|
535
|
+
labels:
|
|
536
|
+
severity: warning
|
|
537
|
+
annotations:
|
|
538
|
+
summary: "No active subscriptions for {{ $labels.chain }}"
|
|
539
|
+
description: "Watchtower has no active subscriptions for {{ $labels.chain }}"
|
|
540
|
+
```
|
|
541
|
+
|
|
542
|
+
---
|
|
543
|
+
|
|
544
|
+
## Testing the Fail-Fast System
|
|
545
|
+
|
|
546
|
+
### Manual Test 1: Simulate Event Timeout
|
|
547
|
+
|
|
548
|
+
```bash
|
|
549
|
+
# Start watchtower
|
|
550
|
+
npm start
|
|
551
|
+
|
|
552
|
+
# Wait for subscriptions to be established
|
|
553
|
+
sleep 30
|
|
554
|
+
|
|
555
|
+
# Kill the blockbook websocket connection (simulate network failure)
|
|
556
|
+
# The service should:
|
|
557
|
+
# 1. Detect no events within timeout
|
|
558
|
+
# 2. Log health failures
|
|
559
|
+
# 3. Exit with code 1
|
|
560
|
+
# 4. Container orchestrator restarts service
|
|
561
|
+
```
|
|
562
|
+
|
|
563
|
+
### Manual Test 2: Check Health Endpoints
|
|
564
|
+
|
|
565
|
+
```bash
|
|
566
|
+
# Health endpoint (detailed)
|
|
567
|
+
curl http://localhost:3000/health | jq
|
|
568
|
+
|
|
569
|
+
# Expected output:
|
|
570
|
+
{
|
|
571
|
+
"status": "healthy",
|
|
572
|
+
"timestamp": 1736496000000,
|
|
573
|
+
"chains": {
|
|
574
|
+
"ETH": {
|
|
575
|
+
"connected": true,
|
|
576
|
+
"subscriptions": 2,
|
|
577
|
+
"timeSinceLastEvent": 5432,
|
|
578
|
+
"eventCounts": {
|
|
579
|
+
"blocks": 42,
|
|
580
|
+
"transactions": 7,
|
|
581
|
+
"total": 49
|
|
582
|
+
},
|
|
583
|
+
"healthy": true
|
|
584
|
+
}
|
|
585
|
+
},
|
|
586
|
+
"overall": {
|
|
587
|
+
"totalChains": 1,
|
|
588
|
+
"healthyChains": 1,
|
|
589
|
+
"degradedChains": 0,
|
|
590
|
+
"failingChains": 0
|
|
591
|
+
}
|
|
592
|
+
}
|
|
593
|
+
|
|
594
|
+
# Liveness probe (simple)
|
|
595
|
+
curl http://localhost:3000/healthz
|
|
596
|
+
# Expected: "OK" with 200 status
|
|
597
|
+
|
|
598
|
+
# Readiness probe
|
|
599
|
+
curl http://localhost:3000/readyz
|
|
600
|
+
# Expected: "READY" with 200 status if >= 80% chains healthy
|
|
601
|
+
```
|
|
602
|
+
|
|
603
|
+
### Automated Integration Test
|
|
604
|
+
|
|
605
|
+
```javascript
|
|
606
|
+
// __tests__/test-fail-fast.js
|
|
607
|
+
|
|
608
|
+
const network = require('../lib/index');
|
|
609
|
+
|
|
610
|
+
describe('Fail-Fast System', () => {
|
|
611
|
+
it('should detect event timeout and emit health-failure', async () => {
|
|
612
|
+
await network.init();
|
|
613
|
+
const sockets = network.getBlockbookSockets();
|
|
614
|
+
const ethSocket = sockets.ETH;
|
|
615
|
+
|
|
616
|
+
// Subscribe but never receive events
|
|
617
|
+
await ethSocket.connect();
|
|
618
|
+
await ethSocket.subscribeNewBlock(() => {});
|
|
619
|
+
|
|
620
|
+
// Mock: prevent events from firing
|
|
621
|
+
jest.spyOn(ethSocket, 'handleMessage').mockImplementation(() => {});
|
|
622
|
+
|
|
623
|
+
// Wait for health failure event
|
|
624
|
+
const healthFailure = new Promise((resolve) => {
|
|
625
|
+
ethSocket.on('health-failure', resolve);
|
|
626
|
+
});
|
|
627
|
+
|
|
628
|
+
const failure = await healthFailure;
|
|
629
|
+
expect(failure.severity).toBe('critical');
|
|
630
|
+
expect(failure.timeSinceLastEvent).toBeGreaterThan(300000);
|
|
631
|
+
});
|
|
632
|
+
|
|
633
|
+
it('should expose healthy status when events are flowing', async () => {
|
|
634
|
+
await network.init();
|
|
635
|
+
const health = getWatchtowerHealth();
|
|
636
|
+
|
|
637
|
+
// Wait for at least one event
|
|
638
|
+
await waitForEvents(60000);
|
|
639
|
+
|
|
640
|
+
const updatedHealth = getWatchtowerHealth();
|
|
641
|
+
expect(updatedHealth.status).toBe('healthy');
|
|
642
|
+
expect(updatedHealth.overall.healthyChains).toBeGreaterThan(0);
|
|
643
|
+
});
|
|
644
|
+
});
|
|
645
|
+
```
|
|
646
|
+
|
|
647
|
+
---
|
|
648
|
+
|
|
649
|
+
## Deployment Checklist
|
|
650
|
+
|
|
651
|
+
- [ ] BlockbookWebSocket health tracking implemented
|
|
652
|
+
- [ ] Watchtower health endpoints added (`/health`, `/healthz`, `/readyz`)
|
|
653
|
+
- [ ] Fail-fast auto-exit mechanism enabled
|
|
654
|
+
- [ ] Container health checks configured
|
|
655
|
+
- [ ] Prometheus metrics exposed
|
|
656
|
+
- [ ] Alert rules configured
|
|
657
|
+
- [ ] Integration tests passing
|
|
658
|
+
- [ ] Documentation updated
|
|
659
|
+
- [ ] Monitoring dashboards created
|
|
660
|
+
- [ ] On-call runbooks updated
|
|
661
|
+
|
|
662
|
+
---
|
|
663
|
+
|
|
664
|
+
## Incident Response
|
|
665
|
+
|
|
666
|
+
### When Fail-Fast Triggers
|
|
667
|
+
|
|
668
|
+
1. **Container Restarts**: Orchestrator automatically restarts the container
|
|
669
|
+
2. **Check Logs**: Review stderr logs for health failure details
|
|
670
|
+
3. **Verify Network**: Check if blockbook provider is accessible
|
|
671
|
+
4. **Check Subscriptions**: Verify addresses are correctly subscribed
|
|
672
|
+
5. **Monitor Recovery**: Watch for events to start flowing again
|
|
673
|
+
6. **Escalate**: If container restart doesn't resolve, escalate to on-call
|
|
674
|
+
|
|
675
|
+
### Common Issues
|
|
676
|
+
|
|
677
|
+
| Symptom | Likely Cause | Resolution |
|
|
678
|
+
|---------|-------------|------------|
|
|
679
|
+
| No events, connection OK | ID mismatch bug regression | Review subscription code |
|
|
680
|
+
| Connection drops | Network/provider issue | Check provider status |
|
|
681
|
+
| Events stop after N hours | Connection timeout | Verify ping/keepalive |
|
|
682
|
+
| Subscriptions = 0 | Initialization failure | Check startup logs |
|
|
683
|
+
| High CPU, no events | Message handling loop | Check for infinite loops |
|
|
684
|
+
|
|
685
|
+
---
|
|
686
|
+
|
|
687
|
+
## Conclusion
|
|
688
|
+
|
|
689
|
+
The fail-fast system provides multiple layers of protection:
|
|
690
|
+
|
|
691
|
+
1. **Real-time detection** via health checks every minute
|
|
692
|
+
2. **Automatic recovery** via container restarts
|
|
693
|
+
3. **Visibility** via health endpoints and Prometheus metrics
|
|
694
|
+
4. **Alerting** for human intervention when auto-recovery fails
|
|
695
|
+
|
|
696
|
+
**Remember**: A service that appears healthy but doesn't process events is worse than a service that crashes. Fail fast, fail visibly, and let orchestration handle recovery.
|