@pioneer-platform/blockbook 8.32.0 → 8.32.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,17 @@
1
1
  # @pioneer-platform/blockbook
2
2
 
3
+ ## 8.32.2
4
+
5
+ ### Patch Changes
6
+
7
+ - 163653b: chore: fix(swap): Fix critical swap amount bug causing 0 ETH transfers
8
+
9
+ ## 8.32.1
10
+
11
+ ### Patch Changes
12
+
13
+ - fix(swap): Fix critical swap amount bug causing 0 ETH transfers
14
+
3
15
  ## 8.32.0
4
16
 
5
17
  ### Minor Changes
@@ -0,0 +1,696 @@
1
+ # Fail-Fast Implementation Guide
2
+
3
+ ## Purpose
4
+
5
+ Implement robust health monitoring and fail-fast mechanisms to detect when the BlockbookWebSocket event system is broken, and signal failure to container orchestration layers immediately.
6
+
7
+ ## Core Philosophy
8
+
9
+ **Silent failures are catastrophic failures.** If we can't receive events, we MUST:
10
+ 1. Detect the failure quickly (within minutes, not hours)
11
+ 2. Log clear diagnostic information
12
+ 3. Signal unhealthy status to orchestration layer
13
+ 4. Exit with non-zero exit code to trigger container restart
14
+
15
+ ---
16
+
17
+ ## Implementation Layers
18
+
19
+ ### Layer 1: BlockbookWebSocket Class Health Tracking
20
+
21
+ Add health monitoring directly to `BlockbookWebSocket.ts`:
22
+
23
+ ```typescript
24
+ export class BlockbookWebSocket extends EventEmitter {
25
+ // ... existing fields ...
26
+
27
+ // Health tracking
28
+ private lastEventTime?: number;
29
+ private healthCheckInterval?: NodeJS.Timeout;
30
+ private eventCounts = {
31
+ blocks: 0,
32
+ transactions: 0,
33
+ total: 0
34
+ };
35
+
36
+ // Configuration
37
+ private readonly HEALTH_CHECK_INTERVAL = 60000; // 1 minute
38
+ private readonly EVENT_TIMEOUT = 300000; // 5 minutes
39
+
40
+ constructor(url: string) {
41
+ super();
42
+ this.url = url;
43
+ }
44
+
45
+ async connect(): Promise<void> {
46
+ // ... existing connection code ...
47
+
48
+ this.ws.once('open', () => {
49
+ this.connected = true;
50
+ this.reconnectAttempts = 0;
51
+ this.emit('connected', this.url);
52
+ this.startPing();
53
+ this.startHealthMonitoring(); // ← NEW: Start health checks
54
+ resolve();
55
+ });
56
+
57
+ // ... rest of connection code ...
58
+ }
59
+
60
+ private handleMessage(message: BlockbookMessage): void {
61
+ const { id, data } = message;
62
+
63
+ if (!id) {
64
+ this.emit('error', new Error('Received message without ID'));
65
+ return;
66
+ }
67
+
68
+ // Check if this is a response to a pending request
69
+ const pending = this.pendingRequests.get(id);
70
+ if (pending) {
71
+ this.pendingRequests.delete(id);
72
+
73
+ if (data?.error) {
74
+ pending.reject(new Error(data.error.message || 'Request failed'));
75
+ } else {
76
+ pending.resolve(data);
77
+ }
78
+ return;
79
+ }
80
+
81
+ // Check if this is a subscription notification
82
+ const subscription = this.subscriptions.get(id);
83
+ if (subscription) {
84
+ // ← NEW: Track event reception
85
+ this.lastEventTime = Date.now();
86
+ this.eventCounts.total++;
87
+
88
+ // Track by type
89
+ if (subscription.method === 'subscribeNewBlock') {
90
+ this.eventCounts.blocks++;
91
+ } else if (subscription.method === 'subscribeAddresses') {
92
+ this.eventCounts.transactions++;
93
+ }
94
+
95
+ if (data?.error) {
96
+ this.emit('error', new Error(`Subscription error for ${subscription.method}: ${data.error.message}`));
97
+ } else {
98
+ try {
99
+ const result = subscription.callback(data);
100
+ if (result instanceof Promise) {
101
+ result.catch((error) => {
102
+ this.emit('error', new Error(`Subscription callback error: ${error}`));
103
+ });
104
+ }
105
+ } catch (error) {
106
+ this.emit('error', new Error(`Subscription callback error: ${error}`));
107
+ }
108
+ }
109
+ return;
110
+ }
111
+
112
+ // Unknown message
113
+ this.emit('unknown-message', message);
114
+ }
115
+
116
+ private startHealthMonitoring(): void {
117
+ this.stopHealthMonitoring();
118
+
119
+ this.healthCheckInterval = setInterval(() => {
120
+ this.checkHealth();
121
+ }, this.HEALTH_CHECK_INTERVAL);
122
+ }
123
+
124
+ private stopHealthMonitoring(): void {
125
+ if (this.healthCheckInterval) {
126
+ clearInterval(this.healthCheckInterval);
127
+ this.healthCheckInterval = undefined;
128
+ }
129
+ }
130
+
131
+ private checkHealth(): void {
132
+ const now = Date.now();
133
+ const hasActiveSubscriptions = this.subscriptions.size > 0;
134
+
135
+ if (!hasActiveSubscriptions) {
136
+ // No subscriptions, nothing to monitor
137
+ return;
138
+ }
139
+
140
+ const timeSinceLastEvent = this.lastEventTime
141
+ ? now - this.lastEventTime
142
+ : Infinity;
143
+
144
+ const healthStatus = {
145
+ connected: this.connected,
146
+ subscriptions: this.subscriptions.size,
147
+ lastEventTime: this.lastEventTime,
148
+ timeSinceLastEvent,
149
+ eventCounts: { ...this.eventCounts },
150
+ url: this.url
151
+ };
152
+
153
+ // Emit health status for monitoring
154
+ this.emit('health-check', healthStatus);
155
+
156
+ // FAIL FAST: No events received within timeout
157
+ if (timeSinceLastEvent > this.EVENT_TIMEOUT) {
158
+ const errorMessage = `CRITICAL: No events received in ${this.EVENT_TIMEOUT / 1000}s despite ${this.subscriptions.size} active subscriptions`;
159
+
160
+ this.emit('health-failure', {
161
+ ...healthStatus,
162
+ error: errorMessage,
163
+ severity: 'critical'
164
+ });
165
+
166
+ // Log to stderr for container log collection
167
+ console.error(`[BlockbookWebSocket] ${errorMessage}`);
168
+ console.error(`[BlockbookWebSocket] Health Status:`, JSON.stringify(healthStatus, null, 2));
169
+ }
170
+ }
171
+
172
+ getHealthStatus(): {
173
+ connected: boolean;
174
+ subscriptions: number;
175
+ lastEventTime?: number;
176
+ timeSinceLastEvent: number;
177
+ eventCounts: typeof this.eventCounts;
178
+ healthy: boolean;
179
+ } {
180
+ const now = Date.now();
181
+ const timeSinceLastEvent = this.lastEventTime
182
+ ? now - this.lastEventTime
183
+ : Infinity;
184
+
185
+ const hasActiveSubscriptions = this.subscriptions.size > 0;
186
+ const healthy = !hasActiveSubscriptions || timeSinceLastEvent < this.EVENT_TIMEOUT;
187
+
188
+ return {
189
+ connected: this.connected,
190
+ subscriptions: this.subscriptions.size,
191
+ lastEventTime: this.lastEventTime,
192
+ timeSinceLastEvent,
193
+ eventCounts: { ...this.eventCounts },
194
+ healthy
195
+ };
196
+ }
197
+
198
+ async disconnect(): Promise<void> {
199
+ if (!this.ws || !this.connected) {
200
+ return;
201
+ }
202
+
203
+ this.reconnecting = false; // Prevent auto-reconnect
204
+ this.stopHealthMonitoring(); // ← NEW: Stop health checks
205
+
206
+ return new Promise((resolve) => {
207
+ this.ws!.once('close', () => resolve());
208
+ this.ws!.close(1000, 'Normal closure');
209
+ });
210
+ }
211
+ }
212
+ ```
213
+
214
+ ---
215
+
216
+ ### Layer 2: Watchtower Service Health Endpoint
217
+
218
+ Add health check endpoint to watchtower service that aggregates all websocket health statuses:
219
+
220
+ ```typescript
221
+ // services/pioneer-watchtower/src/health.ts
222
+
223
+ import { network } from './blockbook-integration';
224
+
225
+ export interface WatchtowerHealth {
226
+ status: 'healthy' | 'degraded' | 'failing';
227
+ timestamp: number;
228
+ chains: {
229
+ [symbol: string]: {
230
+ connected: boolean;
231
+ subscriptions: number;
232
+ timeSinceLastEvent: number;
233
+ eventCounts: {
234
+ blocks: number;
235
+ transactions: number;
236
+ total: number;
237
+ };
238
+ healthy: boolean;
239
+ };
240
+ };
241
+ overall: {
242
+ totalChains: number;
243
+ healthyChains: number;
244
+ degradedChains: number;
245
+ failingChains: number;
246
+ };
247
+ }
248
+
249
+ export function getWatchtowerHealth(): WatchtowerHealth {
250
+ const sockets = network.getBlockbookSockets();
251
+ const chains: WatchtowerHealth['chains'] = {};
252
+
253
+ let healthyCount = 0;
254
+ let degradedCount = 0;
255
+ let failingCount = 0;
256
+
257
+ for (const [symbol, socket] of Object.entries(sockets)) {
258
+ const health = socket.getHealthStatus();
259
+ chains[symbol] = health;
260
+
261
+ if (health.healthy) {
262
+ healthyCount++;
263
+ } else if (health.timeSinceLastEvent < 600000) { // 10 minutes
264
+ degradedCount++;
265
+ } else {
266
+ failingCount++;
267
+ }
268
+ }
269
+
270
+ const totalChains = Object.keys(sockets).length;
271
+
272
+ let status: 'healthy' | 'degraded' | 'failing' = 'healthy';
273
+ if (failingCount > 0) {
274
+ status = 'failing';
275
+ } else if (degradedCount > 0) {
276
+ status = 'degraded';
277
+ }
278
+
279
+ return {
280
+ status,
281
+ timestamp: Date.now(),
282
+ chains,
283
+ overall: {
284
+ totalChains,
285
+ healthyChains: healthyCount,
286
+ degradedChains: degradedCount,
287
+ failingChains: failingCount
288
+ }
289
+ };
290
+ }
291
+
292
+ // HTTP endpoint
293
+ app.get('/health', (req, res) => {
294
+ const health = getWatchtowerHealth();
295
+
296
+ const statusCode = health.status === 'healthy' ? 200 :
297
+ health.status === 'degraded' ? 200 :
298
+ 503; // Service Unavailable for failing
299
+
300
+ res.status(statusCode).json(health);
301
+ });
302
+
303
+ // Kubernetes-style liveness probe (simple boolean)
304
+ app.get('/healthz', (req, res) => {
305
+ const health = getWatchtowerHealth();
306
+
307
+ if (health.status === 'failing') {
308
+ res.status(503).send('FAILING');
309
+ } else {
310
+ res.status(200).send('OK');
311
+ }
312
+ });
313
+
314
+ // Kubernetes-style readiness probe (ready to accept traffic)
315
+ app.get('/readyz', (req, res) => {
316
+ const health = getWatchtowerHealth();
317
+
318
+ // Only ready if at least 80% of chains are healthy
319
+ const healthyPercent = health.overall.totalChains > 0
+ ? health.overall.healthyChains / health.overall.totalChains
+ : 0; // no chains registered yet → not ready (avoids NaN from 0/0)
320
+
321
+ if (healthyPercent >= 0.8) {
322
+ res.status(200).send('READY');
323
+ } else {
324
+ res.status(503).send('NOT_READY');
325
+ }
326
+ });
327
+ ```
328
+
329
+ ---
330
+
331
+ ### Layer 3: Automatic Fail-Fast Exit
332
+
333
+ Automatically exit with error code when event system is critically broken:
+
+ > **Note:** `timeSinceLastEvent` is `Infinity` until the first event arrives, so a freshly
+ > started service with active subscriptions is treated as failing on its first check if no
+ > event has been received yet. If cold starts on slow chains (e.g. BTC, ~10 min blocks) are
+ > expected, baseline `lastEventTime` at subscription time before enabling fail-fast.
334
+
335
+ ```typescript
336
+ // services/pioneer-watchtower/src/fail-fast.ts
337
+
338
+ import { network } from './blockbook-integration';
339
+
340
+ const CRITICAL_TIMEOUT = 600000; // 10 minutes
341
+ const CHECK_INTERVAL = 60000; // Check every minute
342
+
343
+ export function enableFailFast(): void {
344
+ setInterval(() => {
345
+ const sockets = network.getBlockbookSockets();
346
+
347
+ for (const [symbol, socket] of Object.entries(sockets)) {
348
+ const health = socket.getHealthStatus();
349
+
350
+ // If we have subscriptions but no events in 10 minutes, FAIL HARD
351
+ if (health.subscriptions > 0 && health.timeSinceLastEvent > CRITICAL_TIMEOUT) {
352
+ console.error(`\n${'='.repeat(60)}`);
353
+ console.error(`🚨 CRITICAL: ${symbol} event system is BROKEN`);
354
+ console.error(`${'='.repeat(60)}`);
355
+ console.error(`Subscriptions: ${health.subscriptions}`);
356
+ console.error(`Last Event: ${health.lastEventTime ? new Date(health.lastEventTime).toISOString() : 'NEVER'}`);
357
+ console.error(`Time Since Last Event: ${Math.floor(health.timeSinceLastEvent / 1000)}s`);
358
+ console.error(`Event Counts:`, health.eventCounts);
359
+ console.error(`${'='.repeat(60)}`);
360
+ console.error(`\n❌ Exiting with code 1 to signal container orchestration`);
361
+ console.error(`Container will be restarted by orchestrator\n`);
362
+
363
+ process.exit(1);
364
+ }
365
+ }
366
+ }, CHECK_INTERVAL);
367
+
368
+ console.log(`✅ Fail-fast monitoring enabled (${CRITICAL_TIMEOUT/1000}s timeout, ${CHECK_INTERVAL/1000}s check interval)`);
369
+ }
370
+
371
+ // Listen for health-failure events from individual sockets
372
+ export function monitorSocketHealth(): void {
373
+ const sockets = network.getBlockbookSockets();
374
+
375
+ for (const [symbol, socket] of Object.entries(sockets)) {
376
+ socket.on('health-failure', (status) => {
377
+ console.error(`\n⚠️ Health failure detected for ${symbol}:`, status);
378
+
379
+ if (status.severity === 'critical') {
380
+ console.error(`🚨 CRITICAL health failure for ${symbol} - considering fail-fast exit`);
381
+ }
382
+ });
383
+
384
+ socket.on('health-check', (status) => {
385
+ // Log health checks at debug level
386
+ if (process.env.DEBUG_HEALTH) {
387
+ console.log(`[${symbol}] Health:`, {
388
+ subscriptions: status.subscriptions,
389
+ timeSinceLastEvent: Math.floor(status.timeSinceLastEvent / 1000),
390
+ events: status.eventCounts.total
391
+ });
392
+ }
393
+ });
394
+ }
395
+ }
396
+ ```
397
+
398
+ ---
399
+
400
+ ### Layer 4: Container Integration
401
+
402
+ #### Docker Compose Health Check
403
+
404
+ ```yaml
405
+ services:
406
+ pioneer-watchtower:
407
+ image: pioneer-watchtower:latest
408
+ healthcheck:
409
+ test: ["CMD", "curl", "-f", "http://localhost:3000/healthz"]
410
+ interval: 30s
411
+ timeout: 10s
412
+ retries: 3
413
+ start_period: 60s
414
+ restart: unless-stopped
415
+ ```
416
+
417
+ #### Kubernetes Probes
418
+
419
+ ```yaml
420
+ apiVersion: v1
421
+ kind: Pod
422
+ metadata:
423
+ name: pioneer-watchtower
424
+ spec:
425
+ containers:
426
+ - name: watchtower
427
+ image: pioneer-watchtower:latest
428
+ ports:
429
+ - containerPort: 3000
430
+ livenessProbe:
431
+ httpGet:
432
+ path: /healthz
433
+ port: 3000
434
+ initialDelaySeconds: 60
435
+ periodSeconds: 30
436
+ timeoutSeconds: 10
437
+ failureThreshold: 3
438
+ readinessProbe:
439
+ httpGet:
440
+ path: /readyz
441
+ port: 3000
442
+ initialDelaySeconds: 30
443
+ periodSeconds: 10
444
+ timeoutSeconds: 5
445
+ failureThreshold: 2
446
+ ```
447
+
448
+ ---
449
+
450
+ ## Monitoring and Alerting
451
+
452
+ ### Prometheus Metrics
453
+
454
+ ```typescript
455
+ // services/pioneer-watchtower/src/metrics.ts
456
+
457
+ import { network } from './blockbook-integration';
458
+ import { register, Gauge, Counter } from 'prom-client';
459
+
460
+ // Metrics
461
+ const subscriptionsGauge = new Gauge({
462
+ name: 'blockbook_subscriptions_active',
463
+ help: 'Number of active websocket subscriptions',
464
+ labelNames: ['chain']
465
+ });
466
+
467
+ const eventCounter = new Counter({
468
+ name: 'blockbook_events_total',
469
+ help: 'Total number of events received',
470
+ labelNames: ['chain', 'type'] // type: block, transaction
471
+ });
472
+
473
+ const lastEventGauge = new Gauge({
474
+ name: 'blockbook_last_event_seconds',
475
+ help: 'Seconds since last event received',
476
+ labelNames: ['chain']
477
+ });
478
+
479
+ const healthGauge = new Gauge({
480
+ name: 'blockbook_health_status',
481
+ help: 'Health status (1 = healthy, 0 = unhealthy)',
482
+ labelNames: ['chain']
483
+ });
484
+
485
+ // Update metrics periodically
486
+ setInterval(() => {
487
+ const sockets = network.getBlockbookSockets();
488
+
489
+ for (const [symbol, socket] of Object.entries(sockets)) {
490
+ const health = socket.getHealthStatus();
491
+
492
+ subscriptionsGauge.set({ chain: symbol }, health.subscriptions);
493
+ lastEventGauge.set({ chain: symbol }, health.timeSinceLastEvent / 1000);
494
+ healthGauge.set({ chain: symbol }, health.healthy ? 1 : 0);
495
+
496
+ // Event counters are incremented as events occur
497
+ }
498
+ }, 10000); // Update every 10 seconds
499
+
500
+ // Expose metrics endpoint
501
+ app.get('/metrics', async (req, res) => {
502
+ res.set('Content-Type', register.contentType);
503
+ res.end(await register.metrics());
504
+ });
505
+ ```
506
+
507
+ ### Alert Rules (Prometheus Alertmanager)
508
+
509
+ ```yaml
510
+ groups:
511
+ - name: blockbook_events
512
+ interval: 30s
513
+ rules:
514
+ - alert: BlockbookNoEvents
515
+ expr: blockbook_last_event_seconds > 300
516
+ for: 2m
517
+ labels:
518
+ severity: critical
519
+ annotations:
520
+ summary: "Blockbook events not received for {{ $labels.chain }}"
521
+ description: "No events received for {{ $labels.chain }} in over 5 minutes despite active subscriptions"
522
+
523
+ - alert: BlockbookUnhealthy
524
+ expr: blockbook_health_status == 0
525
+ for: 5m
526
+ labels:
527
+ severity: warning
528
+ annotations:
529
+ summary: "Blockbook websocket unhealthy for {{ $labels.chain }}"
530
+ description: "Health check failing for {{ $labels.chain }} chain"
531
+
532
+ - alert: BlockbookSubscriptionsZero
533
+ expr: blockbook_subscriptions_active == 0
534
+ for: 10m
535
+ labels:
536
+ severity: warning
537
+ annotations:
538
+ summary: "No active subscriptions for {{ $labels.chain }}"
539
+ description: "Watchtower has no active subscriptions for {{ $labels.chain }}"
540
+ ```
541
+
542
+ ---
543
+
544
+ ## Testing the Fail-Fast System
545
+
546
+ ### Manual Test 1: Simulate Event Timeout
547
+
548
+ ```bash
549
+ # Start watchtower
550
+ npm start
551
+
552
+ # Wait for subscriptions to be established
553
+ sleep 30
554
+
555
+ # Kill the blockbook websocket connection (simulate network failure)
556
+ # The service should:
557
+ # 1. Detect no events within timeout
558
+ # 2. Log health failures
559
+ # 3. Exit with code 1
560
+ # 4. Container orchestrator restarts service
561
+ ```
562
+
563
+ ### Manual Test 2: Check Health Endpoints
564
+
565
+ ```bash
566
+ # Health endpoint (detailed)
567
+ curl http://localhost:3000/health | jq
568
+
569
+ # Expected output:
570
+ {
571
+ "status": "healthy",
572
+ "timestamp": 1736496000000,
573
+ "chains": {
574
+ "ETH": {
575
+ "connected": true,
576
+ "subscriptions": 2,
577
+ "timeSinceLastEvent": 5432,
578
+ "eventCounts": {
579
+ "blocks": 42,
580
+ "transactions": 7,
581
+ "total": 49
582
+ },
583
+ "healthy": true
584
+ }
585
+ },
586
+ "overall": {
587
+ "totalChains": 1,
588
+ "healthyChains": 1,
589
+ "degradedChains": 0,
590
+ "failingChains": 0
591
+ }
592
+ }
593
+
594
+ # Liveness probe (simple)
595
+ curl http://localhost:3000/healthz
596
+ # Expected: "OK" with 200 status
597
+
598
+ # Readiness probe
599
+ curl http://localhost:3000/readyz
600
+ # Expected: "READY" with 200 status if >= 80% chains healthy
601
+ ```
602
+
603
+ ### Automated Integration Test
604
+
605
+ ```javascript
606
+ // __tests__/test-fail-fast.js
607
+
608
+ const network = require('../lib/index');
609
+
610
+ describe('Fail-Fast System', () => {
611
+ it('should detect event timeout and emit health-failure', async () => {
612
+ await network.init();
613
+ const sockets = network.getBlockbookSockets();
614
+ const ethSocket = sockets.ETH;
615
+
616
+ // Subscribe but never receive events
617
+ await ethSocket.connect();
618
+ await ethSocket.subscribeNewBlock(() => {});
619
+
620
+ // Mock: prevent events from firing
621
+ jest.spyOn(ethSocket, 'handleMessage').mockImplementation(() => {});
622
+
623
+ // Wait for health failure event
624
+ const healthFailure = new Promise((resolve) => {
625
+ ethSocket.on('health-failure', resolve);
626
+ });
627
+
628
+ const failure = await healthFailure;
629
+ expect(failure.severity).toBe('critical');
630
+ expect(failure.timeSinceLastEvent).toBeGreaterThan(300000);
631
+ });
632
+
633
+ it('should expose healthy status when events are flowing', async () => {
634
+ await network.init();
635
+ const health = getWatchtowerHealth();
636
+
637
+ // Wait for at least one event
638
+ await waitForEvents(60000);
639
+
640
+ const updatedHealth = getWatchtowerHealth();
641
+ expect(updatedHealth.status).toBe('healthy');
642
+ expect(updatedHealth.overall.healthyChains).toBeGreaterThan(0);
643
+ });
644
+ });
645
+ ```
646
+
647
+ ---
648
+
649
+ ## Deployment Checklist
650
+
651
+ - [ ] BlockbookWebSocket health tracking implemented
652
+ - [ ] Watchtower health endpoints added (`/health`, `/healthz`, `/readyz`)
653
+ - [ ] Fail-fast auto-exit mechanism enabled
654
+ - [ ] Container health checks configured
655
+ - [ ] Prometheus metrics exposed
656
+ - [ ] Alert rules configured
657
+ - [ ] Integration tests passing
658
+ - [ ] Documentation updated
659
+ - [ ] Monitoring dashboards created
660
+ - [ ] On-call runbooks updated
661
+
662
+ ---
663
+
664
+ ## Incident Response
665
+
666
+ ### When Fail-Fast Triggers
667
+
668
+ 1. **Container Restarts**: Orchestrator automatically restarts the container
669
+ 2. **Check Logs**: Review stderr logs for health failure details
670
+ 3. **Verify Network**: Check if blockbook provider is accessible
671
+ 4. **Check Subscriptions**: Verify addresses are correctly subscribed
672
+ 5. **Monitor Recovery**: Watch for events to start flowing again
673
+ 6. **Escalate**: If container restart doesn't resolve, escalate to on-call
674
+
675
+ ### Common Issues
676
+
677
+ | Symptom | Likely Cause | Resolution |
678
+ |---------|-------------|------------|
679
+ | No events, connection OK | ID mismatch bug regression | Review subscription code |
680
+ | Connection drops | Network/provider issue | Check provider status |
681
+ | Events stop after N hours | Connection timeout | Verify ping/keepalive |
682
+ | Subscriptions = 0 | Initialization failure | Check startup logs |
683
+ | High CPU, no events | Message handling loop | Check for infinite loops |
684
+
685
+ ---
686
+
687
+ ## Conclusion
688
+
689
+ The fail-fast system provides multiple layers of protection:
690
+
691
+ 1. **Real-time detection** via health checks every minute
692
+ 2. **Automatic recovery** via container restarts
693
+ 3. **Visibility** via health endpoints and Prometheus metrics
694
+ 4. **Alerting** for human intervention when auto-recovery fails
695
+
696
+ **Remember**: A service that appears healthy but doesn't process events is worse than a service that crashes. Fail fast, fail visibly, and let orchestration handle recovery.