hungry-ghost-hive 0.45.0 → 0.46.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/dist/cli/commands/cluster.d.ts.map +1 -1
  2. package/dist/cli/commands/cluster.js +348 -1
  3. package/dist/cli/commands/cluster.js.map +1 -1
  4. package/dist/cli/commands/cluster.test.js +313 -9
  5. package/dist/cli/commands/cluster.test.js.map +1 -1
  6. package/dist/cli/commands/req-spawn.test.d.ts +2 -0
  7. package/dist/cli/commands/req-spawn.test.d.ts.map +1 -0
  8. package/dist/cli/commands/req-spawn.test.js +116 -0
  9. package/dist/cli/commands/req-spawn.test.js.map +1 -0
  10. package/dist/cli/commands/req.d.ts.map +1 -1
  11. package/dist/cli/commands/req.js +21 -13
  12. package/dist/cli/commands/req.js.map +1 -1
  13. package/dist/cluster/cluster-http-server.d.ts +32 -0
  14. package/dist/cluster/cluster-http-server.d.ts.map +1 -1
  15. package/dist/cluster/cluster-http-server.js +42 -0
  16. package/dist/cluster/cluster-http-server.js.map +1 -1
  17. package/dist/cluster/distributed-runtime-coverage.test.js +9 -0
  18. package/dist/cluster/distributed-runtime-coverage.test.js.map +1 -1
  19. package/dist/cluster/distributed-system.test.js +135 -0
  20. package/dist/cluster/distributed-system.test.js.map +1 -1
  21. package/dist/cluster/events.d.ts +23 -0
  22. package/dist/cluster/events.d.ts.map +1 -1
  23. package/dist/cluster/events.js +74 -0
  24. package/dist/cluster/events.js.map +1 -1
  25. package/dist/cluster/heartbeat-manager.d.ts +2 -0
  26. package/dist/cluster/heartbeat-manager.d.ts.map +1 -1
  27. package/dist/cluster/heartbeat-manager.js +42 -6
  28. package/dist/cluster/heartbeat-manager.js.map +1 -1
  29. package/dist/cluster/membership.test.d.ts +2 -0
  30. package/dist/cluster/membership.test.d.ts.map +1 -0
  31. package/dist/cluster/membership.test.js +416 -0
  32. package/dist/cluster/membership.test.js.map +1 -0
  33. package/dist/cluster/partition-safety.test.d.ts +2 -0
  34. package/dist/cluster/partition-safety.test.d.ts.map +1 -0
  35. package/dist/cluster/partition-safety.test.js +440 -0
  36. package/dist/cluster/partition-safety.test.js.map +1 -0
  37. package/dist/cluster/raft-state-machine.d.ts +33 -1
  38. package/dist/cluster/raft-state-machine.d.ts.map +1 -1
  39. package/dist/cluster/raft-state-machine.js +65 -3
  40. package/dist/cluster/raft-state-machine.js.map +1 -1
  41. package/dist/cluster/raft-store.d.ts +26 -1
  42. package/dist/cluster/raft-store.d.ts.map +1 -1
  43. package/dist/cluster/raft-store.js +137 -0
  44. package/dist/cluster/raft-store.js.map +1 -1
  45. package/dist/cluster/replication-lag.test.d.ts +2 -0
  46. package/dist/cluster/replication-lag.test.d.ts.map +1 -0
  47. package/dist/cluster/replication-lag.test.js +239 -0
  48. package/dist/cluster/replication-lag.test.js.map +1 -0
  49. package/dist/cluster/replication.d.ts +2 -2
  50. package/dist/cluster/replication.d.ts.map +1 -1
  51. package/dist/cluster/replication.js +1 -1
  52. package/dist/cluster/replication.js.map +1 -1
  53. package/dist/cluster/runtime.d.ts +78 -0
  54. package/dist/cluster/runtime.d.ts.map +1 -1
  55. package/dist/cluster/runtime.js +400 -13
  56. package/dist/cluster/runtime.js.map +1 -1
  57. package/dist/cluster/state-recovery.test.d.ts +2 -0
  58. package/dist/cluster/state-recovery.test.d.ts.map +1 -0
  59. package/dist/cluster/state-recovery.test.js +310 -0
  60. package/dist/cluster/state-recovery.test.js.map +1 -0
  61. package/dist/cluster/types.d.ts +30 -0
  62. package/dist/cluster/types.d.ts.map +1 -1
  63. package/dist/config/schema.d.ts +48 -0
  64. package/dist/config/schema.d.ts.map +1 -1
  65. package/dist/config/schema.js +11 -0
  66. package/dist/config/schema.js.map +1 -1
  67. package/dist/context-files/generator.js +1 -1
  68. package/dist/context-files/generator.js.map +1 -1
  69. package/dist/context-files/generator.test.js +51 -0
  70. package/dist/context-files/generator.test.js.map +1 -1
  71. package/dist/orchestrator/orphan-recovery.d.ts +1 -1
  72. package/dist/orchestrator/orphan-recovery.d.ts.map +1 -1
  73. package/dist/orchestrator/orphan-recovery.js +4 -4
  74. package/dist/orchestrator/orphan-recovery.js.map +1 -1
  75. package/dist/orchestrator/prompt-templates.d.ts +3 -1
  76. package/dist/orchestrator/prompt-templates.d.ts.map +1 -1
  77. package/dist/orchestrator/prompt-templates.js +45 -8
  78. package/dist/orchestrator/prompt-templates.js.map +1 -1
  79. package/dist/orchestrator/prompt-templates.test.js +210 -0
  80. package/dist/orchestrator/prompt-templates.test.js.map +1 -1
  81. package/dist/orchestrator/scheduler.d.ts +1 -0
  82. package/dist/orchestrator/scheduler.d.ts.map +1 -1
  83. package/dist/orchestrator/scheduler.js +15 -10
  84. package/dist/orchestrator/scheduler.js.map +1 -1
  85. package/dist/orchestrator/scheduler.test.js +97 -6
  86. package/dist/orchestrator/scheduler.test.js.map +1 -1
  87. package/package.json +1 -1
  88. package/src/cli/commands/cluster.test.ts +387 -9
  89. package/src/cli/commands/cluster.ts +486 -1
  90. package/src/cli/commands/req-spawn.test.ts +153 -0
  91. package/src/cli/commands/req.ts +31 -18
  92. package/src/cluster/cluster-http-server.ts +80 -0
  93. package/src/cluster/distributed-runtime-coverage.test.ts +9 -0
  94. package/src/cluster/distributed-system.test.ts +168 -0
  95. package/src/cluster/events.ts +90 -0
  96. package/src/cluster/heartbeat-manager.ts +48 -6
  97. package/src/cluster/membership.test.ts +498 -0
  98. package/src/cluster/partition-safety.test.ts +523 -0
  99. package/src/cluster/raft-state-machine.ts +76 -4
  100. package/src/cluster/raft-store.ts +167 -1
  101. package/src/cluster/replication-lag.test.ts +284 -0
  102. package/src/cluster/replication.ts +6 -0
  103. package/src/cluster/runtime.ts +551 -12
  104. package/src/cluster/state-recovery.test.ts +420 -0
  105. package/src/cluster/types.ts +32 -0
  106. package/src/config/schema.ts +11 -0
  107. package/src/context-files/generator.test.ts +55 -0
  108. package/src/context-files/generator.ts +5 -5
  109. package/src/orchestrator/orphan-recovery.ts +32 -13
  110. package/src/orchestrator/prompt-templates.test.ts +263 -0
  111. package/src/orchestrator/prompt-templates.ts +49 -8
  112. package/src/orchestrator/scheduler.test.ts +129 -6
  113. package/src/orchestrator/scheduler.ts +46 -20
@@ -3,19 +3,32 @@
3
3
  import { join } from 'path';
4
4
  import type { Database } from 'sql.js';
5
5
  import type { ClusterConfig, ClusterPeerConfig } from '../config/schema.js';
6
- import { ClusterHttpServer } from './cluster-http-server.js';
6
+ import { queryAll } from '../db/client.js';
7
+ import { REPLICATED_TABLES } from './adapters.js';
8
+ import {
9
+ ClusterHttpServer,
10
+ type MembershipJoinRequest,
11
+ type MembershipJoinResponse,
12
+ type MembershipLeaveRequest,
13
+ type MembershipLeaveResponse,
14
+ } from './cluster-http-server.js';
7
15
  import { HeartbeatManager } from './heartbeat-manager.js';
8
16
  import { RaftStateMachine } from './raft-state-machine.js';
9
17
  import {
10
18
  applyRemoteEvents,
11
19
  ensureClusterTables,
12
20
  getAllClusterEvents,
21
+ getClusterEventCount,
22
+ getEffectiveVersionVector,
13
23
  getVersionVector,
14
24
  mergeSimilarStories,
25
+ pruneClusterEvents,
15
26
  scanLocalChanges,
27
+ setSnapshotVersionVector,
16
28
  type ClusterEvent,
17
29
  type VersionVector,
18
30
  } from './replication.js';
31
+ import type { ClusterSnapshot } from './types.js';
19
32
 
20
33
  type NodeRole = 'leader' | 'follower' | 'candidate';
21
34
 
@@ -31,6 +44,7 @@ interface ClusterStatusFetchOptions {
31
44
  interface DeltaResponse {
32
45
  events: ClusterEvent[];
33
46
  version_vector: VersionVector;
47
+ fencing_token?: number;
34
48
  }
35
49
 
36
50
  export interface ClusterStatus {
@@ -42,10 +56,15 @@ export interface ClusterStatus {
42
56
  is_leader: boolean;
43
57
  leader_id: string | null;
44
58
  leader_url: string | null;
59
+ fencing_token: number;
60
+ leader_lease_valid: boolean;
61
+ leader_lease_duration_ms: number;
45
62
  raft_commit_index: number;
46
63
  raft_last_applied: number;
47
64
  raft_last_log_index: number;
48
65
  peers: Array<{ id: string; url: string }>;
66
+ /** True while the node is performing snapshot-based catch-up and not yet election-eligible. */
67
+ is_catching_up: boolean;
49
68
  }
50
69
 
51
70
  export interface ClusterSyncResult {
@@ -53,6 +72,32 @@ export interface ClusterSyncResult {
53
72
  imported_events_applied: number;
54
73
  merged_duplicate_stories: number;
55
74
  durable_log_entries_appended: number;
75
+ log_entries_compacted: number;
76
+ cluster_events_pruned: number;
77
+ /** True when this sync triggered snapshot-based recovery rather than delta sync. */
78
+ used_snapshot_recovery: boolean;
79
+ /** Number of rows applied from the snapshot (0 when delta sync was used). */
80
+ catch_up_applied: number;
81
+ /** Total rows in the snapshot (0 when delta sync was used). */
82
+ catch_up_total: number;
83
+ }
84
+
85
+ export interface PeerReplicationLag {
86
+ peer_id: string;
87
+ peer_url: string;
88
+ reachable: boolean;
89
+ events_behind: number;
90
+ last_sync_at: string | null;
91
+ last_sync_duration_ms: number | null;
92
+ last_sync_events_applied: number;
93
+ }
94
+
95
+ export interface ReplicationLagSummary {
96
+ node_id: string;
97
+ total_local_events: number;
98
+ version_vector: VersionVector;
99
+ peers: PeerReplicationLag[];
100
+ last_sync_at: string | null;
56
101
  }
57
102
 
58
103
  export class ClusterRuntime {
@@ -61,6 +106,12 @@ export class ClusterRuntime {
61
106
 
62
107
  private eventCache: ClusterEvent[] = [];
63
108
  private versionVectorCache: VersionVector = {};
109
+ private lastCompactionAt = 0;
110
+ private peerLagMap = new Map<string, PeerReplicationLag>();
111
+ private lastSyncAt: string | null = null;
112
+
113
+ /** Cached full snapshot refreshed on every sync, served to recovering nodes. */
114
+ private cachedSnapshot: ClusterSnapshot | null = null;
64
115
 
65
116
  private readonly raft: RaftStateMachine;
66
117
  private readonly heartbeat: HeartbeatManager;
@@ -81,6 +132,10 @@ export class ClusterRuntime {
81
132
  postJson: (peer, path, body) => this.postJson(peer, path, body),
82
133
  isActive: () => this.started && !this.stopping,
83
134
  handleBackgroundError: error => this.handleBackgroundError(error),
135
+ onPeersUpdated: peers => {
136
+ // Follower received updated peer list from leader via heartbeat
137
+ this.raft.setPeers(peers);
138
+ },
84
139
  });
85
140
 
86
141
  this.httpServer = new ClusterHttpServer(config, {
@@ -89,6 +144,13 @@ export class ClusterRuntime {
89
144
  handleHeartbeat: body => this.heartbeat.handleHeartbeat(body),
90
145
  getDeltaFromCache: (vector, limit) => this.getDeltaFromCache(vector, limit),
91
146
  getVersionVectorCache: () => this.versionVectorCache,
147
+ getReplicationLag: () => this.getReplicationLag(),
148
+ getFencingToken: () => this.raft.getFencingToken(),
149
+ validateFencingToken: token => this.raft.validateFencingToken(token),
150
+ isLeaderLeaseValid: () => this.raft.isLeaderLeaseValid(),
151
+ handleMembershipJoin: body => this.handleMembershipJoin(body),
152
+ handleMembershipLeave: body => this.handleMembershipLeave(body),
153
+ getSnapshot: () => this.cachedSnapshot ?? { version_vector: {}, tables: {} },
92
154
  });
93
155
  }
94
156
 
@@ -138,6 +200,30 @@ export class ClusterRuntime {
138
200
  return this.raft.role === 'leader';
139
201
  }
140
202
 
203
+ getReplicationLag(): ReplicationLagSummary {
204
+ return {
205
+ node_id: this.config.node_id,
206
+ total_local_events: this.eventCache.length,
207
+ version_vector: { ...this.versionVectorCache },
208
+ peers: this.raft
209
+ .getPeers()
210
+ .filter(p => p.id !== this.config.node_id)
211
+ .map(
212
+ p =>
213
+ this.peerLagMap.get(p.id) || {
214
+ peer_id: p.id,
215
+ peer_url: p.url,
216
+ reachable: false,
217
+ events_behind: 0,
218
+ last_sync_at: null,
219
+ last_sync_duration_ms: null,
220
+ last_sync_events_applied: 0,
221
+ }
222
+ ),
223
+ last_sync_at: this.lastSyncAt,
224
+ };
225
+ }
226
+
141
227
  getStatus(): ClusterStatus {
142
228
  const raftState = this.raft.getRaftStoreState();
143
229
 
@@ -150,10 +236,14 @@ export class ClusterRuntime {
150
236
  is_leader: this.isLeader(),
151
237
  leader_id: this.raft.leaderId,
152
238
  leader_url: this.raft.getLeaderUrl(),
239
+ fencing_token: this.raft.getFencingToken(),
240
+ leader_lease_valid: this.raft.isLeaderLeaseValid(),
241
+ leader_lease_duration_ms: this.raft.leaderLeaseDurationMs,
153
242
  raft_commit_index: raftState?.commit_index || 0,
154
243
  raft_last_applied: raftState?.last_applied || 0,
155
244
  raft_last_log_index: raftState?.last_log_index || 0,
156
- peers: this.config.peers.map(peer => ({ id: peer.id, url: peer.url })),
245
+ peers: this.raft.getPeers().map(peer => ({ id: peer.id, url: peer.url })),
246
+ is_catching_up: this.raft.isCatchingUp,
157
247
  };
158
248
  }
159
249
 
@@ -164,6 +254,11 @@ export class ClusterRuntime {
164
254
  imported_events_applied: 0,
165
255
  merged_duplicate_stories: 0,
166
256
  durable_log_entries_appended: 0,
257
+ log_entries_compacted: 0,
258
+ cluster_events_pruned: 0,
259
+ used_snapshot_recovery: false,
260
+ catch_up_applied: 0,
261
+ catch_up_total: 0,
167
262
  };
168
263
  }
169
264
 
@@ -172,11 +267,15 @@ export class ClusterRuntime {
172
267
 
173
268
  ensureClusterTables(db, this.config.node_id);
174
269
 
270
+ // Refresh snapshot cache so the HTTP endpoint always serves current data
271
+ this.cachedSnapshot = this.buildSnapshot(db);
272
+
175
273
  const localEventsBefore = scanLocalChanges(db, this.config.node_id);
176
- const imported = await this.pullEventsFromPeers(db);
274
+ const { imported, usedSnapshot, catchUpApplied, catchUpTotal } =
275
+ await this.pullEventsFromPeers(db);
177
276
  const merged = mergeSimilarStories(db, this.config.story_similarity_threshold);
178
277
  const localEventsAfter =
179
- imported > 0 || merged > 0 ? scanLocalChanges(db, this.config.node_id) : 0;
278
+ imported > 0 || merged > 0 || usedSnapshot ? scanLocalChanges(db, this.config.node_id) : 0;
180
279
 
181
280
  this.refreshCache(db);
182
281
 
@@ -184,35 +283,376 @@ export class ClusterRuntime {
184
283
  getAllClusterEvents(db)
185
284
  );
186
285
 
286
+ // Run compaction if thresholds are met and enough time has elapsed
287
+ const { logCompacted, eventsPruned } = this.maybeCompact(db);
288
+
187
289
  return {
188
290
  local_events_emitted: localEventsBefore + localEventsAfter,
189
291
  imported_events_applied: imported,
190
292
  merged_duplicate_stories: merged,
191
293
  durable_log_entries_appended: durableLogEntriesAppended,
294
+ log_entries_compacted: logCompacted,
295
+ cluster_events_pruned: eventsPruned,
296
+ used_snapshot_recovery: usedSnapshot,
297
+ catch_up_applied: catchUpApplied,
298
+ catch_up_total: catchUpTotal,
299
+ };
300
+ }
301
+
302
+ handleMembershipJoin(request: MembershipJoinRequest): MembershipJoinResponse {
303
+ const peers = this.raft.getPeers();
304
+ const leaderUrl = this.raft.getLeaderUrl();
305
+
306
+ // If not the leader, redirect to leader
307
+ if (this.raft.role !== 'leader') {
308
+ return {
309
+ success: false,
310
+ leader_id: this.raft.leaderId,
311
+ leader_url: leaderUrl,
312
+ peers: peers.map(p => ({ id: p.id, url: p.url })),
313
+ term: this.raft.currentTerm,
314
+ };
315
+ }
316
+
317
+ // Check if peer already exists
318
+ const existing = peers.find(p => p.id === request.node_id);
319
+ if (existing) {
320
+ // Update URL if changed
321
+ if (existing.url !== request.url) {
322
+ const updated = peers.map(p =>
323
+ p.id === request.node_id ? { id: p.id, url: request.url } : p
324
+ );
325
+ this.raft.setPeers(updated);
326
+ this.raft.appendDurableEntry('membership_change', {
327
+ action: 'update',
328
+ node_id: request.node_id,
329
+ url: request.url,
330
+ peer_count: updated.length,
331
+ });
332
+ }
333
+ return {
334
+ success: true,
335
+ leader_id: this.raft.leaderId,
336
+ leader_url: this.config.public_url,
337
+ peers: this.raft.getPeers().map(p => ({ id: p.id, url: p.url })),
338
+ term: this.raft.currentTerm,
339
+ };
340
+ }
341
+
342
+ // Add new peer
343
+ const newPeer: ClusterPeerConfig = { id: request.node_id, url: request.url };
344
+ const updated = [...peers, newPeer];
345
+ this.raft.setPeers(updated);
346
+
347
+ this.raft.appendDurableEntry('membership_change', {
348
+ action: 'join',
349
+ node_id: request.node_id,
350
+ url: request.url,
351
+ peer_count: updated.length,
352
+ });
353
+
354
+ return {
355
+ success: true,
356
+ leader_id: this.raft.leaderId,
357
+ leader_url: this.config.public_url,
358
+ peers: updated.map(p => ({ id: p.id, url: p.url })),
359
+ term: this.raft.currentTerm,
360
+ };
361
+ }
362
+
363
+ handleMembershipLeave(request: MembershipLeaveRequest): MembershipLeaveResponse {
364
+ const peers = this.raft.getPeers();
365
+
366
+ // If not the leader, cannot process leave
367
+ if (this.raft.role !== 'leader') {
368
+ return {
369
+ success: false,
370
+ peers: peers.map(p => ({ id: p.id, url: p.url })),
371
+ };
372
+ }
373
+
374
+ // Cannot remove self (leader) — leader must transfer leadership first
375
+ if (request.node_id === this.config.node_id) {
376
+ return {
377
+ success: false,
378
+ peers: peers.map(p => ({ id: p.id, url: p.url })),
379
+ };
380
+ }
381
+
382
+ const existing = peers.find(p => p.id === request.node_id);
383
+ if (!existing) {
384
+ // Already gone
385
+ return {
386
+ success: true,
387
+ peers: peers.map(p => ({ id: p.id, url: p.url })),
388
+ };
389
+ }
390
+
391
+ const updated = peers.filter(p => p.id !== request.node_id);
392
+ this.raft.setPeers(updated);
393
+
394
+ this.raft.appendDurableEntry('membership_change', {
395
+ action: 'leave',
396
+ node_id: request.node_id,
397
+ peer_count: updated.length,
398
+ });
399
+
400
+ return {
401
+ success: true,
402
+ peers: updated.map(p => ({ id: p.id, url: p.url })),
192
403
  };
193
404
  }
194
405
 
406
+ private maybeCompact(db: Database): { logCompacted: number; eventsPruned: number } {
407
+ const now = Date.now();
408
+ const interval = this.config.compaction_interval_ms ?? 300000;
409
+
410
+ // Respect minimum interval between compaction runs
411
+ if (interval > 0 && now - this.lastCompactionAt < interval) {
412
+ return { logCompacted: 0, eventsPruned: 0 };
413
+ }
414
+
415
+ let logCompacted = 0;
416
+ let eventsPruned = 0;
417
+
418
+ // Compact raft log if threshold exceeded
419
+ const maxLogEntries = this.config.max_log_entries ?? 10000;
420
+ if (maxLogEntries > 0) {
421
+ const logCount = this.raft.getLogEntryCount();
422
+ if (logCount > maxLogEntries) {
423
+ const versionVector = getVersionVector(db);
424
+ const result = this.raft.createSnapshotAndCompact(versionVector);
425
+ logCompacted = result.entries_removed;
426
+ }
427
+ }
428
+
429
+ // Prune cluster_events if threshold exceeded
430
+ const maxEvents = this.config.max_cluster_events ?? 50000;
431
+ if (maxEvents > 0) {
432
+ const eventCount = getClusterEventCount(db);
433
+ if (eventCount > maxEvents) {
434
+ eventsPruned = pruneClusterEvents(db, maxEvents);
435
+ if (eventsPruned > 0) {
436
+ this.refreshCache(db);
437
+ }
438
+ }
439
+ }
440
+
441
+ if (logCompacted > 0 || eventsPruned > 0) {
442
+ this.lastCompactionAt = now;
443
+ }
444
+
445
+ return { logCompacted, eventsPruned };
446
+ }
447
+
195
448
  private refreshCache(db: Database): void {
196
449
  this.eventCache = getAllClusterEvents(db).slice(-20000);
197
450
  this.versionVectorCache = getVersionVector(db);
198
451
  }
199
452
 
200
- private async pullEventsFromPeers(db: Database): Promise<number> {
201
- if (this.config.peers.length === 0) return 0;
453
+ private async pullEventsFromPeers(db: Database): Promise<{
454
+ imported: number;
455
+ usedSnapshot: boolean;
456
+ catchUpApplied: number;
457
+ catchUpTotal: number;
458
+ }> {
459
+ const peers = this.raft.getPeers();
460
+ if (peers.length === 0) {
461
+ return { imported: 0, usedSnapshot: false, catchUpApplied: 0, catchUpTotal: 0 };
462
+ }
202
463
 
203
- let applied = 0;
464
+ let imported = 0;
465
+ const syncTimestamp = new Date().toISOString();
466
+ this.lastSyncAt = syncTimestamp;
204
467
 
205
- for (const peer of this.config.peers) {
468
+ for (const peer of peers) {
206
469
  if (peer.id === this.config.node_id) continue;
207
470
 
208
- const localVector = getVersionVector(db);
471
+ const localVector = getEffectiveVersionVector(db);
472
+ const syncStart = Date.now();
209
473
  const response = await this.requestDelta(peer, localVector, 4000);
210
- if (!response || response.events.length === 0) continue;
211
474
 
212
- applied += applyRemoteEvents(db, this.config.node_id, response.events);
475
+ if (!response) {
476
+ this.peerLagMap.set(peer.id, {
477
+ peer_id: peer.id,
478
+ peer_url: peer.url,
479
+ reachable: false,
480
+ events_behind: 0,
481
+ last_sync_at: syncTimestamp,
482
+ last_sync_duration_ms: Date.now() - syncStart,
483
+ last_sync_events_applied: 0,
484
+ });
485
+ continue;
486
+ }
487
+
488
+ // If the peer advertises a higher fencing token, step down
489
+ if (
490
+ typeof response.fencing_token === 'number' &&
491
+ response.fencing_token > this.raft.currentTerm
492
+ ) {
493
+ this.raft.stepDown(response.fencing_token, null);
494
+ }
495
+
496
+ // Detect if the delta is insufficient (peer's log was truncated past our position)
497
+ if (this.isDeltaInsufficient(localVector, response.version_vector, response.events)) {
498
+ const recovery = await this.recoverFromSnapshot(db, peer);
499
+ if (recovery !== null) {
500
+ this.peerLagMap.set(peer.id, {
501
+ peer_id: peer.id,
502
+ peer_url: peer.url,
503
+ reachable: true,
504
+ events_behind: 0,
505
+ last_sync_at: syncTimestamp,
506
+ last_sync_duration_ms: Date.now() - syncStart,
507
+ last_sync_events_applied: recovery.applied,
508
+ });
509
+ return {
510
+ imported: 0,
511
+ usedSnapshot: true,
512
+ catchUpApplied: recovery.applied,
513
+ catchUpTotal: recovery.total,
514
+ };
515
+ }
516
+ // Snapshot recovery failed — fall through and apply whatever delta we have
517
+ }
518
+
519
+ const eventsBehind = response.events.length;
520
+ const peerApplied =
521
+ eventsBehind > 0 ? applyRemoteEvents(db, this.config.node_id, response.events) : 0;
522
+ imported += peerApplied;
523
+
524
+ this.peerLagMap.set(peer.id, {
525
+ peer_id: peer.id,
526
+ peer_url: peer.url,
527
+ reachable: true,
528
+ events_behind: eventsBehind,
529
+ last_sync_at: syncTimestamp,
530
+ last_sync_duration_ms: Date.now() - syncStart,
531
+ last_sync_events_applied: peerApplied,
532
+ });
213
533
  }
214
534
 
215
- return applied;
535
+ // If we had been catching up and now the effective vector matches peers, mark done
536
+ if (this.raft.isCatchingUp) {
537
+ this.raft.isCatchingUp = false;
538
+ }
539
+
540
+ return { imported, usedSnapshot: false, catchUpApplied: 0, catchUpTotal: 0 };
541
+ }
542
+
543
+ /**
544
+ * Returns true when the delta response is missing events the peer should have.
545
+ * This happens when the peer's event cache has been truncated (log compaction)
546
+ * and can no longer provide all events since our last known version.
547
+ */
548
+ private isDeltaInsufficient(
549
+ localVector: VersionVector,
550
+ peerVector: VersionVector,
551
+ receivedEvents: ClusterEvent[]
552
+ ): boolean {
553
+ // Count how many events we actually received per actor
554
+ const received: Record<string, number> = {};
555
+ for (const event of receivedEvents) {
556
+ received[event.version.actor_id] = (received[event.version.actor_id] ?? 0) + 1;
557
+ }
558
+
559
+ for (const [actorId, peerCounter] of Object.entries(peerVector)) {
560
+ const localCounter = localVector[actorId] ?? 0;
561
+ const needed = peerCounter - localCounter;
562
+ if (needed <= 0) continue;
563
+
564
+ const receivedCount = received[actorId] ?? 0;
565
+ if (receivedCount < needed) {
566
+ // We're missing events for this actor that the peer should have
567
+ return true;
568
+ }
569
+ }
570
+
571
+ return false;
572
+ }
573
+
574
+ /**
575
+ * Requests a full snapshot from the given peer and applies it locally.
576
+ * Marks the node as no longer catching up once complete.
577
+ * Returns { applied, total } on success, null on failure.
578
+ */
579
+ private async recoverFromSnapshot(
580
+ db: Database,
581
+ peer: ClusterPeerConfig
582
+ ): Promise<{ applied: number; total: number } | null> {
583
+ this.raft.isCatchingUp = true;
584
+ this.raft.appendDurableEntry('runtime', {
585
+ event: 'snapshot_recovery_start',
586
+ node_id: this.config.node_id,
587
+ peer_id: peer.id,
588
+ });
589
+
590
+ const snapshot = await this.requestSnapshot(peer);
591
+ if (!snapshot) {
592
+ return null;
593
+ }
594
+
595
+ const { applied, total } = this.applySnapshot(db, snapshot);
596
+
597
+ this.raft.isCatchingUp = false;
598
+ this.raft.appendDurableEntry('runtime', {
599
+ event: 'snapshot_recovery_complete',
600
+ node_id: this.config.node_id,
601
+ peer_id: peer.id,
602
+ rows_applied: applied,
603
+ rows_total: total,
604
+ });
605
+
606
+ return { applied, total };
607
+ }
608
+
609
+ /**
610
+ * Applies a snapshot to the local database, upserting all rows from all tables.
611
+ * Stores the snapshot's version vector so future delta requests start from here.
612
+ */
613
+ private applySnapshot(
614
+ db: Database,
615
+ snapshot: ClusterSnapshot
616
+ ): { applied: number; total: number } {
617
+ let applied = 0;
618
+ let total = 0;
619
+
620
+ for (const adapter of REPLICATED_TABLES) {
621
+ const rows = snapshot.tables[adapter.table];
622
+ if (!rows) continue;
623
+ total += rows.length;
624
+ for (const row of rows) {
625
+ adapter.upsert(db, row.payload);
626
+ applied++;
627
+ }
628
+ }
629
+
630
+ // Record the snapshot version vector so future delta requests
631
+ // only ask for events newer than this snapshot
632
+ setSnapshotVersionVector(db, snapshot.version_vector);
633
+
634
+ return { applied, total };
635
+ }
636
+
637
+ /**
638
+ * Builds a full snapshot of all replicated tables from the current db state.
639
+ * Called during sync to keep cachedSnapshot fresh for the HTTP endpoint.
640
+ */
641
+ private buildSnapshot(db: Database): ClusterSnapshot {
642
+ const tables: ClusterSnapshot['tables'] = {};
643
+
644
+ for (const adapter of REPLICATED_TABLES) {
645
+ const rows = queryAll<Record<string, unknown>>(db, adapter.selectSql);
646
+ tables[adapter.table] = rows.map(row => ({
647
+ rowId: adapter.rowId(row),
648
+ payload: adapter.payload(row),
649
+ }));
650
+ }
651
+
652
+ return {
653
+ version_vector: getVersionVector(db),
654
+ tables,
655
+ };
216
656
  }
217
657
 
218
658
  private async requestDelta(
@@ -223,9 +663,14 @@ export class ClusterRuntime {
223
663
  return this.postJson<DeltaResponse>(peer, '/cluster/v1/events/delta', {
224
664
  version_vector: versionVector,
225
665
  limit,
666
+ fencing_token: this.raft.getFencingToken(),
226
667
  });
227
668
  }
228
669
 
670
+ private async requestSnapshot(peer: ClusterPeerConfig): Promise<ClusterSnapshot | null> {
671
+ return this.getJson<ClusterSnapshot>(peer, '/cluster/v1/snapshot');
672
+ }
673
+
229
674
  private getDeltaFromCache(remoteVersionVector: VersionVector, limit: number): ClusterEvent[] {
230
675
  return this.eventCache
231
676
  .filter(event => {
@@ -254,6 +699,18 @@ export class ClusterRuntime {
254
699
  );
255
700
  }
256
701
 
702
+ private async getJson<T>(peer: ClusterPeerConfig, path: string): Promise<T | null> {
703
+ const normalizedBase = peer.url.endsWith('/') ? peer.url : `${peer.url}/`;
704
+ const url = new URL(path.replace(/^\//, ''), normalizedBase).toString();
705
+
706
+ return fetchClusterStatusOrPostJson<T>(
707
+ url,
708
+ this.config.request_timeout_ms,
709
+ this.config.auth_token,
710
+ { method: 'GET' }
711
+ );
712
+ }
713
+
257
714
  private handleBackgroundError(error: unknown): void {
258
715
  if (!this.started || this.stopping) return;
259
716
  const err = error as NodeJS.ErrnoException;
@@ -271,6 +728,80 @@ export class ClusterRuntime {
271
728
  }
272
729
  }
273
730
 
731
+ export async function fetchReplicationLag(
732
+ config: ClusterConfig
733
+ ): Promise<ReplicationLagSummary | null> {
734
+ if (!config.enabled) return null;
735
+
736
+ const host = config.listen_host === '0.0.0.0' ? '127.0.0.1' : config.listen_host;
737
+ const url = `http://${host}:${config.listen_port}/cluster/v1/replication-lag`;
738
+
739
+ return fetchClusterStatusOrPostJson<ReplicationLagSummary>(
740
+ url,
741
+ config.request_timeout_ms,
742
+ config.auth_token,
743
+ { method: 'GET' }
744
+ );
745
+ }
746
+
747
+ /**
748
+ * Fetches recent cluster events from the local runtime via the delta endpoint.
749
+ * Uses an empty version vector to request recent events up to the given limit.
750
+ */
751
+ export async function fetchLocalClusterEvents(
752
+ config: ClusterConfig,
753
+ limit: number = 50
754
+ ): Promise<ClusterEvent[] | null> {
755
+ if (!config.enabled) return null;
756
+
757
+ const host = config.listen_host === '0.0.0.0' ? '127.0.0.1' : config.listen_host;
758
+ const url = `http://${host}:${config.listen_port}/cluster/v1/events/delta`;
759
+
760
+ const response = await fetchClusterStatusOrPostJson<{ events: ClusterEvent[] }>(
761
+ url,
762
+ config.request_timeout_ms,
763
+ config.auth_token,
764
+ { method: 'POST', body: { version_vector: {}, limit } }
765
+ );
766
+
767
+ return response?.events ?? null;
768
+ }
769
+
770
+ /**
771
+ * POSTs to the local cluster runtime at the given path.
772
+ */
773
+ export async function postToLocalCluster<T>(
774
+ config: ClusterConfig,
775
+ path: string,
776
+ body: unknown
777
+ ): Promise<T | null> {
778
+ if (!config.enabled) return null;
779
+
780
+ const host = config.listen_host === '0.0.0.0' ? '127.0.0.1' : config.listen_host;
781
+ const url = `http://${host}:${config.listen_port}${path}`;
782
+
783
+ return fetchClusterStatusOrPostJson<T>(url, config.request_timeout_ms, config.auth_token, {
784
+ method: 'POST',
785
+ body,
786
+ });
787
+ }
788
+
789
+ /**
790
+ * POSTs to a peer cluster node at the given URL and path.
791
+ */
792
+ export async function postToPeerCluster<T>(
793
+ peerUrl: string,
794
+ path: string,
795
+ body: unknown,
796
+ options: ClusterStatusFetchOptions
797
+ ): Promise<T | null> {
798
+ const url = `${peerUrl.replace(/\/$/, '')}${path}`;
799
+ return fetchClusterStatusOrPostJson<T>(url, options.timeoutMs, options.authToken, {
800
+ method: 'POST',
801
+ body,
802
+ });
803
+ }
804
+
274
805
  export async function fetchLocalClusterStatus(
275
806
  config: ClusterConfig
276
807
  ): Promise<ClusterStatus | null> {
@@ -284,10 +815,14 @@ export async function fetchLocalClusterStatus(
284
815
  is_leader: true,
285
816
  leader_id: config.node_id,
286
817
  leader_url: null,
818
+ fencing_token: 0,
819
+ leader_lease_valid: true,
820
+ leader_lease_duration_ms: config.leader_lease_ms ?? config.heartbeat_interval_ms * 3,
287
821
  raft_commit_index: 0,
288
822
  raft_last_applied: 0,
289
823
  raft_last_log_index: 0,
290
824
  peers: config.peers.map(peer => ({ id: peer.id, url: peer.url })),
825
+ is_catching_up: false,
291
826
  };
292
827
  }
293
828
 
@@ -345,10 +880,14 @@ function parseClusterStatus(input: Record<string, unknown>): ClusterStatus {
345
880
  is_leader: input.is_leader === true,
346
881
  leader_id: typeof input.leader_id === 'string' ? input.leader_id : null,
347
882
  leader_url: typeof input.leader_url === 'string' ? input.leader_url : null,
883
+ fencing_token: toInt(input.fencing_token),
884
+ leader_lease_valid: input.leader_lease_valid === true,
885
+ leader_lease_duration_ms: toInt(input.leader_lease_duration_ms),
348
886
  raft_commit_index: toInt(input.raft_commit_index),
349
887
  raft_last_applied: toInt(input.raft_last_applied),
350
888
  raft_last_log_index: toInt(input.raft_last_log_index),
351
889
  peers,
890
+ is_catching_up: input.is_catching_up === true,
352
891
  };
353
892
  }
354
893