hungry-ghost-hive 0.45.0 → 0.46.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/commands/cluster.d.ts.map +1 -1
- package/dist/cli/commands/cluster.js +348 -1
- package/dist/cli/commands/cluster.js.map +1 -1
- package/dist/cli/commands/cluster.test.js +313 -9
- package/dist/cli/commands/cluster.test.js.map +1 -1
- package/dist/cli/commands/req-spawn.test.d.ts +2 -0
- package/dist/cli/commands/req-spawn.test.d.ts.map +1 -0
- package/dist/cli/commands/req-spawn.test.js +116 -0
- package/dist/cli/commands/req-spawn.test.js.map +1 -0
- package/dist/cli/commands/req.d.ts.map +1 -1
- package/dist/cli/commands/req.js +21 -13
- package/dist/cli/commands/req.js.map +1 -1
- package/dist/cluster/cluster-http-server.d.ts +32 -0
- package/dist/cluster/cluster-http-server.d.ts.map +1 -1
- package/dist/cluster/cluster-http-server.js +42 -0
- package/dist/cluster/cluster-http-server.js.map +1 -1
- package/dist/cluster/distributed-runtime-coverage.test.js +9 -0
- package/dist/cluster/distributed-runtime-coverage.test.js.map +1 -1
- package/dist/cluster/distributed-system.test.js +135 -0
- package/dist/cluster/distributed-system.test.js.map +1 -1
- package/dist/cluster/events.d.ts +23 -0
- package/dist/cluster/events.d.ts.map +1 -1
- package/dist/cluster/events.js +74 -0
- package/dist/cluster/events.js.map +1 -1
- package/dist/cluster/heartbeat-manager.d.ts +2 -0
- package/dist/cluster/heartbeat-manager.d.ts.map +1 -1
- package/dist/cluster/heartbeat-manager.js +42 -6
- package/dist/cluster/heartbeat-manager.js.map +1 -1
- package/dist/cluster/membership.test.d.ts +2 -0
- package/dist/cluster/membership.test.d.ts.map +1 -0
- package/dist/cluster/membership.test.js +416 -0
- package/dist/cluster/membership.test.js.map +1 -0
- package/dist/cluster/partition-safety.test.d.ts +2 -0
- package/dist/cluster/partition-safety.test.d.ts.map +1 -0
- package/dist/cluster/partition-safety.test.js +440 -0
- package/dist/cluster/partition-safety.test.js.map +1 -0
- package/dist/cluster/raft-state-machine.d.ts +33 -1
- package/dist/cluster/raft-state-machine.d.ts.map +1 -1
- package/dist/cluster/raft-state-machine.js +65 -3
- package/dist/cluster/raft-state-machine.js.map +1 -1
- package/dist/cluster/raft-store.d.ts +26 -1
- package/dist/cluster/raft-store.d.ts.map +1 -1
- package/dist/cluster/raft-store.js +137 -0
- package/dist/cluster/raft-store.js.map +1 -1
- package/dist/cluster/replication-lag.test.d.ts +2 -0
- package/dist/cluster/replication-lag.test.d.ts.map +1 -0
- package/dist/cluster/replication-lag.test.js +239 -0
- package/dist/cluster/replication-lag.test.js.map +1 -0
- package/dist/cluster/replication.d.ts +2 -2
- package/dist/cluster/replication.d.ts.map +1 -1
- package/dist/cluster/replication.js +1 -1
- package/dist/cluster/replication.js.map +1 -1
- package/dist/cluster/runtime.d.ts +78 -0
- package/dist/cluster/runtime.d.ts.map +1 -1
- package/dist/cluster/runtime.js +400 -13
- package/dist/cluster/runtime.js.map +1 -1
- package/dist/cluster/state-recovery.test.d.ts +2 -0
- package/dist/cluster/state-recovery.test.d.ts.map +1 -0
- package/dist/cluster/state-recovery.test.js +310 -0
- package/dist/cluster/state-recovery.test.js.map +1 -0
- package/dist/cluster/types.d.ts +30 -0
- package/dist/cluster/types.d.ts.map +1 -1
- package/dist/config/schema.d.ts +48 -0
- package/dist/config/schema.d.ts.map +1 -1
- package/dist/config/schema.js +11 -0
- package/dist/config/schema.js.map +1 -1
- package/dist/context-files/generator.js +1 -1
- package/dist/context-files/generator.js.map +1 -1
- package/dist/context-files/generator.test.js +51 -0
- package/dist/context-files/generator.test.js.map +1 -1
- package/dist/orchestrator/orphan-recovery.d.ts +1 -1
- package/dist/orchestrator/orphan-recovery.d.ts.map +1 -1
- package/dist/orchestrator/orphan-recovery.js +4 -4
- package/dist/orchestrator/orphan-recovery.js.map +1 -1
- package/dist/orchestrator/prompt-templates.d.ts +3 -1
- package/dist/orchestrator/prompt-templates.d.ts.map +1 -1
- package/dist/orchestrator/prompt-templates.js +45 -8
- package/dist/orchestrator/prompt-templates.js.map +1 -1
- package/dist/orchestrator/prompt-templates.test.js +210 -0
- package/dist/orchestrator/prompt-templates.test.js.map +1 -1
- package/dist/orchestrator/scheduler.d.ts +1 -0
- package/dist/orchestrator/scheduler.d.ts.map +1 -1
- package/dist/orchestrator/scheduler.js +15 -10
- package/dist/orchestrator/scheduler.js.map +1 -1
- package/dist/orchestrator/scheduler.test.js +97 -6
- package/dist/orchestrator/scheduler.test.js.map +1 -1
- package/package.json +1 -1
- package/src/cli/commands/cluster.test.ts +387 -9
- package/src/cli/commands/cluster.ts +486 -1
- package/src/cli/commands/req-spawn.test.ts +153 -0
- package/src/cli/commands/req.ts +31 -18
- package/src/cluster/cluster-http-server.ts +80 -0
- package/src/cluster/distributed-runtime-coverage.test.ts +9 -0
- package/src/cluster/distributed-system.test.ts +168 -0
- package/src/cluster/events.ts +90 -0
- package/src/cluster/heartbeat-manager.ts +48 -6
- package/src/cluster/membership.test.ts +498 -0
- package/src/cluster/partition-safety.test.ts +523 -0
- package/src/cluster/raft-state-machine.ts +76 -4
- package/src/cluster/raft-store.ts +167 -1
- package/src/cluster/replication-lag.test.ts +284 -0
- package/src/cluster/replication.ts +6 -0
- package/src/cluster/runtime.ts +551 -12
- package/src/cluster/state-recovery.test.ts +420 -0
- package/src/cluster/types.ts +32 -0
- package/src/config/schema.ts +11 -0
- package/src/context-files/generator.test.ts +55 -0
- package/src/context-files/generator.ts +5 -5
- package/src/orchestrator/orphan-recovery.ts +32 -13
- package/src/orchestrator/prompt-templates.test.ts +263 -0
- package/src/orchestrator/prompt-templates.ts +49 -8
- package/src/orchestrator/scheduler.test.ts +129 -6
- package/src/orchestrator/scheduler.ts +46 -20
package/src/cluster/runtime.ts
CHANGED
|
@@ -3,19 +3,32 @@
|
|
|
3
3
|
import { join } from 'path';
|
|
4
4
|
import type { Database } from 'sql.js';
|
|
5
5
|
import type { ClusterConfig, ClusterPeerConfig } from '../config/schema.js';
|
|
6
|
-
import {
|
|
6
|
+
import { queryAll } from '../db/client.js';
|
|
7
|
+
import { REPLICATED_TABLES } from './adapters.js';
|
|
8
|
+
import {
|
|
9
|
+
ClusterHttpServer,
|
|
10
|
+
type MembershipJoinRequest,
|
|
11
|
+
type MembershipJoinResponse,
|
|
12
|
+
type MembershipLeaveRequest,
|
|
13
|
+
type MembershipLeaveResponse,
|
|
14
|
+
} from './cluster-http-server.js';
|
|
7
15
|
import { HeartbeatManager } from './heartbeat-manager.js';
|
|
8
16
|
import { RaftStateMachine } from './raft-state-machine.js';
|
|
9
17
|
import {
|
|
10
18
|
applyRemoteEvents,
|
|
11
19
|
ensureClusterTables,
|
|
12
20
|
getAllClusterEvents,
|
|
21
|
+
getClusterEventCount,
|
|
22
|
+
getEffectiveVersionVector,
|
|
13
23
|
getVersionVector,
|
|
14
24
|
mergeSimilarStories,
|
|
25
|
+
pruneClusterEvents,
|
|
15
26
|
scanLocalChanges,
|
|
27
|
+
setSnapshotVersionVector,
|
|
16
28
|
type ClusterEvent,
|
|
17
29
|
type VersionVector,
|
|
18
30
|
} from './replication.js';
|
|
31
|
+
import type { ClusterSnapshot } from './types.js';
|
|
19
32
|
|
|
20
33
|
type NodeRole = 'leader' | 'follower' | 'candidate';
|
|
21
34
|
|
|
@@ -31,6 +44,7 @@ interface ClusterStatusFetchOptions {
|
|
|
31
44
|
interface DeltaResponse {
|
|
32
45
|
events: ClusterEvent[];
|
|
33
46
|
version_vector: VersionVector;
|
|
47
|
+
fencing_token?: number;
|
|
34
48
|
}
|
|
35
49
|
|
|
36
50
|
export interface ClusterStatus {
|
|
@@ -42,10 +56,15 @@ export interface ClusterStatus {
|
|
|
42
56
|
is_leader: boolean;
|
|
43
57
|
leader_id: string | null;
|
|
44
58
|
leader_url: string | null;
|
|
59
|
+
fencing_token: number;
|
|
60
|
+
leader_lease_valid: boolean;
|
|
61
|
+
leader_lease_duration_ms: number;
|
|
45
62
|
raft_commit_index: number;
|
|
46
63
|
raft_last_applied: number;
|
|
47
64
|
raft_last_log_index: number;
|
|
48
65
|
peers: Array<{ id: string; url: string }>;
|
|
66
|
+
/** True while the node is performing snapshot-based catch-up and not yet election-eligible. */
|
|
67
|
+
is_catching_up: boolean;
|
|
49
68
|
}
|
|
50
69
|
|
|
51
70
|
export interface ClusterSyncResult {
|
|
@@ -53,6 +72,32 @@ export interface ClusterSyncResult {
|
|
|
53
72
|
imported_events_applied: number;
|
|
54
73
|
merged_duplicate_stories: number;
|
|
55
74
|
durable_log_entries_appended: number;
|
|
75
|
+
log_entries_compacted: number;
|
|
76
|
+
cluster_events_pruned: number;
|
|
77
|
+
/** True when this sync triggered snapshot-based recovery rather than delta sync. */
|
|
78
|
+
used_snapshot_recovery: boolean;
|
|
79
|
+
/** Number of rows applied from the snapshot (0 when delta sync was used). */
|
|
80
|
+
catch_up_applied: number;
|
|
81
|
+
/** Total rows in the snapshot (0 when delta sync was used). */
|
|
82
|
+
catch_up_total: number;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
export interface PeerReplicationLag {
|
|
86
|
+
peer_id: string;
|
|
87
|
+
peer_url: string;
|
|
88
|
+
reachable: boolean;
|
|
89
|
+
events_behind: number;
|
|
90
|
+
last_sync_at: string | null;
|
|
91
|
+
last_sync_duration_ms: number | null;
|
|
92
|
+
last_sync_events_applied: number;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
export interface ReplicationLagSummary {
|
|
96
|
+
node_id: string;
|
|
97
|
+
total_local_events: number;
|
|
98
|
+
version_vector: VersionVector;
|
|
99
|
+
peers: PeerReplicationLag[];
|
|
100
|
+
last_sync_at: string | null;
|
|
56
101
|
}
|
|
57
102
|
|
|
58
103
|
export class ClusterRuntime {
|
|
@@ -61,6 +106,12 @@ export class ClusterRuntime {
|
|
|
61
106
|
|
|
62
107
|
private eventCache: ClusterEvent[] = [];
|
|
63
108
|
private versionVectorCache: VersionVector = {};
|
|
109
|
+
private lastCompactionAt = 0;
|
|
110
|
+
private peerLagMap = new Map<string, PeerReplicationLag>();
|
|
111
|
+
private lastSyncAt: string | null = null;
|
|
112
|
+
|
|
113
|
+
/** Cached full snapshot refreshed on every sync, served to recovering nodes. */
|
|
114
|
+
private cachedSnapshot: ClusterSnapshot | null = null;
|
|
64
115
|
|
|
65
116
|
private readonly raft: RaftStateMachine;
|
|
66
117
|
private readonly heartbeat: HeartbeatManager;
|
|
@@ -81,6 +132,10 @@ export class ClusterRuntime {
|
|
|
81
132
|
postJson: (peer, path, body) => this.postJson(peer, path, body),
|
|
82
133
|
isActive: () => this.started && !this.stopping,
|
|
83
134
|
handleBackgroundError: error => this.handleBackgroundError(error),
|
|
135
|
+
onPeersUpdated: peers => {
|
|
136
|
+
// Follower received updated peer list from leader via heartbeat
|
|
137
|
+
this.raft.setPeers(peers);
|
|
138
|
+
},
|
|
84
139
|
});
|
|
85
140
|
|
|
86
141
|
this.httpServer = new ClusterHttpServer(config, {
|
|
@@ -89,6 +144,13 @@ export class ClusterRuntime {
|
|
|
89
144
|
handleHeartbeat: body => this.heartbeat.handleHeartbeat(body),
|
|
90
145
|
getDeltaFromCache: (vector, limit) => this.getDeltaFromCache(vector, limit),
|
|
91
146
|
getVersionVectorCache: () => this.versionVectorCache,
|
|
147
|
+
getReplicationLag: () => this.getReplicationLag(),
|
|
148
|
+
getFencingToken: () => this.raft.getFencingToken(),
|
|
149
|
+
validateFencingToken: token => this.raft.validateFencingToken(token),
|
|
150
|
+
isLeaderLeaseValid: () => this.raft.isLeaderLeaseValid(),
|
|
151
|
+
handleMembershipJoin: body => this.handleMembershipJoin(body),
|
|
152
|
+
handleMembershipLeave: body => this.handleMembershipLeave(body),
|
|
153
|
+
getSnapshot: () => this.cachedSnapshot ?? { version_vector: {}, tables: {} },
|
|
92
154
|
});
|
|
93
155
|
}
|
|
94
156
|
|
|
@@ -138,6 +200,30 @@ export class ClusterRuntime {
|
|
|
138
200
|
return this.raft.role === 'leader';
|
|
139
201
|
}
|
|
140
202
|
|
|
203
|
+
getReplicationLag(): ReplicationLagSummary {
|
|
204
|
+
return {
|
|
205
|
+
node_id: this.config.node_id,
|
|
206
|
+
total_local_events: this.eventCache.length,
|
|
207
|
+
version_vector: { ...this.versionVectorCache },
|
|
208
|
+
peers: this.raft
|
|
209
|
+
.getPeers()
|
|
210
|
+
.filter(p => p.id !== this.config.node_id)
|
|
211
|
+
.map(
|
|
212
|
+
p =>
|
|
213
|
+
this.peerLagMap.get(p.id) || {
|
|
214
|
+
peer_id: p.id,
|
|
215
|
+
peer_url: p.url,
|
|
216
|
+
reachable: false,
|
|
217
|
+
events_behind: 0,
|
|
218
|
+
last_sync_at: null,
|
|
219
|
+
last_sync_duration_ms: null,
|
|
220
|
+
last_sync_events_applied: 0,
|
|
221
|
+
}
|
|
222
|
+
),
|
|
223
|
+
last_sync_at: this.lastSyncAt,
|
|
224
|
+
};
|
|
225
|
+
}
|
|
226
|
+
|
|
141
227
|
getStatus(): ClusterStatus {
|
|
142
228
|
const raftState = this.raft.getRaftStoreState();
|
|
143
229
|
|
|
@@ -150,10 +236,14 @@ export class ClusterRuntime {
|
|
|
150
236
|
is_leader: this.isLeader(),
|
|
151
237
|
leader_id: this.raft.leaderId,
|
|
152
238
|
leader_url: this.raft.getLeaderUrl(),
|
|
239
|
+
fencing_token: this.raft.getFencingToken(),
|
|
240
|
+
leader_lease_valid: this.raft.isLeaderLeaseValid(),
|
|
241
|
+
leader_lease_duration_ms: this.raft.leaderLeaseDurationMs,
|
|
153
242
|
raft_commit_index: raftState?.commit_index || 0,
|
|
154
243
|
raft_last_applied: raftState?.last_applied || 0,
|
|
155
244
|
raft_last_log_index: raftState?.last_log_index || 0,
|
|
156
|
-
peers: this.
|
|
245
|
+
peers: this.raft.getPeers().map(peer => ({ id: peer.id, url: peer.url })),
|
|
246
|
+
is_catching_up: this.raft.isCatchingUp,
|
|
157
247
|
};
|
|
158
248
|
}
|
|
159
249
|
|
|
@@ -164,6 +254,11 @@ export class ClusterRuntime {
|
|
|
164
254
|
imported_events_applied: 0,
|
|
165
255
|
merged_duplicate_stories: 0,
|
|
166
256
|
durable_log_entries_appended: 0,
|
|
257
|
+
log_entries_compacted: 0,
|
|
258
|
+
cluster_events_pruned: 0,
|
|
259
|
+
used_snapshot_recovery: false,
|
|
260
|
+
catch_up_applied: 0,
|
|
261
|
+
catch_up_total: 0,
|
|
167
262
|
};
|
|
168
263
|
}
|
|
169
264
|
|
|
@@ -172,11 +267,15 @@ export class ClusterRuntime {
|
|
|
172
267
|
|
|
173
268
|
ensureClusterTables(db, this.config.node_id);
|
|
174
269
|
|
|
270
|
+
// Refresh snapshot cache so the HTTP endpoint always serves current data
|
|
271
|
+
this.cachedSnapshot = this.buildSnapshot(db);
|
|
272
|
+
|
|
175
273
|
const localEventsBefore = scanLocalChanges(db, this.config.node_id);
|
|
176
|
-
const imported
|
|
274
|
+
const { imported, usedSnapshot, catchUpApplied, catchUpTotal } =
|
|
275
|
+
await this.pullEventsFromPeers(db);
|
|
177
276
|
const merged = mergeSimilarStories(db, this.config.story_similarity_threshold);
|
|
178
277
|
const localEventsAfter =
|
|
179
|
-
imported > 0 || merged > 0 ? scanLocalChanges(db, this.config.node_id) : 0;
|
|
278
|
+
imported > 0 || merged > 0 || usedSnapshot ? scanLocalChanges(db, this.config.node_id) : 0;
|
|
180
279
|
|
|
181
280
|
this.refreshCache(db);
|
|
182
281
|
|
|
@@ -184,35 +283,376 @@ export class ClusterRuntime {
|
|
|
184
283
|
getAllClusterEvents(db)
|
|
185
284
|
);
|
|
186
285
|
|
|
286
|
+
// Run compaction if thresholds are met and enough time has elapsed
|
|
287
|
+
const { logCompacted, eventsPruned } = this.maybeCompact(db);
|
|
288
|
+
|
|
187
289
|
return {
|
|
188
290
|
local_events_emitted: localEventsBefore + localEventsAfter,
|
|
189
291
|
imported_events_applied: imported,
|
|
190
292
|
merged_duplicate_stories: merged,
|
|
191
293
|
durable_log_entries_appended: durableLogEntriesAppended,
|
|
294
|
+
log_entries_compacted: logCompacted,
|
|
295
|
+
cluster_events_pruned: eventsPruned,
|
|
296
|
+
used_snapshot_recovery: usedSnapshot,
|
|
297
|
+
catch_up_applied: catchUpApplied,
|
|
298
|
+
catch_up_total: catchUpTotal,
|
|
299
|
+
};
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
handleMembershipJoin(request: MembershipJoinRequest): MembershipJoinResponse {
|
|
303
|
+
const peers = this.raft.getPeers();
|
|
304
|
+
const leaderUrl = this.raft.getLeaderUrl();
|
|
305
|
+
|
|
306
|
+
// If not the leader, redirect to leader
|
|
307
|
+
if (this.raft.role !== 'leader') {
|
|
308
|
+
return {
|
|
309
|
+
success: false,
|
|
310
|
+
leader_id: this.raft.leaderId,
|
|
311
|
+
leader_url: leaderUrl,
|
|
312
|
+
peers: peers.map(p => ({ id: p.id, url: p.url })),
|
|
313
|
+
term: this.raft.currentTerm,
|
|
314
|
+
};
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
// Check if peer already exists
|
|
318
|
+
const existing = peers.find(p => p.id === request.node_id);
|
|
319
|
+
if (existing) {
|
|
320
|
+
// Update URL if changed
|
|
321
|
+
if (existing.url !== request.url) {
|
|
322
|
+
const updated = peers.map(p =>
|
|
323
|
+
p.id === request.node_id ? { id: p.id, url: request.url } : p
|
|
324
|
+
);
|
|
325
|
+
this.raft.setPeers(updated);
|
|
326
|
+
this.raft.appendDurableEntry('membership_change', {
|
|
327
|
+
action: 'update',
|
|
328
|
+
node_id: request.node_id,
|
|
329
|
+
url: request.url,
|
|
330
|
+
peer_count: updated.length,
|
|
331
|
+
});
|
|
332
|
+
}
|
|
333
|
+
return {
|
|
334
|
+
success: true,
|
|
335
|
+
leader_id: this.raft.leaderId,
|
|
336
|
+
leader_url: this.config.public_url,
|
|
337
|
+
peers: this.raft.getPeers().map(p => ({ id: p.id, url: p.url })),
|
|
338
|
+
term: this.raft.currentTerm,
|
|
339
|
+
};
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
// Add new peer
|
|
343
|
+
const newPeer: ClusterPeerConfig = { id: request.node_id, url: request.url };
|
|
344
|
+
const updated = [...peers, newPeer];
|
|
345
|
+
this.raft.setPeers(updated);
|
|
346
|
+
|
|
347
|
+
this.raft.appendDurableEntry('membership_change', {
|
|
348
|
+
action: 'join',
|
|
349
|
+
node_id: request.node_id,
|
|
350
|
+
url: request.url,
|
|
351
|
+
peer_count: updated.length,
|
|
352
|
+
});
|
|
353
|
+
|
|
354
|
+
return {
|
|
355
|
+
success: true,
|
|
356
|
+
leader_id: this.raft.leaderId,
|
|
357
|
+
leader_url: this.config.public_url,
|
|
358
|
+
peers: updated.map(p => ({ id: p.id, url: p.url })),
|
|
359
|
+
term: this.raft.currentTerm,
|
|
360
|
+
};
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
handleMembershipLeave(request: MembershipLeaveRequest): MembershipLeaveResponse {
|
|
364
|
+
const peers = this.raft.getPeers();
|
|
365
|
+
|
|
366
|
+
// If not the leader, cannot process leave
|
|
367
|
+
if (this.raft.role !== 'leader') {
|
|
368
|
+
return {
|
|
369
|
+
success: false,
|
|
370
|
+
peers: peers.map(p => ({ id: p.id, url: p.url })),
|
|
371
|
+
};
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
// Cannot remove self (leader) — leader must transfer leadership first
|
|
375
|
+
if (request.node_id === this.config.node_id) {
|
|
376
|
+
return {
|
|
377
|
+
success: false,
|
|
378
|
+
peers: peers.map(p => ({ id: p.id, url: p.url })),
|
|
379
|
+
};
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
const existing = peers.find(p => p.id === request.node_id);
|
|
383
|
+
if (!existing) {
|
|
384
|
+
// Already gone
|
|
385
|
+
return {
|
|
386
|
+
success: true,
|
|
387
|
+
peers: peers.map(p => ({ id: p.id, url: p.url })),
|
|
388
|
+
};
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
const updated = peers.filter(p => p.id !== request.node_id);
|
|
392
|
+
this.raft.setPeers(updated);
|
|
393
|
+
|
|
394
|
+
this.raft.appendDurableEntry('membership_change', {
|
|
395
|
+
action: 'leave',
|
|
396
|
+
node_id: request.node_id,
|
|
397
|
+
peer_count: updated.length,
|
|
398
|
+
});
|
|
399
|
+
|
|
400
|
+
return {
|
|
401
|
+
success: true,
|
|
402
|
+
peers: updated.map(p => ({ id: p.id, url: p.url })),
|
|
192
403
|
};
|
|
193
404
|
}
|
|
194
405
|
|
|
406
|
+
private maybeCompact(db: Database): { logCompacted: number; eventsPruned: number } {
|
|
407
|
+
const now = Date.now();
|
|
408
|
+
const interval = this.config.compaction_interval_ms ?? 300000;
|
|
409
|
+
|
|
410
|
+
// Respect minimum interval between compaction runs
|
|
411
|
+
if (interval > 0 && now - this.lastCompactionAt < interval) {
|
|
412
|
+
return { logCompacted: 0, eventsPruned: 0 };
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
let logCompacted = 0;
|
|
416
|
+
let eventsPruned = 0;
|
|
417
|
+
|
|
418
|
+
// Compact raft log if threshold exceeded
|
|
419
|
+
const maxLogEntries = this.config.max_log_entries ?? 10000;
|
|
420
|
+
if (maxLogEntries > 0) {
|
|
421
|
+
const logCount = this.raft.getLogEntryCount();
|
|
422
|
+
if (logCount > maxLogEntries) {
|
|
423
|
+
const versionVector = getVersionVector(db);
|
|
424
|
+
const result = this.raft.createSnapshotAndCompact(versionVector);
|
|
425
|
+
logCompacted = result.entries_removed;
|
|
426
|
+
}
|
|
427
|
+
}
|
|
428
|
+
|
|
429
|
+
// Prune cluster_events if threshold exceeded
|
|
430
|
+
const maxEvents = this.config.max_cluster_events ?? 50000;
|
|
431
|
+
if (maxEvents > 0) {
|
|
432
|
+
const eventCount = getClusterEventCount(db);
|
|
433
|
+
if (eventCount > maxEvents) {
|
|
434
|
+
eventsPruned = pruneClusterEvents(db, maxEvents);
|
|
435
|
+
if (eventsPruned > 0) {
|
|
436
|
+
this.refreshCache(db);
|
|
437
|
+
}
|
|
438
|
+
}
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
if (logCompacted > 0 || eventsPruned > 0) {
|
|
442
|
+
this.lastCompactionAt = now;
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
return { logCompacted, eventsPruned };
|
|
446
|
+
}
|
|
447
|
+
|
|
195
448
|
private refreshCache(db: Database): void {
|
|
196
449
|
this.eventCache = getAllClusterEvents(db).slice(-20000);
|
|
197
450
|
this.versionVectorCache = getVersionVector(db);
|
|
198
451
|
}
|
|
199
452
|
|
|
200
|
-
private async pullEventsFromPeers(db: Database): Promise<
|
|
201
|
-
|
|
453
|
+
private async pullEventsFromPeers(db: Database): Promise<{
|
|
454
|
+
imported: number;
|
|
455
|
+
usedSnapshot: boolean;
|
|
456
|
+
catchUpApplied: number;
|
|
457
|
+
catchUpTotal: number;
|
|
458
|
+
}> {
|
|
459
|
+
const peers = this.raft.getPeers();
|
|
460
|
+
if (peers.length === 0) {
|
|
461
|
+
return { imported: 0, usedSnapshot: false, catchUpApplied: 0, catchUpTotal: 0 };
|
|
462
|
+
}
|
|
202
463
|
|
|
203
|
-
let
|
|
464
|
+
let imported = 0;
|
|
465
|
+
const syncTimestamp = new Date().toISOString();
|
|
466
|
+
this.lastSyncAt = syncTimestamp;
|
|
204
467
|
|
|
205
|
-
for (const peer of
|
|
468
|
+
for (const peer of peers) {
|
|
206
469
|
if (peer.id === this.config.node_id) continue;
|
|
207
470
|
|
|
208
|
-
const localVector =
|
|
471
|
+
const localVector = getEffectiveVersionVector(db);
|
|
472
|
+
const syncStart = Date.now();
|
|
209
473
|
const response = await this.requestDelta(peer, localVector, 4000);
|
|
210
|
-
if (!response || response.events.length === 0) continue;
|
|
211
474
|
|
|
212
|
-
|
|
475
|
+
if (!response) {
|
|
476
|
+
this.peerLagMap.set(peer.id, {
|
|
477
|
+
peer_id: peer.id,
|
|
478
|
+
peer_url: peer.url,
|
|
479
|
+
reachable: false,
|
|
480
|
+
events_behind: 0,
|
|
481
|
+
last_sync_at: syncTimestamp,
|
|
482
|
+
last_sync_duration_ms: Date.now() - syncStart,
|
|
483
|
+
last_sync_events_applied: 0,
|
|
484
|
+
});
|
|
485
|
+
continue;
|
|
486
|
+
}
|
|
487
|
+
|
|
488
|
+
// If the peer advertises a higher fencing token, step down
|
|
489
|
+
if (
|
|
490
|
+
typeof response.fencing_token === 'number' &&
|
|
491
|
+
response.fencing_token > this.raft.currentTerm
|
|
492
|
+
) {
|
|
493
|
+
this.raft.stepDown(response.fencing_token, null);
|
|
494
|
+
}
|
|
495
|
+
|
|
496
|
+
// Detect if the delta is insufficient (peer's log was truncated past our position)
|
|
497
|
+
if (this.isDeltaInsufficient(localVector, response.version_vector, response.events)) {
|
|
498
|
+
const recovery = await this.recoverFromSnapshot(db, peer);
|
|
499
|
+
if (recovery !== null) {
|
|
500
|
+
this.peerLagMap.set(peer.id, {
|
|
501
|
+
peer_id: peer.id,
|
|
502
|
+
peer_url: peer.url,
|
|
503
|
+
reachable: true,
|
|
504
|
+
events_behind: 0,
|
|
505
|
+
last_sync_at: syncTimestamp,
|
|
506
|
+
last_sync_duration_ms: Date.now() - syncStart,
|
|
507
|
+
last_sync_events_applied: recovery.applied,
|
|
508
|
+
});
|
|
509
|
+
return {
|
|
510
|
+
imported: 0,
|
|
511
|
+
usedSnapshot: true,
|
|
512
|
+
catchUpApplied: recovery.applied,
|
|
513
|
+
catchUpTotal: recovery.total,
|
|
514
|
+
};
|
|
515
|
+
}
|
|
516
|
+
// Snapshot recovery failed — fall through and apply whatever delta we have
|
|
517
|
+
}
|
|
518
|
+
|
|
519
|
+
const eventsBehind = response.events.length;
|
|
520
|
+
const peerApplied =
|
|
521
|
+
eventsBehind > 0 ? applyRemoteEvents(db, this.config.node_id, response.events) : 0;
|
|
522
|
+
imported += peerApplied;
|
|
523
|
+
|
|
524
|
+
this.peerLagMap.set(peer.id, {
|
|
525
|
+
peer_id: peer.id,
|
|
526
|
+
peer_url: peer.url,
|
|
527
|
+
reachable: true,
|
|
528
|
+
events_behind: eventsBehind,
|
|
529
|
+
last_sync_at: syncTimestamp,
|
|
530
|
+
last_sync_duration_ms: Date.now() - syncStart,
|
|
531
|
+
last_sync_events_applied: peerApplied,
|
|
532
|
+
});
|
|
213
533
|
}
|
|
214
534
|
|
|
215
|
-
|
|
535
|
+
// If we had been catching up and now the effective vector matches peers, mark done
|
|
536
|
+
if (this.raft.isCatchingUp) {
|
|
537
|
+
this.raft.isCatchingUp = false;
|
|
538
|
+
}
|
|
539
|
+
|
|
540
|
+
return { imported, usedSnapshot: false, catchUpApplied: 0, catchUpTotal: 0 };
|
|
541
|
+
}
|
|
542
|
+
|
|
543
|
+
/**
|
|
544
|
+
* Returns true when the delta response is missing events the peer should have.
|
|
545
|
+
* This happens when the peer's event cache has been truncated (log compaction)
|
|
546
|
+
* and can no longer provide all events since our last known version.
|
|
547
|
+
*/
|
|
548
|
+
private isDeltaInsufficient(
|
|
549
|
+
localVector: VersionVector,
|
|
550
|
+
peerVector: VersionVector,
|
|
551
|
+
receivedEvents: ClusterEvent[]
|
|
552
|
+
): boolean {
|
|
553
|
+
// Count how many events we actually received per actor
|
|
554
|
+
const received: Record<string, number> = {};
|
|
555
|
+
for (const event of receivedEvents) {
|
|
556
|
+
received[event.version.actor_id] = (received[event.version.actor_id] ?? 0) + 1;
|
|
557
|
+
}
|
|
558
|
+
|
|
559
|
+
for (const [actorId, peerCounter] of Object.entries(peerVector)) {
|
|
560
|
+
const localCounter = localVector[actorId] ?? 0;
|
|
561
|
+
const needed = peerCounter - localCounter;
|
|
562
|
+
if (needed <= 0) continue;
|
|
563
|
+
|
|
564
|
+
const receivedCount = received[actorId] ?? 0;
|
|
565
|
+
if (receivedCount < needed) {
|
|
566
|
+
// We're missing events for this actor that the peer should have
|
|
567
|
+
return true;
|
|
568
|
+
}
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
return false;
|
|
572
|
+
}
|
|
573
|
+
|
|
574
|
+
/**
|
|
575
|
+
* Requests a full snapshot from the given peer and applies it locally.
|
|
576
|
+
* Marks the node as no longer catching up once complete.
|
|
577
|
+
* Returns { applied, total } on success, null on failure.
|
|
578
|
+
*/
|
|
579
|
+
private async recoverFromSnapshot(
|
|
580
|
+
db: Database,
|
|
581
|
+
peer: ClusterPeerConfig
|
|
582
|
+
): Promise<{ applied: number; total: number } | null> {
|
|
583
|
+
this.raft.isCatchingUp = true;
|
|
584
|
+
this.raft.appendDurableEntry('runtime', {
|
|
585
|
+
event: 'snapshot_recovery_start',
|
|
586
|
+
node_id: this.config.node_id,
|
|
587
|
+
peer_id: peer.id,
|
|
588
|
+
});
|
|
589
|
+
|
|
590
|
+
const snapshot = await this.requestSnapshot(peer);
|
|
591
|
+
if (!snapshot) {
|
|
592
|
+
return null;
|
|
593
|
+
}
|
|
594
|
+
|
|
595
|
+
const { applied, total } = this.applySnapshot(db, snapshot);
|
|
596
|
+
|
|
597
|
+
this.raft.isCatchingUp = false;
|
|
598
|
+
this.raft.appendDurableEntry('runtime', {
|
|
599
|
+
event: 'snapshot_recovery_complete',
|
|
600
|
+
node_id: this.config.node_id,
|
|
601
|
+
peer_id: peer.id,
|
|
602
|
+
rows_applied: applied,
|
|
603
|
+
rows_total: total,
|
|
604
|
+
});
|
|
605
|
+
|
|
606
|
+
return { applied, total };
|
|
607
|
+
}
|
|
608
|
+
|
|
609
|
+
/**
|
|
610
|
+
* Applies a snapshot to the local database, upserting all rows from all tables.
|
|
611
|
+
* Stores the snapshot's version vector so future delta requests start from here.
|
|
612
|
+
*/
|
|
613
|
+
private applySnapshot(
|
|
614
|
+
db: Database,
|
|
615
|
+
snapshot: ClusterSnapshot
|
|
616
|
+
): { applied: number; total: number } {
|
|
617
|
+
let applied = 0;
|
|
618
|
+
let total = 0;
|
|
619
|
+
|
|
620
|
+
for (const adapter of REPLICATED_TABLES) {
|
|
621
|
+
const rows = snapshot.tables[adapter.table];
|
|
622
|
+
if (!rows) continue;
|
|
623
|
+
total += rows.length;
|
|
624
|
+
for (const row of rows) {
|
|
625
|
+
adapter.upsert(db, row.payload);
|
|
626
|
+
applied++;
|
|
627
|
+
}
|
|
628
|
+
}
|
|
629
|
+
|
|
630
|
+
// Record the snapshot version vector so future delta requests
|
|
631
|
+
// only ask for events newer than this snapshot
|
|
632
|
+
setSnapshotVersionVector(db, snapshot.version_vector);
|
|
633
|
+
|
|
634
|
+
return { applied, total };
|
|
635
|
+
}
|
|
636
|
+
|
|
637
|
+
/**
|
|
638
|
+
* Builds a full snapshot of all replicated tables from the current db state.
|
|
639
|
+
* Called during sync to keep cachedSnapshot fresh for the HTTP endpoint.
|
|
640
|
+
*/
|
|
641
|
+
private buildSnapshot(db: Database): ClusterSnapshot {
|
|
642
|
+
const tables: ClusterSnapshot['tables'] = {};
|
|
643
|
+
|
|
644
|
+
for (const adapter of REPLICATED_TABLES) {
|
|
645
|
+
const rows = queryAll<Record<string, unknown>>(db, adapter.selectSql);
|
|
646
|
+
tables[adapter.table] = rows.map(row => ({
|
|
647
|
+
rowId: adapter.rowId(row),
|
|
648
|
+
payload: adapter.payload(row),
|
|
649
|
+
}));
|
|
650
|
+
}
|
|
651
|
+
|
|
652
|
+
return {
|
|
653
|
+
version_vector: getVersionVector(db),
|
|
654
|
+
tables,
|
|
655
|
+
};
|
|
216
656
|
}
|
|
217
657
|
|
|
218
658
|
private async requestDelta(
|
|
@@ -223,9 +663,14 @@ export class ClusterRuntime {
|
|
|
223
663
|
return this.postJson<DeltaResponse>(peer, '/cluster/v1/events/delta', {
|
|
224
664
|
version_vector: versionVector,
|
|
225
665
|
limit,
|
|
666
|
+
fencing_token: this.raft.getFencingToken(),
|
|
226
667
|
});
|
|
227
668
|
}
|
|
228
669
|
|
|
670
|
+
private async requestSnapshot(peer: ClusterPeerConfig): Promise<ClusterSnapshot | null> {
|
|
671
|
+
return this.getJson<ClusterSnapshot>(peer, '/cluster/v1/snapshot');
|
|
672
|
+
}
|
|
673
|
+
|
|
229
674
|
private getDeltaFromCache(remoteVersionVector: VersionVector, limit: number): ClusterEvent[] {
|
|
230
675
|
return this.eventCache
|
|
231
676
|
.filter(event => {
|
|
@@ -254,6 +699,18 @@ export class ClusterRuntime {
|
|
|
254
699
|
);
|
|
255
700
|
}
|
|
256
701
|
|
|
702
|
+
private async getJson<T>(peer: ClusterPeerConfig, path: string): Promise<T | null> {
|
|
703
|
+
const normalizedBase = peer.url.endsWith('/') ? peer.url : `${peer.url}/`;
|
|
704
|
+
const url = new URL(path.replace(/^\//, ''), normalizedBase).toString();
|
|
705
|
+
|
|
706
|
+
return fetchClusterStatusOrPostJson<T>(
|
|
707
|
+
url,
|
|
708
|
+
this.config.request_timeout_ms,
|
|
709
|
+
this.config.auth_token,
|
|
710
|
+
{ method: 'GET' }
|
|
711
|
+
);
|
|
712
|
+
}
|
|
713
|
+
|
|
257
714
|
private handleBackgroundError(error: unknown): void {
|
|
258
715
|
if (!this.started || this.stopping) return;
|
|
259
716
|
const err = error as NodeJS.ErrnoException;
|
|
@@ -271,6 +728,80 @@ export class ClusterRuntime {
|
|
|
271
728
|
}
|
|
272
729
|
}
|
|
273
730
|
|
|
731
|
+
export async function fetchReplicationLag(
|
|
732
|
+
config: ClusterConfig
|
|
733
|
+
): Promise<ReplicationLagSummary | null> {
|
|
734
|
+
if (!config.enabled) return null;
|
|
735
|
+
|
|
736
|
+
const host = config.listen_host === '0.0.0.0' ? '127.0.0.1' : config.listen_host;
|
|
737
|
+
const url = `http://${host}:${config.listen_port}/cluster/v1/replication-lag`;
|
|
738
|
+
|
|
739
|
+
return fetchClusterStatusOrPostJson<ReplicationLagSummary>(
|
|
740
|
+
url,
|
|
741
|
+
config.request_timeout_ms,
|
|
742
|
+
config.auth_token,
|
|
743
|
+
{ method: 'GET' }
|
|
744
|
+
);
|
|
745
|
+
}
|
|
746
|
+
|
|
747
|
+
/**
|
|
748
|
+
* Fetches recent cluster events from the local runtime via the delta endpoint.
|
|
749
|
+
* Uses an empty version vector to request recent events up to the given limit.
|
|
750
|
+
*/
|
|
751
|
+
export async function fetchLocalClusterEvents(
|
|
752
|
+
config: ClusterConfig,
|
|
753
|
+
limit: number = 50
|
|
754
|
+
): Promise<ClusterEvent[] | null> {
|
|
755
|
+
if (!config.enabled) return null;
|
|
756
|
+
|
|
757
|
+
const host = config.listen_host === '0.0.0.0' ? '127.0.0.1' : config.listen_host;
|
|
758
|
+
const url = `http://${host}:${config.listen_port}/cluster/v1/events/delta`;
|
|
759
|
+
|
|
760
|
+
const response = await fetchClusterStatusOrPostJson<{ events: ClusterEvent[] }>(
|
|
761
|
+
url,
|
|
762
|
+
config.request_timeout_ms,
|
|
763
|
+
config.auth_token,
|
|
764
|
+
{ method: 'POST', body: { version_vector: {}, limit } }
|
|
765
|
+
);
|
|
766
|
+
|
|
767
|
+
return response?.events ?? null;
|
|
768
|
+
}
|
|
769
|
+
|
|
770
|
+
/**
|
|
771
|
+
* POSTs to the local cluster runtime at the given path.
|
|
772
|
+
*/
|
|
773
|
+
export async function postToLocalCluster<T>(
|
|
774
|
+
config: ClusterConfig,
|
|
775
|
+
path: string,
|
|
776
|
+
body: unknown
|
|
777
|
+
): Promise<T | null> {
|
|
778
|
+
if (!config.enabled) return null;
|
|
779
|
+
|
|
780
|
+
const host = config.listen_host === '0.0.0.0' ? '127.0.0.1' : config.listen_host;
|
|
781
|
+
const url = `http://${host}:${config.listen_port}${path}`;
|
|
782
|
+
|
|
783
|
+
return fetchClusterStatusOrPostJson<T>(url, config.request_timeout_ms, config.auth_token, {
|
|
784
|
+
method: 'POST',
|
|
785
|
+
body,
|
|
786
|
+
});
|
|
787
|
+
}
|
|
788
|
+
|
|
789
|
+
/**
|
|
790
|
+
* POSTs to a peer cluster node at the given URL and path.
|
|
791
|
+
*/
|
|
792
|
+
export async function postToPeerCluster<T>(
|
|
793
|
+
peerUrl: string,
|
|
794
|
+
path: string,
|
|
795
|
+
body: unknown,
|
|
796
|
+
options: ClusterStatusFetchOptions
|
|
797
|
+
): Promise<T | null> {
|
|
798
|
+
const url = `${peerUrl.replace(/\/$/, '')}${path}`;
|
|
799
|
+
return fetchClusterStatusOrPostJson<T>(url, options.timeoutMs, options.authToken, {
|
|
800
|
+
method: 'POST',
|
|
801
|
+
body,
|
|
802
|
+
});
|
|
803
|
+
}
|
|
804
|
+
|
|
274
805
|
export async function fetchLocalClusterStatus(
|
|
275
806
|
config: ClusterConfig
|
|
276
807
|
): Promise<ClusterStatus | null> {
|
|
@@ -284,10 +815,14 @@ export async function fetchLocalClusterStatus(
|
|
|
284
815
|
is_leader: true,
|
|
285
816
|
leader_id: config.node_id,
|
|
286
817
|
leader_url: null,
|
|
818
|
+
fencing_token: 0,
|
|
819
|
+
leader_lease_valid: true,
|
|
820
|
+
leader_lease_duration_ms: config.leader_lease_ms ?? config.heartbeat_interval_ms * 3,
|
|
287
821
|
raft_commit_index: 0,
|
|
288
822
|
raft_last_applied: 0,
|
|
289
823
|
raft_last_log_index: 0,
|
|
290
824
|
peers: config.peers.map(peer => ({ id: peer.id, url: peer.url })),
|
|
825
|
+
is_catching_up: false,
|
|
291
826
|
};
|
|
292
827
|
}
|
|
293
828
|
|
|
@@ -345,10 +880,14 @@ function parseClusterStatus(input: Record<string, unknown>): ClusterStatus {
|
|
|
345
880
|
is_leader: input.is_leader === true,
|
|
346
881
|
leader_id: typeof input.leader_id === 'string' ? input.leader_id : null,
|
|
347
882
|
leader_url: typeof input.leader_url === 'string' ? input.leader_url : null,
|
|
883
|
+
fencing_token: toInt(input.fencing_token),
|
|
884
|
+
leader_lease_valid: input.leader_lease_valid === true,
|
|
885
|
+
leader_lease_duration_ms: toInt(input.leader_lease_duration_ms),
|
|
348
886
|
raft_commit_index: toInt(input.raft_commit_index),
|
|
349
887
|
raft_last_applied: toInt(input.raft_last_applied),
|
|
350
888
|
raft_last_log_index: toInt(input.raft_last_log_index),
|
|
351
889
|
peers,
|
|
890
|
+
is_catching_up: input.is_catching_up === true,
|
|
352
891
|
};
|
|
353
892
|
}
|
|
354
893
|
|