hungry-ghost-hive 0.45.0 → 0.46.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/commands/cluster.d.ts.map +1 -1
- package/dist/cli/commands/cluster.js +348 -1
- package/dist/cli/commands/cluster.js.map +1 -1
- package/dist/cli/commands/cluster.test.js +313 -9
- package/dist/cli/commands/cluster.test.js.map +1 -1
- package/dist/cli/commands/req-spawn.test.d.ts +2 -0
- package/dist/cli/commands/req-spawn.test.d.ts.map +1 -0
- package/dist/cli/commands/req-spawn.test.js +116 -0
- package/dist/cli/commands/req-spawn.test.js.map +1 -0
- package/dist/cli/commands/req.d.ts.map +1 -1
- package/dist/cli/commands/req.js +21 -13
- package/dist/cli/commands/req.js.map +1 -1
- package/dist/cluster/cluster-http-server.d.ts +32 -0
- package/dist/cluster/cluster-http-server.d.ts.map +1 -1
- package/dist/cluster/cluster-http-server.js +42 -0
- package/dist/cluster/cluster-http-server.js.map +1 -1
- package/dist/cluster/distributed-runtime-coverage.test.js +9 -0
- package/dist/cluster/distributed-runtime-coverage.test.js.map +1 -1
- package/dist/cluster/distributed-system.test.js +135 -0
- package/dist/cluster/distributed-system.test.js.map +1 -1
- package/dist/cluster/events.d.ts +23 -0
- package/dist/cluster/events.d.ts.map +1 -1
- package/dist/cluster/events.js +74 -0
- package/dist/cluster/events.js.map +1 -1
- package/dist/cluster/heartbeat-manager.d.ts +2 -0
- package/dist/cluster/heartbeat-manager.d.ts.map +1 -1
- package/dist/cluster/heartbeat-manager.js +42 -6
- package/dist/cluster/heartbeat-manager.js.map +1 -1
- package/dist/cluster/membership.test.d.ts +2 -0
- package/dist/cluster/membership.test.d.ts.map +1 -0
- package/dist/cluster/membership.test.js +416 -0
- package/dist/cluster/membership.test.js.map +1 -0
- package/dist/cluster/partition-safety.test.d.ts +2 -0
- package/dist/cluster/partition-safety.test.d.ts.map +1 -0
- package/dist/cluster/partition-safety.test.js +440 -0
- package/dist/cluster/partition-safety.test.js.map +1 -0
- package/dist/cluster/raft-state-machine.d.ts +33 -1
- package/dist/cluster/raft-state-machine.d.ts.map +1 -1
- package/dist/cluster/raft-state-machine.js +65 -3
- package/dist/cluster/raft-state-machine.js.map +1 -1
- package/dist/cluster/raft-store.d.ts +26 -1
- package/dist/cluster/raft-store.d.ts.map +1 -1
- package/dist/cluster/raft-store.js +137 -0
- package/dist/cluster/raft-store.js.map +1 -1
- package/dist/cluster/replication-lag.test.d.ts +2 -0
- package/dist/cluster/replication-lag.test.d.ts.map +1 -0
- package/dist/cluster/replication-lag.test.js +239 -0
- package/dist/cluster/replication-lag.test.js.map +1 -0
- package/dist/cluster/replication.d.ts +2 -2
- package/dist/cluster/replication.d.ts.map +1 -1
- package/dist/cluster/replication.js +1 -1
- package/dist/cluster/replication.js.map +1 -1
- package/dist/cluster/runtime.d.ts +78 -0
- package/dist/cluster/runtime.d.ts.map +1 -1
- package/dist/cluster/runtime.js +400 -13
- package/dist/cluster/runtime.js.map +1 -1
- package/dist/cluster/state-recovery.test.d.ts +2 -0
- package/dist/cluster/state-recovery.test.d.ts.map +1 -0
- package/dist/cluster/state-recovery.test.js +310 -0
- package/dist/cluster/state-recovery.test.js.map +1 -0
- package/dist/cluster/types.d.ts +30 -0
- package/dist/cluster/types.d.ts.map +1 -1
- package/dist/config/schema.d.ts +48 -0
- package/dist/config/schema.d.ts.map +1 -1
- package/dist/config/schema.js +11 -0
- package/dist/config/schema.js.map +1 -1
- package/dist/context-files/generator.js +1 -1
- package/dist/context-files/generator.js.map +1 -1
- package/dist/context-files/generator.test.js +51 -0
- package/dist/context-files/generator.test.js.map +1 -1
- package/dist/orchestrator/orphan-recovery.d.ts +1 -1
- package/dist/orchestrator/orphan-recovery.d.ts.map +1 -1
- package/dist/orchestrator/orphan-recovery.js +4 -4
- package/dist/orchestrator/orphan-recovery.js.map +1 -1
- package/dist/orchestrator/prompt-templates.d.ts +3 -1
- package/dist/orchestrator/prompt-templates.d.ts.map +1 -1
- package/dist/orchestrator/prompt-templates.js +45 -8
- package/dist/orchestrator/prompt-templates.js.map +1 -1
- package/dist/orchestrator/prompt-templates.test.js +210 -0
- package/dist/orchestrator/prompt-templates.test.js.map +1 -1
- package/dist/orchestrator/scheduler.d.ts +1 -0
- package/dist/orchestrator/scheduler.d.ts.map +1 -1
- package/dist/orchestrator/scheduler.js +15 -10
- package/dist/orchestrator/scheduler.js.map +1 -1
- package/dist/orchestrator/scheduler.test.js +97 -6
- package/dist/orchestrator/scheduler.test.js.map +1 -1
- package/package.json +1 -1
- package/src/cli/commands/cluster.test.ts +387 -9
- package/src/cli/commands/cluster.ts +486 -1
- package/src/cli/commands/req-spawn.test.ts +153 -0
- package/src/cli/commands/req.ts +31 -18
- package/src/cluster/cluster-http-server.ts +80 -0
- package/src/cluster/distributed-runtime-coverage.test.ts +9 -0
- package/src/cluster/distributed-system.test.ts +168 -0
- package/src/cluster/events.ts +90 -0
- package/src/cluster/heartbeat-manager.ts +48 -6
- package/src/cluster/membership.test.ts +498 -0
- package/src/cluster/partition-safety.test.ts +523 -0
- package/src/cluster/raft-state-machine.ts +76 -4
- package/src/cluster/raft-store.ts +167 -1
- package/src/cluster/replication-lag.test.ts +284 -0
- package/src/cluster/replication.ts +6 -0
- package/src/cluster/runtime.ts +551 -12
- package/src/cluster/state-recovery.test.ts +420 -0
- package/src/cluster/types.ts +32 -0
- package/src/config/schema.ts +11 -0
- package/src/context-files/generator.test.ts +55 -0
- package/src/context-files/generator.ts +5 -5
- package/src/orchestrator/orphan-recovery.ts +32 -13
- package/src/orchestrator/prompt-templates.test.ts +263 -0
- package/src/orchestrator/prompt-templates.ts +49 -8
- package/src/orchestrator/scheduler.test.ts +129 -6
- package/src/orchestrator/scheduler.ts +46 -20
|
@@ -0,0 +1,523 @@
|
|
|
1
|
+
// Licensed under the Hungry Ghost Hive License. See LICENSE.
|
|
2
|
+
|
|
3
|
+
import { mkdirSync, mkdtempSync, rmSync } from 'fs';
|
|
4
|
+
import { createServer as createNetServer } from 'net';
|
|
5
|
+
import { tmpdir } from 'os';
|
|
6
|
+
import { join } from 'path';
|
|
7
|
+
import { afterEach, describe, expect, it } from 'vitest';
|
|
8
|
+
import type { ClusterConfig } from '../config/schema.js';
|
|
9
|
+
import { ClusterRuntime } from './runtime.js';
|
|
10
|
+
|
|
11
|
+
/** Everything a test needs to drive and later tear down one started runtime. */
interface RuntimeFixture {
  /** Temporary directory created for this runtime; removed during cleanup. */
  root: string;
  /** The `.hive` directory inside `root` that is handed to the runtime. */
  hiveDir: string;
  /** The cluster configuration the runtime was constructed with. */
  config: ClusterConfig;
  /** The started runtime instance. */
  runtime: ClusterRuntime;
}
|
|
17
|
+
|
|
18
|
+
const tempRoots: string[] = [];
|
|
19
|
+
const activeRuntimes: ClusterRuntime[] = [];
|
|
20
|
+
|
|
21
|
+
// Per-test teardown: stop every runtime started by the test, then delete
// the temporary directories that were created for them.
afterEach(async () => {
  const runtimesToStop = activeRuntimes.splice(0);
  for (const runtime of runtimesToStop) {
    try {
      await runtime.stop();
    } catch {
      // Best effort shutdown for test cleanup.
    }
  }

  const rootsToRemove = tempRoots.splice(0);
  rootsToRemove.forEach(root => {
    rmSync(root, { recursive: true, force: true });
  });
});
|
|
34
|
+
|
|
35
|
+
describe('fencing token validation', () => {
  const HEARTBEAT_PATH = '/cluster/v1/election/heartbeat';
  const VOTE_PATH = '/cluster/v1/election/request-vote';

  // Fixed 2s election timeout shared by every fixture in this suite
  // (presumably long enough that the node does not campaign mid-test).
  const slowElection = { election_timeout_min_ms: 2000, election_timeout_max_ms: 2000 };

  /** Sends a leader heartbeat and returns the parsed JSON reply. */
  const sendHeartbeat = (baseUrl: string, term: number, leaderId: string, fencingToken: number) =>
    postJson(baseUrl, HEARTBEAT_PATH, {
      term,
      leader_id: leaderId,
      fencing_token: fencingToken,
    });

  /** POSTs a delta request and returns the raw fetch response. */
  const requestDelta = (baseUrl: string, payload: Record<string, unknown>) =>
    fetch(`${baseUrl}/cluster/v1/events/delta`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
    });

  it('rejects heartbeats with fencing_token lower than term', async () => {
    if (!(await canListenOnLocalhost())) return;

    const { config } = await startRuntimeFixture({ node_id: 'node-fence-reject', ...slowElection });
    const baseUrl = config.public_url;

    // A vote request at term 5 moves the node's term forward first.
    await postJson(baseUrl, VOTE_PATH, {
      term: 5,
      candidate_id: 'candidate-5',
    });

    // Same term, but the fencing token lags behind it.
    const reply = await sendHeartbeat(baseUrl, 5, 'candidate-5', 3);

    expect(reply.success).toBe(false);
    expect(reply.fencing_token).toBe(5);
  });

  it('accepts heartbeats with valid fencing_token', async () => {
    if (!(await canListenOnLocalhost())) return;

    const { config } = await startRuntimeFixture({ node_id: 'node-fence-accept', ...slowElection });

    const reply = await sendHeartbeat(config.public_url, 3, 'leader-3', 3);

    expect(reply.success).toBe(true);
    expect(reply.fencing_token).toBe(3);
  });

  it('rejects delta requests with stale fencing_token', async () => {
    if (!(await canListenOnLocalhost())) return;

    const { config } = await startRuntimeFixture({ node_id: 'node-delta-fence', ...slowElection });
    const baseUrl = config.public_url;

    // Bring the node to term 10.
    await sendHeartbeat(baseUrl, 10, 'leader-10', 10);

    // Ask for a delta while presenting an older token.
    const response = await requestDelta(baseUrl, {
      version_vector: {},
      fencing_token: 5,
    });

    expect(response.status).toBe(409);
    const payload = (await response.json()) as { error: string; fencing_token: number };
    expect(payload.error).toContain('stale leader epoch');
    expect(payload.fencing_token).toBe(10);
  });

  it('accepts delta requests with current fencing_token', async () => {
    if (!(await canListenOnLocalhost())) return;

    const { config } = await startRuntimeFixture({
      node_id: 'node-delta-fence-ok',
      ...slowElection,
    });
    const baseUrl = config.public_url;

    // Bring the node to term 4.
    await sendHeartbeat(baseUrl, 4, 'leader-4', 4);

    // Ask for a delta with the token that matches the term.
    const response = await requestDelta(baseUrl, {
      version_vector: {},
      fencing_token: 4,
    });

    expect(response.status).toBe(200);
    const payload = (await response.json()) as { fencing_token: number };
    expect(payload.fencing_token).toBe(4);
  });

  it('accepts delta requests without fencing_token for backward compatibility', async () => {
    if (!(await canListenOnLocalhost())) return;

    const { config } = await startRuntimeFixture({
      node_id: 'node-delta-no-fence',
      ...slowElection,
    });

    const response = await requestDelta(config.public_url, {
      version_vector: {},
    });

    expect(response.status).toBe(200);
  });

  it('returns fencing_token in status endpoint', async () => {
    if (!(await canListenOnLocalhost())) return;

    const { config, runtime } = await startRuntimeFixture({
      node_id: 'node-status-fence',
      ...slowElection,
    });

    // Bring the node to term 7.
    await sendHeartbeat(config.public_url, 7, 'leader-7', 7);

    const snapshot = runtime.getStatus();
    expect(snapshot.fencing_token).toBe(7);
    expect(snapshot.term).toBe(7);
  });
});
|
|
185
|
+
|
|
186
|
+
describe('leader lease validation', () => {
  const HEARTBEAT_PATH = '/cluster/v1/election/heartbeat';

  /** Sends a heartbeat whose fencing token equals its term. */
  const heartbeatAtTerm = (baseUrl: string, term: number, leaderId: string) =>
    postJson(baseUrl, HEARTBEAT_PATH, {
      term,
      leader_id: leaderId,
      fencing_token: term,
    });

  /** Resolves after the given number of milliseconds. */
  const sleep = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

  it('reports lease invalid when no heartbeat has been received', async () => {
    if (!(await canListenOnLocalhost())) return;

    const { runtime } = await startRuntimeFixture({
      node_id: 'node-lease-none',
      election_timeout_min_ms: 2000,
      election_timeout_max_ms: 2000,
    });

    expect(runtime.getStatus().leader_lease_valid).toBe(false);
  });

  it('reports lease valid immediately after receiving heartbeat', async () => {
    if (!(await canListenOnLocalhost())) return;

    const { config, runtime } = await startRuntimeFixture({
      node_id: 'node-lease-fresh',
      election_timeout_min_ms: 2000,
      election_timeout_max_ms: 2000,
      heartbeat_interval_ms: 100,
    });

    await heartbeatAtTerm(config.public_url, 2, 'leader-2');

    expect(runtime.getStatus().leader_lease_valid).toBe(true);
  });

  it('leader always reports lease valid', async () => {
    if (!(await canListenOnLocalhost())) return;

    // Short timeouts and no peers: the test waits for this node to become leader.
    const { runtime } = await startRuntimeFixture({
      node_id: 'node-lease-leader',
      election_timeout_min_ms: 80,
      election_timeout_max_ms: 120,
      heartbeat_interval_ms: 60,
    });

    await waitFor(() => runtime.getStatus().is_leader, 4000);

    expect(runtime.getStatus().leader_lease_valid).toBe(true);
  });

  it('reports lease expired after timeout elapses without heartbeat', async () => {
    if (!(await canListenOnLocalhost())) return;

    const leaseMs = 150;
    const { config, runtime } = await startRuntimeFixture({
      node_id: 'node-lease-expire',
      election_timeout_min_ms: 5000,
      election_timeout_max_ms: 5000,
      heartbeat_interval_ms: 50,
      leader_lease_ms: leaseMs,
    });

    // One heartbeat establishes the lease.
    await heartbeatAtTerm(config.public_url, 1, 'leader-1');
    expect(runtime.getStatus().leader_lease_valid).toBe(true);

    // Stay silent for longer than the lease window.
    await sleep(leaseMs + 50);
    expect(runtime.getStatus().leader_lease_valid).toBe(false);
  });

  it('reports correct leader_lease_duration_ms from config', async () => {
    if (!(await canListenOnLocalhost())) return;

    const { runtime } = await startRuntimeFixture({
      node_id: 'node-lease-config',
      heartbeat_interval_ms: 200,
      leader_lease_ms: 1000,
    });

    expect(runtime.getStatus().leader_lease_duration_ms).toBe(1000);
  });

  it('defaults leader_lease_duration_ms to 3x heartbeat_interval_ms', async () => {
    if (!(await canListenOnLocalhost())) return;

    const { runtime } = await startRuntimeFixture({
      node_id: 'node-lease-default',
      heartbeat_interval_ms: 200,
    });

    expect(runtime.getStatus().leader_lease_duration_ms).toBe(600);
  });

  it('resets lease on step-down from higher term', async () => {
    if (!(await canListenOnLocalhost())) return;

    const { config, runtime } = await startRuntimeFixture({
      node_id: 'node-lease-stepdown',
      election_timeout_min_ms: 5000,
      election_timeout_max_ms: 5000,
      heartbeat_interval_ms: 100,
    });
    const baseUrl = config.public_url;

    // Lease is established at term 2.
    await heartbeatAtTerm(baseUrl, 2, 'leader-2');
    expect(runtime.getStatus().leader_lease_valid).toBe(true);

    // A vote request at a higher term arrives; the lease is expected to reset.
    await postJson(baseUrl, '/cluster/v1/election/request-vote', {
      term: 5,
      candidate_id: 'candidate-5',
    });

    // No heartbeat from a new leader has been seen yet, so the lease is invalid.
    expect(runtime.getStatus().leader_lease_valid).toBe(false);
  });
});
|
|
315
|
+
|
|
316
|
+
describe('partition healing scenarios', () => {
  it('stale leader is fenced after partition heals', async () => {
    if (!(await canListenOnLocalhost())) return;

    const portA = await getFreePort();
    const portB = await getFreePort();
    const urlA = `http://127.0.0.1:${portA}`;
    const urlB = `http://127.0.0.1:${portB}`;

    // Timing settings shared by both nodes.
    const fastElection = {
      election_timeout_min_ms: 80,
      election_timeout_max_ms: 120,
      heartbeat_interval_ms: 60,
    };

    // Each node lists the other as its only peer.
    const configA = await buildConfig({
      node_id: 'node-a-heal',
      listen_port: portA,
      public_url: urlA,
      peers: [{ id: 'node-b-heal', url: urlB }],
      ...fastElection,
    });
    const configB = await buildConfig({
      node_id: 'node-b-heal',
      listen_port: portB,
      public_url: urlB,
      peers: [{ id: 'node-a-heal', url: urlA }],
      ...fastElection,
    });

    const fixtureA = await startRuntimeWithConfig(configA);
    const fixtureB = await startRuntimeWithConfig(configB);

    // Block until one of the two nodes reports itself as leader.
    const someNodeLeads = (): boolean =>
      fixtureA.runtime.getStatus().is_leader || fixtureB.runtime.getStatus().is_leader;
    await waitFor(someNodeLeads, 4000);

    const statusA = fixtureA.runtime.getStatus();
    const statusB = fixtureB.runtime.getStatus();

    // At most one node may claim leadership.
    let leaderCount = 0;
    for (const status of [statusA, statusB]) {
      if (status.is_leader) leaderCount += 1;
    }
    expect(leaderCount).toBeLessThanOrEqual(1);

    // Each node exposes a non-negative fencing token.
    expect(statusA.fencing_token).toBeGreaterThanOrEqual(0);
    expect(statusB.fencing_token).toBeGreaterThanOrEqual(0);
  });

  it('follower rejects stale leader heartbeat after seeing higher term', async () => {
    if (!(await canListenOnLocalhost())) return;

    const heartbeatPath = '/cluster/v1/election/heartbeat';
    const { config, runtime } = await startRuntimeFixture({
      node_id: 'node-heal-reject',
      election_timeout_min_ms: 5000,
      election_timeout_max_ms: 5000,
    });
    const baseUrl = config.public_url;

    // The new leader announces itself at term 10.
    await postJson(baseUrl, heartbeatPath, {
      term: 10,
      leader_id: 'new-leader',
      fencing_token: 10,
    });

    // The previous leader, still at term 5, sends a heartbeat afterwards.
    const staleReply = await postJson(baseUrl, heartbeatPath, {
      term: 5,
      leader_id: 'old-leader',
      fencing_token: 5,
    });

    expect(staleReply.success).toBe(false);
    expect(staleReply.fencing_token).toBe(10);

    // The node must still be following the term-10 leader.
    const snapshot = runtime.getStatus();
    expect(snapshot.leader_id).toBe('new-leader');
    expect(snapshot.term).toBe(10);
  });
});
|
|
396
|
+
|
|
397
|
+
// --- Test helpers ---
|
|
398
|
+
|
|
399
|
+
/**
 * Builds a config from `overrides` and starts a runtime with it.
 *
 * When no `listen_port` override is given, startup is attempted up to five
 * times, rebuilding the config each time, but only `EADDRINUSE` failures are
 * retried. With a `listen_port` override there is a single attempt.
 *
 * @param overrides - Config fields that replace the test defaults.
 * @returns The started fixture.
 * @throws The startup error when it is not retryable, or the last
 *   `EADDRINUSE` error once all attempts are used up.
 */
async function startRuntimeFixture(
  overrides: Partial<ClusterConfig> = {}
): Promise<RuntimeFixture> {
  const portIsPinned = Boolean(overrides.listen_port);
  const maxTries = portIsPinned ? 1 : 5;
  let failure: unknown;
  let tries = 0;

  while (tries < maxTries) {
    tries += 1;
    const config = await buildConfig(overrides);
    try {
      return await startRuntimeWithConfig(config);
    } catch (error) {
      failure = error;
      const retryable =
        !portIsPinned && (error as NodeJS.ErrnoException).code === 'EADDRINUSE';
      if (!retryable) {
        throw error;
      }
    }
  }

  throw failure instanceof Error ? failure : new Error('Failed to start runtime fixture');
}
|
|
421
|
+
|
|
422
|
+
/**
 * Creates a temp directory containing `.hive`, constructs a `ClusterRuntime`
 * for `config`, and starts it.
 *
 * On success the runtime and temp directory are registered in
 * `activeRuntimes` / `tempRoots` so the `afterEach` hook can clean them up.
 * On failure the runtime is stopped (best effort), the temp directory is
 * deleted, and the original error is rethrown.
 *
 * @param config - Complete cluster configuration for the runtime.
 * @returns The started fixture.
 */
async function startRuntimeWithConfig(config: ClusterConfig): Promise<RuntimeFixture> {
  const tempPrefix = join(tmpdir(), `hive-partition-safety-${config.node_id}-`);
  const root = mkdtempSync(tempPrefix);
  const hiveDir = join(root, '.hive');
  mkdirSync(hiveDir, { recursive: true });

  const runtime = new ClusterRuntime(config, { hiveDir });

  try {
    await runtime.start();
  } catch (error) {
    try {
      await runtime.stop();
    } catch {
      // Best effort cleanup for partial starts.
    }
    rmSync(root, { recursive: true, force: true });
    throw error;
  }

  activeRuntimes.push(runtime);
  tempRoots.push(root);

  const fixture: RuntimeFixture = { root, hiveDir, config, runtime };
  return fixture;
}
|
|
444
|
+
|
|
445
|
+
/**
 * Produces a complete `ClusterConfig` for a local test node.
 *
 * The port is `overrides.listen_port` when provided, otherwise a freshly
 * probed free port. Defaults are applied first and then overridden field by
 * field; `public_url` and `peers` fall back to the defaults whenever the
 * override is falsy.
 *
 * @param overrides - Config fields that replace the defaults.
 * @returns The merged configuration.
 */
async function buildConfig(overrides: Partial<ClusterConfig> = {}): Promise<ClusterConfig> {
  const listenPort = overrides.listen_port ?? (await getFreePort());

  const defaults: ClusterConfig = {
    enabled: true,
    node_id: 'node-test',
    listen_host: '127.0.0.1',
    listen_port: listenPort,
    public_url: `http://127.0.0.1:${listenPort}`,
    peers: [],
    heartbeat_interval_ms: 100,
    election_timeout_min_ms: 150,
    election_timeout_max_ms: 250,
    sync_interval_ms: 200,
    request_timeout_ms: 600,
    story_similarity_threshold: 0.8,
  };

  const publicUrl = overrides.public_url || defaults.public_url;
  const peers = overrides.peers || defaults.peers;

  return {
    ...defaults,
    ...overrides,
    public_url: publicUrl,
    peers,
  };
}
|
|
469
|
+
|
|
470
|
+
/**
 * POSTs `body` as JSON to `baseUrl + path` and returns the parsed JSON reply.
 * The HTTP status is not inspected, so error responses with a JSON body are
 * returned like any other reply.
 *
 * @param baseUrl - Origin of the node under test.
 * @param path - Request path appended verbatim to `baseUrl`.
 * @param body - Payload serialized with `JSON.stringify`.
 * @returns The response body parsed as JSON.
 */
async function postJson(
  baseUrl: string,
  path: string,
  body: Record<string, unknown>
): Promise<Record<string, any>> {
  const endpoint = `${baseUrl}${path}`;
  const payload = JSON.stringify(body);

  const response = await fetch(endpoint, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: payload,
  });

  const parsed = (await response.json()) as Record<string, any>;
  return parsed;
}
|
|
483
|
+
|
|
484
|
+
/**
 * Polls `predicate` until it returns true or `timeoutMs` has elapsed.
 *
 * The predicate is always evaluated at least once, and once more after the
 * final sleep, so a condition that becomes true right at the deadline is not
 * misreported as a timeout. Sleeps are at most 25 ms and never run past the
 * deadline.
 *
 * @param predicate - Synchronous condition to poll.
 * @param timeoutMs - Maximum time to wait, in milliseconds.
 * @throws Error('Timed out waiting for condition') when the predicate is
 *   still false once the deadline has passed.
 */
async function waitFor(predicate: () => boolean, timeoutMs: number): Promise<void> {
  const pollIntervalMs = 25;
  const deadline = Date.now() + timeoutMs;

  for (;;) {
    // Check before looking at the clock so the last sleep is always followed
    // by one more evaluation.
    if (predicate()) return;

    const remainingMs = deadline - Date.now();
    if (remainingMs <= 0) break;

    await new Promise<void>(resolve =>
      setTimeout(resolve, Math.min(pollIntervalMs, remainingMs))
    );
  }

  throw new Error('Timed out waiting for condition');
}
|
|
492
|
+
|
|
493
|
+
/**
 * Asks the OS for an unused TCP port on 127.0.0.1 by listening on port 0,
 * reading the assigned port, and closing the probe server again.
 *
 * @returns The port number that was assigned to the probe server.
 * @throws When the probe server emits an error, when no port can be read
 *   from its address, or when closing it fails.
 */
async function getFreePort(): Promise<number> {
  return new Promise<number>((fulfil, fail) => {
    const probe = createNetServer();
    probe.once('error', fail);

    const onListening = (): void => {
      const bound = probe.address();

      if (bound === null || typeof bound === 'string') {
        probe.close(() => fail(new Error('Failed to allocate free port')));
        return;
      }

      const assignedPort = bound.port;
      probe.close(closeError => {
        if (closeError) {
          fail(closeError);
        } else {
          fulfil(assignedPort);
        }
      });
    };

    probe.listen(0, '127.0.0.1', onListening);
  });
}
|
|
515
|
+
|
|
516
|
+
/**
 * Reports whether a free local port can be obtained via `getFreePort`.
 * Tests use this to return early when it resolves to false.
 *
 * @returns true when `getFreePort` succeeds, false when it rejects.
 */
async function canListenOnLocalhost(): Promise<boolean> {
  return getFreePort().then(
    () => true,
    () => false
  );
}
|
|
@@ -2,8 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
import { join } from 'path';
|
|
4
4
|
import type { ClusterConfig, ClusterPeerConfig } from '../config/schema.js';
|
|
5
|
-
import type { DurableLogEntryType } from './raft-store.js';
|
|
5
|
+
import type { CompactionResult, DurableLogEntryType } from './raft-store.js';
|
|
6
6
|
import { RaftMetadataStore } from './raft-store.js';
|
|
7
|
+
import type { VersionVector } from './types.js';
|
|
7
8
|
|
|
8
9
|
type NodeRole = 'leader' | 'follower' | 'candidate';
|
|
9
10
|
|
|
@@ -29,6 +30,16 @@ export class RaftStateMachine {
|
|
|
29
30
|
currentTerm = 0;
|
|
30
31
|
votedFor: string | null = null;
|
|
31
32
|
leaderId: string | null = null;
|
|
33
|
+
lastHeartbeatReceivedAt = 0;
|
|
34
|
+
|
|
35
|
+
/**
|
|
36
|
+
* When true, this node is catching up from a snapshot and must not
|
|
37
|
+
* participate in leader elections until fully recovered.
|
|
38
|
+
*/
|
|
39
|
+
isCatchingUp = false;
|
|
40
|
+
|
|
41
|
+
/** Dynamic peer list that can be updated at runtime via membership changes. */
|
|
42
|
+
private dynamicPeers: ClusterPeerConfig[] | null = null;
|
|
32
43
|
|
|
33
44
|
private electionDeadline = 0;
|
|
34
45
|
private electionInFlight = false;
|
|
@@ -40,6 +51,47 @@ export class RaftStateMachine {
|
|
|
40
51
|
private readonly deps: RaftStateMachineDeps
|
|
41
52
|
) {}
|
|
42
53
|
|
|
54
|
+
/**
 * Active peer list: the dynamic list when one has been set,
 * otherwise the peers from the static config.
 */
getPeers(): ClusterPeerConfig[] {
  if (this.dynamicPeers != null) {
    return this.dynamicPeers;
  }
  return this.config.peers;
}
|
|
58
|
+
|
|
59
|
+
/**
 * Installs `peers` as the dynamic peer list. The array is stored by
 * reference, not copied.
 */
setPeers(peers: ClusterPeerConfig[]): void {
  const nextPeers = peers;
  this.dynamicPeers = nextPeers;
}
|
|
63
|
+
|
|
64
|
+
/**
 * Leader lease window in milliseconds: `leader_lease_ms` from the config
 * when set, otherwise three heartbeat intervals.
 */
get leaderLeaseDurationMs(): number {
  const { leader_lease_ms: configuredLeaseMs, heartbeat_interval_ms: heartbeatMs } = this.config;
  return configuredLeaseMs ?? heartbeatMs * 3;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Whether the leader lease currently holds from this node's point of view.
 *
 * A leader always reports true. Any other role reports true only when a
 * heartbeat timestamp has been recorded (non-zero) and less than
 * `leaderLeaseDurationMs` has passed since then.
 */
isLeaderLeaseValid(): boolean {
  const isLeader = this.role === 'leader';
  if (isLeader) return true;

  const lastSeenAt = this.lastHeartbeatReceivedAt;
  // Zero means no heartbeat is on record.
  if (lastSeenAt === 0) return false;

  const elapsedMs = Date.now() - lastSeenAt;
  return elapsedMs < this.leaderLeaseDurationMs;
}
|
|
78
|
+
|
|
79
|
+
/**
 * Returns this node's fencing token, which is simply its current term.
 * Requests carrying a lower token than this are meant to be rejected so
 * that a stale leader cannot write.
 */
getFencingToken(): number {
  const { currentTerm } = this;
  return currentTerm;
}
|
|
86
|
+
|
|
87
|
+
/**
 * Checks a fencing token received from another node.
 *
 * @param token - The remote node's fencing token.
 * @returns true when `token` is greater than or equal to our current term.
 */
validateFencingToken(token: number): boolean {
  const minimumAccepted = this.currentTerm;
  return token >= minimumAccepted;
}
|
|
94
|
+
|
|
43
95
|
initializeRaftStore(hiveDir: string): void {
|
|
44
96
|
if (this.raftStore) return;
|
|
45
97
|
|
|
@@ -69,6 +121,12 @@ export class RaftStateMachine {
|
|
|
69
121
|
this.electionTimer = setInterval(() => {
|
|
70
122
|
if (!this.config.enabled) return;
|
|
71
123
|
if (this.role === 'leader') return;
|
|
124
|
+
// Do not start elections while catching up from a snapshot — the node
|
|
125
|
+
// must not become leader until it has a complete, current state.
|
|
126
|
+
if (this.isCatchingUp) {
|
|
127
|
+
this.resetElectionDeadline();
|
|
128
|
+
return;
|
|
129
|
+
}
|
|
72
130
|
|
|
73
131
|
if (Date.now() >= this.electionDeadline) {
|
|
74
132
|
void this.startElection().catch(error => this.deps.handleBackgroundError(error));
|
|
@@ -105,7 +163,7 @@ export class RaftStateMachine {
|
|
|
105
163
|
|
|
106
164
|
try {
|
|
107
165
|
await Promise.all(
|
|
108
|
-
this.
|
|
166
|
+
this.getPeers()
|
|
109
167
|
.filter(peer => peer.id !== this.config.node_id)
|
|
110
168
|
.map(async peer => {
|
|
111
169
|
const response = await this.deps.postJson<VoteResponse>(
|
|
@@ -195,6 +253,7 @@ export class RaftStateMachine {
|
|
|
195
253
|
this.role = 'follower';
|
|
196
254
|
this.votedFor = null;
|
|
197
255
|
this.leaderId = leaderId;
|
|
256
|
+
this.lastHeartbeatReceivedAt = 0;
|
|
198
257
|
this.resetElectionDeadline();
|
|
199
258
|
this.persistRaftState();
|
|
200
259
|
|
|
@@ -207,7 +266,7 @@ export class RaftStateMachine {
|
|
|
207
266
|
}
|
|
208
267
|
|
|
209
268
|
/**
 * Number of votes that constitutes a majority of the cluster.
 *
 * The cluster size is this node plus every active peer other than itself.
 * Peers whose id equals this node's id are excluded, mirroring the filter
 * used when vote requests are sent out, so a peer list that happens to
 * include this node does not inflate the majority threshold.
 *
 * @returns `floor(clusterSize / 2) + 1`.
 */
quorum(): number {
  const selfId = this.config.node_id;
  const otherPeers = this.getPeers().filter(peer => peer.id !== selfId).length;
  const nodes = otherPeers + 1;
  return Math.floor(nodes / 2) + 1;
}
|
|
213
272
|
|
|
@@ -257,11 +316,24 @@ export class RaftStateMachine {
|
|
|
257
316
|
return this.raftStore?.getState() ?? null;
|
|
258
317
|
}
|
|
259
318
|
|
|
319
|
+
/**
 * Number of log entries reported by the raft store, or 0 when no store
 * has been initialized or the store reports no value.
 */
getLogEntryCount(): number {
  const store = this.raftStore;
  if (store == null) return 0;
  return store.getLogEntryCount() ?? 0;
}
|
|
322
|
+
|
|
323
|
+
/**
 * Takes a snapshot for `versionVector` and then compacts the log.
 *
 * @param versionVector - Version vector passed to the store's snapshot call.
 * @returns The store's compaction result, or an all-zero result when no
 *   raft store has been initialized.
 */
createSnapshotAndCompact(versionVector: VersionVector): CompactionResult {
  const store = this.raftStore;
  if (store) {
    store.createSnapshot(versionVector);
    return store.compactLog();
  }

  return { entries_removed: 0, entries_retained: 0, snapshot_index: 0 };
}
|
|
331
|
+
|
|
260
332
|
/**
 * Resolves the URL of the currently known leader.
 *
 * @returns null when no leader is known, this node's `public_url` when it
 *   is the leader itself, otherwise the URL of the first active peer whose
 *   id matches the leader (null when there is no such peer or its URL is
 *   empty).
 */
getLeaderUrl(): string | null {
  const leader = this.leaderId;
  if (!leader) return null;
  if (leader === this.config.node_id) return this.config.public_url;

  for (const candidate of this.getPeers()) {
    if (candidate.id === leader) {
      return candidate.url || null;
    }
  }
  return null;
}
|
|
267
339
|
}
|