hungry-ghost-hive 0.45.0 → 0.46.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. package/dist/cli/commands/cluster.d.ts.map +1 -1
  2. package/dist/cli/commands/cluster.js +348 -1
  3. package/dist/cli/commands/cluster.js.map +1 -1
  4. package/dist/cli/commands/cluster.test.js +313 -9
  5. package/dist/cli/commands/cluster.test.js.map +1 -1
  6. package/dist/cli/commands/req-spawn.test.d.ts +2 -0
  7. package/dist/cli/commands/req-spawn.test.d.ts.map +1 -0
  8. package/dist/cli/commands/req-spawn.test.js +116 -0
  9. package/dist/cli/commands/req-spawn.test.js.map +1 -0
  10. package/dist/cli/commands/req.d.ts.map +1 -1
  11. package/dist/cli/commands/req.js +21 -13
  12. package/dist/cli/commands/req.js.map +1 -1
  13. package/dist/cluster/cluster-http-server.d.ts +32 -0
  14. package/dist/cluster/cluster-http-server.d.ts.map +1 -1
  15. package/dist/cluster/cluster-http-server.js +42 -0
  16. package/dist/cluster/cluster-http-server.js.map +1 -1
  17. package/dist/cluster/distributed-runtime-coverage.test.js +9 -0
  18. package/dist/cluster/distributed-runtime-coverage.test.js.map +1 -1
  19. package/dist/cluster/distributed-system.test.js +135 -0
  20. package/dist/cluster/distributed-system.test.js.map +1 -1
  21. package/dist/cluster/events.d.ts +23 -0
  22. package/dist/cluster/events.d.ts.map +1 -1
  23. package/dist/cluster/events.js +74 -0
  24. package/dist/cluster/events.js.map +1 -1
  25. package/dist/cluster/heartbeat-manager.d.ts +2 -0
  26. package/dist/cluster/heartbeat-manager.d.ts.map +1 -1
  27. package/dist/cluster/heartbeat-manager.js +42 -6
  28. package/dist/cluster/heartbeat-manager.js.map +1 -1
  29. package/dist/cluster/membership.test.d.ts +2 -0
  30. package/dist/cluster/membership.test.d.ts.map +1 -0
  31. package/dist/cluster/membership.test.js +416 -0
  32. package/dist/cluster/membership.test.js.map +1 -0
  33. package/dist/cluster/partition-safety.test.d.ts +2 -0
  34. package/dist/cluster/partition-safety.test.d.ts.map +1 -0
  35. package/dist/cluster/partition-safety.test.js +440 -0
  36. package/dist/cluster/partition-safety.test.js.map +1 -0
  37. package/dist/cluster/raft-state-machine.d.ts +33 -1
  38. package/dist/cluster/raft-state-machine.d.ts.map +1 -1
  39. package/dist/cluster/raft-state-machine.js +65 -3
  40. package/dist/cluster/raft-state-machine.js.map +1 -1
  41. package/dist/cluster/raft-store.d.ts +26 -1
  42. package/dist/cluster/raft-store.d.ts.map +1 -1
  43. package/dist/cluster/raft-store.js +137 -0
  44. package/dist/cluster/raft-store.js.map +1 -1
  45. package/dist/cluster/replication-lag.test.d.ts +2 -0
  46. package/dist/cluster/replication-lag.test.d.ts.map +1 -0
  47. package/dist/cluster/replication-lag.test.js +239 -0
  48. package/dist/cluster/replication-lag.test.js.map +1 -0
  49. package/dist/cluster/replication.d.ts +2 -2
  50. package/dist/cluster/replication.d.ts.map +1 -1
  51. package/dist/cluster/replication.js +1 -1
  52. package/dist/cluster/replication.js.map +1 -1
  53. package/dist/cluster/runtime.d.ts +78 -0
  54. package/dist/cluster/runtime.d.ts.map +1 -1
  55. package/dist/cluster/runtime.js +400 -13
  56. package/dist/cluster/runtime.js.map +1 -1
  57. package/dist/cluster/state-recovery.test.d.ts +2 -0
  58. package/dist/cluster/state-recovery.test.d.ts.map +1 -0
  59. package/dist/cluster/state-recovery.test.js +310 -0
  60. package/dist/cluster/state-recovery.test.js.map +1 -0
  61. package/dist/cluster/types.d.ts +30 -0
  62. package/dist/cluster/types.d.ts.map +1 -1
  63. package/dist/config/schema.d.ts +48 -0
  64. package/dist/config/schema.d.ts.map +1 -1
  65. package/dist/config/schema.js +11 -0
  66. package/dist/config/schema.js.map +1 -1
  67. package/dist/context-files/generator.js +1 -1
  68. package/dist/context-files/generator.js.map +1 -1
  69. package/dist/context-files/generator.test.js +51 -0
  70. package/dist/context-files/generator.test.js.map +1 -1
  71. package/dist/orchestrator/orphan-recovery.d.ts +1 -1
  72. package/dist/orchestrator/orphan-recovery.d.ts.map +1 -1
  73. package/dist/orchestrator/orphan-recovery.js +4 -4
  74. package/dist/orchestrator/orphan-recovery.js.map +1 -1
  75. package/dist/orchestrator/prompt-templates.d.ts +3 -1
  76. package/dist/orchestrator/prompt-templates.d.ts.map +1 -1
  77. package/dist/orchestrator/prompt-templates.js +45 -8
  78. package/dist/orchestrator/prompt-templates.js.map +1 -1
  79. package/dist/orchestrator/prompt-templates.test.js +210 -0
  80. package/dist/orchestrator/prompt-templates.test.js.map +1 -1
  81. package/dist/orchestrator/scheduler.d.ts +1 -0
  82. package/dist/orchestrator/scheduler.d.ts.map +1 -1
  83. package/dist/orchestrator/scheduler.js +15 -10
  84. package/dist/orchestrator/scheduler.js.map +1 -1
  85. package/dist/orchestrator/scheduler.test.js +97 -6
  86. package/dist/orchestrator/scheduler.test.js.map +1 -1
  87. package/package.json +1 -1
  88. package/src/cli/commands/cluster.test.ts +387 -9
  89. package/src/cli/commands/cluster.ts +486 -1
  90. package/src/cli/commands/req-spawn.test.ts +153 -0
  91. package/src/cli/commands/req.ts +31 -18
  92. package/src/cluster/cluster-http-server.ts +80 -0
  93. package/src/cluster/distributed-runtime-coverage.test.ts +9 -0
  94. package/src/cluster/distributed-system.test.ts +168 -0
  95. package/src/cluster/events.ts +90 -0
  96. package/src/cluster/heartbeat-manager.ts +48 -6
  97. package/src/cluster/membership.test.ts +498 -0
  98. package/src/cluster/partition-safety.test.ts +523 -0
  99. package/src/cluster/raft-state-machine.ts +76 -4
  100. package/src/cluster/raft-store.ts +167 -1
  101. package/src/cluster/replication-lag.test.ts +284 -0
  102. package/src/cluster/replication.ts +6 -0
  103. package/src/cluster/runtime.ts +551 -12
  104. package/src/cluster/state-recovery.test.ts +420 -0
  105. package/src/cluster/types.ts +32 -0
  106. package/src/config/schema.ts +11 -0
  107. package/src/context-files/generator.test.ts +55 -0
  108. package/src/context-files/generator.ts +5 -5
  109. package/src/orchestrator/orphan-recovery.ts +32 -13
  110. package/src/orchestrator/prompt-templates.test.ts +263 -0
  111. package/src/orchestrator/prompt-templates.ts +49 -8
  112. package/src/orchestrator/scheduler.test.ts +129 -6
  113. package/src/orchestrator/scheduler.ts +46 -20
@@ -0,0 +1,523 @@
1
+ // Licensed under the Hungry Ghost Hive License. See LICENSE.
2
+
3
+ import { mkdirSync, mkdtempSync, rmSync } from 'fs';
4
+ import { createServer as createNetServer } from 'net';
5
+ import { tmpdir } from 'os';
6
+ import { join } from 'path';
7
+ import { afterEach, describe, expect, it } from 'vitest';
8
+ import type { ClusterConfig } from '../config/schema.js';
9
+ import { ClusterRuntime } from './runtime.js';
10
+
11
/** Everything a test needs to talk to, inspect, and later tear down one started runtime. */
interface RuntimeFixture {
  /** Temporary directory created for this fixture; removed during cleanup. */
  root: string;
  /** The `.hive` directory inside `root` that is handed to the runtime. */
  hiveDir: string;
  /** Cluster configuration the runtime was constructed with. */
  config: ClusterConfig;
  /** The started runtime instance. */
  runtime: ClusterRuntime;
}
17
+
18
/** Temporary directories created by fixtures during the current test. */
const tempRoots: string[] = [];
/** Runtimes started by fixtures during the current test. */
const activeRuntimes: ClusterRuntime[] = [];

afterEach(async () => {
  // Empty both registries up front so the next test always starts from a clean slate.
  const runtimesToStop = activeRuntimes.splice(0, activeRuntimes.length);
  for (const instance of runtimesToStop) {
    try {
      await instance.stop();
    } catch {
      // Best effort shutdown for test cleanup.
    }
  }

  const directoriesToRemove = tempRoots.splice(0, tempRoots.length);
  directoriesToRemove.forEach(directory => {
    rmSync(directory, { recursive: true, force: true });
  });
});
34
+
35
describe('fencing token validation', () => {
  const heartbeatPath = '/cluster/v1/election/heartbeat';
  const requestVotePath = '/cluster/v1/election/request-vote';
  // Every test in this suite uses the same fixed 2000 ms election timeout.
  const fixedElectionTimeout = { election_timeout_min_ms: 2000, election_timeout_max_ms: 2000 };

  /** Posts a heartbeat; the fencing token defaults to the heartbeat's own term. */
  const sendHeartbeat = (baseUrl: string, term: number, leaderId: string, fencingToken = term) =>
    postJson(baseUrl, heartbeatPath, {
      term,
      leader_id: leaderId,
      fencing_token: fencingToken,
    });

  /** Posts a raw delta request so the HTTP status can be asserted alongside the body. */
  const requestDelta = (baseUrl: string, payload: Record<string, unknown>) =>
    fetch(`${baseUrl}/cluster/v1/events/delta`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
    });

  it('rejects heartbeats with fencing_token lower than term', async () => {
    if (!(await canListenOnLocalhost())) return;

    const { config } = await startRuntimeFixture({
      node_id: 'node-fence-reject',
      ...fixedElectionTimeout,
    });

    // A vote request for term 5 is sent first to move the node's term forward.
    await postJson(config.public_url, requestVotePath, {
      term: 5,
      candidate_id: 'candidate-5',
    });

    // The heartbeat carries the current term but a fencing token that lags behind it.
    const reply = await sendHeartbeat(config.public_url, 5, 'candidate-5', 3);

    expect(reply.success).toBe(false);
    expect(reply.fencing_token).toBe(5);
  });

  it('accepts heartbeats with valid fencing_token', async () => {
    if (!(await canListenOnLocalhost())) return;

    const { config } = await startRuntimeFixture({
      node_id: 'node-fence-accept',
      ...fixedElectionTimeout,
    });

    const reply = await sendHeartbeat(config.public_url, 3, 'leader-3');

    expect(reply.success).toBe(true);
    expect(reply.fencing_token).toBe(3);
  });

  it('rejects delta requests with stale fencing_token', async () => {
    if (!(await canListenOnLocalhost())) return;

    const { config } = await startRuntimeFixture({
      node_id: 'node-delta-fence',
      ...fixedElectionTimeout,
    });

    // Move the node to term 10 before asking for a delta.
    await sendHeartbeat(config.public_url, 10, 'leader-10');

    // The delta request is tagged with an older token than the node's term.
    const response = await requestDelta(config.public_url, {
      version_vector: {},
      fencing_token: 5,
    });

    expect(response.status).toBe(409);
    const payload = (await response.json()) as { error: string; fencing_token: number };
    expect(payload.error).toContain('stale leader epoch');
    expect(payload.fencing_token).toBe(10);
  });

  it('accepts delta requests with current fencing_token', async () => {
    if (!(await canListenOnLocalhost())) return;

    const { config } = await startRuntimeFixture({
      node_id: 'node-delta-fence-ok',
      ...fixedElectionTimeout,
    });

    // Move the node to term 4.
    await sendHeartbeat(config.public_url, 4, 'leader-4');

    // The delta request carries a token equal to that term.
    const response = await requestDelta(config.public_url, {
      version_vector: {},
      fencing_token: 4,
    });

    expect(response.status).toBe(200);
    const payload = (await response.json()) as { fencing_token: number };
    expect(payload.fencing_token).toBe(4);
  });

  it('accepts delta requests without fencing_token for backward compatibility', async () => {
    if (!(await canListenOnLocalhost())) return;

    const { config } = await startRuntimeFixture({
      node_id: 'node-delta-no-fence',
      ...fixedElectionTimeout,
    });

    const response = await requestDelta(config.public_url, {
      version_vector: {},
    });

    expect(response.status).toBe(200);
  });

  it('returns fencing_token in status endpoint', async () => {
    if (!(await canListenOnLocalhost())) return;

    const { config, runtime } = await startRuntimeFixture({
      node_id: 'node-status-fence',
      ...fixedElectionTimeout,
    });

    // Move the node to term 7, then read the status directly from the runtime.
    await sendHeartbeat(config.public_url, 7, 'leader-7');

    const snapshot = runtime.getStatus();
    expect(snapshot.fencing_token).toBe(7);
    expect(snapshot.term).toBe(7);
  });
});
185
+
186
describe('leader lease validation', () => {
  const heartbeatPath = '/cluster/v1/election/heartbeat';
  const requestVotePath = '/cluster/v1/election/request-vote';

  /** Posts a heartbeat whose fencing token equals its term. */
  const sendHeartbeat = (baseUrl: string, term: number, leaderId: string) =>
    postJson(baseUrl, heartbeatPath, {
      term,
      leader_id: leaderId,
      fencing_token: term,
    });

  /** Reads the lease flag from a fresh status snapshot. */
  const leaseIsValid = (fixture: RuntimeFixture) => fixture.runtime.getStatus().leader_lease_valid;

  it('reports lease invalid when no heartbeat has been received', async () => {
    if (!(await canListenOnLocalhost())) return;

    const fixture = await startRuntimeFixture({
      node_id: 'node-lease-none',
      election_timeout_min_ms: 2000,
      election_timeout_max_ms: 2000,
    });

    expect(leaseIsValid(fixture)).toBe(false);
  });

  it('reports lease valid immediately after receiving heartbeat', async () => {
    if (!(await canListenOnLocalhost())) return;

    const fixture = await startRuntimeFixture({
      node_id: 'node-lease-fresh',
      election_timeout_min_ms: 2000,
      election_timeout_max_ms: 2000,
      heartbeat_interval_ms: 100,
    });

    // One heartbeat, then check the lease straight away.
    await sendHeartbeat(fixture.config.public_url, 2, 'leader-2');

    expect(leaseIsValid(fixture)).toBe(true);
  });

  it('leader always reports lease valid', async () => {
    if (!(await canListenOnLocalhost())) return;

    const fixture = await startRuntimeFixture({
      node_id: 'node-lease-leader',
      election_timeout_min_ms: 80,
      election_timeout_max_ms: 120,
      heartbeat_interval_ms: 60,
    });

    // Block until this node reports itself as leader.
    await waitFor(() => fixture.runtime.getStatus().is_leader, 4000);
    expect(leaseIsValid(fixture)).toBe(true);
  });

  it('reports lease expired after timeout elapses without heartbeat', async () => {
    if (!(await canListenOnLocalhost())) return;

    const leaseMs = 150;
    const fixture = await startRuntimeFixture({
      node_id: 'node-lease-expire',
      election_timeout_min_ms: 5000,
      election_timeout_max_ms: 5000,
      heartbeat_interval_ms: 50,
      leader_lease_ms: leaseMs,
    });

    // A single heartbeat establishes the lease.
    await sendHeartbeat(fixture.config.public_url, 1, 'leader-1');

    expect(leaseIsValid(fixture)).toBe(true);

    // Let the lease window pass (plus a 50 ms margin) with no further heartbeat.
    await new Promise(resolve => setTimeout(resolve, leaseMs + 50));

    expect(leaseIsValid(fixture)).toBe(false);
  });

  it('reports correct leader_lease_duration_ms from config', async () => {
    if (!(await canListenOnLocalhost())) return;

    const fixture = await startRuntimeFixture({
      node_id: 'node-lease-config',
      heartbeat_interval_ms: 200,
      leader_lease_ms: 1000,
    });

    const { leader_lease_duration_ms: reportedLease } = fixture.runtime.getStatus();
    expect(reportedLease).toBe(1000);
  });

  it('defaults leader_lease_duration_ms to 3x heartbeat_interval_ms', async () => {
    if (!(await canListenOnLocalhost())) return;

    const fixture = await startRuntimeFixture({
      node_id: 'node-lease-default',
      heartbeat_interval_ms: 200,
    });

    const { leader_lease_duration_ms: reportedLease } = fixture.runtime.getStatus();
    expect(reportedLease).toBe(600);
  });

  it('resets lease on step-down from higher term', async () => {
    if (!(await canListenOnLocalhost())) return;

    const fixture = await startRuntimeFixture({
      node_id: 'node-lease-stepdown',
      election_timeout_min_ms: 5000,
      election_timeout_max_ms: 5000,
      heartbeat_interval_ms: 100,
    });

    // The lease is first established at term 2.
    await sendHeartbeat(fixture.config.public_url, 2, 'leader-2');

    expect(leaseIsValid(fixture)).toBe(true);

    // A vote request for the higher term 5 should make the node step down and drop the lease.
    await postJson(fixture.config.public_url, requestVotePath, {
      term: 5,
      candidate_id: 'candidate-5',
    });

    // No heartbeat has arrived from a term-5 leader yet, so the lease must read invalid.
    expect(leaseIsValid(fixture)).toBe(false);
  });
});
315
+
316
describe('partition healing scenarios', () => {
  const heartbeatPath = '/cluster/v1/election/heartbeat';

  it('stale leader is fenced after partition heals', async () => {
    if (!(await canListenOnLocalhost())) return;

    const portA = await getFreePort();
    const portB = await getFreePort();
    const urlFor = (port: number) => `http://127.0.0.1:${port}`;
    const fastElection = {
      election_timeout_min_ms: 80,
      election_timeout_max_ms: 120,
      heartbeat_interval_ms: 60,
    };

    // Two nodes, each configured with the other as its only peer.
    const configA = await buildConfig({
      node_id: 'node-a-heal',
      listen_port: portA,
      public_url: urlFor(portA),
      peers: [{ id: 'node-b-heal', url: urlFor(portB) }],
      ...fastElection,
    });
    const configB = await buildConfig({
      node_id: 'node-b-heal',
      listen_port: portB,
      public_url: urlFor(portB),
      peers: [{ id: 'node-a-heal', url: urlFor(portA) }],
      ...fastElection,
    });

    const fixtureA = await startRuntimeWithConfig(configA);
    const fixtureB = await startRuntimeWithConfig(configB);

    // Block until either node reports itself as leader.
    const someoneLeads = () =>
      fixtureA.runtime.getStatus().is_leader || fixtureB.runtime.getStatus().is_leader;
    await waitFor(someoneLeads, 4000);

    const statusA = fixtureA.runtime.getStatus();
    const statusB = fixtureB.runtime.getStatus();

    // At most one of the two sampled statuses may claim leadership.
    const claimingLeadership = [statusA, statusB].filter(status => status.is_leader);
    expect(claimingLeadership.length).toBeLessThanOrEqual(1);

    // Both nodes must expose a non-negative fencing token.
    expect(statusA.fencing_token).toBeGreaterThanOrEqual(0);
    expect(statusB.fencing_token).toBeGreaterThanOrEqual(0);
  });

  it('follower rejects stale leader heartbeat after seeing higher term', async () => {
    if (!(await canListenOnLocalhost())) return;

    const { config, runtime } = await startRuntimeFixture({
      node_id: 'node-heal-reject',
      election_timeout_min_ms: 5000,
      election_timeout_max_ms: 5000,
    });

    // The node first hears from the new leader at term 10.
    await postJson(config.public_url, heartbeatPath, {
      term: 10,
      leader_id: 'new-leader',
      fencing_token: 10,
    });

    // The old leader, still at term 5, then sends its heartbeat.
    const staleReply = await postJson(config.public_url, heartbeatPath, {
      term: 5,
      leader_id: 'old-leader',
      fencing_token: 5,
    });

    expect(staleReply.success).toBe(false);
    expect(staleReply.fencing_token).toBe(10);

    // The node must still follow the term-10 leader.
    const snapshot = runtime.getStatus();
    expect(snapshot.leader_id).toBe('new-leader');
    expect(snapshot.term).toBe(10);
  });
});
396
+
397
+ // --- Test helpers ---
398
+
399
/**
 * Builds a config from `overrides` and starts a runtime with it.
 *
 * When no `listen_port` override is given, a start that fails with
 * `EADDRINUSE` is retried with a freshly built config, up to five attempts in
 * total. Any other failure, or any failure with a pinned port, is rethrown
 * immediately.
 *
 * @param overrides Config fields that replace the test defaults.
 * @returns The started fixture.
 */
async function startRuntimeFixture(
  overrides: Partial<ClusterConfig> = {}
): Promise<RuntimeFixture> {
  const portIsPinned = Boolean(overrides.listen_port);
  const maxAttempts = portIsPinned ? 1 : 5;
  let lastError: unknown;

  let attempt = 0;
  while (attempt < maxAttempts) {
    attempt += 1;
    const config = await buildConfig(overrides);
    try {
      return await startRuntimeWithConfig(config);
    } catch (error) {
      lastError = error;
      const portCollision =
        !portIsPinned && (error as NodeJS.ErrnoException).code === 'EADDRINUSE';
      if (!portCollision) {
        throw error;
      }
      // Port was taken between probing and binding; loop to try another one.
    }
  }

  throw lastError instanceof Error ? lastError : new Error('Failed to start runtime fixture');
}
421
+
422
/**
 * Creates a temporary `.hive` directory, constructs a runtime on it, and
 * starts the runtime.
 *
 * On success the runtime and its temp directory are registered for the
 * per-test cleanup. On a failed start the runtime is stopped (best effort),
 * the temp directory is deleted, and the start error is rethrown.
 *
 * @param config Complete cluster configuration for the runtime.
 * @returns The started fixture.
 */
async function startRuntimeWithConfig(config: ClusterConfig): Promise<RuntimeFixture> {
  const tempPrefix = join(tmpdir(), `hive-partition-safety-${config.node_id}-`);
  const root = mkdtempSync(tempPrefix);
  const hiveDir = join(root, '.hive');
  mkdirSync(hiveDir, { recursive: true });

  const runtime = new ClusterRuntime(config, { hiveDir });

  try {
    await runtime.start();
  } catch (startError) {
    try {
      await runtime.stop();
    } catch {
      // Best effort cleanup for partial starts.
    }
    rmSync(root, { recursive: true, force: true });
    throw startError;
  }

  activeRuntimes.push(runtime);
  tempRoots.push(root);
  return { root, hiveDir, config, runtime };
}
444
+
445
/**
 * Produces a full cluster config for a single local test node.
 *
 * The listen port comes from `overrides.listen_port` when set, otherwise from
 * a freshly probed free port; the default `public_url` is derived from that
 * port. All other override fields are spread over the defaults. A falsy
 * `public_url` or `peers` override falls back to the default value.
 *
 * @param overrides Config fields that replace the test defaults.
 * @returns The merged configuration.
 */
async function buildConfig(overrides: Partial<ClusterConfig> = {}): Promise<ClusterConfig> {
  const listenPort = overrides.listen_port ?? (await getFreePort());
  const defaultPublicUrl = `http://127.0.0.1:${listenPort}`;

  const defaults: ClusterConfig = {
    enabled: true,
    node_id: 'node-test',
    listen_host: '127.0.0.1',
    listen_port: listenPort,
    public_url: defaultPublicUrl,
    peers: [],
    heartbeat_interval_ms: 100,
    election_timeout_min_ms: 150,
    election_timeout_max_ms: 250,
    sync_interval_ms: 200,
    request_timeout_ms: 600,
    story_similarity_threshold: 0.8,
  };

  const publicUrl = overrides.public_url || defaults.public_url;
  const peers = overrides.peers || defaults.peers;

  return {
    ...defaults,
    ...overrides,
    public_url: publicUrl,
    peers,
  };
}
469
+
470
/**
 * POSTs `body` as JSON to `baseUrl + path` and returns the parsed JSON reply.
 *
 * The HTTP status is deliberately not inspected: the reply body is parsed and
 * returned whatever the status code was.
 *
 * @param baseUrl Origin of the node under test.
 * @param path Endpoint path, starting with `/`.
 * @param body JSON-serialisable request payload.
 * @returns The parsed response body.
 */
async function postJson(
  baseUrl: string,
  path: string,
  body: Record<string, unknown>
): Promise<Record<string, any>> {
  const target = baseUrl + path;
  const serialized = JSON.stringify(body);

  const response = await fetch(target, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: serialized,
  });

  const parsed = await response.json();
  return parsed as Record<string, any>;
}
483
+
484
/**
 * Polls `predicate` until it returns true or `timeoutMs` has elapsed.
 *
 * The predicate is always evaluated at least once, even when `timeoutMs` is
 * zero or negative. The final sleep is capped at the deadline and followed by
 * one last evaluation, so a condition that becomes true just before the
 * deadline is not reported as a timeout.
 *
 * @param predicate Synchronous condition to wait for.
 * @param timeoutMs Maximum time to wait, in milliseconds.
 * @param pollIntervalMs Delay between evaluations, in milliseconds (default 25).
 * @throws Error with message `Timed out waiting for condition` when the
 *   deadline passes without the predicate returning true.
 */
async function waitFor(
  predicate: () => boolean,
  timeoutMs: number,
  pollIntervalMs = 25
): Promise<void> {
  const deadline = Date.now() + timeoutMs;

  for (;;) {
    if (predicate()) return;

    const remainingMs = deadline - Date.now();
    // Written as a negated `>` so that a NaN timeout ends the wait instead of looping forever.
    if (!(remainingMs > 0)) break;

    const sleepMs = Math.min(pollIntervalMs, remainingMs);
    await new Promise(resolve => setTimeout(resolve, sleepMs));
  }

  throw new Error('Timed out waiting for condition');
}
492
+
493
/**
 * Asks the OS for an unused TCP port on 127.0.0.1 by binding a throwaway
 * server to port 0, reading the assigned port, and closing the server again.
 *
 * @returns The port number that was assigned to the probe server.
 */
async function getFreePort(): Promise<number> {
  const probe = createNetServer();

  return new Promise<number>((resolve, reject) => {
    probe.once('error', reject);

    probe.listen(0, '127.0.0.1', () => {
      const bound = probe.address();
      const hasPort = Boolean(bound) && typeof bound !== 'string';

      if (!hasPort) {
        probe.close(() => reject(new Error('Failed to allocate free port')));
        return;
      }

      const { port } = bound as { port: number };
      // Report the port only once the probe has released it.
      probe.close(closeError => {
        if (closeError) {
          reject(closeError);
        } else {
          resolve(port);
        }
      });
    });
  });
}
515
+
516
/**
 * Reports whether a free port on 127.0.0.1 can be obtained in this
 * environment. Tests call this first and return early when it is false.
 *
 * @returns `true` when the port probe succeeds, `false` when it fails.
 */
async function canListenOnLocalhost(): Promise<boolean> {
  return getFreePort().then(
    () => true,
    () => false
  );
}
@@ -2,8 +2,9 @@
2
2
 
3
3
  import { join } from 'path';
4
4
  import type { ClusterConfig, ClusterPeerConfig } from '../config/schema.js';
5
- import type { DurableLogEntryType } from './raft-store.js';
5
+ import type { CompactionResult, DurableLogEntryType } from './raft-store.js';
6
6
  import { RaftMetadataStore } from './raft-store.js';
7
+ import type { VersionVector } from './types.js';
7
8
 
8
9
  type NodeRole = 'leader' | 'follower' | 'candidate';
9
10
 
@@ -29,6 +30,16 @@ export class RaftStateMachine {
29
30
  currentTerm = 0;
30
31
  votedFor: string | null = null;
31
32
  leaderId: string | null = null;
33
+ lastHeartbeatReceivedAt = 0;
34
+
35
+ /**
36
+ * When true, this node is catching up from a snapshot and must not
37
+ * participate in leader elections until fully recovered.
38
+ */
39
+ isCatchingUp = false;
40
+
41
+ /** Dynamic peer list that can be updated at runtime via membership changes. */
42
+ private dynamicPeers: ClusterPeerConfig[] | null = null;
32
43
 
33
44
  private electionDeadline = 0;
34
45
  private electionInFlight = false;
@@ -40,6 +51,47 @@ export class RaftStateMachine {
40
51
  private readonly deps: RaftStateMachineDeps
41
52
  ) {}
42
53
 
54
/**
 * Returns the peer list currently in effect: the dynamic list when one has
 * been set, otherwise the peers from the static config.
 */
getPeers(): ClusterPeerConfig[] {
  const runtimePeers = this.dynamicPeers;
  if (runtimePeers == null) {
    return this.config.peers;
  }
  return runtimePeers;
}
58
+
59
/**
 * Installs `peers` as the dynamic peer list, replacing any previous one.
 * The array is stored by reference, not copied.
 */
setPeers(peers: ClusterPeerConfig[]): void {
  this.dynamicPeers = peers;
}
63
+
64
/**
 * Leader lease window in milliseconds: `leader_lease_ms` from the config when
 * it is set, otherwise three times `heartbeat_interval_ms`.
 */
get leaderLeaseDurationMs(): number {
  const configuredLease = this.config.leader_lease_ms;
  if (configuredLease == null) {
    return this.config.heartbeat_interval_ms * 3;
  }
  return configuredLease;
}
68
+
69
/**
 * Reports whether the leader lease is currently valid.
 *
 * A leader always reports true. Any other role reports true only when a
 * heartbeat timestamp has been recorded and less than the lease window has
 * elapsed since then.
 */
isLeaderLeaseValid(): boolean {
  if (this.role === 'leader') {
    return true;
  }

  const lastSeen = this.lastHeartbeatReceivedAt;
  if (lastSeen === 0) {
    // Zero is the "no heartbeat recorded" marker.
    return false;
  }

  const elapsedMs = Date.now() - lastSeen;
  return elapsedMs < this.leaderLeaseDurationMs;
}
78
+
79
/**
 * Returns this node's fencing token, which is simply its current Raft term.
 * Operations tagged with a lower term than ours must be rejected to prevent
 * stale-leader writes.
 */
getFencingToken(): number {
  const { currentTerm } = this;
  return currentTerm;
}
86
+
87
/**
 * Checks a fencing token received from a remote node.
 *
 * @param token Fencing token supplied by the remote node.
 * @returns true when `token` is greater than or equal to this node's current
 *   term, false otherwise.
 */
validateFencingToken(token: number): boolean {
  const localTerm = this.currentTerm;
  return token >= localTerm;
}
94
+
43
95
  initializeRaftStore(hiveDir: string): void {
44
96
  if (this.raftStore) return;
45
97
 
@@ -69,6 +121,12 @@ export class RaftStateMachine {
69
121
  this.electionTimer = setInterval(() => {
70
122
  if (!this.config.enabled) return;
71
123
  if (this.role === 'leader') return;
124
+ // Do not start elections while catching up from a snapshot — the node
125
+ // must not become leader until it has a complete, current state.
126
+ if (this.isCatchingUp) {
127
+ this.resetElectionDeadline();
128
+ return;
129
+ }
72
130
 
73
131
  if (Date.now() >= this.electionDeadline) {
74
132
  void this.startElection().catch(error => this.deps.handleBackgroundError(error));
@@ -105,7 +163,7 @@ export class RaftStateMachine {
105
163
 
106
164
  try {
107
165
  await Promise.all(
108
- this.config.peers
166
+ this.getPeers()
109
167
  .filter(peer => peer.id !== this.config.node_id)
110
168
  .map(async peer => {
111
169
  const response = await this.deps.postJson<VoteResponse>(
@@ -195,6 +253,7 @@ export class RaftStateMachine {
195
253
  this.role = 'follower';
196
254
  this.votedFor = null;
197
255
  this.leaderId = leaderId;
256
+ this.lastHeartbeatReceivedAt = 0;
198
257
  this.resetElectionDeadline();
199
258
  this.persistRaftState();
200
259
 
@@ -207,7 +266,7 @@ export class RaftStateMachine {
207
266
  }
208
267
 
209
268
  quorum(): number {
210
- const nodes = this.config.peers.length + 1;
269
+ const nodes = this.getPeers().length + 1;
211
270
  return Math.floor(nodes / 2) + 1;
212
271
  }
213
272
 
@@ -257,11 +316,24 @@ export class RaftStateMachine {
257
316
  return this.raftStore?.getState() ?? null;
258
317
  }
259
318
 
319
/** Returns the log entry count reported by the Raft store, or 0 when no store is initialised. */
getLogEntryCount(): number {
  const store = this.raftStore;
  return store?.getLogEntryCount() ?? 0;
}
322
+
323
/**
 * Creates a snapshot for `versionVector` in the Raft store and then compacts
 * the log, returning the store's compaction result.
 *
 * When no store is initialised nothing is done and an all-zero result is
 * returned.
 */
createSnapshotAndCompact(versionVector: VersionVector): CompactionResult {
  const store = this.raftStore;
  if (store) {
    // The snapshot is created first, then the log is compacted.
    store.createSnapshot(versionVector);
    return store.compactLog();
  }

  return { entries_removed: 0, entries_retained: 0, snapshot_index: 0 };
}
331
+
260
332
  getLeaderUrl(): string | null {
261
333
  if (!this.leaderId) return null;
262
334
  if (this.leaderId === this.config.node_id) return this.config.public_url;
263
335
 
264
- const peer = this.config.peers.find(item => item.id === this.leaderId);
336
+ const peer = this.getPeers().find(item => item.id === this.leaderId);
265
337
  return peer?.url || null;
266
338
  }
267
339
  }