@olane/o-leader 0.7.13-alpha.0 → 0.7.13-alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,791 @@
1
+ import { expect } from 'chai';
2
+ import { TestLeaderNode } from './helpers/test-leader.node.js';
3
+ import { TestLaneTool } from './helpers/test-lane.tool.js';
4
+ import { NodeState, NodeType, oAddress, } from '@olane/o-core';
5
+ import { oNodeAddress } from '@olane/o-node';
6
+ // Current issue: we are routing what seems to be directly to the node instead of the router via the test suite.
7
+ /**
8
+ * CRITICAL TEST SUITE: Node Failure and Reconnection Scenarios (8.4)
9
+ *
10
+ * This test suite validates the distributed system's resilience during node failures.
11
+ * Every node maintains dual references (parent + leader), which enables sophisticated
12
+ * graph state reconstruction during failures.
13
+ *
14
+ * Test Priority: CRITICAL - Essential for understanding graph state management
15
+ */
16
+ describe('Leader Reconnection Tests - CRITICAL (Section 8.4)', () => {
17
+ let leaderNode;
18
+ let parentNode; // Middleware parent
19
+ let childNode; // Leaf child
20
+ let clientNode; // Independent client (no parent, no leader)
21
/**
 * Helper: bring up the Leader → Parent → Child hierarchy shared by the tests.
 *
 * Nodes are started strictly top-down. Each one is asserted to be RUNNING at
 * its expected address before the next tier is created, so a failure at any
 * level surfaces immediately. The created nodes are stored in the suite-level
 * `leaderNode`, `parentNode` and `childNode` variables.
 */
async function createThreeTierHierarchy() {
    // Every node in this suite listens on the same loopback websocket multiaddr.
    const loopbackNetwork = () => ({ listeners: ['/ip4/127.0.0.1/tcp/0/ws'] });
    // Start a node, then check its state and the address it ended up with.
    const startAndVerify = async (node, expectedAddress) => {
        await node.start();
        expect(node.state).to.equal(NodeState.RUNNING);
        expect(node.address.toString()).to.equal(expectedAddress);
    };
    // Tier 1: the leader has neither a leader nor a parent of its own.
    leaderNode = new TestLeaderNode({
        type: NodeType.LEADER,
        leader: null,
        parent: null,
        systemName: 'reconnection-test-network',
        network: loopbackNetwork(),
    });
    await startAndVerify(leaderNode, 'o://leader');
    // Tier 2: the middleware parent uses the leader as both leader and parent.
    parentNode = new TestLaneTool({
        address: new oNodeAddress('o://parent'),
        leader: leaderNode.address,
        parent: leaderNode.address,
        description: 'Middleware parent node for reconnection testing',
        network: loopbackNetwork(),
    });
    await startAndVerify(parentNode, 'o://leader/parent');
    // Tier 3: the leaf child hangs off the parent but still references the leader.
    childNode = new TestLaneTool({
        address: new oNodeAddress('o://child'),
        leader: leaderNode.address,
        parent: parentNode.address,
        description: 'Leaf child node for reconnection testing',
        network: loopbackNetwork(),
    });
    await startAndVerify(childNode, 'o://leader/parent/child');
    // Fixed pause to let registration finish before the test body runs.
    const registrationSettleMs = 2000;
    await new Promise((resolve) => {
        setTimeout(resolve, registrationSettleMs);
    });
}
68
/**
 * Helper: assert dual-reference integrity for the child and parent nodes.
 *
 * For each node, both `parent` and `leader` must be non-null and must
 * stringify to the expected address. The child is checked first, then the
 * parent (whose parent and leader are both the leader).
 */
function verifyDualReferences() {
    const expectedReferences = [
        { node: childNode, parent: 'o://leader/parent', leader: 'o://leader' },
        { node: parentNode, parent: 'o://leader', leader: 'o://leader' },
    ];
    for (const { node, parent, leader } of expectedReferences) {
        // Null checks run first so a missing reference fails with a clear
        // assertion instead of a TypeError from toString().
        expect(node.parent).to.not.be.null;
        expect(node.leader).to.not.be.null;
        expect(node.parent.toString()).to.equal(parent);
        expect(node.leader.toString()).to.equal(leader);
    }
}
84
/**
 * Helper: start a client node that is configured with no parent and no leader.
 *
 * The node is stored in the suite-level `clientNode` variable and is asserted
 * to be RUNNING at its own, un-nested address before the helper returns.
 */
async function createIndependentClient() {
    const clientAddress = 'o://independent-client';
    const clientConfig = {
        address: new oNodeAddress(clientAddress),
        leader: null,
        parent: null,
        description: 'Independent client node for testing connectivity',
        network: {
            listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
        },
    };
    clientNode = new TestLaneTool(clientConfig);
    await clientNode.start();
    expect(clientNode.state).to.equal(NodeState.RUNNING);
    // With no parent or leader configured, the address is expected to stay un-nested.
    expect(clientNode.address.toString()).to.equal(clientAddress);
    // Fixed pause for network initialization.
    const networkSettleMs = 1000;
    await new Promise((resolve) => {
        setTimeout(resolve, networkSettleMs);
    });
}
104
/**
 * Helper: assert that the independent client can reach `targetNode`.
 *
 * Sends a `ping` from the suite-level `clientNode` to the target's address and
 * requires a response that carries a `result`.
 *
 * NOTE(review): the request address is built from the target's address value
 * combined with the LEADER's transports (not the target's own), so the call
 * presumably enters the graph through the leader — confirm this is intended.
 * The target's transports are only checked for presence and reported in the log.
 *
 * @param targetNode - Node to reach; must expose `address` and a non-empty `transports`
 * @param description - Human-readable label used in log and error messages
 * @throws {Error} `Failed to connect to <description>: <reason>` when any check
 *   or the ping itself fails; the underlying error is attached as `cause`.
 */
async function testConnectivity(targetNode, description) {
    try {
        const targetAddress = targetNode.address;
        const targetTransports = targetNode.transports;
        // A running target is expected to advertise at least one transport.
        expect(targetTransports).to.exist;
        expect(targetTransports.length).to.be.greaterThan(0, `Target node ${description} has no transports available`);
        // Address the target by its own address value, dialing with the leader's transports.
        const response = await clientNode.use(new oNodeAddress(targetAddress.value, leaderNode.transports), {
            method: 'ping',
            params: {},
        });
        // Any response with a result counts as a successful round trip.
        expect(response).to.exist;
        expect(response.result).to.exist;
        console.log(`✓ Connectivity verified: ${description} at ${targetAddress} (${targetTransports.length} transports)`);
    }
    catch (error) {
        // Keep the original failure (assertion or transport error) reachable via
        // `cause` so its stack is not lost, and tolerate non-Error throwables.
        const reason = error != null && error.message !== undefined ? error.message : String(error);
        throw new Error(`Failed to connect to ${description}: ${reason}`, { cause: error });
    }
}
140
/**
 * Helper: assert that a node is NOT reachable from the independent client.
 *
 * Pings `lastKnownAddress` from the suite-level `clientNode` and races the ping
 * against a timeout. The three outcomes are handled as follows:
 *  - the ping rejects (or the call throws)  → unreachable, helper resolves
 *  - the timeout elapses first              → treated as unreachable (slow
 *                                             detection is logged), helper resolves
 *  - the ping resolves                      → the node answered, helper throws
 *
 * @param targetNode - Unused; kept so existing call sites keep working
 * @param lastKnownAddress - Address string to attempt the connection to
 * @param description - Human-readable label used in log and error messages
 * @param timeoutMs - Maximum time to wait for the ping to settle (default: 5000ms)
 * @throws {Error} `UNHEALTHY STATE NOT DETECTED: ...` when the ping succeeds
 */
async function expectConnectivityFailure(targetNode, lastKnownAddress, description, timeoutMs = 5000) {
    const startTime = Date.now();
    // Unique marker so a timeout can never be confused with a ping result or
    // with an error whose message happens to contain particular text.
    const TIMED_OUT = Symbol('timed-out');
    let timer;
    const timeoutPromise = new Promise((resolve) => {
        timer = setTimeout(() => resolve(TIMED_OUT), timeoutMs);
    });
    let outcome;
    try {
        // Only the bare address is used: no transports are supplied for the downed node.
        const pingPromise = Promise.resolve(clientNode.use(new oAddress(lastKnownAddress), {
            method: 'ping',
            params: {},
        }));
        // If the timeout wins the race, a later ping rejection must not surface
        // as an unhandled rejection in an unrelated test.
        pingPromise.catch(() => { });
        outcome = await Promise.race([pingPromise, timeoutPromise]);
    }
    catch (error) {
        // Any failure of the ping means the node is unreachable, which is expected here.
        const elapsed = Date.now() - startTime;
        const reason = error != null && error.message !== undefined ? String(error.message) : String(error);
        console.log(`✓ Unhealthy state verified: ${description} at ${lastKnownAddress} unreachable (${elapsed}ms) - ${reason.substring(0, 60)}...`);
        return;
    }
    finally {
        // Always release the timer so it cannot keep the process alive or fire
        // after the helper has returned.
        clearTimeout(timer);
    }
    if (outcome === TIMED_OUT) {
        const elapsed = Date.now() - startTime;
        console.warn(`⚠ Slow failure detection for ${description}: took >${timeoutMs}ms`);
        // Acceptable: the node never answered, it just took longer to find out.
        console.log(`✓ Unhealthy state verified (timeout): ${description} at ${lastKnownAddress} (${elapsed}ms)`);
        return;
    }
    // The ping resolved, so the node answered when it should have been down.
    throw new Error(`UNHEALTHY STATE NOT DETECTED: ${description} at ${lastKnownAddress} responded when it should be unreachable!`);
}
184
/**
 * Helper: check a list of per-node reachability expectations, in order.
 *
 * Each expectation is an object with `node`, `lastKnownAddress`, `description`
 * and `shouldBeReachable`. Entries expected to be reachable go through
 * `testConnectivity`; all others go through `expectConnectivityFailure`.
 * The first failing entry rejects the returned promise.
 */
async function verifyGraphHealthState(expectations) {
    for (const { node, lastKnownAddress, description, shouldBeReachable } of expectations) {
        if (!shouldBeReachable) {
            await expectConnectivityFailure(node, lastKnownAddress, description);
            continue;
        }
        // A reachable expectation needs a live node object to ping.
        if (!node) {
            throw new Error(`Cannot test connectivity to ${description} - node is null but expected to be reachable`);
        }
        await testConnectivity(node, description);
    }
}
201
/**
 * Helper: stop every node that is still RUNNING.
 *
 * Nodes are stopped one at a time in the order client, child, parent, leader.
 * A failure to stop one node does not prevent the remaining nodes from being
 * stopped; otherwise a single bad teardown would leave nodes running into the
 * next test. After every node has been attempted, the first failure (if any)
 * is rethrown so the problem is still reported.
 *
 * @throws the first error raised by a node's `stop()`, after all nodes were attempted
 */
async function cleanupAll() {
    const teardownOrder = [clientNode, childNode, parentNode, leaderNode];
    const failures = [];
    for (const node of teardownOrder) {
        // Skip nodes that were never created or are no longer running.
        if (!node || node.state !== NodeState.RUNNING) {
            continue;
        }
        try {
            await node.stop();
        }
        catch (error) {
            failures.push(error);
        }
    }
    if (failures.length > 0) {
        // Only one error can be thrown; make the others visible instead of dropping them.
        for (const extra of failures.slice(1)) {
            console.warn('Additional failure while stopping node during cleanup:', extra);
        }
        throw failures[0];
    }
}
218
+ afterEach(async () => {
219
+ await cleanupAll();
220
+ });
221
+ describe('Dual-Reference System Validation', () => {
222
+ it('should maintain dual-reference integrity (parent + leader) during initialization', async () => {
223
+ await createThreeTierHierarchy();
224
+ verifyDualReferences();
225
+ });
226
+ it('should verify parent reference updates after reconnection', async () => {
227
+ await createThreeTierHierarchy();
228
+ // Verify initial references
229
+ expect(childNode.parent).to.not.be.null;
230
+ const initialParentRef = childNode.parent.toString();
231
+ expect(initialParentRef).to.equal('o://leader/parent');
232
+ // Stop and restart parent (simulates reconnection)
233
+ await parentNode.stop();
234
+ await new Promise((resolve) => setTimeout(resolve, 1000));
235
+ // Restart parent
236
+ parentNode = new TestLaneTool({
237
+ address: new oNodeAddress('o://parent'),
238
+ leader: leaderNode.address,
239
+ parent: leaderNode.address,
240
+ description: 'Reconnected parent node',
241
+ network: {
242
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
243
+ },
244
+ });
245
+ await parentNode.start();
246
+ await new Promise((resolve) => setTimeout(resolve, 2000));
247
+ // Parent should have reconnected
248
+ expect(parentNode.state).to.equal(NodeState.RUNNING);
249
+ expect(parentNode.address.toString()).to.equal('o://leader/parent');
250
+ });
251
+ it('should verify leader reference updates after reconnection', async () => {
252
+ await createThreeTierHierarchy();
253
+ // Verify initial leader references
254
+ expect(childNode.leader).to.not.be.null;
255
+ expect(parentNode.leader).to.not.be.null;
256
+ expect(childNode.leader.toString()).to.equal('o://leader');
257
+ expect(parentNode.leader.toString()).to.equal('o://leader');
258
+ // Stop and restart leader (simulates leader failure and recovery)
259
+ const originalLeaderTransports = leaderNode.transports;
260
+ await leaderNode.stop();
261
+ await new Promise((resolve) => setTimeout(resolve, 1000));
262
+ // Restart leader
263
+ leaderNode = new TestLeaderNode({
264
+ type: NodeType.LEADER,
265
+ leader: null,
266
+ parent: null,
267
+ systemName: 'reconnection-test-network',
268
+ network: {
269
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
270
+ },
271
+ });
272
+ await leaderNode.start();
273
+ expect(leaderNode.state).to.equal(NodeState.RUNNING);
274
+ expect(leaderNode.address.toString()).to.equal('o://leader');
275
+ // Leader should be running and ready to accept reconnections
276
+ await new Promise((resolve) => setTimeout(resolve, 2000));
277
+ });
278
+ });
279
+ describe('Leader Node Failure and Reconnection', () => {
280
+ it('should handle leader node failure and reconnection', async () => {
281
+ await createThreeTierHierarchy();
282
+ // Capture initial state
283
+ const initialLeaderAddress = leaderNode.address.toString();
284
+ expect(initialLeaderAddress).to.equal('o://leader');
285
+ // Simulate leader failure
286
+ await leaderNode.stop();
287
+ expect(leaderNode.state).to.equal(NodeState.STOPPED);
288
+ // Wait for failure detection
289
+ await new Promise((resolve) => setTimeout(resolve, 1000));
290
+ // Restart leader
291
+ leaderNode = new TestLeaderNode({
292
+ type: NodeType.LEADER,
293
+ leader: null,
294
+ parent: null,
295
+ systemName: 'reconnection-test-network',
296
+ network: {
297
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
298
+ },
299
+ });
300
+ await leaderNode.start();
301
+ // Verify leader recovered
302
+ expect(leaderNode.state).to.equal(NodeState.RUNNING);
303
+ expect(leaderNode.address.toString()).to.equal('o://leader');
304
+ // Allow time for nodes to discover reconnected leader
305
+ await new Promise((resolve) => setTimeout(resolve, 2000));
306
+ });
307
+ it('should restore graph state after leader reconnection', async () => {
308
+ await createThreeTierHierarchy();
309
+ // Get initial hierarchy state
310
+ const initialChildrenCount = leaderNode.hierarchyManager.getChildren().length;
311
+ expect(initialChildrenCount).to.be.greaterThan(0);
312
+ // Stop leader
313
+ await leaderNode.stop();
314
+ await new Promise((resolve) => setTimeout(resolve, 1000));
315
+ // Restart leader
316
+ leaderNode = new TestLeaderNode({
317
+ type: NodeType.LEADER,
318
+ leader: null,
319
+ parent: null,
320
+ systemName: 'reconnection-test-network',
321
+ network: {
322
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
323
+ },
324
+ });
325
+ await leaderNode.start();
326
+ await new Promise((resolve) => setTimeout(resolve, 3000));
327
+ // Note: In a real scenario with persistence, graph state would be restored
328
+ // For now, we verify the leader is ready to accept new registrations
329
+ expect(leaderNode.state).to.equal(NodeState.RUNNING);
330
+ });
331
+ it('should handle child re-registration after leader failure', async () => {
332
+ await createThreeTierHierarchy();
333
+ // Stop leader
334
+ await leaderNode.stop();
335
+ await new Promise((resolve) => setTimeout(resolve, 1000));
336
+ // Restart leader
337
+ leaderNode = new TestLeaderNode({
338
+ type: NodeType.LEADER,
339
+ leader: null,
340
+ parent: null,
341
+ systemName: 'reconnection-test-network',
342
+ network: {
343
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
344
+ },
345
+ });
346
+ await leaderNode.start();
347
+ await new Promise((resolve) => setTimeout(resolve, 1000));
348
+ // Simulate child re-registration
349
+ // In a real system, children would detect leader failure and re-register
350
+ expect(leaderNode.state).to.equal(NodeState.RUNNING);
351
+ expect(childNode.state).to.equal(NodeState.RUNNING);
352
+ expect(parentNode.state).to.equal(NodeState.RUNNING);
353
+ });
354
+ });
355
+ describe('Middleware Parent Node Failure and Reconnection', () => {
356
+ it('should handle middleware parent node failure and reconnection', async () => {
357
+ await createThreeTierHierarchy();
358
+ // Simulate parent failure
359
+ await parentNode.stop();
360
+ expect(parentNode.state).to.equal(NodeState.STOPPED);
361
+ // Wait for failure detection
362
+ await new Promise((resolve) => setTimeout(resolve, 1000));
363
+ // Restart parent
364
+ parentNode = new TestLaneTool({
365
+ address: new oNodeAddress('o://parent'),
366
+ leader: leaderNode.address,
367
+ parent: leaderNode.address,
368
+ description: 'Reconnected parent node',
369
+ network: {
370
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
371
+ },
372
+ });
373
+ await parentNode.start();
374
+ // Verify parent recovered
375
+ expect(parentNode.state).to.equal(NodeState.RUNNING);
376
+ expect(parentNode.address.toString()).to.equal('o://leader/parent');
377
+ await new Promise((resolve) => setTimeout(resolve, 2000));
378
+ });
379
+ it('should restore graph state after parent reconnection', async () => {
380
+ await createThreeTierHierarchy();
381
+ // Stop parent
382
+ await parentNode.stop();
383
+ await new Promise((resolve) => setTimeout(resolve, 1000));
384
+ // Restart parent
385
+ parentNode = new TestLaneTool({
386
+ address: new oNodeAddress('o://parent'),
387
+ leader: leaderNode.address,
388
+ parent: leaderNode.address,
389
+ description: 'Reconnected parent node',
390
+ network: {
391
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
392
+ },
393
+ });
394
+ await parentNode.start();
395
+ await new Promise((resolve) => setTimeout(resolve, 2000));
396
+ // Verify parent restored
397
+ expect(parentNode.state).to.equal(NodeState.RUNNING);
398
+ expect(parentNode.parent).to.not.be.null;
399
+ expect(parentNode.leader).to.not.be.null;
400
+ expect(parentNode.parent.toString()).to.equal('o://leader');
401
+ expect(parentNode.leader.toString()).to.equal('o://leader');
402
+ });
403
+ it('should handle orphaned children when parent fails', async () => {
404
+ await createThreeTierHierarchy();
405
+ // Verify child's parent reference before failure
406
+ expect(childNode.parent).to.not.be.null;
407
+ expect(childNode.parent.toString()).to.equal('o://leader/parent');
408
+ // Stop parent (creates orphaned child)
409
+ await parentNode.stop();
410
+ await new Promise((resolve) => setTimeout(resolve, 1000));
411
+ // Child should still be running but parent is unreachable
412
+ expect(childNode.state).to.equal(NodeState.RUNNING);
413
+ // Child should still have leader reference (dual-reference resilience)
414
+ expect(childNode.leader).to.not.be.null;
415
+ expect(childNode.leader.toString()).to.equal('o://leader');
416
+ });
417
+ });
418
+ describe('Child Node Failure and Reconnection', () => {
419
+ it('should handle child node failure and reconnection', async () => {
420
+ await createThreeTierHierarchy();
421
+ // Simulate child failure
422
+ await childNode.stop();
423
+ expect(childNode.state).to.equal(NodeState.STOPPED);
424
+ // Wait for failure detection
425
+ await new Promise((resolve) => setTimeout(resolve, 1000));
426
+ // Restart child
427
+ childNode = new TestLaneTool({
428
+ address: new oNodeAddress('o://child'),
429
+ leader: leaderNode.address,
430
+ parent: parentNode.address,
431
+ description: 'Reconnected child node',
432
+ network: {
433
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
434
+ },
435
+ });
436
+ await childNode.start();
437
+ // Verify child recovered
438
+ expect(childNode.state).to.equal(NodeState.RUNNING);
439
+ expect(childNode.address.toString()).to.equal('o://leader/parent/child');
440
+ await new Promise((resolve) => setTimeout(resolve, 2000));
441
+ // Verify dual references restored
442
+ expect(childNode.parent).to.not.be.null;
443
+ expect(childNode.leader).to.not.be.null;
444
+ expect(childNode.parent.toString()).to.equal('o://leader/parent');
445
+ expect(childNode.leader.toString()).to.equal('o://leader');
446
+ });
447
+ });
448
+ describe('Cascading Failures and Recoveries', () => {
449
+ it('should handle cascading failures (child → parent → leader)', async () => {
450
+ await createThreeTierHierarchy();
451
+ // Cascading failure from bottom to top
452
+ await childNode.stop();
453
+ await new Promise((resolve) => setTimeout(resolve, 500));
454
+ await parentNode.stop();
455
+ await new Promise((resolve) => setTimeout(resolve, 500));
456
+ await leaderNode.stop();
457
+ await new Promise((resolve) => setTimeout(resolve, 500));
458
+ // Verify all stopped
459
+ expect(childNode.state).to.equal(NodeState.STOPPED);
460
+ expect(parentNode.state).to.equal(NodeState.STOPPED);
461
+ expect(leaderNode.state).to.equal(NodeState.STOPPED);
462
+ });
463
+ it('should handle cascading reconnections (leader → parent → child)', async () => {
464
+ await createThreeTierHierarchy();
465
+ // Stop all nodes
466
+ await childNode.stop();
467
+ await parentNode.stop();
468
+ await leaderNode.stop();
469
+ await new Promise((resolve) => setTimeout(resolve, 1000));
470
+ // Cascading reconnection from top to bottom
471
+ // 1. Restart leader
472
+ leaderNode = new TestLeaderNode({
473
+ type: NodeType.LEADER,
474
+ leader: null,
475
+ parent: null,
476
+ systemName: 'reconnection-test-network',
477
+ network: {
478
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
479
+ },
480
+ });
481
+ await leaderNode.start();
482
+ expect(leaderNode.state).to.equal(NodeState.RUNNING);
483
+ await new Promise((resolve) => setTimeout(resolve, 1000));
484
+ // 2. Restart parent
485
+ parentNode = new TestLaneTool({
486
+ address: new oNodeAddress('o://parent'),
487
+ leader: leaderNode.address,
488
+ parent: leaderNode.address,
489
+ description: 'Reconnected parent',
490
+ network: {
491
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
492
+ },
493
+ });
494
+ await parentNode.start();
495
+ expect(parentNode.state).to.equal(NodeState.RUNNING);
496
+ await new Promise((resolve) => setTimeout(resolve, 1000));
497
+ // 3. Restart child
498
+ childNode = new TestLaneTool({
499
+ address: new oNodeAddress('o://child'),
500
+ leader: leaderNode.address,
501
+ parent: parentNode.address,
502
+ description: 'Reconnected child',
503
+ network: {
504
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
505
+ },
506
+ });
507
+ await childNode.start();
508
+ expect(childNode.state).to.equal(NodeState.RUNNING);
509
+ await new Promise((resolve) => setTimeout(resolve, 2000));
510
+ // Verify entire hierarchy restored
511
+ expect(leaderNode.address.toString()).to.equal('o://leader');
512
+ expect(parentNode.address.toString()).to.equal('o://leader/parent');
513
+ expect(childNode.address.toString()).to.equal('o://leader/parent/child');
514
+ // Verify dual references
515
+ verifyDualReferences();
516
+ });
517
+ it('should handle simultaneous parent and leader failures', async () => {
518
+ await createThreeTierHierarchy();
519
+ // Simultaneous failure
520
+ await Promise.all([
521
+ parentNode.stop(),
522
+ leaderNode.stop(),
523
+ ]);
524
+ await new Promise((resolve) => setTimeout(resolve, 1000));
525
+ // Verify failures
526
+ expect(parentNode.state).to.equal(NodeState.STOPPED);
527
+ expect(leaderNode.state).to.equal(NodeState.STOPPED);
528
+ // Child should still be running
529
+ expect(childNode.state).to.equal(NodeState.RUNNING);
530
+ });
531
+ it('should maintain network topology consistency during reconnection', async () => {
532
+ await createThreeTierHierarchy();
533
+ // Verify initial topology
534
+ expect(childNode.address.toString()).to.equal('o://leader/parent/child');
535
+ // Stop parent
536
+ await parentNode.stop();
537
+ await new Promise((resolve) => setTimeout(resolve, 1000));
538
+ // Restart parent
539
+ parentNode = new TestLaneTool({
540
+ address: new oNodeAddress('o://parent'),
541
+ leader: leaderNode.address,
542
+ parent: leaderNode.address,
543
+ description: 'Reconnected parent',
544
+ network: {
545
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
546
+ },
547
+ });
548
+ await parentNode.start();
549
+ await new Promise((resolve) => setTimeout(resolve, 2000));
550
+ // Verify topology consistency
551
+ expect(parentNode.address.toString()).to.equal('o://leader/parent');
552
+ expect(parentNode.parent).to.not.be.null;
553
+ expect(parentNode.leader).to.not.be.null;
554
+ expect(parentNode.parent.toString()).to.equal('o://leader');
555
+ expect(parentNode.leader.toString()).to.equal('o://leader');
556
+ });
557
+ });
558
+ describe('Registry State During Reconnections', () => {
559
+ it('should update registry entries after node reconnection', async () => {
560
+ await createThreeTierHierarchy();
561
+ // Stop and restart parent
562
+ await parentNode.stop();
563
+ await new Promise((resolve) => setTimeout(resolve, 1000));
564
+ parentNode = new TestLaneTool({
565
+ address: new oNodeAddress('o://parent'),
566
+ leader: leaderNode.address,
567
+ parent: leaderNode.address,
568
+ description: 'Reconnected parent',
569
+ network: {
570
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
571
+ },
572
+ });
573
+ await parentNode.start();
574
+ await new Promise((resolve) => setTimeout(resolve, 2000));
575
+ // Registry should reflect the reconnected node
576
+ // In production, this would query the registry tool
577
+ expect(parentNode.state).to.equal(NodeState.RUNNING);
578
+ });
579
+ });
580
+ describe('PRODUCTION VALIDATION: Functional Connectivity After Graph Self-Healing', () => {
581
+ /**
582
+ * These tests simulate real production scenarios where independent external nodes
583
+ * (with no parent, no leader) need to discover and consume nodes after reconnection.
584
+ * This validates that graph self-healing actually restores functional connectivity,
585
+ * not just internal reference integrity.
586
+ */
587
+ it('should allow independent client to ping child after leader reconnection', async () => {
588
+ // Setup: Create hierarchy and independent client
589
+ await createThreeTierHierarchy();
590
+ await createIndependentClient();
591
+ // PHASE 1: HEALTHY - Verify initial connectivity
592
+ console.log('\n=== PHASE 1: HEALTHY STATE (Initial) ===');
593
+ await testConnectivity(leaderNode, 'leader (initial)');
594
+ await testConnectivity(parentNode, 'parent (initial)');
595
+ await testConnectivity(childNode, 'child (initial)');
596
+ // PHASE 2: FAILURE - Simulate leader failure
597
+ console.log('\n=== PHASE 2: INDUCING FAILURE (Leader Stop) ===');
598
+ await leaderNode.stop();
599
+ await new Promise((resolve) => setTimeout(resolve, 1000));
600
+ // PHASE 3: UNHEALTHY - Verify graph is in unhealthy state
601
+ console.log('\n=== PHASE 3: UNHEALTHY STATE (During Downtime) ===');
602
+ await expectConnectivityFailure(null, 'o://leader', 'leader (during downtime)');
603
+ // Note: Parent and child may still be running but unreachable via normal routing
604
+ await expectConnectivityFailure(childNode, 'o://leader/parent/child', 'child (during leader downtime)');
605
+ // PHASE 4: RECOVERY - Restart leader
606
+ console.log('\n=== PHASE 4: RECOVERY (Leader Restart) ===');
607
+ leaderNode = new TestLeaderNode({
608
+ type: NodeType.LEADER,
609
+ leader: null,
610
+ parent: null,
611
+ systemName: 'reconnection-test-network',
612
+ network: {
613
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
614
+ },
615
+ });
616
+ await leaderNode.start();
617
+ await new Promise((resolve) => setTimeout(resolve, 3000));
618
+ // PHASE 5: HEALTHY - Verify connectivity restored
619
+ console.log('\n=== PHASE 5: HEALTHY STATE (After Reconnection) ===');
620
+ await testConnectivity(leaderNode, 'leader (after reconnection)');
621
+ await testConnectivity(childNode, 'child (after leader reconnection)');
622
+ });
623
+ it('should allow independent client to ping child after parent reconnection', async () => {
624
+ // Setup
625
+ await createThreeTierHierarchy();
626
+ await createIndependentClient();
627
+ // Verify initial connectivity
628
+ await testConnectivity(childNode, 'child node (initial)');
629
+ // Simulate parent failure and recovery
630
+ await parentNode.stop();
631
+ await new Promise((resolve) => setTimeout(resolve, 1000));
632
+ parentNode = new TestLaneTool({
633
+ address: new oNodeAddress('o://parent'),
634
+ leader: leaderNode.address,
635
+ parent: leaderNode.address,
636
+ description: 'Reconnected parent',
637
+ network: {
638
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
639
+ },
640
+ });
641
+ await parentNode.start();
642
+ await new Promise((resolve) => setTimeout(resolve, 3000));
643
+ // CRITICAL: Verify child is still reachable from independent client
644
+ await testConnectivity(childNode, 'child node (after parent reconnection)');
645
+ });
646
+ it('should allow independent client to ping child after child reconnection', async () => {
647
+ // Setup
648
+ await createThreeTierHierarchy();
649
+ await createIndependentClient();
650
+ // Verify initial connectivity
651
+ await testConnectivity(childNode, 'child node (initial)');
652
+ // Simulate child failure and recovery
653
+ await childNode.stop();
654
+ await new Promise((resolve) => setTimeout(resolve, 1000));
655
+ childNode = new TestLaneTool({
656
+ address: new oNodeAddress('o://child'),
657
+ leader: leaderNode.address,
658
+ parent: parentNode.address,
659
+ description: 'Reconnected child',
660
+ network: {
661
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
662
+ },
663
+ });
664
+ await childNode.start();
665
+ await new Promise((resolve) => setTimeout(resolve, 3000));
666
+ // CRITICAL: Verify child is reachable after its own reconnection
667
+ await testConnectivity(childNode, 'child node (after child reconnection)');
668
+ });
669
+ it('should allow independent client to ping all nodes after full cascade reconnection', async () => {
670
+ // Setup
671
+ await createThreeTierHierarchy();
672
+ await createIndependentClient();
673
+ // Verify initial connectivity to all nodes
674
+ await testConnectivity(leaderNode, 'leader (initial)');
675
+ await testConnectivity(parentNode, 'parent (initial)');
676
+ await testConnectivity(childNode, 'child (initial)');
677
+ // Full cascade failure
678
+ await childNode.stop();
679
+ await parentNode.stop();
680
+ await leaderNode.stop();
681
+ await new Promise((resolve) => setTimeout(resolve, 1000));
682
+ // Full cascade recovery (leader → parent → child)
683
+ leaderNode = new TestLeaderNode({
684
+ type: NodeType.LEADER,
685
+ leader: null,
686
+ parent: null,
687
+ systemName: 'reconnection-test-network',
688
+ network: {
689
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
690
+ },
691
+ });
692
+ await leaderNode.start();
693
+ await new Promise((resolve) => setTimeout(resolve, 1000));
694
+ parentNode = new TestLaneTool({
695
+ address: new oNodeAddress('o://parent'),
696
+ leader: leaderNode.address,
697
+ parent: leaderNode.address,
698
+ description: 'Reconnected parent',
699
+ network: {
700
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
701
+ },
702
+ });
703
+ await parentNode.start();
704
+ await new Promise((resolve) => setTimeout(resolve, 1000));
705
+ childNode = new TestLaneTool({
706
+ address: new oNodeAddress('o://child'),
707
+ leader: leaderNode.address,
708
+ parent: parentNode.address,
709
+ description: 'Reconnected child',
710
+ network: {
711
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
712
+ },
713
+ });
714
+ await childNode.start();
715
+ await new Promise((resolve) => setTimeout(resolve, 3000));
716
+ // CRITICAL: Verify all nodes are reachable after full cascade reconnection
717
+ await testConnectivity(leaderNode, 'leader (after cascade)');
718
+ await testConnectivity(parentNode, 'parent (after cascade)');
719
+ await testConnectivity(childNode, 'child (after cascade)');
720
+ });
721
+ it('should allow independent client to discover and ping child via registry after reconnection', async () => {
722
+ // Setup
723
+ await createThreeTierHierarchy();
724
+ await createIndependentClient();
725
+ // Simulate parent reconnection
726
+ await parentNode.stop();
727
+ await new Promise((resolve) => setTimeout(resolve, 1000));
728
+ parentNode = new TestLaneTool({
729
+ address: new oNodeAddress('o://parent'),
730
+ leader: leaderNode.address,
731
+ parent: leaderNode.address,
732
+ description: 'Reconnected parent',
733
+ network: {
734
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
735
+ },
736
+ });
737
+ await parentNode.start();
738
+ await new Promise((resolve) => setTimeout(resolve, 3000));
739
+ // CRITICAL: In production, client would:
740
+ // 1. Query registry for node
741
+ // 2. Discover node's transports
742
+ // 3. Connect and call methods
743
+ // For now, we test direct connectivity which validates the foundation
744
+ await testConnectivity(childNode, 'child via registry (after parent reconnection)');
745
+ });
746
+ it('should detect unhealthy state during parent failure and verify recovery for all affected nodes', async () => {
747
+ // Setup
748
+ await createThreeTierHierarchy();
749
+ await createIndependentClient();
750
+ // PHASE 1: HEALTHY - All nodes reachable
751
+ console.log('\n=== PHASE 1: HEALTHY STATE ===');
752
+ await verifyGraphHealthState([
753
+ { node: leaderNode, lastKnownAddress: 'o://leader', description: 'leader', shouldBeReachable: true },
754
+ { node: parentNode, lastKnownAddress: 'o://leader/parent', description: 'parent', shouldBeReachable: true },
755
+ { node: childNode, lastKnownAddress: 'o://leader/parent/child', description: 'child', shouldBeReachable: true },
756
+ ]);
757
+ // PHASE 2: INDUCE FAILURE - Parent goes down
758
+ console.log('\n=== PHASE 2: INDUCING PARENT FAILURE ===');
759
+ await parentNode.stop();
760
+ await new Promise((resolve) => setTimeout(resolve, 1000));
761
+ // PHASE 3: UNHEALTHY - Verify partial graph failure
762
+ console.log('\n=== PHASE 3: UNHEALTHY STATE ===');
763
+ await verifyGraphHealthState([
764
+ { node: leaderNode, lastKnownAddress: 'o://leader', description: 'leader (still healthy)', shouldBeReachable: true },
765
+ { node: null, lastKnownAddress: 'o://leader/parent', description: 'parent (failed)', shouldBeReachable: false },
766
+ { node: childNode, lastKnownAddress: 'o://leader/parent/child', description: 'child (orphaned)', shouldBeReachable: false },
767
+ ]);
768
+ // PHASE 4: RECOVERY
769
+ console.log('\n=== PHASE 4: RECOVERY ===');
770
+ parentNode = new TestLaneTool({
771
+ address: new oNodeAddress('o://parent'),
772
+ leader: leaderNode.address,
773
+ parent: leaderNode.address,
774
+ description: 'Reconnected parent',
775
+ network: {
776
+ listeners: ['/ip4/127.0.0.1/tcp/0/ws'],
777
+ },
778
+ });
779
+ await parentNode.start();
780
+ await new Promise((resolve) => setTimeout(resolve, 3000));
781
+ // PHASE 5: HEALTHY - Verify full recovery
782
+ console.log('\n=== PHASE 5: HEALTHY STATE (Recovered) ===');
783
+ await verifyGraphHealthState([
784
+ { node: leaderNode, lastKnownAddress: 'o://leader', description: 'leader (recovered)', shouldBeReachable: true },
785
+ { node: parentNode, lastKnownAddress: 'o://leader/parent', description: 'parent (recovered)', shouldBeReachable: true },
786
+ // Note: Child may need explicit re-registration after parent recovery
787
+ // This test documents the actual behavior
788
+ ]);
789
+ });
790
+ });
791
+ });