clawmatrix 0.4.2 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/router.ts CHANGED
@@ -1,10 +1,13 @@
1
1
  import type { ClusterFrame, AnyClusterFrame, PeerInfo, AgentInfo, ModelInfo, DeviceInfo, ToolProxyInfo, AcpAgentInfo } from "./types.ts";
2
2
  import type { Connection } from "./connection.ts";
3
+ import { LRUCache } from "lru-cache";
4
+ import { debug } from "./debug.ts";
3
5
 
4
6
  const DEFAULT_TTL = 3;
5
7
  const MAX_SEEN_FRAMES = 10_000;
6
8
  const MAX_FAILED_REQUESTS = 5_000;
7
- const ROTATE_INTERVAL = 60_000; // rotate dedup maps every 60s
9
+ const SEEN_FRAME_TTL = 120_000; // 2 minutes (was ~60-120s with double-map rotation)
10
+ const FAILED_REQUEST_TTL = 900_000; // 15 minutes
8
11
 
9
12
  export interface RouteEntry {
10
13
  nodeId: string;
@@ -33,12 +36,17 @@ export class Router {
33
36
  private connections = new Map<string, Connection>(); // nodeId → active (best) direct connection
34
37
  /** All live channels per nodeId (multi-channel support). */
35
38
  private channels = new Map<string, Set<Connection>>();
36
- /** Double-map dedup: current window + previous window. Rotated periodically. */
37
- private seenCurrent = new Map<string, true>();
38
- private seenPrevious = new Map<string, true>();
39
- private rotateTimer: ReturnType<typeof setInterval> | null = null;
40
- /** Failed request IDs with expiry timestamps. Separate from dedup to support longer TTLs. */
41
- private failedRequests = new Map<string, number>(); // requestId expiresAt
39
+
40
+ // ── Delta sync versioning ─────────────────────────────────────
41
+ private syncVersion = 0;
42
+ /** Version at which each nodeId was last added/updated. */
43
+ private peerVersions = new Map<string, number>();
44
+ /** Removed nodeIds with their removal version. Pruned when gap > 200. */
45
+ private removedPeers = new Map<string, number>();
46
+ /** LRU-based frame deduplication with TTL. */
47
+ private seenFrames = new LRUCache<string, true>({ max: MAX_SEEN_FRAMES, ttl: SEEN_FRAME_TTL });
48
+ /** Failed request IDs with TTL. Separate from dedup to support longer TTLs. */
49
+ private failedRequests = new LRUCache<string, true>({ max: MAX_FAILED_REQUESTS, ttl: FAILED_REQUEST_TTL });
42
50
 
43
51
  // ── Indexes for O(1) lookups in hot paths ──────────────────────
44
52
  /** agentId → Set of nodeIds that host this agent. */
@@ -59,18 +67,6 @@ export class Router {
59
67
  this.localDeviceInfo = localCapabilities?.deviceInfo;
60
68
  this.localToolProxy = localCapabilities?.toolProxy;
61
69
  this.localAcpAgents = localCapabilities?.acpAgents;
62
-
63
- this.rotateTimer = setInterval(() => this.rotateSeenFrames(), ROTATE_INTERVAL);
64
- }
65
-
66
- /** Rebuild all indexes from scratch. Called after any route table mutation. */
67
- private rebuildIndexes() {
68
- this.agentIndex.clear();
69
- this.tagIndex.clear();
70
- this.modelIndex.clear();
71
- for (const entry of this.routes.values()) {
72
- this.indexEntry(entry);
73
- }
74
70
  }
75
71
 
76
72
  /** Add a single entry to all indexes. */
@@ -98,15 +94,25 @@ export class Router {
98
94
  }
99
95
  }
100
96
 
101
- /** Remove a single entry from all indexes. */
97
+ /** Remove a single entry from all indexes. Cleans up empty Sets to prevent memory leaks. */
102
98
  private unindexEntry(entry: RouteEntry) {
103
99
  const nid = entry.nodeId;
104
100
  for (const a of entry.agents ?? []) {
105
- this.agentIndex.get(a.id)?.delete(nid);
106
- for (const t of a.tags ?? []) this.tagIndex.get(t)?.delete(nid);
101
+ const aSet = this.agentIndex.get(a.id);
102
+ if (aSet) { aSet.delete(nid); if (aSet.size === 0) this.agentIndex.delete(a.id); }
103
+ for (const t of a.tags ?? []) {
104
+ const tSet = this.tagIndex.get(t);
105
+ if (tSet) { tSet.delete(nid); if (tSet.size === 0) this.tagIndex.delete(t); }
106
+ }
107
+ }
108
+ for (const t of entry.tags ?? []) {
109
+ const tSet = this.tagIndex.get(t);
110
+ if (tSet) { tSet.delete(nid); if (tSet.size === 0) this.tagIndex.delete(t); }
111
+ }
112
+ for (const m of entry.models ?? []) {
113
+ const mSet = this.modelIndex.get(m.id);
114
+ if (mSet) { mSet.delete(nid); if (mSet.size === 0) this.modelIndex.delete(m.id); }
107
115
  }
108
- for (const t of entry.tags ?? []) this.tagIndex.get(t)?.delete(nid);
109
- for (const m of entry.models ?? []) this.modelIndex.get(m.id)?.delete(nid);
110
116
  }
111
117
 
112
118
  /** Update locally advertised ACP agents (used after auto-detection). */
@@ -123,12 +129,7 @@ export class Router {
123
129
 
124
130
  /** Stop periodic cleanup. Call on shutdown. */
125
131
  destroy() {
126
- if (this.rotateTimer) {
127
- clearInterval(this.rotateTimer);
128
- this.rotateTimer = null;
129
- }
130
- this.seenCurrent.clear();
131
- this.seenPrevious.clear();
132
+ this.seenFrames.clear();
132
133
  this.failedRequests.clear();
133
134
  this.channels.clear();
134
135
  }
@@ -162,6 +163,9 @@ export class Router {
162
163
  };
163
164
  this.routes.set(nodeId, entry);
164
165
  this.indexEntry(entry);
166
+ this.syncVersion++;
167
+ this.peerVersions.set(nodeId, this.syncVersion);
168
+ this.removedPeers.delete(nodeId);
165
169
  }
166
170
 
167
171
  /** Add an additional channel to an existing peer (multi-channel). */
@@ -263,6 +267,9 @@ export class Router {
263
267
  };
264
268
  this.routes.set(peer.nodeId, entry);
265
269
  this.indexEntry(entry);
270
+ this.syncVersion++;
271
+ this.peerVersions.set(peer.nodeId, this.syncVersion);
272
+ this.removedPeers.delete(peer.nodeId);
266
273
  }
267
274
 
268
275
  removePeer(nodeId: string) {
@@ -272,12 +279,17 @@ export class Router {
272
279
  if (removed) {
273
280
  this.unindexEntry(removed);
274
281
  this.routes.delete(nodeId);
282
+ this.syncVersion++;
283
+ this.removedPeers.set(nodeId, this.syncVersion);
284
+ this.peerVersions.delete(nodeId);
275
285
  }
276
286
  // Also remove routes that relied on this node as relay
277
287
  for (const [id, entry] of this.routes) {
278
288
  if (entry.reachableVia === nodeId) {
279
289
  this.unindexEntry(entry);
280
290
  this.routes.delete(id);
291
+ this.removedPeers.set(id, this.syncVersion);
292
+ this.peerVersions.delete(id);
281
293
  }
282
294
  }
283
295
  }
@@ -302,6 +314,8 @@ export class Router {
302
314
  entry.acpAgents = capabilities.acpAgents;
303
315
  entry.lastSeen = Date.now();
304
316
  this.indexEntry(entry);
317
+ this.syncVersion++;
318
+ this.peerVersions.set(nodeId, this.syncVersion);
305
319
  }
306
320
  }
307
321
 
@@ -317,8 +331,9 @@ export class Router {
317
331
  return this.routes.get(nodeId);
318
332
  }
319
333
 
320
- /** Resolve target agent to a specific nodeId. Supports agent ID or "tags:<tag>". */
321
- resolveAgent(target: string): RouteEntry | undefined {
334
+ /** Resolve target agent to a specific nodeId. Supports agent ID or "tags:<tag>".
335
+ * Optionally excludes nodes in the `exclude` set (e.g. previously failed nodes). */
336
+ resolveAgent(target: string, exclude?: Set<string>): RouteEntry | undefined {
322
337
  const isTagQuery = target.startsWith("tags:");
323
338
 
324
339
  let nodeIds: Set<string> | undefined;
@@ -332,6 +347,7 @@ export class Router {
332
347
  if (nodeIds) {
333
348
  for (const nid of nodeIds) {
334
349
  if (nid === this.nodeId) continue;
350
+ if (exclude?.has(nid)) continue;
335
351
  const entry = this.routes.get(nid);
336
352
  if (entry) candidates.push(entry);
337
353
  }
@@ -339,8 +355,10 @@ export class Router {
339
355
 
340
356
  // Fallback: if no agent ID or tag matched, try matching by nodeId
341
357
  if (candidates.length === 0 && !isTagQuery) {
342
- const byNode = this.routes.get(target);
343
- if (byNode && byNode.nodeId !== this.nodeId) candidates.push(byNode);
358
+ if (!exclude?.has(target)) {
359
+ const byNode = this.routes.get(target);
360
+ if (byNode && byNode.nodeId !== this.nodeId) candidates.push(byNode);
361
+ }
344
362
  }
345
363
 
346
364
  if (candidates.length === 0) return undefined;
@@ -452,6 +470,7 @@ export class Router {
452
470
  }
453
471
  }
454
472
 
473
+ debug("router", `sendTo(${targetNodeId}): all paths failed (conn open=${route.connection?.isOpen}, relay=${route.reachableVia})`);
455
474
  return false;
456
475
  }
457
476
 
@@ -477,70 +496,26 @@ export class Router {
477
496
  return relayed;
478
497
  }
479
498
 
480
- // ── Deduplication (double-map rotation) ────────────────────────
499
+ // ── Deduplication (LRU with TTL) ──────────────────────────────
481
500
  /**
482
501
  * Returns true if the frame has been seen before (duplicate).
483
- * Uses a double-map strategy: entries live in `seenCurrent` and are
484
- * promoted from `seenPrevious` on access. Every ROTATE_INTERVAL the
485
- * previous map is discarded and current becomes previous — O(1) cleanup.
502
+ * Uses LRU cache with TTL automatic eviction by age and size.
486
503
  */
487
504
  isDuplicate(frameId: string): boolean {
488
505
  if (!frameId) return false;
489
- if (this.seenCurrent.has(frameId)) return true;
490
- if (this.seenPrevious.has(frameId)) {
491
- // Promote to current so it survives the next rotation
492
- this.seenCurrent.set(frameId, true);
493
- return true;
494
- }
495
-
496
- this.seenCurrent.set(frameId, true);
497
- // Safety valve: if current map grows too large, force a rotation
498
- if (this.seenCurrent.size > MAX_SEEN_FRAMES) {
499
- this.rotateSeenFrames();
500
- }
506
+ if (this.seenFrames.has(frameId)) return true;
507
+ this.seenFrames.set(frameId, true);
501
508
  return false;
502
509
  }
503
510
 
504
511
  /** Mark a request ID as failed so late responses are ignored.
505
512
  * TTL defaults to 15 minutes — long enough for handoff timeouts. */
506
513
  markFailed(requestId: string, ttlMs = 900_000) {
507
- this.failedRequests.set(requestId, Date.now() + ttlMs);
508
- // Evict entries when map grows too large: first expired, then FIFO
509
- if (this.failedRequests.size > MAX_FAILED_REQUESTS) {
510
- const now = Date.now();
511
- // Pass 1: remove expired entries
512
- for (const [id, expiresAt] of this.failedRequests) {
513
- if (now > expiresAt) this.failedRequests.delete(id);
514
- }
515
- // Pass 2: if still over limit, remove oldest (insertion-order) entries
516
- if (this.failedRequests.size > MAX_FAILED_REQUESTS) {
517
- for (const [id] of this.failedRequests) {
518
- if (this.failedRequests.size <= MAX_FAILED_REQUESTS) break;
519
- this.failedRequests.delete(id);
520
- }
521
- }
522
- }
514
+ this.failedRequests.set(requestId, true, { ttl: ttlMs });
523
515
  }
524
516
 
525
517
  isFailed(requestId: string): boolean {
526
- const expiresAt = this.failedRequests.get(requestId);
527
- if (expiresAt === undefined) return false;
528
- if (Date.now() > expiresAt) {
529
- this.failedRequests.delete(requestId);
530
- return false;
531
- }
532
- return true;
533
- }
534
-
535
- private rotateSeenFrames() {
536
- this.seenPrevious = this.seenCurrent;
537
- this.seenCurrent = new Map();
538
-
539
- // Prune expired failed requests
540
- const now = Date.now();
541
- for (const [id, expiresAt] of this.failedRequests) {
542
- if (now > expiresAt) this.failedRequests.delete(id);
543
- }
518
+ return this.failedRequests.has(requestId);
544
519
  }
545
520
 
546
521
  // ── Accessors ──────────────────────────────────────────────────
@@ -562,38 +537,111 @@ export class Router {
562
537
  return [...this.connections.values()];
563
538
  }
564
539
 
565
- /** Build PeerInfo list for peer_sync. */
566
- buildPeerSyncPayload(): PeerInfo[] {
567
- const peers: PeerInfo[] = [];
568
- // Include ourselves with our direct peer list
569
- const myDirectPeers = [...this.connections.keys()];
570
- peers.push({
540
+ /** Current sync version for delta protocol. */
541
+ get currentSyncVersion(): number {
542
+ return this.syncVersion;
543
+ }
544
+
545
+ /** Convert a RouteEntry to PeerInfo for wire transmission. */
546
+ private entryToPeerInfo(entry: RouteEntry): PeerInfo {
547
+ return {
548
+ nodeId: entry.nodeId,
549
+ agents: entry.agents,
550
+ models: entry.models,
551
+ tags: entry.tags,
552
+ reachableVia: entry.reachableVia ?? undefined,
553
+ directPeers: entry.directPeers.length > 0 ? entry.directPeers : undefined,
554
+ deviceInfo: entry.deviceInfo,
555
+ toolProxy: entry.toolProxy,
556
+ acpAgents: entry.acpAgents,
557
+ latencyMs: entry.latencyMs > 0 ? entry.latencyMs : undefined,
558
+ };
559
+ }
560
+
561
+ /** Build local node PeerInfo. */
562
+ private localPeerInfo(): PeerInfo {
563
+ return {
571
564
  nodeId: this.nodeId,
572
565
  agents: this.localAgents,
573
566
  models: this.localModels,
574
567
  tags: this.localTags,
575
- directPeers: myDirectPeers,
568
+ directPeers: [...this.connections.keys()],
576
569
  deviceInfo: this.localDeviceInfo,
577
570
  toolProxy: this.localToolProxy,
578
571
  acpAgents: this.localAcpAgents,
579
- });
572
+ };
573
+ }
574
+
575
+ /** Build PeerInfo list for peer_sync. */
576
+ buildPeerSyncPayload(): PeerInfo[] {
577
+ const peers: PeerInfo[] = [this.localPeerInfo()];
580
578
  for (const entry of this.routes.values()) {
581
579
  // Same-nodeId 本地客户端(Mac/iOS)不出现在 peer_sync 中,
582
580
  // 客户端通过 auth_ok 获取网关 capabilities,无需在此重复。
583
581
  if (entry.nodeId === this.nodeId) continue;
584
- peers.push({
585
- nodeId: entry.nodeId,
586
- agents: entry.agents,
587
- models: entry.models,
588
- tags: entry.tags,
589
- reachableVia: entry.reachableVia ?? undefined,
590
- directPeers: entry.directPeers.length > 0 ? entry.directPeers : undefined,
591
- deviceInfo: entry.deviceInfo,
592
- toolProxy: entry.toolProxy,
593
- acpAgents: entry.acpAgents,
594
- latencyMs: entry.latencyMs > 0 ? entry.latencyMs : undefined,
595
- });
582
+ peers.push(this.entryToPeerInfo(entry));
596
583
  }
597
584
  return peers;
598
585
  }
586
+
587
+ /** Build delta peer_sync payload since a given version.
588
+ * Returns { version, peers (full) } if delta is too large or sinceVersion=0.
589
+ * Returns { version, added, removed, updated, peers (compat) } for incremental. */
590
+ buildPeerSyncDelta(sinceVersion: number): {
591
+ version: number;
592
+ peers: PeerInfo[];
593
+ added?: PeerInfo[];
594
+ removed?: string[];
595
+ updated?: PeerInfo[];
596
+ } {
597
+ const version = this.syncVersion;
598
+
599
+ // Full sync if: first sync, version gap too large, or no history
600
+ if (sinceVersion === 0 || version - sinceVersion > 100) {
601
+ return { version, peers: this.buildPeerSyncPayload() };
602
+ }
603
+
604
+ const added: PeerInfo[] = [];
605
+ const updated: PeerInfo[] = [];
606
+ const removed: string[] = [];
607
+
608
+ // Collect changes since sinceVersion
609
+ for (const [nodeId, ver] of this.peerVersions) {
610
+ if (ver > sinceVersion) {
611
+ const entry = this.routes.get(nodeId);
612
+ if (entry && entry.nodeId !== this.nodeId) {
613
+ // Distinguish add vs update: if version was set before sinceVersion, it's an update
614
+ // But we don't track the original add version separately, so treat all as "updated"
615
+ updated.push(this.entryToPeerInfo(entry));
616
+ }
617
+ }
618
+ }
619
+
620
+ // Always include self in updated (directPeers may have changed)
621
+ updated.push(this.localPeerInfo());
622
+
623
+ for (const [nodeId, ver] of this.removedPeers) {
624
+ if (ver > sinceVersion) {
625
+ removed.push(nodeId);
626
+ }
627
+ }
628
+
629
+ // Prune stale removedPeers entries
630
+ if (this.removedPeers.size > 200) {
631
+ const threshold = this.syncVersion - 100;
632
+ for (const [nodeId, ver] of this.removedPeers) {
633
+ if (ver < threshold) this.removedPeers.delete(nodeId);
634
+ }
635
+ }
636
+
637
+ // If delta is > 50% of full sync, just send full
638
+ const fullSize = this.routes.size + 1;
639
+ if (updated.length + removed.length > fullSize * 0.5) {
640
+ return { version, peers: this.buildPeerSyncPayload() };
641
+ }
642
+
643
+ // For backward compatibility: peers field contains updated entries
644
+ // so old nodes that only read `peers` still get capability changes
645
+ return { version, peers: updated, added, removed, updated };
646
+ }
599
647
  }
package/src/sentinel.ts CHANGED
@@ -15,6 +15,7 @@
15
15
 
16
16
  import { spawn } from "node:child_process";
17
17
  import { readFileSync, writeFileSync, unlinkSync, existsSync } from "node:fs";
18
+ import { readFile } from "node:fs/promises";
18
19
  import { createServer, type Server } from "node:http";
19
20
  import { WebSocketServer, WebSocket as WsWebSocket } from "ws";
20
21
  import path from "node:path";
@@ -253,11 +254,11 @@ function handleFrame(frame: AnyClusterFrame, conn: Connection) {
253
254
  }
254
255
 
255
256
  function handleDiagnosticExec(frame: DiagnosticExec, conn: Connection) {
256
- // Rate limiting
257
+ // Rate limiting — index-based pruning avoids O(n) shift() reallocation
257
258
  const now = Date.now();
258
- while (execTimestamps.length > 0 && now - execTimestamps[0]! > EXEC_RATE_WINDOW) {
259
- execTimestamps.shift();
260
- }
259
+ let firstValid = 0;
260
+ while (firstValid < execTimestamps.length && now - execTimestamps[firstValid]! > EXEC_RATE_WINDOW) firstValid++;
261
+ if (firstValid > 0) execTimestamps.splice(0, firstValid);
261
262
  if (execTimestamps.length >= EXEC_RATE_LIMIT) {
262
263
  conn.send({
263
264
  type: "diagnostic_exec_res",
@@ -339,10 +340,10 @@ function handleDiagnosticStatus(frame: DiagnosticStatus, conn: Connection) {
339
340
 
340
341
  // ── Port takeover: listen when gateway dies, release when it returns ──
341
342
 
342
- function startListening() {
343
+ async function startListening() {
343
344
  if (listening || !config.listenPort) return;
344
345
  // If we've been replaced by a new sentinel, exit instead of competing for the port
345
- if (isReplaced()) {
346
+ if (await isReplaced()) {
346
347
  log("PID file replaced — another sentinel is active, exiting");
347
348
  cleanup();
348
349
  return;
@@ -372,7 +373,7 @@ function startListening() {
372
373
 
373
374
  wss.on("connection", (ws) => {
374
375
  const transport: WsTransport = {
375
- send(data: string) { ws.send(data); },
376
+ send(data: string | Buffer) { ws.send(data); },
376
377
  close(code?: number, reason?: string) { ws.close(code, reason); },
377
378
  get readyState() { return ws.readyState; },
378
379
  };
@@ -421,7 +422,7 @@ function startListening() {
421
422
  conn.on("error", () => { /* close will follow */ });
422
423
 
423
424
  ws.on("message", (data) => {
424
- conn.feedMessage(typeof data === "string" ? data : String(data));
425
+ conn.feedMessage(Buffer.isBuffer(data) ? data : typeof data === "string" ? data : String(data));
425
426
  });
426
427
 
427
428
  ws.on("close", (code, reason) => {
@@ -439,15 +440,17 @@ function startListening() {
439
440
  listening = false;
440
441
 
441
442
  // If we've been replaced by a new sentinel, exit gracefully
442
- if (isReplaced()) {
443
- log("PID file replaced — exiting");
444
- cleanup();
445
- return;
446
- }
447
- // Port may still be held briefly by the dying gateway — retry after a delay
448
- setTimeout(() => {
449
- if (!gatewayAlive && config.listenPort) startListening();
450
- }, 3_000);
443
+ isReplaced().then((replaced) => {
444
+ if (replaced) {
445
+ log("PID file replaced — exiting");
446
+ cleanup();
447
+ return;
448
+ }
449
+ // Port may still be held briefly by the dying gateway — retry after a delay
450
+ setTimeout(() => {
451
+ if (!gatewayAlive && config.listenPort) startListening();
452
+ }, 3_000);
453
+ });
451
454
  });
452
455
 
453
456
  httpServer.listen(port, host, () => {
@@ -458,21 +461,31 @@ function startListening() {
458
461
 
459
462
  function stopListening() {
460
463
  if (!listening) return;
461
- // Gracefully close all inbound connections
462
- for (const [ws, conn] of inboundConnections) {
463
- conn.close(1001, "gateway recovered");
464
- ws.close(1001, "gateway recovered");
464
+ // Notify peers before closing so they reconnect immediately
465
+ for (const [, conn] of inboundConnections) {
466
+ try {
467
+ conn.send({
468
+ type: "peer_sync", from: sentinelNodeId(), timestamp: Date.now(),
469
+ payload: { peers: [] },
470
+ } as AnyClusterFrame);
471
+ } catch { /* best effort */ }
465
472
  }
466
- inboundConnections.clear();
467
- wss?.close();
468
- wss = null;
469
- httpServer?.close();
470
- httpServer = null;
471
- listening = false;
472
- // Mark voluntary release — sentinel will not re-listen during cooldown
473
- // to give the gateway time to bind the port.
474
- voluntaryReleaseAt = Date.now();
475
- log("Port released — gateway is back");
473
+ // Delay 200ms to let notification frames flush
474
+ setTimeout(() => {
475
+ for (const [ws, conn] of inboundConnections) {
476
+ conn.close(1001, "gateway recovered");
477
+ }
478
+ inboundConnections.clear();
479
+ wss?.close();
480
+ wss = null;
481
+ httpServer?.close();
482
+ httpServer = null;
483
+ listening = false;
484
+ // Mark voluntary release — sentinel will not re-listen during cooldown
485
+ // to give the gateway time to bind the port.
486
+ voluntaryReleaseAt = Date.now();
487
+ log("Port released — gateway is back");
488
+ }, 200);
476
489
  }
477
490
 
478
491
  // ── PID file management ─────────────────────────────────────────
@@ -480,8 +493,9 @@ function writePidFile() {
480
493
  writeFileSync(config.pidFile, String(process.pid));
481
494
  }
482
495
 
483
- /** Check if another sentinel has replaced us (PID file contains a different PID). */
484
- function isReplaced(): boolean {
496
+ /** Check if another sentinel has replaced us (PID file contains a different PID).
497
+ * Sync variant — only used in uncaughtException handler where the event loop may be draining. */
498
+ function isReplacedSync(): boolean {
485
499
  try {
486
500
  if (!existsSync(config.pidFile)) return true;
487
501
  const filePid = parseInt(readFileSync(config.pidFile, "utf-8").trim(), 10);
@@ -491,7 +505,19 @@ function isReplaced(): boolean {
491
505
  }
492
506
  }
493
507
 
494
- function killOldSentinel() {
508
+ /** Async variant of isReplaced — used in recurring timer callbacks to avoid blocking the event loop. */
509
+ async function isReplaced(): Promise<boolean> {
510
+ try {
511
+ const content = await readFile(config.pidFile, "utf-8");
512
+ const filePid = parseInt(content.trim(), 10);
513
+ return filePid !== process.pid;
514
+ } catch {
515
+ // File doesn't exist or can't be read — assume replaced
516
+ return true;
517
+ }
518
+ }
519
+
520
+ async function killOldSentinel() {
495
521
  if (!existsSync(config.pidFile)) return;
496
522
  try {
497
523
  const oldPid = parseInt(readFileSync(config.pidFile, "utf-8").trim(), 10);
@@ -505,9 +531,8 @@ function killOldSentinel() {
505
531
  while (Date.now() < deadline) {
506
532
  try {
507
533
  process.kill(oldPid, 0);
508
- // Still alive — busy-wait briefly
509
- const waitUntil = Date.now() + 100;
510
- while (Date.now() < waitUntil) { /* spin */ }
534
+ // Still alive — async wait to avoid blocking the event loop
535
+ await new Promise((r) => setTimeout(r, 100));
511
536
  } catch {
512
537
  // Process exited
513
538
  log(`Old sentinel (pid ${oldPid}) exited`);
@@ -564,7 +589,7 @@ process.on("uncaughtException", (err) => {
564
589
  log(`Uncaught exception: ${err.stack || err.message}`);
565
590
  // EADDRINUSE from a listen call means the port is taken — if we've been
566
591
  // replaced by a new sentinel/gateway, exit cleanly instead of looping.
567
- if ((err as NodeJS.ErrnoException).code === "EADDRINUSE" && isReplaced()) {
592
+ if ((err as NodeJS.ErrnoException).code === "EADDRINUSE" && isReplacedSync()) {
568
593
  log("Port in use and PID file replaced — exiting");
569
594
  cleanup();
570
595
  }
@@ -588,15 +613,24 @@ function connectAllPeers() {
588
613
 
589
614
  /** Disconnect from all peers (called when gateway recovers). */
590
615
  function disconnectAllPeers() {
591
- for (const [nodeId, conn] of connections) {
592
- conn.close(1000, "gateway recovered");
593
- connections.delete(nodeId);
594
- }
595
- for (const [nodeId, timer] of reconnectTimers) {
596
- clearTimeout(timer);
597
- reconnectTimers.delete(nodeId);
616
+ // Notify peers before closing so they reconnect immediately
617
+ for (const [, conn] of connections) {
618
+ try {
619
+ conn.send({
620
+ type: "peer_sync", from: sentinelNodeId(), timestamp: Date.now(),
621
+ payload: { peers: [] },
622
+ } as AnyClusterFrame);
623
+ } catch { /* best effort */ }
598
624
  }
599
- reconnectAttempts.clear();
625
+ setTimeout(() => {
626
+ for (const [nodeId, conn] of connections) {
627
+ conn.close(1001, "gateway recovered");
628
+ connections.delete(nodeId);
629
+ }
630
+ for (const [, timer] of reconnectTimers) clearTimeout(timer);
631
+ reconnectTimers.clear();
632
+ reconnectAttempts.clear();
633
+ }, 200);
600
634
  }
601
635
 
602
636
  /** Periodically check if the gateway process is still alive via kill(pid, 0). */
@@ -625,22 +659,22 @@ function startGatewayHealthCheck() {
625
659
  if (config.listenPort) {
626
660
  const cooldownRemaining = PORT_RELEASE_COOLDOWN - (Date.now() - voluntaryReleaseAt);
627
661
  const delay = Math.max(2_000, cooldownRemaining);
628
- setTimeout(() => {
629
- if (!gatewayAlive && !isReplaced()) startListening();
662
+ setTimeout(async () => {
663
+ if (!gatewayAlive && !(await isReplaced())) startListening();
630
664
  }, delay);
631
665
  }
632
666
  }
633
667
  }
634
- }, 5_000);
668
+ }, 2_000);
635
669
  }
636
670
 
637
- function boot() {
671
+ async function boot() {
638
672
  // Prefer explicit gatewayPid from config (sent by SentinelManager),
639
673
  // fall back to ppid (may be inaccurate if forked indirectly).
640
674
  gatewayPid = config.gatewayPid ?? process.ppid;
641
675
 
642
676
  loadApprovedPeers();
643
- killOldSentinel();
677
+ await killOldSentinel();
644
678
  writePidFile();
645
679
  log(`Started (pid ${process.pid}, gateway ${gatewayPid}, nodeId ${sentinelNodeId()}, takeover port ${config.listenPort || "none"})`);
646
680