clawmatrix 0.4.2 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- import { EventEmitter } from "node:events";
1
+ import { EventEmitter } from "eventemitter3";
2
2
  import path from "node:path";
3
3
  import { homedir, tmpdir } from "node:os";
4
4
  import { createServer, type IncomingMessage, type ServerResponse, type Server } from "node:http";
@@ -22,10 +22,11 @@ import type {
22
22
  PeerSync,
23
23
  } from "./types.ts";
24
24
  import { PeerApprovalManager, type ChannelApi, type NotifyTarget } from "./peer-approval.ts";
25
+ import { computeBackoffDelay, getReconnectBackoff } from "./retry.ts";
25
26
  import { loadOrCreateIdentity } from "./identity.ts";
26
27
  import type { KeyPair } from "./crypto.ts";
27
28
 
28
- const RECONNECT_BASE = 1_000;
29
+ // Reconnect constants kept for RECONNECT_MAX reference in failover scheduling
29
30
  const RECONNECT_MAX = 60_000;
30
31
 
31
32
  /** Frame types that bypass dedup (streams share one id across chunks; responses share id with request). */
@@ -51,6 +52,8 @@ const SKIP_DEDUP_TYPES = new Set([
51
52
  "file_transfer_ack", "file_transfer_complete",
52
53
  ]);
53
54
 
55
+ // Reconnect backoff params are now provided by retry.ts → getReconnectBackoff()
56
+
54
57
  /** Classify WebSocket close code into a human-readable reason. */
55
58
  function classifyCloseReason(code: number, reason: string): string {
56
59
  if (reason) return reason;
@@ -94,6 +97,8 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
94
97
  /** Reconnect timers keyed by `nodeId|url` for per-channel reconnection. */
95
98
  private reconnectTimers = new Map<string, ReturnType<typeof setTimeout>>();
96
99
  private reconnectAttempts = new Map<string, number>();
100
+ /** Last close code per channel key for adaptive reconnect backoff. */
101
+ private lastCloseCode = new Map<string, number>();
97
102
  /** Track which nodeIds have already completed the full peer join (for multi-channel). */
98
103
  private joinedPeers = new Set<string>();
99
104
  /** All configured URLs per peer (for multi-URL peers). */
@@ -104,6 +109,10 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
104
109
  private urlProbeLatencies = new Map<string, number>();
105
110
  /** Route probe interval timer. */
106
111
  private probeTimer: ReturnType<typeof setInterval> | null = null;
112
+ /** EMA latency baseline per peer (for spike detection). */
113
+ private latencyBaselines = new Map<string, number>();
114
+ /** Last time a latency-triggered probe was fired per peer (debounce). */
115
+ private lastProbeTime = new Map<string, number>();
107
116
  /** Deferred disconnect timers — grace period before broadcasting peer_leave. */
108
117
  private disconnectGraceTimers = new Map<string, ReturnType<typeof setTimeout>>();
109
118
  private stopped = false;
@@ -121,10 +130,10 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
121
130
  private rateLimiter: RateLimiter;
122
131
  readonly approvalManager: PeerApprovalManager;
123
132
 
124
- constructor(config: ClawMatrixConfig, openclawVersion?: string) {
133
+ constructor(config: ClawMatrixConfig, openclawVersion?: string, openclawConfig?: Record<string, unknown>) {
125
134
  super();
126
135
  this.config = config;
127
- this.localDeviceInfo = collectDeviceInfo(openclawVersion);
136
+ this.localDeviceInfo = collectDeviceInfo(openclawVersion, openclawConfig);
128
137
  const acpAgents = config.acp?.enabled ? config.acp.agents : undefined;
129
138
  this.localCapabilities = {
130
139
  nodeId: config.nodeId,
@@ -285,7 +294,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
285
294
  }
286
295
 
287
296
  // ── Route probing (for multi-URL peers) ──────────────────────────
288
- private static readonly PROBE_INTERVAL = 3_600_000; // 1 hour
297
+ private static readonly PROBE_INTERVAL = 900_000; // 15 minutes
289
298
  /** Minimum improvement ratio to trigger a route switch. */
290
299
  private static readonly SWITCH_THRESHOLD = 0.7; // new must be ≤70% of current
291
300
 
@@ -385,6 +394,32 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
385
394
  this.switchRoute(nodeId, bestUrl);
386
395
  }
387
396
 
397
/**
 * Trigger an immediate route probe for a single peer.
 *
 * No-op for peers with fewer than two known URLs. Debounced to at most one
 * probe per peer per minute via `lastProbeTime`. The probe itself runs in the
 * background; this method returns synchronously.
 *
 * @param nodeId Peer whose alternative URLs should be probed.
 */
private triggerProbeForPeer(nodeId: string) {
  const urls = this.peerUrls.get(nodeId);
  if (!urls || urls.length <= 1) return;

  const now = Date.now();
  const lastProbe = this.lastProbeTime.get(nodeId) ?? 0;
  if (now - lastProbe < 60_000) return; // debounce: 1 per minute
  this.lastProbeTime.set(nodeId, now);

  const activeUrl = this.activeUrls.get(nodeId);

  const runProbe = async (): Promise<void> => {
    // Probe every URL except the one currently carrying traffic.
    for (const url of urls) {
      if (url === activeUrl) continue;
      const latency = await this.probeUrl(url);
      if (latency !== null) this.urlProbeLatencies.set(url, latency);
    }

    // The manager may have been stopped while probes were in flight;
    // do not evaluate (and possibly switch) routes in that case.
    if (this.stopped) return;

    // Use the live route latency as the measurement for the active URL.
    if (activeUrl) {
      const route = this.router.getRoute(nodeId);
      if (route && route.latencyMs > 0) this.urlProbeLatencies.set(activeUrl, route.latencyMs);
    }
    this.evaluateRouteSwitch(nodeId);
  };

  // Fire-and-forget, but never let a failed probe surface as an unhandled rejection.
  void runProbe().catch((err: unknown) => {
    const message = err instanceof Error ? err.message : String(err);
    debug("peer", `triggerProbeForPeer(${nodeId}): probe failed: ${message}`);
  });
}
422
+
388
423
  /** Callback to check if there are active tasks involving a peer. Set by ClusterRuntime. */
389
424
  private activeTaskChecker: ((nodeId: string) => boolean) | null = null;
390
425
 
@@ -465,10 +500,13 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
465
500
  this.inboundIps.set(ws, ip);
466
501
  this.handleInboundOpen(ws, ip);
467
502
 
468
- ws.on("message", (data) => {
503
+ ws.on("message", (data, isBinary) => {
469
504
  const conn = this.inboundConnections.get(ws);
470
505
  if (conn) {
471
- conn.feedMessage(typeof data === "string" ? data : String(data));
506
+ // ws package always delivers data as Buffer; use isBinary to distinguish frame type.
507
+ // Text frames (e.g. base64 encrypted envelopes) must be passed as strings
508
+ // so Connection routes them through decryptBinary, not decryptBinaryRaw.
509
+ conn.feedMessage(isBinary ? (Buffer.isBuffer(data) ? data : Buffer.from(data as any)) : data.toString());
472
510
  }
473
511
  });
474
512
 
@@ -492,7 +530,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
492
530
  private handleInboundOpen(ws: WsWebSocket, ip: string) {
493
531
  // Wrap ws WebSocket into our WsTransport interface
494
532
  const transport: WsTransport = {
495
- send(data: string) {
533
+ send(data: string | Buffer) {
496
534
  ws.send(data);
497
535
  },
498
536
  close(code?: number, reason?: string) {
@@ -539,10 +577,44 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
539
577
  private connectToPeer(peer: PeerConfig) {
540
578
  const urls = Array.isArray(peer.url) ? peer.url : [peer.url];
541
579
  this.peerUrls.set(peer.nodeId, urls);
542
- // Connect to the first URL (or best known from probes)
543
- const bestUrl = this.pickBestUrl(peer.nodeId, urls);
544
- this.activeUrls.set(peer.nodeId, bestUrl);
545
- this.connectToChannel(peer.nodeId, bestUrl);
580
+ if (urls.length <= 1) {
581
+ this.activeUrls.set(peer.nodeId, urls[0]);
582
+ this.connectToChannel(peer.nodeId, urls[0]);
583
+ } else {
584
+ // Multi-URL: connect all simultaneously, prune to best after 10s
585
+ this.activeUrls.set(peer.nodeId, urls[0]);
586
+ for (const url of urls) {
587
+ this.connectToChannel(peer.nodeId, url);
588
+ }
589
+ setTimeout(() => this.pruneWarmupConnections(peer.nodeId), 10_000);
590
+ }
591
+ }
592
+
593
/**
 * After the warm-up period, keep only the lowest-latency open channel to a
 * peer and close every other open channel with reason "warm-up pruned".
 *
 * Channels that have not reported a latency yet (`latencyMs <= 0`) are ranked
 * as if they had 10 000 ms, so any measured channel wins over them.
 *
 * NOTE(review): `activeUrls` is not updated here and still holds the first
 * configured URL — confirm whether it should follow the kept channel.
 *
 * @param nodeId Peer whose channel pool should be pruned.
 */
private pruneWarmupConnections(nodeId: string) {
  // Snapshot: closing a connection may remove it from the router's list
  // (assumption — getChannels may return a live array).
  const channels = [...this.router.getChannels(nodeId)];
  if (channels.length <= 1) return;

  let best: Connection | null = null;
  let bestLatency = Infinity;
  for (const conn of channels) {
    if (!conn.isOpen) continue;
    const lat = conn.latencyMs > 0 ? conn.latencyMs : 10_000;
    if (lat < bestLatency) {
      bestLatency = lat;
      best = conn;
    }
  }

  // No open channel at all: nothing to keep and nothing to close.
  if (!best) return;

  // Count what is actually closed so the log line is accurate.
  let pruned = 0;
  for (const conn of channels) {
    if (conn === best || !conn.isOpen) continue;
    conn.close(1000, "warm-up pruned");
    pruned++;
  }

  debug("peer", `pruneWarmupConnections(${nodeId}): kept channel with latency=${best.latencyMs}ms, pruned ${pruned} others`);
}
547
619
 
548
620
  /** Pick the best URL for a peer based on probe latencies. Falls back to first URL. */
@@ -596,6 +668,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
596
668
  conn.on("authenticated", (caps) => {
597
669
  debug("peer", `connectToChannel(${nodeId}): authenticated url=${url}`);
598
670
  this.reconnectAttempts.delete(channelKey);
671
+ this.lastCloseCode.delete(channelKey);
599
672
  this.onPeerAuthenticated(conn, caps);
600
673
  });
601
674
 
@@ -624,6 +697,14 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
624
697
  debug("peer", `connectToChannel(${nodeId}): self-connection, will not reconnect`);
625
698
  return;
626
699
  }
700
+ // Don't reconnect warm-up pruned channels
701
+ if (ev.reason === "warm-up pruned") {
702
+ debug("peer", `connectToChannel(${nodeId}): warm-up pruned, will not reconnect`);
703
+ return;
704
+ }
705
+ // Record close code for adaptive backoff
706
+ const channelKey = `${nodeId}|${url}`;
707
+ this.lastCloseCode.set(channelKey, ev.code);
627
708
  if (!lastError) {
628
709
  lastError = classifyCloseReason(ev.code, ev.reason);
629
710
  }
@@ -665,7 +746,8 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
665
746
  }
666
747
  }
667
748
 
668
- const delay = Math.min(RECONNECT_BASE * 2 ** attempt, RECONNECT_MAX);
749
+ const params = getReconnectBackoff(this.lastCloseCode.get(channelKey) ?? 1006);
750
+ const delay = computeBackoffDelay(attempt, params);
669
751
  this.reconnectAttempts.set(channelKey, attempt + 1);
670
752
  const tag = reason ? ` reason="${reason}"` : "";
671
753
  debug("peer", `scheduleChannelReconnect(${nodeId}): attempt=${attempt} delay=${delay}ms url=${url}${tag}`);
@@ -693,11 +775,62 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
693
775
  }
694
776
 
695
777
  // ── Peer lifecycle ─────────────────────────────────────────────
778
+
779
/**
 * Re-key per-peer bookkeeping from a URL-derived placeholder id (one that
 * starts with "_") to the real nodeId.
 *
 * Moves the URL list and active URL (only when the real id has none yet),
 * re-keys reconnect timers / attempt counters / last close codes stored under
 * `nodeId|url`, and rewrites the matching entry in `config.peers` in place.
 *
 * @param derivedId Placeholder id the peer was registered under.
 * @param realId    Node id to move the state to.
 */
private migrateUrlDerivedPeer(derivedId: string, realId: string) {
  /** Move `from` → `to` in a map when an entry exists. */
  const rekey = <V>(map: Map<string, V>, from: string, to: string) => {
    const value = map.get(from);
    if (value === undefined) return;
    map.delete(from);
    map.set(to, value);
  };

  // URL list: drop the placeholder entry, adopt it unless the real id already has one.
  const knownUrls = this.peerUrls.get(derivedId);
  if (knownUrls) {
    this.peerUrls.delete(derivedId);
    if (!this.peerUrls.has(realId)) this.peerUrls.set(realId, knownUrls);
  }

  // Active URL: same rule.
  const currentUrl = this.activeUrls.get(derivedId);
  if (currentUrl) {
    this.activeUrls.delete(derivedId);
    if (!this.activeUrls.has(realId)) this.activeUrls.set(realId, currentUrl);
  }

  // Reconnect state is keyed by `nodeId|url`, so every URL needs its keys rewritten.
  for (const url of knownUrls ?? []) {
    const fromKey = `${derivedId}|${url}`;
    const toKey = `${realId}|${url}`;

    const pendingTimer = this.reconnectTimers.get(fromKey);
    if (pendingTimer) {
      this.reconnectTimers.delete(fromKey);
      this.reconnectTimers.set(toKey, pendingTimer);
    }
    rekey(this.reconnectAttempts, fromKey, toKey);
    rekey(this.lastCloseCode, fromKey, toKey);
  }

  // Rewrite the first matching config entry in place for sentinel and other consumers.
  for (const entry of this.config.peers) {
    if (entry.nodeId !== derivedId) continue;
    (entry as { nodeId: string }).nodeId = realId;
    break;
  }

  debug("peer", `migrateUrlDerivedPeer: ${derivedId} → ${realId}`);
}
812
+
696
813
  private onPeerAuthenticated(conn: Connection, caps: NodeCapabilities, ip?: string) {
697
814
  const nodeId = conn.remoteNodeId!;
698
815
  // Peer's persistent public key for TOFU identity binding
699
816
  const peerPublicKey = conn.remoteIdentityKey ?? undefined;
700
817
 
818
+ // If this outbound peer was configured with URL string shorthand (nodeId starts
819
+ // with "_"), resolve to the real nodeId from the authenticated handshake.
820
+ if (conn.role === "outbound") {
821
+ for (const peer of this.config.peers) {
822
+ if (peer.nodeId.startsWith("_") && this.peerUrls.get(peer.nodeId)) {
823
+ const urls = this.peerUrls.get(peer.nodeId)!;
824
+ // Check if this connection belongs to this URL-derived peer
825
+ const activeUrl = this.activeUrls.get(peer.nodeId);
826
+ if (activeUrl && urls.length > 0) {
827
+ this.migrateUrlDerivedPeer(peer.nodeId, nodeId);
828
+ break;
829
+ }
830
+ }
831
+ }
832
+ }
833
+
701
834
  // Prevent self-connection: close immediately if the remote side authenticated
702
835
  // with our own nodeId. For outbound this means the peer URL accidentally
703
836
  // points to self; for inbound it means a remote node is (mis)using our nodeId.
@@ -796,7 +929,16 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
796
929
  // Additional channel — just add to the channel pool, no peer_join broadcast
797
930
  this.router.addChannel(nodeId, conn);
798
931
  conn.on("message", (frame) => this.onFrame(frame, conn));
799
- conn.on("latency", () => this.router.updateActiveChannel(nodeId));
932
+ conn.on("latency", (latencyMs) => {
933
+ this.router.updateActiveChannel(nodeId);
934
+ // Update baseline and check for spike
935
+ const baseline = this.latencyBaselines.get(nodeId) ?? latencyMs;
936
+ const newBaseline = baseline * 0.8 + latencyMs * 0.2;
937
+ this.latencyBaselines.set(nodeId, newBaseline);
938
+ if (latencyMs > baseline * 2 && latencyMs > 200) {
939
+ this.triggerProbeForPeer(nodeId);
940
+ }
941
+ });
800
942
  conn.on("close", () => this.onChannelDisconnected(conn));
801
943
  const channelCount = this.router.getChannelCount(nodeId);
802
944
  debug("peer", `completePeerJoin(${nodeId}): additional channel added (total=${channelCount})`);
@@ -830,7 +972,16 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
830
972
  this.joinedPeers.add(nodeId);
831
973
 
832
974
  conn.on("message", (frame) => this.onFrame(frame, conn));
833
- conn.on("latency", () => this.router.updateActiveChannel(nodeId));
975
+ conn.on("latency", (latencyMs) => {
976
+ this.router.updateActiveChannel(nodeId);
977
+ // Update baseline and check for spike
978
+ const baseline = this.latencyBaselines.get(nodeId) ?? latencyMs;
979
+ const newBaseline = baseline * 0.8 + latencyMs * 0.2;
980
+ this.latencyBaselines.set(nodeId, newBaseline);
981
+ if (latencyMs > baseline * 2 && latencyMs > 200) {
982
+ this.triggerProbeForPeer(nodeId);
983
+ }
984
+ });
834
985
  conn.on("close", () => this.onChannelDisconnected(conn));
835
986
 
836
987
  this.sendPeerSync(conn);
@@ -862,6 +1013,9 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
862
1013
 
863
1014
  audit("peer_join", { nodeId, detail: `agents=${caps.agents.length} models=${caps.models.length}` });
864
1015
  this.emit("peerConnected", nodeId);
1016
+
1017
+ // Probe alternative routes after reconnect (connectivity may have changed)
1018
+ setTimeout(() => this.triggerProbeForPeer(nodeId), 5_000);
865
1019
  }
866
1020
 
867
1021
  /** Handle a single channel disconnecting (multi-channel aware). */
@@ -901,6 +1055,10 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
901
1055
  if (nodeId === this.config.nodeId) {
902
1056
  this.router.removePeer(nodeId);
903
1057
  this.joinedPeers.delete(nodeId);
1058
+ this.latencyBaselines.delete(nodeId);
1059
+ this.lastProbeTime.delete(nodeId);
1060
+ // Clear the delta-sync version so that a full peer_sync is sent on reconnect
1061
+ this.peerSyncVersions.delete(nodeId);
904
1062
  return;
905
1063
  }
906
1064
 
@@ -950,6 +1108,9 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
950
1108
  audit("peer_leave", { nodeId });
951
1109
  this.router.removePeer(nodeId);
952
1110
  this.joinedPeers.delete(nodeId);
1111
+ this.latencyBaselines.delete(nodeId);
1112
+ this.lastProbeTime.delete(nodeId);
1113
+ this.peerSyncVersions.delete(nodeId);
953
1114
 
954
1115
  // Remove satellite contexts that were only reachable via this peer
955
1116
  for (let i = this.satelliteContexts.length - 1; i >= 0; i--) {
@@ -1076,13 +1237,19 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
1076
1237
  this.emit("frame", frame, from);
1077
1238
  }
1078
1239
 
1240
+ /** Last sync version sent to each direct peer. */
1241
+ private peerSyncVersions = new Map<string, number>();
1242
+
1079
1243
  private sendPeerSync(conn: Connection) {
1080
- const peers = this.router.buildPeerSyncPayload();
1081
- const payload: Record<string, unknown> = { peers };
1244
+ const remoteNodeId = conn.remoteNodeId ?? "";
1245
+ const sinceVersion = this.peerSyncVersions.get(remoteNodeId) ?? 0;
1246
+ const delta = this.router.buildPeerSyncDelta(sinceVersion);
1247
+ const payload: Record<string, unknown> = { ...delta };
1082
1248
  if (this.satelliteContexts.length > 0) {
1083
1249
  payload.satellites = this.satelliteContexts;
1084
1250
  }
1085
- conn.send({
1251
+ this.peerSyncVersions.set(remoteNodeId, delta.version);
1252
+ conn.sendDirect({
1086
1253
  type: "peer_sync",
1087
1254
  from: this.config.nodeId,
1088
1255
  timestamp: Date.now(),
@@ -1105,34 +1272,53 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
1105
1272
  }
1106
1273
 
1107
1274
  let changed = false;
1108
- for (const peer of frame.payload.peers) {
1109
- if (peer.nodeId === this.config.nodeId) continue;
1110
- if (peer.nodeId === from.remoteNodeId) {
1111
- const prev = this.router.getRoute(peer.nodeId);
1112
- const hadAgents = prev?.agents.length ?? 0;
1113
- const hadDirectPeers = prev?.directPeers.length ?? 0;
1114
- const hadDeviceInfo = prev?.deviceInfo?.hostname;
1115
- const hadAcpAgents = prev?.acpAgents?.length ?? 0;
1116
- const hadToolProxyEnabled = prev?.toolProxy?.enabled;
1117
- const hadToolProxyCatalogLen = prev?.toolProxy?.catalog?.length ?? 0;
1118
- const hadToolProxyAllowLen = prev?.toolProxy?.allow?.length ?? 0;
1119
- this.router.updatePeerCapabilities(peer.nodeId, peer);
1120
- if (peer.agents.length !== hadAgents || peer.models.length !== (prev?.models.length ?? 0)
1121
- || (peer.directPeers?.length ?? 0) !== hadDirectPeers
1122
- || peer.toolProxy?.enabled !== hadToolProxyEnabled
1123
- || (peer.toolProxy?.catalog?.length ?? 0) !== hadToolProxyCatalogLen
1124
- || (peer.toolProxy?.allow?.length ?? 0) !== hadToolProxyAllowLen
1125
- || peer.deviceInfo?.hostname !== hadDeviceInfo
1126
- || (peer.acpAgents?.length ?? 0) !== hadAcpAgents) {
1275
+
1276
+ // Delta sync: apply removed, then added/updated
1277
+ if (frame.payload.version != null && frame.payload.removed) {
1278
+ for (const nodeId of frame.payload.removed) {
1279
+ if (nodeId === this.config.nodeId) continue;
1280
+ // Only remove relay routes — don't remove direct connections
1281
+ const route = this.router.getRoute(nodeId);
1282
+ if (route && !route.connection) {
1283
+ this.router.removePeer(nodeId);
1127
1284
  changed = true;
1128
1285
  }
1129
- } else {
1130
- // Skip if the remote peer only knows about this node through us —
1131
- // using them as relay would create a routing loop.
1132
- if (peer.reachableVia === this.config.nodeId) continue;
1133
- const existing = this.router.getRoute(peer.nodeId);
1134
- if (!existing) changed = true;
1135
- this.router.addRelayPeer(peer, from.remoteNodeId!);
1286
+ }
1287
+ }
1288
+
1289
+ // Process peers from either full sync or delta (updated field)
1290
+ // Delta sets peers = updated for backward compat, so this works for both
1291
+ const peersToProcess = frame.payload.peers;
1292
+ if (peersToProcess) {
1293
+ for (const peer of peersToProcess) {
1294
+ if (peer.nodeId === this.config.nodeId) continue;
1295
+ if (peer.nodeId === from.remoteNodeId) {
1296
+ const prev = this.router.getRoute(peer.nodeId);
1297
+ const hadAgents = prev?.agents.length ?? 0;
1298
+ const hadDirectPeers = prev?.directPeers.length ?? 0;
1299
+ const hadDeviceInfo = prev?.deviceInfo?.hostname;
1300
+ const hadAcpAgents = prev?.acpAgents?.length ?? 0;
1301
+ const hadToolProxyEnabled = prev?.toolProxy?.enabled;
1302
+ const hadToolProxyCatalogLen = prev?.toolProxy?.catalog?.length ?? 0;
1303
+ const hadToolProxyAllowLen = prev?.toolProxy?.allow?.length ?? 0;
1304
+ this.router.updatePeerCapabilities(peer.nodeId, peer);
1305
+ if (peer.agents.length !== hadAgents || peer.models.length !== (prev?.models.length ?? 0)
1306
+ || (peer.directPeers?.length ?? 0) !== hadDirectPeers
1307
+ || peer.toolProxy?.enabled !== hadToolProxyEnabled
1308
+ || (peer.toolProxy?.catalog?.length ?? 0) !== hadToolProxyCatalogLen
1309
+ || (peer.toolProxy?.allow?.length ?? 0) !== hadToolProxyAllowLen
1310
+ || peer.deviceInfo?.hostname !== hadDeviceInfo
1311
+ || (peer.acpAgents?.length ?? 0) !== hadAcpAgents) {
1312
+ changed = true;
1313
+ }
1314
+ } else {
1315
+ // Skip if the remote peer only knows about this node through us —
1316
+ // using them as relay would create a routing loop.
1317
+ if (peer.reachableVia === this.config.nodeId) continue;
1318
+ const existing = this.router.getRoute(peer.nodeId);
1319
+ if (!existing) changed = true;
1320
+ this.router.addRelayPeer(peer, from.remoteNodeId!);
1321
+ }
1136
1322
  }
1137
1323
  }
1138
1324
 
package/src/retry.ts ADDED
@@ -0,0 +1,81 @@
1
+ /**
2
+ * Shared retry/backoff/circuit-breaker utilities backed by cockatiel.
3
+ *
4
+ * Provides:
5
+ * - Per-node circuit breakers for mesh health tracking
6
+ * - Adaptive backoff delay computation
7
+ */
8
+
9
import { circuitBreaker, CircuitState, ConsecutiveBreaker, handleAll, SamplingBreaker } from "cockatiel";
10
+
11
+ // ── Per-node circuit breakers ────────────────────────────────────
12
+
13
/** Tuning options for a per-node circuit breaker. Omitted fields fall back to the defaults. */
export interface CircuitBreakerConfig {
  /** Consecutive failures that open the circuit (default: 5). */
  threshold?: number;
  /** Milliseconds an open circuit waits before a half-open probe is allowed (default: 30000). */
  halfOpenAfter?: number;
}

/** Values used for any option the caller does not supply. */
const DEFAULT_CB_CONFIG: Required<CircuitBreakerConfig> = {
  threshold: 5,
  halfOpenAfter: 30_000,
};

/** Policy type produced by cockatiel's `circuitBreaker` factory. */
type NodeBreaker = ReturnType<typeof circuitBreaker>;

/** Module-level cache of circuit breakers, keyed by nodeId. */
const nodeBreakers = new Map<string, NodeBreaker>();
26
+
27
/**
 * Get or create the circuit breaker for a node.
 *
 * Breakers are cached per nodeId: `config` is only consulted when the breaker
 * is first created and is ignored on later calls for the same node.
 *
 * @param nodeId Node the breaker tracks.
 * @param config Optional overrides; missing or `undefined` fields use the defaults.
 * @returns The cached or newly created circuit breaker policy.
 */
export function getNodeCircuitBreaker(nodeId: string, config?: CircuitBreakerConfig) {
  const cached = nodeBreakers.get(nodeId);
  if (cached) return cached;

  // Resolve with `??` rather than object spread: spreading `{ threshold: undefined }`
  // would overwrite the default with undefined.
  const threshold = config?.threshold ?? DEFAULT_CB_CONFIG.threshold;
  const halfOpenAfter = config?.halfOpenAfter ?? DEFAULT_CB_CONFIG.halfOpenAfter;

  const created = circuitBreaker(handleAll, {
    halfOpenAfter,
    breaker: new ConsecutiveBreaker(threshold),
  });
  nodeBreakers.set(nodeId, created);
  return created;
}
40
+
41
/**
 * Drop the cached circuit breaker for a node (e.g. when the peer leaves).
 * Does nothing when the node has no breaker.
 */
export function removeNodeCircuitBreaker(nodeId: string): void {
  nodeBreakers.delete(nodeId);
}
45
+
46
/**
 * Check whether a node's circuit is currently open (unhealthy).
 *
 * cockatiel exposes `state` as the numeric `CircuitState` enum, so it must be
 * compared against `CircuitState.Open`; comparing against the string "open"
 * never matches.
 *
 * @param nodeId Node to check.
 * @returns `true` only when a breaker exists for the node and it is open;
 *          `false` when no breaker has been created yet.
 */
export function isNodeCircuitOpen(nodeId: string): boolean {
  return nodeBreakers.get(nodeId)?.state === CircuitState.Open;
}
52
+
53
/**
 * Forget every cached circuit breaker (e.g. on shutdown).
 * Later lookups create fresh breakers.
 */
export function resetAllCircuitBreakers(): void {
  nodeBreakers.clear();
}
57
+
58
+ // ── Adaptive backoff computation ─────────────────────────────────
59
+
60
/** Parameters of an exponential backoff schedule. */
export interface BackoffParams {
  /** Base delay in ms (delay before jitter for attempt 0). */
  base: number;
  /** Maximum delay in ms (cap applied before jitter). */
  max: number;
}

/**
 * Compute an exponential backoff delay with jitter.
 *
 * The result is `min(base * 2^attempt, max) * random(0.5, 1.0)`, rounded to
 * the nearest millisecond.
 *
 * @param attempt Zero-based retry counter. NaN and negative values are treated
 *                as attempt 0 so the delay never drops below the base range.
 * @param params  Base and cap for the schedule.
 * @returns Delay in ms.
 */
export function computeBackoffDelay(attempt: number, params: BackoffParams): number {
  // `attempt > 0` is false for NaN and negatives, so both fall back to 0.
  // Cap the exponent at 1023 so 2 ** exponent stays finite; otherwise
  // `0 * Infinity` would produce a NaN delay when base is 0.
  const exponent = attempt > 0 ? Math.min(attempt, 1023) : 0;
  const capped = Math.min(params.base * 2 ** exponent, params.max);
  return Math.round(capped * (0.5 + Math.random() * 0.5));
}

/**
 * Get adaptive backoff parameters based on the WebSocket close code.
 *
 * @param code Close code of the last disconnect.
 * @returns A fresh `BackoffParams` object for that code.
 */
export function getReconnectBackoff(code: number): BackoffParams {
  switch (code) {
    case 4001: // auth failed → slow
      return { base: 30_000, max: 300_000 };
    case 4003: // auth timeout → medium
      return { base: 5_000, max: 60_000 };
    case 1006: // network error → fast
      return { base: 1_000, max: 10_000 };
    case 1001: // going away → medium-fast
      return { base: 2_000, max: 30_000 };
    default:
      return { base: 1_000, max: 60_000 };
  }
}