clawmatrix 0.4.2 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- import { EventEmitter } from "node:events";
1
+ import { EventEmitter } from "eventemitter3";
2
2
  import path from "node:path";
3
3
  import { homedir, tmpdir } from "node:os";
4
4
  import { createServer, type IncomingMessage, type ServerResponse, type Server } from "node:http";
@@ -22,10 +22,11 @@ import type {
22
22
  PeerSync,
23
23
  } from "./types.ts";
24
24
  import { PeerApprovalManager, type ChannelApi, type NotifyTarget } from "./peer-approval.ts";
25
+ import { computeBackoffDelay, getReconnectBackoff } from "./retry.ts";
25
26
  import { loadOrCreateIdentity } from "./identity.ts";
26
27
  import type { KeyPair } from "./crypto.ts";
27
28
 
28
- const RECONNECT_BASE = 1_000;
29
+ // Reconnect constants kept for RECONNECT_MAX reference in failover scheduling
29
30
  const RECONNECT_MAX = 60_000;
30
31
 
31
32
  /** Frame types that bypass dedup (streams share one id across chunks; responses share id with request). */
@@ -51,6 +52,8 @@ const SKIP_DEDUP_TYPES = new Set([
51
52
  "file_transfer_ack", "file_transfer_complete",
52
53
  ]);
53
54
 
55
+ // Reconnect backoff params are now provided by retry.ts → getReconnectBackoff()
56
+
54
57
  /** Classify WebSocket close code into a human-readable reason. */
55
58
  function classifyCloseReason(code: number, reason: string): string {
56
59
  if (reason) return reason;
@@ -94,6 +97,8 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
94
97
  /** Reconnect timers keyed by `nodeId|url` for per-channel reconnection. */
95
98
  private reconnectTimers = new Map<string, ReturnType<typeof setTimeout>>();
96
99
  private reconnectAttempts = new Map<string, number>();
100
+ /** Last close code per channel key for adaptive reconnect backoff. */
101
+ private lastCloseCode = new Map<string, number>();
97
102
  /** Track which nodeIds have already completed the full peer join (for multi-channel). */
98
103
  private joinedPeers = new Set<string>();
99
104
  /** All configured URLs per peer (for multi-URL peers). */
@@ -104,6 +109,10 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
104
109
  private urlProbeLatencies = new Map<string, number>();
105
110
  /** Route probe interval timer. */
106
111
  private probeTimer: ReturnType<typeof setInterval> | null = null;
112
+ /** EMA latency baseline per peer (for spike detection). */
113
+ private latencyBaselines = new Map<string, number>();
114
+ /** Last time a latency-triggered probe was fired per peer (debounce). */
115
+ private lastProbeTime = new Map<string, number>();
107
116
  /** Deferred disconnect timers — grace period before broadcasting peer_leave. */
108
117
  private disconnectGraceTimers = new Map<string, ReturnType<typeof setTimeout>>();
109
118
  private stopped = false;
@@ -121,10 +130,10 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
121
130
  private rateLimiter: RateLimiter;
122
131
  readonly approvalManager: PeerApprovalManager;
123
132
 
124
- constructor(config: ClawMatrixConfig, openclawVersion?: string) {
133
+ constructor(config: ClawMatrixConfig, openclawVersion?: string, openclawConfig?: Record<string, unknown>) {
125
134
  super();
126
135
  this.config = config;
127
- this.localDeviceInfo = collectDeviceInfo(openclawVersion);
136
+ this.localDeviceInfo = collectDeviceInfo(openclawVersion, openclawConfig);
128
137
  const acpAgents = config.acp?.enabled ? config.acp.agents : undefined;
129
138
  this.localCapabilities = {
130
139
  nodeId: config.nodeId,
@@ -285,7 +294,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
285
294
  }
286
295
 
287
296
  // ── Route probing (for multi-URL peers) ──────────────────────────
288
- private static readonly PROBE_INTERVAL = 3_600_000; // 1 hour
297
+ private static readonly PROBE_INTERVAL = 900_000; // 15 minutes
289
298
  /** Minimum improvement ratio to trigger a route switch. */
290
299
  private static readonly SWITCH_THRESHOLD = 0.7; // new must be ≤70% of current
291
300
 
@@ -385,6 +394,32 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
385
394
  this.switchRoute(nodeId, bestUrl);
386
395
  }
387
396
 
397
/**
 * Trigger an immediate route probe for a specific peer (debounced).
 *
 * Returns without doing anything when the peer has fewer than two known URLs
 * or when a probe for this peer was started less than 60 seconds ago.
 * Probing runs in the background; this method returns before it completes.
 *
 * @param nodeId Peer whose alternative URLs should be probed.
 */
private triggerProbeForPeer(nodeId: string) {
  const urls = this.peerUrls.get(nodeId);
  if (!urls || urls.length <= 1) return;

  const now = Date.now();
  const lastProbe = this.lastProbeTime.get(nodeId) ?? 0;
  if (now - lastProbe < 60_000) return; // debounce: 1 per minute
  this.lastProbeTime.set(nodeId, now);

  // Probe non-active URLs for this peer
  const activeUrl = this.activeUrls.get(nodeId);
  const runProbe = async () => {
    for (const url of urls) {
      if (url === activeUrl) continue;
      const latency = await this.probeUrl(url);
      if (latency !== null) this.urlProbeLatencies.set(url, latency);
    }
    if (activeUrl) {
      // The active URL is not probed; reuse the live route latency for it.
      const route = this.router.getRoute(nodeId);
      if (route && route.latencyMs > 0) this.urlProbeLatencies.set(activeUrl, route.latencyMs);
    }
    this.evaluateRouteSwitch(nodeId);
  };

  // Fire-and-forget: a failure inside the probe must not surface as an
  // unhandled promise rejection, so log it and move on.
  runProbe().catch((err: unknown) => {
    debug("peer", `triggerProbeForPeer(${nodeId}): probe failed: ${err}`);
  });
}
422
+
388
423
  /** Callback to check if there are active tasks involving a peer. Set by ClusterRuntime. */
389
424
  private activeTaskChecker: ((nodeId: string) => boolean) | null = null;
390
425
 
@@ -465,10 +500,13 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
465
500
  this.inboundIps.set(ws, ip);
466
501
  this.handleInboundOpen(ws, ip);
467
502
 
468
- ws.on("message", (data) => {
503
+ ws.on("message", (data, isBinary) => {
469
504
  const conn = this.inboundConnections.get(ws);
470
505
  if (conn) {
471
- conn.feedMessage(typeof data === "string" ? data : String(data));
506
+ // ws package always delivers data as Buffer; use isBinary to distinguish frame type.
507
+ // Text frames (e.g. base64 encrypted envelopes) must be passed as strings
508
+ // so Connection routes them through decryptBinary, not decryptBinaryRaw.
509
+ conn.feedMessage(isBinary ? (Buffer.isBuffer(data) ? data : Buffer.from(data as any)) : data.toString());
472
510
  }
473
511
  });
474
512
 
@@ -492,7 +530,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
492
530
  private handleInboundOpen(ws: WsWebSocket, ip: string) {
493
531
  // Wrap ws WebSocket into our WsTransport interface
494
532
  const transport: WsTransport = {
495
- send(data: string) {
533
+ send(data: string | Buffer) {
496
534
  ws.send(data);
497
535
  },
498
536
  close(code?: number, reason?: string) {
@@ -539,10 +577,44 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
539
577
/**
 * Start outbound connection(s) to a configured peer.
 *
 * A single-URL peer gets one channel. A multi-URL peer gets one channel per
 * URL, opened simultaneously; after a 10 s warm-up the channels are pruned via
 * pruneWarmupConnections(). Until then the first URL is recorded as active.
 *
 * @param peer Peer configuration; `url` may be a single URL or a list.
 */
private connectToPeer(peer: PeerConfig) {
  const urls = Array.isArray(peer.url) ? peer.url : [peer.url];
  this.peerUrls.set(peer.nodeId, urls);

  // Guard against an empty URL list: without it `undefined` would be stored
  // as the active URL and handed to connectToChannel().
  const [primaryUrl] = urls;
  if (primaryUrl === undefined) {
    debug("peer", `connectToPeer(${peer.nodeId}): no URLs configured, skipping`);
    return;
  }

  this.activeUrls.set(peer.nodeId, primaryUrl);
  if (urls.length === 1) {
    this.connectToChannel(peer.nodeId, primaryUrl);
    return;
  }

  // Multi-URL: connect all simultaneously, prune to best after 10s
  for (const url of urls) {
    this.connectToChannel(peer.nodeId, url);
  }
  setTimeout(() => this.pruneWarmupConnections(peer.nodeId), 10_000);
}
592
+
593
/**
 * After the warm-up period, keep only the lowest-latency open channel to a
 * peer and close every other open channel.
 *
 * A channel with no latency measurement yet (latencyMs <= 0) is ranked as if
 * its latency were 10 000 ms, so any measured channel is preferred over it.
 * Pruned channels are closed with code 1000 and reason "warm-up pruned".
 *
 * NOTE(review): the recorded active URL for the peer is not updated here —
 * confirm that it is reconciled elsewhere when the kept channel is not the
 * first URL.
 *
 * @param nodeId Peer whose channels should be pruned.
 */
private pruneWarmupConnections(nodeId: string) {
  const channels = this.router.getChannels(nodeId);
  if (channels.length <= 1) return;

  let best: Connection | null = null;
  let bestLatency = Infinity;
  for (const conn of channels) {
    if (!conn.isOpen) continue;
    const lat = conn.latencyMs > 0 ? conn.latencyMs : 10_000;
    if (lat < bestLatency) {
      bestLatency = lat;
      best = conn;
    }
  }

  // Count what is actually closed; channels that were already closed are
  // not "pruned" and must not be reported as such.
  let prunedCount = 0;
  for (const conn of channels) {
    if (conn !== best && conn.isOpen) {
      conn.close(1000, "warm-up pruned");
      prunedCount += 1;
    }
  }

  if (best) {
    debug("peer", `pruneWarmupConnections(${nodeId}): kept channel with latency=${best.latencyMs}ms, pruned ${prunedCount} others`);
  }
}
547
619
 
548
620
  /** Pick the best URL for a peer based on probe latencies. Falls back to first URL. */
@@ -571,10 +643,14 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
571
643
  const attempt = this.reconnectAttempts.get(channelKey) ?? 0;
572
644
  debug("peer", `connectToChannel(${nodeId}): attempt=${attempt} url=${url}`);
573
645
 
574
- // Use a common WS subprotocol for traffic disguise
575
- let ws: WebSocket;
646
+ // Use the `ws` package for outbound connections.
647
+ // Node.js 24+'s built-in WebSocket (undici) defaults binaryType to "blob",
648
+ // causing binary frames to arrive as Blob instead of Buffer — which
649
+ // onRawMessage cannot handle, silently dropping encrypted frames (including auth_ok).
650
+ // The `ws` package defaults to binaryType "nodebuffer", avoiding this issue.
651
+ let ws: InstanceType<typeof WsWebSocket>;
576
652
  try {
577
- ws = new WebSocket(url, ["graphql-transport-ws"]);
653
+ ws = new WsWebSocket(url, ["graphql-transport-ws"]);
578
654
  } catch (err) {
579
655
  debug("peer", `connectToChannel(${nodeId}): WebSocket constructor threw: ${err}`);
580
656
  this.scheduleChannelReconnect(nodeId, url);
@@ -596,6 +672,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
596
672
  conn.on("authenticated", (caps) => {
597
673
  debug("peer", `connectToChannel(${nodeId}): authenticated url=${url}`);
598
674
  this.reconnectAttempts.delete(channelKey);
675
+ this.lastCloseCode.delete(channelKey);
599
676
  this.onPeerAuthenticated(conn, caps);
600
677
  });
601
678
 
@@ -624,6 +701,14 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
624
701
  debug("peer", `connectToChannel(${nodeId}): self-connection, will not reconnect`);
625
702
  return;
626
703
  }
704
+ // Don't reconnect warm-up pruned channels
705
+ if (ev.reason === "warm-up pruned") {
706
+ debug("peer", `connectToChannel(${nodeId}): warm-up pruned, will not reconnect`);
707
+ return;
708
+ }
709
+ // Record close code for adaptive backoff
710
+ const channelKey = `${nodeId}|${url}`;
711
+ this.lastCloseCode.set(channelKey, ev.code);
627
712
  if (!lastError) {
628
713
  lastError = classifyCloseReason(ev.code, ev.reason);
629
714
  }
@@ -665,7 +750,8 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
665
750
  }
666
751
  }
667
752
 
668
- const delay = Math.min(RECONNECT_BASE * 2 ** attempt, RECONNECT_MAX);
753
+ const params = getReconnectBackoff(this.lastCloseCode.get(channelKey) ?? 1006);
754
+ const delay = computeBackoffDelay(attempt, params);
669
755
  this.reconnectAttempts.set(channelKey, attempt + 1);
670
756
  const tag = reason ? ` reason="${reason}"` : "";
671
757
  debug("peer", `scheduleChannelReconnect(${nodeId}): attempt=${attempt} delay=${delay}ms url=${url}${tag}`);
@@ -693,11 +779,62 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
693
779
  }
694
780
 
695
781
  // ── Peer lifecycle ─────────────────────────────────────────────
782
+
783
/**
 * Migrate tracking maps when a URL-derived nodeId (starts with "_") resolves
 * to the real nodeId.
 *
 * Moves the peer's URL list, active URL and per-channel reconnect state
 * (timers, attempt counters, last close codes) from `derivedId` to `realId`,
 * then rewrites the matching entry in `config.peers` in place.
 *
 * Entries already present under `realId` in the URL maps are kept; the
 * derived entries are dropped in that case.
 *
 * @param derivedId Placeholder nodeId the peer was tracked under.
 * @param realId    nodeId the peer is to be tracked under from now on.
 */
private migrateUrlDerivedPeer(derivedId: string, realId: string) {
  // Migrate peerUrls, activeUrls
  const urls = this.peerUrls.get(derivedId);
  if (urls) {
    this.peerUrls.delete(derivedId);
    if (!this.peerUrls.has(realId)) this.peerUrls.set(realId, urls);
  }
  const activeUrl = this.activeUrls.get(derivedId);
  if (activeUrl) {
    this.activeUrls.delete(derivedId);
    if (!this.activeUrls.has(realId)) this.activeUrls.set(realId, activeUrl);
  }

  // Migrate reconnect state: channelKey uses nodeId|url
  for (const url of urls ?? []) {
    const oldKey = `${derivedId}|${url}`;
    const newKey = `${realId}|${url}`;

    const timer = this.reconnectTimers.get(oldKey);
    if (timer) {
      this.reconnectTimers.delete(oldKey);
      if (this.reconnectTimers.has(newKey)) {
        // A reconnect is already pending under the real id. Overwriting the
        // map entry would orphan that timer (it would still fire), so cancel
        // the migrated one instead of tracking two.
        clearTimeout(timer);
      } else {
        this.reconnectTimers.set(newKey, timer);
      }
    }

    const attempts = this.reconnectAttempts.get(oldKey);
    if (attempts !== undefined) {
      this.reconnectAttempts.delete(oldKey);
      this.reconnectAttempts.set(newKey, attempts);
    }

    const code = this.lastCloseCode.get(oldKey);
    if (code !== undefined) {
      this.lastCloseCode.delete(oldKey);
      this.lastCloseCode.set(newKey, code);
    }
  }

  // Update the config peer entry in place for sentinel and other consumers
  for (const peer of this.config.peers) {
    if (peer.nodeId === derivedId) {
      (peer as { nodeId: string }).nodeId = realId;
      break;
    }
  }
  debug("peer", `migrateUrlDerivedPeer: ${derivedId} → ${realId}`);
}
816
+
696
817
  private onPeerAuthenticated(conn: Connection, caps: NodeCapabilities, ip?: string) {
697
818
  const nodeId = conn.remoteNodeId!;
698
819
  // Peer's persistent public key for TOFU identity binding
699
820
  const peerPublicKey = conn.remoteIdentityKey ?? undefined;
700
821
 
822
+ // If this outbound peer was configured with URL string shorthand (nodeId starts
823
+ // with "_"), resolve to the real nodeId from the authenticated handshake.
824
+ if (conn.role === "outbound") {
825
+ for (const peer of this.config.peers) {
826
+ if (peer.nodeId.startsWith("_") && this.peerUrls.get(peer.nodeId)) {
827
+ const urls = this.peerUrls.get(peer.nodeId)!;
828
+ // Check if this connection belongs to this URL-derived peer
829
+ const activeUrl = this.activeUrls.get(peer.nodeId);
830
+ if (activeUrl && urls.length > 0) {
831
+ this.migrateUrlDerivedPeer(peer.nodeId, nodeId);
832
+ break;
833
+ }
834
+ }
835
+ }
836
+ }
837
+
701
838
  // Prevent self-connection: close immediately if the remote side authenticated
702
839
  // with our own nodeId. For outbound this means the peer URL accidentally
703
840
  // points to self; for inbound it means a remote node is (mis)using our nodeId.
@@ -796,7 +933,16 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
796
933
  // Additional channel — just add to the channel pool, no peer_join broadcast
797
934
  this.router.addChannel(nodeId, conn);
798
935
  conn.on("message", (frame) => this.onFrame(frame, conn));
799
- conn.on("latency", () => this.router.updateActiveChannel(nodeId));
936
+ conn.on("latency", (latencyMs) => {
937
+ this.router.updateActiveChannel(nodeId);
938
+ // Update baseline and check for spike
939
+ const baseline = this.latencyBaselines.get(nodeId) ?? latencyMs;
940
+ const newBaseline = baseline * 0.8 + latencyMs * 0.2;
941
+ this.latencyBaselines.set(nodeId, newBaseline);
942
+ if (latencyMs > baseline * 2 && latencyMs > 200) {
943
+ this.triggerProbeForPeer(nodeId);
944
+ }
945
+ });
800
946
  conn.on("close", () => this.onChannelDisconnected(conn));
801
947
  const channelCount = this.router.getChannelCount(nodeId);
802
948
  debug("peer", `completePeerJoin(${nodeId}): additional channel added (total=${channelCount})`);
@@ -830,7 +976,16 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
830
976
  this.joinedPeers.add(nodeId);
831
977
 
832
978
  conn.on("message", (frame) => this.onFrame(frame, conn));
833
- conn.on("latency", () => this.router.updateActiveChannel(nodeId));
979
+ conn.on("latency", (latencyMs) => {
980
+ this.router.updateActiveChannel(nodeId);
981
+ // Update baseline and check for spike
982
+ const baseline = this.latencyBaselines.get(nodeId) ?? latencyMs;
983
+ const newBaseline = baseline * 0.8 + latencyMs * 0.2;
984
+ this.latencyBaselines.set(nodeId, newBaseline);
985
+ if (latencyMs > baseline * 2 && latencyMs > 200) {
986
+ this.triggerProbeForPeer(nodeId);
987
+ }
988
+ });
834
989
  conn.on("close", () => this.onChannelDisconnected(conn));
835
990
 
836
991
  this.sendPeerSync(conn);
@@ -862,6 +1017,9 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
862
1017
 
863
1018
  audit("peer_join", { nodeId, detail: `agents=${caps.agents.length} models=${caps.models.length}` });
864
1019
  this.emit("peerConnected", nodeId);
1020
+
1021
+ // Probe alternative routes after reconnect (connectivity may have changed)
1022
+ setTimeout(() => this.triggerProbeForPeer(nodeId), 5_000);
865
1023
  }
866
1024
 
867
1025
  /** Handle a single channel disconnecting (multi-channel aware). */
@@ -901,6 +1059,10 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
901
1059
  if (nodeId === this.config.nodeId) {
902
1060
  this.router.removePeer(nodeId);
903
1061
  this.joinedPeers.delete(nodeId);
1062
+ this.latencyBaselines.delete(nodeId);
1063
+ this.lastProbeTime.delete(nodeId);
1064
+ // Clear the delta-sync version so that a reconnect sends a full peer_sync
1065
+ this.peerSyncVersions.delete(nodeId);
904
1066
  return;
905
1067
  }
906
1068
 
@@ -950,6 +1112,9 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
950
1112
  audit("peer_leave", { nodeId });
951
1113
  this.router.removePeer(nodeId);
952
1114
  this.joinedPeers.delete(nodeId);
1115
+ this.latencyBaselines.delete(nodeId);
1116
+ this.lastProbeTime.delete(nodeId);
1117
+ this.peerSyncVersions.delete(nodeId);
953
1118
 
954
1119
  // Remove satellite contexts that were only reachable via this peer
955
1120
  for (let i = this.satelliteContexts.length - 1; i >= 0; i--) {
@@ -1076,13 +1241,19 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
1076
1241
  this.emit("frame", frame, from);
1077
1242
  }
1078
1243
 
1244
+ /** Last sync version sent to each direct peer. */
1245
+ private peerSyncVersions = new Map<string, number>();
1246
+
1079
1247
  private sendPeerSync(conn: Connection) {
1080
- const peers = this.router.buildPeerSyncPayload();
1081
- const payload: Record<string, unknown> = { peers };
1248
+ const remoteNodeId = conn.remoteNodeId ?? "";
1249
+ const sinceVersion = this.peerSyncVersions.get(remoteNodeId) ?? 0;
1250
+ const delta = this.router.buildPeerSyncDelta(sinceVersion);
1251
+ const payload: Record<string, unknown> = { ...delta };
1082
1252
  if (this.satelliteContexts.length > 0) {
1083
1253
  payload.satellites = this.satelliteContexts;
1084
1254
  }
1085
- conn.send({
1255
+ this.peerSyncVersions.set(remoteNodeId, delta.version);
1256
+ conn.sendDirect({
1086
1257
  type: "peer_sync",
1087
1258
  from: this.config.nodeId,
1088
1259
  timestamp: Date.now(),
@@ -1105,34 +1276,53 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
1105
1276
  }
1106
1277
 
1107
1278
  let changed = false;
1108
- for (const peer of frame.payload.peers) {
1109
- if (peer.nodeId === this.config.nodeId) continue;
1110
- if (peer.nodeId === from.remoteNodeId) {
1111
- const prev = this.router.getRoute(peer.nodeId);
1112
- const hadAgents = prev?.agents.length ?? 0;
1113
- const hadDirectPeers = prev?.directPeers.length ?? 0;
1114
- const hadDeviceInfo = prev?.deviceInfo?.hostname;
1115
- const hadAcpAgents = prev?.acpAgents?.length ?? 0;
1116
- const hadToolProxyEnabled = prev?.toolProxy?.enabled;
1117
- const hadToolProxyCatalogLen = prev?.toolProxy?.catalog?.length ?? 0;
1118
- const hadToolProxyAllowLen = prev?.toolProxy?.allow?.length ?? 0;
1119
- this.router.updatePeerCapabilities(peer.nodeId, peer);
1120
- if (peer.agents.length !== hadAgents || peer.models.length !== (prev?.models.length ?? 0)
1121
- || (peer.directPeers?.length ?? 0) !== hadDirectPeers
1122
- || peer.toolProxy?.enabled !== hadToolProxyEnabled
1123
- || (peer.toolProxy?.catalog?.length ?? 0) !== hadToolProxyCatalogLen
1124
- || (peer.toolProxy?.allow?.length ?? 0) !== hadToolProxyAllowLen
1125
- || peer.deviceInfo?.hostname !== hadDeviceInfo
1126
- || (peer.acpAgents?.length ?? 0) !== hadAcpAgents) {
1279
+
1280
+ // Delta sync: apply removed, then added/updated
1281
+ if (frame.payload.version != null && frame.payload.removed) {
1282
+ for (const nodeId of frame.payload.removed) {
1283
+ if (nodeId === this.config.nodeId) continue;
1284
+ // Only remove relay routes — don't remove direct connections
1285
+ const route = this.router.getRoute(nodeId);
1286
+ if (route && !route.connection) {
1287
+ this.router.removePeer(nodeId);
1127
1288
  changed = true;
1128
1289
  }
1129
- } else {
1130
- // Skip if the remote peer only knows about this node through us —
1131
- // using them as relay would create a routing loop.
1132
- if (peer.reachableVia === this.config.nodeId) continue;
1133
- const existing = this.router.getRoute(peer.nodeId);
1134
- if (!existing) changed = true;
1135
- this.router.addRelayPeer(peer, from.remoteNodeId!);
1290
+ }
1291
+ }
1292
+
1293
+ // Process peers from either full sync or delta (updated field)
1294
+ // Delta sets peers = updated for backward compat, so this works for both
1295
+ const peersToProcess = frame.payload.peers;
1296
+ if (peersToProcess) {
1297
+ for (const peer of peersToProcess) {
1298
+ if (peer.nodeId === this.config.nodeId) continue;
1299
+ if (peer.nodeId === from.remoteNodeId) {
1300
+ const prev = this.router.getRoute(peer.nodeId);
1301
+ const hadAgents = prev?.agents.length ?? 0;
1302
+ const hadDirectPeers = prev?.directPeers.length ?? 0;
1303
+ const hadDeviceInfo = prev?.deviceInfo?.hostname;
1304
+ const hadAcpAgents = prev?.acpAgents?.length ?? 0;
1305
+ const hadToolProxyEnabled = prev?.toolProxy?.enabled;
1306
+ const hadToolProxyCatalogLen = prev?.toolProxy?.catalog?.length ?? 0;
1307
+ const hadToolProxyAllowLen = prev?.toolProxy?.allow?.length ?? 0;
1308
+ this.router.updatePeerCapabilities(peer.nodeId, peer);
1309
+ if (peer.agents.length !== hadAgents || peer.models.length !== (prev?.models.length ?? 0)
1310
+ || (peer.directPeers?.length ?? 0) !== hadDirectPeers
1311
+ || peer.toolProxy?.enabled !== hadToolProxyEnabled
1312
+ || (peer.toolProxy?.catalog?.length ?? 0) !== hadToolProxyCatalogLen
1313
+ || (peer.toolProxy?.allow?.length ?? 0) !== hadToolProxyAllowLen
1314
+ || peer.deviceInfo?.hostname !== hadDeviceInfo
1315
+ || (peer.acpAgents?.length ?? 0) !== hadAcpAgents) {
1316
+ changed = true;
1317
+ }
1318
+ } else {
1319
+ // Skip if the remote peer only knows about this node through us —
1320
+ // using them as relay would create a routing loop.
1321
+ if (peer.reachableVia === this.config.nodeId) continue;
1322
+ const existing = this.router.getRoute(peer.nodeId);
1323
+ if (!existing) changed = true;
1324
+ this.router.addRelayPeer(peer, from.remoteNodeId!);
1325
+ }
1136
1326
  }
1137
1327
  }
1138
1328
 
package/src/retry.ts ADDED
@@ -0,0 +1,81 @@
1
+ /**
2
+ * Shared retry/backoff/circuit-breaker utilities backed by cockatiel.
3
+ *
4
+ * Provides:
5
+ * - Per-node circuit breakers for mesh health tracking
6
+ * - Adaptive backoff delay computation
7
+ */
8
+
9
import { circuitBreaker, CircuitState, ConsecutiveBreaker, handleAll, SamplingBreaker } from "cockatiel";
10
+
11
+ // ── Per-node circuit breakers ────────────────────────────────────
12
+
13
/** Optional tuning knobs for a per-node circuit breaker. */
export interface CircuitBreakerConfig {
  /** Consecutive failures required before the circuit opens (default: 5). */
  threshold?: number;
  /** Milliseconds to wait before a half-open probe is attempted (default: 30000). */
  halfOpenAfter?: number;
}

/** Values used for any option the caller leaves unset. */
const DEFAULT_CB_CONFIG: Required<CircuitBreakerConfig> = {
  threshold: 5,
  halfOpenAfter: 30_000,
};

/** Circuit breakers created so far, keyed by nodeId. */
const nodeBreakers = new Map<string, ReturnType<typeof circuitBreaker>>();
26
+
27
/**
 * Get or create a circuit breaker for a node. Breakers are cached per nodeId.
 *
 * `config` is only consulted when the breaker is first created; later calls
 * for the same nodeId return the cached breaker and ignore `config`.
 *
 * @param nodeId Node the breaker belongs to.
 * @param config Optional overrides; unset or `undefined` fields use the defaults.
 * @returns The cached or newly created breaker for `nodeId`.
 */
export function getNodeCircuitBreaker(nodeId: string, config?: CircuitBreakerConfig) {
  const cached = nodeBreakers.get(nodeId);
  if (cached) return cached;

  // Resolve each option with `??` rather than object spread: a spread would
  // let an explicitly passed `undefined` overwrite the default.
  const threshold = config?.threshold ?? DEFAULT_CB_CONFIG.threshold;
  const halfOpenAfter = config?.halfOpenAfter ?? DEFAULT_CB_CONFIG.halfOpenAfter;

  const created = circuitBreaker(handleAll, {
    halfOpenAfter,
    breaker: new ConsecutiveBreaker(threshold),
  });
  nodeBreakers.set(nodeId, created);
  return created;
}
40
+
41
/**
 * Drop the cached circuit breaker for a node (e.g. when the peer leaves).
 * Does nothing when no breaker is cached for `nodeId`.
 */
export function removeNodeCircuitBreaker(nodeId: string) {
  nodeBreakers.delete(nodeId);
}
45
+
46
/**
 * Check if a node's circuit is currently open (unhealthy).
 *
 * @param nodeId Node to look up.
 * @returns `true` only when a breaker is cached for `nodeId` and its state is
 *          `CircuitState.Open`; `false` when no breaker exists or it is in any
 *          other state.
 */
export function isNodeCircuitOpen(nodeId: string): boolean {
  const cb = nodeBreakers.get(nodeId);
  if (!cb) return false;
  // `state` is cockatiel's CircuitState enum, not a string, so comparing it
  // against the literal "open" can never match.
  return cb.state === CircuitState.Open;
}
52
+
53
/**
 * Forget every cached circuit breaker (e.g. on shutdown). A later
 * getNodeCircuitBreaker() call for any node creates a fresh breaker.
 */
export function resetAllCircuitBreakers() {
  nodeBreakers.clear();
}
57
+
58
+ // ── Adaptive backoff computation ─────────────────────────────────
59
+
60
/** Parameters of an exponential backoff schedule. */
export interface BackoffParams {
  /** Base delay in ms. */
  base: number;
  /** Maximum delay in ms. */
  max: number;
}

/**
 * Compute an exponential backoff delay with jitter.
 *
 * The un-jittered delay is `min(base * 2^attempt, max)`; the result is that
 * value scaled by a random factor in [0.5, 1.0) and rounded to whole
 * milliseconds.
 *
 * @param attempt Zero-based retry attempt. Negative values and NaN are treated
 *                as attempt 0 so the delay never drops below half of `base`.
 * @param params  Base and maximum delay in ms.
 * @returns Delay in ms.
 */
export function computeBackoffDelay(attempt: number, params: BackoffParams): number {
  // A negative attempt would shrink the delay below the base, and NaN would
  // produce a NaN delay (which timers treat as "fire immediately").
  const safeAttempt = Number.isNaN(attempt) ? 0 : Math.max(0, attempt);
  const raw = Math.min(params.base * 2 ** safeAttempt, params.max);
  return Math.round(raw * (0.5 + Math.random() * 0.5));
}
73
+
74
/**
 * Select backoff parameters for a reconnect based on the WebSocket close code
 * of the previous connection. Codes without a dedicated entry use the default
 * schedule. A fresh object is returned on every call.
 */
export function getReconnectBackoff(code: number): BackoffParams {
  switch (code) {
    case 4001: // auth failed → slow
      return { base: 30_000, max: 300_000 };
    case 4003: // auth timeout → medium
      return { base: 5_000, max: 60_000 };
    case 1006: // network error → fast
      return { base: 1_000, max: 10_000 };
    case 1001: // going away → medium-fast
      return { base: 2_000, max: 30_000 };
    default:
      return { base: 1_000, max: 60_000 };
  }
}