clawmatrix 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -21
- package/cli/bin/clawmatrix.mjs +300 -1
- package/package.json +8 -1
- package/src/acp-proxy.ts +122 -50
- package/src/{web.ts → api.ts} +646 -25
- package/src/audit.ts +37 -2
- package/src/auth.ts +5 -10
- package/src/automation.ts +625 -0
- package/src/cluster-service.ts +172 -16
- package/src/compat.ts +103 -0
- package/src/config.ts +75 -27
- package/src/connection.ts +215 -37
- package/src/crypto.ts +72 -5
- package/src/device-info.ts +21 -2
- package/src/file-transfer.ts +3 -2
- package/src/handoff.ts +90 -32
- package/src/health-tracker.ts +91 -356
- package/src/index.ts +421 -13
- package/src/kanban.ts +507 -0
- package/src/knowledge-sync.ts +158 -7
- package/src/local-tools.ts +65 -2
- package/src/log-replication.ts +198 -0
- package/src/model-proxy.ts +152 -60
- package/src/peer-approval.ts +3 -2
- package/src/peer-manager.ts +236 -44
- package/src/retry.ts +81 -0
- package/src/router.ts +152 -104
- package/src/sentinel.ts +85 -51
- package/src/store.ts +578 -0
- package/src/terminal.ts +17 -8
- package/src/tool-proxy.ts +6 -5
- package/src/tools/cluster-events.ts +6 -6
- package/src/tools/cluster-kanban.ts +345 -0
- package/src/tools/cluster-peers.ts +1 -1
- package/src/tools/cluster-query.ts +145 -0
- package/src/types.ts +95 -9
package/src/peer-manager.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { EventEmitter } from "
|
|
1
|
+
import { EventEmitter } from "eventemitter3";
|
|
2
2
|
import path from "node:path";
|
|
3
3
|
import { homedir, tmpdir } from "node:os";
|
|
4
4
|
import { createServer, type IncomingMessage, type ServerResponse, type Server } from "node:http";
|
|
@@ -22,10 +22,11 @@ import type {
|
|
|
22
22
|
PeerSync,
|
|
23
23
|
} from "./types.ts";
|
|
24
24
|
import { PeerApprovalManager, type ChannelApi, type NotifyTarget } from "./peer-approval.ts";
|
|
25
|
+
import { computeBackoffDelay, getReconnectBackoff } from "./retry.ts";
|
|
25
26
|
import { loadOrCreateIdentity } from "./identity.ts";
|
|
26
27
|
import type { KeyPair } from "./crypto.ts";
|
|
27
28
|
|
|
28
|
-
|
|
29
|
+
// Reconnect constants kept for RECONNECT_MAX reference in failover scheduling
|
|
29
30
|
const RECONNECT_MAX = 60_000;
|
|
30
31
|
|
|
31
32
|
/** Frame types that bypass dedup (streams share one id across chunks; responses share id with request). */
|
|
@@ -51,6 +52,8 @@ const SKIP_DEDUP_TYPES = new Set([
|
|
|
51
52
|
"file_transfer_ack", "file_transfer_complete",
|
|
52
53
|
]);
|
|
53
54
|
|
|
55
|
+
// Reconnect backoff params are now provided by retry.ts → getReconnectBackoff()
|
|
56
|
+
|
|
54
57
|
/** Classify WebSocket close code into a human-readable reason. */
|
|
55
58
|
function classifyCloseReason(code: number, reason: string): string {
|
|
56
59
|
if (reason) return reason;
|
|
@@ -94,6 +97,8 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
94
97
|
/** Reconnect timers keyed by `nodeId|url` for per-channel reconnection. */
|
|
95
98
|
private reconnectTimers = new Map<string, ReturnType<typeof setTimeout>>();
|
|
96
99
|
private reconnectAttempts = new Map<string, number>();
|
|
100
|
+
/** Last close code per channel key for adaptive reconnect backoff. */
|
|
101
|
+
private lastCloseCode = new Map<string, number>();
|
|
97
102
|
/** Track which nodeIds have already completed the full peer join (for multi-channel). */
|
|
98
103
|
private joinedPeers = new Set<string>();
|
|
99
104
|
/** All configured URLs per peer (for multi-URL peers). */
|
|
@@ -104,6 +109,10 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
104
109
|
private urlProbeLatencies = new Map<string, number>();
|
|
105
110
|
/** Route probe interval timer. */
|
|
106
111
|
private probeTimer: ReturnType<typeof setInterval> | null = null;
|
|
112
|
+
/** EMA latency baseline per peer (for spike detection). */
|
|
113
|
+
private latencyBaselines = new Map<string, number>();
|
|
114
|
+
/** Last time a latency-triggered probe was fired per peer (debounce). */
|
|
115
|
+
private lastProbeTime = new Map<string, number>();
|
|
107
116
|
/** Deferred disconnect timers — grace period before broadcasting peer_leave. */
|
|
108
117
|
private disconnectGraceTimers = new Map<string, ReturnType<typeof setTimeout>>();
|
|
109
118
|
private stopped = false;
|
|
@@ -121,10 +130,10 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
121
130
|
private rateLimiter: RateLimiter;
|
|
122
131
|
readonly approvalManager: PeerApprovalManager;
|
|
123
132
|
|
|
124
|
-
constructor(config: ClawMatrixConfig, openclawVersion?: string) {
|
|
133
|
+
constructor(config: ClawMatrixConfig, openclawVersion?: string, openclawConfig?: Record<string, unknown>) {
|
|
125
134
|
super();
|
|
126
135
|
this.config = config;
|
|
127
|
-
this.localDeviceInfo = collectDeviceInfo(openclawVersion);
|
|
136
|
+
this.localDeviceInfo = collectDeviceInfo(openclawVersion, openclawConfig);
|
|
128
137
|
const acpAgents = config.acp?.enabled ? config.acp.agents : undefined;
|
|
129
138
|
this.localCapabilities = {
|
|
130
139
|
nodeId: config.nodeId,
|
|
@@ -285,7 +294,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
285
294
|
}
|
|
286
295
|
|
|
287
296
|
// ── Route probing (for multi-URL peers) ──────────────────────────
|
|
288
|
-
private static readonly PROBE_INTERVAL =
|
|
297
|
+
private static readonly PROBE_INTERVAL = 900_000; // 15 minutes
|
|
289
298
|
/** Minimum improvement ratio to trigger a route switch. */
|
|
290
299
|
private static readonly SWITCH_THRESHOLD = 0.7; // new must be ≤70% of current
|
|
291
300
|
|
|
@@ -385,6 +394,32 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
385
394
|
this.switchRoute(nodeId, bestUrl);
|
|
386
395
|
}
|
|
387
396
|
|
|
397
|
+
/**
 * Trigger an immediate route probe for a specific peer (debounced).
 *
 * Does nothing when the peer has at most one configured URL, or when a probe
 * for this peer was already started within the last 60 seconds. Otherwise it
 * probes every configured URL except the currently active one, records the
 * measured latencies in `urlProbeLatencies`, refreshes the active URL's entry
 * from the router's current route latency, and finally asks
 * `evaluateRouteSwitch` whether a better route is available.
 *
 * The probe runs in the background; this method returns immediately.
 *
 * @param nodeId Peer whose alternative URLs should be probed.
 */
private triggerProbeForPeer(nodeId: string) {
  const urls = this.peerUrls.get(nodeId);
  if (!urls || urls.length <= 1) return;

  const now = Date.now();
  const lastProbe = this.lastProbeTime.get(nodeId) ?? 0;
  if (now - lastProbe < 60_000) return; // debounce: 1 per minute
  this.lastProbeTime.set(nodeId, now);

  // Snapshot the active URL up front so the whole probe run compares against
  // the same route even if it changes while we are awaiting.
  const activeUrl = this.activeUrls.get(nodeId);

  const runProbe = async (): Promise<void> => {
    // Probe non-active URLs for this peer
    for (const url of urls) {
      if (url === activeUrl) continue;
      const latency = await this.probeUrl(url);
      if (latency !== null) this.urlProbeLatencies.set(url, latency);
    }

    // The manager may have been stopped while probes were in flight; do not
    // start a route switch in that case.
    if (this.stopped) return;

    if (activeUrl) {
      const route = this.router.getRoute(nodeId);
      if (route && route.latencyMs > 0) this.urlProbeLatencies.set(activeUrl, route.latencyMs);
    }
    this.evaluateRouteSwitch(nodeId);
  };

  // Fire-and-forget, but never let a failing probe surface as an unhandled
  // promise rejection.
  void runProbe().catch((err: unknown) => {
    const detail = err instanceof Error ? err.message : String(err);
    debug("peer", `triggerProbeForPeer(${nodeId}): probe failed: ${detail}`);
  });
}
|
|
422
|
+
|
|
388
423
|
/** Callback to check if there are active tasks involving a peer. Set by ClusterRuntime. */
|
|
389
424
|
private activeTaskChecker: ((nodeId: string) => boolean) | null = null;
|
|
390
425
|
|
|
@@ -465,10 +500,13 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
465
500
|
this.inboundIps.set(ws, ip);
|
|
466
501
|
this.handleInboundOpen(ws, ip);
|
|
467
502
|
|
|
468
|
-
ws.on("message", (data) => {
|
|
503
|
+
ws.on("message", (data, isBinary) => {
|
|
469
504
|
const conn = this.inboundConnections.get(ws);
|
|
470
505
|
if (conn) {
|
|
471
|
-
|
|
506
|
+
// ws package always delivers data as Buffer; use isBinary to distinguish frame type.
|
|
507
|
+
// Text frames (e.g. base64 encrypted envelopes) must be passed as strings
|
|
508
|
+
// so Connection routes them through decryptBinary, not decryptBinaryRaw.
|
|
509
|
+
conn.feedMessage(isBinary ? (Buffer.isBuffer(data) ? data : Buffer.from(data as any)) : data.toString());
|
|
472
510
|
}
|
|
473
511
|
});
|
|
474
512
|
|
|
@@ -492,7 +530,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
492
530
|
private handleInboundOpen(ws: WsWebSocket, ip: string) {
|
|
493
531
|
// Wrap ws WebSocket into our WsTransport interface
|
|
494
532
|
const transport: WsTransport = {
|
|
495
|
-
send(data: string) {
|
|
533
|
+
send(data: string | Buffer) {
|
|
496
534
|
ws.send(data);
|
|
497
535
|
},
|
|
498
536
|
close(code?: number, reason?: string) {
|
|
@@ -539,10 +577,44 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
539
577
|
/**
 * Open the outbound channel(s) for a configured peer.
 *
 * The peer's URL list is recorded in `peerUrls` and its first URL becomes the
 * initial active URL. A single-URL peer gets one channel. A multi-URL peer
 * gets a channel per URL at once, and after a 10 second warm-up
 * `pruneWarmupConnections` is asked to keep only the best one.
 *
 * @param peer Peer configuration; `url` may be a single URL or a list.
 */
private connectToPeer(peer: PeerConfig) {
  const urls = Array.isArray(peer.url) ? peer.url : [peer.url];
  this.peerUrls.set(peer.nodeId, urls);

  // Guard against an empty (or missing) URL list: there is nothing to dial,
  // and registering `undefined` as the active URL would poison later lookups.
  const [primaryUrl] = urls;
  if (primaryUrl === undefined) {
    debug("peer", `connectToPeer(${peer.nodeId}): no URL configured, skipping`);
    return;
  }

  this.activeUrls.set(peer.nodeId, primaryUrl);

  if (urls.length === 1) {
    this.connectToChannel(peer.nodeId, primaryUrl);
    return;
  }

  // Multi-URL: connect all simultaneously, prune to best after 10s
  for (const url of urls) {
    this.connectToChannel(peer.nodeId, url);
  }
  setTimeout(() => {
    // Skip pruning if the manager was stopped during the warm-up window.
    if (this.stopped) return;
    this.pruneWarmupConnections(peer.nodeId);
  }, 10_000);
}
|
|
592
|
+
|
|
593
|
+
/**
 * After the warm-up period, keep only the lowest-latency open channel for a
 * peer and close the rest.
 *
 * Channels that have not reported a latency yet (`latencyMs <= 0`) are ranked
 * as if they had 10 000 ms latency. Ties keep the channel seen first.
 * Pruned channels are closed with code 1000 and the reason "warm-up pruned".
 *
 * @param nodeId Peer whose channel pool should be reduced to a single channel.
 */
private pruneWarmupConnections(nodeId: string) {
  // Work on a snapshot: closing a connection may remove it from the router's
  // channel list while we are still iterating (NOTE(review): confirm whether
  // getChannels returns a live array; the copy is harmless either way).
  const channels = [...this.router.getChannels(nodeId)];
  if (channels.length <= 1) return;

  let best: Connection | null = null;
  let bestLatency = Infinity;
  for (const conn of channels) {
    if (!conn.isOpen) continue;
    const lat = conn.latencyMs > 0 ? conn.latencyMs : 10_000;
    if (lat < bestLatency) {
      bestLatency = lat;
      best = conn;
    }
  }

  // Count what is actually closed so the log line is accurate even when some
  // channels were already closed before pruning.
  let prunedCount = 0;
  for (const conn of channels) {
    if (conn !== best && conn.isOpen) {
      conn.close(1000, "warm-up pruned");
      prunedCount += 1;
    }
  }

  if (best) {
    debug("peer", `pruneWarmupConnections(${nodeId}): kept channel with latency=${best.latencyMs}ms, pruned ${prunedCount} others`);
  }
}
|
|
547
619
|
|
|
548
620
|
/** Pick the best URL for a peer based on probe latencies. Falls back to first URL. */
|
|
@@ -596,6 +668,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
596
668
|
conn.on("authenticated", (caps) => {
|
|
597
669
|
debug("peer", `connectToChannel(${nodeId}): authenticated url=${url}`);
|
|
598
670
|
this.reconnectAttempts.delete(channelKey);
|
|
671
|
+
this.lastCloseCode.delete(channelKey);
|
|
599
672
|
this.onPeerAuthenticated(conn, caps);
|
|
600
673
|
});
|
|
601
674
|
|
|
@@ -624,6 +697,14 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
624
697
|
debug("peer", `connectToChannel(${nodeId}): self-connection, will not reconnect`);
|
|
625
698
|
return;
|
|
626
699
|
}
|
|
700
|
+
// Don't reconnect warm-up pruned channels
|
|
701
|
+
if (ev.reason === "warm-up pruned") {
|
|
702
|
+
debug("peer", `connectToChannel(${nodeId}): warm-up pruned, will not reconnect`);
|
|
703
|
+
return;
|
|
704
|
+
}
|
|
705
|
+
// Record close code for adaptive backoff
|
|
706
|
+
const channelKey = `${nodeId}|${url}`;
|
|
707
|
+
this.lastCloseCode.set(channelKey, ev.code);
|
|
627
708
|
if (!lastError) {
|
|
628
709
|
lastError = classifyCloseReason(ev.code, ev.reason);
|
|
629
710
|
}
|
|
@@ -665,7 +746,8 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
665
746
|
}
|
|
666
747
|
}
|
|
667
748
|
|
|
668
|
-
const
|
|
749
|
+
const params = getReconnectBackoff(this.lastCloseCode.get(channelKey) ?? 1006);
|
|
750
|
+
const delay = computeBackoffDelay(attempt, params);
|
|
669
751
|
this.reconnectAttempts.set(channelKey, attempt + 1);
|
|
670
752
|
const tag = reason ? ` reason="${reason}"` : "";
|
|
671
753
|
debug("peer", `scheduleChannelReconnect(${nodeId}): attempt=${attempt} delay=${delay}ms url=${url}${tag}`);
|
|
@@ -693,11 +775,62 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
693
775
|
}
|
|
694
776
|
|
|
695
777
|
// ── Peer lifecycle ─────────────────────────────────────────────
|
|
778
|
+
|
|
779
|
+
/**
 * Re-key per-peer tracking state from a URL-derived placeholder nodeId
 * (one starting with "_") to the peer's real nodeId.
 *
 * Moves the `peerUrls` and `activeUrls` entries (without overwriting entries
 * the real nodeId already has), re-keys the per-channel reconnect timer,
 * attempt counter and last close code for every URL of the derived peer, and
 * rewrites the matching entry in `config.peers` in place.
 *
 * @param derivedId Placeholder nodeId the peer was registered under.
 * @param realId    NodeId the peer should be tracked under from now on.
 */
private migrateUrlDerivedPeer(derivedId: string, realId: string) {
  // peerUrls: drop the placeholder entry, keep an existing real entry if any.
  const derivedUrls = this.peerUrls.get(derivedId);
  if (derivedUrls) {
    this.peerUrls.delete(derivedId);
    if (!this.peerUrls.has(realId)) {
      this.peerUrls.set(realId, derivedUrls);
    }
  }

  // activeUrls: same policy as peerUrls.
  const derivedActiveUrl = this.activeUrls.get(derivedId);
  if (derivedActiveUrl) {
    this.activeUrls.delete(derivedId);
    if (!this.activeUrls.has(realId)) {
      this.activeUrls.set(realId, derivedActiveUrl);
    }
  }

  // Reconnect state is keyed by `nodeId|url`, so each URL's key must be rebuilt.
  for (const url of derivedUrls ?? []) {
    const fromKey = `${derivedId}|${url}`;
    const toKey = `${realId}|${url}`;

    const pendingTimer = this.reconnectTimers.get(fromKey);
    if (pendingTimer) {
      this.reconnectTimers.delete(fromKey);
      this.reconnectTimers.set(toKey, pendingTimer);
    }

    // Attempt counters and close codes may legitimately be 0, so test against
    // undefined rather than truthiness.
    for (const numericState of [this.reconnectAttempts, this.lastCloseCode]) {
      const value = numericState.get(fromKey);
      if (value === undefined) continue;
      numericState.delete(fromKey);
      numericState.set(toKey, value);
    }
  }

  // Update the config peer entry in place for sentinel and other consumers.
  for (const entry of this.config.peers) {
    if (entry.nodeId !== derivedId) continue;
    (entry as { nodeId: string }).nodeId = realId;
    break;
  }

  debug("peer", `migrateUrlDerivedPeer: ${derivedId} → ${realId}`);
}
|
|
812
|
+
|
|
696
813
|
private onPeerAuthenticated(conn: Connection, caps: NodeCapabilities, ip?: string) {
|
|
697
814
|
const nodeId = conn.remoteNodeId!;
|
|
698
815
|
// Peer's persistent public key for TOFU identity binding
|
|
699
816
|
const peerPublicKey = conn.remoteIdentityKey ?? undefined;
|
|
700
817
|
|
|
818
|
+
// If this outbound peer was configured with URL string shorthand (nodeId starts
|
|
819
|
+
// with "_"), resolve to the real nodeId from the authenticated handshake.
|
|
820
|
+
if (conn.role === "outbound") {
|
|
821
|
+
for (const peer of this.config.peers) {
|
|
822
|
+
if (peer.nodeId.startsWith("_") && this.peerUrls.get(peer.nodeId)) {
|
|
823
|
+
const urls = this.peerUrls.get(peer.nodeId)!;
|
|
824
|
+
// Check if this connection belongs to this URL-derived peer
|
|
825
|
+
const activeUrl = this.activeUrls.get(peer.nodeId);
|
|
826
|
+
if (activeUrl && urls.length > 0) {
|
|
827
|
+
this.migrateUrlDerivedPeer(peer.nodeId, nodeId);
|
|
828
|
+
break;
|
|
829
|
+
}
|
|
830
|
+
}
|
|
831
|
+
}
|
|
832
|
+
}
|
|
833
|
+
|
|
701
834
|
// Prevent self-connection: close immediately if the remote side authenticated
|
|
702
835
|
// with our own nodeId. For outbound this means the peer URL accidentally
|
|
703
836
|
// points to self; for inbound it means a remote node is (mis)using our nodeId.
|
|
@@ -796,11 +929,26 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
796
929
|
// Additional channel — just add to the channel pool, no peer_join broadcast
|
|
797
930
|
this.router.addChannel(nodeId, conn);
|
|
798
931
|
conn.on("message", (frame) => this.onFrame(frame, conn));
|
|
799
|
-
conn.on("latency", () =>
|
|
932
|
+
conn.on("latency", (latencyMs) => {
|
|
933
|
+
this.router.updateActiveChannel(nodeId);
|
|
934
|
+
// Update baseline and check for spike
|
|
935
|
+
const baseline = this.latencyBaselines.get(nodeId) ?? latencyMs;
|
|
936
|
+
const newBaseline = baseline * 0.8 + latencyMs * 0.2;
|
|
937
|
+
this.latencyBaselines.set(nodeId, newBaseline);
|
|
938
|
+
if (latencyMs > baseline * 2 && latencyMs > 200) {
|
|
939
|
+
this.triggerProbeForPeer(nodeId);
|
|
940
|
+
}
|
|
941
|
+
});
|
|
800
942
|
conn.on("close", () => this.onChannelDisconnected(conn));
|
|
801
943
|
const channelCount = this.router.getChannelCount(nodeId);
|
|
802
944
|
debug("peer", `completePeerJoin(${nodeId}): additional channel added (total=${channelCount})`);
|
|
803
945
|
audit("channel_add", { nodeId, detail: `channels=${channelCount}` });
|
|
946
|
+
// If the peer reconnected after a grace period, its client-side state
|
|
947
|
+
// may have been reset (e.g. iOS app restart). Send peer_sync so the
|
|
948
|
+
// reconnecting node re-discovers other peers in the cluster.
|
|
949
|
+
if (wasInGrace) {
|
|
950
|
+
this.sendPeerSync(conn);
|
|
951
|
+
}
|
|
804
952
|
return;
|
|
805
953
|
}
|
|
806
954
|
|
|
@@ -824,7 +972,16 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
824
972
|
this.joinedPeers.add(nodeId);
|
|
825
973
|
|
|
826
974
|
conn.on("message", (frame) => this.onFrame(frame, conn));
|
|
827
|
-
conn.on("latency", () =>
|
|
975
|
+
conn.on("latency", (latencyMs) => {
|
|
976
|
+
this.router.updateActiveChannel(nodeId);
|
|
977
|
+
// Update baseline and check for spike
|
|
978
|
+
const baseline = this.latencyBaselines.get(nodeId) ?? latencyMs;
|
|
979
|
+
const newBaseline = baseline * 0.8 + latencyMs * 0.2;
|
|
980
|
+
this.latencyBaselines.set(nodeId, newBaseline);
|
|
981
|
+
if (latencyMs > baseline * 2 && latencyMs > 200) {
|
|
982
|
+
this.triggerProbeForPeer(nodeId);
|
|
983
|
+
}
|
|
984
|
+
});
|
|
828
985
|
conn.on("close", () => this.onChannelDisconnected(conn));
|
|
829
986
|
|
|
830
987
|
this.sendPeerSync(conn);
|
|
@@ -856,6 +1013,9 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
856
1013
|
|
|
857
1014
|
audit("peer_join", { nodeId, detail: `agents=${caps.agents.length} models=${caps.models.length}` });
|
|
858
1015
|
this.emit("peerConnected", nodeId);
|
|
1016
|
+
|
|
1017
|
+
// Probe alternative routes after reconnect (connectivity may have changed)
|
|
1018
|
+
setTimeout(() => this.triggerProbeForPeer(nodeId), 5_000);
|
|
859
1019
|
}
|
|
860
1020
|
|
|
861
1021
|
/** Handle a single channel disconnecting (multi-channel aware). */
|
|
@@ -895,6 +1055,10 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
895
1055
|
if (nodeId === this.config.nodeId) {
|
|
896
1056
|
this.router.removePeer(nodeId);
|
|
897
1057
|
this.joinedPeers.delete(nodeId);
|
|
1058
|
+
this.latencyBaselines.delete(nodeId);
|
|
1059
|
+
this.lastProbeTime.delete(nodeId);
|
|
1060
|
+
// Clear the delta-sync version so that a full peer_sync is sent on reconnect
|
|
1061
|
+
this.peerSyncVersions.delete(nodeId);
|
|
898
1062
|
return;
|
|
899
1063
|
}
|
|
900
1064
|
|
|
@@ -944,6 +1108,9 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
944
1108
|
audit("peer_leave", { nodeId });
|
|
945
1109
|
this.router.removePeer(nodeId);
|
|
946
1110
|
this.joinedPeers.delete(nodeId);
|
|
1111
|
+
this.latencyBaselines.delete(nodeId);
|
|
1112
|
+
this.lastProbeTime.delete(nodeId);
|
|
1113
|
+
this.peerSyncVersions.delete(nodeId);
|
|
947
1114
|
|
|
948
1115
|
// Remove satellite contexts that were only reachable via this peer
|
|
949
1116
|
for (let i = this.satelliteContexts.length - 1; i >= 0; i--) {
|
|
@@ -1070,13 +1237,19 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
1070
1237
|
this.emit("frame", frame, from);
|
|
1071
1238
|
}
|
|
1072
1239
|
|
|
1240
|
+
/** Last sync version sent to each direct peer. */
|
|
1241
|
+
private peerSyncVersions = new Map<string, number>();
|
|
1242
|
+
|
|
1073
1243
|
private sendPeerSync(conn: Connection) {
|
|
1074
|
-
const
|
|
1075
|
-
const
|
|
1244
|
+
const remoteNodeId = conn.remoteNodeId ?? "";
|
|
1245
|
+
const sinceVersion = this.peerSyncVersions.get(remoteNodeId) ?? 0;
|
|
1246
|
+
const delta = this.router.buildPeerSyncDelta(sinceVersion);
|
|
1247
|
+
const payload: Record<string, unknown> = { ...delta };
|
|
1076
1248
|
if (this.satelliteContexts.length > 0) {
|
|
1077
1249
|
payload.satellites = this.satelliteContexts;
|
|
1078
1250
|
}
|
|
1079
|
-
|
|
1251
|
+
this.peerSyncVersions.set(remoteNodeId, delta.version);
|
|
1252
|
+
conn.sendDirect({
|
|
1080
1253
|
type: "peer_sync",
|
|
1081
1254
|
from: this.config.nodeId,
|
|
1082
1255
|
timestamp: Date.now(),
|
|
@@ -1099,34 +1272,53 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
1099
1272
|
}
|
|
1100
1273
|
|
|
1101
1274
|
let changed = false;
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
const
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
const hadToolProxyCatalogLen = prev?.toolProxy?.catalog?.length ?? 0;
|
|
1112
|
-
const hadToolProxyAllowLen = prev?.toolProxy?.allow?.length ?? 0;
|
|
1113
|
-
this.router.updatePeerCapabilities(peer.nodeId, peer);
|
|
1114
|
-
if (peer.agents.length !== hadAgents || peer.models.length !== (prev?.models.length ?? 0)
|
|
1115
|
-
|| (peer.directPeers?.length ?? 0) !== hadDirectPeers
|
|
1116
|
-
|| peer.toolProxy?.enabled !== hadToolProxyEnabled
|
|
1117
|
-
|| (peer.toolProxy?.catalog?.length ?? 0) !== hadToolProxyCatalogLen
|
|
1118
|
-
|| (peer.toolProxy?.allow?.length ?? 0) !== hadToolProxyAllowLen
|
|
1119
|
-
|| peer.deviceInfo?.hostname !== hadDeviceInfo
|
|
1120
|
-
|| (peer.acpAgents?.length ?? 0) !== hadAcpAgents) {
|
|
1275
|
+
|
|
1276
|
+
// Delta sync: apply removed, then added/updated
|
|
1277
|
+
if (frame.payload.version != null && frame.payload.removed) {
|
|
1278
|
+
for (const nodeId of frame.payload.removed) {
|
|
1279
|
+
if (nodeId === this.config.nodeId) continue;
|
|
1280
|
+
// Only remove relay routes — don't remove direct connections
|
|
1281
|
+
const route = this.router.getRoute(nodeId);
|
|
1282
|
+
if (route && !route.connection) {
|
|
1283
|
+
this.router.removePeer(nodeId);
|
|
1121
1284
|
changed = true;
|
|
1122
1285
|
}
|
|
1123
|
-
}
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1286
|
+
}
|
|
1287
|
+
}
|
|
1288
|
+
|
|
1289
|
+
// Process peers from either full sync or delta (updated field)
|
|
1290
|
+
// Delta sets peers = updated for backward compat, so this works for both
|
|
1291
|
+
const peersToProcess = frame.payload.peers;
|
|
1292
|
+
if (peersToProcess) {
|
|
1293
|
+
for (const peer of peersToProcess) {
|
|
1294
|
+
if (peer.nodeId === this.config.nodeId) continue;
|
|
1295
|
+
if (peer.nodeId === from.remoteNodeId) {
|
|
1296
|
+
const prev = this.router.getRoute(peer.nodeId);
|
|
1297
|
+
const hadAgents = prev?.agents.length ?? 0;
|
|
1298
|
+
const hadDirectPeers = prev?.directPeers.length ?? 0;
|
|
1299
|
+
const hadDeviceInfo = prev?.deviceInfo?.hostname;
|
|
1300
|
+
const hadAcpAgents = prev?.acpAgents?.length ?? 0;
|
|
1301
|
+
const hadToolProxyEnabled = prev?.toolProxy?.enabled;
|
|
1302
|
+
const hadToolProxyCatalogLen = prev?.toolProxy?.catalog?.length ?? 0;
|
|
1303
|
+
const hadToolProxyAllowLen = prev?.toolProxy?.allow?.length ?? 0;
|
|
1304
|
+
this.router.updatePeerCapabilities(peer.nodeId, peer);
|
|
1305
|
+
if (peer.agents.length !== hadAgents || peer.models.length !== (prev?.models.length ?? 0)
|
|
1306
|
+
|| (peer.directPeers?.length ?? 0) !== hadDirectPeers
|
|
1307
|
+
|| peer.toolProxy?.enabled !== hadToolProxyEnabled
|
|
1308
|
+
|| (peer.toolProxy?.catalog?.length ?? 0) !== hadToolProxyCatalogLen
|
|
1309
|
+
|| (peer.toolProxy?.allow?.length ?? 0) !== hadToolProxyAllowLen
|
|
1310
|
+
|| peer.deviceInfo?.hostname !== hadDeviceInfo
|
|
1311
|
+
|| (peer.acpAgents?.length ?? 0) !== hadAcpAgents) {
|
|
1312
|
+
changed = true;
|
|
1313
|
+
}
|
|
1314
|
+
} else {
|
|
1315
|
+
// Skip if the remote peer only knows about this node through us —
|
|
1316
|
+
// using them as relay would create a routing loop.
|
|
1317
|
+
if (peer.reachableVia === this.config.nodeId) continue;
|
|
1318
|
+
const existing = this.router.getRoute(peer.nodeId);
|
|
1319
|
+
if (!existing) changed = true;
|
|
1320
|
+
this.router.addRelayPeer(peer, from.remoteNodeId!);
|
|
1321
|
+
}
|
|
1130
1322
|
}
|
|
1131
1323
|
}
|
|
1132
1324
|
|
package/src/retry.ts
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared retry/backoff/circuit-breaker utilities backed by cockatiel.
|
|
3
|
+
*
|
|
4
|
+
* Provides:
|
|
5
|
+
* - Per-node circuit breakers for mesh health tracking
|
|
6
|
+
* - Adaptive backoff delay computation
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { circuitBreaker, CircuitState, ConsecutiveBreaker, handleAll, SamplingBreaker } from "cockatiel";
|
|
10
|
+
|
|
11
|
+
// ── Per-node circuit breakers ────────────────────────────────────
|
|
12
|
+
|
|
13
|
+
/** Tunables for a per-node circuit breaker. */
export interface CircuitBreakerConfig {
  /** Number of consecutive failures before opening the circuit (default: 5). */
  threshold?: number;
  /** How long to wait before trying a half-open probe in ms (default: 30000). */
  halfOpenAfter?: number;
}

/** Values used for any `CircuitBreakerConfig` field the caller leaves unset. */
const DEFAULT_CB_CONFIG: Required<CircuitBreakerConfig> = {
  threshold: 5,
  halfOpenAfter: 30_000,
};

/** Cache of circuit breakers, keyed by nodeId. */
const nodeBreakers = new Map<string, ReturnType<typeof circuitBreaker>>();

/**
 * Get or create a circuit breaker for a node. Breakers are cached per nodeId.
 *
 * `config` is only consulted when the breaker is first created; a later call
 * for the same nodeId returns the cached breaker and ignores `config`.
 *
 * @param nodeId Node the breaker tracks.
 * @param config Optional overrides; unset or `undefined` fields use the defaults.
 * @returns The cached or newly created breaker policy for `nodeId`.
 */
export function getNodeCircuitBreaker(nodeId: string, config?: CircuitBreakerConfig) {
  const cached = nodeBreakers.get(nodeId);
  if (cached) return cached;

  // Resolve each field with `??` instead of object spread: a caller passing
  // `{ threshold: undefined }` must still get the default, not `undefined`.
  const threshold = config?.threshold ?? DEFAULT_CB_CONFIG.threshold;
  const halfOpenAfter = config?.halfOpenAfter ?? DEFAULT_CB_CONFIG.halfOpenAfter;

  const created = circuitBreaker(handleAll, {
    halfOpenAfter,
    breaker: new ConsecutiveBreaker(threshold),
  });
  nodeBreakers.set(nodeId, created);
  return created;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Drop the cached circuit breaker for a node (e.g. when the peer leaves).
 * A later `getNodeCircuitBreaker` call for the same nodeId creates a new one.
 * Calling this for a node without a cached breaker is a no-op.
 *
 * @param nodeId Node whose breaker should be forgotten.
 */
export function removeNodeCircuitBreaker(nodeId: string) {
  if (nodeBreakers.has(nodeId)) {
    nodeBreakers.delete(nodeId);
  }
}
|
|
45
|
+
|
|
46
|
+
/**
 * Check if a node's circuit is currently open (unhealthy).
 *
 * A node with no cached breaker is reported as not open.
 *
 * @param nodeId Node to inspect.
 * @returns `true` only when a breaker exists for the node and is in the open state.
 */
export function isNodeCircuitOpen(nodeId: string): boolean {
  const cb = nodeBreakers.get(nodeId);
  if (!cb) return false;
  // cockatiel exposes `state` as the numeric `CircuitState` enum, so it must be
  // compared against the enum member; comparing with the string "open" can
  // never match.
  return cb.state === CircuitState.Open;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Forget every cached circuit breaker (e.g. on shutdown).
 * Subsequent `getNodeCircuitBreaker` calls create new breakers.
 */
export function resetAllCircuitBreakers(): void {
  nodeBreakers.clear();
}
|
|
57
|
+
|
|
58
|
+
// ── Adaptive backoff computation ─────────────────────────────────
|
|
59
|
+
|
|
60
|
+
/** Bounds for an exponential backoff schedule. */
export interface BackoffParams {
  /** Base delay in ms. */
  base: number;
  /** Maximum delay in ms. */
  max: number;
}

/**
 * Compute an exponential backoff delay with jitter.
 *
 * The un-jittered delay is `base * 2^attempt`, capped at `max`. It is then
 * scaled by a random factor in [0.5, 1.0) and rounded to the nearest integer.
 *
 * @param attempt Retry attempt number; 0 yields the base delay before jitter.
 * @param params  Base and maximum delay in ms.
 * @returns Delay in ms.
 */
export function computeBackoffDelay(attempt: number, params: BackoffParams): number {
  const { base, max } = params;
  const capped = Math.min(base * Math.pow(2, attempt), max);
  const jitterFactor = 0.5 + Math.random() * 0.5;
  return Math.round(capped * jitterFactor);
}

/**
 * Get adaptive backoff parameters based on WebSocket close code.
 *
 * @param code WebSocket close code of the last disconnect.
 * @returns A fresh `BackoffParams` object for that class of failure.
 */
export function getReconnectBackoff(code: number): BackoffParams {
  switch (code) {
    case 4001:
      // auth failed → slow
      return { base: 30_000, max: 300_000 };
    case 4003:
      // auth timeout → medium
      return { base: 5_000, max: 60_000 };
    case 1006:
      // network error → fast
      return { base: 1_000, max: 10_000 };
    case 1001:
      // going away → medium-fast
      return { base: 2_000, max: 30_000 };
    default:
      return { base: 1_000, max: 60_000 };
  }
}
|