clawmatrix 0.4.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -21
- package/cli/bin/clawmatrix.mjs +300 -1
- package/package.json +8 -1
- package/src/acp-proxy.ts +122 -50
- package/src/{web.ts → api.ts} +646 -25
- package/src/audit.ts +37 -2
- package/src/auth.ts +5 -10
- package/src/automation.ts +625 -0
- package/src/cluster-service.ts +172 -16
- package/src/compat.ts +103 -0
- package/src/config.ts +75 -27
- package/src/connection.ts +215 -37
- package/src/crypto.ts +72 -5
- package/src/device-info.ts +21 -2
- package/src/file-transfer.ts +3 -2
- package/src/handoff.ts +90 -32
- package/src/health-tracker.ts +91 -356
- package/src/index.ts +421 -13
- package/src/kanban.ts +507 -0
- package/src/knowledge-sync.ts +158 -7
- package/src/local-tools.ts +65 -2
- package/src/log-replication.ts +198 -0
- package/src/model-proxy.ts +152 -60
- package/src/peer-approval.ts +3 -2
- package/src/peer-manager.ts +230 -44
- package/src/retry.ts +81 -0
- package/src/router.ts +152 -104
- package/src/sentinel.ts +85 -51
- package/src/store.ts +578 -0
- package/src/terminal.ts +17 -8
- package/src/tool-proxy.ts +6 -5
- package/src/tools/cluster-events.ts +6 -6
- package/src/tools/cluster-kanban.ts +345 -0
- package/src/tools/cluster-peers.ts +1 -1
- package/src/tools/cluster-query.ts +145 -0
- package/src/types.ts +95 -9
package/src/peer-manager.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { EventEmitter } from "
|
|
1
|
+
import { EventEmitter } from "eventemitter3";
|
|
2
2
|
import path from "node:path";
|
|
3
3
|
import { homedir, tmpdir } from "node:os";
|
|
4
4
|
import { createServer, type IncomingMessage, type ServerResponse, type Server } from "node:http";
|
|
@@ -22,10 +22,11 @@ import type {
|
|
|
22
22
|
PeerSync,
|
|
23
23
|
} from "./types.ts";
|
|
24
24
|
import { PeerApprovalManager, type ChannelApi, type NotifyTarget } from "./peer-approval.ts";
|
|
25
|
+
import { computeBackoffDelay, getReconnectBackoff } from "./retry.ts";
|
|
25
26
|
import { loadOrCreateIdentity } from "./identity.ts";
|
|
26
27
|
import type { KeyPair } from "./crypto.ts";
|
|
27
28
|
|
|
28
|
-
|
|
29
|
+
// Reconnect constants kept for RECONNECT_MAX reference in failover scheduling
|
|
29
30
|
const RECONNECT_MAX = 60_000;
|
|
30
31
|
|
|
31
32
|
/** Frame types that bypass dedup (streams share one id across chunks; responses share id with request). */
|
|
@@ -51,6 +52,8 @@ const SKIP_DEDUP_TYPES = new Set([
|
|
|
51
52
|
"file_transfer_ack", "file_transfer_complete",
|
|
52
53
|
]);
|
|
53
54
|
|
|
55
|
+
// Reconnect backoff params are now provided by retry.ts → getReconnectBackoff()
|
|
56
|
+
|
|
54
57
|
/** Classify WebSocket close code into a human-readable reason. */
|
|
55
58
|
function classifyCloseReason(code: number, reason: string): string {
|
|
56
59
|
if (reason) return reason;
|
|
@@ -94,6 +97,8 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
94
97
|
/** Reconnect timers keyed by `nodeId|url` for per-channel reconnection. */
|
|
95
98
|
private reconnectTimers = new Map<string, ReturnType<typeof setTimeout>>();
|
|
96
99
|
private reconnectAttempts = new Map<string, number>();
|
|
100
|
+
/** Last close code per channel key for adaptive reconnect backoff. */
|
|
101
|
+
private lastCloseCode = new Map<string, number>();
|
|
97
102
|
/** Track which nodeIds have already completed the full peer join (for multi-channel). */
|
|
98
103
|
private joinedPeers = new Set<string>();
|
|
99
104
|
/** All configured URLs per peer (for multi-URL peers). */
|
|
@@ -104,6 +109,10 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
104
109
|
private urlProbeLatencies = new Map<string, number>();
|
|
105
110
|
/** Route probe interval timer. */
|
|
106
111
|
private probeTimer: ReturnType<typeof setInterval> | null = null;
|
|
112
|
+
/** EMA latency baseline per peer (for spike detection). */
|
|
113
|
+
private latencyBaselines = new Map<string, number>();
|
|
114
|
+
/** Last time a latency-triggered probe was fired per peer (debounce). */
|
|
115
|
+
private lastProbeTime = new Map<string, number>();
|
|
107
116
|
/** Deferred disconnect timers — grace period before broadcasting peer_leave. */
|
|
108
117
|
private disconnectGraceTimers = new Map<string, ReturnType<typeof setTimeout>>();
|
|
109
118
|
private stopped = false;
|
|
@@ -121,10 +130,10 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
121
130
|
private rateLimiter: RateLimiter;
|
|
122
131
|
readonly approvalManager: PeerApprovalManager;
|
|
123
132
|
|
|
124
|
-
constructor(config: ClawMatrixConfig, openclawVersion?: string) {
|
|
133
|
+
constructor(config: ClawMatrixConfig, openclawVersion?: string, openclawConfig?: Record<string, unknown>) {
|
|
125
134
|
super();
|
|
126
135
|
this.config = config;
|
|
127
|
-
this.localDeviceInfo = collectDeviceInfo(openclawVersion);
|
|
136
|
+
this.localDeviceInfo = collectDeviceInfo(openclawVersion, openclawConfig);
|
|
128
137
|
const acpAgents = config.acp?.enabled ? config.acp.agents : undefined;
|
|
129
138
|
this.localCapabilities = {
|
|
130
139
|
nodeId: config.nodeId,
|
|
@@ -285,7 +294,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
285
294
|
}
|
|
286
295
|
|
|
287
296
|
// ── Route probing (for multi-URL peers) ──────────────────────────
|
|
288
|
-
private static readonly PROBE_INTERVAL =
|
|
297
|
+
private static readonly PROBE_INTERVAL = 900_000; // 15 minutes
|
|
289
298
|
/** Minimum improvement ratio to trigger a route switch. */
|
|
290
299
|
private static readonly SWITCH_THRESHOLD = 0.7; // new must be ≤70% of current
|
|
291
300
|
|
|
@@ -385,6 +394,32 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
385
394
|
this.switchRoute(nodeId, bestUrl);
|
|
386
395
|
}
|
|
387
396
|
|
|
397
|
+
/**
 * Trigger an immediate route probe for a specific peer.
 *
 * Does nothing for peers with zero or one configured URL (there is no
 * alternative route to switch to) and is debounced to at most one probe
 * per peer per minute. The probe itself runs in the background; this method
 * returns as soon as it has been scheduled.
 *
 * @param nodeId - Peer whose non-active URLs should be probed.
 */
private triggerProbeForPeer(nodeId: string) {
  const urls = this.peerUrls.get(nodeId);
  if (!urls || urls.length <= 1) return;

  const now = Date.now();
  const lastProbe = this.lastProbeTime.get(nodeId) ?? 0;
  if (now - lastProbe < 60_000) return; // debounce: 1 per minute
  this.lastProbeTime.set(nodeId, now);

  // Snapshot of the active URL at scheduling time; only the other URLs are probed.
  const activeUrl = this.activeUrls.get(nodeId);

  const runProbe = async (): Promise<void> => {
    // Probes run one after another; a null result leaves the previous sample in place.
    for (const url of urls) {
      if (url === activeUrl) continue;
      const latency = await this.probeUrl(url);
      if (latency !== null) this.urlProbeLatencies.set(url, latency);
    }
    // The active URL is not probed; its sample comes from the live route's measured latency.
    if (activeUrl) {
      const route = this.router.getRoute(nodeId);
      if (route && route.latencyMs > 0) this.urlProbeLatencies.set(activeUrl, route.latencyMs);
    }
    this.evaluateRouteSwitch(nodeId);
  };

  // Fire-and-forget: attach a handler so a failure inside the probe is logged
  // instead of surfacing as an unhandled promise rejection.
  runProbe().catch((err: unknown) => {
    const detail = err instanceof Error ? err.message : String(err);
    debug("peer", `triggerProbeForPeer(${nodeId}): probe failed: ${detail}`);
  });
}
|
|
422
|
+
|
|
388
423
|
/** Callback to check if there are active tasks involving a peer. Set by ClusterRuntime. */
|
|
389
424
|
private activeTaskChecker: ((nodeId: string) => boolean) | null = null;
|
|
390
425
|
|
|
@@ -465,10 +500,13 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
465
500
|
this.inboundIps.set(ws, ip);
|
|
466
501
|
this.handleInboundOpen(ws, ip);
|
|
467
502
|
|
|
468
|
-
ws.on("message", (data) => {
|
|
503
|
+
ws.on("message", (data, isBinary) => {
|
|
469
504
|
const conn = this.inboundConnections.get(ws);
|
|
470
505
|
if (conn) {
|
|
471
|
-
|
|
506
|
+
// ws package always delivers data as Buffer; use isBinary to distinguish frame type.
|
|
507
|
+
// Text frames (e.g. base64 encrypted envelopes) must be passed as strings
|
|
508
|
+
// so Connection routes them through decryptBinary, not decryptBinaryRaw.
|
|
509
|
+
conn.feedMessage(isBinary ? (Buffer.isBuffer(data) ? data : Buffer.from(data as any)) : data.toString());
|
|
472
510
|
}
|
|
473
511
|
});
|
|
474
512
|
|
|
@@ -492,7 +530,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
492
530
|
private handleInboundOpen(ws: WsWebSocket, ip: string) {
|
|
493
531
|
// Wrap ws WebSocket into our WsTransport interface
|
|
494
532
|
const transport: WsTransport = {
|
|
495
|
-
send(data: string) {
|
|
533
|
+
send(data: string | Buffer) {
|
|
496
534
|
ws.send(data);
|
|
497
535
|
},
|
|
498
536
|
close(code?: number, reason?: string) {
|
|
@@ -539,10 +577,44 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
539
577
|
/**
 * Open the outbound channel(s) for a configured peer.
 *
 * `peer.url` may be a single URL or a list. The first URL is always recorded
 * as the active one. With several URLs every channel is opened at once and,
 * after a 10 second warm-up, pruned down to the best one.
 */
private connectToPeer(peer: PeerConfig) {
  const candidateUrls = Array.isArray(peer.url) ? peer.url : [peer.url];
  this.peerUrls.set(peer.nodeId, candidateUrls);
  this.activeUrls.set(peer.nodeId, candidateUrls[0]);

  if (candidateUrls.length <= 1) {
    this.connectToChannel(peer.nodeId, candidateUrls[0]);
    return;
  }

  // Multi-URL: connect all simultaneously, prune to best after 10s
  candidateUrls.forEach((url) => this.connectToChannel(peer.nodeId, url));
  // peer.nodeId is deliberately read when the timer fires, not captured now:
  // migrateUrlDerivedPeer rewrites the config entry's nodeId in place.
  setTimeout(() => this.pruneWarmupConnections(peer.nodeId), 10_000);
}
|
|
592
|
+
|
|
593
|
+
/**
 * After the warm-up period, keep only the lowest-latency open channel to a
 * peer and close the rest.
 *
 * Channels with no latency sample yet (`latencyMs <= 0`) are ranked as if
 * they had 10 000 ms, so any measured channel beats them. Pruned channels are
 * closed with code 1000 and reason "warm-up pruned".
 *
 * @param nodeId - Peer whose channel pool should be pruned.
 */
private pruneWarmupConnections(nodeId: string) {
  // Work on a snapshot: closing a connection may cause the router to drop it
  // from its own channel list while we are still iterating (TODO confirm
  // whether getChannels returns the live array).
  const channels = [...this.router.getChannels(nodeId)];
  if (channels.length <= 1) return;

  let best: Connection | null = null;
  let bestLatency = Infinity;
  for (const conn of channels) {
    if (!conn.isOpen) continue;
    const lat = conn.latencyMs > 0 ? conn.latencyMs : 10_000;
    if (lat < bestLatency) {
      bestLatency = lat;
      best = conn;
    }
  }

  // No open channel at all: nothing to keep and nothing to close.
  if (!best) return;

  // Count only the channels actually closed here, so the log line is accurate
  // when some channels had already dropped before the warm-up expired.
  let pruned = 0;
  for (const conn of channels) {
    if (conn !== best && conn.isOpen) {
      conn.close(1000, "warm-up pruned");
      pruned++;
    }
  }

  debug("peer", `pruneWarmupConnections(${nodeId}): kept channel with latency=${best.latencyMs}ms, pruned ${pruned} others`);
}
|
|
547
619
|
|
|
548
620
|
/** Pick the best URL for a peer based on probe latencies. Falls back to first URL. */
|
|
@@ -596,6 +668,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
596
668
|
conn.on("authenticated", (caps) => {
|
|
597
669
|
debug("peer", `connectToChannel(${nodeId}): authenticated url=${url}`);
|
|
598
670
|
this.reconnectAttempts.delete(channelKey);
|
|
671
|
+
this.lastCloseCode.delete(channelKey);
|
|
599
672
|
this.onPeerAuthenticated(conn, caps);
|
|
600
673
|
});
|
|
601
674
|
|
|
@@ -624,6 +697,14 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
624
697
|
debug("peer", `connectToChannel(${nodeId}): self-connection, will not reconnect`);
|
|
625
698
|
return;
|
|
626
699
|
}
|
|
700
|
+
// Don't reconnect warm-up pruned channels
|
|
701
|
+
if (ev.reason === "warm-up pruned") {
|
|
702
|
+
debug("peer", `connectToChannel(${nodeId}): warm-up pruned, will not reconnect`);
|
|
703
|
+
return;
|
|
704
|
+
}
|
|
705
|
+
// Record close code for adaptive backoff
|
|
706
|
+
const channelKey = `${nodeId}|${url}`;
|
|
707
|
+
this.lastCloseCode.set(channelKey, ev.code);
|
|
627
708
|
if (!lastError) {
|
|
628
709
|
lastError = classifyCloseReason(ev.code, ev.reason);
|
|
629
710
|
}
|
|
@@ -665,7 +746,8 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
665
746
|
}
|
|
666
747
|
}
|
|
667
748
|
|
|
668
|
-
const
|
|
749
|
+
const params = getReconnectBackoff(this.lastCloseCode.get(channelKey) ?? 1006);
|
|
750
|
+
const delay = computeBackoffDelay(attempt, params);
|
|
669
751
|
this.reconnectAttempts.set(channelKey, attempt + 1);
|
|
670
752
|
const tag = reason ? ` reason="${reason}"` : "";
|
|
671
753
|
debug("peer", `scheduleChannelReconnect(${nodeId}): attempt=${attempt} delay=${delay}ms url=${url}${tag}`);
|
|
@@ -693,11 +775,62 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
693
775
|
}
|
|
694
776
|
|
|
695
777
|
// ── Peer lifecycle ─────────────────────────────────────────────
|
|
778
|
+
|
|
779
|
+
/**
 * Re-key per-peer bookkeeping from a URL-derived placeholder nodeId (one that
 * starts with "_") to the real nodeId.
 *
 * Moves the peer's URL list and active URL (without overwriting entries that
 * already exist under the real id), re-keys the per-channel reconnect state
 * (`nodeId|url` keys), and rewrites the matching config peer entry in place.
 *
 * @param derivedId - Placeholder nodeId the peer was tracked under.
 * @param realId - NodeId to track the peer under from now on.
 */
private migrateUrlDerivedPeer(derivedId: string, realId: string) {
  // Move a per-channel entry from one key to another, if one exists.
  const rekey = <V>(table: Map<string, V>, fromKey: string, toKey: string): void => {
    const value = table.get(fromKey);
    if (value === undefined) return;
    table.delete(fromKey);
    table.set(toKey, value);
  };

  // URL list: drop the placeholder entry, keep any list already stored for the real id.
  const knownUrls = this.peerUrls.get(derivedId);
  if (knownUrls) {
    this.peerUrls.delete(derivedId);
    if (!this.peerUrls.has(realId)) this.peerUrls.set(realId, knownUrls);
  }

  // Active URL: same rule as the URL list.
  const currentUrl = this.activeUrls.get(derivedId);
  if (currentUrl) {
    this.activeUrls.delete(derivedId);
    if (!this.activeUrls.has(realId)) this.activeUrls.set(realId, currentUrl);
  }

  // Reconnect state is keyed by `nodeId|url`, so every URL of the peer needs re-keying.
  for (const url of knownUrls ?? []) {
    const fromKey = `${derivedId}|${url}`;
    const toKey = `${realId}|${url}`;

    const pendingTimer = this.reconnectTimers.get(fromKey);
    if (pendingTimer) {
      this.reconnectTimers.delete(fromKey);
      this.reconnectTimers.set(toKey, pendingTimer);
    }
    rekey(this.reconnectAttempts, fromKey, toKey);
    rekey(this.lastCloseCode, fromKey, toKey);
  }

  // Update the config peer entry in place for sentinel and other consumers
  const configEntry = this.config.peers.find((candidate) => candidate.nodeId === derivedId);
  if (configEntry) {
    (configEntry as { nodeId: string }).nodeId = realId;
  }

  debug("peer", `migrateUrlDerivedPeer: ${derivedId} → ${realId}`);
}
|
|
812
|
+
|
|
696
813
|
private onPeerAuthenticated(conn: Connection, caps: NodeCapabilities, ip?: string) {
|
|
697
814
|
const nodeId = conn.remoteNodeId!;
|
|
698
815
|
// Peer's persistent public key for TOFU identity binding
|
|
699
816
|
const peerPublicKey = conn.remoteIdentityKey ?? undefined;
|
|
700
817
|
|
|
818
|
+
// If this outbound peer was configured with URL string shorthand (nodeId starts
|
|
819
|
+
// with "_"), resolve to the real nodeId from the authenticated handshake.
|
|
820
|
+
if (conn.role === "outbound") {
|
|
821
|
+
for (const peer of this.config.peers) {
|
|
822
|
+
if (peer.nodeId.startsWith("_") && this.peerUrls.get(peer.nodeId)) {
|
|
823
|
+
const urls = this.peerUrls.get(peer.nodeId)!;
|
|
824
|
+
// Check if this connection belongs to this URL-derived peer
|
|
825
|
+
const activeUrl = this.activeUrls.get(peer.nodeId);
|
|
826
|
+
if (activeUrl && urls.length > 0) {
|
|
827
|
+
this.migrateUrlDerivedPeer(peer.nodeId, nodeId);
|
|
828
|
+
break;
|
|
829
|
+
}
|
|
830
|
+
}
|
|
831
|
+
}
|
|
832
|
+
}
|
|
833
|
+
|
|
701
834
|
// Prevent self-connection: close immediately if the remote side authenticated
|
|
702
835
|
// with our own nodeId. For outbound this means the peer URL accidentally
|
|
703
836
|
// points to self; for inbound it means a remote node is (mis)using our nodeId.
|
|
@@ -796,7 +929,16 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
796
929
|
// Additional channel — just add to the channel pool, no peer_join broadcast
|
|
797
930
|
this.router.addChannel(nodeId, conn);
|
|
798
931
|
conn.on("message", (frame) => this.onFrame(frame, conn));
|
|
799
|
-
conn.on("latency", () =>
|
|
932
|
+
conn.on("latency", (latencyMs) => {
|
|
933
|
+
this.router.updateActiveChannel(nodeId);
|
|
934
|
+
// Update baseline and check for spike
|
|
935
|
+
const baseline = this.latencyBaselines.get(nodeId) ?? latencyMs;
|
|
936
|
+
const newBaseline = baseline * 0.8 + latencyMs * 0.2;
|
|
937
|
+
this.latencyBaselines.set(nodeId, newBaseline);
|
|
938
|
+
if (latencyMs > baseline * 2 && latencyMs > 200) {
|
|
939
|
+
this.triggerProbeForPeer(nodeId);
|
|
940
|
+
}
|
|
941
|
+
});
|
|
800
942
|
conn.on("close", () => this.onChannelDisconnected(conn));
|
|
801
943
|
const channelCount = this.router.getChannelCount(nodeId);
|
|
802
944
|
debug("peer", `completePeerJoin(${nodeId}): additional channel added (total=${channelCount})`);
|
|
@@ -830,7 +972,16 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
830
972
|
this.joinedPeers.add(nodeId);
|
|
831
973
|
|
|
832
974
|
conn.on("message", (frame) => this.onFrame(frame, conn));
|
|
833
|
-
conn.on("latency", () =>
|
|
975
|
+
conn.on("latency", (latencyMs) => {
|
|
976
|
+
this.router.updateActiveChannel(nodeId);
|
|
977
|
+
// Update baseline and check for spike
|
|
978
|
+
const baseline = this.latencyBaselines.get(nodeId) ?? latencyMs;
|
|
979
|
+
const newBaseline = baseline * 0.8 + latencyMs * 0.2;
|
|
980
|
+
this.latencyBaselines.set(nodeId, newBaseline);
|
|
981
|
+
if (latencyMs > baseline * 2 && latencyMs > 200) {
|
|
982
|
+
this.triggerProbeForPeer(nodeId);
|
|
983
|
+
}
|
|
984
|
+
});
|
|
834
985
|
conn.on("close", () => this.onChannelDisconnected(conn));
|
|
835
986
|
|
|
836
987
|
this.sendPeerSync(conn);
|
|
@@ -862,6 +1013,9 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
862
1013
|
|
|
863
1014
|
audit("peer_join", { nodeId, detail: `agents=${caps.agents.length} models=${caps.models.length}` });
|
|
864
1015
|
this.emit("peerConnected", nodeId);
|
|
1016
|
+
|
|
1017
|
+
// Probe alternative routes after reconnect (connectivity may have changed)
|
|
1018
|
+
setTimeout(() => this.triggerProbeForPeer(nodeId), 5_000);
|
|
865
1019
|
}
|
|
866
1020
|
|
|
867
1021
|
/** Handle a single channel disconnecting (multi-channel aware). */
|
|
@@ -901,6 +1055,10 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
901
1055
|
if (nodeId === this.config.nodeId) {
|
|
902
1056
|
this.router.removePeer(nodeId);
|
|
903
1057
|
this.joinedPeers.delete(nodeId);
|
|
1058
|
+
this.latencyBaselines.delete(nodeId);
|
|
1059
|
+
this.lastProbeTime.delete(nodeId);
|
|
1060
|
+
// 清除 delta sync 版本号,确保重连时发全量 peer_sync
|
|
1061
|
+
this.peerSyncVersions.delete(nodeId);
|
|
904
1062
|
return;
|
|
905
1063
|
}
|
|
906
1064
|
|
|
@@ -950,6 +1108,9 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
950
1108
|
audit("peer_leave", { nodeId });
|
|
951
1109
|
this.router.removePeer(nodeId);
|
|
952
1110
|
this.joinedPeers.delete(nodeId);
|
|
1111
|
+
this.latencyBaselines.delete(nodeId);
|
|
1112
|
+
this.lastProbeTime.delete(nodeId);
|
|
1113
|
+
this.peerSyncVersions.delete(nodeId);
|
|
953
1114
|
|
|
954
1115
|
// Remove satellite contexts that were only reachable via this peer
|
|
955
1116
|
for (let i = this.satelliteContexts.length - 1; i >= 0; i--) {
|
|
@@ -1076,13 +1237,19 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
1076
1237
|
this.emit("frame", frame, from);
|
|
1077
1238
|
}
|
|
1078
1239
|
|
|
1240
|
+
/** Last sync version sent to each direct peer. */
|
|
1241
|
+
private peerSyncVersions = new Map<string, number>();
|
|
1242
|
+
|
|
1079
1243
|
private sendPeerSync(conn: Connection) {
|
|
1080
|
-
const
|
|
1081
|
-
const
|
|
1244
|
+
const remoteNodeId = conn.remoteNodeId ?? "";
|
|
1245
|
+
const sinceVersion = this.peerSyncVersions.get(remoteNodeId) ?? 0;
|
|
1246
|
+
const delta = this.router.buildPeerSyncDelta(sinceVersion);
|
|
1247
|
+
const payload: Record<string, unknown> = { ...delta };
|
|
1082
1248
|
if (this.satelliteContexts.length > 0) {
|
|
1083
1249
|
payload.satellites = this.satelliteContexts;
|
|
1084
1250
|
}
|
|
1085
|
-
|
|
1251
|
+
this.peerSyncVersions.set(remoteNodeId, delta.version);
|
|
1252
|
+
conn.sendDirect({
|
|
1086
1253
|
type: "peer_sync",
|
|
1087
1254
|
from: this.config.nodeId,
|
|
1088
1255
|
timestamp: Date.now(),
|
|
@@ -1105,34 +1272,53 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
1105
1272
|
}
|
|
1106
1273
|
|
|
1107
1274
|
let changed = false;
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
const
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
const hadToolProxyCatalogLen = prev?.toolProxy?.catalog?.length ?? 0;
|
|
1118
|
-
const hadToolProxyAllowLen = prev?.toolProxy?.allow?.length ?? 0;
|
|
1119
|
-
this.router.updatePeerCapabilities(peer.nodeId, peer);
|
|
1120
|
-
if (peer.agents.length !== hadAgents || peer.models.length !== (prev?.models.length ?? 0)
|
|
1121
|
-
|| (peer.directPeers?.length ?? 0) !== hadDirectPeers
|
|
1122
|
-
|| peer.toolProxy?.enabled !== hadToolProxyEnabled
|
|
1123
|
-
|| (peer.toolProxy?.catalog?.length ?? 0) !== hadToolProxyCatalogLen
|
|
1124
|
-
|| (peer.toolProxy?.allow?.length ?? 0) !== hadToolProxyAllowLen
|
|
1125
|
-
|| peer.deviceInfo?.hostname !== hadDeviceInfo
|
|
1126
|
-
|| (peer.acpAgents?.length ?? 0) !== hadAcpAgents) {
|
|
1275
|
+
|
|
1276
|
+
// Delta sync: apply removed, then added/updated
|
|
1277
|
+
if (frame.payload.version != null && frame.payload.removed) {
|
|
1278
|
+
for (const nodeId of frame.payload.removed) {
|
|
1279
|
+
if (nodeId === this.config.nodeId) continue;
|
|
1280
|
+
// Only remove relay routes — don't remove direct connections
|
|
1281
|
+
const route = this.router.getRoute(nodeId);
|
|
1282
|
+
if (route && !route.connection) {
|
|
1283
|
+
this.router.removePeer(nodeId);
|
|
1127
1284
|
changed = true;
|
|
1128
1285
|
}
|
|
1129
|
-
}
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1286
|
+
}
|
|
1287
|
+
}
|
|
1288
|
+
|
|
1289
|
+
// Process peers from either full sync or delta (updated field)
|
|
1290
|
+
// Delta sets peers = updated for backward compat, so this works for both
|
|
1291
|
+
const peersToProcess = frame.payload.peers;
|
|
1292
|
+
if (peersToProcess) {
|
|
1293
|
+
for (const peer of peersToProcess) {
|
|
1294
|
+
if (peer.nodeId === this.config.nodeId) continue;
|
|
1295
|
+
if (peer.nodeId === from.remoteNodeId) {
|
|
1296
|
+
const prev = this.router.getRoute(peer.nodeId);
|
|
1297
|
+
const hadAgents = prev?.agents.length ?? 0;
|
|
1298
|
+
const hadDirectPeers = prev?.directPeers.length ?? 0;
|
|
1299
|
+
const hadDeviceInfo = prev?.deviceInfo?.hostname;
|
|
1300
|
+
const hadAcpAgents = prev?.acpAgents?.length ?? 0;
|
|
1301
|
+
const hadToolProxyEnabled = prev?.toolProxy?.enabled;
|
|
1302
|
+
const hadToolProxyCatalogLen = prev?.toolProxy?.catalog?.length ?? 0;
|
|
1303
|
+
const hadToolProxyAllowLen = prev?.toolProxy?.allow?.length ?? 0;
|
|
1304
|
+
this.router.updatePeerCapabilities(peer.nodeId, peer);
|
|
1305
|
+
if (peer.agents.length !== hadAgents || peer.models.length !== (prev?.models.length ?? 0)
|
|
1306
|
+
|| (peer.directPeers?.length ?? 0) !== hadDirectPeers
|
|
1307
|
+
|| peer.toolProxy?.enabled !== hadToolProxyEnabled
|
|
1308
|
+
|| (peer.toolProxy?.catalog?.length ?? 0) !== hadToolProxyCatalogLen
|
|
1309
|
+
|| (peer.toolProxy?.allow?.length ?? 0) !== hadToolProxyAllowLen
|
|
1310
|
+
|| peer.deviceInfo?.hostname !== hadDeviceInfo
|
|
1311
|
+
|| (peer.acpAgents?.length ?? 0) !== hadAcpAgents) {
|
|
1312
|
+
changed = true;
|
|
1313
|
+
}
|
|
1314
|
+
} else {
|
|
1315
|
+
// Skip if the remote peer only knows about this node through us —
|
|
1316
|
+
// using them as relay would create a routing loop.
|
|
1317
|
+
if (peer.reachableVia === this.config.nodeId) continue;
|
|
1318
|
+
const existing = this.router.getRoute(peer.nodeId);
|
|
1319
|
+
if (!existing) changed = true;
|
|
1320
|
+
this.router.addRelayPeer(peer, from.remoteNodeId!);
|
|
1321
|
+
}
|
|
1136
1322
|
}
|
|
1137
1323
|
}
|
|
1138
1324
|
|
package/src/retry.ts
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared retry/backoff/circuit-breaker utilities backed by cockatiel.
|
|
3
|
+
*
|
|
4
|
+
* Provides:
|
|
5
|
+
* - Per-node circuit breakers for mesh health tracking
|
|
6
|
+
* - Adaptive backoff delay computation
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { circuitBreaker, handleAll, SamplingBreaker, ConsecutiveBreaker, CircuitState } from "cockatiel";
|
|
10
|
+
|
|
11
|
+
// ── Per-node circuit breakers ────────────────────────────────────
|
|
12
|
+
|
|
13
|
+
/** Tuning options for a per-node circuit breaker. */
export interface CircuitBreakerConfig {
  /** Number of consecutive failures before opening the circuit (default: 5). */
  threshold?: number;
  /** How long to wait before trying a half-open probe in ms (default: 30000). */
  halfOpenAfter?: number;
}

/** Values used for any option the caller omits or passes as `undefined`. */
const DEFAULT_CB_CONFIG: Required<CircuitBreakerConfig> = {
  threshold: 5,
  halfOpenAfter: 30_000,
};

/** Module-wide cache holding one circuit breaker per nodeId. */
const nodeBreakers = new Map<string, ReturnType<typeof circuitBreaker>>();

/**
 * Get or create a circuit breaker for a node. Breakers are cached per nodeId.
 *
 * `config` is only consulted when the breaker is first created; later calls
 * for the same nodeId return the cached breaker unchanged.
 *
 * @param nodeId - Node the breaker tracks.
 * @param config - Optional overrides for the default threshold / half-open delay.
 * @returns The cached or newly created breaker policy.
 */
export function getNodeCircuitBreaker(nodeId: string, config?: CircuitBreakerConfig) {
  const cached = nodeBreakers.get(nodeId);
  if (cached) return cached;

  // Resolve each option with `??` rather than object spread: spreading
  // `{ threshold: undefined }` over the defaults would overwrite the default
  // with `undefined` and hand ConsecutiveBreaker an unusable threshold.
  const threshold = config?.threshold ?? DEFAULT_CB_CONFIG.threshold;
  const halfOpenAfter = config?.halfOpenAfter ?? DEFAULT_CB_CONFIG.halfOpenAfter;

  const created = circuitBreaker(handleAll, {
    halfOpenAfter,
    breaker: new ConsecutiveBreaker(threshold),
  });
  nodeBreakers.set(nodeId, created);
  return created;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Drop the cached circuit breaker for a node (e.g. when the peer leaves).
 * A later getNodeCircuitBreaker call for the same nodeId creates a fresh one.
 *
 * @param nodeId - Node whose breaker should be forgotten.
 */
export function removeNodeCircuitBreaker(nodeId: string): void {
  nodeBreakers.delete(nodeId);
}
|
|
45
|
+
|
|
46
|
+
/**
 * Check if a node's circuit is currently open (unhealthy).
 *
 * @param nodeId - Node to look up.
 * @returns `true` only when a breaker exists for the node and is in the open
 *   state; `false` when no breaker has been created yet.
 */
export function isNodeCircuitOpen(nodeId: string): boolean {
  // cockatiel exposes the breaker state as the CircuitState enum, so it must
  // be compared against the enum member — a string literal never matches.
  return nodeBreakers.get(nodeId)?.state === CircuitState.Open;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Forget every cached circuit breaker (e.g. on shutdown). Breakers are
 * recreated on demand by the next getNodeCircuitBreaker call.
 */
export function resetAllCircuitBreakers(): void {
  nodeBreakers.clear();
}
|
|
57
|
+
|
|
58
|
+
// ── Adaptive backoff computation ─────────────────────────────────
|
|
59
|
+
|
|
60
|
+
/** Bounds for an exponential backoff schedule. */
export interface BackoffParams {
  /** Base delay in ms. */
  base: number;
  /** Maximum delay in ms. */
  max: number;
}

/**
 * Compute an exponential backoff delay with jitter.
 *
 * The delay is `min(base * 2^attempt, max) * random(0.5, 1.0)`, rounded to the
 * nearest millisecond.
 *
 * @param attempt - Zero-based retry attempt. Negative or NaN values are
 *   treated as attempt 0 so the result never drops below the jittered base
 *   delay and is never NaN.
 * @param params - Base and maximum delay in ms.
 * @returns Delay in ms.
 */
export function computeBackoffDelay(attempt: number, params: BackoffParams): number {
  // Clamp the exponent: below 0 would shrink the delay under `base`, NaN would
  // poison the result, and above 1023 `2 ** n` overflows to Infinity, which
  // turns `0 * Infinity` into NaN when base is 0.
  const exponent = Number.isNaN(attempt) ? 0 : Math.min(Math.max(attempt, 0), 1023);
  const capped = Math.min(params.base * 2 ** exponent, params.max);
  // Jitter in [0.5, 1.0) spreads simultaneous reconnects apart.
  return Math.round(capped * (0.5 + Math.random() * 0.5));
}

/**
 * Get adaptive backoff parameters based on WebSocket close code.
 *
 * @param code - Close code of the connection that dropped.
 * @returns A fresh parameter object for that code.
 */
export function getReconnectBackoff(code: number): BackoffParams {
  switch (code) {
    case 4001:
      return { base: 30_000, max: 300_000 }; // auth failed → slow
    case 4003:
      return { base: 5_000, max: 60_000 }; // auth timeout → medium
    case 1006:
      return { base: 1_000, max: 10_000 }; // network error → fast
    case 1001:
      return { base: 2_000, max: 30_000 }; // going away → medium-fast
    default:
      return { base: 1_000, max: 60_000 }; // default
  }
}
|