clawmatrix 0.4.2 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -21
- package/cli/bin/clawmatrix.mjs +300 -1
- package/package.json +8 -1
- package/src/acp-proxy.ts +122 -50
- package/src/{web.ts → api.ts} +646 -25
- package/src/audit.ts +37 -2
- package/src/auth.ts +5 -10
- package/src/automation.ts +625 -0
- package/src/cluster-service.ts +172 -16
- package/src/compat.ts +103 -0
- package/src/config.ts +75 -27
- package/src/connection.ts +225 -37
- package/src/crypto.ts +72 -5
- package/src/device-info.ts +21 -2
- package/src/file-transfer.ts +3 -2
- package/src/handoff.ts +90 -32
- package/src/health-tracker.ts +91 -356
- package/src/index.ts +421 -13
- package/src/kanban.ts +507 -0
- package/src/knowledge-sync.ts +158 -7
- package/src/local-tools.ts +65 -2
- package/src/log-replication.ts +198 -0
- package/src/model-proxy.ts +152 -60
- package/src/peer-approval.ts +3 -2
- package/src/peer-manager.ts +237 -47
- package/src/retry.ts +81 -0
- package/src/router.ts +152 -104
- package/src/sentinel.ts +86 -52
- package/src/store.ts +578 -0
- package/src/terminal.ts +17 -8
- package/src/tool-proxy.ts +6 -5
- package/src/tools/cluster-events.ts +6 -6
- package/src/tools/cluster-kanban.ts +345 -0
- package/src/tools/cluster-peers.ts +1 -1
- package/src/tools/cluster-query.ts +145 -0
- package/src/types.ts +95 -9
package/src/peer-manager.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { EventEmitter } from "
|
|
1
|
+
import { EventEmitter } from "eventemitter3";
|
|
2
2
|
import path from "node:path";
|
|
3
3
|
import { homedir, tmpdir } from "node:os";
|
|
4
4
|
import { createServer, type IncomingMessage, type ServerResponse, type Server } from "node:http";
|
|
@@ -22,10 +22,11 @@ import type {
|
|
|
22
22
|
PeerSync,
|
|
23
23
|
} from "./types.ts";
|
|
24
24
|
import { PeerApprovalManager, type ChannelApi, type NotifyTarget } from "./peer-approval.ts";
|
|
25
|
+
import { computeBackoffDelay, getReconnectBackoff } from "./retry.ts";
|
|
25
26
|
import { loadOrCreateIdentity } from "./identity.ts";
|
|
26
27
|
import type { KeyPair } from "./crypto.ts";
|
|
27
28
|
|
|
28
|
-
|
|
29
|
+
// Reconnect constants kept for RECONNECT_MAX reference in failover scheduling
|
|
29
30
|
const RECONNECT_MAX = 60_000;
|
|
30
31
|
|
|
31
32
|
/** Frame types that bypass dedup (streams share one id across chunks; responses share id with request). */
|
|
@@ -51,6 +52,8 @@ const SKIP_DEDUP_TYPES = new Set([
|
|
|
51
52
|
"file_transfer_ack", "file_transfer_complete",
|
|
52
53
|
]);
|
|
53
54
|
|
|
55
|
+
// Reconnect backoff params are now provided by retry.ts → getReconnectBackoff()
|
|
56
|
+
|
|
54
57
|
/** Classify WebSocket close code into a human-readable reason. */
|
|
55
58
|
function classifyCloseReason(code: number, reason: string): string {
|
|
56
59
|
if (reason) return reason;
|
|
@@ -94,6 +97,8 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
94
97
|
/** Reconnect timers keyed by `nodeId|url` for per-channel reconnection. */
|
|
95
98
|
private reconnectTimers = new Map<string, ReturnType<typeof setTimeout>>();
|
|
96
99
|
private reconnectAttempts = new Map<string, number>();
|
|
100
|
+
/** Last close code per channel key for adaptive reconnect backoff. */
|
|
101
|
+
private lastCloseCode = new Map<string, number>();
|
|
97
102
|
/** Track which nodeIds have already completed the full peer join (for multi-channel). */
|
|
98
103
|
private joinedPeers = new Set<string>();
|
|
99
104
|
/** All configured URLs per peer (for multi-URL peers). */
|
|
@@ -104,6 +109,10 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
104
109
|
private urlProbeLatencies = new Map<string, number>();
|
|
105
110
|
/** Route probe interval timer. */
|
|
106
111
|
private probeTimer: ReturnType<typeof setInterval> | null = null;
|
|
112
|
+
/** EMA latency baseline per peer (for spike detection). */
|
|
113
|
+
private latencyBaselines = new Map<string, number>();
|
|
114
|
+
/** Last time a latency-triggered probe was fired per peer (debounce). */
|
|
115
|
+
private lastProbeTime = new Map<string, number>();
|
|
107
116
|
/** Deferred disconnect timers — grace period before broadcasting peer_leave. */
|
|
108
117
|
private disconnectGraceTimers = new Map<string, ReturnType<typeof setTimeout>>();
|
|
109
118
|
private stopped = false;
|
|
@@ -121,10 +130,10 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
121
130
|
private rateLimiter: RateLimiter;
|
|
122
131
|
readonly approvalManager: PeerApprovalManager;
|
|
123
132
|
|
|
124
|
-
constructor(config: ClawMatrixConfig, openclawVersion?: string) {
|
|
133
|
+
constructor(config: ClawMatrixConfig, openclawVersion?: string, openclawConfig?: Record<string, unknown>) {
|
|
125
134
|
super();
|
|
126
135
|
this.config = config;
|
|
127
|
-
this.localDeviceInfo = collectDeviceInfo(openclawVersion);
|
|
136
|
+
this.localDeviceInfo = collectDeviceInfo(openclawVersion, openclawConfig);
|
|
128
137
|
const acpAgents = config.acp?.enabled ? config.acp.agents : undefined;
|
|
129
138
|
this.localCapabilities = {
|
|
130
139
|
nodeId: config.nodeId,
|
|
@@ -285,7 +294,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
285
294
|
}
|
|
286
295
|
|
|
287
296
|
// ── Route probing (for multi-URL peers) ──────────────────────────
|
|
288
|
-
private static readonly PROBE_INTERVAL =
|
|
297
|
+
private static readonly PROBE_INTERVAL = 900_000; // 15 minutes
|
|
289
298
|
/** Minimum improvement ratio to trigger a route switch. */
|
|
290
299
|
private static readonly SWITCH_THRESHOLD = 0.7; // new must be ≤70% of current
|
|
291
300
|
|
|
@@ -385,6 +394,32 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
385
394
|
this.switchRoute(nodeId, bestUrl);
|
|
386
395
|
}
|
|
387
396
|
|
|
397
|
+
/**
 * Trigger an immediate route probe for a specific peer (debounced).
 *
 * Does nothing for peers with at most one known URL, or when a probe for this
 * peer was already started within the last 60 seconds. Otherwise every
 * non-active URL is probed sequentially, the measured latencies are recorded
 * in `urlProbeLatencies`, the active URL's entry is refreshed from the current
 * route latency, and `evaluateRouteSwitch` decides whether to switch.
 *
 * The probe runs in the background; this method returns before it completes.
 *
 * @param nodeId Peer whose alternative URLs should be probed.
 */
private triggerProbeForPeer(nodeId: string) {
  const urls = this.peerUrls.get(nodeId);
  if (!urls || urls.length <= 1) return;

  const now = Date.now();
  const lastProbe = this.lastProbeTime.get(nodeId) ?? 0;
  if (now - lastProbe < 60_000) return; // debounce: 1 per minute
  this.lastProbeTime.set(nodeId, now);

  // Captured before the first await, so the probe loop skips the URL that was
  // active when the probe was requested.
  const activeUrl = this.activeUrls.get(nodeId);

  const runProbe = async (): Promise<void> => {
    // Probe non-active URLs for this peer
    for (const url of urls) {
      if (url === activeUrl) continue;
      const latency = await this.probeUrl(url);
      if (latency !== null) this.urlProbeLatencies.set(url, latency);
    }
    if (activeUrl) {
      // The active URL is not probed; use the live route latency instead.
      const route = this.router.getRoute(nodeId);
      if (route && route.latencyMs > 0) this.urlProbeLatencies.set(activeUrl, route.latencyMs);
    }
    this.evaluateRouteSwitch(nodeId);
  };

  // Fire-and-forget, but never leave the promise floating: a throw from
  // probeUrl/evaluateRouteSwitch would otherwise be an unhandled rejection.
  runProbe().catch((err: unknown) => {
    debug("peer", `triggerProbeForPeer(${nodeId}): probe failed: ${err}`);
  });
}
|
|
422
|
+
|
|
388
423
|
/** Callback to check if there are active tasks involving a peer. Set by ClusterRuntime. */
|
|
389
424
|
private activeTaskChecker: ((nodeId: string) => boolean) | null = null;
|
|
390
425
|
|
|
@@ -465,10 +500,13 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
465
500
|
this.inboundIps.set(ws, ip);
|
|
466
501
|
this.handleInboundOpen(ws, ip);
|
|
467
502
|
|
|
468
|
-
ws.on("message", (data) => {
|
|
503
|
+
ws.on("message", (data, isBinary) => {
|
|
469
504
|
const conn = this.inboundConnections.get(ws);
|
|
470
505
|
if (conn) {
|
|
471
|
-
|
|
506
|
+
// ws package always delivers data as Buffer; use isBinary to distinguish frame type.
|
|
507
|
+
// Text frames (e.g. base64 encrypted envelopes) must be passed as strings
|
|
508
|
+
// so Connection routes them through decryptBinary, not decryptBinaryRaw.
|
|
509
|
+
conn.feedMessage(isBinary ? (Buffer.isBuffer(data) ? data : Buffer.from(data as any)) : data.toString());
|
|
472
510
|
}
|
|
473
511
|
});
|
|
474
512
|
|
|
@@ -492,7 +530,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
492
530
|
private handleInboundOpen(ws: WsWebSocket, ip: string) {
|
|
493
531
|
// Wrap ws WebSocket into our WsTransport interface
|
|
494
532
|
const transport: WsTransport = {
|
|
495
|
-
send(data: string) {
|
|
533
|
+
send(data: string | Buffer) {
|
|
496
534
|
ws.send(data);
|
|
497
535
|
},
|
|
498
536
|
close(code?: number, reason?: string) {
|
|
@@ -539,10 +577,44 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
539
577
|
/**
 * Start the outbound connection(s) for a configured peer.
 *
 * A single-URL peer gets one channel. A multi-URL peer gets one channel per
 * URL, all opened at once; after a 10 second warm-up
 * `pruneWarmupConnections` keeps only the lowest-latency channel.
 * The first URL is recorded as the active URL in both cases.
 *
 * @param peer Peer configuration; `peer.url` may be a single URL or a list.
 */
private connectToPeer(peer: PeerConfig) {
  const urls = Array.isArray(peer.url) ? peer.url : [peer.url];
  this.peerUrls.set(peer.nodeId, urls);

  const [firstUrl] = urls;
  if (!firstUrl) {
    // An empty URL list (or missing URL) previously fell into the single-URL
    // branch and passed `undefined` on as the active URL and connect target.
    debug("peer", `connectToPeer(${peer.nodeId}): no URL configured, skipping`);
    return;
  }

  this.activeUrls.set(peer.nodeId, firstUrl);
  if (urls.length === 1) {
    this.connectToChannel(peer.nodeId, firstUrl);
    return;
  }

  // Multi-URL: connect all simultaneously, prune to best after 10s
  for (const url of urls) {
    this.connectToChannel(peer.nodeId, url);
  }
  // `peer.nodeId` is deliberately read when the timer fires, not captured now.
  // NOTE(review): presumably this lets the prune follow an in-place nodeId
  // rewrite of the config entry — confirm `peer` is the same object.
  setTimeout(() => this.pruneWarmupConnections(peer.nodeId), 10_000);
}
|
|
592
|
+
|
|
593
|
+
/**
 * After the warm-up period, keep only the lowest-latency open channel to a
 * peer and close every other open channel with code 1000 and the reason
 * "warm-up pruned".
 *
 * Channels without a latency measurement yet (`latencyMs <= 0`) are ranked as
 * if they had 10 000 ms latency, so any measured channel wins over them.
 * Does nothing when the peer has at most one channel or no channel is open.
 *
 * NOTE(review): `activeUrls` is not updated to the kept channel here —
 * confirm that route selection elsewhere accounts for this.
 *
 * @param nodeId Peer whose warm-up channels should be pruned.
 */
private pruneWarmupConnections(nodeId: string) {
  // Snapshot the list: we close connections while iterating, and the router
  // may drop closed channels from its own collection.
  const channels = [...this.router.getChannels(nodeId)];
  if (channels.length <= 1) return;

  let best: Connection | null = null;
  let bestLatency = Infinity;
  for (const conn of channels) {
    if (!conn.isOpen) continue;
    const lat = conn.latencyMs > 0 ? conn.latencyMs : 10_000;
    if (lat < bestLatency) {
      bestLatency = lat;
      best = conn;
    }
  }
  // No open channel at all: nothing to keep and nothing to prune.
  if (!best) return;

  // Count what is actually closed; channels that were already closed are not
  // "pruned" and must not inflate the number reported below.
  let pruned = 0;
  for (const conn of channels) {
    if (conn === best || !conn.isOpen) continue;
    conn.close(1000, "warm-up pruned");
    pruned++;
  }

  debug("peer", `pruneWarmupConnections(${nodeId}): kept channel with latency=${best.latencyMs}ms, pruned ${pruned} others`);
}
|
|
547
619
|
|
|
548
620
|
/** Pick the best URL for a peer based on probe latencies. Falls back to first URL. */
|
|
@@ -571,10 +643,14 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
571
643
|
const attempt = this.reconnectAttempts.get(channelKey) ?? 0;
|
|
572
644
|
debug("peer", `connectToChannel(${nodeId}): attempt=${attempt} url=${url}`);
|
|
573
645
|
|
|
574
|
-
// Use
|
|
575
|
-
|
|
646
|
+
// Use the `ws` package for outbound connections.
|
|
647
|
+
// Node.js 24+'s built-in WebSocket (undici) defaults binaryType to "blob",
|
|
648
|
+
// causing binary frames to arrive as Blob instead of Buffer — which
|
|
649
|
+
// onRawMessage cannot handle, silently dropping encrypted frames (including auth_ok).
|
|
650
|
+
// The `ws` package defaults to binaryType "nodebuffer", avoiding this issue.
|
|
651
|
+
let ws: InstanceType<typeof WsWebSocket>;
|
|
576
652
|
try {
|
|
577
|
-
ws = new
|
|
653
|
+
ws = new WsWebSocket(url, ["graphql-transport-ws"]);
|
|
578
654
|
} catch (err) {
|
|
579
655
|
debug("peer", `connectToChannel(${nodeId}): WebSocket constructor threw: ${err}`);
|
|
580
656
|
this.scheduleChannelReconnect(nodeId, url);
|
|
@@ -596,6 +672,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
596
672
|
conn.on("authenticated", (caps) => {
|
|
597
673
|
debug("peer", `connectToChannel(${nodeId}): authenticated url=${url}`);
|
|
598
674
|
this.reconnectAttempts.delete(channelKey);
|
|
675
|
+
this.lastCloseCode.delete(channelKey);
|
|
599
676
|
this.onPeerAuthenticated(conn, caps);
|
|
600
677
|
});
|
|
601
678
|
|
|
@@ -624,6 +701,14 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
624
701
|
debug("peer", `connectToChannel(${nodeId}): self-connection, will not reconnect`);
|
|
625
702
|
return;
|
|
626
703
|
}
|
|
704
|
+
// Don't reconnect warm-up pruned channels
|
|
705
|
+
if (ev.reason === "warm-up pruned") {
|
|
706
|
+
debug("peer", `connectToChannel(${nodeId}): warm-up pruned, will not reconnect`);
|
|
707
|
+
return;
|
|
708
|
+
}
|
|
709
|
+
// Record close code for adaptive backoff
|
|
710
|
+
const channelKey = `${nodeId}|${url}`;
|
|
711
|
+
this.lastCloseCode.set(channelKey, ev.code);
|
|
627
712
|
if (!lastError) {
|
|
628
713
|
lastError = classifyCloseReason(ev.code, ev.reason);
|
|
629
714
|
}
|
|
@@ -665,7 +750,8 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
665
750
|
}
|
|
666
751
|
}
|
|
667
752
|
|
|
668
|
-
const
|
|
753
|
+
const params = getReconnectBackoff(this.lastCloseCode.get(channelKey) ?? 1006);
|
|
754
|
+
const delay = computeBackoffDelay(attempt, params);
|
|
669
755
|
this.reconnectAttempts.set(channelKey, attempt + 1);
|
|
670
756
|
const tag = reason ? ` reason="${reason}"` : "";
|
|
671
757
|
debug("peer", `scheduleChannelReconnect(${nodeId}): attempt=${attempt} delay=${delay}ms url=${url}${tag}`);
|
|
@@ -693,11 +779,62 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
693
779
|
}
|
|
694
780
|
|
|
695
781
|
// ── Peer lifecycle ─────────────────────────────────────────────
|
|
782
|
+
|
|
783
|
+
/**
 * Re-key per-peer bookkeeping when a URL-derived placeholder nodeId
 * (one starting with "_") resolves to the peer's real nodeId.
 *
 * Moves the `peerUrls` / `activeUrls` entries (without overwriting entries the
 * real id already has), re-keys the per-channel reconnect timer, attempt
 * counter and last close code for every URL of the placeholder, and rewrites
 * the first matching entry in `config.peers` in place.
 *
 * @param derivedId Placeholder nodeId the peer was tracked under so far.
 * @param realId    NodeId to track the peer under from now on.
 */
private migrateUrlDerivedPeer(derivedId: string, realId: string) {
  // Move a per-peer entry to the real id unless the real id already has one.
  const adopt = <V>(table: Map<string, V>): V | undefined => {
    const value = table.get(derivedId);
    if (value) {
      table.delete(derivedId);
      if (!table.has(realId)) table.set(realId, value);
    }
    return value;
  };
  // Move a per-channel entry from one channel key to another.
  const rekey = <V>(table: Map<string, V>, from: string, to: string): void => {
    const value = table.get(from);
    if (value === undefined) return;
    table.delete(from);
    table.set(to, value);
  };

  // Migrate peerUrls, activeUrls
  const urls = adopt(this.peerUrls);
  adopt(this.activeUrls);

  // Migrate reconnect state: channelKey uses nodeId|url
  for (const url of urls ?? []) {
    const from = `${derivedId}|${url}`;
    const to = `${realId}|${url}`;
    const pendingTimer = this.reconnectTimers.get(from);
    if (pendingTimer) {
      this.reconnectTimers.delete(from);
      this.reconnectTimers.set(to, pendingTimer);
    }
    rekey(this.reconnectAttempts, from, to);
    rekey(this.lastCloseCode, from, to);
  }

  // Update the config peer entry in place for sentinel and other consumers
  const entry = this.config.peers.find((candidate) => candidate.nodeId === derivedId);
  if (entry) {
    (entry as { nodeId: string }).nodeId = realId;
  }

  debug("peer", `migrateUrlDerivedPeer: ${derivedId} → ${realId}`);
}
|
|
816
|
+
|
|
696
817
|
private onPeerAuthenticated(conn: Connection, caps: NodeCapabilities, ip?: string) {
|
|
697
818
|
const nodeId = conn.remoteNodeId!;
|
|
698
819
|
// Peer's persistent public key for TOFU identity binding
|
|
699
820
|
const peerPublicKey = conn.remoteIdentityKey ?? undefined;
|
|
700
821
|
|
|
822
|
+
// If this outbound peer was configured with URL string shorthand (nodeId starts
|
|
823
|
+
// with "_"), resolve to the real nodeId from the authenticated handshake.
|
|
824
|
+
if (conn.role === "outbound") {
|
|
825
|
+
for (const peer of this.config.peers) {
|
|
826
|
+
if (peer.nodeId.startsWith("_") && this.peerUrls.get(peer.nodeId)) {
|
|
827
|
+
const urls = this.peerUrls.get(peer.nodeId)!;
|
|
828
|
+
// Check if this connection belongs to this URL-derived peer
|
|
829
|
+
const activeUrl = this.activeUrls.get(peer.nodeId);
|
|
830
|
+
if (activeUrl && urls.length > 0) {
|
|
831
|
+
this.migrateUrlDerivedPeer(peer.nodeId, nodeId);
|
|
832
|
+
break;
|
|
833
|
+
}
|
|
834
|
+
}
|
|
835
|
+
}
|
|
836
|
+
}
|
|
837
|
+
|
|
701
838
|
// Prevent self-connection: close immediately if the remote side authenticated
|
|
702
839
|
// with our own nodeId. For outbound this means the peer URL accidentally
|
|
703
840
|
// points to self; for inbound it means a remote node is (mis)using our nodeId.
|
|
@@ -796,7 +933,16 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
796
933
|
// Additional channel — just add to the channel pool, no peer_join broadcast
|
|
797
934
|
this.router.addChannel(nodeId, conn);
|
|
798
935
|
conn.on("message", (frame) => this.onFrame(frame, conn));
|
|
799
|
-
conn.on("latency", () =>
|
|
936
|
+
conn.on("latency", (latencyMs) => {
|
|
937
|
+
this.router.updateActiveChannel(nodeId);
|
|
938
|
+
// Update baseline and check for spike
|
|
939
|
+
const baseline = this.latencyBaselines.get(nodeId) ?? latencyMs;
|
|
940
|
+
const newBaseline = baseline * 0.8 + latencyMs * 0.2;
|
|
941
|
+
this.latencyBaselines.set(nodeId, newBaseline);
|
|
942
|
+
if (latencyMs > baseline * 2 && latencyMs > 200) {
|
|
943
|
+
this.triggerProbeForPeer(nodeId);
|
|
944
|
+
}
|
|
945
|
+
});
|
|
800
946
|
conn.on("close", () => this.onChannelDisconnected(conn));
|
|
801
947
|
const channelCount = this.router.getChannelCount(nodeId);
|
|
802
948
|
debug("peer", `completePeerJoin(${nodeId}): additional channel added (total=${channelCount})`);
|
|
@@ -830,7 +976,16 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
830
976
|
this.joinedPeers.add(nodeId);
|
|
831
977
|
|
|
832
978
|
conn.on("message", (frame) => this.onFrame(frame, conn));
|
|
833
|
-
conn.on("latency", () =>
|
|
979
|
+
conn.on("latency", (latencyMs) => {
|
|
980
|
+
this.router.updateActiveChannel(nodeId);
|
|
981
|
+
// Update baseline and check for spike
|
|
982
|
+
const baseline = this.latencyBaselines.get(nodeId) ?? latencyMs;
|
|
983
|
+
const newBaseline = baseline * 0.8 + latencyMs * 0.2;
|
|
984
|
+
this.latencyBaselines.set(nodeId, newBaseline);
|
|
985
|
+
if (latencyMs > baseline * 2 && latencyMs > 200) {
|
|
986
|
+
this.triggerProbeForPeer(nodeId);
|
|
987
|
+
}
|
|
988
|
+
});
|
|
834
989
|
conn.on("close", () => this.onChannelDisconnected(conn));
|
|
835
990
|
|
|
836
991
|
this.sendPeerSync(conn);
|
|
@@ -862,6 +1017,9 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
862
1017
|
|
|
863
1018
|
audit("peer_join", { nodeId, detail: `agents=${caps.agents.length} models=${caps.models.length}` });
|
|
864
1019
|
this.emit("peerConnected", nodeId);
|
|
1020
|
+
|
|
1021
|
+
// Probe alternative routes after reconnect (connectivity may have changed)
|
|
1022
|
+
setTimeout(() => this.triggerProbeForPeer(nodeId), 5_000);
|
|
865
1023
|
}
|
|
866
1024
|
|
|
867
1025
|
/** Handle a single channel disconnecting (multi-channel aware). */
|
|
@@ -901,6 +1059,10 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
901
1059
|
if (nodeId === this.config.nodeId) {
|
|
902
1060
|
this.router.removePeer(nodeId);
|
|
903
1061
|
this.joinedPeers.delete(nodeId);
|
|
1062
|
+
this.latencyBaselines.delete(nodeId);
|
|
1063
|
+
this.lastProbeTime.delete(nodeId);
|
|
1064
|
+
// Clear the delta-sync version so that a full peer_sync is sent on reconnect
|
|
1065
|
+
this.peerSyncVersions.delete(nodeId);
|
|
904
1066
|
return;
|
|
905
1067
|
}
|
|
906
1068
|
|
|
@@ -950,6 +1112,9 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
950
1112
|
audit("peer_leave", { nodeId });
|
|
951
1113
|
this.router.removePeer(nodeId);
|
|
952
1114
|
this.joinedPeers.delete(nodeId);
|
|
1115
|
+
this.latencyBaselines.delete(nodeId);
|
|
1116
|
+
this.lastProbeTime.delete(nodeId);
|
|
1117
|
+
this.peerSyncVersions.delete(nodeId);
|
|
953
1118
|
|
|
954
1119
|
// Remove satellite contexts that were only reachable via this peer
|
|
955
1120
|
for (let i = this.satelliteContexts.length - 1; i >= 0; i--) {
|
|
@@ -1076,13 +1241,19 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
1076
1241
|
this.emit("frame", frame, from);
|
|
1077
1242
|
}
|
|
1078
1243
|
|
|
1244
|
+
/** Last sync version sent to each direct peer. */
|
|
1245
|
+
private peerSyncVersions = new Map<string, number>();
|
|
1246
|
+
|
|
1079
1247
|
private sendPeerSync(conn: Connection) {
|
|
1080
|
-
const
|
|
1081
|
-
const
|
|
1248
|
+
const remoteNodeId = conn.remoteNodeId ?? "";
|
|
1249
|
+
const sinceVersion = this.peerSyncVersions.get(remoteNodeId) ?? 0;
|
|
1250
|
+
const delta = this.router.buildPeerSyncDelta(sinceVersion);
|
|
1251
|
+
const payload: Record<string, unknown> = { ...delta };
|
|
1082
1252
|
if (this.satelliteContexts.length > 0) {
|
|
1083
1253
|
payload.satellites = this.satelliteContexts;
|
|
1084
1254
|
}
|
|
1085
|
-
|
|
1255
|
+
this.peerSyncVersions.set(remoteNodeId, delta.version);
|
|
1256
|
+
conn.sendDirect({
|
|
1086
1257
|
type: "peer_sync",
|
|
1087
1258
|
from: this.config.nodeId,
|
|
1088
1259
|
timestamp: Date.now(),
|
|
@@ -1105,34 +1276,53 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
1105
1276
|
}
|
|
1106
1277
|
|
|
1107
1278
|
let changed = false;
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
const
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
const hadToolProxyCatalogLen = prev?.toolProxy?.catalog?.length ?? 0;
|
|
1118
|
-
const hadToolProxyAllowLen = prev?.toolProxy?.allow?.length ?? 0;
|
|
1119
|
-
this.router.updatePeerCapabilities(peer.nodeId, peer);
|
|
1120
|
-
if (peer.agents.length !== hadAgents || peer.models.length !== (prev?.models.length ?? 0)
|
|
1121
|
-
|| (peer.directPeers?.length ?? 0) !== hadDirectPeers
|
|
1122
|
-
|| peer.toolProxy?.enabled !== hadToolProxyEnabled
|
|
1123
|
-
|| (peer.toolProxy?.catalog?.length ?? 0) !== hadToolProxyCatalogLen
|
|
1124
|
-
|| (peer.toolProxy?.allow?.length ?? 0) !== hadToolProxyAllowLen
|
|
1125
|
-
|| peer.deviceInfo?.hostname !== hadDeviceInfo
|
|
1126
|
-
|| (peer.acpAgents?.length ?? 0) !== hadAcpAgents) {
|
|
1279
|
+
|
|
1280
|
+
// Delta sync: apply removed, then added/updated
|
|
1281
|
+
if (frame.payload.version != null && frame.payload.removed) {
|
|
1282
|
+
for (const nodeId of frame.payload.removed) {
|
|
1283
|
+
if (nodeId === this.config.nodeId) continue;
|
|
1284
|
+
// Only remove relay routes — don't remove direct connections
|
|
1285
|
+
const route = this.router.getRoute(nodeId);
|
|
1286
|
+
if (route && !route.connection) {
|
|
1287
|
+
this.router.removePeer(nodeId);
|
|
1127
1288
|
changed = true;
|
|
1128
1289
|
}
|
|
1129
|
-
}
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1290
|
+
}
|
|
1291
|
+
}
|
|
1292
|
+
|
|
1293
|
+
// Process peers from either full sync or delta (updated field)
|
|
1294
|
+
// Delta sets peers = updated for backward compat, so this works for both
|
|
1295
|
+
const peersToProcess = frame.payload.peers;
|
|
1296
|
+
if (peersToProcess) {
|
|
1297
|
+
for (const peer of peersToProcess) {
|
|
1298
|
+
if (peer.nodeId === this.config.nodeId) continue;
|
|
1299
|
+
if (peer.nodeId === from.remoteNodeId) {
|
|
1300
|
+
const prev = this.router.getRoute(peer.nodeId);
|
|
1301
|
+
const hadAgents = prev?.agents.length ?? 0;
|
|
1302
|
+
const hadDirectPeers = prev?.directPeers.length ?? 0;
|
|
1303
|
+
const hadDeviceInfo = prev?.deviceInfo?.hostname;
|
|
1304
|
+
const hadAcpAgents = prev?.acpAgents?.length ?? 0;
|
|
1305
|
+
const hadToolProxyEnabled = prev?.toolProxy?.enabled;
|
|
1306
|
+
const hadToolProxyCatalogLen = prev?.toolProxy?.catalog?.length ?? 0;
|
|
1307
|
+
const hadToolProxyAllowLen = prev?.toolProxy?.allow?.length ?? 0;
|
|
1308
|
+
this.router.updatePeerCapabilities(peer.nodeId, peer);
|
|
1309
|
+
if (peer.agents.length !== hadAgents || peer.models.length !== (prev?.models.length ?? 0)
|
|
1310
|
+
|| (peer.directPeers?.length ?? 0) !== hadDirectPeers
|
|
1311
|
+
|| peer.toolProxy?.enabled !== hadToolProxyEnabled
|
|
1312
|
+
|| (peer.toolProxy?.catalog?.length ?? 0) !== hadToolProxyCatalogLen
|
|
1313
|
+
|| (peer.toolProxy?.allow?.length ?? 0) !== hadToolProxyAllowLen
|
|
1314
|
+
|| peer.deviceInfo?.hostname !== hadDeviceInfo
|
|
1315
|
+
|| (peer.acpAgents?.length ?? 0) !== hadAcpAgents) {
|
|
1316
|
+
changed = true;
|
|
1317
|
+
}
|
|
1318
|
+
} else {
|
|
1319
|
+
// Skip if the remote peer only knows about this node through us —
|
|
1320
|
+
// using them as relay would create a routing loop.
|
|
1321
|
+
if (peer.reachableVia === this.config.nodeId) continue;
|
|
1322
|
+
const existing = this.router.getRoute(peer.nodeId);
|
|
1323
|
+
if (!existing) changed = true;
|
|
1324
|
+
this.router.addRelayPeer(peer, from.remoteNodeId!);
|
|
1325
|
+
}
|
|
1136
1326
|
}
|
|
1137
1327
|
}
|
|
1138
1328
|
|
package/src/retry.ts
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared retry/backoff/circuit-breaker utilities backed by cockatiel.
|
|
3
|
+
*
|
|
4
|
+
* Provides:
|
|
5
|
+
* - Per-node circuit breakers for mesh health tracking
|
|
6
|
+
* - Adaptive backoff delay computation
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { circuitBreaker, CircuitState, ConsecutiveBreaker, handleAll, SamplingBreaker } from "cockatiel";
|
|
10
|
+
|
|
11
|
+
// ── Per-node circuit breakers ────────────────────────────────────
|
|
12
|
+
|
|
13
|
+
/** Tuning options for a per-node circuit breaker. */
export interface CircuitBreakerConfig {
  /** Number of consecutive failures before opening the circuit (default: 5). */
  threshold?: number;
  /** How long to wait before trying a half-open probe in ms (default: 30000). */
  halfOpenAfter?: number;
}

/** Values used for any option the caller leaves out or passes as `undefined`. */
const DEFAULT_CB_CONFIG: Required<CircuitBreakerConfig> = {
  threshold: 5,
  halfOpenAfter: 30_000,
};

/** Circuit breakers created so far, keyed by nodeId. */
const nodeBreakers = new Map<string, ReturnType<typeof circuitBreaker>>();

/**
 * Get or create a circuit breaker for a node. Breakers are cached per nodeId.
 *
 * `config` is only consulted when the breaker is first created; a later call
 * for the same nodeId returns the cached breaker and ignores `config`.
 *
 * @param nodeId Node the breaker guards.
 * @param config Optional overrides for the default threshold / half-open delay.
 * @returns The cached or newly created breaker policy for `nodeId`.
 */
export function getNodeCircuitBreaker(nodeId: string, config?: CircuitBreakerConfig) {
  const cached = nodeBreakers.get(nodeId);
  if (cached) return cached;

  // Resolve each option with `??` rather than an object spread: a spread lets
  // an explicitly passed `undefined` overwrite the default.
  const threshold = config?.threshold ?? DEFAULT_CB_CONFIG.threshold;
  const halfOpenAfter = config?.halfOpenAfter ?? DEFAULT_CB_CONFIG.halfOpenAfter;

  const cb = circuitBreaker(handleAll, {
    halfOpenAfter,
    breaker: new ConsecutiveBreaker(threshold),
  });
  nodeBreakers.set(nodeId, cb);
  return cb;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Remove a node's circuit breaker (e.g. when peer leaves).
 * A later `getNodeCircuitBreaker` call for the same nodeId creates a fresh one.
 */
export function removeNodeCircuitBreaker(nodeId: string) {
  nodeBreakers.delete(nodeId);
}

/**
 * Check if a node's circuit is currently open (unhealthy).
 *
 * @param nodeId Node to check.
 * @returns `true` only when a breaker exists for the node and its state is
 *   `CircuitState.Open`; `false` when no breaker has been created yet.
 */
export function isNodeCircuitOpen(nodeId: string): boolean {
  const cb = nodeBreakers.get(nodeId);
  if (!cb) return false;
  // `state` is cockatiel's `CircuitState` enum, not a string, so it has to be
  // compared against the enum member; a comparison with "open" never matches.
  return cb.state === CircuitState.Open;
}

/** Reset all circuit breakers (e.g. on shutdown) by dropping every cached breaker. */
export function resetAllCircuitBreakers() {
  nodeBreakers.clear();
}
|
|
57
|
+
|
|
58
|
+
// ── Adaptive backoff computation ─────────────────────────────────
|
|
59
|
+
|
|
60
|
+
/** Parameters of an exponential backoff schedule. */
export interface BackoffParams {
  /** Base delay in ms. */
  base: number;
  /** Maximum delay in ms. */
  max: number;
}

/**
 * Compute an exponential backoff delay with jitter.
 *
 * The result is `min(base * 2^attempt, max) * random(0.5, 1.0)`, rounded to a
 * whole number of milliseconds.
 *
 * Degenerate inputs are clamped instead of producing `NaN` or negative delays:
 * a negative or `NaN` attempt counts as attempt 0, and a non-positive base
 * yields a delay of 0.
 *
 * @param attempt Zero-based retry attempt number.
 * @param params  Base and maximum delay of the schedule.
 * @returns Delay in ms, never negative.
 */
export function computeBackoffDelay(attempt: number, params: BackoffParams): number {
  const exponent = Number.isNaN(attempt) ? 0 : Math.max(0, attempt);
  const base = Math.max(0, params.base);
  // Short-circuit a zero base: for large attempts 2 ** exponent is Infinity,
  // and 0 * Infinity would be NaN.
  const uncapped = base === 0 ? 0 : base * 2 ** exponent;
  const capped = Math.max(0, Math.min(uncapped, params.max));
  return Math.round(capped * (0.5 + Math.random() * 0.5));
}

/**
 * Get adaptive backoff parameters based on WebSocket close code.
 *
 * @param code WebSocket close code of the last disconnect.
 * @returns A fresh `BackoffParams` object for that code; unknown codes get
 *   the default schedule.
 */
export function getReconnectBackoff(code: number): BackoffParams {
  switch (code) {
    case 4001:
      return { base: 30_000, max: 300_000 }; // auth failed → slow
    case 4003:
      return { base: 5_000, max: 60_000 }; // auth timeout → medium
    case 1006:
      return { base: 1_000, max: 10_000 }; // network error → fast
    case 1001:
      return { base: 2_000, max: 30_000 }; // going away → medium-fast
    default:
      return { base: 1_000, max: 60_000 }; // default
  }
}
|