clawmatrix 0.3.1 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli/bin/clawmatrix.mjs +1006 -0
- package/cli/package.json +27 -0
- package/cli/skills/clawmatrix/SKILL.md +104 -0
- package/openclaw.plugin.json +1 -0
- package/package.json +2 -1
- package/src/acp-proxy.ts +425 -37
- package/src/cluster-service.ts +82 -3
- package/src/config.ts +2 -1
- package/src/handoff.ts +8 -0
- package/src/health-tracker.ts +80 -19
- package/src/index.ts +471 -28
- package/src/knowledge-sync.ts +18 -4
- package/src/model-proxy.ts +16 -0
- package/src/peer-manager.ts +318 -25
- package/src/router.ts +93 -1
- package/src/tool-proxy.ts +40 -2
- package/src/tools/cluster-notify.ts +132 -0
- package/src/tools/cluster-peers.ts +2 -0
- package/src/types.ts +1 -1
- package/src/cli.ts +0 -711
package/src/peer-manager.ts
CHANGED
|
@@ -91,8 +91,19 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
91
91
|
private localCapabilities: NodeCapabilities;
|
|
92
92
|
private httpServer: Server | null = null;
|
|
93
93
|
private wss: WebSocketServer | null = null;
|
|
94
|
+
/** Reconnect timers keyed by `nodeId|url` for per-channel reconnection. */
|
|
94
95
|
private reconnectTimers = new Map<string, ReturnType<typeof setTimeout>>();
|
|
95
96
|
private reconnectAttempts = new Map<string, number>();
|
|
97
|
+
/** Track which nodeIds have already completed the full peer join (for multi-channel). */
|
|
98
|
+
private joinedPeers = new Set<string>();
|
|
99
|
+
/** All configured URLs per peer (for multi-URL peers). */
|
|
100
|
+
private peerUrls = new Map<string, string[]>();
|
|
101
|
+
/** Currently active URL per peer. */
|
|
102
|
+
private activeUrls = new Map<string, string>();
|
|
103
|
+
/** Last probe latency per URL (ms). */
|
|
104
|
+
private urlProbeLatencies = new Map<string, number>();
|
|
105
|
+
/** Route probe interval timer. */
|
|
106
|
+
private probeTimer: ReturnType<typeof setInterval> | null = null;
|
|
96
107
|
/** Deferred disconnect timers — grace period before broadcasting peer_leave. */
|
|
97
108
|
private disconnectGraceTimers = new Map<string, ReturnType<typeof setTimeout>>();
|
|
98
109
|
private stopped = false;
|
|
@@ -191,10 +202,16 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
191
202
|
for (const peer of this.config.peers) {
|
|
192
203
|
this.connectToPeer(peer);
|
|
193
204
|
}
|
|
205
|
+
// Start route probing for peers with multiple URLs
|
|
206
|
+
this.startRouteProbing();
|
|
194
207
|
}
|
|
195
208
|
|
|
196
209
|
async stop() {
|
|
197
210
|
this.stopped = true;
|
|
211
|
+
if (this.probeTimer) {
|
|
212
|
+
clearInterval(this.probeTimer);
|
|
213
|
+
this.probeTimer = null;
|
|
214
|
+
}
|
|
198
215
|
if (this.gossipDebounceTimer) {
|
|
199
216
|
clearTimeout(this.gossipDebounceTimer);
|
|
200
217
|
this.gossipDebounceTimer = null;
|
|
@@ -221,18 +238,187 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
221
238
|
conn.close(1000, "shutdown");
|
|
222
239
|
}
|
|
223
240
|
|
|
241
|
+
this.closeServers();
|
|
242
|
+
|
|
243
|
+
this.rateLimiter.destroy();
|
|
244
|
+
this.approvalManager.destroy();
|
|
245
|
+
this.router.destroy();
|
|
246
|
+
this.joinedPeers.clear();
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
/**
 * Tear the manager down immediately, without broadcasting to peers or
 * waiting for anything — the fallback for when a graceful stop times out.
 *
 * Cancels the probe, reconnect, disconnect-grace and gossip timers, closes
 * every direct connection on a best-effort basis, shuts the servers down and
 * destroys the rate limiter, approval manager and router.
 */
forceStop() {
  this.stopped = true;

  // Cancel every pending timer so nothing fires after teardown.
  if (this.probeTimer) {
    clearInterval(this.probeTimer);
    this.probeTimer = null;
  }
  this.reconnectTimers.forEach((pending) => clearTimeout(pending));
  this.reconnectTimers.clear();
  this.disconnectGraceTimers.forEach((pending) => clearTimeout(pending));
  this.disconnectGraceTimers.clear();
  if (this.gossipDebounceTimer) {
    clearTimeout(this.gossipDebounceTimer);
    this.gossipDebounceTimer = null;
  }

  // A connection that fails to close must not block the rest of the teardown.
  for (const connection of this.router.getDirectConnections()) {
    try {
      connection.close(1000, "shutdown");
    } catch {
      /* best effort */
    }
  }

  this.closeServers();
  this.rateLimiter.destroy();
  this.approvalManager.destroy();
  this.router.destroy();
  this.joinedPeers.clear();
}
|
|
270
|
+
|
|
271
|
+
/**
 * Close the WebSocket server and the HTTP server (whichever exist) and drop
 * the references to them.
 */
private closeServers() {
  const wsServer = this.wss;
  if (wsServer) {
    wsServer.close();
    this.wss = null;
  }

  const http = this.httpServer;
  if (!http) return;

  // closeAllConnections() is feature-detected because it may not exist on the
  // running server object; when present it drops keep-alive sockets so the
  // port is released immediately.
  const forceClosable = http as typeof http & { closeAllConnections?: () => void };
  if (typeof forceClosable.closeAllConnections === "function") {
    forceClosable.closeAllConnections();
  }
  http.close();
  this.httpServer = null;
}
|
|
232
286
|
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
287
|
+
// ── Route probing (for multi-URL peers) ──────────────────────────
|
|
288
|
+
private static readonly PROBE_INTERVAL = 3_600_000; // 1 hour
|
|
289
|
+
/** Minimum improvement ratio to trigger a route switch. */
|
|
290
|
+
private static readonly SWITCH_THRESHOLD = 0.7; // new must be ≤70% of current
|
|
291
|
+
|
|
292
|
+
/**
 * Start the periodic route probe.
 *
 * Does nothing when no known peer has more than one URL, or when a probe
 * timer is already running (so repeated calls never stack intervals).
 */
private startRouteProbing() {
  if (this.probeTimer) return;

  // Only probe if any peer has multiple URLs
  const hasMultiUrl = [...this.peerUrls.values()].some((urls) => urls.length > 1);
  if (!hasMultiUrl) return;

  this.probeTimer = setInterval(() => {
    // probeAllRoutes() is async: handle its rejection here, otherwise a throw
    // inside a probe cycle surfaces as an unhandled promise rejection.
    this.probeAllRoutes().catch((err: unknown) => {
      debug("probe", `route probe cycle failed: ${err}`);
    });
  }, PeerManager.PROBE_INTERVAL);
}
|
|
299
|
+
|
|
300
|
+
/**
 * Run one probe cycle over every peer that has more than one URL.
 *
 * For each such peer the non-active URLs are probed one after another, the
 * active URL gets the route's measured latency (when it is > 0), and then
 * `evaluateRouteSwitch` decides whether to move to a faster URL.
 */
private async probeAllRoutes() {
  // Iterate over a snapshot: the map may change while probes are awaited.
  for (const [nodeId, urls] of [...this.peerUrls]) {
    if (urls.length <= 1) continue;

    const activeAtStart = this.activeUrls.get(nodeId);
    for (const url of urls) {
      if (url === activeAtStart) continue;
      // Probe non-active URLs
      const latency = await this.probeUrl(url);
      // The manager may have been stopped while the probe was in flight.
      if (this.stopped) return;
      if (latency !== null) {
        this.urlProbeLatencies.set(url, latency);
      } else {
        // Forget the previous measurement: a stale value would let an
        // unreachable URL win the route-switch comparison.
        this.urlProbeLatencies.delete(url);
      }
    }

    // Re-read the active URL — a failover may have changed it during the awaits —
    // and record the active connection's real latency under it.
    const activeUrl = this.activeUrls.get(nodeId);
    if (activeUrl) {
      const route = this.router.getRoute(nodeId);
      if (route && route.latencyMs > 0) {
        this.urlProbeLatencies.set(activeUrl, route.latencyMs);
      }
    }

    // Evaluate if we should switch
    this.evaluateRouteSwitch(nodeId);
  }
}
|
|
323
|
+
|
|
324
|
+
/**
 * Probe a URL by timing an HTTP `HEAD` request against it (the `ws://` /
 * `wss://` scheme is rewritten to `http://` / `https://`).
 *
 * @param wsUrl WebSocket URL of the peer endpoint.
 * @returns Round-trip time in whole milliseconds when the response status is
 *   2xx; `null` on any other status, on error, or after the 5 s timeout.
 */
private async probeUrl(wsUrl: string): Promise<number | null> {
  let timeout: ReturnType<typeof setTimeout> | undefined;
  try {
    const httpUrl = wsUrl.replace(/^ws(s?):\/\//, "http$1://");
    const controller = new AbortController();
    timeout = setTimeout(() => controller.abort(), 5_000);

    // Monotonic clock: wall-clock adjustments must not distort the measurement.
    const start = performance.now();
    // `keepalive` is left at its Fetch default (false), so no option is passed.
    const res = await fetch(httpUrl, {
      method: "HEAD",
      signal: controller.signal,
    });
    return res.ok ? Math.round(performance.now() - start) : null;
  } catch {
    // Unreachable, aborted by the timeout, or malformed URL.
    return null;
  } finally {
    if (timeout !== undefined) clearTimeout(timeout);
  }
}
|
|
354
|
+
|
|
355
|
+
/**
 * Decide whether a peer should move to a faster URL, and switch if so.
 *
 * A switch happens only when the active URL has a positive recorded latency,
 * some other URL has a recorded latency below `SWITCH_THRESHOLD` times that
 * value, and the peer has no active tasks.
 */
private evaluateRouteSwitch(nodeId: string) {
  const candidates = this.peerUrls.get(nodeId);
  const current = this.activeUrls.get(nodeId);
  if (!candidates || !current || candidates.length <= 1) return;

  const currentLatency = this.urlProbeLatencies.get(current);
  if (!(currentLatency && currentLatency > 0)) return;

  // Lowest-latency alternative among the URLs that have a measurement.
  let fastest: { url: string; latency: number } | undefined;
  for (const url of candidates) {
    if (url === current) continue;
    const measured = this.urlProbeLatencies.get(url);
    if (measured === undefined) continue;
    if (measured < (fastest ? fastest.latency : Infinity)) {
      fastest = { url, latency: measured };
    }
  }

  if (!fastest || !fastest.url) return;
  if (fastest.latency >= currentLatency * PeerManager.SWITCH_THRESHOLD) return;

  // Never move the route out from under a running task.
  if (this.hasActiveTasks(nodeId)) {
    debug("probe", `${nodeId}: better route found (${current} ${currentLatency}ms → ${fastest.url} ${fastest.latency}ms) but has active tasks, deferring`);
    return;
  }

  debug("probe", `${nodeId}: switching route ${current} ${currentLatency}ms → ${fastest.url} ${fastest.latency}ms`);
  this.switchRoute(nodeId, fastest.url);
}
|
|
387
|
+
|
|
388
|
+
/** Callback to check if there are active tasks involving a peer. Set by ClusterRuntime. */
|
|
389
|
+
private activeTaskChecker: ((nodeId: string) => boolean) | null = null;
|
|
390
|
+
|
|
391
|
+
/**
 * Install the predicate that reports whether a peer currently has active tasks.
 * Route switching consults it so a route is not changed mid-task.
 *
 * @param checker Returns `true` while the given node has work in flight.
 */
setActiveTaskChecker(checker: (nodeId: string) => boolean) {
  this.activeTaskChecker = checker;
}
|
|
395
|
+
|
|
396
|
+
/**
 * Report whether the peer has active tasks according to the registered
 * checker. With no checker registered the answer is `false`.
 */
private hasActiveTasks(nodeId: string): boolean {
  if (this.activeTaskChecker == null) return false;
  return this.activeTaskChecker(nodeId) ?? false;
}
|
|
400
|
+
|
|
401
|
+
/**
 * Make `newUrl` the active URL for a peer and open a channel to it.
 *
 * The connection that was active before the switch is closed 5 s later, but
 * only if the router's route for the peer points at a different connection by
 * then; otherwise the old connection is left alone.
 */
private switchRoute(nodeId: string, newUrl: string) {
  this.activeUrls.set(nodeId, newUrl);

  // Remember the connection in use before dialing the replacement.
  const previous = this.router.getRoute(nodeId)?.connection;

  this.connectToChannel(nodeId, newUrl);

  if (!previous?.isOpen) return;

  // Give the new channel time to establish before retiring the old one.
  setTimeout(() => {
    const active = this.router.getRoute(nodeId)?.connection;
    // Only close if a new connection has taken over
    if (!active || active === previous) return;
    debug("probe", `${nodeId}: closing old channel after route switch`);
    previous.close(1000, "route switch");
  }, 5_000);
}
|
|
237
423
|
|
|
238
424
|
/** Set an HTTP request handler for non-WebSocket requests (e.g. web dashboard). */
|
|
@@ -351,26 +537,52 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
351
537
|
|
|
352
538
|
// ── Outbound connections (standard WebSocket) ──────────────────
|
|
353
539
|
/**
 * Register a configured peer's URL list and open a channel to the preferred
 * URL (lowest known probe latency, otherwise the first one).
 *
 * A peer whose configuration yields no usable URL is skipped with a debug
 * message instead of being dialed with an undefined address.
 *
 * @param peer Peer configuration; `url` may be a single URL or an array.
 */
private connectToPeer(peer: PeerConfig) {
  const configured = Array.isArray(peer.url) ? peer.url : [peer.url];
  // filter() also yields a fresh array, so the stored list never aliases the config.
  const urls = configured.filter((u): u is string => typeof u === "string" && u.length > 0);
  if (urls.length === 0) {
    debug("peer", `connectToPeer(${peer.nodeId}): no URL configured, skipping`);
    return;
  }

  this.peerUrls.set(peer.nodeId, urls);
  // Connect to the first URL (or best known from probes)
  const bestUrl = this.pickBestUrl(peer.nodeId, urls);
  this.activeUrls.set(peer.nodeId, bestUrl);
  this.connectToChannel(peer.nodeId, bestUrl);
}
|
|
547
|
+
|
|
548
|
+
/**
 * Choose the URL with the lowest recorded probe latency.
 * URLs without a measurement are ignored; when none has one, the first URL wins.
 */
private pickBestUrl(nodeId: string, urls: string[]): string {
  if (urls.length <= 1) return urls[0];

  let chosen = urls[0];
  let lowest = Infinity;
  for (const candidate of urls) {
    const measured = this.urlProbeLatencies.get(candidate);
    if (measured === undefined || !(measured < lowest)) continue;
    lowest = measured;
    chosen = candidate;
  }
  return chosen;
}
|
|
562
|
+
|
|
563
|
+
/** Connect a single channel (URL) for a peer. */
|
|
564
|
+
private connectToChannel(nodeId: string, url: string) {
|
|
354
565
|
if (this.stopped) {
|
|
355
|
-
debug("peer", `
|
|
566
|
+
debug("peer", `connectToChannel(${nodeId}): skipped (stopped)`);
|
|
356
567
|
return;
|
|
357
568
|
}
|
|
358
569
|
|
|
359
|
-
const
|
|
360
|
-
|
|
570
|
+
const channelKey = `${nodeId}|${url}`;
|
|
571
|
+
const attempt = this.reconnectAttempts.get(channelKey) ?? 0;
|
|
572
|
+
debug("peer", `connectToChannel(${nodeId}): attempt=${attempt} url=${url}`);
|
|
361
573
|
|
|
362
574
|
// Use a common WS subprotocol for traffic disguise
|
|
363
575
|
let ws: WebSocket;
|
|
364
576
|
try {
|
|
365
|
-
ws = new WebSocket(
|
|
577
|
+
ws = new WebSocket(url, ["graphql-transport-ws"]);
|
|
366
578
|
} catch (err) {
|
|
367
|
-
debug("peer", `
|
|
368
|
-
this.
|
|
579
|
+
debug("peer", `connectToChannel(${nodeId}): WebSocket constructor threw: ${err}`);
|
|
580
|
+
this.scheduleChannelReconnect(nodeId, url);
|
|
369
581
|
return;
|
|
370
582
|
}
|
|
371
583
|
|
|
372
584
|
ws.addEventListener("open", () => {
|
|
373
|
-
debug("peer", `
|
|
585
|
+
debug("peer", `connectToChannel(${nodeId}): ws open url=${url}`);
|
|
374
586
|
const conn = new Connection(
|
|
375
587
|
ws,
|
|
376
588
|
"outbound",
|
|
@@ -382,8 +594,8 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
382
594
|
conn.bindWebSocket(ws);
|
|
383
595
|
|
|
384
596
|
conn.on("authenticated", (caps) => {
|
|
385
|
-
debug("peer", `
|
|
386
|
-
this.reconnectAttempts.delete(
|
|
597
|
+
debug("peer", `connectToChannel(${nodeId}): authenticated url=${url}`);
|
|
598
|
+
this.reconnectAttempts.delete(channelKey);
|
|
387
599
|
this.onPeerAuthenticated(conn, caps);
|
|
388
600
|
});
|
|
389
601
|
|
|
@@ -397,7 +609,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
397
609
|
const tryReconnect = () => {
|
|
398
610
|
if (!reconnectScheduled) {
|
|
399
611
|
reconnectScheduled = true;
|
|
400
|
-
this.
|
|
612
|
+
this.scheduleChannelReconnect(nodeId, url, lastError);
|
|
401
613
|
}
|
|
402
614
|
};
|
|
403
615
|
|
|
@@ -409,7 +621,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
409
621
|
// Don't reconnect if this was a self-connection (peer URL points to ourselves).
|
|
410
622
|
// Without this guard, outbound detects self → closes → scheduleReconnect → loop.
|
|
411
623
|
if (ev.code === 4002 && ev.reason === "self-connection") {
|
|
412
|
-
debug("peer", `
|
|
624
|
+
debug("peer", `connectToChannel(${nodeId}): self-connection, will not reconnect`);
|
|
413
625
|
return;
|
|
414
626
|
}
|
|
415
627
|
if (!lastError) {
|
|
@@ -419,24 +631,65 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
419
631
|
});
|
|
420
632
|
}
|
|
421
633
|
|
|
422
|
-
private
|
|
634
|
+
/**
 * Schedule a reconnect for one channel (`nodeId` + `url`).
 *
 * - Does nothing when the manager is stopped or a timer is already pending
 *   for this channel.
 * - On the first failure of a multi-URL peer it fails over: the best
 *   alternative URL is dialed immediately and the original URL is retried
 *   after `RECONNECT_MAX`, but only if the peer has not joined by then.
 * - Otherwise it retries the same URL with exponential backoff
 *   (`RECONNECT_BASE * 2^attempt`, capped at `RECONNECT_MAX`).
 *
 * @param reason Optional failure description, included in the debug log.
 */
private scheduleChannelReconnect(nodeId: string, url: string, reason?: string) {
  if (this.stopped) {
    debug("peer", `scheduleChannelReconnect(${nodeId}): skipped (stopped)`);
    return;
  }
  const channelKey = `${nodeId}|${url}`;
  if (this.reconnectTimers.has(channelKey)) return;

  const attempt = this.reconnectAttempts.get(channelKey) ?? 0;

  // On first failure, try an alternative URL immediately (failover)
  if (attempt === 0) {
    const urls = this.peerUrls.get(nodeId);
    const altUrl = urls && urls.length > 1 ? this.pickNextUrl(nodeId, url, urls) : undefined;
    if (altUrl && altUrl !== url) {
      debug("peer", `scheduleChannelReconnect(${nodeId}): failover ${url} → ${altUrl}`);
      this.activeUrls.set(nodeId, altUrl);
      this.reconnectAttempts.set(channelKey, attempt + 1);

      // Register the delayed retry BEFORE dialing the alternative. If that
      // dial fails synchronously and the failure cascades back to this URL,
      // the re-entrant call sees the pending timer and returns, instead of
      // registering a second timer that would then be overwritten and orphaned.
      const timer = setTimeout(() => {
        this.reconnectTimers.delete(channelKey);
        // Only reconnect original URL if not already connected
        if (!this.joinedPeers.has(nodeId)) {
          this.connectToChannel(nodeId, url);
        }
      }, RECONNECT_MAX);
      this.reconnectTimers.set(channelKey, timer);

      this.connectToChannel(nodeId, altUrl);
      return;
    }
  }

  const delay = Math.min(RECONNECT_BASE * 2 ** attempt, RECONNECT_MAX);
  this.reconnectAttempts.set(channelKey, attempt + 1);
  const tag = reason ? ` reason="${reason}"` : "";
  debug("peer", `scheduleChannelReconnect(${nodeId}): attempt=${attempt} delay=${delay}ms url=${url}${tag}`);

  const timer = setTimeout(() => {
    this.reconnectTimers.delete(channelKey);
    this.connectToChannel(nodeId, url);
  }, delay);
  this.reconnectTimers.set(channelKey, timer);
}
|
|
679
|
+
|
|
680
|
+
/**
 * Choose the URL to try after `currentUrl` has failed: the other URL with the
 * lowest recorded probe latency, where URLs without a measurement count as
 * 10 000 ms. Returns `undefined` when there is no other URL.
 */
private pickNextUrl(nodeId: string, currentUrl: string, urls: string[]): string | undefined {
  const others = urls.filter((candidate) => candidate !== currentUrl);

  let pick: string | undefined;
  let lowest = Infinity;
  for (const candidate of others) {
    const estimate = this.urlProbeLatencies.get(candidate) ?? 10_000;
    if (estimate < lowest) {
      lowest = estimate;
      pick = candidate;
    }
  }
  return pick ?? others[0];
}
|
|
441
694
|
|
|
442
695
|
// ── Peer lifecycle ─────────────────────────────────────────────
|
|
@@ -536,6 +789,23 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
536
789
|
// Cancel disconnect grace timer if the peer is reconnecting
|
|
537
790
|
const wasInGrace = this.cancelDisconnectGrace(nodeId);
|
|
538
791
|
|
|
792
|
+
// Check if this peer already has an active connection (additional channel)
|
|
793
|
+
const isAdditionalChannel = this.joinedPeers.has(nodeId);
|
|
794
|
+
|
|
795
|
+
if (isAdditionalChannel) {
|
|
796
|
+
// Additional channel — just add to the channel pool, no peer_join broadcast
|
|
797
|
+
this.router.addChannel(nodeId, conn);
|
|
798
|
+
conn.on("message", (frame) => this.onFrame(frame, conn));
|
|
799
|
+
conn.on("latency", () => this.router.updateActiveChannel(nodeId));
|
|
800
|
+
conn.on("close", () => this.onChannelDisconnected(conn));
|
|
801
|
+
const channelCount = this.router.getChannelCount(nodeId);
|
|
802
|
+
debug("peer", `completePeerJoin(${nodeId}): additional channel added (total=${channelCount})`);
|
|
803
|
+
audit("channel_add", { nodeId, detail: `channels=${channelCount}` });
|
|
804
|
+
return;
|
|
805
|
+
}
|
|
806
|
+
|
|
807
|
+
// First channel — full join flow
|
|
808
|
+
|
|
539
809
|
// If there's an existing connection for this nodeId (e.g. peer reconnected
|
|
540
810
|
// while old TCP hadn't closed yet), close it AFTER overwriting the route so
|
|
541
811
|
// the stale-close guard in onPeerDisconnected correctly skips cleanup.
|
|
@@ -551,9 +821,11 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
551
821
|
oldConn.close(1000, "replaced by new connection");
|
|
552
822
|
}
|
|
553
823
|
|
|
824
|
+
this.joinedPeers.add(nodeId);
|
|
825
|
+
|
|
554
826
|
conn.on("message", (frame) => this.onFrame(frame, conn));
|
|
555
|
-
conn.on("latency", (
|
|
556
|
-
conn.on("close", () => this.
|
|
827
|
+
conn.on("latency", () => this.router.updateActiveChannel(nodeId));
|
|
828
|
+
conn.on("close", () => this.onChannelDisconnected(conn));
|
|
557
829
|
|
|
558
830
|
this.sendPeerSync(conn);
|
|
559
831
|
|
|
@@ -586,6 +858,25 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
586
858
|
this.emit("peerConnected", nodeId);
|
|
587
859
|
}
|
|
588
860
|
|
|
861
|
+
/**
 * React to one channel of a peer closing.
 *
 * The channel is removed from the router's pool. If the router reports that
 * other channels remain, the loss is only logged and audited; if it was the
 * last one, the regular peer-disconnect handling runs.
 */
private onChannelDisconnected(conn: Connection) {
  const nodeId = conn.remoteNodeId;
  if (!nodeId) return;

  const peerStillHasChannels = this.router.removeChannel(nodeId, conn);
  if (!peerStillHasChannels) {
    // Last channel gone — proceed with peer disconnect logic
    this.onPeerDisconnected(conn);
    return;
  }

  // Other channels still alive — just log, no peer_leave
  const remaining = this.router.getChannelCount(nodeId);
  debug("peer", `onChannelDisconnected(${nodeId}): channel lost, ${remaining} remaining`);
  audit("channel_remove", { nodeId, detail: `channels=${remaining}` });
}
|
|
879
|
+
|
|
589
880
|
private onPeerDisconnected(conn: Connection) {
|
|
590
881
|
const nodeId = conn.remoteNodeId;
|
|
591
882
|
if (!nodeId) return;
|
|
@@ -603,6 +894,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
603
894
|
// Same-nodeId 本地客户端断开:仅清理路由,不广播 peer_leave
|
|
604
895
|
if (nodeId === this.config.nodeId) {
|
|
605
896
|
this.router.removePeer(nodeId);
|
|
897
|
+
this.joinedPeers.delete(nodeId);
|
|
606
898
|
return;
|
|
607
899
|
}
|
|
608
900
|
|
|
@@ -651,6 +943,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
651
943
|
|
|
652
944
|
audit("peer_leave", { nodeId });
|
|
653
945
|
this.router.removePeer(nodeId);
|
|
946
|
+
this.joinedPeers.delete(nodeId);
|
|
654
947
|
|
|
655
948
|
// Remove satellite contexts that were only reachable via this peer
|
|
656
949
|
for (let i = this.satelliteContexts.length - 1; i >= 0; i--) {
|
package/src/router.ts
CHANGED
|
@@ -30,7 +30,9 @@ export class Router {
|
|
|
30
30
|
private localToolProxy?: ToolProxyInfo;
|
|
31
31
|
private localAcpAgents?: AcpAgentInfo[];
|
|
32
32
|
private routes = new Map<string, RouteEntry>();
|
|
33
|
-
private connections = new Map<string, Connection>(); // nodeId → direct connection
|
|
33
|
+
private connections = new Map<string, Connection>(); // nodeId → active (best) direct connection
|
|
34
|
+
/** All live channels per nodeId (multi-channel support). */
|
|
35
|
+
private channels = new Map<string, Set<Connection>>();
|
|
34
36
|
/** Double-map dedup: current window + previous window. Rotated periodically. */
|
|
35
37
|
private seenCurrent = new Map<string, true>();
|
|
36
38
|
private seenPrevious = new Map<string, true>();
|
|
@@ -128,6 +130,7 @@ export class Router {
|
|
|
128
130
|
this.seenCurrent.clear();
|
|
129
131
|
this.seenPrevious.clear();
|
|
130
132
|
this.failedRequests.clear();
|
|
133
|
+
this.channels.clear();
|
|
131
134
|
}
|
|
132
135
|
|
|
133
136
|
// ── Route table management ─────────────────────────────────────
|
|
@@ -139,6 +142,10 @@ export class Router {
|
|
|
139
142
|
const old = this.routes.get(nodeId);
|
|
140
143
|
if (old) this.unindexEntry(old);
|
|
141
144
|
this.connections.set(nodeId, connection);
|
|
145
|
+
// Add to channel set
|
|
146
|
+
let channelSet = this.channels.get(nodeId);
|
|
147
|
+
if (!channelSet) { channelSet = new Set(); this.channels.set(nodeId, channelSet); }
|
|
148
|
+
channelSet.add(connection);
|
|
142
149
|
const entry: RouteEntry = {
|
|
143
150
|
nodeId,
|
|
144
151
|
agents: capabilities.agents,
|
|
@@ -157,6 +164,74 @@ export class Router {
|
|
|
157
164
|
this.indexEntry(entry);
|
|
158
165
|
}
|
|
159
166
|
|
|
167
|
+
/**
 * Register one more connection for a peer and re-evaluate which of the
 * peer's channels should be the active one.
 */
addChannel(nodeId: string, connection: Connection) {
  const pool = this.channels.get(nodeId);
  if (pool) {
    pool.add(connection);
  } else {
    this.channels.set(nodeId, new Set<Connection>([connection]));
  }
  this.updateActiveChannel(nodeId);
}
|
|
174
|
+
|
|
175
|
+
/**
 * Drop one connection from a peer's channel pool.
 *
 * @returns `true` when the pool still contains at least one connection
 *   afterwards (the active channel is then re-evaluated); `false` when the
 *   pool is now empty (and removed) or the peer had no pool at all.
 */
removeChannel(nodeId: string, connection: Connection): boolean {
  const pool = this.channels.get(nodeId);
  if (!pool) return false;

  pool.delete(connection);
  if (pool.size > 0) {
    // Pick new active channel
    this.updateActiveChannel(nodeId);
    return true;
  }

  this.channels.delete(nodeId);
  return false;
}
|
|
190
|
+
|
|
191
|
+
/**
 * Select the open channel with the lowest latency as the peer's active
 * connection and mirror it into the route entry, if one exists.
 *
 * A channel whose latency is not positive (unmeasured) is ranked as 10 000 ms.
 * Nothing changes when the peer has no channels or none of them is open.
 */
updateActiveChannel(nodeId: string) {
  const pool = this.channels.get(nodeId);
  if (!pool || pool.size === 0) return;

  let winner: Connection | null = null;
  let lowest = Infinity;
  for (const candidate of pool) {
    if (!candidate.isOpen) continue;
    const effective = candidate.latencyMs > 0 ? candidate.latencyMs : 10_000;
    if (!(effective < lowest)) continue;
    lowest = effective;
    winner = candidate;
  }
  if (!winner) return;

  this.connections.set(nodeId, winner);
  const entry = this.routes.get(nodeId);
  if (!entry) return;
  entry.connection = winner;
  entry.latencyMs = winner.latencyMs;
}
|
|
217
|
+
|
|
218
|
+
/** Count the peer's channels that are currently open (0 for an unknown peer). */
getChannelCount(nodeId: string): number {
  const pool = this.channels.get(nodeId);
  if (!pool) return 0;
  return [...pool].filter((channel) => channel.isOpen).length;
}
|
|
228
|
+
|
|
229
|
+
/**
 * Return a fresh array with every channel registered for the peer, open or
 * not (for diagnostics/status). Unknown peers yield an empty array.
 */
getChannels(nodeId: string): Connection[] {
  const pool = this.channels.get(nodeId);
  if (!pool) return [];
  return Array.from(pool);
}
|
|
234
|
+
|
|
160
235
|
addRelayPeer(peer: PeerInfo, viaNodeId: string) {
|
|
161
236
|
// Don't add ourselves
|
|
162
237
|
if (peer.nodeId === this.nodeId) return;
|
|
@@ -192,6 +267,7 @@ export class Router {
|
|
|
192
267
|
|
|
193
268
|
removePeer(nodeId: string) {
|
|
194
269
|
this.connections.delete(nodeId);
|
|
270
|
+
this.channels.delete(nodeId);
|
|
195
271
|
const removed = this.routes.get(nodeId);
|
|
196
272
|
if (removed) {
|
|
197
273
|
this.unindexEntry(removed);
|
|
@@ -346,11 +422,27 @@ export class Router {
|
|
|
346
422
|
const route = this.routes.get(targetNodeId);
|
|
347
423
|
if (!route) return false;
|
|
348
424
|
|
|
425
|
+
// Try active connection first
|
|
349
426
|
if (route.connection?.isOpen) {
|
|
350
427
|
route.connection.send(frame);
|
|
351
428
|
return true;
|
|
352
429
|
}
|
|
353
430
|
|
|
431
|
+
// Fallback: try other channels (multi-channel failover)
|
|
432
|
+
const channelSet = this.channels.get(targetNodeId);
|
|
433
|
+
if (channelSet) {
|
|
434
|
+
for (const conn of channelSet) {
|
|
435
|
+
if (conn.isOpen) {
|
|
436
|
+
conn.send(frame);
|
|
437
|
+
// Promote to active
|
|
438
|
+
this.connections.set(targetNodeId, conn);
|
|
439
|
+
route.connection = conn;
|
|
440
|
+
route.latencyMs = conn.latencyMs;
|
|
441
|
+
return true;
|
|
442
|
+
}
|
|
443
|
+
}
|
|
444
|
+
}
|
|
445
|
+
|
|
354
446
|
// Relay through intermediate node
|
|
355
447
|
if (route.reachableVia) {
|
|
356
448
|
const relay = this.connections.get(route.reachableVia);
|
package/src/tool-proxy.ts
CHANGED
|
@@ -10,6 +10,9 @@ import type {
|
|
|
10
10
|
} from "./types.ts";
|
|
11
11
|
import type { PluginLogger } from "openclaw/plugin-sdk";
|
|
12
12
|
import { isLocalTool, executeLocally } from "./local-tools.ts";
|
|
13
|
+
import { writeFileSync, mkdirSync } from "node:fs";
|
|
14
|
+
import { join } from "node:path";
|
|
15
|
+
import { tmpdir } from "node:os";
|
|
13
16
|
|
|
14
17
|
const DEFAULT_TOOL_TIMEOUT = 30_000;
|
|
15
18
|
|
|
@@ -136,13 +139,41 @@ export class ToolProxy {
|
|
|
136
139
|
|
|
137
140
|
if (frame.payload.success && frame.payload.result) {
|
|
138
141
|
this.logger.info(`[clawmatrix] Tool response: id=${frame.id} from="${frame.from}" success`);
|
|
139
|
-
|
|
142
|
+
const result = this.extractInlineImage(frame.payload.result);
|
|
143
|
+
pending.resolve(result);
|
|
140
144
|
} else {
|
|
141
145
|
this.logger.warn(`[clawmatrix] Tool response: id=${frame.id} from="${frame.from}" failed: ${frame.payload.error}`);
|
|
142
146
|
pending.reject(new Error(frame.payload.error ?? "Remote tool execution failed"));
|
|
143
147
|
}
|
|
144
148
|
}
|
|
145
149
|
|
|
150
|
+
/**
 * If the tool result carries inline base64 image data (`mime` of `image/*`
 * plus a string `data`), write the image to a temp file and return a copy of
 * the result with `data` replaced by `localPath`.
 * This keeps large base64 strings out of the LLM context.
 *
 * - The MIME type is matched case-insensitively and without parameters
 *   (e.g. `image/PNG; charset=binary` is treated as `image/png`).
 * - A leading `data:...;base64,` prefix on `data` is stripped before decoding.
 * - Unknown image subtypes keep the historical `.jpg` extension.
 * - If nothing decodes, or writing fails, the original result is returned unchanged.
 *
 * @param result Tool result object as received from the remote node.
 * @returns The original object, or a copy with `localPath` instead of `data`.
 */
private extractInlineImage(result: Record<string, unknown>): Record<string, unknown> {
  const mime = result.mime;
  const data = result.data;
  if (typeof mime !== "string" || typeof data !== "string") {
    return result;
  }

  const mediaType = (mime.split(";", 1)[0] ?? "").trim().toLowerCase();
  if (!mediaType.startsWith("image/")) {
    return result;
  }

  try {
    const extensions: Record<string, string> = {
      "image/png": ".png",
      "image/webp": ".webp",
      "image/jpeg": ".jpg",
      "image/gif": ".gif",
      "image/bmp": ".bmp",
      "image/avif": ".avif",
      "image/svg+xml": ".svg",
    };
    const ext = extensions[mediaType] ?? ".jpg";

    // Tolerate a full data URL; its prefix is not part of the base64 payload.
    const payload = data.replace(/^data:[^,]*;base64,/i, "");
    const bytes = Buffer.from(payload, "base64");
    if (bytes.length === 0) {
      // Nothing decodable — do not leave an empty file behind.
      return result;
    }

    const dir = join(tmpdir(), "clawmatrix-images");
    // Owner-only permissions: the temp directory is shared with other users.
    mkdirSync(dir, { recursive: true, mode: 0o700 });
    const localPath = join(dir, `${Date.now()}-${Math.random().toString(36).slice(2, 8)}${ext}`);
    writeFileSync(localPath, bytes, { mode: 0o600 });
    this.logger.info(`[clawmatrix] Saved inline image (${(bytes.length / 1024).toFixed(0)}KB) to ${localPath}`);

    const { data: _stripped, ...rest } = result;
    return { ...rest, localPath };
  } catch (err) {
    this.logger.warn(`[clawmatrix] Failed to extract inline image: ${err}`);
    return result;
  }
}
|
|
176
|
+
|
|
146
177
|
// ── Incoming request: execute via local Gateway ────────────────
|
|
147
178
|
async handleRequest(frame: ToolProxyRequest): Promise<void> {
|
|
148
179
|
const { id, from, payload } = frame;
|
|
@@ -288,7 +319,14 @@ export class ToolProxy {
|
|
|
288
319
|
|
|
289
320
|
clearTimeout(pending.timer);
|
|
290
321
|
this.pendingBatch.delete(frame.id);
|
|
291
|
-
|
|
322
|
+
// Extract inline images from batch results
|
|
323
|
+
const results = frame.payload.results.map((item) => {
|
|
324
|
+
if (item.success && item.result) {
|
|
325
|
+
return { ...item, result: this.extractInlineImage(item.result) };
|
|
326
|
+
}
|
|
327
|
+
return item;
|
|
328
|
+
});
|
|
329
|
+
pending.resolve(results);
|
|
292
330
|
}
|
|
293
331
|
|
|
294
332
|
// ── Incoming batch request: execute sequentially via local Gateway ──
|