clawmatrix 0.3.1 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -91,8 +91,19 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
91
91
  private localCapabilities: NodeCapabilities;
92
92
  private httpServer: Server | null = null;
93
93
  private wss: WebSocketServer | null = null;
94
+ /** Reconnect timers keyed by `nodeId|url` for per-channel reconnection. */
94
95
  private reconnectTimers = new Map<string, ReturnType<typeof setTimeout>>();
95
96
  private reconnectAttempts = new Map<string, number>();
97
+ /** Track which nodeIds have already completed the full peer join (for multi-channel). */
98
+ private joinedPeers = new Set<string>();
99
+ /** All configured URLs per peer (for multi-URL peers). */
100
+ private peerUrls = new Map<string, string[]>();
101
+ /** Currently active URL per peer. */
102
+ private activeUrls = new Map<string, string>();
103
+ /** Last probe latency per URL (ms). */
104
+ private urlProbeLatencies = new Map<string, number>();
105
+ /** Route probe interval timer. */
106
+ private probeTimer: ReturnType<typeof setInterval> | null = null;
96
107
  /** Deferred disconnect timers — grace period before broadcasting peer_leave. */
97
108
  private disconnectGraceTimers = new Map<string, ReturnType<typeof setTimeout>>();
98
109
  private stopped = false;
@@ -191,10 +202,16 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
191
202
  for (const peer of this.config.peers) {
192
203
  this.connectToPeer(peer);
193
204
  }
205
+ // Start route probing for peers with multiple URLs
206
+ this.startRouteProbing();
194
207
  }
195
208
 
196
209
  async stop() {
197
210
  this.stopped = true;
211
+ if (this.probeTimer) {
212
+ clearInterval(this.probeTimer);
213
+ this.probeTimer = null;
214
+ }
198
215
  if (this.gossipDebounceTimer) {
199
216
  clearTimeout(this.gossipDebounceTimer);
200
217
  this.gossipDebounceTimer = null;
@@ -221,18 +238,187 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
221
238
  conn.close(1000, "shutdown");
222
239
  }
223
240
 
241
+ this.closeServers();
242
+
243
+ this.rateLimiter.destroy();
244
+ this.approvalManager.destroy();
245
+ this.router.destroy();
246
+ this.joinedPeers.clear();
247
+ }
248
+
249
+ /** Force-stop without broadcasting or waiting — used when graceful stop times out. */
250
+ forceStop() {
251
+ this.stopped = true;
252
+ if (this.probeTimer) { clearInterval(this.probeTimer); this.probeTimer = null; }
253
+ for (const timer of this.reconnectTimers.values()) clearTimeout(timer);
254
+ this.reconnectTimers.clear();
255
+ for (const [, timer] of this.disconnectGraceTimers) clearTimeout(timer);
256
+ this.disconnectGraceTimers.clear();
257
+ if (this.gossipDebounceTimer) {
258
+ clearTimeout(this.gossipDebounceTimer);
259
+ this.gossipDebounceTimer = null;
260
+ }
261
+ for (const conn of this.router.getDirectConnections()) {
262
+ try { conn.close(1000, "shutdown"); } catch { /* best effort */ }
263
+ }
264
+ this.closeServers();
265
+ this.rateLimiter.destroy();
266
+ this.approvalManager.destroy();
267
+ this.router.destroy();
268
+ this.joinedPeers.clear();
269
+ }
270
+
271
+ private closeServers() {
224
272
  if (this.wss) {
225
273
  this.wss.close();
226
274
  this.wss = null;
227
275
  }
228
276
  if (this.httpServer) {
277
+ // Force-close all keep-alive connections so the port is released immediately
278
+ const server = this.httpServer as typeof this.httpServer & { closeAllConnections?: () => void };
279
+ if (typeof server.closeAllConnections === "function") {
280
+ server.closeAllConnections();
281
+ }
229
282
  this.httpServer.close();
230
283
  this.httpServer = null;
231
284
  }
285
+ }
232
286
 
233
- this.rateLimiter.destroy();
234
- this.approvalManager.destroy();
235
- this.router.destroy();
287
+ // ── Route probing (for multi-URL peers) ──────────────────────────
288
+ private static readonly PROBE_INTERVAL = 3_600_000; // 1 hour
289
+ /** Minimum improvement ratio to trigger a route switch. */
290
+ private static readonly SWITCH_THRESHOLD = 0.7; // new must be ≤70% of current
291
+
292
+ private startRouteProbing() {
293
+ // Only probe if any peer has multiple URLs
294
+ const hasMultiUrl = [...this.peerUrls.values()].some((urls) => urls.length > 1);
295
+ if (!hasMultiUrl) return;
296
+
297
+ this.probeTimer = setInterval(() => this.probeAllRoutes(), PeerManager.PROBE_INTERVAL);
298
+ }
299
+
300
+ private async probeAllRoutes() {
301
+ for (const [nodeId, urls] of this.peerUrls) {
302
+ if (urls.length <= 1) continue;
303
+ const activeUrl = this.activeUrls.get(nodeId);
304
+ for (const url of urls) {
305
+ if (url === activeUrl) continue;
306
+ // Probe non-active URLs
307
+ const latency = await this.probeUrl(url);
308
+ if (latency !== null) {
309
+ this.urlProbeLatencies.set(url, latency);
310
+ }
311
+ }
312
+ // Also record active connection's real latency
313
+ if (activeUrl) {
314
+ const route = this.router.getRoute(nodeId);
315
+ if (route && route.latencyMs > 0) {
316
+ this.urlProbeLatencies.set(activeUrl, route.latencyMs);
317
+ }
318
+ }
319
+ // Evaluate if we should switch
320
+ this.evaluateRouteSwitch(nodeId);
321
+ }
322
+ }
323
+
324
+ /**
325
+ * Probe a URL by measuring HTTP response time (WS server also serves HTTP).
326
+ * Returns latency in ms, or null if unreachable.
327
+ */
328
+ private async probeUrl(wsUrl: string): Promise<number | null> {
329
+ try {
330
+ const httpUrl = wsUrl.replace(/^ws(s?):\/\//, "http$1://");
331
+ const start = Date.now();
332
+ const controller = new AbortController();
333
+ const timeout = setTimeout(() => controller.abort(), 5_000);
334
+ try {
335
+ const res = await fetch(httpUrl, {
336
+ method: "HEAD",
337
+ signal: controller.signal,
338
+ // @ts-ignore — Node.js 18+ supports this
339
+ keepalive: false,
340
+ });
341
+ clearTimeout(timeout);
342
+ if (res.ok || res.status === 200) {
343
+ return Date.now() - start;
344
+ }
345
+ return null;
346
+ } catch {
347
+ clearTimeout(timeout);
348
+ return null;
349
+ }
350
+ } catch {
351
+ return null;
352
+ }
353
+ }
354
+
355
+ /** Check if we should switch to a better URL for a peer. */
356
+ private evaluateRouteSwitch(nodeId: string) {
357
+ const urls = this.peerUrls.get(nodeId);
358
+ const activeUrl = this.activeUrls.get(nodeId);
359
+ if (!urls || !activeUrl || urls.length <= 1) return;
360
+
361
+ const currentLatency = this.urlProbeLatencies.get(activeUrl);
362
+ if (!currentLatency || currentLatency <= 0) return;
363
+
364
+ // Find best alternative
365
+ let bestUrl: string | undefined;
366
+ let bestLatency = Infinity;
367
+ for (const url of urls) {
368
+ if (url === activeUrl) continue;
369
+ const lat = this.urlProbeLatencies.get(url);
370
+ if (lat !== undefined && lat < bestLatency) {
371
+ bestLatency = lat;
372
+ bestUrl = url;
373
+ }
374
+ }
375
+
376
+ if (!bestUrl || bestLatency >= currentLatency * PeerManager.SWITCH_THRESHOLD) return;
377
+
378
+ // Check if there are active tasks — don't switch mid-task
379
+ if (this.hasActiveTasks(nodeId)) {
380
+ debug("probe", `${nodeId}: better route found (${activeUrl} ${currentLatency}ms → ${bestUrl} ${bestLatency}ms) but has active tasks, deferring`);
381
+ return;
382
+ }
383
+
384
+ debug("probe", `${nodeId}: switching route ${activeUrl} ${currentLatency}ms → ${bestUrl} ${bestLatency}ms`);
385
+ this.switchRoute(nodeId, bestUrl);
386
+ }
387
+
388
+ /** Callback to check if there are active tasks involving a peer. Set by ClusterRuntime. */
389
+ private activeTaskChecker: ((nodeId: string) => boolean) | null = null;
390
+
391
+ /** Register a callback to check for active tasks (used to prevent route switches mid-task). */
392
+ setActiveTaskChecker(checker: (nodeId: string) => boolean) {
393
+ this.activeTaskChecker = checker;
394
+ }
395
+
396
+ /** Check if there are active tasks involving a peer (handoffs, model requests, etc.). */
397
+ private hasActiveTasks(nodeId: string): boolean {
398
+ return this.activeTaskChecker?.(nodeId) ?? false;
399
+ }
400
+
401
+ /** Switch the active route for a peer to a new URL. */
402
+ private switchRoute(nodeId: string, newUrl: string) {
403
+ this.activeUrls.set(nodeId, newUrl);
404
+ // Connect to the new URL — the new connection will authenticate and join
405
+ // as an additional channel briefly, then we close the old one.
406
+ const oldRoute = this.router.getRoute(nodeId);
407
+ const oldConn = oldRoute?.connection;
408
+
409
+ this.connectToChannel(nodeId, newUrl);
410
+
411
+ // Close the old connection after a short delay (give new connection time to establish)
412
+ if (oldConn?.isOpen) {
413
+ setTimeout(() => {
414
+ // Only close if a new connection has taken over
415
+ const currentRoute = this.router.getRoute(nodeId);
416
+ if (currentRoute?.connection && currentRoute.connection !== oldConn) {
417
+ debug("probe", `${nodeId}: closing old channel after route switch`);
418
+ oldConn.close(1000, "route switch");
419
+ }
420
+ }, 5_000);
421
+ }
236
422
  }
237
423
 
238
424
  /** Set an HTTP request handler for non-WebSocket requests (e.g. web dashboard). */
@@ -351,26 +537,52 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
351
537
 
352
538
  // ── Outbound connections (standard WebSocket) ──────────────────
353
539
  private connectToPeer(peer: PeerConfig) {
540
+ const urls = Array.isArray(peer.url) ? peer.url : [peer.url];
541
+ this.peerUrls.set(peer.nodeId, urls);
542
+ // Connect to the first URL (or best known from probes)
543
+ const bestUrl = this.pickBestUrl(peer.nodeId, urls);
544
+ this.activeUrls.set(peer.nodeId, bestUrl);
545
+ this.connectToChannel(peer.nodeId, bestUrl);
546
+ }
547
+
548
+ /** Pick the best URL for a peer based on probe latencies. Falls back to first URL. */
549
+ private pickBestUrl(nodeId: string, urls: string[]): string {
550
+ if (urls.length <= 1) return urls[0];
551
+ let bestUrl = urls[0];
552
+ let bestLatency = Infinity;
553
+ for (const url of urls) {
554
+ const lat = this.urlProbeLatencies.get(url);
555
+ if (lat !== undefined && lat < bestLatency) {
556
+ bestLatency = lat;
557
+ bestUrl = url;
558
+ }
559
+ }
560
+ return bestUrl;
561
+ }
562
+
563
+ /** Connect a single channel (URL) for a peer. */
564
+ private connectToChannel(nodeId: string, url: string) {
354
565
  if (this.stopped) {
355
- debug("peer", `connectToPeer(${peer.nodeId}): skipped (stopped)`);
566
+ debug("peer", `connectToChannel(${nodeId}): skipped (stopped)`);
356
567
  return;
357
568
  }
358
569
 
359
- const attempt = this.reconnectAttempts.get(peer.nodeId) ?? 0;
360
- debug("peer", `connectToPeer(${peer.nodeId}): attempt=${attempt} url=${peer.url}`);
570
+ const channelKey = `${nodeId}|${url}`;
571
+ const attempt = this.reconnectAttempts.get(channelKey) ?? 0;
572
+ debug("peer", `connectToChannel(${nodeId}): attempt=${attempt} url=${url}`);
361
573
 
362
574
  // Use a common WS subprotocol for traffic disguise
363
575
  let ws: WebSocket;
364
576
  try {
365
- ws = new WebSocket(peer.url, ["graphql-transport-ws"]);
577
+ ws = new WebSocket(url, ["graphql-transport-ws"]);
366
578
  } catch (err) {
367
- debug("peer", `connectToPeer(${peer.nodeId}): WebSocket constructor threw: ${err}`);
368
- this.scheduleReconnect(peer);
579
+ debug("peer", `connectToChannel(${nodeId}): WebSocket constructor threw: ${err}`);
580
+ this.scheduleChannelReconnect(nodeId, url);
369
581
  return;
370
582
  }
371
583
 
372
584
  ws.addEventListener("open", () => {
373
- debug("peer", `connectToPeer(${peer.nodeId}): ws open`);
585
+ debug("peer", `connectToChannel(${nodeId}): ws open url=${url}`);
374
586
  const conn = new Connection(
375
587
  ws,
376
588
  "outbound",
@@ -382,8 +594,8 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
382
594
  conn.bindWebSocket(ws);
383
595
 
384
596
  conn.on("authenticated", (caps) => {
385
- debug("peer", `connectToPeer(${peer.nodeId}): authenticated`);
386
- this.reconnectAttempts.delete(peer.nodeId);
597
+ debug("peer", `connectToChannel(${nodeId}): authenticated url=${url}`);
598
+ this.reconnectAttempts.delete(channelKey);
387
599
  this.onPeerAuthenticated(conn, caps);
388
600
  });
389
601
 
@@ -397,7 +609,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
397
609
  const tryReconnect = () => {
398
610
  if (!reconnectScheduled) {
399
611
  reconnectScheduled = true;
400
- this.scheduleReconnect(peer, lastError);
612
+ this.scheduleChannelReconnect(nodeId, url, lastError);
401
613
  }
402
614
  };
403
615
 
@@ -409,7 +621,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
409
621
  // Don't reconnect if this was a self-connection (peer URL points to ourselves).
410
622
  // Without this guard, outbound detects self → closes → scheduleReconnect → loop.
411
623
  if (ev.code === 4002 && ev.reason === "self-connection") {
412
- debug("peer", `connectToPeer(${peer.nodeId}): self-connection, will not reconnect`);
624
+ debug("peer", `connectToChannel(${nodeId}): self-connection, will not reconnect`);
413
625
  return;
414
626
  }
415
627
  if (!lastError) {
@@ -419,24 +631,65 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
419
631
  });
420
632
  }
421
633
 
422
- private scheduleReconnect(peer: PeerConfig, reason?: string) {
634
+ private scheduleChannelReconnect(nodeId: string, url: string, reason?: string) {
423
635
  if (this.stopped) {
424
- debug("peer", `scheduleReconnect(${peer.nodeId}): skipped (stopped)`);
636
+ debug("peer", `scheduleChannelReconnect(${nodeId}): skipped (stopped)`);
425
637
  return;
426
638
  }
427
- if (this.reconnectTimers.has(peer.nodeId)) return;
639
+ const channelKey = `${nodeId}|${url}`;
640
+ if (this.reconnectTimers.has(channelKey)) return;
641
+
642
+ const attempt = this.reconnectAttempts.get(channelKey) ?? 0;
643
+
644
+ // On first failure, try an alternative URL immediately (failover)
645
+ if (attempt === 0) {
646
+ const urls = this.peerUrls.get(nodeId);
647
+ if (urls && urls.length > 1) {
648
+ const altUrl = this.pickNextUrl(nodeId, url, urls);
649
+ if (altUrl && altUrl !== url) {
650
+ debug("peer", `scheduleChannelReconnect(${nodeId}): failover ${url} → ${altUrl}`);
651
+ this.activeUrls.set(nodeId, altUrl);
652
+ this.reconnectAttempts.set(channelKey, attempt + 1);
653
+ // Connect to alternative immediately, schedule original for later
654
+ this.connectToChannel(nodeId, altUrl);
655
+ const timer = setTimeout(() => {
656
+ this.reconnectTimers.delete(channelKey);
657
+ // Only reconnect original URL if not already connected
658
+ if (!this.joinedPeers.has(nodeId)) {
659
+ this.connectToChannel(nodeId, url);
660
+ }
661
+ }, RECONNECT_MAX);
662
+ this.reconnectTimers.set(channelKey, timer);
663
+ return;
664
+ }
665
+ }
666
+ }
428
667
 
429
- const attempt = this.reconnectAttempts.get(peer.nodeId) ?? 0;
430
668
  const delay = Math.min(RECONNECT_BASE * 2 ** attempt, RECONNECT_MAX);
431
- this.reconnectAttempts.set(peer.nodeId, attempt + 1);
669
+ this.reconnectAttempts.set(channelKey, attempt + 1);
432
670
  const tag = reason ? ` reason="${reason}"` : "";
433
- debug("peer", `scheduleReconnect(${peer.nodeId}): attempt=${attempt} delay=${delay}ms${tag}`);
671
+ debug("peer", `scheduleChannelReconnect(${nodeId}): attempt=${attempt} delay=${delay}ms url=${url}${tag}`);
434
672
 
435
673
  const timer = setTimeout(() => {
436
- this.reconnectTimers.delete(peer.nodeId);
437
- this.connectToPeer(peer);
674
+ this.reconnectTimers.delete(channelKey);
675
+ this.connectToChannel(nodeId, url);
438
676
  }, delay);
439
- this.reconnectTimers.set(peer.nodeId, timer);
677
+ this.reconnectTimers.set(channelKey, timer);
678
+ }
679
+
680
+ /** Pick the next best URL to try, excluding the current one. */
681
+ private pickNextUrl(nodeId: string, currentUrl: string, urls: string[]): string | undefined {
682
+ let bestUrl: string | undefined;
683
+ let bestLatency = Infinity;
684
+ for (const url of urls) {
685
+ if (url === currentUrl) continue;
686
+ const lat = this.urlProbeLatencies.get(url) ?? 10_000;
687
+ if (lat < bestLatency) {
688
+ bestLatency = lat;
689
+ bestUrl = url;
690
+ }
691
+ }
692
+ return bestUrl ?? urls.find((u) => u !== currentUrl);
440
693
  }
441
694
 
442
695
  // ── Peer lifecycle ─────────────────────────────────────────────
@@ -536,6 +789,23 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
536
789
  // Cancel disconnect grace timer if the peer is reconnecting
537
790
  const wasInGrace = this.cancelDisconnectGrace(nodeId);
538
791
 
792
+ // Check if this peer already has an active connection (additional channel)
793
+ const isAdditionalChannel = this.joinedPeers.has(nodeId);
794
+
795
+ if (isAdditionalChannel) {
796
+ // Additional channel — just add to the channel pool, no peer_join broadcast
797
+ this.router.addChannel(nodeId, conn);
798
+ conn.on("message", (frame) => this.onFrame(frame, conn));
799
+ conn.on("latency", () => this.router.updateActiveChannel(nodeId));
800
+ conn.on("close", () => this.onChannelDisconnected(conn));
801
+ const channelCount = this.router.getChannelCount(nodeId);
802
+ debug("peer", `completePeerJoin(${nodeId}): additional channel added (total=${channelCount})`);
803
+ audit("channel_add", { nodeId, detail: `channels=${channelCount}` });
804
+ return;
805
+ }
806
+
807
+ // First channel — full join flow
808
+
539
809
  // If there's an existing connection for this nodeId (e.g. peer reconnected
540
810
  // while old TCP hadn't closed yet), close it AFTER overwriting the route so
541
811
  // the stale-close guard in onPeerDisconnected correctly skips cleanup.
@@ -551,9 +821,11 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
551
821
  oldConn.close(1000, "replaced by new connection");
552
822
  }
553
823
 
824
+ this.joinedPeers.add(nodeId);
825
+
554
826
  conn.on("message", (frame) => this.onFrame(frame, conn));
555
- conn.on("latency", (ms) => this.router.updateLatency(nodeId, ms));
556
- conn.on("close", () => this.onPeerDisconnected(conn));
827
+ conn.on("latency", () => this.router.updateActiveChannel(nodeId));
828
+ conn.on("close", () => this.onChannelDisconnected(conn));
557
829
 
558
830
  this.sendPeerSync(conn);
559
831
 
@@ -586,6 +858,25 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
586
858
  this.emit("peerConnected", nodeId);
587
859
  }
588
860
 
861
+ /** Handle a single channel disconnecting (multi-channel aware). */
862
+ private onChannelDisconnected(conn: Connection) {
863
+ const nodeId = conn.remoteNodeId;
864
+ if (!nodeId) return;
865
+
866
+ // Remove this channel from the pool
867
+ const hasRemaining = this.router.removeChannel(nodeId, conn);
868
+ if (hasRemaining) {
869
+ // Other channels still alive — just log, no peer_leave
870
+ const channelCount = this.router.getChannelCount(nodeId);
871
+ debug("peer", `onChannelDisconnected(${nodeId}): channel lost, ${channelCount} remaining`);
872
+ audit("channel_remove", { nodeId, detail: `channels=${channelCount}` });
873
+ return;
874
+ }
875
+
876
+ // Last channel gone — proceed with peer disconnect logic
877
+ this.onPeerDisconnected(conn);
878
+ }
879
+
589
880
  private onPeerDisconnected(conn: Connection) {
590
881
  const nodeId = conn.remoteNodeId;
591
882
  if (!nodeId) return;
@@ -603,6 +894,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
603
894
  // Same-nodeId 本地客户端断开:仅清理路由,不广播 peer_leave
604
895
  if (nodeId === this.config.nodeId) {
605
896
  this.router.removePeer(nodeId);
897
+ this.joinedPeers.delete(nodeId);
606
898
  return;
607
899
  }
608
900
 
@@ -651,6 +943,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
651
943
 
652
944
  audit("peer_leave", { nodeId });
653
945
  this.router.removePeer(nodeId);
946
+ this.joinedPeers.delete(nodeId);
654
947
 
655
948
  // Remove satellite contexts that were only reachable via this peer
656
949
  for (let i = this.satelliteContexts.length - 1; i >= 0; i--) {
package/src/router.ts CHANGED
@@ -30,7 +30,9 @@ export class Router {
30
30
  private localToolProxy?: ToolProxyInfo;
31
31
  private localAcpAgents?: AcpAgentInfo[];
32
32
  private routes = new Map<string, RouteEntry>();
33
- private connections = new Map<string, Connection>(); // nodeId → direct connection
33
+ private connections = new Map<string, Connection>(); // nodeId → active (best) direct connection
34
+ /** All live channels per nodeId (multi-channel support). */
35
+ private channels = new Map<string, Set<Connection>>();
34
36
  /** Double-map dedup: current window + previous window. Rotated periodically. */
35
37
  private seenCurrent = new Map<string, true>();
36
38
  private seenPrevious = new Map<string, true>();
@@ -128,6 +130,7 @@ export class Router {
128
130
  this.seenCurrent.clear();
129
131
  this.seenPrevious.clear();
130
132
  this.failedRequests.clear();
133
+ this.channels.clear();
131
134
  }
132
135
 
133
136
  // ── Route table management ─────────────────────────────────────
@@ -139,6 +142,10 @@ export class Router {
139
142
  const old = this.routes.get(nodeId);
140
143
  if (old) this.unindexEntry(old);
141
144
  this.connections.set(nodeId, connection);
145
+ // Add to channel set
146
+ let channelSet = this.channels.get(nodeId);
147
+ if (!channelSet) { channelSet = new Set(); this.channels.set(nodeId, channelSet); }
148
+ channelSet.add(connection);
142
149
  const entry: RouteEntry = {
143
150
  nodeId,
144
151
  agents: capabilities.agents,
@@ -157,6 +164,74 @@ export class Router {
157
164
  this.indexEntry(entry);
158
165
  }
159
166
 
167
+ /** Add an additional channel to an existing peer (multi-channel). */
168
+ addChannel(nodeId: string, connection: Connection) {
169
+ let channelSet = this.channels.get(nodeId);
170
+ if (!channelSet) { channelSet = new Set(); this.channels.set(nodeId, channelSet); }
171
+ channelSet.add(connection);
172
+ this.updateActiveChannel(nodeId);
173
+ }
174
+
175
+ /** Remove a single channel. Returns true if the peer still has live channels. */
176
+ removeChannel(nodeId: string, connection: Connection): boolean {
177
+ const channelSet = this.channels.get(nodeId);
178
+ if (channelSet) {
179
+ channelSet.delete(connection);
180
+ if (channelSet.size === 0) {
181
+ this.channels.delete(nodeId);
182
+ return false;
183
+ }
184
+ // Pick new active channel
185
+ this.updateActiveChannel(nodeId);
186
+ return true;
187
+ }
188
+ return false;
189
+ }
190
+
191
+ /** Re-evaluate the active (best) channel for a peer based on latency. */
192
+ updateActiveChannel(nodeId: string) {
193
+ const channelSet = this.channels.get(nodeId);
194
+ if (!channelSet || channelSet.size === 0) return;
195
+
196
+ let best: Connection | null = null;
197
+ let bestLatency = Infinity;
198
+ for (const conn of channelSet) {
199
+ if (!conn.isOpen) continue;
200
+ // Prefer lower latency; treat 0 (unmeasured) as high
201
+ const lat = conn.latencyMs > 0 ? conn.latencyMs : 10_000;
202
+ if (lat < bestLatency) {
203
+ bestLatency = lat;
204
+ best = conn;
205
+ }
206
+ }
207
+
208
+ if (best) {
209
+ this.connections.set(nodeId, best);
210
+ const route = this.routes.get(nodeId);
211
+ if (route) {
212
+ route.connection = best;
213
+ route.latencyMs = best.latencyMs;
214
+ }
215
+ }
216
+ }
217
+
218
+ /** Get the number of live channels for a peer. */
219
+ getChannelCount(nodeId: string): number {
220
+ const channelSet = this.channels.get(nodeId);
221
+ if (!channelSet) return 0;
222
+ let count = 0;
223
+ for (const conn of channelSet) {
224
+ if (conn.isOpen) count++;
225
+ }
226
+ return count;
227
+ }
228
+
229
+ /** Get all channels for a peer (for diagnostics/status). */
230
+ getChannels(nodeId: string): Connection[] {
231
+ const channelSet = this.channels.get(nodeId);
232
+ return channelSet ? [...channelSet] : [];
233
+ }
234
+
160
235
  addRelayPeer(peer: PeerInfo, viaNodeId: string) {
161
236
  // Don't add ourselves
162
237
  if (peer.nodeId === this.nodeId) return;
@@ -192,6 +267,7 @@ export class Router {
192
267
 
193
268
  removePeer(nodeId: string) {
194
269
  this.connections.delete(nodeId);
270
+ this.channels.delete(nodeId);
195
271
  const removed = this.routes.get(nodeId);
196
272
  if (removed) {
197
273
  this.unindexEntry(removed);
@@ -346,11 +422,27 @@ export class Router {
346
422
  const route = this.routes.get(targetNodeId);
347
423
  if (!route) return false;
348
424
 
425
+ // Try active connection first
349
426
  if (route.connection?.isOpen) {
350
427
  route.connection.send(frame);
351
428
  return true;
352
429
  }
353
430
 
431
+ // Fallback: try other channels (multi-channel failover)
432
+ const channelSet = this.channels.get(targetNodeId);
433
+ if (channelSet) {
434
+ for (const conn of channelSet) {
435
+ if (conn.isOpen) {
436
+ conn.send(frame);
437
+ // Promote to active
438
+ this.connections.set(targetNodeId, conn);
439
+ route.connection = conn;
440
+ route.latencyMs = conn.latencyMs;
441
+ return true;
442
+ }
443
+ }
444
+ }
445
+
354
446
  // Relay through intermediate node
355
447
  if (route.reachableVia) {
356
448
  const relay = this.connections.get(route.reachableVia);
package/src/tool-proxy.ts CHANGED
@@ -10,6 +10,9 @@ import type {
10
10
  } from "./types.ts";
11
11
  import type { PluginLogger } from "openclaw/plugin-sdk";
12
12
  import { isLocalTool, executeLocally } from "./local-tools.ts";
13
+ import { writeFileSync, mkdirSync } from "node:fs";
14
+ import { join } from "node:path";
15
+ import { tmpdir } from "node:os";
13
16
 
14
17
  const DEFAULT_TOOL_TIMEOUT = 30_000;
15
18
 
@@ -136,13 +139,41 @@ export class ToolProxy {
136
139
 
137
140
  if (frame.payload.success && frame.payload.result) {
138
141
  this.logger.info(`[clawmatrix] Tool response: id=${frame.id} from="${frame.from}" success`);
139
- pending.resolve(frame.payload.result);
142
+ const result = this.extractInlineImage(frame.payload.result);
143
+ pending.resolve(result);
140
144
  } else {
141
145
  this.logger.warn(`[clawmatrix] Tool response: id=${frame.id} from="${frame.from}" failed: ${frame.payload.error}`);
142
146
  pending.reject(new Error(frame.payload.error ?? "Remote tool execution failed"));
143
147
  }
144
148
  }
145
149
 
150
+ /**
151
+ * If the tool result contains inline base64 image data (mime: "image/*" + data),
152
+ * save it to a local temp file and replace `data` with `localPath`.
153
+ * This avoids flooding the LLM context with base64 text (saves ~tens of thousands of tokens).
154
+ */
155
+ private extractInlineImage(result: Record<string, unknown>): Record<string, unknown> {
156
+ const mime = result.mime;
157
+ const data = result.data;
158
+ if (typeof mime !== "string" || !mime.startsWith("image/") || typeof data !== "string") {
159
+ return result;
160
+ }
161
+
162
+ try {
163
+ const ext = mime === "image/png" ? ".png" : mime === "image/webp" ? ".webp" : ".jpg";
164
+ const dir = join(tmpdir(), "clawmatrix-images");
165
+ mkdirSync(dir, { recursive: true });
166
+ const localPath = join(dir, `${Date.now()}-${Math.random().toString(36).slice(2, 8)}${ext}`);
167
+ writeFileSync(localPath, Buffer.from(data, "base64"));
168
+ this.logger.info(`[clawmatrix] Saved inline image (${(data.length * 0.75 / 1024).toFixed(0)}KB) to ${localPath}`);
169
+ const { data: _stripped, ...rest } = result;
170
+ return { ...rest, localPath };
171
+ } catch (err) {
172
+ this.logger.warn(`[clawmatrix] Failed to extract inline image: ${err}`);
173
+ return result;
174
+ }
175
+ }
176
+
146
177
  // ── Incoming request: execute via local Gateway ────────────────
147
178
  async handleRequest(frame: ToolProxyRequest): Promise<void> {
148
179
  const { id, from, payload } = frame;
@@ -288,7 +319,14 @@ export class ToolProxy {
288
319
 
289
320
  clearTimeout(pending.timer);
290
321
  this.pendingBatch.delete(frame.id);
291
- pending.resolve(frame.payload.results);
322
+ // Extract inline images from batch results
323
+ const results = frame.payload.results.map((item) => {
324
+ if (item.success && item.result) {
325
+ return { ...item, result: this.extractInlineImage(item.result) };
326
+ }
327
+ return item;
328
+ });
329
+ pending.resolve(results);
292
330
  }
293
331
 
294
332
  // ── Incoming batch request: execute sequentially via local Gateway ──