clawmatrix 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clawmatrix",
3
- "version": "0.4.0",
3
+ "version": "0.4.1",
4
4
  "description": "Decentralized mesh cluster plugin for OpenClaw — inter-gateway communication, model proxy, task handoff, and tool proxy.",
5
5
  "type": "module",
6
6
  "license": "MIT",
package/src/acp-proxy.ts CHANGED
@@ -2781,13 +2781,15 @@ async function readFirstUserMessageFromTranscript(transcriptPath: string): Promi
2781
2781
  const msg = parsed?.message;
2782
2782
  if (msg?.role !== "user") continue;
2783
2783
  // Extract text from content (string or array of content blocks)
2784
- if (typeof msg.content === "string") return msg.content.slice(0, 120);
2785
- if (Array.isArray(msg.content)) {
2784
+ let raw: string | null = null;
2785
+ if (typeof msg.content === "string") raw = msg.content;
2786
+ else if (Array.isArray(msg.content)) {
2786
2787
  for (const block of msg.content) {
2787
- if (typeof block === "string") return block.slice(0, 120);
2788
- if (block?.type === "text" && typeof block.text === "string") return block.text.slice(0, 120);
2788
+ if (typeof block === "string") { raw = block; break; }
2789
+ if (block?.type === "text" && typeof block.text === "string") { raw = block.text; break; }
2789
2790
  }
2790
2791
  }
2792
+ if (raw) return stripEnvelope(stripInboundMetadata(raw)).slice(0, 120) || null;
2791
2793
  } catch {
2792
2794
  // skip malformed lines
2793
2795
  }
@@ -2861,7 +2863,8 @@ async function fetchSessionListFromDisk(): Promise<AcpSessionInfo[]> {
2861
2863
  try {
2862
2864
  const content = await readFileText(storePath);
2863
2865
  const entries: Record<string, { sessionId?: string; updatedAt?: number; displayName?: string; subject?: string; label?: string; acp?: { agent?: string } }> = JSON.parse(content);
2864
- const agent = agentId;
2866
+ // Use ACP agent type if available, otherwise default to "openclaw" for native sessions
2867
+ const agentDefault = "openclaw";
2865
2868
  // Read all transcripts in parallel (first message + mtime)
2866
2869
  const entryList = Object.entries(entries).filter(([, e]) => e.sessionId);
2867
2870
  const transcriptResults = await Promise.all(
@@ -2885,7 +2888,7 @@ async function fetchSessionListFromDisk(): Promise<AcpSessionInfo[]> {
2885
2888
  title: entry.displayName ?? entry.subject ?? entry.label ?? undefined,
2886
2889
  description: firstMsg ?? undefined,
2887
2890
  updatedAt: effectiveTs ? new Date(effectiveTs).toISOString() : undefined,
2888
- agent,
2891
+ agent: entry.acp?.agent ?? agentDefault,
2889
2892
  });
2890
2893
  }
2891
2894
  } catch {
@@ -133,6 +133,12 @@ export class ClusterRuntime {
133
133
  this.agentById.set(a.id, a);
134
134
  for (const t of a.tags) this.agentsByTag.set(t, a);
135
135
  }
136
+
137
+ // Wire up active task checker for route probing (prevents switching mid-task)
138
+ this.peerManager.setActiveTaskChecker((nodeId) => {
139
+ return this.handoffManager.hasPendingForNode(nodeId)
140
+ || this.modelProxy.hasPendingForNode(nodeId);
141
+ });
136
142
  }
137
143
 
138
144
  async start() {
@@ -144,8 +150,11 @@ export class ClusterRuntime {
144
150
  this.peerManager.on("peerConnected", (nodeId) => {
145
151
  this.logger.info(`[clawmatrix] Peer connected: ${nodeId}`);
146
152
  this.refreshDiscoveredModels();
147
- this.healthTracker.recordPeerOnline(nodeId, "direct");
153
+ // Init sync state BEFORE recording the event — recordPeerOnline triggers
154
+ // broadcastSync which must use the freshly initialized syncState.
155
+ // Reversing this order causes syncState corruption and infinite sync loops.
148
156
  this.healthTracker.initPeerSync(nodeId);
157
+ this.healthTracker.recordPeerOnline(nodeId, "direct");
149
158
  });
150
159
 
151
160
  this.peerManager.on("peerDisconnected", (nodeId) => {
package/src/config.ts CHANGED
@@ -51,7 +51,8 @@ const ModelInfoSchema = z.object({
51
51
 
52
52
  const PeerConfigSchema = z.object({
53
53
  nodeId: z.string(),
54
- url: z.string(),
54
+ /** Single URL or array of URLs for multi-channel connections. */
55
+ url: z.union([z.string(), z.array(z.string()).min(1)]),
55
56
  });
56
57
 
57
58
  const ToolProxyConfigSchema = z.object({
package/src/handoff.ts CHANGED
@@ -678,6 +678,14 @@ export class HandoffManager {
678
678
  }
679
679
 
680
680
  /** Clean up on shutdown. */
681
+ /** Check if there are pending outbound handoffs targeting a specific node. */
682
+ hasPendingForNode(nodeId: string): boolean {
683
+ for (const p of this.pending.values()) {
684
+ if (p.targetNodeId === nodeId) return true;
685
+ }
686
+ return false;
687
+ }
688
+
681
689
  destroy() {
682
690
  if (this.staleCleanupTimer) {
683
691
  clearInterval(this.staleCleanupTimer);
@@ -79,6 +79,12 @@ export class HealthTracker {
79
79
  private compactTimer: ReturnType<typeof setInterval> | null = null;
80
80
  private saveTimer: ReturnType<typeof setTimeout> | null = null;
81
81
  private dirty = false;
82
+ /** Debounce timer for broadcastSync (prevents rapid-fire broadcasts). */
83
+ private broadcastTimer: ReturnType<typeof setTimeout> | null = null;
84
+ /** Round counter per peer to detect non-converging sync loops. */
85
+ private syncRounds = new Map<string, number>();
86
+ private static readonly MAX_SYNC_ROUNDS = 10;
87
+ private static readonly BROADCAST_DEBOUNCE = 500; // ms
82
88
 
83
89
  constructor(opts: HealthTrackerOptions) {
84
90
  this.nodeId = opts.nodeId;
@@ -119,6 +125,10 @@ export class HealthTracker {
119
125
  clearInterval(this.compactTimer);
120
126
  this.compactTimer = null;
121
127
  }
128
+ if (this.broadcastTimer) {
129
+ clearTimeout(this.broadcastTimer);
130
+ this.broadcastTimer = null;
131
+ }
122
132
  if (this.saveTimer) {
123
133
  clearTimeout(this.saveTimer);
124
134
  this.saveTimer = null;
@@ -165,6 +175,16 @@ export class HealthTracker {
165
175
  const message = new Uint8Array(Buffer.from(frame.payload.data, "base64"));
166
176
  const syncKey = peerId;
167
177
 
178
+ // Guard against non-converging sync loops
179
+ const rounds = (this.syncRounds.get(peerId) ?? 0) + 1;
180
+ if (rounds > HealthTracker.MAX_SYNC_ROUNDS) {
181
+ debug(TAG, `sync with ${peerId} exceeded ${HealthTracker.MAX_SYNC_ROUNDS} rounds, resetting`);
182
+ this.syncStates.set(syncKey, Automerge.initSyncState());
183
+ this.syncRounds.delete(peerId);
184
+ return;
185
+ }
186
+ this.syncRounds.set(peerId, rounds);
187
+
168
188
  try {
169
189
  const syncState = this.syncStates.get(syncKey) ?? Automerge.initSyncState();
170
190
  const [newDoc, newSyncState] = Automerge.receiveSyncMessage(this.doc, syncState, message);
@@ -172,18 +192,19 @@ export class HealthTracker {
172
192
  this.syncStates.set(syncKey, newSyncState);
173
193
  this.scheduleSave();
174
194
 
175
- // Send our response
195
+ // Send our response (only if there's something to send)
176
196
  this.sendSyncMessage(peerId);
177
197
  } catch (err) {
178
198
  debug(TAG, `error handling sync from ${peerId}: ${err}`);
179
199
  }
180
200
  }
181
201
 
182
- /** Initiate sync with a peer (called on peer connect). */
202
+ /** Initialize sync state for a peer (called on peer connect).
203
+ * Does NOT send a message — the subsequent recordPeerOnline → broadcastSync handles that.
204
+ * Sending here would race with broadcastSync and corrupt the sync state. */
183
205
  initPeerSync(peerId: string) {
184
206
  if (peerId === this.nodeId) return;
185
207
  this.syncStates.set(peerId, Automerge.initSyncState());
186
- this.sendSyncMessage(peerId);
187
208
  }
188
209
 
189
210
  /** Clean up sync state for a disconnected peer. */
@@ -196,7 +217,11 @@ export class HealthTracker {
196
217
  const [newSyncState, message] = Automerge.generateSyncMessage(this.doc, syncState);
197
218
  this.syncStates.set(peerId, newSyncState);
198
219
 
199
- if (!message) return;
220
+ if (!message) {
221
+ // Sync converged — reset round counter
222
+ this.syncRounds.delete(peerId);
223
+ return;
224
+ }
200
225
 
201
226
  debug(TAG, `sending health sync to ${peerId} (${message.byteLength} bytes)`);
202
227
 
@@ -214,10 +239,17 @@ export class HealthTracker {
214
239
  }
215
240
 
216
241
  private broadcastSync() {
217
- const peers = this.peerManager.router.getAllPeers();
218
- for (const peer of peers) {
219
- this.sendSyncMessage(peer.nodeId);
220
- }
242
+ // Debounce: multiple events in quick succession → single broadcast
243
+ if (this.broadcastTimer) return;
244
+ this.broadcastTimer = setTimeout(() => {
245
+ this.broadcastTimer = null;
246
+ // Reset round counters — new broadcast starts fresh sync cycle
247
+ this.syncRounds.clear();
248
+ const peers = this.peerManager.router.getAllPeers();
249
+ for (const peer of peers) {
250
+ this.sendSyncMessage(peer.nodeId);
251
+ }
252
+ }, HealthTracker.BROADCAST_DEBOUNCE);
221
253
  }
222
254
 
223
255
  // ── Timeline aggregation ──────────────────────────────────
@@ -43,6 +43,7 @@ interface PendingModelReq {
43
43
  stream: boolean;
44
44
  responseFormat: ResponseFormat;
45
45
  model?: string;
46
+ targetNodeId?: string;
46
47
  controller?: ReadableStreamDefaultController;
47
48
  encoder?: TextEncoder;
48
49
  /** Whether real content (not just setup events) has been sent to the stream. */
@@ -356,6 +357,14 @@ export class ModelProxy {
356
357
  this.httpServer.listen(this.config.proxyPort, "127.0.0.1");
357
358
  }
358
359
 
360
+ /** Check if there are pending model requests targeting a specific node. */
361
+ hasPendingForNode(nodeId: string): boolean {
362
+ for (const p of this.pending.values()) {
363
+ if (p.targetNodeId === nodeId) return true;
364
+ }
365
+ return false;
366
+ }
367
+
359
368
  stop() {
360
369
  if (this.cacheCleanupTimer) {
361
370
  clearInterval(this.cacheCleanupTimer);
@@ -653,6 +662,7 @@ export class ModelProxy {
653
662
  this.pending.set(requestId, {
654
663
  resolve: () => {}, reject: () => {},
655
664
  timer, stream: true, responseFormat, model,
665
+ targetNodeId,
656
666
  controller, encoder,
657
667
  hasContent: false,
658
668
  failoverCandidates,
@@ -827,6 +837,7 @@ export class ModelProxy {
827
837
  this.pending.set(requestId, {
828
838
  resolve: resolve as (v: unknown) => void,
829
839
  reject, timer, stream: false, responseFormat,
840
+ targetNodeId,
830
841
  });
831
842
 
832
843
  const sent = this.peerManager.sendTo(targetNodeId, frame);
@@ -91,8 +91,19 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
91
91
  private localCapabilities: NodeCapabilities;
92
92
  private httpServer: Server | null = null;
93
93
  private wss: WebSocketServer | null = null;
94
+ /** Reconnect timers keyed by `nodeId|url` for per-channel reconnection. */
94
95
  private reconnectTimers = new Map<string, ReturnType<typeof setTimeout>>();
95
96
  private reconnectAttempts = new Map<string, number>();
97
+ /** Track which nodeIds have already completed the full peer join (for multi-channel). */
98
+ private joinedPeers = new Set<string>();
99
+ /** All configured URLs per peer (for multi-URL peers). */
100
+ private peerUrls = new Map<string, string[]>();
101
+ /** Currently active URL per peer. */
102
+ private activeUrls = new Map<string, string>();
103
+ /** Last probe latency per URL (ms). */
104
+ private urlProbeLatencies = new Map<string, number>();
105
+ /** Route probe interval timer. */
106
+ private probeTimer: ReturnType<typeof setInterval> | null = null;
96
107
  /** Deferred disconnect timers — grace period before broadcasting peer_leave. */
97
108
  private disconnectGraceTimers = new Map<string, ReturnType<typeof setTimeout>>();
98
109
  private stopped = false;
@@ -191,10 +202,16 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
191
202
  for (const peer of this.config.peers) {
192
203
  this.connectToPeer(peer);
193
204
  }
205
+ // Start route probing for peers with multiple URLs
206
+ this.startRouteProbing();
194
207
  }
195
208
 
196
209
  async stop() {
197
210
  this.stopped = true;
211
+ if (this.probeTimer) {
212
+ clearInterval(this.probeTimer);
213
+ this.probeTimer = null;
214
+ }
198
215
  if (this.gossipDebounceTimer) {
199
216
  clearTimeout(this.gossipDebounceTimer);
200
217
  this.gossipDebounceTimer = null;
@@ -226,11 +243,13 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
226
243
  this.rateLimiter.destroy();
227
244
  this.approvalManager.destroy();
228
245
  this.router.destroy();
246
+ this.joinedPeers.clear();
229
247
  }
230
248
 
231
249
  /** Force-stop without broadcasting or waiting — used when graceful stop times out. */
232
250
  forceStop() {
233
251
  this.stopped = true;
252
+ if (this.probeTimer) { clearInterval(this.probeTimer); this.probeTimer = null; }
234
253
  for (const timer of this.reconnectTimers.values()) clearTimeout(timer);
235
254
  this.reconnectTimers.clear();
236
255
  for (const [, timer] of this.disconnectGraceTimers) clearTimeout(timer);
@@ -246,6 +265,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
246
265
  this.rateLimiter.destroy();
247
266
  this.approvalManager.destroy();
248
267
  this.router.destroy();
268
+ this.joinedPeers.clear();
249
269
  }
250
270
 
251
271
  private closeServers() {
@@ -264,6 +284,143 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
264
284
  }
265
285
  }
266
286
 
287
+ // ── Route probing (for multi-URL peers) ──────────────────────────
288
+ private static readonly PROBE_INTERVAL = 3_600_000; // 1 hour
289
+ /** Minimum improvement ratio to trigger a route switch. */
290
+ private static readonly SWITCH_THRESHOLD = 0.7; // new must be ≤70% of current
291
+
292
+ private startRouteProbing() {
293
+ // Only probe if any peer has multiple URLs
294
+ const hasMultiUrl = [...this.peerUrls.values()].some((urls) => urls.length > 1);
295
+ if (!hasMultiUrl) return;
296
+
297
+ this.probeTimer = setInterval(() => this.probeAllRoutes(), PeerManager.PROBE_INTERVAL);
298
+ }
299
+
300
+ private async probeAllRoutes() {
301
+ for (const [nodeId, urls] of this.peerUrls) {
302
+ if (urls.length <= 1) continue;
303
+ const activeUrl = this.activeUrls.get(nodeId);
304
+ for (const url of urls) {
305
+ if (url === activeUrl) continue;
306
+ // Probe non-active URLs
307
+ const latency = await this.probeUrl(url);
308
+ if (latency !== null) {
309
+ this.urlProbeLatencies.set(url, latency);
310
+ }
311
+ }
312
+ // Also record active connection's real latency
313
+ if (activeUrl) {
314
+ const route = this.router.getRoute(nodeId);
315
+ if (route && route.latencyMs > 0) {
316
+ this.urlProbeLatencies.set(activeUrl, route.latencyMs);
317
+ }
318
+ }
319
+ // Evaluate if we should switch
320
+ this.evaluateRouteSwitch(nodeId);
321
+ }
322
+ }
323
+
324
+ /**
325
+ * Probe a URL by measuring HTTP response time (WS server also serves HTTP).
326
+ * Returns latency in ms, or null if unreachable.
327
+ */
328
+ private async probeUrl(wsUrl: string): Promise<number | null> {
329
+ try {
330
+ const httpUrl = wsUrl.replace(/^ws(s?):\/\//, "http$1://");
331
+ const start = Date.now();
332
+ const controller = new AbortController();
333
+ const timeout = setTimeout(() => controller.abort(), 5_000);
334
+ try {
335
+ const res = await fetch(httpUrl, {
336
+ method: "HEAD",
337
+ signal: controller.signal,
338
+ // @ts-ignore — Node.js 18+ supports this
339
+ keepalive: false,
340
+ });
341
+ clearTimeout(timeout);
342
+ if (res.ok || res.status === 200) {
343
+ return Date.now() - start;
344
+ }
345
+ return null;
346
+ } catch {
347
+ clearTimeout(timeout);
348
+ return null;
349
+ }
350
+ } catch {
351
+ return null;
352
+ }
353
+ }
354
+
355
+ /** Check if we should switch to a better URL for a peer. */
356
+ private evaluateRouteSwitch(nodeId: string) {
357
+ const urls = this.peerUrls.get(nodeId);
358
+ const activeUrl = this.activeUrls.get(nodeId);
359
+ if (!urls || !activeUrl || urls.length <= 1) return;
360
+
361
+ const currentLatency = this.urlProbeLatencies.get(activeUrl);
362
+ if (!currentLatency || currentLatency <= 0) return;
363
+
364
+ // Find best alternative
365
+ let bestUrl: string | undefined;
366
+ let bestLatency = Infinity;
367
+ for (const url of urls) {
368
+ if (url === activeUrl) continue;
369
+ const lat = this.urlProbeLatencies.get(url);
370
+ if (lat !== undefined && lat < bestLatency) {
371
+ bestLatency = lat;
372
+ bestUrl = url;
373
+ }
374
+ }
375
+
376
+ if (!bestUrl || bestLatency >= currentLatency * PeerManager.SWITCH_THRESHOLD) return;
377
+
378
+ // Check if there are active tasks — don't switch mid-task
379
+ if (this.hasActiveTasks(nodeId)) {
380
+ debug("probe", `${nodeId}: better route found (${activeUrl} ${currentLatency}ms → ${bestUrl} ${bestLatency}ms) but has active tasks, deferring`);
381
+ return;
382
+ }
383
+
384
+ debug("probe", `${nodeId}: switching route ${activeUrl} ${currentLatency}ms → ${bestUrl} ${bestLatency}ms`);
385
+ this.switchRoute(nodeId, bestUrl);
386
+ }
387
+
388
+ /** Callback to check if there are active tasks involving a peer. Set by ClusterRuntime. */
389
+ private activeTaskChecker: ((nodeId: string) => boolean) | null = null;
390
+
391
+ /** Register a callback to check for active tasks (used to prevent route switches mid-task). */
392
+ setActiveTaskChecker(checker: (nodeId: string) => boolean) {
393
+ this.activeTaskChecker = checker;
394
+ }
395
+
396
+ /** Check if there are active tasks involving a peer (handoffs, model requests, etc.). */
397
+ private hasActiveTasks(nodeId: string): boolean {
398
+ return this.activeTaskChecker?.(nodeId) ?? false;
399
+ }
400
+
401
+ /** Switch the active route for a peer to a new URL. */
402
+ private switchRoute(nodeId: string, newUrl: string) {
403
+ this.activeUrls.set(nodeId, newUrl);
404
+ // Connect to the new URL — the new connection will authenticate and join
405
+ // as an additional channel briefly, then we close the old one.
406
+ const oldRoute = this.router.getRoute(nodeId);
407
+ const oldConn = oldRoute?.connection;
408
+
409
+ this.connectToChannel(nodeId, newUrl);
410
+
411
+ // Close the old connection after a short delay (give new connection time to establish)
412
+ if (oldConn?.isOpen) {
413
+ setTimeout(() => {
414
+ // Only close if a new connection has taken over
415
+ const currentRoute = this.router.getRoute(nodeId);
416
+ if (currentRoute?.connection && currentRoute.connection !== oldConn) {
417
+ debug("probe", `${nodeId}: closing old channel after route switch`);
418
+ oldConn.close(1000, "route switch");
419
+ }
420
+ }, 5_000);
421
+ }
422
+ }
423
+
267
424
  /** Set an HTTP request handler for non-WebSocket requests (e.g. web dashboard). */
268
425
  private httpRequestHandler: ((req: IncomingMessage, res: ServerResponse) => boolean) | null = null;
269
426
 
@@ -380,26 +537,52 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
380
537
 
381
538
  // ── Outbound connections (standard WebSocket) ──────────────────
382
539
  private connectToPeer(peer: PeerConfig) {
540
+ const urls = Array.isArray(peer.url) ? peer.url : [peer.url];
541
+ this.peerUrls.set(peer.nodeId, urls);
542
+ // Connect to the first URL (or best known from probes)
543
+ const bestUrl = this.pickBestUrl(peer.nodeId, urls);
544
+ this.activeUrls.set(peer.nodeId, bestUrl);
545
+ this.connectToChannel(peer.nodeId, bestUrl);
546
+ }
547
+
548
+ /** Pick the best URL for a peer based on probe latencies. Falls back to first URL. */
549
+ private pickBestUrl(nodeId: string, urls: string[]): string {
550
+ if (urls.length <= 1) return urls[0];
551
+ let bestUrl = urls[0];
552
+ let bestLatency = Infinity;
553
+ for (const url of urls) {
554
+ const lat = this.urlProbeLatencies.get(url);
555
+ if (lat !== undefined && lat < bestLatency) {
556
+ bestLatency = lat;
557
+ bestUrl = url;
558
+ }
559
+ }
560
+ return bestUrl;
561
+ }
562
+
563
+ /** Connect a single channel (URL) for a peer. */
564
+ private connectToChannel(nodeId: string, url: string) {
383
565
  if (this.stopped) {
384
- debug("peer", `connectToPeer(${peer.nodeId}): skipped (stopped)`);
566
+ debug("peer", `connectToChannel(${nodeId}): skipped (stopped)`);
385
567
  return;
386
568
  }
387
569
 
388
- const attempt = this.reconnectAttempts.get(peer.nodeId) ?? 0;
389
- debug("peer", `connectToPeer(${peer.nodeId}): attempt=${attempt} url=${peer.url}`);
570
+ const channelKey = `${nodeId}|${url}`;
571
+ const attempt = this.reconnectAttempts.get(channelKey) ?? 0;
572
+ debug("peer", `connectToChannel(${nodeId}): attempt=${attempt} url=${url}`);
390
573
 
391
574
  // Use a common WS subprotocol for traffic disguise
392
575
  let ws: WebSocket;
393
576
  try {
394
- ws = new WebSocket(peer.url, ["graphql-transport-ws"]);
577
+ ws = new WebSocket(url, ["graphql-transport-ws"]);
395
578
  } catch (err) {
396
- debug("peer", `connectToPeer(${peer.nodeId}): WebSocket constructor threw: ${err}`);
397
- this.scheduleReconnect(peer);
579
+ debug("peer", `connectToChannel(${nodeId}): WebSocket constructor threw: ${err}`);
580
+ this.scheduleChannelReconnect(nodeId, url);
398
581
  return;
399
582
  }
400
583
 
401
584
  ws.addEventListener("open", () => {
402
- debug("peer", `connectToPeer(${peer.nodeId}): ws open`);
585
+ debug("peer", `connectToChannel(${nodeId}): ws open url=${url}`);
403
586
  const conn = new Connection(
404
587
  ws,
405
588
  "outbound",
@@ -411,8 +594,8 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
411
594
  conn.bindWebSocket(ws);
412
595
 
413
596
  conn.on("authenticated", (caps) => {
414
- debug("peer", `connectToPeer(${peer.nodeId}): authenticated`);
415
- this.reconnectAttempts.delete(peer.nodeId);
597
+ debug("peer", `connectToChannel(${nodeId}): authenticated url=${url}`);
598
+ this.reconnectAttempts.delete(channelKey);
416
599
  this.onPeerAuthenticated(conn, caps);
417
600
  });
418
601
 
@@ -426,7 +609,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
426
609
  const tryReconnect = () => {
427
610
  if (!reconnectScheduled) {
428
611
  reconnectScheduled = true;
429
- this.scheduleReconnect(peer, lastError);
612
+ this.scheduleChannelReconnect(nodeId, url, lastError);
430
613
  }
431
614
  };
432
615
 
@@ -438,7 +621,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
438
621
  // Don't reconnect if this was a self-connection (peer URL points to ourselves).
439
622
  // Without this guard, outbound detects self → closes → scheduleReconnect → loop.
440
623
  if (ev.code === 4002 && ev.reason === "self-connection") {
441
- debug("peer", `connectToPeer(${peer.nodeId}): self-connection, will not reconnect`);
624
+ debug("peer", `connectToChannel(${nodeId}): self-connection, will not reconnect`);
442
625
  return;
443
626
  }
444
627
  if (!lastError) {
@@ -448,24 +631,65 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
448
631
  });
449
632
  }
450
633
 
451
- private scheduleReconnect(peer: PeerConfig, reason?: string) {
634
+ private scheduleChannelReconnect(nodeId: string, url: string, reason?: string) {
452
635
  if (this.stopped) {
453
- debug("peer", `scheduleReconnect(${peer.nodeId}): skipped (stopped)`);
636
+ debug("peer", `scheduleChannelReconnect(${nodeId}): skipped (stopped)`);
454
637
  return;
455
638
  }
456
- if (this.reconnectTimers.has(peer.nodeId)) return;
639
+ const channelKey = `${nodeId}|${url}`;
640
+ if (this.reconnectTimers.has(channelKey)) return;
641
+
642
+ const attempt = this.reconnectAttempts.get(channelKey) ?? 0;
643
+
644
+ // On first failure, try an alternative URL immediately (failover)
645
+ if (attempt === 0) {
646
+ const urls = this.peerUrls.get(nodeId);
647
+ if (urls && urls.length > 1) {
648
+ const altUrl = this.pickNextUrl(nodeId, url, urls);
649
+ if (altUrl && altUrl !== url) {
650
+ debug("peer", `scheduleChannelReconnect(${nodeId}): failover ${url} → ${altUrl}`);
651
+ this.activeUrls.set(nodeId, altUrl);
652
+ this.reconnectAttempts.set(channelKey, attempt + 1);
653
+ // Connect to alternative immediately, schedule original for later
654
+ this.connectToChannel(nodeId, altUrl);
655
+ const timer = setTimeout(() => {
656
+ this.reconnectTimers.delete(channelKey);
657
+ // Only reconnect original URL if not already connected
658
+ if (!this.joinedPeers.has(nodeId)) {
659
+ this.connectToChannel(nodeId, url);
660
+ }
661
+ }, RECONNECT_MAX);
662
+ this.reconnectTimers.set(channelKey, timer);
663
+ return;
664
+ }
665
+ }
666
+ }
457
667
 
458
- const attempt = this.reconnectAttempts.get(peer.nodeId) ?? 0;
459
668
  const delay = Math.min(RECONNECT_BASE * 2 ** attempt, RECONNECT_MAX);
460
- this.reconnectAttempts.set(peer.nodeId, attempt + 1);
669
+ this.reconnectAttempts.set(channelKey, attempt + 1);
461
670
  const tag = reason ? ` reason="${reason}"` : "";
462
- debug("peer", `scheduleReconnect(${peer.nodeId}): attempt=${attempt} delay=${delay}ms${tag}`);
671
+ debug("peer", `scheduleChannelReconnect(${nodeId}): attempt=${attempt} delay=${delay}ms url=${url}${tag}`);
463
672
 
464
673
  const timer = setTimeout(() => {
465
- this.reconnectTimers.delete(peer.nodeId);
466
- this.connectToPeer(peer);
674
+ this.reconnectTimers.delete(channelKey);
675
+ this.connectToChannel(nodeId, url);
467
676
  }, delay);
468
- this.reconnectTimers.set(peer.nodeId, timer);
677
+ this.reconnectTimers.set(channelKey, timer);
678
+ }
679
+
680
+ /** Pick the next best URL to try, excluding the current one. */
681
+ private pickNextUrl(nodeId: string, currentUrl: string, urls: string[]): string | undefined {
682
+ let bestUrl: string | undefined;
683
+ let bestLatency = Infinity;
684
+ for (const url of urls) {
685
+ if (url === currentUrl) continue;
686
+ const lat = this.urlProbeLatencies.get(url) ?? 10_000;
687
+ if (lat < bestLatency) {
688
+ bestLatency = lat;
689
+ bestUrl = url;
690
+ }
691
+ }
692
+ return bestUrl ?? urls.find((u) => u !== currentUrl);
469
693
  }
470
694
 
471
695
  // ── Peer lifecycle ─────────────────────────────────────────────
@@ -565,6 +789,23 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
565
789
  // Cancel disconnect grace timer if the peer is reconnecting
566
790
  const wasInGrace = this.cancelDisconnectGrace(nodeId);
567
791
 
792
+ // Check if this peer already has an active connection (additional channel)
793
+ const isAdditionalChannel = this.joinedPeers.has(nodeId);
794
+
795
+ if (isAdditionalChannel) {
796
+ // Additional channel — just add to the channel pool, no peer_join broadcast
797
+ this.router.addChannel(nodeId, conn);
798
+ conn.on("message", (frame) => this.onFrame(frame, conn));
799
+ conn.on("latency", () => this.router.updateActiveChannel(nodeId));
800
+ conn.on("close", () => this.onChannelDisconnected(conn));
801
+ const channelCount = this.router.getChannelCount(nodeId);
802
+ debug("peer", `completePeerJoin(${nodeId}): additional channel added (total=${channelCount})`);
803
+ audit("channel_add", { nodeId, detail: `channels=${channelCount}` });
804
+ return;
805
+ }
806
+
807
+ // First channel — full join flow
808
+
568
809
  // If there's an existing connection for this nodeId (e.g. peer reconnected
569
810
  // while old TCP hadn't closed yet), close it AFTER overwriting the route so
570
811
  // the stale-close guard in onPeerDisconnected correctly skips cleanup.
@@ -580,9 +821,11 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
580
821
  oldConn.close(1000, "replaced by new connection");
581
822
  }
582
823
 
824
+ this.joinedPeers.add(nodeId);
825
+
583
826
  conn.on("message", (frame) => this.onFrame(frame, conn));
584
- conn.on("latency", (ms) => this.router.updateLatency(nodeId, ms));
585
- conn.on("close", () => this.onPeerDisconnected(conn));
827
+ conn.on("latency", () => this.router.updateActiveChannel(nodeId));
828
+ conn.on("close", () => this.onChannelDisconnected(conn));
586
829
 
587
830
  this.sendPeerSync(conn);
588
831
 
@@ -615,6 +858,25 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
615
858
  this.emit("peerConnected", nodeId);
616
859
  }
617
860
 
861
+ /** Handle a single channel disconnecting (multi-channel aware). */
862
+ private onChannelDisconnected(conn: Connection) {
863
+ const nodeId = conn.remoteNodeId;
864
+ if (!nodeId) return;
865
+
866
+ // Remove this channel from the pool
867
+ const hasRemaining = this.router.removeChannel(nodeId, conn);
868
+ if (hasRemaining) {
869
+ // Other channels still alive — just log, no peer_leave
870
+ const channelCount = this.router.getChannelCount(nodeId);
871
+ debug("peer", `onChannelDisconnected(${nodeId}): channel lost, ${channelCount} remaining`);
872
+ audit("channel_remove", { nodeId, detail: `channels=${channelCount}` });
873
+ return;
874
+ }
875
+
876
+ // Last channel gone — proceed with peer disconnect logic
877
+ this.onPeerDisconnected(conn);
878
+ }
879
+
618
880
  private onPeerDisconnected(conn: Connection) {
619
881
  const nodeId = conn.remoteNodeId;
620
882
  if (!nodeId) return;
@@ -632,6 +894,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
632
894
  // Same-nodeId 本地客户端断开:仅清理路由,不广播 peer_leave
633
895
  if (nodeId === this.config.nodeId) {
634
896
  this.router.removePeer(nodeId);
897
+ this.joinedPeers.delete(nodeId);
635
898
  return;
636
899
  }
637
900
 
@@ -680,6 +943,7 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
680
943
 
681
944
  audit("peer_leave", { nodeId });
682
945
  this.router.removePeer(nodeId);
946
+ this.joinedPeers.delete(nodeId);
683
947
 
684
948
  // Remove satellite contexts that were only reachable via this peer
685
949
  for (let i = this.satelliteContexts.length - 1; i >= 0; i--) {
package/src/router.ts CHANGED
@@ -30,7 +30,9 @@ export class Router {
30
30
  private localToolProxy?: ToolProxyInfo;
31
31
  private localAcpAgents?: AcpAgentInfo[];
32
32
  private routes = new Map<string, RouteEntry>();
33
- private connections = new Map<string, Connection>(); // nodeId → direct connection
33
+ private connections = new Map<string, Connection>(); // nodeId → active (best) direct connection
34
+ /** All live channels per nodeId (multi-channel support). */
35
+ private channels = new Map<string, Set<Connection>>();
34
36
  /** Double-map dedup: current window + previous window. Rotated periodically. */
35
37
  private seenCurrent = new Map<string, true>();
36
38
  private seenPrevious = new Map<string, true>();
@@ -128,6 +130,7 @@ export class Router {
128
130
  this.seenCurrent.clear();
129
131
  this.seenPrevious.clear();
130
132
  this.failedRequests.clear();
133
+ this.channels.clear();
131
134
  }
132
135
 
133
136
  // ── Route table management ─────────────────────────────────────
@@ -139,6 +142,10 @@ export class Router {
139
142
  const old = this.routes.get(nodeId);
140
143
  if (old) this.unindexEntry(old);
141
144
  this.connections.set(nodeId, connection);
145
+ // Add to channel set
146
+ let channelSet = this.channels.get(nodeId);
147
+ if (!channelSet) { channelSet = new Set(); this.channels.set(nodeId, channelSet); }
148
+ channelSet.add(connection);
142
149
  const entry: RouteEntry = {
143
150
  nodeId,
144
151
  agents: capabilities.agents,
@@ -157,6 +164,74 @@ export class Router {
157
164
  this.indexEntry(entry);
158
165
  }
159
166
 
167
+ /** Add an additional channel to an existing peer (multi-channel): registers `connection` in the peer's channel set, then re-selects the active channel. */
168
+ addChannel(nodeId: string, connection: Connection) {
169
+ let channelSet = this.channels.get(nodeId);
170
+ if (!channelSet) { channelSet = new Set(); this.channels.set(nodeId, channelSet); } // first channel seen for this nodeId: create its set
171
+ channelSet.add(connection);
172
+ this.updateActiveChannel(nodeId); // the newly added channel may now be the best open one
173
+ }
174
+
175
+ /** Remove a single channel. Returns true if at least one channel is still registered for the peer (open state is not checked here); false if none remain or the peer had no channel set. */
176
+ removeChannel(nodeId: string, connection: Connection): boolean {
177
+ const channelSet = this.channels.get(nodeId);
178
+ if (channelSet) {
179
+ channelSet.delete(connection);
180
+ if (channelSet.size === 0) {
181
+ this.channels.delete(nodeId); // drop the empty set; `connections` and `routes` are not touched here
182
+ return false; // NOTE(review): presumably the caller follows up with removePeer — verify against caller
183
+ }
184
+ // Other channels remain: re-select the active one, since the removed channel may have been it
185
+ this.updateActiveChannel(nodeId);
186
+ return true;
187
+ }
188
+ return false; // no channel set recorded for this nodeId
189
+ }
190
+
191
+ /** Re-evaluate the active (best) channel for a peer based on latency: the open channel with the lowest effective latency becomes the entry in `connections` and, if a route exists, the route's connection. No-op when the peer has no channels or none is open. */
192
+ updateActiveChannel(nodeId: string) {
193
+ const channelSet = this.channels.get(nodeId);
194
+ if (!channelSet || channelSet.size === 0) return;
195
+
196
+ let best: Connection | null = null;
197
+ let bestLatency = Infinity;
198
+ for (const conn of channelSet) {
199
+ if (!conn.isOpen) continue; // closed channels are never selected
200
+ // Prefer lower latency; any non-positive value (0 = unmeasured) is ranked as 10_000 ms so measured channels win
201
+ const lat = conn.latencyMs > 0 ? conn.latencyMs : 10_000;
202
+ if (lat < bestLatency) { // strict comparison: on a tie the channel iterated first is kept
203
+ bestLatency = lat;
204
+ best = conn;
205
+ }
206
+ }
207
+
208
+ if (best) { // when no channel is open, the previous active connection is left unchanged
209
+ this.connections.set(nodeId, best);
210
+ const route = this.routes.get(nodeId);
211
+ if (route) {
212
+ route.connection = best;
213
+ route.latencyMs = best.latencyMs; // raw value is stored, not the 10_000 substitute used for ranking
214
+ }
215
+ }
216
+ }
217
+
218
+ /** Get the number of live channels for a peer: counts only channels whose `isOpen` is true; returns 0 when the peer has no channel set. */
219
+ getChannelCount(nodeId: string): number {
220
+ const channelSet = this.channels.get(nodeId);
221
+ if (!channelSet) return 0;
222
+ let count = 0;
223
+ for (const conn of channelSet) {
224
+ if (conn.isOpen) count++; // registered-but-closed channels are excluded from the count
225
+ }
226
+ return count;
227
+ }
228
+
229
+ /** Get all channels for a peer (for diagnostics/status). */
230
+ getChannels(nodeId: string): Connection[] {
231
+ const channelSet = this.channels.get(nodeId);
232
+ return channelSet ? [...channelSet] : [];
233
+ }
234
+
160
235
  addRelayPeer(peer: PeerInfo, viaNodeId: string) {
161
236
  // Don't add ourselves
162
237
  if (peer.nodeId === this.nodeId) return;
@@ -192,6 +267,7 @@ export class Router {
192
267
 
193
268
  removePeer(nodeId: string) {
194
269
  this.connections.delete(nodeId);
270
+ this.channels.delete(nodeId);
195
271
  const removed = this.routes.get(nodeId);
196
272
  if (removed) {
197
273
  this.unindexEntry(removed);
@@ -346,11 +422,27 @@ export class Router {
346
422
  const route = this.routes.get(targetNodeId);
347
423
  if (!route) return false;
348
424
 
425
+ // Try active connection first
349
426
  if (route.connection?.isOpen) {
350
427
  route.connection.send(frame);
351
428
  return true;
352
429
  }
353
430
 
431
+ // Fallback: try other channels (multi-channel failover)
432
+ const channelSet = this.channels.get(targetNodeId);
433
+ if (channelSet) {
434
+ for (const conn of channelSet) {
435
+ if (conn.isOpen) {
436
+ conn.send(frame);
437
+ // Promote to active
438
+ this.connections.set(targetNodeId, conn);
439
+ route.connection = conn;
440
+ route.latencyMs = conn.latencyMs;
441
+ return true;
442
+ }
443
+ }
444
+ }
445
+
354
446
  // Relay through intermediate node
355
447
  if (route.reachableVia) {
356
448
  const relay = this.connections.get(route.reachableVia);
@@ -41,6 +41,7 @@ export function createClusterPeersTool(): AnyAgentTool {
41
41
  const sentinelOnline = sentinelStatus === "direct" || sentinelStatus === "relay";
42
42
  const effectiveStatus = status === "unreachable" && sentinelOnline ? "sentinel-only" : status;
43
43
 
44
+ const channelCount = runtime.peerManager.router.getChannelCount(entry.nodeId);
44
45
  return {
45
46
  nodeId: entry.nodeId,
46
47
  agents: entry.agents.map((a) => ({
@@ -53,6 +54,7 @@ export function createClusterPeersTool(): AnyAgentTool {
53
54
  tools: entry.toolProxy?.enabled ? (entry.toolProxy.allow ?? []) : [],
54
55
  status: effectiveStatus,
55
56
  latencyMs: entry.latencyMs,
57
+ ...(channelCount > 1 ? { channels: channelCount } : {}),
56
58
  ...(hasSentinel ? {
57
59
  sentinel: sentinelOnline ? "online" : "offline",
58
60
  } : {}),