clawmatrix 0.6.2 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clawmatrix",
3
- "version": "0.6.2",
3
+ "version": "0.6.3",
4
4
  "description": "Decentralized mesh cluster plugin for OpenClaw — inter-gateway communication, model proxy, task handoff, and tool proxy.",
5
5
  "type": "module",
6
6
  "license": "MIT",
package/src/acp-proxy.ts CHANGED
@@ -2626,6 +2626,42 @@ export class AcpProxy {
2626
2626
  return null;
2627
2627
  }
2628
2628
 
2629
/**
 * Fails a single in-flight request, looked up by the id of the frame that
 * carried it (used, for example, when a `relay_fail` arrives for that frame).
 *
 * Does nothing when no request is pending under `frameId`.
 *
 * @param frameId - Key of the request in the pending map.
 * @param reason - Message of the `Error` the request is rejected with.
 */
rejectPending(frameId: string, reason: string) {
  const waiting = this.pending.get(frameId);
  if (!waiting) return;

  // Cancel the entry's timer before dropping it from the map.
  clearTimeout(waiting.timer);
  this.pending.delete(frameId);
  waiting.reject(new Error(reason));
}
2638
+
2639
/**
 * Fail-fast every pending request that can no longer be delivered now that
 * `nodeId` has disconnected. Called by ClusterRuntime on peerDisconnected.
 *
 * A pending request is rejected when:
 *  - it targets `nodeId` itself, or
 *  - its target has no route at all, or
 *  - its target has no open direct connection and either no relay, or a
 *    relay entry that still names the disconnected node.
 *
 * Requests without a recorded target are skipped.
 *
 * @param nodeId - Id of the peer that just disconnected.
 */
onPeerDisconnected(nodeId: string) {
  // Deleting entries from a Map while iterating it with for...of is safe.
  for (const [id, entry] of this.pending) {
    const target = entry.targetNodeId;
    if (!target) continue;

    let failure: string | undefined;
    if (target === nodeId) {
      // Direct match: the request was addressed to the disconnected node.
      failure = `Node "${target}" disconnected while request was pending`;
    } else {
      const route = this.peerManager.router.getRoute(target);
      const directOpen = Boolean(route?.connection?.isOpen);
      // A relay entry that still points at the disconnected node is a dead
      // path even if the router has not pruned it yet.
      const relayUsable = Boolean(route?.reachableVia) && route?.reachableVia !== nodeId;
      if (!directOpen && !relayUsable) {
        failure = `Node "${target}" became unreachable (relay "${nodeId}" disconnected)`;
      }
    }

    if (failure === undefined) continue;
    clearTimeout(entry.timer);
    this.pending.delete(id);
    entry.reject(new Error(failure));
  }
}
2664
+
2629
2665
  private sendRequest(
2630
2666
  targetNodeId: string,
2631
2667
  agent: string,
@@ -175,6 +175,9 @@ export class ClusterRuntime {
175
175
  this.refreshDiscoveredModels();
176
176
  this.healthTracker.recordPeerOffline(nodeId);
177
177
  this.healthTracker.removePeerSync(nodeId);
178
+ // Fail-fast any pending requests targeting this node or relayed through it
179
+ this.acpProxy?.onPeerDisconnected(nodeId);
180
+ this.handoffManager?.onPeerDisconnected(nodeId);
178
181
  });
179
182
 
180
183
  this.peerManager.on("peerCapabilitiesChanged", () => {
@@ -275,7 +278,12 @@ export class ClusterRuntime {
275
278
  // Auto-detect ACP agents if ACP is enabled but no agents are explicitly configured
276
279
  // Check both ClawMatrix and OpenClaw configs (consistent with acpProxy creation above)
277
280
  if (this.acpProxy && (!this.config.acp?.agents || this.config.acp.agents.length === 0)) {
278
- AcpProxy.detectAvailableAgents(this.config.acp?.commands).then((detected) => {
281
+ const openclawAcp = (this.openclawConfig as Record<string, any>).acp;
282
+ const detectionCommands = {
283
+ ...(openclawAcp?.commands && typeof openclawAcp.commands === "object" ? openclawAcp.commands : {}),
284
+ ...(this.config.acp?.commands ?? {}),
285
+ };
286
+ AcpProxy.detectAvailableAgents(Object.keys(detectionCommands).length > 0 ? detectionCommands : undefined).then((detected) => {
279
287
  if (detected.length > 0) {
280
288
  this.logger.info(`[clawmatrix] Auto-detected ACP agents: ${detected.map((a) => a.id).join(", ")}`);
281
289
  this.peerManager.updateAcpAgents(detected);
@@ -891,6 +899,22 @@ export class ClusterRuntime {
891
899
  this.apiHandler.pushKanbanEvent(kf.payload);
892
900
  }
893
901
  break;
902
+ case "relay_fail": {
903
+ // A relay node could not forward our frame to its target.
904
+ // Clean up the stale relay route and reject pending requests.
905
+ const target = (frame.payload as any)?.target ?? "unknown";
906
+ debug("dispatch", `relay_fail id=${frame.id} target=${target} from=${frame.from}`);
907
+ // Remove the stale relay route so we don't keep sending frames into a dead path
908
+ if (typeof target === "string" && target !== "unknown") {
909
+ this.peerManager.router.removeRelayRoute(target);
910
+ }
911
+ if (frame.id) {
912
+ this.acpProxy?.rejectPending(frame.id, `Relay "${frame.from}" cannot reach "${target}"`);
913
+ this.handoffManager?.rejectPending(frame.id, `Relay "${frame.from}" cannot reach "${target}"`);
914
+ this.toolProxy.rejectPending(frame.id, `Relay "${frame.from}" cannot reach "${target}"`);
915
+ }
916
+ break;
917
+ }
894
918
  }
895
919
  }
896
920
 
package/src/handoff.ts CHANGED
@@ -744,6 +744,38 @@ export class HandoffManager {
744
744
  return false;
745
745
  }
746
746
 
747
/**
 * Rejects the pending handoff request registered under `frameId`, if any
 * (e.g. after a `relay_fail` was received for that frame).
 *
 * @param frameId - Id of the frame whose pending entry should be failed.
 * @param reason - Text used as the message of the rejection `Error`.
 */
rejectPending(frameId: string, reason: string) {
  const request = this.pending.get(frameId);
  if (request === undefined) {
    // Nothing is waiting on this frame id.
    return;
  }
  const failure = new Error(reason);
  clearTimeout(request.timer);
  this.pending.delete(frameId);
  request.reject(failure);
}
756
+
757
/**
 * Fail-fast pending handoff requests that can no longer be delivered after
 * `nodeId` disconnected. Called on peerDisconnected.
 *
 * A pending handoff is rejected when:
 *  - it targets `nodeId` itself, or
 *  - its target has no route at all, or
 *  - its target has no open direct connection and either no relay, or a
 *    relay entry that still names the disconnected node.
 *
 * @param nodeId - Id of the peer that just disconnected.
 */
onPeerDisconnected(nodeId: string) {
  // Deleting entries from a Map while iterating it with for...of is safe.
  for (const [id, entry] of this.pending) {
    const target = entry.targetNodeId;

    let failure: string | undefined;
    if (target === nodeId) {
      // Direct match: the handoff was addressed to the disconnected node.
      failure = `Node "${target}" disconnected while handoff was pending`;
    } else {
      const route = this.peerManager.router.getRoute(target);
      const directOpen = Boolean(route?.connection?.isOpen);
      // A relay entry that still points at the disconnected node is a dead
      // path even if the router has not pruned it yet.
      const relayUsable = Boolean(route?.reachableVia) && route?.reachableVia !== nodeId;
      if (!directOpen && !relayUsable) {
        failure = `Node "${target}" became unreachable (relay "${nodeId}" disconnected)`;
      }
    }

    if (failure === undefined) continue;
    clearTimeout(entry.timer);
    this.pending.delete(id);
    entry.reject(new Error(failure));
  }
}
778
+
747
779
  destroy() {
748
780
  if (this.staleCleanupTimer) {
749
781
  clearInterval(this.staleCleanupTimer);
@@ -50,6 +50,8 @@ const SKIP_DEDUP_EXPLICIT = new Set([
50
50
  // File transfer
51
51
  "file_transfer_chunk", "file_transfer_chunk_ack",
52
52
  "file_transfer_ack", "file_transfer_complete",
53
+ // Relay failure notification (shares id with the original request)
54
+ "relay_fail",
53
55
  ]);
54
56
 
55
57
  function skipDedup(type: string): boolean {
@@ -151,6 +153,8 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
151
153
  acpAgents = config.acp.agents;
152
154
  } else if (Array.isArray(ocAcp?.allowedAgents) && ocAcp.allowedAgents.length > 0) {
153
155
  acpAgents = ocAcp.allowedAgents.map((id: string) => ({ id, description: "" }));
156
+ } else if (ocAcp?.commands && typeof ocAcp.commands === "object") {
157
+ acpAgents = Object.keys(ocAcp.commands).map((id) => ({ id, description: "" }));
154
158
  }
155
159
  }
156
160
  this.localCapabilities = {
@@ -231,6 +235,8 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
231
235
  }
232
236
  // Start route probing for peers with multiple URLs
233
237
  this.startRouteProbing();
238
+ // Start periodic sweep of stale relay routes
239
+ this.router.startRelaySweep();
234
240
  }
235
241
 
236
242
  async stop() {
@@ -1251,6 +1257,22 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
1251
1257
  // No remote alternative — fall through to local handling
1252
1258
  // (model-proxy will handle locally or send error back)
1253
1259
  } else {
1260
+ // Relay failed — clean up the stale route so we don't keep trying it,
1261
+ // and notify the sender so it can fail-fast.
1262
+ if (frame.to) {
1263
+ this.router.removeRelayRoute(frame.to);
1264
+ }
1265
+ if (frame.from && frame.id) {
1266
+ debug("peer", `relay failed for ${frame.type} id=${frame.id} to=${frame.to}, sending relay_fail to ${frame.from}`);
1267
+ this.sendTo(frame.from, {
1268
+ type: "relay_fail",
1269
+ id: frame.id,
1270
+ from: this.config.nodeId,
1271
+ to: frame.from,
1272
+ timestamp: Date.now(),
1273
+ payload: { target: frame.to, reason: "unreachable" },
1274
+ } as AnyClusterFrame);
1275
+ }
1254
1276
  return;
1255
1277
  }
1256
1278
  }
@@ -1335,21 +1357,18 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
1335
1357
  if (peer.nodeId === this.config.nodeId) continue;
1336
1358
  if (peer.nodeId === from.remoteNodeId) {
1337
1359
  const prev = this.router.getRoute(peer.nodeId);
1338
- const hadAgents = prev?.agents.length ?? 0;
1339
- const hadDirectPeers = prev?.directPeers.length ?? 0;
1340
- const hadDeviceInfo = prev?.deviceInfo?.hostname;
1341
- const hadAcpAgents = prev?.acpAgents?.length ?? 0;
1342
- const hadToolProxyEnabled = prev?.toolProxy?.enabled;
1343
- const hadToolProxyCatalogLen = prev?.toolProxy?.catalog?.length ?? 0;
1344
- const hadToolProxyAllowLen = prev?.toolProxy?.allow?.length ?? 0;
1360
+ const prevSnapshot = prev ? JSON.stringify({
1361
+ a: prev.agents.length, m: prev.models.length,
1362
+ dp: prev.directPeers?.length, tp: prev.toolProxy,
1363
+ di: prev.deviceInfo, aa: prev.acpAgents?.length,
1364
+ }) : "";
1345
1365
  this.router.updatePeerCapabilities(peer.nodeId, peer);
1346
- if (peer.agents.length !== hadAgents || peer.models.length !== (prev?.models.length ?? 0)
1347
- || (peer.directPeers?.length ?? 0) !== hadDirectPeers
1348
- || peer.toolProxy?.enabled !== hadToolProxyEnabled
1349
- || (peer.toolProxy?.catalog?.length ?? 0) !== hadToolProxyCatalogLen
1350
- || (peer.toolProxy?.allow?.length ?? 0) !== hadToolProxyAllowLen
1351
- || peer.deviceInfo?.hostname !== hadDeviceInfo
1352
- || (peer.acpAgents?.length ?? 0) !== hadAcpAgents) {
1366
+ const newSnapshot = JSON.stringify({
1367
+ a: peer.agents.length, m: peer.models.length,
1368
+ dp: peer.directPeers?.length, tp: peer.toolProxy,
1369
+ di: peer.deviceInfo, aa: peer.acpAgents?.length,
1370
+ });
1371
+ if (newSnapshot !== prevSnapshot) {
1353
1372
  changed = true;
1354
1373
  }
1355
1374
  } else {
@@ -1357,7 +1376,14 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
1357
1376
  // using them as relay would create a routing loop.
1358
1377
  if (peer.reachableVia === this.config.nodeId) continue;
1359
1378
  const existing = this.router.getRoute(peer.nodeId);
1360
- if (!existing) changed = true;
1379
+ if (!existing) {
1380
+ changed = true;
1381
+ } else {
1382
+ // Detect changes in existing relay peer (e.g. version update after restart)
1383
+ const prevSnap = JSON.stringify({ a: existing.agents.length, m: existing.models.length, di: existing.deviceInfo, aa: existing.acpAgents?.length });
1384
+ const newSnap = JSON.stringify({ a: peer.agents.length, m: peer.models.length, di: peer.deviceInfo, aa: peer.acpAgents?.length });
1385
+ if (newSnap !== prevSnap) changed = true;
1386
+ }
1361
1387
  this.router.addRelayPeer(peer, from.remoteNodeId!);
1362
1388
  }
1363
1389
  }
package/src/router.ts CHANGED
@@ -8,6 +8,9 @@ const MAX_SEEN_FRAMES = 10_000;
8
8
  const MAX_FAILED_REQUESTS = 5_000;
9
9
  const SEEN_FRAME_TTL = 120_000; // 2 minutes (was ~60-120s with double-map rotation)
10
10
  const FAILED_REQUEST_TTL = 900_000; // 15 minutes
11
+ /** Relay routes older than this without a peer_sync refresh are considered stale. */
12
+ const RELAY_ROUTE_MAX_AGE = 300_000; // 5 minutes
13
+ const RELAY_ROUTE_SWEEP_INTERVAL = 60_000; // sweep every 60s
11
14
 
12
15
  export interface RouteEntry {
13
16
  nodeId: string;
@@ -127,8 +130,51 @@ export class Router {
127
130
  }
128
131
  }
129
132
 
133
/** Handle of the periodic stale-relay sweep; `null` while none is scheduled. */
private relaySweepTimer: ReturnType<typeof setInterval> | null = null;

/**
 * Schedules `removeStaleRelayRoutes` to run every
 * `RELAY_ROUTE_SWEEP_INTERVAL` milliseconds.
 * Calling this while a sweep is already scheduled changes nothing.
 */
startRelaySweep() {
  if (!this.relaySweepTimer) {
    const sweep = () => {
      this.removeStaleRelayRoutes();
    };
    this.relaySweepTimer = setInterval(sweep, RELAY_ROUTE_SWEEP_INTERVAL);
  }
}
140
+
141
/**
 * Drops every relay route (an entry with `reachableVia` set and no
 * `connection`) whose `lastSeen` is more than `RELAY_ROUTE_MAX_AGE`
 * milliseconds in the past.
 *
 * For each dropped route the entry is unindexed, removed from `routes`,
 * `syncVersion` is bumped and recorded in `removedPeers`, and the node's
 * `peerVersions` entry is deleted.
 *
 * @returns Ids of the routes that were removed, in iteration order.
 */
removeStaleRelayRoutes(): string[] {
  const sweepTime = Date.now();
  const dropped: string[] = [];

  for (const [peerId, route] of this.routes) {
    // Only relay-only entries are candidates.
    if (!route.reachableVia || route.connection) continue;

    const age = sweepTime - route.lastSeen;
    // Negated form keeps a non-numeric age on the "not stale" side.
    if (!(age > RELAY_ROUTE_MAX_AGE)) continue;

    debug("router", `removing stale relay route: ${peerId} (via ${route.reachableVia}, age=${Math.round(age / 1000)}s)`);
    this.unindexEntry(route);
    this.routes.delete(peerId);
    this.syncVersion += 1;
    this.removedPeers.set(peerId, this.syncVersion);
    this.peerVersions.delete(peerId);
    dropped.push(peerId);
  }

  return dropped;
}
158
+
159
/**
 * Remove a specific relay route (e.g. on relay_fail).
 *
 * Only relay entries are touched: the call is a no-op when the node is
 * unknown, when the entry has a `connection`, or when the entry has no
 * `reachableVia`. This is the same notion of "relay route" that
 * `removeStaleRelayRoutes` uses.
 *
 * On removal the entry is unindexed, deleted from `routes`, `syncVersion`
 * is bumped and recorded in `removedPeers`, and the node's `peerVersions`
 * entry is deleted.
 *
 * @param nodeId - Id of the node whose relay route should be dropped.
 * @returns `true` if a route was removed, `false` otherwise.
 */
removeRelayRoute(nodeId: string): boolean {
  const entry = this.routes.get(nodeId);
  // Entries with a connection, or with no relay hop recorded, are not relay routes.
  if (!entry || entry.connection || !entry.reachableVia) return false;
  debug("router", `removing relay route on failure: ${nodeId} (via ${entry.reachableVia})`);
  this.unindexEntry(entry);
  this.routes.delete(nodeId);
  this.syncVersion++;
  this.removedPeers.set(nodeId, this.syncVersion);
  this.peerVersions.delete(nodeId);
  return true;
}
171
+
130
172
  /** Stop periodic cleanup. Call on shutdown. */
131
173
  destroy() {
174
+ if (this.relaySweepTimer) {
175
+ clearInterval(this.relaySweepTimer);
176
+ this.relaySweepTimer = null;
177
+ }
132
178
  this.seenFrames.clear();
133
179
  this.failedRequests.clear();
134
180
  this.channels.clear();
package/src/tool-proxy.ts CHANGED
@@ -130,6 +130,23 @@ export class ToolProxy {
130
130
  }
131
131
 
132
132
  // ── Incoming response ──────────────────────────────────────────
133
/**
 * Rejects whatever is waiting on `frameId` (e.g. after a `relay_fail`).
 *
 * The single-request map is checked first; the batch map is consulted only
 * when no single request is registered under that id. If neither map has
 * an entry the call does nothing.
 *
 * @param frameId - Id of the frame whose pending entry should be failed.
 * @param reason - Message of the `Error` passed to the entry's `reject`.
 */
rejectPending(frameId: string, reason: string) {
  const single = this.pending.get(frameId);
  if (!single) {
    const group = this.pendingBatch.get(frameId);
    if (!group) return;
    clearTimeout(group.timer);
    this.pendingBatch.delete(frameId);
    group.reject(new Error(reason));
    return;
  }

  clearTimeout(single.timer);
  this.pending.delete(frameId);
  single.reject(new Error(reason));
}
149
+
133
150
  handleResponse(frame: ToolProxyResponse) {
134
151
  if (this.peerManager.router.isFailed(frame.id)) return;
135
152
  const pending = this.pending.get(frame.id);