clawmatrix 0.6.2 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/acp-proxy.ts +36 -0
- package/src/cluster-service.ts +25 -1
- package/src/handoff.ts +32 -0
- package/src/peer-manager.ts +41 -15
- package/src/router.ts +46 -0
- package/src/tool-proxy.ts +17 -0
package/package.json
CHANGED
package/src/acp-proxy.ts
CHANGED
|
@@ -2626,6 +2626,42 @@ export class AcpProxy {
|
|
|
2626
2626
|
return null;
|
|
2627
2627
|
}
|
|
2628
2628
|
|
|
2629
|
+
/** Reject a specific pending request by frame id (e.g. from relay_fail). */
|
|
2630
|
+
rejectPending(frameId: string, reason: string) {
|
|
2631
|
+
const entry = this.pending.get(frameId);
|
|
2632
|
+
if (entry) {
|
|
2633
|
+
clearTimeout(entry.timer);
|
|
2634
|
+
this.pending.delete(frameId);
|
|
2635
|
+
entry.reject(new Error(reason));
|
|
2636
|
+
}
|
|
2637
|
+
}
|
|
2638
|
+
|
|
2639
|
+
/**
|
|
2640
|
+
* Fail-fast all pending requests targeting a disconnected node (or nodes
|
|
2641
|
+
* that were reachable only via that node as relay).
|
|
2642
|
+
* Called by ClusterRuntime on peerDisconnected.
|
|
2643
|
+
*/
|
|
2644
|
+
onPeerDisconnected(nodeId: string) {
|
|
2645
|
+
for (const [id, entry] of this.pending) {
|
|
2646
|
+
const target = entry.targetNodeId;
|
|
2647
|
+
if (!target) continue;
|
|
2648
|
+
// Direct match: request was targeting the disconnected node
|
|
2649
|
+
if (target === nodeId) {
|
|
2650
|
+
clearTimeout(entry.timer);
|
|
2651
|
+
this.pending.delete(id);
|
|
2652
|
+
entry.reject(new Error(`Node "${target}" disconnected while request was pending`));
|
|
2653
|
+
continue;
|
|
2654
|
+
}
|
|
2655
|
+
// Relay match: request was targeting a node reachable via the disconnected relay
|
|
2656
|
+
const route = this.peerManager.router.getRoute(target);
|
|
2657
|
+
if (!route || (!route.connection?.isOpen && !route.reachableVia)) {
|
|
2658
|
+
clearTimeout(entry.timer);
|
|
2659
|
+
this.pending.delete(id);
|
|
2660
|
+
entry.reject(new Error(`Node "${target}" became unreachable (relay "${nodeId}" disconnected)`));
|
|
2661
|
+
}
|
|
2662
|
+
}
|
|
2663
|
+
}
|
|
2664
|
+
|
|
2629
2665
|
private sendRequest(
|
|
2630
2666
|
targetNodeId: string,
|
|
2631
2667
|
agent: string,
|
package/src/cluster-service.ts
CHANGED
|
@@ -175,6 +175,9 @@ export class ClusterRuntime {
|
|
|
175
175
|
this.refreshDiscoveredModels();
|
|
176
176
|
this.healthTracker.recordPeerOffline(nodeId);
|
|
177
177
|
this.healthTracker.removePeerSync(nodeId);
|
|
178
|
+
// Fail-fast any pending requests targeting this node or relayed through it
|
|
179
|
+
this.acpProxy?.onPeerDisconnected(nodeId);
|
|
180
|
+
this.handoffManager?.onPeerDisconnected(nodeId);
|
|
178
181
|
});
|
|
179
182
|
|
|
180
183
|
this.peerManager.on("peerCapabilitiesChanged", () => {
|
|
@@ -275,7 +278,12 @@ export class ClusterRuntime {
|
|
|
275
278
|
// Auto-detect ACP agents if ACP is enabled but no agents are explicitly configured
|
|
276
279
|
// Check both ClawMatrix and OpenClaw configs (consistent with acpProxy creation above)
|
|
277
280
|
if (this.acpProxy && (!this.config.acp?.agents || this.config.acp.agents.length === 0)) {
|
|
278
|
-
|
|
281
|
+
const openclawAcp = (this.openclawConfig as Record<string, any>).acp;
|
|
282
|
+
const detectionCommands = {
|
|
283
|
+
...(openclawAcp?.commands && typeof openclawAcp.commands === "object" ? openclawAcp.commands : {}),
|
|
284
|
+
...(this.config.acp?.commands ?? {}),
|
|
285
|
+
};
|
|
286
|
+
AcpProxy.detectAvailableAgents(Object.keys(detectionCommands).length > 0 ? detectionCommands : undefined).then((detected) => {
|
|
279
287
|
if (detected.length > 0) {
|
|
280
288
|
this.logger.info(`[clawmatrix] Auto-detected ACP agents: ${detected.map((a) => a.id).join(", ")}`);
|
|
281
289
|
this.peerManager.updateAcpAgents(detected);
|
|
@@ -891,6 +899,22 @@ export class ClusterRuntime {
|
|
|
891
899
|
this.apiHandler.pushKanbanEvent(kf.payload);
|
|
892
900
|
}
|
|
893
901
|
break;
|
|
902
|
+
case "relay_fail": {
|
|
903
|
+
// A relay node could not forward our frame to its target.
|
|
904
|
+
// Clean up the stale relay route and reject pending requests.
|
|
905
|
+
const target = (frame.payload as any)?.target ?? "unknown";
|
|
906
|
+
debug("dispatch", `relay_fail id=${frame.id} target=${target} from=${frame.from}`);
|
|
907
|
+
// Remove the stale relay route so we don't keep sending frames into a dead path
|
|
908
|
+
if (typeof target === "string" && target !== "unknown") {
|
|
909
|
+
this.peerManager.router.removeRelayRoute(target);
|
|
910
|
+
}
|
|
911
|
+
if (frame.id) {
|
|
912
|
+
this.acpProxy?.rejectPending(frame.id, `Relay "${frame.from}" cannot reach "${target}"`);
|
|
913
|
+
this.handoffManager?.rejectPending(frame.id, `Relay "${frame.from}" cannot reach "${target}"`);
|
|
914
|
+
this.toolProxy.rejectPending(frame.id, `Relay "${frame.from}" cannot reach "${target}"`);
|
|
915
|
+
}
|
|
916
|
+
break;
|
|
917
|
+
}
|
|
894
918
|
}
|
|
895
919
|
}
|
|
896
920
|
|
package/src/handoff.ts
CHANGED
|
@@ -744,6 +744,38 @@ export class HandoffManager {
|
|
|
744
744
|
return false;
|
|
745
745
|
}
|
|
746
746
|
|
|
747
|
+
/** Reject a specific pending request by frame id (e.g. from relay_fail). */
|
|
748
|
+
rejectPending(frameId: string, reason: string) {
|
|
749
|
+
const entry = this.pending.get(frameId);
|
|
750
|
+
if (entry) {
|
|
751
|
+
clearTimeout(entry.timer);
|
|
752
|
+
this.pending.delete(frameId);
|
|
753
|
+
entry.reject(new Error(reason));
|
|
754
|
+
}
|
|
755
|
+
}
|
|
756
|
+
|
|
757
|
+
/**
|
|
758
|
+
* Fail-fast pending handoff requests targeting a disconnected node
|
|
759
|
+
* (or nodes reachable only via that relay). Called on peerDisconnected.
|
|
760
|
+
*/
|
|
761
|
+
onPeerDisconnected(nodeId: string) {
|
|
762
|
+
for (const [id, entry] of this.pending) {
|
|
763
|
+
if (entry.targetNodeId === nodeId) {
|
|
764
|
+
clearTimeout(entry.timer);
|
|
765
|
+
this.pending.delete(id);
|
|
766
|
+
entry.reject(new Error(`Node "${entry.targetNodeId}" disconnected while handoff was pending`));
|
|
767
|
+
continue;
|
|
768
|
+
}
|
|
769
|
+
// Check if the target was reachable via this relay
|
|
770
|
+
const route = this.peerManager.router.getRoute(entry.targetNodeId);
|
|
771
|
+
if (!route || (!route.connection?.isOpen && !route.reachableVia)) {
|
|
772
|
+
clearTimeout(entry.timer);
|
|
773
|
+
this.pending.delete(id);
|
|
774
|
+
entry.reject(new Error(`Node "${entry.targetNodeId}" became unreachable (relay "${nodeId}" disconnected)`));
|
|
775
|
+
}
|
|
776
|
+
}
|
|
777
|
+
}
|
|
778
|
+
|
|
747
779
|
destroy() {
|
|
748
780
|
if (this.staleCleanupTimer) {
|
|
749
781
|
clearInterval(this.staleCleanupTimer);
|
package/src/peer-manager.ts
CHANGED
|
@@ -50,6 +50,8 @@ const SKIP_DEDUP_EXPLICIT = new Set([
|
|
|
50
50
|
// File transfer
|
|
51
51
|
"file_transfer_chunk", "file_transfer_chunk_ack",
|
|
52
52
|
"file_transfer_ack", "file_transfer_complete",
|
|
53
|
+
// Relay failure notification (shares id with the original request)
|
|
54
|
+
"relay_fail",
|
|
53
55
|
]);
|
|
54
56
|
|
|
55
57
|
function skipDedup(type: string): boolean {
|
|
@@ -151,6 +153,8 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
151
153
|
acpAgents = config.acp.agents;
|
|
152
154
|
} else if (Array.isArray(ocAcp?.allowedAgents) && ocAcp.allowedAgents.length > 0) {
|
|
153
155
|
acpAgents = ocAcp.allowedAgents.map((id: string) => ({ id, description: "" }));
|
|
156
|
+
} else if (ocAcp?.commands && typeof ocAcp.commands === "object") {
|
|
157
|
+
acpAgents = Object.keys(ocAcp.commands).map((id) => ({ id, description: "" }));
|
|
154
158
|
}
|
|
155
159
|
}
|
|
156
160
|
this.localCapabilities = {
|
|
@@ -231,6 +235,8 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
231
235
|
}
|
|
232
236
|
// Start route probing for peers with multiple URLs
|
|
233
237
|
this.startRouteProbing();
|
|
238
|
+
// Start periodic sweep of stale relay routes
|
|
239
|
+
this.router.startRelaySweep();
|
|
234
240
|
}
|
|
235
241
|
|
|
236
242
|
async stop() {
|
|
@@ -1251,6 +1257,22 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
1251
1257
|
// No remote alternative — fall through to local handling
|
|
1252
1258
|
// (model-proxy will handle locally or send error back)
|
|
1253
1259
|
} else {
|
|
1260
|
+
// Relay failed — clean up the stale route so we don't keep trying it,
|
|
1261
|
+
// and notify the sender so it can fail-fast.
|
|
1262
|
+
if (frame.to) {
|
|
1263
|
+
this.router.removeRelayRoute(frame.to);
|
|
1264
|
+
}
|
|
1265
|
+
if (frame.from && frame.id) {
|
|
1266
|
+
debug("peer", `relay failed for ${frame.type} id=${frame.id} to=${frame.to}, sending relay_fail to ${frame.from}`);
|
|
1267
|
+
this.sendTo(frame.from, {
|
|
1268
|
+
type: "relay_fail",
|
|
1269
|
+
id: frame.id,
|
|
1270
|
+
from: this.config.nodeId,
|
|
1271
|
+
to: frame.from,
|
|
1272
|
+
timestamp: Date.now(),
|
|
1273
|
+
payload: { target: frame.to, reason: "unreachable" },
|
|
1274
|
+
} as AnyClusterFrame);
|
|
1275
|
+
}
|
|
1254
1276
|
return;
|
|
1255
1277
|
}
|
|
1256
1278
|
}
|
|
@@ -1335,21 +1357,18 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
1335
1357
|
if (peer.nodeId === this.config.nodeId) continue;
|
|
1336
1358
|
if (peer.nodeId === from.remoteNodeId) {
|
|
1337
1359
|
const prev = this.router.getRoute(peer.nodeId);
|
|
1338
|
-
const
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
const hadToolProxyCatalogLen = prev?.toolProxy?.catalog?.length ?? 0;
|
|
1344
|
-
const hadToolProxyAllowLen = prev?.toolProxy?.allow?.length ?? 0;
|
|
1360
|
+
const prevSnapshot = prev ? JSON.stringify({
|
|
1361
|
+
a: prev.agents.length, m: prev.models.length,
|
|
1362
|
+
dp: prev.directPeers?.length, tp: prev.toolProxy,
|
|
1363
|
+
di: prev.deviceInfo, aa: prev.acpAgents?.length,
|
|
1364
|
+
}) : "";
|
|
1345
1365
|
this.router.updatePeerCapabilities(peer.nodeId, peer);
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|| (peer.acpAgents?.length ?? 0) !== hadAcpAgents) {
|
|
1366
|
+
const newSnapshot = JSON.stringify({
|
|
1367
|
+
a: peer.agents.length, m: peer.models.length,
|
|
1368
|
+
dp: peer.directPeers?.length, tp: peer.toolProxy,
|
|
1369
|
+
di: peer.deviceInfo, aa: peer.acpAgents?.length,
|
|
1370
|
+
});
|
|
1371
|
+
if (newSnapshot !== prevSnapshot) {
|
|
1353
1372
|
changed = true;
|
|
1354
1373
|
}
|
|
1355
1374
|
} else {
|
|
@@ -1357,7 +1376,14 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
1357
1376
|
// using them as relay would create a routing loop.
|
|
1358
1377
|
if (peer.reachableVia === this.config.nodeId) continue;
|
|
1359
1378
|
const existing = this.router.getRoute(peer.nodeId);
|
|
1360
|
-
if (!existing)
|
|
1379
|
+
if (!existing) {
|
|
1380
|
+
changed = true;
|
|
1381
|
+
} else {
|
|
1382
|
+
// Detect changes in existing relay peer (e.g. version update after restart)
|
|
1383
|
+
const prevSnap = JSON.stringify({ a: existing.agents.length, m: existing.models.length, di: existing.deviceInfo, aa: existing.acpAgents?.length });
|
|
1384
|
+
const newSnap = JSON.stringify({ a: peer.agents.length, m: peer.models.length, di: peer.deviceInfo, aa: peer.acpAgents?.length });
|
|
1385
|
+
if (newSnap !== prevSnap) changed = true;
|
|
1386
|
+
}
|
|
1361
1387
|
this.router.addRelayPeer(peer, from.remoteNodeId!);
|
|
1362
1388
|
}
|
|
1363
1389
|
}
|
package/src/router.ts
CHANGED
|
@@ -8,6 +8,9 @@ const MAX_SEEN_FRAMES = 10_000;
|
|
|
8
8
|
const MAX_FAILED_REQUESTS = 5_000;
|
|
9
9
|
const SEEN_FRAME_TTL = 120_000; // 2 minutes (was ~60-120s with double-map rotation)
|
|
10
10
|
const FAILED_REQUEST_TTL = 900_000; // 15 minutes
|
|
11
|
+
/** Relay routes older than this without a peer_sync refresh are considered stale. */
|
|
12
|
+
const RELAY_ROUTE_MAX_AGE = 300_000; // 5 minutes
|
|
13
|
+
const RELAY_ROUTE_SWEEP_INTERVAL = 60_000; // sweep every 60s
|
|
11
14
|
|
|
12
15
|
export interface RouteEntry {
|
|
13
16
|
nodeId: string;
|
|
@@ -127,8 +130,51 @@ export class Router {
|
|
|
127
130
|
}
|
|
128
131
|
}
|
|
129
132
|
|
|
133
|
+
private relaySweepTimer: ReturnType<typeof setInterval> | null = null;
|
|
134
|
+
|
|
135
|
+
/** Start periodic sweep of stale relay routes. */
|
|
136
|
+
startRelaySweep() {
|
|
137
|
+
if (this.relaySweepTimer) return;
|
|
138
|
+
this.relaySweepTimer = setInterval(() => this.removeStaleRelayRoutes(), RELAY_ROUTE_SWEEP_INTERVAL);
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
/** Remove relay routes whose lastSeen is older than RELAY_ROUTE_MAX_AGE. */
|
|
142
|
+
removeStaleRelayRoutes(): string[] {
|
|
143
|
+
const now = Date.now();
|
|
144
|
+
const removed: string[] = [];
|
|
145
|
+
for (const [id, entry] of this.routes) {
|
|
146
|
+
if (entry.reachableVia && !entry.connection && (now - entry.lastSeen) > RELAY_ROUTE_MAX_AGE) {
|
|
147
|
+
debug("router", `removing stale relay route: ${id} (via ${entry.reachableVia}, age=${Math.round((now - entry.lastSeen) / 1000)}s)`);
|
|
148
|
+
this.unindexEntry(entry);
|
|
149
|
+
this.routes.delete(id);
|
|
150
|
+
this.syncVersion++;
|
|
151
|
+
this.removedPeers.set(id, this.syncVersion);
|
|
152
|
+
this.peerVersions.delete(id);
|
|
153
|
+
removed.push(id);
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
return removed;
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
/** Remove a specific relay route (e.g. on relay_fail). No-op for direct connections. */
|
|
160
|
+
removeRelayRoute(nodeId: string): boolean {
|
|
161
|
+
const entry = this.routes.get(nodeId);
|
|
162
|
+
if (!entry || entry.connection) return false; // only remove relay routes
|
|
163
|
+
debug("router", `removing relay route on failure: ${nodeId} (via ${entry.reachableVia})`);
|
|
164
|
+
this.unindexEntry(entry);
|
|
165
|
+
this.routes.delete(nodeId);
|
|
166
|
+
this.syncVersion++;
|
|
167
|
+
this.removedPeers.set(nodeId, this.syncVersion);
|
|
168
|
+
this.peerVersions.delete(nodeId);
|
|
169
|
+
return true;
|
|
170
|
+
}
|
|
171
|
+
|
|
130
172
|
/** Stop periodic cleanup. Call on shutdown. */
|
|
131
173
|
destroy() {
|
|
174
|
+
if (this.relaySweepTimer) {
|
|
175
|
+
clearInterval(this.relaySweepTimer);
|
|
176
|
+
this.relaySweepTimer = null;
|
|
177
|
+
}
|
|
132
178
|
this.seenFrames.clear();
|
|
133
179
|
this.failedRequests.clear();
|
|
134
180
|
this.channels.clear();
|
package/src/tool-proxy.ts
CHANGED
|
@@ -130,6 +130,23 @@ export class ToolProxy {
|
|
|
130
130
|
}
|
|
131
131
|
|
|
132
132
|
// ── Incoming response ──────────────────────────────────────────
|
|
133
|
+
/** Reject a specific pending request by frame id (e.g. from relay_fail). */
|
|
134
|
+
rejectPending(frameId: string, reason: string) {
|
|
135
|
+
const pending = this.pending.get(frameId);
|
|
136
|
+
if (pending) {
|
|
137
|
+
clearTimeout(pending.timer);
|
|
138
|
+
this.pending.delete(frameId);
|
|
139
|
+
pending.reject(new Error(reason));
|
|
140
|
+
return;
|
|
141
|
+
}
|
|
142
|
+
const batch = this.pendingBatch.get(frameId);
|
|
143
|
+
if (batch) {
|
|
144
|
+
clearTimeout(batch.timer);
|
|
145
|
+
this.pendingBatch.delete(frameId);
|
|
146
|
+
batch.reject(new Error(reason));
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
|
|
133
150
|
handleResponse(frame: ToolProxyResponse) {
|
|
134
151
|
if (this.peerManager.router.isFailed(frame.id)) return;
|
|
135
152
|
const pending = this.pending.get(frame.id);
|