clawmatrix 0.3.1 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli/bin/clawmatrix.mjs +1006 -0
- package/cli/package.json +27 -0
- package/cli/skills/clawmatrix/SKILL.md +104 -0
- package/openclaw.plugin.json +1 -0
- package/package.json +2 -1
- package/src/acp-proxy.ts +425 -37
- package/src/cluster-service.ts +82 -3
- package/src/config.ts +2 -1
- package/src/handoff.ts +8 -0
- package/src/health-tracker.ts +80 -19
- package/src/index.ts +471 -28
- package/src/knowledge-sync.ts +18 -4
- package/src/model-proxy.ts +16 -0
- package/src/peer-manager.ts +318 -25
- package/src/router.ts +93 -1
- package/src/tool-proxy.ts +40 -2
- package/src/tools/cluster-notify.ts +132 -0
- package/src/tools/cluster-peers.ts +2 -0
- package/src/types.ts +1 -1
- package/src/cli.ts +0 -711
package/src/cluster-service.ts
CHANGED
|
@@ -105,6 +105,8 @@ export class ClusterRuntime {
|
|
|
105
105
|
// Pre-built indexes for O(1) local agent lookup
|
|
106
106
|
private agentById = new Map<string, ClawMatrixConfig["agents"][number]>();
|
|
107
107
|
private agentsByTag = new Map<string, ClawMatrixConfig["agents"][number]>();
|
|
108
|
+
/** Track known relay peers to record health events on discovery/removal. */
|
|
109
|
+
private knownRelayPeers = new Set<string>();
|
|
108
110
|
|
|
109
111
|
constructor(config: ClawMatrixConfig, logger: PluginLogger, openclawConfig: OpenClawConfig, openclawVersion?: string) {
|
|
110
112
|
this.config = config;
|
|
@@ -131,6 +133,12 @@ export class ClusterRuntime {
|
|
|
131
133
|
this.agentById.set(a.id, a);
|
|
132
134
|
for (const t of a.tags) this.agentsByTag.set(t, a);
|
|
133
135
|
}
|
|
136
|
+
|
|
137
|
+
// Wire up active task checker for route probing (prevents switching mid-task)
|
|
138
|
+
this.peerManager.setActiveTaskChecker((nodeId) => {
|
|
139
|
+
return this.handoffManager.hasPendingForNode(nodeId)
|
|
140
|
+
|| this.modelProxy.hasPendingForNode(nodeId);
|
|
141
|
+
});
|
|
134
142
|
}
|
|
135
143
|
|
|
136
144
|
async start() {
|
|
@@ -142,8 +150,11 @@ export class ClusterRuntime {
|
|
|
142
150
|
this.peerManager.on("peerConnected", (nodeId) => {
|
|
143
151
|
this.logger.info(`[clawmatrix] Peer connected: ${nodeId}`);
|
|
144
152
|
this.refreshDiscoveredModels();
|
|
145
|
-
|
|
153
|
+
// Init sync state BEFORE recording the event — recordPeerOnline triggers
|
|
154
|
+
// broadcastSync which must use the freshly initialized syncState.
|
|
155
|
+
// Reversing this order causes syncState corruption and infinite sync loops.
|
|
146
156
|
this.healthTracker.initPeerSync(nodeId);
|
|
157
|
+
this.healthTracker.recordPeerOnline(nodeId, "direct");
|
|
147
158
|
});
|
|
148
159
|
|
|
149
160
|
this.peerManager.on("peerDisconnected", (nodeId) => {
|
|
@@ -155,6 +166,7 @@ export class ClusterRuntime {
|
|
|
155
166
|
|
|
156
167
|
this.peerManager.on("peerCapabilitiesChanged", () => {
|
|
157
168
|
this.refreshDiscoveredModels();
|
|
169
|
+
this.trackRelayPeerHealth();
|
|
158
170
|
});
|
|
159
171
|
|
|
160
172
|
// Web dashboard (must be set before peerManager.start() creates the HTTP server)
|
|
@@ -268,8 +280,33 @@ export class ClusterRuntime {
|
|
|
268
280
|
// NOTE: intentionally do NOT stop sentinel here.
|
|
269
281
|
// Sentinel must survive gateway shutdown — that's its entire purpose.
|
|
270
282
|
// It will be replaced by killOldSentinel() on next gateway start.
|
|
283
|
+
|
|
284
|
+
// Wrap all async shutdown in a 4s timeout to prevent blocking gateway restart.
|
|
285
|
+
// OpenClaw's force-exit timer is 5s for stop, so we must finish before that.
|
|
286
|
+
const STOP_TIMEOUT_MS = 4000;
|
|
287
|
+
let timer: ReturnType<typeof setTimeout> | undefined;
|
|
288
|
+
await Promise.race([
|
|
289
|
+
this.stopInternal().then(() => { clearTimeout(timer); }),
|
|
290
|
+
new Promise<void>((resolve) => {
|
|
291
|
+
timer = setTimeout(() => {
|
|
292
|
+
this.logger.warn("[clawmatrix] Graceful shutdown timed out after 4s, forcing cleanup");
|
|
293
|
+
this.forceCleanup();
|
|
294
|
+
resolve();
|
|
295
|
+
}, STOP_TIMEOUT_MS);
|
|
296
|
+
}),
|
|
297
|
+
]);
|
|
298
|
+
this.logger.info(`[clawmatrix] Node "${this.config.nodeId}" stopped`);
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
private async stopInternal() {
|
|
271
302
|
await this.healthTracker.stop();
|
|
272
303
|
await this.knowledgeSync?.stop();
|
|
304
|
+
this.syncCleanup();
|
|
305
|
+
await this.peerManager.stop();
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
/** Synchronous cleanup that never blocks. */
|
|
309
|
+
private syncCleanup() {
|
|
273
310
|
this.webHandler?.destroy();
|
|
274
311
|
this.handoffManager.destroy();
|
|
275
312
|
this.acpProxy?.destroy();
|
|
@@ -277,8 +314,12 @@ export class ClusterRuntime {
|
|
|
277
314
|
this.modelProxy.stop();
|
|
278
315
|
this.fileTransferManager?.destroy();
|
|
279
316
|
this.toolProxy.destroy();
|
|
280
|
-
|
|
281
|
-
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
/** Emergency cleanup when graceful shutdown times out. */
|
|
320
|
+
private forceCleanup() {
|
|
321
|
+
try { this.syncCleanup(); } catch { /* best effort */ }
|
|
322
|
+
try { this.peerManager.forceStop(); } catch { /* best effort */ }
|
|
282
323
|
}
|
|
283
324
|
|
|
284
325
|
private refreshDiscoveredModels() {
|
|
@@ -286,6 +327,32 @@ export class ClusterRuntime {
|
|
|
286
327
|
this.modelProxy.updateDiscoveredModels(peers);
|
|
287
328
|
}
|
|
288
329
|
|
|
330
|
+
/** Track relay peer health: record peer_online/peer_offline for relay peers. */
|
|
331
|
+
private trackRelayPeerHealth() {
|
|
332
|
+
const currentRelayPeers = new Set<string>();
|
|
333
|
+
for (const peer of this.peerManager.router.getAllPeers()) {
|
|
334
|
+
if (peer.reachableVia) {
|
|
335
|
+
currentRelayPeers.add(peer.nodeId);
|
|
336
|
+
}
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
// New relay peers discovered
|
|
340
|
+
for (const nodeId of currentRelayPeers) {
|
|
341
|
+
if (!this.knownRelayPeers.has(nodeId)) {
|
|
342
|
+
this.healthTracker.recordPeerOnline(nodeId, "relay");
|
|
343
|
+
}
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
// Relay peers that disappeared
|
|
347
|
+
for (const nodeId of this.knownRelayPeers) {
|
|
348
|
+
if (!currentRelayPeers.has(nodeId)) {
|
|
349
|
+
this.healthTracker.recordPeerOffline(nodeId, "relay_route_lost");
|
|
350
|
+
}
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
this.knownRelayPeers = currentRelayPeers;
|
|
354
|
+
}
|
|
355
|
+
|
|
289
356
|
/** Fetch tool catalog from the local OpenClaw gateway and advertise to peers. */
|
|
290
357
|
private async fetchToolCatalog() {
|
|
291
358
|
const { spawnProcess } = await import("./compat.ts");
|
|
@@ -576,6 +643,12 @@ export class ClusterRuntime {
|
|
|
576
643
|
this.acpProxy.handleSetConfigRequest(frame as import("./types.ts").AcpSetConfigRequest).catch((err) => {
|
|
577
644
|
this.logger.error(`[clawmatrix] ACP set config error: ${err}`);
|
|
578
645
|
});
|
|
646
|
+
} else {
|
|
647
|
+
const cf = frame as import("./types.ts").AcpSetConfigRequest;
|
|
648
|
+
this.peerManager.sendTo(cf.from, {
|
|
649
|
+
type: "acp_set_config_res", id: cf.id, from: this.config.nodeId, to: cf.from,
|
|
650
|
+
timestamp: Date.now(), payload: { success: false, error: "ACP not enabled on this node" },
|
|
651
|
+
} as import("./types.ts").AcpSetConfigResponse);
|
|
579
652
|
}
|
|
580
653
|
break;
|
|
581
654
|
case "acp_set_config_res":
|
|
@@ -586,6 +659,12 @@ export class ClusterRuntime {
|
|
|
586
659
|
this.acpProxy.handleSubscribeRequest(frame as import("./types.ts").AcpSubscribeRequest).catch((err) => {
|
|
587
660
|
this.logger.error(`[clawmatrix] ACP subscribe error: ${err}`);
|
|
588
661
|
});
|
|
662
|
+
} else {
|
|
663
|
+
const sf = frame as import("./types.ts").AcpSubscribeRequest;
|
|
664
|
+
this.peerManager.sendTo(sf.from, {
|
|
665
|
+
type: "acp_subscribe_res", id: sf.id, from: this.config.nodeId, to: sf.from,
|
|
666
|
+
timestamp: Date.now(), payload: { success: false, error: "ACP not enabled on this node" },
|
|
667
|
+
} as import("./types.ts").AcpSubscribeResponse);
|
|
589
668
|
}
|
|
590
669
|
break;
|
|
591
670
|
case "acp_unsubscribe":
|
package/src/config.ts
CHANGED
|
@@ -51,7 +51,8 @@ const ModelInfoSchema = z.object({
|
|
|
51
51
|
|
|
52
52
|
const PeerConfigSchema = z.object({
|
|
53
53
|
nodeId: z.string(),
|
|
54
|
-
|
|
54
|
+
/** Single URL or array of URLs for multi-channel connections. */
|
|
55
|
+
url: z.union([z.string(), z.array(z.string()).min(1)]),
|
|
55
56
|
});
|
|
56
57
|
|
|
57
58
|
const ToolProxyConfigSchema = z.object({
|
package/src/handoff.ts
CHANGED
|
@@ -678,6 +678,14 @@ export class HandoffManager {
|
|
|
678
678
|
}
|
|
679
679
|
|
|
680
680
|
/** Clean up on shutdown. */
|
|
681
|
+
/** Check if there are pending outbound handoffs targeting a specific node. */
|
|
682
|
+
hasPendingForNode(nodeId: string): boolean {
|
|
683
|
+
for (const p of this.pending.values()) {
|
|
684
|
+
if (p.targetNodeId === nodeId) return true;
|
|
685
|
+
}
|
|
686
|
+
return false;
|
|
687
|
+
}
|
|
688
|
+
|
|
681
689
|
destroy() {
|
|
682
690
|
if (this.staleCleanupTimer) {
|
|
683
691
|
clearInterval(this.staleCleanupTimer);
|
package/src/health-tracker.ts
CHANGED
|
@@ -79,6 +79,12 @@ export class HealthTracker {
|
|
|
79
79
|
private compactTimer: ReturnType<typeof setInterval> | null = null;
|
|
80
80
|
private saveTimer: ReturnType<typeof setTimeout> | null = null;
|
|
81
81
|
private dirty = false;
|
|
82
|
+
/** Debounce timer for broadcastSync (prevents rapid-fire broadcasts). */
|
|
83
|
+
private broadcastTimer: ReturnType<typeof setTimeout> | null = null;
|
|
84
|
+
/** Round counter per peer to detect non-converging sync loops. */
|
|
85
|
+
private syncRounds = new Map<string, number>();
|
|
86
|
+
private static readonly MAX_SYNC_ROUNDS = 10;
|
|
87
|
+
private static readonly BROADCAST_DEBOUNCE = 500; // ms
|
|
82
88
|
|
|
83
89
|
constructor(opts: HealthTrackerOptions) {
|
|
84
90
|
this.nodeId = opts.nodeId;
|
|
@@ -119,6 +125,10 @@ export class HealthTracker {
|
|
|
119
125
|
clearInterval(this.compactTimer);
|
|
120
126
|
this.compactTimer = null;
|
|
121
127
|
}
|
|
128
|
+
if (this.broadcastTimer) {
|
|
129
|
+
clearTimeout(this.broadcastTimer);
|
|
130
|
+
this.broadcastTimer = null;
|
|
131
|
+
}
|
|
122
132
|
if (this.saveTimer) {
|
|
123
133
|
clearTimeout(this.saveTimer);
|
|
124
134
|
this.saveTimer = null;
|
|
@@ -165,6 +175,16 @@ export class HealthTracker {
|
|
|
165
175
|
const message = new Uint8Array(Buffer.from(frame.payload.data, "base64"));
|
|
166
176
|
const syncKey = peerId;
|
|
167
177
|
|
|
178
|
+
// Guard against non-converging sync loops
|
|
179
|
+
const rounds = (this.syncRounds.get(peerId) ?? 0) + 1;
|
|
180
|
+
if (rounds > HealthTracker.MAX_SYNC_ROUNDS) {
|
|
181
|
+
debug(TAG, `sync with ${peerId} exceeded ${HealthTracker.MAX_SYNC_ROUNDS} rounds, resetting`);
|
|
182
|
+
this.syncStates.set(syncKey, Automerge.initSyncState());
|
|
183
|
+
this.syncRounds.delete(peerId);
|
|
184
|
+
return;
|
|
185
|
+
}
|
|
186
|
+
this.syncRounds.set(peerId, rounds);
|
|
187
|
+
|
|
168
188
|
try {
|
|
169
189
|
const syncState = this.syncStates.get(syncKey) ?? Automerge.initSyncState();
|
|
170
190
|
const [newDoc, newSyncState] = Automerge.receiveSyncMessage(this.doc, syncState, message);
|
|
@@ -172,18 +192,19 @@ export class HealthTracker {
|
|
|
172
192
|
this.syncStates.set(syncKey, newSyncState);
|
|
173
193
|
this.scheduleSave();
|
|
174
194
|
|
|
175
|
-
// Send our response
|
|
195
|
+
// Send our response (only if there's something to send)
|
|
176
196
|
this.sendSyncMessage(peerId);
|
|
177
197
|
} catch (err) {
|
|
178
198
|
debug(TAG, `error handling sync from ${peerId}: ${err}`);
|
|
179
199
|
}
|
|
180
200
|
}
|
|
181
201
|
|
|
182
|
-
/**
|
|
202
|
+
/** Initialize sync state for a peer (called on peer connect).
|
|
203
|
+
* Does NOT send a message — the subsequent recordPeerOnline → broadcastSync handles that.
|
|
204
|
+
* Sending here would race with broadcastSync and corrupt the sync state. */
|
|
183
205
|
initPeerSync(peerId: string) {
|
|
184
206
|
if (peerId === this.nodeId) return;
|
|
185
207
|
this.syncStates.set(peerId, Automerge.initSyncState());
|
|
186
|
-
this.sendSyncMessage(peerId);
|
|
187
208
|
}
|
|
188
209
|
|
|
189
210
|
/** Clean up sync state for a disconnected peer. */
|
|
@@ -196,7 +217,11 @@ export class HealthTracker {
|
|
|
196
217
|
const [newSyncState, message] = Automerge.generateSyncMessage(this.doc, syncState);
|
|
197
218
|
this.syncStates.set(peerId, newSyncState);
|
|
198
219
|
|
|
199
|
-
if (!message)
|
|
220
|
+
if (!message) {
|
|
221
|
+
// Sync converged — reset round counter
|
|
222
|
+
this.syncRounds.delete(peerId);
|
|
223
|
+
return;
|
|
224
|
+
}
|
|
200
225
|
|
|
201
226
|
debug(TAG, `sending health sync to ${peerId} (${message.byteLength} bytes)`);
|
|
202
227
|
|
|
@@ -214,10 +239,17 @@ export class HealthTracker {
|
|
|
214
239
|
}
|
|
215
240
|
|
|
216
241
|
private broadcastSync() {
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
242
|
+
// Debounce: multiple events in quick succession → single broadcast
|
|
243
|
+
if (this.broadcastTimer) return;
|
|
244
|
+
this.broadcastTimer = setTimeout(() => {
|
|
245
|
+
this.broadcastTimer = null;
|
|
246
|
+
// Reset round counters — new broadcast starts fresh sync cycle
|
|
247
|
+
this.syncRounds.clear();
|
|
248
|
+
const peers = this.peerManager.router.getAllPeers();
|
|
249
|
+
for (const peer of peers) {
|
|
250
|
+
this.sendSyncMessage(peer.nodeId);
|
|
251
|
+
}
|
|
252
|
+
}, HealthTracker.BROADCAST_DEBOUNCE);
|
|
221
253
|
}
|
|
222
254
|
|
|
223
255
|
// ── Timeline aggregation ──────────────────────────────────
|
|
@@ -257,9 +289,18 @@ export class HealthTracker {
|
|
|
257
289
|
// Build timeline for each node (including self)
|
|
258
290
|
const nodes: NodeTimeline[] = [];
|
|
259
291
|
|
|
292
|
+
// Collect nodeIds that have ever had peer_online events (actually connected)
|
|
293
|
+
const everConnected = new Set<string>();
|
|
294
|
+
for (const [, entry] of Object.entries(this.doc.nodes)) {
|
|
295
|
+
for (const ev of entry.events) {
|
|
296
|
+
if (ev.type === "peer_online" && ev.peer) everConnected.add(ev.peer);
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
|
|
260
300
|
for (const [nodeId, entry] of Object.entries(this.doc.nodes)) {
|
|
261
|
-
//
|
|
262
|
-
|
|
301
|
+
// Remote nodes: must have successfully connected at some point
|
|
302
|
+
if (nodeId !== this.nodeId && !everConnected.has(nodeId)) continue;
|
|
303
|
+
|
|
263
304
|
const timeline = this.buildNodeTimeline(
|
|
264
305
|
nodeId,
|
|
265
306
|
entry,
|
|
@@ -269,7 +310,10 @@ export class HealthTracker {
|
|
|
269
310
|
bucketCount,
|
|
270
311
|
gaps,
|
|
271
312
|
);
|
|
272
|
-
if (timeline)
|
|
313
|
+
if (!timeline) continue;
|
|
314
|
+
// Hide remote nodes that were offline for the entire requested range
|
|
315
|
+
if (nodeId !== this.nodeId && timeline.uptimeRatio === 0) continue;
|
|
316
|
+
nodes.push(timeline);
|
|
273
317
|
}
|
|
274
318
|
|
|
275
319
|
return {
|
|
@@ -345,19 +389,36 @@ export class HealthTracker {
|
|
|
345
389
|
startTs: number,
|
|
346
390
|
endTs: number,
|
|
347
391
|
): Array<[number, number]> {
|
|
348
|
-
const intervals: Array<[number, number]> = [];
|
|
349
|
-
|
|
350
392
|
if (nodeId === this.nodeId) {
|
|
351
393
|
// Self: start/stop events define uptime
|
|
352
|
-
// But we're looking at all nodes' data, so check if this nodeId
|
|
353
|
-
// has start/stop events (each node writes its own start/stop)
|
|
354
394
|
return this.buildSelfIntervals(events, startTs, endTs);
|
|
355
395
|
}
|
|
356
396
|
|
|
357
|
-
// For remote nodes:
|
|
358
|
-
//
|
|
359
|
-
//
|
|
360
|
-
|
|
397
|
+
// For remote nodes: use BOTH self-reported start/stop intervals AND
|
|
398
|
+
// peer_online/peer_offline observations, then merge for best accuracy.
|
|
399
|
+
// Self-reported intervals are the primary signal (the node knows when
|
|
400
|
+
// it was running); peer observations supplement for relay peers or
|
|
401
|
+
// when CRDT sync hasn't propagated the remote node's own events.
|
|
402
|
+
const selfIntervals = this.buildSelfIntervals(events, startTs, endTs);
|
|
403
|
+
const peerIntervals = this.buildPeerIntervals(nodeId, startTs, endTs);
|
|
404
|
+
return this.mergeIntervals([...selfIntervals, ...peerIntervals]);
|
|
405
|
+
}
|
|
406
|
+
|
|
407
|
+
/** Merge overlapping intervals into a sorted, non-overlapping set. */
|
|
408
|
+
private mergeIntervals(intervals: Array<[number, number]>): Array<[number, number]> {
|
|
409
|
+
if (intervals.length <= 1) return intervals;
|
|
410
|
+
intervals.sort((a, b) => a[0] - b[0]);
|
|
411
|
+
const merged: Array<[number, number]> = [intervals[0]!];
|
|
412
|
+
for (let i = 1; i < intervals.length; i++) {
|
|
413
|
+
const prev = merged[merged.length - 1]!;
|
|
414
|
+
const cur = intervals[i]!;
|
|
415
|
+
if (cur[0] <= prev[1]) {
|
|
416
|
+
prev[1] = Math.max(prev[1], cur[1]);
|
|
417
|
+
} else {
|
|
418
|
+
merged.push(cur);
|
|
419
|
+
}
|
|
420
|
+
}
|
|
421
|
+
return merged;
|
|
361
422
|
}
|
|
362
423
|
|
|
363
424
|
private buildSelfIntervals(
|