clawmatrix 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli/bin/clawmatrix.mjs +1006 -0
- package/cli/package.json +27 -0
- package/cli/skills/clawmatrix/SKILL.md +104 -0
- package/openclaw.plugin.json +1 -0
- package/package.json +2 -1
- package/src/acp-proxy.ts +416 -31
- package/src/cluster-service.ts +72 -2
- package/src/health-tracker.ts +40 -11
- package/src/index.ts +471 -28
- package/src/knowledge-sync.ts +18 -4
- package/src/model-proxy.ts +5 -0
- package/src/peer-manager.ts +33 -4
- package/src/tool-proxy.ts +40 -2
- package/src/tools/cluster-notify.ts +132 -0
- package/src/types.ts +1 -1
- package/src/cli.ts +0 -711
package/src/cluster-service.ts
CHANGED
|
@@ -105,6 +105,8 @@ export class ClusterRuntime {
|
|
|
105
105
|
// Pre-built indexes for O(1) local agent lookup
|
|
106
106
|
private agentById = new Map<string, ClawMatrixConfig["agents"][number]>();
|
|
107
107
|
private agentsByTag = new Map<string, ClawMatrixConfig["agents"][number]>();
|
|
108
|
+
/** Track known relay peers to record health events on discovery/removal. */
|
|
109
|
+
private knownRelayPeers = new Set<string>();
|
|
108
110
|
|
|
109
111
|
constructor(config: ClawMatrixConfig, logger: PluginLogger, openclawConfig: OpenClawConfig, openclawVersion?: string) {
|
|
110
112
|
this.config = config;
|
|
@@ -155,6 +157,7 @@ export class ClusterRuntime {
|
|
|
155
157
|
|
|
156
158
|
this.peerManager.on("peerCapabilitiesChanged", () => {
|
|
157
159
|
this.refreshDiscoveredModels();
|
|
160
|
+
this.trackRelayPeerHealth();
|
|
158
161
|
});
|
|
159
162
|
|
|
160
163
|
// Web dashboard (must be set before peerManager.start() creates the HTTP server)
|
|
@@ -268,8 +271,33 @@ export class ClusterRuntime {
|
|
|
268
271
|
// NOTE: intentionally do NOT stop sentinel here.
|
|
269
272
|
// Sentinel must survive gateway shutdown — that's its entire purpose.
|
|
270
273
|
// It will be replaced by killOldSentinel() on next gateway start.
|
|
274
|
+
|
|
275
|
+
// Wrap all async shutdown in a 4s timeout to prevent blocking gateway restart.
|
|
276
|
+
// OpenClaw's force-exit timer is 5s for stop, so we must finish before that.
|
|
277
|
+
const STOP_TIMEOUT_MS = 4000;
|
|
278
|
+
let timer: ReturnType<typeof setTimeout> | undefined;
|
|
279
|
+
await Promise.race([
|
|
280
|
+
this.stopInternal().then(() => { clearTimeout(timer); }),
|
|
281
|
+
new Promise<void>((resolve) => {
|
|
282
|
+
timer = setTimeout(() => {
|
|
283
|
+
this.logger.warn("[clawmatrix] Graceful shutdown timed out after 4s, forcing cleanup");
|
|
284
|
+
this.forceCleanup();
|
|
285
|
+
resolve();
|
|
286
|
+
}, STOP_TIMEOUT_MS);
|
|
287
|
+
}),
|
|
288
|
+
]);
|
|
289
|
+
this.logger.info(`[clawmatrix] Node "${this.config.nodeId}" stopped`);
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
/**
 * Graceful shutdown sequence. Stages run strictly in order:
 *   1. stop the health tracker,
 *   2. stop knowledge sync, when one is configured,
 *   3. run the synchronous component teardown (`syncCleanup`),
 *   4. stop the peer manager.
 *
 * A rejection from any awaited stage propagates to the caller and the
 * remaining stages are not executed.
 */
private async stopInternal(): Promise<void> {
  await this.healthTracker.stop();

  // Knowledge sync is optional; awaiting `undefined` keeps the same
  // scheduling as an optional-chained call when it is absent.
  const knowledgeSync = this.knowledgeSync;
  await (knowledgeSync == null ? undefined : knowledgeSync.stop());

  // Synchronous teardown happens before the peer manager is stopped.
  this.syncCleanup();

  await this.peerManager.stop();
}
|
|
298
|
+
|
|
299
|
+
/** Synchronous cleanup that never blocks. */
|
|
300
|
+
private syncCleanup() {
|
|
273
301
|
this.webHandler?.destroy();
|
|
274
302
|
this.handoffManager.destroy();
|
|
275
303
|
this.acpProxy?.destroy();
|
|
@@ -277,8 +305,12 @@ export class ClusterRuntime {
|
|
|
277
305
|
this.modelProxy.stop();
|
|
278
306
|
this.fileTransferManager?.destroy();
|
|
279
307
|
this.toolProxy.destroy();
|
|
280
|
-
|
|
281
|
-
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
/**
 * Emergency cleanup used when the graceful shutdown does not finish in time.
 *
 * Runs the synchronous teardown and then force-stops the peer manager.
 * Each step is isolated so that an exception thrown by one step does not
 * prevent the next one from running; thrown errors are intentionally dropped.
 */
private forceCleanup() {
  const steps: Array<() => void> = [
    () => { this.syncCleanup(); },
    () => { this.peerManager.forceStop(); },
  ];
  for (const step of steps) {
    try {
      step();
    } catch {
      /* best effort */
    }
  }
}
|
|
283
315
|
|
|
284
316
|
private refreshDiscoveredModels() {
|
|
@@ -286,6 +318,32 @@ export class ClusterRuntime {
|
|
|
286
318
|
this.modelProxy.updateDiscoveredModels(peers);
|
|
287
319
|
}
|
|
288
320
|
|
|
321
|
+
/**
 * Reconcile the set of relay-reachable peers with the set seen on the
 * previous call and report the difference to the health tracker.
 *
 * A peer counts as a relay peer when its `reachableVia` field is truthy.
 * - Peers that are relay-reachable now but were not known before are
 *   reported via `recordPeerOnline(nodeId, "relay")`.
 * - Peers that were known before but are no longer relay-reachable are
 *   reported via `recordPeerOffline(nodeId, "relay_route_lost")`.
 *
 * The freshly computed set then replaces `knownRelayPeers`.
 */
private trackRelayPeerHealth() {
  // Snapshot of every peer currently reachable through a relay.
  const relayNow = new Set<string>();
  for (const candidate of this.peerManager.router.getAllPeers()) {
    if (!candidate.reachableVia) continue;
    relayNow.add(candidate.nodeId);
  }

  // Appeared since the last snapshot.
  for (const id of relayNow) {
    if (this.knownRelayPeers.has(id)) continue;
    this.healthTracker.recordPeerOnline(id, "relay");
  }

  // Vanished since the last snapshot.
  for (const id of this.knownRelayPeers) {
    if (relayNow.has(id)) continue;
    this.healthTracker.recordPeerOffline(id, "relay_route_lost");
  }

  this.knownRelayPeers = relayNow;
}
|
|
346
|
+
|
|
289
347
|
/** Fetch tool catalog from the local OpenClaw gateway and advertise to peers. */
|
|
290
348
|
private async fetchToolCatalog() {
|
|
291
349
|
const { spawnProcess } = await import("./compat.ts");
|
|
@@ -576,6 +634,12 @@ export class ClusterRuntime {
|
|
|
576
634
|
this.acpProxy.handleSetConfigRequest(frame as import("./types.ts").AcpSetConfigRequest).catch((err) => {
|
|
577
635
|
this.logger.error(`[clawmatrix] ACP set config error: ${err}`);
|
|
578
636
|
});
|
|
637
|
+
} else {
|
|
638
|
+
const cf = frame as import("./types.ts").AcpSetConfigRequest;
|
|
639
|
+
this.peerManager.sendTo(cf.from, {
|
|
640
|
+
type: "acp_set_config_res", id: cf.id, from: this.config.nodeId, to: cf.from,
|
|
641
|
+
timestamp: Date.now(), payload: { success: false, error: "ACP not enabled on this node" },
|
|
642
|
+
} as import("./types.ts").AcpSetConfigResponse);
|
|
579
643
|
}
|
|
580
644
|
break;
|
|
581
645
|
case "acp_set_config_res":
|
|
@@ -586,6 +650,12 @@ export class ClusterRuntime {
|
|
|
586
650
|
this.acpProxy.handleSubscribeRequest(frame as import("./types.ts").AcpSubscribeRequest).catch((err) => {
|
|
587
651
|
this.logger.error(`[clawmatrix] ACP subscribe error: ${err}`);
|
|
588
652
|
});
|
|
653
|
+
} else {
|
|
654
|
+
const sf = frame as import("./types.ts").AcpSubscribeRequest;
|
|
655
|
+
this.peerManager.sendTo(sf.from, {
|
|
656
|
+
type: "acp_subscribe_res", id: sf.id, from: this.config.nodeId, to: sf.from,
|
|
657
|
+
timestamp: Date.now(), payload: { success: false, error: "ACP not enabled on this node" },
|
|
658
|
+
} as import("./types.ts").AcpSubscribeResponse);
|
|
589
659
|
}
|
|
590
660
|
break;
|
|
591
661
|
case "acp_unsubscribe":
|
package/src/health-tracker.ts
CHANGED
|
@@ -257,9 +257,18 @@ export class HealthTracker {
|
|
|
257
257
|
// Build timeline for each node (including self)
|
|
258
258
|
const nodes: NodeTimeline[] = [];
|
|
259
259
|
|
|
260
|
+
// Collect nodeIds that have ever had peer_online events (actually connected)
|
|
261
|
+
const everConnected = new Set<string>();
|
|
262
|
+
for (const [, entry] of Object.entries(this.doc.nodes)) {
|
|
263
|
+
for (const ev of entry.events) {
|
|
264
|
+
if (ev.type === "peer_online" && ev.peer) everConnected.add(ev.peer);
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
|
|
260
268
|
for (const [nodeId, entry] of Object.entries(this.doc.nodes)) {
|
|
261
|
-
//
|
|
262
|
-
|
|
269
|
+
// Remote nodes: must have successfully connected at some point
|
|
270
|
+
if (nodeId !== this.nodeId && !everConnected.has(nodeId)) continue;
|
|
271
|
+
|
|
263
272
|
const timeline = this.buildNodeTimeline(
|
|
264
273
|
nodeId,
|
|
265
274
|
entry,
|
|
@@ -269,7 +278,10 @@ export class HealthTracker {
|
|
|
269
278
|
bucketCount,
|
|
270
279
|
gaps,
|
|
271
280
|
);
|
|
272
|
-
if (timeline)
|
|
281
|
+
if (!timeline) continue;
|
|
282
|
+
// Hide remote nodes that were offline for the entire requested range
|
|
283
|
+
if (nodeId !== this.nodeId && timeline.uptimeRatio === 0) continue;
|
|
284
|
+
nodes.push(timeline);
|
|
273
285
|
}
|
|
274
286
|
|
|
275
287
|
return {
|
|
@@ -345,19 +357,36 @@ export class HealthTracker {
|
|
|
345
357
|
startTs: number,
|
|
346
358
|
endTs: number,
|
|
347
359
|
): Array<[number, number]> {
|
|
348
|
-
const intervals: Array<[number, number]> = [];
|
|
349
|
-
|
|
350
360
|
if (nodeId === this.nodeId) {
|
|
351
361
|
// Self: start/stop events define uptime
|
|
352
|
-
// But we're looking at all nodes' data, so check if this nodeId
|
|
353
|
-
// has start/stop events (each node writes its own start/stop)
|
|
354
362
|
return this.buildSelfIntervals(events, startTs, endTs);
|
|
355
363
|
}
|
|
356
364
|
|
|
357
|
-
// For remote nodes:
|
|
358
|
-
//
|
|
359
|
-
//
|
|
360
|
-
|
|
365
|
+
// For remote nodes: use BOTH self-reported start/stop intervals AND
|
|
366
|
+
// peer_online/peer_offline observations, then merge for best accuracy.
|
|
367
|
+
// Self-reported intervals are the primary signal (the node knows when
|
|
368
|
+
// it was running); peer observations supplement for relay peers or
|
|
369
|
+
// when CRDT sync hasn't propagated the remote node's own events.
|
|
370
|
+
const selfIntervals = this.buildSelfIntervals(events, startTs, endTs);
|
|
371
|
+
const peerIntervals = this.buildPeerIntervals(nodeId, startTs, endTs);
|
|
372
|
+
return this.mergeIntervals([...selfIntervals, ...peerIntervals]);
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
/**
 * Merge overlapping (or touching) intervals into a sorted, non-overlapping set.
 *
 * The input is not modified: sorting is done on a copy and every merged
 * interval is a fresh tuple, so callers can keep using the arrays and
 * tuples they passed in.
 *
 * @param intervals `[start, end]` pairs in any order.
 * @returns Intervals sorted by start, with any pair where
 *   `next.start <= prev.end` collapsed into one. When the input has zero
 *   or one element it is returned as-is.
 */
private mergeIntervals(intervals: Array<[number, number]>): Array<[number, number]> {
  if (intervals.length <= 1) return intervals;

  // Array.prototype.sort works in place, so sort a shallow copy instead of
  // reordering the caller's array.
  const sorted = [...intervals].sort((a, b) => a[0] - b[0]);

  const merged: Array<[number, number]> = [];
  for (const [start, end] of sorted) {
    const last = merged[merged.length - 1];
    if (last !== undefined && start <= last[1]) {
      // Overlaps or touches the previous interval: extend it. `last` is a
      // tuple created below, never one owned by the caller.
      last[1] = Math.max(last[1], end);
    } else {
      // Push a copy so later extensions do not write through to the input.
      merged.push([start, end]);
    }
  }
  return merged;
}
|
|
362
391
|
|
|
363
392
|
private buildSelfIntervals(
|