clawmatrix 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -105,6 +105,8 @@ export class ClusterRuntime {
105
105
  // Pre-built indexes for O(1) local agent lookup
106
106
  private agentById = new Map<string, ClawMatrixConfig["agents"][number]>();
107
107
  private agentsByTag = new Map<string, ClawMatrixConfig["agents"][number]>();
108
+ /** Track known relay peers to record health events on discovery/removal. */
109
+ private knownRelayPeers = new Set<string>();
108
110
 
109
111
  constructor(config: ClawMatrixConfig, logger: PluginLogger, openclawConfig: OpenClawConfig, openclawVersion?: string) {
110
112
  this.config = config;
@@ -155,6 +157,7 @@ export class ClusterRuntime {
155
157
 
156
158
  this.peerManager.on("peerCapabilitiesChanged", () => {
157
159
  this.refreshDiscoveredModels();
160
+ this.trackRelayPeerHealth();
158
161
  });
159
162
 
160
163
  // Web dashboard (must be set before peerManager.start() creates the HTTP server)
@@ -268,8 +271,33 @@ export class ClusterRuntime {
268
271
  // NOTE: intentionally do NOT stop sentinel here.
269
272
  // Sentinel must survive gateway shutdown — that's its entire purpose.
270
273
  // It will be replaced by killOldSentinel() on next gateway start.
274
+
275
+ // Wrap all async shutdown in a 4s timeout to prevent blocking gateway restart.
276
+ // OpenClaw's force-exit timer is 5s for stop, so we must finish before that.
277
+ const STOP_TIMEOUT_MS = 4000;
278
+ let timer: ReturnType<typeof setTimeout> | undefined;
279
+ await Promise.race([
280
+ this.stopInternal().then(() => { clearTimeout(timer); }),
281
+ new Promise<void>((resolve) => {
282
+ timer = setTimeout(() => {
283
+ this.logger.warn("[clawmatrix] Graceful shutdown timed out after 4s, forcing cleanup");
284
+ this.forceCleanup();
285
+ resolve();
286
+ }, STOP_TIMEOUT_MS);
287
+ }),
288
+ ]);
289
+ this.logger.info(`[clawmatrix] Node "${this.config.nodeId}" stopped`);
290
+ }
291
+
292
/**
 * Graceful shutdown sequence.
 *
 * Steps run strictly in order: the health tracker and the optional knowledge
 * sync are awaited first, then the synchronous teardown runs, and the peer
 * manager is stopped last. There is no try/catch here, so a rejection from an
 * earlier step propagates to the caller and the later steps do not run.
 */
private async stopInternal() {
  await this.healthTracker.stop();
  // knowledgeSync may be absent; the optional call then awaits `undefined`.
  await this.knowledgeSync?.stop();
  this.syncCleanup();
  await this.peerManager.stop();
}
298
+
299
+ /** Synchronous cleanup that never blocks. */
300
+ private syncCleanup() {
273
301
  this.webHandler?.destroy();
274
302
  this.handoffManager.destroy();
275
303
  this.acpProxy?.destroy();
@@ -277,8 +305,12 @@ export class ClusterRuntime {
277
305
  this.modelProxy.stop();
278
306
  this.fileTransferManager?.destroy();
279
307
  this.toolProxy.destroy();
280
- await this.peerManager.stop();
281
- this.logger.info(`[clawmatrix] Node "${this.config.nodeId}" stopped`);
308
+ }
309
+
310
/**
 * Emergency cleanup when graceful shutdown times out.
 *
 * Runs the synchronous teardown and then force-stops the peer manager. Each
 * step is attempted on its own, so a throw from one never prevents the other.
 */
private forceCleanup() {
  const steps: Array<() => void> = [
    () => this.syncCleanup(),
    () => this.peerManager.forceStop(),
  ];
  for (const step of steps) {
    try {
      step();
    } catch {
      /* best effort */
    }
  }
}
283
315
 
284
316
  private refreshDiscoveredModels() {
@@ -286,6 +318,32 @@ export class ClusterRuntime {
286
318
  this.modelProxy.updateDiscoveredModels(peers);
287
319
  }
288
320
 
321
/**
 * Track relay peer health by diffing the current relay-routed peers against
 * the previously recorded set.
 *
 * A peer counts as a relay peer when its `reachableVia` is truthy. Peers new
 * to the set are recorded as online ("relay"); peers that dropped out are
 * recorded as offline ("relay_route_lost"). The current set then becomes the
 * baseline for the next call.
 */
private trackRelayPeerHealth() {
  // Snapshot the node ids that are currently reachable through a relay.
  const relayNow = new Set<string>();
  for (const candidate of this.peerManager.router.getAllPeers()) {
    if (!candidate.reachableVia) continue;
    relayNow.add(candidate.nodeId);
  }

  const relayBefore = this.knownRelayPeers;

  // First pass: peers that appeared since the last snapshot.
  relayNow.forEach((id) => {
    if (relayBefore.has(id)) return;
    this.healthTracker.recordPeerOnline(id, "relay");
  });

  // Second pass: peers that are no longer relay-reachable.
  relayBefore.forEach((id) => {
    if (relayNow.has(id)) return;
    this.healthTracker.recordPeerOffline(id, "relay_route_lost");
  });

  this.knownRelayPeers = relayNow;
}
346
+
289
347
  /** Fetch tool catalog from the local OpenClaw gateway and advertise to peers. */
290
348
  private async fetchToolCatalog() {
291
349
  const { spawnProcess } = await import("./compat.ts");
@@ -576,6 +634,12 @@ export class ClusterRuntime {
576
634
  this.acpProxy.handleSetConfigRequest(frame as import("./types.ts").AcpSetConfigRequest).catch((err) => {
577
635
  this.logger.error(`[clawmatrix] ACP set config error: ${err}`);
578
636
  });
637
+ } else {
638
+ const cf = frame as import("./types.ts").AcpSetConfigRequest;
639
+ this.peerManager.sendTo(cf.from, {
640
+ type: "acp_set_config_res", id: cf.id, from: this.config.nodeId, to: cf.from,
641
+ timestamp: Date.now(), payload: { success: false, error: "ACP not enabled on this node" },
642
+ } as import("./types.ts").AcpSetConfigResponse);
579
643
  }
580
644
  break;
581
645
  case "acp_set_config_res":
@@ -586,6 +650,12 @@ export class ClusterRuntime {
586
650
  this.acpProxy.handleSubscribeRequest(frame as import("./types.ts").AcpSubscribeRequest).catch((err) => {
587
651
  this.logger.error(`[clawmatrix] ACP subscribe error: ${err}`);
588
652
  });
653
+ } else {
654
+ const sf = frame as import("./types.ts").AcpSubscribeRequest;
655
+ this.peerManager.sendTo(sf.from, {
656
+ type: "acp_subscribe_res", id: sf.id, from: this.config.nodeId, to: sf.from,
657
+ timestamp: Date.now(), payload: { success: false, error: "ACP not enabled on this node" },
658
+ } as import("./types.ts").AcpSubscribeResponse);
589
659
  }
590
660
  break;
591
661
  case "acp_unsubscribe":
@@ -257,9 +257,18 @@ export class HealthTracker {
257
257
  // Build timeline for each node (including self)
258
258
  const nodes: NodeTimeline[] = [];
259
259
 
260
+ // Collect nodeIds that have ever had peer_online events (actually connected)
261
+ const everConnected = new Set<string>();
262
+ for (const [, entry] of Object.entries(this.doc.nodes)) {
263
+ for (const ev of entry.events) {
264
+ if (ev.type === "peer_online" && ev.peer) everConnected.add(ev.peer);
265
+ }
266
+ }
267
+
260
268
  for (const [nodeId, entry] of Object.entries(this.doc.nodes)) {
261
- // For self, use start/stop events to determine uptime
262
- // For other nodes, use peer_online/peer_offline from the observing node
269
+ // Remote nodes: must have successfully connected at some point
270
+ if (nodeId !== this.nodeId && !everConnected.has(nodeId)) continue;
271
+
263
272
  const timeline = this.buildNodeTimeline(
264
273
  nodeId,
265
274
  entry,
@@ -269,7 +278,10 @@ export class HealthTracker {
269
278
  bucketCount,
270
279
  gaps,
271
280
  );
272
- if (timeline) nodes.push(timeline);
281
+ if (!timeline) continue;
282
+ // Hide remote nodes that were offline for the entire requested range
283
+ if (nodeId !== this.nodeId && timeline.uptimeRatio === 0) continue;
284
+ nodes.push(timeline);
273
285
  }
274
286
 
275
287
  return {
@@ -345,19 +357,36 @@ export class HealthTracker {
345
357
  startTs: number,
346
358
  endTs: number,
347
359
  ): Array<[number, number]> {
348
- const intervals: Array<[number, number]> = [];
349
-
350
360
  if (nodeId === this.nodeId) {
351
361
  // Self: start/stop events define uptime
352
- // But we're looking at all nodes' data, so check if this nodeId
353
- // has start/stop events (each node writes its own start/stop)
354
362
  return this.buildSelfIntervals(events, startTs, endTs);
355
363
  }
356
364
 
357
- // For remote nodes: gather peer_online/peer_offline events from all observer nodes
358
- // We have the CRDT doc with all nodes' events merged
359
- // Look through ALL nodes' events for peer_online/peer_offline referencing this nodeId
360
- return this.buildPeerIntervals(nodeId, startTs, endTs);
365
+ // For remote nodes: use BOTH self-reported start/stop intervals AND
366
+ // peer_online/peer_offline observations, then merge for best accuracy.
367
+ // Self-reported intervals are the primary signal (the node knows when
368
+ // it was running); peer observations supplement for relay peers or
369
+ // when CRDT sync hasn't propagated the remote node's own events.
370
+ const selfIntervals = this.buildSelfIntervals(events, startTs, endTs);
371
+ const peerIntervals = this.buildPeerIntervals(nodeId, startTs, endTs);
372
+ return this.mergeIntervals([...selfIntervals, ...peerIntervals]);
373
+ }
374
+
375
/**
 * Merge overlapping (or touching) intervals into a sorted, non-overlapping set.
 *
 * The input is never modified: sorting happens on a copy, and every interval
 * in the result is a freshly allocated tuple. Extending a merged interval
 * therefore cannot write through to the caller's data.
 *
 * @param intervals - `[start, end]` pairs in any order; may be empty.
 * @returns A new array sorted by start, in which any interval whose start is
 *   less than or equal to the previous end has been folded into that previous
 *   interval.
 */
private mergeIntervals(intervals: Array<[number, number]>): Array<[number, number]> {
  // Array.prototype.sort works in place, so sort a shallow copy instead.
  const sorted = [...intervals].sort((a, b) => a[0] - b[0]);
  const merged: Array<[number, number]> = [];
  for (const [start, end] of sorted) {
    const last = merged[merged.length - 1];
    if (last !== undefined && start <= last[1]) {
      // Overlaps or touches the previous interval: widen our own copy.
      last[1] = Math.max(last[1], end);
    } else {
      // Fresh tuple, so later widening never aliases an input interval.
      merged.push([start, end]);
    }
  }
  return merged;
}
362
391
 
363
392
  private buildSelfIntervals(