clawmatrix 0.3.1 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -105,6 +105,8 @@ export class ClusterRuntime {
105
105
  // Pre-built indexes for O(1) local agent lookup
106
106
  private agentById = new Map<string, ClawMatrixConfig["agents"][number]>();
107
107
  private agentsByTag = new Map<string, ClawMatrixConfig["agents"][number]>();
108
+ /** Track known relay peers to record health events on discovery/removal. */
109
+ private knownRelayPeers = new Set<string>();
108
110
 
109
111
  constructor(config: ClawMatrixConfig, logger: PluginLogger, openclawConfig: OpenClawConfig, openclawVersion?: string) {
110
112
  this.config = config;
@@ -131,6 +133,12 @@ export class ClusterRuntime {
131
133
  this.agentById.set(a.id, a);
132
134
  for (const t of a.tags) this.agentsByTag.set(t, a);
133
135
  }
136
+
137
+ // Wire up active task checker for route probing (prevents switching mid-task)
138
+ this.peerManager.setActiveTaskChecker((nodeId) => {
139
+ return this.handoffManager.hasPendingForNode(nodeId)
140
+ || this.modelProxy.hasPendingForNode(nodeId);
141
+ });
134
142
  }
135
143
 
136
144
  async start() {
@@ -142,8 +150,11 @@ export class ClusterRuntime {
142
150
  this.peerManager.on("peerConnected", (nodeId) => {
143
151
  this.logger.info(`[clawmatrix] Peer connected: ${nodeId}`);
144
152
  this.refreshDiscoveredModels();
145
- this.healthTracker.recordPeerOnline(nodeId, "direct");
153
+ // Init sync state BEFORE recording the event — recordPeerOnline triggers
154
+ // broadcastSync which must use the freshly initialized syncState.
155
+ // Reversing this order causes syncState corruption and infinite sync loops.
146
156
  this.healthTracker.initPeerSync(nodeId);
157
+ this.healthTracker.recordPeerOnline(nodeId, "direct");
147
158
  });
148
159
 
149
160
  this.peerManager.on("peerDisconnected", (nodeId) => {
@@ -155,6 +166,7 @@ export class ClusterRuntime {
155
166
 
156
167
  this.peerManager.on("peerCapabilitiesChanged", () => {
157
168
  this.refreshDiscoveredModels();
169
+ this.trackRelayPeerHealth();
158
170
  });
159
171
 
160
172
  // Web dashboard (must be set before peerManager.start() creates the HTTP server)
@@ -268,8 +280,33 @@ export class ClusterRuntime {
268
280
  // NOTE: intentionally do NOT stop sentinel here.
269
281
  // Sentinel must survive gateway shutdown — that's its entire purpose.
270
282
  // It will be replaced by killOldSentinel() on next gateway start.
283
+
284
+ // Wrap all async shutdown in a 4s timeout to prevent blocking gateway restart.
285
+ // OpenClaw's force-exit timer is 5s for stop, so we must finish before that.
286
+ const STOP_TIMEOUT_MS = 4000;
287
+ let timer: ReturnType<typeof setTimeout> | undefined;
288
+ await Promise.race([
289
+ this.stopInternal().then(() => { clearTimeout(timer); }),
290
+ new Promise<void>((resolve) => {
291
+ timer = setTimeout(() => {
292
+ this.logger.warn("[clawmatrix] Graceful shutdown timed out after 4s, forcing cleanup");
293
+ this.forceCleanup();
294
+ resolve();
295
+ }, STOP_TIMEOUT_MS);
296
+ }),
297
+ ]);
298
+ this.logger.info(`[clawmatrix] Node "${this.config.nodeId}" stopped`);
299
+ }
300
+
301
+ private async stopInternal() {
271
302
  await this.healthTracker.stop();
272
303
  await this.knowledgeSync?.stop();
304
+ this.syncCleanup();
305
+ await this.peerManager.stop();
306
+ }
307
+
308
+ /** Synchronous cleanup that never blocks. */
309
+ private syncCleanup() {
273
310
  this.webHandler?.destroy();
274
311
  this.handoffManager.destroy();
275
312
  this.acpProxy?.destroy();
@@ -277,8 +314,12 @@ export class ClusterRuntime {
277
314
  this.modelProxy.stop();
278
315
  this.fileTransferManager?.destroy();
279
316
  this.toolProxy.destroy();
280
- await this.peerManager.stop();
281
- this.logger.info(`[clawmatrix] Node "${this.config.nodeId}" stopped`);
317
+ }
318
+
319
+ /** Emergency cleanup when graceful shutdown times out. */
320
+ private forceCleanup() {
321
+ try { this.syncCleanup(); } catch { /* best effort */ }
322
+ try { this.peerManager.forceStop(); } catch { /* best effort */ }
282
323
  }
283
324
 
284
325
  private refreshDiscoveredModels() {
@@ -286,6 +327,32 @@ export class ClusterRuntime {
286
327
  this.modelProxy.updateDiscoveredModels(peers);
287
328
  }
288
329
 
330
+ /** Track relay peer health: record peer_online/peer_offline for relay peers. */
331
+ private trackRelayPeerHealth() {
332
+ const currentRelayPeers = new Set<string>();
333
+ for (const peer of this.peerManager.router.getAllPeers()) {
334
+ if (peer.reachableVia) {
335
+ currentRelayPeers.add(peer.nodeId);
336
+ }
337
+ }
338
+
339
+ // New relay peers discovered
340
+ for (const nodeId of currentRelayPeers) {
341
+ if (!this.knownRelayPeers.has(nodeId)) {
342
+ this.healthTracker.recordPeerOnline(nodeId, "relay");
343
+ }
344
+ }
345
+
346
+ // Relay peers that disappeared
347
+ for (const nodeId of this.knownRelayPeers) {
348
+ if (!currentRelayPeers.has(nodeId)) {
349
+ this.healthTracker.recordPeerOffline(nodeId, "relay_route_lost");
350
+ }
351
+ }
352
+
353
+ this.knownRelayPeers = currentRelayPeers;
354
+ }
355
+
289
356
  /** Fetch tool catalog from the local OpenClaw gateway and advertise to peers. */
290
357
  private async fetchToolCatalog() {
291
358
  const { spawnProcess } = await import("./compat.ts");
@@ -576,6 +643,12 @@ export class ClusterRuntime {
576
643
  this.acpProxy.handleSetConfigRequest(frame as import("./types.ts").AcpSetConfigRequest).catch((err) => {
577
644
  this.logger.error(`[clawmatrix] ACP set config error: ${err}`);
578
645
  });
646
+ } else {
647
+ const cf = frame as import("./types.ts").AcpSetConfigRequest;
648
+ this.peerManager.sendTo(cf.from, {
649
+ type: "acp_set_config_res", id: cf.id, from: this.config.nodeId, to: cf.from,
650
+ timestamp: Date.now(), payload: { success: false, error: "ACP not enabled on this node" },
651
+ } as import("./types.ts").AcpSetConfigResponse);
579
652
  }
580
653
  break;
581
654
  case "acp_set_config_res":
@@ -586,6 +659,12 @@ export class ClusterRuntime {
586
659
  this.acpProxy.handleSubscribeRequest(frame as import("./types.ts").AcpSubscribeRequest).catch((err) => {
587
660
  this.logger.error(`[clawmatrix] ACP subscribe error: ${err}`);
588
661
  });
662
+ } else {
663
+ const sf = frame as import("./types.ts").AcpSubscribeRequest;
664
+ this.peerManager.sendTo(sf.from, {
665
+ type: "acp_subscribe_res", id: sf.id, from: this.config.nodeId, to: sf.from,
666
+ timestamp: Date.now(), payload: { success: false, error: "ACP not enabled on this node" },
667
+ } as import("./types.ts").AcpSubscribeResponse);
589
668
  }
590
669
  break;
591
670
  case "acp_unsubscribe":
package/src/config.ts CHANGED
@@ -51,7 +51,8 @@ const ModelInfoSchema = z.object({
51
51
 
52
52
  const PeerConfigSchema = z.object({
53
53
  nodeId: z.string(),
54
- url: z.string(),
54
+ /** Single URL or array of URLs for multi-channel connections. */
55
+ url: z.union([z.string(), z.array(z.string()).min(1)]),
55
56
  });
56
57
 
57
58
  const ToolProxyConfigSchema = z.object({
package/src/handoff.ts CHANGED
@@ -678,6 +678,14 @@ export class HandoffManager {
678
678
  }
679
679
 
680
680
  /** Clean up on shutdown. */
681
+ /** Check if there are pending outbound handoffs targeting a specific node. */
682
+ hasPendingForNode(nodeId: string): boolean {
683
+ for (const p of this.pending.values()) {
684
+ if (p.targetNodeId === nodeId) return true;
685
+ }
686
+ return false;
687
+ }
688
+
681
689
  destroy() {
682
690
  if (this.staleCleanupTimer) {
683
691
  clearInterval(this.staleCleanupTimer);
@@ -79,6 +79,12 @@ export class HealthTracker {
79
79
  private compactTimer: ReturnType<typeof setInterval> | null = null;
80
80
  private saveTimer: ReturnType<typeof setTimeout> | null = null;
81
81
  private dirty = false;
82
+ /** Debounce timer for broadcastSync (prevents rapid-fire broadcasts). */
83
+ private broadcastTimer: ReturnType<typeof setTimeout> | null = null;
84
+ /** Round counter per peer to detect non-converging sync loops. */
85
+ private syncRounds = new Map<string, number>();
86
+ private static readonly MAX_SYNC_ROUNDS = 10;
87
+ private static readonly BROADCAST_DEBOUNCE = 500; // ms
82
88
 
83
89
  constructor(opts: HealthTrackerOptions) {
84
90
  this.nodeId = opts.nodeId;
@@ -119,6 +125,10 @@ export class HealthTracker {
119
125
  clearInterval(this.compactTimer);
120
126
  this.compactTimer = null;
121
127
  }
128
+ if (this.broadcastTimer) {
129
+ clearTimeout(this.broadcastTimer);
130
+ this.broadcastTimer = null;
131
+ }
122
132
  if (this.saveTimer) {
123
133
  clearTimeout(this.saveTimer);
124
134
  this.saveTimer = null;
@@ -165,6 +175,16 @@ export class HealthTracker {
165
175
  const message = new Uint8Array(Buffer.from(frame.payload.data, "base64"));
166
176
  const syncKey = peerId;
167
177
 
178
+ // Guard against non-converging sync loops
179
+ const rounds = (this.syncRounds.get(peerId) ?? 0) + 1;
180
+ if (rounds > HealthTracker.MAX_SYNC_ROUNDS) {
181
+ debug(TAG, `sync with ${peerId} exceeded ${HealthTracker.MAX_SYNC_ROUNDS} rounds, resetting`);
182
+ this.syncStates.set(syncKey, Automerge.initSyncState());
183
+ this.syncRounds.delete(peerId);
184
+ return;
185
+ }
186
+ this.syncRounds.set(peerId, rounds);
187
+
168
188
  try {
169
189
  const syncState = this.syncStates.get(syncKey) ?? Automerge.initSyncState();
170
190
  const [newDoc, newSyncState] = Automerge.receiveSyncMessage(this.doc, syncState, message);
@@ -172,18 +192,19 @@ export class HealthTracker {
172
192
  this.syncStates.set(syncKey, newSyncState);
173
193
  this.scheduleSave();
174
194
 
175
- // Send our response
195
+ // Send our response (only if there's something to send)
176
196
  this.sendSyncMessage(peerId);
177
197
  } catch (err) {
178
198
  debug(TAG, `error handling sync from ${peerId}: ${err}`);
179
199
  }
180
200
  }
181
201
 
182
- /** Initiate sync with a peer (called on peer connect). */
202
+ /** Initialize sync state for a peer (called on peer connect).
203
+ * Does NOT send a message — the subsequent recordPeerOnline → broadcastSync handles that.
204
+ * Sending here would race with broadcastSync and corrupt the sync state. */
183
205
  initPeerSync(peerId: string) {
184
206
  if (peerId === this.nodeId) return;
185
207
  this.syncStates.set(peerId, Automerge.initSyncState());
186
- this.sendSyncMessage(peerId);
187
208
  }
188
209
 
189
210
  /** Clean up sync state for a disconnected peer. */
@@ -196,7 +217,11 @@ export class HealthTracker {
196
217
  const [newSyncState, message] = Automerge.generateSyncMessage(this.doc, syncState);
197
218
  this.syncStates.set(peerId, newSyncState);
198
219
 
199
- if (!message) return;
220
+ if (!message) {
221
+ // Sync converged — reset round counter
222
+ this.syncRounds.delete(peerId);
223
+ return;
224
+ }
200
225
 
201
226
  debug(TAG, `sending health sync to ${peerId} (${message.byteLength} bytes)`);
202
227
 
@@ -214,10 +239,17 @@ export class HealthTracker {
214
239
  }
215
240
 
216
241
  private broadcastSync() {
217
- const peers = this.peerManager.router.getAllPeers();
218
- for (const peer of peers) {
219
- this.sendSyncMessage(peer.nodeId);
220
- }
242
+ // Debounce: multiple events in quick succession → single broadcast
243
+ if (this.broadcastTimer) return;
244
+ this.broadcastTimer = setTimeout(() => {
245
+ this.broadcastTimer = null;
246
+ // Reset round counters — new broadcast starts fresh sync cycle
247
+ this.syncRounds.clear();
248
+ const peers = this.peerManager.router.getAllPeers();
249
+ for (const peer of peers) {
250
+ this.sendSyncMessage(peer.nodeId);
251
+ }
252
+ }, HealthTracker.BROADCAST_DEBOUNCE);
221
253
  }
222
254
 
223
255
  // ── Timeline aggregation ──────────────────────────────────
@@ -257,9 +289,18 @@ export class HealthTracker {
257
289
  // Build timeline for each node (including self)
258
290
  const nodes: NodeTimeline[] = [];
259
291
 
292
+ // Collect nodeIds that have ever had peer_online events (actually connected)
293
+ const everConnected = new Set<string>();
294
+ for (const [, entry] of Object.entries(this.doc.nodes)) {
295
+ for (const ev of entry.events) {
296
+ if (ev.type === "peer_online" && ev.peer) everConnected.add(ev.peer);
297
+ }
298
+ }
299
+
260
300
  for (const [nodeId, entry] of Object.entries(this.doc.nodes)) {
261
- // For self, use start/stop events to determine uptime
262
- // For other nodes, use peer_online/peer_offline from the observing node
301
+ // Remote nodes: must have successfully connected at some point
302
+ if (nodeId !== this.nodeId && !everConnected.has(nodeId)) continue;
303
+
263
304
  const timeline = this.buildNodeTimeline(
264
305
  nodeId,
265
306
  entry,
@@ -269,7 +310,10 @@ export class HealthTracker {
269
310
  bucketCount,
270
311
  gaps,
271
312
  );
272
- if (timeline) nodes.push(timeline);
313
+ if (!timeline) continue;
314
+ // Hide remote nodes that were offline for the entire requested range
315
+ if (nodeId !== this.nodeId && timeline.uptimeRatio === 0) continue;
316
+ nodes.push(timeline);
273
317
  }
274
318
 
275
319
  return {
@@ -345,19 +389,36 @@ export class HealthTracker {
345
389
  startTs: number,
346
390
  endTs: number,
347
391
  ): Array<[number, number]> {
348
- const intervals: Array<[number, number]> = [];
349
-
350
392
  if (nodeId === this.nodeId) {
351
393
  // Self: start/stop events define uptime
352
- // But we're looking at all nodes' data, so check if this nodeId
353
- // has start/stop events (each node writes its own start/stop)
354
394
  return this.buildSelfIntervals(events, startTs, endTs);
355
395
  }
356
396
 
357
- // For remote nodes: gather peer_online/peer_offline events from all observer nodes
358
- // We have the CRDT doc with all nodes' events merged
359
- // Look through ALL nodes' events for peer_online/peer_offline referencing this nodeId
360
- return this.buildPeerIntervals(nodeId, startTs, endTs);
397
+ // For remote nodes: use BOTH self-reported start/stop intervals AND
398
+ // peer_online/peer_offline observations, then merge for best accuracy.
399
+ // Self-reported intervals are the primary signal (the node knows when
400
+ // it was running); peer observations supplement for relay peers or
401
+ // when CRDT sync hasn't propagated the remote node's own events.
402
+ const selfIntervals = this.buildSelfIntervals(events, startTs, endTs);
403
+ const peerIntervals = this.buildPeerIntervals(nodeId, startTs, endTs);
404
+ return this.mergeIntervals([...selfIntervals, ...peerIntervals]);
405
+ }
406
+
407
+ /** Merge overlapping intervals into a sorted, non-overlapping set. */
408
+ private mergeIntervals(intervals: Array<[number, number]>): Array<[number, number]> {
409
+ if (intervals.length <= 1) return intervals;
410
+ intervals.sort((a, b) => a[0] - b[0]);
411
+ const merged: Array<[number, number]> = [intervals[0]!];
412
+ for (let i = 1; i < intervals.length; i++) {
413
+ const prev = merged[merged.length - 1]!;
414
+ const cur = intervals[i]!;
415
+ if (cur[0] <= prev[1]) {
416
+ prev[1] = Math.max(prev[1], cur[1]);
417
+ } else {
418
+ merged.push(cur);
419
+ }
420
+ }
421
+ return merged;
361
422
  }
362
423
 
363
424
  private buildSelfIntervals(