clawmatrix 0.2.7 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,581 @@
1
+ import * as Automerge from "@automerge/automerge";
2
+ import path from "node:path";
3
+ import { readFile, writeFile, mkdir } from "node:fs/promises";
4
+ import { homedir, tmpdir } from "node:os";
5
+
6
+ import { debug } from "./debug.ts";
7
+ import type { PeerManager } from "./peer-manager.ts";
8
+ import type { HealthSyncFrame } from "./types.ts";
9
+
10
/** Tag passed to `debug()` for every log line emitted by this module. */
const TAG = "health";

/** Retention period for health events (default 90 days), in milliseconds. */
const DEFAULT_RETENTION_MS = 90 * 24 * 60 * 60 * 1000;

/** Compact interval: every 24 hours, in milliseconds. */
const COMPACT_INTERVAL = 24 * 60 * 60 * 1000;

/** Save debounce interval (5 seconds), in milliseconds. */
const SAVE_DEBOUNCE = 5_000;
20
+
21
+ // ── Document schema ─────────────────────────────────────────────
22
+
23
/** One entry in a node's health log. */
export interface HealthEvent {
  /** Event time as a numeric timestamp (recorders in this module pass `Date.now()`). */
  ts: number;
  /** `start`/`stop` describe the recording node; `peer_online`/`peer_offline` describe `peer`. */
  type: "start" | "stop" | "peer_online" | "peer_offline";
  /** Node the event refers to, for the `peer_*` event types. */
  peer?: string;
  /** How the peer was reached: "direct" | "relay". */
  via?: string;
  /** Disconnect reason. */
  reason?: string;
}
30
+
31
/** Per-node slice of the shared health document. */
interface NodeHealthEntry {
  /** Events recorded under this node's id, in append order. */
  events: HealthEvent[];
  /** Timestamp of the most recent append to `events`. */
  lastUpdated: number;
}
35
+
36
/** Root shape of the Automerge health document: one entry per node id. */
export interface HealthDoc {
  nodes: Record<string, NodeHealthEntry>;
}
39
+
40
+ // ── Timeline aggregation ────────────────────────────────────────
41
+
42
/**
 * Aggregated state of one timeline bucket: "unknown" when the observing node
 * has no observable time in the bucket, otherwise derived from the share of
 * observable time the node was online.
 */
export type BucketState = "up" | "degraded" | "down" | "unknown";
43
+
44
/** Availability timeline of a single node over the requested range. */
export interface NodeTimeline {
  nodeId: string;
  /** Timestamp of the earliest event stored for the node. */
  firstSeen: number;
  /** `lastUpdated` of the node's entry in the health document. */
  lastSeen: number;
  /** One state per bucket, oldest bucket first. */
  buckets: BucketState[];
  /** Online time divided by observable time across all observable buckets. */
  uptimeRatio: number;
}
51
+
52
/** Result of an availability query over the health document. */
export interface AvailabilityResult {
  /** The range label that was requested. */
  range: string;
  /** Width of each bucket in minutes. */
  bucketMinutes: number;
  /** Start of the reported window. */
  startTs: number;
  /** End of the reported window. */
  endTs: number;
  /** One timeline per node that has at least one stored event. */
  nodes: NodeTimeline[];
  /** Time periods when the observing node was down (cannot observe). */
  gaps: Array<[number, number]>;
}
61
+
62
+ // ── HealthTracker ───────────────────────────────────────────────
63
+
64
/** Construction options for the health tracker. */
export interface HealthTrackerOptions {
  /** Id of the local node; events are recorded under this key. */
  nodeId: string;
  /** Peer manager whose router is used to send sync frames. */
  peerManager: PeerManager;
  /** Event retention in milliseconds; a module default is used when omitted. */
  retentionMs?: number;
  /** Override state directory (for tests). */
  stateDir?: string;
}
71
+
72
/**
 * Records health events for the local node in an Automerge document, syncs
 * that document with peers through `health_sync` frames, persists it to disk,
 * and aggregates the merged events into availability timelines.
 */
export class HealthTracker {
  private doc: Automerge.Doc<HealthDoc>;
  private syncStates = new Map<string, Automerge.SyncState>();
  private readonly nodeId: string;
  private readonly peerManager: PeerManager;
  private readonly retentionMs: number;
  private readonly docPath: string;
  private compactTimer: ReturnType<typeof setInterval> | null = null;
  private saveTimer: ReturnType<typeof setTimeout> | null = null;
  private dirty = false;

  /**
   * @param opts - Node id, peer manager, and optional retention / state-dir overrides.
   */
  constructor(opts: HealthTrackerOptions) {
    this.nodeId = opts.nodeId;
    this.peerManager = opts.peerManager;
    this.retentionMs = opts.retentionMs ?? DEFAULT_RETENTION_MS;

    const stateDir = opts.stateDir ?? path.join(homedir() || tmpdir(), ".openclaw", "clawmatrix");
    this.docPath = path.join(stateDir, "health.automerge");

    // Initialize empty doc (will be replaced by load if file exists).
    // NOTE(review): every fresh tracker creates its own `nodes` map in an
    // independent document; when two such documents are later synced the root
    // `nodes` key may hold conflicting values — confirm against Automerge's
    // conflict semantics.
    this.doc = Automerge.init<HealthDoc>();
    this.doc = Automerge.change(this.doc, (d) => {
      (d as HealthDoc).nodes = {};
    });
  }

  /** Load the persisted document, record a `start` event, compact, and schedule periodic compaction. */
  async start(): Promise<void> {
    await this.load();

    this.recordEvent({ ts: Date.now(), type: "start" });

    // Compact old events on start, then once per COMPACT_INTERVAL.
    this.compact();
    this.compactTimer = setInterval(() => this.compact(), COMPACT_INTERVAL);

    debug(TAG, `health tracker started for node "${this.nodeId}"`);
  }

  /** Record a `stop` event, cancel all timers, and write the document one last time. */
  async stop(): Promise<void> {
    this.recordEvent({ ts: Date.now(), type: "stop" });

    if (this.compactTimer) {
      clearInterval(this.compactTimer);
      this.compactTimer = null;
    }
    if (this.saveTimer) {
      clearTimeout(this.saveTimer);
      this.saveTimer = null;
    }

    // Final save covers whatever the cancelled debounce timer would have written.
    this.dirty = false;
    await this.save();
    debug(TAG, "health tracker stopped");
  }

  // ── Event recording ─────────────────────────────────────────

  /**
   * Append an event to the local node's log, then schedule a save and push
   * the change to all peers.
   *
   * Fields whose value is `undefined` are dropped before the event is stored,
   * because Automerge refuses to assign `undefined` inside a change.
   */
  recordEvent(event: HealthEvent): void {
    const stored = HealthTracker.withoutUndefined(event);
    this.doc = Automerge.change(this.doc, (d) => {
      if (!d.nodes[this.nodeId]) {
        d.nodes[this.nodeId] = { events: [], lastUpdated: 0 };
      }
      const entry = d.nodes[this.nodeId]!;
      entry.events.push(stored);
      entry.lastUpdated = Date.now();
    });
    this.scheduleSave();
    this.broadcastSync();
  }

  /** Record that `peerId` became reachable, either directly or through a relay. */
  recordPeerOnline(peerId: string, via: "direct" | "relay"): void {
    this.recordEvent({ ts: Date.now(), type: "peer_online", peer: peerId, via });
  }

  /** Record that `peerId` became unreachable; `reason` is stored only when provided. */
  recordPeerOffline(peerId: string, reason?: string): void {
    this.recordEvent({ ts: Date.now(), type: "peer_offline", peer: peerId, reason });
  }

  /** Shallow copy of `event` without any property whose value is `undefined`. */
  private static withoutUndefined(event: HealthEvent): HealthEvent {
    return Object.fromEntries(
      Object.entries(event).filter(([, value]) => value !== undefined),
    ) as unknown as HealthEvent;
  }

  // ── Sync protocol ──────────────────────────────────────────

  /**
   * Handle incoming health_sync frame from a peer: merge it into the local
   * document and answer with our own sync message. Any failure, including a
   * malformed payload, is logged and swallowed.
   */
  handleSyncMessage(frame: HealthSyncFrame): void {
    const peerId = frame.from;

    try {
      // Decoding happens inside the try so a broken frame cannot throw into the caller.
      const message = new Uint8Array(Buffer.from(frame.payload.data, "base64"));
      const syncState = this.syncStates.get(peerId) ?? Automerge.initSyncState();
      const [newDoc, newSyncState] = Automerge.receiveSyncMessage(this.doc, syncState, message);
      this.doc = newDoc;
      this.syncStates.set(peerId, newSyncState);
      this.scheduleSave();

      // Send our response
      this.sendSyncMessage(peerId);
    } catch (err) {
      debug(TAG, `error handling sync from ${peerId}: ${err}`);
    }
  }

  /** Initiate sync with a peer (called on peer connect). No-op for the local node id. */
  initPeerSync(peerId: string): void {
    if (peerId === this.nodeId) return;
    this.syncStates.set(peerId, Automerge.initSyncState());
    this.sendSyncMessage(peerId);
  }

  /** Clean up sync state for a disconnected peer. */
  removePeerSync(peerId: string): void {
    this.syncStates.delete(peerId);
  }

  /** Generate the next sync message for `peerId` and send it, if there is one. */
  private sendSyncMessage(peerId: string): void {
    const syncState = this.syncStates.get(peerId) ?? Automerge.initSyncState();
    const [newSyncState, message] = Automerge.generateSyncMessage(this.doc, syncState);
    this.syncStates.set(peerId, newSyncState);

    // No message generated: nothing to send to this peer.
    if (!message) return;

    debug(TAG, `sending health sync to ${peerId} (${message.byteLength} bytes)`);

    const frame: HealthSyncFrame = {
      type: "health_sync",
      from: this.nodeId,
      to: peerId,
      timestamp: Date.now(),
      payload: {
        data: Buffer.from(message).toString("base64"),
      },
    };

    this.peerManager.router.sendTo(peerId, frame);
  }

  /** Send a sync message to every peer the router currently knows. */
  private broadcastSync(): void {
    const peers = this.peerManager.router.getAllPeers();
    for (const peer of peers) {
      this.sendSyncMessage(peer.nodeId);
    }
  }

  // ── Timeline aggregation ──────────────────────────────────

  /**
   * Build availability timeline for all known nodes.
   * @param range - "24h" (30-minute buckets), "7d" (4-hour buckets), or "90d" (1-day buckets)
   * @returns Window bounds, per-node timelines, and the local node's observation gaps.
   */
  getAvailability(range: "24h" | "7d" | "90d" = "24h"): AvailabilityResult {
    const now = Date.now();
    let durationMs: number;
    let bucketMinutes: number;

    switch (range) {
      case "24h":
        durationMs = 24 * 60 * 60 * 1000;
        bucketMinutes = 30;
        break;
      case "7d":
        durationMs = 7 * 24 * 60 * 60 * 1000;
        bucketMinutes = 60 * 4; // 4-hour buckets
        break;
      case "90d":
        durationMs = 90 * 24 * 60 * 60 * 1000;
        bucketMinutes = 60 * 24; // 1-day buckets
        break;
    }

    const startTs = now - durationMs;
    const endTs = now;
    const bucketMs = bucketMinutes * 60 * 1000;
    const bucketCount = Math.ceil(durationMs / bucketMs);

    // Find observation gaps (periods where THIS node was down)
    const gaps = this.getObservationGaps(this.nodeId, startTs, endTs);

    // Build timeline for each node (including self)
    const nodes: NodeTimeline[] = [];
    for (const [nodeId, entry] of Object.entries(this.doc.nodes)) {
      const timeline = this.buildNodeTimeline(
        nodeId,
        entry,
        startTs,
        endTs,
        bucketMs,
        bucketCount,
        gaps,
      );
      if (timeline) nodes.push(timeline);
    }

    return { range, bucketMinutes, startTs, endTs, nodes, gaps };
  }

  /**
   * Aggregate one node's online intervals into per-bucket states.
   *
   * Only time the local node could observe counts: both the online time and
   * the denominator exclude `observerGaps`, so ratios never exceed 1.
   *
   * @returns `null` when the node has no stored events.
   */
  private buildNodeTimeline(
    nodeId: string,
    entry: NodeHealthEntry,
    startTs: number,
    endTs: number,
    bucketMs: number,
    bucketCount: number,
    observerGaps: Array<[number, number]>,
  ): NodeTimeline | null {
    const events = [...entry.events].sort((a, b) => a.ts - b.ts);
    if (events.length === 0) return null;

    const firstSeen = events[0]!.ts;
    const lastSeen = entry.lastUpdated;

    const intervals = this.buildOnlineIntervals(nodeId, events, startTs, endTs);

    const buckets: BucketState[] = [];
    let totalOnline = 0;
    let totalObservable = 0;

    for (let i = 0; i < bucketCount; i++) {
      const bStart = startTs + i * bucketMs;
      const bEnd = Math.min(bStart + bucketMs, endTs);

      // How much of this bucket is observable (subtract observer gaps)
      const observableMs = this.observableTimeInRange(bStart, bEnd, observerGaps);
      if (observableMs === 0) {
        buckets.push("unknown");
        continue;
      }

      // Online time inside the observable part of the bucket only. A peer's
      // interval can run through an observer gap (no peer_offline was recorded
      // before the observer went down), which must not inflate the ratio.
      const onlineMs = Math.min(
        this.observedOnlineMs(intervals, bStart, bEnd, observerGaps),
        observableMs,
      );
      const ratio = onlineMs / observableMs;

      totalOnline += onlineMs;
      totalObservable += observableMs;

      if (ratio >= 0.95) buckets.push("up");
      else if (ratio >= 0.05) buckets.push("degraded");
      else buckets.push("down");
    }

    const uptimeRatio = totalObservable > 0 ? totalOnline / totalObservable : 0;

    return { nodeId, firstSeen, lastSeen, buckets, uptimeRatio };
  }

  /**
   * Build online intervals for a node.
   * - For self node: uses start/stop events from `events`
   * - For other nodes: uses peer_online/peer_offline events from all observers
   */
  private buildOnlineIntervals(
    nodeId: string,
    events: HealthEvent[],
    startTs: number,
    endTs: number,
  ): Array<[number, number]> {
    if (nodeId === this.nodeId) {
      return this.buildSelfIntervals(events, startTs, endTs);
    }
    // Remote node: scan every node's log in the merged document for
    // peer_online/peer_offline events that reference it.
    return this.buildPeerIntervals(nodeId, startTs, endTs);
  }

  /**
   * Turn a time-sorted list of start/stop events into online intervals clipped
   * to [startTs, endTs]. A `start` that follows another `start` without a
   * `stop` in between restarts the interval at the later event.
   */
  private buildSelfIntervals(
    events: HealthEvent[],
    startTs: number,
    endTs: number,
  ): Array<[number, number]> {
    const intervals: Array<[number, number]> = [];
    let onlineSince: number | null = null;

    for (const ev of events) {
      if (ev.ts < startTs) {
        // Track state before window
        if (ev.type === "start") onlineSince = ev.ts;
        else if (ev.type === "stop") onlineSince = null;
        continue;
      }
      if (ev.ts > endTs) break;

      if (ev.type === "start") {
        onlineSince = ev.ts;
      } else if (ev.type === "stop" && onlineSince !== null) {
        intervals.push([Math.max(onlineSince, startTs), ev.ts]);
        onlineSince = null;
      }
    }

    // If still online at end of window
    if (onlineSince !== null) {
      intervals.push([Math.max(onlineSince, startTs), endTs]);
    }

    return intervals;
  }

  /**
   * Build online intervals for `targetNodeId` from the peer_online/peer_offline
   * events that any node in the document recorded about it, clipped to
   * [startTs, endTs].
   */
  private buildPeerIntervals(
    targetNodeId: string,
    startTs: number,
    endTs: number,
  ): Array<[number, number]> {
    const intervals: Array<[number, number]> = [];

    // Collect peer_online/peer_offline events from all observer nodes
    const relevantEvents: HealthEvent[] = [];
    for (const entry of Object.values(this.doc.nodes)) {
      for (const ev of entry.events) {
        if (ev.peer === targetNodeId && (ev.type === "peer_online" || ev.type === "peer_offline")) {
          relevantEvents.push(ev);
        }
      }
    }
    relevantEvents.sort((a, b) => a.ts - b.ts);

    let onlineSince: number | null = null;

    for (const ev of relevantEvents) {
      if (ev.ts < startTs) {
        // Track state before window
        if (ev.type === "peer_online") onlineSince = ev.ts;
        else if (ev.type === "peer_offline") onlineSince = null;
        continue;
      }
      if (ev.ts > endTs) break;

      if (ev.type === "peer_online") {
        // Keep the earliest peer_online of an uninterrupted online run.
        if (onlineSince === null) onlineSince = ev.ts;
      } else if (ev.type === "peer_offline" && onlineSince !== null) {
        intervals.push([Math.max(onlineSince, startTs), ev.ts]);
        onlineSince = null;
      }
    }

    if (onlineSince !== null) {
      intervals.push([Math.max(onlineSince, startTs), endTs]);
    }

    return intervals;
  }

  /**
   * Get observation gaps: periods within [startTs, endTs] when `nodeId` was not
   * running according to its own start/stop events. With no data for the node
   * the whole range is one gap.
   */
  private getObservationGaps(
    nodeId: string,
    startTs: number,
    endTs: number,
  ): Array<[number, number]> {
    const entry = this.doc.nodes[nodeId];
    if (!entry) return [[startTs, endTs]]; // no data = entire range is a gap

    const selfIntervals = this.buildSelfIntervals(
      [...entry.events].sort((a, b) => a.ts - b.ts),
      startTs,
      endTs,
    );

    // Gaps are the complement of self intervals within [startTs, endTs]
    const gaps: Array<[number, number]> = [];
    let cursor = startTs;

    for (const [start, end] of selfIntervals) {
      if (start > cursor) {
        gaps.push([cursor, start]);
      }
      cursor = Math.max(cursor, end);
    }

    if (cursor < endTs) {
      gaps.push([cursor, endTs]);
    }

    return gaps;
  }

  /** Calculate observable time in [start, end], excluding gaps; never negative. */
  private observableTimeInRange(
    start: number,
    end: number,
    gaps: Array<[number, number]>,
  ): number {
    let total = end - start;
    for (const [gStart, gEnd] of gaps) {
      const overlapStart = Math.max(start, gStart);
      const overlapEnd = Math.min(end, gEnd);
      if (overlapStart < overlapEnd) {
        total -= overlapEnd - overlapStart;
      }
    }
    return Math.max(0, total);
  }

  /**
   * Online time inside [start, end] that does not fall into any gap.
   * Assumes `gaps` do not overlap each other (as produced by `getObservationGaps`).
   */
  private observedOnlineMs(
    intervals: Array<[number, number]>,
    start: number,
    end: number,
    gaps: Array<[number, number]>,
  ): number {
    let total = this.overlapMs(intervals, start, end);
    for (const [gStart, gEnd] of gaps) {
      const from = Math.max(start, gStart);
      const to = Math.min(end, gEnd);
      if (from < to) {
        total -= this.overlapMs(intervals, from, to);
      }
    }
    return Math.max(0, total);
  }

  /** Calculate total overlap between intervals and the range [start, end]. */
  private overlapMs(intervals: Array<[number, number]>, start: number, end: number): number {
    let total = 0;
    for (const [iStart, iEnd] of intervals) {
      const overlapStart = Math.max(start, iStart);
      const overlapEnd = Math.min(end, iEnd);
      if (overlapStart < overlapEnd) {
        total += overlapEnd - overlapStart;
      }
    }
    return total;
  }

  // ── Compact ───────────────────────────────────────────────

  /**
   * Drop events older than the retention cutoff from every node's log, keeping
   * the last event before the cutoff to preserve state continuity.
   */
  private compact(): void {
    const cutoff = Date.now() - this.retentionMs;
    let pruned = 0;

    this.doc = Automerge.change(this.doc, (d) => {
      for (const node of Object.values(d.nodes)) {
        const before = node.events.length;
        let lastBeforeCutoff = -1;
        for (let i = 0; i < node.events.length; i++) {
          if (node.events[i]!.ts < cutoff) lastBeforeCutoff = i;
        }
        // Index 0 means the only stale event is the one we keep anyway.
        if (lastBeforeCutoff > 0) {
          // Remove all events before the last one before cutoff
          node.events.splice(0, lastBeforeCutoff);
          pruned += before - node.events.length;
        }
      }
    });

    if (pruned > 0) {
      debug(TAG, `compacted ${pruned} old events`);
      this.recompact();
    }
  }

  /**
   * Round-trip the document through save/load and schedule a save.
   * NOTE(review): intended to discard op history for removed data; whether
   * `Automerge.save` actually drops that history should be confirmed.
   */
  private recompact(): void {
    const bytes = Automerge.save(this.doc);
    this.doc = Automerge.load<HealthDoc>(bytes);
    this.scheduleSave();
  }

  // ── Persistence ───────────────────────────────────────────

  /**
   * Replace the in-memory document with the persisted one, if it can be read.
   * A missing file is the normal first-run case; any other read or decode
   * failure is logged with its cause and the fresh document is kept.
   */
  private async load(): Promise<void> {
    try {
      const data = await readFile(this.docPath);
      this.doc = Automerge.load<HealthDoc>(new Uint8Array(data));
      debug(TAG, `loaded health doc from ${this.docPath}`);
    } catch (err) {
      const code = (err as { code?: unknown } | null)?.code;
      if (code === "ENOENT") {
        debug(TAG, "no existing health doc, starting fresh");
      } else {
        debug(TAG, `could not load health doc from ${this.docPath}, starting fresh: ${err}`);
      }
    }
  }

  /** Serialize the document and write it to `docPath`, creating the directory if needed. Errors are logged, not thrown. */
  private async save(): Promise<void> {
    try {
      const data = Automerge.save(this.doc);
      await mkdir(path.dirname(this.docPath), { recursive: true });
      await writeFile(this.docPath, Buffer.from(data));
    } catch (err) {
      debug(TAG, `failed to save health doc: ${err}`);
    }
  }

  /** Mark the document dirty and arm the debounce timer unless one is already pending. */
  private scheduleSave(): void {
    this.dirty = true;
    if (this.saveTimer) return;
    this.saveTimer = setTimeout(() => {
      this.saveTimer = null;
      if (this.dirty) {
        this.dirty = false;
        this.save().catch((err) => {
          debug(TAG, `deferred save error: ${err}`);
        });
      }
    }, SAVE_DEBOUNCE);
  }

  // ── Public accessors ──────────────────────────────────────

  /** Get all known node IDs (including offline ones). */
  getKnownNodes(): string[] {
    return Object.keys(this.doc.nodes);
  }

  /** Get a copy of the raw events for a specific node; empty when the node is unknown. */
  getNodeEvents(nodeId: string): HealthEvent[] {
    return [...(this.doc.nodes[nodeId]?.events ?? [])];
  }
}
@@ -1090,13 +1090,28 @@ export class ModelProxy {
1090
1090
  ?? this.config.models.find((m) => m.id === payload.model)
1091
1091
  : this.config.models.find((m) => m.id === payload.model);
1092
1092
  if (!model) {
1093
+ // Model not available locally — try forwarding to a remote node that has it
1094
+ const originalTarget = frame.to;
1095
+ const exclude = new Set([this.config.nodeId]);
1096
+ if (originalTarget) exclude.add(originalTarget);
1097
+ const alternatives = this.peerManager.router.findNodesForModel(payload.model, exclude);
1098
+ for (const alt of alternatives) {
1099
+ if (this.peerManager.sendTo(alt.nodeId, { ...frame, to: alt.nodeId })) {
1100
+ debug("model_req", `failover: ${originalTarget ?? "local"} → ${alt.nodeId} for "${payload.model}"`);
1101
+ return;
1102
+ }
1103
+ }
1104
+ // No alternative found
1105
+ const hint = originalTarget && originalTarget !== this.config.nodeId
1106
+ ? `Target node "${originalTarget}" is unreachable and no alternative nodes provide model "${payload.model}"`
1107
+ : `Model "${payload.model}" not available locally`;
1093
1108
  this.peerManager.sendTo(from, {
1094
1109
  type: "model_res",
1095
1110
  id,
1096
1111
  from: this.config.nodeId,
1097
1112
  to: from,
1098
1113
  timestamp: Date.now(),
1099
- payload: { success: false, error: `Model "${payload.model}" not available locally` },
1114
+ payload: { success: false, error: hint },
1100
1115
  } satisfies ModelResponse);
1101
1116
  return;
1102
1117
  }