clawmatrix 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -21
- package/cli/bin/clawmatrix.mjs +300 -1
- package/package.json +8 -1
- package/src/acp-proxy.ts +122 -50
- package/src/{web.ts → api.ts} +646 -25
- package/src/audit.ts +37 -2
- package/src/auth.ts +5 -10
- package/src/automation.ts +625 -0
- package/src/cluster-service.ts +172 -16
- package/src/compat.ts +103 -0
- package/src/config.ts +75 -27
- package/src/connection.ts +215 -37
- package/src/crypto.ts +72 -5
- package/src/device-info.ts +21 -2
- package/src/file-transfer.ts +3 -2
- package/src/handoff.ts +90 -32
- package/src/health-tracker.ts +91 -356
- package/src/index.ts +421 -13
- package/src/kanban.ts +507 -0
- package/src/knowledge-sync.ts +158 -7
- package/src/local-tools.ts +65 -2
- package/src/log-replication.ts +198 -0
- package/src/model-proxy.ts +152 -60
- package/src/peer-approval.ts +3 -2
- package/src/peer-manager.ts +236 -44
- package/src/retry.ts +81 -0
- package/src/router.ts +152 -104
- package/src/sentinel.ts +85 -51
- package/src/store.ts +578 -0
- package/src/terminal.ts +17 -8
- package/src/tool-proxy.ts +6 -5
- package/src/tools/cluster-events.ts +6 -6
- package/src/tools/cluster-kanban.ts +345 -0
- package/src/tools/cluster-peers.ts +1 -1
- package/src/tools/cluster-query.ts +145 -0
- package/src/types.ts +95 -9
package/src/health-tracker.ts
CHANGED
|
@@ -1,24 +1,19 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
1
|
+
/**
|
|
2
|
+
* Health event tracker — records node lifecycle and peer connectivity events.
|
|
3
|
+
*
|
|
4
|
+
* Events are persisted to SQLite via Store and synced across peers via
|
|
5
|
+
* LogReplicator (sequence-based replication). Replaces the previous
|
|
6
|
+
* Automerge-based implementation with a lighter append-only log approach.
|
|
7
|
+
*/
|
|
5
8
|
|
|
6
9
|
import { debug } from "./debug.ts";
|
|
7
|
-
import type {
|
|
8
|
-
import type {
|
|
10
|
+
import type { Store } from "./store.ts";
|
|
11
|
+
import type { LogReplicator } from "./log-replication.ts";
|
|
12
|
+
import type { HealthRow } from "./store.ts";
|
|
9
13
|
|
|
10
14
|
const TAG = "health";
|
|
11
15
|
|
|
12
|
-
|
|
13
|
-
const DEFAULT_RETENTION_MS = 90 * 24 * 60 * 60 * 1000;
|
|
14
|
-
|
|
15
|
-
/** Compact interval: every 24 hours. */
|
|
16
|
-
const COMPACT_INTERVAL = 24 * 60 * 60 * 1000;
|
|
17
|
-
|
|
18
|
-
/** Save debounce interval (5 seconds). */
|
|
19
|
-
const SAVE_DEBOUNCE = 5_000;
|
|
20
|
-
|
|
21
|
-
// ── Document schema ─────────────────────────────────────────────
|
|
16
|
+
// ── Public types (unchanged from previous implementation) ───────
|
|
22
17
|
|
|
23
18
|
export interface HealthEvent {
|
|
24
19
|
ts: number;
|
|
@@ -28,17 +23,6 @@ export interface HealthEvent {
|
|
|
28
23
|
reason?: string; // disconnect reason
|
|
29
24
|
}
|
|
30
25
|
|
|
31
|
-
interface NodeHealthEntry {
|
|
32
|
-
events: HealthEvent[];
|
|
33
|
-
lastUpdated: number;
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
export interface HealthDoc {
|
|
37
|
-
nodes: Record<string, NodeHealthEntry>;
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
// ── Timeline aggregation ────────────────────────────────────────
|
|
41
|
-
|
|
42
26
|
export type BucketState = "up" | "degraded" | "down" | "unknown";
|
|
43
27
|
|
|
44
28
|
export interface NodeTimeline {
|
|
@@ -63,100 +47,54 @@ export interface AvailabilityResult {
|
|
|
63
47
|
|
|
64
48
|
export interface HealthTrackerOptions {
|
|
65
49
|
nodeId: string;
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
/** Override state directory (for tests). */
|
|
69
|
-
stateDir?: string;
|
|
50
|
+
store?: Store;
|
|
51
|
+
logReplicator?: LogReplicator;
|
|
70
52
|
}
|
|
71
53
|
|
|
72
54
|
export class HealthTracker {
|
|
73
|
-
private doc: Automerge.Doc<HealthDoc>;
|
|
74
|
-
private syncStates = new Map<string, Automerge.SyncState>();
|
|
75
55
|
private readonly nodeId: string;
|
|
76
|
-
private
|
|
77
|
-
private
|
|
78
|
-
private readonly docPath: string;
|
|
79
|
-
private compactTimer: ReturnType<typeof setInterval> | null = null;
|
|
80
|
-
private saveTimer: ReturnType<typeof setTimeout> | null = null;
|
|
81
|
-
private dirty = false;
|
|
82
|
-
/** Debounce timer for broadcastSync (prevents rapid-fire broadcasts). */
|
|
83
|
-
private broadcastTimer: ReturnType<typeof setTimeout> | null = null;
|
|
84
|
-
/** Round counter per peer to detect non-converging sync loops. */
|
|
85
|
-
private syncRounds = new Map<string, number>();
|
|
86
|
-
private static readonly MAX_SYNC_ROUNDS = 10;
|
|
87
|
-
private static readonly BROADCAST_DEBOUNCE = 500; // ms
|
|
56
|
+
private store: Store | null;
|
|
57
|
+
private logReplicator: LogReplicator | null;
|
|
88
58
|
|
|
89
59
|
constructor(opts: HealthTrackerOptions) {
|
|
90
60
|
this.nodeId = opts.nodeId;
|
|
91
|
-
this.
|
|
92
|
-
this.
|
|
93
|
-
|
|
94
|
-
const stateDir = opts.stateDir ?? path.join(homedir() || tmpdir(), ".openclaw", "clawmatrix");
|
|
95
|
-
this.docPath = path.join(stateDir, "health.automerge");
|
|
61
|
+
this.store = opts.store ?? null;
|
|
62
|
+
this.logReplicator = opts.logReplicator ?? null;
|
|
63
|
+
}
|
|
96
64
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
this.
|
|
100
|
-
|
|
101
|
-
});
|
|
65
|
+
/** Set store after construction (for late initialization). */
|
|
66
|
+
setStore(store: Store, logReplicator?: LogReplicator) {
|
|
67
|
+
this.store = store;
|
|
68
|
+
this.logReplicator = logReplicator ?? this.logReplicator;
|
|
102
69
|
}
|
|
103
70
|
|
|
104
71
|
async start() {
|
|
105
|
-
// Load persisted doc
|
|
106
|
-
await this.load();
|
|
107
|
-
|
|
108
|
-
// Record self start
|
|
109
72
|
this.recordEvent({ ts: Date.now(), type: "start" });
|
|
110
|
-
|
|
111
|
-
// Compact old events on start
|
|
112
|
-
this.compact();
|
|
113
|
-
|
|
114
|
-
// Schedule periodic compact
|
|
115
|
-
this.compactTimer = setInterval(() => this.compact(), COMPACT_INTERVAL);
|
|
116
|
-
|
|
117
73
|
debug(TAG, `health tracker started for node "${this.nodeId}"`);
|
|
118
74
|
}
|
|
119
75
|
|
|
120
76
|
async stop() {
|
|
121
|
-
// Record self stop
|
|
122
77
|
this.recordEvent({ ts: Date.now(), type: "stop" });
|
|
123
|
-
|
|
124
|
-
if (this.compactTimer) {
|
|
125
|
-
clearInterval(this.compactTimer);
|
|
126
|
-
this.compactTimer = null;
|
|
127
|
-
}
|
|
128
|
-
if (this.broadcastTimer) {
|
|
129
|
-
clearTimeout(this.broadcastTimer);
|
|
130
|
-
this.broadcastTimer = null;
|
|
131
|
-
}
|
|
132
|
-
if (this.saveTimer) {
|
|
133
|
-
clearTimeout(this.saveTimer);
|
|
134
|
-
this.saveTimer = null;
|
|
135
|
-
}
|
|
136
|
-
|
|
137
|
-
// Final save
|
|
138
|
-
await this.save();
|
|
139
78
|
debug(TAG, "health tracker stopped");
|
|
140
79
|
}
|
|
141
80
|
|
|
142
81
|
// ── Event recording ─────────────────────────────────────────
|
|
143
82
|
|
|
144
83
|
recordEvent(event: HealthEvent) {
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
84
|
+
if (!this.store) return;
|
|
85
|
+
try {
|
|
86
|
+
this.store.insertHealth({
|
|
87
|
+
nodeId: this.nodeId,
|
|
88
|
+
ts: event.ts,
|
|
89
|
+
type: event.type,
|
|
90
|
+
peer: event.peer,
|
|
91
|
+
via: event.via,
|
|
92
|
+
reason: event.reason,
|
|
93
|
+
});
|
|
94
|
+
this.logReplicator?.notifyLocalInsert("health_events");
|
|
95
|
+
} catch (err) {
|
|
96
|
+
debug(TAG, `failed to record event: ${err}`);
|
|
149
97
|
}
|
|
150
|
-
this.doc = Automerge.change(this.doc, (d) => {
|
|
151
|
-
if (!d.nodes[this.nodeId]) {
|
|
152
|
-
d.nodes[this.nodeId] = { events: [], lastUpdated: 0 };
|
|
153
|
-
}
|
|
154
|
-
const entry = d.nodes[this.nodeId]!;
|
|
155
|
-
entry.events.push(clean as HealthEvent);
|
|
156
|
-
entry.lastUpdated = Date.now();
|
|
157
|
-
});
|
|
158
|
-
this.scheduleSave();
|
|
159
|
-
this.broadcastSync();
|
|
160
98
|
}
|
|
161
99
|
|
|
162
100
|
recordPeerOnline(peerId: string, via: "direct" | "relay") {
|
|
@@ -167,90 +105,10 @@ export class HealthTracker {
|
|
|
167
105
|
this.recordEvent({ ts: Date.now(), type: "peer_offline", peer: peerId, reason });
|
|
168
106
|
}
|
|
169
107
|
|
|
170
|
-
// ── Sync
|
|
171
|
-
|
|
172
|
-
/** Handle incoming health_sync frame from a peer. */
|
|
173
|
-
handleSyncMessage(frame: HealthSyncFrame) {
|
|
174
|
-
const peerId = frame.from;
|
|
175
|
-
const message = new Uint8Array(Buffer.from(frame.payload.data, "base64"));
|
|
176
|
-
const syncKey = peerId;
|
|
177
|
-
|
|
178
|
-
// Guard against non-converging sync loops
|
|
179
|
-
const rounds = (this.syncRounds.get(peerId) ?? 0) + 1;
|
|
180
|
-
if (rounds > HealthTracker.MAX_SYNC_ROUNDS) {
|
|
181
|
-
debug(TAG, `sync with ${peerId} exceeded ${HealthTracker.MAX_SYNC_ROUNDS} rounds, resetting`);
|
|
182
|
-
this.syncStates.set(syncKey, Automerge.initSyncState());
|
|
183
|
-
this.syncRounds.delete(peerId);
|
|
184
|
-
return;
|
|
185
|
-
}
|
|
186
|
-
this.syncRounds.set(peerId, rounds);
|
|
108
|
+
// ── Sync lifecycle (no-ops, handled by LogReplicator now) ───
|
|
187
109
|
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
const [newDoc, newSyncState] = Automerge.receiveSyncMessage(this.doc, syncState, message);
|
|
191
|
-
this.doc = newDoc;
|
|
192
|
-
this.syncStates.set(syncKey, newSyncState);
|
|
193
|
-
this.scheduleSave();
|
|
194
|
-
|
|
195
|
-
// Send our response (only if there's something to send)
|
|
196
|
-
this.sendSyncMessage(peerId);
|
|
197
|
-
} catch (err) {
|
|
198
|
-
debug(TAG, `error handling sync from ${peerId}: ${err}`);
|
|
199
|
-
}
|
|
200
|
-
}
|
|
201
|
-
|
|
202
|
-
/** Initialize sync state for a peer (called on peer connect).
|
|
203
|
-
* Does NOT send a message — the subsequent recordPeerOnline → broadcastSync handles that.
|
|
204
|
-
* Sending here would race with broadcastSync and corrupt the sync state. */
|
|
205
|
-
initPeerSync(peerId: string) {
|
|
206
|
-
if (peerId === this.nodeId) return;
|
|
207
|
-
this.syncStates.set(peerId, Automerge.initSyncState());
|
|
208
|
-
}
|
|
209
|
-
|
|
210
|
-
/** Clean up sync state for a disconnected peer. */
|
|
211
|
-
removePeerSync(peerId: string) {
|
|
212
|
-
this.syncStates.delete(peerId);
|
|
213
|
-
}
|
|
214
|
-
|
|
215
|
-
private sendSyncMessage(peerId: string) {
|
|
216
|
-
const syncState = this.syncStates.get(peerId) ?? Automerge.initSyncState();
|
|
217
|
-
const [newSyncState, message] = Automerge.generateSyncMessage(this.doc, syncState);
|
|
218
|
-
this.syncStates.set(peerId, newSyncState);
|
|
219
|
-
|
|
220
|
-
if (!message) {
|
|
221
|
-
// Sync converged — reset round counter
|
|
222
|
-
this.syncRounds.delete(peerId);
|
|
223
|
-
return;
|
|
224
|
-
}
|
|
225
|
-
|
|
226
|
-
debug(TAG, `sending health sync to ${peerId} (${message.byteLength} bytes)`);
|
|
227
|
-
|
|
228
|
-
const frame: HealthSyncFrame = {
|
|
229
|
-
type: "health_sync",
|
|
230
|
-
from: this.nodeId,
|
|
231
|
-
to: peerId,
|
|
232
|
-
timestamp: Date.now(),
|
|
233
|
-
payload: {
|
|
234
|
-
data: Buffer.from(message).toString("base64"),
|
|
235
|
-
},
|
|
236
|
-
};
|
|
237
|
-
|
|
238
|
-
this.peerManager.router.sendTo(peerId, frame);
|
|
239
|
-
}
|
|
240
|
-
|
|
241
|
-
private broadcastSync() {
|
|
242
|
-
// Debounce: multiple events in quick succession → single broadcast
|
|
243
|
-
if (this.broadcastTimer) return;
|
|
244
|
-
this.broadcastTimer = setTimeout(() => {
|
|
245
|
-
this.broadcastTimer = null;
|
|
246
|
-
// Reset round counters — new broadcast starts fresh sync cycle
|
|
247
|
-
this.syncRounds.clear();
|
|
248
|
-
const peers = this.peerManager.router.getAllPeers();
|
|
249
|
-
for (const peer of peers) {
|
|
250
|
-
this.sendSyncMessage(peer.nodeId);
|
|
251
|
-
}
|
|
252
|
-
}, HealthTracker.BROADCAST_DEBOUNCE);
|
|
253
|
-
}
|
|
110
|
+
initPeerSync(_peerId: string) { /* handled by LogReplicator */ }
|
|
111
|
+
removePeerSync(_peerId: string) { /* handled by LogReplicator */ }
|
|
254
112
|
|
|
255
113
|
// ── Timeline aggregation ──────────────────────────────────
|
|
256
114
|
|
|
@@ -270,11 +128,11 @@ export class HealthTracker {
|
|
|
270
128
|
break;
|
|
271
129
|
case "7d":
|
|
272
130
|
durationMs = 7 * 24 * 60 * 60 * 1000;
|
|
273
|
-
bucketMinutes = 60 * 4;
|
|
131
|
+
bucketMinutes = 60 * 4;
|
|
274
132
|
break;
|
|
275
133
|
case "90d":
|
|
276
134
|
durationMs = 90 * 24 * 60 * 60 * 1000;
|
|
277
|
-
bucketMinutes = 60 * 24;
|
|
135
|
+
bucketMinutes = 60 * 24;
|
|
278
136
|
break;
|
|
279
137
|
}
|
|
280
138
|
|
|
@@ -284,67 +142,52 @@ export class HealthTracker {
|
|
|
284
142
|
const bucketCount = Math.ceil(durationMs / bucketMs);
|
|
285
143
|
|
|
286
144
|
// Find observation gaps (periods where THIS node was down)
|
|
287
|
-
const
|
|
145
|
+
const selfEvents = this.getEventsForNode(this.nodeId);
|
|
146
|
+
const gaps = this.getObservationGaps(selfEvents, startTs, endTs);
|
|
288
147
|
|
|
289
148
|
// Build timeline for each node (including self)
|
|
290
149
|
const nodes: NodeTimeline[] = [];
|
|
291
150
|
|
|
292
|
-
// Collect nodeIds that have ever had peer_online events
|
|
151
|
+
// Collect nodeIds that have ever had peer_online events
|
|
293
152
|
const everConnected = new Set<string>();
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
}
|
|
153
|
+
const allEvents = this.store ? this.store.queryHealth({ type: "peer_online" }) : [];
|
|
154
|
+
for (const ev of allEvents) {
|
|
155
|
+
if (ev.peer) everConnected.add(ev.peer);
|
|
298
156
|
}
|
|
299
157
|
|
|
300
|
-
|
|
301
|
-
|
|
158
|
+
const knownNodes = this.getKnownNodes();
|
|
159
|
+
for (const nodeId of knownNodes) {
|
|
302
160
|
if (nodeId !== this.nodeId && !everConnected.has(nodeId)) continue;
|
|
303
161
|
|
|
162
|
+
const events = this.getEventsForNode(nodeId);
|
|
304
163
|
const timeline = this.buildNodeTimeline(
|
|
305
|
-
nodeId,
|
|
306
|
-
entry,
|
|
307
|
-
startTs,
|
|
308
|
-
endTs,
|
|
309
|
-
bucketMs,
|
|
310
|
-
bucketCount,
|
|
311
|
-
gaps,
|
|
164
|
+
nodeId, events, startTs, endTs, bucketMs, bucketCount, gaps,
|
|
312
165
|
);
|
|
313
166
|
if (!timeline) continue;
|
|
314
|
-
// Hide remote nodes that were offline for the entire requested range
|
|
315
167
|
if (nodeId !== this.nodeId && timeline.uptimeRatio === 0) continue;
|
|
316
168
|
nodes.push(timeline);
|
|
317
169
|
}
|
|
318
170
|
|
|
319
|
-
return {
|
|
320
|
-
range,
|
|
321
|
-
bucketMinutes,
|
|
322
|
-
startTs,
|
|
323
|
-
endTs,
|
|
324
|
-
nodes,
|
|
325
|
-
gaps,
|
|
326
|
-
};
|
|
171
|
+
return { range, bucketMinutes, startTs, endTs, nodes, gaps };
|
|
327
172
|
}
|
|
328
173
|
|
|
329
174
|
private buildNodeTimeline(
|
|
330
175
|
nodeId: string,
|
|
331
|
-
|
|
176
|
+
events: HealthEvent[],
|
|
332
177
|
startTs: number,
|
|
333
178
|
endTs: number,
|
|
334
179
|
bucketMs: number,
|
|
335
180
|
bucketCount: number,
|
|
336
181
|
observerGaps: Array<[number, number]>,
|
|
337
182
|
): NodeTimeline | null {
|
|
338
|
-
const
|
|
339
|
-
if (
|
|
183
|
+
const sorted = [...events].sort((a, b) => a.ts - b.ts);
|
|
184
|
+
if (sorted.length === 0) return null;
|
|
340
185
|
|
|
341
|
-
const firstSeen =
|
|
342
|
-
const lastSeen =
|
|
186
|
+
const firstSeen = sorted[0]!.ts;
|
|
187
|
+
const lastSeen = sorted[sorted.length - 1]!.ts;
|
|
343
188
|
|
|
344
|
-
|
|
345
|
-
const intervals = this.buildOnlineIntervals(nodeId, events, startTs, endTs);
|
|
189
|
+
const intervals = this.buildOnlineIntervals(nodeId, sorted, startTs, endTs);
|
|
346
190
|
|
|
347
|
-
// Calculate per-bucket state
|
|
348
191
|
const buckets: BucketState[] = [];
|
|
349
192
|
let totalOnline = 0;
|
|
350
193
|
let totalObservable = 0;
|
|
@@ -353,15 +196,9 @@ export class HealthTracker {
|
|
|
353
196
|
const bStart = startTs + i * bucketMs;
|
|
354
197
|
const bEnd = Math.min(bStart + bucketMs, endTs);
|
|
355
198
|
|
|
356
|
-
// How much of this bucket is observable (subtract observer gaps)
|
|
357
199
|
const observableMs = this.observableTimeInRange(bStart, bEnd, observerGaps);
|
|
200
|
+
if (observableMs === 0) { buckets.push("unknown"); continue; }
|
|
358
201
|
|
|
359
|
-
if (observableMs === 0) {
|
|
360
|
-
buckets.push("unknown");
|
|
361
|
-
continue;
|
|
362
|
-
}
|
|
363
|
-
|
|
364
|
-
// How much of this bucket the node was online
|
|
365
202
|
const onlineMs = this.overlapMs(intervals, bStart, bEnd);
|
|
366
203
|
const ratio = onlineMs / observableMs;
|
|
367
204
|
|
|
@@ -374,15 +211,9 @@ export class HealthTracker {
|
|
|
374
211
|
}
|
|
375
212
|
|
|
376
213
|
const uptimeRatio = totalObservable > 0 ? totalOnline / totalObservable : 0;
|
|
377
|
-
|
|
378
214
|
return { nodeId, firstSeen, lastSeen, buckets, uptimeRatio };
|
|
379
215
|
}
|
|
380
216
|
|
|
381
|
-
/**
|
|
382
|
-
* Build online intervals for a node.
|
|
383
|
-
* - For self node: uses start/stop events
|
|
384
|
-
* - For other nodes: uses peer_online/peer_offline events from all observers
|
|
385
|
-
*/
|
|
386
217
|
private buildOnlineIntervals(
|
|
387
218
|
nodeId: string,
|
|
388
219
|
events: HealthEvent[],
|
|
@@ -390,21 +221,14 @@ export class HealthTracker {
|
|
|
390
221
|
endTs: number,
|
|
391
222
|
): Array<[number, number]> {
|
|
392
223
|
if (nodeId === this.nodeId) {
|
|
393
|
-
// Self: start/stop events define uptime
|
|
394
224
|
return this.buildSelfIntervals(events, startTs, endTs);
|
|
395
225
|
}
|
|
396
|
-
|
|
397
|
-
// For remote nodes: use BOTH self-reported start/stop intervals AND
|
|
398
|
-
// peer_online/peer_offline observations, then merge for best accuracy.
|
|
399
|
-
// Self-reported intervals are the primary signal (the node knows when
|
|
400
|
-
// it was running); peer observations supplement for relay peers or
|
|
401
|
-
// when CRDT sync hasn't propagated the remote node's own events.
|
|
226
|
+
// For remote nodes: merge self-reported start/stop with peer observations
|
|
402
227
|
const selfIntervals = this.buildSelfIntervals(events, startTs, endTs);
|
|
403
228
|
const peerIntervals = this.buildPeerIntervals(nodeId, startTs, endTs);
|
|
404
229
|
return this.mergeIntervals([...selfIntervals, ...peerIntervals]);
|
|
405
230
|
}
|
|
406
231
|
|
|
407
|
-
/** Merge overlapping intervals into a sorted, non-overlapping set. */
|
|
408
232
|
private mergeIntervals(intervals: Array<[number, number]>): Array<[number, number]> {
|
|
409
233
|
if (intervals.length <= 1) return intervals;
|
|
410
234
|
intervals.sort((a, b) => a[0] - b[0]);
|
|
@@ -431,7 +255,6 @@ export class HealthTracker {
|
|
|
431
255
|
|
|
432
256
|
for (const ev of events) {
|
|
433
257
|
if (ev.ts < startTs) {
|
|
434
|
-
// Track state before window
|
|
435
258
|
if (ev.type === "start") onlineSince = ev.ts;
|
|
436
259
|
else if (ev.type === "stop") onlineSince = null;
|
|
437
260
|
continue;
|
|
@@ -446,11 +269,9 @@ export class HealthTracker {
|
|
|
446
269
|
}
|
|
447
270
|
}
|
|
448
271
|
|
|
449
|
-
// If still online at end of window
|
|
450
272
|
if (onlineSince !== null) {
|
|
451
273
|
intervals.push([Math.max(onlineSince, startTs), endTs]);
|
|
452
274
|
}
|
|
453
|
-
|
|
454
275
|
return intervals;
|
|
455
276
|
}
|
|
456
277
|
|
|
@@ -459,19 +280,16 @@ export class HealthTracker {
|
|
|
459
280
|
startTs: number,
|
|
460
281
|
endTs: number,
|
|
461
282
|
): Array<[number, number]> {
|
|
462
|
-
|
|
283
|
+
if (!this.store) return [];
|
|
463
284
|
|
|
464
|
-
// Collect peer_online/peer_offline events from
|
|
465
|
-
const
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
relevantEvents.push(ev);
|
|
470
|
-
}
|
|
471
|
-
}
|
|
472
|
-
}
|
|
285
|
+
// Collect peer_online/peer_offline events about this node from ALL observers
|
|
286
|
+
const relevantRows = this.store.queryHealth({ peer: targetNodeId });
|
|
287
|
+
const relevantEvents: HealthEvent[] = relevantRows
|
|
288
|
+
.filter(r => r.type === "peer_online" || r.type === "peer_offline")
|
|
289
|
+
.map(r => ({ ts: r.ts, type: r.type as HealthEvent["type"], peer: r.peer ?? undefined }));
|
|
473
290
|
relevantEvents.sort((a, b) => a.ts - b.ts);
|
|
474
291
|
|
|
292
|
+
const intervals: Array<[number, number]> = [];
|
|
475
293
|
let onlineSince: number | null = null;
|
|
476
294
|
|
|
477
295
|
for (const ev of relevantEvents) {
|
|
@@ -493,155 +311,72 @@ export class HealthTracker {
|
|
|
493
311
|
if (onlineSince !== null) {
|
|
494
312
|
intervals.push([Math.max(onlineSince, startTs), endTs]);
|
|
495
313
|
}
|
|
496
|
-
|
|
497
314
|
return intervals;
|
|
498
315
|
}
|
|
499
316
|
|
|
500
|
-
/** Get observation gaps: periods when the local node was not running. */
|
|
501
317
|
private getObservationGaps(
|
|
502
|
-
|
|
318
|
+
selfEvents: HealthEvent[],
|
|
503
319
|
startTs: number,
|
|
504
320
|
endTs: number,
|
|
505
321
|
): Array<[number, number]> {
|
|
506
|
-
|
|
507
|
-
if (!entry) return [[startTs, endTs]]; // no data = entire range is a gap
|
|
322
|
+
if (selfEvents.length === 0) return [[startTs, endTs]];
|
|
508
323
|
|
|
509
|
-
const
|
|
510
|
-
|
|
511
|
-
startTs,
|
|
512
|
-
endTs,
|
|
513
|
-
);
|
|
324
|
+
const sorted = [...selfEvents].sort((a, b) => a.ts - b.ts);
|
|
325
|
+
const selfIntervals = this.buildSelfIntervals(sorted, startTs, endTs);
|
|
514
326
|
|
|
515
|
-
// Gaps are the complement of self intervals within [startTs, endTs]
|
|
516
327
|
const gaps: Array<[number, number]> = [];
|
|
517
328
|
let cursor = startTs;
|
|
518
329
|
|
|
519
330
|
for (const [start, end] of selfIntervals) {
|
|
520
|
-
if (start > cursor)
|
|
521
|
-
gaps.push([cursor, start]);
|
|
522
|
-
}
|
|
331
|
+
if (start > cursor) gaps.push([cursor, start]);
|
|
523
332
|
cursor = Math.max(cursor, end);
|
|
524
333
|
}
|
|
525
334
|
|
|
526
|
-
if (cursor < endTs)
|
|
527
|
-
gaps.push([cursor, endTs]);
|
|
528
|
-
}
|
|
529
|
-
|
|
335
|
+
if (cursor < endTs) gaps.push([cursor, endTs]);
|
|
530
336
|
return gaps;
|
|
531
337
|
}
|
|
532
338
|
|
|
533
|
-
|
|
534
|
-
private observableTimeInRange(
|
|
535
|
-
start: number,
|
|
536
|
-
end: number,
|
|
537
|
-
gaps: Array<[number, number]>,
|
|
538
|
-
): number {
|
|
339
|
+
private observableTimeInRange(start: number, end: number, gaps: Array<[number, number]>): number {
|
|
539
340
|
let total = end - start;
|
|
540
341
|
for (const [gStart, gEnd] of gaps) {
|
|
541
342
|
const overlapStart = Math.max(start, gStart);
|
|
542
343
|
const overlapEnd = Math.min(end, gEnd);
|
|
543
|
-
if (overlapStart < overlapEnd)
|
|
544
|
-
total -= overlapEnd - overlapStart;
|
|
545
|
-
}
|
|
344
|
+
if (overlapStart < overlapEnd) total -= overlapEnd - overlapStart;
|
|
546
345
|
}
|
|
547
346
|
return Math.max(0, total);
|
|
548
347
|
}
|
|
549
348
|
|
|
550
|
-
/** Calculate total overlap between intervals and a range. */
|
|
551
349
|
private overlapMs(intervals: Array<[number, number]>, start: number, end: number): number {
|
|
552
350
|
let total = 0;
|
|
553
351
|
for (const [iStart, iEnd] of intervals) {
|
|
554
352
|
const overlapStart = Math.max(start, iStart);
|
|
555
353
|
const overlapEnd = Math.min(end, iEnd);
|
|
556
|
-
if (overlapStart < overlapEnd)
|
|
557
|
-
total += overlapEnd - overlapStart;
|
|
558
|
-
}
|
|
354
|
+
if (overlapStart < overlapEnd) total += overlapEnd - overlapStart;
|
|
559
355
|
}
|
|
560
356
|
return total;
|
|
561
357
|
}
|
|
562
358
|
|
|
563
|
-
// ── Compact ───────────────────────────────────────────────
|
|
564
|
-
|
|
565
|
-
private compact() {
|
|
566
|
-
const cutoff = Date.now() - this.retentionMs;
|
|
567
|
-
let pruned = 0;
|
|
568
|
-
|
|
569
|
-
this.doc = Automerge.change(this.doc, (d) => {
|
|
570
|
-
for (const [, node] of Object.entries(d.nodes)) {
|
|
571
|
-
const before = node.events.length;
|
|
572
|
-
// Keep events newer than cutoff; also keep the last event before cutoff
|
|
573
|
-
// to preserve state continuity
|
|
574
|
-
let lastBeforeCutoff = -1;
|
|
575
|
-
for (let i = 0; i < node.events.length; i++) {
|
|
576
|
-
if (node.events[i]!.ts < cutoff) lastBeforeCutoff = i;
|
|
577
|
-
}
|
|
578
|
-
if (lastBeforeCutoff > 0) {
|
|
579
|
-
// Remove all events before the last one before cutoff
|
|
580
|
-
node.events.splice(0, lastBeforeCutoff);
|
|
581
|
-
pruned += before - node.events.length;
|
|
582
|
-
}
|
|
583
|
-
}
|
|
584
|
-
});
|
|
585
|
-
|
|
586
|
-
if (pruned > 0) {
|
|
587
|
-
debug(TAG, `compacted ${pruned} old events`);
|
|
588
|
-
// Re-save to discard old ops
|
|
589
|
-
this.recompact();
|
|
590
|
-
}
|
|
591
|
-
}
|
|
592
|
-
|
|
593
|
-
/** Re-serialize to discard Automerge op history for removed data. */
|
|
594
|
-
private recompact() {
|
|
595
|
-
const bytes = Automerge.save(this.doc);
|
|
596
|
-
this.doc = Automerge.load<HealthDoc>(bytes);
|
|
597
|
-
this.scheduleSave();
|
|
598
|
-
}
|
|
599
|
-
|
|
600
|
-
// ── Persistence ───────────────────────────────────────────
|
|
601
|
-
|
|
602
|
-
private async load() {
|
|
603
|
-
try {
|
|
604
|
-
const data = await readFile(this.docPath);
|
|
605
|
-
this.doc = Automerge.load<HealthDoc>(new Uint8Array(data));
|
|
606
|
-
debug(TAG, `loaded health doc from ${this.docPath}`);
|
|
607
|
-
} catch {
|
|
608
|
-
debug(TAG, "no existing health doc, starting fresh");
|
|
609
|
-
}
|
|
610
|
-
}
|
|
611
|
-
|
|
612
|
-
private async save() {
|
|
613
|
-
try {
|
|
614
|
-
const data = Automerge.save(this.doc);
|
|
615
|
-
await mkdir(path.dirname(this.docPath), { recursive: true });
|
|
616
|
-
await writeFile(this.docPath, Buffer.from(data));
|
|
617
|
-
} catch (err) {
|
|
618
|
-
debug(TAG, `failed to save health doc: ${err}`);
|
|
619
|
-
}
|
|
620
|
-
}
|
|
621
|
-
|
|
622
|
-
private scheduleSave() {
|
|
623
|
-
this.dirty = true;
|
|
624
|
-
if (this.saveTimer) return;
|
|
625
|
-
this.saveTimer = setTimeout(() => {
|
|
626
|
-
this.saveTimer = null;
|
|
627
|
-
if (this.dirty) {
|
|
628
|
-
this.dirty = false;
|
|
629
|
-
this.save().catch((err) => {
|
|
630
|
-
debug(TAG, `deferred save error: ${err}`);
|
|
631
|
-
});
|
|
632
|
-
}
|
|
633
|
-
}, SAVE_DEBOUNCE);
|
|
634
|
-
}
|
|
635
|
-
|
|
636
359
|
// ── Public accessors ──────────────────────────────────────
|
|
637
360
|
|
|
638
|
-
/** Get all known node IDs (including offline ones). */
|
|
639
361
|
getKnownNodes(): string[] {
|
|
640
|
-
return
|
|
362
|
+
return this.store?.getHealthNodeIds() ?? [];
|
|
641
363
|
}
|
|
642
364
|
|
|
643
|
-
/** Get raw events for a specific node. */
|
|
644
365
|
getNodeEvents(nodeId: string): HealthEvent[] {
|
|
645
|
-
return
|
|
366
|
+
return this.getEventsForNode(nodeId);
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
private getEventsForNode(nodeId: string): HealthEvent[] {
|
|
370
|
+
if (!this.store) return [];
|
|
371
|
+
const rows = this.store.queryHealth({ nodeId });
|
|
372
|
+
return rows.map(rowToEvent);
|
|
646
373
|
}
|
|
647
374
|
}
|
|
375
|
+
|
|
376
|
+
function rowToEvent(row: HealthRow): HealthEvent {
|
|
377
|
+
const event: HealthEvent = { ts: row.ts, type: row.type as HealthEvent["type"] };
|
|
378
|
+
if (row.peer) event.peer = row.peer;
|
|
379
|
+
if (row.via) event.via = row.via;
|
|
380
|
+
if (row.reason) event.reason = row.reason;
|
|
381
|
+
return event;
|
|
382
|
+
}
|