clawmatrix 0.2.7 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/acp-proxy.ts +574 -207
- package/src/cluster-service.ts +24 -2
- package/src/compat.ts +36 -1
- package/src/handoff.ts +10 -3
- package/src/health-tracker.ts +581 -0
- package/src/model-proxy.ts +16 -1
- package/src/peer-manager.ts +61 -38
- package/src/router.ts +41 -0
- package/src/sentinel.ts +29 -7
- package/src/types.ts +10 -1
- package/src/web.ts +33 -0
|
@@ -0,0 +1,581 @@
|
|
|
1
|
+
import * as Automerge from "@automerge/automerge";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { mkdir, readFile, rename, rm, writeFile } from "node:fs/promises";
|
|
4
|
+
import { homedir, tmpdir } from "node:os";
|
|
5
|
+
|
|
6
|
+
import { debug } from "./debug.ts";
|
|
7
|
+
import type { PeerManager } from "./peer-manager.ts";
|
|
8
|
+
import type { HealthSyncFrame } from "./types.ts";
|
|
9
|
+
|
|
10
|
+
/** Tag handed to `debug()` for every log line written by this module. */
const TAG = "health";

/** Default retention for health events: 90 days, expressed in milliseconds. */
const DEFAULT_RETENTION_MS = 1000 * 60 * 60 * 24 * 90;

/** Period of the scheduled compaction pass: 24 hours, expressed in milliseconds. */
const COMPACT_INTERVAL = 1000 * 60 * 60 * 24;

/** Delay used when debouncing saves: 5 seconds, expressed in milliseconds. */
const SAVE_DEBOUNCE = 5 * 1000;
|
|
20
|
+
|
|
21
|
+
// ── Document schema ─────────────────────────────────────────────
|
|
22
|
+
|
|
23
|
+
/** One entry in a node's health log. */
export interface HealthEvent {
  /** Event timestamp; the recorders in this file pass `Date.now()`. */
  ts: number;
  /** What happened: the recording node started/stopped, or it saw a peer come online/go offline. */
  type:
    | "start"
    | "stop"
    | "peer_online"
    | "peer_offline";
  /** Id of the peer the event is about (set by the peer_online / peer_offline recorders). */
  peer?: string;
  /** "direct" | "relay" */
  via?: string;
  /** disconnect reason */
  reason?: string;
}
|
|
30
|
+
|
|
31
|
+
/** Per-node record stored under `HealthDoc.nodes`. */
interface NodeHealthEntry {
  /** Events appended by `recordEvent`, in insertion order. */
  events: Array<HealthEvent>;
  /** `Date.now()` at the time of the most recent append. */
  lastUpdated: number;
}
|
|
35
|
+
|
|
36
|
+
/** Root shape of the Automerge document that holds the health log. */
export interface HealthDoc {
  /** Health entries keyed by node id. */
  nodes: Record<string, NodeHealthEntry>;
}
|
|
39
|
+
|
|
40
|
+
// ── Timeline aggregation ────────────────────────────────────────
|
|
41
|
+
|
|
42
|
+
/**
 * State assigned to one timeline bucket: "unknown" when the bucket has no
 * observable time, otherwise "up" / "degraded" / "down" by online ratio.
 */
export type BucketState =
  | "up"
  | "degraded"
  | "down"
  | "unknown";
|
|
43
|
+
|
|
44
|
+
/** Availability timeline computed for a single node. */
export interface NodeTimeline {
  /** Node the timeline describes. */
  nodeId: string;
  /** Timestamp of the node's earliest recorded event. */
  firstSeen: number;
  /** The node entry's `lastUpdated` value. */
  lastSeen: number;
  /** One state per bucket, oldest first. */
  buckets: Array<BucketState>;
  /** Online time divided by observable time across the window (0 when nothing was observable). */
  uptimeRatio: number;
}
|
|
51
|
+
|
|
52
|
+
/** Result of `HealthTracker.getAvailability`. */
export interface AvailabilityResult {
  /** The range the timeline was built for. */
  range: string;
  /** Width of each bucket, in minutes. */
  bucketMinutes: number;
  /** Start of the window (ms timestamp). */
  startTs: number;
  /** End of the window (ms timestamp). */
  endTs: number;
  /** One timeline per node that has recorded events. */
  nodes: Array<NodeTimeline>;
  /** Time periods when the observing node was down (cannot observe). */
  gaps: [number, number][];
}
|
|
61
|
+
|
|
62
|
+
// ── HealthTracker ───────────────────────────────────────────────
|
|
63
|
+
|
|
64
|
+
/** Constructor options for `HealthTracker`. */
export interface HealthTrackerOptions {
  /** Id of the local node; events are recorded under this key. */
  nodeId: string;
  /** Peer manager whose router is used to send sync frames. */
  peerManager: PeerManager;
  /** Event retention in milliseconds; falls back to `DEFAULT_RETENTION_MS` when omitted. */
  retentionMs?: number;
  /** Override state directory (for tests). */
  stateDir?: string;
}
|
|
71
|
+
|
|
72
|
+
export class HealthTracker {
|
|
73
|
+
private doc: Automerge.Doc<HealthDoc>;
|
|
74
|
+
private syncStates = new Map<string, Automerge.SyncState>();
|
|
75
|
+
private readonly nodeId: string;
|
|
76
|
+
private readonly peerManager: PeerManager;
|
|
77
|
+
private readonly retentionMs: number;
|
|
78
|
+
private readonly docPath: string;
|
|
79
|
+
private compactTimer: ReturnType<typeof setInterval> | null = null;
|
|
80
|
+
private saveTimer: ReturnType<typeof setTimeout> | null = null;
|
|
81
|
+
private dirty = false;
|
|
82
|
+
|
|
83
|
+
/**
 * Capture the options, resolve the on-disk location of the document and
 * create an empty in-memory document. Nothing is read from disk here;
 * `start()` does that.
 */
constructor(opts: HealthTrackerOptions) {
  const { nodeId, peerManager, retentionMs, stateDir } = opts;

  this.nodeId = nodeId;
  this.peerManager = peerManager;
  this.retentionMs = retentionMs ?? DEFAULT_RETENTION_MS;

  // Default location: ~/.openclaw/clawmatrix (temp dir when no home directory is reported).
  const baseDir = stateDir ?? path.join(homedir() || tmpdir(), ".openclaw", "clawmatrix");
  this.docPath = path.join(baseDir, "health.automerge");

  // Start from an empty document; load() swaps it out if a saved one exists.
  // NOTE(review): every node creates its own `nodes` map here. Automerge
  // resolves concurrent assignments to the same key by picking one winner, so
  // entries written under the losing map may disappear after a sync — confirm
  // against the Automerge docs and consider a shared, deterministic initial change.
  const blank = Automerge.init<HealthDoc>();
  this.doc = Automerge.change(blank, (draft) => {
    (draft as HealthDoc).nodes = {};
  });
}
|
|
97
|
+
|
|
98
|
+
/**
 * Bring the tracker up: load the persisted document, log a "start" event for
 * this node, prune expired events once, then schedule recurring pruning.
 */
async start() {
  await this.load();

  // Mark this node as running from now on.
  this.recordEvent({ ts: Date.now(), type: "start" });

  // Prune immediately, then once per COMPACT_INTERVAL.
  this.compact();
  this.compactTimer = setInterval(() => {
    this.compact();
  }, COMPACT_INTERVAL);

  debug(TAG, `health tracker started for node "${this.nodeId}"`);
}
|
|
113
|
+
|
|
114
|
+
/**
 * Shut the tracker down: log a "stop" event, cancel both timers and write
 * the document to disk one last time.
 */
async stop() {
  this.recordEvent({ ts: Date.now(), type: "stop" });

  const pendingCompact = this.compactTimer;
  if (pendingCompact) {
    clearInterval(pendingCompact);
    this.compactTimer = null;
  }

  // Read after recordEvent(), which may just have armed the save debounce.
  const pendingSave = this.saveTimer;
  if (pendingSave) {
    clearTimeout(pendingSave);
    this.saveTimer = null;
  }

  // The debounce was cancelled above, so flush explicitly.
  await this.save();
  debug(TAG, "health tracker stopped");
}
|
|
131
|
+
|
|
132
|
+
// ── Event recording ─────────────────────────────────────────
|
|
133
|
+
|
|
134
|
+
/**
 * Append an event to this node's log, then schedule a save and push the
 * change to all peers.
 *
 * @param event - Event to store. Optional fields that are `undefined` are
 *   omitted from the stored copy.
 */
recordEvent(event: HealthEvent) {
  // Automerge does not accept `undefined` as a property value (assigning one
  // throws), and callers such as recordPeerOffline() may pass `reason: undefined`.
  // Copy only the fields that are actually set.
  const stored: HealthEvent = { ts: event.ts, type: event.type };
  if (event.peer !== undefined) stored.peer = event.peer;
  if (event.via !== undefined) stored.via = event.via;
  if (event.reason !== undefined) stored.reason = event.reason;

  this.doc = Automerge.change(this.doc, (d) => {
    // First event for this node: create its entry.
    if (!d.nodes[this.nodeId]) {
      d.nodes[this.nodeId] = { events: [], lastUpdated: 0 };
    }
    const entry = d.nodes[this.nodeId]!;
    entry.events.push(stored);
    entry.lastUpdated = Date.now();
  });

  this.scheduleSave();
  this.broadcastSync();
}
|
|
146
|
+
|
|
147
|
+
/**
 * Log that this node saw a peer come online.
 *
 * @param peerId - Peer that connected.
 * @param via - Whether the connection is "direct" or goes through a "relay".
 */
recordPeerOnline(peerId: string, via: "direct" | "relay") {
  const event: HealthEvent = {
    ts: Date.now(),
    type: "peer_online",
    peer: peerId,
    via,
  };
  this.recordEvent(event);
}
|
|
150
|
+
|
|
151
|
+
/**
 * Log that this node saw a peer go offline.
 *
 * @param peerId - Peer that disconnected.
 * @param reason - Optional disconnect reason; left out of the event when not given.
 */
recordPeerOffline(peerId: string, reason?: string) {
  const event: HealthEvent = { ts: Date.now(), type: "peer_offline", peer: peerId };
  // Only attach `reason` when supplied: the event ends up in an Automerge
  // document, which does not accept `undefined` property values.
  if (reason !== undefined) event.reason = reason;
  this.recordEvent(event);
}
|
|
154
|
+
|
|
155
|
+
// ── Sync protocol ──────────────────────────────────────────
|
|
156
|
+
|
|
157
|
+
/**
 * Handle an incoming health_sync frame from a peer: apply the contained
 * Automerge sync message to the local document, remember the updated sync
 * state for that peer, schedule a save and send back whatever the sync
 * protocol wants to say next.
 *
 * Any failure, including a malformed payload, is logged and swallowed so a
 * bad frame cannot throw into the caller.
 *
 * @param frame - Frame whose `payload.data` is a base64-encoded sync message.
 */
handleSyncMessage(frame: HealthSyncFrame) {
  const peerId = frame.from;

  try {
    // Decode inside the try: the payload comes off the network, and a missing
    // or non-string `data` makes Buffer.from() throw.
    const message = new Uint8Array(Buffer.from(frame.payload.data, "base64"));

    const syncState = this.syncStates.get(peerId) ?? Automerge.initSyncState();
    const [newDoc, newSyncState] = Automerge.receiveSyncMessage(this.doc, syncState, message);
    this.doc = newDoc;
    this.syncStates.set(peerId, newSyncState);
    this.scheduleSave();

    // Send our response
    this.sendSyncMessage(peerId);
  } catch (err) {
    debug(TAG, `error handling sync from ${peerId}: ${err}`);
  }
}
|
|
176
|
+
|
|
177
|
+
/**
 * Begin a sync session with a peer (called on peer connect): reset the stored
 * sync state for that peer and send the first sync message. Does nothing when
 * `peerId` is the local node.
 */
initPeerSync(peerId: string) {
  if (peerId !== this.nodeId) {
    const fresh = Automerge.initSyncState();
    this.syncStates.set(peerId, fresh);
    this.sendSyncMessage(peerId);
  }
}
|
|
183
|
+
|
|
184
|
+
/**
 * Forget the sync state kept for a disconnected peer.
 *
 * @param peerId - Peer whose sync state should be dropped.
 */
removePeerSync(peerId: string) {
  const { syncStates } = this;
  syncStates.delete(peerId);
}
|
|
188
|
+
|
|
189
|
+
/**
 * Produce the next Automerge sync message for a peer and hand it to the
 * router. The advanced sync state is stored even when there is nothing to send.
 *
 * NOTE(review): the sync state is advanced before the frame is handed to the
 * router and the result of `sendTo` is ignored — confirm this is fine when the
 * peer is unreachable.
 */
private sendSyncMessage(peerId: string) {
  const previous = this.syncStates.get(peerId) ?? Automerge.initSyncState();
  const [next, outgoing] = Automerge.generateSyncMessage(this.doc, previous);
  this.syncStates.set(peerId, next);

  // generateSyncMessage yields no message when there is nothing to send.
  if (!outgoing) return;

  debug(TAG, `sending health sync to ${peerId} (${outgoing.byteLength} bytes)`);

  const encoded = Buffer.from(outgoing).toString("base64");
  const frame: HealthSyncFrame = {
    type: "health_sync",
    from: this.nodeId,
    to: peerId,
    timestamp: Date.now(),
    payload: { data: encoded },
  };

  this.peerManager.router.sendTo(peerId, frame);
}
|
|
210
|
+
|
|
211
|
+
/** Send a sync message to every peer the router reports. */
private broadcastSync() {
  for (const { nodeId } of this.peerManager.router.getAllPeers()) {
    this.sendSyncMessage(nodeId);
  }
}
|
|
217
|
+
|
|
218
|
+
// ── Timeline aggregation ──────────────────────────────────
|
|
219
|
+
|
|
220
|
+
/**
 * Build availability timeline for all known nodes.
 *
 * The window ends now and reaches back by the chosen range; it is cut into
 * fixed-size buckets (30 min for "24h", 4 h for "7d", 1 day for "90d").
 *
 * @param range - "24h", "7d", or "90d". Any other value received at runtime
 *   (e.g. from a caller outside the type checker) is treated as "24h".
 * @returns The window bounds, bucket size, one timeline per node that has
 *   events, and the periods in which the local node was not running.
 */
getAvailability(range: "24h" | "7d" | "90d" = "24h"): AvailabilityResult {
  const HOUR_MS = 60 * 60 * 1000;
  const presets = {
    "24h": { durationMs: 24 * HOUR_MS, bucketMinutes: 30 },
    "7d": { durationMs: 7 * 24 * HOUR_MS, bucketMinutes: 60 * 4 }, // 4-hour buckets
    "90d": { durationMs: 90 * 24 * HOUR_MS, bucketMinutes: 60 * 24 }, // 1-day buckets
  } as const;

  // Without this guard an unrecognised range leaves duration and bucket size
  // unset and every derived timestamp becomes NaN.
  const effectiveRange = Object.prototype.hasOwnProperty.call(presets, range) ? range : "24h";
  const { durationMs, bucketMinutes } = presets[effectiveRange];

  const endTs = Date.now();
  const startTs = endTs - durationMs;
  const bucketMs = bucketMinutes * 60 * 1000;
  const bucketCount = Math.ceil(durationMs / bucketMs);

  // Find observation gaps (periods where THIS node was down)
  const gaps = this.getObservationGaps(this.nodeId, startTs, endTs);

  // Build timeline for each node (including self); nodes without events yield null.
  const nodes: NodeTimeline[] = [];
  for (const [nodeId, entry] of Object.entries(this.doc.nodes)) {
    const timeline = this.buildNodeTimeline(
      nodeId,
      entry,
      startTs,
      endTs,
      bucketMs,
      bucketCount,
      gaps,
    );
    if (timeline) nodes.push(timeline);
  }

  return {
    range: effectiveRange,
    bucketMinutes,
    startTs,
    endTs,
    nodes,
    gaps,
  };
}
|
|
279
|
+
|
|
280
|
+
/**
 * Compute the bucketed timeline and overall uptime ratio for one node.
 *
 * Both the online time and the observable time of a bucket exclude the
 * observer gaps, so a bucket's ratio always lies in [0, 1].
 *
 * @param nodeId - Node the timeline is for.
 * @param entry - That node's entry in the document.
 * @param startTs - Window start (ms).
 * @param endTs - Window end (ms).
 * @param bucketMs - Bucket width (ms).
 * @param bucketCount - Number of buckets in the window.
 * @param observerGaps - Periods when the local node was not running;
 *   expected to be non-overlapping, as produced by getObservationGaps().
 * @returns The timeline, or `null` when the node has no events.
 */
private buildNodeTimeline(
  nodeId: string,
  entry: NodeHealthEntry,
  startTs: number,
  endTs: number,
  bucketMs: number,
  bucketCount: number,
  observerGaps: Array<[number, number]>,
): NodeTimeline | null {
  const events = [...entry.events].sort((a, b) => a.ts - b.ts);
  if (events.length === 0) return null;

  const firstSeen = events[0]!.ts;
  const lastSeen = entry.lastUpdated;

  // Build online intervals for this node
  const intervals = this.buildOnlineIntervals(nodeId, events, startTs, endTs);

  // Calculate per-bucket state
  const buckets: BucketState[] = [];
  let totalOnline = 0;
  let totalObservable = 0;

  for (let i = 0; i < bucketCount; i++) {
    const bStart = startTs + i * bucketMs;
    const bEnd = Math.min(bStart + bucketMs, endTs);

    // How much of this bucket is observable (subtract observer gaps)
    const observableMs = this.observableTimeInRange(bStart, bEnd, observerGaps);

    if (observableMs === 0) {
      buckets.push("unknown");
      continue;
    }

    // Online time within the bucket, minus whatever part of it falls inside
    // an observer gap. Intervals built from other observers' reports can
    // extend into periods the local node could not observe; counting those
    // against a denominator that excludes them would push the ratio above 1.
    let onlineMs = this.overlapMs(intervals, bStart, bEnd);
    for (const [gStart, gEnd] of observerGaps) {
      const from = Math.max(gStart, bStart);
      const to = Math.min(gEnd, bEnd);
      if (from < to) onlineMs -= this.overlapMs(intervals, from, to);
    }
    onlineMs = Math.max(0, onlineMs);

    const ratio = onlineMs / observableMs;

    totalOnline += onlineMs;
    totalObservable += observableMs;

    if (ratio >= 0.95) buckets.push("up");
    else if (ratio >= 0.05) buckets.push("degraded");
    else buckets.push("down");
  }

  const uptimeRatio = totalObservable > 0 ? totalOnline / totalObservable : 0;

  return { nodeId, firstSeen, lastSeen, buckets, uptimeRatio };
}
|
|
331
|
+
|
|
332
|
+
/**
 * Build online intervals for a node.
 * - For the local node: derived from its own start/stop events.
 * - For any other node: derived from peer_online/peer_offline events that
 *   reference it, gathered from every node's log in the merged document.
 *
 * @param nodeId - Node to build intervals for.
 * @param events - The node's own events sorted by time; only used for the local node.
 * @param startTs - Window start (ms).
 * @param endTs - Window end (ms).
 * @returns `[start, end]` pairs.
 */
private buildOnlineIntervals(
  nodeId: string,
  events: HealthEvent[],
  startTs: number,
  endTs: number,
): Array<[number, number]> {
  return nodeId === this.nodeId
    ? this.buildSelfIntervals(events, startTs, endTs)
    : this.buildPeerIntervals(nodeId, startTs, endTs);
}
|
|
357
|
+
|
|
358
|
+
/**
 * Turn a time-ordered list of events into the periods a node was running,
 * using only its "start" and "stop" events.
 *
 * Events before the window only establish the state at the window start;
 * iteration stops at the first event after the window. A "start" always
 * resets the running-since mark, and a run still open at the end is closed
 * at `endTs`.
 *
 * @param events - Events sorted by ascending `ts`.
 * @param startTs - Window start (ms).
 * @param endTs - Window end (ms).
 * @returns `[start, end]` pairs with each start clamped to `startTs`.
 */
private buildSelfIntervals(
  events: HealthEvent[],
  startTs: number,
  endTs: number,
): Array<[number, number]> {
  const spans: Array<[number, number]> = [];
  let upSince: number | null = null;

  for (const ev of events) {
    const beforeWindow = ev.ts < startTs;
    if (!beforeWindow && ev.ts > endTs) break;

    switch (ev.type) {
      case "start":
        upSince = ev.ts;
        break;
      case "stop":
        if (upSince !== null) {
          // Runs that ended before the window contribute nothing.
          if (!beforeWindow) spans.push([Math.max(upSince, startTs), ev.ts]);
          upSince = null;
        }
        break;
      default:
        // peer_online / peer_offline do not affect the node's own uptime.
        break;
    }
  }

  // Still running when the window ends.
  if (upSince !== null) {
    spans.push([Math.max(upSince, startTs), endTs]);
  }

  return spans;
}
|
|
390
|
+
|
|
391
|
+
/**
 * Build the periods in which a remote node was reported online, from the
 * peer_online / peer_offline events about it in every node's log.
 *
 * All matching events are merged into one time-ordered stream. Inside the
 * window, a peer_online while already online is ignored, and the first
 * peer_offline closes the current period.
 *
 * @param targetNodeId - Node the events must refer to (`ev.peer`).
 * @param startTs - Window start (ms).
 * @param endTs - Window end (ms).
 * @returns `[start, end]` pairs with each start clamped to `startTs`.
 */
private buildPeerIntervals(
  targetNodeId: string,
  startTs: number,
  endTs: number,
): Array<[number, number]> {
  // Gather every observer's reports about the target, then order them by time.
  const reports: HealthEvent[] = Object.values(this.doc.nodes).flatMap((entry) =>
    entry.events.filter(
      (ev) => ev.peer === targetNodeId && (ev.type === "peer_online" || ev.type === "peer_offline"),
    ),
  );
  reports.sort((a, b) => a.ts - b.ts);

  const spans: Array<[number, number]> = [];
  let seenSince: number | null = null;

  for (const ev of reports) {
    const cameOnline = ev.type === "peer_online";

    if (ev.ts < startTs) {
      // Before the window: just track the latest known state.
      seenSince = cameOnline ? ev.ts : null;
      continue;
    }
    if (ev.ts > endTs) break;

    if (cameOnline) {
      if (seenSince === null) seenSince = ev.ts;
      continue;
    }
    if (seenSince !== null) {
      spans.push([Math.max(seenSince, startTs), ev.ts]);
      seenSince = null;
    }
  }

  // Still reported online when the window ends.
  if (seenSince !== null) {
    spans.push([Math.max(seenSince, startTs), endTs]);
  }

  return spans;
}
|
|
433
|
+
|
|
434
|
+
/**
 * Get observation gaps: the parts of `[startTs, endTs]` during which the
 * given node was not running, i.e. the complement of its start/stop intervals.
 *
 * @returns `[start, end]` pairs; the whole window when the node has no entry.
 */
private getObservationGaps(
  nodeId: string,
  startTs: number,
  endTs: number,
): Array<[number, number]> {
  const entry = this.doc.nodes[nodeId];
  if (!entry) {
    // no data = entire range is a gap
    return [[startTs, endTs]];
  }

  const ordered = [...entry.events].sort((a, b) => a.ts - b.ts);
  const running = this.buildSelfIntervals(ordered, startTs, endTs);

  // Walk the running intervals and collect the uncovered stretches between them.
  const gaps: Array<[number, number]> = [];
  let cursor = startTs;
  running.forEach(([from, to]) => {
    if (from > cursor) gaps.push([cursor, from]);
    cursor = Math.max(cursor, to);
  });

  // Tail of the window after the last running interval.
  if (cursor < endTs) gaps.push([cursor, endTs]);

  return gaps;
}
|
|
466
|
+
|
|
467
|
+
/**
 * Calculate observable time in a range, excluding gaps: the length of
 * `[start, end]` minus its overlap with each gap, floored at zero.
 */
private observableTimeInRange(
  start: number,
  end: number,
  gaps: Array<[number, number]>,
): number {
  const remaining = gaps.reduce((left, [gStart, gEnd]) => {
    const from = Math.max(start, gStart);
    const to = Math.min(end, gEnd);
    return from < to ? left - (to - from) : left;
  }, end - start);

  return Math.max(0, remaining);
}
|
|
483
|
+
|
|
484
|
+
/**
 * Calculate total overlap between intervals and a range: the summed length
 * of each interval's intersection with `[start, end]`.
 */
private overlapMs(intervals: Array<[number, number]>, start: number, end: number): number {
  return intervals.reduce((sum, [iStart, iEnd]) => {
    const from = Math.max(start, iStart);
    const to = Math.min(end, iEnd);
    return from < to ? sum + (to - from) : sum;
  }, 0);
}
|
|
496
|
+
|
|
497
|
+
// ── Compact ───────────────────────────────────────────────
|
|
498
|
+
|
|
499
|
+
/**
 * Drop events older than the retention period from every node's log.
 *
 * To preserve state continuity, the newest expired event is kept per subject:
 * one for the node's own start/stop state and one for each peer it reported
 * on. Keeping only a single expired event per node would discard the last
 * known state of every other subject. Events are selected by timestamp, so
 * the result does not depend on the array being sorted.
 */
private compact() {
  const cutoff = Date.now() - this.retentionMs;
  let pruned = 0;

  this.doc = Automerge.change(this.doc, (d) => {
    for (const node of Object.values(d.nodes)) {
      // subject -> index of the newest expired event about that subject
      const newestExpired = new Map<string, number>();
      for (let i = 0; i < node.events.length; i++) {
        const ev = node.events[i]!;
        if (ev.ts >= cutoff) continue;
        const subject =
          ev.type === "start" || ev.type === "stop" ? "self" : `peer:${ev.peer ?? ""}`;
        const prev = newestExpired.get(subject);
        if (prev === undefined || node.events[prev]!.ts <= ev.ts) {
          newestExpired.set(subject, i);
        }
      }

      const keep = new Set(newestExpired.values());

      // Delete back-to-front so the remaining indices stay valid.
      for (let i = node.events.length - 1; i >= 0; i--) {
        if (node.events[i]!.ts < cutoff && !keep.has(i)) {
          node.events.splice(i, 1);
          pruned++;
        }
      }
    }
  });

  if (pruned > 0) {
    debug(TAG, `compacted ${pruned} old events`);
    // Round-trip the document through save/load and schedule a save.
    this.recompact();
  }
}
|
|
526
|
+
|
|
527
|
+
/**
 * Round-trip the document through `Automerge.save` / `Automerge.load` and
 * schedule a save. The intent is to shed op history for removed data.
 *
 * NOTE(review): Automerge's saved format is generally described as retaining
 * the full change history — confirm this round-trip actually discards anything.
 */
private recompact() {
  const serialized = Automerge.save(this.doc);
  const reloaded = Automerge.load<HealthDoc>(serialized);
  this.doc = reloaded;
  this.scheduleSave();
}
|
|
533
|
+
|
|
534
|
+
// ── Persistence ───────────────────────────────────────────
|
|
535
|
+
|
|
536
|
+
/**
 * Replace the in-memory document with the one persisted at `docPath`, if any.
 *
 * Never throws. A missing file is the normal first-run case. Any other read
 * failure, an undecodable file, or a document without a `nodes` map is logged
 * with its cause, and the tracker keeps its fresh empty document.
 */
private async load() {
  let raw: Uint8Array;
  try {
    raw = new Uint8Array(await readFile(this.docPath));
  } catch (err) {
    if ((err as NodeJS.ErrnoException | null)?.code === "ENOENT") {
      debug(TAG, "no existing health doc, starting fresh");
    } else {
      // Permission problems etc. should not look like a first run.
      debug(TAG, `failed to read health doc at ${this.docPath}: ${err}; starting fresh`);
    }
    return;
  }

  try {
    const loaded = Automerge.load<HealthDoc>(raw);
    // recordEvent() and the timeline code index into `nodes` unconditionally.
    if (typeof loaded.nodes !== "object" || loaded.nodes === null) {
      throw new Error('document has no "nodes" map');
    }
    this.doc = loaded;
    debug(TAG, `loaded health doc from ${this.docPath}`);
  } catch (err) {
    debug(TAG, `health doc at ${this.docPath} is unusable (${err}); starting fresh`);
  }
}
|
|
545
|
+
|
|
546
|
+
/**
 * Persist the document to `docPath`.
 *
 * The bytes are written to a uniquely named temporary file in the same
 * directory and then renamed over the target, so an interrupted write leaves
 * the previous file intact rather than a truncated one. Failures are logged,
 * not thrown, and the temporary file is removed on a best-effort basis.
 */
private async save() {
  // Unique per call so overlapping saves never write to the same temp file.
  const unique = `${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2, 10)}`;
  const tmpPath = `${this.docPath}.${unique}.tmp`;

  try {
    const data = Automerge.save(this.doc);
    await mkdir(path.dirname(this.docPath), { recursive: true });
    await writeFile(tmpPath, Buffer.from(data));
    await rename(tmpPath, this.docPath);
  } catch (err) {
    debug(TAG, `failed to save health doc: ${err}`);
    // Best-effort cleanup; `force` ignores a temp file that was never created.
    await rm(tmpPath, { force: true }).catch(() => undefined);
  }
}
|
|
555
|
+
|
|
556
|
+
/**
 * Mark the document dirty and make sure a save happens within
 * `SAVE_DEBOUNCE` ms. Calls made while a timer is already pending only set
 * the dirty flag; they do not push the deadline back.
 */
private scheduleSave() {
  this.dirty = true;
  if (this.saveTimer) return;

  const flush = () => {
    this.saveTimer = null;
    if (!this.dirty) return;

    this.dirty = false;
    this.save().catch((err) => {
      debug(TAG, `deferred save error: ${err}`);
    });
  };

  this.saveTimer = setTimeout(flush, SAVE_DEBOUNCE);
}
|
|
569
|
+
|
|
570
|
+
// ── Public accessors ──────────────────────────────────────
|
|
571
|
+
|
|
572
|
+
/**
 * Get all known node IDs (including offline ones).
 *
 * @returns The keys of the document's `nodes` map.
 */
getKnownNodes(): string[] {
  const { nodes } = this.doc;
  return Object.keys(nodes);
}
|
|
576
|
+
|
|
577
|
+
/**
 * Get raw events for a specific node.
 *
 * @returns A new array holding the node's events, or an empty array when the
 *   node is unknown.
 */
getNodeEvents(nodeId: string): HealthEvent[] {
  const recorded = this.doc.nodes[nodeId]?.events;
  if (recorded == null) return [];
  return [...recorded];
}
|
|
581
|
+
}
|
package/src/model-proxy.ts
CHANGED
|
@@ -1090,13 +1090,28 @@ export class ModelProxy {
|
|
|
1090
1090
|
?? this.config.models.find((m) => m.id === payload.model)
|
|
1091
1091
|
: this.config.models.find((m) => m.id === payload.model);
|
|
1092
1092
|
if (!model) {
|
|
1093
|
+
// Model not available locally — try forwarding to a remote node that has it
|
|
1094
|
+
const originalTarget = frame.to;
|
|
1095
|
+
const exclude = new Set([this.config.nodeId]);
|
|
1096
|
+
if (originalTarget) exclude.add(originalTarget);
|
|
1097
|
+
const alternatives = this.peerManager.router.findNodesForModel(payload.model, exclude);
|
|
1098
|
+
for (const alt of alternatives) {
|
|
1099
|
+
if (this.peerManager.sendTo(alt.nodeId, { ...frame, to: alt.nodeId })) {
|
|
1100
|
+
debug("model_req", `failover: ${originalTarget ?? "local"} → ${alt.nodeId} for "${payload.model}"`);
|
|
1101
|
+
return;
|
|
1102
|
+
}
|
|
1103
|
+
}
|
|
1104
|
+
// No alternative found
|
|
1105
|
+
const hint = originalTarget && originalTarget !== this.config.nodeId
|
|
1106
|
+
? `Target node "${originalTarget}" is unreachable and no alternative nodes provide model "${payload.model}"`
|
|
1107
|
+
: `Model "${payload.model}" not available locally`;
|
|
1093
1108
|
this.peerManager.sendTo(from, {
|
|
1094
1109
|
type: "model_res",
|
|
1095
1110
|
id,
|
|
1096
1111
|
from: this.config.nodeId,
|
|
1097
1112
|
to: from,
|
|
1098
1113
|
timestamp: Date.now(),
|
|
1099
|
-
payload: { success: false, error:
|
|
1114
|
+
payload: { success: false, error: hint },
|
|
1100
1115
|
} satisfies ModelResponse);
|
|
1101
1116
|
return;
|
|
1102
1117
|
}
|