clawmatrix 0.2.8 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/acp-proxy.ts +92 -6
- package/src/cluster-service.ts +24 -2
- package/src/health-tracker.ts +581 -0
- package/src/model-proxy.ts +16 -1
- package/src/peer-manager.ts +25 -4
- package/src/router.ts +25 -0
- package/src/sentinel.ts +29 -7
- package/src/types.ts +10 -1
- package/src/web.ts +33 -0
package/package.json
CHANGED
package/src/acp-proxy.ts
CHANGED
|
@@ -584,7 +584,7 @@ export class AcpProxy {
|
|
|
584
584
|
let newSession: AcpSession;
|
|
585
585
|
if (resumeAcpSessionId) {
|
|
586
586
|
debug("acp", `Session "${sessionId}" not found, resuming via acpSessionId "${resumeAcpSessionId.slice(0, 8)}..."`);
|
|
587
|
-
const effectiveCwd = cwd
|
|
587
|
+
const effectiveCwd = cwd || process.cwd();
|
|
588
588
|
try {
|
|
589
589
|
newSession = await this.createSessionWithResume(agent, resumeAcpSessionId, effectiveCwd, from);
|
|
590
590
|
} catch (resumeErr) {
|
|
@@ -611,7 +611,7 @@ export class AcpProxy {
|
|
|
611
611
|
});
|
|
612
612
|
}
|
|
613
613
|
if (mode === "oneshot" && task) {
|
|
614
|
-
this.returnToReusePool(newSession, cwd
|
|
614
|
+
this.returnToReusePool(newSession, cwd || process.cwd());
|
|
615
615
|
}
|
|
616
616
|
} else {
|
|
617
617
|
// Re-create native session
|
|
@@ -671,7 +671,7 @@ export class AcpProxy {
|
|
|
671
671
|
});
|
|
672
672
|
}
|
|
673
673
|
if (mode === "oneshot" && task) {
|
|
674
|
-
this.returnToReusePool(session, cwd
|
|
674
|
+
this.returnToReusePool(session, cwd || process.cwd());
|
|
675
675
|
}
|
|
676
676
|
} else {
|
|
677
677
|
// Create new native OpenClaw session
|
|
@@ -778,7 +778,8 @@ export class AcpProxy {
|
|
|
778
778
|
/** Handle resume session request (receiver side): resume an ACP session and track it. */
|
|
779
779
|
async handleResumeRequest(frame: AcpResumeRequest): Promise<void> {
|
|
780
780
|
const { id, from, payload } = frame;
|
|
781
|
-
const { agent, acpSessionId
|
|
781
|
+
const { agent, acpSessionId } = payload;
|
|
782
|
+
const cwd = payload.cwd || process.cwd();
|
|
782
783
|
|
|
783
784
|
try {
|
|
784
785
|
// Enforce concurrent session limit
|
|
@@ -1089,6 +1090,30 @@ export class AcpProxy {
|
|
|
1089
1090
|
}
|
|
1090
1091
|
} catch { /* no claude projects directory */ }
|
|
1091
1092
|
|
|
1093
|
+
// 3. Search Codex sessions (~/.codex/sessions/YYYY/MM/DD/rollout-*-{sessionId}.jsonl)
|
|
1094
|
+
// Reverse sort so we search recent dates first (most likely to match)
|
|
1095
|
+
const codexSessionsDir = join(homedir(), ".codex", "sessions");
|
|
1096
|
+
try {
|
|
1097
|
+
const years = (await readdir(codexSessionsDir)).sort().reverse();
|
|
1098
|
+
for (const year of years) {
|
|
1099
|
+
const monthsDir = join(codexSessionsDir, year);
|
|
1100
|
+
let months: string[];
|
|
1101
|
+
try { months = (await readdir(monthsDir)).sort().reverse(); } catch { continue; }
|
|
1102
|
+
for (const month of months) {
|
|
1103
|
+
const daysDir = join(monthsDir, month);
|
|
1104
|
+
let days: string[];
|
|
1105
|
+
try { days = (await readdir(daysDir)).sort().reverse(); } catch { continue; }
|
|
1106
|
+
for (const day of days) {
|
|
1107
|
+
const dayDir = join(daysDir, day);
|
|
1108
|
+
let files: string[];
|
|
1109
|
+
try { files = await readdir(dayDir); } catch { continue; }
|
|
1110
|
+
const match = files.find((f) => f.endsWith(`-${sessionId}.jsonl`));
|
|
1111
|
+
if (match) return join(dayDir, match);
|
|
1112
|
+
}
|
|
1113
|
+
}
|
|
1114
|
+
}
|
|
1115
|
+
} catch { /* no codex sessions directory */ }
|
|
1116
|
+
|
|
1092
1117
|
return null;
|
|
1093
1118
|
}
|
|
1094
1119
|
|
|
@@ -1313,7 +1338,7 @@ export class AcpProxy {
|
|
|
1313
1338
|
// ── Internal: ACP session management (receiver) ────────────────
|
|
1314
1339
|
|
|
1315
1340
|
private async createSession(agent: string, cwd: string | undefined, from: string): Promise<AcpSession> {
|
|
1316
|
-
const effectiveCwd = cwd
|
|
1341
|
+
const effectiveCwd = cwd || process.cwd();
|
|
1317
1342
|
|
|
1318
1343
|
// 1. Try daemon pool first (long-lived process, instant newSession)
|
|
1319
1344
|
try {
|
|
@@ -2256,6 +2281,62 @@ async function fetchSessionListFromDisk(): Promise<AcpSessionInfo[]> {
|
|
|
2256
2281
|
// history.jsonl missing or unreadable — skip
|
|
2257
2282
|
}
|
|
2258
2283
|
|
|
2284
|
+
// 3. Codex session history (~/.codex/history.jsonl + session_index.jsonl)
|
|
2285
|
+
try {
|
|
2286
|
+
const codexDir = join(homedir(), ".codex");
|
|
2287
|
+
// Read session index for titles
|
|
2288
|
+
const titleMap = new Map<string, string>();
|
|
2289
|
+
try {
|
|
2290
|
+
const indexContent = await readFileText(join(codexDir, "session_index.jsonl"));
|
|
2291
|
+
for (const line of indexContent.split("\n")) {
|
|
2292
|
+
if (!line.trim()) continue;
|
|
2293
|
+
try {
|
|
2294
|
+
const entry = JSON.parse(line) as { id?: string; thread_name?: string };
|
|
2295
|
+
if (entry.id && entry.thread_name) titleMap.set(entry.id, entry.thread_name);
|
|
2296
|
+
} catch { /* skip */ }
|
|
2297
|
+
}
|
|
2298
|
+
} catch { /* index missing */ }
|
|
2299
|
+
|
|
2300
|
+
const codexHistoryPath = join(codexDir, "history.jsonl");
|
|
2301
|
+
const codexContent = await readFileTail(codexHistoryPath, 512 * 1024);
|
|
2302
|
+
const codexSessions = new Map<string, { title: string; updatedAt: number }>();
|
|
2303
|
+
|
|
2304
|
+
for (const line of codexContent.split("\n")) {
|
|
2305
|
+
if (!line.trim()) continue;
|
|
2306
|
+
try {
|
|
2307
|
+
const entry = JSON.parse(line) as { session_id?: string; ts?: number; text?: string };
|
|
2308
|
+
if (!entry.session_id) continue;
|
|
2309
|
+
const existing = codexSessions.get(entry.session_id);
|
|
2310
|
+
if (!existing) {
|
|
2311
|
+
const title = (entry.text ?? "").replace(/\s+/g, " ").trim().slice(0, 120);
|
|
2312
|
+
codexSessions.set(entry.session_id, {
|
|
2313
|
+
title,
|
|
2314
|
+
updatedAt: entry.ts ? entry.ts * 1000 : 0, // ts is in seconds
|
|
2315
|
+
});
|
|
2316
|
+
} else if (entry.ts && entry.ts * 1000 > existing.updatedAt) {
|
|
2317
|
+
existing.updatedAt = entry.ts * 1000;
|
|
2318
|
+
}
|
|
2319
|
+
} catch { /* skip */ }
|
|
2320
|
+
}
|
|
2321
|
+
|
|
2322
|
+
const codexResults: AcpSessionInfo[] = [];
|
|
2323
|
+
const existingIds2 = new Set(results.map((r) => r.sessionId));
|
|
2324
|
+
for (const [sessionId, info] of codexSessions) {
|
|
2325
|
+
if (existingIds2.has(sessionId)) continue;
|
|
2326
|
+
codexResults.push({
|
|
2327
|
+
sessionId,
|
|
2328
|
+
cwd: "",
|
|
2329
|
+
title: titleMap.get(sessionId) ?? info.title,
|
|
2330
|
+
updatedAt: info.updatedAt ? new Date(info.updatedAt).toISOString() : undefined,
|
|
2331
|
+
agent: "codex",
|
|
2332
|
+
});
|
|
2333
|
+
}
|
|
2334
|
+
codexResults.sort((a, b) => (b.updatedAt ?? "").localeCompare(a.updatedAt ?? ""));
|
|
2335
|
+
results.push(...codexResults);
|
|
2336
|
+
} catch {
|
|
2337
|
+
// Codex history missing or unreadable — skip
|
|
2338
|
+
}
|
|
2339
|
+
|
|
2259
2340
|
return results;
|
|
2260
2341
|
}
|
|
2261
2342
|
|
|
@@ -2444,7 +2525,12 @@ function normalizeTranscriptMessages(lines: string[], limit: number): ChatHistor
|
|
|
2444
2525
|
for (const line of lines) {
|
|
2445
2526
|
try {
|
|
2446
2527
|
const parsed = JSON.parse(line);
|
|
2447
|
-
|
|
2528
|
+
|
|
2529
|
+
// Support both OpenClaw/Claude format (parsed.message) and Codex format (parsed.payload with type=response_item)
|
|
2530
|
+
let msg: Record<string, unknown> | undefined = parsed?.message;
|
|
2531
|
+
if (!msg && parsed?.type === "response_item" && parsed?.payload?.role) {
|
|
2532
|
+
msg = parsed.payload;
|
|
2533
|
+
}
|
|
2448
2534
|
if (!msg) continue;
|
|
2449
2535
|
|
|
2450
2536
|
const role = typeof msg.role === "string" ? msg.role : "";
|
package/src/cluster-service.ts
CHANGED
|
@@ -16,6 +16,7 @@ import { AcpProxy, readAllSessionStoresFromDisk } from "./acp-proxy.ts";
|
|
|
16
16
|
import { TerminalManager } from "./terminal.ts";
|
|
17
17
|
import { WebHandler } from "./web.ts";
|
|
18
18
|
import { KnowledgeSync } from "./knowledge-sync.ts";
|
|
19
|
+
import { HealthTracker } from "./health-tracker.ts";
|
|
19
20
|
import { SentinelManager } from "./sentinel-manager.ts";
|
|
20
21
|
import type {
|
|
21
22
|
AnyClusterFrame,
|
|
@@ -28,6 +29,7 @@ import type {
|
|
|
28
29
|
HandoffInputRequired,
|
|
29
30
|
HandoffInput,
|
|
30
31
|
KnowledgeSyncFrame,
|
|
32
|
+
HealthSyncFrame,
|
|
31
33
|
ModelRequest,
|
|
32
34
|
ModelResponse,
|
|
33
35
|
ModelStreamChunk,
|
|
@@ -85,6 +87,7 @@ export class ClusterRuntime {
|
|
|
85
87
|
readonly acpProxy: AcpProxy | null;
|
|
86
88
|
readonly terminalManager: TerminalManager;
|
|
87
89
|
knowledgeSync: KnowledgeSync | null = null;
|
|
90
|
+
healthTracker: HealthTracker;
|
|
88
91
|
webHandler: WebHandler | null = null;
|
|
89
92
|
private sentinelManager: SentinelManager | null = null;
|
|
90
93
|
private logger: PluginLogger;
|
|
@@ -104,6 +107,10 @@ export class ClusterRuntime {
|
|
|
104
107
|
const acpEnabled = config.acp?.enabled || (openclawConfig as Record<string, any>).acp?.enabled;
|
|
105
108
|
this.acpProxy = acpEnabled ? new AcpProxy(config, this.peerManager, openclawConfig as Record<string, unknown>, gatewayInfo) : null;
|
|
106
109
|
this.terminalManager = new TerminalManager(config, this.peerManager);
|
|
110
|
+
this.healthTracker = new HealthTracker({
|
|
111
|
+
nodeId: config.nodeId,
|
|
112
|
+
peerManager: this.peerManager,
|
|
113
|
+
});
|
|
107
114
|
}
|
|
108
115
|
|
|
109
116
|
start() {
|
|
@@ -115,11 +122,15 @@ export class ClusterRuntime {
|
|
|
115
122
|
this.peerManager.on("peerConnected", (nodeId) => {
|
|
116
123
|
this.logger.info(`[clawmatrix] Peer connected: ${nodeId}`);
|
|
117
124
|
this.refreshDiscoveredModels();
|
|
125
|
+
this.healthTracker.recordPeerOnline(nodeId, "direct");
|
|
126
|
+
this.healthTracker.initPeerSync(nodeId);
|
|
118
127
|
});
|
|
119
128
|
|
|
120
129
|
this.peerManager.on("peerDisconnected", (nodeId) => {
|
|
121
130
|
this.logger.info(`[clawmatrix] Peer disconnected: ${nodeId}`);
|
|
122
131
|
this.refreshDiscoveredModels();
|
|
132
|
+
this.healthTracker.recordPeerOffline(nodeId);
|
|
133
|
+
this.healthTracker.removePeerSync(nodeId);
|
|
123
134
|
});
|
|
124
135
|
|
|
125
136
|
this.peerManager.on("peerCapabilitiesChanged", () => {
|
|
@@ -132,6 +143,7 @@ export class ClusterRuntime {
|
|
|
132
143
|
this.peerManager.setHttpHandler((req, res) => this.webHandler!.handle(req, res));
|
|
133
144
|
// Enable satellite tool routing through WebHandler
|
|
134
145
|
this.toolProxy.setSatelliteHandler(this.webHandler);
|
|
146
|
+
this.webHandler.setHealthTracker(this.healthTracker);
|
|
135
147
|
this.logger.info(`[clawmatrix] Web dashboard enabled on listen port`);
|
|
136
148
|
}
|
|
137
149
|
|
|
@@ -166,8 +178,9 @@ export class ClusterRuntime {
|
|
|
166
178
|
}
|
|
167
179
|
|
|
168
180
|
// Auto-detect ACP agents if ACP is enabled but no agents are explicitly configured
|
|
169
|
-
|
|
170
|
-
|
|
181
|
+
// Check both ClawMatrix and OpenClaw configs (consistent with acpProxy creation above)
|
|
182
|
+
if (this.acpProxy && (!this.config.acp?.agents || this.config.acp.agents.length === 0)) {
|
|
183
|
+
AcpProxy.detectAvailableAgents(this.config.acp?.commands).then((detected) => {
|
|
171
184
|
if (detected.length > 0) {
|
|
172
185
|
this.logger.info(`[clawmatrix] Auto-detected ACP agents: ${detected.map((a) => a.id).join(", ")}`);
|
|
173
186
|
this.peerManager.updateAcpAgents(detected);
|
|
@@ -177,6 +190,11 @@ export class ClusterRuntime {
|
|
|
177
190
|
});
|
|
178
191
|
}
|
|
179
192
|
|
|
193
|
+
// Health tracker (always enabled, no config gate)
|
|
194
|
+
this.healthTracker.start().catch((err) => {
|
|
195
|
+
this.logger.error(`[clawmatrix] Health tracker failed to start: ${err}`);
|
|
196
|
+
});
|
|
197
|
+
|
|
180
198
|
// Start subsystems
|
|
181
199
|
this.peerManager.start();
|
|
182
200
|
this.modelProxy.start();
|
|
@@ -213,6 +231,7 @@ export class ClusterRuntime {
|
|
|
213
231
|
// NOTE: intentionally do NOT stop sentinel here.
|
|
214
232
|
// Sentinel must survive gateway shutdown — that's its entire purpose.
|
|
215
233
|
// It will be replaced by killOldSentinel() on next gateway start.
|
|
234
|
+
await this.healthTracker.stop();
|
|
216
235
|
await this.knowledgeSync?.stop();
|
|
217
236
|
this.webHandler?.destroy();
|
|
218
237
|
this.handoffManager.destroy();
|
|
@@ -316,6 +335,9 @@ export class ClusterRuntime {
|
|
|
316
335
|
this.logger.error(`[clawmatrix] Knowledge sync error: ${err}`);
|
|
317
336
|
});
|
|
318
337
|
break;
|
|
338
|
+
case "health_sync":
|
|
339
|
+
this.healthTracker.handleSyncMessage(frame as HealthSyncFrame);
|
|
340
|
+
break;
|
|
319
341
|
case "acp_req":
|
|
320
342
|
if (this.acpProxy) {
|
|
321
343
|
this.acpProxy.handleRequest(frame as AcpTaskRequest).catch((err) => {
|
|
@@ -0,0 +1,581 @@
|
|
|
1
|
+
import * as Automerge from "@automerge/automerge";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { readFile, writeFile, mkdir } from "node:fs/promises";
|
|
4
|
+
import { homedir, tmpdir } from "node:os";
|
|
5
|
+
|
|
6
|
+
import { debug } from "./debug.ts";
|
|
7
|
+
import type { PeerManager } from "./peer-manager.ts";
|
|
8
|
+
import type { HealthSyncFrame } from "./types.ts";
|
|
9
|
+
|
|
10
|
+
/** Tag passed to `debug()` for every log line emitted by this module. */
const TAG = "health";

/** Default retention window for health events: 90 days, in milliseconds. */
const DEFAULT_RETENTION_MS = 90 * 86_400_000;

/** Interval between periodic compaction passes: 24 hours, in milliseconds. */
const COMPACT_INTERVAL = 86_400_000;

/** Debounce delay for persisting the document: 5 seconds, in milliseconds. */
const SAVE_DEBOUNCE = 5 * 1000;
|
|
20
|
+
|
|
21
|
+
// ── Document schema ─────────────────────────────────────────────
|
|
22
|
+
|
|
23
|
+
/** One entry in a node's health log. */
export interface HealthEvent {
  /** Event time as epoch milliseconds (producers in this file use `Date.now()`). */
  ts: number;
  /**
   * `start` / `stop` mark the recording node's own lifecycle;
   * `peer_online` / `peer_offline` are observations about the node named in `peer`.
   */
  type: "start" | "stop" | "peer_online" | "peer_offline";
  /** Node id the observation is about (set for the `peer_*` event types). */
  peer?: string;
  /** How the peer was reached: "direct" | "relay". */
  via?: string;
  /** Disconnect reason, when one is known. */
  reason?: string;
}
|
|
30
|
+
|
|
31
|
+
/** Per-node slice of the shared health document. */
interface NodeHealthEntry {
  /** Events appended for this node. */
  events: HealthEvent[];
  /** Epoch milliseconds of the most recent append to `events`. */
  lastUpdated: number;
}
|
|
35
|
+
|
|
36
|
+
/** Root shape of the Automerge health document. */
export interface HealthDoc {
  /** Health entries keyed by node id. */
  nodes: { [nodeId: string]: NodeHealthEntry };
}
|
|
39
|
+
|
|
40
|
+
// ── Timeline aggregation ────────────────────────────────────────
|
|
41
|
+
|
|
42
|
+
/**
 * Aggregated state of one timeline bucket.
 * `unknown` is used when no part of the bucket was observable.
 */
export type BucketState =
  | "up"
  | "degraded"
  | "down"
  | "unknown";
|
|
43
|
+
|
|
44
|
+
/** Availability timeline computed for a single node. */
export interface NodeTimeline {
  /** Node the timeline describes. */
  nodeId: string;
  /** Timestamp (epoch ms) of the node's earliest recorded event. */
  firstSeen: number;
  /** The node entry's `lastUpdated` value (epoch ms). */
  lastSeen: number;
  /** One state per bucket, oldest bucket first. */
  buckets: BucketState[];
  /** Online time divided by observable time over the whole window. */
  uptimeRatio: number;
}
|
|
51
|
+
|
|
52
|
+
/** Result of an availability query over a fixed look-back window. */
export interface AvailabilityResult {
  /** Requested window label, e.g. "24h". */
  range: string;
  /** Width of each bucket, in minutes. */
  bucketMinutes: number;
  /** Window start (epoch ms). */
  startTs: number;
  /** Window end (epoch ms). */
  endTs: number;
  /** One timeline per node that has recorded events. */
  nodes: NodeTimeline[];
  /** Time periods when the observing node was down (cannot observe), as [start, end] pairs. */
  gaps: [number, number][];
}
|
|
61
|
+
|
|
62
|
+
// ── HealthTracker ───────────────────────────────────────────────
|
|
63
|
+
|
|
64
|
+
/** Construction options for `HealthTracker`. */
export interface HealthTrackerOptions {
  /** Id of the local node; events are recorded under this key. */
  nodeId: string;
  /** Peer manager whose router is used to send sync frames. */
  peerManager: PeerManager;
  /** Event retention window in milliseconds; defaults to `DEFAULT_RETENTION_MS`. */
  retentionMs?: number;
  /** Override state directory (for tests). */
  stateDir?: string;
}
|
|
71
|
+
|
|
72
|
+
export class HealthTracker {
|
|
73
|
+
/** Local replica of the shared health document. */
private doc: Automerge.Doc<HealthDoc>;
/** Automerge sync state per peer, keyed by peer node id. */
private syncStates = new Map<string, Automerge.SyncState>();
/** Id of the local node. */
private readonly nodeId: string;
/** Used to reach the router when sending sync frames. */
private readonly peerManager: PeerManager;
/** Events older than this many milliseconds are eligible for compaction. */
private readonly retentionMs: number;
/** Absolute path of the persisted document file. */
private readonly docPath: string;
/** Handle of the periodic compaction timer, or null when not running. */
private compactTimer: ReturnType<typeof setInterval> | null = null;
/** Handle of the pending debounced save, or null when none is scheduled. */
private saveTimer: ReturnType<typeof setTimeout> | null = null;
/** NOTE(review): presumably flags unsaved changes; its users are outside this view. */
private dirty = false;
|
|
82
|
+
|
|
83
|
+
/**
 * Create a tracker with an empty in-memory document.
 *
 * The persisted document (if any) is only read later by `start()`, which
 * replaces the document built here.
 *
 * @param opts - Node identity, peer manager, and optional retention/state-dir overrides.
 */
constructor(opts: HealthTrackerOptions) {
  this.nodeId = opts.nodeId;
  this.peerManager = opts.peerManager;
  this.retentionMs = opts.retentionMs ?? DEFAULT_RETENTION_MS;

  const stateDir = opts.stateDir ?? path.join(homedir() || tmpdir(), ".openclaw", "clawmatrix");
  this.docPath = path.join(stateDir, "health.automerge");

  // Every node must start from the *same* initial change. If each node
  // created `nodes = {}` under its own random actor, the root `nodes` key
  // would hold two concurrent map objects after the first sync; Automerge
  // keeps only one visible, silently hiding the other node's history.
  // A fixed actor id plus a fixed timestamp makes the seeding change
  // byte-identical (same hash) on every node, so merges share one map.
  const seed = Automerge.change(
    Automerge.init<HealthDoc>({ actor: "00000000000000000000000000000000" }),
    { time: 0 },
    (d) => {
      (d as HealthDoc).nodes = {};
    },
  );

  // Fork so that subsequent local changes are written under this replica's
  // own (random) actor id rather than the shared seed actor.
  // NOTE(review): documents persisted before this change keep their old,
  // per-node root; they are loaded as-is by `load()`.
  this.doc = Automerge.clone(seed);
}
|
|
97
|
+
|
|
98
|
+
/**
 * Bring the tracker online: load the persisted document, log a `start`
 * event for this node, prune old events once, and arm the periodic
 * compaction timer.
 */
async start() {
  // Restore whatever was persisted by a previous run (if anything).
  await this.load();

  // Mark the beginning of this node's current uptime period.
  const startedAt = Date.now();
  this.recordEvent({ ts: startedAt, type: "start" });

  // Prune immediately, then again every COMPACT_INTERVAL.
  const runCompaction = () => this.compact();
  runCompaction();
  this.compactTimer = setInterval(runCompaction, COMPACT_INTERVAL);

  debug(TAG, `health tracker started for node "${this.nodeId}"`);
}
|
|
113
|
+
|
|
114
|
+
/**
 * Shut the tracker down: log a `stop` event for this node, cancel both
 * timers, and write the document to disk one last time.
 */
async stop() {
  // Close this node's current uptime period.
  this.recordEvent({ ts: Date.now(), type: "stop" });

  // Cancel periodic compaction.
  if (this.compactTimer !== null && this.compactTimer !== undefined) {
    clearInterval(this.compactTimer);
    this.compactTimer = null;
  }

  // Cancel any pending debounced save; the explicit save below supersedes it.
  if (this.saveTimer !== null && this.saveTimer !== undefined) {
    clearTimeout(this.saveTimer);
    this.saveTimer = null;
  }

  await this.save();
  debug(TAG, "health tracker stopped");
}
|
|
131
|
+
|
|
132
|
+
// ── Event recording ─────────────────────────────────────────
|
|
133
|
+
|
|
134
|
+
/**
 * Append an event to the local node's log, then schedule a save and push
 * the change to peers.
 *
 * @param event - Event to record. Optional fields that are `undefined`
 *   are omitted from the stored copy.
 */
recordEvent(event: HealthEvent) {
  // Automerge refuses to store `undefined` (it is not a JSON value) and
  // throws a RangeError from inside `change`. Callers routinely pass
  // optional fields as undefined (e.g. a disconnect without a reason), so
  // copy only the fields that actually carry a value.
  const stored: HealthEvent = { ts: event.ts, type: event.type };
  if (event.peer !== undefined) stored.peer = event.peer;
  if (event.via !== undefined) stored.via = event.via;
  if (event.reason !== undefined) stored.reason = event.reason;

  this.doc = Automerge.change(this.doc, (d) => {
    // Lazily create this node's entry on its first event.
    if (!d.nodes[this.nodeId]) {
      d.nodes[this.nodeId] = { events: [], lastUpdated: 0 };
    }
    const entry = d.nodes[this.nodeId]!;
    entry.events.push(stored);
    entry.lastUpdated = Date.now();
  });

  this.scheduleSave();
  this.broadcastSync();
}
|
|
146
|
+
|
|
147
|
+
/**
 * Record that `peerId` became reachable from this node.
 *
 * @param peerId - Node id of the peer.
 * @param via - Whether the peer is reached directly or through a relay.
 */
recordPeerOnline(peerId: string, via: "direct" | "relay") {
  const observation: HealthEvent = {
    ts: Date.now(),
    type: "peer_online",
    peer: peerId,
    via,
  };
  this.recordEvent(observation);
}
|
|
150
|
+
|
|
151
|
+
/**
 * Record that `peerId` is no longer reachable from this node.
 *
 * @param peerId - Node id of the peer.
 * @param reason - Optional disconnect reason; left off the event when absent.
 */
recordPeerOffline(peerId: string, reason?: string) {
  const event: HealthEvent = { ts: Date.now(), type: "peer_offline", peer: peerId };
  // Only attach `reason` when one was given: an explicit `reason: undefined`
  // property cannot be stored in an Automerge document.
  if (reason !== undefined) event.reason = reason;
  this.recordEvent(event);
}
|
|
154
|
+
|
|
155
|
+
// ── Sync protocol ──────────────────────────────────────────
|
|
156
|
+
|
|
157
|
+
/**
 * Handle an incoming `health_sync` frame from a peer: apply the Automerge
 * sync message to the local document, remember the updated sync state for
 * that peer, schedule a save, and send our reply.
 *
 * Failures (including a malformed payload) are logged and swallowed so a
 * bad frame from one peer cannot throw into the frame dispatcher.
 *
 * @param frame - Sync frame; `payload.data` is a base64-encoded sync message.
 */
handleSyncMessage(frame: HealthSyncFrame) {
  const peerId = frame.from;

  try {
    // Decode inside the try block: frames come off the network, and
    // Buffer.from throws a TypeError when `data` is missing or not a string.
    const encoded: unknown = frame.payload?.data;
    if (typeof encoded !== "string") {
      throw new TypeError("payload.data is not a base64 string");
    }
    const message = new Uint8Array(Buffer.from(encoded, "base64"));

    const syncState = this.syncStates.get(peerId) ?? Automerge.initSyncState();
    const [newDoc, newSyncState] = Automerge.receiveSyncMessage(this.doc, syncState, message);
    this.doc = newDoc;
    this.syncStates.set(peerId, newSyncState);
    this.scheduleSave();

    // Send our response
    this.sendSyncMessage(peerId);
  } catch (err) {
    debug(TAG, `error handling sync from ${peerId}: ${err}`);
  }
}
|
|
176
|
+
|
|
177
|
+
/**
 * Begin syncing with a peer (called on peer connect): reset the peer's
 * sync state and send the opening sync message. A no-op when `peerId` is
 * the local node.
 */
initPeerSync(peerId: string) {
  const isSelf = peerId === this.nodeId;
  if (!isSelf) {
    // Start from a blank sync state for this connection.
    this.syncStates.set(peerId, Automerge.initSyncState());
    this.sendSyncMessage(peerId);
  }
}
|
|
183
|
+
|
|
184
|
+
/**
 * Forget the stored sync state for a peer (called on peer disconnect).
 *
 * @param peerId - Node id whose sync state should be dropped.
 */
removePeerSync(peerId: string) {
  const { syncStates } = this;
  syncStates.delete(peerId);
}
|
|
188
|
+
|
|
189
|
+
/**
 * Generate the next Automerge sync message for `peerId` and send it as a
 * `health_sync` frame. The peer's sync state is updated even when there is
 * nothing to send.
 */
private sendSyncMessage(peerId: string) {
  const previous = this.syncStates.get(peerId) ?? Automerge.initSyncState();
  const [next, outgoing] = Automerge.generateSyncMessage(this.doc, previous);
  this.syncStates.set(peerId, next);

  // A null message means Automerge has nothing to send to this peer.
  if (!outgoing) {
    return;
  }

  debug(TAG, `sending health sync to ${peerId} (${outgoing.byteLength} bytes)`);

  // The binary sync message travels base64-encoded inside the frame payload.
  const payload = { data: Buffer.from(outgoing).toString("base64") };
  const frame: HealthSyncFrame = {
    type: "health_sync",
    from: this.nodeId,
    to: peerId,
    timestamp: Date.now(),
    payload,
  };

  this.peerManager.router.sendTo(peerId, frame);
}
|
|
210
|
+
|
|
211
|
+
/** Send a sync message to every peer the router currently reports. */
private broadcastSync() {
  const { router } = this.peerManager;
  for (const { nodeId } of router.getAllPeers()) {
    this.sendSyncMessage(nodeId);
  }
}
|
|
217
|
+
|
|
218
|
+
// ── Timeline aggregation ──────────────────────────────────
|
|
219
|
+
|
|
220
|
+
/**
 * Build availability timeline for all known nodes.
 *
 * The window always ends at the current time. Bucket widths are 30 minutes
 * for "24h", 4 hours for "7d" and 1 day for "90d".
 *
 * @param range - "24h", "7d", or "90d". Any other runtime value (e.g. an
 *   unvalidated query-string parameter) falls back to "24h" instead of
 *   producing NaN timestamps.
 * @returns The window bounds, one timeline per node that has events, and
 *   the periods during which the local node was not observing.
 */
getAvailability(range: "24h" | "7d" | "90d" = "24h"): AvailabilityResult {
  const HOUR_MS = 60 * 60 * 1000;
  const presets = {
    "24h": { durationMs: 24 * HOUR_MS, bucketMinutes: 30 },
    "7d": { durationMs: 7 * 24 * HOUR_MS, bucketMinutes: 60 * 4 }, // 4-hour buckets
    "90d": { durationMs: 90 * 24 * HOUR_MS, bucketMinutes: 60 * 24 }, // 1-day buckets
  } as const;

  // Types are erased at runtime, so guard against values outside the union.
  const effectiveRange: "24h" | "7d" | "90d" =
    Object.prototype.hasOwnProperty.call(presets, range) ? range : "24h";
  const { durationMs, bucketMinutes } = presets[effectiveRange];

  const now = Date.now();
  const startTs = now - durationMs;
  const endTs = now;
  const bucketMs = bucketMinutes * 60 * 1000;
  const bucketCount = Math.ceil(durationMs / bucketMs);

  // Find observation gaps (periods where THIS node was down)
  const gaps = this.getObservationGaps(this.nodeId, startTs, endTs);

  // Build timeline for each node (including self).
  // For self, start/stop events determine uptime; for other nodes,
  // peer_online/peer_offline observations are used.
  const nodes: NodeTimeline[] = [];
  for (const [nodeId, entry] of Object.entries(this.doc.nodes)) {
    const timeline = this.buildNodeTimeline(
      nodeId,
      entry,
      startTs,
      endTs,
      bucketMs,
      bucketCount,
      gaps,
    );
    if (timeline) nodes.push(timeline);
  }

  return {
    range: effectiveRange,
    bucketMinutes,
    startTs,
    endTs,
    nodes,
    gaps,
  };
}
|
|
279
|
+
|
|
280
|
+
/**
 * Compute the bucketed timeline and overall uptime ratio for one node.
 *
 * A bucket with no observable time is "unknown". Otherwise the share of
 * observable time the node was online decides the state: >= 95% is "up",
 * >= 5% is "degraded", anything lower is "down".
 *
 * @param nodeId - Node the timeline is for.
 * @param entry - That node's entry in the health document.
 * @param startTs - Window start (epoch ms).
 * @param endTs - Window end (epoch ms).
 * @param bucketMs - Bucket width in milliseconds.
 * @param bucketCount - Number of buckets in the window.
 * @param observerGaps - Periods when the local node was not observing.
 * @returns The timeline, or null when the node has no events.
 */
private buildNodeTimeline(
  nodeId: string,
  entry: NodeHealthEntry,
  startTs: number,
  endTs: number,
  bucketMs: number,
  bucketCount: number,
  observerGaps: Array<[number, number]>,
): NodeTimeline | null {
  const events = [...entry.events].sort((a, b) => a.ts - b.ts);
  if (events.length === 0) return null;

  const firstSeen = events[0]!.ts;
  const lastSeen = entry.lastUpdated;

  // Build online intervals for this node
  const intervals = this.buildOnlineIntervals(nodeId, events, startTs, endTs);

  // Calculate per-bucket state
  const buckets: BucketState[] = [];
  let totalOnline = 0;
  let totalObservable = 0;

  for (let i = 0; i < bucketCount; i++) {
    const bStart = startTs + i * bucketMs;
    const bEnd = Math.min(bStart + bucketMs, endTs);

    // How much of this bucket is observable (subtract observer gaps)
    const observableMs = this.observableTimeInRange(bStart, bEnd, observerGaps);

    if (observableMs === 0) {
      buckets.push("unknown");
      continue;
    }

    // Online time must be measured over the same observable portion as the
    // denominator. An online interval can span an observer gap (e.g. the
    // observer crashed before logging peer_offline); counting that time
    // would push the ratio above 1.
    let onlineMs = 0;
    for (const [iStart, iEnd] of intervals) {
      const clippedStart = Math.max(iStart, bStart);
      const clippedEnd = Math.min(iEnd, bEnd);
      if (clippedStart < clippedEnd) {
        onlineMs += this.observableTimeInRange(clippedStart, clippedEnd, observerGaps);
      }
    }
    const ratio = onlineMs / observableMs;

    totalOnline += onlineMs;
    totalObservable += observableMs;

    if (ratio >= 0.95) buckets.push("up");
    else if (ratio >= 0.05) buckets.push("degraded");
    else buckets.push("down");
  }

  const uptimeRatio = totalObservable > 0 ? totalOnline / totalObservable : 0;

  return { nodeId, firstSeen, lastSeen, buckets, uptimeRatio };
}
|
|
331
|
+
|
|
332
|
+
/**
 * Pick the interval source for a node.
 * - Local node: its own start/stop events (`events`).
 * - Any other node: peer_online/peer_offline observations that reference
 *   it, gathered from every node's log in the merged document.
 *
 * @returns [start, end] pairs (epoch ms) during which the node counts as online.
 */
private buildOnlineIntervals(
  nodeId: string,
  events: HealthEvent[],
  startTs: number,
  endTs: number,
): Array<[number, number]> {
  const isLocalNode = nodeId === this.nodeId;
  return isLocalNode
    ? this.buildSelfIntervals(events, startTs, endTs)
    : this.buildPeerIntervals(nodeId, startTs, endTs);
}
|
|
357
|
+
|
|
358
|
+
/**
 * Turn a time-ordered list of start/stop events into online intervals
 * clipped to [startTs, endTs].
 *
 * Events before the window only establish whether the node was already
 * online when the window opens. An interval still open after the last
 * in-window event is closed at `endTs`.
 *
 * @param events - Events sorted by ascending `ts`.
 */
private buildSelfIntervals(
  events: HealthEvent[],
  startTs: number,
  endTs: number,
): Array<[number, number]> {
  const spans: Array<[number, number]> = [];
  let openedAt: number | null = null;

  for (const { ts, type } of events) {
    const beforeWindow = ts < startTs;
    if (!beforeWindow && ts > endTs) break;

    if (type === "start") {
      // A start always (re)opens the current period.
      openedAt = ts;
      continue;
    }
    if (type !== "stop") continue;

    // A stop inside the window closes the open period, if there is one;
    // a stop before the window just clears the carried-over state.
    if (!beforeWindow && openedAt !== null) {
      spans.push([Math.max(openedAt, startTs), ts]);
    }
    openedAt = null;
  }

  // Still online at the end of the window.
  if (openedAt !== null) {
    spans.push([Math.max(openedAt, startTs), endTs]);
  }

  return spans;
}
|
|
390
|
+
|
|
391
|
+
/**
 * Build online intervals for a remote node from peer_online/peer_offline
 * events that reference it, taken from every node's log and merged into a
 * single time-ordered stream.
 *
 * Inside the window a peer_online is ignored while an interval is already
 * open; before the window the latest event simply sets the initial state.
 *
 * @param targetNodeId - Node the observations must reference.
 */
private buildPeerIntervals(
  targetNodeId: string,
  startTs: number,
  endTs: number,
): Array<[number, number]> {
  // Observations about the target, from all observers, oldest first.
  const observations = Object.values(this.doc.nodes)
    .flatMap((entry) =>
      entry.events.filter(
        (ev) =>
          ev.peer === targetNodeId &&
          (ev.type === "peer_online" || ev.type === "peer_offline"),
      ),
    )
    .sort((a, b) => a.ts - b.ts);

  const spans: Array<[number, number]> = [];
  let openedAt: number | null = null;

  for (const { ts, type } of observations) {
    const cameOnline = type === "peer_online";

    if (ts < startTs) {
      // Pre-window events only establish the state at window start.
      openedAt = cameOnline ? ts : null;
      continue;
    }
    if (ts > endTs) break;

    if (cameOnline) {
      if (openedAt === null) openedAt = ts;
    } else if (openedAt !== null) {
      spans.push([Math.max(openedAt, startTs), ts]);
      openedAt = null;
    }
  }

  // Still online at the end of the window.
  if (openedAt !== null) {
    spans.push([Math.max(openedAt, startTs), endTs]);
  }

  return spans;
}
|
|
433
|
+
|
|
434
|
+
/**
 * Get observation gaps: the parts of [startTs, endTs] not covered by the
 * given node's start/stop intervals. When the node has no entry at all,
 * the whole range is returned as a single gap.
 */
private getObservationGaps(
  nodeId: string,
  startTs: number,
  endTs: number,
): Array<[number, number]> {
  const entry = this.doc.nodes[nodeId];
  if (!entry) {
    return [[startTs, endTs]];
  }

  const ordered = [...entry.events].sort((a, b) => a.ts - b.ts);
  const covered = this.buildSelfIntervals(ordered, startTs, endTs);

  // Sweep left to right; anything between the sweep position and the next
  // covered interval is a gap.
  const gaps: Array<[number, number]> = [];
  let sweep = startTs;
  for (const [from, to] of covered) {
    if (sweep < from) {
      gaps.push([sweep, from]);
    }
    if (to > sweep) {
      sweep = to;
    }
  }

  // Trailing gap after the last covered interval.
  if (sweep < endTs) {
    gaps.push([sweep, endTs]);
  }

  return gaps;
}
|
|
466
|
+
|
|
467
|
+
/**
 * Length of [start, end] minus the portions overlapped by `gaps`,
 * never less than zero.
 */
private observableTimeInRange(
  start: number,
  end: number,
  gaps: Array<[number, number]>,
): number {
  const hidden = gaps.reduce((sum, [gStart, gEnd]) => {
    const overlap = Math.min(end, gEnd) - Math.max(start, gStart);
    return overlap > 0 ? sum + overlap : sum;
  }, 0);
  return Math.max(0, end - start - hidden);
}
|
|
483
|
+
|
|
484
|
+
/** Sum of the overlaps between each interval and the range [start, end]. */
private overlapMs(intervals: Array<[number, number]>, start: number, end: number): number {
  return intervals.reduce((sum, [iStart, iEnd]) => {
    const overlap = Math.min(end, iEnd) - Math.max(start, iStart);
    return overlap > 0 ? sum + overlap : sum;
  }, 0);
}
|
|
496
|
+
|
|
497
|
+
// ── Compact ───────────────────────────────────────────────
|
|
498
|
+
|
|
499
|
+
/**
 * Prune events older than the retention window from every node's log.
 *
 * For each node, the last event positioned before the cutoff is kept so
 * the state at the start of the retained range is still known; everything
 * ahead of it in the list is removed. When anything was pruned, the
 * document is re-serialized via `recompact()`.
 */
private compact() {
  const cutoff = Date.now() - this.retentionMs;
  let pruned = 0;

  this.doc = Automerge.change(this.doc, (d) => {
    for (const node of Object.values(d.nodes)) {
      const countBefore = node.events.length;

      // Scan from the end for the highest index whose event predates the cutoff.
      let lastBeforeCutoff = -1;
      for (let i = countBefore - 1; i >= 0; i--) {
        if (node.events[i]!.ts < cutoff) {
          lastBeforeCutoff = i;
          break;
        }
      }

      // Index 0 (or none) means there is nothing ahead of it to drop.
      if (lastBeforeCutoff <= 0) continue;

      node.events.splice(0, lastBeforeCutoff);
      pruned += countBefore - node.events.length;
    }
  });

  if (pruned > 0) {
    debug(TAG, `compacted ${pruned} old events`);
    // Re-save to discard old ops
    this.recompact();
  }
}
|
|
526
|
+
|
|
527
|
+
/**
 * Round-trip the document through `Automerge.save` / `Automerge.load` and
 * queue a disk write. The round-trip is intended to shed stored operation
 * history for data that has been removed.
 */
private recompact() {
  this.doc = Automerge.load<HealthDoc>(Automerge.save(this.doc));
  this.scheduleSave();
}
|
|
533
|
+
|
|
534
|
+
// ── Persistence ───────────────────────────────────────────
|
|
535
|
+
|
|
536
|
+
/**
 * Load the persisted health document from `docPath` into `this.doc`.
 *
 * Best-effort: this never throws. On any failure the current in-memory
 * document is left untouched. A missing file is the normal first-run case;
 * every other failure (unreadable file, undecodable document) is logged with
 * its cause so it is not mistaken for a first run.
 */
private async load() {
  try {
    const data = await readFile(this.docPath);
    this.doc = Automerge.load<HealthDoc>(new Uint8Array(data));
    debug(TAG, `loaded health doc from ${this.docPath}`);
  } catch (err) {
    // NOTE(review): assumes `readFile` is Node's fs API, which reports a
    // missing file as an Error carrying `code === "ENOENT"`.
    const missing = err instanceof Error && "code" in err && err.code === "ENOENT";
    if (missing) {
      debug(TAG, "no existing health doc, starting fresh");
    } else {
      debug(TAG, `failed to load health doc from ${this.docPath}, starting fresh: ${err}`);
    }
  }
}
|
|
545
|
+
|
|
546
|
+
/**
 * Serialize the current document and write it to `docPath`, creating the
 * parent directory first if needed. Failures are logged and swallowed, so
 * the returned promise does not reject because of them.
 */
private async save() {
  try {
    const serialized = Automerge.save(this.doc);
    const targetDir = path.dirname(this.docPath);
    await mkdir(targetDir, { recursive: true });
    await writeFile(this.docPath, Buffer.from(serialized));
  } catch (err) {
    debug(TAG, `failed to save health doc: ${err}`);
  }
}
|
|
555
|
+
|
|
556
|
+
/**
 * Mark the document dirty and make sure a deferred save is pending.
 *
 * Calls made while a timer is already armed only set the dirty flag; the
 * single pending timer fires after `SAVE_DEBOUNCE` and writes once.
 */
private scheduleSave() {
  this.dirty = true;
  if (this.saveTimer) return;

  const flush = () => {
    // Clear the handle first so a later call can arm a new timer.
    this.saveTimer = null;
    if (!this.dirty) return;

    this.dirty = false;
    this.save().catch((err) => {
      debug(TAG, `deferred save error: ${err}`);
    });
  };

  this.saveTimer = setTimeout(flush, SAVE_DEBOUNCE);
}
|
|
569
|
+
|
|
570
|
+
// ── Public accessors ──────────────────────────────────────
|
|
571
|
+
|
|
572
|
+
/**
 * List the ID of every node that has an entry in the health document.
 *
 * @returns The keys of `doc.nodes`.
 */
getKnownNodes(): string[] {
  const { nodes } = this.doc;
  return Object.keys(nodes);
}
|
|
576
|
+
|
|
577
|
+
/**
 * Return the recorded events for one node.
 *
 * @param nodeId - Node to look up.
 * @returns A new array holding the node's events (a shallow copy), or an
 *   empty array when the node is not in the document.
 */
getNodeEvents(nodeId: string): HealthEvent[] {
  const recorded = this.doc.nodes[nodeId]?.events;
  if (recorded == null) return [];
  return [...recorded];
}
|
|
581
|
+
}
|
package/src/model-proxy.ts
CHANGED
|
@@ -1090,13 +1090,28 @@ export class ModelProxy {
|
|
|
1090
1090
|
?? this.config.models.find((m) => m.id === payload.model)
|
|
1091
1091
|
: this.config.models.find((m) => m.id === payload.model);
|
|
1092
1092
|
if (!model) {
|
|
1093
|
+
// Model not available locally — try forwarding to a remote node that has it
|
|
1094
|
+
const originalTarget = frame.to;
|
|
1095
|
+
const exclude = new Set([this.config.nodeId]);
|
|
1096
|
+
if (originalTarget) exclude.add(originalTarget);
|
|
1097
|
+
const alternatives = this.peerManager.router.findNodesForModel(payload.model, exclude);
|
|
1098
|
+
for (const alt of alternatives) {
|
|
1099
|
+
if (this.peerManager.sendTo(alt.nodeId, { ...frame, to: alt.nodeId })) {
|
|
1100
|
+
debug("model_req", `failover: ${originalTarget ?? "local"} → ${alt.nodeId} for "${payload.model}"`);
|
|
1101
|
+
return;
|
|
1102
|
+
}
|
|
1103
|
+
}
|
|
1104
|
+
// No alternative found
|
|
1105
|
+
const hint = originalTarget && originalTarget !== this.config.nodeId
|
|
1106
|
+
? `Target node "${originalTarget}" is unreachable and no alternative nodes provide model "${payload.model}"`
|
|
1107
|
+
: `Model "${payload.model}" not available locally`;
|
|
1093
1108
|
this.peerManager.sendTo(from, {
|
|
1094
1109
|
type: "model_res",
|
|
1095
1110
|
id,
|
|
1096
1111
|
from: this.config.nodeId,
|
|
1097
1112
|
to: from,
|
|
1098
1113
|
timestamp: Date.now(),
|
|
1099
|
-
payload: { success: false, error:
|
|
1114
|
+
payload: { success: false, error: hint },
|
|
1100
1115
|
} satisfies ModelResponse);
|
|
1101
1116
|
return;
|
|
1102
1117
|
}
|
package/src/peer-manager.ts
CHANGED
|
@@ -604,8 +604,11 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
604
604
|
|
|
605
605
|
// ── Message handling ───────────────────────────────────────────
|
|
606
606
|
private onFrame(frame: AnyClusterFrame, from: Connection) {
|
|
607
|
-
// Ignore self-echo: frames with our own nodeId that were relayed back to us
|
|
608
|
-
|
|
607
|
+
// Ignore self-echo: frames with our own nodeId that were relayed back to us.
|
|
608
|
+
// Exception: frames from same-nodeId satellite connections (Mac/iOS client)
|
|
609
|
+
// are legitimate requests that must be processed or relayed.
|
|
610
|
+
const isSatellite = from.remoteNodeId === this.config.nodeId;
|
|
611
|
+
if (frame.from === this.config.nodeId && !isSatellite) return;
|
|
609
612
|
|
|
610
613
|
// Validate from field: must be the direct peer or a known node (relayed)
|
|
611
614
|
if (frame.from && frame.from !== from.remoteNodeId && !this.router.getRoute(frame.from)) {
|
|
@@ -674,8 +677,26 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
674
677
|
}
|
|
675
678
|
|
|
676
679
|
if (frame.to && frame.to !== this.config.nodeId) {
|
|
677
|
-
this.router.tryRelay(frame);
|
|
678
|
-
|
|
680
|
+
if (this.router.tryRelay(frame)) return;
|
|
681
|
+
|
|
682
|
+
// Relay failed — for model_req, try alternative nodes or fall through to local handling
|
|
683
|
+
if (frame.type === "model_req") {
|
|
684
|
+
const modelId = (frame as any).payload?.model;
|
|
685
|
+
if (modelId) {
|
|
686
|
+
const exclude = new Set([frame.to, this.config.nodeId]);
|
|
687
|
+
const alternatives = this.router.findNodesForModel(modelId, exclude);
|
|
688
|
+
for (const alt of alternatives) {
|
|
689
|
+
if (this.sendTo(alt.nodeId, { ...frame, to: alt.nodeId })) {
|
|
690
|
+
debug("peer", `model_req failover: ${frame.to} → ${alt.nodeId}`);
|
|
691
|
+
return;
|
|
692
|
+
}
|
|
693
|
+
}
|
|
694
|
+
}
|
|
695
|
+
// No remote alternative — fall through to local handling
|
|
696
|
+
// (model-proxy will handle locally or send error back)
|
|
697
|
+
} else {
|
|
698
|
+
return;
|
|
699
|
+
}
|
|
679
700
|
}
|
|
680
701
|
|
|
681
702
|
// Forward to same-nodeId satellite connection (e.g. Mac desktop app) so that
|
package/src/router.ts
CHANGED
|
@@ -233,6 +233,31 @@ export class Router {
|
|
|
233
233
|
return this.routes.get(target);
|
|
234
234
|
}
|
|
235
235
|
|
|
236
|
+
/**
 * Find currently reachable nodes that advertise a given model.
 *
 * A node is reachable when its own connection is open, or when it names a
 * relay (`reachableVia`) whose connection is open.
 *
 * @param modelId - Model ID to look for in each route's `models` list.
 * @param exclude - Node IDs to leave out of the result.
 * @returns Matching routes, ordered with nodes that have an open direct
 *   connection first, then by ascending `latencyMs` within each group.
 */
findNodesForModel(modelId: string, exclude?: Set<string>): RouteEntry[] {
  // Same predicate for filtering and ranking: a closed direct connection
  // must not rank a relay-only node as "direct".
  const isDirect = (entry: RouteEntry): boolean => Boolean(entry.connection?.isOpen);

  const candidates: RouteEntry[] = [];
  for (const entry of this.routes.values()) {
    if (exclude?.has(entry.nodeId)) continue;
    if (!entry.models.some((m) => m.id === modelId)) continue;

    if (isDirect(entry)) {
      candidates.push(entry);
    } else if (entry.reachableVia) {
      const relay = this.connections.get(entry.reachableVia);
      if (relay?.isOpen) candidates.push(entry);
    }
  }

  // Sort: direct first, then by latency
  candidates.sort((a, b) => {
    const aRank = isDirect(a) ? 0 : 1;
    const bRank = isDirect(b) ? 0 : 1;
    if (aRank !== bRank) return aRank - bRank;
    return a.latencyMs - b.latencyMs;
  });
  return candidates;
}
|
|
260
|
+
|
|
236
261
|
// ── Message sending and relay ──────────────────────────────────
|
|
237
262
|
/** Send a frame to a specific node, relaying if necessary. Returns true if sent. */
|
|
238
263
|
sendTo(targetNodeId: string, frame: ClusterFrame | AnyClusterFrame): boolean {
|
package/src/sentinel.ts
CHANGED
|
@@ -569,6 +569,28 @@ process.on("exit", (code) => {
|
|
|
569
569
|
try { process.stderr.write(`[svc ${ts}] Exit code=${code}\n`); } catch { /* ignore */ }
|
|
570
570
|
});
|
|
571
571
|
|
|
572
|
+
/**
 * Start a connection to every configured peer that has neither a tracked
 * connection nor a pending reconnect timer (called when gateway dies).
 */
function connectAllPeers() {
  for (const peer of config.peers) {
    const { nodeId } = peer;
    // Already connected, or a reconnect is already scheduled: leave it alone.
    if (connections.has(nodeId) || reconnectTimers.has(nodeId)) continue;
    connectToPeer(peer);
  }
}
|
|
580
|
+
|
|
581
|
+
/**
 * Close and forget every tracked peer connection, cancel every pending
 * reconnect timer, and reset the reconnect attempt counters (called when
 * gateway recovers).
 */
function disconnectAllPeers() {
  connections.forEach((conn, nodeId) => {
    conn.close(1000, "gateway recovered");
    connections.delete(nodeId);
  });

  reconnectTimers.forEach((timer, nodeId) => {
    clearTimeout(timer);
    reconnectTimers.delete(nodeId);
  });

  reconnectAttempts.clear();
}
|
|
593
|
+
|
|
572
594
|
/** Periodically check if the gateway process is still alive via kill(pid, 0). */
|
|
573
595
|
function startGatewayHealthCheck() {
|
|
574
596
|
if (healthCheckTimer || !gatewayPid) return;
|
|
@@ -581,11 +603,15 @@ function startGatewayHealthCheck() {
|
|
|
581
603
|
log("Gateway process detected — back online");
|
|
582
604
|
// Release the port so the gateway can reclaim it
|
|
583
605
|
stopListening();
|
|
606
|
+
// Disconnect from peers — gateway handles mesh connections
|
|
607
|
+
disconnectAllPeers();
|
|
584
608
|
}
|
|
585
609
|
} catch {
|
|
586
610
|
if (gatewayAlive) {
|
|
587
611
|
gatewayAlive = false;
|
|
588
612
|
log(`Gateway process (pid ${gatewayPid}) gone — entering standalone mode`);
|
|
613
|
+
// Connect to peers now that gateway is down
|
|
614
|
+
connectAllPeers();
|
|
589
615
|
// Take over the gateway's listen port
|
|
590
616
|
if (config.listenPort) {
|
|
591
617
|
// Small delay to let the OS release the port from the dead process
|
|
@@ -608,11 +634,7 @@ function boot() {
|
|
|
608
634
|
writePidFile();
|
|
609
635
|
log(`Started (pid ${process.pid}, gateway ${gatewayPid}, nodeId ${sentinelNodeId()}, takeover port ${config.listenPort || "none"})`);
|
|
610
636
|
|
|
611
|
-
//
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
}
|
|
615
|
-
|
|
616
|
-
// Note: we do NOT start listening here.
|
|
617
|
-
// Listening only starts when gateway dies (port takeover mode).
|
|
637
|
+
// Do NOT connect to peers on boot — gateway handles mesh connections.
|
|
638
|
+
// Sentinel only connects when gateway dies (standalone mode).
|
|
639
|
+
// Listening also only starts when gateway dies (port takeover mode).
|
|
618
640
|
}
|
package/src/types.ts
CHANGED
|
@@ -411,6 +411,14 @@ export interface KnowledgeSyncFrame extends ClusterFrame {
|
|
|
411
411
|
};
|
|
412
412
|
}
|
|
413
413
|
|
|
414
|
+
// ── Health sync ──────────────────────────────────────────────────
|
|
415
|
+
/** Cluster frame tagged `"health_sync"`, carrying one encoded sync message. */
export interface HealthSyncFrame extends ClusterFrame {
  type: "health_sync";
  payload: {
    /** Base64-encoded Automerge sync message. */
    data: string;
  };
}
|
|
421
|
+
|
|
414
422
|
// ── Diagnostic (sentinel) ────────────────────────────────────────
|
|
415
423
|
export interface DiagnosticExec extends ClusterFrame {
|
|
416
424
|
type: "diagnostic_exec";
|
|
@@ -831,4 +839,5 @@ export type AnyClusterFrame =
|
|
|
831
839
|
| TerminalData
|
|
832
840
|
| TerminalResize
|
|
833
841
|
| TerminalCloseRequest
|
|
834
|
-
| TerminalCloseResponse
|
|
842
|
+
| TerminalCloseResponse
|
|
843
|
+
| HealthSyncFrame;
|
package/src/web.ts
CHANGED
|
@@ -3,6 +3,7 @@ import type { PeerManager } from "./peer-manager.ts";
|
|
|
3
3
|
import type { HandoffManager } from "./handoff.ts";
|
|
4
4
|
import type { ClawMatrixConfig } from "./config.ts";
|
|
5
5
|
import type { SatelliteContext, IngestedEvent } from "./types.ts";
|
|
6
|
+
import type { HealthTracker } from "./health-tracker.ts";
|
|
6
7
|
import { timingSafeEqual } from "./auth.ts";
|
|
7
8
|
import { renderDashboard } from "./web-ui.ts";
|
|
8
9
|
import { readBody } from "./http-utils.ts";
|
|
@@ -46,6 +47,7 @@ export class WebHandler {
|
|
|
46
47
|
private ingestedEvents: IngestedEvent[] = []; // ring buffer for ingested events
|
|
47
48
|
private loginAttempts = new Map<string, { count: number; resetAt: number }>(); // IP → rate limit
|
|
48
49
|
private loginCleanupTimer: ReturnType<typeof setInterval> | null = null;
|
|
50
|
+
private healthTracker: HealthTracker | null = null;
|
|
49
51
|
private onPeerConnected: (nodeId: string) => void;
|
|
50
52
|
private onPeerDisconnected: (nodeId: string) => void;
|
|
51
53
|
|
|
@@ -91,6 +93,11 @@ export class WebHandler {
|
|
|
91
93
|
peerManager.on("peerDisconnected", this.onPeerDisconnected);
|
|
92
94
|
}
|
|
93
95
|
|
|
96
|
+
/**
 * Attach the health tracker that backs the availability API.
 *
 * @param tracker - Tracker instance stored on this handler.
 */
setHealthTracker(tracker: HealthTracker): void {
  this.healthTracker = tracker;
}
|
|
100
|
+
|
|
94
101
|
/** Clean up timers and pending requests on shutdown. */
|
|
95
102
|
destroy() {
|
|
96
103
|
// Remove event listeners to prevent post-destroy callbacks
|
|
@@ -181,6 +188,11 @@ export class WebHandler {
|
|
|
181
188
|
return;
|
|
182
189
|
}
|
|
183
190
|
|
|
191
|
+
if (path === "/api/availability" && req.method === "GET") {
|
|
192
|
+
this.handleAvailability(req, res);
|
|
193
|
+
return;
|
|
194
|
+
}
|
|
195
|
+
|
|
184
196
|
if (path === "/api/satellite/poll" && req.method === "GET") {
|
|
185
197
|
this.handleSatellitePoll(req, res);
|
|
186
198
|
return;
|
|
@@ -271,6 +283,27 @@ export class WebHandler {
|
|
|
271
283
|
}
|
|
272
284
|
}
|
|
273
285
|
|
|
286
|
+
/**
 * Serve the availability report as JSON.
 *
 * Reads the `range` query parameter (default `"24h"`; allowed values `24h`,
 * `7d`, `90d`) and answers with `healthTracker.getAvailability(range)`.
 *
 * Responses:
 * - 503 when no health tracker has been attached.
 * - 400 when the request URL cannot be parsed or `range` is not allowed.
 * - 200 with the availability result otherwise.
 *
 * @param req - Incoming request; only `req.url` is inspected.
 * @param res - Response the JSON body is written to.
 */
private handleAvailability(req: IncomingMessage, res: ServerResponse) {
  const sendJson = (status: number, body: unknown) => {
    res.writeHead(status, { "Content-Type": "application/json" });
    res.end(JSON.stringify(body));
  };

  if (!this.healthTracker) {
    sendJson(503, { error: "Health tracker not available" });
    return;
  }

  // Only the query string matters, so parse against a fixed base rather than
  // the client-supplied Host header: a malformed Host would make `new URL`
  // throw. A malformed request target can still throw, so report it as 400.
  let requested: string;
  try {
    const url = new URL(req.url ?? "/", "http://localhost");
    requested = url.searchParams.get("range") ?? "24h";
  } catch {
    sendJson(400, { error: "Invalid request URL" });
    return;
  }

  // Narrow by lookup so `range` gets its literal type without a cast.
  const allowedRanges = ["24h", "7d", "90d"] as const;
  const range = allowedRanges.find((candidate) => candidate === requested);
  if (!range) {
    sendJson(400, { error: "Invalid range. Use 24h, 7d, or 90d" });
    return;
  }

  sendJson(200, this.healthTracker.getAvailability(range));
}
|
|
306
|
+
|
|
274
307
|
private handleStatus(res: ServerResponse) {
|
|
275
308
|
const peers = this.peerManager.router.getAllPeers();
|
|
276
309
|
const localNode = {
|