clawmatrix 0.2.8 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clawmatrix",
3
- "version": "0.2.8",
3
+ "version": "0.2.9",
4
4
  "description": "Decentralized mesh cluster plugin for OpenClaw — inter-gateway communication, model proxy, task handoff, and tool proxy.",
5
5
  "type": "module",
6
6
  "license": "MIT",
package/src/acp-proxy.ts CHANGED
@@ -584,7 +584,7 @@ export class AcpProxy {
584
584
  let newSession: AcpSession;
585
585
  if (resumeAcpSessionId) {
586
586
  debug("acp", `Session "${sessionId}" not found, resuming via acpSessionId "${resumeAcpSessionId.slice(0, 8)}..."`);
587
- const effectiveCwd = cwd ?? process.cwd();
587
+ const effectiveCwd = cwd || process.cwd();
588
588
  try {
589
589
  newSession = await this.createSessionWithResume(agent, resumeAcpSessionId, effectiveCwd, from);
590
590
  } catch (resumeErr) {
@@ -611,7 +611,7 @@ export class AcpProxy {
611
611
  });
612
612
  }
613
613
  if (mode === "oneshot" && task) {
614
- this.returnToReusePool(newSession, cwd ?? process.cwd());
614
+ this.returnToReusePool(newSession, cwd || process.cwd());
615
615
  }
616
616
  } else {
617
617
  // Re-create native session
@@ -671,7 +671,7 @@ export class AcpProxy {
671
671
  });
672
672
  }
673
673
  if (mode === "oneshot" && task) {
674
- this.returnToReusePool(session, cwd ?? process.cwd());
674
+ this.returnToReusePool(session, cwd || process.cwd());
675
675
  }
676
676
  } else {
677
677
  // Create new native OpenClaw session
@@ -778,7 +778,8 @@ export class AcpProxy {
778
778
  /** Handle resume session request (receiver side): resume an ACP session and track it. */
779
779
  async handleResumeRequest(frame: AcpResumeRequest): Promise<void> {
780
780
  const { id, from, payload } = frame;
781
- const { agent, acpSessionId, cwd } = payload;
781
+ const { agent, acpSessionId } = payload;
782
+ const cwd = payload.cwd || process.cwd();
782
783
 
783
784
  try {
784
785
  // Enforce concurrent session limit
@@ -1089,6 +1090,30 @@ export class AcpProxy {
1089
1090
  }
1090
1091
  } catch { /* no claude projects directory */ }
1091
1092
 
1093
+ // 3. Search Codex sessions (~/.codex/sessions/YYYY/MM/DD/rollout-*-{sessionId}.jsonl)
1094
+ // Reverse sort so we search recent dates first (most likely to match)
1095
+ const codexSessionsDir = join(homedir(), ".codex", "sessions");
1096
+ try {
1097
+ const years = (await readdir(codexSessionsDir)).sort().reverse();
1098
+ for (const year of years) {
1099
+ const monthsDir = join(codexSessionsDir, year);
1100
+ let months: string[];
1101
+ try { months = (await readdir(monthsDir)).sort().reverse(); } catch { continue; }
1102
+ for (const month of months) {
1103
+ const daysDir = join(monthsDir, month);
1104
+ let days: string[];
1105
+ try { days = (await readdir(daysDir)).sort().reverse(); } catch { continue; }
1106
+ for (const day of days) {
1107
+ const dayDir = join(daysDir, day);
1108
+ let files: string[];
1109
+ try { files = await readdir(dayDir); } catch { continue; }
1110
+ const match = files.find((f) => f.endsWith(`-${sessionId}.jsonl`));
1111
+ if (match) return join(dayDir, match);
1112
+ }
1113
+ }
1114
+ }
1115
+ } catch { /* no codex sessions directory */ }
1116
+
1092
1117
  return null;
1093
1118
  }
1094
1119
 
@@ -1313,7 +1338,7 @@ export class AcpProxy {
1313
1338
  // ── Internal: ACP session management (receiver) ────────────────
1314
1339
 
1315
1340
  private async createSession(agent: string, cwd: string | undefined, from: string): Promise<AcpSession> {
1316
- const effectiveCwd = cwd ?? process.cwd();
1341
+ const effectiveCwd = cwd || process.cwd();
1317
1342
 
1318
1343
  // 1. Try daemon pool first (long-lived process, instant newSession)
1319
1344
  try {
@@ -2256,6 +2281,62 @@ async function fetchSessionListFromDisk(): Promise<AcpSessionInfo[]> {
2256
2281
  // history.jsonl missing or unreadable — skip
2257
2282
  }
2258
2283
 
2284
+ // 3. Codex session history (~/.codex/history.jsonl + session_index.jsonl)
2285
+ try {
2286
+ const codexDir = join(homedir(), ".codex");
2287
+ // Read session index for titles
2288
+ const titleMap = new Map<string, string>();
2289
+ try {
2290
+ const indexContent = await readFileText(join(codexDir, "session_index.jsonl"));
2291
+ for (const line of indexContent.split("\n")) {
2292
+ if (!line.trim()) continue;
2293
+ try {
2294
+ const entry = JSON.parse(line) as { id?: string; thread_name?: string };
2295
+ if (entry.id && entry.thread_name) titleMap.set(entry.id, entry.thread_name);
2296
+ } catch { /* skip */ }
2297
+ }
2298
+ } catch { /* index missing */ }
2299
+
2300
+ const codexHistoryPath = join(codexDir, "history.jsonl");
2301
+ const codexContent = await readFileTail(codexHistoryPath, 512 * 1024);
2302
+ const codexSessions = new Map<string, { title: string; updatedAt: number }>();
2303
+
2304
+ for (const line of codexContent.split("\n")) {
2305
+ if (!line.trim()) continue;
2306
+ try {
2307
+ const entry = JSON.parse(line) as { session_id?: string; ts?: number; text?: string };
2308
+ if (!entry.session_id) continue;
2309
+ const existing = codexSessions.get(entry.session_id);
2310
+ if (!existing) {
2311
+ const title = (entry.text ?? "").replace(/\s+/g, " ").trim().slice(0, 120);
2312
+ codexSessions.set(entry.session_id, {
2313
+ title,
2314
+ updatedAt: entry.ts ? entry.ts * 1000 : 0, // ts is in seconds
2315
+ });
2316
+ } else if (entry.ts && entry.ts * 1000 > existing.updatedAt) {
2317
+ existing.updatedAt = entry.ts * 1000;
2318
+ }
2319
+ } catch { /* skip */ }
2320
+ }
2321
+
2322
+ const codexResults: AcpSessionInfo[] = [];
2323
+ const existingIds2 = new Set(results.map((r) => r.sessionId));
2324
+ for (const [sessionId, info] of codexSessions) {
2325
+ if (existingIds2.has(sessionId)) continue;
2326
+ codexResults.push({
2327
+ sessionId,
2328
+ cwd: "",
2329
+ title: titleMap.get(sessionId) ?? info.title,
2330
+ updatedAt: info.updatedAt ? new Date(info.updatedAt).toISOString() : undefined,
2331
+ agent: "codex",
2332
+ });
2333
+ }
2334
+ codexResults.sort((a, b) => (b.updatedAt ?? "").localeCompare(a.updatedAt ?? ""));
2335
+ results.push(...codexResults);
2336
+ } catch {
2337
+ // Codex history missing or unreadable — skip
2338
+ }
2339
+
2259
2340
  return results;
2260
2341
  }
2261
2342
 
@@ -2444,7 +2525,12 @@ function normalizeTranscriptMessages(lines: string[], limit: number): ChatHistor
2444
2525
  for (const line of lines) {
2445
2526
  try {
2446
2527
  const parsed = JSON.parse(line);
2447
- const msg = parsed?.message;
2528
+
2529
+ // Support both OpenClaw/Claude format (parsed.message) and Codex format (parsed.payload with type=response_item)
2530
+ let msg: Record<string, unknown> | undefined = parsed?.message;
2531
+ if (!msg && parsed?.type === "response_item" && parsed?.payload?.role) {
2532
+ msg = parsed.payload;
2533
+ }
2448
2534
  if (!msg) continue;
2449
2535
 
2450
2536
  const role = typeof msg.role === "string" ? msg.role : "";
@@ -16,6 +16,7 @@ import { AcpProxy, readAllSessionStoresFromDisk } from "./acp-proxy.ts";
16
16
  import { TerminalManager } from "./terminal.ts";
17
17
  import { WebHandler } from "./web.ts";
18
18
  import { KnowledgeSync } from "./knowledge-sync.ts";
19
+ import { HealthTracker } from "./health-tracker.ts";
19
20
  import { SentinelManager } from "./sentinel-manager.ts";
20
21
  import type {
21
22
  AnyClusterFrame,
@@ -28,6 +29,7 @@ import type {
28
29
  HandoffInputRequired,
29
30
  HandoffInput,
30
31
  KnowledgeSyncFrame,
32
+ HealthSyncFrame,
31
33
  ModelRequest,
32
34
  ModelResponse,
33
35
  ModelStreamChunk,
@@ -85,6 +87,7 @@ export class ClusterRuntime {
85
87
  readonly acpProxy: AcpProxy | null;
86
88
  readonly terminalManager: TerminalManager;
87
89
  knowledgeSync: KnowledgeSync | null = null;
90
+ healthTracker: HealthTracker;
88
91
  webHandler: WebHandler | null = null;
89
92
  private sentinelManager: SentinelManager | null = null;
90
93
  private logger: PluginLogger;
@@ -104,6 +107,10 @@ export class ClusterRuntime {
104
107
  const acpEnabled = config.acp?.enabled || (openclawConfig as Record<string, any>).acp?.enabled;
105
108
  this.acpProxy = acpEnabled ? new AcpProxy(config, this.peerManager, openclawConfig as Record<string, unknown>, gatewayInfo) : null;
106
109
  this.terminalManager = new TerminalManager(config, this.peerManager);
110
+ this.healthTracker = new HealthTracker({
111
+ nodeId: config.nodeId,
112
+ peerManager: this.peerManager,
113
+ });
107
114
  }
108
115
 
109
116
  start() {
@@ -115,11 +122,15 @@ export class ClusterRuntime {
115
122
  this.peerManager.on("peerConnected", (nodeId) => {
116
123
  this.logger.info(`[clawmatrix] Peer connected: ${nodeId}`);
117
124
  this.refreshDiscoveredModels();
125
+ this.healthTracker.recordPeerOnline(nodeId, "direct");
126
+ this.healthTracker.initPeerSync(nodeId);
118
127
  });
119
128
 
120
129
  this.peerManager.on("peerDisconnected", (nodeId) => {
121
130
  this.logger.info(`[clawmatrix] Peer disconnected: ${nodeId}`);
122
131
  this.refreshDiscoveredModels();
132
+ this.healthTracker.recordPeerOffline(nodeId);
133
+ this.healthTracker.removePeerSync(nodeId);
123
134
  });
124
135
 
125
136
  this.peerManager.on("peerCapabilitiesChanged", () => {
@@ -132,6 +143,7 @@ export class ClusterRuntime {
132
143
  this.peerManager.setHttpHandler((req, res) => this.webHandler!.handle(req, res));
133
144
  // Enable satellite tool routing through WebHandler
134
145
  this.toolProxy.setSatelliteHandler(this.webHandler);
146
+ this.webHandler.setHealthTracker(this.healthTracker);
135
147
  this.logger.info(`[clawmatrix] Web dashboard enabled on listen port`);
136
148
  }
137
149
 
@@ -166,8 +178,9 @@ export class ClusterRuntime {
166
178
  }
167
179
 
168
180
  // Auto-detect ACP agents if ACP is enabled but no agents are explicitly configured
169
- if (this.acpProxy && this.config.acp?.enabled && (!this.config.acp.agents || this.config.acp.agents.length === 0)) {
170
- AcpProxy.detectAvailableAgents(this.config.acp.commands).then((detected) => {
181
+ // Check both ClawMatrix and OpenClaw configs (consistent with acpProxy creation above)
182
+ if (this.acpProxy && (!this.config.acp?.agents || this.config.acp.agents.length === 0)) {
183
+ AcpProxy.detectAvailableAgents(this.config.acp?.commands).then((detected) => {
171
184
  if (detected.length > 0) {
172
185
  this.logger.info(`[clawmatrix] Auto-detected ACP agents: ${detected.map((a) => a.id).join(", ")}`);
173
186
  this.peerManager.updateAcpAgents(detected);
@@ -177,6 +190,11 @@ export class ClusterRuntime {
177
190
  });
178
191
  }
179
192
 
193
+ // Health tracker (always enabled, no config gate)
194
+ this.healthTracker.start().catch((err) => {
195
+ this.logger.error(`[clawmatrix] Health tracker failed to start: ${err}`);
196
+ });
197
+
180
198
  // Start subsystems
181
199
  this.peerManager.start();
182
200
  this.modelProxy.start();
@@ -213,6 +231,7 @@ export class ClusterRuntime {
213
231
  // NOTE: intentionally do NOT stop sentinel here.
214
232
  // Sentinel must survive gateway shutdown — that's its entire purpose.
215
233
  // It will be replaced by killOldSentinel() on next gateway start.
234
+ await this.healthTracker.stop();
216
235
  await this.knowledgeSync?.stop();
217
236
  this.webHandler?.destroy();
218
237
  this.handoffManager.destroy();
@@ -316,6 +335,9 @@ export class ClusterRuntime {
316
335
  this.logger.error(`[clawmatrix] Knowledge sync error: ${err}`);
317
336
  });
318
337
  break;
338
+ case "health_sync":
339
+ this.healthTracker.handleSyncMessage(frame as HealthSyncFrame);
340
+ break;
319
341
  case "acp_req":
320
342
  if (this.acpProxy) {
321
343
  this.acpProxy.handleRequest(frame as AcpTaskRequest).catch((err) => {
@@ -0,0 +1,581 @@
1
+ import * as Automerge from "@automerge/automerge";
2
+ import path from "node:path";
3
+ import { readFile, writeFile, mkdir } from "node:fs/promises";
4
+ import { homedir, tmpdir } from "node:os";
5
+
6
+ import { debug } from "./debug.ts";
7
+ import type { PeerManager } from "./peer-manager.ts";
8
+ import type { HealthSyncFrame } from "./types.ts";
9
+
10
+ const TAG = "health";
11
+
12
+ /** Retention period for health events (default 90 days). */
13
+ const DEFAULT_RETENTION_MS = 90 * 24 * 60 * 60 * 1000;
14
+
15
+ /** Compact interval: every 24 hours. */
16
+ const COMPACT_INTERVAL = 24 * 60 * 60 * 1000;
17
+
18
+ /** Save debounce interval (5 seconds). */
19
+ const SAVE_DEBOUNCE = 5_000;
20
+
21
+ // ── Document schema ─────────────────────────────────────────────
22
+
23
+ export interface HealthEvent { // one availability event that a node records into its own entry of the health document
24
+ ts: number; // event time; every caller in this file passes Date.now() (epoch milliseconds)
25
+ type: "start" | "stop" | "peer_online" | "peer_offline"; // start/stop: the recording node itself; peer_online/peer_offline: a peer as seen by the recording node
26
+ peer?: string; // node id of the peer the event refers to (set by recordPeerOnline / recordPeerOffline)
27
+ via?: string; // "direct" | "relay"
28
+ reason?: string; // disconnect reason
29
+ }
30
+
31
+ interface NodeHealthEntry { // per-node slice of the shared health document
32
+ events: HealthEvent[]; // events appended by HealthTracker.recordEvent()
33
+ lastUpdated: number; // Date.now() at the most recent recordEvent() for this node
34
+ }
35
+
36
+ export interface HealthDoc { // root shape of the Automerge document held by HealthTracker
37
+ nodes: Record<string, NodeHealthEntry>; // keyed by node id
38
+ }
39
+
40
+ // ── Timeline aggregation ────────────────────────────────────────
41
+
42
+ export type BucketState = "up" | "degraded" | "down" | "unknown"; // per-bucket online/observable ratio: >= 0.95 up, >= 0.05 degraded, otherwise down; "unknown" when the bucket has no observable time
43
+
44
+ export interface NodeTimeline { // availability summary of one node over the requested window
45
+ nodeId: string;
46
+ firstSeen: number; // ts of the node's earliest recorded event
47
+ lastSeen: number; // the node entry's lastUpdated value
48
+ buckets: BucketState[]; // one state per time bucket, oldest first
49
+ uptimeRatio: number; // total online time divided by total observable time (0 when nothing was observable)
50
+ }
51
+
52
+ export interface AvailabilityResult { // value returned by HealthTracker.getAvailability()
53
+ range: string; // the range label the window was built for ("24h", "7d" or "90d")
54
+ bucketMinutes: number; // length of one bucket in minutes
55
+ startTs: number; // window start (epoch milliseconds)
56
+ endTs: number; // window end, i.e. Date.now() at the time of the call
57
+ nodes: NodeTimeline[]; // one timeline per node entry that has at least one event
58
+ /** Time periods when the observing node was down (cannot observe). */
59
+ gaps: Array<[number, number]>; // [start, end] pairs in epoch milliseconds
60
+ }
61
+
62
+ // ── HealthTracker ───────────────────────────────────────────────
63
+
64
+ export interface HealthTrackerOptions { // constructor options for HealthTracker
65
+ nodeId: string; // id of the local node; events are recorded under this key
66
+ peerManager: PeerManager; // its router is used to send health_sync frames and enumerate peers
67
+ retentionMs?: number; // how long events are kept before compaction; defaults to DEFAULT_RETENTION_MS (90 days)
68
+ /** Override state directory (for tests). */
69
+ stateDir?: string; // defaults to <home or tmp dir>/.openclaw/clawmatrix
70
+ }
71
+
72
/**
 * Records availability events (own start/stop, peers going online/offline) in an
 * Automerge document, persists that document to disk, exchanges it with peers via
 * `health_sync` frames, and aggregates the events into per-node availability timelines.
 *
 * NOTE(review): every tracker creates its own `nodes` map in the constructor. Confirm
 * that independently created maps merge as intended when two nodes sync for the first
 * time (Automerge treats concurrent assignments to the same key as a conflict).
 *
 * NOTE(review): `start()` is asynchronous and replaces the in-memory document once the
 * persisted file has been read; events recorded before that point would be dropped.
 * Confirm callers cannot record events before `start()` resolves.
 */
export class HealthTracker {
  /** Window length and bucket size for each supported availability range. */
  private static readonly RANGE_CONFIG: Record<
    "24h" | "7d" | "90d",
    { durationMs: number; bucketMinutes: number }
  > = {
    "24h": { durationMs: 24 * 60 * 60 * 1000, bucketMinutes: 30 },
    "7d": { durationMs: 7 * 24 * 60 * 60 * 1000, bucketMinutes: 60 * 4 }, // 4-hour buckets
    "90d": { durationMs: 90 * 24 * 60 * 60 * 1000, bucketMinutes: 60 * 24 }, // 1-day buckets
  };

  private doc: Automerge.Doc<HealthDoc>;
  private syncStates = new Map<string, Automerge.SyncState>();
  private readonly nodeId: string;
  private readonly peerManager: PeerManager;
  private readonly retentionMs: number;
  private readonly docPath: string;
  private compactTimer: ReturnType<typeof setInterval> | null = null;
  private saveTimer: ReturnType<typeof setTimeout> | null = null;
  private dirty = false;

  /**
   * @param opts - Node identity, peer manager and optional retention / state-dir overrides.
   */
  constructor(opts: HealthTrackerOptions) {
    this.nodeId = opts.nodeId;
    this.peerManager = opts.peerManager;
    this.retentionMs = opts.retentionMs ?? DEFAULT_RETENTION_MS;

    const stateDir = opts.stateDir ?? path.join(homedir() || tmpdir(), ".openclaw", "clawmatrix");
    this.docPath = path.join(stateDir, "health.automerge");

    // Initialize empty doc (will be replaced by load if file exists)
    this.doc = Automerge.init<HealthDoc>();
    this.doc = Automerge.change(this.doc, (d) => {
      (d as HealthDoc).nodes = {};
    });
  }

  /** Load the persisted document, record a "start" event, compact, and schedule periodic compaction. */
  async start(): Promise<void> {
    // Load persisted doc
    await this.load();

    // Record self start
    this.recordEvent({ ts: Date.now(), type: "start" });

    // Compact old events on start
    this.compact();

    // Schedule periodic compact (drop a previous timer so a repeated start() does not leak one)
    if (this.compactTimer) clearInterval(this.compactTimer);
    this.compactTimer = setInterval(() => this.compact(), COMPACT_INTERVAL);

    debug(TAG, `health tracker started for node "${this.nodeId}"`);
  }

  /** Record a "stop" event, cancel all timers and write the document to disk one last time. */
  async stop(): Promise<void> {
    // Record self stop
    this.recordEvent({ ts: Date.now(), type: "stop" });

    if (this.compactTimer) {
      clearInterval(this.compactTimer);
      this.compactTimer = null;
    }
    if (this.saveTimer) {
      // The debounced save is superseded by the final save below.
      clearTimeout(this.saveTimer);
      this.saveTimer = null;
    }

    // Final save
    await this.save();
    this.dirty = false;
    debug(TAG, "health tracker stopped");
  }

  // ── Event recording ─────────────────────────────────────────

  /**
   * Append an event to the local node's entry, then schedule a save and push the
   * change to all peers.
   *
   * Only fields that are actually set are stored: optional fields passed as
   * `undefined` (e.g. `recordPeerOffline(id)` without a reason) are omitted because
   * Automerge documents cannot hold `undefined` values.
   */
  recordEvent(event: HealthEvent) {
    const stored: HealthEvent = { ts: event.ts, type: event.type };
    if (event.peer !== undefined) stored.peer = event.peer;
    if (event.via !== undefined) stored.via = event.via;
    if (event.reason !== undefined) stored.reason = event.reason;

    this.doc = Automerge.change(this.doc, (d) => {
      if (!d.nodes[this.nodeId]) {
        d.nodes[this.nodeId] = { events: [], lastUpdated: 0 };
      }
      const entry = d.nodes[this.nodeId]!;
      entry.events.push(stored);
      entry.lastUpdated = Date.now();
    });
    this.scheduleSave();
    this.broadcastSync();
  }

  /** Record that `peerId` became reachable, either directly or through a relay. */
  recordPeerOnline(peerId: string, via: "direct" | "relay") {
    this.recordEvent({ ts: Date.now(), type: "peer_online", peer: peerId, via });
  }

  /** Record that `peerId` became unreachable, with an optional disconnect reason. */
  recordPeerOffline(peerId: string, reason?: string) {
    this.recordEvent({ ts: Date.now(), type: "peer_offline", peer: peerId, reason });
  }

  // ── Sync protocol ──────────────────────────────────────────

  /** Handle incoming health_sync frame from a peer. */
  handleSyncMessage(frame: HealthSyncFrame) {
    const peerId = frame.from;

    try {
      // Decoding happens inside the try so a malformed frame is logged instead of
      // throwing into the caller's frame dispatch.
      const message = new Uint8Array(Buffer.from(frame.payload.data, "base64"));
      const syncState = this.syncStates.get(peerId) ?? Automerge.initSyncState();
      const [newDoc, newSyncState] = Automerge.receiveSyncMessage(this.doc, syncState, message);
      this.doc = newDoc;
      this.syncStates.set(peerId, newSyncState);
      this.scheduleSave();

      // Send our response
      this.sendSyncMessage(peerId);
    } catch (err) {
      debug(TAG, `error handling sync from ${peerId}: ${err}`);
    }
  }

  /** Initiate sync with a peer (called on peer connect). */
  initPeerSync(peerId: string) {
    if (peerId === this.nodeId) return;
    this.syncStates.set(peerId, Automerge.initSyncState());
    this.sendSyncMessage(peerId);
  }

  /** Clean up sync state for a disconnected peer. */
  removePeerSync(peerId: string) {
    this.syncStates.delete(peerId);
  }

  /** Generate the next sync message for `peerId` (if any) and send it as a health_sync frame. */
  private sendSyncMessage(peerId: string) {
    const syncState = this.syncStates.get(peerId) ?? Automerge.initSyncState();
    const [newSyncState, message] = Automerge.generateSyncMessage(this.doc, syncState);
    this.syncStates.set(peerId, newSyncState);

    // No message means there is nothing to send to this peer right now.
    if (!message) return;

    debug(TAG, `sending health sync to ${peerId} (${message.byteLength} bytes)`);

    const frame: HealthSyncFrame = {
      type: "health_sync",
      from: this.nodeId,
      to: peerId,
      timestamp: Date.now(),
      payload: {
        data: Buffer.from(message).toString("base64"),
      },
    };

    this.peerManager.router.sendTo(peerId, frame);
  }

  /** Send a sync message to every peer the router currently knows. */
  private broadcastSync() {
    const peers = this.peerManager.router.getAllPeers();
    for (const peer of peers) {
      this.sendSyncMessage(peer.nodeId);
    }
  }

  // ── Timeline aggregation ──────────────────────────────────

  /**
   * Build availability timeline for all known nodes.
   * @param range - "24h", "7d", or "90d"; any other runtime value falls back to "24h"
   * @returns The window, its bucket size, one timeline per node with events, and the
   *   periods during which the local node was not running.
   */
  getAvailability(range: "24h" | "7d" | "90d" = "24h"): AvailabilityResult {
    const now = Date.now();

    // The type restricts `range`, but the value may originate from untyped input;
    // an unknown range would otherwise produce NaN window arithmetic.
    const effectiveRange = Object.prototype.hasOwnProperty.call(HealthTracker.RANGE_CONFIG, range)
      ? range
      : "24h";
    const { durationMs, bucketMinutes } = HealthTracker.RANGE_CONFIG[effectiveRange];

    const startTs = now - durationMs;
    const endTs = now;
    const bucketMs = bucketMinutes * 60 * 1000;
    const bucketCount = Math.ceil(durationMs / bucketMs);

    // Find observation gaps (periods where THIS node was down)
    const gaps = this.getObservationGaps(this.nodeId, startTs, endTs);

    // Build timeline for each node (including self)
    const nodes: NodeTimeline[] = [];

    for (const [nodeId, entry] of Object.entries(this.doc.nodes)) {
      // For self, start/stop events determine uptime.
      // For other nodes, peer_online/peer_offline events from observers are used.
      const timeline = this.buildNodeTimeline(
        nodeId,
        entry,
        startTs,
        endTs,
        bucketMs,
        bucketCount,
        gaps,
      );
      if (timeline) nodes.push(timeline);
    }

    return {
      range: effectiveRange,
      bucketMinutes,
      startTs,
      endTs,
      nodes,
      gaps,
    };
  }

  /**
   * Aggregate one node's events into per-bucket states and an overall uptime ratio.
   * Returns null when the node has no events at all.
   */
  private buildNodeTimeline(
    nodeId: string,
    entry: NodeHealthEntry,
    startTs: number,
    endTs: number,
    bucketMs: number,
    bucketCount: number,
    observerGaps: Array<[number, number]>,
  ): NodeTimeline | null {
    const events = [...entry.events].sort((a, b) => a.ts - b.ts);
    if (events.length === 0) return null;

    const firstSeen = events[0]!.ts;
    const lastSeen = entry.lastUpdated;

    // Build online intervals for this node
    const intervals = this.buildOnlineIntervals(nodeId, events, startTs, endTs);

    // Calculate per-bucket state
    const buckets: BucketState[] = [];
    let totalOnline = 0;
    let totalObservable = 0;

    for (let i = 0; i < bucketCount; i++) {
      const bStart = startTs + i * bucketMs;
      const bEnd = Math.min(bStart + bucketMs, endTs);

      // How much of this bucket is observable (subtract observer gaps)
      const observableMs = this.observableTimeInRange(bStart, bEnd, observerGaps);

      if (observableMs === 0) {
        buckets.push("unknown");
        continue;
      }

      // How much of the OBSERVABLE part of this bucket the node was online.
      // Online time that falls inside an observer gap is excluded so that the
      // numerator and denominator cover the same time and the ratio stays <= 1.
      const onlineMs = this.overlapMs(intervals, bStart, bEnd, observerGaps);
      const ratio = onlineMs / observableMs;

      totalOnline += onlineMs;
      totalObservable += observableMs;

      if (ratio >= 0.95) buckets.push("up");
      else if (ratio >= 0.05) buckets.push("degraded");
      else buckets.push("down");
    }

    const uptimeRatio = totalObservable > 0 ? totalOnline / totalObservable : 0;

    return { nodeId, firstSeen, lastSeen, buckets, uptimeRatio };
  }

  /**
   * Build online intervals for a node.
   * - For self node: uses start/stop events
   * - For other nodes: uses peer_online/peer_offline events from all observers
   */
  private buildOnlineIntervals(
    nodeId: string,
    events: HealthEvent[],
    startTs: number,
    endTs: number,
  ): Array<[number, number]> {
    if (nodeId === this.nodeId) {
      // Self: the node's own start/stop events define uptime.
      return this.buildSelfIntervals(events, startTs, endTs);
    }

    // Remote node: scan ALL nodes' events in the merged document for
    // peer_online/peer_offline entries that reference this nodeId.
    return this.buildPeerIntervals(nodeId, startTs, endTs);
  }

  /**
   * Turn time-sorted start/stop events into [from, to] online intervals clipped to
   * the window. A "start" that follows another "start" without a "stop" restarts the
   * current interval, so the time before it is not counted as online.
   */
  private buildSelfIntervals(
    events: HealthEvent[],
    startTs: number,
    endTs: number,
  ): Array<[number, number]> {
    const intervals: Array<[number, number]> = [];
    let onlineSince: number | null = null;

    for (const ev of events) {
      if (ev.ts < startTs) {
        // Track state before window
        if (ev.type === "start") onlineSince = ev.ts;
        else if (ev.type === "stop") onlineSince = null;
        continue;
      }
      if (ev.ts > endTs) break;

      if (ev.type === "start") {
        onlineSince = ev.ts;
      } else if (ev.type === "stop" && onlineSince !== null) {
        intervals.push([Math.max(onlineSince, startTs), ev.ts]);
        onlineSince = null;
      }
    }

    // If still online at end of window
    if (onlineSince !== null) {
      intervals.push([Math.max(onlineSince, startTs), endTs]);
    }

    return intervals;
  }

  /**
   * Build online intervals for a remote node from the peer_online/peer_offline
   * events that any node recorded about it, clipped to the window.
   */
  private buildPeerIntervals(
    targetNodeId: string,
    startTs: number,
    endTs: number,
  ): Array<[number, number]> {
    const intervals: Array<[number, number]> = [];

    // Collect peer_online/peer_offline events from all observer nodes
    const relevantEvents: HealthEvent[] = [];
    for (const entry of Object.values(this.doc.nodes)) {
      for (const ev of entry.events) {
        if (ev.peer === targetNodeId && (ev.type === "peer_online" || ev.type === "peer_offline")) {
          relevantEvents.push(ev);
        }
      }
    }
    relevantEvents.sort((a, b) => a.ts - b.ts);

    let onlineSince: number | null = null;

    for (const ev of relevantEvents) {
      if (ev.ts < startTs) {
        if (ev.type === "peer_online") onlineSince = ev.ts;
        else if (ev.type === "peer_offline") onlineSince = null;
        continue;
      }
      if (ev.ts > endTs) break;

      if (ev.type === "peer_online") {
        // Keep the earliest online time when several observers report the peer.
        if (onlineSince === null) onlineSince = ev.ts;
      } else if (ev.type === "peer_offline" && onlineSince !== null) {
        intervals.push([Math.max(onlineSince, startTs), ev.ts]);
        onlineSince = null;
      }
    }

    if (onlineSince !== null) {
      intervals.push([Math.max(onlineSince, startTs), endTs]);
    }

    return intervals;
  }

  /** Get observation gaps: periods when the local node was not running. */
  private getObservationGaps(
    nodeId: string,
    startTs: number,
    endTs: number,
  ): Array<[number, number]> {
    const entry = this.doc.nodes[nodeId];
    if (!entry) return [[startTs, endTs]]; // no data = entire range is a gap

    const selfIntervals = this.buildSelfIntervals(
      [...entry.events].sort((a, b) => a.ts - b.ts),
      startTs,
      endTs,
    );

    // Gaps are the complement of self intervals within [startTs, endTs]
    const gaps: Array<[number, number]> = [];
    let cursor = startTs;

    for (const [start, end] of selfIntervals) {
      if (start > cursor) {
        gaps.push([cursor, start]);
      }
      cursor = Math.max(cursor, end);
    }

    if (cursor < endTs) {
      gaps.push([cursor, endTs]);
    }

    return gaps;
  }

  /** Calculate observable time in a range, excluding gaps. */
  private observableTimeInRange(
    start: number,
    end: number,
    gaps: Array<[number, number]>,
  ): number {
    let total = end - start;
    for (const [gStart, gEnd] of gaps) {
      const overlapStart = Math.max(start, gStart);
      const overlapEnd = Math.min(end, gEnd);
      if (overlapStart < overlapEnd) {
        total -= overlapEnd - overlapStart;
      }
    }
    return Math.max(0, total);
  }

  /**
   * Calculate total overlap between intervals and a range.
   * @param excludedGaps - Periods that must not be counted even where an interval
   *   covers them (defaults to none).
   */
  private overlapMs(
    intervals: Array<[number, number]>,
    start: number,
    end: number,
    excludedGaps: Array<[number, number]> = [],
  ): number {
    let total = 0;
    for (const [iStart, iEnd] of intervals) {
      const overlapStart = Math.max(start, iStart);
      const overlapEnd = Math.min(end, iEnd);
      if (overlapStart < overlapEnd) {
        total += this.observableTimeInRange(overlapStart, overlapEnd, excludedGaps);
      }
    }
    return total;
  }

  // ── Compact ───────────────────────────────────────────────

  /**
   * Drop events older than the retention period from every node entry, keeping the
   * last pre-cutoff event of each node so the state at the cutoff stays known.
   * Relies on index order: everything before the last pre-cutoff index is removed.
   */
  private compact() {
    const cutoff = Date.now() - this.retentionMs;
    let pruned = 0;

    this.doc = Automerge.change(this.doc, (d) => {
      for (const node of Object.values(d.nodes)) {
        const before = node.events.length;
        // Keep events newer than cutoff; also keep the last event before cutoff
        // to preserve state continuity
        let lastBeforeCutoff = -1;
        for (let i = 0; i < node.events.length; i++) {
          if (node.events[i]!.ts < cutoff) lastBeforeCutoff = i;
        }
        if (lastBeforeCutoff > 0) {
          // Remove all events before the last one before cutoff
          node.events.splice(0, lastBeforeCutoff);
          pruned += before - node.events.length;
        }
      }
    });

    if (pruned > 0) {
      debug(TAG, `compacted ${pruned} old events`);
      // Re-save to discard old ops
      this.recompact();
    }
  }

  /**
   * Round-trip the document through save/load and schedule a save.
   * NOTE(review): intended to discard Automerge op history for removed data —
   * confirm that save/load actually drops that history.
   */
  private recompact() {
    const bytes = Automerge.save(this.doc);
    this.doc = Automerge.load<HealthDoc>(bytes);
    this.scheduleSave();
  }

  // ── Persistence ───────────────────────────────────────────

  /** Replace the in-memory document with the persisted one; keep the fresh document if reading or parsing fails. */
  private async load() {
    try {
      const data = await readFile(this.docPath);
      this.doc = Automerge.load<HealthDoc>(new Uint8Array(data));
      debug(TAG, `loaded health doc from ${this.docPath}`);
    } catch {
      debug(TAG, "no existing health doc, starting fresh");
    }
  }

  /** Serialize the document to `docPath`, creating the directory if needed; failures are logged, not thrown. */
  private async save() {
    try {
      const data = Automerge.save(this.doc);
      await mkdir(path.dirname(this.docPath), { recursive: true });
      await writeFile(this.docPath, Buffer.from(data));
    } catch (err) {
      debug(TAG, `failed to save health doc: ${err}`);
    }
  }

  /** Mark the document dirty and arm a single debounce timer that saves it after SAVE_DEBOUNCE ms. */
  private scheduleSave() {
    this.dirty = true;
    if (this.saveTimer) return;
    this.saveTimer = setTimeout(() => {
      this.saveTimer = null;
      if (this.dirty) {
        this.dirty = false;
        this.save().catch((err) => {
          debug(TAG, `deferred save error: ${err}`);
        });
      }
    }, SAVE_DEBOUNCE);
  }

  // ── Public accessors ──────────────────────────────────────

  /** Get all known node IDs (including offline ones). */
  getKnownNodes(): string[] {
    return Object.keys(this.doc.nodes);
  }

  /** Get raw events for a specific node. */
  getNodeEvents(nodeId: string): HealthEvent[] {
    return [...(this.doc.nodes[nodeId]?.events ?? [])];
  }
}
@@ -1090,13 +1090,28 @@ export class ModelProxy {
1090
1090
  ?? this.config.models.find((m) => m.id === payload.model)
1091
1091
  : this.config.models.find((m) => m.id === payload.model);
1092
1092
  if (!model) {
1093
+ // Model not available locally — try forwarding to a remote node that has it
1094
+ const originalTarget = frame.to;
1095
+ const exclude = new Set([this.config.nodeId]);
1096
+ if (originalTarget) exclude.add(originalTarget);
1097
+ const alternatives = this.peerManager.router.findNodesForModel(payload.model, exclude);
1098
+ for (const alt of alternatives) {
1099
+ if (this.peerManager.sendTo(alt.nodeId, { ...frame, to: alt.nodeId })) {
1100
+ debug("model_req", `failover: ${originalTarget ?? "local"} → ${alt.nodeId} for "${payload.model}"`);
1101
+ return;
1102
+ }
1103
+ }
1104
+ // No alternative found
1105
+ const hint = originalTarget && originalTarget !== this.config.nodeId
1106
+ ? `Target node "${originalTarget}" is unreachable and no alternative nodes provide model "${payload.model}"`
1107
+ : `Model "${payload.model}" not available locally`;
1093
1108
  this.peerManager.sendTo(from, {
1094
1109
  type: "model_res",
1095
1110
  id,
1096
1111
  from: this.config.nodeId,
1097
1112
  to: from,
1098
1113
  timestamp: Date.now(),
1099
- payload: { success: false, error: `Model "${payload.model}" not available locally` },
1114
+ payload: { success: false, error: hint },
1100
1115
  } satisfies ModelResponse);
1101
1116
  return;
1102
1117
  }
@@ -604,8 +604,11 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
604
604
 
605
605
  // ── Message handling ───────────────────────────────────────────
606
606
  private onFrame(frame: AnyClusterFrame, from: Connection) {
607
- // Ignore self-echo: frames with our own nodeId that were relayed back to us
608
- if (frame.from === this.config.nodeId) return;
607
+ // Ignore self-echo: frames with our own nodeId that were relayed back to us.
608
+ // Exception: frames from same-nodeId satellite connections (Mac/iOS client)
609
+ // are legitimate requests that must be processed or relayed.
610
+ const isSatellite = from.remoteNodeId === this.config.nodeId;
611
+ if (frame.from === this.config.nodeId && !isSatellite) return;
609
612
 
610
613
  // Validate from field: must be the direct peer or a known node (relayed)
611
614
  if (frame.from && frame.from !== from.remoteNodeId && !this.router.getRoute(frame.from)) {
@@ -674,8 +677,26 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
674
677
  }
675
678
 
676
679
  if (frame.to && frame.to !== this.config.nodeId) {
677
- this.router.tryRelay(frame);
678
- return;
680
+ if (this.router.tryRelay(frame)) return;
681
+
682
+ // Relay failed — for model_req, try alternative nodes or fall through to local handling
683
+ if (frame.type === "model_req") {
684
+ const modelId = (frame as any).payload?.model;
685
+ if (modelId) {
686
+ const exclude = new Set([frame.to, this.config.nodeId]);
687
+ const alternatives = this.router.findNodesForModel(modelId, exclude);
688
+ for (const alt of alternatives) {
689
+ if (this.sendTo(alt.nodeId, { ...frame, to: alt.nodeId })) {
690
+ debug("peer", `model_req failover: ${frame.to} → ${alt.nodeId}`);
691
+ return;
692
+ }
693
+ }
694
+ }
695
+ // No remote alternative — fall through to local handling
696
+ // (model-proxy will handle locally or send error back)
697
+ } else {
698
+ return;
699
+ }
679
700
  }
680
701
 
681
702
  // Forward to same-nodeId satellite connection (e.g. Mac desktop app) so that
package/src/router.ts CHANGED
@@ -233,6 +233,31 @@ export class Router {
233
233
  return this.routes.get(target);
234
234
  }
235
235
 
236
+ /** Find reachable nodes that provide a specific model, sorted by latency.
237
+ * Excludes nodes in the `exclude` set. */
238
+ findNodesForModel(modelId: string, exclude?: Set<string>): RouteEntry[] {
239
+ const candidates: RouteEntry[] = [];
240
+ for (const entry of this.routes.values()) {
241
+ if (exclude?.has(entry.nodeId)) continue;
242
+ if (!entry.models.some((m) => m.id === modelId)) continue;
243
+ // Check reachability
244
+ if (entry.connection?.isOpen) {
245
+ candidates.push(entry);
246
+ } else if (entry.reachableVia) {
247
+ const relay = this.connections.get(entry.reachableVia);
248
+ if (relay?.isOpen) candidates.push(entry);
249
+ }
250
+ }
251
+ // Sort: direct first, then by latency
252
+ candidates.sort((a, b) => {
253
+ const aDirect = a.connection ? 0 : 1;
254
+ const bDirect = b.connection ? 0 : 1;
255
+ if (aDirect !== bDirect) return aDirect - bDirect;
256
+ return a.latencyMs - b.latencyMs;
257
+ });
258
+ return candidates;
259
+ }
260
+
236
261
  // ── Message sending and relay ──────────────────────────────────
237
262
  /** Send a frame to a specific node, relaying if necessary. Returns true if sent. */
238
263
  sendTo(targetNodeId: string, frame: ClusterFrame | AnyClusterFrame): boolean {
package/src/sentinel.ts CHANGED
@@ -569,6 +569,28 @@ process.on("exit", (code) => {
569
569
  try { process.stderr.write(`[svc ${ts}] Exit code=${code}\n`); } catch { /* ignore */ }
570
570
  });
571
571
 
572
+ /** Connect to all configured peers (called when gateway dies). */
573
+ function connectAllPeers() {
574
+ for (const peer of config.peers) {
575
+ if (!connections.has(peer.nodeId) && !reconnectTimers.has(peer.nodeId)) {
576
+ connectToPeer(peer);
577
+ }
578
+ }
579
+ }
580
+
581
+ /** Disconnect from all peers (called when gateway recovers). */
582
+ function disconnectAllPeers() {
583
+ for (const [nodeId, conn] of connections) {
584
+ conn.close(1000, "gateway recovered");
585
+ connections.delete(nodeId);
586
+ }
587
+ for (const [nodeId, timer] of reconnectTimers) {
588
+ clearTimeout(timer);
589
+ reconnectTimers.delete(nodeId);
590
+ }
591
+ reconnectAttempts.clear();
592
+ }
593
+
572
594
  /** Periodically check if the gateway process is still alive via kill(pid, 0). */
573
595
  function startGatewayHealthCheck() {
574
596
  if (healthCheckTimer || !gatewayPid) return;
@@ -581,11 +603,15 @@ function startGatewayHealthCheck() {
581
603
  log("Gateway process detected — back online");
582
604
  // Release the port so the gateway can reclaim it
583
605
  stopListening();
606
+ // Disconnect from peers — gateway handles mesh connections
607
+ disconnectAllPeers();
584
608
  }
585
609
  } catch {
586
610
  if (gatewayAlive) {
587
611
  gatewayAlive = false;
588
612
  log(`Gateway process (pid ${gatewayPid}) gone — entering standalone mode`);
613
+ // Connect to peers now that gateway is down
614
+ connectAllPeers();
589
615
  // Take over the gateway's listen port
590
616
  if (config.listenPort) {
591
617
  // Small delay to let the OS release the port from the dead process
@@ -608,11 +634,7 @@ function boot() {
608
634
  writePidFile();
609
635
  log(`Started (pid ${process.pid}, gateway ${gatewayPid}, nodeId ${sentinelNodeId()}, takeover port ${config.listenPort || "none"})`);
610
636
 
611
- // Connect to all configured peers
612
- for (const peer of config.peers) {
613
- connectToPeer(peer);
614
- }
615
-
616
- // Note: we do NOT start listening here.
617
- // Listening only starts when gateway dies (port takeover mode).
637
+ // Do NOT connect to peers on boot — gateway handles mesh connections.
638
+ // Sentinel only connects when gateway dies (standalone mode).
639
+ // Listening also only starts when gateway dies (port takeover mode).
618
640
  }
package/src/types.ts CHANGED
@@ -411,6 +411,14 @@ export interface KnowledgeSyncFrame extends ClusterFrame {
411
411
  };
412
412
  }
413
413
 
414
+ // ── Health sync ──────────────────────────────────────────────────
415
+ export interface HealthSyncFrame extends ClusterFrame {
416
+ type: "health_sync";
417
+ payload: {
418
+ data: string; // base64-encoded Automerge sync message
419
+ };
420
+ }
421
+
414
422
  // ── Diagnostic (sentinel) ────────────────────────────────────────
415
423
  export interface DiagnosticExec extends ClusterFrame {
416
424
  type: "diagnostic_exec";
@@ -831,4 +839,5 @@ export type AnyClusterFrame =
831
839
  | TerminalData
832
840
  | TerminalResize
833
841
  | TerminalCloseRequest
834
- | TerminalCloseResponse;
842
+ | TerminalCloseResponse
843
+ | HealthSyncFrame;
package/src/web.ts CHANGED
@@ -3,6 +3,7 @@ import type { PeerManager } from "./peer-manager.ts";
3
3
  import type { HandoffManager } from "./handoff.ts";
4
4
  import type { ClawMatrixConfig } from "./config.ts";
5
5
  import type { SatelliteContext, IngestedEvent } from "./types.ts";
6
+ import type { HealthTracker } from "./health-tracker.ts";
6
7
  import { timingSafeEqual } from "./auth.ts";
7
8
  import { renderDashboard } from "./web-ui.ts";
8
9
  import { readBody } from "./http-utils.ts";
@@ -46,6 +47,7 @@ export class WebHandler {
46
47
  private ingestedEvents: IngestedEvent[] = []; // ring buffer for ingested events
47
48
  private loginAttempts = new Map<string, { count: number; resetAt: number }>(); // IP → rate limit
48
49
  private loginCleanupTimer: ReturnType<typeof setInterval> | null = null;
50
+ private healthTracker: HealthTracker | null = null;
49
51
  private onPeerConnected: (nodeId: string) => void;
50
52
  private onPeerDisconnected: (nodeId: string) => void;
51
53
 
@@ -91,6 +93,11 @@ export class WebHandler {
91
93
  peerManager.on("peerDisconnected", this.onPeerDisconnected);
92
94
  }
93
95
 
96
+ /** Set the health tracker for availability API. */
97
+ setHealthTracker(tracker: HealthTracker) {
98
+ this.healthTracker = tracker;
99
+ }
100
+
94
101
  /** Clean up timers and pending requests on shutdown. */
95
102
  destroy() {
96
103
  // Remove event listeners to prevent post-destroy callbacks
@@ -181,6 +188,11 @@ export class WebHandler {
181
188
  return;
182
189
  }
183
190
 
191
+ if (path === "/api/availability" && req.method === "GET") {
192
+ this.handleAvailability(req, res);
193
+ return;
194
+ }
195
+
184
196
  if (path === "/api/satellite/poll" && req.method === "GET") {
185
197
  this.handleSatellitePoll(req, res);
186
198
  return;
@@ -271,6 +283,27 @@ export class WebHandler {
271
283
  }
272
284
  }
273
285
 
286
+ private handleAvailability(req: IncomingMessage, res: ServerResponse) {
287
+ if (!this.healthTracker) {
288
+ res.writeHead(503, { "Content-Type": "application/json" });
289
+ res.end(JSON.stringify({ error: "Health tracker not available" }));
290
+ return;
291
+ }
292
+
293
+ const url = new URL(req.url ?? "/", `http://${req.headers.host ?? "localhost"}`);
294
+ const range = (url.searchParams.get("range") ?? "24h") as "24h" | "7d" | "90d";
295
+
296
+ if (!["24h", "7d", "90d"].includes(range)) {
297
+ res.writeHead(400, { "Content-Type": "application/json" });
298
+ res.end(JSON.stringify({ error: "Invalid range. Use 24h, 7d, or 90d" }));
299
+ return;
300
+ }
301
+
302
+ const result = this.healthTracker.getAvailability(range);
303
+ res.writeHead(200, { "Content-Type": "application/json" });
304
+ res.end(JSON.stringify(result));
305
+ }
306
+
274
307
  private handleStatus(res: ServerResponse) {
275
308
  const peers = this.peerManager.router.getAllPeers();
276
309
  const localNode = {