clawmatrix 0.1.23 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -1
- package/package.json +4 -2
- package/src/acp-proxy.ts +2183 -0
- package/src/audit.ts +42 -0
- package/src/auth.ts +2 -3
- package/src/cli.ts +76 -2
- package/src/cluster-service.ts +243 -3
- package/src/compat.ts +84 -3
- package/src/config.ts +117 -4
- package/src/connection.ts +288 -85
- package/src/crypto.ts +179 -0
- package/src/debug.ts +15 -2
- package/src/e2e/helpers.ts +318 -0
- package/src/handoff.ts +171 -92
- package/src/identity.ts +95 -0
- package/src/index.ts +433 -58
- package/src/knowledge-sync.ts +776 -207
- package/src/model-proxy.ts +144 -39
- package/src/peer-approval.ts +628 -0
- package/src/peer-manager.ts +261 -32
- package/src/rate-limiter.ts +88 -0
- package/src/router.ts +32 -10
- package/src/sentinel-manager.ts +142 -0
- package/src/sentinel.ts +618 -0
- package/src/task-activity.ts +74 -0
- package/src/terminal.ts +566 -0
- package/src/tool-proxy.ts +127 -3
- package/src/tools/cluster-acp.ts +237 -0
- package/src/tools/cluster-batch.ts +76 -0
- package/src/tools/cluster-diagnostic.ts +174 -0
- package/src/tools/cluster-edit.ts +70 -0
- package/src/tools/cluster-peers.ts +59 -14
- package/src/tools/cluster-terminal.ts +232 -0
- package/src/tools/cluster-tool.ts +26 -11
- package/src/types.ts +477 -3
- package/src/web.ts +2 -2
package/src/sentinel.ts
ADDED
|
@@ -0,0 +1,618 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Sentinel — lightweight detached subprocess that survives OpenClaw crashes.
|
|
3
|
+
*
|
|
4
|
+
* Maintains independent WS connections to peers and handles diagnostic
|
|
5
|
+
* commands (exec, status) so remote nodes can troubleshoot even when
|
|
6
|
+
* the main gateway process is down.
|
|
7
|
+
*
|
|
8
|
+
* When the gateway dies and `listenPort` is configured, sentinel takes over
|
|
9
|
+
* the same port so existing clients (e.g. iOS) can reconnect without any
|
|
10
|
+
* URL change. When the gateway comes back, sentinel releases the port.
|
|
11
|
+
*
|
|
12
|
+
* Spawned by SentinelManager with `detached: true` + `unref()`.
|
|
13
|
+
* Receives config via IPC from the parent process.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { spawn } from "node:child_process";
|
|
17
|
+
import { readFileSync, writeFileSync, unlinkSync, existsSync } from "node:fs";
|
|
18
|
+
import { createServer, type Server } from "node:http";
|
|
19
|
+
import { WebSocketServer, WebSocket as WsWebSocket } from "ws";
|
|
20
|
+
import path from "node:path";
|
|
21
|
+
import { homedir, tmpdir } from "node:os";
|
|
22
|
+
import { Connection, type WsTransport, type ConnectionE2eeOptions } from "./connection.ts";
|
|
23
|
+
import { collectDeviceInfo } from "./device-info.ts";
|
|
24
|
+
import { loadOrCreateIdentity } from "./identity.ts";
|
|
25
|
+
import type { KeyPair } from "./crypto.ts";
|
|
26
|
+
import type {
|
|
27
|
+
AnyClusterFrame,
|
|
28
|
+
NodeCapabilities,
|
|
29
|
+
DiagnosticExec,
|
|
30
|
+
DiagnosticStatus,
|
|
31
|
+
} from "./types.ts";
|
|
32
|
+
|
|
33
|
+
// ── Config received from parent via IPC ─────────────────────────
|
|
34
|
+
/** A remote node the sentinel dials out to. */
interface SentinelPeerTarget {
  nodeId: string;
  url: string;
}

/** Agent entry advertised in the sentinel's capabilities. */
interface SentinelAgentEntry {
  id: string;
  description?: string;
  tags?: string[];
}

/** Model entry advertised in the sentinel's capabilities. */
interface SentinelModelEntry {
  id: string;
  provider: string;
  description?: string;
}

/** Peer approval settings (allowList + persistPath). */
interface SentinelPeerApproval {
  enabled: boolean;
  allowList: string[];
  /** Joined onto the clawmatrix state directory when the approved-peers file is read. */
  persistPath: string;
}

/** Everything the sentinel needs to run, delivered in the parent's "init" IPC message. */
interface SentinelConfig {
  nodeId: string;
  secret: string;
  peers: Array<SentinelPeerTarget>;
  agents?: Array<SentinelAgentEntry>;
  models?: Array<SentinelModelEntry>;
  tags?: string[];
  e2ee: boolean;
  compression: boolean;
  /** File in which the sentinel records its own PID. */
  pidFile: string;
  /** PID of the gateway process to watch; boot() falls back to process.ppid when absent. */
  gatewayPid?: number;
  /** Port to take over when gateway dies (typically the gateway's own listen port). */
  listenPort?: number;
  /** Bind address for the takeover listener; "0.0.0.0" is used when omitted. */
  listenHost?: string;
  peerApproval?: SentinelPeerApproval;
}
|
|
55
|
+
|
|
56
|
+
// ── State ───────────────────────────────────────────────────────
|
|
57
|
+
/** Configuration received over IPC; assigned by the "init" message handler before boot() runs. */
let config: SentinelConfig;
/** Last known gateway liveness; flipped by the PID-based health check. */
let gatewayAlive = true;
/** PID probed by the health check; set in boot(). */
let gatewayPid: number | null = null;
let healthCheckTimer: ReturnType<typeof setInterval> | null = null;
/** Process start time, used to report uptime in diagnostic_status responses. */
const startTime = Date.now();
/** Node id the sentinel presents to peers: the gateway's node id with a ":sentinel" suffix. */
const sentinelNodeId = () => `${config.nodeId}:sentinel`;

// ── Peer approval (read-only — sentinel only accepts already-approved peers) ──
const approvedNodeIds = new Set<string>();
let identityKeyPair: KeyPair | null = null;
/** Approved peer records with pinned public keys (TOFU): nodeId → publicKey. */
const approvedPeerKeys = new Map<string, string>();

/** Authenticated connections keyed by peer node id. */
const connections = new Map<string, Connection>();
/** Pending reconnect timers keyed by peer node id (at most one per peer). */
const reconnectTimers = new Map<string, ReturnType<typeof setTimeout>>();
/** Reconnect attempts made so far per peer; drives the exponential backoff. */
const reconnectAttempts = new Map<string, number>();
const RECONNECT_BASE = 2_000; // first retry delay, in ms
const RECONNECT_MAX = 60_000; // backoff ceiling, in ms

// ── Inbound listener state (port takeover) ──────────────────────
let httpServer: Server | null = null;
let wss: WebSocketServer | null = null;
const inboundConnections = new Map<WsWebSocket, Connection>();
let listening = false;

// ── Rate limiting for diagnostic_exec ────────────────────────────
const EXEC_RATE_WINDOW = 60_000; // 1 minute
const EXEC_RATE_LIMIT = 20; // max execs per window
/** Timestamps (ms) of exec requests accepted within the current window, oldest first. */
const execTimestamps: number[] = [];
|
|
86
|
+
|
|
87
|
+
// ── Approved peers loading ────────────────────────────────────────
|
|
88
|
+
|
|
89
|
+
/**
 * Load the identity key pair and the set of already-approved peers.
 *
 * Sentinel only accepts connections from peers that were already approved
 * by the gateway — it does NOT support approving new peers. This ensures
 * that even if the gateway is down, unapproved devices (including those
 * with a leaked token) cannot join.
 *
 * The identity key pair is always loaded (from the state dir shared with the
 * gateway). Approved peers are only loaded when peer approval is enabled:
 * first the configured allowList, then the persisted `approved` map, whose
 * string `publicKey` values are pinned for the TOFU check.
 *
 * Malformed entries in the persisted file are tolerated individually: a record
 * that is not an object still approves its node id but pins no key, instead of
 * aborting the load of every record that follows it.
 */
function loadApprovedPeers() {
  const stateDir = path.join(homedir() || tmpdir(), ".openclaw", "clawmatrix");
  const approval = config.peerApproval;

  // Load identity key pair (shared with gateway via same state dir)
  try {
    identityKeyPair = loadOrCreateIdentity(stateDir);
  } catch {
    log("Failed to load identity key pair");
  }

  if (!approval?.enabled) return;

  // Add allowList peers
  for (const nodeId of approval.allowList) {
    approvedNodeIds.add(nodeId);
  }

  // Load persisted approved peers
  try {
    const filePath = path.join(stateDir, approval.persistPath);
    if (existsSync(filePath)) {
      const data: unknown = JSON.parse(readFileSync(filePath, "utf-8"));
      const approved =
        typeof data === "object" && data !== null
          ? (data as { approved?: unknown }).approved
          : undefined;

      if (typeof approved === "object" && approved !== null) {
        for (const [nodeId, record] of Object.entries(approved)) {
          approvedNodeIds.add(nodeId);
          // Only pin keys that are non-empty strings; anything else would
          // make the later strict comparison meaningless.
          const publicKey =
            typeof record === "object" && record !== null
              ? (record as { publicKey?: unknown }).publicKey
              : undefined;
          if (typeof publicKey === "string" && publicKey) {
            approvedPeerKeys.set(nodeId, publicKey);
          }
        }
      }
    }
  } catch {
    log("Failed to load approved peers — rejecting all inbound");
  }

  log(`Loaded ${approvedNodeIds.size} approved peers`);
}
|
|
136
|
+
|
|
137
|
+
/**
 * Check if an inbound peer should be accepted by sentinel.
 *
 * Rules, in order:
 *  1. Peer approval disabled → accept everyone.
 *  2. Node id is one of the configured outbound peers → accept (no key check).
 *  3. Node id is in the approved set (allowList or persisted approvals) →
 *     accept, unless a public key is pinned for it and the presented key is
 *     missing or different.
 *  4. Otherwise reject.
 *
 * A pinned key is enforced strictly: presenting no key at all is treated the
 * same as presenting the wrong key, so the pin cannot be bypassed by simply
 * omitting the identity key.
 *
 * @param nodeId    Node id claimed by the remote peer.
 * @param publicKey Identity public key the peer presented, or null if none.
 * @returns true when the peer may complete authentication.
 */
function isSentinelPeerAllowed(nodeId: string, publicKey: string | null): boolean {
  if (!config.peerApproval?.enabled) return true;

  // Always allow configured peer nodeIds (outbound targets connecting back)
  if (config.peers.some((p) => p.nodeId === nodeId)) return true;

  if (!approvedNodeIds.has(nodeId)) return false;

  // TOFU check: if we have a pinned key, the presented key must match it.
  const pinnedKey = approvedPeerKeys.get(nodeId);
  if (pinnedKey) {
    if (!publicKey) {
      log(`TOFU key missing for ${nodeId} — rejecting`);
      return false;
    }
    if (pinnedKey !== publicKey) {
      log(`TOFU mismatch for ${nodeId} — rejecting`);
      return false;
    }
  }

  return true;
}
|
|
161
|
+
|
|
162
|
+
// ── Peer connection ─────────────────────────────────────────────
|
|
163
|
+
/**
 * Assemble the capabilities record the sentinel advertises to peers: the
 * configured agents, models and tags (plus an extra "sentinel" tag) under the
 * sentinel's own node id, together with freshly collected device info.
 */
function buildCapabilities(): NodeCapabilities {
  const nodeId = sentinelNodeId();
  const agents = config.agents ?? [];
  const models = config.models ?? [];
  const tags = (config.tags ?? []).concat("sentinel");
  const deviceInfo = collectDeviceInfo();

  return { nodeId, agents, models, tags, deviceInfo };
}
|
|
172
|
+
|
|
173
|
+
/**
 * Open an outbound WebSocket to a configured peer and wrap it in a Connection
 * once the socket is open. On authentication the connection is registered in
 * `connections` and the peer's backoff counter is reset. On socket error or
 * close a single reconnect is scheduled via scheduleReconnect().
 *
 * Only this attempt's own Connection is ever removed from `connections`; an
 * entry registered under the same node id by another path (for example an
 * inbound connection accepted during port takeover) is left untouched when
 * this outbound attempt fails.
 *
 * @param peer Node id and WebSocket URL of the peer to dial.
 */
function connectToPeer(peer: { nodeId: string; url: string }) {
  const ws = new WebSocket(peer.url, ["graphql-transport-ws"]);
  const e2eeOpts: ConnectionE2eeOptions = {
    e2ee: config.e2ee,
    compression: config.compression,
    identityKeyPair: config.e2ee && identityKeyPair ? identityKeyPair : undefined,
  };

  // Connection created for this socket, once it has opened.
  let conn: Connection | null = null;

  // Create Connection only after WS is open (matches PeerManager pattern).
  // Otherwise the 10s auth timer starts before the TCP handshake completes.
  ws.addEventListener("open", () => {
    const opened = new Connection(
      ws as unknown as WsTransport,
      "outbound",
      sentinelNodeId(),
      config.secret,
      buildCapabilities(),
      e2eeOpts,
    );
    conn = opened;
    opened.bindWebSocket(ws);

    opened.on("authenticated", () => {
      reconnectAttempts.delete(peer.nodeId);
      connections.set(peer.nodeId, opened);
      log(`Peer connected: ${peer.nodeId}`);
    });

    opened.on("message", (frame) => handleFrame(frame, opened));

    opened.on("error", () => { /* close will follow */ });
  });

  // "error" is normally followed by "close"; make sure we only schedule once.
  let reconnectScheduled = false;
  const tryReconnect = () => {
    if (reconnectScheduled) return;
    reconnectScheduled = true;
    // Drop the registry entry only if it is still ours.
    if (conn !== null && connections.get(peer.nodeId) === conn) {
      connections.delete(peer.nodeId);
    }
    scheduleReconnect(peer);
  };

  ws.addEventListener("error", tryReconnect);
  ws.addEventListener("close", tryReconnect);
}
|
|
217
|
+
|
|
218
|
+
/**
 * Arm a reconnect timer for a peer using exponential backoff
 * (RECONNECT_BASE doubled per attempt, capped at RECONNECT_MAX).
 * Does nothing if a timer for that peer is already pending.
 *
 * @param peer Node id and URL of the peer to redial.
 */
function scheduleReconnect(peer: { nodeId: string; url: string }) {
  const { nodeId } = peer;
  if (reconnectTimers.has(nodeId)) return;

  const priorAttempts = reconnectAttempts.get(nodeId) ?? 0;
  reconnectAttempts.set(nodeId, priorAttempts + 1);

  const backoffMs = Math.min(RECONNECT_BASE * 2 ** priorAttempts, RECONNECT_MAX);
  const handle = setTimeout(() => {
    // Clear the slot first so a failure of this attempt can schedule the next one.
    reconnectTimers.delete(nodeId);
    connectToPeer(peer);
  }, backoffMs);

  reconnectTimers.set(nodeId, handle);
}
|
|
230
|
+
|
|
231
|
+
// ── Frame handling ──────────────────────────────────────────────
|
|
232
|
+
/**
 * Dispatch a frame received from a peer.
 *
 * The sentinel answers diagnostic requests and pings; every other frame type
 * is dropped because the sentinel is not a full cluster node.
 *
 * @param frame Decoded frame from the peer.
 * @param conn  Connection the frame arrived on; replies are sent on it.
 */
function handleFrame(frame: AnyClusterFrame, conn: Connection) {
  switch (frame.type) {
    case "diagnostic_exec":
      handleDiagnosticExec(frame as DiagnosticExec, conn);
      break;
    case "diagnostic_status":
      handleDiagnosticStatus(frame as DiagnosticStatus, conn);
      break;
    case "ping":
      conn.send({ type: "pong", from: sentinelNodeId(), timestamp: Date.now() } as AnyClusterFrame);
      break;
    // Silently ignore peer protocol frames — sentinel is not a full node.
    // (peer_sync previously fell through into the ping case and triggered
    // an unsolicited pong.)
    case "peer_sync":
    case "pong":
      break;
  }
}
|
|
249
|
+
|
|
250
|
+
/**
 * Run a shell command on behalf of a peer and reply with a
 * `diagnostic_exec_res` frame carrying exit status and captured output.
 *
 * - Requests are rate limited globally: at most EXEC_RATE_LIMIT accepted
 *   requests per EXEC_RATE_WINDOW.
 * - `payload.command` must be a non-empty string.
 * - `payload.timeout` is in seconds (default 30). Values that are not finite,
 *   non-negative numbers fall back to the default, and the millisecond value
 *   is rounded to an integer, because `spawn` throws synchronously on a
 *   non-integer or negative `timeout`.
 * - stdout/stderr are decoded as UTF-8 by the stream (so multi-byte characters
 *   split across chunks are not corrupted) and each capped at MAX_OUTPUT
 *   characters.
 * - Exactly one response is sent per request, whichever of "close", "error"
 *   or a synchronous spawn failure happens first.
 *
 * @param frame The diagnostic_exec request.
 * @param conn  Connection to reply on.
 */
function handleDiagnosticExec(frame: DiagnosticExec, conn: Connection) {
  let responded = false;
  const sendResponse = (payload: Record<string, unknown>) => {
    if (responded) return;
    responded = true;
    conn.send({
      type: "diagnostic_exec_res",
      id: frame.id,
      from: sentinelNodeId(),
      to: frame.from,
      timestamp: Date.now(),
      payload,
    } as AnyClusterFrame);
  };

  // Rate limiting: drop timestamps that have aged out of the window.
  const now = Date.now();
  while (execTimestamps.length > 0 && now - execTimestamps[0]! > EXEC_RATE_WINDOW) {
    execTimestamps.shift();
  }
  if (execTimestamps.length >= EXEC_RATE_LIMIT) {
    sendResponse({ success: false, error: "Rate limit exceeded" });
    return;
  }
  execTimestamps.push(now);

  const { command, timeout = 30 } = frame.payload;
  if (typeof command !== "string" || command.length === 0) {
    sendResponse({ success: false, error: "Invalid command" });
    return;
  }
  const timeoutSec =
    typeof timeout === "number" && Number.isFinite(timeout) && timeout >= 0 ? timeout : 30;
  const timeoutMs = Math.round(timeoutSec * 1000);

  log(`Exec from ${frame.from}: ${command}`);

  let child: ReturnType<typeof spawn>;
  try {
    child = spawn("sh", ["-c", command], {
      stdio: ["ignore", "pipe", "pipe"],
      timeout: timeoutMs,
    });
  } catch (err) {
    // Argument validation failures are thrown synchronously by spawn().
    sendResponse({ success: false, error: err instanceof Error ? err.message : String(err) });
    return;
  }

  const MAX_OUTPUT = 512 * 1024; // 512K characters per stream
  let stdout = "";
  let stderr = "";

  // Let the streams decode UTF-8 so chunk boundaries cannot split a character.
  child.stdout?.setEncoding("utf8");
  child.stderr?.setEncoding("utf8");

  child.stdout?.on("data", (chunk: string) => {
    if (stdout.length < MAX_OUTPUT) stdout += chunk.slice(0, MAX_OUTPUT - stdout.length);
  });
  child.stderr?.on("data", (chunk: string) => {
    if (stderr.length < MAX_OUTPUT) stderr += chunk.slice(0, MAX_OUTPUT - stderr.length);
  });

  child.on("close", (code) => {
    sendResponse({
      success: code === 0,
      // code is null when the child was terminated by a signal (e.g. timeout).
      exitCode: code ?? 1,
      stdout,
      stderr,
    });
  });

  child.on("error", (err) => {
    sendResponse({ success: false, error: err.message });
  });
}
|
|
317
|
+
|
|
318
|
+
/**
 * Answer a diagnostic_status request with a snapshot of the sentinel's state:
 * whether the gateway is considered alive, sentinel uptime and PID, the
 * gateway PID (only while the gateway is alive), and whether the takeover
 * listener is active.
 *
 * @param frame The diagnostic_status request.
 * @param conn  Connection to reply on.
 */
function handleDiagnosticStatus(frame: DiagnosticStatus, conn: Connection) {
  const timestamp = Date.now();

  let reportedGatewayPid: number | undefined;
  if (gatewayAlive && gatewayPid) {
    reportedGatewayPid = gatewayPid;
  }

  const payload = {
    gatewayAlive,
    uptimeMs: Date.now() - startTime,
    pid: process.pid,
    gatewayPid: reportedGatewayPid,
    listening,
  };

  const response = {
    type: "diagnostic_status_res",
    id: frame.id,
    from: sentinelNodeId(),
    to: frame.from,
    timestamp,
    payload,
  } as AnyClusterFrame;

  conn.send(response);
}
|
|
334
|
+
|
|
335
|
+
// ── Port takeover: listen when gateway dies, release when it returns ──
|
|
336
|
+
|
|
337
|
+
/**
 * Take over the gateway's listen port: start an HTTP server with an attached
 * WebSocketServer on `config.listenPort` and accept inbound peer connections.
 *
 * No-op when already listening, when a listen attempt is still pending, or
 * when no port is configured. If the PID file shows this sentinel has been
 * replaced, the process exits via cleanup() instead of competing for the port.
 *
 * Inbound peers are wrapped in a Connection. After authentication they must
 * pass isSentinelPeerAllowed(); rejected peers are closed with code 4005.
 *
 * On a listener error (typically EADDRINUSE while the dying gateway still
 * holds the port) the servers are torn down and a retry is attempted after 3s,
 * as long as the gateway is still down.
 *
 * Error listeners are attached to the WebSocketServer and to every inbound
 * socket. `ws` re-emits the HTTP server's "error" on the WebSocketServer, and
 * an EventEmitter "error" without a listener throws — which would abort
 * dispatch before the retry handler below gets to run.
 */
function startListening() {
  // `httpServer` is non-null while a listen attempt is pending.
  if (listening || httpServer || !config.listenPort) return;
  // If we've been replaced by a new sentinel, exit instead of competing for the port
  if (isReplaced()) {
    log("PID file replaced — another sentinel is active, exiting");
    cleanup();
    return;
  }
  const port = config.listenPort;
  const host = config.listenHost ?? "0.0.0.0";

  const e2eeOpts: ConnectionE2eeOptions = {
    e2ee: config.e2ee,
    compression: config.compression,
    identityKeyPair: config.e2ee && identityKeyPair ? identityKeyPair : undefined,
    deferAuthOk: !!config.peerApproval?.enabled,
  };

  // Plain HTTP requests get a generic placeholder page.
  const server = createServer((_req, res) => {
    res.writeHead(200, { "Content-Type": "text/html", "Server": "nginx" });
    res.end("<!DOCTYPE html><html><head><title>Welcome</title></head><body><p>It works!</p></body></html>");
  });
  httpServer = server;

  const wsServer = new WebSocketServer({
    server,
    // Echo back the first subprotocol the client offered, if any.
    handleProtocols(protocols) {
      if (protocols.size > 0) return protocols.values().next().value!;
      return false;
    },
  });
  wss = wsServer;

  // Server-level errors are handled by the HTTP server's "error" listener
  // below; this listener only prevents the re-emitted event from throwing.
  wsServer.on("error", () => { /* handled on httpServer */ });

  wsServer.on("connection", (ws) => {
    const transport: WsTransport = {
      send(data: string) { ws.send(data); },
      close(code?: number, reason?: string) { ws.close(code, reason); },
      get readyState() { return ws.readyState; },
    };

    const conn = new Connection(
      transport,
      "inbound",
      sentinelNodeId(),
      config.secret,
      buildCapabilities(),
      e2eeOpts,
    );

    inboundConnections.set(ws, conn);

    // Node id this socket was registered under in `connections`, if any.
    let registeredAs: string | null = null;

    conn.on("authenticated", (caps) => {
      const nodeId = caps.nodeId;
      const peerPublicKey = conn.remoteIdentityKey;

      // Sentinel only accepts already-approved peers — no approval flow
      if (!isSentinelPeerAllowed(nodeId, peerPublicKey)) {
        log(`Rejected unapproved peer: ${nodeId}`);
        conn.close(4005, "not approved");
        return;
      }

      conn.completeAuth();
      connections.set(nodeId, conn);
      registeredAs = nodeId;
      log(`Inbound peer authenticated: ${nodeId}`);

      // Send peer_sync so the client can see this sentinel in its peer list
      conn.send({
        type: "peer_sync",
        from: sentinelNodeId(),
        timestamp: Date.now(),
        payload: { peers: [buildCapabilities()] },
      } as AnyClusterFrame);
    });

    conn.on("message", (frame) => handleFrame(frame, conn));

    conn.on("close", () => {
      inboundConnections.delete(ws);
    });

    conn.on("error", () => { /* close will follow */ });

    ws.on("message", (data) => {
      conn.feedMessage(typeof data === "string" ? data : String(data));
    });

    // Without a listener a socket-level error would surface as an uncaught
    // exception; "close" follows and performs the actual teardown.
    ws.on("error", (err) => {
      log(`Inbound socket error: ${err.message}`);
    });

    ws.on("close", (code, reason) => {
      conn.feedClose(code, reason.toString());
      inboundConnections.delete(ws);
      // Remove the registry entry only if it still points at this connection.
      if (registeredAs !== null && connections.get(registeredAs) === conn) {
        connections.delete(registeredAs);
      }
    });
  });

  server.on("error", (err) => {
    log(`Listener error on port ${port}: ${err.message}`);
    server.close();
    if (httpServer === server) httpServer = null;
    wsServer.close();
    if (wss === wsServer) wss = null;
    listening = false;

    // If we've been replaced by a new sentinel, exit gracefully
    if (isReplaced()) {
      log("PID file replaced — exiting");
      cleanup();
      return;
    }
    // Port may still be held briefly by the dying gateway — retry after a delay
    setTimeout(() => {
      if (!gatewayAlive && config.listenPort) startListening();
    }, 3_000);
  });

  server.listen(port, host, () => {
    listening = true;
    log(`Port takeover: listening on ${host}:${port}`);
  });
}
|
|
453
|
+
|
|
454
|
+
/**
 * Release the takeover port: close every inbound connection, then the
 * WebSocketServer and the HTTP server, and clear the listener state.
 *
 * Teardown also runs when a server object exists but `listening` is still
 * false (a listen() call is pending or failed), so a half-started listener
 * cannot bind the port after the gateway has come back.
 */
function stopListening() {
  if (!listening && !httpServer && !wss) return;
  // Gracefully close all inbound connections
  for (const [ws, conn] of inboundConnections) {
    conn.close(1001, "gateway recovered");
    ws.close(1001, "gateway recovered");
  }
  inboundConnections.clear();
  wss?.close();
  wss = null;
  httpServer?.close();
  httpServer = null;
  listening = false;
  log("Port released — gateway is back");
}
|
|
469
|
+
|
|
470
|
+
// ── PID file management ─────────────────────────────────────────
|
|
471
|
+
/** Record this process's PID (as decimal text) in the configured PID file. */
function writePidFile() {
  const ownPid = String(process.pid);
  writeFileSync(config.pidFile, ownPid);
}
|
|
474
|
+
|
|
475
|
+
/**
 * Report whether another sentinel has taken this one's place.
 *
 * @returns true when the PID file is missing or holds anything other than
 *          this process's PID; false when it holds our PID or cannot be read.
 */
function isReplaced(): boolean {
  let recordedPid: number;
  try {
    if (!existsSync(config.pidFile)) return true;
    const contents = readFileSync(config.pidFile, "utf-8");
    recordedPid = parseInt(contents.trim(), 10);
  } catch {
    // Unreadable PID file: assume we are still the active sentinel.
    return false;
  }
  return recordedPid !== process.pid;
}
|
|
485
|
+
|
|
486
|
+
/**
 * Terminate a previous sentinel recorded in the PID file, if one is running.
 *
 * Sends SIGTERM and then blocks for up to 5s, polling every 100ms, until the
 * old process is gone. The wait is synchronous on purpose (boot continues
 * only after the old sentinel has exited) but uses `Atomics.wait` to sleep
 * rather than spinning the CPU.
 *
 * Only strictly positive PIDs are signalled: 0 and negative values address
 * process groups (or every process) and must never be taken from a file.
 */
function killOldSentinel() {
  if (!existsSync(config.pidFile)) return;

  let oldPid: number;
  try {
    oldPid = parseInt(readFileSync(config.pidFile, "utf-8").trim(), 10);
  } catch {
    // Unreadable PID file
    return;
  }
  // NaN (malformed file), non-positive values and our own PID are all skipped.
  if (!Number.isInteger(oldPid) || oldPid <= 0 || oldPid === process.pid) return;

  try {
    process.kill(oldPid, 0); // existence check
    process.kill(oldPid, "SIGTERM");
  } catch {
    // Process already gone
    return;
  }
  log(`Killing old sentinel (pid ${oldPid})`);

  // Wait for the old process to actually exit (up to 5s)
  const sleepCell = new Int32Array(new SharedArrayBuffer(4));
  const deadline = Date.now() + 5_000;
  while (Date.now() < deadline) {
    try {
      process.kill(oldPid, 0);
    } catch {
      log(`Old sentinel (pid ${oldPid}) exited`);
      return;
    }
    // Still alive — sleep 100ms; nothing ever notifies the cell, so this times out.
    Atomics.wait(sleepCell, 0, 0, 100);
  }
  log(`Old sentinel (pid ${oldPid}) still running after 5s`);
}
|
|
517
|
+
|
|
518
|
+
/**
 * Shut the sentinel down: stop the health check, remove our PID file, close
 * peer connections, cancel pending reconnects, release the takeover port and
 * exit with code 0.
 *
 * The PID file is removed only when it still contains this process's PID.
 * cleanup() is also the exit path for a sentinel that has been replaced, and
 * in that case the file belongs to the successor — deleting it would make the
 * successor conclude that it, too, had been replaced.
 */
function cleanup() {
  if (healthCheckTimer) { clearInterval(healthCheckTimer); healthCheckTimer = null; }
  try {
    const recordedPid = parseInt(readFileSync(config.pidFile, "utf-8").trim(), 10);
    if (recordedPid === process.pid) unlinkSync(config.pidFile);
  } catch { /* ignore: missing/unreadable file, or config not yet received */ }
  for (const conn of connections.values()) conn.close(1000, "shutdown");
  for (const timer of reconnectTimers.values()) clearTimeout(timer);
  stopListening();
  process.exit(0);
}
|
|
526
|
+
|
|
527
|
+
// ── Logging ─────────────────────────────────────────────────────
|
|
528
|
+
/**
 * Write one timestamped line to stderr.
 *
 * @param msg Message text; a trailing newline is appended.
 */
function log(msg: string) {
  const stamp = new Date().toISOString();
  // Generic prefix to avoid endpoint detection fingerprinting
  const line = "[svc " + stamp + "] " + msg + "\n";
  process.stderr.write(line);
}
|
|
533
|
+
|
|
534
|
+
// ── Bootstrap ───────────────────────────────────────────────────
|
|
535
|
+
// IPC control channel: "init" carries the config and starts the sentinel,
// "shutdown" asks it to exit.
process.on("message", (msg: unknown) => {
  const m = msg as { type: string; config?: SentinelConfig };
  if (m.type === "shutdown") {
    cleanup();
    return;
  }
  if (m.type === "init" && m.config) {
    config = m.config;
    boot();
  }
});

// Parent IPC disconnect — by design (SentinelManager disconnects after init).
// Switch to PID-based health checks instead of treating disconnect as crash.
process.on("disconnect", () => {
  log("IPC disconnected — switching to PID-based gateway health check");
  startGatewayHealthCheck();
});

// SIGTERM / SIGINT shut the sentinel down; SIGHUP is logged and ignored.
for (const signal of ["SIGTERM", "SIGINT"] as const) {
  process.on(signal, () => {
    log(`Received ${signal}`);
    cleanup();
  });
}
process.on("SIGHUP", () => {
  log("Received SIGHUP (ignored)");
});

process.on("uncaughtException", (err) => {
  log(`Uncaught exception: ${err.stack || err.message}`);
  // EADDRINUSE from a listen call means the port is taken — if we've been
  // replaced by a new sentinel/gateway, exit cleanly instead of looping.
  const portTaken = (err as NodeJS.ErrnoException).code === "EADDRINUSE";
  if (portTaken && isReplaced()) {
    log("Port in use and PID file replaced — exiting");
    cleanup();
  }
});

process.on("unhandledRejection", (reason) => {
  log(`Unhandled rejection: ${reason}`);
});

process.on("beforeExit", (code) => {
  log(`beforeExit code=${code}`);
});

process.on("exit", (code) => {
  // Sync write since event loop is draining
  const ts = new Date().toISOString();
  try {
    process.stderr.write(`[svc ${ts}] Exit code=${code}\n`);
  } catch {
    /* ignore */
  }
});
|
|
571
|
+
|
|
572
|
+
/**
 * Start polling the gateway process every 5s with `kill(pid, 0)`.
 *
 * On the alive → gone transition the sentinel enters standalone mode and,
 * if a listen port is configured, starts the port takeover after a 2s delay.
 * On the gone → alive transition it releases the port again.
 * No-op if the check is already running or no gateway PID is known.
 */
function startGatewayHealthCheck() {
  if (healthCheckTimer || !gatewayPid) return;

  // Gateway answered the probe: if it was considered down, hand the port back.
  const noteGatewayPresent = () => {
    if (gatewayAlive) return;
    gatewayAlive = true;
    log("Gateway process detected — back online");
    // Release the port so the gateway can reclaim it
    stopListening();
  };

  // Probe failed: if the gateway was considered up, switch to standalone mode.
  const noteGatewayMissing = () => {
    if (!gatewayAlive) return;
    gatewayAlive = false;
    log(`Gateway process (pid ${gatewayPid}) gone — entering standalone mode`);
    // Take over the gateway's listen port
    if (!config.listenPort) return;
    // Small delay to let the OS release the port from the dead process
    setTimeout(() => {
      if (!gatewayAlive && !isReplaced()) startListening();
    }, 2_000);
  };

  healthCheckTimer = setInterval(() => {
    if (!gatewayPid) return;
    try {
      process.kill(gatewayPid, 0); // signal 0 = existence check
      noteGatewayPresent();
    } catch {
      noteGatewayMissing();
    }
  }, 5_000);
}
|
|
600
|
+
|
|
601
|
+
/**
 * Start the sentinel once its config has arrived: resolve the gateway PID,
 * load approved peers, replace any previous sentinel, record our PID and dial
 * every configured peer.
 */
function boot() {
  // Prefer explicit gatewayPid from config (sent by SentinelManager),
  // fall back to ppid (may be inaccurate if forked indirectly).
  gatewayPid = config.gatewayPid ?? process.ppid;

  loadApprovedPeers();
  killOldSentinel();
  writePidFile();

  const takeoverPort = config.listenPort || "none";
  log(`Started (pid ${process.pid}, gateway ${gatewayPid}, nodeId ${sentinelNodeId()}, takeover port ${takeoverPort})`);

  // Connect to all configured peers
  config.peers.forEach((peer) => {
    connectToPeer(peer);
  });

  // Note: we do NOT start listening here.
  // Listening only starts when gateway dies (port takeover mode).
}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import type { PeerManager } from "./peer-manager.ts";
|
|
2
|
+
import type { ClawMatrixConfig } from "./config.ts";
|
|
3
|
+
import type { TaskActivityFrame, TaskActivityStatus } from "./types.ts";
|
|
4
|
+
|
|
5
|
+
/**
 * Broadcasts task_activity frames to mobile peers (tagged mobile/ios/phone).
 * Shared between AcpProxy and HandoffManager to avoid duplication.
 */
export class TaskActivityBroadcaster {
  private config: ClawMatrixConfig;
  private peerManager: PeerManager;
  /** taskId → time (ms) the last "progress" frame was let through. */
  private throttles = new Map<string, number>();

  constructor(config: ClawMatrixConfig, peerManager: PeerManager) {
    this.config = config;
    this.peerManager = peerManager;
  }

  /**
   * Send a task_activity frame for one task to every mobile-tagged peer.
   *
   * "progress" updates are throttled to at most one per 3 seconds per task;
   * any other status always goes out and resets that task's throttle.
   * Nothing is sent when no peer carries a mobile/ios/phone tag.
   *
   * @param taskId    Task identifier (also the throttle key).
   * @param taskType  Originating subsystem.
   * @param status    Activity status to report.
   * @param agent     Agent name; also used as the frame's title.
   * @param startedAt Task start time in ms since epoch; used for elapsedMs.
   * @param detail    Optional detail text.
   * @param tool      Optional tool name.
   * @param toolDone  Optional tool-finished flag.
   */
  broadcast(
    taskId: string,
    taskType: "acp" | "handoff",
    status: TaskActivityStatus,
    agent: string,
    startedAt: number,
    detail?: string,
    tool?: string,
    toolDone?: boolean,
  ) {
    if (status !== "progress") {
      this.throttles.delete(taskId);
    } else {
      // Throttle progress updates to at most once per 3 seconds per task
      const checkedAt = Date.now();
      const previous = this.throttles.get(taskId) ?? 0;
      if (checkedAt - previous < 3_000) return;
      this.throttles.set(taskId, checkedAt);
    }

    const mobileTags = ["mobile", "ios", "phone"];
    const recipients = this.peerManager.router
      .getAllPeers()
      .filter((peer) => peer.tags.some((tag) => mobileTags.includes(tag)));
    if (recipients.length === 0) return;

    const sentAt = Date.now();
    const ownNodeId = this.config.nodeId;
    const frame: TaskActivityFrame = {
      type: "task_activity",
      from: ownNodeId,
      timestamp: sentAt,
      payload: {
        taskId,
        taskType,
        status,
        agent,
        nodeId: ownNodeId,
        title: agent,
        detail,
        startedAt,
        elapsedMs: sentAt - startedAt,
        tool,
        toolDone,
      },
    };

    // Each recipient gets its own copy addressed to it.
    recipients.forEach((recipient) => {
      this.peerManager.sendTo(recipient.nodeId, { ...frame, to: recipient.nodeId });
    });
  }

  /** Clean up throttle state for a completed/failed task. */
  cleanup(taskId: string) {
    this.throttles.delete(taskId);
  }
}
|