clawmatrix 0.2.7 → 0.2.9
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/acp-proxy.ts +574 -207
- package/src/cluster-service.ts +24 -2
- package/src/compat.ts +36 -1
- package/src/handoff.ts +10 -3
- package/src/health-tracker.ts +581 -0
- package/src/model-proxy.ts +16 -1
- package/src/peer-manager.ts +61 -38
- package/src/router.ts +41 -0
- package/src/sentinel.ts +29 -7
- package/src/types.ts +10 -1
- package/src/web.ts +33 -0
package/src/peer-manager.ts
CHANGED
|
@@ -28,6 +28,26 @@ import type { KeyPair } from "./crypto.ts";
|
|
|
28
28
|
const RECONNECT_BASE = 1_000;
|
|
29
29
|
const RECONNECT_MAX = 60_000;
|
|
30
30
|
|
|
31
|
+
/** Frame types that bypass dedup (streams share one id across chunks; responses share id with their request; handoff control frames reuse the original handoff_req id). */
|
|
32
|
+
const SKIP_DEDUP_TYPES = new Set([
|
|
33
|
+
// Streaming
|
|
34
|
+
"model_stream", "handoff_stream", "acp_stream",
|
|
35
|
+
// Response frames (share id with their request)
|
|
36
|
+
"model_res", "tool_res", "tool_batch_res",
|
|
37
|
+
"handoff_res", "handoff_status_res", "handoff_input_required",
|
|
38
|
+
// Handoff control (reuse original handoff_req id)
|
|
39
|
+
"handoff_input", "handoff_cancel", "handoff_status",
|
|
40
|
+
// Diagnostics & approval
|
|
41
|
+
"diagnostic_exec_res", "diagnostic_status_res", "peer_approval_res",
|
|
42
|
+
// ACP responses
|
|
43
|
+
"acp_res", "acp_close_res", "acp_list_res", "acp_resume_res",
|
|
44
|
+
"acp_cancel_res", "acp_set_mode_res", "acp_get_modes_res",
|
|
45
|
+
"chat_history_res",
|
|
46
|
+
// Terminal
|
|
47
|
+
"terminal_open_res", "terminal_data", "terminal_resize",
|
|
48
|
+
"terminal_close", "terminal_close_res",
|
|
49
|
+
]);
|
|
50
|
+
|
|
31
51
|
/** Classify WebSocket close code into a human-readable reason. */
|
|
32
52
|
function classifyCloseReason(code: number, reason: string): string {
|
|
33
53
|
if (reason) return reason;
|
|
@@ -364,6 +384,12 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
364
384
|
tryReconnect();
|
|
365
385
|
});
|
|
366
386
|
ws.addEventListener("close", (ev) => {
|
|
387
|
+
// Don't reconnect if this was a self-connection (peer URL points to ourselves).
|
|
388
|
+
// Without this guard, outbound detects self → closes → scheduleReconnect → loop.
|
|
389
|
+
if (ev.code === 4002 && ev.reason === "self-connection") {
|
|
390
|
+
debug("peer", `connectToPeer(${peer.nodeId}): self-connection, will not reconnect`);
|
|
391
|
+
return;
|
|
392
|
+
}
|
|
367
393
|
if (!lastError) {
|
|
368
394
|
lastError = classifyCloseReason(ev.code, ev.reason);
|
|
369
395
|
}
|
|
@@ -397,19 +423,17 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
397
423
|
// Peer's persistent public key for TOFU identity binding
|
|
398
424
|
const peerPublicKey = conn.remoteIdentityKey ?? undefined;
|
|
399
425
|
|
|
400
|
-
// Prevent self-connection:
|
|
401
|
-
//
|
|
402
|
-
|
|
403
|
-
|
|
426
|
+
// Prevent self-connection: close immediately if the remote side authenticated
|
|
427
|
+
// with our own nodeId. For outbound this means the peer URL accidentally
|
|
428
|
+
// points to self; for inbound it means a remote node is (mis)using our nodeId.
|
|
429
|
+
// Exception: loopback connections with the same nodeId are local clients
|
|
430
|
+
// (Mac desktop app / iOS simulator) and are allowed through.
|
|
431
|
+
const isLocalClient = conn.role === "inbound" && nodeId === this.config.nodeId && isLoopback(ip);
|
|
432
|
+
if (nodeId === this.config.nodeId && !isLocalClient) {
|
|
433
|
+
debug("peer", `Self-connection detected (${conn.role}, nodeId=${nodeId}, ip=${ip}), closing`);
|
|
404
434
|
conn.close(4002, "self-connection");
|
|
405
435
|
return;
|
|
406
436
|
}
|
|
407
|
-
|
|
408
|
-
// Peer approval check (inbound only — outbound peers are explicitly configured)
|
|
409
|
-
// Skip approval for same-nodeId connections from localhost (local clients
|
|
410
|
-
// like Mac desktop app / iOS simulator). An attacker would need to already
|
|
411
|
-
// be on the same machine to exploit this, which is outside our threat model.
|
|
412
|
-
const isLocalClient = nodeId === this.config.nodeId && isLoopback(ip);
|
|
413
437
|
debug("approval", `onPeerAuthenticated: nodeId=${nodeId} role=${conn.role} isLocalClient=${isLocalClient} ip=${ip}`);
|
|
414
438
|
if (conn.role === "inbound" && !isLocalClient) {
|
|
415
439
|
// IP-level approval rate limiting (suppress noise from leaked tokens)
|
|
@@ -580,8 +604,11 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
580
604
|
|
|
581
605
|
// ── Message handling ───────────────────────────────────────────
|
|
582
606
|
private onFrame(frame: AnyClusterFrame, from: Connection) {
|
|
583
|
-
// Ignore self-echo: frames with our own nodeId that were relayed back to us
|
|
584
|
-
|
|
607
|
+
// Ignore self-echo: frames with our own nodeId that were relayed back to us.
|
|
608
|
+
// Exception: frames from same-nodeId satellite connections (Mac/iOS client)
|
|
609
|
+
// are legitimate requests that must be processed or relayed.
|
|
610
|
+
const isSatellite = from.remoteNodeId === this.config.nodeId;
|
|
611
|
+
if (frame.from === this.config.nodeId && !isSatellite) return;
|
|
585
612
|
|
|
586
613
|
// Validate from field: must be the direct peer or a known node (relayed)
|
|
587
614
|
if (frame.from && frame.from !== from.remoteNodeId && !this.router.getRoute(frame.from)) {
|
|
@@ -589,32 +616,10 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
589
616
|
return;
|
|
590
617
|
}
|
|
591
618
|
|
|
592
|
-
// Skip dedup for streaming and response frame types.
|
|
593
|
-
// Stream frames share one id across many chunks.
|
|
594
|
-
// Response frames (model_res, tool_res, handoff_res, etc.) share the same id
|
|
595
|
-
// as their request — without this exemption, a relay node that forwarded
|
|
596
|
-
// the request would mark the id as seen and then drop the returning response.
|
|
597
619
|
// Skip dedup for streaming chunks (same id across many chunks) and response
|
|
598
620
|
// frames (share id with their request — relay would otherwise drop the reply).
|
|
599
|
-
// handoff_input
|
|
600
|
-
|
|
601
|
-
const skipDedup = frame.type === "model_stream" || frame.type === "handoff_stream"
|
|
602
|
-
|| frame.type === "model_res" || frame.type === "tool_res"
|
|
603
|
-
|| frame.type === "handoff_res" || frame.type === "handoff_status_res"
|
|
604
|
-
|| frame.type === "handoff_input_required"
|
|
605
|
-
|| frame.type === "handoff_input" || frame.type === "handoff_cancel"
|
|
606
|
-
|| frame.type === "handoff_status"
|
|
607
|
-
|| frame.type === "diagnostic_exec_res" || frame.type === "diagnostic_status_res"
|
|
608
|
-
|| frame.type === "peer_approval_res"
|
|
609
|
-
|| frame.type === "acp_stream" || frame.type === "acp_res"
|
|
610
|
-
|| frame.type === "acp_close_res"
|
|
611
|
-
|| frame.type === "acp_list_res" || frame.type === "acp_resume_res"
|
|
612
|
-
|| frame.type === "acp_cancel_res"
|
|
613
|
-
|| frame.type === "acp_set_mode_res" || frame.type === "acp_get_modes_res"
|
|
614
|
-
|| frame.type === "terminal_open_res" || frame.type === "terminal_data"
|
|
615
|
-
|| frame.type === "terminal_resize" || frame.type === "terminal_close"
|
|
616
|
-
|| frame.type === "terminal_close_res";
|
|
617
|
-
if (frame.id && !skipDedup && this.router.isDuplicate(frame.id)) return;
|
|
621
|
+
// handoff_input/cancel/status reuse the original handoff_req id.
|
|
622
|
+
if (frame.id && !SKIP_DEDUP_TYPES.has(frame.type) && this.router.isDuplicate(frame.id)) return;
|
|
618
623
|
|
|
619
624
|
// Handle peer approval responses locally (don't emit to cluster-service)
|
|
620
625
|
if (frame.type === "peer_approval_res") {
|
|
@@ -672,8 +677,26 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
672
677
|
}
|
|
673
678
|
|
|
674
679
|
if (frame.to && frame.to !== this.config.nodeId) {
|
|
675
|
-
this.router.tryRelay(frame);
|
|
676
|
-
|
|
680
|
+
if (this.router.tryRelay(frame)) return;
|
|
681
|
+
|
|
682
|
+
// Relay failed — for model_req, try alternative nodes or fall through to local handling
|
|
683
|
+
if (frame.type === "model_req") {
|
|
684
|
+
const modelId = (frame as any).payload?.model;
|
|
685
|
+
if (modelId) {
|
|
686
|
+
const exclude = new Set([frame.to, this.config.nodeId]);
|
|
687
|
+
const alternatives = this.router.findNodesForModel(modelId, exclude);
|
|
688
|
+
for (const alt of alternatives) {
|
|
689
|
+
if (this.sendTo(alt.nodeId, { ...frame, to: alt.nodeId })) {
|
|
690
|
+
debug("peer", `model_req failover: ${frame.to} → ${alt.nodeId}`);
|
|
691
|
+
return;
|
|
692
|
+
}
|
|
693
|
+
}
|
|
694
|
+
}
|
|
695
|
+
// No remote alternative — fall through to local handling
|
|
696
|
+
// (model-proxy will handle locally or send error back)
|
|
697
|
+
} else {
|
|
698
|
+
return;
|
|
699
|
+
}
|
|
677
700
|
}
|
|
678
701
|
|
|
679
702
|
// Forward to same-nodeId satellite connection (e.g. Mac desktop app) so that
|
package/src/router.ts
CHANGED
|
@@ -3,6 +3,7 @@ import type { Connection } from "./connection.ts";
|
|
|
3
3
|
|
|
4
4
|
const DEFAULT_TTL = 3;
|
|
5
5
|
const MAX_SEEN_FRAMES = 10_000;
|
|
6
|
+
const MAX_FAILED_REQUESTS = 5_000;
|
|
6
7
|
const ROTATE_INTERVAL = 60_000; // rotate dedup maps every 60s
|
|
7
8
|
|
|
8
9
|
export interface RouteEntry {
|
|
@@ -232,6 +233,31 @@ export class Router {
|
|
|
232
233
|
return this.routes.get(target);
|
|
233
234
|
}
|
|
234
235
|
|
|
236
|
+
/** Find reachable nodes that provide a specific model, sorted with directly connected nodes first and then by ascending latency.
|
|
237
|
+
* Excludes nodes in the `exclude` set. */
|
|
238
|
+
findNodesForModel(modelId: string, exclude?: Set<string>): RouteEntry[] {
|
|
239
|
+
const candidates: RouteEntry[] = [];
|
|
240
|
+
for (const entry of this.routes.values()) {
|
|
241
|
+
if (exclude?.has(entry.nodeId)) continue;
|
|
242
|
+
if (!entry.models.some((m) => m.id === modelId)) continue;
|
|
243
|
+
// Check reachability
|
|
244
|
+
if (entry.connection?.isOpen) {
|
|
245
|
+
candidates.push(entry);
|
|
246
|
+
} else if (entry.reachableVia) {
|
|
247
|
+
const relay = this.connections.get(entry.reachableVia);
|
|
248
|
+
if (relay?.isOpen) candidates.push(entry);
|
|
249
|
+
}
|
|
250
|
+
}
|
|
251
|
+
// Sort: entries that hold a direct connection first, then by ascending latency
|
|
252
|
+
candidates.sort((a, b) => {
|
|
253
|
+
const aDirect = a.connection ? 0 : 1;
|
|
254
|
+
const bDirect = b.connection ? 0 : 1;
|
|
255
|
+
if (aDirect !== bDirect) return aDirect - bDirect;
|
|
256
|
+
return a.latencyMs - b.latencyMs;
|
|
257
|
+
});
|
|
258
|
+
return candidates;
|
|
259
|
+
}
|
|
260
|
+
|
|
235
261
|
// ── Message sending and relay ──────────────────────────────────
|
|
236
262
|
/** Send a frame to a specific node, relaying if necessary. Returns true if sent. */
|
|
237
263
|
sendTo(targetNodeId: string, frame: ClusterFrame | AnyClusterFrame): boolean {
|
|
@@ -305,6 +331,21 @@ export class Router {
|
|
|
305
331
|
* TTL defaults to 15 minutes — long enough for handoff timeouts. */
|
|
306
332
|
markFailed(requestId: string, ttlMs = 900_000) {
|
|
307
333
|
this.failedRequests.set(requestId, Date.now() + ttlMs);
|
|
334
|
+
// Evict entries when map grows too large: first expired, then FIFO
|
|
335
|
+
if (this.failedRequests.size > MAX_FAILED_REQUESTS) {
|
|
336
|
+
const now = Date.now();
|
|
337
|
+
// Pass 1: remove expired entries
|
|
338
|
+
for (const [id, expiresAt] of this.failedRequests) {
|
|
339
|
+
if (now > expiresAt) this.failedRequests.delete(id);
|
|
340
|
+
}
|
|
341
|
+
// Pass 2: if still over limit, remove oldest (insertion-order) entries
|
|
342
|
+
if (this.failedRequests.size > MAX_FAILED_REQUESTS) {
|
|
343
|
+
for (const [id] of this.failedRequests) {
|
|
344
|
+
if (this.failedRequests.size <= MAX_FAILED_REQUESTS) break;
|
|
345
|
+
this.failedRequests.delete(id);
|
|
346
|
+
}
|
|
347
|
+
}
|
|
348
|
+
}
|
|
308
349
|
}
|
|
309
350
|
|
|
310
351
|
isFailed(requestId: string): boolean {
|
package/src/sentinel.ts
CHANGED
|
@@ -569,6 +569,28 @@ process.on("exit", (code) => {
|
|
|
569
569
|
try { process.stderr.write(`[svc ${ts}] Exit code=${code}\n`); } catch { /* ignore */ }
|
|
570
570
|
});
|
|
571
571
|
|
|
572
|
+
/** Connect to every configured peer that has no existing connection and no pending reconnect timer (called when gateway dies). */
|
|
573
|
+
function connectAllPeers() {
|
|
574
|
+
for (const peer of config.peers) {
|
|
575
|
+
if (!connections.has(peer.nodeId) && !reconnectTimers.has(peer.nodeId)) {
|
|
576
|
+
connectToPeer(peer);
|
|
577
|
+
}
|
|
578
|
+
}
|
|
579
|
+
}
|
|
580
|
+
|
|
581
|
+
/** Close all peer connections, cancel pending reconnect timers, and reset reconnect attempt counters (called when gateway recovers). */
|
|
582
|
+
function disconnectAllPeers() {
|
|
583
|
+
for (const [nodeId, conn] of connections) {
|
|
584
|
+
conn.close(1000, "gateway recovered");
|
|
585
|
+
connections.delete(nodeId);
|
|
586
|
+
}
|
|
587
|
+
for (const [nodeId, timer] of reconnectTimers) {
|
|
588
|
+
clearTimeout(timer);
|
|
589
|
+
reconnectTimers.delete(nodeId);
|
|
590
|
+
}
|
|
591
|
+
reconnectAttempts.clear();
|
|
592
|
+
}
|
|
593
|
+
|
|
572
594
|
/** Periodically check if the gateway process is still alive via kill(pid, 0). */
|
|
573
595
|
function startGatewayHealthCheck() {
|
|
574
596
|
if (healthCheckTimer || !gatewayPid) return;
|
|
@@ -581,11 +603,15 @@ function startGatewayHealthCheck() {
|
|
|
581
603
|
log("Gateway process detected — back online");
|
|
582
604
|
// Release the port so the gateway can reclaim it
|
|
583
605
|
stopListening();
|
|
606
|
+
// Disconnect from peers — gateway handles mesh connections
|
|
607
|
+
disconnectAllPeers();
|
|
584
608
|
}
|
|
585
609
|
} catch {
|
|
586
610
|
if (gatewayAlive) {
|
|
587
611
|
gatewayAlive = false;
|
|
588
612
|
log(`Gateway process (pid ${gatewayPid}) gone — entering standalone mode`);
|
|
613
|
+
// Connect to peers now that gateway is down
|
|
614
|
+
connectAllPeers();
|
|
589
615
|
// Take over the gateway's listen port
|
|
590
616
|
if (config.listenPort) {
|
|
591
617
|
// Small delay to let the OS release the port from the dead process
|
|
@@ -608,11 +634,7 @@ function boot() {
|
|
|
608
634
|
writePidFile();
|
|
609
635
|
log(`Started (pid ${process.pid}, gateway ${gatewayPid}, nodeId ${sentinelNodeId()}, takeover port ${config.listenPort || "none"})`);
|
|
610
636
|
|
|
611
|
-
//
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
}
|
|
615
|
-
|
|
616
|
-
// Note: we do NOT start listening here.
|
|
617
|
-
// Listening only starts when gateway dies (port takeover mode).
|
|
637
|
+
// Do NOT connect to peers on boot — gateway handles mesh connections.
|
|
638
|
+
// Sentinel only connects when gateway dies (standalone mode).
|
|
639
|
+
// Listening also only starts when gateway dies (port takeover mode).
|
|
618
640
|
}
|
package/src/types.ts
CHANGED
|
@@ -411,6 +411,14 @@ export interface KnowledgeSyncFrame extends ClusterFrame {
|
|
|
411
411
|
};
|
|
412
412
|
}
|
|
413
413
|
|
|
414
|
+
// ── Health sync ──────────────────────────────────────────────────
|
|
415
|
+
export interface HealthSyncFrame extends ClusterFrame {
|
|
416
|
+
type: "health_sync";
|
|
417
|
+
payload: {
|
|
418
|
+
data: string; // base64-encoded Automerge sync message
|
|
419
|
+
};
|
|
420
|
+
}
|
|
421
|
+
|
|
414
422
|
// ── Diagnostic (sentinel) ────────────────────────────────────────
|
|
415
423
|
export interface DiagnosticExec extends ClusterFrame {
|
|
416
424
|
type: "diagnostic_exec";
|
|
@@ -831,4 +839,5 @@ export type AnyClusterFrame =
|
|
|
831
839
|
| TerminalData
|
|
832
840
|
| TerminalResize
|
|
833
841
|
| TerminalCloseRequest
|
|
834
|
-
| TerminalCloseResponse
|
|
842
|
+
| TerminalCloseResponse
|
|
843
|
+
| HealthSyncFrame;
|
package/src/web.ts
CHANGED
|
@@ -3,6 +3,7 @@ import type { PeerManager } from "./peer-manager.ts";
|
|
|
3
3
|
import type { HandoffManager } from "./handoff.ts";
|
|
4
4
|
import type { ClawMatrixConfig } from "./config.ts";
|
|
5
5
|
import type { SatelliteContext, IngestedEvent } from "./types.ts";
|
|
6
|
+
import type { HealthTracker } from "./health-tracker.ts";
|
|
6
7
|
import { timingSafeEqual } from "./auth.ts";
|
|
7
8
|
import { renderDashboard } from "./web-ui.ts";
|
|
8
9
|
import { readBody } from "./http-utils.ts";
|
|
@@ -46,6 +47,7 @@ export class WebHandler {
|
|
|
46
47
|
private ingestedEvents: IngestedEvent[] = []; // ring buffer for ingested events
|
|
47
48
|
private loginAttempts = new Map<string, { count: number; resetAt: number }>(); // IP → rate limit
|
|
48
49
|
private loginCleanupTimer: ReturnType<typeof setInterval> | null = null;
|
|
50
|
+
private healthTracker: HealthTracker | null = null;
|
|
49
51
|
private onPeerConnected: (nodeId: string) => void;
|
|
50
52
|
private onPeerDisconnected: (nodeId: string) => void;
|
|
51
53
|
|
|
@@ -91,6 +93,11 @@ export class WebHandler {
|
|
|
91
93
|
peerManager.on("peerDisconnected", this.onPeerDisconnected);
|
|
92
94
|
}
|
|
93
95
|
|
|
96
|
+
/** Set the health tracker that backs the /api/availability endpoint. */
|
|
97
|
+
setHealthTracker(tracker: HealthTracker) {
|
|
98
|
+
this.healthTracker = tracker;
|
|
99
|
+
}
|
|
100
|
+
|
|
94
101
|
/** Clean up timers and pending requests on shutdown. */
|
|
95
102
|
destroy() {
|
|
96
103
|
// Remove event listeners to prevent post-destroy callbacks
|
|
@@ -181,6 +188,11 @@ export class WebHandler {
|
|
|
181
188
|
return;
|
|
182
189
|
}
|
|
183
190
|
|
|
191
|
+
if (path === "/api/availability" && req.method === "GET") {
|
|
192
|
+
this.handleAvailability(req, res);
|
|
193
|
+
return;
|
|
194
|
+
}
|
|
195
|
+
|
|
184
196
|
if (path === "/api/satellite/poll" && req.method === "GET") {
|
|
185
197
|
this.handleSatellitePoll(req, res);
|
|
186
198
|
return;
|
|
@@ -271,6 +283,27 @@ export class WebHandler {
|
|
|
271
283
|
}
|
|
272
284
|
}
|
|
273
285
|
|
|
286
|
+
private handleAvailability(req: IncomingMessage, res: ServerResponse) {
|
|
287
|
+
if (!this.healthTracker) {
|
|
288
|
+
res.writeHead(503, { "Content-Type": "application/json" });
|
|
289
|
+
res.end(JSON.stringify({ error: "Health tracker not available" }));
|
|
290
|
+
return;
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
const url = new URL(req.url ?? "/", `http://${req.headers.host ?? "localhost"}`);
|
|
294
|
+
const range = (url.searchParams.get("range") ?? "24h") as "24h" | "7d" | "90d";
|
|
295
|
+
|
|
296
|
+
if (!["24h", "7d", "90d"].includes(range)) {
|
|
297
|
+
res.writeHead(400, { "Content-Type": "application/json" });
|
|
298
|
+
res.end(JSON.stringify({ error: "Invalid range. Use 24h, 7d, or 90d" }));
|
|
299
|
+
return;
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
const result = this.healthTracker.getAvailability(range);
|
|
303
|
+
res.writeHead(200, { "Content-Type": "application/json" });
|
|
304
|
+
res.end(JSON.stringify(result));
|
|
305
|
+
}
|
|
306
|
+
|
|
274
307
|
private handleStatus(res: ServerResponse) {
|
|
275
308
|
const peers = this.peerManager.router.getAllPeers();
|
|
276
309
|
const localNode = {
|