clawmatrix 0.2.7 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,6 +28,26 @@ import type { KeyPair } from "./crypto.ts";
28
28
  const RECONNECT_BASE = 1_000;
29
29
  const RECONNECT_MAX = 60_000;
30
30
 
31
/**
 * Frame types that bypass id-based duplicate detection.
 *
 * Stream frames share one id across all of their chunks, response frames
 * share the id of the request they answer, and handoff control frames reuse
 * the original handoff_req id, so deduplicating any of them by id would drop
 * legitimate frames.
 *
 * Typed as `ReadonlySet` so this shared module-level constant cannot be
 * mutated after construction.
 */
const SKIP_DEDUP_TYPES: ReadonlySet<string> = new Set<string>([
  // Streaming
  "model_stream", "handoff_stream", "acp_stream",
  // Response frames (share id with their request)
  "model_res", "tool_res", "tool_batch_res",
  "handoff_res", "handoff_status_res", "handoff_input_required",
  // Handoff control (reuse original handoff_req id)
  "handoff_input", "handoff_cancel", "handoff_status",
  // Diagnostics & approval
  "diagnostic_exec_res", "diagnostic_status_res", "peer_approval_res",
  // ACP responses
  "acp_res", "acp_close_res", "acp_list_res", "acp_resume_res",
  "acp_cancel_res", "acp_set_mode_res", "acp_get_modes_res",
  "chat_history_res",
  // Terminal
  "terminal_open_res", "terminal_data", "terminal_resize",
  "terminal_close", "terminal_close_res",
]);
50
+
31
51
  /** Classify WebSocket close code into a human-readable reason. */
32
52
  function classifyCloseReason(code: number, reason: string): string {
33
53
  if (reason) return reason;
@@ -364,6 +384,12 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
364
384
  tryReconnect();
365
385
  });
366
386
  ws.addEventListener("close", (ev) => {
387
+ // Don't reconnect if this was a self-connection (peer URL points to ourselves).
388
+ // Without this guard, outbound detects self → closes → scheduleReconnect → loop.
389
+ if (ev.code === 4002 && ev.reason === "self-connection") {
390
+ debug("peer", `connectToPeer(${peer.nodeId}): self-connection, will not reconnect`);
391
+ return;
392
+ }
367
393
  if (!lastError) {
368
394
  lastError = classifyCloseReason(ev.code, ev.reason);
369
395
  }
@@ -397,19 +423,17 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
397
423
  // Peer's persistent public key for TOFU identity binding
398
424
  const peerPublicKey = conn.remoteIdentityKey ?? undefined;
399
425
 
400
- // Prevent self-connection: if outbound connection authenticated as our own nodeId
401
- // (e.g. peer URL accidentally points to self), close immediately.
402
- if (conn.role === "outbound" && nodeId === this.config.nodeId) {
403
- debug("peer", `Self-connection detected (outbound to ${nodeId}), closing`);
426
+ // Prevent self-connection: close immediately if the remote side authenticated
427
+ // with our own nodeId. For outbound this means the peer URL accidentally
428
+ // points to self; for inbound it means a remote node is (mis)using our nodeId.
429
+ // Exception: loopback connections with the same nodeId are local clients
430
+ // (Mac desktop app / iOS simulator) and are allowed through.
431
+ const isLocalClient = conn.role === "inbound" && nodeId === this.config.nodeId && isLoopback(ip);
432
+ if (nodeId === this.config.nodeId && !isLocalClient) {
433
+ debug("peer", `Self-connection detected (${conn.role}, nodeId=${nodeId}, ip=${ip}), closing`);
404
434
  conn.close(4002, "self-connection");
405
435
  return;
406
436
  }
407
-
408
- // Peer approval check (inbound only — outbound peers are explicitly configured)
409
- // Skip approval for same-nodeId connections from localhost (local clients
410
- // like Mac desktop app / iOS simulator). An attacker would need to already
411
- // be on the same machine to exploit this, which is outside our threat model.
412
- const isLocalClient = nodeId === this.config.nodeId && isLoopback(ip);
413
437
  debug("approval", `onPeerAuthenticated: nodeId=${nodeId} role=${conn.role} isLocalClient=${isLocalClient} ip=${ip}`);
414
438
  if (conn.role === "inbound" && !isLocalClient) {
415
439
  // IP-level approval rate limiting (suppress noise from leaked tokens)
@@ -580,8 +604,11 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
580
604
 
581
605
  // ── Message handling ───────────────────────────────────────────
582
606
  private onFrame(frame: AnyClusterFrame, from: Connection) {
583
- // Ignore self-echo: frames with our own nodeId that were relayed back to us
584
- if (frame.from === this.config.nodeId) return;
607
+ // Ignore self-echo: frames with our own nodeId that were relayed back to us.
608
+ // Exception: frames from same-nodeId satellite connections (Mac/iOS client)
609
+ // are legitimate requests that must be processed or relayed.
610
+ const isSatellite = from.remoteNodeId === this.config.nodeId;
611
+ if (frame.from === this.config.nodeId && !isSatellite) return;
585
612
 
586
613
  // Validate from field: must be the direct peer or a known node (relayed)
587
614
  if (frame.from && frame.from !== from.remoteNodeId && !this.router.getRoute(frame.from)) {
@@ -589,32 +616,10 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
589
616
  return;
590
617
  }
591
618
 
592
- // Skip dedup for streaming and response frame types.
593
- // Stream frames share one id across many chunks.
594
- // Response frames (model_res, tool_res, handoff_res, etc.) share the same id
595
- // as their request — without this exemption, a relay node that forwarded
596
- // the request would mark the id as seen and then drop the returning response.
597
619
  // Skip dedup for streaming chunks (same id across many chunks) and response
598
620
  // frames (share id with their request — relay would otherwise drop the reply).
599
- // handoff_input, handoff_cancel, and handoff_status reuse the original handoff_req id,
600
- // so relay nodes that already forwarded the request would drop them as duplicates.
601
- const skipDedup = frame.type === "model_stream" || frame.type === "handoff_stream"
602
- || frame.type === "model_res" || frame.type === "tool_res"
603
- || frame.type === "handoff_res" || frame.type === "handoff_status_res"
604
- || frame.type === "handoff_input_required"
605
- || frame.type === "handoff_input" || frame.type === "handoff_cancel"
606
- || frame.type === "handoff_status"
607
- || frame.type === "diagnostic_exec_res" || frame.type === "diagnostic_status_res"
608
- || frame.type === "peer_approval_res"
609
- || frame.type === "acp_stream" || frame.type === "acp_res"
610
- || frame.type === "acp_close_res"
611
- || frame.type === "acp_list_res" || frame.type === "acp_resume_res"
612
- || frame.type === "acp_cancel_res"
613
- || frame.type === "acp_set_mode_res" || frame.type === "acp_get_modes_res"
614
- || frame.type === "terminal_open_res" || frame.type === "terminal_data"
615
- || frame.type === "terminal_resize" || frame.type === "terminal_close"
616
- || frame.type === "terminal_close_res";
617
- if (frame.id && !skipDedup && this.router.isDuplicate(frame.id)) return;
621
+ // handoff_input/cancel/status reuse the original handoff_req id.
622
+ if (frame.id && !SKIP_DEDUP_TYPES.has(frame.type) && this.router.isDuplicate(frame.id)) return;
618
623
 
619
624
  // Handle peer approval responses locally (don't emit to cluster-service)
620
625
  if (frame.type === "peer_approval_res") {
@@ -672,8 +677,26 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
672
677
  }
673
678
 
674
679
  if (frame.to && frame.to !== this.config.nodeId) {
675
- this.router.tryRelay(frame);
676
- return;
680
+ if (this.router.tryRelay(frame)) return;
681
+
682
+ // Relay failed — for model_req, try alternative nodes or fall through to local handling
683
+ if (frame.type === "model_req") {
684
+ const modelId = (frame as any).payload?.model;
685
+ if (modelId) {
686
+ const exclude = new Set([frame.to, this.config.nodeId]);
687
+ const alternatives = this.router.findNodesForModel(modelId, exclude);
688
+ for (const alt of alternatives) {
689
+ if (this.sendTo(alt.nodeId, { ...frame, to: alt.nodeId })) {
690
+ debug("peer", `model_req failover: ${frame.to} → ${alt.nodeId}`);
691
+ return;
692
+ }
693
+ }
694
+ }
695
+ // No remote alternative — fall through to local handling
696
+ // (model-proxy will handle locally or send error back)
697
+ } else {
698
+ return;
699
+ }
677
700
  }
678
701
 
679
702
  // Forward to same-nodeId satellite connection (e.g. Mac desktop app) so that
package/src/router.ts CHANGED
@@ -3,6 +3,7 @@ import type { Connection } from "./connection.ts";
3
3
 
4
4
  const DEFAULT_TTL = 3;
5
5
  const MAX_SEEN_FRAMES = 10_000;
6
+ const MAX_FAILED_REQUESTS = 5_000;
6
7
  const ROTATE_INTERVAL = 60_000; // rotate dedup maps every 60s
7
8
 
8
9
  export interface RouteEntry {
@@ -232,6 +233,31 @@ export class Router {
232
233
  return this.routes.get(target);
233
234
  }
234
235
 
236
/**
 * Find reachable nodes that provide a specific model.
 *
 * A route is considered reachable when its own connection is open, or when
 * the connection registered for its `reachableVia` relay is open.
 *
 * @param modelId - Model id to look for in each route's `models` list.
 * @param exclude - Optional set of node ids to leave out of the result.
 * @returns Matching routes, directly connected nodes first, then ascending
 *   by `latencyMs`.
 */
findNodesForModel(modelId: string, exclude?: Set<string>): RouteEntry[] {
  // "Direct" means the route's own connection is open right now. A route that
  // still holds a closed connection object but is reachable through a relay
  // must be ranked as relayed, not as direct.
  const isDirect = (entry: RouteEntry): boolean => Boolean(entry.connection?.isOpen);

  const candidates: RouteEntry[] = [];
  for (const entry of this.routes.values()) {
    if (exclude?.has(entry.nodeId)) continue;
    if (!entry.models.some((m) => m.id === modelId)) continue;

    if (isDirect(entry)) {
      candidates.push(entry);
      continue;
    }
    // Not directly connected: accept only if the relay hop is open.
    if (entry.reachableVia && this.connections.get(entry.reachableVia)?.isOpen) {
      candidates.push(entry);
    }
  }

  // Sort: direct first, then by latency.
  candidates.sort((a, b) => {
    const aRank = isDirect(a) ? 0 : 1;
    const bRank = isDirect(b) ? 0 : 1;
    if (aRank !== bRank) return aRank - bRank;
    return a.latencyMs - b.latencyMs;
  });
  return candidates;
}
260
+
235
261
  // ── Message sending and relay ──────────────────────────────────
236
262
  /** Send a frame to a specific node, relaying if necessary. Returns true if sent. */
237
263
  sendTo(targetNodeId: string, frame: ClusterFrame | AnyClusterFrame): boolean {
@@ -305,6 +331,21 @@ export class Router {
305
331
  * TTL defaults to 15 minutes — long enough for handoff timeouts. */
306
332
  markFailed(requestId: string, ttlMs = 900_000) {
307
333
  this.failedRequests.set(requestId, Date.now() + ttlMs);
334
+ // Evict entries when map grows too large: first expired, then FIFO
335
+ if (this.failedRequests.size > MAX_FAILED_REQUESTS) {
336
+ const now = Date.now();
337
+ // Pass 1: remove expired entries
338
+ for (const [id, expiresAt] of this.failedRequests) {
339
+ if (now > expiresAt) this.failedRequests.delete(id);
340
+ }
341
+ // Pass 2: if still over limit, remove oldest (insertion-order) entries
342
+ if (this.failedRequests.size > MAX_FAILED_REQUESTS) {
343
+ for (const [id] of this.failedRequests) {
344
+ if (this.failedRequests.size <= MAX_FAILED_REQUESTS) break;
345
+ this.failedRequests.delete(id);
346
+ }
347
+ }
348
+ }
308
349
  }
309
350
 
310
351
  isFailed(requestId: string): boolean {
package/src/sentinel.ts CHANGED
@@ -569,6 +569,28 @@ process.on("exit", (code) => {
569
569
  try { process.stderr.write(`[svc ${ts}] Exit code=${code}\n`); } catch { /* ignore */ }
570
570
  });
571
571
 
572
/**
 * Start a connection to every configured peer that has neither an entry in
 * `connections` nor a pending entry in `reconnectTimers`.
 * Called when the gateway dies.
 */
function connectAllPeers() {
  for (const peer of config.peers) {
    const alreadyHandled = connections.has(peer.nodeId) || reconnectTimers.has(peer.nodeId);
    if (alreadyHandled) continue;
    connectToPeer(peer);
  }
}
580
+
581
/**
 * Close every peer connection, cancel every pending reconnect timer and
 * reset the reconnect attempt counters.
 * Called when the gateway recovers.
 */
function disconnectAllPeers() {
  // NOTE(review): if a connection's close handler schedules a reconnect, the
  // close issued here could re-arm a timer after the timers are cleared below —
  // confirm the handler checks gateway state before reconnecting.
  connections.forEach((conn, nodeId) => {
    conn.close(1000, "gateway recovered");
    connections.delete(nodeId);
  });

  reconnectTimers.forEach((timer, nodeId) => {
    clearTimeout(timer);
    reconnectTimers.delete(nodeId);
  });

  reconnectAttempts.clear();
}
593
+
572
594
  /** Periodically check if the gateway process is still alive via kill(pid, 0). */
573
595
  function startGatewayHealthCheck() {
574
596
  if (healthCheckTimer || !gatewayPid) return;
@@ -581,11 +603,15 @@ function startGatewayHealthCheck() {
581
603
  log("Gateway process detected — back online");
582
604
  // Release the port so the gateway can reclaim it
583
605
  stopListening();
606
+ // Disconnect from peers — gateway handles mesh connections
607
+ disconnectAllPeers();
584
608
  }
585
609
  } catch {
586
610
  if (gatewayAlive) {
587
611
  gatewayAlive = false;
588
612
  log(`Gateway process (pid ${gatewayPid}) gone — entering standalone mode`);
613
+ // Connect to peers now that gateway is down
614
+ connectAllPeers();
589
615
  // Take over the gateway's listen port
590
616
  if (config.listenPort) {
591
617
  // Small delay to let the OS release the port from the dead process
@@ -608,11 +634,7 @@ function boot() {
608
634
  writePidFile();
609
635
  log(`Started (pid ${process.pid}, gateway ${gatewayPid}, nodeId ${sentinelNodeId()}, takeover port ${config.listenPort || "none"})`);
610
636
 
611
- // Connect to all configured peers
612
- for (const peer of config.peers) {
613
- connectToPeer(peer);
614
- }
615
-
616
- // Note: we do NOT start listening here.
617
- // Listening only starts when gateway dies (port takeover mode).
637
+ // Do NOT connect to peers on boot — gateway handles mesh connections.
638
+ // Sentinel only connects when gateway dies (standalone mode).
639
+ // Listening also only starts when gateway dies (port takeover mode).
618
640
  }
package/src/types.ts CHANGED
@@ -411,6 +411,14 @@ export interface KnowledgeSyncFrame extends ClusterFrame {
411
411
  };
412
412
  }
413
413
 
414
+ // ── Health sync ──────────────────────────────────────────────────
415
/** Cluster frame tagged `"health_sync"` whose payload carries one sync message. */
export interface HealthSyncFrame extends ClusterFrame {
  type: "health_sync";
  payload: {
    /** Base64-encoded Automerge sync message. */
    data: string;
  };
}
421
+
414
422
  // ── Diagnostic (sentinel) ────────────────────────────────────────
415
423
  export interface DiagnosticExec extends ClusterFrame {
416
424
  type: "diagnostic_exec";
@@ -831,4 +839,5 @@ export type AnyClusterFrame =
831
839
  | TerminalData
832
840
  | TerminalResize
833
841
  | TerminalCloseRequest
834
- | TerminalCloseResponse;
842
+ | TerminalCloseResponse
843
+ | HealthSyncFrame;
package/src/web.ts CHANGED
@@ -3,6 +3,7 @@ import type { PeerManager } from "./peer-manager.ts";
3
3
  import type { HandoffManager } from "./handoff.ts";
4
4
  import type { ClawMatrixConfig } from "./config.ts";
5
5
  import type { SatelliteContext, IngestedEvent } from "./types.ts";
6
+ import type { HealthTracker } from "./health-tracker.ts";
6
7
  import { timingSafeEqual } from "./auth.ts";
7
8
  import { renderDashboard } from "./web-ui.ts";
8
9
  import { readBody } from "./http-utils.ts";
@@ -46,6 +47,7 @@ export class WebHandler {
46
47
  private ingestedEvents: IngestedEvent[] = []; // ring buffer for ingested events
47
48
  private loginAttempts = new Map<string, { count: number; resetAt: number }>(); // IP → rate limit
48
49
  private loginCleanupTimer: ReturnType<typeof setInterval> | null = null;
50
+ private healthTracker: HealthTracker | null = null;
49
51
  private onPeerConnected: (nodeId: string) => void;
50
52
  private onPeerDisconnected: (nodeId: string) => void;
51
53
 
@@ -91,6 +93,11 @@ export class WebHandler {
91
93
  peerManager.on("peerDisconnected", this.onPeerDisconnected);
92
94
  }
93
95
 
96
/**
 * Attach the health tracker used by the availability API.
 *
 * @param tracker - Tracker instance stored on this handler.
 */
setHealthTracker(tracker: HealthTracker): void {
  this.healthTracker = tracker;
}
100
+
94
101
  /** Clean up timers and pending requests on shutdown. */
95
102
  destroy() {
96
103
  // Remove event listeners to prevent post-destroy callbacks
@@ -181,6 +188,11 @@ export class WebHandler {
181
188
  return;
182
189
  }
183
190
 
191
+ if (path === "/api/availability" && req.method === "GET") {
192
+ this.handleAvailability(req, res);
193
+ return;
194
+ }
195
+
184
196
  if (path === "/api/satellite/poll" && req.method === "GET") {
185
197
  this.handleSatellitePoll(req, res);
186
198
  return;
@@ -271,6 +283,27 @@ export class WebHandler {
271
283
  }
272
284
  }
273
285
 
286
/**
 * Serve the availability API.
 *
 * Responds with:
 * - 503 when no health tracker has been set,
 * - 400 when the request URL cannot be parsed or `range` is not one of
 *   `24h`, `7d`, `90d`,
 * - 200 with the JSON-serialized result of `healthTracker.getAvailability(range)`.
 *
 * The `range` query parameter defaults to `24h` when absent.
 */
private handleAvailability(req: IncomingMessage, res: ServerResponse) {
  const sendJson = (status: number, body: unknown): void => {
    res.writeHead(status, { "Content-Type": "application/json" });
    res.end(JSON.stringify(body));
  };

  const tracker = this.healthTracker;
  if (!tracker) {
    sendJson(503, { error: "Health tracker not available" });
    return;
  }

  // Only the query string is needed, so resolve against a fixed base instead
  // of the client-supplied Host header: a malformed Host would make the URL
  // constructor throw. A malformed request target can still throw, so that
  // case is reported as a 400 rather than escaping the handler.
  let requested: string;
  try {
    requested = new URL(req.url ?? "/", "http://localhost").searchParams.get("range") ?? "24h";
  } catch {
    sendJson(400, { error: "Invalid request URL" });
    return;
  }

  // Narrow with a runtime check rather than asserting the type up front.
  const isRange = (value: string): value is "24h" | "7d" | "90d" =>
    value === "24h" || value === "7d" || value === "90d";

  if (!isRange(requested)) {
    sendJson(400, { error: "Invalid range. Use 24h, 7d, or 90d" });
    return;
  }

  sendJson(200, tracker.getAvailability(requested));
}
306
+
274
307
  private handleStatus(res: ServerResponse) {
275
308
  const peers = this.peerManager.router.getAllPeers();
276
309
  const localNode = {