clawmatrix 0.2.9 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -46,6 +46,9 @@ const SKIP_DEDUP_TYPES = new Set([
46
46
  // Terminal
47
47
  "terminal_open_res", "terminal_data", "terminal_resize",
48
48
  "terminal_close", "terminal_close_res",
49
+ // File transfer
50
+ "file_transfer_chunk", "file_transfer_chunk_ack",
51
+ "file_transfer_ack", "file_transfer_complete",
49
52
  ]);
50
53
 
51
54
  /** Classify WebSocket close code into a human-readable reason. */
@@ -90,6 +93,8 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
90
93
  private wss: WebSocketServer | null = null;
91
94
  private reconnectTimers = new Map<string, ReturnType<typeof setTimeout>>();
92
95
  private reconnectAttempts = new Map<string, number>();
96
+ /** Deferred disconnect timers — grace period before broadcasting peer_leave. */
97
+ private disconnectGraceTimers = new Map<string, ReturnType<typeof setTimeout>>();
93
98
  private stopped = false;
94
99
  /** Map from ws WebSocket to Connection for inbound connections. */
95
100
  private inboundConnections = new Map<WsWebSocket, Connection>();
@@ -162,6 +167,17 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
162
167
  }
163
168
  }
164
169
 
/**
 * Swap in a new tool-proxy catalog for this node and announce it to peers.
 *
 * The catalog is written into the local capabilities (only when a tool proxy
 * entry already exists there), handed to the router, and then a peer sync is
 * sent over every direct connection the router reports.
 *
 * @param catalog - Replacement list of tool catalog entries.
 */
updateToolCatalog(catalog: import("./types.ts").ToolCatalogEntry[]) {
  const proxy = this.localCapabilities.toolProxy;
  if (proxy) {
    this.localCapabilities.toolProxy = { ...proxy, catalog };
  }

  this.router.updateLocalToolCatalog(catalog);

  const directConnections = this.router.getDirectConnections();
  for (const connection of directConnections) {
    this.sendPeerSync(connection);
  }
}
180
+
165
181
  // ── Lifecycle ──────────────────────────────────────────────────
166
182
  async start() {
167
183
  await this.approvalManager.load();
@@ -187,6 +203,12 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
187
203
  clearTimeout(timer);
188
204
  }
189
205
  this.reconnectTimers.clear();
206
+ // Flush all disconnect grace timers (execute leave immediately on shutdown)
207
+ for (const [nodeId, timer] of this.disconnectGraceTimers) {
208
+ clearTimeout(timer);
209
+ this.executePeerLeave(nodeId);
210
+ }
211
+ this.disconnectGraceTimers.clear();
190
212
 
191
213
  this.router.broadcast({
192
214
  type: "peer_leave",
@@ -458,9 +480,6 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
458
480
  if (this.pendingApprovalConns.has(nodeId)) {
459
481
  debug("approval", `reusing pending approval for ${nodeId}, updating conn ref`);
460
482
  this.pendingApprovalConns.set(nodeId, { conn, caps });
461
- if (this.config.peerApproval?.mode === "required") {
462
- conn.on("close", () => this.onPeerDisconnected(conn));
463
- }
464
483
  return;
465
484
  }
466
485
 
@@ -492,10 +511,12 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
492
511
  );
493
512
  }
494
513
  });
495
- // In required mode, don't complete the join yet
514
+ // In required mode, don't complete the join yet.
515
+ // No close handler needed here: the peer was never added to the router,
516
+ // so onPeerDisconnected would broadcast a spurious peer_leave.
517
+ // If the conn drops before approval resolves, the .then() handler sees
518
+ // activeConn.isOpen === false and skips all actions.
496
519
  if (this.config.peerApproval?.mode === "required") {
497
- // Wire up close handler to clean up if connection drops while pending
498
- conn.on("close", () => this.onPeerDisconnected(conn));
499
520
  return;
500
521
  }
501
522
  // In notify mode, requestApproval resolves immediately, but
@@ -512,6 +533,9 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
512
533
  private completePeerJoin(conn: Connection, caps: NodeCapabilities) {
513
534
  const nodeId = conn.remoteNodeId!;
514
535
 
536
+ // Cancel disconnect grace timer if the peer is reconnecting
537
+ const wasInGrace = this.cancelDisconnectGrace(nodeId);
538
+
515
539
  // If there's an existing connection for this nodeId (e.g. peer reconnected
516
540
  // while old TCP hadn't closed yet), close it AFTER overwriting the route so
517
541
  // the stale-close guard in onPeerDisconnected correctly skips cleanup.
@@ -582,15 +606,58 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
582
606
  return;
583
607
  }
584
608
 
609
+ // Grace period: defer peer_leave broadcast to allow quick reconnection
610
+ // (e.g. iOS WiFi ↔ cellular handoff, brief audio interruption).
611
+ // If the peer reconnects within the grace window, completePeerJoin
612
+ // will cancel this timer via cancelDisconnectGrace.
613
+ const graceMs = this.config.disconnectGrace ?? 30_000;
614
+ if (graceMs <= 0) {
615
+ this.executePeerLeave(nodeId, conn);
616
+ return;
617
+ }
618
+ debug("peer", `onPeerDisconnected(${nodeId}): starting ${graceMs / 1000}s grace period`);
619
+
620
+ // Clear any existing grace timer for this node (shouldn't happen, but be safe)
621
+ this.cancelDisconnectGrace(nodeId);
622
+
623
+ this.disconnectGraceTimers.set(nodeId, setTimeout(() => {
624
+ this.disconnectGraceTimers.delete(nodeId);
625
+ this.executePeerLeave(nodeId, conn);
626
+ }, graceMs));
627
+ }
628
+
/**
 * Abort the deferred peer_leave scheduled for `nodeId`, if there is one.
 *
 * @param nodeId - Peer whose grace timer should be cancelled.
 * @returns `true` when a timer existed and was cleared, `false` otherwise.
 */
private cancelDisconnectGrace(nodeId: string): boolean {
  const pending = this.disconnectGraceTimers.get(nodeId);
  if (!pending) return false;

  clearTimeout(pending);
  this.disconnectGraceTimers.delete(nodeId);
  debug("peer", `cancelDisconnectGrace(${nodeId}): peer reconnected within grace period`);
  return true;
}
640
+
641
+ /** Execute the actual peer leave (after grace period expires or immediate for shutdown). */
642
+ private executePeerLeave(nodeId: string, conn?: Connection) {
643
+ // Double-check the route hasn't been replaced by a new connection during grace
644
+ if (conn) {
645
+ const currentRoute = this.router.getRoute(nodeId);
646
+ if (currentRoute?.connection && currentRoute.connection !== conn) {
647
+ debug("peer", `executePeerLeave(${nodeId}): route replaced during grace — skipping`);
648
+ return;
649
+ }
650
+ }
651
+
585
652
  audit("peer_leave", { nodeId });
586
653
  this.router.removePeer(nodeId);
587
654
 
588
655
  // Remove satellite contexts that were only reachable via this peer
589
- this.satelliteContexts = this.satelliteContexts.filter(s => {
590
- // Keep satellites that are not associated with the disconnected peer
591
- // (satellite nodeIds typically differ from mesh peer nodeIds)
592
- return s.nodeId !== nodeId;
593
- });
656
+ for (let i = this.satelliteContexts.length - 1; i >= 0; i--) {
657
+ if (this.satelliteContexts[i].nodeId === nodeId) {
658
+ this.satelliteContexts.splice(i, 1);
659
+ }
660
+ }
594
661
 
595
662
  this.router.broadcast({
596
663
  type: "peer_leave",
@@ -745,13 +812,17 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
745
812
  const prev = this.router.getRoute(peer.nodeId);
746
813
  const hadAgents = prev?.agents.length ?? 0;
747
814
  const hadDirectPeers = prev?.directPeers.length ?? 0;
748
- const hadToolProxy = JSON.stringify(prev?.toolProxy);
749
815
  const hadDeviceInfo = prev?.deviceInfo?.hostname;
750
816
  const hadAcpAgents = prev?.acpAgents?.length ?? 0;
817
+ const hadToolProxyEnabled = prev?.toolProxy?.enabled;
818
+ const hadToolProxyCatalogLen = prev?.toolProxy?.catalog?.length ?? 0;
819
+ const hadToolProxyAllowLen = prev?.toolProxy?.allow?.length ?? 0;
751
820
  this.router.updatePeerCapabilities(peer.nodeId, peer);
752
821
  if (peer.agents.length !== hadAgents || peer.models.length !== (prev?.models.length ?? 0)
753
822
  || (peer.directPeers?.length ?? 0) !== hadDirectPeers
754
- || JSON.stringify(peer.toolProxy) !== hadToolProxy
823
+ || peer.toolProxy?.enabled !== hadToolProxyEnabled
824
+ || (peer.toolProxy?.catalog?.length ?? 0) !== hadToolProxyCatalogLen
825
+ || (peer.toolProxy?.allow?.length ?? 0) !== hadToolProxyAllowLen
755
826
  || peer.deviceInfo?.hostname !== hadDeviceInfo
756
827
  || (peer.acpAgents?.length ?? 0) !== hadAcpAgents) {
757
828
  changed = true;
@@ -33,19 +33,20 @@ export class RateLimiter {
33
33
 
34
34
  let timestamps = this.attempts.get(ip);
35
35
  if (timestamps) {
36
- // Remove expired entries
37
- timestamps = timestamps.filter((t) => t > cutoff);
36
+ // In-place pruning: find first non-expired index and splice
37
+ let firstValid = 0;
38
+ while (firstValid < timestamps.length && timestamps[firstValid] <= cutoff) firstValid++;
39
+ if (firstValid > 0) timestamps.splice(0, firstValid);
38
40
  } else {
39
41
  timestamps = [];
42
+ this.attempts.set(ip, timestamps);
40
43
  }
41
44
 
42
45
  if (timestamps.length >= this.config.maxAttempts) {
43
- this.attempts.set(ip, timestamps);
44
46
  return false;
45
47
  }
46
48
 
47
49
  timestamps.push(now);
48
- this.attempts.set(ip, timestamps);
49
50
  return true;
50
51
  }
51
52
 
@@ -61,19 +62,24 @@ export class RateLimiter {
/**
 * Report how many attempts `ip` may still make inside the current window.
 *
 * Walks the stored timestamps from the newest backwards and stops at the
 * first one that has expired, so it relies on the list being oldest-first.
 *
 * @param ip - Client address to look up.
 * @returns Remaining attempts, never below zero; the full allowance when the
 *   address has no recorded attempts.
 */
remaining(ip: string): number {
  const history = this.attempts.get(ip);
  if (!history) return this.config.maxAttempts;

  const cutoff = Date.now() - this.config.windowMs;
  let idx = history.length;
  while (idx > 0 && history[idx - 1] > cutoff) idx--;

  const active = history.length - idx;
  return Math.max(0, this.config.maxAttempts - active);
}
68
73
 
/**
 * Sweep the attempts table: trim expired timestamps off the front of each
 * list in place, and forget an IP entirely once nothing in its list is still
 * inside the window.
 */
private gc() {
  const cutoff = Date.now() - this.config.windowMs;
  for (const [ip, history] of this.attempts) {
    // Index of the oldest timestamp that is still inside the window.
    const firstLive = history.findIndex((t) => t > cutoff);
    if (firstLive === -1) {
      // Every timestamp expired (or the list is empty): drop the entry.
      this.attempts.delete(ip);
      continue;
    }
    if (firstLive > 0) history.splice(0, firstLive);
  }
}
package/src/router.ts CHANGED
@@ -38,6 +38,14 @@ export class Router {
38
38
  /** Failed request IDs with expiry timestamps. Separate from dedup to support longer TTLs. */
39
39
  private failedRequests = new Map<string, number>(); // requestId → expiresAt
40
40
 
41
+ // ── Indexes for O(1) lookups in hot paths ──────────────────────
42
+ /** agentId → Set of nodeIds that host this agent. */
43
+ private agentIndex = new Map<string, Set<string>>();
44
+ /** tag → Set of nodeIds (both node-level and agent-level tags). */
45
+ private tagIndex = new Map<string, Set<string>>();
46
+ /** modelId → Set of nodeIds that provide this model. */
47
+ private modelIndex = new Map<string, Set<string>>();
48
+
41
49
  constructor(
42
50
  nodeId: string,
43
51
  localCapabilities?: { agents: AgentInfo[]; models: ModelInfo[]; tags: string[]; deviceInfo?: DeviceInfo; toolProxy?: ToolProxyInfo; acpAgents?: AcpAgentInfo[] },
@@ -53,11 +61,64 @@ export class Router {
53
61
  this.rotateTimer = setInterval(() => this.rotateSeenFrames(), ROTATE_INTERVAL);
54
62
  }
55
63
 
/**
 * Discard the agent, tag and model indexes and repopulate them from the
 * current route table.
 * NOTE(review): the mutators visible alongside this method maintain the
 * indexes incrementally via unindexEntry/indexEntry — confirm where a full
 * rebuild is actually invoked.
 */
private rebuildIndexes() {
  for (const index of [this.agentIndex, this.tagIndex, this.modelIndex]) {
    index.clear();
  }
  for (const [, entry] of this.routes) {
    this.indexEntry(entry);
  }
}
73
+
/**
 * Register one route entry in the agent, tag and model indexes.
 *
 * The node is recorded under every agent id it hosts, every tag it carries
 * (node-level tags and the tags of each of its agents share one tag index),
 * and every model id it provides.
 *
 * @param entry - Route entry to index; `entry.nodeId` is the value stored.
 */
private indexEntry(entry: RouteEntry) {
  const nid = entry.nodeId;

  // Get-or-create the bucket for `key`, then record this node in it.
  const addTo = (index: Map<string, Set<string>>, key: string) => {
    let bucket = index.get(key);
    if (!bucket) {
      bucket = new Set();
      index.set(key, bucket);
    }
    bucket.add(nid);
  };

  // `?? []` on agents mirrors unindexEntry, so an entry without an agents
  // list is indexed as "no agents" instead of throwing during iteration.
  for (const agent of entry.agents ?? []) {
    addTo(this.agentIndex, agent.id);
    for (const tag of agent.tags ?? []) addTo(this.tagIndex, tag);
  }
  for (const tag of entry.tags ?? []) addTo(this.tagIndex, tag);
  for (const model of entry.models ?? []) addTo(this.modelIndex, model.id);
}
98
+
/**
 * Remove one route entry from the agent, tag and model indexes.
 *
 * Buckets that become empty are deleted from their index map, so keys for
 * agents, tags or models that no node offers any more do not accumulate.
 * Lookups treat a missing bucket the same as an empty one.
 *
 * @param entry - Route entry to unindex, in the state it was indexed with.
 */
private unindexEntry(entry: RouteEntry) {
  const nid = entry.nodeId;

  // Drop this node from the bucket for `key`; prune the bucket once empty.
  const dropFrom = (index: Map<string, Set<string>>, key: string) => {
    const bucket = index.get(key);
    if (!bucket) return;
    bucket.delete(nid);
    if (bucket.size === 0) index.delete(key);
  };

  for (const agent of entry.agents ?? []) {
    dropFrom(this.agentIndex, agent.id);
    for (const tag of agent.tags ?? []) dropFrom(this.tagIndex, tag);
  }
  for (const tag of entry.tags ?? []) dropFrom(this.tagIndex, tag);
  for (const model of entry.models ?? []) dropFrom(this.modelIndex, model.id);
}
109
+
/**
 * Replace the list of ACP agents this node advertises.
 * Intended to be called once auto-detection has produced the agent list.
 *
 * @param agents - New ACP agent list; stored as-is.
 */
updateLocalAcpAgents(agents: AcpAgentInfo[]) {
  this.localAcpAgents = agents;
}
60
114
 
/**
 * Store a new catalog (tool descriptions and schemas for remote callers) on
 * the local tool-proxy info. Does nothing when no local tool proxy is set.
 *
 * @param catalog - Replacement catalog value.
 */
updateLocalToolCatalog(catalog: ToolProxyInfo["catalog"]) {
  const current = this.localToolProxy;
  if (!current) return;
  this.localToolProxy = { ...current, catalog };
}
121
+
61
122
  /** Stop periodic cleanup. Call on shutdown. */
62
123
  destroy() {
63
124
  if (this.rotateTimer) {
@@ -75,8 +136,10 @@ export class Router {
75
136
  connection: Connection,
76
137
  capabilities: { agents: AgentInfo[]; models: ModelInfo[]; tags: string[]; deviceInfo?: DeviceInfo; toolProxy?: ToolProxyInfo; acpAgents?: AcpAgentInfo[] },
77
138
  ) {
139
+ const old = this.routes.get(nodeId);
140
+ if (old) this.unindexEntry(old);
78
141
  this.connections.set(nodeId, connection);
79
- this.routes.set(nodeId, {
142
+ const entry: RouteEntry = {
80
143
  nodeId,
81
144
  agents: capabilities.agents,
82
145
  models: capabilities.models,
@@ -89,7 +152,9 @@ export class Router {
89
152
  deviceInfo: capabilities.deviceInfo,
90
153
  toolProxy: capabilities.toolProxy,
91
154
  acpAgents: capabilities.acpAgents,
92
- });
155
+ };
156
+ this.routes.set(nodeId, entry);
157
+ this.indexEntry(entry);
93
158
  }
94
159
 
95
160
  addRelayPeer(peer: PeerInfo, viaNodeId: string) {
@@ -106,7 +171,8 @@ export class Router {
106
171
  // Don't overwrite a better relay route with a worse one (allow equal for capability updates)
107
172
  if (existing?.reachableVia && existing.latencyMs < estimatedLatency) return;
108
173
 
109
- this.routes.set(peer.nodeId, {
174
+ if (existing) this.unindexEntry(existing);
175
+ const entry: RouteEntry = {
110
176
  nodeId: peer.nodeId,
111
177
  agents: peer.agents,
112
178
  models: peer.models,
@@ -119,15 +185,22 @@ export class Router {
119
185
  deviceInfo: peer.deviceInfo,
120
186
  toolProxy: peer.toolProxy,
121
187
  acpAgents: peer.acpAgents,
122
- });
188
+ };
189
+ this.routes.set(peer.nodeId, entry);
190
+ this.indexEntry(entry);
123
191
  }
124
192
 
125
193
  removePeer(nodeId: string) {
126
194
  this.connections.delete(nodeId);
127
- this.routes.delete(nodeId);
195
+ const removed = this.routes.get(nodeId);
196
+ if (removed) {
197
+ this.unindexEntry(removed);
198
+ this.routes.delete(nodeId);
199
+ }
128
200
  // Also remove routes that relied on this node as relay
129
201
  for (const [id, entry] of this.routes) {
130
202
  if (entry.reachableVia === nodeId) {
203
+ this.unindexEntry(entry);
131
204
  this.routes.delete(id);
132
205
  }
133
206
  }
@@ -139,6 +212,7 @@ export class Router {
139
212
  ) {
140
213
  const entry = this.routes.get(nodeId);
141
214
  if (entry) {
215
+ this.unindexEntry(entry);
142
216
  entry.agents = capabilities.agents;
143
217
  entry.models = capabilities.models;
144
218
  entry.tags = capabilities.tags;
@@ -151,6 +225,7 @@ export class Router {
151
225
  entry.toolProxy = capabilities.toolProxy;
152
226
  entry.acpAgents = capabilities.acpAgents;
153
227
  entry.lastSeen = Date.now();
228
+ this.indexEntry(entry);
154
229
  }
155
230
  }
156
231
 
@@ -169,20 +244,20 @@ export class Router {
169
244
  /** Resolve target agent to a specific nodeId. Supports agent ID or "tags:<tag>". */
170
245
  resolveAgent(target: string): RouteEntry | undefined {
171
246
  const isTagQuery = target.startsWith("tags:");
172
- const tag = isTagQuery ? target.slice(5) : null;
247
+
248
+ let nodeIds: Set<string> | undefined;
249
+ if (isTagQuery) {
250
+ nodeIds = this.tagIndex.get(target.slice(5));
251
+ } else {
252
+ nodeIds = this.agentIndex.get(target);
253
+ }
173
254
 
174
255
  let candidates: RouteEntry[] = [];
175
- for (const entry of this.routes.values()) {
176
- // Skip self never resolve to our own node
177
- if (entry.nodeId === this.nodeId) continue;
178
- if (isTagQuery) {
179
- if (entry.agents.some((a) => a.tags.includes(tag!)) || entry.tags.includes(tag!)) {
180
- candidates.push(entry);
181
- }
182
- } else {
183
- if (entry.agents.some((a) => a.id === target)) {
184
- candidates.push(entry);
185
- }
256
+ if (nodeIds) {
257
+ for (const nid of nodeIds) {
258
+ if (nid === this.nodeId) continue;
259
+ const entry = this.routes.get(nid);
260
+ if (entry) candidates.push(entry);
186
261
  }
187
262
  }
188
263
 
@@ -193,6 +268,7 @@ export class Router {
193
268
  }
194
269
 
195
270
  if (candidates.length === 0) return undefined;
271
+ if (candidates.length === 1) return candidates[0];
196
272
 
197
273
  // Sort: direct connections first, then by latency
198
274
  candidates.sort((a, b) => {
@@ -210,15 +286,16 @@ export class Router {
210
286
  resolveNode(target: string): RouteEntry | undefined {
211
287
  if (target.startsWith("tags:")) {
212
288
  const tag = target.slice(5);
289
+ const nodeIds = this.tagIndex.get(tag);
290
+ if (!nodeIds) return undefined;
213
291
  const candidates: RouteEntry[] = [];
214
- for (const entry of this.routes.values()) {
215
- // Skip self never resolve to our own node
216
- if (entry.nodeId === this.nodeId) continue;
217
- if (entry.tags.includes(tag)) {
218
- candidates.push(entry);
219
- }
292
+ for (const nid of nodeIds) {
293
+ if (nid === this.nodeId) continue;
294
+ const entry = this.routes.get(nid);
295
+ if (entry) candidates.push(entry);
220
296
  }
221
297
  if (candidates.length === 0) return undefined;
298
+ if (candidates.length === 1) return candidates[0];
222
299
  // Sort: direct connections first, then by latency
223
300
  candidates.sort((a, b) => {
224
301
  const aDirect = a.connection ? 0 : 1;
@@ -236,10 +313,13 @@ export class Router {
236
313
  /** Find reachable nodes that provide a specific model, sorted by latency.
237
314
  * Excludes nodes in the `exclude` set. */
238
315
  findNodesForModel(modelId: string, exclude?: Set<string>): RouteEntry[] {
316
+ const nodeIds = this.modelIndex.get(modelId);
317
+ if (!nodeIds) return [];
239
318
  const candidates: RouteEntry[] = [];
240
- for (const entry of this.routes.values()) {
241
- if (exclude?.has(entry.nodeId)) continue;
242
- if (!entry.models.some((m) => m.id === modelId)) continue;
319
+ for (const nid of nodeIds) {
320
+ if (exclude?.has(nid)) continue;
321
+ const entry = this.routes.get(nid);
322
+ if (!entry) continue;
243
323
  // Check reachability
244
324
  if (entry.connection?.isOpen) {
245
325
  candidates.push(entry);
@@ -248,13 +328,15 @@ export class Router {
248
328
  if (relay?.isOpen) candidates.push(entry);
249
329
  }
250
330
  }
251
- // Sort: direct first, then by latency
252
- candidates.sort((a, b) => {
253
- const aDirect = a.connection ? 0 : 1;
254
- const bDirect = b.connection ? 0 : 1;
255
- if (aDirect !== bDirect) return aDirect - bDirect;
256
- return a.latencyMs - b.latencyMs;
257
- });
331
+ if (candidates.length > 1) {
332
+ // Sort: direct first, then by latency
333
+ candidates.sort((a, b) => {
334
+ const aDirect = a.connection ? 0 : 1;
335
+ const bDirect = b.connection ? 0 : 1;
336
+ if (aDirect !== bDirect) return aDirect - bDirect;
337
+ return a.latencyMs - b.latencyMs;
338
+ });
339
+ }
258
340
  return candidates;
259
341
  }
260
342
 
@@ -8,7 +8,8 @@
8
8
 
9
9
  import { fork, type ChildProcess } from "node:child_process";
10
10
  import { join, dirname } from "node:path";
11
- import { existsSync, readFileSync, mkdirSync, openSync } from "node:fs";
11
+ import { existsSync, readFileSync, mkdirSync, openSync, closeSync } from "node:fs";
12
+ import { createConnection } from "node:net";
12
13
  import { homedir, tmpdir } from "node:os";
13
14
  import type { ClawMatrixConfig } from "./config.ts";
14
15
 
@@ -42,6 +43,9 @@ export class SentinelManager {
42
43
  execArgv: this.resolveExecArgv(),
43
44
  });
44
45
 
46
+ // Close the log fd in the parent — the child has its own copy
47
+ closeSync(logFd);
48
+
45
49
  // Send config to sentinel via IPC (includes gateway PID for health checks)
46
50
  // If sentinel has no explicit listenPort but the gateway is a listener,
47
51
  // inherit the gateway's port for automatic takeover when gateway dies.
@@ -83,7 +87,57 @@ export class SentinelManager {
83
87
  }, 1000);
84
88
  }
85
89
 
86
- stop() {
/**
 * Kill the old sentinel and wait for the listen port to become free.
 * Must be called BEFORE PeerManager.startListening() to avoid EADDRINUSE.
 *
 * Both phases are best-effort and bounded: up to 100 × 50ms waiting for the
 * old process to exit, then up to 50 connect probes waiting for the port.
 * The method returns normally even if the port is still busy afterwards.
 */
async ensurePortFree() {
  const sleep = (ms: number) => new Promise<void>((r) => setTimeout(r, ms));

  // Kill old sentinel process if alive
  if (existsSync(this.pidFile)) {
    try {
      const pid = parseInt(readFileSync(this.pidFile, "utf-8").trim(), 10);
      // Only signal a positive PID that is not our own. kill() treats zero
      // and negative values as process-group targets, so a corrupt PID file
      // must never reach process.kill. NaN fails the `> 0` test as well.
      if (pid > 0 && pid !== process.pid) {
        try {
          process.kill(pid, "SIGTERM");
          // Wait for the process to exit (up to 5s)
          for (let i = 0; i < 100; i++) {
            try {
              process.kill(pid, 0); // signal 0: existence check only
              await sleep(50);
            } catch {
              break; // exited
            }
          }
        } catch {
          // Already gone
        }
      }
    } catch {
      // Unreadable or malformed PID file
    }
  }

  // Probe the port until it's free (up to 5s)
  const port = this.config.sentinel?.listenPort
    ?? (this.config.listen ? this.config.listenPort : 0);
  if (!port) return;

  const host = this.config.sentinel?.listenHost ?? this.config.listenHost ?? "0.0.0.0";
  // A wildcard bind address is not a connectable destination on every
  // platform; probing it can fail and make a busy port look free. Probe the
  // matching loopback address instead.
  const probeHost = host === "0.0.0.0" ? "127.0.0.1" : host === "::" ? "::1" : host;

  for (let i = 0; i < 50; i++) {
    const inUse = await new Promise<boolean>((resolve) => {
      const sock = createConnection({ port, host: probeHost }, () => {
        // Something accepted the connection: the port is still held.
        sock.destroy();
        resolve(true);
      });
      // Connection refused (or any other error) is treated as "free".
      sock.on("error", () => resolve(false));
      // No answer within 200ms is treated as "free" as well.
      sock.setTimeout(200, () => { sock.destroy(); resolve(false); });
    });
    if (!inUse) return;
    await sleep(100);
  }
}
139
+
140
+ async stop() {
87
141
  // IPC is disconnected shortly after start, so use PID file for shutdown
88
142
  if (existsSync(this.pidFile)) {
89
143
  try {
@@ -92,13 +146,11 @@ export class SentinelManager {
92
146
  process.kill(pid, "SIGTERM");
93
147
  // Wait briefly for the process to exit so the next start()
94
148
  // doesn't race with a still-dying sentinel
95
- const deadline = Date.now() + 3_000;
96
- while (Date.now() < deadline) {
149
+ for (let i = 0; i < 60; i++) {
97
150
  try {
98
151
  process.kill(pid, 0);
99
- // Still alive — brief spin
100
- const waitUntil = Date.now() + 50;
101
- while (Date.now() < waitUntil) { /* spin */ }
152
+ // Still alive — async wait
153
+ await new Promise((r) => setTimeout(r, 50));
102
154
  } catch {
103
155
  break; // exited
104
156
  }
package/src/sentinel.ts CHANGED
@@ -78,6 +78,11 @@ let httpServer: Server | null = null;
78
78
  let wss: WebSocketServer | null = null;
79
79
  const inboundConnections = new Map<WsWebSocket, Connection>();
80
80
  let listening = false;
81
+ /** Timestamp when sentinel voluntarily released the port. During the cooldown
82
+ * period (30s), sentinel will not re-listen even if gateway appears to be gone,
83
+ * giving the new gateway time to bind the port. */
84
+ let voluntaryReleaseAt = 0;
85
+ const PORT_RELEASE_COOLDOWN = 30_000;
81
86
 
82
87
  // ── Rate limiting for diagnostic_exec ────────────────────────────
83
88
  const EXEC_RATE_WINDOW = 60_000; // 1 minute
@@ -464,6 +469,9 @@ function stopListening() {
464
469
  httpServer?.close();
465
470
  httpServer = null;
466
471
  listening = false;
472
+ // Mark voluntary release — sentinel will not re-listen during cooldown
473
+ // to give the gateway time to bind the port.
474
+ voluntaryReleaseAt = Date.now();
467
475
  log("Port released — gateway is back");
468
476
  }
469
477
 
@@ -612,12 +620,14 @@ function startGatewayHealthCheck() {
612
620
  log(`Gateway process (pid ${gatewayPid}) gone — entering standalone mode`);
613
621
  // Connect to peers now that gateway is down
614
622
  connectAllPeers();
615
- // Take over the gateway's listen port
623
+ // Take over the gateway's listen port — but respect cooldown after
624
+ // voluntary release so we don't compete with a restarting gateway.
616
625
  if (config.listenPort) {
617
- // Small delay to let the OS release the port from the dead process
626
+ const cooldownRemaining = PORT_RELEASE_COOLDOWN - (Date.now() - voluntaryReleaseAt);
627
+ const delay = Math.max(2_000, cooldownRemaining);
618
628
  setTimeout(() => {
619
629
  if (!gatewayAlive && !isReplaced()) startListening();
620
- }, 2_000);
630
+ }, delay);
621
631
  }
622
632
  }
623
633
  }
package/src/terminal.ts CHANGED
@@ -109,7 +109,8 @@ export class TerminalManager {
109
109
  return;
110
110
  }
111
111
 
112
- // Check allowFrom
112
+ // TODO(security): when allowFrom is empty, every authenticated peer is allowed to open a terminal session by default.
113
+ // Currently intended for trusted networks only; before wider exposure, switch to default-deny or require explicit configuration.
113
114
  if (termConfig?.allowFrom && termConfig.allowFrom.length > 0) {
114
115
  if (!termConfig.allowFrom.includes(frame.from)) {
115
116
  this.peerManager.sendTo(frame.from, {
package/src/tool-proxy.ts CHANGED
@@ -45,6 +45,10 @@ export class ToolProxy {
45
45
  private logger: PluginLogger;
46
46
  private satelliteHandler: SatelliteToolHandler | null = null;
47
47
  private readonly toolTimeout: number;
48
+ // Pre-built Sets for O(1) allow/deny checks
49
+ private readonly allowSet: Set<string>;
50
+ private readonly denySet: Set<string>;
51
+ private readonly allowAll: boolean;
48
52
 
49
53
  constructor(config: ClawMatrixConfig, peerManager: PeerManager, gatewayInfo: GatewayInfo, logger: PluginLogger) {
50
54
  this.config = config;
@@ -52,6 +56,10 @@ export class ToolProxy {
52
56
  this.gatewayInfo = gatewayInfo;
53
57
  this.logger = logger;
54
58
  this.toolTimeout = config.toolTimeout ?? DEFAULT_TOOL_TIMEOUT;
59
+ const tp = config.toolProxy;
60
+ this.denySet = new Set(tp?.deny ?? []);
61
+ this.allowSet = new Set(tp?.allow ?? []);
62
+ this.allowAll = this.allowSet.size === 0 || this.allowSet.has("*");
55
63
  }
56
64
 
57
65
  /** Set the satellite tool handler (called by ClusterRuntime after WebHandler is created). */
@@ -338,10 +346,10 @@ export class ToolProxy {
338
346
  }
339
347
 
340
348
  // ── Security ───────────────────────────────────────────────────
/**
 * Decide whether `tool` may be invoked through the proxy.
 *
 * A tool on the deny set is always rejected. Otherwise it is accepted when
 * the instance allows everything, or when it appears on the allow set.
 * The check uses the sets held on this instance; `_tpConfig` is accepted for
 * signature compatibility but is not consulted.
 *
 * @param tool - Tool name to check.
 * @param _tpConfig - Unused.
 * @returns `true` when the tool is permitted.
 */
private isToolAllowed(tool: string, _tpConfig: ToolProxyConfig): boolean {
  const denied = this.denySet.has(tool);
  return !denied && (this.allowAll || this.allowSet.has(tool));
}
346
354
 
347
355
  destroy() {