clawmatrix 0.2.9 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +27 -0
- package/README.md +123 -12
- package/package.json +2 -1
- package/src/acp-proxy.ts +433 -70
- package/src/cli.ts +478 -10
- package/src/cluster-service.ts +158 -14
- package/src/compat.ts +0 -6
- package/src/config.ts +17 -5
- package/src/connection.ts +61 -55
- package/src/e2e/helpers.ts +1 -5
- package/src/file-transfer.ts +721 -0
- package/src/handoff.ts +21 -8
- package/src/health-tracker.ts +6 -1
- package/src/index.ts +245 -11
- package/src/knowledge-sync.ts +74 -7
- package/src/model-proxy.ts +35 -10
- package/src/peer-manager.ts +84 -13
- package/src/rate-limiter.ts +16 -10
- package/src/router.ts +115 -33
- package/src/sentinel-manager.ts +59 -7
- package/src/sentinel.ts +13 -3
- package/src/terminal.ts +2 -1
- package/src/tool-proxy.ts +12 -4
- package/src/tools/cluster-diagnostic.ts +5 -2
- package/src/tools/cluster-edit.ts +2 -1
- package/src/tools/cluster-events.ts +3 -1
- package/src/tools/cluster-exec.ts +2 -0
- package/src/tools/cluster-handoff.ts +3 -1
- package/src/tools/cluster-peers.ts +3 -1
- package/src/tools/cluster-read.ts +4 -1
- package/src/tools/cluster-send.ts +2 -1
- package/src/tools/cluster-terminal.ts +4 -7
- package/src/tools/cluster-tool.ts +2 -2
- package/src/tools/cluster-transfer.ts +91 -0
- package/src/tools/cluster-write.ts +3 -1
- package/src/types.ts +191 -2
- package/src/web.ts +2 -10
- package/src/web-ui.ts +0 -1622
package/src/peer-manager.ts
CHANGED
|
@@ -46,6 +46,9 @@ const SKIP_DEDUP_TYPES = new Set([
|
|
|
46
46
|
// Terminal
|
|
47
47
|
"terminal_open_res", "terminal_data", "terminal_resize",
|
|
48
48
|
"terminal_close", "terminal_close_res",
|
|
49
|
+
// File transfer
|
|
50
|
+
"file_transfer_chunk", "file_transfer_chunk_ack",
|
|
51
|
+
"file_transfer_ack", "file_transfer_complete",
|
|
49
52
|
]);
|
|
50
53
|
|
|
51
54
|
/** Classify WebSocket close code into a human-readable reason. */
|
|
@@ -90,6 +93,8 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
90
93
|
private wss: WebSocketServer | null = null;
|
|
91
94
|
private reconnectTimers = new Map<string, ReturnType<typeof setTimeout>>();
|
|
92
95
|
private reconnectAttempts = new Map<string, number>();
|
|
96
|
+
/** Deferred disconnect timers — grace period before broadcasting peer_leave. */
|
|
97
|
+
private disconnectGraceTimers = new Map<string, ReturnType<typeof setTimeout>>();
|
|
93
98
|
private stopped = false;
|
|
94
99
|
/** Map from ws WebSocket to Connection for inbound connections. */
|
|
95
100
|
private inboundConnections = new Map<WsWebSocket, Connection>();
|
|
@@ -162,6 +167,17 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
162
167
|
}
|
|
163
168
|
}
|
|
164
169
|
|
|
170
|
+
/** Update the local tool proxy catalog and re-broadcast to all peers. */
|
|
171
|
+
updateToolCatalog(catalog: import("./types.ts").ToolCatalogEntry[]) {
|
|
172
|
+
if (this.localCapabilities.toolProxy) {
|
|
173
|
+
this.localCapabilities.toolProxy = { ...this.localCapabilities.toolProxy, catalog };
|
|
174
|
+
}
|
|
175
|
+
this.router.updateLocalToolCatalog(catalog);
|
|
176
|
+
for (const conn of this.router.getDirectConnections()) {
|
|
177
|
+
this.sendPeerSync(conn);
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
|
|
165
181
|
// ── Lifecycle ──────────────────────────────────────────────────
|
|
166
182
|
async start() {
|
|
167
183
|
await this.approvalManager.load();
|
|
@@ -187,6 +203,12 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
187
203
|
clearTimeout(timer);
|
|
188
204
|
}
|
|
189
205
|
this.reconnectTimers.clear();
|
|
206
|
+
// Flush all disconnect grace timers (execute leave immediately on shutdown)
|
|
207
|
+
for (const [nodeId, timer] of this.disconnectGraceTimers) {
|
|
208
|
+
clearTimeout(timer);
|
|
209
|
+
this.executePeerLeave(nodeId);
|
|
210
|
+
}
|
|
211
|
+
this.disconnectGraceTimers.clear();
|
|
190
212
|
|
|
191
213
|
this.router.broadcast({
|
|
192
214
|
type: "peer_leave",
|
|
@@ -458,9 +480,6 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
458
480
|
if (this.pendingApprovalConns.has(nodeId)) {
|
|
459
481
|
debug("approval", `reusing pending approval for ${nodeId}, updating conn ref`);
|
|
460
482
|
this.pendingApprovalConns.set(nodeId, { conn, caps });
|
|
461
|
-
if (this.config.peerApproval?.mode === "required") {
|
|
462
|
-
conn.on("close", () => this.onPeerDisconnected(conn));
|
|
463
|
-
}
|
|
464
483
|
return;
|
|
465
484
|
}
|
|
466
485
|
|
|
@@ -492,10 +511,12 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
492
511
|
);
|
|
493
512
|
}
|
|
494
513
|
});
|
|
495
|
-
// In required mode, don't complete the join yet
|
|
514
|
+
// In required mode, don't complete the join yet.
|
|
515
|
+
// No close handler needed here: the peer was never added to the router,
|
|
516
|
+
// so onPeerDisconnected would broadcast a spurious peer_leave.
|
|
517
|
+
// If the conn drops before approval resolves, the .then() handler sees
|
|
518
|
+
// activeConn.isOpen === false and skips all actions.
|
|
496
519
|
if (this.config.peerApproval?.mode === "required") {
|
|
497
|
-
// Wire up close handler to clean up if connection drops while pending
|
|
498
|
-
conn.on("close", () => this.onPeerDisconnected(conn));
|
|
499
520
|
return;
|
|
500
521
|
}
|
|
501
522
|
// In notify mode, requestApproval resolves immediately, but
|
|
@@ -512,6 +533,9 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
512
533
|
private completePeerJoin(conn: Connection, caps: NodeCapabilities) {
|
|
513
534
|
const nodeId = conn.remoteNodeId!;
|
|
514
535
|
|
|
536
|
+
// Cancel disconnect grace timer if the peer is reconnecting
|
|
537
|
+
const wasInGrace = this.cancelDisconnectGrace(nodeId);
|
|
538
|
+
|
|
515
539
|
// If there's an existing connection for this nodeId (e.g. peer reconnected
|
|
516
540
|
// while old TCP hadn't closed yet), close it AFTER overwriting the route so
|
|
517
541
|
// the stale-close guard in onPeerDisconnected correctly skips cleanup.
|
|
@@ -582,15 +606,58 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
582
606
|
return;
|
|
583
607
|
}
|
|
584
608
|
|
|
609
|
+
// Grace period: defer peer_leave broadcast to allow quick reconnection
|
|
610
|
+
// (e.g. iOS WiFi ↔ cellular handoff, brief audio interruption).
|
|
611
|
+
// If the peer reconnects within the grace window, completePeerJoin
|
|
612
|
+
// will cancel this timer via cancelDisconnectGrace.
|
|
613
|
+
const graceMs = this.config.disconnectGrace ?? 30_000;
|
|
614
|
+
if (graceMs <= 0) {
|
|
615
|
+
this.executePeerLeave(nodeId, conn);
|
|
616
|
+
return;
|
|
617
|
+
}
|
|
618
|
+
debug("peer", `onPeerDisconnected(${nodeId}): starting ${graceMs / 1000}s grace period`);
|
|
619
|
+
|
|
620
|
+
// Clear any existing grace timer for this node (shouldn't happen, but be safe)
|
|
621
|
+
this.cancelDisconnectGrace(nodeId);
|
|
622
|
+
|
|
623
|
+
this.disconnectGraceTimers.set(nodeId, setTimeout(() => {
|
|
624
|
+
this.disconnectGraceTimers.delete(nodeId);
|
|
625
|
+
this.executePeerLeave(nodeId, conn);
|
|
626
|
+
}, graceMs));
|
|
627
|
+
}
|
|
628
|
+
|
|
629
|
+
/** Cancel a pending disconnect grace timer (called when peer reconnects quickly). */
|
|
630
|
+
private cancelDisconnectGrace(nodeId: string): boolean {
|
|
631
|
+
const timer = this.disconnectGraceTimers.get(nodeId);
|
|
632
|
+
if (timer) {
|
|
633
|
+
clearTimeout(timer);
|
|
634
|
+
this.disconnectGraceTimers.delete(nodeId);
|
|
635
|
+
debug("peer", `cancelDisconnectGrace(${nodeId}): peer reconnected within grace period`);
|
|
636
|
+
return true;
|
|
637
|
+
}
|
|
638
|
+
return false;
|
|
639
|
+
}
|
|
640
|
+
|
|
641
|
+
/** Execute the actual peer leave (after grace period expires or immediate for shutdown). */
|
|
642
|
+
private executePeerLeave(nodeId: string, conn?: Connection) {
|
|
643
|
+
// Double-check the route hasn't been replaced by a new connection during grace
|
|
644
|
+
if (conn) {
|
|
645
|
+
const currentRoute = this.router.getRoute(nodeId);
|
|
646
|
+
if (currentRoute?.connection && currentRoute.connection !== conn) {
|
|
647
|
+
debug("peer", `executePeerLeave(${nodeId}): route replaced during grace — skipping`);
|
|
648
|
+
return;
|
|
649
|
+
}
|
|
650
|
+
}
|
|
651
|
+
|
|
585
652
|
audit("peer_leave", { nodeId });
|
|
586
653
|
this.router.removePeer(nodeId);
|
|
587
654
|
|
|
588
655
|
// Remove satellite contexts that were only reachable via this peer
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
}
|
|
656
|
+
for (let i = this.satelliteContexts.length - 1; i >= 0; i--) {
|
|
657
|
+
if (this.satelliteContexts[i].nodeId === nodeId) {
|
|
658
|
+
this.satelliteContexts.splice(i, 1);
|
|
659
|
+
}
|
|
660
|
+
}
|
|
594
661
|
|
|
595
662
|
this.router.broadcast({
|
|
596
663
|
type: "peer_leave",
|
|
@@ -745,13 +812,17 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
|
|
|
745
812
|
const prev = this.router.getRoute(peer.nodeId);
|
|
746
813
|
const hadAgents = prev?.agents.length ?? 0;
|
|
747
814
|
const hadDirectPeers = prev?.directPeers.length ?? 0;
|
|
748
|
-
const hadToolProxy = JSON.stringify(prev?.toolProxy);
|
|
749
815
|
const hadDeviceInfo = prev?.deviceInfo?.hostname;
|
|
750
816
|
const hadAcpAgents = prev?.acpAgents?.length ?? 0;
|
|
817
|
+
const hadToolProxyEnabled = prev?.toolProxy?.enabled;
|
|
818
|
+
const hadToolProxyCatalogLen = prev?.toolProxy?.catalog?.length ?? 0;
|
|
819
|
+
const hadToolProxyAllowLen = prev?.toolProxy?.allow?.length ?? 0;
|
|
751
820
|
this.router.updatePeerCapabilities(peer.nodeId, peer);
|
|
752
821
|
if (peer.agents.length !== hadAgents || peer.models.length !== (prev?.models.length ?? 0)
|
|
753
822
|
|| (peer.directPeers?.length ?? 0) !== hadDirectPeers
|
|
754
|
-
||
|
|
823
|
+
|| peer.toolProxy?.enabled !== hadToolProxyEnabled
|
|
824
|
+
|| (peer.toolProxy?.catalog?.length ?? 0) !== hadToolProxyCatalogLen
|
|
825
|
+
|| (peer.toolProxy?.allow?.length ?? 0) !== hadToolProxyAllowLen
|
|
755
826
|
|| peer.deviceInfo?.hostname !== hadDeviceInfo
|
|
756
827
|
|| (peer.acpAgents?.length ?? 0) !== hadAcpAgents) {
|
|
757
828
|
changed = true;
|
package/src/rate-limiter.ts
CHANGED
|
@@ -33,19 +33,20 @@ export class RateLimiter {
|
|
|
33
33
|
|
|
34
34
|
let timestamps = this.attempts.get(ip);
|
|
35
35
|
if (timestamps) {
|
|
36
|
-
//
|
|
37
|
-
|
|
36
|
+
// In-place pruning: find first non-expired index and splice
|
|
37
|
+
let firstValid = 0;
|
|
38
|
+
while (firstValid < timestamps.length && timestamps[firstValid] <= cutoff) firstValid++;
|
|
39
|
+
if (firstValid > 0) timestamps.splice(0, firstValid);
|
|
38
40
|
} else {
|
|
39
41
|
timestamps = [];
|
|
42
|
+
this.attempts.set(ip, timestamps);
|
|
40
43
|
}
|
|
41
44
|
|
|
42
45
|
if (timestamps.length >= this.config.maxAttempts) {
|
|
43
|
-
this.attempts.set(ip, timestamps);
|
|
44
46
|
return false;
|
|
45
47
|
}
|
|
46
48
|
|
|
47
49
|
timestamps.push(now);
|
|
48
|
-
this.attempts.set(ip, timestamps);
|
|
49
50
|
return true;
|
|
50
51
|
}
|
|
51
52
|
|
|
@@ -61,19 +62,24 @@ export class RateLimiter {
|
|
|
61
62
|
/** Get remaining attempts for an IP. */
|
|
62
63
|
remaining(ip: string): number {
|
|
63
64
|
const cutoff = Date.now() - this.config.windowMs;
|
|
64
|
-
const timestamps = this.attempts.get(ip)
|
|
65
|
-
|
|
65
|
+
const timestamps = this.attempts.get(ip);
|
|
66
|
+
if (!timestamps) return this.config.maxAttempts;
|
|
67
|
+
let active = 0;
|
|
68
|
+
for (let i = timestamps.length - 1; i >= 0; i--) {
|
|
69
|
+
if (timestamps[i] > cutoff) active++; else break;
|
|
70
|
+
}
|
|
66
71
|
return Math.max(0, this.config.maxAttempts - active);
|
|
67
72
|
}
|
|
68
73
|
|
|
69
74
|
private gc() {
|
|
70
75
|
const cutoff = Date.now() - this.config.windowMs;
|
|
71
76
|
for (const [ip, timestamps] of this.attempts) {
|
|
72
|
-
|
|
73
|
-
|
|
77
|
+
let firstValid = 0;
|
|
78
|
+
while (firstValid < timestamps.length && timestamps[firstValid] <= cutoff) firstValid++;
|
|
79
|
+
if (firstValid === timestamps.length) {
|
|
74
80
|
this.attempts.delete(ip);
|
|
75
|
-
} else {
|
|
76
|
-
|
|
81
|
+
} else if (firstValid > 0) {
|
|
82
|
+
timestamps.splice(0, firstValid);
|
|
77
83
|
}
|
|
78
84
|
}
|
|
79
85
|
}
|
package/src/router.ts
CHANGED
|
@@ -38,6 +38,14 @@ export class Router {
|
|
|
38
38
|
/** Failed request IDs with expiry timestamps. Separate from dedup to support longer TTLs. */
|
|
39
39
|
private failedRequests = new Map<string, number>(); // requestId → expiresAt
|
|
40
40
|
|
|
41
|
+
// ── Indexes for O(1) lookups in hot paths ──────────────────────
|
|
42
|
+
/** agentId → Set of nodeIds that host this agent. */
|
|
43
|
+
private agentIndex = new Map<string, Set<string>>();
|
|
44
|
+
/** tag → Set of nodeIds (both node-level and agent-level tags). */
|
|
45
|
+
private tagIndex = new Map<string, Set<string>>();
|
|
46
|
+
/** modelId → Set of nodeIds that provide this model. */
|
|
47
|
+
private modelIndex = new Map<string, Set<string>>();
|
|
48
|
+
|
|
41
49
|
constructor(
|
|
42
50
|
nodeId: string,
|
|
43
51
|
localCapabilities?: { agents: AgentInfo[]; models: ModelInfo[]; tags: string[]; deviceInfo?: DeviceInfo; toolProxy?: ToolProxyInfo; acpAgents?: AcpAgentInfo[] },
|
|
@@ -53,11 +61,64 @@ export class Router {
|
|
|
53
61
|
this.rotateTimer = setInterval(() => this.rotateSeenFrames(), ROTATE_INTERVAL);
|
|
54
62
|
}
|
|
55
63
|
|
|
64
|
+
/** Rebuild all indexes from scratch. Called after any route table mutation. */
|
|
65
|
+
private rebuildIndexes() {
|
|
66
|
+
this.agentIndex.clear();
|
|
67
|
+
this.tagIndex.clear();
|
|
68
|
+
this.modelIndex.clear();
|
|
69
|
+
for (const entry of this.routes.values()) {
|
|
70
|
+
this.indexEntry(entry);
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
/** Add a single entry to all indexes. */
|
|
75
|
+
private indexEntry(entry: RouteEntry) {
|
|
76
|
+
const nid = entry.nodeId;
|
|
77
|
+
for (const a of entry.agents) {
|
|
78
|
+
let set = this.agentIndex.get(a.id);
|
|
79
|
+
if (!set) { set = new Set(); this.agentIndex.set(a.id, set); }
|
|
80
|
+
set.add(nid);
|
|
81
|
+
for (const t of a.tags ?? []) {
|
|
82
|
+
let ts = this.tagIndex.get(t);
|
|
83
|
+
if (!ts) { ts = new Set(); this.tagIndex.set(t, ts); }
|
|
84
|
+
ts.add(nid);
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
for (const t of entry.tags ?? []) {
|
|
88
|
+
let set = this.tagIndex.get(t);
|
|
89
|
+
if (!set) { set = new Set(); this.tagIndex.set(t, set); }
|
|
90
|
+
set.add(nid);
|
|
91
|
+
}
|
|
92
|
+
for (const m of entry.models ?? []) {
|
|
93
|
+
let set = this.modelIndex.get(m.id);
|
|
94
|
+
if (!set) { set = new Set(); this.modelIndex.set(m.id, set); }
|
|
95
|
+
set.add(nid);
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
/** Remove a single entry from all indexes. */
|
|
100
|
+
private unindexEntry(entry: RouteEntry) {
|
|
101
|
+
const nid = entry.nodeId;
|
|
102
|
+
for (const a of entry.agents ?? []) {
|
|
103
|
+
this.agentIndex.get(a.id)?.delete(nid);
|
|
104
|
+
for (const t of a.tags ?? []) this.tagIndex.get(t)?.delete(nid);
|
|
105
|
+
}
|
|
106
|
+
for (const t of entry.tags ?? []) this.tagIndex.get(t)?.delete(nid);
|
|
107
|
+
for (const m of entry.models ?? []) this.modelIndex.get(m.id)?.delete(nid);
|
|
108
|
+
}
|
|
109
|
+
|
|
56
110
|
/** Update locally advertised ACP agents (used after auto-detection). */
|
|
57
111
|
updateLocalAcpAgents(agents: AcpAgentInfo[]) {
|
|
58
112
|
this.localAcpAgents = agents;
|
|
59
113
|
}
|
|
60
114
|
|
|
115
|
+
/** Update the local tool proxy catalog (descriptions + schemas for remote callers). */
|
|
116
|
+
updateLocalToolCatalog(catalog: ToolProxyInfo["catalog"]) {
|
|
117
|
+
if (this.localToolProxy) {
|
|
118
|
+
this.localToolProxy = { ...this.localToolProxy, catalog };
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
|
|
61
122
|
/** Stop periodic cleanup. Call on shutdown. */
|
|
62
123
|
destroy() {
|
|
63
124
|
if (this.rotateTimer) {
|
|
@@ -75,8 +136,10 @@ export class Router {
|
|
|
75
136
|
connection: Connection,
|
|
76
137
|
capabilities: { agents: AgentInfo[]; models: ModelInfo[]; tags: string[]; deviceInfo?: DeviceInfo; toolProxy?: ToolProxyInfo; acpAgents?: AcpAgentInfo[] },
|
|
77
138
|
) {
|
|
139
|
+
const old = this.routes.get(nodeId);
|
|
140
|
+
if (old) this.unindexEntry(old);
|
|
78
141
|
this.connections.set(nodeId, connection);
|
|
79
|
-
|
|
142
|
+
const entry: RouteEntry = {
|
|
80
143
|
nodeId,
|
|
81
144
|
agents: capabilities.agents,
|
|
82
145
|
models: capabilities.models,
|
|
@@ -89,7 +152,9 @@ export class Router {
|
|
|
89
152
|
deviceInfo: capabilities.deviceInfo,
|
|
90
153
|
toolProxy: capabilities.toolProxy,
|
|
91
154
|
acpAgents: capabilities.acpAgents,
|
|
92
|
-
}
|
|
155
|
+
};
|
|
156
|
+
this.routes.set(nodeId, entry);
|
|
157
|
+
this.indexEntry(entry);
|
|
93
158
|
}
|
|
94
159
|
|
|
95
160
|
addRelayPeer(peer: PeerInfo, viaNodeId: string) {
|
|
@@ -106,7 +171,8 @@ export class Router {
|
|
|
106
171
|
// Don't overwrite a better relay route with a worse one (allow equal for capability updates)
|
|
107
172
|
if (existing?.reachableVia && existing.latencyMs < estimatedLatency) return;
|
|
108
173
|
|
|
109
|
-
this.
|
|
174
|
+
if (existing) this.unindexEntry(existing);
|
|
175
|
+
const entry: RouteEntry = {
|
|
110
176
|
nodeId: peer.nodeId,
|
|
111
177
|
agents: peer.agents,
|
|
112
178
|
models: peer.models,
|
|
@@ -119,15 +185,22 @@ export class Router {
|
|
|
119
185
|
deviceInfo: peer.deviceInfo,
|
|
120
186
|
toolProxy: peer.toolProxy,
|
|
121
187
|
acpAgents: peer.acpAgents,
|
|
122
|
-
}
|
|
188
|
+
};
|
|
189
|
+
this.routes.set(peer.nodeId, entry);
|
|
190
|
+
this.indexEntry(entry);
|
|
123
191
|
}
|
|
124
192
|
|
|
125
193
|
removePeer(nodeId: string) {
|
|
126
194
|
this.connections.delete(nodeId);
|
|
127
|
-
this.routes.
|
|
195
|
+
const removed = this.routes.get(nodeId);
|
|
196
|
+
if (removed) {
|
|
197
|
+
this.unindexEntry(removed);
|
|
198
|
+
this.routes.delete(nodeId);
|
|
199
|
+
}
|
|
128
200
|
// Also remove routes that relied on this node as relay
|
|
129
201
|
for (const [id, entry] of this.routes) {
|
|
130
202
|
if (entry.reachableVia === nodeId) {
|
|
203
|
+
this.unindexEntry(entry);
|
|
131
204
|
this.routes.delete(id);
|
|
132
205
|
}
|
|
133
206
|
}
|
|
@@ -139,6 +212,7 @@ export class Router {
|
|
|
139
212
|
) {
|
|
140
213
|
const entry = this.routes.get(nodeId);
|
|
141
214
|
if (entry) {
|
|
215
|
+
this.unindexEntry(entry);
|
|
142
216
|
entry.agents = capabilities.agents;
|
|
143
217
|
entry.models = capabilities.models;
|
|
144
218
|
entry.tags = capabilities.tags;
|
|
@@ -151,6 +225,7 @@ export class Router {
|
|
|
151
225
|
entry.toolProxy = capabilities.toolProxy;
|
|
152
226
|
entry.acpAgents = capabilities.acpAgents;
|
|
153
227
|
entry.lastSeen = Date.now();
|
|
228
|
+
this.indexEntry(entry);
|
|
154
229
|
}
|
|
155
230
|
}
|
|
156
231
|
|
|
@@ -169,20 +244,20 @@ export class Router {
|
|
|
169
244
|
/** Resolve target agent to a specific nodeId. Supports agent ID or "tags:<tag>". */
|
|
170
245
|
resolveAgent(target: string): RouteEntry | undefined {
|
|
171
246
|
const isTagQuery = target.startsWith("tags:");
|
|
172
|
-
|
|
247
|
+
|
|
248
|
+
let nodeIds: Set<string> | undefined;
|
|
249
|
+
if (isTagQuery) {
|
|
250
|
+
nodeIds = this.tagIndex.get(target.slice(5));
|
|
251
|
+
} else {
|
|
252
|
+
nodeIds = this.agentIndex.get(target);
|
|
253
|
+
}
|
|
173
254
|
|
|
174
255
|
let candidates: RouteEntry[] = [];
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
if (entry
|
|
180
|
-
candidates.push(entry);
|
|
181
|
-
}
|
|
182
|
-
} else {
|
|
183
|
-
if (entry.agents.some((a) => a.id === target)) {
|
|
184
|
-
candidates.push(entry);
|
|
185
|
-
}
|
|
256
|
+
if (nodeIds) {
|
|
257
|
+
for (const nid of nodeIds) {
|
|
258
|
+
if (nid === this.nodeId) continue;
|
|
259
|
+
const entry = this.routes.get(nid);
|
|
260
|
+
if (entry) candidates.push(entry);
|
|
186
261
|
}
|
|
187
262
|
}
|
|
188
263
|
|
|
@@ -193,6 +268,7 @@ export class Router {
|
|
|
193
268
|
}
|
|
194
269
|
|
|
195
270
|
if (candidates.length === 0) return undefined;
|
|
271
|
+
if (candidates.length === 1) return candidates[0];
|
|
196
272
|
|
|
197
273
|
// Sort: direct connections first, then by latency
|
|
198
274
|
candidates.sort((a, b) => {
|
|
@@ -210,15 +286,16 @@ export class Router {
|
|
|
210
286
|
resolveNode(target: string): RouteEntry | undefined {
|
|
211
287
|
if (target.startsWith("tags:")) {
|
|
212
288
|
const tag = target.slice(5);
|
|
289
|
+
const nodeIds = this.tagIndex.get(tag);
|
|
290
|
+
if (!nodeIds) return undefined;
|
|
213
291
|
const candidates: RouteEntry[] = [];
|
|
214
|
-
for (const
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
if (entry.
|
|
218
|
-
candidates.push(entry);
|
|
219
|
-
}
|
|
292
|
+
for (const nid of nodeIds) {
|
|
293
|
+
if (nid === this.nodeId) continue;
|
|
294
|
+
const entry = this.routes.get(nid);
|
|
295
|
+
if (entry) candidates.push(entry);
|
|
220
296
|
}
|
|
221
297
|
if (candidates.length === 0) return undefined;
|
|
298
|
+
if (candidates.length === 1) return candidates[0];
|
|
222
299
|
// Sort: direct connections first, then by latency
|
|
223
300
|
candidates.sort((a, b) => {
|
|
224
301
|
const aDirect = a.connection ? 0 : 1;
|
|
@@ -236,10 +313,13 @@ export class Router {
|
|
|
236
313
|
/** Find reachable nodes that provide a specific model, sorted by latency.
|
|
237
314
|
* Excludes nodes in the `exclude` set. */
|
|
238
315
|
findNodesForModel(modelId: string, exclude?: Set<string>): RouteEntry[] {
|
|
316
|
+
const nodeIds = this.modelIndex.get(modelId);
|
|
317
|
+
if (!nodeIds) return [];
|
|
239
318
|
const candidates: RouteEntry[] = [];
|
|
240
|
-
for (const
|
|
241
|
-
if (exclude?.has(
|
|
242
|
-
|
|
319
|
+
for (const nid of nodeIds) {
|
|
320
|
+
if (exclude?.has(nid)) continue;
|
|
321
|
+
const entry = this.routes.get(nid);
|
|
322
|
+
if (!entry) continue;
|
|
243
323
|
// Check reachability
|
|
244
324
|
if (entry.connection?.isOpen) {
|
|
245
325
|
candidates.push(entry);
|
|
@@ -248,13 +328,15 @@ export class Router {
|
|
|
248
328
|
if (relay?.isOpen) candidates.push(entry);
|
|
249
329
|
}
|
|
250
330
|
}
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
331
|
+
if (candidates.length > 1) {
|
|
332
|
+
// Sort: direct first, then by latency
|
|
333
|
+
candidates.sort((a, b) => {
|
|
334
|
+
const aDirect = a.connection ? 0 : 1;
|
|
335
|
+
const bDirect = b.connection ? 0 : 1;
|
|
336
|
+
if (aDirect !== bDirect) return aDirect - bDirect;
|
|
337
|
+
return a.latencyMs - b.latencyMs;
|
|
338
|
+
});
|
|
339
|
+
}
|
|
258
340
|
return candidates;
|
|
259
341
|
}
|
|
260
342
|
|
package/src/sentinel-manager.ts
CHANGED
|
@@ -8,7 +8,8 @@
|
|
|
8
8
|
|
|
9
9
|
import { fork, type ChildProcess } from "node:child_process";
|
|
10
10
|
import { join, dirname } from "node:path";
|
|
11
|
-
import { existsSync, readFileSync, mkdirSync, openSync } from "node:fs";
|
|
11
|
+
import { existsSync, readFileSync, mkdirSync, openSync, closeSync } from "node:fs";
|
|
12
|
+
import { createConnection } from "node:net";
|
|
12
13
|
import { homedir, tmpdir } from "node:os";
|
|
13
14
|
import type { ClawMatrixConfig } from "./config.ts";
|
|
14
15
|
|
|
@@ -42,6 +43,9 @@ export class SentinelManager {
|
|
|
42
43
|
execArgv: this.resolveExecArgv(),
|
|
43
44
|
});
|
|
44
45
|
|
|
46
|
+
// Close the log fd in the parent — the child has its own copy
|
|
47
|
+
closeSync(logFd);
|
|
48
|
+
|
|
45
49
|
// Send config to sentinel via IPC (includes gateway PID for health checks)
|
|
46
50
|
// If sentinel has no explicit listenPort but the gateway is a listener,
|
|
47
51
|
// inherit the gateway's port for automatic takeover when gateway dies.
|
|
@@ -83,7 +87,57 @@ export class SentinelManager {
|
|
|
83
87
|
}, 1000);
|
|
84
88
|
}
|
|
85
89
|
|
|
86
|
-
|
|
90
|
+
/**
|
|
91
|
+
* Kill the old sentinel and wait for the listen port to become free.
|
|
92
|
+
* Must be called BEFORE PeerManager.startListening() to avoid EADDRINUSE.
|
|
93
|
+
*/
|
|
94
|
+
async ensurePortFree() {
|
|
95
|
+
// Kill old sentinel process if alive
|
|
96
|
+
if (existsSync(this.pidFile)) {
|
|
97
|
+
try {
|
|
98
|
+
const pid = parseInt(readFileSync(this.pidFile, "utf-8").trim(), 10);
|
|
99
|
+
if (pid) {
|
|
100
|
+
try {
|
|
101
|
+
process.kill(pid, "SIGTERM");
|
|
102
|
+
// Wait for the process to exit (up to 5s)
|
|
103
|
+
for (let i = 0; i < 100; i++) {
|
|
104
|
+
try {
|
|
105
|
+
process.kill(pid, 0);
|
|
106
|
+
await new Promise((r) => setTimeout(r, 50));
|
|
107
|
+
} catch {
|
|
108
|
+
break; // exited
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
} catch {
|
|
112
|
+
// Already gone
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
} catch {
|
|
116
|
+
// Malformed PID file
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
// Probe the port until it's free (up to 5s)
|
|
121
|
+
const port = this.config.sentinel?.listenPort
|
|
122
|
+
?? (this.config.listen ? this.config.listenPort : 0);
|
|
123
|
+
if (!port) return;
|
|
124
|
+
|
|
125
|
+
const host = this.config.sentinel?.listenHost ?? this.config.listenHost ?? "0.0.0.0";
|
|
126
|
+
for (let i = 0; i < 50; i++) {
|
|
127
|
+
const inUse = await new Promise<boolean>((resolve) => {
|
|
128
|
+
const sock = createConnection({ port, host }, () => {
|
|
129
|
+
sock.destroy();
|
|
130
|
+
resolve(true);
|
|
131
|
+
});
|
|
132
|
+
sock.on("error", () => resolve(false));
|
|
133
|
+
sock.setTimeout(200, () => { sock.destroy(); resolve(false); });
|
|
134
|
+
});
|
|
135
|
+
if (!inUse) return;
|
|
136
|
+
await new Promise((r) => setTimeout(r, 100));
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
async stop() {
|
|
87
141
|
// IPC is disconnected shortly after start, so use PID file for shutdown
|
|
88
142
|
if (existsSync(this.pidFile)) {
|
|
89
143
|
try {
|
|
@@ -92,13 +146,11 @@ export class SentinelManager {
|
|
|
92
146
|
process.kill(pid, "SIGTERM");
|
|
93
147
|
// Wait briefly for the process to exit so the next start()
|
|
94
148
|
// doesn't race with a still-dying sentinel
|
|
95
|
-
|
|
96
|
-
while (Date.now() < deadline) {
|
|
149
|
+
for (let i = 0; i < 60; i++) {
|
|
97
150
|
try {
|
|
98
151
|
process.kill(pid, 0);
|
|
99
|
-
// Still alive —
|
|
100
|
-
|
|
101
|
-
while (Date.now() < waitUntil) { /* spin */ }
|
|
152
|
+
// Still alive — async wait
|
|
153
|
+
await new Promise((r) => setTimeout(r, 50));
|
|
102
154
|
} catch {
|
|
103
155
|
break; // exited
|
|
104
156
|
}
|
package/src/sentinel.ts
CHANGED
|
@@ -78,6 +78,11 @@ let httpServer: Server | null = null;
|
|
|
78
78
|
let wss: WebSocketServer | null = null;
|
|
79
79
|
const inboundConnections = new Map<WsWebSocket, Connection>();
|
|
80
80
|
let listening = false;
|
|
81
|
+
/** Timestamp when sentinel voluntarily released the port. During the cooldown
|
|
82
|
+
* period (30s), sentinel will not re-listen even if gateway appears to be gone,
|
|
83
|
+
* giving the new gateway time to bind the port. */
|
|
84
|
+
let voluntaryReleaseAt = 0;
|
|
85
|
+
const PORT_RELEASE_COOLDOWN = 30_000;
|
|
81
86
|
|
|
82
87
|
// ── Rate limiting for diagnostic_exec ────────────────────────────
|
|
83
88
|
const EXEC_RATE_WINDOW = 60_000; // 1 minute
|
|
@@ -464,6 +469,9 @@ function stopListening() {
|
|
|
464
469
|
httpServer?.close();
|
|
465
470
|
httpServer = null;
|
|
466
471
|
listening = false;
|
|
472
|
+
// Mark voluntary release — sentinel will not re-listen during cooldown
|
|
473
|
+
// to give the gateway time to bind the port.
|
|
474
|
+
voluntaryReleaseAt = Date.now();
|
|
467
475
|
log("Port released — gateway is back");
|
|
468
476
|
}
|
|
469
477
|
|
|
@@ -612,12 +620,14 @@ function startGatewayHealthCheck() {
|
|
|
612
620
|
log(`Gateway process (pid ${gatewayPid}) gone — entering standalone mode`);
|
|
613
621
|
// Connect to peers now that gateway is down
|
|
614
622
|
connectAllPeers();
|
|
615
|
-
// Take over the gateway's listen port
|
|
623
|
+
// Take over the gateway's listen port — but respect cooldown after
|
|
624
|
+
// voluntary release so we don't compete with a restarting gateway.
|
|
616
625
|
if (config.listenPort) {
|
|
617
|
-
|
|
626
|
+
const cooldownRemaining = PORT_RELEASE_COOLDOWN - (Date.now() - voluntaryReleaseAt);
|
|
627
|
+
const delay = Math.max(2_000, cooldownRemaining);
|
|
618
628
|
setTimeout(() => {
|
|
619
629
|
if (!gatewayAlive && !isReplaced()) startListening();
|
|
620
|
-
},
|
|
630
|
+
}, delay);
|
|
621
631
|
}
|
|
622
632
|
}
|
|
623
633
|
}
|
package/src/terminal.ts
CHANGED
|
@@ -109,7 +109,8 @@ export class TerminalManager {
|
|
|
109
109
|
return;
|
|
110
110
|
}
|
|
111
111
|
|
|
112
|
-
//
|
|
112
|
+
// TODO(security): allowFrom 为空时默认允许所有已认证 peer 打开终端会话。
|
|
113
|
+
// 当前仅用于受信任网络。开放前需改为默认拒绝或要求显式配置。
|
|
113
114
|
if (termConfig?.allowFrom && termConfig.allowFrom.length > 0) {
|
|
114
115
|
if (!termConfig.allowFrom.includes(frame.from)) {
|
|
115
116
|
this.peerManager.sendTo(frame.from, {
|
package/src/tool-proxy.ts
CHANGED
|
@@ -45,6 +45,10 @@ export class ToolProxy {
|
|
|
45
45
|
private logger: PluginLogger;
|
|
46
46
|
private satelliteHandler: SatelliteToolHandler | null = null;
|
|
47
47
|
private readonly toolTimeout: number;
|
|
48
|
+
// Pre-built Sets for O(1) allow/deny checks
|
|
49
|
+
private readonly allowSet: Set<string>;
|
|
50
|
+
private readonly denySet: Set<string>;
|
|
51
|
+
private readonly allowAll: boolean;
|
|
48
52
|
|
|
49
53
|
constructor(config: ClawMatrixConfig, peerManager: PeerManager, gatewayInfo: GatewayInfo, logger: PluginLogger) {
|
|
50
54
|
this.config = config;
|
|
@@ -52,6 +56,10 @@ export class ToolProxy {
|
|
|
52
56
|
this.gatewayInfo = gatewayInfo;
|
|
53
57
|
this.logger = logger;
|
|
54
58
|
this.toolTimeout = config.toolTimeout ?? DEFAULT_TOOL_TIMEOUT;
|
|
59
|
+
const tp = config.toolProxy;
|
|
60
|
+
this.denySet = new Set(tp?.deny ?? []);
|
|
61
|
+
this.allowSet = new Set(tp?.allow ?? []);
|
|
62
|
+
this.allowAll = this.allowSet.size === 0 || this.allowSet.has("*");
|
|
55
63
|
}
|
|
56
64
|
|
|
57
65
|
/** Set the satellite tool handler (called by ClusterRuntime after WebHandler is created). */
|
|
@@ -338,10 +346,10 @@ export class ToolProxy {
|
|
|
338
346
|
}
|
|
339
347
|
|
|
340
348
|
// ── Security ───────────────────────────────────────────────────
|
|
341
|
-
private isToolAllowed(tool: string,
|
|
342
|
-
if (
|
|
343
|
-
if (
|
|
344
|
-
return
|
|
349
|
+
private isToolAllowed(tool: string, _tpConfig: ToolProxyConfig): boolean {
|
|
350
|
+
if (this.denySet.has(tool)) return false;
|
|
351
|
+
if (this.allowAll) return true;
|
|
352
|
+
return this.allowSet.has(tool);
|
|
345
353
|
}
|
|
346
354
|
|
|
347
355
|
destroy() {
|