clawmatrix 0.4.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -21
- package/cli/bin/clawmatrix.mjs +300 -1
- package/package.json +8 -1
- package/src/acp-proxy.ts +122 -50
- package/src/{web.ts → api.ts} +646 -25
- package/src/audit.ts +37 -2
- package/src/auth.ts +5 -10
- package/src/automation.ts +625 -0
- package/src/cluster-service.ts +172 -16
- package/src/compat.ts +103 -0
- package/src/config.ts +75 -27
- package/src/connection.ts +215 -37
- package/src/crypto.ts +72 -5
- package/src/device-info.ts +21 -2
- package/src/file-transfer.ts +3 -2
- package/src/handoff.ts +90 -32
- package/src/health-tracker.ts +91 -356
- package/src/index.ts +421 -13
- package/src/kanban.ts +507 -0
- package/src/knowledge-sync.ts +158 -7
- package/src/local-tools.ts +65 -2
- package/src/log-replication.ts +198 -0
- package/src/model-proxy.ts +152 -60
- package/src/peer-approval.ts +3 -2
- package/src/peer-manager.ts +230 -44
- package/src/retry.ts +81 -0
- package/src/router.ts +152 -104
- package/src/sentinel.ts +85 -51
- package/src/store.ts +578 -0
- package/src/terminal.ts +17 -8
- package/src/tool-proxy.ts +6 -5
- package/src/tools/cluster-events.ts +6 -6
- package/src/tools/cluster-kanban.ts +345 -0
- package/src/tools/cluster-peers.ts +1 -1
- package/src/tools/cluster-query.ts +145 -0
- package/src/types.ts +95 -9
package/src/router.ts
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
import type { ClusterFrame, AnyClusterFrame, PeerInfo, AgentInfo, ModelInfo, DeviceInfo, ToolProxyInfo, AcpAgentInfo } from "./types.ts";
|
|
2
2
|
import type { Connection } from "./connection.ts";
|
|
3
|
+
import { LRUCache } from "lru-cache";
|
|
4
|
+
import { debug } from "./debug.ts";
|
|
3
5
|
|
|
4
6
|
const DEFAULT_TTL = 3;
|
|
5
7
|
const MAX_SEEN_FRAMES = 10_000;
|
|
6
8
|
const MAX_FAILED_REQUESTS = 5_000;
|
|
7
|
-
const
|
|
9
|
+
const SEEN_FRAME_TTL = 120_000; // 2 minutes (was ~60-120s with double-map rotation)
|
|
10
|
+
const FAILED_REQUEST_TTL = 900_000; // 15 minutes
|
|
8
11
|
|
|
9
12
|
export interface RouteEntry {
|
|
10
13
|
nodeId: string;
|
|
@@ -33,12 +36,17 @@ export class Router {
|
|
|
33
36
|
private connections = new Map<string, Connection>(); // nodeId → active (best) direct connection
|
|
34
37
|
/** All live channels per nodeId (multi-channel support). */
|
|
35
38
|
private channels = new Map<string, Set<Connection>>();
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
private
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
39
|
+
|
|
40
|
+
// ── Delta sync versioning ─────────────────────────────────────
|
|
41
|
+
private syncVersion = 0;
|
|
42
|
+
/** Version at which each nodeId was last added/updated. */
|
|
43
|
+
private peerVersions = new Map<string, number>();
|
|
44
|
+
/** Removed nodeIds with their removal version. Pruned once the map holds more than 200 entries (entries more than 100 versions old are dropped). */
|
|
45
|
+
private removedPeers = new Map<string, number>();
|
|
46
|
+
/** LRU-based frame deduplication with TTL. */
|
|
47
|
+
private seenFrames = new LRUCache<string, true>({ max: MAX_SEEN_FRAMES, ttl: SEEN_FRAME_TTL });
|
|
48
|
+
/** Failed request IDs with TTL. Separate from dedup to support longer TTLs. */
|
|
49
|
+
private failedRequests = new LRUCache<string, true>({ max: MAX_FAILED_REQUESTS, ttl: FAILED_REQUEST_TTL });
|
|
42
50
|
|
|
43
51
|
// ── Indexes for O(1) lookups in hot paths ──────────────────────
|
|
44
52
|
/** agentId → Set of nodeIds that host this agent. */
|
|
@@ -59,18 +67,6 @@ export class Router {
|
|
|
59
67
|
this.localDeviceInfo = localCapabilities?.deviceInfo;
|
|
60
68
|
this.localToolProxy = localCapabilities?.toolProxy;
|
|
61
69
|
this.localAcpAgents = localCapabilities?.acpAgents;
|
|
62
|
-
|
|
63
|
-
this.rotateTimer = setInterval(() => this.rotateSeenFrames(), ROTATE_INTERVAL);
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
/** Rebuild all indexes from scratch. Called after any route table mutation. */
|
|
67
|
-
private rebuildIndexes() {
|
|
68
|
-
this.agentIndex.clear();
|
|
69
|
-
this.tagIndex.clear();
|
|
70
|
-
this.modelIndex.clear();
|
|
71
|
-
for (const entry of this.routes.values()) {
|
|
72
|
-
this.indexEntry(entry);
|
|
73
|
-
}
|
|
74
70
|
}
|
|
75
71
|
|
|
76
72
|
/** Add a single entry to all indexes. */
|
|
@@ -98,15 +94,25 @@ export class Router {
|
|
|
98
94
|
}
|
|
99
95
|
}
|
|
100
96
|
|
|
101
|
-
/** Remove a single entry from all indexes. */
|
|
97
|
+
/** Remove a single entry from all indexes. Cleans up empty Sets to prevent memory leaks. */
|
|
102
98
|
private unindexEntry(entry: RouteEntry) {
|
|
103
99
|
const nid = entry.nodeId;
|
|
104
100
|
for (const a of entry.agents ?? []) {
|
|
105
|
-
this.agentIndex.get(a.id)
|
|
106
|
-
|
|
101
|
+
const aSet = this.agentIndex.get(a.id);
|
|
102
|
+
if (aSet) { aSet.delete(nid); if (aSet.size === 0) this.agentIndex.delete(a.id); }
|
|
103
|
+
for (const t of a.tags ?? []) {
|
|
104
|
+
const tSet = this.tagIndex.get(t);
|
|
105
|
+
if (tSet) { tSet.delete(nid); if (tSet.size === 0) this.tagIndex.delete(t); }
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
for (const t of entry.tags ?? []) {
|
|
109
|
+
const tSet = this.tagIndex.get(t);
|
|
110
|
+
if (tSet) { tSet.delete(nid); if (tSet.size === 0) this.tagIndex.delete(t); }
|
|
111
|
+
}
|
|
112
|
+
for (const m of entry.models ?? []) {
|
|
113
|
+
const mSet = this.modelIndex.get(m.id);
|
|
114
|
+
if (mSet) { mSet.delete(nid); if (mSet.size === 0) this.modelIndex.delete(m.id); }
|
|
107
115
|
}
|
|
108
|
-
for (const t of entry.tags ?? []) this.tagIndex.get(t)?.delete(nid);
|
|
109
|
-
for (const m of entry.models ?? []) this.modelIndex.get(m.id)?.delete(nid);
|
|
110
116
|
}
|
|
111
117
|
|
|
112
118
|
/** Update locally advertised ACP agents (used after auto-detection). */
|
|
@@ -123,12 +129,7 @@ export class Router {
|
|
|
123
129
|
|
|
124
130
|
/** Clear the dedup and failed-request caches and the channel map. Call on shutdown. */
|
|
125
131
|
destroy() {
|
|
126
|
-
|
|
127
|
-
clearInterval(this.rotateTimer);
|
|
128
|
-
this.rotateTimer = null;
|
|
129
|
-
}
|
|
130
|
-
this.seenCurrent.clear();
|
|
131
|
-
this.seenPrevious.clear();
|
|
132
|
+
this.seenFrames.clear();
|
|
132
133
|
this.failedRequests.clear();
|
|
133
134
|
this.channels.clear();
|
|
134
135
|
}
|
|
@@ -162,6 +163,9 @@ export class Router {
|
|
|
162
163
|
};
|
|
163
164
|
this.routes.set(nodeId, entry);
|
|
164
165
|
this.indexEntry(entry);
|
|
166
|
+
this.syncVersion++;
|
|
167
|
+
this.peerVersions.set(nodeId, this.syncVersion);
|
|
168
|
+
this.removedPeers.delete(nodeId);
|
|
165
169
|
}
|
|
166
170
|
|
|
167
171
|
/** Add an additional channel to an existing peer (multi-channel). */
|
|
@@ -263,6 +267,9 @@ export class Router {
|
|
|
263
267
|
};
|
|
264
268
|
this.routes.set(peer.nodeId, entry);
|
|
265
269
|
this.indexEntry(entry);
|
|
270
|
+
this.syncVersion++;
|
|
271
|
+
this.peerVersions.set(peer.nodeId, this.syncVersion);
|
|
272
|
+
this.removedPeers.delete(peer.nodeId);
|
|
266
273
|
}
|
|
267
274
|
|
|
268
275
|
removePeer(nodeId: string) {
|
|
@@ -272,12 +279,17 @@ export class Router {
|
|
|
272
279
|
if (removed) {
|
|
273
280
|
this.unindexEntry(removed);
|
|
274
281
|
this.routes.delete(nodeId);
|
|
282
|
+
this.syncVersion++;
|
|
283
|
+
this.removedPeers.set(nodeId, this.syncVersion);
|
|
284
|
+
this.peerVersions.delete(nodeId);
|
|
275
285
|
}
|
|
276
286
|
// Also remove routes that relied on this node as relay
|
|
277
287
|
for (const [id, entry] of this.routes) {
|
|
278
288
|
if (entry.reachableVia === nodeId) {
|
|
279
289
|
this.unindexEntry(entry);
|
|
280
290
|
this.routes.delete(id);
|
|
291
|
+
this.removedPeers.set(id, this.syncVersion);
|
|
292
|
+
this.peerVersions.delete(id);
|
|
281
293
|
}
|
|
282
294
|
}
|
|
283
295
|
}
|
|
@@ -302,6 +314,8 @@ export class Router {
|
|
|
302
314
|
entry.acpAgents = capabilities.acpAgents;
|
|
303
315
|
entry.lastSeen = Date.now();
|
|
304
316
|
this.indexEntry(entry);
|
|
317
|
+
this.syncVersion++;
|
|
318
|
+
this.peerVersions.set(nodeId, this.syncVersion);
|
|
305
319
|
}
|
|
306
320
|
}
|
|
307
321
|
|
|
@@ -317,8 +331,9 @@ export class Router {
|
|
|
317
331
|
return this.routes.get(nodeId);
|
|
318
332
|
}
|
|
319
333
|
|
|
320
|
-
/** Resolve target agent to a specific nodeId. Supports agent ID or "tags:<tag>".
|
|
321
|
-
|
|
334
|
+
/** Resolve target agent to a specific nodeId. Supports agent ID or "tags:<tag>".
|
|
335
|
+
* Optionally excludes nodes in the `exclude` set (e.g. previously failed nodes). */
|
|
336
|
+
resolveAgent(target: string, exclude?: Set<string>): RouteEntry | undefined {
|
|
322
337
|
const isTagQuery = target.startsWith("tags:");
|
|
323
338
|
|
|
324
339
|
let nodeIds: Set<string> | undefined;
|
|
@@ -332,6 +347,7 @@ export class Router {
|
|
|
332
347
|
if (nodeIds) {
|
|
333
348
|
for (const nid of nodeIds) {
|
|
334
349
|
if (nid === this.nodeId) continue;
|
|
350
|
+
if (exclude?.has(nid)) continue;
|
|
335
351
|
const entry = this.routes.get(nid);
|
|
336
352
|
if (entry) candidates.push(entry);
|
|
337
353
|
}
|
|
@@ -339,8 +355,10 @@ export class Router {
|
|
|
339
355
|
|
|
340
356
|
// Fallback: if no agent ID or tag matched, try matching by nodeId
|
|
341
357
|
if (candidates.length === 0 && !isTagQuery) {
|
|
342
|
-
|
|
343
|
-
|
|
358
|
+
if (!exclude?.has(target)) {
|
|
359
|
+
const byNode = this.routes.get(target);
|
|
360
|
+
if (byNode && byNode.nodeId !== this.nodeId) candidates.push(byNode);
|
|
361
|
+
}
|
|
344
362
|
}
|
|
345
363
|
|
|
346
364
|
if (candidates.length === 0) return undefined;
|
|
@@ -452,6 +470,7 @@ export class Router {
|
|
|
452
470
|
}
|
|
453
471
|
}
|
|
454
472
|
|
|
473
|
+
debug("router", `sendTo(${targetNodeId}): all paths failed (conn open=${route.connection?.isOpen}, relay=${route.reachableVia})`);
|
|
455
474
|
return false;
|
|
456
475
|
}
|
|
457
476
|
|
|
@@ -477,70 +496,26 @@ export class Router {
|
|
|
477
496
|
return relayed;
|
|
478
497
|
}
|
|
479
498
|
|
|
480
|
-
// ── Deduplication (
|
|
499
|
+
// ── Deduplication (LRU with TTL) ──────────────────────────────
|
|
481
500
|
/**
|
|
482
501
|
* Returns true if the frame has been seen before (duplicate).
|
|
483
|
-
* Uses
|
|
484
|
-
* promoted from `seenPrevious` on access. Every ROTATE_INTERVAL the
|
|
485
|
-
* previous map is discarded and current becomes previous — O(1) cleanup.
|
|
502
|
+
* Uses LRU cache with TTL — automatic eviction by age and size.
|
|
486
503
|
*/
|
|
487
504
|
isDuplicate(frameId: string): boolean {
|
|
488
505
|
if (!frameId) return false;
|
|
489
|
-
if (this.
|
|
490
|
-
|
|
491
|
-
// Promote to current so it survives the next rotation
|
|
492
|
-
this.seenCurrent.set(frameId, true);
|
|
493
|
-
return true;
|
|
494
|
-
}
|
|
495
|
-
|
|
496
|
-
this.seenCurrent.set(frameId, true);
|
|
497
|
-
// Safety valve: if current map grows too large, force a rotation
|
|
498
|
-
if (this.seenCurrent.size > MAX_SEEN_FRAMES) {
|
|
499
|
-
this.rotateSeenFrames();
|
|
500
|
-
}
|
|
506
|
+
if (this.seenFrames.has(frameId)) return true;
|
|
507
|
+
this.seenFrames.set(frameId, true);
|
|
501
508
|
return false;
|
|
502
509
|
}
|
|
503
510
|
|
|
504
511
|
/** Mark a request ID as failed so late responses are ignored.
|
|
505
512
|
* TTL defaults to 15 minutes — long enough for handoff timeouts. */
|
|
506
513
|
markFailed(requestId: string, ttlMs = 900_000) {
|
|
507
|
-
this.failedRequests.set(requestId,
|
|
508
|
-
// Evict entries when map grows too large: first expired, then FIFO
|
|
509
|
-
if (this.failedRequests.size > MAX_FAILED_REQUESTS) {
|
|
510
|
-
const now = Date.now();
|
|
511
|
-
// Pass 1: remove expired entries
|
|
512
|
-
for (const [id, expiresAt] of this.failedRequests) {
|
|
513
|
-
if (now > expiresAt) this.failedRequests.delete(id);
|
|
514
|
-
}
|
|
515
|
-
// Pass 2: if still over limit, remove oldest (insertion-order) entries
|
|
516
|
-
if (this.failedRequests.size > MAX_FAILED_REQUESTS) {
|
|
517
|
-
for (const [id] of this.failedRequests) {
|
|
518
|
-
if (this.failedRequests.size <= MAX_FAILED_REQUESTS) break;
|
|
519
|
-
this.failedRequests.delete(id);
|
|
520
|
-
}
|
|
521
|
-
}
|
|
522
|
-
}
|
|
514
|
+
this.failedRequests.set(requestId, true, { ttl: ttlMs });
|
|
523
515
|
}
|
|
524
516
|
|
|
525
517
|
isFailed(requestId: string): boolean {
|
|
526
|
-
|
|
527
|
-
if (expiresAt === undefined) return false;
|
|
528
|
-
if (Date.now() > expiresAt) {
|
|
529
|
-
this.failedRequests.delete(requestId);
|
|
530
|
-
return false;
|
|
531
|
-
}
|
|
532
|
-
return true;
|
|
533
|
-
}
|
|
534
|
-
|
|
535
|
-
private rotateSeenFrames() {
|
|
536
|
-
this.seenPrevious = this.seenCurrent;
|
|
537
|
-
this.seenCurrent = new Map();
|
|
538
|
-
|
|
539
|
-
// Prune expired failed requests
|
|
540
|
-
const now = Date.now();
|
|
541
|
-
for (const [id, expiresAt] of this.failedRequests) {
|
|
542
|
-
if (now > expiresAt) this.failedRequests.delete(id);
|
|
543
|
-
}
|
|
518
|
+
return this.failedRequests.has(requestId);
|
|
544
519
|
}
|
|
545
520
|
|
|
546
521
|
// ── Accessors ──────────────────────────────────────────────────
|
|
@@ -562,38 +537,111 @@ export class Router {
|
|
|
562
537
|
return [...this.connections.values()];
|
|
563
538
|
}
|
|
564
539
|
|
|
565
|
-
/**
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
540
|
+
/** Current sync version for delta protocol. */
|
|
541
|
+
get currentSyncVersion(): number {
|
|
542
|
+
return this.syncVersion;
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
/** Convert a RouteEntry to PeerInfo for wire transmission. */
|
|
546
|
+
private entryToPeerInfo(entry: RouteEntry): PeerInfo {
|
|
547
|
+
return {
|
|
548
|
+
nodeId: entry.nodeId,
|
|
549
|
+
agents: entry.agents,
|
|
550
|
+
models: entry.models,
|
|
551
|
+
tags: entry.tags,
|
|
552
|
+
reachableVia: entry.reachableVia ?? undefined,
|
|
553
|
+
directPeers: entry.directPeers.length > 0 ? entry.directPeers : undefined,
|
|
554
|
+
deviceInfo: entry.deviceInfo,
|
|
555
|
+
toolProxy: entry.toolProxy,
|
|
556
|
+
acpAgents: entry.acpAgents,
|
|
557
|
+
latencyMs: entry.latencyMs > 0 ? entry.latencyMs : undefined,
|
|
558
|
+
};
|
|
559
|
+
}
|
|
560
|
+
|
|
561
|
+
/** Build local node PeerInfo. */
|
|
562
|
+
private localPeerInfo(): PeerInfo {
|
|
563
|
+
return {
|
|
571
564
|
nodeId: this.nodeId,
|
|
572
565
|
agents: this.localAgents,
|
|
573
566
|
models: this.localModels,
|
|
574
567
|
tags: this.localTags,
|
|
575
|
-
directPeers:
|
|
568
|
+
directPeers: [...this.connections.keys()],
|
|
576
569
|
deviceInfo: this.localDeviceInfo,
|
|
577
570
|
toolProxy: this.localToolProxy,
|
|
578
571
|
acpAgents: this.localAcpAgents,
|
|
579
|
-
}
|
|
572
|
+
};
|
|
573
|
+
}
|
|
574
|
+
|
|
575
|
+
/** Build PeerInfo list for peer_sync. */
|
|
576
|
+
buildPeerSyncPayload(): PeerInfo[] {
|
|
577
|
+
const peers: PeerInfo[] = [this.localPeerInfo()];
|
|
580
578
|
for (const entry of this.routes.values()) {
|
|
581
579
|
// Same-nodeId local clients (Mac/iOS) are not included in peer_sync;
|
|
582
580
|
// clients obtain the gateway's capabilities via auth_ok, so there is no need to repeat them here.
|
|
583
581
|
if (entry.nodeId === this.nodeId) continue;
|
|
584
|
-
peers.push(
|
|
585
|
-
nodeId: entry.nodeId,
|
|
586
|
-
agents: entry.agents,
|
|
587
|
-
models: entry.models,
|
|
588
|
-
tags: entry.tags,
|
|
589
|
-
reachableVia: entry.reachableVia ?? undefined,
|
|
590
|
-
directPeers: entry.directPeers.length > 0 ? entry.directPeers : undefined,
|
|
591
|
-
deviceInfo: entry.deviceInfo,
|
|
592
|
-
toolProxy: entry.toolProxy,
|
|
593
|
-
acpAgents: entry.acpAgents,
|
|
594
|
-
latencyMs: entry.latencyMs > 0 ? entry.latencyMs : undefined,
|
|
595
|
-
});
|
|
582
|
+
peers.push(this.entryToPeerInfo(entry));
|
|
596
583
|
}
|
|
597
584
|
return peers;
|
|
598
585
|
}
|
|
586
|
+
|
|
587
|
+
/** Build delta peer_sync payload since a given version.
|
|
588
|
+
* Returns { version, peers (full) } if delta is too large or sinceVersion=0.
|
|
589
|
+
* Returns { version, added, removed, updated, peers (compat) } for incremental; `added` is currently always empty because new and changed peers are both reported in `updated`. */
|
|
590
|
+
buildPeerSyncDelta(sinceVersion: number): {
|
|
591
|
+
version: number;
|
|
592
|
+
peers: PeerInfo[];
|
|
593
|
+
added?: PeerInfo[];
|
|
594
|
+
removed?: string[];
|
|
595
|
+
updated?: PeerInfo[];
|
|
596
|
+
} {
|
|
597
|
+
const version = this.syncVersion;
|
|
598
|
+
|
|
599
|
+
// Full sync if this is the first sync (sinceVersion === 0, i.e. no history) or the version gap exceeds 100
|
|
600
|
+
if (sinceVersion === 0 || version - sinceVersion > 100) {
|
|
601
|
+
return { version, peers: this.buildPeerSyncPayload() };
|
|
602
|
+
}
|
|
603
|
+
|
|
604
|
+
const added: PeerInfo[] = [];
|
|
605
|
+
const updated: PeerInfo[] = [];
|
|
606
|
+
const removed: string[] = [];
|
|
607
|
+
|
|
608
|
+
// Collect changes since sinceVersion
|
|
609
|
+
for (const [nodeId, ver] of this.peerVersions) {
|
|
610
|
+
if (ver > sinceVersion) {
|
|
611
|
+
const entry = this.routes.get(nodeId);
|
|
612
|
+
if (entry && entry.nodeId !== this.nodeId) {
|
|
613
|
+
// Distinguish add vs update: if version was set before sinceVersion, it's an update
|
|
614
|
+
// But we don't track the original add version separately, so treat all as "updated"
|
|
615
|
+
updated.push(this.entryToPeerInfo(entry));
|
|
616
|
+
}
|
|
617
|
+
}
|
|
618
|
+
}
|
|
619
|
+
|
|
620
|
+
// Always include self in updated (directPeers may have changed)
|
|
621
|
+
updated.push(this.localPeerInfo());
|
|
622
|
+
|
|
623
|
+
for (const [nodeId, ver] of this.removedPeers) {
|
|
624
|
+
if (ver > sinceVersion) {
|
|
625
|
+
removed.push(nodeId);
|
|
626
|
+
}
|
|
627
|
+
}
|
|
628
|
+
|
|
629
|
+
// Prune stale removedPeers entries
|
|
630
|
+
if (this.removedPeers.size > 200) {
|
|
631
|
+
const threshold = this.syncVersion - 100;
|
|
632
|
+
for (const [nodeId, ver] of this.removedPeers) {
|
|
633
|
+
if (ver < threshold) this.removedPeers.delete(nodeId);
|
|
634
|
+
}
|
|
635
|
+
}
|
|
636
|
+
|
|
637
|
+
// If delta is > 50% of full sync, just send full
|
|
638
|
+
const fullSize = this.routes.size + 1;
|
|
639
|
+
if (updated.length + removed.length > fullSize * 0.5) {
|
|
640
|
+
return { version, peers: this.buildPeerSyncPayload() };
|
|
641
|
+
}
|
|
642
|
+
|
|
643
|
+
// For backward compatibility: peers field contains updated entries
|
|
644
|
+
// so old nodes that only read `peers` still get capability changes
|
|
645
|
+
return { version, peers: updated, added, removed, updated };
|
|
646
|
+
}
|
|
599
647
|
}
|
package/src/sentinel.ts
CHANGED
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
|
|
16
16
|
import { spawn } from "node:child_process";
|
|
17
17
|
import { readFileSync, writeFileSync, unlinkSync, existsSync } from "node:fs";
|
|
18
|
+
import { readFile } from "node:fs/promises";
|
|
18
19
|
import { createServer, type Server } from "node:http";
|
|
19
20
|
import { WebSocketServer, WebSocket as WsWebSocket } from "ws";
|
|
20
21
|
import path from "node:path";
|
|
@@ -253,11 +254,11 @@ function handleFrame(frame: AnyClusterFrame, conn: Connection) {
|
|
|
253
254
|
}
|
|
254
255
|
|
|
255
256
|
function handleDiagnosticExec(frame: DiagnosticExec, conn: Connection) {
|
|
256
|
-
// Rate limiting
|
|
257
|
+
// Rate limiting — find the first in-window timestamp by index, then drop the expired prefix with a single splice() instead of calling shift() once per expired entry
|
|
257
258
|
const now = Date.now();
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
259
|
+
let firstValid = 0;
|
|
260
|
+
while (firstValid < execTimestamps.length && now - execTimestamps[firstValid]! > EXEC_RATE_WINDOW) firstValid++;
|
|
261
|
+
if (firstValid > 0) execTimestamps.splice(0, firstValid);
|
|
261
262
|
if (execTimestamps.length >= EXEC_RATE_LIMIT) {
|
|
262
263
|
conn.send({
|
|
263
264
|
type: "diagnostic_exec_res",
|
|
@@ -339,10 +340,10 @@ function handleDiagnosticStatus(frame: DiagnosticStatus, conn: Connection) {
|
|
|
339
340
|
|
|
340
341
|
// ── Port takeover: listen when gateway dies, release when it returns ──
|
|
341
342
|
|
|
342
|
-
function startListening() {
|
|
343
|
+
async function startListening() {
|
|
343
344
|
if (listening || !config.listenPort) return;
|
|
344
345
|
// If we've been replaced by a new sentinel, exit instead of competing for the port
|
|
345
|
-
if (isReplaced()) {
|
|
346
|
+
if (await isReplaced()) {
|
|
346
347
|
log("PID file replaced — another sentinel is active, exiting");
|
|
347
348
|
cleanup();
|
|
348
349
|
return;
|
|
@@ -372,7 +373,7 @@ function startListening() {
|
|
|
372
373
|
|
|
373
374
|
wss.on("connection", (ws) => {
|
|
374
375
|
const transport: WsTransport = {
|
|
375
|
-
send(data: string) { ws.send(data); },
|
|
376
|
+
send(data: string | Buffer) { ws.send(data); },
|
|
376
377
|
close(code?: number, reason?: string) { ws.close(code, reason); },
|
|
377
378
|
get readyState() { return ws.readyState; },
|
|
378
379
|
};
|
|
@@ -421,7 +422,7 @@ function startListening() {
|
|
|
421
422
|
conn.on("error", () => { /* close will follow */ });
|
|
422
423
|
|
|
423
424
|
ws.on("message", (data) => {
|
|
424
|
-
conn.feedMessage(typeof data === "string" ? data : String(data));
|
|
425
|
+
conn.feedMessage(Buffer.isBuffer(data) ? data : typeof data === "string" ? data : String(data));
|
|
425
426
|
});
|
|
426
427
|
|
|
427
428
|
ws.on("close", (code, reason) => {
|
|
@@ -439,15 +440,17 @@ function startListening() {
|
|
|
439
440
|
listening = false;
|
|
440
441
|
|
|
441
442
|
// If we've been replaced by a new sentinel, exit gracefully
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
443
|
+
isReplaced().then((replaced) => {
|
|
444
|
+
if (replaced) {
|
|
445
|
+
log("PID file replaced — exiting");
|
|
446
|
+
cleanup();
|
|
447
|
+
return;
|
|
448
|
+
}
|
|
449
|
+
// Port may still be held briefly by the dying gateway — retry after a delay
|
|
450
|
+
setTimeout(() => {
|
|
451
|
+
if (!gatewayAlive && config.listenPort) startListening();
|
|
452
|
+
}, 3_000);
|
|
453
|
+
});
|
|
451
454
|
});
|
|
452
455
|
|
|
453
456
|
httpServer.listen(port, host, () => {
|
|
@@ -458,21 +461,31 @@ function startListening() {
|
|
|
458
461
|
|
|
459
462
|
function stopListening() {
|
|
460
463
|
if (!listening) return;
|
|
461
|
-
//
|
|
462
|
-
for (const [
|
|
463
|
-
|
|
464
|
-
|
|
464
|
+
// Notify peers before closing so they reconnect immediately
|
|
465
|
+
for (const [, conn] of inboundConnections) {
|
|
466
|
+
try {
|
|
467
|
+
conn.send({
|
|
468
|
+
type: "peer_sync", from: sentinelNodeId(), timestamp: Date.now(),
|
|
469
|
+
payload: { peers: [] },
|
|
470
|
+
} as AnyClusterFrame);
|
|
471
|
+
} catch { /* best effort */ }
|
|
465
472
|
}
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
473
|
+
// Delay 200ms to let notification frames flush
|
|
474
|
+
setTimeout(() => {
|
|
475
|
+
for (const [ws, conn] of inboundConnections) {
|
|
476
|
+
conn.close(1001, "gateway recovered");
|
|
477
|
+
}
|
|
478
|
+
inboundConnections.clear();
|
|
479
|
+
wss?.close();
|
|
480
|
+
wss = null;
|
|
481
|
+
httpServer?.close();
|
|
482
|
+
httpServer = null;
|
|
483
|
+
listening = false;
|
|
484
|
+
// Mark voluntary release — sentinel will not re-listen during cooldown
|
|
485
|
+
// to give the gateway time to bind the port.
|
|
486
|
+
voluntaryReleaseAt = Date.now();
|
|
487
|
+
log("Port released — gateway is back");
|
|
488
|
+
}, 200);
|
|
476
489
|
}
|
|
477
490
|
|
|
478
491
|
// ── PID file management ─────────────────────────────────────────
|
|
@@ -480,8 +493,9 @@ function writePidFile() {
|
|
|
480
493
|
writeFileSync(config.pidFile, String(process.pid));
|
|
481
494
|
}
|
|
482
495
|
|
|
483
|
-
/** Check if another sentinel has replaced us (PID file contains a different PID).
|
|
484
|
-
|
|
496
|
+
/** Check if another sentinel has replaced us (PID file contains a different PID).
|
|
497
|
+
* Sync variant — only used in uncaughtException handler where the event loop may be draining. */
|
|
498
|
+
function isReplacedSync(): boolean {
|
|
485
499
|
try {
|
|
486
500
|
if (!existsSync(config.pidFile)) return true;
|
|
487
501
|
const filePid = parseInt(readFileSync(config.pidFile, "utf-8").trim(), 10);
|
|
@@ -491,7 +505,19 @@ function isReplaced(): boolean {
|
|
|
491
505
|
}
|
|
492
506
|
}
|
|
493
507
|
|
|
494
|
-
|
|
508
|
+
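/** Async counterpart of isReplacedSync — used in recurring timer callbacks to avoid blocking the event loop. Resolves to true when the PID file is missing, unreadable, or does not contain this process's PID. */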
|
|
509
|
+
async function isReplaced(): Promise<boolean> {
|
|
510
|
+
try {
|
|
511
|
+
const content = await readFile(config.pidFile, "utf-8");
|
|
512
|
+
const filePid = parseInt(content.trim(), 10);
|
|
513
|
+
return filePid !== process.pid;
|
|
514
|
+
} catch {
|
|
515
|
+
// File doesn't exist or can't be read — assume replaced
|
|
516
|
+
return true;
|
|
517
|
+
}
|
|
518
|
+
}
|
|
519
|
+
|
|
520
|
+
async function killOldSentinel() {
|
|
495
521
|
if (!existsSync(config.pidFile)) return;
|
|
496
522
|
try {
|
|
497
523
|
const oldPid = parseInt(readFileSync(config.pidFile, "utf-8").trim(), 10);
|
|
@@ -505,9 +531,8 @@ function killOldSentinel() {
|
|
|
505
531
|
while (Date.now() < deadline) {
|
|
506
532
|
try {
|
|
507
533
|
process.kill(oldPid, 0);
|
|
508
|
-
// Still alive —
|
|
509
|
-
|
|
510
|
-
while (Date.now() < waitUntil) { /* spin */ }
|
|
534
|
+
// Still alive — async wait to avoid blocking the event loop
|
|
535
|
+
await new Promise((r) => setTimeout(r, 100));
|
|
511
536
|
} catch {
|
|
512
537
|
// Process exited
|
|
513
538
|
log(`Old sentinel (pid ${oldPid}) exited`);
|
|
@@ -564,7 +589,7 @@ process.on("uncaughtException", (err) => {
|
|
|
564
589
|
log(`Uncaught exception: ${err.stack || err.message}`);
|
|
565
590
|
// EADDRINUSE from a listen call means the port is taken — if we've been
|
|
566
591
|
// replaced by a new sentinel/gateway, exit cleanly instead of looping.
|
|
567
|
-
if ((err as NodeJS.ErrnoException).code === "EADDRINUSE" &&
|
|
592
|
+
if ((err as NodeJS.ErrnoException).code === "EADDRINUSE" && isReplacedSync()) {
|
|
568
593
|
log("Port in use and PID file replaced — exiting");
|
|
569
594
|
cleanup();
|
|
570
595
|
}
|
|
@@ -588,15 +613,24 @@ function connectAllPeers() {
|
|
|
588
613
|
|
|
589
614
|
/** Disconnect from all peers (called when gateway recovers). */
|
|
590
615
|
function disconnectAllPeers() {
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
616
|
+
// Notify peers before closing so they reconnect immediately
|
|
617
|
+
for (const [, conn] of connections) {
|
|
618
|
+
try {
|
|
619
|
+
conn.send({
|
|
620
|
+
type: "peer_sync", from: sentinelNodeId(), timestamp: Date.now(),
|
|
621
|
+
payload: { peers: [] },
|
|
622
|
+
} as AnyClusterFrame);
|
|
623
|
+
} catch { /* best effort */ }
|
|
598
624
|
}
|
|
599
|
-
|
|
625
|
+
setTimeout(() => {
|
|
626
|
+
for (const [nodeId, conn] of connections) {
|
|
627
|
+
conn.close(1001, "gateway recovered");
|
|
628
|
+
connections.delete(nodeId);
|
|
629
|
+
}
|
|
630
|
+
for (const [, timer] of reconnectTimers) clearTimeout(timer);
|
|
631
|
+
reconnectTimers.clear();
|
|
632
|
+
reconnectAttempts.clear();
|
|
633
|
+
}, 200);
|
|
600
634
|
}
|
|
601
635
|
|
|
602
636
|
/** Periodically check if the gateway process is still alive via kill(pid, 0). */
|
|
@@ -625,22 +659,22 @@ function startGatewayHealthCheck() {
|
|
|
625
659
|
if (config.listenPort) {
|
|
626
660
|
const cooldownRemaining = PORT_RELEASE_COOLDOWN - (Date.now() - voluntaryReleaseAt);
|
|
627
661
|
const delay = Math.max(2_000, cooldownRemaining);
|
|
628
|
-
setTimeout(() => {
|
|
629
|
-
if (!gatewayAlive && !isReplaced()) startListening();
|
|
662
|
+
setTimeout(async () => {
|
|
663
|
+
if (!gatewayAlive && !(await isReplaced())) startListening();
|
|
630
664
|
}, delay);
|
|
631
665
|
}
|
|
632
666
|
}
|
|
633
667
|
}
|
|
634
|
-
},
|
|
668
|
+
}, 2_000);
|
|
635
669
|
}
|
|
636
670
|
|
|
637
|
-
function boot() {
|
|
671
|
+
async function boot() {
|
|
638
672
|
// Prefer explicit gatewayPid from config (sent by SentinelManager),
|
|
639
673
|
// fall back to ppid (may be inaccurate if forked indirectly).
|
|
640
674
|
gatewayPid = config.gatewayPid ?? process.ppid;
|
|
641
675
|
|
|
642
676
|
loadApprovedPeers();
|
|
643
|
-
killOldSentinel();
|
|
677
|
+
await killOldSentinel();
|
|
644
678
|
writePidFile();
|
|
645
679
|
log(`Started (pid ${process.pid}, gateway ${gatewayPid}, nodeId ${sentinelNodeId()}, takeover port ${config.listenPort || "none"})`);
|
|
646
680
|
|