nexus-prime 7.7.1 → 7.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +9 -0
- package/dist/daemon/server.d.ts +6 -0
- package/dist/daemon/server.js +11 -0
- package/dist/daemon/supervisor.d.ts +57 -0
- package/dist/daemon/supervisor.js +151 -0
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -30,6 +30,7 @@ import { runHookBootstrap, runHookMemory, runHookMindkit, runHookGhostPass, runH
|
|
|
30
30
|
import { resolveWorkspaceContext } from './engines/workspace-resolver.js';
|
|
31
31
|
import { ensureDaemonReady, getDaemonStatus, stopDaemon } from './daemon/client.js';
|
|
32
32
|
import { NexusDaemonServer } from './daemon/server.js';
|
|
33
|
+
import { DaemonSupervisor } from './daemon/supervisor.js';
|
|
33
34
|
import { startDaemonBackedMcpProxy } from './daemon/proxy.js';
|
|
34
35
|
import { getSharedLicenseManager, snapshotPCU, formatPCUStatus, loginFromCLI, isLoggedIn, logout, readAuthInfo } from './licensing/index.js';
|
|
35
36
|
import { syncLicense, requestUpgrade } from './licensing/license-sync.js';
|
|
@@ -806,6 +807,14 @@ program
|
|
|
806
807
|
process.exit(1);
|
|
807
808
|
}
|
|
808
809
|
console.error(`Nexus Prime daemon started (pid ${record.pid}, ${formatDaemonAddress(record)})`);
|
|
810
|
+
// Liveness supervisor: pings /health every 30s; restarts the daemon if
|
|
811
|
+
// 3 consecutive pings time out. Opt out with NEXUS_SUPERVISOR_DISABLED=1.
|
|
812
|
+
const supervisor = new DaemonSupervisor({
|
|
813
|
+
daemon,
|
|
814
|
+
workspaceContext,
|
|
815
|
+
getLockRecord: () => daemon.getLockRecord(),
|
|
816
|
+
});
|
|
817
|
+
supervisor.start();
|
|
809
818
|
}));
|
|
810
819
|
program
|
|
811
820
|
.command('mcp')
|
package/dist/daemon/server.d.ts
CHANGED
|
@@ -15,6 +15,12 @@ export declare class NexusDaemonServer {
|
|
|
15
15
|
private stopping;
|
|
16
16
|
constructor(workspace: WorkspaceContext);
|
|
17
17
|
private installProcessErrorHandlers;
|
|
18
|
+
/**
|
|
19
|
+
* Live read of the lock record (port + token) so supervisors and other
|
|
20
|
+
* callers can re-fetch credentials after a stop()/start() cycle without
|
|
21
|
+
* holding stale references.
|
|
22
|
+
*/
|
|
23
|
+
getLockRecord(): DaemonLockRecord | null;
|
|
18
24
|
start(): Promise<{
|
|
19
25
|
started: boolean;
|
|
20
26
|
record: DaemonLockRecord;
|
package/dist/daemon/server.js
CHANGED
|
@@ -148,7 +148,18 @@ export class NexusDaemonServer {
|
|
|
148
148
|
catch { /* last-resort logging — swallow */ }
|
|
149
149
|
});
|
|
150
150
|
}
|
|
151
|
+
/**
|
|
152
|
+
* Live read of the lock record (port + token) so supervisors and other
|
|
153
|
+
* callers can re-fetch credentials after a stop()/start() cycle without
|
|
154
|
+
* holding stale references.
|
|
155
|
+
*/
|
|
156
|
+
getLockRecord() {
|
|
157
|
+
return this.lockRecord;
|
|
158
|
+
}
|
|
151
159
|
async start() {
|
|
160
|
+
// Clear the stopping flag so a second start() after a supervisor-triggered
|
|
161
|
+
// stop() doesn't silently no-op via the SIGINT/SIGTERM handler guard.
|
|
162
|
+
this.stopping = false;
|
|
152
163
|
const lock = acquireDaemonLock(this.workspace, {
|
|
153
164
|
token: this.authToken,
|
|
154
165
|
});
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { NexusDaemonServer } from './server.js';
|
|
2
|
+
import type { DaemonLockRecord } from './lock.js';
|
|
3
|
+
import type { WorkspaceContext } from '../engines/workspace-resolver.js';
|
|
4
|
+
export interface SupervisorOptions {
|
|
5
|
+
/** The live daemon. The supervisor replaces this internally on a restart. */
|
|
6
|
+
daemon: NexusDaemonServer;
|
|
7
|
+
/** Workspace context — used to construct a fresh server on restart. */
|
|
8
|
+
workspaceContext: WorkspaceContext;
|
|
9
|
+
/** Live lock-record fetcher. Returns null when daemon hasn't yet listened. */
|
|
10
|
+
getLockRecord: () => DaemonLockRecord | null;
|
|
11
|
+
/** Override the incidents log path. Defaults to ~/.nexus-prime/incidents.jsonl. */
|
|
12
|
+
incidentsPath?: string;
|
|
13
|
+
/** Ping interval. Default 30 s. */
|
|
14
|
+
pingIntervalMs?: number;
|
|
15
|
+
/** Per-ping timeout. Default 5 s. */
|
|
16
|
+
pingTimeoutMs?: number;
|
|
17
|
+
/** Consecutive timeouts before declaring hung. Default 3. */
|
|
18
|
+
maxConsecutiveTimeouts?: number;
|
|
19
|
+
}
|
|
20
|
+
export interface SupervisorIncident {
|
|
21
|
+
ts: number;
|
|
22
|
+
kind: 'hung' | 'restart-ok' | 'restart-failed';
|
|
23
|
+
consecutiveTimeouts: number;
|
|
24
|
+
pid: number;
|
|
25
|
+
port: number | undefined;
|
|
26
|
+
error?: string;
|
|
27
|
+
}
|
|
28
|
+
export declare class DaemonSupervisor {
|
|
29
|
+
private daemon;
|
|
30
|
+
private readonly workspaceContext;
|
|
31
|
+
private readonly getLockRecord;
|
|
32
|
+
private readonly incidentsPath;
|
|
33
|
+
private readonly pingIntervalMs;
|
|
34
|
+
private readonly pingTimeoutMs;
|
|
35
|
+
private readonly maxConsecutiveTimeouts;
|
|
36
|
+
private consecutiveTimeouts;
|
|
37
|
+
private intervalHandle;
|
|
38
|
+
private restarting;
|
|
39
|
+
private stopped;
|
|
40
|
+
constructor(options: SupervisorOptions);
|
|
41
|
+
/**
|
|
42
|
+
* Schedule the first tick via setImmediate (no boot-time latency cost),
|
|
43
|
+
* then a periodic interval. Honors NEXUS_SUPERVISOR_DISABLED=1 — returns
|
|
44
|
+
* a no-op start in that case so callers don't need an outer guard.
|
|
45
|
+
*/
|
|
46
|
+
start(): void;
|
|
47
|
+
/** Stop watching. Safe to call repeatedly. */
|
|
48
|
+
stop(): void;
|
|
49
|
+
/** Single health-check tick. Public for tests. */
|
|
50
|
+
tick(): Promise<void>;
|
|
51
|
+
private handleHung;
|
|
52
|
+
private appendIncident;
|
|
53
|
+
/** Test helper: read current consecutive-timeout counter. */
|
|
54
|
+
getConsecutiveTimeouts(): number;
|
|
55
|
+
/** Test helper: get the live daemon ref (changes after a restart). */
|
|
56
|
+
getDaemon(): NexusDaemonServer;
|
|
57
|
+
}
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Daemon liveness supervisor.
|
|
3
|
+
*
|
|
4
|
+
* Watches the daemon's `/health` endpoint on a fixed interval. If three
|
|
5
|
+
* consecutive pings time out, the daemon is considered hung — supervisor
|
|
6
|
+
* stops the existing server, constructs a fresh one, and starts it.
|
|
7
|
+
*
|
|
8
|
+
* Architecture: in-process. Holds a direct reference to the live
|
|
9
|
+
* NexusDaemonServer so a restart is a `stop()` + new `NexusDaemonServer().start()`
|
|
10
|
+
* call, not a fork. Cheaper than a child-process supervisor and avoids PID
|
|
11
|
+
* race conditions with the lock file. The tradeoff: a truly deadlocked event
|
|
12
|
+
* loop won't fire setInterval callbacks at all — but a deadlocked Node would
|
|
13
|
+
* also fail any out-of-process probe. The intended fail mode is "hung HTTP
|
|
14
|
+
* handler", which this catches.
|
|
15
|
+
*
|
|
16
|
+
* Opt out with `NEXUS_SUPERVISOR_DISABLED=1`.
|
|
17
|
+
*/
|
|
18
|
+
import * as fs from 'fs';
|
|
19
|
+
import * as os from 'os';
|
|
20
|
+
import * as path from 'path';
|
|
21
|
+
import { NexusDaemonServer } from './server.js';
|
|
22
|
+
import { pingDaemonHealth } from './client.js';
|
|
23
|
+
const DEFAULT_PING_INTERVAL_MS = 30_000;
|
|
24
|
+
const DEFAULT_PING_TIMEOUT_MS = 5_000;
|
|
25
|
+
const DEFAULT_MAX_TIMEOUTS = 3;
|
|
26
|
+
export class DaemonSupervisor {
|
|
27
|
+
daemon;
|
|
28
|
+
workspaceContext;
|
|
29
|
+
getLockRecord;
|
|
30
|
+
incidentsPath;
|
|
31
|
+
pingIntervalMs;
|
|
32
|
+
pingTimeoutMs;
|
|
33
|
+
maxConsecutiveTimeouts;
|
|
34
|
+
consecutiveTimeouts = 0;
|
|
35
|
+
intervalHandle;
|
|
36
|
+
restarting = false;
|
|
37
|
+
stopped = false;
|
|
38
|
+
constructor(options) {
|
|
39
|
+
this.daemon = options.daemon;
|
|
40
|
+
this.workspaceContext = options.workspaceContext;
|
|
41
|
+
this.getLockRecord = options.getLockRecord;
|
|
42
|
+
this.incidentsPath = options.incidentsPath
|
|
43
|
+
?? path.join(os.homedir(), '.nexus-prime', 'incidents.jsonl');
|
|
44
|
+
this.pingIntervalMs = options.pingIntervalMs ?? DEFAULT_PING_INTERVAL_MS;
|
|
45
|
+
this.pingTimeoutMs = options.pingTimeoutMs ?? DEFAULT_PING_TIMEOUT_MS;
|
|
46
|
+
this.maxConsecutiveTimeouts = options.maxConsecutiveTimeouts ?? DEFAULT_MAX_TIMEOUTS;
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* Schedule the first tick via setImmediate (no boot-time latency cost),
|
|
50
|
+
* then a periodic interval. Honors NEXUS_SUPERVISOR_DISABLED=1 — returns
|
|
51
|
+
* a no-op start in that case so callers don't need an outer guard.
|
|
52
|
+
*/
|
|
53
|
+
start() {
|
|
54
|
+
if (process.env.NEXUS_SUPERVISOR_DISABLED === '1') {
|
|
55
|
+
return;
|
|
56
|
+
}
|
|
57
|
+
if (this.intervalHandle)
|
|
58
|
+
return;
|
|
59
|
+
// Defer so daemon.start() can finish listening before we ping.
|
|
60
|
+
setImmediate(() => {
|
|
61
|
+
if (this.stopped)
|
|
62
|
+
return;
|
|
63
|
+
void this.tick();
|
|
64
|
+
});
|
|
65
|
+
this.intervalHandle = setInterval(() => { void this.tick(); }, this.pingIntervalMs);
|
|
66
|
+
this.intervalHandle.unref();
|
|
67
|
+
}
|
|
68
|
+
/** Stop watching. Safe to call repeatedly. */
|
|
69
|
+
stop() {
|
|
70
|
+
this.stopped = true;
|
|
71
|
+
if (this.intervalHandle) {
|
|
72
|
+
clearInterval(this.intervalHandle);
|
|
73
|
+
this.intervalHandle = undefined;
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
/** Single health-check tick. Public for tests. */
|
|
77
|
+
async tick() {
|
|
78
|
+
if (this.stopped || this.restarting)
|
|
79
|
+
return;
|
|
80
|
+
const record = this.getLockRecord();
|
|
81
|
+
if (!record)
|
|
82
|
+
return; // daemon not yet listening — skip silently
|
|
83
|
+
try {
|
|
84
|
+
await pingDaemonHealth(record, this.pingTimeoutMs);
|
|
85
|
+
this.consecutiveTimeouts = 0;
|
|
86
|
+
}
|
|
87
|
+
catch (err) {
|
|
88
|
+
this.consecutiveTimeouts += 1;
|
|
89
|
+
if (this.consecutiveTimeouts >= this.maxConsecutiveTimeouts) {
|
|
90
|
+
await this.handleHung(err);
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
async handleHung(error) {
|
|
95
|
+
if (this.restarting)
|
|
96
|
+
return;
|
|
97
|
+
this.restarting = true;
|
|
98
|
+
const record = this.getLockRecord();
|
|
99
|
+
const errMsg = error instanceof Error ? error.message : String(error);
|
|
100
|
+
this.appendIncident({
|
|
101
|
+
ts: Date.now(),
|
|
102
|
+
kind: 'hung',
|
|
103
|
+
consecutiveTimeouts: this.consecutiveTimeouts,
|
|
104
|
+
pid: process.pid,
|
|
105
|
+
port: record?.port,
|
|
106
|
+
error: errMsg,
|
|
107
|
+
});
|
|
108
|
+
try {
|
|
109
|
+
await this.daemon.stop('supervisor-restart');
|
|
110
|
+
const fresh = new NexusDaemonServer(this.workspaceContext);
|
|
111
|
+
await fresh.start();
|
|
112
|
+
this.daemon = fresh;
|
|
113
|
+
this.consecutiveTimeouts = 0;
|
|
114
|
+
this.appendIncident({
|
|
115
|
+
ts: Date.now(),
|
|
116
|
+
kind: 'restart-ok',
|
|
117
|
+
consecutiveTimeouts: 0,
|
|
118
|
+
pid: process.pid,
|
|
119
|
+
port: this.daemon.getLockRecord()?.port,
|
|
120
|
+
});
|
|
121
|
+
}
|
|
122
|
+
catch (restartErr) {
|
|
123
|
+
this.appendIncident({
|
|
124
|
+
ts: Date.now(),
|
|
125
|
+
kind: 'restart-failed',
|
|
126
|
+
consecutiveTimeouts: this.consecutiveTimeouts,
|
|
127
|
+
pid: process.pid,
|
|
128
|
+
port: record?.port,
|
|
129
|
+
error: restartErr instanceof Error ? restartErr.message : String(restartErr),
|
|
130
|
+
});
|
|
131
|
+
}
|
|
132
|
+
finally {
|
|
133
|
+
this.restarting = false;
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
appendIncident(incident) {
|
|
137
|
+
try {
|
|
138
|
+
fs.mkdirSync(path.dirname(this.incidentsPath), { recursive: true });
|
|
139
|
+
fs.appendFileSync(this.incidentsPath, JSON.stringify(incident) + '\n', 'utf8');
|
|
140
|
+
}
|
|
141
|
+
catch { /* incident log is best-effort */ }
|
|
142
|
+
}
|
|
143
|
+
/** Test helper: read current consecutive-timeout counter. */
|
|
144
|
+
getConsecutiveTimeouts() {
|
|
145
|
+
return this.consecutiveTimeouts;
|
|
146
|
+
}
|
|
147
|
+
/** Test helper: get the live daemon ref (changes after a restart). */
|
|
148
|
+
getDaemon() {
|
|
149
|
+
return this.daemon;
|
|
150
|
+
}
|
|
151
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "nexus-prime",
|
|
3
|
-
"version": "7.7.1",
|
|
3
|
+
"version": "7.8.0",
|
|
4
4
|
"description": "Local-first MCP control plane for coding agents with bootstrap-orchestrate execution, memory fabric, token budgeting, and worktree-backed swarms",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|