nexus-prime 7.7.1 → 7.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -30,6 +30,7 @@ import { runHookBootstrap, runHookMemory, runHookMindkit, runHookGhostPass, runH
30
30
  import { resolveWorkspaceContext } from './engines/workspace-resolver.js';
31
31
  import { ensureDaemonReady, getDaemonStatus, stopDaemon } from './daemon/client.js';
32
32
  import { NexusDaemonServer } from './daemon/server.js';
33
+ import { DaemonSupervisor } from './daemon/supervisor.js';
33
34
  import { startDaemonBackedMcpProxy } from './daemon/proxy.js';
34
35
  import { getSharedLicenseManager, snapshotPCU, formatPCUStatus, loginFromCLI, isLoggedIn, logout, readAuthInfo } from './licensing/index.js';
35
36
  import { syncLicense, requestUpgrade } from './licensing/license-sync.js';
@@ -806,6 +807,14 @@ program
806
807
  process.exit(1);
807
808
  }
808
809
  console.error(`Nexus Prime daemon started (pid ${record.pid}, ${formatDaemonAddress(record)})`);
810
+ // Liveness supervisor: pings /health every 30s and restarts the daemon after
811
+ // 3 consecutive failed pings. Opt out with NEXUS_SUPERVISOR_DISABLED=1.
812
+ const supervisor = new DaemonSupervisor({
813
+ daemon,
814
+ workspaceContext,
815
+ getLockRecord: () => daemon.getLockRecord(),
816
+ });
817
+ supervisor.start();
809
818
  }));
810
819
  program
811
820
  .command('mcp')
@@ -15,6 +15,12 @@ export declare class NexusDaemonServer {
15
15
  private stopping;
16
16
  constructor(workspace: WorkspaceContext);
17
17
  private installProcessErrorHandlers;
18
+ /**
19
+ * Live read of the lock record (port + token) so supervisors and other
20
+ * callers can re-fetch credentials after a stop()/start() cycle without
21
+ * holding stale references.
22
+ */
23
+ getLockRecord(): DaemonLockRecord | null;
18
24
  start(): Promise<{
19
25
  started: boolean;
20
26
  record: DaemonLockRecord;
@@ -148,7 +148,18 @@ export class NexusDaemonServer {
148
148
  catch { /* last-resort logging — swallow */ }
149
149
  });
150
150
  }
151
+ /**
152
+ * Live read of the lock record (port + token) so supervisors and other
153
+ * callers can re-fetch credentials after a stop()/start() cycle without
154
+ * holding stale references. Returns whatever `this.lockRecord` holds at call time — the stored value itself, not a copy.
155
+ */
156
+ getLockRecord() {
157
+ return this.lockRecord;
158
+ }
151
159
  async start() {
160
+ // Clear the stopping flag so a second start() after a supervisor-triggered
161
+ // stop() doesn't silently no-op via the SIGINT/SIGTERM handler guard.
162
+ this.stopping = false;
152
163
  const lock = acquireDaemonLock(this.workspace, {
153
164
  token: this.authToken,
154
165
  });
@@ -0,0 +1,57 @@
1
+ import { NexusDaemonServer } from './server.js';
2
+ import type { DaemonLockRecord } from './lock.js';
3
+ import type { WorkspaceContext } from '../engines/workspace-resolver.js';
4
+ export interface SupervisorOptions { // Constructor options for DaemonSupervisor.
5
+ /** The daemon to watch initially. After a successful restart the supervisor swaps its own internal reference to a newly constructed server; the caller's variable is not updated. */
6
+ daemon: NexusDaemonServer;
7
+ /** Workspace context — passed to the NexusDaemonServer constructor when a fresh server is built on restart. */
8
+ workspaceContext: WorkspaceContext;
9
+ /** Live lock-record fetcher supplying the record handed to the health ping. A null result (daemon hasn't yet listened) makes the tick skip without counting a failure. */
10
+ getLockRecord: () => DaemonLockRecord | null;
11
+ /** Override the incidents log path (one JSON object appended per line). Defaults to ~/.nexus-prime/incidents.jsonl. */
12
+ incidentsPath?: string;
13
+ /** Milliseconds between health pings. Default 30 000 (30 s). */
14
+ pingIntervalMs?: number;
15
+ /** Timeout in milliseconds handed to each health ping. Default 5 000 (5 s). */
16
+ pingTimeoutMs?: number;
17
+ /** Consecutive failed pings (any rejection from the ping, not only timeouts) before the daemon is declared hung and restarted. Default 3. */
18
+ maxConsecutiveTimeouts?: number;
19
+ }
20
+ export interface SupervisorIncident { // One entry of the incidents log; serialized with JSON.stringify and appended as a single line.
21
+ ts: number; // Date.now() at the moment the entry was created.
22
+ kind: 'hung' | 'restart-ok' | 'restart-failed'; // 'hung' is logged before a restart attempt; 'restart-ok' / 'restart-failed' record its outcome.
23
+ consecutiveTimeouts: number; // Failure counter when the entry was created (always 0 for 'restart-ok').
24
+ pid: number; // process.pid of the process running the supervisor.
25
+ port: number | undefined; // Port taken from the lock record; undefined when no record was available.
26
+ error?: string; // Message of the ping error ('hung') or of the restart error ('restart-failed'); omitted for 'restart-ok'.
27
+ }
28
+ export declare class DaemonSupervisor { // In-process watchdog: pings the daemon's health endpoint and restarts it after repeated failures.
29
+ private daemon; // Daemon currently supervised; reassigned after a successful restart.
30
+ private readonly workspaceContext;
31
+ private readonly getLockRecord;
32
+ private readonly incidentsPath;
33
+ private readonly pingIntervalMs;
34
+ private readonly pingTimeoutMs;
35
+ private readonly maxConsecutiveTimeouts;
36
+ private consecutiveTimeouts; // Failed pings in a row; reset to 0 by a successful ping or a successful restart.
37
+ private intervalHandle; // Handle of the periodic timer; undefined while not running.
38
+ private restarting; // True while a restart is in progress; ticks return early meanwhile.
39
+ private stopped; // Set by stop(); ticks return early once set.
40
+ constructor(options: SupervisorOptions);
41
+ /**
42
+ * Schedules the first tick via setImmediate (no boot-time latency cost),
43
+ * then a periodic interval. When NEXUS_SUPERVISOR_DISABLED=1, or when an
44
+ * interval is already active, start() does nothing, so callers need no outer guard.
45
+ */
46
+ start(): void;
47
+ /** Stop watching: clears the interval and makes later ticks return immediately. Safe to call repeatedly. */
48
+ stop(): void;
49
+ /** Single health-check tick. Public for tests. A successful ping resets the failure counter; a failed ping increments it and triggers a restart once the configured threshold is reached. */
50
+ tick(): Promise<void>;
51
+ private handleHung; // Logs a 'hung' incident, stops the daemon, starts a new one, then logs the outcome.
52
+ private appendIncident; // Best-effort append to the incidents log; filesystem errors are swallowed.
53
+ /** Test helper: read current consecutive-timeout counter. */
54
+ getConsecutiveTimeouts(): number;
55
+ /** Test helper: get the live daemon ref (changes after a restart). */
56
+ getDaemon(): NexusDaemonServer;
57
+ }
@@ -0,0 +1,151 @@
1
+ /**
2
+ * Daemon liveness supervisor.
3
+ *
4
+ * Watches the daemon's `/health` endpoint on a fixed interval. If three
5
+ * consecutive pings time out, the daemon is considered hung — supervisor
6
+ * stops the existing server, constructs a fresh one, and starts it.
7
+ *
8
+ * Architecture: in-process. Holds a direct reference to the live
9
+ * NexusDaemonServer so a restart is a `stop()` + new `NexusDaemonServer().start()`
10
+ * call, not a fork. Cheaper than a child-process supervisor and avoids PID
11
+ * race conditions with the lock file. The tradeoff: a truly deadlocked event
12
+ * loop won't fire setInterval callbacks at all — but a deadlocked Node would
13
+ * also fail any out-of-process probe. The intended fail mode is "hung HTTP
14
+ * handler", which this catches.
15
+ *
16
+ * Opt out with `NEXUS_SUPERVISOR_DISABLED=1`.
17
+ */
18
+ import * as fs from 'fs';
19
+ import * as os from 'os';
20
+ import * as path from 'path';
21
+ import { NexusDaemonServer } from './server.js';
22
+ import { pingDaemonHealth } from './client.js';
23
+ const DEFAULT_PING_INTERVAL_MS = 30_000;
24
+ const DEFAULT_PING_TIMEOUT_MS = 5_000;
25
+ const DEFAULT_MAX_TIMEOUTS = 3;
26
/**
 * In-process liveness watchdog for a NexusDaemonServer.
 *
 * Every `pingIntervalMs` it pings the daemon's health endpoint using the
 * current lock record. After `maxConsecutiveTimeouts` failed pings in a row
 * it stops the daemon, constructs a new NexusDaemonServer for the same
 * workspace, starts it, and records the outcome in a JSONL incidents log.
 */
export class DaemonSupervisor {
    /** Daemon currently being supervised; replaced after a successful restart. */
    daemon;
    workspaceContext;
    /** Caller-supplied lock-record fetcher; used until the daemon is replaced. */
    getLockRecord;
    incidentsPath;
    pingIntervalMs;
    pingTimeoutMs;
    maxConsecutiveTimeouts;
    /** Failed pings in a row; reset by a successful ping or a successful restart. */
    consecutiveTimeouts = 0;
    /** Handle of the periodic timer, or undefined while not running. */
    intervalHandle;
    /** Handle of the deferred first tick, so stop() can cancel it. */
    firstTickHandle;
    /** True once handleHung() has swapped in a replacement daemon. */
    daemonReplaced = false;
    restarting = false;
    stopped = false;
    /**
     * @param options See SupervisorOptions: the daemon to watch, the workspace
     *   context used to build a replacement, a lock-record fetcher, and
     *   optional overrides for the incidents path and ping tuning.
     */
    constructor(options) {
        this.daemon = options.daemon;
        this.workspaceContext = options.workspaceContext;
        this.getLockRecord = options.getLockRecord;
        this.incidentsPath = options.incidentsPath
            ?? path.join(os.homedir(), '.nexus-prime', 'incidents.jsonl');
        this.pingIntervalMs = options.pingIntervalMs ?? DEFAULT_PING_INTERVAL_MS;
        this.pingTimeoutMs = options.pingTimeoutMs ?? DEFAULT_PING_TIMEOUT_MS;
        this.maxConsecutiveTimeouts = options.maxConsecutiveTimeouts ?? DEFAULT_MAX_TIMEOUTS;
    }
    /**
     * Schedule the first tick via setImmediate (no boot-time latency cost),
     * then a periodic interval. Does nothing when NEXUS_SUPERVISOR_DISABLED=1
     * or when the supervisor is already running, so callers don't need an
     * outer guard. May be called again after stop() to resume watching.
     */
    start() {
        if (process.env.NEXUS_SUPERVISOR_DISABLED === '1') {
            return;
        }
        if (this.intervalHandle) {
            return;
        }
        // Re-arm after a previous stop(); otherwise every tick scheduled
        // below would return immediately and the interval would be inert.
        this.stopped = false;
        // Defer so daemon.start() can finish listening before we ping.
        this.firstTickHandle = setImmediate(() => {
            this.firstTickHandle = undefined;
            void this.tick();
        });
        this.intervalHandle = setInterval(() => { void this.tick(); }, this.pingIntervalMs);
        // Never keep the process alive just for the watchdog.
        this.intervalHandle.unref();
    }
    /** Stop watching. Cancels the pending first tick and the interval. Safe to call repeatedly. */
    stop() {
        this.stopped = true;
        if (this.firstTickHandle) {
            clearImmediate(this.firstTickHandle);
            this.firstTickHandle = undefined;
        }
        if (this.intervalHandle) {
            clearInterval(this.intervalHandle);
            this.intervalHandle = undefined;
        }
    }
    /**
     * Single health-check tick. Public for tests.
     *
     * Skips while stopped, while a restart is in progress, or while no lock
     * record is available. A successful ping resets the failure counter; a
     * failed ping (any rejection) increments it and, once the threshold is
     * reached, triggers a restart.
     */
    async tick() {
        if (this.stopped || this.restarting) {
            return;
        }
        const record = this.currentLockRecord();
        if (!record) {
            return; // daemon not yet listening — skip silently
        }
        try {
            await pingDaemonHealth(record, this.pingTimeoutMs);
        }
        catch (err) {
            // stop() may have been called while the ping was in flight; a
            // stopped supervisor must neither count nor restart anything.
            if (this.stopped) {
                return;
            }
            this.consecutiveTimeouts += 1;
            if (this.consecutiveTimeouts >= this.maxConsecutiveTimeouts) {
                await this.handleHung(err);
            }
            return;
        }
        this.consecutiveTimeouts = 0;
    }
    /**
     * Lock record of the daemon that is live right now.
     *
     * Until a restart happens this defers to the caller-supplied fetcher.
     * After a restart that fetcher may still be bound to the old, stopped
     * server instance, so the record is read from the replacement daemon
     * directly.
     */
    currentLockRecord() {
        return this.daemonReplaced ? this.daemon.getLockRecord() : this.getLockRecord();
    }
    /**
     * Restart the daemon after it was declared hung.
     *
     * Logs a 'hung' incident, stops the current daemon, starts a freshly
     * constructed one, then logs 'restart-ok' or 'restart-failed'. Never
     * throws: restart errors are recorded in the incidents log only.
     *
     * NOTE(review): after a failed restart the old (stopped) daemon stays
     * referenced and no retry is scheduled here — confirm this is intended.
     *
     * @param error The ping failure that crossed the threshold.
     */
    async handleHung(error) {
        if (this.restarting) {
            return;
        }
        this.restarting = true;
        const record = this.currentLockRecord();
        const errMsg = error instanceof Error ? error.message : String(error);
        this.appendIncident({
            ts: Date.now(),
            kind: 'hung',
            consecutiveTimeouts: this.consecutiveTimeouts,
            pid: process.pid,
            port: record?.port,
            error: errMsg,
        });
        try {
            await this.daemon.stop('supervisor-restart');
            const fresh = new NexusDaemonServer(this.workspaceContext);
            await fresh.start();
            this.daemon = fresh;
            // From here on, health checks must target the replacement.
            this.daemonReplaced = true;
            this.consecutiveTimeouts = 0;
            this.appendIncident({
                ts: Date.now(),
                kind: 'restart-ok',
                consecutiveTimeouts: 0,
                pid: process.pid,
                port: fresh.getLockRecord()?.port,
            });
        }
        catch (restartErr) {
            this.appendIncident({
                ts: Date.now(),
                kind: 'restart-failed',
                consecutiveTimeouts: this.consecutiveTimeouts,
                pid: process.pid,
                port: record?.port,
                error: restartErr instanceof Error ? restartErr.message : String(restartErr),
            });
        }
        finally {
            this.restarting = false;
        }
    }
    /**
     * Append one incident as a JSON line, creating the parent directory if
     * needed. Deliberately best-effort: a logging failure must not disturb
     * supervision, so filesystem errors are swallowed.
     */
    appendIncident(incident) {
        try {
            fs.mkdirSync(path.dirname(this.incidentsPath), { recursive: true });
            fs.appendFileSync(this.incidentsPath, JSON.stringify(incident) + '\n', 'utf8');
        }
        catch { /* incident log is best-effort */ }
    }
    /** Test helper: read current consecutive-timeout counter. */
    getConsecutiveTimeouts() {
        return this.consecutiveTimeouts;
    }
    /** Test helper: get the live daemon ref (changes after a restart). */
    getDaemon() {
        return this.daemon;
    }
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nexus-prime",
3
- "version": "7.7.1",
3
+ "version": "7.8.0",
4
4
  "description": "Local-first MCP control plane for coding agents with bootstrap-orchestrate execution, memory fabric, token budgeting, and worktree-backed swarms",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",