niahere 0.2.38 → 0.2.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/commands/service.ts +4 -2
- package/src/core/alive.ts +81 -2
- package/src/core/daemon.ts +9 -6
- package/src/core/runner.ts +8 -4
package/package.json
CHANGED
package/src/commands/service.ts
CHANGED
|
@@ -48,6 +48,8 @@ function buildPlist(): string {
|
|
|
48
48
|
<key>SuccessfulExit</key>
|
|
49
49
|
<false/>
|
|
50
50
|
</dict>
|
|
51
|
+
<key>ThrottleInterval</key>
|
|
52
|
+
<integer>10</integer>
|
|
51
53
|
<key>StandardOutPath</key>
|
|
52
54
|
<string>${paths.daemonLog}</string>
|
|
53
55
|
<key>StandardErrorPath</key>
|
|
@@ -86,10 +88,10 @@ async function uninstallLaunchd(): Promise<void> {
|
|
|
86
88
|
const path = plistPath();
|
|
87
89
|
if (!existsSync(path)) return;
|
|
88
90
|
|
|
91
|
+
// Unload to stop the process and disable KeepAlive respawn.
|
|
92
|
+
// Keep the plist file so RunAtLoad starts the daemon on next login.
|
|
89
93
|
const unload = Bun.spawn(["launchctl", "unload", path], { stdout: "pipe", stderr: "pipe" });
|
|
90
94
|
await unload.exited;
|
|
91
|
-
|
|
92
|
-
try { unlinkSync(path); } catch { /* already gone */ }
|
|
93
95
|
}
|
|
94
96
|
|
|
95
97
|
function isLaunchdInstalled(): boolean {
|
package/src/core/alive.ts
CHANGED
|
@@ -4,11 +4,72 @@ import { getSql, closeDb } from "../db/connection";
|
|
|
4
4
|
import { getFailures, type Check } from "./health";
|
|
5
5
|
|
|
6
6
|
const HEARTBEAT_INTERVAL = 60_000; // 60s
|
|
7
|
+
const PG_DATA_DIRS = [
|
|
8
|
+
"/opt/homebrew/var/postgresql@18",
|
|
9
|
+
"/opt/homebrew/var/postgresql@17",
|
|
10
|
+
"/opt/homebrew/var/postgres",
|
|
11
|
+
];
|
|
7
12
|
|
|
8
13
|
let timer: ReturnType<typeof setInterval> | null = null;
|
|
9
14
|
let lastFailures: string[] = [];
|
|
10
15
|
let recoveryAttempted = false;
|
|
11
16
|
|
|
17
|
+
/** Deterministic Postgres recovery: remove stale PID file + restart service. */
|
|
18
|
+
async function recoverPostgres(): Promise<boolean> {
|
|
19
|
+
const ready = Bun.spawnSync(["pg_isready"]);
|
|
20
|
+
if (ready.exitCode === 0) return true; // already up
|
|
21
|
+
|
|
22
|
+
log.info("alive: postgres not ready, attempting deterministic recovery");
|
|
23
|
+
|
|
24
|
+
// Find and remove stale postmaster.pid
|
|
25
|
+
const { existsSync, unlinkSync, readFileSync } = await import("fs");
|
|
26
|
+
for (const dir of PG_DATA_DIRS) {
|
|
27
|
+
const pidFile = `${dir}/postmaster.pid`;
|
|
28
|
+
if (!existsSync(pidFile)) continue;
|
|
29
|
+
|
|
30
|
+
// Read the PID from line 1 and check if it's actually a postgres process
|
|
31
|
+
try {
|
|
32
|
+
const pid = parseInt(readFileSync(pidFile, "utf8").split("\n")[0], 10);
|
|
33
|
+
if (!isNaN(pid)) {
|
|
34
|
+
const check = Bun.spawnSync(["ps", "-p", String(pid), "-o", "comm="]);
|
|
35
|
+
const comm = new TextDecoder().decode(check.stdout).trim();
|
|
36
|
+
if (check.exitCode !== 0 || !comm.includes("postgres")) {
|
|
37
|
+
log.info({ pidFile, stalePid: pid, actualProcess: comm || "dead" }, "alive: removing stale postmaster.pid");
|
|
38
|
+
unlinkSync(pidFile);
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
} catch (err) {
|
|
42
|
+
log.warn({ err, pidFile }, "alive: could not inspect postmaster.pid");
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
// Restart the service
|
|
47
|
+
if (process.platform === "darwin") {
|
|
48
|
+
// Try common brew postgresql service names
|
|
49
|
+
for (const svc of ["postgresql@18", "postgresql@17", "postgresql"]) {
|
|
50
|
+
const result = Bun.spawnSync(["brew", "services", "start", svc]);
|
|
51
|
+
if (result.exitCode === 0) {
|
|
52
|
+
log.info({ service: svc }, "alive: brew service start issued");
|
|
53
|
+
break;
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
} else {
|
|
57
|
+
Bun.spawnSync(["systemctl", "start", "postgresql"]);
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
// Wait briefly for postgres to come up
|
|
61
|
+
await new Promise((r) => setTimeout(r, 3000));
|
|
62
|
+
|
|
63
|
+
const check = Bun.spawnSync(["pg_isready"]);
|
|
64
|
+
if (check.exitCode === 0) {
|
|
65
|
+
log.info("alive: postgres recovered via deterministic fix");
|
|
66
|
+
return true;
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
log.warn("alive: deterministic postgres recovery failed");
|
|
70
|
+
return false;
|
|
71
|
+
}
|
|
72
|
+
|
|
12
73
|
async function attemptDbReconnect(): Promise<boolean> {
|
|
13
74
|
try {
|
|
14
75
|
await closeDb();
|
|
@@ -138,10 +199,28 @@ async function heartbeat(): Promise<void> {
|
|
|
138
199
|
}
|
|
139
200
|
}
|
|
140
201
|
|
|
141
|
-
//
|
|
202
|
+
// Deterministic postgres recovery before LLM agent
|
|
203
|
+
if (failureNames.includes("database") && !recoveryAttempted) {
|
|
204
|
+
const pgFixed = await recoverPostgres();
|
|
205
|
+
if (pgFixed) {
|
|
206
|
+
const reconnected = await attemptDbReconnect();
|
|
207
|
+
if (reconnected) {
|
|
208
|
+
const remaining = await getFailures();
|
|
209
|
+
if (remaining.length === 0) {
|
|
210
|
+
log.info("alive: postgres recovered (deterministic fix, no LLM needed)");
|
|
211
|
+
await notifyUser("Postgres was down (stale PID). Fixed automatically — no LLM agent needed.");
|
|
212
|
+
lastFailures = [];
|
|
213
|
+
recoveryAttempted = false;
|
|
214
|
+
return;
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
// Run LLM recovery agent once per outage (fallback for non-trivial issues)
|
|
142
221
|
if (!recoveryAttempted) {
|
|
143
222
|
recoveryAttempted = true;
|
|
144
|
-
log.info({ failures: failureNames }, "alive: running recovery agent");
|
|
223
|
+
log.info({ failures: failureNames }, "alive: running LLM recovery agent");
|
|
145
224
|
|
|
146
225
|
const { recovered, report } = await runRecoveryAgent(failures);
|
|
147
226
|
|
package/src/core/daemon.ts
CHANGED
|
@@ -121,7 +121,7 @@ function waitForExit(timeoutMs: number): void {
|
|
|
121
121
|
/** Return PIDs of running daemon processes (excluding ourselves). */
|
|
122
122
|
export function findDaemonPids(): number[] {
|
|
123
123
|
try {
|
|
124
|
-
const result = Bun.spawnSync(["pgrep", "-f", "
|
|
124
|
+
const result = Bun.spawnSync(["pgrep", "-f", "src/cli\\.ts run$"]);
|
|
125
125
|
const stdout = new TextDecoder().decode(result.stdout).trim();
|
|
126
126
|
if (!stdout) return [];
|
|
127
127
|
return stdout.split("\n")
|
|
@@ -150,16 +150,19 @@ export async function runDaemon(): Promise<void> {
|
|
|
150
150
|
delete process.env.CLAUDE_CODE_ENTRYPOINT;
|
|
151
151
|
delete process.env.CLAUDE_AGENT_SDK_VERSION;
|
|
152
152
|
|
|
153
|
-
// Startup guard: if another daemon is alive, exit immediately
|
|
153
|
+
// Startup guard: if another nia daemon is alive, exit immediately.
|
|
154
|
+
// Use pgrep (via findDaemonPids) instead of kill(pid,0) to verify the
|
|
155
|
+
// PID is actually a nia process — not a recycled OS PID from something else.
|
|
154
156
|
const existingPid = readPid();
|
|
155
157
|
if (existingPid !== null && existingPid !== process.pid) {
|
|
156
|
-
|
|
157
|
-
|
|
158
|
+
const aliveDaemons = findDaemonPids();
|
|
159
|
+
if (aliveDaemons.includes(existingPid)) {
|
|
158
160
|
log.debug({ existingPid, myPid: process.pid }, "another daemon is already running, exiting");
|
|
159
161
|
process.exit(0);
|
|
160
|
-
} catch {
|
|
161
|
-
// Dead PID in pidfile — safe to take over
|
|
162
162
|
}
|
|
163
|
+
// PID in file is stale (dead or recycled by OS) — safe to take over
|
|
164
|
+
log.warn({ stalePid: existingPid }, "taking over from stale pid");
|
|
165
|
+
removePid();
|
|
163
166
|
}
|
|
164
167
|
|
|
165
168
|
// Crash handlers — ensure PID cleanup and logging on unhandled errors.
|
package/src/core/runner.ts
CHANGED
|
@@ -247,13 +247,15 @@ export async function runJob(job: JobInput, onActivity?: ActivityCallback): Prom
|
|
|
247
247
|
};
|
|
248
248
|
appendAudit(auditEntry);
|
|
249
249
|
|
|
250
|
-
state[job.name] = {
|
|
250
|
+
// Re-read state to avoid clobbering concurrent job updates
|
|
251
|
+
const freshState = { ...readState() };
|
|
252
|
+
freshState[job.name] = {
|
|
251
253
|
lastRun: timestamp,
|
|
252
254
|
status: result.status,
|
|
253
255
|
duration_ms: result.duration_ms,
|
|
254
256
|
error: result.error,
|
|
255
257
|
};
|
|
256
|
-
writeState(state);
|
|
258
|
+
writeState(freshState);
|
|
257
259
|
|
|
258
260
|
return result;
|
|
259
261
|
} catch (err) {
|
|
@@ -278,13 +280,15 @@ export async function runJob(job: JobInput, onActivity?: ActivityCallback): Prom
|
|
|
278
280
|
error: errorMsg,
|
|
279
281
|
});
|
|
280
282
|
|
|
281
|
-
state[job.name] = {
|
|
283
|
+
// Re-read state to avoid clobbering concurrent job updates
|
|
284
|
+
const freshState = { ...readState() };
|
|
285
|
+
freshState[job.name] = {
|
|
282
286
|
lastRun: timestamp,
|
|
283
287
|
status: "error",
|
|
284
288
|
duration_ms,
|
|
285
289
|
error: errorMsg,
|
|
286
290
|
};
|
|
287
|
-
writeState(state);
|
|
291
|
+
writeState(freshState);
|
|
288
292
|
|
|
289
293
|
return result;
|
|
290
294
|
}
|