niahere 0.2.29 → 0.2.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/core/alive.ts +20 -21
package/package.json
CHANGED
package/src/core/alive.ts
CHANGED
|
@@ -3,7 +3,6 @@ import { getConfig } from "../utils/config";
|
|
|
3
3
|
import { getSql, closeDb } from "../db/connection";
|
|
4
4
|
|
|
5
5
|
const HEARTBEAT_INTERVAL = 60_000; // 60s
|
|
6
|
-
const RECOVERY_THRESHOLD = 30; // 30 consecutive failures = ~30 min
|
|
7
6
|
|
|
8
7
|
let timer: ReturnType<typeof setInterval> | null = null;
|
|
9
8
|
let consecutiveFailures = 0;
|
|
@@ -22,7 +21,6 @@ async function checkDb(): Promise<boolean> {
|
|
|
22
21
|
async function attemptReconnect(): Promise<boolean> {
|
|
23
22
|
try {
|
|
24
23
|
await closeDb();
|
|
25
|
-
// getSql() will create a fresh connection on next call
|
|
26
24
|
const sql = getSql();
|
|
27
25
|
await sql`SELECT 1`;
|
|
28
26
|
return true;
|
|
@@ -72,7 +70,7 @@ async function notifyUser(message: string): Promise<void> {
|
|
|
72
70
|
log.error("alive: could not notify user — no channel available");
|
|
73
71
|
}
|
|
74
72
|
|
|
75
|
-
/**
|
|
73
|
+
/** Run an LLM recovery agent to diagnose and fix the issue. */
|
|
76
74
|
async function runRecoveryAgent(error: string): Promise<{ recovered: boolean; report: string }> {
|
|
77
75
|
try {
|
|
78
76
|
const { runJobWithClaude } = await import("./runner");
|
|
@@ -80,7 +78,7 @@ async function runRecoveryAgent(error: string): Promise<{ recovered: boolean; re
|
|
|
80
78
|
|
|
81
79
|
const systemPrompt = [
|
|
82
80
|
"You are a system recovery agent for the Nia daemon.",
|
|
83
|
-
"The database connection
|
|
81
|
+
"The database connection just failed and reconnect didn't work.",
|
|
84
82
|
"Your job: diagnose the issue, attempt to fix it, and report the outcome.",
|
|
85
83
|
"",
|
|
86
84
|
"Steps:",
|
|
@@ -96,7 +94,7 @@ async function runRecoveryAgent(error: string): Promise<{ recovered: boolean; re
|
|
|
96
94
|
"- Any recommendations",
|
|
97
95
|
].join("\n");
|
|
98
96
|
|
|
99
|
-
const jobPrompt = `Database
|
|
97
|
+
const jobPrompt = `Database is unreachable. Reconnect failed.\nError: ${error}\n\nDiagnose and fix.`;
|
|
100
98
|
|
|
101
99
|
const result = await runJobWithClaude(systemPrompt, jobPrompt, homedir());
|
|
102
100
|
const recovered = await checkDb();
|
|
@@ -119,9 +117,7 @@ async function heartbeat(): Promise<void> {
|
|
|
119
117
|
if (ok) {
|
|
120
118
|
if (consecutiveFailures > 0) {
|
|
121
119
|
log.info({ previousFailures: consecutiveFailures }, "alive: database recovered");
|
|
122
|
-
|
|
123
|
-
await notifyUser(`Database recovered after ${consecutiveFailures} minutes of downtime.`);
|
|
124
|
-
}
|
|
120
|
+
await notifyUser(`Database recovered after ~${consecutiveFailures} min of downtime.`);
|
|
125
121
|
}
|
|
126
122
|
consecutiveFailures = 0;
|
|
127
123
|
recoveryAttempted = false;
|
|
@@ -131,44 +127,47 @@ async function heartbeat(): Promise<void> {
|
|
|
131
127
|
consecutiveFailures++;
|
|
132
128
|
log.warn({ consecutiveFailures }, "alive: database unreachable");
|
|
133
129
|
|
|
134
|
-
// Try reconnect
|
|
130
|
+
// Try reconnect
|
|
135
131
|
const reconnected = await attemptReconnect();
|
|
136
132
|
if (reconnected) {
|
|
137
133
|
log.info("alive: reconnected to database");
|
|
134
|
+
if (consecutiveFailures > 1) {
|
|
135
|
+
await notifyUser(`Database reconnected after ~${consecutiveFailures} min.`);
|
|
136
|
+
}
|
|
138
137
|
consecutiveFailures = 0;
|
|
139
138
|
recoveryAttempted = false;
|
|
140
139
|
return;
|
|
141
140
|
}
|
|
142
141
|
|
|
143
|
-
//
|
|
144
|
-
if (
|
|
142
|
+
// Reconnect failed — run recovery agent immediately (once per outage)
|
|
143
|
+
if (!recoveryAttempted) {
|
|
145
144
|
recoveryAttempted = true;
|
|
146
|
-
log.info("alive:
|
|
145
|
+
log.info("alive: reconnect failed, running recovery agent");
|
|
147
146
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
147
|
+
const { recovered, report } = await runRecoveryAgent(
|
|
148
|
+
"PostgreSQL unreachable, reconnect failed after " + consecutiveFailures + " heartbeat(s)"
|
|
149
|
+
);
|
|
151
150
|
|
|
152
151
|
if (recovered) {
|
|
153
152
|
log.info("alive: recovery agent succeeded");
|
|
154
|
-
await notifyUser(`Database was down
|
|
153
|
+
await notifyUser(`Database was down. Recovery agent fixed it.\n\n${report}`);
|
|
155
154
|
consecutiveFailures = 0;
|
|
156
155
|
recoveryAttempted = false;
|
|
157
156
|
} else {
|
|
158
|
-
|
|
159
|
-
log.error("alive: recovery agent failed, notifying user");
|
|
157
|
+
log.error("alive: recovery failed, notifying user");
|
|
160
158
|
await notifyUser(
|
|
161
|
-
`Database
|
|
159
|
+
`Database is down and auto-recovery failed.\n\n` +
|
|
162
160
|
`Recovery report:\n${report}\n\n` +
|
|
163
|
-
`Run \`nia health\` to check status
|
|
161
|
+
`Run \`nia health\` to check status.`
|
|
164
162
|
);
|
|
165
163
|
}
|
|
166
164
|
}
|
|
165
|
+
// After recovery attempted: just log failures, don't spam user or agent.
|
|
166
|
+
// When DB comes back, the ok branch above will notify.
|
|
167
167
|
}
|
|
168
168
|
|
|
169
169
|
export function startAlive(): void {
|
|
170
170
|
log.info("alive started (60s heartbeat)");
|
|
171
|
-
// Initial check after a short delay (let startup finish)
|
|
172
171
|
setTimeout(heartbeat, 10_000);
|
|
173
172
|
timer = setInterval(heartbeat, HEARTBEAT_INTERVAL);
|
|
174
173
|
}
|