niahere 0.2.29 → 0.2.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/src/core/alive.ts +20 -21
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "niahere",
3
- "version": "0.2.29",
3
+ "version": "0.2.30",
4
4
  "description": "A personal AI assistant daemon — scheduled jobs, chat across Telegram and Slack, persona system, and visual identity.",
5
5
  "type": "module",
6
6
  "scripts": {
package/src/core/alive.ts CHANGED
@@ -3,7 +3,6 @@ import { getConfig } from "../utils/config";
3
3
  import { getSql, closeDb } from "../db/connection";
4
4
 
5
5
  const HEARTBEAT_INTERVAL = 60_000; // 60s
6
- const RECOVERY_THRESHOLD = 30; // 30 consecutive failures = ~30 min
7
6
 
8
7
  let timer: ReturnType<typeof setInterval> | null = null;
9
8
  let consecutiveFailures = 0;
@@ -22,7 +21,6 @@ async function checkDb(): Promise<boolean> {
22
21
  async function attemptReconnect(): Promise<boolean> {
23
22
  try {
24
23
  await closeDb();
25
- // getSql() will create a fresh connection on next call
26
24
  const sql = getSql();
27
25
  await sql`SELECT 1`;
28
26
  return true;
@@ -72,7 +70,7 @@ async function notifyUser(message: string): Promise<void> {
72
70
  log.error("alive: could not notify user — no channel available");
73
71
  }
74
72
 
75
- /** Layer 1: Run an LLM recovery agent to diagnose and fix the issue. */
73
+ /** Run an LLM recovery agent to diagnose and fix the issue. */
76
74
  async function runRecoveryAgent(error: string): Promise<{ recovered: boolean; report: string }> {
77
75
  try {
78
76
  const { runJobWithClaude } = await import("./runner");
@@ -80,7 +78,7 @@ async function runRecoveryAgent(error: string): Promise<{ recovered: boolean; re
80
78
 
81
79
  const systemPrompt = [
82
80
  "You are a system recovery agent for the Nia daemon.",
83
- "The database connection has been failing for 30+ minutes.",
81
+ "The database connection just failed and reconnect didn't work.",
84
82
  "Your job: diagnose the issue, attempt to fix it, and report the outcome.",
85
83
  "",
86
84
  "Steps:",
@@ -96,7 +94,7 @@ async function runRecoveryAgent(error: string): Promise<{ recovered: boolean; re
96
94
  "- Any recommendations",
97
95
  ].join("\n");
98
96
 
99
- const jobPrompt = `Database has been unreachable for 30+ minutes.\nLast error: ${error}\n\nDiagnose and fix if possible.`;
97
+ const jobPrompt = `Database is unreachable. Reconnect failed.\nError: ${error}\n\nDiagnose and fix.`;
100
98
 
101
99
  const result = await runJobWithClaude(systemPrompt, jobPrompt, homedir());
102
100
  const recovered = await checkDb();
@@ -119,9 +117,7 @@ async function heartbeat(): Promise<void> {
119
117
  if (ok) {
120
118
  if (consecutiveFailures > 0) {
121
119
  log.info({ previousFailures: consecutiveFailures }, "alive: database recovered");
122
- if (consecutiveFailures >= RECOVERY_THRESHOLD) {
123
- await notifyUser(`Database recovered after ${consecutiveFailures} minutes of downtime.`);
124
- }
120
+ await notifyUser(`Database recovered after ~${consecutiveFailures} min of downtime.`);
125
121
  }
126
122
  consecutiveFailures = 0;
127
123
  recoveryAttempted = false;
@@ -131,44 +127,47 @@ async function heartbeat(): Promise<void> {
131
127
  consecutiveFailures++;
132
128
  log.warn({ consecutiveFailures }, "alive: database unreachable");
133
129
 
134
- // Try reconnect on every failure
130
+ // Try reconnect
135
131
  const reconnected = await attemptReconnect();
136
132
  if (reconnected) {
137
133
  log.info("alive: reconnected to database");
134
+ if (consecutiveFailures > 1) {
135
+ await notifyUser(`Database reconnected after ~${consecutiveFailures} min.`);
136
+ }
138
137
  consecutiveFailures = 0;
139
138
  recoveryAttempted = false;
140
139
  return;
141
140
  }
142
141
 
143
- // After threshold, trigger recovery (once)
144
- if (consecutiveFailures >= RECOVERY_THRESHOLD && !recoveryAttempted) {
142
+ // Reconnect failed run recovery agent immediately (once per outage)
143
+ if (!recoveryAttempted) {
145
144
  recoveryAttempted = true;
146
- log.info("alive: triggering recovery after " + consecutiveFailures + " failures");
145
+ log.info("alive: reconnect failed, running recovery agent");
147
146
 
148
- // Layer 1: LLM recovery agent
149
- const lastError = "PostgreSQL unreachable after " + consecutiveFailures + " consecutive heartbeat failures";
150
- const { recovered, report } = await runRecoveryAgent(lastError);
147
+ const { recovered, report } = await runRecoveryAgent(
148
+ "PostgreSQL unreachable, reconnect failed after " + consecutiveFailures + " heartbeat(s)"
149
+ );
151
150
 
152
151
  if (recovered) {
153
152
  log.info("alive: recovery agent succeeded");
154
- await notifyUser(`Database was down for ~${consecutiveFailures} min. Recovery agent fixed it.\n\n${report}`);
153
+ await notifyUser(`Database was down. Recovery agent fixed it.\n\n${report}`);
155
154
  consecutiveFailures = 0;
156
155
  recoveryAttempted = false;
157
156
  } else {
158
- // Layer 2: Direct notification
159
- log.error("alive: recovery agent failed, notifying user");
157
+ log.error("alive: recovery failed, notifying user");
160
158
  await notifyUser(
161
- `Database has been down for ~${consecutiveFailures} min and auto-recovery failed.\n\n` +
159
+ `Database is down and auto-recovery failed.\n\n` +
162
160
  `Recovery report:\n${report}\n\n` +
163
- `Run \`nia health\` to check status. You may need to restart PostgreSQL manually.`
161
+ `Run \`nia health\` to check status.`
164
162
  );
165
163
  }
166
164
  }
165
+ // After recovery attempted: just log failures, don't spam user or agent.
166
+ // When DB comes back, the ok branch above will notify.
167
167
  }
168
168
 
169
169
  export function startAlive(): void {
170
170
  log.info("alive started (60s heartbeat)");
171
- // Initial check after a short delay (let startup finish)
172
171
  setTimeout(heartbeat, 10_000);
173
172
  timer = setInterval(heartbeat, HEARTBEAT_INTERVAL);
174
173
  }