alvin-bot 4.15.1 → 4.15.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,36 @@
2
2
 
3
3
  All notable changes to Alvin Bot are documented here.
4
4
 
5
+ ## [4.15.2] — 2026-04-17
6
+
7
+ ### 🐛 Fix: sleep-aware heartbeat prevents false failover after macOS wake
8
+
9
+ **Problem:** When the Mac goes to sleep, Node.js' `setInterval` pauses completely. After waking up, the first heartbeat probe runs against a CLI + network stack that's still warming up (OAuth token refresh, DNS cache cold, TCP connections stale). The 5s `isAvailable()` timeout is too tight for post-wake latency → probe fails → 2 consecutive failures (the heartbeat fires its backlog) → auto-failover to Ollama → the bot silently answers via Gemma4 instead of Claude, sometimes for hours.
10
+
11
+ **Evidence:** Logs showed a 7-hour gap (02:02–09:14 UTC) with zero heartbeat activity — the Mac was asleep. Immediately after wake, `claude-sdk: failure 1/2` → `unhealthy` → Ollama boot. The auto-recovery logic was correct but had no chance to fire before a manual restart.
12
+
13
+ **Fix — three mechanisms in `heartbeat.ts`:**
14
+
15
+ 1. **Sleep detection via wall-clock drift:** If `now - lastHeartbeatRanAt > 2× interval`, the machine was suspended. On detection:
16
+ - 60s grace period where probe failures don't count toward the fail threshold
17
+ - All stale failure counters reset to zero (pre-sleep failures are meaningless)
18
+ - `isAvailable()` caches invalidated (a 7-hour-old "available: false" cache must not survive wake)
19
+
20
+ 2. **Quick recovery probe:** After every failover, schedule an extra heartbeat after 60s (not 5 min). If the primary is already back, recovery happens in ≤60s instead of up to 5 minutes.
21
+
22
+ 3. **Cache invalidation API:** `ClaudeSDKProvider.invalidateAvailabilityCache()` exposed so the heartbeat can clear stale results after sleep.
23
+
24
+ **Typical post-sleep flow with fix:**
25
+ ```
26
+ [wake] → 💓 😴 Sleep detected (~420min gap). Grace period 60s
27
+ → reset claude-sdk to healthy, invalidate caches
28
+ [+0s] → 💓 😴 claude-sdk: probe failed during grace period — not counting
29
+ [+60s] → grace expired → normal probe → claude-sdk healthy ✅
30
+ ```
31
+ Without the fix, the same scenario triggered failover at +0s.
32
+
33
+ ---
34
+
5
35
  ## [4.15.1] — 2026-04-16
6
36
 
7
37
  ### 🐛 Patch: suppress `fallbackModel` when primary is Haiku
@@ -393,6 +393,12 @@ export class ClaudeSDKProvider {
393
393
  return cache(false);
394
394
  }
395
395
  }
396
+ /** v4.15.2 β€” Clear the cached isAvailable() result. Called by the
397
+ * heartbeat service after detecting macOS sleep/wake so the first
398
+ * post-wake probe doesn't serve a stale "unavailable" from hours ago. */
399
+ invalidateAvailabilityCache() {
400
+ this.availabilityCache = null;
401
+ }
396
402
  getInfo() {
397
403
  const model = this.config.model === "inherit"
398
404
  ? "CLI default (latest)"
@@ -5,6 +5,12 @@
5
5
  * If the primary provider fails, auto-switches to the first healthy fallback.
6
6
  * When the primary recovers, switches back automatically.
7
7
  *
8
+ * v4.15.2 — Sleep-aware: detects macOS/Linux suspend via wall-clock drift
9
+ * (gap between expected and actual heartbeat tick > 2× the interval). After
10
+ * wake, gives providers a grace period before counting failures, and schedules
11
+ * a quick recovery probe 60s after any failover so recovery doesn't wait for
12
+ * the full 5-minute cycle.
13
+ *
8
14
  * The heartbeat provider (Groq by default) is always registered as the
9
15
  * last-resort fallback — free, fast, reliable.
10
16
  */
@@ -15,6 +21,12 @@ const HEARTBEAT_INTERVAL_MS = 5 * 60 * 1000; // Check every 5 minutes
15
21
  const HEARTBEAT_TIMEOUT_MS = 15_000; // 15s timeout per check
16
22
  const FAIL_THRESHOLD = 2; // Switch after 2 consecutive failures
17
23
  const RECOVERY_THRESHOLD = 1; // Switch back after 1 success
24
+ /** After detecting macOS sleep/wake, skip failure accounting for this long.
25
+ * Gives network, DNS, and OAuth token refresh time to settle. */
26
+ const POST_SLEEP_GRACE_MS = 60_000; // 60s grace after wake
27
+ /** After a failover, schedule an extra recovery probe after this delay
28
+ * instead of waiting for the full HEARTBEAT_INTERVAL_MS cycle. */
29
+ const QUICK_RECOVERY_DELAY_MS = 60_000; // 60s after failover β†’ re-check
18
30
  // Default heartbeat/fallback provider (free, no key needed for check)
19
31
  const HEARTBEAT_PROVIDER = "groq";
20
32
  // ── State ───────────────────────────────────────────────────────────────────
@@ -24,6 +36,9 @@ const state = {
24
36
  isRunning: false,
25
37
  originalPrimary: "",
26
38
  wasFailedOver: false,
39
+ lastRunAt: 0,
40
+ graceUntil: 0,
41
+ quickRecoveryTimer: null,
27
42
  };
28
43
  // ── Public API ──────────────────────────────────────────────────────────────
29
44
  /**
@@ -35,8 +50,9 @@ export function startHeartbeat() {
35
50
  const registry = getRegistry();
36
51
  state.originalPrimary = registry.getActiveKey();
37
52
  state.isRunning = true;
53
+ state.lastRunAt = Date.now();
54
+ state.graceUntil = 0;
38
55
  // Initial health state for all providers
39
- const allProviders = registry;
40
56
  // We'll check providers in the fallback chain
41
57
  const chain = [
42
58
  config.primaryProvider,
@@ -66,6 +82,10 @@ export function stopHeartbeat() {
66
82
  clearInterval(state.intervalId);
67
83
  state.intervalId = null;
68
84
  }
85
+ if (state.quickRecoveryTimer) {
86
+ clearTimeout(state.quickRecoveryTimer);
87
+ state.quickRecoveryTimer = null;
88
+ }
69
89
  state.isRunning = false;
70
90
  console.log("πŸ’“ Heartbeat monitor stopped");
71
91
  }
@@ -97,6 +117,40 @@ export function isFailedOver() {
97
117
  // ── Internal ────────────────────────────────────────────────────────────────
98
118
  async function runHeartbeat() {
99
119
  const registry = getRegistry();
120
+ const now = Date.now();
121
+ // ── Sleep detection ────────────────────────────────────────────────────
122
+ // Node.js setInterval pauses during macOS/Linux suspend. If the wall-clock
123
+ // gap since the last tick exceeds 2Γ— the interval, the machine was asleep.
124
+ // In that case, providers (especially CLI-based ones like claude-sdk) need
125
+ // time to warm up β€” network re-connects, OAuth tokens refresh, DNS caches
126
+ // re-populate. Without a grace period, the first probe after wake almost
127
+ // always fails, triggering a premature failover to Ollama.
128
+ const elapsed = now - state.lastRunAt;
129
+ const justWoke = state.lastRunAt > 0 && elapsed > HEARTBEAT_INTERVAL_MS * 2;
130
+ if (justWoke) {
131
+ const sleepDuration = Math.round(elapsed / 60_000);
132
+ console.log(`πŸ’“ 😴 Sleep detected (~${sleepDuration}min gap). Grace period ${POST_SLEEP_GRACE_MS / 1000}s β€” failures won't count.`);
133
+ state.graceUntil = now + POST_SLEEP_GRACE_MS;
134
+ // Invalidate isAvailable() caches on all providers so the first probe
135
+ // after wake doesn't serve a 7-hour-old cached "unavailable" result.
136
+ for (const [key] of state.providers) {
137
+ const provider = registry.get(key);
138
+ if (provider && typeof provider.invalidateAvailabilityCache === "function") {
139
+ provider.invalidateAvailabilityCache();
140
+ }
141
+ }
142
+ // Reset fail counters β€” stale failures from before sleep are meaningless.
143
+ for (const [, health] of state.providers) {
144
+ if (!health.healthy) {
145
+ health.failCount = 0;
146
+ health.healthy = true;
147
+ console.log(`πŸ’“ 😴 Reset ${health.key} to healthy (post-sleep clean slate)`);
148
+ }
149
+ }
150
+ }
151
+ state.lastRunAt = now;
152
+ const inGracePeriod = now < state.graceUntil;
153
+ // ── Provider health checks ─────────────────────────────────────────────
100
154
  for (const [key, health] of state.providers) {
101
155
  const provider = registry.get(key);
102
156
  if (!provider)
@@ -140,10 +194,17 @@ async function runHeartbeat() {
140
194
  health.failCount = 0;
141
195
  }
142
196
  catch (err) {
143
- health.failCount++;
144
197
  health.lastLatencyMs = Date.now() - start;
145
198
  health.lastCheck = Date.now();
146
199
  health.lastError = err instanceof Error ? err.message : String(err);
200
+ // During the post-sleep grace period, log the failure but don't
201
+ // increment the counter β€” transient post-wake unavailability is
202
+ // expected and should not trigger a failover.
203
+ if (inGracePeriod) {
204
+ console.log(`πŸ’“ 😴 ${key}: probe failed during grace period (${health.lastError}) β€” not counting`);
205
+ continue;
206
+ }
207
+ health.failCount++;
147
208
  if (health.failCount >= FAIL_THRESHOLD) {
148
209
  health.healthy = false;
149
210
  console.log(`πŸ’“ ❌ ${key}: unhealthy (${health.failCount} failures: ${health.lastError})`);
@@ -206,6 +267,10 @@ async function handleFailover(registry) {
206
267
  }
207
268
  registry.switchTo(fbKey);
208
269
  state.wasFailedOver = true;
270
+ // v4.15.2 β€” Schedule a quick recovery probe so we don't sit on
271
+ // the fallback for a full 5 minutes when the primary might already
272
+ // be back. Clear any previous pending timer first.
273
+ scheduleQuickRecovery();
209
274
  return;
210
275
  }
211
276
  console.log("πŸ’“ ⚠️ All providers unhealthy β€” staying on primary");
@@ -224,3 +289,21 @@ async function handleFailover(registry) {
224
289
  }
225
290
  }
226
291
  }
292
+ /**
293
+ * Schedule an extra heartbeat probe after QUICK_RECOVERY_DELAY_MS. This runs
294
+ * in addition to the regular 5-minute interval β€” its sole purpose is to detect
295
+ * primary recovery quickly after a failover instead of waiting up to 5 minutes.
296
+ */
297
+ function scheduleQuickRecovery() {
298
+ if (state.quickRecoveryTimer) {
299
+ clearTimeout(state.quickRecoveryTimer);
300
+ }
301
+ console.log(`πŸ’“ ⏱️ Quick recovery probe scheduled in ${QUICK_RECOVERY_DELAY_MS / 1000}s`);
302
+ state.quickRecoveryTimer = setTimeout(async () => {
303
+ state.quickRecoveryTimer = null;
304
+ if (!state.wasFailedOver || !state.isRunning)
305
+ return;
306
+ console.log("πŸ’“ ⏱️ Quick recovery probe firing…");
307
+ await runHeartbeat();
308
+ }, QUICK_RECOVERY_DELAY_MS);
309
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "alvin-bot",
3
- "version": "4.15.1",
3
+ "version": "4.15.2",
4
4
  "description": "Alvin Bot \u2014 Your personal AI agent on Telegram, WhatsApp, Discord, Signal, and Web.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",