alvin-bot 4.15.1 → 4.15.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,36 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to Alvin Bot are documented here.
|
|
4
4
|
|
|
5
|
+
## [4.15.2] — 2026-04-17
|
|
6
|
+
|
|
7
|
+
### 🔧 Fix: sleep-aware heartbeat prevents false failover after macOS wake
|
|
8
|
+
|
|
9
|
+
**Problem:** When the Mac goes to sleep, Node.js' `setInterval` pauses completely. After waking up, the first heartbeat probe runs against a CLI + network stack that's still warming up (OAuth token refresh, DNS cache cold, TCP connections stale). The 5s `isAvailable()` timeout is too tight for post-wake latency → probe fails → 2 consecutive failures (the heartbeat fires its backlog) → auto-failover to Ollama → the bot silently answers via Gemma4 instead of Claude, sometimes for hours.
|
|
10
|
+
|
|
11
|
+
**Evidence:** Logs showed a 7-hour gap (02:02–09:14 UTC) with zero heartbeat activity — the Mac was asleep. Immediately after wake, `claude-sdk: failure 1/2` → `unhealthy` → Ollama boot. The auto-recovery logic was correct but had no chance to fire before a manual restart.
|
|
12
|
+
|
|
13
|
+
**Fix — three mechanisms in `heartbeat.ts`:**
|
|
14
|
+
|
|
15
|
+
1. **Sleep detection via wall-clock drift:** If `now - lastHeartbeatRanAt > 2× interval`, the machine was suspended. On detection:
|
|
16
|
+
- 60s grace period where probe failures don't count toward the fail threshold
|
|
17
|
+
- All stale failure counters reset to zero (pre-sleep failures are meaningless)
|
|
18
|
+
- `isAvailable()` caches invalidated (a 7-hour-old "available: false" cache must not survive wake)
|
|
19
|
+
|
|
20
|
+
2. **Quick recovery probe:** After every failover, schedule an extra heartbeat after 60s (not 5 min). If the primary is already back, recovery happens in ≤60s instead of up to 5 minutes.
|
|
21
|
+
|
|
22
|
+
3. **Cache invalidation API:** `ClaudeSDKProvider.invalidateAvailabilityCache()` exposed so the heartbeat can clear stale results after sleep.
|
|
23
|
+
|
|
24
|
+
**Typical post-sleep flow with fix:**
|
|
25
|
+
```
|
|
26
|
+
[wake] → 💓 😴 Sleep detected (~420min gap). Grace period 60s
|
|
27
|
+
→ reset claude-sdk to healthy, invalidate caches
|
|
28
|
+
[+0s] → 💓 😴 claude-sdk: probe failed during grace period → not counting
|
|
29
|
+
[+60s] → grace expired → normal probe → claude-sdk healthy ✅
|
|
30
|
+
```
|
|
31
|
+
Without the fix, the same scenario triggered failover at +0s.
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
5
35
|
## [4.15.1] — 2026-04-16
|
|
6
36
|
|
|
7
37
|
### 🩹 Patch: suppress `fallbackModel` when primary is Haiku
|
|
@@ -393,6 +393,12 @@ export class ClaudeSDKProvider {
|
|
|
393
393
|
return cache(false);
|
|
394
394
|
}
|
|
395
395
|
}
|
|
396
|
+
/** v4.15.2 β Clear the cached isAvailable() result. Called by the
|
|
397
|
+
* heartbeat service after detecting macOS sleep/wake so the first
|
|
398
|
+
* post-wake probe doesn't serve a stale "unavailable" from hours ago. */
|
|
399
|
+
invalidateAvailabilityCache() {
|
|
400
|
+
this.availabilityCache = null;
|
|
401
|
+
}
|
|
396
402
|
getInfo() {
|
|
397
403
|
const model = this.config.model === "inherit"
|
|
398
404
|
? "CLI default (latest)"
|
|
@@ -5,6 +5,12 @@
|
|
|
5
5
|
* If the primary provider fails, auto-switches to the first healthy fallback.
|
|
6
6
|
* When the primary recovers, switches back automatically.
|
|
7
7
|
*
|
|
8
|
+
* v4.15.2 β Sleep-aware: detects macOS/Linux suspend via wall-clock drift
|
|
9
|
+
* (gap between expected and actual heartbeat tick > 2Γ the interval). After
|
|
10
|
+
* wake, gives providers a grace period before counting failures, and schedules
|
|
11
|
+
* a quick recovery probe 60s after any failover so recovery doesn't wait for
|
|
12
|
+
* the full 5-minute cycle.
|
|
13
|
+
*
|
|
8
14
|
* The heartbeat provider (Groq by default) is always registered as the
|
|
9
15
|
* last-resort fallback β free, fast, reliable.
|
|
10
16
|
*/
|
|
@@ -15,6 +21,12 @@ const HEARTBEAT_INTERVAL_MS = 5 * 60 * 1000; // Check every 5 minutes
|
|
|
15
21
|
const HEARTBEAT_TIMEOUT_MS = 15_000; // 15s timeout per check
|
|
16
22
|
const FAIL_THRESHOLD = 2; // Switch after 2 consecutive failures
|
|
17
23
|
const RECOVERY_THRESHOLD = 1; // Switch back after 1 success
|
|
24
|
+
/** After detecting macOS sleep/wake, skip failure accounting for this long.
|
|
25
|
+
* Gives network, DNS, and OAuth token refresh time to settle. */
|
|
26
|
+
const POST_SLEEP_GRACE_MS = 60_000; // 60s grace after wake
|
|
27
|
+
/** After a failover, schedule an extra recovery probe after this delay
|
|
28
|
+
* instead of waiting for the full HEARTBEAT_INTERVAL_MS cycle. */
|
|
29
|
+
const QUICK_RECOVERY_DELAY_MS = 60_000; // 60s after failover β re-check
|
|
18
30
|
// Default heartbeat/fallback provider (free, no key needed for check)
|
|
19
31
|
const HEARTBEAT_PROVIDER = "groq";
|
|
20
32
|
// ββ State βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -24,6 +36,9 @@ const state = {
|
|
|
24
36
|
isRunning: false,
|
|
25
37
|
originalPrimary: "",
|
|
26
38
|
wasFailedOver: false,
|
|
39
|
+
lastRunAt: 0,
|
|
40
|
+
graceUntil: 0,
|
|
41
|
+
quickRecoveryTimer: null,
|
|
27
42
|
};
|
|
28
43
|
// ββ Public API ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
29
44
|
/**
|
|
@@ -35,8 +50,9 @@ export function startHeartbeat() {
|
|
|
35
50
|
const registry = getRegistry();
|
|
36
51
|
state.originalPrimary = registry.getActiveKey();
|
|
37
52
|
state.isRunning = true;
|
|
53
|
+
state.lastRunAt = Date.now();
|
|
54
|
+
state.graceUntil = 0;
|
|
38
55
|
// Initial health state for all providers
|
|
39
|
-
const allProviders = registry;
|
|
40
56
|
// We'll check providers in the fallback chain
|
|
41
57
|
const chain = [
|
|
42
58
|
config.primaryProvider,
|
|
@@ -66,6 +82,10 @@ export function stopHeartbeat() {
|
|
|
66
82
|
clearInterval(state.intervalId);
|
|
67
83
|
state.intervalId = null;
|
|
68
84
|
}
|
|
85
|
+
if (state.quickRecoveryTimer) {
|
|
86
|
+
clearTimeout(state.quickRecoveryTimer);
|
|
87
|
+
state.quickRecoveryTimer = null;
|
|
88
|
+
}
|
|
69
89
|
state.isRunning = false;
|
|
70
90
|
console.log("π Heartbeat monitor stopped");
|
|
71
91
|
}
|
|
@@ -97,6 +117,40 @@ export function isFailedOver() {
|
|
|
97
117
|
// ββ Internal ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
98
118
|
async function runHeartbeat() {
|
|
99
119
|
const registry = getRegistry();
|
|
120
|
+
const now = Date.now();
|
|
121
|
+
// ββ Sleep detection ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
122
|
+
// Node.js setInterval pauses during macOS/Linux suspend. If the wall-clock
|
|
123
|
+
// gap since the last tick exceeds 2Γ the interval, the machine was asleep.
|
|
124
|
+
// In that case, providers (especially CLI-based ones like claude-sdk) need
|
|
125
|
+
// time to warm up β network re-connects, OAuth tokens refresh, DNS caches
|
|
126
|
+
// re-populate. Without a grace period, the first probe after wake almost
|
|
127
|
+
// always fails, triggering a premature failover to Ollama.
|
|
128
|
+
const elapsed = now - state.lastRunAt;
|
|
129
|
+
const justWoke = state.lastRunAt > 0 && elapsed > HEARTBEAT_INTERVAL_MS * 2;
|
|
130
|
+
if (justWoke) {
|
|
131
|
+
const sleepDuration = Math.round(elapsed / 60_000);
|
|
132
|
+
console.log(`π π΄ Sleep detected (~${sleepDuration}min gap). Grace period ${POST_SLEEP_GRACE_MS / 1000}s β failures won't count.`);
|
|
133
|
+
state.graceUntil = now + POST_SLEEP_GRACE_MS;
|
|
134
|
+
// Invalidate isAvailable() caches on all providers so the first probe
|
|
135
|
+
// after wake doesn't serve a 7-hour-old cached "unavailable" result.
|
|
136
|
+
for (const [key] of state.providers) {
|
|
137
|
+
const provider = registry.get(key);
|
|
138
|
+
if (provider && typeof provider.invalidateAvailabilityCache === "function") {
|
|
139
|
+
provider.invalidateAvailabilityCache();
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
// Reset fail counters β stale failures from before sleep are meaningless.
|
|
143
|
+
for (const [, health] of state.providers) {
|
|
144
|
+
if (!health.healthy) {
|
|
145
|
+
health.failCount = 0;
|
|
146
|
+
health.healthy = true;
|
|
147
|
+
console.log(`π π΄ Reset ${health.key} to healthy (post-sleep clean slate)`);
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
state.lastRunAt = now;
|
|
152
|
+
const inGracePeriod = now < state.graceUntil;
|
|
153
|
+
// ββ Provider health checks βββββββββββββββββββββββββββββββββββββββββββββ
|
|
100
154
|
for (const [key, health] of state.providers) {
|
|
101
155
|
const provider = registry.get(key);
|
|
102
156
|
if (!provider)
|
|
@@ -140,10 +194,17 @@ async function runHeartbeat() {
|
|
|
140
194
|
health.failCount = 0;
|
|
141
195
|
}
|
|
142
196
|
catch (err) {
|
|
143
|
-
health.failCount++;
|
|
144
197
|
health.lastLatencyMs = Date.now() - start;
|
|
145
198
|
health.lastCheck = Date.now();
|
|
146
199
|
health.lastError = err instanceof Error ? err.message : String(err);
|
|
200
|
+
// During the post-sleep grace period, log the failure but don't
|
|
201
|
+
// increment the counter β transient post-wake unavailability is
|
|
202
|
+
// expected and should not trigger a failover.
|
|
203
|
+
if (inGracePeriod) {
|
|
204
|
+
console.log(`π π΄ ${key}: probe failed during grace period (${health.lastError}) β not counting`);
|
|
205
|
+
continue;
|
|
206
|
+
}
|
|
207
|
+
health.failCount++;
|
|
147
208
|
if (health.failCount >= FAIL_THRESHOLD) {
|
|
148
209
|
health.healthy = false;
|
|
149
210
|
console.log(`π β ${key}: unhealthy (${health.failCount} failures: ${health.lastError})`);
|
|
@@ -206,6 +267,10 @@ async function handleFailover(registry) {
|
|
|
206
267
|
}
|
|
207
268
|
registry.switchTo(fbKey);
|
|
208
269
|
state.wasFailedOver = true;
|
|
270
|
+
// v4.15.2 β Schedule a quick recovery probe so we don't sit on
|
|
271
|
+
// the fallback for a full 5 minutes when the primary might already
|
|
272
|
+
// be back. Clear any previous pending timer first.
|
|
273
|
+
scheduleQuickRecovery();
|
|
209
274
|
return;
|
|
210
275
|
}
|
|
211
276
|
console.log("π β οΈ All providers unhealthy β staying on primary");
|
|
@@ -224,3 +289,21 @@ async function handleFailover(registry) {
|
|
|
224
289
|
}
|
|
225
290
|
}
|
|
226
291
|
}
|
|
292
|
+
/**
|
|
293
|
+
* Schedule an extra heartbeat probe after QUICK_RECOVERY_DELAY_MS. This runs
|
|
294
|
+
* in addition to the regular 5-minute interval β its sole purpose is to detect
|
|
295
|
+
* primary recovery quickly after a failover instead of waiting up to 5 minutes.
|
|
296
|
+
*/
|
|
297
|
+
function scheduleQuickRecovery() {
|
|
298
|
+
if (state.quickRecoveryTimer) {
|
|
299
|
+
clearTimeout(state.quickRecoveryTimer);
|
|
300
|
+
}
|
|
301
|
+
console.log(`π β±οΈ Quick recovery probe scheduled in ${QUICK_RECOVERY_DELAY_MS / 1000}s`);
|
|
302
|
+
state.quickRecoveryTimer = setTimeout(async () => {
|
|
303
|
+
state.quickRecoveryTimer = null;
|
|
304
|
+
if (!state.wasFailedOver || !state.isRunning)
|
|
305
|
+
return;
|
|
306
|
+
console.log("π β±οΈ Quick recovery probe firingβ¦");
|
|
307
|
+
await runHeartbeat();
|
|
308
|
+
}, QUICK_RECOVERY_DELAY_MS);
|
|
309
|
+
}
|