alvin-bot 4.15.1 → 4.15.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,36 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to Alvin Bot are documented here.
|
|
4
4
|
|
|
5
|
+
## [4.15.2] — 2026-04-17
|
|
6
|
+
|
|
7
|
+
### 🔧 Fix: sleep-aware heartbeat prevents false failover after macOS wake
|
|
8
|
+
|
|
9
|
+
**Problem:** When the Mac goes to sleep, Node.js' `setInterval` pauses completely. After waking up, the first heartbeat probe runs against a CLI + network stack that's still warming up (OAuth token refresh, DNS cache cold, TCP connections stale). The 5s `isAvailable()` timeout is too tight for post-wake latency → probe fails → 2 consecutive failures (the heartbeat fires its backlog) → auto-failover to Ollama → the bot silently answers via Gemma4 instead of Claude, sometimes for hours.
|
|
10
|
+
|
|
11
|
+
**Evidence:** Logs showed a 7-hour gap (02:02–09:14 UTC) with zero heartbeat activity — the Mac was asleep. Immediately after wake, `claude-sdk: failure 1/2` → `unhealthy` → Ollama boot. The auto-recovery logic was correct but had no chance to fire before a manual restart.
|
|
12
|
+
|
|
13
|
+
**Fix — three mechanisms in `heartbeat.ts`:**
|
|
14
|
+
|
|
15
|
+
1. **Sleep detection via wall-clock drift:** If `now - lastHeartbeatRanAt > 2× interval`, the machine was suspended. On detection:
|
|
16
|
+
- 60s grace period where probe failures don't count toward the fail threshold
|
|
17
|
+
- All stale failure counters reset to zero (pre-sleep failures are meaningless)
|
|
18
|
+
- `isAvailable()` caches invalidated (a 7-hour-old "available: false" cache must not survive wake)
|
|
19
|
+
|
|
20
|
+
2. **Quick recovery probe:** After every failover, schedule an extra heartbeat after 60s (not 5 min). If the primary is already back, recovery happens in ≤60s instead of up to 5 minutes.
|
|
21
|
+
|
|
22
|
+
3. **Cache invalidation API:** `ClaudeSDKProvider.invalidateAvailabilityCache()` exposed so the heartbeat can clear stale results after sleep.
|
|
23
|
+
|
|
24
|
+
**Typical post-sleep flow with fix:**
|
|
25
|
+
```
|
|
26
|
+
[wake] → 💓 😴 Sleep detected (~420min gap). Grace period 60s
|
|
27
|
+
→ reset claude-sdk to healthy, invalidate caches
|
|
28
|
+
[+0s] → 💓 😴 claude-sdk: probe failed during grace period → not counting
|
|
29
|
+
[+60s] → grace expired → normal probe → claude-sdk healthy ✅
|
|
30
|
+
```
|
|
31
|
+
Without the fix, the same scenario triggered failover at +0s.
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
5
35
|
## [4.15.1] — 2026-04-16
|
|
6
36
|
|
|
7
37
|
### 🩹 Patch: suppress `fallbackModel` when primary is Haiku
|
|
@@ -393,6 +393,12 @@ export class ClaudeSDKProvider {
|
|
|
393
393
|
return cache(false);
|
|
394
394
|
}
|
|
395
395
|
}
|
|
396
|
+
/** v4.15.2 β Clear the cached isAvailable() result. Called by the
|
|
397
|
+
* heartbeat service after detecting macOS sleep/wake so the first
|
|
398
|
+
* post-wake probe doesn't serve a stale "unavailable" from hours ago. */
|
|
399
|
+
invalidateAvailabilityCache() {
|
|
400
|
+
this.availabilityCache = null;
|
|
401
|
+
}
|
|
396
402
|
getInfo() {
|
|
397
403
|
const model = this.config.model === "inherit"
|
|
398
404
|
? "CLI default (latest)"
|
|
@@ -5,6 +5,12 @@
|
|
|
5
5
|
* If the primary provider fails, auto-switches to the first healthy fallback.
|
|
6
6
|
* When the primary recovers, switches back automatically.
|
|
7
7
|
*
|
|
8
|
+
* v4.15.2 β Sleep-aware: detects macOS/Linux suspend via wall-clock drift
|
|
9
|
+
* (gap between expected and actual heartbeat tick > 2Γ the interval). After
|
|
10
|
+
* wake, gives providers a grace period before counting failures, and schedules
|
|
11
|
+
* a quick recovery probe 60s after any failover so recovery doesn't wait for
|
|
12
|
+
* the full 5-minute cycle.
|
|
13
|
+
*
|
|
8
14
|
* The heartbeat provider (Groq by default) is always registered as the
|
|
9
15
|
* last-resort fallback β free, fast, reliable.
|
|
10
16
|
*/
|
|
@@ -15,6 +21,12 @@ const HEARTBEAT_INTERVAL_MS = 5 * 60 * 1000; // Check every 5 minutes
|
|
|
15
21
|
const HEARTBEAT_TIMEOUT_MS = 15_000; // 15s timeout per check
|
|
16
22
|
const FAIL_THRESHOLD = 2; // Switch after 2 consecutive failures
|
|
17
23
|
const RECOVERY_THRESHOLD = 1; // Switch back after 1 success
|
|
24
|
+
/** After detecting macOS sleep/wake, skip failure accounting for this long.
|
|
25
|
+
* Gives network, DNS, and OAuth token refresh time to settle. */
|
|
26
|
+
const POST_SLEEP_GRACE_MS = 60_000; // 60s grace after wake
|
|
27
|
+
/** After a failover, schedule an extra recovery probe after this delay
|
|
28
|
+
* instead of waiting for the full HEARTBEAT_INTERVAL_MS cycle. */
|
|
29
|
+
const QUICK_RECOVERY_DELAY_MS = 60_000; // 60s after failover β re-check
|
|
18
30
|
// Default heartbeat/fallback provider (free, no key needed for check)
|
|
19
31
|
const HEARTBEAT_PROVIDER = "groq";
|
|
20
32
|
// ββ State βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -24,6 +36,9 @@ const state = {
|
|
|
24
36
|
isRunning: false,
|
|
25
37
|
originalPrimary: "",
|
|
26
38
|
wasFailedOver: false,
|
|
39
|
+
lastRunAt: 0,
|
|
40
|
+
graceUntil: 0,
|
|
41
|
+
quickRecoveryTimer: null,
|
|
27
42
|
};
|
|
28
43
|
// ββ Public API ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
29
44
|
/**
|
|
@@ -35,8 +50,9 @@ export function startHeartbeat() {
|
|
|
35
50
|
const registry = getRegistry();
|
|
36
51
|
state.originalPrimary = registry.getActiveKey();
|
|
37
52
|
state.isRunning = true;
|
|
53
|
+
state.lastRunAt = Date.now();
|
|
54
|
+
state.graceUntil = 0;
|
|
38
55
|
// Initial health state for all providers
|
|
39
|
-
const allProviders = registry;
|
|
40
56
|
// We'll check providers in the fallback chain
|
|
41
57
|
const chain = [
|
|
42
58
|
config.primaryProvider,
|
|
@@ -66,6 +82,10 @@ export function stopHeartbeat() {
|
|
|
66
82
|
clearInterval(state.intervalId);
|
|
67
83
|
state.intervalId = null;
|
|
68
84
|
}
|
|
85
|
+
if (state.quickRecoveryTimer) {
|
|
86
|
+
clearTimeout(state.quickRecoveryTimer);
|
|
87
|
+
state.quickRecoveryTimer = null;
|
|
88
|
+
}
|
|
69
89
|
state.isRunning = false;
|
|
70
90
|
console.log("π Heartbeat monitor stopped");
|
|
71
91
|
}
|
|
@@ -97,6 +117,40 @@ export function isFailedOver() {
|
|
|
97
117
|
// ββ Internal ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
98
118
|
async function runHeartbeat() {
|
|
99
119
|
const registry = getRegistry();
|
|
120
|
+
const now = Date.now();
|
|
121
|
+
// ββ Sleep detection ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
122
|
+
// Node.js setInterval pauses during macOS/Linux suspend. If the wall-clock
|
|
123
|
+
// gap since the last tick exceeds 2Γ the interval, the machine was asleep.
|
|
124
|
+
// In that case, providers (especially CLI-based ones like claude-sdk) need
|
|
125
|
+
// time to warm up β network re-connects, OAuth tokens refresh, DNS caches
|
|
126
|
+
// re-populate. Without a grace period, the first probe after wake almost
|
|
127
|
+
// always fails, triggering a premature failover to Ollama.
|
|
128
|
+
const elapsed = now - state.lastRunAt;
|
|
129
|
+
const justWoke = state.lastRunAt > 0 && elapsed > HEARTBEAT_INTERVAL_MS * 2;
|
|
130
|
+
if (justWoke) {
|
|
131
|
+
const sleepDuration = Math.round(elapsed / 60_000);
|
|
132
|
+
console.log(`π π΄ Sleep detected (~${sleepDuration}min gap). Grace period ${POST_SLEEP_GRACE_MS / 1000}s β failures won't count.`);
|
|
133
|
+
state.graceUntil = now + POST_SLEEP_GRACE_MS;
|
|
134
|
+
// Invalidate isAvailable() caches on all providers so the first probe
|
|
135
|
+
// after wake doesn't serve a 7-hour-old cached "unavailable" result.
|
|
136
|
+
for (const [key] of state.providers) {
|
|
137
|
+
const provider = registry.get(key);
|
|
138
|
+
if (provider && typeof provider.invalidateAvailabilityCache === "function") {
|
|
139
|
+
provider.invalidateAvailabilityCache();
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
// Reset fail counters β stale failures from before sleep are meaningless.
|
|
143
|
+
for (const [, health] of state.providers) {
|
|
144
|
+
if (!health.healthy) {
|
|
145
|
+
health.failCount = 0;
|
|
146
|
+
health.healthy = true;
|
|
147
|
+
console.log(`π π΄ Reset ${health.key} to healthy (post-sleep clean slate)`);
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
state.lastRunAt = now;
|
|
152
|
+
const inGracePeriod = now < state.graceUntil;
|
|
153
|
+
// ββ Provider health checks βββββββββββββββββββββββββββββββββββββββββββββ
|
|
100
154
|
for (const [key, health] of state.providers) {
|
|
101
155
|
const provider = registry.get(key);
|
|
102
156
|
if (!provider)
|
|
@@ -140,10 +194,17 @@ async function runHeartbeat() {
|
|
|
140
194
|
health.failCount = 0;
|
|
141
195
|
}
|
|
142
196
|
catch (err) {
|
|
143
|
-
health.failCount++;
|
|
144
197
|
health.lastLatencyMs = Date.now() - start;
|
|
145
198
|
health.lastCheck = Date.now();
|
|
146
199
|
health.lastError = err instanceof Error ? err.message : String(err);
|
|
200
|
+
// During the post-sleep grace period, log the failure but don't
|
|
201
|
+
// increment the counter β transient post-wake unavailability is
|
|
202
|
+
// expected and should not trigger a failover.
|
|
203
|
+
if (inGracePeriod) {
|
|
204
|
+
console.log(`π π΄ ${key}: probe failed during grace period (${health.lastError}) β not counting`);
|
|
205
|
+
continue;
|
|
206
|
+
}
|
|
207
|
+
health.failCount++;
|
|
147
208
|
if (health.failCount >= FAIL_THRESHOLD) {
|
|
148
209
|
health.healthy = false;
|
|
149
210
|
console.log(`π β ${key}: unhealthy (${health.failCount} failures: ${health.lastError})`);
|
|
@@ -206,6 +267,10 @@ async function handleFailover(registry) {
|
|
|
206
267
|
}
|
|
207
268
|
registry.switchTo(fbKey);
|
|
208
269
|
state.wasFailedOver = true;
|
|
270
|
+
// v4.15.2 β Schedule a quick recovery probe so we don't sit on
|
|
271
|
+
// the fallback for a full 5 minutes when the primary might already
|
|
272
|
+
// be back. Clear any previous pending timer first.
|
|
273
|
+
scheduleQuickRecovery();
|
|
209
274
|
return;
|
|
210
275
|
}
|
|
211
276
|
console.log("π β οΈ All providers unhealthy β staying on primary");
|
|
@@ -224,3 +289,21 @@ async function handleFailover(registry) {
|
|
|
224
289
|
}
|
|
225
290
|
}
|
|
226
291
|
}
|
|
292
|
+
/**
|
|
293
|
+
* Schedule an extra heartbeat probe after QUICK_RECOVERY_DELAY_MS. This runs
|
|
294
|
+
* in addition to the regular 5-minute interval β its sole purpose is to detect
|
|
295
|
+
* primary recovery quickly after a failover instead of waiting up to 5 minutes.
|
|
296
|
+
*/
|
|
297
|
+
function scheduleQuickRecovery() {
|
|
298
|
+
if (state.quickRecoveryTimer) {
|
|
299
|
+
clearTimeout(state.quickRecoveryTimer);
|
|
300
|
+
}
|
|
301
|
+
console.log(`π β±οΈ Quick recovery probe scheduled in ${QUICK_RECOVERY_DELAY_MS / 1000}s`);
|
|
302
|
+
state.quickRecoveryTimer = setTimeout(async () => {
|
|
303
|
+
state.quickRecoveryTimer = null;
|
|
304
|
+
if (!state.wasFailedOver || !state.isRunning)
|
|
305
|
+
return;
|
|
306
|
+
console.log("π β±οΈ Quick recovery probe firingβ¦");
|
|
307
|
+
await runHeartbeat();
|
|
308
|
+
}, QUICK_RECOVERY_DELAY_MS);
|
|
309
|
+
}
|