@swarmclawai/swarmclaw 1.5.65 → 1.5.66

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -399,6 +399,14 @@ Operational docs: https://swarmclaw.ai/docs/observability
 
  ## Releases
 
+ ### v1.5.66 Highlights
+
+ Fixes a runaway-token-burn bug in the orchestrator-wake and heartbeat loops. The root cause was hidden in the success/failure classification: a session run can resolve its promise successfully while still carrying an `error` on the result (e.g. a provider 429 swallowed into persisted output), and the wake trackers only incremented their failure counters on a rejected promise. So the backoff never engaged, the auto-disable-after-N-failures gate never tripped, and the wake kept firing at its configured interval indefinitely — every firing spending tokens on a full prompt against a provider that was already cooling down.
+
+ - **`classifyWakeOutcome` (`src/lib/server/runtime/heartbeat-service.ts`)** — new pure helper, extracted for unit testing, that maps a resolved run result into `null` (success) or a short failure reason. A run counts as a failure when `result.error` is a non-empty string, *or* when `result.text` is empty/whitespace-only. Both the orchestrator-wake and heartbeat outcome handlers now feed through this helper, so silent-failure runs tick the failure counter and the exponential backoff (10s → 5min) kicks in normally.
+ - **Auto-disable gate now trips for provider 429 / silent-wake loops.** The existing `MAX_CONSECUTIVE_FAILURES = 10` threshold was already in place but unreachable for the most common failure mode (429 errors that still persisted a run). After the fix, ten consecutive dud wakes auto-disable the orchestrator/heartbeat for that agent/session and post an explicit notification instead of grinding indefinitely.
+ - **Regression coverage.** `heartbeat-service.test.ts` now has 5 targeted cases on `classifyWakeOutcome` — the 429 regression, empty-output detection, non-string error fields, whitespace-only errors, and the happy path. `test:runtime` now runs 104 cases.
+
  ### v1.5.65 Highlights
 
  Follow-up hardening on the v1.5.64 work after live-testing the chat-header flows, the MCP connection pool, and the MCP Registry browser. Six concrete bugs fixed in the clear/undo, MCP pool eviction, and registry-browser code paths.
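
As context for the notes above, here is a minimal, self-contained sketch (not the package's actual tracker code) of the pattern the v1.5.66 notes describe: every settled wake run is classified, consecutive failures drive an exponential backoff, and the tenth consecutive failure flips an auto-disable flag. Only the 10s floor, the 5min cap, and the 10-failure threshold come from the notes; the doubling schedule and the names `FailureTracker`, `record`, and `nextDelayMs` are illustrative assumptions.

```ts
// Illustrative sketch only: the 10s floor, 5min cap, and 10-failure threshold come
// from the v1.5.66 notes; the doubling schedule and all names here are assumptions.
const BACKOFF_FLOOR_MS = 10_000
const BACKOFF_CAP_MS = 5 * 60_000
const MAX_CONSECUTIVE_FAILURES = 10

type WakeResult = { text?: unknown; error?: unknown } | null | undefined

// Same classification rule the diff adds below: a *resolved* run still counts as a
// failure when it carries a non-empty error string or empty/whitespace-only text.
function classifyWakeOutcome(result: WakeResult): string | null {
  if (!result || typeof result !== 'object') return 'empty wake response'
  if (typeof result.error === 'string' && result.error.trim()) return result.error
  const text = typeof result.text === 'string' ? result.text : ''
  return text.trim() ? null : 'empty wake response'
}

class FailureTracker {
  private count = 0
  disabled = false

  // Feed every resolved run through here; a rejected promise would pass its
  // error message as the failure reason instead.
  record(result: WakeResult): string | null {
    const failure = classifyWakeOutcome(result)
    if (!failure) {
      this.count = 0
      return null
    }
    this.count += 1
    if (this.count >= MAX_CONSECUTIVE_FAILURES) this.disabled = true
    return failure
  }

  // Assumed doubling schedule: 10s, 20s, 40s, ... capped at 5 minutes.
  nextDelayMs(): number {
    return this.count === 0 ? 0 : Math.min(BACKOFF_FLOOR_MS * 2 ** (this.count - 1), BACKOFF_CAP_MS)
  }
}

// A run that resolves with a swallowed 429 now ticks the counter and backs off:
const tracker = new FailureTracker()
tracker.record({ text: '', error: '429 cooling down' })
console.log(tracker.nextDelayMs()) // 10000
```

The classification rule inside the sketch mirrors the `classifyWakeOutcome` helper added in the heartbeat-service diff further down; the backoff arithmetic is only an assumed stand-in for the package's internal scheduling.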
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@swarmclawai/swarmclaw",
-   "version": "1.5.65",
+   "version": "1.5.66",
    "description": "Build and run autonomous AI agents with OpenClaw, Hermes, multiple model providers, orchestration, delegation, memory, skills, schedules, and chat connectors.",
    "main": "electron-dist/main.js",
    "license": "MIT",
@@ -450,3 +450,35 @@ describe('heartbeatConfigForSession lightContext', () => {
      assert.equal(cfg.lightContext, false)
    })
  })
+
+ describe('classifyWakeOutcome (runaway-loop guard)', () => {
+   it('returns null for a run with visible text and no error', () => {
+     assert.equal(mod.classifyWakeOutcome({ text: 'all good', error: null }), null)
+     assert.equal(mod.classifyWakeOutcome({ text: 'ORCHESTRATOR_OK' }), null)
+   })
+
+   it('treats a resolved-but-errored result as failure (the 429 regression)', () => {
+     const out = mod.classifyWakeOutcome({
+       text: '',
+       error: '429 All credentials for model gpt-5.4 are cooling down via provider codex',
+     })
+     assert.equal(out, '429 All credentials for model gpt-5.4 are cooling down via provider codex')
+   })
+
+   it('counts empty visible output as failure so silent wakes trigger backoff', () => {
+     assert.equal(mod.classifyWakeOutcome({ text: '' }), 'empty wake response')
+     assert.equal(mod.classifyWakeOutcome({ text: ' \n\t' }), 'empty wake response')
+     assert.equal(mod.classifyWakeOutcome({}), 'empty wake response')
+     assert.equal(mod.classifyWakeOutcome(null), 'empty wake response')
+     assert.equal(mod.classifyWakeOutcome(undefined), 'empty wake response')
+   })
+
+   it('ignores a non-string error field and falls back to text check', () => {
+     assert.equal(mod.classifyWakeOutcome({ text: 'hi', error: 42 }), null)
+     assert.equal(mod.classifyWakeOutcome({ text: '', error: 42 }), 'empty wake response')
+   })
+
+   it('ignores an empty-string error so whitespace errors do not double-count', () => {
+     assert.equal(mod.classifyWakeOutcome({ text: 'fine', error: ' ' }), null)
+   })
+ })
package/src/lib/server/runtime/heartbeat-service.ts CHANGED
@@ -54,6 +54,23 @@ const ORCHESTRATOR_MIN_INTERVAL_SEC = 60
  const ORCHESTRATOR_MAX_INTERVAL_SEC = 86400 // 24h
  const ORCHESTRATOR_MAX_PROMPT_CHARS = 4000
 
+ /**
+  * Classify a resolved session-run result as success or failure for the
+  * heartbeat/orchestrator outcome tracker. A resolved promise can still
+  * carry an error on `result.error` (e.g. a provider 429 that was swallowed
+  * into persisted output) or resolve with empty text, and both cases must
+  * count as failures — otherwise a stuck wake loop never ticks the
+  * failure counter, never backs off, and never auto-disables.
+  */
+ export function classifyWakeOutcome(result: unknown): string | null {
+   if (!result || typeof result !== 'object') return 'empty wake response'
+   const obj = result as { error?: unknown; text?: unknown }
+   if (typeof obj.error === 'string' && obj.error.trim()) return obj.error
+   const text = typeof obj.text === 'string' ? obj.text : ''
+   if (!text.trim()) return 'empty wake response'
+   return null
+ }
+
  interface FailureRecord {
    count: number
    lastFailedAt: number
@@ -782,24 +799,28 @@ export async function tickHeartbeats() {
      state.lastBySession.set(session.id, now)
 
      const sid = session.id as string
-     enqueue.promise.then(() => {
-       const prev = state.failures.get(sid)
-       if (prev?.recoveryAttempts) {
-         log.info('heartbeat', `Recovery successful for session ${sid} after ${prev.recoveryAttempts} attempt(s)`)
+     // A session run can "resolve" with an error in result.error (e.g. provider
+     // 429 swallowed into the persisted failure) or with empty text. Treat both
+     // as failures so backoff and auto-disable trigger, otherwise a stuck
+     // heartbeat keeps re-firing at the configured interval and burning tokens.
+     const handleHeartbeatOutcome = (failure: string | null) => {
+       if (!failure) {
+         const prev = state.failures.get(sid)
+         if (prev?.recoveryAttempts) {
+           log.info('heartbeat', `Recovery successful for session ${sid} after ${prev.recoveryAttempts} attempt(s)`)
+         }
+         state.failures.delete(sid)
+         patchSession(sid, (s) => {
+           if (!s) return s
+           s.lastDeliveryStatus = 'ok'
+           s.lastDeliveredAt = Date.now()
+           return s
+         })
+         return
        }
-       state.failures.delete(sid)
-       // Track successful delivery
-       patchSession(sid, (s) => {
-         if (!s) return s
-         s.lastDeliveryStatus = 'ok'
-         s.lastDeliveredAt = Date.now()
-         return s
-       })
-     }).catch((err: unknown) => {
        const prev = state.failures.get(sid)
        const newCount = (prev?.count ?? 0) + 1
        const record: FailureRecord = { count: newCount, lastFailedAt: Date.now() }
-       // Auto-disable heartbeat after too many consecutive failures to prevent resource waste
        if (newCount >= MAX_CONSECUTIVE_FAILURES) {
          record.autoDisabledAt = Date.now()
          log.warn('heartbeat', `Auto-disabling heartbeat for session ${sid} after ${newCount} consecutive failures`)
@@ -821,17 +842,20 @@ export async function tickHeartbeats() {
          })
        }
        state.failures.set(sid, record)
-       const msg = errorMessage(err)
-       log.warn('heartbeat', `Heartbeat run failed for session ${sid} (${newCount}/${MAX_CONSECUTIVE_FAILURES})`, msg)
-       // Track failed delivery
+       log.warn('heartbeat', `Heartbeat run failed for session ${sid} (${newCount}/${MAX_CONSECUTIVE_FAILURES})`, failure)
        patchSession(sid, (s) => {
          if (!s) return s
          s.lastDeliveryStatus = 'error'
-         s.lastDeliveryError = msg
+         s.lastDeliveryError = failure
          s.lastDeliveredAt = Date.now()
          return s
        })
-     })
+     }
+     enqueue.promise
+       .then((result) => handleHeartbeatOutcome(classifyWakeOutcome(result)))
+       .catch((err: unknown) => {
+         handleHeartbeatOutcome(errorMessage(err) || 'heartbeat rejected')
+       })
    }
  }
 
@@ -1118,10 +1142,15 @@ export async function tickOrchestratorAgents() {
 
        log.info('orchestrator', `Woke orchestrator agent ${agent.name} (${agent.id}), cycle #${(agent.orchestratorCycleCount || 0) + 1}`)
 
-       // Track success/failure
-       enqueue.promise.then(() => {
-         orchestratorState.failures.delete(agent.id)
-       }).catch((err: unknown) => {
+       // Track success/failure. A run can "resolve" but still carry an error
+       // on the result (e.g. provider 429 that was caught and persisted), so we
+       // inspect the resolved result as well as the rejected path — otherwise
+       // a stuck wake loop never ticks the failure counter and never backs off.
+       const handleWakeOutcome = (failure: string | null) => {
+         if (!failure) {
+           orchestratorState.failures.delete(agent.id)
+           return
+         }
          const prev = orchestratorState.failures.get(agent.id)
          const newCount = (prev?.count ?? 0) + 1
          const record: FailureRecord = { count: newCount, lastFailedAt: Date.now() }
@@ -1146,8 +1175,13 @@ export async function tickOrchestratorAgents() {
            })
          }
          orchestratorState.failures.set(agent.id, record)
-         log.warn('orchestrator', `Orchestrator wake failed for agent ${agent.id} (${newCount}/${MAX_CONSECUTIVE_FAILURES})`, errorMessage(err))
-       })
+         log.warn('orchestrator', `Orchestrator wake failed for agent ${agent.id} (${newCount}/${MAX_CONSECUTIVE_FAILURES})`, failure)
+       }
+       enqueue.promise
+         .then((result) => handleWakeOutcome(classifyWakeOutcome(result)))
+         .catch((err: unknown) => {
+           handleWakeOutcome(errorMessage(err) || 'wake rejected')
+         })
      } catch (err) {
        log.warn('orchestrator', `Error ticking orchestrator agent ${agent.id}:`, errorMessage(err))
      }