@swarmclawai/swarmclaw 1.5.65 → 1.5.66

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -399,6 +399,14 @@ Operational docs: https://swarmclaw.ai/docs/observability
 
  ## Releases
 
+ ### v1.5.66 Highlights
+
+ Fixes a runaway-token-burn bug in the orchestrator-wake and heartbeat loops. The root cause was hidden in the success/failure classification: a session run can resolve its promise successfully while still carrying an `error` on the result (e.g. a provider 429 swallowed into persisted output), and the wake trackers only incremented their failure counters on a rejected promise. So the backoff never engaged, the auto-disable-after-N-failures gate never tripped, and the wake kept firing at its configured interval indefinitely — every firing spending tokens on a full prompt against a provider that was already cooling down.
+
+ - **`classifyWakeOutcome` (`src/lib/server/runtime/heartbeat-service.ts`)** — new pure helper, extracted for unit testing, that maps a resolved run result into `null` (success) or a short failure reason. A run counts as a failure when `result.error` is a non-empty string, *or* when `result.text` is empty/whitespace-only. Both the orchestrator-wake and heartbeat outcome handlers now feed through this helper, so silent-failure runs tick the failure counter and the exponential backoff (10s → 5min) kicks in normally.
+ - **Auto-disable gate now trips for provider 429 / silent-wake loops.** The existing `MAX_CONSECUTIVE_FAILURES = 10` threshold was already in place but unreachable for the most common failure mode (429 errors that still persisted a run). After the fix, ten consecutive dud wakes auto-disable the orchestrator/heartbeat for that agent/session and post an explicit notification instead of grinding indefinitely.
+ - **Regression coverage.** `heartbeat-service.test.ts` now has 5 targeted cases on `classifyWakeOutcome` — the 429 regression, empty-output detection, non-string error fields, whitespace-only errors, and the happy path. `test:runtime` now runs 104 cases.
+
  ### v1.5.65 Highlights
 
  Follow-up hardening on the v1.5.64 work after live-testing the chat-header flows, the MCP connection pool, and the MCP Registry browser. Six concrete bugs fixed in the clear/undo, MCP pool eviction, and registry-browser code paths.
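
As context for the notes above, here is a minimal, self-contained sketch (not the package's actual tracker code) of the pattern the v1.5.66 notes describe: every settled wake run is classified, consecutive failures drive an exponential backoff, and the tenth consecutive failure flips an auto-disable flag. Only the 10s floor, the 5min cap, and the 10-failure threshold come from the notes; the doubling schedule and the names `FailureTracker`, `record`, and `nextDelayMs` are illustrative assumptions.

```ts
// Illustrative sketch only: the 10s floor, 5min cap, and 10-failure threshold come
// from the v1.5.66 notes; the doubling schedule and all names here are assumptions.
const BACKOFF_FLOOR_MS = 10_000
const BACKOFF_CAP_MS = 5 * 60_000
const MAX_CONSECUTIVE_FAILURES = 10

type WakeResult = { text?: unknown; error?: unknown } | null | undefined

// Same classification rule the diff adds below: a *resolved* run still counts as a
// failure when it carries a non-empty error string or empty/whitespace-only text.
function classifyWakeOutcome(result: WakeResult): string | null {
  if (!result || typeof result !== 'object') return 'empty wake response'
  if (typeof result.error === 'string' && result.error.trim()) return result.error
  const text = typeof result.text === 'string' ? result.text : ''
  return text.trim() ? null : 'empty wake response'
}

class FailureTracker {
  private count = 0
  disabled = false

  // Feed every resolved run through here; a rejected promise would pass its
  // error message as the failure reason instead.
  record(result: WakeResult): string | null {
    const failure = classifyWakeOutcome(result)
    if (!failure) {
      this.count = 0
      return null
    }
    this.count += 1
    if (this.count >= MAX_CONSECUTIVE_FAILURES) this.disabled = true
    return failure
  }

  // Assumed doubling schedule: 10s, 20s, 40s, ... capped at 5 minutes.
  nextDelayMs(): number {
    return this.count === 0 ? 0 : Math.min(BACKOFF_FLOOR_MS * 2 ** (this.count - 1), BACKOFF_CAP_MS)
  }
}

// A run that resolves with a swallowed 429 now ticks the counter and backs off:
const tracker = new FailureTracker()
tracker.record({ text: '', error: '429 cooling down' })
console.log(tracker.nextDelayMs()) // 10000
```

The classification rule inside the sketch mirrors the `classifyWakeOutcome` helper added in the heartbeat-service diff further down; the backoff arithmetic is only an assumed stand-in for the package's internal scheduling.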
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@swarmclawai/swarmclaw",
-   "version": "1.5.65",
+   "version": "1.5.66",
    "description": "Build and run autonomous AI agents with OpenClaw, Hermes, multiple model providers, orchestration, delegation, memory, skills, schedules, and chat connectors.",
    "main": "electron-dist/main.js",
    "license": "MIT",
@@ -450,3 +450,35 @@ describe('heartbeatConfigForSession lightContext', () => {
      assert.equal(cfg.lightContext, false)
    })
  })
+
+ describe('classifyWakeOutcome (runaway-loop guard)', () => {
+   it('returns null for a run with visible text and no error', () => {
+     assert.equal(mod.classifyWakeOutcome({ text: 'all good', error: null }), null)
+     assert.equal(mod.classifyWakeOutcome({ text: 'ORCHESTRATOR_OK' }), null)
+   })
+
+   it('treats a resolved-but-errored result as failure (the 429 regression)', () => {
+     const out = mod.classifyWakeOutcome({
+       text: '',
+       error: '429 All credentials for model gpt-5.4 are cooling down via provider codex',
+     })
+     assert.equal(out, '429 All credentials for model gpt-5.4 are cooling down via provider codex')
+   })
+
+   it('counts empty visible output as failure so silent wakes trigger backoff', () => {
+     assert.equal(mod.classifyWakeOutcome({ text: '' }), 'empty wake response')
+     assert.equal(mod.classifyWakeOutcome({ text: ' \n\t' }), 'empty wake response')
+     assert.equal(mod.classifyWakeOutcome({}), 'empty wake response')
+     assert.equal(mod.classifyWakeOutcome(null), 'empty wake response')
+     assert.equal(mod.classifyWakeOutcome(undefined), 'empty wake response')
+   })
+
+   it('ignores a non-string error field and falls back to text check', () => {
+     assert.equal(mod.classifyWakeOutcome({ text: 'hi', error: 42 }), null)
+     assert.equal(mod.classifyWakeOutcome({ text: '', error: 42 }), 'empty wake response')
+   })
+
+   it('ignores an empty-string error so whitespace errors do not double-count', () => {
+     assert.equal(mod.classifyWakeOutcome({ text: 'fine', error: ' ' }), null)
+   })
+ })
package/src/lib/server/runtime/heartbeat-service.ts CHANGED
@@ -54,6 +54,23 @@ const ORCHESTRATOR_MIN_INTERVAL_SEC = 60
  const ORCHESTRATOR_MAX_INTERVAL_SEC = 86400 // 24h
  const ORCHESTRATOR_MAX_PROMPT_CHARS = 4000
 
+ /**
+  * Classify a resolved session-run result as success or failure for the
+  * heartbeat/orchestrator outcome tracker. A resolved promise can still
+  * carry an error on `result.error` (e.g. a provider 429 that was swallowed
+  * into persisted output) or resolve with empty text, and both cases must
+  * count as failures — otherwise a stuck wake loop never ticks the
+  * failure counter, never backs off, and never auto-disables.
+  */
+ export function classifyWakeOutcome(result: unknown): string | null {
+   if (!result || typeof result !== 'object') return 'empty wake response'
+   const obj = result as { error?: unknown; text?: unknown }
+   if (typeof obj.error === 'string' && obj.error.trim()) return obj.error
+   const text = typeof obj.text === 'string' ? obj.text : ''
+   if (!text.trim()) return 'empty wake response'
+   return null
+ }
+
  interface FailureRecord {
    count: number
    lastFailedAt: number
@@ -782,24 +799,28 @@ export async function tickHeartbeats() {
      state.lastBySession.set(session.id, now)
 
      const sid = session.id as string
-     enqueue.promise.then(() => {
-       const prev = state.failures.get(sid)
-       if (prev?.recoveryAttempts) {
-         log.info('heartbeat', `Recovery successful for session ${sid} after ${prev.recoveryAttempts} attempt(s)`)
+     // A session run can "resolve" with an error in result.error (e.g. provider
+     // 429 swallowed into the persisted failure) or with empty text. Treat both
+     // as failures so backoff and auto-disable trigger, otherwise a stuck
+     // heartbeat keeps re-firing at the configured interval and burning tokens.
+     const handleHeartbeatOutcome = (failure: string | null) => {
+       if (!failure) {
+         const prev = state.failures.get(sid)
+         if (prev?.recoveryAttempts) {
+           log.info('heartbeat', `Recovery successful for session ${sid} after ${prev.recoveryAttempts} attempt(s)`)
+         }
+         state.failures.delete(sid)
+         patchSession(sid, (s) => {
+           if (!s) return s
+           s.lastDeliveryStatus = 'ok'
+           s.lastDeliveredAt = Date.now()
+           return s
+         })
+         return
        }
-       state.failures.delete(sid)
-       // Track successful delivery
-       patchSession(sid, (s) => {
-         if (!s) return s
-         s.lastDeliveryStatus = 'ok'
-         s.lastDeliveredAt = Date.now()
-         return s
-       })
-     }).catch((err: unknown) => {
        const prev = state.failures.get(sid)
        const newCount = (prev?.count ?? 0) + 1
        const record: FailureRecord = { count: newCount, lastFailedAt: Date.now() }
-       // Auto-disable heartbeat after too many consecutive failures to prevent resource waste
        if (newCount >= MAX_CONSECUTIVE_FAILURES) {
          record.autoDisabledAt = Date.now()
          log.warn('heartbeat', `Auto-disabling heartbeat for session ${sid} after ${newCount} consecutive failures`)
@@ -821,17 +842,20 @@ export async function tickHeartbeats() {
          })
        }
        state.failures.set(sid, record)
-       const msg = errorMessage(err)
-       log.warn('heartbeat', `Heartbeat run failed for session ${sid} (${newCount}/${MAX_CONSECUTIVE_FAILURES})`, msg)
-       // Track failed delivery
+       log.warn('heartbeat', `Heartbeat run failed for session ${sid} (${newCount}/${MAX_CONSECUTIVE_FAILURES})`, failure)
        patchSession(sid, (s) => {
          if (!s) return s
          s.lastDeliveryStatus = 'error'
-         s.lastDeliveryError = msg
+         s.lastDeliveryError = failure
          s.lastDeliveredAt = Date.now()
          return s
        })
-     })
+     }
+     enqueue.promise
+       .then((result) => handleHeartbeatOutcome(classifyWakeOutcome(result)))
+       .catch((err: unknown) => {
+         handleHeartbeatOutcome(errorMessage(err) || 'heartbeat rejected')
+       })
    }
  }
 
@@ -1118,10 +1142,15 @@ export async function tickOrchestratorAgents() {
 
        log.info('orchestrator', `Woke orchestrator agent ${agent.name} (${agent.id}), cycle #${(agent.orchestratorCycleCount || 0) + 1}`)
 
-       // Track success/failure
-       enqueue.promise.then(() => {
-         orchestratorState.failures.delete(agent.id)
-       }).catch((err: unknown) => {
+       // Track success/failure. A run can "resolve" but still carry an error
+       // on the result (e.g. provider 429 that was caught and persisted), so we
+       // inspect the resolved result as well as the rejected path — otherwise
+       // a stuck wake loop never ticks the failure counter and never backs off.
+       const handleWakeOutcome = (failure: string | null) => {
+         if (!failure) {
+           orchestratorState.failures.delete(agent.id)
+           return
+         }
          const prev = orchestratorState.failures.get(agent.id)
          const newCount = (prev?.count ?? 0) + 1
          const record: FailureRecord = { count: newCount, lastFailedAt: Date.now() }
@@ -1146,8 +1175,13 @@ export async function tickOrchestratorAgents() {
            })
          }
          orchestratorState.failures.set(agent.id, record)
-         log.warn('orchestrator', `Orchestrator wake failed for agent ${agent.id} (${newCount}/${MAX_CONSECUTIVE_FAILURES})`, errorMessage(err))
-       })
+         log.warn('orchestrator', `Orchestrator wake failed for agent ${agent.id} (${newCount}/${MAX_CONSECUTIVE_FAILURES})`, failure)
+       }
+       enqueue.promise
+         .then((result) => handleWakeOutcome(classifyWakeOutcome(result)))
+         .catch((err: unknown) => {
+           handleWakeOutcome(errorMessage(err) || 'wake rejected')
+         })
      } catch (err) {
        log.warn('orchestrator', `Error ticking orchestrator agent ${agent.id}:`, errorMessage(err))
      }