@swarmclawai/swarmclaw 1.5.65 → 1.5.66
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md
CHANGED
|
@@ -399,6 +399,14 @@ Operational docs: https://swarmclaw.ai/docs/observability
|
|
|
399
399
|
|
|
400
400
|
## Releases
|
|
401
401
|
|
|
402
|
+
### v1.5.66 Highlights
|
|
403
|
+
|
|
404
|
+
Fixes a runaway-token-burn bug in the orchestrator-wake and heartbeat loops. The root cause was hidden in the success/failure classification: a session run can resolve its promise successfully while still carrying an `error` on the result (e.g. a provider 429 swallowed into persisted output), and the wake trackers only incremented their failure counters on a rejected promise. So the backoff never engaged, the auto-disable-after-N-failures gate never tripped, and the wake kept firing at its configured interval indefinitely — every firing spending tokens on a full prompt against a provider that was already cooling down.
|
|
405
|
+
|
|
406
|
+
- **`classifyWakeOutcome` (`src/lib/server/runtime/heartbeat-service.ts`)** — new pure helper, extracted for unit testing, that maps a resolved run result into `null` (success) or a short failure reason. A run counts as a failure when `result.error` is a non-empty string, *or* when `result.text` is empty/whitespace-only. Both the orchestrator-wake and heartbeat outcome handlers now feed through this helper, so silent-failure runs tick the failure counter and the exponential backoff (10s → 5min) kicks in normally.
|
|
407
|
+
- **Auto-disable gate now trips for provider 429 / silent-wake loops.** The existing `MAX_CONSECUTIVE_FAILURES = 10` threshold was already in place but unreachable for the most common failure mode (429 errors that still persisted a run). After the fix, ten consecutive dud wakes auto-disable the orchestrator/heartbeat for that agent/session and post an explicit notification instead of grinding indefinitely.
|
|
408
|
+
- **Regression coverage.** `heartbeat-service.test.ts` now has 5 targeted cases on `classifyWakeOutcome` — the 429 regression, empty-output detection, non-string error fields, whitespace-only errors, and the happy path. `test:runtime` now runs 104 cases.
|
|
409
|
+
|
|
402
410
|
### v1.5.65 Highlights
|
|
403
411
|
|
|
404
412
|
Follow-up hardening on the v1.5.64 work after live-testing the chat-header flows, the MCP connection pool, and the MCP Registry browser. Six concrete bugs fixed in the clear/undo, MCP pool eviction, and registry-browser code paths.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@swarmclawai/swarmclaw",
|
|
3
|
-
"version": "1.5.
|
|
3
|
+
"version": "1.5.66",
|
|
4
4
|
"description": "Build and run autonomous AI agents with OpenClaw, Hermes, multiple model providers, orchestration, delegation, memory, skills, schedules, and chat connectors.",
|
|
5
5
|
"main": "electron-dist/main.js",
|
|
6
6
|
"license": "MIT",
|
|
@@ -450,3 +450,35 @@ describe('heartbeatConfigForSession lightContext', () => {
|
|
|
450
450
|
assert.equal(cfg.lightContext, false)
|
|
451
451
|
})
|
|
452
452
|
})
|
|
453
|
+
|
|
454
|
+
describe('classifyWakeOutcome (runaway-loop guard)', () => {
|
|
455
|
+
it('returns null for a run with visible text and no error', () => {
|
|
456
|
+
assert.equal(mod.classifyWakeOutcome({ text: 'all good', error: null }), null)
|
|
457
|
+
assert.equal(mod.classifyWakeOutcome({ text: 'ORCHESTRATOR_OK' }), null)
|
|
458
|
+
})
|
|
459
|
+
|
|
460
|
+
it('treats a resolved-but-errored result as failure (the 429 regression)', () => {
|
|
461
|
+
const out = mod.classifyWakeOutcome({
|
|
462
|
+
text: '',
|
|
463
|
+
error: '429 All credentials for model gpt-5.4 are cooling down via provider codex',
|
|
464
|
+
})
|
|
465
|
+
assert.equal(out, '429 All credentials for model gpt-5.4 are cooling down via provider codex')
|
|
466
|
+
})
|
|
467
|
+
|
|
468
|
+
it('counts empty visible output as failure so silent wakes trigger backoff', () => {
|
|
469
|
+
assert.equal(mod.classifyWakeOutcome({ text: '' }), 'empty wake response')
|
|
470
|
+
assert.equal(mod.classifyWakeOutcome({ text: ' \n\t' }), 'empty wake response')
|
|
471
|
+
assert.equal(mod.classifyWakeOutcome({}), 'empty wake response')
|
|
472
|
+
assert.equal(mod.classifyWakeOutcome(null), 'empty wake response')
|
|
473
|
+
assert.equal(mod.classifyWakeOutcome(undefined), 'empty wake response')
|
|
474
|
+
})
|
|
475
|
+
|
|
476
|
+
it('ignores a non-string error field and falls back to text check', () => {
|
|
477
|
+
assert.equal(mod.classifyWakeOutcome({ text: 'hi', error: 42 }), null)
|
|
478
|
+
assert.equal(mod.classifyWakeOutcome({ text: '', error: 42 }), 'empty wake response')
|
|
479
|
+
})
|
|
480
|
+
|
|
481
|
+
it('ignores an empty-string error so whitespace errors do not double-count', () => {
|
|
482
|
+
assert.equal(mod.classifyWakeOutcome({ text: 'fine', error: ' ' }), null)
|
|
483
|
+
})
|
|
484
|
+
})
|
|
@@ -54,6 +54,23 @@ const ORCHESTRATOR_MIN_INTERVAL_SEC = 60
|
|
|
54
54
|
const ORCHESTRATOR_MAX_INTERVAL_SEC = 86400 // 24h
|
|
55
55
|
const ORCHESTRATOR_MAX_PROMPT_CHARS = 4000
|
|
56
56
|
|
|
57
|
+
/**
|
|
58
|
+
* Classify a resolved session-run result as success or failure for the
|
|
59
|
+
* heartbeat/orchestrator outcome tracker. A resolved promise can still
|
|
60
|
+
* carry an error on `result.error` (e.g. a provider 429 that was swallowed
|
|
61
|
+
* into persisted output) or resolve with empty text, and both cases must
|
|
62
|
+
* count as failures — otherwise a stuck wake loop never ticks the
|
|
63
|
+
* failure counter, never backs off, and never auto-disables.
|
|
64
|
+
*/
|
|
65
|
+
export function classifyWakeOutcome(result: unknown): string | null {
|
|
66
|
+
if (!result || typeof result !== 'object') return 'empty wake response'
|
|
67
|
+
const obj = result as { error?: unknown; text?: unknown }
|
|
68
|
+
if (typeof obj.error === 'string' && obj.error.trim()) return obj.error
|
|
69
|
+
const text = typeof obj.text === 'string' ? obj.text : ''
|
|
70
|
+
if (!text.trim()) return 'empty wake response'
|
|
71
|
+
return null
|
|
72
|
+
}
|
|
73
|
+
|
|
57
74
|
interface FailureRecord {
|
|
58
75
|
count: number
|
|
59
76
|
lastFailedAt: number
|
|
@@ -782,24 +799,28 @@ export async function tickHeartbeats() {
|
|
|
782
799
|
state.lastBySession.set(session.id, now)
|
|
783
800
|
|
|
784
801
|
const sid = session.id as string
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
802
|
+
// A session run can "resolve" with an error in result.error (e.g. provider
|
|
803
|
+
// 429 swallowed into the persisted failure) or with empty text. Treat both
|
|
804
|
+
// as failures so backoff and auto-disable trigger, otherwise a stuck
|
|
805
|
+
// heartbeat keeps re-firing at the configured interval and burning tokens.
|
|
806
|
+
const handleHeartbeatOutcome = (failure: string | null) => {
|
|
807
|
+
if (!failure) {
|
|
808
|
+
const prev = state.failures.get(sid)
|
|
809
|
+
if (prev?.recoveryAttempts) {
|
|
810
|
+
log.info('heartbeat', `Recovery successful for session ${sid} after ${prev.recoveryAttempts} attempt(s)`)
|
|
811
|
+
}
|
|
812
|
+
state.failures.delete(sid)
|
|
813
|
+
patchSession(sid, (s) => {
|
|
814
|
+
if (!s) return s
|
|
815
|
+
s.lastDeliveryStatus = 'ok'
|
|
816
|
+
s.lastDeliveredAt = Date.now()
|
|
817
|
+
return s
|
|
818
|
+
})
|
|
819
|
+
return
|
|
789
820
|
}
|
|
790
|
-
state.failures.delete(sid)
|
|
791
|
-
// Track successful delivery
|
|
792
|
-
patchSession(sid, (s) => {
|
|
793
|
-
if (!s) return s
|
|
794
|
-
s.lastDeliveryStatus = 'ok'
|
|
795
|
-
s.lastDeliveredAt = Date.now()
|
|
796
|
-
return s
|
|
797
|
-
})
|
|
798
|
-
}).catch((err: unknown) => {
|
|
799
821
|
const prev = state.failures.get(sid)
|
|
800
822
|
const newCount = (prev?.count ?? 0) + 1
|
|
801
823
|
const record: FailureRecord = { count: newCount, lastFailedAt: Date.now() }
|
|
802
|
-
// Auto-disable heartbeat after too many consecutive failures to prevent resource waste
|
|
803
824
|
if (newCount >= MAX_CONSECUTIVE_FAILURES) {
|
|
804
825
|
record.autoDisabledAt = Date.now()
|
|
805
826
|
log.warn('heartbeat', `Auto-disabling heartbeat for session ${sid} after ${newCount} consecutive failures`)
|
|
@@ -821,17 +842,20 @@ export async function tickHeartbeats() {
|
|
|
821
842
|
})
|
|
822
843
|
}
|
|
823
844
|
state.failures.set(sid, record)
|
|
824
|
-
|
|
825
|
-
log.warn('heartbeat', `Heartbeat run failed for session ${sid} (${newCount}/${MAX_CONSECUTIVE_FAILURES})`, msg)
|
|
826
|
-
// Track failed delivery
|
|
845
|
+
log.warn('heartbeat', `Heartbeat run failed for session ${sid} (${newCount}/${MAX_CONSECUTIVE_FAILURES})`, failure)
|
|
827
846
|
patchSession(sid, (s) => {
|
|
828
847
|
if (!s) return s
|
|
829
848
|
s.lastDeliveryStatus = 'error'
|
|
830
|
-
s.lastDeliveryError = msg
|
|
849
|
+
s.lastDeliveryError = failure
|
|
831
850
|
s.lastDeliveredAt = Date.now()
|
|
832
851
|
return s
|
|
833
852
|
})
|
|
834
|
-
}
|
|
853
|
+
}
|
|
854
|
+
enqueue.promise
|
|
855
|
+
.then((result) => handleHeartbeatOutcome(classifyWakeOutcome(result)))
|
|
856
|
+
.catch((err: unknown) => {
|
|
857
|
+
handleHeartbeatOutcome(errorMessage(err) || 'heartbeat rejected')
|
|
858
|
+
})
|
|
835
859
|
}
|
|
836
860
|
}
|
|
837
861
|
|
|
@@ -1118,10 +1142,15 @@ export async function tickOrchestratorAgents() {
|
|
|
1118
1142
|
|
|
1119
1143
|
log.info('orchestrator', `Woke orchestrator agent ${agent.name} (${agent.id}), cycle #${(agent.orchestratorCycleCount || 0) + 1}`)
|
|
1120
1144
|
|
|
1121
|
-
// Track success/failure
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1145
|
+
// Track success/failure. A run can "resolve" but still carry an error
|
|
1146
|
+
// on the result (e.g. provider 429 that was caught and persisted), so we
|
|
1147
|
+
// inspect the resolved result as well as the rejected path — otherwise
|
|
1148
|
+
// a stuck wake loop never ticks the failure counter and never backs off.
|
|
1149
|
+
const handleWakeOutcome = (failure: string | null) => {
|
|
1150
|
+
if (!failure) {
|
|
1151
|
+
orchestratorState.failures.delete(agent.id)
|
|
1152
|
+
return
|
|
1153
|
+
}
|
|
1125
1154
|
const prev = orchestratorState.failures.get(agent.id)
|
|
1126
1155
|
const newCount = (prev?.count ?? 0) + 1
|
|
1127
1156
|
const record: FailureRecord = { count: newCount, lastFailedAt: Date.now() }
|
|
@@ -1146,8 +1175,13 @@ export async function tickOrchestratorAgents() {
|
|
|
1146
1175
|
})
|
|
1147
1176
|
}
|
|
1148
1177
|
orchestratorState.failures.set(agent.id, record)
|
|
1149
|
-
log.warn('orchestrator', `Orchestrator wake failed for agent ${agent.id} (${newCount}/${MAX_CONSECUTIVE_FAILURES})`, errorMessage(err))
|
|
1150
|
-
}
|
|
1178
|
+
log.warn('orchestrator', `Orchestrator wake failed for agent ${agent.id} (${newCount}/${MAX_CONSECUTIVE_FAILURES})`, failure)
|
|
1179
|
+
}
|
|
1180
|
+
enqueue.promise
|
|
1181
|
+
.then((result) => handleWakeOutcome(classifyWakeOutcome(result)))
|
|
1182
|
+
.catch((err: unknown) => {
|
|
1183
|
+
handleWakeOutcome(errorMessage(err) || 'wake rejected')
|
|
1184
|
+
})
|
|
1151
1185
|
} catch (err) {
|
|
1152
1186
|
log.warn('orchestrator', `Error ticking orchestrator agent ${agent.id}:`, errorMessage(err))
|
|
1153
1187
|
}
|