@swarmclawai/swarmclaw 1.2.1 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. package/README.md +16 -85
  2. package/bin/server-cmd.js +64 -1
  3. package/package.json +2 -2
  4. package/skills/coding-agent/SKILL.md +111 -0
  5. package/skills/github/SKILL.md +140 -0
  6. package/skills/nano-banana-pro/SKILL.md +62 -0
  7. package/skills/nano-banana-pro/scripts/generate_image.py +235 -0
  8. package/skills/nano-pdf/SKILL.md +53 -0
  9. package/skills/openai-image-gen/SKILL.md +78 -0
  10. package/skills/openai-image-gen/scripts/gen.py +328 -0
  11. package/skills/resourceful-problem-solving/SKILL.md +49 -0
  12. package/skills/skill-creator/SKILL.md +147 -0
  13. package/skills/skill-creator/scripts/init_skill.py +378 -0
  14. package/skills/skill-creator/scripts/quick_validate.py +159 -0
  15. package/skills/summarize/SKILL.md +77 -0
  16. package/src/app/api/auth/route.ts +20 -5
  17. package/src/app/api/chats/[id]/devserver/route.ts +13 -19
  18. package/src/app/api/chats/[id]/messages/route.ts +13 -15
  19. package/src/app/api/chats/[id]/route.ts +9 -10
  20. package/src/app/api/chats/[id]/stop/route.ts +5 -7
  21. package/src/app/api/chats/messages-route.test.ts +8 -6
  22. package/src/app/api/chats/route.ts +9 -10
  23. package/src/app/api/ip/route.ts +2 -2
  24. package/src/app/api/preview-server/route.ts +1 -1
  25. package/src/app/api/projects/[id]/route.ts +7 -46
  26. package/src/cli/server-cmd.test.js +74 -0
  27. package/src/components/chat/chat-area.tsx +45 -23
  28. package/src/components/chat/message-bubble.test.ts +35 -0
  29. package/src/components/chat/message-bubble.tsx +19 -9
  30. package/src/components/chat/message-list.tsx +37 -3
  31. package/src/components/input/chat-input.tsx +34 -14
  32. package/src/components/openclaw/openclaw-deploy-panel.tsx +4 -0
  33. package/src/instrumentation.ts +1 -1
  34. package/src/lib/chat/assistant-render-id.ts +3 -0
  35. package/src/lib/chat/chat-streaming-state.test.ts +42 -3
  36. package/src/lib/chat/chat-streaming-state.ts +20 -8
  37. package/src/lib/chat/queued-message-queue.test.ts +23 -1
  38. package/src/lib/chat/queued-message-queue.ts +11 -2
  39. package/src/lib/providers/cli-utils.test.ts +124 -0
  40. package/src/lib/server/activity/activity-log.ts +21 -0
  41. package/src/lib/server/agents/agent-availability.test.ts +10 -5
  42. package/src/lib/server/agents/agent-cascade.ts +79 -59
  43. package/src/lib/server/agents/agent-registry.ts +3 -1
  44. package/src/lib/server/agents/agent-repository.ts +90 -0
  45. package/src/lib/server/agents/delegation-job-repository.ts +53 -0
  46. package/src/lib/server/agents/delegation-jobs.ts +11 -4
  47. package/src/lib/server/agents/guardian-checkpoint-repository.ts +35 -0
  48. package/src/lib/server/agents/guardian.ts +2 -2
  49. package/src/lib/server/agents/main-agent-loop.ts +10 -3
  50. package/src/lib/server/agents/main-loop-state-repository.ts +38 -0
  51. package/src/lib/server/agents/subagent-runtime.ts +9 -6
  52. package/src/lib/server/agents/subagent-swarm.ts +3 -2
  53. package/src/lib/server/agents/task-session.ts +3 -4
  54. package/src/lib/server/approvals/approval-repository.ts +30 -0
  55. package/src/lib/server/autonomy/supervisor-incident-repository.ts +42 -0
  56. package/src/lib/server/chat-execution/chat-execution-types.ts +38 -0
  57. package/src/lib/server/chat-execution/chat-execution-utils.ts +1 -1
  58. package/src/lib/server/chat-execution/chat-execution.ts +84 -1926
  59. package/src/lib/server/chat-execution/chat-turn-finalization.ts +620 -0
  60. package/src/lib/server/chat-execution/chat-turn-partial-persistence.ts +221 -0
  61. package/src/lib/server/chat-execution/chat-turn-preflight.ts +133 -0
  62. package/src/lib/server/chat-execution/chat-turn-preparation.ts +817 -0
  63. package/src/lib/server/chat-execution/chat-turn-stream-execution.ts +296 -0
  64. package/src/lib/server/chat-execution/chat-turn-tool-routing.ts +5 -5
  65. package/src/lib/server/chat-execution/message-classifier.test.ts +329 -0
  66. package/src/lib/server/chat-execution/post-stream-finalization.ts +1 -1
  67. package/src/lib/server/chat-execution/prompt-builder.ts +11 -0
  68. package/src/lib/server/chat-execution/prompt-sections.ts +5 -6
  69. package/src/lib/server/chat-execution/situational-awareness.ts +12 -7
  70. package/src/lib/server/chat-execution/stream-agent-chat.ts +16 -13
  71. package/src/lib/server/chatrooms/chatroom-repository.ts +32 -0
  72. package/src/lib/server/connectors/connector-repository.ts +58 -0
  73. package/src/lib/server/connectors/runtime-state.test.ts +117 -0
  74. package/src/lib/server/credentials/credential-repository.ts +7 -0
  75. package/src/lib/server/gateways/gateway-profile-repository.ts +4 -0
  76. package/src/lib/server/memory/memory-abstract.test.ts +59 -0
  77. package/src/lib/server/missions/mission-repository.ts +74 -0
  78. package/src/lib/server/missions/mission-service/actions.ts +6 -0
  79. package/src/lib/server/missions/mission-service/bindings.ts +9 -0
  80. package/src/lib/server/missions/mission-service/context.ts +4 -0
  81. package/src/lib/server/missions/mission-service/core.ts +2269 -0
  82. package/src/lib/server/missions/mission-service/queries.ts +12 -0
  83. package/src/lib/server/missions/mission-service/recovery.ts +5 -0
  84. package/src/lib/server/missions/mission-service/ticks.ts +9 -0
  85. package/src/lib/server/missions/mission-service.test.ts +9 -2
  86. package/src/lib/server/missions/mission-service.ts +6 -2266
  87. package/src/lib/server/openclaw/deploy.test.ts +42 -3
  88. package/src/lib/server/openclaw/deploy.ts +26 -12
  89. package/src/lib/server/persistence/repository-utils.ts +154 -0
  90. package/src/lib/server/persistence/storage-context.ts +51 -0
  91. package/src/lib/server/persistence/transaction.ts +1 -0
  92. package/src/lib/server/projects/project-repository.ts +36 -0
  93. package/src/lib/server/projects/project-service.ts +79 -0
  94. package/src/lib/server/protocols/protocol-normalization.test.ts +6 -4
  95. package/src/lib/server/runtime/alert-dispatch.ts +1 -1
  96. package/src/lib/server/runtime/daemon-policy.ts +1 -1
  97. package/src/lib/server/runtime/daemon-state/core.ts +1570 -0
  98. package/src/lib/server/runtime/daemon-state/health.ts +6 -0
  99. package/src/lib/server/runtime/daemon-state/policy.ts +7 -0
  100. package/src/lib/server/runtime/daemon-state/supervisor.ts +6 -0
  101. package/src/lib/server/runtime/daemon-state.test.ts +48 -0
  102. package/src/lib/server/runtime/daemon-state.ts +3 -1470
  103. package/src/lib/server/runtime/estop-repository.ts +4 -0
  104. package/src/lib/server/runtime/estop.ts +3 -1
  105. package/src/lib/server/runtime/heartbeat-service.test.ts +2 -2
  106. package/src/lib/server/runtime/heartbeat-service.ts +55 -34
  107. package/src/lib/server/runtime/heartbeat-wake.ts +6 -4
  108. package/src/lib/server/runtime/idle-window.ts +2 -2
  109. package/src/lib/server/runtime/network.ts +11 -0
  110. package/src/lib/server/runtime/orchestrator-events.ts +2 -2
  111. package/src/lib/server/runtime/queue/claims.ts +4 -0
  112. package/src/lib/server/runtime/queue/core.ts +2079 -0
  113. package/src/lib/server/runtime/queue/execution.ts +7 -0
  114. package/src/lib/server/runtime/queue/followups.ts +4 -0
  115. package/src/lib/server/runtime/queue/queries.ts +12 -0
  116. package/src/lib/server/runtime/queue/recovery.ts +7 -0
  117. package/src/lib/server/runtime/queue-recovery.test.ts +48 -13
  118. package/src/lib/server/runtime/queue-repository.ts +17 -0
  119. package/src/lib/server/runtime/queue.ts +5 -2061
  120. package/src/lib/server/runtime/run-ledger.ts +6 -5
  121. package/src/lib/server/runtime/run-repository.ts +73 -0
  122. package/src/lib/server/runtime/runtime-lock-repository.ts +8 -0
  123. package/src/lib/server/runtime/runtime-settings.ts +1 -1
  124. package/src/lib/server/runtime/runtime-state.ts +99 -0
  125. package/src/lib/server/runtime/scheduler.ts +4 -2
  126. package/src/lib/server/runtime/session-run-manager/cancellation.ts +157 -0
  127. package/src/lib/server/runtime/session-run-manager/drain.ts +246 -0
  128. package/src/lib/server/runtime/session-run-manager/enqueue.ts +287 -0
  129. package/src/lib/server/runtime/session-run-manager/queries.ts +117 -0
  130. package/src/lib/server/runtime/session-run-manager/recovery.ts +238 -0
  131. package/src/lib/server/runtime/session-run-manager/state.ts +441 -0
  132. package/src/lib/server/runtime/session-run-manager/types.ts +74 -0
  133. package/src/lib/server/runtime/session-run-manager.ts +72 -1377
  134. package/src/lib/server/runtime/watch-job-repository.ts +35 -0
  135. package/src/lib/server/runtime/watch-jobs.ts +3 -1
  136. package/src/lib/server/schedules/schedule-repository.ts +42 -0
  137. package/src/lib/server/sessions/session-repository.ts +85 -0
  138. package/src/lib/server/settings/settings-repository.ts +25 -0
  139. package/src/lib/server/skills/skill-discovery.test.ts +2 -2
  140. package/src/lib/server/skills/skill-discovery.ts +2 -2
  141. package/src/lib/server/skills/skill-repository.ts +14 -0
  142. package/src/lib/server/storage.ts +13 -24
  143. package/src/lib/server/tasks/task-repository.ts +54 -0
  144. package/src/lib/server/usage/usage-repository.ts +30 -0
  145. package/src/lib/server/webhooks/webhook-repository.ts +10 -0
  146. package/src/lib/strip-internal-metadata.test.ts +42 -41
  147. package/src/stores/use-chat-store.test.ts +54 -0
  148. package/src/stores/use-chat-store.ts +21 -5
  149. /package/{bundled-skills → skills}/google-workspace/SKILL.md +0 -0
@@ -0,0 +1,1570 @@
1
+ import { log } from '@/lib/server/logger'
2
+ import { loadAgents } from '@/lib/server/agents/agent-repository'
3
+ import { loadConnectors, saveConnectors } from '@/lib/server/connectors/connector-repository'
4
+ import { decryptKey, loadCredentials } from '@/lib/server/credentials/credential-repository'
5
+ import { loadQueue } from '@/lib/server/runtime/queue-repository'
6
+ import { pruneExpiredLocks, readRuntimeLock, releaseRuntimeLock, renewRuntimeLock, tryAcquireRuntimeLock } from '@/lib/server/runtime/runtime-lock-repository'
7
+ import { loadSchedules } from '@/lib/server/schedules/schedule-repository'
8
+ import { loadSessions } from '@/lib/server/sessions/session-repository'
9
+ import { loadSettings } from '@/lib/server/settings/settings-repository'
10
+ import { pruneOldUsage } from '@/lib/server/usage/usage-repository'
11
+ import { appendWebhookLog, deleteWebhookRetry, loadWebhookRetryQueue, loadWebhooks, upsertWebhookRetry } from '@/lib/server/webhooks/webhook-repository'
12
+ import { notify } from '@/lib/server/ws-hub'
13
+ import { processNext, cleanupFinishedTaskSessions, validateCompletedTasksQueue, recoverStalledRunningTasks, resumeQueue, promoteDeferred } from '@/lib/server/runtime/queue'
14
+ import { startScheduler, stopScheduler } from '@/lib/server/runtime/scheduler'
15
+ import { sweepOrphanedBrowsers, getActiveBrowserCount } from '@/lib/server/session-tools'
16
+ import {
17
+ autoStartConnectors,
18
+ stopAllConnectors,
19
+ startConnector,
20
+ getConnectorStatus,
21
+ checkConnectorHealth,
22
+ createConnectorReconnectState,
23
+ advanceConnectorReconnectState,
24
+ clearReconnectState,
25
+ getAllReconnectStates,
26
+ getReconnectState,
27
+ setReconnectState,
28
+ } from '@/lib/server/connectors/manager'
29
+ import { startConnectorOutboxWorker, stopConnectorOutboxWorker } from '@/lib/server/connectors/outbox'
30
+ import { pruneConnectorTrackingState } from '@/lib/server/connectors/runtime-state'
31
+ import { startHeartbeatService, stopHeartbeatService, getHeartbeatServiceStatus, pruneHeartbeatState, pruneOrchestratorState } from '@/lib/server/runtime/heartbeat-service'
32
+ import { hasOpenClawAgents, ensureGatewayConnected, disconnectAutoGateways, getGateway } from '@/lib/server/openclaw/gateway'
33
+ import { enqueueSessionRun, sweepStuckRuns } from '@/lib/server/runtime/session-run-manager'
34
+ import { pruneOldRuns } from '@/lib/server/runtime/run-ledger'
35
+ import { getEnabledCapabilitySelection } from '@/lib/capability-selection'
36
+ import { WORKSPACE_DIR } from '@/lib/server/data-dir'
37
+ import { DEFAULT_HEARTBEAT_INTERVAL_SEC } from '@/lib/runtime/heartbeat-defaults'
38
+ import { genId } from '@/lib/id'
39
+ import { isAgentDisabled } from '@/lib/server/agents/agent-availability'
40
+ import { errorMessage, hmrSingleton } from '@/lib/shared-utils'
41
+ import path from 'node:path'
42
+ import type { Connector, Session, WebhookRetryEntry } from '@/types'
43
+ import { createNotification } from '@/lib/server/create-notification'
44
+ import { pingProvider, OPENAI_COMPATIBLE_DEFAULTS, restoreProviderHealthState } from '@/lib/server/provider-health'
45
+ import { runIntegrityMonitor } from '@/lib/server/integrity-monitor'
46
+ import { notifyOrchestrators } from '@/lib/server/runtime/orchestrator-events'
47
+ import { recoverStaleDelegationJobs } from '@/lib/server/agents/delegation-jobs'
48
+ import { restoreSwarmRegistry } from '@/lib/server/agents/subagent-swarm'
49
+ import { cleanupFinishedSubagents } from '@/lib/server/agents/subagent-runtime'
50
+ import { pruneMainLoopState } from '@/lib/server/agents/main-agent-loop'
51
+ import { pruneSystemEventQueues, pruneOrchestratorEventQueues } from '@/lib/server/runtime/system-events'
52
+ import { checkSwarmTimeouts, ensureProtocolEngineRecovered } from '@/lib/server/protocols/protocol-service'
53
+ import { sweepManagedProcesses, reapOrphanedSandboxContainers } from '@/lib/server/runtime/process-manager'
54
+ import { drainIdleWindowCallbacks } from '@/lib/server/runtime/idle-window'
55
+ import {
56
+ buildSessionHeartbeatHealthDedupKey,
57
+ daemonAutostartEnvEnabled,
58
+ isDaemonBackgroundServicesEnabled,
59
+ parseCronToMs,
60
+ parseHeartbeatIntervalSec,
61
+ shouldNotifyProviderReachabilityIssue,
62
+ shouldSuppressSessionHeartbeatHealthAlert,
63
+ shouldSuppressSyntheticAgentHealthAlert,
64
+ } from '@/lib/server/runtime/daemon-policy'
65
+ import { loadEstopState } from '@/lib/server/runtime/estop'
66
+ import { classifyRuntimeFailure, recordSupervisorIncident } from '@/lib/server/autonomy/supervisor-reflection'
67
+ import { getMemoryDb } from '@/lib/server/memory/memory-db'
68
+ import { clearLogsByAge } from '@/lib/server/execution-log'
69
+ import { runMissionControllerStartupRecovery } from '@/lib/server/missions/mission-service'
70
+
71
+ const TAG = 'daemon-state'
72
+
73
+ const QUEUE_CHECK_INTERVAL = 30_000 // 30 seconds
74
+ const BROWSER_SWEEP_INTERVAL = 60_000 // 60 seconds
75
+ const BROWSER_MAX_AGE = 10 * 60 * 1000 // 10 minutes idle = orphaned
76
+ const HEALTH_CHECK_INTERVAL = 120_000 // 2 minutes
77
+ const CONNECTOR_HEALTH_CHECK_INTERVAL = 15_000 // 15 seconds
78
+ const MEMORY_CONSOLIDATION_INTERVAL = 6 * 3600_000 // 6 hours
79
+ const MEMORY_CONSOLIDATION_INITIAL_DELAY = 60_000 // 1 minute after daemon start
80
+ const STALE_MULTIPLIER = 4 // session is stale after N × heartbeat interval
81
+ const STALE_MIN_MS = 4 * 60 * 1000 // minimum 4 minutes regardless of interval
82
+ const STALE_AUTO_DISABLE_MULTIPLIER = 16 // auto-disable after much longer sustained staleness
83
+ const STALE_AUTO_DISABLE_MIN_MS = 45 * 60 * 1000 // never auto-disable before 45 minutes
84
+ const CONNECTOR_RESTART_BASE_MS = 30_000
85
+ const CONNECTOR_RESTART_MAX_MS = 15 * 60 * 1000
86
+ const MAX_WAKE_ATTEMPTS = 3
87
+ const QUEUE_PROCESS_TIMEOUT = 10 * 60_000 // 10 minutes
88
+ const SHUTDOWN_TIMEOUT_MS = 15_000
89
+ const PROVIDER_PING_CB_THRESHOLD = 3 // trips after 3 consecutive failures
90
+ const PROVIDER_PING_CB_BASE_MS = 300_000 // 5 min initial cooldown
91
+ const PROVIDER_PING_CB_MAX_MS = 1_800_000 // 30 min max cooldown
92
+ const DAEMON_RUNTIME_LOCK_NAME = 'daemon-primary'
93
+ const DAEMON_RUNTIME_LOCK_TTL_MS = 120_000
94
+ const DAEMON_RUNTIME_LOCK_RENEW_MS = 30_000
95
+
96
+ export {
97
+ buildSessionHeartbeatHealthDedupKey,
98
+ isDaemonBackgroundServicesEnabled,
99
+ shouldNotifyProviderReachabilityIssue,
100
+ shouldSuppressSessionHeartbeatHealthAlert,
101
+ shouldSuppressSyntheticAgentHealthAlert,
102
+ }
103
+
104
+ // Store daemon state on globalThis to survive HMR reloads
105
+ interface DaemonState {
106
+ queueIntervalId: ReturnType<typeof setInterval> | null
107
+ browserSweepId: ReturnType<typeof setInterval> | null
108
+ healthIntervalId: ReturnType<typeof setInterval> | null
109
+ connectorHealthIntervalId: ReturnType<typeof setInterval> | null
110
+ memoryConsolidationTimeoutId: ReturnType<typeof setTimeout> | null
111
+ memoryConsolidationIntervalId: ReturnType<typeof setInterval> | null
112
+ evalSchedulerIntervalId: ReturnType<typeof setInterval> | null
113
+ swarmTimeoutIntervalId: ReturnType<typeof setInterval> | null
114
+ /** Session IDs we've already alerted as stale (alert-once semantics). */
115
+ staleSessionIds: Set<string>
116
+ /** OpenClaw gateway agent IDs currently considered down. */
117
+ openclawDownAgentIds: Set<string>
118
+ /** Per-agent auto-repair state for OpenClaw gateways. */
119
+ openclawRepairState: Map<string, { attempts: number; lastAttemptAt: number; cooldownUntil: number }>
120
+ lastIntegrityCheckAt: number | null
121
+ lastIntegrityDriftCount: number
122
+ manualStopRequested: boolean
123
+ running: boolean
124
+ lastProcessedAt: number | null
125
+ healthCheckRunning: boolean
126
+ connectorHealthCheckRunning: boolean
127
+ shuttingDown: boolean
128
+ providerPingCircuitBreaker: Map<string, { consecutiveFailures: number; skipUntil: number }>
129
+ lockRenewIntervalId: ReturnType<typeof setInterval> | null
130
+ primaryLeaseHeld: boolean
131
+ }
132
+
133
+ const ds: DaemonState = hmrSingleton<DaemonState>('__swarmclaw_daemon__', () => ({
134
+ queueIntervalId: null,
135
+ browserSweepId: null,
136
+ healthIntervalId: null,
137
+ connectorHealthIntervalId: null,
138
+ memoryConsolidationTimeoutId: null,
139
+ memoryConsolidationIntervalId: null,
140
+ evalSchedulerIntervalId: null,
141
+ swarmTimeoutIntervalId: null,
142
+ staleSessionIds: new Set<string>(),
143
+ openclawDownAgentIds: new Set<string>(),
144
+ openclawRepairState: new Map<string, { attempts: number; lastAttemptAt: number; cooldownUntil: number }>(),
145
+ lastIntegrityCheckAt: null,
146
+ lastIntegrityDriftCount: 0,
147
+ manualStopRequested: false,
148
+ running: false,
149
+ lastProcessedAt: null,
150
+ healthCheckRunning: false,
151
+ connectorHealthCheckRunning: false,
152
+ shuttingDown: false,
153
+ providerPingCircuitBreaker: new Map<string, { consecutiveFailures: number; skipUntil: number }>(),
154
+ lockRenewIntervalId: null,
155
+ primaryLeaseHeld: false,
156
+ }))
157
+
158
+ const daemonLockOwner = hmrSingleton<string>(
159
+ '__swarmclaw_daemon_lock_owner__',
160
+ () => `pid:${process.pid}:${genId(8)}`,
161
+ )
162
+
163
+ // Backfill fields for hot-reloaded daemon state objects from older code versions.
164
+ if (!ds.staleSessionIds) ds.staleSessionIds = new Set<string>()
165
+ if (!ds.openclawDownAgentIds) ds.openclawDownAgentIds = new Set<string>()
166
+ if (!ds.openclawRepairState) ds.openclawRepairState = new Map<string, { attempts: number; lastAttemptAt: number; cooldownUntil: number }>()
167
+ if (ds.lastIntegrityCheckAt === undefined) ds.lastIntegrityCheckAt = null
168
+ if (ds.lastIntegrityDriftCount === undefined) ds.lastIntegrityDriftCount = 0
169
+ // Migrate from old issueLastAlertAt map if present (HMR across code versions)
170
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
171
+ if ((ds as any).issueLastAlertAt) delete (ds as any).issueLastAlertAt
172
+ if (ds.healthIntervalId === undefined) ds.healthIntervalId = null
173
+ if (ds.connectorHealthIntervalId === undefined) ds.connectorHealthIntervalId = null
174
+ if (ds.manualStopRequested === undefined) ds.manualStopRequested = false
175
+ if (ds.memoryConsolidationTimeoutId === undefined) ds.memoryConsolidationTimeoutId = null
176
+ if (ds.memoryConsolidationIntervalId === undefined) ds.memoryConsolidationIntervalId = null
177
+ if (ds.evalSchedulerIntervalId === undefined) ds.evalSchedulerIntervalId = null
178
+ if (ds.swarmTimeoutIntervalId === undefined) ds.swarmTimeoutIntervalId = null
179
+ if (ds.healthCheckRunning === undefined) ds.healthCheckRunning = false
180
+ if (ds.connectorHealthCheckRunning === undefined) ds.connectorHealthCheckRunning = false
181
+ if (ds.shuttingDown === undefined) ds.shuttingDown = false
182
+ if (!ds.providerPingCircuitBreaker) ds.providerPingCircuitBreaker = new Map<string, { consecutiveFailures: number; skipUntil: number }>()
183
+ if (ds.lockRenewIntervalId === undefined) ds.lockRenewIntervalId = null
184
+ if (ds.primaryLeaseHeld === undefined) ds.primaryLeaseHeld = false
185
+
186
+ function stopDaemonLeaseRenewal(opts?: { release?: boolean }) {
187
+ if (ds.lockRenewIntervalId) {
188
+ clearInterval(ds.lockRenewIntervalId)
189
+ ds.lockRenewIntervalId = null
190
+ }
191
+ if (opts?.release !== false && ds.primaryLeaseHeld) {
192
+ try {
193
+ releaseRuntimeLock(DAEMON_RUNTIME_LOCK_NAME, daemonLockOwner)
194
+ } catch {
195
+ // Best effort during shutdown or HMR.
196
+ }
197
+ }
198
+ if (opts?.release !== false) ds.primaryLeaseHeld = false
199
+ }
200
+
201
+ function startDaemonLeaseRenewal() {
202
+ if (!ds.primaryLeaseHeld || ds.lockRenewIntervalId) return
203
+ ds.lockRenewIntervalId = setInterval(() => {
204
+ if (!ds.running || !ds.primaryLeaseHeld) return
205
+ let renewed = false
206
+ try {
207
+ renewed = renewRuntimeLock(DAEMON_RUNTIME_LOCK_NAME, daemonLockOwner, DAEMON_RUNTIME_LOCK_TTL_MS)
208
+ } catch (err: unknown) {
209
+ log.warn(TAG, `[daemon] Failed to renew daemon lease: ${errorMessage(err)}`)
210
+ }
211
+ if (renewed) return
212
+ ds.primaryLeaseHeld = false
213
+ stopDaemonLeaseRenewal({ release: false })
214
+ log.warn(TAG, '[daemon] Lost cross-process daemon lease; stopping local daemon instance')
215
+ void stopDaemon({ source: 'lease-lost' })
216
+ }, DAEMON_RUNTIME_LOCK_RENEW_MS)
217
+ }
218
+
219
+ function acquireDaemonLease(source: string): boolean {
220
+ if (ds.primaryLeaseHeld) {
221
+ startDaemonLeaseRenewal()
222
+ return true
223
+ }
224
+ let acquired = false
225
+ try {
226
+ acquired = tryAcquireRuntimeLock(DAEMON_RUNTIME_LOCK_NAME, daemonLockOwner, DAEMON_RUNTIME_LOCK_TTL_MS)
227
+ } catch (err: unknown) {
228
+ log.warn(TAG, `[daemon] Failed to acquire daemon lease (source=${source}): ${errorMessage(err)}`)
229
+ return false
230
+ }
231
+ if (!acquired) {
232
+ let owner = 'another process'
233
+ try {
234
+ owner = readRuntimeLock(DAEMON_RUNTIME_LOCK_NAME)?.owner || owner
235
+ } catch {
236
+ // Best-effort diagnostics only.
237
+ }
238
+ log.info(TAG, `[daemon] Skipping start (source=${source}); lease held by ${owner}`)
239
+ return false
240
+ }
241
+ ds.primaryLeaseHeld = true
242
+ startDaemonLeaseRenewal()
243
+ return true
244
+ }
245
+
246
+ export function ensureDaemonStarted(source = 'unknown'): boolean {
247
+ if (ds.running) return false
248
+ if (!daemonAutostartEnvEnabled()) return false
249
+ if (ds.manualStopRequested) return false
250
+ if (loadEstopState().level !== 'none') return false
251
+ return startDaemon({ source, manualStart: false })
252
+ }
253
+
254
+ export function startDaemon(options?: { source?: string; manualStart?: boolean }): boolean {
255
+ const source = options?.source || 'unknown'
256
+ const manualStart = options?.manualStart === true
257
+ if (manualStart) ds.manualStopRequested = false
258
+ const estop = loadEstopState()
259
+ if (estop.level !== 'none') {
260
+ notify('daemon')
261
+ log.warn(TAG, `[daemon] Start blocked by estop (level=${estop.level}, source=${source})`)
262
+ return false
263
+ }
264
+
265
+ if (ds.running) {
266
+ // In dev/HMR, daemon can already be flagged running while new interval types
267
+ // (for example health monitor) were introduced in newer code.
268
+ startDaemonLeaseRenewal()
269
+ startQueueProcessor()
270
+ startBrowserSweep()
271
+ startHeartbeatService()
272
+ startMemoryConsolidation()
273
+ startSwarmTimeoutChecker()
274
+ syncDaemonBackgroundServices({ runConnectorHealthCheckImmediately: false })
275
+ return false
276
+ }
277
+ if (!acquireDaemonLease(source)) {
278
+ notify('daemon')
279
+ return false
280
+ }
281
+ ds.running = true
282
+ notify('daemon')
283
+ log.info(TAG, `[daemon] Starting daemon (source=${source}, scheduler + queue processor + heartbeat)`)
284
+
285
+ try {
286
+ validateCompletedTasksQueue()
287
+ cleanupFinishedTaskSessions()
288
+ recoverStaleDelegationJobs({ fullRestart: true })
289
+ ensureProtocolEngineRecovered()
290
+ restoreProviderHealthState()
291
+ try {
292
+ const lost = restoreSwarmRegistry()
293
+ if (lost > 0) log.info(TAG, `[daemon] Marked ${lost} in-flight swarm(s) as lost after restart`)
294
+ } catch { /* best-effort */ }
295
+ resumeQueue()
296
+ const missionRecovery = runMissionControllerStartupRecovery()
297
+ if (missionRecovery.recovered > 0 || missionRecovery.rerunVerification > 0) {
298
+ log.info(
299
+ TAG,
300
+ `[daemon] Recovered ${missionRecovery.recovered} mission(s) on startup`
301
+ + ` (${missionRecovery.rerunVerification} queued for verification replay)`,
302
+ )
303
+ }
304
+ startScheduler()
305
+ startQueueProcessor()
306
+ startBrowserSweep()
307
+ startHeartbeatService()
308
+ startMemoryConsolidation()
309
+ startSwarmTimeoutChecker()
310
+ syncDaemonBackgroundServices({ runConnectorHealthCheckImmediately: false })
311
+ } catch (err: unknown) {
312
+ ds.running = false
313
+ stopDaemonLeaseRenewal()
314
+ notify('daemon')
315
+ log.error(TAG, '[daemon] Failed to start:', errorMessage(err))
316
+ throw err
317
+ }
318
+
319
+ if (isDaemonBackgroundServicesEnabled()) {
320
+ // Auto-start enabled connectors only when the full background stack is enabled.
321
+ autoStartConnectors().catch((err: unknown) => {
322
+ log.error(TAG, '[daemon] Error auto-starting connectors:', errorMessage(err))
323
+ })
324
+ }
325
+ return true
326
+ }
327
+
328
+ export async function stopDaemon(options?: { source?: string; manualStop?: boolean }) {
329
+ const source = options?.source || 'unknown'
330
+ if (options?.manualStop === true) ds.manualStopRequested = true
331
+ if (!ds.running) {
332
+ stopDaemonLeaseRenewal()
333
+ return
334
+ }
335
+ ds.running = false
336
+ ds.shuttingDown = true
337
+ notify('daemon')
338
+ log.info(TAG, `[daemon] Stopping daemon (source=${source})`)
339
+
340
+ stopScheduler()
341
+ stopQueueProcessor()
342
+ stopBrowserSweep()
343
+ stopHealthMonitor()
344
+ stopConnectorHealthMonitor()
345
+ stopConnectorOutboxWorker()
346
+ stopHeartbeatService()
347
+ stopMemoryConsolidation()
348
+ stopSwarmTimeoutChecker()
349
+ stopEvalScheduler()
350
+ try {
351
+ await Promise.race([
352
+ stopAllConnectors({ disable: false }),
353
+ new Promise<void>((_, reject) =>
354
+ setTimeout(() => reject(new Error('Connector shutdown timed out')), SHUTDOWN_TIMEOUT_MS)
355
+ ),
356
+ ])
357
+ } catch (err: unknown) {
358
+ log.warn(TAG, `[daemon] Connector shutdown issue: ${errorMessage(err)}`)
359
+ } finally {
360
+ stopDaemonLeaseRenewal()
361
+ ds.shuttingDown = false
362
+ }
363
+ }
364
+
365
+ function startBrowserSweep() {
366
+ if (ds.browserSweepId) return
367
+ ds.browserSweepId = setInterval(() => {
368
+ const count = getActiveBrowserCount()
369
+ if (count > 0) {
370
+ const cleaned = sweepOrphanedBrowsers(BROWSER_MAX_AGE)
371
+ if (cleaned > 0) {
372
+ log.info(TAG, `[daemon] Cleaned ${cleaned} orphaned browser(s), ${getActiveBrowserCount()} still active`)
373
+ }
374
+ }
375
+ }, BROWSER_SWEEP_INTERVAL)
376
+ }
377
+
378
+ function stopBrowserSweep() {
379
+ if (ds.browserSweepId) {
380
+ clearInterval(ds.browserSweepId)
381
+ ds.browserSweepId = null
382
+ }
383
+ // Kill all remaining browsers on shutdown
384
+ sweepOrphanedBrowsers(0)
385
+ }
386
+
387
+ export async function syncOpenClawGatewayLifecycle() {
388
+ if (!hasOpenClawAgents()) {
389
+ disconnectAutoGateways()
390
+ return
391
+ }
392
+ if (!getGateway()?.connected) {
393
+ await ensureGatewayConnected()
394
+ }
395
+ }
396
+
397
+ function startQueueProcessor() {
398
+ if (ds.queueIntervalId) return
399
+ ds.queueIntervalId = setInterval(async () => {
400
+ if (!ds.running) return
401
+ const queue = loadQueue()
402
+ if (queue.length > 0) {
403
+ log.info(TAG, `[daemon] Processing ${queue.length} queued task(s)`)
404
+ try {
405
+ await Promise.race([
406
+ processNext(),
407
+ new Promise<void>((_, reject) =>
408
+ setTimeout(() => reject(new Error('Queue processing timed out')), QUEUE_PROCESS_TIMEOUT)
409
+ ),
410
+ ])
411
+ } catch (err: unknown) {
412
+ log.error(TAG, `[daemon] Queue processing error/timeout: ${errorMessage(err)}`)
413
+ }
414
+ ds.lastProcessedAt = Date.now()
415
+ }
416
+ if (!isDaemonBackgroundServicesEnabled()) return
417
+ // OpenClaw gateway lifecycle: lazy connect for active OpenClaw agents, stop auto-managed reconnects when none remain.
418
+ try {
419
+ await syncOpenClawGatewayLifecycle()
420
+ } catch { /* gateway errors are non-fatal */ }
421
+ }, QUEUE_CHECK_INTERVAL)
422
+ }
423
+
424
+ function stopQueueProcessor() {
425
+ if (ds.queueIntervalId) {
426
+ clearInterval(ds.queueIntervalId)
427
+ ds.queueIntervalId = null
428
+ }
429
+ }
430
+
431
+ async function sendHealthAlert(input: string | {
432
+ text: string
433
+ dedupKey?: string
434
+ entityType?: string
435
+ entityId?: string
436
+ }) {
437
+ const payload = typeof input === 'string' ? { text: input } : input
438
+ const text = payload.text
439
+ log.warn(TAG, `[health] ${text}`)
440
+ createNotification({
441
+ type: 'warning',
442
+ title: 'SwarmClaw health alert',
443
+ message: text,
444
+ dedupKey: payload.dedupKey || `health-alert:${text}`,
445
+ entityType: payload.entityType,
446
+ entityId: payload.entityId,
447
+ dispatchExternally: false,
448
+ })
449
+ }
450
+
451
/**
 * One sweep of the connector auto-restart loop.
 *
 * For every enabled connector whose runtime status is not 'running', attempts
 * a restart with per-connector backoff state (seeded lazily, advanced on each
 * failure). After MAX_WAKE_ATTEMPTS the connector is marked 'error' in
 * persistent storage and no further restarts happen until a manual retry or
 * server restart.
 *
 * @param now - Timestamp used for backoff window comparisons (supplied by the tick).
 */
async function runConnectorHealthChecks(now: number) {
  // First, collapse dead runtime instances into persisted error state so the
  // daemon can own the restart cadence and backoff policy.
  try {
    await checkConnectorHealth()
  } catch (err: unknown) {
    log.error(TAG, '[health] Connector isAlive check failed:', errorMessage(err))
  }

  const connectors = loadConnectors()
  for (const connector of Object.values(connectors) as Connector[]) {
    // Defensive skip for malformed records.
    if (!connector?.id || typeof connector.id !== 'string') continue
    if (connector.isEnabled !== true) {
      // Disabled connectors carry no retry/backoff state.
      clearReconnectState(connector.id)
      continue
    }

    const runtimeStatus = getConnectorStatus(connector.id)
    if (runtimeStatus === 'running') {
      // Healthy — reset backoff so a future outage starts from scratch.
      clearReconnectState(connector.id)
      continue
    }

    // Lazily seed backoff state on the first detection of a down connector.
    const current = getReconnectState(connector.id)
      ?? createConnectorReconnectState(
        { error: typeof connector.lastError === 'string' ? connector.lastError : '' },
        { initialBackoffMs: CONNECTOR_RESTART_BASE_MS },
      )

    if (current.exhausted) {
      // Already gave up; wait for a manual retry or daemon restart.
      continue
    }

    // Respect the backoff window.
    if (current.nextRetryAt > now) continue

    // Notify on first detection of a down connector
    if (current.attempts === 0) {
      createNotification({
        type: 'warning',
        title: `Connector "${connector.name}" is down`,
        message: 'Auto-restart in progress.',
        dedupKey: `connector-down:${connector.id}`,
        entityType: 'connector',
        entityId: connector.id,
      })
    }

    try {
      await startConnector(connector.id)
      clearReconnectState(connector.id)
      await sendHealthAlert(`Connector "${connector.name}" (${connector.platform}) was down and has been auto-restarted.`)
    } catch (err: unknown) {
      const message = errorMessage(err)
      // Advance attempts/nextRetryAt and compute the exhausted flag.
      const next = advanceConnectorReconnectState(current, message, now, {
        initialBackoffMs: CONNECTOR_RESTART_BASE_MS,
        maxBackoffMs: CONNECTOR_RESTART_MAX_MS,
        maxAttempts: MAX_WAKE_ATTEMPTS,
      })
      setReconnectState(connector.id, next)
      if (next.exhausted) {
        log.warn(TAG, `[health] Connector "${connector.name}" exceeded ${MAX_WAKE_ATTEMPTS} auto-restart attempts — giving up until the server restarts or the user retries manually`)
        // Persist the terminal error state so it survives daemon restarts,
        // then fan out to the UI, orchestrators, and the notification center.
        connector.status = 'error'
        connector.lastError = `Auto-restart gave up after ${MAX_WAKE_ATTEMPTS} attempts: ${message}`
        connector.updatedAt = Date.now()
        connectors[connector.id] = connector
        saveConnectors(connectors)
        notify('connectors')
        notifyOrchestrators(`Connector ${connector.name || connector.id} status: error — auto-restart exhausted after ${MAX_WAKE_ATTEMPTS} attempts`, `connector-status:${connector.id}`)
        createNotification({
          type: 'error',
          title: `Connector "${connector.name}" failed`,
          message: `Auto-restart gave up after ${MAX_WAKE_ATTEMPTS} attempts.`,
          dedupKey: `connector-gave-up:${connector.id}`,
          entityType: 'connector',
          entityId: connector.id,
        })
      } else {
        log.warn(TAG, `[health] Connector auto-restart failed for ${connector.name} (attempt ${next.attempts}/${MAX_WAKE_ATTEMPTS}): ${message}`)
      }
    }
  }

  // Purge restart state for connectors that no longer exist in storage
  for (const id of Object.keys(getAllReconnectStates())) {
    if (!connectors[id] || connectors[id]?.isEnabled !== true) clearReconnectState(id)
  }
}
538
+
539
/**
 * Drain the webhook retry queue: re-deliver every due, non-dead-lettered
 * entry to its agent's webhook session.
 *
 * Outcomes per entry:
 *  - webhook deleted            → retry dropped
 *  - agent missing or disabled  → dead-lettered immediately
 *  - enqueue succeeds           → success log written, retry deleted
 *  - enqueue fails              → attempts bumped; dead-letter (with a
 *    supervisor incident) at maxAttempts, else exponential backoff + jitter.
 */
async function processWebhookRetries() {
  const retryQueue = loadWebhookRetryQueue()
  const now = Date.now()
  const dueEntries: WebhookRetryEntry[] = []

  // Collect entries whose backoff window has elapsed.
  for (const raw of Object.values(retryQueue)) {
    const entry = raw as WebhookRetryEntry
    if (entry.deadLettered) continue
    if (entry.nextRetryAt > now) continue
    dueEntries.push(entry)
  }

  if (dueEntries.length === 0) return

  const webhooks = loadWebhooks()
  const agents = loadAgents()
  const sessions = loadSessions()

  for (const entry of dueEntries) {
    const webhook = webhooks[entry.webhookId] as unknown as Record<string, unknown> | undefined
    if (!webhook) {
      // Webhook deleted — drop the retry
      deleteWebhookRetry(entry.id)
      continue
    }

    const agentId = typeof webhook.agentId === 'string' ? webhook.agentId : ''
    const agent = agentId ? (agents[agentId] as unknown as Record<string, unknown> | undefined) : null
    if (!agent) {
      entry.deadLettered = true
      upsertWebhookRetry(entry.id, entry)
      log.warn(TAG, `[webhook-retry] Dead-lettered ${entry.id}: agent not found for webhook ${entry.webhookId}`)
      continue
    }
    if (isAgentDisabled(agent)) {
      entry.deadLettered = true
      upsertWebhookRetry(entry.id, entry)
      log.warn(TAG, `[webhook-retry] Dead-lettered ${entry.id}: agent disabled for webhook ${entry.webhookId}`)
      continue
    }

    // Find or create a webhook session (same logic as the POST handler)
    const sessionName = `webhook:${entry.webhookId}`
    let session = Object.values(sessions).find(
      (s: unknown) => {
        const rec = s as Record<string, unknown>
        return rec.name === sessionName && rec.agentId === agent.id
      },
    ) as unknown as Record<string, unknown> | undefined

    if (!session) {
      const sessionId = genId()
      const ts = Date.now()
      // New session inherits the agent's provider/model/credential config.
      session = {
        id: sessionId,
        name: sessionName,
        cwd: WORKSPACE_DIR,
        user: 'system',
        provider: agent.provider || 'claude-cli',
        model: agent.model || '',
        credentialId: agent.credentialId || null,
        apiEndpoint: agent.apiEndpoint || null,
        claudeSessionId: null,
        codexThreadId: null,
        opencodeSessionId: null,
        delegateResumeIds: { claudeCode: null, codex: null, opencode: null, gemini: null },
        messages: [],
        createdAt: ts,
        lastActiveAt: ts,
        sessionType: 'human',
        agentId: agent.id,
        parentSessionId: null,
        ...getEnabledCapabilitySelection(agent),
        heartbeatEnabled: (agent.heartbeatEnabled as boolean | undefined) ?? false,
        heartbeatIntervalSec: (agent.heartbeatIntervalSec as number | null | undefined) ?? null,
      }
      const { upsertSession: upsert } = await import('@/lib/server/storage')
      upsert(session.id as string, session)
    }

    // Cap the payload so the prompt cannot grow unboundedly.
    const payloadPreview = (entry.payload || '').slice(0, 12_000)
    const prompt = [
      'Webhook event received (retry).',
      `Webhook ID: ${entry.webhookId}`,
      `Webhook Name: ${(webhook.name as string) || entry.webhookId}`,
      `Source: ${(webhook.source as string) || 'custom'}`,
      `Event: ${entry.event}`,
      `Retry attempt: ${entry.attempts}`,
      `Original received at: ${new Date(entry.createdAt).toISOString()}`,
      '',
      'Payload:',
      payloadPreview || '(empty payload)',
      '',
      'Handle this event now. If this requires notifying the user, use configured connector tools.',
    ].join('\n')

    try {
      const run = enqueueSessionRun({
        sessionId: session.id as string,
        message: prompt,
        source: 'webhook',
        internal: false,
        mode: 'followup',
      })

      // NOTE(review): the log key and the record's `id` come from two separate
      // genId(8) calls, so they differ — confirm appendWebhookLog tolerates that
      // (or whether key and id are meant to match).
      appendWebhookLog(genId(8), {
        id: genId(8),
        webhookId: entry.webhookId,
        event: entry.event,
        payload: (entry.payload || '').slice(0, 2000),
        status: 'success',
        sessionId: session.id,
        runId: run.runId,
        timestamp: Date.now(),
      })

      deleteWebhookRetry(entry.id)
      log.info(TAG, `[webhook-retry] Successfully retried ${entry.id} for webhook ${entry.webhookId} (attempt ${entry.attempts})`)
    } catch (err: unknown) {
      const errorMsg = errorMessage(err)
      entry.attempts += 1

      if (entry.attempts >= entry.maxAttempts) {
        entry.deadLettered = true
        upsertWebhookRetry(entry.id, entry)
        log.warn(TAG, `[webhook-retry] Dead-lettered ${entry.id} after ${entry.attempts} attempts: ${errorMsg}`)
        // Record a supervisor incident so the failure is visible beyond logs.
        const failure = classifyRuntimeFailure({ source: 'webhook', message: errorMsg })
        if (session?.id) {
          recordSupervisorIncident({
            runId: entry.id,
            sessionId: session.id as string,
            taskId: null,
            agentId: agentId || null,
            source: 'webhook',
            kind: 'runtime_failure',
            severity: failure.severity,
            summary: `Webhook delivery dead-lettered: ${errorMsg}`.slice(0, 320),
            details: errorMsg,
            failureFamily: failure.family,
            remediation: failure.remediation,
            repairPrompt: failure.repairPrompt,
            autoAction: null,
          })
        }

        appendWebhookLog(genId(8), {
          id: genId(8),
          webhookId: entry.webhookId,
          event: entry.event,
          payload: (entry.payload || '').slice(0, 2000),
          status: 'error',
          error: `Dead-lettered after ${entry.attempts} attempts: ${errorMsg}`,
          timestamp: Date.now(),
        })
      } else {
        // Exponential backoff: 30s * 2^attempt + random jitter (0-5000ms)
        const jitter = Math.floor(Math.random() * 5000)
        entry.nextRetryAt = Date.now() + (30_000 * Math.pow(2, entry.attempts)) + jitter
        upsertWebhookRetry(entry.id, entry)
        log.warn(TAG, `[webhook-retry] Retry ${entry.id} failed (attempt ${entry.attempts}/${entry.maxAttempts}), next at ${new Date(entry.nextRetryAt).toISOString()}: ${errorMsg}`)
      }
    }
  }
}
703
+
704
/**
 * Ping each distinct (provider, credentialId, apiEndpoint) combination used
 * by API-backed agents and raise a warning notification when one is
 * unreachable. CLI-backed providers (claude-cli/codex-cli/opencode-cli) are
 * excluded. Repeated failures trip a per-tuple circuit breaker with
 * exponential cooldown so a dead endpoint is not pinged every cycle.
 */
async function runProviderHealthChecks() {
  const agents = loadAgents()
  const credentials = loadCredentials()

  // Build deduplicated set of { provider, credentialId, apiEndpoint } tuples
  const seen = new Set<string>()
  const tuples: { provider: string; credentialId: string; apiEndpoint: string; agentId: string; credentialName: string }[] = []

  for (const agent of Object.values(agents) as unknown as Record<string, unknown>[]) {
    if (!agent?.id || typeof agent.id !== 'string') continue
    if (shouldSuppressSyntheticAgentHealthAlert(agent.id)) continue
    const provider = typeof agent.provider === 'string' ? agent.provider : ''
    if (!provider || ['claude-cli', 'codex-cli', 'opencode-cli'].includes(provider)) continue

    const credentialId = typeof agent.credentialId === 'string' ? agent.credentialId : ''
    const apiEndpoint = typeof agent.apiEndpoint === 'string' ? agent.apiEndpoint : ''

    // For OpenClaw, scope per agent (each may have a different gateway)
    const key = provider === 'openclaw'
      ? `openclaw:${agent.id}`
      : `${provider}:${credentialId || 'no-cred'}:${apiEndpoint}`
    if (seen.has(key)) continue
    seen.add(key)

    // Label notifications with the credential name when one exists.
    const cred = credentialId ? (credentials[credentialId] as unknown as Record<string, unknown> | undefined) : undefined
    const credName = typeof cred?.name === 'string' ? cred.name : provider

    tuples.push({
      provider,
      credentialId,
      apiEndpoint,
      agentId: agent.id,
      credentialName: credName,
    })
  }

  for (const tuple of tuples) {
    // Circuit breaker: skip providers that have failed repeatedly
    const cbKey = `${tuple.provider}:${tuple.credentialId || 'no-cred'}:${tuple.apiEndpoint}`
    const cb = ds.providerPingCircuitBreaker.get(cbKey)
    const now = Date.now()
    if (cb && cb.skipUntil > now) continue

    let apiKey: string | undefined
    if (tuple.credentialId) {
      const cred = credentials[tuple.credentialId] as unknown as Record<string, unknown> | undefined
      if (cred?.encryptedKey && typeof cred.encryptedKey === 'string') {
        try { apiKey = decryptKey(cred.encryptedKey) } catch { /* skip undecryptable */ continue }
      }
    }

    // Fall back to the provider's default endpoint when the agent has none.
    const endpoint = tuple.apiEndpoint || OPENAI_COMPATIBLE_DEFAULTS[tuple.provider]?.defaultEndpoint || undefined
    const result = await pingProvider(tuple.provider, apiKey, endpoint)

    if (!result.ok) {
      // Update circuit breaker state
      const existing = ds.providerPingCircuitBreaker.get(cbKey) || { consecutiveFailures: 0, skipUntil: 0 }
      existing.consecutiveFailures += 1
      if (existing.consecutiveFailures >= PROVIDER_PING_CB_THRESHOLD) {
        // Exponential cooldown past the threshold, capped at the max.
        const cooldown = Math.min(
          PROVIDER_PING_CB_BASE_MS * Math.pow(2, existing.consecutiveFailures - PROVIDER_PING_CB_THRESHOLD),
          PROVIDER_PING_CB_MAX_MS,
        )
        existing.skipUntil = now + cooldown
        log.info(TAG, `[health] Circuit breaker tripped for ${tuple.credentialName} — skipping pings for ${Math.round(cooldown / 60_000)}m`)
      }
      ds.providerPingCircuitBreaker.set(cbKey, existing)

      if (!shouldNotifyProviderReachabilityIssue(tuple.provider)) {
        continue
      }

      const dedupKey = `provider-down:${tuple.credentialId || tuple.provider}`

      const entityType = tuple.credentialId ? 'credential' : undefined
      const entityId = tuple.credentialId || undefined

      createNotification({
        type: 'warning',
        title: `Provider unreachable: ${tuple.credentialName}`,
        message: result.message,
        dedupKey,
        entityType,
        entityId,
      })
    } else {
      // Success — clear circuit breaker
      ds.providerPingCircuitBreaker.delete(cbKey)
    }
  }
}
795
+
796
// Maximum `doctor --fix` repair attempts per OpenClaw agent before backing off.
const OPENCLAW_REPAIR_MAX_ATTEMPTS = 3
// Cooldown after repair attempts are exhausted; the attempt counter resets once it expires.
const OPENCLAW_REPAIR_COOLDOWN_MS = 300_000 // 5 minutes
798
+
799
/**
 * Probe every OpenClaw agent's gateway and attempt auto-repair on failure.
 *
 * Per agent: up to OPENCLAW_REPAIR_MAX_ATTEMPTS `doctor --fix` runs, then an
 * OPENCLAW_REPAIR_COOLDOWN_MS cooldown before the counter resets. A recovery
 * notification is emitted when a previously-down gateway becomes reachable
 * again (tracked via ds.openclawDownAgentIds).
 */
async function runOpenClawGatewayHealthChecks() {
  const agents = loadAgents()
  const credentials = loadCredentials()

  // Build deduplicated OpenClaw agent tuples
  const seen = new Set<string>()
  const tuples: { agentId: string; endpoint: string; credentialId: string; credentialName: string }[] = []

  for (const agent of Object.values(agents) as unknown as Record<string, unknown>[]) {
    if (!agent?.id || typeof agent.id !== 'string') continue
    if (shouldSuppressSyntheticAgentHealthAlert(agent.id)) continue
    if (agent.provider !== 'openclaw') continue

    const key = `openclaw:${agent.id}`
    if (seen.has(key)) continue
    seen.add(key)

    const credentialId = typeof agent.credentialId === 'string' ? agent.credentialId : ''
    const endpoint = typeof agent.apiEndpoint === 'string' ? agent.apiEndpoint : ''
    const cred = credentialId ? (credentials[credentialId] as unknown as Record<string, unknown> | undefined) : undefined
    const credName = typeof cred?.name === 'string' ? cred.name : 'openclaw'

    tuples.push({ agentId: agent.id, endpoint, credentialId, credentialName: credName })
  }

  if (!tuples.length) return

  // Lazy-load the probe so non-OpenClaw deployments never pay for the import.
  const { probeOpenClawHealth } = await import('@/lib/server/openclaw/health')

  for (const tuple of tuples) {
    let token: string | undefined
    if (tuple.credentialId) {
      const cred = credentials[tuple.credentialId] as unknown as Record<string, unknown> | undefined
      if (cred?.encryptedKey && typeof cred.encryptedKey === 'string') {
        // Skip agents whose token cannot be decrypted.
        try { token = decryptKey(cred.encryptedKey) } catch { continue }
      }
    }

    const result = await probeOpenClawHealth({
      endpoint: tuple.endpoint || undefined,
      token,
      timeoutMs: 10_000,
    })

    const now = Date.now()

    if (result.ok) {
      // Recovered
      if (ds.openclawDownAgentIds.has(tuple.agentId)) {
        ds.openclawDownAgentIds.delete(tuple.agentId)
        ds.openclawRepairState.delete(tuple.agentId)
        createNotification({
          type: 'success',
          title: 'OpenClaw gateway recovered',
          message: `Gateway for ${tuple.credentialName} is reachable again.`,
          dedupKey: `openclaw-gw-down:${tuple.agentId}`,
        })
      }
      continue
    }

    // Unhealthy
    const repair = ds.openclawRepairState.get(tuple.agentId) || { attempts: 0, lastAttemptAt: 0, cooldownUntil: 0 }

    // In cooldown — skip
    if (repair.cooldownUntil > now) continue

    // Cooldown expired — reset
    if (repair.cooldownUntil > 0 && repair.cooldownUntil <= now) {
      repair.attempts = 0
      repair.cooldownUntil = 0
    }

    ds.openclawDownAgentIds.add(tuple.agentId)

    if (repair.attempts < OPENCLAW_REPAIR_MAX_ATTEMPTS) {
      // Best-effort auto-repair; a failed doctor run still counts as an attempt.
      try {
        const { runOpenClawDoctor } = await import('@/lib/server/openclaw/doctor')
        await runOpenClawDoctor({ fix: true })
      } catch (err: unknown) {
        log.warn(TAG, '[daemon] openclaw doctor --fix failed:', errorMessage(err))
      }
      repair.attempts += 1
      repair.lastAttemptAt = now
    } else {
      // Attempts exhausted — enter cooldown before trying again.
      repair.cooldownUntil = now + OPENCLAW_REPAIR_COOLDOWN_MS
    }

    ds.openclawRepairState.set(tuple.agentId, repair)

    createNotification({
      type: 'error',
      title: `OpenClaw gateway unreachable: ${tuple.credentialName}`,
      message: result.error || 'Health check failed',
      dedupKey: `openclaw-gw-down:${tuple.agentId}`,
    })
  }
}
897
+
898
/**
 * Prune orphaned entries from module-level Maps/Sets that reference
 * sessions, connectors, or agents that no longer exist in storage.
 * Runs every health-check cycle (2 minutes).
 *
 * @param sessions - The current session store snapshot (keyed by session id),
 *                   passed in by the caller to avoid a redundant load.
 */
function pruneOrphanedState(sessions: Record<string, unknown>): void {
  const liveSessionIds = new Set(Object.keys(sessions))

  // Main-loop state map (per-session autonomous state)
  pruneMainLoopState(liveSessionIds)

  // Heartbeat service tracking maps
  pruneHeartbeatState(liveSessionIds)

  // System event queues for dead sessions
  pruneSystemEventQueues(liveSessionIds)

  // Subagent lineage/handle registry — remove finished subagent state older than 30 min
  cleanupFinishedSubagents()

  // Process manager — sweep completed processes older than TTL
  sweepManagedProcesses()

  // Reap orphaned sandbox containers from prior crashes
  // (fire-and-forget; failures are logged, never thrown).
  reapOrphanedSandboxContainers().catch((err) => {
    log.warn(TAG, '[daemon] Orphaned sandbox reap failed:', typeof err === 'object' && err !== null && 'message' in err ? (err as Error).message : String(err))
  })

  // Daemon-local: prune openclawRepairState for agents that no longer exist
  const agents = loadAgents()
  for (const agentId of ds.openclawRepairState.keys()) {
    if (!agents[agentId]) ds.openclawRepairState.delete(agentId)
  }
  for (const agentId of ds.openclawDownAgentIds) {
    if (!agents[agentId]) ds.openclawDownAgentIds.delete(agentId)
  }

  // Orchestrator event queues for dead agents
  const liveAgentIds = new Set(Object.keys(agents))
  pruneOrchestratorEventQueues(liveAgentIds)

  // Orchestrator wake/failure/dailyCycles Maps for deleted agents
  pruneOrchestratorState(liveAgentIds)

  // Connector tracking Maps for deleted connectors
  const connectors = loadConnectors()
  pruneConnectorTrackingState(new Set(Object.keys(connectors)))

  // Prune circuit breaker entries for providers that no longer have any agent referencing them
  // (key shape must stay in sync with cbKey in runProviderHealthChecks).
  const liveProviderKeys = new Set<string>()
  for (const agent of Object.values(agents) as unknown as Record<string, unknown>[]) {
    if (!agent?.id) continue
    const p = typeof agent.provider === 'string' ? agent.provider : ''
    const c = typeof agent.credentialId === 'string' ? agent.credentialId : ''
    const e = typeof agent.apiEndpoint === 'string' ? agent.apiEndpoint : ''
    if (p) liveProviderKeys.add(`${p}:${c || 'no-cred'}:${e}`)
  }
  for (const key of ds.providerPingCircuitBreaker.keys()) {
    if (!liveProviderKeys.has(key)) ds.providerPingCircuitBreaker.delete(key)
  }
}
959
+
960
+ async function runMemoryMaintenanceTick(): Promise<void> {
961
+ try {
962
+ const memDb = getMemoryDb()
963
+ const result = memDb.maintain({ dedupe: true, pruneWorking: true, ttlHours: 24 })
964
+ if (result.deduped > 0 || result.pruned > 0) {
965
+ log.info(TAG, `[daemon] Memory maintenance: deduped=${result.deduped}, pruned=${result.pruned}`)
966
+ }
967
+ } catch (err: unknown) {
968
+ log.warn(TAG, '[daemon] Memory maintenance tick failed:', err instanceof Error ? err.message : String(err))
969
+ }
970
+ }
971
+
972
/**
 * Main daemon health sweep, run on every health-monitor tick.
 *
 * In order: task-queue validation and stall recovery → stuck-run watchdog →
 * heartbeat staleness detection (alert on first stale transition, auto-disable
 * after a longer threshold) → provider / OpenClaw gateway reachability →
 * integrity drift monitoring → webhook retry draining → memory hygiene and
 * retention pruning → memory DB maintenance → idle-window callback drain.
 * Each step is individually try/caught so one failure cannot stop the rest.
 */
async function runHealthChecks() {
  // Continuously keep the completed queue honest.
  validateCompletedTasksQueue()
  recoverStalledRunningTasks()

  // Watchdog: abort runs stuck in running state beyond their timeout threshold.
  try {
    const stuck = sweepStuckRuns()
    if (stuck.aborted > 0) {
      log.info(TAG, `[daemon] Watchdog: aborted ${stuck.aborted} stuck run(s)`)
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Stuck-run watchdog failed:', err instanceof Error ? err.message : String(err))
  }

  // Keep heartbeat state in sync with task terminal states even without daemon restarts.
  cleanupFinishedTaskSessions()

  // Re-queue deferred tasks whose agents have become available again.
  // Best-effort: errors here are intentionally swallowed.
  try { promoteDeferred() } catch {}

  const sessions = loadSessions()
  const now = Date.now()
  // Sessions found stale this tick (used below to clear recovered ones).
  const currentlyStale = new Set<string>()
  // Sessions mutated in-memory that must be persisted at the end.
  const dirtySessionIds: string[] = []

  for (const session of Object.values(sessions) as unknown as Record<string, unknown>[]) {
    if (!session?.id || typeof session.id !== 'string') continue
    if (session.heartbeatEnabled !== true) continue

    const sessionId = session.id
    if (shouldSuppressSessionHeartbeatHealthAlert(session as Pick<Session, 'id' | 'name' | 'user' | 'shortcutForAgentId'>)) {
      ds.staleSessionIds.delete(sessionId)
      continue
    }

    const sessionLabel = String(session.name || sessionId)
    const intervalSec = parseHeartbeatIntervalSec(session.heartbeatIntervalSec, DEFAULT_HEARTBEAT_INTERVAL_SEC)
    if (intervalSec <= 0) continue
    // Stale threshold scales with the configured interval, floored at STALE_MIN_MS.
    const staleAfter = Math.max(intervalSec * STALE_MULTIPLIER * 1000, STALE_MIN_MS)
    const lastActive = typeof session.lastActiveAt === 'number' ? session.lastActiveAt : 0
    if (lastActive <= 0) continue

    const staleForMs = now - lastActive
    if (staleForMs > staleAfter) {
      // Much longer silence → turn the heartbeat off entirely and alert.
      const autoDisableAfter = Math.max(intervalSec * STALE_AUTO_DISABLE_MULTIPLIER * 1000, STALE_AUTO_DISABLE_MIN_MS)
      if (staleForMs > autoDisableAfter) {
        session.heartbeatEnabled = false
        session.lastActiveAt = now
        dirtySessionIds.push(sessionId)
        ds.staleSessionIds.delete(sessionId)
        await sendHealthAlert({
          text: `Auto-disabled heartbeat for stale session "${sessionLabel}" after ${Math.round(staleForMs / 60_000)}m of inactivity.`,
          dedupKey: buildSessionHeartbeatHealthDedupKey(sessionId, 'auto-disabled'),
          entityType: 'session',
          entityId: sessionId,
        })
        continue
      }

      currentlyStale.add(sessionId)
      // Only alert on transition from healthy → stale (once per stale episode)
      if (!ds.staleSessionIds.has(sessionId)) {
        ds.staleSessionIds.add(sessionId)
        await sendHealthAlert({
          text: `Session "${sessionLabel}" heartbeat appears stale (last active ${(Math.round(staleForMs / 1000))}s ago, interval ${intervalSec}s).`,
          dedupKey: buildSessionHeartbeatHealthDedupKey(sessionId, 'stale'),
          entityType: 'session',
          entityId: sessionId,
        })
      }
    }
  }

  // Clear recovered sessions so they can re-alert if they go stale again later
  for (const id of ds.staleSessionIds) {
    if (!currentlyStale.has(id)) {
      ds.staleSessionIds.delete(id)
    }
  }

  // Persist sessions whose heartbeat was auto-disabled above.
  // NOTE(review): the dynamic import runs once per dirty session; Node caches
  // module loads so this is correct, but hoisting it above the loop would be tidier.
  for (const sid of dirtySessionIds) {
    const s = sessions[sid]
    if (s) {
      const { upsertSession: upsert } = await import('@/lib/server/storage')
      upsert(sid, s)
    }
  }

  // Provider reachability checks
  try {
    await runProviderHealthChecks()
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Provider health check failed:', errorMessage(err))
  }

  // OpenClaw gateway health checks + auto-repair
  try {
    await runOpenClawGatewayHealthChecks()
  } catch (err: unknown) {
    log.error(TAG, '[daemon] OpenClaw gateway health check failed:', errorMessage(err))
  }

  // Integrity drift monitoring for identity/config/extension files.
  try {
    const integrity = runIntegrityMonitor(loadSettings())
    ds.lastIntegrityCheckAt = integrity.checkedAt
    ds.lastIntegrityDriftCount = integrity.drifts.length
    if (integrity.drifts.length > 0) {
      for (const drift of integrity.drifts) {
        // Prefer a repo-relative path in the message when the file is inside cwd.
        const rel = path.relative(process.cwd(), drift.filePath)
        const shortPath = rel && !rel.startsWith('..') ? rel : drift.filePath
        const action = drift.type === 'created'
          ? 'created'
          : drift.type === 'deleted'
          ? 'deleted'
          : 'modified'
        createNotification({
          type: drift.type === 'deleted' ? 'error' : 'warning',
          title: `Integrity drift detected (${drift.kind})`,
          message: `${shortPath} was ${action}.`,
          dedupKey: `integrity:${drift.id}:${drift.nextHash || 'missing'}`,
          entityType: 'session',
          entityId: drift.id,
        })
      }
      await sendHealthAlert(`Integrity monitor detected ${integrity.drifts.length} file drift event(s).`)
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Integrity monitor check failed:', errorMessage(err))
  }

  // Process webhook retry queue
  try {
    await processWebhookRetries()
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Webhook retry processing failed:', errorMessage(err))
  }

  // Periodic memory hygiene: prune orphaned state for deleted sessions/connectors
  try {
    pruneOrphanedState(sessions)
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Memory hygiene sweep failed:', errorMessage(err))
  }

  // Prune old terminal runs and their events to prevent unbounded growth
  try {
    const pruned = pruneOldRuns()
    if (pruned.prunedRuns > 0 || pruned.prunedEvents > 0) {
      log.info(TAG, `[daemon] Pruned ${pruned.prunedRuns} old run(s) and ${pruned.prunedEvents} run event(s)`)
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Run pruning failed:', err instanceof Error ? err.message : String(err))
  }

  // Prune expired runtime locks
  try {
    const locksRemoved = pruneExpiredLocks()
    if (locksRemoved > 0) {
      log.info(TAG, `[daemon] Pruned ${locksRemoved} expired lock(s)`)
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Lock pruning failed:', err instanceof Error ? err.message : String(err))
  }

  // Prune old execution logs (30-day retention)
  try {
    const logsRemoved = clearLogsByAge(30 * 24 * 3600_000)
    if (logsRemoved > 0) {
      log.info(TAG, `[daemon] Pruned ${logsRemoved} old execution log(s)`)
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Execution log pruning failed:', errorMessage(err))
  }

  // Prune old usage records (90-day retention)
  try {
    const usageRemoved = pruneOldUsage(90 * 24 * 3600_000)
    if (usageRemoved > 0) {
      log.info(TAG, `[daemon] Pruned ${usageRemoved} old usage record(s)`)
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Usage pruning failed:', errorMessage(err))
  }

  // Periodic memory database maintenance (dedup + TTL pruning)
  try {
    await runMemoryMaintenanceTick()
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Memory maintenance failed:', err instanceof Error ? err.message : String(err))
  }

  // Drain idle-window callbacks when the system is quiet
  try {
    await drainIdleWindowCallbacks()
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Idle-window drain failed:', err instanceof Error ? err.message : String(err))
  }
}
1172
+
1173
+ function startHealthMonitor() {
1174
+ if (ds.healthIntervalId) return
1175
+ ds.healthIntervalId = setInterval(() => {
1176
+ if (ds.healthCheckRunning || ds.shuttingDown) return
1177
+ ds.healthCheckRunning = true
1178
+ runHealthChecks()
1179
+ .catch((err) => {
1180
+ log.error(TAG, '[daemon] Health monitor tick failed:', err?.message || String(err))
1181
+ })
1182
+ .finally(() => { ds.healthCheckRunning = false })
1183
+ }, HEALTH_CHECK_INTERVAL)
1184
+ }
1185
+
1186
+ function stopHealthMonitor() {
1187
+ if (ds.healthIntervalId) {
1188
+ clearInterval(ds.healthIntervalId)
1189
+ ds.healthIntervalId = null
1190
+ }
1191
+ }
1192
+
1193
+ function syncDaemonBackgroundServices(options?: { runConnectorHealthCheckImmediately?: boolean }) {
1194
+ if (isDaemonBackgroundServicesEnabled()) {
1195
+ startHealthMonitor()
1196
+ startConnectorHealthMonitor({
1197
+ runImmediately: options?.runConnectorHealthCheckImmediately !== false,
1198
+ })
1199
+ startConnectorOutboxWorker()
1200
+ startEvalScheduler()
1201
+ return
1202
+ }
1203
+ stopHealthMonitor()
1204
+ stopConnectorHealthMonitor()
1205
+ stopConnectorOutboxWorker()
1206
+ stopEvalScheduler()
1207
+ }
1208
+
1209
+ function startConnectorHealthMonitor(options?: { runImmediately?: boolean }) {
1210
+ if (ds.connectorHealthIntervalId) return
1211
+
1212
+ const tick = () => {
1213
+ if (ds.connectorHealthCheckRunning || ds.shuttingDown) return
1214
+ ds.connectorHealthCheckRunning = true
1215
+ runConnectorHealthChecks(Date.now())
1216
+ .catch((err) => {
1217
+ log.error(TAG, '[daemon] Connector health tick failed:', errorMessage(err))
1218
+ })
1219
+ .finally(() => { ds.connectorHealthCheckRunning = false })
1220
+ }
1221
+
1222
+ if (options?.runImmediately !== false) tick()
1223
+ ds.connectorHealthIntervalId = setInterval(tick, CONNECTOR_HEALTH_CHECK_INTERVAL)
1224
+ }
1225
+
1226
+ function stopConnectorHealthMonitor() {
1227
+ if (ds.connectorHealthIntervalId) {
1228
+ clearInterval(ds.connectorHealthIntervalId)
1229
+ ds.connectorHealthIntervalId = null
1230
+ }
1231
+ }
1232
+
1233
/**
 * Fire one memory-consolidation pass (fire-and-forget; returns void).
 * Lazily imports the consolidation module, wires the idle-window callbacks,
 * then runs the daily consolidation — logging digest/prune/dedupe counts and
 * any per-step errors. All failures end in the final .catch; nothing throws.
 */
function runConsolidationTick() {
  import('@/lib/server/memory/memory-consolidation').then(({ runDailyConsolidation, registerConsolidationIdleCallback, registerCompactionIdleCallback }) => {
    // Wire idle-window callbacks so consolidation and compaction run during quiet periods
    registerConsolidationIdleCallback()
    registerCompactionIdleCallback()

    return runDailyConsolidation().then((stats) => {
      if (stats.digests > 0 || stats.pruned > 0 || stats.deduped > 0) {
        log.info(TAG, `[daemon] Memory consolidation: ${stats.digests} digest(s), ${stats.pruned} pruned, ${stats.deduped} deduped`)
      }
      if (stats.errors.length > 0) {
        log.warn(TAG, `[daemon] Memory consolidation errors: ${stats.errors.join('; ')}`)
      }
    })
  }).catch((err: unknown) => {
    log.error(TAG, '[daemon] Memory consolidation failed:', errorMessage(err))
  })
}
1251
+
1252
+ function startMemoryConsolidation() {
1253
+ if (ds.memoryConsolidationTimeoutId || ds.memoryConsolidationIntervalId) return
1254
+ // Deferred first run, then repeat on interval
1255
+ ds.memoryConsolidationTimeoutId = setTimeout(() => {
1256
+ ds.memoryConsolidationTimeoutId = null
1257
+ runConsolidationTick()
1258
+ ds.memoryConsolidationIntervalId = setInterval(runConsolidationTick, MEMORY_CONSOLIDATION_INTERVAL)
1259
+ }, MEMORY_CONSOLIDATION_INITIAL_DELAY)
1260
+ }
1261
+
1262
+ function stopMemoryConsolidation() {
1263
+ if (ds.memoryConsolidationTimeoutId) {
1264
+ clearTimeout(ds.memoryConsolidationTimeoutId)
1265
+ ds.memoryConsolidationTimeoutId = null
1266
+ }
1267
+ if (ds.memoryConsolidationIntervalId) {
1268
+ clearInterval(ds.memoryConsolidationIntervalId)
1269
+ ds.memoryConsolidationIntervalId = null
1270
+ }
1271
+ }
1272
+
1273
// --- Eval scheduler ---

// Fallback eval cadence when no (valid) cron expression is configured.
const EVAL_DEFAULT_INTERVAL_MS = 24 * 3600_000 // 24 hours
1276
+
1277
/**
 * One eval-scheduler pass. When autonomy evals are enabled in settings, run
 * the eval suite for every agent that has heartbeatEnabled, log each score,
 * and create a per-agent notification ('info' at >= 60%, 'warning' below).
 * Per-agent failures are isolated so one broken agent does not stop the rest;
 * the outer catch keeps scheduler errors out of the interval callback.
 */
async function runEvalSchedulerTick() {
  try {
    const settings = loadSettings()
    if (!settings.autonomyEvalEnabled) return

    // Lazy import keeps the eval runner out of the daemon's startup path.
    const { runEvalSuite } = await import('@/lib/server/eval/runner')
    const agents = loadAgents()
    const heartbeatAgentIds = Object.keys(agents).filter(
      (id) => agents[id].heartbeatEnabled === true,
    )

    for (const agentId of heartbeatAgentIds) {
      try {
        const result = await runEvalSuite(agentId)
        log.info(TAG,
          `[daemon:eval] Agent ${agents[agentId].name}: ${result.percentage}% (${result.totalScore}/${result.maxScore})`,
        )
        createNotification({
          title: `Eval: ${agents[agentId].name} scored ${result.percentage}%`,
          message: `${result.runs.length} scenarios, ${result.totalScore}/${result.maxScore} points`,
          type: result.percentage >= 60 ? 'info' : 'warning',
        })
      } catch (err: unknown) {
        log.error(TAG, `[daemon:eval] Failed for agent ${agentId}:`, errorMessage(err))
      }
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon:eval] Scheduler tick error:', errorMessage(err))
  }
}
1307
+
1308
+ function startEvalScheduler() {
1309
+ if (ds.evalSchedulerIntervalId) return
1310
+ try {
1311
+ const settings = loadSettings()
1312
+ if (!settings.autonomyEvalEnabled) return
1313
+ const intervalMs = parseCronToMs(settings.autonomyEvalCron, EVAL_DEFAULT_INTERVAL_MS) || EVAL_DEFAULT_INTERVAL_MS
1314
+ ds.evalSchedulerIntervalId = setInterval(runEvalSchedulerTick, intervalMs)
1315
+ log.info(TAG, `[daemon:eval] Eval scheduler started (interval=${Math.round(intervalMs / 3600_000)}h)`)
1316
+ } catch {
1317
+ // Eval scheduling is optional — don't block daemon start
1318
+ }
1319
+ }
1320
+
1321
+ function stopEvalScheduler() {
1322
+ if (ds.evalSchedulerIntervalId) {
1323
+ clearInterval(ds.evalSchedulerIntervalId)
1324
+ ds.evalSchedulerIntervalId = null
1325
+ }
1326
+ }
1327
+
1328
// How often (ms) the daemon sweeps for swarm runs that have exceeded their timeout.
const SWARM_TIMEOUT_CHECK_INTERVAL = 30_000
1329
+
1330
+ function startSwarmTimeoutChecker() {
1331
+ if (ds.swarmTimeoutIntervalId) return
1332
+ ds.swarmTimeoutIntervalId = setInterval(() => {
1333
+ if (!ds.running || ds.shuttingDown) return
1334
+ try {
1335
+ checkSwarmTimeouts()
1336
+ } catch (err: unknown) {
1337
+ log.error(TAG, `[daemon] Swarm timeout check error: ${errorMessage(err)}`)
1338
+ }
1339
+ }, SWARM_TIMEOUT_CHECK_INTERVAL)
1340
+ }
1341
+
1342
+ function stopSwarmTimeoutChecker() {
1343
+ if (ds.swarmTimeoutIntervalId) {
1344
+ clearInterval(ds.swarmTimeoutIntervalId)
1345
+ ds.swarmTimeoutIntervalId = null
1346
+ }
1347
+ }
1348
+
1349
/**
 * Re-arm long-lived daemon timers after a dev/HMR module reload so their
 * callbacks close over the CURRENT module's code instead of a stale closure.
 * No-op when the daemon is not running.
 */
function refreshDaemonTimersForHotReload() {
  if (!ds.running) return

  // Queue processor: restart in place.
  if (ds.queueIntervalId) {
    clearInterval(ds.queueIntervalId)
    ds.queueIntervalId = null
    startQueueProcessor()
  }

  // Browser sweep: restart in place.
  if (ds.browserSweepId) {
    clearInterval(ds.browserSweepId)
    ds.browserSweepId = null
    startBrowserSweep()
  }

  // Health monitors are only cleared here — presumably re-armed by
  // syncDaemonBackgroundServices() below. TODO confirm.
  if (ds.healthIntervalId) {
    clearInterval(ds.healthIntervalId)
    ds.healthIntervalId = null
  }

  if (ds.connectorHealthIntervalId) {
    clearInterval(ds.connectorHealthIntervalId)
    ds.connectorHealthIntervalId = null
  }

  // Memory consolidation: restart whichever phase (initial delay or interval) was armed.
  if (ds.memoryConsolidationTimeoutId || ds.memoryConsolidationIntervalId) {
    stopMemoryConsolidation()
    startMemoryConsolidation()
  }

  // NOTE(review): unlike the timers above, the eval scheduler is stopped but not
  // restarted here. Verify syncDaemonBackgroundServices() re-arms it; otherwise a
  // hot reload silently disables scheduled evals until the daemon restarts.
  if (ds.evalSchedulerIntervalId) {
    stopEvalScheduler()
  }

  // Swarm timeout sweep: restart in place.
  if (ds.swarmTimeoutIntervalId) {
    stopSwarmTimeoutChecker()
    startSwarmTimeoutChecker()
  }

  syncDaemonBackgroundServices()
}
1390
+
1391
// In dev/HMR, the daemon state survives on globalThis while interval callbacks keep
// the old module closure alive. Refresh long-lived timers so they always run the
// current module's logic instead of stale health-alert code paths.
// Module-level side effect: runs once on every (re)evaluation of this module.
refreshDaemonTimersForHotReload()
1395
+
1396
+ export async function runDaemonHealthCheckNow() {
1397
+ // Bypass circuit breaker for manual/forced checks
1398
+ ds.providerPingCircuitBreaker.clear()
1399
+ await Promise.all([
1400
+ runHealthChecks(),
1401
+ runConnectorHealthChecks(Date.now()),
1402
+ ])
1403
+ }
1404
+
1405
/**
 * Test hook: run a single connector health pass at an injectable timestamp.
 * Unlike runDaemonHealthCheckNow, this does not clear circuit breakers.
 */
export async function runConnectorHealthCheckNowForTest(now = Date.now()) {
  await runConnectorHealthChecks(now)
}
1408
+
1409
/**
 * Full daemon status snapshot for the status API.
 * Aggregates cached in-memory state (`ds`) with persisted stores (estop,
 * queue, schedules, webhook retry queue, connector reconnect states).
 * Read-only: no probes or side effects.
 */
export function getDaemonStatus() {
  const estop = loadEstopState()
  const queue = loadQueue()
  const schedules = loadSchedules()
  const reconnectStates = Object.values(getAllReconnectStates())

  // Find next scheduled task: earliest nextRunAt among active schedules.
  // `!nextScheduled` also matches 0, which is fine for epoch-ms timestamps.
  let nextScheduled: number | null = null
  for (const s of Object.values(schedules) as unknown as Record<string, unknown>[]) {
    if (s.status === 'active' && s.nextRunAt) {
      if (!nextScheduled || (s.nextRunAt as number) < nextScheduled) {
        nextScheduled = s.nextRunAt as number
      }
    }
  }

  // Webhook retry queue stats: dead-lettered entries are excluded from pending.
  const retryQueue = loadWebhookRetryQueue()
  const retryEntries = Object.values(retryQueue) as WebhookRetryEntry[]
  const pendingRetries = retryEntries.filter(e => !e.deadLettered).length
  const deadLettered = retryEntries.filter(e => e.deadLettered).length

  return {
    running: ds.running,
    schedulerActive: ds.running, // scheduler is active iff the daemon loop runs
    autostartEnabled: daemonAutostartEnvEnabled(),
    backgroundServicesEnabled: isDaemonBackgroundServicesEnabled(),
    reducedMode: !isDaemonBackgroundServicesEnabled(), // inverse of the flag above
    manualStopRequested: ds.manualStopRequested,
    estop,
    queueLength: queue.length,
    lastProcessed: ds.lastProcessedAt,
    nextScheduled,
    heartbeat: getHeartbeatServiceStatus(),
    health: {
      // Monitor flags reflect whether the corresponding intervals are armed.
      monitorActive: !!ds.healthIntervalId,
      connectorMonitorActive: !!ds.connectorHealthIntervalId,
      staleSessions: ds.staleSessionIds.size,
      connectorsInBackoff: reconnectStates.filter((state) => !state.exhausted).length,
      connectorsExhausted: reconnectStates.filter((state) => state.exhausted).length,
      checkIntervalSec: Math.trunc(HEALTH_CHECK_INTERVAL / 1000),
      connectorCheckIntervalSec: Math.trunc(CONNECTOR_HEALTH_CHECK_INTERVAL / 1000),
      integrity: {
        // Integrity monitoring defaults to on when the setting is absent.
        enabled: loadSettings().integrityMonitorEnabled !== false,
        lastCheckedAt: ds.lastIntegrityCheckAt,
        lastDriftCount: ds.lastIntegrityDriftCount,
      },
    },
    webhookRetry: {
      pendingRetries,
      deadLettered,
    },
    guards: {
      // Re-entrancy / shutdown guards, surfaced for debugging.
      healthCheckRunning: ds.healthCheckRunning,
      connectorHealthCheckRunning: ds.connectorHealthCheckRunning,
      shuttingDown: ds.shuttingDown,
      providerCircuitBreakers: ds.providerPingCircuitBreaker.size,
    },
  }
}
1469
+
1470
/**
 * Lightweight health summary safe for external consumption.
 * Reads cached state only — no probes or side effects.
 *
 * Overall `ok` requires: daemon running, no estop, and not every known
 * provider in circuit-breaker cooldown.
 */
export function getDaemonHealthSummary(): {
  ok: boolean
  uptime: number
  components: {
    daemon: { status: 'healthy' | 'stopped' | 'degraded' }
    connectors: { healthy: number; errored: number; total: number }
    providers: { healthy: number; cooldown: number; total: number }
    gateways: { healthy: number; degraded: number; total: number }
  }
  estop: boolean
  nextScheduledTask: number | null
} {
  const estopState = loadEstopState()
  // Any estop level other than 'none' counts as active.
  const estopActive = estopState.level !== 'none'

  // Daemon status: stopped beats degraded; estop degrades a running daemon.
  const daemonStatus: 'healthy' | 'stopped' | 'degraded' = !ds.running
    ? 'stopped'
    : estopActive ? 'degraded' : 'healthy'

  // Connector summary: only enabled connectors are counted; anything not
  // reporting 'running' (or lacking a string id) is bucketed as errored.
  const connectors = loadConnectors()
  const connectorEntries = Object.values(connectors) as unknown as Record<string, unknown>[]
  const enabledConnectors = connectorEntries.filter(c => c?.isEnabled === true)
  let healthyConnectors = 0
  let erroredConnectors = 0
  for (const c of enabledConnectors) {
    if (typeof c.id === 'string' && getConnectorStatus(c.id) === 'running') {
      healthyConnectors++
    } else {
      erroredConnectors++
    }
  }

  // Provider summary (based on circuit breaker state).
  // Deduplicate by provider+credential+endpoint; CLI providers are excluded
  // since they are not pinged via the circuit breaker.
  const agents = loadAgents()
  const agentEntries = Object.values(agents) as unknown as Record<string, unknown>[]
  const providerKeys = new Set<string>()
  for (const agent of agentEntries) {
    if (!agent?.id || typeof agent.id !== 'string') continue
    const provider = typeof agent.provider === 'string' ? agent.provider : ''
    if (!provider || ['claude-cli', 'codex-cli', 'opencode-cli'].includes(provider)) continue
    const credentialId = typeof agent.credentialId === 'string' ? agent.credentialId : ''
    const apiEndpoint = typeof agent.apiEndpoint === 'string' ? agent.apiEndpoint : ''
    providerKeys.add(`${provider}:${credentialId || 'no-cred'}:${apiEndpoint}`)
  }
  const now = Date.now()
  let cooldownProviders = 0
  for (const key of providerKeys) {
    const cb = ds.providerPingCircuitBreaker.get(key)
    // A breaker entry with skipUntil in the future means the provider is in cooldown.
    if (cb && cb.skipUntil > now) cooldownProviders++
  }

  // Gateway summary (OpenClaw gateways): total = known-down ids + live openclaw agents.
  // NOTE(review): ids lingering in openclawDownAgentIds for agents that no longer
  // exist would inflate both total and degraded — confirm the set is pruned on
  // agent removal.
  const totalGateways = ds.openclawDownAgentIds.size
    + agentEntries.filter(a => a?.provider === 'openclaw' && !ds.openclawDownAgentIds.has(a.id as string)).length
  const degradedGateways = ds.openclawDownAgentIds.size

  // Next scheduled task: earliest nextRunAt among active schedules
  // (same scan as getDaemonStatus).
  const schedules = loadSchedules()
  let nextScheduled: number | null = null
  for (const s of Object.values(schedules) as unknown as Record<string, unknown>[]) {
    if (s.status === 'active' && s.nextRunAt) {
      if (!nextScheduled || (s.nextRunAt as number) < nextScheduled) {
        nextScheduled = s.nextRunAt as number
      }
    }
  }

  // With zero known providers, allProvidersDown stays false by design.
  const allProvidersDown = providerKeys.size > 0 && cooldownProviders >= providerKeys.size
  const ok = ds.running && !estopActive && !allProvidersDown

  return {
    ok,
    uptime: Math.trunc(process.uptime()), // whole seconds since process start
    components: {
      daemon: { status: daemonStatus },
      connectors: {
        healthy: healthyConnectors,
        errored: erroredConnectors,
        total: enabledConnectors.length,
      },
      providers: {
        healthy: providerKeys.size - cooldownProviders,
        cooldown: cooldownProviders,
        total: providerKeys.size,
      },
      gateways: {
        healthy: totalGateways - degradedGateways,
        degraded: degradedGateways,
        total: totalGateways,
      },
    },
    estop: estopActive,
    nextScheduledTask: nextScheduled,
  }
}