@swarmclawai/swarmclaw 1.2.1 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. package/README.md +16 -85
  2. package/bin/server-cmd.js +64 -1
  3. package/package.json +2 -2
  4. package/skills/coding-agent/SKILL.md +111 -0
  5. package/skills/github/SKILL.md +140 -0
  6. package/skills/nano-banana-pro/SKILL.md +62 -0
  7. package/skills/nano-banana-pro/scripts/generate_image.py +235 -0
  8. package/skills/nano-pdf/SKILL.md +53 -0
  9. package/skills/openai-image-gen/SKILL.md +78 -0
  10. package/skills/openai-image-gen/scripts/gen.py +328 -0
  11. package/skills/resourceful-problem-solving/SKILL.md +49 -0
  12. package/skills/skill-creator/SKILL.md +147 -0
  13. package/skills/skill-creator/scripts/init_skill.py +378 -0
  14. package/skills/skill-creator/scripts/quick_validate.py +159 -0
  15. package/skills/summarize/SKILL.md +77 -0
  16. package/src/app/api/auth/route.ts +20 -5
  17. package/src/app/api/chats/[id]/devserver/route.ts +13 -19
  18. package/src/app/api/chats/[id]/messages/route.ts +13 -15
  19. package/src/app/api/chats/[id]/route.ts +9 -10
  20. package/src/app/api/chats/[id]/stop/route.ts +5 -7
  21. package/src/app/api/chats/messages-route.test.ts +8 -6
  22. package/src/app/api/chats/route.ts +9 -10
  23. package/src/app/api/ip/route.ts +2 -2
  24. package/src/app/api/preview-server/route.ts +1 -1
  25. package/src/app/api/projects/[id]/route.ts +7 -46
  26. package/src/cli/server-cmd.test.js +74 -0
  27. package/src/components/chat/chat-area.tsx +45 -23
  28. package/src/components/chat/message-bubble.test.ts +35 -0
  29. package/src/components/chat/message-bubble.tsx +19 -9
  30. package/src/components/chat/message-list.tsx +37 -3
  31. package/src/components/input/chat-input.tsx +34 -14
  32. package/src/components/openclaw/openclaw-deploy-panel.tsx +4 -0
  33. package/src/instrumentation.ts +1 -1
  34. package/src/lib/chat/assistant-render-id.ts +3 -0
  35. package/src/lib/chat/chat-streaming-state.test.ts +42 -3
  36. package/src/lib/chat/chat-streaming-state.ts +20 -8
  37. package/src/lib/chat/queued-message-queue.test.ts +23 -1
  38. package/src/lib/chat/queued-message-queue.ts +11 -2
  39. package/src/lib/providers/cli-utils.test.ts +124 -0
  40. package/src/lib/server/activity/activity-log.ts +21 -0
  41. package/src/lib/server/agents/agent-availability.test.ts +10 -5
  42. package/src/lib/server/agents/agent-cascade.ts +79 -59
  43. package/src/lib/server/agents/agent-registry.ts +3 -1
  44. package/src/lib/server/agents/agent-repository.ts +90 -0
  45. package/src/lib/server/agents/delegation-job-repository.ts +53 -0
  46. package/src/lib/server/agents/delegation-jobs.ts +11 -4
  47. package/src/lib/server/agents/guardian-checkpoint-repository.ts +35 -0
  48. package/src/lib/server/agents/guardian.ts +2 -2
  49. package/src/lib/server/agents/main-agent-loop.ts +10 -3
  50. package/src/lib/server/agents/main-loop-state-repository.ts +38 -0
  51. package/src/lib/server/agents/subagent-runtime.ts +9 -6
  52. package/src/lib/server/agents/subagent-swarm.ts +3 -2
  53. package/src/lib/server/agents/task-session.ts +3 -4
  54. package/src/lib/server/approvals/approval-repository.ts +30 -0
  55. package/src/lib/server/autonomy/supervisor-incident-repository.ts +42 -0
  56. package/src/lib/server/chat-execution/chat-execution-types.ts +38 -0
  57. package/src/lib/server/chat-execution/chat-execution-utils.ts +1 -1
  58. package/src/lib/server/chat-execution/chat-execution.ts +84 -1926
  59. package/src/lib/server/chat-execution/chat-turn-finalization.ts +620 -0
  60. package/src/lib/server/chat-execution/chat-turn-partial-persistence.ts +221 -0
  61. package/src/lib/server/chat-execution/chat-turn-preflight.ts +133 -0
  62. package/src/lib/server/chat-execution/chat-turn-preparation.ts +817 -0
  63. package/src/lib/server/chat-execution/chat-turn-stream-execution.ts +296 -0
  64. package/src/lib/server/chat-execution/chat-turn-tool-routing.ts +5 -5
  65. package/src/lib/server/chat-execution/message-classifier.test.ts +329 -0
  66. package/src/lib/server/chat-execution/post-stream-finalization.ts +1 -1
  67. package/src/lib/server/chat-execution/prompt-builder.ts +11 -0
  68. package/src/lib/server/chat-execution/prompt-sections.ts +5 -6
  69. package/src/lib/server/chat-execution/situational-awareness.ts +12 -7
  70. package/src/lib/server/chat-execution/stream-agent-chat.ts +16 -13
  71. package/src/lib/server/chatrooms/chatroom-repository.ts +32 -0
  72. package/src/lib/server/connectors/connector-repository.ts +58 -0
  73. package/src/lib/server/connectors/runtime-state.test.ts +117 -0
  74. package/src/lib/server/credentials/credential-repository.ts +7 -0
  75. package/src/lib/server/gateways/gateway-profile-repository.ts +4 -0
  76. package/src/lib/server/memory/memory-abstract.test.ts +59 -0
  77. package/src/lib/server/missions/mission-repository.ts +74 -0
  78. package/src/lib/server/missions/mission-service/actions.ts +6 -0
  79. package/src/lib/server/missions/mission-service/bindings.ts +9 -0
  80. package/src/lib/server/missions/mission-service/context.ts +4 -0
  81. package/src/lib/server/missions/mission-service/core.ts +2269 -0
  82. package/src/lib/server/missions/mission-service/queries.ts +12 -0
  83. package/src/lib/server/missions/mission-service/recovery.ts +5 -0
  84. package/src/lib/server/missions/mission-service/ticks.ts +9 -0
  85. package/src/lib/server/missions/mission-service.test.ts +9 -2
  86. package/src/lib/server/missions/mission-service.ts +6 -2266
  87. package/src/lib/server/openclaw/deploy.test.ts +42 -3
  88. package/src/lib/server/openclaw/deploy.ts +26 -12
  89. package/src/lib/server/persistence/repository-utils.ts +154 -0
  90. package/src/lib/server/persistence/storage-context.ts +51 -0
  91. package/src/lib/server/persistence/transaction.ts +1 -0
  92. package/src/lib/server/projects/project-repository.ts +36 -0
  93. package/src/lib/server/projects/project-service.ts +79 -0
  94. package/src/lib/server/protocols/protocol-normalization.test.ts +6 -4
  95. package/src/lib/server/runtime/alert-dispatch.ts +1 -1
  96. package/src/lib/server/runtime/daemon-policy.ts +1 -1
  97. package/src/lib/server/runtime/daemon-state/core.ts +1570 -0
  98. package/src/lib/server/runtime/daemon-state/health.ts +6 -0
  99. package/src/lib/server/runtime/daemon-state/policy.ts +7 -0
  100. package/src/lib/server/runtime/daemon-state/supervisor.ts +6 -0
  101. package/src/lib/server/runtime/daemon-state.test.ts +48 -0
  102. package/src/lib/server/runtime/daemon-state.ts +3 -1470
  103. package/src/lib/server/runtime/estop-repository.ts +4 -0
  104. package/src/lib/server/runtime/estop.ts +3 -1
  105. package/src/lib/server/runtime/heartbeat-service.test.ts +2 -2
  106. package/src/lib/server/runtime/heartbeat-service.ts +55 -34
  107. package/src/lib/server/runtime/heartbeat-wake.ts +6 -4
  108. package/src/lib/server/runtime/idle-window.ts +2 -2
  109. package/src/lib/server/runtime/network.ts +11 -0
  110. package/src/lib/server/runtime/orchestrator-events.ts +2 -2
  111. package/src/lib/server/runtime/queue/claims.ts +4 -0
  112. package/src/lib/server/runtime/queue/core.ts +2079 -0
  113. package/src/lib/server/runtime/queue/execution.ts +7 -0
  114. package/src/lib/server/runtime/queue/followups.ts +4 -0
  115. package/src/lib/server/runtime/queue/queries.ts +12 -0
  116. package/src/lib/server/runtime/queue/recovery.ts +7 -0
  117. package/src/lib/server/runtime/queue-recovery.test.ts +48 -13
  118. package/src/lib/server/runtime/queue-repository.ts +17 -0
  119. package/src/lib/server/runtime/queue.ts +5 -2061
  120. package/src/lib/server/runtime/run-ledger.ts +6 -5
  121. package/src/lib/server/runtime/run-repository.ts +73 -0
  122. package/src/lib/server/runtime/runtime-lock-repository.ts +8 -0
  123. package/src/lib/server/runtime/runtime-settings.ts +1 -1
  124. package/src/lib/server/runtime/runtime-state.ts +99 -0
  125. package/src/lib/server/runtime/scheduler.ts +4 -2
  126. package/src/lib/server/runtime/session-run-manager/cancellation.ts +157 -0
  127. package/src/lib/server/runtime/session-run-manager/drain.ts +246 -0
  128. package/src/lib/server/runtime/session-run-manager/enqueue.ts +287 -0
  129. package/src/lib/server/runtime/session-run-manager/queries.ts +117 -0
  130. package/src/lib/server/runtime/session-run-manager/recovery.ts +238 -0
  131. package/src/lib/server/runtime/session-run-manager/state.ts +441 -0
  132. package/src/lib/server/runtime/session-run-manager/types.ts +74 -0
  133. package/src/lib/server/runtime/session-run-manager.ts +72 -1377
  134. package/src/lib/server/runtime/watch-job-repository.ts +35 -0
  135. package/src/lib/server/runtime/watch-jobs.ts +3 -1
  136. package/src/lib/server/schedules/schedule-repository.ts +42 -0
  137. package/src/lib/server/sessions/session-repository.ts +85 -0
  138. package/src/lib/server/settings/settings-repository.ts +25 -0
  139. package/src/lib/server/skills/skill-discovery.test.ts +2 -2
  140. package/src/lib/server/skills/skill-discovery.ts +2 -2
  141. package/src/lib/server/skills/skill-repository.ts +14 -0
  142. package/src/lib/server/storage.ts +13 -24
  143. package/src/lib/server/tasks/task-repository.ts +54 -0
  144. package/src/lib/server/usage/usage-repository.ts +30 -0
  145. package/src/lib/server/webhooks/webhook-repository.ts +10 -0
  146. package/src/lib/strip-internal-metadata.test.ts +42 -41
  147. package/src/stores/use-chat-store.test.ts +54 -0
  148. package/src/stores/use-chat-store.ts +21 -5
  149. package/{bundled-skills → skills}/google-workspace/SKILL.md +0 -0
package/src/lib/server/runtime/daemon-state.ts
@@ -1,1470 +1,3 @@
1
- import { log } from '@/lib/server/logger'
2
- import { loadQueue, loadSchedules, loadSessions, loadConnectors, saveConnectors, loadWebhookRetryQueue, upsertWebhookRetry, deleteWebhookRetry, loadWebhooks, loadAgents, loadSettings, appendWebhookLog, loadCredentials, decryptKey, pruneExpiredLocks, pruneOldUsage } from '@/lib/server/storage'
3
- import { notify } from '@/lib/server/ws-hub'
4
- import { processNext, cleanupFinishedTaskSessions, validateCompletedTasksQueue, recoverStalledRunningTasks, resumeQueue, promoteDeferred } from '@/lib/server/runtime/queue'
5
- import { startScheduler, stopScheduler } from '@/lib/server/runtime/scheduler'
6
- import { sweepOrphanedBrowsers, getActiveBrowserCount } from '@/lib/server/session-tools'
7
- import {
8
- autoStartConnectors,
9
- listRunningConnectors,
10
- sendConnectorMessage,
11
- stopAllConnectors,
12
- startConnector,
13
- getConnectorStatus,
14
- checkConnectorHealth,
15
- createConnectorReconnectState,
16
- advanceConnectorReconnectState,
17
- clearReconnectState,
18
- getAllReconnectStates,
19
- getReconnectState,
20
- setReconnectState,
21
- } from '@/lib/server/connectors/manager'
22
- import { startConnectorOutboxWorker, stopConnectorOutboxWorker } from '@/lib/server/connectors/outbox'
23
- import { pruneConnectorTrackingState } from '@/lib/server/connectors/runtime-state'
24
- import { startHeartbeatService, stopHeartbeatService, getHeartbeatServiceStatus, pruneHeartbeatState, pruneOrchestratorState } from '@/lib/server/runtime/heartbeat-service'
25
- import { hasOpenClawAgents, ensureGatewayConnected, disconnectAutoGateways, getGateway } from '@/lib/server/openclaw/gateway'
26
- import { enqueueSessionRun, sweepStuckRuns } from '@/lib/server/runtime/session-run-manager'
27
- import { pruneOldRuns } from '@/lib/server/runtime/run-ledger'
28
- import { getEnabledCapabilitySelection } from '@/lib/capability-selection'
29
- import { WORKSPACE_DIR } from '@/lib/server/data-dir'
30
- import { DEFAULT_HEARTBEAT_INTERVAL_SEC } from '@/lib/runtime/heartbeat-defaults'
31
- import { genId } from '@/lib/id'
32
- import { isAgentDisabled } from '@/lib/server/agents/agent-availability'
33
- import { errorMessage, hmrSingleton } from '@/lib/shared-utils'
34
- import path from 'node:path'
35
- import type { Session, WebhookRetryEntry } from '@/types'
36
- import { createNotification } from '@/lib/server/create-notification'
37
- import { pingProvider, OPENAI_COMPATIBLE_DEFAULTS, restoreProviderHealthState } from '@/lib/server/provider-health'
38
- import { runIntegrityMonitor } from '@/lib/server/integrity-monitor'
39
- import { notifyOrchestrators } from '@/lib/server/runtime/orchestrator-events'
40
- import { recoverStaleDelegationJobs } from '@/lib/server/agents/delegation-jobs'
41
- import { restoreSwarmRegistry } from '@/lib/server/agents/subagent-swarm'
42
- import { cleanupFinishedSubagents } from '@/lib/server/agents/subagent-runtime'
43
- import { pruneMainLoopState } from '@/lib/server/agents/main-agent-loop'
44
- import { pruneSystemEventQueues, pruneOrchestratorEventQueues } from '@/lib/server/runtime/system-events'
45
- import { checkSwarmTimeouts, ensureProtocolEngineRecovered } from '@/lib/server/protocols/protocol-service'
46
- import { sweepManagedProcesses, reapOrphanedSandboxContainers } from '@/lib/server/runtime/process-manager'
47
- import { drainIdleWindowCallbacks } from '@/lib/server/runtime/idle-window'
48
- import {
49
- buildSessionHeartbeatHealthDedupKey,
50
- daemonAutostartEnvEnabled,
51
- isDaemonBackgroundServicesEnabled,
52
- parseCronToMs,
53
- parseHeartbeatIntervalSec,
54
- shouldNotifyProviderReachabilityIssue,
55
- shouldSuppressSessionHeartbeatHealthAlert,
56
- shouldSuppressSyntheticAgentHealthAlert,
57
- } from '@/lib/server/runtime/daemon-policy'
58
- import { loadEstopState } from '@/lib/server/runtime/estop'
59
- import { classifyRuntimeFailure, recordSupervisorIncident } from '@/lib/server/autonomy/supervisor-reflection'
60
- import { getMemoryDb } from '@/lib/server/memory/memory-db'
61
- import { clearLogsByAge } from '@/lib/server/execution-log'
62
-
63
- const TAG = 'daemon-state'
64
-
65
- const QUEUE_CHECK_INTERVAL = 30_000 // 30 seconds
66
- const BROWSER_SWEEP_INTERVAL = 60_000 // 60 seconds
67
- const BROWSER_MAX_AGE = 10 * 60 * 1000 // 10 minutes idle = orphaned
68
- const HEALTH_CHECK_INTERVAL = 120_000 // 2 minutes
69
- const CONNECTOR_HEALTH_CHECK_INTERVAL = 15_000 // 15 seconds
70
- const MEMORY_CONSOLIDATION_INTERVAL = 6 * 3600_000 // 6 hours
71
- const MEMORY_CONSOLIDATION_INITIAL_DELAY = 60_000 // 1 minute after daemon start
72
- const STALE_MULTIPLIER = 4 // session is stale after N × heartbeat interval
73
- const STALE_MIN_MS = 4 * 60 * 1000 // minimum 4 minutes regardless of interval
74
- const STALE_AUTO_DISABLE_MULTIPLIER = 16 // auto-disable after much longer sustained staleness
75
- const STALE_AUTO_DISABLE_MIN_MS = 45 * 60 * 1000 // never auto-disable before 45 minutes
76
- const CONNECTOR_RESTART_BASE_MS = 30_000
77
- const CONNECTOR_RESTART_MAX_MS = 15 * 60 * 1000
78
- const MAX_WAKE_ATTEMPTS = 3
79
- const QUEUE_PROCESS_TIMEOUT = 10 * 60_000 // 10 minutes
80
- const SHUTDOWN_TIMEOUT_MS = 15_000
81
- const PROVIDER_PING_CB_THRESHOLD = 3 // trips after 3 consecutive failures
82
- const PROVIDER_PING_CB_BASE_MS = 300_000 // 5 min initial cooldown
83
- const PROVIDER_PING_CB_MAX_MS = 1_800_000 // 30 min max cooldown
84
-
85
- export {
86
- buildSessionHeartbeatHealthDedupKey,
87
- isDaemonBackgroundServicesEnabled,
88
- shouldNotifyProviderReachabilityIssue,
89
- shouldSuppressSessionHeartbeatHealthAlert,
90
- shouldSuppressSyntheticAgentHealthAlert,
91
- }
92
-
93
- // Store daemon state on globalThis to survive HMR reloads
94
- interface DaemonState {
95
- queueIntervalId: ReturnType<typeof setInterval> | null
96
- browserSweepId: ReturnType<typeof setInterval> | null
97
- healthIntervalId: ReturnType<typeof setInterval> | null
98
- connectorHealthIntervalId: ReturnType<typeof setInterval> | null
99
- memoryConsolidationTimeoutId: ReturnType<typeof setTimeout> | null
100
- memoryConsolidationIntervalId: ReturnType<typeof setInterval> | null
101
- evalSchedulerIntervalId: ReturnType<typeof setInterval> | null
102
- swarmTimeoutIntervalId: ReturnType<typeof setInterval> | null
103
- /** Session IDs we've already alerted as stale (alert-once semantics). */
104
- staleSessionIds: Set<string>
105
- /** OpenClaw gateway agent IDs currently considered down. */
106
- openclawDownAgentIds: Set<string>
107
- /** Per-agent auto-repair state for OpenClaw gateways. */
108
- openclawRepairState: Map<string, { attempts: number; lastAttemptAt: number; cooldownUntil: number }>
109
- lastIntegrityCheckAt: number | null
110
- lastIntegrityDriftCount: number
111
- manualStopRequested: boolean
112
- running: boolean
113
- lastProcessedAt: number | null
114
- healthCheckRunning: boolean
115
- connectorHealthCheckRunning: boolean
116
- shuttingDown: boolean
117
- providerPingCircuitBreaker: Map<string, { consecutiveFailures: number; skipUntil: number }>
118
- }
119
-
120
- const ds: DaemonState = hmrSingleton<DaemonState>('__swarmclaw_daemon__', () => ({
121
- queueIntervalId: null,
122
- browserSweepId: null,
123
- healthIntervalId: null,
124
- connectorHealthIntervalId: null,
125
- memoryConsolidationTimeoutId: null,
126
- memoryConsolidationIntervalId: null,
127
- evalSchedulerIntervalId: null,
128
- swarmTimeoutIntervalId: null,
129
- staleSessionIds: new Set<string>(),
130
- openclawDownAgentIds: new Set<string>(),
131
- openclawRepairState: new Map<string, { attempts: number; lastAttemptAt: number; cooldownUntil: number }>(),
132
- lastIntegrityCheckAt: null,
133
- lastIntegrityDriftCount: 0,
134
- manualStopRequested: false,
135
- running: false,
136
- lastProcessedAt: null,
137
- healthCheckRunning: false,
138
- connectorHealthCheckRunning: false,
139
- shuttingDown: false,
140
- providerPingCircuitBreaker: new Map<string, { consecutiveFailures: number; skipUntil: number }>(),
141
- }))
142
-
143
- // Backfill fields for hot-reloaded daemon state objects from older code versions.
144
- if (!ds.staleSessionIds) ds.staleSessionIds = new Set<string>()
145
- if (!ds.openclawDownAgentIds) ds.openclawDownAgentIds = new Set<string>()
146
- if (!ds.openclawRepairState) ds.openclawRepairState = new Map<string, { attempts: number; lastAttemptAt: number; cooldownUntil: number }>()
147
- if (ds.lastIntegrityCheckAt === undefined) ds.lastIntegrityCheckAt = null
148
- if (ds.lastIntegrityDriftCount === undefined) ds.lastIntegrityDriftCount = 0
149
- // Migrate from old issueLastAlertAt map if present (HMR across code versions)
150
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
151
- if ((ds as any).issueLastAlertAt) delete (ds as any).issueLastAlertAt
152
- if (ds.healthIntervalId === undefined) ds.healthIntervalId = null
153
- if (ds.connectorHealthIntervalId === undefined) ds.connectorHealthIntervalId = null
154
- if (ds.manualStopRequested === undefined) ds.manualStopRequested = false
155
- if (ds.memoryConsolidationTimeoutId === undefined) ds.memoryConsolidationTimeoutId = null
156
- if (ds.memoryConsolidationIntervalId === undefined) ds.memoryConsolidationIntervalId = null
157
- if (ds.evalSchedulerIntervalId === undefined) ds.evalSchedulerIntervalId = null
158
- if (ds.swarmTimeoutIntervalId === undefined) ds.swarmTimeoutIntervalId = null
159
- if (ds.healthCheckRunning === undefined) ds.healthCheckRunning = false
160
- if (ds.connectorHealthCheckRunning === undefined) ds.connectorHealthCheckRunning = false
161
- if (ds.shuttingDown === undefined) ds.shuttingDown = false
162
- if (!ds.providerPingCircuitBreaker) ds.providerPingCircuitBreaker = new Map<string, { consecutiveFailures: number; skipUntil: number }>()
163
-
164
- export function ensureDaemonStarted(source = 'unknown'): boolean {
165
- if (ds.running) return false
166
- if (!daemonAutostartEnvEnabled()) return false
167
- if (ds.manualStopRequested) return false
168
- if (loadEstopState().level !== 'none') return false
169
- startDaemon({ source, manualStart: false })
170
- return true
171
- }
172
-
173
- export function startDaemon(options?: { source?: string; manualStart?: boolean }) {
174
- const source = options?.source || 'unknown'
175
- const manualStart = options?.manualStart === true
176
- if (manualStart) ds.manualStopRequested = false
177
- const estop = loadEstopState()
178
- if (estop.level !== 'none') {
179
- notify('daemon')
180
- log.warn(TAG, `[daemon] Start blocked by estop (level=${estop.level}, source=${source})`)
181
- return
182
- }
183
-
184
- if (ds.running) {
185
- // In dev/HMR, daemon can already be flagged running while new interval types
186
- // (for example health monitor) were introduced in newer code.
187
- startQueueProcessor()
188
- startBrowserSweep()
189
- startHeartbeatService()
190
- startMemoryConsolidation()
191
- startSwarmTimeoutChecker()
192
- syncDaemonBackgroundServices({ runConnectorHealthCheckImmediately: false })
193
- return
194
- }
195
- ds.running = true
196
- notify('daemon')
197
- log.info(TAG, `[daemon] Starting daemon (source=${source}, scheduler + queue processor + heartbeat)`)
198
-
199
- try {
200
- validateCompletedTasksQueue()
201
- cleanupFinishedTaskSessions()
202
- recoverStaleDelegationJobs({ fullRestart: true })
203
- ensureProtocolEngineRecovered()
204
- restoreProviderHealthState()
205
- try {
206
- const lost = restoreSwarmRegistry()
207
- if (lost > 0) log.info(TAG, `[daemon] Marked ${lost} in-flight swarm(s) as lost after restart`)
208
- } catch { /* best-effort */ }
209
- resumeQueue()
210
- startScheduler()
211
- startQueueProcessor()
212
- startBrowserSweep()
213
- startHeartbeatService()
214
- startMemoryConsolidation()
215
- startSwarmTimeoutChecker()
216
- syncDaemonBackgroundServices({ runConnectorHealthCheckImmediately: false })
217
- } catch (err: unknown) {
218
- ds.running = false
219
- notify('daemon')
220
- log.error(TAG, '[daemon] Failed to start:', errorMessage(err))
221
- throw err
222
- }
223
-
224
- if (isDaemonBackgroundServicesEnabled()) {
225
- // Auto-start enabled connectors only when the full background stack is enabled.
226
- autoStartConnectors().catch((err: unknown) => {
227
- log.error(TAG, '[daemon] Error auto-starting connectors:', errorMessage(err))
228
- })
229
- }
230
- }
231
-
232
- export async function stopDaemon(options?: { source?: string; manualStop?: boolean }) {
233
- const source = options?.source || 'unknown'
234
- if (options?.manualStop === true) ds.manualStopRequested = true
235
- if (!ds.running) return
236
- ds.running = false
237
- ds.shuttingDown = true
238
- notify('daemon')
239
- log.info(TAG, `[daemon] Stopping daemon (source=${source})`)
240
-
241
- stopScheduler()
242
- stopQueueProcessor()
243
- stopBrowserSweep()
244
- stopHealthMonitor()
245
- stopConnectorHealthMonitor()
246
- stopConnectorOutboxWorker()
247
- stopHeartbeatService()
248
- stopMemoryConsolidation()
249
- stopSwarmTimeoutChecker()
250
- stopEvalScheduler()
251
- try {
252
- await Promise.race([
253
- stopAllConnectors({ disable: false }),
254
- new Promise<void>((_, reject) =>
255
- setTimeout(() => reject(new Error('Connector shutdown timed out')), SHUTDOWN_TIMEOUT_MS)
256
- ),
257
- ])
258
- } catch (err: unknown) {
259
- log.warn(TAG, `[daemon] Connector shutdown issue: ${errorMessage(err)}`)
260
- } finally {
261
- ds.shuttingDown = false
262
- }
263
- }
264
-
265
- function startBrowserSweep() {
266
- if (ds.browserSweepId) return
267
- ds.browserSweepId = setInterval(() => {
268
- const count = getActiveBrowserCount()
269
- if (count > 0) {
270
- const cleaned = sweepOrphanedBrowsers(BROWSER_MAX_AGE)
271
- if (cleaned > 0) {
272
- log.info(TAG, `[daemon] Cleaned ${cleaned} orphaned browser(s), ${getActiveBrowserCount()} still active`)
273
- }
274
- }
275
- }, BROWSER_SWEEP_INTERVAL)
276
- }
277
-
278
- function stopBrowserSweep() {
279
- if (ds.browserSweepId) {
280
- clearInterval(ds.browserSweepId)
281
- ds.browserSweepId = null
282
- }
283
- // Kill all remaining browsers on shutdown
284
- sweepOrphanedBrowsers(0)
285
- }
286
-
287
- export async function syncOpenClawGatewayLifecycle() {
288
- if (!hasOpenClawAgents()) {
289
- disconnectAutoGateways()
290
- return
291
- }
292
- if (!getGateway()?.connected) {
293
- await ensureGatewayConnected()
294
- }
295
- }
296
-
297
- function startQueueProcessor() {
298
- if (ds.queueIntervalId) return
299
- ds.queueIntervalId = setInterval(async () => {
300
- if (!ds.running) return
301
- const queue = loadQueue()
302
- if (queue.length > 0) {
303
- log.info(TAG, `[daemon] Processing ${queue.length} queued task(s)`)
304
- try {
305
- await Promise.race([
306
- processNext(),
307
- new Promise<void>((_, reject) =>
308
- setTimeout(() => reject(new Error('Queue processing timed out')), QUEUE_PROCESS_TIMEOUT)
309
- ),
310
- ])
311
- } catch (err: unknown) {
312
- log.error(TAG, `[daemon] Queue processing error/timeout: ${errorMessage(err)}`)
313
- }
314
- ds.lastProcessedAt = Date.now()
315
- }
316
- if (!isDaemonBackgroundServicesEnabled()) return
317
- // OpenClaw gateway lifecycle: lazy connect for active OpenClaw agents, stop auto-managed reconnects when none remain.
318
- try {
319
- await syncOpenClawGatewayLifecycle()
320
- } catch { /* gateway errors are non-fatal */ }
321
- }, QUEUE_CHECK_INTERVAL)
322
- }
323
-
324
- function stopQueueProcessor() {
325
- if (ds.queueIntervalId) {
326
- clearInterval(ds.queueIntervalId)
327
- ds.queueIntervalId = null
328
- }
329
- }
330
-
331
- async function sendHealthAlert(input: string | {
332
- text: string
333
- dedupKey?: string
334
- entityType?: string
335
- entityId?: string
336
- }) {
337
- const payload = typeof input === 'string' ? { text: input } : input
338
- const text = payload.text
339
- log.warn(TAG, `[health] ${text}`)
340
- createNotification({
341
- type: 'warning',
342
- title: 'SwarmClaw health alert',
343
- message: text,
344
- dedupKey: payload.dedupKey || `health-alert:${text}`,
345
- entityType: payload.entityType,
346
- entityId: payload.entityId,
347
- dispatchExternally: false,
348
- })
349
- }
350
-
351
- async function runConnectorHealthChecks(now: number) {
352
- // First, collapse dead runtime instances into persisted error state so the
353
- // daemon can own the restart cadence and backoff policy.
354
- try {
355
- await checkConnectorHealth()
356
- } catch (err: unknown) {
357
- log.error(TAG, '[health] Connector isAlive check failed:', errorMessage(err))
358
- }
359
-
360
- const connectors = loadConnectors()
361
- for (const connector of Object.values(connectors) as unknown as Record<string, unknown>[]) {
362
- if (!connector?.id || typeof connector.id !== 'string') continue
363
- if (connector.isEnabled !== true) {
364
- clearReconnectState(connector.id)
365
- continue
366
- }
367
-
368
- const runtimeStatus = getConnectorStatus(connector.id)
369
- if (runtimeStatus === 'running') {
370
- clearReconnectState(connector.id)
371
- continue
372
- }
373
-
374
- const current = getReconnectState(connector.id)
375
- ?? createConnectorReconnectState(
376
- { error: typeof connector.lastError === 'string' ? connector.lastError : '' },
377
- { initialBackoffMs: CONNECTOR_RESTART_BASE_MS },
378
- )
379
-
380
- if (current.exhausted) {
381
- continue
382
- }
383
-
384
- if (current.nextRetryAt > now) continue
385
-
386
- // Notify on first detection of a down connector
387
- if (current.attempts === 0) {
388
- createNotification({
389
- type: 'warning',
390
- title: `Connector "${connector.name}" is down`,
391
- message: 'Auto-restart in progress.',
392
- dedupKey: `connector-down:${connector.id}`,
393
- entityType: 'connector',
394
- entityId: connector.id,
395
- })
396
- }
397
-
398
- try {
399
- await startConnector(connector.id)
400
- clearReconnectState(connector.id)
401
- await sendHealthAlert(`Connector "${connector.name}" (${connector.platform}) was down and has been auto-restarted.`)
402
- } catch (err: unknown) {
403
- const message = errorMessage(err)
404
- const next = advanceConnectorReconnectState(current, message, now, {
405
- initialBackoffMs: CONNECTOR_RESTART_BASE_MS,
406
- maxBackoffMs: CONNECTOR_RESTART_MAX_MS,
407
- maxAttempts: MAX_WAKE_ATTEMPTS,
408
- })
409
- setReconnectState(connector.id, next)
410
- if (next.exhausted) {
411
- log.warn(TAG, `[health] Connector "${connector.name}" exceeded ${MAX_WAKE_ATTEMPTS} auto-restart attempts — giving up until the server restarts or the user retries manually`)
412
- connector.status = 'error'
413
- connector.lastError = `Auto-restart gave up after ${MAX_WAKE_ATTEMPTS} attempts: ${message}`
414
- connector.updatedAt = Date.now()
415
- connectors[connector.id] = connector
416
- saveConnectors(connectors)
417
- notify('connectors')
418
- notifyOrchestrators(`Connector ${connector.name || connector.id} status: error — auto-restart exhausted after ${MAX_WAKE_ATTEMPTS} attempts`, `connector-status:${connector.id}`)
419
- createNotification({
420
- type: 'error',
421
- title: `Connector "${connector.name}" failed`,
422
- message: `Auto-restart gave up after ${MAX_WAKE_ATTEMPTS} attempts.`,
423
- dedupKey: `connector-gave-up:${connector.id}`,
424
- entityType: 'connector',
425
- entityId: connector.id,
426
- })
427
- } else {
428
- log.warn(TAG, `[health] Connector auto-restart failed for ${connector.name} (attempt ${next.attempts}/${MAX_WAKE_ATTEMPTS}): ${message}`)
429
- }
430
- }
431
- }
432
-
433
- // Purge restart state for connectors that no longer exist in storage
434
- for (const id of Object.keys(getAllReconnectStates())) {
435
- if (!connectors[id] || connectors[id]?.isEnabled !== true) clearReconnectState(id)
436
- }
437
- }
438
-
439
- async function processWebhookRetries() {
440
- const retryQueue = loadWebhookRetryQueue()
441
- const now = Date.now()
442
- const dueEntries: WebhookRetryEntry[] = []
443
-
444
- for (const raw of Object.values(retryQueue)) {
445
- const entry = raw as WebhookRetryEntry
446
- if (entry.deadLettered) continue
447
- if (entry.nextRetryAt > now) continue
448
- dueEntries.push(entry)
449
- }
450
-
451
- if (dueEntries.length === 0) return
452
-
453
- const webhooks = loadWebhooks()
454
- const agents = loadAgents()
455
- const sessions = loadSessions()
456
-
457
- for (const entry of dueEntries) {
458
- const webhook = webhooks[entry.webhookId] as unknown as Record<string, unknown> | undefined
459
- if (!webhook) {
460
- // Webhook deleted — drop the retry
461
- deleteWebhookRetry(entry.id)
462
- continue
463
- }
464
-
465
- const agentId = typeof webhook.agentId === 'string' ? webhook.agentId : ''
466
- const agent = agentId ? (agents[agentId] as unknown as Record<string, unknown> | undefined) : null
467
- if (!agent) {
468
- entry.deadLettered = true
469
- upsertWebhookRetry(entry.id, entry)
470
- log.warn(TAG, `[webhook-retry] Dead-lettered ${entry.id}: agent not found for webhook ${entry.webhookId}`)
471
- continue
472
- }
473
- if (isAgentDisabled(agent)) {
474
- entry.deadLettered = true
475
- upsertWebhookRetry(entry.id, entry)
476
- log.warn(TAG, `[webhook-retry] Dead-lettered ${entry.id}: agent disabled for webhook ${entry.webhookId}`)
477
- continue
478
- }
479
-
480
- // Find or create a webhook session (same logic as the POST handler)
481
- const sessionName = `webhook:${entry.webhookId}`
482
- let session = Object.values(sessions).find(
483
- (s: unknown) => {
484
- const rec = s as Record<string, unknown>
485
- return rec.name === sessionName && rec.agentId === agent.id
486
- },
487
- ) as unknown as Record<string, unknown> | undefined
488
-
489
- if (!session) {
490
- const sessionId = genId()
491
- const ts = Date.now()
492
- session = {
493
- id: sessionId,
494
- name: sessionName,
495
- cwd: WORKSPACE_DIR,
496
- user: 'system',
497
- provider: agent.provider || 'claude-cli',
498
- model: agent.model || '',
499
- credentialId: agent.credentialId || null,
500
- apiEndpoint: agent.apiEndpoint || null,
501
- claudeSessionId: null,
502
- codexThreadId: null,
503
- opencodeSessionId: null,
504
- delegateResumeIds: { claudeCode: null, codex: null, opencode: null, gemini: null },
505
- messages: [],
506
- createdAt: ts,
507
- lastActiveAt: ts,
508
- sessionType: 'human',
509
- agentId: agent.id,
510
- parentSessionId: null,
511
- ...getEnabledCapabilitySelection(agent),
512
- heartbeatEnabled: (agent.heartbeatEnabled as boolean | undefined) ?? false,
513
- heartbeatIntervalSec: (agent.heartbeatIntervalSec as number | null | undefined) ?? null,
514
- }
515
- const { upsertSession: upsert } = await import('@/lib/server/storage')
516
- upsert(session.id as string, session)
517
- }
518
-
519
- const payloadPreview = (entry.payload || '').slice(0, 12_000)
520
- const prompt = [
521
- 'Webhook event received (retry).',
522
- `Webhook ID: ${entry.webhookId}`,
523
- `Webhook Name: ${(webhook.name as string) || entry.webhookId}`,
524
- `Source: ${(webhook.source as string) || 'custom'}`,
525
- `Event: ${entry.event}`,
526
- `Retry attempt: ${entry.attempts}`,
527
- `Original received at: ${new Date(entry.createdAt).toISOString()}`,
528
- '',
529
- 'Payload:',
530
- payloadPreview || '(empty payload)',
531
- '',
532
- 'Handle this event now. If this requires notifying the user, use configured connector tools.',
533
- ].join('\n')
534
-
535
- try {
536
- const run = enqueueSessionRun({
537
- sessionId: session.id as string,
538
- message: prompt,
539
- source: 'webhook',
540
- internal: false,
541
- mode: 'followup',
542
- })
543
-
544
- appendWebhookLog(genId(8), {
545
- id: genId(8),
546
- webhookId: entry.webhookId,
547
- event: entry.event,
548
- payload: (entry.payload || '').slice(0, 2000),
549
- status: 'success',
550
- sessionId: session.id,
551
- runId: run.runId,
552
- timestamp: Date.now(),
553
- })
554
-
555
- deleteWebhookRetry(entry.id)
556
- log.info(TAG, `[webhook-retry] Successfully retried ${entry.id} for webhook ${entry.webhookId} (attempt ${entry.attempts})`)
557
- } catch (err: unknown) {
558
- const errorMsg = errorMessage(err)
559
- entry.attempts += 1
560
-
561
- if (entry.attempts >= entry.maxAttempts) {
562
- entry.deadLettered = true
563
- upsertWebhookRetry(entry.id, entry)
564
- log.warn(TAG, `[webhook-retry] Dead-lettered ${entry.id} after ${entry.attempts} attempts: ${errorMsg}`)
565
- const failure = classifyRuntimeFailure({ source: 'webhook', message: errorMsg })
566
- if (session?.id) {
567
- recordSupervisorIncident({
568
- runId: entry.id,
569
- sessionId: session.id as string,
570
- taskId: null,
571
- agentId: agentId || null,
572
- source: 'webhook',
573
- kind: 'runtime_failure',
574
- severity: failure.severity,
575
- summary: `Webhook delivery dead-lettered: ${errorMsg}`.slice(0, 320),
576
- details: errorMsg,
577
- failureFamily: failure.family,
578
- remediation: failure.remediation,
579
- repairPrompt: failure.repairPrompt,
580
- autoAction: null,
581
- })
582
- }
583
-
584
- appendWebhookLog(genId(8), {
585
- id: genId(8),
586
- webhookId: entry.webhookId,
587
- event: entry.event,
588
- payload: (entry.payload || '').slice(0, 2000),
589
- status: 'error',
590
- error: `Dead-lettered after ${entry.attempts} attempts: ${errorMsg}`,
591
- timestamp: Date.now(),
592
- })
593
- } else {
594
- // Exponential backoff: 30s * 2^attempt + random jitter (0-5000ms)
595
- const jitter = Math.floor(Math.random() * 5000)
596
- entry.nextRetryAt = Date.now() + (30_000 * Math.pow(2, entry.attempts)) + jitter
597
- upsertWebhookRetry(entry.id, entry)
598
- log.warn(TAG, `[webhook-retry] Retry ${entry.id} failed (attempt ${entry.attempts}/${entry.maxAttempts}), next at ${new Date(entry.nextRetryAt).toISOString()}: ${errorMsg}`)
599
- }
600
- }
601
- }
602
- }
603
-
604
/**
 * Pings every distinct provider configuration referenced by an agent and
 * raises a warning notification when one cannot be reached.
 *
 * CLI-backed providers (`claude-cli`, `codex-cli`, `opencode-cli`) are never
 * pinged. Consecutive failures are counted per configuration in
 * `ds.providerPingCircuitBreaker`; once the count reaches
 * `PROVIDER_PING_CB_THRESHOLD` further pings are skipped for a cooldown that
 * doubles with each extra failure, capped at `PROVIDER_PING_CB_MAX_MS`.
 * A successful ping removes the breaker entry.
 */
async function runProviderHealthChecks() {
  type PingTarget = { provider: string; credentialId: string; apiEndpoint: string; agentId: string; credentialName: string }

  const allAgents = loadAgents()
  const allCredentials = loadCredentials()
  const cliProviders = ['claude-cli', 'codex-cli', 'opencode-cli']

  // Phase 1: collapse agents into one ping target per distinct configuration.
  const visitedKeys = new Set<string>()
  const targets: PingTarget[] = []

  for (const agent of Object.values(allAgents) as unknown as Record<string, unknown>[]) {
    const agentId = agent?.id
    if (typeof agentId !== 'string' || !agentId) continue
    if (shouldSuppressSyntheticAgentHealthAlert(agentId)) continue

    const provider = typeof agent.provider === 'string' ? agent.provider : ''
    if (!provider) continue
    if (cliProviders.includes(provider)) continue

    const credentialId = typeof agent.credentialId === 'string' ? agent.credentialId : ''
    const apiEndpoint = typeof agent.apiEndpoint === 'string' ? agent.apiEndpoint : ''

    // OpenClaw agents are keyed individually because each may use its own gateway.
    let dedupeKey: string
    if (provider === 'openclaw') {
      dedupeKey = `openclaw:${agentId}`
    } else {
      dedupeKey = `${provider}:${credentialId || 'no-cred'}:${apiEndpoint}`
    }
    if (visitedKeys.has(dedupeKey)) continue
    visitedKeys.add(dedupeKey)

    const credentialRecord = credentialId
      ? (allCredentials[credentialId] as unknown as Record<string, unknown> | undefined)
      : undefined
    // Fall back to the provider id when the credential has no usable name.
    const credentialName = typeof credentialRecord?.name === 'string' ? credentialRecord.name : provider

    targets.push({ provider, credentialId, apiEndpoint, agentId, credentialName })
  }

  // Phase 2: ping each target, honouring and updating its circuit breaker.
  for (const target of targets) {
    const breakerKey = `${target.provider}:${target.credentialId || 'no-cred'}:${target.apiEndpoint}`
    const openBreaker = ds.providerPingCircuitBreaker.get(breakerKey)
    const startedAt = Date.now()
    if (openBreaker && openBreaker.skipUntil > startedAt) continue

    let apiKey: string | undefined
    if (target.credentialId) {
      const record = allCredentials[target.credentialId] as unknown as Record<string, unknown> | undefined
      const encrypted = record?.encryptedKey
      if (encrypted && typeof encrypted === 'string') {
        try {
          apiKey = decryptKey(encrypted)
        } catch {
          // A key that cannot be decrypted makes the ping meaningless; skip this target.
          continue
        }
      }
    }

    const endpoint = target.apiEndpoint || OPENAI_COMPATIBLE_DEFAULTS[target.provider]?.defaultEndpoint || undefined
    const result = await pingProvider(target.provider, apiKey, endpoint)

    if (result.ok) {
      // Reachable again: forget any accumulated failures.
      ds.providerPingCircuitBreaker.delete(breakerKey)
      continue
    }

    const breaker = ds.providerPingCircuitBreaker.get(breakerKey) || { consecutiveFailures: 0, skipUntil: 0 }
    breaker.consecutiveFailures += 1
    if (breaker.consecutiveFailures >= PROVIDER_PING_CB_THRESHOLD) {
      const doublings = breaker.consecutiveFailures - PROVIDER_PING_CB_THRESHOLD
      const cooldown = Math.min(PROVIDER_PING_CB_BASE_MS * Math.pow(2, doublings), PROVIDER_PING_CB_MAX_MS)
      // The cooldown is measured from the timestamp taken before the ping started.
      breaker.skipUntil = startedAt + cooldown
      log.info(TAG, `[health] Circuit breaker tripped for ${target.credentialName} — skipping pings for ${Math.round(cooldown / 60_000)}m`)
    }
    ds.providerPingCircuitBreaker.set(breakerKey, breaker)

    if (!shouldNotifyProviderReachabilityIssue(target.provider)) continue

    createNotification({
      type: 'warning',
      title: `Provider unreachable: ${target.credentialName}`,
      message: result.message,
      dedupKey: `provider-down:${target.credentialId || target.provider}`,
      entityType: target.credentialId ? 'credential' : undefined,
      entityId: target.credentialId || undefined,
    })
  }
}
695
-
696
/** Number of automatic doctor runs allowed before repairs pause for a cooldown. */
const OPENCLAW_REPAIR_MAX_ATTEMPTS = 3
/** Length of the pause applied once the repair attempt budget is exhausted. */
const OPENCLAW_REPAIR_COOLDOWN_MS = 300_000 // 5 minutes

/**
 * Probes the gateway of every OpenClaw agent and attempts an automatic repair
 * when the probe fails.
 *
 * Unhealthy agents are tracked in `ds.openclawDownAgentIds`, and their repair
 * bookkeeping (attempt count, last attempt time, cooldown deadline) lives in
 * `ds.openclawRepairState`. Up to `OPENCLAW_REPAIR_MAX_ATTEMPTS` doctor runs
 * are made; after that a cooldown of `OPENCLAW_REPAIR_COOLDOWN_MS` is set and
 * the agent is skipped until it expires. A notification is created for each
 * handled failure and once when a previously-down gateway probes healthy.
 */
async function runOpenClawGatewayHealthChecks() {
  type GatewayTarget = { agentId: string; endpoint: string; credentialId: string; credentialName: string }

  const allAgents = loadAgents()
  const allCredentials = loadCredentials()

  // Collect one target per OpenClaw agent.
  const visitedKeys = new Set<string>()
  const gateways: GatewayTarget[] = []

  for (const agent of Object.values(allAgents) as unknown as Record<string, unknown>[]) {
    const agentId = agent?.id
    if (typeof agentId !== 'string' || !agentId) continue
    if (shouldSuppressSyntheticAgentHealthAlert(agentId)) continue
    if (agent.provider !== 'openclaw') continue

    const dedupeKey = `openclaw:${agentId}`
    if (visitedKeys.has(dedupeKey)) continue
    visitedKeys.add(dedupeKey)

    const credentialId = typeof agent.credentialId === 'string' ? agent.credentialId : ''
    const endpoint = typeof agent.apiEndpoint === 'string' ? agent.apiEndpoint : ''
    const credentialRecord = credentialId
      ? (allCredentials[credentialId] as unknown as Record<string, unknown> | undefined)
      : undefined
    const credentialName = typeof credentialRecord?.name === 'string' ? credentialRecord.name : 'openclaw'

    gateways.push({ agentId, endpoint, credentialId, credentialName })
  }

  if (gateways.length === 0) return

  // Loaded lazily so the health module is only pulled in when there is something to probe.
  const { probeOpenClawHealth } = await import('@/lib/server/openclaw/health')

  for (const gateway of gateways) {
    let token: string | undefined
    if (gateway.credentialId) {
      const record = allCredentials[gateway.credentialId] as unknown as Record<string, unknown> | undefined
      const encrypted = record?.encryptedKey
      if (encrypted && typeof encrypted === 'string') {
        try {
          token = decryptKey(encrypted)
        } catch {
          // Undecryptable token: skip this gateway entirely.
          continue
        }
      }
    }

    const probe = await probeOpenClawHealth({
      endpoint: gateway.endpoint || undefined,
      token,
      timeoutMs: 10_000,
    })

    const checkedAt = Date.now()

    if (probe.ok) {
      // Only announce recovery for gateways that were previously marked down.
      if (ds.openclawDownAgentIds.has(gateway.agentId)) {
        ds.openclawDownAgentIds.delete(gateway.agentId)
        ds.openclawRepairState.delete(gateway.agentId)
        createNotification({
          type: 'success',
          title: 'OpenClaw gateway recovered',
          message: `Gateway for ${gateway.credentialName} is reachable again.`,
          dedupKey: `openclaw-gw-down:${gateway.agentId}`,
        })
      }
      continue
    }

    const repairState = ds.openclawRepairState.get(gateway.agentId) || { attempts: 0, lastAttemptAt: 0, cooldownUntil: 0 }

    // Still cooling down after exhausting repair attempts: leave it alone.
    if (repairState.cooldownUntil > checkedAt) continue

    // Any cooldown that was set has now elapsed (the guard above handled the
    // pending case), so the attempt budget starts over.
    if (repairState.cooldownUntil > 0) {
      repairState.attempts = 0
      repairState.cooldownUntil = 0
    }

    ds.openclawDownAgentIds.add(gateway.agentId)

    const budgetExhausted = repairState.attempts >= OPENCLAW_REPAIR_MAX_ATTEMPTS
    if (budgetExhausted) {
      repairState.cooldownUntil = checkedAt + OPENCLAW_REPAIR_COOLDOWN_MS
    } else {
      try {
        const { runOpenClawDoctor } = await import('@/lib/server/openclaw/doctor')
        await runOpenClawDoctor({ fix: true })
      } catch (err: unknown) {
        log.warn(TAG, '[daemon] openclaw doctor --fix failed:', errorMessage(err))
      }
      // The attempt counts whether or not the doctor run succeeded.
      repairState.attempts += 1
      repairState.lastAttemptAt = checkedAt
    }

    ds.openclawRepairState.set(gateway.agentId, repairState)

    createNotification({
      type: 'error',
      title: `OpenClaw gateway unreachable: ${gateway.credentialName}`,
      message: probe.error || 'Health check failed',
      dedupKey: `openclaw-gw-down:${gateway.agentId}`,
    })
  }
}
797
-
798
/**
 * Drops in-memory bookkeeping that refers to sessions, agents, connectors or
 * provider configurations which are no longer present in storage.
 *
 * @param sessions Current session records; only the keys (session ids) are used.
 */
function pruneOrphanedState(sessions: Record<string, unknown>): void {
  const knownSessionIds = new Set(Object.keys(sessions))

  // Session-scoped state: main-loop state, heartbeat tracking, system event queues.
  pruneMainLoopState(knownSessionIds)
  pruneHeartbeatState(knownSessionIds)
  pruneSystemEventQueues(knownSessionIds)

  // Finished subagent registry entries and completed managed processes.
  cleanupFinishedSubagents()
  sweepManagedProcesses()

  // Sandbox containers left over from earlier crashes; fire-and-forget, failures only logged.
  reapOrphanedSandboxContainers().catch((err) => {
    const reason = typeof err === 'object' && err !== null && 'message' in err ? (err as Error).message : String(err)
    log.warn(TAG, '[daemon] Orphaned sandbox reap failed:', reason)
  })

  // Daemon-local OpenClaw repair/down tracking for agents that were removed.
  const storedAgents = loadAgents()
  for (const trackedId of ds.openclawRepairState.keys()) {
    if (storedAgents[trackedId]) continue
    ds.openclawRepairState.delete(trackedId)
  }
  for (const trackedId of ds.openclawDownAgentIds) {
    if (storedAgents[trackedId]) continue
    ds.openclawDownAgentIds.delete(trackedId)
  }

  // Orchestrator event queues and wake/failure/daily-cycle state for removed agents.
  const knownAgentIds = new Set(Object.keys(storedAgents))
  pruneOrchestratorEventQueues(knownAgentIds)
  pruneOrchestratorState(knownAgentIds)

  // Connector tracking state for removed connectors.
  const storedConnectors = loadConnectors()
  pruneConnectorTrackingState(new Set(Object.keys(storedConnectors)))

  // Circuit-breaker entries whose provider/credential/endpoint triple is no
  // longer referenced by any agent.
  const referencedBreakerKeys = new Set<string>()
  for (const agent of Object.values(storedAgents) as unknown as Record<string, unknown>[]) {
    if (!agent?.id) continue
    const provider = typeof agent.provider === 'string' ? agent.provider : ''
    if (!provider) continue
    const credentialId = typeof agent.credentialId === 'string' ? agent.credentialId : ''
    const endpoint = typeof agent.apiEndpoint === 'string' ? agent.apiEndpoint : ''
    referencedBreakerKeys.add(`${provider}:${credentialId || 'no-cred'}:${endpoint}`)
  }
  for (const breakerKey of ds.providerPingCircuitBreaker.keys()) {
    if (referencedBreakerKeys.has(breakerKey)) continue
    ds.providerPingCircuitBreaker.delete(breakerKey)
  }
}
859
-
860
/**
 * Runs one maintenance pass on the memory database (dedupe plus pruning of
 * working entries with a 24-hour TTL) and logs the counts when anything
 * changed. Failures are logged as warnings and never propagate.
 */
async function runMemoryMaintenanceTick(): Promise<void> {
  try {
    const outcome = getMemoryDb().maintain({ dedupe: true, pruneWorking: true, ttlHours: 24 })
    const changedAnything = outcome.deduped > 0 || outcome.pruned > 0
    if (!changedAnything) return
    log.info(TAG, `[daemon] Memory maintenance: deduped=${outcome.deduped}, pruned=${outcome.pruned}`)
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err)
    log.warn(TAG, '[daemon] Memory maintenance tick failed:', reason)
  }
}
871
-
872
/**
 * One full pass of the daemon health monitor.
 *
 * In order: task-queue validation and stalled/stuck run recovery, stale
 * heartbeat-session detection (with auto-disable), provider and OpenClaw
 * gateway reachability checks, integrity drift monitoring, webhook retry
 * processing, orphaned-state pruning, retention pruning (runs, locks,
 * execution logs, usage records), memory maintenance and idle-window
 * callbacks. Each later stage is wrapped in its own try/catch so a failure in
 * one stage is logged and does not prevent the following stages from running.
 */
async function runHealthChecks() {
  // Continuously keep the completed queue honest.
  validateCompletedTasksQueue()
  recoverStalledRunningTasks()

  // Watchdog: abort runs stuck in running state beyond their timeout threshold.
  try {
    const stuck = sweepStuckRuns()
    if (stuck.aborted > 0) {
      log.info(TAG, `[daemon] Watchdog: aborted ${stuck.aborted} stuck run(s)`)
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Stuck-run watchdog failed:', err instanceof Error ? err.message : String(err))
  }

  // Keep heartbeat state in sync with task terminal states even without daemon restarts.
  cleanupFinishedTaskSessions()

  // Re-queue deferred tasks whose agents have become available again.
  // Best-effort: a failure must not stop the rest of the pass, but it is
  // logged so that a persistently failing promotion is visible.
  try {
    promoteDeferred()
  } catch (err: unknown) {
    log.warn(TAG, '[daemon] Deferred task promotion failed:', err instanceof Error ? err.message : String(err))
  }

  const sessions = loadSessions()
  const now = Date.now()
  const currentlyStale = new Set<string>()
  const dirtySessionIds: string[] = []

  for (const session of Object.values(sessions) as unknown as Record<string, unknown>[]) {
    if (!session?.id || typeof session.id !== 'string') continue
    if (session.heartbeatEnabled !== true) continue

    const sessionId = session.id
    if (shouldSuppressSessionHeartbeatHealthAlert(session as Pick<Session, 'id' | 'name' | 'user' | 'shortcutForAgentId'>)) {
      ds.staleSessionIds.delete(sessionId)
      continue
    }

    const sessionLabel = String(session.name || sessionId)
    const intervalSec = parseHeartbeatIntervalSec(session.heartbeatIntervalSec, DEFAULT_HEARTBEAT_INTERVAL_SEC)
    if (intervalSec <= 0) continue
    const staleAfter = Math.max(intervalSec * STALE_MULTIPLIER * 1000, STALE_MIN_MS)
    const lastActive = typeof session.lastActiveAt === 'number' ? session.lastActiveAt : 0
    // Sessions without a usable activity timestamp are not judged.
    if (lastActive <= 0) continue

    const staleForMs = now - lastActive
    if (staleForMs > staleAfter) {
      const autoDisableAfter = Math.max(intervalSec * STALE_AUTO_DISABLE_MULTIPLIER * 1000, STALE_AUTO_DISABLE_MIN_MS)
      if (staleForMs > autoDisableAfter) {
        // Stale for too long: switch the heartbeat off and queue the session for persistence.
        session.heartbeatEnabled = false
        session.lastActiveAt = now
        dirtySessionIds.push(sessionId)
        ds.staleSessionIds.delete(sessionId)
        await sendHealthAlert({
          text: `Auto-disabled heartbeat for stale session "${sessionLabel}" after ${Math.round(staleForMs / 60_000)}m of inactivity.`,
          dedupKey: buildSessionHeartbeatHealthDedupKey(sessionId, 'auto-disabled'),
          entityType: 'session',
          entityId: sessionId,
        })
        continue
      }

      currentlyStale.add(sessionId)
      // Only alert on transition from healthy → stale (once per stale episode)
      if (!ds.staleSessionIds.has(sessionId)) {
        ds.staleSessionIds.add(sessionId)
        await sendHealthAlert({
          text: `Session "${sessionLabel}" heartbeat appears stale (last active ${(Math.round(staleForMs / 1000))}s ago, interval ${intervalSec}s).`,
          dedupKey: buildSessionHeartbeatHealthDedupKey(sessionId, 'stale'),
          entityType: 'session',
          entityId: sessionId,
        })
      }
    }
  }

  // Clear recovered sessions so they can re-alert if they go stale again later
  for (const id of ds.staleSessionIds) {
    if (!currentlyStale.has(id)) {
      ds.staleSessionIds.delete(id)
    }
  }

  // Persist sessions whose heartbeat was auto-disabled above. The storage
  // module is imported once, and only when there is something to write.
  if (dirtySessionIds.length > 0) {
    const { upsertSession: upsert } = await import('@/lib/server/storage')
    for (const sid of dirtySessionIds) {
      const s = sessions[sid]
      if (s) upsert(sid, s)
    }
  }

  // Provider reachability checks
  try {
    await runProviderHealthChecks()
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Provider health check failed:', errorMessage(err))
  }

  // OpenClaw gateway health checks + auto-repair
  try {
    await runOpenClawGatewayHealthChecks()
  } catch (err: unknown) {
    log.error(TAG, '[daemon] OpenClaw gateway health check failed:', errorMessage(err))
  }

  // Integrity drift monitoring for identity/config/extension files.
  try {
    const integrity = runIntegrityMonitor(loadSettings())
    ds.lastIntegrityCheckAt = integrity.checkedAt
    ds.lastIntegrityDriftCount = integrity.drifts.length
    if (integrity.drifts.length > 0) {
      for (const drift of integrity.drifts) {
        // Show a path relative to the working directory when the file lives under it.
        const rel = path.relative(process.cwd(), drift.filePath)
        const shortPath = rel && !rel.startsWith('..') ? rel : drift.filePath
        const action = drift.type === 'created'
          ? 'created'
          : drift.type === 'deleted'
            ? 'deleted'
            : 'modified'
        createNotification({
          type: drift.type === 'deleted' ? 'error' : 'warning',
          title: `Integrity drift detected (${drift.kind})`,
          message: `${shortPath} was ${action}.`,
          dedupKey: `integrity:${drift.id}:${drift.nextHash || 'missing'}`,
          entityType: 'session',
          entityId: drift.id,
        })
      }
      await sendHealthAlert(`Integrity monitor detected ${integrity.drifts.length} file drift event(s).`)
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Integrity monitor check failed:', errorMessage(err))
  }

  // Process webhook retry queue
  try {
    await processWebhookRetries()
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Webhook retry processing failed:', errorMessage(err))
  }

  // Periodic memory hygiene: prune orphaned state for deleted sessions/connectors
  try {
    pruneOrphanedState(sessions)
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Memory hygiene sweep failed:', errorMessage(err))
  }

  // Prune old terminal runs and their events to prevent unbounded growth
  try {
    const pruned = pruneOldRuns()
    if (pruned.prunedRuns > 0 || pruned.prunedEvents > 0) {
      log.info(TAG, `[daemon] Pruned ${pruned.prunedRuns} old run(s) and ${pruned.prunedEvents} run event(s)`)
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Run pruning failed:', err instanceof Error ? err.message : String(err))
  }

  // Prune expired runtime locks
  try {
    const locksRemoved = pruneExpiredLocks()
    if (locksRemoved > 0) {
      log.info(TAG, `[daemon] Pruned ${locksRemoved} expired lock(s)`)
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Lock pruning failed:', err instanceof Error ? err.message : String(err))
  }

  // Prune old execution logs (30-day retention)
  try {
    const logsRemoved = clearLogsByAge(30 * 24 * 3600_000)
    if (logsRemoved > 0) {
      log.info(TAG, `[daemon] Pruned ${logsRemoved} old execution log(s)`)
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Execution log pruning failed:', errorMessage(err))
  }

  // Prune old usage records (90-day retention)
  try {
    const usageRemoved = pruneOldUsage(90 * 24 * 3600_000)
    if (usageRemoved > 0) {
      log.info(TAG, `[daemon] Pruned ${usageRemoved} old usage record(s)`)
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Usage pruning failed:', errorMessage(err))
  }

  // Periodic memory database maintenance (dedup + TTL pruning)
  try {
    await runMemoryMaintenanceTick()
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Memory maintenance failed:', err instanceof Error ? err.message : String(err))
  }

  // Drain idle-window callbacks when the system is quiet
  try {
    await drainIdleWindowCallbacks()
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Idle-window drain failed:', err instanceof Error ? err.message : String(err))
  }
}
1072
-
1073
/**
 * Starts the periodic health monitor unless one is already scheduled.
 * A tick is skipped while a previous pass is still running or the daemon is
 * shutting down; `ds.healthCheckRunning` is reset when a pass settles.
 */
function startHealthMonitor() {
  if (ds.healthIntervalId) return

  const onTick = () => {
    const mustSkip = ds.healthCheckRunning || ds.shuttingDown
    if (mustSkip) return

    ds.healthCheckRunning = true
    runHealthChecks()
      .catch((err) => {
        log.error(TAG, '[daemon] Health monitor tick failed:', err?.message || String(err))
      })
      .finally(() => {
        ds.healthCheckRunning = false
      })
  }

  ds.healthIntervalId = setInterval(onTick, HEALTH_CHECK_INTERVAL)
}
1085
-
1086
/** Cancels the periodic health monitor, if one is scheduled, and clears its handle. */
function stopHealthMonitor() {
  if (!ds.healthIntervalId) return
  clearInterval(ds.healthIntervalId)
  ds.healthIntervalId = null
}
1092
-
1093
/**
 * Brings the background services (health monitor, connector health monitor,
 * connector outbox worker, eval scheduler) in line with the current
 * enabled/disabled setting: starts all of them when enabled, stops all of
 * them otherwise.
 *
 * @param options.runConnectorHealthCheckImmediately When not explicitly
 *   `false`, the connector health monitor runs a check right away on start.
 */
function syncDaemonBackgroundServices(options?: { runConnectorHealthCheckImmediately?: boolean }) {
  if (!isDaemonBackgroundServicesEnabled()) {
    stopHealthMonitor()
    stopConnectorHealthMonitor()
    stopConnectorOutboxWorker()
    stopEvalScheduler()
    return
  }

  const runImmediately = options?.runConnectorHealthCheckImmediately !== false
  startHealthMonitor()
  startConnectorHealthMonitor({ runImmediately })
  startConnectorOutboxWorker()
  startEvalScheduler()
}
1108
-
1109
/**
 * Starts the periodic connector health monitor unless one is already
 * scheduled. Overlapping runs are prevented via `ds.connectorHealthCheckRunning`,
 * and nothing runs while the daemon is shutting down.
 *
 * @param options.runImmediately When not explicitly `false`, one check is
 *   triggered right away before the interval is installed.
 */
function startConnectorHealthMonitor(options?: { runImmediately?: boolean }) {
  if (ds.connectorHealthIntervalId) return

  function runOnce() {
    const mustSkip = ds.connectorHealthCheckRunning || ds.shuttingDown
    if (mustSkip) return

    ds.connectorHealthCheckRunning = true
    runConnectorHealthChecks(Date.now())
      .catch((err) => {
        log.error(TAG, '[daemon] Connector health tick failed:', errorMessage(err))
      })
      .finally(() => {
        ds.connectorHealthCheckRunning = false
      })
  }

  const wantsImmediateRun = options?.runImmediately !== false
  if (wantsImmediateRun) runOnce()
  ds.connectorHealthIntervalId = setInterval(runOnce, CONNECTOR_HEALTH_CHECK_INTERVAL)
}
1125
-
1126
/** Cancels the connector health monitor interval, if one is scheduled, and clears its handle. */
function stopConnectorHealthMonitor() {
  if (!ds.connectorHealthIntervalId) return
  clearInterval(ds.connectorHealthIntervalId)
  ds.connectorHealthIntervalId = null
}
1132
-
1133
/**
 * Kicks off one memory-consolidation run in the background.
 *
 * The consolidation module is loaded lazily; its idle-window callbacks are
 * registered first, then the daily consolidation runs and its statistics are
 * logged. The function returns immediately; any failure is logged rather
 * than thrown.
 */
function runConsolidationTick() {
  void (async () => {
    try {
      const {
        runDailyConsolidation,
        registerConsolidationIdleCallback,
        registerCompactionIdleCallback,
      } = await import('@/lib/server/memory/memory-consolidation')

      // Wire idle-window callbacks so consolidation and compaction can run during quiet periods.
      registerConsolidationIdleCallback()
      registerCompactionIdleCallback()

      const stats = await runDailyConsolidation()
      const didWork = stats.digests > 0 || stats.pruned > 0 || stats.deduped > 0
      if (didWork) {
        log.info(TAG, `[daemon] Memory consolidation: ${stats.digests} digest(s), ${stats.pruned} pruned, ${stats.deduped} deduped`)
      }
      if (stats.errors.length > 0) {
        log.warn(TAG, `[daemon] Memory consolidation errors: ${stats.errors.join('; ')}`)
      }
    } catch (err: unknown) {
      log.error(TAG, '[daemon] Memory consolidation failed:', errorMessage(err))
    }
  })()
}
1151
-
1152
/**
 * Schedules memory consolidation: a first run after
 * `MEMORY_CONSOLIDATION_INITIAL_DELAY`, then repeated runs every
 * `MEMORY_CONSOLIDATION_INTERVAL`. Does nothing when either timer already exists.
 */
function startMemoryConsolidation() {
  const alreadyScheduled = ds.memoryConsolidationTimeoutId || ds.memoryConsolidationIntervalId
  if (alreadyScheduled) return

  const onInitialDelayElapsed = () => {
    // The one-shot timer has fired; release its handle before installing the interval.
    ds.memoryConsolidationTimeoutId = null
    runConsolidationTick()
    ds.memoryConsolidationIntervalId = setInterval(runConsolidationTick, MEMORY_CONSOLIDATION_INTERVAL)
  }

  ds.memoryConsolidationTimeoutId = setTimeout(onInitialDelayElapsed, MEMORY_CONSOLIDATION_INITIAL_DELAY)
}
1161
-
1162
/** Cancels both the deferred first consolidation run and the repeating interval, whichever exist. */
function stopMemoryConsolidation() {
  const pendingFirstRun = ds.memoryConsolidationTimeoutId
  if (pendingFirstRun) {
    clearTimeout(pendingFirstRun)
    ds.memoryConsolidationTimeoutId = null
  }

  const repeating = ds.memoryConsolidationIntervalId
  if (repeating) {
    clearInterval(repeating)
    ds.memoryConsolidationIntervalId = null
  }
}
1172
-
1173
// --- Eval scheduler ---

/** Fallback interval between scheduled eval runs. */
const EVAL_DEFAULT_INTERVAL_MS = 24 * 3600_000 // 24 hours

/**
 * Runs the eval suite for every agent whose `heartbeatEnabled` flag is `true`,
 * provided `autonomyEvalEnabled` is set in settings. Each result is logged
 * and surfaced as a notification (`info` at 60% or above, `warning` below).
 * A failure for one agent is logged and does not stop the remaining agents.
 */
async function runEvalSchedulerTick() {
  try {
    if (!loadSettings().autonomyEvalEnabled) return

    const { runEvalSuite } = await import('@/lib/server/eval/runner')
    const agents = loadAgents()

    const eligibleIds: string[] = []
    for (const id of Object.keys(agents)) {
      if (agents[id].heartbeatEnabled === true) eligibleIds.push(id)
    }

    for (const agentId of eligibleIds) {
      try {
        const outcome = await runEvalSuite(agentId)
        // The name is read after the suite finishes, as in the log/notification below.
        const agentName = agents[agentId].name
        log.info(TAG,
          `[daemon:eval] Agent ${agentName}: ${outcome.percentage}% (${outcome.totalScore}/${outcome.maxScore})`,
        )
        createNotification({
          title: `Eval: ${agentName} scored ${outcome.percentage}%`,
          message: `${outcome.runs.length} scenarios, ${outcome.totalScore}/${outcome.maxScore} points`,
          type: outcome.percentage >= 60 ? 'info' : 'warning',
        })
      } catch (err: unknown) {
        log.error(TAG, `[daemon:eval] Failed for agent ${agentId}:`, errorMessage(err))
      }
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon:eval] Scheduler tick error:', errorMessage(err))
  }
}
1207
-
1208
/**
 * Installs the eval scheduler interval when evals are enabled in settings and
 * no scheduler is running yet. The interval comes from the configured cron
 * expression, falling back to `EVAL_DEFAULT_INTERVAL_MS` when parsing yields
 * a falsy value. Any error is swallowed on purpose so that optional eval
 * scheduling can never block daemon start.
 */
function startEvalScheduler() {
  if (ds.evalSchedulerIntervalId) return

  try {
    const settings = loadSettings()
    if (!settings.autonomyEvalEnabled) return

    const parsedMs = parseCronToMs(settings.autonomyEvalCron, EVAL_DEFAULT_INTERVAL_MS)
    const intervalMs = parsedMs || EVAL_DEFAULT_INTERVAL_MS

    ds.evalSchedulerIntervalId = setInterval(runEvalSchedulerTick, intervalMs)
    log.info(TAG, `[daemon:eval] Eval scheduler started (interval=${Math.round(intervalMs / 3600_000)}h)`)
  } catch {
    // Eval scheduling is optional — never let it interfere with daemon start.
  }
}
1220
-
1221
/** Cancels the eval scheduler interval, if one is installed, and clears its handle. */
function stopEvalScheduler() {
  if (!ds.evalSchedulerIntervalId) return
  clearInterval(ds.evalSchedulerIntervalId)
  ds.evalSchedulerIntervalId = null
}
1227
-
1228
/** Interval, in milliseconds, between swarm timeout sweeps. */
const SWARM_TIMEOUT_CHECK_INTERVAL = 30_000

/**
 * Starts the periodic swarm timeout sweep unless it is already scheduled.
 * Each tick is a no-op while the daemon is not running or is shutting down;
 * errors thrown by the sweep are logged and do not stop the interval.
 */
function startSwarmTimeoutChecker() {
  if (ds.swarmTimeoutIntervalId) return

  const sweep = () => {
    const daemonActive = ds.running && !ds.shuttingDown
    if (!daemonActive) return
    try {
      checkSwarmTimeouts()
    } catch (err: unknown) {
      log.error(TAG, `[daemon] Swarm timeout check error: ${errorMessage(err)}`)
    }
  }

  ds.swarmTimeoutIntervalId = setInterval(sweep, SWARM_TIMEOUT_CHECK_INTERVAL)
}
1241
-
1242
/** Cancels the swarm timeout sweep interval, if one is scheduled, and clears its handle. */
function stopSwarmTimeoutChecker() {
  if (!ds.swarmTimeoutIntervalId) return
  clearInterval(ds.swarmTimeoutIntervalId)
  ds.swarmTimeoutIntervalId = null
}
1248
-
1249
/**
 * Re-creates the daemon's long-lived timers so their callbacks belong to the
 * currently loaded module. Does nothing when the daemon is not running.
 *
 * Queue, browser-sweep, memory-consolidation and swarm-timeout timers are
 * restarted only if they were active. Health, connector-health and eval
 * timers are stopped here and then brought back (or left off) by
 * `syncDaemonBackgroundServices()` according to the current setting.
 */
function refreshDaemonTimersForHotReload() {
  if (!ds.running) return

  // Timers restarted in place when they were active.
  if (ds.queueIntervalId) {
    clearInterval(ds.queueIntervalId)
    ds.queueIntervalId = null
    startQueueProcessor()
  }
  if (ds.browserSweepId) {
    clearInterval(ds.browserSweepId)
    ds.browserSweepId = null
    startBrowserSweep()
  }

  // Background-service timers: each stop helper is a no-op when its timer is
  // not set; syncDaemonBackgroundServices() below decides whether they return.
  stopHealthMonitor()
  stopConnectorHealthMonitor()

  const consolidationScheduled = ds.memoryConsolidationTimeoutId || ds.memoryConsolidationIntervalId
  if (consolidationScheduled) {
    stopMemoryConsolidation()
    startMemoryConsolidation()
  }

  stopEvalScheduler()

  if (ds.swarmTimeoutIntervalId) {
    stopSwarmTimeoutChecker()
    startSwarmTimeoutChecker()
  }

  syncDaemonBackgroundServices()
}

// During dev/HMR the daemon state lives on globalThis while existing interval
// callbacks still close over the previous module instance. Refreshing the
// timers at module load makes them run this module's logic rather than stale
// health-alert code paths.
refreshDaemonTimersForHotReload()
1295
-
1296
/**
 * Runs a full health pass and a connector health pass immediately and
 * concurrently, resolving when both have finished. All provider circuit
 * breakers are cleared first so a forced check pings every provider.
 */
export async function runDaemonHealthCheckNow() {
  // Manual/forced checks must not be short-circuited by earlier failures.
  ds.providerPingCircuitBreaker.clear()

  const generalPass = runHealthChecks()
  const connectorPass = runConnectorHealthChecks(Date.now())
  await Promise.all([generalPass, connectorPass])
}
1304
-
1305
/**
 * Test hook: runs a single connector health pass and waits for it to finish.
 *
 * @param now Timestamp handed to the connector health checks; defaults to the current time.
 */
export async function runConnectorHealthCheckNowForTest(now = Date.now()) {
  const pass = runConnectorHealthChecks(now)
  await pass
}
1308
-
1309
/**
 * Builds a snapshot of the daemon's current state: run flags, e-stop state,
 * queue length, the earliest upcoming run among active schedules, heartbeat
 * service status, health-monitor and integrity information, webhook retry
 * counts and the internal guard flags.
 */
export function getDaemonStatus() {
  const estop = loadEstopState()
  const queue = loadQueue()
  const schedules = loadSchedules()
  const reconnectStates = Object.values(getAllReconnectStates())

  // Earliest `nextRunAt` among active schedules, or null when there is none.
  let nextScheduled: number | null = null
  for (const schedule of Object.values(schedules) as unknown as Record<string, unknown>[]) {
    if (schedule.status !== 'active' || !schedule.nextRunAt) continue
    const runAt = schedule.nextRunAt as number
    if (!nextScheduled || runAt < nextScheduled) {
      nextScheduled = runAt
    }
  }

  // Webhook retry queue: split entries into still-pending and dead-lettered.
  const retryEntries = Object.values(loadWebhookRetryQueue()) as WebhookRetryEntry[]
  let pendingRetries = 0
  let deadLettered = 0
  for (const entry of retryEntries) {
    if (entry.deadLettered) deadLettered += 1
    else pendingRetries += 1
  }

  // Connector reconnect states: exhausted versus still backing off.
  let connectorsExhausted = 0
  let connectorsInBackoff = 0
  for (const state of reconnectStates) {
    if (state.exhausted) connectorsExhausted += 1
    else connectorsInBackoff += 1
  }

  return {
    running: ds.running,
    schedulerActive: ds.running,
    autostartEnabled: daemonAutostartEnvEnabled(),
    backgroundServicesEnabled: isDaemonBackgroundServicesEnabled(),
    reducedMode: !isDaemonBackgroundServicesEnabled(),
    manualStopRequested: ds.manualStopRequested,
    estop,
    queueLength: queue.length,
    lastProcessed: ds.lastProcessedAt,
    nextScheduled,
    heartbeat: getHeartbeatServiceStatus(),
    health: {
      monitorActive: !!ds.healthIntervalId,
      connectorMonitorActive: !!ds.connectorHealthIntervalId,
      staleSessions: ds.staleSessionIds.size,
      connectorsInBackoff,
      connectorsExhausted,
      checkIntervalSec: Math.trunc(HEALTH_CHECK_INTERVAL / 1000),
      connectorCheckIntervalSec: Math.trunc(CONNECTOR_HEALTH_CHECK_INTERVAL / 1000),
      integrity: {
        // Integrity monitoring counts as enabled unless explicitly set to false.
        enabled: loadSettings().integrityMonitorEnabled !== false,
        lastCheckedAt: ds.lastIntegrityCheckAt,
        lastDriftCount: ds.lastIntegrityDriftCount,
      },
    },
    webhookRetry: {
      pendingRetries,
      deadLettered,
    },
    guards: {
      healthCheckRunning: ds.healthCheckRunning,
      connectorHealthCheckRunning: ds.connectorHealthCheckRunning,
      shuttingDown: ds.shuttingDown,
      providerCircuitBreakers: ds.providerPingCircuitBreaker.size,
    },
  }
}
1369
-
1370
/**
 * Lightweight health summary safe for external consumption.
 * Reads cached state only — no probes or side effects.
 *
 * @returns
 *   - `ok`: daemon is running, estop is not active, and not every tracked
 *     provider is in circuit-breaker cooldown.
 *   - `uptime`: whole seconds of process uptime.
 *   - `components`: per-area counts for the daemon, enabled connectors,
 *     non-CLI providers and OpenClaw gateways.
 *   - `estop`: whether the estop level is anything other than `'none'`.
 *   - `nextScheduledTask`: earliest `nextRunAt` of an active schedule, or `null`.
 */
export function getDaemonHealthSummary(): {
  ok: boolean
  uptime: number
  components: {
    daemon: { status: 'healthy' | 'stopped' | 'degraded' }
    connectors: { healthy: number; errored: number; total: number }
    providers: { healthy: number; cooldown: number; total: number }
    gateways: { healthy: number; degraded: number; total: number }
  }
  estop: boolean
  nextScheduledTask: number | null
} {
  const estopActive = loadEstopState().level !== 'none'

  // Daemon status: stopped wins over degraded; an active estop degrades a running daemon.
  let daemonStatus: 'healthy' | 'stopped' | 'degraded'
  if (!ds.running) {
    daemonStatus = 'stopped'
  } else {
    daemonStatus = estopActive ? 'degraded' : 'healthy'
  }

  // Connector summary: only enabled connectors count; anything not reporting
  // 'running' (or lacking a string id) is counted as errored.
  const connectorEntries = Object.values(loadConnectors()) as unknown as Record<string, unknown>[]
  const enabledConnectors = connectorEntries.filter((c) => c?.isEnabled === true)
  let healthyConnectors = 0
  for (const connector of enabledConnectors) {
    if (typeof connector.id === 'string' && getConnectorStatus(connector.id) === 'running') {
      healthyConnectors++
    }
  }
  const erroredConnectors = enabledConnectors.length - healthyConnectors

  // Provider summary (based on circuit breaker state). Agents are collapsed
  // into unique provider/credential/endpoint keys; CLI providers are excluded.
  const agentEntries = Object.values(loadAgents()) as unknown as Record<string, unknown>[]
  // Hoisted so the list is not rebuilt for every agent.
  const cliProviders = ['claude-cli', 'codex-cli', 'opencode-cli']
  const providerKeys = new Set<string>()
  for (const agent of agentEntries) {
    if (!agent?.id || typeof agent.id !== 'string') continue
    const provider = typeof agent.provider === 'string' ? agent.provider : ''
    if (!provider || cliProviders.includes(provider)) continue
    const credentialId = typeof agent.credentialId === 'string' ? agent.credentialId : ''
    const apiEndpoint = typeof agent.apiEndpoint === 'string' ? agent.apiEndpoint : ''
    providerKeys.add(`${provider}:${credentialId || 'no-cred'}:${apiEndpoint}`)
  }
  const now = Date.now()
  let cooldownProviders = 0
  for (const key of providerKeys) {
    const breaker = ds.providerPingCircuitBreaker.get(key)
    if (breaker && breaker.skipUntil > now) cooldownProviders++
  }

  // Gateway summary (OpenClaw gateways): every id in the down set is degraded;
  // the total adds the openclaw agents that are not in that set.
  const degradedGateways = ds.openclawDownAgentIds.size
  const upGateways = agentEntries.filter(
    (a) => a?.provider === 'openclaw' && !ds.openclawDownAgentIds.has(a.id as string),
  ).length
  const totalGateways = degradedGateways + upGateways

  // Next scheduled task: smallest nextRunAt among active schedules.
  // nextRunAt is narrowed at runtime instead of being cast, so the declared
  // `number | null` contract holds even if a stored value is malformed.
  let nextScheduled: number | null = null
  for (const schedule of Object.values(loadSchedules()) as unknown as Record<string, unknown>[]) {
    if (schedule.status !== 'active') continue
    const runAt = schedule.nextRunAt
    // `!runAt` keeps the original truthiness rule (0 and NaN are ignored).
    if (typeof runAt !== 'number' || !runAt) continue
    if (nextScheduled === null || runAt < nextScheduled) {
      nextScheduled = runAt
    }
  }

  // With no tracked providers there is nothing to be "all down".
  const allProvidersDown = providerKeys.size > 0 && cooldownProviders >= providerKeys.size
  const ok = ds.running && !estopActive && !allProvidersDown

  return {
    ok,
    uptime: Math.trunc(process.uptime()),
    components: {
      daemon: { status: daemonStatus },
      connectors: {
        healthy: healthyConnectors,
        errored: erroredConnectors,
        total: enabledConnectors.length,
      },
      providers: {
        healthy: providerKeys.size - cooldownProviders,
        cooldown: cooldownProviders,
        total: providerKeys.size,
      },
      gateways: {
        healthy: totalGateways - degradedGateways,
        degraded: degradedGateways,
        total: totalGateways,
      },
    },
    estop: estopActive,
    nextScheduledTask: nextScheduled,
  }
}
1
+ export * from './daemon-state/policy'
2
+ export * from './daemon-state/supervisor'
3
+ export * from './daemon-state/health'