@swarmclawai/swarmclaw 1.2.1 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -85
- package/bin/server-cmd.js +64 -1
- package/package.json +2 -2
- package/skills/coding-agent/SKILL.md +111 -0
- package/skills/github/SKILL.md +140 -0
- package/skills/nano-banana-pro/SKILL.md +62 -0
- package/skills/nano-banana-pro/scripts/generate_image.py +235 -0
- package/skills/nano-pdf/SKILL.md +53 -0
- package/skills/openai-image-gen/SKILL.md +78 -0
- package/skills/openai-image-gen/scripts/gen.py +328 -0
- package/skills/resourceful-problem-solving/SKILL.md +49 -0
- package/skills/skill-creator/SKILL.md +147 -0
- package/skills/skill-creator/scripts/init_skill.py +378 -0
- package/skills/skill-creator/scripts/quick_validate.py +159 -0
- package/skills/summarize/SKILL.md +77 -0
- package/src/app/api/auth/route.ts +20 -5
- package/src/app/api/chats/[id]/devserver/route.ts +13 -19
- package/src/app/api/chats/[id]/messages/route.ts +13 -15
- package/src/app/api/chats/[id]/route.ts +9 -10
- package/src/app/api/chats/[id]/stop/route.ts +5 -7
- package/src/app/api/chats/messages-route.test.ts +8 -6
- package/src/app/api/chats/route.ts +9 -10
- package/src/app/api/ip/route.ts +2 -2
- package/src/app/api/preview-server/route.ts +1 -1
- package/src/app/api/projects/[id]/route.ts +7 -46
- package/src/cli/server-cmd.test.js +74 -0
- package/src/components/chat/chat-area.tsx +45 -23
- package/src/components/chat/message-bubble.test.ts +35 -0
- package/src/components/chat/message-bubble.tsx +19 -9
- package/src/components/chat/message-list.tsx +37 -3
- package/src/components/input/chat-input.tsx +34 -14
- package/src/components/openclaw/openclaw-deploy-panel.tsx +4 -0
- package/src/instrumentation.ts +1 -1
- package/src/lib/chat/assistant-render-id.ts +3 -0
- package/src/lib/chat/chat-streaming-state.test.ts +42 -3
- package/src/lib/chat/chat-streaming-state.ts +20 -8
- package/src/lib/chat/queued-message-queue.test.ts +23 -1
- package/src/lib/chat/queued-message-queue.ts +11 -2
- package/src/lib/providers/cli-utils.test.ts +124 -0
- package/src/lib/server/activity/activity-log.ts +21 -0
- package/src/lib/server/agents/agent-availability.test.ts +10 -5
- package/src/lib/server/agents/agent-cascade.ts +79 -59
- package/src/lib/server/agents/agent-registry.ts +3 -1
- package/src/lib/server/agents/agent-repository.ts +90 -0
- package/src/lib/server/agents/delegation-job-repository.ts +53 -0
- package/src/lib/server/agents/delegation-jobs.ts +11 -4
- package/src/lib/server/agents/guardian-checkpoint-repository.ts +35 -0
- package/src/lib/server/agents/guardian.ts +2 -2
- package/src/lib/server/agents/main-agent-loop.ts +10 -3
- package/src/lib/server/agents/main-loop-state-repository.ts +38 -0
- package/src/lib/server/agents/subagent-runtime.ts +9 -6
- package/src/lib/server/agents/subagent-swarm.ts +3 -2
- package/src/lib/server/agents/task-session.ts +3 -4
- package/src/lib/server/approvals/approval-repository.ts +30 -0
- package/src/lib/server/autonomy/supervisor-incident-repository.ts +42 -0
- package/src/lib/server/chat-execution/chat-execution-types.ts +38 -0
- package/src/lib/server/chat-execution/chat-execution-utils.ts +1 -1
- package/src/lib/server/chat-execution/chat-execution.ts +84 -1926
- package/src/lib/server/chat-execution/chat-turn-finalization.ts +620 -0
- package/src/lib/server/chat-execution/chat-turn-partial-persistence.ts +221 -0
- package/src/lib/server/chat-execution/chat-turn-preflight.ts +133 -0
- package/src/lib/server/chat-execution/chat-turn-preparation.ts +817 -0
- package/src/lib/server/chat-execution/chat-turn-stream-execution.ts +296 -0
- package/src/lib/server/chat-execution/chat-turn-tool-routing.ts +5 -5
- package/src/lib/server/chat-execution/message-classifier.test.ts +329 -0
- package/src/lib/server/chat-execution/post-stream-finalization.ts +1 -1
- package/src/lib/server/chat-execution/prompt-builder.ts +11 -0
- package/src/lib/server/chat-execution/prompt-sections.ts +5 -6
- package/src/lib/server/chat-execution/situational-awareness.ts +12 -7
- package/src/lib/server/chat-execution/stream-agent-chat.ts +16 -13
- package/src/lib/server/chatrooms/chatroom-repository.ts +32 -0
- package/src/lib/server/connectors/connector-repository.ts +58 -0
- package/src/lib/server/connectors/runtime-state.test.ts +117 -0
- package/src/lib/server/credentials/credential-repository.ts +7 -0
- package/src/lib/server/gateways/gateway-profile-repository.ts +4 -0
- package/src/lib/server/memory/memory-abstract.test.ts +59 -0
- package/src/lib/server/missions/mission-repository.ts +74 -0
- package/src/lib/server/missions/mission-service/actions.ts +6 -0
- package/src/lib/server/missions/mission-service/bindings.ts +9 -0
- package/src/lib/server/missions/mission-service/context.ts +4 -0
- package/src/lib/server/missions/mission-service/core.ts +2269 -0
- package/src/lib/server/missions/mission-service/queries.ts +12 -0
- package/src/lib/server/missions/mission-service/recovery.ts +5 -0
- package/src/lib/server/missions/mission-service/ticks.ts +9 -0
- package/src/lib/server/missions/mission-service.test.ts +9 -2
- package/src/lib/server/missions/mission-service.ts +6 -2266
- package/src/lib/server/openclaw/deploy.test.ts +42 -3
- package/src/lib/server/openclaw/deploy.ts +26 -12
- package/src/lib/server/persistence/repository-utils.ts +154 -0
- package/src/lib/server/persistence/storage-context.ts +51 -0
- package/src/lib/server/persistence/transaction.ts +1 -0
- package/src/lib/server/projects/project-repository.ts +36 -0
- package/src/lib/server/projects/project-service.ts +79 -0
- package/src/lib/server/protocols/protocol-normalization.test.ts +6 -4
- package/src/lib/server/runtime/alert-dispatch.ts +1 -1
- package/src/lib/server/runtime/daemon-policy.ts +1 -1
- package/src/lib/server/runtime/daemon-state/core.ts +1570 -0
- package/src/lib/server/runtime/daemon-state/health.ts +6 -0
- package/src/lib/server/runtime/daemon-state/policy.ts +7 -0
- package/src/lib/server/runtime/daemon-state/supervisor.ts +6 -0
- package/src/lib/server/runtime/daemon-state.test.ts +48 -0
- package/src/lib/server/runtime/daemon-state.ts +3 -1470
- package/src/lib/server/runtime/estop-repository.ts +4 -0
- package/src/lib/server/runtime/estop.ts +3 -1
- package/src/lib/server/runtime/heartbeat-service.test.ts +2 -2
- package/src/lib/server/runtime/heartbeat-service.ts +55 -34
- package/src/lib/server/runtime/heartbeat-wake.ts +6 -4
- package/src/lib/server/runtime/idle-window.ts +2 -2
- package/src/lib/server/runtime/network.ts +11 -0
- package/src/lib/server/runtime/orchestrator-events.ts +2 -2
- package/src/lib/server/runtime/queue/claims.ts +4 -0
- package/src/lib/server/runtime/queue/core.ts +2079 -0
- package/src/lib/server/runtime/queue/execution.ts +7 -0
- package/src/lib/server/runtime/queue/followups.ts +4 -0
- package/src/lib/server/runtime/queue/queries.ts +12 -0
- package/src/lib/server/runtime/queue/recovery.ts +7 -0
- package/src/lib/server/runtime/queue-recovery.test.ts +48 -13
- package/src/lib/server/runtime/queue-repository.ts +17 -0
- package/src/lib/server/runtime/queue.ts +5 -2061
- package/src/lib/server/runtime/run-ledger.ts +6 -5
- package/src/lib/server/runtime/run-repository.ts +73 -0
- package/src/lib/server/runtime/runtime-lock-repository.ts +8 -0
- package/src/lib/server/runtime/runtime-settings.ts +1 -1
- package/src/lib/server/runtime/runtime-state.ts +99 -0
- package/src/lib/server/runtime/scheduler.ts +4 -2
- package/src/lib/server/runtime/session-run-manager/cancellation.ts +157 -0
- package/src/lib/server/runtime/session-run-manager/drain.ts +246 -0
- package/src/lib/server/runtime/session-run-manager/enqueue.ts +287 -0
- package/src/lib/server/runtime/session-run-manager/queries.ts +117 -0
- package/src/lib/server/runtime/session-run-manager/recovery.ts +238 -0
- package/src/lib/server/runtime/session-run-manager/state.ts +441 -0
- package/src/lib/server/runtime/session-run-manager/types.ts +74 -0
- package/src/lib/server/runtime/session-run-manager.ts +72 -1377
- package/src/lib/server/runtime/watch-job-repository.ts +35 -0
- package/src/lib/server/runtime/watch-jobs.ts +3 -1
- package/src/lib/server/schedules/schedule-repository.ts +42 -0
- package/src/lib/server/sessions/session-repository.ts +85 -0
- package/src/lib/server/settings/settings-repository.ts +25 -0
- package/src/lib/server/skills/skill-discovery.test.ts +2 -2
- package/src/lib/server/skills/skill-discovery.ts +2 -2
- package/src/lib/server/skills/skill-repository.ts +14 -0
- package/src/lib/server/storage.ts +13 -24
- package/src/lib/server/tasks/task-repository.ts +54 -0
- package/src/lib/server/usage/usage-repository.ts +30 -0
- package/src/lib/server/webhooks/webhook-repository.ts +10 -0
- package/src/lib/strip-internal-metadata.test.ts +42 -41
- package/src/stores/use-chat-store.test.ts +54 -0
- package/src/stores/use-chat-store.ts +21 -5
- /package/{bundled-skills → skills}/google-workspace/SKILL.md +0 -0
|
@@ -1,1470 +1,3 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
import { processNext, cleanupFinishedTaskSessions, validateCompletedTasksQueue, recoverStalledRunningTasks, resumeQueue, promoteDeferred } from '@/lib/server/runtime/queue'
|
|
5
|
-
import { startScheduler, stopScheduler } from '@/lib/server/runtime/scheduler'
|
|
6
|
-
import { sweepOrphanedBrowsers, getActiveBrowserCount } from '@/lib/server/session-tools'
|
|
7
|
-
import {
|
|
8
|
-
autoStartConnectors,
|
|
9
|
-
listRunningConnectors,
|
|
10
|
-
sendConnectorMessage,
|
|
11
|
-
stopAllConnectors,
|
|
12
|
-
startConnector,
|
|
13
|
-
getConnectorStatus,
|
|
14
|
-
checkConnectorHealth,
|
|
15
|
-
createConnectorReconnectState,
|
|
16
|
-
advanceConnectorReconnectState,
|
|
17
|
-
clearReconnectState,
|
|
18
|
-
getAllReconnectStates,
|
|
19
|
-
getReconnectState,
|
|
20
|
-
setReconnectState,
|
|
21
|
-
} from '@/lib/server/connectors/manager'
|
|
22
|
-
import { startConnectorOutboxWorker, stopConnectorOutboxWorker } from '@/lib/server/connectors/outbox'
|
|
23
|
-
import { pruneConnectorTrackingState } from '@/lib/server/connectors/runtime-state'
|
|
24
|
-
import { startHeartbeatService, stopHeartbeatService, getHeartbeatServiceStatus, pruneHeartbeatState, pruneOrchestratorState } from '@/lib/server/runtime/heartbeat-service'
|
|
25
|
-
import { hasOpenClawAgents, ensureGatewayConnected, disconnectAutoGateways, getGateway } from '@/lib/server/openclaw/gateway'
|
|
26
|
-
import { enqueueSessionRun, sweepStuckRuns } from '@/lib/server/runtime/session-run-manager'
|
|
27
|
-
import { pruneOldRuns } from '@/lib/server/runtime/run-ledger'
|
|
28
|
-
import { getEnabledCapabilitySelection } from '@/lib/capability-selection'
|
|
29
|
-
import { WORKSPACE_DIR } from '@/lib/server/data-dir'
|
|
30
|
-
import { DEFAULT_HEARTBEAT_INTERVAL_SEC } from '@/lib/runtime/heartbeat-defaults'
|
|
31
|
-
import { genId } from '@/lib/id'
|
|
32
|
-
import { isAgentDisabled } from '@/lib/server/agents/agent-availability'
|
|
33
|
-
import { errorMessage, hmrSingleton } from '@/lib/shared-utils'
|
|
34
|
-
import path from 'node:path'
|
|
35
|
-
import type { Session, WebhookRetryEntry } from '@/types'
|
|
36
|
-
import { createNotification } from '@/lib/server/create-notification'
|
|
37
|
-
import { pingProvider, OPENAI_COMPATIBLE_DEFAULTS, restoreProviderHealthState } from '@/lib/server/provider-health'
|
|
38
|
-
import { runIntegrityMonitor } from '@/lib/server/integrity-monitor'
|
|
39
|
-
import { notifyOrchestrators } from '@/lib/server/runtime/orchestrator-events'
|
|
40
|
-
import { recoverStaleDelegationJobs } from '@/lib/server/agents/delegation-jobs'
|
|
41
|
-
import { restoreSwarmRegistry } from '@/lib/server/agents/subagent-swarm'
|
|
42
|
-
import { cleanupFinishedSubagents } from '@/lib/server/agents/subagent-runtime'
|
|
43
|
-
import { pruneMainLoopState } from '@/lib/server/agents/main-agent-loop'
|
|
44
|
-
import { pruneSystemEventQueues, pruneOrchestratorEventQueues } from '@/lib/server/runtime/system-events'
|
|
45
|
-
import { checkSwarmTimeouts, ensureProtocolEngineRecovered } from '@/lib/server/protocols/protocol-service'
|
|
46
|
-
import { sweepManagedProcesses, reapOrphanedSandboxContainers } from '@/lib/server/runtime/process-manager'
|
|
47
|
-
import { drainIdleWindowCallbacks } from '@/lib/server/runtime/idle-window'
|
|
48
|
-
import {
|
|
49
|
-
buildSessionHeartbeatHealthDedupKey,
|
|
50
|
-
daemonAutostartEnvEnabled,
|
|
51
|
-
isDaemonBackgroundServicesEnabled,
|
|
52
|
-
parseCronToMs,
|
|
53
|
-
parseHeartbeatIntervalSec,
|
|
54
|
-
shouldNotifyProviderReachabilityIssue,
|
|
55
|
-
shouldSuppressSessionHeartbeatHealthAlert,
|
|
56
|
-
shouldSuppressSyntheticAgentHealthAlert,
|
|
57
|
-
} from '@/lib/server/runtime/daemon-policy'
|
|
58
|
-
import { loadEstopState } from '@/lib/server/runtime/estop'
|
|
59
|
-
import { classifyRuntimeFailure, recordSupervisorIncident } from '@/lib/server/autonomy/supervisor-reflection'
|
|
60
|
-
import { getMemoryDb } from '@/lib/server/memory/memory-db'
|
|
61
|
-
import { clearLogsByAge } from '@/lib/server/execution-log'
|
|
62
|
-
|
|
63
|
-
const TAG = 'daemon-state'
|
|
64
|
-
|
|
65
|
-
const QUEUE_CHECK_INTERVAL = 30_000 // 30 seconds
|
|
66
|
-
const BROWSER_SWEEP_INTERVAL = 60_000 // 60 seconds
|
|
67
|
-
const BROWSER_MAX_AGE = 10 * 60 * 1000 // 10 minutes idle = orphaned
|
|
68
|
-
const HEALTH_CHECK_INTERVAL = 120_000 // 2 minutes
|
|
69
|
-
const CONNECTOR_HEALTH_CHECK_INTERVAL = 15_000 // 15 seconds
|
|
70
|
-
const MEMORY_CONSOLIDATION_INTERVAL = 6 * 3600_000 // 6 hours
|
|
71
|
-
const MEMORY_CONSOLIDATION_INITIAL_DELAY = 60_000 // 1 minute after daemon start
|
|
72
|
-
const STALE_MULTIPLIER = 4 // session is stale after N × heartbeat interval
|
|
73
|
-
const STALE_MIN_MS = 4 * 60 * 1000 // minimum 4 minutes regardless of interval
|
|
74
|
-
const STALE_AUTO_DISABLE_MULTIPLIER = 16 // auto-disable after much longer sustained staleness
|
|
75
|
-
const STALE_AUTO_DISABLE_MIN_MS = 45 * 60 * 1000 // never auto-disable before 45 minutes
|
|
76
|
-
const CONNECTOR_RESTART_BASE_MS = 30_000
|
|
77
|
-
const CONNECTOR_RESTART_MAX_MS = 15 * 60 * 1000
|
|
78
|
-
const MAX_WAKE_ATTEMPTS = 3
|
|
79
|
-
const QUEUE_PROCESS_TIMEOUT = 10 * 60_000 // 10 minutes
|
|
80
|
-
const SHUTDOWN_TIMEOUT_MS = 15_000
|
|
81
|
-
const PROVIDER_PING_CB_THRESHOLD = 3 // trips after 3 consecutive failures
|
|
82
|
-
const PROVIDER_PING_CB_BASE_MS = 300_000 // 5 min initial cooldown
|
|
83
|
-
const PROVIDER_PING_CB_MAX_MS = 1_800_000 // 30 min max cooldown
|
|
84
|
-
|
|
85
|
-
export {
|
|
86
|
-
buildSessionHeartbeatHealthDedupKey,
|
|
87
|
-
isDaemonBackgroundServicesEnabled,
|
|
88
|
-
shouldNotifyProviderReachabilityIssue,
|
|
89
|
-
shouldSuppressSessionHeartbeatHealthAlert,
|
|
90
|
-
shouldSuppressSyntheticAgentHealthAlert,
|
|
91
|
-
}
|
|
92
|
-
|
|
93
|
-
// Store daemon state on globalThis to survive HMR reloads
|
|
94
|
-
interface DaemonState {
|
|
95
|
-
queueIntervalId: ReturnType<typeof setInterval> | null
|
|
96
|
-
browserSweepId: ReturnType<typeof setInterval> | null
|
|
97
|
-
healthIntervalId: ReturnType<typeof setInterval> | null
|
|
98
|
-
connectorHealthIntervalId: ReturnType<typeof setInterval> | null
|
|
99
|
-
memoryConsolidationTimeoutId: ReturnType<typeof setTimeout> | null
|
|
100
|
-
memoryConsolidationIntervalId: ReturnType<typeof setInterval> | null
|
|
101
|
-
evalSchedulerIntervalId: ReturnType<typeof setInterval> | null
|
|
102
|
-
swarmTimeoutIntervalId: ReturnType<typeof setInterval> | null
|
|
103
|
-
/** Session IDs we've already alerted as stale (alert-once semantics). */
|
|
104
|
-
staleSessionIds: Set<string>
|
|
105
|
-
/** OpenClaw gateway agent IDs currently considered down. */
|
|
106
|
-
openclawDownAgentIds: Set<string>
|
|
107
|
-
/** Per-agent auto-repair state for OpenClaw gateways. */
|
|
108
|
-
openclawRepairState: Map<string, { attempts: number; lastAttemptAt: number; cooldownUntil: number }>
|
|
109
|
-
lastIntegrityCheckAt: number | null
|
|
110
|
-
lastIntegrityDriftCount: number
|
|
111
|
-
manualStopRequested: boolean
|
|
112
|
-
running: boolean
|
|
113
|
-
lastProcessedAt: number | null
|
|
114
|
-
healthCheckRunning: boolean
|
|
115
|
-
connectorHealthCheckRunning: boolean
|
|
116
|
-
shuttingDown: boolean
|
|
117
|
-
providerPingCircuitBreaker: Map<string, { consecutiveFailures: number; skipUntil: number }>
|
|
118
|
-
}
|
|
119
|
-
|
|
120
|
-
const ds: DaemonState = hmrSingleton<DaemonState>('__swarmclaw_daemon__', () => ({
|
|
121
|
-
queueIntervalId: null,
|
|
122
|
-
browserSweepId: null,
|
|
123
|
-
healthIntervalId: null,
|
|
124
|
-
connectorHealthIntervalId: null,
|
|
125
|
-
memoryConsolidationTimeoutId: null,
|
|
126
|
-
memoryConsolidationIntervalId: null,
|
|
127
|
-
evalSchedulerIntervalId: null,
|
|
128
|
-
swarmTimeoutIntervalId: null,
|
|
129
|
-
staleSessionIds: new Set<string>(),
|
|
130
|
-
openclawDownAgentIds: new Set<string>(),
|
|
131
|
-
openclawRepairState: new Map<string, { attempts: number; lastAttemptAt: number; cooldownUntil: number }>(),
|
|
132
|
-
lastIntegrityCheckAt: null,
|
|
133
|
-
lastIntegrityDriftCount: 0,
|
|
134
|
-
manualStopRequested: false,
|
|
135
|
-
running: false,
|
|
136
|
-
lastProcessedAt: null,
|
|
137
|
-
healthCheckRunning: false,
|
|
138
|
-
connectorHealthCheckRunning: false,
|
|
139
|
-
shuttingDown: false,
|
|
140
|
-
providerPingCircuitBreaker: new Map<string, { consecutiveFailures: number; skipUntil: number }>(),
|
|
141
|
-
}))
|
|
142
|
-
|
|
143
|
-
// Backfill fields for hot-reloaded daemon state objects from older code versions.
|
|
144
|
-
if (!ds.staleSessionIds) ds.staleSessionIds = new Set<string>()
|
|
145
|
-
if (!ds.openclawDownAgentIds) ds.openclawDownAgentIds = new Set<string>()
|
|
146
|
-
if (!ds.openclawRepairState) ds.openclawRepairState = new Map<string, { attempts: number; lastAttemptAt: number; cooldownUntil: number }>()
|
|
147
|
-
if (ds.lastIntegrityCheckAt === undefined) ds.lastIntegrityCheckAt = null
|
|
148
|
-
if (ds.lastIntegrityDriftCount === undefined) ds.lastIntegrityDriftCount = 0
|
|
149
|
-
// Migrate from old issueLastAlertAt map if present (HMR across code versions)
|
|
150
|
-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
151
|
-
if ((ds as any).issueLastAlertAt) delete (ds as any).issueLastAlertAt
|
|
152
|
-
if (ds.healthIntervalId === undefined) ds.healthIntervalId = null
|
|
153
|
-
if (ds.connectorHealthIntervalId === undefined) ds.connectorHealthIntervalId = null
|
|
154
|
-
if (ds.manualStopRequested === undefined) ds.manualStopRequested = false
|
|
155
|
-
if (ds.memoryConsolidationTimeoutId === undefined) ds.memoryConsolidationTimeoutId = null
|
|
156
|
-
if (ds.memoryConsolidationIntervalId === undefined) ds.memoryConsolidationIntervalId = null
|
|
157
|
-
if (ds.evalSchedulerIntervalId === undefined) ds.evalSchedulerIntervalId = null
|
|
158
|
-
if (ds.swarmTimeoutIntervalId === undefined) ds.swarmTimeoutIntervalId = null
|
|
159
|
-
if (ds.healthCheckRunning === undefined) ds.healthCheckRunning = false
|
|
160
|
-
if (ds.connectorHealthCheckRunning === undefined) ds.connectorHealthCheckRunning = false
|
|
161
|
-
if (ds.shuttingDown === undefined) ds.shuttingDown = false
|
|
162
|
-
if (!ds.providerPingCircuitBreaker) ds.providerPingCircuitBreaker = new Map<string, { consecutiveFailures: number; skipUntil: number }>()
|
|
163
|
-
|
|
164
|
-
export function ensureDaemonStarted(source = 'unknown'): boolean {
|
|
165
|
-
if (ds.running) return false
|
|
166
|
-
if (!daemonAutostartEnvEnabled()) return false
|
|
167
|
-
if (ds.manualStopRequested) return false
|
|
168
|
-
if (loadEstopState().level !== 'none') return false
|
|
169
|
-
startDaemon({ source, manualStart: false })
|
|
170
|
-
return true
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
export function startDaemon(options?: { source?: string; manualStart?: boolean }) {
|
|
174
|
-
const source = options?.source || 'unknown'
|
|
175
|
-
const manualStart = options?.manualStart === true
|
|
176
|
-
if (manualStart) ds.manualStopRequested = false
|
|
177
|
-
const estop = loadEstopState()
|
|
178
|
-
if (estop.level !== 'none') {
|
|
179
|
-
notify('daemon')
|
|
180
|
-
log.warn(TAG, `[daemon] Start blocked by estop (level=${estop.level}, source=${source})`)
|
|
181
|
-
return
|
|
182
|
-
}
|
|
183
|
-
|
|
184
|
-
if (ds.running) {
|
|
185
|
-
// In dev/HMR, daemon can already be flagged running while new interval types
|
|
186
|
-
// (for example health monitor) were introduced in newer code.
|
|
187
|
-
startQueueProcessor()
|
|
188
|
-
startBrowserSweep()
|
|
189
|
-
startHeartbeatService()
|
|
190
|
-
startMemoryConsolidation()
|
|
191
|
-
startSwarmTimeoutChecker()
|
|
192
|
-
syncDaemonBackgroundServices({ runConnectorHealthCheckImmediately: false })
|
|
193
|
-
return
|
|
194
|
-
}
|
|
195
|
-
ds.running = true
|
|
196
|
-
notify('daemon')
|
|
197
|
-
log.info(TAG, `[daemon] Starting daemon (source=${source}, scheduler + queue processor + heartbeat)`)
|
|
198
|
-
|
|
199
|
-
try {
|
|
200
|
-
validateCompletedTasksQueue()
|
|
201
|
-
cleanupFinishedTaskSessions()
|
|
202
|
-
recoverStaleDelegationJobs({ fullRestart: true })
|
|
203
|
-
ensureProtocolEngineRecovered()
|
|
204
|
-
restoreProviderHealthState()
|
|
205
|
-
try {
|
|
206
|
-
const lost = restoreSwarmRegistry()
|
|
207
|
-
if (lost > 0) log.info(TAG, `[daemon] Marked ${lost} in-flight swarm(s) as lost after restart`)
|
|
208
|
-
} catch { /* best-effort */ }
|
|
209
|
-
resumeQueue()
|
|
210
|
-
startScheduler()
|
|
211
|
-
startQueueProcessor()
|
|
212
|
-
startBrowserSweep()
|
|
213
|
-
startHeartbeatService()
|
|
214
|
-
startMemoryConsolidation()
|
|
215
|
-
startSwarmTimeoutChecker()
|
|
216
|
-
syncDaemonBackgroundServices({ runConnectorHealthCheckImmediately: false })
|
|
217
|
-
} catch (err: unknown) {
|
|
218
|
-
ds.running = false
|
|
219
|
-
notify('daemon')
|
|
220
|
-
log.error(TAG, '[daemon] Failed to start:', errorMessage(err))
|
|
221
|
-
throw err
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
if (isDaemonBackgroundServicesEnabled()) {
|
|
225
|
-
// Auto-start enabled connectors only when the full background stack is enabled.
|
|
226
|
-
autoStartConnectors().catch((err: unknown) => {
|
|
227
|
-
log.error(TAG, '[daemon] Error auto-starting connectors:', errorMessage(err))
|
|
228
|
-
})
|
|
229
|
-
}
|
|
230
|
-
}
|
|
231
|
-
|
|
232
|
-
export async function stopDaemon(options?: { source?: string; manualStop?: boolean }) {
|
|
233
|
-
const source = options?.source || 'unknown'
|
|
234
|
-
if (options?.manualStop === true) ds.manualStopRequested = true
|
|
235
|
-
if (!ds.running) return
|
|
236
|
-
ds.running = false
|
|
237
|
-
ds.shuttingDown = true
|
|
238
|
-
notify('daemon')
|
|
239
|
-
log.info(TAG, `[daemon] Stopping daemon (source=${source})`)
|
|
240
|
-
|
|
241
|
-
stopScheduler()
|
|
242
|
-
stopQueueProcessor()
|
|
243
|
-
stopBrowserSweep()
|
|
244
|
-
stopHealthMonitor()
|
|
245
|
-
stopConnectorHealthMonitor()
|
|
246
|
-
stopConnectorOutboxWorker()
|
|
247
|
-
stopHeartbeatService()
|
|
248
|
-
stopMemoryConsolidation()
|
|
249
|
-
stopSwarmTimeoutChecker()
|
|
250
|
-
stopEvalScheduler()
|
|
251
|
-
try {
|
|
252
|
-
await Promise.race([
|
|
253
|
-
stopAllConnectors({ disable: false }),
|
|
254
|
-
new Promise<void>((_, reject) =>
|
|
255
|
-
setTimeout(() => reject(new Error('Connector shutdown timed out')), SHUTDOWN_TIMEOUT_MS)
|
|
256
|
-
),
|
|
257
|
-
])
|
|
258
|
-
} catch (err: unknown) {
|
|
259
|
-
log.warn(TAG, `[daemon] Connector shutdown issue: ${errorMessage(err)}`)
|
|
260
|
-
} finally {
|
|
261
|
-
ds.shuttingDown = false
|
|
262
|
-
}
|
|
263
|
-
}
|
|
264
|
-
|
|
265
|
-
function startBrowserSweep() {
|
|
266
|
-
if (ds.browserSweepId) return
|
|
267
|
-
ds.browserSweepId = setInterval(() => {
|
|
268
|
-
const count = getActiveBrowserCount()
|
|
269
|
-
if (count > 0) {
|
|
270
|
-
const cleaned = sweepOrphanedBrowsers(BROWSER_MAX_AGE)
|
|
271
|
-
if (cleaned > 0) {
|
|
272
|
-
log.info(TAG, `[daemon] Cleaned ${cleaned} orphaned browser(s), ${getActiveBrowserCount()} still active`)
|
|
273
|
-
}
|
|
274
|
-
}
|
|
275
|
-
}, BROWSER_SWEEP_INTERVAL)
|
|
276
|
-
}
|
|
277
|
-
|
|
278
|
-
function stopBrowserSweep() {
|
|
279
|
-
if (ds.browserSweepId) {
|
|
280
|
-
clearInterval(ds.browserSweepId)
|
|
281
|
-
ds.browserSweepId = null
|
|
282
|
-
}
|
|
283
|
-
// Kill all remaining browsers on shutdown
|
|
284
|
-
sweepOrphanedBrowsers(0)
|
|
285
|
-
}
|
|
286
|
-
|
|
287
|
-
export async function syncOpenClawGatewayLifecycle() {
|
|
288
|
-
if (!hasOpenClawAgents()) {
|
|
289
|
-
disconnectAutoGateways()
|
|
290
|
-
return
|
|
291
|
-
}
|
|
292
|
-
if (!getGateway()?.connected) {
|
|
293
|
-
await ensureGatewayConnected()
|
|
294
|
-
}
|
|
295
|
-
}
|
|
296
|
-
|
|
297
|
-
function startQueueProcessor() {
|
|
298
|
-
if (ds.queueIntervalId) return
|
|
299
|
-
ds.queueIntervalId = setInterval(async () => {
|
|
300
|
-
if (!ds.running) return
|
|
301
|
-
const queue = loadQueue()
|
|
302
|
-
if (queue.length > 0) {
|
|
303
|
-
log.info(TAG, `[daemon] Processing ${queue.length} queued task(s)`)
|
|
304
|
-
try {
|
|
305
|
-
await Promise.race([
|
|
306
|
-
processNext(),
|
|
307
|
-
new Promise<void>((_, reject) =>
|
|
308
|
-
setTimeout(() => reject(new Error('Queue processing timed out')), QUEUE_PROCESS_TIMEOUT)
|
|
309
|
-
),
|
|
310
|
-
])
|
|
311
|
-
} catch (err: unknown) {
|
|
312
|
-
log.error(TAG, `[daemon] Queue processing error/timeout: ${errorMessage(err)}`)
|
|
313
|
-
}
|
|
314
|
-
ds.lastProcessedAt = Date.now()
|
|
315
|
-
}
|
|
316
|
-
if (!isDaemonBackgroundServicesEnabled()) return
|
|
317
|
-
// OpenClaw gateway lifecycle: lazy connect for active OpenClaw agents, stop auto-managed reconnects when none remain.
|
|
318
|
-
try {
|
|
319
|
-
await syncOpenClawGatewayLifecycle()
|
|
320
|
-
} catch { /* gateway errors are non-fatal */ }
|
|
321
|
-
}, QUEUE_CHECK_INTERVAL)
|
|
322
|
-
}
|
|
323
|
-
|
|
324
|
-
function stopQueueProcessor() {
|
|
325
|
-
if (ds.queueIntervalId) {
|
|
326
|
-
clearInterval(ds.queueIntervalId)
|
|
327
|
-
ds.queueIntervalId = null
|
|
328
|
-
}
|
|
329
|
-
}
|
|
330
|
-
|
|
331
|
-
async function sendHealthAlert(input: string | {
|
|
332
|
-
text: string
|
|
333
|
-
dedupKey?: string
|
|
334
|
-
entityType?: string
|
|
335
|
-
entityId?: string
|
|
336
|
-
}) {
|
|
337
|
-
const payload = typeof input === 'string' ? { text: input } : input
|
|
338
|
-
const text = payload.text
|
|
339
|
-
log.warn(TAG, `[health] ${text}`)
|
|
340
|
-
createNotification({
|
|
341
|
-
type: 'warning',
|
|
342
|
-
title: 'SwarmClaw health alert',
|
|
343
|
-
message: text,
|
|
344
|
-
dedupKey: payload.dedupKey || `health-alert:${text}`,
|
|
345
|
-
entityType: payload.entityType,
|
|
346
|
-
entityId: payload.entityId,
|
|
347
|
-
dispatchExternally: false,
|
|
348
|
-
})
|
|
349
|
-
}
|
|
350
|
-
|
|
351
|
-
async function runConnectorHealthChecks(now: number) {
|
|
352
|
-
// First, collapse dead runtime instances into persisted error state so the
|
|
353
|
-
// daemon can own the restart cadence and backoff policy.
|
|
354
|
-
try {
|
|
355
|
-
await checkConnectorHealth()
|
|
356
|
-
} catch (err: unknown) {
|
|
357
|
-
log.error(TAG, '[health] Connector isAlive check failed:', errorMessage(err))
|
|
358
|
-
}
|
|
359
|
-
|
|
360
|
-
const connectors = loadConnectors()
|
|
361
|
-
for (const connector of Object.values(connectors) as unknown as Record<string, unknown>[]) {
|
|
362
|
-
if (!connector?.id || typeof connector.id !== 'string') continue
|
|
363
|
-
if (connector.isEnabled !== true) {
|
|
364
|
-
clearReconnectState(connector.id)
|
|
365
|
-
continue
|
|
366
|
-
}
|
|
367
|
-
|
|
368
|
-
const runtimeStatus = getConnectorStatus(connector.id)
|
|
369
|
-
if (runtimeStatus === 'running') {
|
|
370
|
-
clearReconnectState(connector.id)
|
|
371
|
-
continue
|
|
372
|
-
}
|
|
373
|
-
|
|
374
|
-
const current = getReconnectState(connector.id)
|
|
375
|
-
?? createConnectorReconnectState(
|
|
376
|
-
{ error: typeof connector.lastError === 'string' ? connector.lastError : '' },
|
|
377
|
-
{ initialBackoffMs: CONNECTOR_RESTART_BASE_MS },
|
|
378
|
-
)
|
|
379
|
-
|
|
380
|
-
if (current.exhausted) {
|
|
381
|
-
continue
|
|
382
|
-
}
|
|
383
|
-
|
|
384
|
-
if (current.nextRetryAt > now) continue
|
|
385
|
-
|
|
386
|
-
// Notify on first detection of a down connector
|
|
387
|
-
if (current.attempts === 0) {
|
|
388
|
-
createNotification({
|
|
389
|
-
type: 'warning',
|
|
390
|
-
title: `Connector "${connector.name}" is down`,
|
|
391
|
-
message: 'Auto-restart in progress.',
|
|
392
|
-
dedupKey: `connector-down:${connector.id}`,
|
|
393
|
-
entityType: 'connector',
|
|
394
|
-
entityId: connector.id,
|
|
395
|
-
})
|
|
396
|
-
}
|
|
397
|
-
|
|
398
|
-
try {
|
|
399
|
-
await startConnector(connector.id)
|
|
400
|
-
clearReconnectState(connector.id)
|
|
401
|
-
await sendHealthAlert(`Connector "${connector.name}" (${connector.platform}) was down and has been auto-restarted.`)
|
|
402
|
-
} catch (err: unknown) {
|
|
403
|
-
const message = errorMessage(err)
|
|
404
|
-
const next = advanceConnectorReconnectState(current, message, now, {
|
|
405
|
-
initialBackoffMs: CONNECTOR_RESTART_BASE_MS,
|
|
406
|
-
maxBackoffMs: CONNECTOR_RESTART_MAX_MS,
|
|
407
|
-
maxAttempts: MAX_WAKE_ATTEMPTS,
|
|
408
|
-
})
|
|
409
|
-
setReconnectState(connector.id, next)
|
|
410
|
-
if (next.exhausted) {
|
|
411
|
-
log.warn(TAG, `[health] Connector "${connector.name}" exceeded ${MAX_WAKE_ATTEMPTS} auto-restart attempts — giving up until the server restarts or the user retries manually`)
|
|
412
|
-
connector.status = 'error'
|
|
413
|
-
connector.lastError = `Auto-restart gave up after ${MAX_WAKE_ATTEMPTS} attempts: ${message}`
|
|
414
|
-
connector.updatedAt = Date.now()
|
|
415
|
-
connectors[connector.id] = connector
|
|
416
|
-
saveConnectors(connectors)
|
|
417
|
-
notify('connectors')
|
|
418
|
-
notifyOrchestrators(`Connector ${connector.name || connector.id} status: error — auto-restart exhausted after ${MAX_WAKE_ATTEMPTS} attempts`, `connector-status:${connector.id}`)
|
|
419
|
-
createNotification({
|
|
420
|
-
type: 'error',
|
|
421
|
-
title: `Connector "${connector.name}" failed`,
|
|
422
|
-
message: `Auto-restart gave up after ${MAX_WAKE_ATTEMPTS} attempts.`,
|
|
423
|
-
dedupKey: `connector-gave-up:${connector.id}`,
|
|
424
|
-
entityType: 'connector',
|
|
425
|
-
entityId: connector.id,
|
|
426
|
-
})
|
|
427
|
-
} else {
|
|
428
|
-
log.warn(TAG, `[health] Connector auto-restart failed for ${connector.name} (attempt ${next.attempts}/${MAX_WAKE_ATTEMPTS}): ${message}`)
|
|
429
|
-
}
|
|
430
|
-
}
|
|
431
|
-
}
|
|
432
|
-
|
|
433
|
-
// Purge restart state for connectors that no longer exist in storage
|
|
434
|
-
for (const id of Object.keys(getAllReconnectStates())) {
|
|
435
|
-
if (!connectors[id] || connectors[id]?.isEnabled !== true) clearReconnectState(id)
|
|
436
|
-
}
|
|
437
|
-
}
|
|
438
|
-
|
|
439
|
-
async function processWebhookRetries() {
|
|
440
|
-
const retryQueue = loadWebhookRetryQueue()
|
|
441
|
-
const now = Date.now()
|
|
442
|
-
const dueEntries: WebhookRetryEntry[] = []
|
|
443
|
-
|
|
444
|
-
for (const raw of Object.values(retryQueue)) {
|
|
445
|
-
const entry = raw as WebhookRetryEntry
|
|
446
|
-
if (entry.deadLettered) continue
|
|
447
|
-
if (entry.nextRetryAt > now) continue
|
|
448
|
-
dueEntries.push(entry)
|
|
449
|
-
}
|
|
450
|
-
|
|
451
|
-
if (dueEntries.length === 0) return
|
|
452
|
-
|
|
453
|
-
const webhooks = loadWebhooks()
|
|
454
|
-
const agents = loadAgents()
|
|
455
|
-
const sessions = loadSessions()
|
|
456
|
-
|
|
457
|
-
for (const entry of dueEntries) {
|
|
458
|
-
const webhook = webhooks[entry.webhookId] as unknown as Record<string, unknown> | undefined
|
|
459
|
-
if (!webhook) {
|
|
460
|
-
// Webhook deleted — drop the retry
|
|
461
|
-
deleteWebhookRetry(entry.id)
|
|
462
|
-
continue
|
|
463
|
-
}
|
|
464
|
-
|
|
465
|
-
const agentId = typeof webhook.agentId === 'string' ? webhook.agentId : ''
|
|
466
|
-
const agent = agentId ? (agents[agentId] as unknown as Record<string, unknown> | undefined) : null
|
|
467
|
-
if (!agent) {
|
|
468
|
-
entry.deadLettered = true
|
|
469
|
-
upsertWebhookRetry(entry.id, entry)
|
|
470
|
-
log.warn(TAG, `[webhook-retry] Dead-lettered ${entry.id}: agent not found for webhook ${entry.webhookId}`)
|
|
471
|
-
continue
|
|
472
|
-
}
|
|
473
|
-
if (isAgentDisabled(agent)) {
|
|
474
|
-
entry.deadLettered = true
|
|
475
|
-
upsertWebhookRetry(entry.id, entry)
|
|
476
|
-
log.warn(TAG, `[webhook-retry] Dead-lettered ${entry.id}: agent disabled for webhook ${entry.webhookId}`)
|
|
477
|
-
continue
|
|
478
|
-
}
|
|
479
|
-
|
|
480
|
-
// Find or create a webhook session (same logic as the POST handler)
|
|
481
|
-
const sessionName = `webhook:${entry.webhookId}`
|
|
482
|
-
let session = Object.values(sessions).find(
|
|
483
|
-
(s: unknown) => {
|
|
484
|
-
const rec = s as Record<string, unknown>
|
|
485
|
-
return rec.name === sessionName && rec.agentId === agent.id
|
|
486
|
-
},
|
|
487
|
-
) as unknown as Record<string, unknown> | undefined
|
|
488
|
-
|
|
489
|
-
if (!session) {
|
|
490
|
-
const sessionId = genId()
|
|
491
|
-
const ts = Date.now()
|
|
492
|
-
session = {
|
|
493
|
-
id: sessionId,
|
|
494
|
-
name: sessionName,
|
|
495
|
-
cwd: WORKSPACE_DIR,
|
|
496
|
-
user: 'system',
|
|
497
|
-
provider: agent.provider || 'claude-cli',
|
|
498
|
-
model: agent.model || '',
|
|
499
|
-
credentialId: agent.credentialId || null,
|
|
500
|
-
apiEndpoint: agent.apiEndpoint || null,
|
|
501
|
-
claudeSessionId: null,
|
|
502
|
-
codexThreadId: null,
|
|
503
|
-
opencodeSessionId: null,
|
|
504
|
-
delegateResumeIds: { claudeCode: null, codex: null, opencode: null, gemini: null },
|
|
505
|
-
messages: [],
|
|
506
|
-
createdAt: ts,
|
|
507
|
-
lastActiveAt: ts,
|
|
508
|
-
sessionType: 'human',
|
|
509
|
-
agentId: agent.id,
|
|
510
|
-
parentSessionId: null,
|
|
511
|
-
...getEnabledCapabilitySelection(agent),
|
|
512
|
-
heartbeatEnabled: (agent.heartbeatEnabled as boolean | undefined) ?? false,
|
|
513
|
-
heartbeatIntervalSec: (agent.heartbeatIntervalSec as number | null | undefined) ?? null,
|
|
514
|
-
}
|
|
515
|
-
const { upsertSession: upsert } = await import('@/lib/server/storage')
|
|
516
|
-
upsert(session.id as string, session)
|
|
517
|
-
}
|
|
518
|
-
|
|
519
|
-
const payloadPreview = (entry.payload || '').slice(0, 12_000)
|
|
520
|
-
const prompt = [
|
|
521
|
-
'Webhook event received (retry).',
|
|
522
|
-
`Webhook ID: ${entry.webhookId}`,
|
|
523
|
-
`Webhook Name: ${(webhook.name as string) || entry.webhookId}`,
|
|
524
|
-
`Source: ${(webhook.source as string) || 'custom'}`,
|
|
525
|
-
`Event: ${entry.event}`,
|
|
526
|
-
`Retry attempt: ${entry.attempts}`,
|
|
527
|
-
`Original received at: ${new Date(entry.createdAt).toISOString()}`,
|
|
528
|
-
'',
|
|
529
|
-
'Payload:',
|
|
530
|
-
payloadPreview || '(empty payload)',
|
|
531
|
-
'',
|
|
532
|
-
'Handle this event now. If this requires notifying the user, use configured connector tools.',
|
|
533
|
-
].join('\n')
|
|
534
|
-
|
|
535
|
-
try {
|
|
536
|
-
const run = enqueueSessionRun({
|
|
537
|
-
sessionId: session.id as string,
|
|
538
|
-
message: prompt,
|
|
539
|
-
source: 'webhook',
|
|
540
|
-
internal: false,
|
|
541
|
-
mode: 'followup',
|
|
542
|
-
})
|
|
543
|
-
|
|
544
|
-
appendWebhookLog(genId(8), {
|
|
545
|
-
id: genId(8),
|
|
546
|
-
webhookId: entry.webhookId,
|
|
547
|
-
event: entry.event,
|
|
548
|
-
payload: (entry.payload || '').slice(0, 2000),
|
|
549
|
-
status: 'success',
|
|
550
|
-
sessionId: session.id,
|
|
551
|
-
runId: run.runId,
|
|
552
|
-
timestamp: Date.now(),
|
|
553
|
-
})
|
|
554
|
-
|
|
555
|
-
deleteWebhookRetry(entry.id)
|
|
556
|
-
log.info(TAG, `[webhook-retry] Successfully retried ${entry.id} for webhook ${entry.webhookId} (attempt ${entry.attempts})`)
|
|
557
|
-
} catch (err: unknown) {
|
|
558
|
-
const errorMsg = errorMessage(err)
|
|
559
|
-
entry.attempts += 1
|
|
560
|
-
|
|
561
|
-
if (entry.attempts >= entry.maxAttempts) {
|
|
562
|
-
entry.deadLettered = true
|
|
563
|
-
upsertWebhookRetry(entry.id, entry)
|
|
564
|
-
log.warn(TAG, `[webhook-retry] Dead-lettered ${entry.id} after ${entry.attempts} attempts: ${errorMsg}`)
|
|
565
|
-
const failure = classifyRuntimeFailure({ source: 'webhook', message: errorMsg })
|
|
566
|
-
if (session?.id) {
|
|
567
|
-
recordSupervisorIncident({
|
|
568
|
-
runId: entry.id,
|
|
569
|
-
sessionId: session.id as string,
|
|
570
|
-
taskId: null,
|
|
571
|
-
agentId: agentId || null,
|
|
572
|
-
source: 'webhook',
|
|
573
|
-
kind: 'runtime_failure',
|
|
574
|
-
severity: failure.severity,
|
|
575
|
-
summary: `Webhook delivery dead-lettered: ${errorMsg}`.slice(0, 320),
|
|
576
|
-
details: errorMsg,
|
|
577
|
-
failureFamily: failure.family,
|
|
578
|
-
remediation: failure.remediation,
|
|
579
|
-
repairPrompt: failure.repairPrompt,
|
|
580
|
-
autoAction: null,
|
|
581
|
-
})
|
|
582
|
-
}
|
|
583
|
-
|
|
584
|
-
appendWebhookLog(genId(8), {
|
|
585
|
-
id: genId(8),
|
|
586
|
-
webhookId: entry.webhookId,
|
|
587
|
-
event: entry.event,
|
|
588
|
-
payload: (entry.payload || '').slice(0, 2000),
|
|
589
|
-
status: 'error',
|
|
590
|
-
error: `Dead-lettered after ${entry.attempts} attempts: ${errorMsg}`,
|
|
591
|
-
timestamp: Date.now(),
|
|
592
|
-
})
|
|
593
|
-
} else {
|
|
594
|
-
// Exponential backoff: 30s * 2^attempt + random jitter (0-5000ms)
|
|
595
|
-
const jitter = Math.floor(Math.random() * 5000)
|
|
596
|
-
entry.nextRetryAt = Date.now() + (30_000 * Math.pow(2, entry.attempts)) + jitter
|
|
597
|
-
upsertWebhookRetry(entry.id, entry)
|
|
598
|
-
log.warn(TAG, `[webhook-retry] Retry ${entry.id} failed (attempt ${entry.attempts}/${entry.maxAttempts}), next at ${new Date(entry.nextRetryAt).toISOString()}: ${errorMsg}`)
|
|
599
|
-
}
|
|
600
|
-
}
|
|
601
|
-
}
|
|
602
|
-
}
|
|
603
|
-
|
|
604
|
-
/**
 * Ping every distinct remote provider referenced by the configured agents.
 *
 * Pass 1 builds a deduplicated list of (provider, credentialId, apiEndpoint)
 * tuples (CLI providers are excluded; OpenClaw agents are keyed per agent
 * since each may use a different gateway). Pass 2 pings each tuple behind a
 * per-tuple circuit breaker with exponential cooldown, emitting a deduped
 * warning notification when a provider is unreachable.
 */
async function runProviderHealthChecks() {
  const agents = loadAgents()
  const credentials = loadCredentials()

  // Build deduplicated set of { provider, credentialId, apiEndpoint } tuples
  const seen = new Set<string>()
  const tuples: { provider: string; credentialId: string; apiEndpoint: string; agentId: string; credentialName: string }[] = []

  for (const agent of Object.values(agents) as unknown as Record<string, unknown>[]) {
    if (!agent?.id || typeof agent.id !== 'string') continue
    if (shouldSuppressSyntheticAgentHealthAlert(agent.id)) continue
    const provider = typeof agent.provider === 'string' ? agent.provider : ''
    // Local CLI providers have no remote endpoint to ping.
    if (!provider || ['claude-cli', 'codex-cli', 'opencode-cli'].includes(provider)) continue

    const credentialId = typeof agent.credentialId === 'string' ? agent.credentialId : ''
    const apiEndpoint = typeof agent.apiEndpoint === 'string' ? agent.apiEndpoint : ''

    // For OpenClaw, scope per agent (each may have a different gateway)
    const key = provider === 'openclaw'
      ? `openclaw:${agent.id}`
      : `${provider}:${credentialId || 'no-cred'}:${apiEndpoint}`
    if (seen.has(key)) continue
    seen.add(key)

    const cred = credentialId ? (credentials[credentialId] as unknown as Record<string, unknown> | undefined) : undefined
    // Fall back to the provider name when the credential has no display name.
    const credName = typeof cred?.name === 'string' ? cred.name : provider

    tuples.push({
      provider,
      credentialId,
      apiEndpoint,
      agentId: agent.id,
      credentialName: credName,
    })
  }

  for (const tuple of tuples) {
    // Circuit breaker: skip providers that have failed repeatedly
    const cbKey = `${tuple.provider}:${tuple.credentialId || 'no-cred'}:${tuple.apiEndpoint}`
    const cb = ds.providerPingCircuitBreaker.get(cbKey)
    const now = Date.now()
    if (cb && cb.skipUntil > now) continue

    let apiKey: string | undefined
    if (tuple.credentialId) {
      const cred = credentials[tuple.credentialId] as unknown as Record<string, unknown> | undefined
      if (cred?.encryptedKey && typeof cred.encryptedKey === 'string') {
        try { apiKey = decryptKey(cred.encryptedKey) } catch { /* skip undecryptable */ continue }
      }
    }

    // Explicit endpoint wins; otherwise use the provider's known default.
    const endpoint = tuple.apiEndpoint || OPENAI_COMPATIBLE_DEFAULTS[tuple.provider]?.defaultEndpoint || undefined
    const result = await pingProvider(tuple.provider, apiKey, endpoint)

    if (!result.ok) {
      // Update circuit breaker state
      const existing = ds.providerPingCircuitBreaker.get(cbKey) || { consecutiveFailures: 0, skipUntil: 0 }
      existing.consecutiveFailures += 1
      if (existing.consecutiveFailures >= PROVIDER_PING_CB_THRESHOLD) {
        // Exponential cooldown from the threshold onward, capped at the max.
        const cooldown = Math.min(
          PROVIDER_PING_CB_BASE_MS * Math.pow(2, existing.consecutiveFailures - PROVIDER_PING_CB_THRESHOLD),
          PROVIDER_PING_CB_MAX_MS,
        )
        existing.skipUntil = now + cooldown
        log.info(TAG, `[health] Circuit breaker tripped for ${tuple.credentialName} — skipping pings for ${Math.round(cooldown / 60_000)}m`)
      }
      ds.providerPingCircuitBreaker.set(cbKey, existing)

      // Respect per-provider notification throttling.
      if (!shouldNotifyProviderReachabilityIssue(tuple.provider)) {
        continue
      }

      const dedupKey = `provider-down:${tuple.credentialId || tuple.provider}`

      const entityType = tuple.credentialId ? 'credential' : undefined
      const entityId = tuple.credentialId || undefined

      createNotification({
        type: 'warning',
        title: `Provider unreachable: ${tuple.credentialName}`,
        message: result.message,
        dedupKey,
        entityType,
        entityId,
      })
    } else {
      // Success — clear circuit breaker
      ds.providerPingCircuitBreaker.delete(cbKey)
    }
  }
}
|
|
695
|
-
|
|
696
|
-
// Max consecutive `openclaw doctor --fix` repair attempts before backing off.
const OPENCLAW_REPAIR_MAX_ATTEMPTS = 3
// How long repair attempts stay paused once the attempt budget is exhausted.
const OPENCLAW_REPAIR_COOLDOWN_MS = 300_000 // 5 minutes
|
699
|
-
/**
 * Probe each OpenClaw agent's gateway and drive the auto-repair state machine.
 *
 * Healthy probe: clears the agent from the down-set/repair map and, if it was
 * previously marked down, emits a recovery notification. Unhealthy probe:
 * runs `openclaw doctor --fix` up to OPENCLAW_REPAIR_MAX_ATTEMPTS times, then
 * enters a cooldown (OPENCLAW_REPAIR_COOLDOWN_MS) before the attempt counter
 * resets; an error notification (deduped per agent) is raised each pass that
 * is not in cooldown.
 */
async function runOpenClawGatewayHealthChecks() {
  const agents = loadAgents()
  const credentials = loadCredentials()

  // Build deduplicated OpenClaw agent tuples
  const seen = new Set<string>()
  const tuples: { agentId: string; endpoint: string; credentialId: string; credentialName: string }[] = []

  for (const agent of Object.values(agents) as unknown as Record<string, unknown>[]) {
    if (!agent?.id || typeof agent.id !== 'string') continue
    if (shouldSuppressSyntheticAgentHealthAlert(agent.id)) continue
    if (agent.provider !== 'openclaw') continue

    const key = `openclaw:${agent.id}`
    if (seen.has(key)) continue
    seen.add(key)

    const credentialId = typeof agent.credentialId === 'string' ? agent.credentialId : ''
    const endpoint = typeof agent.apiEndpoint === 'string' ? agent.apiEndpoint : ''
    const cred = credentialId ? (credentials[credentialId] as unknown as Record<string, unknown> | undefined) : undefined
    const credName = typeof cred?.name === 'string' ? cred.name : 'openclaw'

    tuples.push({ agentId: agent.id, endpoint, credentialId, credentialName: credName })
  }

  if (!tuples.length) return

  // Lazy import: only load the probe module when OpenClaw agents exist.
  const { probeOpenClawHealth } = await import('@/lib/server/openclaw/health')

  for (const tuple of tuples) {
    let token: string | undefined
    if (tuple.credentialId) {
      const cred = credentials[tuple.credentialId] as unknown as Record<string, unknown> | undefined
      if (cred?.encryptedKey && typeof cred.encryptedKey === 'string') {
        // Undecryptable credential — skip this agent entirely.
        try { token = decryptKey(cred.encryptedKey) } catch { continue }
      }
    }

    const result = await probeOpenClawHealth({
      endpoint: tuple.endpoint || undefined,
      token,
      timeoutMs: 10_000,
    })

    const now = Date.now()

    if (result.ok) {
      // Recovered
      if (ds.openclawDownAgentIds.has(tuple.agentId)) {
        ds.openclawDownAgentIds.delete(tuple.agentId)
        ds.openclawRepairState.delete(tuple.agentId)
        createNotification({
          type: 'success',
          title: 'OpenClaw gateway recovered',
          message: `Gateway for ${tuple.credentialName} is reachable again.`,
          dedupKey: `openclaw-gw-down:${tuple.agentId}`,
        })
      }
      continue
    }

    // Unhealthy
    const repair = ds.openclawRepairState.get(tuple.agentId) || { attempts: 0, lastAttemptAt: 0, cooldownUntil: 0 }

    // In cooldown — skip
    if (repair.cooldownUntil > now) continue

    // Cooldown expired — reset
    if (repair.cooldownUntil > 0 && repair.cooldownUntil <= now) {
      repair.attempts = 0
      repair.cooldownUntil = 0
    }

    ds.openclawDownAgentIds.add(tuple.agentId)

    if (repair.attempts < OPENCLAW_REPAIR_MAX_ATTEMPTS) {
      // Attempt automated repair; failure here is logged but non-fatal.
      try {
        const { runOpenClawDoctor } = await import('@/lib/server/openclaw/doctor')
        await runOpenClawDoctor({ fix: true })
      } catch (err: unknown) {
        log.warn(TAG, '[daemon] openclaw doctor --fix failed:', errorMessage(err))
      }
      repair.attempts += 1
      repair.lastAttemptAt = now
    } else {
      // Budget exhausted — pause repairs for the cooldown window.
      repair.cooldownUntil = now + OPENCLAW_REPAIR_COOLDOWN_MS
    }

    ds.openclawRepairState.set(tuple.agentId, repair)

    createNotification({
      type: 'error',
      title: `OpenClaw gateway unreachable: ${tuple.credentialName}`,
      message: result.error || 'Health check failed',
      dedupKey: `openclaw-gw-down:${tuple.agentId}`,
    })
  }
}
|
|
797
|
-
|
|
798
|
-
/**
 * Prune orphaned entries from module-level Maps/Sets that reference
 * sessions, connectors, or agents that no longer exist in storage.
 * Runs every health-check cycle (2 minutes).
 *
 * @param sessions - The current session store (keyed by session id), used as
 *   the live-session reference set for all session-scoped prunes.
 */
function pruneOrphanedState(sessions: Record<string, unknown>): void {
  const liveSessionIds = new Set(Object.keys(sessions))

  // Main-loop state map (per-session autonomous state)
  pruneMainLoopState(liveSessionIds)

  // Heartbeat service tracking maps
  pruneHeartbeatState(liveSessionIds)

  // System event queues for dead sessions
  pruneSystemEventQueues(liveSessionIds)

  // Subagent lineage/handle registry — remove finished subagent state older than 30 min
  cleanupFinishedSubagents()

  // Process manager — sweep completed processes older than TTL
  sweepManagedProcesses()

  // Reap orphaned sandbox containers from prior crashes
  // (fire-and-forget: failures are logged, never propagated).
  reapOrphanedSandboxContainers().catch((err) => {
    log.warn(TAG, '[daemon] Orphaned sandbox reap failed:', typeof err === 'object' && err !== null && 'message' in err ? (err as Error).message : String(err))
  })

  // Daemon-local: prune openclawRepairState for agents that no longer exist
  // (deleting from a Map/Set while iterating its keys is safe in JS).
  const agents = loadAgents()
  for (const agentId of ds.openclawRepairState.keys()) {
    if (!agents[agentId]) ds.openclawRepairState.delete(agentId)
  }
  for (const agentId of ds.openclawDownAgentIds) {
    if (!agents[agentId]) ds.openclawDownAgentIds.delete(agentId)
  }

  // Orchestrator event queues for dead agents
  const liveAgentIds = new Set(Object.keys(agents))
  pruneOrchestratorEventQueues(liveAgentIds)

  // Orchestrator wake/failure/dailyCycles Maps for deleted agents
  pruneOrchestratorState(liveAgentIds)

  // Connector tracking Maps for deleted connectors
  const connectors = loadConnectors()
  pruneConnectorTrackingState(new Set(Object.keys(connectors)))

  // Prune circuit breaker entries for providers that no longer have any agent referencing them
  // (key format mirrors the cbKey built in runProviderHealthChecks).
  const liveProviderKeys = new Set<string>()
  for (const agent of Object.values(agents) as unknown as Record<string, unknown>[]) {
    if (!agent?.id) continue
    const p = typeof agent.provider === 'string' ? agent.provider : ''
    const c = typeof agent.credentialId === 'string' ? agent.credentialId : ''
    const e = typeof agent.apiEndpoint === 'string' ? agent.apiEndpoint : ''
    if (p) liveProviderKeys.add(`${p}:${c || 'no-cred'}:${e}`)
  }
  for (const key of ds.providerPingCircuitBreaker.keys()) {
    if (!liveProviderKeys.has(key)) ds.providerPingCircuitBreaker.delete(key)
  }
}
|
|
859
|
-
|
|
860
|
-
async function runMemoryMaintenanceTick(): Promise<void> {
|
|
861
|
-
try {
|
|
862
|
-
const memDb = getMemoryDb()
|
|
863
|
-
const result = memDb.maintain({ dedupe: true, pruneWorking: true, ttlHours: 24 })
|
|
864
|
-
if (result.deduped > 0 || result.pruned > 0) {
|
|
865
|
-
log.info(TAG, `[daemon] Memory maintenance: deduped=${result.deduped}, pruned=${result.pruned}`)
|
|
866
|
-
}
|
|
867
|
-
} catch (err: unknown) {
|
|
868
|
-
log.warn(TAG, '[daemon] Memory maintenance tick failed:', err instanceof Error ? err.message : String(err))
|
|
869
|
-
}
|
|
870
|
-
}
|
|
871
|
-
|
|
872
|
-
/**
 * One full daemon health-check pass. Executed on a fixed interval by
 * startHealthMonitor. Each phase is independently wrapped in try/catch so a
 * failure in one never blocks the rest:
 *  - task-queue validation, stalled/stuck run recovery, deferred promotion
 *  - heartbeat staleness detection (alert once per episode; auto-disable
 *    after a longer threshold)
 *  - provider + OpenClaw gateway reachability, integrity drift monitoring
 *  - webhook retry draining and a series of retention/hygiene prunes
 */
async function runHealthChecks() {
  // Continuously keep the completed queue honest.
  validateCompletedTasksQueue()
  recoverStalledRunningTasks()

  // Watchdog: abort runs stuck in running state beyond their timeout threshold.
  try {
    const stuck = sweepStuckRuns()
    if (stuck.aborted > 0) {
      log.info(TAG, `[daemon] Watchdog: aborted ${stuck.aborted} stuck run(s)`)
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Stuck-run watchdog failed:', err instanceof Error ? err.message : String(err))
  }

  // Keep heartbeat state in sync with task terminal states even without daemon restarts.
  cleanupFinishedTaskSessions()

  // Re-queue deferred tasks whose agents have become available again.
  try { promoteDeferred() } catch {}

  const sessions = loadSessions()
  const now = Date.now()
  const currentlyStale = new Set<string>()
  // Sessions mutated in-memory below; persisted in a second pass.
  const dirtySessionIds: string[] = []

  for (const session of Object.values(sessions) as unknown as Record<string, unknown>[]) {
    if (!session?.id || typeof session.id !== 'string') continue
    if (session.heartbeatEnabled !== true) continue

    const sessionId = session.id
    // Suppressed sessions are treated as healthy: drop any stale marker.
    if (shouldSuppressSessionHeartbeatHealthAlert(session as Pick<Session, 'id' | 'name' | 'user' | 'shortcutForAgentId'>)) {
      ds.staleSessionIds.delete(sessionId)
      continue
    }

    const sessionLabel = String(session.name || sessionId)
    const intervalSec = parseHeartbeatIntervalSec(session.heartbeatIntervalSec, DEFAULT_HEARTBEAT_INTERVAL_SEC)
    if (intervalSec <= 0) continue
    const staleAfter = Math.max(intervalSec * STALE_MULTIPLIER * 1000, STALE_MIN_MS)
    const lastActive = typeof session.lastActiveAt === 'number' ? session.lastActiveAt : 0
    if (lastActive <= 0) continue

    const staleForMs = now - lastActive
    if (staleForMs > staleAfter) {
      // Much longer silence → stop the heartbeat entirely rather than nag.
      const autoDisableAfter = Math.max(intervalSec * STALE_AUTO_DISABLE_MULTIPLIER * 1000, STALE_AUTO_DISABLE_MIN_MS)
      if (staleForMs > autoDisableAfter) {
        session.heartbeatEnabled = false
        session.lastActiveAt = now
        dirtySessionIds.push(sessionId)
        ds.staleSessionIds.delete(sessionId)
        await sendHealthAlert({
          text: `Auto-disabled heartbeat for stale session "${sessionLabel}" after ${Math.round(staleForMs / 60_000)}m of inactivity.`,
          dedupKey: buildSessionHeartbeatHealthDedupKey(sessionId, 'auto-disabled'),
          entityType: 'session',
          entityId: sessionId,
        })
        continue
      }

      currentlyStale.add(sessionId)
      // Only alert on transition from healthy → stale (once per stale episode)
      if (!ds.staleSessionIds.has(sessionId)) {
        ds.staleSessionIds.add(sessionId)
        await sendHealthAlert({
          text: `Session "${sessionLabel}" heartbeat appears stale (last active ${(Math.round(staleForMs / 1000))}s ago, interval ${intervalSec}s).`,
          dedupKey: buildSessionHeartbeatHealthDedupKey(sessionId, 'stale'),
          entityType: 'session',
          entityId: sessionId,
        })
      }
    }
  }

  // Clear recovered sessions so they can re-alert if they go stale again later
  for (const id of ds.staleSessionIds) {
    if (!currentlyStale.has(id)) {
      ds.staleSessionIds.delete(id)
    }
  }

  // Persist the sessions whose heartbeat we auto-disabled above.
  for (const sid of dirtySessionIds) {
    const s = sessions[sid]
    if (s) {
      const { upsertSession: upsert } = await import('@/lib/server/storage')
      upsert(sid, s)
    }
  }

  // Provider reachability checks
  try {
    await runProviderHealthChecks()
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Provider health check failed:', errorMessage(err))
  }

  // OpenClaw gateway health checks + auto-repair
  try {
    await runOpenClawGatewayHealthChecks()
  } catch (err: unknown) {
    log.error(TAG, '[daemon] OpenClaw gateway health check failed:', errorMessage(err))
  }

  // Integrity drift monitoring for identity/config/extension files.
  try {
    const integrity = runIntegrityMonitor(loadSettings())
    ds.lastIntegrityCheckAt = integrity.checkedAt
    ds.lastIntegrityDriftCount = integrity.drifts.length
    if (integrity.drifts.length > 0) {
      for (const drift of integrity.drifts) {
        // Prefer a cwd-relative path in the notification when inside the tree.
        const rel = path.relative(process.cwd(), drift.filePath)
        const shortPath = rel && !rel.startsWith('..') ? rel : drift.filePath
        const action = drift.type === 'created'
          ? 'created'
          : drift.type === 'deleted'
            ? 'deleted'
            : 'modified'
        createNotification({
          type: drift.type === 'deleted' ? 'error' : 'warning',
          title: `Integrity drift detected (${drift.kind})`,
          message: `${shortPath} was ${action}.`,
          dedupKey: `integrity:${drift.id}:${drift.nextHash || 'missing'}`,
          entityType: 'session',
          entityId: drift.id,
        })
      }
      await sendHealthAlert(`Integrity monitor detected ${integrity.drifts.length} file drift event(s).`)
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Integrity monitor check failed:', errorMessage(err))
  }

  // Process webhook retry queue
  try {
    await processWebhookRetries()
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Webhook retry processing failed:', errorMessage(err))
  }

  // Periodic memory hygiene: prune orphaned state for deleted sessions/connectors
  try {
    pruneOrphanedState(sessions)
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Memory hygiene sweep failed:', errorMessage(err))
  }

  // Prune old terminal runs and their events to prevent unbounded growth
  try {
    const pruned = pruneOldRuns()
    if (pruned.prunedRuns > 0 || pruned.prunedEvents > 0) {
      log.info(TAG, `[daemon] Pruned ${pruned.prunedRuns} old run(s) and ${pruned.prunedEvents} run event(s)`)
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Run pruning failed:', err instanceof Error ? err.message : String(err))
  }

  // Prune expired runtime locks
  try {
    const locksRemoved = pruneExpiredLocks()
    if (locksRemoved > 0) {
      log.info(TAG, `[daemon] Pruned ${locksRemoved} expired lock(s)`)
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Lock pruning failed:', err instanceof Error ? err.message : String(err))
  }

  // Prune old execution logs (30-day retention)
  try {
    const logsRemoved = clearLogsByAge(30 * 24 * 3600_000)
    if (logsRemoved > 0) {
      log.info(TAG, `[daemon] Pruned ${logsRemoved} old execution log(s)`)
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Execution log pruning failed:', errorMessage(err))
  }

  // Prune old usage records (90-day retention)
  try {
    const usageRemoved = pruneOldUsage(90 * 24 * 3600_000)
    if (usageRemoved > 0) {
      log.info(TAG, `[daemon] Pruned ${usageRemoved} old usage record(s)`)
    }
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Usage pruning failed:', errorMessage(err))
  }

  // Periodic memory database maintenance (dedup + TTL pruning)
  try {
    await runMemoryMaintenanceTick()
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Memory maintenance failed:', err instanceof Error ? err.message : String(err))
  }

  // Drain idle-window callbacks when the system is quiet
  try {
    await drainIdleWindowCallbacks()
  } catch (err: unknown) {
    log.error(TAG, '[daemon] Idle-window drain failed:', err instanceof Error ? err.message : String(err))
  }
}
|
|
1072
|
-
|
|
1073
|
-
function startHealthMonitor() {
|
|
1074
|
-
if (ds.healthIntervalId) return
|
|
1075
|
-
ds.healthIntervalId = setInterval(() => {
|
|
1076
|
-
if (ds.healthCheckRunning || ds.shuttingDown) return
|
|
1077
|
-
ds.healthCheckRunning = true
|
|
1078
|
-
runHealthChecks()
|
|
1079
|
-
.catch((err) => {
|
|
1080
|
-
log.error(TAG, '[daemon] Health monitor tick failed:', err?.message || String(err))
|
|
1081
|
-
})
|
|
1082
|
-
.finally(() => { ds.healthCheckRunning = false })
|
|
1083
|
-
}, HEALTH_CHECK_INTERVAL)
|
|
1084
|
-
}
|
|
1085
|
-
|
|
1086
|
-
function stopHealthMonitor() {
|
|
1087
|
-
if (ds.healthIntervalId) {
|
|
1088
|
-
clearInterval(ds.healthIntervalId)
|
|
1089
|
-
ds.healthIntervalId = null
|
|
1090
|
-
}
|
|
1091
|
-
}
|
|
1092
|
-
|
|
1093
|
-
function syncDaemonBackgroundServices(options?: { runConnectorHealthCheckImmediately?: boolean }) {
|
|
1094
|
-
if (isDaemonBackgroundServicesEnabled()) {
|
|
1095
|
-
startHealthMonitor()
|
|
1096
|
-
startConnectorHealthMonitor({
|
|
1097
|
-
runImmediately: options?.runConnectorHealthCheckImmediately !== false,
|
|
1098
|
-
})
|
|
1099
|
-
startConnectorOutboxWorker()
|
|
1100
|
-
startEvalScheduler()
|
|
1101
|
-
return
|
|
1102
|
-
}
|
|
1103
|
-
stopHealthMonitor()
|
|
1104
|
-
stopConnectorHealthMonitor()
|
|
1105
|
-
stopConnectorOutboxWorker()
|
|
1106
|
-
stopEvalScheduler()
|
|
1107
|
-
}
|
|
1108
|
-
|
|
1109
|
-
function startConnectorHealthMonitor(options?: { runImmediately?: boolean }) {
|
|
1110
|
-
if (ds.connectorHealthIntervalId) return
|
|
1111
|
-
|
|
1112
|
-
const tick = () => {
|
|
1113
|
-
if (ds.connectorHealthCheckRunning || ds.shuttingDown) return
|
|
1114
|
-
ds.connectorHealthCheckRunning = true
|
|
1115
|
-
runConnectorHealthChecks(Date.now())
|
|
1116
|
-
.catch((err) => {
|
|
1117
|
-
log.error(TAG, '[daemon] Connector health tick failed:', errorMessage(err))
|
|
1118
|
-
})
|
|
1119
|
-
.finally(() => { ds.connectorHealthCheckRunning = false })
|
|
1120
|
-
}
|
|
1121
|
-
|
|
1122
|
-
if (options?.runImmediately !== false) tick()
|
|
1123
|
-
ds.connectorHealthIntervalId = setInterval(tick, CONNECTOR_HEALTH_CHECK_INTERVAL)
|
|
1124
|
-
}
|
|
1125
|
-
|
|
1126
|
-
function stopConnectorHealthMonitor() {
|
|
1127
|
-
if (ds.connectorHealthIntervalId) {
|
|
1128
|
-
clearInterval(ds.connectorHealthIntervalId)
|
|
1129
|
-
ds.connectorHealthIntervalId = null
|
|
1130
|
-
}
|
|
1131
|
-
}
|
|
1132
|
-
|
|
1133
|
-
function runConsolidationTick() {
|
|
1134
|
-
import('@/lib/server/memory/memory-consolidation').then(({ runDailyConsolidation, registerConsolidationIdleCallback, registerCompactionIdleCallback }) => {
|
|
1135
|
-
// Wire idle-window callbacks so consolidation and compaction run during quiet periods
|
|
1136
|
-
registerConsolidationIdleCallback()
|
|
1137
|
-
registerCompactionIdleCallback()
|
|
1138
|
-
|
|
1139
|
-
return runDailyConsolidation().then((stats) => {
|
|
1140
|
-
if (stats.digests > 0 || stats.pruned > 0 || stats.deduped > 0) {
|
|
1141
|
-
log.info(TAG, `[daemon] Memory consolidation: ${stats.digests} digest(s), ${stats.pruned} pruned, ${stats.deduped} deduped`)
|
|
1142
|
-
}
|
|
1143
|
-
if (stats.errors.length > 0) {
|
|
1144
|
-
log.warn(TAG, `[daemon] Memory consolidation errors: ${stats.errors.join('; ')}`)
|
|
1145
|
-
}
|
|
1146
|
-
})
|
|
1147
|
-
}).catch((err: unknown) => {
|
|
1148
|
-
log.error(TAG, '[daemon] Memory consolidation failed:', errorMessage(err))
|
|
1149
|
-
})
|
|
1150
|
-
}
|
|
1151
|
-
|
|
1152
|
-
function startMemoryConsolidation() {
|
|
1153
|
-
if (ds.memoryConsolidationTimeoutId || ds.memoryConsolidationIntervalId) return
|
|
1154
|
-
// Deferred first run, then repeat on interval
|
|
1155
|
-
ds.memoryConsolidationTimeoutId = setTimeout(() => {
|
|
1156
|
-
ds.memoryConsolidationTimeoutId = null
|
|
1157
|
-
runConsolidationTick()
|
|
1158
|
-
ds.memoryConsolidationIntervalId = setInterval(runConsolidationTick, MEMORY_CONSOLIDATION_INTERVAL)
|
|
1159
|
-
}, MEMORY_CONSOLIDATION_INITIAL_DELAY)
|
|
1160
|
-
}
|
|
1161
|
-
|
|
1162
|
-
function stopMemoryConsolidation() {
|
|
1163
|
-
if (ds.memoryConsolidationTimeoutId) {
|
|
1164
|
-
clearTimeout(ds.memoryConsolidationTimeoutId)
|
|
1165
|
-
ds.memoryConsolidationTimeoutId = null
|
|
1166
|
-
}
|
|
1167
|
-
if (ds.memoryConsolidationIntervalId) {
|
|
1168
|
-
clearInterval(ds.memoryConsolidationIntervalId)
|
|
1169
|
-
ds.memoryConsolidationIntervalId = null
|
|
1170
|
-
}
|
|
1171
|
-
}
|
|
1172
|
-
|
|
1173
|
-
// --- Eval scheduler ---

// Fallback eval cadence used when the configured cron expression is absent or unparsable.
const EVAL_DEFAULT_INTERVAL_MS = 24 * 3600_000 // 24 hours
async function runEvalSchedulerTick() {
|
|
1178
|
-
try {
|
|
1179
|
-
const settings = loadSettings()
|
|
1180
|
-
if (!settings.autonomyEvalEnabled) return
|
|
1181
|
-
|
|
1182
|
-
const { runEvalSuite } = await import('@/lib/server/eval/runner')
|
|
1183
|
-
const agents = loadAgents()
|
|
1184
|
-
const heartbeatAgentIds = Object.keys(agents).filter(
|
|
1185
|
-
(id) => agents[id].heartbeatEnabled === true,
|
|
1186
|
-
)
|
|
1187
|
-
|
|
1188
|
-
for (const agentId of heartbeatAgentIds) {
|
|
1189
|
-
try {
|
|
1190
|
-
const result = await runEvalSuite(agentId)
|
|
1191
|
-
log.info(TAG,
|
|
1192
|
-
`[daemon:eval] Agent ${agents[agentId].name}: ${result.percentage}% (${result.totalScore}/${result.maxScore})`,
|
|
1193
|
-
)
|
|
1194
|
-
createNotification({
|
|
1195
|
-
title: `Eval: ${agents[agentId].name} scored ${result.percentage}%`,
|
|
1196
|
-
message: `${result.runs.length} scenarios, ${result.totalScore}/${result.maxScore} points`,
|
|
1197
|
-
type: result.percentage >= 60 ? 'info' : 'warning',
|
|
1198
|
-
})
|
|
1199
|
-
} catch (err: unknown) {
|
|
1200
|
-
log.error(TAG, `[daemon:eval] Failed for agent ${agentId}:`, errorMessage(err))
|
|
1201
|
-
}
|
|
1202
|
-
}
|
|
1203
|
-
} catch (err: unknown) {
|
|
1204
|
-
log.error(TAG, '[daemon:eval] Scheduler tick error:', errorMessage(err))
|
|
1205
|
-
}
|
|
1206
|
-
}
|
|
1207
|
-
|
|
1208
|
-
function startEvalScheduler() {
|
|
1209
|
-
if (ds.evalSchedulerIntervalId) return
|
|
1210
|
-
try {
|
|
1211
|
-
const settings = loadSettings()
|
|
1212
|
-
if (!settings.autonomyEvalEnabled) return
|
|
1213
|
-
const intervalMs = parseCronToMs(settings.autonomyEvalCron, EVAL_DEFAULT_INTERVAL_MS) || EVAL_DEFAULT_INTERVAL_MS
|
|
1214
|
-
ds.evalSchedulerIntervalId = setInterval(runEvalSchedulerTick, intervalMs)
|
|
1215
|
-
log.info(TAG, `[daemon:eval] Eval scheduler started (interval=${Math.round(intervalMs / 3600_000)}h)`)
|
|
1216
|
-
} catch {
|
|
1217
|
-
// Eval scheduling is optional — don't block daemon start
|
|
1218
|
-
}
|
|
1219
|
-
}
|
|
1220
|
-
|
|
1221
|
-
function stopEvalScheduler() {
|
|
1222
|
-
if (ds.evalSchedulerIntervalId) {
|
|
1223
|
-
clearInterval(ds.evalSchedulerIntervalId)
|
|
1224
|
-
ds.evalSchedulerIntervalId = null
|
|
1225
|
-
}
|
|
1226
|
-
}
|
|
1227
|
-
|
|
1228
|
-
// How often (ms) the daemon sweeps for timed-out swarm runs.
const SWARM_TIMEOUT_CHECK_INTERVAL = 30_000
function startSwarmTimeoutChecker() {
|
|
1231
|
-
if (ds.swarmTimeoutIntervalId) return
|
|
1232
|
-
ds.swarmTimeoutIntervalId = setInterval(() => {
|
|
1233
|
-
if (!ds.running || ds.shuttingDown) return
|
|
1234
|
-
try {
|
|
1235
|
-
checkSwarmTimeouts()
|
|
1236
|
-
} catch (err: unknown) {
|
|
1237
|
-
log.error(TAG, `[daemon] Swarm timeout check error: ${errorMessage(err)}`)
|
|
1238
|
-
}
|
|
1239
|
-
}, SWARM_TIMEOUT_CHECK_INTERVAL)
|
|
1240
|
-
}
|
|
1241
|
-
|
|
1242
|
-
function stopSwarmTimeoutChecker() {
|
|
1243
|
-
if (ds.swarmTimeoutIntervalId) {
|
|
1244
|
-
clearInterval(ds.swarmTimeoutIntervalId)
|
|
1245
|
-
ds.swarmTimeoutIntervalId = null
|
|
1246
|
-
}
|
|
1247
|
-
}
|
|
1248
|
-
|
|
1249
|
-
/**
 * Re-arm long-lived daemon timers after a module reload (dev/HMR) so interval
 * callbacks run the current module's code rather than a stale closure.
 * Only acts while the daemon is running.
 */
function refreshDaemonTimersForHotReload() {
  if (!ds.running) return

  // Queue processor: clear, then immediately restart with the fresh closure.
  if (ds.queueIntervalId) {
    clearInterval(ds.queueIntervalId)
    ds.queueIntervalId = null
    startQueueProcessor()
  }

  // Browser sweep: same clear-and-restart pattern.
  if (ds.browserSweepId) {
    clearInterval(ds.browserSweepId)
    ds.browserSweepId = null
    startBrowserSweep()
  }

  // Health monitor: cleared but deliberately NOT restarted here — presumably
  // syncDaemonBackgroundServices() below recreates it. TODO confirm; if it
  // doesn't, an HMR cycle would silently disable health checks.
  if (ds.healthIntervalId) {
    clearInterval(ds.healthIntervalId)
    ds.healthIntervalId = null
  }

  // Connector health monitor: cleared only — same assumption as above.
  if (ds.connectorHealthIntervalId) {
    clearInterval(ds.connectorHealthIntervalId)
    ds.connectorHealthIntervalId = null
  }

  // Memory consolidation: full stop/start cycle (handles both the pending
  // first-run timeout and the repeating interval).
  if (ds.memoryConsolidationTimeoutId || ds.memoryConsolidationIntervalId) {
    stopMemoryConsolidation()
    startMemoryConsolidation()
  }

  // Eval scheduler: stopped without an explicit restart — NOTE(review):
  // looks intentional (restart is settings-gated), but verify that
  // syncDaemonBackgroundServices() re-enables it when configured.
  if (ds.evalSchedulerIntervalId) {
    stopEvalScheduler()
  }

  // Swarm timeout checker: full stop/start cycle.
  if (ds.swarmTimeoutIntervalId) {
    stopSwarmTimeoutChecker()
    startSwarmTimeoutChecker()
  }

  syncDaemonBackgroundServices()
}
// In dev/HMR, the daemon state survives on globalThis while interval callbacks keep
// the old module closure alive. Refresh long-lived timers so they always run the
// current module's logic instead of stale health-alert code paths.
// NOTE: executes at module evaluation time — importing this module has this side effect.
refreshDaemonTimersForHotReload()
export async function runDaemonHealthCheckNow() {
|
|
1297
|
-
// Bypass circuit breaker for manual/forced checks
|
|
1298
|
-
ds.providerPingCircuitBreaker.clear()
|
|
1299
|
-
await Promise.all([
|
|
1300
|
-
runHealthChecks(),
|
|
1301
|
-
runConnectorHealthChecks(Date.now()),
|
|
1302
|
-
])
|
|
1303
|
-
}
|
|
1304
|
-
|
|
1305
|
-
export async function runConnectorHealthCheckNowForTest(now = Date.now()) {
|
|
1306
|
-
await runConnectorHealthChecks(now)
|
|
1307
|
-
}
|
|
1308
|
-
|
|
1309
|
-
/**
 * Build a full daemon status snapshot for the API/UI: run state, e-stop,
 * queue depth, next scheduled task, heartbeat service status, health-monitor
 * and integrity state, webhook retry stats, and re-entrancy guard flags.
 * Reads persisted state via the load* helpers plus in-memory daemon state (ds).
 */
export function getDaemonStatus() {
  const estop = loadEstopState()
  const queue = loadQueue()
  const schedules = loadSchedules()
  const reconnectStates = Object.values(getAllReconnectStates())

  // Find next scheduled task
  // NOTE(review): the `as unknown as Record<string, unknown>[]` cast bypasses
  // the schedule type — presumably to tolerate loosely-typed persisted data;
  // confirm the schedule store's actual shape before tightening.
  let nextScheduled: number | null = null
  for (const s of Object.values(schedules) as unknown as Record<string, unknown>[]) {
    if (s.status === 'active' && s.nextRunAt) {
      // Keep the earliest upcoming nextRunAt across all active schedules.
      if (!nextScheduled || (s.nextRunAt as number) < nextScheduled) {
        nextScheduled = s.nextRunAt as number
      }
    }
  }

  // Webhook retry queue stats
  const retryQueue = loadWebhookRetryQueue()
  const retryEntries = Object.values(retryQueue) as WebhookRetryEntry[]
  const pendingRetries = retryEntries.filter(e => !e.deadLettered).length
  const deadLettered = retryEntries.filter(e => e.deadLettered).length

  return {
    running: ds.running,
    // schedulerActive mirrors running — the scheduler has no independent flag here.
    schedulerActive: ds.running,
    autostartEnabled: daemonAutostartEnvEnabled(),
    backgroundServicesEnabled: isDaemonBackgroundServicesEnabled(),
    reducedMode: !isDaemonBackgroundServicesEnabled(),
    manualStopRequested: ds.manualStopRequested,
    estop,
    queueLength: queue.length,
    lastProcessed: ds.lastProcessedAt,
    nextScheduled,
    heartbeat: getHeartbeatServiceStatus(),
    health: {
      // Monitor "active" means the corresponding interval timer is set.
      monitorActive: !!ds.healthIntervalId,
      connectorMonitorActive: !!ds.connectorHealthIntervalId,
      staleSessions: ds.staleSessionIds.size,
      connectorsInBackoff: reconnectStates.filter((state) => !state.exhausted).length,
      connectorsExhausted: reconnectStates.filter((state) => state.exhausted).length,
      checkIntervalSec: Math.trunc(HEALTH_CHECK_INTERVAL / 1000),
      connectorCheckIntervalSec: Math.trunc(CONNECTOR_HEALTH_CHECK_INTERVAL / 1000),
      integrity: {
        // Integrity monitoring defaults to enabled unless explicitly false.
        enabled: loadSettings().integrityMonitorEnabled !== false,
        lastCheckedAt: ds.lastIntegrityCheckAt,
        lastDriftCount: ds.lastIntegrityDriftCount,
      },
    },
    webhookRetry: {
      pendingRetries,
      deadLettered,
    },
    guards: {
      healthCheckRunning: ds.healthCheckRunning,
      connectorHealthCheckRunning: ds.connectorHealthCheckRunning,
      shuttingDown: ds.shuttingDown,
      providerCircuitBreakers: ds.providerPingCircuitBreaker.size,
    },
  }
}
/**
 * Lightweight health summary safe for external consumption.
 * Reads cached state only — no probes or side effects.
 *
 * @returns overall `ok` flag plus per-component counts (daemon, connectors,
 *   providers, gateways), e-stop state, and the next scheduled task time.
 */
export function getDaemonHealthSummary(): {
  ok: boolean
  uptime: number
  components: {
    daemon: { status: 'healthy' | 'stopped' | 'degraded' }
    connectors: { healthy: number; errored: number; total: number }
    providers: { healthy: number; cooldown: number; total: number }
    gateways: { healthy: number; degraded: number; total: number }
  }
  estop: boolean
  nextScheduledTask: number | null
} {
  const estopState = loadEstopState()
  // Any e-stop level other than 'none' counts as active.
  const estopActive = estopState.level !== 'none'

  // Daemon status
  const daemonStatus: 'healthy' | 'stopped' | 'degraded' = !ds.running
    ? 'stopped'
    : estopActive ? 'degraded' : 'healthy'

  // Connector summary: an enabled connector is healthy iff its status is 'running'.
  const connectors = loadConnectors()
  const connectorEntries = Object.values(connectors) as unknown as Record<string, unknown>[]
  const enabledConnectors = connectorEntries.filter(c => c?.isEnabled === true)
  let healthyConnectors = 0
  let erroredConnectors = 0
  for (const c of enabledConnectors) {
    if (typeof c.id === 'string' && getConnectorStatus(c.id) === 'running') {
      healthyConnectors++
    } else {
      erroredConnectors++
    }
  }

  // Provider summary (based on circuit breaker state)
  const agents = loadAgents()
  const agentEntries = Object.values(agents) as unknown as Record<string, unknown>[]
  // Each distinct provider+credential+endpoint combination counts once;
  // CLI-backed providers are excluded (no ping circuit breaker applies).
  const providerKeys = new Set<string>()
  for (const agent of agentEntries) {
    if (!agent?.id || typeof agent.id !== 'string') continue
    const provider = typeof agent.provider === 'string' ? agent.provider : ''
    if (!provider || ['claude-cli', 'codex-cli', 'opencode-cli'].includes(provider)) continue
    const credentialId = typeof agent.credentialId === 'string' ? agent.credentialId : ''
    const apiEndpoint = typeof agent.apiEndpoint === 'string' ? agent.apiEndpoint : ''
    providerKeys.add(`${provider}:${credentialId || 'no-cred'}:${apiEndpoint}`)
  }
  const now = Date.now()
  let cooldownProviders = 0
  for (const key of providerKeys) {
    const cb = ds.providerPingCircuitBreaker.get(key)
    // A provider is "in cooldown" while its breaker's skipUntil is in the future.
    if (cb && cb.skipUntil > now) cooldownProviders++
  }

  // Gateway summary (OpenClaw gateways)
  // NOTE(review): total = down-set size + live openclaw agents not in the down
  // set. A down id whose agent was deleted still inflates the total — presumably
  // acceptable for a coarse summary; confirm if exact counts matter.
  const totalGateways = ds.openclawDownAgentIds.size
    + agentEntries.filter(a => a?.provider === 'openclaw' && !ds.openclawDownAgentIds.has(a.id as string)).length
  const degradedGateways = ds.openclawDownAgentIds.size

  // Next scheduled task: earliest nextRunAt across active schedules.
  const schedules = loadSchedules()
  let nextScheduled: number | null = null
  for (const s of Object.values(schedules) as unknown as Record<string, unknown>[]) {
    if (s.status === 'active' && s.nextRunAt) {
      if (!nextScheduled || (s.nextRunAt as number) < nextScheduled) {
        nextScheduled = s.nextRunAt as number
      }
    }
  }

  // ok requires: daemon running, no e-stop, and not every known provider in cooldown.
  const allProvidersDown = providerKeys.size > 0 && cooldownProviders >= providerKeys.size
  const ok = ds.running && !estopActive && !allProvidersDown

  return {
    ok,
    uptime: Math.trunc(process.uptime()),
    components: {
      daemon: { status: daemonStatus },
      connectors: {
        healthy: healthyConnectors,
        errored: erroredConnectors,
        total: enabledConnectors.length,
      },
      providers: {
        healthy: providerKeys.size - cooldownProviders,
        cooldown: cooldownProviders,
        total: providerKeys.size,
      },
      gateways: {
        healthy: totalGateways - degradedGateways,
        degraded: degradedGateways,
        total: totalGateways,
      },
    },
    estop: estopActive,
    nextScheduledTask: nextScheduled,
  }
}
// Barrel re-exports: the daemon-state implementation lives in these submodules;
// this module re-exports everything so existing import paths keep working.
export * from './daemon-state/policy'
export * from './daemon-state/supervisor'
export * from './daemon-state/health'