@swarmclawai/swarmclaw 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144) hide show
  1. package/README.md +9 -0
  2. package/package.json +2 -2
  3. package/skills/coding-agent/SKILL.md +111 -0
  4. package/skills/github/SKILL.md +140 -0
  5. package/skills/nano-banana-pro/SKILL.md +62 -0
  6. package/skills/nano-banana-pro/scripts/generate_image.py +235 -0
  7. package/skills/nano-pdf/SKILL.md +53 -0
  8. package/skills/openai-image-gen/SKILL.md +78 -0
  9. package/skills/openai-image-gen/scripts/gen.py +328 -0
  10. package/skills/resourceful-problem-solving/SKILL.md +49 -0
  11. package/skills/skill-creator/SKILL.md +147 -0
  12. package/skills/skill-creator/scripts/init_skill.py +378 -0
  13. package/skills/skill-creator/scripts/quick_validate.py +159 -0
  14. package/skills/summarize/SKILL.md +77 -0
  15. package/src/app/api/auth/route.ts +20 -5
  16. package/src/app/api/chats/[id]/devserver/route.ts +13 -19
  17. package/src/app/api/chats/[id]/messages/route.ts +13 -15
  18. package/src/app/api/chats/[id]/route.ts +9 -10
  19. package/src/app/api/chats/[id]/stop/route.ts +5 -7
  20. package/src/app/api/chats/messages-route.test.ts +8 -6
  21. package/src/app/api/chats/route.ts +9 -10
  22. package/src/app/api/ip/route.ts +2 -2
  23. package/src/app/api/preview-server/route.ts +1 -1
  24. package/src/app/api/projects/[id]/route.ts +7 -46
  25. package/src/components/chat/chat-area.tsx +45 -23
  26. package/src/components/chat/message-bubble.test.ts +35 -0
  27. package/src/components/chat/message-bubble.tsx +19 -9
  28. package/src/components/chat/message-list.tsx +37 -3
  29. package/src/components/input/chat-input.tsx +34 -14
  30. package/src/instrumentation.ts +1 -1
  31. package/src/lib/chat/assistant-render-id.ts +3 -0
  32. package/src/lib/chat/chat-streaming-state.test.ts +42 -3
  33. package/src/lib/chat/chat-streaming-state.ts +20 -8
  34. package/src/lib/chat/queued-message-queue.test.ts +23 -1
  35. package/src/lib/chat/queued-message-queue.ts +11 -2
  36. package/src/lib/providers/cli-utils.test.ts +124 -0
  37. package/src/lib/server/activity/activity-log.ts +21 -0
  38. package/src/lib/server/agents/agent-availability.test.ts +10 -5
  39. package/src/lib/server/agents/agent-cascade.ts +79 -59
  40. package/src/lib/server/agents/agent-registry.ts +3 -1
  41. package/src/lib/server/agents/agent-repository.ts +90 -0
  42. package/src/lib/server/agents/delegation-job-repository.ts +53 -0
  43. package/src/lib/server/agents/delegation-jobs.ts +11 -4
  44. package/src/lib/server/agents/guardian-checkpoint-repository.ts +35 -0
  45. package/src/lib/server/agents/guardian.ts +2 -2
  46. package/src/lib/server/agents/main-agent-loop.ts +10 -3
  47. package/src/lib/server/agents/main-loop-state-repository.ts +38 -0
  48. package/src/lib/server/agents/subagent-runtime.ts +9 -6
  49. package/src/lib/server/agents/subagent-swarm.ts +3 -2
  50. package/src/lib/server/agents/task-session.ts +3 -4
  51. package/src/lib/server/approvals/approval-repository.ts +30 -0
  52. package/src/lib/server/autonomy/supervisor-incident-repository.ts +42 -0
  53. package/src/lib/server/chat-execution/chat-execution-types.ts +38 -0
  54. package/src/lib/server/chat-execution/chat-execution-utils.ts +1 -1
  55. package/src/lib/server/chat-execution/chat-execution.ts +84 -1926
  56. package/src/lib/server/chat-execution/chat-turn-finalization.ts +620 -0
  57. package/src/lib/server/chat-execution/chat-turn-partial-persistence.ts +221 -0
  58. package/src/lib/server/chat-execution/chat-turn-preflight.ts +133 -0
  59. package/src/lib/server/chat-execution/chat-turn-preparation.ts +817 -0
  60. package/src/lib/server/chat-execution/chat-turn-stream-execution.ts +296 -0
  61. package/src/lib/server/chat-execution/chat-turn-tool-routing.ts +5 -5
  62. package/src/lib/server/chat-execution/message-classifier.test.ts +329 -0
  63. package/src/lib/server/chat-execution/post-stream-finalization.ts +1 -1
  64. package/src/lib/server/chat-execution/prompt-builder.ts +11 -0
  65. package/src/lib/server/chat-execution/prompt-sections.ts +5 -6
  66. package/src/lib/server/chat-execution/situational-awareness.ts +12 -7
  67. package/src/lib/server/chat-execution/stream-agent-chat.ts +16 -13
  68. package/src/lib/server/chatrooms/chatroom-repository.ts +32 -0
  69. package/src/lib/server/connectors/connector-repository.ts +58 -0
  70. package/src/lib/server/connectors/runtime-state.test.ts +117 -0
  71. package/src/lib/server/credentials/credential-repository.ts +7 -0
  72. package/src/lib/server/gateways/gateway-profile-repository.ts +4 -0
  73. package/src/lib/server/memory/memory-abstract.test.ts +59 -0
  74. package/src/lib/server/missions/mission-repository.ts +74 -0
  75. package/src/lib/server/missions/mission-service/actions.ts +6 -0
  76. package/src/lib/server/missions/mission-service/bindings.ts +9 -0
  77. package/src/lib/server/missions/mission-service/context.ts +4 -0
  78. package/src/lib/server/missions/mission-service/core.ts +2269 -0
  79. package/src/lib/server/missions/mission-service/queries.ts +12 -0
  80. package/src/lib/server/missions/mission-service/recovery.ts +5 -0
  81. package/src/lib/server/missions/mission-service/ticks.ts +9 -0
  82. package/src/lib/server/missions/mission-service.test.ts +9 -2
  83. package/src/lib/server/missions/mission-service.ts +6 -2266
  84. package/src/lib/server/persistence/repository-utils.ts +154 -0
  85. package/src/lib/server/persistence/storage-context.ts +51 -0
  86. package/src/lib/server/persistence/transaction.ts +1 -0
  87. package/src/lib/server/projects/project-repository.ts +36 -0
  88. package/src/lib/server/projects/project-service.ts +79 -0
  89. package/src/lib/server/protocols/protocol-normalization.test.ts +6 -4
  90. package/src/lib/server/runtime/alert-dispatch.ts +1 -1
  91. package/src/lib/server/runtime/daemon-policy.ts +1 -1
  92. package/src/lib/server/runtime/daemon-state/core.ts +1570 -0
  93. package/src/lib/server/runtime/daemon-state/health.ts +6 -0
  94. package/src/lib/server/runtime/daemon-state/policy.ts +7 -0
  95. package/src/lib/server/runtime/daemon-state/supervisor.ts +6 -0
  96. package/src/lib/server/runtime/daemon-state.test.ts +48 -0
  97. package/src/lib/server/runtime/daemon-state.ts +3 -1470
  98. package/src/lib/server/runtime/estop-repository.ts +4 -0
  99. package/src/lib/server/runtime/estop.ts +3 -1
  100. package/src/lib/server/runtime/heartbeat-service.test.ts +2 -2
  101. package/src/lib/server/runtime/heartbeat-service.ts +55 -34
  102. package/src/lib/server/runtime/heartbeat-wake.ts +6 -4
  103. package/src/lib/server/runtime/idle-window.ts +2 -2
  104. package/src/lib/server/runtime/network.ts +11 -0
  105. package/src/lib/server/runtime/orchestrator-events.ts +2 -2
  106. package/src/lib/server/runtime/queue/claims.ts +4 -0
  107. package/src/lib/server/runtime/queue/core.ts +2079 -0
  108. package/src/lib/server/runtime/queue/execution.ts +7 -0
  109. package/src/lib/server/runtime/queue/followups.ts +4 -0
  110. package/src/lib/server/runtime/queue/queries.ts +12 -0
  111. package/src/lib/server/runtime/queue/recovery.ts +7 -0
  112. package/src/lib/server/runtime/queue-recovery.test.ts +48 -13
  113. package/src/lib/server/runtime/queue-repository.ts +17 -0
  114. package/src/lib/server/runtime/queue.ts +5 -2061
  115. package/src/lib/server/runtime/run-ledger.ts +6 -5
  116. package/src/lib/server/runtime/run-repository.ts +73 -0
  117. package/src/lib/server/runtime/runtime-lock-repository.ts +8 -0
  118. package/src/lib/server/runtime/runtime-settings.ts +1 -1
  119. package/src/lib/server/runtime/runtime-state.ts +99 -0
  120. package/src/lib/server/runtime/scheduler.ts +4 -2
  121. package/src/lib/server/runtime/session-run-manager/cancellation.ts +157 -0
  122. package/src/lib/server/runtime/session-run-manager/drain.ts +246 -0
  123. package/src/lib/server/runtime/session-run-manager/enqueue.ts +287 -0
  124. package/src/lib/server/runtime/session-run-manager/queries.ts +117 -0
  125. package/src/lib/server/runtime/session-run-manager/recovery.ts +238 -0
  126. package/src/lib/server/runtime/session-run-manager/state.ts +441 -0
  127. package/src/lib/server/runtime/session-run-manager/types.ts +74 -0
  128. package/src/lib/server/runtime/session-run-manager.ts +72 -1377
  129. package/src/lib/server/runtime/watch-job-repository.ts +35 -0
  130. package/src/lib/server/runtime/watch-jobs.ts +3 -1
  131. package/src/lib/server/schedules/schedule-repository.ts +42 -0
  132. package/src/lib/server/sessions/session-repository.ts +85 -0
  133. package/src/lib/server/settings/settings-repository.ts +25 -0
  134. package/src/lib/server/skills/skill-discovery.test.ts +2 -2
  135. package/src/lib/server/skills/skill-discovery.ts +2 -2
  136. package/src/lib/server/skills/skill-repository.ts +14 -0
  137. package/src/lib/server/storage.ts +13 -24
  138. package/src/lib/server/tasks/task-repository.ts +54 -0
  139. package/src/lib/server/usage/usage-repository.ts +30 -0
  140. package/src/lib/server/webhooks/webhook-repository.ts +10 -0
  141. package/src/lib/strip-internal-metadata.test.ts +42 -41
  142. package/src/stores/use-chat-store.test.ts +54 -0
  143. package/src/stores/use-chat-store.ts +21 -5
  144. /package/{bundled-skills → skills}/google-workspace/SKILL.md +0 -0
@@ -1,533 +1,52 @@
1
- import { genId } from '@/lib/id'
2
- import type {
3
- RunEventRecord,
4
- SessionRunHeartbeatConfig,
5
- SessionQueueSnapshot,
6
- SessionQueuedTurn,
7
- SessionRunRecord,
8
- SessionRunStatus,
9
- SSEEvent,
10
- } from '@/types'
11
1
  import {
12
- active,
13
- isRuntimeLockActive,
14
- loadSession,
15
- releaseRuntimeLock,
16
- tryAcquireRuntimeLock,
17
- } from '@/lib/server/storage'
18
- import { executeSessionChatTurn, type ExecuteChatTurnResult } from '@/lib/server/chat-execution/chat-execution'
19
- import { loadRuntimeSettings } from '@/lib/server/runtime/runtime-settings'
20
- import { log } from '@/lib/server/logger'
21
- import { isInternalHeartbeatRun } from '@/lib/server/runtime/heartbeat-source'
22
- import { cleanupSessionBrowser } from '@/lib/server/session-tools/web'
23
- import { cancelDelegationJobsForParentSession } from '@/lib/server/agents/delegation-jobs'
24
- import { getMainLoopStateForSession, handleMainLoopRunResult } from '@/lib/server/agents/main-agent-loop'
25
- import { observeAutonomyRunOutcome } from '@/lib/server/autonomy/supervisor-reflection'
26
- import { observeLearnedSkillRunOutcome } from '@/lib/server/skills/learned-skills'
27
- import { errorMessage, hmrSingleton } from '@/lib/shared-utils'
28
- import { getEnabledToolIds } from '@/lib/capability-selection'
2
+ acquireExternalSessionExecutionHold as acquireExternalSessionExecutionHoldInternal,
3
+ hasActiveNonHeartbeatSessionLease,
4
+ resetSessionRunManagerStateForTests,
5
+ } from '@/lib/server/runtime/session-run-manager/state'
29
6
  import {
30
- appendPersistedRunEvent,
31
- isRestartRecoverableSource,
32
- listPersistedRunEvents,
33
- listPersistedRuns,
34
- loadPersistedRun,
35
- loadRecoverableStaleRuns,
36
- patchPersistedRun,
37
- persistRun,
38
- } from '@/lib/server/runtime/run-ledger'
39
- import { isAllEstopEngaged, isAutonomyEstopEngaged } from '@/lib/server/runtime/estop'
40
- import { notify } from '@/lib/server/ws-hub'
41
-
42
- export type SessionQueueMode = 'followup' | 'steer' | 'collect'
43
-
44
- interface QueueEntry {
45
- executionKey: string
46
- run: SessionRunRecord
47
- message: string
48
- imagePath?: string
49
- imageUrl?: string
50
- attachedFiles?: string[]
51
- onEvents: Array<(event: SSEEvent) => void>
52
- signalController: AbortController
53
- maxRuntimeMs?: number
54
- modelOverride?: string
55
- heartbeatConfig?: SessionRunHeartbeatConfig
56
- replyToId?: string
57
- resolve: (value: ExecuteChatTurnResult) => void
58
- reject: (error: Error) => void
59
- promise: Promise<ExecuteChatTurnResult>
60
- /** Whether this entry has been counted in nonHeartbeatWorkCount (prevents double-decrement). */
61
- nonHeartbeatCounted?: boolean
62
- }
63
-
64
- interface RuntimeState {
65
- runningByExecution: Map<string, QueueEntry>
66
- queueByExecution: Map<string, QueueEntry[]>
67
- runs: Map<string, SessionRunRecord>
68
- recentRunIds: string[]
69
- promises: Map<string, Promise<ExecuteChatTurnResult>>
70
- deferredDrainTimers: Map<string, ReturnType<typeof setTimeout>>
71
- activityLeaseRenewTimers: Map<string, ReturnType<typeof setInterval>>
72
- externalSessionHolds: Map<string, number>
73
- externalHoldTimers: Map<string, ReturnType<typeof setTimeout>>
74
- drainDepth: Map<string, number>
75
- lastQueuedAt: number
76
- nonHeartbeatWorkCount: Map<string, number>
77
- }
78
-
79
- const MAX_RECENT_RUNS = 500
80
- const COLLECT_COALESCE_WINDOW_MS = 1500
81
- const SHARED_ACTIVITY_LEASE_TTL_MS = 15_000
82
- const SHARED_ACTIVITY_LEASE_RENEW_MS = 5_000
83
- const EXTERNAL_HOLD_TTL_MS = 60_000
84
- const MAX_DRAIN_DEPTH = 25
85
- const HEARTBEAT_BUSY_RETRY_MS = 1_000
86
- const STALE_QUEUED_RUN_MS = 15_000
87
- const SHARED_ACTIVITY_LEASE_OWNER = `session-run:${process.pid}:${genId(6)}`
88
- const state: RuntimeState = hmrSingleton<RuntimeState>('__swarmclaw_session_run_manager__', () => ({
89
- runningByExecution: new Map<string, QueueEntry>(),
90
- queueByExecution: new Map<string, QueueEntry[]>(),
91
- runs: new Map<string, SessionRunRecord>(),
92
- recentRunIds: [],
93
- promises: new Map<string, Promise<ExecuteChatTurnResult>>(),
94
- deferredDrainTimers: new Map<string, ReturnType<typeof setTimeout>>(),
95
- activityLeaseRenewTimers: new Map<string, ReturnType<typeof setInterval>>(),
96
- externalSessionHolds: new Map<string, number>(),
97
- externalHoldTimers: new Map<string, ReturnType<typeof setTimeout>>(),
98
- drainDepth: new Map<string, number>(),
99
- lastQueuedAt: 0,
100
- nonHeartbeatWorkCount: new Map<string, number>(),
101
- }))
102
- const recoveryState = hmrSingleton('__swarmclaw_session_run_recovery__', () => ({ completed: false }))
103
-
104
- // Backfill fields for hot-reloaded state objects created by older code versions.
105
- if (!state.runningByExecution) state.runningByExecution = new Map<string, QueueEntry>()
106
- if (!state.queueByExecution) state.queueByExecution = new Map<string, QueueEntry[]>()
107
- if (!state.runs) state.runs = new Map<string, SessionRunRecord>()
108
- if (!state.recentRunIds) state.recentRunIds = []
109
- if (!state.promises) state.promises = new Map<string, Promise<ExecuteChatTurnResult>>()
110
- if (!state.deferredDrainTimers) state.deferredDrainTimers = new Map<string, ReturnType<typeof setTimeout>>()
111
- if (!state.activityLeaseRenewTimers) state.activityLeaseRenewTimers = new Map<string, ReturnType<typeof setInterval>>()
112
- if (!state.externalSessionHolds) state.externalSessionHolds = new Map<string, number>()
113
- if (!state.externalHoldTimers) state.externalHoldTimers = new Map<string, ReturnType<typeof setTimeout>>()
114
- if (!state.drainDepth) state.drainDepth = new Map<string, number>()
115
- if (typeof state.lastQueuedAt !== 'number') state.lastQueuedAt = 0
116
- if (!state.nonHeartbeatWorkCount) state.nonHeartbeatWorkCount = new Map<string, number>()
117
-
118
- function now() {
119
- return Date.now()
120
- }
121
-
122
- function nextQueuedAt() {
123
- const current = now()
124
- const next = current <= state.lastQueuedAt ? state.lastQueuedAt + 1 : current
125
- state.lastQueuedAt = next
126
- return next
127
- }
128
-
129
- function messagePreview(text: string): string {
130
- return (text || '').replace(/\s+/g, ' ').trim().slice(0, 140)
131
- }
132
-
133
- function trimRecentRuns() {
134
- while (state.recentRunIds.length > MAX_RECENT_RUNS) {
135
- const id = state.recentRunIds.shift()
136
- if (!id) continue
137
- state.runs.delete(id)
138
- state.promises.delete(id)
139
- }
140
- }
141
-
142
- function syncRunRecord(run: SessionRunRecord): SessionRunRecord {
143
- state.runs.set(run.id, run)
144
- persistRun(run)
145
- return run
146
- }
147
-
148
- function registerRun(run: SessionRunRecord) {
149
- syncRunRecord(run)
150
- state.recentRunIds.push(run.id)
151
- trimRecentRuns()
152
- }
153
-
154
- function shouldPersistRunEvent(event: SSEEvent): boolean {
155
- return event.t !== 'd' && event.t !== 'thinking' && event.t !== 'reset'
156
- }
157
-
158
- function persistEventForRun(entry: QueueEntry, event: SSEEvent, opts?: {
159
- phase?: RunEventRecord['phase']
160
- status?: SessionRunStatus
161
- summary?: string
162
- }): void {
163
- if (!shouldPersistRunEvent(event)) return
164
- appendPersistedRunEvent({
165
- runId: entry.run.id,
166
- sessionId: entry.run.sessionId,
167
- phase: opts?.phase || 'event',
168
- status: opts?.status,
169
- summary: opts?.summary,
170
- event,
171
- })
172
- }
173
-
174
- /** Chain an external AbortSignal to an internal AbortController so that
175
- * when the caller (e.g. HTTP request) disconnects, the run is cancelled. */
176
- function chainCallerSignal(callerSignal: AbortSignal, controller: AbortController): void {
177
- if (callerSignal.aborted) {
178
- controller.abort()
179
- return
180
- }
181
- const onAbort = () => controller.abort()
182
- callerSignal.addEventListener('abort', onAbort, { once: true })
183
- }
184
-
185
- function emitToSubscribers(entry: QueueEntry, event: SSEEvent) {
186
- persistEventForRun(entry, event)
187
- for (const send of entry.onEvents) {
188
- try {
189
- send(event)
190
- } catch {
191
- // Subscriber stream can be closed by the client.
192
- }
193
- }
194
- }
195
-
196
- function emitRunMeta(entry: QueueEntry, status: SessionRunStatus, extra?: Record<string, unknown>) {
197
- const event: SSEEvent = {
198
- t: 'md',
199
- text: JSON.stringify({
200
- run: {
201
- id: entry.run.id,
202
- sessionId: entry.run.sessionId,
203
- status,
204
- source: entry.run.source,
205
- internal: entry.run.internal,
206
- ...extra,
207
- },
208
- }),
209
- }
210
- persistEventForRun(entry, event, { phase: 'status', status })
211
- for (const send of entry.onEvents) {
212
- try {
213
- send(event)
214
- } catch {
215
- // Subscriber stream can be closed by the client.
216
- }
217
- }
218
- notifySessionRunState(entry.run.sessionId)
219
- }
220
-
221
- function notifySessionRunState(sessionId: string): void {
222
- notify('runs')
223
- notify('sessions')
224
- notify(`session:${sessionId}`)
225
- }
226
-
227
- function queueAutonomyObservation(input: {
228
- runId: string
229
- sessionId: string
230
- source: string
231
- status: SessionRunStatus
232
- resultText?: string | null
233
- error?: string | null
234
- toolEvents?: ExecuteChatTurnResult['toolEvents']
235
- sourceMessage?: string | null
236
- }) {
237
- const session = loadSession(input.sessionId)
238
- void observeAutonomyRunOutcome({
239
- runId: input.runId,
240
- sessionId: input.sessionId,
241
- agentId: session?.agentId || null,
242
- source: input.source,
243
- status: input.status,
244
- resultText: input.resultText,
245
- error: input.error || undefined,
246
- toolEvents: input.toolEvents,
247
- mainLoopState: getMainLoopStateForSession(input.sessionId),
248
- sourceMessage: input.sourceMessage,
249
- }).then(({ reflection }) => observeLearnedSkillRunOutcome({
250
- runId: input.runId,
251
- sessionId: input.sessionId,
252
- agentId: session?.agentId || null,
253
- source: input.source,
254
- status: input.status,
255
- resultText: input.resultText,
256
- error: input.error || undefined,
257
- toolEvents: input.toolEvents,
258
- reflection,
259
- })).catch((err: unknown) => {
260
- log.warn('session-run', `Autonomy observation failed for ${input.runId}`, {
261
- sessionId: input.sessionId,
262
- error: errorMessage(err),
263
- })
264
- })
265
- }
266
-
267
- function markRunningEntryCancelled(entry: QueueEntry, reason: string) {
268
- if (entry.run.status === 'cancelled') return
269
- entry.run.status = 'cancelled'
270
- entry.run.endedAt = now()
271
- entry.run.error = reason
272
- syncRunRecord(entry.run)
273
- emitRunMeta(entry, 'cancelled', { reason })
274
- }
275
-
276
- function abortSessionRuntime(entry: QueueEntry, reason: string) {
277
- markRunningEntryCancelled(entry, reason)
278
- entry.signalController.abort()
279
- try { active.get(entry.run.sessionId)?.kill?.() } catch { /* noop */ }
280
- active.delete(entry.run.sessionId)
281
- try { cleanupSessionBrowser(entry.run.sessionId) } catch { /* noop */ }
282
- try { cancelDelegationJobsForParentSession(entry.run.sessionId, reason) } catch { /* noop */ }
283
- }
284
-
285
- function executionKeyForSession(sessionId: string): string {
286
- return `session:${sessionId}`
287
- }
288
-
289
- function nonHeartbeatActivityLeaseName(sessionId: string): string {
290
- return `session-non-heartbeat:${sessionId}`
291
- }
292
-
293
- export function hasActiveNonHeartbeatSessionLease(sessionId: string): boolean {
294
- return isRuntimeLockActive(nonHeartbeatActivityLeaseName(sessionId))
295
- }
296
-
297
- function hasExternalSessionExecutionHold(sessionId: string): boolean {
298
- return (state.externalSessionHolds.get(sessionId) || 0) > 0
299
- }
300
-
301
- export function acquireExternalSessionExecutionHold(sessionId: string): () => void {
302
- const current = state.externalSessionHolds.get(sessionId) || 0
303
- state.externalSessionHolds.set(sessionId, current + 1)
304
- let released = false
305
- const holdKey = `${sessionId}:${current + 1}`
306
- const ttlTimer = setTimeout(() => {
307
- if (released) return
308
- log.warn('session-run', 'External hold auto-released after TTL', { sessionId, holdKey, ttlMs: EXTERNAL_HOLD_TTL_MS })
309
- release()
310
- }, EXTERNAL_HOLD_TTL_MS)
311
- state.externalHoldTimers.set(holdKey, ttlTimer)
312
- const release = () => {
313
- if (released) return
314
- released = true
315
- const timer = state.externalHoldTimers.get(holdKey)
316
- if (timer) {
317
- clearTimeout(timer)
318
- state.externalHoldTimers.delete(holdKey)
319
- }
320
- const next = (state.externalSessionHolds.get(sessionId) || 1) - 1
321
- if (next > 0) state.externalSessionHolds.set(sessionId, next)
322
- else state.externalSessionHolds.delete(sessionId)
323
- void drainExecution(executionKeyForSession(sessionId))
324
- }
325
- return release
326
- }
327
-
328
- function queueForExecution(executionKey: string): QueueEntry[] {
329
- const existing = state.queueByExecution.get(executionKey)
330
- if (existing) return existing
331
- const created: QueueEntry[] = []
332
- state.queueByExecution.set(executionKey, created)
333
- return created
334
- }
335
-
336
- function normalizeMode(mode: string | undefined, internal: boolean): SessionQueueMode {
337
- if (mode === 'steer' || mode === 'collect' || mode === 'followup') return mode
338
- return internal ? 'collect' : 'followup'
339
- }
7
+ cancelAllHeartbeatRuns as cancelAllHeartbeatRunsInternal,
8
+ cancelAllRuns as cancelAllRunsInternal,
9
+ cancelQueuedRunById as cancelQueuedRunByIdInternal,
10
+ cancelQueuedRunsForSession as cancelQueuedRunsForSessionInternal,
11
+ cancelSessionRuns as cancelSessionRunsInternal,
12
+ } from '@/lib/server/runtime/session-run-manager/cancellation'
13
+ import { drainExecution as drainExecutionInternal } from '@/lib/server/runtime/session-run-manager/drain'
14
+ import { enqueueSessionRun as enqueueSessionRunInternal } from '@/lib/server/runtime/session-run-manager/enqueue'
15
+ import {
16
+ getRunById as getRunByIdInternal,
17
+ getSessionExecutionState as getSessionExecutionStateInternal,
18
+ getSessionQueueSnapshot as getSessionQueueSnapshotInternal,
19
+ getSessionRunState as getSessionRunStateInternal,
20
+ listRunEvents as listRunEventsInternal,
21
+ listRuns as listRunsInternal,
22
+ } from '@/lib/server/runtime/session-run-manager/queries'
23
+ import {
24
+ ensureRecoveredPersistedRuns as ensureRecoveredPersistedRunsInternal,
25
+ repairSessionRunQueue as repairSessionRunQueueInternal,
26
+ sweepStuckRuns as sweepStuckRunsInternal,
27
+ } from '@/lib/server/runtime/session-run-manager/recovery'
28
+ import type {
29
+ EnqueueSessionRunInput,
30
+ EnqueueSessionRunResult,
31
+ } from '@/lib/server/runtime/session-run-manager/types'
340
32
 
341
- function markPersistedRunInterrupted(run: SessionRunRecord, reason: string): SessionRunRecord {
342
- const interruptedAt = now()
343
- const next = patchPersistedRun(run.id, (current) => {
344
- const target = current || run
345
- return {
346
- ...target,
347
- status: 'cancelled',
348
- endedAt: target.endedAt || interruptedAt,
349
- interruptedAt,
350
- interruptedReason: reason,
351
- error: target.error || reason,
352
- }
353
- }) || {
354
- ...run,
355
- status: 'cancelled',
356
- endedAt: run.endedAt || interruptedAt,
357
- interruptedAt,
358
- interruptedReason: reason,
359
- error: run.error || reason,
360
- }
361
- state.runs.set(next.id, next)
362
- if (!state.recentRunIds.includes(next.id)) {
363
- state.recentRunIds.push(next.id)
364
- trimRecentRuns()
365
- }
366
- appendPersistedRunEvent({
367
- runId: next.id,
368
- sessionId: next.sessionId,
369
- phase: 'status',
370
- status: 'cancelled',
371
- summary: reason,
372
- event: {
373
- t: 'md',
374
- text: JSON.stringify({
375
- run: {
376
- id: next.id,
377
- sessionId: next.sessionId,
378
- status: 'cancelled',
379
- interrupted: true,
380
- reason,
381
- },
382
- }),
383
- },
384
- })
385
- return next
386
- }
33
+ export type {
34
+ EnqueueSessionRunInput,
35
+ EnqueueSessionRunResult,
36
+ SessionQueueMode,
37
+ } from '@/lib/server/runtime/session-run-manager/types'
387
38
 
388
39
  function ensureRecoveredPersistedRuns(): void {
389
- if (recoveryState.completed) return
390
- recoveryState.completed = true
391
- const staleRuns = loadRecoverableStaleRuns()
392
- if (!staleRuns.length) return
393
- const recoveryBlocked = isAutonomyEstopEngaged() || isAllEstopEngaged()
394
-
395
- for (const run of staleRuns) {
396
- const interrupted = markPersistedRunInterrupted(run, 'Interrupted by server restart before the run completed.')
397
- const payload = interrupted.recoveryPayload
398
- if (
399
- recoveryBlocked
400
- || interrupted.recoveredFromRestart
401
- || !payload
402
- || !isRestartRecoverableSource(interrupted.source)
403
- ) {
404
- continue
405
- }
406
-
407
- try {
408
- enqueueSessionRun({
409
- sessionId: interrupted.sessionId,
410
- message: payload.message,
411
- imagePath: payload.imagePath,
412
- imageUrl: payload.imageUrl,
413
- attachedFiles: payload.attachedFiles,
414
- internal: payload.internal,
415
- source: payload.source,
416
- mode: normalizeMode(payload.mode, payload.internal),
417
- dedupeKey: interrupted.dedupeKey,
418
- maxRuntimeMs: payload.maxRuntimeMs,
419
- modelOverride: payload.modelOverride,
420
- heartbeatConfig: payload.heartbeatConfig,
421
- replyToId: payload.replyToId,
422
- executionGroupKey: payload.executionGroupKey,
423
- recoveredFromRestart: true,
424
- recoveredFromRunId: interrupted.id,
425
- })
426
- } catch (err: unknown) {
427
- log.warn('session-run', `Failed to requeue interrupted run ${interrupted.id}`, {
428
- sessionId: interrupted.sessionId,
429
- error: errorMessage(err),
430
- })
431
- }
432
- }
433
- }
434
-
435
- function isNonHeartbeatEntry(entry: QueueEntry): boolean {
436
- return !isInternalHeartbeatRun(entry.run.internal, entry.run.source)
437
- }
438
-
439
- function incrementNonHeartbeatWork(entry: QueueEntry): void {
440
- if (!isNonHeartbeatEntry(entry)) return
441
- entry.nonHeartbeatCounted = true
442
- state.nonHeartbeatWorkCount.set(entry.run.sessionId, (state.nonHeartbeatWorkCount.get(entry.run.sessionId) || 0) + 1)
443
- }
444
-
445
- function decrementNonHeartbeatWork(entry: QueueEntry): void {
446
- if (!entry.nonHeartbeatCounted) return
447
- entry.nonHeartbeatCounted = false
448
- const sessionId = entry.run.sessionId
449
- const count = (state.nonHeartbeatWorkCount.get(sessionId) || 0) - 1
450
- if (count <= 0) state.nonHeartbeatWorkCount.delete(sessionId)
451
- else state.nonHeartbeatWorkCount.set(sessionId, count)
452
- }
453
-
454
- function hasLocalNonHeartbeatWork(sessionId: string): boolean {
455
- return (state.nonHeartbeatWorkCount.get(sessionId) || 0) > 0
40
+ ensureRecoveredPersistedRunsInternal(enqueueSessionRun)
456
41
  }
457
42
 
458
- function clearDeferredDrain(executionKey: string): void {
459
- const timer = state.deferredDrainTimers.get(executionKey)
460
- if (!timer) return
461
- clearTimeout(timer)
462
- state.deferredDrainTimers.delete(executionKey)
43
+ function drainExecution(executionKey: string): Promise<void> {
44
+ return drainExecutionInternal(executionKey, { enqueueSessionRun })
463
45
  }
464
46
 
465
- function deleteQueueEntry(queue: QueueEntry[], target: QueueEntry): boolean {
466
- const idx = queue.indexOf(target)
467
- if (idx === -1) return false
468
- queue.splice(idx, 1)
469
- return true
470
- }
471
-
472
- function scheduleDeferredDrain(executionKey: string, delayMs = HEARTBEAT_BUSY_RETRY_MS): void {
473
- if (state.deferredDrainTimers.has(executionKey)) return
474
- const timer = setTimeout(() => {
475
- state.deferredDrainTimers.delete(executionKey)
47
+ export function acquireExternalSessionExecutionHold(sessionId: string): () => void {
48
+ return acquireExternalSessionExecutionHoldInternal(sessionId, (executionKey) => {
476
49
  void drainExecution(executionKey)
477
- }, delayMs)
478
- state.deferredDrainTimers.set(executionKey, timer)
479
- }
480
-
481
- function stopSessionActivityLease(sessionId: string): void {
482
- const timer = state.activityLeaseRenewTimers.get(sessionId)
483
- if (timer) {
484
- clearInterval(timer)
485
- state.activityLeaseRenewTimers.delete(sessionId)
486
- }
487
- releaseRuntimeLock(nonHeartbeatActivityLeaseName(sessionId), SHARED_ACTIVITY_LEASE_OWNER)
488
- }
489
-
490
- function startSessionActivityLease(sessionId: string): void {
491
- if (state.activityLeaseRenewTimers.has(sessionId)) return
492
- const leaseName = nonHeartbeatActivityLeaseName(sessionId)
493
- tryAcquireRuntimeLock(leaseName, SHARED_ACTIVITY_LEASE_OWNER, SHARED_ACTIVITY_LEASE_TTL_MS)
494
- const timer = setInterval(() => {
495
- if (!hasLocalNonHeartbeatWork(sessionId)) {
496
- stopSessionActivityLease(sessionId)
497
- return
498
- }
499
- tryAcquireRuntimeLock(leaseName, SHARED_ACTIVITY_LEASE_OWNER, SHARED_ACTIVITY_LEASE_TTL_MS)
500
- }, SHARED_ACTIVITY_LEASE_RENEW_MS)
501
- state.activityLeaseRenewTimers.set(sessionId, timer)
502
- }
503
-
504
- function reconcileSessionActivityLease(sessionId: string): void {
505
- if (hasLocalNonHeartbeatWork(sessionId)) startSessionActivityLease(sessionId)
506
- else stopSessionActivityLease(sessionId)
507
- }
508
-
509
- function resolveRecoveredQueuedEntry(entry: QueueEntry, reason: string): void {
510
- decrementNonHeartbeatWork(entry)
511
- if (entry.run.status === 'completed' || entry.run.status === 'failed' || entry.run.status === 'cancelled') {
512
- entry.run.endedAt = entry.run.endedAt || now()
513
- } else {
514
- entry.run.status = 'failed'
515
- entry.run.endedAt = now()
516
- }
517
- entry.run.error = reason
518
- syncRunRecord(entry.run)
519
- emitToSubscribers(entry, { t: 'err', text: reason })
520
- emitRunMeta(entry, 'failed', {
521
- error: reason,
522
- recovered: true,
523
- })
524
- entry.resolve({
525
- runId: entry.run.id,
526
- sessionId: entry.run.sessionId,
527
- text: '',
528
- persisted: false,
529
- toolEvents: [],
530
- error: reason,
531
50
  })
532
51
  }
533
52
 
@@ -542,903 +61,79 @@ export function repairSessionRunQueue(
542
61
  kickedExecutionKeys: number
543
62
  recoveredQueuedRuns: number
544
63
  } {
545
- const maxQueuedAgeMs = Math.max(1_000, opts?.maxQueuedAgeMs ?? STALE_QUEUED_RUN_MS)
546
- const reason = opts?.reason || 'Recovered stale queued run'
547
- const targetExecutionKey = typeof opts?.executionKey === 'string' && opts.executionKey.trim()
548
- ? opts.executionKey.trim()
549
- : null
550
- const queuedNow = now()
551
- let kickedExecutionKeys = 0
552
- let recoveredQueuedRuns = 0
553
-
554
- for (const [executionKey, queue] of state.queueByExecution.entries()) {
555
- if (targetExecutionKey && executionKey !== targetExecutionKey) continue
556
- if (!queue.length) {
557
- clearDeferredDrain(executionKey)
558
- state.queueByExecution.delete(executionKey)
559
- continue
560
- }
561
- if (state.runningByExecution.has(executionKey)) continue
562
-
563
- const matching = queue.filter((entry) => entry.run.sessionId === sessionId)
564
- if (!matching.length) continue
565
-
566
- for (const entry of [...matching]) {
567
- const missingPromise = !state.promises.has(entry.run.id)
568
- const previousStatus = entry.run.status
569
- const nonQueued = previousStatus !== 'queued'
570
- const ageMs = Math.max(0, queuedNow - (entry.run.queuedAt || 0))
571
- const stale = nonQueued || missingPromise || ageMs >= maxQueuedAgeMs
572
- if (!stale) continue
573
- if (!deleteQueueEntry(queue, entry)) continue
574
- clearDeferredDrain(executionKey)
575
- resolveRecoveredQueuedEntry(entry, reason)
576
- recoveredQueuedRuns += 1
577
- log.warn('session-run', `Recovered stale queued run ${entry.run.id}`, {
578
- sessionId: entry.run.sessionId,
579
- executionKey,
580
- source: entry.run.source,
581
- ageMs,
582
- missingPromise,
583
- previousStatus,
584
- })
585
- }
586
-
587
- if (!queue.length) {
588
- clearDeferredDrain(executionKey)
589
- state.queueByExecution.delete(executionKey)
590
- continue
591
- }
592
-
593
- if (queue.some((entry) => entry.run.sessionId === sessionId)) {
594
- clearDeferredDrain(executionKey)
595
- kickedExecutionKeys += 1
596
- void drainExecution(executionKey)
597
- }
598
- }
599
-
600
- if (recoveredQueuedRuns > 0) reconcileSessionActivityLease(sessionId)
601
- return { kickedExecutionKeys, recoveredQueuedRuns }
602
- }
603
-
604
- function cancelPendingForSession(sessionId: string, reason: string): number {
605
- let cancelled = 0
606
- for (const [key, queue] of state.queueByExecution.entries()) {
607
- if (!queue.length) continue
608
- const keep: QueueEntry[] = []
609
- for (const entry of queue) {
610
- if (entry.run.sessionId !== sessionId) {
611
- keep.push(entry)
612
- continue
613
- }
614
- entry.run.status = 'cancelled'
615
- entry.run.endedAt = now()
616
- entry.run.error = reason
617
- syncRunRecord(entry.run)
618
- emitRunMeta(entry, 'cancelled', { reason })
619
- entry.reject(new Error(reason))
620
- decrementNonHeartbeatWork(entry)
621
- cancelled++
622
- }
623
- if (keep.length > 0) state.queueByExecution.set(key, keep)
624
- else state.queueByExecution.delete(key)
625
- }
626
- reconcileSessionActivityLease(sessionId)
627
- return cancelled
628
- }
629
-
630
- function cancelQueuedEntries(
631
- matcher: (entry: QueueEntry) => boolean,
632
- reason: string,
633
- ): { cancelled: number; sessionIds: Set<string> } {
634
- let cancelled = 0
635
- const sessionIds = new Set<string>()
636
- for (const [key, queue] of state.queueByExecution.entries()) {
637
- if (!queue.length) continue
638
- const keep: QueueEntry[] = []
639
- for (const entry of queue) {
640
- if (!matcher(entry)) {
641
- keep.push(entry)
642
- continue
643
- }
644
- entry.run.status = 'cancelled'
645
- entry.run.endedAt = now()
646
- entry.run.error = reason
647
- syncRunRecord(entry.run)
648
- emitRunMeta(entry, 'cancelled', { reason })
649
- entry.reject(new Error(reason))
650
- decrementNonHeartbeatWork(entry)
651
- sessionIds.add(entry.run.sessionId)
652
- cancelled += 1
653
- }
654
- if (keep.length > 0) state.queueByExecution.set(key, keep)
655
- else state.queueByExecution.delete(key)
656
- }
657
- for (const sessionId of sessionIds) reconcileSessionActivityLease(sessionId)
658
- return { cancelled, sessionIds }
659
- }
660
-
661
- export function cancelAllHeartbeatRuns(reason = 'Heartbeat disabled globally'): { cancelledQueued: number; abortedRunning: number } {
662
- ensureRecoveredPersistedRuns()
663
- let cancelledQueued = 0
664
- let abortedRunning = 0
665
-
666
- for (const [key, queue] of state.queueByExecution.entries()) {
667
- if (!queue.length) continue
668
- const keep: QueueEntry[] = []
669
- for (const entry of queue) {
670
- const isHeartbeat = isInternalHeartbeatRun(entry.run.internal, entry.run.source)
671
- if (!isHeartbeat) {
672
- keep.push(entry)
673
- continue
674
- }
675
- entry.run.status = 'cancelled'
676
- entry.run.endedAt = now()
677
- entry.run.error = reason
678
- syncRunRecord(entry.run)
679
- emitRunMeta(entry, 'cancelled', { reason })
680
- entry.reject(new Error(reason))
681
- cancelledQueued += 1
682
- }
683
- if (keep.length > 0) state.queueByExecution.set(key, keep)
684
- else state.queueByExecution.delete(key)
685
- }
686
-
687
- for (const entry of state.runningByExecution.values()) {
688
- const isHeartbeat = isInternalHeartbeatRun(entry.run.internal, entry.run.source)
689
- if (!isHeartbeat) continue
690
- abortedRunning += 1
691
- abortSessionRuntime(entry, reason)
692
- }
693
-
694
- return { cancelledQueued, abortedRunning }
695
- }
696
-
697
- export function cancelAllRuns(reason = 'Cancelled'): { cancelledQueued: number; abortedRunning: number } {
698
- ensureRecoveredPersistedRuns()
699
- let cancelledQueued = 0
700
- let abortedRunning = 0
701
-
702
- for (const [key, queue] of state.queueByExecution.entries()) {
703
- if (!queue.length) continue
704
- for (const entry of queue) {
705
- entry.run.status = 'cancelled'
706
- entry.run.endedAt = now()
707
- entry.run.error = reason
708
- syncRunRecord(entry.run)
709
- emitRunMeta(entry, 'cancelled', { reason })
710
- entry.reject(new Error(reason))
711
- cancelledQueued += 1
712
- }
713
- state.queueByExecution.delete(key)
714
- }
715
-
716
- for (const entry of state.runningByExecution.values()) {
717
- abortedRunning += 1
718
- abortSessionRuntime(entry, reason)
719
- }
720
- state.runningByExecution.clear()
721
- state.nonHeartbeatWorkCount.clear()
722
-
723
- return { cancelledQueued, abortedRunning }
724
- }
725
-
726
- async function drainExecution(executionKey: string): Promise<void> {
727
- const depth = (state.drainDepth.get(executionKey) || 0) + 1
728
- state.drainDepth.set(executionKey, depth)
729
- if (depth > MAX_DRAIN_DEPTH) {
730
- log.error('session-run', 'Drain recursion depth exceeded, deferring', { executionKey, depth, max: MAX_DRAIN_DEPTH })
731
- state.drainDepth.delete(executionKey)
732
- scheduleDeferredDrain(executionKey, 500)
733
- return
734
- }
735
- try {
736
- if (state.runningByExecution.has(executionKey)) return
737
- const q = queueForExecution(executionKey)
738
- // 3-tier drain priority: (1) user-facing, (2) internal non-heartbeat, (3) heartbeat
739
- const userIdx = q.findIndex(e => !e.run.internal)
740
- let next: QueueEntry | undefined
741
- if (userIdx >= 0) {
742
- next = q.splice(userIdx, 1)[0]
743
- } else {
744
- const internalIdx = q.findIndex(e => !isInternalHeartbeatRun(e.run.internal, e.run.source))
745
- next = internalIdx >= 0 ? q.splice(internalIdx, 1)[0] : q.shift()
746
- }
747
- if (!next) {
748
- clearDeferredDrain(executionKey)
749
- return
750
- }
751
-
752
- if (isInternalHeartbeatRun(next.run.internal, next.run.source) && hasActiveNonHeartbeatSessionLease(next.run.sessionId)) {
753
- q.unshift(next)
754
- scheduleDeferredDrain(executionKey, HEARTBEAT_BUSY_RETRY_MS)
755
- log.info('session-run', `Deferred heartbeat run ${next.run.id} for shared busy session`, {
756
- sessionId: next.run.sessionId,
757
- source: next.run.source,
758
- leaseName: nonHeartbeatActivityLeaseName(next.run.sessionId),
759
- })
760
- return
761
- }
762
-
763
- if (hasExternalSessionExecutionHold(next.run.sessionId)) {
764
- q.unshift(next)
765
- scheduleDeferredDrain(executionKey, HEARTBEAT_BUSY_RETRY_MS)
766
- log.info('session-run', `Deferred run ${next.run.id} for external session hold`, {
767
- sessionId: next.run.sessionId,
768
- source: next.run.source,
769
- mode: next.run.mode,
770
- })
771
- return
772
- }
773
-
774
- clearDeferredDrain(executionKey)
775
- state.runningByExecution.set(executionKey, next)
776
- next.run.status = 'running'
777
- next.run.startedAt = now()
778
- syncRunRecord(next.run)
779
- emitRunMeta(next, 'running')
780
- log.info('session-run', `Run started ${next.run.id}`, {
781
- sessionId: next.run.sessionId,
782
- source: next.run.source,
783
- internal: next.run.internal,
784
- mode: next.run.mode,
785
- timeoutMs: next.maxRuntimeMs || null,
786
- })
787
-
788
- let runtimeTimer: ReturnType<typeof setTimeout> | null = null
789
- let finishedMissionId: string | null = null
790
- if (next.maxRuntimeMs && next.maxRuntimeMs > 0) {
791
- runtimeTimer = setTimeout(() => {
792
- next.signalController.abort()
793
- }, next.maxRuntimeMs)
794
- }
795
-
796
- try {
797
- const result = await executeSessionChatTurn({
798
- sessionId: next.run.sessionId,
799
- message: next.message,
800
- imagePath: next.imagePath,
801
- imageUrl: next.imageUrl,
802
- attachedFiles: next.attachedFiles,
803
- internal: next.run.internal,
804
- source: next.run.source,
805
- runId: next.run.id,
806
- signal: next.signalController.signal,
807
- onEvent: (event) => emitToSubscribers(next, event),
808
- modelOverride: next.modelOverride,
809
- heartbeatConfig: next.heartbeatConfig,
810
- replyToId: next.replyToId,
811
- })
812
-
813
- const failed = !!result.error
814
- const aborted = next.signalController.signal.aborted
815
- next.run.status = aborted ? 'cancelled' : (failed ? 'failed' : 'completed')
816
- next.run.endedAt = next.run.endedAt || now()
817
- next.run.error = aborted ? (next.run.error || 'Cancelled') : result.error
818
- next.run.missionId = result.missionId || next.run.missionId || null
819
- finishedMissionId = next.run.missionId || null
820
- next.run.resultPreview = result.text?.slice(0, 280)
821
- if (typeof result.inputTokens === 'number') next.run.totalInputTokens = result.inputTokens
822
- if (typeof result.outputTokens === 'number') next.run.totalOutputTokens = result.outputTokens
823
- if (typeof result.estimatedCost === 'number') next.run.estimatedCost = result.estimatedCost
824
- syncRunRecord(next.run)
825
- emitRunMeta(next, next.run.status, {
826
- persisted: result.persisted,
827
- hasText: !!result.text,
828
- error: next.run.error || null,
829
- })
830
- log.info('session-run', `Run finished ${next.run.id}`, {
831
- sessionId: next.run.sessionId,
832
- status: next.run.status,
833
- persisted: result.persisted,
834
- hasText: !!result.text,
835
- error: next.run.error || null,
836
- durationMs: (next.run.endedAt || now()) - (next.run.startedAt || now()),
837
- })
838
- const followup = handleMainLoopRunResult({
839
- runId: next.run.id,
840
- sessionId: next.run.sessionId,
841
- message: next.message,
842
- internal: next.run.internal,
843
- source: next.run.source,
844
- resultText: result.text,
845
- error: next.run.error,
846
- toolEvents: result.toolEvents,
847
- inputTokens: result.inputTokens,
848
- outputTokens: result.outputTokens,
849
- estimatedCost: result.estimatedCost,
850
- })
851
- queueAutonomyObservation({
852
- runId: next.run.id,
853
- sessionId: next.run.sessionId,
854
- source: next.run.source,
855
- status: next.run.status,
856
- resultText: result.text,
857
- error: next.run.error || null,
858
- toolEvents: result.toolEvents,
859
- sourceMessage: next.message,
860
- })
861
- if (followup) {
862
- setTimeout(() => {
863
- try {
864
- enqueueSessionRun({
865
- sessionId: next.run.sessionId,
866
- message: followup.message,
867
- internal: true,
868
- source: 'main-loop-followup',
869
- mode: 'followup',
870
- dedupeKey: followup.dedupeKey,
871
- })
872
- } catch (err: unknown) {
873
- log.warn('session-run', `Main loop follow-up enqueue failed for ${next.run.sessionId}`, {
874
- error: errorMessage(err),
875
- })
876
- }
877
- }, Math.max(0, followup.delayMs || 0))
878
- }
879
- next.resolve(result)
880
- } catch (err: unknown) {
881
- const aborted = next.signalController.signal.aborted
882
- next.run.status = aborted ? 'cancelled' : 'failed'
883
- next.run.endedAt = now()
884
- next.run.error = errorMessage(err)
885
- finishedMissionId = next.run.missionId || null
886
- syncRunRecord(next.run)
887
- emitRunMeta(next, next.run.status, { error: next.run.error })
888
- log.error('session-run', `Run failed ${next.run.id}`, {
889
- sessionId: next.run.sessionId,
890
- status: next.run.status,
891
- error: next.run.error,
892
- durationMs: (next.run.endedAt || now()) - (next.run.startedAt || now()),
893
- })
894
- if (err instanceof Error && err.stack) {
895
- log.error('session-run', `Run failed stack trace ${next.run.id}`, {
896
- sessionId: next.run.sessionId,
897
- stack: err.stack,
898
- })
899
- }
900
- queueAutonomyObservation({
901
- runId: next.run.id,
902
- sessionId: next.run.sessionId,
903
- source: next.run.source,
904
- status: next.run.status,
905
- error: next.run.error || null,
906
- sourceMessage: next.message,
907
- })
908
- next.reject(err instanceof Error ? err : new Error(next.run.error))
909
- } finally {
910
- if (runtimeTimer) clearTimeout(runtimeTimer)
911
- state.runningByExecution.delete(executionKey)
912
- decrementNonHeartbeatWork(next)
913
- reconcileSessionActivityLease(next.run.sessionId)
914
- notify(`stream-end:${next.run.sessionId}`)
915
- if (finishedMissionId && next.run.source !== 'chat') {
916
- queueMicrotask(() => {
917
- import('@/lib/server/missions/mission-service')
918
- .then(({ loadMissionById, requestMissionTick }) => {
919
- const mission = loadMissionById(finishedMissionId)
920
- if (!mission) return
921
- if (mission.status !== 'active') return
922
- if (mission.phase === 'dispatching' || mission.phase === 'executing') return
923
- requestMissionTick(finishedMissionId as string, 'run_drained', {
924
- runId: next.run.id,
925
- source: next.run.source,
926
- status: next.run.status,
927
- })
928
- })
929
- .catch((err: unknown) => {
930
- log.warn('session-run', 'Mission tick failed', { missionId: finishedMissionId, runId: next.run.id, error: errorMessage(err) })
931
- })
932
- })
933
- }
934
- void drainExecution(executionKey)
935
- }
936
- } finally {
937
- state.drainDepth.delete(executionKey)
938
- }
939
- }
940
-
941
- function findDedupeMatch(sessionId: string, dedupeKey?: string): QueueEntry | null {
942
- if (!dedupeKey) return null
943
- const executionKey = executionKeyForSession(sessionId)
944
- const running = state.runningByExecution.get(executionKey)
945
- if (running?.run.sessionId === sessionId && running?.run.dedupeKey === dedupeKey) return running
946
- const q = queueForExecution(executionKey)
947
- return q.find((e) => e.run.sessionId === sessionId && e.run.dedupeKey === dedupeKey) || null
948
- }
949
-
950
- export interface EnqueueSessionRunInput {
951
- sessionId: string
952
- message: string
953
- missionId?: string | null
954
- imagePath?: string
955
- imageUrl?: string
956
- attachedFiles?: string[]
957
- internal?: boolean
958
- source?: string
959
- mode?: SessionQueueMode
960
- onEvent?: (event: SSEEvent) => void
961
- dedupeKey?: string
962
- maxRuntimeMs?: number
963
- modelOverride?: string
964
- heartbeatConfig?: SessionRunHeartbeatConfig
965
- replyToId?: string
966
- /** Optional shared execution lane key. When set, multiple sessions can be serialized together. */
967
- executionGroupKey?: string
968
- /** External abort signal (e.g. from the HTTP request) — chained to the run's internal AbortController */
969
- callerSignal?: AbortSignal
970
- recoveredFromRestart?: boolean
971
- recoveredFromRunId?: string
972
- }
973
-
974
- export interface EnqueueSessionRunResult {
975
- runId: string
976
- position: number
977
- deduped?: boolean
978
- coalesced?: boolean
979
- promise: Promise<ExecuteChatTurnResult>
980
- /** Abort the run's internal AbortController (cancels the LLM stream). */
981
- abort: () => void
982
- /** Remove this caller's onEvent listener from the run (call on client disconnect). */
983
- unsubscribe: () => void
984
- }
985
-
986
- const LONG_TOOL_NAMES: ReadonlySet<string> = new Set(['claude_code', 'codex_cli', 'opencode_cli'])
987
- type SessionToolConfig = {
988
- tools?: string[] | null
989
- extensions?: string[] | null
990
- }
991
-
992
- function computeEffectiveRunTimeoutMs(
993
- baseTimeoutMs: number,
994
- sessionTools: string[],
995
- runtime: { claudeCodeTimeoutMs: number },
996
- ): number {
997
- const hasLongTool = sessionTools.some(t => LONG_TOOL_NAMES.has(t))
998
- if (!hasLongTool) return baseTimeoutMs
999
- const toolTimeout = runtime.claudeCodeTimeoutMs + 120_000
1000
- return Math.max(baseTimeoutMs, toolTimeout)
1001
- }
1002
-
1003
- function isAutonomyManagedEnqueue(source: string, internal: boolean): boolean {
1004
- return !(source === 'chat' && !internal)
1005
- }
1006
-
1007
- function buildRecoveryPayload(
1008
- input: EnqueueSessionRunInput,
1009
- source: string,
1010
- mode: SessionQueueMode,
1011
- maxRuntimeMs: number | undefined,
1012
- executionKey: string,
1013
- ) {
1014
- return {
1015
- message: input.message,
1016
- imagePath: input.imagePath,
1017
- imageUrl: input.imageUrl,
1018
- attachedFiles: input.attachedFiles,
1019
- internal: input.internal === true,
1020
- source,
1021
- mode,
1022
- maxRuntimeMs,
1023
- modelOverride: input.modelOverride,
1024
- heartbeatConfig: input.heartbeatConfig,
1025
- replyToId: input.replyToId,
1026
- executionGroupKey: executionKey.startsWith('session:') ? undefined : executionKey,
1027
- }
64
+ return repairSessionRunQueueInternal(sessionId, drainExecution, opts)
1028
65
  }
1029
66
 
1030
67
  export function enqueueSessionRun(input: EnqueueSessionRunInput): EnqueueSessionRunResult {
1031
68
  ensureRecoveredPersistedRuns()
1032
- const internal = input.internal === true
1033
- const mode = normalizeMode(input.mode, internal)
1034
- const source = input.source || 'chat'
1035
- if (isAllEstopEngaged()) {
1036
- throw new Error('Execution is blocked because all estop is engaged.')
1037
- }
1038
- if (isAutonomyEstopEngaged() && isAutonomyManagedEnqueue(source, internal)) {
1039
- throw new Error(`Autonomy estop is engaged. New ${source} runs are paused.`)
1040
- }
1041
- const executionKey = typeof input.executionGroupKey === 'string' && input.executionGroupKey.trim()
1042
- ? input.executionGroupKey.trim()
1043
- : executionKeyForSession(input.sessionId)
1044
- repairSessionRunQueue(input.sessionId, {
1045
- executionKey,
1046
- reason: 'Recovered stale queued run before enqueue',
1047
- })
1048
- const runtime = loadRuntimeSettings()
1049
- const defaultMaxRuntimeMs = runtime.ongoingLoopMaxRuntimeMs ?? (10 * 60_000)
1050
- const sessionData = loadSession(input.sessionId) as SessionToolConfig | null
1051
- const sessionTools = getEnabledToolIds(sessionData)
1052
- const adjustedDefaultMs = computeEffectiveRunTimeoutMs(defaultMaxRuntimeMs, sessionTools, runtime)
1053
- const effectiveMaxRuntimeMs = typeof input.maxRuntimeMs === 'number'
1054
- ? input.maxRuntimeMs
1055
- : adjustedDefaultMs
1056
-
1057
- const dedupe = findDedupeMatch(input.sessionId, input.dedupeKey)
1058
- if (dedupe) {
1059
- const cb = input.onEvent
1060
- if (cb) dedupe.onEvents.push(cb)
1061
- if (input.callerSignal) chainCallerSignal(input.callerSignal, dedupe.signalController)
1062
- return {
1063
- runId: dedupe.run.id,
1064
- position: 0,
1065
- deduped: true,
1066
- promise: dedupe.promise,
1067
- abort: () => dedupe.signalController.abort(),
1068
- unsubscribe: () => {
1069
- if (!cb) return
1070
- const idx = dedupe.onEvents.indexOf(cb)
1071
- if (idx >= 0) dedupe.onEvents.splice(idx, 1)
1072
- },
1073
- }
1074
- }
1075
-
1076
- if (mode === 'steer') {
1077
- const running = state.runningByExecution.get(executionKey)
1078
- if (running && running.run.sessionId === input.sessionId) {
1079
- running.signalController.abort()
1080
- try { active.get(input.sessionId)?.kill?.() } catch { /* noop */ }
1081
- }
1082
- cancelPendingForSession(input.sessionId, 'Cancelled by steer mode')
1083
- }
1084
-
1085
- // Heartbeat preemption: if a user chat arrives while a heartbeat is running,
1086
- // abort the heartbeat so the user doesn't wait. The heartbeat will retry
1087
- // on the next tick.
1088
- if (!internal && source === 'chat') {
1089
- const running = state.runningByExecution.get(executionKey)
1090
- if (running && isInternalHeartbeatRun(running.run.internal, running.run.source)) {
1091
- log.info('session-run', `Preempting heartbeat ${running.run.id} for user chat on ${input.sessionId}`)
1092
- abortSessionRuntime(running, 'Preempted by user chat')
1093
- state.runningByExecution.delete(executionKey)
1094
- }
1095
- }
1096
-
1097
- const running = state.runningByExecution.get(executionKey)
1098
- const q = queueForExecution(executionKey)
1099
- if (mode === 'collect' && !input.imagePath && !input.imageUrl && !input.attachedFiles?.length) {
1100
- const nowMs = nextQueuedAt()
1101
- const candidate = q.at(-1)
1102
- const canCoalesce = !!candidate
1103
- && candidate.run.mode === 'collect'
1104
- && candidate.run.internal === internal
1105
- && candidate.run.source === source
1106
- && !candidate.imagePath
1107
- && !candidate.imageUrl
1108
- && !candidate.attachedFiles?.length
1109
- && (nowMs - candidate.run.queuedAt) <= COLLECT_COALESCE_WINDOW_MS
1110
-
1111
- if (candidate && canCoalesce) {
1112
- const nextChunk = input.message.trim()
1113
- if (nextChunk) {
1114
- const current = candidate.message.trim()
1115
- candidate.message = current
1116
- ? `${current}\n\n[Collected follow-up]\n${nextChunk}`
1117
- : nextChunk
1118
- candidate.run.messagePreview = messagePreview(candidate.message)
1119
- candidate.run.queuedAt = nowMs
1120
- syncRunRecord(candidate.run)
1121
- }
1122
- const coalesceCb = input.onEvent
1123
- if (coalesceCb) candidate.onEvents.push(coalesceCb)
1124
- if (input.callerSignal) chainCallerSignal(input.callerSignal, candidate.signalController)
1125
- emitRunMeta(candidate, 'queued', { position: 0, coalesced: true, mergedIntoRunId: candidate.run.id })
1126
- return {
1127
- runId: candidate.run.id,
1128
- position: 0,
1129
- coalesced: true,
1130
- promise: candidate.promise,
1131
- abort: () => candidate.signalController.abort(),
1132
- unsubscribe: () => {
1133
- if (!coalesceCb) return
1134
- const idx = candidate.onEvents.indexOf(coalesceCb)
1135
- if (idx >= 0) candidate.onEvents.splice(idx, 1)
1136
- },
1137
- }
1138
- }
1139
- }
1140
-
1141
- const runId = genId(8)
1142
- const run: SessionRunRecord = {
1143
- id: runId,
1144
- sessionId: input.sessionId,
1145
- missionId: input.missionId ?? loadSession(input.sessionId)?.missionId ?? null,
1146
- source,
1147
- internal,
1148
- mode,
1149
- status: 'queued',
1150
- messagePreview: messagePreview(input.message),
1151
- dedupeKey: input.dedupeKey,
1152
- queuedAt: nextQueuedAt(),
1153
- recoveredFromRestart: input.recoveredFromRestart === true,
1154
- recoveredFromRunId: input.recoveredFromRunId,
1155
- recoveryPayload: buildRecoveryPayload(
1156
- input,
1157
- source,
1158
- mode,
1159
- effectiveMaxRuntimeMs > 0 ? effectiveMaxRuntimeMs : undefined,
1160
- executionKey,
1161
- ),
1162
- }
1163
- registerRun(run)
1164
-
1165
- let resolve!: (value: ExecuteChatTurnResult) => void
1166
- let reject!: (error: Error) => void
1167
- const promise = new Promise<ExecuteChatTurnResult>((res, rej) => {
1168
- resolve = res
1169
- reject = rej
69
+ return enqueueSessionRunInternal(input, {
70
+ repairSessionRunQueue: (sessionId, opts) => repairSessionRunQueue(sessionId, opts),
71
+ drainExecution,
1170
72
  })
1171
- promise.catch(() => {}) // prevent unhandledRejection when entry is cancelled
1172
- state.promises.set(runId, promise)
1173
-
1174
- const entry: QueueEntry = {
1175
- executionKey,
1176
- run,
1177
- message: input.message,
1178
- imagePath: input.imagePath,
1179
- imageUrl: input.imageUrl,
1180
- attachedFiles: input.attachedFiles,
1181
- onEvents: input.onEvent ? [input.onEvent] : [],
1182
- signalController: new AbortController(),
1183
- maxRuntimeMs: effectiveMaxRuntimeMs > 0 ? effectiveMaxRuntimeMs : undefined,
1184
- modelOverride: input.modelOverride,
1185
- heartbeatConfig: input.heartbeatConfig,
1186
- replyToId: input.replyToId,
1187
- resolve,
1188
- reject,
1189
- promise,
1190
- }
1191
-
1192
- if (input.callerSignal) chainCallerSignal(input.callerSignal, entry.signalController)
1193
-
1194
- q.push(entry)
1195
- incrementNonHeartbeatWork(entry)
1196
- if (entry.nonHeartbeatCounted) {
1197
- reconcileSessionActivityLease(input.sessionId)
1198
- }
1199
- const position = (running ? 1 : 0) + q.length - 1
1200
- emitRunMeta(entry, 'queued', { position })
1201
- void drainExecution(executionKey)
1202
-
1203
- const entryCb = input.onEvent
1204
- return {
1205
- runId,
1206
- position,
1207
- promise,
1208
- abort: () => entry.signalController.abort(),
1209
- unsubscribe: () => {
1210
- if (!entryCb) return
1211
- const idx = entry.onEvents.indexOf(entryCb)
1212
- if (idx >= 0) entry.onEvents.splice(idx, 1)
1213
- },
1214
- }
1215
73
  }
1216
74
 
1217
- export function getSessionRunState(sessionId: string): {
1218
- runningRunId?: string
1219
- queueLength: number
1220
- } {
75
+ export function getSessionRunState(sessionId: string) {
1221
76
  ensureRecoveredPersistedRuns()
1222
- const summary = getSessionExecutionState(sessionId)
1223
- return {
1224
- runningRunId: summary.runningRunId,
1225
- queueLength: summary.queueLength,
1226
- }
1227
- }
1228
-
1229
- function visibleQueuedEntriesForSession(sessionId: string): QueueEntry[] {
1230
- return Array.from(state.queueByExecution.values())
1231
- .flatMap((queue) => queue)
1232
- .filter((entry) => entry.run.sessionId === sessionId && entry.run.internal !== true)
1233
- .sort((left, right) => left.run.queuedAt - right.run.queuedAt)
1234
- }
1235
-
1236
- function toQueuedTurn(entry: QueueEntry, index: number): SessionQueuedTurn {
1237
- return {
1238
- runId: entry.run.id,
1239
- sessionId: entry.run.sessionId,
1240
- missionId: entry.run.missionId || null,
1241
- text: entry.message,
1242
- queuedAt: entry.run.queuedAt,
1243
- position: index + 1,
1244
- imagePath: entry.imagePath,
1245
- imageUrl: entry.imageUrl,
1246
- attachedFiles: entry.attachedFiles,
1247
- replyToId: entry.replyToId,
1248
- source: entry.run.source,
1249
- }
77
+ return getSessionRunStateInternal(sessionId)
1250
78
  }
1251
79
 
1252
- export function getSessionQueueSnapshot(sessionId: string): SessionQueueSnapshot {
80
+ export function getSessionQueueSnapshot(sessionId: string) {
1253
81
  ensureRecoveredPersistedRuns()
1254
- const execution = getSessionExecutionState(sessionId)
1255
- const visibleQueued = visibleQueuedEntriesForSession(sessionId)
1256
- return {
1257
- sessionId,
1258
- activeRunId: execution.runningRunId || null,
1259
- queueLength: visibleQueued.length,
1260
- items: visibleQueued.map((entry, index) => toQueuedTurn(entry, index)),
1261
- }
82
+ return getSessionQueueSnapshotInternal(sessionId)
1262
83
  }
1263
84
 
1264
- export function getSessionExecutionState(sessionId: string): {
1265
- runningRunId?: string
1266
- queueLength: number
1267
- hasRunning: boolean
1268
- hasQueued: boolean
1269
- hasRunningHeartbeat: boolean
1270
- hasQueuedHeartbeat: boolean
1271
- hasRunningNonHeartbeat: boolean
1272
- hasQueuedNonHeartbeat: boolean
1273
- } {
85
+ export function getSessionExecutionState(sessionId: string) {
1274
86
  ensureRecoveredPersistedRuns()
1275
- const running = Array.from(state.runningByExecution.values())
1276
- .find((entry) => entry.run.sessionId === sessionId)
1277
- const runningMatchesSession = Boolean(running)
1278
- const runningHeartbeat = Boolean(
1279
- runningMatchesSession
1280
- && running
1281
- && isInternalHeartbeatRun(running.run.internal, running.run.source),
1282
- )
1283
- const runningNonHeartbeat = Boolean(runningMatchesSession && !runningHeartbeat)
1284
- const queuedEntries = Array.from(state.queueByExecution.values())
1285
- .flatMap((queue) => queue)
1286
- .filter((entry) => entry.run.sessionId === sessionId)
1287
- const queuedHeartbeat = queuedEntries.filter((entry) =>
1288
- isInternalHeartbeatRun(entry.run.internal, entry.run.source),
1289
- ).length
1290
- const queuedNonHeartbeat = queuedEntries.length - queuedHeartbeat
1291
- return {
1292
- runningRunId: (runningMatchesSession && running?.run.status === 'running')
1293
- ? running.run.id
1294
- : undefined,
1295
- queueLength: queuedEntries.length,
1296
- hasRunning: Boolean(runningMatchesSession),
1297
- hasQueued: queuedEntries.length > 0,
1298
- hasRunningHeartbeat: runningHeartbeat,
1299
- hasQueuedHeartbeat: queuedHeartbeat > 0,
1300
- hasRunningNonHeartbeat: runningNonHeartbeat,
1301
- hasQueuedNonHeartbeat: queuedNonHeartbeat > 0,
1302
- }
87
+ return getSessionExecutionStateInternal(sessionId)
1303
88
  }
1304
89
 
1305
- export function getRunById(runId: string): SessionRunRecord | null {
90
+ export function getRunById(runId: string) {
1306
91
  ensureRecoveredPersistedRuns()
1307
- return state.runs.get(runId) || loadPersistedRun(runId)
92
+ return getRunByIdInternal(runId)
1308
93
  }
1309
94
 
1310
- export function listRuns(params?: {
1311
- sessionId?: string
1312
- status?: SessionRunStatus
1313
- limit?: number
1314
- }): SessionRunRecord[] {
95
+ export function listRuns(params?: Parameters<typeof listRunsInternal>[0]) {
1315
96
  ensureRecoveredPersistedRuns()
1316
- return listPersistedRuns(params)
97
+ return listRunsInternal(params)
1317
98
  }
1318
99
 
1319
- export function listRunEvents(runId: string, limit?: number): RunEventRecord[] {
100
+ export function listRunEvents(runId: string, limit?: number) {
1320
101
  ensureRecoveredPersistedRuns()
1321
- return listPersistedRunEvents(runId, limit)
102
+ return listRunEventsInternal(runId, limit)
1322
103
  }
1323
104
 
1324
105
  export function cancelQueuedRunById(runId: string, reason = 'Removed from queue'): boolean {
1325
106
  ensureRecoveredPersistedRuns()
1326
- const result = cancelQueuedEntries((entry) => entry.run.id === runId, reason)
1327
- return result.cancelled > 0
107
+ return cancelQueuedRunByIdInternal(runId, reason)
1328
108
  }
1329
109
 
1330
110
  export function cancelQueuedRunsForSession(sessionId: string, reason = 'Cleared queued messages'): number {
1331
111
  ensureRecoveredPersistedRuns()
1332
- const result = cancelQueuedEntries((entry) => entry.run.sessionId === sessionId, reason)
1333
- return result.cancelled
112
+ return cancelQueuedRunsForSessionInternal(sessionId, reason)
1334
113
  }
1335
114
 
1336
- export function cancelSessionRuns(sessionId: string, reason = 'Cancelled'): { cancelledQueued: number; cancelledRunning: boolean } {
115
+ export function cancelSessionRuns(sessionId: string, reason = 'Cancelled') {
1337
116
  ensureRecoveredPersistedRuns()
1338
- const running = Array.from(state.runningByExecution.values())
1339
- .find((entry) => entry.run.sessionId === sessionId)
1340
- let cancelledRunning = false
1341
- if (running) {
1342
- cancelledRunning = true
1343
- abortSessionRuntime(running, reason)
1344
- state.runningByExecution.delete(running.executionKey)
1345
- decrementNonHeartbeatWork(running)
1346
- }
1347
- const cancelledQueued = cancelPendingForSession(sessionId, reason)
1348
- reconcileSessionActivityLease(sessionId)
1349
- return { cancelledQueued, cancelledRunning }
117
+ return cancelSessionRunsInternal(sessionId, reason)
1350
118
  }
1351
119
 
1352
- // ---------------------------------------------------------------------------
1353
- // Stuck-run watchdog — safety net for runs that outlive their timeout.
1354
- // Called periodically from daemon health checks.
1355
- // ---------------------------------------------------------------------------
120
+ export function cancelAllHeartbeatRuns(reason = 'Heartbeat disabled globally') {
121
+ ensureRecoveredPersistedRuns()
122
+ return cancelAllHeartbeatRunsInternal(reason)
123
+ }
1356
124
 
1357
- const STUCK_RUN_THRESHOLD_MS = 20 * 60_000 // 20 min ( default maxRuntime)
125
+ export function cancelAllRuns(reason = 'Cancelled') {
126
+ ensureRecoveredPersistedRuns()
127
+ return cancelAllRunsInternal(reason)
128
+ }
1358
129
 
1359
130
  export function sweepStuckRuns(): { aborted: number } {
1360
- const deadline = now()
1361
- let aborted = 0
1362
-
1363
- // 1. In-memory running entries that have exceeded their timeout
1364
- for (const [execKey, entry] of state.runningByExecution.entries()) {
1365
- const age = deadline - (entry.run.startedAt || entry.run.queuedAt)
1366
- // If the run has an explicit maxRuntimeMs, the existing setTimeout handles it;
1367
- // the watchdog only kicks in at 1.5× as a safety net.
1368
- if (entry.maxRuntimeMs && age < entry.maxRuntimeMs * 1.5) continue
1369
- if (age < STUCK_RUN_THRESHOLD_MS) continue
1370
-
1371
- abortSessionRuntime(entry, 'Watchdog: run exceeded maximum allowed duration')
1372
- state.runningByExecution.delete(execKey)
1373
- decrementNonHeartbeatWork(entry)
1374
- reconcileSessionActivityLease(entry.run.sessionId)
1375
- aborted++
1376
- }
1377
-
1378
- // 2. Persisted runs marked running but with no in-memory entry (orphaned by HMR/crash)
1379
- const persistedRunning = listPersistedRuns({ status: 'running' })
1380
- for (const run of persistedRunning) {
1381
- const execKey = run.recoveryPayload?.executionGroupKey || executionKeyForSession(run.sessionId)
1382
- const inMemory = state.runningByExecution.get(execKey)
1383
- if (inMemory && inMemory.run.id === run.id) continue // still tracked
1384
-
1385
- const age = deadline - (run.startedAt || run.queuedAt)
1386
- if (age < STUCK_RUN_THRESHOLD_MS) continue
1387
-
1388
- markPersistedRunInterrupted(run, 'Watchdog: orphaned run detected after server restart or HMR')
1389
- aborted++
1390
-
1391
- // Re-enqueue if the source is recoverable and no other run is already in-flight for this session
1392
- const alreadyRunning = state.runningByExecution.has(execKey)
1393
- const alreadyQueued = (state.queueByExecution.get(execKey) || []).some(e => e.run.sessionId === run.sessionId)
1394
- if (run.recoveryPayload && isRestartRecoverableSource(run.source) && !alreadyRunning && !alreadyQueued) {
1395
- try {
1396
- const payload = run.recoveryPayload
1397
- enqueueSessionRun({
1398
- sessionId: run.sessionId,
1399
- message: payload.message,
1400
- imagePath: payload.imagePath,
1401
- imageUrl: payload.imageUrl,
1402
- attachedFiles: payload.attachedFiles,
1403
- internal: payload.internal,
1404
- source: payload.source,
1405
- mode: normalizeMode(payload.mode, payload.internal),
1406
- dedupeKey: run.dedupeKey,
1407
- maxRuntimeMs: payload.maxRuntimeMs,
1408
- modelOverride: payload.modelOverride,
1409
- heartbeatConfig: payload.heartbeatConfig,
1410
- replyToId: payload.replyToId,
1411
- executionGroupKey: payload.executionGroupKey,
1412
- recoveredFromRestart: true,
1413
- recoveredFromRunId: run.id,
1414
- })
1415
- } catch (err: unknown) {
1416
- log.warn('session-run', `Watchdog: failed to re-enqueue orphaned run ${run.id}`, {
1417
- sessionId: run.sessionId,
1418
- error: errorMessage(err),
1419
- })
1420
- }
1421
- }
1422
- }
1423
-
1424
- return { aborted }
131
+ ensureRecoveredPersistedRuns()
132
+ return sweepStuckRunsInternal(enqueueSessionRun)
1425
133
  }
1426
134
 
1427
135
  export function resetSessionRunManagerForTests(): void {
1428
- recoveryState.completed = false
1429
- for (const timer of state.deferredDrainTimers.values()) clearTimeout(timer)
1430
- state.deferredDrainTimers.clear()
1431
- for (const [sessionId, timer] of state.activityLeaseRenewTimers.entries()) {
1432
- clearInterval(timer)
1433
- releaseRuntimeLock(nonHeartbeatActivityLeaseName(sessionId), SHARED_ACTIVITY_LEASE_OWNER)
1434
- }
1435
- state.activityLeaseRenewTimers.clear()
1436
- state.runningByExecution.clear()
1437
- state.queueByExecution.clear()
1438
- state.runs.clear()
1439
- state.recentRunIds.length = 0
1440
- state.promises.clear()
1441
- state.externalSessionHolds.clear()
1442
- state.nonHeartbeatWorkCount.clear()
1443
- state.lastQueuedAt = 0
136
+ resetSessionRunManagerStateForTests()
1444
137
  }
138
+
139
+ export { hasActiveNonHeartbeatSessionLease }