@swarmclawai/swarmclaw 1.9.37 → 1.9.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +43 -1
- package/package.json +2 -2
- package/src/app/api/chats/[id]/context-status/route.ts +2 -0
- package/src/app/api/chats/context-status-route.test.ts +59 -0
- package/src/app/api/setup/check-provider/route.test.ts +12 -0
- package/src/app/api/setup/check-provider/route.ts +6 -0
- package/src/lib/providers/index.ts +23 -0
- package/src/lib/server/autonomy/supervisor-reflection.test.ts +10 -1
- package/src/lib/server/connectors/outbox.ts +22 -2
- package/src/lib/server/context-manager.ts +4 -0
- package/src/lib/server/openrouter-model-context.test.ts +205 -0
- package/src/lib/server/openrouter-model-context.ts +169 -0
- package/src/lib/server/provider-health.ts +1 -0
- package/src/lib/server/runtime/queue/core.ts +160 -18
- package/src/lib/server/runtime/queue/orphan-recovery.test.ts +49 -0
- package/src/lib/server/runtime/queue/orphan-recovery.ts +32 -0
- package/src/lib/server/runtime/scheduled-run-preflight.test.ts +73 -0
- package/src/lib/server/runtime/scheduled-run-preflight.ts +83 -0
- package/src/lib/server/schedules/schedule-lifecycle.test.ts +44 -0
- package/src/lib/server/schedules/schedule-lifecycle.ts +27 -0
- package/src/lib/server/storage-normalization.ts +13 -0
- package/src/lib/server/tasks/task-followups.test.ts +124 -41
- package/src/lib/server/tasks/task-followups.ts +28 -3
- package/src/lib/server/tasks/task-lifecycle.test.ts +25 -0
- package/src/lib/server/tasks/task-lifecycle.ts +6 -0
- package/src/lib/server/tasks/task-result.test.ts +25 -1
- package/src/lib/server/tasks/task-result.ts +22 -0
- package/src/lib/server/workspace-paths.test.ts +72 -0
- package/src/lib/server/workspace-paths.ts +60 -0
- package/src/lib/setup-defaults.test.ts +10 -1
- package/src/lib/setup-defaults.ts +20 -0
- package/src/types/provider.ts +1 -1
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
import fs from 'node:fs/promises'
|
|
2
|
+
import path from 'node:path'
|
|
3
|
+
|
|
4
|
+
import { fetchWithTimeout } from '@/lib/fetch-timeout'
|
|
5
|
+
import { DATA_DIR } from '@/lib/server/data-dir'
|
|
6
|
+
|
|
7
|
+
/**
 * Subset of one entry from the OpenRouter models listing that this module
 * reads. Every field is optional because entries are copied field-by-field
 * from untyped JSON and anything with an unexpected type is left out.
 */
interface OpenRouterModelEntry {
  /** Model identifier; used as the key of the context-length map. */
  id?: string
  /** Entry-level context length; used when `top_provider` supplies none. */
  context_length?: number
  /** Provider-specific details; its `context_length` is consulted first. */
  top_provider?: { context_length?: number }
}
|
|
14
|
+
|
|
15
|
+
/**
 * Validated shape of the models listing payload. `data` is absent when the
 * raw payload was not an object carrying a `data` array.
 */
interface OpenRouterModelsResponse {
  data?: OpenRouterModelEntry[]
}
|
|
18
|
+
|
|
19
|
+
/** Snapshot of model context lengths, held in memory and mirrored to disk as JSON. */
interface OpenRouterModelContextCache {
  /** `Date.now()` timestamp (ms) taken when the snapshot was fetched. */
  loadedAt: number
  /** Context length keyed by model id; only finite, positive values are stored. */
  models: Record<string, number>
}
|
|
23
|
+
|
|
24
|
+
/** Endpoint that lists OpenRouter models. */
const OPENROUTER_MODELS_URL = 'https://openrouter.ai/api/v1/models'

/** A snapshot is considered fresh for 24 hours. */
const CACHE_TTL_MS = 1000 * 60 * 60 * 24

/** Upper bound, in milliseconds, for the models request. */
const FETCH_TIMEOUT_MS = 2_000

/** On-disk location of the persisted snapshot, inside the data directory. */
const CACHE_PATH = path.join(DATA_DIR, 'openrouter-model-context.json')

/** In-memory snapshot; `null` until a load has succeeded. */
let cache: OpenRouterModelContextCache | null = null

/** In-flight load shared by concurrent callers; `null` when no load is running. */
let loading: Promise<void> | null = null
|
|
31
|
+
|
|
32
|
+
/**
 * Type guard for plain keyed objects: accepts any non-null object that is
 * not an array, and rejects primitives, `null`, and arrays.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) return false
  return typeof value === 'object'
}
|
|
35
|
+
|
|
36
|
+
/**
 * Copies the recognised, correctly-typed fields of one raw model entry into
 * a typed object.
 *
 * @param value - One untyped element of the payload's `data` array.
 * @returns `null` when `value` is not a plain object; otherwise an entry
 *   holding only the fields whose runtime type matched. When `top_provider`
 *   is an object it is always present on the result, possibly empty.
 */
function parseModelEntry(value: unknown): OpenRouterModelEntry | null {
  if (!isRecord(value)) return null

  const { id, context_length: contextLength, top_provider: rawTopProvider } = value
  const entry: OpenRouterModelEntry = {}

  if (typeof id === 'string') entry.id = id
  if (typeof contextLength === 'number') entry.context_length = contextLength
  if (!isRecord(rawTopProvider)) return entry

  const providerContextLength = rawTopProvider.context_length
  entry.top_provider = typeof providerContextLength === 'number'
    ? { context_length: providerContextLength }
    : {}
  return entry
}
|
|
53
|
+
|
|
54
|
+
/**
 * Validates the raw models payload.
 *
 * @param value - Parsed JSON body of the models request.
 * @returns `{}` when the payload is not an object with a `data` array;
 *   otherwise `{ data }` holding every element that parsed as an entry.
 */
function parseModelsResponse(value: unknown): OpenRouterModelsResponse {
  if (!isRecord(value)) return {}

  const rawEntries = value.data
  if (!Array.isArray(rawEntries)) return {}

  const data: OpenRouterModelEntry[] = []
  for (const rawEntry of rawEntries) {
    const parsed = parseModelEntry(rawEntry)
    if (parsed !== null) data.push(parsed)
  }
  return { data }
}
|
|
62
|
+
|
|
63
|
+
/**
 * Validates a persisted snapshot read back from disk.
 *
 * @param value - Parsed JSON content of the cache file.
 * @returns `null` unless `value` is an object with a numeric `loadedAt` and
 *   an object `models`; otherwise a snapshot keeping only the model entries
 *   whose value is a finite, positive number.
 */
function parseCache(value: unknown): OpenRouterModelContextCache | null {
  if (!isRecord(value)) return null

  const { loadedAt, models: rawModels } = value
  if (typeof loadedAt !== 'number' || !isRecord(rawModels)) return null

  const models: Record<string, number> = {}
  for (const id of Object.keys(rawModels)) {
    const contextLength = rawModels[id]
    if (typeof contextLength !== 'number') continue
    if (!Number.isFinite(contextLength) || !(contextLength > 0)) continue
    models[id] = contextLength
  }

  return { loadedAt, models }
}
|
|
77
|
+
|
|
78
|
+
/**
 * Tells whether a snapshot exists and is still within its time-to-live.
 *
 * @param value - Snapshot to check, or `null` when none is loaded.
 * @returns `true` only when `loadedAt` is finite, not in the future, and at
 *   most `CACHE_TTL_MS` old.
 */
function isFreshCache(value: OpenRouterModelContextCache | null): value is OpenRouterModelContextCache {
  if (value === null || !Number.isFinite(value.loadedAt)) return false

  const ageMs = Date.now() - value.loadedAt
  // A negative age means the stamp lies in the future (clock moved backwards,
  // or the cache file was edited or corrupted). It would otherwise satisfy
  // the TTL comparison indefinitely, so treat it as stale and reload.
  if (ageMs < 0) return false

  return ageMs <= CACHE_TTL_MS
}
|
|
83
|
+
|
|
84
|
+
/**
 * Loads the persisted snapshot from `CACHE_PATH`.
 *
 * @returns The snapshot when the file exists, parses, validates, and is
 *   still fresh; `null` in every other case, including read or JSON errors.
 */
async function readCache(): Promise<OpenRouterModelContextCache | null> {
  try {
    const text = await fs.readFile(CACHE_PATH, 'utf8')
    const candidate = parseCache(JSON.parse(text))
    if (isFreshCache(candidate)) return candidate
  } catch {
    // A missing or unreadable cache file is equivalent to having no cache.
  }
  return null
}
|
|
93
|
+
|
|
94
|
+
/**
 * Persists a snapshot to `CACHE_PATH`, creating the data directory first.
 * Failures are swallowed on purpose: the disk copy is an optimisation only.
 *
 * @param nextCache - Snapshot to serialise as JSON.
 */
async function writeCache(nextCache: OpenRouterModelContextCache): Promise<void> {
  try {
    await fs.mkdir(DATA_DIR, { recursive: true })
    const serialized = JSON.stringify(nextCache)
    await fs.writeFile(CACHE_PATH, serialized, 'utf8')
  } catch {
    // Best-effort cache. Runtime behavior should not depend on disk writes.
  }
}
|
|
102
|
+
|
|
103
|
+
/**
 * Builds the model-id → context-length map from a validated models payload.
 *
 * For each entry with an id, the first usable value among
 * `top_provider.context_length` and `context_length` is stored, where
 * "usable" means a finite number greater than zero. Choosing by validity
 * rather than truthiness means an unusable provider value (negative or
 * infinite) no longer hides a valid entry-level value.
 *
 * @param response - Validated payload; a missing `data` yields an empty map.
 * @returns Map containing only models for which a usable value was found.
 */
function buildModelContextMap(response: OpenRouterModelsResponse): Record<string, number> {
  const isUsable = (candidate: number | undefined): candidate is number =>
    typeof candidate === 'number' && Number.isFinite(candidate) && candidate > 0

  const models: Record<string, number> = {}
  for (const entry of response.data ?? []) {
    if (!entry.id) continue
    const contextLength = [entry.top_provider?.context_length, entry.context_length].find(isUsable)
    if (contextLength !== undefined) models[entry.id] = contextLength
  }
  return models
}
|
|
114
|
+
|
|
115
|
+
/**
 * Downloads the models listing and converts it into a snapshot.
 *
 * @returns A snapshot stamped with the current time, or `null` when the
 *   request fails, times out, returns a non-OK status, has an unparsable
 *   body, or yields no usable model at all.
 */
async function fetchOpenRouterModels(): Promise<OpenRouterModelContextCache | null> {
  try {
    const response = await fetchWithTimeout(OPENROUTER_MODELS_URL, {}, FETCH_TIMEOUT_MS)
    if (!response.ok) return null

    const models = buildModelContextMap(parseModelsResponse(await response.json()))
    // An OK response with an unexpected shape parses to zero models. Report
    // that as a failed fetch so the empty map is not cached (in memory and
    // on disk) as "fresh" for the whole TTL.
    if (Object.keys(models).length === 0) return null

    return { loadedAt: Date.now(), models }
  } catch {
    return null
  }
}
|
|
129
|
+
|
|
130
|
+
/**
 * Fills the in-memory snapshot, preferring the fresh on-disk copy and
 * falling back to the network. A snapshot obtained from the network is also
 * written back to disk. When both sources fail the in-memory snapshot is
 * left untouched.
 */
async function loadOpenRouterModelContextCache(): Promise<void> {
  const fromDisk = await readCache()
  const snapshot = fromDisk ?? await fetchOpenRouterModels()
  if (!snapshot) return

  cache = snapshot
  // Only a snapshot that came from the network needs persisting.
  if (snapshot !== fromDisk) await writeCache(snapshot)
}
|
|
143
|
+
|
|
144
|
+
/**
 * Looks up a model's context length in the in-memory snapshot. This never
 * triggers a load; call `ensureOpenRouterModelContextCache` beforehand.
 *
 * @param provider - Provider id; anything other than `'openrouter'` yields `null`.
 * @param model - Model id, either fully qualified (`vendor/name`) or bare (`name`).
 * @returns The context length for an exact id match. For a bare name, the
 *   value of the single cached id ending in `/<name>`. `null` when the
 *   snapshot is missing or stale, nothing matches, or a bare name is
 *   ambiguous.
 */
export function getCachedOpenRouterContextWindow(provider: string, model: string): number | null {
  if (provider !== 'openrouter' || !isFreshCache(cache)) return null

  const { models } = cache

  // Own-property check: `models` is a plain object, so a bare index lookup
  // for a name such as "constructor" or "toString" would return an inherited
  // Object.prototype member (truthy, but not a number).
  if (Object.prototype.hasOwnProperty.call(models, model)) {
    const exactMatch = models[model]
    if (exactMatch) return exactMatch
  }

  // A qualified id either matches exactly or not at all.
  if (model.includes('/')) return null

  const suffix = `/${model}`
  const suffixMatches = Object.entries(models)
    .filter(([id]) => id.endsWith(suffix))
    .map(([, contextLength]) => contextLength)

  return suffixMatches.length === 1 ? suffixMatches[0] : null
}
|
|
158
|
+
|
|
159
|
+
/**
 * Makes sure a load of the OpenRouter snapshot has been attempted.
 *
 * Returns immediately for any provider other than `'openrouter'` or when the
 * in-memory snapshot is still fresh. Otherwise it awaits a load; callers
 * arriving while one is in flight share that same load.
 *
 * @param provider - Provider id of the caller's model.
 */
export async function ensureOpenRouterModelContextCache(provider: string): Promise<void> {
  if (provider !== 'openrouter') return
  if (isFreshCache(cache)) return

  loading ??= loadOpenRouterModelContextCache().finally(() => {
    loading = null
  })

  await loading
}
|
|
@@ -261,6 +261,7 @@ async function parseErrorMessage(res: Response, fallback: string): Promise<strin
|
|
|
261
261
|
export const OPENAI_COMPATIBLE_DEFAULTS: Record<string, { name: string; defaultEndpoint: string }> = {
|
|
262
262
|
openai: { name: 'OpenAI', defaultEndpoint: 'https://api.openai.com/v1' },
|
|
263
263
|
openrouter: { name: 'OpenRouter', defaultEndpoint: 'https://openrouter.ai/api/v1' },
|
|
264
|
+
tokenmix: { name: 'TokenMix', defaultEndpoint: 'https://api.tokenmix.ai/v1' },
|
|
264
265
|
google: { name: 'Google Gemini', defaultEndpoint: 'https://generativelanguage.googleapis.com/v1beta/openai' },
|
|
265
266
|
deepseek: { name: 'DeepSeek', defaultEndpoint: 'https://api.deepseek.com/v1' },
|
|
266
267
|
groq: { name: 'Groq', defaultEndpoint: 'https://api.groq.com/openai/v1' },
|
|
@@ -8,7 +8,8 @@ import { logActivity } from '@/lib/server/activity/activity-log'
|
|
|
8
8
|
import { loadAgents } from '@/lib/server/agents/agent-repository'
|
|
9
9
|
import { withTransaction } from '@/lib/server/persistence/transaction'
|
|
10
10
|
import { loadQueue, saveQueue } from '@/lib/server/runtime/queue-repository'
|
|
11
|
-
import { loadSchedules, saveSchedules } from '@/lib/server/schedules/schedule-repository'
|
|
11
|
+
import { loadSchedules, saveSchedules, upsertSchedule } from '@/lib/server/schedules/schedule-repository'
|
|
12
|
+
import { applyScheduleRunOutcome } from '@/lib/server/schedules/schedule-lifecycle'
|
|
12
13
|
import { loadSessions, saveSessions } from '@/lib/server/sessions/session-repository'
|
|
13
14
|
import { loadSettings } from '@/lib/server/settings/settings-repository'
|
|
14
15
|
import { loadTasks, saveTasks } from '@/lib/server/tasks/task-repository'
|
|
@@ -16,13 +17,20 @@ import { notify } from '@/lib/server/ws-hub'
|
|
|
16
17
|
import { getMessages, getLastMessage, appendMessage } from '@/lib/server/messages/message-repository'
|
|
17
18
|
import { perf } from '@/lib/server/runtime/perf'
|
|
18
19
|
import { WORKSPACE_DIR } from '@/lib/server/data-dir'
|
|
20
|
+
import { normalizeLegacyWorkspacePath } from '@/lib/server/workspace-paths'
|
|
21
|
+
import {
|
|
22
|
+
MAX_ORPHAN_RECOVERY_ATTEMPTS,
|
|
23
|
+
pruneOrphanRecovery,
|
|
24
|
+
trackOrphanRecovery,
|
|
25
|
+
} from '@/lib/server/runtime/queue/orphan-recovery'
|
|
26
|
+
import { preflightProviderCredential } from '@/lib/server/runtime/scheduled-run-preflight'
|
|
19
27
|
import { createAgentTaskSession } from '@/lib/server/agents/task-session'
|
|
20
28
|
import { formatValidationFailure } from '@/lib/server/tasks/task-validation'
|
|
21
29
|
import { pushMainLoopEventToMainSessions } from '@/lib/server/agents/main-agent-loop'
|
|
22
30
|
import type { ExecuteChatTurnResult } from '@/lib/server/chat-execution/chat-execution-types'
|
|
23
31
|
import { checkAgentBudgetLimits } from '@/lib/server/cost'
|
|
24
32
|
import { enqueueExecution } from '@/lib/server/execution-engine'
|
|
25
|
-
import { extractTaskResult, formatResultBody } from '@/lib/server/tasks/task-result'
|
|
33
|
+
import { classifyEmptyRunOutcome, EMPTY_RUN_OUTCOME_MESSAGE, extractTaskResult, formatResultBody } from '@/lib/server/tasks/task-result'
|
|
26
34
|
import { checkoutTask } from '@/lib/server/tasks/task-checkout'
|
|
27
35
|
import { queueSwarmFeedTaskCompletionWake } from '@/lib/server/swarmfeed-runtime'
|
|
28
36
|
import {
|
|
@@ -64,6 +72,7 @@ const _queueState = hmrSingleton('__swarmclaw_queue__', () => ({
|
|
|
64
72
|
activeCount: 0,
|
|
65
73
|
maxConcurrent: 3,
|
|
66
74
|
pendingKick: false,
|
|
75
|
+
orphanRecoveryAttempts: {} as Record<string, number>,
|
|
67
76
|
}))
|
|
68
77
|
|
|
69
78
|
function normalizeInt(value: unknown, fallback: number, min: number, max: number): number {
|
|
@@ -499,7 +508,10 @@ function inferWorkspaceProjectCwd(task: Pick<BoardTask, 'title' | 'description'
|
|
|
499
508
|
function resolveTaskExecutionCwd(task: ScheduleTaskMeta, sessions: Record<string, SessionLike>): string {
|
|
500
509
|
const workspaceRoot = path.resolve(WORKSPACE_DIR)
|
|
501
510
|
|
|
502
|
-
const explicitCwd = normalizeDirCandidate(
|
|
511
|
+
const explicitCwd = normalizeDirCandidate(
|
|
512
|
+
normalizeLegacyWorkspacePath(typeof task.cwd === 'string' ? task.cwd : '', { workspaceRoot, taskId: task.id }),
|
|
513
|
+
workspaceRoot,
|
|
514
|
+
)
|
|
503
515
|
if (explicitCwd) return explicitCwd
|
|
504
516
|
|
|
505
517
|
const projectId = typeof task.projectId === 'string' ? task.projectId.trim() : ''
|
|
@@ -520,13 +532,19 @@ function resolveTaskExecutionCwd(task: ScheduleTaskMeta, sessions: Record<string
|
|
|
520
532
|
|
|
521
533
|
const sourceSessionId = typeof task.createdInSessionId === 'string' ? task.createdInSessionId.trim() : ''
|
|
522
534
|
const sourceSessionCwd = sourceSessionId
|
|
523
|
-
? normalizeDirCandidate(
|
|
535
|
+
? normalizeDirCandidate(
|
|
536
|
+
normalizeLegacyWorkspacePath(sessions[sourceSessionId]?.cwd, { workspaceRoot, taskId: task.id }),
|
|
537
|
+
workspaceRoot,
|
|
538
|
+
)
|
|
524
539
|
: null
|
|
525
540
|
if (sourceSessionCwd && path.resolve(sourceSessionCwd) !== workspaceRoot) return sourceSessionCwd
|
|
526
541
|
|
|
527
542
|
const runSessionId = typeof task.sessionId === 'string' ? task.sessionId.trim() : ''
|
|
528
543
|
const runSessionCwd = runSessionId
|
|
529
|
-
? normalizeDirCandidate(
|
|
544
|
+
? normalizeDirCandidate(
|
|
545
|
+
normalizeLegacyWorkspacePath(sessions[runSessionId]?.cwd, { workspaceRoot, taskId: task.id }),
|
|
546
|
+
workspaceRoot,
|
|
547
|
+
)
|
|
530
548
|
: null
|
|
531
549
|
if (runSessionCwd && path.resolve(runSessionCwd) !== workspaceRoot) return runSessionCwd
|
|
532
550
|
|
|
@@ -708,6 +726,7 @@ export function reconcileFinishedRunningTasks(): { reconciled: number; deadLette
|
|
|
708
726
|
if (!fallbackText && !task.result) {
|
|
709
727
|
task.status = 'failed'
|
|
710
728
|
task.result = 'Agent session finished without producing output.'
|
|
729
|
+
task.error = EMPTY_RUN_OUTCOME_MESSAGE.slice(0, 500)
|
|
711
730
|
task.checkoutRunId = null
|
|
712
731
|
task.updatedAt = now
|
|
713
732
|
tasksDirty = true
|
|
@@ -854,7 +873,20 @@ function deliverTaskConnectorFollowups(task: BoardTask, sessions: Record<string,
|
|
|
854
873
|
})
|
|
855
874
|
}
|
|
856
875
|
|
|
876
|
+
/**
 * Reflects a terminal scheduled-run outcome back onto the originating schedule.
 *
 * Does nothing when the task carries no `sourceScheduleId`, when that
 * schedule no longer exists, or when `applyScheduleRunOutcome` returns a
 * falsy value. Otherwise the schedule is persisted and `'schedules'`
 * listeners are notified.
 */
function recordScheduleRunOutcome(task: BoardTask): void {
  const rawScheduleId = (task as ScheduleTaskMeta).sourceScheduleId
  if (typeof rawScheduleId !== 'string') return

  const scheduleId = rawScheduleId.trim()
  if (scheduleId === '') return

  const schedule = loadSchedules()[scheduleId]
  if (!schedule) return

  const changed = applyScheduleRunOutcome(schedule, task, Date.now())
  if (!changed) return

  upsertSchedule(scheduleId, schedule)
  notify('schedules')
}
|
|
887
|
+
|
|
857
888
|
function handleTerminalTaskResultDeliveries(task: BoardTask): void {
|
|
889
|
+
recordScheduleRunOutcome(task)
|
|
858
890
|
const sessions = loadSessions() as Record<string, SessionLike>
|
|
859
891
|
pushUserFacingTaskResult(task, sessions)
|
|
860
892
|
deliverTaskConnectorFollowups(task, sessions)
|
|
@@ -1114,25 +1146,68 @@ export async function processNext() {
|
|
|
1114
1146
|
const allTasks = loadTasks()
|
|
1115
1147
|
const currentQueue = loadQueue()
|
|
1116
1148
|
const queueSet = new Set(currentQueue)
|
|
1149
|
+
// Backfill for hmrSingleton state created before this field existed
|
|
1150
|
+
_queueState.orphanRecoveryAttempts ??= {}
|
|
1151
|
+
const orphanAttempts = _queueState.orphanRecoveryAttempts
|
|
1152
|
+
const stillOrphanedIds = new Set<string>()
|
|
1153
|
+
const deadLetteredOrphans: BoardTask[] = []
|
|
1117
1154
|
let recovered = false
|
|
1118
1155
|
let tasksDirty = false
|
|
1119
1156
|
for (const [id, t] of Object.entries(allTasks) as [string, BoardTask][]) {
|
|
1120
|
-
if (t.status
|
|
1157
|
+
if (t.status !== 'queued' || queueSet.has(id)) continue
|
|
1158
|
+
const decision = trackOrphanRecovery(orphanAttempts, id)
|
|
1159
|
+
if (decision.action === 'dead_letter') {
|
|
1160
|
+
// Recovery keeps re-queueing this task but it never starts. Stop the
|
|
1161
|
+
// loop with one terminal reason instead of spamming recovery forever.
|
|
1162
|
+
const now = Date.now()
|
|
1163
|
+
t.status = 'failed'
|
|
1164
|
+
t.deadLetteredAt = now
|
|
1165
|
+
t.retryScheduledAt = null
|
|
1166
|
+
t.checkoutRunId = null
|
|
1167
|
+
t.updatedAt = now
|
|
1168
|
+
t.error = `Orphan recovery exhausted after ${MAX_ORPHAN_RECOVERY_ATTEMPTS} attempts: task repeatedly returned to "queued" without starting.`
|
|
1169
|
+
if (!t.comments) t.comments = []
|
|
1170
|
+
t.comments.push({
|
|
1171
|
+
id: genId(),
|
|
1172
|
+
author: 'System',
|
|
1173
|
+
text: t.error,
|
|
1174
|
+
createdAt: now,
|
|
1175
|
+
})
|
|
1176
|
+
delete orphanAttempts[id]
|
|
1177
|
+
tasksDirty = true
|
|
1178
|
+
deadLetteredOrphans.push(t)
|
|
1179
|
+
log.warn(TAG, `[queue] Dead-lettered orphaned queued task after ${decision.attempt - 1} recovery attempts: "${t.title}" (${id})`)
|
|
1180
|
+
continue
|
|
1181
|
+
}
|
|
1182
|
+
stillOrphanedIds.add(id)
|
|
1183
|
+
if (decision.firstAttempt) {
|
|
1121
1184
|
log.info(TAG, `[queue] Recovering orphaned queued task: "${t.title}" (${id})`)
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1185
|
+
} else {
|
|
1186
|
+
log.debug(TAG, `[queue] Re-recovering orphaned queued task (attempt ${decision.attempt}): "${t.title}" (${id})`)
|
|
1187
|
+
}
|
|
1188
|
+
// Defence in depth: a queued task must not carry a stale checkoutRunId
|
|
1189
|
+
// (left over from pre-1.5.38 retries). If it does, checkoutTask() will
|
|
1190
|
+
// reject every attempt and this orphan-recovery loop will spin at 100%
|
|
1191
|
+
// CPU re-queueing a task that can never run.
|
|
1192
|
+
if (t.checkoutRunId) {
|
|
1193
|
+
t.checkoutRunId = null
|
|
1194
|
+
tasksDirty = true
|
|
1132
1195
|
}
|
|
1196
|
+
pushQueueUnique(currentQueue, id)
|
|
1197
|
+
recovered = true
|
|
1133
1198
|
}
|
|
1199
|
+
pruneOrphanRecovery(orphanAttempts, stillOrphanedIds)
|
|
1134
1200
|
if (tasksDirty) saveTasks(allTasks)
|
|
1135
1201
|
if (recovered) saveQueue(currentQueue)
|
|
1202
|
+
for (const t of deadLetteredOrphans) {
|
|
1203
|
+
notify('tasks')
|
|
1204
|
+
logActivity({ entityType: 'task', entityId: t.id, action: 'failed', actor: 'system', actorId: t.agentId, summary: `Task failed: "${t.title}" (orphan recovery exhausted)` })
|
|
1205
|
+
pushMainLoopEventToMainSessions({
|
|
1206
|
+
type: 'task_failed',
|
|
1207
|
+
text: `Task failed: "${t.title}" (${t.id}): orphan recovery exhausted.`,
|
|
1208
|
+
})
|
|
1209
|
+
handleTerminalTaskResultDeliveries(t)
|
|
1210
|
+
}
|
|
1136
1211
|
}
|
|
1137
1212
|
|
|
1138
1213
|
// Process ONE task per invocation (no while loop)
|
|
@@ -1261,6 +1336,61 @@ export async function processNext() {
|
|
|
1261
1336
|
} catch {}
|
|
1262
1337
|
}
|
|
1263
1338
|
|
|
1339
|
+
// Credential preflight for scheduled runs: fail fast with an actionable
|
|
1340
|
+
// error instead of letting the schedule die on a 401 deep in execution.
|
|
1341
|
+
// Retries cannot succeed without a key, so this dead-letters immediately.
|
|
1342
|
+
if ((task as ScheduleTaskMeta).sourceType === 'schedule') {
|
|
1343
|
+
const preflight = preflightProviderCredential({
|
|
1344
|
+
provider: typedAgent.provider,
|
|
1345
|
+
ollamaMode: typedAgent.ollamaMode ?? null,
|
|
1346
|
+
credentialId: typedAgent.credentialId ?? null,
|
|
1347
|
+
fallbackCredentialIds: typedAgent.fallbackCredentialIds || null,
|
|
1348
|
+
})
|
|
1349
|
+
if (!preflight.ok) {
|
|
1350
|
+
const now = Date.now()
|
|
1351
|
+
task.status = 'failed'
|
|
1352
|
+
task.deadLetteredAt = now
|
|
1353
|
+
task.retryScheduledAt = null
|
|
1354
|
+
task.checkoutRunId = null
|
|
1355
|
+
task.error = preflight.error.slice(0, 500)
|
|
1356
|
+
task.updatedAt = now
|
|
1357
|
+
if (!task.comments) task.comments = []
|
|
1358
|
+
task.comments.push({
|
|
1359
|
+
id: genId(),
|
|
1360
|
+
author: 'System',
|
|
1361
|
+
text: preflight.error,
|
|
1362
|
+
createdAt: now,
|
|
1363
|
+
})
|
|
1364
|
+
saveTasks(latestTasks)
|
|
1365
|
+
notify('tasks')
|
|
1366
|
+
const failure = classifyRuntimeFailure({ source: 'task', message: preflight.error })
|
|
1367
|
+
recordSupervisorIncident({
|
|
1368
|
+
runId: task.id,
|
|
1369
|
+
sessionId: task.sessionId || '',
|
|
1370
|
+
taskId: task.id,
|
|
1371
|
+
agentId: typedAgent.id,
|
|
1372
|
+
source: 'task',
|
|
1373
|
+
kind: 'runtime_failure',
|
|
1374
|
+
severity: failure.severity,
|
|
1375
|
+
summary: `Scheduled run blocked by credential preflight: ${preflight.error}`.slice(0, 320),
|
|
1376
|
+
details: preflight.error,
|
|
1377
|
+
failureFamily: failure.family,
|
|
1378
|
+
remediation: failure.remediation,
|
|
1379
|
+
repairPrompt: failure.repairPrompt,
|
|
1380
|
+
autoAction: null,
|
|
1381
|
+
})
|
|
1382
|
+
logActivity({ entityType: 'task', entityId: task.id, action: 'failed', actor: 'system', actorId: typedAgent.id, summary: `Task failed credential preflight: "${task.title}"` })
|
|
1383
|
+
pushMainLoopEventToMainSessions({
|
|
1384
|
+
type: 'task_failed',
|
|
1385
|
+
text: `Task failed: "${task.title}" (${task.id}): ${preflight.error.slice(0, 200)}`,
|
|
1386
|
+
})
|
|
1387
|
+
handleTerminalTaskResultDeliveries(task)
|
|
1388
|
+
cleanupTerminalOneOffSchedule(task)
|
|
1389
|
+
log.warn(TAG, `[queue] Scheduled task "${task.title}" (${taskId}) failed credential preflight: ${preflight.error}`)
|
|
1390
|
+
return
|
|
1391
|
+
}
|
|
1392
|
+
}
|
|
1393
|
+
|
|
1264
1394
|
// Atomic checkout — prevents two runners from starting the same task
|
|
1265
1395
|
const runId = genId()
|
|
1266
1396
|
task = checkoutTask(taskId, runId) as BoardTask | undefined
|
|
@@ -1296,8 +1426,17 @@ export async function processNext() {
|
|
|
1296
1426
|
: ''
|
|
1297
1427
|
if (existingSessionId) {
|
|
1298
1428
|
const sessions = loadSessions()
|
|
1299
|
-
|
|
1429
|
+
const existingSession = sessions[existingSessionId]
|
|
1430
|
+
if (existingSession) {
|
|
1300
1431
|
sessionId = existingSessionId
|
|
1432
|
+
// Rebind sessions still pinned to a legacy workspace root (e.g. a
|
|
1433
|
+
// pre-migration ~/.swarmclaw/workspace path) onto the current root.
|
|
1434
|
+
const sessionCwd = typeof existingSession.cwd === 'string' ? existingSession.cwd : ''
|
|
1435
|
+
if (sessionCwd && normalizeLegacyWorkspacePath(sessionCwd, { taskId: task.id }) !== sessionCwd) {
|
|
1436
|
+
existingSession.cwd = taskCwd
|
|
1437
|
+
saveSessions(sessions)
|
|
1438
|
+
log.info(TAG, `[queue] Rebound stale schedule session cwd to ${taskCwd} (session ${existingSessionId})`)
|
|
1439
|
+
}
|
|
1301
1440
|
}
|
|
1302
1441
|
}
|
|
1303
1442
|
if (!sessionId) {
|
|
@@ -1467,7 +1606,10 @@ export async function processNext() {
|
|
|
1467
1606
|
createdAt: now,
|
|
1468
1607
|
})
|
|
1469
1608
|
} else {
|
|
1470
|
-
|
|
1609
|
+
// A run with no text, no tool calls, and no error gets an actionable
|
|
1610
|
+
// reason instead of the generic "Result summary is empty." message.
|
|
1611
|
+
const emptyRunReason = classifyEmptyRunOutcome(taskRun)
|
|
1612
|
+
const failureReason = (emptyRunReason || formatValidationFailure(validation.reasons)).slice(0, 500)
|
|
1471
1613
|
const retryState = scheduleRetryOrDeadLetter(t2[taskId], failureReason)
|
|
1472
1614
|
t2[taskId].completedAt = retryState === 'dead_lettered' ? null : t2[taskId].completedAt
|
|
1473
1615
|
t2[taskId].comments!.push({
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import test from 'node:test'
|
|
2
|
+
import assert from 'node:assert/strict'
|
|
3
|
+
|
|
4
|
+
import {
|
|
5
|
+
MAX_ORPHAN_RECOVERY_ATTEMPTS,
|
|
6
|
+
pruneOrphanRecovery,
|
|
7
|
+
trackOrphanRecovery,
|
|
8
|
+
} from './orphan-recovery'
|
|
9
|
+
|
|
10
|
+
// The first three scans all recover; only the very first is flagged as such.
test('allows recovery for the first attempts and flags only the first one', () => {
  const counters: Record<string, number> = {}

  for (const attempt of [1, 2, 3]) {
    assert.deepEqual(
      trackOrphanRecovery(counters, 'task-1'),
      { action: 'recover', attempt, firstAttempt: attempt === 1 },
    )
  }
})

// A counter already at the cap tips into dead-lettering on the next scan.
test('dead-letters once the attempt cap is exceeded', () => {
  const counters: Record<string, number> = { 'task-1': MAX_ORPHAN_RECOVERY_ATTEMPTS }
  const expected = { action: 'dead_letter', attempt: MAX_ORPHAN_RECOVERY_ATTEMPTS + 1 }

  assert.deepEqual(trackOrphanRecovery(counters, 'task-1'), expected)
})

// Counting one task must not advance the counter of another.
test('tracks tasks independently', () => {
  const counters: Record<string, number> = {}
  trackOrphanRecovery(counters, 'task-1')
  trackOrphanRecovery(counters, 'task-1')

  const { action, attempt } = trackOrphanRecovery(counters, 'task-2')
  assert.equal(action, 'recover')
  assert.equal(attempt, 1)
})

// Only ids in the still-orphaned set keep their counters.
test('prune drops counters for tasks no longer orphaned', () => {
  const counters: Record<string, number> = { 'task-1': 2, 'task-2': 1 }
  const stillOrphaned = new Set(['task-2'])

  pruneOrphanRecovery(counters, stillOrphaned)
  assert.deepEqual(counters, { 'task-2': 1 })
})

// With max = 1 the first scan recovers and the second dead-letters.
test('honors a custom max', () => {
  const counters: Record<string, number> = {}

  for (const expectedAction of ['recover', 'dead_letter']) {
    assert.equal(trackOrphanRecovery(counters, 'task-1', 1).action, expectedAction)
  }
})
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/** Default number of recovery attempts granted before a task is dead-lettered. */
export const MAX_ORPHAN_RECOVERY_ATTEMPTS = 3

/**
 * Verdict for one orphaned task on one scan: re-queue it (`recover`) or give
 * up (`dead_letter`). `attempt` is the 1-based count including this scan.
 */
export type OrphanRecoveryDecision =
  | { action: 'recover'; attempt: number; firstAttempt: boolean }
  | { action: 'dead_letter'; attempt: number }

/**
 * Counts one more recovery attempt for an orphaned queued task and decides
 * whether recovery may continue. The counter in `attempts` is incremented
 * in place on every call, including the call that returns `dead_letter`.
 *
 * @param attempts - Mutable per-task counters, keyed by task id.
 * @param taskId - Task being examined.
 * @param max - Highest attempt number that is still recovered.
 * @returns `recover` while the new count is at most `max`, with
 *   `firstAttempt` set only for count 1; `dead_letter` once it exceeds `max`.
 */
export function trackOrphanRecovery(
  attempts: Record<string, number>,
  taskId: string,
  max: number = MAX_ORPHAN_RECOVERY_ATTEMPTS,
): OrphanRecoveryDecision {
  const previous = attempts[taskId]
  const attempt = (previous ? previous : 0) + 1
  attempts[taskId] = attempt

  return attempt > max
    ? { action: 'dead_letter', attempt }
    : { action: 'recover', attempt, firstAttempt: attempt === 1 }
}
|
|
23
|
+
|
|
24
|
+
/**
 * Removes, in place, the counter of every task id that is absent from
 * `stillOrphanedIds`, so a task that becomes orphaned again later starts
 * counting from scratch.
 *
 * @param attempts - Mutable per-task counters, keyed by task id.
 * @param stillOrphanedIds - Ids whose counters must be kept.
 */
export function pruneOrphanRecovery(
  attempts: Record<string, number>,
  stillOrphanedIds: ReadonlySet<string>,
): void {
  Object.keys(attempts)
    .filter((taskId) => !stillOrphanedIds.has(taskId))
    .forEach((taskId) => {
      delete attempts[taskId]
    })
}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import test from 'node:test'
|
|
2
|
+
import assert from 'node:assert/strict'
|
|
3
|
+
|
|
4
|
+
import {
|
|
5
|
+
preflightProviderCredential,
|
|
6
|
+
type ProviderCredentialPreflightDeps,
|
|
7
|
+
} from './scheduled-run-preflight'
|
|
8
|
+
|
|
9
|
+
/**
 * Builds the dependency bundle used by the preflight tests. The baseline
 * models a provider that requires an API key, echoes the requested
 * credential id, and resolves no secret. Callers override individual
 * dependencies per scenario.
 */
function makeDeps(overrides: Partial<ProviderCredentialPreflightDeps> = {}): ProviderCredentialPreflightDeps {
  const baseline: ProviderCredentialPreflightDeps = {
    getProvider: () => ({ requiresApiKey: true }),
    resolveProviderCredentialId: (input) => input.credentialId || null,
    resolveCredentialSecret: () => null,
  }
  return { ...baseline, ...overrides }
}
|
|
17
|
+
|
|
18
|
+
// Scenarios in which the preflight must let the run through: test name,
// preflight input, and the dependency bundle for that scenario.
const passingCases: ReadonlyArray<[
  string,
  Parameters<typeof preflightProviderCredential>[0],
  ProviderCredentialPreflightDeps,
]> = [
  [
    'passes when the provider does not require an API key',
    { provider: 'ollama' },
    makeDeps({ getProvider: () => ({ requiresApiKey: false }) }),
  ],
  [
    'passes when the provider is unknown',
    { provider: 'mystery' },
    makeDeps({ getProvider: () => null }),
  ],
  [
    'passes when no provider is set',
    { provider: '' },
    makeDeps(),
  ],
  [
    'passes when the resolved credential has a secret',
    { provider: 'openai', credentialId: 'cred-1' },
    makeDeps({ resolveCredentialSecret: (id) => (id === 'cred-1' ? 'sk-test' : null) }),
  ],
  [
    'passes when a fallback credential rescues a dead primary',
    { provider: 'openai', credentialId: 'cred-dead', fallbackCredentialIds: ['cred-live'] },
    makeDeps({ resolveCredentialSecret: (id) => (id === 'cred-live' ? 'sk-test' : null) }),
  ],
  [
    'passes when auto-matching finds another credential for the provider',
    { provider: 'openai', credentialId: 'cred-dead' },
    makeDeps({
      resolveProviderCredentialId: (input) => (input.credentialId ? input.credentialId : 'cred-auto'),
      resolveCredentialSecret: (id) => (id === 'cred-auto' ? 'sk-test' : null),
    }),
  ],
]

for (const [name, input, deps] of passingCases) {
  test(name, () => {
    assert.deepEqual(preflightProviderCredential(input, deps), { ok: true })
  })
}

// With no resolvable secret the error must name the failure, the provider,
// and where to fix it.
test('fails with an actionable error naming the provider when nothing resolves', () => {
  const outcome = preflightProviderCredential({ provider: 'openai', credentialId: 'cred-dead' }, makeDeps())
  assert.equal(outcome.ok, false)
  if (outcome.ok) return

  for (const pattern of [/Provider authentication preflight failed/, /"openai"/, /Settings/]) {
    assert.match(outcome.error, pattern)
  }
})
|