@swarmclawai/swarmclaw 1.9.4 → 1.9.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -0
- package/package.json +2 -2
- package/src/app/api/eval/environments/route.ts +59 -0
- package/src/app/api/eval/run/route.ts +8 -1
- package/src/app/api/eval/suite/route.ts +6 -0
- package/src/app/api/portability/export/route.test.ts +225 -0
- package/src/app/api/portability/export/route.ts +18 -9
- package/src/app/api/portability/import/route.test.ts +232 -31
- package/src/app/api/portability/import/route.ts +2 -2
- package/src/cli/index.js +2 -0
- package/src/components/quality/quality-workspace.tsx +149 -5
- package/src/lib/server/eval/environment-plan.test.ts +221 -0
- package/src/lib/server/eval/environment-plan.ts +498 -0
- package/src/lib/server/eval/runner.ts +53 -3
- package/src/lib/server/eval/scenarios.ts +18 -0
- package/src/lib/server/eval/types.ts +55 -0
- package/src/lib/server/portability/export.ts +244 -38
- package/src/lib/server/portability/import.ts +148 -98
- package/src/lib/validation/schemas.ts +54 -1
|
@@ -0,0 +1,498 @@
|
|
|
1
|
+
import fs from 'node:fs'
|
|
2
|
+
import path from 'node:path'
|
|
3
|
+
|
|
4
|
+
import { WORKSPACE_DIR } from '@/lib/server/data-dir'
|
|
5
|
+
import { resolveAgentRouteCandidatesWithProfiles, type ResolvedAgentRoute } from '@/lib/server/agents/agent-runtime-config'
|
|
6
|
+
import { checkCliProviderReady, type CliProviderReadyResult } from '@/lib/server/cli-provider-readiness'
|
|
7
|
+
import { listOpenClawGatewayEnvironments } from '@/lib/server/gateways/gateway-topology'
|
|
8
|
+
import { loadAgents, loadCredentials } from '@/lib/server/storage'
|
|
9
|
+
import { isCliProviderId } from '@/lib/providers/cli-provider-metadata'
|
|
10
|
+
import type { Agent, GatewayProfile, OpenClawEnvironmentSummary, OpenClawGatewayEnvironmentList } from '@/types'
|
|
11
|
+
import type {
|
|
12
|
+
EvalEnvironmentCheck,
|
|
13
|
+
EvalEnvironmentGeneratedFile,
|
|
14
|
+
EvalEnvironmentPlan,
|
|
15
|
+
EvalEnvironmentTarget,
|
|
16
|
+
EvalScenario,
|
|
17
|
+
EvalScenarioFixture,
|
|
18
|
+
} from './types'
|
|
19
|
+
import { getScenario, getSuiteScenarios } from './scenarios'
|
|
20
|
+
import { listOpenClawGatewayProfiles } from '../gateways/gateway-profile-service'
|
|
21
|
+
|
|
22
|
+
/** Request describing which agent and scenarios an eval environment plan covers. */
export interface EvalEnvironmentPlanInput {
  /** Id of the agent under validation; looked up in the loaded agent map. */
  agentId: string
  /** Single scenario to plan for; when set (non-blank) it takes precedence over `suite`. */
  scenarioId?: string | null
  /** Suite whose scenarios are planned when no scenario id is given; defaults to `'core'`. */
  suite?: string | null
  /** Explicit gateway profile to use instead of the one resolved from the agent route. */
  gatewayProfileId?: string | null
  /** Specific gateway execution environment to validate against. */
  environmentId?: string | null
  /** When true, the gateway's environments are listed live instead of relying on stored profile data. */
  refreshGateway?: boolean
}
|
|
30
|
+
|
|
31
|
+
/**
 * Optional overrides for the collaborators used while building a plan.
 * Any member left undefined falls back to the module's imported implementation.
 */
interface EvalEnvironmentPlanDeps {
  /** Clock used for the plan's `generatedAt` timestamp. */
  now?: () => number
  /** Returns all agents keyed by agent id. */
  loadAgents?: () => Record<string, Agent>
  /** Returns stored credentials keyed by credential id. */
  loadCredentials?: () => Record<string, unknown>
  /** Returns the known gateway profiles. */
  listGatewayProfiles?: () => GatewayProfile[]
  /** Lists the execution environments of a gateway profile; resolves to null when unavailable. */
  listGatewayEnvironments?: (id: string) => Promise<OpenClawGatewayEnvironmentList | null>
  /** Reports whether a CLI-backed provider is ready to run. */
  checkCliProviderReady?: (providerId: string) => CliProviderReadyResult
}
|
|
39
|
+
|
|
40
|
+
/** Inputs needed to materialize an eval workspace directory on disk. */
interface WriteEvalWorkspaceOptions {
  /** Eval run id, recorded in the README, manifest and env file. */
  runId: string
  /** Directory that receives the generated files and fixtures. */
  workspacePath: string
  /** Scenario whose fixtures are written into the workspace. */
  scenario: EvalScenario
  /** Environment plan serialized into `environment.json` and the env hints file. */
  plan: EvalEnvironmentPlan
}
|
|
46
|
+
|
|
47
|
+
/**
 * Normalizes an optional identifier.
 *
 * @returns the trimmed string, or null when the value is not a string or is blank.
 */
function normalizeOptionalId(value: string | null | undefined): string | null {
  if (typeof value !== 'string') return null
  const trimmed = value.trim()
  return trimmed ? trimmed : null
}
|
|
50
|
+
|
|
51
|
+
/**
 * Trims every entry, drops blanks and duplicates, and returns the remainder
 * sorted with `localeCompare`.
 */
function uniqueStrings(values: string[]): string[] {
  const seen = new Set<string>()
  for (const raw of values) {
    const trimmed = raw.trim()
    if (trimmed) seen.add(trimmed)
  }
  const distinct = Array.from(seen)
  distinct.sort((a, b) => a.localeCompare(b))
  return distinct
}
|
|
55
|
+
|
|
56
|
+
/**
 * Resolves the scenarios a plan request refers to.
 *
 * A non-blank `scenarioId` wins: the result holds that single scenario, or an
 * empty list plus `missing` (the requested id) when it is unknown. Otherwise
 * the scenarios of the requested suite are returned, defaulting to `'core'`.
 */
function scenarioSet(input: EvalEnvironmentPlanInput): { scenarios: EvalScenario[]; missing?: string } {
  const scenarioId = normalizeOptionalId(input.scenarioId)
  if (!scenarioId) {
    const suiteName = normalizeOptionalId(input.suite) || 'core'
    return { scenarios: getSuiteScenarios(suiteName) }
  }

  const found = getScenario(scenarioId)
  if (!found) {
    return { scenarios: [], missing: scenarioId }
  }
  return { scenarios: [found] }
}
|
|
65
|
+
|
|
66
|
+
/** Sums the weights of every scoring criterion across the given scenarios. */
function maxScore(scenarios: EvalScenario[]): number {
  let total = 0
  for (const scenario of scenarios) {
    // Keep a per-scenario subtotal so the summation order matches a nested reduce.
    let scenarioTotal = 0
    for (const criterion of scenario.scoringCriteria) {
      scenarioTotal += criterion.weight
    }
    total += scenarioTotal
  }
  return total
}
|
|
72
|
+
|
|
73
|
+
/** Adds up the `timeoutMs` of every scenario (0 for an empty list). */
function timeoutMs(scenarios: EvalScenario[]): number {
  let total = 0
  for (const scenario of scenarios) {
    total += scenario.timeoutMs
  }
  return total
}
|
|
76
|
+
|
|
77
|
+
/**
 * Lists one required `fixture` entry per fixture declared by the scenarios,
 * in declaration order. Scenarios without fixtures contribute nothing.
 */
function fixtureFiles(scenarios: EvalScenario[]): EvalEnvironmentGeneratedFile[] {
  const files: EvalEnvironmentGeneratedFile[] = []
  for (const scenario of scenarios) {
    for (const fixture of scenario.fixtures || []) {
      files.push({ path: fixture.path, kind: 'fixture', required: true })
    }
  }
  return files
}
|
|
84
|
+
|
|
85
|
+
/**
 * Lists the files an eval workspace contains: the README, the manifest and
 * the env hints file, followed by the fixtures of the given scenarios.
 */
function baseGeneratedFiles(scenarios: EvalScenario[]): EvalEnvironmentGeneratedFile[] {
  const scaffold: EvalEnvironmentGeneratedFile[] = [
    { path: 'README.md', kind: 'readme', required: true },
    { path: 'environment.json', kind: 'manifest', required: true },
    { path: '.env.swarmclaw-eval', kind: 'env', required: true },
  ]
  return scaffold.concat(fixtureFiles(scenarios))
}
|
|
93
|
+
|
|
94
|
+
/**
 * Tells whether a route is expected to have a stored credential.
 *
 * No credential is expected for the `openclaw` gateway provider, for `ollama`
 * outside `cloud` mode, or for CLI-backed providers.
 */
function providerNeedsCredential(route: ResolvedAgentRoute): boolean {
  const { provider } = route
  const isGateway = provider === 'openclaw'
  const isNonCloudOllama = provider === 'ollama' && route.ollamaMode !== 'cloud'
  return !(isGateway || isNonCloudOllama || isCliProviderId(provider))
}
|
|
100
|
+
|
|
101
|
+
/**
 * Checks whether a credential id refers to a stored, truthy credential entry.
 *
 * Only own properties of `credentials` count: a plain index lookup would also
 * resolve inherited members such as `toString` or `constructor`, which would
 * make those ids look like existing credentials.
 *
 * @param credentialId id taken from the route; non-strings and blank strings never match.
 * @param credentials stored credentials keyed by id.
 * @returns true when `credentials` has its own truthy entry for `credentialId`.
 */
function credentialExists(credentialId: string | null | undefined, credentials: Record<string, unknown>): boolean {
  if (typeof credentialId !== 'string' || !credentialId.trim()) return false
  if (!Object.prototype.hasOwnProperty.call(credentials, credentialId)) return false
  return Boolean(credentials[credentialId])
}
|
|
104
|
+
|
|
105
|
+
/** Maps a check level to a severity rank: `error` → 2, `warn` → 1, anything else → 0. */
function checkLevelRank(level: EvalEnvironmentCheck['level']): number {
  switch (level) {
    case 'error':
      return 2
    case 'warn':
      return 1
    default:
      return 0
  }
}
|
|
110
|
+
|
|
111
|
+
/**
 * Derives the plan status from the most severe check:
 * any error → `blocked`, otherwise any warning → `warning`, otherwise `ready`.
 */
function statusFromChecks(checks: EvalEnvironmentCheck[]): EvalEnvironmentPlan['status'] {
  let worst = 0
  for (const check of checks) {
    const rank = checkLevelRank(check.level)
    if (rank > worst) worst = rank
  }
  if (worst >= 2) return 'blocked'
  return worst >= 1 ? 'warning' : 'ready'
}
|
|
117
|
+
|
|
118
|
+
/**
 * Chooses the gateway profile for a route.
 *
 * Precedence: the explicitly requested profile id, then the route's own
 * profile id, then the default profile, then the first profile. A requested
 * or route id that matches no profile yields null rather than falling back.
 */
function pickGatewayProfile(
  route: ResolvedAgentRoute | null,
  profiles: GatewayProfile[],
  requestedProfileId: string | null,
): GatewayProfile | null {
  const byId = (id: string): GatewayProfile | null => profiles.find((profile) => profile.id === id) || null

  if (requestedProfileId) return byId(requestedProfileId)

  const routeProfileId = route?.gatewayProfileId
  if (routeProfileId) return byId(routeProfileId)

  const preferred = profiles.find((profile) => profile.isDefault)
  return preferred || profiles[0] || null
}
|
|
131
|
+
|
|
132
|
+
/**
 * Builds the `gateway` target summary for a route, preferring the profile's
 * name, id and last-checked timestamps over the route's own values.
 */
function summarizeGatewayTarget(route: ResolvedAgentRoute, profile: GatewayProfile | null): EvalEnvironmentTarget {
  const label = profile?.name || route.label
  const gatewayProfileId = profile?.id || route.gatewayProfileId || null
  // Prefer the topology check time; fall back to the generic last-checked time.
  const refreshedAt = profile?.stats?.lastTopologyCheckedAt || profile?.lastCheckedAt || null

  const summary: EvalEnvironmentTarget = {
    kind: 'gateway',
    provider: route.provider,
    model: route.model,
    label,
    gatewayProfileId,
    capabilities: ['agent.run', 'sessions', 'tools', 'workspace'],
    refreshedAt,
  }
  return summary
}
|
|
143
|
+
|
|
144
|
+
/** Builds the `local` target summary for a route that does not go through a gateway. */
function summarizeLocalTarget(route: ResolvedAgentRoute): EvalEnvironmentTarget {
  const { provider, model, label } = route
  const summary: EvalEnvironmentTarget = {
    kind: 'local',
    provider,
    model,
    label,
    capabilities: ['agent.run', 'tools', 'workspace'],
    refreshedAt: null,
  }
  return summary
}
|
|
154
|
+
|
|
155
|
+
/**
 * Appends an env hint to `hints` unless the value is empty, null or undefined.
 * The `description` property is only set when a non-empty description is given.
 */
function addEnvHint(
  hints: EvalEnvironmentPlan['envHints'],
  key: string,
  value: string | null | undefined,
  description?: string,
): void {
  if (!value) return
  const hint: EvalEnvironmentPlan['envHints'][number] = { key, value }
  if (description) {
    hint.description = description
  }
  hints.push(hint)
}
|
|
164
|
+
|
|
165
|
+
/**
 * Produces the `SWARMCLAW_EVAL_*` hints describing the agent, scenarios,
 * suite and resolved target. Hints whose value is empty or missing are omitted.
 */
function buildEnvHints(params: {
  agent: Agent | null
  scenarios: EvalScenario[]
  suite: string | null
  target: EvalEnvironmentTarget | null
}): EvalEnvironmentPlan['envHints'] {
  const { agent, scenarios, suite, target } = params
  const scenarioIds = scenarios.map((scenario) => scenario.id).join(',')

  // [key, value, description] in the order the hints are emitted.
  const entries: Array<[string, string | null | undefined, string]> = [
    ['SWARMCLAW_EVAL_AGENT_ID', agent?.id, 'Agent under validation'],
    ['SWARMCLAW_EVAL_AGENT_NAME', agent?.name, 'Agent display name'],
    ['SWARMCLAW_EVAL_SCENARIOS', scenarioIds, 'Comma-separated eval scenario ids'],
    ['SWARMCLAW_EVAL_SUITE', suite, 'Eval suite name'],
    ['SWARMCLAW_EVAL_TARGET_KIND', target?.kind, 'Resolved execution target kind'],
    ['SWARMCLAW_EVAL_PROVIDER', target?.provider, 'Resolved provider'],
    ['SWARMCLAW_EVAL_MODEL', target?.model, 'Resolved model'],
    ['SWARMCLAW_EVAL_GATEWAY_PROFILE_ID', target?.gatewayProfileId || null, 'Resolved gateway profile id'],
    ['SWARMCLAW_EVAL_ENVIRONMENT_ID', target?.environmentId || null, 'Requested or selected gateway environment id'],
  ]

  const hints: EvalEnvironmentPlan['envHints'] = []
  for (const [key, value, description] of entries) {
    addEnvHint(hints, key, value, description)
  }
  return hints
}
|
|
183
|
+
|
|
184
|
+
/** Returns the environment's capabilities trimmed, de-duplicated and sorted; empty when there are none. */
function normalizeEnvironmentCapabilities(environment: OpenClawEnvironmentSummary | null | undefined): string[] {
  const declared = environment?.capabilities
  return uniqueStrings(declared ? declared : [])
}
|
|
187
|
+
|
|
188
|
+
/**
 * Records gateway health checks and resolves the execution environment the
 * eval would use on the given gateway profile.
 *
 * Checks are appended to `checks` in this order: profile status, stored
 * environment availability counts, then either a "not refreshed" note
 * (when `input.refreshGateway` is falsy) or the results of a live
 * environment listing.
 *
 * @param target gateway target summary to extend; never mutated.
 * @param profile resolved gateway profile; when null the target is returned untouched.
 * @param checks mutable list that receives the validation findings.
 * @param input plan request (`environmentId` and `refreshGateway` are read).
 * @param deps provides `listGatewayEnvironments` for the live listing.
 * @returns a copy of the target enriched with environment details where available.
 */
async function attachGatewayEnvironment(
  target: EvalEnvironmentTarget,
  profile: GatewayProfile | null,
  checks: EvalEnvironmentCheck[],
  input: EvalEnvironmentPlanInput,
  deps: Required<Pick<EvalEnvironmentPlanDeps, 'listGatewayEnvironments'>>,
): Promise<EvalEnvironmentTarget> {
  if (!profile) return target
  const requestedEnvironmentId = normalizeOptionalId(input.environmentId)

  // Health as last recorded on the profile.
  switch (profile.status) {
    case 'offline':
      checks.push({
        code: 'gateway_offline',
        level: 'error',
        message: `${profile.name} is offline.`,
        hint: 'Refresh or repair the gateway before running evals through it.',
      })
      break
    case 'degraded':
      checks.push({
        code: 'gateway_degraded',
        level: 'warn',
        message: `${profile.name} is degraded.`,
        detail: profile.lastError || undefined,
      })
      break
    case 'pending':
    case 'unknown':
      checks.push({
        code: 'gateway_unverified',
        level: 'warn',
        message: `${profile.name} has not reported a healthy gateway status yet.`,
      })
      break
    default:
      break
  }

  // Stored counters: environments exist but none of them is usable.
  const stats = profile.stats
  const environmentCount = stats?.environmentCount || 0
  const availableEnvironmentCount = stats?.availableEnvironmentCount || 0
  if (environmentCount > 0 && availableEnvironmentCount === 0) {
    checks.push({
      code: 'no_available_gateway_environment',
      level: 'error',
      message: `${profile.name} has ${environmentCount} execution environment${environmentCount === 1 ? '' : 's'}, but none are available.`,
    })
  }

  // Without a refresh nothing is listed live; only note what could not be verified.
  if (!input.refreshGateway) {
    if (!requestedEnvironmentId) {
      checks.push({
        code: 'gateway_snapshot_only',
        level: 'info',
        message: 'Using the last stored gateway topology snapshot for validation.',
      })
      return target
    }
    checks.push({
      code: 'environment_not_refreshed',
      level: 'warn',
      message: `Environment ${requestedEnvironmentId} was requested but not refreshed.`,
      hint: 'Run validation with refresh enabled to verify the exact environment.',
    })
    return { ...target, environmentId: requestedEnvironmentId }
  }

  const snapshot = await deps.listGatewayEnvironments(profile.id)
  if (!snapshot) {
    checks.push({
      code: 'gateway_environment_snapshot_missing',
      level: 'error',
      message: `${profile.name} could not be refreshed for environment validation.`,
    })
    return target
  }

  // Errors reported by the listing are surfaced as warnings, one check each.
  for (const refreshError of snapshot.errors) {
    checks.push({
      code: 'gateway_environment_refresh_error',
      level: 'warn',
      message: `${refreshError.method}: ${refreshError.message}`,
    })
  }

  // A requested id must match exactly; otherwise prefer an available
  // environment and fall back to the first one listed.
  const environments = snapshot.environments
  const selected = requestedEnvironmentId
    ? environments.find((candidate) => candidate.id === requestedEnvironmentId) || null
    : environments.find((candidate) => candidate.status === 'available') || environments[0] || null

  if (!selected) {
    if (requestedEnvironmentId) {
      checks.push({
        code: 'environment_not_found',
        level: 'error',
        message: `Requested execution environment ${requestedEnvironmentId} was not found on ${profile.name}.`,
      })
      return { ...target, environmentId: requestedEnvironmentId, refreshedAt: snapshot.refreshedAt }
    }
    checks.push({
      code: 'no_gateway_environments',
      level: 'warn',
      message: `${profile.name} did not report any execution environments.`,
    })
    return { ...target, refreshedAt: snapshot.refreshedAt }
  }

  if (selected.status === 'available') {
    checks.push({
      code: 'environment_available',
      level: 'info',
      message: `${selected.label || selected.id} is available for validation runs.`,
    })
  } else {
    checks.push({
      code: 'environment_unavailable',
      level: selected.status === 'error' ? 'error' : 'warn',
      message: `${selected.label || selected.id} is ${selected.status}.`,
    })
  }

  return {
    ...target,
    environmentId: selected.id,
    environmentLabel: selected.label || selected.id,
    environmentStatus: selected.status,
    capabilities: normalizeEnvironmentCapabilities(selected),
    refreshedAt: snapshot.refreshedAt,
  }
}
|
|
306
|
+
|
|
307
|
+
/**
 * Builds a pre-flight plan describing whether an agent can run the requested
 * eval scenario(s) and against which target.
 *
 * The plan collects checks for: scenario resolution, agent existence and
 * state (trashed / disabled), provider route resolution, gateway profile and
 * environment health (for `openclaw` routes), CLI provider readiness or stored
 * credentials (for other routes), and the tools the scenarios declare. The
 * overall `status` is derived from the most severe check.
 *
 * @param input which agent, scenario or suite, and gateway options to plan for.
 * @param deps optional overrides for storage, gateway and clock collaborators;
 *   each missing member falls back to the module's imported implementation.
 * @returns the plan; problems are reported through `checks` / `status` rather
 *   than by throwing from this function's own logic.
 */
export async function buildEvalEnvironmentPlan(
  input: EvalEnvironmentPlanInput,
  deps: EvalEnvironmentPlanDeps = {},
): Promise<EvalEnvironmentPlan> {
  const now = deps.now || (() => Date.now())
  const generatedAt = now()
  const loadAgentsImpl = deps.loadAgents || (() => loadAgents() as Record<string, Agent>)
  const loadCredentialsImpl = deps.loadCredentials || (() => loadCredentials() as Record<string, unknown>)
  const listGatewayProfilesImpl = deps.listGatewayProfiles || listOpenClawGatewayProfiles
  const checkCliProviderReadyImpl = deps.checkCliProviderReady || checkCliProviderReady
  const checks: EvalEnvironmentCheck[] = []
  const { scenarios, missing } = scenarioSet(input)
  // Use the same normalization as scenarioSet: a blank scenario id means the
  // (default) suite was resolved, so the reported suite must reflect that.
  const requestedScenarioId = normalizeOptionalId(input.scenarioId)
  const suite = normalizeOptionalId(input.suite) || (requestedScenarioId ? null : 'core')
  const agents = loadAgentsImpl()
  const agent = agents[input.agentId] || null
  const requiredTools = uniqueStrings(scenarios.flatMap((scenario) => scenario.tools || []))
  let target: EvalEnvironmentTarget | null = null

  if (missing) {
    checks.push({
      code: 'scenario_not_found',
      level: 'error',
      message: `Eval scenario ${missing} was not found.`,
    })
  } else if (scenarios.length === 0) {
    checks.push({
      code: 'scenario_set_empty',
      level: 'error',
      message: 'No eval scenarios matched the requested suite.',
    })
  }

  if (!agent) {
    checks.push({
      code: 'agent_not_found',
      level: 'error',
      message: `Agent ${input.agentId} was not found.`,
    })
  } else {
    if (agent.trashedAt) {
      checks.push({ code: 'agent_trashed', level: 'error', message: `${agent.name} is in trash.` })
    }
    if (agent.disabled) {
      checks.push({ code: 'agent_disabled', level: 'error', message: `${agent.name} is disabled.` })
    }

    const gatewayProfiles = listGatewayProfilesImpl()
    // Only the first resolved route candidate is validated.
    const [route] = resolveAgentRouteCandidatesWithProfiles(agent, gatewayProfiles)
    if (!route) {
      checks.push({
        code: 'route_unresolved',
        level: 'error',
        message: `${agent.name} does not have a runnable provider/model route.`,
      })
    } else if (route.provider === 'openclaw') {
      // Gateway route: validate the profile and, optionally, its environments.
      const profile = pickGatewayProfile(route, gatewayProfiles, normalizeOptionalId(input.gatewayProfileId))
      if (!profile) {
        checks.push({
          code: 'gateway_profile_missing',
          level: 'error',
          message: 'No gateway profile is available for this agent route.',
        })
        target = summarizeGatewayTarget(route, null)
      } else {
        target = await attachGatewayEnvironment(
          summarizeGatewayTarget(route, profile),
          profile,
          checks,
          input,
          { listGatewayEnvironments: deps.listGatewayEnvironments || listOpenClawGatewayEnvironments },
        )
      }
    } else {
      // Local route: CLI providers are probed; others need a stored credential.
      target = summarizeLocalTarget(route)
      if (isCliProviderId(route.provider)) {
        const ready = checkCliProviderReadyImpl(route.provider)
        checks.push({
          code: ready.ok ? 'cli_provider_ready' : 'cli_provider_not_ready',
          level: ready.ok ? 'info' : 'error',
          message: ready.message,
          detail: ready.binaryPath,
        })
      } else if (providerNeedsCredential(route) && !credentialExists(route.credentialId, loadCredentialsImpl())) {
        // Only a warning: the provider may still be configured via environment variables.
        checks.push({
          code: 'credential_missing',
          level: 'warn',
          message: `${route.provider} does not have a stored credential for this route.`,
          hint: 'The run may still work if the provider is configured through environment variables.',
        })
      }
    }
  }

  if (requiredTools.length > 0) {
    checks.push({
      code: 'tools_declared',
      level: 'info',
      message: `${requiredTools.length} eval tool${requiredTools.length === 1 ? '' : 's'} will be enabled: ${requiredTools.join(', ')}.`,
    })
  } else {
    checks.push({
      code: 'no_tools_required',
      level: 'info',
      message: 'This eval scenario does not require tool access.',
    })
  }

  const envHints = buildEnvHints({ agent, scenarios, suite, target })

  return {
    generatedAt,
    status: statusFromChecks(checks),
    agentId: input.agentId,
    agentName: agent?.name || input.agentId,
    scenarioIds: scenarios.map((scenario) => scenario.id),
    suite,
    target,
    checks,
    requiredTools,
    missingTools: [],
    maxScore: maxScore(scenarios),
    timeoutMs: timeoutMs(scenarios),
    generatedFiles: baseGeneratedFiles(scenarios),
    envHints,
  }
}
|
|
433
|
+
|
|
434
|
+
/**
 * Resolves a fixture's relative path to an absolute file path inside the
 * workspace, rejecting anything that could land elsewhere.
 *
 * Rejected: blank paths, absolute paths, paths that climb out of the
 * workspace via `..`, and paths that resolve to the workspace directory
 * itself (a fixture must be a file inside it, not the directory).
 *
 * @param workspacePath workspace directory the fixture must stay within.
 * @param fixture fixture whose `path` is resolved (surrounding whitespace is trimmed).
 * @returns the absolute destination path.
 * @throws Error with an "Unsafe eval fixture path" message when the path is rejected.
 */
function safeFixtureDestination(workspacePath: string, fixture: EvalScenarioFixture): string {
  const relative = fixture.path.trim()
  if (!relative || path.isAbsolute(relative)) {
    throw new Error(`Unsafe eval fixture path: ${fixture.path}`)
  }
  const root = path.resolve(workspacePath)
  const destination = path.resolve(root, relative)
  // path.relative keeps the containment test correct even when `root` already
  // ends with a separator (filesystem root), where a `${root}${sep}` prefix
  // test would reject every path.
  const fromRoot = path.relative(root, destination)
  const escapesRoot = fromRoot === '..' || fromRoot.startsWith(`..${path.sep}`) || path.isAbsolute(fromRoot)
  // An empty relative path means the destination is the workspace directory itself.
  if (!fromRoot || escapesRoot) {
    throw new Error(`Unsafe eval fixture path: ${fixture.path}`)
  }
  return destination
}
|
|
446
|
+
|
|
447
|
+
/**
 * Writes UTF-8 text to `filePath`, creating parent directories as needed and
 * making sure the content ends with a newline. `mode` is forwarded to the write call.
 */
function writeTextFile(filePath: string, content: string, mode?: number): void {
  const parentDir = path.dirname(filePath)
  fs.mkdirSync(parentDir, { recursive: true })

  const text = content.endsWith('\n') ? content : `${content}\n`
  fs.writeFileSync(filePath, text, { encoding: 'utf8', mode })
}
|
|
451
|
+
|
|
452
|
+
/** Formats an env hint as `KEY=<value>` with the value rendered through `JSON.stringify`. */
function envLine(hint: EvalEnvironmentPlan['envHints'][number]): string {
  const { key, value } = hint
  const quoted = JSON.stringify(value)
  return `${key}=${quoted}`
}
|
|
455
|
+
|
|
456
|
+
/**
 * Materializes an eval workspace on disk.
 *
 * Creates the workspace directory, then writes `README.md`,
 * `environment.json` (run id plus the full plan), `.env.swarmclaw-eval`
 * (run id plus the plan's env hints) and finally every fixture of the
 * scenario, each resolved through `safeFixtureDestination`.
 *
 * @returns the list of files written, relative to the workspace.
 */
export function writeEvalEnvironmentWorkspace(options: WriteEvalWorkspaceOptions): EvalEnvironmentGeneratedFile[] {
  const { runId, workspacePath, scenario, plan } = options
  fs.mkdirSync(workspacePath, { recursive: true })
  const inWorkspace = (fileName: string): string => path.join(workspacePath, fileName)

  const readmeLines = [
    `# Eval Workspace: ${scenario.name}`,
    '',
    `Run ID: ${runId}`,
    `Agent: ${plan.agentName} (${plan.agentId})`,
    `Scenario: ${scenario.id}`,
    `Status at start: ${plan.status}`,
    '',
    'Runtime manifest: ./environment.json',
    'Environment hints: ./.env.swarmclaw-eval',
    '',
    'This directory is isolated for eval artifacts, fixtures, and generated outputs.',
  ]
  writeTextFile(inWorkspace('README.md'), readmeLines.join('\n'))

  const manifest = JSON.stringify({ runId, plan }, null, 2)
  writeTextFile(inWorkspace('environment.json'), manifest)

  const envFileLines = [
    '# Generated by SwarmClaw. Contains eval context only, not secrets.',
    `SWARMCLAW_EVAL_RUN_ID=${JSON.stringify(runId)}`,
  ]
  for (const hint of plan.envHints) {
    envFileLines.push(envLine(hint))
  }
  writeTextFile(inWorkspace('.env.swarmclaw-eval'), envFileLines.join('\n'))

  // Fixtures go last; each destination is validated right before it is written.
  const fixtures = scenario.fixtures || []
  for (const fixture of fixtures) {
    const destination = safeFixtureDestination(workspacePath, fixture)
    writeTextFile(destination, fixture.content, fixture.mode)
  }

  const written: EvalEnvironmentGeneratedFile[] = [
    { path: 'README.md', kind: 'readme', required: true },
    { path: 'environment.json', kind: 'manifest', required: true },
    { path: '.env.swarmclaw-eval', kind: 'env', required: true },
  ]
  return written.concat(fixtureFiles([scenario]))
}
|
|
495
|
+
|
|
496
|
+
/** Returns the workspace directory for an eval run: `<WORKSPACE_DIR>/evals/<runId>`. */
export function resolveEvalWorkspacePath(runId: string): string {
  const evalsRoot = path.join(WORKSPACE_DIR, 'evals')
  return path.join(evalsRoot, runId)
}
|
|
@@ -10,6 +10,7 @@ import { executeExecutionChatTurn } from '@/lib/server/execution-engine/chat-tur
|
|
|
10
10
|
import { WORKSPACE_DIR } from '../data-dir'
|
|
11
11
|
import type { Session } from '@/types'
|
|
12
12
|
import { errorMessage } from '@/lib/shared-utils'
|
|
13
|
+
import { buildEvalEnvironmentPlan, writeEvalEnvironmentWorkspace } from './environment-plan'
|
|
13
14
|
|
|
14
15
|
export function resolveEvalSessionCwd(runId: string): string {
|
|
15
16
|
const dir = path.join(WORKSPACE_DIR, 'evals', runId)
|
|
@@ -17,7 +18,17 @@ export function resolveEvalSessionCwd(runId: string): string {
|
|
|
17
18
|
return dir
|
|
18
19
|
}
|
|
19
20
|
|
|
20
|
-
export
|
|
21
|
+
/** Gateway-related options forwarded to the environment plan when running a scenario. */
export interface RunEvalScenarioOptions {
  /** Gateway profile to validate against. */
  gatewayProfileId?: string | null
  /** Gateway execution environment to validate against. */
  environmentId?: string | null
  /** Only an explicit `true` requests a gateway refresh. */
  refreshGateway?: boolean
}
|
|
26
|
+
|
|
27
|
+
export async function runEvalScenario(
|
|
28
|
+
scenarioId: string,
|
|
29
|
+
agentId: string,
|
|
30
|
+
options: RunEvalScenarioOptions = {},
|
|
31
|
+
): Promise<EvalRun> {
|
|
21
32
|
const scenario = getScenario(scenarioId)
|
|
22
33
|
if (!scenario) throw new Error(`Unknown eval scenario: ${scenarioId}`)
|
|
23
34
|
|
|
@@ -29,6 +40,13 @@ export async function runEvalScenario(scenarioId: string, agentId: string): Prom
|
|
|
29
40
|
const sessionId = `eval-${runId}`
|
|
30
41
|
const now = Date.now()
|
|
31
42
|
const sessionCwd = resolveEvalSessionCwd(runId)
|
|
43
|
+
const environment = await buildEvalEnvironmentPlan({
|
|
44
|
+
agentId,
|
|
45
|
+
scenarioId,
|
|
46
|
+
gatewayProfileId: options.gatewayProfileId || null,
|
|
47
|
+
environmentId: options.environmentId || null,
|
|
48
|
+
refreshGateway: options.refreshGateway === true,
|
|
49
|
+
})
|
|
32
50
|
|
|
33
51
|
const run: EvalRun = {
|
|
34
52
|
id: runId,
|
|
@@ -40,6 +58,34 @@ export async function runEvalScenario(scenarioId: string, agentId: string): Prom
|
|
|
40
58
|
maxScore: scenario.scoringCriteria.reduce((sum, c) => sum + c.weight, 0),
|
|
41
59
|
details: [],
|
|
42
60
|
sessionId,
|
|
61
|
+
environment,
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
writeEvalEnvironmentWorkspace({
|
|
65
|
+
runId,
|
|
66
|
+
workspacePath: sessionCwd,
|
|
67
|
+
scenario,
|
|
68
|
+
plan: environment,
|
|
69
|
+
})
|
|
70
|
+
|
|
71
|
+
if (environment.status === 'blocked') {
|
|
72
|
+
run.status = 'failed'
|
|
73
|
+
run.error = environment.checks
|
|
74
|
+
.filter((check) => check.level === 'error')
|
|
75
|
+
.map((check) => check.message)
|
|
76
|
+
.join(' ')
|
|
77
|
+
|| 'Eval environment validation failed.'
|
|
78
|
+
run.endedAt = Date.now()
|
|
79
|
+
run.details = environment.checks
|
|
80
|
+
.filter((check) => check.level !== 'info')
|
|
81
|
+
.map((check) => ({
|
|
82
|
+
criterion: check.code,
|
|
83
|
+
score: 0,
|
|
84
|
+
maxScore: 0,
|
|
85
|
+
evidence: check.message,
|
|
86
|
+
}))
|
|
87
|
+
saveEvalRun(run)
|
|
88
|
+
return run
|
|
43
89
|
}
|
|
44
90
|
|
|
45
91
|
// Create temporary eval session
|
|
@@ -114,7 +160,7 @@ export async function runEvalScenario(scenarioId: string, agentId: string): Prom
|
|
|
114
160
|
|
|
115
161
|
export async function runEvalSuite(
|
|
116
162
|
agentId: string,
|
|
117
|
-
opts: { categories?: string[]; suite?: string } = {},
|
|
163
|
+
opts: { categories?: string[]; suite?: string; gatewayProfileId?: string | null; environmentId?: string | null; refreshGateway?: boolean } = {},
|
|
118
164
|
): Promise<EvalSuiteResult> {
|
|
119
165
|
let scenarios: EvalScenario[]
|
|
120
166
|
if (opts.suite) {
|
|
@@ -130,7 +176,11 @@ export async function runEvalSuite(
|
|
|
130
176
|
|
|
131
177
|
const runs: EvalRun[] = []
|
|
132
178
|
for (const scenario of scenarios) {
|
|
133
|
-
const evalRun = await runEvalScenario(scenario.id, agentId
|
|
179
|
+
const evalRun = await runEvalScenario(scenario.id, agentId, {
|
|
180
|
+
gatewayProfileId: opts.gatewayProfileId || null,
|
|
181
|
+
environmentId: opts.environmentId || null,
|
|
182
|
+
refreshGateway: opts.refreshGateway === true,
|
|
183
|
+
})
|
|
134
184
|
runs.push(evalRun)
|
|
135
185
|
}
|
|
136
186
|
|
|
@@ -212,6 +212,24 @@ const CORE_SCENARIOS: EvalScenario[] = [
|
|
|
212
212
|
],
|
|
213
213
|
timeoutMs: 60_000,
|
|
214
214
|
tools: ['shell', 'files'],
|
|
215
|
+
fixtures: [
|
|
216
|
+
{
|
|
217
|
+
path: 'package.json',
|
|
218
|
+
content: JSON.stringify({
|
|
219
|
+
name: 'swarmclaw-eval-fixture',
|
|
220
|
+
version: '0.0.0',
|
|
221
|
+
private: true,
|
|
222
|
+
dependencies: {
|
|
223
|
+
'@modelcontextprotocol/sdk': '^1.29.0',
|
|
224
|
+
zod: '^4.1.13',
|
|
225
|
+
},
|
|
226
|
+
devDependencies: {
|
|
227
|
+
typescript: '^5.9.3',
|
|
228
|
+
tsx: '^4.20.6',
|
|
229
|
+
},
|
|
230
|
+
}, null, 2),
|
|
231
|
+
},
|
|
232
|
+
],
|
|
215
233
|
},
|
|
216
234
|
]
|
|
217
235
|
|
|
@@ -17,10 +17,64 @@ export interface EvalScenario {
|
|
|
17
17
|
scoringCriteria: ScoringCriterion[]
|
|
18
18
|
timeoutMs: number
|
|
19
19
|
tools: string[]
|
|
20
|
+
fixtures?: EvalScenarioFixture[]
|
|
20
21
|
/** Optional suite tag. Scenarios without a suite belong to the 'core' suite. */
|
|
21
22
|
suite?: EvalSuite
|
|
22
23
|
}
|
|
23
24
|
|
|
25
|
+
/** A file seeded into the eval workspace before a scenario runs. */
export interface EvalScenarioFixture {
  /** Path relative to the workspace directory. */
  path: string
  /** Text content of the file. */
  content: string
  /** Optional file mode passed along when the file is written. */
  mode?: number
}
|
|
30
|
+
|
|
31
|
+
/** Overall readiness of an eval environment plan. */
export type EvalEnvironmentStatus = 'ready' | 'warning' | 'blocked'

/** Severity of a single environment check. */
export type EvalEnvironmentCheckLevel = 'info' | 'warn' | 'error'
|
|
33
|
+
|
|
34
|
+
/** One finding produced while validating an eval environment. */
export interface EvalEnvironmentCheck {
  /** Machine-readable identifier of the finding (e.g. `agent_not_found`). */
  code: string
  /** Severity of the finding. */
  level: EvalEnvironmentCheckLevel
  /** Human-readable summary. */
  message: string
  /** Optional extra detail about the finding. */
  detail?: string
  /** Optional suggestion for resolving the finding. */
  hint?: string
}
|
|
41
|
+
|
|
42
|
+
/** Where an eval would execute: a local provider route or a gateway. */
export interface EvalEnvironmentTarget {
  /** Execution target kind. */
  kind: 'local' | 'gateway'
  /** Provider of the resolved route. */
  provider: string
  /** Model of the resolved route. */
  model: string
  /** Display label for the target. */
  label: string
  /** Gateway profile backing the target, when there is one. */
  gatewayProfileId?: string | null
  /** Requested or selected gateway execution environment id. */
  environmentId?: string | null
  /** Display label of the selected environment. */
  environmentLabel?: string | null
  /** Status reported for the selected environment. */
  environmentStatus?: string | null
  /** Capabilities offered by the target. */
  capabilities?: string[]
  /** Timestamp of the data the summary is based on, when known. */
  refreshedAt?: number | null
}
|
|
54
|
+
|
|
55
|
+
/** Describes a file that belongs to a generated eval workspace. */
export interface EvalEnvironmentGeneratedFile {
  /** Path relative to the workspace directory. */
  path: string
  /** Role of the file within the workspace. */
  kind: 'readme' | 'manifest' | 'env' | 'fixture'
  /** Whether the file is required. */
  required: boolean
}
|
|
60
|
+
|
|
61
|
+
/** Pre-flight description of an eval run: target, findings, tools and generated files. */
export interface EvalEnvironmentPlan {
  /** Timestamp at which the plan was built. */
  generatedAt: number
  /** Overall readiness of the plan. */
  status: EvalEnvironmentStatus
  /** Id of the agent under validation. */
  agentId: string
  /** Display name of the agent. */
  agentName: string
  /** Ids of the scenarios covered by the plan. */
  scenarioIds: string[]
  /** Suite the scenarios were taken from, when applicable. */
  suite?: string | null
  /** Resolved execution target, or null when none could be resolved. */
  target: EvalEnvironmentTarget | null
  /** Findings collected during validation. */
  checks: EvalEnvironmentCheck[]
  /** Tools declared by the covered scenarios. */
  requiredTools: string[]
  /** Required tools that are not available. */
  missingTools: string[]
  /** Sum of the scoring criterion weights of the covered scenarios. */
  maxScore: number
  /** Sum of the covered scenarios' timeouts, in milliseconds. */
  timeoutMs: number
  /** Files the eval workspace contains. */
  generatedFiles: EvalEnvironmentGeneratedFile[]
  /** Key/value hints written to the workspace env file. */
  envHints: Array<{ key: string; value: string; description?: string }>
}
|
|
77
|
+
|
|
24
78
|
export interface EvalRun {
|
|
25
79
|
id: string
|
|
26
80
|
scenarioId: string
|
|
@@ -32,6 +86,7 @@ export interface EvalRun {
|
|
|
32
86
|
maxScore: number
|
|
33
87
|
details: EvalCriterionResult[]
|
|
34
88
|
sessionId?: string
|
|
89
|
+
environment?: EvalEnvironmentPlan
|
|
35
90
|
error?: string
|
|
36
91
|
}
|
|
37
92
|
|