@swarmclawai/swarmclaw 1.9.4 → 1.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,498 @@
1
+ import fs from 'node:fs'
2
+ import path from 'node:path'
3
+
4
+ import { WORKSPACE_DIR } from '@/lib/server/data-dir'
5
+ import { resolveAgentRouteCandidatesWithProfiles, type ResolvedAgentRoute } from '@/lib/server/agents/agent-runtime-config'
6
+ import { checkCliProviderReady, type CliProviderReadyResult } from '@/lib/server/cli-provider-readiness'
7
+ import { listOpenClawGatewayEnvironments } from '@/lib/server/gateways/gateway-topology'
8
+ import { loadAgents, loadCredentials } from '@/lib/server/storage'
9
+ import { isCliProviderId } from '@/lib/providers/cli-provider-metadata'
10
+ import type { Agent, GatewayProfile, OpenClawEnvironmentSummary, OpenClawGatewayEnvironmentList } from '@/types'
11
+ import type {
12
+ EvalEnvironmentCheck,
13
+ EvalEnvironmentGeneratedFile,
14
+ EvalEnvironmentPlan,
15
+ EvalEnvironmentTarget,
16
+ EvalScenario,
17
+ EvalScenarioFixture,
18
+ } from './types'
19
+ import { getScenario, getSuiteScenarios } from './scenarios'
20
+ import { listOpenClawGatewayProfiles } from '../gateways/gateway-profile-service'
21
+
22
/** Caller-supplied selection used to build an eval environment plan. */
export interface EvalEnvironmentPlanInput {
  /** Id of the agent under validation; looked up in the loaded agent map. */
  agentId: string
  /** Single scenario to plan for. A non-blank value takes precedence over `suite` when selecting scenarios. */
  scenarioId?: string | null
  /** Suite whose scenarios are planned when no scenario id is given; a blank value falls back to 'core'. */
  suite?: string | null
  /** Explicit gateway profile to use instead of the one referenced by the resolved route. */
  gatewayProfileId?: string | null
  /** Specific gateway execution environment to validate. */
  environmentId?: string | null
  /** When truthy, gateway environments are listed live instead of relying on stored profile data. */
  refreshGateway?: boolean
}
30
+
31
/**
 * Optional overrides for the collaborators used while building a plan.
 * Any member left out falls back to the module's imported implementation.
 */
interface EvalEnvironmentPlanDeps {
  /** Clock used for `generatedAt`. */
  now?: () => number
  /** Returns the agent map, keyed by agent id. */
  loadAgents?: () => Record<string, Agent>
  /** Returns the stored credential map, keyed by credential id. */
  loadCredentials?: () => Record<string, unknown>
  /** Returns the known gateway profiles. */
  listGatewayProfiles?: () => GatewayProfile[]
  /** Lists the execution environments of the gateway profile with the given id; `null` means no snapshot. */
  listGatewayEnvironments?: (id: string) => Promise<OpenClawGatewayEnvironmentList | null>
  /** Reports whether the CLI provider with the given id is ready to run. */
  checkCliProviderReady?: (providerId: string) => CliProviderReadyResult
}
39
+
40
/** Everything needed to materialise the on-disk workspace for one eval run. */
interface WriteEvalWorkspaceOptions {
  /** Run identifier recorded in the README, manifest and env file. */
  runId: string
  /** Directory that receives the generated files; created if missing. */
  workspacePath: string
  /** Scenario whose name, id and fixtures are written. */
  scenario: EvalScenario
  /** Environment plan serialised into the manifest and env hints. */
  plan: EvalEnvironmentPlan
}
46
+
47
/**
 * Trims an optional identifier.
 *
 * @returns The trimmed id, or `null` when the value is not a string or is blank.
 */
function normalizeOptionalId(value: string | null | undefined): string | null {
  if (typeof value !== 'string') return null
  const trimmed = value.trim()
  return trimmed.length > 0 ? trimmed : null
}
50
+
51
/**
 * Trims every entry, drops blanks and duplicates, and returns the remainder
 * ordered with `String.prototype.localeCompare`.
 */
function uniqueStrings(values: string[]): string[] {
  const seen = new Set<string>()
  for (const raw of values) {
    const trimmed = raw.trim()
    if (trimmed) seen.add(trimmed)
  }
  const ordered = Array.from(seen)
  ordered.sort((a, b) => a.localeCompare(b))
  return ordered
}
55
+
56
/**
 * Resolves which scenarios a plan covers.
 *
 * A non-blank `scenarioId` selects exactly that scenario; if it is unknown the
 * result is empty and `missing` carries the requested id. Otherwise the
 * scenarios of the requested suite are returned, defaulting to 'core'.
 */
function scenarioSet(input: EvalEnvironmentPlanInput): { scenarios: EvalScenario[]; missing?: string } {
  const requestedId = normalizeOptionalId(input.scenarioId)
  if (!requestedId) {
    const suiteName = normalizeOptionalId(input.suite) || 'core'
    return { scenarios: getSuiteScenarios(suiteName) }
  }

  const match = getScenario(requestedId)
  if (!match) {
    return { scenarios: [], missing: requestedId }
  }
  return { scenarios: [match] }
}
65
+
66
/** Sums the weights of every scoring criterion across the given scenarios. */
function maxScore(scenarios: EvalScenario[]): number {
  let total = 0
  for (const scenario of scenarios) {
    // Accumulate per scenario first so the summation order matches a nested reduce.
    let scenarioWeight = 0
    for (const criterion of scenario.scoringCriteria) {
      scenarioWeight += criterion.weight
    }
    total += scenarioWeight
  }
  return total
}
72
+
73
/** Adds up the `timeoutMs` of every scenario. */
function timeoutMs(scenarios: EvalScenario[]): number {
  let budget = 0
  for (const { timeoutMs: scenarioTimeout } of scenarios) {
    budget += scenarioTimeout
  }
  return budget
}
76
+
77
/** Lists one required 'fixture' entry per fixture declared by the scenarios, in declaration order. */
function fixtureFiles(scenarios: EvalScenario[]): EvalEnvironmentGeneratedFile[] {
  const files: EvalEnvironmentGeneratedFile[] = []
  for (const scenario of scenarios) {
    for (const fixture of scenario.fixtures || []) {
      files.push({ path: fixture.path, kind: 'fixture', required: true })
    }
  }
  return files
}
84
+
85
/**
 * Lists the files an eval workspace contains: the README, the manifest and the
 * env file, followed by every scenario fixture.
 */
function baseGeneratedFiles(scenarios: EvalScenario[]): EvalEnvironmentGeneratedFile[] {
  const files: EvalEnvironmentGeneratedFile[] = [
    { path: 'README.md', kind: 'readme', required: true },
    { path: 'environment.json', kind: 'manifest', required: true },
    { path: '.env.swarmclaw-eval', kind: 'env', required: true },
  ]
  return files.concat(fixtureFiles(scenarios))
}
93
+
94
/**
 * Tells whether a route is expected to have a stored credential.
 *
 * No credential is expected for the 'openclaw' provider, for 'ollama' unless
 * its mode is 'cloud', or for CLI providers.
 */
function providerNeedsCredential(route: ResolvedAgentRoute): boolean {
  const { provider } = route
  const viaGateway = provider === 'openclaw'
  const localOllama = provider === 'ollama' && route.ollamaMode !== 'cloud'
  return !(viaGateway || localOllama || isCliProviderId(provider))
}
100
+
101
/**
 * Checks whether a credential id refers to a stored credential.
 *
 * Only own properties of `credentials` count, so ids that happen to match
 * inherited Object.prototype members (e.g. 'constructor', 'toString') are not
 * mistaken for stored credentials. The id is matched both as given and
 * trimmed, mirroring the whitespace-insensitive blank check.
 *
 * @param credentialId - Id from the route; blank or non-string values never match.
 * @param credentials - Stored credentials keyed by id.
 * @returns `true` when an own entry with a truthy value exists for the id.
 */
function credentialExists(credentialId: string | null | undefined, credentials: Record<string, unknown>): boolean {
  if (typeof credentialId !== 'string') return false
  const trimmed = credentialId.trim()
  if (!trimmed) return false

  const hasStoredEntry = (key: string): boolean =>
    Object.prototype.hasOwnProperty.call(credentials, key) && Boolean(credentials[key])

  return hasStoredEntry(credentialId) || hasStoredEntry(trimmed)
}
104
+
105
/** Maps a check level to a severity rank: 'error' is 2, 'warn' is 1, anything else is 0. */
function checkLevelRank(level: EvalEnvironmentCheck['level']): number {
  switch (level) {
    case 'error':
      return 2
    case 'warn':
      return 1
    default:
      return 0
  }
}
110
+
111
/**
 * Derives the plan status from the most severe check:
 * any 'error' gives 'blocked', otherwise any 'warn' gives 'warning', otherwise 'ready'.
 */
function statusFromChecks(checks: EvalEnvironmentCheck[]): EvalEnvironmentPlan['status'] {
  let sawWarning = false
  for (const { level } of checks) {
    if (level === 'error') return 'blocked'
    if (level === 'warn') sawWarning = true
  }
  return sawWarning ? 'warning' : 'ready'
}
117
+
118
/**
 * Chooses the gateway profile for a route.
 *
 * An explicitly requested id wins over the route's own profile id; in both
 * cases an id that matches no profile yields `null` rather than a fallback.
 * With no id at all, the default profile is used, then the first profile.
 */
function pickGatewayProfile(
  route: ResolvedAgentRoute | null,
  profiles: GatewayProfile[],
  requestedProfileId: string | null,
): GatewayProfile | null {
  const wantedId = requestedProfileId || route?.gatewayProfileId
  if (wantedId) {
    const match = profiles.find((candidate) => candidate.id === wantedId)
    return match || null
  }
  const fallback = profiles.find((candidate) => candidate.isDefault) || profiles[0]
  return fallback || null
}
131
+
132
/**
 * Describes a gateway-routed execution target.
 *
 * Profile values (name, id, last check timestamps) take precedence over the
 * route's own label and profile id when a profile is supplied.
 */
function summarizeGatewayTarget(route: ResolvedAgentRoute, profile: GatewayProfile | null): EvalEnvironmentTarget {
  const label = profile?.name || route.label
  const gatewayProfileId = profile?.id || route.gatewayProfileId || null
  const refreshedAt = profile?.stats?.lastTopologyCheckedAt || profile?.lastCheckedAt || null

  const target: EvalEnvironmentTarget = {
    kind: 'gateway',
    provider: route.provider,
    model: route.model,
    label,
    gatewayProfileId,
    capabilities: ['agent.run', 'sessions', 'tools', 'workspace'],
    refreshedAt,
  }
  return target
}
143
+
144
/** Describes a locally executed target; it carries no refresh timestamp. */
function summarizeLocalTarget(route: ResolvedAgentRoute): EvalEnvironmentTarget {
  const { provider, model, label } = route
  return {
    kind: 'local',
    provider,
    model,
    label,
    capabilities: ['agent.run', 'tools', 'workspace'],
    refreshedAt: null,
  }
}
154
+
155
/**
 * Appends an env hint to `hints` unless the value is empty, `null` or `undefined`.
 * The `description` key is only present on the hint when a description is given.
 */
function addEnvHint(
  hints: EvalEnvironmentPlan['envHints'],
  key: string,
  value: string | null | undefined,
  description?: string,
): void {
  if (value) {
    const hint: EvalEnvironmentPlan['envHints'][number] = description
      ? { key, value, description }
      : { key, value }
    hints.push(hint)
  }
}
164
+
165
/**
 * Builds the SWARMCLAW_EVAL_* hints describing the agent, scenarios, suite and
 * resolved target. Entries whose value is empty or missing are left out.
 */
function buildEnvHints(params: {
  agent: Agent | null
  scenarios: EvalScenario[]
  suite: string | null
  target: EvalEnvironmentTarget | null
}): EvalEnvironmentPlan['envHints'] {
  const { agent, scenarios, suite, target } = params
  const scenarioIds = scenarios.map((scenario) => scenario.id).join(',')

  // Declared in emission order; addEnvHint skips entries without a value.
  const candidates: Array<[string, string | null | undefined, string]> = [
    ['SWARMCLAW_EVAL_AGENT_ID', agent?.id, 'Agent under validation'],
    ['SWARMCLAW_EVAL_AGENT_NAME', agent?.name, 'Agent display name'],
    ['SWARMCLAW_EVAL_SCENARIOS', scenarioIds, 'Comma-separated eval scenario ids'],
    ['SWARMCLAW_EVAL_SUITE', suite, 'Eval suite name'],
    ['SWARMCLAW_EVAL_TARGET_KIND', target?.kind, 'Resolved execution target kind'],
    ['SWARMCLAW_EVAL_PROVIDER', target?.provider, 'Resolved provider'],
    ['SWARMCLAW_EVAL_MODEL', target?.model, 'Resolved model'],
    ['SWARMCLAW_EVAL_GATEWAY_PROFILE_ID', target?.gatewayProfileId || null, 'Resolved gateway profile id'],
    ['SWARMCLAW_EVAL_ENVIRONMENT_ID', target?.environmentId || null, 'Requested or selected gateway environment id'],
  ]

  const hints: EvalEnvironmentPlan['envHints'] = []
  for (const [key, value, description] of candidates) {
    addEnvHint(hints, key, value, description)
  }
  return hints
}
183
+
184
/** Returns the environment's capabilities de-duplicated and sorted; empty when none are declared. */
function normalizeEnvironmentCapabilities(environment: OpenClawEnvironmentSummary | null | undefined): string[] {
  if (!environment || !environment.capabilities) {
    return uniqueStrings([])
  }
  return uniqueStrings(environment.capabilities)
}
187
+
188
/**
 * Validates the gateway profile behind a gateway target and, when a refresh is
 * requested, resolves the concrete execution environment the eval would use.
 *
 * Findings are appended to `checks` (mutated in place). The returned value is
 * `target` itself or a copy enriched with the environment details that could
 * be determined.
 *
 * Two behaviours differ from a naive implementation:
 * - A refresh that throws is reported as a blocking
 *   `gateway_environment_snapshot_missing` check (with the error text as
 *   `detail`) instead of rejecting the whole plan.
 * - The stored-stats "none are available" error is suppressed when a
 *   successful live refresh shows at least one available environment, because
 *   the stored numbers are then known to be stale.
 *
 * @param target - Gateway target summary to enrich.
 * @param profile - Gateway profile backing the target; `null` returns `target` untouched.
 * @param checks - Collector receiving info/warn/error findings.
 * @param input - Plan input; `environmentId` and `refreshGateway` are read.
 * @param deps - Supplies `listGatewayEnvironments` for the live refresh.
 * @returns The (possibly enriched) target.
 */
async function attachGatewayEnvironment(
  target: EvalEnvironmentTarget,
  profile: GatewayProfile | null,
  checks: EvalEnvironmentCheck[],
  input: EvalEnvironmentPlanInput,
  deps: Required<Pick<EvalEnvironmentPlanDeps, 'listGatewayEnvironments'>>,
): Promise<EvalEnvironmentTarget> {
  if (!profile) return target
  const requestedEnvironmentId = normalizeOptionalId(input.environmentId)

  if (profile.status === 'offline') {
    checks.push({
      code: 'gateway_offline',
      level: 'error',
      message: `${profile.name} is offline.`,
      hint: 'Refresh or repair the gateway before running evals through it.',
    })
  } else if (profile.status === 'degraded') {
    checks.push({
      code: 'gateway_degraded',
      level: 'warn',
      message: `${profile.name} is degraded.`,
      detail: profile.lastError || undefined,
    })
  } else if (profile.status === 'pending' || profile.status === 'unknown') {
    checks.push({
      code: 'gateway_unverified',
      level: 'warn',
      message: `${profile.name} has not reported a healthy gateway status yet.`,
    })
  }

  // Built from stored profile stats, which may predate a live refresh. It is
  // only reported once we know no fresher data contradicts it.
  const environmentCount = profile.stats?.environmentCount || 0
  const availableEnvironmentCount = profile.stats?.availableEnvironmentCount || 0
  const storedAvailabilityCheck: EvalEnvironmentCheck | null =
    environmentCount > 0 && availableEnvironmentCount === 0
      ? {
          code: 'no_available_gateway_environment',
          level: 'error',
          message: `${profile.name} has ${environmentCount} execution environment${environmentCount === 1 ? '' : 's'}, but none are available.`,
        }
      : null

  if (!input.refreshGateway) {
    if (storedAvailabilityCheck) checks.push(storedAvailabilityCheck)
    if (requestedEnvironmentId) {
      checks.push({
        code: 'environment_not_refreshed',
        level: 'warn',
        message: `Environment ${requestedEnvironmentId} was requested but not refreshed.`,
        hint: 'Run validation with refresh enabled to verify the exact environment.',
      })
      return { ...target, environmentId: requestedEnvironmentId }
    }
    checks.push({
      code: 'gateway_snapshot_only',
      level: 'info',
      message: 'Using the last stored gateway topology snapshot for validation.',
    })
    return target
  }

  let snapshot: OpenClawGatewayEnvironmentList | null = null
  let refreshFailure: string | undefined
  try {
    snapshot = await deps.listGatewayEnvironments(profile.id)
  } catch (error: unknown) {
    // A failed refresh is a validation result, not a reason to abort planning.
    refreshFailure = error instanceof Error ? error.message : String(error)
  }
  if (!snapshot) {
    // Without fresh data the stored stats are the best information available.
    if (storedAvailabilityCheck) checks.push(storedAvailabilityCheck)
    checks.push({
      code: 'gateway_environment_snapshot_missing',
      level: 'error',
      message: `${profile.name} could not be refreshed for environment validation.`,
      ...(refreshFailure ? { detail: refreshFailure } : {}),
    })
    return target
  }

  const environments = snapshot.environments
  const freshlyAvailable = environments.some((environment) => environment.status === 'available')
  if (storedAvailabilityCheck && !freshlyAvailable) checks.push(storedAvailabilityCheck)

  for (const error of snapshot.errors) {
    checks.push({
      code: 'gateway_environment_refresh_error',
      level: 'warn',
      message: `${error.method}: ${error.message}`,
    })
  }

  // An explicit request must match exactly; otherwise prefer an available
  // environment and fall back to the first reported one.
  const selected = requestedEnvironmentId
    ? environments.find((environment) => environment.id === requestedEnvironmentId) || null
    : environments.find((environment) => environment.status === 'available') || environments[0] || null
  if (requestedEnvironmentId && !selected) {
    checks.push({
      code: 'environment_not_found',
      level: 'error',
      message: `Requested execution environment ${requestedEnvironmentId} was not found on ${profile.name}.`,
    })
    return { ...target, environmentId: requestedEnvironmentId, refreshedAt: snapshot.refreshedAt }
  }
  if (!selected) {
    checks.push({
      code: 'no_gateway_environments',
      level: 'warn',
      message: `${profile.name} did not report any execution environments.`,
    })
    return { ...target, refreshedAt: snapshot.refreshedAt }
  }
  if (selected.status !== 'available') {
    checks.push({
      code: 'environment_unavailable',
      level: selected.status === 'error' ? 'error' : 'warn',
      message: `${selected.label || selected.id} is ${selected.status}.`,
    })
  } else {
    checks.push({
      code: 'environment_available',
      level: 'info',
      message: `${selected.label || selected.id} is available for validation runs.`,
    })
  }
  return {
    ...target,
    environmentId: selected.id,
    environmentLabel: selected.label || selected.id,
    environmentStatus: selected.status,
    capabilities: normalizeEnvironmentCapabilities(selected),
    refreshedAt: snapshot.refreshedAt,
  }
}
306
+
307
/**
 * Builds a pre-flight plan describing whether an agent can run the requested
 * eval scenario(s) and what the eval workspace will contain.
 *
 * The plan never throws for missing agents, scenarios or routes; each problem
 * is recorded as a check and the overall `status` is derived from the most
 * severe check level.
 *
 * @param input - Agent, scenario/suite selection and gateway options.
 * @param deps - Optional collaborator overrides; omitted members use the
 *   module's imported implementations.
 * @returns The assembled plan, including checks, resolved target, required
 *   tools, score/timeout totals, expected files and env hints.
 */
export async function buildEvalEnvironmentPlan(
  input: EvalEnvironmentPlanInput,
  deps: EvalEnvironmentPlanDeps = {},
): Promise<EvalEnvironmentPlan> {
  const now = deps.now || (() => Date.now())
  const generatedAt = now()
  const loadAgentsImpl = deps.loadAgents || (() => loadAgents() as Record<string, Agent>)
  const loadCredentialsImpl = deps.loadCredentials || (() => loadCredentials() as Record<string, unknown>)
  const listGatewayProfilesImpl = deps.listGatewayProfiles || listOpenClawGatewayProfiles
  const checkCliProviderReadyImpl = deps.checkCliProviderReady || checkCliProviderReady
  const checks: EvalEnvironmentCheck[] = []
  const { scenarios, missing } = scenarioSet(input)
  // Use the normalised scenario id so a blank id reports the same 'core'
  // fallback that scenarioSet() actually selected.
  const suite = normalizeOptionalId(input.suite) || (normalizeOptionalId(input.scenarioId) ? null : 'core')
  const agents = loadAgentsImpl()
  // Own-property lookup: ids such as 'constructor' must not resolve to
  // inherited Object.prototype members.
  const agent = Object.prototype.hasOwnProperty.call(agents, input.agentId)
    ? agents[input.agentId] || null
    : null
  const requiredTools = uniqueStrings(scenarios.flatMap((scenario) => scenario.tools || []))
  let target: EvalEnvironmentTarget | null = null

  if (missing) {
    checks.push({
      code: 'scenario_not_found',
      level: 'error',
      message: `Eval scenario ${missing} was not found.`,
    })
  } else if (scenarios.length === 0) {
    checks.push({
      code: 'scenario_set_empty',
      level: 'error',
      message: 'No eval scenarios matched the requested suite.',
    })
  }

  if (!agent) {
    checks.push({
      code: 'agent_not_found',
      level: 'error',
      message: `Agent ${input.agentId} was not found.`,
    })
  } else {
    if (agent.trashedAt) {
      checks.push({ code: 'agent_trashed', level: 'error', message: `${agent.name} is in trash.` })
    }
    if (agent.disabled) {
      checks.push({ code: 'agent_disabled', level: 'error', message: `${agent.name} is disabled.` })
    }

    const gatewayProfiles = listGatewayProfilesImpl()
    // Only the first route candidate is validated.
    const [route] = resolveAgentRouteCandidatesWithProfiles(agent, gatewayProfiles)
    if (!route) {
      checks.push({
        code: 'route_unresolved',
        level: 'error',
        message: `${agent.name} does not have a runnable provider/model route.`,
      })
    } else if (route.provider === 'openclaw') {
      const profile = pickGatewayProfile(route, gatewayProfiles, normalizeOptionalId(input.gatewayProfileId))
      if (!profile) {
        checks.push({
          code: 'gateway_profile_missing',
          level: 'error',
          message: 'No gateway profile is available for this agent route.',
        })
        target = summarizeGatewayTarget(route, null)
      } else {
        target = await attachGatewayEnvironment(
          summarizeGatewayTarget(route, profile),
          profile,
          checks,
          input,
          { listGatewayEnvironments: deps.listGatewayEnvironments || listOpenClawGatewayEnvironments },
        )
      }
    } else {
      target = summarizeLocalTarget(route)
      if (isCliProviderId(route.provider)) {
        const ready = checkCliProviderReadyImpl(route.provider)
        checks.push({
          code: ready.ok ? 'cli_provider_ready' : 'cli_provider_not_ready',
          level: ready.ok ? 'info' : 'error',
          message: ready.message,
          detail: ready.binaryPath,
        })
      } else if (providerNeedsCredential(route) && !credentialExists(route.credentialId, loadCredentialsImpl())) {
        // Only a warning: the hint below explains the run may still succeed.
        checks.push({
          code: 'credential_missing',
          level: 'warn',
          message: `${route.provider} does not have a stored credential for this route.`,
          hint: 'The run may still work if the provider is configured through environment variables.',
        })
      }
    }
  }

  if (requiredTools.length > 0) {
    checks.push({
      code: 'tools_declared',
      level: 'info',
      message: `${requiredTools.length} eval tool${requiredTools.length === 1 ? '' : 's'} will be enabled: ${requiredTools.join(', ')}.`,
    })
  } else {
    checks.push({
      code: 'no_tools_required',
      level: 'info',
      message: 'This eval scenario does not require tool access.',
    })
  }

  const envHints = buildEnvHints({ agent, scenarios, suite, target })

  return {
    generatedAt,
    status: statusFromChecks(checks),
    agentId: input.agentId,
    agentName: agent?.name || input.agentId,
    scenarioIds: scenarios.map((scenario) => scenario.id),
    suite,
    target,
    checks,
    requiredTools,
    missingTools: [],
    maxScore: maxScore(scenarios),
    timeoutMs: timeoutMs(scenarios),
    generatedFiles: baseGeneratedFiles(scenarios),
    envHints,
  }
}
433
+
434
/**
 * Resolves a fixture's relative path to an absolute file path inside the
 * workspace, rejecting anything that would land outside it.
 *
 * Rejected inputs: blank paths, absolute paths, paths that climb out of the
 * workspace via `..`, and paths that resolve to the workspace root itself
 * (e.g. '.' or 'sub/..'), since a fixture must be a file, not the directory.
 *
 * @param workspacePath - Root directory of the eval workspace.
 * @param fixture - Fixture whose `path` is resolved.
 * @returns The absolute destination path.
 * @throws Error with message `Unsafe eval fixture path: <path>` for rejected inputs.
 */
function safeFixtureDestination(workspacePath: string, fixture: EvalScenarioFixture): string {
  const requested = fixture.path.trim()
  if (!requested || path.isAbsolute(requested)) {
    throw new Error(`Unsafe eval fixture path: ${fixture.path}`)
  }

  const root = path.resolve(workspacePath)
  const destination = path.resolve(root, requested)
  // path.relative gives a root-independent containment test (a plain prefix
  // check breaks when the root is the filesystem root).
  const fromRoot = path.relative(root, destination)
  const escapesRoot = fromRoot === '..' || fromRoot.startsWith(`..${path.sep}`) || path.isAbsolute(fromRoot)
  if (!fromRoot || escapesRoot) {
    throw new Error(`Unsafe eval fixture path: ${fixture.path}`)
  }
  return destination
}
446
+
447
/**
 * Writes UTF-8 text to `filePath`, creating parent directories as needed and
 * making sure the content ends with a newline.
 *
 * @param mode - Optional file mode forwarded to `fs.writeFileSync`.
 */
function writeTextFile(filePath: string, content: string, mode?: number): void {
  const parentDir = path.dirname(filePath)
  fs.mkdirSync(parentDir, { recursive: true })

  let text = content
  if (!text.endsWith('\n')) {
    text = `${text}\n`
  }
  fs.writeFileSync(filePath, text, { encoding: 'utf8', mode })
}
451
+
452
/** Renders one env hint as `KEY="value"`, with the value JSON-quoted. */
function envLine(hint: EvalEnvironmentPlan['envHints'][number]): string {
  const { key, value } = hint
  const quoted = JSON.stringify(value)
  return key + '=' + quoted
}
455
+
456
/**
 * Materialises the eval workspace on disk: README.md, environment.json,
 * .env.swarmclaw-eval and every fixture declared by the scenario.
 *
 * Fixtures are written last, after the three generated files.
 *
 * @returns The list of files written, in the same shape the plan advertises.
 * @throws Error when a fixture path is rejected as unsafe.
 */
export function writeEvalEnvironmentWorkspace(options: WriteEvalWorkspaceOptions): EvalEnvironmentGeneratedFile[] {
  const { runId, workspacePath, scenario, plan } = options
  fs.mkdirSync(workspacePath, { recursive: true })
  const inWorkspace = (fileName: string): string => path.join(workspacePath, fileName)

  const readmeLines = [
    `# Eval Workspace: ${scenario.name}`,
    '',
    `Run ID: ${runId}`,
    `Agent: ${plan.agentName} (${plan.agentId})`,
    `Scenario: ${scenario.id}`,
    `Status at start: ${plan.status}`,
    '',
    'Runtime manifest: ./environment.json',
    'Environment hints: ./.env.swarmclaw-eval',
    '',
    'This directory is isolated for eval artifacts, fixtures, and generated outputs.',
  ]
  writeTextFile(inWorkspace('README.md'), readmeLines.join('\n'))

  const manifest = JSON.stringify({ runId, plan }, null, 2)
  writeTextFile(inWorkspace('environment.json'), manifest)

  const envLines = [
    '# Generated by SwarmClaw. Contains eval context only, not secrets.',
    `SWARMCLAW_EVAL_RUN_ID=${JSON.stringify(runId)}`,
  ]
  for (const hint of plan.envHints) {
    envLines.push(envLine(hint))
  }
  writeTextFile(inWorkspace('.env.swarmclaw-eval'), envLines.join('\n'))

  const fixtures = scenario.fixtures || []
  fixtures.forEach((fixture) => {
    const destination = safeFixtureDestination(workspacePath, fixture)
    writeTextFile(destination, fixture.content, fixture.mode)
  })

  // Same list the plan reports for this scenario.
  return baseGeneratedFiles([scenario])
}
495
+
496
/** Returns the workspace directory for an eval run: `<WORKSPACE_DIR>/evals/<runId>`. */
export function resolveEvalWorkspacePath(runId: string): string {
  const evalsRoot = path.join(WORKSPACE_DIR, 'evals')
  return path.join(evalsRoot, runId)
}
@@ -10,6 +10,7 @@ import { executeExecutionChatTurn } from '@/lib/server/execution-engine/chat-tur
10
10
  import { WORKSPACE_DIR } from '../data-dir'
11
11
  import type { Session } from '@/types'
12
12
  import { errorMessage } from '@/lib/shared-utils'
13
+ import { buildEvalEnvironmentPlan, writeEvalEnvironmentWorkspace } from './environment-plan'
13
14
 
14
15
  export function resolveEvalSessionCwd(runId: string): string {
15
16
  const dir = path.join(WORKSPACE_DIR, 'evals', runId)
@@ -17,7 +18,17 @@ export function resolveEvalSessionCwd(runId: string): string {
17
18
  return dir
18
19
  }
19
20
 
20
- export async function runEvalScenario(scenarioId: string, agentId: string): Promise<EvalRun> {
21
/** Optional gateway selection forwarded to the environment plan when running a scenario. */
export interface RunEvalScenarioOptions {
  /** Gateway profile to validate against; empty values are passed on as `null`. */
  gatewayProfileId?: string | null
  /** Gateway execution environment to validate; empty values are passed on as `null`. */
  environmentId?: string | null
  /** Only a literal `true` enables the live gateway refresh. */
  refreshGateway?: boolean
}
26
+
27
+ export async function runEvalScenario(
28
+ scenarioId: string,
29
+ agentId: string,
30
+ options: RunEvalScenarioOptions = {},
31
+ ): Promise<EvalRun> {
21
32
  const scenario = getScenario(scenarioId)
22
33
  if (!scenario) throw new Error(`Unknown eval scenario: ${scenarioId}`)
23
34
 
@@ -29,6 +40,13 @@ export async function runEvalScenario(scenarioId: string, agentId: string): Prom
29
40
  const sessionId = `eval-${runId}`
30
41
  const now = Date.now()
31
42
  const sessionCwd = resolveEvalSessionCwd(runId)
43
+ const environment = await buildEvalEnvironmentPlan({
44
+ agentId,
45
+ scenarioId,
46
+ gatewayProfileId: options.gatewayProfileId || null,
47
+ environmentId: options.environmentId || null,
48
+ refreshGateway: options.refreshGateway === true,
49
+ })
32
50
 
33
51
  const run: EvalRun = {
34
52
  id: runId,
@@ -40,6 +58,34 @@ export async function runEvalScenario(scenarioId: string, agentId: string): Prom
40
58
  maxScore: scenario.scoringCriteria.reduce((sum, c) => sum + c.weight, 0),
41
59
  details: [],
42
60
  sessionId,
61
+ environment,
62
+ }
63
+
64
+ writeEvalEnvironmentWorkspace({
65
+ runId,
66
+ workspacePath: sessionCwd,
67
+ scenario,
68
+ plan: environment,
69
+ })
70
+
71
+ if (environment.status === 'blocked') {
72
+ run.status = 'failed'
73
+ run.error = environment.checks
74
+ .filter((check) => check.level === 'error')
75
+ .map((check) => check.message)
76
+ .join(' ')
77
+ || 'Eval environment validation failed.'
78
+ run.endedAt = Date.now()
79
+ run.details = environment.checks
80
+ .filter((check) => check.level !== 'info')
81
+ .map((check) => ({
82
+ criterion: check.code,
83
+ score: 0,
84
+ maxScore: 0,
85
+ evidence: check.message,
86
+ }))
87
+ saveEvalRun(run)
88
+ return run
43
89
  }
44
90
 
45
91
  // Create temporary eval session
@@ -114,7 +160,7 @@ export async function runEvalScenario(scenarioId: string, agentId: string): Prom
114
160
 
115
161
  export async function runEvalSuite(
116
162
  agentId: string,
117
- opts: { categories?: string[]; suite?: string } = {},
163
+ opts: { categories?: string[]; suite?: string; gatewayProfileId?: string | null; environmentId?: string | null; refreshGateway?: boolean } = {},
118
164
  ): Promise<EvalSuiteResult> {
119
165
  let scenarios: EvalScenario[]
120
166
  if (opts.suite) {
@@ -130,7 +176,11 @@ export async function runEvalSuite(
130
176
 
131
177
  const runs: EvalRun[] = []
132
178
  for (const scenario of scenarios) {
133
- const evalRun = await runEvalScenario(scenario.id, agentId)
179
+ const evalRun = await runEvalScenario(scenario.id, agentId, {
180
+ gatewayProfileId: opts.gatewayProfileId || null,
181
+ environmentId: opts.environmentId || null,
182
+ refreshGateway: opts.refreshGateway === true,
183
+ })
134
184
  runs.push(evalRun)
135
185
  }
136
186
 
@@ -212,6 +212,24 @@ const CORE_SCENARIOS: EvalScenario[] = [
212
212
  ],
213
213
  timeoutMs: 60_000,
214
214
  tools: ['shell', 'files'],
215
+ fixtures: [
216
+ {
217
+ path: 'package.json',
218
+ content: JSON.stringify({
219
+ name: 'swarmclaw-eval-fixture',
220
+ version: '0.0.0',
221
+ private: true,
222
+ dependencies: {
223
+ '@modelcontextprotocol/sdk': '^1.29.0',
224
+ zod: '^4.1.13',
225
+ },
226
+ devDependencies: {
227
+ typescript: '^5.9.3',
228
+ tsx: '^4.20.6',
229
+ },
230
+ }, null, 2),
231
+ },
232
+ ],
215
233
  },
216
234
  ]
217
235
 
@@ -17,10 +17,64 @@ export interface EvalScenario {
17
17
  scoringCriteria: ScoringCriterion[]
18
18
  timeoutMs: number
19
19
  tools: string[]
20
+ fixtures?: EvalScenarioFixture[]
20
21
  /** Optional suite tag. Scenarios without a suite belong to the 'core' suite. */
21
22
  suite?: EvalSuite
22
23
  }
23
24
 
25
/** A file seeded into the eval workspace before a scenario runs. */
export interface EvalScenarioFixture {
  /** Destination path of the fixture file. */
  path: string
  /** Text content of the file. */
  content: string
  /** Optional file mode for the file. */
  mode?: number
}
30
+
31
/** Overall outcome of environment validation. */
export type EvalEnvironmentStatus =
  | 'ready'
  | 'warning'
  | 'blocked'

/** Severity of a single environment check. */
export type EvalEnvironmentCheckLevel =
  | 'info'
  | 'warn'
  | 'error'
33
+
34
/** One finding produced while validating an eval environment. */
export interface EvalEnvironmentCheck {
  /** Stable machine-readable identifier of the finding. */
  code: string
  /** Severity of the finding. */
  level: EvalEnvironmentCheckLevel
  /** Human-readable summary. */
  message: string
  /** Optional extra context. */
  detail?: string
  /** Optional suggestion for resolving the finding. */
  hint?: string
}
41
+
42
/** Where an eval would execute: locally or through a gateway. */
export interface EvalEnvironmentTarget {
  /** Execution path of the target. */
  kind: 'local' | 'gateway'
  /** Provider id of the resolved route. */
  provider: string
  /** Model of the resolved route. */
  model: string
  /** Display label for the target. */
  label: string
  /** Gateway profile backing the target, when there is one. */
  gatewayProfileId?: string | null
  /** Gateway execution environment id, when one was requested or selected. */
  environmentId?: string | null
  /** Display label of the selected environment. */
  environmentLabel?: string | null
  /** Status reported for the selected environment. */
  environmentStatus?: string | null
  /** Capability names associated with the target. */
  capabilities?: string[]
  /** Timestamp of the data the target was derived from, if known. */
  refreshedAt?: number | null
}
54
+
55
/** A file that belongs to a generated eval workspace. */
export interface EvalEnvironmentGeneratedFile {
  /** Path of the file relative to the workspace. */
  path: string
  /** Role of the file within the workspace. */
  kind: 'readme' | 'manifest' | 'env' | 'fixture'
  /** Whether the file is required. */
  required: boolean
}
60
+
61
/** Pre-flight description of an eval run's environment and readiness. */
export interface EvalEnvironmentPlan {
  /** Timestamp at which the plan was produced. */
  generatedAt: number
  /** Overall readiness of the environment. */
  status: EvalEnvironmentStatus
  /** Id of the agent under validation. */
  agentId: string
  /** Display name of the agent. */
  agentName: string
  /** Ids of the scenarios covered by the plan. */
  scenarioIds: string[]
  /** Suite the plan was built for, if any. */
  suite?: string | null
  /** Resolved execution target, or `null` when none could be resolved. */
  target: EvalEnvironmentTarget | null
  /** Findings collected during validation. */
  checks: EvalEnvironmentCheck[]
  /** Tools the scenarios declare. */
  requiredTools: string[]
  /** Required tools that are not available. */
  missingTools: string[]
  /** Highest achievable score across the scenarios. */
  maxScore: number
  /** Combined timeout for the scenarios, in milliseconds. */
  timeoutMs: number
  /** Files the eval workspace is expected to contain. */
  generatedFiles: EvalEnvironmentGeneratedFile[]
  /** Key/value hints written to the workspace env file. */
  envHints: { key: string; value: string; description?: string }[]
}
77
+
24
78
  export interface EvalRun {
25
79
  id: string
26
80
  scenarioId: string
@@ -32,6 +86,7 @@ export interface EvalRun {
32
86
  maxScore: number
33
87
  details: EvalCriterionResult[]
34
88
  sessionId?: string
89
+ environment?: EvalEnvironmentPlan
35
90
  error?: string
36
91
  }
37
92