@swarmclawai/swarmclaw 1.9.5 → 1.9.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -0
- package/package.json +2 -2
- package/src/app/api/eval/environments/route.ts +59 -0
- package/src/app/api/eval/run/route.ts +8 -1
- package/src/app/api/eval/suite/route.ts +6 -0
- package/src/cli/index.js +2 -0
- package/src/components/quality/quality-workspace.tsx +149 -5
- package/src/lib/server/eval/environment-plan.test.ts +221 -0
- package/src/lib/server/eval/environment-plan.ts +498 -0
- package/src/lib/server/eval/runner.ts +53 -3
- package/src/lib/server/eval/scenarios.ts +18 -0
- package/src/lib/server/eval/types.ts +55 -0
package/README.md
CHANGED
|
@@ -399,6 +399,16 @@ Operational docs: https://swarmclaw.ai/docs/observability
|
|
|
399
399
|
|
|
400
400
|
## Releases
|
|
401
401
|
|
|
402
|
+
### v1.9.6 Highlights
|
|
403
|
+
|
|
404
|
+
Bundled eval-environment release: validation preflights, deterministic eval workspaces, and clearer operator readiness before spending run budget.
|
|
405
|
+
|
|
406
|
+
- **Eval validation environments.** `/api/eval/environments` now resolves the selected agent route, gateway target, scenario tools, generated files, and readiness checks before an eval runs.
|
|
407
|
+
- **Workspace manifests.** Eval runs now write `environment.json`, `.env.swarmclaw-eval`, and a task-focused `README.md` into each isolated eval workspace without embedding secrets.
|
|
408
|
+
- **Scenario fixtures.** Eval scenarios can declare fixture files, and the package-analysis scenario now gets a deterministic `package.json` in its workspace.
|
|
409
|
+
- **Fail-fast readiness.** Blocked evals stop before model execution when the agent route, CLI provider, gateway profile, or execution environment is not ready.
|
|
410
|
+
- **Quality UI preflight.** The Eval Lab now shows target status, gateway environment, checks, tools, and generated files next to the selected scenario.
|
|
411
|
+
|
|
402
412
|
### v1.9.5 Highlights
|
|
403
413
|
|
|
404
414
|
Bundled portability release: project-scoped workspace bundles, safer v2 imports, and preserved internal relationships for reusable teams.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@swarmclawai/swarmclaw",
|
|
3
|
-
"version": "1.9.5",
|
|
3
|
+
"version": "1.9.6",
|
|
4
4
|
"description": "Build and run autonomous AI agents with OpenClaw, Hermes, multiple model providers, orchestration, delegation, memory, skills, schedules, and chat connectors.",
|
|
5
5
|
"main": "electron-dist/main.js",
|
|
6
6
|
"license": "MIT",
|
|
@@ -87,7 +87,7 @@
|
|
|
87
87
|
"test:cli": "node --test src/cli/*.test.js bin/*.test.js scripts/electron-after-pack.test.mjs scripts/ensure-sandbox-browser-image.test.mjs scripts/postinstall.test.mjs scripts/run-next-build.test.mjs scripts/run-next-typegen.test.mjs",
|
|
88
88
|
"test:setup": "tsx --test src/app/api/setup/check-provider/route.test.ts src/lib/server/provider-model-discovery.test.ts src/components/auth/setup-wizard/utils.test.ts src/components/auth/setup-wizard/types.test.ts src/hooks/setup-done-detection.test.ts src/lib/setup-defaults.test.ts src/lib/server/storage-auth.test.ts src/lib/server/storage-auth-docker.test.ts",
|
|
89
89
|
"test:openclaw": "tsx --test src/lib/openclaw/openclaw-agent-id.test.ts src/lib/openclaw/openclaw-endpoint.test.ts src/lib/server/agents/agent-runtime-config.test.ts src/lib/server/build-llm.test.ts src/lib/server/connectors/connector-routing.test.ts src/lib/server/connectors/openclaw.test.ts src/lib/server/connectors/swarmdock.test.ts src/lib/server/gateway/protocol.test.ts src/lib/server/gateways/gateway-topology.test.ts src/lib/server/llm-response-cache.test.ts src/lib/server/mcp-conformance.test.ts src/lib/server/openclaw/agent-resolver.test.ts src/lib/server/openclaw/deploy.test.ts src/lib/server/openclaw/skills-normalize.test.ts src/lib/server/session-tools/openclaw-nodes.test.ts src/lib/server/session-tools/swarmdock.test.ts src/lib/server/tasks/task-quality-gate.test.ts src/lib/server/tasks/task-validation.test.ts src/lib/server/tool-capability-policy.test.ts src/lib/providers/openai.test.ts src/lib/providers/openclaw-exports.test.ts src/app/api/gateways/topology-route.test.ts src/app/api/openclaw/dashboard-url/route.test.ts",
|
|
90
|
-
"test:runtime": "tsx --test src/lib/a2a/agent-card.test.ts src/lib/strip-internal-metadata.test.ts src/lib/provider-sets.test.ts src/lib/providers/opencode-cli.test.ts src/lib/providers/cli-provider-metadata.test.ts src/lib/providers/cli-utils.test.ts src/lib/providers/generic-cli.test.ts src/lib/server/agents/delegation-advisory.test.ts src/lib/server/cli-provider-readiness.test.ts src/lib/server/provider-health.test.ts src/lib/server/mcp-gateway-runtime.test.ts src/lib/server/mcp-connection-pool.test.ts src/lib/server/knowledge-sources.test.ts src/lib/server/extension-managed-resources.test.ts src/lib/server/chat-execution/chat-execution-grounding.test.ts src/lib/server/chat-execution/chat-turn-preparation.test.ts src/lib/server/chat-execution/iteration-timers.test.ts src/lib/server/chat-execution/post-stream-finalization.test.ts src/lib/server/chat-execution/reasoning-tag-scrubber.test.ts src/lib/server/chats/clear-undo-snapshots.test.ts src/lib/server/connectors/email.test.ts src/lib/server/protocols/protocol-service.test.ts src/lib/server/runtime/run-ledger.test.ts src/lib/server/runtime/queue-retry-policy.test.ts src/lib/server/runs/run-brief.test.ts src/lib/server/operations/operation-pulse.test.ts src/lib/server/artifacts/artifact-resolver.test.ts src/lib/server/observability/otel-config.test.ts src/lib/server/safe-parse-body.test.ts src/lib/server/missions/mission-templates.test.ts src/lib/server/sharing/share-link-repository.test.ts src/lib/server/sharing/share-resolver.test.ts src/lib/server/tasks/task-execution-workspace.test.ts src/lib/server/tasks/task-service.test.ts src/lib/server/session-tools/execute.test.ts src/lib/server/session-tools/manage-tasks.test.ts src/lib/app/view-constants.test.ts src/lib/quality/quality-summary.test.ts src/app/api/approvals/route.test.ts src/app/api/agents/agents-route.test.ts src/app/api/tasks/tasks-route.test.ts src/app/api/tasks/task-workspace-route.test.ts src/app/api/chats/chat-route.test.ts 
src/app/api/chats/clear-route.test.ts src/app/api/chats/compact-route.test.ts src/app/api/chats/context-status-route.test.ts src/app/api/connectors/connector-doctor-route.test.ts src/app/api/extensions/managed-resources/route.test.ts src/app/api/healthz/route.test.ts src/app/api/logs/route.test.ts src/app/api/portability/export/route.test.ts src/app/api/portability/import/route.test.ts src/app/api/providers/[id]/route.test.ts src/app/api/tts/route.test.ts",
|
|
90
|
+
"test:runtime": "tsx --test src/lib/a2a/agent-card.test.ts src/lib/strip-internal-metadata.test.ts src/lib/provider-sets.test.ts src/lib/providers/opencode-cli.test.ts src/lib/providers/cli-provider-metadata.test.ts src/lib/providers/cli-utils.test.ts src/lib/providers/generic-cli.test.ts src/lib/server/agents/delegation-advisory.test.ts src/lib/server/cli-provider-readiness.test.ts src/lib/server/provider-health.test.ts src/lib/server/mcp-gateway-runtime.test.ts src/lib/server/mcp-connection-pool.test.ts src/lib/server/knowledge-sources.test.ts src/lib/server/extension-managed-resources.test.ts src/lib/server/eval/environment-plan.test.ts src/lib/server/chat-execution/chat-execution-grounding.test.ts src/lib/server/chat-execution/chat-turn-preparation.test.ts src/lib/server/chat-execution/iteration-timers.test.ts src/lib/server/chat-execution/post-stream-finalization.test.ts src/lib/server/chat-execution/reasoning-tag-scrubber.test.ts src/lib/server/chats/clear-undo-snapshots.test.ts src/lib/server/connectors/email.test.ts src/lib/server/protocols/protocol-service.test.ts src/lib/server/runtime/run-ledger.test.ts src/lib/server/runtime/queue-retry-policy.test.ts src/lib/server/runs/run-brief.test.ts src/lib/server/operations/operation-pulse.test.ts src/lib/server/artifacts/artifact-resolver.test.ts src/lib/server/observability/otel-config.test.ts src/lib/server/safe-parse-body.test.ts src/lib/server/missions/mission-templates.test.ts src/lib/server/sharing/share-link-repository.test.ts src/lib/server/sharing/share-resolver.test.ts src/lib/server/tasks/task-execution-workspace.test.ts src/lib/server/tasks/task-service.test.ts src/lib/server/session-tools/execute.test.ts src/lib/server/session-tools/manage-tasks.test.ts src/lib/app/view-constants.test.ts src/lib/quality/quality-summary.test.ts src/app/api/approvals/route.test.ts src/app/api/agents/agents-route.test.ts src/app/api/tasks/tasks-route.test.ts src/app/api/tasks/task-workspace-route.test.ts 
src/app/api/chats/chat-route.test.ts src/app/api/chats/clear-route.test.ts src/app/api/chats/compact-route.test.ts src/app/api/chats/context-status-route.test.ts src/app/api/connectors/connector-doctor-route.test.ts src/app/api/extensions/managed-resources/route.test.ts src/app/api/healthz/route.test.ts src/app/api/logs/route.test.ts src/app/api/portability/export/route.test.ts src/app/api/portability/import/route.test.ts src/app/api/providers/[id]/route.test.ts src/app/api/tts/route.test.ts",
|
|
91
91
|
"test:builder": "tsx --test src/features/protocols/builder/utils/nodes-to-template.test.ts src/features/protocols/builder/utils/template-to-nodes.test.ts src/features/protocols/builder/validators/dag-validator.test.ts",
|
|
92
92
|
"test:e2e": "node --import tsx scripts/browser-e2e-smoke.ts",
|
|
93
93
|
"test:mcp:conformance": "node --import tsx ./scripts/mcp-conformance-check.ts",
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import { NextResponse } from 'next/server'
|
|
2
|
+
import { z } from 'zod'
|
|
3
|
+
|
|
4
|
+
import { buildEvalEnvironmentPlan } from '@/lib/server/eval/environment-plan'
|
|
5
|
+
import { errorMessage } from '@/lib/shared-utils'
|
|
6
|
+
|
|
7
|
+
const PlanSchema = z.object({
|
|
8
|
+
agentId: z.string().min(1),
|
|
9
|
+
scenarioId: z.string().min(1).nullable().optional(),
|
|
10
|
+
suite: z.string().min(1).nullable().optional(),
|
|
11
|
+
gatewayProfileId: z.string().min(1).nullable().optional(),
|
|
12
|
+
environmentId: z.string().min(1).nullable().optional(),
|
|
13
|
+
refreshGateway: z.boolean().optional(),
|
|
14
|
+
})
|
|
15
|
+
|
|
16
|
+
function readBoolean(value: string | null): boolean {
|
|
17
|
+
return value === '1' || value === 'true'
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
export async function GET(req: Request) {
|
|
21
|
+
try {
|
|
22
|
+
const { searchParams } = new URL(req.url)
|
|
23
|
+
const parsed = PlanSchema.safeParse({
|
|
24
|
+
agentId: searchParams.get('agentId') || '',
|
|
25
|
+
scenarioId: searchParams.get('scenarioId'),
|
|
26
|
+
suite: searchParams.get('suite'),
|
|
27
|
+
gatewayProfileId: searchParams.get('gatewayProfileId'),
|
|
28
|
+
environmentId: searchParams.get('environmentId'),
|
|
29
|
+
refreshGateway: readBoolean(searchParams.get('refreshGateway')),
|
|
30
|
+
})
|
|
31
|
+
if (!parsed.success) {
|
|
32
|
+
return NextResponse.json(
|
|
33
|
+
{ error: parsed.error.issues.map((issue) => issue.message).join(', ') },
|
|
34
|
+
{ status: 400 },
|
|
35
|
+
)
|
|
36
|
+
}
|
|
37
|
+
const plan = await buildEvalEnvironmentPlan(parsed.data)
|
|
38
|
+
return NextResponse.json(plan)
|
|
39
|
+
} catch (err: unknown) {
|
|
40
|
+
return NextResponse.json({ error: errorMessage(err) }, { status: 500 })
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
export async function POST(req: Request) {
|
|
45
|
+
try {
|
|
46
|
+
const body: unknown = await req.json()
|
|
47
|
+
const parsed = PlanSchema.safeParse(body)
|
|
48
|
+
if (!parsed.success) {
|
|
49
|
+
return NextResponse.json(
|
|
50
|
+
{ error: parsed.error.issues.map((issue) => issue.message).join(', ') },
|
|
51
|
+
{ status: 400 },
|
|
52
|
+
)
|
|
53
|
+
}
|
|
54
|
+
const plan = await buildEvalEnvironmentPlan(parsed.data)
|
|
55
|
+
return NextResponse.json(plan)
|
|
56
|
+
} catch (err: unknown) {
|
|
57
|
+
return NextResponse.json({ error: errorMessage(err) }, { status: 500 })
|
|
58
|
+
}
|
|
59
|
+
}
|
|
@@ -7,6 +7,9 @@ import { errorMessage } from '@/lib/shared-utils'
|
|
|
7
7
|
const RunSchema = z.object({
|
|
8
8
|
scenarioId: z.string().min(1),
|
|
9
9
|
agentId: z.string().min(1),
|
|
10
|
+
gatewayProfileId: z.string().min(1).nullable().optional(),
|
|
11
|
+
environmentId: z.string().min(1).nullable().optional(),
|
|
12
|
+
refreshGateway: z.boolean().optional(),
|
|
10
13
|
})
|
|
11
14
|
|
|
12
15
|
export async function POST(req: Request) {
|
|
@@ -20,7 +23,11 @@ export async function POST(req: Request) {
|
|
|
20
23
|
)
|
|
21
24
|
}
|
|
22
25
|
|
|
23
|
-
const result = await runEvalScenario(parsed.data.scenarioId, parsed.data.agentId)
|
|
26
|
+
const result = await runEvalScenario(parsed.data.scenarioId, parsed.data.agentId, {
|
|
27
|
+
gatewayProfileId: parsed.data.gatewayProfileId || null,
|
|
28
|
+
environmentId: parsed.data.environmentId || null,
|
|
29
|
+
refreshGateway: parsed.data.refreshGateway === true,
|
|
30
|
+
})
|
|
24
31
|
return NextResponse.json(result)
|
|
25
32
|
} catch (err: unknown) {
|
|
26
33
|
return NextResponse.json(
|
|
@@ -7,6 +7,9 @@ const SuiteSchema = z.object({
|
|
|
7
7
|
agentId: z.string().min(1),
|
|
8
8
|
categories: z.array(z.string()).optional(),
|
|
9
9
|
suite: z.string().min(1).optional(),
|
|
10
|
+
gatewayProfileId: z.string().min(1).nullable().optional(),
|
|
11
|
+
environmentId: z.string().min(1).nullable().optional(),
|
|
12
|
+
refreshGateway: z.boolean().optional(),
|
|
10
13
|
})
|
|
11
14
|
|
|
12
15
|
export async function POST(req: Request) {
|
|
@@ -23,6 +26,9 @@ export async function POST(req: Request) {
|
|
|
23
26
|
const result = await runEvalSuite(parsed.data.agentId, {
|
|
24
27
|
categories: parsed.data.categories,
|
|
25
28
|
suite: parsed.data.suite,
|
|
29
|
+
gatewayProfileId: parsed.data.gatewayProfileId || null,
|
|
30
|
+
environmentId: parsed.data.environmentId || null,
|
|
31
|
+
refreshGateway: parsed.data.refreshGateway === true,
|
|
26
32
|
})
|
|
27
33
|
return NextResponse.json(result)
|
|
28
34
|
} catch (err: unknown) {
|
package/src/cli/index.js
CHANGED
|
@@ -231,8 +231,10 @@ const COMMAND_GROUPS = [
|
|
|
231
231
|
cmd('scenarios', 'GET', '/eval/scenarios', 'List available eval scenarios'),
|
|
232
232
|
cmd('suites', 'GET', '/eval/suites', 'List available eval suites (core, swe-bench-lite, gaia-l1, ...)'),
|
|
233
233
|
cmd('status', 'GET', '/eval/run', 'Get eval run status'),
|
|
234
|
+
cmd('environment', 'GET', '/eval/environments', 'Preview validation environment readiness for an eval'),
|
|
234
235
|
cmd('run', 'POST', '/eval/run', 'Run an eval scenario against an agent', { expectsJsonBody: true }),
|
|
235
236
|
cmd('suite', 'POST', '/eval/suite', 'Run a full eval suite against an agent (pass { suite: "swe-bench-lite" } in body)', { expectsJsonBody: true }),
|
|
237
|
+
cmd('environment-prepare', 'POST', '/eval/environments', 'Prepare validation environment readiness for an eval', { expectsJsonBody: true }),
|
|
236
238
|
],
|
|
237
239
|
},
|
|
238
240
|
{
|
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
} from '@/lib/quality/quality-summary'
|
|
18
18
|
import { cn } from '@/lib/utils'
|
|
19
19
|
import { useAppStore } from '@/stores/use-app-store'
|
|
20
|
-
import type { EvalRun, EvalSuiteResult } from '@/lib/server/eval/types'
|
|
20
|
+
import type { EvalEnvironmentPlan, EvalRun, EvalSuiteResult } from '@/lib/server/eval/types'
|
|
21
21
|
import type { Agent, ApprovalRequest, SessionRunRecord } from '@/types'
|
|
22
22
|
|
|
23
23
|
type QualityTab = 'overview' | 'evals' | 'approvals' | 'runs'
|
|
@@ -105,6 +105,96 @@ function EmptyState({ title, description }: { title: string; description: string
|
|
|
105
105
|
)
|
|
106
106
|
}
|
|
107
107
|
|
|
108
|
+
function environmentStatusClass(status: EvalEnvironmentPlan['status']): string {
|
|
109
|
+
if (status === 'ready') return 'border-emerald-500/25 bg-emerald-500/10 text-emerald-200'
|
|
110
|
+
if (status === 'warning') return 'border-amber-500/25 bg-amber-500/10 text-amber-200'
|
|
111
|
+
return 'border-rose-500/25 bg-rose-500/10 text-rose-200'
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
function checkClass(level: 'info' | 'warn' | 'error'): string {
|
|
115
|
+
if (level === 'error') return 'border-rose-500/20 bg-rose-500/[0.05] text-rose-200'
|
|
116
|
+
if (level === 'warn') return 'border-amber-500/20 bg-amber-500/[0.05] text-amber-200'
|
|
117
|
+
return 'border-white/[0.06] bg-white/[0.025] text-text-3'
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
function EvalEnvironmentPanel({ plan, loading, onRefresh }: {
|
|
121
|
+
plan: EvalEnvironmentPlan | null
|
|
122
|
+
loading: boolean
|
|
123
|
+
onRefresh: () => void
|
|
124
|
+
}) {
|
|
125
|
+
return (
|
|
126
|
+
<div className="rounded-[12px] border border-white/[0.06] bg-white/[0.025] px-3 py-3">
|
|
127
|
+
<div className="flex items-start justify-between gap-3">
|
|
128
|
+
<div>
|
|
129
|
+
<div className="text-[13px] font-800 text-text">Validation environment</div>
|
|
130
|
+
<p className="mt-1 text-[11px] leading-relaxed text-text-3/65">
|
|
131
|
+
Preflight checks, workspace context, and generated files for the selected eval.
|
|
132
|
+
</p>
|
|
133
|
+
</div>
|
|
134
|
+
<button
|
|
135
|
+
type="button"
|
|
136
|
+
onClick={onRefresh}
|
|
137
|
+
disabled={loading}
|
|
138
|
+
className="shrink-0 rounded-[8px] border border-white/[0.08] px-2 py-1 text-[10px] font-800 text-text-2 transition-colors hover:bg-white/[0.06] disabled:opacity-40"
|
|
139
|
+
>
|
|
140
|
+
{loading ? 'Checking' : 'Refresh'}
|
|
141
|
+
</button>
|
|
142
|
+
</div>
|
|
143
|
+
{!plan ? (
|
|
144
|
+
<div className="mt-3 text-[11px] text-text-3/60">{loading ? 'Checking readiness...' : 'Choose an agent and scenario.'}</div>
|
|
145
|
+
) : (
|
|
146
|
+
<div className="mt-3 flex flex-col gap-3">
|
|
147
|
+
<div className="flex flex-wrap items-center gap-2">
|
|
148
|
+
<span className={cn('rounded-full border px-2 py-1 text-[10px] font-800 uppercase tracking-[0.08em]', environmentStatusClass(plan.status))}>
|
|
149
|
+
{plan.status}
|
|
150
|
+
</span>
|
|
151
|
+
{plan.target && (
|
|
152
|
+
<span className="rounded-full bg-white/[0.05] px-2 py-1 text-[10px] font-700 text-text-3">
|
|
153
|
+
{plan.target.kind} - {plan.target.label}
|
|
154
|
+
</span>
|
|
155
|
+
)}
|
|
156
|
+
<span className="rounded-full bg-white/[0.05] px-2 py-1 text-[10px] font-700 text-text-3">
|
|
157
|
+
{plan.requiredTools.length} tool{plan.requiredTools.length === 1 ? '' : 's'}
|
|
158
|
+
</span>
|
|
159
|
+
<span className="rounded-full bg-white/[0.05] px-2 py-1 text-[10px] font-700 text-text-3">
|
|
160
|
+
{plan.generatedFiles.length} file{plan.generatedFiles.length === 1 ? '' : 's'}
|
|
161
|
+
</span>
|
|
162
|
+
</div>
|
|
163
|
+
{plan.target?.environmentLabel && (
|
|
164
|
+
<div className="rounded-[10px] border border-white/[0.06] bg-white/[0.02] px-3 py-2 text-[11px] text-text-3/70">
|
|
165
|
+
Environment: <span className="font-700 text-text-2">{plan.target.environmentLabel}</span>
|
|
166
|
+
{plan.target.environmentStatus ? ` (${plan.target.environmentStatus})` : ''}
|
|
167
|
+
</div>
|
|
168
|
+
)}
|
|
169
|
+
<div className="flex flex-col gap-1.5">
|
|
170
|
+
{plan.checks.slice(0, 4).map((check) => (
|
|
171
|
+
<div key={`${check.code}:${check.message}`} className={cn('rounded-[9px] border px-2.5 py-2 text-[11px] leading-relaxed', checkClass(check.level))}>
|
|
172
|
+
<span className="font-800 uppercase tracking-[0.08em]">{check.level}</span>
|
|
173
|
+
<span className="ml-2">{check.message}</span>
|
|
174
|
+
</div>
|
|
175
|
+
))}
|
|
176
|
+
{plan.checks.length > 4 && (
|
|
177
|
+
<div className="text-[10px] text-text-3/55">+{plan.checks.length - 4} more check{plan.checks.length - 4 === 1 ? '' : 's'}</div>
|
|
178
|
+
)}
|
|
179
|
+
</div>
|
|
180
|
+
<div className="flex flex-wrap gap-1.5">
|
|
181
|
+
{plan.generatedFiles.slice(0, 5).map((file) => (
|
|
182
|
+
<span key={`${file.kind}:${file.path}`} className="rounded-full bg-white/[0.04] px-2 py-1 text-[10px] font-700 text-text-3">
|
|
183
|
+
{file.path}
|
|
184
|
+
</span>
|
|
185
|
+
))}
|
|
186
|
+
{plan.generatedFiles.length > 5 && (
|
|
187
|
+
<span className="rounded-full bg-white/[0.04] px-2 py-1 text-[10px] font-700 text-text-3">
|
|
188
|
+
+{plan.generatedFiles.length - 5}
|
|
189
|
+
</span>
|
|
190
|
+
)}
|
|
191
|
+
</div>
|
|
192
|
+
</div>
|
|
193
|
+
)}
|
|
194
|
+
</div>
|
|
195
|
+
)
|
|
196
|
+
}
|
|
197
|
+
|
|
108
198
|
export function QualityWorkspace() {
|
|
109
199
|
const router = useRouter()
|
|
110
200
|
const searchParams = useSearchParams()
|
|
@@ -127,6 +217,8 @@ export function QualityWorkspace() {
|
|
|
127
217
|
const [selectedSuite, setSelectedSuite] = useState('core')
|
|
128
218
|
const [selectedScenarioId, setSelectedScenarioId] = useState('')
|
|
129
219
|
const [evalBusy, setEvalBusy] = useState<string | null>(null)
|
|
220
|
+
const [evalEnvironmentPlan, setEvalEnvironmentPlan] = useState<EvalEnvironmentPlan | null>(null)
|
|
221
|
+
const [evalEnvironmentLoading, setEvalEnvironmentLoading] = useState(false)
|
|
130
222
|
const [approvalBusy, setApprovalBusy] = useState<string | null>(null)
|
|
131
223
|
|
|
132
224
|
useEffect(() => {
|
|
@@ -170,6 +262,27 @@ export function QualityWorkspace() {
|
|
|
170
262
|
}
|
|
171
263
|
}, [])
|
|
172
264
|
|
|
265
|
+
const loadEvalEnvironmentPlan = useCallback(async (opts: { refreshGateway?: boolean } = {}) => {
|
|
266
|
+
if (!selectedAgentId) {
|
|
267
|
+
setEvalEnvironmentPlan(null)
|
|
268
|
+
return
|
|
269
|
+
}
|
|
270
|
+
const params = new URLSearchParams({ agentId: selectedAgentId })
|
|
271
|
+
if (selectedScenarioId) params.set('scenarioId', selectedScenarioId)
|
|
272
|
+
else if (selectedSuite) params.set('suite', selectedSuite)
|
|
273
|
+
if (opts.refreshGateway) params.set('refreshGateway', 'true')
|
|
274
|
+
setEvalEnvironmentLoading(true)
|
|
275
|
+
try {
|
|
276
|
+
const plan = await api<EvalEnvironmentPlan>('GET', `/eval/environments?${params.toString()}`, undefined, { timeoutMs: opts.refreshGateway ? 20_000 : 8_000 })
|
|
277
|
+
setEvalEnvironmentPlan(plan)
|
|
278
|
+
} catch (err) {
|
|
279
|
+
setEvalEnvironmentPlan(null)
|
|
280
|
+
toast.error(err instanceof Error ? err.message : 'Unable to validate eval environment')
|
|
281
|
+
} finally {
|
|
282
|
+
setEvalEnvironmentLoading(false)
|
|
283
|
+
}
|
|
284
|
+
}, [selectedAgentId, selectedScenarioId, selectedSuite])
|
|
285
|
+
|
|
173
286
|
useEffect(() => {
|
|
174
287
|
void loadQualityData()
|
|
175
288
|
}, [loadQualityData])
|
|
@@ -184,6 +297,10 @@ export function QualityWorkspace() {
|
|
|
184
297
|
if (!selectedScenarioId && scenarios[0]) setSelectedScenarioId(scenarios[0].id)
|
|
185
298
|
}, [scenarios, selectedScenarioId])
|
|
186
299
|
|
|
300
|
+
useEffect(() => {
|
|
301
|
+
void loadEvalEnvironmentPlan()
|
|
302
|
+
}, [loadEvalEnvironmentPlan])
|
|
303
|
+
|
|
187
304
|
useEffect(() => {
|
|
188
305
|
if (!suites.some((suite) => suite.name === selectedSuite) && suites[0]) {
|
|
189
306
|
setSelectedSuite(suites[0].name)
|
|
@@ -208,34 +325,56 @@ export function QualityWorkspace() {
|
|
|
208
325
|
toast.error('Choose an agent and scenario first')
|
|
209
326
|
return
|
|
210
327
|
}
|
|
328
|
+
if (evalEnvironmentPlan?.status === 'blocked') {
|
|
329
|
+
toast.error('Fix the validation environment before running this eval')
|
|
330
|
+
return
|
|
331
|
+
}
|
|
211
332
|
setEvalBusy(`scenario:${selectedScenarioId}`)
|
|
212
333
|
try {
|
|
213
|
-
await api<EvalRun>('POST', '/eval/run', {
|
|
334
|
+
await api<EvalRun>('POST', '/eval/run', {
|
|
335
|
+
agentId: selectedAgentId,
|
|
336
|
+
scenarioId: selectedScenarioId,
|
|
337
|
+
gatewayProfileId: evalEnvironmentPlan?.target?.gatewayProfileId || null,
|
|
338
|
+
environmentId: evalEnvironmentPlan?.target?.environmentId || null,
|
|
339
|
+
refreshGateway: evalEnvironmentPlan?.target?.kind === 'gateway',
|
|
340
|
+
}, { timeoutMs: 180_000 })
|
|
214
341
|
toast.success('Eval scenario completed')
|
|
215
342
|
await loadQualityData({ silent: true })
|
|
343
|
+
await loadEvalEnvironmentPlan()
|
|
216
344
|
} catch (err) {
|
|
217
345
|
toast.error(err instanceof Error ? err.message : 'Eval scenario failed')
|
|
218
346
|
} finally {
|
|
219
347
|
setEvalBusy(null)
|
|
220
348
|
}
|
|
221
|
-
}, [loadQualityData, selectedAgentId, selectedScenarioId])
|
|
349
|
+
}, [evalEnvironmentPlan, loadEvalEnvironmentPlan, loadQualityData, selectedAgentId, selectedScenarioId])
|
|
222
350
|
|
|
223
351
|
const runSuite = useCallback(async (suiteName: string) => {
|
|
224
352
|
if (!selectedAgentId) {
|
|
225
353
|
toast.error('Choose an agent first')
|
|
226
354
|
return
|
|
227
355
|
}
|
|
356
|
+
if (evalEnvironmentPlan?.status === 'blocked') {
|
|
357
|
+
toast.error('Fix the validation environment before running this suite')
|
|
358
|
+
return
|
|
359
|
+
}
|
|
228
360
|
setEvalBusy(`suite:${suiteName}`)
|
|
229
361
|
try {
|
|
230
|
-
const result = await api<EvalSuiteResult>('POST', '/eval/suite', {
|
|
362
|
+
const result = await api<EvalSuiteResult>('POST', '/eval/suite', {
|
|
363
|
+
agentId: selectedAgentId,
|
|
364
|
+
suite: suiteName,
|
|
365
|
+
gatewayProfileId: evalEnvironmentPlan?.target?.gatewayProfileId || null,
|
|
366
|
+
environmentId: evalEnvironmentPlan?.target?.environmentId || null,
|
|
367
|
+
refreshGateway: evalEnvironmentPlan?.target?.kind === 'gateway',
|
|
368
|
+
}, { timeoutMs: 300_000 })
|
|
231
369
|
toast.success(`Suite completed at ${Math.round(result.percentage)}%`)
|
|
232
370
|
await loadQualityData({ silent: true })
|
|
371
|
+
await loadEvalEnvironmentPlan()
|
|
233
372
|
} catch (err) {
|
|
234
373
|
toast.error(err instanceof Error ? err.message : 'Eval suite failed')
|
|
235
374
|
} finally {
|
|
236
375
|
setEvalBusy(null)
|
|
237
376
|
}
|
|
238
|
-
}, [loadQualityData, selectedAgentId])
|
|
377
|
+
}, [evalEnvironmentPlan, loadEvalEnvironmentPlan, loadQualityData, selectedAgentId])
|
|
239
378
|
|
|
240
379
|
const actOnApproval = useCallback(async (approval: ApprovalRequest, approved: boolean) => {
|
|
241
380
|
setApprovalBusy(approval.id)
|
|
@@ -456,6 +595,11 @@ export function QualityWorkspace() {
|
|
|
456
595
|
</div>
|
|
457
596
|
</div>
|
|
458
597
|
)}
|
|
598
|
+
<EvalEnvironmentPanel
|
|
599
|
+
plan={evalEnvironmentPlan}
|
|
600
|
+
loading={evalEnvironmentLoading}
|
|
601
|
+
onRefresh={() => void loadEvalEnvironmentPlan({ refreshGateway: true })}
|
|
602
|
+
/>
|
|
459
603
|
<button
|
|
460
604
|
type="button"
|
|
461
605
|
onClick={() => openMissionTemplate('release-candidate-qa')}
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
import assert from 'node:assert/strict'
|
|
2
|
+
import fs from 'node:fs'
|
|
3
|
+
import os from 'node:os'
|
|
4
|
+
import path from 'node:path'
|
|
5
|
+
import test from 'node:test'
|
|
6
|
+
|
|
7
|
+
import type { Agent, GatewayProfile } from '@/types'
|
|
8
|
+
import { getScenario } from './scenarios'
|
|
9
|
+
import { buildEvalEnvironmentPlan, writeEvalEnvironmentWorkspace } from './environment-plan'
|
|
10
|
+
import type { EvalEnvironmentPlan, EvalScenario } from './types'
|
|
11
|
+
|
|
12
|
+
function makeAgent(overrides: Partial<Agent> = {}): Agent {
|
|
13
|
+
return {
|
|
14
|
+
id: 'agent-1',
|
|
15
|
+
name: 'Eval Agent',
|
|
16
|
+
description: 'Validates eval environments.',
|
|
17
|
+
systemPrompt: 'You are an eval agent.',
|
|
18
|
+
provider: 'ollama',
|
|
19
|
+
model: 'llama3',
|
|
20
|
+
ollamaMode: 'local',
|
|
21
|
+
tools: [],
|
|
22
|
+
createdAt: 1,
|
|
23
|
+
updatedAt: 1,
|
|
24
|
+
...overrides,
|
|
25
|
+
} as Agent
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
function makeGateway(overrides: Partial<GatewayProfile> = {}): GatewayProfile {
|
|
29
|
+
return {
|
|
30
|
+
id: 'gateway-1',
|
|
31
|
+
name: 'Gateway 1',
|
|
32
|
+
provider: 'openclaw',
|
|
33
|
+
endpoint: 'http://127.0.0.1:18789/v1',
|
|
34
|
+
wsUrl: 'ws://127.0.0.1:18789',
|
|
35
|
+
credentialId: null,
|
|
36
|
+
status: 'healthy',
|
|
37
|
+
stats: {
|
|
38
|
+
nodeCount: 1,
|
|
39
|
+
connectedNodeCount: 1,
|
|
40
|
+
environmentCount: 1,
|
|
41
|
+
availableEnvironmentCount: 1,
|
|
42
|
+
pendingNodePairings: 0,
|
|
43
|
+
pendingDevicePairings: 0,
|
|
44
|
+
pairedDeviceCount: 0,
|
|
45
|
+
lastTopologyCheckedAt: 2,
|
|
46
|
+
lastTopologyErrorCount: 0,
|
|
47
|
+
lastTopologyError: null,
|
|
48
|
+
},
|
|
49
|
+
createdAt: 1,
|
|
50
|
+
updatedAt: 1,
|
|
51
|
+
...overrides,
|
|
52
|
+
} as GatewayProfile
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
test('eval environment plan blocks missing CLI provider readiness before a run spends tokens', async () => {
|
|
56
|
+
const plan = await buildEvalEnvironmentPlan(
|
|
57
|
+
{ agentId: 'agent-cli', scenarioId: 'coding-prime' },
|
|
58
|
+
{
|
|
59
|
+
now: () => 123,
|
|
60
|
+
loadAgents: () => ({
|
|
61
|
+
'agent-cli': makeAgent({
|
|
62
|
+
id: 'agent-cli',
|
|
63
|
+
provider: 'codex-cli',
|
|
64
|
+
model: 'gpt-5.2',
|
|
65
|
+
ollamaMode: null,
|
|
66
|
+
}),
|
|
67
|
+
}),
|
|
68
|
+
listGatewayProfiles: () => [],
|
|
69
|
+
checkCliProviderReady: () => ({
|
|
70
|
+
ok: false,
|
|
71
|
+
message: 'Codex CLI is not installed.',
|
|
72
|
+
providerId: 'codex-cli',
|
|
73
|
+
displayName: 'Codex CLI',
|
|
74
|
+
binaryName: 'codex',
|
|
75
|
+
}),
|
|
76
|
+
},
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
assert.equal(plan.status, 'blocked')
|
|
80
|
+
assert.equal(plan.target?.kind, 'local')
|
|
81
|
+
assert.ok(plan.checks.some((check) => check.code === 'cli_provider_not_ready' && check.level === 'error'))
|
|
82
|
+
})
|
|
83
|
+
|
|
84
|
+
test('eval environment plan refreshes gateway environments and selects an available target', async () => {
|
|
85
|
+
const plan = await buildEvalEnvironmentPlan(
|
|
86
|
+
{
|
|
87
|
+
agentId: 'agent-openclaw',
|
|
88
|
+
scenarioId: 'coding-prime',
|
|
89
|
+
refreshGateway: true,
|
|
90
|
+
},
|
|
91
|
+
{
|
|
92
|
+
now: () => 456,
|
|
93
|
+
loadAgents: () => ({
|
|
94
|
+
'agent-openclaw': makeAgent({
|
|
95
|
+
id: 'agent-openclaw',
|
|
96
|
+
provider: 'openclaw',
|
|
97
|
+
model: 'default',
|
|
98
|
+
gatewayProfileId: 'gateway-1',
|
|
99
|
+
}),
|
|
100
|
+
}),
|
|
101
|
+
listGatewayProfiles: () => [makeGateway()],
|
|
102
|
+
listGatewayEnvironments: async () => ({
|
|
103
|
+
profile: makeGateway(),
|
|
104
|
+
connected: true,
|
|
105
|
+
refreshedAt: 789,
|
|
106
|
+
errors: [],
|
|
107
|
+
environments: [
|
|
108
|
+
{ id: 'env-busy', type: 'sandbox', label: 'Busy', status: 'starting', capabilities: ['agent.run'] },
|
|
109
|
+
{ id: 'env-ready', type: 'sandbox', label: 'Ready', status: 'available', capabilities: ['agent.run', 'workspace'] },
|
|
110
|
+
],
|
|
111
|
+
}),
|
|
112
|
+
},
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
assert.equal(plan.status, 'ready')
|
|
116
|
+
assert.equal(plan.target?.kind, 'gateway')
|
|
117
|
+
assert.equal(plan.target?.environmentId, 'env-ready')
|
|
118
|
+
assert.equal(plan.target?.environmentStatus, 'available')
|
|
119
|
+
assert.deepEqual(plan.target?.capabilities, ['agent.run', 'workspace'])
|
|
120
|
+
assert.ok(plan.checks.some((check) => check.code === 'environment_available'))
|
|
121
|
+
})
|
|
122
|
+
|
|
123
|
+
// Snapshot path (no refresh): stored gateway stats that list environments but
// zero available ones must block the plan.
test('eval environment plan blocks gateways with no available execution environments', async () => {
  const agentId = 'agent-openclaw'
  const gatewayAgent = makeAgent({
    id: agentId,
    provider: 'openclaw',
    model: 'default',
    gatewayProfileId: 'gateway-1',
  })
  const saturatedGateway = makeGateway({
    stats: {
      nodeCount: 1,
      connectedNodeCount: 1,
      environmentCount: 2,
      availableEnvironmentCount: 0,
      pendingNodePairings: 0,
      pendingDevicePairings: 0,
      pairedDeviceCount: 0,
    },
  })

  const plan = await buildEvalEnvironmentPlan(
    { agentId, scenarioId: 'coding-prime' },
    {
      loadAgents: () => ({ [agentId]: gatewayAgent }),
      listGatewayProfiles: () => [saturatedGateway],
    },
  )

  assert.equal(plan.status, 'blocked')
  const checkCodes = plan.checks.map((check) => check.code)
  assert.ok(checkCodes.includes('no_available_gateway_environment'))
})
|
|
154
|
+
|
|
155
|
+
// End-to-end writer check: plan a real scenario, write its workspace into a
// temp directory, then verify the generated files and fixture content on disk.
test('eval workspace writer materializes manifests, env hints, and scenario fixtures', async () => {
  const scenario = getScenario('multi-step-analyze')
  assert.ok(scenario)

  const root = fs.mkdtempSync(path.join(os.tmpdir(), 'swarmclaw-eval-env-'))
  const inWorkspace = (name: string): string => path.join(root, name)

  const plan = await buildEvalEnvironmentPlan(
    { agentId: 'agent-1', scenarioId: scenario.id },
    {
      now: () => 999,
      loadAgents: () => ({ 'agent-1': makeAgent() }),
      listGatewayProfiles: () => [],
    },
  )

  const files = writeEvalEnvironmentWorkspace({ runId: 'run-1', workspacePath: root, scenario, plan })

  assert.ok(files.some((file) => file.path === 'package.json' && file.kind === 'fixture'))
  for (const name of ['README.md', 'environment.json', '.env.swarmclaw-eval']) {
    assert.ok(fs.existsSync(inWorkspace(name)))
  }

  const fixture = JSON.parse(fs.readFileSync(inWorkspace('package.json'), 'utf8')) as { dependencies?: Record<string, string> }
  assert.equal(fixture.dependencies?.zod, '^4.1.13')

  const envFile = fs.readFileSync(inWorkspace('.env.swarmclaw-eval'), 'utf8')
  assert.ok(envFile.includes('SWARMCLAW_EVAL_RUN_ID="run-1"'))
})
|
|
183
|
+
|
|
184
|
+
// Path-traversal guard: a fixture whose path climbs out of the workspace
// ('../outside.txt') must make the writer throw.
test('eval workspace writer refuses fixture paths outside the eval workspace', () => {
  const root = fs.mkdtempSync(path.join(os.tmpdir(), 'swarmclaw-eval-unsafe-'))

  const scenario: EvalScenario = {
    id: 'unsafe-fixture',
    name: 'Unsafe Fixture',
    category: 'coding',
    description: 'Unsafe fixture path test',
    userMessage: 'noop',
    expectedBehaviors: [],
    scoringCriteria: [],
    timeoutMs: 1,
    tools: [],
    fixtures: [{ path: '../outside.txt', content: 'nope' }],
  }

  // Hand-built minimal plan; only the writer is under test here.
  const plan: EvalEnvironmentPlan = {
    generatedAt: 1,
    status: 'ready',
    agentId: 'agent-1',
    agentName: 'Eval Agent',
    scenarioIds: [scenario.id],
    suite: null,
    target: null,
    checks: [],
    requiredTools: [],
    missingTools: [],
    maxScore: 0,
    timeoutMs: 1,
    generatedFiles: [],
    envHints: [],
  }

  const attemptWrite = (): void => {
    writeEvalEnvironmentWorkspace({ runId: 'run-unsafe', workspacePath: root, scenario, plan })
  }
  assert.throws(attemptWrite, /Unsafe eval fixture path/)
})
|
|
@@ -0,0 +1,498 @@
|
|
|
1
|
+
import fs from 'node:fs'
|
|
2
|
+
import path from 'node:path'
|
|
3
|
+
|
|
4
|
+
import { WORKSPACE_DIR } from '@/lib/server/data-dir'
|
|
5
|
+
import { resolveAgentRouteCandidatesWithProfiles, type ResolvedAgentRoute } from '@/lib/server/agents/agent-runtime-config'
|
|
6
|
+
import { checkCliProviderReady, type CliProviderReadyResult } from '@/lib/server/cli-provider-readiness'
|
|
7
|
+
import { listOpenClawGatewayEnvironments } from '@/lib/server/gateways/gateway-topology'
|
|
8
|
+
import { loadAgents, loadCredentials } from '@/lib/server/storage'
|
|
9
|
+
import { isCliProviderId } from '@/lib/providers/cli-provider-metadata'
|
|
10
|
+
import type { Agent, GatewayProfile, OpenClawEnvironmentSummary, OpenClawGatewayEnvironmentList } from '@/types'
|
|
11
|
+
import type {
|
|
12
|
+
EvalEnvironmentCheck,
|
|
13
|
+
EvalEnvironmentGeneratedFile,
|
|
14
|
+
EvalEnvironmentPlan,
|
|
15
|
+
EvalEnvironmentTarget,
|
|
16
|
+
EvalScenario,
|
|
17
|
+
EvalScenarioFixture,
|
|
18
|
+
} from './types'
|
|
19
|
+
import { getScenario, getSuiteScenarios } from './scenarios'
|
|
20
|
+
import { listOpenClawGatewayProfiles } from '../gateways/gateway-profile-service'
|
|
21
|
+
|
|
22
|
+
/**
 * Request describing which agent and scenarios an eval environment plan
 * should be built for. All ids are trimmed before use; blank values are
 * treated as absent.
 */
export interface EvalEnvironmentPlanInput {
  /** Id of the agent to validate; looked up in the loaded agent map. */
  agentId: string
  /** Single scenario to plan for. When set it takes precedence over `suite`. */
  scenarioId?: string | null
  /** Suite whose scenarios are planned when no scenario id is given (defaults to 'core'). */
  suite?: string | null
  /** Gateway profile to use instead of the one resolved from the agent route. */
  gatewayProfileId?: string | null
  /** Specific gateway execution environment to validate. */
  environmentId?: string | null
  /** When true, gateway environments are listed live instead of relying on stored profile data. */
  refreshGateway?: boolean
}
|
|
30
|
+
|
|
31
|
+
/**
 * Injectable collaborators for `buildEvalEnvironmentPlan`. Every member is
 * optional; the planner substitutes its real implementation for any that is
 * omitted, which lets tests stub storage, gateways and the clock.
 */
interface EvalEnvironmentPlanDeps {
  /** Clock used for the plan's `generatedAt` timestamp. */
  now?: () => number
  /** Returns all agents keyed by agent id. */
  loadAgents?: () => Record<string, Agent>
  /** Returns stored credentials keyed by credential id. */
  loadCredentials?: () => Record<string, unknown>
  /** Returns the known gateway profiles. */
  listGatewayProfiles?: () => GatewayProfile[]
  /** Lists a gateway's execution environments; resolves to null when no snapshot is returned. */
  listGatewayEnvironments?: (id: string) => Promise<OpenClawGatewayEnvironmentList | null>
  /** Reports whether a CLI-backed provider is ready to run. */
  checkCliProviderReady?: (providerId: string) => CliProviderReadyResult
}
|
|
39
|
+
|
|
40
|
+
/** Arguments for `writeEvalEnvironmentWorkspace`. */
interface WriteEvalWorkspaceOptions {
  /** Eval run id; recorded in the README, manifest and env file. */
  runId: string
  /** Directory the workspace files are written into (created if missing). */
  workspacePath: string
  /** Scenario whose name, id and fixtures are materialized. */
  scenario: EvalScenario
  /** Environment plan serialized into the manifest and env hints. */
  plan: EvalEnvironmentPlan
}
|
|
46
|
+
|
|
47
|
+
/**
 * Normalizes an optional identifier.
 *
 * @param value - Raw id, possibly null/undefined or padded with whitespace.
 * @returns The trimmed id, or null when the value is not a string or is blank.
 */
function normalizeOptionalId(value: string | null | undefined): string | null {
  if (typeof value !== 'string') return null
  const trimmed = value.trim()
  return trimmed ? trimmed : null
}
|
|
50
|
+
|
|
51
|
+
/**
 * Trims every entry, drops blanks, removes duplicates and returns the
 * remaining strings sorted with `localeCompare`.
 *
 * @param values - Strings to normalize; the input array is not modified.
 * @returns A new sorted array of distinct, non-empty, trimmed strings.
 */
function uniqueStrings(values: string[]): string[] {
  const distinct = new Set<string>()
  for (const raw of values) {
    const trimmed = raw.trim()
    if (trimmed) distinct.add(trimmed)
  }
  return Array.from(distinct).sort((left, right) => left.localeCompare(right))
}
|
|
55
|
+
|
|
56
|
+
/**
 * Resolves which scenarios a plan request covers.
 *
 * A non-blank `scenarioId` selects exactly that scenario; if it is unknown the
 * result has no scenarios and reports the id as `missing`. Otherwise the
 * scenarios of the requested suite are returned, defaulting to 'core'.
 */
function scenarioSet(input: EvalEnvironmentPlanInput): { scenarios: EvalScenario[]; missing?: string } {
  const requestedScenarioId = normalizeOptionalId(input.scenarioId)
  if (!requestedScenarioId) {
    const suiteName = normalizeOptionalId(input.suite) || 'core'
    return { scenarios: getSuiteScenarios(suiteName) }
  }

  const match = getScenario(requestedScenarioId)
  if (match) return { scenarios: [match] }
  return { scenarios: [], missing: requestedScenarioId }
}
|
|
65
|
+
|
|
66
|
+
/**
 * Sums the weights of all scoring criteria across the given scenarios.
 *
 * @returns The highest achievable total score; 0 for an empty list.
 */
function maxScore(scenarios: EvalScenario[]): number {
  let total = 0
  for (const scenario of scenarios) {
    // Per-scenario subtotal first, then add it to the running total.
    let scenarioWeight = 0
    for (const criterion of scenario.scoringCriteria) {
      scenarioWeight += criterion.weight
    }
    total += scenarioWeight
  }
  return total
}
|
|
72
|
+
|
|
73
|
+
/**
 * Adds up the per-scenario timeouts.
 *
 * @returns Total timeout in milliseconds; 0 for an empty list.
 */
function timeoutMs(scenarios: EvalScenario[]): number {
  let total = 0
  for (const scenario of scenarios) {
    total += scenario.timeoutMs
  }
  return total
}
|
|
76
|
+
|
|
77
|
+
/**
 * Lists one generated-file entry per fixture declared by the scenarios, in
 * scenario order then fixture order. Scenarios without fixtures contribute
 * nothing.
 */
function fixtureFiles(scenarios: EvalScenario[]): EvalEnvironmentGeneratedFile[] {
  const files: EvalEnvironmentGeneratedFile[] = []
  for (const scenario of scenarios) {
    for (const fixture of scenario.fixtures || []) {
      files.push({ path: fixture.path, kind: 'fixture', required: true })
    }
  }
  return files
}
|
|
84
|
+
|
|
85
|
+
/**
 * Lists every file an eval workspace contains: the three standard files
 * (README, manifest, env hints) followed by the scenarios' fixture files.
 */
function baseGeneratedFiles(scenarios: EvalScenario[]): EvalEnvironmentGeneratedFile[] {
  const standardFiles: EvalEnvironmentGeneratedFile[] = [
    { path: 'README.md', kind: 'readme', required: true },
    { path: 'environment.json', kind: 'manifest', required: true },
    { path: '.env.swarmclaw-eval', kind: 'env', required: true },
  ]
  return standardFiles.concat(fixtureFiles(scenarios))
}
|
|
93
|
+
|
|
94
|
+
/**
 * Decides whether a resolved route is expected to have a stored credential.
 *
 * Returns false for the 'openclaw' provider, for 'ollama' unless its mode is
 * 'cloud', and for CLI providers; true for everything else.
 */
function providerNeedsCredential(route: ResolvedAgentRoute): boolean {
  const viaGateway = route.provider === 'openclaw'
  const nonCloudOllama = route.provider === 'ollama' && route.ollamaMode !== 'cloud'
  if (viaGateway || nonCloudOllama) return false
  return !isCliProviderId(route.provider)
}
|
|
100
|
+
|
|
101
|
+
/**
 * Reports whether a credential id refers to a stored credential.
 *
 * Only the map's own entries are consulted, so ids that happen to match
 * inherited object members (for example 'constructor' or 'toString') are not
 * mistaken for stored credentials.
 *
 * @param credentialId - Id from the resolved route; null, undefined or blank yields false.
 * @param credentials - Stored credentials keyed by id.
 * @returns True when the id is an own key of `credentials` with a truthy value.
 */
function credentialExists(credentialId: string | null | undefined, credentials: Record<string, unknown>): boolean {
  if (typeof credentialId !== 'string' || !credentialId.trim()) return false
  if (!Object.prototype.hasOwnProperty.call(credentials, credentialId)) return false
  return Boolean(credentials[credentialId])
}
|
|
104
|
+
|
|
105
|
+
/**
 * Maps a check level to a numeric severity so levels can be compared:
 * 'error' is 2, 'warn' is 1, anything else is 0.
 */
function checkLevelRank(level: EvalEnvironmentCheck['level']): number {
  switch (level) {
    case 'error':
      return 2
    case 'warn':
      return 1
    default:
      return 0
  }
}
|
|
110
|
+
|
|
111
|
+
/**
 * Derives the overall plan status from the most severe check:
 * any error gives 'blocked', otherwise any warning gives 'warning',
 * otherwise (including no checks at all) 'ready'.
 */
function statusFromChecks(checks: EvalEnvironmentCheck[]): EvalEnvironmentPlan['status'] {
  let worstRank = 0
  for (const check of checks) {
    const rank = checkLevelRank(check.level)
    if (rank > worstRank) worstRank = rank
  }

  if (worstRank >= 2) return 'blocked'
  return worstRank >= 1 ? 'warning' : 'ready'
}
|
|
117
|
+
|
|
118
|
+
/**
 * Chooses the gateway profile for a route.
 *
 * An explicitly requested profile id wins over the route's own profile id. If
 * either id is present, only an exact id match is accepted: an unknown id
 * yields null rather than falling back. With no id at all, the default
 * profile is used, then the first profile, then null.
 */
function pickGatewayProfile(
  route: ResolvedAgentRoute | null,
  profiles: GatewayProfile[],
  requestedProfileId: string | null,
): GatewayProfile | null {
  const wantedId = requestedProfileId || route?.gatewayProfileId
  if (wantedId) {
    const exact = profiles.find((profile) => profile.id === wantedId)
    return exact || null
  }

  const fallback = profiles.find((profile) => profile.isDefault) || profiles[0]
  return fallback || null
}
|
|
131
|
+
|
|
132
|
+
/**
 * Describes a gateway execution target for a route.
 *
 * Profile values take precedence over route values for the label and the
 * gateway profile id. `refreshedAt` comes from the profile's last topology
 * check, then its last check, then null.
 */
function summarizeGatewayTarget(route: ResolvedAgentRoute, profile: GatewayProfile | null): EvalEnvironmentTarget {
  const label = profile?.name || route.label
  const gatewayProfileId = profile?.id || route.gatewayProfileId || null
  const refreshedAt = profile?.stats?.lastTopologyCheckedAt || profile?.lastCheckedAt || null

  return {
    kind: 'gateway',
    provider: route.provider,
    model: route.model,
    label,
    gatewayProfileId,
    capabilities: ['agent.run', 'sessions', 'tools', 'workspace'],
    refreshedAt,
  }
}
|
|
143
|
+
|
|
144
|
+
/**
 * Describes a local (non-gateway) execution target for a route, copying the
 * route's provider, model and label. Local targets never carry a refresh time.
 */
function summarizeLocalTarget(route: ResolvedAgentRoute): EvalEnvironmentTarget {
  const { provider, model, label } = route
  const localTarget: EvalEnvironmentTarget = {
    kind: 'local',
    provider,
    model,
    label,
    capabilities: ['agent.run', 'tools', 'workspace'],
    refreshedAt: null,
  }
  return localTarget
}
|
|
154
|
+
|
|
155
|
+
/**
 * Appends an env hint to `hints` when it has a value.
 *
 * @param hints - List that is mutated in place.
 * @param key - Environment variable name.
 * @param value - Hint value; null, undefined or an empty string skips the hint.
 * @param description - Optional note; only stored when non-empty.
 */
function addEnvHint(
  hints: EvalEnvironmentPlan['envHints'],
  key: string,
  value: string | null | undefined,
  description?: string,
): void {
  if (value === null || value === undefined || value === '') return

  const hint: EvalEnvironmentPlan['envHints'][number] = { key, value }
  if (description) hint.description = description
  hints.push(hint)
}
|
|
164
|
+
|
|
165
|
+
/**
 * Builds the `SWARMCLAW_EVAL_*` hints describing the agent, scenarios, suite
 * and resolved target. Entries whose value is missing or empty are omitted;
 * the rest keep the order listed below.
 */
function buildEnvHints(params: {
  agent: Agent | null
  scenarios: EvalScenario[]
  suite: string | null
  target: EvalEnvironmentTarget | null
}): EvalEnvironmentPlan['envHints'] {
  const { agent, scenarios, suite, target } = params
  const scenarioIds = scenarios.map((scenario) => scenario.id).join(',')

  // [variable name, value, description] in emission order.
  const entries: Array<[string, string | null | undefined, string]> = [
    ['SWARMCLAW_EVAL_AGENT_ID', agent?.id, 'Agent under validation'],
    ['SWARMCLAW_EVAL_AGENT_NAME', agent?.name, 'Agent display name'],
    ['SWARMCLAW_EVAL_SCENARIOS', scenarioIds, 'Comma-separated eval scenario ids'],
    ['SWARMCLAW_EVAL_SUITE', suite, 'Eval suite name'],
    ['SWARMCLAW_EVAL_TARGET_KIND', target?.kind, 'Resolved execution target kind'],
    ['SWARMCLAW_EVAL_PROVIDER', target?.provider, 'Resolved provider'],
    ['SWARMCLAW_EVAL_MODEL', target?.model, 'Resolved model'],
    ['SWARMCLAW_EVAL_GATEWAY_PROFILE_ID', target?.gatewayProfileId || null, 'Resolved gateway profile id'],
    ['SWARMCLAW_EVAL_ENVIRONMENT_ID', target?.environmentId || null, 'Requested or selected gateway environment id'],
  ]

  const hints: EvalEnvironmentPlan['envHints'] = []
  for (const [key, value, description] of entries) {
    addEnvHint(hints, key, value, description)
  }
  return hints
}
|
|
183
|
+
|
|
184
|
+
/**
 * Returns an environment's capabilities trimmed, de-duplicated and sorted.
 * A missing environment or capability list yields an empty array.
 */
function normalizeEnvironmentCapabilities(environment: OpenClawEnvironmentSummary | null | undefined): string[] {
  const declared = environment?.capabilities
  return uniqueStrings(declared || [])
}
|
|
187
|
+
|
|
188
|
+
/**
 * Validates the gateway behind a target and, when refreshing, attaches the
 * selected execution environment to it.
 *
 * Readiness findings are appended to `checks` (mutated in place). Without
 * `input.refreshGateway` only the stored profile data is consulted. With it,
 * environments are listed live and either the requested environment or the
 * first available one (falling back to the first listed) is selected.
 *
 * The stored "no environment available" counters are treated as stale once a
 * live refresh reports an available environment, so a successful refresh is
 * not blocked by an outdated profile snapshot. They still apply when no
 * refresh was requested, when the refresh returns nothing, or when the
 * refreshed list has no available environment either.
 *
 * @param target - Gateway target summary to extend; never mutated.
 * @param profile - Gateway profile in use; null returns `target` unchanged.
 * @param checks - Check list that findings are pushed onto.
 * @param input - Plan request (reads `environmentId` and `refreshGateway`).
 * @param deps - Provides `listGatewayEnvironments` for the live refresh.
 * @returns The target, extended with environment details where known.
 */
async function attachGatewayEnvironment(
  target: EvalEnvironmentTarget,
  profile: GatewayProfile | null,
  checks: EvalEnvironmentCheck[],
  input: EvalEnvironmentPlanInput,
  deps: Required<Pick<EvalEnvironmentPlanDeps, 'listGatewayEnvironments'>>,
): Promise<EvalEnvironmentTarget> {
  if (!profile) return target
  const requestedEnvironmentId = normalizeOptionalId(input.environmentId)

  if (profile.status === 'offline') {
    checks.push({
      code: 'gateway_offline',
      level: 'error',
      message: `${profile.name} is offline.`,
      hint: 'Refresh or repair the gateway before running evals through it.',
    })
  } else if (profile.status === 'degraded') {
    checks.push({
      code: 'gateway_degraded',
      level: 'warn',
      message: `${profile.name} is degraded.`,
      detail: profile.lastError || undefined,
    })
  } else if (profile.status === 'pending' || profile.status === 'unknown') {
    checks.push({
      code: 'gateway_unverified',
      level: 'warn',
      message: `${profile.name} has not reported a healthy gateway status yet.`,
    })
  }

  // Built from the stored profile counters; pushed only on the paths where
  // those counters are still the best information available.
  const environmentCount = profile.stats?.environmentCount || 0
  const availableEnvironmentCount = profile.stats?.availableEnvironmentCount || 0
  const storedNoneAvailableCheck: EvalEnvironmentCheck | null =
    environmentCount > 0 && availableEnvironmentCount === 0
      ? {
          code: 'no_available_gateway_environment',
          level: 'error',
          message: `${profile.name} has ${environmentCount} execution environment${environmentCount === 1 ? '' : 's'}, but none are available.`,
        }
      : null

  if (!input.refreshGateway) {
    if (storedNoneAvailableCheck) checks.push(storedNoneAvailableCheck)
    if (requestedEnvironmentId) {
      checks.push({
        code: 'environment_not_refreshed',
        level: 'warn',
        message: `Environment ${requestedEnvironmentId} was requested but not refreshed.`,
        hint: 'Run validation with refresh enabled to verify the exact environment.',
      })
      return { ...target, environmentId: requestedEnvironmentId }
    }
    checks.push({
      code: 'gateway_snapshot_only',
      level: 'info',
      message: 'Using the last stored gateway topology snapshot for validation.',
    })
    return target
  }

  const snapshot = await deps.listGatewayEnvironments(profile.id)
  if (!snapshot) {
    // The refresh produced nothing, so the stored counters still stand.
    if (storedNoneAvailableCheck) checks.push(storedNoneAvailableCheck)
    checks.push({
      code: 'gateway_environment_snapshot_missing',
      level: 'error',
      message: `${profile.name} could not be refreshed for environment validation.`,
    })
    return target
  }
  for (const error of snapshot.errors) {
    checks.push({
      code: 'gateway_environment_refresh_error',
      level: 'warn',
      message: `${error.method}: ${error.message}`,
    })
  }

  const environments = snapshot.environments
  const firstAvailable = environments.find((environment) => environment.status === 'available') || null
  // Live data showing an available environment overrides the stored counters.
  if (storedNoneAvailableCheck && !firstAvailable) checks.push(storedNoneAvailableCheck)

  const selected = requestedEnvironmentId
    ? environments.find((environment) => environment.id === requestedEnvironmentId) || null
    : firstAvailable || environments[0] || null
  if (requestedEnvironmentId && !selected) {
    checks.push({
      code: 'environment_not_found',
      level: 'error',
      message: `Requested execution environment ${requestedEnvironmentId} was not found on ${profile.name}.`,
    })
    return { ...target, environmentId: requestedEnvironmentId, refreshedAt: snapshot.refreshedAt }
  }
  if (!selected) {
    checks.push({
      code: 'no_gateway_environments',
      level: 'warn',
      message: `${profile.name} did not report any execution environments.`,
    })
    return { ...target, refreshedAt: snapshot.refreshedAt }
  }
  if (selected.status !== 'available') {
    checks.push({
      code: 'environment_unavailable',
      // Only an environment in the 'error' state blocks; other states warn.
      level: selected.status === 'error' ? 'error' : 'warn',
      message: `${selected.label || selected.id} is ${selected.status}.`,
    })
  } else {
    checks.push({
      code: 'environment_available',
      level: 'info',
      message: `${selected.label || selected.id} is available for validation runs.`,
    })
  }
  return {
    ...target,
    environmentId: selected.id,
    environmentLabel: selected.label || selected.id,
    environmentStatus: selected.status,
    capabilities: normalizeEnvironmentCapabilities(selected),
    refreshedAt: snapshot.refreshedAt,
  }
}
|
|
306
|
+
|
|
307
|
+
/**
 * Builds the preflight plan for an eval: resolves the scenarios, the agent and
 * its route, the execution target, and a list of readiness checks.
 *
 * The function does not throw for missing agents, scenarios or gateways;
 * each problem is reported as a check and reflected in `status`
 * ('blocked' on any error check, 'warning' on any warn check, else 'ready').
 *
 * @param input - Agent, scenario/suite and optional gateway selection.
 * @param deps - Optional overrides for storage, gateway and clock access.
 * @returns The environment plan, including env hints and the files a
 *   workspace for these scenarios would contain.
 */
export async function buildEvalEnvironmentPlan(
  input: EvalEnvironmentPlanInput,
  deps: EvalEnvironmentPlanDeps = {},
): Promise<EvalEnvironmentPlan> {
  const now = deps.now || (() => Date.now())
  const generatedAt = now()
  const loadAgentsImpl = deps.loadAgents || (() => loadAgents() as Record<string, Agent>)
  const loadCredentialsImpl = deps.loadCredentials || (() => loadCredentials() as Record<string, unknown>)
  const listGatewayProfilesImpl = deps.listGatewayProfiles || listOpenClawGatewayProfiles
  const checkCliProviderReadyImpl = deps.checkCliProviderReady || checkCliProviderReady
  const checks: EvalEnvironmentCheck[] = []
  const { scenarios, missing } = scenarioSet(input)
  // Use the normalized scenario id, as scenarioSet does, so a blank scenario
  // id that falls back to the core suite is also reported as suite 'core'.
  const requestedScenarioId = normalizeOptionalId(input.scenarioId)
  const suite = normalizeOptionalId(input.suite) || (requestedScenarioId ? null : 'core')
  const agents = loadAgentsImpl()
  // Own-property lookup: ids such as 'constructor' must not resolve to
  // members inherited from Object.prototype.
  const agent = Object.prototype.hasOwnProperty.call(agents, input.agentId)
    ? agents[input.agentId] || null
    : null
  const requiredTools = uniqueStrings(scenarios.flatMap((scenario) => scenario.tools || []))
  let target: EvalEnvironmentTarget | null = null

  if (missing) {
    checks.push({
      code: 'scenario_not_found',
      level: 'error',
      message: `Eval scenario ${missing} was not found.`,
    })
  } else if (scenarios.length === 0) {
    checks.push({
      code: 'scenario_set_empty',
      level: 'error',
      message: 'No eval scenarios matched the requested suite.',
    })
  }

  if (!agent) {
    checks.push({
      code: 'agent_not_found',
      level: 'error',
      message: `Agent ${input.agentId} was not found.`,
    })
  } else {
    if (agent.trashedAt) {
      checks.push({ code: 'agent_trashed', level: 'error', message: `${agent.name} is in trash.` })
    }
    if (agent.disabled) {
      checks.push({ code: 'agent_disabled', level: 'error', message: `${agent.name} is disabled.` })
    }

    const gatewayProfiles = listGatewayProfilesImpl()
    // Only the first route candidate is validated.
    const [route] = resolveAgentRouteCandidatesWithProfiles(agent, gatewayProfiles)
    if (!route) {
      checks.push({
        code: 'route_unresolved',
        level: 'error',
        message: `${agent.name} does not have a runnable provider/model route.`,
      })
    } else if (route.provider === 'openclaw') {
      const profile = pickGatewayProfile(route, gatewayProfiles, normalizeOptionalId(input.gatewayProfileId))
      if (!profile) {
        checks.push({
          code: 'gateway_profile_missing',
          level: 'error',
          message: 'No gateway profile is available for this agent route.',
        })
        target = summarizeGatewayTarget(route, null)
      } else {
        target = await attachGatewayEnvironment(
          summarizeGatewayTarget(route, profile),
          profile,
          checks,
          input,
          { listGatewayEnvironments: deps.listGatewayEnvironments || listOpenClawGatewayEnvironments },
        )
      }
    } else {
      target = summarizeLocalTarget(route)
      if (isCliProviderId(route.provider)) {
        const ready = checkCliProviderReadyImpl(route.provider)
        checks.push({
          code: ready.ok ? 'cli_provider_ready' : 'cli_provider_not_ready',
          level: ready.ok ? 'info' : 'error',
          message: ready.message,
          detail: ready.binaryPath,
        })
      } else if (providerNeedsCredential(route) && !credentialExists(route.credentialId, loadCredentialsImpl())) {
        // A missing stored credential only warns; see the hint below.
        checks.push({
          code: 'credential_missing',
          level: 'warn',
          message: `${route.provider} does not have a stored credential for this route.`,
          hint: 'The run may still work if the provider is configured through environment variables.',
        })
      }
    }
  }

  if (requiredTools.length > 0) {
    checks.push({
      code: 'tools_declared',
      level: 'info',
      message: `${requiredTools.length} eval tool${requiredTools.length === 1 ? '' : 's'} will be enabled: ${requiredTools.join(', ')}.`,
    })
  } else {
    checks.push({
      code: 'no_tools_required',
      level: 'info',
      message: 'This eval scenario does not require tool access.',
    })
  }

  const envHints = buildEnvHints({ agent, scenarios, suite, target })

  return {
    generatedAt,
    status: statusFromChecks(checks),
    agentId: input.agentId,
    agentName: agent?.name || input.agentId,
    scenarioIds: scenarios.map((scenario) => scenario.id),
    suite,
    target,
    checks,
    requiredTools,
    missingTools: [],
    maxScore: maxScore(scenarios),
    timeoutMs: timeoutMs(scenarios),
    generatedFiles: baseGeneratedFiles(scenarios),
    envHints,
  }
}
|
|
433
|
+
|
|
434
|
+
/**
 * Resolves a fixture's relative path to an absolute file path that is
 * strictly inside the eval workspace.
 *
 * @param workspacePath - Root directory of the eval workspace.
 * @param fixture - Fixture whose `path` is resolved against the workspace.
 * @returns The absolute destination path for the fixture file.
 * @throws Error with message `Unsafe eval fixture path: …` when the path is
 *   blank, absolute, escapes the workspace (e.g. `../x`), or resolves to the
 *   workspace directory itself (e.g. `.` or `a/..`), which cannot be written
 *   as a file.
 */
function safeFixtureDestination(workspacePath: string, fixture: EvalScenarioFixture): string {
  const relative = fixture.path.trim()
  if (!relative || path.isAbsolute(relative)) {
    throw new Error(`Unsafe eval fixture path: ${fixture.path}`)
  }
  const root = path.resolve(workspacePath)
  const destination = path.resolve(root, relative)
  // Require a real descendant of the root: the root itself is a directory,
  // and a bare prefix match would also accept sibling paths like `<root>-x`.
  if (!destination.startsWith(`${root}${path.sep}`)) {
    throw new Error(`Unsafe eval fixture path: ${fixture.path}`)
  }
  return destination
}
|
|
446
|
+
|
|
447
|
+
/**
 * Writes a UTF-8 text file, creating parent directories as needed and
 * ensuring the content ends with a newline.
 *
 * @param filePath - Destination file path.
 * @param content - Text to write; a trailing '\n' is appended if absent.
 * @param mode - Optional file mode passed through to `fs.writeFileSync`.
 */
function writeTextFile(filePath: string, content: string, mode?: number): void {
  const parentDir = path.dirname(filePath)
  fs.mkdirSync(parentDir, { recursive: true })

  const payload = content.endsWith('\n') ? content : `${content}\n`
  fs.writeFileSync(filePath, payload, { encoding: 'utf8', mode })
}
|
|
451
|
+
|
|
452
|
+
/**
 * Formats one env hint as a `KEY="value"` line. The value is JSON-encoded,
 * so it is always double-quoted with embedded quotes and backslashes escaped.
 */
function envLine(hint: EvalEnvironmentPlan['envHints'][number]): string {
  const { key, value } = hint
  const quotedValue = JSON.stringify(value)
  return `${key}=${quotedValue}`
}
|
|
455
|
+
|
|
456
|
+
/**
 * Materializes an eval workspace on disk: `README.md`, `environment.json`
 * (run id plus the full plan), `.env.swarmclaw-eval` (run id plus the plan's
 * env hints) and every fixture declared by the scenario.
 *
 * All fixture destinations are validated before anything is written, so an
 * unsafe fixture path fails without leaving a partially written workspace.
 * Fixtures are written last and therefore overwrite a standard file that
 * shares their path.
 *
 * @param options - Run id, workspace directory, scenario and plan.
 * @returns The list of files written, standard files first, then fixtures.
 * @throws Error with message `Unsafe eval fixture path: …` when a fixture
 *   path does not resolve to a file inside the workspace.
 */
export function writeEvalEnvironmentWorkspace(options: WriteEvalWorkspaceOptions): EvalEnvironmentGeneratedFile[] {
  const { runId, workspacePath, scenario, plan } = options

  // Resolve (and thereby validate) every fixture path before touching disk.
  const fixtureWrites = (scenario.fixtures || []).map((fixture) => ({
    fixture,
    destination: safeFixtureDestination(workspacePath, fixture),
  }))

  fs.mkdirSync(workspacePath, { recursive: true })

  const readme = [
    `# Eval Workspace: ${scenario.name}`,
    '',
    `Run ID: ${runId}`,
    `Agent: ${plan.agentName} (${plan.agentId})`,
    `Scenario: ${scenario.id}`,
    `Status at start: ${plan.status}`,
    '',
    'Runtime manifest: ./environment.json',
    'Environment hints: ./.env.swarmclaw-eval',
    '',
    'This directory is isolated for eval artifacts, fixtures, and generated outputs.',
  ].join('\n')
  writeTextFile(path.join(workspacePath, 'README.md'), readme)
  writeTextFile(path.join(workspacePath, 'environment.json'), JSON.stringify({ runId, plan }, null, 2))
  writeTextFile(
    path.join(workspacePath, '.env.swarmclaw-eval'),
    [
      '# Generated by SwarmClaw. Contains eval context only, not secrets.',
      `SWARMCLAW_EVAL_RUN_ID=${JSON.stringify(runId)}`,
      ...plan.envHints.map(envLine),
    ].join('\n'),
  )

  for (const { fixture, destination } of fixtureWrites) {
    writeTextFile(destination, fixture.content, fixture.mode)
  }

  // Same listing the planner reports for this scenario.
  return baseGeneratedFiles([scenario])
}
|
|
495
|
+
|
|
496
|
+
/**
 * Returns the workspace directory for an eval run:
 * `<WORKSPACE_DIR>/evals/<runId>`. The directory is not created here.
 */
export function resolveEvalWorkspacePath(runId: string): string {
  const evalsRoot = path.join(WORKSPACE_DIR, 'evals')
  return path.join(evalsRoot, runId)
}
|
|
@@ -10,6 +10,7 @@ import { executeExecutionChatTurn } from '@/lib/server/execution-engine/chat-tur
|
|
|
10
10
|
import { WORKSPACE_DIR } from '../data-dir'
|
|
11
11
|
import type { Session } from '@/types'
|
|
12
12
|
import { errorMessage } from '@/lib/shared-utils'
|
|
13
|
+
import { buildEvalEnvironmentPlan, writeEvalEnvironmentWorkspace } from './environment-plan'
|
|
13
14
|
|
|
14
15
|
export function resolveEvalSessionCwd(runId: string): string {
|
|
15
16
|
const dir = path.join(WORKSPACE_DIR, 'evals', runId)
|
|
@@ -17,7 +18,17 @@ export function resolveEvalSessionCwd(runId: string): string {
|
|
|
17
18
|
return dir
|
|
18
19
|
}
|
|
19
20
|
|
|
20
|
-
export
|
|
21
|
+
/**
 * Optional gateway selection for a single eval scenario run. The values are
 * forwarded to the environment plan built before the run starts.
 */
export interface RunEvalScenarioOptions {
  /** Gateway profile to validate and run against. */
  gatewayProfileId?: string | null
  /** Specific gateway execution environment to validate. */
  environmentId?: string | null
  /** Only an explicit `true` requests a live gateway refresh. */
  refreshGateway?: boolean
}
|
|
26
|
+
|
|
27
|
+
export async function runEvalScenario(
|
|
28
|
+
scenarioId: string,
|
|
29
|
+
agentId: string,
|
|
30
|
+
options: RunEvalScenarioOptions = {},
|
|
31
|
+
): Promise<EvalRun> {
|
|
21
32
|
const scenario = getScenario(scenarioId)
|
|
22
33
|
if (!scenario) throw new Error(`Unknown eval scenario: ${scenarioId}`)
|
|
23
34
|
|
|
@@ -29,6 +40,13 @@ export async function runEvalScenario(scenarioId: string, agentId: string): Prom
|
|
|
29
40
|
const sessionId = `eval-${runId}`
|
|
30
41
|
const now = Date.now()
|
|
31
42
|
const sessionCwd = resolveEvalSessionCwd(runId)
|
|
43
|
+
const environment = await buildEvalEnvironmentPlan({
|
|
44
|
+
agentId,
|
|
45
|
+
scenarioId,
|
|
46
|
+
gatewayProfileId: options.gatewayProfileId || null,
|
|
47
|
+
environmentId: options.environmentId || null,
|
|
48
|
+
refreshGateway: options.refreshGateway === true,
|
|
49
|
+
})
|
|
32
50
|
|
|
33
51
|
const run: EvalRun = {
|
|
34
52
|
id: runId,
|
|
@@ -40,6 +58,34 @@ export async function runEvalScenario(scenarioId: string, agentId: string): Prom
|
|
|
40
58
|
maxScore: scenario.scoringCriteria.reduce((sum, c) => sum + c.weight, 0),
|
|
41
59
|
details: [],
|
|
42
60
|
sessionId,
|
|
61
|
+
environment,
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
writeEvalEnvironmentWorkspace({
|
|
65
|
+
runId,
|
|
66
|
+
workspacePath: sessionCwd,
|
|
67
|
+
scenario,
|
|
68
|
+
plan: environment,
|
|
69
|
+
})
|
|
70
|
+
|
|
71
|
+
if (environment.status === 'blocked') {
|
|
72
|
+
run.status = 'failed'
|
|
73
|
+
run.error = environment.checks
|
|
74
|
+
.filter((check) => check.level === 'error')
|
|
75
|
+
.map((check) => check.message)
|
|
76
|
+
.join(' ')
|
|
77
|
+
|| 'Eval environment validation failed.'
|
|
78
|
+
run.endedAt = Date.now()
|
|
79
|
+
run.details = environment.checks
|
|
80
|
+
.filter((check) => check.level !== 'info')
|
|
81
|
+
.map((check) => ({
|
|
82
|
+
criterion: check.code,
|
|
83
|
+
score: 0,
|
|
84
|
+
maxScore: 0,
|
|
85
|
+
evidence: check.message,
|
|
86
|
+
}))
|
|
87
|
+
saveEvalRun(run)
|
|
88
|
+
return run
|
|
43
89
|
}
|
|
44
90
|
|
|
45
91
|
// Create temporary eval session
|
|
@@ -114,7 +160,7 @@ export async function runEvalScenario(scenarioId: string, agentId: string): Prom
|
|
|
114
160
|
|
|
115
161
|
export async function runEvalSuite(
|
|
116
162
|
agentId: string,
|
|
117
|
-
opts: { categories?: string[]; suite?: string } = {},
|
|
163
|
+
opts: { categories?: string[]; suite?: string; gatewayProfileId?: string | null; environmentId?: string | null; refreshGateway?: boolean } = {},
|
|
118
164
|
): Promise<EvalSuiteResult> {
|
|
119
165
|
let scenarios: EvalScenario[]
|
|
120
166
|
if (opts.suite) {
|
|
@@ -130,7 +176,11 @@ export async function runEvalSuite(
|
|
|
130
176
|
|
|
131
177
|
const runs: EvalRun[] = []
|
|
132
178
|
for (const scenario of scenarios) {
|
|
133
|
-
const evalRun = await runEvalScenario(scenario.id, agentId
|
|
179
|
+
const evalRun = await runEvalScenario(scenario.id, agentId, {
|
|
180
|
+
gatewayProfileId: opts.gatewayProfileId || null,
|
|
181
|
+
environmentId: opts.environmentId || null,
|
|
182
|
+
refreshGateway: opts.refreshGateway === true,
|
|
183
|
+
})
|
|
134
184
|
runs.push(evalRun)
|
|
135
185
|
}
|
|
136
186
|
|
|
@@ -212,6 +212,24 @@ const CORE_SCENARIOS: EvalScenario[] = [
|
|
|
212
212
|
],
|
|
213
213
|
timeoutMs: 60_000,
|
|
214
214
|
tools: ['shell', 'files'],
|
|
215
|
+
fixtures: [
|
|
216
|
+
{
|
|
217
|
+
path: 'package.json',
|
|
218
|
+
content: JSON.stringify({
|
|
219
|
+
name: 'swarmclaw-eval-fixture',
|
|
220
|
+
version: '0.0.0',
|
|
221
|
+
private: true,
|
|
222
|
+
dependencies: {
|
|
223
|
+
'@modelcontextprotocol/sdk': '^1.29.0',
|
|
224
|
+
zod: '^4.1.13',
|
|
225
|
+
},
|
|
226
|
+
devDependencies: {
|
|
227
|
+
typescript: '^5.9.3',
|
|
228
|
+
tsx: '^4.20.6',
|
|
229
|
+
},
|
|
230
|
+
}, null, 2),
|
|
231
|
+
},
|
|
232
|
+
],
|
|
215
233
|
},
|
|
216
234
|
]
|
|
217
235
|
|
|
@@ -17,10 +17,64 @@ export interface EvalScenario {
|
|
|
17
17
|
scoringCriteria: ScoringCriterion[]
|
|
18
18
|
timeoutMs: number
|
|
19
19
|
tools: string[]
|
|
20
|
+
fixtures?: EvalScenarioFixture[]
|
|
20
21
|
/** Optional suite tag. Scenarios without a suite belong to the 'core' suite. */
|
|
21
22
|
suite?: EvalSuite
|
|
22
23
|
}
|
|
23
24
|
|
|
25
|
+
/** A file that is written into the eval workspace before a scenario runs. */
export interface EvalScenarioFixture {
  /** Destination path relative to the eval workspace root. */
  path: string
  /** Text content of the file. */
  content: string
  /** Optional file mode used when the file is written. */
  mode?: number
}
|
|
30
|
+
|
|
31
|
+
/** Overall readiness of an eval environment plan. */
export type EvalEnvironmentStatus =
  | 'ready'
  | 'warning'
  | 'blocked'

/** Severity of a single environment check. */
export type EvalEnvironmentCheckLevel =
  | 'info'
  | 'warn'
  | 'error'
|
|
33
|
+
|
|
34
|
+
/** One readiness finding produced while planning an eval environment. */
export interface EvalEnvironmentCheck {
  /** Stable machine-readable identifier, e.g. 'agent_not_found'. */
  code: string
  /** Severity of the finding. */
  level: EvalEnvironmentCheckLevel
  /** Human-readable summary of the finding. */
  message: string
  /** Optional extra context for the finding. */
  detail?: string
  /** Optional suggestion for resolving the finding. */
  hint?: string
}
|
|
41
|
+
|
|
42
|
+
/** The execution target an eval plan resolved for an agent route. */
export interface EvalEnvironmentTarget {
  /** Whether the eval runs locally or through a gateway. */
  kind: 'local' | 'gateway'
  /** Provider of the resolved route. */
  provider: string
  /** Model of the resolved route. */
  model: string
  /** Display label for the target. */
  label: string
  /** Gateway profile in use, when the target is a gateway. */
  gatewayProfileId?: string | null
  /** Requested or selected gateway execution environment id. */
  environmentId?: string | null
  /** Display label of the selected environment. */
  environmentLabel?: string | null
  /** Status reported for the selected environment. */
  environmentStatus?: string | null
  /** Capabilities of the target or of the selected environment. */
  capabilities?: string[]
  /** Timestamp of the data this target was derived from, if known. */
  refreshedAt?: number | null
}
|
|
54
|
+
|
|
55
|
+
/** Describes one file that exists in a generated eval workspace. */
export interface EvalEnvironmentGeneratedFile {
  /** Path relative to the eval workspace root. */
  path: string
  /** Role of the file within the workspace. */
  kind: 'readme' | 'manifest' | 'env' | 'fixture'
  /** Whether the file is marked as required. */
  required: boolean
}
|
|
60
|
+
|
|
61
|
+
/**
 * Result of validating an eval environment before a run: what will run,
 * where it will run, and whether it is ready.
 */
export interface EvalEnvironmentPlan {
  /** Timestamp at which the plan was generated. */
  generatedAt: number
  /** Overall readiness derived from the most severe check. */
  status: EvalEnvironmentStatus
  /** Id of the agent the plan was requested for. */
  agentId: string
  /** Display name of the agent. */
  agentName: string
  /** Ids of the scenarios covered by the plan. */
  scenarioIds: string[]
  /** Suite the scenarios were taken from, when applicable. */
  suite?: string | null
  /** Resolved execution target; null when no route could be resolved. */
  target: EvalEnvironmentTarget | null
  /** Readiness findings collected while building the plan. */
  checks: EvalEnvironmentCheck[]
  /** Tools declared by the covered scenarios. */
  requiredTools: string[]
  /** Required tools that are not available. */
  missingTools: string[]
  /** Highest achievable total score across the covered scenarios. */
  maxScore: number
  /** Combined timeout of the covered scenarios, in milliseconds. */
  timeoutMs: number
  /** Files an eval workspace for these scenarios contains. */
  generatedFiles: EvalEnvironmentGeneratedFile[]
  /** Environment variable hints describing the eval context. */
  envHints: Array<{ key: string; value: string; description?: string }>
}
|
|
77
|
+
|
|
24
78
|
export interface EvalRun {
|
|
25
79
|
id: string
|
|
26
80
|
scenarioId: string
|
|
@@ -32,6 +86,7 @@ export interface EvalRun {
|
|
|
32
86
|
maxScore: number
|
|
33
87
|
details: EvalCriterionResult[]
|
|
34
88
|
sessionId?: string
|
|
89
|
+
environment?: EvalEnvironmentPlan
|
|
35
90
|
error?: string
|
|
36
91
|
}
|
|
37
92
|
|