@swarmclawai/swarmclaw 1.9.5 → 1.9.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -0
- package/package.json +2 -2
- package/src/app/api/eval/baselines/route.ts +55 -0
- package/src/app/api/eval/environments/route.ts +59 -0
- package/src/app/api/eval/gate/route.ts +36 -0
- package/src/app/api/eval/run/route.ts +8 -1
- package/src/app/api/eval/suite/route.ts +6 -0
- package/src/cli/index.js +5 -0
- package/src/components/quality/quality-workspace.tsx +337 -5
- package/src/lib/server/eval/baseline.test.ts +111 -0
- package/src/lib/server/eval/baseline.ts +274 -0
- package/src/lib/server/eval/environment-plan.test.ts +221 -0
- package/src/lib/server/eval/environment-plan.ts +498 -0
- package/src/lib/server/eval/runner.ts +53 -3
- package/src/lib/server/eval/scenarios.ts +18 -0
- package/src/lib/server/eval/store.ts +47 -1
- package/src/lib/server/eval/types.ts +105 -0
- package/src/lib/server/session-tools/extension-creator.ts +2 -2
- package/src/lib/server/tasks/task-checkout.ts +1 -1
- package/src/types/extension.ts +3 -3
- package/electron-dist/main.js +0 -218
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
} from '@/lib/quality/quality-summary'
|
|
18
18
|
import { cn } from '@/lib/utils'
|
|
19
19
|
import { useAppStore } from '@/stores/use-app-store'
|
|
20
|
-
import type { EvalRun, EvalSuiteResult } from '@/lib/server/eval/types'
|
|
20
|
+
import type { EvalEnvironmentPlan, EvalGateResult, EvalRun, EvalSuiteResult } from '@/lib/server/eval/types'
|
|
21
21
|
import type { Agent, ApprovalRequest, SessionRunRecord } from '@/types'
|
|
22
22
|
|
|
23
23
|
type QualityTab = 'overview' | 'evals' | 'approvals' | 'runs'
|
|
@@ -105,6 +105,217 @@ function EmptyState({ title, description }: { title: string; description: string
|
|
|
105
105
|
)
|
|
106
106
|
}
|
|
107
107
|
|
|
108
|
+
/**
 * Picks the Tailwind classes for the environment-plan status pill.
 *
 * `'ready'` is rendered in emerald and `'warning'` in amber; any other
 * status value falls through to the rose styling.
 */
function environmentStatusClass(status: EvalEnvironmentPlan['status']): string {
  switch (status) {
    case 'ready':
      return 'border-emerald-500/25 bg-emerald-500/10 text-emerald-200'
    case 'warning':
      return 'border-amber-500/25 bg-amber-500/10 text-amber-200'
    default:
      return 'border-rose-500/25 bg-rose-500/10 text-rose-200'
  }
}
|
|
113
|
+
|
|
114
|
+
/**
 * Picks the Tailwind classes for a single environment check row.
 *
 * `'error'` rows are rose, `'warn'` rows are amber, and everything else
 * (i.e. `'info'`) uses the neutral white-on-dark styling.
 */
function checkClass(level: 'info' | 'warn' | 'error'): string {
  return level === 'error'
    ? 'border-rose-500/20 bg-rose-500/[0.05] text-rose-200'
    : level === 'warn'
      ? 'border-amber-500/20 bg-amber-500/[0.05] text-amber-200'
      : 'border-white/[0.06] bg-white/[0.025] text-text-3'
}
|
|
119
|
+
|
|
120
|
+
/**
 * Picks the Tailwind classes for the overall gate status pill.
 *
 * `'pass'` is emerald, `'warn'` is amber, and any other status is rose.
 */
function gateStatusClass(status: EvalGateResult['status']): string {
  switch (status) {
    case 'pass':
      return 'border-emerald-500/25 bg-emerald-500/10 text-emerald-200'
    case 'warn':
      return 'border-amber-500/25 bg-amber-500/10 text-amber-200'
    default:
      return 'border-rose-500/25 bg-rose-500/10 text-rose-200'
  }
}
|
|
125
|
+
|
|
126
|
+
/**
 * Picks the Tailwind classes for an individual gate check row.
 *
 * `'fail'` rows are rose and `'warn'` rows are amber; any other status is
 * treated as a passing row and rendered in emerald.
 */
function gateCheckClass(status: EvalGateResult['status']): string {
  const failTone = 'border-rose-500/20 bg-rose-500/[0.05] text-rose-200'
  const warnTone = 'border-amber-500/20 bg-amber-500/[0.05] text-amber-200'
  const passTone = 'border-emerald-500/20 bg-emerald-500/[0.05] text-emerald-200'

  if (status === 'fail') {
    return failTone
  }
  return status === 'warn' ? warnTone : passTone
}
|
|
131
|
+
|
|
132
|
+
/**
 * Card that summarises the validation environment for the selected eval.
 *
 * Always renders the title, description and a refresh button (disabled while
 * `loading`). When `plan` is null it shows a placeholder line; otherwise it
 * shows the plan status pill, the target, tool/file counts, the optional
 * environment label, up to four checks and up to five generated file paths,
 * each followed by a "+N" overflow indicator when more entries exist.
 *
 * @param plan - Environment plan to display, or null when none is loaded.
 * @param loading - True while a plan request is in flight.
 * @param onRefresh - Invoked when the refresh button is clicked.
 */
function EvalEnvironmentPanel({ plan, loading, onRefresh }: {
  plan: EvalEnvironmentPlan | null
  loading: boolean
  onRefresh: () => void
}) {
  const chipClass = 'rounded-full bg-white/[0.05] px-2 py-1 text-[10px] font-700 text-text-3'
  const fileChipClass = 'rounded-full bg-white/[0.04] px-2 py-1 text-[10px] font-700 text-text-3'
  // Display caps: only the first few checks/files are listed, the rest are counted.
  const maxChecks = 4
  const maxFiles = 5
  const pluralSuffix = (count: number) => (count === 1 ? '' : 's')

  const renderPlan = (current: EvalEnvironmentPlan) => {
    const { target, checks, generatedFiles, requiredTools } = current
    const toolCount = requiredTools.length
    const fileCount = generatedFiles.length
    const hiddenChecks = checks.length - maxChecks
    const hiddenFiles = fileCount - maxFiles

    return (
      <div className="mt-3 flex flex-col gap-3">
        <div className="flex flex-wrap items-center gap-2">
          <span className={cn('rounded-full border px-2 py-1 text-[10px] font-800 uppercase tracking-[0.08em]', environmentStatusClass(current.status))}>
            {current.status}
          </span>
          {target && (
            <span className={chipClass}>
              {target.kind} - {target.label}
            </span>
          )}
          <span className={chipClass}>
            {toolCount} tool{pluralSuffix(toolCount)}
          </span>
          <span className={chipClass}>
            {fileCount} file{pluralSuffix(fileCount)}
          </span>
        </div>
        {target?.environmentLabel && (
          <div className="rounded-[10px] border border-white/[0.06] bg-white/[0.02] px-3 py-2 text-[11px] text-text-3/70">
            Environment: <span className="font-700 text-text-2">{target.environmentLabel}</span>
            {target.environmentStatus ? ` (${target.environmentStatus})` : ''}
          </div>
        )}
        <div className="flex flex-col gap-1.5">
          {checks.slice(0, maxChecks).map((check) => (
            <div key={`${check.code}:${check.message}`} className={cn('rounded-[9px] border px-2.5 py-2 text-[11px] leading-relaxed', checkClass(check.level))}>
              <span className="font-800 uppercase tracking-[0.08em]">{check.level}</span>
              <span className="ml-2">{check.message}</span>
            </div>
          ))}
          {hiddenChecks > 0 && (
            <div className="text-[10px] text-text-3/55">+{hiddenChecks} more check{pluralSuffix(hiddenChecks)}</div>
          )}
        </div>
        <div className="flex flex-wrap gap-1.5">
          {generatedFiles.slice(0, maxFiles).map((file) => (
            <span key={`${file.kind}:${file.path}`} className={fileChipClass}>
              {file.path}
            </span>
          ))}
          {hiddenFiles > 0 && (
            <span className={fileChipClass}>
              +{hiddenFiles}
            </span>
          )}
        </div>
      </div>
    )
  }

  return (
    <div className="rounded-[12px] border border-white/[0.06] bg-white/[0.025] px-3 py-3">
      <div className="flex items-start justify-between gap-3">
        <div>
          <div className="text-[13px] font-800 text-text">Validation environment</div>
          <p className="mt-1 text-[11px] leading-relaxed text-text-3/65">
            Preflight checks, workspace context, and generated files for the selected eval.
          </p>
        </div>
        <button
          type="button"
          onClick={onRefresh}
          disabled={loading}
          className="shrink-0 rounded-[8px] border border-white/[0.08] px-2 py-1 text-[10px] font-800 text-text-2 transition-colors hover:bg-white/[0.06] disabled:opacity-40"
        >
          {loading ? 'Checking' : 'Refresh'}
        </button>
      </div>
      {plan ? renderPlan(plan) : (
        <div className="mt-3 text-[11px] text-text-3/60">{loading ? 'Checking readiness...' : 'Choose an agent and scenario.'}</div>
      )}
    </div>
  )
}
|
|
209
|
+
|
|
210
|
+
/**
 * Card that shows the regression gate for the current scope.
 *
 * Always renders the title, a refresh button (disabled while `loading`) and a
 * scenario/suite scope toggle. When `gate` is null a placeholder line is
 * shown; otherwise the card lists the gate status, scope label, latest-run
 * coverage, Current/Baseline/Regression tiles, up to four checks, and a button
 * that sets or updates the baseline. That button is disabled while `busy`,
 * when there are no latest runs, or when any check has the code
 * `missing_scope_runs`.
 *
 * @param gate - Gate result to display, or null when none is loaded.
 * @param loading - True while a gate request is in flight.
 * @param busy - True while a baseline save is in flight.
 * @param scope - Currently selected gate scope.
 * @param onScopeChange - Invoked with the scope the user clicked.
 * @param onRefresh - Invoked when the refresh button is clicked.
 * @param onSetBaseline - Invoked when the baseline button is clicked.
 */
function EvalGatePanel({
  gate,
  loading,
  busy,
  scope,
  onScopeChange,
  onRefresh,
  onSetBaseline,
}: {
  gate: EvalGateResult | null
  loading: boolean
  busy: boolean
  scope: 'scenario' | 'suite'
  onScopeChange: (scope: 'scenario' | 'suite') => void
  onRefresh: () => void
  onSetBaseline: () => void
}) {
  const scopeOptions = ['scenario', 'suite'] as const
  const chipClass = 'rounded-full bg-white/[0.05] px-2 py-1 text-[10px] font-700 text-text-3'
  // Only the first few checks are listed; the remainder is summarised as a count.
  const maxChecks = 4

  const renderGate = (current: EvalGateResult) => {
    const hiddenChecks = current.checks.length - maxChecks
    const missingScopeRuns = current.checks.some((check) => check.code === 'missing_scope_runs')
    const baselineDisabled = busy || current.latestRuns.length === 0 || missingScopeRuns

    let baselineAction = 'Set baseline'
    if (busy) baselineAction = 'Saving baseline'
    else if (current.baseline) baselineAction = 'Update baseline'

    // NOTE(review): "Current" goes through formatPercent while "Baseline" uses a
    // raw template string — confirm the two are meant to be formatted differently.
    const tiles = [
      { label: 'Current', value: formatPercent(current.currentPercent) },
      { label: 'Baseline', value: current.baseline ? `${current.baseline.baselinePercent}%` : 'none' },
      { label: 'Regression', value: current.regressionPoints == null ? 'n/a' : `${current.regressionPoints}pt` },
    ]

    return (
      <div className="mt-3 flex flex-col gap-3">
        <div className="flex flex-wrap items-center gap-2">
          <span className={cn('rounded-full border px-2 py-1 text-[10px] font-800 uppercase tracking-[0.08em]', gateStatusClass(current.status))}>
            {current.status}
          </span>
          <span className={chipClass}>
            {current.scope.label}
          </span>
          <span className={chipClass}>
            {current.latestRuns.length}/{current.scope.scenarioIds.length} latest runs
          </span>
        </div>

        <div className="grid grid-cols-3 gap-2">
          {tiles.map((tile) => (
            <div key={tile.label} className="rounded-[10px] border border-white/[0.06] bg-white/[0.02] px-2 py-2">
              <div className="text-[9px] font-800 uppercase tracking-[0.08em] text-text-3/50">{tile.label}</div>
              <div className="mt-1 text-[14px] font-800 text-text">{tile.value}</div>
            </div>
          ))}
        </div>

        <div className="flex flex-col gap-1.5">
          {current.checks.slice(0, maxChecks).map((check) => (
            <div key={`${check.code}:${check.message}`} className={cn('rounded-[9px] border px-2.5 py-2 text-[11px] leading-relaxed', gateCheckClass(check.status))}>
              <span className="font-800 uppercase tracking-[0.08em]">{check.status}</span>
              <span className="ml-2">{check.message}</span>
            </div>
          ))}
          {hiddenChecks > 0 && (
            <div className="text-[10px] text-text-3/55">+{hiddenChecks} more check{hiddenChecks === 1 ? '' : 's'}</div>
          )}
        </div>

        <button
          type="button"
          onClick={onSetBaseline}
          disabled={baselineDisabled}
          className="rounded-[9px] border border-white/[0.08] bg-white/[0.04] px-3 py-2 text-[11px] font-800 text-text-2 transition-colors hover:bg-white/[0.08] disabled:cursor-not-allowed disabled:opacity-40"
        >
          {baselineAction}
        </button>
      </div>
    )
  }

  return (
    <div className="rounded-[12px] border border-white/[0.06] bg-white/[0.025] px-3 py-3">
      <div className="flex items-start justify-between gap-3">
        <div>
          <div className="text-[13px] font-800 text-text">Regression gate</div>
          <p className="mt-1 text-[11px] leading-relaxed text-text-3/65">
            Compare latest eval evidence against thresholds and an approved baseline.
          </p>
        </div>
        <button
          type="button"
          onClick={onRefresh}
          disabled={loading}
          className="shrink-0 rounded-[8px] border border-white/[0.08] px-2 py-1 text-[10px] font-800 text-text-2 transition-colors hover:bg-white/[0.06] disabled:opacity-40"
        >
          {loading ? 'Checking' : 'Refresh'}
        </button>
      </div>

      <div className="mt-3 flex rounded-[10px] border border-white/[0.06] bg-white/[0.025] p-1">
        {scopeOptions.map((option) => {
          const selected = scope === option
          return (
            <button
              key={option}
              type="button"
              onClick={() => onScopeChange(option)}
              className={cn(
                'flex-1 rounded-[8px] px-2 py-1.5 text-[10px] font-800 uppercase tracking-[0.08em] transition-colors',
                selected ? 'bg-white/[0.1] text-text' : 'text-text-3 hover:bg-white/[0.05]',
              )}
            >
              {option}
            </button>
          )
        })}
      </div>

      {gate ? renderGate(gate) : (
        <div className="mt-3 text-[11px] text-text-3/60">{loading ? 'Checking gate...' : 'Run evals to build gate evidence.'}</div>
      )}
    </div>
  )
}
|
|
318
|
+
|
|
108
319
|
export function QualityWorkspace() {
|
|
109
320
|
const router = useRouter()
|
|
110
321
|
const searchParams = useSearchParams()
|
|
@@ -127,6 +338,12 @@ export function QualityWorkspace() {
|
|
|
127
338
|
const [selectedSuite, setSelectedSuite] = useState('core')
|
|
128
339
|
const [selectedScenarioId, setSelectedScenarioId] = useState('')
|
|
129
340
|
const [evalBusy, setEvalBusy] = useState<string | null>(null)
|
|
341
|
+
const [evalEnvironmentPlan, setEvalEnvironmentPlan] = useState<EvalEnvironmentPlan | null>(null)
|
|
342
|
+
const [evalEnvironmentLoading, setEvalEnvironmentLoading] = useState(false)
|
|
343
|
+
const [evalGate, setEvalGate] = useState<EvalGateResult | null>(null)
|
|
344
|
+
const [evalGateScope, setEvalGateScope] = useState<'scenario' | 'suite'>('scenario')
|
|
345
|
+
const [evalGateLoading, setEvalGateLoading] = useState(false)
|
|
346
|
+
const [evalBaselineBusy, setEvalBaselineBusy] = useState(false)
|
|
130
347
|
const [approvalBusy, setApprovalBusy] = useState<string | null>(null)
|
|
131
348
|
|
|
132
349
|
useEffect(() => {
|
|
@@ -170,6 +387,51 @@ export function QualityWorkspace() {
|
|
|
170
387
|
}
|
|
171
388
|
}, [])
|
|
172
389
|
|
|
390
|
+
// Fetches the environment plan for the selected agent and stores it in state.
// The request targets the selected scenario when there is one, otherwise the
// selected suite. Passing `refreshGateway` adds the matching query flag and
// uses the longer 20s timeout instead of 8s. With no agent selected, or when
// the request fails, the stored plan is cleared; failures also raise a toast.
const loadEvalEnvironmentPlan = useCallback(async (opts: { refreshGateway?: boolean } = {}) => {
  if (!selectedAgentId) {
    setEvalEnvironmentPlan(null)
    return
  }

  const wantsGatewayRefresh = Boolean(opts.refreshGateway)
  const query = new URLSearchParams({ agentId: selectedAgentId })
  if (selectedScenarioId) {
    query.set('scenarioId', selectedScenarioId)
  } else if (selectedSuite) {
    query.set('suite', selectedSuite)
  }
  if (wantsGatewayRefresh) {
    query.set('refreshGateway', 'true')
  }

  setEvalEnvironmentLoading(true)
  try {
    const timeoutMs = wantsGatewayRefresh ? 20_000 : 8_000
    const fetched = await api<EvalEnvironmentPlan>('GET', `/eval/environments?${query.toString()}`, undefined, { timeoutMs })
    setEvalEnvironmentPlan(fetched)
  } catch (err) {
    setEvalEnvironmentPlan(null)
    const message = err instanceof Error ? err.message : 'Unable to validate eval environment'
    toast.error(message)
  } finally {
    setEvalEnvironmentLoading(false)
  }
}, [selectedAgentId, selectedScenarioId, selectedSuite])
|
|
410
|
+
|
|
411
|
+
// Fetches the gate result for the selected agent in the current gate scope and
// stores it in state. Scenario scope queries by scenario id, suite scope by
// suite name. The stored gate is cleared when no agent is selected, when
// scenario scope has no scenario selected, or when the request fails; failures
// also raise a toast.
const loadEvalGate = useCallback(async () => {
  const scenarioScoped = evalGateScope === 'scenario'
  const missingSelection = !selectedAgentId || (scenarioScoped && !selectedScenarioId)
  if (missingSelection) {
    setEvalGate(null)
    return
  }

  const query = new URLSearchParams({ agentId: selectedAgentId })
  if (scenarioScoped) {
    query.set('scenarioId', selectedScenarioId)
  } else {
    query.set('suite', selectedSuite)
  }

  setEvalGateLoading(true)
  try {
    const fetched = await api<EvalGateResult>('GET', `/eval/gate?${query.toString()}`)
    setEvalGate(fetched)
  } catch (err) {
    setEvalGate(null)
    const message = err instanceof Error ? err.message : 'Unable to check eval gate'
    toast.error(message)
  } finally {
    setEvalGateLoading(false)
  }
}, [evalGateScope, selectedAgentId, selectedScenarioId, selectedSuite])
|
|
434
|
+
|
|
173
435
|
useEffect(() => {
|
|
174
436
|
void loadQualityData()
|
|
175
437
|
}, [loadQualityData])
|
|
@@ -184,6 +446,14 @@ export function QualityWorkspace() {
|
|
|
184
446
|
if (!selectedScenarioId && scenarios[0]) setSelectedScenarioId(scenarios[0].id)
|
|
185
447
|
}, [scenarios, selectedScenarioId])
|
|
186
448
|
|
|
449
|
+
useEffect(() => {
|
|
450
|
+
void loadEvalEnvironmentPlan()
|
|
451
|
+
}, [loadEvalEnvironmentPlan])
|
|
452
|
+
|
|
453
|
+
useEffect(() => {
|
|
454
|
+
void loadEvalGate()
|
|
455
|
+
}, [loadEvalGate])
|
|
456
|
+
|
|
187
457
|
useEffect(() => {
|
|
188
458
|
if (!suites.some((suite) => suite.name === selectedSuite) && suites[0]) {
|
|
189
459
|
setSelectedSuite(suites[0].name)
|
|
@@ -208,34 +478,82 @@ export function QualityWorkspace() {
|
|
|
208
478
|
toast.error('Choose an agent and scenario first')
|
|
209
479
|
return
|
|
210
480
|
}
|
|
481
|
+
if (evalEnvironmentPlan?.status === 'blocked') {
|
|
482
|
+
toast.error('Fix the validation environment before running this eval')
|
|
483
|
+
return
|
|
484
|
+
}
|
|
211
485
|
setEvalBusy(`scenario:${selectedScenarioId}`)
|
|
212
486
|
try {
|
|
213
|
-
await api<EvalRun>('POST', '/eval/run', {
|
|
487
|
+
await api<EvalRun>('POST', '/eval/run', {
|
|
488
|
+
agentId: selectedAgentId,
|
|
489
|
+
scenarioId: selectedScenarioId,
|
|
490
|
+
gatewayProfileId: evalEnvironmentPlan?.target?.gatewayProfileId || null,
|
|
491
|
+
environmentId: evalEnvironmentPlan?.target?.environmentId || null,
|
|
492
|
+
refreshGateway: evalEnvironmentPlan?.target?.kind === 'gateway',
|
|
493
|
+
}, { timeoutMs: 180_000 })
|
|
214
494
|
toast.success('Eval scenario completed')
|
|
215
495
|
await loadQualityData({ silent: true })
|
|
496
|
+
await loadEvalEnvironmentPlan()
|
|
497
|
+
await loadEvalGate()
|
|
216
498
|
} catch (err) {
|
|
217
499
|
toast.error(err instanceof Error ? err.message : 'Eval scenario failed')
|
|
218
500
|
} finally {
|
|
219
501
|
setEvalBusy(null)
|
|
220
502
|
}
|
|
221
|
-
}, [loadQualityData, selectedAgentId, selectedScenarioId])
|
|
503
|
+
}, [evalEnvironmentPlan, loadEvalEnvironmentPlan, loadEvalGate, loadQualityData, selectedAgentId, selectedScenarioId])
|
|
222
504
|
|
|
223
505
|
// Runs an eval suite for the selected agent and then refreshes the dashboard.
//
// Refuses to start (with an error toast) when no agent is selected or when the
// currently loaded environment plan is marked 'blocked'. The request forwards
// the plan target's gateway profile and environment ids (null when absent) and
// asks for a gateway refresh only when the target kind is 'gateway'.
// `evalBusy` is set to `suite:<name>` for the duration and always cleared.
//
// NOTE(review): the blocked check uses whatever plan was loaded last; confirm
// that plan reflects this suite rather than the currently selected scenario.
const runSuite = useCallback(async (suiteName: string) => {
  if (!selectedAgentId) {
    toast.error('Choose an agent first')
    return
  }
  if (evalEnvironmentPlan?.status === 'blocked') {
    toast.error('Fix the validation environment before running this suite')
    return
  }
  setEvalBusy(`suite:${suiteName}`)
  try {
    const result = await api<EvalSuiteResult>('POST', '/eval/suite', {
      agentId: selectedAgentId,
      suite: suiteName,
      gatewayProfileId: evalEnvironmentPlan?.target?.gatewayProfileId || null,
      environmentId: evalEnvironmentPlan?.target?.environmentId || null,
      refreshGateway: evalEnvironmentPlan?.target?.kind === 'gateway',
    }, { timeoutMs: 300_000 })
    toast.success(`Suite completed at ${Math.round(result.percentage)}%`)
    // The three refreshes do not consume each other's results, so run them
    // concurrently instead of holding the busy state for the sum of their latencies.
    await Promise.all([
      loadQualityData({ silent: true }),
      loadEvalEnvironmentPlan(),
      loadEvalGate(),
    ])
  } catch (err) {
    toast.error(err instanceof Error ? err.message : 'Eval suite failed')
  } finally {
    setEvalBusy(null)
  }
}, [evalEnvironmentPlan, loadEvalEnvironmentPlan, loadEvalGate, loadQualityData, selectedAgentId])
|
|
533
|
+
|
|
534
|
+
// Saves a baseline for the selected agent in the current gate scope and stores
// the gate returned by the server. Thresholds are taken from the loaded gate,
// falling back to 80 (minPercent) and 5 (maxRegressionPoints). Requires an
// agent, plus a scenario when the scope is 'scenario'; otherwise an error
// toast is shown and nothing is sent. `evalBaselineBusy` is true while saving.
const setEvalBaseline = useCallback(async () => {
  if (!selectedAgentId) {
    toast.error('Choose an agent first')
    return
  }
  const scenarioScoped = evalGateScope === 'scenario'
  if (scenarioScoped && !selectedScenarioId) {
    toast.error('Choose a scenario first')
    return
  }

  setEvalBaselineBusy(true)
  try {
    const thresholds = {
      minPercent: evalGate?.minPercent ?? 80,
      maxRegressionPoints: evalGate?.maxRegressionPoints ?? 5,
    }
    const body = scenarioScoped
      ? { agentId: selectedAgentId, scenarioId: selectedScenarioId, ...thresholds }
      : { agentId: selectedAgentId, suite: selectedSuite, ...thresholds }
    const saved = await api<{ gate: EvalGateResult }>('POST', '/eval/baselines', body)
    setEvalGate(saved.gate)
    toast.success('Eval baseline saved')
  } catch (err) {
    const message = err instanceof Error ? err.message : 'Unable to save eval baseline'
    toast.error(message)
  } finally {
    setEvalBaselineBusy(false)
  }
}, [evalGate, evalGateScope, selectedAgentId, selectedScenarioId, selectedSuite])
|
|
239
557
|
|
|
240
558
|
const actOnApproval = useCallback(async (approval: ApprovalRequest, approved: boolean) => {
|
|
241
559
|
setApprovalBusy(approval.id)
|
|
@@ -456,6 +774,20 @@ export function QualityWorkspace() {
|
|
|
456
774
|
</div>
|
|
457
775
|
</div>
|
|
458
776
|
)}
|
|
777
|
+
<EvalEnvironmentPanel
|
|
778
|
+
plan={evalEnvironmentPlan}
|
|
779
|
+
loading={evalEnvironmentLoading}
|
|
780
|
+
onRefresh={() => void loadEvalEnvironmentPlan({ refreshGateway: true })}
|
|
781
|
+
/>
|
|
782
|
+
<EvalGatePanel
|
|
783
|
+
gate={evalGate}
|
|
784
|
+
loading={evalGateLoading}
|
|
785
|
+
busy={evalBaselineBusy}
|
|
786
|
+
scope={evalGateScope}
|
|
787
|
+
onScopeChange={setEvalGateScope}
|
|
788
|
+
onRefresh={() => void loadEvalGate()}
|
|
789
|
+
onSetBaseline={() => void setEvalBaseline()}
|
|
790
|
+
/>
|
|
459
791
|
<button
|
|
460
792
|
type="button"
|
|
461
793
|
onClick={() => openMissionTemplate('release-candidate-qa')}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import assert from 'node:assert/strict'
|
|
2
|
+
import test from 'node:test'
|
|
3
|
+
|
|
4
|
+
import {
|
|
5
|
+
evaluateEvalGate,
|
|
6
|
+
setEvalBaseline,
|
|
7
|
+
} from './baseline'
|
|
8
|
+
import type { EvalBaseline, EvalRun } from './types'
|
|
9
|
+
|
|
10
|
+
/**
 * Builds an `EvalRun` fixture: a completed 8/10 run of scenario
 * `coding-prime` for `agent-1`, with any `overrides` applied on top.
 */
function makeRun(overrides: Partial<EvalRun> = {}): EvalRun {
  const defaults: EvalRun = {
    id: 'run-1',
    scenarioId: 'coding-prime',
    agentId: 'agent-1',
    status: 'completed',
    startedAt: 1,
    endedAt: 2,
    score: 8,
    maxScore: 10,
    details: [],
  }
  return { ...defaults, ...overrides }
}
|
|
24
|
+
|
|
25
|
+
/**
 * Builds the dependency stubs handed to the baseline functions under test.
 *
 * - `now` always returns 123.
 * - `listRunsByAgent` filters `runs` by agent id.
 * - `getBaselineForScope` always returns `baseline`.
 * - `saveBaseline` appends to `saved` so a test can inspect what was stored.
 */
function depsFor(runs: EvalRun[], baseline: EvalBaseline | null = null, saved: EvalBaseline[] = []) {
  const now = () => 123
  const listRunsByAgent = (agentId: string) => runs.filter((candidate) => candidate.agentId === agentId)
  const getBaselineForScope = () => baseline
  const saveBaseline = (next: EvalBaseline) => {
    saved.push(next)
  }
  return { now, listRunsByAgent, getBaselineForScope, saveBaseline }
}
|
|
33
|
+
|
|
34
|
+
// Two runs exist for the scenario; the baseline must be built from the later
// one (8/10 => 80%) and carry the thresholds passed by the caller.
test('setEvalBaseline snapshots the latest scenario score and gate defaults', () => {
  const saved: EvalBaseline[] = []
  const olderRun = makeRun({ id: 'older', score: 4, startedAt: 1, endedAt: 2 })
  const latestRun = makeRun({ id: 'latest', score: 8, startedAt: 5, endedAt: 6 })
  const deps = depsFor([olderRun, latestRun], null, saved)

  const baseline = setEvalBaseline(
    {
      agentId: 'agent-1',
      scenarioId: 'coding-prime',
      minPercent: 75,
      maxRegressionPoints: 3,
      label: 'Release candidate',
    },
    deps,
  )

  assert.strictEqual(saved.length, 1)
  assert.strictEqual(baseline.scope.type, 'scenario')
  assert.strictEqual(baseline.scope.id, 'coding-prime')
  assert.strictEqual(baseline.baselinePercent, 80)
  assert.strictEqual(baseline.minPercent, 75)
  assert.strictEqual(baseline.maxRegressionPoints, 3)
  assert.deepStrictEqual(baseline.runIds, ['latest'])
})
|
|
58
|
+
|
|
59
|
+
// With a passing score but no stored baseline, the gate should only warn and
// report a `baseline_missing` check.
test('evaluateEvalGate warns until a baseline is approved', () => {
  const deps = depsFor([makeRun({ score: 8, maxScore: 10 })])

  const gate = evaluateEvalGate(
    { agentId: 'agent-1', scenarioId: 'coding-prime', minPercent: 70 },
    deps,
  )

  const baselineMissingWarning = gate.checks.some(
    (check) => check.code === 'baseline_missing' && check.status === 'warn',
  )
  assert.strictEqual(gate.currentPercent, 80)
  assert.strictEqual(gate.status, 'warn')
  assert.ok(baselineMissingWarning)
})
|
|
69
|
+
|
|
70
|
+
// Baseline is 90% with a 2-point allowance; the current run scores 60%, so the
// 30-point drop must fail the gate.
test('evaluateEvalGate fails when regression exceeds the baseline allowance', () => {
  const baselineDeps = depsFor([makeRun({ id: 'baseline', score: 9, maxScore: 10 })])
  const baseline = setEvalBaseline(
    { agentId: 'agent-1', scenarioId: 'coding-prime', minPercent: 70, maxRegressionPoints: 2 },
    baselineDeps,
  )

  const currentRun = makeRun({ id: 'current', score: 6, maxScore: 10, startedAt: 10, endedAt: 11 })
  const gate = evaluateEvalGate(
    { agentId: 'agent-1', scenarioId: 'coding-prime' },
    depsFor([currentRun], baseline),
  )

  assert.strictEqual(gate.currentPercent, 60)
  assert.strictEqual(gate.regressionPoints, 30)
  assert.strictEqual(gate.status, 'fail')
  assert.ok(gate.checks.some((check) => check.code === 'regression_limit_exceeded'))
})
|
|
86
|
+
|
|
87
|
+
// Baseline and current run both score 80%, above the 70% floor and with zero
// regression, so the gate must pass.
test('evaluateEvalGate passes when score and regression checks pass', () => {
  const baselineDeps = depsFor([makeRun({ id: 'baseline', score: 8, maxScore: 10 })])
  const baseline = setEvalBaseline(
    { agentId: 'agent-1', scenarioId: 'coding-prime', minPercent: 70, maxRegressionPoints: 5 },
    baselineDeps,
  )

  const currentRun = makeRun({ id: 'current', score: 8, maxScore: 10, startedAt: 10, endedAt: 11 })
  const gate = evaluateEvalGate(
    { agentId: 'agent-1', scenarioId: 'coding-prime' },
    depsFor([currentRun], baseline),
  )

  assert.strictEqual(gate.status, 'pass')
  assert.strictEqual(gate.regressionPoints, 0)
  assert.ok(gate.checks.some((check) => check.code === 'score_threshold_met'))
})
|
|
102
|
+
|
|
103
|
+
// Only one scenario of the 'core' suite has a run, so baselining the suite is
// expected to throw.
test('suite gates require latest runs for every scenario in scope before baselining', () => {
  const deps = depsFor([makeRun({ scenarioId: 'coding-prime' })])
  const baselineSuite = () => setEvalBaseline({ agentId: 'agent-1', suite: 'core' }, deps)

  assert.throws(baselineSuite, /Baseline requires latest runs for every scenario in scope/)
})
|