@swarmclawai/swarmclaw 1.9.5 → 1.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,7 +17,7 @@ import {
17
17
  } from '@/lib/quality/quality-summary'
18
18
  import { cn } from '@/lib/utils'
19
19
  import { useAppStore } from '@/stores/use-app-store'
20
- import type { EvalRun, EvalSuiteResult } from '@/lib/server/eval/types'
20
+ import type { EvalEnvironmentPlan, EvalGateResult, EvalRun, EvalSuiteResult } from '@/lib/server/eval/types'
21
21
  import type { Agent, ApprovalRequest, SessionRunRecord } from '@/types'
22
22
 
23
23
  type QualityTab = 'overview' | 'evals' | 'approvals' | 'runs'
@@ -105,6 +105,217 @@ function EmptyState({ title, description }: { title: string; description: string
105
105
  )
106
106
  }
107
107
 
108
/**
 * Picks the border/background/text utility classes for the environment status badge.
 *
 * @param status - Status of the eval environment plan.
 * @returns Emerald classes for `'ready'`, amber classes for `'warning'`, and rose
 *   classes for every other status.
 */
function environmentStatusClass(status: EvalEnvironmentPlan['status']): string {
  switch (status) {
    case 'ready':
      return 'border-emerald-500/25 bg-emerald-500/10 text-emerald-200'
    case 'warning':
      return 'border-amber-500/25 bg-amber-500/10 text-amber-200'
    default:
      // Anything that is neither ready nor a warning is styled as an error.
      return 'border-rose-500/25 bg-rose-500/10 text-rose-200'
  }
}
113
+
114
/**
 * Picks the utility classes for a single environment preflight check row.
 *
 * @param level - Severity of the check.
 * @returns Rose classes for `'error'`, amber classes for `'warn'`, and neutral
 *   classes otherwise (i.e. `'info'`).
 */
function checkClass(level: 'info' | 'warn' | 'error'): string {
  switch (level) {
    case 'error':
      return 'border-rose-500/20 bg-rose-500/[0.05] text-rose-200'
    case 'warn':
      return 'border-amber-500/20 bg-amber-500/[0.05] text-amber-200'
    default:
      return 'border-white/[0.06] bg-white/[0.025] text-text-3'
  }
}
119
+
120
/**
 * Picks the utility classes for the overall regression-gate status badge.
 *
 * @param status - Status reported by the gate result.
 * @returns Emerald classes for `'pass'`, amber classes for `'warn'`, and rose
 *   classes for every other status.
 */
function gateStatusClass(status: EvalGateResult['status']): string {
  switch (status) {
    case 'pass':
      return 'border-emerald-500/25 bg-emerald-500/10 text-emerald-200'
    case 'warn':
      return 'border-amber-500/25 bg-amber-500/10 text-amber-200'
    default:
      // Anything that is neither a pass nor a warning is styled as a failure.
      return 'border-rose-500/25 bg-rose-500/10 text-rose-200'
  }
}
125
+
126
/**
 * Picks the utility classes for a single regression-gate check row.
 *
 * Unlike the gate badge helper, the fallback here is the passing (emerald)
 * style: only `'fail'` and `'warn'` are singled out.
 *
 * @param status - Status of the individual gate check.
 * @returns Rose classes for `'fail'`, amber classes for `'warn'`, emerald classes otherwise.
 */
function gateCheckClass(status: EvalGateResult['status']): string {
  switch (status) {
    case 'fail':
      return 'border-rose-500/20 bg-rose-500/[0.05] text-rose-200'
    case 'warn':
      return 'border-amber-500/20 bg-amber-500/[0.05] text-amber-200'
    default:
      return 'border-emerald-500/20 bg-emerald-500/[0.05] text-emerald-200'
  }
}
131
+
132
/**
 * Card summarising the validation environment for the selected eval.
 *
 * Always renders the heading and a refresh button. When a plan is available it
 * also renders the status badge, the target (if any), tool and generated-file
 * counts, the optional environment label, the first few preflight checks and
 * the first few generated file paths; anything beyond the caps is summarised
 * as a "+N" hint.
 *
 * @param plan - Environment plan to display, or `null` when none is loaded.
 * @param loading - Whether a plan request is in flight; disables the refresh button
 *   and switches the placeholder/button copy.
 * @param onRefresh - Invoked when the refresh button is clicked.
 */
function EvalEnvironmentPanel({ plan, loading, onRefresh }: {
  plan: EvalEnvironmentPlan | null
  loading: boolean
  onRefresh: () => void
}) {
  // Display caps keep the card compact.
  const maxVisibleChecks = 4
  const maxVisibleFiles = 5
  const hiddenChecks = plan ? Math.max(0, plan.checks.length - maxVisibleChecks) : 0
  const hiddenFiles = plan ? Math.max(0, plan.generatedFiles.length - maxVisibleFiles) : 0

  return (
    <div className="rounded-[12px] border border-white/[0.06] bg-white/[0.025] px-3 py-3">
      <div className="flex items-start justify-between gap-3">
        <div>
          <div className="text-[13px] font-800 text-text">Validation environment</div>
          <p className="mt-1 text-[11px] leading-relaxed text-text-3/65">
            Preflight checks, workspace context, and generated files for the selected eval.
          </p>
        </div>
        <button
          type="button"
          onClick={onRefresh}
          disabled={loading}
          className="shrink-0 rounded-[8px] border border-white/[0.08] px-2 py-1 text-[10px] font-800 text-text-2 transition-colors hover:bg-white/[0.06] disabled:opacity-40"
        >
          {loading ? 'Checking' : 'Refresh'}
        </button>
      </div>
      {!plan ? (
        <div className="mt-3 text-[11px] text-text-3/60">{loading ? 'Checking readiness...' : 'Choose an agent and scenario.'}</div>
      ) : (
        <div className="mt-3 flex flex-col gap-3">
          <div className="flex flex-wrap items-center gap-2">
            <span className={cn('rounded-full border px-2 py-1 text-[10px] font-800 uppercase tracking-[0.08em]', environmentStatusClass(plan.status))}>
              {plan.status}
            </span>
            {plan.target && (
              <span className="rounded-full bg-white/[0.05] px-2 py-1 text-[10px] font-700 text-text-3">
                {plan.target.kind} - {plan.target.label}
              </span>
            )}
            <span className="rounded-full bg-white/[0.05] px-2 py-1 text-[10px] font-700 text-text-3">
              {plan.requiredTools.length} tool{plan.requiredTools.length === 1 ? '' : 's'}
            </span>
            <span className="rounded-full bg-white/[0.05] px-2 py-1 text-[10px] font-700 text-text-3">
              {plan.generatedFiles.length} file{plan.generatedFiles.length === 1 ? '' : 's'}
            </span>
          </div>
          {plan.target?.environmentLabel && (
            <div className="rounded-[10px] border border-white/[0.06] bg-white/[0.02] px-3 py-2 text-[11px] text-text-3/70">
              Environment: <span className="font-700 text-text-2">{plan.target.environmentLabel}</span>
              {plan.target.environmentStatus ? ` (${plan.target.environmentStatus})` : ''}
            </div>
          )}
          <div className="flex flex-col gap-1.5">
            {plan.checks.slice(0, maxVisibleChecks).map((check, index) => (
              // The index is part of the key because two checks may share code and message.
              <div key={`${index}:${check.code}:${check.message}`} className={cn('rounded-[9px] border px-2.5 py-2 text-[11px] leading-relaxed', checkClass(check.level))}>
                <span className="font-800 uppercase tracking-[0.08em]">{check.level}</span>
                <span className="ml-2">{check.message}</span>
              </div>
            ))}
            {hiddenChecks > 0 && (
              <div className="text-[10px] text-text-3/55">+{hiddenChecks} more check{hiddenChecks === 1 ? '' : 's'}</div>
            )}
          </div>
          <div className="flex flex-wrap gap-1.5">
            {plan.generatedFiles.slice(0, maxVisibleFiles).map((file, index) => (
              // The index is part of the key because kind and path alone may repeat.
              <span key={`${index}:${file.kind}:${file.path}`} className="rounded-full bg-white/[0.04] px-2 py-1 text-[10px] font-700 text-text-3">
                {file.path}
              </span>
            ))}
            {hiddenFiles > 0 && (
              <span className="rounded-full bg-white/[0.04] px-2 py-1 text-[10px] font-700 text-text-3">
                +{hiddenFiles}
              </span>
            )}
          </div>
        </div>
      )}
    </div>
  )
}
209
+
210
/**
 * Card showing the regression gate for the selected agent and scope.
 *
 * Always renders the heading, a refresh button and the scenario/suite scope
 * toggle. When a gate result is available it also renders the status badge,
 * scope label, latest-run coverage, the current/baseline/regression figures,
 * the first few gate checks and the button that saves the baseline.
 *
 * @param gate - Gate result to display, or `null` when none is loaded.
 * @param loading - Whether a gate request is in flight; disables the refresh button.
 * @param busy - Whether a baseline save is in flight; disables the baseline button.
 * @param scope - Currently selected gate scope.
 * @param onScopeChange - Invoked with the scope whose toggle button was clicked.
 * @param onRefresh - Invoked when the refresh button is clicked.
 * @param onSetBaseline - Invoked when the baseline button is clicked.
 */
function EvalGatePanel({
  gate,
  loading,
  busy,
  scope,
  onScopeChange,
  onRefresh,
  onSetBaseline,
}: {
  gate: EvalGateResult | null
  loading: boolean
  busy: boolean
  scope: 'scenario' | 'suite'
  onScopeChange: (scope: 'scenario' | 'suite') => void
  onRefresh: () => void
  onSetBaseline: () => void
}) {
  // Display cap keeps the card compact; the remainder is summarised as "+N more".
  const maxVisibleChecks = 4
  const hiddenChecks = gate ? Math.max(0, gate.checks.length - maxVisibleChecks) : 0
  // A baseline can only be saved when there are runs and none are missing from the scope.
  const baselineDisabled = busy
    || !gate
    || gate.latestRuns.length === 0
    || gate.checks.some((check) => check.code === 'missing_scope_runs')

  return (
    <div className="rounded-[12px] border border-white/[0.06] bg-white/[0.025] px-3 py-3">
      <div className="flex items-start justify-between gap-3">
        <div>
          <div className="text-[13px] font-800 text-text">Regression gate</div>
          <p className="mt-1 text-[11px] leading-relaxed text-text-3/65">
            Compare latest eval evidence against thresholds and an approved baseline.
          </p>
        </div>
        <button
          type="button"
          onClick={onRefresh}
          disabled={loading}
          className="shrink-0 rounded-[8px] border border-white/[0.08] px-2 py-1 text-[10px] font-800 text-text-2 transition-colors hover:bg-white/[0.06] disabled:opacity-40"
        >
          {loading ? 'Checking' : 'Refresh'}
        </button>
      </div>

      <div className="mt-3 flex rounded-[10px] border border-white/[0.06] bg-white/[0.025] p-1">
        {(['scenario', 'suite'] as const).map((item) => (
          <button
            key={item}
            type="button"
            // Expose the toggle state to assistive technology, not only through colour.
            aria-pressed={scope === item}
            onClick={() => onScopeChange(item)}
            className={cn(
              'flex-1 rounded-[8px] px-2 py-1.5 text-[10px] font-800 uppercase tracking-[0.08em] transition-colors',
              scope === item ? 'bg-white/[0.1] text-text' : 'text-text-3 hover:bg-white/[0.05]',
            )}
          >
            {item}
          </button>
        ))}
      </div>

      {!gate ? (
        <div className="mt-3 text-[11px] text-text-3/60">{loading ? 'Checking gate...' : 'Run evals to build gate evidence.'}</div>
      ) : (
        <div className="mt-3 flex flex-col gap-3">
          <div className="flex flex-wrap items-center gap-2">
            <span className={cn('rounded-full border px-2 py-1 text-[10px] font-800 uppercase tracking-[0.08em]', gateStatusClass(gate.status))}>
              {gate.status}
            </span>
            <span className="rounded-full bg-white/[0.05] px-2 py-1 text-[10px] font-700 text-text-3">
              {gate.scope.label}
            </span>
            <span className="rounded-full bg-white/[0.05] px-2 py-1 text-[10px] font-700 text-text-3">
              {gate.latestRuns.length}/{gate.scope.scenarioIds.length} latest runs
            </span>
          </div>

          <div className="grid grid-cols-3 gap-2">
            <div className="rounded-[10px] border border-white/[0.06] bg-white/[0.02] px-2 py-2">
              <div className="text-[9px] font-800 uppercase tracking-[0.08em] text-text-3/50">Current</div>
              <div className="mt-1 text-[14px] font-800 text-text">{formatPercent(gate.currentPercent)}</div>
            </div>
            <div className="rounded-[10px] border border-white/[0.06] bg-white/[0.02] px-2 py-2">
              <div className="text-[9px] font-800 uppercase tracking-[0.08em] text-text-3/50">Baseline</div>
              {/* Same formatter as "Current" so both figures are rendered consistently. */}
              <div className="mt-1 text-[14px] font-800 text-text">{gate.baseline ? formatPercent(gate.baseline.baselinePercent) : 'none'}</div>
            </div>
            <div className="rounded-[10px] border border-white/[0.06] bg-white/[0.02] px-2 py-2">
              <div className="text-[9px] font-800 uppercase tracking-[0.08em] text-text-3/50">Regression</div>
              <div className="mt-1 text-[14px] font-800 text-text">{gate.regressionPoints == null ? 'n/a' : `${gate.regressionPoints}pt`}</div>
            </div>
          </div>

          <div className="flex flex-col gap-1.5">
            {gate.checks.slice(0, maxVisibleChecks).map((check, index) => (
              // The index is part of the key because two checks may share code and message.
              <div key={`${index}:${check.code}:${check.message}`} className={cn('rounded-[9px] border px-2.5 py-2 text-[11px] leading-relaxed', gateCheckClass(check.status))}>
                <span className="font-800 uppercase tracking-[0.08em]">{check.status}</span>
                <span className="ml-2">{check.message}</span>
              </div>
            ))}
            {hiddenChecks > 0 && (
              <div className="text-[10px] text-text-3/55">+{hiddenChecks} more check{hiddenChecks === 1 ? '' : 's'}</div>
            )}
          </div>

          <button
            type="button"
            onClick={onSetBaseline}
            disabled={baselineDisabled}
            className="rounded-[9px] border border-white/[0.08] bg-white/[0.04] px-3 py-2 text-[11px] font-800 text-text-2 transition-colors hover:bg-white/[0.08] disabled:cursor-not-allowed disabled:opacity-40"
          >
            {busy ? 'Saving baseline' : gate.baseline ? 'Update baseline' : 'Set baseline'}
          </button>
        </div>
      )}
    </div>
  )
}
318
+
108
319
  export function QualityWorkspace() {
109
320
  const router = useRouter()
110
321
  const searchParams = useSearchParams()
@@ -127,6 +338,12 @@ export function QualityWorkspace() {
127
338
  const [selectedSuite, setSelectedSuite] = useState('core')
128
339
  const [selectedScenarioId, setSelectedScenarioId] = useState('')
129
340
  const [evalBusy, setEvalBusy] = useState<string | null>(null)
341
+ const [evalEnvironmentPlan, setEvalEnvironmentPlan] = useState<EvalEnvironmentPlan | null>(null)
342
+ const [evalEnvironmentLoading, setEvalEnvironmentLoading] = useState(false)
343
+ const [evalGate, setEvalGate] = useState<EvalGateResult | null>(null)
344
+ const [evalGateScope, setEvalGateScope] = useState<'scenario' | 'suite'>('scenario')
345
+ const [evalGateLoading, setEvalGateLoading] = useState(false)
346
+ const [evalBaselineBusy, setEvalBaselineBusy] = useState(false)
130
347
  const [approvalBusy, setApprovalBusy] = useState<string | null>(null)
131
348
 
132
349
  useEffect(() => {
@@ -170,6 +387,51 @@ export function QualityWorkspace() {
170
387
  }
171
388
  }, [])
172
389
 
390
/**
 * Loads the validation environment plan for the current selection.
 *
 * Clears the plan when no agent is selected. Otherwise requests
 * `/eval/environments` with the agent id plus either the selected scenario or,
 * when no scenario is selected, the selected suite. On failure the plan is
 * cleared and the error is shown as a toast.
 *
 * @param opts.refreshGateway - When set, asks the server to refresh the gateway
 *   and allows the request a longer timeout.
 *
 * NOTE(review): responses are applied in arrival order, so a slow response for
 * a previous selection could overwrite a newer one — confirm whether `api`
 * cancels superseded requests.
 */
const loadEvalEnvironmentPlan = useCallback(async (opts: { refreshGateway?: boolean } = {}) => {
  if (!selectedAgentId) {
    setEvalEnvironmentPlan(null)
    return
  }

  const params = new URLSearchParams({ agentId: selectedAgentId })
  // A selected scenario takes precedence over the suite.
  if (selectedScenarioId) params.set('scenarioId', selectedScenarioId)
  else if (selectedSuite) params.set('suite', selectedSuite)
  if (opts.refreshGateway) params.set('refreshGateway', 'true')

  // Gateway refreshes get a longer budget than plain lookups.
  const timeoutMs = opts.refreshGateway ? 20_000 : 8_000

  setEvalEnvironmentLoading(true)
  try {
    const plan = await api<EvalEnvironmentPlan>('GET', `/eval/environments?${params.toString()}`, undefined, { timeoutMs })
    setEvalEnvironmentPlan(plan)
  } catch (err) {
    setEvalEnvironmentPlan(null)
    toast.error(err instanceof Error ? err.message : 'Unable to validate eval environment')
  } finally {
    setEvalEnvironmentLoading(false)
  }
}, [selectedAgentId, selectedScenarioId, selectedSuite])
410
+
411
/**
 * Loads the regression gate result for the current agent and gate scope.
 *
 * Clears the gate when no agent is selected, or when the scope is `'scenario'`
 * and no scenario is selected. Otherwise requests `/eval/gate` with the agent
 * id plus the scenario id or suite name, depending on the scope. On failure the
 * gate is cleared and the error is shown as a toast.
 *
 * NOTE(review): responses are applied in arrival order, so a slow response for
 * a previous selection could overwrite a newer one — confirm whether `api`
 * cancels superseded requests.
 */
const loadEvalGate = useCallback(async () => {
  const scenarioScoped = evalGateScope === 'scenario'
  if (!selectedAgentId || (scenarioScoped && !selectedScenarioId)) {
    setEvalGate(null)
    return
  }

  const params = new URLSearchParams({ agentId: selectedAgentId })
  if (scenarioScoped) params.set('scenarioId', selectedScenarioId)
  else params.set('suite', selectedSuite)

  setEvalGateLoading(true)
  try {
    const gate = await api<EvalGateResult>('GET', `/eval/gate?${params.toString()}`)
    setEvalGate(gate)
  } catch (err) {
    setEvalGate(null)
    toast.error(err instanceof Error ? err.message : 'Unable to check eval gate')
  } finally {
    setEvalGateLoading(false)
  }
}, [evalGateScope, selectedAgentId, selectedScenarioId, selectedSuite])
434
+
173
435
  useEffect(() => {
174
436
  void loadQualityData()
175
437
  }, [loadQualityData])
@@ -184,6 +446,14 @@ export function QualityWorkspace() {
184
446
  if (!selectedScenarioId && scenarios[0]) setSelectedScenarioId(scenarios[0].id)
185
447
  }, [scenarios, selectedScenarioId])
186
448
 
449
+ useEffect(() => {
450
+ void loadEvalEnvironmentPlan()
451
+ }, [loadEvalEnvironmentPlan])
452
+
453
+ useEffect(() => {
454
+ void loadEvalGate()
455
+ }, [loadEvalGate])
456
+
187
457
  useEffect(() => {
188
458
  if (!suites.some((suite) => suite.name === selectedSuite) && suites[0]) {
189
459
  setSelectedSuite(suites[0].name)
@@ -208,34 +478,82 @@ export function QualityWorkspace() {
208
478
  toast.error('Choose an agent and scenario first')
209
479
  return
210
480
  }
481
+ if (evalEnvironmentPlan?.status === 'blocked') {
482
+ toast.error('Fix the validation environment before running this eval')
483
+ return
484
+ }
211
485
  setEvalBusy(`scenario:${selectedScenarioId}`)
212
486
  try {
213
- await api<EvalRun>('POST', '/eval/run', { agentId: selectedAgentId, scenarioId: selectedScenarioId }, { timeoutMs: 180_000 })
487
+ await api<EvalRun>('POST', '/eval/run', {
488
+ agentId: selectedAgentId,
489
+ scenarioId: selectedScenarioId,
490
+ gatewayProfileId: evalEnvironmentPlan?.target?.gatewayProfileId || null,
491
+ environmentId: evalEnvironmentPlan?.target?.environmentId || null,
492
+ refreshGateway: evalEnvironmentPlan?.target?.kind === 'gateway',
493
+ }, { timeoutMs: 180_000 })
214
494
  toast.success('Eval scenario completed')
215
495
  await loadQualityData({ silent: true })
496
+ await loadEvalEnvironmentPlan()
497
+ await loadEvalGate()
216
498
  } catch (err) {
217
499
  toast.error(err instanceof Error ? err.message : 'Eval scenario failed')
218
500
  } finally {
219
501
  setEvalBusy(null)
220
502
  }
221
- }, [loadQualityData, selectedAgentId, selectedScenarioId])
503
+ }, [evalEnvironmentPlan, loadEvalEnvironmentPlan, loadEvalGate, loadQualityData, selectedAgentId, selectedScenarioId])
222
504
 
223
505
/**
 * Runs an eval suite for the selected agent and refreshes dependent panels.
 *
 * Refuses to start (with an error toast) when no agent is selected or when the
 * loaded environment plan is `'blocked'`. The gateway profile, environment id
 * and gateway-refresh flag are taken from the loaded plan's target.
 *
 * @param suiteName - Name of the suite to run.
 *
 * NOTE(review): the environment plan is loaded for the selected scenario when
 * one is selected, so the `'blocked'` guard may reflect that scenario rather
 * than `suiteName` — confirm this is intended.
 */
const runSuite = useCallback(async (suiteName: string) => {
  if (!selectedAgentId) {
    toast.error('Choose an agent first')
    return
  }
  if (evalEnvironmentPlan?.status === 'blocked') {
    toast.error('Fix the validation environment before running this suite')
    return
  }

  const target = evalEnvironmentPlan?.target
  setEvalBusy(`suite:${suiteName}`)
  try {
    const result = await api<EvalSuiteResult>('POST', '/eval/suite', {
      agentId: selectedAgentId,
      suite: suiteName,
      gatewayProfileId: target?.gatewayProfileId || null,
      environmentId: target?.environmentId || null,
      refreshGateway: target?.kind === 'gateway',
    }, { timeoutMs: 300_000 })
    toast.success(`Suite completed at ${Math.round(result.percentage)}%`)
    await loadQualityData({ silent: true })
    // The two panel refreshes are independent and each handles its own errors,
    // so they can run concurrently.
    await Promise.all([loadEvalEnvironmentPlan(), loadEvalGate()])
  } catch (err) {
    toast.error(err instanceof Error ? err.message : 'Eval suite failed')
  } finally {
    setEvalBusy(null)
  }
}, [evalEnvironmentPlan, loadEvalEnvironmentPlan, loadEvalGate, loadQualityData, selectedAgentId])
533
+
534
/**
 * Saves the current gate scope as the approved eval baseline.
 *
 * Refuses to start (with an error toast) when no agent is selected, or when the
 * scope is `'scenario'` and no scenario is selected. Posts to `/eval/baselines`
 * with the thresholds of the loaded gate, falling back to 80 % minimum score
 * and 5 points of allowed regression, then stores the gate returned by the
 * server.
 */
const setEvalBaseline = useCallback(async () => {
  if (!selectedAgentId) {
    toast.error('Choose an agent first')
    return
  }
  if (evalGateScope === 'scenario' && !selectedScenarioId) {
    toast.error('Choose a scenario first')
    return
  }

  setEvalBaselineBusy(true)
  try {
    // Thresholds are identical for both scopes; only the scope selector differs.
    const thresholds = {
      minPercent: evalGate?.minPercent ?? 80,
      maxRegressionPoints: evalGate?.maxRegressionPoints ?? 5,
    }
    const body = evalGateScope === 'scenario'
      ? { agentId: selectedAgentId, scenarioId: selectedScenarioId, ...thresholds }
      : { agentId: selectedAgentId, suite: selectedSuite, ...thresholds }
    const result = await api<{ gate: EvalGateResult }>('POST', '/eval/baselines', body)
    setEvalGate(result.gate)
    toast.success('Eval baseline saved')
  } catch (err) {
    toast.error(err instanceof Error ? err.message : 'Unable to save eval baseline')
  } finally {
    setEvalBaselineBusy(false)
  }
}, [evalGate, evalGateScope, selectedAgentId, selectedScenarioId, selectedSuite])
239
557
 
240
558
  const actOnApproval = useCallback(async (approval: ApprovalRequest, approved: boolean) => {
241
559
  setApprovalBusy(approval.id)
@@ -456,6 +774,20 @@ export function QualityWorkspace() {
456
774
  </div>
457
775
  </div>
458
776
  )}
777
+ <EvalEnvironmentPanel
778
+ plan={evalEnvironmentPlan}
779
+ loading={evalEnvironmentLoading}
780
+ onRefresh={() => void loadEvalEnvironmentPlan({ refreshGateway: true })}
781
+ />
782
+ <EvalGatePanel
783
+ gate={evalGate}
784
+ loading={evalGateLoading}
785
+ busy={evalBaselineBusy}
786
+ scope={evalGateScope}
787
+ onScopeChange={setEvalGateScope}
788
+ onRefresh={() => void loadEvalGate()}
789
+ onSetBaseline={() => void setEvalBaseline()}
790
+ />
459
791
  <button
460
792
  type="button"
461
793
  onClick={() => openMissionTemplate('release-candidate-qa')}
@@ -0,0 +1,111 @@
1
+ import assert from 'node:assert/strict'
2
+ import test from 'node:test'
3
+
4
+ import {
5
+ evaluateEvalGate,
6
+ setEvalBaseline,
7
+ } from './baseline'
8
+ import type { EvalBaseline, EvalRun } from './types'
9
+
10
/**
 * Builds an eval run fixture: a completed 8/10 run of `coding-prime` by `agent-1`.
 *
 * @param overrides - Fields that replace the fixture defaults.
 * @returns The default run with `overrides` applied on top.
 */
function makeRun(overrides: Partial<EvalRun> = {}): EvalRun {
  const defaults: EvalRun = {
    id: 'run-1',
    scenarioId: 'coding-prime',
    agentId: 'agent-1',
    status: 'completed',
    startedAt: 1,
    endedAt: 2,
    score: 8,
    maxScore: 10,
    details: [],
  }
  return { ...defaults, ...overrides }
}
24
+
25
/**
 * Builds in-memory dependency stubs for the baseline/gate functions under test.
 *
 * @param runs - Runs that `listRunsByAgent` filters by agent id.
 * @param baseline - Value returned by `getBaselineForScope` regardless of scope.
 * @param saved - Array that collects every baseline passed to `saveBaseline`.
 * @returns Stubs with a fixed clock (`now()` is always 123).
 */
function depsFor(runs: EvalRun[], baseline: EvalBaseline | null = null, saved: EvalBaseline[] = []) {
  return {
    now: () => 123,
    listRunsByAgent: (agentId: string) => runs.filter((run) => run.agentId === agentId),
    getBaselineForScope: () => baseline,
    saveBaseline: (next: EvalBaseline) => {
      saved.push(next)
    },
  }
}
33
+
34
// Baselining a scenario snapshots only the most recent run and keeps the requested thresholds.
test('setEvalBaseline snapshots the latest scenario score and gate defaults', () => {
  const saved: EvalBaseline[] = []
  const baseline = setEvalBaseline(
    {
      agentId: 'agent-1',
      scenarioId: 'coding-prime',
      minPercent: 75,
      maxRegressionPoints: 3,
      label: 'Release candidate',
    },
    depsFor([
      makeRun({ id: 'older', score: 4, startedAt: 1, endedAt: 2 }),
      makeRun({ id: 'latest', score: 8, startedAt: 5, endedAt: 6 }),
    ], null, saved),
  )

  assert.equal(saved.length, 1)
  assert.equal(baseline.scope.type, 'scenario')
  assert.equal(baseline.scope.id, 'coding-prime')
  assert.equal(baseline.baselinePercent, 80)
  assert.equal(baseline.minPercent, 75)
  assert.equal(baseline.maxRegressionPoints, 3)
  assert.deepEqual(baseline.runIds, ['latest'])
})

// Without a stored baseline the gate is a warning even though the score threshold is met.
test('evaluateEvalGate warns until a baseline is approved', () => {
  const gate = evaluateEvalGate(
    { agentId: 'agent-1', scenarioId: 'coding-prime', minPercent: 70 },
    depsFor([makeRun({ score: 8, maxScore: 10 })]),
  )

  assert.equal(gate.currentPercent, 80)
  assert.equal(gate.status, 'warn')
  assert.ok(gate.checks.some((check) => check.code === 'baseline_missing' && check.status === 'warn'))
})

// A drop from 90 % to 60 % exceeds the 2-point allowance and fails the gate.
test('evaluateEvalGate fails when regression exceeds the baseline allowance', () => {
  const baseline = setEvalBaseline(
    { agentId: 'agent-1', scenarioId: 'coding-prime', minPercent: 70, maxRegressionPoints: 2 },
    depsFor([makeRun({ id: 'baseline', score: 9, maxScore: 10 })]),
  )

  const gate = evaluateEvalGate(
    { agentId: 'agent-1', scenarioId: 'coding-prime' },
    depsFor([makeRun({ id: 'current', score: 6, maxScore: 10, startedAt: 10, endedAt: 11 })], baseline),
  )

  assert.equal(gate.currentPercent, 60)
  assert.equal(gate.regressionPoints, 30)
  assert.equal(gate.status, 'fail')
  assert.ok(gate.checks.some((check) => check.code === 'regression_limit_exceeded'))
})

// An unchanged score against the baseline passes with zero regression.
test('evaluateEvalGate passes when score and regression checks pass', () => {
  const baseline = setEvalBaseline(
    { agentId: 'agent-1', scenarioId: 'coding-prime', minPercent: 70, maxRegressionPoints: 5 },
    depsFor([makeRun({ id: 'baseline', score: 8, maxScore: 10 })]),
  )

  const gate = evaluateEvalGate(
    { agentId: 'agent-1', scenarioId: 'coding-prime' },
    depsFor([makeRun({ id: 'current', score: 8, maxScore: 10, startedAt: 10, endedAt: 11 })], baseline),
  )

  assert.equal(gate.status, 'pass')
  assert.equal(gate.regressionPoints, 0)
  assert.ok(gate.checks.some((check) => check.code === 'score_threshold_met'))
})

// A suite baseline is rejected when only one scenario's run is supplied.
test('suite gates require latest runs for every scenario in scope before baselining', () => {
  assert.throws(
    () => setEvalBaseline(
      { agentId: 'agent-1', suite: 'core' },
      depsFor([makeRun({ scenarioId: 'coding-prime' })]),
    ),
    /Baseline requires latest runs for every scenario in scope/,
  )
})