@swarmclawai/swarmclaw 1.9.6 → 1.9.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -0
- package/package.json +2 -2
- package/src/app/api/eval/baselines/route.ts +55 -0
- package/src/app/api/eval/gate/route.ts +36 -0
- package/src/app/api/quality/release-readiness/route.ts +38 -0
- package/src/cli/index.js +4 -0
- package/src/components/quality/quality-workspace.tsx +352 -4
- package/src/lib/quality/release-readiness.test.ts +129 -0
- package/src/lib/quality/release-readiness.ts +187 -0
- package/src/lib/server/eval/baseline.test.ts +111 -0
- package/src/lib/server/eval/baseline.ts +274 -0
- package/src/lib/server/eval/store.ts +47 -1
- package/src/lib/server/eval/types.ts +50 -0
- package/src/lib/server/session-tools/extension-creator.ts +2 -2
- package/src/lib/server/tasks/task-checkout.ts +1 -1
- package/src/types/extension.ts +3 -3
- package/electron-dist/main.js +0 -218
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import assert from 'node:assert/strict'
|
|
2
|
+
import { describe, it } from 'node:test'
|
|
3
|
+
|
|
4
|
+
import { buildReleaseReadinessReport } from './release-readiness'
|
|
5
|
+
import type { EvalGateResult } from '@/lib/server/eval/types'
|
|
6
|
+
import type { OperationPulse } from '@/types'
|
|
7
|
+
|
|
8
|
+
const now = 100_000
|
|
9
|
+
|
|
10
|
+
function pulse(overrides: Partial<OperationPulse> = {}): OperationPulse {
|
|
11
|
+
return {
|
|
12
|
+
generatedAt: now,
|
|
13
|
+
range: '24h',
|
|
14
|
+
windowStart: now - 86_400_000,
|
|
15
|
+
kpis: {
|
|
16
|
+
activeMissions: 0,
|
|
17
|
+
runningRuns: 0,
|
|
18
|
+
failedRuns: 0,
|
|
19
|
+
pendingApprovals: 0,
|
|
20
|
+
connectorAttention: 0,
|
|
21
|
+
gatewayAttention: 0,
|
|
22
|
+
budgetWarnings: 0,
|
|
23
|
+
},
|
|
24
|
+
actions: [],
|
|
25
|
+
...overrides,
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
function evalGate(overrides: Partial<EvalGateResult> = {}): EvalGateResult {
|
|
30
|
+
return {
|
|
31
|
+
agentId: 'agent_1',
|
|
32
|
+
scope: {
|
|
33
|
+
type: 'suite',
|
|
34
|
+
id: 'core',
|
|
35
|
+
label: 'core',
|
|
36
|
+
scenarioIds: ['coding-prime'],
|
|
37
|
+
},
|
|
38
|
+
status: 'pass',
|
|
39
|
+
generatedAt: now,
|
|
40
|
+
baseline: null,
|
|
41
|
+
latestRuns: [],
|
|
42
|
+
currentScore: 10,
|
|
43
|
+
currentMaxScore: 10,
|
|
44
|
+
currentPercent: 100,
|
|
45
|
+
regressionPoints: 0,
|
|
46
|
+
minPercent: 80,
|
|
47
|
+
maxRegressionPoints: 5,
|
|
48
|
+
checks: [{ code: 'score_threshold_met', status: 'pass', message: 'Current score meets the 80% gate.' }],
|
|
49
|
+
...overrides,
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
describe('release readiness report', () => {
  // Clean pulse + passing gate → 'ready' with a perfect score and no findings.
  it('passes when eval gate and operations pulse are clean', () => {
    const report = buildReleaseReadinessReport({
      pulse: pulse(),
      evalGate: evalGate(),
    })

    assert.equal(report.status, 'ready')
    assert.equal(report.score, 100)
    assert.equal(report.blockerCount, 0)
    assert.equal(report.warningCount, 0)
    assert.ok(report.checks.some((check) => check.code === 'eval_gate_passed'))
  })

  // Omitting the gate entirely is a warning (score penalty), never a blocker.
  it('warns when no eval gate is selected', () => {
    const report = buildReleaseReadinessReport({
      pulse: pulse(),
      evalGate: null,
    })

    assert.equal(report.status, 'warning')
    assert.equal(report.blockerCount, 0)
    assert.equal(report.warningCount, 1)
    assert.ok(report.score < 100)
    assert.ok(report.checks.some((check) => check.code === 'eval_gate_missing'))
  })

  // A failing gate is a single blocker; score drops by at least the blocker penalty.
  it('blocks when eval regression gate fails', () => {
    const report = buildReleaseReadinessReport({
      pulse: pulse(),
      evalGate: evalGate({
        status: 'fail',
        currentPercent: 60,
        checks: [{ code: 'score_below_threshold', status: 'fail', message: 'Current score is below the 80% gate.' }],
      }),
    })

    assert.equal(report.status, 'blocked')
    assert.equal(report.blockerCount, 1)
    assert.ok(report.score <= 70)
    assert.ok(report.checks.some((check) => check.code === 'eval_gate_failed'))
  })

  // Every non-zero KPI produces a check; pulse actions surface as nextActions.
  it('blocks on failed runs and pending approvals, then surfaces pulse actions', () => {
    const report = buildReleaseReadinessReport({
      pulse: pulse({
        kpis: {
          activeMissions: 1,
          runningRuns: 1,
          failedRuns: 2,
          pendingApprovals: 3,
          connectorAttention: 1,
          gatewayAttention: 1,
          budgetWarnings: 1,
        },
        actions: [{
          id: 'run:failed',
          kind: 'run',
          severity: 'high',
          title: 'Review failed run',
          summary: 'Run failed',
          href: '/quality?tab=runs',
          evidence: ['run'],
          createdAt: now,
        }],
      }),
      evalGate: evalGate(),
    })

    assert.equal(report.status, 'blocked')
    assert.equal(report.blockerCount, 2)
    assert.ok(report.warningCount >= 4)
    assert.equal(report.nextActions[0]?.id, 'run:failed')
    assert.ok(report.checks.some((check) => check.code === 'failed_runs_present'))
    assert.ok(report.checks.some((check) => check.code === 'pending_approvals_present'))
  })
})
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
import type { EvalGateResult } from '@/lib/server/eval/types'
|
|
2
|
+
import type { OperationPulse, OperationPulseAction, OperationPulseRange } from '@/types'
|
|
3
|
+
|
|
4
|
+
/** Overall readiness verdict; 'blocked' dominates 'warning', which dominates 'ready'. */
export type ReleaseReadinessStatus = 'ready' | 'warning' | 'blocked'

/** One named finding contributing to the readiness verdict and score. */
export interface ReleaseReadinessCheck {
  // Stable machine-readable identifier, e.g. 'eval_gate_failed'.
  code: string
  status: ReleaseReadinessStatus
  title: string
  summary: string
  // Optional in-app link to the surface where the finding can be resolved.
  href?: string
  // Optional supporting detail strings (e.g. failing gate-check messages).
  evidence?: string[]
}

/** Aggregated release-readiness report built from the operations pulse and eval gate. */
export interface ReleaseReadinessReport {
  // Mirrors pulse.generatedAt (see buildReleaseReadinessReport).
  generatedAt: number
  range: OperationPulseRange
  status: ReleaseReadinessStatus
  // 0–100; 100 minus per-check penalties, floored at 0.
  score: number
  blockerCount: number
  warningCount: number
  pulse: OperationPulse
  evalGate: EvalGateResult | null
  checks: ReleaseReadinessCheck[]
  // First pulse actions (capped), in pulse order.
  nextActions: OperationPulseAction[]
}

// Score penalties subtracted from 100 per check in readinessScore.
const BLOCKER_PENALTY = 30
const WARNING_PENALTY = 10
|
|
30
|
+
|
|
31
|
+
function readinessStatus(checks: ReleaseReadinessCheck[]): ReleaseReadinessStatus {
|
|
32
|
+
if (checks.some((check) => check.status === 'blocked')) return 'blocked'
|
|
33
|
+
if (checks.some((check) => check.status === 'warning')) return 'warning'
|
|
34
|
+
return 'ready'
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
function readinessScore(checks: ReleaseReadinessCheck[]): number {
|
|
38
|
+
const penalty = checks.reduce((sum, check) => {
|
|
39
|
+
if (check.status === 'blocked') return sum + BLOCKER_PENALTY
|
|
40
|
+
if (check.status === 'warning') return sum + WARNING_PENALTY
|
|
41
|
+
return sum
|
|
42
|
+
}, 0)
|
|
43
|
+
return Math.max(0, 100 - penalty)
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
function plural(count: number, singular: string, pluralLabel = `${singular}s`): string {
|
|
47
|
+
return `${count} ${count === 1 ? singular : pluralLabel}`
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
// Appends a check; report ordering is exactly the order of addCheck calls.
function addCheck(checks: ReleaseReadinessCheck[], check: ReleaseReadinessCheck): void {
  checks.push(check)
}
|
|
53
|
+
|
|
54
|
+
export function buildReleaseReadinessReport(input: {
|
|
55
|
+
pulse: OperationPulse
|
|
56
|
+
evalGate?: EvalGateResult | null
|
|
57
|
+
}): ReleaseReadinessReport {
|
|
58
|
+
const checks: ReleaseReadinessCheck[] = []
|
|
59
|
+
const evalGate = input.evalGate ?? null
|
|
60
|
+
|
|
61
|
+
if (!evalGate) {
|
|
62
|
+
addCheck(checks, {
|
|
63
|
+
code: 'eval_gate_missing',
|
|
64
|
+
status: 'warning',
|
|
65
|
+
title: 'Select an eval gate',
|
|
66
|
+
summary: 'No eval regression gate is included in this readiness report.',
|
|
67
|
+
href: '/quality?tab=evals',
|
|
68
|
+
})
|
|
69
|
+
} else if (evalGate.status === 'fail') {
|
|
70
|
+
addCheck(checks, {
|
|
71
|
+
code: 'eval_gate_failed',
|
|
72
|
+
status: 'blocked',
|
|
73
|
+
title: 'Eval gate failed',
|
|
74
|
+
summary: `${evalGate.scope.label} is not passing the configured eval release gate.`,
|
|
75
|
+
href: '/quality?tab=evals',
|
|
76
|
+
evidence: evalGate.checks
|
|
77
|
+
.filter((check) => check.status === 'fail')
|
|
78
|
+
.map((check) => check.message),
|
|
79
|
+
})
|
|
80
|
+
} else if (evalGate.status === 'warn') {
|
|
81
|
+
addCheck(checks, {
|
|
82
|
+
code: 'eval_gate_warning',
|
|
83
|
+
status: 'warning',
|
|
84
|
+
title: 'Eval gate needs a baseline',
|
|
85
|
+
summary: `${evalGate.scope.label} passes the score threshold but still has release-gate warnings.`,
|
|
86
|
+
href: '/quality?tab=evals',
|
|
87
|
+
evidence: evalGate.checks
|
|
88
|
+
.filter((check) => check.status === 'warn')
|
|
89
|
+
.map((check) => check.message),
|
|
90
|
+
})
|
|
91
|
+
} else {
|
|
92
|
+
addCheck(checks, {
|
|
93
|
+
code: 'eval_gate_passed',
|
|
94
|
+
status: 'ready',
|
|
95
|
+
title: 'Eval gate passed',
|
|
96
|
+
summary: `${evalGate.scope.label} meets the configured score and regression checks.`,
|
|
97
|
+
href: '/quality?tab=evals',
|
|
98
|
+
evidence: [`${evalGate.currentPercent ?? 'n/a'}% current score`],
|
|
99
|
+
})
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
if (input.pulse.kpis.failedRuns > 0) {
|
|
103
|
+
addCheck(checks, {
|
|
104
|
+
code: 'failed_runs_present',
|
|
105
|
+
status: 'blocked',
|
|
106
|
+
title: 'Failed runs need review',
|
|
107
|
+
summary: `${plural(input.pulse.kpis.failedRuns, 'failed run')} found in the ${input.pulse.range} operations window.`,
|
|
108
|
+
href: '/quality?tab=runs',
|
|
109
|
+
})
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
if (input.pulse.kpis.pendingApprovals > 0) {
|
|
113
|
+
addCheck(checks, {
|
|
114
|
+
code: 'pending_approvals_present',
|
|
115
|
+
status: 'blocked',
|
|
116
|
+
title: 'Pending approvals need decisions',
|
|
117
|
+
summary: `${plural(input.pulse.kpis.pendingApprovals, 'approval')} still waiting on an operator.`,
|
|
118
|
+
href: '/quality?tab=approvals',
|
|
119
|
+
})
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
if (input.pulse.kpis.runningRuns > 0) {
|
|
123
|
+
addCheck(checks, {
|
|
124
|
+
code: 'active_runs_present',
|
|
125
|
+
status: 'warning',
|
|
126
|
+
title: 'Runs are still active',
|
|
127
|
+
summary: `${plural(input.pulse.kpis.runningRuns, 'run')} queued or running while this report was generated.`,
|
|
128
|
+
href: '/runs',
|
|
129
|
+
})
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
if (input.pulse.kpis.connectorAttention > 0) {
|
|
133
|
+
addCheck(checks, {
|
|
134
|
+
code: 'connector_attention_present',
|
|
135
|
+
status: 'warning',
|
|
136
|
+
title: 'Connector readiness needs attention',
|
|
137
|
+
summary: `${plural(input.pulse.kpis.connectorAttention, 'connector')} reporting degraded readiness.`,
|
|
138
|
+
href: '/connectors',
|
|
139
|
+
})
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
if (input.pulse.kpis.gatewayAttention > 0) {
|
|
143
|
+
addCheck(checks, {
|
|
144
|
+
code: 'gateway_attention_present',
|
|
145
|
+
status: 'warning',
|
|
146
|
+
title: 'Gateway readiness needs attention',
|
|
147
|
+
summary: `${plural(input.pulse.kpis.gatewayAttention, 'gateway')} reporting topology or environment warnings.`,
|
|
148
|
+
href: '/providers',
|
|
149
|
+
})
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
if (input.pulse.kpis.budgetWarnings > 0) {
|
|
153
|
+
addCheck(checks, {
|
|
154
|
+
code: 'budget_warnings_present',
|
|
155
|
+
status: 'warning',
|
|
156
|
+
title: 'Mission budget pressure',
|
|
157
|
+
summary: `${plural(input.pulse.kpis.budgetWarnings, 'mission')} near a configured budget limit.`,
|
|
158
|
+
href: '/missions',
|
|
159
|
+
})
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
if (input.pulse.kpis.activeMissions > 0) {
|
|
163
|
+
addCheck(checks, {
|
|
164
|
+
code: 'active_missions_present',
|
|
165
|
+
status: 'warning',
|
|
166
|
+
title: 'Missions are still active',
|
|
167
|
+
summary: `${plural(input.pulse.kpis.activeMissions, 'mission')} running or paused in the operations window.`,
|
|
168
|
+
href: '/missions',
|
|
169
|
+
})
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
const blockerCount = checks.filter((check) => check.status === 'blocked').length
|
|
173
|
+
const warningCount = checks.filter((check) => check.status === 'warning').length
|
|
174
|
+
|
|
175
|
+
return {
|
|
176
|
+
generatedAt: input.pulse.generatedAt,
|
|
177
|
+
range: input.pulse.range,
|
|
178
|
+
status: readinessStatus(checks),
|
|
179
|
+
score: readinessScore(checks),
|
|
180
|
+
blockerCount,
|
|
181
|
+
warningCount,
|
|
182
|
+
pulse: input.pulse,
|
|
183
|
+
evalGate,
|
|
184
|
+
checks,
|
|
185
|
+
nextActions: input.pulse.actions.slice(0, 8),
|
|
186
|
+
}
|
|
187
|
+
}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import assert from 'node:assert/strict'
|
|
2
|
+
import test from 'node:test'
|
|
3
|
+
|
|
4
|
+
import {
|
|
5
|
+
evaluateEvalGate,
|
|
6
|
+
setEvalBaseline,
|
|
7
|
+
} from './baseline'
|
|
8
|
+
import type { EvalBaseline, EvalRun } from './types'
|
|
9
|
+
|
|
10
|
+
function makeRun(overrides: Partial<EvalRun> = {}): EvalRun {
|
|
11
|
+
return {
|
|
12
|
+
id: 'run-1',
|
|
13
|
+
scenarioId: 'coding-prime',
|
|
14
|
+
agentId: 'agent-1',
|
|
15
|
+
status: 'completed',
|
|
16
|
+
startedAt: 1,
|
|
17
|
+
endedAt: 2,
|
|
18
|
+
score: 8,
|
|
19
|
+
maxScore: 10,
|
|
20
|
+
details: [],
|
|
21
|
+
...overrides,
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
function depsFor(runs: EvalRun[], baseline: EvalBaseline | null = null, saved: EvalBaseline[] = []) {
|
|
26
|
+
return {
|
|
27
|
+
now: () => 123,
|
|
28
|
+
listRunsByAgent: (agentId: string) => runs.filter((run) => run.agentId === agentId),
|
|
29
|
+
getBaselineForScope: () => baseline,
|
|
30
|
+
saveBaseline: (next: EvalBaseline) => { saved.push(next) },
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
// Baseline is snapshotted from the LATEST run per scenario ('latest', not 'older'),
// with caller-supplied gate numbers taking precedence over defaults.
test('setEvalBaseline snapshots the latest scenario score and gate defaults', () => {
  const saved: EvalBaseline[] = []
  const baseline = setEvalBaseline(
    {
      agentId: 'agent-1',
      scenarioId: 'coding-prime',
      minPercent: 75,
      maxRegressionPoints: 3,
      label: 'Release candidate',
    },
    depsFor([
      makeRun({ id: 'older', score: 4, startedAt: 1, endedAt: 2 }),
      makeRun({ id: 'latest', score: 8, startedAt: 5, endedAt: 6 }),
    ], null, saved),
  )

  assert.equal(saved.length, 1)
  assert.equal(baseline.scope.type, 'scenario')
  assert.equal(baseline.scope.id, 'coding-prime')
  assert.equal(baseline.baselinePercent, 80)
  assert.equal(baseline.minPercent, 75)
  assert.equal(baseline.maxRegressionPoints, 3)
  assert.deepEqual(baseline.runIds, ['latest'])
})

// Without an approved baseline the gate degrades to 'warn', not 'fail'.
test('evaluateEvalGate warns until a baseline is approved', () => {
  const gate = evaluateEvalGate(
    { agentId: 'agent-1', scenarioId: 'coding-prime', minPercent: 70 },
    depsFor([makeRun({ score: 8, maxScore: 10 })]),
  )

  assert.equal(gate.currentPercent, 80)
  assert.equal(gate.status, 'warn')
  assert.ok(gate.checks.some((check) => check.code === 'baseline_missing' && check.status === 'warn'))
})

// 90% baseline → 60% current = 30 regression points, over the 2-point allowance.
test('evaluateEvalGate fails when regression exceeds the baseline allowance', () => {
  const baseline = setEvalBaseline(
    { agentId: 'agent-1', scenarioId: 'coding-prime', minPercent: 70, maxRegressionPoints: 2 },
    depsFor([makeRun({ id: 'baseline', score: 9, maxScore: 10 })]),
  )

  const gate = evaluateEvalGate(
    { agentId: 'agent-1', scenarioId: 'coding-prime' },
    depsFor([makeRun({ id: 'current', score: 6, maxScore: 10, startedAt: 10, endedAt: 11 })], baseline),
  )

  assert.equal(gate.currentPercent, 60)
  assert.equal(gate.regressionPoints, 30)
  assert.equal(gate.status, 'fail')
  assert.ok(gate.checks.some((check) => check.code === 'regression_limit_exceeded'))
})

// Equal score vs baseline → zero regression → clean pass.
test('evaluateEvalGate passes when score and regression checks pass', () => {
  const baseline = setEvalBaseline(
    { agentId: 'agent-1', scenarioId: 'coding-prime', minPercent: 70, maxRegressionPoints: 5 },
    depsFor([makeRun({ id: 'baseline', score: 8, maxScore: 10 })]),
  )

  const gate = evaluateEvalGate(
    { agentId: 'agent-1', scenarioId: 'coding-prime' },
    depsFor([makeRun({ id: 'current', score: 8, maxScore: 10, startedAt: 10, endedAt: 11 })], baseline),
  )

  assert.equal(gate.status, 'pass')
  assert.equal(gate.regressionPoints, 0)
  assert.ok(gate.checks.some((check) => check.code === 'score_threshold_met'))
})

// Suite baselining refuses to proceed when any suite scenario lacks a run.
test('suite gates require latest runs for every scenario in scope before baselining', () => {
  assert.throws(
    () => setEvalBaseline(
      { agentId: 'agent-1', suite: 'core' },
      depsFor([makeRun({ scenarioId: 'coding-prime' })]),
    ),
    /Baseline requires latest runs for every scenario in scope/,
  )
})
|
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
import {
|
|
2
|
+
getEvalBaselineForScope,
|
|
3
|
+
listEvalBaselines,
|
|
4
|
+
listEvalRunsByAgent,
|
|
5
|
+
saveEvalBaseline,
|
|
6
|
+
} from './store'
|
|
7
|
+
import { getScenario, getSuiteScenarios } from './scenarios'
|
|
8
|
+
import type {
|
|
9
|
+
EvalBaseline,
|
|
10
|
+
EvalGateCheck,
|
|
11
|
+
EvalGateResult,
|
|
12
|
+
EvalGateScope,
|
|
13
|
+
EvalGateScopeType,
|
|
14
|
+
EvalRun,
|
|
15
|
+
} from './types'
|
|
16
|
+
|
|
17
|
+
// Default score gate (percent) used when neither the caller nor a saved baseline supplies one.
const DEFAULT_MIN_PERCENT = 80
// Default allowed drop (percentage points below the baseline) before the gate fails.
const DEFAULT_MAX_REGRESSION_POINTS = 5
// Cap on runs fetched per agent when resolving the latest run per scenario.
const MAX_LOOKBACK_RUNS = 1_000
|
|
20
|
+
|
|
21
|
+
/** Caller input for gate evaluation; scenarioId takes precedence over suite. */
export interface EvalGateInput {
  agentId: string
  // When set (non-blank), the gate scopes to this single scenario.
  scenarioId?: string | null
  // Suite name; defaults to 'core' when neither field is given (see resolveEvalGateScope).
  suite?: string | null
  // Optional overrides for the gate thresholds; null/undefined fall back to baseline or defaults.
  minPercent?: number | null
  maxRegressionPoints?: number | null
}

/** Input for approving a baseline; adds human-facing metadata to the gate input. */
export interface SetEvalBaselineInput extends EvalGateInput {
  label?: string | null
  notes?: string | null
}

/** Injectable collaborators (clock + store access) so gate logic is testable in isolation. */
interface EvalGateDeps {
  now?: () => number
  listRunsByAgent?: (agentId: string, limit: number) => EvalRun[]
  getBaselineForScope?: (agentId: string, scopeType: EvalGateScopeType, scopeId: string) => EvalBaseline | null
  saveBaseline?: (baseline: EvalBaseline) => void
  listBaselines?: (filters?: { agentId?: string; limit?: number }) => EvalBaseline[]
}

/** Rollup of the latest run per scenario in a gate scope. */
interface EvalAggregate {
  runs: EvalRun[]
  // Scenarios in scope that have no latest (non-pending, non-running) run.
  missingScenarioIds: string[]
  score: number
  maxScore: number
  // Rounded score/maxScore percent, or null when maxScore is not positive.
  percent: number | null
}
|
|
49
|
+
|
|
50
|
+
function normalizePercent(value: number | null | undefined, fallback: number): number {
|
|
51
|
+
if (!Number.isFinite(value) || value == null) return fallback
|
|
52
|
+
return Math.max(0, Math.min(100, Math.round(value)))
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
function normalizeRegressionPoints(value: number | null | undefined, fallback: number): number {
|
|
56
|
+
if (!Number.isFinite(value) || value == null) return fallback
|
|
57
|
+
return Math.max(0, Math.round(value))
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
function scorePercent(score: number, maxScore: number): number | null {
|
|
61
|
+
if (!Number.isFinite(score) || !Number.isFinite(maxScore) || maxScore <= 0) return null
|
|
62
|
+
return Math.round((score / maxScore) * 100)
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
function maxScoreForScenario(scenarioId: string): number {
|
|
66
|
+
const scenario = getScenario(scenarioId)
|
|
67
|
+
return scenario?.scoringCriteria.reduce((sum, criterion) => sum + criterion.weight, 0) ?? 0
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
export function resolveEvalGateScope(input: Pick<EvalGateInput, 'scenarioId' | 'suite'>): EvalGateScope {
|
|
71
|
+
const scenarioId = input.scenarioId?.trim()
|
|
72
|
+
if (scenarioId) {
|
|
73
|
+
const scenario = getScenario(scenarioId)
|
|
74
|
+
if (!scenario) throw new Error(`Unknown eval scenario: ${scenarioId}`)
|
|
75
|
+
return {
|
|
76
|
+
type: 'scenario',
|
|
77
|
+
id: scenario.id,
|
|
78
|
+
label: scenario.name,
|
|
79
|
+
scenarioIds: [scenario.id],
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
const suite = input.suite?.trim() || 'core'
|
|
84
|
+
const scenarios = getSuiteScenarios(suite)
|
|
85
|
+
if (scenarios.length === 0) throw new Error(`Unknown or empty eval suite: ${suite}`)
|
|
86
|
+
return {
|
|
87
|
+
type: 'suite',
|
|
88
|
+
id: suite,
|
|
89
|
+
label: suite,
|
|
90
|
+
scenarioIds: scenarios.map((scenario) => scenario.id),
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
export function evalBaselineId(agentId: string, scope: EvalGateScope): string {
|
|
95
|
+
return `eval-baseline:${agentId}:${scope.type}:${scope.id}`
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
function latestRunsForScope(runs: EvalRun[], scope: EvalGateScope): EvalRun[] {
|
|
99
|
+
const scenarioSet = new Set(scope.scenarioIds)
|
|
100
|
+
const latest = new Map<string, EvalRun>()
|
|
101
|
+
|
|
102
|
+
for (const run of runs) {
|
|
103
|
+
if (!scenarioSet.has(run.scenarioId)) continue
|
|
104
|
+
if (run.status === 'pending' || run.status === 'running') continue
|
|
105
|
+
const previous = latest.get(run.scenarioId)
|
|
106
|
+
if (!previous || (run.endedAt ?? run.startedAt) > (previous.endedAt ?? previous.startedAt)) {
|
|
107
|
+
latest.set(run.scenarioId, run)
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
return scope.scenarioIds
|
|
112
|
+
.map((scenarioId) => latest.get(scenarioId))
|
|
113
|
+
.filter(Boolean) as EvalRun[]
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
function aggregateRuns(scope: EvalGateScope, runs: EvalRun[]): EvalAggregate {
|
|
117
|
+
const byScenario = new Map(runs.map((run) => [run.scenarioId, run]))
|
|
118
|
+
const missingScenarioIds = scope.scenarioIds.filter((scenarioId) => !byScenario.has(scenarioId))
|
|
119
|
+
const score = scope.scenarioIds.reduce((sum, scenarioId) => sum + (byScenario.get(scenarioId)?.score ?? 0), 0)
|
|
120
|
+
const maxScore = scope.scenarioIds.reduce((sum, scenarioId) => {
|
|
121
|
+
const runMaxScore = byScenario.get(scenarioId)?.maxScore
|
|
122
|
+
return sum + (Number.isFinite(runMaxScore) && runMaxScore != null ? runMaxScore : maxScoreForScenario(scenarioId))
|
|
123
|
+
}, 0)
|
|
124
|
+
return {
|
|
125
|
+
runs,
|
|
126
|
+
missingScenarioIds,
|
|
127
|
+
score,
|
|
128
|
+
maxScore,
|
|
129
|
+
percent: scorePercent(score, maxScore),
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
function statusFromChecks(checks: EvalGateCheck[]): EvalGateResult['status'] {
|
|
134
|
+
if (checks.some((check) => check.status === 'fail')) return 'fail'
|
|
135
|
+
if (checks.some((check) => check.status === 'warn')) return 'warn'
|
|
136
|
+
return 'pass'
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
export function listEvalBaselinesForAgent(agentId?: string | null, deps: EvalGateDeps = {}): EvalBaseline[] {
|
|
140
|
+
const list = deps.listBaselines || listEvalBaselines
|
|
141
|
+
return list({ agentId: agentId || undefined, limit: 200 })
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
/**
 * Approves the current latest-run scores as the baseline for a gate scope.
 *
 * Requires a latest settled run for EVERY scenario in scope. Reuses the
 * existing baseline's id/createdAt/label/notes when re-baselining the same
 * scope; `||` fallbacks are deliberate so blank strings fall through.
 *
 * @throws Error when agentId is blank, no runs exist, or any scenario in
 *   scope has no latest run.
 */
export function setEvalBaseline(input: SetEvalBaselineInput, deps: EvalGateDeps = {}): EvalBaseline {
  if (!input.agentId.trim()) throw new Error('agentId is required')

  const now = deps.now?.() ?? Date.now()
  const scope = resolveEvalGateScope(input)
  // Latest settled run per scenario, bounded by MAX_LOOKBACK_RUNS.
  const runs = latestRunsForScope(
    (deps.listRunsByAgent || listEvalRunsByAgent)(input.agentId, MAX_LOOKBACK_RUNS),
    scope,
  )
  const aggregate = aggregateRuns(scope, runs)
  if (aggregate.runs.length === 0) {
    throw new Error('Run the selected eval before setting a baseline.')
  }
  if (aggregate.missingScenarioIds.length > 0) {
    throw new Error(`Baseline requires latest runs for every scenario in scope. Missing: ${aggregate.missingScenarioIds.join(', ')}`)
  }

  const existing = (deps.getBaselineForScope || getEvalBaselineForScope)(input.agentId, scope.type, scope.id)
  const baseline: EvalBaseline = {
    id: existing?.id || evalBaselineId(input.agentId, scope),
    agentId: input.agentId,
    scope,
    baselineScore: aggregate.score,
    baselineMaxScore: aggregate.baseline === undefined ? aggregate.maxScore : aggregate.maxScore,
    baselinePercent: aggregate.percent ?? 0,
    // minPercent defaults to the freshly captured percent, then the global default.
    minPercent: normalizePercent(input.minPercent, aggregate.percent ?? DEFAULT_MIN_PERCENT),
    // Regression allowance is carried over from the previous baseline when not supplied.
    maxRegressionPoints: normalizeRegressionPoints(input.maxRegressionPoints, existing?.maxRegressionPoints ?? DEFAULT_MAX_REGRESSION_POINTS),
    runIds: aggregate.runs.map((run) => run.id),
    label: input.label?.trim() || existing?.label || null,
    notes: input.notes?.trim() || existing?.notes || null,
    createdAt: existing?.createdAt || now,
    updatedAt: now,
  }

  ;(deps.saveBaseline || saveEvalBaseline)(baseline)
  return baseline
}
|
|
181
|
+
|
|
182
|
+
/**
 * Evaluates the release gate for an agent's eval scope.
 *
 * Builds a list of pass/warn/fail checks — run coverage, score threshold,
 * and regression against the approved baseline — and derives the overall
 * status from the worst check. A missing baseline is a 'warn', not a 'fail'.
 *
 * @throws Error when agentId is blank or the scope cannot be resolved.
 */
export function evaluateEvalGate(input: EvalGateInput, deps: EvalGateDeps = {}): EvalGateResult {
  if (!input.agentId.trim()) throw new Error('agentId is required')

  const generatedAt = deps.now?.() ?? Date.now()
  const scope = resolveEvalGateScope(input)
  const baseline = (deps.getBaselineForScope || getEvalBaselineForScope)(input.agentId, scope.type, scope.id)
  const runs = latestRunsForScope(
    (deps.listRunsByAgent || listEvalRunsByAgent)(input.agentId, MAX_LOOKBACK_RUNS),
    scope,
  )
  const aggregate = aggregateRuns(scope, runs)
  // Explicit input thresholds win; otherwise baseline values; otherwise defaults.
  const minPercent = normalizePercent(input.minPercent, baseline?.minPercent ?? DEFAULT_MIN_PERCENT)
  const maxRegressionPoints = normalizeRegressionPoints(input.maxRegressionPoints, baseline?.maxRegressionPoints ?? DEFAULT_MAX_REGRESSION_POINTS)
  // Points BELOW baseline (improvement clamps to 0); null when no baseline or no percent.
  const regressionPoints = baseline && aggregate.percent != null
    ? Math.max(0, baseline.baselinePercent - aggregate.percent)
    : null

  const checks: EvalGateCheck[] = []
  if (aggregate.runs.length === 0) {
    checks.push({
      code: 'no_eval_runs',
      status: 'fail',
      message: 'No completed eval runs are available for this gate.',
    })
  }
  if (aggregate.missingScenarioIds.length > 0) {
    checks.push({
      code: 'missing_scope_runs',
      status: 'fail',
      message: `${aggregate.missingScenarioIds.length} scenario${aggregate.missingScenarioIds.length === 1 ? '' : 's'} have no latest run in this gate.`,
      detail: aggregate.missingScenarioIds.join(', '),
    })
  }
  if (aggregate.runs.some((run) => run.status === 'failed')) {
    checks.push({
      code: 'failed_eval_run',
      status: 'fail',
      message: 'At least one latest eval run failed.',
    })
  }
  // Score threshold: an incomputable percent (null) counts as below threshold.
  if (aggregate.percent == null || aggregate.percent < minPercent) {
    checks.push({
      code: 'score_below_threshold',
      status: 'fail',
      message: `Current score is below the ${minPercent}% gate.`,
      detail: aggregate.percent == null ? 'n/a' : `${aggregate.percent}%`,
    })
  } else {
    checks.push({
      code: 'score_threshold_met',
      status: 'pass',
      message: `Current score meets the ${minPercent}% gate.`,
      detail: `${aggregate.percent}%`,
    })
  }
  // Baseline/regression: no baseline → warn; otherwise compare the drop
  // against the allowance. No check is added when a baseline exists but the
  // current percent is incomputable (regressionPoints is null then).
  if (!baseline) {
    checks.push({
      code: 'baseline_missing',
      status: 'warn',
      message: 'No approved baseline is set for this gate.',
    })
  } else if (regressionPoints != null && regressionPoints > maxRegressionPoints) {
    checks.push({
      code: 'regression_limit_exceeded',
      status: 'fail',
      message: `Regression exceeds the ${maxRegressionPoints} point allowance.`,
      detail: `${regressionPoints} points below baseline`,
    })
  } else if (regressionPoints != null) {
    checks.push({
      code: 'regression_within_limit',
      status: 'pass',
      message: `Regression is within the ${maxRegressionPoints} point allowance.`,
      detail: `${regressionPoints} point${regressionPoints === 1 ? '' : 's'} below baseline`,
    })
  }

  return {
    agentId: input.agentId,
    scope,
    status: statusFromChecks(checks),
    generatedAt,
    baseline,
    latestRuns: aggregate.runs,
    currentScore: aggregate.score,
    currentMaxScore: aggregate.maxScore,
    currentPercent: aggregate.percent,
    regressionPoints,
    minPercent,
    maxRegressionPoints,
    checks,
  }
}
|