@swarmclawai/swarmclaw 1.9.6 → 1.9.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,129 @@
1
+ import assert from 'node:assert/strict'
2
+ import { describe, it } from 'node:test'
3
+
4
+ import { buildReleaseReadinessReport } from './release-readiness'
5
+ import type { EvalGateResult } from '@/lib/server/eval/types'
6
+ import type { OperationPulse } from '@/types'
7
+
8
+ const now = 100_000
9
+
10
/**
 * Builds an `OperationPulse` fixture with every KPI at zero and no actions.
 *
 * `overrides` is merged shallowly on top of the defaults, so overriding
 * `kpis` requires supplying the complete KPI object.
 */
function pulse(overrides: Partial<OperationPulse> = {}): OperationPulse {
  const defaults: OperationPulse = {
    generatedAt: now,
    range: '24h',
    windowStart: now - 86_400_000,
    kpis: {
      activeMissions: 0,
      runningRuns: 0,
      failedRuns: 0,
      pendingApprovals: 0,
      connectorAttention: 0,
      gatewayAttention: 0,
      budgetWarnings: 0,
    },
    actions: [],
  }
  return { ...defaults, ...overrides }
}
28
+
29
/**
 * Builds a passing `EvalGateResult` fixture for the `core` suite
 * (10/10, 100%, no baseline, no regression).
 *
 * `overrides` is merged shallowly on top of the defaults.
 */
function evalGate(overrides: Partial<EvalGateResult> = {}): EvalGateResult {
  const passingGate: EvalGateResult = {
    agentId: 'agent_1',
    scope: {
      type: 'suite',
      id: 'core',
      label: 'core',
      scenarioIds: ['coding-prime'],
    },
    status: 'pass',
    generatedAt: now,
    baseline: null,
    latestRuns: [],
    currentScore: 10,
    currentMaxScore: 10,
    currentPercent: 100,
    regressionPoints: 0,
    minPercent: 80,
    maxRegressionPoints: 5,
    checks: [{ code: 'score_threshold_met', status: 'pass', message: 'Current score meets the 80% gate.' }],
  }
  return { ...passingGate, ...overrides }
}
52
+
53
describe('release readiness report', () => {
  type Report = ReturnType<typeof buildReleaseReadinessReport>

  /** True when the report contains a check with the given code. */
  const hasCheck = (report: Report, code: string): boolean =>
    report.checks.some((entry) => entry.code === code)

  it('passes when eval gate and operations pulse are clean', () => {
    const report = buildReleaseReadinessReport({ pulse: pulse(), evalGate: evalGate() })

    assert.equal(report.status, 'ready')
    assert.equal(report.score, 100)
    assert.equal(report.blockerCount, 0)
    assert.equal(report.warningCount, 0)
    assert.ok(hasCheck(report, 'eval_gate_passed'))
  })

  it('warns when no eval gate is selected', () => {
    const report = buildReleaseReadinessReport({ pulse: pulse(), evalGate: null })

    assert.equal(report.status, 'warning')
    assert.equal(report.blockerCount, 0)
    assert.equal(report.warningCount, 1)
    assert.ok(report.score < 100)
    assert.ok(hasCheck(report, 'eval_gate_missing'))
  })

  it('blocks when eval regression gate fails', () => {
    const failingGate = evalGate({
      status: 'fail',
      currentPercent: 60,
      checks: [{ code: 'score_below_threshold', status: 'fail', message: 'Current score is below the 80% gate.' }],
    })
    const report = buildReleaseReadinessReport({ pulse: pulse(), evalGate: failingGate })

    assert.equal(report.status, 'blocked')
    assert.equal(report.blockerCount, 1)
    assert.ok(report.score <= 70)
    assert.ok(hasCheck(report, 'eval_gate_failed'))
  })

  it('blocks on failed runs and pending approvals, then surfaces pulse actions', () => {
    // Every KPI is non-zero: two of them are blockers, the other five are warnings.
    const busyPulse = pulse({
      kpis: {
        activeMissions: 1,
        runningRuns: 1,
        failedRuns: 2,
        pendingApprovals: 3,
        connectorAttention: 1,
        gatewayAttention: 1,
        budgetWarnings: 1,
      },
      actions: [{
        id: 'run:failed',
        kind: 'run',
        severity: 'high',
        title: 'Review failed run',
        summary: 'Run failed',
        href: '/quality?tab=runs',
        evidence: ['run'],
        createdAt: now,
      }],
    })
    const report = buildReleaseReadinessReport({ pulse: busyPulse, evalGate: evalGate() })

    assert.equal(report.status, 'blocked')
    assert.equal(report.blockerCount, 2)
    assert.ok(report.warningCount >= 4)
    assert.equal(report.nextActions[0]?.id, 'run:failed')
    assert.ok(hasCheck(report, 'failed_runs_present'))
    assert.ok(hasCheck(report, 'pending_approvals_present'))
  })
})
@@ -0,0 +1,187 @@
1
+ import type { EvalGateResult } from '@/lib/server/eval/types'
2
+ import type { OperationPulse, OperationPulseAction, OperationPulseRange } from '@/types'
3
+
4
/** Severity of a single check, and of the report as a whole. */
export type ReleaseReadinessStatus =
  | 'ready'
  | 'warning'
  | 'blocked'

/** One finding contributing to a release readiness report. */
export interface ReleaseReadinessCheck {
  /** Stable machine-readable identifier, e.g. `eval_gate_failed`. */
  code: string
  /** Severity of this finding. */
  status: ReleaseReadinessStatus
  /** Short headline. */
  title: string
  /** One-sentence explanation. */
  summary: string
  /** Optional in-app link for following up on the finding. */
  href?: string
  /** Optional supporting detail lines. */
  evidence?: string[]
}

/** Aggregated release readiness result. */
export interface ReleaseReadinessReport {
  /** Copied from the pulse's `generatedAt`. */
  generatedAt: number
  /** Copied from the pulse's `range`. */
  range: OperationPulseRange
  /** Worst status across all checks. */
  status: ReleaseReadinessStatus
  /** 100 minus the per-check penalties, floored at 0. */
  score: number
  /** Number of checks with status `blocked`. */
  blockerCount: number
  /** Number of checks with status `warning`. */
  warningCount: number
  /** The operations pulse the report was built from. */
  pulse: OperationPulse
  /** The eval gate result the report was built from, or `null` when none was supplied. */
  evalGate: EvalGateResult | null
  /** All checks, in the order they were evaluated. */
  checks: ReleaseReadinessCheck[]
  /** Leading slice of the pulse's actions. */
  nextActions: OperationPulseAction[]
}
27
+
28
/** Points subtracted from the readiness score for each `blocked` check. */
const BLOCKER_PENALTY = 30
/** Points subtracted from the readiness score for each `warning` check. */
const WARNING_PENALTY = 10
30
+
31
/**
 * Returns the worst status present in `checks`: `blocked` beats `warning`,
 * which beats `ready`. An empty list is `ready`.
 */
function readinessStatus(checks: ReleaseReadinessCheck[]): ReleaseReadinessStatus {
  const present = new Set(checks.map((entry) => entry.status))
  if (present.has('blocked')) return 'blocked'
  return present.has('warning') ? 'warning' : 'ready'
}
36
+
37
/**
 * Scores a set of checks: starts at 100, subtracts `BLOCKER_PENALTY` per
 * blocked check and `WARNING_PENALTY` per warning, and never drops below 0.
 */
function readinessScore(checks: ReleaseReadinessCheck[]): number {
  let remaining = 100
  for (const { status } of checks) {
    if (status === 'blocked') {
      remaining -= BLOCKER_PENALTY
    } else if (status === 'warning') {
      remaining -= WARNING_PENALTY
    }
  }
  return Math.max(0, remaining)
}
45
+
46
/**
 * Formats a count with its noun, e.g. `1 run` / `2 runs`.
 *
 * @param count - Quantity to display.
 * @param singular - Noun used when `count` is exactly 1.
 * @param pluralLabel - Noun used otherwise; defaults to `singular` + `s`.
 */
function plural(count: number, singular: string, pluralLabel = `${singular}s`): string {
  const noun = count === 1 ? singular : pluralLabel
  return `${count} ${noun}`
}
49
+
50
/** Appends `check` to the end of `checks`, mutating the array in place. */
function addCheck(
  checks: ReleaseReadinessCheck[],
  check: ReleaseReadinessCheck,
): void {
  checks.push(check)
}
53
+
54
/**
 * Combines an operations pulse and an optional eval gate result into a
 * release readiness report.
 *
 * Exactly one eval-gate check is always emitted (missing, failed, warning or
 * passed). Each operations KPI adds a check only when its count is above
 * zero: failed runs and pending approvals are blockers, the remaining KPIs
 * are warnings.
 *
 * @param input - `pulse` is the operations snapshot; `evalGate` is the eval
 *   gate result, where `null` or omitted produces an `eval_gate_missing` warning.
 * @returns The report, including the overall status, score, per-severity
 *   counts, all checks, and the first eight pulse actions as `nextActions`.
 */
export function buildReleaseReadinessReport(input: {
  pulse: OperationPulse
  evalGate?: EvalGateResult | null
}): ReleaseReadinessReport {
  const { pulse: operations } = input
  const { kpis } = operations
  const evalGate = input.evalGate ?? null
  const checks: ReleaseReadinessCheck[] = []

  // --- Eval gate: always contributes exactly one check. ---
  if (evalGate === null) {
    addCheck(checks, {
      code: 'eval_gate_missing',
      status: 'warning',
      title: 'Select an eval gate',
      summary: 'No eval regression gate is included in this readiness report.',
      href: '/quality?tab=evals',
    })
  } else {
    switch (evalGate.status) {
      case 'fail':
        addCheck(checks, {
          code: 'eval_gate_failed',
          status: 'blocked',
          title: 'Eval gate failed',
          summary: `${evalGate.scope.label} is not passing the configured eval release gate.`,
          href: '/quality?tab=evals',
          evidence: evalGate.checks
            .filter((gateCheck) => gateCheck.status === 'fail')
            .map((gateCheck) => gateCheck.message),
        })
        break
      case 'warn':
        addCheck(checks, {
          code: 'eval_gate_warning',
          status: 'warning',
          title: 'Eval gate needs a baseline',
          summary: `${evalGate.scope.label} passes the score threshold but still has release-gate warnings.`,
          href: '/quality?tab=evals',
          evidence: evalGate.checks
            .filter((gateCheck) => gateCheck.status === 'warn')
            .map((gateCheck) => gateCheck.message),
        })
        break
      default:
        addCheck(checks, {
          code: 'eval_gate_passed',
          status: 'ready',
          title: 'Eval gate passed',
          summary: `${evalGate.scope.label} meets the configured score and regression checks.`,
          href: '/quality?tab=evals',
          evidence: [`${evalGate.currentPercent ?? 'n/a'}% current score`],
        })
    }
  }

  // --- Operations KPIs: a check is added only for counts above zero. ---
  const whenPositive = (count: number, build: () => ReleaseReadinessCheck): void => {
    if (count > 0) addCheck(checks, build())
  }

  whenPositive(kpis.failedRuns, () => ({
    code: 'failed_runs_present',
    status: 'blocked',
    title: 'Failed runs need review',
    summary: `${plural(kpis.failedRuns, 'failed run')} found in the ${operations.range} operations window.`,
    href: '/quality?tab=runs',
  }))

  whenPositive(kpis.pendingApprovals, () => ({
    code: 'pending_approvals_present',
    status: 'blocked',
    title: 'Pending approvals need decisions',
    summary: `${plural(kpis.pendingApprovals, 'approval')} still waiting on an operator.`,
    href: '/quality?tab=approvals',
  }))

  whenPositive(kpis.runningRuns, () => ({
    code: 'active_runs_present',
    status: 'warning',
    title: 'Runs are still active',
    summary: `${plural(kpis.runningRuns, 'run')} queued or running while this report was generated.`,
    href: '/runs',
  }))

  whenPositive(kpis.connectorAttention, () => ({
    code: 'connector_attention_present',
    status: 'warning',
    title: 'Connector readiness needs attention',
    summary: `${plural(kpis.connectorAttention, 'connector')} reporting degraded readiness.`,
    href: '/connectors',
  }))

  whenPositive(kpis.gatewayAttention, () => ({
    code: 'gateway_attention_present',
    status: 'warning',
    title: 'Gateway readiness needs attention',
    summary: `${plural(kpis.gatewayAttention, 'gateway')} reporting topology or environment warnings.`,
    href: '/providers',
  }))

  whenPositive(kpis.budgetWarnings, () => ({
    code: 'budget_warnings_present',
    status: 'warning',
    title: 'Mission budget pressure',
    summary: `${plural(kpis.budgetWarnings, 'mission')} near a configured budget limit.`,
    href: '/missions',
  }))

  whenPositive(kpis.activeMissions, () => ({
    code: 'active_missions_present',
    status: 'warning',
    title: 'Missions are still active',
    summary: `${plural(kpis.activeMissions, 'mission')} running or paused in the operations window.`,
    href: '/missions',
  }))

  // --- Totals. ---
  let blockerCount = 0
  let warningCount = 0
  for (const { status } of checks) {
    if (status === 'blocked') blockerCount += 1
    else if (status === 'warning') warningCount += 1
  }

  return {
    generatedAt: operations.generatedAt,
    range: operations.range,
    status: readinessStatus(checks),
    score: readinessScore(checks),
    blockerCount,
    warningCount,
    pulse: operations,
    evalGate,
    checks,
    nextActions: operations.actions.slice(0, 8),
  }
}
@@ -0,0 +1,111 @@
1
+ import assert from 'node:assert/strict'
2
+ import test from 'node:test'
3
+
4
+ import {
5
+ evaluateEvalGate,
6
+ setEvalBaseline,
7
+ } from './baseline'
8
+ import type { EvalBaseline, EvalRun } from './types'
9
+
10
/**
 * Builds a completed `EvalRun` fixture (8/10 on `coding-prime` for
 * `agent-1`); `overrides` is merged shallowly on top of the defaults.
 */
function makeRun(overrides: Partial<EvalRun> = {}): EvalRun {
  const defaults: EvalRun = {
    id: 'run-1',
    scenarioId: 'coding-prime',
    agentId: 'agent-1',
    status: 'completed',
    startedAt: 1,
    endedAt: 2,
    score: 8,
    maxScore: 10,
    details: [],
  }
  return { ...defaults, ...overrides }
}
24
+
25
/**
 * Builds in-memory dependencies for the baseline functions under test.
 *
 * @param runs - Run history; `listRunsByAgent` filters it by agent id.
 * @param baseline - Value returned by `getBaselineForScope` for any scope.
 * @param saved - Array that `saveBaseline` appends each saved baseline to.
 */
function depsFor(runs: EvalRun[], baseline: EvalBaseline | null = null, saved: EvalBaseline[] = []) {
  const listRunsByAgent = (agentId: string) => runs.filter((candidate) => candidate.agentId === agentId)
  const getBaselineForScope = () => baseline
  const saveBaseline = (next: EvalBaseline) => { saved.push(next) }

  return {
    // Fixed clock so timestamps are deterministic.
    now: () => 123,
    listRunsByAgent,
    getBaselineForScope,
    saveBaseline,
  }
}
33
+
34
test('setEvalBaseline snapshots the latest scenario score and gate defaults', () => {
  const persisted: EvalBaseline[] = []
  // Two runs of the same scenario: only the more recent one should be snapshotted.
  const history = [
    makeRun({ id: 'older', score: 4, startedAt: 1, endedAt: 2 }),
    makeRun({ id: 'latest', score: 8, startedAt: 5, endedAt: 6 }),
  ]

  const baseline = setEvalBaseline(
    {
      agentId: 'agent-1',
      scenarioId: 'coding-prime',
      minPercent: 75,
      maxRegressionPoints: 3,
      label: 'Release candidate',
    },
    depsFor(history, null, persisted),
  )

  assert.equal(persisted.length, 1)
  assert.equal(baseline.scope.type, 'scenario')
  assert.equal(baseline.scope.id, 'coding-prime')
  assert.equal(baseline.baselinePercent, 80)
  assert.equal(baseline.minPercent, 75)
  assert.equal(baseline.maxRegressionPoints, 3)
  assert.deepEqual(baseline.runIds, ['latest'])
})
58
+
59
test('evaluateEvalGate warns until a baseline is approved', () => {
  // No baseline is supplied to depsFor, so the gate can only warn, not pass.
  const deps = depsFor([makeRun({ score: 8, maxScore: 10 })])
  const gate = evaluateEvalGate(
    { agentId: 'agent-1', scenarioId: 'coding-prime', minPercent: 70 },
    deps,
  )

  assert.equal(gate.currentPercent, 80)
  assert.equal(gate.status, 'warn')
  const baselineWarning = gate.checks.some(
    (entry) => entry.code === 'baseline_missing' && entry.status === 'warn',
  )
  assert.ok(baselineWarning)
})
69
+
70
test('evaluateEvalGate fails when regression exceeds the baseline allowance', () => {
  // Baseline at 90% with a 2-point allowance.
  const baselineRun = makeRun({ id: 'baseline', score: 9, maxScore: 10 })
  const baseline = setEvalBaseline(
    { agentId: 'agent-1', scenarioId: 'coding-prime', minPercent: 70, maxRegressionPoints: 2 },
    depsFor([baselineRun]),
  )

  // Current run drops to 60%, i.e. 30 points below the baseline.
  const currentRun = makeRun({ id: 'current', score: 6, maxScore: 10, startedAt: 10, endedAt: 11 })
  const gate = evaluateEvalGate(
    { agentId: 'agent-1', scenarioId: 'coding-prime' },
    depsFor([currentRun], baseline),
  )

  assert.equal(gate.currentPercent, 60)
  assert.equal(gate.regressionPoints, 30)
  assert.equal(gate.status, 'fail')
  assert.ok(gate.checks.some((entry) => entry.code === 'regression_limit_exceeded'))
})
86
+
87
test('evaluateEvalGate passes when score and regression checks pass', () => {
  const baselineRun = makeRun({ id: 'baseline', score: 8, maxScore: 10 })
  const baseline = setEvalBaseline(
    { agentId: 'agent-1', scenarioId: 'coding-prime', minPercent: 70, maxRegressionPoints: 5 },
    depsFor([baselineRun]),
  )

  // Same score as the baseline: no regression, and above the 70% threshold.
  const currentRun = makeRun({ id: 'current', score: 8, maxScore: 10, startedAt: 10, endedAt: 11 })
  const gate = evaluateEvalGate(
    { agentId: 'agent-1', scenarioId: 'coding-prime' },
    depsFor([currentRun], baseline),
  )

  assert.equal(gate.status, 'pass')
  assert.equal(gate.regressionPoints, 0)
  assert.ok(gate.checks.some((entry) => entry.code === 'score_threshold_met'))
})
102
+
103
test('suite gates require latest runs for every scenario in scope before baselining', () => {
  // Only one scenario of the suite has a run, so baselining the suite must be rejected.
  const partialHistory = [makeRun({ scenarioId: 'coding-prime' })]
  const attemptBaseline = () => setEvalBaseline(
    { agentId: 'agent-1', suite: 'core' },
    depsFor(partialHistory),
  )

  assert.throws(attemptBaseline, /Baseline requires latest runs for every scenario in scope/)
})
@@ -0,0 +1,274 @@
1
+ import {
2
+ getEvalBaselineForScope,
3
+ listEvalBaselines,
4
+ listEvalRunsByAgent,
5
+ saveEvalBaseline,
6
+ } from './store'
7
+ import { getScenario, getSuiteScenarios } from './scenarios'
8
+ import type {
9
+ EvalBaseline,
10
+ EvalGateCheck,
11
+ EvalGateResult,
12
+ EvalGateScope,
13
+ EvalGateScopeType,
14
+ EvalRun,
15
+ } from './types'
16
+
17
/** Score threshold (percent) used when neither the input nor a baseline provides one. */
const DEFAULT_MIN_PERCENT = 80
/** Regression allowance (percentage points) used when neither the input nor a baseline provides one. */
const DEFAULT_MAX_REGRESSION_POINTS = 5
/** Limit passed to the run-listing function when looking up an agent's runs. */
const MAX_LOOKBACK_RUNS = 1_000
20
+
21
/** Selects the agent and scope of an eval gate, with optional threshold overrides. */
export interface EvalGateInput {
  /** Agent whose runs are evaluated; must be non-blank. */
  agentId: string
  /** When non-blank, the gate covers this single scenario and `suite` is ignored. */
  scenarioId?: string | null
  /** Suite to gate on when no scenario is given; blank or omitted means `core`. */
  suite?: string | null
  /** Minimum passing score in percent; rounded and clamped to 0–100. */
  minPercent?: number | null
  /** Maximum tolerated drop below the baseline, in percentage points. */
  maxRegressionPoints?: number | null
}

/** Input for recording a baseline: the gate selection plus descriptive metadata. */
export interface SetEvalBaselineInput extends EvalGateInput {
  /** Human-readable label; blank keeps the existing baseline's label. */
  label?: string | null
  /** Free-form notes; blank keeps the existing baseline's notes. */
  notes?: string | null
}

/** Injectable dependencies; each one falls back to the store implementation when omitted. */
interface EvalGateDeps {
  /** Clock; falls back to `Date.now`. */
  now?: () => number
  listRunsByAgent?: (agentId: string, limit: number) => EvalRun[]
  getBaselineForScope?: (agentId: string, scopeType: EvalGateScopeType, scopeId: string) => EvalBaseline | null
  saveBaseline?: (baseline: EvalBaseline) => void
  listBaselines?: (filters?: { agentId?: string; limit?: number }) => EvalBaseline[]
}

/** Summed score of the latest runs across a scope. */
interface EvalAggregate {
  /** Latest run per scenario, for scenarios that have one. */
  runs: EvalRun[]
  /** Scenarios in scope with no latest run. */
  missingScenarioIds: string[]
  /** Sum of run scores; scenarios without a run contribute 0. */
  score: number
  /** Sum of maximum scores across every scenario in scope. */
  maxScore: number
  /** `score / maxScore` as a rounded percentage, or `null` when it cannot be computed. */
  percent: number | null
}
49
+
50
/**
 * Rounds `value` to a whole percent and clamps it to 0–100.
 * Returns `fallback` unchanged when `value` is null, undefined, NaN or infinite.
 */
function normalizePercent(value: number | null | undefined, fallback: number): number {
  if (value == null || !Number.isFinite(value)) {
    return fallback
  }
  const rounded = Math.round(value)
  const cappedAtHundred = Math.min(100, rounded)
  return Math.max(0, cappedAtHundred)
}
54
+
55
/**
 * Rounds `value` to a whole number of points and floors it at 0 (no upper bound).
 * Returns `fallback` unchanged when `value` is null, undefined, NaN or infinite.
 */
function normalizeRegressionPoints(value: number | null | undefined, fallback: number): number {
  if (value == null || !Number.isFinite(value)) {
    return fallback
  }
  const rounded = Math.round(value)
  return Math.max(0, rounded)
}
59
+
60
/**
 * Converts a score into a rounded whole percentage of `maxScore`.
 * Returns `null` when either number is not finite or `maxScore` is not positive.
 */
function scorePercent(score: number, maxScore: number): number | null {
  const inputsAreFinite = Number.isFinite(score) && Number.isFinite(maxScore)
  if (!inputsAreFinite || maxScore <= 0) {
    return null
  }
  const ratio = score / maxScore
  return Math.round(ratio * 100)
}
64
+
65
/**
 * Returns the sum of the scoring-criterion weights of a scenario,
 * or 0 when `getScenario` finds no scenario for the id.
 */
function maxScoreForScenario(scenarioId: string): number {
  const scenario = getScenario(scenarioId)
  if (scenario == null) return 0

  return scenario.scoringCriteria.reduce((total, criterion) => total + criterion.weight, 0)
}
69
+
70
/**
 * Resolves the gate scope from the input.
 *
 * A non-blank `scenarioId` selects that single scenario. Otherwise the suite
 * is used, defaulting to `core` when `suite` is blank or omitted.
 *
 * @throws Error when the scenario is unknown, or the suite has no scenarios.
 */
export function resolveEvalGateScope(input: Pick<EvalGateInput, 'scenarioId' | 'suite'>): EvalGateScope {
  const requestedScenarioId = input.scenarioId?.trim()

  if (!requestedScenarioId) {
    // Suite scope: id and label are both the suite name.
    const suiteId = input.suite?.trim() || 'core'
    const members = getSuiteScenarios(suiteId)
    if (members.length === 0) {
      throw new Error(`Unknown or empty eval suite: ${suiteId}`)
    }
    return {
      type: 'suite',
      id: suiteId,
      label: suiteId,
      scenarioIds: members.map((member) => member.id),
    }
  }

  const scenario = getScenario(requestedScenarioId)
  if (!scenario) {
    throw new Error(`Unknown eval scenario: ${requestedScenarioId}`)
  }
  return {
    type: 'scenario',
    id: scenario.id,
    label: scenario.name,
    scenarioIds: [scenario.id],
  }
}
93
+
94
/** Builds the baseline id `eval-baseline:<agentId>:<scope type>:<scope id>`. */
export function evalBaselineId(agentId: string, scope: EvalGateScope): string {
  const { type: scopeType, id: scopeId } = scope
  return 'eval-baseline:' + agentId + ':' + scopeType + ':' + scopeId
}
97
+
98
/**
 * Picks the most recent finished run for each scenario in `scope`.
 *
 * Runs outside the scope and runs still `pending` or `running` are ignored.
 * Recency is compared on `endedAt`, falling back to `startedAt`; on a tie the
 * run seen first in `runs` is kept.
 *
 * @returns One run per scenario that has any, ordered like `scope.scenarioIds`.
 *   Scenarios without a finished run are simply absent from the result.
 */
function latestRunsForScope(runs: EvalRun[], scope: EvalGateScope): EvalRun[] {
  const inScope = new Set(scope.scenarioIds)
  const latestByScenario = new Map<string, EvalRun>()
  const finishedAt = (run: EvalRun) => run.endedAt ?? run.startedAt

  for (const run of runs) {
    if (!inScope.has(run.scenarioId)) continue
    if (run.status === 'pending' || run.status === 'running') continue

    const current = latestByScenario.get(run.scenarioId)
    if (!current || finishedAt(run) > finishedAt(current)) {
      latestByScenario.set(run.scenarioId, run)
    }
  }

  // A type-guard filter narrows away `undefined` without an unchecked `as` cast.
  return scope.scenarioIds
    .map((scenarioId) => latestByScenario.get(scenarioId))
    .filter((run): run is EvalRun => run !== undefined)
}
115
+
116
/**
 * Sums scores and maximum scores across every scenario in `scope`.
 *
 * A scenario without a run contributes 0 to the score and is listed in
 * `missingScenarioIds`. Its maximum score, like that of any run lacking a
 * finite `maxScore`, comes from `maxScoreForScenario`.
 */
function aggregateRuns(scope: EvalGateScope, runs: EvalRun[]): EvalAggregate {
  // If a scenario appears more than once, the later run wins.
  const runByScenario = new Map<string, EvalRun>()
  for (const run of runs) {
    runByScenario.set(run.scenarioId, run)
  }

  const missingScenarioIds: string[] = []
  let score = 0
  let maxScore = 0
  for (const scenarioId of scope.scenarioIds) {
    const run = runByScenario.get(scenarioId)
    if (!runByScenario.has(scenarioId)) missingScenarioIds.push(scenarioId)

    score += run?.score ?? 0

    const recordedMax = run?.maxScore
    maxScore += recordedMax != null && Number.isFinite(recordedMax)
      ? recordedMax
      : maxScoreForScenario(scenarioId)
  }

  return {
    runs,
    missingScenarioIds,
    score,
    maxScore,
    percent: scorePercent(score, maxScore),
  }
}
132
+
133
/**
 * Returns the worst status among the checks: `fail` beats `warn`,
 * which beats `pass`. An empty list is `pass`.
 */
function statusFromChecks(checks: EvalGateCheck[]): EvalGateResult['status'] {
  const present = new Set(checks.map((entry) => entry.status))
  if (present.has('fail')) return 'fail'
  return present.has('warn') ? 'warn' : 'pass'
}
138
+
139
/**
 * Lists up to 200 baselines, filtered by agent when `agentId` is non-empty.
 * A null, undefined or empty `agentId` is passed on as `undefined`.
 */
export function listEvalBaselinesForAgent(agentId?: string | null, deps: EvalGateDeps = {}): EvalBaseline[] {
  const fetchBaselines = deps.listBaselines || listEvalBaselines
  const filters = { agentId: agentId || undefined, limit: 200 }
  return fetchBaselines(filters)
}
143
+
144
/**
 * Snapshots the latest runs in the selected scope as the baseline for an
 * agent, saves it, and returns it.
 *
 * An existing baseline for the same agent and scope keeps its `id` and
 * `createdAt`. Its label, notes and regression allowance are reused when the
 * input does not provide them.
 *
 * @param input - Agent, scope selection, optional thresholds and metadata.
 * @param deps - Optional overrides for the clock and store functions.
 * @throws Error when `agentId` is blank, the scope cannot be resolved, no
 *   finished run exists in scope, or any scenario in scope lacks a run.
 */
export function setEvalBaseline(input: SetEvalBaselineInput, deps: EvalGateDeps = {}): EvalBaseline {
  if (!input.agentId.trim()) {
    throw new Error('agentId is required')
  }

  const now = deps.now?.() ?? Date.now()
  const scope = resolveEvalGateScope(input)

  const listRuns = deps.listRunsByAgent || listEvalRunsByAgent
  const recentRuns = listRuns(input.agentId, MAX_LOOKBACK_RUNS)
  const aggregate = aggregateRuns(scope, latestRunsForScope(recentRuns, scope))

  if (aggregate.runs.length === 0) {
    throw new Error('Run the selected eval before setting a baseline.')
  }
  if (aggregate.missingScenarioIds.length > 0) {
    throw new Error(`Baseline requires latest runs for every scenario in scope. Missing: ${aggregate.missingScenarioIds.join(', ')}`)
  }

  const findBaseline = deps.getBaselineForScope || getEvalBaselineForScope
  const existing = findBaseline(input.agentId, scope.type, scope.id)

  // NOTE(review): when `minPercent` is omitted the threshold defaults to the
  // current percent rather than the existing baseline's value, unlike
  // `maxRegressionPoints` below — confirm this asymmetry is intended.
  const minPercent = normalizePercent(input.minPercent, aggregate.percent ?? DEFAULT_MIN_PERCENT)
  const maxRegressionPoints = normalizeRegressionPoints(
    input.maxRegressionPoints,
    existing?.maxRegressionPoints ?? DEFAULT_MAX_REGRESSION_POINTS,
  )

  const baseline: EvalBaseline = {
    id: existing?.id || evalBaselineId(input.agentId, scope),
    agentId: input.agentId,
    scope,
    baselineScore: aggregate.score,
    baselineMaxScore: aggregate.maxScore,
    baselinePercent: aggregate.percent ?? 0,
    minPercent,
    maxRegressionPoints,
    runIds: aggregate.runs.map((run) => run.id),
    label: input.label?.trim() || existing?.label || null,
    notes: input.notes?.trim() || existing?.notes || null,
    createdAt: existing?.createdAt || now,
    updatedAt: now,
  }

  const persist = deps.saveBaseline || saveEvalBaseline
  persist(baseline)
  return baseline
}
181
+
182
/**
 * Evaluates the eval release gate for an agent over a scenario or suite.
 *
 * The latest finished run per scenario is aggregated and checked against:
 * - run presence (`no_eval_runs`, `missing_scope_runs`),
 * - run failures (`failed_eval_run`),
 * - the score threshold (`score_below_threshold` / `score_threshold_met`),
 * - the baseline (`baseline_missing` as a warning, otherwise
 *   `regression_limit_exceeded` / `regression_within_limit`).
 *
 * Thresholds come from the input first, then the baseline, then the defaults.
 * The overall status is the worst status among the checks.
 *
 * @param input - Agent, scope selection and optional threshold overrides.
 * @param deps - Optional overrides for the clock and store functions.
 * @returns The gate result with its aggregate scores and individual checks.
 *   `regressionPoints` is `null` when there is no baseline or no computable percent.
 * @throws Error when `agentId` is blank or the scope cannot be resolved.
 */
export function evaluateEvalGate(input: EvalGateInput, deps: EvalGateDeps = {}): EvalGateResult {
  if (!input.agentId.trim()) throw new Error('agentId is required')

  const generatedAt = deps.now?.() ?? Date.now()
  const scope = resolveEvalGateScope(input)
  const baseline = (deps.getBaselineForScope || getEvalBaselineForScope)(input.agentId, scope.type, scope.id)
  const runs = latestRunsForScope(
    (deps.listRunsByAgent || listEvalRunsByAgent)(input.agentId, MAX_LOOKBACK_RUNS),
    scope,
  )
  const aggregate = aggregateRuns(scope, runs)
  const minPercent = normalizePercent(input.minPercent, baseline?.minPercent ?? DEFAULT_MIN_PERCENT)
  const maxRegressionPoints = normalizeRegressionPoints(
    input.maxRegressionPoints,
    baseline?.maxRegressionPoints ?? DEFAULT_MAX_REGRESSION_POINTS,
  )
  // Improvements over the baseline count as zero regression, never negative.
  const regressionPoints = baseline && aggregate.percent != null
    ? Math.max(0, baseline.baselinePercent - aggregate.percent)
    : null

  const checks: EvalGateCheck[] = []

  if (aggregate.runs.length === 0) {
    checks.push({
      code: 'no_eval_runs',
      status: 'fail',
      message: 'No completed eval runs are available for this gate.',
    })
  }

  const missingCount = aggregate.missingScenarioIds.length
  if (missingCount > 0) {
    checks.push({
      code: 'missing_scope_runs',
      status: 'fail',
      // Verb agrees with the count: "1 scenario has", "2 scenarios have".
      message: `${missingCount} ${missingCount === 1 ? 'scenario has' : 'scenarios have'} no latest run in this gate.`,
      detail: aggregate.missingScenarioIds.join(', '),
    })
  }

  if (aggregate.runs.some((run) => run.status === 'failed')) {
    checks.push({
      code: 'failed_eval_run',
      status: 'fail',
      message: 'At least one latest eval run failed.',
    })
  }

  // A percent that cannot be computed is treated as below the threshold.
  if (aggregate.percent == null || aggregate.percent < minPercent) {
    checks.push({
      code: 'score_below_threshold',
      status: 'fail',
      message: `Current score is below the ${minPercent}% gate.`,
      detail: aggregate.percent == null ? 'n/a' : `${aggregate.percent}%`,
    })
  } else {
    checks.push({
      code: 'score_threshold_met',
      status: 'pass',
      message: `Current score meets the ${minPercent}% gate.`,
      detail: `${aggregate.percent}%`,
    })
  }

  if (!baseline) {
    checks.push({
      code: 'baseline_missing',
      status: 'warn',
      message: 'No approved baseline is set for this gate.',
    })
  } else if (regressionPoints != null && regressionPoints > maxRegressionPoints) {
    checks.push({
      code: 'regression_limit_exceeded',
      status: 'fail',
      message: `Regression exceeds the ${maxRegressionPoints} point allowance.`,
      detail: `${regressionPoints} points below baseline`,
    })
  } else if (regressionPoints != null) {
    checks.push({
      code: 'regression_within_limit',
      status: 'pass',
      message: `Regression is within the ${maxRegressionPoints} point allowance.`,
      detail: `${regressionPoints} point${regressionPoints === 1 ? '' : 's'} below baseline`,
    })
  }

  return {
    agentId: input.agentId,
    scope,
    status: statusFromChecks(checks),
    generatedAt,
    baseline,
    latestRuns: aggregate.runs,
    currentScore: aggregate.score,
    currentMaxScore: aggregate.maxScore,
    currentPercent: aggregate.percent,
    regressionPoints,
    minPercent,
    maxRegressionPoints,
    checks,
  }
}