@swarmclawai/swarmclaw 1.9.6 → 1.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -399,6 +399,16 @@ Operational docs: https://swarmclaw.ai/docs/observability
399
399
 
400
400
  ## Releases
401
401
 
402
+ ### v1.9.7 Highlights
403
+
404
+ Bundled eval-gate release: approved baselines, regression checks, and Quality Center release gates for repeatable eval evidence.
405
+
406
+ - **Eval regression baselines.** Operators can snapshot the latest scenario or suite score as an approved baseline with minimum score and regression allowance settings.
407
+ - **Release gate API.** `/api/eval/gate` compares current eval evidence against thresholds and baselines, while `/api/eval/baselines` lists and updates approved baselines.
408
+ - **CLI gate checks.** `swarmclaw eval gate`, `swarmclaw eval baselines`, and `swarmclaw eval baseline-set` expose the same release-gate workflow from automation.
409
+ - **Quality Center gate panel.** Eval Lab now shows pass/warn/fail status, latest-run coverage, current score, baseline score, regression points, and actionable checks.
410
+ - **Public-source hygiene.** Generic implementation comments now describe SwarmClaw behavior without naming internal comparison sources.
411
+
402
412
  ### v1.9.6 Highlights
403
413
 
404
414
  Bundled eval-environment release: validation preflights, deterministic eval workspaces, and clearer operator readiness before spending run budget.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@swarmclawai/swarmclaw",
3
- "version": "1.9.6",
3
+ "version": "1.9.7",
4
4
  "description": "Build and run autonomous AI agents with OpenClaw, Hermes, multiple model providers, orchestration, delegation, memory, skills, schedules, and chat connectors.",
5
5
  "main": "electron-dist/main.js",
6
6
  "license": "MIT",
@@ -87,7 +87,7 @@
87
87
  "test:cli": "node --test src/cli/*.test.js bin/*.test.js scripts/electron-after-pack.test.mjs scripts/ensure-sandbox-browser-image.test.mjs scripts/postinstall.test.mjs scripts/run-next-build.test.mjs scripts/run-next-typegen.test.mjs",
88
88
  "test:setup": "tsx --test src/app/api/setup/check-provider/route.test.ts src/lib/server/provider-model-discovery.test.ts src/components/auth/setup-wizard/utils.test.ts src/components/auth/setup-wizard/types.test.ts src/hooks/setup-done-detection.test.ts src/lib/setup-defaults.test.ts src/lib/server/storage-auth.test.ts src/lib/server/storage-auth-docker.test.ts",
89
89
  "test:openclaw": "tsx --test src/lib/openclaw/openclaw-agent-id.test.ts src/lib/openclaw/openclaw-endpoint.test.ts src/lib/server/agents/agent-runtime-config.test.ts src/lib/server/build-llm.test.ts src/lib/server/connectors/connector-routing.test.ts src/lib/server/connectors/openclaw.test.ts src/lib/server/connectors/swarmdock.test.ts src/lib/server/gateway/protocol.test.ts src/lib/server/gateways/gateway-topology.test.ts src/lib/server/llm-response-cache.test.ts src/lib/server/mcp-conformance.test.ts src/lib/server/openclaw/agent-resolver.test.ts src/lib/server/openclaw/deploy.test.ts src/lib/server/openclaw/skills-normalize.test.ts src/lib/server/session-tools/openclaw-nodes.test.ts src/lib/server/session-tools/swarmdock.test.ts src/lib/server/tasks/task-quality-gate.test.ts src/lib/server/tasks/task-validation.test.ts src/lib/server/tool-capability-policy.test.ts src/lib/providers/openai.test.ts src/lib/providers/openclaw-exports.test.ts src/app/api/gateways/topology-route.test.ts src/app/api/openclaw/dashboard-url/route.test.ts",
90
- "test:runtime": "tsx --test src/lib/a2a/agent-card.test.ts src/lib/strip-internal-metadata.test.ts src/lib/provider-sets.test.ts src/lib/providers/opencode-cli.test.ts src/lib/providers/cli-provider-metadata.test.ts src/lib/providers/cli-utils.test.ts src/lib/providers/generic-cli.test.ts src/lib/server/agents/delegation-advisory.test.ts src/lib/server/cli-provider-readiness.test.ts src/lib/server/provider-health.test.ts src/lib/server/mcp-gateway-runtime.test.ts src/lib/server/mcp-connection-pool.test.ts src/lib/server/knowledge-sources.test.ts src/lib/server/extension-managed-resources.test.ts src/lib/server/eval/environment-plan.test.ts src/lib/server/chat-execution/chat-execution-grounding.test.ts src/lib/server/chat-execution/chat-turn-preparation.test.ts src/lib/server/chat-execution/iteration-timers.test.ts src/lib/server/chat-execution/post-stream-finalization.test.ts src/lib/server/chat-execution/reasoning-tag-scrubber.test.ts src/lib/server/chats/clear-undo-snapshots.test.ts src/lib/server/connectors/email.test.ts src/lib/server/protocols/protocol-service.test.ts src/lib/server/runtime/run-ledger.test.ts src/lib/server/runtime/queue-retry-policy.test.ts src/lib/server/runs/run-brief.test.ts src/lib/server/operations/operation-pulse.test.ts src/lib/server/artifacts/artifact-resolver.test.ts src/lib/server/observability/otel-config.test.ts src/lib/server/safe-parse-body.test.ts src/lib/server/missions/mission-templates.test.ts src/lib/server/sharing/share-link-repository.test.ts src/lib/server/sharing/share-resolver.test.ts src/lib/server/tasks/task-execution-workspace.test.ts src/lib/server/tasks/task-service.test.ts src/lib/server/session-tools/execute.test.ts src/lib/server/session-tools/manage-tasks.test.ts src/lib/app/view-constants.test.ts src/lib/quality/quality-summary.test.ts src/app/api/approvals/route.test.ts src/app/api/agents/agents-route.test.ts src/app/api/tasks/tasks-route.test.ts src/app/api/tasks/task-workspace-route.test.ts 
src/app/api/chats/chat-route.test.ts src/app/api/chats/clear-route.test.ts src/app/api/chats/compact-route.test.ts src/app/api/chats/context-status-route.test.ts src/app/api/connectors/connector-doctor-route.test.ts src/app/api/extensions/managed-resources/route.test.ts src/app/api/healthz/route.test.ts src/app/api/logs/route.test.ts src/app/api/portability/export/route.test.ts src/app/api/portability/import/route.test.ts src/app/api/providers/[id]/route.test.ts src/app/api/tts/route.test.ts",
90
+ "test:runtime": "tsx --test src/lib/a2a/agent-card.test.ts src/lib/strip-internal-metadata.test.ts src/lib/provider-sets.test.ts src/lib/providers/opencode-cli.test.ts src/lib/providers/cli-provider-metadata.test.ts src/lib/providers/cli-utils.test.ts src/lib/providers/generic-cli.test.ts src/lib/server/agents/delegation-advisory.test.ts src/lib/server/cli-provider-readiness.test.ts src/lib/server/provider-health.test.ts src/lib/server/mcp-gateway-runtime.test.ts src/lib/server/mcp-connection-pool.test.ts src/lib/server/knowledge-sources.test.ts src/lib/server/extension-managed-resources.test.ts src/lib/server/eval/baseline.test.ts src/lib/server/eval/environment-plan.test.ts src/lib/server/chat-execution/chat-execution-grounding.test.ts src/lib/server/chat-execution/chat-turn-preparation.test.ts src/lib/server/chat-execution/iteration-timers.test.ts src/lib/server/chat-execution/post-stream-finalization.test.ts src/lib/server/chat-execution/reasoning-tag-scrubber.test.ts src/lib/server/chats/clear-undo-snapshots.test.ts src/lib/server/connectors/email.test.ts src/lib/server/protocols/protocol-service.test.ts src/lib/server/runtime/run-ledger.test.ts src/lib/server/runtime/queue-retry-policy.test.ts src/lib/server/runs/run-brief.test.ts src/lib/server/operations/operation-pulse.test.ts src/lib/server/artifacts/artifact-resolver.test.ts src/lib/server/observability/otel-config.test.ts src/lib/server/safe-parse-body.test.ts src/lib/server/missions/mission-templates.test.ts src/lib/server/sharing/share-link-repository.test.ts src/lib/server/sharing/share-resolver.test.ts src/lib/server/tasks/task-execution-workspace.test.ts src/lib/server/tasks/task-service.test.ts src/lib/server/session-tools/execute.test.ts src/lib/server/session-tools/manage-tasks.test.ts src/lib/app/view-constants.test.ts src/lib/quality/quality-summary.test.ts src/app/api/approvals/route.test.ts src/app/api/agents/agents-route.test.ts src/app/api/tasks/tasks-route.test.ts 
src/app/api/tasks/task-workspace-route.test.ts src/app/api/chats/chat-route.test.ts src/app/api/chats/clear-route.test.ts src/app/api/chats/compact-route.test.ts src/app/api/chats/context-status-route.test.ts src/app/api/connectors/connector-doctor-route.test.ts src/app/api/extensions/managed-resources/route.test.ts src/app/api/healthz/route.test.ts src/app/api/logs/route.test.ts src/app/api/portability/export/route.test.ts src/app/api/portability/import/route.test.ts src/app/api/providers/[id]/route.test.ts src/app/api/tts/route.test.ts",
91
91
  "test:builder": "tsx --test src/features/protocols/builder/utils/nodes-to-template.test.ts src/features/protocols/builder/utils/template-to-nodes.test.ts src/features/protocols/builder/validators/dag-validator.test.ts",
92
92
  "test:e2e": "node --import tsx scripts/browser-e2e-smoke.ts",
93
93
  "test:mcp:conformance": "node --import tsx ./scripts/mcp-conformance-check.ts",
@@ -0,0 +1,55 @@
1
+ import { NextResponse } from 'next/server'
2
+ import { z } from 'zod'
3
+ import { evaluateEvalGate, listEvalBaselinesForAgent, setEvalBaseline } from '@/lib/server/eval/baseline'
4
+ import { errorMessage } from '@/lib/shared-utils'
5
+
6
/**
 * Request-body shape accepted when saving an eval baseline.
 *
 * Only `agentId` is mandatory. Every other field may be omitted or sent as
 * `null`; when present, identifiers must be non-empty, numeric thresholds
 * must lie within 0-100, and free text is length-capped.
 */
const BaselineSchema = z.object({
  agentId: z.string().min(1),
  scenarioId: z.string().min(1).nullish(),
  suite: z.string().min(1).nullish(),
  minPercent: z.number().min(0).max(100).nullish(),
  maxRegressionPoints: z.number().min(0).max(100).nullish(),
  label: z.string().max(160).nullish(),
  notes: z.string().max(1_000).nullish(),
})
15
+
16
/**
 * Lists eval baselines, forwarding the optional `agentId` query parameter
 * (or `null` when absent) to `listEvalBaselinesForAgent`.
 *
 * Any thrown error is reported as a JSON `{ error }` payload with HTTP 500.
 */
export async function GET(req: Request) {
  try {
    const agentId = new URL(req.url).searchParams.get('agentId')
    const baselines = listEvalBaselinesForAgent(agentId)
    return NextResponse.json(baselines)
  } catch (err: unknown) {
    const error = errorMessage(err)
    return NextResponse.json({ error }, { status: 500 })
  }
}
28
+
29
/**
 * Saves an eval baseline from the JSON request body and returns it together
 * with a freshly evaluated gate for the same scope and thresholds.
 *
 * Responses:
 * - 400 when the body is not valid JSON or fails `BaselineSchema` validation.
 * - 500 when the baseline helpers throw or the body cannot be read.
 * - 200 with `{ baseline, gate }` otherwise.
 */
export async function POST(req: Request) {
  // Parse the body on its own so a malformed or empty payload is reported as
  // a client error (400) instead of falling into the generic 500 handler.
  let body: unknown
  try {
    body = await req.json()
  } catch (err: unknown) {
    const isSyntaxError = err instanceof SyntaxError
    return NextResponse.json(
      { error: isSyntaxError ? 'Request body must be valid JSON' : errorMessage(err) },
      { status: isSyntaxError ? 400 : 500 },
    )
  }

  try {
    const parsed = BaselineSchema.safeParse(body)
    if (!parsed.success) {
      return NextResponse.json(
        { error: parsed.error.issues.map((issue) => issue.message).join(', ') },
        { status: 400 },
      )
    }

    const baseline = setEvalBaseline(parsed.data)
    // Re-check the gate with the caller's thresholds so the response reflects
    // the state right after the baseline was stored.
    const gate = evaluateEvalGate({
      agentId: parsed.data.agentId,
      scenarioId: parsed.data.scenarioId,
      suite: parsed.data.suite,
      minPercent: parsed.data.minPercent,
      maxRegressionPoints: parsed.data.maxRegressionPoints,
    })
    return NextResponse.json({ baseline, gate })
  } catch (err: unknown) {
    return NextResponse.json(
      { error: errorMessage(err) },
      { status: 500 },
    )
  }
}
@@ -0,0 +1,36 @@
1
+ import { NextResponse } from 'next/server'
2
+ import { evaluateEvalGate } from '@/lib/server/eval/baseline'
3
+ import { errorMessage } from '@/lib/shared-utils'
4
+
5
/**
 * Converts a raw query-string value into a finite number.
 *
 * @param value Raw parameter value, or `null` when the parameter is absent.
 * @returns The parsed number, or `null` when the value is missing, blank, or
 *   does not convert to a finite number.
 */
function parseNumberParam(value: string | null): number | null {
  const text = value == null ? '' : value.trim()
  if (text === '') return null
  const numeric = Number(text)
  if (!Number.isFinite(numeric)) return null
  return numeric
}
10
+
11
/**
 * Evaluates the eval gate for the agent named in the query string.
 *
 * `agentId` is required (400 when missing or empty). `scenarioId` and `suite`
 * are passed through as-is, and `minPercent` / `maxRegressionPoints` are
 * parsed with `parseNumberParam`, so blank or non-numeric values become `null`.
 * Any thrown error is reported as `{ error }` with HTTP 500.
 */
export async function GET(req: Request) {
  try {
    const query = new URL(req.url).searchParams
    const agentId = query.get('agentId') || ''
    if (agentId === '') {
      return NextResponse.json({ error: 'agentId is required' }, { status: 400 })
    }

    const scenarioId = query.get('scenarioId')
    const suite = query.get('suite')
    const minPercent = parseNumberParam(query.get('minPercent'))
    const maxRegressionPoints = parseNumberParam(query.get('maxRegressionPoints'))

    const gate = evaluateEvalGate({ agentId, scenarioId, suite, minPercent, maxRegressionPoints })
    return NextResponse.json(gate)
  } catch (err: unknown) {
    const error = errorMessage(err)
    return NextResponse.json({ error }, { status: 500 })
  }
}
package/src/cli/index.js CHANGED
@@ -232,9 +232,12 @@ const COMMAND_GROUPS = [
232
232
  cmd('suites', 'GET', '/eval/suites', 'List available eval suites (core, swe-bench-lite, gaia-l1, ...)'),
233
233
  cmd('status', 'GET', '/eval/run', 'Get eval run status'),
234
234
  cmd('environment', 'GET', '/eval/environments', 'Preview validation environment readiness for an eval'),
235
+ cmd('baselines', 'GET', '/eval/baselines', 'List eval regression baselines'),
236
+ cmd('gate', 'GET', '/eval/gate', 'Check the latest eval score against thresholds and baseline'),
235
237
  cmd('run', 'POST', '/eval/run', 'Run an eval scenario against an agent', { expectsJsonBody: true }),
236
238
  cmd('suite', 'POST', '/eval/suite', 'Run a full eval suite against an agent (pass { suite: "swe-bench-lite" } in body)', { expectsJsonBody: true }),
237
239
  cmd('environment-prepare', 'POST', '/eval/environments', 'Prepare validation environment readiness for an eval', { expectsJsonBody: true }),
240
+ cmd('baseline-set', 'POST', '/eval/baselines', 'Set an eval regression baseline from latest completed runs', { expectsJsonBody: true }),
238
241
  ],
239
242
  },
240
243
  {
@@ -17,7 +17,7 @@ import {
17
17
  } from '@/lib/quality/quality-summary'
18
18
  import { cn } from '@/lib/utils'
19
19
  import { useAppStore } from '@/stores/use-app-store'
20
- import type { EvalEnvironmentPlan, EvalRun, EvalSuiteResult } from '@/lib/server/eval/types'
20
+ import type { EvalEnvironmentPlan, EvalGateResult, EvalRun, EvalSuiteResult } from '@/lib/server/eval/types'
21
21
  import type { Agent, ApprovalRequest, SessionRunRecord } from '@/types'
22
22
 
23
23
  type QualityTab = 'overview' | 'evals' | 'approvals' | 'runs'
@@ -117,6 +117,18 @@ function checkClass(level: 'info' | 'warn' | 'error'): string {
117
117
  return 'border-white/[0.06] bg-white/[0.025] text-text-3'
118
118
  }
119
119
 
120
/**
 * Picks the badge classes for an overall gate status: emerald for `pass`,
 * amber for `warn`, and rose for anything else.
 */
function gateStatusClass(status: EvalGateResult['status']): string {
  switch (status) {
    case 'pass':
      return 'border-emerald-500/25 bg-emerald-500/10 text-emerald-200'
    case 'warn':
      return 'border-amber-500/25 bg-amber-500/10 text-amber-200'
    default:
      return 'border-rose-500/25 bg-rose-500/10 text-rose-200'
  }
}
125
+
126
/**
 * Picks the row classes for a single gate check: rose for `fail`, amber for
 * `warn`, and emerald for anything else.
 */
function gateCheckClass(status: EvalGateResult['status']): string {
  switch (status) {
    case 'fail':
      return 'border-rose-500/20 bg-rose-500/[0.05] text-rose-200'
    case 'warn':
      return 'border-amber-500/20 bg-amber-500/[0.05] text-amber-200'
    default:
      return 'border-emerald-500/20 bg-emerald-500/[0.05] text-emerald-200'
  }
}
131
+
120
132
  function EvalEnvironmentPanel({ plan, loading, onRefresh }: {
121
133
  plan: EvalEnvironmentPlan | null
122
134
  loading: boolean
@@ -195,6 +207,115 @@ function EvalEnvironmentPanel({ plan, loading, onRefresh }: {
195
207
  )
196
208
  }
197
209
 
210
/**
 * Regression-gate card: shows the gate status, score tiles, and the first few
 * checks for the selected scope, plus controls to refresh the gate, switch
 * between scenario and suite scope, and save the current result as a baseline.
 *
 * Props:
 * - `gate`: latest gate result, or `null` when nothing has been loaded.
 * - `loading`: a gate check is in flight (disables Refresh).
 * - `busy`: a baseline save is in flight (disables the baseline button).
 * - `scope` / `onScopeChange`: the selected scope and its setter.
 * - `onRefresh` / `onSetBaseline`: action callbacks owned by the parent.
 */
function EvalGatePanel({
  gate,
  loading,
  busy,
  scope,
  onScopeChange,
  onRefresh,
  onSetBaseline,
}: {
  gate: EvalGateResult | null
  loading: boolean
  busy: boolean
  scope: 'scenario' | 'suite'
  onScopeChange: (scope: 'scenario' | 'suite') => void
  onRefresh: () => void
  onSetBaseline: () => void
}) {
  // Only the first few checks are listed; the remainder is summarised as a count.
  const maxVisibleChecks = 4
  const hiddenCheckCount = gate ? Math.max(0, gate.checks.length - maxVisibleChecks) : 0
  // A baseline cannot be saved without latest runs covering the scope.
  const baselineBlocked = !gate
    || gate.latestRuns.length === 0
    || gate.checks.some((check) => check.code === 'missing_scope_runs')

  return (
    <div className="rounded-[12px] border border-white/[0.06] bg-white/[0.025] px-3 py-3">
      <div className="flex items-start justify-between gap-3">
        <div>
          <div className="text-[13px] font-800 text-text">Regression gate</div>
          <p className="mt-1 text-[11px] leading-relaxed text-text-3/65">
            Compare latest eval evidence against thresholds and an approved baseline.
          </p>
        </div>
        <button
          type="button"
          onClick={onRefresh}
          disabled={loading}
          className="shrink-0 rounded-[8px] border border-white/[0.08] px-2 py-1 text-[10px] font-800 text-text-2 transition-colors hover:bg-white/[0.06] disabled:opacity-40"
        >
          {loading ? 'Checking' : 'Refresh'}
        </button>
      </div>

      <div className="mt-3 flex rounded-[10px] border border-white/[0.06] bg-white/[0.025] p-1">
        {(['scenario', 'suite'] as const).map((item) => (
          <button
            key={item}
            type="button"
            // Expose the selected scope to assistive technology, not just visually.
            aria-pressed={scope === item}
            onClick={() => onScopeChange(item)}
            className={cn(
              'flex-1 rounded-[8px] px-2 py-1.5 text-[10px] font-800 uppercase tracking-[0.08em] transition-colors',
              scope === item ? 'bg-white/[0.1] text-text' : 'text-text-3 hover:bg-white/[0.05]',
            )}
          >
            {item}
          </button>
        ))}
      </div>

      {!gate ? (
        <div className="mt-3 text-[11px] text-text-3/60">{loading ? 'Checking gate...' : 'Run evals to build gate evidence.'}</div>
      ) : (
        <div className="mt-3 flex flex-col gap-3">
          <div className="flex flex-wrap items-center gap-2">
            <span className={cn('rounded-full border px-2 py-1 text-[10px] font-800 uppercase tracking-[0.08em]', gateStatusClass(gate.status))}>
              {gate.status}
            </span>
            <span className="rounded-full bg-white/[0.05] px-2 py-1 text-[10px] font-700 text-text-3">
              {gate.scope.label}
            </span>
            <span className="rounded-full bg-white/[0.05] px-2 py-1 text-[10px] font-700 text-text-3">
              {gate.latestRuns.length}/{gate.scope.scenarioIds.length} latest runs
            </span>
          </div>

          <div className="grid grid-cols-3 gap-2">
            <div className="rounded-[10px] border border-white/[0.06] bg-white/[0.02] px-2 py-2">
              <div className="text-[9px] font-800 uppercase tracking-[0.08em] text-text-3/50">Current</div>
              <div className="mt-1 text-[14px] font-800 text-text">{formatPercent(gate.currentPercent)}</div>
            </div>
            <div className="rounded-[10px] border border-white/[0.06] bg-white/[0.02] px-2 py-2">
              <div className="text-[9px] font-800 uppercase tracking-[0.08em] text-text-3/50">Baseline</div>
              <div className="mt-1 text-[14px] font-800 text-text">{gate.baseline ? `${gate.baseline.baselinePercent}%` : 'none'}</div>
            </div>
            <div className="rounded-[10px] border border-white/[0.06] bg-white/[0.02] px-2 py-2">
              <div className="text-[9px] font-800 uppercase tracking-[0.08em] text-text-3/50">Regression</div>
              <div className="mt-1 text-[14px] font-800 text-text">{gate.regressionPoints == null ? 'n/a' : `${gate.regressionPoints}pt`}</div>
            </div>
          </div>

          <div className="flex flex-col gap-1.5">
            {gate.checks.slice(0, maxVisibleChecks).map((check, index) => (
              // The index keeps keys unique even if two checks share code and message.
              <div key={`${index}:${check.code}:${check.message}`} className={cn('rounded-[9px] border px-2.5 py-2 text-[11px] leading-relaxed', gateCheckClass(check.status))}>
                <span className="font-800 uppercase tracking-[0.08em]">{check.status}</span>
                <span className="ml-2">{check.message}</span>
              </div>
            ))}
            {hiddenCheckCount > 0 && (
              <div className="text-[10px] text-text-3/55">+{hiddenCheckCount} more check{hiddenCheckCount === 1 ? '' : 's'}</div>
            )}
          </div>

          <button
            type="button"
            onClick={onSetBaseline}
            disabled={busy || baselineBlocked}
            className="rounded-[9px] border border-white/[0.08] bg-white/[0.04] px-3 py-2 text-[11px] font-800 text-text-2 transition-colors hover:bg-white/[0.08] disabled:cursor-not-allowed disabled:opacity-40"
          >
            {busy ? 'Saving baseline' : gate.baseline ? 'Update baseline' : 'Set baseline'}
          </button>
        </div>
      )}
    </div>
  )
}
318
+
198
319
  export function QualityWorkspace() {
199
320
  const router = useRouter()
200
321
  const searchParams = useSearchParams()
@@ -219,6 +340,10 @@ export function QualityWorkspace() {
219
340
  const [evalBusy, setEvalBusy] = useState<string | null>(null)
220
341
  const [evalEnvironmentPlan, setEvalEnvironmentPlan] = useState<EvalEnvironmentPlan | null>(null)
221
342
  const [evalEnvironmentLoading, setEvalEnvironmentLoading] = useState(false)
343
+ const [evalGate, setEvalGate] = useState<EvalGateResult | null>(null)
344
+ const [evalGateScope, setEvalGateScope] = useState<'scenario' | 'suite'>('scenario')
345
+ const [evalGateLoading, setEvalGateLoading] = useState(false)
346
+ const [evalBaselineBusy, setEvalBaselineBusy] = useState(false)
222
347
  const [approvalBusy, setApprovalBusy] = useState<string | null>(null)
223
348
 
224
349
  useEffect(() => {
@@ -283,6 +408,30 @@ export function QualityWorkspace() {
283
408
  }
284
409
  }, [selectedAgentId, selectedScenarioId, selectedSuite])
285
410
 
411
+ const loadEvalGate = useCallback(async () => {
412
+ if (!selectedAgentId) {
413
+ setEvalGate(null)
414
+ return
415
+ }
416
+ if (evalGateScope === 'scenario' && !selectedScenarioId) {
417
+ setEvalGate(null)
418
+ return
419
+ }
420
+ const params = new URLSearchParams({ agentId: selectedAgentId })
421
+ if (evalGateScope === 'scenario') params.set('scenarioId', selectedScenarioId)
422
+ else params.set('suite', selectedSuite)
423
+ setEvalGateLoading(true)
424
+ try {
425
+ const gate = await api<EvalGateResult>('GET', `/eval/gate?${params.toString()}`)
426
+ setEvalGate(gate)
427
+ } catch (err) {
428
+ setEvalGate(null)
429
+ toast.error(err instanceof Error ? err.message : 'Unable to check eval gate')
430
+ } finally {
431
+ setEvalGateLoading(false)
432
+ }
433
+ }, [evalGateScope, selectedAgentId, selectedScenarioId, selectedSuite])
434
+
286
435
  useEffect(() => {
287
436
  void loadQualityData()
288
437
  }, [loadQualityData])
@@ -301,6 +450,10 @@ export function QualityWorkspace() {
301
450
  void loadEvalEnvironmentPlan()
302
451
  }, [loadEvalEnvironmentPlan])
303
452
 
453
+ useEffect(() => {
454
+ void loadEvalGate()
455
+ }, [loadEvalGate])
456
+
304
457
  useEffect(() => {
305
458
  if (!suites.some((suite) => suite.name === selectedSuite) && suites[0]) {
306
459
  setSelectedSuite(suites[0].name)
@@ -341,12 +494,13 @@ export function QualityWorkspace() {
341
494
  toast.success('Eval scenario completed')
342
495
  await loadQualityData({ silent: true })
343
496
  await loadEvalEnvironmentPlan()
497
+ await loadEvalGate()
344
498
  } catch (err) {
345
499
  toast.error(err instanceof Error ? err.message : 'Eval scenario failed')
346
500
  } finally {
347
501
  setEvalBusy(null)
348
502
  }
349
- }, [evalEnvironmentPlan, loadEvalEnvironmentPlan, loadQualityData, selectedAgentId, selectedScenarioId])
503
+ }, [evalEnvironmentPlan, loadEvalEnvironmentPlan, loadEvalGate, loadQualityData, selectedAgentId, selectedScenarioId])
350
504
 
351
505
  const runSuite = useCallback(async (suiteName: string) => {
352
506
  if (!selectedAgentId) {
@@ -369,12 +523,37 @@ export function QualityWorkspace() {
369
523
  toast.success(`Suite completed at ${Math.round(result.percentage)}%`)
370
524
  await loadQualityData({ silent: true })
371
525
  await loadEvalEnvironmentPlan()
526
+ await loadEvalGate()
372
527
  } catch (err) {
373
528
  toast.error(err instanceof Error ? err.message : 'Eval suite failed')
374
529
  } finally {
375
530
  setEvalBusy(null)
376
531
  }
377
- }, [evalEnvironmentPlan, loadEvalEnvironmentPlan, loadQualityData, selectedAgentId])
532
+ }, [evalEnvironmentPlan, loadEvalEnvironmentPlan, loadEvalGate, loadQualityData, selectedAgentId])
533
+
534
+ const setEvalBaseline = useCallback(async () => {
535
+ if (!selectedAgentId) {
536
+ toast.error('Choose an agent first')
537
+ return
538
+ }
539
+ if (evalGateScope === 'scenario' && !selectedScenarioId) {
540
+ toast.error('Choose a scenario first')
541
+ return
542
+ }
543
+ setEvalBaselineBusy(true)
544
+ try {
545
+ const body = evalGateScope === 'scenario'
546
+ ? { agentId: selectedAgentId, scenarioId: selectedScenarioId, minPercent: evalGate?.minPercent ?? 80, maxRegressionPoints: evalGate?.maxRegressionPoints ?? 5 }
547
+ : { agentId: selectedAgentId, suite: selectedSuite, minPercent: evalGate?.minPercent ?? 80, maxRegressionPoints: evalGate?.maxRegressionPoints ?? 5 }
548
+ const result = await api<{ gate: EvalGateResult }>('POST', '/eval/baselines', body)
549
+ setEvalGate(result.gate)
550
+ toast.success('Eval baseline saved')
551
+ } catch (err) {
552
+ toast.error(err instanceof Error ? err.message : 'Unable to save eval baseline')
553
+ } finally {
554
+ setEvalBaselineBusy(false)
555
+ }
556
+ }, [evalGate, evalGateScope, selectedAgentId, selectedScenarioId, selectedSuite])
378
557
 
379
558
  const actOnApproval = useCallback(async (approval: ApprovalRequest, approved: boolean) => {
380
559
  setApprovalBusy(approval.id)
@@ -600,6 +779,15 @@ export function QualityWorkspace() {
600
779
  loading={evalEnvironmentLoading}
601
780
  onRefresh={() => void loadEvalEnvironmentPlan({ refreshGateway: true })}
602
781
  />
782
+ <EvalGatePanel
783
+ gate={evalGate}
784
+ loading={evalGateLoading}
785
+ busy={evalBaselineBusy}
786
+ scope={evalGateScope}
787
+ onScopeChange={setEvalGateScope}
788
+ onRefresh={() => void loadEvalGate()}
789
+ onSetBaseline={() => void setEvalBaseline()}
790
+ />
603
791
  <button
604
792
  type="button"
605
793
  onClick={() => openMissionTemplate('release-candidate-qa')}
@@ -0,0 +1,111 @@
1
+ import assert from 'node:assert/strict'
2
+ import test from 'node:test'
3
+
4
+ import {
5
+ evaluateEvalGate,
6
+ setEvalBaseline,
7
+ } from './baseline'
8
+ import type { EvalBaseline, EvalRun } from './types'
9
+
10
/**
 * Builds a completed `EvalRun` fixture (agent-1, coding-prime, score 8/10)
 * and applies any field overrides on top of those defaults.
 */
function makeRun(overrides: Partial<EvalRun> = {}): EvalRun {
  const defaults: EvalRun = {
    id: 'run-1',
    scenarioId: 'coding-prime',
    agentId: 'agent-1',
    status: 'completed',
    startedAt: 1,
    endedAt: 2,
    score: 8,
    maxScore: 10,
    details: [],
  }
  return { ...defaults, ...overrides }
}
24
+
25
/**
 * Creates in-memory dependencies for the baseline helpers under test.
 *
 * @param runs Runs available to `listRunsByAgent`, filtered by agent id.
 * @param baseline Value returned by `getBaselineForScope` for every scope.
 * @param saved Array that receives each baseline handed to `saveBaseline`.
 */
function depsFor(runs: EvalRun[], baseline: EvalBaseline | null = null, saved: EvalBaseline[] = []) {
  const now = () => 123
  const listRunsByAgent = (agentId: string) => runs.filter((candidate) => candidate.agentId === agentId)
  const getBaselineForScope = () => baseline
  const saveBaseline = (next: EvalBaseline) => {
    saved.push(next)
  }
  return { now, listRunsByAgent, getBaselineForScope, saveBaseline }
}
33
+
34
// Baselining a scenario should persist exactly one record built from the most
// recent run (8/10 -> 80%) and carry over the caller-supplied thresholds.
test('setEvalBaseline snapshots the latest scenario score and gate defaults', () => {
  const persisted: EvalBaseline[] = []
  const history = [
    makeRun({ id: 'older', score: 4, startedAt: 1, endedAt: 2 }),
    makeRun({ id: 'latest', score: 8, startedAt: 5, endedAt: 6 }),
  ]
  const request = {
    agentId: 'agent-1',
    scenarioId: 'coding-prime',
    minPercent: 75,
    maxRegressionPoints: 3,
    label: 'Release candidate',
  }

  const baseline = setEvalBaseline(request, depsFor(history, null, persisted))

  assert.equal(persisted.length, 1)
  assert.equal(baseline.scope.type, 'scenario')
  assert.equal(baseline.scope.id, 'coding-prime')
  assert.equal(baseline.baselinePercent, 80)
  assert.equal(baseline.minPercent, 75)
  assert.equal(baseline.maxRegressionPoints, 3)
  assert.deepEqual(baseline.runIds, ['latest'])
})
58
+
59
// With a passing score but no stored baseline, the gate should only warn and
// report a `baseline_missing` check.
test('evaluateEvalGate warns until a baseline is approved', () => {
  const deps = depsFor([makeRun({ score: 8, maxScore: 10 })])
  const gate = evaluateEvalGate({ agentId: 'agent-1', scenarioId: 'coding-prime', minPercent: 70 }, deps)

  assert.equal(gate.currentPercent, 80)
  assert.equal(gate.status, 'warn')
  const hasMissingBaselineWarning = gate.checks.some(
    (check) => check.code === 'baseline_missing' && check.status === 'warn',
  )
  assert.ok(hasMissingBaselineWarning)
})
69
+
70
// A drop from a 90% baseline to 60% is 30 points, far beyond the 2-point
// allowance, so the gate must fail with `regression_limit_exceeded`.
test('evaluateEvalGate fails when regression exceeds the baseline allowance', () => {
  const baselineRuns = [makeRun({ id: 'baseline', score: 9, maxScore: 10 })]
  const baseline = setEvalBaseline(
    { agentId: 'agent-1', scenarioId: 'coding-prime', minPercent: 70, maxRegressionPoints: 2 },
    depsFor(baselineRuns),
  )

  const currentRuns = [makeRun({ id: 'current', score: 6, maxScore: 10, startedAt: 10, endedAt: 11 })]
  const gate = evaluateEvalGate(
    { agentId: 'agent-1', scenarioId: 'coding-prime' },
    depsFor(currentRuns, baseline),
  )

  assert.equal(gate.currentPercent, 60)
  assert.equal(gate.regressionPoints, 30)
  assert.equal(gate.status, 'fail')
  assert.ok(gate.checks.some((check) => check.code === 'regression_limit_exceeded'))
})
86
+
87
// Matching the baseline score (80% vs 80%) with a 70% floor should pass with
// zero regression and a `score_threshold_met` check.
test('evaluateEvalGate passes when score and regression checks pass', () => {
  const baselineRuns = [makeRun({ id: 'baseline', score: 8, maxScore: 10 })]
  const baseline = setEvalBaseline(
    { agentId: 'agent-1', scenarioId: 'coding-prime', minPercent: 70, maxRegressionPoints: 5 },
    depsFor(baselineRuns),
  )

  const currentRuns = [makeRun({ id: 'current', score: 8, maxScore: 10, startedAt: 10, endedAt: 11 })]
  const gate = evaluateEvalGate(
    { agentId: 'agent-1', scenarioId: 'coding-prime' },
    depsFor(currentRuns, baseline),
  )

  assert.equal(gate.status, 'pass')
  assert.equal(gate.regressionPoints, 0)
  assert.ok(gate.checks.some((check) => check.code === 'score_threshold_met'))
})
102
+
103
// Baselining the `core` suite with a run for only one scenario must be rejected.
test('suite gates require latest runs for every scenario in scope before baselining', () => {
  const partialCoverage = depsFor([makeRun({ scenarioId: 'coding-prime' })])
  const attemptBaseline = () => setEvalBaseline({ agentId: 'agent-1', suite: 'core' }, partialCoverage)

  assert.throws(attemptBaseline, /Baseline requires latest runs for every scenario in scope/)
})
@@ -0,0 +1,274 @@
1
+ import {
2
+ getEvalBaselineForScope,
3
+ listEvalBaselines,
4
+ listEvalRunsByAgent,
5
+ saveEvalBaseline,
6
+ } from './store'
7
+ import { getScenario, getSuiteScenarios } from './scenarios'
8
+ import type {
9
+ EvalBaseline,
10
+ EvalGateCheck,
11
+ EvalGateResult,
12
+ EvalGateScope,
13
+ EvalGateScopeType,
14
+ EvalRun,
15
+ } from './types'
16
+
17
// Minimum passing percentage used when neither the caller nor a stored baseline supplies one.
const DEFAULT_MIN_PERCENT = 80
// Allowed drop (in percentage points) below the baseline when none is configured.
const DEFAULT_MAX_REGRESSION_POINTS = 5
// Upper bound passed to the run-listing dependency when searching for each scenario's latest run.
const MAX_LOOKBACK_RUNS = 1_000
20
+
21
/** Caller-supplied parameters for evaluating a gate. */
export interface EvalGateInput {
  /** Agent whose runs and baseline are inspected; must be non-blank. */
  agentId: string
  /** When non-blank after trimming, the gate is scoped to this single scenario. */
  scenarioId?: string | null
  /** Suite to gate when no scenario is given; a blank value resolves to 'core'. */
  suite?: string | null
  /** Optional override for the minimum passing percentage. */
  minPercent?: number | null
  /** Optional override for the allowed drop below the baseline, in percentage points. */
  maxRegressionPoints?: number | null
}

/** Gate parameters plus free-form metadata stored on the baseline. */
export interface SetEvalBaselineInput extends EvalGateInput {
  label?: string | null
  notes?: string | null
}

/** Injectable collaborators; each falls back to the store implementation (or Date.now) when omitted. */
interface EvalGateDeps {
  now?: () => number
  listRunsByAgent?: (agentId: string, limit: number) => EvalRun[]
  getBaselineForScope?: (agentId: string, scopeType: EvalGateScopeType, scopeId: string) => EvalBaseline | null
  saveBaseline?: (baseline: EvalBaseline) => void
  listBaselines?: (filters?: { agentId?: string; limit?: number }) => EvalBaseline[]
}

/** Combined score of the latest runs across every scenario in a scope. */
interface EvalAggregate {
  /** The latest runs that were aggregated. */
  runs: EvalRun[]
  /** Scenario ids in the scope that have no run among `runs`. */
  missingScenarioIds: string[]
  /** Sum of run scores; scenarios without a run contribute 0. */
  score: number
  /** Sum of maximum scores across all scenarios in the scope. */
  maxScore: number
  /** Rounded score/maxScore percentage, or null when it cannot be computed. */
  percent: number | null
}
49
+
50
/**
 * Coerce an optional percentage into a whole number within 0–100.
 *
 * @param value - Candidate percentage; may be null, undefined, NaN or infinite.
 * @param fallback - Returned as-is (not rounded or clamped) when `value` is unusable.
 * @returns The rounded and clamped percentage, or `fallback`.
 */
function normalizePercent(value: number | null | undefined, fallback: number): number {
  if (value == null || !Number.isFinite(value)) return fallback
  const capped = Math.min(100, Math.round(value))
  return Math.max(0, capped)
}
54
+
55
/**
 * Coerce an optional regression allowance into a non-negative whole number.
 *
 * @param value - Candidate allowance; may be null, undefined, NaN or infinite.
 * @param fallback - Returned as-is when `value` is unusable.
 * @returns The rounded allowance floored at zero, or `fallback`.
 */
function normalizeRegressionPoints(value: number | null | undefined, fallback: number): number {
  if (value == null || !Number.isFinite(value)) return fallback
  const rounded = Math.round(value)
  return Math.max(0, rounded)
}
59
+
60
/**
 * Express a score as a rounded percentage of its maximum.
 *
 * @returns The whole-number percentage, or null when either value is not
 *   finite or `maxScore` is not positive.
 */
function scorePercent(score: number, maxScore: number): number | null {
  const computable = Number.isFinite(score) && Number.isFinite(maxScore) && maxScore > 0
  return computable ? Math.round((score / maxScore) * 100) : null
}
64
+
65
/**
 * Total of the scoring-criterion weights declared for a scenario.
 *
 * @returns The summed weights, or 0 when the scenario is not found.
 */
function maxScoreForScenario(scenarioId: string): number {
  const scenario = getScenario(scenarioId)
  if (scenario == null) return 0

  let total = 0
  for (const criterion of scenario.scoringCriteria) {
    total += criterion.weight
  }
  return total
}
69
+
70
/**
 * Turn a scenario/suite selection into a concrete gate scope.
 *
 * A non-blank `scenarioId` takes precedence and yields a single-scenario
 * scope. Otherwise the suite is used, with a blank suite resolving to 'core'.
 *
 * @throws Error when the scenario is unknown, or the suite is unknown or empty.
 */
export function resolveEvalGateScope(input: Pick<EvalGateInput, 'scenarioId' | 'suite'>): EvalGateScope {
  const requestedScenarioId = input.scenarioId?.trim()

  if (!requestedScenarioId) {
    const suiteId = input.suite?.trim() || 'core'
    const members = getSuiteScenarios(suiteId)
    if (members.length === 0) throw new Error(`Unknown or empty eval suite: ${suiteId}`)
    const scenarioIds = members.map((member) => member.id)
    return { type: 'suite', id: suiteId, label: suiteId, scenarioIds }
  }

  const match = getScenario(requestedScenarioId)
  if (!match) throw new Error(`Unknown eval scenario: ${requestedScenarioId}`)
  return { type: 'scenario', id: match.id, label: match.name, scenarioIds: [match.id] }
}
93
+
94
/** Build the storage id for the baseline of one agent and one scope. */
export function evalBaselineId(agentId: string, scope: EvalGateScope): string {
  const { type, id } = scope
  return `eval-baseline:${agentId}:${type}:${id}`
}
97
+
98
/**
 * Pick the most recent finished run for each scenario in the scope.
 *
 * Runs outside the scope and runs still 'pending' or 'running' are ignored.
 * Recency is compared on `endedAt`, falling back to `startedAt`; on a tie the
 * run seen first is kept. The result follows the order of `scope.scenarioIds`
 * and omits scenarios that have no qualifying run.
 */
function latestRunsForScope(runs: EvalRun[], scope: EvalGateScope): EvalRun[] {
  const inScope = new Set(scope.scenarioIds)
  const newestByScenario = new Map<string, EvalRun>()
  const finishedAt = (run: EvalRun) => run.endedAt ?? run.startedAt

  runs
    .filter((run) => inScope.has(run.scenarioId))
    .filter((run) => run.status !== 'pending' && run.status !== 'running')
    .forEach((run) => {
      const holder = newestByScenario.get(run.scenarioId)
      // Only a strictly newer run displaces the current holder.
      if (holder && !(finishedAt(run) > finishedAt(holder))) return
      newestByScenario.set(run.scenarioId, run)
    })

  const ordered: EvalRun[] = []
  for (const scenarioId of scope.scenarioIds) {
    const run = newestByScenario.get(scenarioId)
    if (run) ordered.push(run)
  }
  return ordered
}
115
+
116
/**
 * Combine per-scenario runs into one score for the whole scope.
 *
 * A scenario without a run contributes 0 to the score. Its maximum comes from
 * the run's own `maxScore` when that is a finite number, otherwise from the
 * scenario's declared criterion weights.
 */
function aggregateRuns(scope: EvalGateScope, runs: EvalRun[]): EvalAggregate {
  const runFor = new Map<string, EvalRun>()
  for (const run of runs) runFor.set(run.scenarioId, run)

  const missingScenarioIds: string[] = []
  let score = 0
  let maxScore = 0

  for (const scenarioId of scope.scenarioIds) {
    if (!runFor.has(scenarioId)) missingScenarioIds.push(scenarioId)

    const run = runFor.get(scenarioId)
    score += run?.score ?? 0

    const recordedMax = run?.maxScore
    const usable = recordedMax != null && Number.isFinite(recordedMax)
    maxScore += usable ? recordedMax : maxScoreForScenario(scenarioId)
  }

  return { runs, missingScenarioIds, score, maxScore, percent: scorePercent(score, maxScore) }
}
132
+
133
/** Reduce individual check results to the worst status: fail, then warn, then pass. */
function statusFromChecks(checks: EvalGateCheck[]): EvalGateResult['status'] {
  const seen = new Set(checks.map((check) => check.status))
  if (seen.has('fail')) return 'fail'
  return seen.has('warn') ? 'warn' : 'pass'
}
138
+
139
/**
 * List stored baselines, optionally restricted to one agent.
 *
 * @param agentId - Agent filter; an empty or nullish value lists all agents.
 * @param deps - Optional `listBaselines` override; defaults to the store.
 * @returns The baselines returned by the lister, which is asked for at most 200.
 */
export function listEvalBaselinesForAgent(agentId?: string | null, deps: EvalGateDeps = {}): EvalBaseline[] {
  const fetchBaselines = deps.listBaselines || listEvalBaselines
  const filters = { agentId: agentId || undefined, limit: 200 }
  return fetchBaselines(filters)
}
143
+
144
/**
 * Snapshot the latest runs for a scope as the approved baseline and persist it.
 *
 * An existing baseline for the same agent and scope is updated in place: its
 * id and `createdAt` are kept, and its label, notes and regression allowance
 * are reused when the input does not provide them. When `minPercent` is not
 * provided it is set to the snapshot's own percentage (or the default when no
 * percentage can be computed), not to the previous baseline's value.
 *
 * @throws Error when `agentId` is blank, the scope cannot be resolved, no runs
 *   exist, or any scenario in the scope lacks a latest run.
 */
export function setEvalBaseline(input: SetEvalBaselineInput, deps: EvalGateDeps = {}): EvalBaseline {
  if (!input.agentId.trim()) throw new Error('agentId is required')

  const { agentId } = input
  const timestamp = deps.now?.() ?? Date.now()
  const scope = resolveEvalGateScope(input)
  const fetchRuns = deps.listRunsByAgent || listEvalRunsByAgent
  const snapshot = aggregateRuns(scope, latestRunsForScope(fetchRuns(agentId, MAX_LOOKBACK_RUNS), scope))

  if (snapshot.runs.length === 0) {
    throw new Error('Run the selected eval before setting a baseline.')
  }
  const { missingScenarioIds } = snapshot
  if (missingScenarioIds.length > 0) {
    throw new Error(`Baseline requires latest runs for every scenario in scope. Missing: ${missingScenarioIds.join(', ')}`)
  }

  const findBaseline = deps.getBaselineForScope || getEvalBaselineForScope
  const previous = findBaseline(agentId, scope.type, scope.id)
  const fallbackAllowance = previous?.maxRegressionPoints ?? DEFAULT_MAX_REGRESSION_POINTS

  const baseline: EvalBaseline = {
    id: previous?.id || evalBaselineId(agentId, scope),
    agentId,
    scope,
    baselineScore: snapshot.score,
    baselineMaxScore: snapshot.maxScore,
    baselinePercent: snapshot.percent ?? 0,
    minPercent: normalizePercent(input.minPercent, snapshot.percent ?? DEFAULT_MIN_PERCENT),
    maxRegressionPoints: normalizeRegressionPoints(input.maxRegressionPoints, fallbackAllowance),
    runIds: snapshot.runs.map((run) => run.id),
    label: input.label?.trim() || previous?.label || null,
    notes: input.notes?.trim() || previous?.notes || null,
    createdAt: previous?.createdAt || timestamp,
    updatedAt: timestamp,
  }

  const persist = deps.saveBaseline || saveEvalBaseline
  persist(baseline)
  return baseline
}
181
+
182
/**
 * Evaluate the release gate for an agent over a scenario or suite.
 *
 * The latest finished run of every scenario in scope is aggregated and checked
 * against a minimum percentage and, when an approved baseline exists, against
 * the allowed regression from that baseline. Thresholds come from the input,
 * then the baseline, then the module defaults.
 *
 * @param input - Agent, scope selection and optional threshold overrides.
 * @param deps - Optional overrides for the clock and store accessors.
 * @returns The gate result; `status` is the worst status among `checks`.
 * @throws Error when `agentId` is blank or the scope cannot be resolved.
 */
export function evaluateEvalGate(input: EvalGateInput, deps: EvalGateDeps = {}): EvalGateResult {
  if (!input.agentId.trim()) throw new Error('agentId is required')

  const generatedAt = deps.now?.() ?? Date.now()
  const scope = resolveEvalGateScope(input)
  const baseline = (deps.getBaselineForScope || getEvalBaselineForScope)(input.agentId, scope.type, scope.id)
  const runs = latestRunsForScope(
    (deps.listRunsByAgent || listEvalRunsByAgent)(input.agentId, MAX_LOOKBACK_RUNS),
    scope,
  )
  const aggregate = aggregateRuns(scope, runs)
  const minPercent = normalizePercent(input.minPercent, baseline?.minPercent ?? DEFAULT_MIN_PERCENT)
  const maxRegressionPoints = normalizeRegressionPoints(input.maxRegressionPoints, baseline?.maxRegressionPoints ?? DEFAULT_MAX_REGRESSION_POINTS)
  // Improvements over the baseline count as zero regression, never negative.
  const regressionPoints = baseline && aggregate.percent != null
    ? Math.max(0, baseline.baselinePercent - aggregate.percent)
    : null

  const checks: EvalGateCheck[] = []
  if (aggregate.runs.length === 0) {
    checks.push({
      code: 'no_eval_runs',
      status: 'fail',
      message: 'No completed eval runs are available for this gate.',
    })
  }
  if (aggregate.missingScenarioIds.length > 0) {
    const missingCount = aggregate.missingScenarioIds.length
    checks.push({
      code: 'missing_scope_runs',
      status: 'fail',
      // Noun and verb must agree: "1 scenario has" / "2 scenarios have".
      message: `${missingCount} scenario${missingCount === 1 ? ' has' : 's have'} no latest run in this gate.`,
      detail: aggregate.missingScenarioIds.join(', '),
    })
  }
  if (aggregate.runs.some((run) => run.status === 'failed')) {
    checks.push({
      code: 'failed_eval_run',
      status: 'fail',
      message: 'At least one latest eval run failed.',
    })
  }
  // An uncomputable percentage is treated as below the threshold.
  if (aggregate.percent == null || aggregate.percent < minPercent) {
    checks.push({
      code: 'score_below_threshold',
      status: 'fail',
      message: `Current score is below the ${minPercent}% gate.`,
      detail: aggregate.percent == null ? 'n/a' : `${aggregate.percent}%`,
    })
  } else {
    checks.push({
      code: 'score_threshold_met',
      status: 'pass',
      message: `Current score meets the ${minPercent}% gate.`,
      detail: `${aggregate.percent}%`,
    })
  }
  if (!baseline) {
    // A missing baseline only warns; the gate can still pass on score alone.
    checks.push({
      code: 'baseline_missing',
      status: 'warn',
      message: 'No approved baseline is set for this gate.',
    })
  } else if (regressionPoints != null && regressionPoints > maxRegressionPoints) {
    checks.push({
      code: 'regression_limit_exceeded',
      status: 'fail',
      message: `Regression exceeds the ${maxRegressionPoints} point allowance.`,
      // Pluralized the same way as the within-limit branch below.
      detail: `${regressionPoints} point${regressionPoints === 1 ? '' : 's'} below baseline`,
    })
  } else if (regressionPoints != null) {
    checks.push({
      code: 'regression_within_limit',
      status: 'pass',
      message: `Regression is within the ${maxRegressionPoints} point allowance.`,
      detail: `${regressionPoints} point${regressionPoints === 1 ? '' : 's'} below baseline`,
    })
  }

  return {
    agentId: input.agentId,
    scope,
    status: statusFromChecks(checks),
    generatedAt,
    baseline,
    latestRuns: aggregate.runs,
    currentScore: aggregate.score,
    currentMaxScore: aggregate.maxScore,
    currentPercent: aggregate.percent,
    regressionPoints,
    minPercent,
    maxRegressionPoints,
    checks,
  }
}
@@ -1,6 +1,6 @@
1
1
  import Database from 'better-sqlite3'
2
2
  import path from 'path'
3
- import type { EvalRun } from './types'
3
+ import type { EvalBaseline, EvalRun } from './types'
4
4
  import { DATA_DIR } from '../data-dir'
5
5
 
6
6
  const DB_PATH = path.join(DATA_DIR, 'eval-runs.db')
@@ -15,6 +15,15 @@ function getDb(): Database.Database {
15
15
  id TEXT PRIMARY KEY,
16
16
  data TEXT NOT NULL
17
17
  )`)
18
+ db.exec(`CREATE TABLE IF NOT EXISTS eval_baselines (
19
+ id TEXT PRIMARY KEY,
20
+ agent_id TEXT NOT NULL,
21
+ scope_type TEXT NOT NULL,
22
+ scope_id TEXT NOT NULL,
23
+ data TEXT NOT NULL,
24
+ updated_at INTEGER NOT NULL
25
+ )`)
26
+ db.exec('CREATE INDEX IF NOT EXISTS idx_eval_baselines_agent ON eval_baselines(agent_id, scope_type, scope_id)')
18
27
  }
19
28
  return db
20
29
  }
@@ -36,3 +45,40 @@ export function listEvalRuns(limit = 50): EvalRun[] {
36
45
/**
 * Return up to `limit` runs belonging to one agent.
 *
 * NOTE(review): the filter only sees the `limit * 2` runs returned by
 * `listEvalRuns`, which is not agent-specific, so runs for this agent outside
 * that window are not returned — confirm this is acceptable for callers.
 */
export function listEvalRunsByAgent(agentId: string, limit = 50): EvalRun[] {
  const candidates = listEvalRuns(limit * 2)
  const owned = candidates.filter((run) => run.agentId === agentId)
  return owned.slice(0, limit)
}
48
+
49
/**
 * Insert a baseline, or replace the row that already has the same id.
 *
 * The full baseline is stored as JSON in `data`; agent id, scope type, scope
 * id and `updatedAt` are also written to their own columns.
 */
export function saveEvalBaseline(baseline: EvalBaseline): void {
  const upsert = getDb().prepare(`
    INSERT OR REPLACE INTO eval_baselines (id, agent_id, scope_type, scope_id, data, updated_at)
    VALUES (?, ?, ?, ?, ?, ?)
  `)
  const { id, agentId, scope, updatedAt } = baseline
  upsert.run(id, agentId, scope.type, scope.id, JSON.stringify(baseline), updatedAt)
}
62
+
63
/** Load a baseline by id, or return null when no row matches. */
export function getEvalBaseline(id: string): EvalBaseline | null {
  const lookup = getDb().prepare('SELECT data FROM eval_baselines WHERE id = ?')
  const hit = lookup.get(id) as { data: string } | undefined
  if (!hit) return null
  return JSON.parse(hit.data) as EvalBaseline
}
67
+
68
/**
 * Load the most recently updated baseline for an agent and scope.
 *
 * @returns The parsed baseline, or null when none is stored for that scope.
 */
export function getEvalBaselineForScope(agentId: string, scopeType: EvalBaseline['scope']['type'], scopeId: string): EvalBaseline | null {
  const lookup = getDb().prepare(`
    SELECT data FROM eval_baselines
    WHERE agent_id = ? AND scope_type = ? AND scope_id = ?
    ORDER BY updated_at DESC
    LIMIT 1
  `)
  const hit = lookup.get(agentId, scopeType, scopeId) as { data: string } | undefined
  if (!hit) return null
  return JSON.parse(hit.data) as EvalBaseline
}
77
+
78
/**
 * List stored baselines, newest update first.
 *
 * @param filters.agentId - When set, only baselines for this agent are returned.
 * @param filters.limit - Maximum rows; defaults to 100 and is clamped to 1–500.
 *   A value that is not a number falls back to 100, and a fractional value is
 *   truncated, so the bound `LIMIT` parameter is always a whole number.
 * @returns The parsed baselines.
 */
export function listEvalBaselines(filters: { agentId?: string; limit?: number } = {}): EvalBaseline[] {
  // SQLite rejects a LIMIT that cannot be losslessly converted to an integer,
  // so NaN and fractional values are normalized before binding.
  const numeric = Number(filters.limit ?? 100)
  const requested = Number.isNaN(numeric) ? 100 : numeric
  const limit = Math.trunc(Math.max(1, Math.min(requested, 500)))
  const rows = filters.agentId
    ? getDb().prepare('SELECT data FROM eval_baselines WHERE agent_id = ? ORDER BY updated_at DESC LIMIT ?').all(filters.agentId, limit) as { data: string }[]
    : getDb().prepare('SELECT data FROM eval_baselines ORDER BY updated_at DESC LIMIT ?').all(limit) as { data: string }[]
  return rows.map((row) => JSON.parse(row.data) as EvalBaseline)
}
@@ -105,3 +105,53 @@ export interface EvalSuiteResult {
105
105
  runs: EvalRun[]
106
106
  completedAt: number
107
107
  }
108
+
109
/** Whether a gate covers a single scenario or a whole suite. */
export type EvalGateScopeType = 'scenario' | 'suite'

/** The set of scenarios a gate or baseline applies to. */
export interface EvalGateScope {
  type: EvalGateScopeType
  /** Scenario id or suite name, depending on `type`. */
  id: string
  /** Display label for the scope. */
  label: string
  /** Ids of every scenario covered by the scope. */
  scenarioIds: string[]
}

/** An approved score snapshot that later gate evaluations are compared against. */
export interface EvalBaseline {
  id: string
  agentId: string
  scope: EvalGateScope
  /** Aggregate score at the time the baseline was taken. */
  baselineScore: number
  /** Aggregate maximum score at the time the baseline was taken. */
  baselineMaxScore: number
  /** Baseline score as a percentage of the maximum. */
  baselinePercent: number
  /** Minimum passing percentage stored with the baseline. */
  minPercent: number
  /** Allowed drop below `baselinePercent`, in percentage points. */
  maxRegressionPoints: number
  /** Ids of the runs the baseline was computed from. */
  runIds: string[]
  label?: string | null
  notes?: string | null
  createdAt: number
  updatedAt: number
}

/** Outcome of a gate or of one of its checks. */
export type EvalGateStatus = 'pass' | 'warn' | 'fail'

/** One individual check contributing to a gate result. */
export interface EvalGateCheck {
  /** Identifier for the kind of check, e.g. 'baseline_missing'. */
  code: string
  status: EvalGateStatus
  message: string
  detail?: string
}

/** Full result of evaluating a gate for an agent and scope. */
export interface EvalGateResult {
  agentId: string
  scope: EvalGateScope
  status: EvalGateStatus
  /** Timestamp at which the result was produced. */
  generatedAt: number
  /** The approved baseline compared against, or null when none is set. */
  baseline: EvalBaseline | null
  /** The latest runs that were aggregated into the current score. */
  latestRuns: EvalRun[]
  currentScore: number
  currentMaxScore: number
  /** Current percentage, or null when it cannot be computed. */
  currentPercent: number | null
  /** Percentage points below the baseline, or null when not comparable. */
  regressionPoints: number | null
  /** Minimum passing percentage applied in this evaluation. */
  minPercent: number
  /** Regression allowance applied in this evaluation. */
  maxRegressionPoints: number
  checks: EvalGateCheck[]
}
@@ -138,7 +138,7 @@ module.exports = {
138
138
  }
139
139
  ],
140
140
 
141
- // --- Managed Resources (Paperclip-compatible) ---
141
+ // --- Managed Resources ---
142
142
  managedResources: {
143
143
  agents: [
144
144
  {
@@ -211,7 +211,7 @@ Key rules:
211
211
  - Dependency installs are run by the extension manager inside a per-extension workspace using the selected package manager with scripts disabled.
212
212
  - Extension settings are declared through ui.settingsFields and stored per extension ID
213
213
  - Managed resources let an extension declare provisionable agents, schedules/routines, trusted local folders, gateway platforms, and setup checks. Operators reconcile them through Extensions > Managed Resources or /api/extensions/managed-resources.
214
- - Paperclip-compatible top-level agents, routines, and localFolders are also accepted; SwarmClaw reconciles routines as schedules when they include schedule timing.
214
+ - Top-level agents, routines, and localFolders are also accepted; SwarmClaw reconciles routines as schedules when they include schedule timing.
215
215
  - Keep extensions focused: one clear purpose per extension
216
216
  `
217
217
  }
@@ -6,7 +6,7 @@ import { loadTasks, saveTasks } from '@/lib/server/tasks/task-repository'
6
6
  * Atomically transition a task from queued → running with a checkout run ID.
7
7
  *
8
8
  * Uses a SQLite IMMEDIATE transaction to prevent two runners from starting the
9
- * same task concurrently (Paperclip-inspired atomic checkout pattern).
9
+ * same task concurrently.
10
10
  *
11
11
  * Returns the checked-out task on success, or null if the task was already
12
12
  * taken, missing, or no longer in queued status.
@@ -362,10 +362,10 @@ export interface ExtensionSetupCheckDeclaration {
362
362
  export interface ExtensionManagedResources {
363
363
  agents?: ExtensionManagedAgentDeclaration[]
364
364
  schedules?: ExtensionManagedScheduleDeclaration[]
365
- /** Paperclip-compatible alias. SwarmClaw reconciles routines as managed schedules. */
365
+ /** Routine alias. SwarmClaw reconciles routines as managed schedules. */
366
366
  routines?: ExtensionManagedScheduleDeclaration[]
367
367
  localFolders?: ExtensionManagedLocalFolderDeclaration[]
368
- /** Hermes-style gateway/platform declaration metadata for setup and diagnostics surfaces. */
368
+ /** Gateway/platform declaration metadata for setup and diagnostics surfaces. */
369
369
  gatewayPlatforms?: ExtensionGatewayPlatformDeclaration[]
370
370
  setupChecks?: ExtensionSetupCheckDeclaration[]
371
371
  }
@@ -420,7 +420,7 @@ export interface Extension {
420
420
  providers?: ExtensionProviderDefinition[]
421
421
  connectors?: ExtensionConnectorDefinition[]
422
422
  managedResources?: ExtensionManagedResources
423
- /** Paperclip-compatible top-level aliases. Prefer managedResources for new SwarmClaw extensions. */
423
+ /** Top-level managed-resource aliases. Prefer managedResources for new SwarmClaw extensions. */
424
424
  agents?: ExtensionManagedAgentDeclaration[]
425
425
  schedules?: ExtensionManagedScheduleDeclaration[]
426
426
  routines?: ExtensionManagedScheduleDeclaration[]
@@ -1,218 +0,0 @@
1
- "use strict";
2
- var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
- if (k2 === undefined) k2 = k;
4
- var desc = Object.getOwnPropertyDescriptor(m, k);
5
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
- desc = { enumerable: true, get: function() { return m[k]; } };
7
- }
8
- Object.defineProperty(o, k2, desc);
9
- }) : (function(o, m, k, k2) {
10
- if (k2 === undefined) k2 = k;
11
- o[k2] = m[k];
12
- }));
13
- var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
- Object.defineProperty(o, "default", { enumerable: true, value: v });
15
- }) : function(o, v) {
16
- o["default"] = v;
17
- });
18
- var __importStar = (this && this.__importStar) || (function () {
19
- var ownKeys = function(o) {
20
- ownKeys = Object.getOwnPropertyNames || function (o) {
21
- var ar = [];
22
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
- return ar;
24
- };
25
- return ownKeys(o);
26
- };
27
- return function (mod) {
28
- if (mod && mod.__esModule) return mod;
29
- var result = {};
30
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
- __setModuleDefault(result, mod);
32
- return result;
33
- };
34
- })();
35
- var __importDefault = (this && this.__importDefault) || function (mod) {
36
- return (mod && mod.__esModule) ? mod : { "default": mod };
37
- };
38
- Object.defineProperty(exports, "__esModule", { value: true });
39
- const electron_1 = require("electron");
40
- const node_fs_1 = __importDefault(require("node:fs"));
41
- const node_path_1 = __importDefault(require("node:path"));
42
- const paths_1 = require("./paths");
43
- const server_lifecycle_1 = require("./server-lifecycle");
44
- const menu_1 = require("./menu");
45
- const DEV_URL_DEFAULT = 'http://127.0.0.1:3456';
46
- const LOG_TAIL_BYTES = 1500;
47
- let mainWindow = null;
48
- let serverHandle = null;
49
- let serverLogFile = null;
50
- let isQuitting = false;
51
- const gotLock = electron_1.app.requestSingleInstanceLock();
52
- if (!gotLock) {
53
- electron_1.app.quit();
54
- }
55
- else {
56
- electron_1.app.on('second-instance', () => {
57
- if (mainWindow) {
58
- if (mainWindow.isMinimized())
59
- mainWindow.restore();
60
- mainWindow.focus();
61
- }
62
- });
63
- electron_1.app.on('ready', () => void onReady());
64
- electron_1.app.on('window-all-closed', () => {
65
- if (process.platform !== 'darwin')
66
- electron_1.app.quit();
67
- });
68
- electron_1.app.on('activate', () => {
69
- if (mainWindow !== null)
70
- return;
71
- if (serverHandle) {
72
- createMainWindow(serverHandle.url);
73
- }
74
- else if (!electron_1.app.isPackaged) {
75
- createMainWindow(process.env.SWARMCLAW_DEV_URL || DEV_URL_DEFAULT);
76
- }
77
- });
78
- electron_1.app.on('before-quit', () => {
79
- isQuitting = true;
80
- });
81
- electron_1.app.on('will-quit', async (event) => {
82
- if (!serverHandle)
83
- return;
84
- event.preventDefault();
85
- try {
86
- await serverHandle.stop();
87
- }
88
- finally {
89
- serverHandle = null;
90
- electron_1.app.exit(0);
91
- }
92
- });
93
- }
94
- async function onReady() {
95
- const paths = (0, paths_1.resolveRuntimePaths)();
96
- (0, menu_1.buildAppMenu)(paths, () => mainWindow);
97
- const iconPath = resolveIconPath();
98
- if (process.platform === 'darwin' && iconPath && electron_1.app.dock) {
99
- const img = electron_1.nativeImage.createFromPath(iconPath);
100
- if (!img.isEmpty())
101
- electron_1.app.dock.setIcon(img);
102
- }
103
- if (!electron_1.app.isPackaged) {
104
- const devUrl = process.env.SWARMCLAW_DEV_URL || DEV_URL_DEFAULT;
105
- console.log(`[swarmclaw] dev mode, loading ${devUrl}`);
106
- createMainWindow(devUrl);
107
- return;
108
- }
109
- serverLogFile = node_path_1.default.join(electron_1.app.getPath('userData'), 'logs', 'server.log');
110
- node_fs_1.default.mkdirSync(node_path_1.default.dirname(serverLogFile), { recursive: true });
111
- try {
112
- serverHandle = await (0, server_lifecycle_1.startEmbeddedServer)({
113
- paths,
114
- logFile: serverLogFile,
115
- onStdout: (c) => process.stdout.write(`[swarmclaw] ${c}`),
116
- onStderr: (c) => process.stderr.write(`[swarmclaw] ${c}`),
117
- onExit: (code, signal) => {
118
- if (!isQuitting) {
119
- console.error(`[swarmclaw] server exited unexpectedly (code=${code}, signal=${signal ?? 'none'})`);
120
- void showServerCrashDialog(code, signal);
121
- }
122
- },
123
- });
124
- }
125
- catch (err) {
126
- await showStartupFailureDialog(err, paths);
127
- electron_1.app.exit(1);
128
- return;
129
- }
130
- createMainWindow(serverHandle.url);
131
- void Promise.resolve().then(() => __importStar(require('./updater'))).then((m) => m.initAutoUpdater());
132
- }
133
- function resolveIconPath() {
134
- const candidate = electron_1.app.isPackaged
135
- ? node_path_1.default.join(process.resourcesPath, 'icon.png')
136
- : node_path_1.default.join(__dirname, '..', 'resources', 'icon.png');
137
- return node_fs_1.default.existsSync(candidate) ? candidate : undefined;
138
- }
139
- function createMainWindow(startUrl) {
140
- const iconPath = resolveIconPath();
141
- mainWindow = new electron_1.BrowserWindow({
142
- width: 1440,
143
- height: 900,
144
- minWidth: 1024,
145
- minHeight: 640,
146
- backgroundColor: '#0b0b0f',
147
- show: true,
148
- ...(iconPath ? { icon: iconPath } : {}),
149
- webPreferences: {
150
- contextIsolation: true,
151
- nodeIntegration: false,
152
- sandbox: false,
153
- },
154
- });
155
- const wc = mainWindow.webContents;
156
- if (!electron_1.app.isPackaged)
157
- wc.openDevTools({ mode: 'detach' });
158
- wc.on('did-start-loading', () => console.log('[swarmclaw] did-start-loading'));
159
- wc.on('did-finish-load', () => console.log('[swarmclaw] did-finish-load'));
160
- wc.on('did-fail-load', (_e, code, desc, url) => console.error(`[swarmclaw] did-fail-load code=${code} desc=${desc} url=${url}`));
161
- wc.on('render-process-gone', (_e, details) => console.error(`[swarmclaw] render-process-gone reason=${details.reason}`));
162
- wc.on('unresponsive', () => console.error('[swarmclaw] webContents unresponsive'));
163
- mainWindow.on('closed', () => {
164
- mainWindow = null;
165
- });
166
- mainWindow.webContents.setWindowOpenHandler(({ url }) => {
167
- if (url.startsWith(startUrl))
168
- return { action: 'allow' };
169
- void electron_1.shell.openExternal(url);
170
- return { action: 'deny' };
171
- });
172
- void mainWindow.loadURL(startUrl).catch((err) => {
173
- console.error('[swarmclaw] loadURL rejected:', err);
174
- });
175
- }
176
- async function showServerCrashDialog(code, signal) {
177
- const buttons = serverLogFile ? ['Open Logs Folder', 'Quit'] : ['Quit'];
178
- const quitButtonId = buttons.length - 1;
179
- const detail = buildLogDetail(`code=${code ?? 'null'} signal=${signal ?? 'none'}`);
180
- const res = await electron_1.dialog.showMessageBox({
181
- type: 'error',
182
- buttons,
183
- defaultId: quitButtonId,
184
- cancelId: quitButtonId,
185
- title: 'SwarmClaw stopped',
186
- message: 'The SwarmClaw server exited unexpectedly.',
187
- detail,
188
- });
189
- if (serverLogFile && res.response === 0)
190
- electron_1.shell.showItemInFolder(serverLogFile);
191
- electron_1.app.exit(1);
192
- }
193
- async function showStartupFailureDialog(err, paths) {
194
- const message = err instanceof Error ? err.message : String(err);
195
- const base = `${message}\n\nStandalone entry: ${paths.standaloneEntry}\nData dir: ${paths.dataDir}`;
196
- const detail = buildLogDetail(base);
197
- const buttons = serverLogFile ? ['Open Logs Folder', 'Quit'] : ['Quit'];
198
- const quitButtonId = buttons.length - 1;
199
- const res = await electron_1.dialog.showMessageBox({
200
- type: 'error',
201
- buttons,
202
- defaultId: quitButtonId,
203
- cancelId: quitButtonId,
204
- title: 'SwarmClaw failed to start',
205
- message: 'The embedded server did not start.',
206
- detail,
207
- });
208
- if (serverLogFile && res.response === 0)
209
- electron_1.shell.showItemInFolder(serverLogFile);
210
- }
211
- function buildLogDetail(base) {
212
- if (!serverLogFile)
213
- return base;
214
- const tail = (0, server_lifecycle_1.tailLogFile)(serverLogFile, LOG_TAIL_BYTES).trim();
215
- if (!tail)
216
- return `${base}\n\nLog file: ${serverLogFile}\n(no output captured yet)`;
217
- return `${base}\n\nLog tail (${serverLogFile}):\n${tail}`;
218
- }