@swarmclawai/swarmclaw 1.9.6 → 1.9.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -0
- package/package.json +2 -2
- package/src/app/api/eval/baselines/route.ts +55 -0
- package/src/app/api/eval/gate/route.ts +36 -0
- package/src/cli/index.js +3 -0
- package/src/components/quality/quality-workspace.tsx +191 -3
- package/src/lib/server/eval/baseline.test.ts +111 -0
- package/src/lib/server/eval/baseline.ts +274 -0
- package/src/lib/server/eval/store.ts +47 -1
- package/src/lib/server/eval/types.ts +50 -0
- package/src/lib/server/session-tools/extension-creator.ts +2 -2
- package/src/lib/server/tasks/task-checkout.ts +1 -1
- package/src/types/extension.ts +3 -3
- package/electron-dist/main.js +0 -218
package/README.md
CHANGED
|
@@ -399,6 +399,16 @@ Operational docs: https://swarmclaw.ai/docs/observability
|
|
|
399
399
|
|
|
400
400
|
## Releases
|
|
401
401
|
|
|
402
|
+
### v1.9.7 Highlights
|
|
403
|
+
|
|
404
|
+
Bundled eval-gate release: approved baselines, regression checks, and Quality Center release gates for repeatable eval evidence.
|
|
405
|
+
|
|
406
|
+
- **Eval regression baselines.** Operators can snapshot the latest scenario or suite score as an approved baseline with minimum score and regression allowance settings.
|
|
407
|
+
- **Release gate API.** `/api/eval/gate` compares current eval evidence against thresholds and baselines, while `/api/eval/baselines` lists and updates approved baselines.
|
|
408
|
+
- **CLI gate checks.** `swarmclaw eval gate`, `swarmclaw eval baselines`, and `swarmclaw eval baseline-set` expose the same release-gate workflow from automation.
|
|
409
|
+
- **Quality Center gate panel.** Eval Lab now shows pass/warn/fail status, latest-run coverage, current score, baseline score, regression points, and actionable checks.
|
|
410
|
+
- **Public-source hygiene.** Generic implementation comments now describe SwarmClaw behavior without naming internal comparison sources.
|
|
411
|
+
|
|
402
412
|
### v1.9.6 Highlights
|
|
403
413
|
|
|
404
414
|
Bundled eval-environment release: validation preflights, deterministic eval workspaces, and clearer operator readiness before spending run budget.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@swarmclawai/swarmclaw",
|
|
3
|
-
"version": "1.9.6",
|
|
3
|
+
"version": "1.9.7",
|
|
4
4
|
"description": "Build and run autonomous AI agents with OpenClaw, Hermes, multiple model providers, orchestration, delegation, memory, skills, schedules, and chat connectors.",
|
|
5
5
|
"main": "electron-dist/main.js",
|
|
6
6
|
"license": "MIT",
|
|
@@ -87,7 +87,7 @@
|
|
|
87
87
|
"test:cli": "node --test src/cli/*.test.js bin/*.test.js scripts/electron-after-pack.test.mjs scripts/ensure-sandbox-browser-image.test.mjs scripts/postinstall.test.mjs scripts/run-next-build.test.mjs scripts/run-next-typegen.test.mjs",
|
|
88
88
|
"test:setup": "tsx --test src/app/api/setup/check-provider/route.test.ts src/lib/server/provider-model-discovery.test.ts src/components/auth/setup-wizard/utils.test.ts src/components/auth/setup-wizard/types.test.ts src/hooks/setup-done-detection.test.ts src/lib/setup-defaults.test.ts src/lib/server/storage-auth.test.ts src/lib/server/storage-auth-docker.test.ts",
|
|
89
89
|
"test:openclaw": "tsx --test src/lib/openclaw/openclaw-agent-id.test.ts src/lib/openclaw/openclaw-endpoint.test.ts src/lib/server/agents/agent-runtime-config.test.ts src/lib/server/build-llm.test.ts src/lib/server/connectors/connector-routing.test.ts src/lib/server/connectors/openclaw.test.ts src/lib/server/connectors/swarmdock.test.ts src/lib/server/gateway/protocol.test.ts src/lib/server/gateways/gateway-topology.test.ts src/lib/server/llm-response-cache.test.ts src/lib/server/mcp-conformance.test.ts src/lib/server/openclaw/agent-resolver.test.ts src/lib/server/openclaw/deploy.test.ts src/lib/server/openclaw/skills-normalize.test.ts src/lib/server/session-tools/openclaw-nodes.test.ts src/lib/server/session-tools/swarmdock.test.ts src/lib/server/tasks/task-quality-gate.test.ts src/lib/server/tasks/task-validation.test.ts src/lib/server/tool-capability-policy.test.ts src/lib/providers/openai.test.ts src/lib/providers/openclaw-exports.test.ts src/app/api/gateways/topology-route.test.ts src/app/api/openclaw/dashboard-url/route.test.ts",
|
|
90
|
-
"test:runtime": "tsx --test src/lib/a2a/agent-card.test.ts src/lib/strip-internal-metadata.test.ts src/lib/provider-sets.test.ts src/lib/providers/opencode-cli.test.ts src/lib/providers/cli-provider-metadata.test.ts src/lib/providers/cli-utils.test.ts src/lib/providers/generic-cli.test.ts src/lib/server/agents/delegation-advisory.test.ts src/lib/server/cli-provider-readiness.test.ts src/lib/server/provider-health.test.ts src/lib/server/mcp-gateway-runtime.test.ts src/lib/server/mcp-connection-pool.test.ts src/lib/server/knowledge-sources.test.ts src/lib/server/extension-managed-resources.test.ts src/lib/server/eval/environment-plan.test.ts src/lib/server/chat-execution/chat-execution-grounding.test.ts src/lib/server/chat-execution/chat-turn-preparation.test.ts src/lib/server/chat-execution/iteration-timers.test.ts src/lib/server/chat-execution/post-stream-finalization.test.ts src/lib/server/chat-execution/reasoning-tag-scrubber.test.ts src/lib/server/chats/clear-undo-snapshots.test.ts src/lib/server/connectors/email.test.ts src/lib/server/protocols/protocol-service.test.ts src/lib/server/runtime/run-ledger.test.ts src/lib/server/runtime/queue-retry-policy.test.ts src/lib/server/runs/run-brief.test.ts src/lib/server/operations/operation-pulse.test.ts src/lib/server/artifacts/artifact-resolver.test.ts src/lib/server/observability/otel-config.test.ts src/lib/server/safe-parse-body.test.ts src/lib/server/missions/mission-templates.test.ts src/lib/server/sharing/share-link-repository.test.ts src/lib/server/sharing/share-resolver.test.ts src/lib/server/tasks/task-execution-workspace.test.ts src/lib/server/tasks/task-service.test.ts src/lib/server/session-tools/execute.test.ts src/lib/server/session-tools/manage-tasks.test.ts src/lib/app/view-constants.test.ts src/lib/quality/quality-summary.test.ts src/app/api/approvals/route.test.ts src/app/api/agents/agents-route.test.ts src/app/api/tasks/tasks-route.test.ts src/app/api/tasks/task-workspace-route.test.ts 
src/app/api/chats/chat-route.test.ts src/app/api/chats/clear-route.test.ts src/app/api/chats/compact-route.test.ts src/app/api/chats/context-status-route.test.ts src/app/api/connectors/connector-doctor-route.test.ts src/app/api/extensions/managed-resources/route.test.ts src/app/api/healthz/route.test.ts src/app/api/logs/route.test.ts src/app/api/portability/export/route.test.ts src/app/api/portability/import/route.test.ts src/app/api/providers/[id]/route.test.ts src/app/api/tts/route.test.ts",
|
|
90
|
+
"test:runtime": "tsx --test src/lib/a2a/agent-card.test.ts src/lib/strip-internal-metadata.test.ts src/lib/provider-sets.test.ts src/lib/providers/opencode-cli.test.ts src/lib/providers/cli-provider-metadata.test.ts src/lib/providers/cli-utils.test.ts src/lib/providers/generic-cli.test.ts src/lib/server/agents/delegation-advisory.test.ts src/lib/server/cli-provider-readiness.test.ts src/lib/server/provider-health.test.ts src/lib/server/mcp-gateway-runtime.test.ts src/lib/server/mcp-connection-pool.test.ts src/lib/server/knowledge-sources.test.ts src/lib/server/extension-managed-resources.test.ts src/lib/server/eval/baseline.test.ts src/lib/server/eval/environment-plan.test.ts src/lib/server/chat-execution/chat-execution-grounding.test.ts src/lib/server/chat-execution/chat-turn-preparation.test.ts src/lib/server/chat-execution/iteration-timers.test.ts src/lib/server/chat-execution/post-stream-finalization.test.ts src/lib/server/chat-execution/reasoning-tag-scrubber.test.ts src/lib/server/chats/clear-undo-snapshots.test.ts src/lib/server/connectors/email.test.ts src/lib/server/protocols/protocol-service.test.ts src/lib/server/runtime/run-ledger.test.ts src/lib/server/runtime/queue-retry-policy.test.ts src/lib/server/runs/run-brief.test.ts src/lib/server/operations/operation-pulse.test.ts src/lib/server/artifacts/artifact-resolver.test.ts src/lib/server/observability/otel-config.test.ts src/lib/server/safe-parse-body.test.ts src/lib/server/missions/mission-templates.test.ts src/lib/server/sharing/share-link-repository.test.ts src/lib/server/sharing/share-resolver.test.ts src/lib/server/tasks/task-execution-workspace.test.ts src/lib/server/tasks/task-service.test.ts src/lib/server/session-tools/execute.test.ts src/lib/server/session-tools/manage-tasks.test.ts src/lib/app/view-constants.test.ts src/lib/quality/quality-summary.test.ts src/app/api/approvals/route.test.ts src/app/api/agents/agents-route.test.ts src/app/api/tasks/tasks-route.test.ts 
src/app/api/tasks/task-workspace-route.test.ts src/app/api/chats/chat-route.test.ts src/app/api/chats/clear-route.test.ts src/app/api/chats/compact-route.test.ts src/app/api/chats/context-status-route.test.ts src/app/api/connectors/connector-doctor-route.test.ts src/app/api/extensions/managed-resources/route.test.ts src/app/api/healthz/route.test.ts src/app/api/logs/route.test.ts src/app/api/portability/export/route.test.ts src/app/api/portability/import/route.test.ts src/app/api/providers/[id]/route.test.ts src/app/api/tts/route.test.ts",
|
|
91
91
|
"test:builder": "tsx --test src/features/protocols/builder/utils/nodes-to-template.test.ts src/features/protocols/builder/utils/template-to-nodes.test.ts src/features/protocols/builder/validators/dag-validator.test.ts",
|
|
92
92
|
"test:e2e": "node --import tsx scripts/browser-e2e-smoke.ts",
|
|
93
93
|
"test:mcp:conformance": "node --import tsx ./scripts/mcp-conformance-check.ts",
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import { NextResponse } from 'next/server'
|
|
2
|
+
import { z } from 'zod'
|
|
3
|
+
import { evaluateEvalGate, listEvalBaselinesForAgent, setEvalBaseline } from '@/lib/server/eval/baseline'
|
|
4
|
+
import { errorMessage } from '@/lib/shared-utils'
|
|
5
|
+
|
|
6
|
+
const BaselineSchema = z.object({
|
|
7
|
+
agentId: z.string().min(1),
|
|
8
|
+
scenarioId: z.string().min(1).nullable().optional(),
|
|
9
|
+
suite: z.string().min(1).nullable().optional(),
|
|
10
|
+
minPercent: z.number().min(0).max(100).nullable().optional(),
|
|
11
|
+
maxRegressionPoints: z.number().min(0).max(100).nullable().optional(),
|
|
12
|
+
label: z.string().max(160).nullable().optional(),
|
|
13
|
+
notes: z.string().max(1_000).nullable().optional(),
|
|
14
|
+
})
|
|
15
|
+
|
|
16
|
+
export async function GET(req: Request) {
|
|
17
|
+
try {
|
|
18
|
+
const { searchParams } = new URL(req.url)
|
|
19
|
+
const agentId = searchParams.get('agentId')
|
|
20
|
+
return NextResponse.json(listEvalBaselinesForAgent(agentId))
|
|
21
|
+
} catch (err: unknown) {
|
|
22
|
+
return NextResponse.json(
|
|
23
|
+
{ error: errorMessage(err) },
|
|
24
|
+
{ status: 500 },
|
|
25
|
+
)
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
export async function POST(req: Request) {
|
|
30
|
+
try {
|
|
31
|
+
const body: unknown = await req.json()
|
|
32
|
+
const parsed = BaselineSchema.safeParse(body)
|
|
33
|
+
if (!parsed.success) {
|
|
34
|
+
return NextResponse.json(
|
|
35
|
+
{ error: parsed.error.issues.map((issue) => issue.message).join(', ') },
|
|
36
|
+
{ status: 400 },
|
|
37
|
+
)
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
const baseline = setEvalBaseline(parsed.data)
|
|
41
|
+
const gate = evaluateEvalGate({
|
|
42
|
+
agentId: parsed.data.agentId,
|
|
43
|
+
scenarioId: parsed.data.scenarioId,
|
|
44
|
+
suite: parsed.data.suite,
|
|
45
|
+
minPercent: parsed.data.minPercent,
|
|
46
|
+
maxRegressionPoints: parsed.data.maxRegressionPoints,
|
|
47
|
+
})
|
|
48
|
+
return NextResponse.json({ baseline, gate })
|
|
49
|
+
} catch (err: unknown) {
|
|
50
|
+
return NextResponse.json(
|
|
51
|
+
{ error: errorMessage(err) },
|
|
52
|
+
{ status: 500 },
|
|
53
|
+
)
|
|
54
|
+
}
|
|
55
|
+
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { NextResponse } from 'next/server'
|
|
2
|
+
import { evaluateEvalGate } from '@/lib/server/eval/baseline'
|
|
3
|
+
import { errorMessage } from '@/lib/shared-utils'
|
|
4
|
+
|
|
5
|
+
function parseNumberParam(value: string | null): number | null {
|
|
6
|
+
if (value == null || value.trim() === '') return null
|
|
7
|
+
const parsed = Number(value)
|
|
8
|
+
return Number.isFinite(parsed) ? parsed : null
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
export async function GET(req: Request) {
|
|
12
|
+
try {
|
|
13
|
+
const { searchParams } = new URL(req.url)
|
|
14
|
+
const agentId = searchParams.get('agentId') || ''
|
|
15
|
+
if (!agentId) {
|
|
16
|
+
return NextResponse.json(
|
|
17
|
+
{ error: 'agentId is required' },
|
|
18
|
+
{ status: 400 },
|
|
19
|
+
)
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
const result = evaluateEvalGate({
|
|
23
|
+
agentId,
|
|
24
|
+
scenarioId: searchParams.get('scenarioId'),
|
|
25
|
+
suite: searchParams.get('suite'),
|
|
26
|
+
minPercent: parseNumberParam(searchParams.get('minPercent')),
|
|
27
|
+
maxRegressionPoints: parseNumberParam(searchParams.get('maxRegressionPoints')),
|
|
28
|
+
})
|
|
29
|
+
return NextResponse.json(result)
|
|
30
|
+
} catch (err: unknown) {
|
|
31
|
+
return NextResponse.json(
|
|
32
|
+
{ error: errorMessage(err) },
|
|
33
|
+
{ status: 500 },
|
|
34
|
+
)
|
|
35
|
+
}
|
|
36
|
+
}
|
package/src/cli/index.js
CHANGED
|
@@ -232,9 +232,12 @@ const COMMAND_GROUPS = [
|
|
|
232
232
|
cmd('suites', 'GET', '/eval/suites', 'List available eval suites (core, swe-bench-lite, gaia-l1, ...)'),
|
|
233
233
|
cmd('status', 'GET', '/eval/run', 'Get eval run status'),
|
|
234
234
|
cmd('environment', 'GET', '/eval/environments', 'Preview validation environment readiness for an eval'),
|
|
235
|
+
cmd('baselines', 'GET', '/eval/baselines', 'List eval regression baselines'),
|
|
236
|
+
cmd('gate', 'GET', '/eval/gate', 'Check the latest eval score against thresholds and baseline'),
|
|
235
237
|
cmd('run', 'POST', '/eval/run', 'Run an eval scenario against an agent', { expectsJsonBody: true }),
|
|
236
238
|
cmd('suite', 'POST', '/eval/suite', 'Run a full eval suite against an agent (pass { suite: "swe-bench-lite" } in body)', { expectsJsonBody: true }),
|
|
237
239
|
cmd('environment-prepare', 'POST', '/eval/environments', 'Prepare validation environment readiness for an eval', { expectsJsonBody: true }),
|
|
240
|
+
cmd('baseline-set', 'POST', '/eval/baselines', 'Set an eval regression baseline from latest completed runs', { expectsJsonBody: true }),
|
|
238
241
|
],
|
|
239
242
|
},
|
|
240
243
|
{
|
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
} from '@/lib/quality/quality-summary'
|
|
18
18
|
import { cn } from '@/lib/utils'
|
|
19
19
|
import { useAppStore } from '@/stores/use-app-store'
|
|
20
|
-
import type { EvalEnvironmentPlan, EvalRun, EvalSuiteResult } from '@/lib/server/eval/types'
|
|
20
|
+
import type { EvalEnvironmentPlan, EvalGateResult, EvalRun, EvalSuiteResult } from '@/lib/server/eval/types'
|
|
21
21
|
import type { Agent, ApprovalRequest, SessionRunRecord } from '@/types'
|
|
22
22
|
|
|
23
23
|
type QualityTab = 'overview' | 'evals' | 'approvals' | 'runs'
|
|
@@ -117,6 +117,18 @@ function checkClass(level: 'info' | 'warn' | 'error'): string {
|
|
|
117
117
|
return 'border-white/[0.06] bg-white/[0.025] text-text-3'
|
|
118
118
|
}
|
|
119
119
|
|
|
120
|
+
function gateStatusClass(status: EvalGateResult['status']): string {
|
|
121
|
+
if (status === 'pass') return 'border-emerald-500/25 bg-emerald-500/10 text-emerald-200'
|
|
122
|
+
if (status === 'warn') return 'border-amber-500/25 bg-amber-500/10 text-amber-200'
|
|
123
|
+
return 'border-rose-500/25 bg-rose-500/10 text-rose-200'
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
function gateCheckClass(status: EvalGateResult['status']): string {
|
|
127
|
+
if (status === 'fail') return 'border-rose-500/20 bg-rose-500/[0.05] text-rose-200'
|
|
128
|
+
if (status === 'warn') return 'border-amber-500/20 bg-amber-500/[0.05] text-amber-200'
|
|
129
|
+
return 'border-emerald-500/20 bg-emerald-500/[0.05] text-emerald-200'
|
|
130
|
+
}
|
|
131
|
+
|
|
120
132
|
function EvalEnvironmentPanel({ plan, loading, onRefresh }: {
|
|
121
133
|
plan: EvalEnvironmentPlan | null
|
|
122
134
|
loading: boolean
|
|
@@ -195,6 +207,115 @@ function EvalEnvironmentPanel({ plan, loading, onRefresh }: {
|
|
|
195
207
|
)
|
|
196
208
|
}
|
|
197
209
|
|
|
210
|
+
function EvalGatePanel({
|
|
211
|
+
gate,
|
|
212
|
+
loading,
|
|
213
|
+
busy,
|
|
214
|
+
scope,
|
|
215
|
+
onScopeChange,
|
|
216
|
+
onRefresh,
|
|
217
|
+
onSetBaseline,
|
|
218
|
+
}: {
|
|
219
|
+
gate: EvalGateResult | null
|
|
220
|
+
loading: boolean
|
|
221
|
+
busy: boolean
|
|
222
|
+
scope: 'scenario' | 'suite'
|
|
223
|
+
onScopeChange: (scope: 'scenario' | 'suite') => void
|
|
224
|
+
onRefresh: () => void
|
|
225
|
+
onSetBaseline: () => void
|
|
226
|
+
}) {
|
|
227
|
+
return (
|
|
228
|
+
<div className="rounded-[12px] border border-white/[0.06] bg-white/[0.025] px-3 py-3">
|
|
229
|
+
<div className="flex items-start justify-between gap-3">
|
|
230
|
+
<div>
|
|
231
|
+
<div className="text-[13px] font-800 text-text">Regression gate</div>
|
|
232
|
+
<p className="mt-1 text-[11px] leading-relaxed text-text-3/65">
|
|
233
|
+
Compare latest eval evidence against thresholds and an approved baseline.
|
|
234
|
+
</p>
|
|
235
|
+
</div>
|
|
236
|
+
<button
|
|
237
|
+
type="button"
|
|
238
|
+
onClick={onRefresh}
|
|
239
|
+
disabled={loading}
|
|
240
|
+
className="shrink-0 rounded-[8px] border border-white/[0.08] px-2 py-1 text-[10px] font-800 text-text-2 transition-colors hover:bg-white/[0.06] disabled:opacity-40"
|
|
241
|
+
>
|
|
242
|
+
{loading ? 'Checking' : 'Refresh'}
|
|
243
|
+
</button>
|
|
244
|
+
</div>
|
|
245
|
+
|
|
246
|
+
<div className="mt-3 flex rounded-[10px] border border-white/[0.06] bg-white/[0.025] p-1">
|
|
247
|
+
{(['scenario', 'suite'] as const).map((item) => (
|
|
248
|
+
<button
|
|
249
|
+
key={item}
|
|
250
|
+
type="button"
|
|
251
|
+
onClick={() => onScopeChange(item)}
|
|
252
|
+
className={cn(
|
|
253
|
+
'flex-1 rounded-[8px] px-2 py-1.5 text-[10px] font-800 uppercase tracking-[0.08em] transition-colors',
|
|
254
|
+
scope === item ? 'bg-white/[0.1] text-text' : 'text-text-3 hover:bg-white/[0.05]',
|
|
255
|
+
)}
|
|
256
|
+
>
|
|
257
|
+
{item}
|
|
258
|
+
</button>
|
|
259
|
+
))}
|
|
260
|
+
</div>
|
|
261
|
+
|
|
262
|
+
{!gate ? (
|
|
263
|
+
<div className="mt-3 text-[11px] text-text-3/60">{loading ? 'Checking gate...' : 'Run evals to build gate evidence.'}</div>
|
|
264
|
+
) : (
|
|
265
|
+
<div className="mt-3 flex flex-col gap-3">
|
|
266
|
+
<div className="flex flex-wrap items-center gap-2">
|
|
267
|
+
<span className={cn('rounded-full border px-2 py-1 text-[10px] font-800 uppercase tracking-[0.08em]', gateStatusClass(gate.status))}>
|
|
268
|
+
{gate.status}
|
|
269
|
+
</span>
|
|
270
|
+
<span className="rounded-full bg-white/[0.05] px-2 py-1 text-[10px] font-700 text-text-3">
|
|
271
|
+
{gate.scope.label}
|
|
272
|
+
</span>
|
|
273
|
+
<span className="rounded-full bg-white/[0.05] px-2 py-1 text-[10px] font-700 text-text-3">
|
|
274
|
+
{gate.latestRuns.length}/{gate.scope.scenarioIds.length} latest runs
|
|
275
|
+
</span>
|
|
276
|
+
</div>
|
|
277
|
+
|
|
278
|
+
<div className="grid grid-cols-3 gap-2">
|
|
279
|
+
<div className="rounded-[10px] border border-white/[0.06] bg-white/[0.02] px-2 py-2">
|
|
280
|
+
<div className="text-[9px] font-800 uppercase tracking-[0.08em] text-text-3/50">Current</div>
|
|
281
|
+
<div className="mt-1 text-[14px] font-800 text-text">{formatPercent(gate.currentPercent)}</div>
|
|
282
|
+
</div>
|
|
283
|
+
<div className="rounded-[10px] border border-white/[0.06] bg-white/[0.02] px-2 py-2">
|
|
284
|
+
<div className="text-[9px] font-800 uppercase tracking-[0.08em] text-text-3/50">Baseline</div>
|
|
285
|
+
<div className="mt-1 text-[14px] font-800 text-text">{gate.baseline ? `${gate.baseline.baselinePercent}%` : 'none'}</div>
|
|
286
|
+
</div>
|
|
287
|
+
<div className="rounded-[10px] border border-white/[0.06] bg-white/[0.02] px-2 py-2">
|
|
288
|
+
<div className="text-[9px] font-800 uppercase tracking-[0.08em] text-text-3/50">Regression</div>
|
|
289
|
+
<div className="mt-1 text-[14px] font-800 text-text">{gate.regressionPoints == null ? 'n/a' : `${gate.regressionPoints}pt`}</div>
|
|
290
|
+
</div>
|
|
291
|
+
</div>
|
|
292
|
+
|
|
293
|
+
<div className="flex flex-col gap-1.5">
|
|
294
|
+
{gate.checks.slice(0, 4).map((check) => (
|
|
295
|
+
<div key={`${check.code}:${check.message}`} className={cn('rounded-[9px] border px-2.5 py-2 text-[11px] leading-relaxed', gateCheckClass(check.status))}>
|
|
296
|
+
<span className="font-800 uppercase tracking-[0.08em]">{check.status}</span>
|
|
297
|
+
<span className="ml-2">{check.message}</span>
|
|
298
|
+
</div>
|
|
299
|
+
))}
|
|
300
|
+
{gate.checks.length > 4 && (
|
|
301
|
+
<div className="text-[10px] text-text-3/55">+{gate.checks.length - 4} more check{gate.checks.length - 4 === 1 ? '' : 's'}</div>
|
|
302
|
+
)}
|
|
303
|
+
</div>
|
|
304
|
+
|
|
305
|
+
<button
|
|
306
|
+
type="button"
|
|
307
|
+
onClick={onSetBaseline}
|
|
308
|
+
disabled={busy || gate.latestRuns.length === 0 || gate.checks.some((check) => check.code === 'missing_scope_runs')}
|
|
309
|
+
className="rounded-[9px] border border-white/[0.08] bg-white/[0.04] px-3 py-2 text-[11px] font-800 text-text-2 transition-colors hover:bg-white/[0.08] disabled:cursor-not-allowed disabled:opacity-40"
|
|
310
|
+
>
|
|
311
|
+
{busy ? 'Saving baseline' : gate.baseline ? 'Update baseline' : 'Set baseline'}
|
|
312
|
+
</button>
|
|
313
|
+
</div>
|
|
314
|
+
)}
|
|
315
|
+
</div>
|
|
316
|
+
)
|
|
317
|
+
}
|
|
318
|
+
|
|
198
319
|
export function QualityWorkspace() {
|
|
199
320
|
const router = useRouter()
|
|
200
321
|
const searchParams = useSearchParams()
|
|
@@ -219,6 +340,10 @@ export function QualityWorkspace() {
|
|
|
219
340
|
const [evalBusy, setEvalBusy] = useState<string | null>(null)
|
|
220
341
|
const [evalEnvironmentPlan, setEvalEnvironmentPlan] = useState<EvalEnvironmentPlan | null>(null)
|
|
221
342
|
const [evalEnvironmentLoading, setEvalEnvironmentLoading] = useState(false)
|
|
343
|
+
const [evalGate, setEvalGate] = useState<EvalGateResult | null>(null)
|
|
344
|
+
const [evalGateScope, setEvalGateScope] = useState<'scenario' | 'suite'>('scenario')
|
|
345
|
+
const [evalGateLoading, setEvalGateLoading] = useState(false)
|
|
346
|
+
const [evalBaselineBusy, setEvalBaselineBusy] = useState(false)
|
|
222
347
|
const [approvalBusy, setApprovalBusy] = useState<string | null>(null)
|
|
223
348
|
|
|
224
349
|
useEffect(() => {
|
|
@@ -283,6 +408,30 @@ export function QualityWorkspace() {
|
|
|
283
408
|
}
|
|
284
409
|
}, [selectedAgentId, selectedScenarioId, selectedSuite])
|
|
285
410
|
|
|
411
|
+
const loadEvalGate = useCallback(async () => {
|
|
412
|
+
if (!selectedAgentId) {
|
|
413
|
+
setEvalGate(null)
|
|
414
|
+
return
|
|
415
|
+
}
|
|
416
|
+
if (evalGateScope === 'scenario' && !selectedScenarioId) {
|
|
417
|
+
setEvalGate(null)
|
|
418
|
+
return
|
|
419
|
+
}
|
|
420
|
+
const params = new URLSearchParams({ agentId: selectedAgentId })
|
|
421
|
+
if (evalGateScope === 'scenario') params.set('scenarioId', selectedScenarioId)
|
|
422
|
+
else params.set('suite', selectedSuite)
|
|
423
|
+
setEvalGateLoading(true)
|
|
424
|
+
try {
|
|
425
|
+
const gate = await api<EvalGateResult>('GET', `/eval/gate?${params.toString()}`)
|
|
426
|
+
setEvalGate(gate)
|
|
427
|
+
} catch (err) {
|
|
428
|
+
setEvalGate(null)
|
|
429
|
+
toast.error(err instanceof Error ? err.message : 'Unable to check eval gate')
|
|
430
|
+
} finally {
|
|
431
|
+
setEvalGateLoading(false)
|
|
432
|
+
}
|
|
433
|
+
}, [evalGateScope, selectedAgentId, selectedScenarioId, selectedSuite])
|
|
434
|
+
|
|
286
435
|
useEffect(() => {
|
|
287
436
|
void loadQualityData()
|
|
288
437
|
}, [loadQualityData])
|
|
@@ -301,6 +450,10 @@ export function QualityWorkspace() {
|
|
|
301
450
|
void loadEvalEnvironmentPlan()
|
|
302
451
|
}, [loadEvalEnvironmentPlan])
|
|
303
452
|
|
|
453
|
+
useEffect(() => {
|
|
454
|
+
void loadEvalGate()
|
|
455
|
+
}, [loadEvalGate])
|
|
456
|
+
|
|
304
457
|
useEffect(() => {
|
|
305
458
|
if (!suites.some((suite) => suite.name === selectedSuite) && suites[0]) {
|
|
306
459
|
setSelectedSuite(suites[0].name)
|
|
@@ -341,12 +494,13 @@ export function QualityWorkspace() {
|
|
|
341
494
|
toast.success('Eval scenario completed')
|
|
342
495
|
await loadQualityData({ silent: true })
|
|
343
496
|
await loadEvalEnvironmentPlan()
|
|
497
|
+
await loadEvalGate()
|
|
344
498
|
} catch (err) {
|
|
345
499
|
toast.error(err instanceof Error ? err.message : 'Eval scenario failed')
|
|
346
500
|
} finally {
|
|
347
501
|
setEvalBusy(null)
|
|
348
502
|
}
|
|
349
|
-
}, [evalEnvironmentPlan, loadEvalEnvironmentPlan, loadQualityData, selectedAgentId, selectedScenarioId])
|
|
503
|
+
}, [evalEnvironmentPlan, loadEvalEnvironmentPlan, loadEvalGate, loadQualityData, selectedAgentId, selectedScenarioId])
|
|
350
504
|
|
|
351
505
|
const runSuite = useCallback(async (suiteName: string) => {
|
|
352
506
|
if (!selectedAgentId) {
|
|
@@ -369,12 +523,37 @@ export function QualityWorkspace() {
|
|
|
369
523
|
toast.success(`Suite completed at ${Math.round(result.percentage)}%`)
|
|
370
524
|
await loadQualityData({ silent: true })
|
|
371
525
|
await loadEvalEnvironmentPlan()
|
|
526
|
+
await loadEvalGate()
|
|
372
527
|
} catch (err) {
|
|
373
528
|
toast.error(err instanceof Error ? err.message : 'Eval suite failed')
|
|
374
529
|
} finally {
|
|
375
530
|
setEvalBusy(null)
|
|
376
531
|
}
|
|
377
|
-
}, [evalEnvironmentPlan, loadEvalEnvironmentPlan, loadQualityData, selectedAgentId])
|
|
532
|
+
}, [evalEnvironmentPlan, loadEvalEnvironmentPlan, loadEvalGate, loadQualityData, selectedAgentId])
|
|
533
|
+
|
|
534
|
+
const setEvalBaseline = useCallback(async () => {
|
|
535
|
+
if (!selectedAgentId) {
|
|
536
|
+
toast.error('Choose an agent first')
|
|
537
|
+
return
|
|
538
|
+
}
|
|
539
|
+
if (evalGateScope === 'scenario' && !selectedScenarioId) {
|
|
540
|
+
toast.error('Choose a scenario first')
|
|
541
|
+
return
|
|
542
|
+
}
|
|
543
|
+
setEvalBaselineBusy(true)
|
|
544
|
+
try {
|
|
545
|
+
const body = evalGateScope === 'scenario'
|
|
546
|
+
? { agentId: selectedAgentId, scenarioId: selectedScenarioId, minPercent: evalGate?.minPercent ?? 80, maxRegressionPoints: evalGate?.maxRegressionPoints ?? 5 }
|
|
547
|
+
: { agentId: selectedAgentId, suite: selectedSuite, minPercent: evalGate?.minPercent ?? 80, maxRegressionPoints: evalGate?.maxRegressionPoints ?? 5 }
|
|
548
|
+
const result = await api<{ gate: EvalGateResult }>('POST', '/eval/baselines', body)
|
|
549
|
+
setEvalGate(result.gate)
|
|
550
|
+
toast.success('Eval baseline saved')
|
|
551
|
+
} catch (err) {
|
|
552
|
+
toast.error(err instanceof Error ? err.message : 'Unable to save eval baseline')
|
|
553
|
+
} finally {
|
|
554
|
+
setEvalBaselineBusy(false)
|
|
555
|
+
}
|
|
556
|
+
}, [evalGate, evalGateScope, selectedAgentId, selectedScenarioId, selectedSuite])
|
|
378
557
|
|
|
379
558
|
const actOnApproval = useCallback(async (approval: ApprovalRequest, approved: boolean) => {
|
|
380
559
|
setApprovalBusy(approval.id)
|
|
@@ -600,6 +779,15 @@ export function QualityWorkspace() {
|
|
|
600
779
|
loading={evalEnvironmentLoading}
|
|
601
780
|
onRefresh={() => void loadEvalEnvironmentPlan({ refreshGateway: true })}
|
|
602
781
|
/>
|
|
782
|
+
<EvalGatePanel
|
|
783
|
+
gate={evalGate}
|
|
784
|
+
loading={evalGateLoading}
|
|
785
|
+
busy={evalBaselineBusy}
|
|
786
|
+
scope={evalGateScope}
|
|
787
|
+
onScopeChange={setEvalGateScope}
|
|
788
|
+
onRefresh={() => void loadEvalGate()}
|
|
789
|
+
onSetBaseline={() => void setEvalBaseline()}
|
|
790
|
+
/>
|
|
603
791
|
<button
|
|
604
792
|
type="button"
|
|
605
793
|
onClick={() => openMissionTemplate('release-candidate-qa')}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import assert from 'node:assert/strict'
|
|
2
|
+
import test from 'node:test'
|
|
3
|
+
|
|
4
|
+
import {
|
|
5
|
+
evaluateEvalGate,
|
|
6
|
+
setEvalBaseline,
|
|
7
|
+
} from './baseline'
|
|
8
|
+
import type { EvalBaseline, EvalRun } from './types'
|
|
9
|
+
|
|
10
|
+
function makeRun(overrides: Partial<EvalRun> = {}): EvalRun {
|
|
11
|
+
return {
|
|
12
|
+
id: 'run-1',
|
|
13
|
+
scenarioId: 'coding-prime',
|
|
14
|
+
agentId: 'agent-1',
|
|
15
|
+
status: 'completed',
|
|
16
|
+
startedAt: 1,
|
|
17
|
+
endedAt: 2,
|
|
18
|
+
score: 8,
|
|
19
|
+
maxScore: 10,
|
|
20
|
+
details: [],
|
|
21
|
+
...overrides,
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
function depsFor(runs: EvalRun[], baseline: EvalBaseline | null = null, saved: EvalBaseline[] = []) {
|
|
26
|
+
return {
|
|
27
|
+
now: () => 123,
|
|
28
|
+
listRunsByAgent: (agentId: string) => runs.filter((run) => run.agentId === agentId),
|
|
29
|
+
getBaselineForScope: () => baseline,
|
|
30
|
+
saveBaseline: (next: EvalBaseline) => { saved.push(next) },
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
test('setEvalBaseline snapshots the latest scenario score and gate defaults', () => {
|
|
35
|
+
const saved: EvalBaseline[] = []
|
|
36
|
+
const baseline = setEvalBaseline(
|
|
37
|
+
{
|
|
38
|
+
agentId: 'agent-1',
|
|
39
|
+
scenarioId: 'coding-prime',
|
|
40
|
+
minPercent: 75,
|
|
41
|
+
maxRegressionPoints: 3,
|
|
42
|
+
label: 'Release candidate',
|
|
43
|
+
},
|
|
44
|
+
depsFor([
|
|
45
|
+
makeRun({ id: 'older', score: 4, startedAt: 1, endedAt: 2 }),
|
|
46
|
+
makeRun({ id: 'latest', score: 8, startedAt: 5, endedAt: 6 }),
|
|
47
|
+
], null, saved),
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
assert.equal(saved.length, 1)
|
|
51
|
+
assert.equal(baseline.scope.type, 'scenario')
|
|
52
|
+
assert.equal(baseline.scope.id, 'coding-prime')
|
|
53
|
+
assert.equal(baseline.baselinePercent, 80)
|
|
54
|
+
assert.equal(baseline.minPercent, 75)
|
|
55
|
+
assert.equal(baseline.maxRegressionPoints, 3)
|
|
56
|
+
assert.deepEqual(baseline.runIds, ['latest'])
|
|
57
|
+
})
|
|
58
|
+
|
|
59
|
+
test('evaluateEvalGate warns until a baseline is approved', () => {
|
|
60
|
+
const gate = evaluateEvalGate(
|
|
61
|
+
{ agentId: 'agent-1', scenarioId: 'coding-prime', minPercent: 70 },
|
|
62
|
+
depsFor([makeRun({ score: 8, maxScore: 10 })]),
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
assert.equal(gate.currentPercent, 80)
|
|
66
|
+
assert.equal(gate.status, 'warn')
|
|
67
|
+
assert.ok(gate.checks.some((check) => check.code === 'baseline_missing' && check.status === 'warn'))
|
|
68
|
+
})
|
|
69
|
+
|
|
70
|
+
test('evaluateEvalGate fails when regression exceeds the baseline allowance', () => {
|
|
71
|
+
const baseline = setEvalBaseline(
|
|
72
|
+
{ agentId: 'agent-1', scenarioId: 'coding-prime', minPercent: 70, maxRegressionPoints: 2 },
|
|
73
|
+
depsFor([makeRun({ id: 'baseline', score: 9, maxScore: 10 })]),
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
const gate = evaluateEvalGate(
|
|
77
|
+
{ agentId: 'agent-1', scenarioId: 'coding-prime' },
|
|
78
|
+
depsFor([makeRun({ id: 'current', score: 6, maxScore: 10, startedAt: 10, endedAt: 11 })], baseline),
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
assert.equal(gate.currentPercent, 60)
|
|
82
|
+
assert.equal(gate.regressionPoints, 30)
|
|
83
|
+
assert.equal(gate.status, 'fail')
|
|
84
|
+
assert.ok(gate.checks.some((check) => check.code === 'regression_limit_exceeded'))
|
|
85
|
+
})
|
|
86
|
+
|
|
87
|
+
test('evaluateEvalGate passes when score and regression checks pass', () => {
  // Baseline and current run both score 8/10, so there is no regression.
  const baselineRun = makeRun({ id: 'baseline', score: 8, maxScore: 10 })
  const approved = setEvalBaseline(
    { agentId: 'agent-1', scenarioId: 'coding-prime', minPercent: 70, maxRegressionPoints: 5 },
    depsFor([baselineRun]),
  )

  const currentRun = makeRun({ id: 'current', score: 8, maxScore: 10, startedAt: 10, endedAt: 11 })
  const result = evaluateEvalGate(
    { agentId: 'agent-1', scenarioId: 'coding-prime' },
    depsFor([currentRun], approved),
  )

  assert.equal(result.status, 'pass')
  assert.equal(result.regressionPoints, 0)
  const checkCodes = result.checks.map((check) => check.code)
  assert.ok(checkCodes.includes('score_threshold_met'))
})
|
|
102
|
+
|
|
103
|
+
test('suite gates require latest runs for every scenario in scope before baselining', () => {
  // Only one scenario has a run, while the gate scope is the whole 'core' suite.
  const baselineWithPartialCoverage = () => setEvalBaseline(
    { agentId: 'agent-1', suite: 'core' },
    depsFor([makeRun({ scenarioId: 'coding-prime' })]),
  )

  assert.throws(
    baselineWithPartialCoverage,
    /Baseline requires latest runs for every scenario in scope/,
  )
})
|
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
import {
|
|
2
|
+
getEvalBaselineForScope,
|
|
3
|
+
listEvalBaselines,
|
|
4
|
+
listEvalRunsByAgent,
|
|
5
|
+
saveEvalBaseline,
|
|
6
|
+
} from './store'
|
|
7
|
+
import { getScenario, getSuiteScenarios } from './scenarios'
|
|
8
|
+
import type {
|
|
9
|
+
EvalBaseline,
|
|
10
|
+
EvalGateCheck,
|
|
11
|
+
EvalGateResult,
|
|
12
|
+
EvalGateScope,
|
|
13
|
+
EvalGateScopeType,
|
|
14
|
+
EvalRun,
|
|
15
|
+
} from './types'
|
|
16
|
+
|
|
17
|
+
/** Minimum passing percent used when neither the caller nor a stored baseline provides one. */
const DEFAULT_MIN_PERCENT = 80
/** Allowed drop below the baseline percent (in points) when no override or stored value exists. */
const DEFAULT_MAX_REGRESSION_POINTS = 5
/** Number of runs requested from the run store for each baseline/gate evaluation. */
const MAX_LOOKBACK_RUNS = 1000
|
|
20
|
+
|
|
21
|
+
/**
 * Request describing which eval evidence a gate should judge.
 *
 * Scope selection is performed by resolveEvalGateScope: a non-blank
 * `scenarioId` selects a single scenario; otherwise `suite` is used and
 * falls back to 'core' when blank or absent.
 */
export interface EvalGateInput {
  /** Agent whose runs are evaluated. Blank (whitespace-only) values are rejected. */
  agentId: string
  /** Scenario to gate on; takes precedence over `suite` when non-blank. */
  scenarioId?: string | null
  /** Suite to gate on when no scenario is given. */
  suite?: string | null
  /** Minimum passing percent. Missing or non-finite values use a fallback. */
  minPercent?: number | null
  /** Allowed drop below the baseline percent, in points. Missing or non-finite values use a fallback. */
  maxRegressionPoints?: number | null
}
|
|
28
|
+
|
|
29
|
+
/**
 * Gate input extended with the descriptive fields stored on a baseline.
 *
 * setEvalBaseline trims both fields and keeps the previously stored value
 * when the trimmed input is empty.
 */
export interface SetEvalBaselineInput extends EvalGateInput {
  /** Short operator-facing name for the baseline. */
  label?: string | null
  /** Free-form notes stored alongside the baseline. */
  notes?: string | null
}
|
|
33
|
+
|
|
34
|
+
/**
 * Injectable collaborators for the gate functions.
 *
 * Every member is optional; when one is omitted the gate functions fall back
 * to Date.now() or to the matching function imported from './store'.
 */
interface EvalGateDeps {
  /** Clock returning the current timestamp. */
  now?: () => number
  /** Returns up to `limit` runs recorded for the agent. */
  listRunsByAgent?: (agentId: string, limit: number) => EvalRun[]
  /** Looks up the stored baseline for an agent and scope, or null when none exists. */
  getBaselineForScope?: (agentId: string, scopeType: EvalGateScopeType, scopeId: string) => EvalBaseline | null
  /** Persists a baseline. */
  saveBaseline?: (baseline: EvalBaseline) => void
  /** Lists stored baselines, optionally filtered by agent and capped by `limit`. */
  listBaselines?: (filters?: { agentId?: string; limit?: number }) => EvalBaseline[]
}
|
|
41
|
+
|
|
42
|
+
/** Combined score figures for the runs selected for one gate scope. */
interface EvalAggregate {
  /** The runs that were aggregated. */
  runs: EvalRun[]
  /** Scenario ids in the scope that have no run among `runs`. */
  missingScenarioIds: string[]
  /** Sum of run scores; scenarios without a run contribute 0. */
  score: number
  /** Sum of maximum scores across every scenario in the scope. */
  maxScore: number
  /** `score` as a rounded percentage of `maxScore`, or null when it cannot be computed. */
  percent: number | null
}
|
|
49
|
+
|
|
50
|
+
/**
 * Coerces an optional percent into an integer within [0, 100].
 *
 * @param value Candidate percent; may be null, undefined, NaN or infinite.
 * @param fallback Returned unchanged (not rounded or clamped) when `value` is unusable.
 * @returns The rounded, clamped percent, or `fallback`.
 */
function normalizePercent(value: number | null | undefined, fallback: number): number {
  if (value == null || !Number.isFinite(value)) {
    return fallback
  }
  const rounded = Math.round(value)
  return Math.min(Math.max(rounded, 0), 100)
}
|
|
54
|
+
|
|
55
|
+
/**
 * Coerces an optional regression allowance into a non-negative integer.
 *
 * @param value Candidate allowance in percentage points; may be null, undefined, NaN or infinite.
 * @param fallback Returned unchanged when `value` is unusable.
 * @returns The rounded allowance with negatives raised to 0, or `fallback`.
 */
function normalizeRegressionPoints(value: number | null | undefined, fallback: number): number {
  if (value == null || !Number.isFinite(value)) {
    return fallback
  }
  const rounded = Math.round(value)
  return Math.max(rounded, 0)
}
|
|
59
|
+
|
|
60
|
+
/**
 * Expresses a score as a whole-number percentage of its maximum.
 *
 * @param score Achieved score.
 * @param maxScore Highest achievable score.
 * @returns The rounded percentage, or null when either input is not finite
 *   or `maxScore` is not positive.
 */
function scorePercent(score: number, maxScore: number): number | null {
  const computable = Number.isFinite(score) && Number.isFinite(maxScore) && maxScore > 0
  if (!computable) {
    return null
  }
  const ratio = score / maxScore
  return Math.round(ratio * 100)
}
|
|
64
|
+
|
|
65
|
+
/**
 * Computes the highest achievable score for a scenario from its definition.
 *
 * @param scenarioId Id passed to getScenario.
 * @returns The sum of the scenario's scoring-criterion weights, or 0 when
 *   getScenario finds no scenario for the id.
 */
function maxScoreForScenario(scenarioId: string): number {
  const scenario = getScenario(scenarioId)
  if (scenario == null) {
    return 0
  }
  let total = 0
  for (const criterion of scenario.scoringCriteria) {
    total = total + criterion.weight
  }
  return total
}
|
|
69
|
+
|
|
70
|
+
/**
 * Turns a scenario/suite selection into a concrete gate scope.
 *
 * A non-blank `scenarioId` wins and yields a single-scenario scope labelled
 * with the scenario's name. Otherwise the suite is used (defaulting to
 * 'core') and the scope lists every scenario returned by getSuiteScenarios.
 *
 * @param input Selection fields of a gate request.
 * @returns The resolved scope.
 * @throws Error when the scenario is unknown, or the suite has no scenarios.
 */
export function resolveEvalGateScope(input: Pick<EvalGateInput, 'scenarioId' | 'suite'>): EvalGateScope {
  const requestedScenarioId = input.scenarioId?.trim()

  if (!requestedScenarioId) {
    const suiteId = input.suite?.trim() || 'core'
    const members = getSuiteScenarios(suiteId)
    if (members.length === 0) throw new Error(`Unknown or empty eval suite: ${suiteId}`)

    const memberIds: string[] = []
    for (const member of members) memberIds.push(member.id)
    return { type: 'suite', id: suiteId, label: suiteId, scenarioIds: memberIds }
  }

  const match = getScenario(requestedScenarioId)
  if (!match) throw new Error(`Unknown eval scenario: ${requestedScenarioId}`)
  return { type: 'scenario', id: match.id, label: match.name, scenarioIds: [match.id] }
}
|
|
93
|
+
|
|
94
|
+
/**
 * Builds the storage id for the baseline of an agent and scope.
 *
 * @param agentId Agent the baseline belongs to.
 * @param scope Scope whose `type` and `id` are embedded in the id.
 * @returns A colon-separated id prefixed with 'eval-baseline'.
 */
export function evalBaselineId(agentId: string, scope: EvalGateScope): string {
  const { type: scopeType, id: scopeId } = scope
  return `eval-baseline:${agentId}:${scopeType}:${scopeId}`
}
|
|
97
|
+
|
|
98
|
+
/**
 * Picks the most recent finished run for each scenario in a scope.
 *
 * Runs outside the scope and runs whose status is 'pending' or 'running' are
 * ignored. Recency is compared on `endedAt`, falling back to `startedAt`;
 * on a tie the run seen first in `runs` is kept.
 *
 * @param runs Candidate runs in any order.
 * @param scope Scope whose `scenarioIds` define membership and output order.
 * @returns The selected runs ordered like `scope.scenarioIds`; scenarios
 *   without a finished run are omitted.
 */
function latestRunsForScope(runs: EvalRun[], scope: EvalGateScope): EvalRun[] {
  const inScope = new Set(scope.scenarioIds)
  const finishedAt = (run: EvalRun) => run.endedAt ?? run.startedAt
  const newestByScenario = new Map<string, EvalRun>()

  runs.forEach((run) => {
    const unfinished = run.status === 'pending' || run.status === 'running'
    if (unfinished || !inScope.has(run.scenarioId)) return

    const current = newestByScenario.get(run.scenarioId)
    // Only a strictly newer run replaces the one already recorded.
    if (current && !(finishedAt(run) > finishedAt(current))) return
    newestByScenario.set(run.scenarioId, run)
  })

  const selected: EvalRun[] = []
  for (const scenarioId of scope.scenarioIds) {
    const run = newestByScenario.get(scenarioId)
    if (run) selected.push(run)
  }
  return selected
}
|
|
115
|
+
|
|
116
|
+
/**
 * Combines the selected runs of a scope into total score figures.
 *
 * Every scenario in the scope contributes to the maximum score: the run's
 * own `maxScore` when it is a finite number, otherwise the maximum derived
 * from the scenario definition. Scenarios without a run add 0 to the score
 * and are reported as missing.
 *
 * @param scope Scope being aggregated.
 * @param runs Runs to aggregate; when several share a scenario the last one wins.
 * @returns Totals, the rounded percent and the ids of scenarios lacking a run.
 */
function aggregateRuns(scope: EvalGateScope, runs: EvalRun[]): EvalAggregate {
  const runByScenario = new Map<string, EvalRun>()
  for (const run of runs) runByScenario.set(run.scenarioId, run)

  const missingScenarioIds: string[] = []
  let score = 0
  let maxScore = 0

  for (const scenarioId of scope.scenarioIds) {
    if (!runByScenario.has(scenarioId)) missingScenarioIds.push(scenarioId)

    const run = runByScenario.get(scenarioId)
    score = score + (run?.score ?? 0)

    const runMaxScore = run?.maxScore
    const usableRunMax = runMaxScore != null && Number.isFinite(runMaxScore)
    maxScore = maxScore + (usableRunMax ? runMaxScore : maxScoreForScenario(scenarioId))
  }

  const percent = scorePercent(score, maxScore)
  return { runs, missingScenarioIds, score, maxScore, percent }
}
|
|
132
|
+
|
|
133
|
+
/**
 * Reduces individual check outcomes to one gate status.
 *
 * @param checks Checks recorded for a gate evaluation.
 * @returns 'fail' if any check failed, else 'warn' if any check warned, else 'pass'
 *   (including when `checks` is empty).
 */
function statusFromChecks(checks: EvalGateCheck[]): EvalGateResult['status'] {
  let sawWarning = false
  for (const { status } of checks) {
    if (status === 'fail') return 'fail'
    if (status === 'warn') sawWarning = true
  }
  return sawWarning ? 'warn' : 'pass'
}
|
|
138
|
+
|
|
139
|
+
/**
 * Lists stored baselines, optionally restricted to one agent.
 *
 * @param agentId Agent to filter by; an empty, null or undefined value lists all agents.
 * @param deps Optional `listBaselines` replacement for the store accessor.
 * @returns The baselines returned by the accessor when asked for at most 200 entries.
 */
export function listEvalBaselinesForAgent(agentId?: string | null, deps: EvalGateDeps = {}): EvalBaseline[] {
  const fetchBaselines = deps.listBaselines ? deps.listBaselines : listEvalBaselines
  const filters = { agentId: agentId ? agentId : undefined, limit: 200 }
  return fetchBaselines(filters)
}
|
|
143
|
+
|
|
144
|
+
/**
 * Snapshots the latest finished runs of a scope as the approved baseline.
 *
 * The baseline reuses the id and creation time of an existing baseline for
 * the same agent and scope, so repeated calls update it in place. When
 * `minPercent` is not supplied it defaults to the percent being snapshotted;
 * when `maxRegressionPoints` is not supplied the previously stored allowance
 * (or the module default) is kept.
 *
 * @param input Scope selection, thresholds and optional label/notes.
 * @param deps Optional replacements for the clock and store accessors.
 * @returns The baseline that was saved.
 * @throws Error when `agentId` is blank, the scope cannot be resolved, no
 *   finished run exists in scope, or any scenario in scope lacks a run.
 */
export function setEvalBaseline(input: SetEvalBaselineInput, deps: EvalGateDeps = {}): EvalBaseline {
  if (input.agentId.trim().length === 0) throw new Error('agentId is required')

  const timestamp = deps.now?.() ?? Date.now()
  const scope = resolveEvalGateScope(input)

  const loadRuns = deps.listRunsByAgent || listEvalRunsByAgent
  const latestRuns = latestRunsForScope(loadRuns(input.agentId, MAX_LOOKBACK_RUNS), scope)
  const { runs, missingScenarioIds, score, maxScore, percent } = aggregateRuns(scope, latestRuns)

  if (runs.length === 0) {
    throw new Error('Run the selected eval before setting a baseline.')
  }
  if (missingScenarioIds.length > 0) {
    throw new Error(`Baseline requires latest runs for every scenario in scope. Missing: ${missingScenarioIds.join(', ')}`)
  }

  const lookupBaseline = deps.getBaselineForScope || getEvalBaselineForScope
  const previous = lookupBaseline(input.agentId, scope.type, scope.id)

  const minPercent = normalizePercent(input.minPercent, percent ?? DEFAULT_MIN_PERCENT)
  const maxRegressionPoints = normalizeRegressionPoints(
    input.maxRegressionPoints,
    previous?.maxRegressionPoints ?? DEFAULT_MAX_REGRESSION_POINTS,
  )

  const baseline: EvalBaseline = {
    id: previous?.id || evalBaselineId(input.agentId, scope),
    agentId: input.agentId,
    scope,
    baselineScore: score,
    baselineMaxScore: maxScore,
    baselinePercent: percent ?? 0,
    minPercent,
    maxRegressionPoints,
    runIds: runs.map((run) => run.id),
    label: input.label?.trim() || previous?.label || null,
    notes: input.notes?.trim() || previous?.notes || null,
    createdAt: previous?.createdAt || timestamp,
    updatedAt: timestamp,
  }

  const persist = deps.saveBaseline || saveEvalBaseline
  persist(baseline)
  return baseline
}
|
|
181
|
+
|
|
182
|
+
/**
 * Evaluates the release gate for an agent's latest eval evidence.
 *
 * Resolves the scenario/suite scope, selects the most recent finished run
 * per scenario, aggregates the score and records one check per rule:
 * absent runs, scenarios without a run, failed runs, the minimum-percent
 * threshold and regression against the approved baseline (a missing
 * baseline is a warning). The overall status is 'fail' if any check failed,
 * otherwise 'warn' if any check warned, otherwise 'pass'.
 *
 * @param input Scope selection plus optional threshold overrides. An
 *   override wins over the stored baseline's value, which wins over the
 *   module default.
 * @param deps Optional replacements for the clock and store accessors.
 * @returns The gate result: status, score figures, thresholds used and checks.
 * @throws Error when `agentId` is blank or the scope cannot be resolved.
 */
export function evaluateEvalGate(input: EvalGateInput, deps: EvalGateDeps = {}): EvalGateResult {
  if (!input.agentId.trim()) throw new Error('agentId is required')

  const generatedAt = deps.now?.() ?? Date.now()
  const scope = resolveEvalGateScope(input)
  const baseline = (deps.getBaselineForScope || getEvalBaselineForScope)(input.agentId, scope.type, scope.id)
  const runs = latestRunsForScope(
    (deps.listRunsByAgent || listEvalRunsByAgent)(input.agentId, MAX_LOOKBACK_RUNS),
    scope,
  )
  const aggregate = aggregateRuns(scope, runs)
  const minPercent = normalizePercent(input.minPercent, baseline?.minPercent ?? DEFAULT_MIN_PERCENT)
  const maxRegressionPoints = normalizeRegressionPoints(input.maxRegressionPoints, baseline?.maxRegressionPoints ?? DEFAULT_MAX_REGRESSION_POINTS)
  // Improvements over the baseline count as zero regression, never negative.
  const regressionPoints = baseline && aggregate.percent != null
    ? Math.max(0, baseline.baselinePercent - aggregate.percent)
    : null

  const checks: EvalGateCheck[] = []
  if (aggregate.runs.length === 0) {
    checks.push({
      code: 'no_eval_runs',
      status: 'fail',
      message: 'No completed eval runs are available for this gate.',
    })
  }
  const missingCount = aggregate.missingScenarioIds.length
  if (missingCount > 0) {
    checks.push({
      code: 'missing_scope_runs',
      status: 'fail',
      // Subject and verb agree with the count ("1 scenario has", "2 scenarios have").
      message: `${missingCount} ${missingCount === 1 ? 'scenario has' : 'scenarios have'} no latest run in this gate.`,
      detail: aggregate.missingScenarioIds.join(', '),
    })
  }
  if (aggregate.runs.some((run) => run.status === 'failed')) {
    checks.push({
      code: 'failed_eval_run',
      status: 'fail',
      message: 'At least one latest eval run failed.',
    })
  }
  if (aggregate.percent == null || aggregate.percent < minPercent) {
    checks.push({
      code: 'score_below_threshold',
      status: 'fail',
      message: `Current score is below the ${minPercent}% gate.`,
      detail: aggregate.percent == null ? 'n/a' : `${aggregate.percent}%`,
    })
  } else {
    checks.push({
      code: 'score_threshold_met',
      status: 'pass',
      message: `Current score meets the ${minPercent}% gate.`,
      detail: `${aggregate.percent}%`,
    })
  }
  if (!baseline) {
    checks.push({
      code: 'baseline_missing',
      status: 'warn',
      message: 'No approved baseline is set for this gate.',
    })
  } else if (regressionPoints != null && regressionPoints > maxRegressionPoints) {
    checks.push({
      code: 'regression_limit_exceeded',
      status: 'fail',
      message: `Regression exceeds the ${maxRegressionPoints} point allowance.`,
      // A 1-point regression is possible when the allowance is 0, so pluralise
      // the same way as the within-limit branch below.
      detail: `${regressionPoints} point${regressionPoints === 1 ? '' : 's'} below baseline`,
    })
  } else if (regressionPoints != null) {
    checks.push({
      code: 'regression_within_limit',
      status: 'pass',
      message: `Regression is within the ${maxRegressionPoints} point allowance.`,
      detail: `${regressionPoints} point${regressionPoints === 1 ? '' : 's'} below baseline`,
    })
  }

  return {
    agentId: input.agentId,
    scope,
    status: statusFromChecks(checks),
    generatedAt,
    baseline,
    latestRuns: aggregate.runs,
    currentScore: aggregate.score,
    currentMaxScore: aggregate.maxScore,
    currentPercent: aggregate.percent,
    regressionPoints,
    minPercent,
    maxRegressionPoints,
    checks,
  }
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import Database from 'better-sqlite3'
|
|
2
2
|
import path from 'path'
|
|
3
|
-
import type { EvalRun } from './types'
|
|
3
|
+
import type { EvalBaseline, EvalRun } from './types'
|
|
4
4
|
import { DATA_DIR } from '../data-dir'
|
|
5
5
|
|
|
6
6
|
const DB_PATH = path.join(DATA_DIR, 'eval-runs.db')
|
|
@@ -15,6 +15,15 @@ function getDb(): Database.Database {
|
|
|
15
15
|
id TEXT PRIMARY KEY,
|
|
16
16
|
data TEXT NOT NULL
|
|
17
17
|
)`)
|
|
18
|
+
db.exec(`CREATE TABLE IF NOT EXISTS eval_baselines (
|
|
19
|
+
id TEXT PRIMARY KEY,
|
|
20
|
+
agent_id TEXT NOT NULL,
|
|
21
|
+
scope_type TEXT NOT NULL,
|
|
22
|
+
scope_id TEXT NOT NULL,
|
|
23
|
+
data TEXT NOT NULL,
|
|
24
|
+
updated_at INTEGER NOT NULL
|
|
25
|
+
)`)
|
|
26
|
+
db.exec('CREATE INDEX IF NOT EXISTS idx_eval_baselines_agent ON eval_baselines(agent_id, scope_type, scope_id)')
|
|
18
27
|
}
|
|
19
28
|
return db
|
|
20
29
|
}
|
|
@@ -36,3 +45,40 @@ export function listEvalRuns(limit = 50): EvalRun[] {
|
|
|
36
45
|
/**
 * Returns up to `limit` runs belonging to one agent.
 *
 * NOTE(review): only the `limit * 2` runs returned by listEvalRuns are
 * scanned, across all agents. If other agents account for most of those
 * rows, this agent's runs can be under-reported or missed entirely. Confirm
 * against listEvalRuns and consider filtering by agent in the query.
 *
 * @param agentId Agent whose runs are wanted.
 * @param limit Maximum number of runs to return (default 50).
 */
export function listEvalRunsByAgent(agentId: string, limit = 50): EvalRun[] {
  const candidates = listEvalRuns(limit * 2)
  const ownRuns: EvalRun[] = []
  for (const run of candidates) {
    if (run.agentId === agentId) ownRuns.push(run)
  }
  return ownRuns.slice(0, limit)
}
|
|
48
|
+
|
|
49
|
+
/**
 * Inserts a baseline row, replacing any existing row with the same id.
 *
 * The full baseline is stored as JSON in `data`; agent, scope and update
 * time are also written to their own columns so they can be queried.
 *
 * @param baseline Baseline to persist.
 */
export function saveEvalBaseline(baseline: EvalBaseline): void {
  const { id, agentId, scope, updatedAt } = baseline
  const upsert = getDb().prepare(`
    INSERT OR REPLACE INTO eval_baselines (id, agent_id, scope_type, scope_id, data, updated_at)
    VALUES (?, ?, ?, ?, ?, ?)
  `)
  upsert.run(id, agentId, scope.type, scope.id, JSON.stringify(baseline), updatedAt)
}
|
|
62
|
+
|
|
63
|
+
/**
 * Loads a baseline by its id.
 *
 * @param id Baseline id (primary key of `eval_baselines`).
 * @returns The parsed baseline, or null when no row has that id.
 */
export function getEvalBaseline(id: string): EvalBaseline | null {
  const select = getDb().prepare('SELECT data FROM eval_baselines WHERE id = ?')
  const row = select.get(id) as { data: string } | undefined
  if (!row) return null
  return JSON.parse(row.data) as EvalBaseline
}
|
|
67
|
+
|
|
68
|
+
/**
 * Loads the most recently updated baseline for an agent and scope.
 *
 * @param agentId Agent the baseline belongs to.
 * @param scopeType Scope kind ('scenario' or 'suite').
 * @param scopeId Scenario or suite id.
 * @returns The parsed baseline, or null when none is stored for that scope.
 */
export function getEvalBaselineForScope(agentId: string, scopeType: EvalBaseline['scope']['type'], scopeId: string): EvalBaseline | null {
  const select = getDb().prepare(`
    SELECT data FROM eval_baselines
    WHERE agent_id = ? AND scope_type = ? AND scope_id = ?
    ORDER BY updated_at DESC
    LIMIT 1
  `)
  const row = select.get(agentId, scopeType, scopeId) as { data: string } | undefined
  if (!row) return null
  return JSON.parse(row.data) as EvalBaseline
}
|
|
77
|
+
|
|
78
|
+
/**
 * Lists stored baselines, most recently updated first.
 *
 * @param filters Optional `agentId` to restrict the listing, and `limit`
 *   (default 100), which is clamped to the range 1..500.
 * @returns The parsed baselines.
 */
export function listEvalBaselines(filters: { agentId?: string; limit?: number } = {}): EvalBaseline[] {
  const cappedLimit = Math.max(1, Math.min(filters.limit ?? 100, 500))
  const db = getDb()

  let rows: { data: string }[]
  if (filters.agentId) {
    rows = db
      .prepare('SELECT data FROM eval_baselines WHERE agent_id = ? ORDER BY updated_at DESC LIMIT ?')
      .all(filters.agentId, cappedLimit) as { data: string }[]
  } else {
    rows = db
      .prepare('SELECT data FROM eval_baselines ORDER BY updated_at DESC LIMIT ?')
      .all(cappedLimit) as { data: string }[]
  }

  const baselines: EvalBaseline[] = []
  for (const row of rows) baselines.push(JSON.parse(row.data) as EvalBaseline)
  return baselines
}
|
|
@@ -105,3 +105,53 @@ export interface EvalSuiteResult {
|
|
|
105
105
|
runs: EvalRun[]
|
|
106
106
|
completedAt: number
|
|
107
107
|
}
|
|
108
|
+
|
|
109
|
+
/** Kind of evidence a gate covers: a single scenario or a whole suite of scenarios. */
export type EvalGateScopeType = 'scenario' | 'suite'
|
|
110
|
+
|
|
111
|
+
/** Resolved set of scenarios that a gate or baseline applies to. */
export interface EvalGateScope {
  /** Whether the scope is one scenario or a suite. */
  type: EvalGateScopeType
  /** Scenario id or suite name, depending on `type`. */
  id: string
  /** Display label for the scope. */
  label: string
  /** Ids of every scenario covered by the scope. */
  scenarioIds: string[]
}
|
|
117
|
+
|
|
118
|
+
/** Approved score snapshot that later gate evaluations are compared against. */
export interface EvalBaseline {
  /** Storage id of the baseline. */
  id: string
  /** Agent the baseline was recorded for. */
  agentId: string
  /** Scenario or suite scope the baseline covers. */
  scope: EvalGateScope
  /** Total score of the snapshotted runs. */
  baselineScore: number
  /** Total achievable score for the scope at snapshot time. */
  baselineMaxScore: number
  /** `baselineScore` as a percentage of `baselineMaxScore`. */
  baselinePercent: number
  /** Minimum percent a gate evaluation must reach. */
  minPercent: number
  /** Allowed drop below `baselinePercent`, in percentage points. */
  maxRegressionPoints: number
  /** Ids of the runs the snapshot was taken from. */
  runIds: string[]
  /** Optional short name for the baseline. */
  label?: string | null
  /** Optional free-form notes. */
  notes?: string | null
  /** Timestamp of the first save. */
  createdAt: number
  /** Timestamp of the latest save. */
  updatedAt: number
}
|
|
133
|
+
|
|
134
|
+
/** Outcome of a gate, or of a single check within it. */
export type EvalGateStatus = 'pass' | 'warn' | 'fail'
|
|
135
|
+
|
|
136
|
+
/** One rule evaluated by a gate, with its outcome. */
export interface EvalGateCheck {
  /** Machine-readable identifier of the rule, e.g. 'baseline_missing'. */
  code: string
  /** Outcome of this rule. */
  status: EvalGateStatus
  /** Human-readable summary of the outcome. */
  message: string
  /** Optional supporting detail, such as the measured percent. */
  detail?: string
}
|
|
142
|
+
|
|
143
|
+
/** Full outcome of evaluating a gate for one agent and scope. */
export interface EvalGateResult {
  /** Agent the gate was evaluated for. */
  agentId: string
  /** Scope that was evaluated. */
  scope: EvalGateScope
  /** Overall status derived from `checks`. */
  status: EvalGateStatus
  /** Timestamp at which the result was produced. */
  generatedAt: number
  /** Approved baseline used for comparison, or null when none is stored. */
  baseline: EvalBaseline | null
  /** Runs the current score was computed from. */
  latestRuns: EvalRun[]
  /** Total score of `latestRuns`. */
  currentScore: number
  /** Total achievable score for the scope. */
  currentMaxScore: number
  /** Current score as a percentage, or null when it cannot be computed. */
  currentPercent: number | null
  /** Points below the baseline percent (never negative), or null without a baseline or a current percent. */
  regressionPoints: number | null
  /** Minimum percent applied in this evaluation. */
  minPercent: number
  /** Regression allowance applied in this evaluation. */
  maxRegressionPoints: number
  /** Individual rule outcomes behind `status`. */
  checks: EvalGateCheck[]
}
|
|
@@ -138,7 +138,7 @@ module.exports = {
|
|
|
138
138
|
}
|
|
139
139
|
],
|
|
140
140
|
|
|
141
|
-
// --- Managed Resources
|
|
141
|
+
// --- Managed Resources ---
|
|
142
142
|
managedResources: {
|
|
143
143
|
agents: [
|
|
144
144
|
{
|
|
@@ -211,7 +211,7 @@ Key rules:
|
|
|
211
211
|
- Dependency installs are run by the extension manager inside a per-extension workspace using the selected package manager with scripts disabled.
|
|
212
212
|
- Extension settings are declared through ui.settingsFields and stored per extension ID
|
|
213
213
|
- Managed resources let an extension declare provisionable agents, schedules/routines, trusted local folders, gateway platforms, and setup checks. Operators reconcile them through Extensions > Managed Resources or /api/extensions/managed-resources.
|
|
214
|
-
-
|
|
214
|
+
- Top-level agents, routines, and localFolders are also accepted; SwarmClaw reconciles routines as schedules when they include schedule timing.
|
|
215
215
|
- Keep extensions focused: one clear purpose per extension
|
|
216
216
|
`
|
|
217
217
|
}
|
|
@@ -6,7 +6,7 @@ import { loadTasks, saveTasks } from '@/lib/server/tasks/task-repository'
|
|
|
6
6
|
* Atomically transition a task from queued → running with a checkout run ID.
|
|
7
7
|
*
|
|
8
8
|
* Uses a SQLite IMMEDIATE transaction to prevent two runners from starting the
|
|
9
|
-
* same task concurrently
|
|
9
|
+
* same task concurrently.
|
|
10
10
|
*
|
|
11
11
|
* Returns the checked-out task on success, or null if the task was already
|
|
12
12
|
* taken, missing, or no longer in queued status.
|
package/src/types/extension.ts
CHANGED
|
@@ -362,10 +362,10 @@ export interface ExtensionSetupCheckDeclaration {
|
|
|
362
362
|
export interface ExtensionManagedResources {
|
|
363
363
|
agents?: ExtensionManagedAgentDeclaration[]
|
|
364
364
|
schedules?: ExtensionManagedScheduleDeclaration[]
|
|
365
|
-
/**
|
|
365
|
+
/** Routine alias. SwarmClaw reconciles routines as managed schedules. */
|
|
366
366
|
routines?: ExtensionManagedScheduleDeclaration[]
|
|
367
367
|
localFolders?: ExtensionManagedLocalFolderDeclaration[]
|
|
368
|
-
/**
|
|
368
|
+
/** Gateway/platform declaration metadata for setup and diagnostics surfaces. */
|
|
369
369
|
gatewayPlatforms?: ExtensionGatewayPlatformDeclaration[]
|
|
370
370
|
setupChecks?: ExtensionSetupCheckDeclaration[]
|
|
371
371
|
}
|
|
@@ -420,7 +420,7 @@ export interface Extension {
|
|
|
420
420
|
providers?: ExtensionProviderDefinition[]
|
|
421
421
|
connectors?: ExtensionConnectorDefinition[]
|
|
422
422
|
managedResources?: ExtensionManagedResources
|
|
423
|
-
/**
|
|
423
|
+
/** Top-level managed-resource aliases. Prefer managedResources for new SwarmClaw extensions. */
|
|
424
424
|
agents?: ExtensionManagedAgentDeclaration[]
|
|
425
425
|
schedules?: ExtensionManagedScheduleDeclaration[]
|
|
426
426
|
routines?: ExtensionManagedScheduleDeclaration[]
|
package/electron-dist/main.js
DELETED
|
@@ -1,218 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
-
if (k2 === undefined) k2 = k;
|
|
4
|
-
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
-
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
-
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
-
}
|
|
8
|
-
Object.defineProperty(o, k2, desc);
|
|
9
|
-
}) : (function(o, m, k, k2) {
|
|
10
|
-
if (k2 === undefined) k2 = k;
|
|
11
|
-
o[k2] = m[k];
|
|
12
|
-
}));
|
|
13
|
-
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
-
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
-
}) : function(o, v) {
|
|
16
|
-
o["default"] = v;
|
|
17
|
-
});
|
|
18
|
-
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
-
var ownKeys = function(o) {
|
|
20
|
-
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
-
var ar = [];
|
|
22
|
-
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
-
return ar;
|
|
24
|
-
};
|
|
25
|
-
return ownKeys(o);
|
|
26
|
-
};
|
|
27
|
-
return function (mod) {
|
|
28
|
-
if (mod && mod.__esModule) return mod;
|
|
29
|
-
var result = {};
|
|
30
|
-
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
-
__setModuleDefault(result, mod);
|
|
32
|
-
return result;
|
|
33
|
-
};
|
|
34
|
-
})();
|
|
35
|
-
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
36
|
-
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
37
|
-
};
|
|
38
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
39
|
-
const electron_1 = require("electron");
|
|
40
|
-
const node_fs_1 = __importDefault(require("node:fs"));
|
|
41
|
-
const node_path_1 = __importDefault(require("node:path"));
|
|
42
|
-
const paths_1 = require("./paths");
|
|
43
|
-
const server_lifecycle_1 = require("./server-lifecycle");
|
|
44
|
-
const menu_1 = require("./menu");
|
|
45
|
-
const DEV_URL_DEFAULT = 'http://127.0.0.1:3456';
|
|
46
|
-
const LOG_TAIL_BYTES = 1500;
|
|
47
|
-
let mainWindow = null;
|
|
48
|
-
let serverHandle = null;
|
|
49
|
-
let serverLogFile = null;
|
|
50
|
-
let isQuitting = false;
|
|
51
|
-
const gotLock = electron_1.app.requestSingleInstanceLock();
|
|
52
|
-
if (!gotLock) {
|
|
53
|
-
electron_1.app.quit();
|
|
54
|
-
}
|
|
55
|
-
else {
|
|
56
|
-
electron_1.app.on('second-instance', () => {
|
|
57
|
-
if (mainWindow) {
|
|
58
|
-
if (mainWindow.isMinimized())
|
|
59
|
-
mainWindow.restore();
|
|
60
|
-
mainWindow.focus();
|
|
61
|
-
}
|
|
62
|
-
});
|
|
63
|
-
electron_1.app.on('ready', () => void onReady());
|
|
64
|
-
electron_1.app.on('window-all-closed', () => {
|
|
65
|
-
if (process.platform !== 'darwin')
|
|
66
|
-
electron_1.app.quit();
|
|
67
|
-
});
|
|
68
|
-
electron_1.app.on('activate', () => {
|
|
69
|
-
if (mainWindow !== null)
|
|
70
|
-
return;
|
|
71
|
-
if (serverHandle) {
|
|
72
|
-
createMainWindow(serverHandle.url);
|
|
73
|
-
}
|
|
74
|
-
else if (!electron_1.app.isPackaged) {
|
|
75
|
-
createMainWindow(process.env.SWARMCLAW_DEV_URL || DEV_URL_DEFAULT);
|
|
76
|
-
}
|
|
77
|
-
});
|
|
78
|
-
electron_1.app.on('before-quit', () => {
|
|
79
|
-
isQuitting = true;
|
|
80
|
-
});
|
|
81
|
-
electron_1.app.on('will-quit', async (event) => {
|
|
82
|
-
if (!serverHandle)
|
|
83
|
-
return;
|
|
84
|
-
event.preventDefault();
|
|
85
|
-
try {
|
|
86
|
-
await serverHandle.stop();
|
|
87
|
-
}
|
|
88
|
-
finally {
|
|
89
|
-
serverHandle = null;
|
|
90
|
-
electron_1.app.exit(0);
|
|
91
|
-
}
|
|
92
|
-
});
|
|
93
|
-
}
|
|
94
|
-
async function onReady() {
|
|
95
|
-
const paths = (0, paths_1.resolveRuntimePaths)();
|
|
96
|
-
(0, menu_1.buildAppMenu)(paths, () => mainWindow);
|
|
97
|
-
const iconPath = resolveIconPath();
|
|
98
|
-
if (process.platform === 'darwin' && iconPath && electron_1.app.dock) {
|
|
99
|
-
const img = electron_1.nativeImage.createFromPath(iconPath);
|
|
100
|
-
if (!img.isEmpty())
|
|
101
|
-
electron_1.app.dock.setIcon(img);
|
|
102
|
-
}
|
|
103
|
-
if (!electron_1.app.isPackaged) {
|
|
104
|
-
const devUrl = process.env.SWARMCLAW_DEV_URL || DEV_URL_DEFAULT;
|
|
105
|
-
console.log(`[swarmclaw] dev mode, loading ${devUrl}`);
|
|
106
|
-
createMainWindow(devUrl);
|
|
107
|
-
return;
|
|
108
|
-
}
|
|
109
|
-
serverLogFile = node_path_1.default.join(electron_1.app.getPath('userData'), 'logs', 'server.log');
|
|
110
|
-
node_fs_1.default.mkdirSync(node_path_1.default.dirname(serverLogFile), { recursive: true });
|
|
111
|
-
try {
|
|
112
|
-
serverHandle = await (0, server_lifecycle_1.startEmbeddedServer)({
|
|
113
|
-
paths,
|
|
114
|
-
logFile: serverLogFile,
|
|
115
|
-
onStdout: (c) => process.stdout.write(`[swarmclaw] ${c}`),
|
|
116
|
-
onStderr: (c) => process.stderr.write(`[swarmclaw] ${c}`),
|
|
117
|
-
onExit: (code, signal) => {
|
|
118
|
-
if (!isQuitting) {
|
|
119
|
-
console.error(`[swarmclaw] server exited unexpectedly (code=${code}, signal=${signal ?? 'none'})`);
|
|
120
|
-
void showServerCrashDialog(code, signal);
|
|
121
|
-
}
|
|
122
|
-
},
|
|
123
|
-
});
|
|
124
|
-
}
|
|
125
|
-
catch (err) {
|
|
126
|
-
await showStartupFailureDialog(err, paths);
|
|
127
|
-
electron_1.app.exit(1);
|
|
128
|
-
return;
|
|
129
|
-
}
|
|
130
|
-
createMainWindow(serverHandle.url);
|
|
131
|
-
void Promise.resolve().then(() => __importStar(require('./updater'))).then((m) => m.initAutoUpdater());
|
|
132
|
-
}
|
|
133
|
-
function resolveIconPath() {
|
|
134
|
-
const candidate = electron_1.app.isPackaged
|
|
135
|
-
? node_path_1.default.join(process.resourcesPath, 'icon.png')
|
|
136
|
-
: node_path_1.default.join(__dirname, '..', 'resources', 'icon.png');
|
|
137
|
-
return node_fs_1.default.existsSync(candidate) ? candidate : undefined;
|
|
138
|
-
}
|
|
139
|
-
function createMainWindow(startUrl) {
|
|
140
|
-
const iconPath = resolveIconPath();
|
|
141
|
-
mainWindow = new electron_1.BrowserWindow({
|
|
142
|
-
width: 1440,
|
|
143
|
-
height: 900,
|
|
144
|
-
minWidth: 1024,
|
|
145
|
-
minHeight: 640,
|
|
146
|
-
backgroundColor: '#0b0b0f',
|
|
147
|
-
show: true,
|
|
148
|
-
...(iconPath ? { icon: iconPath } : {}),
|
|
149
|
-
webPreferences: {
|
|
150
|
-
contextIsolation: true,
|
|
151
|
-
nodeIntegration: false,
|
|
152
|
-
sandbox: false,
|
|
153
|
-
},
|
|
154
|
-
});
|
|
155
|
-
const wc = mainWindow.webContents;
|
|
156
|
-
if (!electron_1.app.isPackaged)
|
|
157
|
-
wc.openDevTools({ mode: 'detach' });
|
|
158
|
-
wc.on('did-start-loading', () => console.log('[swarmclaw] did-start-loading'));
|
|
159
|
-
wc.on('did-finish-load', () => console.log('[swarmclaw] did-finish-load'));
|
|
160
|
-
wc.on('did-fail-load', (_e, code, desc, url) => console.error(`[swarmclaw] did-fail-load code=${code} desc=${desc} url=${url}`));
|
|
161
|
-
wc.on('render-process-gone', (_e, details) => console.error(`[swarmclaw] render-process-gone reason=${details.reason}`));
|
|
162
|
-
wc.on('unresponsive', () => console.error('[swarmclaw] webContents unresponsive'));
|
|
163
|
-
mainWindow.on('closed', () => {
|
|
164
|
-
mainWindow = null;
|
|
165
|
-
});
|
|
166
|
-
mainWindow.webContents.setWindowOpenHandler(({ url }) => {
|
|
167
|
-
if (url.startsWith(startUrl))
|
|
168
|
-
return { action: 'allow' };
|
|
169
|
-
void electron_1.shell.openExternal(url);
|
|
170
|
-
return { action: 'deny' };
|
|
171
|
-
});
|
|
172
|
-
void mainWindow.loadURL(startUrl).catch((err) => {
|
|
173
|
-
console.error('[swarmclaw] loadURL rejected:', err);
|
|
174
|
-
});
|
|
175
|
-
}
|
|
176
|
-
async function showServerCrashDialog(code, signal) {
|
|
177
|
-
const buttons = serverLogFile ? ['Open Logs Folder', 'Quit'] : ['Quit'];
|
|
178
|
-
const quitButtonId = buttons.length - 1;
|
|
179
|
-
const detail = buildLogDetail(`code=${code ?? 'null'} signal=${signal ?? 'none'}`);
|
|
180
|
-
const res = await electron_1.dialog.showMessageBox({
|
|
181
|
-
type: 'error',
|
|
182
|
-
buttons,
|
|
183
|
-
defaultId: quitButtonId,
|
|
184
|
-
cancelId: quitButtonId,
|
|
185
|
-
title: 'SwarmClaw stopped',
|
|
186
|
-
message: 'The SwarmClaw server exited unexpectedly.',
|
|
187
|
-
detail,
|
|
188
|
-
});
|
|
189
|
-
if (serverLogFile && res.response === 0)
|
|
190
|
-
electron_1.shell.showItemInFolder(serverLogFile);
|
|
191
|
-
electron_1.app.exit(1);
|
|
192
|
-
}
|
|
193
|
-
async function showStartupFailureDialog(err, paths) {
|
|
194
|
-
const message = err instanceof Error ? err.message : String(err);
|
|
195
|
-
const base = `${message}\n\nStandalone entry: ${paths.standaloneEntry}\nData dir: ${paths.dataDir}`;
|
|
196
|
-
const detail = buildLogDetail(base);
|
|
197
|
-
const buttons = serverLogFile ? ['Open Logs Folder', 'Quit'] : ['Quit'];
|
|
198
|
-
const quitButtonId = buttons.length - 1;
|
|
199
|
-
const res = await electron_1.dialog.showMessageBox({
|
|
200
|
-
type: 'error',
|
|
201
|
-
buttons,
|
|
202
|
-
defaultId: quitButtonId,
|
|
203
|
-
cancelId: quitButtonId,
|
|
204
|
-
title: 'SwarmClaw failed to start',
|
|
205
|
-
message: 'The embedded server did not start.',
|
|
206
|
-
detail,
|
|
207
|
-
});
|
|
208
|
-
if (serverLogFile && res.response === 0)
|
|
209
|
-
electron_1.shell.showItemInFolder(serverLogFile);
|
|
210
|
-
}
|
|
211
|
-
function buildLogDetail(base) {
|
|
212
|
-
if (!serverLogFile)
|
|
213
|
-
return base;
|
|
214
|
-
const tail = (0, server_lifecycle_1.tailLogFile)(serverLogFile, LOG_TAIL_BYTES).trim();
|
|
215
|
-
if (!tail)
|
|
216
|
-
return `${base}\n\nLog file: ${serverLogFile}\n(no output captured yet)`;
|
|
217
|
-
return `${base}\n\nLog tail (${serverLogFile}):\n${tail}`;
|
|
218
|
-
}
|