@swarmclawai/swarmclaw 1.9.5 → 1.9.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -0
- package/package.json +2 -2
- package/src/app/api/eval/baselines/route.ts +55 -0
- package/src/app/api/eval/environments/route.ts +59 -0
- package/src/app/api/eval/gate/route.ts +36 -0
- package/src/app/api/eval/run/route.ts +8 -1
- package/src/app/api/eval/suite/route.ts +6 -0
- package/src/cli/index.js +5 -0
- package/src/components/quality/quality-workspace.tsx +337 -5
- package/src/lib/server/eval/baseline.test.ts +111 -0
- package/src/lib/server/eval/baseline.ts +274 -0
- package/src/lib/server/eval/environment-plan.test.ts +221 -0
- package/src/lib/server/eval/environment-plan.ts +498 -0
- package/src/lib/server/eval/runner.ts +53 -3
- package/src/lib/server/eval/scenarios.ts +18 -0
- package/src/lib/server/eval/store.ts +47 -1
- package/src/lib/server/eval/types.ts +105 -0
- package/src/lib/server/session-tools/extension-creator.ts +2 -2
- package/src/lib/server/tasks/task-checkout.ts +1 -1
- package/src/types/extension.ts +3 -3
- package/electron-dist/main.js +0 -218
package/README.md
CHANGED
|
@@ -399,6 +399,26 @@ Operational docs: https://swarmclaw.ai/docs/observability
|
|
|
399
399
|
|
|
400
400
|
## Releases
|
|
401
401
|
|
|
402
|
+
### v1.9.7 Highlights
|
|
403
|
+
|
|
404
|
+
Bundled eval-gate release: approved baselines, regression checks, and Quality Center release gates for repeatable eval evidence.
|
|
405
|
+
|
|
406
|
+
- **Eval regression baselines.** Operators can snapshot the latest scenario or suite score as an approved baseline with minimum score and regression allowance settings.
|
|
407
|
+
- **Release gate API.** `/api/eval/gate` compares current eval evidence against thresholds and baselines, while `/api/eval/baselines` lists and updates approved baselines.
|
|
408
|
+
- **CLI gate checks.** `swarmclaw eval gate`, `swarmclaw eval baselines`, and `swarmclaw eval baseline-set` expose the same release-gate workflow from automation.
|
|
409
|
+
- **Quality Center gate panel.** Eval Lab now shows pass/warn/fail status, latest-run coverage, current score, baseline score, regression points, and actionable checks.
|
|
410
|
+
- **Public-source hygiene.** Generic implementation comments now describe SwarmClaw behavior without naming internal comparison sources.
|
|
411
|
+
|
|
412
|
+
### v1.9.6 Highlights
|
|
413
|
+
|
|
414
|
+
Bundled eval-environment release: validation preflights, deterministic eval workspaces, and clearer operator readiness before spending run budget.
|
|
415
|
+
|
|
416
|
+
- **Eval validation environments.** `/api/eval/environments` now resolves the selected agent route, gateway target, scenario tools, generated files, and readiness checks before an eval runs.
|
|
417
|
+
- **Workspace manifests.** Eval runs now write `environment.json`, `.env.swarmclaw-eval`, and a task-focused `README.md` into each isolated eval workspace without embedding secrets.
|
|
418
|
+
- **Scenario fixtures.** Eval scenarios can declare fixture files, and the package-analysis scenario now gets a deterministic `package.json` in its workspace.
|
|
419
|
+
- **Fail-fast readiness.** Blocked evals stop before model execution when the agent route, CLI provider, gateway profile, or execution environment is not ready.
|
|
420
|
+
- **Quality UI preflight.** The Eval Lab now shows target status, gateway environment, checks, tools, and generated files next to the selected scenario.
|
|
421
|
+
|
|
402
422
|
### v1.9.5 Highlights
|
|
403
423
|
|
|
404
424
|
Bundled portability release: project-scoped workspace bundles, safer v2 imports, and preserved internal relationships for reusable teams.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@swarmclawai/swarmclaw",
|
|
3
|
-
"version": "1.9.5",
|
|
3
|
+
"version": "1.9.7",
|
|
4
4
|
"description": "Build and run autonomous AI agents with OpenClaw, Hermes, multiple model providers, orchestration, delegation, memory, skills, schedules, and chat connectors.",
|
|
5
5
|
"main": "electron-dist/main.js",
|
|
6
6
|
"license": "MIT",
|
|
@@ -87,7 +87,7 @@
|
|
|
87
87
|
"test:cli": "node --test src/cli/*.test.js bin/*.test.js scripts/electron-after-pack.test.mjs scripts/ensure-sandbox-browser-image.test.mjs scripts/postinstall.test.mjs scripts/run-next-build.test.mjs scripts/run-next-typegen.test.mjs",
|
|
88
88
|
"test:setup": "tsx --test src/app/api/setup/check-provider/route.test.ts src/lib/server/provider-model-discovery.test.ts src/components/auth/setup-wizard/utils.test.ts src/components/auth/setup-wizard/types.test.ts src/hooks/setup-done-detection.test.ts src/lib/setup-defaults.test.ts src/lib/server/storage-auth.test.ts src/lib/server/storage-auth-docker.test.ts",
|
|
89
89
|
"test:openclaw": "tsx --test src/lib/openclaw/openclaw-agent-id.test.ts src/lib/openclaw/openclaw-endpoint.test.ts src/lib/server/agents/agent-runtime-config.test.ts src/lib/server/build-llm.test.ts src/lib/server/connectors/connector-routing.test.ts src/lib/server/connectors/openclaw.test.ts src/lib/server/connectors/swarmdock.test.ts src/lib/server/gateway/protocol.test.ts src/lib/server/gateways/gateway-topology.test.ts src/lib/server/llm-response-cache.test.ts src/lib/server/mcp-conformance.test.ts src/lib/server/openclaw/agent-resolver.test.ts src/lib/server/openclaw/deploy.test.ts src/lib/server/openclaw/skills-normalize.test.ts src/lib/server/session-tools/openclaw-nodes.test.ts src/lib/server/session-tools/swarmdock.test.ts src/lib/server/tasks/task-quality-gate.test.ts src/lib/server/tasks/task-validation.test.ts src/lib/server/tool-capability-policy.test.ts src/lib/providers/openai.test.ts src/lib/providers/openclaw-exports.test.ts src/app/api/gateways/topology-route.test.ts src/app/api/openclaw/dashboard-url/route.test.ts",
|
|
90
|
-
"test:runtime": "tsx --test src/lib/a2a/agent-card.test.ts src/lib/strip-internal-metadata.test.ts src/lib/provider-sets.test.ts src/lib/providers/opencode-cli.test.ts src/lib/providers/cli-provider-metadata.test.ts src/lib/providers/cli-utils.test.ts src/lib/providers/generic-cli.test.ts src/lib/server/agents/delegation-advisory.test.ts src/lib/server/cli-provider-readiness.test.ts src/lib/server/provider-health.test.ts src/lib/server/mcp-gateway-runtime.test.ts src/lib/server/mcp-connection-pool.test.ts src/lib/server/knowledge-sources.test.ts src/lib/server/extension-managed-resources.test.ts src/lib/server/chat-execution/chat-execution-grounding.test.ts src/lib/server/chat-execution/chat-turn-preparation.test.ts src/lib/server/chat-execution/iteration-timers.test.ts src/lib/server/chat-execution/post-stream-finalization.test.ts src/lib/server/chat-execution/reasoning-tag-scrubber.test.ts src/lib/server/chats/clear-undo-snapshots.test.ts src/lib/server/connectors/email.test.ts src/lib/server/protocols/protocol-service.test.ts src/lib/server/runtime/run-ledger.test.ts src/lib/server/runtime/queue-retry-policy.test.ts src/lib/server/runs/run-brief.test.ts src/lib/server/operations/operation-pulse.test.ts src/lib/server/artifacts/artifact-resolver.test.ts src/lib/server/observability/otel-config.test.ts src/lib/server/safe-parse-body.test.ts src/lib/server/missions/mission-templates.test.ts src/lib/server/sharing/share-link-repository.test.ts src/lib/server/sharing/share-resolver.test.ts src/lib/server/tasks/task-execution-workspace.test.ts src/lib/server/tasks/task-service.test.ts src/lib/server/session-tools/execute.test.ts src/lib/server/session-tools/manage-tasks.test.ts src/lib/app/view-constants.test.ts src/lib/quality/quality-summary.test.ts src/app/api/approvals/route.test.ts src/app/api/agents/agents-route.test.ts src/app/api/tasks/tasks-route.test.ts src/app/api/tasks/task-workspace-route.test.ts src/app/api/chats/chat-route.test.ts 
src/app/api/chats/clear-route.test.ts src/app/api/chats/compact-route.test.ts src/app/api/chats/context-status-route.test.ts src/app/api/connectors/connector-doctor-route.test.ts src/app/api/extensions/managed-resources/route.test.ts src/app/api/healthz/route.test.ts src/app/api/logs/route.test.ts src/app/api/portability/export/route.test.ts src/app/api/portability/import/route.test.ts src/app/api/providers/[id]/route.test.ts src/app/api/tts/route.test.ts",
|
|
90
|
+
"test:runtime": "tsx --test src/lib/a2a/agent-card.test.ts src/lib/strip-internal-metadata.test.ts src/lib/provider-sets.test.ts src/lib/providers/opencode-cli.test.ts src/lib/providers/cli-provider-metadata.test.ts src/lib/providers/cli-utils.test.ts src/lib/providers/generic-cli.test.ts src/lib/server/agents/delegation-advisory.test.ts src/lib/server/cli-provider-readiness.test.ts src/lib/server/provider-health.test.ts src/lib/server/mcp-gateway-runtime.test.ts src/lib/server/mcp-connection-pool.test.ts src/lib/server/knowledge-sources.test.ts src/lib/server/extension-managed-resources.test.ts src/lib/server/eval/baseline.test.ts src/lib/server/eval/environment-plan.test.ts src/lib/server/chat-execution/chat-execution-grounding.test.ts src/lib/server/chat-execution/chat-turn-preparation.test.ts src/lib/server/chat-execution/iteration-timers.test.ts src/lib/server/chat-execution/post-stream-finalization.test.ts src/lib/server/chat-execution/reasoning-tag-scrubber.test.ts src/lib/server/chats/clear-undo-snapshots.test.ts src/lib/server/connectors/email.test.ts src/lib/server/protocols/protocol-service.test.ts src/lib/server/runtime/run-ledger.test.ts src/lib/server/runtime/queue-retry-policy.test.ts src/lib/server/runs/run-brief.test.ts src/lib/server/operations/operation-pulse.test.ts src/lib/server/artifacts/artifact-resolver.test.ts src/lib/server/observability/otel-config.test.ts src/lib/server/safe-parse-body.test.ts src/lib/server/missions/mission-templates.test.ts src/lib/server/sharing/share-link-repository.test.ts src/lib/server/sharing/share-resolver.test.ts src/lib/server/tasks/task-execution-workspace.test.ts src/lib/server/tasks/task-service.test.ts src/lib/server/session-tools/execute.test.ts src/lib/server/session-tools/manage-tasks.test.ts src/lib/app/view-constants.test.ts src/lib/quality/quality-summary.test.ts src/app/api/approvals/route.test.ts src/app/api/agents/agents-route.test.ts src/app/api/tasks/tasks-route.test.ts 
src/app/api/tasks/task-workspace-route.test.ts src/app/api/chats/chat-route.test.ts src/app/api/chats/clear-route.test.ts src/app/api/chats/compact-route.test.ts src/app/api/chats/context-status-route.test.ts src/app/api/connectors/connector-doctor-route.test.ts src/app/api/extensions/managed-resources/route.test.ts src/app/api/healthz/route.test.ts src/app/api/logs/route.test.ts src/app/api/portability/export/route.test.ts src/app/api/portability/import/route.test.ts src/app/api/providers/[id]/route.test.ts src/app/api/tts/route.test.ts",
|
|
91
91
|
"test:builder": "tsx --test src/features/protocols/builder/utils/nodes-to-template.test.ts src/features/protocols/builder/utils/template-to-nodes.test.ts src/features/protocols/builder/validators/dag-validator.test.ts",
|
|
92
92
|
"test:e2e": "node --import tsx scripts/browser-e2e-smoke.ts",
|
|
93
93
|
"test:mcp:conformance": "node --import tsx ./scripts/mcp-conformance-check.ts",
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import { NextResponse } from 'next/server'
|
|
2
|
+
import { z } from 'zod'
|
|
3
|
+
import { evaluateEvalGate, listEvalBaselinesForAgent, setEvalBaseline } from '@/lib/server/eval/baseline'
|
|
4
|
+
import { errorMessage } from '@/lib/shared-utils'
|
|
5
|
+
|
|
6
|
+
const BaselineSchema = z.object({
|
|
7
|
+
agentId: z.string().min(1),
|
|
8
|
+
scenarioId: z.string().min(1).nullable().optional(),
|
|
9
|
+
suite: z.string().min(1).nullable().optional(),
|
|
10
|
+
minPercent: z.number().min(0).max(100).nullable().optional(),
|
|
11
|
+
maxRegressionPoints: z.number().min(0).max(100).nullable().optional(),
|
|
12
|
+
label: z.string().max(160).nullable().optional(),
|
|
13
|
+
notes: z.string().max(1_000).nullable().optional(),
|
|
14
|
+
})
|
|
15
|
+
|
|
16
|
+
export async function GET(req: Request) {
|
|
17
|
+
try {
|
|
18
|
+
const { searchParams } = new URL(req.url)
|
|
19
|
+
const agentId = searchParams.get('agentId')
|
|
20
|
+
return NextResponse.json(listEvalBaselinesForAgent(agentId))
|
|
21
|
+
} catch (err: unknown) {
|
|
22
|
+
return NextResponse.json(
|
|
23
|
+
{ error: errorMessage(err) },
|
|
24
|
+
{ status: 500 },
|
|
25
|
+
)
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
export async function POST(req: Request) {
|
|
30
|
+
try {
|
|
31
|
+
const body: unknown = await req.json()
|
|
32
|
+
const parsed = BaselineSchema.safeParse(body)
|
|
33
|
+
if (!parsed.success) {
|
|
34
|
+
return NextResponse.json(
|
|
35
|
+
{ error: parsed.error.issues.map((issue) => issue.message).join(', ') },
|
|
36
|
+
{ status: 400 },
|
|
37
|
+
)
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
const baseline = setEvalBaseline(parsed.data)
|
|
41
|
+
const gate = evaluateEvalGate({
|
|
42
|
+
agentId: parsed.data.agentId,
|
|
43
|
+
scenarioId: parsed.data.scenarioId,
|
|
44
|
+
suite: parsed.data.suite,
|
|
45
|
+
minPercent: parsed.data.minPercent,
|
|
46
|
+
maxRegressionPoints: parsed.data.maxRegressionPoints,
|
|
47
|
+
})
|
|
48
|
+
return NextResponse.json({ baseline, gate })
|
|
49
|
+
} catch (err: unknown) {
|
|
50
|
+
return NextResponse.json(
|
|
51
|
+
{ error: errorMessage(err) },
|
|
52
|
+
{ status: 500 },
|
|
53
|
+
)
|
|
54
|
+
}
|
|
55
|
+
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import { NextResponse } from 'next/server'
|
|
2
|
+
import { z } from 'zod'
|
|
3
|
+
|
|
4
|
+
import { buildEvalEnvironmentPlan } from '@/lib/server/eval/environment-plan'
|
|
5
|
+
import { errorMessage } from '@/lib/shared-utils'
|
|
6
|
+
|
|
7
|
+
const PlanSchema = z.object({
|
|
8
|
+
agentId: z.string().min(1),
|
|
9
|
+
scenarioId: z.string().min(1).nullable().optional(),
|
|
10
|
+
suite: z.string().min(1).nullable().optional(),
|
|
11
|
+
gatewayProfileId: z.string().min(1).nullable().optional(),
|
|
12
|
+
environmentId: z.string().min(1).nullable().optional(),
|
|
13
|
+
refreshGateway: z.boolean().optional(),
|
|
14
|
+
})
|
|
15
|
+
|
|
16
|
+
function readBoolean(value: string | null): boolean {
|
|
17
|
+
return value === '1' || value === 'true'
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
export async function GET(req: Request) {
|
|
21
|
+
try {
|
|
22
|
+
const { searchParams } = new URL(req.url)
|
|
23
|
+
const parsed = PlanSchema.safeParse({
|
|
24
|
+
agentId: searchParams.get('agentId') || '',
|
|
25
|
+
scenarioId: searchParams.get('scenarioId'),
|
|
26
|
+
suite: searchParams.get('suite'),
|
|
27
|
+
gatewayProfileId: searchParams.get('gatewayProfileId'),
|
|
28
|
+
environmentId: searchParams.get('environmentId'),
|
|
29
|
+
refreshGateway: readBoolean(searchParams.get('refreshGateway')),
|
|
30
|
+
})
|
|
31
|
+
if (!parsed.success) {
|
|
32
|
+
return NextResponse.json(
|
|
33
|
+
{ error: parsed.error.issues.map((issue) => issue.message).join(', ') },
|
|
34
|
+
{ status: 400 },
|
|
35
|
+
)
|
|
36
|
+
}
|
|
37
|
+
const plan = await buildEvalEnvironmentPlan(parsed.data)
|
|
38
|
+
return NextResponse.json(plan)
|
|
39
|
+
} catch (err: unknown) {
|
|
40
|
+
return NextResponse.json({ error: errorMessage(err) }, { status: 500 })
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
export async function POST(req: Request) {
|
|
45
|
+
try {
|
|
46
|
+
const body: unknown = await req.json()
|
|
47
|
+
const parsed = PlanSchema.safeParse(body)
|
|
48
|
+
if (!parsed.success) {
|
|
49
|
+
return NextResponse.json(
|
|
50
|
+
{ error: parsed.error.issues.map((issue) => issue.message).join(', ') },
|
|
51
|
+
{ status: 400 },
|
|
52
|
+
)
|
|
53
|
+
}
|
|
54
|
+
const plan = await buildEvalEnvironmentPlan(parsed.data)
|
|
55
|
+
return NextResponse.json(plan)
|
|
56
|
+
} catch (err: unknown) {
|
|
57
|
+
return NextResponse.json({ error: errorMessage(err) }, { status: 500 })
|
|
58
|
+
}
|
|
59
|
+
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { NextResponse } from 'next/server'
|
|
2
|
+
import { evaluateEvalGate } from '@/lib/server/eval/baseline'
|
|
3
|
+
import { errorMessage } from '@/lib/shared-utils'
|
|
4
|
+
|
|
5
|
+
function parseNumberParam(value: string | null): number | null {
|
|
6
|
+
if (value == null || value.trim() === '') return null
|
|
7
|
+
const parsed = Number(value)
|
|
8
|
+
return Number.isFinite(parsed) ? parsed : null
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
export async function GET(req: Request) {
|
|
12
|
+
try {
|
|
13
|
+
const { searchParams } = new URL(req.url)
|
|
14
|
+
const agentId = searchParams.get('agentId') || ''
|
|
15
|
+
if (!agentId) {
|
|
16
|
+
return NextResponse.json(
|
|
17
|
+
{ error: 'agentId is required' },
|
|
18
|
+
{ status: 400 },
|
|
19
|
+
)
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
const result = evaluateEvalGate({
|
|
23
|
+
agentId,
|
|
24
|
+
scenarioId: searchParams.get('scenarioId'),
|
|
25
|
+
suite: searchParams.get('suite'),
|
|
26
|
+
minPercent: parseNumberParam(searchParams.get('minPercent')),
|
|
27
|
+
maxRegressionPoints: parseNumberParam(searchParams.get('maxRegressionPoints')),
|
|
28
|
+
})
|
|
29
|
+
return NextResponse.json(result)
|
|
30
|
+
} catch (err: unknown) {
|
|
31
|
+
return NextResponse.json(
|
|
32
|
+
{ error: errorMessage(err) },
|
|
33
|
+
{ status: 500 },
|
|
34
|
+
)
|
|
35
|
+
}
|
|
36
|
+
}
|
|
@@ -7,6 +7,9 @@ import { errorMessage } from '@/lib/shared-utils'
|
|
|
7
7
|
const RunSchema = z.object({
|
|
8
8
|
scenarioId: z.string().min(1),
|
|
9
9
|
agentId: z.string().min(1),
|
|
10
|
+
gatewayProfileId: z.string().min(1).nullable().optional(),
|
|
11
|
+
environmentId: z.string().min(1).nullable().optional(),
|
|
12
|
+
refreshGateway: z.boolean().optional(),
|
|
10
13
|
})
|
|
11
14
|
|
|
12
15
|
export async function POST(req: Request) {
|
|
@@ -20,7 +23,11 @@ export async function POST(req: Request) {
|
|
|
20
23
|
)
|
|
21
24
|
}
|
|
22
25
|
|
|
23
|
-
const result = await runEvalScenario(parsed.data.scenarioId, parsed.data.agentId)
|
|
26
|
+
const result = await runEvalScenario(parsed.data.scenarioId, parsed.data.agentId, {
|
|
27
|
+
gatewayProfileId: parsed.data.gatewayProfileId || null,
|
|
28
|
+
environmentId: parsed.data.environmentId || null,
|
|
29
|
+
refreshGateway: parsed.data.refreshGateway === true,
|
|
30
|
+
})
|
|
24
31
|
return NextResponse.json(result)
|
|
25
32
|
} catch (err: unknown) {
|
|
26
33
|
return NextResponse.json(
|
|
@@ -7,6 +7,9 @@ const SuiteSchema = z.object({
|
|
|
7
7
|
agentId: z.string().min(1),
|
|
8
8
|
categories: z.array(z.string()).optional(),
|
|
9
9
|
suite: z.string().min(1).optional(),
|
|
10
|
+
gatewayProfileId: z.string().min(1).nullable().optional(),
|
|
11
|
+
environmentId: z.string().min(1).nullable().optional(),
|
|
12
|
+
refreshGateway: z.boolean().optional(),
|
|
10
13
|
})
|
|
11
14
|
|
|
12
15
|
export async function POST(req: Request) {
|
|
@@ -23,6 +26,9 @@ export async function POST(req: Request) {
|
|
|
23
26
|
const result = await runEvalSuite(parsed.data.agentId, {
|
|
24
27
|
categories: parsed.data.categories,
|
|
25
28
|
suite: parsed.data.suite,
|
|
29
|
+
gatewayProfileId: parsed.data.gatewayProfileId || null,
|
|
30
|
+
environmentId: parsed.data.environmentId || null,
|
|
31
|
+
refreshGateway: parsed.data.refreshGateway === true,
|
|
26
32
|
})
|
|
27
33
|
return NextResponse.json(result)
|
|
28
34
|
} catch (err: unknown) {
|
package/src/cli/index.js
CHANGED
|
@@ -231,8 +231,13 @@ const COMMAND_GROUPS = [
|
|
|
231
231
|
cmd('scenarios', 'GET', '/eval/scenarios', 'List available eval scenarios'),
|
|
232
232
|
cmd('suites', 'GET', '/eval/suites', 'List available eval suites (core, swe-bench-lite, gaia-l1, ...)'),
|
|
233
233
|
cmd('status', 'GET', '/eval/run', 'Get eval run status'),
|
|
234
|
+
cmd('environment', 'GET', '/eval/environments', 'Preview validation environment readiness for an eval'),
|
|
235
|
+
cmd('baselines', 'GET', '/eval/baselines', 'List eval regression baselines'),
|
|
236
|
+
cmd('gate', 'GET', '/eval/gate', 'Check the latest eval score against thresholds and baseline'),
|
|
234
237
|
cmd('run', 'POST', '/eval/run', 'Run an eval scenario against an agent', { expectsJsonBody: true }),
|
|
235
238
|
cmd('suite', 'POST', '/eval/suite', 'Run a full eval suite against an agent (pass { suite: "swe-bench-lite" } in body)', { expectsJsonBody: true }),
|
|
239
|
+
cmd('environment-prepare', 'POST', '/eval/environments', 'Prepare validation environment readiness for an eval', { expectsJsonBody: true }),
|
|
240
|
+
cmd('baseline-set', 'POST', '/eval/baselines', 'Set an eval regression baseline from latest completed runs', { expectsJsonBody: true }),
|
|
236
241
|
],
|
|
237
242
|
},
|
|
238
243
|
{
|