npm - @swarmclawai/swarmclaw - Versions diffs - 1.9.5 → 1.9.7 - Mend

@swarmclawai/swarmclaw 1.9.5 → 1.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

package/README.md +20 -0
package/package.json +2 -2
package/src/app/api/eval/baselines/route.ts +55 -0
package/src/app/api/eval/environments/route.ts +59 -0
package/src/app/api/eval/gate/route.ts +36 -0
package/src/app/api/eval/run/route.ts +8 -1
package/src/app/api/eval/suite/route.ts +6 -0
package/src/cli/index.js +5 -0
package/src/components/quality/quality-workspace.tsx +337 -5
package/src/lib/server/eval/baseline.test.ts +111 -0
package/src/lib/server/eval/baseline.ts +274 -0
package/src/lib/server/eval/environment-plan.test.ts +221 -0
package/src/lib/server/eval/environment-plan.ts +498 -0
package/src/lib/server/eval/runner.ts +53 -3
package/src/lib/server/eval/scenarios.ts +18 -0
package/src/lib/server/eval/store.ts +47 -1
package/src/lib/server/eval/types.ts +105 -0
package/src/lib/server/session-tools/extension-creator.ts +2 -2
package/src/lib/server/tasks/task-checkout.ts +1 -1
package/src/types/extension.ts +3 -3
package/electron-dist/main.js +0 -218

package/README.md CHANGED Viewed

@@ -399,6 +399,26 @@ Operational docs: https://swarmclaw.ai/docs/observability
 ## Releases
+### v1.9.7 Highlights
+Bundled eval-gate release: approved baselines, regression checks, and Quality Center release gates for repeatable eval evidence.
+- **Eval regression baselines.** Operators can snapshot the latest scenario or suite score as an approved baseline with minimum score and regression allowance settings.
+- **Release gate API.** `/api/eval/gate` compares current eval evidence against thresholds and baselines, while `/api/eval/baselines` lists and updates approved baselines.
+- **CLI gate checks.** `swarmclaw eval gate`, `swarmclaw eval baselines`, and `swarmclaw eval baseline-set` expose the same release-gate workflow from automation.
+- **Quality Center gate panel.** Eval Lab now shows pass/warn/fail status, latest-run coverage, current score, baseline score, regression points, and actionable checks.
+- **Public-source hygiene.** Generic implementation comments now describe SwarmClaw behavior without naming internal comparison sources.
+### v1.9.6 Highlights
+Bundled eval-environment release: validation preflights, deterministic eval workspaces, and clearer operator readiness before spending run budget.
+- **Eval validation environments.** `/api/eval/environments` now resolves the selected agent route, gateway target, scenario tools, generated files, and readiness checks before an eval runs.
+- **Workspace manifests.** Eval runs now write `environment.json`, `.env.swarmclaw-eval`, and a task-focused `README.md` into each isolated eval workspace without embedding secrets.
+- **Scenario fixtures.** Eval scenarios can declare fixture files, and the package-analysis scenario now gets a deterministic `package.json` in its workspace.
+- **Fail-fast readiness.** Blocked evals stop before model execution when the agent route, CLI provider, gateway profile, or execution environment is not ready.
+- **Quality UI preflight.** The Eval Lab now shows target status, gateway environment, checks, tools, and generated files next to the selected scenario.
 ### v1.9.5 Highlights
 Bundled portability release: project-scoped workspace bundles, safer v2 imports, and preserved internal relationships for reusable teams.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@swarmclawai/swarmclaw",
-  "version": "1.9.5",
+  "version": "1.9.7",
   "description": "Build and run autonomous AI agents with OpenClaw, Hermes, multiple model providers, orchestration, delegation, memory, skills, schedules, and chat connectors.",
   "main": "electron-dist/main.js",
   "license": "MIT",
@@ -87,7 +87,7 @@
     "test:cli": "node --test src/cli/*.test.js bin/*.test.js scripts/electron-after-pack.test.mjs scripts/ensure-sandbox-browser-image.test.mjs scripts/postinstall.test.mjs scripts/run-next-build.test.mjs scripts/run-next-typegen.test.mjs",
     "test:setup": "tsx --test src/app/api/setup/check-provider/route.test.ts src/lib/server/provider-model-discovery.test.ts src/components/auth/setup-wizard/utils.test.ts src/components/auth/setup-wizard/types.test.ts src/hooks/setup-done-detection.test.ts src/lib/setup-defaults.test.ts src/lib/server/storage-auth.test.ts src/lib/server/storage-auth-docker.test.ts",
     "test:openclaw": "tsx --test src/lib/openclaw/openclaw-agent-id.test.ts src/lib/openclaw/openclaw-endpoint.test.ts src/lib/server/agents/agent-runtime-config.test.ts src/lib/server/build-llm.test.ts src/lib/server/connectors/connector-routing.test.ts src/lib/server/connectors/openclaw.test.ts src/lib/server/connectors/swarmdock.test.ts src/lib/server/gateway/protocol.test.ts src/lib/server/gateways/gateway-topology.test.ts src/lib/server/llm-response-cache.test.ts src/lib/server/mcp-conformance.test.ts src/lib/server/openclaw/agent-resolver.test.ts src/lib/server/openclaw/deploy.test.ts src/lib/server/openclaw/skills-normalize.test.ts src/lib/server/session-tools/openclaw-nodes.test.ts src/lib/server/session-tools/swarmdock.test.ts src/lib/server/tasks/task-quality-gate.test.ts src/lib/server/tasks/task-validation.test.ts src/lib/server/tool-capability-policy.test.ts src/lib/providers/openai.test.ts src/lib/providers/openclaw-exports.test.ts src/app/api/gateways/topology-route.test.ts src/app/api/openclaw/dashboard-url/route.test.ts",
-    "test:runtime": "tsx --test src/lib/a2a/agent-card.test.ts src/lib/strip-internal-metadata.test.ts src/lib/provider-sets.test.ts src/lib/providers/opencode-cli.test.ts src/lib/providers/cli-provider-metadata.test.ts src/lib/providers/cli-utils.test.ts src/lib/providers/generic-cli.test.ts src/lib/server/agents/delegation-advisory.test.ts src/lib/server/cli-provider-readiness.test.ts src/lib/server/provider-health.test.ts src/lib/server/mcp-gateway-runtime.test.ts src/lib/server/mcp-connection-pool.test.ts src/lib/server/knowledge-sources.test.ts src/lib/server/extension-managed-resources.test.ts src/lib/server/chat-execution/chat-execution-grounding.test.ts src/lib/server/chat-execution/chat-turn-preparation.test.ts src/lib/server/chat-execution/iteration-timers.test.ts src/lib/server/chat-execution/post-stream-finalization.test.ts src/lib/server/chat-execution/reasoning-tag-scrubber.test.ts src/lib/server/chats/clear-undo-snapshots.test.ts src/lib/server/connectors/email.test.ts src/lib/server/protocols/protocol-service.test.ts src/lib/server/runtime/run-ledger.test.ts src/lib/server/runtime/queue-retry-policy.test.ts src/lib/server/runs/run-brief.test.ts src/lib/server/operations/operation-pulse.test.ts src/lib/server/artifacts/artifact-resolver.test.ts src/lib/server/observability/otel-config.test.ts src/lib/server/safe-parse-body.test.ts src/lib/server/missions/mission-templates.test.ts src/lib/server/sharing/share-link-repository.test.ts src/lib/server/sharing/share-resolver.test.ts src/lib/server/tasks/task-execution-workspace.test.ts src/lib/server/tasks/task-service.test.ts src/lib/server/session-tools/execute.test.ts src/lib/server/session-tools/manage-tasks.test.ts src/lib/app/view-constants.test.ts src/lib/quality/quality-summary.test.ts src/app/api/approvals/route.test.ts src/app/api/agents/agents-route.test.ts src/app/api/tasks/tasks-route.test.ts src/app/api/tasks/task-workspace-route.test.ts src/app/api/chats/chat-route.test.ts src/app/api/chats/clear-route.test.ts src/app/api/chats/compact-route.test.ts src/app/api/chats/context-status-route.test.ts src/app/api/connectors/connector-doctor-route.test.ts src/app/api/extensions/managed-resources/route.test.ts src/app/api/healthz/route.test.ts src/app/api/logs/route.test.ts src/app/api/portability/export/route.test.ts src/app/api/portability/import/route.test.ts src/app/api/providers/[id]/route.test.ts src/app/api/tts/route.test.ts",
+    "test:runtime": "tsx --test src/lib/a2a/agent-card.test.ts src/lib/strip-internal-metadata.test.ts src/lib/provider-sets.test.ts src/lib/providers/opencode-cli.test.ts src/lib/providers/cli-provider-metadata.test.ts src/lib/providers/cli-utils.test.ts src/lib/providers/generic-cli.test.ts src/lib/server/agents/delegation-advisory.test.ts src/lib/server/cli-provider-readiness.test.ts src/lib/server/provider-health.test.ts src/lib/server/mcp-gateway-runtime.test.ts src/lib/server/mcp-connection-pool.test.ts src/lib/server/knowledge-sources.test.ts src/lib/server/extension-managed-resources.test.ts src/lib/server/eval/baseline.test.ts src/lib/server/eval/environment-plan.test.ts src/lib/server/chat-execution/chat-execution-grounding.test.ts src/lib/server/chat-execution/chat-turn-preparation.test.ts src/lib/server/chat-execution/iteration-timers.test.ts src/lib/server/chat-execution/post-stream-finalization.test.ts src/lib/server/chat-execution/reasoning-tag-scrubber.test.ts src/lib/server/chats/clear-undo-snapshots.test.ts src/lib/server/connectors/email.test.ts src/lib/server/protocols/protocol-service.test.ts src/lib/server/runtime/run-ledger.test.ts src/lib/server/runtime/queue-retry-policy.test.ts src/lib/server/runs/run-brief.test.ts src/lib/server/operations/operation-pulse.test.ts src/lib/server/artifacts/artifact-resolver.test.ts src/lib/server/observability/otel-config.test.ts src/lib/server/safe-parse-body.test.ts src/lib/server/missions/mission-templates.test.ts src/lib/server/sharing/share-link-repository.test.ts src/lib/server/sharing/share-resolver.test.ts src/lib/server/tasks/task-execution-workspace.test.ts src/lib/server/tasks/task-service.test.ts src/lib/server/session-tools/execute.test.ts src/lib/server/session-tools/manage-tasks.test.ts src/lib/app/view-constants.test.ts src/lib/quality/quality-summary.test.ts src/app/api/approvals/route.test.ts src/app/api/agents/agents-route.test.ts src/app/api/tasks/tasks-route.test.ts src/app/api/tasks/task-workspace-route.test.ts src/app/api/chats/chat-route.test.ts src/app/api/chats/clear-route.test.ts src/app/api/chats/compact-route.test.ts src/app/api/chats/context-status-route.test.ts src/app/api/connectors/connector-doctor-route.test.ts src/app/api/extensions/managed-resources/route.test.ts src/app/api/healthz/route.test.ts src/app/api/logs/route.test.ts src/app/api/portability/export/route.test.ts src/app/api/portability/import/route.test.ts src/app/api/providers/[id]/route.test.ts src/app/api/tts/route.test.ts",
     "test:builder": "tsx --test src/features/protocols/builder/utils/nodes-to-template.test.ts src/features/protocols/builder/utils/template-to-nodes.test.ts src/features/protocols/builder/validators/dag-validator.test.ts",
     "test:e2e": "node --import tsx scripts/browser-e2e-smoke.ts",
     "test:mcp:conformance": "node --import tsx ./scripts/mcp-conformance-check.ts",

package/src/app/api/eval/baselines/route.ts ADDED Viewed

@@ -0,0 +1,55 @@
+import { NextResponse } from 'next/server'
+import { z } from 'zod'
+import { evaluateEvalGate, listEvalBaselinesForAgent, setEvalBaseline } from '@/lib/server/eval/baseline'
+import { errorMessage } from '@/lib/shared-utils'
+const BaselineSchema = z.object({
+  agentId: z.string().min(1),
+  scenarioId: z.string().min(1).nullable().optional(),
+  suite: z.string().min(1).nullable().optional(),
+  minPercent: z.number().min(0).max(100).nullable().optional(),
+  maxRegressionPoints: z.number().min(0).max(100).nullable().optional(),
+  label: z.string().max(160).nullable().optional(),
+  notes: z.string().max(1_000).nullable().optional(),
+})
+export async function GET(req: Request) {
+  try {
+    const { searchParams } = new URL(req.url)
+    const agentId = searchParams.get('agentId')
+    return NextResponse.json(listEvalBaselinesForAgent(agentId))
+  } catch (err: unknown) {
+    return NextResponse.json(
+      { error: errorMessage(err) },
+      { status: 500 },
+    )
+  }
+}
+export async function POST(req: Request) {
+  try {
+    const body: unknown = await req.json()
+    const parsed = BaselineSchema.safeParse(body)
+    if (!parsed.success) {
+      return NextResponse.json(
+        { error: parsed.error.issues.map((issue) => issue.message).join(', ') },
+        { status: 400 },
+      )
+    }
+    const baseline = setEvalBaseline(parsed.data)
+    const gate = evaluateEvalGate({
+      agentId: parsed.data.agentId,
+      scenarioId: parsed.data.scenarioId,
+      suite: parsed.data.suite,
+      minPercent: parsed.data.minPercent,
+      maxRegressionPoints: parsed.data.maxRegressionPoints,
+    })
+    return NextResponse.json({ baseline, gate })
+  } catch (err: unknown) {
+    return NextResponse.json(
+      { error: errorMessage(err) },
+      { status: 500 },
+    )
+  }
+}

package/src/app/api/eval/environments/route.ts ADDED Viewed

@@ -0,0 +1,59 @@
+import { NextResponse } from 'next/server'
+import { z } from 'zod'
+import { buildEvalEnvironmentPlan } from '@/lib/server/eval/environment-plan'
+import { errorMessage } from '@/lib/shared-utils'
+const PlanSchema = z.object({
+  agentId: z.string().min(1),
+  scenarioId: z.string().min(1).nullable().optional(),
+  suite: z.string().min(1).nullable().optional(),
+  gatewayProfileId: z.string().min(1).nullable().optional(),
+  environmentId: z.string().min(1).nullable().optional(),
+  refreshGateway: z.boolean().optional(),
+})
+function readBoolean(value: string | null): boolean {
+  return value === '1' || value === 'true'
+}
+export async function GET(req: Request) {
+  try {
+    const { searchParams } = new URL(req.url)
+    const parsed = PlanSchema.safeParse({
+      agentId: searchParams.get('agentId') || '',
+      scenarioId: searchParams.get('scenarioId'),
+      suite: searchParams.get('suite'),
+      gatewayProfileId: searchParams.get('gatewayProfileId'),
+      environmentId: searchParams.get('environmentId'),
+      refreshGateway: readBoolean(searchParams.get('refreshGateway')),
+    })
+    if (!parsed.success) {
+      return NextResponse.json(
+        { error: parsed.error.issues.map((issue) => issue.message).join(', ') },
+        { status: 400 },
+      )
+    }
+    const plan = await buildEvalEnvironmentPlan(parsed.data)
+    return NextResponse.json(plan)
+  } catch (err: unknown) {
+    return NextResponse.json({ error: errorMessage(err) }, { status: 500 })
+  }
+}
+export async function POST(req: Request) {
+  try {
+    const body: unknown = await req.json()
+    const parsed = PlanSchema.safeParse(body)
+    if (!parsed.success) {
+      return NextResponse.json(
+        { error: parsed.error.issues.map((issue) => issue.message).join(', ') },
+        { status: 400 },
+      )
+    }
+    const plan = await buildEvalEnvironmentPlan(parsed.data)
+    return NextResponse.json(plan)
+  } catch (err: unknown) {
+    return NextResponse.json({ error: errorMessage(err) }, { status: 500 })
+  }
+}

package/src/app/api/eval/gate/route.ts ADDED Viewed

@@ -0,0 +1,36 @@
+import { NextResponse } from 'next/server'
+import { evaluateEvalGate } from '@/lib/server/eval/baseline'
+import { errorMessage } from '@/lib/shared-utils'
+function parseNumberParam(value: string | null): number | null {
+  if (value == null || value.trim() === '') return null
+  const parsed = Number(value)
+  return Number.isFinite(parsed) ? parsed : null
+}
+export async function GET(req: Request) {
+  try {
+    const { searchParams } = new URL(req.url)
+    const agentId = searchParams.get('agentId') || ''
+    if (!agentId) {
+      return NextResponse.json(
+        { error: 'agentId is required' },
+        { status: 400 },
+      )
+    }
+    const result = evaluateEvalGate({
+      agentId,
+      scenarioId: searchParams.get('scenarioId'),
+      suite: searchParams.get('suite'),
+      minPercent: parseNumberParam(searchParams.get('minPercent')),
+      maxRegressionPoints: parseNumberParam(searchParams.get('maxRegressionPoints')),
+    })
+    return NextResponse.json(result)
+  } catch (err: unknown) {
+    return NextResponse.json(
+      { error: errorMessage(err) },
+      { status: 500 },
+    )
+  }
+}

package/src/app/api/eval/run/route.ts CHANGED Viewed

@@ -7,6 +7,9 @@ import { errorMessage } from '@/lib/shared-utils'
 const RunSchema = z.object({
   scenarioId: z.string().min(1),
   agentId: z.string().min(1),
+  gatewayProfileId: z.string().min(1).nullable().optional(),
+  environmentId: z.string().min(1).nullable().optional(),
+  refreshGateway: z.boolean().optional(),
 })
 export async function POST(req: Request) {
@@ -20,7 +23,11 @@ export async function POST(req: Request) {
       )
     }
-    const result = await runEvalScenario(parsed.data.scenarioId, parsed.data.agentId)
+    const result = await runEvalScenario(parsed.data.scenarioId, parsed.data.agentId, {
+      gatewayProfileId: parsed.data.gatewayProfileId || null,
+      environmentId: parsed.data.environmentId || null,
+      refreshGateway: parsed.data.refreshGateway === true,
+    })
     return NextResponse.json(result)
   } catch (err: unknown) {
     return NextResponse.json(

package/src/app/api/eval/suite/route.ts CHANGED Viewed

@@ -7,6 +7,9 @@ const SuiteSchema = z.object({
   agentId: z.string().min(1),
   categories: z.array(z.string()).optional(),
   suite: z.string().min(1).optional(),
+  gatewayProfileId: z.string().min(1).nullable().optional(),
+  environmentId: z.string().min(1).nullable().optional(),
+  refreshGateway: z.boolean().optional(),
 })
 export async function POST(req: Request) {
@@ -23,6 +26,9 @@ export async function POST(req: Request) {
     const result = await runEvalSuite(parsed.data.agentId, {
       categories: parsed.data.categories,
       suite: parsed.data.suite,
+      gatewayProfileId: parsed.data.gatewayProfileId || null,
+      environmentId: parsed.data.environmentId || null,
+      refreshGateway: parsed.data.refreshGateway === true,
     })
     return NextResponse.json(result)
   } catch (err: unknown) {

package/src/cli/index.js CHANGED Viewed

@@ -231,8 +231,13 @@ const COMMAND_GROUPS = [
       cmd('scenarios', 'GET', '/eval/scenarios', 'List available eval scenarios'),
       cmd('suites', 'GET', '/eval/suites', 'List available eval suites (core, swe-bench-lite, gaia-l1, ...)'),
       cmd('status', 'GET', '/eval/run', 'Get eval run status'),
+      cmd('environment', 'GET', '/eval/environments', 'Preview validation environment readiness for an eval'),
+      cmd('baselines', 'GET', '/eval/baselines', 'List eval regression baselines'),
+      cmd('gate', 'GET', '/eval/gate', 'Check the latest eval score against thresholds and baseline'),
       cmd('run', 'POST', '/eval/run', 'Run an eval scenario against an agent', { expectsJsonBody: true }),
       cmd('suite', 'POST', '/eval/suite', 'Run a full eval suite against an agent (pass { suite: "swe-bench-lite" } in body)', { expectsJsonBody: true }),
+      cmd('environment-prepare', 'POST', '/eval/environments', 'Prepare validation environment readiness for an eval', { expectsJsonBody: true }),
+      cmd('baseline-set', 'POST', '/eval/baselines', 'Set an eval regression baseline from latest completed runs', { expectsJsonBody: true }),
     ],
   },
   {