eve 0.6.0-beta.14 → 0.6.0-beta.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +23 -0
- package/README.md +6 -5
- package/dist/docs/evals-v2-plan.md +29 -4
- package/dist/docs/public/README.md +1 -1
- package/dist/docs/public/advanced/auth-and-route-protection.md +1 -1
- package/dist/docs/public/advanced/instrumentation.md +1 -1
- package/dist/docs/public/advanced/meta.json +0 -1
- package/dist/docs/public/channels/eve.mdx +1 -1
- package/dist/docs/public/evals/cases.mdx +114 -0
- package/dist/docs/public/evals/checks.mdx +63 -0
- package/dist/docs/public/evals/meta.json +4 -0
- package/dist/docs/public/evals/overview.mdx +91 -0
- package/dist/docs/public/evals/reporters.mdx +61 -0
- package/dist/docs/public/evals/running.mdx +63 -0
- package/dist/docs/public/evals/scores.mdx +57 -0
- package/dist/docs/public/evals/targets.mdx +54 -0
- package/dist/docs/public/getting-started.mdx +12 -8
- package/dist/docs/public/meta.json +1 -0
- package/dist/docs/public/reference/cli.md +14 -15
- package/dist/docs/public/reference/typescript-api.md +3 -2
- package/dist/docs/public/tutorial/first-agent.mdx +5 -2
- package/dist/src/cli/banner.d.ts +7 -0
- package/dist/src/cli/banner.js +1 -0
- package/dist/src/cli/commands/channels.js +1 -1
- package/dist/src/cli/commands/init-git.d.ts +15 -0
- package/dist/src/cli/commands/init-git.js +1 -0
- package/dist/src/cli/commands/init.d.ts +21 -0
- package/dist/src/cli/commands/init.js +1 -0
- package/dist/src/cli/run.d.ts +0 -1
- package/dist/src/cli/run.js +2 -2
- package/dist/src/compiler/artifacts.js +1 -1
- package/dist/src/compiler/normalize-agent-config.js +1 -1
- package/dist/src/compiler/normalize-sandbox.js +1 -1
- package/dist/src/compiler/workspace-resources.js +1 -1
- package/dist/src/evals/cli/eval.d.ts +3 -4
- package/dist/src/evals/cli/eval.js +1 -1
- package/dist/src/evals/define-eval-config.d.ts +16 -0
- package/dist/src/evals/define-eval-config.js +1 -0
- package/dist/src/evals/define-eval.d.ts +16 -14
- package/dist/src/evals/define-eval.js +1 -1
- package/dist/src/evals/index.d.ts +2 -1
- package/dist/src/evals/index.js +1 -1
- package/dist/src/evals/requirements.d.ts +1 -2
- package/dist/src/evals/requirements.js +1 -1
- package/dist/src/evals/runner/artifacts.d.ts +7 -6
- package/dist/src/evals/runner/artifacts.js +3 -3
- package/dist/src/evals/runner/discover.d.ts +28 -7
- package/dist/src/evals/runner/discover.js +1 -1
- package/dist/src/evals/runner/execute-eval.d.ts +8 -10
- package/dist/src/evals/runner/execute-eval.js +1 -1
- package/dist/src/evals/runner/execute-task.d.ts +22 -0
- package/dist/src/evals/runner/execute-task.js +1 -0
- package/dist/src/evals/runner/reporters/braintrust.d.ts +6 -4
- package/dist/src/evals/runner/reporters/braintrust.js +2 -2
- package/dist/src/evals/runner/reporters/console.d.ts +4 -4
- package/dist/src/evals/runner/reporters/console.js +1 -1
- package/dist/src/evals/runner/reporters/junit.d.ts +1 -0
- package/dist/src/evals/runner/reporters/junit.js +3 -7
- package/dist/src/evals/runner/reporters/types.d.ts +14 -8
- package/dist/src/evals/runner/run-evals.d.ts +38 -0
- package/dist/src/evals/runner/run-evals.js +1 -0
- package/dist/src/evals/runner/verdict.d.ts +5 -5
- package/dist/src/evals/runner/verdict.js +1 -1
- package/dist/src/evals/scorers/autoevals.js +1 -1
- package/dist/src/evals/scorers/json.d.ts +3 -3
- package/dist/src/evals/scorers/json.js +1 -1
- package/dist/src/evals/session.js +1 -1
- package/dist/src/evals/types.d.ts +134 -176
- package/dist/src/execution/sandbox/bindings/local.js +1 -1
- package/dist/src/harness/action-result-helpers.js +1 -1
- package/dist/src/harness/authorization.d.ts +26 -0
- package/dist/src/harness/authorization.js +1 -1
- package/dist/src/harness/emission.d.ts +12 -5
- package/dist/src/harness/emission.js +1 -1
- package/dist/src/harness/step-hooks.d.ts +4 -4
- package/dist/src/harness/step-hooks.js +1 -1
- package/dist/src/harness/tool-loop.js +1 -1
- package/dist/src/harness/tools.d.ts +4 -6
- package/dist/src/harness/tools.js +1 -1
- package/dist/src/internal/application/cache-metadata.js +1 -1
- package/dist/src/internal/application/compiled-artifacts.js +1 -1
- package/dist/src/internal/application/package.js +1 -1
- package/dist/src/internal/application/paths.js +1 -1
- package/dist/src/internal/nitro/dev-runtime-artifacts.js +1 -1
- package/dist/src/internal/nitro/host/build-application.js +1 -1
- package/dist/src/internal/nitro/host/build-vercel-agent-summary.js +1 -1
- package/dist/src/internal/nitro/host/configure-nitro-routes.js +3 -3
- package/dist/src/internal/nitro/host/create-application-nitro.js +1 -1
- package/dist/src/internal/nitro/host/prepare-application-host.js +1 -1
- package/dist/src/internal/nitro/host/start-production-server.js +1 -1
- package/dist/src/internal/workflow-bundle/builder-support.js +2 -2
- package/dist/src/internal/workflow-bundle/builder.js +3 -3
- package/dist/src/public/next/server.js +1 -1
- package/dist/src/public/nuxt/dev-server.js +1 -1
- package/dist/src/public/sveltekit/dev-server.js +1 -1
- package/dist/src/runtime/agent/mock-model-adapter.js +1 -1
- package/dist/src/runtime/agent/mock-model-fixtures.js +3 -2
- package/dist/src/runtime/agent/mock-model-skill-selection.js +3 -4
- package/dist/src/runtime/sandbox/keys.js +1 -1
- package/dist/src/setup/boxes/add-channels.js +1 -1
- package/dist/src/setup/boxes/deploy-project.js +1 -1
- package/dist/src/setup/boxes/one-shot-next-steps.js +1 -1
- package/dist/src/setup/boxes/resolve-target.js +1 -1
- package/dist/src/setup/boxes/select-channels.d.ts +5 -11
- package/dist/src/setup/boxes/select-model.d.ts +0 -5
- package/dist/src/setup/boxes/select-model.js +1 -1
- package/dist/src/setup/boxes/select-setup-mode.d.ts +1 -1
- package/dist/src/setup/boxes/select-setup-mode.js +1 -1
- package/dist/src/setup/cli/rail-log.d.ts +2 -0
- package/dist/src/setup/cli/rail-log.js +2 -2
- package/dist/src/setup/connection-connector.js +1 -1
- package/dist/src/setup/headless.d.ts +1 -1
- package/dist/src/setup/onboarding.d.ts +1 -1
- package/dist/src/setup/primitives/run-pnpm.js +1 -1
- package/dist/src/setup/primitives/run-vercel.js +1 -1
- package/dist/src/setup/project-name.d.ts +4 -0
- package/dist/src/setup/project-name.js +1 -0
- package/dist/src/setup/scaffold/channels-catalog.d.ts +2 -2
- package/dist/src/setup/scaffold/create/project.js +1 -1
- package/dist/src/setup/scaffold/update/channels.js +1 -1
- package/dist/src/setup/scaffold/update/pnpm-workspace.d.ts +1 -1
- package/dist/src/setup/scaffold/update/pnpm-workspace.js +2 -5
- package/dist/src/setup/slackbot.js +1 -1
- package/dist/src/setup/state.d.ts +3 -4
- package/dist/src/setup/step.d.ts +1 -1
- package/dist/src/setup/vercel-project.js +1 -1
- package/dist/src/shared/default-agent-model.d.ts +5 -0
- package/dist/src/shared/default-agent-model.js +1 -0
- package/package.json +1 -1
- package/dist/docs/public/advanced/evals.md +0 -225
- package/dist/src/cli/commands/setup.d.ts +0 -58
- package/dist/src/cli/commands/setup.js +0 -1
- package/dist/src/evals/runner/execute-case.d.ts +0 -23
- package/dist/src/evals/runner/execute-case.js +0 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,28 @@
|
|
|
1
1
|
# eve
|
|
2
2
|
|
|
3
|
+
## 0.6.0-beta.16
|
|
4
|
+
|
|
5
|
+
### Minor Changes
|
|
6
|
+
|
|
7
|
+
- 5a6ac17: Add a required `evals/evals.config.ts` (authored with `defineEvalConfig`) that declares run-wide eval defaults: a mandatory scorer `model`, plus optional run-level `reporters`, `maxConcurrency`, and `timeoutMs`. Model-backed scorers now fall back to the config `model`, so `model` is optional on `defineEval` and a shared reporter (e.g. one `Braintrust()`) no longer needs to be repeated in every eval. CLI flags and per-eval values still take precedence over the config defaults.
|
|
8
|
+
- 5a6ac17: `defineEval` is now always a single case, with identity fully derived from the file path — `cases`, `load`, `task`, per-case `id`, and `maxConcurrency` are removed. Declare `input` or `run` (plus `expected`, `checks`, `scores`, `parseOutput`, …) at the top level, organize related evals with directory nesting (`evals/runtime/multi-turn.eval.ts` → `runtime/multi-turn`), and default-export an array of `defineEval(...)` values for dataset fan-out (ids get a zero-padded index suffix, e.g. `weather/0000`). The runner now executes eval files concurrently (default 8, `--max-concurrency`), positional `eve eval` ids match by directory prefix, `--case` is removed, reporters use a run-level lifecycle (`onRunStart`/`onEvalComplete`/`onRunComplete`), check/scorer args expose `evaluation` instead of `case`, and artifacts land under one `.eve/evals/<timestamp>/` directory per run.
|
|
9
|
+
|
|
10
|
+
### Patch Changes
|
|
11
|
+
|
|
12
|
+
- a8363e6: Fix `authorization.required` not being emitted when a tool combines `needsApproval` with interactive auth. Approval-resume auth signals are now routed through the authorization park path instead of being replayed to the model as a plain tool result.
|
|
13
|
+
- a8363e6: Authorization-pending tool results no longer expose OAuth URLs, user codes, or hook URLs to the model. Channels still receive full `authorization.required` events.
|
|
14
|
+
|
|
15
|
+
## 0.6.0-beta.15
|
|
16
|
+
|
|
17
|
+
### Minor Changes
|
|
18
|
+
|
|
19
|
+
- 0a8d93b: Add `eve init <name>` for non-interactive agent creation, with optional Web Chat through `--web`. The command installs dependencies, initializes Git, and starts the development server, but does not provision or deploy a Vercel project.
|
|
20
|
+
- f733fe5: The deterministic mock model adapter (`EVE_MOCK_AUTHORED_MODELS=1`) now synthesizes tool inputs for string properties anchored to quoted spans in the message (e.g. `with label "x"`), honors exact-reply fixture directives (`reply with the exact string … and nothing else`, `include the exact token … verbatim`) found in the current turn's user messages and per-turn client context, discovers static skill advertisements embedded in instruction text (not just standalone announcements), derives skill activation from `load_skill` calls in history so skills are never re-loaded in a loop, and no longer matches `load_skill` by explicit name. This widens what agent smoke evals can assert deterministically without a real model.
|
|
21
|
+
|
|
22
|
+
### Patch Changes
|
|
23
|
+
|
|
24
|
+
- 0a8d93b: Scaffolded projects now carry a `packageExtensions` entry in `pnpm-workspace.yaml` that restores eve's missing `oxc-parser` dependency, so fresh agents boot against the published `eve` 0.6.0-beta.13/14 builds that import it without declaring it. The entry lives in `pnpm-workspace.yaml` because pnpm 11 reads settings only from there, not from the `package.json` `pnpm` field. Remove it from the template once the scaffolded `eve` range only resolves to versions that declare `oxc-parser` themselves.
|
|
25
|
+
|
|
3
26
|
## 0.6.0-beta.14
|
|
4
27
|
|
|
5
28
|
### Minor Changes
|
package/README.md
CHANGED
|
@@ -52,7 +52,7 @@ Every authored directory has a typed helper. Import each from the matching subpa
|
|
|
52
52
|
| `eveChannel(...)`, `slackChannel(...)`, `vercelOidc(...)` | `eve/channels/eve`, `/slack`, `/auth` | reused from `channels/<name>.ts` |
|
|
53
53
|
| `defineSandbox(...)` | `eve/sandbox` | `sandbox.ts` (or `sandbox/sandbox.ts`) |
|
|
54
54
|
| `defineSchedule(...)` | `eve/schedules` | `schedules/<name>.ts` (or `schedules/<name>.md`) |
|
|
55
|
-
| `defineEval(...)`
|
|
55
|
+
| `defineEval(...)`, `defineEvalConfig(...)` | `eve/evals` | `evals/<name>.eval.ts`, `evals/evals.config.ts` |
|
|
56
56
|
|
|
57
57
|
Runtime accessors live on the subpath that owns the concern:
|
|
58
58
|
|
|
@@ -105,15 +105,16 @@ export default defineAgent({
|
|
|
105
105
|
## Quick Start
|
|
106
106
|
|
|
107
107
|
```bash
|
|
108
|
-
pnpm
|
|
109
|
-
cd my-agent
|
|
110
|
-
pnpm dev
|
|
108
|
+
pnpm dlx eve@beta init my-agent
|
|
111
109
|
```
|
|
112
110
|
|
|
113
|
-
|
|
111
|
+
`eve init` writes a new agent with Eve's default model. Pass `--web` to add the
|
|
112
|
+
Web Chat application. It installs dependencies, initializes Git, and starts the
|
|
113
|
+
development server. It does not create a Vercel project or deploy the agent.
|
|
114
114
|
|
|
115
115
|
CLI commands:
|
|
116
116
|
|
|
117
|
+
- `eve init <name>` — create a new agent
|
|
117
118
|
- `eve info` — discovery results and compiled artifacts
|
|
118
119
|
- `eve build` — compile `.eve/` and build the host output
|
|
119
120
|
- `eve start` — serve the built `.output/` app
|
|
@@ -812,13 +812,13 @@ weather-result-reaches-parent` reruns it in isolation.
|
|
|
812
812
|
|
|
813
813
|
### CI
|
|
814
814
|
|
|
815
|
-
`.github/workflows/
|
|
815
|
+
`.github/workflows/e2e.yml` discovery changes from globbing
|
|
816
816
|
`e2e/tests/*/*.ts` to globbing fixture apps with `evals/` directories. Each
|
|
817
817
|
matrix leg provisions, then runs the evals against the resulting URL:
|
|
818
818
|
|
|
819
819
|
```sh
|
|
820
820
|
node e2e/provision/<group>.ts & # build + sidecars + env + eve start
|
|
821
|
-
pnpm --filter <fixture-app> exec eve eval --strict --
|
|
821
|
+
pnpm --filter <fixture-app> exec eve eval --strict --url "$TARGET_URL"
|
|
822
822
|
```
|
|
823
823
|
|
|
824
824
|
twice (direct / `EVE_EXPERIMENTAL_CODE_MODE=1` set by the provisioner),
|
|
@@ -842,7 +842,7 @@ Phase 5.
|
|
|
842
842
|
## Implementation phases
|
|
843
843
|
|
|
844
844
|
Each phase ships independently, keeps `pnpm test` green, and includes docs
|
|
845
|
-
(`docs/public/
|
|
845
|
+
(now the top-level `docs/public/evals/` section) + changesets per repo policy.
|
|
846
846
|
|
|
847
847
|
### Phase 1 — Assertions and pass/fail (no breaking interaction changes)
|
|
848
848
|
|
|
@@ -906,6 +906,31 @@ addresses. Nothing in phase 4 deploys anywhere.
|
|
|
906
906
|
area policy; shrink `e2e/lib`/`e2e/target` into `e2e/provision/`; update
|
|
907
907
|
`e2e/README.md` and AGENTS.md smoke-test guidance to point at `eve eval`.
|
|
908
908
|
|
|
909
|
+
> **Decisions made during the phase-4 migration (June 2026):**
|
|
910
|
+
>
|
|
911
|
+
> - **Hybrid mock/real split.** Evals default to the deterministic mock
|
|
912
|
+
> (`requires: ["mockModels"]`). Behaviors the mock cannot express — the
|
|
913
|
+
> built-in `agent` tool, Workflow fan-out, provider-executed `web_search`,
|
|
914
|
+
> url-file attachment reads — live in separate `real-model`-tagged eval
|
|
915
|
+
> files run against a non-mock server only when model credentials are
|
|
916
|
+
> present (provisioners log a visible skip otherwise). Net: real-model CI
|
|
917
|
+
> legs dropped from ~156 to ~7 cases.
|
|
918
|
+
> - **Code-mode e2e dropped entirely.** No `EVE_EXPERIMENTAL_CODE_MODE`
|
|
919
|
+
> matrix dimension and no codemode group port; the `agent-codemode` fixture
|
|
920
|
+
> was deleted. Purpose-built code-mode fixtures/evals will be designed
|
|
921
|
+
> later.
|
|
922
|
+
> - **Mock adapter extensions** (in `eve`): anchored quoted-span tool-input
|
|
923
|
+
> synthesis, exact-reply directives in current-turn user text (proves
|
|
924
|
+
> clientContext delivery), embedded static-skill advert discovery, and
|
|
925
|
+
> history-derived skill-activation tracking.
|
|
926
|
+
> - **Coverage consciously dropped:** semantic multi-turn recall and the
|
|
927
|
+
> channel-rekey marker recall (pure model behaviors), `subagent-output-schema`
|
|
928
|
+
> moved to the real leg, and the eval-CLI self-smoke (`evals.ts`) — now the
|
|
929
|
+
> CI mechanism itself.
|
|
930
|
+
> - Sidecar fake providers expose a `GET /_probe/requests` inspection
|
|
931
|
+
> endpoint (`e2e/lib/http-test-server.ts`) so evals assert captured
|
|
932
|
+
> traffic across the provisioner process boundary.
|
|
933
|
+
|
|
909
934
|
### Phase 5 — Remote leg: deploy fixtures and eval the deployment
|
|
910
935
|
|
|
911
936
|
After phase 4, CI gains a second e2e variant that runs the **same eval
|
|
@@ -918,7 +943,7 @@ skip.
|
|
|
918
943
|
preview env sets `EVE_MOCK_AUTHORED_MODELS=1` (determinism, zero model
|
|
919
944
|
spend) and the workflow backend credentials.
|
|
920
945
|
16. CI job: deploy the preview, capture the URL, then
|
|
921
|
-
`eve eval --strict --
|
|
946
|
+
`eve eval --strict --junit ... --url "$DEPLOY_URL"` with
|
|
922
947
|
`VERCEL_AUTOMATION_BYPASS_SECRET` in the job env (the eval client's
|
|
923
948
|
existing remote-auth cascade handles protection bypass / OIDC). No
|
|
924
949
|
`--no-skips` on this leg: schedule evals (`devRoutes`) and local-sidecar
|
|
@@ -30,7 +30,7 @@ Read in this order:
|
|
|
30
30
|
14. [TypeScript Client](./client/overview.mdx)
|
|
31
31
|
15. [Subagents](./subagents.mdx)
|
|
32
32
|
16. [Schedules](./schedules.mdx)
|
|
33
|
-
17. [Evals](./
|
|
33
|
+
17. [Evals](./evals/overview.mdx)
|
|
34
34
|
18. [Auth And Route Protection](./advanced/auth-and-route-protection.md)
|
|
35
35
|
19. [Vercel Deployment](./advanced/deployment.md)
|
|
36
36
|
20. [CLI, Build, And Debugging](./reference/cli.md)
|
|
@@ -184,7 +184,7 @@ export default defineChannel({
|
|
|
184
184
|
|
|
185
185
|
## Replace `placeholderAuth` before production
|
|
186
186
|
|
|
187
|
-
`
|
|
187
|
+
`eve init` scaffolds `agent/channels/eve.ts` with a `placeholderAuth()` guardrail:
|
|
188
188
|
|
|
189
189
|
```ts
|
|
190
190
|
import { eveChannel } from "eve/channels/eve";
|
|
@@ -162,4 +162,4 @@ When `eve build` fails on discovery errors, the CLI prints the full diagnostics
|
|
|
162
162
|
- [`agent.ts`](../agent-config)
|
|
163
163
|
- [Hooks](./hooks): observe the runtime event stream
|
|
164
164
|
- [Dev TUI](./dev-tui): drive the agent locally
|
|
165
|
-
- [Evals](
|
|
165
|
+
- [Evals](../evals/overview): repeatable scored checks
|
|
@@ -36,7 +36,7 @@ The `auth` option decides who can call these routes. The built-in helpers are me
|
|
|
36
36
|
|
|
37
37
|
Neither admits browser users or external clients in production. For a public app, wire the channel to your own auth (Clerk, Auth.js, or your own OIDC/JWT verification).
|
|
38
38
|
|
|
39
|
-
`
|
|
39
|
+
`eve init` scaffolds an `agent/channels/eve.ts` with a production placeholder so you replace it before going live. The generated channel allows Vercel OIDC and localhost, and includes `placeholderAuth()`, which returns a setup-focused 401 in production until you swap it for real auth. If you delete the file, Eve falls back to `[localDev(), vercelOidc()]`, which still does not admit browser users in production.
|
|
40
40
|
|
|
41
41
|
For the full auth model and helper list, see [Auth & route protection](../advanced/auth-and-route-protection).
|
|
42
42
|
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Cases"
|
|
3
|
+
description: "Author prompt evals, script multi-turn evals with run(ctx), and fan one file out over a dataset."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
Each eval file is one graded case: the runner executes it against the target, captures every event, and applies [checks](./checks) and [scores](./scores) to the result. An eval is either a prompt eval (`input`) or a scripted eval (`run`).
|
|
7
|
+
|
|
8
|
+
## Prompt evals
|
|
9
|
+
|
|
10
|
+
Prompt evals pair an `input` with an optional `expected`. `input` can be a string or an object (objects are serialized with `JSON.stringify`); the runner sends it as a single user turn. Omitting `expected` is handy when you only care about behavior:
|
|
11
|
+
|
|
12
|
+
```ts title="evals/weather/brooklyn-forecast.eval.ts"
|
|
13
|
+
import { defineEval } from "eve/evals";
|
|
14
|
+
import { Checks } from "eve/evals/checks";
|
|
15
|
+
|
|
16
|
+
export default defineEval({
|
|
17
|
+
input: "What is the weather in Brooklyn?",
|
|
18
|
+
expected: "Sunny",
|
|
19
|
+
checks: [Checks.didNotFail()],
|
|
20
|
+
scores: [],
|
|
21
|
+
});
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
```ts title="evals/weather/no-tools-for-greetings.eval.ts"
|
|
25
|
+
import { defineEval } from "eve/evals";
|
|
26
|
+
import { Checks } from "eve/evals/checks";
|
|
27
|
+
|
|
28
|
+
export default defineEval({
|
|
29
|
+
input: "Hello!",
|
|
30
|
+
checks: [Checks.didNotFail(), Checks.toolNotCalled("get_weather")],
|
|
31
|
+
scores: [],
|
|
32
|
+
});
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Organizing with directories
|
|
36
|
+
|
|
37
|
+
Identity is the file path, so directories are the grouping mechanism. `evals/weather/brooklyn-forecast.eval.ts` gets the id `weather/brooklyn-forecast`, and `eve eval weather` runs everything under `evals/weather/`. Shared constants and helpers live in sibling non-eval files (any name that doesn't end in `.eval.ts`):
|
|
38
|
+
|
|
39
|
+
```text
|
|
40
|
+
evals/
|
|
41
|
+
├── weather/
|
|
42
|
+
│ ├── shared.ts # helpers — not an eval
|
|
43
|
+
│ ├── brooklyn-forecast.eval.ts
|
|
44
|
+
│ └── no-tools-for-greetings.eval.ts
|
|
45
|
+
└── smoke.eval.ts
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Datasets: exporting an array
|
|
49
|
+
|
|
50
|
+
To fan one file out over a dataset, default-export an array of `defineEval(...)` values. Eval modules are ESM, so top-level `await` can load anything. Ids derive from the file's path-based id plus a zero-padded index suffix (`sql/0000`, `sql/0001`, …, in array order). The loaders (`loadJson`, `loadYaml` from `eve/evals/loaders`) read and parse fixture files from paths relative to the app root:
|
|
51
|
+
|
|
52
|
+
```ts title="evals/sql.eval.ts"
|
|
53
|
+
import { defineEval } from "eve/evals";
|
|
54
|
+
import { loadYaml } from "eve/evals/loaders";
|
|
55
|
+
import { Text } from "eve/evals/scores";
|
|
56
|
+
|
|
57
|
+
const doc = await loadYaml("evals/data/cases.yaml");
|
|
58
|
+
const rows = doc.evals as readonly { task: string; prompt: string; sql: string }[];
|
|
59
|
+
|
|
60
|
+
export default rows.map((row) =>
|
|
61
|
+
defineEval({
|
|
62
|
+
description: row.task,
|
|
63
|
+
input: row.prompt,
|
|
64
|
+
expected: row.sql,
|
|
65
|
+
scores: [Text.exact()],
|
|
66
|
+
}),
|
|
67
|
+
);
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
The loaders are meant for fixtures, not runtime agent code.
|
|
71
|
+
|
|
72
|
+
## Scripted evals
|
|
73
|
+
|
|
74
|
+
Scripted evals define `run(ctx)` instead of `input`. Use them for smoke-style behavior: multi-turn branching, HITL approvals, structured output, attachments, and multiple independent sessions.
|
|
75
|
+
|
|
76
|
+
```ts title="evals/approve-tool.eval.ts"
|
|
77
|
+
import { defineEval } from "eve/evals";
|
|
78
|
+
import { Checks } from "eve/evals/checks";
|
|
79
|
+
|
|
80
|
+
export default defineEval({
|
|
81
|
+
async run({ session }) {
|
|
82
|
+
await session.send("run `pwd`");
|
|
83
|
+
session.expectInputRequests({ toolName: "bash" });
|
|
84
|
+
const turn = await session.respondAll("approve");
|
|
85
|
+
return turn.message;
|
|
86
|
+
},
|
|
87
|
+
checks: [Checks.didNotFail(), Checks.toolCalled("bash", { input: { command: /pwd/ } })],
|
|
88
|
+
scores: [],
|
|
89
|
+
});
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
The return value of `run` becomes the output that scorers grade (set `parseOutput` to transform that raw result before it is graded). If `run` throws, the eval is marked failed and the error message is recorded in the result.
|
|
93
|
+
|
|
94
|
+
## The session API
|
|
95
|
+
|
|
96
|
+
`ctx.session` is the primary `EveEvalSession`; `ctx.newSession()` creates another independent session against the same target:
|
|
97
|
+
|
|
98
|
+
- `session.send(input)` sends a turn and waits for it to settle. It accepts the same input as `ClientSession.send()` — a string or a structured message.
|
|
99
|
+
- `session.sendFile(text, path, mediaType?)` attaches a local file as a data URL.
|
|
100
|
+
- `session.expectInputRequests(filter?)` asserts the previous turn parked on HITL input and returns the pending requests.
|
|
101
|
+
- `session.respondAll(optionId)` answers every pending input request with the same option and sends the responses as the next turn.
|
|
102
|
+
- `session.events` is the full typed event stream captured so far.
|
|
103
|
+
|
|
104
|
+
Each `send` resolves to an `EveEvalTurn` carrying the turn's `message`, `events`, and status. `turn.expectOk()` throws only when the turn ended in failure — a session left open for a next message is the normal end state of a successful turn.
|
|
105
|
+
|
|
106
|
+
Events from every eval session are captured in the result and artifacts. `ctx.log(message)` records debug lines into the eval artifact; `--verbose` also streams them to stdout as evals run.
|
|
107
|
+
|
|
108
|
+
For driving sessions created outside the eval — by a channel webhook or a schedule — see [Targets and requirements](./targets).
|
|
109
|
+
|
|
110
|
+
## What to read next
|
|
111
|
+
|
|
112
|
+
- [Checks](./checks): assert on what the eval did
|
|
113
|
+
- [Scores](./scores): grade how well it did it
|
|
114
|
+
- [TypeScript client](../client/messages): the send/turn protocol eval sessions build on
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Checks"
|
|
3
|
+
description: "Hard assertions over runs, tool calls, and output — any failed check fails the eval and the exit code."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
Checks are hard assertions. Any failed check marks the eval failed and `eve eval` exits non-zero. Use them for the things that must hold — the run completed, the right tool ran, the output parses. For graded, non-fatal signals, use [scores](./scores) instead.
|
|
7
|
+
|
|
8
|
+
## Built-in checks
|
|
9
|
+
|
|
10
|
+
Built-in checks live on `eve/evals/checks`:
|
|
11
|
+
|
|
12
|
+
| Check | Asserts |
|
|
13
|
+
| ------------------------------------------------------------- | --------------------------------------------------------------------------------- |
|
|
14
|
+
| `Checks.completed()` | The run did not fail and did not park on unanswered HITL input |
|
|
15
|
+
| `Checks.waiting()` | The run parked on HITL input (for approval-shaped evals) |
|
|
16
|
+
| `Checks.didNotFail()` | No terminal failure and no `turn.failed`/`step.failed` events (parked runs pass) |
|
|
17
|
+
| `Checks.messageIncludes(token)` | Joined assistant text contains `token` (string or RegExp) |
|
|
18
|
+
| `Checks.outputEquals(value)` / `Checks.outputMatches(schema)` | Deep equality / Standard Schema (e.g. Zod) validation of the parsed output |
|
|
19
|
+
| `Checks.toolCalled(name, opts?)` | A matching tool call happened (`input`, `output`, `isError`, `times` constraints) |
|
|
20
|
+
| `Checks.toolNotCalled(name)` | No call to `name` |
|
|
21
|
+
| `Checks.toolOrder([...names])` | Tool names appear in order (other calls may interleave) |
|
|
22
|
+
| `Checks.noFailedActions()` | No tool, subagent, or skill action reported a failure |
|
|
23
|
+
| `Checks.subagentCalled(name, opts?)` | A subagent delegation happened (`remoteUrl`, `output` constraints) |
|
|
24
|
+
| `Checks.event(predicate, label)` | Escape hatch: any predicate over the typed event stream |
|
|
25
|
+
|
|
26
|
+
## Matchers
|
|
27
|
+
|
|
28
|
+
Matcher options accept a literal (objects are matched partially and deeply), a RegExp, or a function. A function that returns a boolean acts as a predicate; a function that returns any other value supplies the value to compare against (handy for runner-assigned values like environment-provided URLs):
|
|
29
|
+
|
|
30
|
+
```ts
|
|
31
|
+
Checks.toolCalled("bash", { input: { command: /^pwd/ }, isError: false, times: 1 });
|
|
32
|
+
Checks.subagentCalled("weather", {
|
|
33
|
+
remoteUrl: () => process.env.WEATHER_AGENT_URL!,
|
|
34
|
+
output: /72F/,
|
|
35
|
+
});
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Custom checks
|
|
39
|
+
|
|
40
|
+
A custom check is a plain function receiving `{ evaluation, result, target }` and returning `{ name, passed, message? }`:
|
|
41
|
+
|
|
42
|
+
```ts
|
|
43
|
+
import type { EveEvalCheck } from "eve/evals/checks";
|
|
44
|
+
|
|
45
|
+
const repliedFast: EveEvalCheck = ({ result }) => ({
|
|
46
|
+
name: "replied-fast",
|
|
47
|
+
passed: result.durationMs < 5_000,
|
|
48
|
+
message: `took ${result.durationMs}ms`,
|
|
49
|
+
});
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Write a `message` for the failing path — it is what the console reporter prints under the eval line and what lands in JUnit output.
|
|
53
|
+
|
|
54
|
+
## Run status and parking
|
|
55
|
+
|
|
56
|
+
`result.status` is `"waiting"` whenever the session is left open for a next message — the normal end state of a successful turn. Parking on unanswered HITL input is tracked separately as `result.derived.parked`, which is what `Checks.completed()` and `Checks.waiting()` key off.
|
|
57
|
+
|
|
58
|
+
`result.derived` also carries typed tool calls (name, input, output, error state), subagent calls, and HITL input requests, so custom checks rarely need to walk the raw event stream. When they do, `Checks.event(predicate, label)` covers it.
|
|
59
|
+
|
|
60
|
+
## What to read next
|
|
61
|
+
|
|
62
|
+
- [Scores](./scores): graded signals with thresholds
|
|
63
|
+
- [Cases](./cases): where checks attach
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Evals"
|
|
3
|
+
description: "Define repeatable scored checks for an Eve agent with defineEval and run them with eve eval."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
An eval is a scored check that runs your agent against real sessions and grades the result. Use it to catch regressions when you change a prompt or a tool: compare the output against expected text, JSON, SQL, or behavior, and optionally ship the results to Braintrust.
|
|
7
|
+
|
|
8
|
+
Evals exercise the same HTTP surface your users hit. The runner boots (or targets) a real agent server, drives sessions through the [TypeScript client](../client/overview) protocol, and grades what comes back — so a passing eval means the agent actually booted, accepted a request, and produced the result you asserted.
|
|
9
|
+
|
|
10
|
+
## `defineEval`
|
|
11
|
+
|
|
12
|
+
Eve discovers evals under the app-root `evals/` directory, in `.eval.ts` files. Each file is exactly one eval — one graded case. The file path is the eval's identity, so you don't author an `id` or `name`; directories group related evals (`evals/weather/brooklyn-forecast.eval.ts` → id `weather/brooklyn-forecast`).
|
|
13
|
+
|
|
14
|
+
```text
|
|
15
|
+
my-agent/
|
|
16
|
+
├── agent/
|
|
17
|
+
├── evals/
|
|
18
|
+
│ ├── evals.config.ts
|
|
19
|
+
│ ├── smoke.eval.ts
|
|
20
|
+
│ └── weather/
|
|
21
|
+
│ ├── brooklyn-forecast.eval.ts
|
|
22
|
+
│ └── no-tools-for-greetings.eval.ts
|
|
23
|
+
└── package.json
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
```ts title="evals/weather/brooklyn-forecast.eval.ts"
|
|
27
|
+
import { defineEval } from "eve/evals";
|
|
28
|
+
import { Checks } from "eve/evals/checks";
|
|
29
|
+
import { Run } from "eve/evals/scores";
|
|
30
|
+
|
|
31
|
+
export default defineEval({
|
|
32
|
+
description: "Basic message and tool-usage coverage for the weather agent.",
|
|
33
|
+
input: "What is the weather in Brooklyn?",
|
|
34
|
+
expected: "Sunny",
|
|
35
|
+
checks: [Checks.didNotFail(), Checks.toolCalled("get_weather")],
|
|
36
|
+
scores: [Run.didNotFail()],
|
|
37
|
+
});
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Every eval needs `scores` (an empty array is fine) and either `input` (a prompt sent as a single turn) or `run` (an imperative script). The rest are optional: `description`, `expected`, `checks`, `requires`, `parseOutput`, `model`, `thresholds`, `modelOptions`, `tags`, `metadata`, `timeoutMs`, `reporters`. The init template adds `evals/**/*.ts` to `tsconfig.json`, so your eval code type-checks alongside the app.
|
|
41
|
+
|
|
42
|
+
## `evals.config.ts`
|
|
43
|
+
|
|
44
|
+
Every `evals/` directory needs exactly one `evals.config.ts` at its root. It declares the defaults every eval shares — most importantly the `model` used by model-backed scorers, so you don't repeat it in each file:
|
|
45
|
+
|
|
46
|
+
```ts title="evals/evals.config.ts"
|
|
47
|
+
import { defineEvalConfig } from "eve/evals";
|
|
48
|
+
import { Braintrust } from "eve/evals/reporters";
|
|
49
|
+
|
|
50
|
+
export default defineEvalConfig({
|
|
51
|
+
model: "openai/gpt-5.4-mini",
|
|
52
|
+
reporters: [Braintrust({ projectName: "my-agent" })],
|
|
53
|
+
});
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
`model` is required; `reporters`, `maxConcurrency`, and `timeoutMs` are optional. Config `reporters` observe every eval in the run — set one `Braintrust()` here instead of adding it to each eval. CLI flags (`--max-concurrency`, `--timeout`) and per-eval values take precedence over the config defaults. An eval that needs a different judge model overrides it with its own `model`; otherwise the config `model` applies.
|
|
57
|
+
|
|
58
|
+
## Two grading tiers
|
|
59
|
+
|
|
60
|
+
Evals are graded on two distinct tiers:
|
|
61
|
+
|
|
62
|
+
- **[Checks](./checks) are hard assertions.** Any failed check marks the eval failed and `eve eval` exits non-zero. Use them for the things that must hold — the run completed, the right tool ran, the output parses.
|
|
63
|
+
- **[Scores](./scores) are soft data.** They land in reports and artifacts, and a below-threshold score marks the eval `scored` — visible but not fatal, unless you pass `--strict`.
|
|
64
|
+
|
|
65
|
+
## Run it
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
eve eval # run all discovered evals against a local dev server
|
|
69
|
+
eve eval weather # run one eval, or every eval under evals/weather/
|
|
70
|
+
eve eval --url https://<app> # target an existing server or deployment
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Exit code `0` means every eval passed its checks. See [Running evals](./running) for the full flag list, exit codes, and CI guidance.
|
|
74
|
+
|
|
75
|
+
## A good baseline
|
|
76
|
+
|
|
77
|
+
Most apps do fine with a few small smoke evals. Assert behavior with `Checks.didNotFail()` plus one or two content checks, keep dataset fixtures in `evals/data/`, and only reach for Braintrust once you actually need shared result review or experiment history. In CI, run `eve eval --strict` so threshold misses fail the build too.
|
|
78
|
+
|
|
79
|
+
The rest of this section covers each piece:
|
|
80
|
+
|
|
81
|
+
- [Cases](./cases): prompt evals, scripted multi-turn evals, and dataset fan-out
|
|
82
|
+
- [Checks](./checks): hard assertions over runs, tools, and output
|
|
83
|
+
- [Scores](./scores): deterministic and LLM-judged scorers, with thresholds
|
|
84
|
+
- [Targets and requirements](./targets): local vs remote targets, and gating evals on capabilities
|
|
85
|
+
- [Reporters](./reporters): Braintrust experiments and JUnit XML
|
|
86
|
+
- [Running evals](./running): the `eve eval` CLI, exit codes, and artifacts
|
|
87
|
+
|
|
88
|
+
## What to read next
|
|
89
|
+
|
|
90
|
+
- [Cases](./cases): author your first evals
|
|
91
|
+
- [Tools](../tools): the surface most evals assert on
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Reporters"
|
|
3
|
+
description: "Ship eval results to Braintrust experiments or JUnit XML — Eve runs and scores everything itself."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
Eve runs and scores everything itself; reporters just ship the results out. The CLI prints a console summary by default — one line per eval, failed checks with their messages — and reporters from `eve/evals/reporters` add destinations on top.
|
|
7
|
+
|
|
8
|
+
Reporters attach in two places. Declare them in `evals.config.ts` to observe **every** eval in the run — the usual choice for a shared destination like one Braintrust experiment, so you don't repeat the reporter in each file. Or list them on an individual eval's `reporters` to scope a destination to that eval (or to a group of evals that share one instance).
|
|
9
|
+
|
|
10
|
+
## Braintrust
|
|
11
|
+
|
|
12
|
+
`Braintrust(...)` uploads eval results to Braintrust experiments. Put one instance in the config so it covers the whole run:
|
|
13
|
+
|
|
14
|
+
```ts title="evals/evals.config.ts"
|
|
15
|
+
import { defineEvalConfig } from "eve/evals";
|
|
16
|
+
import { Braintrust } from "eve/evals/reporters";
|
|
17
|
+
|
|
18
|
+
export default defineEvalConfig({
|
|
19
|
+
model: "openai/gpt-5.4-mini",
|
|
20
|
+
reporters: [Braintrust({ projectName: "weather-agent" })],
|
|
21
|
+
});
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Need a destination for only some evals? Attach it per eval instead:
|
|
25
|
+
|
|
26
|
+
```ts title="evals/brooklyn-forecast.eval.ts"
|
|
27
|
+
import { defineEval } from "eve/evals";
|
|
28
|
+
import { Braintrust } from "eve/evals/reporters";
|
|
29
|
+
import { Run } from "eve/evals/scores";
|
|
30
|
+
|
|
31
|
+
export default defineEval({
|
|
32
|
+
input: "What is the weather in Brooklyn?",
|
|
33
|
+
scores: [Run.didNotFail()],
|
|
34
|
+
reporters: [Braintrust({ projectName: "weather-agent" })],
|
|
35
|
+
});
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
The reporter config takes an optional `projectName` and `experimentName`, plus a base experiment (by name or id) to diff against. Checks log as binary scores under a `check:` prefix so experiments diff check regressions the same way they diff score regressions. Eval `metadata` rides along to reporters.
|
|
39
|
+
|
|
40
|
+
A reporter instance observes the evals that reference it: share one instance across several evals — the config, a `shared.ts` export, or every entry of a dataset array — and their results land in a single experiment. If an eval also lists a reporter instance that is already declared in the config, that eval is still reported only once.
|
|
41
|
+
|
|
42
|
+
Braintrust needs its SDK installed in the app and credentials in the environment. Pass `--skip-report` to run the eval without shipping results (this also suppresses config reporters) — useful locally when iterating.
|
|
43
|
+
|
|
44
|
+
## JUnit
|
|
45
|
+
|
|
46
|
+
`JUnit({ filePath })` writes JUnit XML for CI annotations. The `--junit <path>` CLI flag does the same thing without touching the eval file, which is usually the better fit — CI owns the output path, not the eval:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
eve eval --strict --junit .eve/junit.xml
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Each eval becomes one `<testcase>` named by its path-derived id; failed checks and execution errors land as failure messages on the matching test case, so CI surfaces them inline.
|
|
53
|
+
|
|
54
|
+
## Custom reporters
|
|
55
|
+
|
|
56
|
+
A reporter implements the `EvalReporter` interface from `eve/evals/reporters` and receives the same structured results the built-ins do. Reach for one only when a destination isn't covered — the per-run artifacts under `.eve/evals/` already capture everything for ad-hoc inspection.
|
|
57
|
+
|
|
58
|
+
## What to read next
|
|
59
|
+
|
|
60
|
+
- [Running evals](./running): console output, `--json`, and artifacts
|
|
61
|
+
- [Scores](./scores): what the reported numbers mean
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Running evals"
|
|
3
|
+
description: "The eve eval CLI: flags, filters, exit codes, artifacts, and how to wire evals into CI."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
`eve eval` discovers every `.eval.ts` file under `evals/`, boots a local dev server (or targets a remote one), runs the evals concurrently, and prints a per-eval summary.
|
|
7
|
+
|
|
8
|
+
```bash
|
|
9
|
+
eve eval # run all discovered evals locally
|
|
10
|
+
eve eval weather smoke # run selected evals (an id, or a directory prefix)
|
|
11
|
+
eve eval --url https://<app> # target a remote app instead of a local host
|
|
12
|
+
eve eval --mock-models # local dev target uses deterministic mock models
|
|
13
|
+
eve eval --tag fast # only evals carrying a tag
|
|
14
|
+
eve eval --strict # below-threshold scores also fail the exit code
|
|
15
|
+
eve eval --no-skips # unmet requirements fail instead of skipping
|
|
16
|
+
eve eval --timeout 60000 # per-eval timeout in milliseconds
|
|
17
|
+
eve eval --max-concurrency 4 # cap concurrent eval executions (default 8)
|
|
18
|
+
eve eval --junit .eve/junit.xml # write JUnit XML
|
|
19
|
+
eve eval --list # print discovered evals without running
|
|
20
|
+
eve eval --verbose # stream per-eval ctx.log lines to stdout
|
|
21
|
+
eve eval --json # machine-readable output
|
|
22
|
+
eve eval --skip-report # skip config and eval-defined reporters (e.g. Braintrust)
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Positional ids match exactly or by directory prefix: `eve eval weather` runs `evals/weather.eval.ts`, every eval under `evals/weather/`, and every entry of an array-exported `weather.eval.ts`.
|
|
26
|
+
|
|
27
|
+
## Exit codes
|
|
28
|
+
|
|
29
|
+
| Code | Means |
|
|
30
|
+
| ---- | -------------------------------------------------------------------------------- |
|
|
31
|
+
| `0` | Every eval passed its checks (and thresholds, under `--strict`) |
|
|
32
|
+
| `1` | Any eval failed — a failed check, an execution error, or a strict threshold miss |
|
|
33
|
+
| `2` | Configuration error |
|
|
34
|
+
|
|
35
|
+
Unmet [requirements](./targets) skip visibly without affecting the exit code unless you pass `--no-skips`.
|
|
36
|
+
|
|
37
|
+
## Artifacts
|
|
38
|
+
|
|
39
|
+
Each run drops artifacts under `.eve/evals/<timestamp>/`: a run `summary.json`, a `results.jsonl` index, and an `evals/` subdirectory holding per-eval check results, verdicts, captured event streams, and `ctx.log` lines. The console output stays tight on purpose; when an eval fails, the artifact has the full story.
|
|
40
|
+
|
|
41
|
+
## CI
|
|
42
|
+
|
|
43
|
+
A solid CI invocation is strict, deterministic, and machine-reportable:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
eve eval --strict --mock-models --junit .eve/junit.xml
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
- `--strict` turns threshold misses into failures, so score regressions block the merge.
|
|
50
|
+
- `--mock-models` keeps the default leg deterministic and credential-free. Put real-model evals in their own files gated on `requires: ["env:..."]`, and add `--no-skips` on legs that must prove those ran.
|
|
51
|
+
- `--junit` gives the CI provider per-eval annotations; upload the `.eve/evals/` directory as a failure artifact for the full event streams.
|
|
52
|
+
|
|
53
|
+
Against a deployed app, swap `--mock-models` for `--url`:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
eve eval --strict --url "$DEPLOY_URL" --junit .eve/junit.xml
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## What to read next
|
|
60
|
+
|
|
61
|
+
- [Targets and requirements](./targets): what `--url`, `--mock-models`, and `--no-skips` interact with
|
|
62
|
+
- [Reporters](./reporters): Braintrust and JUnit output
|
|
63
|
+
- [CLI reference](../reference/cli): the rest of the `eve` CLI
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Scores"
|
|
3
|
+
description: "Grade evals with deterministic scorers or LLM judges, and gate them with thresholds."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
Scores are soft data. They land in reports and artifacts, and a below-threshold score marks the eval `scored` — visible but not fatal, unless you pass `--strict`. Use them to grade quality fractionally where a [check](./checks) would assert it absolutely.
|
|
7
|
+
|
|
8
|
+
## Choosing a scorer
|
|
9
|
+
|
|
10
|
+
Scorers live in namespaces on `eve/evals/scores`. Pick the cheapest one that captures what "correct" means here. The deterministic scorers run instantly for free; an LLM judge runs once per eval and burns tokens, so save it for when nothing simpler will do.
|
|
11
|
+
|
|
12
|
+
| Need | Use |
|
|
13
|
+
| ---------------------------------------------------------- | --------------------------------------------------------------------------------------------- |
|
|
14
|
+
| Grade agent behavior (run succeeded, used the right tools) | `Run.didNotFail()`, `Run.usedTool(name, opts?)`, `Run.usedNoTools()`, `Run.maxToolCalls(max)` |
|
|
15
|
+
| Exact or substring text match                              | `Text.exact()`, `Text.includes()`                                                             |
|
|
16
|
+
| Fuzzy text match (typos, whitespace) | `Text.levenshtein()` |
|
|
17
|
+
| Exact JSON match | `Json.deepEqual()` |
|
|
18
|
+
| Exact SQL match (after normalization) | `Sql.exactNormalized()` |
|
|
19
|
+
| LLM-judged factual correctness vs an expected answer | `Autoevals.factuality()` |
|
|
20
|
+
| LLM-judged summary quality | `Autoevals.summary()` |
|
|
21
|
+
| LLM-judged SQL semantic equivalence | `Autoevals.sql()` |
|
|
22
|
+
| LLM-judged free-form criteria (no `expected` to match) | `Autoevals.closedQA({ criteria: "..." })` |
|
|
23
|
+
|
|
24
|
+
Each scorer gets the flattened `input`, `output`, and `expected` strings along with the full eval and task result — including derived facts: typed tool calls (name, input, output, error state), subagent calls, HITL input requests, and whether the run parked. `Run.usedTool` accepts the same matcher options as `Checks.toolCalled`. Return `null` from a scorer to skip it.
|
|
25
|
+
|
|
26
|
+
## Thresholds
|
|
27
|
+
|
|
28
|
+
By default a scorer has to hit an exact match to pass. `thresholds` loosens that, mapping each scorer name to the minimum score you'll accept:
|
|
29
|
+
|
|
30
|
+
```ts
|
|
31
|
+
import { defineEval } from "eve/evals";
|
|
32
|
+
import { Run, Text } from "eve/evals/scores";
|
|
33
|
+
|
|
34
|
+
export default defineEval({
|
|
35
|
+
input: "Hello",
|
|
36
|
+
expected: "Hello",
|
|
37
|
+
scores: [Run.didNotFail(), Text.includes()],
|
|
38
|
+
thresholds: { "run.didNotFail": 1, "text.includes": 0.5 },
|
|
39
|
+
});
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
An eval below a threshold gets the `scored` verdict — reported, but only fatal under `eve eval --strict`.
|
|
43
|
+
|
|
44
|
+
## The scorer model
|
|
45
|
+
|
|
46
|
+
Model-backed scorers (the `Autoevals` wrappers) need a judge model — the scorer model, not the agent's. Eve only uses it for scoring, never to swap out the agent under test. The default lives in [`evals.config.ts`](./overview#evalsconfigts) as the required `model`, so most evals inherit it without setting anything. Pass a string id (e.g. `"anthropic/claude-opus-4.8"`) to route through the Vercel AI Gateway, or an AI SDK model instance to use it directly.
|
|
47
|
+
|
|
48
|
+
Override the default on a single eval by setting that eval's own `model`. For provider-specific scorer-model settings, use `modelOptions.providerOptions`. Individual Autoevals scorers can also take their own `model` / `modelOptions`, which win over both the eval and config defaults.
|
|
49
|
+
|
|
50
|
+
## Concurrency and timeouts
|
|
51
|
+
|
|
52
|
+
`timeoutMs` bounds one eval's execution. Precedence, highest first: `eve eval --timeout <ms>` (applies to the whole run), then the eval's own `timeoutMs`, then the default in `evals.config.ts`. The runner executes up to 8 evals at once — set a default `maxConcurrency` in `evals.config.ts` or pass `--max-concurrency <n>` (which wins) to change that, and lower it when evals contend for a shared resource, such as a rate-limited connection or a sandbox-heavy fixture.
|
|
53
|
+
|
|
54
|
+
## What to read next
|
|
55
|
+
|
|
56
|
+
- [Checks](./checks): hard assertions that fail the build
|
|
57
|
+
- [Reporters](./reporters): ship scores to Braintrust experiments
|