eve 0.6.0-beta.15 → 0.6.0-beta.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +1 -1
- package/dist/docs/public/advanced/dev-tui.md +4 -2
- package/dist/docs/public/evals/cases.mdx +58 -59
- package/dist/docs/public/evals/checks.mdx +5 -7
- package/dist/docs/public/evals/overview.mdx +34 -18
- package/dist/docs/public/evals/reporters.mdx +23 -7
- package/dist/docs/public/evals/running.mdx +15 -14
- package/dist/docs/public/evals/scores.mdx +10 -9
- package/dist/docs/public/evals/targets.mdx +12 -17
- package/dist/docs/public/reference/cli.md +30 -13
- package/dist/docs/public/reference/typescript-api.md +2 -1
- package/dist/src/cli/commands/channels.js +1 -1
- package/dist/src/cli/commands/deploy.d.ts +21 -0
- package/dist/src/cli/commands/deploy.js +1 -0
- package/dist/src/cli/commands/init.js +1 -1
- package/dist/src/cli/commands/link.d.ts +21 -0
- package/dist/src/cli/commands/link.js +1 -0
- package/dist/src/cli/commands/preconditions.d.ts +7 -0
- package/dist/src/cli/commands/preconditions.js +1 -0
- package/dist/src/cli/commands/register-project-commands.d.ts +12 -0
- package/dist/src/cli/commands/register-project-commands.js +1 -0
- package/dist/src/cli/dev/tui/agent-header.d.ts +2 -0
- package/dist/src/cli/dev/tui/agent-header.js +1 -1
- package/dist/src/cli/dev/tui/blocks.d.ts +1 -1
- package/dist/src/cli/dev/tui/blocks.js +2 -2
- package/dist/src/cli/dev/tui/command-typeahead.d.ts +47 -0
- package/dist/src/cli/dev/tui/command-typeahead.js +1 -0
- package/dist/src/cli/dev/tui/errors.d.ts +18 -0
- package/dist/src/cli/dev/tui/errors.js +1 -1
- package/dist/src/cli/dev/tui/prompt-command-handler.d.ts +10 -0
- package/dist/src/cli/dev/tui/prompt-command-handler.js +1 -0
- package/dist/src/cli/dev/tui/prompt-commands.d.ts +51 -0
- package/dist/src/cli/dev/tui/prompt-commands.js +2 -0
- package/dist/src/cli/dev/tui/runner.d.ts +22 -36
- package/dist/src/cli/dev/tui/runner.js +1 -1
- package/dist/src/cli/dev/tui/setup-commands.d.ts +47 -0
- package/dist/src/cli/dev/tui/setup-commands.js +1 -0
- package/dist/src/cli/dev/tui/setup-flow.d.ts +32 -0
- package/dist/src/cli/dev/tui/setup-flow.js +1 -0
- package/dist/src/cli/dev/tui/setup-issues.d.ts +40 -0
- package/dist/src/cli/dev/tui/setup-issues.js +1 -0
- package/dist/src/cli/dev/tui/setup-panel.d.ts +95 -0
- package/dist/src/cli/dev/tui/setup-panel.js +1 -0
- package/dist/src/cli/dev/tui/terminal-renderer.d.ts +15 -0
- package/dist/src/cli/dev/tui/terminal-renderer.js +2 -2
- package/dist/src/cli/dev/tui/test/index.d.ts +1 -0
- package/dist/src/cli/dev/tui/test/index.js +1 -1
- package/dist/src/cli/dev/tui/test/mock-terminal.d.ts +1 -0
- package/dist/src/cli/dev/tui/test/mock-terminal.js +1 -1
- package/dist/src/cli/dev/tui/theme.d.ts +4 -0
- package/dist/src/cli/dev/tui/theme.js +1 -1
- package/dist/src/cli/dev/tui/tui-prompter.d.ts +20 -0
- package/dist/src/cli/dev/tui/tui-prompter.js +1 -0
- package/dist/src/cli/dev/tui/tui.js +1 -1
- package/dist/src/cli/run.d.ts +0 -1
- package/dist/src/cli/run.js +2 -2
- package/dist/src/compiler/normalize-agent-config.js +1 -1
- package/dist/src/compiler/normalize-manifest.js +1 -1
- package/dist/src/evals/cli/eval.d.ts +3 -4
- package/dist/src/evals/cli/eval.js +1 -1
- package/dist/src/evals/define-eval-config.d.ts +16 -0
- package/dist/src/evals/define-eval-config.js +1 -0
- package/dist/src/evals/define-eval.d.ts +16 -14
- package/dist/src/evals/define-eval.js +1 -1
- package/dist/src/evals/index.d.ts +2 -1
- package/dist/src/evals/index.js +1 -1
- package/dist/src/evals/requirements.d.ts +1 -2
- package/dist/src/evals/requirements.js +1 -1
- package/dist/src/evals/runner/artifacts.d.ts +7 -6
- package/dist/src/evals/runner/artifacts.js +3 -3
- package/dist/src/evals/runner/discover.d.ts +28 -7
- package/dist/src/evals/runner/discover.js +1 -1
- package/dist/src/evals/runner/execute-eval.d.ts +8 -10
- package/dist/src/evals/runner/execute-eval.js +1 -1
- package/dist/src/evals/runner/execute-task.d.ts +22 -0
- package/dist/src/evals/runner/execute-task.js +1 -0
- package/dist/src/evals/runner/reporters/braintrust.d.ts +6 -4
- package/dist/src/evals/runner/reporters/braintrust.js +2 -2
- package/dist/src/evals/runner/reporters/console.d.ts +4 -4
- package/dist/src/evals/runner/reporters/console.js +1 -1
- package/dist/src/evals/runner/reporters/junit.d.ts +1 -0
- package/dist/src/evals/runner/reporters/junit.js +3 -7
- package/dist/src/evals/runner/reporters/types.d.ts +14 -8
- package/dist/src/evals/runner/run-evals.d.ts +38 -0
- package/dist/src/evals/runner/run-evals.js +1 -0
- package/dist/src/evals/runner/verdict.d.ts +5 -5
- package/dist/src/evals/runner/verdict.js +1 -1
- package/dist/src/evals/scorers/autoevals.js +1 -1
- package/dist/src/evals/scorers/json.d.ts +3 -3
- package/dist/src/evals/scorers/json.js +1 -1
- package/dist/src/evals/types.d.ts +134 -176
- package/dist/src/harness/action-result-helpers.js +1 -1
- package/dist/src/harness/authorization.d.ts +26 -0
- package/dist/src/harness/authorization.js +1 -1
- package/dist/src/harness/emission.d.ts +12 -5
- package/dist/src/harness/emission.js +1 -1
- package/dist/src/harness/model-call-error.d.ts +12 -0
- package/dist/src/harness/model-call-error.js +1 -1
- package/dist/src/harness/step-hooks.d.ts +4 -4
- package/dist/src/harness/step-hooks.js +1 -1
- package/dist/src/harness/tool-loop.js +1 -1
- package/dist/src/harness/tools.d.ts +4 -6
- package/dist/src/harness/tools.js +1 -1
- package/dist/src/internal/application/package.js +1 -1
- package/dist/src/internal/nitro/host/ports.d.ts +8 -0
- package/dist/src/internal/nitro/host/ports.js +1 -0
- package/dist/src/internal/nitro/host/start-development-server.js +1 -1
- package/dist/src/services/dev-client/client-options.d.ts +8 -0
- package/dist/src/services/dev-client/client-options.js +1 -0
- package/dist/src/services/dev-client/runtime-artifacts.d.ts +13 -0
- package/dist/src/services/dev-client/runtime-artifacts.js +1 -0
- package/dist/src/services/dev-client.js +1 -1
- package/dist/src/setup/boxes/add-channels.d.ts +11 -1
- package/dist/src/setup/boxes/add-channels.js +2 -2
- package/dist/src/setup/boxes/apply-ai-gateway-credential.js +1 -1
- package/dist/src/setup/boxes/deploy-project.js +1 -1
- package/dist/src/setup/boxes/detect-ai-gateway.d.ts +8 -1
- package/dist/src/setup/boxes/detect-ai-gateway.js +1 -1
- package/dist/src/setup/boxes/resolve-provisioning.d.ts +8 -0
- package/dist/src/setup/boxes/resolve-provisioning.js +1 -1
- package/dist/src/setup/boxes/select-channels.d.ts +2 -0
- package/dist/src/setup/boxes/select-channels.js +1 -1
- package/dist/src/setup/boxes/select-model.d.ts +2 -0
- package/dist/src/setup/boxes/select-model.js +1 -1
- package/dist/src/setup/channel-add-conflicts.d.ts +28 -0
- package/dist/src/setup/channel-add-conflicts.js +1 -0
- package/dist/src/setup/cli/channel-setup-prompter.d.ts +10 -0
- package/dist/src/setup/cli/index.d.ts +1 -0
- package/dist/src/setup/cli/index.js +1 -1
- package/dist/src/setup/cli/select-component.d.ts +2 -2
- package/dist/src/setup/cli/select-component.js +1 -1
- package/dist/src/setup/cli/select-option-codec.d.ts +12 -0
- package/dist/src/setup/cli/select-option-codec.js +1 -0
- package/dist/src/setup/connection-connector.js +1 -1
- package/dist/src/setup/flows/channels.d.ts +43 -0
- package/dist/src/setup/flows/channels.js +1 -0
- package/dist/src/setup/flows/deploy.d.ts +40 -0
- package/dist/src/setup/flows/deploy.js +1 -0
- package/dist/src/setup/flows/in-project.d.ts +16 -0
- package/dist/src/setup/flows/in-project.js +1 -0
- package/dist/src/setup/flows/link.d.ts +43 -0
- package/dist/src/setup/flows/link.js +1 -0
- package/dist/src/setup/flows/model.d.ts +43 -0
- package/dist/src/setup/flows/model.js +1 -0
- package/dist/src/setup/flows/vercel.d.ts +30 -0
- package/dist/src/setup/flows/vercel.js +2 -0
- package/dist/src/setup/index.js +1 -1
- package/dist/src/setup/project-resolution.d.ts +18 -0
- package/dist/src/setup/project-resolution.js +1 -1
- package/dist/src/setup/prompter.d.ts +13 -0
- package/dist/src/setup/prompter.js +1 -1
- package/dist/src/setup/scaffold/channels-catalog.js +1 -1
- package/dist/src/setup/scaffold/create/project.js +2 -2
- package/dist/src/setup/scaffold/index.d.ts +1 -1
- package/dist/src/setup/scaffold/index.js +1 -1
- package/dist/src/setup/scaffold/update/channels.d.ts +8 -0
- package/dist/src/setup/scaffold/update/channels.js +2 -2
- package/dist/src/setup/scaffold/update/connections.js +2 -2
- package/dist/src/setup/scaffold/version-tokens.d.ts +11 -0
- package/dist/src/setup/scaffold/version-tokens.js +1 -0
- package/dist/src/setup/slackbot.js +1 -1
- package/dist/src/setup/vercel-project.js +1 -1
- package/package.json +1 -1
- package/dist/src/cli/dev/change-agent-model.d.ts +0 -27
- package/dist/src/cli/dev/change-agent-model.js +0 -1
- package/dist/src/evals/runner/execute-case.d.ts +0 -23
- package/dist/src/evals/runner/execute-case.js +0 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,37 @@
|
|
|
1
1
|
# eve
|
|
2
2
|
|
|
3
|
+
## 0.6.0-beta.17
|
|
4
|
+
|
|
5
|
+
### Minor Changes
|
|
6
|
+
|
|
7
|
+
- 20f0fc8: The dev TUI's `/channels` is now an action list: pick an unregistered channel to run its add flow, return to the repainted list (added channels show as locked), and leave with the Done row or Esc. Web Chat (the Next.js app) is now addable in projects that already have the scaffolded `agent/channels/eve.ts` session channel — the row locks only when the project actually depends on `next`, and the REPL channel shows as its own locked row.
|
|
8
|
+
- 20f0fc8: The dev TUI now discovers its slash commands: typing `/` opens a suggestion list above the prompt with each command's argument hint and description — `↑`/`↓` move the highlight, `Tab` completes, `Enter` runs, `Esc` dismisses — and a new `/help` command prints the full table. Unknown `/text` is still sent to the agent as a normal message.
|
|
9
|
+
- 20f0fc8: New `eve link` and `eve deploy` commands, also available inside the `eve dev` TUI as `/vercel`, `/channels`, and `/deploy` when the server runs locally. `eve link` picks the Vercel team and project and pulls AI Gateway credentials into `.env.local`. `eve deploy` links when needed and still succeeds when its production URL cannot be verified. AI Gateway authentication failures in the TUI now suggest `/vercel`, and the dev server picks up pulled credentials without a restart.
|
|
10
|
+
- 20f0fc8: The dev TUI's bare `/model` now opens the same searchable AI Gateway catalog picker that `eve init` uses — full catalog with the featured shortlist, pre-selected on the model the runtime is serving — instead of a short hand-curated list. `/model <provider/model-id>` still applies directly.
|
|
11
|
+
- 20f0fc8: The dev TUI's `/vercel` now asks which model provider you want before any linking: pick AI Gateway and connect via a project or by pasting your own `AI_GATEWAY_API_KEY` (saved to `.env.local`), or pick another provider to get wiring instructions instead.
|
|
12
|
+
|
|
13
|
+
### Patch Changes
|
|
14
|
+
|
|
15
|
+
- 20f0fc8: `eve channels add` and the dev TUI `/channels` command now run `pnpm install` after recording channels, so a running `eve dev` can load newly scaffolded channel modules (for example `@vercel/connect` for Slack) immediately instead of failing to resolve them until the next deploy.
|
|
16
|
+
- 20f0fc8: `eve dev` no longer clears the terminal scrollback when the terminal UI starts, and skips the redundant `server listening at` line in TUI mode — the header already shows the URL. Headless and REPL modes still print it.
|
|
17
|
+
- 20f0fc8: Make `eve init` install the newest Eve prerelease through the npm `beta` dist-tag, including when scaffolding Web Chat.
|
|
18
|
+
- 20f0fc8: Replace removed `create-eve` and `eve setup` guidance with the current `eve init` and `eve link` commands.
|
|
19
|
+
- 20f0fc8: Restore the `eve link` and `eve deploy` command registrations after the `eve init` migration.
|
|
20
|
+
- 20f0fc8: Keep superseded Vercel project-name warnings inside the `/vercel` panel when linking ultimately succeeds.
|
|
21
|
+
- 20f0fc8: Scaffolding channels, connections, and new projects from an unstamped dev build (for example under `pnpm dev`'s watch emit) now resolves dependency versions from the live workspace catalog instead of failing with "unstamped version token". Published builds are unchanged: they are stamped at build time, and an unstamped token outside a dev tree still fails loudly.
|
|
22
|
+
|
|
23
|
+
## 0.6.0-beta.16
|
|
24
|
+
|
|
25
|
+
### Minor Changes
|
|
26
|
+
|
|
27
|
+
- 5a6ac17: Add a required `evals/evals.config.ts` (authored with `defineEvalConfig`) that declares run-wide eval defaults: a mandatory scorer `model`, plus optional run-level `reporters`, `maxConcurrency`, and `timeoutMs`. Model-backed scorers now fall back to the config `model`, so `model` is optional on `defineEval` and a shared reporter (e.g. one `Braintrust()`) no longer needs to be repeated in every eval. CLI flags and per-eval values still take precedence over the config defaults.
|
|
28
|
+
- 5a6ac17: `defineEval` is now always a single case, with identity fully derived from the file path — `cases`, `load`, `task`, per-case `id`, and `maxConcurrency` are removed. Declare `input` or `run` (plus `expected`, `checks`, `scores`, `parseOutput`, …) at the top level, organize related evals with directory nesting (`evals/runtime/multi-turn.eval.ts` → `runtime/multi-turn`), and default-export an array of `defineEval(...)` values for dataset fan-out (ids get a zero-padded index suffix, e.g. `weather/0000`). The runner now executes eval files concurrently (default 8, `--max-concurrency`), positional `eve eval` ids match by directory prefix, `--case` is removed, reporters use a run-level lifecycle (`onRunStart`/`onEvalComplete`/`onRunComplete`), check/scorer args expose `evaluation` instead of `case`, and artifacts land under one `.eve/evals/<timestamp>/` directory per run.
|
|
29
|
+
|
|
30
|
+
### Patch Changes
|
|
31
|
+
|
|
32
|
+
- a8363e6: Fix `authorization.required` not being emitted when a tool combines `needsApproval` with interactive auth. Approval-resume auth signals are now routed through the authorization park path instead of being replayed to the model as a plain tool result.
|
|
33
|
+
- a8363e6: Authorization-pending tool results no longer expose OAuth URLs, user codes, or hook URLs to the model. Channels still receive full `authorization.required` events.
|
|
34
|
+
|
|
3
35
|
## 0.6.0-beta.15
|
|
4
36
|
|
|
5
37
|
### Minor Changes
|
package/README.md
CHANGED
|
@@ -52,7 +52,7 @@ Every authored directory has a typed helper. Import each from the matching subpa
|
|
|
52
52
|
| `eveChannel(...)`, `slackChannel(...)`, `vercelOidc(...)` | `eve/channels/eve`, `/slack`, `/auth` | reused from `channels/<name>.ts` |
|
|
53
53
|
| `defineSandbox(...)` | `eve/sandbox` | `sandbox.ts` (or `sandbox/sandbox.ts`) |
|
|
54
54
|
| `defineSchedule(...)` | `eve/schedules` | `schedules/<name>.ts` (or `schedules/<name>.md`) |
|
|
55
|
-
| `defineEval(...)`
|
|
55
|
+
| `defineEval(...)`, `defineEvalConfig(...)` | `eve/evals` | `evals/<name>.eval.ts`, `evals/evals.config.ts` |
|
|
56
56
|
|
|
57
57
|
Runtime accessors live on the subpath that owns the concern:
|
|
58
58
|
|
|
@@ -19,10 +19,12 @@ On startup the TUI prints a header for the connected agent — the model, instru
|
|
|
19
19
|
· Subagents researcher
|
|
20
20
|
· Server http://localhost:3000
|
|
21
21
|
|
|
22
|
-
Type to chat · ↑ history · /new reset session · /exit quit · Ctrl+C interrupt
|
|
22
|
+
Type to chat · ↑ history · /new reset session · /model /vercel /channels /deploy · /exit quit · Ctrl+C interrupt
|
|
23
23
|
```
|
|
24
24
|
|
|
25
|
-
From there the conversation streams straight into your terminal's normal scrollback — your prompts, the agent's replies, reasoning, tool calls, nested subagents, connection-authorization prompts, and any captured `stdout`/`stderr` — so you keep native scrolling, copy/paste, and a transcript that persists after you exit. Each turn is rendered without boxes: a colored gutter glyph marks who's speaking, tool calls collapse to a one-line summary (`✓ get_weather city="SF" → 73°F`), and a subagent's work is indented beneath its `◆` header. A sticky line at the bottom shows the input prompt or the live status (spinner, token usage). Press `Enter` to send; `Ctrl+C` interrupts a running turn or quits at the prompt.
|
|
25
|
+
From there the conversation streams straight into your terminal's normal scrollback — your prompts, the agent's replies, reasoning, tool calls, nested subagents, connection-authorization prompts, and any captured `stdout`/`stderr` — so you keep native scrolling, copy/paste, and a transcript that persists after you exit. Each turn is rendered without boxes: a colored gutter glyph marks who's speaking, tool calls collapse to a one-line summary (`✓ get_weather city="SF" → 73°F`), and a subagent's work is indented beneath its `◆` header. A sticky line at the bottom shows the input prompt or the live status (spinner, token usage). Press `Enter` to send; `Ctrl+C` interrupts a running turn or quits at the prompt. Slash commands: `/new` starts a fresh session and `/exit` quits.
|
|
26
|
+
|
|
27
|
+
When `eve dev` runs the server locally, three more slash commands manage the project without leaving the session. `/vercel` opens with two questions: which model provider to use (picking something other than AI Gateway shows wiring instructions for your own provider and stops there) and how to connect to AI Gateway (paste your own `AI_GATEWAY_API_KEY`, saved straight to `.env.local`, or connect via a project). The project path then walks the same Vercel team/project pickers as setup (picking again re-links) and pulls the project's environment so an AI Gateway credential lands in `.env.local`; the dev server reloads env files automatically, so no restart is needed. `/channels` shows the agent's channel list, with already-registered channels rendered as locked rows, and adds the one you pick, including the Slack Connect provisioning. It then installs the dependencies the scaffold added so the dev server can load the new channels right away; after each addition the list repaints with the new channel locked, until Done (or Esc) leaves the flow. `/deploy` ships the agent to Vercel production, linking first when the directory is unlinked. Each command echoes as an invocation line, asks through a bordered panel that takes the input area's place — one question at a time, clearly separate from the chat transcript — and finishes with a one-line `⎿` result; loading states stay on the ephemeral status line instead of piling into the transcript. These commands are not available when connected to a remote server with `--url`. When a turn fails because AI Gateway authentication is missing or stale, the error points you at `/vercel` directly. The TUI also checks at startup: a missing model-provider setup surfaces as an attention line — `⚠ 1 setup issue: model provider not linked · /vercel` — so the fix is visible before the first message fails, and each command's outcome hangs under it with the `⎿` connector.
Local sessions also get `/model`: bare `/model` opens the same searchable model picker setup uses (the AI Gateway catalog, pre-selected on the model the runtime is serving), `/model <provider/model-id>` applies one directly, and the change is written into your agent's authored source — the command reports success only after the reloaded runtime confirms the new id.
|
|
26
28
|
|
|
27
29
|
The prompt input behaves like a shell line editor: `↑`/`↓` cycle through the messages you've sent this session, `←`/`→`, Home/End, and `Ctrl+A`/`Ctrl+E` move the caret, and `Ctrl+U`/`Ctrl+K`/`Ctrl+W` kill the line, the rest of the line, or the previous word. If a turn fails terminally — the server session dies or the connection drops — the TUI starts a fresh session and notes it inline so you can keep going (server-side context resets with the old session). Errors render compactly, with docs links highlighted, and a code bug escaping your agent's own code shows its stack trace dim beneath the error headline. Captured server `stdout`/`stderr` renders as dim, indented log runs behind a `│` rule — consecutive lines from the same source share one label, and nothing is ever hidden.
|
|
28
30
|
|
|
@@ -1,96 +1,95 @@
|
|
|
1
1
|
---
|
|
2
|
-
title: "Cases
|
|
3
|
-
description: "Author
|
|
2
|
+
title: "Cases"
|
|
3
|
+
description: "Author prompt evals, script multi-turn evals with run(ctx), and fan one file out over a dataset."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
|
-
|
|
6
|
+
Each eval file is one graded case: the runner executes it against the target, captures every event, and applies [checks](./checks) and [scores](./scores) to the result. An eval is either a prompt eval (`input`) or a scripted eval (`run`).
|
|
7
7
|
|
|
8
|
-
##
|
|
8
|
+
## Prompt evals
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
Prompt evals pair an `input` with an optional `expected`. `input` can be a string or an object (objects are `JSON.stringify`d); the runner sends it as a single user turn. Omit `expected` when you only care about behavior, as the second example below does:
|
|
11
11
|
|
|
12
|
-
```ts title="evals/weather.eval.ts"
|
|
12
|
+
```ts title="evals/weather/brooklyn-forecast.eval.ts"
|
|
13
13
|
import { defineEval } from "eve/evals";
|
|
14
14
|
import { Checks } from "eve/evals/checks";
|
|
15
15
|
|
|
16
16
|
export default defineEval({
|
|
17
|
+
input: "What is the weather in Brooklyn?",
|
|
18
|
+
expected: "Sunny",
|
|
17
19
|
checks: [Checks.didNotFail()],
|
|
18
20
|
scores: [],
|
|
19
|
-
cases: [
|
|
20
|
-
{ id: "brooklyn-forecast", input: "What is the weather in Brooklyn?", expected: "Sunny" },
|
|
21
|
-
{
|
|
22
|
-
id: "no-tools-for-greetings",
|
|
23
|
-
input: "Hello!",
|
|
24
|
-
checks: [Checks.toolNotCalled("get_weather")],
|
|
25
|
-
},
|
|
26
|
-
],
|
|
27
21
|
});
|
|
28
22
|
```
|
|
29
23
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
## Loading cases from fixtures
|
|
33
|
-
|
|
34
|
-
List cases inline, or load them dynamically with `load`. The loaders (`loadJson`, `loadYaml` from `eve/evals/loaders`) resolve paths relative to the app root:
|
|
35
|
-
|
|
36
|
-
```ts title="evals/sql.eval.ts"
|
|
24
|
+
```ts title="evals/weather/no-tools-for-greetings.eval.ts"
|
|
37
25
|
import { defineEval } from "eve/evals";
|
|
38
|
-
import {
|
|
39
|
-
import { Text } from "eve/evals/scores";
|
|
26
|
+
import { Checks } from "eve/evals/checks";
|
|
40
27
|
|
|
41
28
|
export default defineEval({
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
id: row.task,
|
|
46
|
-
input: row.prompt,
|
|
47
|
-
expected: row.sql,
|
|
48
|
-
}));
|
|
49
|
-
},
|
|
50
|
-
scores: [Text.exact()],
|
|
29
|
+
input: "Hello!",
|
|
30
|
+
checks: [Checks.didNotFail(), Checks.toolNotCalled("get_weather")],
|
|
31
|
+
scores: [],
|
|
51
32
|
});
|
|
52
33
|
```
|
|
53
34
|
|
|
54
|
-
|
|
35
|
+
## Organizing with directories
|
|
36
|
+
|
|
37
|
+
Identity is the file path, so directories are the grouping mechanism. `evals/weather/brooklyn-forecast.eval.ts` gets the id `weather/brooklyn-forecast`, and `eve eval weather` runs everything under `evals/weather/`. Shared constants and helpers live in sibling non-eval files (any name that doesn't end in `.eval.ts`):
|
|
38
|
+
|
|
39
|
+
```text
|
|
40
|
+
evals/
|
|
41
|
+
├── weather/
|
|
42
|
+
│ ├── shared.ts # helpers — not an eval
|
|
43
|
+
│ ├── brooklyn-forecast.eval.ts
|
|
44
|
+
│ └── no-tools-for-greetings.eval.ts
|
|
45
|
+
└── smoke.eval.ts
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Datasets: exporting an array
|
|
55
49
|
|
|
56
|
-
|
|
50
|
+
To fan one file out over a dataset, default-export an array of `defineEval(...)` values. Eval modules are ESM, so top-level `await` can load anything. Ids are the file's path-derived id plus a zero-padded index (`sql/0000`, `sql/0001`, …, in array order). The loaders (`loadJson`, `loadYaml` from `eve/evals/loaders`) read fixture files, resolving paths relative to the app root:
|
|
57
51
|
|
|
58
|
-
|
|
52
|
+
```ts title="evals/sql.eval.ts"
|
|
53
|
+
import { defineEval } from "eve/evals";
|
|
54
|
+
import { loadYaml } from "eve/evals/loaders";
|
|
55
|
+
import { Text } from "eve/evals/scores";
|
|
59
56
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
57
|
+
const doc = await loadYaml("evals/data/cases.yaml");
|
|
58
|
+
const rows = doc.evals as readonly { task: string; prompt: string; sql: string }[];
|
|
59
|
+
|
|
60
|
+
export default rows.map((row) =>
|
|
61
|
+
defineEval({
|
|
62
|
+
description: row.task,
|
|
63
|
+
input: row.prompt,
|
|
64
|
+
expected: row.sql,
|
|
65
|
+
scores: [Text.exact()],
|
|
66
|
+
}),
|
|
67
|
+
);
|
|
68
|
+
```
|
|
64
69
|
|
|
65
|
-
|
|
70
|
+
The loaders are meant for fixtures, not runtime agent code.
|
|
66
71
|
|
|
67
|
-
## Scripted
|
|
72
|
+
## Scripted evals
|
|
68
73
|
|
|
69
|
-
Scripted
|
|
74
|
+
Scripted evals define `run(ctx)` instead of `input`. Use them for smoke-style behavior: multi-turn branching, HITL approvals, structured output, attachments, and multiple independent sessions.
|
|
70
75
|
|
|
71
|
-
```ts title="evals/
|
|
76
|
+
```ts title="evals/approve-tool.eval.ts"
|
|
72
77
|
import { defineEval } from "eve/evals";
|
|
73
78
|
import { Checks } from "eve/evals/checks";
|
|
74
79
|
|
|
75
80
|
export default defineEval({
|
|
76
|
-
|
|
81
|
+
async run({ session }) {
|
|
82
|
+
await session.send("run `pwd`");
|
|
83
|
+
session.expectInputRequests({ toolName: "bash" });
|
|
84
|
+
const turn = await session.respondAll("approve");
|
|
85
|
+
return turn.message;
|
|
86
|
+
},
|
|
87
|
+
checks: [Checks.didNotFail(), Checks.toolCalled("bash", { input: { command: /pwd/ } })],
|
|
77
88
|
scores: [],
|
|
78
|
-
cases: [
|
|
79
|
-
{
|
|
80
|
-
id: "approve-tool",
|
|
81
|
-
async run({ session }) {
|
|
82
|
-
await session.send("run `pwd`");
|
|
83
|
-
session.expectInputRequests({ toolName: "bash" });
|
|
84
|
-
const turn = await session.respondAll("approve");
|
|
85
|
-
return turn.message;
|
|
86
|
-
},
|
|
87
|
-
checks: [Checks.toolCalled("bash", { input: { command: /pwd/ } })],
|
|
88
|
-
},
|
|
89
|
-
],
|
|
90
89
|
});
|
|
91
90
|
```
|
|
92
91
|
|
|
93
|
-
The return value of `run` becomes the
|
|
92
|
+
The return value of `run` becomes the output that scorers grade (set `parseOutput` to transform the raw result instead). Throwing marks the eval failed with the error message in the result.
|
|
94
93
|
|
|
95
94
|
## The session API
|
|
96
95
|
|
|
@@ -104,12 +103,12 @@ The return value of `run` becomes the case output that scorers grade. Throwing m
|
|
|
104
103
|
|
|
105
104
|
Each `send` resolves to an `EveEvalTurn` carrying the turn's `message`, `events`, and status. `turn.expectOk()` throws only when the turn ended in a failed state — a session left open for the next message is the normal end state of a successful turn.
|
|
106
105
|
|
|
107
|
-
Events from every eval session are captured in the
|
|
106
|
+
Events from every eval session are captured in the result and artifacts. `ctx.log(message)` records debug lines into the eval artifact; `--verbose` also streams them to stdout as evals run.
|
|
108
107
|
|
|
109
108
|
For driving sessions created outside the eval — by a channel webhook or a schedule — see [Targets and requirements](./targets).
|
|
110
109
|
|
|
111
110
|
## What to read next
|
|
112
111
|
|
|
113
|
-
- [Checks](./checks): assert on what the
|
|
112
|
+
- [Checks](./checks): assert on what the eval did
|
|
114
113
|
- [Scores](./scores): grade how well it did it
|
|
115
114
|
- [TypeScript client](../client/messages): the send/turn protocol eval sessions build on
|
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
---
|
|
2
2
|
title: "Checks"
|
|
3
|
-
description: "Hard assertions over runs, tool calls, and output — any failed check fails the
|
|
3
|
+
description: "Hard assertions over runs, tool calls, and output — any failed check fails the eval and the exit code."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
|
-
Checks are hard assertions. Any failed check marks the
|
|
7
|
-
|
|
8
|
-
Checks exist at the eval level (applied to every case) and the case level (appended to the eval's).
|
|
6
|
+
Checks are hard assertions. Any failed check marks the eval failed and `eve eval` exits non-zero. Use them for the things that must hold — the run completed, the right tool ran, the output parses. For graded, non-fatal signals, use [scores](./scores) instead.
|
|
9
7
|
|
|
10
8
|
## Built-in checks
|
|
11
9
|
|
|
@@ -39,7 +37,7 @@ Checks.subagentCalled("weather", {
|
|
|
39
37
|
|
|
40
38
|
## Custom checks
|
|
41
39
|
|
|
42
|
-
A custom check is a plain function receiving `{
|
|
40
|
+
A custom check is a plain function receiving `{ evaluation, result, target }` and returning `{ name, passed, message? }`:
|
|
43
41
|
|
|
44
42
|
```ts
|
|
45
43
|
import type { EveEvalCheck } from "eve/evals/checks";
|
|
@@ -51,7 +49,7 @@ const repliedFast: EveEvalCheck = ({ result }) => ({
|
|
|
51
49
|
});
|
|
52
50
|
```
|
|
53
51
|
|
|
54
|
-
Write a `message` for the failing path — it is what the console reporter prints under the
|
|
52
|
+
Write a `message` for the failing path — it is what the console reporter prints under the eval line and what lands in JUnit output.
|
|
55
53
|
|
|
56
54
|
## Run status and parking
|
|
57
55
|
|
|
@@ -62,4 +60,4 @@ Write a `message` for the failing path — it is what the console reporter print
|
|
|
62
60
|
## What to read next
|
|
63
61
|
|
|
64
62
|
- [Scores](./scores): graded signals with thresholds
|
|
65
|
-
- [Cases
|
|
63
|
+
- [Cases](./cases): where checks attach
|
|
@@ -9,67 +9,83 @@ Evals exercise the same HTTP surface your users hit. The runner boots (or target
|
|
|
9
9
|
|
|
10
10
|
## `defineEval`
|
|
11
11
|
|
|
12
|
-
Eve discovers evals under the app-root `evals/` directory, in `.eval.ts` files. The file path is the eval's identity, so you don't author an `id` or `name
|
|
12
|
+
Eve discovers evals under the app-root `evals/` directory, in `.eval.ts` files. Each file is one eval — one graded case — unless it default-exports an array of `defineEval(...)` values to fan out over a dataset (see [Cases](./cases)). The file path is the eval's identity, so you don't author an `id` or `name`; directories group related evals (`evals/weather/brooklyn-forecast.eval.ts` → id `weather/brooklyn-forecast`).
|
|
13
13
|
|
|
14
14
|
```text
|
|
15
15
|
my-agent/
|
|
16
16
|
├── agent/
|
|
17
17
|
├── evals/
|
|
18
|
+
│ ├── evals.config.ts
|
|
18
19
|
│ ├── smoke.eval.ts
|
|
19
|
-
│ └── weather
|
|
20
|
+
│ └── weather/
|
|
21
|
+
│ ├── brooklyn-forecast.eval.ts
|
|
22
|
+
│ └── no-tools-for-greetings.eval.ts
|
|
20
23
|
└── package.json
|
|
21
24
|
```
|
|
22
25
|
|
|
23
|
-
```ts title="evals/weather.eval.ts"
|
|
26
|
+
```ts title="evals/weather/brooklyn-forecast.eval.ts"
|
|
24
27
|
import { defineEval } from "eve/evals";
|
|
25
28
|
import { Checks } from "eve/evals/checks";
|
|
26
29
|
import { Run } from "eve/evals/scores";
|
|
27
30
|
|
|
28
31
|
export default defineEval({
|
|
29
32
|
description: "Basic message and tool-usage coverage for the weather agent.",
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
],
|
|
33
|
+
input: "What is the weather in Brooklyn?",
|
|
34
|
+
expected: "Sunny",
|
|
33
35
|
checks: [Checks.didNotFail(), Checks.toolCalled("get_weather")],
|
|
34
36
|
scores: [Run.didNotFail()],
|
|
35
37
|
});
|
|
36
38
|
```
|
|
37
39
|
|
|
38
|
-
Every eval needs `scores` (an empty array is fine) and either `
|
|
40
|
+
Every eval needs `scores` (an empty array is fine) and either `input` (a prompt sent as a single turn) or `run` (an imperative script). The rest are optional: `description`, `expected`, `checks`, `requires`, `parseOutput`, `model`, `thresholds`, `modelOptions`, `tags`, `metadata`, `timeoutMs`, `reporters`. The init template adds `evals/**/*.ts` to `tsconfig.json`, so your eval code type-checks alongside the app.
|
|
39
41
|
|
|
40
|
-
##
|
|
42
|
+
## `evals.config.ts`
|
|
43
|
+
|
|
44
|
+
Every `evals/` directory needs exactly one `evals.config.ts` at its root. It declares the defaults every eval shares — most importantly the `model` used by model-backed scorers, so you don't repeat it in each file:
|
|
45
|
+
|
|
46
|
+
```ts title="evals/evals.config.ts"
|
|
47
|
+
import { defineEvalConfig } from "eve/evals";
|
|
48
|
+
import { Braintrust } from "eve/evals/reporters";
|
|
49
|
+
|
|
50
|
+
export default defineEvalConfig({
|
|
51
|
+
model: "openai/gpt-5.4-mini",
|
|
52
|
+
reporters: [Braintrust({ projectName: "my-agent" })],
|
|
53
|
+
});
|
|
54
|
+
```
|
|
41
55
|
|
|
42
|
-
|
|
56
|
+
`model` is required; `reporters`, `maxConcurrency`, and `timeoutMs` are optional. Config `reporters` observe every eval in the run — set one `Braintrust()` here instead of adding it to each eval. CLI flags (`--max-concurrency`, `--timeout`) and per-eval values take precedence over the config defaults. An eval that needs a different judge model overrides it with its own `model`; otherwise the config `model` applies.
|
|
57
|
+
|
|
58
|
+
## Two grading tiers
|
|
43
59
|
|
|
44
|
-
|
|
45
|
-
- **[Scores](./scores) are soft data.** They land in reports and artifacts, and a below-threshold score marks the case `scored` — visible but not fatal, unless you pass `--strict`.
|
|
60
|
+
Evals are graded on two distinct tiers:
|
|
46
61
|
|
|
47
|
-
|
|
62
|
+
- **[Checks](./checks) are hard assertions.** Any failed check marks the eval failed and `eve eval` exits non-zero. Use them for the things that must hold — the run completed, the right tool ran, the output parses.
|
|
63
|
+
- **[Scores](./scores) are soft data.** They land in reports and artifacts, and a below-threshold score marks the eval `scored` — visible but not fatal, unless you pass `--strict`.
|
|
48
64
|
|
|
49
65
|
## Run it
|
|
50
66
|
|
|
51
67
|
```bash
|
|
52
68
|
eve eval # run all discovered evals against a local dev server
|
|
53
|
-
eve eval weather # run
|
|
69
|
+
eve eval weather # run one eval, or every eval under evals/weather/
|
|
54
70
|
eve eval --url https://<app> # target an existing server or deployment
|
|
55
71
|
```
|
|
56
72
|
|
|
57
|
-
Exit code `0` means every
|
|
73
|
+
Exit code `0` means every eval passed its checks. See [Running evals](./running) for the full flag list, exit codes, and CI guidance.
|
|
58
74
|
|
|
59
75
|
## A good baseline
|
|
60
76
|
|
|
61
|
-
Most apps do fine with a small smoke
|
|
77
|
+
Most apps do fine with a few small smoke evals. Assert behavior with `Checks.didNotFail()` plus one or two content checks, keep dataset fixtures in `evals/data/`, and only reach for Braintrust once you actually need shared result review or experiment history. In CI, run `eve eval --strict` so threshold misses fail the build too.
|
|
62
78
|
|
|
63
79
|
The rest of this section covers each piece:
|
|
64
80
|
|
|
65
|
-
- [Cases
|
|
81
|
+
- [Cases](./cases): prompt evals, scripted multi-turn evals, and dataset fan-out
|
|
66
82
|
- [Checks](./checks): hard assertions over runs, tools, and output
|
|
67
83
|
- [Scores](./scores): deterministic and LLM-judged scorers, with thresholds
|
|
68
|
-
- [Targets and requirements](./targets): local vs remote targets, and gating
|
|
84
|
+
- [Targets and requirements](./targets): local vs remote targets, and gating evals on capabilities
|
|
69
85
|
- [Reporters](./reporters): Braintrust experiments and JUnit XML
|
|
70
86
|
- [Running evals](./running): the `eve eval` CLI, exit codes, and artifacts
|
|
71
87
|
|
|
72
88
|
## What to read next
|
|
73
89
|
|
|
74
|
-
- [Cases
|
|
90
|
+
- [Cases](./cases): author your first evals
|
|
75
91
|
- [Tools](../tools): the surface most evals assert on
|
|
@@ -3,27 +3,43 @@ title: "Reporters"
|
|
|
3
3
|
description: "Ship eval results to Braintrust experiments or JUnit XML — Eve runs and scores everything itself."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
|
-
Eve runs and scores everything itself; reporters just ship the results out. The CLI prints a console summary by default — one line per
|
|
6
|
+
Eve runs and scores everything itself; reporters just ship the results out. The CLI prints a console summary by default — one line per eval, failed checks with their messages — and reporters from `eve/evals/reporters` add destinations on top.
|
|
7
|
+
|
|
8
|
+
Reporters attach in two places. Declare them in `evals.config.ts` to observe **every** eval in the run — the usual choice for a shared destination like one Braintrust experiment, so you don't repeat the reporter in each file. Or list them on an individual eval's `reporters` to scope a destination to that eval (or to a group of evals that share one instance).
|
|
7
9
|
|
|
8
10
|
## Braintrust
|
|
9
11
|
|
|
10
|
-
`Braintrust(...)` uploads eval results to Braintrust experiments:
|
|
12
|
+
`Braintrust(...)` uploads eval results to Braintrust experiments. Put one instance in the config so it covers the whole run:
|
|
13
|
+
|
|
14
|
+
```ts title="evals/evals.config.ts"
|
|
15
|
+
import { defineEvalConfig } from "eve/evals";
|
|
16
|
+
import { Braintrust } from "eve/evals/reporters";
|
|
11
17
|
|
|
12
|
-
|
|
18
|
+
export default defineEvalConfig({
|
|
19
|
+
model: "openai/gpt-5.4-mini",
|
|
20
|
+
reporters: [Braintrust({ projectName: "weather-agent" })],
|
|
21
|
+
});
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Need a destination for only some evals? Attach it per eval instead:
|
|
25
|
+
|
|
26
|
+
```ts title="evals/brooklyn-forecast.eval.ts"
|
|
13
27
|
import { defineEval } from "eve/evals";
|
|
14
28
|
import { Braintrust } from "eve/evals/reporters";
|
|
15
29
|
import { Run } from "eve/evals/scores";
|
|
16
30
|
|
|
17
31
|
export default defineEval({
|
|
18
|
-
|
|
32
|
+
input: "What is the weather in Brooklyn?",
|
|
19
33
|
scores: [Run.didNotFail()],
|
|
20
34
|
reporters: [Braintrust({ projectName: "weather-agent" })],
|
|
21
35
|
});
|
|
22
36
|
```
|
|
23
37
|
|
|
24
|
-
The config takes an optional `projectName` and `experimentName`, plus a base experiment (by name or id) to diff against. Checks log as binary scores under a `check:` prefix so experiments diff check regressions the same way they diff score regressions. Eval
|
|
38
|
+
The reporter config takes an optional `projectName` and `experimentName`, plus a base experiment (by name or id) to diff against. Checks log as binary scores under a `check:` prefix so experiments diff check regressions the same way they diff score regressions. Eval `metadata` rides along to reporters.
|
|
39
|
+
|
|
40
|
+
A reporter instance observes the evals that reference it: share one instance across several evals — the config, a `shared.ts` export, or every entry of a dataset array — and their results land in a single experiment. If a reporter instance is already declared in the config, also listing that same instance on an eval does not report the eval twice.
|
|
25
41
|
|
|
26
|
-
Braintrust needs its SDK installed in the app and credentials in the environment. Pass `--skip-report` to run the eval without shipping results — useful locally when iterating.
|
|
42
|
+
Braintrust needs its SDK installed in the app and credentials in the environment. Pass `--skip-report` to run the eval without shipping results (this also suppresses config reporters) — useful locally when iterating.
|
|
27
43
|
|
|
28
44
|
## JUnit
|
|
29
45
|
|
|
@@ -33,7 +49,7 @@ Braintrust needs its SDK installed in the app and credentials in the environment
|
|
|
33
49
|
eve eval --strict --junit .eve/junit.xml
|
|
34
50
|
```
|
|
35
51
|
|
|
36
|
-
|
|
52
|
+
Each eval becomes one `<testcase>` named by its path-derived id; failed checks and execution errors land as failure messages on the matching test case, so CI surfaces them inline.
|
|
37
53
|
|
|
38
54
|
## Custom reporters
|
|
39
55
|
|
|
@@ -3,39 +3,40 @@ title: "Running evals"
|
|
|
3
3
|
description: "The eve eval CLI: flags, filters, exit codes, artifacts, and how to wire evals into CI."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
|
-
`eve eval` discovers every `.eval.ts` file under `evals/`, boots a local dev server (or targets a remote one), runs the
|
|
6
|
+
`eve eval` discovers every `.eval.ts` file under `evals/`, boots a local dev server (or targets a remote one), runs the evals concurrently, and prints a per-eval summary.
|
|
7
7
|
|
|
8
8
|
```bash
|
|
9
9
|
eve eval # run all discovered evals locally
|
|
10
|
-
eve eval weather smoke # run selected evals
|
|
10
|
+
eve eval weather smoke # run selected evals (an id, or a directory prefix)
|
|
11
11
|
eve eval --url https://<app> # target a remote app instead of a local host
|
|
12
12
|
eve eval --mock-models # local dev target uses deterministic mock models
|
|
13
|
-
eve eval --tag fast # only
|
|
14
|
-
eve eval --case brooklyn-forecast # only specific case ids
|
|
13
|
+
eve eval --tag fast # only evals carrying a tag
|
|
15
14
|
eve eval --strict # below-threshold scores also fail the exit code
|
|
16
15
|
eve eval --no-skips # unmet requirements fail instead of skipping
|
|
17
|
-
eve eval --timeout 60000 # per-
|
|
18
|
-
eve eval --max-concurrency 4 # cap concurrent
|
|
16
|
+
eve eval --timeout 60000 # per-eval timeout in milliseconds
|
|
17
|
+
eve eval --max-concurrency 4 # cap concurrent eval executions (default 8)
|
|
19
18
|
eve eval --junit .eve/junit.xml # write JUnit XML
|
|
20
|
-
eve eval --list # print discovered evals
|
|
21
|
-
eve eval --verbose # stream per-
|
|
19
|
+
eve eval --list # print discovered evals without running
|
|
20
|
+
eve eval --verbose # stream per-eval ctx.log lines to stdout
|
|
22
21
|
eve eval --json # machine-readable output
|
|
23
|
-
eve eval --skip-report # skip eval-defined reporters (e.g. Braintrust)
|
|
22
|
+
eve eval --skip-report # skip config and eval-defined reporters (e.g. Braintrust)
|
|
24
23
|
```
|
|
25
24
|
|
|
25
|
+
Positional ids match exactly or by directory prefix: `eve eval weather` runs `evals/weather.eval.ts`, every eval under `evals/weather/`, and every entry of an array-exported `weather.eval.ts`.
|
|
26
|
+
|
|
26
27
|
## Exit codes
|
|
27
28
|
|
|
28
29
|
| Code | Means |
|
|
29
30
|
| ---- | -------------------------------------------------------------------------------- |
|
|
30
|
-
| `0` | Every
|
|
31
|
-
| `1` | Any
|
|
31
|
+
| `0` | Every eval passed its checks (and thresholds, under `--strict`) |
|
|
32
|
+
| `1` | Any eval failed — a failed check, an execution error, or a strict threshold miss |
|
|
32
33
|
| `2` | Configuration error |
|
|
33
34
|
|
|
34
35
|
Unmet [requirements](./targets) skip visibly without affecting the exit code unless you pass `--no-skips`.
|
|
35
36
|
|
|
36
37
|
## Artifacts
|
|
37
38
|
|
|
38
|
-
Each run drops
|
|
39
|
+
Each run drops artifacts under `.eve/evals/<timestamp>/`: a run `summary.json`, a `results.jsonl` index, and per-eval check results, verdicts, captured event streams, and `ctx.log` lines under `evals/`. The console output stays tight on purpose; when an eval fails, the artifact has the full story.
|
|
39
40
|
|
|
40
41
|
## CI
|
|
41
42
|
|
|
@@ -46,8 +47,8 @@ eve eval --strict --mock-models --junit .eve/junit.xml
|
|
|
46
47
|
```
|
|
47
48
|
|
|
48
49
|
- `--strict` turns threshold misses into failures, so score regressions block the merge.
|
|
49
|
-
- `--mock-models` keeps the default leg deterministic and credential-free. Put real-model
|
|
50
|
-
- `--junit` gives the CI provider per-
|
|
50
|
+
- `--mock-models` keeps the default leg deterministic and credential-free. Put real-model evals in their own files gated on `requires: ["env:..."]`, and add `--no-skips` on legs that must prove those ran.
|
|
51
|
+
- `--junit` gives the CI provider per-eval annotations; upload the `.eve/evals/` directory as a failure artifact for the full event streams.
|
|
51
52
|
|
|
52
53
|
Against a deployed app, swap `--mock-models` for `--url`:
|
|
53
54
|
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
---
|
|
2
2
|
title: "Scores"
|
|
3
|
-
description: "Grade
|
|
3
|
+
description: "Grade evals with deterministic scorers or LLM judges, and gate them with thresholds."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
|
-
Scores are soft data. They land in reports and artifacts, and a below-threshold score marks the
|
|
6
|
+
Scores are soft data. They land in reports and artifacts, and a below-threshold score marks the eval `scored` — visible but not fatal, unless you pass `--strict`. Use them to grade quality fractionally where a [check](./checks) would assert it absolutely.
|
|
7
7
|
|
|
8
8
|
## Choosing a scorer
|
|
9
9
|
|
|
10
|
-
Scorers live in namespaces on `eve/evals/scores`. Pick the cheapest one that captures what "correct" means here. The deterministic scorers run instantly for free; an LLM judge runs once per
|
|
10
|
+
Scorers live in namespaces on `eve/evals/scores`. Pick the cheapest one that captures what "correct" means here. The deterministic scorers run instantly for free; an LLM judge runs once per eval and burns tokens, so save it for when nothing simpler will do.
|
|
11
11
|
|
|
12
12
|
| Need | Use |
|
|
13
13
|
| ---------------------------------------------------------- | --------------------------------------------------------------------------------------------- |
|
|
@@ -21,7 +21,7 @@ Scorers live in namespaces on `eve/evals/scores`. Pick the cheapest one that cap
|
|
|
21
21
|
| LLM-judged SQL semantic equivalence | `Autoevals.sql()` |
|
|
22
22
|
| LLM-judged free-form criteria (no `expected` to match) | `Autoevals.closedQA({ criteria: "..." })` |
|
|
23
23
|
|
|
24
|
-
Each scorer gets the flattened `input`, `output`, and `expected` strings along with the full
|
|
24
|
+
Each scorer gets the flattened `input`, `output`, and `expected` strings along with the full eval and task result — including derived facts: typed tool calls (name, input, output, error state), subagent calls, HITL input requests, and whether the run parked. `Run.usedTool` accepts the same matcher options as `Checks.toolCalled`. Return `null` from a scorer to skip it.
|
|
25
25
|
|
|
26
26
|
## Thresholds
|
|
27
27
|
|
|
@@ -32,23 +32,24 @@ import { defineEval } from "eve/evals";
|
|
|
32
32
|
import { Run, Text } from "eve/evals/scores";
|
|
33
33
|
|
|
34
34
|
export default defineEval({
|
|
35
|
-
|
|
35
|
+
input: "Hello",
|
|
36
|
+
expected: "Hello",
|
|
36
37
|
scores: [Run.didNotFail(), Text.includes()],
|
|
37
38
|
thresholds: { "run.didNotFail": 1, "text.includes": 0.5 },
|
|
38
39
|
});
|
|
39
40
|
```
|
|
40
41
|
|
|
41
|
-
|
|
42
|
+
An eval below a threshold gets the `scored` verdict — reported, but only fatal under `eve eval --strict`.
|
|
42
43
|
|
|
43
44
|
## The scorer model
|
|
44
45
|
|
|
45
|
-
|
|
46
|
+
Model-backed scorers (the `Autoevals` wrappers) need a judge model — the scorer model, not the agent's. Eve only uses it for scoring, never to swap out the agent under test. The default lives in [`evals.config.ts`](./overview#evalsconfigts) as the required `model`, so most evals inherit it without setting anything. Pass a string id (e.g. `"anthropic/claude-opus-4.8"`) to route through the Vercel AI Gateway, or an AI SDK model instance to use it directly.
|
|
46
47
|
|
|
47
|
-
For provider-specific scorer-model settings, use `modelOptions.providerOptions`. Individual Autoevals scorers can also take their own `model` / `modelOptions`, which win over the eval
|
|
48
|
+
Override the default on a single eval by setting that eval's own `model`. For provider-specific scorer-model settings, use `modelOptions.providerOptions`. Individual Autoevals scorers can also take their own `model` / `modelOptions`, which win over both the eval and config defaults.
|
|
48
49
|
|
|
49
50
|
## Concurrency and timeouts
|
|
50
51
|
|
|
51
|
-
`
|
|
52
|
+
`timeoutMs` bounds one eval's execution. Precedence, highest first: `eve eval --timeout <ms>` (applies to every eval in the run), then the eval's own `timeoutMs`, then the default in `evals.config.ts`. The runner executes up to 8 evals at once — set a default `maxConcurrency` in `evals.config.ts` or pass `--max-concurrency <n>` (which wins) to change that, and lower it when evals contend for a shared resource, such as a rate-limited connection or a sandbox-heavy fixture.
|
|
52
53
|
|
|
53
54
|
## What to read next
|
|
54
55
|
|