eve 0.6.0-beta.18 → 0.6.0-beta.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +23 -0
- package/dist/docs/public/advanced/dev-tui.md +8 -12
- package/dist/docs/public/advanced/security-model.md +1 -1
- package/dist/docs/public/evals/assertions.mdx +108 -0
- package/dist/docs/public/evals/cases.mdx +84 -55
- package/dist/docs/public/evals/judge.mdx +94 -0
- package/dist/docs/public/evals/meta.json +1 -1
- package/dist/docs/public/evals/overview.mdx +47 -20
- package/dist/docs/public/evals/reporters.mdx +9 -8
- package/dist/docs/public/evals/running.mdx +8 -8
- package/dist/docs/public/evals/targets.mdx +10 -10
- package/dist/docs/public/reference/typescript-api.md +4 -1
- package/dist/src/cli/commands/link.d.ts +1 -1
- package/dist/src/cli/dev/tui/agent-header.d.ts +15 -11
- package/dist/src/cli/dev/tui/agent-header.js +1 -1
- package/dist/src/cli/dev/tui/dev-rebuild-status.d.ts +21 -0
- package/dist/src/cli/dev/tui/dev-rebuild-status.js +1 -0
- package/dist/src/cli/dev/tui/errors.d.ts +1 -1
- package/dist/src/cli/dev/tui/errors.js +1 -1
- package/dist/src/cli/dev/tui/prompt-command-handler.d.ts +5 -1
- package/dist/src/cli/dev/tui/prompt-command-handler.js +1 -1
- package/dist/src/cli/dev/tui/prompt-commands.d.ts +4 -1
- package/dist/src/cli/dev/tui/prompt-commands.js +1 -1
- package/dist/src/cli/dev/tui/runner.d.ts +37 -3
- package/dist/src/cli/dev/tui/runner.js +1 -1
- package/dist/src/cli/dev/tui/setup-commands.d.ts +5 -4
- package/dist/src/cli/dev/tui/setup-commands.js +2 -1
- package/dist/src/cli/dev/tui/setup-flow.d.ts +3 -0
- package/dist/src/cli/dev/tui/setup-issues.d.ts +1 -1
- package/dist/src/cli/dev/tui/setup-issues.js +1 -1
- package/dist/src/cli/dev/tui/setup-panel.d.ts +12 -4
- package/dist/src/cli/dev/tui/setup-panel.js +1 -1
- package/dist/src/cli/dev/tui/status-line.d.ts +25 -0
- package/dist/src/cli/dev/tui/status-line.js +1 -0
- package/dist/src/cli/dev/tui/stream-format.d.ts +16 -1
- package/dist/src/cli/dev/tui/stream-format.js +1 -1
- package/dist/src/cli/dev/tui/terminal-renderer.d.ts +15 -3
- package/dist/src/cli/dev/tui/terminal-renderer.js +5 -3
- package/dist/src/cli/dev/tui/test/index.d.ts +2 -1
- package/dist/src/cli/dev/tui/test/index.js +1 -1
- package/dist/src/cli/dev/tui/theme.d.ts +7 -1
- package/dist/src/cli/dev/tui/theme.js +1 -1
- package/dist/src/cli/dev/tui/tui-prompter.js +1 -1
- package/dist/src/cli/dev/tui/types.d.ts +4 -3
- package/dist/src/cli/dev/tui/vercel-status.d.ts +47 -0
- package/dist/src/cli/dev/tui/vercel-status.js +1 -0
- package/dist/src/compiler/manifest.d.ts +8 -8
- package/dist/src/context/node.d.ts +1 -1
- package/dist/src/evals/assertions/collector.d.ts +43 -0
- package/dist/src/evals/assertions/collector.js +1 -0
- package/dist/src/evals/assertions/run.d.ts +72 -0
- package/dist/src/evals/assertions/run.js +2 -0
- package/dist/src/evals/autoevals-client.js +2 -0
- package/dist/src/evals/cli/eval-client.d.ts +22 -0
- package/dist/src/evals/cli/eval-client.js +1 -0
- package/dist/src/evals/cli/eval.d.ts +3 -3
- package/dist/src/evals/cli/eval.js +1 -1
- package/dist/src/evals/context.d.ts +19 -0
- package/dist/src/evals/context.js +1 -0
- package/dist/src/evals/define-eval-config.d.ts +6 -6
- package/dist/src/evals/define-eval-config.js +1 -1
- package/dist/src/evals/define-eval.d.ts +14 -14
- package/dist/src/evals/define-eval.js +1 -1
- package/dist/src/evals/expect/index.d.ts +25 -0
- package/dist/src/evals/expect/index.js +1 -0
- package/dist/src/evals/index.d.ts +2 -1
- package/dist/src/evals/index.js +1 -1
- package/dist/src/evals/judge.d.ts +20 -0
- package/dist/src/evals/judge.js +1 -0
- package/dist/src/evals/{checks/match.d.ts → match.d.ts} +16 -17
- package/dist/src/evals/match.js +1 -0
- package/dist/src/evals/runner/artifacts.js +1 -1
- package/dist/src/evals/runner/discover.js +1 -1
- package/dist/src/evals/runner/execute-eval.d.ts +3 -3
- package/dist/src/evals/runner/execute-eval.js +1 -1
- package/dist/src/evals/runner/execute-task.d.ts +16 -7
- package/dist/src/evals/runner/execute-task.js +1 -1
- package/dist/src/evals/runner/reporters/braintrust.js +1 -1
- package/dist/src/evals/runner/reporters/console.js +1 -1
- package/dist/src/evals/runner/reporters/junit.js +1 -1
- package/dist/src/evals/runner/run-evals.js +1 -1
- package/dist/src/evals/runner/verdict.d.ts +8 -13
- package/dist/src/evals/runner/verdict.js +1 -1
- package/dist/src/evals/types.d.ts +139 -156
- package/dist/src/execution/workflow-runtime.d.ts +2 -2
- package/dist/src/execution/workflow-runtime.js +1 -1
- package/dist/src/harness/model-call-error.d.ts +1 -1
- package/dist/src/internal/application/package.js +1 -1
- package/dist/src/internal/nitro/host/dev-authored-source-watcher.js +1 -1
- package/dist/src/internal/nitro/host/dev-watcher-log.d.ts +37 -0
- package/dist/src/internal/nitro/host/dev-watcher-log.js +1 -0
- package/dist/src/runtime/actions/types.d.ts +11 -11
- package/dist/src/runtime/input/types.d.ts +1 -1
- package/dist/src/setup/flows/channels.d.ts +1 -1
- package/dist/src/setup/flows/link.d.ts +2 -2
- package/dist/src/setup/flows/model.d.ts +81 -12
- package/dist/src/setup/flows/model.js +1 -1
- package/dist/src/setup/flows/vercel.d.ts +7 -6
- package/dist/src/setup/flows/vercel.js +1 -1
- package/dist/src/setup/primitives/pm/pnpm.js +1 -1
- package/dist/src/setup/project-resolution.js +1 -1
- package/dist/src/setup/prompter.d.ts +18 -0
- package/dist/src/setup/prompter.js +1 -1
- package/dist/src/setup/scaffold/create/add-to-project.js +1 -1
- package/dist/src/setup/scaffold/create/project.js +2 -2
- package/dist/src/setup/scaffold/update/channels.js +2 -2
- package/package.json +9 -12
- package/dist/docs/evals-v2-plan.md +0 -1001
- package/dist/docs/public/evals/checks.mdx +0 -63
- package/dist/docs/public/evals/scores.mdx +0 -57
- package/dist/src/evals/checks/checks.d.ts +0 -66
- package/dist/src/evals/checks/checks.js +0 -2
- package/dist/src/evals/checks/index.d.ts +0 -21
- package/dist/src/evals/checks/index.js +0 -1
- package/dist/src/evals/checks/match.js +0 -1
- package/dist/src/evals/scorers/autoevals-client.js +0 -2
- package/dist/src/evals/scorers/autoevals.d.ts +0 -58
- package/dist/src/evals/scorers/autoevals.js +0 -1
- package/dist/src/evals/scorers/json.d.ts +0 -10
- package/dist/src/evals/scorers/json.js +0 -1
- package/dist/src/evals/scorers/model-marker.d.ts +0 -12
- package/dist/src/evals/scorers/model-marker.js +0 -1
- package/dist/src/evals/scorers/run.d.ts +0 -24
- package/dist/src/evals/scorers/run.js +0 -1
- package/dist/src/evals/scorers/sql.d.ts +0 -9
- package/dist/src/evals/scorers/sql.js +0 -1
- package/dist/src/evals/scorers/text.d.ts +0 -18
- package/dist/src/evals/scorers/text.js +0 -1
- package/dist/src/evals/scores/index.d.ts +0 -72
- package/dist/src/evals/scores/index.js +0 -1
- /package/dist/src/evals/{scorers/autoevals-client.d.ts → autoevals-client.d.ts} +0 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,28 @@
|
|
|
1
1
|
# eve
|
|
2
2
|
|
|
3
|
+
## 0.6.0-beta.20
|
|
4
|
+
|
|
5
|
+
### Minor Changes
|
|
6
|
+
|
|
7
|
+
- 407c6e2: The dev TUI startup header is now a single `▲ eve <agent name>` brand line plus one rotating tip ("Use /channels to add more ways to reach your agent.", "Use /deploy to see your agent go live.", "Type /help to see every command."). The config rows (Model, Instructions, Tools, Skills, Subagents, Server) and the key-hints line are gone. The model moved to the status line, the empty input row shows a dim `Type to chat · / for commands` placeholder, discovery error/warning counts still render, and `eve info` keeps the full configuration detail.
|
|
8
|
+
- 407c6e2: The dev TUI footer now ends with a persistent status line: the agent's model, the session's token flow (`⁕ ↑ 394.4K ↓ 4.3K`, input up, output down, `⁕ ↑ 0 ↓ 0` at startup), the linked Vercel project and team, and a `deploy pending` marker that appears when `/channels` adds a channel and clears when `/deploy` ships it. The Vercel segment stays hidden until the directory is linked. Token usage moved here from the working row, and `--context-size` appends a context-fill percentage.
|
|
9
|
+
- 1d65cd6: Reworked the eval authoring API around a single imperative `test(t)` function. An eval now drives the agent and asserts on what it produced in one place — `async test(t) { await t.send(...); t.completed(); t.calledTool(...); t.check(t.reply, includes(...)); t.judge.autoevals.closedQA(...).atLeast(0.6); }` — replacing the separate `run`/`input`, `checks`, `scores`, `expected`, and `thresholds` fields. Assertions carry their own severity: run-level checks and the `eve/evals/expect` value assertions `includes`/`equals`/`matches` are hard gates by default, while `similarity`, `t.judge.autoevals.*`, and `.soft(...)` assertions are tracked and only fail under `--strict`. LLM-as-judge moved under `t.judge.autoevals.*`, and the judge model is now configured via `judge: { model }` (optional) on `defineEvalConfig`/`defineEval` instead of `model`. The `eve/evals/checks` and `eve/evals/scores` entry points are removed in favor of `t`'s built-in vocabulary and `eve/evals/expect`.
|
|
10
|
+
- a33fa66: The dev TUI now shows dev-server rebuilds as one status row that updates in place: `tui/setup-panel.ts changed · rebuilding…`, then `· rebuilt`. Only the latest rebuild shows, paths shrink to their last two components, and the transcript no longer stacks a full log line per rebuild.
|
|
11
|
+
- a33fa66: The dev TUI now starts with stdout and stderr logs hidden. Use `/loglevel <all|stderr|none>` to change what the transcript shows. Logs stay buffered either way, so `/loglevel all` brings back earlier output in its original order.
|
|
12
|
+
|
|
13
|
+
### Patch Changes
|
|
14
|
+
|
|
15
|
+
- 407c6e2: The dev TUI's bottom question panel (setup flows, select/text/acknowledge questions) opens with a stronger `▔` full-width rule and carries a consistent one-space left margin. Gutter glyphs sit at column 1 and text aligns at column 3 instead of touching the terminal edge.
|
|
16
|
+
- e1cd134: `eve eval --url` now authenticates to remote targets with the same Vercel OIDC headers as the dev client, including the trusted OIDC IDP token that bypasses Deployment Protection — so evals can run against protected deployments without configuring a `VERCEL_AUTOMATION_BYPASS_SECRET`.
|
|
17
|
+
- 9a35ea7: Fix `eve init` to launch pnpm-managed projects through `pnpm exec` and generate Web Chat configuration that matches the installed Eve Next.js API.
|
|
18
|
+
- e1cd134: Workflow runs only route to `deploymentId: "latest"` on Vercel production. Preview and CLI deployments carry no git branch reference, so latest resolution failed with HTTP 400 ("Source deployment has no git branch") and every turn errored; they now pin workflow runs to their own immutable deployment.
|
|
19
|
+
|
|
20
|
+
## 0.6.0-beta.19
|
|
21
|
+
|
|
22
|
+
### Minor Changes
|
|
23
|
+
|
|
24
|
+
- 9061941: The dev TUI's `/model` now opens a configure menu uniting the model and provider setup, and the separate `/vercel` command is removed: "Change model" runs the searchable AI Gateway catalog picker, and "Configure provider" (bold yellow with "Required to enable the agent" until a Vercel link or gateway credential is detected) runs the provider questions `/vercel` used to ask. Each action returns to the menu, which shows each row's current value on its own line — e.g. "AI Gateway (Linked to my-project)" — and keeps the latest outcome visible beneath the options ("✓ Model changed to …"); Esc leaves. Error messages and the startup attention line now point at `/model`, and `/model <provider/model-id>` still applies a model directly.
|
|
25
|
+
|
|
3
26
|
## 0.6.0-beta.18
|
|
4
27
|
|
|
5
28
|
### Minor Changes
|
|
@@ -9,24 +9,20 @@ description: "Drive an Eve agent locally in an interactive terminal UI: chat, st
|
|
|
9
9
|
eve dev
|
|
10
10
|
```
|
|
11
11
|
|
|
12
|
-
On startup the TUI prints a
|
|
12
|
+
On startup the TUI prints a brand line with your agent's name, plus a rotating tip (local sessions only):
|
|
13
13
|
|
|
14
14
|
```text
|
|
15
|
-
▲
|
|
16
|
-
|
|
17
|
-
· Instructions agent/instructions.md
|
|
18
|
-
· Tools get_weather, get_forecast, geocode
|
|
19
|
-
· Subagents researcher
|
|
20
|
-
· Server http://localhost:3000
|
|
21
|
-
|
|
22
|
-
Type to chat · ↑ history · /new reset session · /model /vercel /channels /deploy · /exit quit · Ctrl+C interrupt
|
|
15
|
+
▲ eve weather-agent
|
|
16
|
+
Use /channels to add more ways to reach your agent.
|
|
23
17
|
```
|
|
24
18
|
|
|
25
|
-
|
|
19
|
+
If agent discovery reported problems, an error/warning count renders between the two lines. Instructions, tools, skills, and subagents are one `eve info` away, and `/help` lists every command.
|
|
26
20
|
|
|
27
|
-
|
|
21
|
+
From there the conversation streams straight into your terminal's normal scrollback — your prompts, the agent's replies, reasoning, tool calls, nested subagents, connection-authorization prompts, and any captured `stdout`/`stderr` — so you keep native scrolling, copy/paste, and a transcript that persists after you exit. Each turn is rendered without boxes: a colored gutter glyph marks who's speaking, tool calls collapse to a one-line summary (`✓ get_weather city="SF" → 73°F`), and a subagent's work is indented beneath its `◆` header. A sticky footer keeps you oriented. The input prompt shows a dim `Type to chat · / for commands` placeholder while empty, and beneath it a persistent status line shows the model, the session's token flow (`⁕ ↑ 394.4K ↓ 4.3K`), the linked Vercel project and team (`▲ my-agent (acme)`), and a yellow `deploy pending` marker once a channel added this session still needs `/deploy`. The Vercel segment stays hidden until the directory is linked. Press `Enter` to send; `Ctrl+C` interrupts a running turn or quits at the prompt. Slash commands: `/new` starts a fresh session and `/exit` quits.
|
|
28
22
|
|
|
29
|
-
|
|
23
|
+
When `eve dev` runs the server locally, three more slash commands manage the project without leaving the session. Bare `/model` opens a two-row configure menu that loops until Esc. "Change model" runs the same searchable model picker setup uses (the AI Gateway catalog, pre-selected on the model the runtime is serving); a model change is written into your agent's authored source, and the command reports success only after Eve confirms the new id (`/model <provider/model-id>` applies one directly, skipping the menu). The provider row opens the provider questions: which model provider to use (picking something other than AI Gateway shows wiring instructions for your own provider and stops there, leaving any existing setup untouched) and how to connect to AI Gateway — paste your own `AI_GATEWAY_API_KEY`, saved straight to `.env.local`, or connect via a project, which walks the same Vercel team/project pickers as setup (picking again re-links) and pulls the project's environment so an AI Gateway credential lands in `.env.local`; the dev server reloads env files automatically, no restart needed. The row demands attention (a bold yellow "Configure provider" with "Required to enable the agent") until a link or gateway credential is detected, then names the connection (e.g. "AI Gateway (Linked to my-project in my-team)") after, and each action's latest outcome stays visible beneath the menu (e.g. "✓ Model changed to openai/gpt-5.5"). `/channels` shows the agent's channel list — already-registered channels render as locked rows — and adds the one you pick, including the Slack Connect provisioning, then installs the dependencies the scaffold added so the dev server can load the new channels right away; after each addition the list repaints with the new channel locked, until Done (or Esc) leaves the flow. `/deploy` ships the agent to Vercel production, linking first when the directory is unlinked. 
Each command echoes as an invocation line, asks through a bordered panel that takes the input area's place — one question at a time, clearly separate from the chat transcript — and finishes with a one-line `⎿` result; loading states stay on the ephemeral status line instead of piling into the transcript. These commands are not available when connected to a remote server with `--url`, and when a turn fails because AI Gateway authentication is missing or stale, the error points you at `/model` directly. The TUI also checks at startup: a missing model-provider setup surfaces as an attention line — `⚠ 1 setup issue: model provider not linked · /model` — so the fix is visible before the first message fails, and each command's outcome hangs under it with the `⎿` connector.
|
|
24
|
+
|
|
25
|
+
The prompt input behaves like a shell line editor: `↑`/`↓` cycle through the messages you've sent this session, `←`/`→`, Home/End, and `Ctrl+A`/`Ctrl+E` move the caret, and `Ctrl+U`/`Ctrl+K`/`Ctrl+W` kill the line, the rest of the line, or the previous word. If a turn fails terminally — the server session dies or the connection drops — the TUI starts a fresh session and notes it inline so you can keep going (server-side context resets with the old session). Errors render compactly, with docs links highlighted, and a code bug escaping your agent's own code shows its stack trace dim beneath the error headline. Captured server `stdout`/`stderr` renders as dim, indented log runs behind a `│` rule — consecutive lines from the same source share one label. Dev-server rebuilds condense further, into one status row that updates in place: `tui/setup-panel.ts changed · rebuilding…`, then `· rebuilt`. Only the latest rebuild shows, and paths shrink to their last two components. The TUI hides logs by default. `/loglevel <all|stderr|none>` switches what the transcript shows, and because logs stay buffered either way, the switch is retroactive: `/loglevel all` brings back everything captured so far, in its original order. Bare `/loglevel` reports the current mode; the `--logs` flag sets the starting one.
|
|
30
26
|
|
|
31
27
|
The agent will sometimes need something from you, and the TUI asks inline. Tool approvals are a `y`/`n`. Option questions let you pick with `↑`/`↓` and `Enter`, or you can type a freeform answer. If a tool needs an authorized [connection](../connections), the URL shows up right in the transcript, and the turn picks back up once you've finished the flow.
|
|
32
28
|
|
|
@@ -42,7 +42,7 @@ A [channel](../channels/overview) is your agent's front door, which makes authen
|
|
|
42
42
|
claims. A body field is attacker-controlled; treating it as identity is
|
|
43
43
|
cross-user impersonation.
|
|
44
44
|
|
|
45
|
-
|
|
45
|
+
A custom channel that accepts dashboard-style webhooks should follow the same shape: authenticate the raw body with an HMAC, compare signatures in constant time, and trust any body-supplied principal only after the signature verifies.
|
|
46
46
|
|
|
47
47
|
## Authored markdown is data
|
|
48
48
|
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Assertions"
|
|
3
|
+
description: "Run-level methods, t.check value assertions, the matcher mini-language, and gate vs soft severity."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
Assertions are how an eval grades what its `test(t)` function produced. Each one **records** a result onto `t` and returns a chainable handle — the runner reads the recorded results to compute the verdict, so a single run reports every failing assertion rather than dying on the first. There are two deterministic surfaces: run-level methods on `t`, and `t.check` for grading a specific value. For model-graded assertions, see [Judge](./judge).
|
|
7
|
+
|
|
8
|
+
## Run-level assertions
|
|
9
|
+
|
|
10
|
+
Run-level assertions read the whole run, so they take no value. They are methods on `t` and gate by default.
|
|
11
|
+
|
|
12
|
+
| Assertion | Asserts |
|
|
13
|
+
| --------------------------------------------------- | --------------------------------------------------------------------------------- |
|
|
14
|
+
| `t.completed()` | The run did not fail and did not park on unanswered HITL input |
|
|
15
|
+
| `t.didNotFail()` | No terminal failure and no `turn.failed`/`step.failed` events (parked runs pass) |
|
|
16
|
+
| `t.waiting()` | The run parked on HITL input (for approval-shaped evals) |
|
|
17
|
+
| `t.messageIncludes(token)` | Joined assistant text contains `token` (string or RegExp) |
|
|
18
|
+
| `t.outputEquals(value)` / `t.outputMatches(schema)` | Deep equality / Standard Schema (e.g. Zod) validation of the parsed output |
|
|
19
|
+
| `t.calledTool(name, opts?)` | A matching tool call happened (`input`, `output`, `isError`, `times` constraints) |
|
|
20
|
+
| `t.notCalledTool(name)` | No call to `name` |
|
|
21
|
+
| `t.toolOrder([...names])` | Tool names appear in order (other calls may interleave) |
|
|
22
|
+
| `t.usedNoTools()` | No tool calls at all |
|
|
23
|
+
| `t.maxToolCalls(n)` | At most `n` tool calls |
|
|
24
|
+
| `t.noFailedActions()` | No tool, subagent, or skill action reported a failure |
|
|
25
|
+
| `t.calledSubagent(name, opts?)` | A subagent delegation happened (`remoteUrl`, `output` constraints) |
|
|
26
|
+
| `t.event(predicate, label)` | Escape hatch: any predicate over the typed event stream |
|
|
27
|
+
|
|
28
|
+
`t.completed()` subsumes `t.didNotFail()` — reach for `completed` unless you specifically want to allow a parked run.
|
|
29
|
+
|
|
30
|
+
```ts
|
|
31
|
+
await t.send("What is the weather in Brooklyn?");
|
|
32
|
+
t.completed();
|
|
33
|
+
t.calledTool("get_weather");
|
|
34
|
+
t.usedNoTools(); // mutually exclusive with the line above — pick the one you mean
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Value assertions with `t.check`
|
|
38
|
+
|
|
39
|
+
`t.check(value, assertion)` grades an explicit value against a builder from `eve/evals/expect`. The value can be `t.reply`, a turn's `.message`, parsed JSON, or any local you computed:
|
|
40
|
+
|
|
41
|
+
```ts
|
|
42
|
+
import { includes, equals, matches, similarity } from "eve/evals/expect";
|
|
43
|
+
|
|
44
|
+
t.check(t.reply, includes("sunny")); // substring (gate)
|
|
45
|
+
t.check(parsed, equals({ city: "Brooklyn" })); // deep structural equality (gate)
|
|
46
|
+
t.check(parsed, matches(WeatherSchema)); // Standard Schema, e.g. Zod (gate)
|
|
47
|
+
t.check(t.reply, similarity("Sunny, 72F")); // fuzzy 0–1 Levenshtein (soft)
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
| Builder | Scores | Default |
|
|
51
|
+
| ---------------------- | ------------------------------------------------ | ------- |
|
|
52
|
+
| `includes(substring)` | value (coerced to string) contains `substring` | gate |
|
|
53
|
+
| `equals(value)` | deep structural equality | gate |
|
|
54
|
+
| `matches(schema)` | validates against a Standard Schema | gate |
|
|
55
|
+
| `similarity(expected)` | normalized Levenshtein similarity, 1 = identical | soft |
|
|
56
|
+
|
|
57
|
+
Pick the cheapest builder that captures what "correct" means. When exact match is too strict but a judge model is overkill, `similarity` is the middle ground; for nuanced grading, reach for the [judge](./judge).
|
|
58
|
+
|
|
59
|
+
## The matcher mini-language
|
|
60
|
+
|
|
61
|
+
`t.calledTool` and `t.calledSubagent` take a matcher object — `{ input, output, isError, times }` for tools, `{ remoteUrl, output }` for subagents. Each field accepts a literal (objects partial-deep-match), a RegExp, or a function. A matcher function receives the value and returns either a boolean (acts as a predicate) or an expected value to compare against (handy for runner-assigned values like environment-provided URLs):
|
|
62
|
+
|
|
63
|
+
```ts
|
|
64
|
+
t.calledTool("bash", { input: { command: /^pwd/ }, isError: false, times: 1 });
|
|
65
|
+
|
|
66
|
+
t.calledTool("echo", { output: (value) => String(value).includes(marker) });
|
|
67
|
+
|
|
68
|
+
t.calledSubagent("weather", {
|
|
69
|
+
remoteUrl: () => process.env.WEATHER_AGENT_URL!,
|
|
70
|
+
output: /72F/,
|
|
71
|
+
});
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Run state and derived facts
|
|
75
|
+
|
|
76
|
+
A session left open for the next message is the normal end state of a successful turn. Parking on unanswered HITL input is tracked separately — that is what `t.completed()` and `t.waiting()` key off.
|
|
77
|
+
|
|
78
|
+
Beyond the raw `t.events` stream, the runner derives typed facts the assertions read: tool calls (name, input, output, error state), subagent calls, and HITL input requests. The built-in assertions cover almost everything; when you need to read the stream directly, `t.event(predicate, label)` is the escape hatch:
|
|
79
|
+
|
|
80
|
+
```ts
|
|
81
|
+
t.event(
|
|
82
|
+
(events) =>
|
|
83
|
+
events.some((e) => e.type === "message.completed" && e.data.message?.includes(marker)),
|
|
84
|
+
"assistant reply includes the marker",
|
|
85
|
+
);
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Severity
|
|
89
|
+
|
|
90
|
+
Every assertion returns a chainable handle. Severity rides on the assertion — there is no separate thresholds map to keep in sync.
|
|
91
|
+
|
|
92
|
+
- `.gate(threshold?)` — hard. A miss marks the eval `failed` and `eve eval` exits non-zero.
|
|
93
|
+
- `.soft(threshold?)` — tracked data. A below-threshold miss marks the eval `scored`, fatal only under `--strict`. With no threshold, it is tracked-only and never fails.
|
|
94
|
+
- `.atLeast(threshold)` — soft with a bar (equivalent to `.soft(threshold)`).
|
|
95
|
+
|
|
96
|
+
The defaults are chosen so you rarely set severity. Run-level methods and `includes`/`equals`/`matches` are gates; `similarity` and every `t.judge.*` assertion are soft. Annotate only when you deviate:
|
|
97
|
+
|
|
98
|
+
```ts
|
|
99
|
+
t.calledTool("get_weather").soft(); // record the tool call as a metric, don't gate
|
|
100
|
+
t.check(t.reply, similarity("Sunny")).atLeast(0.8); // soft with a 0.8 bar: a miss fails only under --strict
|
|
101
|
+
t.check(t.reply, includes("error")).soft(); // track without failing the build
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## What to read next
|
|
105
|
+
|
|
106
|
+
- [Judge](./judge): LLM-graded assertions with thresholds
|
|
107
|
+
- [Cases](./cases): where assertions attach
|
|
108
|
+
- [Running evals](./running): how verdicts map to exit codes
|
|
@@ -1,34 +1,38 @@
|
|
|
1
1
|
---
|
|
2
2
|
title: "Cases"
|
|
3
|
-
description: "Author
|
|
3
|
+
description: "Author single-turn and multi-turn evals with test(t), and fan one file out over a dataset."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
|
-
Each eval file is one graded case
|
|
6
|
+
Each eval file is one graded case. The runner executes its `test(t)` function against the target, captures every event, and computes a verdict from the [assertions](./assertions) you recorded. Every eval — single-turn, multi-turn, HITL, or dataset-driven — is the same shape: one `async test(t)` function that drives the agent and asserts inline.
|
|
7
7
|
|
|
8
|
-
##
|
|
8
|
+
## Single-turn evals
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
The common case sends one turn and asserts on the reply. `t.send(input)` resolves once the turn settles; `t.reply` is the last assistant message:
|
|
11
11
|
|
|
12
12
|
```ts title="evals/weather/brooklyn-forecast.eval.ts"
|
|
13
13
|
import { defineEval } from "eve/evals";
|
|
14
|
-
import {
|
|
14
|
+
import { includes } from "eve/evals/expect";
|
|
15
15
|
|
|
16
16
|
export default defineEval({
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
17
|
+
async test(t) {
|
|
18
|
+
await t.send("What is the weather in Brooklyn?");
|
|
19
|
+
t.completed();
|
|
20
|
+
t.check(t.reply, includes("Sunny"));
|
|
21
|
+
},
|
|
21
22
|
});
|
|
22
23
|
```
|
|
23
24
|
|
|
25
|
+
Some evals only care about behavior, not text — assert on the run and skip the content check entirely:
|
|
26
|
+
|
|
24
27
|
```ts title="evals/weather/no-tools-for-greetings.eval.ts"
|
|
25
28
|
import { defineEval } from "eve/evals";
|
|
26
|
-
import { Checks } from "eve/evals/checks";
|
|
27
29
|
|
|
28
30
|
export default defineEval({
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
31
|
+
async test(t) {
|
|
32
|
+
await t.send("Hello!");
|
|
33
|
+
t.completed();
|
|
34
|
+
t.notCalledTool("get_weather");
|
|
35
|
+
},
|
|
32
36
|
});
|
|
33
37
|
```
|
|
34
38
|
|
|
@@ -45,70 +49,95 @@ evals/
|
|
|
45
49
|
└── smoke.eval.ts
|
|
46
50
|
```
|
|
47
51
|
|
|
48
|
-
##
|
|
52
|
+
## Multi-turn evals
|
|
49
53
|
|
|
50
|
-
|
|
54
|
+
Drive several turns in sequence — branching, HITL approvals, structured output, attachments, multiple sessions. Because assertions live in the function, an intermediate value is just a local variable: judge a draft before the next turn overwrites it, then keep going.
|
|
51
55
|
|
|
52
|
-
```ts title="evals/
|
|
56
|
+
```ts title="evals/draft-then-send.eval.ts"
|
|
53
57
|
import { defineEval } from "eve/evals";
|
|
54
|
-
import {
|
|
55
|
-
import { Text } from "eve/evals/scores";
|
|
58
|
+
import { includes } from "eve/evals/expect";
|
|
56
59
|
|
|
57
|
-
|
|
58
|
-
|
|
60
|
+
export default defineEval({
|
|
61
|
+
async test(t) {
|
|
62
|
+
const draft = await t.send("Draft the follow-up email.");
|
|
63
|
+
t.check(draft.message, includes("Best regards"));
|
|
64
|
+
t.judge.autoevals.closedQA("professional tone", { on: draft.message }).atLeast(0.6);
|
|
59
65
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
expected: row.sql,
|
|
65
|
-
scores: [Text.exact()],
|
|
66
|
-
}),
|
|
67
|
-
);
|
|
66
|
+
await t.send("Now send it.");
|
|
67
|
+
t.calledTool("send_email");
|
|
68
|
+
},
|
|
69
|
+
});
|
|
68
70
|
```
|
|
69
71
|
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
## Scripted evals
|
|
72
|
+
Bespoke preconditions that no built-in assertion expresses are plain `throw`s — a thrown error marks the eval `failed` with the message in the result:
|
|
73
73
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
```ts title="evals/approve-tool.eval.ts"
|
|
74
|
+
```ts title="evals/session-continuity.eval.ts"
|
|
77
75
|
import { defineEval } from "eve/evals";
|
|
78
|
-
import {
|
|
76
|
+
import { includes } from "eve/evals/expect";
|
|
79
77
|
|
|
80
78
|
export default defineEval({
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
const
|
|
85
|
-
|
|
79
|
+
requires: ["mockModels"],
|
|
80
|
+
async test(t) {
|
|
81
|
+
await t.send("My favorite word is marigold.");
|
|
82
|
+
const firstSessionId = t.sessionId;
|
|
83
|
+
|
|
84
|
+
const second = await t.send("Thanks for remembering.");
|
|
85
|
+
second.expectOk();
|
|
86
|
+
if (t.sessionId !== firstSessionId) {
|
|
87
|
+
throw new Error(`Expected one session; got ${firstSessionId} then ${t.sessionId}.`);
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
t.completed();
|
|
91
|
+
t.check(second.message, includes("Thanks for remembering."));
|
|
86
92
|
},
|
|
87
|
-
checks: [Checks.didNotFail(), Checks.toolCalled("bash", { input: { command: /pwd/ } })],
|
|
88
|
-
scores: [],
|
|
89
93
|
});
|
|
90
94
|
```
|
|
91
95
|
|
|
92
|
-
The
|
|
93
|
-
|
|
94
|
-
## The session API
|
|
96
|
+
## The drive API
|
|
95
97
|
|
|
96
|
-
`
|
|
98
|
+
`t` drives the primary session; `t.newSession()` returns an independent `EveEvalSession` against the same target, whose events feed the same run-level assertions.
|
|
97
99
|
|
|
98
|
-
- `
|
|
99
|
-
- `
|
|
100
|
-
- `
|
|
101
|
-
- `
|
|
102
|
-
- `
|
|
100
|
+
- `t.send(input)` sends a turn and waits for it to settle. It accepts the same input as `ClientSession.send()` — a string or a structured message — and resolves to a turn carrying `.message` and `.expectOk()`.
|
|
101
|
+
- `t.sendFile(text, path, mediaType?)` attaches a local file as a data URL.
|
|
102
|
+
- `t.expectInputRequests(filter?)` asserts the previous turn parked on HITL input and returns the pending requests.
|
|
103
|
+
- `t.respond(...responses)` answers specific pending input requests and sends them as the next turn.
|
|
104
|
+
- `t.respondAll(optionId)` answers every pending input request with the same option and sends the responses as the next turn.
|
|
105
|
+
- `t.reply` is the last assistant message (or `null`); `t.sessionId` is the current session id; `t.events` is the full typed event stream captured so far.
|
|
103
106
|
|
|
104
|
-
Each `send`
|
|
107
|
+
Each `send` (and `respond`/`respondAll`) resolves to a turn whose `expectOk()` throws only when the turn ended in failure — a session left open for the next message is the normal end state of a successful turn.
|
|
105
108
|
|
|
106
|
-
Events from every
|
|
109
|
+
Events from every session are captured in the result and artifacts. `t.log(message)` records debug lines into the eval artifact; `--verbose` also streams them to stdout as evals run. `t.signal` is an `AbortSignal` that fires on timeout.
|
|
107
110
|
|
|
108
111
|
For driving sessions created outside the eval — by a channel webhook or a schedule — see [Targets and requirements](./targets).
|
|
109
112
|
|
|
113
|
+
## Datasets: exporting an array
|
|
114
|
+
|
|
115
|
+
To fan one file out over a dataset, default-export an array of `defineEval(...)` values. Eval modules are ESM, so top-level `await` can load anything. Ids derive from the file name plus a zero-padded index (`sql/0000`, `sql/0001`, …, in array order). The loaders (`loadJson`, `loadYaml` from `eve/evals/loaders`) parse fixture files relative to the app root:
|
|
116
|
+
|
|
117
|
+
```ts title="evals/sql.eval.ts"
|
|
118
|
+
import { defineEval } from "eve/evals";
|
|
119
|
+
import { loadYaml } from "eve/evals/loaders";
|
|
120
|
+
import { equals } from "eve/evals/expect";
|
|
121
|
+
|
|
122
|
+
const doc = await loadYaml("evals/data/cases.yaml");
|
|
123
|
+
const rows = doc.evals as readonly { task: string; prompt: string; sql: string }[];
|
|
124
|
+
|
|
125
|
+
export default rows.map((row) =>
|
|
126
|
+
defineEval({
|
|
127
|
+
description: row.task,
|
|
128
|
+
async test(t) {
|
|
129
|
+
await t.send(row.prompt);
|
|
130
|
+
t.completed();
|
|
131
|
+
t.check(t.reply, equals(row.sql));
|
|
132
|
+
},
|
|
133
|
+
}),
|
|
134
|
+
);
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
The loaders are meant for fixtures, not runtime agent code.
|
|
138
|
+
|
|
110
139
|
## What to read next
|
|
111
140
|
|
|
112
|
-
- [
|
|
113
|
-
- [
|
|
141
|
+
- [Assertions](./assertions): assert on what the eval did
|
|
142
|
+
- [Judge](./judge): grade quality with an LLM judge
|
|
114
143
|
- [TypeScript client](../client/messages): the send/turn protocol eval sessions build on
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Judge"
|
|
3
|
+
description: "Grade evals with an LLM judge via t.judge.autoevals, set thresholds on the assertion, and configure the judge model."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
When no deterministic [assertion](./assertions) captures what "good" means — factual correctness, summary quality, free-form criteria — grade the run with an LLM judge. The `t.judge.*` assertions are the only model-backed ones, and they use a judge model that is resolved separately from the agent under test: Eve only uses it for scoring, never to swap out the agent.
|
|
7
|
+
|
|
8
|
+
```ts
|
|
9
|
+
import { defineEval } from "eve/evals";
|
|
10
|
+
|
|
11
|
+
export default defineEval({
|
|
12
|
+
async test(t) {
|
|
13
|
+
await t.send("Explain quantum tunneling to a 10-year-old.");
|
|
14
|
+
t.completed();
|
|
15
|
+
t.judge.autoevals.closedQA("uses no math beyond arithmetic").atLeast(0.8);
|
|
16
|
+
},
|
|
17
|
+
});
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## The graders
|
|
21
|
+
|
|
22
|
+
The judges live under `t.judge.autoevals` — the namespace names the [Braintrust autoevals](https://github.com/braintrustdata/autoevals) grader family, so the factuality and closedQA semantics are autoevals', not Eve-invented. Each grades `t.reply` by default and is soft by default (tracked, no gate):
|
|
23
|
+
|
|
24
|
+
| Grader | Grades |
|
|
25
|
+
| ---------------------------------------- | -------------------------------------------------------------------------------------- |
|
|
26
|
+
| `t.judge.autoevals.factuality(expected)` | Factual consistency of the reply against an expected answer (A–E buckets) |
|
|
27
|
+
| `t.judge.autoevals.summarizes(expected)` | How well the reply summarizes the expected text |
|
|
28
|
+
| `t.judge.autoevals.closedQA(criteria)` | Whether the reply satisfies a free-form yes/no criterion (no expected answer to match) |
|
|
29
|
+
| `t.judge.autoevals.sql(expected)` | Semantic equivalence of two SQL statements |
|
|
30
|
+
|
|
31
|
+
The reference or criterion is the first positional argument. An optional options object follows:
|
|
32
|
+
|
|
33
|
+
- `on` — the value to grade, defaulting to `t.reply`. Pass an intermediate draft or parsed value to grade it instead.
|
|
34
|
+
- `model` / `modelOptions` — a per-call judge override (see below).
|
|
35
|
+
|
|
36
|
+
```ts
|
|
37
|
+
const draft = await t.send("Draft the welcome email.");
|
|
38
|
+
t.judge.autoevals.closedQA("professional tone", { on: draft.message }).atLeast(0.6);
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Soft scoring and thresholds
|
|
42
|
+
|
|
43
|
+
Judge assertions are soft, so the threshold rides on the assertion handle — there is no separate thresholds map:
|
|
44
|
+
|
|
45
|
+
- **No threshold** — tracked-only. The score lands in reports and artifacts and never fails the eval. Use it to watch a metric without gating on it.
|
|
46
|
+
- `.atLeast(threshold)` — a soft bar. A below-threshold score marks the eval `scored`, fatal only under `eve eval --strict`.
|
|
47
|
+
- `.gate(threshold)` — promote a judge to a hard gate that fails the eval outright.
|
|
48
|
+
|
|
49
|
+
```ts
|
|
50
|
+
t.judge.autoevals.closedQA("cites a source"); // tracked, never fails
|
|
51
|
+
t.judge.autoevals.closedQA("cites a source").atLeast(0.6); // soft, fails under --strict below 0.6
|
|
52
|
+
t.judge.autoevals.factuality(reference).gate(0.8); // hard gate at 0.8
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
A judge runs once per assertion and burns tokens, so reach for one only when nothing deterministic will do. Several slow judge calls in one eval can fan out with `await Promise.all([...])`.
|
|
56
|
+
|
|
57
|
+
## Configuring the judge model
|
|
58
|
+
|
|
59
|
+
The judge model is resolved once when the runner builds `t`. It is **never** the model under test. It can be set at three levels, and the innermost one wins:
|
|
60
|
+
|
|
61
|
+
1. **Per-call** — `t.judge.autoevals.closedQA("…", { model, modelOptions })`.
|
|
62
|
+
2. **Per-eval** — `defineEval({ judge: { model, modelOptions }, test })`.
|
|
63
|
+
3. **Project default** — `defineEvalConfig({ judge: { model, modelOptions } })` in `evals.config.ts`.
|
|
64
|
+
|
|
65
|
+
```ts title="evals/evals.config.ts"
|
|
66
|
+
import { defineEvalConfig } from "eve/evals";
|
|
67
|
+
|
|
68
|
+
export default defineEvalConfig({
|
|
69
|
+
judge: { model: "openai/gpt-5.4-mini" }, // the default judge for every eval in this tree
|
|
70
|
+
});
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
```ts title="evals/quantum.eval.ts"
|
|
74
|
+
import { defineEval } from "eve/evals";
|
|
75
|
+
|
|
76
|
+
export default defineEval({
|
|
77
|
+
judge: { model: "anthropic/claude-opus-4.8" }, // a stronger judge for this eval
|
|
78
|
+
async test(t) {
|
|
79
|
+
await t.send("Explain quantum tunneling to a 10-year-old.");
|
|
80
|
+
t.judge.autoevals.factuality(reference).atLeast(0.7);
|
|
81
|
+
t.judge.autoevals.closedQA("is concise", { model: "anthropic/claude-haiku-4.5" }); // cheaper, per-call
|
|
82
|
+
},
|
|
83
|
+
});
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
`judge` in `evals.config.ts` is optional — a tree of fully deterministic evals can omit it. But calling `t.judge.*` with no judge model resolved is a fail-fast error at eval definition time.
|
|
87
|
+
|
|
88
|
+
A **string model id** (e.g. `"anthropic/claude-opus-4.8"`) routes through the Vercel AI Gateway and needs `AI_GATEWAY_API_KEY` or `VERCEL_OIDC_TOKEN` in the environment; an **AI SDK `LanguageModel` instance** is used directly. With a model configured but no credentials, a judge-backed eval **skips visibly** like other real-model legs, so mock-model fixture runs stay green. For provider-specific judge settings, use `modelOptions.providerOptions`.
|
|
89
|
+
|
|
90
|
+
## What to read next
|
|
91
|
+
|
|
92
|
+
- [Assertions](./assertions): deterministic run-level and value assertions
|
|
93
|
+
- [Reporters](./reporters): ship judged scores to Braintrust experiments
|
|
94
|
+
- [Targets and requirements](./targets): gating judge-backed evals on credentials
|