eve 0.6.0-beta.13 → 0.6.0-beta.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. package/CHANGELOG.md +21 -0
  2. package/README.md +6 -5
  3. package/dist/docs/evals-v2-plan.md +29 -4
  4. package/dist/docs/public/README.md +1 -1
  5. package/dist/docs/public/advanced/auth-and-route-protection.md +1 -1
  6. package/dist/docs/public/advanced/instrumentation.md +1 -1
  7. package/dist/docs/public/advanced/meta.json +0 -1
  8. package/dist/docs/public/channels/eve.mdx +1 -1
  9. package/dist/docs/public/evals/cases.mdx +115 -0
  10. package/dist/docs/public/evals/checks.mdx +65 -0
  11. package/dist/docs/public/evals/meta.json +4 -0
  12. package/dist/docs/public/evals/overview.mdx +75 -0
  13. package/dist/docs/public/evals/reporters.mdx +45 -0
  14. package/dist/docs/public/evals/running.mdx +62 -0
  15. package/dist/docs/public/evals/scores.mdx +56 -0
  16. package/dist/docs/public/evals/targets.mdx +59 -0
  17. package/dist/docs/public/getting-started.mdx +12 -8
  18. package/dist/docs/public/meta.json +1 -0
  19. package/dist/docs/public/reference/cli.md +1 -1
  20. package/dist/docs/public/reference/typescript-api.md +1 -1
  21. package/dist/docs/public/tutorial/first-agent.mdx +5 -2
  22. package/dist/src/cli/banner.d.ts +7 -0
  23. package/dist/src/cli/banner.js +1 -0
  24. package/dist/src/cli/commands/channels.js +1 -1
  25. package/dist/src/cli/commands/init-git.d.ts +15 -0
  26. package/dist/src/cli/commands/init-git.js +1 -0
  27. package/dist/src/cli/commands/init.d.ts +21 -0
  28. package/dist/src/cli/commands/init.js +1 -0
  29. package/dist/src/cli/dev/change-agent-model.d.ts +27 -0
  30. package/dist/src/cli/dev/change-agent-model.js +1 -0
  31. package/dist/src/cli/dev/tui/runner.d.ts +38 -2
  32. package/dist/src/cli/dev/tui/runner.js +1 -1
  33. package/dist/src/cli/dev/tui/tui.d.ts +6 -0
  34. package/dist/src/cli/dev/tui/tui.js +1 -1
  35. package/dist/src/cli/run.d.ts +1 -0
  36. package/dist/src/cli/run.js +2 -2
  37. package/dist/src/compiled/.vendor-stamp.json +1 -1
  38. package/dist/src/compiled/just-bash/index.d.ts +4 -4
  39. package/dist/src/compiler/artifacts.js +1 -1
  40. package/dist/src/compiler/normalize-agent-config.js +1 -1
  41. package/dist/src/compiler/normalize-manifest.js +1 -1
  42. package/dist/src/compiler/normalize-sandbox.js +1 -1
  43. package/dist/src/compiler/workspace-resources.js +1 -1
  44. package/dist/src/evals/cli/eval.js +1 -1
  45. package/dist/src/evals/session.js +1 -1
  46. package/dist/src/execution/sandbox/bash-tool.d.ts +6 -6
  47. package/dist/src/execution/sandbox/bash-tool.js +1 -1
  48. package/dist/src/execution/sandbox/bindings/local.js +1 -1
  49. package/dist/src/harness/tool-loop.js +1 -1
  50. package/dist/src/internal/application/cache-metadata.js +1 -1
  51. package/dist/src/internal/application/compiled-artifacts.js +1 -1
  52. package/dist/src/internal/application/package.js +1 -1
  53. package/dist/src/internal/application/paths.js +1 -1
  54. package/dist/src/internal/nitro/dev-runtime-artifacts.js +1 -1
  55. package/dist/src/internal/nitro/host/build-application.js +1 -1
  56. package/dist/src/internal/nitro/host/build-vercel-agent-summary.js +1 -1
  57. package/dist/src/internal/nitro/host/configure-nitro-routes.js +3 -3
  58. package/dist/src/internal/nitro/host/create-application-nitro.js +1 -1
  59. package/dist/src/internal/nitro/host/prepare-application-host.js +1 -1
  60. package/dist/src/internal/nitro/host/start-production-server.js +1 -1
  61. package/dist/src/internal/workflow-bundle/builder-support.js +2 -2
  62. package/dist/src/internal/workflow-bundle/builder.js +3 -3
  63. package/dist/src/public/next/server.js +1 -1
  64. package/dist/src/public/nuxt/dev-server.js +1 -1
  65. package/dist/src/public/sveltekit/dev-server.js +1 -1
  66. package/dist/src/public/tools/define-bash-tool.d.ts +3 -3
  67. package/dist/src/public/tools/define-bash-tool.js +1 -1
  68. package/dist/src/public/tools/index.d.ts +1 -1
  69. package/dist/src/public/tools/index.js +1 -1
  70. package/dist/src/runtime/agent/mock-model-adapter.js +1 -1
  71. package/dist/src/runtime/agent/mock-model-fixtures.js +3 -2
  72. package/dist/src/runtime/agent/mock-model-skill-selection.js +3 -4
  73. package/dist/src/runtime/framework-tools/bash.d.ts +3 -3
  74. package/dist/src/runtime/framework-tools/bash.js +1 -1
  75. package/dist/src/runtime/governance/auth/http-basic.js +1 -1
  76. package/dist/src/runtime/sandbox/keys.js +1 -1
  77. package/dist/src/setup/boxes/add-channels.js +1 -1
  78. package/dist/src/setup/boxes/deploy-project.js +1 -1
  79. package/dist/src/setup/boxes/one-shot-next-steps.js +1 -1
  80. package/dist/src/setup/boxes/resolve-target.js +1 -1
  81. package/dist/src/setup/boxes/select-channels.d.ts +5 -11
  82. package/dist/src/setup/boxes/select-model.d.ts +0 -5
  83. package/dist/src/setup/boxes/select-model.js +1 -1
  84. package/dist/src/setup/boxes/select-setup-mode.d.ts +1 -1
  85. package/dist/src/setup/boxes/select-setup-mode.js +1 -1
  86. package/dist/src/setup/cli/rail-log.d.ts +2 -0
  87. package/dist/src/setup/cli/rail-log.js +2 -2
  88. package/dist/src/setup/connection-connector.js +1 -1
  89. package/dist/src/setup/headless.d.ts +1 -1
  90. package/dist/src/setup/onboarding.d.ts +1 -1
  91. package/dist/src/setup/primitives/run-pnpm.js +1 -1
  92. package/dist/src/setup/primitives/run-vercel.js +1 -1
  93. package/dist/src/setup/project-name.d.ts +4 -0
  94. package/dist/src/setup/project-name.js +1 -0
  95. package/dist/src/setup/scaffold/channels-catalog.d.ts +2 -2
  96. package/dist/src/setup/scaffold/create/project.js +1 -1
  97. package/dist/src/setup/scaffold/update/channels.js +1 -1
  98. package/dist/src/setup/scaffold/update/pnpm-workspace.d.ts +1 -1
  99. package/dist/src/setup/scaffold/update/pnpm-workspace.js +2 -5
  100. package/dist/src/setup/slackbot.js +1 -1
  101. package/dist/src/setup/state.d.ts +3 -4
  102. package/dist/src/setup/step.d.ts +1 -1
  103. package/dist/src/setup/vercel-project.js +1 -1
  104. package/dist/src/shared/default-agent-model.d.ts +5 -0
  105. package/dist/src/shared/default-agent-model.js +1 -0
  106. package/package.json +1 -1
  107. package/dist/docs/public/advanced/evals.md +0 -225
  108. package/dist/src/cli/commands/setup.d.ts +0 -58
  109. package/dist/src/cli/commands/setup.js +0 -1
package/CHANGELOG.md CHANGED
@@ -1,5 +1,26 @@
1
1
  # eve
2
2
 
3
+ ## 0.6.0-beta.15
4
+
5
+ ### Minor Changes
6
+
7
+ - 0a8d93b: Add `eve init <name>` for non-interactive agent creation, with optional Web Chat through `--web`. The command installs dependencies, initializes Git, and starts the development server, but does not provision or deploy a Vercel project.
8
+ - f733fe5: The deterministic mock model adapter (`EVE_MOCK_AUTHORED_MODELS=1`) now synthesizes tool inputs for string properties anchored to quoted spans in the message (e.g. `with label "x"`), honors exact-reply fixture directives (`reply with the exact string … and nothing else`, `include the exact token … verbatim`) found in the current turn's user messages and per-turn client context, discovers static skill advertisements embedded in instruction text (not just standalone announcements), derives skill activation from `load_skill` calls in history so skills are never re-loaded in a loop, and no longer matches `load_skill` by explicit name. This widens what agent smoke evals can assert deterministically without a real model.
9
+
10
+ ### Patch Changes
11
+
12
+ - 0a8d93b: Scaffolded projects now carry a `packageExtensions` entry in `pnpm-workspace.yaml` that restores eve's missing `oxc-parser` dependency, so fresh agents boot against the published `eve` 0.6.0-beta.13/14 builds that import it without declaring it. The entry lives in `pnpm-workspace.yaml` because pnpm 11 reads settings only from there, not from the `package.json` `pnpm` field. Remove it from the template once the scaffolded `eve` range only resolves to versions that declare `oxc-parser` themselves.
13
+
14
+ ## 0.6.0-beta.14
15
+
16
+ ### Minor Changes
17
+
18
+ - 9d806d0: Add a `/model` command to the `eve dev` TUI. Run it bare to pick from a list (the running model flagged, a catalog-validated shortlist, and a freeform entry), or `/model <provider/model-id>` to set one directly. The pick rewrites `agent.ts`, and the dev server's HMR watcher applies it on the next prompt.
19
+
20
+ ### Patch Changes
21
+
22
+ - c99668f: Correct the public bash tool helper export from `defineBeveTool` to `defineBashTool`.
23
+
3
24
  ## 0.6.0-beta.13
4
25
 
5
26
  ### Minor Changes
package/README.md CHANGED
@@ -45,7 +45,7 @@ Every authored directory has a typed helper. Import each from the matching subpa
45
45
  | ------------------------------------------------------------------------------------------------------------------- | ------------------------------------- | ------------------------------------------------ |
46
46
  | `defineAgent(...)` | `eve` | `agent.ts`, `subagents/<id>/agent.ts` |
47
47
  | `defineInstructions(...)` | `eve/instructions` | `instructions.ts` (or `instructions.md`) |
48
- | `defineTool(...)`, `defineBeveTool(...)`, `defineReadFileTool(...)`, `defineWriteFileTool(...)`, `disableTool(...)` | `eve/tools` | `tools/<name>.ts` |
48
+ | `defineTool(...)`, `defineBashTool(...)`, `defineReadFileTool(...)`, `defineWriteFileTool(...)`, `disableTool(...)` | `eve/tools` | `tools/<name>.ts` |
49
49
  | `defineSkill(...)`, `getSkill(...)` | `eve/skills` | `skills/<name>.ts` (or `skills/<name>.md`) |
50
50
  | `defineHook(...)` | `eve/hooks` | `hooks/<slug>.ts` |
51
51
  | `defineChannel(...)`, `POST`, `GET` | `eve/channels` | `channels/<name>.ts` |
@@ -105,15 +105,16 @@ export default defineAgent({
105
105
  ## Quick Start
106
106
 
107
107
  ```bash
108
- pnpm create eve@beta
109
- cd my-agent
110
- pnpm dev
108
+ pnpm dlx eve@beta init my-agent
111
109
  ```
112
110
 
113
- The wizard scaffolds the project, picks a model, and (for the REPL channel) installs dependencies and starts the dev server for you. To scaffold into the current empty directory, run `pnpm create eve@beta .`.
111
+ `eve init` writes a new agent with Eve's default model. Pass `--web` to add the
112
+ Web Chat application. It installs dependencies, initializes Git, and starts the
113
+ development server. It does not create a Vercel project or deploy the agent.
114
114
 
115
115
  CLI commands:
116
116
 
117
+ - `eve init <name>` — create a new agent
117
118
  - `eve info` — discovery results and compiled artifacts
118
119
  - `eve build` — compile `.eve/` and build the host output
119
120
  - `eve start` — serve the built `.output/` app
@@ -812,13 +812,13 @@ weather-result-reaches-parent` reruns it in isolation.
812
812
 
813
813
  ### CI
814
814
 
815
- `.github/workflows/smoke.yml` discovery changes from globbing
815
+ `.github/workflows/e2e.yml` discovery changes from globbing
816
816
  `e2e/tests/*/*.ts` to globbing fixture apps with `evals/` directories. Each
817
817
  matrix leg provisions, then runs the evals against the resulting URL:
818
818
 
819
819
  ```sh
820
820
  node e2e/provision/<group>.ts & # build + sidecars + env + eve start
821
- pnpm --filter <fixture-app> exec eve eval --strict --json --url "$TARGET_URL"
821
+ pnpm --filter <fixture-app> exec eve eval --strict --url "$TARGET_URL"
822
822
  ```
823
823
 
824
824
  twice (direct / `EVE_EXPERIMENTAL_CODE_MODE=1` set by the provisioner),
@@ -842,7 +842,7 @@ Phase 5.
842
842
  ## Implementation phases
843
843
 
844
844
  Each phase ships independently, keeps `pnpm test` green, and includes docs
845
- (`docs/public/advanced/evals.md`) + changesets per repo policy.
845
+ (now the top-level `docs/public/evals/` section) + changesets per repo policy.
846
846
 
847
847
  ### Phase 1 — Assertions and pass/fail (no breaking interaction changes)
848
848
 
@@ -906,6 +906,31 @@ addresses. Nothing in phase 4 deploys anywhere.
906
906
  area policy; shrink `e2e/lib`/`e2e/target` into `e2e/provision/`; update
907
907
  `e2e/README.md` and AGENTS.md smoke-test guidance to point at `eve eval`.
908
908
 
909
+ > **Decisions made during the phase-4 migration (June 2026):**
910
+ >
911
+ > - **Hybrid mock/real split.** Evals default to the deterministic mock
912
+ > (`requires: ["mockModels"]`). Behaviors the mock cannot express — the
913
+ > built-in `agent` tool, Workflow fan-out, provider-executed `web_search`,
914
+ > url-file attachment reads — live in separate `real-model`-tagged eval
915
+ > files run against a non-mock server only when model credentials are
916
+ > present (provisioners log a visible skip otherwise). Net: real-model CI
917
+ > legs dropped from ~156 to ~7 cases.
918
+ > - **Code-mode e2e dropped entirely.** No `EVE_EXPERIMENTAL_CODE_MODE`
919
+ > matrix dimension and no codemode group port; the `agent-codemode` fixture
920
+ > was deleted. Purpose-built code-mode fixtures/evals will be designed
921
+ > later.
922
+ > - **Mock adapter extensions** (in `eve`): anchored quoted-span tool-input
923
+ > synthesis, exact-reply directives in current-turn user text (proves
924
+ > clientContext delivery), embedded static-skill advert discovery, and
925
+ > history-derived skill-activation tracking.
926
+ > - **Coverage consciously dropped:** semantic multi-turn recall and the
927
+ > channel-rekey marker recall (pure model behaviors), `subagent-output-schema`
928
+ > moved to the real leg, and the eval-CLI self-smoke (`evals.ts`) — now the
929
+ > CI mechanism itself.
930
+ > - Sidecar fake providers expose a `GET /_probe/requests` inspection
931
+ > endpoint (`e2e/lib/http-test-server.ts`) so evals assert captured
932
+ > traffic across the provisioner process boundary.
933
+
909
934
  ### Phase 5 — Remote leg: deploy fixtures and eval the deployment
910
935
 
911
936
  After phase 4, CI gains a second e2e variant that runs the **same eval
@@ -918,7 +943,7 @@ skip.
918
943
  preview env sets `EVE_MOCK_AUTHORED_MODELS=1` (determinism, zero model
919
944
  spend) and the workflow backend credentials.
920
945
  16. CI job: deploy the preview, capture the URL, then
921
- `eve eval --strict --json --junit ... --url "$DEPLOY_URL"` with
946
+ `eve eval --strict --junit ... --url "$DEPLOY_URL"` with
922
947
  `VERCEL_AUTOMATION_BYPASS_SECRET` in the job env (the eval client's
923
948
  existing remote-auth cascade handles protection bypass / OIDC). No
924
949
  `--no-skips` on this leg: schedule evals (`devRoutes`) and local-sidecar
@@ -30,7 +30,7 @@ Read in this order:
30
30
  14. [TypeScript Client](./client/overview.mdx)
31
31
  15. [Subagents](./subagents.mdx)
32
32
  16. [Schedules](./schedules.mdx)
33
- 17. [Evals](./advanced/evals.md)
33
+ 17. [Evals](./evals/overview.mdx)
34
34
  18. [Auth And Route Protection](./advanced/auth-and-route-protection.md)
35
35
  19. [Vercel Deployment](./advanced/deployment.md)
36
36
  20. [CLI, Build, And Debugging](./reference/cli.md)
@@ -184,7 +184,7 @@ export default defineChannel({
184
184
 
185
185
  ## Replace `placeholderAuth` before production
186
186
 
187
- `pnpm create eve@beta` sometimes scaffolds `agent/channels/eve.ts` with a `placeholderAuth()` guardrail:
187
+ `eve init` scaffolds `agent/channels/eve.ts` with a `placeholderAuth()` guardrail:
188
188
 
189
189
  ```ts
190
190
  import { eveChannel } from "eve/channels/eve";
@@ -162,4 +162,4 @@ When `eve build` fails on discovery errors, the CLI prints the full diagnostics
162
162
  - [`agent.ts`](../agent-config)
163
163
  - [Hooks](./hooks): observe the runtime event stream
164
164
  - [Dev TUI](./dev-tui): drive the agent locally
165
- - [Evals](./evals): repeatable scored checks
165
+ - [Evals](../evals/overview): repeatable scored checks
@@ -14,7 +14,6 @@
14
14
  "deployment",
15
15
  "instrumentation",
16
16
  "dev-tui",
17
- "evals",
18
17
  "execution-model-and-durability",
19
18
  "sessions-runs-and-streaming"
20
19
  ]
@@ -36,7 +36,7 @@ The `auth` option decides who can call these routes. The built-in helpers are me
36
36
 
37
37
  Neither admits browser users or external clients in production. For a public app, wire the channel to your own auth (Clerk, Auth.js, or your own OIDC/JWT verification).
38
38
 
39
- `pnpm create eve@beta` scaffolds an `agent/channels/eve.ts` with a production placeholder so you replace it before going live. The generated channel allows Vercel OIDC and localhost, and includes `placeholderAuth()`, which returns a setup-focused 401 in production until you swap it for real auth. Delete the file and Eve falls back to `[localDev(), vercelOidc()]`, which still does not admit browser users in production.
39
+ `eve init` scaffolds an `agent/channels/eve.ts` with a production placeholder so you replace it before going live. The generated channel allows Vercel OIDC and localhost, and includes `placeholderAuth()`, which returns a setup-focused 401 in production until you swap it for real auth. Delete the file and Eve falls back to `[localDev(), vercelOidc()]`, which still does not admit browser users in production.
40
40
 
41
41
  For the full auth model and helper list, see [Auth & route protection](../advanced/auth-and-route-protection).
42
42
 
@@ -0,0 +1,115 @@
1
+ ---
2
+ title: "Cases and tasks"
3
+ description: "Author data cases, load fixtures with loaders, and script multi-turn cases with run(ctx)."
4
+ ---
5
+
6
+ A case is one graded unit of work: the runner executes it against the target, captures every event, and applies [checks](./checks) and [scores](./scores) to the result. A case is either a data case or a scripted case.
7
+
8
+ ## Data cases
9
+
10
+ Data cases pair an `input` with an optional `expected`, plus optional `checks`, `scores`, `requires`, `tags`, and `metadata`. `input` can be a string or an object. Omit `expected` when you only care about behavior:
11
+
12
+ ```ts title="evals/weather.eval.ts"
13
+ import { defineEval } from "eve/evals";
14
+ import { Checks } from "eve/evals/checks";
15
+
16
+ export default defineEval({
17
+ checks: [Checks.didNotFail()],
18
+ scores: [],
19
+ cases: [
20
+ { id: "brooklyn-forecast", input: "What is the weather in Brooklyn?", expected: "Sunny" },
21
+ {
22
+ id: "no-tools-for-greetings",
23
+ input: "Hello!",
24
+ checks: [Checks.toolNotCalled("get_weather")],
25
+ },
26
+ ],
27
+ });
28
+ ```
29
+
30
+ Eval-level `checks` and `scores` apply to every case; case-level entries append to them.
31
+
32
+ ## Loading cases from fixtures
33
+
34
+ List cases inline, or load them dynamically with `load`. The loaders (`loadJson`, `loadYaml` from `eve/evals/loaders`) resolve paths relative to the app root:
35
+
36
+ ```ts title="evals/sql.eval.ts"
37
+ import { defineEval } from "eve/evals";
38
+ import { loadYaml } from "eve/evals/loaders";
39
+ import { Text } from "eve/evals/scores";
40
+
41
+ export default defineEval({
42
+ async load() {
43
+ const doc = await loadYaml("evals/data/cases.yaml");
44
+ return (doc.evals as readonly { task: string; prompt: string; sql: string }[]).map((row) => ({
45
+ id: row.task,
46
+ input: row.prompt,
47
+ expected: row.sql,
48
+ }));
49
+ },
50
+ scores: [Text.exact()],
51
+ });
52
+ ```
53
+
54
+ Pass either `cases` or `load`, never both. The loaders are meant for fixtures, not runtime agent code.
55
+
56
+ ## Tasks
57
+
58
+ The eval-level `task` controls how the runner turns a data case into agent work:
59
+
60
+ - `task.prompt` covers the single-string case — a template that interpolates the case input.
61
+ - `task.messages` sets up a static multi-turn conversation.
62
+ - `task.run` gives you imperative control flow for every case in the eval.
63
+ - `task.parseOutput` compares a transformed result rather than the raw final message.
64
+
65
+ A task can specify at most one of `prompt`, `messages`, and `run`. When none is set, the case `input` is sent as a single user message.
66
+
67
+ ## Scripted cases
68
+
69
+ Scripted cases define their own `run(ctx)` and do not need `input`. Use them for smoke-style behavior: multi-turn branching, human-in-the-loop (HITL) approvals, structured output, attachments, and multiple independent sessions.
70
+
71
+ ```ts title="evals/approvals.eval.ts"
72
+ import { defineEval } from "eve/evals";
73
+ import { Checks } from "eve/evals/checks";
74
+
75
+ export default defineEval({
76
+ checks: [Checks.didNotFail()],
77
+ scores: [],
78
+ cases: [
79
+ {
80
+ id: "approve-tool",
81
+ async run({ session }) {
82
+ await session.send("run `pwd`");
83
+ session.expectInputRequests({ toolName: "bash" });
84
+ const turn = await session.respondAll("approve");
85
+ return turn.message;
86
+ },
87
+ checks: [Checks.toolCalled("bash", { input: { command: /pwd/ } })],
88
+ },
89
+ ],
90
+ });
91
+ ```
92
+
93
+ The return value of `run` becomes the case output that scorers grade. Throwing marks the case failed with the error message in the result.
94
+
95
+ ## The session API
96
+
97
+ `ctx.session` is the primary `EveEvalSession`; `ctx.newSession()` creates another independent session against the same target:
98
+
99
+ - `session.send(input)` sends a turn and waits for it to settle. It accepts the same input as `ClientSession.send()` — a string or a structured message.
100
+ - `session.sendFile(text, path, mediaType?)` attaches a local file as a data URL.
101
+ - `session.expectInputRequests(filter?)` asserts the previous turn parked on HITL input and returns the pending requests.
102
+ - `session.respondAll(optionId)` answers every pending input request with the same option and sends the responses as the next turn.
103
+ - `session.events` is the full typed event stream captured so far.
104
+
105
+ Each `send` resolves to an `EveEvalTurn` carrying the turn's `message`, `events`, and status. `turn.expectOk()` throws only when the turn ended in failure — a session left open for the next message is the normal end state of a successful turn.
106
+
107
+ Events from every eval session are captured in the case result and artifacts. `ctx.log(message)` records debug lines into the case artifact; `--verbose` also streams them to stdout as cases run.
108
+
109
+ For driving sessions created outside the eval — by a channel webhook or a schedule — see [Targets and requirements](./targets).
110
+
111
+ ## What to read next
112
+
113
+ - [Checks](./checks): assert on what the case did
114
+ - [Scores](./scores): grade how well it did it
115
+ - [TypeScript client](../client/messages): the send/turn protocol eval sessions build on
@@ -0,0 +1,65 @@
1
+ ---
2
+ title: "Checks"
3
+ description: "Hard assertions over runs, tool calls, and output — any failed check fails the case and the exit code."
4
+ ---
5
+
6
+ Checks are hard assertions. Any failed check marks the case failed and `eve eval` exits non-zero. Use them for the things that must hold — the run completed, the right tool ran, the output parses. For graded, non-fatal signals, use [scores](./scores) instead.
7
+
8
+ Checks exist at the eval level (applied to every case) and the case level (appended to the eval's).
9
+
10
+ ## Built-in checks
11
+
12
+ Built-in checks live on `eve/evals/checks`:
13
+
14
+ | Check | Asserts |
15
+ | ------------------------------------------------------------- | --------------------------------------------------------------------------------- |
16
+ | `Checks.completed()` | The run did not fail and did not park on unanswered HITL input |
17
+ | `Checks.waiting()` | The run parked on HITL input (for approval-shaped evals) |
18
+ | `Checks.didNotFail()` | No terminal failure and no `turn.failed`/`step.failed` events (parked runs pass) |
19
+ | `Checks.messageIncludes(token)` | Joined assistant text contains `token` (string or RegExp) |
20
+ | `Checks.outputEquals(value)` / `Checks.outputMatches(schema)` | Deep equality / Standard Schema (e.g. Zod) validation of the parsed output |
21
+ | `Checks.toolCalled(name, opts?)` | A matching tool call happened (`input`, `output`, `isError`, `times` constraints) |
22
+ | `Checks.toolNotCalled(name)` | No call to `name` |
23
+ | `Checks.toolOrder([...names])` | Tool names appear in order (other calls may interleave) |
24
+ | `Checks.noFailedActions()` | No tool, subagent, or skill action reported a failure |
25
+ | `Checks.subagentCalled(name, opts?)` | A subagent delegation happened (`remoteUrl`, `output` constraints) |
26
+ | `Checks.event(predicate, label)` | Escape hatch: any predicate over the typed event stream |
27
+
28
+ ## Matchers
29
+
30
+ Matcher options accept a literal (objects match partially and deeply), a RegExp, or a function. A function can return a boolean to act as a predicate, or return a value to compare against (handy for runner-assigned values like environment-provided URLs):
31
+
32
+ ```ts
33
+ Checks.toolCalled("bash", { input: { command: /^pwd/ }, isError: false, times: 1 });
34
+ Checks.subagentCalled("weather", {
35
+ remoteUrl: () => process.env.WEATHER_AGENT_URL!,
36
+ output: /72F/,
37
+ });
38
+ ```
39
+
40
+ ## Custom checks
41
+
42
+ A custom check is a plain function receiving `{ case, result, target }` and returning `{ name, passed, message? }`:
43
+
44
+ ```ts
45
+ import type { EveEvalCheck } from "eve/evals/checks";
46
+
47
+ const repliedFast: EveEvalCheck = ({ result }) => ({
48
+ name: "replied-fast",
49
+ passed: result.durationMs < 5_000,
50
+ message: `took ${result.durationMs}ms`,
51
+ });
52
+ ```
53
+
54
+ Write a `message` for the failing path — it is what the console reporter prints under the case line and what lands in JUnit output.
55
+
56
+ ## Run status and parking
57
+
58
+ `result.status` is `"waiting"` whenever the session is left open for a next message — the normal end state of a successful turn. Parking on unanswered HITL input is tracked separately as `result.derived.parked`, which is what `Checks.completed()` and `Checks.waiting()` key off.
59
+
60
+ `result.derived` also carries typed tool calls (name, input, output, error state), subagent calls, and HITL input requests, so custom checks rarely need to walk the raw event stream. When they do, `Checks.event(predicate, label)` covers it.
61
+
62
+ ## What to read next
63
+
64
+ - [Scores](./scores): graded signals with thresholds
65
+ - [Cases and tasks](./cases): where checks attach
@@ -0,0 +1,4 @@
1
+ {
2
+ "title": "Evals",
3
+ "pages": ["overview", "cases", "checks", "scores", "targets", "reporters", "running"]
4
+ }
@@ -0,0 +1,75 @@
1
+ ---
2
+ title: "Evals"
3
+ description: "Define repeatable scored checks for an Eve agent with defineEval and run them with eve eval."
4
+ ---
5
+
6
+ An eval is a scored check that runs your agent against real sessions and grades the result. Use it to catch regressions when you change a prompt or a tool: compare the output against expected text, JSON, SQL, or behavior, and optionally ship the results to Braintrust.
7
+
8
+ Evals exercise the same HTTP surface your users hit. The runner boots (or targets) a real agent server, drives sessions through the [TypeScript client](../client/overview) protocol, and grades what comes back — so a passing eval means the agent actually booted, accepted a request, and produced the result you asserted.
9
+
10
+ ## `defineEval`
11
+
12
+ Eve discovers evals under the app-root `evals/` directory, in `.eval.ts` files. The file path is the eval's identity, so you don't author an `id` or `name`.
13
+
14
+ ```text
15
+ my-agent/
16
+ ├── agent/
17
+ ├── evals/
18
+ │ ├── smoke.eval.ts
19
+ │ └── weather.eval.ts
20
+ └── package.json
21
+ ```
22
+
23
+ ```ts title="evals/weather.eval.ts"
24
+ import { defineEval } from "eve/evals";
25
+ import { Checks } from "eve/evals/checks";
26
+ import { Run } from "eve/evals/scores";
27
+
28
+ export default defineEval({
29
+ description: "Basic message and tool-usage coverage for the weather agent.",
30
+ cases: [
31
+ { id: "brooklyn-forecast", input: "What is the weather in Brooklyn?", expected: "Sunny" },
32
+ ],
33
+ checks: [Checks.didNotFail(), Checks.toolCalled("get_weather")],
34
+ scores: [Run.didNotFail()],
35
+ });
36
+ ```
37
+
38
+ Every eval needs `scores` (an empty array is fine) and either `cases` or `load`. The rest are optional: `description`, `task`, `checks`, `requires`, `model`, `thresholds`, `modelOptions`, `tags`, `metadata`, `maxConcurrency`, `timeoutMs`, `reporters`. The init template adds `evals/**/*.ts` to `tsconfig.json`, so your eval code type-checks alongside the app.
39
+
40
+ ## Two grading tiers
41
+
42
+ Evals grade a case on two distinct tiers:
43
+
44
+ - **[Checks](./checks) are hard assertions.** Any failed check marks the case failed and `eve eval` exits non-zero. Use them for the things that must hold — the run completed, the right tool ran, the output parses.
45
+ - **[Scores](./scores) are soft data.** They land in reports and artifacts, and a below-threshold score marks the case `scored` — visible but not fatal, unless you pass `--strict`.
46
+
47
+ Both exist at the eval level (applied to every case) and the case level (appended to the eval's).
48
+
49
+ ## Run it
50
+
51
+ ```bash
52
+ eve eval # run all discovered evals against a local dev server
53
+ eve eval weather # run selected evals
54
+ eve eval --url https://<app> # target an existing server or deployment
55
+ ```
56
+
57
+ Exit code `0` means every case passed its checks. See [Running evals](./running) for the full flag list, exit codes, and CI guidance.
58
+
59
+ ## A good baseline
60
+
61
+ Most apps do fine with a small smoke eval. Assert behavior with `Checks.didNotFail()` plus one or two content checks, keep case fixtures in `evals/data/`, and only reach for Braintrust once you actually need shared result review or experiment history. In CI, run `eve eval --strict` so threshold misses fail the build too.
62
+
63
+ The rest of this section covers each piece:
64
+
65
+ - [Cases and tasks](./cases): data cases, loaders, and scripted multi-turn cases
66
+ - [Checks](./checks): hard assertions over runs, tools, and output
67
+ - [Scores](./scores): deterministic and LLM-judged scorers, with thresholds
68
+ - [Targets and requirements](./targets): local vs remote targets, and gating cases on capabilities
69
+ - [Reporters](./reporters): Braintrust experiments and JUnit XML
70
+ - [Running evals](./running): the `eve eval` CLI, exit codes, and artifacts
71
+
72
+ ## What to read next
73
+
74
+ - [Cases and tasks](./cases): author your first cases
75
+ - [Tools](../tools): the surface most evals assert on
@@ -0,0 +1,45 @@
1
+ ---
2
+ title: "Reporters"
3
+ description: "Ship eval results to Braintrust experiments or JUnit XML — Eve runs and scores everything itself."
4
+ ---
5
+
6
+ Eve runs and scores everything itself; reporters just ship the results out. The CLI prints a console summary by default — one line per case, failed checks with their messages — and eval-level reporters from `eve/evals/reporters` add destinations on top.
7
+
8
+ ## Braintrust
9
+
10
+ `Braintrust(...)` uploads eval results to Braintrust experiments:
11
+
12
+ ```ts title="evals/weather.eval.ts"
13
+ import { defineEval } from "eve/evals";
14
+ import { Braintrust } from "eve/evals/reporters";
15
+ import { Run } from "eve/evals/scores";
16
+
17
+ export default defineEval({
18
+ cases: [{ id: "brooklyn-forecast", input: "What is the weather in Brooklyn?" }],
19
+ scores: [Run.didNotFail()],
20
+ reporters: [Braintrust({ projectName: "weather-agent" })],
21
+ });
22
+ ```
23
+
24
+ The config takes an optional `projectName` and `experimentName`, plus a base experiment (by name or id) to diff against. Checks log as binary scores under a `check:` prefix so experiments diff check regressions the same way they diff score regressions. Eval and case `metadata` ride along to reporters.
25
+
26
+ Braintrust needs its SDK installed in the app and credentials in the environment. Pass `--skip-report` to run the eval without shipping results — useful locally when iterating.
27
+
28
+ ## JUnit
29
+
30
+ `JUnit({ filePath })` writes JUnit XML for CI annotations. The `--junit <path>` CLI flag does the same thing without touching the eval file, which is usually the better fit — CI owns the output path, not the eval:
31
+
32
+ ```bash
33
+ eve eval --strict --junit .eve/junit.xml
34
+ ```
35
+
36
+ Failed checks and execution errors land as failure messages on the matching test case, so CI surfaces them inline.
37
+
38
+ ## Custom reporters
39
+
40
+ A reporter implements the `EvalReporter` interface from `eve/evals/reporters` and receives the same structured results the built-ins do. Reach for one only when a destination isn't covered — the per-run artifacts under `.eve/evals/` already capture everything for ad-hoc inspection.
41
+
42
+ ## What to read next
43
+
44
+ - [Running evals](./running): console output, `--json`, and artifacts
45
+ - [Scores](./scores): what the reported numbers mean
@@ -0,0 +1,62 @@
1
+ ---
2
+ title: "Running evals"
3
+ description: "The eve eval CLI: flags, filters, exit codes, artifacts, and how to wire evals into CI."
4
+ ---
5
+
6
+ `eve eval` discovers every `.eval.ts` file under `evals/`, boots a local dev server (or targets a remote one), runs the cases, and prints a per-case summary.
7
+
8
+ ```bash
9
+ eve eval # run all discovered evals locally
10
+ eve eval weather smoke # run selected evals
11
+ eve eval --url https://<app> # target a remote app instead of a local host
12
+ eve eval --mock-models # local dev target uses deterministic mock models
13
+ eve eval --tag fast # only cases (or evals) carrying a tag
14
+ eve eval --case brooklyn-forecast # only specific case ids
15
+ eve eval --strict # below-threshold scores also fail the exit code
16
+ eve eval --no-skips # unmet requirements fail instead of skipping
17
+ eve eval --timeout 60000 # per-case timeout in milliseconds
18
+ eve eval --max-concurrency 4 # cap concurrent cases per eval
19
+ eve eval --junit .eve/junit.xml # write JUnit XML
20
+ eve eval --list # print discovered evals and cases without running
21
+ eve eval --verbose # stream per-case ctx.log lines to stdout
22
+ eve eval --json # machine-readable output
23
+ eve eval --skip-report # skip eval-defined reporters (e.g. Braintrust)
24
+ ```
25
+
26
+ ## Exit codes
27
+
28
+ | Code | Means |
29
+ | ---- | -------------------------------------------------------------------------------- |
30
+ | `0` | Every case passed its checks (and thresholds, under `--strict`) |
31
+ | `1` | Any case failed — a failed check, an execution error, or a strict threshold miss |
32
+ | `2` | Configuration error |
33
+
34
+ Unmet [requirements](./targets) skip visibly without affecting the exit code unless you pass `--no-skips`.
35
+
36
+ ## Artifacts
37
+
38
+ Each run drops per-eval artifacts under `.eve/evals/<timestamp>-<eval-id>/`, including per-case check results, verdicts, captured event streams, and `ctx.log` lines. The console output stays tight on purpose; when a case fails, the artifact has the full story.
39
+
40
+ ## CI
41
+
42
+ A solid CI invocation is strict, deterministic, and machine-reportable:
43
+
44
+ ```bash
45
+ eve eval --strict --mock-models --junit .eve/junit.xml
46
+ ```
47
+
48
+ - `--strict` turns threshold misses into failures, so score regressions block the merge.
49
+ - `--mock-models` keeps the default leg deterministic and credential-free. Put real-model cases in their own eval files gated on `requires: ["env:..."]`, and add `--no-skips` on legs that must prove those ran.
50
+ - `--junit` gives the CI provider per-case annotations; upload the `.eve/evals/` directory as a failure artifact for the full event streams.
51
+
52
+ Against a deployed app, swap `--mock-models` for `--url`:
53
+
54
+ ```bash
55
+ eve eval --strict --url "$DEPLOY_URL" --junit .eve/junit.xml
56
+ ```
57
+
58
+ ## What to read next
59
+
60
+ - [Targets and requirements](./targets): how `--url`, `--mock-models`, and `--no-skips` interact with targets and requirements
61
+ - [Reporters](./reporters): Braintrust and JUnit output
62
+ - [CLI reference](../reference/cli): the rest of the `eve` CLI
@@ -0,0 +1,56 @@
1
+ ---
2
+ title: "Scores"
3
+ description: "Grade eval cases with deterministic scorers or LLM judges, and gate them with thresholds."
4
+ ---
5
+
6
+ Scores are soft data. They land in reports and artifacts, and a below-threshold score marks the case `scored` — visible but not fatal, unless you pass `--strict`. Use them to grade quality fractionally where a [check](./checks) would assert it absolutely.
7
+
8
+ ## Choosing a scorer
9
+
10
+ Scorers live in namespaces on `eve/evals/scores`. Pick the cheapest one that captures what "correct" means here. The deterministic scorers run instantly for free; an LLM judge runs once per case and burns tokens, so save it for when nothing simpler will do.
11
+
12
+ | Need | Use |
13
+ | ---------------------------------------------------------- | --------------------------------------------------------------------------------------------- |
14
+ | Grade agent behavior (run succeeded, used the right tools) | `Run.didNotFail()`, `Run.usedTool(name, opts?)`, `Run.usedNoTools()`, `Run.maxToolCalls(max)` |
15
+ | Exact or substring string match                            | `Text.exact()`, `Text.includes()`                                                             |
16
+ | Fuzzy text match (typos, whitespace) | `Text.levenshtein()` |
17
+ | Exact JSON match | `Json.deepEqual()` |
18
+ | Exact SQL match (after normalization) | `Sql.exactNormalized()` |
19
+ | LLM-judged factual correctness vs an expected answer | `Autoevals.factuality()` |
20
+ | LLM-judged summary quality | `Autoevals.summary()` |
21
+ | LLM-judged SQL semantic equivalence | `Autoevals.sql()` |
22
+ | LLM-judged free-form criteria (no `expected` to match) | `Autoevals.closedQA({ criteria: "..." })` |
23
+
24
+ Each scorer gets the flattened `input`, `output`, and `expected` strings along with the full case and task result — including derived facts: typed tool calls (name, input, output, error state), subagent calls, HITL input requests, and whether the run parked. `Run.usedTool` accepts the same matcher options as `Checks.toolCalled`. Return `null` from a scorer to skip a case.
25
+
26
+ ## Thresholds
27
+
28
+ By default a scorer has to return a perfect score of 1 to pass. `thresholds` loosens that, mapping each scorer name to the minimum score you'll accept:
29
+
30
+ ```ts
31
+ import { defineEval } from "eve/evals";
32
+ import { Run, Text } from "eve/evals/scores";
33
+
34
+ export default defineEval({
35
+ cases: [{ id: "hello", input: "Hello", expected: "Hello" }],
36
+ scores: [Run.didNotFail(), Text.includes()],
37
+ thresholds: { "run.didNotFail": 1, "text.includes": 0.5 },
38
+ });
39
+ ```
40
+
41
+ A case below a threshold gets the `scored` verdict — reported, but only fatal under `eve eval --strict`.
42
+
43
+ ## The scorer model
44
+
45
+ `model` is only required when a model-backed scorer (one of the `Autoevals` wrappers) is present without its own per-scorer model override — and it's the scorer model, not the agent's. Eve only uses it for model-backed scoring, never to swap out the agent under test. Pass a string id (e.g. `"anthropic/claude-opus-4.8"`) to route through the Vercel AI Gateway, or hand it an AI SDK model instance to use that directly.
46
+
47
+ For provider-specific scorer-model settings, use `modelOptions.providerOptions`. Individual Autoevals scorers can also take their own `model` / `modelOptions`, which win over the eval default.
48
+
49
+ ## Concurrency and timeouts
50
+
51
+ `maxConcurrency` caps parallelism and `timeoutMs` bounds each case. Leave `maxConcurrency` unset and `eve eval` runs up to 8 cases per eval at once. Lower `maxConcurrency` when cases contend for a shared resource — a rate-limited connection, or a sandbox-heavy fixture.
52
+
53
+ ## What to read next
54
+
55
+ - [Checks](./checks): hard assertions that fail the build
56
+ - [Reporters](./reporters): ship scores to Braintrust experiments