eve 0.6.0-beta.8 → 0.6.0-beta.9

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/CHANGELOG.md +12 -0
  2. package/dist/docs/evals-v2-plan.md +939 -0
  3. package/dist/docs/public/advanced/evals.md +61 -24
  4. package/dist/docs/public/agent-config.md +5 -2
  5. package/dist/docs/public/reference/cli.md +16 -10
  6. package/dist/docs/public/sandbox.mdx +3 -1
  7. package/dist/src/cli/run.d.ts +5 -3
  8. package/dist/src/cli/run.js +1 -1
  9. package/dist/src/compiled/.vendor-stamp.json +1 -1
  10. package/dist/src/compiled/just-bash/index.d.ts +23 -0
  11. package/dist/src/compiler/normalize-agent-config.d.ts +0 -10
  12. package/dist/src/compiler/normalize-agent-config.js +1 -1
  13. package/dist/src/evals/checks/checks.d.ts +66 -0
  14. package/dist/src/evals/checks/checks.js +2 -0
  15. package/dist/src/evals/checks/index.d.ts +21 -0
  16. package/dist/src/evals/checks/index.js +1 -0
  17. package/dist/src/evals/checks/match.d.ts +68 -0
  18. package/dist/src/evals/checks/match.js +1 -0
  19. package/dist/src/evals/cli/eval.d.ts +9 -3
  20. package/dist/src/evals/cli/eval.js +1 -1
  21. package/dist/src/evals/define-eval-suite.d.ts +10 -8
  22. package/dist/src/evals/define-eval-suite.js +1 -1
  23. package/dist/src/evals/index.d.ts +2 -1
  24. package/dist/src/evals/runner/artifacts.js +1 -1
  25. package/dist/src/evals/runner/derive-run-facts.d.ts +17 -3
  26. package/dist/src/evals/runner/derive-run-facts.js +1 -1
  27. package/dist/src/evals/runner/execute-case.js +1 -1
  28. package/dist/src/evals/runner/execute-suite.d.ts +2 -2
  29. package/dist/src/evals/runner/execute-suite.js +1 -1
  30. package/dist/src/evals/runner/reporters/braintrust.js +1 -1
  31. package/dist/src/evals/runner/reporters/console.js +1 -1
  32. package/dist/src/evals/runner/verdict.d.ts +18 -0
  33. package/dist/src/evals/runner/verdict.js +1 -0
  34. package/dist/src/evals/scorers/autoevals.js +1 -1
  35. package/dist/src/evals/scorers/model-marker.d.ts +12 -0
  36. package/dist/src/evals/scorers/model-marker.js +1 -0
  37. package/dist/src/evals/scorers/run.d.ts +9 -3
  38. package/dist/src/evals/scorers/run.js +1 -1
  39. package/dist/src/evals/types.d.ts +121 -8
  40. package/dist/src/execution/sandbox/bindings/local.d.ts +11 -0
  41. package/dist/src/execution/sandbox/bindings/local.js +1 -1
  42. package/dist/src/internal/application/package.js +1 -1
  43. package/dist/src/internal/nitro/dev-runtime-artifacts.d.ts +11 -0
  44. package/dist/src/internal/nitro/dev-runtime-artifacts.js +1 -1
  45. package/dist/src/internal/nitro/host/start-development-server.js +1 -1
  46. package/dist/src/public/sandbox/backends/local.d.ts +2 -3
  47. package/dist/src/services/inspect-application.js +1 -1
  48. package/dist/src/setup/scaffold/channels.js +1 -1
  49. package/dist/src/setup/scaffold/project.js +1 -1
  50. package/dist/src/shared/agent-definition.d.ts +4 -0
  51. package/dist/src/shared/sandbox-backend.d.ts +11 -11
  52. package/package.json +6 -1
package/CHANGELOG.md CHANGED
@@ -1,5 +1,17 @@
1
1
  # eve
2
2
 
3
+ ## 0.6.0-beta.9
4
+
5
+ ### Minor Changes
6
+
7
+ - 1cf3593: Evals gain hard assertions and CI-grade pass/fail. Suites and cases accept a `checks` array (built-ins on `eve/evals/checks`: `Checks.completed`, `Checks.didNotFail`, `Checks.toolCalled` with input/output matchers, `Checks.toolOrder`, `Checks.subagentCalled`, and more); any failed check fails the case and the `eve eval` exit code, while scores stay soft unless `--strict`. Derived facts are now typed records — `toolCalls`/`subagentCalls` carry inputs, outputs, and error state (breaking: previously `string[]`), plus new `inputRequests` and `parked` fields, and `Run.didNotFail` no longer passes runs parked on HITL input. The CLI takes positional suite ids (replacing `--suite`; `--all` is removed) and adds `--tag`, `--case`, `--strict`, and `--list`. Suite `model` is now optional and only required when a model-backed scorer needs it.
8
+ - a648895: Make `agent.ts` optional. Agents without an `agent.ts` now default to
9
+ `anthropic/claude-sonnet-4.6`; when `agent.ts` is present, `model` remains required.
10
+
11
+ ### Patch Changes
12
+
13
+ - 74cd2e5: Store local just-bash sandbox templates and sessions as normal on-disk directories instead of JSON filesystem snapshots, and prune stale `eve dev` runtime snapshots and local sandbox templates in the background.
14
+
3
15
  ## 0.6.0-beta.8
4
16
 
5
17
  ### Patch Changes