npm - eve - Versions diffs - 0.6.0-beta.9 → 0.7.2 - Mend

eve 0.6.0-beta.9 → 0.7.2

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (650) hide show

package/dist/src/evals/types.d.ts CHANGED Viewed

@@ -1,39 +1,17 @@
 import type { LanguageModel } from "ai";
+import type { StandardSchemaV1 } from "#compiled/@standard-schema/spec/index.js";
 import type { HandleMessageStreamEvent, RuntimeIdentity } from "#protocol/message.js";
-import type { InputRequest } from "#runtime/input/types.js";
+import type { SendTurnInput, SessionState } from "#client/types.js";
+import type { InputRequest, InputResponse } from "#runtime/input/types.js";
 import type { JsonObject } from "#shared/json.js";
 import type { AgentModelOptionsDefinition } from "#shared/agent-definition.js";
 import type { EvalReporter } from "#evals/runner/reporters/types.js";
+import type { EveEvalSubagentCallMatchOptions, EveEvalToolCallMatchOptions } from "#evals/match.js";
 /**
- * One normalized eval case. Suites produce these from raw data sources.
+ * Assumptions an eval needs the runner to verify against the live target
+ * or eval process environment before executing it.
  */
-export interface EveEvalCase {
-    /** Uniquely identifies the case within its suite. */
-    readonly id: string;
-    /**
-     * The case prompt, or a structured record the task derives messages from.
-     * A string is sent verbatim and a record is `JSON.stringify`d, unless the
-     * task's `messages`/`prompt` reads specific fields.
-     */
-    readonly input: string | Record<string, unknown>;
-    /**
-     * Reference value scorers compare against. The runner coerces this to a
-     * string for autoevals-compatible scorers; `args.case.expected` exposes it
-     * unmodified.
-     */
-    readonly expected?: unknown;
-    /**
-     * Hard assertions for this case, appended to the suite-level `checks`.
-     * Any failure marks the case failed and flips the CLI exit code.
-     */
-    readonly checks?: readonly EveEvalCheck[];
-    /** Additional scorers for this case, appended to the suite-level `scores`. */
-    readonly scores?: readonly EveEvalScorer[];
-    /** Used by `--tag` filtering, and passed through for reporting. */
-    readonly tags?: readonly string[];
-    /** Passed through for reporting and Braintrust span logging. */
-    readonly metadata?: Readonly<Record<string, unknown>>;
-}
+export type EveEvalRequirement = "mockModels" | "devRoutes" | `env:${string}`;
 /**
  * One tool call extracted from the captured stream, pairing the
  * `actions.requested` request with its matching `action.result`.
@@ -87,12 +65,23 @@ export interface EveEvalDerivedFacts {
     readonly failureCode?: string;
 }
 /**
- * Full result of executing one eval case against an Eve agent.
+ * Captured event stream and facts for one session involved in an eval.
+ */
+export interface EveEvalSessionResult {
+    readonly derived: EveEvalDerivedFacts;
+    readonly events: readonly HandleMessageStreamEvent[];
+    readonly primary: boolean;
+    readonly sessionId?: string;
+    readonly state: SessionState;
+}
+/**
+ * Full result of executing one eval against an Eve agent.
  */
 export interface EveEvalTaskResult {
     /**
-     * The scored value. Defaults to `finalMessage` unless `task.parseOutput`
-     * overrides it. Mutable because the runner assigns it after parsing.
+     * The agent's last assistant message (same as {@link finalMessage}), retained
+     * for reporters and artifacts that log a single "output" value. Mutable
+     * because the runner assigns it after the run completes.
      */
     output: unknown;
     /** The agent's last assistant message, or null when none was produced. */
@@ -106,8 +95,12 @@ export interface EveEvalTaskResult {
     readonly status: "completed" | "failed" | "waiting";
     /** The captured stream events from the run. */
     readonly events: readonly HandleMessageStreamEvent[];
+    /** Lines written through `t.log` while the eval ran. */
+    readonly logs?: readonly string[];
     /** Facts extracted from the stream (tool calls, message counts, etc.). */
     readonly derived: EveEvalDerivedFacts;
+    /** Per-session event streams captured while executing this eval. */
+    readonly sessions?: readonly EveEvalSessionResult[];
     /**
      * Runtime identity metadata captured from the `session.started` stream event.
      * Present when the Eve server populates the event with its runtime metadata.
@@ -115,116 +108,175 @@ export interface EveEvalTaskResult {
     readonly runtimeIdentity?: RuntimeIdentity;
 }
 /**
- * Result returned by a single scorer invocation.
+ * How a failing assertion affects the verdict. A `"gate"` is a hard
+ * assertion: missing it fails the eval. A `"soft"` assertion is tracked
+ * data that only fails the eval under `eve eval --strict` (and only when it
+ * carries a threshold).
  */
-export interface EveEvalScorerResult {
-    /** Scorer name. Used as the key in Braintrust score maps. */
-    readonly name: string;
-    /** Score between 0 and 1, or null if the scorer could not produce a score. */
-    readonly score: number | null;
-    /** Optional metadata for debugging or Braintrust span logging. */
-    readonly metadata?: Readonly<Record<string, unknown>>;
-}
+export type AssertionSeverity = "gate" | "soft";
 /**
- * Arguments passed to every scorer invocation.
+ * A value-level assertion produced by the builders in `eve/evals/expect`
+ * (e.g. `includes`, `equals`, `similarity`) and applied to an explicit value
+ * via `t.check(value, assertion)`. Boolean assertions score exactly 0 or 1.
  *
- * The flat `input`, `output`, and `expected` fields are coerced to `string`
- * for compatibility with Braintrust autoevals scorers (e.g. `Factuality`,
- * `Levenshtein`). For the original values, use `case.input`, `case.expected`,
- * and `result.output`.
+ * The chainable `gate`/`soft`/`atLeast` return a new assertion with the
+ * severity or threshold overridden, so the threshold rides on the assertion
+ * itself rather than a detached map.
  */
-export interface EveEvalScorerArgs {
-    /** The eval case input, coerced to string. Autoevals-compatible. */
-    readonly input: string;
-    /** The task output, coerced to string. Autoevals-compatible. */
-    readonly output: string;
-    /** The expected value, coerced to string. Autoevals-compatible. */
-    readonly expected?: string | undefined;
-    /**
-     * Suite-level scorer model, when the suite provides one. This does not
-     * change the target Eve agent model. `undefined` when the suite omits
-     * `model`; model-backed scorers throw a descriptive error in that case.
-     */
-    readonly model: LanguageModel | undefined;
-    /** Suite-level provider options for model-backed scorers. */
-    readonly modelOptions?: AgentModelOptionsDefinition;
-    /** Full eval case. */
-    readonly case: EveEvalCase;
-    /** Full task result with events and derived facts. */
-    readonly result: EveEvalTaskResult;
+export interface Assertion {
+    readonly name: string;
+    readonly severity: AssertionSeverity;
+    /** Minimum passing score. `undefined` on a soft assertion = tracked only. */
+    readonly threshold?: number;
+    score(value: unknown): number | Promise<number>;
+    gate(threshold?: number): Assertion;
+    soft(threshold?: number): Assertion;
+    atLeast(threshold: number): Assertion;
 }
 /**
- * Receives flattened input/output/expected fields (autoevals-compatible) plus
- * the full Eve case and result. Return `null` to skip scoring a case (e.g. when
- * expected is absent).
+ * Handle to a recorded assertion, returned by every `t` assertion method.
+ * Chain `gate`/`soft`/`atLeast` to override the recorded severity or
+ * threshold, and `await` it to surface model-backed (judge) errors and ensure
+ * the assertion has resolved before the run continues.
  */
-export type EveEvalScorer = (args: EveEvalScorerArgs) => EveEvalScorerResult | Promise<EveEvalScorerResult | null> | null;
+export interface AssertionHandle extends PromiseLike<void> {
+    gate(threshold?: number): this;
+    soft(threshold?: number): this;
+    atLeast(threshold: number): this;
+}
 /**
- * Result returned by a single check invocation.
- *
- * Unlike scores, checks are hard assertions: any `passed: false` marks the
- * case failed and produces a non-zero `eve eval` exit code.
+ * The recorded outcome of one assertion, consumed by the verdict, reporters,
+ * and artifacts. A boolean assertion has `score` 0 or 1.
  */
-export interface EveEvalCheckResult {
+export interface AssertionResult {
     readonly name: string;
+    readonly score: number;
+    readonly severity: AssertionSeverity;
+    readonly threshold?: number;
     readonly passed: boolean;
     /** Human-readable failure detail, shown in console output and artifacts. */
     readonly message?: string;
     readonly metadata?: Readonly<Record<string, unknown>>;
 }
 /**
- * Arguments passed to every check invocation. Checks see the same result data
- * as scorers but never receive a judge model — they are deterministic
- * assertions over the captured run.
+ * Driver for one session, exposed on the eval context and by `t.newSession()`.
  */
-export interface EveEvalCheckArgs {
-    readonly case: EveEvalCase;
-    readonly result: EveEvalTaskResult;
-    /** The target under test, so checks can reference runner-assigned values like its URL. */
-    readonly target: EveEvalTarget;
+export interface EveEvalSession {
+    /** All events observed on this session so far. */
+    readonly events: readonly HandleMessageStreamEvent[];
+    /** Input requests left pending by the last parked turn. */
+    readonly pendingInputRequests: readonly InputRequest[];
+    /** Serializable cursor for resuming this session. */
+    readonly state: SessionState;
+    /** Eve session id after the first successful send. */
+    readonly sessionId: string | undefined;
+    /** Assert the last turn parked on HITL input and return matching requests. */
+    expectInputRequests(filter?: {
+        readonly display?: InputRequest["display"];
+        readonly toolName?: string;
+    }): readonly InputRequest[];
+    /** Resolve specific pending requests and run the resumed turn. */
+    respond(...responses: InputResponse[]): Promise<EveEvalTurn>;
+    /** Resolve every pending request with the same option id. */
+    respondAll(optionId: string): Promise<EveEvalTurn>;
+    /** Send one turn through this session. */
+    send(input: SendTurnInput): Promise<EveEvalTurn>;
+    /** Send one text turn with a local file attached as a data URL. */
+    sendFile(text: string, filePath: string, mediaType?: string): Promise<EveEvalTurn>;
 }
 /**
- * One hard assertion over a completed eval case. Built-ins live in
- * `eve/evals/checks`; custom checks are plain functions with this shape.
+ * One completed eval-driver turn.
  */
-export type EveEvalCheck = (args: EveEvalCheckArgs) => EveEvalCheckResult | Promise<EveEvalCheckResult>;
-export interface EveEvalTaskFields {
-    readonly messages?: (testCase: EveEvalCase) => string[];
-    /**
-     * Transform the raw task result into the scored output value. Available on
-     * every task variant. When omitted, `result.output` defaults to
-     * `result.finalMessage`.
-     */
-    readonly parseOutput?: (result: EveEvalTaskResult) => unknown;
-    readonly prompt?: (testCase: EveEvalCase) => string;
+export interface EveEvalTurn {
+    readonly data: unknown;
+    readonly events: readonly HandleMessageStreamEvent[];
+    readonly inputRequests: readonly InputRequest[];
+    readonly message: string | undefined;
+    readonly status: "completed" | "failed" | "waiting";
+    readonly toolCalls: readonly EveEvalToolCall[];
+    expectOk(): this;
 }
 /**
- * Declarative task configuration for a suite. The runner owns session
- * lifecycle, stream capture, and derived metadata; suites only declare how to
- * derive messages and parse outputs.
+ * The judge model used by `t.judge.*` assertions, configured per-eval or as
+ * the run-wide default in `evals.config.ts`. Only ever used for scoring; it
+ * never changes the agent under test. String model ids route through the
+ * Vercel AI Gateway; provider model instances run directly.
  */
-export type EveEvalTask = (EveEvalTaskFields & {
-    /**
-     * Derive an ordered list of messages for a multi-turn eval.
-     * Mutually exclusive with `prompt`.
-     */
-    readonly messages: (testCase: EveEvalCase) => string[];
-    readonly prompt?: never;
-}) | (EveEvalTaskFields & {
-    readonly messages?: never;
-    /**
-     * Derive a single prompt string from one eval case.
-     * Mutually exclusive with `messages`.
-     */
-    readonly prompt: (testCase: EveEvalCase) => string;
-}) | {
-    readonly messages?: never;
-    /** See {@link EveEvalTaskFields.parseOutput}. */
-    readonly parseOutput?: (result: EveEvalTaskResult) => unknown;
-    readonly prompt?: never;
-};
+export interface EveEvalJudgeConfig {
+    readonly model: LanguageModel;
+    readonly modelOptions?: AgentModelOptionsDefinition;
+}
+/**
+ * Per-call options for `t.judge.autoevals.*` assertions.
+ */
+export interface JudgeOpts {
+    /** Value to grade. Defaults to the final assistant message (`t.reply`). */
+    readonly on?: unknown;
+    /** Judge model for this call only; overrides the eval/config judge model. */
+    readonly model?: LanguageModel;
+    readonly modelOptions?: AgentModelOptionsDefinition;
+}
+/**
+ * Braintrust autoevals graders, bound to the resolved judge model. The grader
+ * family is named so its semantics are explicit: `factuality`'s consistency
+ * buckets and `closedQA`'s yes/no grading are autoevals' behavior, not Eve's.
+ * These are Eve-owned wrappers, not the raw library.
+ */
+export interface AutoevalsJudges {
+    factuality(expected: string, opts?: JudgeOpts): AssertionHandle;
+    summarizes(expected: string, opts?: JudgeOpts): AssertionHandle;
+    closedQA(criteria: string, opts?: JudgeOpts): AssertionHandle;
+    sql(expected: string, opts?: JudgeOpts): AssertionHandle;
+}
+/**
+ * Model-backed assertion namespaces on `t.judge`. A future non-autoevals
+ * engine would slot in as a sibling of `autoevals`.
+ */
+export interface JudgeContext {
+    readonly autoevals: AutoevalsJudges;
+}
 /**
- * Describes the Eve server an eval suite runs against.
+ * The single context passed to an eval's `test(t)` function. It drives the
+ * primary session (it extends {@link EveEvalSession}), carries the run-level
+ * and value-level assertion vocabulary, and exposes `judge` for LLM-as-judge.
+ *
+ * Run-level assertions (`completed`, `calledTool`, …) record an entry
+ * evaluated against the final run and never throw; `check` and `judge`
+ * evaluate the supplied value immediately. Use plain `throw` /
+ * `turn.expectOk()` for bespoke preconditions that should abort the run.
+ */
+export interface EveEvalContext extends EveEvalSession {
+    /** Eval timeout signal. */
+    readonly signal: AbortSignal;
+    /** Current target under test. */
+    readonly target: EveEvalTargetHandle;
+    /** The primary session's last assistant message, or null. */
+    readonly reply: string | null;
+    /** Structured eval log hook. */
+    log(message: string): void;
+    /** Create an additional independent session against the same target. */
+    newSession(): EveEvalSession;
+    completed(): AssertionHandle;
+    didNotFail(): AssertionHandle;
+    waiting(): AssertionHandle;
+    messageIncludes(token: string | RegExp): AssertionHandle;
+    calledTool(name: string, options?: EveEvalToolCallMatchOptions): AssertionHandle;
+    notCalledTool(name: string): AssertionHandle;
+    toolOrder(names: readonly string[]): AssertionHandle;
+    usedNoTools(): AssertionHandle;
+    maxToolCalls(max: number): AssertionHandle;
+    calledSubagent(name: string, options?: EveEvalSubagentCallMatchOptions): AssertionHandle;
+    noFailedActions(): AssertionHandle;
+    event(predicate: (events: readonly HandleMessageStreamEvent[]) => boolean, label: string): AssertionHandle;
+    outputEquals(value: unknown): AssertionHandle;
+    outputMatches(schema: StandardSchemaV1): AssertionHandle;
+    /** Apply a value-level assertion (from `eve/evals/expect`) to a value. */
+    check(value: unknown, assertion: Assertion): AssertionHandle;
+    /** LLM-as-judge assertions, bound to the resolved judge model. */
+    readonly judge: JudgeContext;
+}
+/**
+ * Describes the Eve server an eval runs against.
  */
 export interface EveEvalTarget {
     /**
@@ -234,152 +286,175 @@ export interface EveEvalTarget {
     readonly kind: "local" | "remote";
     /** Base HTTP URL the eval client connects to and sends message requests. */
     readonly url: string;
+    /** Capabilities discovered from the live target's info route. */
+    readonly capabilities: EveEvalTargetCapabilities;
+}
+export interface EveEvalTargetCapabilities {
+    readonly devRoutes: boolean;
+    readonly mockModels: boolean;
+}
+export interface EveEvalScheduleDispatchResult {
+    readonly scheduleId: string;
+    readonly sessionIds: readonly string[];
+}
+/**
+ * Live target handle exposed to eval runs.
+ */
+export interface EveEvalTargetHandle extends EveEvalTarget {
+    /** Dispatch a dev-only authored schedule. Requires declaring `"devRoutes"`. */
+    dispatchSchedule(scheduleId: string): Promise<EveEvalScheduleDispatchResult>;
+    /** Authenticated fetch against the target base URL. */
+    fetch(path: string, init?: RequestInit): Promise<Response>;
+    /** Attach to a pre-existing session and consume one turn boundary. */
+    attachSession(sessionId: string, opts?: {
+        readonly startIndex?: number;
+    }): Promise<EveEvalSession>;
 }
 /**
- * Shared fields between the user-facing input and the validated suite.
+ * Shared fields between the user-facing input and the validated eval.
  *
- * Suite identity (`id`) is derived from the `evals/<path>.eval.ts` file
+ * Eval identity (`id`) is derived from the `evals/<path>.eval.ts` file
  * path by the discovery layer; it is not authored on the input.
  */
-interface EveEvalSuiteBase {
+interface EveEvalBase {
     readonly description?: string;
-    readonly task?: EveEvalTask;
-    /**
-     * Hard assertions applied to every case in the suite. Case-level `checks`
-     * append to these. Any failed check marks the case failed and produces a
-     * non-zero `eve eval` exit code, unlike scores which stay soft data.
-     */
-    readonly checks?: readonly EveEvalCheck[];
-    readonly scores: readonly EveEvalScorer[];
     /**
-     * Model used by model-backed scorers in this suite. Required only when a
-     * model-backed scorer (e.g. the `Autoevals` wrappers) is present without
-     * its own per-scorer model override.
-     *
-     * String model IDs route through the Vercel AI Gateway; the runner uses
-     * provider model instances directly. This model is only for scoring and
-     * never changes the Eve agent under test.
+     * Target/process assumptions verified before execution. The eval is
+     * skipped when any requirement is unmet.
      */
-    readonly model?: LanguageModel;
-    /**
-     * Provider-specific options passed to model-backed scorers.
-     */
-    readonly modelOptions?: AgentModelOptionsDefinition;
+    readonly requires?: readonly EveEvalRequirement[];
     /**
-     * Maximum number of cases the runner executes at once.
-     * Defaults to 8 when omitted.
+     * Judge model for this eval's `t.judge.*` assertions. Optional: when
+     * omitted, judge assertions fall back to the `judge` declared in
+     * `evals.config.ts`. Only used for scoring; never changes the agent
+     * under test.
      */
-    readonly maxConcurrency?: number;
+    readonly judge?: EveEvalJudgeConfig;
     readonly timeoutMs?: number;
-    /** Used by `--tag` filtering: a suite carrying a requested tag runs all its cases. */
+    /** Used by `--tag` filtering. */
     readonly tags?: readonly string[];
     readonly metadata?: Readonly<Record<string, unknown>>;
     readonly reporters?: readonly EvalReporter[];
-    /**
-     * Minimum score thresholds per scorer name. A case "passes" when every
-     * scorer meets or exceeds its threshold. Scorers not listed here
-     * default to a threshold of 1.0 (exact match).
-     *
-     * @example
-     * ```ts
-     * thresholds: {
-     *   "Factuality": 0.5,
-     *   "run.didNotFail": 1.0,
-     * }
-     * ```
-     */
-    readonly thresholds?: Readonly<Record<string, number>>;
 }
 /**
- * Complete top-level key set accepted by {@link defineEvalSuite}, used to reject
- * unknown authored keys. The stricter {@link EveEvalSuiteInput} union enforces
- * `load`/`cases` exclusivity.
+ * Complete top-level key set accepted by {@link defineEval}, used to reject
+ * unknown authored keys.
  */
-export interface EveEvalSuiteInputFields extends Omit<EveEvalSuiteBase, "task"> {
-    readonly cases?: readonly EveEvalCase[];
-    readonly task?: EveEvalTaskFields;
-    load?(): Promise<EveEvalCase[]>;
+export interface EveEvalInputFields extends EveEvalBase {
+    readonly test?: (t: EveEvalContext) => void | Promise<void>;
 }
 /**
- * Full suite input passed to `defineEvalSuite()`.
+ * Full eval input passed to `defineEval()`.
  *
- * Provide either `load` (async function) or `cases` (static array), not both;
- * `cases` causes the runner to synthesize a `load` internally. Suite identity is
- * derived from the file path, so authors do not specify an `id` or `name`.
+ * Each eval file is exactly one case: an imperative `test(t)` function that
+ * drives the agent and asserts on what it produced. Eval identity is derived
+ * from the file path, so authors do not specify an `id` or `name`.
  */
-export type EveEvalSuiteInput = (EveEvalSuiteBase & {
-    readonly cases?: never;
-    /** Load cases dynamically. Mutually exclusive with `cases`. */
-    load(): Promise<EveEvalCase[]>;
-}) | (EveEvalSuiteBase & {
-    /** Static inline cases. Mutually exclusive with `load`. */
-    readonly cases: readonly EveEvalCase[];
-    load?: never;
-});
+export interface EveEvalInput extends EveEvalBase {
+    /** Imperative interaction-and-assertion script. */
+    test(t: EveEvalContext): void | Promise<void>;
+}
 /**
- * Suite returned by `defineEvalSuite()`. Carries no `id` yet: discovery stamps
- * the path-derived id at import time to produce a full {@link EveEvalSuite}. The
- * `_tag` literal (`"EveEvalSuite"`) brands the value so discovery and the runner
- * can recognize a defined suite.
+ * Eval returned by `defineEval()`. Carries no `id` yet: discovery stamps
+ * the path-derived id at import time to produce a full {@link EveEval}. The
+ * `_tag` literal (`"EveEval"`) brands the value so discovery and the runner
+ * can recognize a defined eval.
  */
-export interface EveEvalSuiteDefinition extends EveEvalSuiteBase {
-    readonly _tag: "EveEvalSuite";
-    /** Always present. Synthesized from `cases` when the input uses static cases. */
-    load(): Promise<EveEvalCase[]>;
-}
+export type EveEvalDefinition = EveEvalInput & {
+    readonly _tag: "EveEval";
+};
 /**
- * Validated suite consumed by the runner and reporters. The `id` is the
- * path-derived slug attached by discovery
- * (e.g. `evals/weather.eval.ts` → `"weather"`).
+ * Validated eval consumed by the runner and reporters. The `id` is the
+ * path-derived slug attached by discovery (e.g. `evals/weather.eval.ts` →
+ * `"weather"`, `evals/runtime/multi-turn.eval.ts` → `"runtime/multi-turn"`).
+ * Files that default-export an array of evals derive
+ * `<file-id>/<zero-padded index>` ids (e.g. `"weather/0000"`).
  */
-export interface EveEvalSuite extends EveEvalSuiteDefinition {
+export type EveEval = EveEvalDefinition & {
     readonly id: string;
-}
+};
 /**
- * Per-case outcome computed by the runner:
+ * Per-eval outcome computed by the runner:
  *
- * - `"passed"`  — no execution error, every check passed, every score met its threshold
- * - `"failed"`  — a check failed or execution errored (timeout, transport, thrown task)
- * - `"scored"`  — passed checks but at least one score fell below its threshold
- * - `"skipped"` — the case was not executed (reserved for unmet `requires` entries)
+ * - `"passed"`  — no execution error, every gate held, every soft threshold met
+ * - `"failed"`  — a gate assertion failed or execution errored (timeout, transport, thrown task)
+ * - `"scored"`  — every gate held but a soft assertion fell below its threshold
+ * - `"skipped"` — the eval was not executed (unmet `requires` entries)
  */
-export type EveEvalCaseVerdict = "passed" | "failed" | "scored" | "skipped";
+export type EveEvalVerdict = "passed" | "failed" | "scored" | "skipped";
 /**
- * Result of scoring one eval case.
+ * Result of executing and asserting one eval.
+ *
+ * `id` is the path-derived eval id
+ * (e.g. `evals/weather.eval.ts` → `"weather"`).
  */
-export interface EveEvalCaseResult {
-    readonly case: EveEvalCase;
+export interface EveEvalResult {
+    readonly id: string;
     readonly result: EveEvalTaskResult;
-    /** Hard-assertion results (suite-level checks first, then case-level). */
-    readonly checks: readonly EveEvalCheckResult[];
-    readonly scores: readonly EveEvalScorerResult[];
-    /** Per-case verdict; see {@link EveEvalCaseVerdict}. */
-    readonly verdict: EveEvalCaseVerdict;
+    /** Every assertion recorded by the eval's `test(t)`, in record order. */
+    readonly assertions: readonly AssertionResult[];
+    /** Per-eval verdict; see {@link EveEvalVerdict}. */
+    readonly verdict: EveEvalVerdict;
     readonly error?: string;
-    /** Why the case was skipped, when `verdict` is `"skipped"`. */
+    /** Why the eval was skipped, when `verdict` is `"skipped"`. */
     readonly skipReason?: string;
+    readonly startedAt: string;
+    readonly completedAt: string;
 }
 /**
- * Aggregated result for one suite run.
- *
- * `suite` is the path-derived suite id
- * (e.g. `evals/weather.eval.ts` → `"weather"`).
+ * Aggregated outcome of one `eve eval` run across every executed eval.
  */
-export interface EveEvalSuiteResult {
-    readonly suite: string;
+export interface EveEvalRunSummary {
     readonly target: EveEvalTarget;
-    readonly cases: readonly EveEvalCaseResult[];
+    readonly results: readonly EveEvalResult[];
     readonly startedAt: string;
     readonly completedAt: string;
-    /** Cases with verdict `"passed"`. */
+    /** Evals with verdict `"passed"`. */
     readonly passed: number;
-    /** Cases with verdict `"failed"` (check failures and execution errors). */
+    /** Evals with verdict `"failed"` (gate failures and execution errors). */
     readonly failed: number;
-    /** Cases with verdict `"scored"` (below-threshold scores only). */
+    /** Evals with verdict `"scored"` (below-threshold soft assertions only). */
     readonly scored: number;
-    /** Cases with verdict `"skipped"`. */
+    /** Evals with verdict `"skipped"`. */
     readonly skipped: number;
     /** The execution-error subset of `failed` (timeouts, connection failures, exceptions). */
     readonly errored: number;
 }
+/**
+ * Run-wide eval configuration authored in `evals.config.ts`.
+ *
+ * Exactly one `evals.config.ts` is required at the root of the `evals/`
+ * directory; it supplies the defaults every eval in the run shares.
+ */
+export interface EveEvalConfigInput {
+    /**
+     * Default judge model for `t.judge.*` assertions across every eval.
+     * Optional: evals that use no judge need not set it, and individual evals
+     * may override it with their own `judge`. Only ever used for scoring.
+     */
+    readonly judge?: EveEvalJudgeConfig;
+    /**
+     * Reporters that observe every eval in the run (e.g. a shared
+     * `Braintrust()` experiment). Suppressed by `eve eval --skip-report`.
+     */
+    readonly reporters?: readonly EvalReporter[];
+    /**
+     * Default maximum number of evals executing at once. Must be a positive
+     * integer. `eve eval --max-concurrency` overrides it; defaults to 8 when
+     * neither is set.
+     */
+    readonly maxConcurrency?: number;
+    /**
+     * Default per-eval timeout in milliseconds. An eval's own `timeoutMs`
+     * overrides it, and `eve eval --timeout` overrides both.
+     */
+    readonly timeoutMs?: number;
+}
+/**
+ * Validated eval run configuration returned by `defineEvalConfig()`. The
+ * `_tag` literal brands the value so discovery can recognize it.
+ */
+export type EveEvalConfig = EveEvalConfigInput & {
+    readonly _tag: "EveEvalConfig";
+};
 export {};

package/dist/src/execution/compaction.d.ts ADDED Viewed

@@ -0,0 +1,14 @@
+import type { ModelMessage } from "ai";
+/**
+ * Re-applies framework-owned state preservation after the harness compacts
+ * message history, returning any messages to append to the compacted history.
+ *
+ * Runs the framework's built-in preservation steps:
+ * - resets read-before-write tracking, so a write after compaction re-reads
+ *   the file whose read evidence was summarized away;
+ * - re-injects the todo list (when present), so the model keeps its task list.
+ *
+ * Must be called inside the harness step's `AlsContext`; both steps read
+ * durable context state.
+ */
+export declare function preserveFrameworkStateOnCompaction(): readonly ModelMessage[];

package/dist/src/execution/compaction.js ADDED Viewed

@@ -0,0 +1 @@
+import{clearReadFileState}from"#runtime/framework-tools/file-state.js";import{getTodoCompactionMessage}from"#runtime/framework-tools/todo.js";function preserveFrameworkStateOnCompaction(){clearReadFileState();let e=getTodoCompactionMessage();return e===void 0?[]:[e]}export{preserveFrameworkStateOnCompaction};