npm - vitest-evals - Versions diffs - 0.9.0-beta.5 → 0.9.0 - Mend

vitest-evals 0.9.0-beta.5 → 0.9.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (49)

package/README.md +13 -23
package/dist/harness.d.mts +329 -20
package/dist/harness.d.ts +329 -20
package/dist/harness.js.map +1 -1
package/dist/harness.mjs.map +1 -1
package/dist/index.d.mts +155 -12
package/dist/index.d.ts +155 -12
package/dist/index.js.map +1 -1
package/dist/index.mjs.map +1 -1
package/dist/internal/matchers.d.mts +41 -3
package/dist/internal/matchers.d.ts +41 -3
package/dist/internal/matchers.js.map +1 -1
package/dist/internal/matchers.mjs.map +1 -1
package/dist/internal/structuredOutputScorer.d.mts +4 -0
package/dist/internal/structuredOutputScorer.d.ts +4 -0
package/dist/internal/structuredOutputScorer.js.map +1 -1
package/dist/internal/structuredOutputScorer.mjs.map +1 -1
package/dist/internal/toolCallScorer.d.mts +6 -0
package/dist/internal/toolCallScorer.d.ts +6 -0
package/dist/internal/toolCallScorer.js.map +1 -1
package/dist/internal/toolCallScorer.mjs.map +1 -1
package/dist/judges/index.d.mts +2 -2
package/dist/judges/index.d.ts +2 -2
package/dist/judges/index.js.map +1 -1
package/dist/judges/index.mjs.map +1 -1
package/dist/judges/structuredOutputJudge.d.mts +54 -4
package/dist/judges/structuredOutputJudge.d.ts +54 -4
package/dist/judges/structuredOutputJudge.js.map +1 -1
package/dist/judges/structuredOutputJudge.mjs.map +1 -1
package/dist/judges/toolCallJudge.d.mts +56 -6
package/dist/judges/toolCallJudge.d.ts +56 -6
package/dist/judges/toolCallJudge.js.map +1 -1
package/dist/judges/toolCallJudge.mjs.map +1 -1
package/dist/judges/types.d.mts +68 -3
package/dist/judges/types.d.ts +68 -3
package/dist/judges/types.js.map +1 -1
package/dist/legacy/scorers/index.js.map +1 -1
package/dist/legacy/scorers/index.mjs.map +1 -1
package/dist/legacy/scorers/structuredOutputScorer.js.map +1 -1
package/dist/legacy/scorers/structuredOutputScorer.mjs.map +1 -1
package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
package/dist/legacy/scorers/utils.js.map +1 -1
package/dist/legacy/scorers/utils.mjs.map +1 -1
package/dist/legacy.js.map +1 -1
package/dist/legacy.mjs.map +1 -1
package/dist/reporter.js.map +1 -1
package/dist/reporter.mjs.map +1 -1
package/package.json +13 -1

package/README.md CHANGED Viewed

@@ -18,11 +18,9 @@ npm install -D @vitest-evals/harness-ai-sdk
 npm install -D @vitest-evals/harness-openai-agents
 ```
-For GitHub Actions summaries and annotations, install the JSON post-processor:
-```sh
-npm install -D @vitest-evals/github-reporter
-```
+For GitHub Actions summaries and annotations, emit Vitest JSON and use the
+native `getsentry/vitest-evals` action. No extra npm package is needed in the
+workflow.
 ## Core Model
@@ -155,17 +153,22 @@ Use Vitest JSON as the eval report artifact. It preserves the `meta` field that
 contains eval scores and normalized harness runs.
 ```sh
-vitest run evals \
+vitest run --config vitest.evals.config.ts \
   --reporter=vitest-evals/reporter \
   --reporter=json \
   --outputFile.json=vitest-results.json
+```
-vitest-evals-github-report
+```yaml
+- uses: getsentry/vitest-evals@v0
+  if: always()
+  with:
+    results: vitest-results.json
 ```
-The GitHub reporter writes a job summary when `GITHUB_STEP_SUMMARY` is present,
-emits short failure annotations in Actions, and can publish a separate Check Run
-with `--check-run` when `checks: write` permission is configured.
+The GitHub reporter action writes a job summary, emits short failure
+annotations, can publish a separate Check Run, and can reduce sharded eval JSON
+artifacts into one combined report.
 ## Existing Agents
@@ -386,16 +389,3 @@ When you only need deterministic contract checks, built-ins such as
 `StructuredOutputJudge()` and `ToolCallJudge()` are still available. The primary
 documentation examples intentionally use factuality/rubric judges because those
 match the product's LLM-as-a-judge direction.
-## Legacy Compatibility
-The root package is harness-first and judge-first. Legacy scorer-first suites
-and `evaluate(...)` live under `vitest-evals/legacy`.
-```ts
-import {
-  describeEval,
-  StructuredOutputScorer,
-  ToolCallScorer,
-} from "vitest-evals/legacy";
-```

package/dist/harness.d.mts CHANGED Viewed

@@ -4,52 +4,128 @@ type JsonPrimitive = string | number | boolean | null;
 type JsonValue = JsonPrimitive | JsonValue[] | {
     [key: string]: JsonValue;
 };
-/** Normalized record for one tool call observed during a harness run. */
+/**
+ * Normalized record for one tool call observed during a harness run.
+ *
+ * @example
+ * ```ts
+ * const call: ToolCallRecord = {
+ *   name: "lookupInvoice",
+ *   arguments: { invoiceId: "inv_123" },
+ *   result: { refundable: true },
+ * };
+ * ```
+ */
 type ToolCallRecord = {
+    /** Provider or runtime tool-call id when one is available. */
     id?: string;
+    /** Tool name as exposed to the agent or application runtime. */
     name: string;
+    /** JSON-safe tool arguments after provider/runtime normalization. */
     arguments?: Record<string, JsonValue>;
+    /** JSON-safe tool result returned by the application tool. */
     result?: JsonValue;
+    /** Normalized tool error when execution failed. */
     error?: {
         message: string;
         type?: string;
         [key: string]: JsonValue | undefined;
     };
+    /** ISO timestamp for the start of tool execution. */
     startedAt?: string;
+    /** ISO timestamp for the end of tool execution. */
     finishedAt?: string;
+    /** Tool execution duration in milliseconds. */
     durationMs?: number;
+    /** Extra JSON-safe tool metadata for reporters and custom judges. */
     metadata?: Record<string, JsonValue>;
 };
-/** Normalized message recorded in a harness session transcript. */
+/**
+ * Normalized message recorded in a harness session transcript.
+ *
+ * @example
+ * ```ts
+ * const message: NormalizedMessage = {
+ *   role: "assistant",
+ *   content: { status: "approved" },
+ *   toolCalls: [{ name: "lookupInvoice" }],
+ * };
+ * ```
+ */
 type NormalizedMessage = {
+    /** Transcript role for the normalized message. */
     role: "system" | "user" | "assistant" | "tool";
+    /** JSON-safe message content. */
     content?: JsonValue;
+    /** Tool calls associated with this message. */
     toolCalls?: ToolCallRecord[];
+    /** Extra JSON-safe message metadata. */
     metadata?: Record<string, JsonValue>;
 };
-/** Provider usage summary attached to a normalized harness run. */
+/**
+ * Provider usage summary attached to a normalized harness run.
+ *
+ * @example
+ * ```ts
+ * const usage: UsageSummary = {
+ *   provider: "openai",
+ *   model: "gpt-4o-mini",
+ *   inputTokens: 212,
+ *   outputTokens: 48,
+ *   totalTokens: 260,
+ * };
+ * ```
+ */
 type UsageSummary = {
+    /** Provider that served the application run. */
     provider?: string;
+    /** Model used for the application run. */
     model?: string;
+    /** Input, prompt, or request tokens consumed by the run. */
     inputTokens?: number;
+    /** Output or completion tokens produced by the run. */
     outputTokens?: number;
+    /** Reasoning tokens reported by providers that expose them. */
     reasoningTokens?: number;
+    /** Total token count reported by the provider or adapter. */
     totalTokens?: number;
-    estimatedCost?: number;
+    /** Count of tool calls observed during the run. */
     toolCalls?: number;
+    /** Retry count observed during the run. */
     retries?: number;
+    /** Provider-specific JSON-safe usage details. Cost estimates belong here. */
     metadata?: Record<string, JsonValue>;
 };
 /** Timing summary attached to a normalized harness run. */
 type TimingSummary = {
+    /** End-to-end run duration in milliseconds. */
     totalMs?: number;
+    /** Extra JSON-safe timing metadata. */
     metadata?: Record<string, JsonValue>;
 };
-/** JSON-serializable transcript produced by the system under test. */
+/**
+ * JSON-serializable transcript produced by the system under test.
+ *
+ * @example
+ * ```ts
+ * const session: NormalizedSession = {
+ *   provider: "openai",
+ *   model: "gpt-4o-mini",
+ *   messages: [
+ *     { role: "user", content: "Refund invoice inv_123" },
+ *     { role: "assistant", content: { status: "approved" } },
+ *   ],
+ * };
+ * ```
+ */
 type NormalizedSession = {
+    /** Ordered normalized transcript messages. */
     messages: NormalizedMessage[];
+    /** Provider that produced the session when known. */
     provider?: string;
+    /** Model that produced the session when known. */
     model?: string;
+    /** Extra JSON-safe session metadata. */
     metadata?: Record<string, JsonValue>;
 };
 type OutputField<TOutput extends JsonValue | undefined> = undefined extends TOutput ? {
@@ -57,64 +133,165 @@ type OutputField<TOutput extends JsonValue | undefined> = undefined extends TOut
 } : {
     output: TOutput;
 };
-/** Normalized result returned by every harness execution. */
+/**
+ * Normalized result returned by every harness execution.
+ *
+ * @example
+ * ```ts
+ * const run: HarnessRun<{ status: "approved" }> = {
+ *   output: { status: "approved" },
+ *   session: {
+ *     messages: [
+ *       { role: "user", content: "Refund invoice inv_123" },
+ *       { role: "assistant", content: { status: "approved" } },
+ *     ],
+ *   },
+ *   usage: { totalTokens: 260 },
+ *   errors: [],
+ * };
+ * ```
+ */
 type HarnessRun<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
+    /** Normalized transcript and provider/session metadata. */
     session: NormalizedSession;
+    /** Stable provider usage units such as tokens, tools, and retries. */
     usage: UsageSummary;
+    /** Optional timing summary for the run. */
     timings?: TimingSummary;
+    /** JSON-safe run artifacts captured by the harness or test context. */
     artifacts?: Record<string, JsonValue>;
+    /** Normalized errors captured during execution. */
     errors: Array<Record<string, JsonValue>>;
 };
 /** Error value with an attached partial or complete normalized harness run. */
 type HarnessRunError = Error & {
+    /** Attached normalized harness run recovered by `getHarnessRunFromError(...)`. */
     vitestEvalsRun: HarnessRun;
 };
 /** Per-run metadata shape accepted by harnesses and eval tests. */
 type HarnessMetadata = Record<string, unknown>;
-/** Runtime context passed from the eval fixture into a harness run. */
+/**
+ * Runtime context passed from the eval fixture into a harness run.
+ *
+ * @example
+ * ```ts
+ * const harness: Harness<string> = {
+ *   name: "refund-agent",
+ *   async run(input, context) {
+ *     context.setArtifact("inputLength", input.length);
+ *
+ *     return {
+ *       output: undefined,
+ *       session: { messages: [{ role: "user", content: input }] },
+ *       usage: {},
+ *       errors: [],
+ *     };
+ *   },
+ * };
+ * ```
+ */
 type HarnessContext<TMetadata extends HarnessMetadata = HarnessMetadata> = {
+    /** Per-run metadata passed through `run(input, { metadata })`. */
     metadata: Readonly<TMetadata>;
+    /** Abort signal from Vitest when available. */
     signal?: AbortSignal;
+    /** Mutable JSON-safe artifact bag shared with the harness. */
     artifacts: Record<string, JsonValue>;
+    /** Stores one JSON-safe artifact on the current run. */
     setArtifact: (name: string, value: JsonValue) => void;
 };
-/** Adapter that executes the system under test and returns a normalized run. */
+/**
+ * Adapter that executes the system under test and returns a normalized run.
+ *
+ * @example
+ * ```ts
+ * const harness: Harness<string, { status: "approved" | "denied" }> = {
+ *   name: "refund-agent",
+ *   async run(input, context) {
+ *     return normalizeHarnessRun(input, await runRefundFlow(input), context);
+ *   },
+ * };
+ * ```
+ */
 type Harness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
+    /** Stable harness name used in reports. */
     name: string;
+    /** Executes the system under test and returns a normalized run. */
     run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun<TOutput>>;
 };
 /** Value or promise accepted by lightweight harness callbacks. */
 type MaybePromise<T> = T | Promise<T>;
 /** Lightweight tool-call record accepted by `createHarness(...)` results. */
 type SimpleToolCallRecord = Omit<ToolCallRecord, "arguments" | "result" | "error" | "metadata"> & {
+    /** Raw tool arguments accepted by `createHarness(...)` before normalization. */
     arguments?: unknown;
+    /** Raw tool result accepted by `createHarness(...)` before normalization. */
     result?: unknown;
+    /** Raw tool error accepted by `createHarness(...)` before normalization. */
     error?: unknown;
+    /** Raw tool metadata accepted by `createHarness(...)` before normalization. */
     metadata?: Record<string, unknown>;
 };
-/** Lightweight result shape normalized by `createHarness(...)`. */
+/**
+ * Lightweight result shape normalized by `createHarness(...)`.
+ *
+ * @example
+ * ```ts
+ * const result: SimpleHarnessResult<{ status: "approved" }> = {
+ *   output: { status: "approved" },
+ *   toolCalls: [{ name: "lookupInvoice", arguments: { invoiceId: "inv_123" } }],
+ *   usage: { totalTokens: 260 },
+ * };
+ * ```
+ */
 type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
+    /** Pre-normalized transcript messages. When omitted, a default user/assistant transcript is created. */
     messages?: NormalizedMessage[];
+    /** Lightweight tool-call records to normalize into the session. */
     toolCalls?: SimpleToolCallRecord[];
+    /** Usage summary to attach to the run. */
     usage?: UsageSummary;
+    /** Timing summary to attach to the run. */
     timings?: TimingSummary;
+    /** Raw artifact values to normalize and merge into the run. */
     artifacts?: Record<string, unknown>;
+    /** Raw session metadata to normalize into the session. */
     metadata?: Record<string, unknown>;
+    /** Raw errors to normalize into the run. */
     errors?: unknown[];
 };
 /** Either a complete normalized run or a lightweight result to normalize. */
 type HarnessResultLike<TOutput extends JsonValue | undefined = JsonValue | undefined> = HarnessRun<TOutput> | SimpleHarnessResult<TOutput>;
 /** Arguments passed to the `createHarness(...)` convenience callback. */
 type CreateHarnessRunArgs<TInput, TMetadata extends HarnessMetadata> = {
+    /** Original input passed to `run(input)`. */
     input: TInput;
+    /** Read-only metadata passed to `run(input, { metadata })`. */
     metadata: Readonly<TMetadata>;
+    /** Abort signal from Vitest when available. */
     signal?: AbortSignal;
+    /** Mutable run artifact bag. */
     artifacts: HarnessContext<TMetadata>["artifacts"];
+    /** Stores one JSON-safe artifact on the current run. */
     setArtifact: HarnessContext<TMetadata>["setArtifact"];
 };
-/** Options for creating a lightweight custom application harness. */
+/**
+ * Options for creating a lightweight custom application harness.
+ *
+ * @example
+ * ```ts
+ * const options: CreateHarnessOptions<string, { status: "approved" }> = {
+ *   name: "refund-agent",
+ *   run: async ({ input }) => ({
+ *     output: await classifyRefund(input),
+ *   }),
+ * };
+ * ```
+ */
 type CreateHarnessOptions<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
+    /** Stable harness name used in reports. */
     name: string;
+    /** Executes application code and returns either a lightweight result or full `HarnessRun`. */
     run: (args: CreateHarnessRunArgs<TInput, TMetadata>) => MaybePromise<HarnessResultLike<TOutput>>;
 };
 /** Returns true when a value exposes a callable method with the given name. */
@@ -127,25 +304,157 @@ declare function normalizeRecord(value: Record<string, unknown>): Record<string,
 declare function normalizeMetadata(value: Record<string, unknown>): Record<string, JsonValue> | undefined;
 /** Converts arbitrary content into the JSON-safe message content shape. */
 declare function normalizeContent(value: unknown): JsonValue;
-/** Creates a harness from the common "run app code and return output" shape. */
+/**
+ * Creates a harness from the common "run app code and return output" shape.
+ *
+ * @param options - Harness name plus the callback that executes app code.
+ *
+ * @example
+ * ```ts
+ * import { createHarness } from "vitest-evals";
+ *
+ * export const refundHarness = createHarness<
+ *   string,
+ *   { status: "approved" | "denied" },
+ *   { expected: { status: "approved" | "denied" } }
+ * >({
+ *   name: "refund-agent",
+ *   run: async ({ input, metadata, setArtifact }) => {
+ *     const result = await runRefundFlow(input, metadata);
+ *     const output = { status: result.status };
+ *
+ *     setArtifact("case", { expected: metadata.expected.status });
+ *
+ *     return {
+ *       output,
+ *       toolCalls: result.toolCalls,
+ *       usage: { provider: "openai", model: "gpt-4o-mini" },
+ *     };
+ *   },
+ * });
+ * ```
+ */
 declare function createHarness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata>(options: CreateHarnessOptions<TInput, TOutput, TMetadata>): Harness<TInput, TOutput, TMetadata>;
-/** Normalizes a lightweight harness result into the reporter-facing run shape. */
+/**
+ * Normalizes a lightweight harness result into the reporter-facing run shape.
+ *
+ * @param input - Original input passed to the harness.
+ * @param result - Lightweight result or pre-normalized harness run.
+ * @param context - Optional per-run context used to merge artifacts.
+ *
+ * @example
+ * ```ts
+ * const run = normalizeHarnessRun("Refund invoice inv_123", {
+ *   output: { status: "approved" },
+ *   toolCalls: [{ name: "lookupInvoice", arguments: { invoiceId: "inv_123" } }],
+ *   usage: { provider: "openai", model: "gpt-4o-mini" },
+ * });
+ *
+ * expect(toolCalls(run.session)).toHaveLength(1);
+ * ```
+ */
 declare function normalizeHarnessRun<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, TOutput extends JsonValue | undefined = JsonValue | undefined>(input: TInput, result: HarnessResultLike<TOutput>, context?: HarnessContext<TMetadata>): HarnessRun<TOutput>;
-/** Flattens every recorded tool call from a normalized session. */
+/**
+ * Flattens every recorded tool call from a normalized session.
+ *
+ * @param session - Normalized session produced by a harness run.
+ *
+ * @example
+ * ```ts
+ * const names = toolCalls(result.session).map((call) => call.name);
+ *
+ * expect(names).toEqual(["lookupInvoice", "createRefund"]);
+ * ```
+ */
 declare function toolCalls(session: NormalizedSession): ToolCallRecord[];
-/** Filters normalized session messages by role. */
+/**
+ * Filters normalized session messages by role.
+ *
+ * @param session - Normalized session produced by a harness run.
+ * @param role - Message role to keep.
+ *
+ * @example
+ * ```ts
+ * const assistantText = messagesByRole(result.session, "assistant")
+ *   .map((message) => message.content)
+ *   .join("\n");
+ * ```
+ */
 declare function messagesByRole(session: NormalizedSession, role: NormalizedMessage["role"]): NormalizedMessage[];
-/** Returns every normalized system message from a session. */
+/**
+ * Returns every normalized system message from a session.
+ *
+ * @param session - Normalized session produced by a harness run.
+ *
+ * @example
+ * ```ts
+ * const systemPrompts = systemMessages(result.session);
+ * ```
+ */
 declare function systemMessages(session: NormalizedSession): NormalizedMessage[];
-/** Returns every normalized user message from a session. */
+/**
+ * Returns every normalized user message from a session.
+ *
+ * @param session - Normalized session produced by a harness run.
+ *
+ * @example
+ * ```ts
+ * const firstPrompt = userMessages(result.session)[0]?.content;
+ * ```
+ */
 declare function userMessages(session: NormalizedSession): NormalizedMessage[];
-/** Returns every normalized assistant message from a session. */
+/**
+ * Returns every normalized assistant message from a session.
+ *
+ * @param session - Normalized session produced by a harness run.
+ *
+ * @example
+ * ```ts
+ * const finalAnswer = assistantMessages(result.session).at(-1)?.content;
+ * ```
+ */
 declare function assistantMessages(session: NormalizedSession): NormalizedMessage[];
-/** Returns every normalized tool message from a session. */
+/**
+ * Returns every normalized tool message from a session.
+ *
+ * @param session - Normalized session produced by a harness run.
+ *
+ * @example
+ * ```ts
+ * const toolOutputs = toolMessages(result.session).map((message) => message.content);
+ * ```
+ */
 declare function toolMessages(session: NormalizedSession): NormalizedMessage[];
-/** Attaches a partial or complete harness run to an arbitrary thrown error. */
+/**
+ * Attaches a partial or complete harness run to an arbitrary thrown error.
+ *
+ * @param error - Thrown value to wrap.
+ * @param run - Partial or complete normalized harness run to preserve.
+ *
+ * @example
+ * ```ts
+ * try {
+ *   return await runAgent(input);
+ * } catch (error) {
+ *   throw attachHarnessRunToError(error, partialRun);
+ * }
+ * ```
+ */
 declare function attachHarnessRunToError(error: unknown, run: HarnessRun): HarnessRunError;
-/** Reads an attached harness run back off a previously wrapped error value. */
+/**
+ * Reads an attached harness run back off a previously wrapped error value.
+ *
+ * @param error - Unknown thrown value that may contain a harness run.
+ *
+ * @example
+ * ```ts
+ * const partialRun = getHarnessRunFromError(error);
+ *
+ * if (partialRun) {
+ *   console.log(toolCalls(partialRun.session));
+ * }
+ * ```
+ */
 declare function getHarnessRunFromError(error: unknown): HarnessRun | undefined;
 /** Returns true when a value matches the normalized `HarnessRun` contract. */
 declare function isHarnessRun(value: unknown): value is HarnessRun;