npm - vitest-evals - Versions diffs - 0.9.0-beta.3 → 0.9.0-beta.5 - Mend

vitest-evals 0.9.0-beta.3 → 0.9.0-beta.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74)

package/README.md +104 -97
package/dist/harness.d.mts +59 -18
package/dist/harness.d.ts +59 -18
package/dist/harness.js +136 -1
package/dist/harness.js.map +1 -1
package/dist/harness.mjs +134 -1
package/dist/harness.mjs.map +1 -1
package/dist/index.d.mts +45 -29
package/dist/index.d.ts +45 -29
package/dist/index.js +293 -103
package/dist/index.js.map +1 -1
package/dist/index.mjs +290 -102
package/dist/index.mjs.map +1 -1
package/dist/internal/matchers.d.mts +4 -0
package/dist/internal/matchers.d.ts +4 -0
package/dist/internal/matchers.js.map +1 -1
package/dist/internal/matchers.mjs.map +1 -1
package/dist/internal/structuredOutputScorer.js.map +1 -1
package/dist/internal/structuredOutputScorer.mjs.map +1 -1
package/dist/internal/toolCallScorer.js.map +1 -1
package/dist/internal/toolCallScorer.mjs.map +1 -1
package/dist/judges/index.d.mts +1 -1
package/dist/judges/index.d.ts +1 -1
package/dist/judges/index.js +37 -23
package/dist/judges/index.js.map +1 -1
package/dist/judges/index.mjs +37 -23
package/dist/judges/index.mjs.map +1 -1
package/dist/judges/structuredOutputJudge.d.mts +6 -3
package/dist/judges/structuredOutputJudge.d.ts +6 -3
package/dist/judges/structuredOutputJudge.js +11 -11
package/dist/judges/structuredOutputJudge.js.map +1 -1
package/dist/judges/structuredOutputJudge.mjs +11 -11
package/dist/judges/structuredOutputJudge.mjs.map +1 -1
package/dist/judges/toolCallJudge.d.mts +6 -3
package/dist/judges/toolCallJudge.d.ts +6 -3
package/dist/judges/toolCallJudge.js +26 -12
package/dist/judges/toolCallJudge.js.map +1 -1
package/dist/judges/toolCallJudge.mjs +26 -12
package/dist/judges/toolCallJudge.mjs.map +1 -1
package/dist/judges/types.d.mts +33 -16
package/dist/judges/types.d.ts +33 -16
package/dist/judges/types.js.map +1 -1
package/dist/legacy/evaluate/index.d.mts +2 -0
package/dist/legacy/evaluate/index.d.ts +2 -0
package/dist/legacy/evaluate/index.js.map +1 -1
package/dist/legacy/evaluate/index.mjs.map +1 -1
package/dist/legacy/scorers/index.js.map +1 -1
package/dist/legacy/scorers/index.mjs.map +1 -1
package/dist/legacy/scorers/structuredOutputScorer.d.mts +2 -0
package/dist/legacy/scorers/structuredOutputScorer.d.ts +2 -0
package/dist/legacy/scorers/structuredOutputScorer.js.map +1 -1
package/dist/legacy/scorers/structuredOutputScorer.mjs.map +1 -1
package/dist/legacy/scorers/toolCallScorer.d.mts +2 -0
package/dist/legacy/scorers/toolCallScorer.d.ts +2 -0
package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
package/dist/legacy/scorers/utils.js.map +1 -1
package/dist/legacy/scorers/utils.mjs.map +1 -1
package/dist/legacy/shared.d.mts +6 -0
package/dist/legacy/shared.d.ts +6 -0
package/dist/legacy/shared.js.map +1 -1
package/dist/legacy.d.mts +3 -0
package/dist/legacy.d.ts +3 -0
package/dist/legacy.js.map +1 -1
package/dist/legacy.mjs.map +1 -1
package/dist/replay.d.mts +7 -0
package/dist/replay.d.ts +7 -0
package/dist/replay.js.map +1 -1
package/dist/replay.mjs.map +1 -1
package/dist/reporter.d.mts +1 -0
package/dist/reporter.d.ts +1 -0
package/dist/reporter.js.map +1 -1
package/dist/reporter.mjs.map +1 -1
package/package.json +1 -1

package/README.md CHANGED Viewed

@@ -18,6 +18,12 @@ npm install -D @vitest-evals/harness-ai-sdk
 npm install -D @vitest-evals/harness-openai-agents
 ```
+For GitHub Actions summaries and annotations, install the JSON post-processor:
+```sh
+npm install -D @vitest-evals/github-reporter
+```
 ## Core Model
 - `describeEval(...)` binds exactly one harness to a suite
@@ -27,13 +33,18 @@ npm install -D @vitest-evals/harness-openai-agents
 - the returned `result.output` is the app-facing value you assert on directly
 - the returned `result.session` is the canonical JSON-serializable trace for
   reporting, replay, tool assertions, and judges
-- scenario-specific judge criteria can live in `inputValue`; use `metadata` for
+- scenario-specific judge criteria can live in `input`; use `metadata` for
   per-run expectations or harness configuration that are not part of the
   scenario payload
 - suite-level `judges` are optional and run automatically after each `run(...)`
 - suite-level `judgeThreshold` controls fail-on-score for those automatic judges
-- every judge receives `JudgeContext`, including the configured `harness` with
-  its required `prompt` function
+- every judge is a named object with `assess(ctx)`
+- every judge receives `JudgeContext` with typed `input`, typed `output`, the
+  normalized run/session, tool calls, and metadata; `output` is only optional
+  when the harness output type includes `undefined`
+- judges own their prompt, rubric, model call, and parsing; use
+  `createJudge(...)` for custom judges and its provider-helper overload only
+  when multiple judges share setup
 - explicit judge assertions use
   `await expect(result).toSatisfyJudge(judge, context)`
@@ -43,25 +54,29 @@ npm install -D @vitest-evals/harness-openai-agents
 import { expect } from "vitest";
 import { piAiHarness } from "@vitest-evals/harness-pi-ai";
 import {
+  createJudge,
   describeEval,
-  namedJudge,
   toolCalls,
   type JudgeContext,
 } from "vitest-evals";
-import { createRefundAgent, judgePrompt } from "../src/refundAgent";
+import { createRefundAgent } from "../src/refundAgent";
 type RefundEvalMetadata = {
   expectedStatus: "approved" | "denied";
   expectedTools: string[];
 };
-const FactualityJudge = namedJudge(
+type RefundOutput = {
+  status: "approved" | "denied";
+};
+const FactualityJudge = createJudge(
   "FactualityJudge",
   async ({
     input,
     output,
     metadata,
-  }: JudgeContext<string, RefundEvalMetadata>) => {
+  }: JudgeContext<string, RefundOutput, RefundEvalMetadata>) => {
     const verdict = await judgeFactuality({
       question: input,
       answer: output,
@@ -81,8 +96,7 @@ describeEval(
   "refund agent",
   {
     harness: piAiHarness({
-      createAgent: () => createRefundAgent(),
-      prompt: judgePrompt,
+      agent: () => createRefundAgent(),
     }),
     judges: [FactualityJudge],
   },
@@ -135,6 +149,24 @@ describeEval("refund agent", { harness }, (it) => {
 });
 ```
+## GitHub Actions Reporting
+Use Vitest JSON as the eval report artifact. It preserves the `meta` field that
+contains eval scores and normalized harness runs.
+```sh
+vitest run evals \
+  --reporter=vitest-evals/reporter \
+  --reporter=json \
+  --outputFile.json=vitest-results.json
+vitest-evals-github-report
+```
+The GitHub reporter writes a job summary when `GITHUB_STEP_SUMMARY` is present,
+emits short failure annotations in Actions, and can publish a separate Check Run
+with `--check-run` when `checks: write` permission is configured.
 ## Existing Agents
 For an existing agent, the intended contract is:
@@ -157,37 +189,31 @@ be inferred automatically. Treat low-level normalization callbacks as an escape
 hatch, not part of the primary authoring path.
 For OpenAI Agents SDK apps, use
-`@vitest-evals/harness-openai-agents` with an existing `Agent` or
-`createAgent()` factory and a `Runner` / `createRunner()` callback. The harness
-calls `Runner.run(agent, input, options)` by default and exposes the same
+`@vitest-evals/harness-openai-agents` with an existing `Agent` or an `agent`
+factory and a `Runner` or `runner` factory. The harness calls
+`Runner.run(agent, input, options)` by default and exposes the same
 normalization and replay hooks when the app needs a custom entrypoint or
 structured domain output mapping.
 ## Custom App Harnesses
 First-party harness packages are conveniences, not the only supported path. If
-you need to test a full application flow, define a harness that runs your app
-through its normal entrypoint and returns a normalized `HarnessRun`. The same
-harness should also expose `prompt`, which LLM-backed judges can reuse through
-`JudgeContext.harness.prompt`.
+you need to test a full application flow, use `createHarness(...)` to run your
+app through its normal entrypoint and return the app-facing output. Judges own
+their prompt/rubric text separately from the system under test.
+When generics are needed, use `createHarness<Input, Output, Metadata>(...)`.
 ```ts
 import {
+  createHarness,
+  createJudge,
   describeEval,
-  namedJudge,
   type JudgeContext,
 } from "vitest-evals";
-import {
-  normalizeContent,
-  normalizeMetadata,
-  toJsonValue,
-  type Harness,
-  type HarnessRun,
-} from "vitest-evals/harness";
 type AppEvent = {
   type: string;
-  payload: Record<string, unknown>;
+  payload: Record<string, string>;
 };
 type AppEvalInput = {
@@ -199,65 +225,42 @@ type AppEvalInput = {
   };
 };
-const appHarness: Harness<AppEvalInput> = {
+type AppEvalMetadata = Record<string, never>;
+type AppOutput = {
+  replies: Array<{ text: string }>;
+  sideEffects: string[];
+};
+const appHarness = createHarness<AppEvalInput, AppOutput, AppEvalMetadata>({
   name: "custom-app",
-  prompt: (input, options) => promptJudgeModel(input, options),
-  run: async (input, context): Promise<HarnessRun> => {
+  run: async ({ input, signal }) => {
     const result = await replayAppEvents(input.events, {
-      signal: context.signal,
+      signal,
     });
-    const output = {
-      replies: result.replies,
-      sideEffects: result.sideEffects,
-    };
     return {
-      output: toJsonValue(output),
-      session: {
-        messages: [
-          ...input.events.map((event) => ({
-            role: "user" as const,
-            content: normalizeContent(event),
-          })),
-          ...result.replies.map((reply) => ({
-            role: "assistant" as const,
-            content: normalizeContent(reply.text),
-            metadata: normalizeMetadata({
-              target: reply.target,
-            }),
-          })),
-        ],
-        outputText: result.replies.map((reply) => reply.text).join("\n\n"),
-        metadata: normalizeMetadata({
-          replyCount: result.replies.length,
-        }),
+      output: {
+        replies: result.replies,
+        sideEffects: result.sideEffects,
+      },
+      artifacts: {
+        replyCount: result.replies.length,
       },
       usage: {},
-      artifacts:
-        Object.keys(context.artifacts).length > 0
-          ? context.artifacts
-          : undefined,
-      errors: [],
     };
   },
-};
+});
-const AppRubricJudge = namedJudge(
+const AppRubricJudge = createJudge(
   "AppRubricJudge",
-  async (
-    ctx: JudgeContext<AppEvalInput, Record<string, unknown>, typeof appHarness>,
-  ) => {
-    const verdict = await ctx.harness.prompt(
-      formatRubricPrompt({
+  async (ctx: JudgeContext<AppEvalInput, AppOutput, AppEvalMetadata>) => {
+    const verdict = await promptJudgeModel({
+      prompt: formatRubricPrompt({
         output: ctx.output,
-        criteria: ctx.inputValue.criteria,
+        criteria: ctx.input.criteria,
       }),
-      {
-        metadata: {
-          judge: "AppRubricJudge",
-        },
-      },
-    );
+    });
     return parseRubricVerdict(verdict);
   },
@@ -292,16 +295,16 @@ describeEval(
 );
 ```
-Use `Harness.run(...)` for the application under test and `Harness.prompt(...)`
-for judge model calls. Calling `ctx.harness.run(...)` from inside a judge runs
-the application a second time, so reserve that for judges that intentionally
-need a second execution. Put criteria on `inputValue` when they are part of the
-scenario itself; use per-run `metadata` for harness configuration or
-expectations that are not part of the scenario payload. `session.outputText` is
-the canonical text sent to judges, so define it deliberately when your app
-returns structured artifacts.
+Use `Harness.run(...)` for the application under test. Calling
+`ctx.harness.run(...)` from inside a judge runs the application a second time,
+so reserve that for judges that intentionally need a second execution. Put
+criteria on `input` when they are part of the scenario itself; use per-run
+`metadata` for harness configuration or expectations that are not part of the
+scenario payload. `createHarness(...)` builds a default user/assistant session
+from `input` and typed `output`; return a full `HarnessRun` only when you need
+exact session control.
-Provider setup and rubric parsing stay in your harness and judge. The core
+Provider setup and rubric parsing stay in your judge. The core
 package only requires the judge to return a `JudgeResult` with a score and
 optional metadata.
@@ -330,17 +333,17 @@ context:
 ```ts
 await expect({ status: "approved" }).toSatisfyJudge(MyJudge, {
-  inputValue: "Refund invoice inv_123",
+  input: "Refund invoice inv_123",
 });
 ```
-If you are writing a custom judge, wrap it with `namedJudge(...)` so reporter
-output uses a stable label:
+Use `createJudge(...)` for custom judges so reporter output gets a stable
+label:
 ```ts
-import { namedJudge } from "vitest-evals";
+import { createJudge } from "vitest-evals";
-const FactualityJudge = namedJudge(
+const FactualityJudge = createJudge(
   "FactualityJudge",
   async ({ output }) => {
     const answer = output;
@@ -356,21 +359,25 @@ const FactualityJudge = namedJudge(
 );
 ```
-LLM-backed judges can reuse the suite harness prompt by calling
-`harness.prompt(...)`. `vitest-evals` does not prescribe a rubric schema,
-scoring scale, model provider, or parser; those stay in the judge. Calling
-`harness.run(...)` from a judge executes the application again, so use that
-only when a second run is intentional.
+LLM-backed judges should provide their own judge prompt and rubric text.
+`vitest-evals` does not prescribe a rubric schema, scoring scale, model
+provider, or parser; those stay in the judge. When multiple judges share a
+reusable judge-side provider helper, use the provider-helper overload of
+`createJudge(...)` so run-scoped options such as abort signals stay curried.
+Calling `harness.run(...)` from a judge executes the application again, so use
+that only when a second run is intentional.
 For an `EvalHarnessRun` returned by fixture `run(...)`,
-`toSatisfyJudge(...)` uses the run's canonical text output and reuses the
-registered input, metadata, and harness prompt. Inside an eval test,
-matcher calls on registered raw output or session objects reuse that exact run
-context; raw output values are serialized as the judge `output`, so
-`expect(result.output).toSatisfyJudge(judge)` stays concise. Other raw values
-fall back to the current test's most recent `run(...)` context. For
+`toSatisfyJudge(...)` uses the run's typed `output` and reuses the registered
+input and metadata. It requires any custom judge params and rejects judges whose
+output type cannot assess the received value. Inside an eval test,
+matcher calls on registered output objects or session objects reuse that exact
+run context when the value can be registered by reference, so
+`expect(result.output).toSatisfyJudge(judge)` stays concise for structured
+outputs. Other raw values fall back to the current test's most recent
+`run(...)` context. For
 manually-created runs or values outside an eval context, pass any required
-`inputValue`, `metadata`, or `harness` in matcher options. Structured or
+`input`, `metadata`, or `harness` in matcher options. Structured or
 programmatic result checks should usually assert on `result.output` directly.
 When a judge needs richer normalized context or the configured suite harness,
 type it with `JudgeContext`.

package/dist/harness.d.mts CHANGED Viewed

@@ -1,7 +1,10 @@
+/** Primitive scalar values allowed in normalized JSON-safe eval data. */
 type JsonPrimitive = string | number | boolean | null;
+/** JSON-safe value shape used by normalized sessions, artifacts, and errors. */
 type JsonValue = JsonPrimitive | JsonValue[] | {
     [key: string]: JsonValue;
 };
+/** Normalized record for one tool call observed during a harness run. */
 type ToolCallRecord = {
     id?: string;
     name: string;
@@ -17,12 +20,14 @@ type ToolCallRecord = {
     durationMs?: number;
     metadata?: Record<string, JsonValue>;
 };
+/** Normalized message recorded in a harness session transcript. */
 type NormalizedMessage = {
     role: "system" | "user" | "assistant" | "tool";
     content?: JsonValue;
     toolCalls?: ToolCallRecord[];
     metadata?: Record<string, JsonValue>;
 };
+/** Provider usage summary attached to a normalized harness run. */
 type UsageSummary = {
     provider?: string;
     model?: string;
@@ -35,50 +40,82 @@ type UsageSummary = {
     retries?: number;
     metadata?: Record<string, JsonValue>;
 };
+/** Timing summary attached to a normalized harness run. */
 type TimingSummary = {
     totalMs?: number;
     metadata?: Record<string, JsonValue>;
 };
+/** JSON-serializable transcript produced by the system under test. */
 type NormalizedSession = {
     messages: NormalizedMessage[];
-    outputText?: string;
     provider?: string;
     model?: string;
     metadata?: Record<string, JsonValue>;
 };
-type HarnessRun = {
+type OutputField<TOutput extends JsonValue | undefined> = undefined extends TOutput ? {
+    output?: TOutput;
+} : {
+    output: TOutput;
+};
+/** Normalized result returned by every harness execution. */
+type HarnessRun<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
     session: NormalizedSession;
-    output?: JsonValue;
     usage: UsageSummary;
     timings?: TimingSummary;
     artifacts?: Record<string, JsonValue>;
     errors: Array<Record<string, JsonValue>>;
 };
-/** Optional provider-facing hints for harness prompt calls. */
-type HarnessPromptOptions = {
-    system?: string;
-    metadata?: Record<string, JsonValue>;
-};
-/** Provider-agnostic prompt seam that judges can reuse from a harness. */
-type HarnessPrompt = (input: string, options?: HarnessPromptOptions) => Promise<string>;
+/** Error value with an attached partial or complete normalized harness run. */
 type HarnessRunError = Error & {
     vitestEvalsRun: HarnessRun;
 };
+/** Per-run metadata shape accepted by harnesses and eval tests. */
 type HarnessMetadata = Record<string, unknown>;
+/** Runtime context passed from the eval fixture into a harness run. */
 type HarnessContext<TMetadata extends HarnessMetadata = HarnessMetadata> = {
     metadata: Readonly<TMetadata>;
-    task: {
-        meta: Record<string, unknown>;
-    };
     signal?: AbortSignal;
     artifacts: Record<string, JsonValue>;
     setArtifact: (name: string, value: JsonValue) => void;
 };
-type Harness<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata> = {
+/** Adapter that executes the system under test and returns a normalized run. */
+type Harness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
+    name: string;
+    run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun<TOutput>>;
+};
+/** Value or promise accepted by lightweight harness callbacks. */
+type MaybePromise<T> = T | Promise<T>;
+/** Lightweight tool-call record accepted by `createHarness(...)` results. */
+type SimpleToolCallRecord = Omit<ToolCallRecord, "arguments" | "result" | "error" | "metadata"> & {
+    arguments?: unknown;
+    result?: unknown;
+    error?: unknown;
+    metadata?: Record<string, unknown>;
+};
+/** Lightweight result shape normalized by `createHarness(...)`. */
+type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
+    messages?: NormalizedMessage[];
+    toolCalls?: SimpleToolCallRecord[];
+    usage?: UsageSummary;
+    timings?: TimingSummary;
+    artifacts?: Record<string, unknown>;
+    metadata?: Record<string, unknown>;
+    errors?: unknown[];
+};
+/** Either a complete normalized run or a lightweight result to normalize. */
+type HarnessResultLike<TOutput extends JsonValue | undefined = JsonValue | undefined> = HarnessRun<TOutput> | SimpleHarnessResult<TOutput>;
+/** Arguments passed to the `createHarness(...)` convenience callback. */
+type CreateHarnessRunArgs<TInput, TMetadata extends HarnessMetadata> = {
+    input: TInput;
+    metadata: Readonly<TMetadata>;
+    signal?: AbortSignal;
+    artifacts: HarnessContext<TMetadata>["artifacts"];
+    setArtifact: HarnessContext<TMetadata>["setArtifact"];
+};
+/** Options for creating a lightweight custom application harness. */
+type CreateHarnessOptions<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
     name: string;
-    /** Prompt seam reused by LLM-backed judges. */
-    prompt: HarnessPrompt;
-    run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun>;
+    run: (args: CreateHarnessRunArgs<TInput, TMetadata>) => MaybePromise<HarnessResultLike<TOutput>>;
 };
 /** Returns true when a value exposes a callable method with the given name. */
 declare function hasCallableMethod(value: unknown, methodName: string): boolean;
@@ -90,6 +127,10 @@ declare function normalizeRecord(value: Record<string, unknown>): Record<string,
 declare function normalizeMetadata(value: Record<string, unknown>): Record<string, JsonValue> | undefined;
 /** Converts arbitrary content into the JSON-safe message content shape. */
 declare function normalizeContent(value: unknown): JsonValue;
+/** Creates a harness from the common "run app code and return output" shape. */
+declare function createHarness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata>(options: CreateHarnessOptions<TInput, TOutput, TMetadata>): Harness<TInput, TOutput, TMetadata>;
+/** Normalizes a lightweight harness result into the reporter-facing run shape. */
+declare function normalizeHarnessRun<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, TOutput extends JsonValue | undefined = JsonValue | undefined>(input: TInput, result: HarnessResultLike<TOutput>, context?: HarnessContext<TMetadata>): HarnessRun<TOutput>;
 /** Flattens every recorded tool call from a normalized session. */
 declare function toolCalls(session: NormalizedSession): ToolCallRecord[];
 /** Filters normalized session messages by role. */
@@ -115,4 +156,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
 /** Serializes an arbitrary thrown value into the normalized error shape. */
 declare function serializeError(error: unknown): Record<string, JsonValue>;
-export { type Harness, type HarnessContext, type HarnessMetadata, type HarnessPrompt, type HarnessPromptOptions, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type NormalizedMessage, type NormalizedSession, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
+export { type CreateHarnessOptions, type CreateHarnessRunArgs, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type MaybePromise, type NormalizedMessage, type NormalizedSession, type SimpleHarnessResult, type SimpleToolCallRecord, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, createHarness, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };

package/dist/harness.d.ts CHANGED Viewed

@@ -1,7 +1,10 @@
+/** Primitive scalar values allowed in normalized JSON-safe eval data. */
 type JsonPrimitive = string | number | boolean | null;
+/** JSON-safe value shape used by normalized sessions, artifacts, and errors. */
 type JsonValue = JsonPrimitive | JsonValue[] | {
     [key: string]: JsonValue;
 };
+/** Normalized record for one tool call observed during a harness run. */
 type ToolCallRecord = {
     id?: string;
     name: string;
@@ -17,12 +20,14 @@ type ToolCallRecord = {
     durationMs?: number;
     metadata?: Record<string, JsonValue>;
 };
+/** Normalized message recorded in a harness session transcript. */
 type NormalizedMessage = {
     role: "system" | "user" | "assistant" | "tool";
     content?: JsonValue;
     toolCalls?: ToolCallRecord[];
     metadata?: Record<string, JsonValue>;
 };
+/** Provider usage summary attached to a normalized harness run. */
 type UsageSummary = {
     provider?: string;
     model?: string;
@@ -35,50 +40,82 @@ type UsageSummary = {
     retries?: number;
     metadata?: Record<string, JsonValue>;
 };
+/** Timing summary attached to a normalized harness run. */
 type TimingSummary = {
     totalMs?: number;
     metadata?: Record<string, JsonValue>;
 };
+/** JSON-serializable transcript produced by the system under test. */
 type NormalizedSession = {
     messages: NormalizedMessage[];
-    outputText?: string;
     provider?: string;
     model?: string;
     metadata?: Record<string, JsonValue>;
 };
-type HarnessRun = {
+type OutputField<TOutput extends JsonValue | undefined> = undefined extends TOutput ? {
+    output?: TOutput;
+} : {
+    output: TOutput;
+};
+/** Normalized result returned by every harness execution. */
+type HarnessRun<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
     session: NormalizedSession;
-    output?: JsonValue;
     usage: UsageSummary;
     timings?: TimingSummary;
     artifacts?: Record<string, JsonValue>;
     errors: Array<Record<string, JsonValue>>;
 };
-/** Optional provider-facing hints for harness prompt calls. */
-type HarnessPromptOptions = {
-    system?: string;
-    metadata?: Record<string, JsonValue>;
-};
-/** Provider-agnostic prompt seam that judges can reuse from a harness. */
-type HarnessPrompt = (input: string, options?: HarnessPromptOptions) => Promise<string>;
+/** Error value with an attached partial or complete normalized harness run. */
 type HarnessRunError = Error & {
     vitestEvalsRun: HarnessRun;
 };
+/** Per-run metadata shape accepted by harnesses and eval tests. */
 type HarnessMetadata = Record<string, unknown>;
+/** Runtime context passed from the eval fixture into a harness run. */
 type HarnessContext<TMetadata extends HarnessMetadata = HarnessMetadata> = {
     metadata: Readonly<TMetadata>;
-    task: {
-        meta: Record<string, unknown>;
-    };
     signal?: AbortSignal;
     artifacts: Record<string, JsonValue>;
     setArtifact: (name: string, value: JsonValue) => void;
 };
-type Harness<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata> = {
+/** Adapter that executes the system under test and returns a normalized run. */
+type Harness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
+    name: string;
+    run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun<TOutput>>;
+};
+/** Value or promise accepted by lightweight harness callbacks. */
+type MaybePromise<T> = T | Promise<T>;
+/** Lightweight tool-call record accepted by `createHarness(...)` results. */
+type SimpleToolCallRecord = Omit<ToolCallRecord, "arguments" | "result" | "error" | "metadata"> & {
+    arguments?: unknown;
+    result?: unknown;
+    error?: unknown;
+    metadata?: Record<string, unknown>;
+};
+/** Lightweight result shape normalized by `createHarness(...)`. */
+type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
+    messages?: NormalizedMessage[];
+    toolCalls?: SimpleToolCallRecord[];
+    usage?: UsageSummary;
+    timings?: TimingSummary;
+    artifacts?: Record<string, unknown>;
+    metadata?: Record<string, unknown>;
+    errors?: unknown[];
+};
+/** Either a complete normalized run or a lightweight result to normalize. */
+type HarnessResultLike<TOutput extends JsonValue | undefined = JsonValue | undefined> = HarnessRun<TOutput> | SimpleHarnessResult<TOutput>;
+/** Arguments passed to the `createHarness(...)` convenience callback. */
+type CreateHarnessRunArgs<TInput, TMetadata extends HarnessMetadata> = {
+    input: TInput;
+    metadata: Readonly<TMetadata>;
+    signal?: AbortSignal;
+    artifacts: HarnessContext<TMetadata>["artifacts"];
+    setArtifact: HarnessContext<TMetadata>["setArtifact"];
+};
+/** Options for creating a lightweight custom application harness. */
+type CreateHarnessOptions<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
     name: string;
-    /** Prompt seam reused by LLM-backed judges. */
-    prompt: HarnessPrompt;
-    run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun>;
+    run: (args: CreateHarnessRunArgs<TInput, TMetadata>) => MaybePromise<HarnessResultLike<TOutput>>;
 };
 /** Returns true when a value exposes a callable method with the given name. */
 declare function hasCallableMethod(value: unknown, methodName: string): boolean;
@@ -90,6 +127,10 @@ declare function normalizeRecord(value: Record<string, unknown>): Record<string,
 declare function normalizeMetadata(value: Record<string, unknown>): Record<string, JsonValue> | undefined;
 /** Converts arbitrary content into the JSON-safe message content shape. */
 declare function normalizeContent(value: unknown): JsonValue;
+/** Creates a harness from the common "run app code and return output" shape. */
+declare function createHarness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata>(options: CreateHarnessOptions<TInput, TOutput, TMetadata>): Harness<TInput, TOutput, TMetadata>;
+/** Normalizes a lightweight harness result into the reporter-facing run shape. */
+declare function normalizeHarnessRun<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, TOutput extends JsonValue | undefined = JsonValue | undefined>(input: TInput, result: HarnessResultLike<TOutput>, context?: HarnessContext<TMetadata>): HarnessRun<TOutput>;
 /** Flattens every recorded tool call from a normalized session. */
 declare function toolCalls(session: NormalizedSession): ToolCallRecord[];
 /** Filters normalized session messages by role. */
@@ -115,4 +156,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
 /** Serializes an arbitrary thrown value into the normalized error shape. */
 declare function serializeError(error: unknown): Record<string, JsonValue>;
-export { type Harness, type HarnessContext, type HarnessMetadata, type HarnessPrompt, type HarnessPromptOptions, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type NormalizedMessage, type NormalizedSession, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
+export { type CreateHarnessOptions, type CreateHarnessRunArgs, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type MaybePromise, type NormalizedMessage, type NormalizedSession, type SimpleHarnessResult, type SimpleToolCallRecord, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, createHarness, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };