vitest-evals 0.12.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/README.md +27 -35
  2. package/dist/harness.d.mts +15 -20
  3. package/dist/harness.d.ts +15 -20
  4. package/dist/harness.js +0 -1
  5. package/dist/harness.js.map +1 -1
  6. package/dist/harness.mjs +0 -1
  7. package/dist/harness.mjs.map +1 -1
  8. package/dist/index.d.mts +45 -68
  9. package/dist/index.d.ts +45 -68
  10. package/dist/index.js +21 -40
  11. package/dist/index.js.map +1 -1
  12. package/dist/index.mjs +21 -40
  13. package/dist/index.mjs.map +1 -1
  14. package/dist/internal/toolCallScorer.js.map +1 -1
  15. package/dist/internal/toolCallScorer.mjs.map +1 -1
  16. package/dist/judges/factualityJudge.d.mts +14 -13
  17. package/dist/judges/factualityJudge.d.ts +14 -13
  18. package/dist/judges/factualityJudge.js +9 -9
  19. package/dist/judges/factualityJudge.js.map +1 -1
  20. package/dist/judges/factualityJudge.mjs +9 -9
  21. package/dist/judges/factualityJudge.mjs.map +1 -1
  22. package/dist/judges/index.js +17 -20
  23. package/dist/judges/index.js.map +1 -1
  24. package/dist/judges/index.mjs +17 -20
  25. package/dist/judges/index.mjs.map +1 -1
  26. package/dist/judges/judgeHarness.d.mts +6 -10
  27. package/dist/judges/judgeHarness.d.ts +6 -10
  28. package/dist/judges/judgeHarness.js +3 -8
  29. package/dist/judges/judgeHarness.js.map +1 -1
  30. package/dist/judges/judgeHarness.mjs +3 -8
  31. package/dist/judges/judgeHarness.mjs.map +1 -1
  32. package/dist/judges/structuredOutputJudge.d.mts +7 -9
  33. package/dist/judges/structuredOutputJudge.d.ts +7 -9
  34. package/dist/judges/structuredOutputJudge.js +3 -3
  35. package/dist/judges/structuredOutputJudge.js.map +1 -1
  36. package/dist/judges/structuredOutputJudge.mjs +3 -3
  37. package/dist/judges/structuredOutputJudge.mjs.map +1 -1
  38. package/dist/judges/toolCallJudge.d.mts +12 -9
  39. package/dist/judges/toolCallJudge.d.ts +12 -9
  40. package/dist/judges/toolCallJudge.js +3 -3
  41. package/dist/judges/toolCallJudge.js.map +1 -1
  42. package/dist/judges/toolCallJudge.mjs +3 -3
  43. package/dist/judges/toolCallJudge.mjs.map +1 -1
  44. package/dist/judges/types.d.mts +13 -24
  45. package/dist/judges/types.d.ts +13 -24
  46. package/dist/judges/types.js.map +1 -1
  47. package/dist/legacy/scorers/index.js.map +1 -1
  48. package/dist/legacy/scorers/index.mjs.map +1 -1
  49. package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
  50. package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
  51. package/dist/legacy.js.map +1 -1
  52. package/dist/legacy.mjs.map +1 -1
  53. package/dist/reporter.js.map +1 -1
  54. package/dist/reporter.mjs.map +1 -1
  55. package/package.json +3 -3
package/dist/index.d.ts CHANGED
@@ -1,9 +1,9 @@
1
1
  import * as vitest from 'vitest';
2
2
  import { TestAPI } from 'vitest';
3
- import { HarnessMetadata, Harness } from './harness.js';
4
- export { CreateHarnessOptions, CreateHarnessRunArgs, CreateToolCallSpansOptions, EnsureRunTraceOptions, HarnessContext, HarnessResultLike, MaybePromise, SimpleHarnessResult, SimpleSpanEvent, SimpleSpanRecord, SimpleToolCallRecord, SimpleTraceRecord, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, createToolCallSpans, ensureRunTrace, getHarnessRunFromError, normalizeHarnessRun, normalizeSpanAttributes, normalizeSpanError, toJsonValue } from './harness.js';
5
- import { JudgeContext, Judge, JudgeResult, JudgeAssessFn, JudgeAssessor, JudgeAssessWithAssessorFn } from './judges/types.js';
6
- export { BoundJudgeAssessor, JudgeAssessorOptions, JudgeOptions } from './judges/types.js';
3
+ import { Harness } from './harness.js';
4
+ export { CreateHarnessOptions, CreateHarnessRunArgs, CreateToolCallSpansOptions, EnsureRunTraceOptions, HarnessContext, HarnessMetadata, HarnessResultLike, MaybePromise, SimpleHarnessResult, SimpleSpanEvent, SimpleSpanRecord, SimpleToolCallRecord, SimpleTraceRecord, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, createToolCallSpans, ensureRunTrace, getHarnessRunFromError, normalizeHarnessRun, normalizeSpanAttributes, normalizeSpanError, toJsonValue } from './harness.js';
5
+ import { JudgeContext, Judge, JudgeResult, JudgeAssessFn, JudgeOptions, JudgeAssessor, JudgeAssessWithAssessorFn } from './judges/types.js';
6
+ export { BoundJudgeAssessor, JudgeAssessorOptions } from './judges/types.js';
7
7
  import { JudgeHarness } from './judges/judgeHarness.js';
8
8
  export { CreateJudgeHarnessOptions, CreateJudgeHarnessRunOptions, JudgeHarnessInput, JudgeHarnessOutput, RunJudge, RunJudgeOptions, createJudgeHarness, runJudgeHarness } from './judges/judgeHarness.js';
9
9
  export { wrapText } from './wrapText.js';
@@ -32,14 +32,14 @@ type EvalTaskMeta = {
32
32
  run: HarnessRun;
33
33
  };
34
34
  };
35
- type HarnessInput<THarness extends Harness<any, any, any>> = THarness extends Harness<infer TInput, any, any> ? TInput : unknown;
36
- type HarnessMetadataFor<THarness extends Harness<any, any, any>> = THarness extends Harness<any, any, infer TMetadata> ? TMetadata : HarnessMetadata;
37
- type HarnessOutput<THarness extends Harness<any, any, any>> = THarness extends Harness<any, infer TOutput, any> ? TOutput : JsonValue | undefined;
38
- type CreateJudgeConfig<TOptions extends JudgeContext<any, any, any, any> = JudgeContext> = {
35
+ type HarnessInput<THarness extends Harness<any, any>> = THarness extends Harness<infer TInput, any> ? TInput : unknown;
36
+ type HarnessOutput<THarness extends Harness<any, any>> = THarness extends Harness<any, infer TOutput> ? TOutput : JsonValue | undefined;
37
+ type CreateJudgeConfig<TOptions extends JudgeContext<any, any, any> = JudgeContext> = {
39
38
  name: string;
40
39
  judgeHarness?: JudgeHarness;
41
40
  assess: JudgeAssessFn<TOptions>;
42
41
  };
42
+ type CreateJudgeContext<TInput, TOutput extends JsonValue | undefined, TOptions extends object, THarness extends Harness<TInput, TOutput> | undefined = Harness<TInput, TOutput> | undefined> = JudgeOptions<TInput, TOutput, TOptions, THarness>;
43
43
  declare const evalHarnessRunBrand: unique symbol;
44
44
  /**
45
45
  * Harness run returned by the fixture-backed `run(...)` API.
@@ -53,42 +53,22 @@ declare const evalHarnessRunBrand: unique symbol;
53
53
  * });
54
54
  * ```
55
55
  */
56
- type EvalHarnessRun<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata, THarness extends Harness<TInput, TOutput, TMetadata> = Harness<TInput, TOutput, TMetadata>> = HarnessRun<TOutput> & {
56
+ type EvalHarnessRun<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, THarness extends Harness<TInput, TOutput> = Harness<TInput, TOutput>> = HarnessRun<TOutput> & {
57
57
  readonly [evalHarnessRunBrand]: {
58
58
  readonly input: TInput;
59
- readonly metadata: TMetadata;
60
59
  readonly output: TOutput;
61
60
  readonly harness: THarness;
62
61
  };
63
62
  };
64
- /**
65
- * Per-run metadata forwarded to the harness alongside the test input.
66
- *
67
- * @example
68
- * ```ts
69
- * await run("Refund invoice inv_123", {
70
- * metadata: {
71
- * expected: { status: "approved" },
72
- * expectedTools: ["lookupInvoice", "createRefund"],
73
- * },
74
- * });
75
- * ```
76
- */
77
- interface EvalRunOptions<TMetadata extends HarnessMetadata = HarnessMetadata> {
78
- /** Per-run expectations or configuration forwarded to harnesses and judges. */
79
- metadata?: TMetadata;
80
- }
81
63
  /**
82
64
  * Explicit harness execution primitive exposed to each eval test.
83
65
  *
84
66
  * @example
85
67
  * ```ts
86
- * const result = await run("Refund invoice inv_123", {
87
- * metadata: { expected: { status: "approved" } },
88
- * });
68
+ * const result = await run("Refund invoice inv_123");
89
69
  * ```
90
70
  */
91
- type EvalRun<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata, THarness extends Harness<TInput, TOutput, TMetadata> = Harness<TInput, TOutput, TMetadata>> = (input: TInput, options?: EvalRunOptions<TMetadata>) => Promise<EvalHarnessRun<TInput, TOutput, TMetadata, THarness>>;
71
+ type EvalRun<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, THarness extends Harness<TInput, TOutput> = Harness<TInput, TOutput>> = (input: TInput) => Promise<EvalHarnessRun<TInput, TOutput, THarness>>;
92
72
  /**
93
73
  * Fixture-backed Vitest context exposed inside `describeEval(...)` tests.
94
74
  *
@@ -103,11 +83,11 @@ type EvalRun<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue
103
83
  * });
104
84
  * ```
105
85
  */
106
- interface EvalTestContext<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata, THarness extends Harness<TInput, TOutput, TMetadata> = Harness<TInput, TOutput, TMetadata>> {
107
- run: EvalRun<TInput, TOutput, TMetadata, THarness>;
86
+ interface EvalTestContext<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, THarness extends Harness<TInput, TOutput> = Harness<TInput, TOutput>> {
87
+ run: EvalRun<TInput, TOutput, THarness>;
108
88
  }
109
89
  /** Fixture-backed Vitest test API exposed inside `describeEval(...)`. */
110
- type EvalTestAPI<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata, THarness extends Harness<TInput, TOutput, TMetadata> = Harness<TInput, TOutput, TMetadata>> = TestAPI<EvalTestContext<TInput, TOutput, TMetadata, THarness>>;
90
+ type EvalTestAPI<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, THarness extends Harness<TInput, TOutput> = Harness<TInput, TOutput>> = TestAPI<EvalTestContext<TInput, TOutput, THarness>>;
111
91
  /**
112
92
  * Suite-level configuration for a harness-backed eval block.
113
93
  *
@@ -124,11 +104,11 @@ type EvalTestAPI<TInput = unknown, TOutput extends JsonValue | undefined = JsonV
124
104
  * };
125
105
  * ```
126
106
  */
127
- interface DescribeEvalOptions<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata, THarness extends Harness<TInput, TOutput, TMetadata> = Harness<TInput, TOutput, TMetadata>> {
107
+ interface DescribeEvalOptions<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, THarness extends Harness<TInput, TOutput> = Harness<TInput, TOutput>> {
128
108
  /** Harness used for every explicit `run(...)` call in the suite. */
129
109
  harness: THarness;
130
110
  /** Automatic judges applied after each successful `run(...)`. */
131
- judges?: Array<Judge<JudgeContext<TInput, TOutput, TMetadata, THarness>>>;
111
+ judges?: Array<Judge<JudgeContext<TInput, TOutput, THarness>>>;
132
112
  /** Optional judge-side harness used only by judges that call `ctx.runJudge(...)`. */
133
113
  judgeHarness?: JudgeHarness;
134
114
  /** Passing threshold for automatic suite-level judges. `null` disables fail-on-score. */
@@ -136,26 +116,23 @@ interface DescribeEvalOptions<TInput = unknown, TOutput extends JsonValue | unde
136
116
  /** Skips the entire eval suite when the predicate returns true. */
137
117
  skipIf?: () => boolean;
138
118
  }
139
- type JudgeAssertionInput<TJudgeOptions extends JudgeContext<any, any, any, any>> = TJudgeOptions extends {
119
+ type JudgeAssertionInput<TJudgeOptions extends JudgeContext<any, any, any>> = TJudgeOptions extends {
140
120
  input: infer TInput;
141
121
  } ? TInput : unknown;
142
- type JudgeAssertionOutput<TJudgeOptions extends JudgeContext<any, any, any, any>> = TJudgeOptions extends {
122
+ type JudgeAssertionOutput<TJudgeOptions extends JudgeContext<any, any, any>> = TJudgeOptions extends {
143
123
  output: infer TOutput;
144
124
  } ? TOutput : JsonValue | undefined;
145
- type JudgeAssertionMetadata<TJudgeOptions extends JudgeContext<any, any, any, any>> = TJudgeOptions extends {
146
- metadata: infer TMetadata;
147
- } ? TMetadata : HarnessMetadata;
148
- type JudgeAssertionHarness<TJudgeOptions extends JudgeContext<any, any, any, any>> = TJudgeOptions extends {
125
+ type JudgeAssertionHarness<TJudgeOptions extends JudgeContext<any, any, any>> = TJudgeOptions extends {
149
126
  harness: infer THarness;
150
- } ? Exclude<THarness, undefined> : Harness<JudgeAssertionInput<TJudgeOptions>, JudgeAssertionOutput<TJudgeOptions>, JudgeAssertionMetadata<TJudgeOptions>>;
151
- type JudgeAssertionReservedKey = keyof JudgeContext<any, any, any, any> | "judgeHarness" | "signal" | "threshold";
152
- type JudgeAssertionParams<TJudgeOptions extends JudgeContext<any, any, any, any>> = Omit<TJudgeOptions, JudgeAssertionReservedKey>;
127
+ } ? Exclude<THarness, undefined> : Harness<JudgeAssertionInput<TJudgeOptions>, JudgeAssertionOutput<TJudgeOptions>>;
128
+ type JudgeAssertionReservedKey = keyof JudgeContext<any, any, any> | "judgeHarness" | "signal" | "threshold";
129
+ type JudgeAssertionParams<TJudgeOptions extends JudgeContext<any, any, any>> = Omit<TJudgeOptions, JudgeAssertionReservedKey>;
153
130
  type RequiredKeys<T> = {
154
131
  [K in keyof T]-?: Record<string, never> extends Pick<T, K> ? never : K;
155
132
  }[keyof T];
156
- type JudgeAssertionArgs<TJudgeOptions extends JudgeContext<any, any, any, any>> = RequiredKeys<JudgeAssertionParams<TJudgeOptions>> extends never ? [options?: JudgeAssertionOptions<TJudgeOptions>] : [options: JudgeAssertionOptions<TJudgeOptions>];
157
- type MatcherOutput<TReceived> = TReceived extends EvalHarnessRun<any, infer TOutput, any, any> ? TOutput : TReceived extends HarnessRun<infer TOutput> ? TOutput : TReceived extends NormalizedSession ? JsonValue | undefined : TReceived extends JsonValue ? TReceived : JsonValue | undefined;
158
- type JudgeForReceived<TReceived, TJudgeOptions extends JudgeContext<any, any, any, any>> = MatcherOutput<TReceived> extends JudgeAssertionOutput<TJudgeOptions> ? Judge<TJudgeOptions> : never;
133
+ type JudgeAssertionArgs<TJudgeOptions extends JudgeContext<any, any, any>> = RequiredKeys<JudgeAssertionParams<TJudgeOptions>> extends never ? [options?: JudgeAssertionOptions<TJudgeOptions>] : [options: JudgeAssertionOptions<TJudgeOptions>];
134
+ type MatcherOutput<TReceived> = TReceived extends EvalHarnessRun<any, infer TOutput, any> ? TOutput : TReceived extends HarnessRun<infer TOutput> ? TOutput : TReceived extends NormalizedSession ? JsonValue | undefined : TReceived extends JsonValue ? TReceived : JsonValue | undefined;
135
+ type JudgeForReceived<TReceived, TJudgeOptions extends JudgeContext<any, any, any>> = MatcherOutput<TReceived> extends JudgeAssertionOutput<TJudgeOptions> ? Judge<TJudgeOptions> : never;
159
136
  /**
160
137
  * Optional overrides passed to `expect(...).toSatisfyJudge(...)`.
161
138
  *
@@ -166,13 +143,11 @@ type JudgeForReceived<TReceived, TJudgeOptions extends JudgeContext<any, any, an
166
143
  * });
167
144
  * ```
168
145
  */
169
- type JudgeAssertionOptions<TJudgeOptions extends JudgeContext<any, any, any, any> = JudgeContext> = JudgeAssertionParams<TJudgeOptions> & {
146
+ type JudgeAssertionOptions<TJudgeOptions extends JudgeContext<any, any, any> = JudgeContext> = JudgeAssertionParams<TJudgeOptions> & {
170
147
  /** Override or provide the original eval input for the judge. */
171
148
  input?: JudgeAssertionInput<TJudgeOptions>;
172
149
  /** Override or provide the app-facing output for the judge. */
173
150
  output?: JudgeAssertionOutput<TJudgeOptions>;
174
- /** Override or provide per-run judge metadata. */
175
- metadata?: JudgeAssertionMetadata<TJudgeOptions>;
176
151
  /** Override or provide flattened tool calls for the judge. */
177
152
  toolCalls?: ToolCallRecord[];
178
153
  /** Override or provide the complete normalized harness run. */
@@ -187,7 +162,7 @@ type JudgeAssertionOptions<TJudgeOptions extends JudgeContext<any, any, any, any
187
162
  threshold?: number | null;
188
163
  };
189
164
  /** Function type installed as the `toSatisfyJudge(...)` matcher. */
190
- type ToSatisfyJudge<TReceived = unknown> = <TJudgeOptions extends JudgeContext<any, any, any, any> = JudgeContext>(judge: JudgeForReceived<TReceived, TJudgeOptions>, ...args: JudgeAssertionArgs<TJudgeOptions>) => Promise<TReceived>;
165
+ type ToSatisfyJudge<TReceived = unknown> = <TJudgeOptions extends JudgeContext<any, any, any> = JudgeContext>(judge: JudgeForReceived<TReceived, TJudgeOptions>, ...args: JudgeAssertionArgs<TJudgeOptions>) => Promise<TReceived>;
191
166
  /**
192
167
  * Vitest matcher extension surface added by `vitest-evals`.
193
168
  *
@@ -241,22 +216,19 @@ declare module "vitest" {
241
216
  * judges: [ToolCallJudge()],
242
217
  * }, (it) => {
243
218
  * it("approves a refundable invoice", async ({ run }) => {
244
- * const result = await run("Refund invoice inv_123", {
245
- * metadata: {
246
- * expected: "Invoice inv_123 should be refunded.",
247
- * },
248
- * });
219
+ * const result = await run("Refund invoice inv_123");
249
220
  *
250
221
  * expect(result.output).toMatchObject({ status: "approved" });
251
222
  * expect(toolCalls(result.session)).toHaveLength(2);
252
223
  * await expect(result).toSatisfyJudge(FactualityJudge(), {
224
+ * expected: "Invoice inv_123 should be refunded.",
253
225
  * threshold: 0.6,
254
226
  * });
255
227
  * });
256
228
  * });
257
229
  * ```
258
230
  */
259
- declare function describeEval<THarness extends Harness<any, any, any>>(name: string, options: DescribeEvalOptions<HarnessInput<THarness>, HarnessOutput<THarness>, HarnessMetadataFor<THarness>, THarness>, define: (it: EvalTestAPI<HarnessInput<THarness>, HarnessOutput<THarness>, HarnessMetadataFor<THarness>, THarness>) => void): vitest.SuiteCollector<object>;
231
+ declare function describeEval<THarness extends Harness<any, any>>(name: string, options: DescribeEvalOptions<HarnessInput<THarness>, HarnessOutput<THarness>, THarness>, define: (it: EvalTestAPI<HarnessInput<THarness>, HarnessOutput<THarness>, THarness>) => void): vitest.SuiteCollector<object>;
260
232
  /**
261
233
  * Formats judge results for reporter and assertion output.
262
234
  *
@@ -284,17 +256,20 @@ declare function formatScores(scores: (JudgeResult & {
284
256
  *
285
257
  * @example
286
258
  * ```ts
287
- * import { createJudge, type JudgeContext } from "vitest-evals";
259
+ * import { createJudge } from "vitest-evals";
288
260
  *
289
261
  * type RefundOutput = { status: "approved" | "denied" };
290
- * type RefundMetadata = { expected: { status: RefundOutput["status"] } };
291
262
  *
292
- * export const RefundStatusJudge = createJudge(
263
+ * export const RefundStatusJudge = createJudge<
264
+ * string,
265
+ * RefundOutput,
266
+ * { expectedStatus: RefundOutput["status"] }
267
+ * >(
293
268
  * "RefundStatusJudge",
294
- * async ({ output, metadata }: JudgeContext<string, RefundOutput, RefundMetadata>) => ({
295
- * score: output.status === metadata.expected.status ? 1 : 0,
269
+ * async ({ output, expectedStatus }) => ({
270
+ * score: output.status === expectedStatus ? 1 : 0,
296
271
  * metadata: {
297
- * rationale: `Expected ${metadata.expected.status}, got ${output.status}`,
272
+ * rationale: `Expected ${expectedStatus}, got ${output.status}`,
298
273
  * },
299
274
  * }),
300
275
  * );
@@ -303,12 +278,14 @@ declare function formatScores(scores: (JudgeResult & {
303
278
  * For LLM-backed judges, prefer the object form with `ctx.runJudge(...)` so
304
279
  * provider-specific model configuration stays in the judge harness.
305
280
  */
306
- declare function createJudge<TOptions extends JudgeContext<any, any, any, any>>(name: string, assess: JudgeAssessFn<TOptions>): Judge<TOptions>;
307
- declare function createJudge<TOptions extends JudgeContext<any, any, any, any>>(config: CreateJudgeConfig<TOptions>): Judge<TOptions>;
281
+ declare function createJudge<TOptions extends JudgeContext<any, any, any>>(name: string, assess: JudgeAssessFn<TOptions>): Judge<TOptions>;
282
+ declare function createJudge<TOptions extends JudgeContext<any, any, any>>(config: CreateJudgeConfig<TOptions>): Judge<TOptions>;
283
+ declare function createJudge<TInput, TOutput extends JsonValue | undefined, TOptions extends object = Record<never, never>, THarness extends Harness<TInput, TOutput> | undefined = Harness<TInput, TOutput> | undefined>(name: string, assess: JudgeAssessFn<CreateJudgeContext<TInput, TOutput, TOptions, THarness>>): Judge<CreateJudgeContext<TInput, TOutput, TOptions, THarness>>;
284
+ declare function createJudge<TInput, TOutput extends JsonValue | undefined, TOptions extends object = Record<never, never>, THarness extends Harness<TInput, TOutput> | undefined = Harness<TInput, TOutput> | undefined>(config: CreateJudgeConfig<CreateJudgeContext<TInput, TOutput, TOptions, THarness>>): Judge<CreateJudgeContext<TInput, TOutput, TOptions, THarness>>;
308
285
  /**
309
286
  * @deprecated Prefer `createJudge({ name, judgeHarness, assess })` and call
310
287
  * `ctx.runJudge(...)` from LLM-backed judges.
311
288
  */
312
- declare function createJudge<TOptions extends JudgeContext<any, any, any, any>, TInput, TOutput>(name: string, assessor: JudgeAssessor<TInput, TOutput>, assess: JudgeAssessWithAssessorFn<TOptions, TInput, TOutput>): Judge<TOptions>;
289
+ declare function createJudge<TOptions extends JudgeContext<any, any, any>, TInput, TOutput>(name: string, assessor: JudgeAssessor<TInput, TOutput>, assess: JudgeAssessWithAssessorFn<TOptions, TInput, TOutput>): Judge<TOptions>;
313
290
 
314
- export { type DescribeEvalOptions, type EvalHarnessRun, type EvalMatchers, type EvalRun, type EvalRunOptions, type EvalTestAPI, type EvalTestContext, Harness, HarnessMetadata, Judge, type JudgeAssertionOptions, JudgeAssessFn, JudgeAssessWithAssessorFn, JudgeAssessor, JudgeContext, JudgeHarness, JudgeResult, type ToSatisfyJudge, createJudge, describeEval, formatScores };
291
+ export { type DescribeEvalOptions, type EvalHarnessRun, type EvalMatchers, type EvalRun, type EvalTestAPI, type EvalTestContext, Harness, Judge, type JudgeAssertionOptions, JudgeAssessFn, JudgeAssessWithAssessorFn, JudgeAssessor, JudgeContext, JudgeHarness, JudgeOptions, JudgeResult, type ToSatisfyJudge, createJudge, describeEval, formatScores };
package/dist/index.js CHANGED
@@ -131,7 +131,6 @@ function createHarness(options) {
131
131
  try {
132
132
  const result = await options.run({
133
133
  input,
134
- metadata: context.metadata,
135
134
  signal: context.signal,
136
135
  artifacts: context.artifacts,
137
136
  setArtifact: context.setArtifact
@@ -522,17 +521,14 @@ function serializeError(error) {
522
521
  function createJudgeHarness(options) {
523
522
  return createHarness({
524
523
  name: options.name ?? "judge-harness",
525
- run: async ({ input, signal, metadata }) => {
526
- return normalizeJudgeHarnessResult(
527
- await options.run(input, { signal, metadata })
528
- );
524
+ run: async ({ input, signal }) => {
525
+ return normalizeJudgeHarnessResult(await options.run(input, { signal }));
529
526
  }
530
527
  });
531
528
  }
532
529
  async function runJudgeHarness(judgeHarness, input, options = {}) {
533
530
  const artifacts = {};
534
531
  const run = await judgeHarness.run(input, {
535
- metadata: options.metadata ?? {},
536
532
  signal: options.signal,
537
533
  artifacts,
538
534
  setArtifact: (name, value) => {
@@ -546,8 +542,7 @@ function createRunJudge(judgeHarness, signal) {
546
542
  return void 0;
547
543
  }
548
544
  return (input, options) => runJudgeHarness(judgeHarness, input, {
549
- metadata: options?.metadata,
550
- signal
545
+ signal: options?.signal ?? signal
551
546
  });
552
547
  }
553
548
  function normalizeJudgeHarnessResult(result) {
@@ -625,22 +620,24 @@ function FactualityJudge(config = {}) {
625
620
  return {
626
621
  name: config.name ?? "FactualityJudge",
627
622
  judgeHarness,
628
- assess: (opts) => assessFactuality(opts, judgeHarness)
623
+ assess: (opts) => assessFactuality(opts, {
624
+ expected: config.expected,
625
+ judgeHarness
626
+ })
629
627
  };
630
628
  }
631
- async function assessFactuality(opts, configuredJudgeHarness) {
632
- const metadata = opts.metadata;
633
- const expected = opts.expected === void 0 ? metadata.expected : opts.expected;
629
+ async function assessFactuality(opts, config) {
630
+ const expected = opts.expected ?? config.expected;
634
631
  if (isMissingExpectedAnswer(expected)) {
635
632
  return {
636
633
  score: 0,
637
634
  metadata: {
638
- rationale: "FactualityJudge requires a non-empty expert answer in `expected` or `metadata.expected`."
635
+ rationale: "FactualityJudge requires a non-empty expert answer in `expected` or FactualityJudge(...) config."
639
636
  }
640
637
  };
641
638
  }
642
639
  const runJudge = opts.runJudge ?? createRunJudge(
643
- configuredJudgeHarness,
640
+ config.judgeHarness,
644
641
  opts.signal
645
642
  );
646
643
  if (!runJudge) {
@@ -1048,15 +1045,15 @@ function StructuredOutputScorer(config = {}) {
1048
1045
 
1049
1046
  // src/judges/structuredOutputJudge.ts
1050
1047
  function StructuredOutputJudge(config = {}) {
1051
- const scorer = StructuredOutputScorer(config);
1048
+ const { expected, ...scorerConfig } = config;
1049
+ const scorer = StructuredOutputScorer(scorerConfig);
1052
1050
  return {
1053
1051
  name: "StructuredOutputJudge",
1054
1052
  assess: (opts) => {
1055
- const metadata = opts.metadata;
1056
1053
  return scorer({
1057
1054
  ...opts,
1058
1055
  input: formatStructuredOutput(opts.input),
1059
- expected: opts.expected ?? metadata.expected,
1056
+ expected: opts.expected ?? expected,
1060
1057
  output: formatStructuredOutput(opts.output)
1061
1058
  });
1062
1059
  }
@@ -1266,17 +1263,17 @@ function evaluateUnorderedTools(expected, actual, options) {
1266
1263
 
1267
1264
  // src/judges/toolCallJudge.ts
1268
1265
  function ToolCallJudge(config = {}) {
1269
- const scorer = ToolCallScorer(config);
1266
+ const { expectedTools, ...scorerConfig } = config;
1267
+ const scorer = ToolCallScorer(scorerConfig);
1270
1268
  return {
1271
1269
  name: "ToolCallJudge",
1272
1270
  assess: (opts) => {
1273
- const metadata = opts.metadata;
1274
1271
  return scorer({
1275
1272
  ...opts,
1276
1273
  input: formatJudgeValue2(opts.input),
1277
1274
  output: formatJudgeValue2(opts.output),
1278
1275
  expectedTools: normalizeExpectedTools(
1279
- opts.expectedTools ?? metadata.expectedTools
1276
+ opts.expectedTools ?? expectedTools
1280
1277
  )
1281
1278
  });
1282
1279
  }
@@ -1307,10 +1304,7 @@ var evalTest = import_vitest.test.extend("harness", async () => {
1307
1304
  throw new Error(
1308
1305
  "describeEval must override the harness fixture before running tests."
1309
1306
  );
1310
- }).extend(
1311
- "automaticJudges",
1312
- []
1313
- ).extend("judgeThreshold", void 0).extend("judgeHarness", void 0).extend("explicitJudgeHarness", void 0).extend(
1307
+ }).extend("automaticJudges", []).extend("judgeThreshold", void 0).extend("judgeHarness", void 0).extend("explicitJudgeHarness", void 0).extend(
1314
1308
  "run",
1315
1309
  async ({
1316
1310
  automaticJudges,
@@ -1321,12 +1315,10 @@ var evalTest = import_vitest.test.extend("harness", async () => {
1321
1315
  signal,
1322
1316
  task
1323
1317
  }) => {
1324
- return async (input, options) => {
1318
+ return async (input) => {
1325
1319
  const resolvedHarness = harness;
1326
- const metadata = createMetadata(options?.metadata);
1327
1320
  const artifacts = {};
1328
1321
  const context = {
1329
- metadata,
1330
1322
  signal,
1331
1323
  artifacts,
1332
1324
  setArtifact: (artifactName, value) => {
@@ -1356,7 +1348,6 @@ var evalTest = import_vitest.test.extend("harness", async () => {
1356
1348
  resolvedHarness,
1357
1349
  input,
1358
1350
  explicitJudgeHarness,
1359
- metadata,
1360
1351
  signal
1361
1352
  );
1362
1353
  }
@@ -1375,7 +1366,6 @@ var evalTest = import_vitest.test.extend("harness", async () => {
1375
1366
  resolvedHarness,
1376
1367
  input,
1377
1368
  explicitJudgeHarness,
1378
- metadata,
1379
1369
  signal
1380
1370
  );
1381
1371
  }
@@ -1395,7 +1385,6 @@ var evalTest = import_vitest.test.extend("harness", async () => {
1395
1385
  resolvedHarness,
1396
1386
  input,
1397
1387
  explicitJudgeHarness,
1398
- metadata,
1399
1388
  signal
1400
1389
  );
1401
1390
  if (automaticJudges.length > 0) {
@@ -1406,7 +1395,6 @@ var evalTest = import_vitest.test.extend("harness", async () => {
1406
1395
  resolvedHarness,
1407
1396
  input,
1408
1397
  judgeHarness,
1409
- metadata,
1410
1398
  run,
1411
1399
  signal
1412
1400
  );
@@ -1473,10 +1461,7 @@ function describeEval(name, options, define) {
1473
1461
  define(it);
1474
1462
  });
1475
1463
  }
1476
- function createMetadata(metadata) {
1477
- return { ...metadata ?? {} };
1478
- }
1479
- async function applyAutomaticJudges(task, judges, threshold, harness, input, judgeHarness, metadata, run, signal) {
1464
+ async function applyAutomaticJudges(task, judges, threshold, harness, input, judgeHarness, run, signal) {
1480
1465
  const runToolCalls = (0, import_core2.toolCalls)(run.session);
1481
1466
  const scores = await Promise.all(
1482
1467
  judges.map((judge) => {
@@ -1488,7 +1473,6 @@ async function applyAutomaticJudges(task, judges, threshold, harness, input, jud
1488
1473
  input,
1489
1474
  output: run.output,
1490
1475
  toolCalls: runToolCalls,
1491
- metadata,
1492
1476
  run,
1493
1477
  session: run.session,
1494
1478
  signal,
@@ -1533,12 +1517,11 @@ function setHarnessMeta(task, name, run) {
1533
1517
  run
1534
1518
  };
1535
1519
  }
1536
- function recordJudgeRunContext(run, harness, input, judgeHarness, metadata, signal) {
1520
+ function recordJudgeRunContext(run, harness, input, judgeHarness, signal) {
1537
1521
  const context = {
1538
1522
  harness,
1539
1523
  input,
1540
1524
  judgeHarness,
1541
- metadata,
1542
1525
  run,
1543
1526
  signal
1544
1527
  };
@@ -1608,7 +1591,6 @@ function buildJudgeAssertionOptions(received, judge, options, task) {
1608
1591
  const judgeHarness = options.judgeHarness ?? resolveJudgeHarnessForJudge(judge, registeredContext?.judgeHarness);
1609
1592
  const runJudge = createRunJudge(judgeHarness, registeredContext?.signal);
1610
1593
  const signal = registeredContext?.signal;
1611
- const metadata = options.metadata ?? registeredContext?.metadata ?? {};
1612
1594
  const input = options.input ?? registeredContext?.input ?? void 0;
1613
1595
  const contextualOptions = {
1614
1596
  ...options,
@@ -1631,7 +1613,6 @@ function buildJudgeAssertionOptions(received, judge, options, task) {
1631
1613
  ...judgeParams,
1632
1614
  input: resolvedInput,
1633
1615
  output,
1634
- metadata,
1635
1616
  run,
1636
1617
  session: options.session ?? run.session,
1637
1618
  signal,