vitest-evals 0.12.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -35
- package/dist/harness.d.mts +15 -20
- package/dist/harness.d.ts +15 -20
- package/dist/harness.js +0 -1
- package/dist/harness.js.map +1 -1
- package/dist/harness.mjs +0 -1
- package/dist/harness.mjs.map +1 -1
- package/dist/index.d.mts +45 -68
- package/dist/index.d.ts +45 -68
- package/dist/index.js +21 -40
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +21 -40
- package/dist/index.mjs.map +1 -1
- package/dist/internal/toolCallScorer.js.map +1 -1
- package/dist/internal/toolCallScorer.mjs.map +1 -1
- package/dist/judges/factualityJudge.d.mts +14 -13
- package/dist/judges/factualityJudge.d.ts +14 -13
- package/dist/judges/factualityJudge.js +9 -9
- package/dist/judges/factualityJudge.js.map +1 -1
- package/dist/judges/factualityJudge.mjs +9 -9
- package/dist/judges/factualityJudge.mjs.map +1 -1
- package/dist/judges/index.js +17 -20
- package/dist/judges/index.js.map +1 -1
- package/dist/judges/index.mjs +17 -20
- package/dist/judges/index.mjs.map +1 -1
- package/dist/judges/judgeHarness.d.mts +6 -10
- package/dist/judges/judgeHarness.d.ts +6 -10
- package/dist/judges/judgeHarness.js +3 -8
- package/dist/judges/judgeHarness.js.map +1 -1
- package/dist/judges/judgeHarness.mjs +3 -8
- package/dist/judges/judgeHarness.mjs.map +1 -1
- package/dist/judges/structuredOutputJudge.d.mts +7 -9
- package/dist/judges/structuredOutputJudge.d.ts +7 -9
- package/dist/judges/structuredOutputJudge.js +3 -3
- package/dist/judges/structuredOutputJudge.js.map +1 -1
- package/dist/judges/structuredOutputJudge.mjs +3 -3
- package/dist/judges/structuredOutputJudge.mjs.map +1 -1
- package/dist/judges/toolCallJudge.d.mts +12 -9
- package/dist/judges/toolCallJudge.d.ts +12 -9
- package/dist/judges/toolCallJudge.js +3 -3
- package/dist/judges/toolCallJudge.js.map +1 -1
- package/dist/judges/toolCallJudge.mjs +3 -3
- package/dist/judges/toolCallJudge.mjs.map +1 -1
- package/dist/judges/types.d.mts +13 -24
- package/dist/judges/types.d.ts +13 -24
- package/dist/judges/types.js.map +1 -1
- package/dist/legacy/scorers/index.js.map +1 -1
- package/dist/legacy/scorers/index.mjs.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
- package/dist/legacy.js.map +1 -1
- package/dist/legacy.mjs.map +1 -1
- package/dist/reporter.js.map +1 -1
- package/dist/reporter.mjs.map +1 -1
- package/package.json +3 -3
package/dist/index.d.ts
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import * as vitest from 'vitest';
|
|
2
2
|
import { TestAPI } from 'vitest';
|
|
3
|
-
import {
|
|
4
|
-
export { CreateHarnessOptions, CreateHarnessRunArgs, CreateToolCallSpansOptions, EnsureRunTraceOptions, HarnessContext, HarnessResultLike, MaybePromise, SimpleHarnessResult, SimpleSpanEvent, SimpleSpanRecord, SimpleToolCallRecord, SimpleTraceRecord, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, createToolCallSpans, ensureRunTrace, getHarnessRunFromError, normalizeHarnessRun, normalizeSpanAttributes, normalizeSpanError, toJsonValue } from './harness.js';
|
|
5
|
-
import { JudgeContext, Judge, JudgeResult, JudgeAssessFn, JudgeAssessor, JudgeAssessWithAssessorFn } from './judges/types.js';
|
|
6
|
-
export { BoundJudgeAssessor, JudgeAssessorOptions
|
|
3
|
+
import { Harness } from './harness.js';
|
|
4
|
+
export { CreateHarnessOptions, CreateHarnessRunArgs, CreateToolCallSpansOptions, EnsureRunTraceOptions, HarnessContext, HarnessMetadata, HarnessResultLike, MaybePromise, SimpleHarnessResult, SimpleSpanEvent, SimpleSpanRecord, SimpleToolCallRecord, SimpleTraceRecord, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, createToolCallSpans, ensureRunTrace, getHarnessRunFromError, normalizeHarnessRun, normalizeSpanAttributes, normalizeSpanError, toJsonValue } from './harness.js';
|
|
5
|
+
import { JudgeContext, Judge, JudgeResult, JudgeAssessFn, JudgeOptions, JudgeAssessor, JudgeAssessWithAssessorFn } from './judges/types.js';
|
|
6
|
+
export { BoundJudgeAssessor, JudgeAssessorOptions } from './judges/types.js';
|
|
7
7
|
import { JudgeHarness } from './judges/judgeHarness.js';
|
|
8
8
|
export { CreateJudgeHarnessOptions, CreateJudgeHarnessRunOptions, JudgeHarnessInput, JudgeHarnessOutput, RunJudge, RunJudgeOptions, createJudgeHarness, runJudgeHarness } from './judges/judgeHarness.js';
|
|
9
9
|
export { wrapText } from './wrapText.js';
|
|
@@ -32,14 +32,14 @@ type EvalTaskMeta = {
|
|
|
32
32
|
run: HarnessRun;
|
|
33
33
|
};
|
|
34
34
|
};
|
|
35
|
-
type HarnessInput<THarness extends Harness<any, any
|
|
36
|
-
type
|
|
37
|
-
type
|
|
38
|
-
type CreateJudgeConfig<TOptions extends JudgeContext<any, any, any, any> = JudgeContext> = {
|
|
35
|
+
type HarnessInput<THarness extends Harness<any, any>> = THarness extends Harness<infer TInput, any> ? TInput : unknown;
|
|
36
|
+
type HarnessOutput<THarness extends Harness<any, any>> = THarness extends Harness<any, infer TOutput> ? TOutput : JsonValue | undefined;
|
|
37
|
+
type CreateJudgeConfig<TOptions extends JudgeContext<any, any, any> = JudgeContext> = {
|
|
39
38
|
name: string;
|
|
40
39
|
judgeHarness?: JudgeHarness;
|
|
41
40
|
assess: JudgeAssessFn<TOptions>;
|
|
42
41
|
};
|
|
42
|
+
type CreateJudgeContext<TInput, TOutput extends JsonValue | undefined, TOptions extends object, THarness extends Harness<TInput, TOutput> | undefined = Harness<TInput, TOutput> | undefined> = JudgeOptions<TInput, TOutput, TOptions, THarness>;
|
|
43
43
|
declare const evalHarnessRunBrand: unique symbol;
|
|
44
44
|
/**
|
|
45
45
|
* Harness run returned by the fixture-backed `run(...)` API.
|
|
@@ -53,42 +53,22 @@ declare const evalHarnessRunBrand: unique symbol;
|
|
|
53
53
|
* });
|
|
54
54
|
* ```
|
|
55
55
|
*/
|
|
56
|
-
type EvalHarnessRun<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined,
|
|
56
|
+
type EvalHarnessRun<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, THarness extends Harness<TInput, TOutput> = Harness<TInput, TOutput>> = HarnessRun<TOutput> & {
|
|
57
57
|
readonly [evalHarnessRunBrand]: {
|
|
58
58
|
readonly input: TInput;
|
|
59
|
-
readonly metadata: TMetadata;
|
|
60
59
|
readonly output: TOutput;
|
|
61
60
|
readonly harness: THarness;
|
|
62
61
|
};
|
|
63
62
|
};
|
|
64
|
-
/**
|
|
65
|
-
* Per-run metadata forwarded to the harness alongside the test input.
|
|
66
|
-
*
|
|
67
|
-
* @example
|
|
68
|
-
* ```ts
|
|
69
|
-
* await run("Refund invoice inv_123", {
|
|
70
|
-
* metadata: {
|
|
71
|
-
* expected: { status: "approved" },
|
|
72
|
-
* expectedTools: ["lookupInvoice", "createRefund"],
|
|
73
|
-
* },
|
|
74
|
-
* });
|
|
75
|
-
* ```
|
|
76
|
-
*/
|
|
77
|
-
interface EvalRunOptions<TMetadata extends HarnessMetadata = HarnessMetadata> {
|
|
78
|
-
/** Per-run expectations or configuration forwarded to harnesses and judges. */
|
|
79
|
-
metadata?: TMetadata;
|
|
80
|
-
}
|
|
81
63
|
/**
|
|
82
64
|
* Explicit harness execution primitive exposed to each eval test.
|
|
83
65
|
*
|
|
84
66
|
* @example
|
|
85
67
|
* ```ts
|
|
86
|
-
* const result = await run("Refund invoice inv_123"
|
|
87
|
-
* metadata: { expected: { status: "approved" } },
|
|
88
|
-
* });
|
|
68
|
+
* const result = await run("Refund invoice inv_123");
|
|
89
69
|
* ```
|
|
90
70
|
*/
|
|
91
|
-
type EvalRun<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined,
|
|
71
|
+
type EvalRun<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, THarness extends Harness<TInput, TOutput> = Harness<TInput, TOutput>> = (input: TInput) => Promise<EvalHarnessRun<TInput, TOutput, THarness>>;
|
|
92
72
|
/**
|
|
93
73
|
* Fixture-backed Vitest context exposed inside `describeEval(...)` tests.
|
|
94
74
|
*
|
|
@@ -103,11 +83,11 @@ type EvalRun<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue
|
|
|
103
83
|
* });
|
|
104
84
|
* ```
|
|
105
85
|
*/
|
|
106
|
-
interface EvalTestContext<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined,
|
|
107
|
-
run: EvalRun<TInput, TOutput,
|
|
86
|
+
interface EvalTestContext<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, THarness extends Harness<TInput, TOutput> = Harness<TInput, TOutput>> {
|
|
87
|
+
run: EvalRun<TInput, TOutput, THarness>;
|
|
108
88
|
}
|
|
109
89
|
/** Fixture-backed Vitest test API exposed inside `describeEval(...)`. */
|
|
110
|
-
type EvalTestAPI<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined,
|
|
90
|
+
type EvalTestAPI<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, THarness extends Harness<TInput, TOutput> = Harness<TInput, TOutput>> = TestAPI<EvalTestContext<TInput, TOutput, THarness>>;
|
|
111
91
|
/**
|
|
112
92
|
* Suite-level configuration for a harness-backed eval block.
|
|
113
93
|
*
|
|
@@ -124,11 +104,11 @@ type EvalTestAPI<TInput = unknown, TOutput extends JsonValue | undefined = JsonV
|
|
|
124
104
|
* };
|
|
125
105
|
* ```
|
|
126
106
|
*/
|
|
127
|
-
interface DescribeEvalOptions<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined,
|
|
107
|
+
interface DescribeEvalOptions<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, THarness extends Harness<TInput, TOutput> = Harness<TInput, TOutput>> {
|
|
128
108
|
/** Harness used for every explicit `run(...)` call in the suite. */
|
|
129
109
|
harness: THarness;
|
|
130
110
|
/** Automatic judges applied after each successful `run(...)`. */
|
|
131
|
-
judges?: Array<Judge<JudgeContext<TInput, TOutput,
|
|
111
|
+
judges?: Array<Judge<JudgeContext<TInput, TOutput, THarness>>>;
|
|
132
112
|
/** Optional judge-side harness used only by judges that call `ctx.runJudge(...)`. */
|
|
133
113
|
judgeHarness?: JudgeHarness;
|
|
134
114
|
/** Passing threshold for automatic suite-level judges. `null` disables fail-on-score. */
|
|
@@ -136,26 +116,23 @@ interface DescribeEvalOptions<TInput = unknown, TOutput extends JsonValue | unde
|
|
|
136
116
|
/** Skips the entire eval suite when the predicate returns true. */
|
|
137
117
|
skipIf?: () => boolean;
|
|
138
118
|
}
|
|
139
|
-
type JudgeAssertionInput<TJudgeOptions extends JudgeContext<any, any, any
|
|
119
|
+
type JudgeAssertionInput<TJudgeOptions extends JudgeContext<any, any, any>> = TJudgeOptions extends {
|
|
140
120
|
input: infer TInput;
|
|
141
121
|
} ? TInput : unknown;
|
|
142
|
-
type JudgeAssertionOutput<TJudgeOptions extends JudgeContext<any, any, any
|
|
122
|
+
type JudgeAssertionOutput<TJudgeOptions extends JudgeContext<any, any, any>> = TJudgeOptions extends {
|
|
143
123
|
output: infer TOutput;
|
|
144
124
|
} ? TOutput : JsonValue | undefined;
|
|
145
|
-
type
|
|
146
|
-
metadata: infer TMetadata;
|
|
147
|
-
} ? TMetadata : HarnessMetadata;
|
|
148
|
-
type JudgeAssertionHarness<TJudgeOptions extends JudgeContext<any, any, any, any>> = TJudgeOptions extends {
|
|
125
|
+
type JudgeAssertionHarness<TJudgeOptions extends JudgeContext<any, any, any>> = TJudgeOptions extends {
|
|
149
126
|
harness: infer THarness;
|
|
150
|
-
} ? Exclude<THarness, undefined> : Harness<JudgeAssertionInput<TJudgeOptions>, JudgeAssertionOutput<TJudgeOptions
|
|
151
|
-
type JudgeAssertionReservedKey = keyof JudgeContext<any, any, any
|
|
152
|
-
type JudgeAssertionParams<TJudgeOptions extends JudgeContext<any, any, any
|
|
127
|
+
} ? Exclude<THarness, undefined> : Harness<JudgeAssertionInput<TJudgeOptions>, JudgeAssertionOutput<TJudgeOptions>>;
|
|
128
|
+
type JudgeAssertionReservedKey = keyof JudgeContext<any, any, any> | "judgeHarness" | "signal" | "threshold";
|
|
129
|
+
type JudgeAssertionParams<TJudgeOptions extends JudgeContext<any, any, any>> = Omit<TJudgeOptions, JudgeAssertionReservedKey>;
|
|
153
130
|
type RequiredKeys<T> = {
|
|
154
131
|
[K in keyof T]-?: Record<string, never> extends Pick<T, K> ? never : K;
|
|
155
132
|
}[keyof T];
|
|
156
|
-
type JudgeAssertionArgs<TJudgeOptions extends JudgeContext<any, any, any
|
|
157
|
-
type MatcherOutput<TReceived> = TReceived extends EvalHarnessRun<any, infer TOutput, any
|
|
158
|
-
type JudgeForReceived<TReceived, TJudgeOptions extends JudgeContext<any, any, any
|
|
133
|
+
type JudgeAssertionArgs<TJudgeOptions extends JudgeContext<any, any, any>> = RequiredKeys<JudgeAssertionParams<TJudgeOptions>> extends never ? [options?: JudgeAssertionOptions<TJudgeOptions>] : [options: JudgeAssertionOptions<TJudgeOptions>];
|
|
134
|
+
type MatcherOutput<TReceived> = TReceived extends EvalHarnessRun<any, infer TOutput, any> ? TOutput : TReceived extends HarnessRun<infer TOutput> ? TOutput : TReceived extends NormalizedSession ? JsonValue | undefined : TReceived extends JsonValue ? TReceived : JsonValue | undefined;
|
|
135
|
+
type JudgeForReceived<TReceived, TJudgeOptions extends JudgeContext<any, any, any>> = MatcherOutput<TReceived> extends JudgeAssertionOutput<TJudgeOptions> ? Judge<TJudgeOptions> : never;
|
|
159
136
|
/**
|
|
160
137
|
* Optional overrides passed to `expect(...).toSatisfyJudge(...)`.
|
|
161
138
|
*
|
|
@@ -166,13 +143,11 @@ type JudgeForReceived<TReceived, TJudgeOptions extends JudgeContext<any, any, an
|
|
|
166
143
|
* });
|
|
167
144
|
* ```
|
|
168
145
|
*/
|
|
169
|
-
type JudgeAssertionOptions<TJudgeOptions extends JudgeContext<any, any, any
|
|
146
|
+
type JudgeAssertionOptions<TJudgeOptions extends JudgeContext<any, any, any> = JudgeContext> = JudgeAssertionParams<TJudgeOptions> & {
|
|
170
147
|
/** Override or provide the original eval input for the judge. */
|
|
171
148
|
input?: JudgeAssertionInput<TJudgeOptions>;
|
|
172
149
|
/** Override or provide the app-facing output for the judge. */
|
|
173
150
|
output?: JudgeAssertionOutput<TJudgeOptions>;
|
|
174
|
-
/** Override or provide per-run judge metadata. */
|
|
175
|
-
metadata?: JudgeAssertionMetadata<TJudgeOptions>;
|
|
176
151
|
/** Override or provide flattened tool calls for the judge. */
|
|
177
152
|
toolCalls?: ToolCallRecord[];
|
|
178
153
|
/** Override or provide the complete normalized harness run. */
|
|
@@ -187,7 +162,7 @@ type JudgeAssertionOptions<TJudgeOptions extends JudgeContext<any, any, any, any
|
|
|
187
162
|
threshold?: number | null;
|
|
188
163
|
};
|
|
189
164
|
/** Function type installed as the `toSatisfyJudge(...)` matcher. */
|
|
190
|
-
type ToSatisfyJudge<TReceived = unknown> = <TJudgeOptions extends JudgeContext<any, any, any
|
|
165
|
+
type ToSatisfyJudge<TReceived = unknown> = <TJudgeOptions extends JudgeContext<any, any, any> = JudgeContext>(judge: JudgeForReceived<TReceived, TJudgeOptions>, ...args: JudgeAssertionArgs<TJudgeOptions>) => Promise<TReceived>;
|
|
191
166
|
/**
|
|
192
167
|
* Vitest matcher extension surface added by `vitest-evals`.
|
|
193
168
|
*
|
|
@@ -241,22 +216,19 @@ declare module "vitest" {
|
|
|
241
216
|
* judges: [ToolCallJudge()],
|
|
242
217
|
* }, (it) => {
|
|
243
218
|
* it("approves a refundable invoice", async ({ run }) => {
|
|
244
|
-
* const result = await run("Refund invoice inv_123"
|
|
245
|
-
* metadata: {
|
|
246
|
-
* expected: "Invoice inv_123 should be refunded.",
|
|
247
|
-
* },
|
|
248
|
-
* });
|
|
219
|
+
* const result = await run("Refund invoice inv_123");
|
|
249
220
|
*
|
|
250
221
|
* expect(result.output).toMatchObject({ status: "approved" });
|
|
251
222
|
* expect(toolCalls(result.session)).toHaveLength(2);
|
|
252
223
|
* await expect(result).toSatisfyJudge(FactualityJudge(), {
|
|
224
|
+
* expected: "Invoice inv_123 should be refunded.",
|
|
253
225
|
* threshold: 0.6,
|
|
254
226
|
* });
|
|
255
227
|
* });
|
|
256
228
|
* });
|
|
257
229
|
* ```
|
|
258
230
|
*/
|
|
259
|
-
declare function describeEval<THarness extends Harness<any, any
|
|
231
|
+
declare function describeEval<THarness extends Harness<any, any>>(name: string, options: DescribeEvalOptions<HarnessInput<THarness>, HarnessOutput<THarness>, THarness>, define: (it: EvalTestAPI<HarnessInput<THarness>, HarnessOutput<THarness>, THarness>) => void): vitest.SuiteCollector<object>;
|
|
260
232
|
/**
|
|
261
233
|
* Formats judge results for reporter and assertion output.
|
|
262
234
|
*
|
|
@@ -284,17 +256,20 @@ declare function formatScores(scores: (JudgeResult & {
|
|
|
284
256
|
*
|
|
285
257
|
* @example
|
|
286
258
|
* ```ts
|
|
287
|
-
* import { createJudge
|
|
259
|
+
* import { createJudge } from "vitest-evals";
|
|
288
260
|
*
|
|
289
261
|
* type RefundOutput = { status: "approved" | "denied" };
|
|
290
|
-
* type RefundMetadata = { expected: { status: RefundOutput["status"] } };
|
|
291
262
|
*
|
|
292
|
-
* export const RefundStatusJudge = createJudge
|
|
263
|
+
* export const RefundStatusJudge = createJudge<
|
|
264
|
+
* string,
|
|
265
|
+
* RefundOutput,
|
|
266
|
+
* { expectedStatus: RefundOutput["status"] }
|
|
267
|
+
* >(
|
|
293
268
|
* "RefundStatusJudge",
|
|
294
|
-
* async ({ output,
|
|
295
|
-
* score: output.status ===
|
|
269
|
+
* async ({ output, expectedStatus }) => ({
|
|
270
|
+
* score: output.status === expectedStatus ? 1 : 0,
|
|
296
271
|
* metadata: {
|
|
297
|
-
* rationale: `Expected ${
|
|
272
|
+
* rationale: `Expected ${expectedStatus}, got ${output.status}`,
|
|
298
273
|
* },
|
|
299
274
|
* }),
|
|
300
275
|
* );
|
|
@@ -303,12 +278,14 @@ declare function formatScores(scores: (JudgeResult & {
|
|
|
303
278
|
* For LLM-backed judges, prefer the object form with `ctx.runJudge(...)` so
|
|
304
279
|
* provider-specific model configuration stays in the judge harness.
|
|
305
280
|
*/
|
|
306
|
-
declare function createJudge<TOptions extends JudgeContext<any, any, any
|
|
307
|
-
declare function createJudge<TOptions extends JudgeContext<any, any, any
|
|
281
|
+
declare function createJudge<TOptions extends JudgeContext<any, any, any>>(name: string, assess: JudgeAssessFn<TOptions>): Judge<TOptions>;
|
|
282
|
+
declare function createJudge<TOptions extends JudgeContext<any, any, any>>(config: CreateJudgeConfig<TOptions>): Judge<TOptions>;
|
|
283
|
+
declare function createJudge<TInput, TOutput extends JsonValue | undefined, TOptions extends object = Record<never, never>, THarness extends Harness<TInput, TOutput> | undefined = Harness<TInput, TOutput> | undefined>(name: string, assess: JudgeAssessFn<CreateJudgeContext<TInput, TOutput, TOptions, THarness>>): Judge<CreateJudgeContext<TInput, TOutput, TOptions, THarness>>;
|
|
284
|
+
declare function createJudge<TInput, TOutput extends JsonValue | undefined, TOptions extends object = Record<never, never>, THarness extends Harness<TInput, TOutput> | undefined = Harness<TInput, TOutput> | undefined>(config: CreateJudgeConfig<CreateJudgeContext<TInput, TOutput, TOptions, THarness>>): Judge<CreateJudgeContext<TInput, TOutput, TOptions, THarness>>;
|
|
308
285
|
/**
|
|
309
286
|
* @deprecated Prefer `createJudge({ name, judgeHarness, assess })` and call
|
|
310
287
|
* `ctx.runJudge(...)` from LLM-backed judges.
|
|
311
288
|
*/
|
|
312
|
-
declare function createJudge<TOptions extends JudgeContext<any, any, any
|
|
289
|
+
declare function createJudge<TOptions extends JudgeContext<any, any, any>, TInput, TOutput>(name: string, assessor: JudgeAssessor<TInput, TOutput>, assess: JudgeAssessWithAssessorFn<TOptions, TInput, TOutput>): Judge<TOptions>;
|
|
313
290
|
|
|
314
|
-
export { type DescribeEvalOptions, type EvalHarnessRun, type EvalMatchers, type EvalRun, type
|
|
291
|
+
export { type DescribeEvalOptions, type EvalHarnessRun, type EvalMatchers, type EvalRun, type EvalTestAPI, type EvalTestContext, Harness, Judge, type JudgeAssertionOptions, JudgeAssessFn, JudgeAssessWithAssessorFn, JudgeAssessor, JudgeContext, JudgeHarness, JudgeOptions, JudgeResult, type ToSatisfyJudge, createJudge, describeEval, formatScores };
|
package/dist/index.js
CHANGED
|
@@ -131,7 +131,6 @@ function createHarness(options) {
|
|
|
131
131
|
try {
|
|
132
132
|
const result = await options.run({
|
|
133
133
|
input,
|
|
134
|
-
metadata: context.metadata,
|
|
135
134
|
signal: context.signal,
|
|
136
135
|
artifacts: context.artifacts,
|
|
137
136
|
setArtifact: context.setArtifact
|
|
@@ -522,17 +521,14 @@ function serializeError(error) {
|
|
|
522
521
|
function createJudgeHarness(options) {
|
|
523
522
|
return createHarness({
|
|
524
523
|
name: options.name ?? "judge-harness",
|
|
525
|
-
run: async ({ input, signal
|
|
526
|
-
return normalizeJudgeHarnessResult(
|
|
527
|
-
await options.run(input, { signal, metadata })
|
|
528
|
-
);
|
|
524
|
+
run: async ({ input, signal }) => {
|
|
525
|
+
return normalizeJudgeHarnessResult(await options.run(input, { signal }));
|
|
529
526
|
}
|
|
530
527
|
});
|
|
531
528
|
}
|
|
532
529
|
async function runJudgeHarness(judgeHarness, input, options = {}) {
|
|
533
530
|
const artifacts = {};
|
|
534
531
|
const run = await judgeHarness.run(input, {
|
|
535
|
-
metadata: options.metadata ?? {},
|
|
536
532
|
signal: options.signal,
|
|
537
533
|
artifacts,
|
|
538
534
|
setArtifact: (name, value) => {
|
|
@@ -546,8 +542,7 @@ function createRunJudge(judgeHarness, signal) {
|
|
|
546
542
|
return void 0;
|
|
547
543
|
}
|
|
548
544
|
return (input, options) => runJudgeHarness(judgeHarness, input, {
|
|
549
|
-
|
|
550
|
-
signal
|
|
545
|
+
signal: options?.signal ?? signal
|
|
551
546
|
});
|
|
552
547
|
}
|
|
553
548
|
function normalizeJudgeHarnessResult(result) {
|
|
@@ -625,22 +620,24 @@ function FactualityJudge(config = {}) {
|
|
|
625
620
|
return {
|
|
626
621
|
name: config.name ?? "FactualityJudge",
|
|
627
622
|
judgeHarness,
|
|
628
|
-
assess: (opts) => assessFactuality(opts,
|
|
623
|
+
assess: (opts) => assessFactuality(opts, {
|
|
624
|
+
expected: config.expected,
|
|
625
|
+
judgeHarness
|
|
626
|
+
})
|
|
629
627
|
};
|
|
630
628
|
}
|
|
631
|
-
async function assessFactuality(opts,
|
|
632
|
-
const
|
|
633
|
-
const expected = opts.expected === void 0 ? metadata.expected : opts.expected;
|
|
629
|
+
async function assessFactuality(opts, config) {
|
|
630
|
+
const expected = opts.expected ?? config.expected;
|
|
634
631
|
if (isMissingExpectedAnswer(expected)) {
|
|
635
632
|
return {
|
|
636
633
|
score: 0,
|
|
637
634
|
metadata: {
|
|
638
|
-
rationale: "FactualityJudge requires a non-empty expert answer in `expected` or
|
|
635
|
+
rationale: "FactualityJudge requires a non-empty expert answer in `expected` or FactualityJudge(...) config."
|
|
639
636
|
}
|
|
640
637
|
};
|
|
641
638
|
}
|
|
642
639
|
const runJudge = opts.runJudge ?? createRunJudge(
|
|
643
|
-
|
|
640
|
+
config.judgeHarness,
|
|
644
641
|
opts.signal
|
|
645
642
|
);
|
|
646
643
|
if (!runJudge) {
|
|
@@ -1048,15 +1045,15 @@ function StructuredOutputScorer(config = {}) {
|
|
|
1048
1045
|
|
|
1049
1046
|
// src/judges/structuredOutputJudge.ts
|
|
1050
1047
|
function StructuredOutputJudge(config = {}) {
|
|
1051
|
-
const
|
|
1048
|
+
const { expected, ...scorerConfig } = config;
|
|
1049
|
+
const scorer = StructuredOutputScorer(scorerConfig);
|
|
1052
1050
|
return {
|
|
1053
1051
|
name: "StructuredOutputJudge",
|
|
1054
1052
|
assess: (opts) => {
|
|
1055
|
-
const metadata = opts.metadata;
|
|
1056
1053
|
return scorer({
|
|
1057
1054
|
...opts,
|
|
1058
1055
|
input: formatStructuredOutput(opts.input),
|
|
1059
|
-
expected: opts.expected ??
|
|
1056
|
+
expected: opts.expected ?? expected,
|
|
1060
1057
|
output: formatStructuredOutput(opts.output)
|
|
1061
1058
|
});
|
|
1062
1059
|
}
|
|
@@ -1266,17 +1263,17 @@ function evaluateUnorderedTools(expected, actual, options) {
|
|
|
1266
1263
|
|
|
1267
1264
|
// src/judges/toolCallJudge.ts
|
|
1268
1265
|
function ToolCallJudge(config = {}) {
|
|
1269
|
-
const
|
|
1266
|
+
const { expectedTools, ...scorerConfig } = config;
|
|
1267
|
+
const scorer = ToolCallScorer(scorerConfig);
|
|
1270
1268
|
return {
|
|
1271
1269
|
name: "ToolCallJudge",
|
|
1272
1270
|
assess: (opts) => {
|
|
1273
|
-
const metadata = opts.metadata;
|
|
1274
1271
|
return scorer({
|
|
1275
1272
|
...opts,
|
|
1276
1273
|
input: formatJudgeValue2(opts.input),
|
|
1277
1274
|
output: formatJudgeValue2(opts.output),
|
|
1278
1275
|
expectedTools: normalizeExpectedTools(
|
|
1279
|
-
opts.expectedTools ??
|
|
1276
|
+
opts.expectedTools ?? expectedTools
|
|
1280
1277
|
)
|
|
1281
1278
|
});
|
|
1282
1279
|
}
|
|
@@ -1307,10 +1304,7 @@ var evalTest = import_vitest.test.extend("harness", async () => {
|
|
|
1307
1304
|
throw new Error(
|
|
1308
1305
|
"describeEval must override the harness fixture before running tests."
|
|
1309
1306
|
);
|
|
1310
|
-
}).extend(
|
|
1311
|
-
"automaticJudges",
|
|
1312
|
-
[]
|
|
1313
|
-
).extend("judgeThreshold", void 0).extend("judgeHarness", void 0).extend("explicitJudgeHarness", void 0).extend(
|
|
1307
|
+
}).extend("automaticJudges", []).extend("judgeThreshold", void 0).extend("judgeHarness", void 0).extend("explicitJudgeHarness", void 0).extend(
|
|
1314
1308
|
"run",
|
|
1315
1309
|
async ({
|
|
1316
1310
|
automaticJudges,
|
|
@@ -1321,12 +1315,10 @@ var evalTest = import_vitest.test.extend("harness", async () => {
|
|
|
1321
1315
|
signal,
|
|
1322
1316
|
task
|
|
1323
1317
|
}) => {
|
|
1324
|
-
return async (input
|
|
1318
|
+
return async (input) => {
|
|
1325
1319
|
const resolvedHarness = harness;
|
|
1326
|
-
const metadata = createMetadata(options?.metadata);
|
|
1327
1320
|
const artifacts = {};
|
|
1328
1321
|
const context = {
|
|
1329
|
-
metadata,
|
|
1330
1322
|
signal,
|
|
1331
1323
|
artifacts,
|
|
1332
1324
|
setArtifact: (artifactName, value) => {
|
|
@@ -1356,7 +1348,6 @@ var evalTest = import_vitest.test.extend("harness", async () => {
|
|
|
1356
1348
|
resolvedHarness,
|
|
1357
1349
|
input,
|
|
1358
1350
|
explicitJudgeHarness,
|
|
1359
|
-
metadata,
|
|
1360
1351
|
signal
|
|
1361
1352
|
);
|
|
1362
1353
|
}
|
|
@@ -1375,7 +1366,6 @@ var evalTest = import_vitest.test.extend("harness", async () => {
|
|
|
1375
1366
|
resolvedHarness,
|
|
1376
1367
|
input,
|
|
1377
1368
|
explicitJudgeHarness,
|
|
1378
|
-
metadata,
|
|
1379
1369
|
signal
|
|
1380
1370
|
);
|
|
1381
1371
|
}
|
|
@@ -1395,7 +1385,6 @@ var evalTest = import_vitest.test.extend("harness", async () => {
|
|
|
1395
1385
|
resolvedHarness,
|
|
1396
1386
|
input,
|
|
1397
1387
|
explicitJudgeHarness,
|
|
1398
|
-
metadata,
|
|
1399
1388
|
signal
|
|
1400
1389
|
);
|
|
1401
1390
|
if (automaticJudges.length > 0) {
|
|
@@ -1406,7 +1395,6 @@ var evalTest = import_vitest.test.extend("harness", async () => {
|
|
|
1406
1395
|
resolvedHarness,
|
|
1407
1396
|
input,
|
|
1408
1397
|
judgeHarness,
|
|
1409
|
-
metadata,
|
|
1410
1398
|
run,
|
|
1411
1399
|
signal
|
|
1412
1400
|
);
|
|
@@ -1473,10 +1461,7 @@ function describeEval(name, options, define) {
|
|
|
1473
1461
|
define(it);
|
|
1474
1462
|
});
|
|
1475
1463
|
}
|
|
1476
|
-
function
|
|
1477
|
-
return { ...metadata ?? {} };
|
|
1478
|
-
}
|
|
1479
|
-
async function applyAutomaticJudges(task, judges, threshold, harness, input, judgeHarness, metadata, run, signal) {
|
|
1464
|
+
async function applyAutomaticJudges(task, judges, threshold, harness, input, judgeHarness, run, signal) {
|
|
1480
1465
|
const runToolCalls = (0, import_core2.toolCalls)(run.session);
|
|
1481
1466
|
const scores = await Promise.all(
|
|
1482
1467
|
judges.map((judge) => {
|
|
@@ -1488,7 +1473,6 @@ async function applyAutomaticJudges(task, judges, threshold, harness, input, jud
|
|
|
1488
1473
|
input,
|
|
1489
1474
|
output: run.output,
|
|
1490
1475
|
toolCalls: runToolCalls,
|
|
1491
|
-
metadata,
|
|
1492
1476
|
run,
|
|
1493
1477
|
session: run.session,
|
|
1494
1478
|
signal,
|
|
@@ -1533,12 +1517,11 @@ function setHarnessMeta(task, name, run) {
|
|
|
1533
1517
|
run
|
|
1534
1518
|
};
|
|
1535
1519
|
}
|
|
1536
|
-
function recordJudgeRunContext(run, harness, input, judgeHarness,
|
|
1520
|
+
function recordJudgeRunContext(run, harness, input, judgeHarness, signal) {
|
|
1537
1521
|
const context = {
|
|
1538
1522
|
harness,
|
|
1539
1523
|
input,
|
|
1540
1524
|
judgeHarness,
|
|
1541
|
-
metadata,
|
|
1542
1525
|
run,
|
|
1543
1526
|
signal
|
|
1544
1527
|
};
|
|
@@ -1608,7 +1591,6 @@ function buildJudgeAssertionOptions(received, judge, options, task) {
|
|
|
1608
1591
|
const judgeHarness = options.judgeHarness ?? resolveJudgeHarnessForJudge(judge, registeredContext?.judgeHarness);
|
|
1609
1592
|
const runJudge = createRunJudge(judgeHarness, registeredContext?.signal);
|
|
1610
1593
|
const signal = registeredContext?.signal;
|
|
1611
|
-
const metadata = options.metadata ?? registeredContext?.metadata ?? {};
|
|
1612
1594
|
const input = options.input ?? registeredContext?.input ?? void 0;
|
|
1613
1595
|
const contextualOptions = {
|
|
1614
1596
|
...options,
|
|
@@ -1631,7 +1613,6 @@ function buildJudgeAssertionOptions(received, judge, options, task) {
|
|
|
1631
1613
|
...judgeParams,
|
|
1632
1614
|
input: resolvedInput,
|
|
1633
1615
|
output,
|
|
1634
|
-
metadata,
|
|
1635
1616
|
run,
|
|
1636
1617
|
session: options.session ?? run.session,
|
|
1637
1618
|
signal,
|