vitest-evals 0.9.0-beta.0 → 0.9.0-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +190 -8
- package/dist/harness.d.mts +5 -5
- package/dist/harness.d.ts +5 -5
- package/dist/harness.js.map +1 -1
- package/dist/harness.mjs.map +1 -1
- package/dist/index.d.mts +33 -23
- package/dist/index.d.ts +33 -23
- package/dist/index.js +124 -41
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +124 -41
- package/dist/index.mjs.map +1 -1
- package/dist/judges/index.d.mts +2 -2
- package/dist/judges/index.d.ts +2 -2
- package/dist/judges/index.js +17 -11
- package/dist/judges/index.js.map +1 -1
- package/dist/judges/index.mjs +17 -11
- package/dist/judges/index.mjs.map +1 -1
- package/dist/judges/structuredOutputJudge.d.mts +3 -8
- package/dist/judges/structuredOutputJudge.d.ts +3 -8
- package/dist/judges/structuredOutputJudge.js +8 -5
- package/dist/judges/structuredOutputJudge.js.map +1 -1
- package/dist/judges/structuredOutputJudge.mjs +8 -5
- package/dist/judges/structuredOutputJudge.mjs.map +1 -1
- package/dist/judges/toolCallJudge.d.mts +2 -6
- package/dist/judges/toolCallJudge.d.ts +2 -6
- package/dist/judges/toolCallJudge.js +9 -6
- package/dist/judges/toolCallJudge.js.map +1 -1
- package/dist/judges/toolCallJudge.mjs +9 -6
- package/dist/judges/toolCallJudge.mjs.map +1 -1
- package/dist/judges/types.d.mts +14 -16
- package/dist/judges/types.d.ts +14 -16
- package/dist/judges/types.js.map +1 -1
- package/dist/reporter.js.map +1 -1
- package/dist/reporter.mjs.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -14,6 +14,8 @@ Install a first-party harness package for the runtime you want to test:
|
|
|
14
14
|
npm install -D @vitest-evals/harness-pi-ai
|
|
15
15
|
# or
|
|
16
16
|
npm install -D @vitest-evals/harness-ai-sdk
|
|
17
|
+
# or
|
|
18
|
+
npm install -D @vitest-evals/harness-openai-agents
|
|
17
19
|
```
|
|
18
20
|
|
|
19
21
|
## Core Model
|
|
@@ -25,9 +27,13 @@ npm install -D @vitest-evals/harness-ai-sdk
|
|
|
25
27
|
- the returned `result.output` is the app-facing value you assert on directly
|
|
26
28
|
- the returned `result.session` is the canonical JSON-serializable trace for
|
|
27
29
|
reporting, replay, tool assertions, and judges
|
|
28
|
-
-
|
|
30
|
+
- scenario-specific judge criteria can live in `inputValue`; use `metadata` for
|
|
31
|
+
per-run expectations or harness configuration that are not part of the
|
|
32
|
+
scenario payload
|
|
29
33
|
- suite-level `judges` are optional and run automatically after each `run(...)`
|
|
30
34
|
- suite-level `judgeThreshold` controls fail-on-score for those automatic judges
|
|
35
|
+
- every judge receives `JudgeContext`, including the configured `harness` with
|
|
36
|
+
its required `prompt` function
|
|
31
37
|
- explicit judge assertions use
|
|
32
38
|
`await expect(result).toSatisfyJudge(judge, context)`
|
|
33
39
|
|
|
@@ -42,7 +48,7 @@ import {
|
|
|
42
48
|
toolCalls,
|
|
43
49
|
type JudgeContext,
|
|
44
50
|
} from "vitest-evals";
|
|
45
|
-
import { createRefundAgent } from "../src/refundAgent";
|
|
51
|
+
import { createRefundAgent, judgePrompt } from "../src/refundAgent";
|
|
46
52
|
|
|
47
53
|
type RefundEvalMetadata = {
|
|
48
54
|
expectedStatus: "approved" | "denied";
|
|
@@ -76,6 +82,7 @@ describeEval(
|
|
|
76
82
|
{
|
|
77
83
|
harness: piAiHarness({
|
|
78
84
|
createAgent: () => createRefundAgent(),
|
|
85
|
+
prompt: judgePrompt,
|
|
79
86
|
}),
|
|
80
87
|
judges: [FactualityJudge],
|
|
81
88
|
},
|
|
@@ -141,10 +148,173 @@ The harness owns normalization, diagnostics, tool capture, replay plumbing, and
|
|
|
141
148
|
reporter-facing artifacts. Your app just needs one runtime seam where those
|
|
142
149
|
wrapped pieces can be injected.
|
|
143
150
|
|
|
151
|
+
Replay opt-in belongs on the harness, via `toolReplay`, while replay mode and
|
|
152
|
+
recording directory can live in Vitest environment config. Tool definitions
|
|
153
|
+
should stay free of VCR policy.
|
|
154
|
+
|
|
144
155
|
For the Pi-specific harness, output/session/usage normalization should usually
|
|
145
156
|
be inferred automatically. Treat low-level normalization callbacks as an escape
|
|
146
157
|
hatch, not part of the primary authoring path.
|
|
147
158
|
|
|
159
|
+
For OpenAI Agents SDK apps, use
|
|
160
|
+
`@vitest-evals/harness-openai-agents` with an existing `Agent` or
|
|
161
|
+
`createAgent()` factory and a `Runner` / `createRunner()` callback. The harness
|
|
162
|
+
calls `Runner.run(agent, input, options)` by default and exposes the same
|
|
163
|
+
normalization and replay hooks when the app needs a custom entrypoint or
|
|
164
|
+
structured domain output mapping.
|
|
165
|
+
|
|
166
|
+
## Custom App Harnesses
|
|
167
|
+
|
|
168
|
+
First-party harness packages are conveniences, not the only supported path. If
|
|
169
|
+
you need to test a full application flow, define a harness that runs your app
|
|
170
|
+
through its normal entrypoint and returns a normalized `HarnessRun`. The same
|
|
171
|
+
harness should also expose `prompt`, which LLM-backed judges can reuse through
|
|
172
|
+
`JudgeContext.harness.prompt`.
|
|
173
|
+
|
|
174
|
+
```ts
|
|
175
|
+
import {
|
|
176
|
+
describeEval,
|
|
177
|
+
namedJudge,
|
|
178
|
+
type JudgeContext,
|
|
179
|
+
} from "vitest-evals";
|
|
180
|
+
import {
|
|
181
|
+
normalizeContent,
|
|
182
|
+
normalizeMetadata,
|
|
183
|
+
toJsonValue,
|
|
184
|
+
type Harness,
|
|
185
|
+
type HarnessRun,
|
|
186
|
+
} from "vitest-evals/harness";
|
|
187
|
+
|
|
188
|
+
type AppEvent = {
|
|
189
|
+
type: string;
|
|
190
|
+
payload: Record<string, unknown>;
|
|
191
|
+
};
|
|
192
|
+
|
|
193
|
+
type AppEvalInput = {
|
|
194
|
+
events: AppEvent[];
|
|
195
|
+
criteria: {
|
|
196
|
+
contract: string;
|
|
197
|
+
pass: string[];
|
|
198
|
+
fail?: string[];
|
|
199
|
+
};
|
|
200
|
+
};
|
|
201
|
+
|
|
202
|
+
const appHarness: Harness<AppEvalInput> = {
|
|
203
|
+
name: "custom-app",
|
|
204
|
+
prompt: (input, options) => promptJudgeModel(input, options),
|
|
205
|
+
run: async (input, context): Promise<HarnessRun> => {
|
|
206
|
+
const result = await replayAppEvents(input.events, {
|
|
207
|
+
signal: context.signal,
|
|
208
|
+
});
|
|
209
|
+
const output = {
|
|
210
|
+
replies: result.replies,
|
|
211
|
+
sideEffects: result.sideEffects,
|
|
212
|
+
};
|
|
213
|
+
|
|
214
|
+
return {
|
|
215
|
+
output: toJsonValue(output),
|
|
216
|
+
session: {
|
|
217
|
+
messages: [
|
|
218
|
+
...input.events.map((event) => ({
|
|
219
|
+
role: "user" as const,
|
|
220
|
+
content: normalizeContent(event),
|
|
221
|
+
})),
|
|
222
|
+
...result.replies.map((reply) => ({
|
|
223
|
+
role: "assistant" as const,
|
|
224
|
+
content: normalizeContent(reply.text),
|
|
225
|
+
metadata: normalizeMetadata({
|
|
226
|
+
target: reply.target,
|
|
227
|
+
}),
|
|
228
|
+
})),
|
|
229
|
+
],
|
|
230
|
+
outputText: result.replies.map((reply) => reply.text).join("\n\n"),
|
|
231
|
+
metadata: normalizeMetadata({
|
|
232
|
+
replyCount: result.replies.length,
|
|
233
|
+
}),
|
|
234
|
+
},
|
|
235
|
+
usage: {},
|
|
236
|
+
artifacts:
|
|
237
|
+
Object.keys(context.artifacts).length > 0
|
|
238
|
+
? context.artifacts
|
|
239
|
+
: undefined,
|
|
240
|
+
errors: [],
|
|
241
|
+
};
|
|
242
|
+
},
|
|
243
|
+
};
|
|
244
|
+
|
|
245
|
+
const AppRubricJudge = namedJudge(
|
|
246
|
+
"AppRubricJudge",
|
|
247
|
+
async (
|
|
248
|
+
ctx: JudgeContext<AppEvalInput, Record<string, unknown>, typeof appHarness>,
|
|
249
|
+
) => {
|
|
250
|
+
const verdict = await ctx.harness.prompt(
|
|
251
|
+
formatRubricPrompt({
|
|
252
|
+
output: ctx.output,
|
|
253
|
+
criteria: ctx.inputValue.criteria,
|
|
254
|
+
}),
|
|
255
|
+
{
|
|
256
|
+
metadata: {
|
|
257
|
+
judge: "AppRubricJudge",
|
|
258
|
+
},
|
|
259
|
+
},
|
|
260
|
+
);
|
|
261
|
+
|
|
262
|
+
return parseRubricVerdict(verdict);
|
|
263
|
+
},
|
|
264
|
+
);
|
|
265
|
+
|
|
266
|
+
describeEval(
|
|
267
|
+
"app behavior",
|
|
268
|
+
{
|
|
269
|
+
harness: appHarness,
|
|
270
|
+
judges: [AppRubricJudge],
|
|
271
|
+
judgeThreshold: 0.75,
|
|
272
|
+
},
|
|
273
|
+
(it) => {
|
|
274
|
+
it("handles an event flow", async ({ run }) => {
|
|
275
|
+
await run({
|
|
276
|
+
events: [
|
|
277
|
+
{
|
|
278
|
+
type: "message.created",
|
|
279
|
+
payload: {
|
|
280
|
+
text: "Summarize the current incident.",
|
|
281
|
+
},
|
|
282
|
+
},
|
|
283
|
+
],
|
|
284
|
+
criteria: {
|
|
285
|
+
contract: "The app posts one user-visible incident summary.",
|
|
286
|
+
pass: ["The reply names the incident status."],
|
|
287
|
+
fail: ["The reply exposes internal metadata."],
|
|
288
|
+
},
|
|
289
|
+
});
|
|
290
|
+
});
|
|
291
|
+
},
|
|
292
|
+
);
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
Use `Harness.run(...)` for the application under test and `Harness.prompt(...)`
|
|
296
|
+
for judge model calls. Calling `ctx.harness.run(...)` from inside a judge runs
|
|
297
|
+
the application a second time, so reserve that for judges that intentionally
|
|
298
|
+
need a second execution. Put criteria on `inputValue` when they are part of the
|
|
299
|
+
scenario itself; use per-run `metadata` for harness configuration or
|
|
300
|
+
expectations that are not part of the scenario payload. `session.outputText` is
|
|
301
|
+
the canonical text sent to judges, so define it deliberately when your app
|
|
302
|
+
returns structured artifacts.
|
|
303
|
+
|
|
304
|
+
Provider setup and rubric parsing stay in your harness and judge. The core
|
|
305
|
+
package only requires the judge to return a `JudgeResult` with a score and
|
|
306
|
+
optional metadata.
|
|
307
|
+
|
|
308
|
+
Automatic suite-level judges are a good fit when every `run(...)` should get
|
|
309
|
+
the same scoring. For cases where only some runs need an LLM judge, keep the
|
|
310
|
+
suite free of automatic judges and use an explicit matcher:
|
|
311
|
+
|
|
312
|
+
```ts
|
|
313
|
+
await expect(result).toSatisfyJudge(AppRubricJudge, {
|
|
314
|
+
threshold: 0.75,
|
|
315
|
+
});
|
|
316
|
+
```
|
|
317
|
+
|
|
148
318
|
## Judge Matchers
|
|
149
319
|
|
|
150
320
|
Use the matcher when a judge should behave like a normal Vitest assertion.
|
|
@@ -186,12 +356,24 @@ const FactualityJudge = namedJudge(
|
|
|
186
356
|
);
|
|
187
357
|
```
|
|
188
358
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
359
|
+
LLM-backed judges can reuse the suite harness prompt by calling
|
|
360
|
+
`harness.prompt(...)`. `vitest-evals` does not prescribe a rubric schema,
|
|
361
|
+
scoring scale, model provider, or parser; those stay in the judge. Calling
|
|
362
|
+
`harness.run(...)` from a judge executes the application again, so use that
|
|
363
|
+
only when a second run is intentional.
|
|
364
|
+
|
|
365
|
+
For an `EvalHarnessRun` returned by fixture `run(...)`,
|
|
366
|
+
`toSatisfyJudge(...)` uses the run's canonical text output and reuses the
|
|
367
|
+
registered input, metadata, and harness prompt. Inside an eval test,
|
|
368
|
+
matcher calls on registered raw output or session objects reuse that exact run
|
|
369
|
+
context; raw output values are serialized as the judge `output`, so
|
|
370
|
+
`expect(result.output).toSatisfyJudge(judge)` stays concise. Other raw values
|
|
371
|
+
fall back to the current test's most recent `run(...)` context. For
|
|
372
|
+
manually-created runs or values outside an eval context, pass any required
|
|
373
|
+
`inputValue`, `metadata`, or `harness` in matcher options. Structured or
|
|
374
|
+
programmatic result checks should usually assert on `result.output` directly.
|
|
375
|
+
When a judge needs richer normalized context or the configured suite harness,
|
|
376
|
+
type it with `JudgeContext`.
|
|
195
377
|
|
|
196
378
|
When you only need deterministic contract checks, built-ins such as
|
|
197
379
|
`StructuredOutputJudge()` and `ToolCallJudge()` are still available. The primary
|
package/dist/harness.d.mts
CHANGED
|
@@ -54,14 +54,13 @@ type HarnessRun = {
|
|
|
54
54
|
artifacts?: Record<string, JsonValue>;
|
|
55
55
|
errors: Array<Record<string, JsonValue>>;
|
|
56
56
|
};
|
|
57
|
+
/** Optional provider-facing hints for harness prompt calls. */
|
|
57
58
|
type HarnessPromptOptions = {
|
|
58
59
|
system?: string;
|
|
59
60
|
metadata?: Record<string, JsonValue>;
|
|
60
61
|
};
|
|
62
|
+
/** Provider-agnostic prompt seam that judges can reuse from a harness. */
|
|
61
63
|
type HarnessPrompt = (input: string, options?: HarnessPromptOptions) => Promise<string>;
|
|
62
|
-
type HarnessRuntime = {
|
|
63
|
-
prompt: HarnessPrompt;
|
|
64
|
-
};
|
|
65
64
|
type HarnessRunError = Error & {
|
|
66
65
|
vitestEvalsRun: HarnessRun;
|
|
67
66
|
};
|
|
@@ -77,7 +76,8 @@ type HarnessContext<TMetadata extends HarnessMetadata = HarnessMetadata> = {
|
|
|
77
76
|
};
|
|
78
77
|
type Harness<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata> = {
|
|
79
78
|
name: string;
|
|
80
|
-
prompt?: HarnessPrompt;
|
79
|
+
/** Prompt seam reused by LLM-backed judges. */
|
|
80
|
+
prompt: HarnessPrompt;
|
|
81
81
|
run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun>;
|
|
82
82
|
};
|
|
83
83
|
/** Returns true when a value exposes a callable method with the given name. */
|
|
@@ -115,4 +115,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
|
|
|
115
115
|
/** Serializes an arbitrary thrown value into the normalized error shape. */
|
|
116
116
|
declare function serializeError(error: unknown): Record<string, JsonValue>;
|
|
117
117
|
|
|
118
|
-
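export { type Harness, type HarnessContext, type HarnessMetadata, type HarnessPrompt, type HarnessPromptOptions, type HarnessRun, type HarnessRunError, type HarnessRuntime, type JsonPrimitive, type JsonValue, type NormalizedMessage, type NormalizedSession, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };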
|
|
118
|
+
export { type Harness, type HarnessContext, type HarnessMetadata, type HarnessPrompt, type HarnessPromptOptions, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type NormalizedMessage, type NormalizedSession, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
|
package/dist/harness.d.ts
CHANGED
|
@@ -54,14 +54,13 @@ type HarnessRun = {
|
|
|
54
54
|
artifacts?: Record<string, JsonValue>;
|
|
55
55
|
errors: Array<Record<string, JsonValue>>;
|
|
56
56
|
};
|
|
57
|
+
/** Optional provider-facing hints for harness prompt calls. */
|
|
57
58
|
type HarnessPromptOptions = {
|
|
58
59
|
system?: string;
|
|
59
60
|
metadata?: Record<string, JsonValue>;
|
|
60
61
|
};
|
|
62
|
+
/** Provider-agnostic prompt seam that judges can reuse from a harness. */
|
|
61
63
|
type HarnessPrompt = (input: string, options?: HarnessPromptOptions) => Promise<string>;
|
|
62
|
-
type HarnessRuntime = {
|
|
63
|
-
prompt: HarnessPrompt;
|
|
64
|
-
};
|
|
65
64
|
type HarnessRunError = Error & {
|
|
66
65
|
vitestEvalsRun: HarnessRun;
|
|
67
66
|
};
|
|
@@ -77,7 +76,8 @@ type HarnessContext<TMetadata extends HarnessMetadata = HarnessMetadata> = {
|
|
|
77
76
|
};
|
|
78
77
|
type Harness<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata> = {
|
|
79
78
|
name: string;
|
|
80
|
-
prompt?: HarnessPrompt;
|
79
|
+
/** Prompt seam reused by LLM-backed judges. */
|
|
80
|
+
prompt: HarnessPrompt;
|
|
81
81
|
run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun>;
|
|
82
82
|
};
|
|
83
83
|
/** Returns true when a value exposes a callable method with the given name. */
|
|
@@ -115,4 +115,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
|
|
|
115
115
|
/** Serializes an arbitrary thrown value into the normalized error shape. */
|
|
116
116
|
declare function serializeError(error: unknown): Record<string, JsonValue>;
|
|
117
117
|
|
|
118
|
-
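export { type Harness, type HarnessContext, type HarnessMetadata, type HarnessPrompt, type HarnessPromptOptions, type HarnessRun, type HarnessRunError, type HarnessRuntime, type JsonPrimitive, type JsonValue, type NormalizedMessage, type NormalizedSession, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };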
|
|
118
|
+
export { type Harness, type HarnessContext, type HarnessMetadata, type HarnessPrompt, type HarnessPromptOptions, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type NormalizedMessage, type NormalizedSession, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
|
package/dist/harness.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/harness.ts"],"sourcesContent":["export type JsonPrimitive = string | number | boolean | null;\n\nexport type JsonValue =\n | JsonPrimitive\n | JsonValue[]\n | { [key: string]: JsonValue };\n\nexport type ToolCallRecord = {\n id?: string;\n name: string;\n arguments?: Record<string, JsonValue>;\n result?: JsonValue;\n error?: {\n message: string;\n type?: string;\n [key: string]: JsonValue | undefined;\n };\n startedAt?: string;\n finishedAt?: string;\n durationMs?: number;\n metadata?: Record<string, JsonValue>;\n};\n\nexport type NormalizedMessage = {\n role: \"system\" | \"user\" | \"assistant\" | \"tool\";\n content?: JsonValue;\n toolCalls?: ToolCallRecord[];\n metadata?: Record<string, JsonValue>;\n};\n\nexport type UsageSummary = {\n provider?: string;\n model?: string;\n inputTokens?: number;\n outputTokens?: number;\n reasoningTokens?: number;\n totalTokens?: number;\n estimatedCost?: number;\n toolCalls?: number;\n retries?: number;\n metadata?: Record<string, JsonValue>;\n};\n\nexport type TimingSummary = {\n totalMs?: number;\n metadata?: Record<string, JsonValue>;\n};\n\nexport type NormalizedSession = {\n messages: NormalizedMessage[];\n outputText?: string;\n provider?: string;\n model?: string;\n metadata?: Record<string, JsonValue>;\n};\n\nexport type HarnessRun = {\n session: NormalizedSession;\n output?: JsonValue;\n usage: UsageSummary;\n timings?: TimingSummary;\n artifacts?: Record<string, JsonValue>;\n errors: Array<Record<string, JsonValue>>;\n};\n\nexport type HarnessPromptOptions = {\n system?: string;\n metadata?: Record<string, JsonValue>;\n};\n\nexport type HarnessPrompt = (\n input: string,\n options?: HarnessPromptOptions,\n) => Promise<string>;\n\nexport type HarnessRuntime = {\n prompt: HarnessPrompt;\n};\n\nexport type HarnessRunError = Error & {\n vitestEvalsRun: HarnessRun;\n};\n\nexport type HarnessMetadata = Record<string, unknown>;\n\nexport type HarnessContext<\n TMetadata extends 
HarnessMetadata = HarnessMetadata,\n> = {\n metadata: Readonly<TMetadata>;\n task: {\n meta: Record<string, unknown>;\n };\n signal?: AbortSignal;\n artifacts: Record<string, JsonValue>;\n setArtifact: (name: string, value: JsonValue) => void;\n};\n\nexport type Harness<\n TInput = unknown,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n> = {\n name: string;\n prompt?: HarnessPrompt;\n run: (\n input: TInput,\n context: HarnessContext<TMetadata>,\n ) => Promise<HarnessRun>;\n};\n\nfunction isJsonPrimitive(value: unknown): value is JsonPrimitive {\n return (\n value === null ||\n typeof value === \"string\" ||\n typeof value === \"number\" ||\n typeof value === \"boolean\"\n );\n}\n\nfunction isJsonRecord(value: unknown): value is Record<string, unknown> {\n return typeof value === \"object\" && value !== null && !Array.isArray(value);\n}\n\nfunction normalizeJsonArray(value: unknown[]): JsonValue[] {\n return value.map((item) => {\n const normalized = toJsonValue(item);\n return normalized === undefined ? null : normalized;\n });\n}\n\nfunction normalizeJsonObject(\n value: Record<string, unknown>,\n): Record<string, JsonValue> {\n const normalized: Record<string, JsonValue> = {};\n\n for (const [key, entryValue] of Object.entries(value)) {\n const entry = toJsonValue(entryValue);\n if (entry !== undefined) {\n normalized[key] = entry;\n }\n }\n\n return normalized;\n}\n\n/** Returns true when a value exposes a callable method with the given name. */\nexport function hasCallableMethod(value: unknown, methodName: string) {\n return (\n value !== null &&\n (typeof value === \"object\" || typeof value === \"function\") &&\n methodName in value &&\n typeof (value as Record<string, unknown>)[methodName] === \"function\"\n );\n}\n\n/** Normalizes an unknown value into the JSON-safe shape used by harness runs. 
*/\nexport function toJsonValue(value: unknown): JsonValue | undefined {\n if (isJsonPrimitive(value)) {\n return value;\n }\n\n if (Array.isArray(value)) {\n return normalizeJsonArray(value);\n }\n\n if (isJsonRecord(value)) {\n return normalizeJsonObject(value);\n }\n\n return undefined;\n}\n\n/** Drops non-JSON properties from a record while preserving valid values. */\nexport function normalizeRecord(\n value: Record<string, unknown>,\n): Record<string, JsonValue> {\n return normalizeJsonObject(value);\n}\n\n/** Normalizes metadata and omits the field entirely when nothing survives. */\nexport function normalizeMetadata(\n value: Record<string, unknown>,\n): Record<string, JsonValue> | undefined {\n const normalized = normalizeRecord(value);\n return Object.keys(normalized).length > 0 ? normalized : undefined;\n}\n\n/** Converts arbitrary content into the JSON-safe message content shape. */\nexport function normalizeContent(value: unknown): JsonValue {\n return toJsonValue(value) ?? String(value);\n}\n\n/** Flattens every recorded tool call from a normalized session. */\nexport function toolCalls(session: NormalizedSession): ToolCallRecord[] {\n return session.messages.flatMap((message) => message.toolCalls ?? []);\n}\n\n/** Filters normalized session messages by role. */\nexport function messagesByRole(\n session: NormalizedSession,\n role: NormalizedMessage[\"role\"],\n): NormalizedMessage[] {\n return session.messages.filter((message) => message.role === role);\n}\n\n/** Returns every normalized system message from a session. */\nexport function systemMessages(session: NormalizedSession) {\n return messagesByRole(session, \"system\");\n}\n\n/** Returns every normalized user message from a session. */\nexport function userMessages(session: NormalizedSession) {\n return messagesByRole(session, \"user\");\n}\n\n/** Returns every normalized assistant message from a session. 
*/\nexport function assistantMessages(session: NormalizedSession) {\n return messagesByRole(session, \"assistant\");\n}\n\n/** Returns every normalized tool message from a session. */\nexport function toolMessages(session: NormalizedSession) {\n return messagesByRole(session, \"tool\");\n}\n\n/** Attaches a partial or complete harness run to an arbitrary thrown error. */\nexport function attachHarnessRunToError(\n error: unknown,\n run: HarnessRun,\n): HarnessRunError {\n const baseError =\n error instanceof Error\n ? error\n : new Error(String(error ?? \"Unknown error\"));\n return Object.assign(baseError, {\n vitestEvalsRun: run,\n });\n}\n\n/** Reads an attached harness run back off a previously wrapped error value. */\nexport function getHarnessRunFromError(error: unknown): HarnessRun | undefined {\n if (\n error &&\n typeof error === \"object\" &&\n \"vitestEvalsRun\" in error &&\n isHarnessRun((error as { vitestEvalsRun?: unknown }).vitestEvalsRun)\n ) {\n return (error as { vitestEvalsRun: HarnessRun }).vitestEvalsRun;\n }\n\n return undefined;\n}\n\n/** Returns true when a value matches the normalized `HarnessRun` contract. */\nexport function isHarnessRun(value: unknown): value is HarnessRun {\n if (!value || typeof value !== \"object\") {\n return false;\n }\n\n const candidate = value as {\n session?: unknown;\n usage?: unknown;\n errors?: unknown;\n };\n\n return (\n isNormalizedSession(candidate.session) &&\n Boolean(candidate.usage) &&\n typeof candidate.usage === \"object\" &&\n !Array.isArray(candidate.usage) &&\n Array.isArray(candidate.errors)\n );\n}\n\n/** Returns true when a value matches the normalized session contract. 
*/\nexport function isNormalizedSession(\n value: unknown,\n): value is NormalizedSession {\n return (\n Boolean(value) &&\n typeof value === \"object\" &&\n value !== null &&\n \"messages\" in value &&\n Array.isArray((value as { messages?: unknown }).messages)\n );\n}\n\n/** Reuses pre-normalized harness errors when a runtime already returns them. */\nexport function resolveHarnessRunErrors(\n result: unknown,\n): Array<Record<string, JsonValue>> {\n if (\n result &&\n typeof result === \"object\" &&\n Array.isArray((result as Record<string, unknown>).errors)\n ) {\n return (result as { errors: Array<Record<string, JsonValue>> }).errors;\n }\n\n return [];\n}\n\n/** Serializes an arbitrary thrown value into the normalized error shape. */\nexport function serializeError(error: unknown): Record<string, JsonValue> {\n if (error instanceof Error) {\n return {\n type: error.name,\n message: error.message,\n };\n }\n\n return {\n type: \"Error\",\n message: String(error),\n };\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AA6GA,SAAS,gBAAgB,OAAwC;AAC/D,SACE,UAAU,QACV,OAAO,UAAU,YACjB,OAAO,UAAU,YACjB,OAAO,UAAU;AAErB;AAEA,SAAS,aAAa,OAAkD;AACtE,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,CAAC,MAAM,QAAQ,KAAK;AAC5E;AAEA,SAAS,mBAAmB,OAA+B;AACzD,SAAO,MAAM,IAAI,CAAC,SAAS;AACzB,UAAM,aAAa,YAAY,IAAI;AACnC,WAAO,eAAe,SAAY,OAAO;AAAA,EAC3C,CAAC;AACH;AAEA,SAAS,oBACP,OAC2B;AAC3B,QAAM,aAAwC,CAAC;AAE/C,aAAW,CAAC,KAAK,UAAU,KAAK,OAAO,QAAQ,KAAK,GAAG;AACrD,UAAM,QAAQ,YAAY,UAAU;AACpC,QAAI,UAAU,QAAW;AACvB,iBAAW,GAAG,IAAI;AAAA,IACpB;AAAA,EACF;AAEA,SAAO;AACT;AAGO,SAAS,kBAAkB,OAAgB,YAAoB;AACpE,SACE,UAAU,SACT,OAAO,UAAU,YAAY,OAAO,UAAU,eAC/C,cAAc,SACd,OAAQ,MAAkC,UAAU,MAAM;AAE9D;AAGO,SAAS,YAAY,OAAuC;AACjE,MAAI,gBAAgB,KAAK,GAAG;AAC1B,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,QAAQ,KAAK,GAAG;AACxB,WAAO,mBAAmB,KAAK;AAAA,EACjC;AAEA,MAAI,aAAa,KAAK,GAAG;AACvB,WAAO,oBAAoB,KAAK;AAAA,EAClC;AAEA,SAAO;AACT;AAGO,SAAS,gBACd,OAC2B;AAC3B,SAAO,oBAAoB
,KAAK;AAClC;AAGO,SAAS,kBACd,OACuC;AACvC,QAAM,aAAa,gBAAgB,KAAK;AACxC,SAAO,OAAO,KAAK,UAAU,EAAE,SAAS,IAAI,aAAa;AAC3D;AAGO,SAAS,iBAAiB,OAA2B;AAC1D,SAAO,YAAY,KAAK,KAAK,OAAO,KAAK;AAC3C;AAGO,SAAS,UAAU,SAA8C;AACtE,SAAO,QAAQ,SAAS,QAAQ,CAAC,YAAY,QAAQ,aAAa,CAAC,CAAC;AACtE;AAGO,SAAS,eACd,SACA,MACqB;AACrB,SAAO,QAAQ,SAAS,OAAO,CAAC,YAAY,QAAQ,SAAS,IAAI;AACnE;AAGO,SAAS,eAAe,SAA4B;AACzD,SAAO,eAAe,SAAS,QAAQ;AACzC;AAGO,SAAS,aAAa,SAA4B;AACvD,SAAO,eAAe,SAAS,MAAM;AACvC;AAGO,SAAS,kBAAkB,SAA4B;AAC5D,SAAO,eAAe,SAAS,WAAW;AAC5C;AAGO,SAAS,aAAa,SAA4B;AACvD,SAAO,eAAe,SAAS,MAAM;AACvC;AAGO,SAAS,wBACd,OACA,KACiB;AACjB,QAAM,YACJ,iBAAiB,QACb,QACA,IAAI,MAAM,OAAO,SAAS,eAAe,CAAC;AAChD,SAAO,OAAO,OAAO,WAAW;AAAA,IAC9B,gBAAgB;AAAA,EAClB,CAAC;AACH;AAGO,SAAS,uBAAuB,OAAwC;AAC7E,MACE,SACA,OAAO,UAAU,YACjB,oBAAoB,SACpB,aAAc,MAAuC,cAAc,GACnE;AACA,WAAQ,MAAyC;AAAA,EACnD;AAEA,SAAO;AACT;AAGO,SAAS,aAAa,OAAqC;AAChE,MAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACvC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY;AAMlB,SACE,oBAAoB,UAAU,OAAO,KACrC,QAAQ,UAAU,KAAK,KACvB,OAAO,UAAU,UAAU,YAC3B,CAAC,MAAM,QAAQ,UAAU,KAAK,KAC9B,MAAM,QAAQ,UAAU,MAAM;AAElC;AAGO,SAAS,oBACd,OAC4B;AAC5B,SACE,QAAQ,KAAK,KACb,OAAO,UAAU,YACjB,UAAU,QACV,cAAc,SACd,MAAM,QAAS,MAAiC,QAAQ;AAE5D;AAGO,SAAS,wBACd,QACkC;AAClC,MACE,UACA,OAAO,WAAW,YAClB,MAAM,QAAS,OAAmC,MAAM,GACxD;AACA,WAAQ,OAAwD;AAAA,EAClE;AAEA,SAAO,CAAC;AACV;AAGO,SAAS,eAAe,OAA2C;AACxE,MAAI,iBAAiB,OAAO;AAC1B,WAAO;AAAA,MACL,MAAM,MAAM;AAAA,MACZ,SAAS,MAAM;AAAA,IACjB;AAAA,EACF;AAEA,SAAO;AAAA,IACL,MAAM;AAAA,IACN,SAAS,OAAO,KAAK;AAAA,EACvB;AACF;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/harness.ts"],"sourcesContent":["export type JsonPrimitive = string | number | boolean | null;\n\nexport type JsonValue =\n | JsonPrimitive\n | JsonValue[]\n | { [key: string]: JsonValue };\n\nexport type ToolCallRecord = {\n id?: string;\n name: string;\n arguments?: Record<string, JsonValue>;\n result?: JsonValue;\n error?: {\n message: string;\n type?: string;\n [key: string]: JsonValue | undefined;\n };\n startedAt?: string;\n finishedAt?: string;\n durationMs?: number;\n metadata?: Record<string, JsonValue>;\n};\n\nexport type NormalizedMessage = {\n role: \"system\" | \"user\" | \"assistant\" | \"tool\";\n content?: JsonValue;\n toolCalls?: ToolCallRecord[];\n metadata?: Record<string, JsonValue>;\n};\n\nexport type UsageSummary = {\n provider?: string;\n model?: string;\n inputTokens?: number;\n outputTokens?: number;\n reasoningTokens?: number;\n totalTokens?: number;\n estimatedCost?: number;\n toolCalls?: number;\n retries?: number;\n metadata?: Record<string, JsonValue>;\n};\n\nexport type TimingSummary = {\n totalMs?: number;\n metadata?: Record<string, JsonValue>;\n};\n\nexport type NormalizedSession = {\n messages: NormalizedMessage[];\n outputText?: string;\n provider?: string;\n model?: string;\n metadata?: Record<string, JsonValue>;\n};\n\nexport type HarnessRun = {\n session: NormalizedSession;\n output?: JsonValue;\n usage: UsageSummary;\n timings?: TimingSummary;\n artifacts?: Record<string, JsonValue>;\n errors: Array<Record<string, JsonValue>>;\n};\n\n/** Optional provider-facing hints for harness prompt calls. */\nexport type HarnessPromptOptions = {\n system?: string;\n metadata?: Record<string, JsonValue>;\n};\n\n/** Provider-agnostic prompt seam that judges can reuse from a harness. 
*/\nexport type HarnessPrompt = (\n input: string,\n options?: HarnessPromptOptions,\n) => Promise<string>;\n\nexport type HarnessRunError = Error & {\n vitestEvalsRun: HarnessRun;\n};\n\nexport type HarnessMetadata = Record<string, unknown>;\n\nexport type HarnessContext<\n TMetadata extends HarnessMetadata = HarnessMetadata,\n> = {\n metadata: Readonly<TMetadata>;\n task: {\n meta: Record<string, unknown>;\n };\n signal?: AbortSignal;\n artifacts: Record<string, JsonValue>;\n setArtifact: (name: string, value: JsonValue) => void;\n};\n\nexport type Harness<\n TInput = unknown,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n> = {\n name: string;\n /** Prompt seam reused by LLM-backed judges. */\n prompt: HarnessPrompt;\n run: (\n input: TInput,\n context: HarnessContext<TMetadata>,\n ) => Promise<HarnessRun>;\n};\n\nfunction isJsonPrimitive(value: unknown): value is JsonPrimitive {\n return (\n value === null ||\n typeof value === \"string\" ||\n typeof value === \"number\" ||\n typeof value === \"boolean\"\n );\n}\n\nfunction isJsonRecord(value: unknown): value is Record<string, unknown> {\n return typeof value === \"object\" && value !== null && !Array.isArray(value);\n}\n\nfunction normalizeJsonArray(value: unknown[]): JsonValue[] {\n return value.map((item) => {\n const normalized = toJsonValue(item);\n return normalized === undefined ? null : normalized;\n });\n}\n\nfunction normalizeJsonObject(\n value: Record<string, unknown>,\n): Record<string, JsonValue> {\n const normalized: Record<string, JsonValue> = {};\n\n for (const [key, entryValue] of Object.entries(value)) {\n const entry = toJsonValue(entryValue);\n if (entry !== undefined) {\n normalized[key] = entry;\n }\n }\n\n return normalized;\n}\n\n/** Returns true when a value exposes a callable method with the given name. 
*/\nexport function hasCallableMethod(value: unknown, methodName: string) {\n return (\n value !== null &&\n (typeof value === \"object\" || typeof value === \"function\") &&\n methodName in value &&\n typeof (value as Record<string, unknown>)[methodName] === \"function\"\n );\n}\n\n/** Normalizes an unknown value into the JSON-safe shape used by harness runs. */\nexport function toJsonValue(value: unknown): JsonValue | undefined {\n if (isJsonPrimitive(value)) {\n return value;\n }\n\n if (Array.isArray(value)) {\n return normalizeJsonArray(value);\n }\n\n if (isJsonRecord(value)) {\n return normalizeJsonObject(value);\n }\n\n return undefined;\n}\n\n/** Drops non-JSON properties from a record while preserving valid values. */\nexport function normalizeRecord(\n value: Record<string, unknown>,\n): Record<string, JsonValue> {\n return normalizeJsonObject(value);\n}\n\n/** Normalizes metadata and omits the field entirely when nothing survives. */\nexport function normalizeMetadata(\n value: Record<string, unknown>,\n): Record<string, JsonValue> | undefined {\n const normalized = normalizeRecord(value);\n return Object.keys(normalized).length > 0 ? normalized : undefined;\n}\n\n/** Converts arbitrary content into the JSON-safe message content shape. */\nexport function normalizeContent(value: unknown): JsonValue {\n return toJsonValue(value) ?? String(value);\n}\n\n/** Flattens every recorded tool call from a normalized session. */\nexport function toolCalls(session: NormalizedSession): ToolCallRecord[] {\n return session.messages.flatMap((message) => message.toolCalls ?? []);\n}\n\n/** Filters normalized session messages by role. */\nexport function messagesByRole(\n session: NormalizedSession,\n role: NormalizedMessage[\"role\"],\n): NormalizedMessage[] {\n return session.messages.filter((message) => message.role === role);\n}\n\n/** Returns every normalized system message from a session. 
*/\nexport function systemMessages(session: NormalizedSession) {\n return messagesByRole(session, \"system\");\n}\n\n/** Returns every normalized user message from a session. */\nexport function userMessages(session: NormalizedSession) {\n return messagesByRole(session, \"user\");\n}\n\n/** Returns every normalized assistant message from a session. */\nexport function assistantMessages(session: NormalizedSession) {\n return messagesByRole(session, \"assistant\");\n}\n\n/** Returns every normalized tool message from a session. */\nexport function toolMessages(session: NormalizedSession) {\n return messagesByRole(session, \"tool\");\n}\n\n/** Attaches a partial or complete harness run to an arbitrary thrown error. */\nexport function attachHarnessRunToError(\n error: unknown,\n run: HarnessRun,\n): HarnessRunError {\n const baseError =\n error instanceof Error\n ? error\n : new Error(String(error ?? \"Unknown error\"));\n return Object.assign(baseError, {\n vitestEvalsRun: run,\n });\n}\n\n/** Reads an attached harness run back off a previously wrapped error value. */\nexport function getHarnessRunFromError(error: unknown): HarnessRun | undefined {\n if (\n error &&\n typeof error === \"object\" &&\n \"vitestEvalsRun\" in error &&\n isHarnessRun((error as { vitestEvalsRun?: unknown }).vitestEvalsRun)\n ) {\n return (error as { vitestEvalsRun: HarnessRun }).vitestEvalsRun;\n }\n\n return undefined;\n}\n\n/** Returns true when a value matches the normalized `HarnessRun` contract. 
*/\nexport function isHarnessRun(value: unknown): value is HarnessRun {\n if (!value || typeof value !== \"object\") {\n return false;\n }\n\n const candidate = value as {\n session?: unknown;\n usage?: unknown;\n errors?: unknown;\n };\n\n return (\n isNormalizedSession(candidate.session) &&\n Boolean(candidate.usage) &&\n typeof candidate.usage === \"object\" &&\n !Array.isArray(candidate.usage) &&\n Array.isArray(candidate.errors)\n );\n}\n\n/** Returns true when a value matches the normalized session contract. */\nexport function isNormalizedSession(\n value: unknown,\n): value is NormalizedSession {\n return (\n Boolean(value) &&\n typeof value === \"object\" &&\n value !== null &&\n \"messages\" in value &&\n Array.isArray((value as { messages?: unknown }).messages)\n );\n}\n\n/** Reuses pre-normalized harness errors when a runtime already returns them. */\nexport function resolveHarnessRunErrors(\n result: unknown,\n): Array<Record<string, JsonValue>> {\n if (\n result &&\n typeof result === \"object\" &&\n Array.isArray((result as Record<string, unknown>).errors)\n ) {\n return (result as { errors: Array<Record<string, JsonValue>> }).errors;\n }\n\n return [];\n}\n\n/** Serializes an arbitrary thrown value into the normalized error shape. 
*/\nexport function serializeError(error: unknown): Record<string, JsonValue> {\n if (error instanceof Error) {\n return {\n type: error.name,\n message: error.message,\n };\n }\n\n return {\n type: \"Error\",\n message: String(error),\n };\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AA4GA,SAAS,gBAAgB,OAAwC;AAC/D,SACE,UAAU,QACV,OAAO,UAAU,YACjB,OAAO,UAAU,YACjB,OAAO,UAAU;AAErB;AAEA,SAAS,aAAa,OAAkD;AACtE,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,CAAC,MAAM,QAAQ,KAAK;AAC5E;AAEA,SAAS,mBAAmB,OAA+B;AACzD,SAAO,MAAM,IAAI,CAAC,SAAS;AACzB,UAAM,aAAa,YAAY,IAAI;AACnC,WAAO,eAAe,SAAY,OAAO;AAAA,EAC3C,CAAC;AACH;AAEA,SAAS,oBACP,OAC2B;AAC3B,QAAM,aAAwC,CAAC;AAE/C,aAAW,CAAC,KAAK,UAAU,KAAK,OAAO,QAAQ,KAAK,GAAG;AACrD,UAAM,QAAQ,YAAY,UAAU;AACpC,QAAI,UAAU,QAAW;AACvB,iBAAW,GAAG,IAAI;AAAA,IACpB;AAAA,EACF;AAEA,SAAO;AACT;AAGO,SAAS,kBAAkB,OAAgB,YAAoB;AACpE,SACE,UAAU,SACT,OAAO,UAAU,YAAY,OAAO,UAAU,eAC/C,cAAc,SACd,OAAQ,MAAkC,UAAU,MAAM;AAE9D;AAGO,SAAS,YAAY,OAAuC;AACjE,MAAI,gBAAgB,KAAK,GAAG;AAC1B,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,QAAQ,KAAK,GAAG;AACxB,WAAO,mBAAmB,KAAK;AAAA,EACjC;AAEA,MAAI,aAAa,KAAK,GAAG;AACvB,WAAO,oBAAoB,KAAK;AAAA,EAClC;AAEA,SAAO;AACT;AAGO,SAAS,gBACd,OAC2B;AAC3B,SAAO,oBAAoB,KAAK;AAClC;AAGO,SAAS,kBACd,OACuC;AACvC,QAAM,aAAa,gBAAgB,KAAK;AACxC,SAAO,OAAO,KAAK,UAAU,EAAE,SAAS,IAAI,aAAa;AAC3D;AAGO,SAAS,iBAAiB,OAA2B;AAC1D,SAAO,YAAY,KAAK,KAAK,OAAO,KAAK;AAC3C;AAGO,SAAS,UAAU,SAA8C;AACtE,SAAO,QAAQ,SAAS,QAAQ,CAAC,YAAY,QAAQ,aAAa,CAAC,CAAC;AACtE;AAGO,SAAS,eACd,SACA,MACqB;AACrB,SAAO,QAAQ,SAAS,OAAO,CAAC,YAAY,QAAQ,SAAS,IAAI;AACnE;AAGO,SAAS,eAAe,SAA4B;AACzD,SAAO,eAAe,SAAS,QAAQ;AACzC;AAGO,SAAS,aAAa,SAA4B;AACvD,SAAO,eAAe,SAAS,MAAM;AACvC;AAGO,SAAS,kBAAkB,SAA4B;AAC5D,SAAO,eAAe,SAAS,WAAW;AAC5C;AAGO,SAAS,aAAa,SAA4B;AACvD,SAAO,eAAe,SAAS,MAAM;AACvC;AAGO,SAAS,wBACd,OACA,KACiB;AACjB,QAAM,YACJ,iBAAiB,QACb,QACA,IAAI,MAAM,OAAO,SAAS,eAAe,CAAC;AAChD,SAAO,OAAO,OAAO,WAAW;AAAA,IAC9B,gBAAgB;AAAA,EAClB,CAAC;AACH;AAGO,SAAS,uBAAuB,OAAwC;AAC7E,MACE
,SACA,OAAO,UAAU,YACjB,oBAAoB,SACpB,aAAc,MAAuC,cAAc,GACnE;AACA,WAAQ,MAAyC;AAAA,EACnD;AAEA,SAAO;AACT;AAGO,SAAS,aAAa,OAAqC;AAChE,MAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACvC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY;AAMlB,SACE,oBAAoB,UAAU,OAAO,KACrC,QAAQ,UAAU,KAAK,KACvB,OAAO,UAAU,UAAU,YAC3B,CAAC,MAAM,QAAQ,UAAU,KAAK,KAC9B,MAAM,QAAQ,UAAU,MAAM;AAElC;AAGO,SAAS,oBACd,OAC4B;AAC5B,SACE,QAAQ,KAAK,KACb,OAAO,UAAU,YACjB,UAAU,QACV,cAAc,SACd,MAAM,QAAS,MAAiC,QAAQ;AAE5D;AAGO,SAAS,wBACd,QACkC;AAClC,MACE,UACA,OAAO,WAAW,YAClB,MAAM,QAAS,OAAmC,MAAM,GACxD;AACA,WAAQ,OAAwD;AAAA,EAClE;AAEA,SAAO,CAAC;AACV;AAGO,SAAS,eAAe,OAA2C;AACxE,MAAI,iBAAiB,OAAO;AAC1B,WAAO;AAAA,MACL,MAAM,MAAM;AAAA,MACZ,SAAS,MAAM;AAAA,IACjB;AAAA,EACF;AAEA,SAAO;AAAA,IACL,MAAM;AAAA,IACN,SAAS,OAAO,KAAK;AAAA,EACvB;AACF;","names":[]}
|
package/dist/harness.mjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/harness.ts"],"sourcesContent":["export type JsonPrimitive = string | number | boolean | null;\n\nexport type JsonValue =\n | JsonPrimitive\n | JsonValue[]\n | { [key: string]: JsonValue };\n\nexport type ToolCallRecord = {\n id?: string;\n name: string;\n arguments?: Record<string, JsonValue>;\n result?: JsonValue;\n error?: {\n message: string;\n type?: string;\n [key: string]: JsonValue | undefined;\n };\n startedAt?: string;\n finishedAt?: string;\n durationMs?: number;\n metadata?: Record<string, JsonValue>;\n};\n\nexport type NormalizedMessage = {\n role: \"system\" | \"user\" | \"assistant\" | \"tool\";\n content?: JsonValue;\n toolCalls?: ToolCallRecord[];\n metadata?: Record<string, JsonValue>;\n};\n\nexport type UsageSummary = {\n provider?: string;\n model?: string;\n inputTokens?: number;\n outputTokens?: number;\n reasoningTokens?: number;\n totalTokens?: number;\n estimatedCost?: number;\n toolCalls?: number;\n retries?: number;\n metadata?: Record<string, JsonValue>;\n};\n\nexport type TimingSummary = {\n totalMs?: number;\n metadata?: Record<string, JsonValue>;\n};\n\nexport type NormalizedSession = {\n messages: NormalizedMessage[];\n outputText?: string;\n provider?: string;\n model?: string;\n metadata?: Record<string, JsonValue>;\n};\n\nexport type HarnessRun = {\n session: NormalizedSession;\n output?: JsonValue;\n usage: UsageSummary;\n timings?: TimingSummary;\n artifacts?: Record<string, JsonValue>;\n errors: Array<Record<string, JsonValue>>;\n};\n\nexport type HarnessPromptOptions = {\n system?: string;\n metadata?: Record<string, JsonValue>;\n};\n\nexport type HarnessPrompt = (\n input: string,\n options?: HarnessPromptOptions,\n) => Promise<string>;\n\nexport type HarnessRuntime = {\n prompt: HarnessPrompt;\n};\n\nexport type HarnessRunError = Error & {\n vitestEvalsRun: HarnessRun;\n};\n\nexport type HarnessMetadata = Record<string, unknown>;\n\nexport type HarnessContext<\n TMetadata extends 
HarnessMetadata = HarnessMetadata,\n> = {\n metadata: Readonly<TMetadata>;\n task: {\n meta: Record<string, unknown>;\n };\n signal?: AbortSignal;\n artifacts: Record<string, JsonValue>;\n setArtifact: (name: string, value: JsonValue) => void;\n};\n\nexport type Harness<\n TInput = unknown,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n> = {\n name: string;\n prompt?: HarnessPrompt;\n run: (\n input: TInput,\n context: HarnessContext<TMetadata>,\n ) => Promise<HarnessRun>;\n};\n\nfunction isJsonPrimitive(value: unknown): value is JsonPrimitive {\n return (\n value === null ||\n typeof value === \"string\" ||\n typeof value === \"number\" ||\n typeof value === \"boolean\"\n );\n}\n\nfunction isJsonRecord(value: unknown): value is Record<string, unknown> {\n return typeof value === \"object\" && value !== null && !Array.isArray(value);\n}\n\nfunction normalizeJsonArray(value: unknown[]): JsonValue[] {\n return value.map((item) => {\n const normalized = toJsonValue(item);\n return normalized === undefined ? null : normalized;\n });\n}\n\nfunction normalizeJsonObject(\n value: Record<string, unknown>,\n): Record<string, JsonValue> {\n const normalized: Record<string, JsonValue> = {};\n\n for (const [key, entryValue] of Object.entries(value)) {\n const entry = toJsonValue(entryValue);\n if (entry !== undefined) {\n normalized[key] = entry;\n }\n }\n\n return normalized;\n}\n\n/** Returns true when a value exposes a callable method with the given name. */\nexport function hasCallableMethod(value: unknown, methodName: string) {\n return (\n value !== null &&\n (typeof value === \"object\" || typeof value === \"function\") &&\n methodName in value &&\n typeof (value as Record<string, unknown>)[methodName] === \"function\"\n );\n}\n\n/** Normalizes an unknown value into the JSON-safe shape used by harness runs. 
*/\nexport function toJsonValue(value: unknown): JsonValue | undefined {\n if (isJsonPrimitive(value)) {\n return value;\n }\n\n if (Array.isArray(value)) {\n return normalizeJsonArray(value);\n }\n\n if (isJsonRecord(value)) {\n return normalizeJsonObject(value);\n }\n\n return undefined;\n}\n\n/** Drops non-JSON properties from a record while preserving valid values. */\nexport function normalizeRecord(\n value: Record<string, unknown>,\n): Record<string, JsonValue> {\n return normalizeJsonObject(value);\n}\n\n/** Normalizes metadata and omits the field entirely when nothing survives. */\nexport function normalizeMetadata(\n value: Record<string, unknown>,\n): Record<string, JsonValue> | undefined {\n const normalized = normalizeRecord(value);\n return Object.keys(normalized).length > 0 ? normalized : undefined;\n}\n\n/** Converts arbitrary content into the JSON-safe message content shape. */\nexport function normalizeContent(value: unknown): JsonValue {\n return toJsonValue(value) ?? String(value);\n}\n\n/** Flattens every recorded tool call from a normalized session. */\nexport function toolCalls(session: NormalizedSession): ToolCallRecord[] {\n return session.messages.flatMap((message) => message.toolCalls ?? []);\n}\n\n/** Filters normalized session messages by role. */\nexport function messagesByRole(\n session: NormalizedSession,\n role: NormalizedMessage[\"role\"],\n): NormalizedMessage[] {\n return session.messages.filter((message) => message.role === role);\n}\n\n/** Returns every normalized system message from a session. */\nexport function systemMessages(session: NormalizedSession) {\n return messagesByRole(session, \"system\");\n}\n\n/** Returns every normalized user message from a session. */\nexport function userMessages(session: NormalizedSession) {\n return messagesByRole(session, \"user\");\n}\n\n/** Returns every normalized assistant message from a session. 
*/\nexport function assistantMessages(session: NormalizedSession) {\n return messagesByRole(session, \"assistant\");\n}\n\n/** Returns every normalized tool message from a session. */\nexport function toolMessages(session: NormalizedSession) {\n return messagesByRole(session, \"tool\");\n}\n\n/** Attaches a partial or complete harness run to an arbitrary thrown error. */\nexport function attachHarnessRunToError(\n error: unknown,\n run: HarnessRun,\n): HarnessRunError {\n const baseError =\n error instanceof Error\n ? error\n : new Error(String(error ?? \"Unknown error\"));\n return Object.assign(baseError, {\n vitestEvalsRun: run,\n });\n}\n\n/** Reads an attached harness run back off a previously wrapped error value. */\nexport function getHarnessRunFromError(error: unknown): HarnessRun | undefined {\n if (\n error &&\n typeof error === \"object\" &&\n \"vitestEvalsRun\" in error &&\n isHarnessRun((error as { vitestEvalsRun?: unknown }).vitestEvalsRun)\n ) {\n return (error as { vitestEvalsRun: HarnessRun }).vitestEvalsRun;\n }\n\n return undefined;\n}\n\n/** Returns true when a value matches the normalized `HarnessRun` contract. */\nexport function isHarnessRun(value: unknown): value is HarnessRun {\n if (!value || typeof value !== \"object\") {\n return false;\n }\n\n const candidate = value as {\n session?: unknown;\n usage?: unknown;\n errors?: unknown;\n };\n\n return (\n isNormalizedSession(candidate.session) &&\n Boolean(candidate.usage) &&\n typeof candidate.usage === \"object\" &&\n !Array.isArray(candidate.usage) &&\n Array.isArray(candidate.errors)\n );\n}\n\n/** Returns true when a value matches the normalized session contract. 
*/\nexport function isNormalizedSession(\n value: unknown,\n): value is NormalizedSession {\n return (\n Boolean(value) &&\n typeof value === \"object\" &&\n value !== null &&\n \"messages\" in value &&\n Array.isArray((value as { messages?: unknown }).messages)\n );\n}\n\n/** Reuses pre-normalized harness errors when a runtime already returns them. */\nexport function resolveHarnessRunErrors(\n result: unknown,\n): Array<Record<string, JsonValue>> {\n if (\n result &&\n typeof result === \"object\" &&\n Array.isArray((result as Record<string, unknown>).errors)\n ) {\n return (result as { errors: Array<Record<string, JsonValue>> }).errors;\n }\n\n return [];\n}\n\n/** Serializes an arbitrary thrown value into the normalized error shape. */\nexport function serializeError(error: unknown): Record<string, JsonValue> {\n if (error instanceof Error) {\n return {\n type: error.name,\n message: error.message,\n };\n }\n\n return {\n type: \"Error\",\n message: String(error),\n };\n}\n"],"mappings":";AA6GA,SAAS,gBAAgB,OAAwC;AAC/D,SACE,UAAU,QACV,OAAO,UAAU,YACjB,OAAO,UAAU,YACjB,OAAO,UAAU;AAErB;AAEA,SAAS,aAAa,OAAkD;AACtE,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,CAAC,MAAM,QAAQ,KAAK;AAC5E;AAEA,SAAS,mBAAmB,OAA+B;AACzD,SAAO,MAAM,IAAI,CAAC,SAAS;AACzB,UAAM,aAAa,YAAY,IAAI;AACnC,WAAO,eAAe,SAAY,OAAO;AAAA,EAC3C,CAAC;AACH;AAEA,SAAS,oBACP,OAC2B;AAC3B,QAAM,aAAwC,CAAC;AAE/C,aAAW,CAAC,KAAK,UAAU,KAAK,OAAO,QAAQ,KAAK,GAAG;AACrD,UAAM,QAAQ,YAAY,UAAU;AACpC,QAAI,UAAU,QAAW;AACvB,iBAAW,GAAG,IAAI;AAAA,IACpB;AAAA,EACF;AAEA,SAAO;AACT;AAGO,SAAS,kBAAkB,OAAgB,YAAoB;AACpE,SACE,UAAU,SACT,OAAO,UAAU,YAAY,OAAO,UAAU,eAC/C,cAAc,SACd,OAAQ,MAAkC,UAAU,MAAM;AAE9D;AAGO,SAAS,YAAY,OAAuC;AACjE,MAAI,gBAAgB,KAAK,GAAG;AAC1B,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,QAAQ,KAAK,GAAG;AACxB,WAAO,mBAAmB,KAAK;AAAA,EACjC;AAEA,MAAI,aAAa,KAAK,GAAG;AACvB,WAAO,oBAAoB,KAAK;AAAA,EAClC;AAEA,SAAO;AACT;AAGO,SAAS,gBACd,OAC2B;AAC3B,SAAO,oBAAoB,KAAK;AAClC;AAGO,SAAS,kBACd,OACuC;AACvC,QAAM,aAAa,gBAAgB,KAAK;AACxC,SAAO,OAAO,KAAK,UAAU,EAAE,SAAS,IAAI,aAAa;AAC3D;AAGO,SAAS,
iBAAiB,OAA2B;AAC1D,SAAO,YAAY,KAAK,KAAK,OAAO,KAAK;AAC3C;AAGO,SAAS,UAAU,SAA8C;AACtE,SAAO,QAAQ,SAAS,QAAQ,CAAC,YAAY,QAAQ,aAAa,CAAC,CAAC;AACtE;AAGO,SAAS,eACd,SACA,MACqB;AACrB,SAAO,QAAQ,SAAS,OAAO,CAAC,YAAY,QAAQ,SAAS,IAAI;AACnE;AAGO,SAAS,eAAe,SAA4B;AACzD,SAAO,eAAe,SAAS,QAAQ;AACzC;AAGO,SAAS,aAAa,SAA4B;AACvD,SAAO,eAAe,SAAS,MAAM;AACvC;AAGO,SAAS,kBAAkB,SAA4B;AAC5D,SAAO,eAAe,SAAS,WAAW;AAC5C;AAGO,SAAS,aAAa,SAA4B;AACvD,SAAO,eAAe,SAAS,MAAM;AACvC;AAGO,SAAS,wBACd,OACA,KACiB;AACjB,QAAM,YACJ,iBAAiB,QACb,QACA,IAAI,MAAM,OAAO,SAAS,eAAe,CAAC;AAChD,SAAO,OAAO,OAAO,WAAW;AAAA,IAC9B,gBAAgB;AAAA,EAClB,CAAC;AACH;AAGO,SAAS,uBAAuB,OAAwC;AAC7E,MACE,SACA,OAAO,UAAU,YACjB,oBAAoB,SACpB,aAAc,MAAuC,cAAc,GACnE;AACA,WAAQ,MAAyC;AAAA,EACnD;AAEA,SAAO;AACT;AAGO,SAAS,aAAa,OAAqC;AAChE,MAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACvC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY;AAMlB,SACE,oBAAoB,UAAU,OAAO,KACrC,QAAQ,UAAU,KAAK,KACvB,OAAO,UAAU,UAAU,YAC3B,CAAC,MAAM,QAAQ,UAAU,KAAK,KAC9B,MAAM,QAAQ,UAAU,MAAM;AAElC;AAGO,SAAS,oBACd,OAC4B;AAC5B,SACE,QAAQ,KAAK,KACb,OAAO,UAAU,YACjB,UAAU,QACV,cAAc,SACd,MAAM,QAAS,MAAiC,QAAQ;AAE5D;AAGO,SAAS,wBACd,QACkC;AAClC,MACE,UACA,OAAO,WAAW,YAClB,MAAM,QAAS,OAAmC,MAAM,GACxD;AACA,WAAQ,OAAwD;AAAA,EAClE;AAEA,SAAO,CAAC;AACV;AAGO,SAAS,eAAe,OAA2C;AACxE,MAAI,iBAAiB,OAAO;AAC1B,WAAO;AAAA,MACL,MAAM,MAAM;AAAA,MACZ,SAAS,MAAM;AAAA,IACjB;AAAA,EACF;AAEA,SAAO;AAAA,IACL,MAAM;AAAA,IACN,SAAS,OAAO,KAAK;AAAA,EACvB;AACF;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/harness.ts"],"sourcesContent":["export type JsonPrimitive = string | number | boolean | null;\n\nexport type JsonValue =\n | JsonPrimitive\n | JsonValue[]\n | { [key: string]: JsonValue };\n\nexport type ToolCallRecord = {\n id?: string;\n name: string;\n arguments?: Record<string, JsonValue>;\n result?: JsonValue;\n error?: {\n message: string;\n type?: string;\n [key: string]: JsonValue | undefined;\n };\n startedAt?: string;\n finishedAt?: string;\n durationMs?: number;\n metadata?: Record<string, JsonValue>;\n};\n\nexport type NormalizedMessage = {\n role: \"system\" | \"user\" | \"assistant\" | \"tool\";\n content?: JsonValue;\n toolCalls?: ToolCallRecord[];\n metadata?: Record<string, JsonValue>;\n};\n\nexport type UsageSummary = {\n provider?: string;\n model?: string;\n inputTokens?: number;\n outputTokens?: number;\n reasoningTokens?: number;\n totalTokens?: number;\n estimatedCost?: number;\n toolCalls?: number;\n retries?: number;\n metadata?: Record<string, JsonValue>;\n};\n\nexport type TimingSummary = {\n totalMs?: number;\n metadata?: Record<string, JsonValue>;\n};\n\nexport type NormalizedSession = {\n messages: NormalizedMessage[];\n outputText?: string;\n provider?: string;\n model?: string;\n metadata?: Record<string, JsonValue>;\n};\n\nexport type HarnessRun = {\n session: NormalizedSession;\n output?: JsonValue;\n usage: UsageSummary;\n timings?: TimingSummary;\n artifacts?: Record<string, JsonValue>;\n errors: Array<Record<string, JsonValue>>;\n};\n\n/** Optional provider-facing hints for harness prompt calls. */\nexport type HarnessPromptOptions = {\n system?: string;\n metadata?: Record<string, JsonValue>;\n};\n\n/** Provider-agnostic prompt seam that judges can reuse from a harness. 
*/\nexport type HarnessPrompt = (\n input: string,\n options?: HarnessPromptOptions,\n) => Promise<string>;\n\nexport type HarnessRunError = Error & {\n vitestEvalsRun: HarnessRun;\n};\n\nexport type HarnessMetadata = Record<string, unknown>;\n\nexport type HarnessContext<\n TMetadata extends HarnessMetadata = HarnessMetadata,\n> = {\n metadata: Readonly<TMetadata>;\n task: {\n meta: Record<string, unknown>;\n };\n signal?: AbortSignal;\n artifacts: Record<string, JsonValue>;\n setArtifact: (name: string, value: JsonValue) => void;\n};\n\nexport type Harness<\n TInput = unknown,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n> = {\n name: string;\n /** Prompt seam reused by LLM-backed judges. */\n prompt: HarnessPrompt;\n run: (\n input: TInput,\n context: HarnessContext<TMetadata>,\n ) => Promise<HarnessRun>;\n};\n\nfunction isJsonPrimitive(value: unknown): value is JsonPrimitive {\n return (\n value === null ||\n typeof value === \"string\" ||\n typeof value === \"number\" ||\n typeof value === \"boolean\"\n );\n}\n\nfunction isJsonRecord(value: unknown): value is Record<string, unknown> {\n return typeof value === \"object\" && value !== null && !Array.isArray(value);\n}\n\nfunction normalizeJsonArray(value: unknown[]): JsonValue[] {\n return value.map((item) => {\n const normalized = toJsonValue(item);\n return normalized === undefined ? null : normalized;\n });\n}\n\nfunction normalizeJsonObject(\n value: Record<string, unknown>,\n): Record<string, JsonValue> {\n const normalized: Record<string, JsonValue> = {};\n\n for (const [key, entryValue] of Object.entries(value)) {\n const entry = toJsonValue(entryValue);\n if (entry !== undefined) {\n normalized[key] = entry;\n }\n }\n\n return normalized;\n}\n\n/** Returns true when a value exposes a callable method with the given name. 
*/\nexport function hasCallableMethod(value: unknown, methodName: string) {\n return (\n value !== null &&\n (typeof value === \"object\" || typeof value === \"function\") &&\n methodName in value &&\n typeof (value as Record<string, unknown>)[methodName] === \"function\"\n );\n}\n\n/** Normalizes an unknown value into the JSON-safe shape used by harness runs. */\nexport function toJsonValue(value: unknown): JsonValue | undefined {\n if (isJsonPrimitive(value)) {\n return value;\n }\n\n if (Array.isArray(value)) {\n return normalizeJsonArray(value);\n }\n\n if (isJsonRecord(value)) {\n return normalizeJsonObject(value);\n }\n\n return undefined;\n}\n\n/** Drops non-JSON properties from a record while preserving valid values. */\nexport function normalizeRecord(\n value: Record<string, unknown>,\n): Record<string, JsonValue> {\n return normalizeJsonObject(value);\n}\n\n/** Normalizes metadata and omits the field entirely when nothing survives. */\nexport function normalizeMetadata(\n value: Record<string, unknown>,\n): Record<string, JsonValue> | undefined {\n const normalized = normalizeRecord(value);\n return Object.keys(normalized).length > 0 ? normalized : undefined;\n}\n\n/** Converts arbitrary content into the JSON-safe message content shape. */\nexport function normalizeContent(value: unknown): JsonValue {\n return toJsonValue(value) ?? String(value);\n}\n\n/** Flattens every recorded tool call from a normalized session. */\nexport function toolCalls(session: NormalizedSession): ToolCallRecord[] {\n return session.messages.flatMap((message) => message.toolCalls ?? []);\n}\n\n/** Filters normalized session messages by role. */\nexport function messagesByRole(\n session: NormalizedSession,\n role: NormalizedMessage[\"role\"],\n): NormalizedMessage[] {\n return session.messages.filter((message) => message.role === role);\n}\n\n/** Returns every normalized system message from a session. 
*/\nexport function systemMessages(session: NormalizedSession) {\n return messagesByRole(session, \"system\");\n}\n\n/** Returns every normalized user message from a session. */\nexport function userMessages(session: NormalizedSession) {\n return messagesByRole(session, \"user\");\n}\n\n/** Returns every normalized assistant message from a session. */\nexport function assistantMessages(session: NormalizedSession) {\n return messagesByRole(session, \"assistant\");\n}\n\n/** Returns every normalized tool message from a session. */\nexport function toolMessages(session: NormalizedSession) {\n return messagesByRole(session, \"tool\");\n}\n\n/** Attaches a partial or complete harness run to an arbitrary thrown error. */\nexport function attachHarnessRunToError(\n error: unknown,\n run: HarnessRun,\n): HarnessRunError {\n const baseError =\n error instanceof Error\n ? error\n : new Error(String(error ?? \"Unknown error\"));\n return Object.assign(baseError, {\n vitestEvalsRun: run,\n });\n}\n\n/** Reads an attached harness run back off a previously wrapped error value. */\nexport function getHarnessRunFromError(error: unknown): HarnessRun | undefined {\n if (\n error &&\n typeof error === \"object\" &&\n \"vitestEvalsRun\" in error &&\n isHarnessRun((error as { vitestEvalsRun?: unknown }).vitestEvalsRun)\n ) {\n return (error as { vitestEvalsRun: HarnessRun }).vitestEvalsRun;\n }\n\n return undefined;\n}\n\n/** Returns true when a value matches the normalized `HarnessRun` contract. 
*/\nexport function isHarnessRun(value: unknown): value is HarnessRun {\n if (!value || typeof value !== \"object\") {\n return false;\n }\n\n const candidate = value as {\n session?: unknown;\n usage?: unknown;\n errors?: unknown;\n };\n\n return (\n isNormalizedSession(candidate.session) &&\n Boolean(candidate.usage) &&\n typeof candidate.usage === \"object\" &&\n !Array.isArray(candidate.usage) &&\n Array.isArray(candidate.errors)\n );\n}\n\n/** Returns true when a value matches the normalized session contract. */\nexport function isNormalizedSession(\n value: unknown,\n): value is NormalizedSession {\n return (\n Boolean(value) &&\n typeof value === \"object\" &&\n value !== null &&\n \"messages\" in value &&\n Array.isArray((value as { messages?: unknown }).messages)\n );\n}\n\n/** Reuses pre-normalized harness errors when a runtime already returns them. */\nexport function resolveHarnessRunErrors(\n result: unknown,\n): Array<Record<string, JsonValue>> {\n if (\n result &&\n typeof result === \"object\" &&\n Array.isArray((result as Record<string, unknown>).errors)\n ) {\n return (result as { errors: Array<Record<string, JsonValue>> }).errors;\n }\n\n return [];\n}\n\n/** Serializes an arbitrary thrown value into the normalized error shape. 
*/\nexport function serializeError(error: unknown): Record<string, JsonValue> {\n if (error instanceof Error) {\n return {\n type: error.name,\n message: error.message,\n };\n }\n\n return {\n type: \"Error\",\n message: String(error),\n };\n}\n"],"mappings":";AA4GA,SAAS,gBAAgB,OAAwC;AAC/D,SACE,UAAU,QACV,OAAO,UAAU,YACjB,OAAO,UAAU,YACjB,OAAO,UAAU;AAErB;AAEA,SAAS,aAAa,OAAkD;AACtE,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,CAAC,MAAM,QAAQ,KAAK;AAC5E;AAEA,SAAS,mBAAmB,OAA+B;AACzD,SAAO,MAAM,IAAI,CAAC,SAAS;AACzB,UAAM,aAAa,YAAY,IAAI;AACnC,WAAO,eAAe,SAAY,OAAO;AAAA,EAC3C,CAAC;AACH;AAEA,SAAS,oBACP,OAC2B;AAC3B,QAAM,aAAwC,CAAC;AAE/C,aAAW,CAAC,KAAK,UAAU,KAAK,OAAO,QAAQ,KAAK,GAAG;AACrD,UAAM,QAAQ,YAAY,UAAU;AACpC,QAAI,UAAU,QAAW;AACvB,iBAAW,GAAG,IAAI;AAAA,IACpB;AAAA,EACF;AAEA,SAAO;AACT;AAGO,SAAS,kBAAkB,OAAgB,YAAoB;AACpE,SACE,UAAU,SACT,OAAO,UAAU,YAAY,OAAO,UAAU,eAC/C,cAAc,SACd,OAAQ,MAAkC,UAAU,MAAM;AAE9D;AAGO,SAAS,YAAY,OAAuC;AACjE,MAAI,gBAAgB,KAAK,GAAG;AAC1B,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,QAAQ,KAAK,GAAG;AACxB,WAAO,mBAAmB,KAAK;AAAA,EACjC;AAEA,MAAI,aAAa,KAAK,GAAG;AACvB,WAAO,oBAAoB,KAAK;AAAA,EAClC;AAEA,SAAO;AACT;AAGO,SAAS,gBACd,OAC2B;AAC3B,SAAO,oBAAoB,KAAK;AAClC;AAGO,SAAS,kBACd,OACuC;AACvC,QAAM,aAAa,gBAAgB,KAAK;AACxC,SAAO,OAAO,KAAK,UAAU,EAAE,SAAS,IAAI,aAAa;AAC3D;AAGO,SAAS,iBAAiB,OAA2B;AAC1D,SAAO,YAAY,KAAK,KAAK,OAAO,KAAK;AAC3C;AAGO,SAAS,UAAU,SAA8C;AACtE,SAAO,QAAQ,SAAS,QAAQ,CAAC,YAAY,QAAQ,aAAa,CAAC,CAAC;AACtE;AAGO,SAAS,eACd,SACA,MACqB;AACrB,SAAO,QAAQ,SAAS,OAAO,CAAC,YAAY,QAAQ,SAAS,IAAI;AACnE;AAGO,SAAS,eAAe,SAA4B;AACzD,SAAO,eAAe,SAAS,QAAQ;AACzC;AAGO,SAAS,aAAa,SAA4B;AACvD,SAAO,eAAe,SAAS,MAAM;AACvC;AAGO,SAAS,kBAAkB,SAA4B;AAC5D,SAAO,eAAe,SAAS,WAAW;AAC5C;AAGO,SAAS,aAAa,SAA4B;AACvD,SAAO,eAAe,SAAS,MAAM;AACvC;AAGO,SAAS,wBACd,OACA,KACiB;AACjB,QAAM,YACJ,iBAAiB,QACb,QACA,IAAI,MAAM,OAAO,SAAS,eAAe,CAAC;AAChD,SAAO,OAAO,OAAO,WAAW;AAAA,IAC9B,gBAAgB;AAAA,EAClB,CAAC;AACH;AAGO,SAAS,uBAAuB,OAAwC;AAC7E,MACE,SACA,OAAO,UAAU,YACjB,oBAAoB,SACpB,aAAc,MAAuC,cAAc,GACnE;AACA,WAAQ,MAAyC;AAAA,EACnD;AAEA,SAAO;AACT;AAGO,SAAS,aAAa,OAAqC;AACh
E,MAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACvC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY;AAMlB,SACE,oBAAoB,UAAU,OAAO,KACrC,QAAQ,UAAU,KAAK,KACvB,OAAO,UAAU,UAAU,YAC3B,CAAC,MAAM,QAAQ,UAAU,KAAK,KAC9B,MAAM,QAAQ,UAAU,MAAM;AAElC;AAGO,SAAS,oBACd,OAC4B;AAC5B,SACE,QAAQ,KAAK,KACb,OAAO,UAAU,YACjB,UAAU,QACV,cAAc,SACd,MAAM,QAAS,MAAiC,QAAQ;AAE5D;AAGO,SAAS,wBACd,QACkC;AAClC,MACE,UACA,OAAO,WAAW,YAClB,MAAM,QAAS,OAAmC,MAAM,GACxD;AACA,WAAQ,OAAwD;AAAA,EAClE;AAEA,SAAO,CAAC;AACV;AAGO,SAAS,eAAe,OAA2C;AACxE,MAAI,iBAAiB,OAAO;AAC1B,WAAO;AAAA,MACL,MAAM,MAAM;AAAA,MACZ,SAAS,MAAM;AAAA,IACjB;AAAA,EACF;AAEA,SAAO;AAAA,IACL,MAAM;AAAA,IACN,SAAS,OAAO,KAAK;AAAA,EACvB;AACF;","names":[]}
|
package/dist/index.d.mts
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import * as vitest from 'vitest';
|
|
2
2
|
import { TestAPI } from 'vitest';
|
|
3
3
|
import { HarnessMetadata, ToolCallRecord, HarnessRun, Harness } from './harness.mjs';
|
|
4
|
-
export { HarnessContext, HarnessRunError, JsonPrimitive, JsonValue, NormalizedMessage, NormalizedSession, TimingSummary, UsageSummary, assistantMessages, attachHarnessRunToError, getHarnessRunFromError, messagesByRole, systemMessages, toolCalls, toolMessages, userMessages } from './harness.mjs';
|
|
5
|
-
import {
|
|
4
|
+
export { HarnessContext, HarnessPrompt, HarnessPromptOptions, HarnessRunError, JsonPrimitive, JsonValue, NormalizedMessage, NormalizedSession, TimingSummary, UsageSummary, assistantMessages, attachHarnessRunToError, getHarnessRunFromError, messagesByRole, systemMessages, toolCalls, toolMessages, userMessages } from './harness.mjs';
|
|
5
|
+
import { JudgeContext, JudgeFn, JudgeResult } from './judges/types.mjs';
|
|
6
|
+
export { JudgeOptions } from './judges/types.mjs';
|
|
6
7
|
export { wrapText } from './wrapText.mjs';
|
|
7
8
|
export { StructuredOutputJudge, StructuredOutputJudgeConfig, StructuredOutputJudgeOptions } from './judges/structuredOutputJudge.mjs';
|
|
8
9
|
export { ToolCallJudge, ToolCallJudgeConfig, ToolCallJudgeOptions } from './judges/toolCallJudge.mjs';
|
|
@@ -26,52 +27,60 @@ type EvalTaskMeta = {
|
|
|
26
27
|
run: HarnessRun;
|
|
27
28
|
};
|
|
28
29
|
};
|
|
30
|
+
type HarnessInput<THarness extends Harness<any, any>> = THarness extends Harness<infer TInput, any> ? TInput : unknown;
|
|
31
|
+
type HarnessMetadataFor<THarness extends Harness<any, any>> = THarness extends Harness<any, infer TMetadata> ? TMetadata : HarnessMetadata;
|
|
32
|
+
declare const evalHarnessRunBrand: unique symbol;
|
|
33
|
+
/** Harness run returned by the fixture-backed `run(...)` API. */
|
|
34
|
+
type EvalHarnessRun<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, THarness extends Harness<TInput, TMetadata> = Harness<TInput, TMetadata>> = HarnessRun & {
|
|
35
|
+
readonly [evalHarnessRunBrand]: {
|
|
36
|
+
readonly input: TInput;
|
|
37
|
+
readonly metadata: TMetadata;
|
|
38
|
+
readonly harness: THarness;
|
|
39
|
+
};
|
|
40
|
+
};
|
|
29
41
|
/** Per-run metadata forwarded to the harness alongside the test input. */
|
|
30
42
|
interface EvalRunOptions<TMetadata extends HarnessMetadata = HarnessMetadata> {
|
|
31
43
|
metadata?: TMetadata;
|
|
32
44
|
}
|
|
33
45
|
/** Explicit harness execution primitive exposed to each eval test. */
|
|
34
|
-
type EvalRun<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata> = (input: TInput, options?: EvalRunOptions<TMetadata>) => Promise<
|
|
46
|
+
type EvalRun<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, THarness extends Harness<TInput, TMetadata> = Harness<TInput, TMetadata>> = (input: TInput, options?: EvalRunOptions<TMetadata>) => Promise<EvalHarnessRun<TInput, TMetadata, THarness>>;
|
|
35
47
|
/** Fixture-backed Vitest context exposed inside `describeEval(...)` tests. */
|
|
36
|
-
interface EvalTestContext<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata> {
|
|
37
|
-
run: EvalRun<TInput, TMetadata>;
|
|
48
|
+
interface EvalTestContext<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, THarness extends Harness<TInput, TMetadata> = Harness<TInput, TMetadata>> {
|
|
49
|
+
run: EvalRun<TInput, TMetadata, THarness>;
|
|
38
50
|
}
|
|
39
|
-
type EvalTestAPI<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata> = TestAPI<EvalTestContext<TInput, TMetadata>>;
|
|
40
|
-
/**
|
|
41
|
-
* Compatibility alias for harness-backed judge inputs.
|
|
42
|
-
*
|
|
43
|
-
* New custom judges should prefer `JudgeContext` directly. This alias remains
|
|
44
|
-
* for older imports that were already using the harness-backed judge shape.
|
|
45
|
-
*/
|
|
46
|
-
type HarnessJudgeOptions<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata> = JudgeContext<TInput, TMetadata>;
|
|
51
|
+
type EvalTestAPI<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, THarness extends Harness<TInput, TMetadata> = Harness<TInput, TMetadata>> = TestAPI<EvalTestContext<TInput, TMetadata, THarness>>;
|
|
47
52
|
/** Suite-level configuration for a harness-backed eval block. */
|
|
48
|
-
interface DescribeEvalOptions<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata> {
|
|
53
|
+
interface DescribeEvalOptions<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, THarness extends Harness<TInput, TMetadata> = Harness<TInput, TMetadata>> {
|
|
49
54
|
/** Harness used for every explicit `run(...)` call in the suite. */
|
|
50
|
-
harness:
|
|
55
|
+
harness: THarness;
|
|
51
56
|
/** Automatic judges applied after each successful `run(...)`. */
|
|
52
|
-
judges?: Array<JudgeFn<
|
|
57
|
+
judges?: Array<JudgeFn<JudgeContext<TInput, TMetadata, THarness>>>;
|
|
53
58
|
/** Passing threshold for automatic suite-level judges. `null` disables fail-on-score. */
|
|
54
59
|
judgeThreshold?: number | null;
|
|
55
60
|
skipIf?: () => boolean;
|
|
56
61
|
}
|
|
57
|
-
type JudgeAssertionInputValue<TJudgeOptions extends
|
|
62
|
+
type JudgeAssertionInputValue<TJudgeOptions extends JudgeContext<any, any, any>> = TJudgeOptions extends {
|
|
58
63
|
inputValue: infer TInput;
|
|
59
64
|
} ? TInput : unknown;
|
|
60
|
-
type JudgeAssertionMetadata<TJudgeOptions extends
|
|
65
|
+
type JudgeAssertionMetadata<TJudgeOptions extends JudgeContext<any, any, any>> = TJudgeOptions extends {
|
|
61
66
|
metadata: infer TMetadata;
|
|
62
67
|
} ? TMetadata : HarnessMetadata;
|
|
68
|
+
type JudgeAssertionHarness<TJudgeOptions extends JudgeContext<any, any, any>> = TJudgeOptions extends {
|
|
69
|
+
harness: infer THarness;
|
|
70
|
+
} ? Exclude<THarness, undefined> : Harness<JudgeAssertionInputValue<TJudgeOptions>, JudgeAssertionMetadata<TJudgeOptions>>;
|
|
63
71
|
/** Optional overrides passed to `expect(...).toSatisfyJudge(...)`. */
|
|
64
|
-
type JudgeAssertionOptions<TJudgeOptions extends
|
|
72
|
+
type JudgeAssertionOptions<TJudgeOptions extends JudgeContext<any, any, any> = JudgeContext> = Partial<Omit<TJudgeOptions, "input" | "output" | "inputValue" | "metadata" | "toolCalls" | "run" | "session" | "harness">> & {
|
|
65
73
|
input?: string;
|
|
66
74
|
inputValue?: JudgeAssertionInputValue<TJudgeOptions>;
|
|
67
75
|
metadata?: JudgeAssertionMetadata<TJudgeOptions>;
|
|
68
76
|
toolCalls?: ToolCallRecord[];
|
|
69
77
|
run?: HarnessRun;
|
|
70
78
|
session?: HarnessRun["session"];
|
|
79
|
+
harness?: JudgeAssertionHarness<TJudgeOptions>;
|
|
71
80
|
/** Passing threshold for the explicit matcher. `null` records the score without failing. */
|
|
72
81
|
threshold?: number | null;
|
|
73
82
|
};
|
|
74
|
-
type ToSatisfyJudge<
|
|
83
|
+
type ToSatisfyJudge<TReceived = unknown> = <TJudgeOptions extends JudgeContext<any, any, any> = JudgeContext>(judge: JudgeFn<TJudgeOptions>, options?: JudgeAssertionOptions<TJudgeOptions>) => Promise<TReceived>;
|
|
75
84
|
interface EvalMatchers<R = unknown> {
|
|
76
85
|
toSatisfyJudge: ToSatisfyJudge<R>;
|
|
77
86
|
}
|
|
@@ -91,6 +100,7 @@ declare module "vitest" {
|
|
|
91
100
|
* describeEval("refund agent", {
|
|
92
101
|
* harness: piAiHarness({
|
|
93
102
|
* createAgent: () => createRefundAgent(),
|
|
103
|
+
* prompt: judgePrompt,
|
|
94
104
|
* }),
|
|
95
105
|
* judges: [ToolCallJudge()],
|
|
96
106
|
* }, (it) => {
|
|
@@ -104,12 +114,12 @@ declare module "vitest" {
|
|
|
104
114
|
* });
|
|
105
115
|
* ```
|
|
106
116
|
*/
|
|
107
|
-
declare function describeEval<
|
|
117
|
+
declare function describeEval<THarness extends Harness<any, any>>(name: string, options: DescribeEvalOptions<HarnessInput<THarness>, HarnessMetadataFor<THarness>, THarness>, define: (it: EvalTestAPI<HarnessInput<THarness>, HarnessMetadataFor<THarness>, THarness>) => void): vitest.SuiteCollector<object>;
|
|
108
118
|
/** Formats judge results for reporter and assertion output. */
|
|
109
119
|
declare function formatScores(scores: (JudgeResult & {
|
|
110
120
|
name: string;
|
|
111
121
|
})[]): string;
|
|
112
122
|
/** Applies a stable display name to a custom judge function. */
|
|
113
|
-
declare function namedJudge<TOptions extends
|
|
123
|
+
declare function namedJudge<TOptions extends JudgeContext<any, any, any>>(name: string, judge: JudgeFn<TOptions>): JudgeFn<TOptions>;
|
|
114
124
|
|
|
115
|
-
export {
|
|
125
|
+
export { type DescribeEvalOptions, type EvalHarnessRun, type EvalMatchers, type EvalRun, type EvalRunOptions, type EvalTestAPI, type EvalTestContext, Harness, HarnessMetadata, HarnessRun, type JudgeAssertionOptions, JudgeContext, JudgeFn, JudgeResult, type ToSatisfyJudge, ToolCallRecord, describeEval, formatScores, namedJudge };
|