vitest-evals 0.9.0-beta.6 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -14
- package/dist/harness.d.mts +329 -19
- package/dist/harness.d.ts +329 -19
- package/dist/harness.js.map +1 -1
- package/dist/harness.mjs.map +1 -1
- package/dist/index.d.mts +155 -12
- package/dist/index.d.ts +155 -12
- package/dist/index.js.map +1 -1
- package/dist/index.mjs.map +1 -1
- package/dist/internal/matchers.d.mts +41 -3
- package/dist/internal/matchers.d.ts +41 -3
- package/dist/internal/matchers.js.map +1 -1
- package/dist/internal/matchers.mjs.map +1 -1
- package/dist/internal/structuredOutputScorer.d.mts +4 -0
- package/dist/internal/structuredOutputScorer.d.ts +4 -0
- package/dist/internal/structuredOutputScorer.js.map +1 -1
- package/dist/internal/structuredOutputScorer.mjs.map +1 -1
- package/dist/internal/toolCallScorer.d.mts +6 -0
- package/dist/internal/toolCallScorer.d.ts +6 -0
- package/dist/internal/toolCallScorer.js.map +1 -1
- package/dist/internal/toolCallScorer.mjs.map +1 -1
- package/dist/judges/index.d.mts +2 -2
- package/dist/judges/index.d.ts +2 -2
- package/dist/judges/index.js.map +1 -1
- package/dist/judges/index.mjs.map +1 -1
- package/dist/judges/structuredOutputJudge.d.mts +54 -4
- package/dist/judges/structuredOutputJudge.d.ts +54 -4
- package/dist/judges/structuredOutputJudge.js.map +1 -1
- package/dist/judges/structuredOutputJudge.mjs.map +1 -1
- package/dist/judges/toolCallJudge.d.mts +56 -6
- package/dist/judges/toolCallJudge.d.ts +56 -6
- package/dist/judges/toolCallJudge.js.map +1 -1
- package/dist/judges/toolCallJudge.mjs.map +1 -1
- package/dist/judges/types.d.mts +68 -3
- package/dist/judges/types.d.ts +68 -3
- package/dist/judges/types.js.map +1 -1
- package/dist/legacy/scorers/index.js.map +1 -1
- package/dist/legacy/scorers/index.mjs.map +1 -1
- package/dist/legacy/scorers/structuredOutputScorer.js.map +1 -1
- package/dist/legacy/scorers/structuredOutputScorer.mjs.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
- package/dist/legacy/scorers/utils.js.map +1 -1
- package/dist/legacy/scorers/utils.mjs.map +1 -1
- package/dist/legacy.js.map +1 -1
- package/dist/legacy.mjs.map +1 -1
- package/dist/reporter.js.map +1 -1
- package/dist/reporter.mjs.map +1 -1
- package/package.json +13 -1
package/README.md
CHANGED
|
@@ -153,7 +153,7 @@ Use Vitest JSON as the eval report artifact. It preserves the `meta` field that
|
|
|
153
153
|
contains eval scores and normalized harness runs.
|
|
154
154
|
|
|
155
155
|
```sh
|
|
156
|
-
vitest run evals \
|
|
156
|
+
vitest run --config vitest.evals.config.ts \
|
|
157
157
|
--reporter=vitest-evals/reporter \
|
|
158
158
|
--reporter=json \
|
|
159
159
|
--outputFile.json=vitest-results.json
|
|
@@ -389,16 +389,3 @@ When you only need deterministic contract checks, built-ins such as
|
|
|
389
389
|
`StructuredOutputJudge()` and `ToolCallJudge()` are still available. The primary
|
|
390
390
|
documentation examples intentionally use factuality/rubric judges because those
|
|
391
391
|
match the product's LLM-as-a-judge direction.
|
|
392
|
-
|
|
393
|
-
## Legacy Compatibility
|
|
394
|
-
|
|
395
|
-
The root package is harness-first and judge-first. Legacy scorer-first suites
|
|
396
|
-
and `evaluate(...)` live under `vitest-evals/legacy`.
|
|
397
|
-
|
|
398
|
-
```ts
|
|
399
|
-
import {
|
|
400
|
-
describeEval,
|
|
401
|
-
StructuredOutputScorer,
|
|
402
|
-
ToolCallScorer,
|
|
403
|
-
} from "vitest-evals/legacy";
|
|
404
|
-
```
|
package/dist/harness.d.mts
CHANGED
|
@@ -4,51 +4,128 @@ type JsonPrimitive = string | number | boolean | null;
|
|
|
4
4
|
type JsonValue = JsonPrimitive | JsonValue[] | {
|
|
5
5
|
[key: string]: JsonValue;
|
|
6
6
|
};
|
|
7
|
-
/**
|
|
7
|
+
/**
|
|
8
|
+
* Normalized record for one tool call observed during a harness run.
|
|
9
|
+
*
|
|
10
|
+
* @example
|
|
11
|
+
* ```ts
|
|
12
|
+
* const call: ToolCallRecord = {
|
|
13
|
+
* name: "lookupInvoice",
|
|
14
|
+
* arguments: { invoiceId: "inv_123" },
|
|
15
|
+
* result: { refundable: true },
|
|
16
|
+
* };
|
|
17
|
+
* ```
|
|
18
|
+
*/
|
|
8
19
|
type ToolCallRecord = {
|
|
20
|
+
/** Provider or runtime tool-call id when one is available. */
|
|
9
21
|
id?: string;
|
|
22
|
+
/** Tool name as exposed to the agent or application runtime. */
|
|
10
23
|
name: string;
|
|
24
|
+
/** JSON-safe tool arguments after provider/runtime normalization. */
|
|
11
25
|
arguments?: Record<string, JsonValue>;
|
|
26
|
+
/** JSON-safe tool result returned by the application tool. */
|
|
12
27
|
result?: JsonValue;
|
|
28
|
+
/** Normalized tool error when execution failed. */
|
|
13
29
|
error?: {
|
|
14
30
|
message: string;
|
|
15
31
|
type?: string;
|
|
16
32
|
[key: string]: JsonValue | undefined;
|
|
17
33
|
};
|
|
34
|
+
/** ISO timestamp for the start of tool execution. */
|
|
18
35
|
startedAt?: string;
|
|
36
|
+
/** ISO timestamp for the end of tool execution. */
|
|
19
37
|
finishedAt?: string;
|
|
38
|
+
/** Tool execution duration in milliseconds. */
|
|
20
39
|
durationMs?: number;
|
|
40
|
+
/** Extra JSON-safe tool metadata for reporters and custom judges. */
|
|
21
41
|
metadata?: Record<string, JsonValue>;
|
|
22
42
|
};
|
|
23
|
-
/**
|
|
43
|
+
/**
|
|
44
|
+
* Normalized message recorded in a harness session transcript.
|
|
45
|
+
*
|
|
46
|
+
* @example
|
|
47
|
+
* ```ts
|
|
48
|
+
* const message: NormalizedMessage = {
|
|
49
|
+
* role: "assistant",
|
|
50
|
+
* content: { status: "approved" },
|
|
51
|
+
* toolCalls: [{ name: "lookupInvoice" }],
|
|
52
|
+
* };
|
|
53
|
+
* ```
|
|
54
|
+
*/
|
|
24
55
|
type NormalizedMessage = {
|
|
56
|
+
/** Transcript role for the normalized message. */
|
|
25
57
|
role: "system" | "user" | "assistant" | "tool";
|
|
58
|
+
/** JSON-safe message content. */
|
|
26
59
|
content?: JsonValue;
|
|
60
|
+
/** Tool calls associated with this message. */
|
|
27
61
|
toolCalls?: ToolCallRecord[];
|
|
62
|
+
/** Extra JSON-safe message metadata. */
|
|
28
63
|
metadata?: Record<string, JsonValue>;
|
|
29
64
|
};
|
|
30
|
-
/**
|
|
65
|
+
/**
|
|
66
|
+
* Provider usage summary attached to a normalized harness run.
|
|
67
|
+
*
|
|
68
|
+
* @example
|
|
69
|
+
* ```ts
|
|
70
|
+
* const usage: UsageSummary = {
|
|
71
|
+
* provider: "openai",
|
|
72
|
+
* model: "gpt-4o-mini",
|
|
73
|
+
* inputTokens: 212,
|
|
74
|
+
* outputTokens: 48,
|
|
75
|
+
* totalTokens: 260,
|
|
76
|
+
* };
|
|
77
|
+
* ```
|
|
78
|
+
*/
|
|
31
79
|
type UsageSummary = {
|
|
80
|
+
/** Provider that served the application run. */
|
|
32
81
|
provider?: string;
|
|
82
|
+
/** Model used for the application run. */
|
|
33
83
|
model?: string;
|
|
84
|
+
/** Input, prompt, or request tokens consumed by the run. */
|
|
34
85
|
inputTokens?: number;
|
|
86
|
+
/** Output or completion tokens produced by the run. */
|
|
35
87
|
outputTokens?: number;
|
|
88
|
+
/** Reasoning tokens reported by providers that expose them. */
|
|
36
89
|
reasoningTokens?: number;
|
|
90
|
+
/** Total token count reported by the provider or adapter. */
|
|
37
91
|
totalTokens?: number;
|
|
92
|
+
/** Count of tool calls observed during the run. */
|
|
38
93
|
toolCalls?: number;
|
|
94
|
+
/** Retry count observed during the run. */
|
|
39
95
|
retries?: number;
|
|
96
|
+
/** Provider-specific JSON-safe usage details. Cost estimates belong here. */
|
|
40
97
|
metadata?: Record<string, JsonValue>;
|
|
41
98
|
};
|
|
42
99
|
/** Timing summary attached to a normalized harness run. */
|
|
43
100
|
type TimingSummary = {
|
|
101
|
+
/** End-to-end run duration in milliseconds. */
|
|
44
102
|
totalMs?: number;
|
|
103
|
+
/** Extra JSON-safe timing metadata. */
|
|
45
104
|
metadata?: Record<string, JsonValue>;
|
|
46
105
|
};
|
|
47
|
-
/**
|
|
106
|
+
/**
|
|
107
|
+
* JSON-serializable transcript produced by the system under test.
|
|
108
|
+
*
|
|
109
|
+
* @example
|
|
110
|
+
* ```ts
|
|
111
|
+
* const session: NormalizedSession = {
|
|
112
|
+
* provider: "openai",
|
|
113
|
+
* model: "gpt-4o-mini",
|
|
114
|
+
* messages: [
|
|
115
|
+
* { role: "user", content: "Refund invoice inv_123" },
|
|
116
|
+
* { role: "assistant", content: { status: "approved" } },
|
|
117
|
+
* ],
|
|
118
|
+
* };
|
|
119
|
+
* ```
|
|
120
|
+
*/
|
|
48
121
|
type NormalizedSession = {
|
|
122
|
+
/** Ordered normalized transcript messages. */
|
|
49
123
|
messages: NormalizedMessage[];
|
|
124
|
+
/** Provider that produced the session when known. */
|
|
50
125
|
provider?: string;
|
|
126
|
+
/** Model that produced the session when known. */
|
|
51
127
|
model?: string;
|
|
128
|
+
/** Extra JSON-safe session metadata. */
|
|
52
129
|
metadata?: Record<string, JsonValue>;
|
|
53
130
|
};
|
|
54
131
|
type OutputField<TOutput extends JsonValue | undefined> = undefined extends TOutput ? {
|
|
@@ -56,64 +133,165 @@ type OutputField<TOutput extends JsonValue | undefined> = undefined extends TOut
|
|
|
56
133
|
} : {
|
|
57
134
|
output: TOutput;
|
|
58
135
|
};
|
|
59
|
-
/**
|
|
136
|
+
/**
|
|
137
|
+
* Normalized result returned by every harness execution.
|
|
138
|
+
*
|
|
139
|
+
* @example
|
|
140
|
+
* ```ts
|
|
141
|
+
* const run: HarnessRun<{ status: "approved" }> = {
|
|
142
|
+
* output: { status: "approved" },
|
|
143
|
+
* session: {
|
|
144
|
+
* messages: [
|
|
145
|
+
* { role: "user", content: "Refund invoice inv_123" },
|
|
146
|
+
* { role: "assistant", content: { status: "approved" } },
|
|
147
|
+
* ],
|
|
148
|
+
* },
|
|
149
|
+
* usage: { totalTokens: 260 },
|
|
150
|
+
* errors: [],
|
|
151
|
+
* };
|
|
152
|
+
* ```
|
|
153
|
+
*/
|
|
60
154
|
type HarnessRun<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
|
|
155
|
+
/** Normalized transcript and provider/session metadata. */
|
|
61
156
|
session: NormalizedSession;
|
|
157
|
+
/** Stable provider usage units such as tokens, tools, and retries. */
|
|
62
158
|
usage: UsageSummary;
|
|
159
|
+
/** Optional timing summary for the run. */
|
|
63
160
|
timings?: TimingSummary;
|
|
161
|
+
/** JSON-safe run artifacts captured by the harness or test context. */
|
|
64
162
|
artifacts?: Record<string, JsonValue>;
|
|
163
|
+
/** Normalized errors captured during execution. */
|
|
65
164
|
errors: Array<Record<string, JsonValue>>;
|
|
66
165
|
};
|
|
67
166
|
/** Error value with an attached partial or complete normalized harness run. */
|
|
68
167
|
type HarnessRunError = Error & {
|
|
168
|
+
/** Attached normalized harness run recovered by `getHarnessRunFromError(...)`. */
|
|
69
169
|
vitestEvalsRun: HarnessRun;
|
|
70
170
|
};
|
|
71
171
|
/** Per-run metadata shape accepted by harnesses and eval tests. */
|
|
72
172
|
type HarnessMetadata = Record<string, unknown>;
|
|
73
|
-
/**
|
|
173
|
+
/**
|
|
174
|
+
* Runtime context passed from the eval fixture into a harness run.
|
|
175
|
+
*
|
|
176
|
+
* @example
|
|
177
|
+
* ```ts
|
|
178
|
+
* const harness: Harness<string> = {
|
|
179
|
+
* name: "refund-agent",
|
|
180
|
+
* async run(input, context) {
|
|
181
|
+
* context.setArtifact("inputLength", input.length);
|
|
182
|
+
*
|
|
183
|
+
* return {
|
|
184
|
+
* output: undefined,
|
|
185
|
+
* session: { messages: [{ role: "user", content: input }] },
|
|
186
|
+
* usage: {},
|
|
187
|
+
* errors: [],
|
|
188
|
+
* };
|
|
189
|
+
* },
|
|
190
|
+
* };
|
|
191
|
+
* ```
|
|
192
|
+
*/
|
|
74
193
|
type HarnessContext<TMetadata extends HarnessMetadata = HarnessMetadata> = {
|
|
194
|
+
/** Per-run metadata passed through `run(input, { metadata })`. */
|
|
75
195
|
metadata: Readonly<TMetadata>;
|
|
196
|
+
/** Abort signal from Vitest when available. */
|
|
76
197
|
signal?: AbortSignal;
|
|
198
|
+
/** Mutable JSON-safe artifact bag shared with the harness. */
|
|
77
199
|
artifacts: Record<string, JsonValue>;
|
|
200
|
+
/** Stores one JSON-safe artifact on the current run. */
|
|
78
201
|
setArtifact: (name: string, value: JsonValue) => void;
|
|
79
202
|
};
|
|
80
|
-
/**
|
|
203
|
+
/**
|
|
204
|
+
* Adapter that executes the system under test and returns a normalized run.
|
|
205
|
+
*
|
|
206
|
+
* @example
|
|
207
|
+
* ```ts
|
|
208
|
+
* const harness: Harness<string, { status: "approved" | "denied" }> = {
|
|
209
|
+
* name: "refund-agent",
|
|
210
|
+
* async run(input, context) {
|
|
211
|
+
* return normalizeHarnessRun(input, await runRefundFlow(input), context);
|
|
212
|
+
* },
|
|
213
|
+
* };
|
|
214
|
+
* ```
|
|
215
|
+
*/
|
|
81
216
|
type Harness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
|
|
217
|
+
/** Stable harness name used in reports. */
|
|
82
218
|
name: string;
|
|
219
|
+
/** Executes the system under test and returns a normalized run. */
|
|
83
220
|
run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun<TOutput>>;
|
|
84
221
|
};
|
|
85
222
|
/** Value or promise accepted by lightweight harness callbacks. */
|
|
86
223
|
type MaybePromise<T> = T | Promise<T>;
|
|
87
224
|
/** Lightweight tool-call record accepted by `createHarness(...)` results. */
|
|
88
225
|
type SimpleToolCallRecord = Omit<ToolCallRecord, "arguments" | "result" | "error" | "metadata"> & {
|
|
226
|
+
/** Raw tool arguments accepted by `createHarness(...)` before normalization. */
|
|
89
227
|
arguments?: unknown;
|
|
228
|
+
/** Raw tool result accepted by `createHarness(...)` before normalization. */
|
|
90
229
|
result?: unknown;
|
|
230
|
+
/** Raw tool error accepted by `createHarness(...)` before normalization. */
|
|
91
231
|
error?: unknown;
|
|
232
|
+
/** Raw tool metadata accepted by `createHarness(...)` before normalization. */
|
|
92
233
|
metadata?: Record<string, unknown>;
|
|
93
234
|
};
|
|
94
|
-
/**
|
|
235
|
+
/**
|
|
236
|
+
* Lightweight result shape normalized by `createHarness(...)`.
|
|
237
|
+
*
|
|
238
|
+
* @example
|
|
239
|
+
* ```ts
|
|
240
|
+
* const result: SimpleHarnessResult<{ status: "approved" }> = {
|
|
241
|
+
* output: { status: "approved" },
|
|
242
|
+
* toolCalls: [{ name: "lookupInvoice", arguments: { invoiceId: "inv_123" } }],
|
|
243
|
+
* usage: { totalTokens: 260 },
|
|
244
|
+
* };
|
|
245
|
+
* ```
|
|
246
|
+
*/
|
|
95
247
|
type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
|
|
248
|
+
/** Pre-normalized transcript messages. When omitted, a default user/assistant transcript is created. */
|
|
96
249
|
messages?: NormalizedMessage[];
|
|
250
|
+
/** Lightweight tool-call records to normalize into the session. */
|
|
97
251
|
toolCalls?: SimpleToolCallRecord[];
|
|
252
|
+
/** Usage summary to attach to the run. */
|
|
98
253
|
usage?: UsageSummary;
|
|
254
|
+
/** Timing summary to attach to the run. */
|
|
99
255
|
timings?: TimingSummary;
|
|
256
|
+
/** Raw artifact values to normalize and merge into the run. */
|
|
100
257
|
artifacts?: Record<string, unknown>;
|
|
258
|
+
/** Raw session metadata to normalize into the session. */
|
|
101
259
|
metadata?: Record<string, unknown>;
|
|
260
|
+
/** Raw errors to normalize into the run. */
|
|
102
261
|
errors?: unknown[];
|
|
103
262
|
};
|
|
104
263
|
/** Either a complete normalized run or a lightweight result to normalize. */
|
|
105
264
|
type HarnessResultLike<TOutput extends JsonValue | undefined = JsonValue | undefined> = HarnessRun<TOutput> | SimpleHarnessResult<TOutput>;
|
|
106
265
|
/** Arguments passed to the `createHarness(...)` convenience callback. */
|
|
107
266
|
type CreateHarnessRunArgs<TInput, TMetadata extends HarnessMetadata> = {
|
|
267
|
+
/** Original input passed to `run(input)`. */
|
|
108
268
|
input: TInput;
|
|
269
|
+
/** Read-only metadata passed to `run(input, { metadata })`. */
|
|
109
270
|
metadata: Readonly<TMetadata>;
|
|
271
|
+
/** Abort signal from Vitest when available. */
|
|
110
272
|
signal?: AbortSignal;
|
|
273
|
+
/** Mutable run artifact bag. */
|
|
111
274
|
artifacts: HarnessContext<TMetadata>["artifacts"];
|
|
275
|
+
/** Stores one JSON-safe artifact on the current run. */
|
|
112
276
|
setArtifact: HarnessContext<TMetadata>["setArtifact"];
|
|
113
277
|
};
|
|
114
|
-
/**
|
|
278
|
+
/**
|
|
279
|
+
* Options for creating a lightweight custom application harness.
|
|
280
|
+
*
|
|
281
|
+
* @example
|
|
282
|
+
* ```ts
|
|
283
|
+
* const options: CreateHarnessOptions<string, { status: "approved" }> = {
|
|
284
|
+
* name: "refund-agent",
|
|
285
|
+
* run: async ({ input }) => ({
|
|
286
|
+
* output: await classifyRefund(input),
|
|
287
|
+
* }),
|
|
288
|
+
* };
|
|
289
|
+
* ```
|
|
290
|
+
*/
|
|
115
291
|
type CreateHarnessOptions<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
|
|
292
|
+
/** Stable harness name used in reports. */
|
|
116
293
|
name: string;
|
|
294
|
+
/** Executes application code and returns either a lightweight result or full `HarnessRun`. */
|
|
117
295
|
run: (args: CreateHarnessRunArgs<TInput, TMetadata>) => MaybePromise<HarnessResultLike<TOutput>>;
|
|
118
296
|
};
|
|
119
297
|
/** Returns true when a value exposes a callable method with the given name. */
|
|
@@ -126,25 +304,157 @@ declare function normalizeRecord(value: Record<string, unknown>): Record<string,
|
|
|
126
304
|
declare function normalizeMetadata(value: Record<string, unknown>): Record<string, JsonValue> | undefined;
|
|
127
305
|
/** Converts arbitrary content into the JSON-safe message content shape. */
|
|
128
306
|
declare function normalizeContent(value: unknown): JsonValue;
|
|
129
|
-
/**
|
|
307
|
+
/**
|
|
308
|
+
* Creates a harness from the common "run app code and return output" shape.
|
|
309
|
+
*
|
|
310
|
+
* @param options - Harness name plus the callback that executes app code.
|
|
311
|
+
*
|
|
312
|
+
* @example
|
|
313
|
+
* ```ts
|
|
314
|
+
* import { createHarness } from "vitest-evals";
|
|
315
|
+
*
|
|
316
|
+
* export const refundHarness = createHarness<
|
|
317
|
+
* string,
|
|
318
|
+
* { status: "approved" | "denied" },
|
|
319
|
+
* { expected: { status: "approved" | "denied" } }
|
|
320
|
+
* >({
|
|
321
|
+
* name: "refund-agent",
|
|
322
|
+
* run: async ({ input, metadata, setArtifact }) => {
|
|
323
|
+
* const result = await runRefundFlow(input, metadata);
|
|
324
|
+
* const output = { status: result.status };
|
|
325
|
+
*
|
|
326
|
+
* setArtifact("case", { expected: metadata.expected.status });
|
|
327
|
+
*
|
|
328
|
+
* return {
|
|
329
|
+
* output,
|
|
330
|
+
* toolCalls: result.toolCalls,
|
|
331
|
+
* usage: { provider: "openai", model: "gpt-4o-mini" },
|
|
332
|
+
* };
|
|
333
|
+
* },
|
|
334
|
+
* });
|
|
335
|
+
* ```
|
|
336
|
+
*/
|
|
130
337
|
declare function createHarness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata>(options: CreateHarnessOptions<TInput, TOutput, TMetadata>): Harness<TInput, TOutput, TMetadata>;
|
|
131
|
-
/**
|
|
338
|
+
/**
|
|
339
|
+
* Normalizes a lightweight harness result into the reporter-facing run shape.
|
|
340
|
+
*
|
|
341
|
+
* @param input - Original input passed to the harness.
|
|
342
|
+
* @param result - Lightweight result or pre-normalized harness run.
|
|
343
|
+
* @param context - Optional per-run context used to merge artifacts.
|
|
344
|
+
*
|
|
345
|
+
* @example
|
|
346
|
+
* ```ts
|
|
347
|
+
* const run = normalizeHarnessRun("Refund invoice inv_123", {
|
|
348
|
+
* output: { status: "approved" },
|
|
349
|
+
* toolCalls: [{ name: "lookupInvoice", arguments: { invoiceId: "inv_123" } }],
|
|
350
|
+
* usage: { provider: "openai", model: "gpt-4o-mini" },
|
|
351
|
+
* });
|
|
352
|
+
*
|
|
353
|
+
* expect(toolCalls(run.session)).toHaveLength(1);
|
|
354
|
+
* ```
|
|
355
|
+
*/
|
|
132
356
|
declare function normalizeHarnessRun<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, TOutput extends JsonValue | undefined = JsonValue | undefined>(input: TInput, result: HarnessResultLike<TOutput>, context?: HarnessContext<TMetadata>): HarnessRun<TOutput>;
|
|
133
|
-
/**
|
|
357
|
+
/**
|
|
358
|
+
* Flattens every recorded tool call from a normalized session.
|
|
359
|
+
*
|
|
360
|
+
* @param session - Normalized session produced by a harness run.
|
|
361
|
+
*
|
|
362
|
+
* @example
|
|
363
|
+
* ```ts
|
|
364
|
+
* const names = toolCalls(result.session).map((call) => call.name);
|
|
365
|
+
*
|
|
366
|
+
* expect(names).toEqual(["lookupInvoice", "createRefund"]);
|
|
367
|
+
* ```
|
|
368
|
+
*/
|
|
134
369
|
declare function toolCalls(session: NormalizedSession): ToolCallRecord[];
|
|
135
|
-
/**
|
|
370
|
+
/**
|
|
371
|
+
* Filters normalized session messages by role.
|
|
372
|
+
*
|
|
373
|
+
* @param session - Normalized session produced by a harness run.
|
|
374
|
+
* @param role - Message role to keep.
|
|
375
|
+
*
|
|
376
|
+
* @example
|
|
377
|
+
* ```ts
|
|
378
|
+
* const assistantText = messagesByRole(result.session, "assistant")
|
|
379
|
+
* .map((message) => message.content)
|
|
380
|
+
* .join("\n");
|
|
381
|
+
* ```
|
|
382
|
+
*/
|
|
136
383
|
declare function messagesByRole(session: NormalizedSession, role: NormalizedMessage["role"]): NormalizedMessage[];
|
|
137
|
-
/**
|
|
384
|
+
/**
|
|
385
|
+
* Returns every normalized system message from a session.
|
|
386
|
+
*
|
|
387
|
+
* @param session - Normalized session produced by a harness run.
|
|
388
|
+
*
|
|
389
|
+
* @example
|
|
390
|
+
* ```ts
|
|
391
|
+
* const systemPrompts = systemMessages(result.session);
|
|
392
|
+
* ```
|
|
393
|
+
*/
|
|
138
394
|
declare function systemMessages(session: NormalizedSession): NormalizedMessage[];
|
|
139
|
-
/**
|
|
395
|
+
/**
|
|
396
|
+
* Returns every normalized user message from a session.
|
|
397
|
+
*
|
|
398
|
+
* @param session - Normalized session produced by a harness run.
|
|
399
|
+
*
|
|
400
|
+
* @example
|
|
401
|
+
* ```ts
|
|
402
|
+
* const firstPrompt = userMessages(result.session)[0]?.content;
|
|
403
|
+
* ```
|
|
404
|
+
*/
|
|
140
405
|
declare function userMessages(session: NormalizedSession): NormalizedMessage[];
|
|
141
|
-
/**
|
|
406
|
+
/**
|
|
407
|
+
* Returns every normalized assistant message from a session.
|
|
408
|
+
*
|
|
409
|
+
* @param session - Normalized session produced by a harness run.
|
|
410
|
+
*
|
|
411
|
+
* @example
|
|
412
|
+
* ```ts
|
|
413
|
+
* const finalAnswer = assistantMessages(result.session).at(-1)?.content;
|
|
414
|
+
* ```
|
|
415
|
+
*/
|
|
142
416
|
declare function assistantMessages(session: NormalizedSession): NormalizedMessage[];
|
|
143
|
-
/**
|
|
417
|
+
/**
|
|
418
|
+
* Returns every normalized tool message from a session.
|
|
419
|
+
*
|
|
420
|
+
* @param session - Normalized session produced by a harness run.
|
|
421
|
+
*
|
|
422
|
+
* @example
|
|
423
|
+
* ```ts
|
|
424
|
+
* const toolOutputs = toolMessages(result.session).map((message) => message.content);
|
|
425
|
+
* ```
|
|
426
|
+
*/
|
|
144
427
|
declare function toolMessages(session: NormalizedSession): NormalizedMessage[];
|
|
145
|
-
/**
|
|
428
|
+
/**
|
|
429
|
+
* Attaches a partial or complete harness run to an arbitrary thrown error.
|
|
430
|
+
*
|
|
431
|
+
* @param error - Thrown value to wrap.
|
|
432
|
+
* @param run - Partial or complete normalized harness run to preserve.
|
|
433
|
+
*
|
|
434
|
+
* @example
|
|
435
|
+
* ```ts
|
|
436
|
+
* try {
|
|
437
|
+
* return await runAgent(input);
|
|
438
|
+
* } catch (error) {
|
|
439
|
+
* throw attachHarnessRunToError(error, partialRun);
|
|
440
|
+
* }
|
|
441
|
+
* ```
|
|
442
|
+
*/
|
|
146
443
|
declare function attachHarnessRunToError(error: unknown, run: HarnessRun): HarnessRunError;
|
|
147
|
-
/**
|
|
444
|
+
/**
|
|
445
|
+
* Reads an attached harness run back off a previously wrapped error value.
|
|
446
|
+
*
|
|
447
|
+
* @param error - Unknown thrown value that may contain a harness run.
|
|
448
|
+
*
|
|
449
|
+
* @example
|
|
450
|
+
* ```ts
|
|
451
|
+
* const partialRun = getHarnessRunFromError(error);
|
|
452
|
+
*
|
|
453
|
+
* if (partialRun) {
|
|
454
|
+
* console.log(toolCalls(partialRun.session));
|
|
455
|
+
* }
|
|
456
|
+
* ```
|
|
457
|
+
*/
|
|
148
458
|
declare function getHarnessRunFromError(error: unknown): HarnessRun | undefined;
|
|
149
459
|
/** Returns true when a value matches the normalized `HarnessRun` contract. */
|
|
150
460
|
declare function isHarnessRun(value: unknown): value is HarnessRun;
|