vitest-evals 0.9.0-beta.5 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/README.md +13 -23
  2. package/dist/harness.d.mts +329 -20
  3. package/dist/harness.d.ts +329 -20
  4. package/dist/harness.js.map +1 -1
  5. package/dist/harness.mjs.map +1 -1
  6. package/dist/index.d.mts +155 -12
  7. package/dist/index.d.ts +155 -12
  8. package/dist/index.js.map +1 -1
  9. package/dist/index.mjs.map +1 -1
  10. package/dist/internal/matchers.d.mts +41 -3
  11. package/dist/internal/matchers.d.ts +41 -3
  12. package/dist/internal/matchers.js.map +1 -1
  13. package/dist/internal/matchers.mjs.map +1 -1
  14. package/dist/internal/structuredOutputScorer.d.mts +4 -0
  15. package/dist/internal/structuredOutputScorer.d.ts +4 -0
  16. package/dist/internal/structuredOutputScorer.js.map +1 -1
  17. package/dist/internal/structuredOutputScorer.mjs.map +1 -1
  18. package/dist/internal/toolCallScorer.d.mts +6 -0
  19. package/dist/internal/toolCallScorer.d.ts +6 -0
  20. package/dist/internal/toolCallScorer.js.map +1 -1
  21. package/dist/internal/toolCallScorer.mjs.map +1 -1
  22. package/dist/judges/index.d.mts +2 -2
  23. package/dist/judges/index.d.ts +2 -2
  24. package/dist/judges/index.js.map +1 -1
  25. package/dist/judges/index.mjs.map +1 -1
  26. package/dist/judges/structuredOutputJudge.d.mts +54 -4
  27. package/dist/judges/structuredOutputJudge.d.ts +54 -4
  28. package/dist/judges/structuredOutputJudge.js.map +1 -1
  29. package/dist/judges/structuredOutputJudge.mjs.map +1 -1
  30. package/dist/judges/toolCallJudge.d.mts +56 -6
  31. package/dist/judges/toolCallJudge.d.ts +56 -6
  32. package/dist/judges/toolCallJudge.js.map +1 -1
  33. package/dist/judges/toolCallJudge.mjs.map +1 -1
  34. package/dist/judges/types.d.mts +68 -3
  35. package/dist/judges/types.d.ts +68 -3
  36. package/dist/judges/types.js.map +1 -1
  37. package/dist/legacy/scorers/index.js.map +1 -1
  38. package/dist/legacy/scorers/index.mjs.map +1 -1
  39. package/dist/legacy/scorers/structuredOutputScorer.js.map +1 -1
  40. package/dist/legacy/scorers/structuredOutputScorer.mjs.map +1 -1
  41. package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
  42. package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
  43. package/dist/legacy/scorers/utils.js.map +1 -1
  44. package/dist/legacy/scorers/utils.mjs.map +1 -1
  45. package/dist/legacy.js.map +1 -1
  46. package/dist/legacy.mjs.map +1 -1
  47. package/dist/reporter.js.map +1 -1
  48. package/dist/reporter.mjs.map +1 -1
  49. package/package.json +13 -1
package/README.md CHANGED
@@ -18,11 +18,9 @@ npm install -D @vitest-evals/harness-ai-sdk
18
18
  npm install -D @vitest-evals/harness-openai-agents
19
19
  ```
20
20
 
21
- For GitHub Actions summaries and annotations, install the JSON post-processor:
22
-
23
- ```sh
24
- npm install -D @vitest-evals/github-reporter
25
- ```
21
+ For GitHub Actions summaries and annotations, emit Vitest JSON and use the
22
+ native `getsentry/vitest-evals` action. No extra npm package is needed in the
23
+ workflow.
26
24
 
27
25
  ## Core Model
28
26
 
@@ -155,17 +153,22 @@ Use Vitest JSON as the eval report artifact. It preserves the `meta` field that
155
153
  contains eval scores and normalized harness runs.
156
154
 
157
155
  ```sh
158
- vitest run evals \
156
+ vitest run --config vitest.evals.config.ts \
159
157
  --reporter=vitest-evals/reporter \
160
158
  --reporter=json \
161
159
  --outputFile.json=vitest-results.json
160
+ ```
162
161
 
163
- vitest-evals-github-report
162
+ ```yaml
163
+ - uses: getsentry/vitest-evals@v0
164
+ if: always()
165
+ with:
166
+ results: vitest-results.json
164
167
  ```
165
168
 
166
- The GitHub reporter writes a job summary when `GITHUB_STEP_SUMMARY` is present,
167
- emits short failure annotations in Actions, and can publish a separate Check Run
168
- with `--check-run` when `checks: write` permission is configured.
169
+ The GitHub reporter action writes a job summary, emits short failure
170
+ annotations, can publish a separate Check Run, and can reduce sharded eval JSON
171
+ artifacts into one combined report.
169
172
 
170
173
  ## Existing Agents
171
174
 
@@ -386,16 +389,3 @@ When you only need deterministic contract checks, built-ins such as
386
389
  `StructuredOutputJudge()` and `ToolCallJudge()` are still available. The primary
387
390
  documentation examples intentionally use factuality/rubric judges because those
388
391
  match the product's LLM-as-a-judge direction.
389
-
390
- ## Legacy Compatibility
391
-
392
- The root package is harness-first and judge-first. Legacy scorer-first suites
393
- and `evaluate(...)` live under `vitest-evals/legacy`.
394
-
395
- ```ts
396
- import {
397
- describeEval,
398
- StructuredOutputScorer,
399
- ToolCallScorer,
400
- } from "vitest-evals/legacy";
401
- ```
package/dist/harness.d.mts CHANGED
@@ -4,52 +4,128 @@ type JsonPrimitive = string | number | boolean | null;
4
4
  type JsonValue = JsonPrimitive | JsonValue[] | {
5
5
  [key: string]: JsonValue;
6
6
  };
7
- /** Normalized record for one tool call observed during a harness run. */
7
+ /**
8
+ * Normalized record for one tool call observed during a harness run.
9
+ *
10
+ * @example
11
+ * ```ts
12
+ * const call: ToolCallRecord = {
13
+ * name: "lookupInvoice",
14
+ * arguments: { invoiceId: "inv_123" },
15
+ * result: { refundable: true },
16
+ * };
17
+ * ```
18
+ */
8
19
  type ToolCallRecord = {
20
+ /** Provider or runtime tool-call id when one is available. */
9
21
  id?: string;
22
+ /** Tool name as exposed to the agent or application runtime. */
10
23
  name: string;
24
+ /** JSON-safe tool arguments after provider/runtime normalization. */
11
25
  arguments?: Record<string, JsonValue>;
26
+ /** JSON-safe tool result returned by the application tool. */
12
27
  result?: JsonValue;
28
+ /** Normalized tool error when execution failed. */
13
29
  error?: {
14
30
  message: string;
15
31
  type?: string;
16
32
  [key: string]: JsonValue | undefined;
17
33
  };
34
+ /** ISO timestamp for the start of tool execution. */
18
35
  startedAt?: string;
36
+ /** ISO timestamp for the end of tool execution. */
19
37
  finishedAt?: string;
38
+ /** Tool execution duration in milliseconds. */
20
39
  durationMs?: number;
40
+ /** Extra JSON-safe tool metadata for reporters and custom judges. */
21
41
  metadata?: Record<string, JsonValue>;
22
42
  };
23
- /** Normalized message recorded in a harness session transcript. */
43
+ /**
44
+ * Normalized message recorded in a harness session transcript.
45
+ *
46
+ * @example
47
+ * ```ts
48
+ * const message: NormalizedMessage = {
49
+ * role: "assistant",
50
+ * content: { status: "approved" },
51
+ * toolCalls: [{ name: "lookupInvoice" }],
52
+ * };
53
+ * ```
54
+ */
24
55
  type NormalizedMessage = {
56
+ /** Transcript role for the normalized message. */
25
57
  role: "system" | "user" | "assistant" | "tool";
58
+ /** JSON-safe message content. */
26
59
  content?: JsonValue;
60
+ /** Tool calls associated with this message. */
27
61
  toolCalls?: ToolCallRecord[];
62
+ /** Extra JSON-safe message metadata. */
28
63
  metadata?: Record<string, JsonValue>;
29
64
  };
30
- /** Provider usage summary attached to a normalized harness run. */
65
+ /**
66
+ * Provider usage summary attached to a normalized harness run.
67
+ *
68
+ * @example
69
+ * ```ts
70
+ * const usage: UsageSummary = {
71
+ * provider: "openai",
72
+ * model: "gpt-4o-mini",
73
+ * inputTokens: 212,
74
+ * outputTokens: 48,
75
+ * totalTokens: 260,
76
+ * };
77
+ * ```
78
+ */
31
79
  type UsageSummary = {
80
+ /** Provider that served the application run. */
32
81
  provider?: string;
82
+ /** Model used for the application run. */
33
83
  model?: string;
84
+ /** Input, prompt, or request tokens consumed by the run. */
34
85
  inputTokens?: number;
86
+ /** Output or completion tokens produced by the run. */
35
87
  outputTokens?: number;
88
+ /** Reasoning tokens reported by providers that expose them. */
36
89
  reasoningTokens?: number;
90
+ /** Total token count reported by the provider or adapter. */
37
91
  totalTokens?: number;
38
- estimatedCost?: number;
92
+ /** Count of tool calls observed during the run. */
39
93
  toolCalls?: number;
94
+ /** Retry count observed during the run. */
40
95
  retries?: number;
96
+ /** Provider-specific JSON-safe usage details. Cost estimates belong here. */
41
97
  metadata?: Record<string, JsonValue>;
42
98
  };
43
99
  /** Timing summary attached to a normalized harness run. */
44
100
  type TimingSummary = {
101
+ /** End-to-end run duration in milliseconds. */
45
102
  totalMs?: number;
103
+ /** Extra JSON-safe timing metadata. */
46
104
  metadata?: Record<string, JsonValue>;
47
105
  };
48
- /** JSON-serializable transcript produced by the system under test. */
106
+ /**
107
+ * JSON-serializable transcript produced by the system under test.
108
+ *
109
+ * @example
110
+ * ```ts
111
+ * const session: NormalizedSession = {
112
+ * provider: "openai",
113
+ * model: "gpt-4o-mini",
114
+ * messages: [
115
+ * { role: "user", content: "Refund invoice inv_123" },
116
+ * { role: "assistant", content: { status: "approved" } },
117
+ * ],
118
+ * };
119
+ * ```
120
+ */
49
121
  type NormalizedSession = {
122
+ /** Ordered normalized transcript messages. */
50
123
  messages: NormalizedMessage[];
124
+ /** Provider that produced the session when known. */
51
125
  provider?: string;
126
+ /** Model that produced the session when known. */
52
127
  model?: string;
128
+ /** Extra JSON-safe session metadata. */
53
129
  metadata?: Record<string, JsonValue>;
54
130
  };
55
131
  type OutputField<TOutput extends JsonValue | undefined> = undefined extends TOutput ? {
@@ -57,64 +133,165 @@ type OutputField<TOutput extends JsonValue | undefined> = undefined extends TOut
57
133
  } : {
58
134
  output: TOutput;
59
135
  };
60
- /** Normalized result returned by every harness execution. */
136
+ /**
137
+ * Normalized result returned by every harness execution.
138
+ *
139
+ * @example
140
+ * ```ts
141
+ * const run: HarnessRun<{ status: "approved" }> = {
142
+ * output: { status: "approved" },
143
+ * session: {
144
+ * messages: [
145
+ * { role: "user", content: "Refund invoice inv_123" },
146
+ * { role: "assistant", content: { status: "approved" } },
147
+ * ],
148
+ * },
149
+ * usage: { totalTokens: 260 },
150
+ * errors: [],
151
+ * };
152
+ * ```
153
+ */
61
154
  type HarnessRun<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
155
+ /** Normalized transcript and provider/session metadata. */
62
156
  session: NormalizedSession;
157
+ /** Stable provider usage units such as tokens, tools, and retries. */
63
158
  usage: UsageSummary;
159
+ /** Optional timing summary for the run. */
64
160
  timings?: TimingSummary;
161
+ /** JSON-safe run artifacts captured by the harness or test context. */
65
162
  artifacts?: Record<string, JsonValue>;
163
+ /** Normalized errors captured during execution. */
66
164
  errors: Array<Record<string, JsonValue>>;
67
165
  };
68
166
  /** Error value with an attached partial or complete normalized harness run. */
69
167
  type HarnessRunError = Error & {
168
+ /** Attached normalized harness run recovered by `getHarnessRunFromError(...)`. */
70
169
  vitestEvalsRun: HarnessRun;
71
170
  };
72
171
  /** Per-run metadata shape accepted by harnesses and eval tests. */
73
172
  type HarnessMetadata = Record<string, unknown>;
74
- /** Runtime context passed from the eval fixture into a harness run. */
173
+ /**
174
+ * Runtime context passed from the eval fixture into a harness run.
175
+ *
176
+ * @example
177
+ * ```ts
178
+ * const harness: Harness<string> = {
179
+ * name: "refund-agent",
180
+ * async run(input, context) {
181
+ * context.setArtifact("inputLength", input.length);
182
+ *
183
+ * return {
184
+ * output: undefined,
185
+ * session: { messages: [{ role: "user", content: input }] },
186
+ * usage: {},
187
+ * errors: [],
188
+ * };
189
+ * },
190
+ * };
191
+ * ```
192
+ */
75
193
  type HarnessContext<TMetadata extends HarnessMetadata = HarnessMetadata> = {
194
+ /** Per-run metadata passed through `run(input, { metadata })`. */
76
195
  metadata: Readonly<TMetadata>;
196
+ /** Abort signal from Vitest when available. */
77
197
  signal?: AbortSignal;
198
+ /** Mutable JSON-safe artifact bag shared with the harness. */
78
199
  artifacts: Record<string, JsonValue>;
200
+ /** Stores one JSON-safe artifact on the current run. */
79
201
  setArtifact: (name: string, value: JsonValue) => void;
80
202
  };
81
- /** Adapter that executes the system under test and returns a normalized run. */
203
+ /**
204
+ * Adapter that executes the system under test and returns a normalized run.
205
+ *
206
+ * @example
207
+ * ```ts
208
+ * const harness: Harness<string, { status: "approved" | "denied" }> = {
209
+ * name: "refund-agent",
210
+ * async run(input, context) {
211
+ * return normalizeHarnessRun(input, await runRefundFlow(input), context);
212
+ * },
213
+ * };
214
+ * ```
215
+ */
82
216
  type Harness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
217
+ /** Stable harness name used in reports. */
83
218
  name: string;
219
+ /** Executes the system under test and returns a normalized run. */
84
220
  run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun<TOutput>>;
85
221
  };
86
222
  /** Value or promise accepted by lightweight harness callbacks. */
87
223
  type MaybePromise<T> = T | Promise<T>;
88
224
  /** Lightweight tool-call record accepted by `createHarness(...)` results. */
89
225
  type SimpleToolCallRecord = Omit<ToolCallRecord, "arguments" | "result" | "error" | "metadata"> & {
226
+ /** Raw tool arguments accepted by `createHarness(...)` before normalization. */
90
227
  arguments?: unknown;
228
+ /** Raw tool result accepted by `createHarness(...)` before normalization. */
91
229
  result?: unknown;
230
+ /** Raw tool error accepted by `createHarness(...)` before normalization. */
92
231
  error?: unknown;
232
+ /** Raw tool metadata accepted by `createHarness(...)` before normalization. */
93
233
  metadata?: Record<string, unknown>;
94
234
  };
95
- /** Lightweight result shape normalized by `createHarness(...)`. */
235
+ /**
236
+ * Lightweight result shape normalized by `createHarness(...)`.
237
+ *
238
+ * @example
239
+ * ```ts
240
+ * const result: SimpleHarnessResult<{ status: "approved" }> = {
241
+ * output: { status: "approved" },
242
+ * toolCalls: [{ name: "lookupInvoice", arguments: { invoiceId: "inv_123" } }],
243
+ * usage: { totalTokens: 260 },
244
+ * };
245
+ * ```
246
+ */
96
247
  type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
248
+ /** Pre-normalized transcript messages. When omitted, a default user/assistant transcript is created. */
97
249
  messages?: NormalizedMessage[];
250
+ /** Lightweight tool-call records to normalize into the session. */
98
251
  toolCalls?: SimpleToolCallRecord[];
252
+ /** Usage summary to attach to the run. */
99
253
  usage?: UsageSummary;
254
+ /** Timing summary to attach to the run. */
100
255
  timings?: TimingSummary;
256
+ /** Raw artifact values to normalize and merge into the run. */
101
257
  artifacts?: Record<string, unknown>;
258
+ /** Raw session metadata to normalize into the session. */
102
259
  metadata?: Record<string, unknown>;
260
+ /** Raw errors to normalize into the run. */
103
261
  errors?: unknown[];
104
262
  };
105
263
  /** Either a complete normalized run or a lightweight result to normalize. */
106
264
  type HarnessResultLike<TOutput extends JsonValue | undefined = JsonValue | undefined> = HarnessRun<TOutput> | SimpleHarnessResult<TOutput>;
107
265
  /** Arguments passed to the `createHarness(...)` convenience callback. */
108
266
  type CreateHarnessRunArgs<TInput, TMetadata extends HarnessMetadata> = {
267
+ /** Original input passed to `run(input)`. */
109
268
  input: TInput;
269
+ /** Read-only metadata passed to `run(input, { metadata })`. */
110
270
  metadata: Readonly<TMetadata>;
271
+ /** Abort signal from Vitest when available. */
111
272
  signal?: AbortSignal;
273
+ /** Mutable run artifact bag. */
112
274
  artifacts: HarnessContext<TMetadata>["artifacts"];
275
+ /** Stores one JSON-safe artifact on the current run. */
113
276
  setArtifact: HarnessContext<TMetadata>["setArtifact"];
114
277
  };
115
- /** Options for creating a lightweight custom application harness. */
278
+ /**
279
+ * Options for creating a lightweight custom application harness.
280
+ *
281
+ * @example
282
+ * ```ts
283
+ * const options: CreateHarnessOptions<string, { status: "approved" }> = {
284
+ * name: "refund-agent",
285
+ * run: async ({ input }) => ({
286
+ * output: await classifyRefund(input),
287
+ * }),
288
+ * };
289
+ * ```
290
+ */
116
291
  type CreateHarnessOptions<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
292
+ /** Stable harness name used in reports. */
117
293
  name: string;
294
+ /** Executes application code and returns either a lightweight result or full `HarnessRun`. */
118
295
  run: (args: CreateHarnessRunArgs<TInput, TMetadata>) => MaybePromise<HarnessResultLike<TOutput>>;
119
296
  };
120
297
  /** Returns true when a value exposes a callable method with the given name. */
@@ -127,25 +304,157 @@ declare function normalizeRecord(value: Record<string, unknown>): Record<string,
127
304
  declare function normalizeMetadata(value: Record<string, unknown>): Record<string, JsonValue> | undefined;
128
305
  /** Converts arbitrary content into the JSON-safe message content shape. */
129
306
  declare function normalizeContent(value: unknown): JsonValue;
130
- /** Creates a harness from the common "run app code and return output" shape. */
307
+ /**
308
+ * Creates a harness from the common "run app code and return output" shape.
309
+ *
310
+ * @param options - Harness name plus the callback that executes app code.
311
+ *
312
+ * @example
313
+ * ```ts
314
+ * import { createHarness } from "vitest-evals";
315
+ *
316
+ * export const refundHarness = createHarness<
317
+ * string,
318
+ * { status: "approved" | "denied" },
319
+ * { expected: { status: "approved" | "denied" } }
320
+ * >({
321
+ * name: "refund-agent",
322
+ * run: async ({ input, metadata, setArtifact }) => {
323
+ * const result = await runRefundFlow(input, metadata);
324
+ * const output = { status: result.status };
325
+ *
326
+ * setArtifact("case", { expected: metadata.expected.status });
327
+ *
328
+ * return {
329
+ * output,
330
+ * toolCalls: result.toolCalls,
331
+ * usage: { provider: "openai", model: "gpt-4o-mini" },
332
+ * };
333
+ * },
334
+ * });
335
+ * ```
336
+ */
131
337
  declare function createHarness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata>(options: CreateHarnessOptions<TInput, TOutput, TMetadata>): Harness<TInput, TOutput, TMetadata>;
132
- /** Normalizes a lightweight harness result into the reporter-facing run shape. */
338
+ /**
339
+ * Normalizes a lightweight harness result into the reporter-facing run shape.
340
+ *
341
+ * @param input - Original input passed to the harness.
342
+ * @param result - Lightweight result or pre-normalized harness run.
343
+ * @param context - Optional per-run context used to merge artifacts.
344
+ *
345
+ * @example
346
+ * ```ts
347
+ * const run = normalizeHarnessRun("Refund invoice inv_123", {
348
+ * output: { status: "approved" },
349
+ * toolCalls: [{ name: "lookupInvoice", arguments: { invoiceId: "inv_123" } }],
350
+ * usage: { provider: "openai", model: "gpt-4o-mini" },
351
+ * });
352
+ *
353
+ * expect(toolCalls(run.session)).toHaveLength(1);
354
+ * ```
355
+ */
133
356
  declare function normalizeHarnessRun<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, TOutput extends JsonValue | undefined = JsonValue | undefined>(input: TInput, result: HarnessResultLike<TOutput>, context?: HarnessContext<TMetadata>): HarnessRun<TOutput>;
134
- /** Flattens every recorded tool call from a normalized session. */
357
+ /**
358
+ * Flattens every recorded tool call from a normalized session.
359
+ *
360
+ * @param session - Normalized session produced by a harness run.
361
+ *
362
+ * @example
363
+ * ```ts
364
+ * const names = toolCalls(result.session).map((call) => call.name);
365
+ *
366
+ * expect(names).toEqual(["lookupInvoice", "createRefund"]);
367
+ * ```
368
+ */
135
369
  declare function toolCalls(session: NormalizedSession): ToolCallRecord[];
136
- /** Filters normalized session messages by role. */
370
+ /**
371
+ * Filters normalized session messages by role.
372
+ *
373
+ * @param session - Normalized session produced by a harness run.
374
+ * @param role - Message role to keep.
375
+ *
376
+ * @example
377
+ * ```ts
378
+ * const assistantText = messagesByRole(result.session, "assistant")
379
+ * .map((message) => message.content)
380
+ * .join("\n");
381
+ * ```
382
+ */
137
383
  declare function messagesByRole(session: NormalizedSession, role: NormalizedMessage["role"]): NormalizedMessage[];
138
- /** Returns every normalized system message from a session. */
384
+ /**
385
+ * Returns every normalized system message from a session.
386
+ *
387
+ * @param session - Normalized session produced by a harness run.
388
+ *
389
+ * @example
390
+ * ```ts
391
+ * const systemPrompts = systemMessages(result.session);
392
+ * ```
393
+ */
139
394
  declare function systemMessages(session: NormalizedSession): NormalizedMessage[];
140
- /** Returns every normalized user message from a session. */
395
+ /**
396
+ * Returns every normalized user message from a session.
397
+ *
398
+ * @param session - Normalized session produced by a harness run.
399
+ *
400
+ * @example
401
+ * ```ts
402
+ * const firstPrompt = userMessages(result.session)[0]?.content;
403
+ * ```
404
+ */
141
405
  declare function userMessages(session: NormalizedSession): NormalizedMessage[];
142
- /** Returns every normalized assistant message from a session. */
406
+ /**
407
+ * Returns every normalized assistant message from a session.
408
+ *
409
+ * @param session - Normalized session produced by a harness run.
410
+ *
411
+ * @example
412
+ * ```ts
413
+ * const finalAnswer = assistantMessages(result.session).at(-1)?.content;
414
+ * ```
415
+ */
143
416
  declare function assistantMessages(session: NormalizedSession): NormalizedMessage[];
144
- /** Returns every normalized tool message from a session. */
417
+ /**
418
+ * Returns every normalized tool message from a session.
419
+ *
420
+ * @param session - Normalized session produced by a harness run.
421
+ *
422
+ * @example
423
+ * ```ts
424
+ * const toolOutputs = toolMessages(result.session).map((message) => message.content);
425
+ * ```
426
+ */
145
427
  declare function toolMessages(session: NormalizedSession): NormalizedMessage[];
146
- /** Attaches a partial or complete harness run to an arbitrary thrown error. */
428
+ /**
429
+ * Attaches a partial or complete harness run to an arbitrary thrown error.
430
+ *
431
+ * @param error - Thrown value to wrap.
432
+ * @param run - Partial or complete normalized harness run to preserve.
433
+ *
434
+ * @example
435
+ * ```ts
436
+ * try {
437
+ * return await runAgent(input);
438
+ * } catch (error) {
439
+ * throw attachHarnessRunToError(error, partialRun);
440
+ * }
441
+ * ```
442
+ */
147
443
  declare function attachHarnessRunToError(error: unknown, run: HarnessRun): HarnessRunError;
148
- /** Reads an attached harness run back off a previously wrapped error value. */
444
+ /**
445
+ * Reads an attached harness run back off a previously wrapped error value.
446
+ *
447
+ * @param error - Unknown thrown value that may contain a harness run.
448
+ *
449
+ * @example
450
+ * ```ts
451
+ * const partialRun = getHarnessRunFromError(error);
452
+ *
453
+ * if (partialRun) {
454
+ * console.log(toolCalls(partialRun.session));
455
+ * }
456
+ * ```
457
+ */
149
458
  declare function getHarnessRunFromError(error: unknown): HarnessRun | undefined;
150
459
  /** Returns true when a value matches the normalized `HarnessRun` contract. */
151
460
  declare function isHarnessRun(value: unknown): value is HarnessRun;