vitest-evals 0.11.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. package/README.md +39 -35
  2. package/bin/vitest-evals.js +8 -0
  3. package/dist/cli.d.mts +13 -0
  4. package/dist/cli.d.ts +13 -0
  5. package/dist/cli.js +83 -0
  6. package/dist/cli.js.map +1 -0
  7. package/dist/cli.mjs +55 -0
  8. package/dist/cli.mjs.map +1 -0
  9. package/dist/harness.d.mts +19 -433
  10. package/dist/harness.d.ts +19 -433
  11. package/dist/harness.js +19 -51
  12. package/dist/harness.js.map +1 -1
  13. package/dist/harness.mjs +31 -49
  14. package/dist/harness.mjs.map +1 -1
  15. package/dist/index.d.mts +47 -68
  16. package/dist/index.d.ts +47 -68
  17. package/dist/index.js +46 -96
  18. package/dist/index.js.map +1 -1
  19. package/dist/index.mjs +58 -94
  20. package/dist/index.mjs.map +1 -1
  21. package/dist/internal/scoring.d.mts +1 -1
  22. package/dist/internal/scoring.d.ts +1 -1
  23. package/dist/internal/structuredOutputScorer.d.mts +1 -1
  24. package/dist/internal/structuredOutputScorer.d.ts +1 -1
  25. package/dist/internal/toolCallScorer.d.mts +1 -1
  26. package/dist/internal/toolCallScorer.d.ts +1 -1
  27. package/dist/internal/toolCallScorer.js +2 -0
  28. package/dist/internal/toolCallScorer.js.map +1 -1
  29. package/dist/internal/toolCallScorer.mjs +16 -0
  30. package/dist/internal/toolCallScorer.mjs.map +1 -1
  31. package/dist/judges/factualityJudge.d.mts +15 -13
  32. package/dist/judges/factualityJudge.d.ts +15 -13
  33. package/dist/judges/factualityJudge.js +13 -23
  34. package/dist/judges/factualityJudge.js.map +1 -1
  35. package/dist/judges/factualityJudge.mjs +27 -23
  36. package/dist/judges/factualityJudge.mjs.map +1 -1
  37. package/dist/judges/index.d.mts +1 -0
  38. package/dist/judges/index.d.ts +1 -0
  39. package/dist/judges/index.js +28 -47
  40. package/dist/judges/index.js.map +1 -1
  41. package/dist/judges/index.mjs +40 -45
  42. package/dist/judges/index.mjs.map +1 -1
  43. package/dist/judges/judgeHarness.d.mts +7 -10
  44. package/dist/judges/judgeHarness.d.ts +7 -10
  45. package/dist/judges/judgeHarness.js +13 -34
  46. package/dist/judges/judgeHarness.js.map +1 -1
  47. package/dist/judges/judgeHarness.mjs +25 -32
  48. package/dist/judges/judgeHarness.mjs.map +1 -1
  49. package/dist/judges/structuredOutputJudge.d.mts +7 -8
  50. package/dist/judges/structuredOutputJudge.d.ts +7 -8
  51. package/dist/judges/structuredOutputJudge.js +3 -3
  52. package/dist/judges/structuredOutputJudge.js.map +1 -1
  53. package/dist/judges/structuredOutputJudge.mjs +3 -3
  54. package/dist/judges/structuredOutputJudge.mjs.map +1 -1
  55. package/dist/judges/toolCallJudge.d.mts +12 -8
  56. package/dist/judges/toolCallJudge.d.ts +12 -8
  57. package/dist/judges/toolCallJudge.js +5 -3
  58. package/dist/judges/toolCallJudge.js.map +1 -1
  59. package/dist/judges/toolCallJudge.mjs +19 -3
  60. package/dist/judges/toolCallJudge.mjs.map +1 -1
  61. package/dist/judges/types.d.mts +14 -24
  62. package/dist/judges/types.d.ts +14 -24
  63. package/dist/judges/types.js.map +1 -1
  64. package/dist/legacy/scorers/index.js +2 -0
  65. package/dist/legacy/scorers/index.js.map +1 -1
  66. package/dist/legacy/scorers/index.mjs +16 -0
  67. package/dist/legacy/scorers/index.mjs.map +1 -1
  68. package/dist/legacy/scorers/toolCallScorer.js +2 -0
  69. package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
  70. package/dist/legacy/scorers/toolCallScorer.mjs +16 -0
  71. package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
  72. package/dist/legacy.js +7 -5
  73. package/dist/legacy.js.map +1 -1
  74. package/dist/legacy.mjs +21 -5
  75. package/dist/legacy.mjs.map +1 -1
  76. package/dist/replay.d.mts +1 -1
  77. package/dist/replay.d.ts +1 -1
  78. package/dist/reporter.js +4 -5
  79. package/dist/reporter.js.map +1 -1
  80. package/dist/reporter.mjs +18 -5
  81. package/dist/reporter.mjs.map +1 -1
  82. package/package.json +9 -1
package/dist/harness.d.ts CHANGED
@@ -1,148 +1,6 @@
1
- /** Primitive scalar values allowed in normalized JSON-safe eval data. */
2
- type JsonPrimitive = string | number | boolean | null;
3
- /** JSON-safe value shape used by normalized sessions, artifacts, and errors. */
4
- type JsonValue = JsonPrimitive | JsonValue[] | {
5
- [key: string]: JsonValue;
6
- };
7
- /** Well-known OpenTelemetry GenAI operation names. */
8
- type GenAiOperationName = "chat" | "create_agent" | "embeddings" | "execute_tool" | "generate_content" | "invoke_agent" | "invoke_workflow" | "retrieval" | "text_completion" | (string & {});
9
- /** Well-known OpenTelemetry GenAI output content types. */
10
- type GenAiOutputType = "image" | "json" | "speech" | "text" | (string & {});
11
- /** Well-known OpenTelemetry GenAI provider names. */
12
- type GenAiProviderName = "anthropic" | "aws.bedrock" | "azure.ai.inference" | "azure.ai.openai" | "cohere" | "deepseek" | "gcp.gemini" | "gcp.gen_ai" | "gcp.vertex_ai" | "groq" | "ibm.watsonx.ai" | "mistral_ai" | "openai" | "perplexity" | "x_ai" | (string & {});
13
- /** Well-known OpenTelemetry GenAI token types. */
14
- type GenAiTokenType = "input" | "output" | (string & {});
15
- /** Well-known OpenTelemetry GenAI tool execution types. */
16
- type GenAiToolType = "datastore" | "extension" | "function" | (string & {});
17
- /** Typed subset of OpenTelemetry GenAI semantic attributes. */
18
- type GenAiSemanticAttributes = {
19
- "gen_ai.agent.description"?: string;
20
- "gen_ai.agent.id"?: string;
21
- "gen_ai.agent.name"?: string;
22
- "gen_ai.agent.version"?: string;
23
- "gen_ai.conversation.id"?: string;
24
- "gen_ai.data_source.id"?: string;
25
- "gen_ai.embeddings.dimension.count"?: number;
26
- "gen_ai.evaluation.explanation"?: string;
27
- "gen_ai.evaluation.name"?: string;
28
- "gen_ai.evaluation.score.label"?: string;
29
- "gen_ai.evaluation.score.value"?: number;
30
- "gen_ai.input.messages"?: JsonValue;
31
- "gen_ai.operation.name"?: GenAiOperationName;
32
- "gen_ai.output.messages"?: JsonValue;
33
- "gen_ai.output.type"?: GenAiOutputType;
34
- "gen_ai.prompt.name"?: string;
35
- "gen_ai.provider.name"?: GenAiProviderName;
36
- "gen_ai.request.choice.count"?: number;
37
- "gen_ai.request.encoding_formats"?: string[];
38
- "gen_ai.request.frequency_penalty"?: number;
39
- "gen_ai.request.max_tokens"?: number;
40
- "gen_ai.request.model"?: string;
41
- "gen_ai.request.presence_penalty"?: number;
42
- "gen_ai.request.seed"?: number;
43
- "gen_ai.request.stop_sequences"?: string[];
44
- "gen_ai.request.stream"?: boolean;
45
- "gen_ai.request.temperature"?: number;
46
- "gen_ai.request.top_k"?: number;
47
- "gen_ai.request.top_p"?: number;
48
- "gen_ai.response.finish_reasons"?: string[];
49
- "gen_ai.response.id"?: string;
50
- "gen_ai.response.model"?: string;
51
- "gen_ai.response.time_to_first_chunk"?: number;
52
- "gen_ai.retrieval.documents"?: JsonValue;
53
- "gen_ai.retrieval.query.text"?: string;
54
- "gen_ai.system_instructions"?: JsonValue;
55
- "gen_ai.token.type"?: GenAiTokenType;
56
- "gen_ai.tool.call.arguments"?: JsonValue;
57
- "gen_ai.tool.call.id"?: string;
58
- "gen_ai.tool.call.result"?: JsonValue;
59
- "gen_ai.tool.definitions"?: JsonValue;
60
- "gen_ai.tool.description"?: string;
61
- "gen_ai.tool.name"?: string;
62
- "gen_ai.tool.type"?: GenAiToolType;
63
- "gen_ai.usage.cache_creation.input_tokens"?: number;
64
- "gen_ai.usage.cache_read.input_tokens"?: number;
65
- "gen_ai.usage.input_tokens"?: number;
66
- "gen_ai.usage.output_tokens"?: number;
67
- "gen_ai.usage.reasoning.output_tokens"?: number;
68
- "gen_ai.workflow.name"?: string;
69
- };
70
- /** Attribute keys defined by the OpenTelemetry GenAI semantic conventions. */
71
- type GenAiSemanticAttributeKey = keyof GenAiSemanticAttributes;
72
- /** Typed OpenTelemetry semantic attributes accepted on normalized spans. */
73
- type OpenTelemetrySemanticAttributes = GenAiSemanticAttributes & {
74
- "error.type"?: string;
75
- "server.address"?: string;
76
- "server.port"?: number;
77
- };
78
- /** Known OpenTelemetry semantic attribute keys accepted on normalized spans. */
79
- type OpenTelemetrySemanticAttributeKey = keyof OpenTelemetrySemanticAttributes;
80
- /** Attribute keys accepted on normalized spans. */
81
- type NormalizedSpanAttributeKey = OpenTelemetrySemanticAttributeKey | (string & {});
82
- /**
83
- * JSON-safe span attributes. Known OpenTelemetry GenAI keys are typed while
84
- * custom provider and application keys remain allowed.
85
- */
86
- type NormalizedSpanAttributes = OpenTelemetrySemanticAttributes & {
87
- [key: string]: JsonValue | undefined;
88
- };
89
- /** Event attached to one normalized span. */
90
- type NormalizedSpanEvent = {
91
- /** Event name emitted by the runtime or harness. */
92
- name: string;
93
- /** ISO timestamp for the event when available. */
94
- timestamp?: string;
95
- /** JSON-safe event attributes. */
96
- attributes?: NormalizedSpanAttributes;
97
- };
98
- /** Normalized operation span captured during a harness run. */
99
- type NormalizedSpan = {
100
- /** Runtime or provider span id when one is available. */
101
- id?: string;
102
- /** Trace id this span belongs to. */
103
- traceId?: string;
104
- /** Parent span id when the runtime exposes hierarchy. */
105
- parentId?: string;
106
- /** Human-readable operation name. */
107
- name: string;
108
- /** Coarse operation kind used by reporters and judges. */
109
- kind?: "run" | "agent" | "model" | "tool" | "guardrail" | "handoff" | "custom";
110
- /** ISO timestamp for the start of the span. */
111
- startedAt?: string;
112
- /** ISO timestamp for the end of the span. */
113
- finishedAt?: string;
114
- /** Span duration in milliseconds. */
115
- durationMs?: number;
116
- /** Success or failure status for the span. */
117
- status?: "ok" | "error";
118
- /** Normalized error when the span failed. */
119
- error?: {
120
- message: string;
121
- type?: string;
122
- [key: string]: JsonValue | undefined;
123
- };
124
- /** JSON-safe operation attributes. */
125
- attributes?: NormalizedSpanAttributes;
126
- /** Events observed inside this span. */
127
- events?: NormalizedSpanEvent[];
128
- };
129
- /** Normalized trace captured during a harness run. */
130
- type NormalizedTrace = {
131
- /** Runtime or provider trace id when one is available. */
132
- id?: string;
133
- /** Human-readable trace or workflow name. */
134
- name?: string;
135
- /** ISO timestamp for the start of the trace. */
136
- startedAt?: string;
137
- /** ISO timestamp for the end of the trace. */
138
- finishedAt?: string;
139
- /** Trace duration in milliseconds. */
140
- durationMs?: number;
141
- /** Extra JSON-safe trace metadata. */
142
- metadata?: Record<string, JsonValue>;
143
- /** Spans that make up this trace. */
144
- spans: NormalizedSpan[];
145
- };
1
+ import { JsonValue, HarnessRun, GenAiOperationName, ToolCallRecord, NormalizedSpanEvent, NormalizedSpan, NormalizedTrace, NormalizedMessage, UsageSummary, TimingSummary, NormalizedSpanAttributes, HarnessRunError, NormalizedSession } from '@vitest-evals/core';
2
+ export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue, NormalizedMessage, NormalizedSession, NormalizedSpan, NormalizedSpanAttributeKey, NormalizedSpanAttributes, NormalizedSpanEvent, NormalizedTrace, OpenTelemetrySemanticAttributeKey, OpenTelemetrySemanticAttributes, TimingSummary, ToolCallRecord, UsageSummary, assistantMessages, failedSpans, latestAssistantMessageContent, messagesByRole, spans, spansByKind, systemMessages, toolCalls, toolMessages, userMessages } from '@vitest-evals/core';
3
+
146
4
  /** Options for converting normalized tool calls into trace spans. */
147
5
  type CreateToolCallSpansOptions = {
148
6
  /** Trace id to attach to each generated tool span. */
@@ -167,173 +25,12 @@ type EnsureRunTraceOptions = {
167
25
  /** Optional JSON-safe source marker for the trace metadata. */
168
26
  source?: string;
169
27
  };
170
- /**
171
- * Normalized record for one tool call observed during a harness run.
172
- *
173
- * @example
174
- * ```ts
175
- * const call: ToolCallRecord = {
176
- * name: "lookupInvoice",
177
- * arguments: { invoiceId: "inv_123" },
178
- * result: { refundable: true },
179
- * };
180
- * ```
181
- */
182
- type ToolCallRecord = {
183
- /** Provider or runtime tool-call id when one is available. */
184
- id?: string;
185
- /** Tool name as exposed to the agent or application runtime. */
186
- name: string;
187
- /** JSON-safe tool arguments after provider/runtime normalization. */
188
- arguments?: Record<string, JsonValue>;
189
- /** JSON-safe tool result returned by the application tool. */
190
- result?: JsonValue;
191
- /** Normalized tool error when execution failed. */
192
- error?: {
193
- message: string;
194
- type?: string;
195
- [key: string]: JsonValue | undefined;
196
- };
197
- /** ISO timestamp for the start of tool execution. */
198
- startedAt?: string;
199
- /** ISO timestamp for the end of tool execution. */
200
- finishedAt?: string;
201
- /** Tool execution duration in milliseconds. */
202
- durationMs?: number;
203
- /** Extra JSON-safe tool metadata for reporters and custom judges. */
204
- metadata?: Record<string, JsonValue>;
205
- };
206
- /**
207
- * Normalized message recorded in a harness session transcript.
208
- *
209
- * @example
210
- * ```ts
211
- * const message: NormalizedMessage = {
212
- * role: "assistant",
213
- * content: { status: "approved" },
214
- * toolCalls: [{ name: "lookupInvoice" }],
215
- * };
216
- * ```
217
- */
218
- type NormalizedMessage = {
219
- /** Transcript role for the normalized message. */
220
- role: "system" | "user" | "assistant" | "tool";
221
- /** JSON-safe message content. */
222
- content?: JsonValue;
223
- /** Tool calls associated with this message. */
224
- toolCalls?: ToolCallRecord[];
225
- /** Extra JSON-safe message metadata. */
226
- metadata?: Record<string, JsonValue>;
227
- };
228
- /**
229
- * Provider usage summary attached to a normalized harness run.
230
- *
231
- * @example
232
- * ```ts
233
- * const usage: UsageSummary = {
234
- * provider: "openai",
235
- * model: "gpt-4o-mini",
236
- * inputTokens: 212,
237
- * outputTokens: 48,
238
- * totalTokens: 260,
239
- * };
240
- * ```
241
- */
242
- type UsageSummary = {
243
- /** Provider that served the application run. */
244
- provider?: string;
245
- /** Model used for the application run. */
246
- model?: string;
247
- /** Input, prompt, or request tokens consumed by the run. */
248
- inputTokens?: number;
249
- /** Output or completion tokens produced by the run. */
250
- outputTokens?: number;
251
- /** Reasoning tokens reported by providers that expose them. */
252
- reasoningTokens?: number;
253
- /** Total token count reported by the provider or adapter. */
254
- totalTokens?: number;
255
- /** Count of tool calls observed during the run. */
256
- toolCalls?: number;
257
- /** Retry count observed during the run. */
258
- retries?: number;
259
- /** Provider-specific JSON-safe usage details. Cost estimates belong here. */
260
- metadata?: Record<string, JsonValue>;
261
- };
262
- /** Timing summary attached to a normalized harness run. */
263
- type TimingSummary = {
264
- /** End-to-end run duration in milliseconds. */
265
- totalMs?: number;
266
- /** Extra JSON-safe timing metadata. */
267
- metadata?: Record<string, JsonValue>;
268
- };
269
- /**
270
- * JSON-serializable transcript produced by the system under test.
271
- *
272
- * @example
273
- * ```ts
274
- * const session: NormalizedSession = {
275
- * provider: "openai",
276
- * model: "gpt-4o-mini",
277
- * messages: [
278
- * { role: "user", content: "Refund invoice inv_123" },
279
- * { role: "assistant", content: { status: "approved" } },
280
- * ],
281
- * };
282
- * ```
283
- */
284
- type NormalizedSession = {
285
- /** Ordered normalized transcript messages. */
286
- messages: NormalizedMessage[];
287
- /** Provider that produced the session when known. */
288
- provider?: string;
289
- /** Model that produced the session when known. */
290
- model?: string;
291
- /** Extra JSON-safe session metadata. */
292
- metadata?: Record<string, JsonValue>;
293
- };
294
28
  type OutputField<TOutput extends JsonValue | undefined> = undefined extends TOutput ? {
295
29
  output?: TOutput;
296
30
  } : {
297
31
  output: TOutput;
298
32
  };
299
- /**
300
- * Normalized result returned by every harness execution.
301
- *
302
- * @example
303
- * ```ts
304
- * const run: HarnessRun<{ status: "approved" }> = {
305
- * output: { status: "approved" },
306
- * session: {
307
- * messages: [
308
- * { role: "user", content: "Refund invoice inv_123" },
309
- * { role: "assistant", content: { status: "approved" } },
310
- * ],
311
- * },
312
- * usage: { totalTokens: 260 },
313
- * errors: [],
314
- * };
315
- * ```
316
- */
317
- type HarnessRun<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
318
- /** Normalized transcript and provider/session metadata. */
319
- session: NormalizedSession;
320
- /** Stable provider usage units such as tokens, tools, and retries. */
321
- usage: UsageSummary;
322
- /** Optional timing summary for the run. */
323
- timings?: TimingSummary;
324
- /** JSON-safe run artifacts captured by the harness or test context. */
325
- artifacts?: Record<string, JsonValue>;
326
- /** Normalized traces and spans captured during execution. */
327
- traces?: NormalizedTrace[];
328
- /** Normalized errors captured during execution. */
329
- errors: Array<Record<string, JsonValue>>;
330
- };
331
- /** Error value with an attached partial or complete normalized harness run. */
332
- type HarnessRunError = Error & {
333
- /** Attached normalized harness run recovered by `getHarnessRunFromError(...)`. */
334
- vitestEvalsRun: HarnessRun;
335
- };
336
- /** Per-run metadata shape accepted by harnesses and eval tests. */
33
+ /** Generic JSON-like metadata record used by normalized artifacts and reports. */
337
34
  type HarnessMetadata = Record<string, unknown>;
338
35
  /**
339
36
  * Runtime context passed from the eval fixture into a harness run.
@@ -355,9 +52,7 @@ type HarnessMetadata = Record<string, unknown>;
355
52
  * };
356
53
  * ```
357
54
  */
358
- type HarnessContext<TMetadata extends HarnessMetadata = HarnessMetadata> = {
359
- /** Per-run metadata passed through `run(input, { metadata })`. */
360
- metadata: Readonly<TMetadata>;
55
+ type HarnessContext = {
361
56
  /** Abort signal from Vitest when available. */
362
57
  signal?: AbortSignal;
363
58
  /** Mutable JSON-safe artifact bag shared with the harness. */
@@ -378,11 +73,11 @@ type HarnessContext<TMetadata extends HarnessMetadata = HarnessMetadata> = {
378
73
  * };
379
74
  * ```
380
75
  */
381
- type Harness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
76
+ type Harness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined> = {
382
77
  /** Stable harness name used in reports. */
383
78
  name: string;
384
79
  /** Executes the system under test and returns a normalized run. */
385
- run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun<TOutput>>;
80
+ run: (input: TInput, context: HarnessContext) => Promise<HarnessRun<TOutput>>;
386
81
  };
387
82
  /** Value or promise accepted by lightweight harness callbacks. */
388
83
  type MaybePromise<T> = T | Promise<T>;
@@ -451,17 +146,15 @@ type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | und
451
146
  /** Either a complete normalized run or a lightweight result to normalize. */
452
147
  type HarnessResultLike<TOutput extends JsonValue | undefined = JsonValue | undefined> = HarnessRun<TOutput> | SimpleHarnessResult<TOutput>;
453
148
  /** Arguments passed to the `createHarness(...)` convenience callback. */
454
- type CreateHarnessRunArgs<TInput, TMetadata extends HarnessMetadata> = {
149
+ type CreateHarnessRunArgs<TInput> = {
455
150
  /** Original input passed to `run(input)`. */
456
151
  input: TInput;
457
- /** Read-only metadata passed to `run(input, { metadata })`. */
458
- metadata: Readonly<TMetadata>;
459
152
  /** Abort signal from Vitest when available. */
460
153
  signal?: AbortSignal;
461
154
  /** Mutable run artifact bag. */
462
- artifacts: HarnessContext<TMetadata>["artifacts"];
155
+ artifacts: HarnessContext["artifacts"];
463
156
  /** Stores one JSON-safe artifact on the current run. */
464
- setArtifact: HarnessContext<TMetadata>["setArtifact"];
157
+ setArtifact: HarnessContext["setArtifact"];
465
158
  };
466
159
  /**
467
160
  * Options for creating a lightweight custom application harness.
@@ -476,11 +169,11 @@ type CreateHarnessRunArgs<TInput, TMetadata extends HarnessMetadata> = {
476
169
  * };
477
170
  * ```
478
171
  */
479
- type CreateHarnessOptions<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
172
+ type CreateHarnessOptions<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined> = {
480
173
  /** Stable harness name used in reports. */
481
174
  name: string;
482
175
  /** Executes application code and returns either a lightweight result or full `HarnessRun`. */
483
- run: (args: CreateHarnessRunArgs<TInput, TMetadata>) => MaybePromise<HarnessResultLike<TOutput>>;
176
+ run: (args: CreateHarnessRunArgs<TInput>) => MaybePromise<HarnessResultLike<TOutput>>;
484
177
  };
485
178
  /** Returns true when a value exposes a callable method with the given name. */
486
179
  declare function hasCallableMethod(value: unknown, methodName: string): boolean;
@@ -503,15 +196,14 @@ declare function normalizeContent(value: unknown): JsonValue;
503
196
  *
504
197
  * export const refundHarness = createHarness<
505
198
  * string,
506
- * { status: "approved" | "denied" },
507
- * { expected: { status: "approved" | "denied" } }
199
+ * { status: "approved" | "denied" }
508
200
  * >({
509
201
  * name: "refund-agent",
510
- * run: async ({ input, metadata, setArtifact }) => {
511
- * const result = await runRefundFlow(input, metadata);
202
+ * run: async ({ input, setArtifact }) => {
203
+ * const result = await runRefundFlow(input);
512
204
  * const output = { status: result.status };
513
205
  *
514
- * setArtifact("case", { expected: metadata.expected.status });
206
+ * setArtifact("case", { invoiceId: result.invoiceId });
515
207
  *
516
208
  * return {
517
209
  * output,
@@ -522,7 +214,7 @@ declare function normalizeContent(value: unknown): JsonValue;
522
214
  * });
523
215
  * ```
524
216
  */
525
- declare function createHarness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata>(options: CreateHarnessOptions<TInput, TOutput, TMetadata>): Harness<TInput, TOutput, TMetadata>;
217
+ declare function createHarness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined>(options: CreateHarnessOptions<TInput, TOutput>): Harness<TInput, TOutput>;
526
218
  /**
527
219
  * Normalizes a lightweight harness result into the reporter-facing run shape.
528
220
  *
@@ -541,7 +233,7 @@ declare function createHarness<TInput = unknown, TOutput extends JsonValue | und
541
233
  * expect(toolCalls(run.session)).toHaveLength(1);
542
234
  * ```
543
235
  */
544
- declare function normalizeHarnessRun<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, TOutput extends JsonValue | undefined = JsonValue | undefined>(input: TInput, result: HarnessResultLike<TOutput>, context?: HarnessContext<TMetadata>): HarnessRun<TOutput>;
236
+ declare function normalizeHarnessRun<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined>(input: TInput, result: HarnessResultLike<TOutput>, context?: HarnessContext): HarnessRun<TOutput>;
545
237
  /**
546
238
  * Builds a JSON-safe failed run for errors that happen before a harness can return.
547
239
  *
@@ -567,19 +259,6 @@ declare function createGenAiUsageAttributes(usage: UsageSummary | undefined, opt
567
259
  "gen_ai.usage.output_tokens": number | undefined;
568
260
  "gen_ai.usage.reasoning.output_tokens": number | undefined;
569
261
  };
570
- /**
571
- * Flattens every recorded tool call from a normalized session.
572
- *
573
- * @param session - Normalized session produced by a harness run.
574
- *
575
- * @example
576
- * ```ts
577
- * const names = toolCalls(result.session).map((call) => call.name);
578
- *
579
- * expect(names).toEqual(["lookupInvoice", "createRefund"]);
580
- * ```
581
- */
582
- declare function toolCalls(session: NormalizedSession): ToolCallRecord[];
583
262
  /**
584
263
  * Converts normalized tool-call records into trace spans.
585
264
  *
@@ -594,99 +273,6 @@ declare function createToolCallSpans(calls: ToolCallRecord[], options?: CreateTo
594
273
  * remain free to attach richer native traces.
595
274
  */
596
275
  declare function ensureRunTrace(run: HarnessRun, options: EnsureRunTraceOptions): NormalizedTrace | undefined;
597
- /**
598
- * Flattens every recorded span from a normalized harness run.
599
- *
600
- * @param run - Normalized harness run produced by a harness.
601
- *
602
- * @example
603
- * ```ts
604
- * const modelSpans = spans(result).filter((span) => span.kind === "model");
605
- * ```
606
- */
607
- declare function spans(run: HarnessRun): NormalizedSpan[];
608
- /**
609
- * Returns spans of one coarse operation kind from a normalized run.
610
- *
611
- * @param run - Normalized harness run produced by a harness.
612
- * @param kind - Span kind to keep.
613
- */
614
- declare function spansByKind(run: HarnessRun, kind: NonNullable<NormalizedSpan["kind"]>): NormalizedSpan[];
615
- /**
616
- * Returns every span that explicitly failed or carries a normalized error.
617
- *
618
- * @param run - Normalized harness run produced by a harness.
619
- */
620
- declare function failedSpans(run: HarnessRun): NormalizedSpan[];
621
- /**
622
- * Filters normalized session messages by role.
623
- *
624
- * @param session - Normalized session produced by a harness run.
625
- * @param role - Message role to keep.
626
- *
627
- * @example
628
- * ```ts
629
- * const assistantText = messagesByRole(result.session, "assistant")
630
- * .map((message) => message.content)
631
- * .join("\n");
632
- * ```
633
- */
634
- declare function messagesByRole(session: NormalizedSession, role: NormalizedMessage["role"]): NormalizedMessage[];
635
- /**
636
- * Returns every normalized system message from a session.
637
- *
638
- * @param session - Normalized session produced by a harness run.
639
- *
640
- * @example
641
- * ```ts
642
- * const systemPrompts = systemMessages(result.session);
643
- * ```
644
- */
645
- declare function systemMessages(session: NormalizedSession): NormalizedMessage[];
646
- /**
647
- * Returns every normalized user message from a session.
648
- *
649
- * @param session - Normalized session produced by a harness run.
650
- *
651
- * @example
652
- * ```ts
653
- * const firstPrompt = userMessages(result.session)[0]?.content;
654
- * ```
655
- */
656
- declare function userMessages(session: NormalizedSession): NormalizedMessage[];
657
- /**
658
- * Returns every normalized assistant message from a session.
659
- *
660
- * @param session - Normalized session produced by a harness run.
661
- *
662
- * @example
663
- * ```ts
664
- * const finalAnswer = assistantMessages(result.session).at(-1)?.content;
665
- * ```
666
- */
667
- declare function assistantMessages(session: NormalizedSession): NormalizedMessage[];
668
- /**
669
- * Returns the latest assistant message content, ignoring empty text messages.
670
- *
671
- * @param session - Normalized session produced by a harness run.
672
- *
673
- * @example
674
- * ```ts
675
- * const finalAnswer = latestAssistantMessageContent(result.session);
676
- * ```
677
- */
678
- declare function latestAssistantMessageContent(session: NormalizedSession): JsonValue | undefined;
679
- /**
680
- * Returns every normalized tool message from a session.
681
- *
682
- * @param session - Normalized session produced by a harness run.
683
- *
684
- * @example
685
- * ```ts
686
- * const toolOutputs = toolMessages(result.session).map((message) => message.content);
687
- * ```
688
- */
689
- declare function toolMessages(session: NormalizedSession): NormalizedMessage[];
690
276
  /**
691
277
  * Attaches a partial or complete harness run to an arbitrary thrown error.
692
278
  *
@@ -727,4 +313,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
727
313
  /** Serializes an arbitrary thrown value into the normalized error shape. */
728
314
  declare function serializeError(error: unknown): Record<string, JsonValue>;
729
315
 
730
- export { type CreateHarnessOptions, type CreateHarnessRunArgs, type CreateToolCallSpansOptions, type EnsureRunTraceOptions, type GenAiOperationName, type GenAiOutputType, type GenAiProviderName, type GenAiSemanticAttributeKey, type GenAiSemanticAttributes, type GenAiTokenType, type GenAiToolType, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type MaybePromise, type NormalizedMessage, type NormalizedSession, type NormalizedSpan, type NormalizedSpanAttributeKey, type NormalizedSpanAttributes, type NormalizedSpanEvent, type NormalizedTrace, type OpenTelemetrySemanticAttributeKey, type OpenTelemetrySemanticAttributes, type SimpleHarnessResult, type SimpleSpanEvent, type SimpleSpanRecord, type SimpleToolCallRecord, type SimpleTraceRecord, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, createToolCallSpans, ensureRunTrace, failedSpans, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, latestAssistantMessageContent, messagesByRole, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, normalizeSpanAttributes, normalizeSpanError, resolveHarnessRunErrors, serializeError, spans, spansByKind, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
316
+ export { type CreateHarnessOptions, type CreateHarnessRunArgs, type CreateToolCallSpansOptions, type EnsureRunTraceOptions, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type MaybePromise, type SimpleHarnessResult, type SimpleSpanEvent, type SimpleSpanRecord, type SimpleToolCallRecord, type SimpleTraceRecord, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, createToolCallSpans, ensureRunTrace, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, normalizeSpanAttributes, normalizeSpanError, resolveHarnessRunErrors, serializeError, toJsonValue };