vitest-evals 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. package/README.md +57 -10
  2. package/dist/harness.d.mts +56 -40
  3. package/dist/harness.d.ts +56 -40
  4. package/dist/harness.js +34 -104
  5. package/dist/harness.js.map +1 -1
  6. package/dist/harness.mjs +37 -104
  7. package/dist/harness.mjs.map +1 -1
  8. package/dist/index.d.mts +6 -6
  9. package/dist/index.d.ts +6 -6
  10. package/dist/index.js +56 -117
  11. package/dist/index.js.map +1 -1
  12. package/dist/index.mjs +59 -117
  13. package/dist/index.mjs.map +1 -1
  14. package/dist/internal/scoring.d.mts +2 -2
  15. package/dist/internal/scoring.d.ts +2 -2
  16. package/dist/internal/scoring.js.map +1 -1
  17. package/dist/internal/toolCallScorer.js.map +1 -1
  18. package/dist/internal/toolCallScorer.mjs +4 -1
  19. package/dist/internal/toolCallScorer.mjs.map +1 -1
  20. package/dist/judges/factualityJudge.js.map +1 -1
  21. package/dist/judges/factualityJudge.mjs +4 -1
  22. package/dist/judges/factualityJudge.mjs.map +1 -1
  23. package/dist/judges/index.js +47 -110
  24. package/dist/judges/index.js.map +1 -1
  25. package/dist/judges/index.mjs +51 -111
  26. package/dist/judges/index.mjs.map +1 -1
  27. package/dist/judges/judgeHarness.js +47 -110
  28. package/dist/judges/judgeHarness.js.map +1 -1
  29. package/dist/judges/judgeHarness.mjs +51 -111
  30. package/dist/judges/judgeHarness.mjs.map +1 -1
  31. package/dist/judges/toolCallJudge.js.map +1 -1
  32. package/dist/judges/toolCallJudge.mjs +4 -1
  33. package/dist/judges/toolCallJudge.mjs.map +1 -1
  34. package/dist/judges/types.d.mts +2 -2
  35. package/dist/judges/types.d.ts +2 -2
  36. package/dist/judges/types.js.map +1 -1
  37. package/dist/legacy/scorers/index.js.map +1 -1
  38. package/dist/legacy/scorers/index.mjs +4 -1
  39. package/dist/legacy/scorers/index.mjs.map +1 -1
  40. package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
  41. package/dist/legacy/scorers/toolCallScorer.mjs +4 -1
  42. package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
  43. package/dist/legacy/shared.d.mts +1 -8
  44. package/dist/legacy/shared.d.ts +1 -8
  45. package/dist/legacy/shared.js.map +1 -1
  46. package/dist/legacy.js +15 -1
  47. package/dist/legacy.js.map +1 -1
  48. package/dist/legacy.mjs +19 -2
  49. package/dist/legacy.mjs.map +1 -1
  50. package/dist/reporter.d.mts +0 -3
  51. package/dist/reporter.d.ts +0 -3
  52. package/dist/reporter.js +10 -40
  53. package/dist/reporter.js.map +1 -1
  54. package/dist/reporter.mjs +14 -41
  55. package/dist/reporter.mjs.map +1 -1
  56. package/package.json +3 -3
package/README.md CHANGED
@@ -33,12 +33,14 @@ workflow.
33
33
  - `run(input)` executes the harness explicitly and returns a normalized
34
34
  `HarnessRun`
35
35
  - the returned `result.output` is the app-facing value you assert on directly
36
- - the returned `result.session` is the canonical JSON-serializable transcript for
37
- reporting, replay, tool assertions, and judges
38
- - the returned `result.traces` contains JSON-serializable operation spans; the
39
- first-party harnesses attach run, model, and tool spans automatically, while
40
- `createHarness(...)` attaches fallback run and tool spans for custom harnesses
41
- that do not return traces themselves. Span attributes include typed
36
+ - helper assertions usually read the returned `result`, for example
37
+ `toolCalls(result)` or `assistantMessages(result)`
38
+ - `result.session` is the canonical JSON-serializable transcript for reporting,
39
+ replay, tool assertions, and judges
40
+ - `result.traces` contains JSON-serializable operation spans; first-party
41
+ harnesses attach native spans when provider/runtime data is available, while
42
+ `createHarness(...)` attaches a fallback run span for custom harnesses that do
43
+ not return traces themselves. Span attributes include typed
42
44
  OpenTelemetry GenAI semantic keys while still allowing provider-specific
43
45
  metadata
44
46
  - scenario-specific judge criteria should live in `input` or explicit matcher
@@ -91,7 +93,7 @@ describeEval(
91
93
  const result = await run("Refund invoice inv_123");
92
94
 
93
95
  expect(result.output).toMatchObject({ status: "approved" });
94
- expect(toolCalls(result.session).map((call) => call.name)).toEqual([
96
+ expect(toolCalls(result).map((call) => call.name)).toEqual([
95
97
  "lookupInvoice",
96
98
  "createRefund",
97
99
  ]);
@@ -245,6 +247,18 @@ const appHarness = createHarness<AppEvalInput, AppOutput>({
245
247
  });
246
248
 
247
249
  return {
250
+ events: [
251
+ {
252
+ type: "message",
253
+ role: "user",
254
+ content: input.events.map((event) => event.type).join(", "),
255
+ },
256
+ {
257
+ type: "message",
258
+ role: "assistant",
259
+ content: result.replies.map((reply) => reply.text).join("\n"),
260
+ },
261
+ ],
248
262
  output: {
249
263
  replies: result.replies,
250
264
  sideEffects: result.sideEffects,
@@ -317,9 +331,42 @@ Use `Harness.run(...)` for the application under test. Calling
317
331
  so reserve that for judges that intentionally need a second execution. Put
318
332
  criteria on `input` when they are part of the scenario itself; pass
319
333
  case-specific judge criteria through matcher options, or configure suite-wide
320
- criteria on the judge instance. `createHarness(...)` builds a default
321
- user/assistant session from `input` and typed `output`; return a full
322
- `HarnessRun` only when you need exact session control.
334
+ criteria on the judge instance.
335
+
336
+ `createHarness(...)` lightweight results must return at least one normalized
337
+ event, either directly as `events` or from strict camelCase `messages`. Stored
338
+ run metadata always uses `session.events`, a flat ordered transcript:
339
+
340
+ ```ts
341
+ events: [
342
+ { type: "message", role: "user", content: input },
343
+ {
344
+ type: "tool_call",
345
+ id: "call_lookup",
346
+ name: "lookupInvoice",
347
+ arguments: { invoiceId: "inv_123" },
348
+ },
349
+ {
350
+ type: "tool_result",
351
+ toolCallId: "call_lookup",
352
+ name: "lookupInvoice",
353
+ content: { refundable: true },
354
+ },
355
+ { type: "message", role: "assistant", content: output },
356
+ ];
357
+ ```
358
+
359
+ For apps that already produce message-shaped data, returning `messages` is also
360
+ accepted; the harness normalizer converts assistant `toolCalls`, `role: "tool"`
361
+ results keyed by `toolCallId`, and AI SDK `tool-call`/`tool-result` content
362
+ parts into the same flat `events` shape. Provider wire formats such as OpenAI
363
+ snake_case fields should be mapped by the harness before they reach this
364
+ boundary. Other provider content blocks or item streams should adapt those
365
+ records into `events` directly. Assertions and judges should read normalized
366
+ projections through helpers such as `toolCalls(result)`, `userMessages(result)`,
367
+ `assistantMessages(result)`, `toolMessages(result)`, and `spans(result)` instead
368
+ of manually walking provider payloads. Return a full `HarnessRun` only when you
369
+ need exact canonical `session.events`, trace, or usage control.
323
370
 
324
371
  Provider setup and rubric parsing stay in your judge. The core
325
372
  package only requires the judge to return a `JudgeResult` with a score and
package/dist/harness.d.mts CHANGED
@@ -1,15 +1,6 @@
1
- import { JsonValue, HarnessRun, GenAiOperationName, ToolCallRecord, NormalizedSpanEvent, NormalizedSpan, NormalizedTrace, NormalizedMessage, UsageSummary, TimingSummary, NormalizedSpanAttributes, HarnessRunError, NormalizedSession } from '@vitest-evals/core';
2
- export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue, NormalizedMessage, NormalizedSession, NormalizedSpan, NormalizedSpanAttributeKey, NormalizedSpanAttributes, NormalizedSpanEvent, NormalizedTrace, OpenTelemetrySemanticAttributeKey, OpenTelemetrySemanticAttributes, TimingSummary, ToolCallRecord, UsageSummary, assistantMessages, failedSpans, latestAssistantMessageContent, messagesByRole, spans, spansByKind, systemMessages, toolCalls, toolMessages, userMessages } from '@vitest-evals/core';
1
+ import { JsonValue, HarnessRun, GenAiOperationName, NormalizedSpanEvent, NormalizedSpan, NormalizedTrace, TranscriptEvent, TranscriptMessageInput, UsageSummary, TimingSummary, NormalizedSpanAttributes, HarnessRunError, NormalizedSession } from '@vitest-evals/core';
2
+ export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue, NormalizedError, NormalizedSession, NormalizedSpan, NormalizedSpanAttributeKey, NormalizedSpanAttributes, NormalizedSpanEvent, NormalizedTrace, OpenTelemetrySemanticAttributeKey, OpenTelemetrySemanticAttributes, TimingSummary, ToolCall, TranscriptEvent, TranscriptMessageContentPart, TranscriptMessageEvent, TranscriptMessageInput, TranscriptMessageTextPart, TranscriptMessageToolCall, TranscriptMessageToolCallPart, TranscriptMessageToolResultPart, TranscriptToolCallEvent, TranscriptToolResultEvent, UsageSummary, assistantMessages, failedSpans, latestAssistantMessageContent, messagesByRole, messagesToTranscriptEvents, spans, spansByKind, systemMessages, toolCalls, toolMessages, userMessages } from '@vitest-evals/core';
3
3
 
4
- /** Options for converting normalized tool calls into trace spans. */
5
- type CreateToolCallSpansOptions = {
6
- /** Trace id to attach to each generated tool span. */
7
- traceId?: string;
8
- /** Parent span id to attach to each generated tool span. */
9
- parentId?: string;
10
- /** Prefix used to create internal span ids instead of reusing tool-call ids. */
11
- spanIdPrefix?: string;
12
- };
13
4
  /** Options for attaching a fallback run trace to a harness result. */
14
5
  type EnsureRunTraceOptions = {
15
6
  /** Human-readable run or harness name. */
@@ -44,7 +35,7 @@ type HarnessMetadata = Record<string, unknown>;
44
35
  *
45
36
  * return {
46
37
  * output: undefined,
47
- * session: { messages: [{ role: "user", content: input }] },
38
+ * session: { events: [{ type: "message", role: "user", content: input }] },
48
39
  * usage: {},
49
40
  * errors: [],
50
41
  * };
@@ -81,17 +72,6 @@ type Harness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue
81
72
  };
82
73
  /** Value or promise accepted by lightweight harness callbacks. */
83
74
  type MaybePromise<T> = T | Promise<T>;
84
- /** Lightweight tool-call record accepted by `createHarness(...)` results. */
85
- type SimpleToolCallRecord = Omit<ToolCallRecord, "arguments" | "result" | "error" | "metadata"> & {
86
- /** Raw tool arguments accepted by `createHarness(...)` before normalization. */
87
- arguments?: unknown;
88
- /** Raw tool result accepted by `createHarness(...)` before normalization. */
89
- result?: unknown;
90
- /** Raw tool error accepted by `createHarness(...)` before normalization. */
91
- error?: unknown;
92
- /** Raw tool metadata accepted by `createHarness(...)` before normalization. */
93
- metadata?: Record<string, unknown>;
94
- };
95
75
  /** Lightweight span event accepted by `createHarness(...)` results. */
96
76
  type SimpleSpanEvent = Omit<NormalizedSpanEvent, "attributes"> & {
97
77
  /** Raw event attributes accepted by `createHarness(...)` before normalization. */
@@ -113,6 +93,16 @@ type SimpleTraceRecord = Omit<NormalizedTrace, "metadata" | "spans"> & {
113
93
  /** Lightweight spans to normalize into the trace. */
114
94
  spans: SimpleSpanRecord[];
115
95
  };
96
+ /** Lightweight transcript input accepted by `createHarness(...)` results. */
97
+ type SimpleTranscriptInput = {
98
+ /** Ordered normalized transcript events for the application run. */
99
+ events: TranscriptEvent[];
100
+ messages?: never;
101
+ } | {
102
+ /** Strict camelCase message transport normalized into transcript events. */
103
+ messages: TranscriptMessageInput[];
104
+ events?: never;
105
+ };
116
106
  /**
117
107
  * Lightweight result shape normalized by `createHarness(...)`.
118
108
  *
@@ -120,16 +110,15 @@ type SimpleTraceRecord = Omit<NormalizedTrace, "metadata" | "spans"> & {
120
110
  * ```ts
121
111
  * const result: SimpleHarnessResult<{ status: "approved" }> = {
122
112
  * output: { status: "approved" },
123
- * toolCalls: [{ name: "lookupInvoice", arguments: { invoiceId: "inv_123" } }],
113
+ * events: [
114
+ * { type: "message", role: "user", content: "Refund invoice inv_123" },
115
+ * { type: "message", role: "assistant", content: { status: "approved" } },
116
+ * ],
124
117
  * usage: { totalTokens: 260 },
125
118
  * };
126
119
  * ```
127
120
  */
128
- type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
129
- /** Pre-normalized transcript messages. When omitted, a default user/assistant transcript is created. */
130
- messages?: NormalizedMessage[];
131
- /** Lightweight tool-call records to normalize into the session. */
132
- toolCalls?: SimpleToolCallRecord[];
121
+ type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & SimpleTranscriptInput & {
133
122
  /** Usage summary to attach to the run. */
134
123
  usage?: UsageSummary;
135
124
  /** Timing summary to attach to the run. */
@@ -159,12 +148,17 @@ type CreateHarnessRunArgs<TInput> = {
159
148
  /**
160
149
  * Options for creating a lightweight custom application harness.
161
150
  *
151
+ * Prefer this helper for custom harnesses. Implement `Harness` directly only
152
+ * when the callback already returns a full `HarnessRun` with canonical
153
+ * `session.events`.
154
+ *
162
155
  * @example
163
156
  * ```ts
164
157
  * const options: CreateHarnessOptions<string, { status: "approved" }> = {
165
158
  * name: "refund-agent",
166
159
  * run: async ({ input }) => ({
167
160
  * output: await classifyRefund(input),
161
+ * events: [{ type: "message", role: "user", content: input }],
168
162
  * }),
169
163
  * };
170
164
  * ```
@@ -207,7 +201,22 @@ declare function normalizeContent(value: unknown): JsonValue;
207
201
  *
208
202
  * return {
209
203
  * output,
210
- * toolCalls: result.toolCalls,
204
+ * events: [
205
+ * { type: "message", role: "user", content: input },
206
+ * {
207
+ * type: "tool_call",
208
+ * id: "call_lookup",
209
+ * name: "lookupInvoice",
210
+ * arguments: { invoiceId: result.invoiceId },
211
+ * },
212
+ * {
213
+ * type: "tool_result",
214
+ * toolCallId: "call_lookup",
215
+ * name: "lookupInvoice",
216
+ * content: { refundable: result.refundable },
217
+ * },
218
+ * { type: "message", role: "assistant", content: output },
219
+ * ],
211
220
  * usage: { provider: "openai", model: "gpt-4o-mini" },
212
221
  * };
213
222
  * },
@@ -226,11 +235,25 @@ declare function createHarness<TInput = unknown, TOutput extends JsonValue | und
226
235
  * ```ts
227
236
  * const run = normalizeHarnessRun("Refund invoice inv_123", {
228
237
  * output: { status: "approved" },
229
- * toolCalls: [{ name: "lookupInvoice", arguments: { invoiceId: "inv_123" } }],
238
+ * events: [
239
+ * { type: "message", role: "user", content: "Refund invoice inv_123" },
240
+ * {
241
+ * type: "tool_call",
242
+ * id: "call_lookup",
243
+ * name: "lookupInvoice",
244
+ * arguments: { invoiceId: "inv_123" },
245
+ * },
246
+ * {
247
+ * type: "tool_result",
248
+ * toolCallId: "call_lookup",
249
+ * name: "lookupInvoice",
250
+ * content: { refundable: true },
251
+ * },
252
+ * ],
230
253
  * usage: { provider: "openai", model: "gpt-4o-mini" },
231
254
  * });
232
255
  *
233
- * expect(toolCalls(run.session)).toHaveLength(1);
256
+ * expect(toolCalls(run)).toHaveLength(1);
234
257
  * ```
235
258
  */
236
259
  declare function normalizeHarnessRun<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined>(input: TInput, result: HarnessResultLike<TOutput>, context?: HarnessContext): HarnessRun<TOutput>;
@@ -259,13 +282,6 @@ declare function createGenAiUsageAttributes(usage: UsageSummary | undefined, opt
259
282
  "gen_ai.usage.output_tokens": number | undefined;
260
283
  "gen_ai.usage.reasoning.output_tokens": number | undefined;
261
284
  };
262
- /**
263
- * Converts normalized tool-call records into trace spans.
264
- *
265
- * Tool-call ids are preserved as GenAI attributes. Pass `spanIdPrefix` when the
266
- * spans belong to a known trace so span ids stay internally unique.
267
- */
268
- declare function createToolCallSpans(calls: ToolCallRecord[], options?: CreateToolCallSpansOptions): NormalizedSpan[];
269
285
  /**
270
286
  * Attaches a fallback run trace when a harness result does not already contain spans.
271
287
  *
@@ -313,4 +329,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
313
329
  /** Serializes an arbitrary thrown value into the normalized error shape. */
314
330
  declare function serializeError(error: unknown): Record<string, JsonValue>;
315
331
 
316
- export { type CreateHarnessOptions, type CreateHarnessRunArgs, type CreateToolCallSpansOptions, type EnsureRunTraceOptions, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type MaybePromise, type SimpleHarnessResult, type SimpleSpanEvent, type SimpleSpanRecord, type SimpleToolCallRecord, type SimpleTraceRecord, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, createToolCallSpans, ensureRunTrace, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, normalizeSpanAttributes, normalizeSpanError, resolveHarnessRunErrors, serializeError, toJsonValue };
332
+ export { type CreateHarnessOptions, type CreateHarnessRunArgs, type EnsureRunTraceOptions, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type MaybePromise, type SimpleHarnessResult, type SimpleSpanEvent, type SimpleSpanRecord, type SimpleTraceRecord, type SimpleTranscriptInput, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, ensureRunTrace, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, normalizeSpanAttributes, normalizeSpanError, resolveHarnessRunErrors, serializeError, toJsonValue };
package/dist/harness.d.ts CHANGED
@@ -1,15 +1,6 @@
1
- import { JsonValue, HarnessRun, GenAiOperationName, ToolCallRecord, NormalizedSpanEvent, NormalizedSpan, NormalizedTrace, NormalizedMessage, UsageSummary, TimingSummary, NormalizedSpanAttributes, HarnessRunError, NormalizedSession } from '@vitest-evals/core';
2
- export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue, NormalizedMessage, NormalizedSession, NormalizedSpan, NormalizedSpanAttributeKey, NormalizedSpanAttributes, NormalizedSpanEvent, NormalizedTrace, OpenTelemetrySemanticAttributeKey, OpenTelemetrySemanticAttributes, TimingSummary, ToolCallRecord, UsageSummary, assistantMessages, failedSpans, latestAssistantMessageContent, messagesByRole, spans, spansByKind, systemMessages, toolCalls, toolMessages, userMessages } from '@vitest-evals/core';
1
+ import { JsonValue, HarnessRun, GenAiOperationName, NormalizedSpanEvent, NormalizedSpan, NormalizedTrace, TranscriptEvent, TranscriptMessageInput, UsageSummary, TimingSummary, NormalizedSpanAttributes, HarnessRunError, NormalizedSession } from '@vitest-evals/core';
2
+ export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue, NormalizedError, NormalizedSession, NormalizedSpan, NormalizedSpanAttributeKey, NormalizedSpanAttributes, NormalizedSpanEvent, NormalizedTrace, OpenTelemetrySemanticAttributeKey, OpenTelemetrySemanticAttributes, TimingSummary, ToolCall, TranscriptEvent, TranscriptMessageContentPart, TranscriptMessageEvent, TranscriptMessageInput, TranscriptMessageTextPart, TranscriptMessageToolCall, TranscriptMessageToolCallPart, TranscriptMessageToolResultPart, TranscriptToolCallEvent, TranscriptToolResultEvent, UsageSummary, assistantMessages, failedSpans, latestAssistantMessageContent, messagesByRole, messagesToTranscriptEvents, spans, spansByKind, systemMessages, toolCalls, toolMessages, userMessages } from '@vitest-evals/core';
3
3
 
4
- /** Options for converting normalized tool calls into trace spans. */
5
- type CreateToolCallSpansOptions = {
6
- /** Trace id to attach to each generated tool span. */
7
- traceId?: string;
8
- /** Parent span id to attach to each generated tool span. */
9
- parentId?: string;
10
- /** Prefix used to create internal span ids instead of reusing tool-call ids. */
11
- spanIdPrefix?: string;
12
- };
13
4
  /** Options for attaching a fallback run trace to a harness result. */
14
5
  type EnsureRunTraceOptions = {
15
6
  /** Human-readable run or harness name. */
@@ -44,7 +35,7 @@ type HarnessMetadata = Record<string, unknown>;
44
35
  *
45
36
  * return {
46
37
  * output: undefined,
47
- * session: { messages: [{ role: "user", content: input }] },
38
+ * session: { events: [{ type: "message", role: "user", content: input }] },
48
39
  * usage: {},
49
40
  * errors: [],
50
41
  * };
@@ -81,17 +72,6 @@ type Harness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue
81
72
  };
82
73
  /** Value or promise accepted by lightweight harness callbacks. */
83
74
  type MaybePromise<T> = T | Promise<T>;
84
- /** Lightweight tool-call record accepted by `createHarness(...)` results. */
85
- type SimpleToolCallRecord = Omit<ToolCallRecord, "arguments" | "result" | "error" | "metadata"> & {
86
- /** Raw tool arguments accepted by `createHarness(...)` before normalization. */
87
- arguments?: unknown;
88
- /** Raw tool result accepted by `createHarness(...)` before normalization. */
89
- result?: unknown;
90
- /** Raw tool error accepted by `createHarness(...)` before normalization. */
91
- error?: unknown;
92
- /** Raw tool metadata accepted by `createHarness(...)` before normalization. */
93
- metadata?: Record<string, unknown>;
94
- };
95
75
  /** Lightweight span event accepted by `createHarness(...)` results. */
96
76
  type SimpleSpanEvent = Omit<NormalizedSpanEvent, "attributes"> & {
97
77
  /** Raw event attributes accepted by `createHarness(...)` before normalization. */
@@ -113,6 +93,16 @@ type SimpleTraceRecord = Omit<NormalizedTrace, "metadata" | "spans"> & {
113
93
  /** Lightweight spans to normalize into the trace. */
114
94
  spans: SimpleSpanRecord[];
115
95
  };
96
+ /** Lightweight transcript input accepted by `createHarness(...)` results. */
97
+ type SimpleTranscriptInput = {
98
+ /** Ordered normalized transcript events for the application run. */
99
+ events: TranscriptEvent[];
100
+ messages?: never;
101
+ } | {
102
+ /** Strict camelCase message transport normalized into transcript events. */
103
+ messages: TranscriptMessageInput[];
104
+ events?: never;
105
+ };
116
106
  /**
117
107
  * Lightweight result shape normalized by `createHarness(...)`.
118
108
  *
@@ -120,16 +110,15 @@ type SimpleTraceRecord = Omit<NormalizedTrace, "metadata" | "spans"> & {
120
110
  * ```ts
121
111
  * const result: SimpleHarnessResult<{ status: "approved" }> = {
122
112
  * output: { status: "approved" },
123
- * toolCalls: [{ name: "lookupInvoice", arguments: { invoiceId: "inv_123" } }],
113
+ * events: [
114
+ * { type: "message", role: "user", content: "Refund invoice inv_123" },
115
+ * { type: "message", role: "assistant", content: { status: "approved" } },
116
+ * ],
124
117
  * usage: { totalTokens: 260 },
125
118
  * };
126
119
  * ```
127
120
  */
128
- type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
129
- /** Pre-normalized transcript messages. When omitted, a default user/assistant transcript is created. */
130
- messages?: NormalizedMessage[];
131
- /** Lightweight tool-call records to normalize into the session. */
132
- toolCalls?: SimpleToolCallRecord[];
121
+ type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & SimpleTranscriptInput & {
133
122
  /** Usage summary to attach to the run. */
134
123
  usage?: UsageSummary;
135
124
  /** Timing summary to attach to the run. */
@@ -159,12 +148,17 @@ type CreateHarnessRunArgs<TInput> = {
159
148
  /**
160
149
  * Options for creating a lightweight custom application harness.
161
150
  *
151
+ * Prefer this helper for custom harnesses. Implement `Harness` directly only
152
+ * when the callback already returns a full `HarnessRun` with canonical
153
+ * `session.events`.
154
+ *
162
155
  * @example
163
156
  * ```ts
164
157
  * const options: CreateHarnessOptions<string, { status: "approved" }> = {
165
158
  * name: "refund-agent",
166
159
  * run: async ({ input }) => ({
167
160
  * output: await classifyRefund(input),
161
+ * events: [{ type: "message", role: "user", content: input }],
168
162
  * }),
169
163
  * };
170
164
  * ```
@@ -207,7 +201,22 @@ declare function normalizeContent(value: unknown): JsonValue;
207
201
  *
208
202
  * return {
209
203
  * output,
210
- * toolCalls: result.toolCalls,
204
+ * events: [
205
+ * { type: "message", role: "user", content: input },
206
+ * {
207
+ * type: "tool_call",
208
+ * id: "call_lookup",
209
+ * name: "lookupInvoice",
210
+ * arguments: { invoiceId: result.invoiceId },
211
+ * },
212
+ * {
213
+ * type: "tool_result",
214
+ * toolCallId: "call_lookup",
215
+ * name: "lookupInvoice",
216
+ * content: { refundable: result.refundable },
217
+ * },
218
+ * { type: "message", role: "assistant", content: output },
219
+ * ],
211
220
  * usage: { provider: "openai", model: "gpt-4o-mini" },
212
221
  * };
213
222
  * },
@@ -226,11 +235,25 @@ declare function createHarness<TInput = unknown, TOutput extends JsonValue | und
226
235
  * ```ts
227
236
  * const run = normalizeHarnessRun("Refund invoice inv_123", {
228
237
  * output: { status: "approved" },
229
- * toolCalls: [{ name: "lookupInvoice", arguments: { invoiceId: "inv_123" } }],
238
+ * events: [
239
+ * { type: "message", role: "user", content: "Refund invoice inv_123" },
240
+ * {
241
+ * type: "tool_call",
242
+ * id: "call_lookup",
243
+ * name: "lookupInvoice",
244
+ * arguments: { invoiceId: "inv_123" },
245
+ * },
246
+ * {
247
+ * type: "tool_result",
248
+ * toolCallId: "call_lookup",
249
+ * name: "lookupInvoice",
250
+ * content: { refundable: true },
251
+ * },
252
+ * ],
230
253
  * usage: { provider: "openai", model: "gpt-4o-mini" },
231
254
  * });
232
255
  *
233
- * expect(toolCalls(run.session)).toHaveLength(1);
256
+ * expect(toolCalls(run)).toHaveLength(1);
234
257
  * ```
235
258
  */
236
259
  declare function normalizeHarnessRun<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined>(input: TInput, result: HarnessResultLike<TOutput>, context?: HarnessContext): HarnessRun<TOutput>;
@@ -259,13 +282,6 @@ declare function createGenAiUsageAttributes(usage: UsageSummary | undefined, opt
259
282
  "gen_ai.usage.output_tokens": number | undefined;
260
283
  "gen_ai.usage.reasoning.output_tokens": number | undefined;
261
284
  };
262
- /**
263
- * Converts normalized tool-call records into trace spans.
264
- *
265
- * Tool-call ids are preserved as GenAI attributes. Pass `spanIdPrefix` when the
266
- * spans belong to a known trace so span ids stay internally unique.
267
- */
268
- declare function createToolCallSpans(calls: ToolCallRecord[], options?: CreateToolCallSpansOptions): NormalizedSpan[];
269
285
  /**
270
286
  * Attaches a fallback run trace when a harness result does not already contain spans.
271
287
  *
@@ -313,4 +329,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
313
329
  /** Serializes an arbitrary thrown value into the normalized error shape. */
314
330
  declare function serializeError(error: unknown): Record<string, JsonValue>;
315
331
 
316
- export { type CreateHarnessOptions, type CreateHarnessRunArgs, type CreateToolCallSpansOptions, type EnsureRunTraceOptions, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type MaybePromise, type SimpleHarnessResult, type SimpleSpanEvent, type SimpleSpanRecord, type SimpleToolCallRecord, type SimpleTraceRecord, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, createToolCallSpans, ensureRunTrace, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, normalizeSpanAttributes, normalizeSpanError, resolveHarnessRunErrors, serializeError, toJsonValue };
332
+ export { type CreateHarnessOptions, type CreateHarnessRunArgs, type EnsureRunTraceOptions, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type MaybePromise, type SimpleHarnessResult, type SimpleSpanEvent, type SimpleSpanRecord, type SimpleTraceRecord, type SimpleTranscriptInput, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, ensureRunTrace, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, normalizeSpanAttributes, normalizeSpanError, resolveHarnessRunErrors, serializeError, toJsonValue };
package/dist/harness.js CHANGED
@@ -25,7 +25,6 @@ __export(harness_exports, {
25
25
  createFailedHarnessRun: () => createFailedHarnessRun,
26
26
  createGenAiUsageAttributes: () => createGenAiUsageAttributes,
27
27
  createHarness: () => createHarness,
28
- createToolCallSpans: () => createToolCallSpans,
29
28
  ensureRunTrace: () => ensureRunTrace,
30
29
  failedSpans: () => import_core2.failedSpans,
31
30
  getHarnessRunFromError: () => getHarnessRunFromError,
@@ -34,6 +33,7 @@ __export(harness_exports, {
34
33
  isNormalizedSession: () => isNormalizedSession,
35
34
  latestAssistantMessageContent: () => import_core2.latestAssistantMessageContent,
36
35
  messagesByRole: () => import_core2.messagesByRole,
36
+ messagesToTranscriptEvents: () => import_core2.messagesToTranscriptEvents,
37
37
  normalizeContent: () => normalizeContent,
38
38
  normalizeHarnessRun: () => normalizeHarnessRun,
39
39
  normalizeMetadata: () => normalizeMetadata,
@@ -177,14 +177,24 @@ function normalizeHarnessRun(input, result, context) {
177
177
  }
178
178
  return result;
179
179
  }
180
+ if ("toolCalls" in result) {
181
+ throw new TypeError(
182
+ 'createHarness results do not accept top-level toolCalls. Return ordered session events with type: "tool_call" and type: "tool_result" entries instead.'
183
+ );
184
+ }
180
185
  const output = result.output;
181
- const toolCalls3 = normalizeSimpleToolCalls(result.toolCalls);
182
186
  const usage = result.usage ?? {};
183
- const messages = result.messages ?? createDefaultSessionMessages({
184
- input,
185
- output,
186
- toolCalls: toolCalls3
187
- });
187
+ const events = normalizeTranscriptInput(result);
188
+ if (!events) {
189
+ throw new TypeError(
190
+ "createHarness results must include ordered events or messages. Return a full HarnessRun or a lightweight result with events/messages."
191
+ );
192
+ }
193
+ if (events.length === 0) {
194
+ throw new TypeError(
195
+ "createHarness results must include at least one transcript event. Return ordered events or message transport inputs that normalize into events."
196
+ );
197
+ }
188
198
  const metadata = result.metadata ? normalizeMetadata(result.metadata) : void 0;
189
199
  const artifacts = normalizeMergedArtifacts(
190
200
  context?.artifacts,
@@ -193,7 +203,7 @@ function normalizeHarnessRun(input, result, context) {
193
203
  const traces = normalizeSimpleTraces(result.traces);
194
204
  return {
195
205
  session: {
196
- messages,
206
+ events,
197
207
  ...usage.provider ? { provider: usage.provider } : {},
198
208
  ...usage.model ? { model: usage.model } : {},
199
209
  ...metadata ? { metadata } : {}
@@ -206,12 +216,24 @@ function normalizeHarnessRun(input, result, context) {
206
216
  errors: normalizeSimpleErrors(result.errors)
207
217
  };
208
218
  }
219
+ function normalizeTranscriptInput(result) {
220
+ if ("events" in result && Array.isArray(result.events)) {
221
+ return result.events.map((event) => import_core.TranscriptEventSchema.parse(event));
222
+ }
223
+ if ("messages" in result && Array.isArray(result.messages)) {
224
+ return (0, import_core.messagesToTranscriptEvents)(result.messages).map(
225
+ (event) => import_core.TranscriptEventSchema.parse(event)
226
+ );
227
+ }
228
+ return void 0;
229
+ }
209
230
  function createFailedHarnessRun(input, error, options = {}) {
210
231
  const artifacts = options.artifacts;
211
232
  return {
212
233
  session: {
213
- messages: [
234
+ events: [
214
235
  {
236
+ type: "message",
215
237
  role: "user",
216
238
  content: normalizeContent(input)
217
239
  }
@@ -222,67 +244,6 @@ function createFailedHarnessRun(input, error, options = {}) {
222
244
  errors: [serializeError(error)]
223
245
  };
224
246
  }
225
- function createDefaultSessionMessages({
226
- input,
227
- output,
228
- toolCalls: normalizedToolCalls
229
- }) {
230
- const messages = [
231
- {
232
- role: "user",
233
- content: normalizeContent(input)
234
- }
235
- ];
236
- if (output !== void 0 || normalizedToolCalls.length > 0) {
237
- messages.push({
238
- role: "assistant",
239
- ...output !== void 0 ? { content: normalizeContent(output) } : {},
240
- ...normalizedToolCalls.length > 0 ? { toolCalls: normalizedToolCalls } : {}
241
- });
242
- }
243
- return messages;
244
- }
245
- function normalizeSimpleToolCalls(calls) {
246
- return (calls ?? []).map((call) => {
247
- const {
248
- arguments: rawArguments,
249
- result: rawResult,
250
- error: rawError,
251
- metadata: rawMetadata,
252
- ...toolCall
253
- } = call;
254
- const args = normalizeToolCallArguments(rawArguments);
255
- const result = toJsonValue(rawResult);
256
- const error = normalizeToolCallError(rawError);
257
- const metadata = rawMetadata ? normalizeMetadata(rawMetadata) : void 0;
258
- return {
259
- ...toolCall,
260
- ...args ? { arguments: args } : {},
261
- ...result !== void 0 ? { result } : {},
262
- ...error ? { error } : {},
263
- ...metadata ? { metadata } : {}
264
- };
265
- });
266
- }
267
- function normalizeToolCallArguments(value) {
268
- if (value === void 0) {
269
- return void 0;
270
- }
271
- const normalized = toJsonValue(value);
272
- return normalized && typeof normalized === "object" && !Array.isArray(normalized) ? normalized : void 0;
273
- }
274
- function normalizeToolCallError(value) {
275
- if (value === void 0) {
276
- return void 0;
277
- }
278
- const serialized = serializeError(value);
279
- const { message, type, ...details } = serialized;
280
- return {
281
- ...details,
282
- message: typeof message === "string" ? message : String(message),
283
- ...typeof type === "string" ? { type } : {}
284
- };
285
- }
286
247
  function normalizeMergedArtifacts(contextArtifacts, resultArtifacts) {
287
248
  const artifacts = {
288
249
  ...contextArtifacts ?? {},
@@ -408,32 +369,6 @@ function createGenAiUsageAttributes(usage, options = {}) {
408
369
  "gen_ai.usage.reasoning.output_tokens": usage?.reasoningTokens
409
370
  };
410
371
  }
411
- function createToolCallSpans(calls, options = {}) {
412
- return calls.map((call, index) => {
413
- const spanError = call.error ? normalizeSpanError(call.error) : void 0;
414
- const spanId = options.spanIdPrefix ? `${options.spanIdPrefix}:${index + 1}` : call.id;
415
- return {
416
- ...spanId ? { id: spanId } : {},
417
- ...options.traceId ? { traceId: options.traceId } : {},
418
- ...options.parentId ? { parentId: options.parentId } : {},
419
- name: call.name,
420
- kind: "tool",
421
- ...call.startedAt ? { startedAt: call.startedAt } : {},
422
- ...call.finishedAt ? { finishedAt: call.finishedAt } : {},
423
- ...call.durationMs !== void 0 ? { durationMs: call.durationMs } : {},
424
- status: spanError ? "error" : "ok",
425
- ...spanError ? { error: spanError } : {},
426
- attributes: normalizeSpanAttributes({
427
- "gen_ai.operation.name": "execute_tool",
428
- "gen_ai.tool.name": call.name,
429
- "gen_ai.tool.type": "function",
430
- ...call.id ? { "gen_ai.tool.call.id": call.id } : {},
431
- ...call.arguments !== void 0 ? { "gen_ai.tool.call.arguments": call.arguments } : {},
432
- ...call.result !== void 0 ? { "gen_ai.tool.call.result": call.result } : {}
433
- })
434
- };
435
- });
436
- }
437
372
  function ensureRunTrace(run, options) {
438
373
  if ((0, import_core.spans)(run).length > 0) {
439
374
  return void 0;
@@ -458,11 +393,6 @@ function ensureRunTrace(run, options) {
458
393
  ...createGenAiUsageAttributes(run.usage)
459
394
  })
460
395
  };
461
- const toolSpans = createToolCallSpans((0, import_core.toolCalls)(run.session), {
462
- traceId,
463
- parentId: rootSpanId,
464
- spanIdPrefix: `${traceId}:tool`
465
- });
466
396
  const trace = {
467
397
  id: traceId,
468
398
  name: options.name,
@@ -470,7 +400,7 @@ function ensureRunTrace(run, options) {
470
400
  finishedAt: options.finishedAt.toISOString(),
471
401
  durationMs,
472
402
  ...options.source ? { metadata: { source: options.source } } : {},
473
- spans: [runSpan, ...toolSpans]
403
+ spans: [runSpan]
474
404
  };
475
405
  run.traces = [trace];
476
406
  return trace;
@@ -500,7 +430,7 @@ function isHarnessRun(value) {
500
430
  return isNormalizedSession(candidate.session) && Boolean(candidate.usage) && typeof candidate.usage === "object" && !Array.isArray(candidate.usage) && Array.isArray(candidate.errors);
501
431
  }
502
432
  function isNormalizedSession(value) {
503
- return Boolean(value) && typeof value === "object" && value !== null && "messages" in value && Array.isArray(value.messages);
433
+ return import_core.NormalizedSessionSchema.safeParse(value).success;
504
434
  }
505
435
  function resolveHarnessRunErrors(result) {
506
436
  if (result && typeof result === "object" && Array.isArray(result.errors)) {
@@ -527,7 +457,6 @@ function serializeError(error) {
527
457
  createFailedHarnessRun,
528
458
  createGenAiUsageAttributes,
529
459
  createHarness,
530
- createToolCallSpans,
531
460
  ensureRunTrace,
532
461
  failedSpans,
533
462
  getHarnessRunFromError,
@@ -536,6 +465,7 @@ function serializeError(error) {
536
465
  isNormalizedSession,
537
466
  latestAssistantMessageContent,
538
467
  messagesByRole,
468
+ messagesToTranscriptEvents,
539
469
  normalizeContent,
540
470
  normalizeHarnessRun,
541
471
  normalizeMetadata,