npm - vitest-evals - Versions diffs - 0.13.0 → 0.14.0 - Mend

vitest-evals 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56)

package/README.md +57 -10
package/dist/harness.d.mts +56 -40
package/dist/harness.d.ts +56 -40
package/dist/harness.js +34 -104
package/dist/harness.js.map +1 -1
package/dist/harness.mjs +37 -104
package/dist/harness.mjs.map +1 -1
package/dist/index.d.mts +6 -6
package/dist/index.d.ts +6 -6
package/dist/index.js +56 -117
package/dist/index.js.map +1 -1
package/dist/index.mjs +59 -117
package/dist/index.mjs.map +1 -1
package/dist/internal/scoring.d.mts +2 -2
package/dist/internal/scoring.d.ts +2 -2
package/dist/internal/scoring.js.map +1 -1
package/dist/internal/toolCallScorer.js.map +1 -1
package/dist/internal/toolCallScorer.mjs +4 -1
package/dist/internal/toolCallScorer.mjs.map +1 -1
package/dist/judges/factualityJudge.js.map +1 -1
package/dist/judges/factualityJudge.mjs +4 -1
package/dist/judges/factualityJudge.mjs.map +1 -1
package/dist/judges/index.js +47 -110
package/dist/judges/index.js.map +1 -1
package/dist/judges/index.mjs +51 -111
package/dist/judges/index.mjs.map +1 -1
package/dist/judges/judgeHarness.js +47 -110
package/dist/judges/judgeHarness.js.map +1 -1
package/dist/judges/judgeHarness.mjs +51 -111
package/dist/judges/judgeHarness.mjs.map +1 -1
package/dist/judges/toolCallJudge.js.map +1 -1
package/dist/judges/toolCallJudge.mjs +4 -1
package/dist/judges/toolCallJudge.mjs.map +1 -1
package/dist/judges/types.d.mts +2 -2
package/dist/judges/types.d.ts +2 -2
package/dist/judges/types.js.map +1 -1
package/dist/legacy/scorers/index.js.map +1 -1
package/dist/legacy/scorers/index.mjs +4 -1
package/dist/legacy/scorers/index.mjs.map +1 -1
package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
package/dist/legacy/scorers/toolCallScorer.mjs +4 -1
package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
package/dist/legacy/shared.d.mts +1 -8
package/dist/legacy/shared.d.ts +1 -8
package/dist/legacy/shared.js.map +1 -1
package/dist/legacy.js +15 -1
package/dist/legacy.js.map +1 -1
package/dist/legacy.mjs +19 -2
package/dist/legacy.mjs.map +1 -1
package/dist/reporter.d.mts +0 -3
package/dist/reporter.d.ts +0 -3
package/dist/reporter.js +10 -40
package/dist/reporter.js.map +1 -1
package/dist/reporter.mjs +14 -41
package/dist/reporter.mjs.map +1 -1
package/package.json +3 -3

package/README.md CHANGED

@@ -33,12 +33,14 @@ workflow.
 - `run(input)` executes the harness explicitly and returns a normalized
   `HarnessRun`
 - the returned `result.output` is the app-facing value you assert on directly
-- the returned `result.session` is the canonical JSON-serializable transcript for
-  reporting, replay, tool assertions, and judges
-- the returned `result.traces` contains JSON-serializable operation spans; the
-  first-party harnesses attach run, model, and tool spans automatically, while
-  `createHarness(...)` attaches fallback run and tool spans for custom harnesses
-  that do not return traces themselves. Span attributes include typed
+- helper assertions usually read the returned `result`, for example
+  `toolCalls(result)` or `assistantMessages(result)`
+- `result.session` is the canonical JSON-serializable transcript for reporting,
+  replay, tool assertions, and judges
+- `result.traces` contains JSON-serializable operation spans; first-party
+  harnesses attach native spans when provider/runtime data is available, while
+  `createHarness(...)` attaches a fallback run span for custom harnesses that do
+  not return traces themselves. Span attributes include typed
   OpenTelemetry GenAI semantic keys while still allowing provider-specific
   metadata
 - scenario-specific judge criteria should live in `input` or explicit matcher
@@ -91,7 +93,7 @@ describeEval(
       const result = await run("Refund invoice inv_123");
       expect(result.output).toMatchObject({ status: "approved" });
-      expect(toolCalls(result.session).map((call) => call.name)).toEqual([
+      expect(toolCalls(result).map((call) => call.name)).toEqual([
         "lookupInvoice",
         "createRefund",
       ]);
@@ -245,6 +247,18 @@ const appHarness = createHarness<AppEvalInput, AppOutput>({
     });
     return {
+      events: [
+        {
+          type: "message",
+          role: "user",
+          content: input.events.map((event) => event.type).join(", "),
+        },
+        {
+          type: "message",
+          role: "assistant",
+          content: result.replies.map((reply) => reply.text).join("\n"),
+        },
+      ],
       output: {
         replies: result.replies,
         sideEffects: result.sideEffects,
@@ -317,9 +331,42 @@ Use `Harness.run(...)` for the application under test. Calling
 so reserve that for judges that intentionally need a second execution. Put
 criteria on `input` when they are part of the scenario itself; pass
 case-specific judge criteria through matcher options, or configure suite-wide
-criteria on the judge instance. `createHarness(...)` builds a default
-user/assistant session from `input` and typed `output`; return a full
-`HarnessRun` only when you need exact session control.
+criteria on the judge instance.
+`createHarness(...)` lightweight results must return at least one normalized
+event, either directly as `events` or from strict camelCase `messages`. Stored
+run metadata always uses `session.events`, a flat ordered transcript:
+```ts
+events: [
+  { type: "message", role: "user", content: input },
+  {
+    type: "tool_call",
+    id: "call_lookup",
+    name: "lookupInvoice",
+    arguments: { invoiceId: "inv_123" },
+  },
+  {
+    type: "tool_result",
+    toolCallId: "call_lookup",
+    name: "lookupInvoice",
+    content: { refundable: true },
+  },
+  { type: "message", role: "assistant", content: output },
+];
+```
+For apps that already produce message-shaped data, returning `messages` is also
+accepted; the harness normalizer converts assistant `toolCalls`, `role: "tool"`
+results keyed by `toolCallId`, and AI SDK `tool-call`/`tool-result` content
+parts into the same flat `events` shape. Provider wire formats such as OpenAI
+snake_case fields should be mapped by the harness before they reach this
+boundary. Other provider content blocks or item streams should adapt those
+records into `events` directly. Assertions and judges should read normalized
+projections through helpers such as `toolCalls(result)`, `userMessages(result)`,
+`assistantMessages(result)`, `toolMessages(result)`, and `spans(result)` instead
+of manually walking provider payloads. Return a full `HarnessRun` only when you
+need exact canonical `session.events`, trace, or usage control.
 Provider setup and rubric parsing stay in your judge. The core
 package only requires the judge to return a `JudgeResult` with a score and

package/dist/harness.d.mts CHANGED

@@ -1,15 +1,6 @@
-import { JsonValue, HarnessRun, GenAiOperationName, ToolCallRecord, NormalizedSpanEvent, NormalizedSpan, NormalizedTrace, NormalizedMessage, UsageSummary, TimingSummary, NormalizedSpanAttributes, HarnessRunError, NormalizedSession } from '@vitest-evals/core';
-export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue, NormalizedMessage, NormalizedSession, NormalizedSpan, NormalizedSpanAttributeKey, NormalizedSpanAttributes, NormalizedSpanEvent, NormalizedTrace, OpenTelemetrySemanticAttributeKey, OpenTelemetrySemanticAttributes, TimingSummary, ToolCallRecord, UsageSummary, assistantMessages, failedSpans, latestAssistantMessageContent, messagesByRole, spans, spansByKind, systemMessages, toolCalls, toolMessages, userMessages } from '@vitest-evals/core';
+import { JsonValue, HarnessRun, GenAiOperationName, NormalizedSpanEvent, NormalizedSpan, NormalizedTrace, TranscriptEvent, TranscriptMessageInput, UsageSummary, TimingSummary, NormalizedSpanAttributes, HarnessRunError, NormalizedSession } from '@vitest-evals/core';
+export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue, NormalizedError, NormalizedSession, NormalizedSpan, NormalizedSpanAttributeKey, NormalizedSpanAttributes, NormalizedSpanEvent, NormalizedTrace, OpenTelemetrySemanticAttributeKey, OpenTelemetrySemanticAttributes, TimingSummary, ToolCall, TranscriptEvent, TranscriptMessageContentPart, TranscriptMessageEvent, TranscriptMessageInput, TranscriptMessageTextPart, TranscriptMessageToolCall, TranscriptMessageToolCallPart, TranscriptMessageToolResultPart, TranscriptToolCallEvent, TranscriptToolResultEvent, UsageSummary, assistantMessages, failedSpans, latestAssistantMessageContent, messagesByRole, messagesToTranscriptEvents, spans, spansByKind, systemMessages, toolCalls, toolMessages, userMessages } from '@vitest-evals/core';
-/** Options for converting normalized tool calls into trace spans. */
-type CreateToolCallSpansOptions = {
-    /** Trace id to attach to each generated tool span. */
-    traceId?: string;
-    /** Parent span id to attach to each generated tool span. */
-    parentId?: string;
-    /** Prefix used to create internal span ids instead of reusing tool-call ids. */
-    spanIdPrefix?: string;
-};
 /** Options for attaching a fallback run trace to a harness result. */
 type EnsureRunTraceOptions = {
     /** Human-readable run or harness name. */
@@ -44,7 +35,7 @@ type HarnessMetadata = Record<string, unknown>;
  *
  *     return {
  *       output: undefined,
- *       session: { messages: [{ role: "user", content: input }] },
+ *       session: { events: [{ type: "message", role: "user", content: input }] },
  *       usage: {},
  *       errors: [],
  *     };
@@ -81,17 +72,6 @@ type Harness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue
 };
 /** Value or promise accepted by lightweight harness callbacks. */
 type MaybePromise<T> = T | Promise<T>;
-/** Lightweight tool-call record accepted by `createHarness(...)` results. */
-type SimpleToolCallRecord = Omit<ToolCallRecord, "arguments" | "result" | "error" | "metadata"> & {
-    /** Raw tool arguments accepted by `createHarness(...)` before normalization. */
-    arguments?: unknown;
-    /** Raw tool result accepted by `createHarness(...)` before normalization. */
-    result?: unknown;
-    /** Raw tool error accepted by `createHarness(...)` before normalization. */
-    error?: unknown;
-    /** Raw tool metadata accepted by `createHarness(...)` before normalization. */
-    metadata?: Record<string, unknown>;
-};
 /** Lightweight span event accepted by `createHarness(...)` results. */
 type SimpleSpanEvent = Omit<NormalizedSpanEvent, "attributes"> & {
     /** Raw event attributes accepted by `createHarness(...)` before normalization. */
@@ -113,6 +93,16 @@ type SimpleTraceRecord = Omit<NormalizedTrace, "metadata" | "spans"> & {
     /** Lightweight spans to normalize into the trace. */
     spans: SimpleSpanRecord[];
 };
+/** Lightweight transcript input accepted by `createHarness(...)` results. */
+type SimpleTranscriptInput = {
+    /** Ordered normalized transcript events for the application run. */
+    events: TranscriptEvent[];
+    messages?: never;
+} | {
+    /** Strict camelCase message transport normalized into transcript events. */
+    messages: TranscriptMessageInput[];
+    events?: never;
+};
 /**
  * Lightweight result shape normalized by `createHarness(...)`.
  *
@@ -120,16 +110,15 @@ type SimpleTraceRecord = Omit<NormalizedTrace, "metadata" | "spans"> & {
  * ```ts
  * const result: SimpleHarnessResult<{ status: "approved" }> = {
  *   output: { status: "approved" },
- *   toolCalls: [{ name: "lookupInvoice", arguments: { invoiceId: "inv_123" } }],
+ *   events: [
+ *     { type: "message", role: "user", content: "Refund invoice inv_123" },
+ *     { type: "message", role: "assistant", content: { status: "approved" } },
+ *   ],
  *   usage: { totalTokens: 260 },
  * };
  * ```
  */
-type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
-    /** Pre-normalized transcript messages. When omitted, a default user/assistant transcript is created. */
-    messages?: NormalizedMessage[];
-    /** Lightweight tool-call records to normalize into the session. */
-    toolCalls?: SimpleToolCallRecord[];
+type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & SimpleTranscriptInput & {
     /** Usage summary to attach to the run. */
     usage?: UsageSummary;
     /** Timing summary to attach to the run. */
@@ -159,12 +148,17 @@ type CreateHarnessRunArgs<TInput> = {
 /**
  * Options for creating a lightweight custom application harness.
  *
+ * Prefer this helper for custom harnesses. Implement `Harness` directly only
+ * when the callback already returns a full `HarnessRun` with canonical
+ * `session.events`.
+ *
  * @example
  * ```ts
  * const options: CreateHarnessOptions<string, { status: "approved" }> = {
  *   name: "refund-agent",
  *   run: async ({ input }) => ({
  *     output: await classifyRefund(input),
+ *     events: [{ type: "message", role: "user", content: input }],
  *   }),
  * };
  * ```
@@ -207,7 +201,22 @@ declare function normalizeContent(value: unknown): JsonValue;
  *
  *     return {
  *       output,
- *       toolCalls: result.toolCalls,
+ *       events: [
+ *         { type: "message", role: "user", content: input },
+ *         {
+ *           type: "tool_call",
+ *           id: "call_lookup",
+ *           name: "lookupInvoice",
+ *           arguments: { invoiceId: result.invoiceId },
+ *         },
+ *         {
+ *           type: "tool_result",
+ *           toolCallId: "call_lookup",
+ *           name: "lookupInvoice",
+ *           content: { refundable: result.refundable },
+ *         },
+ *         { type: "message", role: "assistant", content: output },
+ *       ],
  *       usage: { provider: "openai", model: "gpt-4o-mini" },
  *     };
  *   },
@@ -226,11 +235,25 @@ declare function createHarness<TInput = unknown, TOutput extends JsonValue | und
  * ```ts
  * const run = normalizeHarnessRun("Refund invoice inv_123", {
  *   output: { status: "approved" },
- *   toolCalls: [{ name: "lookupInvoice", arguments: { invoiceId: "inv_123" } }],
+ *   events: [
+ *     { type: "message", role: "user", content: "Refund invoice inv_123" },
+ *     {
+ *       type: "tool_call",
+ *       id: "call_lookup",
+ *       name: "lookupInvoice",
+ *       arguments: { invoiceId: "inv_123" },
+ *     },
+ *     {
+ *       type: "tool_result",
+ *       toolCallId: "call_lookup",
+ *       name: "lookupInvoice",
+ *       content: { refundable: true },
+ *     },
+ *   ],
  *   usage: { provider: "openai", model: "gpt-4o-mini" },
  * });
  *
- * expect(toolCalls(run.session)).toHaveLength(1);
+ * expect(toolCalls(run)).toHaveLength(1);
  * ```
  */
 declare function normalizeHarnessRun<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined>(input: TInput, result: HarnessResultLike<TOutput>, context?: HarnessContext): HarnessRun<TOutput>;
@@ -259,13 +282,6 @@ declare function createGenAiUsageAttributes(usage: UsageSummary | undefined, opt
     "gen_ai.usage.output_tokens": number | undefined;
     "gen_ai.usage.reasoning.output_tokens": number | undefined;
 };
-/**
- * Converts normalized tool-call records into trace spans.
- *
- * Tool-call ids are preserved as GenAI attributes. Pass `spanIdPrefix` when the
- * spans belong to a known trace so span ids stay internally unique.
- */
-declare function createToolCallSpans(calls: ToolCallRecord[], options?: CreateToolCallSpansOptions): NormalizedSpan[];
 /**
  * Attaches a fallback run trace when a harness result does not already contain spans.
  *
@@ -313,4 +329,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
 /** Serializes an arbitrary thrown value into the normalized error shape. */
 declare function serializeError(error: unknown): Record<string, JsonValue>;
-export { type CreateHarnessOptions, type CreateHarnessRunArgs, type CreateToolCallSpansOptions, type EnsureRunTraceOptions, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type MaybePromise, type SimpleHarnessResult, type SimpleSpanEvent, type SimpleSpanRecord, type SimpleToolCallRecord, type SimpleTraceRecord, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, createToolCallSpans, ensureRunTrace, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, normalizeSpanAttributes, normalizeSpanError, resolveHarnessRunErrors, serializeError, toJsonValue };
+export { type CreateHarnessOptions, type CreateHarnessRunArgs, type EnsureRunTraceOptions, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type MaybePromise, type SimpleHarnessResult, type SimpleSpanEvent, type SimpleSpanRecord, type SimpleTraceRecord, type SimpleTranscriptInput, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, ensureRunTrace, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, normalizeSpanAttributes, normalizeSpanError, resolveHarnessRunErrors, serializeError, toJsonValue };

package/dist/harness.d.ts CHANGED

@@ -1,15 +1,6 @@
-import { JsonValue, HarnessRun, GenAiOperationName, ToolCallRecord, NormalizedSpanEvent, NormalizedSpan, NormalizedTrace, NormalizedMessage, UsageSummary, TimingSummary, NormalizedSpanAttributes, HarnessRunError, NormalizedSession } from '@vitest-evals/core';
-export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue, NormalizedMessage, NormalizedSession, NormalizedSpan, NormalizedSpanAttributeKey, NormalizedSpanAttributes, NormalizedSpanEvent, NormalizedTrace, OpenTelemetrySemanticAttributeKey, OpenTelemetrySemanticAttributes, TimingSummary, ToolCallRecord, UsageSummary, assistantMessages, failedSpans, latestAssistantMessageContent, messagesByRole, spans, spansByKind, systemMessages, toolCalls, toolMessages, userMessages } from '@vitest-evals/core';
+import { JsonValue, HarnessRun, GenAiOperationName, NormalizedSpanEvent, NormalizedSpan, NormalizedTrace, TranscriptEvent, TranscriptMessageInput, UsageSummary, TimingSummary, NormalizedSpanAttributes, HarnessRunError, NormalizedSession } from '@vitest-evals/core';
+export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue, NormalizedError, NormalizedSession, NormalizedSpan, NormalizedSpanAttributeKey, NormalizedSpanAttributes, NormalizedSpanEvent, NormalizedTrace, OpenTelemetrySemanticAttributeKey, OpenTelemetrySemanticAttributes, TimingSummary, ToolCall, TranscriptEvent, TranscriptMessageContentPart, TranscriptMessageEvent, TranscriptMessageInput, TranscriptMessageTextPart, TranscriptMessageToolCall, TranscriptMessageToolCallPart, TranscriptMessageToolResultPart, TranscriptToolCallEvent, TranscriptToolResultEvent, UsageSummary, assistantMessages, failedSpans, latestAssistantMessageContent, messagesByRole, messagesToTranscriptEvents, spans, spansByKind, systemMessages, toolCalls, toolMessages, userMessages } from '@vitest-evals/core';
-/** Options for converting normalized tool calls into trace spans. */
-type CreateToolCallSpansOptions = {
-    /** Trace id to attach to each generated tool span. */
-    traceId?: string;
-    /** Parent span id to attach to each generated tool span. */
-    parentId?: string;
-    /** Prefix used to create internal span ids instead of reusing tool-call ids. */
-    spanIdPrefix?: string;
-};
 /** Options for attaching a fallback run trace to a harness result. */
 type EnsureRunTraceOptions = {
     /** Human-readable run or harness name. */
@@ -44,7 +35,7 @@ type HarnessMetadata = Record<string, unknown>;
  *
  *     return {
  *       output: undefined,
- *       session: { messages: [{ role: "user", content: input }] },
+ *       session: { events: [{ type: "message", role: "user", content: input }] },
  *       usage: {},
  *       errors: [],
  *     };
@@ -81,17 +72,6 @@ type Harness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue
 };
 /** Value or promise accepted by lightweight harness callbacks. */
 type MaybePromise<T> = T | Promise<T>;
-/** Lightweight tool-call record accepted by `createHarness(...)` results. */
-type SimpleToolCallRecord = Omit<ToolCallRecord, "arguments" | "result" | "error" | "metadata"> & {
-    /** Raw tool arguments accepted by `createHarness(...)` before normalization. */
-    arguments?: unknown;
-    /** Raw tool result accepted by `createHarness(...)` before normalization. */
-    result?: unknown;
-    /** Raw tool error accepted by `createHarness(...)` before normalization. */
-    error?: unknown;
-    /** Raw tool metadata accepted by `createHarness(...)` before normalization. */
-    metadata?: Record<string, unknown>;
-};
 /** Lightweight span event accepted by `createHarness(...)` results. */
 type SimpleSpanEvent = Omit<NormalizedSpanEvent, "attributes"> & {
     /** Raw event attributes accepted by `createHarness(...)` before normalization. */
@@ -113,6 +93,16 @@ type SimpleTraceRecord = Omit<NormalizedTrace, "metadata" | "spans"> & {
     /** Lightweight spans to normalize into the trace. */
     spans: SimpleSpanRecord[];
 };
+/** Lightweight transcript input accepted by `createHarness(...)` results. */
+type SimpleTranscriptInput = {
+    /** Ordered normalized transcript events for the application run. */
+    events: TranscriptEvent[];
+    messages?: never;
+} | {
+    /** Strict camelCase message transport normalized into transcript events. */
+    messages: TranscriptMessageInput[];
+    events?: never;
+};
 /**
  * Lightweight result shape normalized by `createHarness(...)`.
  *
@@ -120,16 +110,15 @@ type SimpleTraceRecord = Omit<NormalizedTrace, "metadata" | "spans"> & {
  * ```ts
  * const result: SimpleHarnessResult<{ status: "approved" }> = {
  *   output: { status: "approved" },
- *   toolCalls: [{ name: "lookupInvoice", arguments: { invoiceId: "inv_123" } }],
+ *   events: [
+ *     { type: "message", role: "user", content: "Refund invoice inv_123" },
+ *     { type: "message", role: "assistant", content: { status: "approved" } },
+ *   ],
  *   usage: { totalTokens: 260 },
  * };
  * ```
  */
-type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
-    /** Pre-normalized transcript messages. When omitted, a default user/assistant transcript is created. */
-    messages?: NormalizedMessage[];
-    /** Lightweight tool-call records to normalize into the session. */
-    toolCalls?: SimpleToolCallRecord[];
+type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & SimpleTranscriptInput & {
     /** Usage summary to attach to the run. */
     usage?: UsageSummary;
     /** Timing summary to attach to the run. */
@@ -159,12 +148,17 @@ type CreateHarnessRunArgs<TInput> = {
 /**
  * Options for creating a lightweight custom application harness.
  *
+ * Prefer this helper for custom harnesses. Implement `Harness` directly only
+ * when the callback already returns a full `HarnessRun` with canonical
+ * `session.events`.
+ *
  * @example
  * ```ts
  * const options: CreateHarnessOptions<string, { status: "approved" }> = {
  *   name: "refund-agent",
  *   run: async ({ input }) => ({
  *     output: await classifyRefund(input),
+ *     events: [{ type: "message", role: "user", content: input }],
  *   }),
  * };
  * ```
@@ -207,7 +201,22 @@ declare function normalizeContent(value: unknown): JsonValue;
  *
  *     return {
  *       output,
- *       toolCalls: result.toolCalls,
+ *       events: [
+ *         { type: "message", role: "user", content: input },
+ *         {
+ *           type: "tool_call",
+ *           id: "call_lookup",
+ *           name: "lookupInvoice",
+ *           arguments: { invoiceId: result.invoiceId },
+ *         },
+ *         {
+ *           type: "tool_result",
+ *           toolCallId: "call_lookup",
+ *           name: "lookupInvoice",
+ *           content: { refundable: result.refundable },
+ *         },
+ *         { type: "message", role: "assistant", content: output },
+ *       ],
  *       usage: { provider: "openai", model: "gpt-4o-mini" },
  *     };
  *   },
@@ -226,11 +235,25 @@ declare function createHarness<TInput = unknown, TOutput extends JsonValue | und
  * ```ts
  * const run = normalizeHarnessRun("Refund invoice inv_123", {
  *   output: { status: "approved" },
- *   toolCalls: [{ name: "lookupInvoice", arguments: { invoiceId: "inv_123" } }],
+ *   events: [
+ *     { type: "message", role: "user", content: "Refund invoice inv_123" },
+ *     {
+ *       type: "tool_call",
+ *       id: "call_lookup",
+ *       name: "lookupInvoice",
+ *       arguments: { invoiceId: "inv_123" },
+ *     },
+ *     {
+ *       type: "tool_result",
+ *       toolCallId: "call_lookup",
+ *       name: "lookupInvoice",
+ *       content: { refundable: true },
+ *     },
+ *   ],
  *   usage: { provider: "openai", model: "gpt-4o-mini" },
  * });
  *
- * expect(toolCalls(run.session)).toHaveLength(1);
+ * expect(toolCalls(run)).toHaveLength(1);
  * ```
  */
 declare function normalizeHarnessRun<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined>(input: TInput, result: HarnessResultLike<TOutput>, context?: HarnessContext): HarnessRun<TOutput>;
@@ -259,13 +282,6 @@ declare function createGenAiUsageAttributes(usage: UsageSummary | undefined, opt
     "gen_ai.usage.output_tokens": number | undefined;
     "gen_ai.usage.reasoning.output_tokens": number | undefined;
 };
-/**
- * Converts normalized tool-call records into trace spans.
- *
- * Tool-call ids are preserved as GenAI attributes. Pass `spanIdPrefix` when the
- * spans belong to a known trace so span ids stay internally unique.
- */
-declare function createToolCallSpans(calls: ToolCallRecord[], options?: CreateToolCallSpansOptions): NormalizedSpan[];
 /**
  * Attaches a fallback run trace when a harness result does not already contain spans.
  *
@@ -313,4 +329,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
 /** Serializes an arbitrary thrown value into the normalized error shape. */
 declare function serializeError(error: unknown): Record<string, JsonValue>;
-export { type CreateHarnessOptions, type CreateHarnessRunArgs, type CreateToolCallSpansOptions, type EnsureRunTraceOptions, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type MaybePromise, type SimpleHarnessResult, type SimpleSpanEvent, type SimpleSpanRecord, type SimpleToolCallRecord, type SimpleTraceRecord, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, createToolCallSpans, ensureRunTrace, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, normalizeSpanAttributes, normalizeSpanError, resolveHarnessRunErrors, serializeError, toJsonValue };
+export { type CreateHarnessOptions, type CreateHarnessRunArgs, type EnsureRunTraceOptions, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type MaybePromise, type SimpleHarnessResult, type SimpleSpanEvent, type SimpleSpanRecord, type SimpleTraceRecord, type SimpleTranscriptInput, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, ensureRunTrace, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, normalizeSpanAttributes, normalizeSpanError, resolveHarnessRunErrors, serializeError, toJsonValue };

package/dist/harness.js CHANGED

@@ -25,7 +25,6 @@ __export(harness_exports, {
   createFailedHarnessRun: () => createFailedHarnessRun,
   createGenAiUsageAttributes: () => createGenAiUsageAttributes,
   createHarness: () => createHarness,
-  createToolCallSpans: () => createToolCallSpans,
   ensureRunTrace: () => ensureRunTrace,
   failedSpans: () => import_core2.failedSpans,
   getHarnessRunFromError: () => getHarnessRunFromError,
@@ -34,6 +33,7 @@ __export(harness_exports, {
   isNormalizedSession: () => isNormalizedSession,
   latestAssistantMessageContent: () => import_core2.latestAssistantMessageContent,
   messagesByRole: () => import_core2.messagesByRole,
+  messagesToTranscriptEvents: () => import_core2.messagesToTranscriptEvents,
   normalizeContent: () => normalizeContent,
   normalizeHarnessRun: () => normalizeHarnessRun,
   normalizeMetadata: () => normalizeMetadata,
@@ -177,14 +177,24 @@ function normalizeHarnessRun(input, result, context) {
     }
     return result;
   }
+  if ("toolCalls" in result) {
+    throw new TypeError(
+      'createHarness results do not accept top-level toolCalls. Return ordered session events with type: "tool_call" and type: "tool_result" entries instead.'
+    );
+  }
   const output = result.output;
-  const toolCalls3 = normalizeSimpleToolCalls(result.toolCalls);
   const usage = result.usage ?? {};
-  const messages = result.messages ?? createDefaultSessionMessages({
-    input,
-    output,
-    toolCalls: toolCalls3
-  });
+  const events = normalizeTranscriptInput(result);
+  if (!events) {
+    throw new TypeError(
+      "createHarness results must include ordered events or messages. Return a full HarnessRun or a lightweight result with events/messages."
+    );
+  }
+  if (events.length === 0) {
+    throw new TypeError(
+      "createHarness results must include at least one transcript event. Return ordered events or message transport inputs that normalize into events."
+    );
+  }
   const metadata = result.metadata ? normalizeMetadata(result.metadata) : void 0;
   const artifacts = normalizeMergedArtifacts(
     context?.artifacts,
@@ -193,7 +203,7 @@ function normalizeHarnessRun(input, result, context) {
   const traces = normalizeSimpleTraces(result.traces);
   return {
     session: {
-      messages,
+      events,
       ...usage.provider ? { provider: usage.provider } : {},
       ...usage.model ? { model: usage.model } : {},
       ...metadata ? { metadata } : {}
@@ -206,12 +216,24 @@ function normalizeHarnessRun(input, result, context) {
     errors: normalizeSimpleErrors(result.errors)
   };
 }
+function normalizeTranscriptInput(result) {
+  if ("events" in result && Array.isArray(result.events)) {
+    return result.events.map((event) => import_core.TranscriptEventSchema.parse(event));
+  }
+  if ("messages" in result && Array.isArray(result.messages)) {
+    return (0, import_core.messagesToTranscriptEvents)(result.messages).map(
+      (event) => import_core.TranscriptEventSchema.parse(event)
+    );
+  }
+  return void 0;
+}
 function createFailedHarnessRun(input, error, options = {}) {
   const artifacts = options.artifacts;
   return {
     session: {
-      messages: [
+      events: [
         {
+          type: "message",
           role: "user",
           content: normalizeContent(input)
         }
@@ -222,67 +244,6 @@ function createFailedHarnessRun(input, error, options = {}) {
     errors: [serializeError(error)]
   };
 }
-function createDefaultSessionMessages({
-  input,
-  output,
-  toolCalls: normalizedToolCalls
-}) {
-  const messages = [
-    {
-      role: "user",
-      content: normalizeContent(input)
-    }
-  ];
-  if (output !== void 0 || normalizedToolCalls.length > 0) {
-    messages.push({
-      role: "assistant",
-      ...output !== void 0 ? { content: normalizeContent(output) } : {},
-      ...normalizedToolCalls.length > 0 ? { toolCalls: normalizedToolCalls } : {}
-    });
-  }
-  return messages;
-}
-function normalizeSimpleToolCalls(calls) {
-  return (calls ?? []).map((call) => {
-    const {
-      arguments: rawArguments,
-      result: rawResult,
-      error: rawError,
-      metadata: rawMetadata,
-      ...toolCall
-    } = call;
-    const args = normalizeToolCallArguments(rawArguments);
-    const result = toJsonValue(rawResult);
-    const error = normalizeToolCallError(rawError);
-    const metadata = rawMetadata ? normalizeMetadata(rawMetadata) : void 0;
-    return {
-      ...toolCall,
-      ...args ? { arguments: args } : {},
-      ...result !== void 0 ? { result } : {},
-      ...error ? { error } : {},
-      ...metadata ? { metadata } : {}
-    };
-  });
-}
-function normalizeToolCallArguments(value) {
-  if (value === void 0) {
-    return void 0;
-  }
-  const normalized = toJsonValue(value);
-  return normalized && typeof normalized === "object" && !Array.isArray(normalized) ? normalized : void 0;
-}
-function normalizeToolCallError(value) {
-  if (value === void 0) {
-    return void 0;
-  }
-  const serialized = serializeError(value);
-  const { message, type, ...details } = serialized;
-  return {
-    ...details,
-    message: typeof message === "string" ? message : String(message),
-    ...typeof type === "string" ? { type } : {}
-  };
-}
 function normalizeMergedArtifacts(contextArtifacts, resultArtifacts) {
   const artifacts = {
     ...contextArtifacts ?? {},
@@ -408,32 +369,6 @@ function createGenAiUsageAttributes(usage, options = {}) {
     "gen_ai.usage.reasoning.output_tokens": usage?.reasoningTokens
   };
 }
-function createToolCallSpans(calls, options = {}) {
-  return calls.map((call, index) => {
-    const spanError = call.error ? normalizeSpanError(call.error) : void 0;
-    const spanId = options.spanIdPrefix ? `${options.spanIdPrefix}:${index + 1}` : call.id;
-    return {
-      ...spanId ? { id: spanId } : {},
-      ...options.traceId ? { traceId: options.traceId } : {},
-      ...options.parentId ? { parentId: options.parentId } : {},
-      name: call.name,
-      kind: "tool",
-      ...call.startedAt ? { startedAt: call.startedAt } : {},
-      ...call.finishedAt ? { finishedAt: call.finishedAt } : {},
-      ...call.durationMs !== void 0 ? { durationMs: call.durationMs } : {},
-      status: spanError ? "error" : "ok",
-      ...spanError ? { error: spanError } : {},
-      attributes: normalizeSpanAttributes({
-        "gen_ai.operation.name": "execute_tool",
-        "gen_ai.tool.name": call.name,
-        "gen_ai.tool.type": "function",
-        ...call.id ? { "gen_ai.tool.call.id": call.id } : {},
-        ...call.arguments !== void 0 ? { "gen_ai.tool.call.arguments": call.arguments } : {},
-        ...call.result !== void 0 ? { "gen_ai.tool.call.result": call.result } : {}
-      })
-    };
-  });
-}
 function ensureRunTrace(run, options) {
   if ((0, import_core.spans)(run).length > 0) {
     return void 0;
@@ -458,11 +393,6 @@ function ensureRunTrace(run, options) {
       ...createGenAiUsageAttributes(run.usage)
     })
   };
-  const toolSpans = createToolCallSpans((0, import_core.toolCalls)(run.session), {
-    traceId,
-    parentId: rootSpanId,
-    spanIdPrefix: `${traceId}:tool`
-  });
   const trace = {
     id: traceId,
     name: options.name,
@@ -470,7 +400,7 @@ function ensureRunTrace(run, options) {
     finishedAt: options.finishedAt.toISOString(),
     durationMs,
     ...options.source ? { metadata: { source: options.source } } : {},
-    spans: [runSpan, ...toolSpans]
+    spans: [runSpan]
   };
   run.traces = [trace];
   return trace;
@@ -500,7 +430,7 @@ function isHarnessRun(value) {
   return isNormalizedSession(candidate.session) && Boolean(candidate.usage) && typeof candidate.usage === "object" && !Array.isArray(candidate.usage) && Array.isArray(candidate.errors);
 }
 function isNormalizedSession(value) {
-  return Boolean(value) && typeof value === "object" && value !== null && "messages" in value && Array.isArray(value.messages);
+  return import_core.NormalizedSessionSchema.safeParse(value).success;
 }
 function resolveHarnessRunErrors(result) {
   if (result && typeof result === "object" && Array.isArray(result.errors)) {
@@ -527,7 +457,6 @@ function serializeError(error) {
   createFailedHarnessRun,
   createGenAiUsageAttributes,
   createHarness,
-  createToolCallSpans,
   ensureRunTrace,
   failedSpans,
   getHarnessRunFromError,
@@ -536,6 +465,7 @@ function serializeError(error) {
   isNormalizedSession,
   latestAssistantMessageContent,
   messagesByRole,
+  messagesToTranscriptEvents,
   normalizeContent,
   normalizeHarnessRun,
   normalizeMetadata,