vitest-evals 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/README.md +57 -10
  2. package/dist/harness.d.mts +56 -40
  3. package/dist/harness.d.ts +56 -40
  4. package/dist/harness.js +34 -104
  5. package/dist/harness.js.map +1 -1
  6. package/dist/harness.mjs +37 -104
  7. package/dist/harness.mjs.map +1 -1
  8. package/dist/index.d.mts +6 -6
  9. package/dist/index.d.ts +6 -6
  10. package/dist/index.js +56 -117
  11. package/dist/index.js.map +1 -1
  12. package/dist/index.mjs +59 -117
  13. package/dist/index.mjs.map +1 -1
  14. package/dist/internal/scoring.d.mts +2 -2
  15. package/dist/internal/scoring.d.ts +2 -2
  16. package/dist/internal/scoring.js.map +1 -1
  17. package/dist/internal/toolCallScorer.js.map +1 -1
  18. package/dist/internal/toolCallScorer.mjs +4 -1
  19. package/dist/internal/toolCallScorer.mjs.map +1 -1
  20. package/dist/judges/factualityJudge.js.map +1 -1
  21. package/dist/judges/factualityJudge.mjs +4 -1
  22. package/dist/judges/factualityJudge.mjs.map +1 -1
  23. package/dist/judges/index.js +47 -110
  24. package/dist/judges/index.js.map +1 -1
  25. package/dist/judges/index.mjs +51 -111
  26. package/dist/judges/index.mjs.map +1 -1
  27. package/dist/judges/judgeHarness.js +47 -110
  28. package/dist/judges/judgeHarness.js.map +1 -1
  29. package/dist/judges/judgeHarness.mjs +51 -111
  30. package/dist/judges/judgeHarness.mjs.map +1 -1
  31. package/dist/judges/toolCallJudge.js.map +1 -1
  32. package/dist/judges/toolCallJudge.mjs +4 -1
  33. package/dist/judges/toolCallJudge.mjs.map +1 -1
  34. package/dist/judges/types.d.mts +2 -2
  35. package/dist/judges/types.d.ts +2 -2
  36. package/dist/judges/types.js.map +1 -1
  37. package/dist/legacy/scorers/index.js.map +1 -1
  38. package/dist/legacy/scorers/index.mjs +4 -1
  39. package/dist/legacy/scorers/index.mjs.map +1 -1
  40. package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
  41. package/dist/legacy/scorers/toolCallScorer.mjs +4 -1
  42. package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
  43. package/dist/legacy/shared.d.mts +1 -8
  44. package/dist/legacy/shared.d.ts +1 -8
  45. package/dist/legacy/shared.js.map +1 -1
  46. package/dist/legacy.js +15 -1
  47. package/dist/legacy.js.map +1 -1
  48. package/dist/legacy.mjs +19 -2
  49. package/dist/legacy.mjs.map +1 -1
  50. package/dist/reporter.d.mts +0 -3
  51. package/dist/reporter.d.ts +0 -3
  52. package/dist/reporter.js +10 -40
  53. package/dist/reporter.js.map +1 -1
  54. package/dist/reporter.mjs +14 -41
  55. package/dist/reporter.mjs.map +1 -1
  56. package/package.json +3 -3
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/harness.ts"],"sourcesContent":["import {\n assistantMessages,\n failedSpans,\n latestAssistantMessageContent,\n messagesByRole,\n spans,\n spansByKind,\n systemMessages,\n toolCalls,\n toolMessages,\n userMessages,\n} from \"@vitest-evals/core\";\nimport type {\n GenAiOperationName,\n HarnessRun,\n HarnessRunError,\n JsonPrimitive,\n JsonValue,\n NormalizedMessage,\n NormalizedSession,\n NormalizedSpan,\n NormalizedSpanAttributes,\n NormalizedSpanEvent,\n NormalizedTrace,\n TimingSummary,\n ToolCallRecord,\n UsageSummary,\n} from \"@vitest-evals/core\";\n\nexport {\n assistantMessages,\n failedSpans,\n latestAssistantMessageContent,\n messagesByRole,\n spans,\n spansByKind,\n systemMessages,\n toolCalls,\n toolMessages,\n userMessages,\n} from \"@vitest-evals/core\";\nexport type {\n GenAiOperationName,\n GenAiOutputType,\n GenAiProviderName,\n GenAiSemanticAttributeKey,\n GenAiSemanticAttributes,\n GenAiTokenType,\n GenAiToolType,\n HarnessRun,\n HarnessRunError,\n JsonPrimitive,\n JsonValue,\n NormalizedMessage,\n NormalizedSession,\n NormalizedSpan,\n NormalizedSpanAttributeKey,\n NormalizedSpanAttributes,\n NormalizedSpanEvent,\n NormalizedTrace,\n OpenTelemetrySemanticAttributeKey,\n OpenTelemetrySemanticAttributes,\n TimingSummary,\n ToolCallRecord,\n UsageSummary,\n} from \"@vitest-evals/core\";\n\n/** Options for converting normalized tool calls into trace spans. */\nexport type CreateToolCallSpansOptions = {\n /** Trace id to attach to each generated tool span. */\n traceId?: string;\n /** Parent span id to attach to each generated tool span. */\n parentId?: string;\n /** Prefix used to create internal span ids instead of reusing tool-call ids. */\n spanIdPrefix?: string;\n};\n\n/** Options for attaching a fallback run trace to a harness result. */\nexport type EnsureRunTraceOptions = {\n /** Human-readable run or harness name. */\n name: string;\n /** Wall-clock start time for the harness run. */\n startedAt: Date;\n /** Wall-clock finish time for the harness run. */\n finishedAt: Date;\n /** Optional trace id. A generated id is used when omitted. */\n id?: string;\n /** GenAI operation name to place on the root run span. */\n operationName?: GenAiOperationName;\n /** Optional JSON-safe source marker for the trace metadata. */\n source?: string;\n};\n\ntype OutputField<TOutput extends JsonValue | undefined> =\n undefined extends TOutput ? { output?: TOutput } : { output: TOutput };\n\n/** Generic JSON-like metadata record used by normalized artifacts and reports. */\nexport type HarnessMetadata = Record<string, unknown>;\n\n/**\n * Runtime context passed from the eval fixture into a harness run.\n *\n * @example\n * ```ts\n * const harness: Harness<string> = {\n * name: \"refund-agent\",\n * async run(input, context) {\n * context.setArtifact(\"inputLength\", input.length);\n *\n * return {\n * output: undefined,\n * session: { messages: [{ role: \"user\", content: input }] },\n * usage: {},\n * errors: [],\n * };\n * },\n * };\n * ```\n */\nexport type HarnessContext = {\n /** Abort signal from Vitest when available. */\n signal?: AbortSignal;\n /** Mutable JSON-safe artifact bag shared with the harness. */\n artifacts: Record<string, JsonValue>;\n /** Stores one JSON-safe artifact on the current run. */\n setArtifact: (name: string, value: JsonValue) => void;\n};\n\n/**\n * Adapter that executes the system under test and returns a normalized run.\n *\n * @example\n * ```ts\n * const harness: Harness<string, { status: \"approved\" | \"denied\" }> = {\n * name: \"refund-agent\",\n * async run(input, context) {\n * return normalizeHarnessRun(input, await runRefundFlow(input), context);\n * },\n * };\n * ```\n */\nexport type Harness<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = {\n /** Stable harness name used in reports. */\n name: string;\n /** Executes the system under test and returns a normalized run. */\n run: (input: TInput, context: HarnessContext) => Promise<HarnessRun<TOutput>>;\n};\n\n/** Value or promise accepted by lightweight harness callbacks. */\nexport type MaybePromise<T> = T | Promise<T>;\n\n/** Lightweight tool-call record accepted by `createHarness(...)` results. */\nexport type SimpleToolCallRecord = Omit<\n ToolCallRecord,\n \"arguments\" | \"result\" | \"error\" | \"metadata\"\n> & {\n /** Raw tool arguments accepted by `createHarness(...)` before normalization. */\n arguments?: unknown;\n /** Raw tool result accepted by `createHarness(...)` before normalization. */\n result?: unknown;\n /** Raw tool error accepted by `createHarness(...)` before normalization. */\n error?: unknown;\n /** Raw tool metadata accepted by `createHarness(...)` before normalization. */\n metadata?: Record<string, unknown>;\n};\n\n/** Lightweight span event accepted by `createHarness(...)` results. */\nexport type SimpleSpanEvent = Omit<NormalizedSpanEvent, \"attributes\"> & {\n /** Raw event attributes accepted by `createHarness(...)` before normalization. */\n attributes?: Record<string, unknown>;\n};\n\n/** Lightweight span record accepted by `createHarness(...)` results. */\nexport type SimpleSpanRecord = Omit<\n NormalizedSpan,\n \"attributes\" | \"error\" | \"events\"\n> & {\n /** Raw span attributes accepted by `createHarness(...)` before normalization. */\n attributes?: Record<string, unknown>;\n /** Raw span error accepted by `createHarness(...)` before normalization. */\n error?: unknown;\n /** Raw span events accepted by `createHarness(...)` before normalization. */\n events?: SimpleSpanEvent[];\n};\n\n/** Lightweight trace record accepted by `createHarness(...)` results. */\nexport type SimpleTraceRecord = Omit<NormalizedTrace, \"metadata\" | \"spans\"> & {\n /** Raw trace metadata accepted by `createHarness(...)` before normalization. */\n metadata?: Record<string, unknown>;\n /** Lightweight spans to normalize into the trace. */\n spans: SimpleSpanRecord[];\n};\n\n/**\n * Lightweight result shape normalized by `createHarness(...)`.\n *\n * @example\n * ```ts\n * const result: SimpleHarnessResult<{ status: \"approved\" }> = {\n * output: { status: \"approved\" },\n * toolCalls: [{ name: \"lookupInvoice\", arguments: { invoiceId: \"inv_123\" } }],\n * usage: { totalTokens: 260 },\n * };\n * ```\n */\nexport type SimpleHarnessResult<\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = OutputField<TOutput> & {\n /** Pre-normalized transcript messages. When omitted, a default user/assistant transcript is created. */\n messages?: NormalizedMessage[];\n /** Lightweight tool-call records to normalize into the session. */\n toolCalls?: SimpleToolCallRecord[];\n /** Usage summary to attach to the run. */\n usage?: UsageSummary;\n /** Timing summary to attach to the run. */\n timings?: TimingSummary;\n /** Raw artifact values to normalize and merge into the run. */\n artifacts?: Record<string, unknown>;\n /** Lightweight traces and spans to normalize into the run. */\n traces?: SimpleTraceRecord[];\n /** Raw session metadata to normalize into the session. */\n metadata?: Record<string, unknown>;\n /** Raw errors to normalize into the run. */\n errors?: unknown[];\n};\n\n/** Either a complete normalized run or a lightweight result to normalize. */\nexport type HarnessResultLike<\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = HarnessRun<TOutput> | SimpleHarnessResult<TOutput>;\n\n/** Arguments passed to the `createHarness(...)` convenience callback. */\nexport type CreateHarnessRunArgs<TInput> = {\n /** Original input passed to `run(input)`. */\n input: TInput;\n /** Abort signal from Vitest when available. */\n signal?: AbortSignal;\n /** Mutable run artifact bag. */\n artifacts: HarnessContext[\"artifacts\"];\n /** Stores one JSON-safe artifact on the current run. */\n setArtifact: HarnessContext[\"setArtifact\"];\n};\n\n/**\n * Options for creating a lightweight custom application harness.\n *\n * @example\n * ```ts\n * const options: CreateHarnessOptions<string, { status: \"approved\" }> = {\n * name: \"refund-agent\",\n * run: async ({ input }) => ({\n * output: await classifyRefund(input),\n * }),\n * };\n * ```\n */\nexport type CreateHarnessOptions<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = {\n /** Stable harness name used in reports. */\n name: string;\n /** Executes application code and returns either a lightweight result or full `HarnessRun`. */\n run: (\n args: CreateHarnessRunArgs<TInput>,\n ) => MaybePromise<HarnessResultLike<TOutput>>;\n};\n\nfunction isJsonPrimitive(value: unknown): value is JsonPrimitive {\n return (\n value === null ||\n typeof value === \"string\" ||\n typeof value === \"boolean\" ||\n (typeof value === \"number\" && Number.isFinite(value))\n );\n}\n\nfunction isJsonRecord(value: unknown): value is Record<string, unknown> {\n return typeof value === \"object\" && value !== null && !Array.isArray(value);\n}\n\nfunction normalizeJsonArray(value: unknown[], seen: WeakSet<object>) {\n if (seen.has(value)) {\n return undefined;\n }\n\n seen.add(value);\n const normalized = value.map((item) => {\n const normalized = toJsonValueInternal(item, seen);\n return normalized === undefined ? null : normalized;\n });\n seen.delete(value);\n\n return normalized;\n}\n\nfunction normalizeJsonObject(\n value: Record<string, unknown>,\n seen: WeakSet<object>,\n): Record<string, JsonValue> {\n const normalized: Record<string, JsonValue> = {};\n\n if (seen.has(value)) {\n return normalized;\n }\n\n seen.add(value);\n try {\n for (const [key, entryValue] of Object.entries(value)) {\n const entry = toJsonValueInternal(entryValue, seen);\n if (entry !== undefined) {\n normalized[key] = entry;\n }\n }\n } finally {\n seen.delete(value);\n }\n\n return normalized;\n}\n\n/** Returns true when a value exposes a callable method with the given name. */\nexport function hasCallableMethod(value: unknown, methodName: string) {\n return (\n value !== null &&\n (typeof value === \"object\" || typeof value === \"function\") &&\n methodName in value &&\n typeof (value as Record<string, unknown>)[methodName] === \"function\"\n );\n}\n\n/** Normalizes an unknown value into the JSON-safe shape used by harness runs. */\nexport function toJsonValue(value: unknown): JsonValue | undefined {\n return toJsonValueInternal(value, new WeakSet());\n}\n\nfunction toJsonValueInternal(\n value: unknown,\n seen: WeakSet<object>,\n): JsonValue | undefined {\n if (isJsonPrimitive(value)) {\n return value;\n }\n\n if (\n value !== null &&\n typeof value === \"object\" &&\n seen.has(value as object)\n ) {\n return undefined;\n }\n\n if (Array.isArray(value)) {\n return normalizeJsonArray(value, seen);\n }\n\n if (isJsonRecord(value)) {\n return normalizeJsonObject(value, seen);\n }\n\n return undefined;\n}\n\n/** Drops non-JSON properties from a record while preserving valid values. */\nexport function normalizeRecord(\n value: Record<string, unknown>,\n): Record<string, JsonValue> {\n return normalizeJsonObject(value, new WeakSet());\n}\n\n/** Normalizes metadata and omits the field entirely when nothing survives. */\nexport function normalizeMetadata(\n value: Record<string, unknown>,\n): Record<string, JsonValue> | undefined {\n const normalized = normalizeRecord(value);\n return Object.keys(normalized).length > 0 ? normalized : undefined;\n}\n\n/** Converts arbitrary content into the JSON-safe message content shape. */\nexport function normalizeContent(value: unknown): JsonValue {\n const normalized = toJsonValue(value);\n return normalized !== undefined ? normalized : String(value);\n}\n\n/**\n * Creates a harness from the common \"run app code and return output\" shape.\n *\n * @param options - Harness name plus the callback that executes app code.\n *\n * @example\n * ```ts\n * import { createHarness } from \"vitest-evals\";\n *\n * export const refundHarness = createHarness<\n * string,\n * { status: \"approved\" | \"denied\" }\n * >({\n * name: \"refund-agent\",\n * run: async ({ input, setArtifact }) => {\n * const result = await runRefundFlow(input);\n * const output = { status: result.status };\n *\n * setArtifact(\"case\", { invoiceId: result.invoiceId });\n *\n * return {\n * output,\n * toolCalls: result.toolCalls,\n * usage: { provider: \"openai\", model: \"gpt-4o-mini\" },\n * };\n * },\n * });\n * ```\n */\nexport function createHarness<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n>(options: CreateHarnessOptions<TInput, TOutput>): Harness<TInput, TOutput>;\nexport function createHarness<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n>(options: CreateHarnessOptions<TInput, TOutput>): Harness<TInput, TOutput> {\n const harness: Harness<TInput, TOutput> = {\n name: options.name,\n run: async (input, context) => {\n const startedAt = new Date();\n\n try {\n const result = await options.run({\n input,\n signal: context.signal,\n artifacts: context.artifacts,\n setArtifact: context.setArtifact,\n });\n const run = normalizeHarnessRun(input, result, context);\n ensureRunTrace(run, {\n name: options.name,\n startedAt,\n finishedAt: new Date(),\n });\n\n return run;\n } catch (error) {\n const partialRun = getHarnessRunFromError(error);\n if (partialRun) {\n if (\n Object.keys(context.artifacts).length > 0 &&\n !partialRun.artifacts\n ) {\n partialRun.artifacts = context.artifacts;\n }\n ensureRunTrace(partialRun, {\n name: options.name,\n startedAt,\n finishedAt: new Date(),\n });\n throw attachHarnessRunToError(error, partialRun);\n }\n\n const failedRun = createFailedHarnessRun(input, error, {\n artifacts: context.artifacts,\n });\n ensureRunTrace(failedRun, {\n name: options.name,\n startedAt,\n finishedAt: new Date(),\n });\n\n throw attachHarnessRunToError(error, failedRun);\n }\n },\n };\n\n return harness;\n}\n\n/**\n * Normalizes a lightweight harness result into the reporter-facing run shape.\n *\n * @param input - Original input passed to the harness.\n * @param result - Lightweight result or pre-normalized harness run.\n * @param context - Optional per-run context used to merge artifacts.\n *\n * @example\n * ```ts\n * const run = normalizeHarnessRun(\"Refund invoice inv_123\", {\n * output: { status: \"approved\" },\n * toolCalls: [{ name: \"lookupInvoice\", arguments: { invoiceId: \"inv_123\" } }],\n * usage: { provider: \"openai\", model: \"gpt-4o-mini\" },\n * });\n *\n * expect(toolCalls(run.session)).toHaveLength(1);\n * ```\n */\nexport function normalizeHarnessRun<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n>(\n input: TInput,\n result: HarnessResultLike<TOutput>,\n context?: HarnessContext,\n): HarnessRun<TOutput> {\n if (isHarnessRun(result)) {\n if (\n context &&\n Object.keys(context.artifacts).length > 0 &&\n !result.artifacts\n ) {\n return {\n ...result,\n artifacts: context.artifacts,\n };\n }\n\n return result;\n }\n\n const output = result.output;\n const toolCalls = normalizeSimpleToolCalls(result.toolCalls);\n const usage = result.usage ?? {};\n const messages =\n result.messages ??\n createDefaultSessionMessages({\n input,\n output,\n toolCalls,\n });\n const metadata = result.metadata\n ? normalizeMetadata(result.metadata)\n : undefined;\n const artifacts = normalizeMergedArtifacts(\n context?.artifacts,\n result.artifacts,\n );\n const traces = normalizeSimpleTraces(result.traces);\n\n return {\n session: {\n messages,\n ...(usage.provider ? { provider: usage.provider } : {}),\n ...(usage.model ? { model: usage.model } : {}),\n ...(metadata ? { metadata } : {}),\n },\n ...(output !== undefined ? { output } : {}),\n usage,\n ...(result.timings ? { timings: result.timings } : {}),\n ...(artifacts ? { artifacts } : {}),\n ...(traces ? { traces } : {}),\n errors: normalizeSimpleErrors(result.errors),\n } as HarnessRun<TOutput>;\n}\n\n/**\n * Builds a JSON-safe failed run for errors that happen before a harness can return.\n *\n * @param input - Original input passed to the harness.\n * @param error - Error thrown by setup or execution.\n * @param options - Optional artifacts to preserve on the failed run.\n */\nexport function createFailedHarnessRun(\n input: unknown,\n error: unknown,\n options: { artifacts?: Record<string, JsonValue> } = {},\n): HarnessRun {\n const artifacts = options.artifacts;\n\n return {\n session: {\n messages: [\n {\n role: \"user\",\n content: normalizeContent(input),\n },\n ],\n },\n usage: {},\n ...(artifacts && Object.keys(artifacts).length > 0 ? { artifacts } : {}),\n errors: [serializeError(error)],\n };\n}\n\nfunction createDefaultSessionMessages<TInput>({\n input,\n output,\n toolCalls: normalizedToolCalls,\n}: {\n input: TInput;\n output: JsonValue | undefined;\n toolCalls: ToolCallRecord[];\n}): NormalizedMessage[] {\n const messages: NormalizedMessage[] = [\n {\n role: \"user\",\n content: normalizeContent(input),\n },\n ];\n\n if (output !== undefined || normalizedToolCalls.length > 0) {\n messages.push({\n role: \"assistant\",\n ...(output !== undefined ? { content: normalizeContent(output) } : {}),\n ...(normalizedToolCalls.length > 0\n ? { toolCalls: normalizedToolCalls }\n : {}),\n });\n }\n\n return messages;\n}\n\nfunction normalizeSimpleToolCalls(\n calls: SimpleToolCallRecord[] | undefined,\n): ToolCallRecord[] {\n return (calls ?? []).map((call) => {\n const {\n arguments: rawArguments,\n result: rawResult,\n error: rawError,\n metadata: rawMetadata,\n ...toolCall\n } = call;\n const args = normalizeToolCallArguments(rawArguments);\n const result = toJsonValue(rawResult);\n const error = normalizeToolCallError(rawError);\n const metadata = rawMetadata ? normalizeMetadata(rawMetadata) : undefined;\n\n return {\n ...toolCall,\n ...(args ? { arguments: args } : {}),\n ...(result !== undefined ? { result } : {}),\n ...(error ? { error } : {}),\n ...(metadata ? { metadata } : {}),\n };\n });\n}\n\nfunction normalizeToolCallArguments(\n value: unknown,\n): Record<string, JsonValue> | undefined {\n if (value === undefined) {\n return undefined;\n }\n\n const normalized = toJsonValue(value);\n return normalized &&\n typeof normalized === \"object\" &&\n !Array.isArray(normalized)\n ? normalized\n : undefined;\n}\n\nfunction normalizeToolCallError(\n value: unknown,\n): ToolCallRecord[\"error\"] | undefined {\n if (value === undefined) {\n return undefined;\n }\n\n const serialized = serializeError(value);\n const { message, type, ...details } = serialized;\n\n return {\n ...details,\n message: typeof message === \"string\" ? message : String(message),\n ...(typeof type === \"string\" ? { type } : {}),\n };\n}\n\nfunction normalizeMergedArtifacts(\n contextArtifacts: Record<string, JsonValue> | undefined,\n resultArtifacts: Record<string, unknown> | undefined,\n) {\n const artifacts = {\n ...(contextArtifacts ?? {}),\n ...(resultArtifacts ? normalizeRecord(resultArtifacts) : {}),\n };\n\n return Object.keys(artifacts).length > 0 ? artifacts : undefined;\n}\n\nfunction normalizeSimpleErrors(\n errors: unknown[] | undefined,\n): Array<Record<string, JsonValue>> {\n return (errors ?? []).map((error) => {\n const normalized = toJsonValue(error);\n\n if (\n normalized &&\n typeof normalized === \"object\" &&\n !Array.isArray(normalized) &&\n Object.keys(normalized).length > 0\n ) {\n return normalized;\n }\n\n return serializeError(error);\n });\n}\n\nfunction normalizeSimpleTraces(\n traces: SimpleTraceRecord[] | undefined,\n): NormalizedTrace[] | undefined {\n if (!Array.isArray(traces)) {\n return undefined;\n }\n\n const normalized = traces\n .map(normalizeSimpleTrace)\n .filter((trace): trace is NormalizedTrace => Boolean(trace));\n\n return normalized.length > 0 ? normalized : undefined;\n}\n\nfunction normalizeSimpleTrace(trace: unknown): NormalizedTrace | undefined {\n if (!isJsonRecord(trace)) {\n return undefined;\n }\n\n const {\n metadata: rawMetadata,\n spans: rawSpans,\n ...traceFields\n } = trace as Partial<SimpleTraceRecord>;\n const spans = (Array.isArray(rawSpans) ? rawSpans : [])\n .map((span) => normalizeSimpleSpan(span))\n .filter((span): span is NormalizedSpan => Boolean(span));\n const metadata = isJsonRecord(rawMetadata)\n ? normalizeMetadata(rawMetadata)\n : undefined;\n\n if (spans.length === 0 && !traceFields.id && !traceFields.name) {\n return undefined;\n }\n\n return {\n ...traceFields,\n ...(metadata ? { metadata } : {}),\n spans,\n };\n}\n\nfunction normalizeSimpleSpan(span: unknown): NormalizedSpan | undefined {\n if (!isJsonRecord(span) || typeof span.name !== \"string\" || !span.name) {\n return undefined;\n }\n\n const {\n attributes: rawAttributes,\n error: rawError,\n events: rawEvents,\n ...spanFields\n } = span as Partial<SimpleSpanRecord> & { name: string };\n const attributes = rawAttributes\n ? isJsonRecord(rawAttributes)\n ? normalizeMetadata(rawAttributes)\n : undefined\n : undefined;\n const error = normalizeSpanError(rawError);\n const events = normalizeSimpleSpanEvents(rawEvents);\n\n return {\n ...spanFields,\n ...(attributes\n ? { attributes: attributes as NormalizedSpanAttributes }\n : {}),\n ...(error ? { error } : {}),\n ...(events ? { events } : {}),\n };\n}\n\nfunction normalizeSimpleSpanEvents(\n events: unknown,\n): NormalizedSpanEvent[] | undefined {\n if (!Array.isArray(events)) {\n return undefined;\n }\n\n const normalized = events\n .map(normalizeSimpleSpanEvent)\n .filter((event): event is NormalizedSpanEvent => Boolean(event));\n\n return normalized.length > 0 ? normalized : undefined;\n}\n\nfunction normalizeSimpleSpanEvent(\n event: unknown,\n): NormalizedSpanEvent | undefined {\n if (!isJsonRecord(event) || typeof event.name !== \"string\" || !event.name) {\n return undefined;\n }\n\n const { attributes: rawAttributes, ...eventFields } =\n event as Partial<SimpleSpanEvent> & { name: string };\n const attributes = rawAttributes\n ? isJsonRecord(rawAttributes)\n ? normalizeMetadata(rawAttributes)\n : undefined\n : undefined;\n\n return {\n ...eventFields,\n ...(attributes\n ? { attributes: attributes as NormalizedSpanAttributes }\n : {}),\n };\n}\n\n/** Normalizes arbitrary span errors while preserving object-shaped messages. */\nexport function normalizeSpanError(\n error: unknown,\n): NormalizedSpan[\"error\"] | undefined {\n if (error === undefined) {\n return undefined;\n }\n\n if (error instanceof Error) {\n const details = normalizeMetadata(\n error as unknown as Record<string, unknown>,\n );\n\n return {\n ...(details ?? {}),\n type: error.name,\n message: error.message,\n };\n }\n\n if (\n error &&\n typeof error === \"object\" &&\n !Array.isArray(error) &&\n typeof (error as { message?: unknown }).message === \"string\"\n ) {\n const normalized = normalizeMetadata(error as Record<string, unknown>);\n const { message, type, ...details } = normalized ?? {};\n\n return {\n ...details,\n message: message as string,\n ...(typeof type === \"string\" ? { type } : {}),\n };\n }\n\n const serialized = serializeError(error);\n const { message, type, ...details } = serialized;\n\n return {\n ...details,\n message: typeof message === \"string\" ? message : String(message),\n ...(typeof type === \"string\" ? { type } : {}),\n };\n}\n\n/** Normalizes raw span attributes into the JSON-safe span attribute shape. */\nexport function normalizeSpanAttributes(\n attributes: Record<string, unknown>,\n): NormalizedSpanAttributes | undefined {\n return normalizeMetadata(attributes) as NormalizedSpanAttributes | undefined;\n}\n\n/** Builds common OpenTelemetry GenAI usage attributes from a usage summary. */\nexport function createGenAiUsageAttributes(\n usage: UsageSummary | undefined,\n options: { provider?: string } = {},\n) {\n return {\n \"gen_ai.provider.name\": usage?.provider ?? options.provider,\n \"gen_ai.request.model\": usage?.model,\n \"gen_ai.response.model\": usage?.model,\n \"gen_ai.usage.input_tokens\": usage?.inputTokens,\n \"gen_ai.usage.output_tokens\": usage?.outputTokens,\n \"gen_ai.usage.reasoning.output_tokens\": usage?.reasoningTokens,\n } satisfies Record<string, unknown>;\n}\n\n/**\n * Converts normalized tool-call records into trace spans.\n *\n * Tool-call ids are preserved as GenAI attributes. Pass `spanIdPrefix` when the\n * spans belong to a known trace so span ids stay internally unique.\n */\nexport function createToolCallSpans(\n calls: ToolCallRecord[],\n options: CreateToolCallSpansOptions = {},\n): NormalizedSpan[] {\n return calls.map((call, index) => {\n const spanError = call.error ? normalizeSpanError(call.error) : undefined;\n const spanId = options.spanIdPrefix\n ? `${options.spanIdPrefix}:${index + 1}`\n : call.id;\n\n return {\n ...(spanId ? { id: spanId } : {}),\n ...(options.traceId ? { traceId: options.traceId } : {}),\n ...(options.parentId ? { parentId: options.parentId } : {}),\n name: call.name,\n kind: \"tool\",\n ...(call.startedAt ? { startedAt: call.startedAt } : {}),\n ...(call.finishedAt ? { finishedAt: call.finishedAt } : {}),\n ...(call.durationMs !== undefined ? { durationMs: call.durationMs } : {}),\n status: spanError ? \"error\" : \"ok\",\n ...(spanError ? { error: spanError } : {}),\n attributes: normalizeSpanAttributes({\n \"gen_ai.operation.name\": \"execute_tool\",\n \"gen_ai.tool.name\": call.name,\n \"gen_ai.tool.type\": \"function\",\n ...(call.id ? { \"gen_ai.tool.call.id\": call.id } : {}),\n ...(call.arguments !== undefined\n ? { \"gen_ai.tool.call.arguments\": call.arguments }\n : {}),\n ...(call.result !== undefined\n ? { \"gen_ai.tool.call.result\": call.result }\n : {}),\n }),\n } satisfies NormalizedSpan;\n });\n}\n\n/**\n * Attaches a fallback run trace when a harness result does not already contain spans.\n *\n * This keeps custom harnesses inspectable while first-party harness packages\n * remain free to attach richer native traces.\n */\nexport function ensureRunTrace(\n run: HarnessRun,\n options: EnsureRunTraceOptions,\n): NormalizedTrace | undefined {\n if (spans(run).length > 0) {\n return undefined;\n }\n\n const traceId = options.id ?? createGeneratedTraceId();\n const rootSpanId = `${traceId}:run`;\n const durationMs = options.finishedAt.getTime() - options.startedAt.getTime();\n const rootError =\n run.errors.length > 0 ? normalizeSpanError(run.errors[0]) : undefined;\n const runSpan: NormalizedSpan = {\n id: rootSpanId,\n traceId,\n name: options.name,\n kind: \"run\",\n startedAt: options.startedAt.toISOString(),\n finishedAt: options.finishedAt.toISOString(),\n durationMs,\n status: rootError ? \"error\" : \"ok\",\n ...(rootError ? { error: rootError } : {}),\n attributes: normalizeSpanAttributes({\n \"gen_ai.operation.name\": options.operationName ?? \"invoke_workflow\",\n \"gen_ai.workflow.name\": options.name,\n ...createGenAiUsageAttributes(run.usage),\n }),\n };\n const toolSpans = createToolCallSpans(toolCalls(run.session), {\n traceId,\n parentId: rootSpanId,\n spanIdPrefix: `${traceId}:tool`,\n });\n const trace: NormalizedTrace = {\n id: traceId,\n name: options.name,\n startedAt: options.startedAt.toISOString(),\n finishedAt: options.finishedAt.toISOString(),\n durationMs,\n ...(options.source ? { metadata: { source: options.source } } : {}),\n spans: [runSpan, ...toolSpans],\n };\n\n run.traces = [trace];\n return trace;\n}\n\nlet nextGeneratedTraceId = 0;\n\nfunction createGeneratedTraceId() {\n nextGeneratedTraceId += 1;\n return `trace_${nextGeneratedTraceId}`;\n}\n\n/**\n * Attaches a partial or complete harness run to an arbitrary thrown error.\n *\n * @param error - Thrown value to wrap.\n * @param run - Partial or complete normalized harness run to preserve.\n *\n * @example\n * ```ts\n * try {\n * return await runAgent(input);\n * } catch (error) {\n * throw attachHarnessRunToError(error, partialRun);\n * }\n * ```\n */\nexport function attachHarnessRunToError(\n error: unknown,\n run: HarnessRun,\n): HarnessRunError {\n const baseError =\n error instanceof Error\n ? error\n : new Error(String(error ?? \"Unknown error\"));\n return Object.assign(baseError, {\n vitestEvalsRun: run,\n });\n}\n\n/**\n * Reads an attached harness run back off a previously wrapped error value.\n *\n * @param error - Unknown thrown value that may contain a harness run.\n *\n * @example\n * ```ts\n * const partialRun = getHarnessRunFromError(error);\n *\n * if (partialRun) {\n * console.log(toolCalls(partialRun.session));\n * }\n * ```\n */\nexport function getHarnessRunFromError(error: unknown): HarnessRun | undefined {\n if (\n error &&\n typeof error === \"object\" &&\n \"vitestEvalsRun\" in error &&\n isHarnessRun((error as { vitestEvalsRun?: unknown }).vitestEvalsRun)\n ) {\n return (error as { vitestEvalsRun: HarnessRun }).vitestEvalsRun;\n }\n\n return undefined;\n}\n\n/** Returns true when a value matches the normalized `HarnessRun` contract. */\nexport function isHarnessRun(value: unknown): value is HarnessRun {\n if (!value || typeof value !== \"object\") {\n return false;\n }\n\n const candidate = value as {\n session?: unknown;\n usage?: unknown;\n errors?: unknown;\n };\n\n return (\n isNormalizedSession(candidate.session) &&\n Boolean(candidate.usage) &&\n typeof candidate.usage === \"object\" &&\n !Array.isArray(candidate.usage) &&\n Array.isArray(candidate.errors)\n );\n}\n\n/** Returns true when a value matches the normalized session contract. */\nexport function isNormalizedSession(\n value: unknown,\n): value is NormalizedSession {\n return (\n Boolean(value) &&\n typeof value === \"object\" &&\n value !== null &&\n \"messages\" in value &&\n Array.isArray((value as { messages?: unknown }).messages)\n );\n}\n\n/** Reuses pre-normalized harness errors when a runtime already returns them. */\nexport function resolveHarnessRunErrors(\n result: unknown,\n): Array<Record<string, JsonValue>> {\n if (\n result &&\n typeof result === \"object\" &&\n Array.isArray((result as Record<string, unknown>).errors)\n ) {\n return (result as { errors: Array<Record<string, JsonValue>> }).errors;\n }\n\n return [];\n}\n\n/** Serializes an arbitrary thrown value into the normalized error shape. */\nexport function serializeError(error: unknown): Record<string, JsonValue> {\n if (error instanceof Error) {\n return {\n type: error.name,\n message: error.message,\n };\n }\n\n return {\n type: \"Error\",\n message: String(error),\n };\n}\n"],"mappings":";AAAA;AAAA,EAKE;AAAA,EAGA;AAAA,OAGK;AAkBP;AAAA,EACE,qBAAAA;AAAA,EACA,eAAAC;AAAA,EACA,iCAAAC;AAAA,EACA,kBAAAC;AAAA,EACA,SAAAC;AAAA,EACA,eAAAC;AAAA,EACA,kBAAAC;AAAA,EACA,aAAAC;AAAA,EACA,gBAAAC;AAAA,EACA,gBAAAC;AAAA,OACK;AAuOP,SAAS,gBAAgB,OAAwC;AAC/D,SACE,UAAU,QACV,OAAO,UAAU,YACjB,OAAO,UAAU,aAChB,OAAO,UAAU,YAAY,OAAO,SAAS,KAAK;AAEvD;AAEA,SAAS,aAAa,OAAkD;AACtE,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,CAAC,MAAM,QAAQ,KAAK;AAC5E;AAEA,SAAS,mBAAmB,OAAkB,MAAuB;AACnE,MAAI,KAAK,IAAI,KAAK,GAAG;AACnB,WAAO;AAAA,EACT;AAEA,OAAK,IAAI,KAAK;AACd,QAAM,aAAa,MAAM,IAAI,CAAC,SAAS;AACrC,UAAMC,cAAa,oBAAoB,MAAM,IAAI;AACjD,WAAOA,gBAAe,SAAY,OAAOA;AAAA,EAC3C,CAAC;AACD,OAAK,OAAO,KAAK;AAEjB,SAAO;AACT;AAEA,SAAS,oBACP,OACA,MAC2B;AAC3B,QAAM,aAAwC,CAAC;AAE/C,MAAI,KAAK,IAAI,KAAK,GAAG;AACnB,WAAO;AAAA,EACT;AAEA,OAAK,IAAI,KAAK;AACd,MAAI;AACF,eAAW,CAAC,KAAK,UAAU,KAAK,OAAO,QAAQ,KAAK,GAAG;AACrD,YAAM,QAAQ,oBAAoB,YAAY,IAAI;AAClD,UAAI,UAAU,QAAW;AACvB,mBAAW,GAAG,IAAI;AAAA,MACpB;AAAA,IACF;AAAA,EACF,UAAE;AACA,SAAK,OAAO,KAAK;AAAA,EACnB;AAEA,SAAO;AACT;AAGO,SAAS,kBAAkB,OAAgB,YAAoB;AACpE,SACE,UAAU,SACT,OAAO,UAAU,YAAY,OAAO,UAAU,eAC/C,cAAc,SACd,OAAQ,MAAkC,UAAU,MAAM;AAE9D;AAGO,SAAS,YAAY,OAAuC;AACjE,SAAO,oBAAoB,OAAO,oBAAI,QAAQ,CAAC;AACjD;AAEA,SAAS,oBACP,OACA,MACuB;AACvB,MAAI,gBAAgB,KAAK,GAAG;AAC1B,WAAO;AAAA,EACT;AAEA,MACE,UAAU,QACV,OAAO,UAAU,YACjB,KAAK,IAAI,KAAe,GACxB;AACA,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,QAAQ,KAAK,GAAG;AACxB,WAAO,mBAAmB,OAAO,IAAI;AAAA,EACvC;AAEA,MAAI,aAAa,KAAK,GAAG;AACvB,WAAO,oBAAoB,OAAO,IAAI;AAAA,EACxC;AAEA,SAAO;AACT;AAGO,SAAS,gBACd,OAC2B;AAC3B,SAAO,oBAAoB,OAAO,oBAAI,QAAQ,CAAC;AACjD;AAGO,SAAS,kBACd,OACuC;AACvC,QAAM,aAAa,gBAAgB,KAAK;AACxC,SAAO,OAAO,KAAK,UAAU,EAAE,SAAS,IAAI,aAAa;AAC3D;AAGO,SAAS,iBAAiB,OAA2B;AAC1D,QAAM,aAAa,YAAY,KAAK;AACpC,SAAO,eAAe,SAAY,aAAa,OAAO,KAAK;AAC7D;AAmCO,SAAS,cAGd,SAA0E;AAC1E,QAAM,UAAoC;AAAA,IACxC,MAAM,QAAQ;AAAA,IACd,KAAK,OAAO,OAAO,YAAY;AAC7B,YAAM,YAAY,oBAAI,KAAK;AAE3B,UAAI;AACF,cAAM,SAAS,MAAM,QAAQ,IAAI;AAAA,UAC/B;AAAA,UACA,QAAQ,QAAQ;AAAA,UAChB,WAAW,QAAQ;AAAA,UACnB,aAAa,QAAQ;AAAA,QACvB,CAAC;AACD,cAAM,MAAM,oBAAoB,OAAO,QAAQ,OAAO;AACtD,uBAAe,KAAK;AAAA,UAClB,MAAM,QAAQ;AAAA,UACd;AAAA,UACA,YAAY,oBAAI,KAAK;AAAA,QACvB,CAAC;AAED,eAAO;AAAA,MACT,SAAS,OAAO;AACd,cAAM,aAAa,uBAAuB,KAAK;AAC/C,YAAI,YAAY;AACd,cACE,OAAO,KAAK,QAAQ,SAAS,EAAE,SAAS,KACxC,CAAC,WAAW,WACZ;AACA,uBAAW,YAAY,QAAQ;AAAA,UACjC;AACA,yBAAe,YAAY;AAAA,YACzB,MAAM,QAAQ;AAAA,YACd;AAAA,YACA,YAAY,oBAAI,KAAK;AAAA,UACvB,CAAC;AACD,gBAAM,wBAAwB,OAAO,UAAU;AAAA,QACjD;AAEA,cAAM,YAAY,uBAAuB,OAAO,OAAO;AAAA,UACrD,WAAW,QAAQ;AAAA,QACrB,CAAC;AACD,uBAAe,WAAW;AAAA,UACxB,MAAM,QAAQ;AAAA,UACd;AAAA,UACA,YAAY,oBAAI,KAAK;AAAA,QACvB,CAAC;AAED,cAAM,wBAAwB,OAAO,SAAS;AAAA,MAChD;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AACT;AAoBO,SAAS,oBAId,OACA,QACA,SACqB;AACrB,MAAI,aAAa,MAAM,GAAG;AACxB,QACE,WACA,OAAO,KAAK,QAAQ,SAAS,EAAE,SAAS,KACxC,CAAC,OAAO,WACR;AACA,aAAO;AAAA,QACL,GAAG;AAAA,QACH,WAAW,QAAQ;AAAA,MACrB;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AAEA,QAAM,SAAS,OAAO;AACtB,QAAMH,aAAY,yBAAyB,OAAO,SAAS;AAC3D,QAAM,QAAQ,OAAO,SAAS,CAAC;AAC/B,QAAM,WACJ,OAAO,YACP,6BAA6B;AAAA,IAC3B;AAAA,IACA;AAAA,IACA,WAAAA;AAAA,EACF,CAAC;AACH,QAAM,WAAW,OAAO,WACpB,kBAAkB,OAAO,QAAQ,IACjC;AACJ,QAAM,YAAY;AAAA,IAChB,SAAS;AAAA,IACT,OAAO;AAAA,EACT;AACA,QAAM,SAAS,sBAAsB,OAAO,MAAM;AAElD,SAAO;AAAA,IACL,SAAS;AAAA,MACP;AAAA,MACA,GAAI,MAAM,WAAW,EAAE,UAAU,MAAM,SAAS,IAAI,CAAC;AAAA,MACrD,GAAI,MAAM,QAAQ,EAAE,OAAO,MAAM,MAAM,IAAI,CAAC;AAAA,MAC5C,GAAI,WAAW,EAAE,SAAS,IAAI,CAAC;AAAA,IACjC;AAAA,IACA,GAAI,WAAW,SAAY,EAAE,OAAO,IAAI,CAAC;AAAA,IACzC;AAAA,IACA,GAAI,OAAO,UAAU,EAAE,SAAS,OAAO,QAAQ,IAAI,CAAC;AAAA,IACpD,GAAI,YAAY,EAAE,UAAU,IAAI,CAAC;AAAA,IACjC,GAAI,SAAS,EAAE,OAAO,IAAI,CAAC;AAAA,IAC3B,QAAQ,sBAAsB,OAAO,MAAM;AAAA,EAC7C;AACF;AASO,SAAS,uBACd,OACA,OACA,UAAqD,CAAC,GAC1C;AACZ,QAAM,YAAY,QAAQ;AAE1B,SAAO;AAAA,IACL,SAAS;AAAA,MACP,UAAU;AAAA,QACR;AAAA,UACE,MAAM;AAAA,UACN,SAAS,iBAAiB,KAAK;AAAA,QACjC;AAAA,MACF;AAAA,IACF;AAAA,IACA,OAAO,CAAC;AAAA,IACR,GAAI,aAAa,OAAO,KAAK,SAAS,EAAE,SAAS,IAAI,EAAE,UAAU,IAAI,CAAC;AAAA,IACtE,QAAQ,CAAC,eAAe,KAAK,CAAC;AAAA,EAChC;AACF;AAEA,SAAS,6BAAqC;AAAA,EAC5C;AAAA,EACA;AAAA,EACA,WAAW;AACb,GAIwB;AACtB,QAAM,WAAgC;AAAA,IACpC;AAAA,MACE,MAAM;AAAA,MACN,SAAS,iBAAiB,KAAK;AAAA,IACjC;AAAA,EACF;AAEA,MAAI,WAAW,UAAa,oBAAoB,SAAS,GAAG;AAC1D,aAAS,KAAK;AAAA,MACZ,MAAM;AAAA,MACN,GAAI,WAAW,SAAY,EAAE,SAAS,iBAAiB,MAAM,EAAE,IAAI,CAAC;AAAA,MACpE,GAAI,oBAAoB,SAAS,IAC7B,EAAE,WAAW,oBAAoB,IACjC,CAAC;AAAA,IACP,CAAC;AAAA,EACH;AAEA,SAAO;AACT;AAEA,SAAS,yBACP,OACkB;AAClB,UAAQ,SAAS,CAAC,GAAG,IAAI,CAAC,SAAS;AACjC,UAAM;AAAA,MACJ,WAAW;AAAA,MACX,QAAQ;AAAA,MACR,OAAO;AAAA,MACP,UAAU;AAAA,MACV,GAAG;AAAA,IACL,IAAI;AACJ,UAAM,OAAO,2BAA2B,YAAY;AACpD,UAAM,SAAS,YAAY,SAAS;AACpC,UAAM,QAAQ,uBAAuB,QAAQ;AAC7C,UAAM,WAAW,cAAc,kBAAkB,WAAW,IAAI;AAEhE,WAAO;AAAA,MACL,GAAG;AAAA,MACH,GAAI,OAAO,EAAE,WAAW,KAAK,IAAI,CAAC;AAAA,MAClC,GAAI,WAAW,SAAY,EAAE,OAAO,IAAI,CAAC;AAAA,MACzC,GAAI,QAAQ,EAAE,MAAM,IAAI,CAAC;AAAA,MACzB,GAAI,WAAW,EAAE,SAAS,IAAI,CAAC;AAAA,IACjC;AAAA,EACF,CAAC;AACH;AAEA,SAAS,2BACP,OACuC;AACvC,MAAI,UAAU,QAAW;AACvB,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,YAAY,KAAK;AACpC,SAAO,cACL,OAAO,eAAe,YACtB,CAAC,MAAM,QAAQ,UAAU,IACvB,aACA;AACN;AAEA,SAAS,uBACP,OACqC;AACrC,MAAI,UAAU,QAAW;AACvB,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,eAAe,KAAK;AACvC,QAAM,EAAE,SAAS,MAAM,GAAG,QAAQ,IAAI;AAEtC,SAAO;AAAA,IACL,GAAG;AAAA,IACH,SAAS,OAAO,YAAY,WAAW,UAAU,OAAO,OAAO;AAAA,IAC/D,GAAI,OAAO,SAAS,WAAW,EAAE,KAAK,IAAI,CAAC;AAAA,EAC7C;AACF;AAEA,SAAS,yBACP,kBACA,iBACA;AACA,QAAM,YAAY;AAAA,IAChB,GAAI,oBAAoB,CAAC;AAAA,IACzB,GAAI,kBAAkB,gBAAgB,eAAe,IAAI,CAAC;AAAA,EAC5D;AAEA,SAAO,OAAO,KAAK,SAAS,EAAE,SAAS,IAAI,YAAY;AACzD;AAEA,SAAS,sBACP,QACkC;AAClC,UAAQ,UAAU,CAAC,GAAG,IAAI,CAAC,UAAU;AACnC,UAAM,aAAa,YAAY,KAAK;AAEpC,QACE,cACA,OAAO,eAAe,YACtB,CAAC,MAAM,QAAQ,UAAU,KACzB,OAAO,KAAK,UAAU,EAAE,SAAS,GACjC;AACA,aAAO;AAAA,IACT;AAEA,WAAO,eAAe,KAAK;AAAA,EAC7B,CAAC;AACH;AAEA,SAAS,sBACP,QAC+B;AAC/B,MAAI,CAAC,MAAM,QAAQ,MAAM,GAAG;AAC1B,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,OAChB,IAAI,oBAAoB,EACxB,OAAO,CAAC,UAAoC,QAAQ,KAAK,CAAC;AAE7D,SAAO,WAAW,SAAS,IAAI,aAAa;AAC9C;AAEA,SAAS,qBAAqB,OAA6C;AACzE,MAAI,CAAC,aAAa,KAAK,GAAG;AACxB,WAAO;AAAA,EACT;AAEA,QAAM;AAAA,IACJ,UAAU;AAAA,IACV,OAAO;AAAA,IACP,GAAG;AAAA,EACL,IAAI;AACJ,QAAMH,UAAS,MAAM,QAAQ,QAAQ,IAAI,WAAW,CAAC,GAClD,IAAI,CAAC,SAAS,oBAAoB,IAAI,CAAC,EACvC,OAAO,CAAC,SAAiC,QAAQ,IAAI,CAAC;AACzD,QAAM,WAAW,aAAa,WAAW,IACrC,kBAAkB,WAAW,IAC7B;AAEJ,MAAIA,OAAM,WAAW,KAAK,CAAC,YAAY,MAAM,CAAC,YAAY,MAAM;AAC9D,WAAO;AAAA,EACT;AAEA,SAAO;AAAA,IACL,GAAG;AAAA,IACH,GAAI,WAAW,EAAE,SAAS,IAAI,CAAC;AAAA,IAC/B,OAAAA;AAAA,EACF;AACF;AAEA,SAAS,oBAAoB,MAA2C;AACtE,MAAI,CAAC,aAAa,IAAI,KAAK,OAAO,KAAK,SAAS,YAAY,CAAC,KAAK,MAAM;AACtE,WAAO;AAAA,EACT;AAEA,QAAM;AAAA,IACJ,YAAY;AAAA,IACZ,OAAO;AAAA,IACP,QAAQ;AAAA,IACR,GAAG;AAAA,EACL,IAAI;AACJ,QAAM,aAAa,gBACf,aAAa,aAAa,IACxB,kBAAkB,aAAa,IAC/B,SACF;AACJ,QAAM,QAAQ,mBAAmB,QAAQ;AACzC,QAAM,SAAS,0BAA0B,SAAS;AAElD,SAAO;AAAA,IACL,GAAG;AAAA,IACH,GAAI,aACA,EAAE,WAAmD,IACrD,CAAC;AAAA,IACL,GAAI,QAAQ,EAAE,MAAM,IAAI,CAAC;AAAA,IACzB,GAAI,SAAS,EAAE,OAAO,IAAI,CAAC;AAAA,EAC7B;AACF;AAEA,SAAS,0BACP,QACmC;AACnC,MAAI,CAAC,MAAM,QAAQ,MAAM,GAAG;AAC1B,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,OAChB,IAAI,wBAAwB,EAC5B,OAAO,CAAC,UAAwC,QAAQ,KAAK,CAAC;AAEjE,SAAO,WAAW,SAAS,IAAI,aAAa;AAC9C;AAEA,SAAS,yBACP,OACiC;AACjC,MAAI,CAAC,aAAa,KAAK,KAAK,OAAO,MAAM,SAAS,YAAY,CAAC,MAAM,MAAM;AACzE,WAAO;AAAA,EACT;AAEA,QAAM,EAAE,YAAY,eAAe,GAAG,YAAY,IAChD;AACF,QAAM,aAAa,gBACf,aAAa,aAAa,IACxB,kBAAkB,aAAa,IAC/B,SACF;AAEJ,SAAO;AAAA,IACL,GAAG;AAAA,IACH,GAAI,aACA,EAAE,WAAmD,IACrD,CAAC;AAAA,EACP;AACF;AAGO,SAAS,mBACd,OACqC;AACrC,MAAI,UAAU,QAAW;AACvB,WAAO;AAAA,EACT;AAEA,MAAI,iBAAiB,OAAO;AAC1B,UAAMO,WAAU;AAAA,MACd;AAAA,IACF;AAEA,WAAO;AAAA,MACL,GAAIA,YAAW,CAAC;AAAA,MAChB,MAAM,MAAM;AAAA,MACZ,SAAS,MAAM;AAAA,IACjB;AAAA,EACF;AAEA,MACE,SACA,OAAO,UAAU,YACjB,CAAC,MAAM,QAAQ,KAAK,KACpB,OAAQ,MAAgC,YAAY,UACpD;AACA,UAAM,aAAa,kBAAkB,KAAgC;AACrE,UAAM,EAAE,SAAAC,UAAS,MAAAC,OAAM,GAAGF,SAAQ,IAAI,cAAc,CAAC;AAErD,WAAO;AAAA,MACL,GAAGA;AAAA,MACH,SAASC;AAAA,MACT,GAAI,OAAOC,UAAS,WAAW,EAAE,MAAAA,MAAK,IAAI,CAAC;AAAA,IAC7C;AAAA,EACF;AAEA,QAAM,aAAa,eAAe,KAAK;AACvC,QAAM,EAAE,SAAS,MAAM,GAAG,QAAQ,IAAI;AAEtC,SAAO;AAAA,IACL,GAAG;AAAA,IACH,SAAS,OAAO,YAAY,WAAW,UAAU,OAAO,OAAO;AAAA,IAC/D,GAAI,OAAO,SAAS,WAAW,EAAE,KAAK,IAAI,CAAC;AAAA,EAC7C;AACF;AAGO,SAAS,wBACd,YACsC;AACtC,SAAO,kBAAkB,UAAU;AACrC;AAGO,SAAS,2BACd,OACA,UAAiC,CAAC,GAClC;AACA,SAAO;AAAA,IACL,wBAAwB,OAAO,YAAY,QAAQ;AAAA,IACnD,wBAAwB,OAAO;AAAA,IAC/B,yBAAyB,OAAO;AAAA,IAChC,6BAA6B,OAAO;AAAA,IACpC,8BAA8B,OAAO;AAAA,IACrC,wCAAwC,OAAO;AAAA,EACjD;AACF;AAQO,SAAS,oBACd,OACA,UAAsC,CAAC,GACrB;AAClB,SAAO,MAAM,IAAI,CAAC,MAAM,UAAU;AAChC,UAAM,YAAY,KAAK,QAAQ,mBAAmB,KAAK,KAAK,IAAI;AAChE,UAAM,SAAS,QAAQ,eACnB,GAAG,QAAQ,YAAY,IAAI,QAAQ,CAAC,KACpC,KAAK;AAET,WAAO;AAAA,MACL,GAAI,SAAS,EAAE,IAAI,OAAO,IAAI,CAAC;AAAA,MAC/B,GAAI,QAAQ,UAAU,EAAE,SAAS,QAAQ,QAAQ,IAAI,CAAC;AAAA,MACtD,GAAI,QAAQ,WAAW,EAAE,UAAU,QAAQ,SAAS,IAAI,CAAC;AAAA,MACzD,MAAM,KAAK;AAAA,MACX,MAAM;AAAA,MACN,GAAI,KAAK,YAAY,EAAE,WAAW,KAAK,UAAU,IAAI,CAAC;AAAA,MACtD,GAAI,KAAK,aAAa,EAAE,YAAY,KAAK,WAAW,IAAI,CAAC;AAAA,MACzD,GAAI,KAAK,eAAe,SAAY,EAAE,YAAY,KAAK,WAAW,IAAI,CAAC;AAAA,MACvE,QAAQ,YAAY,UAAU;AAAA,MAC9B,GAAI,YAAY,EAAE,OAAO,UAAU,IAAI,CAAC;AAAA,MACxC,YAAY,wBAAwB;AAAA,QAClC,yBAAyB;AAAA,QACzB,oBAAoB,KAAK;AAAA,QACzB,oBAAoB;AAAA,QACpB,GAAI,KAAK,KAAK,EAAE,uBAAuB,KAAK,GAAG,IAAI,CAAC;AAAA,QACpD,GAAI,KAAK,cAAc,SACnB,EAAE,8BAA8B,KAAK,UAAU,IAC/C,CAAC;AAAA,QACL,GAAI,KAAK,WAAW,SAChB,EAAE,2BAA2B,KAAK,OAAO,IACzC,CAAC;AAAA,MACP,CAAC;AAAA,IACH;AAAA,EACF,CAAC;AACH;AAQO,SAAS,eACd,KACA,SAC6B;AAC7B,MAAI,MAAM,GAAG,EAAE,SAAS,GAAG;AACzB,WAAO;AAAA,EACT;AAEA,QAAM,UAAU,QAAQ,MAAM,uBAAuB;AACrD,QAAM,aAAa,GAAG,OAAO;AAC7B,QAAM,aAAa,QAAQ,WAAW,QAAQ,IAAI,QAAQ,UAAU,QAAQ;AAC5E,QAAM,YACJ,IAAI,OAAO,SAAS,IAAI,mBAAmB,IAAI,OAAO,CAAC,CAAC,IAAI;AAC9D,QAAM,UAA0B;AAAA,IAC9B,IAAI;AAAA,IACJ;AAAA,IACA,MAAM,QAAQ;AAAA,IACd,MAAM;AAAA,IACN,WAAW,QAAQ,UAAU,YAAY;AAAA,IACzC,YAAY,QAAQ,WAAW,YAAY;AAAA,IAC3C;AAAA,IACA,QAAQ,YAAY,UAAU;AAAA,IAC9B,GAAI,YAAY,EAAE,OAAO,UAAU,IAAI,CAAC;AAAA,IACxC,YAAY,wBAAwB;AAAA,MAClC,yBAAyB,QAAQ,iBAAiB;AAAA,MAClD,wBAAwB,QAAQ;AAAA,MAChC,GAAG,2BAA2B,IAAI,KAAK;AAAA,IACzC,CAAC;AAAA,EACH;AACA,QAAM,YAAY,oBAAoB,UAAU,IAAI,OAAO,GAAG;AAAA,IAC5D;AAAA,IACA,UAAU;AAAA,IACV,cAAc,GAAG,OAAO;AAAA,EAC1B,CAAC;AACD,QAAM,QAAyB;AAAA,IAC7B,IAAI;AAAA,IACJ,MAAM,QAAQ;AAAA,IACd,WAAW,QAAQ,UAAU,YAAY;AAAA,IACzC,YAAY,QAAQ,WAAW,YAAY;AAAA,IAC3C;AAAA,IACA,GAAI,QAAQ,SAAS,EAAE,UAAU,EAAE,QAAQ,QAAQ,OAAO,EAAE,IAAI,CAAC;AAAA,IACjE,OAAO,CAAC,SAAS,GAAG,SAAS;AAAA,EAC/B;AAEA,MAAI,SAAS,CAAC,KAAK;AACnB,SAAO;AACT;AAEA,IAAI,uBAAuB;AAE3B,SAAS,yBAAyB;AAChC,0BAAwB;AACxB,SAAO,SAAS,oBAAoB;AACtC;AAiBO,SAAS,wBACd,OACA,KACiB;AACjB,QAAM,YACJ,iBAAiB,QACb,QACA,IAAI,MAAM,OAAO,SAAS,eAAe,CAAC;AAChD,SAAO,OAAO,OAAO,WAAW;AAAA,IAC9B,gBAAgB;AAAA,EAClB,CAAC;AACH;AAgBO,SAAS,uBAAuB,OAAwC;AAC7E,MACE,SACA,OAAO,UAAU,YACjB,oBAAoB,SACpB,aAAc,MAAuC,cAAc,GACnE;AACA,WAAQ,MAAyC;AAAA,EACnD;AAEA,SAAO;AACT;AAGO,SAAS,aAAa,OAAqC;AAChE,MAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACvC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY;AAMlB,SACE,oBAAoB,UAAU,OAAO,KACrC,QAAQ,UAAU,KAAK,KACvB,OAAO,UAAU,UAAU,YAC3B,CAAC,MAAM,QAAQ,UAAU,KAAK,KAC9B,MAAM,QAAQ,UAAU,MAAM;AAElC;AAGO,SAAS,oBACd,OAC4B;AAC5B,SACE,QAAQ,KAAK,KACb,OAAO,UAAU,YACjB,UAAU,QACV,cAAc,SACd,MAAM,QAAS,MAAiC,QAAQ;AAE5D;AAGO,SAAS,wBACd,QACkC;AAClC,MACE,UACA,OAAO,WAAW,YAClB,MAAM,QAAS,OAAmC,MAAM,GACxD;AACA,WAAQ,OAAwD;AAAA,EAClE;AAEA,SAAO,CAAC;AACV;AAGO,SAAS,eAAe,OAA2C;AACxE,MAAI,iBAAiB,OAAO;AAC1B,WAAO;AAAA,MACL,MAAM,MAAM;AAAA,MACZ,SAAS,MAAM;AAAA,IACjB;AAAA,EACF;AAEA,SAAO;AAAA,IACL,MAAM;AAAA,IACN,SAAS,OAAO,KAAK;AAAA,EACvB;AACF;","names":["assistantMessages","failedSpans","latestAssistantMessageContent","messagesByRole","spans","spansByKind","systemMessages","toolCalls","toolMessages","userMessages","normalized","details","message","type"]}
1
+ {"version":3,"sources":["../src/harness.ts"],"sourcesContent":["import {\n assistantMessages,\n failedSpans,\n latestAssistantMessageContent,\n messagesByRole,\n messagesToTranscriptEvents,\n NormalizedSessionSchema,\n spans,\n spansByKind,\n systemMessages,\n TranscriptEventSchema,\n toolCalls,\n toolMessages,\n userMessages,\n} from \"@vitest-evals/core\";\nimport type {\n GenAiOperationName,\n HarnessRun,\n HarnessRunError,\n NormalizedError,\n JsonPrimitive,\n JsonValue,\n TranscriptMessageEvent,\n NormalizedSession,\n NormalizedSpan,\n NormalizedSpanAttributes,\n NormalizedSpanEvent,\n TranscriptEvent,\n NormalizedTrace,\n TimingSummary,\n ToolCall,\n TranscriptMessageInput,\n TranscriptMessageContentPart,\n TranscriptMessageTextPart,\n TranscriptMessageToolCallPart,\n TranscriptMessageToolCall,\n TranscriptMessageToolResultPart,\n UsageSummary,\n} from \"@vitest-evals/core\";\n\nexport {\n assistantMessages,\n failedSpans,\n latestAssistantMessageContent,\n messagesByRole,\n messagesToTranscriptEvents,\n spans,\n spansByKind,\n systemMessages,\n toolCalls,\n toolMessages,\n userMessages,\n} from \"@vitest-evals/core\";\nexport type {\n GenAiOperationName,\n GenAiOutputType,\n GenAiProviderName,\n GenAiSemanticAttributeKey,\n GenAiSemanticAttributes,\n GenAiTokenType,\n GenAiToolType,\n HarnessRun,\n HarnessRunError,\n NormalizedError,\n JsonPrimitive,\n JsonValue,\n TranscriptMessageEvent,\n NormalizedSession,\n NormalizedSpan,\n NormalizedSpanAttributeKey,\n NormalizedSpanAttributes,\n NormalizedSpanEvent,\n TranscriptToolCallEvent,\n TranscriptToolResultEvent,\n TranscriptEvent,\n NormalizedTrace,\n OpenTelemetrySemanticAttributeKey,\n OpenTelemetrySemanticAttributes,\n TimingSummary,\n ToolCall,\n TranscriptMessageInput,\n TranscriptMessageContentPart,\n TranscriptMessageTextPart,\n TranscriptMessageToolCallPart,\n TranscriptMessageToolCall,\n TranscriptMessageToolResultPart,\n UsageSummary,\n} from \"@vitest-evals/core\";\n\n/** Options for attaching a fallback run trace to a harness result. */\nexport type EnsureRunTraceOptions = {\n /** Human-readable run or harness name. */\n name: string;\n /** Wall-clock start time for the harness run. */\n startedAt: Date;\n /** Wall-clock finish time for the harness run. */\n finishedAt: Date;\n /** Optional trace id. A generated id is used when omitted. */\n id?: string;\n /** GenAI operation name to place on the root run span. */\n operationName?: GenAiOperationName;\n /** Optional JSON-safe source marker for the trace metadata. */\n source?: string;\n};\n\ntype OutputField<TOutput extends JsonValue | undefined> =\n undefined extends TOutput ? { output?: TOutput } : { output: TOutput };\n\n/** Generic JSON-like metadata record used by normalized artifacts and reports. */\nexport type HarnessMetadata = Record<string, unknown>;\n\n/**\n * Runtime context passed from the eval fixture into a harness run.\n *\n * @example\n * ```ts\n * const harness: Harness<string> = {\n * name: \"refund-agent\",\n * async run(input, context) {\n * context.setArtifact(\"inputLength\", input.length);\n *\n * return {\n * output: undefined,\n * session: { events: [{ type: \"message\", role: \"user\", content: input }] },\n * usage: {},\n * errors: [],\n * };\n * },\n * };\n * ```\n */\nexport type HarnessContext = {\n /** Abort signal from Vitest when available. */\n signal?: AbortSignal;\n /** Mutable JSON-safe artifact bag shared with the harness. */\n artifacts: Record<string, JsonValue>;\n /** Stores one JSON-safe artifact on the current run. */\n setArtifact: (name: string, value: JsonValue) => void;\n};\n\n/**\n * Adapter that executes the system under test and returns a normalized run.\n *\n * @example\n * ```ts\n * const harness: Harness<string, { status: \"approved\" | \"denied\" }> = {\n * name: \"refund-agent\",\n * async run(input, context) {\n * return normalizeHarnessRun(input, await runRefundFlow(input), context);\n * },\n * };\n * ```\n */\nexport type Harness<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = {\n /** Stable harness name used in reports. */\n name: string;\n /** Executes the system under test and returns a normalized run. */\n run: (input: TInput, context: HarnessContext) => Promise<HarnessRun<TOutput>>;\n};\n\n/** Value or promise accepted by lightweight harness callbacks. */\nexport type MaybePromise<T> = T | Promise<T>;\n\n/** Lightweight span event accepted by `createHarness(...)` results. */\nexport type SimpleSpanEvent = Omit<NormalizedSpanEvent, \"attributes\"> & {\n /** Raw event attributes accepted by `createHarness(...)` before normalization. */\n attributes?: Record<string, unknown>;\n};\n\n/** Lightweight span record accepted by `createHarness(...)` results. */\nexport type SimpleSpanRecord = Omit<\n NormalizedSpan,\n \"attributes\" | \"error\" | \"events\"\n> & {\n /** Raw span attributes accepted by `createHarness(...)` before normalization. */\n attributes?: Record<string, unknown>;\n /** Raw span error accepted by `createHarness(...)` before normalization. */\n error?: unknown;\n /** Raw span events accepted by `createHarness(...)` before normalization. */\n events?: SimpleSpanEvent[];\n};\n\n/** Lightweight trace record accepted by `createHarness(...)` results. */\nexport type SimpleTraceRecord = Omit<NormalizedTrace, \"metadata\" | \"spans\"> & {\n /** Raw trace metadata accepted by `createHarness(...)` before normalization. */\n metadata?: Record<string, unknown>;\n /** Lightweight spans to normalize into the trace. */\n spans: SimpleSpanRecord[];\n};\n\n/** Lightweight transcript input accepted by `createHarness(...)` results. */\nexport type SimpleTranscriptInput =\n | {\n /** Ordered normalized transcript events for the application run. */\n events: TranscriptEvent[];\n messages?: never;\n }\n | {\n /** Strict camelCase message transport normalized into transcript events. */\n messages: TranscriptMessageInput[];\n events?: never;\n };\n\n/**\n * Lightweight result shape normalized by `createHarness(...)`.\n *\n * @example\n * ```ts\n * const result: SimpleHarnessResult<{ status: \"approved\" }> = {\n * output: { status: \"approved\" },\n * events: [\n * { type: \"message\", role: \"user\", content: \"Refund invoice inv_123\" },\n * { type: \"message\", role: \"assistant\", content: { status: \"approved\" } },\n * ],\n * usage: { totalTokens: 260 },\n * };\n * ```\n */\nexport type SimpleHarnessResult<\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = OutputField<TOutput> &\n SimpleTranscriptInput & {\n /** Usage summary to attach to the run. */\n usage?: UsageSummary;\n /** Timing summary to attach to the run. */\n timings?: TimingSummary;\n /** Raw artifact values to normalize and merge into the run. */\n artifacts?: Record<string, unknown>;\n /** Lightweight traces and spans to normalize into the run. */\n traces?: SimpleTraceRecord[];\n /** Raw session metadata to normalize into the session. */\n metadata?: Record<string, unknown>;\n /** Raw errors to normalize into the run. */\n errors?: unknown[];\n };\n\n/** Either a complete normalized run or a lightweight result to normalize. */\nexport type HarnessResultLike<\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = HarnessRun<TOutput> | SimpleHarnessResult<TOutput>;\n\n/** Arguments passed to the `createHarness(...)` convenience callback. */\nexport type CreateHarnessRunArgs<TInput> = {\n /** Original input passed to `run(input)`. */\n input: TInput;\n /** Abort signal from Vitest when available. */\n signal?: AbortSignal;\n /** Mutable run artifact bag. */\n artifacts: HarnessContext[\"artifacts\"];\n /** Stores one JSON-safe artifact on the current run. */\n setArtifact: HarnessContext[\"setArtifact\"];\n};\n\n/**\n * Options for creating a lightweight custom application harness.\n *\n * Prefer this helper for custom harnesses. Implement `Harness` directly only\n * when the callback already returns a full `HarnessRun` with canonical\n * `session.events`.\n *\n * @example\n * ```ts\n * const options: CreateHarnessOptions<string, { status: \"approved\" }> = {\n * name: \"refund-agent\",\n * run: async ({ input }) => ({\n * output: await classifyRefund(input),\n * events: [{ type: \"message\", role: \"user\", content: input }],\n * }),\n * };\n * ```\n */\nexport type CreateHarnessOptions<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = {\n /** Stable harness name used in reports. */\n name: string;\n /** Executes application code and returns either a lightweight result or full `HarnessRun`. */\n run: (\n args: CreateHarnessRunArgs<TInput>,\n ) => MaybePromise<HarnessResultLike<TOutput>>;\n};\n\nfunction isJsonPrimitive(value: unknown): value is JsonPrimitive {\n return (\n value === null ||\n typeof value === \"string\" ||\n typeof value === \"boolean\" ||\n (typeof value === \"number\" && Number.isFinite(value))\n );\n}\n\nfunction isJsonRecord(value: unknown): value is Record<string, unknown> {\n return typeof value === \"object\" && value !== null && !Array.isArray(value);\n}\n\nfunction normalizeJsonArray(value: unknown[], seen: WeakSet<object>) {\n if (seen.has(value)) {\n return undefined;\n }\n\n seen.add(value);\n const normalized = value.map((item) => {\n const normalized = toJsonValueInternal(item, seen);\n return normalized === undefined ? null : normalized;\n });\n seen.delete(value);\n\n return normalized;\n}\n\nfunction normalizeJsonObject(\n value: Record<string, unknown>,\n seen: WeakSet<object>,\n): Record<string, JsonValue> {\n const normalized: Record<string, JsonValue> = {};\n\n if (seen.has(value)) {\n return normalized;\n }\n\n seen.add(value);\n try {\n for (const [key, entryValue] of Object.entries(value)) {\n const entry = toJsonValueInternal(entryValue, seen);\n if (entry !== undefined) {\n normalized[key] = entry;\n }\n }\n } finally {\n seen.delete(value);\n }\n\n return normalized;\n}\n\n/** Returns true when a value exposes a callable method with the given name. */\nexport function hasCallableMethod(value: unknown, methodName: string) {\n return (\n value !== null &&\n (typeof value === \"object\" || typeof value === \"function\") &&\n methodName in value &&\n typeof (value as Record<string, unknown>)[methodName] === \"function\"\n );\n}\n\n/** Normalizes an unknown value into the JSON-safe shape used by harness runs. */\nexport function toJsonValue(value: unknown): JsonValue | undefined {\n return toJsonValueInternal(value, new WeakSet());\n}\n\nfunction toJsonValueInternal(\n value: unknown,\n seen: WeakSet<object>,\n): JsonValue | undefined {\n if (isJsonPrimitive(value)) {\n return value;\n }\n\n if (\n value !== null &&\n typeof value === \"object\" &&\n seen.has(value as object)\n ) {\n return undefined;\n }\n\n if (Array.isArray(value)) {\n return normalizeJsonArray(value, seen);\n }\n\n if (isJsonRecord(value)) {\n return normalizeJsonObject(value, seen);\n }\n\n return undefined;\n}\n\n/** Drops non-JSON properties from a record while preserving valid values. */\nexport function normalizeRecord(\n value: Record<string, unknown>,\n): Record<string, JsonValue> {\n return normalizeJsonObject(value, new WeakSet());\n}\n\n/** Normalizes metadata and omits the field entirely when nothing survives. */\nexport function normalizeMetadata(\n value: Record<string, unknown>,\n): Record<string, JsonValue> | undefined {\n const normalized = normalizeRecord(value);\n return Object.keys(normalized).length > 0 ? normalized : undefined;\n}\n\n/** Converts arbitrary content into the JSON-safe message content shape. */\nexport function normalizeContent(value: unknown): JsonValue {\n const normalized = toJsonValue(value);\n return normalized !== undefined ? normalized : String(value);\n}\n\n/**\n * Creates a harness from the common \"run app code and return output\" shape.\n *\n * @param options - Harness name plus the callback that executes app code.\n *\n * @example\n * ```ts\n * import { createHarness } from \"vitest-evals\";\n *\n * export const refundHarness = createHarness<\n * string,\n * { status: \"approved\" | \"denied\" }\n * >({\n * name: \"refund-agent\",\n * run: async ({ input, setArtifact }) => {\n * const result = await runRefundFlow(input);\n * const output = { status: result.status };\n *\n * setArtifact(\"case\", { invoiceId: result.invoiceId });\n *\n * return {\n * output,\n * events: [\n * { type: \"message\", role: \"user\", content: input },\n * {\n * type: \"tool_call\",\n * id: \"call_lookup\",\n * name: \"lookupInvoice\",\n * arguments: { invoiceId: result.invoiceId },\n * },\n * {\n * type: \"tool_result\",\n * toolCallId: \"call_lookup\",\n * name: \"lookupInvoice\",\n * content: { refundable: result.refundable },\n * },\n * { type: \"message\", role: \"assistant\", content: output },\n * ],\n * usage: { provider: \"openai\", model: \"gpt-4o-mini\" },\n * };\n * },\n * });\n * ```\n */\nexport function createHarness<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n>(options: CreateHarnessOptions<TInput, TOutput>): Harness<TInput, TOutput>;\nexport function createHarness<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n>(options: CreateHarnessOptions<TInput, TOutput>): Harness<TInput, TOutput> {\n const harness: Harness<TInput, TOutput> = {\n name: options.name,\n run: async (input, context) => {\n const startedAt = new Date();\n\n try {\n const result = await options.run({\n input,\n signal: context.signal,\n artifacts: context.artifacts,\n setArtifact: context.setArtifact,\n });\n const run = normalizeHarnessRun(input, result, context);\n ensureRunTrace(run, {\n name: options.name,\n startedAt,\n finishedAt: new Date(),\n });\n\n return run;\n } catch (error) {\n const partialRun = getHarnessRunFromError(error);\n if (partialRun) {\n if (\n Object.keys(context.artifacts).length > 0 &&\n !partialRun.artifacts\n ) {\n partialRun.artifacts = context.artifacts;\n }\n ensureRunTrace(partialRun, {\n name: options.name,\n startedAt,\n finishedAt: new Date(),\n });\n throw attachHarnessRunToError(error, partialRun);\n }\n\n const failedRun = createFailedHarnessRun(input, error, {\n artifacts: context.artifacts,\n });\n ensureRunTrace(failedRun, {\n name: options.name,\n startedAt,\n finishedAt: new Date(),\n });\n\n throw attachHarnessRunToError(error, failedRun);\n }\n },\n };\n\n return harness;\n}\n\n/**\n * Normalizes a lightweight harness result into the reporter-facing run shape.\n *\n * @param input - Original input passed to the harness.\n * @param result - Lightweight result or pre-normalized harness run.\n * @param context - Optional per-run context used to merge artifacts.\n *\n * @example\n * ```ts\n * const run = normalizeHarnessRun(\"Refund invoice inv_123\", {\n * output: { status: \"approved\" },\n * events: [\n * { type: \"message\", role: \"user\", content: \"Refund invoice inv_123\" },\n * {\n * type: \"tool_call\",\n * id: \"call_lookup\",\n * name: \"lookupInvoice\",\n * arguments: { invoiceId: \"inv_123\" },\n * },\n * {\n * type: \"tool_result\",\n * toolCallId: \"call_lookup\",\n * name: \"lookupInvoice\",\n * content: { refundable: true },\n * },\n * ],\n * usage: { provider: \"openai\", model: \"gpt-4o-mini\" },\n * });\n *\n * expect(toolCalls(run)).toHaveLength(1);\n * ```\n */\nexport function normalizeHarnessRun<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n>(\n input: TInput,\n result: HarnessResultLike<TOutput>,\n context?: HarnessContext,\n): HarnessRun<TOutput> {\n if (isHarnessRun(result)) {\n if (\n context &&\n Object.keys(context.artifacts).length > 0 &&\n !result.artifacts\n ) {\n return {\n ...result,\n artifacts: context.artifacts,\n };\n }\n\n return result;\n }\n\n if (\"toolCalls\" in result) {\n throw new TypeError(\n 'createHarness results do not accept top-level toolCalls. Return ordered session events with type: \"tool_call\" and type: \"tool_result\" entries instead.',\n );\n }\n\n const output = result.output;\n const usage = result.usage ?? {};\n const events = normalizeTranscriptInput(result);\n if (!events) {\n throw new TypeError(\n \"createHarness results must include ordered events or messages. Return a full HarnessRun or a lightweight result with events/messages.\",\n );\n }\n if (events.length === 0) {\n throw new TypeError(\n \"createHarness results must include at least one transcript event. Return ordered events or message transport inputs that normalize into events.\",\n );\n }\n const metadata = result.metadata\n ? normalizeMetadata(result.metadata)\n : undefined;\n const artifacts = normalizeMergedArtifacts(\n context?.artifacts,\n result.artifacts,\n );\n const traces = normalizeSimpleTraces(result.traces);\n\n return {\n session: {\n events,\n ...(usage.provider ? { provider: usage.provider } : {}),\n ...(usage.model ? { model: usage.model } : {}),\n ...(metadata ? { metadata } : {}),\n },\n ...(output !== undefined ? { output } : {}),\n usage,\n ...(result.timings ? { timings: result.timings } : {}),\n ...(artifacts ? { artifacts } : {}),\n ...(traces ? { traces } : {}),\n errors: normalizeSimpleErrors(result.errors),\n } as HarnessRun<TOutput>;\n}\n\nfunction normalizeTranscriptInput(\n result: HarnessResultLike,\n): TranscriptEvent[] | undefined {\n if (\"events\" in result && Array.isArray(result.events)) {\n return result.events.map((event) => TranscriptEventSchema.parse(event));\n }\n\n if (\"messages\" in result && Array.isArray(result.messages)) {\n return messagesToTranscriptEvents(result.messages).map((event) =>\n TranscriptEventSchema.parse(event),\n );\n }\n\n return undefined;\n}\n\n/**\n * Builds a JSON-safe failed run for errors that happen before a harness can return.\n *\n * @param input - Original input passed to the harness.\n * @param error - Error thrown by setup or execution.\n * @param options - Optional artifacts to preserve on the failed run.\n */\nexport function createFailedHarnessRun(\n input: unknown,\n error: unknown,\n options: { artifacts?: Record<string, JsonValue> } = {},\n): HarnessRun {\n const artifacts = options.artifacts;\n\n return {\n session: {\n events: [\n {\n type: \"message\",\n role: \"user\",\n content: normalizeContent(input),\n },\n ],\n },\n usage: {},\n ...(artifacts && Object.keys(artifacts).length > 0 ? { artifacts } : {}),\n errors: [serializeError(error)],\n };\n}\n\nfunction normalizeMergedArtifacts(\n contextArtifacts: Record<string, JsonValue> | undefined,\n resultArtifacts: Record<string, unknown> | undefined,\n) {\n const artifacts = {\n ...(contextArtifacts ?? {}),\n ...(resultArtifacts ? normalizeRecord(resultArtifacts) : {}),\n };\n\n return Object.keys(artifacts).length > 0 ? artifacts : undefined;\n}\n\nfunction normalizeSimpleErrors(\n errors: unknown[] | undefined,\n): Array<Record<string, JsonValue>> {\n return (errors ?? []).map((error) => {\n const normalized = toJsonValue(error);\n\n if (\n normalized &&\n typeof normalized === \"object\" &&\n !Array.isArray(normalized) &&\n Object.keys(normalized).length > 0\n ) {\n return normalized;\n }\n\n return serializeError(error);\n });\n}\n\nfunction normalizeSimpleTraces(\n traces: SimpleTraceRecord[] | undefined,\n): NormalizedTrace[] | undefined {\n if (!Array.isArray(traces)) {\n return undefined;\n }\n\n const normalized = traces\n .map(normalizeSimpleTrace)\n .filter((trace): trace is NormalizedTrace => Boolean(trace));\n\n return normalized.length > 0 ? normalized : undefined;\n}\n\nfunction normalizeSimpleTrace(trace: unknown): NormalizedTrace | undefined {\n if (!isJsonRecord(trace)) {\n return undefined;\n }\n\n const {\n metadata: rawMetadata,\n spans: rawSpans,\n ...traceFields\n } = trace as Partial<SimpleTraceRecord>;\n const spans = (Array.isArray(rawSpans) ? rawSpans : [])\n .map((span) => normalizeSimpleSpan(span))\n .filter((span): span is NormalizedSpan => Boolean(span));\n const metadata = isJsonRecord(rawMetadata)\n ? normalizeMetadata(rawMetadata)\n : undefined;\n\n if (spans.length === 0 && !traceFields.id && !traceFields.name) {\n return undefined;\n }\n\n return {\n ...traceFields,\n ...(metadata ? { metadata } : {}),\n spans,\n };\n}\n\nfunction normalizeSimpleSpan(span: unknown): NormalizedSpan | undefined {\n if (!isJsonRecord(span) || typeof span.name !== \"string\" || !span.name) {\n return undefined;\n }\n\n const {\n attributes: rawAttributes,\n error: rawError,\n events: rawEvents,\n ...spanFields\n } = span as Partial<SimpleSpanRecord> & { name: string };\n const attributes = rawAttributes\n ? isJsonRecord(rawAttributes)\n ? normalizeMetadata(rawAttributes)\n : undefined\n : undefined;\n const error = normalizeSpanError(rawError);\n const events = normalizeSimpleSpanEvents(rawEvents);\n\n return {\n ...spanFields,\n ...(attributes\n ? { attributes: attributes as NormalizedSpanAttributes }\n : {}),\n ...(error ? { error } : {}),\n ...(events ? { events } : {}),\n };\n}\n\nfunction normalizeSimpleSpanEvents(\n events: unknown,\n): NormalizedSpanEvent[] | undefined {\n if (!Array.isArray(events)) {\n return undefined;\n }\n\n const normalized = events\n .map(normalizeSimpleSpanEvent)\n .filter((event): event is NormalizedSpanEvent => Boolean(event));\n\n return normalized.length > 0 ? normalized : undefined;\n}\n\nfunction normalizeSimpleSpanEvent(\n event: unknown,\n): NormalizedSpanEvent | undefined {\n if (!isJsonRecord(event) || typeof event.name !== \"string\" || !event.name) {\n return undefined;\n }\n\n const { attributes: rawAttributes, ...eventFields } =\n event as Partial<SimpleSpanEvent> & { name: string };\n const attributes = rawAttributes\n ? isJsonRecord(rawAttributes)\n ? normalizeMetadata(rawAttributes)\n : undefined\n : undefined;\n\n return {\n ...eventFields,\n ...(attributes\n ? { attributes: attributes as NormalizedSpanAttributes }\n : {}),\n };\n}\n\n/** Normalizes arbitrary span errors while preserving object-shaped messages. */\nexport function normalizeSpanError(\n error: unknown,\n): NormalizedSpan[\"error\"] | undefined {\n if (error === undefined) {\n return undefined;\n }\n\n if (error instanceof Error) {\n const details = normalizeMetadata(\n error as unknown as Record<string, unknown>,\n );\n\n return {\n ...(details ?? {}),\n type: error.name,\n message: error.message,\n };\n }\n\n if (\n error &&\n typeof error === \"object\" &&\n !Array.isArray(error) &&\n typeof (error as { message?: unknown }).message === \"string\"\n ) {\n const normalized = normalizeMetadata(error as Record<string, unknown>);\n const { message, type, ...details } = normalized ?? {};\n\n return {\n ...details,\n message: message as string,\n ...(typeof type === \"string\" ? { type } : {}),\n };\n }\n\n const serialized = serializeError(error);\n const { message, type, ...details } = serialized;\n\n return {\n ...details,\n message: typeof message === \"string\" ? message : String(message),\n ...(typeof type === \"string\" ? { type } : {}),\n };\n}\n\n/** Normalizes raw span attributes into the JSON-safe span attribute shape. */\nexport function normalizeSpanAttributes(\n attributes: Record<string, unknown>,\n): NormalizedSpanAttributes | undefined {\n return normalizeMetadata(attributes) as NormalizedSpanAttributes | undefined;\n}\n\n/** Builds common OpenTelemetry GenAI usage attributes from a usage summary. */\nexport function createGenAiUsageAttributes(\n usage: UsageSummary | undefined,\n options: { provider?: string } = {},\n) {\n return {\n \"gen_ai.provider.name\": usage?.provider ?? options.provider,\n \"gen_ai.request.model\": usage?.model,\n \"gen_ai.response.model\": usage?.model,\n \"gen_ai.usage.input_tokens\": usage?.inputTokens,\n \"gen_ai.usage.output_tokens\": usage?.outputTokens,\n \"gen_ai.usage.reasoning.output_tokens\": usage?.reasoningTokens,\n } satisfies Record<string, unknown>;\n}\n\n/**\n * Attaches a fallback run trace when a harness result does not already contain spans.\n *\n * This keeps custom harnesses inspectable while first-party harness packages\n * remain free to attach richer native traces.\n */\nexport function ensureRunTrace(\n run: HarnessRun,\n options: EnsureRunTraceOptions,\n): NormalizedTrace | undefined {\n if (spans(run).length > 0) {\n return undefined;\n }\n\n const traceId = options.id ?? createGeneratedTraceId();\n const rootSpanId = `${traceId}:run`;\n const durationMs = options.finishedAt.getTime() - options.startedAt.getTime();\n const rootError =\n run.errors.length > 0 ? normalizeSpanError(run.errors[0]) : undefined;\n const runSpan: NormalizedSpan = {\n id: rootSpanId,\n traceId,\n name: options.name,\n kind: \"run\",\n startedAt: options.startedAt.toISOString(),\n finishedAt: options.finishedAt.toISOString(),\n durationMs,\n status: rootError ? \"error\" : \"ok\",\n ...(rootError ? { error: rootError } : {}),\n attributes: normalizeSpanAttributes({\n \"gen_ai.operation.name\": options.operationName ?? \"invoke_workflow\",\n \"gen_ai.workflow.name\": options.name,\n ...createGenAiUsageAttributes(run.usage),\n }),\n };\n const trace: NormalizedTrace = {\n id: traceId,\n name: options.name,\n startedAt: options.startedAt.toISOString(),\n finishedAt: options.finishedAt.toISOString(),\n durationMs,\n ...(options.source ? { metadata: { source: options.source } } : {}),\n spans: [runSpan],\n };\n\n run.traces = [trace];\n return trace;\n}\n\nlet nextGeneratedTraceId = 0;\n\nfunction createGeneratedTraceId() {\n nextGeneratedTraceId += 1;\n return `trace_${nextGeneratedTraceId}`;\n}\n\n/**\n * Attaches a partial or complete harness run to an arbitrary thrown error.\n *\n * @param error - Thrown value to wrap.\n * @param run - Partial or complete normalized harness run to preserve.\n *\n * @example\n * ```ts\n * try {\n * return await runAgent(input);\n * } catch (error) {\n * throw attachHarnessRunToError(error, partialRun);\n * }\n * ```\n */\nexport function attachHarnessRunToError(\n error: unknown,\n run: HarnessRun,\n): HarnessRunError {\n const baseError =\n error instanceof Error\n ? error\n : new Error(String(error ?? \"Unknown error\"));\n return Object.assign(baseError, {\n vitestEvalsRun: run,\n });\n}\n\n/**\n * Reads an attached harness run back off a previously wrapped error value.\n *\n * @param error - Unknown thrown value that may contain a harness run.\n *\n * @example\n * ```ts\n * const partialRun = getHarnessRunFromError(error);\n *\n * if (partialRun) {\n * console.log(toolCalls(partialRun.session));\n * }\n * ```\n */\nexport function getHarnessRunFromError(error: unknown): HarnessRun | undefined {\n if (\n error &&\n typeof error === \"object\" &&\n \"vitestEvalsRun\" in error &&\n isHarnessRun((error as { vitestEvalsRun?: unknown }).vitestEvalsRun)\n ) {\n return (error as { vitestEvalsRun: HarnessRun }).vitestEvalsRun;\n }\n\n return undefined;\n}\n\n/** Returns true when a value matches the normalized `HarnessRun` contract. */\nexport function isHarnessRun(value: unknown): value is HarnessRun {\n if (!value || typeof value !== \"object\") {\n return false;\n }\n\n const candidate = value as {\n session?: unknown;\n usage?: unknown;\n errors?: unknown;\n };\n\n return (\n isNormalizedSession(candidate.session) &&\n Boolean(candidate.usage) &&\n typeof candidate.usage === \"object\" &&\n !Array.isArray(candidate.usage) &&\n Array.isArray(candidate.errors)\n );\n}\n\n/** Returns true when a value matches the normalized session contract. */\nexport function isNormalizedSession(\n value: unknown,\n): value is NormalizedSession {\n return NormalizedSessionSchema.safeParse(value).success;\n}\n\n/** Reuses pre-normalized harness errors when a runtime already returns them. */\nexport function resolveHarnessRunErrors(\n result: unknown,\n): Array<Record<string, JsonValue>> {\n if (\n result &&\n typeof result === \"object\" &&\n Array.isArray((result as Record<string, unknown>).errors)\n ) {\n return (result as { errors: Array<Record<string, JsonValue>> }).errors;\n }\n\n return [];\n}\n\n/** Serializes an arbitrary thrown value into the normalized error shape. */\nexport function serializeError(error: unknown): Record<string, JsonValue> {\n if (error instanceof Error) {\n return {\n type: error.name,\n message: error.message,\n };\n }\n\n return {\n type: \"Error\",\n message: String(error),\n };\n}\n"],"mappings":";AAAA;AAAA,EAKE;AAAA,EACA;AAAA,EACA;AAAA,EAGA;AAAA,OAIK;AA0BP;AAAA,EACE,qBAAAA;AAAA,EACA,eAAAC;AAAA,EACA,iCAAAC;AAAA,EACA,kBAAAC;AAAA,EACA,8BAAAC;AAAA,EACA,SAAAC;AAAA,EACA,eAAAC;AAAA,EACA,kBAAAC;AAAA,EACA,aAAAC;AAAA,EACA,gBAAAC;AAAA,EACA,gBAAAC;AAAA,OACK;AA0OP,SAAS,gBAAgB,OAAwC;AAC/D,SACE,UAAU,QACV,OAAO,UAAU,YACjB,OAAO,UAAU,aAChB,OAAO,UAAU,YAAY,OAAO,SAAS,KAAK;AAEvD;AAEA,SAAS,aAAa,OAAkD;AACtE,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,CAAC,MAAM,QAAQ,KAAK;AAC5E;AAEA,SAAS,mBAAmB,OAAkB,MAAuB;AACnE,MAAI,KAAK,IAAI,KAAK,GAAG;AACnB,WAAO;AAAA,EACT;AAEA,OAAK,IAAI,KAAK;AACd,QAAM,aAAa,MAAM,IAAI,CAAC,SAAS;AACrC,UAAMC,cAAa,oBAAoB,MAAM,IAAI;AACjD,WAAOA,gBAAe,SAAY,OAAOA;AAAA,EAC3C,CAAC;AACD,OAAK,OAAO,KAAK;AAEjB,SAAO;AACT;AAEA,SAAS,oBACP,OACA,MAC2B;AAC3B,QAAM,aAAwC,CAAC;AAE/C,MAAI,KAAK,IAAI,KAAK,GAAG;AACnB,WAAO;AAAA,EACT;AAEA,OAAK,IAAI,KAAK;AACd,MAAI;AACF,eAAW,CAAC,KAAK,UAAU,KAAK,OAAO,QAAQ,KAAK,GAAG;AACrD,YAAM,QAAQ,oBAAoB,YAAY,IAAI;AAClD,UAAI,UAAU,QAAW;AACvB,mBAAW,GAAG,IAAI;AAAA,MACpB;AAAA,IACF;AAAA,EACF,UAAE;AACA,SAAK,OAAO,KAAK;AAAA,EACnB;AAEA,SAAO;AACT;AAGO,SAAS,kBAAkB,OAAgB,YAAoB;AACpE,SACE,UAAU,SACT,OAAO,UAAU,YAAY,OAAO,UAAU,eAC/C,cAAc,SACd,OAAQ,MAAkC,UAAU,MAAM;AAE9D;AAGO,SAAS,YAAY,OAAuC;AACjE,SAAO,oBAAoB,OAAO,oBAAI,QAAQ,CAAC;AACjD;AAEA,SAAS,oBACP,OACA,MACuB;AACvB,MAAI,gBAAgB,KAAK,GAAG;AAC1B,WAAO;AAAA,EACT;AAEA,MACE,UAAU,QACV,OAAO,UAAU,YACjB,KAAK,IAAI,KAAe,GACxB;AACA,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,QAAQ,KAAK,GAAG;AACxB,WAAO,mBAAmB,OAAO,IAAI;AAAA,EACvC;AAEA,MAAI,aAAa,KAAK,GAAG;AACvB,WAAO,oBAAoB,OAAO,IAAI;AAAA,EACxC;AAEA,SAAO;AACT;AAGO,SAAS,gBACd,OAC2B;AAC3B,SAAO,oBAAoB,OAAO,oBAAI,QAAQ,CAAC;AACjD;AAGO,SAAS,kBACd,OACuC;AACvC,QAAM,aAAa,gBAAgB,KAAK;AACxC,SAAO,OAAO,KAAK,UAAU,EAAE,SAAS,IAAI,aAAa;AAC3D;AAGO,SAAS,iBAAiB,OAA2B;AAC1D,QAAM,aAAa,YAAY,KAAK;AACpC,SAAO,eAAe,SAAY,aAAa,OAAO,KAAK;AAC7D;AAkDO,SAAS,cAGd,SAA0E;AAC1E,QAAM,UAAoC;AAAA,IACxC,MAAM,QAAQ;AAAA,IACd,KAAK,OAAO,OAAO,YAAY;AAC7B,YAAM,YAAY,oBAAI,KAAK;AAE3B,UAAI;AACF,cAAM,SAAS,MAAM,QAAQ,IAAI;AAAA,UAC/B;AAAA,UACA,QAAQ,QAAQ;AAAA,UAChB,WAAW,QAAQ;AAAA,UACnB,aAAa,QAAQ;AAAA,QACvB,CAAC;AACD,cAAM,MAAM,oBAAoB,OAAO,QAAQ,OAAO;AACtD,uBAAe,KAAK;AAAA,UAClB,MAAM,QAAQ;AAAA,UACd;AAAA,UACA,YAAY,oBAAI,KAAK;AAAA,QACvB,CAAC;AAED,eAAO;AAAA,MACT,SAAS,OAAO;AACd,cAAM,aAAa,uBAAuB,KAAK;AAC/C,YAAI,YAAY;AACd,cACE,OAAO,KAAK,QAAQ,SAAS,EAAE,SAAS,KACxC,CAAC,WAAW,WACZ;AACA,uBAAW,YAAY,QAAQ;AAAA,UACjC;AACA,yBAAe,YAAY;AAAA,YACzB,MAAM,QAAQ;AAAA,YACd;AAAA,YACA,YAAY,oBAAI,KAAK;AAAA,UACvB,CAAC;AACD,gBAAM,wBAAwB,OAAO,UAAU;AAAA,QACjD;AAEA,cAAM,YAAY,uBAAuB,OAAO,OAAO;AAAA,UACrD,WAAW,QAAQ;AAAA,QACrB,CAAC;AACD,uBAAe,WAAW;AAAA,UACxB,MAAM,QAAQ;AAAA,UACd;AAAA,UACA,YAAY,oBAAI,KAAK;AAAA,QACvB,CAAC;AAED,cAAM,wBAAwB,OAAO,SAAS;AAAA,MAChD;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AACT;AAkCO,SAAS,oBAId,OACA,QACA,SACqB;AACrB,MAAI,aAAa,MAAM,GAAG;AACxB,QACE,WACA,OAAO,KAAK,QAAQ,SAAS,EAAE,SAAS,KACxC,CAAC,OAAO,WACR;AACA,aAAO;AAAA,QACL,GAAG;AAAA,QACH,WAAW,QAAQ;AAAA,MACrB;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AAEA,MAAI,eAAe,QAAQ;AACzB,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,QAAM,SAAS,OAAO;AACtB,QAAM,QAAQ,OAAO,SAAS,CAAC;AAC/B,QAAM,SAAS,yBAAyB,MAAM;AAC9C,MAAI,CAAC,QAAQ;AACX,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,MAAI,OAAO,WAAW,GAAG;AACvB,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,QAAM,WAAW,OAAO,WACpB,kBAAkB,OAAO,QAAQ,IACjC;AACJ,QAAM,YAAY;AAAA,IAChB,SAAS;AAAA,IACT,OAAO;AAAA,EACT;AACA,QAAM,SAAS,sBAAsB,OAAO,MAAM;AAElD,SAAO;AAAA,IACL,SAAS;AAAA,MACP;AAAA,MACA,GAAI,MAAM,WAAW,EAAE,UAAU,MAAM,SAAS,IAAI,CAAC;AAAA,MACrD,GAAI,MAAM,QAAQ,EAAE,OAAO,MAAM,MAAM,IAAI,CAAC;AAAA,MAC5C,GAAI,WAAW,EAAE,SAAS,IAAI,CAAC;AAAA,IACjC;AAAA,IACA,GAAI,WAAW,SAAY,EAAE,OAAO,IAAI,CAAC;AAAA,IACzC;AAAA,IACA,GAAI,OAAO,UAAU,EAAE,SAAS,OAAO,QAAQ,IAAI,CAAC;AAAA,IACpD,GAAI,YAAY,EAAE,UAAU,IAAI,CAAC;AAAA,IACjC,GAAI,SAAS,EAAE,OAAO,IAAI,CAAC;AAAA,IAC3B,QAAQ,sBAAsB,OAAO,MAAM;AAAA,EAC7C;AACF;AAEA,SAAS,yBACP,QAC+B;AAC/B,MAAI,YAAY,UAAU,MAAM,QAAQ,OAAO,MAAM,GAAG;AACtD,WAAO,OAAO,OAAO,IAAI,CAAC,UAAU,sBAAsB,MAAM,KAAK,CAAC;AAAA,EACxE;AAEA,MAAI,cAAc,UAAU,MAAM,QAAQ,OAAO,QAAQ,GAAG;AAC1D,WAAO,2BAA2B,OAAO,QAAQ,EAAE;AAAA,MAAI,CAAC,UACtD,sBAAsB,MAAM,KAAK;AAAA,IACnC;AAAA,EACF;AAEA,SAAO;AACT;AASO,SAAS,uBACd,OACA,OACA,UAAqD,CAAC,GAC1C;AACZ,QAAM,YAAY,QAAQ;AAE1B,SAAO;AAAA,IACL,SAAS;AAAA,MACP,QAAQ;AAAA,QACN;AAAA,UACE,MAAM;AAAA,UACN,MAAM;AAAA,UACN,SAAS,iBAAiB,KAAK;AAAA,QACjC;AAAA,MACF;AAAA,IACF;AAAA,IACA,OAAO,CAAC;AAAA,IACR,GAAI,aAAa,OAAO,KAAK,SAAS,EAAE,SAAS,IAAI,EAAE,UAAU,IAAI,CAAC;AAAA,IACtE,QAAQ,CAAC,eAAe,KAAK,CAAC;AAAA,EAChC;AACF;AAEA,SAAS,yBACP,kBACA,iBACA;AACA,QAAM,YAAY;AAAA,IAChB,GAAI,oBAAoB,CAAC;AAAA,IACzB,GAAI,kBAAkB,gBAAgB,eAAe,IAAI,CAAC;AAAA,EAC5D;AAEA,SAAO,OAAO,KAAK,SAAS,EAAE,SAAS,IAAI,YAAY;AACzD;AAEA,SAAS,sBACP,QACkC;AAClC,UAAQ,UAAU,CAAC,GAAG,IAAI,CAAC,UAAU;AACnC,UAAM,aAAa,YAAY,KAAK;AAEpC,QACE,cACA,OAAO,eAAe,YACtB,CAAC,MAAM,QAAQ,UAAU,KACzB,OAAO,KAAK,UAAU,EAAE,SAAS,GACjC;AACA,aAAO;AAAA,IACT;AAEA,WAAO,eAAe,KAAK;AAAA,EAC7B,CAAC;AACH;AAEA,SAAS,sBACP,QAC+B;AAC/B,MAAI,CAAC,MAAM,QAAQ,MAAM,GAAG;AAC1B,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,OAChB,IAAI,oBAAoB,EACxB,OAAO,CAAC,UAAoC,QAAQ,KAAK,CAAC;AAE7D,SAAO,WAAW,SAAS,IAAI,aAAa;AAC9C;AAEA,SAAS,qBAAqB,OAA6C;AACzE,MAAI,CAAC,aAAa,KAAK,GAAG;AACxB,WAAO;AAAA,EACT;AAEA,QAAM;AAAA,IACJ,UAAU;AAAA,IACV,OAAO;AAAA,IACP,GAAG;AAAA,EACL,IAAI;AACJ,QAAMN,UAAS,MAAM,QAAQ,QAAQ,IAAI,WAAW,CAAC,GAClD,IAAI,CAAC,SAAS,oBAAoB,IAAI,CAAC,EACvC,OAAO,CAAC,SAAiC,QAAQ,IAAI,CAAC;AACzD,QAAM,WAAW,aAAa,WAAW,IACrC,kBAAkB,WAAW,IAC7B;AAEJ,MAAIA,OAAM,WAAW,KAAK,CAAC,YAAY,MAAM,CAAC,YAAY,MAAM;AAC9D,WAAO;AAAA,EACT;AAEA,SAAO;AAAA,IACL,GAAG;AAAA,IACH,GAAI,WAAW,EAAE,SAAS,IAAI,CAAC;AAAA,IAC/B,OAAAA;AAAA,EACF;AACF;AAEA,SAAS,oBAAoB,MAA2C;AACtE,MAAI,CAAC,aAAa,IAAI,KAAK,OAAO,KAAK,SAAS,YAAY,CAAC,KAAK,MAAM;AACtE,WAAO;AAAA,EACT;AAEA,QAAM;AAAA,IACJ,YAAY;AAAA,IACZ,OAAO;AAAA,IACP,QAAQ;AAAA,IACR,GAAG;AAAA,EACL,IAAI;AACJ,QAAM,aAAa,gBACf,aAAa,aAAa,IACxB,kBAAkB,aAAa,IAC/B,SACF;AACJ,QAAM,QAAQ,mBAAmB,QAAQ;AACzC,QAAM,SAAS,0BAA0B,SAAS;AAElD,SAAO;AAAA,IACL,GAAG;AAAA,IACH,GAAI,aACA,EAAE,WAAmD,IACrD,CAAC;AAAA,IACL,GAAI,QAAQ,EAAE,MAAM,IAAI,CAAC;AAAA,IACzB,GAAI,SAAS,EAAE,OAAO,IAAI,CAAC;AAAA,EAC7B;AACF;AAEA,SAAS,0BACP,QACmC;AACnC,MAAI,CAAC,MAAM,QAAQ,MAAM,GAAG;AAC1B,WAAO;AAAA,EACT;AAEA,QAAM,aAAa,OAChB,IAAI,wBAAwB,EAC5B,OAAO,CAAC,UAAwC,QAAQ,KAAK,CAAC;AAEjE,SAAO,WAAW,SAAS,IAAI,aAAa;AAC9C;AAEA,SAAS,yBACP,OACiC;AACjC,MAAI,CAAC,aAAa,KAAK,KAAK,OAAO,MAAM,SAAS,YAAY,CAAC,MAAM,MAAM;AACzE,WAAO;AAAA,EACT;AAEA,QAAM,EAAE,YAAY,eAAe,GAAG,YAAY,IAChD;AACF,QAAM,aAAa,gBACf,aAAa,aAAa,IACxB,kBAAkB,aAAa,IAC/B,SACF;AAEJ,SAAO;AAAA,IACL,GAAG;AAAA,IACH,GAAI,aACA,EAAE,WAAmD,IACrD,CAAC;AAAA,EACP;AACF;AAGO,SAAS,mBACd,OACqC;AACrC,MAAI,UAAU,QAAW;AACvB,WAAO;AAAA,EACT;AAEA,MAAI,iBAAiB,OAAO;AAC1B,UAAMO,WAAU;AAAA,MACd;AAAA,IACF;AAEA,WAAO;AAAA,MACL,GAAIA,YAAW,CAAC;AAAA,MAChB,MAAM,MAAM;AAAA,MACZ,SAAS,MAAM;AAAA,IACjB;AAAA,EACF;AAEA,MACE,SACA,OAAO,UAAU,YACjB,CAAC,MAAM,QAAQ,KAAK,KACpB,OAAQ,MAAgC,YAAY,UACpD;AACA,UAAM,aAAa,kBAAkB,KAAgC;AACrE,UAAM,EAAE,SAAAC,UAAS,MAAAC,OAAM,GAAGF,SAAQ,IAAI,cAAc,CAAC;AAErD,WAAO;AAAA,MACL,GAAGA;AAAA,MACH,SAASC;AAAA,MACT,GAAI,OAAOC,UAAS,WAAW,EAAE,MAAAA,MAAK,IAAI,CAAC;AAAA,IAC7C;AAAA,EACF;AAEA,QAAM,aAAa,eAAe,KAAK;AACvC,QAAM,EAAE,SAAS,MAAM,GAAG,QAAQ,IAAI;AAEtC,SAAO;AAAA,IACL,GAAG;AAAA,IACH,SAAS,OAAO,YAAY,WAAW,UAAU,OAAO,OAAO;AAAA,IAC/D,GAAI,OAAO,SAAS,WAAW,EAAE,KAAK,IAAI,CAAC;AAAA,EAC7C;AACF;AAGO,SAAS,wBACd,YACsC;AACtC,SAAO,kBAAkB,UAAU;AACrC;AAGO,SAAS,2BACd,OACA,UAAiC,CAAC,GAClC;AACA,SAAO;AAAA,IACL,wBAAwB,OAAO,YAAY,QAAQ;AAAA,IACnD,wBAAwB,OAAO;AAAA,IAC/B,yBAAyB,OAAO;AAAA,IAChC,6BAA6B,OAAO;AAAA,IACpC,8BAA8B,OAAO;AAAA,IACrC,wCAAwC,OAAO;AAAA,EACjD;AACF;AAQO,SAAS,eACd,KACA,SAC6B;AAC7B,MAAI,MAAM,GAAG,EAAE,SAAS,GAAG;AACzB,WAAO;AAAA,EACT;AAEA,QAAM,UAAU,QAAQ,MAAM,uBAAuB;AACrD,QAAM,aAAa,GAAG,OAAO;AAC7B,QAAM,aAAa,QAAQ,WAAW,QAAQ,IAAI,QAAQ,UAAU,QAAQ;AAC5E,QAAM,YACJ,IAAI,OAAO,SAAS,IAAI,mBAAmB,IAAI,OAAO,CAAC,CAAC,IAAI;AAC9D,QAAM,UAA0B;AAAA,IAC9B,IAAI;AAAA,IACJ;AAAA,IACA,MAAM,QAAQ;AAAA,IACd,MAAM;AAAA,IACN,WAAW,QAAQ,UAAU,YAAY;AAAA,IACzC,YAAY,QAAQ,WAAW,YAAY;AAAA,IAC3C;AAAA,IACA,QAAQ,YAAY,UAAU;AAAA,IAC9B,GAAI,YAAY,EAAE,OAAO,UAAU,IAAI,CAAC;AAAA,IACxC,YAAY,wBAAwB;AAAA,MAClC,yBAAyB,QAAQ,iBAAiB;AAAA,MAClD,wBAAwB,QAAQ;AAAA,MAChC,GAAG,2BAA2B,IAAI,KAAK;AAAA,IACzC,CAAC;AAAA,EACH;AACA,QAAM,QAAyB;AAAA,IAC7B,IAAI;AAAA,IACJ,MAAM,QAAQ;AAAA,IACd,WAAW,QAAQ,UAAU,YAAY;AAAA,IACzC,YAAY,QAAQ,WAAW,YAAY;AAAA,IAC3C;AAAA,IACA,GAAI,QAAQ,SAAS,EAAE,UAAU,EAAE,QAAQ,QAAQ,OAAO,EAAE,IAAI,CAAC;AAAA,IACjE,OAAO,CAAC,OAAO;AAAA,EACjB;AAEA,MAAI,SAAS,CAAC,KAAK;AACnB,SAAO;AACT;AAEA,IAAI,uBAAuB;AAE3B,SAAS,yBAAyB;AAChC,0BAAwB;AACxB,SAAO,SAAS,oBAAoB;AACtC;AAiBO,SAAS,wBACd,OACA,KACiB;AACjB,QAAM,YACJ,iBAAiB,QACb,QACA,IAAI,MAAM,OAAO,SAAS,eAAe,CAAC;AAChD,SAAO,OAAO,OAAO,WAAW;AAAA,IAC9B,gBAAgB;AAAA,EAClB,CAAC;AACH;AAgBO,SAAS,uBAAuB,OAAwC;AAC7E,MACE,SACA,OAAO,UAAU,YACjB,oBAAoB,SACpB,aAAc,MAAuC,cAAc,GACnE;AACA,WAAQ,MAAyC;AAAA,EACnD;AAEA,SAAO;AACT;AAGO,SAAS,aAAa,OAAqC;AAChE,MAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACvC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY;AAMlB,SACE,oBAAoB,UAAU,OAAO,KACrC,QAAQ,UAAU,KAAK,KACvB,OAAO,UAAU,UAAU,YAC3B,CAAC,MAAM,QAAQ,UAAU,KAAK,KAC9B,MAAM,QAAQ,UAAU,MAAM;AAElC;AAGO,SAAS,oBACd,OAC4B;AAC5B,SAAO,wBAAwB,UAAU,KAAK,EAAE;AAClD;AAGO,SAAS,wBACd,QACkC;AAClC,MACE,UACA,OAAO,WAAW,YAClB,MAAM,QAAS,OAAmC,MAAM,GACxD;AACA,WAAQ,OAAwD;AAAA,EAClE;AAEA,SAAO,CAAC;AACV;AAGO,SAAS,eAAe,OAA2C;AACxE,MAAI,iBAAiB,OAAO;AAC1B,WAAO;AAAA,MACL,MAAM,MAAM;AAAA,MACZ,SAAS,MAAM;AAAA,IACjB;AAAA,EACF;AAEA,SAAO;AAAA,IACL,MAAM;AAAA,IACN,SAAS,OAAO,KAAK;AAAA,EACvB;AACF;","names":["assistantMessages","failedSpans","latestAssistantMessageContent","messagesByRole","messagesToTranscriptEvents","spans","spansByKind","systemMessages","toolCalls","toolMessages","userMessages","normalized","details","message","type"]}
package/dist/index.d.mts CHANGED
@@ -1,7 +1,7 @@
1
1
  import * as vitest from 'vitest';
2
2
  import { TestAPI } from 'vitest';
3
3
  import { Harness } from './harness.mjs';
4
- export { CreateHarnessOptions, CreateHarnessRunArgs, CreateToolCallSpansOptions, EnsureRunTraceOptions, HarnessContext, HarnessMetadata, HarnessResultLike, MaybePromise, SimpleHarnessResult, SimpleSpanEvent, SimpleSpanRecord, SimpleToolCallRecord, SimpleTraceRecord, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, createToolCallSpans, ensureRunTrace, getHarnessRunFromError, normalizeHarnessRun, normalizeSpanAttributes, normalizeSpanError, toJsonValue } from './harness.mjs';
4
+ export { CreateHarnessOptions, CreateHarnessRunArgs, EnsureRunTraceOptions, HarnessContext, HarnessMetadata, HarnessResultLike, MaybePromise, SimpleHarnessResult, SimpleSpanEvent, SimpleSpanRecord, SimpleTraceRecord, SimpleTranscriptInput, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, ensureRunTrace, getHarnessRunFromError, normalizeHarnessRun, normalizeSpanAttributes, normalizeSpanError, toJsonValue } from './harness.mjs';
5
5
  import { JudgeContext, Judge, JudgeResult, JudgeAssessFn, JudgeOptions, JudgeAssessor, JudgeAssessWithAssessorFn } from './judges/types.mjs';
6
6
  export { BoundJudgeAssessor, JudgeAssessorOptions } from './judges/types.mjs';
7
7
  import { JudgeHarness } from './judges/judgeHarness.mjs';
@@ -11,8 +11,8 @@ export { FactualityJudge, FactualityJudgeChoice, FactualityJudgeConfig, Factuali
11
11
  export { StructuredOutputJudge, StructuredOutputJudgeConfig, StructuredOutputJudgeExpected, StructuredOutputJudgeOptions } from './judges/structuredOutputJudge.mjs';
12
12
  export { ToolCallJudge, ToolCallJudgeConfig, ToolCallJudgeExpectedTool, ToolCallJudgeOptions } from './judges/toolCallJudge.mjs';
13
13
  export { BaseMatcherConfig, FuzzyMatchOptions, MatchStrategy } from './internal/matchers.mjs';
14
- import { JsonValue, HarnessRun, NormalizedSession, ToolCallRecord } from '@vitest-evals/core';
15
- export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue, NormalizedMessage, NormalizedSession, NormalizedSpan, NormalizedSpanAttributeKey, NormalizedSpanAttributes, NormalizedSpanEvent, NormalizedTrace, OpenTelemetrySemanticAttributeKey, OpenTelemetrySemanticAttributes, TimingSummary, ToolCallRecord, UsageSummary, assistantMessages, failedSpans, latestAssistantMessageContent, messagesByRole, spans, spansByKind, systemMessages, toolCalls, toolMessages, userMessages } from '@vitest-evals/core';
14
+ import { JsonValue, HarnessRun, NormalizedSession, ToolCall } from '@vitest-evals/core';
15
+ export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue, NormalizedSession, NormalizedSpan, NormalizedSpanAttributeKey, NormalizedSpanAttributes, NormalizedSpanEvent, NormalizedTrace, OpenTelemetrySemanticAttributeKey, OpenTelemetrySemanticAttributes, TimingSummary, ToolCall, TranscriptEvent, TranscriptMessageContentPart, TranscriptMessageEvent, TranscriptMessageInput, TranscriptMessageTextPart, TranscriptMessageToolCall, TranscriptMessageToolCallPart, TranscriptMessageToolResultPart, TranscriptToolCallEvent, TranscriptToolResultEvent, UsageSummary, assistantMessages, failedSpans, latestAssistantMessageContent, messagesByRole, messagesToTranscriptEvents, spans, spansByKind, systemMessages, toolCalls, toolMessages, userMessages } from '@vitest-evals/core';
16
16
  import './internal/structuredOutputScorer.mjs';
17
17
  import './internal/scoring.mjs';
18
18
  import './internal/toolCallScorer.mjs';
@@ -24,7 +24,7 @@ type EvalTaskMeta = {
24
24
  })[];
25
25
  avgScore: number;
26
26
  output?: unknown;
27
- toolCalls?: ToolCallRecord[];
27
+ toolCalls?: ToolCall[];
28
28
  thresholdFailed?: boolean;
29
29
  };
30
30
  harness?: {
@@ -149,7 +149,7 @@ type JudgeAssertionOptions<TJudgeOptions extends JudgeContext<any, any, any> = J
149
149
  /** Override or provide the app-facing output for the judge. */
150
150
  output?: JudgeAssertionOutput<TJudgeOptions>;
151
151
  /** Override or provide flattened tool calls for the judge. */
152
- toolCalls?: ToolCallRecord[];
152
+ toolCalls?: ToolCall[];
153
153
  /** Override or provide the complete normalized harness run. */
154
154
  run?: HarnessRun<JudgeAssertionOutput<TJudgeOptions>>;
155
155
  /** Override or provide the normalized session transcript. */
@@ -219,7 +219,7 @@ declare module "vitest" {
219
219
  * const result = await run("Refund invoice inv_123");
220
220
  *
221
221
  * expect(result.output).toMatchObject({ status: "approved" });
222
- * expect(toolCalls(result.session)).toHaveLength(2);
222
+ * expect(toolCalls(result)).toHaveLength(2);
223
223
  * await expect(result).toSatisfyJudge(FactualityJudge(), {
224
224
  * expected: "Invoice inv_123 should be refunded.",
225
225
  * threshold: 0.6,
package/dist/index.d.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  import * as vitest from 'vitest';
2
2
  import { TestAPI } from 'vitest';
3
3
  import { Harness } from './harness.js';
4
- export { CreateHarnessOptions, CreateHarnessRunArgs, CreateToolCallSpansOptions, EnsureRunTraceOptions, HarnessContext, HarnessMetadata, HarnessResultLike, MaybePromise, SimpleHarnessResult, SimpleSpanEvent, SimpleSpanRecord, SimpleToolCallRecord, SimpleTraceRecord, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, createToolCallSpans, ensureRunTrace, getHarnessRunFromError, normalizeHarnessRun, normalizeSpanAttributes, normalizeSpanError, toJsonValue } from './harness.js';
4
+ export { CreateHarnessOptions, CreateHarnessRunArgs, EnsureRunTraceOptions, HarnessContext, HarnessMetadata, HarnessResultLike, MaybePromise, SimpleHarnessResult, SimpleSpanEvent, SimpleSpanRecord, SimpleTraceRecord, SimpleTranscriptInput, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, ensureRunTrace, getHarnessRunFromError, normalizeHarnessRun, normalizeSpanAttributes, normalizeSpanError, toJsonValue } from './harness.js';
5
5
  import { JudgeContext, Judge, JudgeResult, JudgeAssessFn, JudgeOptions, JudgeAssessor, JudgeAssessWithAssessorFn } from './judges/types.js';
6
6
  export { BoundJudgeAssessor, JudgeAssessorOptions } from './judges/types.js';
7
7
  import { JudgeHarness } from './judges/judgeHarness.js';
@@ -11,8 +11,8 @@ export { FactualityJudge, FactualityJudgeChoice, FactualityJudgeConfig, Factuali
11
11
  export { StructuredOutputJudge, StructuredOutputJudgeConfig, StructuredOutputJudgeExpected, StructuredOutputJudgeOptions } from './judges/structuredOutputJudge.js';
12
12
  export { ToolCallJudge, ToolCallJudgeConfig, ToolCallJudgeExpectedTool, ToolCallJudgeOptions } from './judges/toolCallJudge.js';
13
13
  export { BaseMatcherConfig, FuzzyMatchOptions, MatchStrategy } from './internal/matchers.js';
14
- import { JsonValue, HarnessRun, NormalizedSession, ToolCallRecord } from '@vitest-evals/core';
15
- export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue, NormalizedMessage, NormalizedSession, NormalizedSpan, NormalizedSpanAttributeKey, NormalizedSpanAttributes, NormalizedSpanEvent, NormalizedTrace, OpenTelemetrySemanticAttributeKey, OpenTelemetrySemanticAttributes, TimingSummary, ToolCallRecord, UsageSummary, assistantMessages, failedSpans, latestAssistantMessageContent, messagesByRole, spans, spansByKind, systemMessages, toolCalls, toolMessages, userMessages } from '@vitest-evals/core';
14
+ import { JsonValue, HarnessRun, NormalizedSession, ToolCall } from '@vitest-evals/core';
15
+ export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue, NormalizedSession, NormalizedSpan, NormalizedSpanAttributeKey, NormalizedSpanAttributes, NormalizedSpanEvent, NormalizedTrace, OpenTelemetrySemanticAttributeKey, OpenTelemetrySemanticAttributes, TimingSummary, ToolCall, TranscriptEvent, TranscriptMessageContentPart, TranscriptMessageEvent, TranscriptMessageInput, TranscriptMessageTextPart, TranscriptMessageToolCall, TranscriptMessageToolCallPart, TranscriptMessageToolResultPart, TranscriptToolCallEvent, TranscriptToolResultEvent, UsageSummary, assistantMessages, failedSpans, latestAssistantMessageContent, messagesByRole, messagesToTranscriptEvents, spans, spansByKind, systemMessages, toolCalls, toolMessages, userMessages } from '@vitest-evals/core';
16
16
  import './internal/structuredOutputScorer.js';
17
17
  import './internal/scoring.js';
18
18
  import './internal/toolCallScorer.js';
@@ -24,7 +24,7 @@ type EvalTaskMeta = {
24
24
  })[];
25
25
  avgScore: number;
26
26
  output?: unknown;
27
- toolCalls?: ToolCallRecord[];
27
+ toolCalls?: ToolCall[];
28
28
  thresholdFailed?: boolean;
29
29
  };
30
30
  harness?: {
@@ -149,7 +149,7 @@ type JudgeAssertionOptions<TJudgeOptions extends JudgeContext<any, any, any> = J
149
149
  /** Override or provide the app-facing output for the judge. */
150
150
  output?: JudgeAssertionOutput<TJudgeOptions>;
151
151
  /** Override or provide flattened tool calls for the judge. */
152
- toolCalls?: ToolCallRecord[];
152
+ toolCalls?: ToolCall[];
153
153
  /** Override or provide the complete normalized harness run. */
154
154
  run?: HarnessRun<JudgeAssertionOutput<TJudgeOptions>>;
155
155
  /** Override or provide the normalized session transcript. */
@@ -219,7 +219,7 @@ declare module "vitest" {
219
219
  * const result = await run("Refund invoice inv_123");
220
220
  *
221
221
  * expect(result.output).toMatchObject({ status: "approved" });
222
- * expect(toolCalls(result.session)).toHaveLength(2);
222
+ * expect(toolCalls(result)).toHaveLength(2);
223
223
  * await expect(result).toSatisfyJudge(FactualityJudge(), {
224
224
  * expected: "Invoice inv_123 should be refunded.",
225
225
  * threshold: 0.6,
package/dist/index.js CHANGED
@@ -30,7 +30,6 @@ __export(index_exports, {
30
30
  createHarness: () => createHarness,
31
31
  createJudge: () => createJudge,
32
32
  createJudgeHarness: () => createJudgeHarness,
33
- createToolCallSpans: () => createToolCallSpans,
34
33
  describeEval: () => describeEval,
35
34
  ensureRunTrace: () => ensureRunTrace,
36
35
  failedSpans: () => import_core2.failedSpans,
@@ -38,6 +37,7 @@ __export(index_exports, {
38
37
  getHarnessRunFromError: () => getHarnessRunFromError,
39
38
  latestAssistantMessageContent: () => import_core2.latestAssistantMessageContent,
40
39
  messagesByRole: () => import_core2.messagesByRole,
40
+ messagesToTranscriptEvents: () => import_core2.messagesToTranscriptEvents,
41
41
  normalizeHarnessRun: () => normalizeHarnessRun,
42
42
  normalizeSpanAttributes: () => normalizeSpanAttributes,
43
43
  normalizeSpanError: () => normalizeSpanError,
@@ -179,14 +179,24 @@ function normalizeHarnessRun(input, result, context) {
179
179
  }
180
180
  return result;
181
181
  }
182
+ if ("toolCalls" in result) {
183
+ throw new TypeError(
184
+ 'createHarness results do not accept top-level toolCalls. Return ordered session events with type: "tool_call" and type: "tool_result" entries instead.'
185
+ );
186
+ }
182
187
  const output = result.output;
183
- const toolCalls3 = normalizeSimpleToolCalls(result.toolCalls);
184
188
  const usage = result.usage ?? {};
185
- const messages = result.messages ?? createDefaultSessionMessages({
186
- input,
187
- output,
188
- toolCalls: toolCalls3
189
- });
189
+ const events = normalizeTranscriptInput(result);
190
+ if (!events) {
191
+ throw new TypeError(
192
+ "createHarness results must include ordered events or messages. Return a full HarnessRun or a lightweight result with events/messages."
193
+ );
194
+ }
195
+ if (events.length === 0) {
196
+ throw new TypeError(
197
+ "createHarness results must include at least one transcript event. Return ordered events or message transport inputs that normalize into events."
198
+ );
199
+ }
190
200
  const metadata = result.metadata ? normalizeMetadata(result.metadata) : void 0;
191
201
  const artifacts = normalizeMergedArtifacts(
192
202
  context?.artifacts,
@@ -195,7 +205,7 @@ function normalizeHarnessRun(input, result, context) {
195
205
  const traces = normalizeSimpleTraces(result.traces);
196
206
  return {
197
207
  session: {
198
- messages,
208
+ events,
199
209
  ...usage.provider ? { provider: usage.provider } : {},
200
210
  ...usage.model ? { model: usage.model } : {},
201
211
  ...metadata ? { metadata } : {}
@@ -208,12 +218,24 @@ function normalizeHarnessRun(input, result, context) {
208
218
  errors: normalizeSimpleErrors(result.errors)
209
219
  };
210
220
  }
221
+ function normalizeTranscriptInput(result) {
222
+ if ("events" in result && Array.isArray(result.events)) {
223
+ return result.events.map((event) => import_core.TranscriptEventSchema.parse(event));
224
+ }
225
+ if ("messages" in result && Array.isArray(result.messages)) {
226
+ return (0, import_core.messagesToTranscriptEvents)(result.messages).map(
227
+ (event) => import_core.TranscriptEventSchema.parse(event)
228
+ );
229
+ }
230
+ return void 0;
231
+ }
211
232
  function createFailedHarnessRun(input, error, options = {}) {
212
233
  const artifacts = options.artifacts;
213
234
  return {
214
235
  session: {
215
- messages: [
236
+ events: [
216
237
  {
238
+ type: "message",
217
239
  role: "user",
218
240
  content: normalizeContent(input)
219
241
  }
@@ -224,67 +246,6 @@ function createFailedHarnessRun(input, error, options = {}) {
224
246
  errors: [serializeError(error)]
225
247
  };
226
248
  }
227
- function createDefaultSessionMessages({
228
- input,
229
- output,
230
- toolCalls: normalizedToolCalls
231
- }) {
232
- const messages = [
233
- {
234
- role: "user",
235
- content: normalizeContent(input)
236
- }
237
- ];
238
- if (output !== void 0 || normalizedToolCalls.length > 0) {
239
- messages.push({
240
- role: "assistant",
241
- ...output !== void 0 ? { content: normalizeContent(output) } : {},
242
- ...normalizedToolCalls.length > 0 ? { toolCalls: normalizedToolCalls } : {}
243
- });
244
- }
245
- return messages;
246
- }
247
- function normalizeSimpleToolCalls(calls) {
248
- return (calls ?? []).map((call) => {
249
- const {
250
- arguments: rawArguments,
251
- result: rawResult,
252
- error: rawError,
253
- metadata: rawMetadata,
254
- ...toolCall
255
- } = call;
256
- const args = normalizeToolCallArguments(rawArguments);
257
- const result = toJsonValue(rawResult);
258
- const error = normalizeToolCallError(rawError);
259
- const metadata = rawMetadata ? normalizeMetadata(rawMetadata) : void 0;
260
- return {
261
- ...toolCall,
262
- ...args ? { arguments: args } : {},
263
- ...result !== void 0 ? { result } : {},
264
- ...error ? { error } : {},
265
- ...metadata ? { metadata } : {}
266
- };
267
- });
268
- }
269
- function normalizeToolCallArguments(value) {
270
- if (value === void 0) {
271
- return void 0;
272
- }
273
- const normalized = toJsonValue(value);
274
- return normalized && typeof normalized === "object" && !Array.isArray(normalized) ? normalized : void 0;
275
- }
276
- function normalizeToolCallError(value) {
277
- if (value === void 0) {
278
- return void 0;
279
- }
280
- const serialized = serializeError(value);
281
- const { message, type, ...details } = serialized;
282
- return {
283
- ...details,
284
- message: typeof message === "string" ? message : String(message),
285
- ...typeof type === "string" ? { type } : {}
286
- };
287
- }
288
249
  function normalizeMergedArtifacts(contextArtifacts, resultArtifacts) {
289
250
  const artifacts = {
290
251
  ...contextArtifacts ?? {},
@@ -410,32 +371,6 @@ function createGenAiUsageAttributes(usage, options = {}) {
410
371
  "gen_ai.usage.reasoning.output_tokens": usage?.reasoningTokens
411
372
  };
412
373
  }
413
- function createToolCallSpans(calls, options = {}) {
414
- return calls.map((call, index) => {
415
- const spanError = call.error ? normalizeSpanError(call.error) : void 0;
416
- const spanId = options.spanIdPrefix ? `${options.spanIdPrefix}:${index + 1}` : call.id;
417
- return {
418
- ...spanId ? { id: spanId } : {},
419
- ...options.traceId ? { traceId: options.traceId } : {},
420
- ...options.parentId ? { parentId: options.parentId } : {},
421
- name: call.name,
422
- kind: "tool",
423
- ...call.startedAt ? { startedAt: call.startedAt } : {},
424
- ...call.finishedAt ? { finishedAt: call.finishedAt } : {},
425
- ...call.durationMs !== void 0 ? { durationMs: call.durationMs } : {},
426
- status: spanError ? "error" : "ok",
427
- ...spanError ? { error: spanError } : {},
428
- attributes: normalizeSpanAttributes({
429
- "gen_ai.operation.name": "execute_tool",
430
- "gen_ai.tool.name": call.name,
431
- "gen_ai.tool.type": "function",
432
- ...call.id ? { "gen_ai.tool.call.id": call.id } : {},
433
- ...call.arguments !== void 0 ? { "gen_ai.tool.call.arguments": call.arguments } : {},
434
- ...call.result !== void 0 ? { "gen_ai.tool.call.result": call.result } : {}
435
- })
436
- };
437
- });
438
- }
439
374
  function ensureRunTrace(run, options) {
440
375
  if ((0, import_core.spans)(run).length > 0) {
441
376
  return void 0;
@@ -460,11 +395,6 @@ function ensureRunTrace(run, options) {
460
395
  ...createGenAiUsageAttributes(run.usage)
461
396
  })
462
397
  };
463
- const toolSpans = createToolCallSpans((0, import_core.toolCalls)(run.session), {
464
- traceId,
465
- parentId: rootSpanId,
466
- spanIdPrefix: `${traceId}:tool`
467
- });
468
398
  const trace = {
469
399
  id: traceId,
470
400
  name: options.name,
@@ -472,7 +402,7 @@ function ensureRunTrace(run, options) {
472
402
  finishedAt: options.finishedAt.toISOString(),
473
403
  durationMs,
474
404
  ...options.source ? { metadata: { source: options.source } } : {},
475
- spans: [runSpan, ...toolSpans]
405
+ spans: [runSpan]
476
406
  };
477
407
  run.traces = [trace];
478
408
  return trace;
@@ -502,7 +432,7 @@ function isHarnessRun(value) {
502
432
  return isNormalizedSession(candidate.session) && Boolean(candidate.usage) && typeof candidate.usage === "object" && !Array.isArray(candidate.usage) && Array.isArray(candidate.errors);
503
433
  }
504
434
  function isNormalizedSession(value) {
505
- return Boolean(value) && typeof value === "object" && value !== null && "messages" in value && Array.isArray(value.messages);
435
+ return import_core.NormalizedSessionSchema.safeParse(value).success;
506
436
  }
507
437
  function serializeError(error) {
508
438
  if (error instanceof Error) {
@@ -522,7 +452,10 @@ function createJudgeHarness(options) {
522
452
  return createHarness({
523
453
  name: options.name ?? "judge-harness",
524
454
  run: async ({ input, signal }) => {
525
- return normalizeJudgeHarnessResult(await options.run(input, { signal }));
455
+ return normalizeJudgeHarnessResult(
456
+ input,
457
+ await options.run(input, { signal })
458
+ );
526
459
  }
527
460
  });
528
461
  }
@@ -545,17 +478,14 @@ function createRunJudge(judgeHarness, signal) {
545
478
  signal: options?.signal ?? signal
546
479
  });
547
480
  }
548
- function normalizeJudgeHarnessResult(result) {
481
+ function normalizeJudgeHarnessResult(input, result) {
549
482
  if (isHarnessRun(result)) {
550
483
  return result;
551
484
  }
552
- if (hasOutputField(result)) {
553
- return {
554
- output: normalizeJudgeHarnessOutput(result.output)
555
- };
556
- }
485
+ const output = hasOutputField(result) ? normalizeJudgeHarnessOutput(result.output) : normalizeJudgeHarnessOutput(result);
557
486
  return {
558
- output: normalizeJudgeHarnessOutput(result)
487
+ output,
488
+ messages: createJudgeHarnessMessages(input, output)
559
489
  };
560
490
  }
561
491
  function hasOutputField(value) {
@@ -567,6 +497,13 @@ function normalizeJudgeHarnessOutput(value) {
567
497
  }
568
498
  return normalizeContent(value);
569
499
  }
500
+ function createJudgeHarnessMessages(input, output) {
501
+ return [
502
+ ...input.system ? [{ role: "system", content: input.system }] : [],
503
+ { role: "user", content: input.prompt },
504
+ ...output !== void 0 ? [{ role: "assistant", content: output }] : []
505
+ ];
506
+ }
570
507
  function resolveJudgeHarnessAssistantOutput(run) {
571
508
  return (0, import_core2.latestAssistantMessageContent)(run.session) ?? "";
572
509
  }
@@ -1694,23 +1631,25 @@ function resolveJudgeAssertionOutput(received, run, explicitOutput) {
1694
1631
  return normalizeJudgeJsonValue(received);
1695
1632
  }
1696
1633
  function createSyntheticJudgeSession(received, options) {
1697
- const messages = [];
1634
+ const events = [];
1698
1635
  const userContent = normalizeJudgeJsonValue(options.input);
1699
1636
  if (userContent !== void 0) {
1700
- messages.push({
1637
+ events.push({
1638
+ type: "message",
1701
1639
  role: "user",
1702
1640
  content: userContent
1703
1641
  });
1704
1642
  }
1705
1643
  const assistantContent = normalizeJudgeJsonValue(received);
1706
1644
  if (assistantContent !== void 0) {
1707
- messages.push({
1645
+ events.push({
1646
+ type: "message",
1708
1647
  role: "assistant",
1709
1648
  content: assistantContent
1710
1649
  });
1711
1650
  }
1712
1651
  return {
1713
- messages
1652
+ events
1714
1653
  };
1715
1654
  }
1716
1655
  function inferJudgeOutputValue(received, session) {
@@ -1718,7 +1657,7 @@ function inferJudgeOutputValue(received, session) {
1718
1657
  return received.output;
1719
1658
  }
1720
1659
  if (isNormalizedSession(received)) {
1721
- return resolveAssistantOutput(session) ?? normalizeJudgeJsonValue(received.messages);
1660
+ return resolveAssistantOutput(session) ?? normalizeJudgeJsonValue(received.events);
1722
1661
  }
1723
1662
  return normalizeJudgeJsonValue(received);
1724
1663
  }
@@ -1792,7 +1731,6 @@ function createJudge(nameOrConfig, assessOrAssessor, assess) {
1792
1731
  createHarness,
1793
1732
  createJudge,
1794
1733
  createJudgeHarness,
1795
- createToolCallSpans,
1796
1734
  describeEval,
1797
1735
  ensureRunTrace,
1798
1736
  failedSpans,
@@ -1800,6 +1738,7 @@ function createJudge(nameOrConfig, assessOrAssessor, assess) {
1800
1738
  getHarnessRunFromError,
1801
1739
  latestAssistantMessageContent,
1802
1740
  messagesByRole,
1741
+ messagesToTranscriptEvents,
1803
1742
  normalizeHarnessRun,
1804
1743
  normalizeSpanAttributes,
1805
1744
  normalizeSpanError,