npm - vitest-evals - Versions diffs - 0.9.0 → 0.11.0 - Mend

vitest-evals 0.9.0 → 0.11.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (72)

package/README.md +130 -67
package/dist/harness.d.mts +263 -1
package/dist/harness.d.ts +263 -1
package/dist/harness.js +306 -21
package/dist/harness.js.map +1 -1
package/dist/harness.mjs +296 -21
package/dist/harness.mjs.map +1 -1
package/dist/index.d.mts +48 -20
package/dist/index.d.ts +48 -20
package/dist/index.js +639 -42
package/dist/index.js.map +1 -1
package/dist/index.mjs +625 -42
package/dist/index.mjs.map +1 -1
package/dist/internal/scoring.d.mts +3 -3
package/dist/internal/scoring.d.ts +3 -3
package/dist/internal/scoring.js.map +1 -1
package/dist/internal/toolCallScorer.js +62 -2
package/dist/internal/toolCallScorer.js.map +1 -1
package/dist/internal/toolCallScorer.mjs +62 -2
package/dist/internal/toolCallScorer.mjs.map +1 -1
package/dist/judges/factualityJudge.d.mts +151 -0
package/dist/judges/factualityJudge.d.ts +151 -0
package/dist/judges/factualityJudge.js +235 -0
package/dist/judges/factualityJudge.js.map +1 -0
package/dist/judges/factualityJudge.mjs +208 -0
package/dist/judges/factualityJudge.mjs.map +1 -0
package/dist/judges/index.d.mts +3 -1
package/dist/judges/index.d.ts +3 -1
package/dist/judges/index.js +715 -7
package/dist/judges/index.js.map +1 -1
package/dist/judges/index.mjs +711 -6
package/dist/judges/index.mjs.map +1 -1
package/dist/judges/judgeHarness.d.mts +122 -0
package/dist/judges/judgeHarness.d.ts +122 -0
package/dist/judges/judgeHarness.js +571 -0
package/dist/judges/judgeHarness.js.map +1 -0
package/dist/judges/judgeHarness.mjs +542 -0
package/dist/judges/judgeHarness.mjs.map +1 -0
package/dist/judges/structuredOutputJudge.d.mts +1 -0
package/dist/judges/structuredOutputJudge.d.ts +1 -0
package/dist/judges/toolCallJudge.d.mts +1 -0
package/dist/judges/toolCallJudge.d.ts +1 -0
package/dist/judges/toolCallJudge.js +62 -2
package/dist/judges/toolCallJudge.js.map +1 -1
package/dist/judges/toolCallJudge.mjs +62 -2
package/dist/judges/toolCallJudge.mjs.map +1 -1
package/dist/judges/types.d.mts +33 -6
package/dist/judges/types.d.ts +33 -6
package/dist/judges/types.js.map +1 -1
package/dist/legacy/scorers/index.js +62 -2
package/dist/legacy/scorers/index.js.map +1 -1
package/dist/legacy/scorers/index.mjs +62 -2
package/dist/legacy/scorers/index.mjs.map +1 -1
package/dist/legacy/scorers/toolCallScorer.js +62 -2
package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
package/dist/legacy/scorers/toolCallScorer.mjs +62 -2
package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
package/dist/legacy.js +76 -3
package/dist/legacy.js.map +1 -1
package/dist/legacy.mjs +76 -3
package/dist/legacy.mjs.map +1 -1
package/dist/replay.js +1 -1
package/dist/replay.js.map +1 -1
package/dist/replay.mjs +1 -1
package/dist/replay.mjs.map +1 -1
package/dist/reporter.d.mts +5 -0
package/dist/reporter.d.ts +5 -0
package/dist/reporter.js +26 -2
package/dist/reporter.js.map +1 -1
package/dist/reporter.mjs +26 -2
package/dist/reporter.mjs.map +1 -1
package/package.json +1 -1

package/dist/harness.d.ts CHANGED

@@ -4,6 +4,169 @@ type JsonPrimitive = string | number | boolean | null;
 type JsonValue = JsonPrimitive | JsonValue[] | {
     [key: string]: JsonValue;
 };
+/** Well-known OpenTelemetry GenAI operation names. */
+type GenAiOperationName = "chat" | "create_agent" | "embeddings" | "execute_tool" | "generate_content" | "invoke_agent" | "invoke_workflow" | "retrieval" | "text_completion" | (string & {});
+/** Well-known OpenTelemetry GenAI output content types. */
+type GenAiOutputType = "image" | "json" | "speech" | "text" | (string & {});
+/** Well-known OpenTelemetry GenAI provider names. */
+type GenAiProviderName = "anthropic" | "aws.bedrock" | "azure.ai.inference" | "azure.ai.openai" | "cohere" | "deepseek" | "gcp.gemini" | "gcp.gen_ai" | "gcp.vertex_ai" | "groq" | "ibm.watsonx.ai" | "mistral_ai" | "openai" | "perplexity" | "x_ai" | (string & {});
+/** Well-known OpenTelemetry GenAI token types. */
+type GenAiTokenType = "input" | "output" | (string & {});
+/** Well-known OpenTelemetry GenAI tool execution types. */
+type GenAiToolType = "datastore" | "extension" | "function" | (string & {});
+/** Typed subset of OpenTelemetry GenAI semantic attributes. */
+type GenAiSemanticAttributes = {
+    "gen_ai.agent.description"?: string;
+    "gen_ai.agent.id"?: string;
+    "gen_ai.agent.name"?: string;
+    "gen_ai.agent.version"?: string;
+    "gen_ai.conversation.id"?: string;
+    "gen_ai.data_source.id"?: string;
+    "gen_ai.embeddings.dimension.count"?: number;
+    "gen_ai.evaluation.explanation"?: string;
+    "gen_ai.evaluation.name"?: string;
+    "gen_ai.evaluation.score.label"?: string;
+    "gen_ai.evaluation.score.value"?: number;
+    "gen_ai.input.messages"?: JsonValue;
+    "gen_ai.operation.name"?: GenAiOperationName;
+    "gen_ai.output.messages"?: JsonValue;
+    "gen_ai.output.type"?: GenAiOutputType;
+    "gen_ai.prompt.name"?: string;
+    "gen_ai.provider.name"?: GenAiProviderName;
+    "gen_ai.request.choice.count"?: number;
+    "gen_ai.request.encoding_formats"?: string[];
+    "gen_ai.request.frequency_penalty"?: number;
+    "gen_ai.request.max_tokens"?: number;
+    "gen_ai.request.model"?: string;
+    "gen_ai.request.presence_penalty"?: number;
+    "gen_ai.request.seed"?: number;
+    "gen_ai.request.stop_sequences"?: string[];
+    "gen_ai.request.stream"?: boolean;
+    "gen_ai.request.temperature"?: number;
+    "gen_ai.request.top_k"?: number;
+    "gen_ai.request.top_p"?: number;
+    "gen_ai.response.finish_reasons"?: string[];
+    "gen_ai.response.id"?: string;
+    "gen_ai.response.model"?: string;
+    "gen_ai.response.time_to_first_chunk"?: number;
+    "gen_ai.retrieval.documents"?: JsonValue;
+    "gen_ai.retrieval.query.text"?: string;
+    "gen_ai.system_instructions"?: JsonValue;
+    "gen_ai.token.type"?: GenAiTokenType;
+    "gen_ai.tool.call.arguments"?: JsonValue;
+    "gen_ai.tool.call.id"?: string;
+    "gen_ai.tool.call.result"?: JsonValue;
+    "gen_ai.tool.definitions"?: JsonValue;
+    "gen_ai.tool.description"?: string;
+    "gen_ai.tool.name"?: string;
+    "gen_ai.tool.type"?: GenAiToolType;
+    "gen_ai.usage.cache_creation.input_tokens"?: number;
+    "gen_ai.usage.cache_read.input_tokens"?: number;
+    "gen_ai.usage.input_tokens"?: number;
+    "gen_ai.usage.output_tokens"?: number;
+    "gen_ai.usage.reasoning.output_tokens"?: number;
+    "gen_ai.workflow.name"?: string;
+};
+/** Attribute keys defined by the OpenTelemetry GenAI semantic conventions. */
+type GenAiSemanticAttributeKey = keyof GenAiSemanticAttributes;
+/** Typed OpenTelemetry semantic attributes accepted on normalized spans. */
+type OpenTelemetrySemanticAttributes = GenAiSemanticAttributes & {
+    "error.type"?: string;
+    "server.address"?: string;
+    "server.port"?: number;
+};
+/** Known OpenTelemetry semantic attribute keys accepted on normalized spans. */
+type OpenTelemetrySemanticAttributeKey = keyof OpenTelemetrySemanticAttributes;
+/** Attribute keys accepted on normalized spans. */
+type NormalizedSpanAttributeKey = OpenTelemetrySemanticAttributeKey | (string & {});
+/**
+ * JSON-safe span attributes. Known OpenTelemetry GenAI keys are typed while
+ * custom provider and application keys remain allowed.
+ */
+type NormalizedSpanAttributes = OpenTelemetrySemanticAttributes & {
+    [key: string]: JsonValue | undefined;
+};
+/** Event attached to one normalized span. */
+type NormalizedSpanEvent = {
+    /** Event name emitted by the runtime or harness. */
+    name: string;
+    /** ISO timestamp for the event when available. */
+    timestamp?: string;
+    /** JSON-safe event attributes. */
+    attributes?: NormalizedSpanAttributes;
+};
+/** Normalized operation span captured during a harness run. */
+type NormalizedSpan = {
+    /** Runtime or provider span id when one is available. */
+    id?: string;
+    /** Trace id this span belongs to. */
+    traceId?: string;
+    /** Parent span id when the runtime exposes hierarchy. */
+    parentId?: string;
+    /** Human-readable operation name. */
+    name: string;
+    /** Coarse operation kind used by reporters and judges. */
+    kind?: "run" | "agent" | "model" | "tool" | "guardrail" | "handoff" | "custom";
+    /** ISO timestamp for the start of the span. */
+    startedAt?: string;
+    /** ISO timestamp for the end of the span. */
+    finishedAt?: string;
+    /** Span duration in milliseconds. */
+    durationMs?: number;
+    /** Success or failure status for the span. */
+    status?: "ok" | "error";
+    /** Normalized error when the span failed. */
+    error?: {
+        message: string;
+        type?: string;
+        [key: string]: JsonValue | undefined;
+    };
+    /** JSON-safe operation attributes. */
+    attributes?: NormalizedSpanAttributes;
+    /** Events observed inside this span. */
+    events?: NormalizedSpanEvent[];
+};
+/** Normalized trace captured during a harness run. */
+type NormalizedTrace = {
+    /** Runtime or provider trace id when one is available. */
+    id?: string;
+    /** Human-readable trace or workflow name. */
+    name?: string;
+    /** ISO timestamp for the start of the trace. */
+    startedAt?: string;
+    /** ISO timestamp for the end of the trace. */
+    finishedAt?: string;
+    /** Trace duration in milliseconds. */
+    durationMs?: number;
+    /** Extra JSON-safe trace metadata. */
+    metadata?: Record<string, JsonValue>;
+    /** Spans that make up this trace. */
+    spans: NormalizedSpan[];
+};
+/** Options for converting normalized tool calls into trace spans. */
+type CreateToolCallSpansOptions = {
+    /** Trace id to attach to each generated tool span. */
+    traceId?: string;
+    /** Parent span id to attach to each generated tool span. */
+    parentId?: string;
+    /** Prefix used to create internal span ids instead of reusing tool-call ids. */
+    spanIdPrefix?: string;
+};
+/** Options for attaching a fallback run trace to a harness result. */
+type EnsureRunTraceOptions = {
+    /** Human-readable run or harness name. */
+    name: string;
+    /** Wall-clock start time for the harness run. */
+    startedAt: Date;
+    /** Wall-clock finish time for the harness run. */
+    finishedAt: Date;
+    /** Optional trace id. A generated id is used when omitted. */
+    id?: string;
+    /** GenAI operation name to place on the root run span. */
+    operationName?: GenAiOperationName;
+    /** Optional JSON-safe source marker for the trace metadata. */
+    source?: string;
+};
 /**
  * Normalized record for one tool call observed during a harness run.
  *
@@ -160,6 +323,8 @@ type HarnessRun<TOutput extends JsonValue | undefined = JsonValue | undefined> =
     timings?: TimingSummary;
     /** JSON-safe run artifacts captured by the harness or test context. */
     artifacts?: Record<string, JsonValue>;
+    /** Normalized traces and spans captured during execution. */
+    traces?: NormalizedTrace[];
     /** Normalized errors captured during execution. */
     errors: Array<Record<string, JsonValue>>;
 };
@@ -232,6 +397,27 @@ type SimpleToolCallRecord = Omit<ToolCallRecord, "arguments" | "result" | "error
     /** Raw tool metadata accepted by `createHarness(...)` before normalization. */
     metadata?: Record<string, unknown>;
 };
+/** Lightweight span event accepted by `createHarness(...)` results. */
+type SimpleSpanEvent = Omit<NormalizedSpanEvent, "attributes"> & {
+    /** Raw event attributes accepted by `createHarness(...)` before normalization. */
+    attributes?: Record<string, unknown>;
+};
+/** Lightweight span record accepted by `createHarness(...)` results. */
+type SimpleSpanRecord = Omit<NormalizedSpan, "attributes" | "error" | "events"> & {
+    /** Raw span attributes accepted by `createHarness(...)` before normalization. */
+    attributes?: Record<string, unknown>;
+    /** Raw span error accepted by `createHarness(...)` before normalization. */
+    error?: unknown;
+    /** Raw span events accepted by `createHarness(...)` before normalization. */
+    events?: SimpleSpanEvent[];
+};
+/** Lightweight trace record accepted by `createHarness(...)` results. */
+type SimpleTraceRecord = Omit<NormalizedTrace, "metadata" | "spans"> & {
+    /** Raw trace metadata accepted by `createHarness(...)` before normalization. */
+    metadata?: Record<string, unknown>;
+    /** Lightweight spans to normalize into the trace. */
+    spans: SimpleSpanRecord[];
+};
 /**
  * Lightweight result shape normalized by `createHarness(...)`.
  *
@@ -255,6 +441,8 @@ type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | und
     timings?: TimingSummary;
     /** Raw artifact values to normalize and merge into the run. */
     artifacts?: Record<string, unknown>;
+    /** Lightweight traces and spans to normalize into the run. */
+    traces?: SimpleTraceRecord[];
     /** Raw session metadata to normalize into the session. */
     metadata?: Record<string, unknown>;
     /** Raw errors to normalize into the run. */
@@ -354,6 +542,31 @@ declare function createHarness<TInput = unknown, TOutput extends JsonValue | und
  * ```
  */
 declare function normalizeHarnessRun<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, TOutput extends JsonValue | undefined = JsonValue | undefined>(input: TInput, result: HarnessResultLike<TOutput>, context?: HarnessContext<TMetadata>): HarnessRun<TOutput>;
+/**
+ * Builds a JSON-safe failed run for errors that happen before a harness can return.
+ *
+ * @param input - Original input passed to the harness.
+ * @param error - Error thrown by setup or execution.
+ * @param options - Optional artifacts to preserve on the failed run.
+ */
+declare function createFailedHarnessRun(input: unknown, error: unknown, options?: {
+    artifacts?: Record<string, JsonValue>;
+}): HarnessRun;
+/** Normalizes arbitrary span errors while preserving object-shaped messages. */
+declare function normalizeSpanError(error: unknown): NormalizedSpan["error"] | undefined;
+/** Normalizes raw span attributes into the JSON-safe span attribute shape. */
+declare function normalizeSpanAttributes(attributes: Record<string, unknown>): NormalizedSpanAttributes | undefined;
+/** Builds common OpenTelemetry GenAI usage attributes from a usage summary. */
+declare function createGenAiUsageAttributes(usage: UsageSummary | undefined, options?: {
+    provider?: string;
+}): {
+    "gen_ai.provider.name": string | undefined;
+    "gen_ai.request.model": string | undefined;
+    "gen_ai.response.model": string | undefined;
+    "gen_ai.usage.input_tokens": number | undefined;
+    "gen_ai.usage.output_tokens": number | undefined;
+    "gen_ai.usage.reasoning.output_tokens": number | undefined;
+};
 /**
  * Flattens every recorded tool call from a normalized session.
  *
@@ -367,6 +580,44 @@ declare function normalizeHarnessRun<TInput = unknown, TMetadata extends Harness
  * ```
  */
 declare function toolCalls(session: NormalizedSession): ToolCallRecord[];
+/**
+ * Converts normalized tool-call records into trace spans.
+ *
+ * Tool-call ids are preserved as GenAI attributes. Pass `spanIdPrefix` when the
+ * spans belong to a known trace so span ids stay internally unique.
+ */
+declare function createToolCallSpans(calls: ToolCallRecord[], options?: CreateToolCallSpansOptions): NormalizedSpan[];
+/**
+ * Attaches a fallback run trace when a harness result does not already contain spans.
+ *
+ * This keeps custom harnesses inspectable while first-party harness packages
+ * remain free to attach richer native traces.
+ */
+declare function ensureRunTrace(run: HarnessRun, options: EnsureRunTraceOptions): NormalizedTrace | undefined;
+/**
+ * Flattens every recorded span from a normalized harness run.
+ *
+ * @param run - Normalized harness run produced by a harness.
+ *
+ * @example
+ * ```ts
+ * const modelSpans = spans(result).filter((span) => span.kind === "model");
+ * ```
+ */
+declare function spans(run: HarnessRun): NormalizedSpan[];
+/**
+ * Returns spans of one coarse operation kind from a normalized run.
+ *
+ * @param run - Normalized harness run produced by a harness.
+ * @param kind - Span kind to keep.
+ */
+declare function spansByKind(run: HarnessRun, kind: NonNullable<NormalizedSpan["kind"]>): NormalizedSpan[];
+/**
+ * Returns every span that explicitly failed or carries a normalized error.
+ *
+ * @param run - Normalized harness run produced by a harness.
+ */
+declare function failedSpans(run: HarnessRun): NormalizedSpan[];
 /**
  * Filters normalized session messages by role.
  *
@@ -414,6 +665,17 @@ declare function userMessages(session: NormalizedSession): NormalizedMessage[];
  * ```
  */
 declare function assistantMessages(session: NormalizedSession): NormalizedMessage[];
+/**
+ * Returns the latest assistant message content, ignoring empty text messages.
+ *
+ * @param session - Normalized session produced by a harness run.
+ *
+ * @example
+ * ```ts
+ * const finalAnswer = latestAssistantMessageContent(result.session);
+ * ```
+ */
+declare function latestAssistantMessageContent(session: NormalizedSession): JsonValue | undefined;
 /**
  * Returns every normalized tool message from a session.
  *
@@ -465,4 +727,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
 /** Serializes an arbitrary thrown value into the normalized error shape. */
 declare function serializeError(error: unknown): Record<string, JsonValue>;
-export { type CreateHarnessOptions, type CreateHarnessRunArgs, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type MaybePromise, type NormalizedMessage, type NormalizedSession, type SimpleHarnessResult, type SimpleToolCallRecord, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, createHarness, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
+export { type CreateHarnessOptions, type CreateHarnessRunArgs, type CreateToolCallSpansOptions, type EnsureRunTraceOptions, type GenAiOperationName, type GenAiOutputType, type GenAiProviderName, type GenAiSemanticAttributeKey, type GenAiSemanticAttributes, type GenAiTokenType, type GenAiToolType, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type MaybePromise, type NormalizedMessage, type NormalizedSession, type NormalizedSpan, type NormalizedSpanAttributeKey, type NormalizedSpanAttributes, type NormalizedSpanEvent, type NormalizedTrace, type OpenTelemetrySemanticAttributeKey, type OpenTelemetrySemanticAttributes, type SimpleHarnessResult, type SimpleSpanEvent, type SimpleSpanRecord, type SimpleToolCallRecord, type SimpleTraceRecord, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, createToolCallSpans, ensureRunTrace, failedSpans, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, latestAssistantMessageContent, messagesByRole, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, normalizeSpanAttributes, normalizeSpanError, resolveHarnessRunErrors, serializeError, spans, spansByKind, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };