vitest-evals 0.12.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/README.md +27 -35
  2. package/dist/harness.d.mts +15 -20
  3. package/dist/harness.d.ts +15 -20
  4. package/dist/harness.js +0 -1
  5. package/dist/harness.js.map +1 -1
  6. package/dist/harness.mjs +0 -1
  7. package/dist/harness.mjs.map +1 -1
  8. package/dist/index.d.mts +45 -68
  9. package/dist/index.d.ts +45 -68
  10. package/dist/index.js +21 -40
  11. package/dist/index.js.map +1 -1
  12. package/dist/index.mjs +21 -40
  13. package/dist/index.mjs.map +1 -1
  14. package/dist/internal/toolCallScorer.js.map +1 -1
  15. package/dist/internal/toolCallScorer.mjs.map +1 -1
  16. package/dist/judges/factualityJudge.d.mts +14 -13
  17. package/dist/judges/factualityJudge.d.ts +14 -13
  18. package/dist/judges/factualityJudge.js +9 -9
  19. package/dist/judges/factualityJudge.js.map +1 -1
  20. package/dist/judges/factualityJudge.mjs +9 -9
  21. package/dist/judges/factualityJudge.mjs.map +1 -1
  22. package/dist/judges/index.js +17 -20
  23. package/dist/judges/index.js.map +1 -1
  24. package/dist/judges/index.mjs +17 -20
  25. package/dist/judges/index.mjs.map +1 -1
  26. package/dist/judges/judgeHarness.d.mts +6 -10
  27. package/dist/judges/judgeHarness.d.ts +6 -10
  28. package/dist/judges/judgeHarness.js +3 -8
  29. package/dist/judges/judgeHarness.js.map +1 -1
  30. package/dist/judges/judgeHarness.mjs +3 -8
  31. package/dist/judges/judgeHarness.mjs.map +1 -1
  32. package/dist/judges/structuredOutputJudge.d.mts +7 -9
  33. package/dist/judges/structuredOutputJudge.d.ts +7 -9
  34. package/dist/judges/structuredOutputJudge.js +3 -3
  35. package/dist/judges/structuredOutputJudge.js.map +1 -1
  36. package/dist/judges/structuredOutputJudge.mjs +3 -3
  37. package/dist/judges/structuredOutputJudge.mjs.map +1 -1
  38. package/dist/judges/toolCallJudge.d.mts +12 -9
  39. package/dist/judges/toolCallJudge.d.ts +12 -9
  40. package/dist/judges/toolCallJudge.js +3 -3
  41. package/dist/judges/toolCallJudge.js.map +1 -1
  42. package/dist/judges/toolCallJudge.mjs +3 -3
  43. package/dist/judges/toolCallJudge.mjs.map +1 -1
  44. package/dist/judges/types.d.mts +13 -24
  45. package/dist/judges/types.d.ts +13 -24
  46. package/dist/judges/types.js.map +1 -1
  47. package/dist/legacy/scorers/index.js.map +1 -1
  48. package/dist/legacy/scorers/index.mjs.map +1 -1
  49. package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
  50. package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
  51. package/dist/legacy.js.map +1 -1
  52. package/dist/legacy.mjs.map +1 -1
  53. package/dist/reporter.js.map +1 -1
  54. package/dist/reporter.mjs.map +1 -1
  55. package/package.json +3 -3
@@ -1,4 +1,4 @@
1
- import { HarnessMetadata, Harness } from '../harness.mjs';
1
+ import { Harness } from '../harness.mjs';
2
2
  import { JudgeHarness } from './judgeHarness.mjs';
3
3
  import { Judge, JudgeContext } from './types.mjs';
4
4
  import { JsonValue } from '@vitest-evals/core';
@@ -85,6 +85,8 @@ type FactualityJudgeConfig = {
85
85
  name?: string;
86
86
  /** Default judge-side harness used when matcher options do not provide one. */
87
87
  judgeHarness?: JudgeHarness;
88
+ /** Expert answer or reference facts used by this judge instance. */
89
+ expected?: FactualityJudgeExpected;
88
90
  };
89
91
  /**
90
92
  * Matcher context accepted by `FactualityJudge()`.
@@ -106,17 +108,17 @@ type FactualityJudgeConfig = {
106
108
  * });
107
109
  * ```
108
110
  */
109
- type FactualityJudgeOptions<TInput = any, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata, THarness extends Harness<TInput, TOutput, TMetadata> | undefined = Harness<TInput, TOutput, TMetadata> | undefined> = JudgeContext<TInput, TOutput, TMetadata, THarness> & {
110
- /** Expert answer or reference facts. Defaults to `metadata.expected`. */
111
+ type FactualityJudgeOptions<TInput = any, TOutput extends JsonValue | undefined = any, THarness extends Harness<TInput, TOutput> | undefined = any> = JudgeContext<TInput, TOutput, THarness> & {
112
+ /** Expert answer or reference facts. Overrides the judge config default. */
111
113
  expected?: FactualityJudgeExpected;
112
114
  };
113
115
  /**
114
116
  * Creates a factuality judge over normalized harness output.
115
117
  *
116
- * `FactualityJudge()` compares `input`, `output`, and `expected` from the
117
- * current `JudgeContext`, so the same judge can run against any application
118
- * harness. Configure the LLM used for grading with `judgeHarness` on the
119
- * judge, suite, or matcher options.
118
+ * `FactualityJudge()` compares `input`, `output`, and an expert answer. Bind a
119
+ * suite-wide expert answer on the judge config, or pass a case-specific
120
+ * `expected` value to `toSatisfyJudge(...)`. Configure the LLM used for grading
121
+ * with `judgeHarness` on the judge, suite, or matcher options.
120
122
  *
121
123
  * @param config - Optional judge name and reusable judge harness default.
122
124
  *
@@ -131,18 +133,17 @@ type FactualityJudgeOptions<TInput = any, TOutput extends JsonValue | undefined
131
133
  * model: anthropic("claude-sonnet-4-5"),
132
134
  * temperature: 0,
133
135
  * });
134
- * const factualityJudge = FactualityJudge({ judgeHarness });
136
+ * const factualityJudge = FactualityJudge({
137
+ * judgeHarness,
138
+ * expected: "Paris is the capital of France.",
139
+ * });
135
140
  *
136
141
  * describeEval("qa agent", {
137
142
  * harness: qaHarness,
138
143
  * judges: [factualityJudge],
139
144
  * }, (it) => {
140
145
  * it("answers a geography question", async ({ run }) => {
141
- * await run("What is the capital of France?", {
142
- * metadata: {
143
- * expected: "Paris is the capital of France.",
144
- * },
145
- * });
146
+ * await run("What is the capital of France?");
146
147
  * });
147
148
  * });
148
149
  * ```
@@ -1,4 +1,4 @@
1
- import { HarnessMetadata, Harness } from '../harness.js';
1
+ import { Harness } from '../harness.js';
2
2
  import { JudgeHarness } from './judgeHarness.js';
3
3
  import { Judge, JudgeContext } from './types.js';
4
4
  import { JsonValue } from '@vitest-evals/core';
@@ -85,6 +85,8 @@ type FactualityJudgeConfig = {
85
85
  name?: string;
86
86
  /** Default judge-side harness used when matcher options do not provide one. */
87
87
  judgeHarness?: JudgeHarness;
88
+ /** Expert answer or reference facts used by this judge instance. */
89
+ expected?: FactualityJudgeExpected;
88
90
  };
89
91
  /**
90
92
  * Matcher context accepted by `FactualityJudge()`.
@@ -106,17 +108,17 @@ type FactualityJudgeConfig = {
106
108
  * });
107
109
  * ```
108
110
  */
109
- type FactualityJudgeOptions<TInput = any, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata, THarness extends Harness<TInput, TOutput, TMetadata> | undefined = Harness<TInput, TOutput, TMetadata> | undefined> = JudgeContext<TInput, TOutput, TMetadata, THarness> & {
110
- /** Expert answer or reference facts. Defaults to `metadata.expected`. */
111
+ type FactualityJudgeOptions<TInput = any, TOutput extends JsonValue | undefined = any, THarness extends Harness<TInput, TOutput> | undefined = any> = JudgeContext<TInput, TOutput, THarness> & {
112
+ /** Expert answer or reference facts. Overrides the judge config default. */
111
113
  expected?: FactualityJudgeExpected;
112
114
  };
113
115
  /**
114
116
  * Creates a factuality judge over normalized harness output.
115
117
  *
116
- * `FactualityJudge()` compares `input`, `output`, and `expected` from the
117
- * current `JudgeContext`, so the same judge can run against any application
118
- * harness. Configure the LLM used for grading with `judgeHarness` on the
119
- * judge, suite, or matcher options.
118
+ * `FactualityJudge()` compares `input`, `output`, and an expert answer. Bind a
119
+ * suite-wide expert answer on the judge config, or pass a case-specific
120
+ * `expected` value to `toSatisfyJudge(...)`. Configure the LLM used for grading
121
+ * with `judgeHarness` on the judge, suite, or matcher options.
120
122
  *
121
123
  * @param config - Optional judge name and reusable judge harness default.
122
124
  *
@@ -131,18 +133,17 @@ type FactualityJudgeOptions<TInput = any, TOutput extends JsonValue | undefined
131
133
  * model: anthropic("claude-sonnet-4-5"),
132
134
  * temperature: 0,
133
135
  * });
134
- * const factualityJudge = FactualityJudge({ judgeHarness });
136
+ * const factualityJudge = FactualityJudge({
137
+ * judgeHarness,
138
+ * expected: "Paris is the capital of France.",
139
+ * });
135
140
  *
136
141
  * describeEval("qa agent", {
137
142
  * harness: qaHarness,
138
143
  * judges: [factualityJudge],
139
144
  * }, (it) => {
140
145
  * it("answers a geography question", async ({ run }) => {
141
- * await run("What is the capital of France?", {
142
- * metadata: {
143
- * expected: "Paris is the capital of France.",
144
- * },
145
- * });
146
+ * await run("What is the capital of France?");
146
147
  * });
147
148
  * });
148
149
  * ```
@@ -32,7 +32,6 @@ var import_core2 = require("@vitest-evals/core");
32
32
  async function runJudgeHarness(judgeHarness, input, options = {}) {
33
33
  const artifacts = {};
34
34
  const run = await judgeHarness.run(input, {
35
- metadata: options.metadata ?? {},
36
35
  signal: options.signal,
37
36
  artifacts,
38
37
  setArtifact: (name, value) => {
@@ -46,8 +45,7 @@ function createRunJudge(judgeHarness, signal) {
46
45
  return void 0;
47
46
  }
48
47
  return (input, options) => runJudgeHarness(judgeHarness, input, {
49
- metadata: options?.metadata,
50
- signal
48
+ signal: options?.signal ?? signal
51
49
  });
52
50
  }
53
51
  function resolveJudgeHarnessAssistantOutput(run) {
@@ -81,22 +79,24 @@ function FactualityJudge(config = {}) {
81
79
  return {
82
80
  name: config.name ?? "FactualityJudge",
83
81
  judgeHarness,
84
- assess: (opts) => assessFactuality(opts, judgeHarness)
82
+ assess: (opts) => assessFactuality(opts, {
83
+ expected: config.expected,
84
+ judgeHarness
85
+ })
85
86
  };
86
87
  }
87
- async function assessFactuality(opts, configuredJudgeHarness) {
88
- const metadata = opts.metadata;
89
- const expected = opts.expected === void 0 ? metadata.expected : opts.expected;
88
+ async function assessFactuality(opts, config) {
89
+ const expected = opts.expected ?? config.expected;
90
90
  if (isMissingExpectedAnswer(expected)) {
91
91
  return {
92
92
  score: 0,
93
93
  metadata: {
94
- rationale: "FactualityJudge requires a non-empty expert answer in `expected` or `metadata.expected`."
94
+ rationale: "FactualityJudge requires a non-empty expert answer in `expected` or FactualityJudge(...) config."
95
95
  }
96
96
  };
97
97
  }
98
98
  const runJudge = opts.runJudge ?? createRunJudge(
99
- configuredJudgeHarness,
99
+ config.judgeHarness,
100
100
  opts.signal
101
101
  );
102
102
  if (!runJudge) {
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/judges/factualityJudge.ts","../../src/harness.ts","../../src/judges/judgeHarness.ts"],"sourcesContent":["import {\n type Harness,\n type HarnessMetadata,\n latestAssistantMessageContent,\n} from \"../harness\";\nimport type { JsonValue } from \"../harness\";\nimport { createRunJudge } from \"./judgeHarness\";\nimport type { JudgeHarness } from \"./judgeHarness\";\nimport type { Judge, JudgeContext, JudgeResult } from \"./types\";\n\n/**\n * Rubric choice returned by a factuality judge model call.\n *\n * @example\n * ```ts\n * import type { FactualityJudgeChoice } from \"vitest-evals\";\n *\n * const choice: FactualityJudgeChoice = \"C\";\n * ```\n */\nexport type FactualityJudgeChoice = \"A\" | \"B\" | \"C\" | \"D\" | \"E\";\n\n/**\n * Prompt payload sent to the configured judge harness.\n *\n * @example\n * ```ts\n * import type { FactualityJudgePrompt } from \"vitest-evals\";\n *\n * const payload: FactualityJudgePrompt = {\n * system: \"Grade factual consistency.\",\n * prompt: \"Compare these answers.\",\n * };\n * ```\n */\nexport type FactualityJudgePrompt = {\n /** System prompt for the judge model. */\n system: string;\n /** User prompt containing the question, expert answer, submitted answer, and rubric. */\n prompt: string;\n};\n\n/**\n * Parsed verdict returned by a factuality judge model call.\n *\n * @example\n * ```ts\n * import type { FactualityJudgeVerdict } from \"vitest-evals\";\n *\n * const verdict: FactualityJudgeVerdict = {\n * choice: \"C\",\n * rationale: \"The submitted answer matches the expert answer.\",\n * };\n * ```\n */\nexport type FactualityJudgeVerdict = {\n /** Rubric choice selected by the judge model. */\n choice: FactualityJudgeChoice;\n /** Human-readable explanation for the selected choice. */\n rationale: string;\n};\n\nconst FACTUALITY_CHOICE_SCORES: Record<FactualityJudgeChoice, number> = {\n A: 0.4,\n B: 0.6,\n C: 1,\n D: 0,\n E: 1,\n};\n\nconst FACTUALITY_SYSTEM =\n \"You are comparing factual content. Ignore differences in style, grammar, punctuation, and formatting.\";\n\nconst FACTUALITY_RESPONSE_SCHEMA = {\n type: \"object\",\n additionalProperties: false,\n required: [\"choice\", \"rationale\"],\n properties: {\n choice: {\n enum: [\"A\", \"B\", \"C\", \"D\", \"E\"],\n },\n rationale: {\n type: \"string\",\n },\n },\n} as const satisfies JsonValue;\n\n/**\n * Expert answer or reference facts accepted by `FactualityJudge()`.\n *\n * @example\n * ```ts\n * import type { FactualityJudgeExpected } from \"vitest-evals\";\n *\n * const expected: FactualityJudgeExpected =\n * \"Paris is the capital of France.\";\n * ```\n */\nexport type FactualityJudgeExpected = JsonValue;\n\n/**\n * Configuration for the factuality judge.\n *\n * The judge harness can be supplied here, by `describeEval({ judgeHarness })`,\n * or by `expect(...).toSatisfyJudge(..., { judgeHarness })`. Passing it here\n * keeps the judge self-contained while preserving provider neutrality.\n *\n * @example\n * ```ts\n * import { FactualityJudge, type JudgeHarness } from \"vitest-evals\";\n *\n * declare const judgeHarness: JudgeHarness;\n *\n * const judge = FactualityJudge({ name: \"FactJudge\", judgeHarness });\n * ```\n */\nexport type FactualityJudgeConfig = {\n /** Stable judge name used in assertion messages and reports. */\n name?: string;\n /** Default judge-side harness used when matcher options do not provide one. */\n judgeHarness?: JudgeHarness;\n};\n\ntype FactualityJudgeMetadata = HarnessMetadata & {\n expected?: FactualityJudgeExpected;\n};\n\n/**\n * Matcher context accepted by `FactualityJudge()`.\n *\n * @example\n * ```ts\n * import { aiSdkJudgeHarness } from \"@vitest-evals/harness-ai-sdk\";\n * import { openai } from \"@ai-sdk/openai\";\n * import { expect } from \"vitest\";\n * import { FactualityJudge } from \"vitest-evals\";\n *\n * const judgeHarness = aiSdkJudgeHarness({\n * model: openai(\"gpt-4.1-mini\"),\n * });\n *\n * await expect(result).toSatisfyJudge(FactualityJudge(), {\n * expected: \"Paris is the capital of France.\",\n * judgeHarness,\n * });\n * ```\n */\nexport type FactualityJudgeOptions<\n TInput = any,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n THarness extends Harness<TInput, TOutput, TMetadata> | undefined =\n | Harness<TInput, TOutput, TMetadata>\n | undefined,\n> = JudgeContext<TInput, TOutput, TMetadata, THarness> & {\n /** Expert answer or reference facts. Defaults to `metadata.expected`. */\n expected?: FactualityJudgeExpected;\n};\n\n/**\n * Creates a factuality judge over normalized harness output.\n *\n * `FactualityJudge()` compares `input`, `output`, and `expected` from the\n * current `JudgeContext`, so the same judge can run against any application\n * harness. Configure the LLM used for grading with `judgeHarness` on the\n * judge, suite, or matcher options.\n *\n * @param config - Optional judge name and reusable judge harness default.\n *\n * @example\n * ```ts\n * import { anthropic } from \"@ai-sdk/anthropic\";\n * import { aiSdkJudgeHarness } from \"@vitest-evals/harness-ai-sdk\";\n * import { describeEval, FactualityJudge } from \"vitest-evals\";\n * import { qaHarness } from \"./qaHarness\";\n *\n * const judgeHarness = aiSdkJudgeHarness({\n * model: anthropic(\"claude-sonnet-4-5\"),\n * temperature: 0,\n * });\n * const factualityJudge = FactualityJudge({ judgeHarness });\n *\n * describeEval(\"qa agent\", {\n * harness: qaHarness,\n * judges: [factualityJudge],\n * }, (it) => {\n * it(\"answers a geography question\", async ({ run }) => {\n * await run(\"What is the capital of France?\", {\n * metadata: {\n * expected: \"Paris is the capital of France.\",\n * },\n * });\n * });\n * });\n * ```\n */\nexport function FactualityJudge(\n config: FactualityJudgeConfig = {},\n): Judge<FactualityJudgeOptions> {\n const judgeHarness = config.judgeHarness;\n\n return {\n name: config.name ?? \"FactualityJudge\",\n judgeHarness,\n assess: (opts) => assessFactuality(opts, judgeHarness),\n };\n}\n\nasync function assessFactuality(\n opts: FactualityJudgeOptions,\n configuredJudgeHarness: JudgeHarness | undefined,\n) {\n const metadata = opts.metadata as FactualityJudgeMetadata;\n const expected =\n opts.expected === undefined ? metadata.expected : opts.expected;\n\n if (isMissingExpectedAnswer(expected)) {\n return {\n score: 0,\n metadata: {\n rationale:\n \"FactualityJudge requires a non-empty expert answer in `expected` or `metadata.expected`.\",\n },\n };\n }\n\n const runJudge =\n opts.runJudge ??\n createRunJudge(\n configuredJudgeHarness,\n (opts as { signal?: AbortSignal }).signal,\n );\n\n if (!runJudge) {\n throw new Error(\n \"FactualityJudge requires a judgeHarness in FactualityJudge(...) config, describeEval(...) options, toSatisfyJudge(...) options, or JudgeContext.runJudge.\",\n );\n }\n\n const verdict = await runJudge({\n system: FACTUALITY_SYSTEM,\n prompt: formatFactualityPrompt({\n input: opts.input,\n expected,\n output: resolveJudgeOutput(opts),\n }),\n responseFormat: {\n type: \"json\",\n schema: FACTUALITY_RESPONSE_SCHEMA,\n },\n });\n\n return formatJudgeResult(parseFactualityJudgeVerdict(verdict));\n}\n\nfunction isMissingExpectedAnswer(value: FactualityJudgeExpected | undefined) {\n return (\n value == null || (typeof value === \"string\" && value.trim().length === 0)\n );\n}\n\nfunction resolveJudgeOutput(opts: FactualityJudgeOptions) {\n if (opts.output !== undefined) {\n return opts.output;\n }\n\n return latestAssistantMessageContent(opts.session) ?? \"\";\n}\n\nfunction parseFactualityJudgeVerdict(value: unknown): FactualityJudgeVerdict {\n const parsed = typeof value === \"string\" ? parseJsonObject(value) : value;\n\n if (!parsed || typeof parsed !== \"object\") {\n throw new Error(\n \"FactualityJudge judgeHarness must return an object with `choice` and `rationale`.\",\n );\n }\n\n const verdict = parsed as Record<string, unknown>;\n if (!isFactualityChoice(verdict.choice)) {\n throw new Error(\n \"FactualityJudge judgeHarness must return choice A, B, C, D, or E.\",\n );\n }\n\n if (typeof verdict.rationale !== \"string\") {\n throw new Error(\n \"FactualityJudge judgeHarness must return a string `rationale`.\",\n );\n }\n\n return {\n choice: verdict.choice,\n rationale: verdict.rationale,\n };\n}\n\nfunction parseJsonObject(value: string) {\n try {\n return JSON.parse(value);\n } catch {\n const fencedJson = value.match(/```(?:json)?\\s*([\\s\\S]*?)\\s*```/i);\n if (!fencedJson) {\n throw new Error(\n \"FactualityJudge judgeHarness must return JSON with `choice` and `rationale`.\",\n );\n }\n\n return JSON.parse(fencedJson[1]);\n }\n}\n\nfunction isFactualityChoice(value: unknown): value is FactualityJudgeChoice {\n return (\n value === \"A\" ||\n value === \"B\" ||\n value === \"C\" ||\n value === \"D\" ||\n value === \"E\"\n );\n}\n\nfunction formatFactualityPrompt({\n input,\n expected,\n output,\n}: {\n input: unknown;\n expected: unknown;\n output: unknown;\n}) {\n const comparison = formatJudgeValue({\n question: input ?? \"\",\n expert_answer: expected,\n submitted_answer: output ?? \"\",\n });\n\n return `Compare the submitted answer with the expert answer.\n\nComparison payload:\n${comparison}\n\nSelect exactly one option:\nA: The submission is a fully consistent subset of the expert answer.\nB: The submission is a fully consistent superset of the expert answer.\nC: The submission contains the same factual details as the expert answer.\nD: The submission disagrees with the expert answer.\nE: The answers differ only in ways that do not affect factuality.\n\nReturn JSON with exactly these fields:\n{\n \"choice\": \"C\",\n \"rationale\": \"Brief explanation for the selected choice\"\n}\n\nThe choice value must be one of A, B, C, D, or E.`;\n}\n\nfunction formatJudgeValue(value: unknown) {\n if (typeof value === \"string\") {\n return value;\n }\n\n if (value === undefined) {\n return \"\";\n }\n\n try {\n return JSON.stringify(value, null, 2) ?? String(value);\n } catch {\n return String(value);\n }\n}\n\nfunction formatJudgeResult(object: FactualityJudgeVerdict): JudgeResult {\n return {\n score: FACTUALITY_CHOICE_SCORES[object.choice],\n metadata: {\n rationale: object.rationale,\n choice: object.choice,\n },\n };\n}\n","import {\n assistantMessages,\n failedSpans,\n latestAssistantMessageContent,\n messagesByRole,\n spans,\n spansByKind,\n systemMessages,\n toolCalls,\n toolMessages,\n userMessages,\n} from \"@vitest-evals/core\";\nimport type {\n GenAiOperationName,\n HarnessRun,\n HarnessRunError,\n JsonPrimitive,\n JsonValue,\n NormalizedMessage,\n NormalizedSession,\n NormalizedSpan,\n NormalizedSpanAttributes,\n NormalizedSpanEvent,\n NormalizedTrace,\n TimingSummary,\n ToolCallRecord,\n UsageSummary,\n} from \"@vitest-evals/core\";\n\nexport {\n assistantMessages,\n failedSpans,\n latestAssistantMessageContent,\n messagesByRole,\n spans,\n spansByKind,\n systemMessages,\n toolCalls,\n toolMessages,\n userMessages,\n} from \"@vitest-evals/core\";\nexport type {\n GenAiOperationName,\n GenAiOutputType,\n GenAiProviderName,\n GenAiSemanticAttributeKey,\n GenAiSemanticAttributes,\n GenAiTokenType,\n GenAiToolType,\n HarnessRun,\n HarnessRunError,\n JsonPrimitive,\n JsonValue,\n NormalizedMessage,\n NormalizedSession,\n NormalizedSpan,\n NormalizedSpanAttributeKey,\n NormalizedSpanAttributes,\n NormalizedSpanEvent,\n NormalizedTrace,\n OpenTelemetrySemanticAttributeKey,\n OpenTelemetrySemanticAttributes,\n TimingSummary,\n ToolCallRecord,\n UsageSummary,\n} from \"@vitest-evals/core\";\n\n/** Options for converting normalized tool calls into trace spans. */\nexport type CreateToolCallSpansOptions = {\n /** Trace id to attach to each generated tool span. */\n traceId?: string;\n /** Parent span id to attach to each generated tool span. */\n parentId?: string;\n /** Prefix used to create internal span ids instead of reusing tool-call ids. */\n spanIdPrefix?: string;\n};\n\n/** Options for attaching a fallback run trace to a harness result. */\nexport type EnsureRunTraceOptions = {\n /** Human-readable run or harness name. */\n name: string;\n /** Wall-clock start time for the harness run. */\n startedAt: Date;\n /** Wall-clock finish time for the harness run. */\n finishedAt: Date;\n /** Optional trace id. A generated id is used when omitted. */\n id?: string;\n /** GenAI operation name to place on the root run span. */\n operationName?: GenAiOperationName;\n /** Optional JSON-safe source marker for the trace metadata. */\n source?: string;\n};\n\ntype OutputField<TOutput extends JsonValue | undefined> =\n undefined extends TOutput ? { output?: TOutput } : { output: TOutput };\n\n/** Per-run metadata shape accepted by harnesses and eval tests. */\nexport type HarnessMetadata = Record<string, unknown>;\n\n/**\n * Runtime context passed from the eval fixture into a harness run.\n *\n * @example\n * ```ts\n * const harness: Harness<string> = {\n * name: \"refund-agent\",\n * async run(input, context) {\n * context.setArtifact(\"inputLength\", input.length);\n *\n * return {\n * output: undefined,\n * session: { messages: [{ role: \"user\", content: input }] },\n * usage: {},\n * errors: [],\n * };\n * },\n * };\n * ```\n */\nexport type HarnessContext<\n TMetadata extends HarnessMetadata = HarnessMetadata,\n> = {\n /** Per-run metadata passed through `run(input, { metadata })`. */\n metadata: Readonly<TMetadata>;\n /** Abort signal from Vitest when available. */\n signal?: AbortSignal;\n /** Mutable JSON-safe artifact bag shared with the harness. */\n artifacts: Record<string, JsonValue>;\n /** Stores one JSON-safe artifact on the current run. */\n setArtifact: (name: string, value: JsonValue) => void;\n};\n\n/**\n * Adapter that executes the system under test and returns a normalized run.\n *\n * @example\n * ```ts\n * const harness: Harness<string, { status: \"approved\" | \"denied\" }> = {\n * name: \"refund-agent\",\n * async run(input, context) {\n * return normalizeHarnessRun(input, await runRefundFlow(input), context);\n * },\n * };\n * ```\n */\nexport type Harness<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n> = {\n /** Stable harness name used in reports. */\n name: string;\n /** Executes the system under test and returns a normalized run. */\n run: (\n input: TInput,\n context: HarnessContext<TMetadata>,\n ) => Promise<HarnessRun<TOutput>>;\n};\n\n/** Value or promise accepted by lightweight harness callbacks. */\nexport type MaybePromise<T> = T | Promise<T>;\n\n/** Lightweight tool-call record accepted by `createHarness(...)` results. */\nexport type SimpleToolCallRecord = Omit<\n ToolCallRecord,\n \"arguments\" | \"result\" | \"error\" | \"metadata\"\n> & {\n /** Raw tool arguments accepted by `createHarness(...)` before normalization. */\n arguments?: unknown;\n /** Raw tool result accepted by `createHarness(...)` before normalization. */\n result?: unknown;\n /** Raw tool error accepted by `createHarness(...)` before normalization. */\n error?: unknown;\n /** Raw tool metadata accepted by `createHarness(...)` before normalization. */\n metadata?: Record<string, unknown>;\n};\n\n/** Lightweight span event accepted by `createHarness(...)` results. */\nexport type SimpleSpanEvent = Omit<NormalizedSpanEvent, \"attributes\"> & {\n /** Raw event attributes accepted by `createHarness(...)` before normalization. */\n attributes?: Record<string, unknown>;\n};\n\n/** Lightweight span record accepted by `createHarness(...)` results. */\nexport type SimpleSpanRecord = Omit<\n NormalizedSpan,\n \"attributes\" | \"error\" | \"events\"\n> & {\n /** Raw span attributes accepted by `createHarness(...)` before normalization. */\n attributes?: Record<string, unknown>;\n /** Raw span error accepted by `createHarness(...)` before normalization. */\n error?: unknown;\n /** Raw span events accepted by `createHarness(...)` before normalization. */\n events?: SimpleSpanEvent[];\n};\n\n/** Lightweight trace record accepted by `createHarness(...)` results. */\nexport type SimpleTraceRecord = Omit<NormalizedTrace, \"metadata\" | \"spans\"> & {\n /** Raw trace metadata accepted by `createHarness(...)` before normalization. */\n metadata?: Record<string, unknown>;\n /** Lightweight spans to normalize into the trace. */\n spans: SimpleSpanRecord[];\n};\n\n/**\n * Lightweight result shape normalized by `createHarness(...)`.\n *\n * @example\n * ```ts\n * const result: SimpleHarnessResult<{ status: \"approved\" }> = {\n * output: { status: \"approved\" },\n * toolCalls: [{ name: \"lookupInvoice\", arguments: { invoiceId: \"inv_123\" } }],\n * usage: { totalTokens: 260 },\n * };\n * ```\n */\nexport type SimpleHarnessResult<\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = OutputField<TOutput> & {\n /** Pre-normalized transcript messages. When omitted, a default user/assistant transcript is created. */\n messages?: NormalizedMessage[];\n /** Lightweight tool-call records to normalize into the session. */\n toolCalls?: SimpleToolCallRecord[];\n /** Usage summary to attach to the run. */\n usage?: UsageSummary;\n /** Timing summary to attach to the run. */\n timings?: TimingSummary;\n /** Raw artifact values to normalize and merge into the run. */\n artifacts?: Record<string, unknown>;\n /** Lightweight traces and spans to normalize into the run. */\n traces?: SimpleTraceRecord[];\n /** Raw session metadata to normalize into the session. */\n metadata?: Record<string, unknown>;\n /** Raw errors to normalize into the run. */\n errors?: unknown[];\n};\n\n/** Either a complete normalized run or a lightweight result to normalize. */\nexport type HarnessResultLike<\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = HarnessRun<TOutput> | SimpleHarnessResult<TOutput>;\n\n/** Arguments passed to the `createHarness(...)` convenience callback. */\nexport type CreateHarnessRunArgs<TInput, TMetadata extends HarnessMetadata> = {\n /** Original input passed to `run(input)`. */\n input: TInput;\n /** Read-only metadata passed to `run(input, { metadata })`. */\n metadata: Readonly<TMetadata>;\n /** Abort signal from Vitest when available. */\n signal?: AbortSignal;\n /** Mutable run artifact bag. */\n artifacts: HarnessContext<TMetadata>[\"artifacts\"];\n /** Stores one JSON-safe artifact on the current run. */\n setArtifact: HarnessContext<TMetadata>[\"setArtifact\"];\n};\n\n/**\n * Options for creating a lightweight custom application harness.\n *\n * @example\n * ```ts\n * const options: CreateHarnessOptions<string, { status: \"approved\" }> = {\n * name: \"refund-agent\",\n * run: async ({ input }) => ({\n * output: await classifyRefund(input),\n * }),\n * };\n * ```\n */\nexport type CreateHarnessOptions<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n> = {\n /** Stable harness name used in reports. */\n name: string;\n /** Executes application code and returns either a lightweight result or full `HarnessRun`. */\n run: (\n args: CreateHarnessRunArgs<TInput, TMetadata>,\n ) => MaybePromise<HarnessResultLike<TOutput>>;\n};\n\nfunction isJsonPrimitive(value: unknown): value is JsonPrimitive {\n return (\n value === null ||\n typeof value === \"string\" ||\n typeof value === \"boolean\" ||\n (typeof value === \"number\" && Number.isFinite(value))\n );\n}\n\nfunction isJsonRecord(value: unknown): value is Record<string, unknown> {\n return typeof value === \"object\" && value !== null && !Array.isArray(value);\n}\n\nfunction normalizeJsonArray(value: unknown[], seen: WeakSet<object>) {\n if (seen.has(value)) {\n return undefined;\n }\n\n seen.add(value);\n const normalized = value.map((item) => {\n const normalized = toJsonValueInternal(item, seen);\n return normalized === undefined ? null : normalized;\n });\n seen.delete(value);\n\n return normalized;\n}\n\nfunction normalizeJsonObject(\n value: Record<string, unknown>,\n seen: WeakSet<object>,\n): Record<string, JsonValue> {\n const normalized: Record<string, JsonValue> = {};\n\n if (seen.has(value)) {\n return normalized;\n }\n\n seen.add(value);\n try {\n for (const [key, entryValue] of Object.entries(value)) {\n const entry = toJsonValueInternal(entryValue, seen);\n if (entry !== undefined) {\n normalized[key] = entry;\n }\n }\n } finally {\n seen.delete(value);\n }\n\n return normalized;\n}\n\n/** Returns true when a value exposes a callable method with the given name. */\nexport function hasCallableMethod(value: unknown, methodName: string) {\n return (\n value !== null &&\n (typeof value === \"object\" || typeof value === \"function\") &&\n methodName in value &&\n typeof (value as Record<string, unknown>)[methodName] === \"function\"\n );\n}\n\n/** Normalizes an unknown value into the JSON-safe shape used by harness runs. */\nexport function toJsonValue(value: unknown): JsonValue | undefined {\n return toJsonValueInternal(value, new WeakSet());\n}\n\nfunction toJsonValueInternal(\n value: unknown,\n seen: WeakSet<object>,\n): JsonValue | undefined {\n if (isJsonPrimitive(value)) {\n return value;\n }\n\n if (\n value !== null &&\n typeof value === \"object\" &&\n seen.has(value as object)\n ) {\n return undefined;\n }\n\n if (Array.isArray(value)) {\n return normalizeJsonArray(value, seen);\n }\n\n if (isJsonRecord(value)) {\n return normalizeJsonObject(value, seen);\n }\n\n return undefined;\n}\n\n/** Drops non-JSON properties from a record while preserving valid values. */\nexport function normalizeRecord(\n value: Record<string, unknown>,\n): Record<string, JsonValue> {\n return normalizeJsonObject(value, new WeakSet());\n}\n\n/** Normalizes metadata and omits the field entirely when nothing survives. */\nexport function normalizeMetadata(\n value: Record<string, unknown>,\n): Record<string, JsonValue> | undefined {\n const normalized = normalizeRecord(value);\n return Object.keys(normalized).length > 0 ? normalized : undefined;\n}\n\n/** Converts arbitrary content into the JSON-safe message content shape. */\nexport function normalizeContent(value: unknown): JsonValue {\n const normalized = toJsonValue(value);\n return normalized !== undefined ? normalized : String(value);\n}\n\n/**\n * Creates a harness from the common \"run app code and return output\" shape.\n *\n * @param options - Harness name plus the callback that executes app code.\n *\n * @example\n * ```ts\n * import { createHarness } from \"vitest-evals\";\n *\n * export const refundHarness = createHarness<\n * string,\n * { status: \"approved\" | \"denied\" },\n * { expected: { status: \"approved\" | \"denied\" } }\n * >({\n * name: \"refund-agent\",\n * run: async ({ input, metadata, setArtifact }) => {\n * const result = await runRefundFlow(input, metadata);\n * const output = { status: result.status };\n *\n * setArtifact(\"case\", { expected: metadata.expected.status });\n *\n * return {\n * output,\n * toolCalls: result.toolCalls,\n * usage: { provider: \"openai\", model: \"gpt-4o-mini\" },\n * };\n * },\n * });\n * ```\n */\nexport function createHarness<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n>(\n options: CreateHarnessOptions<TInput, TOutput, TMetadata>,\n): Harness<TInput, TOutput, TMetadata>;\nexport function createHarness<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n>(\n options: CreateHarnessOptions<TInput, TOutput, TMetadata>,\n): Harness<TInput, TOutput, TMetadata> {\n const harness: Harness<TInput, TOutput, TMetadata> = {\n name: options.name,\n run: async (input, context) => {\n const startedAt = new Date();\n\n try {\n const result = await options.run({\n input,\n metadata: context.metadata,\n signal: context.signal,\n artifacts: context.artifacts,\n setArtifact: context.setArtifact,\n });\n const run = normalizeHarnessRun(input, result, context);\n ensureRunTrace(run, {\n name: options.name,\n startedAt,\n finishedAt: new Date(),\n });\n\n return run;\n } catch (error) {\n const partialRun = getHarnessRunFromError(error);\n if (partialRun) {\n if (\n Object.keys(context.artifacts).length > 0 &&\n !partialRun.artifacts\n ) {\n partialRun.artifacts = context.artifacts;\n }\n ensureRunTrace(partialRun, {\n name: options.name,\n startedAt,\n finishedAt: new Date(),\n });\n throw attachHarnessRunToError(error, partialRun);\n }\n\n const failedRun = createFailedHarnessRun(input, error, {\n artifacts: context.artifacts,\n });\n ensureRunTrace(failedRun, {\n name: options.name,\n startedAt,\n finishedAt: new Date(),\n });\n\n throw attachHarnessRunToError(error, failedRun);\n }\n },\n };\n\n return harness;\n}\n\n/**\n * Normalizes a lightweight harness result into the reporter-facing run shape.\n *\n * @param input - Original input passed to the harness.\n * @param result - Lightweight result or pre-normalized harness run.\n * @param context - Optional per-run context used to merge artifacts.\n *\n * @example\n * ```ts\n * const run = normalizeHarnessRun(\"Refund invoice inv_123\", {\n * output: { status: \"approved\" },\n * toolCalls: [{ name: \"lookupInvoice\", arguments: { invoiceId: \"inv_123\" } }],\n * usage: { provider: \"openai\", model: \"gpt-4o-mini\" },\n * });\n *\n * expect(toolCalls(run.session)).toHaveLength(1);\n * ```\n */\nexport function normalizeHarnessRun<\n TInput = unknown,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n>(\n input: TInput,\n result: HarnessResultLike<TOutput>,\n context?: HarnessContext<TMetadata>,\n): HarnessRun<TOutput> {\n if (isHarnessRun(result)) {\n if (\n context &&\n Object.keys(context.artifacts).length > 0 &&\n !result.artifacts\n ) {\n return {\n ...result,\n artifacts: context.artifacts,\n };\n }\n\n return result;\n }\n\n const output = result.output;\n const toolCalls = normalizeSimpleToolCalls(result.toolCalls);\n const usage = result.usage ?? {};\n const messages =\n result.messages ??\n createDefaultSessionMessages({\n input,\n output,\n toolCalls,\n });\n const metadata = result.metadata\n ? normalizeMetadata(result.metadata)\n : undefined;\n const artifacts = normalizeMergedArtifacts(\n context?.artifacts,\n result.artifacts,\n );\n const traces = normalizeSimpleTraces(result.traces);\n\n return {\n session: {\n messages,\n ...(usage.provider ? { provider: usage.provider } : {}),\n ...(usage.model ? { model: usage.model } : {}),\n ...(metadata ? { metadata } : {}),\n },\n ...(output !== undefined ? { output } : {}),\n usage,\n ...(result.timings ? { timings: result.timings } : {}),\n ...(artifacts ? { artifacts } : {}),\n ...(traces ? { traces } : {}),\n errors: normalizeSimpleErrors(result.errors),\n } as HarnessRun<TOutput>;\n}\n\n/**\n * Builds a JSON-safe failed run for errors that happen before a harness can return.\n *\n * @param input - Original input passed to the harness.\n * @param error - Error thrown by setup or execution.\n * @param options - Optional artifacts to preserve on the failed run.\n */\nexport function createFailedHarnessRun(\n input: unknown,\n error: unknown,\n options: { artifacts?: Record<string, JsonValue> } = {},\n): HarnessRun {\n const artifacts = options.artifacts;\n\n return {\n session: {\n messages: [\n {\n role: \"user\",\n content: normalizeContent(input),\n },\n ],\n },\n usage: {},\n ...(artifacts && Object.keys(artifacts).length > 0 ? { artifacts } : {}),\n errors: [serializeError(error)],\n };\n}\n\nfunction createDefaultSessionMessages<TInput>({\n input,\n output,\n toolCalls: normalizedToolCalls,\n}: {\n input: TInput;\n output: JsonValue | undefined;\n toolCalls: ToolCallRecord[];\n}): NormalizedMessage[] {\n const messages: NormalizedMessage[] = [\n {\n role: \"user\",\n content: normalizeContent(input),\n },\n ];\n\n if (output !== undefined || normalizedToolCalls.length > 0) {\n messages.push({\n role: \"assistant\",\n ...(output !== undefined ? { content: normalizeContent(output) } : {}),\n ...(normalizedToolCalls.length > 0\n ? { toolCalls: normalizedToolCalls }\n : {}),\n });\n }\n\n return messages;\n}\n\nfunction normalizeSimpleToolCalls(\n calls: SimpleToolCallRecord[] | undefined,\n): ToolCallRecord[] {\n return (calls ?? []).map((call) => {\n const {\n arguments: rawArguments,\n result: rawResult,\n error: rawError,\n metadata: rawMetadata,\n ...toolCall\n } = call;\n const args = normalizeToolCallArguments(rawArguments);\n const result = toJsonValue(rawResult);\n const error = normalizeToolCallError(rawError);\n const metadata = rawMetadata ? normalizeMetadata(rawMetadata) : undefined;\n\n return {\n ...toolCall,\n ...(args ? { arguments: args } : {}),\n ...(result !== undefined ? { result } : {}),\n ...(error ? { error } : {}),\n ...(metadata ? { metadata } : {}),\n };\n });\n}\n\nfunction normalizeToolCallArguments(\n value: unknown,\n): Record<string, JsonValue> | undefined {\n if (value === undefined) {\n return undefined;\n }\n\n const normalized = toJsonValue(value);\n return normalized &&\n typeof normalized === \"object\" &&\n !Array.isArray(normalized)\n ? normalized\n : undefined;\n}\n\nfunction normalizeToolCallError(\n value: unknown,\n): ToolCallRecord[\"error\"] | undefined {\n if (value === undefined) {\n return undefined;\n }\n\n const serialized = serializeError(value);\n const { message, type, ...details } = serialized;\n\n return {\n ...details,\n message: typeof message === \"string\" ? message : String(message),\n ...(typeof type === \"string\" ? { type } : {}),\n };\n}\n\nfunction normalizeMergedArtifacts(\n contextArtifacts: Record<string, JsonValue> | undefined,\n resultArtifacts: Record<string, unknown> | undefined,\n) {\n const artifacts = {\n ...(contextArtifacts ?? {}),\n ...(resultArtifacts ? normalizeRecord(resultArtifacts) : {}),\n };\n\n return Object.keys(artifacts).length > 0 ? artifacts : undefined;\n}\n\nfunction normalizeSimpleErrors(\n errors: unknown[] | undefined,\n): Array<Record<string, JsonValue>> {\n return (errors ?? []).map((error) => {\n const normalized = toJsonValue(error);\n\n if (\n normalized &&\n typeof normalized === \"object\" &&\n !Array.isArray(normalized) &&\n Object.keys(normalized).length > 0\n ) {\n return normalized;\n }\n\n return serializeError(error);\n });\n}\n\nfunction normalizeSimpleTraces(\n traces: SimpleTraceRecord[] | undefined,\n): NormalizedTrace[] | undefined {\n if (!Array.isArray(traces)) {\n return undefined;\n }\n\n const normalized = traces\n .map(normalizeSimpleTrace)\n .filter((trace): trace is NormalizedTrace => Boolean(trace));\n\n return normalized.length > 0 ? normalized : undefined;\n}\n\nfunction normalizeSimpleTrace(trace: unknown): NormalizedTrace | undefined {\n if (!isJsonRecord(trace)) {\n return undefined;\n }\n\n const {\n metadata: rawMetadata,\n spans: rawSpans,\n ...traceFields\n } = trace as Partial<SimpleTraceRecord>;\n const spans = (Array.isArray(rawSpans) ? rawSpans : [])\n .map((span) => normalizeSimpleSpan(span))\n .filter((span): span is NormalizedSpan => Boolean(span));\n const metadata = isJsonRecord(rawMetadata)\n ? normalizeMetadata(rawMetadata)\n : undefined;\n\n if (spans.length === 0 && !traceFields.id && !traceFields.name) {\n return undefined;\n }\n\n return {\n ...traceFields,\n ...(metadata ? { metadata } : {}),\n spans,\n };\n}\n\nfunction normalizeSimpleSpan(span: unknown): NormalizedSpan | undefined {\n if (!isJsonRecord(span) || typeof span.name !== \"string\" || !span.name) {\n return undefined;\n }\n\n const {\n attributes: rawAttributes,\n error: rawError,\n events: rawEvents,\n ...spanFields\n } = span as Partial<SimpleSpanRecord> & { name: string };\n const attributes = rawAttributes\n ? isJsonRecord(rawAttributes)\n ? normalizeMetadata(rawAttributes)\n : undefined\n : undefined;\n const error = normalizeSpanError(rawError);\n const events = normalizeSimpleSpanEvents(rawEvents);\n\n return {\n ...spanFields,\n ...(attributes\n ? { attributes: attributes as NormalizedSpanAttributes }\n : {}),\n ...(error ? { error } : {}),\n ...(events ? { events } : {}),\n };\n}\n\nfunction normalizeSimpleSpanEvents(\n events: unknown,\n): NormalizedSpanEvent[] | undefined {\n if (!Array.isArray(events)) {\n return undefined;\n }\n\n const normalized = events\n .map(normalizeSimpleSpanEvent)\n .filter((event): event is NormalizedSpanEvent => Boolean(event));\n\n return normalized.length > 0 ? normalized : undefined;\n}\n\nfunction normalizeSimpleSpanEvent(\n event: unknown,\n): NormalizedSpanEvent | undefined {\n if (!isJsonRecord(event) || typeof event.name !== \"string\" || !event.name) {\n return undefined;\n }\n\n const { attributes: rawAttributes, ...eventFields } =\n event as Partial<SimpleSpanEvent> & { name: string };\n const attributes = rawAttributes\n ? isJsonRecord(rawAttributes)\n ? normalizeMetadata(rawAttributes)\n : undefined\n : undefined;\n\n return {\n ...eventFields,\n ...(attributes\n ? { attributes: attributes as NormalizedSpanAttributes }\n : {}),\n };\n}\n\n/** Normalizes arbitrary span errors while preserving object-shaped messages. */\nexport function normalizeSpanError(\n error: unknown,\n): NormalizedSpan[\"error\"] | undefined {\n if (error === undefined) {\n return undefined;\n }\n\n if (error instanceof Error) {\n const details = normalizeMetadata(\n error as unknown as Record<string, unknown>,\n );\n\n return {\n ...(details ?? {}),\n type: error.name,\n message: error.message,\n };\n }\n\n if (\n error &&\n typeof error === \"object\" &&\n !Array.isArray(error) &&\n typeof (error as { message?: unknown }).message === \"string\"\n ) {\n const normalized = normalizeMetadata(error as Record<string, unknown>);\n const { message, type, ...details } = normalized ?? {};\n\n return {\n ...details,\n message: message as string,\n ...(typeof type === \"string\" ? { type } : {}),\n };\n }\n\n const serialized = serializeError(error);\n const { message, type, ...details } = serialized;\n\n return {\n ...details,\n message: typeof message === \"string\" ? message : String(message),\n ...(typeof type === \"string\" ? { type } : {}),\n };\n}\n\n/** Normalizes raw span attributes into the JSON-safe span attribute shape. */\nexport function normalizeSpanAttributes(\n attributes: Record<string, unknown>,\n): NormalizedSpanAttributes | undefined {\n return normalizeMetadata(attributes) as NormalizedSpanAttributes | undefined;\n}\n\n/** Builds common OpenTelemetry GenAI usage attributes from a usage summary. */\nexport function createGenAiUsageAttributes(\n usage: UsageSummary | undefined,\n options: { provider?: string } = {},\n) {\n return {\n \"gen_ai.provider.name\": usage?.provider ?? options.provider,\n \"gen_ai.request.model\": usage?.model,\n \"gen_ai.response.model\": usage?.model,\n \"gen_ai.usage.input_tokens\": usage?.inputTokens,\n \"gen_ai.usage.output_tokens\": usage?.outputTokens,\n \"gen_ai.usage.reasoning.output_tokens\": usage?.reasoningTokens,\n } satisfies Record<string, unknown>;\n}\n\n/**\n * Converts normalized tool-call records into trace spans.\n *\n * Tool-call ids are preserved as GenAI attributes. Pass `spanIdPrefix` when the\n * spans belong to a known trace so span ids stay internally unique.\n */\nexport function createToolCallSpans(\n calls: ToolCallRecord[],\n options: CreateToolCallSpansOptions = {},\n): NormalizedSpan[] {\n return calls.map((call, index) => {\n const spanError = call.error ? normalizeSpanError(call.error) : undefined;\n const spanId = options.spanIdPrefix\n ? `${options.spanIdPrefix}:${index + 1}`\n : call.id;\n\n return {\n ...(spanId ? { id: spanId } : {}),\n ...(options.traceId ? { traceId: options.traceId } : {}),\n ...(options.parentId ? { parentId: options.parentId } : {}),\n name: call.name,\n kind: \"tool\",\n ...(call.startedAt ? { startedAt: call.startedAt } : {}),\n ...(call.finishedAt ? { finishedAt: call.finishedAt } : {}),\n ...(call.durationMs !== undefined ? { durationMs: call.durationMs } : {}),\n status: spanError ? \"error\" : \"ok\",\n ...(spanError ? { error: spanError } : {}),\n attributes: normalizeSpanAttributes({\n \"gen_ai.operation.name\": \"execute_tool\",\n \"gen_ai.tool.name\": call.name,\n \"gen_ai.tool.type\": \"function\",\n ...(call.id ? { \"gen_ai.tool.call.id\": call.id } : {}),\n ...(call.arguments !== undefined\n ? { \"gen_ai.tool.call.arguments\": call.arguments }\n : {}),\n ...(call.result !== undefined\n ? { \"gen_ai.tool.call.result\": call.result }\n : {}),\n }),\n } satisfies NormalizedSpan;\n });\n}\n\n/**\n * Attaches a fallback run trace when a harness result does not already contain spans.\n *\n * This keeps custom harnesses inspectable while first-party harness packages\n * remain free to attach richer native traces.\n */\nexport function ensureRunTrace(\n run: HarnessRun,\n options: EnsureRunTraceOptions,\n): NormalizedTrace | undefined {\n if (spans(run).length > 0) {\n return undefined;\n }\n\n const traceId = options.id ?? createGeneratedTraceId();\n const rootSpanId = `${traceId}:run`;\n const durationMs = options.finishedAt.getTime() - options.startedAt.getTime();\n const rootError =\n run.errors.length > 0 ? normalizeSpanError(run.errors[0]) : undefined;\n const runSpan: NormalizedSpan = {\n id: rootSpanId,\n traceId,\n name: options.name,\n kind: \"run\",\n startedAt: options.startedAt.toISOString(),\n finishedAt: options.finishedAt.toISOString(),\n durationMs,\n status: rootError ? \"error\" : \"ok\",\n ...(rootError ? { error: rootError } : {}),\n attributes: normalizeSpanAttributes({\n \"gen_ai.operation.name\": options.operationName ?? \"invoke_workflow\",\n \"gen_ai.workflow.name\": options.name,\n ...createGenAiUsageAttributes(run.usage),\n }),\n };\n const toolSpans = createToolCallSpans(toolCalls(run.session), {\n traceId,\n parentId: rootSpanId,\n spanIdPrefix: `${traceId}:tool`,\n });\n const trace: NormalizedTrace = {\n id: traceId,\n name: options.name,\n startedAt: options.startedAt.toISOString(),\n finishedAt: options.finishedAt.toISOString(),\n durationMs,\n ...(options.source ? { metadata: { source: options.source } } : {}),\n spans: [runSpan, ...toolSpans],\n };\n\n run.traces = [trace];\n return trace;\n}\n\nlet nextGeneratedTraceId = 0;\n\nfunction createGeneratedTraceId() {\n nextGeneratedTraceId += 1;\n return `trace_${nextGeneratedTraceId}`;\n}\n\n/**\n * Attaches a partial or complete harness run to an arbitrary thrown error.\n *\n * @param error - Thrown value to wrap.\n * @param run - Partial or complete normalized harness run to preserve.\n *\n * @example\n * ```ts\n * try {\n * return await runAgent(input);\n * } catch (error) {\n * throw attachHarnessRunToError(error, partialRun);\n * }\n * ```\n */\nexport function attachHarnessRunToError(\n error: unknown,\n run: HarnessRun,\n): HarnessRunError {\n const baseError =\n error instanceof Error\n ? error\n : new Error(String(error ?? \"Unknown error\"));\n return Object.assign(baseError, {\n vitestEvalsRun: run,\n });\n}\n\n/**\n * Reads an attached harness run back off a previously wrapped error value.\n *\n * @param error - Unknown thrown value that may contain a harness run.\n *\n * @example\n * ```ts\n * const partialRun = getHarnessRunFromError(error);\n *\n * if (partialRun) {\n * console.log(toolCalls(partialRun.session));\n * }\n * ```\n */\nexport function getHarnessRunFromError(error: unknown): HarnessRun | undefined {\n if (\n error &&\n typeof error === \"object\" &&\n \"vitestEvalsRun\" in error &&\n isHarnessRun((error as { vitestEvalsRun?: unknown }).vitestEvalsRun)\n ) {\n return (error as { vitestEvalsRun: HarnessRun }).vitestEvalsRun;\n }\n\n return undefined;\n}\n\n/** Returns true when a value matches the normalized `HarnessRun` contract. */\nexport function isHarnessRun(value: unknown): value is HarnessRun {\n if (!value || typeof value !== \"object\") {\n return false;\n }\n\n const candidate = value as {\n session?: unknown;\n usage?: unknown;\n errors?: unknown;\n };\n\n return (\n isNormalizedSession(candidate.session) &&\n Boolean(candidate.usage) &&\n typeof candidate.usage === \"object\" &&\n !Array.isArray(candidate.usage) &&\n Array.isArray(candidate.errors)\n );\n}\n\n/** Returns true when a value matches the normalized session contract. */\nexport function isNormalizedSession(\n value: unknown,\n): value is NormalizedSession {\n return (\n Boolean(value) &&\n typeof value === \"object\" &&\n value !== null &&\n \"messages\" in value &&\n Array.isArray((value as { messages?: unknown }).messages)\n );\n}\n\n/** Reuses pre-normalized harness errors when a runtime already returns them. */\nexport function resolveHarnessRunErrors(\n result: unknown,\n): Array<Record<string, JsonValue>> {\n if (\n result &&\n typeof result === \"object\" &&\n Array.isArray((result as Record<string, unknown>).errors)\n ) {\n return (result as { errors: Array<Record<string, JsonValue>> }).errors;\n }\n\n return [];\n}\n\n/** Serializes an arbitrary thrown value into the normalized error shape. */\nexport function serializeError(error: unknown): Record<string, JsonValue> {\n if (error instanceof Error) {\n return {\n type: error.name,\n message: error.message,\n };\n }\n\n return {\n type: \"Error\",\n message: String(error),\n };\n}\n","import {\n createHarness,\n type Harness,\n type HarnessContext,\n type HarnessMetadata,\n type HarnessResultLike,\n type HarnessRun,\n isHarnessRun,\n latestAssistantMessageContent,\n type JsonValue,\n type MaybePromise,\n normalizeContent,\n} from \"../harness\";\n\n/**\n * Provider-neutral prompt request issued by an LLM-backed judge.\n *\n * @example\n * ```ts\n * const input: JudgeHarnessInput = {\n * system: \"Grade factual consistency.\",\n * prompt: \"Compare the submitted answer with the reference answer.\",\n * responseFormat: {\n * type: \"json\",\n * },\n * };\n * ```\n */\nexport type JudgeHarnessInput = {\n /** Optional system prompt for the judge model. */\n system?: string;\n /** User prompt or instruction payload for the judge model. */\n prompt: string;\n /** Optional response-format hint for adapters that support structured output. */\n responseFormat?: {\n /** Requests a JSON-compatible response. */\n type: \"json\";\n /** Optional JSON Schema passed through to provider-specific adapters. */\n schema?: JsonValue;\n };\n};\n\n/** JSON-safe output returned by a judge harness. */\nexport type JudgeHarnessOutput = JsonValue | undefined;\n\n/**\n * Harness used by LLM-backed judges to issue judge-side prompts.\n *\n * This is separate from the application harness under test.\n *\n * @example\n * ```ts\n * const judgeHarness: JudgeHarness = createJudgeHarness({\n * name: \"judge-model\",\n * run: async ({ prompt }, { signal }) => {\n * return callJudgeModel({ prompt, signal });\n * },\n * });\n * ```\n */\nexport type JudgeHarness = Harness<\n JudgeHarnessInput,\n JudgeHarnessOutput,\n HarnessMetadata\n>;\n\n/** Runtime options supplied when a judge calls `runJudge(...)`. */\nexport type RunJudgeOptions = {\n /** Optional metadata forwarded to the judge harness run. */\n metadata?: HarnessMetadata;\n};\n\n/**\n * Curried judge-harness runner available inside `JudgeContext`.\n *\n * @example\n * ```ts\n * const verdict = await ctx.runJudge?.({\n * prompt: \"Return a JSON verdict.\",\n * responseFormat: { type: \"json\" },\n * });\n * ```\n */\nexport type RunJudge = (\n input: JudgeHarnessInput,\n options?: RunJudgeOptions,\n) => Promise<JudgeHarnessOutput>;\n\n/** Runtime options passed to `createJudgeHarness(...)` callbacks. */\nexport type CreateJudgeHarnessRunOptions = {\n /** Abort signal from the current eval run when available. */\n signal?: AbortSignal;\n /** Metadata for this judge-harness run. */\n metadata: Readonly<HarnessMetadata>;\n};\n\n/**\n * Configuration for `createJudgeHarness(...)`.\n *\n * @example\n * ```ts\n * const judgeHarness = createJudgeHarness({\n * name: \"custom-judge\",\n * run: async ({ system, prompt }, { signal }) => {\n * return callProvider({ system, prompt, signal });\n * },\n * });\n * ```\n */\nexport type CreateJudgeHarnessOptions = {\n /** Stable harness name used in diagnostics. */\n name?: string;\n /**\n * Runs one provider-specific judge prompt.\n *\n * Return a JSON-safe value, a raw provider value to normalize, a lightweight\n * `{ output }` result, or a full normalized `HarnessRun`.\n */\n run: (\n input: JudgeHarnessInput,\n options: CreateJudgeHarnessRunOptions,\n ) => MaybePromise<unknown>;\n};\n\n/**\n * Creates a judge harness from a provider-specific prompt callback.\n *\n * @param options - Harness name plus the callback that issues the judge prompt.\n *\n * @example\n * ```ts\n * const judgeHarness = createJudgeHarness({\n * run: async ({ prompt }) => callJudgeModel(prompt),\n * });\n * ```\n */\nexport function createJudgeHarness(\n options: CreateJudgeHarnessOptions,\n): JudgeHarness {\n return createHarness({\n name: options.name ?? \"judge-harness\",\n run: async ({ input, signal, metadata }) => {\n return normalizeJudgeHarnessResult(\n await options.run(input, { signal, metadata }),\n );\n },\n });\n}\n\n/**\n * Runs a judge harness with eval-scoped context already supplied.\n *\n * @param judgeHarness - Judge-side harness configured on the matcher, judge, or suite.\n * @param input - Provider-neutral judge prompt request.\n * @param options - Run-scoped metadata and abort signal.\n */\nexport async function runJudgeHarness(\n judgeHarness: JudgeHarness,\n input: JudgeHarnessInput,\n options: RunJudgeOptions & { signal?: AbortSignal } = {},\n): Promise<JudgeHarnessOutput> {\n const artifacts: HarnessContext[\"artifacts\"] = {};\n const run = await judgeHarness.run(input, {\n metadata: options.metadata ?? {},\n signal: options.signal,\n artifacts,\n setArtifact: (name, value) => {\n artifacts[name] = value;\n },\n });\n\n return run.output !== undefined\n ? run.output\n : resolveJudgeHarnessAssistantOutput(run);\n}\n\n/** Binds a judge harness to the current eval run context. */\nexport function createRunJudge(\n judgeHarness: JudgeHarness | undefined,\n signal?: AbortSignal,\n): RunJudge | undefined {\n if (!judgeHarness) {\n return undefined;\n }\n\n return (input, options) =>\n runJudgeHarness(judgeHarness, input, {\n metadata: options?.metadata,\n signal,\n });\n}\n\nfunction normalizeJudgeHarnessResult(\n result: Awaited<ReturnType<CreateJudgeHarnessOptions[\"run\"]>>,\n): HarnessResultLike<JudgeHarnessOutput> {\n if (isHarnessRun(result)) {\n return result as HarnessRun<JudgeHarnessOutput>;\n }\n\n if (hasOutputField(result)) {\n return {\n output: normalizeJudgeHarnessOutput(result.output),\n };\n }\n\n return {\n output: normalizeJudgeHarnessOutput(result),\n };\n}\n\nfunction hasOutputField(value: unknown): value is { output?: unknown } {\n return (\n value !== null &&\n typeof value === \"object\" &&\n !Array.isArray(value) &&\n Object.keys(value).length === 1 &&\n \"output\" in value\n );\n}\n\nfunction normalizeJudgeHarnessOutput(value: unknown): JudgeHarnessOutput {\n if (value === undefined) {\n return undefined;\n }\n\n return normalizeContent(value);\n}\n\nfunction resolveJudgeHarnessAssistantOutput(\n run: HarnessRun<JudgeHarnessOutput>,\n): JudgeHarnessOutput {\n return latestAssistantMessageContent(run.session) ?? \"\";\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,kBAWO;AAkBP,IAAAA,eAWO;;;ACoHP,eAAsB,gBACpB,cACA,OACA,UAAsD,CAAC,GAC1B;AAC7B,QAAM,YAAyC,CAAC;AAChD,QAAM,MAAM,MAAM,aAAa,IAAI,OAAO;AAAA,IACxC,UAAU,QAAQ,YAAY,CAAC;AAAA,IAC/B,QAAQ,QAAQ;AAAA,IAChB;AAAA,IACA,aAAa,CAAC,MAAM,UAAU;AAC5B,gBAAU,IAAI,IAAI;AAAA,IACpB;AAAA,EACF,CAAC;AAED,SAAO,IAAI,WAAW,SAClB,IAAI,SACJ,mCAAmC,GAAG;AAC5C;AAGO,SAAS,eACd,cACA,QACsB;AACtB,MAAI,CAAC,cAAc;AACjB,WAAO;AAAA,EACT;AAEA,SAAO,CAAC,OAAO,YACb,gBAAgB,cAAc,OAAO;AAAA,IACnC,UAAU,SAAS;AAAA,IACnB;AAAA,EACF,CAAC;AACL;AAsCA,SAAS,mCACP,KACoB;AACpB,aAAO,4CAA8B,IAAI,OAAO,KAAK;AACvD;;;AF1KA,IAAM,2BAAkE;AAAA,EACtE,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AACL;AAEA,IAAM,oBACJ;AAEF,IAAM,6BAA6B;AAAA,EACjC,MAAM;AAAA,EACN,sBAAsB;AAAA,EACtB,UAAU,CAAC,UAAU,WAAW;AAAA,EAChC,YAAY;AAAA,IACV,QAAQ;AAAA,MACN,MAAM,CAAC,KAAK,KAAK,KAAK,KAAK,GAAG;AAAA,IAChC;AAAA,IACA,WAAW;AAAA,MACT,MAAM;AAAA,IACR;AAAA,EACF;AACF;AA+GO,SAAS,gBACd,SAAgC,CAAC,GACF;AAC/B,QAAM,eAAe,OAAO;AAE5B,SAAO;AAAA,IACL,MAAM,OAAO,QAAQ;AAAA,IACrB;AAAA,IACA,QAAQ,CAAC,SAAS,iBAAiB,MAAM,YAAY;AAAA,EACvD;AACF;AAEA,eAAe,iBACb,MACA,wBACA;AACA,QAAM,WAAW,KAAK;AACtB,QAAM,WACJ,KAAK,aAAa,SAAY,SAAS,WAAW,KAAK;AAEzD,MAAI,wBAAwB,QAAQ,GAAG;AACrC,WAAO;AAAA,MACL,OAAO;AAAA,MACP,UAAU;AAAA,QACR,WACE;AAAA,MACJ;AAAA,IACF;AAAA,EACF;AAEA,QAAM,WACJ,KAAK,YACL;AAAA,IACE;AAAA,IACC,KAAkC;AAAA,EACrC;AAEF,MAAI,CAAC,UAAU;AACb,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,QAAM,UAAU,MAAM,SAAS;AAAA,IAC7B,QAAQ;AAAA,IACR,QAAQ,uBAAuB;AAAA,MAC7B,OAAO,KAAK;AAAA,MACZ;AAAA,MACA,QAAQ,mBAAmB,IAAI;AAAA,IACjC,CAAC;AAAA,IACD,gBAAgB;AAAA,MACd,MAAM;AAAA,MACN,QAAQ;AAAA,IACV;AAAA,EACF,CAAC;AAED,SAAO,kBAAkB,4BAA4B,OAAO,CAAC;AAC/D;AAEA,SAAS,wBAAwB,OAA4C;AAC3E,SACE,SAAS,QAAS,OAAO,UAAU,YAAY,MAAM,KAAK,EAAE,WAAW;AAE3E;AAEA,SAAS,mBAAmB,MAA8B;AACxD,MAAI,KAAK,WAAW,QAAW;AAC7B,WAAO,KAAK;AAAA,EACd;AAEA,aAAO,4CAA8B,KAAK,OAAO,KAAK;AACxD;AAEA,SAAS,4BAA4B,OAAwC;AAC3E,QAAM,SAAS,OAAO,UAAU,WAAW,gBAAgB,KAAK,IAAI;AAEpE,MAAI,CAAC,UAAU,OAAO,WAAW,UAAU;AACzC,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,QAAM,UAAU;AAChB,MAAI,CAAC,mBAAmB,QAAQ,MAAM,GAAG;AACvC,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,MAAI,OAAO,QAAQ,cAAc,UAAU;AACzC,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL,QAAQ,QAAQ;AAAA,IAChB,WAAW,QAAQ;AAAA,EACrB;AACF;AAEA,SAAS,gBAAgB,OAAe;AACtC,MAAI;AACF,WAAO,KAAK,MAAM,KAAK;AAAA,EACzB,QAAQ;AACN,UAAM,aAAa,MAAM,MAAM,kCAAkC;AACjE,QAAI,CAAC,YAAY;AACf,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAEA,WAAO,KAAK,MAAM,WAAW,CAAC,CAAC;AAAA,EACjC;AACF;AAEA,SAAS,mBAAmB,OAAgD;AAC1E,SACE,UAAU,OACV,UAAU,OACV,UAAU,OACV,UAAU,OACV,UAAU;AAEd;AAEA,SAAS,uBAAuB;AAAA,EAC9B;AAAA,EACA;AAAA,EACA;AACF,GAIG;AACD,QAAM,aAAa,iBAAiB;AAAA,IAClC,UAAU,SAAS;AAAA,IACnB,eAAe;AAAA,IACf,kBAAkB,UAAU;AAAA,EAC9B,CAAC;AAED,SAAO;AAAA;AAAA;AAAA,EAGP,UAAU;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAgBZ;AAEA,SAAS,iBAAiB,OAAgB;AACxC,MAAI,OAAO,UAAU,UAAU;AAC7B,WAAO;AAAA,EACT;AAEA,MAAI,UAAU,QAAW;AACvB,WAAO;AAAA,EACT;AAEA,MAAI;AACF,WAAO,KAAK,UAAU,OAAO,MAAM,CAAC,KAAK,OAAO,KAAK;AAAA,EACvD,QAAQ;AACN,WAAO,OAAO,KAAK;AAAA,EACrB;AACF;AAEA,SAAS,kBAAkB,QAA6C;AACtE,SAAO;AAAA,IACL,OAAO,yBAAyB,OAAO,MAAM;AAAA,IAC7C,UAAU;AAAA,MACR,WAAW,OAAO;AAAA,MAClB,QAAQ,OAAO;AAAA,IACjB;AAAA,EACF;AACF;","names":["import_core"]}
1
+ {"version":3,"sources":["../../src/judges/factualityJudge.ts","../../src/harness.ts","../../src/judges/judgeHarness.ts"],"sourcesContent":["import { type Harness, latestAssistantMessageContent } from \"../harness\";\nimport type { JsonValue } from \"../harness\";\nimport { createRunJudge } from \"./judgeHarness\";\nimport type { JudgeHarness } from \"./judgeHarness\";\nimport type { Judge, JudgeContext, JudgeResult } from \"./types\";\n\n/**\n * Rubric choice returned by a factuality judge model call.\n *\n * @example\n * ```ts\n * import type { FactualityJudgeChoice } from \"vitest-evals\";\n *\n * const choice: FactualityJudgeChoice = \"C\";\n * ```\n */\nexport type FactualityJudgeChoice = \"A\" | \"B\" | \"C\" | \"D\" | \"E\";\n\n/**\n * Prompt payload sent to the configured judge harness.\n *\n * @example\n * ```ts\n * import type { FactualityJudgePrompt } from \"vitest-evals\";\n *\n * const payload: FactualityJudgePrompt = {\n * system: \"Grade factual consistency.\",\n * prompt: \"Compare these answers.\",\n * };\n * ```\n */\nexport type FactualityJudgePrompt = {\n /** System prompt for the judge model. */\n system: string;\n /** User prompt containing the question, expert answer, submitted answer, and rubric. */\n prompt: string;\n};\n\n/**\n * Parsed verdict returned by a factuality judge model call.\n *\n * @example\n * ```ts\n * import type { FactualityJudgeVerdict } from \"vitest-evals\";\n *\n * const verdict: FactualityJudgeVerdict = {\n * choice: \"C\",\n * rationale: \"The submitted answer matches the expert answer.\",\n * };\n * ```\n */\nexport type FactualityJudgeVerdict = {\n /** Rubric choice selected by the judge model. */\n choice: FactualityJudgeChoice;\n /** Human-readable explanation for the selected choice. */\n rationale: string;\n};\n\nconst FACTUALITY_CHOICE_SCORES: Record<FactualityJudgeChoice, number> = {\n A: 0.4,\n B: 0.6,\n C: 1,\n D: 0,\n E: 1,\n};\n\nconst FACTUALITY_SYSTEM =\n \"You are comparing factual content. Ignore differences in style, grammar, punctuation, and formatting.\";\n\nconst FACTUALITY_RESPONSE_SCHEMA = {\n type: \"object\",\n additionalProperties: false,\n required: [\"choice\", \"rationale\"],\n properties: {\n choice: {\n enum: [\"A\", \"B\", \"C\", \"D\", \"E\"],\n },\n rationale: {\n type: \"string\",\n },\n },\n} as const satisfies JsonValue;\n\n/**\n * Expert answer or reference facts accepted by `FactualityJudge()`.\n *\n * @example\n * ```ts\n * import type { FactualityJudgeExpected } from \"vitest-evals\";\n *\n * const expected: FactualityJudgeExpected =\n * \"Paris is the capital of France.\";\n * ```\n */\nexport type FactualityJudgeExpected = JsonValue;\n\n/**\n * Configuration for the factuality judge.\n *\n * The judge harness can be supplied here, by `describeEval({ judgeHarness })`,\n * or by `expect(...).toSatisfyJudge(..., { judgeHarness })`. Passing it here\n * keeps the judge self-contained while preserving provider neutrality.\n *\n * @example\n * ```ts\n * import { FactualityJudge, type JudgeHarness } from \"vitest-evals\";\n *\n * declare const judgeHarness: JudgeHarness;\n *\n * const judge = FactualityJudge({ name: \"FactJudge\", judgeHarness });\n * ```\n */\nexport type FactualityJudgeConfig = {\n /** Stable judge name used in assertion messages and reports. */\n name?: string;\n /** Default judge-side harness used when matcher options do not provide one. */\n judgeHarness?: JudgeHarness;\n /** Expert answer or reference facts used by this judge instance. */\n expected?: FactualityJudgeExpected;\n};\n\n/**\n * Matcher context accepted by `FactualityJudge()`.\n *\n * @example\n * ```ts\n * import { aiSdkJudgeHarness } from \"@vitest-evals/harness-ai-sdk\";\n * import { openai } from \"@ai-sdk/openai\";\n * import { expect } from \"vitest\";\n * import { FactualityJudge } from \"vitest-evals\";\n *\n * const judgeHarness = aiSdkJudgeHarness({\n * model: openai(\"gpt-4.1-mini\"),\n * });\n *\n * await expect(result).toSatisfyJudge(FactualityJudge(), {\n * expected: \"Paris is the capital of France.\",\n * judgeHarness,\n * });\n * ```\n */\nexport type FactualityJudgeOptions<\n TInput = any,\n TOutput extends JsonValue | undefined = any,\n THarness extends Harness<TInput, TOutput> | undefined = any,\n> = JudgeContext<TInput, TOutput, THarness> & {\n /** Expert answer or reference facts. Overrides the judge config default. */\n expected?: FactualityJudgeExpected;\n};\n\n/**\n * Creates a factuality judge over normalized harness output.\n *\n * `FactualityJudge()` compares `input`, `output`, and an expert answer. Bind a\n * suite-wide expert answer on the judge config, or pass a case-specific\n * `expected` value to `toSatisfyJudge(...)`. Configure the LLM used for grading\n * with `judgeHarness` on the judge, suite, or matcher options.\n *\n * @param config - Optional judge name and reusable judge harness default.\n *\n * @example\n * ```ts\n * import { anthropic } from \"@ai-sdk/anthropic\";\n * import { aiSdkJudgeHarness } from \"@vitest-evals/harness-ai-sdk\";\n * import { describeEval, FactualityJudge } from \"vitest-evals\";\n * import { qaHarness } from \"./qaHarness\";\n *\n * const judgeHarness = aiSdkJudgeHarness({\n * model: anthropic(\"claude-sonnet-4-5\"),\n * temperature: 0,\n * });\n * const factualityJudge = FactualityJudge({\n * judgeHarness,\n * expected: \"Paris is the capital of France.\",\n * });\n *\n * describeEval(\"qa agent\", {\n * harness: qaHarness,\n * judges: [factualityJudge],\n * }, (it) => {\n * it(\"answers a geography question\", async ({ run }) => {\n * await run(\"What is the capital of France?\");\n * });\n * });\n * ```\n */\nexport function FactualityJudge(\n config: FactualityJudgeConfig = {},\n): Judge<FactualityJudgeOptions> {\n const judgeHarness = config.judgeHarness;\n\n return {\n name: config.name ?? \"FactualityJudge\",\n judgeHarness,\n assess: (opts) =>\n assessFactuality(opts, {\n expected: config.expected,\n judgeHarness,\n }),\n };\n}\n\nasync function assessFactuality(\n opts: FactualityJudgeOptions,\n config: {\n expected: FactualityJudgeExpected | undefined;\n judgeHarness: JudgeHarness | undefined;\n },\n) {\n const expected = opts.expected ?? config.expected;\n\n if (isMissingExpectedAnswer(expected)) {\n return {\n score: 0,\n metadata: {\n rationale:\n \"FactualityJudge requires a non-empty expert answer in `expected` or FactualityJudge(...) config.\",\n },\n };\n }\n\n const runJudge =\n opts.runJudge ??\n createRunJudge(\n config.judgeHarness,\n (opts as { signal?: AbortSignal }).signal,\n );\n\n if (!runJudge) {\n throw new Error(\n \"FactualityJudge requires a judgeHarness in FactualityJudge(...) config, describeEval(...) options, toSatisfyJudge(...) options, or JudgeContext.runJudge.\",\n );\n }\n\n const verdict = await runJudge({\n system: FACTUALITY_SYSTEM,\n prompt: formatFactualityPrompt({\n input: opts.input,\n expected,\n output: resolveJudgeOutput(opts),\n }),\n responseFormat: {\n type: \"json\",\n schema: FACTUALITY_RESPONSE_SCHEMA,\n },\n });\n\n return formatJudgeResult(parseFactualityJudgeVerdict(verdict));\n}\n\nfunction isMissingExpectedAnswer(value: FactualityJudgeExpected | undefined) {\n return (\n value == null || (typeof value === \"string\" && value.trim().length === 0)\n );\n}\n\nfunction resolveJudgeOutput(opts: FactualityJudgeOptions) {\n if (opts.output !== undefined) {\n return opts.output;\n }\n\n return latestAssistantMessageContent(opts.session) ?? \"\";\n}\n\nfunction parseFactualityJudgeVerdict(value: unknown): FactualityJudgeVerdict {\n const parsed = typeof value === \"string\" ? parseJsonObject(value) : value;\n\n if (!parsed || typeof parsed !== \"object\") {\n throw new Error(\n \"FactualityJudge judgeHarness must return an object with `choice` and `rationale`.\",\n );\n }\n\n const verdict = parsed as Record<string, unknown>;\n if (!isFactualityChoice(verdict.choice)) {\n throw new Error(\n \"FactualityJudge judgeHarness must return choice A, B, C, D, or E.\",\n );\n }\n\n if (typeof verdict.rationale !== \"string\") {\n throw new Error(\n \"FactualityJudge judgeHarness must return a string `rationale`.\",\n );\n }\n\n return {\n choice: verdict.choice,\n rationale: verdict.rationale,\n };\n}\n\nfunction parseJsonObject(value: string) {\n try {\n return JSON.parse(value);\n } catch {\n const fencedJson = value.match(/```(?:json)?\\s*([\\s\\S]*?)\\s*```/i);\n if (!fencedJson) {\n throw new Error(\n \"FactualityJudge judgeHarness must return JSON with `choice` and `rationale`.\",\n );\n }\n\n return JSON.parse(fencedJson[1]);\n }\n}\n\nfunction isFactualityChoice(value: unknown): value is FactualityJudgeChoice {\n return (\n value === \"A\" ||\n value === \"B\" ||\n value === \"C\" ||\n value === \"D\" ||\n value === \"E\"\n );\n}\n\nfunction formatFactualityPrompt({\n input,\n expected,\n output,\n}: {\n input: unknown;\n expected: unknown;\n output: unknown;\n}) {\n const comparison = formatJudgeValue({\n question: input ?? \"\",\n expert_answer: expected,\n submitted_answer: output ?? \"\",\n });\n\n return `Compare the submitted answer with the expert answer.\n\nComparison payload:\n${comparison}\n\nSelect exactly one option:\nA: The submission is a fully consistent subset of the expert answer.\nB: The submission is a fully consistent superset of the expert answer.\nC: The submission contains the same factual details as the expert answer.\nD: The submission disagrees with the expert answer.\nE: The answers differ only in ways that do not affect factuality.\n\nReturn JSON with exactly these fields:\n{\n \"choice\": \"C\",\n \"rationale\": \"Brief explanation for the selected choice\"\n}\n\nThe choice value must be one of A, B, C, D, or E.`;\n}\n\nfunction formatJudgeValue(value: unknown) {\n if (typeof value === \"string\") {\n return value;\n }\n\n if (value === undefined) {\n return \"\";\n }\n\n try {\n return JSON.stringify(value, null, 2) ?? String(value);\n } catch {\n return String(value);\n }\n}\n\nfunction formatJudgeResult(object: FactualityJudgeVerdict): JudgeResult {\n return {\n score: FACTUALITY_CHOICE_SCORES[object.choice],\n metadata: {\n rationale: object.rationale,\n choice: object.choice,\n },\n };\n}\n","import {\n assistantMessages,\n failedSpans,\n latestAssistantMessageContent,\n messagesByRole,\n spans,\n spansByKind,\n systemMessages,\n toolCalls,\n toolMessages,\n userMessages,\n} from \"@vitest-evals/core\";\nimport type {\n GenAiOperationName,\n HarnessRun,\n HarnessRunError,\n JsonPrimitive,\n JsonValue,\n NormalizedMessage,\n NormalizedSession,\n NormalizedSpan,\n NormalizedSpanAttributes,\n NormalizedSpanEvent,\n NormalizedTrace,\n TimingSummary,\n ToolCallRecord,\n UsageSummary,\n} from \"@vitest-evals/core\";\n\nexport {\n assistantMessages,\n failedSpans,\n latestAssistantMessageContent,\n messagesByRole,\n spans,\n spansByKind,\n systemMessages,\n toolCalls,\n toolMessages,\n userMessages,\n} from \"@vitest-evals/core\";\nexport type {\n GenAiOperationName,\n GenAiOutputType,\n GenAiProviderName,\n GenAiSemanticAttributeKey,\n GenAiSemanticAttributes,\n GenAiTokenType,\n GenAiToolType,\n HarnessRun,\n HarnessRunError,\n JsonPrimitive,\n JsonValue,\n NormalizedMessage,\n NormalizedSession,\n NormalizedSpan,\n NormalizedSpanAttributeKey,\n NormalizedSpanAttributes,\n NormalizedSpanEvent,\n NormalizedTrace,\n OpenTelemetrySemanticAttributeKey,\n OpenTelemetrySemanticAttributes,\n TimingSummary,\n ToolCallRecord,\n UsageSummary,\n} from \"@vitest-evals/core\";\n\n/** Options for converting normalized tool calls into trace spans. */\nexport type CreateToolCallSpansOptions = {\n /** Trace id to attach to each generated tool span. */\n traceId?: string;\n /** Parent span id to attach to each generated tool span. */\n parentId?: string;\n /** Prefix used to create internal span ids instead of reusing tool-call ids. */\n spanIdPrefix?: string;\n};\n\n/** Options for attaching a fallback run trace to a harness result. */\nexport type EnsureRunTraceOptions = {\n /** Human-readable run or harness name. */\n name: string;\n /** Wall-clock start time for the harness run. */\n startedAt: Date;\n /** Wall-clock finish time for the harness run. */\n finishedAt: Date;\n /** Optional trace id. A generated id is used when omitted. */\n id?: string;\n /** GenAI operation name to place on the root run span. */\n operationName?: GenAiOperationName;\n /** Optional JSON-safe source marker for the trace metadata. */\n source?: string;\n};\n\ntype OutputField<TOutput extends JsonValue | undefined> =\n undefined extends TOutput ? { output?: TOutput } : { output: TOutput };\n\n/** Generic JSON-like metadata record used by normalized artifacts and reports. */\nexport type HarnessMetadata = Record<string, unknown>;\n\n/**\n * Runtime context passed from the eval fixture into a harness run.\n *\n * @example\n * ```ts\n * const harness: Harness<string> = {\n * name: \"refund-agent\",\n * async run(input, context) {\n * context.setArtifact(\"inputLength\", input.length);\n *\n * return {\n * output: undefined,\n * session: { messages: [{ role: \"user\", content: input }] },\n * usage: {},\n * errors: [],\n * };\n * },\n * };\n * ```\n */\nexport type HarnessContext = {\n /** Abort signal from Vitest when available. */\n signal?: AbortSignal;\n /** Mutable JSON-safe artifact bag shared with the harness. */\n artifacts: Record<string, JsonValue>;\n /** Stores one JSON-safe artifact on the current run. */\n setArtifact: (name: string, value: JsonValue) => void;\n};\n\n/**\n * Adapter that executes the system under test and returns a normalized run.\n *\n * @example\n * ```ts\n * const harness: Harness<string, { status: \"approved\" | \"denied\" }> = {\n * name: \"refund-agent\",\n * async run(input, context) {\n * return normalizeHarnessRun(input, await runRefundFlow(input), context);\n * },\n * };\n * ```\n */\nexport type Harness<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = {\n /** Stable harness name used in reports. */\n name: string;\n /** Executes the system under test and returns a normalized run. */\n run: (input: TInput, context: HarnessContext) => Promise<HarnessRun<TOutput>>;\n};\n\n/** Value or promise accepted by lightweight harness callbacks. */\nexport type MaybePromise<T> = T | Promise<T>;\n\n/** Lightweight tool-call record accepted by `createHarness(...)` results. */\nexport type SimpleToolCallRecord = Omit<\n ToolCallRecord,\n \"arguments\" | \"result\" | \"error\" | \"metadata\"\n> & {\n /** Raw tool arguments accepted by `createHarness(...)` before normalization. */\n arguments?: unknown;\n /** Raw tool result accepted by `createHarness(...)` before normalization. */\n result?: unknown;\n /** Raw tool error accepted by `createHarness(...)` before normalization. */\n error?: unknown;\n /** Raw tool metadata accepted by `createHarness(...)` before normalization. */\n metadata?: Record<string, unknown>;\n};\n\n/** Lightweight span event accepted by `createHarness(...)` results. */\nexport type SimpleSpanEvent = Omit<NormalizedSpanEvent, \"attributes\"> & {\n /** Raw event attributes accepted by `createHarness(...)` before normalization. */\n attributes?: Record<string, unknown>;\n};\n\n/** Lightweight span record accepted by `createHarness(...)` results. */\nexport type SimpleSpanRecord = Omit<\n NormalizedSpan,\n \"attributes\" | \"error\" | \"events\"\n> & {\n /** Raw span attributes accepted by `createHarness(...)` before normalization. */\n attributes?: Record<string, unknown>;\n /** Raw span error accepted by `createHarness(...)` before normalization. */\n error?: unknown;\n /** Raw span events accepted by `createHarness(...)` before normalization. */\n events?: SimpleSpanEvent[];\n};\n\n/** Lightweight trace record accepted by `createHarness(...)` results. */\nexport type SimpleTraceRecord = Omit<NormalizedTrace, \"metadata\" | \"spans\"> & {\n /** Raw trace metadata accepted by `createHarness(...)` before normalization. */\n metadata?: Record<string, unknown>;\n /** Lightweight spans to normalize into the trace. */\n spans: SimpleSpanRecord[];\n};\n\n/**\n * Lightweight result shape normalized by `createHarness(...)`.\n *\n * @example\n * ```ts\n * const result: SimpleHarnessResult<{ status: \"approved\" }> = {\n * output: { status: \"approved\" },\n * toolCalls: [{ name: \"lookupInvoice\", arguments: { invoiceId: \"inv_123\" } }],\n * usage: { totalTokens: 260 },\n * };\n * ```\n */\nexport type SimpleHarnessResult<\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = OutputField<TOutput> & {\n /** Pre-normalized transcript messages. When omitted, a default user/assistant transcript is created. */\n messages?: NormalizedMessage[];\n /** Lightweight tool-call records to normalize into the session. */\n toolCalls?: SimpleToolCallRecord[];\n /** Usage summary to attach to the run. */\n usage?: UsageSummary;\n /** Timing summary to attach to the run. */\n timings?: TimingSummary;\n /** Raw artifact values to normalize and merge into the run. */\n artifacts?: Record<string, unknown>;\n /** Lightweight traces and spans to normalize into the run. */\n traces?: SimpleTraceRecord[];\n /** Raw session metadata to normalize into the session. */\n metadata?: Record<string, unknown>;\n /** Raw errors to normalize into the run. */\n errors?: unknown[];\n};\n\n/** Either a complete normalized run or a lightweight result to normalize. */\nexport type HarnessResultLike<\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = HarnessRun<TOutput> | SimpleHarnessResult<TOutput>;\n\n/** Arguments passed to the `createHarness(...)` convenience callback. */\nexport type CreateHarnessRunArgs<TInput> = {\n /** Original input passed to `run(input)`. */\n input: TInput;\n /** Abort signal from Vitest when available. */\n signal?: AbortSignal;\n /** Mutable run artifact bag. */\n artifacts: HarnessContext[\"artifacts\"];\n /** Stores one JSON-safe artifact on the current run. */\n setArtifact: HarnessContext[\"setArtifact\"];\n};\n\n/**\n * Options for creating a lightweight custom application harness.\n *\n * @example\n * ```ts\n * const options: CreateHarnessOptions<string, { status: \"approved\" }> = {\n * name: \"refund-agent\",\n * run: async ({ input }) => ({\n * output: await classifyRefund(input),\n * }),\n * };\n * ```\n */\nexport type CreateHarnessOptions<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = {\n /** Stable harness name used in reports. */\n name: string;\n /** Executes application code and returns either a lightweight result or full `HarnessRun`. */\n run: (\n args: CreateHarnessRunArgs<TInput>,\n ) => MaybePromise<HarnessResultLike<TOutput>>;\n};\n\nfunction isJsonPrimitive(value: unknown): value is JsonPrimitive {\n return (\n value === null ||\n typeof value === \"string\" ||\n typeof value === \"boolean\" ||\n (typeof value === \"number\" && Number.isFinite(value))\n );\n}\n\nfunction isJsonRecord(value: unknown): value is Record<string, unknown> {\n return typeof value === \"object\" && value !== null && !Array.isArray(value);\n}\n\nfunction normalizeJsonArray(value: unknown[], seen: WeakSet<object>) {\n if (seen.has(value)) {\n return undefined;\n }\n\n seen.add(value);\n const normalized = value.map((item) => {\n const normalized = toJsonValueInternal(item, seen);\n return normalized === undefined ? null : normalized;\n });\n seen.delete(value);\n\n return normalized;\n}\n\nfunction normalizeJsonObject(\n value: Record<string, unknown>,\n seen: WeakSet<object>,\n): Record<string, JsonValue> {\n const normalized: Record<string, JsonValue> = {};\n\n if (seen.has(value)) {\n return normalized;\n }\n\n seen.add(value);\n try {\n for (const [key, entryValue] of Object.entries(value)) {\n const entry = toJsonValueInternal(entryValue, seen);\n if (entry !== undefined) {\n normalized[key] = entry;\n }\n }\n } finally {\n seen.delete(value);\n }\n\n return normalized;\n}\n\n/** Returns true when a value exposes a callable method with the given name. */\nexport function hasCallableMethod(value: unknown, methodName: string) {\n return (\n value !== null &&\n (typeof value === \"object\" || typeof value === \"function\") &&\n methodName in value &&\n typeof (value as Record<string, unknown>)[methodName] === \"function\"\n );\n}\n\n/** Normalizes an unknown value into the JSON-safe shape used by harness runs. */\nexport function toJsonValue(value: unknown): JsonValue | undefined {\n return toJsonValueInternal(value, new WeakSet());\n}\n\nfunction toJsonValueInternal(\n value: unknown,\n seen: WeakSet<object>,\n): JsonValue | undefined {\n if (isJsonPrimitive(value)) {\n return value;\n }\n\n if (\n value !== null &&\n typeof value === \"object\" &&\n seen.has(value as object)\n ) {\n return undefined;\n }\n\n if (Array.isArray(value)) {\n return normalizeJsonArray(value, seen);\n }\n\n if (isJsonRecord(value)) {\n return normalizeJsonObject(value, seen);\n }\n\n return undefined;\n}\n\n/** Drops non-JSON properties from a record while preserving valid values. */\nexport function normalizeRecord(\n value: Record<string, unknown>,\n): Record<string, JsonValue> {\n return normalizeJsonObject(value, new WeakSet());\n}\n\n/** Normalizes metadata and omits the field entirely when nothing survives. */\nexport function normalizeMetadata(\n value: Record<string, unknown>,\n): Record<string, JsonValue> | undefined {\n const normalized = normalizeRecord(value);\n return Object.keys(normalized).length > 0 ? normalized : undefined;\n}\n\n/** Converts arbitrary content into the JSON-safe message content shape. */\nexport function normalizeContent(value: unknown): JsonValue {\n const normalized = toJsonValue(value);\n return normalized !== undefined ? normalized : String(value);\n}\n\n/**\n * Creates a harness from the common \"run app code and return output\" shape.\n *\n * @param options - Harness name plus the callback that executes app code.\n *\n * @example\n * ```ts\n * import { createHarness } from \"vitest-evals\";\n *\n * export const refundHarness = createHarness<\n * string,\n * { status: \"approved\" | \"denied\" }\n * >({\n * name: \"refund-agent\",\n * run: async ({ input, setArtifact }) => {\n * const result = await runRefundFlow(input);\n * const output = { status: result.status };\n *\n * setArtifact(\"case\", { invoiceId: result.invoiceId });\n *\n * return {\n * output,\n * toolCalls: result.toolCalls,\n * usage: { provider: \"openai\", model: \"gpt-4o-mini\" },\n * };\n * },\n * });\n * ```\n */\nexport function createHarness<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n>(options: CreateHarnessOptions<TInput, TOutput>): Harness<TInput, TOutput>;\nexport function createHarness<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n>(options: CreateHarnessOptions<TInput, TOutput>): Harness<TInput, TOutput> {\n const harness: Harness<TInput, TOutput> = {\n name: options.name,\n run: async (input, context) => {\n const startedAt = new Date();\n\n try {\n const result = await options.run({\n input,\n signal: context.signal,\n artifacts: context.artifacts,\n setArtifact: context.setArtifact,\n });\n const run = normalizeHarnessRun(input, result, context);\n ensureRunTrace(run, {\n name: options.name,\n startedAt,\n finishedAt: new Date(),\n });\n\n return run;\n } catch (error) {\n const partialRun = getHarnessRunFromError(error);\n if (partialRun) {\n if (\n Object.keys(context.artifacts).length > 0 &&\n !partialRun.artifacts\n ) {\n partialRun.artifacts = context.artifacts;\n }\n ensureRunTrace(partialRun, {\n name: options.name,\n startedAt,\n finishedAt: new Date(),\n });\n throw attachHarnessRunToError(error, partialRun);\n }\n\n const failedRun = createFailedHarnessRun(input, error, {\n artifacts: context.artifacts,\n });\n ensureRunTrace(failedRun, {\n name: options.name,\n startedAt,\n finishedAt: new Date(),\n });\n\n throw attachHarnessRunToError(error, failedRun);\n }\n },\n };\n\n return harness;\n}\n\n/**\n * Normalizes a lightweight harness result into the reporter-facing run shape.\n *\n * @param input - Original input passed to the harness.\n * @param result - Lightweight result or pre-normalized harness run.\n * @param context - Optional per-run context used to merge artifacts.\n *\n * @example\n * ```ts\n * const run = normalizeHarnessRun(\"Refund invoice inv_123\", {\n * output: { status: \"approved\" },\n * toolCalls: [{ name: \"lookupInvoice\", arguments: { invoiceId: \"inv_123\" } }],\n * usage: { provider: \"openai\", model: \"gpt-4o-mini\" },\n * });\n *\n * expect(toolCalls(run.session)).toHaveLength(1);\n * ```\n */\nexport function normalizeHarnessRun<\n TInput = unknown,\n TOutput extends JsonValue | undefined = JsonValue | undefined,\n>(\n input: TInput,\n result: HarnessResultLike<TOutput>,\n context?: HarnessContext,\n): HarnessRun<TOutput> {\n if (isHarnessRun(result)) {\n if (\n context &&\n Object.keys(context.artifacts).length > 0 &&\n !result.artifacts\n ) {\n return {\n ...result,\n artifacts: context.artifacts,\n };\n }\n\n return result;\n }\n\n const output = result.output;\n const toolCalls = normalizeSimpleToolCalls(result.toolCalls);\n const usage = result.usage ?? {};\n const messages =\n result.messages ??\n createDefaultSessionMessages({\n input,\n output,\n toolCalls,\n });\n const metadata = result.metadata\n ? normalizeMetadata(result.metadata)\n : undefined;\n const artifacts = normalizeMergedArtifacts(\n context?.artifacts,\n result.artifacts,\n );\n const traces = normalizeSimpleTraces(result.traces);\n\n return {\n session: {\n messages,\n ...(usage.provider ? { provider: usage.provider } : {}),\n ...(usage.model ? { model: usage.model } : {}),\n ...(metadata ? { metadata } : {}),\n },\n ...(output !== undefined ? { output } : {}),\n usage,\n ...(result.timings ? { timings: result.timings } : {}),\n ...(artifacts ? { artifacts } : {}),\n ...(traces ? { traces } : {}),\n errors: normalizeSimpleErrors(result.errors),\n } as HarnessRun<TOutput>;\n}\n\n/**\n * Builds a JSON-safe failed run for errors that happen before a harness can return.\n *\n * @param input - Original input passed to the harness.\n * @param error - Error thrown by setup or execution.\n * @param options - Optional artifacts to preserve on the failed run.\n */\nexport function createFailedHarnessRun(\n input: unknown,\n error: unknown,\n options: { artifacts?: Record<string, JsonValue> } = {},\n): HarnessRun {\n const artifacts = options.artifacts;\n\n return {\n session: {\n messages: [\n {\n role: \"user\",\n content: normalizeContent(input),\n },\n ],\n },\n usage: {},\n ...(artifacts && Object.keys(artifacts).length > 0 ? { artifacts } : {}),\n errors: [serializeError(error)],\n };\n}\n\nfunction createDefaultSessionMessages<TInput>({\n input,\n output,\n toolCalls: normalizedToolCalls,\n}: {\n input: TInput;\n output: JsonValue | undefined;\n toolCalls: ToolCallRecord[];\n}): NormalizedMessage[] {\n const messages: NormalizedMessage[] = [\n {\n role: \"user\",\n content: normalizeContent(input),\n },\n ];\n\n if (output !== undefined || normalizedToolCalls.length > 0) {\n messages.push({\n role: \"assistant\",\n ...(output !== undefined ? { content: normalizeContent(output) } : {}),\n ...(normalizedToolCalls.length > 0\n ? { toolCalls: normalizedToolCalls }\n : {}),\n });\n }\n\n return messages;\n}\n\nfunction normalizeSimpleToolCalls(\n calls: SimpleToolCallRecord[] | undefined,\n): ToolCallRecord[] {\n return (calls ?? []).map((call) => {\n const {\n arguments: rawArguments,\n result: rawResult,\n error: rawError,\n metadata: rawMetadata,\n ...toolCall\n } = call;\n const args = normalizeToolCallArguments(rawArguments);\n const result = toJsonValue(rawResult);\n const error = normalizeToolCallError(rawError);\n const metadata = rawMetadata ? normalizeMetadata(rawMetadata) : undefined;\n\n return {\n ...toolCall,\n ...(args ? { arguments: args } : {}),\n ...(result !== undefined ? { result } : {}),\n ...(error ? { error } : {}),\n ...(metadata ? { metadata } : {}),\n };\n });\n}\n\nfunction normalizeToolCallArguments(\n value: unknown,\n): Record<string, JsonValue> | undefined {\n if (value === undefined) {\n return undefined;\n }\n\n const normalized = toJsonValue(value);\n return normalized &&\n typeof normalized === \"object\" &&\n !Array.isArray(normalized)\n ? normalized\n : undefined;\n}\n\nfunction normalizeToolCallError(\n value: unknown,\n): ToolCallRecord[\"error\"] | undefined {\n if (value === undefined) {\n return undefined;\n }\n\n const serialized = serializeError(value);\n const { message, type, ...details } = serialized;\n\n return {\n ...details,\n message: typeof message === \"string\" ? message : String(message),\n ...(typeof type === \"string\" ? { type } : {}),\n };\n}\n\nfunction normalizeMergedArtifacts(\n contextArtifacts: Record<string, JsonValue> | undefined,\n resultArtifacts: Record<string, unknown> | undefined,\n) {\n const artifacts = {\n ...(contextArtifacts ?? {}),\n ...(resultArtifacts ? normalizeRecord(resultArtifacts) : {}),\n };\n\n return Object.keys(artifacts).length > 0 ? artifacts : undefined;\n}\n\nfunction normalizeSimpleErrors(\n errors: unknown[] | undefined,\n): Array<Record<string, JsonValue>> {\n return (errors ?? []).map((error) => {\n const normalized = toJsonValue(error);\n\n if (\n normalized &&\n typeof normalized === \"object\" &&\n !Array.isArray(normalized) &&\n Object.keys(normalized).length > 0\n ) {\n return normalized;\n }\n\n return serializeError(error);\n });\n}\n\nfunction normalizeSimpleTraces(\n traces: SimpleTraceRecord[] | undefined,\n): NormalizedTrace[] | undefined {\n if (!Array.isArray(traces)) {\n return undefined;\n }\n\n const normalized = traces\n .map(normalizeSimpleTrace)\n .filter((trace): trace is NormalizedTrace => Boolean(trace));\n\n return normalized.length > 0 ? normalized : undefined;\n}\n\nfunction normalizeSimpleTrace(trace: unknown): NormalizedTrace | undefined {\n if (!isJsonRecord(trace)) {\n return undefined;\n }\n\n const {\n metadata: rawMetadata,\n spans: rawSpans,\n ...traceFields\n } = trace as Partial<SimpleTraceRecord>;\n const spans = (Array.isArray(rawSpans) ? rawSpans : [])\n .map((span) => normalizeSimpleSpan(span))\n .filter((span): span is NormalizedSpan => Boolean(span));\n const metadata = isJsonRecord(rawMetadata)\n ? normalizeMetadata(rawMetadata)\n : undefined;\n\n if (spans.length === 0 && !traceFields.id && !traceFields.name) {\n return undefined;\n }\n\n return {\n ...traceFields,\n ...(metadata ? { metadata } : {}),\n spans,\n };\n}\n\nfunction normalizeSimpleSpan(span: unknown): NormalizedSpan | undefined {\n if (!isJsonRecord(span) || typeof span.name !== \"string\" || !span.name) {\n return undefined;\n }\n\n const {\n attributes: rawAttributes,\n error: rawError,\n events: rawEvents,\n ...spanFields\n } = span as Partial<SimpleSpanRecord> & { name: string };\n const attributes = rawAttributes\n ? isJsonRecord(rawAttributes)\n ? normalizeMetadata(rawAttributes)\n : undefined\n : undefined;\n const error = normalizeSpanError(rawError);\n const events = normalizeSimpleSpanEvents(rawEvents);\n\n return {\n ...spanFields,\n ...(attributes\n ? { attributes: attributes as NormalizedSpanAttributes }\n : {}),\n ...(error ? { error } : {}),\n ...(events ? { events } : {}),\n };\n}\n\nfunction normalizeSimpleSpanEvents(\n events: unknown,\n): NormalizedSpanEvent[] | undefined {\n if (!Array.isArray(events)) {\n return undefined;\n }\n\n const normalized = events\n .map(normalizeSimpleSpanEvent)\n .filter((event): event is NormalizedSpanEvent => Boolean(event));\n\n return normalized.length > 0 ? normalized : undefined;\n}\n\nfunction normalizeSimpleSpanEvent(\n event: unknown,\n): NormalizedSpanEvent | undefined {\n if (!isJsonRecord(event) || typeof event.name !== \"string\" || !event.name) {\n return undefined;\n }\n\n const { attributes: rawAttributes, ...eventFields } =\n event as Partial<SimpleSpanEvent> & { name: string };\n const attributes = rawAttributes\n ? isJsonRecord(rawAttributes)\n ? normalizeMetadata(rawAttributes)\n : undefined\n : undefined;\n\n return {\n ...eventFields,\n ...(attributes\n ? { attributes: attributes as NormalizedSpanAttributes }\n : {}),\n };\n}\n\n/** Normalizes arbitrary span errors while preserving object-shaped messages. */\nexport function normalizeSpanError(\n error: unknown,\n): NormalizedSpan[\"error\"] | undefined {\n if (error === undefined) {\n return undefined;\n }\n\n if (error instanceof Error) {\n const details = normalizeMetadata(\n error as unknown as Record<string, unknown>,\n );\n\n return {\n ...(details ?? {}),\n type: error.name,\n message: error.message,\n };\n }\n\n if (\n error &&\n typeof error === \"object\" &&\n !Array.isArray(error) &&\n typeof (error as { message?: unknown }).message === \"string\"\n ) {\n const normalized = normalizeMetadata(error as Record<string, unknown>);\n const { message, type, ...details } = normalized ?? {};\n\n return {\n ...details,\n message: message as string,\n ...(typeof type === \"string\" ? { type } : {}),\n };\n }\n\n const serialized = serializeError(error);\n const { message, type, ...details } = serialized;\n\n return {\n ...details,\n message: typeof message === \"string\" ? message : String(message),\n ...(typeof type === \"string\" ? { type } : {}),\n };\n}\n\n/** Normalizes raw span attributes into the JSON-safe span attribute shape. */\nexport function normalizeSpanAttributes(\n attributes: Record<string, unknown>,\n): NormalizedSpanAttributes | undefined {\n return normalizeMetadata(attributes) as NormalizedSpanAttributes | undefined;\n}\n\n/** Builds common OpenTelemetry GenAI usage attributes from a usage summary. */\nexport function createGenAiUsageAttributes(\n usage: UsageSummary | undefined,\n options: { provider?: string } = {},\n) {\n return {\n \"gen_ai.provider.name\": usage?.provider ?? options.provider,\n \"gen_ai.request.model\": usage?.model,\n \"gen_ai.response.model\": usage?.model,\n \"gen_ai.usage.input_tokens\": usage?.inputTokens,\n \"gen_ai.usage.output_tokens\": usage?.outputTokens,\n \"gen_ai.usage.reasoning.output_tokens\": usage?.reasoningTokens,\n } satisfies Record<string, unknown>;\n}\n\n/**\n * Converts normalized tool-call records into trace spans.\n *\n * Tool-call ids are preserved as GenAI attributes. Pass `spanIdPrefix` when the\n * spans belong to a known trace so span ids stay internally unique.\n */\nexport function createToolCallSpans(\n calls: ToolCallRecord[],\n options: CreateToolCallSpansOptions = {},\n): NormalizedSpan[] {\n return calls.map((call, index) => {\n const spanError = call.error ? normalizeSpanError(call.error) : undefined;\n const spanId = options.spanIdPrefix\n ? `${options.spanIdPrefix}:${index + 1}`\n : call.id;\n\n return {\n ...(spanId ? { id: spanId } : {}),\n ...(options.traceId ? { traceId: options.traceId } : {}),\n ...(options.parentId ? { parentId: options.parentId } : {}),\n name: call.name,\n kind: \"tool\",\n ...(call.startedAt ? { startedAt: call.startedAt } : {}),\n ...(call.finishedAt ? { finishedAt: call.finishedAt } : {}),\n ...(call.durationMs !== undefined ? { durationMs: call.durationMs } : {}),\n status: spanError ? \"error\" : \"ok\",\n ...(spanError ? { error: spanError } : {}),\n attributes: normalizeSpanAttributes({\n \"gen_ai.operation.name\": \"execute_tool\",\n \"gen_ai.tool.name\": call.name,\n \"gen_ai.tool.type\": \"function\",\n ...(call.id ? { \"gen_ai.tool.call.id\": call.id } : {}),\n ...(call.arguments !== undefined\n ? { \"gen_ai.tool.call.arguments\": call.arguments }\n : {}),\n ...(call.result !== undefined\n ? { \"gen_ai.tool.call.result\": call.result }\n : {}),\n }),\n } satisfies NormalizedSpan;\n });\n}\n\n/**\n * Attaches a fallback run trace when a harness result does not already contain spans.\n *\n * This keeps custom harnesses inspectable while first-party harness packages\n * remain free to attach richer native traces.\n */\nexport function ensureRunTrace(\n run: HarnessRun,\n options: EnsureRunTraceOptions,\n): NormalizedTrace | undefined {\n if (spans(run).length > 0) {\n return undefined;\n }\n\n const traceId = options.id ?? createGeneratedTraceId();\n const rootSpanId = `${traceId}:run`;\n const durationMs = options.finishedAt.getTime() - options.startedAt.getTime();\n const rootError =\n run.errors.length > 0 ? normalizeSpanError(run.errors[0]) : undefined;\n const runSpan: NormalizedSpan = {\n id: rootSpanId,\n traceId,\n name: options.name,\n kind: \"run\",\n startedAt: options.startedAt.toISOString(),\n finishedAt: options.finishedAt.toISOString(),\n durationMs,\n status: rootError ? \"error\" : \"ok\",\n ...(rootError ? { error: rootError } : {}),\n attributes: normalizeSpanAttributes({\n \"gen_ai.operation.name\": options.operationName ?? \"invoke_workflow\",\n \"gen_ai.workflow.name\": options.name,\n ...createGenAiUsageAttributes(run.usage),\n }),\n };\n const toolSpans = createToolCallSpans(toolCalls(run.session), {\n traceId,\n parentId: rootSpanId,\n spanIdPrefix: `${traceId}:tool`,\n });\n const trace: NormalizedTrace = {\n id: traceId,\n name: options.name,\n startedAt: options.startedAt.toISOString(),\n finishedAt: options.finishedAt.toISOString(),\n durationMs,\n ...(options.source ? { metadata: { source: options.source } } : {}),\n spans: [runSpan, ...toolSpans],\n };\n\n run.traces = [trace];\n return trace;\n}\n\nlet nextGeneratedTraceId = 0;\n\nfunction createGeneratedTraceId() {\n nextGeneratedTraceId += 1;\n return `trace_${nextGeneratedTraceId}`;\n}\n\n/**\n * Attaches a partial or complete harness run to an arbitrary thrown error.\n *\n * @param error - Thrown value to wrap.\n * @param run - Partial or complete normalized harness run to preserve.\n *\n * @example\n * ```ts\n * try {\n * return await runAgent(input);\n * } catch (error) {\n * throw attachHarnessRunToError(error, partialRun);\n * }\n * ```\n */\nexport function attachHarnessRunToError(\n error: unknown,\n run: HarnessRun,\n): HarnessRunError {\n const baseError =\n error instanceof Error\n ? error\n : new Error(String(error ?? \"Unknown error\"));\n return Object.assign(baseError, {\n vitestEvalsRun: run,\n });\n}\n\n/**\n * Reads an attached harness run back off a previously wrapped error value.\n *\n * @param error - Unknown thrown value that may contain a harness run.\n *\n * @example\n * ```ts\n * const partialRun = getHarnessRunFromError(error);\n *\n * if (partialRun) {\n * console.log(toolCalls(partialRun.session));\n * }\n * ```\n */\nexport function getHarnessRunFromError(error: unknown): HarnessRun | undefined {\n if (\n error &&\n typeof error === \"object\" &&\n \"vitestEvalsRun\" in error &&\n isHarnessRun((error as { vitestEvalsRun?: unknown }).vitestEvalsRun)\n ) {\n return (error as { vitestEvalsRun: HarnessRun }).vitestEvalsRun;\n }\n\n return undefined;\n}\n\n/** Returns true when a value matches the normalized `HarnessRun` contract. */\nexport function isHarnessRun(value: unknown): value is HarnessRun {\n if (!value || typeof value !== \"object\") {\n return false;\n }\n\n const candidate = value as {\n session?: unknown;\n usage?: unknown;\n errors?: unknown;\n };\n\n return (\n isNormalizedSession(candidate.session) &&\n Boolean(candidate.usage) &&\n typeof candidate.usage === \"object\" &&\n !Array.isArray(candidate.usage) &&\n Array.isArray(candidate.errors)\n );\n}\n\n/** Returns true when a value matches the normalized session contract. */\nexport function isNormalizedSession(\n value: unknown,\n): value is NormalizedSession {\n return (\n Boolean(value) &&\n typeof value === \"object\" &&\n value !== null &&\n \"messages\" in value &&\n Array.isArray((value as { messages?: unknown }).messages)\n );\n}\n\n/** Reuses pre-normalized harness errors when a runtime already returns them. */\nexport function resolveHarnessRunErrors(\n result: unknown,\n): Array<Record<string, JsonValue>> {\n if (\n result &&\n typeof result === \"object\" &&\n Array.isArray((result as Record<string, unknown>).errors)\n ) {\n return (result as { errors: Array<Record<string, JsonValue>> }).errors;\n }\n\n return [];\n}\n\n/** Serializes an arbitrary thrown value into the normalized error shape. */\nexport function serializeError(error: unknown): Record<string, JsonValue> {\n if (error instanceof Error) {\n return {\n type: error.name,\n message: error.message,\n };\n }\n\n return {\n type: \"Error\",\n message: String(error),\n };\n}\n","import {\n createHarness,\n type Harness,\n type HarnessContext,\n type HarnessResultLike,\n type HarnessRun,\n isHarnessRun,\n latestAssistantMessageContent,\n type JsonValue,\n type MaybePromise,\n normalizeContent,\n} from \"../harness\";\n\n/**\n * Provider-neutral prompt request issued by an LLM-backed judge.\n *\n * @example\n * ```ts\n * const input: JudgeHarnessInput = {\n * system: \"Grade factual consistency.\",\n * prompt: \"Compare the submitted answer with the reference answer.\",\n * responseFormat: {\n * type: \"json\",\n * },\n * };\n * ```\n */\nexport type JudgeHarnessInput = {\n /** Optional system prompt for the judge model. */\n system?: string;\n /** User prompt or instruction payload for the judge model. */\n prompt: string;\n /** Optional response-format hint for adapters that support structured output. */\n responseFormat?: {\n /** Requests a JSON-compatible response. */\n type: \"json\";\n /** Optional JSON Schema passed through to provider-specific adapters. */\n schema?: JsonValue;\n };\n};\n\n/** JSON-safe output returned by a judge harness. */\nexport type JudgeHarnessOutput = JsonValue | undefined;\n\n/**\n * Harness used by LLM-backed judges to issue judge-side prompts.\n *\n * This is separate from the application harness under test.\n *\n * @example\n * ```ts\n * const judgeHarness: JudgeHarness = createJudgeHarness({\n * name: \"judge-model\",\n * run: async ({ prompt }, { signal }) => {\n * return callJudgeModel({ prompt, signal });\n * },\n * });\n * ```\n */\nexport type JudgeHarness = Harness<JudgeHarnessInput, JudgeHarnessOutput>;\n\n/** Runtime options supplied when a judge calls `runJudge(...)`. */\nexport type RunJudgeOptions = {\n /** Abort signal from the current eval run when available. */\n signal?: AbortSignal;\n};\n\n/**\n * Curried judge-harness runner available inside `JudgeContext`.\n *\n * @example\n * ```ts\n * const verdict = await ctx.runJudge?.({\n * prompt: \"Return a JSON verdict.\",\n * responseFormat: { type: \"json\" },\n * });\n * ```\n */\nexport type RunJudge = (\n input: JudgeHarnessInput,\n options?: RunJudgeOptions,\n) => Promise<JudgeHarnessOutput>;\n\n/** Runtime options passed to `createJudgeHarness(...)` callbacks. */\nexport type CreateJudgeHarnessRunOptions = {\n /** Abort signal from the current eval run when available. */\n signal?: AbortSignal;\n};\n\n/**\n * Configuration for `createJudgeHarness(...)`.\n *\n * @example\n * ```ts\n * const judgeHarness = createJudgeHarness({\n * name: \"custom-judge\",\n * run: async ({ system, prompt }, { signal }) => {\n * return callProvider({ system, prompt, signal });\n * },\n * });\n * ```\n */\nexport type CreateJudgeHarnessOptions = {\n /** Stable harness name used in diagnostics. */\n name?: string;\n /**\n * Runs one provider-specific judge prompt.\n *\n * Return a JSON-safe value, a raw provider value to normalize, a lightweight\n * `{ output }` result, or a full normalized `HarnessRun`.\n */\n run: (\n input: JudgeHarnessInput,\n options: CreateJudgeHarnessRunOptions,\n ) => MaybePromise<unknown>;\n};\n\n/**\n * Creates a judge harness from a provider-specific prompt callback.\n *\n * @param options - Harness name plus the callback that issues the judge prompt.\n *\n * @example\n * ```ts\n * const judgeHarness = createJudgeHarness({\n * run: async ({ prompt }) => callJudgeModel(prompt),\n * });\n * ```\n */\nexport function createJudgeHarness(\n options: CreateJudgeHarnessOptions,\n): JudgeHarness {\n return createHarness({\n name: options.name ?? \"judge-harness\",\n run: async ({ input, signal }) => {\n return normalizeJudgeHarnessResult(await options.run(input, { signal }));\n },\n });\n}\n\n/**\n * Runs a judge harness with eval-scoped context already supplied.\n *\n * @param judgeHarness - Judge-side harness configured on the matcher, judge, or suite.\n * @param input - Provider-neutral judge prompt request.\n * @param options - Run-scoped abort signal.\n */\nexport async function runJudgeHarness(\n judgeHarness: JudgeHarness,\n input: JudgeHarnessInput,\n options: RunJudgeOptions = {},\n): Promise<JudgeHarnessOutput> {\n const artifacts: HarnessContext[\"artifacts\"] = {};\n const run = await judgeHarness.run(input, {\n signal: options.signal,\n artifacts,\n setArtifact: (name, value) => {\n artifacts[name] = value;\n },\n });\n\n return run.output !== undefined\n ? run.output\n : resolveJudgeHarnessAssistantOutput(run);\n}\n\n/** Binds a judge harness to the current eval run context. */\nexport function createRunJudge(\n judgeHarness: JudgeHarness | undefined,\n signal?: AbortSignal,\n): RunJudge | undefined {\n if (!judgeHarness) {\n return undefined;\n }\n\n return (input, options) =>\n runJudgeHarness(judgeHarness, input, {\n signal: options?.signal ?? signal,\n });\n}\n\nfunction normalizeJudgeHarnessResult(\n result: Awaited<ReturnType<CreateJudgeHarnessOptions[\"run\"]>>,\n): HarnessResultLike<JudgeHarnessOutput> {\n if (isHarnessRun(result)) {\n return result as HarnessRun<JudgeHarnessOutput>;\n }\n\n if (hasOutputField(result)) {\n return {\n output: normalizeJudgeHarnessOutput(result.output),\n };\n }\n\n return {\n output: normalizeJudgeHarnessOutput(result),\n };\n}\n\nfunction hasOutputField(value: unknown): value is { output?: unknown } {\n return (\n value !== null &&\n typeof value === \"object\" &&\n !Array.isArray(value) &&\n Object.keys(value).length === 1 &&\n \"output\" in value\n );\n}\n\nfunction normalizeJudgeHarnessOutput(value: unknown): JudgeHarnessOutput {\n if (value === undefined) {\n return undefined;\n }\n\n return normalizeContent(value);\n}\n\nfunction resolveJudgeHarnessAssistantOutput(\n run: HarnessRun<JudgeHarnessOutput>,\n): JudgeHarnessOutput {\n return latestAssistantMessageContent(run.session) ?? \"\";\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,kBAWO;AAkBP,IAAAA,eAWO;;;AC2GP,eAAsB,gBACpB,cACA,OACA,UAA2B,CAAC,GACC;AAC7B,QAAM,YAAyC,CAAC;AAChD,QAAM,MAAM,MAAM,aAAa,IAAI,OAAO;AAAA,IACxC,QAAQ,QAAQ;AAAA,IAChB;AAAA,IACA,aAAa,CAAC,MAAM,UAAU;AAC5B,gBAAU,IAAI,IAAI;AAAA,IACpB;AAAA,EACF,CAAC;AAED,SAAO,IAAI,WAAW,SAClB,IAAI,SACJ,mCAAmC,GAAG;AAC5C;AAGO,SAAS,eACd,cACA,QACsB;AACtB,MAAI,CAAC,cAAc;AACjB,WAAO;AAAA,EACT;AAEA,SAAO,CAAC,OAAO,YACb,gBAAgB,cAAc,OAAO;AAAA,IACnC,QAAQ,SAAS,UAAU;AAAA,EAC7B,CAAC;AACL;AAsCA,SAAS,mCACP,KACoB;AACpB,aAAO,4CAA8B,IAAI,OAAO,KAAK;AACvD;;;AFnKA,IAAM,2BAAkE;AAAA,EACtE,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AACL;AAEA,IAAM,oBACJ;AAEF,IAAM,6BAA6B;AAAA,EACjC,MAAM;AAAA,EACN,sBAAsB;AAAA,EACtB,UAAU,CAAC,UAAU,WAAW;AAAA,EAChC,YAAY;AAAA,IACV,QAAQ;AAAA,MACN,MAAM,CAAC,KAAK,KAAK,KAAK,KAAK,GAAG;AAAA,IAChC;AAAA,IACA,WAAW;AAAA,MACT,MAAM;AAAA,IACR;AAAA,EACF;AACF;AAyGO,SAAS,gBACd,SAAgC,CAAC,GACF;AAC/B,QAAM,eAAe,OAAO;AAE5B,SAAO;AAAA,IACL,MAAM,OAAO,QAAQ;AAAA,IACrB;AAAA,IACA,QAAQ,CAAC,SACP,iBAAiB,MAAM;AAAA,MACrB,UAAU,OAAO;AAAA,MACjB;AAAA,IACF,CAAC;AAAA,EACL;AACF;AAEA,eAAe,iBACb,MACA,QAIA;AACA,QAAM,WAAW,KAAK,YAAY,OAAO;AAEzC,MAAI,wBAAwB,QAAQ,GAAG;AACrC,WAAO;AAAA,MACL,OAAO;AAAA,MACP,UAAU;AAAA,QACR,WACE;AAAA,MACJ;AAAA,IACF;AAAA,EACF;AAEA,QAAM,WACJ,KAAK,YACL;AAAA,IACE,OAAO;AAAA,IACN,KAAkC;AAAA,EACrC;AAEF,MAAI,CAAC,UAAU;AACb,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,QAAM,UAAU,MAAM,SAAS;AAAA,IAC7B,QAAQ;AAAA,IACR,QAAQ,uBAAuB;AAAA,MAC7B,OAAO,KAAK;AAAA,MACZ;AAAA,MACA,QAAQ,mBAAmB,IAAI;AAAA,IACjC,CAAC;AAAA,IACD,gBAAgB;AAAA,MACd,MAAM;AAAA,MACN,QAAQ;AAAA,IACV;AAAA,EACF,CAAC;AAED,SAAO,kBAAkB,4BAA4B,OAAO,CAAC;AAC/D;AAEA,SAAS,wBAAwB,OAA4C;AAC3E,SACE,SAAS,QAAS,OAAO,UAAU,YAAY,MAAM,KAAK,EAAE,WAAW;AAE3E;AAEA,SAAS,mBAAmB,MAA8B;AACxD,MAAI,KAAK,WAAW,QAAW;AAC7B,WAAO,KAAK;AAAA,EACd;AAEA,aAAO,4CAA8B,KAAK,OAAO,KAAK;AACxD;AAEA,SAAS,4BAA4B,OAAwC;AAC3E,QAAM,SAAS,OAAO,UAAU,WAAW,gBAAgB,KAAK,IAAI;AAEpE,MAAI,CAAC,UAAU,OAAO,WAAW,UAAU;AACzC,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,QAAM,UAAU;AAChB,MAAI,CAAC,mBAAmB,QAAQ,MAAM,GAAG;AACvC,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,MAAI,OAAO,QAAQ,cAAc,UAAU;AACzC,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL,QAAQ,QAAQ;AAAA,IAChB,WAAW,QAAQ;AAAA,EACrB;AACF;AAEA,SAAS,gBAAgB,OAAe;AACtC,MAAI;AACF,WAAO,KAAK,MAAM,KAAK;AAAA,EACzB,QAAQ;AACN,UAAM,aAAa,MAAM,MAAM,kCAAkC;AACjE,QAAI,CAAC,YAAY;AACf,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAEA,WAAO,KAAK,MAAM,WAAW,CAAC,CAAC;AAAA,EACjC;AACF;AAEA,SAAS,mBAAmB,OAAgD;AAC1E,SACE,UAAU,OACV,UAAU,OACV,UAAU,OACV,UAAU,OACV,UAAU;AAEd;AAEA,SAAS,uBAAuB;AAAA,EAC9B;AAAA,EACA;AAAA,EACA;AACF,GAIG;AACD,QAAM,aAAa,iBAAiB;AAAA,IAClC,UAAU,SAAS;AAAA,IACnB,eAAe;AAAA,IACf,kBAAkB,UAAU;AAAA,EAC9B,CAAC;AAED,SAAO;AAAA;AAAA;AAAA,EAGP,UAAU;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAgBZ;AAEA,SAAS,iBAAiB,OAAgB;AACxC,MAAI,OAAO,UAAU,UAAU;AAC7B,WAAO;AAAA,EACT;AAEA,MAAI,UAAU,QAAW;AACvB,WAAO;AAAA,EACT;AAEA,MAAI;AACF,WAAO,KAAK,UAAU,OAAO,MAAM,CAAC,KAAK,OAAO,KAAK;AAAA,EACvD,QAAQ;AACN,WAAO,OAAO,KAAK;AAAA,EACrB;AACF;AAEA,SAAS,kBAAkB,QAA6C;AACtE,SAAO;AAAA,IACL,OAAO,yBAAyB,OAAO,MAAM;AAAA,IAC7C,UAAU;AAAA,MACR,WAAW,OAAO;AAAA,MAClB,QAAQ,OAAO;AAAA,IACjB;AAAA,EACF;AACF;","names":["import_core"]}
@@ -20,7 +20,6 @@ import {
20
20
  async function runJudgeHarness(judgeHarness, input, options = {}) {
21
21
  const artifacts = {};
22
22
  const run = await judgeHarness.run(input, {
23
- metadata: options.metadata ?? {},
24
23
  signal: options.signal,
25
24
  artifacts,
26
25
  setArtifact: (name, value) => {
@@ -34,8 +33,7 @@ function createRunJudge(judgeHarness, signal) {
34
33
  return void 0;
35
34
  }
36
35
  return (input, options) => runJudgeHarness(judgeHarness, input, {
37
- metadata: options?.metadata,
38
- signal
36
+ signal: options?.signal ?? signal
39
37
  });
40
38
  }
41
39
  function resolveJudgeHarnessAssistantOutput(run) {
@@ -69,22 +67,24 @@ function FactualityJudge(config = {}) {
69
67
  return {
70
68
  name: config.name ?? "FactualityJudge",
71
69
  judgeHarness,
72
- assess: (opts) => assessFactuality(opts, judgeHarness)
70
+ assess: (opts) => assessFactuality(opts, {
71
+ expected: config.expected,
72
+ judgeHarness
73
+ })
73
74
  };
74
75
  }
75
- async function assessFactuality(opts, configuredJudgeHarness) {
76
- const metadata = opts.metadata;
77
- const expected = opts.expected === void 0 ? metadata.expected : opts.expected;
76
+ async function assessFactuality(opts, config) {
77
+ const expected = opts.expected ?? config.expected;
78
78
  if (isMissingExpectedAnswer(expected)) {
79
79
  return {
80
80
  score: 0,
81
81
  metadata: {
82
- rationale: "FactualityJudge requires a non-empty expert answer in `expected` or `metadata.expected`."
82
+ rationale: "FactualityJudge requires a non-empty expert answer in `expected` or FactualityJudge(...) config."
83
83
  }
84
84
  };
85
85
  }
86
86
  const runJudge = opts.runJudge ?? createRunJudge(
87
- configuredJudgeHarness,
87
+ config.judgeHarness,
88
88
  opts.signal
89
89
  );
90
90
  if (!runJudge) {