npm - @agentv/core - Versions diffs - 3.4.0 → 3.6.0 - Mend

@agentv/core 3.4.0 → 3.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

package/dist/agentv-provider-NFFLXG5M.js +7 -0
package/dist/{chunk-JO4HIAEF.js → chunk-2IZOTQ25.js} +1 -5
package/dist/chunk-2IZOTQ25.js.map +1 -0
package/dist/{chunk-Q52FQPKQ.js → chunk-W5YDZWT4.js} +2 -2
package/dist/chunk-W5YDZWT4.js.map +1 -0
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +1 -1
package/dist/index.cjs +449 -491
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +57 -47
package/dist/index.d.ts +57 -47
package/dist/index.js +451 -490
package/dist/index.js.map +1 -1
package/package.json +1 -1
package/dist/agentv-provider-HDSAUUEF.js +0 -7
package/dist/chunk-JO4HIAEF.js.map +0 -1
package/dist/chunk-Q52FQPKQ.js.map +0 -1
package/dist/{agentv-provider-HDSAUUEF.js.map → agentv-provider-NFFLXG5M.js.map} +0 -0

package/dist/index.cjs CHANGED

@@ -55,7 +55,7 @@ function createLanguageModel(modelString) {
     case "anthropic":
       return (0, import_anthropic.createAnthropic)()(modelName);
     case "azure":
-      return (0, import_azure.createAzure)()(modelName);
+      return (0, import_azure.createAzure)().chat(modelName);
     case "google":
       return (0, import_google.createGoogleGenerativeAI)()(modelName);
     default:
@@ -1580,7 +1580,6 @@ __export(index_exports, {
   freeformEvaluationSchema: () => freeformEvaluationSchema,
   generateRubrics: () => generateRubrics,
   getAgentvHome: () => getAgentvHome,
-  getHitCount: () => getHitCount,
   getOutputFilenames: () => getOutputFilenames,
   getSubagentsRoot: () => getSubagentsRoot,
   getTraceStateRoot: () => getTraceStateRoot,
@@ -1730,9 +1729,6 @@ var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
 function isEvaluatorKind(value) {
   return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
 }
-function getHitCount(result) {
-  return result.hits.length;
-}
 // src/evaluation/trace.ts
 function computeTraceSummary(messages) {
@@ -2449,14 +2445,8 @@ var import_promises5 = require("fs/promises");
 // src/evaluation/template-variables.ts
 var TEMPLATE_VARIABLES = {
-  /** @deprecated Use OUTPUT_TEXT instead */
-  ANSWER: "answer",
   EXPECTED_OUTPUT: "expected_output",
-  /** @deprecated Use INPUT_TEXT instead */
-  QUESTION: "question",
   CRITERIA: "criteria",
-  /** @deprecated Use EXPECTED_OUTPUT_TEXT instead */
-  REFERENCE_ANSWER: "reference_answer",
   INPUT: "input",
   OUTPUT: "output",
   FILE_CHANGES: "file_changes",
@@ -2466,9 +2456,8 @@ var TEMPLATE_VARIABLES = {
 };
 var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
 var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
-  TEMPLATE_VARIABLES.ANSWER,
-  TEMPLATE_VARIABLES.EXPECTED_OUTPUT,
-  TEMPLATE_VARIABLES.OUTPUT_TEXT
+  TEMPLATE_VARIABLES.OUTPUT_TEXT,
+  TEMPLATE_VARIABLES.EXPECTED_OUTPUT
 ]);
 // src/evaluation/validation/prompt-validator.ts
@@ -2491,13 +2480,13 @@ function validateTemplateVariables(content, source) {
     }
     match = variablePattern.exec(content);
   }
-  const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.ANSWER) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
+  const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
   const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
   const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
   if (!hasRequiredFields) {
     throw new Error(
       `Missing required fields. Must include at least one of:
-  - {{ ${TEMPLATE_VARIABLES.ANSWER} }} or {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
+  - {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
   - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
     );
   }
@@ -5576,7 +5565,7 @@ var AzureProvider = class {
     };
     this.retryConfig = config.retry;
     const azure = (0, import_azure2.createAzure)(buildAzureOptions(config));
-    this.model = azure(config.deploymentName);
+    this.model = azure.chat(config.deploymentName);
   }
   id;
   kind = "azure";
@@ -5799,6 +5788,8 @@ async function invokeModel(options) {
   const { model, request, defaults, retryConfig, providerOptions } = options;
   const chatPrompt = buildChatPrompt(request);
   const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
+  const startTime = (/* @__PURE__ */ new Date()).toISOString();
+  const startMs = Date.now();
   const result = await withRetry(
     () => (0, import_ai.generateText)({
       model,
@@ -5812,9 +5803,11 @@ async function invokeModel(options) {
     retryConfig,
     request.signal
   );
-  return mapResponse(result);
+  const endTime = (/* @__PURE__ */ new Date()).toISOString();
+  const durationMs = Date.now() - startMs;
+  return mapResponse(result, { durationMs, startTime, endTime });
 }
-function mapResponse(result) {
+function mapResponse(result, timing) {
   const content = result.text ?? "";
   const rawUsage = result.totalUsage ?? result.usage;
   const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
@@ -5829,7 +5822,10 @@ function mapResponse(result) {
     raw: result,
     usage: toJsonObject(rawUsage),
     output: [{ role: "assistant", content }],
-    tokenUsage
+    tokenUsage,
+    durationMs: timing?.durationMs,
+    startTime: timing?.startTime,
+    endTime: timing?.endTime
   };
 }
 function toJsonObject(value) {
@@ -6707,10 +6703,12 @@ var ClaudeSdkProvider = class {
             if (usage) {
               const inputTokens = (usage.input_tokens ?? 0) + (usage.cache_read_input_tokens ?? 0) + (usage.cache_creation_input_tokens ?? 0);
               const outputTokens = usage.output_tokens ?? 0;
+              const reasoningTokens = usage.reasoning_tokens ?? void 0;
               tokenUsage = {
                 input: inputTokens,
                 output: outputTokens,
-                cached: usage.cache_read_input_tokens ?? void 0
+                cached: usage.cache_read_input_tokens ?? void 0,
+                reasoning: reasoningTokens
               };
               request.streamCallbacks?.onLlmCallEnd?.(this.config.model ?? "claude", tokenUsage);
             }
@@ -7724,7 +7722,8 @@ ${basePrompt}` : basePrompt;
           onUsage({
             input: usage.input_tokens ?? 0,
             output: usage.output_tokens ?? 0,
-            cached: usage.cached_input_tokens ?? void 0
+            cached: usage.cached_input_tokens ?? void 0,
+            reasoning: usage.reasoning_tokens ?? void 0
           });
         }
       }
@@ -9739,10 +9738,12 @@ function extractTokenUsage(events) {
           output: output ?? 0
         };
         const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
-        if (cached !== void 0) {
-          return { ...result, cached };
-        }
-        return result;
+        const reasoning = toFiniteNumber(u.reasoning_tokens ?? u.reasoningTokens ?? u.reasoning);
+        return {
+          ...result,
+          ...cached !== void 0 ? { cached } : {},
+          ...reasoning !== void 0 ? { reasoning } : {}
+        };
       }
     }
     const messages = record.messages;
@@ -12807,9 +12808,11 @@ function negateScore(score) {
     ...score,
     score: negatedScore,
     verdict: negatedVerdict,
-    reasoning: score.reasoning ? `[Negated] ${score.reasoning} (original score: ${score.score.toFixed(2)})` : `[Negated] Original score: ${score.score.toFixed(2)}`,
-    hits: score.misses,
-    misses: score.hits
+    assertions: score.assertions.map((a) => ({
+      ...a,
+      passed: !a.passed,
+      evidence: a.evidence ? `[Negated] ${a.evidence}` : void 0
+    }))
   };
 }
@@ -13267,11 +13270,9 @@ var CodeEvaluator = class {
       }
     }
     const payload = {
-      question: context2.evalCase.question,
       criteria: context2.evalCase.criteria,
       expectedOutput: context2.evalCase.expected_output,
-      referenceAnswer: context2.evalCase.reference_answer,
-      answer: context2.candidate,
+      outputText: context2.candidate,
       output: outputForPayload,
       outputPath,
       guidelineFiles: context2.evalCase.guideline_paths,
@@ -13288,9 +13289,7 @@ var CodeEvaluator = class {
       fileChanges: context2.fileChanges ?? null,
       workspacePath: context2.workspacePath ?? null,
       config: this.config ?? null,
-      // Text convenience accessors (new names, always strings)
       inputText: context2.evalCase.question,
-      outputText: context2.candidate,
       expectedOutputText: context2.evalCase.reference_answer ?? ""
     };
     const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
@@ -13324,9 +13323,13 @@ var CodeEvaluator = class {
       );
       const parsed = parseJsonSafe(stdout);
       const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
-      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
-      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
-      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
+      const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
+        (a) => typeof a === "object" && a !== null && typeof a.text === "string"
+      ).map((a) => ({
+        text: String(a.text),
+        passed: Boolean(a.passed),
+        ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
+      })) : [];
       const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
       const proxyUsage = getProxyUsage?.();
       const evaluatorRawRequest = {
@@ -13342,10 +13345,8 @@ var CodeEvaluator = class {
       return {
         score,
         verdict: scoreToVerdict(score),
-        hits,
-        misses,
-        expectedAspectCount: hits.length + misses.length || 1,
-        reasoning,
+        assertions,
+        expectedAspectCount: assertions.length || 1,
         evaluatorRawRequest,
         ...details ? { details } : {},
         tokenUsage: proxyUsage?.tokenUsage
@@ -13356,10 +13357,8 @@ var CodeEvaluator = class {
       return {
         score: 0,
         verdict: "fail",
-        hits: [],
-        misses: [`Code evaluator failed: ${message}`],
+        assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
         expectedAspectCount: 1,
-        reasoning: message,
         evaluatorRawRequest: {
           command: this.command,
           ...this.cwd ? { cwd: this.cwd } : {},
@@ -13490,18 +13489,22 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
 {{${TEMPLATE_VARIABLES.CRITERIA}}}
 [[ ## question ## ]]
-{{${TEMPLATE_VARIABLES.QUESTION}}}
+{{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
 [[ ## reference_answer ## ]]
-{{${TEMPLATE_VARIABLES.REFERENCE_ANSWER}}}
+{{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
 [[ ## answer ## ]]
-{{${TEMPLATE_VARIABLES.ANSWER}}}`;
+{{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
 var freeformEvaluationSchema = import_zod4.z.object({
   score: import_zod4.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
-  hits: import_zod4.z.array(import_zod4.z.string()).describe("Brief specific achievements").optional(),
-  misses: import_zod4.z.array(import_zod4.z.string()).describe("Brief failures or omissions").optional(),
-  reasoning: import_zod4.z.string().describe("Concise explanation (1-2 sentences)").optional()
+  assertions: import_zod4.z.array(
+    import_zod4.z.object({
+      text: import_zod4.z.string().describe("Brief description of what was checked"),
+      passed: import_zod4.z.boolean().describe("Whether this aspect was satisfied"),
+      evidence: import_zod4.z.string().describe("Concise evidence (1-2 sentences)").optional()
+    })
+  ).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
 });
 var rubricCheckResultSchema = import_zod4.z.object({
   id: import_zod4.z.string().describe("The ID of the rubric item being checked"),
@@ -13570,12 +13573,8 @@ var LlmGraderEvaluator = class {
         2
       ),
       [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context2.output ?? [], null, 2),
-      [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
-      [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
       [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
-      [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
       [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
-      // Text convenience accessors (new names, always strings)
       [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
       [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
       [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
@@ -13603,17 +13602,12 @@ ${context2.fileChanges}`;
         schema: freeformEvaluationSchema
       });
       const score = clampScore(data.score);
-      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
-      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
-      const reasoning = data.reasoning;
-      const expectedAspectCount = Math.max(hits.length + misses.length, 1);
+      const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
       return {
         score,
         verdict: scoreToVerdict(score),
-        hits,
-        misses,
-        expectedAspectCount,
-        reasoning,
+        assertions,
+        expectedAspectCount: Math.max(assertions.length, 1),
         evaluatorRawRequest,
         tokenUsage
       };
@@ -13624,10 +13618,8 @@ ${context2.fileChanges}`;
       return {
         score: 0,
         verdict: "skip",
-        hits: [],
-        misses: [`Grader parse failure after 3 attempts: ${message}`],
+        assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
         expectedAspectCount: 1,
-        reasoning: `Grader parse failure after 3 attempts: ${message}`,
         evaluatorRawRequest
       };
     }
@@ -13657,14 +13649,12 @@ ${context2.fileChanges}`;
         userPrompt: prompt,
         schema: rubricEvaluationSchema
       });
-      const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
+      const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
       return {
         score,
         verdict,
-        hits,
-        misses,
+        assertions,
         expectedAspectCount: rubrics.length,
-        reasoning: data.overall_reasoning,
         evaluatorRawRequest,
         tokenUsage
       };
@@ -13675,10 +13665,8 @@ ${context2.fileChanges}`;
       return {
         score: 0,
         verdict: "skip",
-        hits: [],
-        misses: [`Grader parse failure after 3 attempts: ${message}`],
+        assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
         expectedAspectCount: rubrics.length,
-        reasoning: `Grader parse failure after 3 attempts: ${message}`,
         evaluatorRawRequest
       };
     }
@@ -13703,14 +13691,12 @@ ${context2.fileChanges}`;
         userPrompt: prompt,
         schema: scoreRangeEvaluationSchema
       });
-      const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
+      const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
       return {
         score,
         verdict,
-        hits,
-        misses,
+        assertions,
         expectedAspectCount: rubrics.length,
-        reasoning: data.overall_reasoning,
         evaluatorRawRequest,
         details,
         tokenUsage
@@ -13722,10 +13708,8 @@ ${context2.fileChanges}`;
       return {
         score: 0,
         verdict: "skip",
-        hits: [],
-        misses: [`Grader parse failure after 3 attempts: ${message}`],
+        assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
         expectedAspectCount: rubrics.length,
-        reasoning: `Grader parse failure after 3 attempts: ${message}`,
         evaluatorRawRequest
       };
     }
@@ -13782,8 +13766,7 @@ ${context2.fileChanges}`;
       return {
         score: 0,
         verdict: "fail",
-        hits: [],
-        misses: [`llm-grader built-in evaluation failed: ${message}`],
+        assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
         expectedAspectCount: 1,
         evaluatorRawRequest,
         details: { mode: "built-in", error: message }
@@ -13833,8 +13816,9 @@ ${context2.fileChanges}`;
         return {
           score: 0,
           verdict: "fail",
-          hits: [],
-          misses: [`llm-grader ${modeLabel} returned no assistant response`],
+          assertions: [
+            { text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
+          ],
           expectedAspectCount: 1,
           evaluatorRawRequest,
           details: { mode: modeLabel, grader_target: provider.targetName }
@@ -13852,8 +13836,9 @@ ${context2.fileChanges}`;
       return {
         score: 0,
         verdict: "fail",
-        hits: [],
-        misses: [`llm-grader ${modeLabel} evaluation failed: ${message}`],
+        assertions: [
+          { text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
+        ],
         expectedAspectCount: 1,
         evaluatorRawRequest,
         details: {
@@ -13894,10 +13879,10 @@ ${context2.fileChanges}`;
   buildAgentUserPrompt(context2) {
     const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
     const variables = {
-      [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
-      [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
       [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
-      [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
+      [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
+      [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
+      [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
       [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
     };
     if (this.evaluatorTemplate) {
@@ -13950,10 +13935,10 @@ ${context2.fileChanges}`;
     const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
     if (this.evaluatorTemplate) {
       const variables = {
-        [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
-        [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
         [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
-        [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
+        [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
+        [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
+        [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
         [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
       };
       const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
@@ -14005,29 +13990,24 @@ ${outputSchema}`;
       const parsed = parseJsonFromText(text);
       if (rubrics && rubrics.length > 0) {
         const data2 = rubricEvaluationSchema.parse(parsed);
-        const { score: score2, verdict, hits: hits2, misses: misses2 } = calculateRubricScore(data2, rubrics);
+        const { score: score2, verdict, assertions: assertions2 } = calculateRubricScore(data2, rubrics);
         return {
           score: score2,
           verdict,
-          hits: hits2,
-          misses: misses2,
+          assertions: assertions2,
           expectedAspectCount: rubrics.length,
-          reasoning: data2.overall_reasoning,
           evaluatorRawRequest,
           details
         };
       }
       const data = freeformEvaluationSchema.parse(parsed);
       const score = clampScore(data.score);
-      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
-      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
+      const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
       return {
         score,
         verdict: scoreToVerdict(score),
-        hits,
-        misses,
-        expectedAspectCount: Math.max(hits.length + misses.length, 1),
-        reasoning: data.reasoning,
+        assertions,
+        expectedAspectCount: Math.max(assertions.length, 1),
         evaluatorRawRequest,
         details
       };
@@ -14035,8 +14015,12 @@ ${outputSchema}`;
       return {
         score: 0,
         verdict: "fail",
-        hits: [],
-        misses: ["Failed to parse llm-grader agent response as valid evaluation JSON"],
+        assertions: [
+          {
+            text: "Failed to parse llm-grader agent response as valid evaluation JSON",
+            passed: false
+          }
+        ],
         expectedAspectCount: 1,
         evaluatorRawRequest,
         details
@@ -14165,9 +14149,13 @@ function buildOutputSchema() {
     "",
     "{",
     '  "score": <number between 0.0 and 1.0>,',
-    '  "hits": [<array of strings, max 4 items, brief specific achievements>],',
-    '  "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
-    '  "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
+    '  "assertions": [',
+    "    {",
+    '      "text": "<brief description of what was checked>",',
+    '      "passed": <boolean>,',
+    '      "evidence": "<concise evidence, 1-2 sentences, optional>"',
+    "    }",
+    "  ]",
     "}"
   ].join("\n");
 }
@@ -14192,8 +14180,7 @@ function substituteVariables(template, variables) {
 }
 function calculateRubricScore(result, rubrics) {
   const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
-  const hits = [];
-  const misses = [];
+  const assertions = [];
   let totalWeight = 0;
   let earnedWeight = 0;
   let failedRequired = false;
@@ -14203,19 +14190,20 @@ function calculateRubricScore(result, rubrics) {
       continue;
     }
     totalWeight += rubric.weight;
+    assertions.push({
+      text: `[${rubric.id}] ${rubric.outcome}`,
+      passed: check.satisfied,
+      evidence: check.reasoning
+    });
     if (check.satisfied) {
       earnedWeight += rubric.weight;
-      hits.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
-    } else {
-      misses.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
-      if (rubric.required) {
-        failedRequired = true;
-      }
+    } else if (rubric.required) {
+      failedRequired = true;
     }
   }
   const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
   const verdict = failedRequired ? "fail" : scoreToVerdict(score);
-  return { score, verdict, hits, misses };
+  return { score, verdict, assertions };
 }
 function buildScoreRangeOutputSchema() {
   return `You are an expert evaluator. Score the candidate answer on each criterion.
@@ -14235,8 +14223,7 @@ Important: The "score" must be an integer from 0 to 10 that falls within one of
 }
 function calculateScoreRangeResult(result, rubrics) {
   const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
-  const hits = [];
-  const misses = [];
+  const assertions = [];
   const rawScores = {};
   let totalWeight = 0;
   let weightedScoreSum = 0;
@@ -14262,24 +14249,22 @@ function calculateScoreRangeResult(result, rubrics) {
     );
     const rangeDescription = matchingRange?.outcome ?? "";
     const criterionLabel = rubric.outcome ?? rubric.id;
-    const reasoningText = check.reasoning ? `: ${check.reasoning}` : "";
-    const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
+    const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
     if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
       failedRequired = true;
-      misses.push(scoreInfo);
-    } else if (rawScore >= 7) {
-      hits.push(scoreInfo);
-    } else {
-      misses.push(scoreInfo);
     }
+    assertions.push({
+      text: `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})`,
+      passed,
+      evidence: check.reasoning
+    });
   }
   const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
   const verdict = failedRequired ? "fail" : scoreToVerdict(score);
   return {
     score,
     verdict,
-    hits,
-    misses,
+    assertions,
     details: {
       raw_scores: rawScores,
       normalization: "score / 10",
@@ -14455,9 +14440,7 @@ var CompositeEvaluator = class {
     let totalWeight = 0;
     let weightedSum = 0;
     let evaluatedCount = 0;
-    const allHits = [];
-    const allMisses = [];
-    const reasoningParts = [];
+    const allAssertions = [];
     const scores = [];
     for (const member of results) {
       const weight = weights?.[member.id] ?? 1;
@@ -14467,9 +14450,7 @@ var CompositeEvaluator = class {
         score: member.result.score,
         weight,
         verdict: member.result.verdict,
-        hits: [...member.result.hits],
-        misses: [...member.result.misses],
-        reasoning: member.result.reasoning,
+        assertions: [...member.result.assertions],
         evaluatorRawRequest: member.result.evaluatorRawRequest,
         scores: member.result.scores,
         details: member.result.details,
@@ -14481,20 +14462,16 @@ var CompositeEvaluator = class {
       evaluatedCount++;
       totalWeight += weight;
       weightedSum += member.result.score * weight;
-      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
-      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
-      if (member.result.reasoning) {
-        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
-      }
+      allAssertions.push(
+        ...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
+      );
     }
     if (evaluatedCount === 0 && results.length > 0) {
       return {
         score: 0,
         verdict: "skip",
-        hits: [],
-        misses: [],
+        assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
         expectedAspectCount: 1,
-        reasoning: "All evaluators skipped (infrastructure failure)",
         evaluatorRawRequest: {
           aggregator: "weighted_average",
           ...weights ? { weights } : {}
@@ -14506,10 +14483,8 @@ var CompositeEvaluator = class {
     return {
       score: clampScore(finalScore),
       verdict: scoreToVerdict(finalScore),
-      hits: allHits,
-      misses: allMisses,
-      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
-      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
+      assertions: allAssertions,
+      expectedAspectCount: allAssertions.length || 1,
       evaluatorRawRequest: {
         aggregator: "weighted_average",
         ...weights ? { weights } : {}
@@ -14519,11 +14494,8 @@ var CompositeEvaluator = class {
   }
   runThreshold(results, threshold) {
     const scores = [];
-    const allHits = [];
-    const allMisses = [];
-    const reasoningParts = [];
+    const allAssertions = [];
     let passingCount = 0;
-    let borderlineCount = 0;
     let evaluatedCount = 0;
     for (const member of results) {
       scores.push({
@@ -14531,9 +14503,7 @@ var CompositeEvaluator = class {
         type: member.type,
         score: member.result.score,
         verdict: member.result.verdict,
-        hits: [...member.result.hits],
-        misses: [...member.result.misses],
-        reasoning: member.result.reasoning,
+        assertions: [...member.result.assertions],
         evaluatorRawRequest: member.result.evaluatorRawRequest,
         scores: member.result.scores,
         details: member.result.details,
@@ -14546,24 +14516,17 @@ var CompositeEvaluator = class {
       const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
       if (isPassing) {
         passingCount++;
-        if (member.result.verdict === "borderline") {
-          borderlineCount++;
-        }
-      }
-      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
-      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
-      if (member.result.reasoning) {
-        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
       }
+      allAssertions.push(
+        ...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
+      );
     }
     if (evaluatedCount === 0 && results.length > 0) {
       return {
         score: 0,
         verdict: "skip",
-        hits: [],
-        misses: [],
+        assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
         expectedAspectCount: 1,
-        reasoning: "All evaluators skipped (infrastructure failure)",
         evaluatorRawRequest: {
           aggregator: "threshold",
           threshold
@@ -14574,19 +14537,15 @@ var CompositeEvaluator = class {
     const totalCount = evaluatedCount;
     const score = totalCount > 0 ? passingCount / totalCount : 0;
     const pass = score >= threshold;
-    if (pass && borderlineCount > 0) {
-      reasoningParts.push(`Warning: ${borderlineCount} borderline evaluator(s) counted as passing`);
-    }
-    reasoningParts.unshift(
-      `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`
-    );
+    allAssertions.unshift({
+      text: `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`,
+      passed: pass
+    });
     return {
       score: clampScore(score),
       verdict: pass ? "pass" : "fail",
-      hits: allHits,
-      misses: allMisses,
-      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
-      reasoning: reasoningParts.join("; "),
+      assertions: allAssertions,
+      expectedAspectCount: allAssertions.length || 1,
       evaluatorRawRequest: {
         aggregator: "threshold",
         threshold
@@ -14603,9 +14562,7 @@ var CompositeEvaluator = class {
       score: member.result.score,
       weight: weights?.[member.id] ?? 1,
       verdict: member.result.verdict,
-      hits: [...member.result.hits],
-      misses: [...member.result.misses],
-      reasoning: member.result.reasoning,
+      assertions: [...member.result.assertions],
       evaluatorRawRequest: member.result.evaluatorRawRequest,
       scores: member.result.scores,
       details: member.result.details
@@ -14614,17 +14571,19 @@ var CompositeEvaluator = class {
       const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
       const parsed = parseJsonSafe(stdout);
       const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
-      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
-      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
-      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
+      const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
+        (a) => typeof a === "object" && a !== null && typeof a.text === "string"
+      ).map((a) => ({
+        text: String(a.text),
+        passed: Boolean(a.passed),
+        ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
+      })) : [];
       const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
       return {
         score,
         verdict,
-        hits,
-        misses,
-        expectedAspectCount: hits.length + misses.length || 1,
-        reasoning,
+        assertions,
+        expectedAspectCount: assertions.length || 1,
         evaluatorRawRequest: {
           aggregator: "code-grader",
           script: scriptPath
@@ -14636,10 +14595,8 @@ var CompositeEvaluator = class {
       return {
         score: 0,
         verdict: "fail",
-        hits: [],
-        misses: [`Code aggregator failed: ${message}`],
+        assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
         expectedAspectCount: 1,
-        reasoning: message,
         evaluatorRawRequest: {
           aggregator: "code-grader",
           script: scriptPath,
@@ -14661,9 +14618,7 @@ var CompositeEvaluator = class {
       type: member.type,
       score: member.result.score,
       verdict: member.result.verdict,
-      hits: [...member.result.hits],
-      misses: [...member.result.misses],
-      reasoning: member.result.reasoning,
+      assertions: [...member.result.assertions],
       evaluatorRawRequest: member.result.evaluatorRawRequest,
       scores: member.result.scores,
       details: member.result.details
@@ -14687,16 +14642,12 @@ var CompositeEvaluator = class {
         });
         const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
         const score2 = clampScore(data2.score);
-        const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
-        const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
-        const reasoning2 = data2.reasoning;
+        const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
         return {
           score: score2,
           verdict: scoreToVerdict(score2),
-          hits: hits2,
-          misses: misses2,
-          expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
-          reasoning: reasoning2,
+          assertions: assertions2,
+          expectedAspectCount: Math.max(assertions2.length, 1),
           evaluatorRawRequest,
           scores
         };
@@ -14711,16 +14662,12 @@ var CompositeEvaluator = class {
         parseJsonFromText(extractLastAssistantContent2(response.output))
       );
       const score = clampScore(data.score);
-      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
-      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
-      const reasoning = data.reasoning;
+      const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
       return {
         score,
         verdict: scoreToVerdict(score),
-        hits,
-        misses,
-        expectedAspectCount: Math.max(hits.length + misses.length, 1),
-        reasoning,
+        assertions,
+        expectedAspectCount: Math.max(assertions.length, 1),
         evaluatorRawRequest,
         scores
       };
@@ -14728,8 +14675,7 @@ var CompositeEvaluator = class {
       return {
         score: 0,
         verdict: "fail",
-        hits: [],
-        misses: [],
+        assertions: [{ text: "LLM aggregator failed", passed: false }],
         expectedAspectCount: 1,
         evaluatorRawRequest,
         scores
@@ -14752,10 +14698,8 @@ var CostEvaluator = class {
       return {
         score: 0,
         verdict: "fail",
-        hits: [],
-        misses: ["No cost data available in trace"],
+        assertions: [{ text: "No cost data available in trace", passed: false }],
         expectedAspectCount: 1,
-        reasoning: "Execution cost not reported by provider",
         evaluatorRawRequest: {
           type: "cost",
           budget,
@@ -14769,10 +14713,10 @@ var CostEvaluator = class {
     return {
       score,
       verdict: passed ? "pass" : "fail",
-      hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
-      misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
+      assertions: [
+        passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
+      ],
       expectedAspectCount: 1,
-      reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
       evaluatorRawRequest: {
         type: "cost",
         budget,
@@ -14805,10 +14749,8 @@ var ExecutionMetricsEvaluator = class {
       return {
         score: 0,
         verdict: "fail",
-        hits: [],
-        misses: ["No trace summary available"],
+        assertions: [{ text: "No trace summary available", passed: false }],
         expectedAspectCount: 1,
-        reasoning: "Execution metrics not available - no trace summary provided",
         evaluatorRawRequest: {
           type: "execution-metrics",
           config: this.extractConfiguredThresholds(),
@@ -14817,116 +14759,114 @@ var ExecutionMetricsEvaluator = class {
       };
     }
     const narrowedTrace = trace2;
-    const hits = [];
-    const misses = [];
+    const assertions = [];
     const actualMetrics = {};
     if (max_tool_calls !== void 0 && narrowedTrace) {
       const toolCalls = narrowedTrace.eventCount;
       actualMetrics.tool_calls = toolCalls;
       if (toolCalls <= max_tool_calls) {
-        hits.push(`Tool calls ${toolCalls} <= ${max_tool_calls} max`);
+        assertions.push({ text: `Tool calls ${toolCalls} <= ${max_tool_calls} max`, passed: true });
       } else {
-        misses.push(`Tool calls ${toolCalls} > ${max_tool_calls} max`);
+        assertions.push({ text: `Tool calls ${toolCalls} > ${max_tool_calls} max`, passed: false });
       }
     }
     if (max_llm_calls !== void 0 && narrowedTrace) {
       const llmCalls = narrowedTrace.llmCallCount;
       if (llmCalls === void 0) {
-        misses.push("LLM call count data not available");
+        assertions.push({ text: "LLM call count data not available", passed: false });
       } else {
         actualMetrics.llm_calls = llmCalls;
         if (llmCalls <= max_llm_calls) {
-          hits.push(`LLM calls ${llmCalls} <= ${max_llm_calls} max`);
+          assertions.push({ text: `LLM calls ${llmCalls} <= ${max_llm_calls} max`, passed: true });
         } else {
-          misses.push(`LLM calls ${llmCalls} > ${max_llm_calls} max`);
+          assertions.push({ text: `LLM calls ${llmCalls} > ${max_llm_calls} max`, passed: false });
         }
       }
     }
     if (max_tokens !== void 0) {
       if (!tokenUsage) {
-        misses.push("Token usage data not available");
+        assertions.push({ text: "Token usage data not available", passed: false });
       } else {
         const totalTokens = tokenUsage.input + tokenUsage.output;
         actualMetrics.tokens = totalTokens;
         if (totalTokens <= max_tokens) {
-          hits.push(`Total tokens ${totalTokens} <= ${max_tokens} max`);
+          assertions.push({
+            text: `Total tokens ${totalTokens} <= ${max_tokens} max`,
+            passed: true
+          });
         } else {
-          misses.push(`Total tokens ${totalTokens} > ${max_tokens} max`);
+          assertions.push({
+            text: `Total tokens ${totalTokens} > ${max_tokens} max`,
+            passed: false
+          });
         }
       }
     }
     if (max_cost_usd !== void 0) {
       if (costUsd === void 0) {
-        misses.push("Cost data not available");
+        assertions.push({ text: "Cost data not available", passed: false });
       } else {
         actualMetrics.cost_usd = costUsd;
         const formatCost = (n) => `$${n.toFixed(4)}`;
         if (costUsd <= max_cost_usd) {
-          hits.push(`Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`);
+          assertions.push({
+            text: `Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`,
+            passed: true
+          });
         } else {
-          misses.push(`Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`);
+          assertions.push({
+            text: `Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`,
+            passed: false
+          });
         }
       }
     }
     if (max_duration_ms !== void 0) {
       if (durationMs === void 0) {
-        misses.push("Duration data not available");
+        assertions.push({ text: "Duration data not available", passed: false });
       } else {
         actualMetrics.duration_ms = durationMs;
         if (durationMs <= max_duration_ms) {
-          hits.push(`Duration ${durationMs}ms <= ${max_duration_ms}ms max`);
+          assertions.push({
+            text: `Duration ${durationMs}ms <= ${max_duration_ms}ms max`,
+            passed: true
+          });
         } else {
-          misses.push(`Duration ${durationMs}ms > ${max_duration_ms}ms max`);
+          assertions.push({
+            text: `Duration ${durationMs}ms > ${max_duration_ms}ms max`,
+            passed: false
+          });
         }
       }
     }
     if (target_exploration_ratio !== void 0 && narrowedTrace) {
       const ratio = explorationRatio(narrowedTrace);
       if (ratio === void 0) {
-        misses.push("Exploration ratio not available (no tool calls)");
+        assertions.push({ text: "Exploration ratio not available (no tool calls)", passed: false });
       } else {
         actualMetrics.exploration_ratio = ratio;
         const diff = Math.abs(ratio - target_exploration_ratio);
         if (diff <= exploration_tolerance) {
-          hits.push(
-            `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`
-          );
+          assertions.push({
+            text: `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`,
+            passed: true
+          });
         } else {
-          misses.push(
-            `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`
-          );
+          assertions.push({
+            text: `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`,
+            passed: false
+          });
         }
       }
     }
-    const totalChecks = hits.length + misses.length;
-    const score = totalChecks > 0 ? hits.length / totalChecks : 0;
-    const reasoningParts = [];
-    if (actualMetrics.tool_calls !== void 0) {
-      reasoningParts.push(`tool_calls=${actualMetrics.tool_calls}`);
-    }
-    if (actualMetrics.llm_calls !== void 0) {
-      reasoningParts.push(`llm_calls=${actualMetrics.llm_calls}`);
-    }
-    if (actualMetrics.tokens !== void 0) {
-      reasoningParts.push(`tokens=${actualMetrics.tokens}`);
-    }
-    if (actualMetrics.cost_usd !== void 0) {
-      reasoningParts.push(`cost=$${actualMetrics.cost_usd.toFixed(4)}`);
-    }
-    if (actualMetrics.duration_ms !== void 0) {
-      reasoningParts.push(`duration=${actualMetrics.duration_ms}ms`);
-    }
-    if (actualMetrics.exploration_ratio !== void 0) {
-      reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
-    }
-    const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
+    const totalChecks = assertions.length;
+    const passedCount = assertions.filter((a) => a.passed).length;
+    const score = totalChecks > 0 ? passedCount / totalChecks : 0;
     return {
       score,
       verdict: scoreToVerdict(score),
-      hits,
-      misses,
+      assertions,
       expectedAspectCount: totalChecks || 1,
-      reasoning,
       evaluatorRawRequest: {
         type: "execution-metrics",
         config: this.extractConfiguredThresholds(),
@@ -15030,10 +14970,8 @@ var FieldAccuracyEvaluator = class {
       return {
         score: 0,
         verdict: "fail",
-        hits: [],
-        misses: ["Failed to parse candidate answer as JSON"],
-        expectedAspectCount: this.config.fields.length,
-        reasoning: "Candidate answer is not valid JSON"
+        assertions: [{ text: "Failed to parse candidate answer as JSON", passed: false }],
+        expectedAspectCount: this.config.fields.length
       };
     }
     const expectedData = this.extractExpectedData(evalCase.expected_output);
@@ -15041,10 +14979,8 @@ var FieldAccuracyEvaluator = class {
       return {
         score: 0,
         verdict: "fail",
-        hits: [],
-        misses: ["No expected data found in expected_output"],
-        expectedAspectCount: this.config.fields.length,
-        reasoning: "Could not extract expected data from expected_output"
+        assertions: [{ text: "No expected data found in expected_output", passed: false }],
+        expectedAspectCount: this.config.fields.length
       };
     }
     const fieldResults = [];
@@ -15262,18 +15198,14 @@ var FieldAccuracyEvaluator = class {
    */
   aggregateResults(results) {
     const aggregation = this.config.aggregation ?? "weighted_average";
-    const hits = [];
-    const misses = [];
+    const assertions = [];
     for (const result of results) {
-      if (result.hit) {
-        hits.push(result.message);
-      } else {
-        misses.push(result.message);
-      }
+      assertions.push({ text: result.message, passed: result.hit });
     }
     let score;
     if (aggregation === "all_or_nothing") {
-      score = misses.length === 0 ? 1 : 0;
+      const hasFailed = assertions.some((a) => !a.passed);
+      score = hasFailed ? 0 : 1;
     } else {
       const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
       if (totalWeight === 0) {
@@ -15283,15 +15215,11 @@ var FieldAccuracyEvaluator = class {
         score = weightedSum / totalWeight;
       }
     }
-    const reasoning = `${hits.length}/${results.length} fields matched`;
     return {
       score: clampScore(score),
       verdict: scoreToVerdict(score),
-      hits: hits.slice(0, 4),
-      // Cap at 4 to keep output concise
-      misses: misses.slice(0, 4),
-      expectedAspectCount: results.length,
-      reasoning
+      assertions,
+      expectedAspectCount: results.length
     };
   }
 };
@@ -15400,10 +15328,8 @@ var LatencyEvaluator = class {
       return {
         score: 0,
         verdict: "fail",
-        hits: [],
-        misses: ["No duration data available in trace"],
+        assertions: [{ text: "No duration data available in trace", passed: false }],
         expectedAspectCount: 1,
-        reasoning: "Execution duration not reported by provider",
         evaluatorRawRequest: {
           type: "latency",
           threshold,
@@ -15416,10 +15342,10 @@ var LatencyEvaluator = class {
     return {
       score,
       verdict: passed ? "pass" : "fail",
-      hits: passed ? [`Duration ${durationMs}ms <= ${threshold}ms threshold`] : [],
-      misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
+      assertions: [
+        passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
+      ],
       expectedAspectCount: 1,
-      reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
       evaluatorRawRequest: {
         type: "latency",
         threshold,
@@ -15440,7 +15366,10 @@ var COPILOT_MATCHER = {
   skillTools: ["Skill", "skill"],
   skillInputField: "skill",
   readTools: ["Read File", "readFile", "Read", "readTextFile"],
-  readInputField: "file_path"
+  readInputField: "file_path",
+  skillToolPrefixes: ["Using skill: "],
+  readToolPrefixes: ["Viewing "],
+  readInputFields: ["file_path", "path"]
 };
 var PROVIDER_TOOL_SEMANTICS = {
   claude: CLAUDE_MATCHER,
@@ -15482,12 +15411,22 @@ var SkillTriggerEvaluator = class {
           triggered = true;
           evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
         }
+      } else if (matcher.skillToolPrefixes?.some(
+        (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
+      )) {
+        triggered = true;
+        evidence = `Skill tool invoked via tool name "${firstTool.tool}"`;
       } else if (matcher.readTools.includes(firstTool.tool)) {
-        const filePath = String(input[matcher.readInputField] ?? "");
+        const filePath = this.readPathFromInput(input, matcher);
         if (filePath.includes(skillName)) {
           triggered = true;
           evidence = `Read tool loaded skill file: ${filePath}`;
         }
+      } else if (matcher.readToolPrefixes?.some(
+        (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
+      )) {
+        triggered = true;
+        evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`;
       }
     }
     const pass = triggered === shouldTrigger;
@@ -15495,25 +15434,37 @@ var SkillTriggerEvaluator = class {
       return {
         score: 1,
         verdict: "pass",
-        hits: [
-          shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`
+        assertions: [
+          {
+            text: shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`,
+            passed: true
+          }
         ],
-        misses: [],
-        expectedAspectCount: 1,
-        reasoning: shouldTrigger ? "Skill triggered correctly" : "No false trigger"
+        expectedAspectCount: 1
       };
     }
     return {
       score: 0,
       verdict: "fail",
-      hits: [],
-      misses: [
-        shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`
+      assertions: [
+        {
+          text: shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`,
+          passed: false
+        }
       ],
-      expectedAspectCount: 1,
-      reasoning: shouldTrigger ? `Skill "${skillName}" was not triggered` : "False trigger: skill fired when it should not have"
+      expectedAspectCount: 1
     };
   }
+  readPathFromInput(input, matcher) {
+    const fields = matcher.readInputFields ?? [matcher.readInputField];
+    for (const field of fields) {
+      const value = input[field];
+      if (value !== void 0 && value !== null) {
+        return String(value);
+      }
+    }
+    return "";
+  }
 };
 // src/evaluation/evaluators/llm-grader-prompt.ts
@@ -15548,12 +15499,8 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
     [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2),
     [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
     [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
-    [TEMPLATE_VARIABLES.ANSWER]: candidate.trim(),
-    [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (evalCase.reference_answer ?? "").trim(),
     [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
-    [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
     [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
-    // Text convenience accessors (new names, always strings)
     [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
     [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
     [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
@@ -15680,10 +15627,8 @@ var TokenUsageEvaluator = class {
       return {
         score: 0,
         verdict: "fail",
-        hits: [],
-        misses: ["No token usage data available in trace"],
+        assertions: [{ text: "No token usage data available in trace", passed: false }],
         expectedAspectCount,
-        reasoning: "Token usage not reported by provider",
         evaluatorRawRequest: {
           type: "token-usage",
           max_total: maxTotal ?? null,
@@ -15697,37 +15642,34 @@ var TokenUsageEvaluator = class {
     const output = usage.output;
     const cached = usage.cached ?? 0;
     const total = input + output + cached;
-    const hits = [];
-    const misses = [];
+    const assertions = [];
     if (typeof maxInput === "number") {
       if (input <= maxInput) {
-        hits.push(`Input tokens ${input} <= ${maxInput}`);
+        assertions.push({ text: `Input tokens ${input} <= ${maxInput}`, passed: true });
       } else {
-        misses.push(`Input tokens ${input} > ${maxInput}`);
+        assertions.push({ text: `Input tokens ${input} > ${maxInput}`, passed: false });
       }
     }
     if (typeof maxOutput === "number") {
       if (output <= maxOutput) {
-        hits.push(`Output tokens ${output} <= ${maxOutput}`);
+        assertions.push({ text: `Output tokens ${output} <= ${maxOutput}`, passed: true });
       } else {
-        misses.push(`Output tokens ${output} > ${maxOutput}`);
+        assertions.push({ text: `Output tokens ${output} > ${maxOutput}`, passed: false });
       }
     }
     if (typeof maxTotal === "number") {
       if (total <= maxTotal) {
-        hits.push(`Total tokens ${total} <= ${maxTotal}`);
+        assertions.push({ text: `Total tokens ${total} <= ${maxTotal}`, passed: true });
       } else {
-        misses.push(`Total tokens ${total} > ${maxTotal}`);
+        assertions.push({ text: `Total tokens ${total} > ${maxTotal}`, passed: false });
       }
     }
-    const passed = misses.length === 0;
+    const passed = assertions.every((a) => a.passed);
     return {
       score: passed ? 1 : 0,
       verdict: passed ? "pass" : "fail",
-      hits,
-      misses,
+      assertions,
       expectedAspectCount,
-      reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
       evaluatorRawRequest: {
         type: "token-usage",
         max_total: maxTotal ?? null,
@@ -15827,8 +15769,7 @@ var ToolTrajectoryEvaluator = class {
       return {
         score: 0,
         verdict: "fail",
-        hits: [],
-        misses: ["No trace available for evaluation"],
+        assertions: [{ text: "No trace available for evaluation", passed: false }],
         expectedAspectCount: 1
       };
     }
@@ -15839,8 +15780,7 @@ var ToolTrajectoryEvaluator = class {
           return {
             score: 0,
             verdict: "fail",
-            hits: [],
-            misses: ["No trace available for evaluation"],
+            assertions: [{ text: "No trace available for evaluation", passed: false }],
             expectedAspectCount: 1
           };
         }
@@ -15858,8 +15798,7 @@ var ToolTrajectoryEvaluator = class {
         return {
           score: 0,
           verdict: "fail",
-          hits: [],
-          misses: [`Unknown mode: ${this.config.mode}`],
+          assertions: [{ text: `Unknown mode: ${this.config.mode}`, passed: false }],
           expectedAspectCount: 1
         };
     }
@@ -15908,28 +15847,32 @@ var ToolTrajectoryEvaluator = class {
       return {
         score: 1,
         verdict: "pass",
-        hits: ["No tool requirements specified"],
-        misses: [],
+        assertions: [{ text: "No tool requirements specified", passed: true }],
         expectedAspectCount: 0
       };
     }
-    const hits = [];
-    const misses = [];
+    const assertions = [];
     for (const toolName of toolNames) {
       const required = minimums[toolName];
       const actual = summary.toolCallsByName[toolName] ?? 0;
       if (actual >= required) {
-        hits.push(`${toolName}: called ${actual} times (required >=${required})`);
+        assertions.push({
+          text: `${toolName}: called ${actual} times (required >=${required})`,
+          passed: true
+        });
       } else {
-        misses.push(`${toolName}: called ${actual} times (required >=${required})`);
+        assertions.push({
+          text: `${toolName}: called ${actual} times (required >=${required})`,
+          passed: false
+        });
       }
     }
-    const score = hits.length / toolNames.length;
+    const passedCount = assertions.filter((a) => a.passed).length;
+    const score = passedCount / toolNames.length;
     return {
       score,
       verdict: scoreToVerdict(score),
-      hits,
-      misses,
+      assertions,
       expectedAspectCount: toolNames.length
     };
   }
@@ -15939,13 +15882,11 @@ var ToolTrajectoryEvaluator = class {
       return {
         score: 1,
         verdict: "pass",
-        hits: ["No tool sequence specified"],
-        misses: [],
+        assertions: [{ text: "No tool sequence specified", passed: true }],
         expectedAspectCount: 0
       };
     }
-    const hits = [];
-    const misses = [];
+    const assertions = [];
     const warnings = [];
     let actualIndex = 0;
     let sequenceHits = 0;
@@ -15965,16 +15906,20 @@ var ToolTrajectoryEvaluator = class {
         const actualCall = toolCalls[actualIndex];
         if (actualCall.name === expectedTool) {
           if (argsMatch(expectedItem.args, actualCall.args, mode)) {
-            hits.push(`Found ${expectedTool} at position ${actualIndex}`);
+            assertions.push({
+              text: `Found ${expectedTool} at position ${actualIndex}`,
+              passed: true
+            });
             sequenceHits++;
             matchedCall = actualCall;
             actualIndex++;
             found = true;
             break;
           }
-          misses.push(
-            `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
-          );
+          assertions.push({
+            text: `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`,
+            passed: false
+          });
           actualIndex++;
           argsMismatch = true;
           break;
@@ -15982,7 +15927,10 @@ var ToolTrajectoryEvaluator = class {
         actualIndex++;
       }
       if (!found && !argsMismatch) {
-        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
+        assertions.push({
+          text: `Expected ${expectedTool} at position ${i}, not found in remaining trace`,
+          passed: false
+        });
       }
       if (found && matchedCall) {
         const latencyResult = checkLatency(
@@ -15991,10 +15939,10 @@ var ToolTrajectoryEvaluator = class {
           matchedCall.durationMs
         );
         if (latencyResult.status === "pass") {
-          hits.push(latencyResult.message);
+          assertions.push({ text: latencyResult.message, passed: true });
           latencyHits++;
         } else if (latencyResult.status === "fail") {
-          misses.push(latencyResult.message);
+          assertions.push({ text: latencyResult.message, passed: false });
         } else if (latencyResult.message) {
           warnings.push(latencyResult.message);
           latencySkips++;
@@ -16010,8 +15958,7 @@ var ToolTrajectoryEvaluator = class {
     return {
       score,
       verdict: scoreToVerdict(score),
-      hits,
-      misses,
+      assertions,
       expectedAspectCount: totalAssertions
     };
   }
@@ -16021,13 +15968,11 @@ var ToolTrajectoryEvaluator = class {
       return {
         score: 1,
         verdict: "pass",
-        hits: ["No tool sequence specified"],
-        misses: [],
+        assertions: [{ text: "No tool sequence specified", passed: true }],
         expectedAspectCount: 0
       };
     }
-    const hits = [];
-    const misses = [];
+    const assertions = [];
     const warnings = [];
     let sequenceHits = 0;
     let latencyHits = 0;
@@ -16036,7 +15981,10 @@ var ToolTrajectoryEvaluator = class {
       (item) => item.maxDurationMs !== void 0
     ).length;
     if (toolCalls.length !== expected.length) {
-      misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
+      assertions.push({
+        text: `Expected ${expected.length} tool calls, got ${toolCalls.length}`,
+        passed: false
+      });
     }
     const checkLength = Math.min(expected.length, toolCalls.length);
     for (let i = 0; i < checkLength; i++) {
@@ -16048,14 +15996,17 @@ var ToolTrajectoryEvaluator = class {
       let sequenceMatched = false;
       if (actualTool === expectedTool) {
         if (argsMatch(expectedItem.args, actualCall.args, mode)) {
-          hits.push(`Position ${i}: ${expectedTool}`);
+          assertions.push({ text: `Position ${i}: ${expectedTool}`, passed: true });
           sequenceHits++;
           sequenceMatched = true;
         } else {
-          misses.push(`Position ${i}: ${expectedTool} args mismatch`);
+          assertions.push({ text: `Position ${i}: ${expectedTool} args mismatch`, passed: false });
         }
       } else {
-        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
+        assertions.push({
+          text: `Position ${i}: expected ${expectedTool}, got ${actualTool}`,
+          passed: false
+        });
       }
       if (sequenceMatched) {
         const latencyResult = checkLatency(
@@ -16064,10 +16015,10 @@ var ToolTrajectoryEvaluator = class {
           actualCall.durationMs
         );
         if (latencyResult.status === "pass") {
-          hits.push(latencyResult.message);
+          assertions.push({ text: latencyResult.message, passed: true });
           latencyHits++;
         } else if (latencyResult.status === "fail") {
-          misses.push(latencyResult.message);
+          assertions.push({ text: latencyResult.message, passed: false });
         } else if (latencyResult.message) {
           warnings.push(latencyResult.message);
           latencySkips++;
@@ -16075,7 +16026,10 @@ var ToolTrajectoryEvaluator = class {
       }
     }
     for (let i = checkLength; i < expected.length; i++) {
-      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
+      assertions.push({
+        text: `Position ${i}: expected ${expected[i].tool}, got nothing`,
+        passed: false
+      });
     }
     for (const warning of warnings) {
       console.warn(`[tool-trajectory] ${warning}`);
@@ -16086,8 +16040,7 @@ var ToolTrajectoryEvaluator = class {
     return {
       score,
       verdict: scoreToVerdict(score),
-      hits,
-      misses,
+      assertions,
       expectedAspectCount: totalAssertions
     };
   }
@@ -16102,13 +16055,11 @@ var ToolTrajectoryEvaluator = class {
       return {
         score: 1,
         verdict: "pass",
-        hits: ["No expected tools specified"],
-        misses: [],
+        assertions: [{ text: "No expected tools specified", passed: true }],
         expectedAspectCount: 0
       };
     }
-    const hits = [];
-    const misses = [];
+    const assertions = [];
     const consumed = /* @__PURE__ */ new Set();
     for (let i = 0; i < expected.length; i++) {
       const expectedItem = expected[i];
@@ -16119,22 +16070,25 @@ var ToolTrajectoryEvaluator = class {
         if (consumed.has(j)) continue;
         const actualCall = toolCalls[j];
         if (actualCall.name === expectedTool && argsMatch(expectedItem.args, actualCall.args, mode)) {
-          hits.push(`Found ${expectedTool} at position ${j}`);
+          assertions.push({ text: `Found ${expectedTool} at position ${j}`, passed: true });
           consumed.add(j);
           found = true;
           break;
         }
       }
       if (!found) {
-        misses.push(`Expected ${expectedTool} not found in actual trajectory`);
+        assertions.push({
+          text: `Expected ${expectedTool} not found in actual trajectory`,
+          passed: false
+        });
       }
     }
-    const score = expected.length > 0 ? hits.length / expected.length : 1;
+    const passedCount = assertions.filter((a) => a.passed).length;
+    const score = expected.length > 0 ? passedCount / expected.length : 1;
     return {
       score,
       verdict: scoreToVerdict(score),
-      hits,
-      misses,
+      assertions,
       expectedAspectCount: expected.length
     };
   }
@@ -16150,16 +16104,19 @@ var ToolTrajectoryEvaluator = class {
         return {
           score: 1,
           verdict: "pass",
-          hits: ["No tool calls and no expected tools"],
-          misses: [],
+          assertions: [{ text: "No tool calls and no expected tools", passed: true }],
           expectedAspectCount: 0
         };
       }
       return {
         score: 0,
         verdict: "fail",
-        hits: [],
-        misses: [`${toolCalls.length} unexpected tool call(s) with empty allowed list`],
+        assertions: [
+          {
+            text: `${toolCalls.length} unexpected tool call(s) with empty allowed list`,
+            passed: false
+          }
+        ],
         expectedAspectCount: toolCalls.length
       };
     }
@@ -16167,13 +16124,11 @@ var ToolTrajectoryEvaluator = class {
       return {
         score: 1,
         verdict: "pass",
-        hits: ["No actual tool calls (trivially a subset)"],
-        misses: [],
+        assertions: [{ text: "No actual tool calls (trivially a subset)", passed: true }],
         expectedAspectCount: 0
       };
     }
-    const hits = [];
-    const misses = [];
+    const assertions = [];
     for (let i = 0; i < toolCalls.length; i++) {
       const actualCall = toolCalls[i];
       let allowed = false;
@@ -16185,17 +16140,23 @@ var ToolTrajectoryEvaluator = class {
         }
       }
       if (allowed) {
-        hits.push(`Position ${i}: ${actualCall.name} is in allowed set`);
+        assertions.push({
+          text: `Position ${i}: ${actualCall.name} is in allowed set`,
+          passed: true
+        });
       } else {
-        misses.push(`Position ${i}: ${actualCall.name} is not in allowed set`);
+        assertions.push({
+          text: `Position ${i}: ${actualCall.name} is not in allowed set`,
+          passed: false
+        });
       }
     }
-    const score = toolCalls.length > 0 ? hits.length / toolCalls.length : 1;
+    const passedCount = assertions.filter((a) => a.passed).length;
+    const score = toolCalls.length > 0 ? passedCount / toolCalls.length : 1;
     return {
       score,
       verdict: scoreToVerdict(score),
-      hits,
-      misses,
+      assertions,
       expectedAspectCount: toolCalls.length
     };
   }
@@ -16206,8 +16167,12 @@ function runContainsAssertion(output, value) {
   const passed = output.includes(value);
   return {
     score: passed ? 1 : 0,
-    hits: passed ? [`Output contains "${value}"`] : [],
-    misses: passed ? [] : [`Output does not contain "${value}"`]
+    assertions: [
+      {
+        text: passed ? `Output contains "${value}"` : `Output does not contain "${value}"`,
+        passed
+      }
+    ]
   };
 }
 function runContainsAnyAssertion(output, values) {
@@ -16215,8 +16180,12 @@ function runContainsAnyAssertion(output, values) {
   const passed = matched.length > 0;
   return {
     score: passed ? 1 : 0,
-    hits: passed ? [`Output contains "${matched[0]}"`] : [],
-    misses: passed ? [] : [`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`]
+    assertions: [
+      {
+        text: passed ? `Output contains "${matched[0]}"` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`,
+        passed
+      }
+    ]
   };
 }
 function runContainsAllAssertion(output, values) {
@@ -16224,16 +16193,24 @@ function runContainsAllAssertion(output, values) {
   const passed = missing.length === 0;
   return {
     score: passed ? 1 : 0,
-    hits: passed ? [`Output contains all ${values.length} expected strings`] : [],
-    misses: passed ? [] : [`Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`]
+    assertions: [
+      {
+        text: passed ? `Output contains all ${values.length} expected strings` : `Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`,
+        passed
+      }
+    ]
   };
 }
 function runIcontainsAssertion(output, value) {
   const passed = output.toLowerCase().includes(value.toLowerCase());
   return {
     score: passed ? 1 : 0,
-    hits: passed ? [`Output contains "${value}" (case-insensitive)`] : [],
-    misses: passed ? [] : [`Output does not contain "${value}" (case-insensitive)`]
+    assertions: [
+      {
+        text: passed ? `Output contains "${value}" (case-insensitive)` : `Output does not contain "${value}" (case-insensitive)`,
+        passed
+      }
+    ]
   };
 }
 function runIcontainsAnyAssertion(output, values) {
@@ -16242,9 +16219,11 @@ function runIcontainsAnyAssertion(output, values) {
   const passed = matched.length > 0;
   return {
     score: passed ? 1 : 0,
-    hits: passed ? [`Output contains "${matched[0]}" (case-insensitive)`] : [],
-    misses: passed ? [] : [
-      `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`
+    assertions: [
+      {
+        text: passed ? `Output contains "${matched[0]}" (case-insensitive)` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`,
+        passed
+      }
     ]
   };
 }
@@ -16254,24 +16233,36 @@ function runIcontainsAllAssertion(output, values) {
   const passed = missing.length === 0;
   return {
     score: passed ? 1 : 0,
-    hits: passed ? [`Output contains all ${values.length} expected strings (case-insensitive)`] : [],
-    misses: passed ? [] : [`Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`]
+    assertions: [
+      {
+        text: passed ? `Output contains all ${values.length} expected strings (case-insensitive)` : `Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`,
+        passed
+      }
+    ]
   };
 }
 function runStartsWithAssertion(output, value) {
   const passed = output.trim().startsWith(value.trim());
   return {
     score: passed ? 1 : 0,
-    hits: passed ? [`Output starts with "${value}"`] : [],
-    misses: passed ? [] : [`Output does not start with "${value}"`]
+    assertions: [
+      {
+        text: passed ? `Output starts with "${value}"` : `Output does not start with "${value}"`,
+        passed
+      }
+    ]
   };
 }
 function runEndsWithAssertion(output, value) {
   const passed = output.trim().endsWith(value.trim());
   return {
     score: passed ? 1 : 0,
-    hits: passed ? [`Output ends with "${value}"`] : [],
-    misses: passed ? [] : [`Output does not end with "${value}"`]
+    assertions: [
+      {
+        text: passed ? `Output ends with "${value}"` : `Output does not end with "${value}"`,
+        passed
+      }
+    ]
   };
 }
 function runRegexAssertion(output, pattern, flags) {
@@ -16280,8 +16271,12 @@ function runRegexAssertion(output, pattern, flags) {
   const flagsLabel = flags ? ` (flags: ${flags})` : "";
   return {
     score: passed ? 1 : 0,
-    hits: passed ? [`Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}`] : [],
-    misses: passed ? [] : [`Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`]
+    assertions: [
+      {
+        text: passed ? `Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}` : `Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`,
+        passed
+      }
+    ]
   };
 }
 function runIsJsonAssertion(output) {
@@ -16293,16 +16288,24 @@ function runIsJsonAssertion(output) {
   }
   return {
     score: passed ? 1 : 0,
-    hits: passed ? ["Output is valid JSON"] : [],
-    misses: passed ? [] : ["Output is not valid JSON"]
+    assertions: [
+      {
+        text: passed ? "Output is valid JSON" : "Output is not valid JSON",
+        passed
+      }
+    ]
   };
 }
 function runEqualsAssertion(output, value) {
   const passed = output.trim() === value.trim();
   return {
     score: passed ? 1 : 0,
-    hits: passed ? [`Output equals "${value}"`] : [],
-    misses: passed ? [] : [`Output does not equal "${value}"`]
+    assertions: [
+      {
+        text: passed ? `Output equals "${value}"` : `Output does not equal "${value}"`,
+        passed
+      }
+    ]
   };
 }
@@ -16515,10 +16518,8 @@ var InlineAssertEvaluator = class {
     return {
       score,
       verdict: scoreToVerdict(score),
-      hits: score >= 0.8 ? [result.name] : [],
-      misses: score < 0.5 ? [result.name] : [],
+      assertions: [{ text: result.name, passed: score >= 0.5 }],
       expectedAspectCount: 1,
-      reasoning: void 0,
       details: result.metadata ? result.metadata : void 0
     };
   }
@@ -16556,11 +16557,9 @@ async function resolveCustomPrompt(promptConfig, context2, timeoutMs) {
 }
 async function executePromptTemplate(script, context2, config, timeoutMs) {
   const payload = {
-    question: context2.evalCase.question,
     criteria: context2.evalCase.criteria,
     expectedOutput: context2.evalCase.expected_output,
-    referenceAnswer: context2.evalCase.reference_answer,
-    answer: context2.candidate,
+    outputText: context2.candidate,
     output: context2.output ?? null,
     guidelineFiles: context2.evalCase.guideline_paths,
     inputFiles: context2.evalCase.file_paths.filter(
@@ -16571,9 +16570,7 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
     fileChanges: context2.fileChanges ?? null,
     workspacePath: context2.workspacePath ?? null,
     config: config ?? context2.config ?? null,
-    // Text convenience accessors (new names, always strings)
     inputText: context2.evalCase.question,
-    outputText: context2.candidate,
     expectedOutputText: context2.evalCase.reference_answer ?? ""
   };
   const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
@@ -16711,9 +16708,7 @@ var containsFactory = (config) => {
     return {
       score: result.score,
       verdict: result.score === 1 ? "pass" : "fail",
-      hits: result.hits,
-      misses: result.misses,
-      reasoning: result.score === 1 ? `Output contains "${c.value}"` : `Output does not contain "${c.value}"`,
+      assertions: result.assertions,
       expectedAspectCount: 1
     };
   });
@@ -16725,9 +16720,7 @@ var regexFactory = (config) => {
     return {
       score: result.score,
       verdict: result.score === 1 ? "pass" : "fail",
-      hits: result.hits,
-      misses: result.misses,
-      reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
+      assertions: result.assertions,
       expectedAspectCount: 1
     };
   });
@@ -16738,9 +16731,7 @@ var isJsonFactory = () => {
     return {
       score: result.score,
       verdict: result.score === 1 ? "pass" : "fail",
-      hits: result.hits,
-      misses: result.misses,
-      reasoning: result.score === 1 ? "Output is valid JSON" : "Output is not valid JSON",
+      assertions: result.assertions,
       expectedAspectCount: 1
     };
   });
@@ -16752,9 +16743,7 @@ var equalsFactory = (config) => {
     return {
       score: result.score,
       verdict: result.score === 1 ? "pass" : "fail",
-      hits: result.hits,
-      misses: result.misses,
-      reasoning: result.score === 1 ? `Output equals "${c.value}"` : `Output does not equal "${c.value}"`,
+      assertions: result.assertions,
       expectedAspectCount: 1
     };
   });
@@ -16766,9 +16755,7 @@ var containsAnyFactory = (config) => {
     return {
       score: result.score,
       verdict: result.score === 1 ? "pass" : "fail",
-      hits: result.hits,
-      misses: result.misses,
-      reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
+      assertions: result.assertions,
       expectedAspectCount: 1
     };
   });
@@ -16780,9 +16767,7 @@ var containsAllFactory = (config) => {
     return {
       score: result.score,
       verdict: result.score === 1 ? "pass" : "fail",
-      hits: result.hits,
-      misses: result.misses,
-      reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
+      assertions: result.assertions,
       expectedAspectCount: 1
     };
   });
@@ -16794,9 +16779,7 @@ var icontainsFactory = (config) => {
     return {
       score: result.score,
       verdict: result.score === 1 ? "pass" : "fail",
-      hits: result.hits,
-      misses: result.misses,
-      reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
+      assertions: result.assertions,
       expectedAspectCount: 1
     };
   });
@@ -16808,9 +16791,7 @@ var icontainsAnyFactory = (config) => {
     return {
       score: result.score,
       verdict: result.score === 1 ? "pass" : "fail",
-      hits: result.hits,
-      misses: result.misses,
-      reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
+      assertions: result.assertions,
       expectedAspectCount: 1
     };
   });
@@ -16822,9 +16803,7 @@ var icontainsAllFactory = (config) => {
     return {
       score: result.score,
       verdict: result.score === 1 ? "pass" : "fail",
-      hits: result.hits,
-      misses: result.misses,
-      reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
+      assertions: result.assertions,
       expectedAspectCount: 1
     };
   });
@@ -16836,9 +16815,7 @@ var startsWithFactory = (config) => {
     return {
       score: result.score,
       verdict: result.score === 1 ? "pass" : "fail",
-      hits: result.hits,
-      misses: result.misses,
-      reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
+      assertions: result.assertions,
       expectedAspectCount: 1
     };
   });
@@ -16850,9 +16827,7 @@ var endsWithFactory = (config) => {
     return {
       score: result.score,
       verdict: result.score === 1 ? "pass" : "fail",
-      hits: result.hits,
-      misses: result.misses,
-      reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
+      assertions: result.assertions,
       expectedAspectCount: 1
     };
   });
@@ -18258,9 +18233,8 @@ async function runEvaluation(options) {
             testId: evalCase.id,
             dataset: evalCase.dataset,
             score: 0,
-            hits: [],
-            misses: [],
-            answer: "",
+            assertions: [],
+            outputText: "",
             target: target.name,
             error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
             budgetExceeded: true,
@@ -18295,9 +18269,8 @@ async function runEvaluation(options) {
             testId: evalCase.id,
             dataset: evalCase.dataset,
             score: 0,
-            hits: [],
-            misses: [],
-            answer: "",
+            assertions: [],
+            outputText: "",
             target: target.name,
             error: errorMsg,
             executionStatus: "execution_error",
@@ -19263,11 +19236,9 @@ async function evaluateCandidate(options) {
     dataset: evalCase.dataset,
     conversationId: evalCase.conversation_id,
     score: score.score,
-    hits: score.hits,
-    misses: score.misses,
-    answer: candidate,
+    assertions: score.assertions,
+    outputText: candidate,
     target: target.name,
-    reasoning: score.reasoning,
     tokenUsage,
     costUsd,
     durationMs,
@@ -19441,9 +19412,7 @@ async function runEvaluatorList(options) {
         score: score2.score,
         weight,
         verdict: score2.verdict,
-        hits: score2.hits,
-        misses: score2.misses,
-        reasoning: score2.reasoning,
+        assertions: score2.assertions,
         evaluatorProviderRequest: score2.evaluatorRawRequest,
         details: score2.details,
         scores: mapChildResults(score2.scores),
@@ -19458,10 +19427,10 @@ async function runEvaluatorList(options) {
       const fallbackScore = {
         score: 0,
         verdict: "fail",
-        hits: [],
-        misses: [`Evaluator '${evaluatorConfig.name}' failed: ${message}`],
-        expectedAspectCount: 1,
-        reasoning: message
+        assertions: [
+          { text: `Evaluator '${evaluatorConfig.name}' failed: ${message}`, passed: false }
+        ],
+        expectedAspectCount: 1
       };
       const weight = evaluatorConfig.weight ?? 1;
       scored.push({
@@ -19477,9 +19446,12 @@ async function runEvaluatorList(options) {
         score: 0,
         weight,
         verdict: "fail",
-        hits: [],
-        misses: [`Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`],
-        reasoning: message,
+        assertions: [
+          {
+            text: `Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
+            passed: false
+          }
+        ],
         durationMs: endedAt.getTime() - startedAt.getTime(),
         startedAt: startedAt.toISOString(),
         endedAt: endedAt.toISOString()
@@ -19495,9 +19467,7 @@ async function runEvaluatorList(options) {
           ...scores[lastScoresIdx],
           score: negated.score,
           verdict: negated.verdict,
-          hits: [...negated.hits],
-          misses: [...negated.misses],
-          reasoning: negated.reasoning
+          assertions: [...negated.assertions]
         };
       }
     }
@@ -19512,21 +19482,13 @@ async function runEvaluatorList(options) {
   const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
     scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
   ) : 0;
-  const hits = scored.flatMap((entry) => entry.score.hits);
-  const misses = scored.flatMap((entry) => entry.score.misses);
-  const expectedAspectCount = scored.reduce(
-    (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
-    0
-  );
-  const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
-  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
+  const assertions = scored.flatMap((entry) => entry.score.assertions);
+  const expectedAspectCount = assertions.length || 1;
   const score = {
     score: aggregateScore,
     verdict: scoreToVerdict(aggregateScore),
-    hits,
-    misses,
-    expectedAspectCount,
-    reasoning
+    assertions,
+    expectedAspectCount
   };
   return { score, scores };
 }
@@ -19630,9 +19592,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
     dataset: evalCase.dataset,
     conversationId: evalCase.conversation_id,
     score: 0,
-    hits: [],
-    misses: [`Error: ${message}`],
-    answer: `Error occurred: ${message}`,
+    assertions: [{ text: `Error: ${message}`, passed: false }],
+    outputText: `Error occurred: ${message}`,
     target: targetName,
     requests,
     input,
@@ -19741,9 +19702,7 @@ function mapChildResults(children) {
     score: child.score,
     weight: child.weight,
     verdict: child.verdict,
-    hits: child.hits,
-    misses: child.misses,
-    reasoning: child.reasoning,
+    assertions: child.assertions,
     evaluatorProviderRequest: child.evaluatorRawRequest,
     scores: mapChildResults(child.scores),
     details: child.details,
@@ -20172,7 +20131,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
 // src/evaluation/baseline.ts
 var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
-  "answer",
+  "outputText",
   "requests",
   "trace",
   "workspacePath",
@@ -20346,7 +20305,7 @@ var OtelTraceExporter = class {
         rootSpan.setAttribute("agentv.target", result.target);
         if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
         rootSpan.setAttribute("agentv.score", result.score);
-        if (captureContent) rootSpan.setAttribute("agentv.answer", result.answer);
+        if (captureContent) rootSpan.setAttribute("agentv.output_text", result.outputText);
         if (result.durationMs != null)
           rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
         if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
@@ -20713,7 +20672,6 @@ function createAgentKernel() {
   freeformEvaluationSchema,
   generateRubrics,
   getAgentvHome,
-  getHitCount,
   getOutputFilenames,
   getSubagentsRoot,
   getTraceStateRoot,