@agentv/core 3.4.0 → 3.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -5,7 +5,6 @@ import {
5
5
  extractLastAssistantContent,
6
6
  fileExists,
7
7
  findGitRoot,
8
- getHitCount,
9
8
  isAgentProvider,
10
9
  isEvaluatorKind,
11
10
  isJsonObject,
@@ -17,10 +16,10 @@ import {
17
16
  readTextFile,
18
17
  resolveFileReference,
19
18
  resolveTargetDefinition
20
- } from "./chunk-JO4HIAEF.js";
19
+ } from "./chunk-2IZOTQ25.js";
21
20
  import {
22
21
  AgentvProvider
23
- } from "./chunk-Q52FQPKQ.js";
22
+ } from "./chunk-W5YDZWT4.js";
24
23
  import {
25
24
  OtlpJsonFileExporter
26
25
  } from "./chunk-HFSYZHGF.js";
@@ -743,14 +742,8 @@ import { readFile as readFile4 } from "node:fs/promises";
743
742
 
744
743
  // src/evaluation/template-variables.ts
745
744
  var TEMPLATE_VARIABLES = {
746
- /** @deprecated Use OUTPUT_TEXT instead */
747
- ANSWER: "answer",
748
745
  EXPECTED_OUTPUT: "expected_output",
749
- /** @deprecated Use INPUT_TEXT instead */
750
- QUESTION: "question",
751
746
  CRITERIA: "criteria",
752
- /** @deprecated Use EXPECTED_OUTPUT_TEXT instead */
753
- REFERENCE_ANSWER: "reference_answer",
754
747
  INPUT: "input",
755
748
  OUTPUT: "output",
756
749
  FILE_CHANGES: "file_changes",
@@ -760,9 +753,8 @@ var TEMPLATE_VARIABLES = {
760
753
  };
761
754
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
762
755
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
763
- TEMPLATE_VARIABLES.ANSWER,
764
- TEMPLATE_VARIABLES.EXPECTED_OUTPUT,
765
- TEMPLATE_VARIABLES.OUTPUT_TEXT
756
+ TEMPLATE_VARIABLES.OUTPUT_TEXT,
757
+ TEMPLATE_VARIABLES.EXPECTED_OUTPUT
766
758
  ]);
767
759
 
768
760
  // src/evaluation/validation/prompt-validator.ts
@@ -785,13 +777,13 @@ function validateTemplateVariables(content, source) {
785
777
  }
786
778
  match = variablePattern.exec(content);
787
779
  }
788
- const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.ANSWER) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
780
+ const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
789
781
  const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
790
782
  const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
791
783
  if (!hasRequiredFields) {
792
784
  throw new Error(
793
785
  `Missing required fields. Must include at least one of:
794
- - {{ ${TEMPLATE_VARIABLES.ANSWER} }} or {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
786
+ - {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
795
787
  - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
796
788
  );
797
789
  }
@@ -3752,7 +3744,7 @@ var AzureProvider = class {
3752
3744
  };
3753
3745
  this.retryConfig = config.retry;
3754
3746
  const azure = createAzure(buildAzureOptions(config));
3755
- this.model = azure(config.deploymentName);
3747
+ this.model = azure.chat(config.deploymentName);
3756
3748
  }
3757
3749
  id;
3758
3750
  kind = "azure";
@@ -3975,6 +3967,8 @@ async function invokeModel(options) {
3975
3967
  const { model, request, defaults, retryConfig, providerOptions } = options;
3976
3968
  const chatPrompt = buildChatPrompt(request);
3977
3969
  const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
3970
+ const startTime = (/* @__PURE__ */ new Date()).toISOString();
3971
+ const startMs = Date.now();
3978
3972
  const result = await withRetry(
3979
3973
  () => generateText({
3980
3974
  model,
@@ -3988,9 +3982,11 @@ async function invokeModel(options) {
3988
3982
  retryConfig,
3989
3983
  request.signal
3990
3984
  );
3991
- return mapResponse(result);
3985
+ const endTime = (/* @__PURE__ */ new Date()).toISOString();
3986
+ const durationMs = Date.now() - startMs;
3987
+ return mapResponse(result, { durationMs, startTime, endTime });
3992
3988
  }
3993
- function mapResponse(result) {
3989
+ function mapResponse(result, timing) {
3994
3990
  const content = result.text ?? "";
3995
3991
  const rawUsage = result.totalUsage ?? result.usage;
3996
3992
  const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
@@ -4005,7 +4001,10 @@ function mapResponse(result) {
4005
4001
  raw: result,
4006
4002
  usage: toJsonObject(rawUsage),
4007
4003
  output: [{ role: "assistant", content }],
4008
- tokenUsage
4004
+ tokenUsage,
4005
+ durationMs: timing?.durationMs,
4006
+ startTime: timing?.startTime,
4007
+ endTime: timing?.endTime
4009
4008
  };
4010
4009
  }
4011
4010
  function toJsonObject(value) {
@@ -4883,10 +4882,12 @@ var ClaudeSdkProvider = class {
4883
4882
  if (usage) {
4884
4883
  const inputTokens = (usage.input_tokens ?? 0) + (usage.cache_read_input_tokens ?? 0) + (usage.cache_creation_input_tokens ?? 0);
4885
4884
  const outputTokens = usage.output_tokens ?? 0;
4885
+ const reasoningTokens = usage.reasoning_tokens ?? void 0;
4886
4886
  tokenUsage = {
4887
4887
  input: inputTokens,
4888
4888
  output: outputTokens,
4889
- cached: usage.cache_read_input_tokens ?? void 0
4889
+ cached: usage.cache_read_input_tokens ?? void 0,
4890
+ reasoning: reasoningTokens
4890
4891
  };
4891
4892
  request.streamCallbacks?.onLlmCallEnd?.(this.config.model ?? "claude", tokenUsage);
4892
4893
  }
@@ -5900,7 +5901,8 @@ ${basePrompt}` : basePrompt;
5900
5901
  onUsage({
5901
5902
  input: usage.input_tokens ?? 0,
5902
5903
  output: usage.output_tokens ?? 0,
5903
- cached: usage.cached_input_tokens ?? void 0
5904
+ cached: usage.cached_input_tokens ?? void 0,
5905
+ reasoning: usage.reasoning_tokens ?? void 0
5904
5906
  });
5905
5907
  }
5906
5908
  }
@@ -7914,10 +7916,12 @@ function extractTokenUsage(events) {
7914
7916
  output: output ?? 0
7915
7917
  };
7916
7918
  const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
7917
- if (cached !== void 0) {
7918
- return { ...result, cached };
7919
- }
7920
- return result;
7919
+ const reasoning = toFiniteNumber(u.reasoning_tokens ?? u.reasoningTokens ?? u.reasoning);
7920
+ return {
7921
+ ...result,
7922
+ ...cached !== void 0 ? { cached } : {},
7923
+ ...reasoning !== void 0 ? { reasoning } : {}
7924
+ };
7921
7925
  }
7922
7926
  }
7923
7927
  const messages = record.messages;
@@ -9784,9 +9788,11 @@ function negateScore(score) {
9784
9788
  ...score,
9785
9789
  score: negatedScore,
9786
9790
  verdict: negatedVerdict,
9787
- reasoning: score.reasoning ? `[Negated] ${score.reasoning} (original score: ${score.score.toFixed(2)})` : `[Negated] Original score: ${score.score.toFixed(2)}`,
9788
- hits: score.misses,
9789
- misses: score.hits
9791
+ assertions: score.assertions.map((a) => ({
9792
+ ...a,
9793
+ passed: !a.passed,
9794
+ evidence: a.evidence ? `[Negated] ${a.evidence}` : void 0
9795
+ }))
9790
9796
  };
9791
9797
  }
9792
9798
 
@@ -10244,11 +10250,9 @@ var CodeEvaluator = class {
10244
10250
  }
10245
10251
  }
10246
10252
  const payload = {
10247
- question: context.evalCase.question,
10248
10253
  criteria: context.evalCase.criteria,
10249
10254
  expectedOutput: context.evalCase.expected_output,
10250
- referenceAnswer: context.evalCase.reference_answer,
10251
- answer: context.candidate,
10255
+ outputText: context.candidate,
10252
10256
  output: outputForPayload,
10253
10257
  outputPath,
10254
10258
  guidelineFiles: context.evalCase.guideline_paths,
@@ -10265,9 +10269,7 @@ var CodeEvaluator = class {
10265
10269
  fileChanges: context.fileChanges ?? null,
10266
10270
  workspacePath: context.workspacePath ?? null,
10267
10271
  config: this.config ?? null,
10268
- // Text convenience accessors (new names, always strings)
10269
10272
  inputText: context.evalCase.question,
10270
- outputText: context.candidate,
10271
10273
  expectedOutputText: context.evalCase.reference_answer ?? ""
10272
10274
  };
10273
10275
  const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
@@ -10301,9 +10303,13 @@ var CodeEvaluator = class {
10301
10303
  );
10302
10304
  const parsed = parseJsonSafe(stdout);
10303
10305
  const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
10304
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
10305
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
10306
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
10306
+ const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
10307
+ (a) => typeof a === "object" && a !== null && typeof a.text === "string"
10308
+ ).map((a) => ({
10309
+ text: String(a.text),
10310
+ passed: Boolean(a.passed),
10311
+ ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
10312
+ })) : [];
10307
10313
  const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
10308
10314
  const proxyUsage = getProxyUsage?.();
10309
10315
  const evaluatorRawRequest = {
@@ -10319,10 +10325,8 @@ var CodeEvaluator = class {
10319
10325
  return {
10320
10326
  score,
10321
10327
  verdict: scoreToVerdict(score),
10322
- hits,
10323
- misses,
10324
- expectedAspectCount: hits.length + misses.length || 1,
10325
- reasoning,
10328
+ assertions,
10329
+ expectedAspectCount: assertions.length || 1,
10326
10330
  evaluatorRawRequest,
10327
10331
  ...details ? { details } : {},
10328
10332
  tokenUsage: proxyUsage?.tokenUsage
@@ -10333,10 +10337,8 @@ var CodeEvaluator = class {
10333
10337
  return {
10334
10338
  score: 0,
10335
10339
  verdict: "fail",
10336
- hits: [],
10337
- misses: [`Code evaluator failed: ${message}`],
10340
+ assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
10338
10341
  expectedAspectCount: 1,
10339
- reasoning: message,
10340
10342
  evaluatorRawRequest: {
10341
10343
  command: this.command,
10342
10344
  ...this.cwd ? { cwd: this.cwd } : {},
@@ -10435,18 +10437,22 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
10435
10437
  {{${TEMPLATE_VARIABLES.CRITERIA}}}
10436
10438
 
10437
10439
  [[ ## question ## ]]
10438
- {{${TEMPLATE_VARIABLES.QUESTION}}}
10440
+ {{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
10439
10441
 
10440
10442
  [[ ## reference_answer ## ]]
10441
- {{${TEMPLATE_VARIABLES.REFERENCE_ANSWER}}}
10443
+ {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
10442
10444
 
10443
10445
  [[ ## answer ## ]]
10444
- {{${TEMPLATE_VARIABLES.ANSWER}}}`;
10446
+ {{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
10445
10447
  var freeformEvaluationSchema = z3.object({
10446
10448
  score: z3.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
10447
- hits: z3.array(z3.string()).describe("Brief specific achievements").optional(),
10448
- misses: z3.array(z3.string()).describe("Brief failures or omissions").optional(),
10449
- reasoning: z3.string().describe("Concise explanation (1-2 sentences)").optional()
10449
+ assertions: z3.array(
10450
+ z3.object({
10451
+ text: z3.string().describe("Brief description of what was checked"),
10452
+ passed: z3.boolean().describe("Whether this aspect was satisfied"),
10453
+ evidence: z3.string().describe("Concise evidence (1-2 sentences)").optional()
10454
+ })
10455
+ ).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
10450
10456
  });
10451
10457
  var rubricCheckResultSchema = z3.object({
10452
10458
  id: z3.string().describe("The ID of the rubric item being checked"),
@@ -10515,12 +10521,8 @@ var LlmGraderEvaluator = class {
10515
10521
  2
10516
10522
  ),
10517
10523
  [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context.output ?? [], null, 2),
10518
- [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
10519
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
10520
10524
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
10521
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
10522
10525
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
10523
- // Text convenience accessors (new names, always strings)
10524
10526
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
10525
10527
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
10526
10528
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
@@ -10548,17 +10550,12 @@ ${context.fileChanges}`;
10548
10550
  schema: freeformEvaluationSchema
10549
10551
  });
10550
10552
  const score = clampScore(data.score);
10551
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
10552
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
10553
- const reasoning = data.reasoning;
10554
- const expectedAspectCount = Math.max(hits.length + misses.length, 1);
10553
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
10555
10554
  return {
10556
10555
  score,
10557
10556
  verdict: scoreToVerdict(score),
10558
- hits,
10559
- misses,
10560
- expectedAspectCount,
10561
- reasoning,
10557
+ assertions,
10558
+ expectedAspectCount: Math.max(assertions.length, 1),
10562
10559
  evaluatorRawRequest,
10563
10560
  tokenUsage
10564
10561
  };
@@ -10569,10 +10566,8 @@ ${context.fileChanges}`;
10569
10566
  return {
10570
10567
  score: 0,
10571
10568
  verdict: "skip",
10572
- hits: [],
10573
- misses: [`Grader parse failure after 3 attempts: ${message}`],
10569
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
10574
10570
  expectedAspectCount: 1,
10575
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
10576
10571
  evaluatorRawRequest
10577
10572
  };
10578
10573
  }
@@ -10602,14 +10597,12 @@ ${context.fileChanges}`;
10602
10597
  userPrompt: prompt,
10603
10598
  schema: rubricEvaluationSchema
10604
10599
  });
10605
- const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
10600
+ const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
10606
10601
  return {
10607
10602
  score,
10608
10603
  verdict,
10609
- hits,
10610
- misses,
10604
+ assertions,
10611
10605
  expectedAspectCount: rubrics.length,
10612
- reasoning: data.overall_reasoning,
10613
10606
  evaluatorRawRequest,
10614
10607
  tokenUsage
10615
10608
  };
@@ -10620,10 +10613,8 @@ ${context.fileChanges}`;
10620
10613
  return {
10621
10614
  score: 0,
10622
10615
  verdict: "skip",
10623
- hits: [],
10624
- misses: [`Grader parse failure after 3 attempts: ${message}`],
10616
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
10625
10617
  expectedAspectCount: rubrics.length,
10626
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
10627
10618
  evaluatorRawRequest
10628
10619
  };
10629
10620
  }
@@ -10648,14 +10639,12 @@ ${context.fileChanges}`;
10648
10639
  userPrompt: prompt,
10649
10640
  schema: scoreRangeEvaluationSchema
10650
10641
  });
10651
- const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
10642
+ const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
10652
10643
  return {
10653
10644
  score,
10654
10645
  verdict,
10655
- hits,
10656
- misses,
10646
+ assertions,
10657
10647
  expectedAspectCount: rubrics.length,
10658
- reasoning: data.overall_reasoning,
10659
10648
  evaluatorRawRequest,
10660
10649
  details,
10661
10650
  tokenUsage
@@ -10667,10 +10656,8 @@ ${context.fileChanges}`;
10667
10656
  return {
10668
10657
  score: 0,
10669
10658
  verdict: "skip",
10670
- hits: [],
10671
- misses: [`Grader parse failure after 3 attempts: ${message}`],
10659
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
10672
10660
  expectedAspectCount: rubrics.length,
10673
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
10674
10661
  evaluatorRawRequest
10675
10662
  };
10676
10663
  }
@@ -10727,8 +10714,7 @@ ${context.fileChanges}`;
10727
10714
  return {
10728
10715
  score: 0,
10729
10716
  verdict: "fail",
10730
- hits: [],
10731
- misses: [`llm-grader built-in evaluation failed: ${message}`],
10717
+ assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
10732
10718
  expectedAspectCount: 1,
10733
10719
  evaluatorRawRequest,
10734
10720
  details: { mode: "built-in", error: message }
@@ -10778,8 +10764,9 @@ ${context.fileChanges}`;
10778
10764
  return {
10779
10765
  score: 0,
10780
10766
  verdict: "fail",
10781
- hits: [],
10782
- misses: [`llm-grader ${modeLabel} returned no assistant response`],
10767
+ assertions: [
10768
+ { text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
10769
+ ],
10783
10770
  expectedAspectCount: 1,
10784
10771
  evaluatorRawRequest,
10785
10772
  details: { mode: modeLabel, grader_target: provider.targetName }
@@ -10797,8 +10784,9 @@ ${context.fileChanges}`;
10797
10784
  return {
10798
10785
  score: 0,
10799
10786
  verdict: "fail",
10800
- hits: [],
10801
- misses: [`llm-grader ${modeLabel} evaluation failed: ${message}`],
10787
+ assertions: [
10788
+ { text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
10789
+ ],
10802
10790
  expectedAspectCount: 1,
10803
10791
  evaluatorRawRequest,
10804
10792
  details: {
@@ -10839,10 +10827,10 @@ ${context.fileChanges}`;
10839
10827
  buildAgentUserPrompt(context) {
10840
10828
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
10841
10829
  const variables = {
10842
- [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
10843
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
10844
10830
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
10845
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
10831
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
10832
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
10833
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim(),
10846
10834
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? ""
10847
10835
  };
10848
10836
  if (this.evaluatorTemplate) {
@@ -10895,10 +10883,10 @@ ${context.fileChanges}`;
10895
10883
  const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
10896
10884
  if (this.evaluatorTemplate) {
10897
10885
  const variables = {
10898
- [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
10899
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
10900
10886
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
10901
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
10887
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
10888
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
10889
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim(),
10902
10890
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? ""
10903
10891
  };
10904
10892
  const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
@@ -10950,29 +10938,24 @@ ${outputSchema}`;
10950
10938
  const parsed = parseJsonFromText(text);
10951
10939
  if (rubrics && rubrics.length > 0) {
10952
10940
  const data2 = rubricEvaluationSchema.parse(parsed);
10953
- const { score: score2, verdict, hits: hits2, misses: misses2 } = calculateRubricScore(data2, rubrics);
10941
+ const { score: score2, verdict, assertions: assertions2 } = calculateRubricScore(data2, rubrics);
10954
10942
  return {
10955
10943
  score: score2,
10956
10944
  verdict,
10957
- hits: hits2,
10958
- misses: misses2,
10945
+ assertions: assertions2,
10959
10946
  expectedAspectCount: rubrics.length,
10960
- reasoning: data2.overall_reasoning,
10961
10947
  evaluatorRawRequest,
10962
10948
  details
10963
10949
  };
10964
10950
  }
10965
10951
  const data = freeformEvaluationSchema.parse(parsed);
10966
10952
  const score = clampScore(data.score);
10967
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
10968
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
10953
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
10969
10954
  return {
10970
10955
  score,
10971
10956
  verdict: scoreToVerdict(score),
10972
- hits,
10973
- misses,
10974
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
10975
- reasoning: data.reasoning,
10957
+ assertions,
10958
+ expectedAspectCount: Math.max(assertions.length, 1),
10976
10959
  evaluatorRawRequest,
10977
10960
  details
10978
10961
  };
@@ -10980,8 +10963,12 @@ ${outputSchema}`;
10980
10963
  return {
10981
10964
  score: 0,
10982
10965
  verdict: "fail",
10983
- hits: [],
10984
- misses: ["Failed to parse llm-grader agent response as valid evaluation JSON"],
10966
+ assertions: [
10967
+ {
10968
+ text: "Failed to parse llm-grader agent response as valid evaluation JSON",
10969
+ passed: false
10970
+ }
10971
+ ],
10985
10972
  expectedAspectCount: 1,
10986
10973
  evaluatorRawRequest,
10987
10974
  details
@@ -11110,9 +11097,13 @@ function buildOutputSchema() {
11110
11097
  "",
11111
11098
  "{",
11112
11099
  ' "score": <number between 0.0 and 1.0>,',
11113
- ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
11114
- ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
11115
- ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
11100
+ ' "assertions": [',
11101
+ " {",
11102
+ ' "text": "<brief description of what was checked>",',
11103
+ ' "passed": <boolean>,',
11104
+ ' "evidence": "<concise evidence, 1-2 sentences, optional>"',
11105
+ " }",
11106
+ " ]",
11116
11107
  "}"
11117
11108
  ].join("\n");
11118
11109
  }
@@ -11137,8 +11128,7 @@ function substituteVariables(template, variables) {
11137
11128
  }
11138
11129
  function calculateRubricScore(result, rubrics) {
11139
11130
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
11140
- const hits = [];
11141
- const misses = [];
11131
+ const assertions = [];
11142
11132
  let totalWeight = 0;
11143
11133
  let earnedWeight = 0;
11144
11134
  let failedRequired = false;
@@ -11148,19 +11138,20 @@ function calculateRubricScore(result, rubrics) {
11148
11138
  continue;
11149
11139
  }
11150
11140
  totalWeight += rubric.weight;
11141
+ assertions.push({
11142
+ text: `[${rubric.id}] ${rubric.outcome}`,
11143
+ passed: check.satisfied,
11144
+ evidence: check.reasoning
11145
+ });
11151
11146
  if (check.satisfied) {
11152
11147
  earnedWeight += rubric.weight;
11153
- hits.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
11154
- } else {
11155
- misses.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
11156
- if (rubric.required) {
11157
- failedRequired = true;
11158
- }
11148
+ } else if (rubric.required) {
11149
+ failedRequired = true;
11159
11150
  }
11160
11151
  }
11161
11152
  const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
11162
11153
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
11163
- return { score, verdict, hits, misses };
11154
+ return { score, verdict, assertions };
11164
11155
  }
11165
11156
  function buildScoreRangeOutputSchema() {
11166
11157
  return `You are an expert evaluator. Score the candidate answer on each criterion.
@@ -11180,8 +11171,7 @@ Important: The "score" must be an integer from 0 to 10 that falls within one of
11180
11171
  }
11181
11172
  function calculateScoreRangeResult(result, rubrics) {
11182
11173
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
11183
- const hits = [];
11184
- const misses = [];
11174
+ const assertions = [];
11185
11175
  const rawScores = {};
11186
11176
  let totalWeight = 0;
11187
11177
  let weightedScoreSum = 0;
@@ -11207,24 +11197,22 @@ function calculateScoreRangeResult(result, rubrics) {
11207
11197
  );
11208
11198
  const rangeDescription = matchingRange?.outcome ?? "";
11209
11199
  const criterionLabel = rubric.outcome ?? rubric.id;
11210
- const reasoningText = check.reasoning ? `: ${check.reasoning}` : "";
11211
- const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
11200
+ const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
11212
11201
  if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
11213
11202
  failedRequired = true;
11214
- misses.push(scoreInfo);
11215
- } else if (rawScore >= 7) {
11216
- hits.push(scoreInfo);
11217
- } else {
11218
- misses.push(scoreInfo);
11219
11203
  }
11204
+ assertions.push({
11205
+ text: `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})`,
11206
+ passed,
11207
+ evidence: check.reasoning
11208
+ });
11220
11209
  }
11221
11210
  const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
11222
11211
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
11223
11212
  return {
11224
11213
  score,
11225
11214
  verdict,
11226
- hits,
11227
- misses,
11215
+ assertions,
11228
11216
  details: {
11229
11217
  raw_scores: rawScores,
11230
11218
  normalization: "score / 10",
@@ -11400,9 +11388,7 @@ var CompositeEvaluator = class {
11400
11388
  let totalWeight = 0;
11401
11389
  let weightedSum = 0;
11402
11390
  let evaluatedCount = 0;
11403
- const allHits = [];
11404
- const allMisses = [];
11405
- const reasoningParts = [];
11391
+ const allAssertions = [];
11406
11392
  const scores = [];
11407
11393
  for (const member of results) {
11408
11394
  const weight = weights?.[member.id] ?? 1;
@@ -11412,9 +11398,7 @@ var CompositeEvaluator = class {
11412
11398
  score: member.result.score,
11413
11399
  weight,
11414
11400
  verdict: member.result.verdict,
11415
- hits: [...member.result.hits],
11416
- misses: [...member.result.misses],
11417
- reasoning: member.result.reasoning,
11401
+ assertions: [...member.result.assertions],
11418
11402
  evaluatorRawRequest: member.result.evaluatorRawRequest,
11419
11403
  scores: member.result.scores,
11420
11404
  details: member.result.details,
@@ -11426,20 +11410,16 @@ var CompositeEvaluator = class {
11426
11410
  evaluatedCount++;
11427
11411
  totalWeight += weight;
11428
11412
  weightedSum += member.result.score * weight;
11429
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
11430
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
11431
- if (member.result.reasoning) {
11432
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
11433
- }
11413
+ allAssertions.push(
11414
+ ...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
11415
+ );
11434
11416
  }
11435
11417
  if (evaluatedCount === 0 && results.length > 0) {
11436
11418
  return {
11437
11419
  score: 0,
11438
11420
  verdict: "skip",
11439
- hits: [],
11440
- misses: [],
11421
+ assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
11441
11422
  expectedAspectCount: 1,
11442
- reasoning: "All evaluators skipped (infrastructure failure)",
11443
11423
  evaluatorRawRequest: {
11444
11424
  aggregator: "weighted_average",
11445
11425
  ...weights ? { weights } : {}
@@ -11451,10 +11431,8 @@ var CompositeEvaluator = class {
11451
11431
  return {
11452
11432
  score: clampScore(finalScore),
11453
11433
  verdict: scoreToVerdict(finalScore),
11454
- hits: allHits,
11455
- misses: allMisses,
11456
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
11457
- reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
11434
+ assertions: allAssertions,
11435
+ expectedAspectCount: allAssertions.length || 1,
11458
11436
  evaluatorRawRequest: {
11459
11437
  aggregator: "weighted_average",
11460
11438
  ...weights ? { weights } : {}
@@ -11464,11 +11442,8 @@ var CompositeEvaluator = class {
11464
11442
  }
11465
11443
  runThreshold(results, threshold) {
11466
11444
  const scores = [];
11467
- const allHits = [];
11468
- const allMisses = [];
11469
- const reasoningParts = [];
11445
+ const allAssertions = [];
11470
11446
  let passingCount = 0;
11471
- let borderlineCount = 0;
11472
11447
  let evaluatedCount = 0;
11473
11448
  for (const member of results) {
11474
11449
  scores.push({
@@ -11476,9 +11451,7 @@ var CompositeEvaluator = class {
11476
11451
  type: member.type,
11477
11452
  score: member.result.score,
11478
11453
  verdict: member.result.verdict,
11479
- hits: [...member.result.hits],
11480
- misses: [...member.result.misses],
11481
- reasoning: member.result.reasoning,
11454
+ assertions: [...member.result.assertions],
11482
11455
  evaluatorRawRequest: member.result.evaluatorRawRequest,
11483
11456
  scores: member.result.scores,
11484
11457
  details: member.result.details,
@@ -11491,24 +11464,17 @@ var CompositeEvaluator = class {
11491
11464
  const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
11492
11465
  if (isPassing) {
11493
11466
  passingCount++;
11494
- if (member.result.verdict === "borderline") {
11495
- borderlineCount++;
11496
- }
11497
- }
11498
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
11499
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
11500
- if (member.result.reasoning) {
11501
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
11502
11467
  }
11468
+ allAssertions.push(
11469
+ ...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
11470
+ );
11503
11471
  }
11504
11472
  if (evaluatedCount === 0 && results.length > 0) {
11505
11473
  return {
11506
11474
  score: 0,
11507
11475
  verdict: "skip",
11508
- hits: [],
11509
- misses: [],
11476
+ assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
11510
11477
  expectedAspectCount: 1,
11511
- reasoning: "All evaluators skipped (infrastructure failure)",
11512
11478
  evaluatorRawRequest: {
11513
11479
  aggregator: "threshold",
11514
11480
  threshold
@@ -11519,19 +11485,15 @@ var CompositeEvaluator = class {
11519
11485
  const totalCount = evaluatedCount;
11520
11486
  const score = totalCount > 0 ? passingCount / totalCount : 0;
11521
11487
  const pass = score >= threshold;
11522
- if (pass && borderlineCount > 0) {
11523
- reasoningParts.push(`Warning: ${borderlineCount} borderline evaluator(s) counted as passing`);
11524
- }
11525
- reasoningParts.unshift(
11526
- `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`
11527
- );
11488
+ allAssertions.unshift({
11489
+ text: `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`,
11490
+ passed: pass
11491
+ });
11528
11492
  return {
11529
11493
  score: clampScore(score),
11530
11494
  verdict: pass ? "pass" : "fail",
11531
- hits: allHits,
11532
- misses: allMisses,
11533
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
11534
- reasoning: reasoningParts.join("; "),
11495
+ assertions: allAssertions,
11496
+ expectedAspectCount: allAssertions.length || 1,
11535
11497
  evaluatorRawRequest: {
11536
11498
  aggregator: "threshold",
11537
11499
  threshold
@@ -11548,9 +11510,7 @@ var CompositeEvaluator = class {
11548
11510
  score: member.result.score,
11549
11511
  weight: weights?.[member.id] ?? 1,
11550
11512
  verdict: member.result.verdict,
11551
- hits: [...member.result.hits],
11552
- misses: [...member.result.misses],
11553
- reasoning: member.result.reasoning,
11513
+ assertions: [...member.result.assertions],
11554
11514
  evaluatorRawRequest: member.result.evaluatorRawRequest,
11555
11515
  scores: member.result.scores,
11556
11516
  details: member.result.details
@@ -11559,17 +11519,19 @@ var CompositeEvaluator = class {
11559
11519
  const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
11560
11520
  const parsed = parseJsonSafe(stdout);
11561
11521
  const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
11562
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
11563
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
11564
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
11522
+ const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
11523
+ (a) => typeof a === "object" && a !== null && typeof a.text === "string"
11524
+ ).map((a) => ({
11525
+ text: String(a.text),
11526
+ passed: Boolean(a.passed),
11527
+ ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
11528
+ })) : [];
11565
11529
  const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
11566
11530
  return {
11567
11531
  score,
11568
11532
  verdict,
11569
- hits,
11570
- misses,
11571
- expectedAspectCount: hits.length + misses.length || 1,
11572
- reasoning,
11533
+ assertions,
11534
+ expectedAspectCount: assertions.length || 1,
11573
11535
  evaluatorRawRequest: {
11574
11536
  aggregator: "code-grader",
11575
11537
  script: scriptPath
@@ -11581,10 +11543,8 @@ var CompositeEvaluator = class {
11581
11543
  return {
11582
11544
  score: 0,
11583
11545
  verdict: "fail",
11584
- hits: [],
11585
- misses: [`Code aggregator failed: ${message}`],
11546
+ assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
11586
11547
  expectedAspectCount: 1,
11587
- reasoning: message,
11588
11548
  evaluatorRawRequest: {
11589
11549
  aggregator: "code-grader",
11590
11550
  script: scriptPath,
@@ -11606,9 +11566,7 @@ var CompositeEvaluator = class {
11606
11566
  type: member.type,
11607
11567
  score: member.result.score,
11608
11568
  verdict: member.result.verdict,
11609
- hits: [...member.result.hits],
11610
- misses: [...member.result.misses],
11611
- reasoning: member.result.reasoning,
11569
+ assertions: [...member.result.assertions],
11612
11570
  evaluatorRawRequest: member.result.evaluatorRawRequest,
11613
11571
  scores: member.result.scores,
11614
11572
  details: member.result.details
@@ -11632,16 +11590,12 @@ var CompositeEvaluator = class {
11632
11590
  });
11633
11591
  const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
11634
11592
  const score2 = clampScore(data2.score);
11635
- const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
11636
- const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
11637
- const reasoning2 = data2.reasoning;
11593
+ const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
11638
11594
  return {
11639
11595
  score: score2,
11640
11596
  verdict: scoreToVerdict(score2),
11641
- hits: hits2,
11642
- misses: misses2,
11643
- expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
11644
- reasoning: reasoning2,
11597
+ assertions: assertions2,
11598
+ expectedAspectCount: Math.max(assertions2.length, 1),
11645
11599
  evaluatorRawRequest,
11646
11600
  scores
11647
11601
  };
@@ -11656,16 +11610,12 @@ var CompositeEvaluator = class {
11656
11610
  parseJsonFromText(extractLastAssistantContent(response.output))
11657
11611
  );
11658
11612
  const score = clampScore(data.score);
11659
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
11660
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
11661
- const reasoning = data.reasoning;
11613
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
11662
11614
  return {
11663
11615
  score,
11664
11616
  verdict: scoreToVerdict(score),
11665
- hits,
11666
- misses,
11667
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
11668
- reasoning,
11617
+ assertions,
11618
+ expectedAspectCount: Math.max(assertions.length, 1),
11669
11619
  evaluatorRawRequest,
11670
11620
  scores
11671
11621
  };
@@ -11673,8 +11623,7 @@ var CompositeEvaluator = class {
11673
11623
  return {
11674
11624
  score: 0,
11675
11625
  verdict: "fail",
11676
- hits: [],
11677
- misses: [],
11626
+ assertions: [{ text: "LLM aggregator failed", passed: false }],
11678
11627
  expectedAspectCount: 1,
11679
11628
  evaluatorRawRequest,
11680
11629
  scores
@@ -11697,10 +11646,8 @@ var CostEvaluator = class {
11697
11646
  return {
11698
11647
  score: 0,
11699
11648
  verdict: "fail",
11700
- hits: [],
11701
- misses: ["No cost data available in trace"],
11649
+ assertions: [{ text: "No cost data available in trace", passed: false }],
11702
11650
  expectedAspectCount: 1,
11703
- reasoning: "Execution cost not reported by provider",
11704
11651
  evaluatorRawRequest: {
11705
11652
  type: "cost",
11706
11653
  budget,
@@ -11714,10 +11661,10 @@ var CostEvaluator = class {
11714
11661
  return {
11715
11662
  score,
11716
11663
  verdict: passed ? "pass" : "fail",
11717
- hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
11718
- misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
11664
+ assertions: [
11665
+ passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
11666
+ ],
11719
11667
  expectedAspectCount: 1,
11720
- reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
11721
11668
  evaluatorRawRequest: {
11722
11669
  type: "cost",
11723
11670
  budget,
@@ -11750,10 +11697,8 @@ var ExecutionMetricsEvaluator = class {
11750
11697
  return {
11751
11698
  score: 0,
11752
11699
  verdict: "fail",
11753
- hits: [],
11754
- misses: ["No trace summary available"],
11700
+ assertions: [{ text: "No trace summary available", passed: false }],
11755
11701
  expectedAspectCount: 1,
11756
- reasoning: "Execution metrics not available - no trace summary provided",
11757
11702
  evaluatorRawRequest: {
11758
11703
  type: "execution-metrics",
11759
11704
  config: this.extractConfiguredThresholds(),
@@ -11762,116 +11707,114 @@ var ExecutionMetricsEvaluator = class {
11762
11707
  };
11763
11708
  }
11764
11709
  const narrowedTrace = trace;
11765
- const hits = [];
11766
- const misses = [];
11710
+ const assertions = [];
11767
11711
  const actualMetrics = {};
11768
11712
  if (max_tool_calls !== void 0 && narrowedTrace) {
11769
11713
  const toolCalls = narrowedTrace.eventCount;
11770
11714
  actualMetrics.tool_calls = toolCalls;
11771
11715
  if (toolCalls <= max_tool_calls) {
11772
- hits.push(`Tool calls ${toolCalls} <= ${max_tool_calls} max`);
11716
+ assertions.push({ text: `Tool calls ${toolCalls} <= ${max_tool_calls} max`, passed: true });
11773
11717
  } else {
11774
- misses.push(`Tool calls ${toolCalls} > ${max_tool_calls} max`);
11718
+ assertions.push({ text: `Tool calls ${toolCalls} > ${max_tool_calls} max`, passed: false });
11775
11719
  }
11776
11720
  }
11777
11721
  if (max_llm_calls !== void 0 && narrowedTrace) {
11778
11722
  const llmCalls = narrowedTrace.llmCallCount;
11779
11723
  if (llmCalls === void 0) {
11780
- misses.push("LLM call count data not available");
11724
+ assertions.push({ text: "LLM call count data not available", passed: false });
11781
11725
  } else {
11782
11726
  actualMetrics.llm_calls = llmCalls;
11783
11727
  if (llmCalls <= max_llm_calls) {
11784
- hits.push(`LLM calls ${llmCalls} <= ${max_llm_calls} max`);
11728
+ assertions.push({ text: `LLM calls ${llmCalls} <= ${max_llm_calls} max`, passed: true });
11785
11729
  } else {
11786
- misses.push(`LLM calls ${llmCalls} > ${max_llm_calls} max`);
11730
+ assertions.push({ text: `LLM calls ${llmCalls} > ${max_llm_calls} max`, passed: false });
11787
11731
  }
11788
11732
  }
11789
11733
  }
11790
11734
  if (max_tokens !== void 0) {
11791
11735
  if (!tokenUsage) {
11792
- misses.push("Token usage data not available");
11736
+ assertions.push({ text: "Token usage data not available", passed: false });
11793
11737
  } else {
11794
11738
  const totalTokens = tokenUsage.input + tokenUsage.output;
11795
11739
  actualMetrics.tokens = totalTokens;
11796
11740
  if (totalTokens <= max_tokens) {
11797
- hits.push(`Total tokens ${totalTokens} <= ${max_tokens} max`);
11741
+ assertions.push({
11742
+ text: `Total tokens ${totalTokens} <= ${max_tokens} max`,
11743
+ passed: true
11744
+ });
11798
11745
  } else {
11799
- misses.push(`Total tokens ${totalTokens} > ${max_tokens} max`);
11746
+ assertions.push({
11747
+ text: `Total tokens ${totalTokens} > ${max_tokens} max`,
11748
+ passed: false
11749
+ });
11800
11750
  }
11801
11751
  }
11802
11752
  }
11803
11753
  if (max_cost_usd !== void 0) {
11804
11754
  if (costUsd === void 0) {
11805
- misses.push("Cost data not available");
11755
+ assertions.push({ text: "Cost data not available", passed: false });
11806
11756
  } else {
11807
11757
  actualMetrics.cost_usd = costUsd;
11808
11758
  const formatCost = (n) => `$${n.toFixed(4)}`;
11809
11759
  if (costUsd <= max_cost_usd) {
11810
- hits.push(`Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`);
11760
+ assertions.push({
11761
+ text: `Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`,
11762
+ passed: true
11763
+ });
11811
11764
  } else {
11812
- misses.push(`Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`);
11765
+ assertions.push({
11766
+ text: `Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`,
11767
+ passed: false
11768
+ });
11813
11769
  }
11814
11770
  }
11815
11771
  }
11816
11772
  if (max_duration_ms !== void 0) {
11817
11773
  if (durationMs === void 0) {
11818
- misses.push("Duration data not available");
11774
+ assertions.push({ text: "Duration data not available", passed: false });
11819
11775
  } else {
11820
11776
  actualMetrics.duration_ms = durationMs;
11821
11777
  if (durationMs <= max_duration_ms) {
11822
- hits.push(`Duration ${durationMs}ms <= ${max_duration_ms}ms max`);
11778
+ assertions.push({
11779
+ text: `Duration ${durationMs}ms <= ${max_duration_ms}ms max`,
11780
+ passed: true
11781
+ });
11823
11782
  } else {
11824
- misses.push(`Duration ${durationMs}ms > ${max_duration_ms}ms max`);
11783
+ assertions.push({
11784
+ text: `Duration ${durationMs}ms > ${max_duration_ms}ms max`,
11785
+ passed: false
11786
+ });
11825
11787
  }
11826
11788
  }
11827
11789
  }
11828
11790
  if (target_exploration_ratio !== void 0 && narrowedTrace) {
11829
11791
  const ratio = explorationRatio(narrowedTrace);
11830
11792
  if (ratio === void 0) {
11831
- misses.push("Exploration ratio not available (no tool calls)");
11793
+ assertions.push({ text: "Exploration ratio not available (no tool calls)", passed: false });
11832
11794
  } else {
11833
11795
  actualMetrics.exploration_ratio = ratio;
11834
11796
  const diff = Math.abs(ratio - target_exploration_ratio);
11835
11797
  if (diff <= exploration_tolerance) {
11836
- hits.push(
11837
- `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`
11838
- );
11798
+ assertions.push({
11799
+ text: `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`,
11800
+ passed: true
11801
+ });
11839
11802
  } else {
11840
- misses.push(
11841
- `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`
11842
- );
11803
+ assertions.push({
11804
+ text: `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`,
11805
+ passed: false
11806
+ });
11843
11807
  }
11844
11808
  }
11845
11809
  }
11846
- const totalChecks = hits.length + misses.length;
11847
- const score = totalChecks > 0 ? hits.length / totalChecks : 0;
11848
- const reasoningParts = [];
11849
- if (actualMetrics.tool_calls !== void 0) {
11850
- reasoningParts.push(`tool_calls=${actualMetrics.tool_calls}`);
11851
- }
11852
- if (actualMetrics.llm_calls !== void 0) {
11853
- reasoningParts.push(`llm_calls=${actualMetrics.llm_calls}`);
11854
- }
11855
- if (actualMetrics.tokens !== void 0) {
11856
- reasoningParts.push(`tokens=${actualMetrics.tokens}`);
11857
- }
11858
- if (actualMetrics.cost_usd !== void 0) {
11859
- reasoningParts.push(`cost=$${actualMetrics.cost_usd.toFixed(4)}`);
11860
- }
11861
- if (actualMetrics.duration_ms !== void 0) {
11862
- reasoningParts.push(`duration=${actualMetrics.duration_ms}ms`);
11863
- }
11864
- if (actualMetrics.exploration_ratio !== void 0) {
11865
- reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
11866
- }
11867
- const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
11810
+ const totalChecks = assertions.length;
11811
+ const passedCount = assertions.filter((a) => a.passed).length;
11812
+ const score = totalChecks > 0 ? passedCount / totalChecks : 0;
11868
11813
  return {
11869
11814
  score,
11870
11815
  verdict: scoreToVerdict(score),
11871
- hits,
11872
- misses,
11816
+ assertions,
11873
11817
  expectedAspectCount: totalChecks || 1,
11874
- reasoning,
11875
11818
  evaluatorRawRequest: {
11876
11819
  type: "execution-metrics",
11877
11820
  config: this.extractConfiguredThresholds(),
@@ -11975,10 +11918,8 @@ var FieldAccuracyEvaluator = class {
11975
11918
  return {
11976
11919
  score: 0,
11977
11920
  verdict: "fail",
11978
- hits: [],
11979
- misses: ["Failed to parse candidate answer as JSON"],
11980
- expectedAspectCount: this.config.fields.length,
11981
- reasoning: "Candidate answer is not valid JSON"
11921
+ assertions: [{ text: "Failed to parse candidate answer as JSON", passed: false }],
11922
+ expectedAspectCount: this.config.fields.length
11982
11923
  };
11983
11924
  }
11984
11925
  const expectedData = this.extractExpectedData(evalCase.expected_output);
@@ -11986,10 +11927,8 @@ var FieldAccuracyEvaluator = class {
11986
11927
  return {
11987
11928
  score: 0,
11988
11929
  verdict: "fail",
11989
- hits: [],
11990
- misses: ["No expected data found in expected_output"],
11991
- expectedAspectCount: this.config.fields.length,
11992
- reasoning: "Could not extract expected data from expected_output"
11930
+ assertions: [{ text: "No expected data found in expected_output", passed: false }],
11931
+ expectedAspectCount: this.config.fields.length
11993
11932
  };
11994
11933
  }
11995
11934
  const fieldResults = [];
@@ -12207,18 +12146,14 @@ var FieldAccuracyEvaluator = class {
12207
12146
  */
12208
12147
  aggregateResults(results) {
12209
12148
  const aggregation = this.config.aggregation ?? "weighted_average";
12210
- const hits = [];
12211
- const misses = [];
12149
+ const assertions = [];
12212
12150
  for (const result of results) {
12213
- if (result.hit) {
12214
- hits.push(result.message);
12215
- } else {
12216
- misses.push(result.message);
12217
- }
12151
+ assertions.push({ text: result.message, passed: result.hit });
12218
12152
  }
12219
12153
  let score;
12220
12154
  if (aggregation === "all_or_nothing") {
12221
- score = misses.length === 0 ? 1 : 0;
12155
+ const hasFailed = assertions.some((a) => !a.passed);
12156
+ score = hasFailed ? 0 : 1;
12222
12157
  } else {
12223
12158
  const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
12224
12159
  if (totalWeight === 0) {
@@ -12228,15 +12163,11 @@ var FieldAccuracyEvaluator = class {
12228
12163
  score = weightedSum / totalWeight;
12229
12164
  }
12230
12165
  }
12231
- const reasoning = `${hits.length}/${results.length} fields matched`;
12232
12166
  return {
12233
12167
  score: clampScore(score),
12234
12168
  verdict: scoreToVerdict(score),
12235
- hits: hits.slice(0, 4),
12236
- // Cap at 4 to keep output concise
12237
- misses: misses.slice(0, 4),
12238
- expectedAspectCount: results.length,
12239
- reasoning
12169
+ assertions,
12170
+ expectedAspectCount: results.length
12240
12171
  };
12241
12172
  }
12242
12173
  };
@@ -12345,10 +12276,8 @@ var LatencyEvaluator = class {
12345
12276
  return {
12346
12277
  score: 0,
12347
12278
  verdict: "fail",
12348
- hits: [],
12349
- misses: ["No duration data available in trace"],
12279
+ assertions: [{ text: "No duration data available in trace", passed: false }],
12350
12280
  expectedAspectCount: 1,
12351
- reasoning: "Execution duration not reported by provider",
12352
12281
  evaluatorRawRequest: {
12353
12282
  type: "latency",
12354
12283
  threshold,
@@ -12361,10 +12290,10 @@ var LatencyEvaluator = class {
12361
12290
  return {
12362
12291
  score,
12363
12292
  verdict: passed ? "pass" : "fail",
12364
- hits: passed ? [`Duration ${durationMs}ms <= ${threshold}ms threshold`] : [],
12365
- misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
12293
+ assertions: [
12294
+ passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
12295
+ ],
12366
12296
  expectedAspectCount: 1,
12367
- reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
12368
12297
  evaluatorRawRequest: {
12369
12298
  type: "latency",
12370
12299
  threshold,
@@ -12385,7 +12314,10 @@ var COPILOT_MATCHER = {
12385
12314
  skillTools: ["Skill", "skill"],
12386
12315
  skillInputField: "skill",
12387
12316
  readTools: ["Read File", "readFile", "Read", "readTextFile"],
12388
- readInputField: "file_path"
12317
+ readInputField: "file_path",
12318
+ skillToolPrefixes: ["Using skill: "],
12319
+ readToolPrefixes: ["Viewing "],
12320
+ readInputFields: ["file_path", "path"]
12389
12321
  };
12390
12322
  var PROVIDER_TOOL_SEMANTICS = {
12391
12323
  claude: CLAUDE_MATCHER,
@@ -12427,12 +12359,22 @@ var SkillTriggerEvaluator = class {
12427
12359
  triggered = true;
12428
12360
  evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
12429
12361
  }
12362
+ } else if (matcher.skillToolPrefixes?.some(
12363
+ (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
12364
+ )) {
12365
+ triggered = true;
12366
+ evidence = `Skill tool invoked via tool name "${firstTool.tool}"`;
12430
12367
  } else if (matcher.readTools.includes(firstTool.tool)) {
12431
- const filePath = String(input[matcher.readInputField] ?? "");
12368
+ const filePath = this.readPathFromInput(input, matcher);
12432
12369
  if (filePath.includes(skillName)) {
12433
12370
  triggered = true;
12434
12371
  evidence = `Read tool loaded skill file: ${filePath}`;
12435
12372
  }
12373
+ } else if (matcher.readToolPrefixes?.some(
12374
+ (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
12375
+ )) {
12376
+ triggered = true;
12377
+ evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`;
12436
12378
  }
12437
12379
  }
12438
12380
  const pass = triggered === shouldTrigger;
@@ -12440,25 +12382,37 @@ var SkillTriggerEvaluator = class {
12440
12382
  return {
12441
12383
  score: 1,
12442
12384
  verdict: "pass",
12443
- hits: [
12444
- shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`
12385
+ assertions: [
12386
+ {
12387
+ text: shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`,
12388
+ passed: true
12389
+ }
12445
12390
  ],
12446
- misses: [],
12447
- expectedAspectCount: 1,
12448
- reasoning: shouldTrigger ? "Skill triggered correctly" : "No false trigger"
12391
+ expectedAspectCount: 1
12449
12392
  };
12450
12393
  }
12451
12394
  return {
12452
12395
  score: 0,
12453
12396
  verdict: "fail",
12454
- hits: [],
12455
- misses: [
12456
- shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`
12397
+ assertions: [
12398
+ {
12399
+ text: shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`,
12400
+ passed: false
12401
+ }
12457
12402
  ],
12458
- expectedAspectCount: 1,
12459
- reasoning: shouldTrigger ? `Skill "${skillName}" was not triggered` : "False trigger: skill fired when it should not have"
12403
+ expectedAspectCount: 1
12460
12404
  };
12461
12405
  }
12406
+ readPathFromInput(input, matcher) {
12407
+ const fields = matcher.readInputFields ?? [matcher.readInputField];
12408
+ for (const field of fields) {
12409
+ const value = input[field];
12410
+ if (value !== void 0 && value !== null) {
12411
+ return String(value);
12412
+ }
12413
+ }
12414
+ return "";
12415
+ }
12462
12416
  };
12463
12417
 
12464
12418
  // src/evaluation/evaluators/llm-grader-prompt.ts
@@ -12493,12 +12447,8 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
12493
12447
  [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2),
12494
12448
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
12495
12449
  [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
12496
- [TEMPLATE_VARIABLES.ANSWER]: candidate.trim(),
12497
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (evalCase.reference_answer ?? "").trim(),
12498
12450
  [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
12499
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
12500
12451
  [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
12501
- // Text convenience accessors (new names, always strings)
12502
12452
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
12503
12453
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
12504
12454
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
@@ -12625,10 +12575,8 @@ var TokenUsageEvaluator = class {
12625
12575
  return {
12626
12576
  score: 0,
12627
12577
  verdict: "fail",
12628
- hits: [],
12629
- misses: ["No token usage data available in trace"],
12578
+ assertions: [{ text: "No token usage data available in trace", passed: false }],
12630
12579
  expectedAspectCount,
12631
- reasoning: "Token usage not reported by provider",
12632
12580
  evaluatorRawRequest: {
12633
12581
  type: "token-usage",
12634
12582
  max_total: maxTotal ?? null,
@@ -12642,37 +12590,34 @@ var TokenUsageEvaluator = class {
12642
12590
  const output = usage.output;
12643
12591
  const cached = usage.cached ?? 0;
12644
12592
  const total = input + output + cached;
12645
- const hits = [];
12646
- const misses = [];
12593
+ const assertions = [];
12647
12594
  if (typeof maxInput === "number") {
12648
12595
  if (input <= maxInput) {
12649
- hits.push(`Input tokens ${input} <= ${maxInput}`);
12596
+ assertions.push({ text: `Input tokens ${input} <= ${maxInput}`, passed: true });
12650
12597
  } else {
12651
- misses.push(`Input tokens ${input} > ${maxInput}`);
12598
+ assertions.push({ text: `Input tokens ${input} > ${maxInput}`, passed: false });
12652
12599
  }
12653
12600
  }
12654
12601
  if (typeof maxOutput === "number") {
12655
12602
  if (output <= maxOutput) {
12656
- hits.push(`Output tokens ${output} <= ${maxOutput}`);
12603
+ assertions.push({ text: `Output tokens ${output} <= ${maxOutput}`, passed: true });
12657
12604
  } else {
12658
- misses.push(`Output tokens ${output} > ${maxOutput}`);
12605
+ assertions.push({ text: `Output tokens ${output} > ${maxOutput}`, passed: false });
12659
12606
  }
12660
12607
  }
12661
12608
  if (typeof maxTotal === "number") {
12662
12609
  if (total <= maxTotal) {
12663
- hits.push(`Total tokens ${total} <= ${maxTotal}`);
12610
+ assertions.push({ text: `Total tokens ${total} <= ${maxTotal}`, passed: true });
12664
12611
  } else {
12665
- misses.push(`Total tokens ${total} > ${maxTotal}`);
12612
+ assertions.push({ text: `Total tokens ${total} > ${maxTotal}`, passed: false });
12666
12613
  }
12667
12614
  }
12668
- const passed = misses.length === 0;
12615
+ const passed = assertions.every((a) => a.passed);
12669
12616
  return {
12670
12617
  score: passed ? 1 : 0,
12671
12618
  verdict: passed ? "pass" : "fail",
12672
- hits,
12673
- misses,
12619
+ assertions,
12674
12620
  expectedAspectCount,
12675
- reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
12676
12621
  evaluatorRawRequest: {
12677
12622
  type: "token-usage",
12678
12623
  max_total: maxTotal ?? null,
@@ -12772,8 +12717,7 @@ var ToolTrajectoryEvaluator = class {
12772
12717
  return {
12773
12718
  score: 0,
12774
12719
  verdict: "fail",
12775
- hits: [],
12776
- misses: ["No trace available for evaluation"],
12720
+ assertions: [{ text: "No trace available for evaluation", passed: false }],
12777
12721
  expectedAspectCount: 1
12778
12722
  };
12779
12723
  }
@@ -12784,8 +12728,7 @@ var ToolTrajectoryEvaluator = class {
12784
12728
  return {
12785
12729
  score: 0,
12786
12730
  verdict: "fail",
12787
- hits: [],
12788
- misses: ["No trace available for evaluation"],
12731
+ assertions: [{ text: "No trace available for evaluation", passed: false }],
12789
12732
  expectedAspectCount: 1
12790
12733
  };
12791
12734
  }
@@ -12803,8 +12746,7 @@ var ToolTrajectoryEvaluator = class {
12803
12746
  return {
12804
12747
  score: 0,
12805
12748
  verdict: "fail",
12806
- hits: [],
12807
- misses: [`Unknown mode: ${this.config.mode}`],
12749
+ assertions: [{ text: `Unknown mode: ${this.config.mode}`, passed: false }],
12808
12750
  expectedAspectCount: 1
12809
12751
  };
12810
12752
  }
@@ -12853,28 +12795,32 @@ var ToolTrajectoryEvaluator = class {
12853
12795
  return {
12854
12796
  score: 1,
12855
12797
  verdict: "pass",
12856
- hits: ["No tool requirements specified"],
12857
- misses: [],
12798
+ assertions: [{ text: "No tool requirements specified", passed: true }],
12858
12799
  expectedAspectCount: 0
12859
12800
  };
12860
12801
  }
12861
- const hits = [];
12862
- const misses = [];
12802
+ const assertions = [];
12863
12803
  for (const toolName of toolNames) {
12864
12804
  const required = minimums[toolName];
12865
12805
  const actual = summary.toolCallsByName[toolName] ?? 0;
12866
12806
  if (actual >= required) {
12867
- hits.push(`${toolName}: called ${actual} times (required >=${required})`);
12807
+ assertions.push({
12808
+ text: `${toolName}: called ${actual} times (required >=${required})`,
12809
+ passed: true
12810
+ });
12868
12811
  } else {
12869
- misses.push(`${toolName}: called ${actual} times (required >=${required})`);
12812
+ assertions.push({
12813
+ text: `${toolName}: called ${actual} times (required >=${required})`,
12814
+ passed: false
12815
+ });
12870
12816
  }
12871
12817
  }
12872
- const score = hits.length / toolNames.length;
12818
+ const passedCount = assertions.filter((a) => a.passed).length;
12819
+ const score = passedCount / toolNames.length;
12873
12820
  return {
12874
12821
  score,
12875
12822
  verdict: scoreToVerdict(score),
12876
- hits,
12877
- misses,
12823
+ assertions,
12878
12824
  expectedAspectCount: toolNames.length
12879
12825
  };
12880
12826
  }
@@ -12884,13 +12830,11 @@ var ToolTrajectoryEvaluator = class {
12884
12830
  return {
12885
12831
  score: 1,
12886
12832
  verdict: "pass",
12887
- hits: ["No tool sequence specified"],
12888
- misses: [],
12833
+ assertions: [{ text: "No tool sequence specified", passed: true }],
12889
12834
  expectedAspectCount: 0
12890
12835
  };
12891
12836
  }
12892
- const hits = [];
12893
- const misses = [];
12837
+ const assertions = [];
12894
12838
  const warnings = [];
12895
12839
  let actualIndex = 0;
12896
12840
  let sequenceHits = 0;
@@ -12910,16 +12854,20 @@ var ToolTrajectoryEvaluator = class {
12910
12854
  const actualCall = toolCalls[actualIndex];
12911
12855
  if (actualCall.name === expectedTool) {
12912
12856
  if (argsMatch(expectedItem.args, actualCall.args, mode)) {
12913
- hits.push(`Found ${expectedTool} at position ${actualIndex}`);
12857
+ assertions.push({
12858
+ text: `Found ${expectedTool} at position ${actualIndex}`,
12859
+ passed: true
12860
+ });
12914
12861
  sequenceHits++;
12915
12862
  matchedCall = actualCall;
12916
12863
  actualIndex++;
12917
12864
  found = true;
12918
12865
  break;
12919
12866
  }
12920
- misses.push(
12921
- `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
12922
- );
12867
+ assertions.push({
12868
+ text: `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`,
12869
+ passed: false
12870
+ });
12923
12871
  actualIndex++;
12924
12872
  argsMismatch = true;
12925
12873
  break;
@@ -12927,7 +12875,10 @@ var ToolTrajectoryEvaluator = class {
12927
12875
  actualIndex++;
12928
12876
  }
12929
12877
  if (!found && !argsMismatch) {
12930
- misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
12878
+ assertions.push({
12879
+ text: `Expected ${expectedTool} at position ${i}, not found in remaining trace`,
12880
+ passed: false
12881
+ });
12931
12882
  }
12932
12883
  if (found && matchedCall) {
12933
12884
  const latencyResult = checkLatency(
@@ -12936,10 +12887,10 @@ var ToolTrajectoryEvaluator = class {
12936
12887
  matchedCall.durationMs
12937
12888
  );
12938
12889
  if (latencyResult.status === "pass") {
12939
- hits.push(latencyResult.message);
12890
+ assertions.push({ text: latencyResult.message, passed: true });
12940
12891
  latencyHits++;
12941
12892
  } else if (latencyResult.status === "fail") {
12942
- misses.push(latencyResult.message);
12893
+ assertions.push({ text: latencyResult.message, passed: false });
12943
12894
  } else if (latencyResult.message) {
12944
12895
  warnings.push(latencyResult.message);
12945
12896
  latencySkips++;
@@ -12955,8 +12906,7 @@ var ToolTrajectoryEvaluator = class {
12955
12906
  return {
12956
12907
  score,
12957
12908
  verdict: scoreToVerdict(score),
12958
- hits,
12959
- misses,
12909
+ assertions,
12960
12910
  expectedAspectCount: totalAssertions
12961
12911
  };
12962
12912
  }
@@ -12966,13 +12916,11 @@ var ToolTrajectoryEvaluator = class {
12966
12916
  return {
12967
12917
  score: 1,
12968
12918
  verdict: "pass",
12969
- hits: ["No tool sequence specified"],
12970
- misses: [],
12919
+ assertions: [{ text: "No tool sequence specified", passed: true }],
12971
12920
  expectedAspectCount: 0
12972
12921
  };
12973
12922
  }
12974
- const hits = [];
12975
- const misses = [];
12923
+ const assertions = [];
12976
12924
  const warnings = [];
12977
12925
  let sequenceHits = 0;
12978
12926
  let latencyHits = 0;
@@ -12981,7 +12929,10 @@ var ToolTrajectoryEvaluator = class {
12981
12929
  (item) => item.maxDurationMs !== void 0
12982
12930
  ).length;
12983
12931
  if (toolCalls.length !== expected.length) {
12984
- misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
12932
+ assertions.push({
12933
+ text: `Expected ${expected.length} tool calls, got ${toolCalls.length}`,
12934
+ passed: false
12935
+ });
12985
12936
  }
12986
12937
  const checkLength = Math.min(expected.length, toolCalls.length);
12987
12938
  for (let i = 0; i < checkLength; i++) {
@@ -12993,14 +12944,17 @@ var ToolTrajectoryEvaluator = class {
12993
12944
  let sequenceMatched = false;
12994
12945
  if (actualTool === expectedTool) {
12995
12946
  if (argsMatch(expectedItem.args, actualCall.args, mode)) {
12996
- hits.push(`Position ${i}: ${expectedTool}`);
12947
+ assertions.push({ text: `Position ${i}: ${expectedTool}`, passed: true });
12997
12948
  sequenceHits++;
12998
12949
  sequenceMatched = true;
12999
12950
  } else {
13000
- misses.push(`Position ${i}: ${expectedTool} args mismatch`);
12951
+ assertions.push({ text: `Position ${i}: ${expectedTool} args mismatch`, passed: false });
13001
12952
  }
13002
12953
  } else {
13003
- misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
12954
+ assertions.push({
12955
+ text: `Position ${i}: expected ${expectedTool}, got ${actualTool}`,
12956
+ passed: false
12957
+ });
13004
12958
  }
13005
12959
  if (sequenceMatched) {
13006
12960
  const latencyResult = checkLatency(
@@ -13009,10 +12963,10 @@ var ToolTrajectoryEvaluator = class {
13009
12963
  actualCall.durationMs
13010
12964
  );
13011
12965
  if (latencyResult.status === "pass") {
13012
- hits.push(latencyResult.message);
12966
+ assertions.push({ text: latencyResult.message, passed: true });
13013
12967
  latencyHits++;
13014
12968
  } else if (latencyResult.status === "fail") {
13015
- misses.push(latencyResult.message);
12969
+ assertions.push({ text: latencyResult.message, passed: false });
13016
12970
  } else if (latencyResult.message) {
13017
12971
  warnings.push(latencyResult.message);
13018
12972
  latencySkips++;
@@ -13020,7 +12974,10 @@ var ToolTrajectoryEvaluator = class {
13020
12974
  }
13021
12975
  }
13022
12976
  for (let i = checkLength; i < expected.length; i++) {
13023
- misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
12977
+ assertions.push({
12978
+ text: `Position ${i}: expected ${expected[i].tool}, got nothing`,
12979
+ passed: false
12980
+ });
13024
12981
  }
13025
12982
  for (const warning of warnings) {
13026
12983
  console.warn(`[tool-trajectory] ${warning}`);
@@ -13031,8 +12988,7 @@ var ToolTrajectoryEvaluator = class {
13031
12988
  return {
13032
12989
  score,
13033
12990
  verdict: scoreToVerdict(score),
13034
- hits,
13035
- misses,
12991
+ assertions,
13036
12992
  expectedAspectCount: totalAssertions
13037
12993
  };
13038
12994
  }
@@ -13047,13 +13003,11 @@ var ToolTrajectoryEvaluator = class {
13047
13003
  return {
13048
13004
  score: 1,
13049
13005
  verdict: "pass",
13050
- hits: ["No expected tools specified"],
13051
- misses: [],
13006
+ assertions: [{ text: "No expected tools specified", passed: true }],
13052
13007
  expectedAspectCount: 0
13053
13008
  };
13054
13009
  }
13055
- const hits = [];
13056
- const misses = [];
13010
+ const assertions = [];
13057
13011
  const consumed = /* @__PURE__ */ new Set();
13058
13012
  for (let i = 0; i < expected.length; i++) {
13059
13013
  const expectedItem = expected[i];
@@ -13064,22 +13018,25 @@ var ToolTrajectoryEvaluator = class {
13064
13018
  if (consumed.has(j)) continue;
13065
13019
  const actualCall = toolCalls[j];
13066
13020
  if (actualCall.name === expectedTool && argsMatch(expectedItem.args, actualCall.args, mode)) {
13067
- hits.push(`Found ${expectedTool} at position ${j}`);
13021
+ assertions.push({ text: `Found ${expectedTool} at position ${j}`, passed: true });
13068
13022
  consumed.add(j);
13069
13023
  found = true;
13070
13024
  break;
13071
13025
  }
13072
13026
  }
13073
13027
  if (!found) {
13074
- misses.push(`Expected ${expectedTool} not found in actual trajectory`);
13028
+ assertions.push({
13029
+ text: `Expected ${expectedTool} not found in actual trajectory`,
13030
+ passed: false
13031
+ });
13075
13032
  }
13076
13033
  }
13077
- const score = expected.length > 0 ? hits.length / expected.length : 1;
13034
+ const passedCount = assertions.filter((a) => a.passed).length;
13035
+ const score = expected.length > 0 ? passedCount / expected.length : 1;
13078
13036
  return {
13079
13037
  score,
13080
13038
  verdict: scoreToVerdict(score),
13081
- hits,
13082
- misses,
13039
+ assertions,
13083
13040
  expectedAspectCount: expected.length
13084
13041
  };
13085
13042
  }
@@ -13095,16 +13052,19 @@ var ToolTrajectoryEvaluator = class {
13095
13052
  return {
13096
13053
  score: 1,
13097
13054
  verdict: "pass",
13098
- hits: ["No tool calls and no expected tools"],
13099
- misses: [],
13055
+ assertions: [{ text: "No tool calls and no expected tools", passed: true }],
13100
13056
  expectedAspectCount: 0
13101
13057
  };
13102
13058
  }
13103
13059
  return {
13104
13060
  score: 0,
13105
13061
  verdict: "fail",
13106
- hits: [],
13107
- misses: [`${toolCalls.length} unexpected tool call(s) with empty allowed list`],
13062
+ assertions: [
13063
+ {
13064
+ text: `${toolCalls.length} unexpected tool call(s) with empty allowed list`,
13065
+ passed: false
13066
+ }
13067
+ ],
13108
13068
  expectedAspectCount: toolCalls.length
13109
13069
  };
13110
13070
  }
@@ -13112,13 +13072,11 @@ var ToolTrajectoryEvaluator = class {
13112
13072
  return {
13113
13073
  score: 1,
13114
13074
  verdict: "pass",
13115
- hits: ["No actual tool calls (trivially a subset)"],
13116
- misses: [],
13075
+ assertions: [{ text: "No actual tool calls (trivially a subset)", passed: true }],
13117
13076
  expectedAspectCount: 0
13118
13077
  };
13119
13078
  }
13120
- const hits = [];
13121
- const misses = [];
13079
+ const assertions = [];
13122
13080
  for (let i = 0; i < toolCalls.length; i++) {
13123
13081
  const actualCall = toolCalls[i];
13124
13082
  let allowed = false;
@@ -13130,17 +13088,23 @@ var ToolTrajectoryEvaluator = class {
13130
13088
  }
13131
13089
  }
13132
13090
  if (allowed) {
13133
- hits.push(`Position ${i}: ${actualCall.name} is in allowed set`);
13091
+ assertions.push({
13092
+ text: `Position ${i}: ${actualCall.name} is in allowed set`,
13093
+ passed: true
13094
+ });
13134
13095
  } else {
13135
- misses.push(`Position ${i}: ${actualCall.name} is not in allowed set`);
13096
+ assertions.push({
13097
+ text: `Position ${i}: ${actualCall.name} is not in allowed set`,
13098
+ passed: false
13099
+ });
13136
13100
  }
13137
13101
  }
13138
- const score = toolCalls.length > 0 ? hits.length / toolCalls.length : 1;
13102
+ const passedCount = assertions.filter((a) => a.passed).length;
13103
+ const score = toolCalls.length > 0 ? passedCount / toolCalls.length : 1;
13139
13104
  return {
13140
13105
  score,
13141
13106
  verdict: scoreToVerdict(score),
13142
- hits,
13143
- misses,
13107
+ assertions,
13144
13108
  expectedAspectCount: toolCalls.length
13145
13109
  };
13146
13110
  }
@@ -13151,8 +13115,12 @@ function runContainsAssertion(output, value) {
13151
13115
  const passed = output.includes(value);
13152
13116
  return {
13153
13117
  score: passed ? 1 : 0,
13154
- hits: passed ? [`Output contains "${value}"`] : [],
13155
- misses: passed ? [] : [`Output does not contain "${value}"`]
13118
+ assertions: [
13119
+ {
13120
+ text: passed ? `Output contains "${value}"` : `Output does not contain "${value}"`,
13121
+ passed
13122
+ }
13123
+ ]
13156
13124
  };
13157
13125
  }
13158
13126
  function runContainsAnyAssertion(output, values) {
@@ -13160,8 +13128,12 @@ function runContainsAnyAssertion(output, values) {
13160
13128
  const passed = matched.length > 0;
13161
13129
  return {
13162
13130
  score: passed ? 1 : 0,
13163
- hits: passed ? [`Output contains "${matched[0]}"`] : [],
13164
- misses: passed ? [] : [`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`]
13131
+ assertions: [
13132
+ {
13133
+ text: passed ? `Output contains "${matched[0]}"` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`,
13134
+ passed
13135
+ }
13136
+ ]
13165
13137
  };
13166
13138
  }
13167
13139
  function runContainsAllAssertion(output, values) {
@@ -13169,16 +13141,24 @@ function runContainsAllAssertion(output, values) {
13169
13141
  const passed = missing.length === 0;
13170
13142
  return {
13171
13143
  score: passed ? 1 : 0,
13172
- hits: passed ? [`Output contains all ${values.length} expected strings`] : [],
13173
- misses: passed ? [] : [`Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`]
13144
+ assertions: [
13145
+ {
13146
+ text: passed ? `Output contains all ${values.length} expected strings` : `Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`,
13147
+ passed
13148
+ }
13149
+ ]
13174
13150
  };
13175
13151
  }
13176
13152
  function runIcontainsAssertion(output, value) {
13177
13153
  const passed = output.toLowerCase().includes(value.toLowerCase());
13178
13154
  return {
13179
13155
  score: passed ? 1 : 0,
13180
- hits: passed ? [`Output contains "${value}" (case-insensitive)`] : [],
13181
- misses: passed ? [] : [`Output does not contain "${value}" (case-insensitive)`]
13156
+ assertions: [
13157
+ {
13158
+ text: passed ? `Output contains "${value}" (case-insensitive)` : `Output does not contain "${value}" (case-insensitive)`,
13159
+ passed
13160
+ }
13161
+ ]
13182
13162
  };
13183
13163
  }
13184
13164
  function runIcontainsAnyAssertion(output, values) {
@@ -13187,9 +13167,11 @@ function runIcontainsAnyAssertion(output, values) {
13187
13167
  const passed = matched.length > 0;
13188
13168
  return {
13189
13169
  score: passed ? 1 : 0,
13190
- hits: passed ? [`Output contains "${matched[0]}" (case-insensitive)`] : [],
13191
- misses: passed ? [] : [
13192
- `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`
13170
+ assertions: [
13171
+ {
13172
+ text: passed ? `Output contains "${matched[0]}" (case-insensitive)` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`,
13173
+ passed
13174
+ }
13193
13175
  ]
13194
13176
  };
13195
13177
  }
@@ -13199,24 +13181,36 @@ function runIcontainsAllAssertion(output, values) {
13199
13181
  const passed = missing.length === 0;
13200
13182
  return {
13201
13183
  score: passed ? 1 : 0,
13202
- hits: passed ? [`Output contains all ${values.length} expected strings (case-insensitive)`] : [],
13203
- misses: passed ? [] : [`Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`]
13184
+ assertions: [
13185
+ {
13186
+ text: passed ? `Output contains all ${values.length} expected strings (case-insensitive)` : `Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`,
13187
+ passed
13188
+ }
13189
+ ]
13204
13190
  };
13205
13191
  }
13206
13192
  function runStartsWithAssertion(output, value) {
13207
13193
  const passed = output.trim().startsWith(value.trim());
13208
13194
  return {
13209
13195
  score: passed ? 1 : 0,
13210
- hits: passed ? [`Output starts with "${value}"`] : [],
13211
- misses: passed ? [] : [`Output does not start with "${value}"`]
13196
+ assertions: [
13197
+ {
13198
+ text: passed ? `Output starts with "${value}"` : `Output does not start with "${value}"`,
13199
+ passed
13200
+ }
13201
+ ]
13212
13202
  };
13213
13203
  }
13214
13204
  function runEndsWithAssertion(output, value) {
13215
13205
  const passed = output.trim().endsWith(value.trim());
13216
13206
  return {
13217
13207
  score: passed ? 1 : 0,
13218
- hits: passed ? [`Output ends with "${value}"`] : [],
13219
- misses: passed ? [] : [`Output does not end with "${value}"`]
13208
+ assertions: [
13209
+ {
13210
+ text: passed ? `Output ends with "${value}"` : `Output does not end with "${value}"`,
13211
+ passed
13212
+ }
13213
+ ]
13220
13214
  };
13221
13215
  }
13222
13216
  function runRegexAssertion(output, pattern, flags) {
@@ -13225,8 +13219,12 @@ function runRegexAssertion(output, pattern, flags) {
13225
13219
  const flagsLabel = flags ? ` (flags: ${flags})` : "";
13226
13220
  return {
13227
13221
  score: passed ? 1 : 0,
13228
- hits: passed ? [`Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}`] : [],
13229
- misses: passed ? [] : [`Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`]
13222
+ assertions: [
13223
+ {
13224
+ text: passed ? `Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}` : `Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`,
13225
+ passed
13226
+ }
13227
+ ]
13230
13228
  };
13231
13229
  }
13232
13230
  function runIsJsonAssertion(output) {
@@ -13238,16 +13236,24 @@ function runIsJsonAssertion(output) {
13238
13236
  }
13239
13237
  return {
13240
13238
  score: passed ? 1 : 0,
13241
- hits: passed ? ["Output is valid JSON"] : [],
13242
- misses: passed ? [] : ["Output is not valid JSON"]
13239
+ assertions: [
13240
+ {
13241
+ text: passed ? "Output is valid JSON" : "Output is not valid JSON",
13242
+ passed
13243
+ }
13244
+ ]
13243
13245
  };
13244
13246
  }
13245
13247
  function runEqualsAssertion(output, value) {
13246
13248
  const passed = output.trim() === value.trim();
13247
13249
  return {
13248
13250
  score: passed ? 1 : 0,
13249
- hits: passed ? [`Output equals "${value}"`] : [],
13250
- misses: passed ? [] : [`Output does not equal "${value}"`]
13251
+ assertions: [
13252
+ {
13253
+ text: passed ? `Output equals "${value}"` : `Output does not equal "${value}"`,
13254
+ passed
13255
+ }
13256
+ ]
13251
13257
  };
13252
13258
  }
13253
13259
 
@@ -13460,10 +13466,8 @@ var InlineAssertEvaluator = class {
13460
13466
  return {
13461
13467
  score,
13462
13468
  verdict: scoreToVerdict(score),
13463
- hits: score >= 0.8 ? [result.name] : [],
13464
- misses: score < 0.5 ? [result.name] : [],
13469
+ assertions: [{ text: result.name, passed: score >= 0.5 }],
13465
13470
  expectedAspectCount: 1,
13466
- reasoning: void 0,
13467
13471
  details: result.metadata ? result.metadata : void 0
13468
13472
  };
13469
13473
  }
@@ -13501,11 +13505,9 @@ async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
13501
13505
  }
13502
13506
  async function executePromptTemplate(script, context, config, timeoutMs) {
13503
13507
  const payload = {
13504
- question: context.evalCase.question,
13505
13508
  criteria: context.evalCase.criteria,
13506
13509
  expectedOutput: context.evalCase.expected_output,
13507
- referenceAnswer: context.evalCase.reference_answer,
13508
- answer: context.candidate,
13510
+ outputText: context.candidate,
13509
13511
  output: context.output ?? null,
13510
13512
  guidelineFiles: context.evalCase.guideline_paths,
13511
13513
  inputFiles: context.evalCase.file_paths.filter(
@@ -13516,9 +13518,7 @@ async function executePromptTemplate(script, context, config, timeoutMs) {
13516
13518
  fileChanges: context.fileChanges ?? null,
13517
13519
  workspacePath: context.workspacePath ?? null,
13518
13520
  config: config ?? context.config ?? null,
13519
- // Text convenience accessors (new names, always strings)
13520
13521
  inputText: context.evalCase.question,
13521
- outputText: context.candidate,
13522
13522
  expectedOutputText: context.evalCase.reference_answer ?? ""
13523
13523
  };
13524
13524
  const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
@@ -13656,9 +13656,7 @@ var containsFactory = (config) => {
13656
13656
  return {
13657
13657
  score: result.score,
13658
13658
  verdict: result.score === 1 ? "pass" : "fail",
13659
- hits: result.hits,
13660
- misses: result.misses,
13661
- reasoning: result.score === 1 ? `Output contains "${c.value}"` : `Output does not contain "${c.value}"`,
13659
+ assertions: result.assertions,
13662
13660
  expectedAspectCount: 1
13663
13661
  };
13664
13662
  });
@@ -13670,9 +13668,7 @@ var regexFactory = (config) => {
13670
13668
  return {
13671
13669
  score: result.score,
13672
13670
  verdict: result.score === 1 ? "pass" : "fail",
13673
- hits: result.hits,
13674
- misses: result.misses,
13675
- reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
13671
+ assertions: result.assertions,
13676
13672
  expectedAspectCount: 1
13677
13673
  };
13678
13674
  });
@@ -13683,9 +13679,7 @@ var isJsonFactory = () => {
13683
13679
  return {
13684
13680
  score: result.score,
13685
13681
  verdict: result.score === 1 ? "pass" : "fail",
13686
- hits: result.hits,
13687
- misses: result.misses,
13688
- reasoning: result.score === 1 ? "Output is valid JSON" : "Output is not valid JSON",
13682
+ assertions: result.assertions,
13689
13683
  expectedAspectCount: 1
13690
13684
  };
13691
13685
  });
@@ -13697,9 +13691,7 @@ var equalsFactory = (config) => {
13697
13691
  return {
13698
13692
  score: result.score,
13699
13693
  verdict: result.score === 1 ? "pass" : "fail",
13700
- hits: result.hits,
13701
- misses: result.misses,
13702
- reasoning: result.score === 1 ? `Output equals "${c.value}"` : `Output does not equal "${c.value}"`,
13694
+ assertions: result.assertions,
13703
13695
  expectedAspectCount: 1
13704
13696
  };
13705
13697
  });
@@ -13711,9 +13703,7 @@ var containsAnyFactory = (config) => {
13711
13703
  return {
13712
13704
  score: result.score,
13713
13705
  verdict: result.score === 1 ? "pass" : "fail",
13714
- hits: result.hits,
13715
- misses: result.misses,
13716
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
13706
+ assertions: result.assertions,
13717
13707
  expectedAspectCount: 1
13718
13708
  };
13719
13709
  });
@@ -13725,9 +13715,7 @@ var containsAllFactory = (config) => {
13725
13715
  return {
13726
13716
  score: result.score,
13727
13717
  verdict: result.score === 1 ? "pass" : "fail",
13728
- hits: result.hits,
13729
- misses: result.misses,
13730
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
13718
+ assertions: result.assertions,
13731
13719
  expectedAspectCount: 1
13732
13720
  };
13733
13721
  });
@@ -13739,9 +13727,7 @@ var icontainsFactory = (config) => {
13739
13727
  return {
13740
13728
  score: result.score,
13741
13729
  verdict: result.score === 1 ? "pass" : "fail",
13742
- hits: result.hits,
13743
- misses: result.misses,
13744
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
13730
+ assertions: result.assertions,
13745
13731
  expectedAspectCount: 1
13746
13732
  };
13747
13733
  });
@@ -13753,9 +13739,7 @@ var icontainsAnyFactory = (config) => {
13753
13739
  return {
13754
13740
  score: result.score,
13755
13741
  verdict: result.score === 1 ? "pass" : "fail",
13756
- hits: result.hits,
13757
- misses: result.misses,
13758
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
13742
+ assertions: result.assertions,
13759
13743
  expectedAspectCount: 1
13760
13744
  };
13761
13745
  });
@@ -13767,9 +13751,7 @@ var icontainsAllFactory = (config) => {
13767
13751
  return {
13768
13752
  score: result.score,
13769
13753
  verdict: result.score === 1 ? "pass" : "fail",
13770
- hits: result.hits,
13771
- misses: result.misses,
13772
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
13754
+ assertions: result.assertions,
13773
13755
  expectedAspectCount: 1
13774
13756
  };
13775
13757
  });
@@ -13781,9 +13763,7 @@ var startsWithFactory = (config) => {
13781
13763
  return {
13782
13764
  score: result.score,
13783
13765
  verdict: result.score === 1 ? "pass" : "fail",
13784
- hits: result.hits,
13785
- misses: result.misses,
13786
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
13766
+ assertions: result.assertions,
13787
13767
  expectedAspectCount: 1
13788
13768
  };
13789
13769
  });
@@ -13795,9 +13775,7 @@ var endsWithFactory = (config) => {
13795
13775
  return {
13796
13776
  score: result.score,
13797
13777
  verdict: result.score === 1 ? "pass" : "fail",
13798
- hits: result.hits,
13799
- misses: result.misses,
13800
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
13778
+ assertions: result.assertions,
13801
13779
  expectedAspectCount: 1
13802
13780
  };
13803
13781
  });
@@ -14868,7 +14846,7 @@ async function runEvaluation(options) {
14868
14846
  if (!cliModel) {
14869
14847
  throw new Error('--grader-target "agentv" requires --model (e.g., "openai:gpt-5-mini")');
14870
14848
  }
14871
- const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-HDSAUUEF.js");
14849
+ const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-NFFLXG5M.js");
14872
14850
  return new AgentvProvider2("agentv", { model: cliModel, temperature: 0 });
14873
14851
  }
14874
14852
  const overrideTarget = resolveTargetByName(cliGraderTarget);
@@ -15203,9 +15181,8 @@ async function runEvaluation(options) {
15203
15181
  testId: evalCase.id,
15204
15182
  dataset: evalCase.dataset,
15205
15183
  score: 0,
15206
- hits: [],
15207
- misses: [],
15208
- answer: "",
15184
+ assertions: [],
15185
+ outputText: "",
15209
15186
  target: target.name,
15210
15187
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
15211
15188
  budgetExceeded: true,
@@ -15240,9 +15217,8 @@ async function runEvaluation(options) {
15240
15217
  testId: evalCase.id,
15241
15218
  dataset: evalCase.dataset,
15242
15219
  score: 0,
15243
- hits: [],
15244
- misses: [],
15245
- answer: "",
15220
+ assertions: [],
15221
+ outputText: "",
15246
15222
  target: target.name,
15247
15223
  error: errorMsg,
15248
15224
  executionStatus: "execution_error",
@@ -16208,11 +16184,9 @@ async function evaluateCandidate(options) {
16208
16184
  dataset: evalCase.dataset,
16209
16185
  conversationId: evalCase.conversation_id,
16210
16186
  score: score.score,
16211
- hits: score.hits,
16212
- misses: score.misses,
16213
- answer: candidate,
16187
+ assertions: score.assertions,
16188
+ outputText: candidate,
16214
16189
  target: target.name,
16215
- reasoning: score.reasoning,
16216
16190
  tokenUsage,
16217
16191
  costUsd,
16218
16192
  durationMs,
@@ -16386,9 +16360,7 @@ async function runEvaluatorList(options) {
16386
16360
  score: score2.score,
16387
16361
  weight,
16388
16362
  verdict: score2.verdict,
16389
- hits: score2.hits,
16390
- misses: score2.misses,
16391
- reasoning: score2.reasoning,
16363
+ assertions: score2.assertions,
16392
16364
  evaluatorProviderRequest: score2.evaluatorRawRequest,
16393
16365
  details: score2.details,
16394
16366
  scores: mapChildResults(score2.scores),
@@ -16403,10 +16375,10 @@ async function runEvaluatorList(options) {
16403
16375
  const fallbackScore = {
16404
16376
  score: 0,
16405
16377
  verdict: "fail",
16406
- hits: [],
16407
- misses: [`Evaluator '${evaluatorConfig.name}' failed: ${message}`],
16408
- expectedAspectCount: 1,
16409
- reasoning: message
16378
+ assertions: [
16379
+ { text: `Evaluator '${evaluatorConfig.name}' failed: ${message}`, passed: false }
16380
+ ],
16381
+ expectedAspectCount: 1
16410
16382
  };
16411
16383
  const weight = evaluatorConfig.weight ?? 1;
16412
16384
  scored.push({
@@ -16422,9 +16394,12 @@ async function runEvaluatorList(options) {
16422
16394
  score: 0,
16423
16395
  weight,
16424
16396
  verdict: "fail",
16425
- hits: [],
16426
- misses: [`Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`],
16427
- reasoning: message,
16397
+ assertions: [
16398
+ {
16399
+ text: `Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
16400
+ passed: false
16401
+ }
16402
+ ],
16428
16403
  durationMs: endedAt.getTime() - startedAt.getTime(),
16429
16404
  startedAt: startedAt.toISOString(),
16430
16405
  endedAt: endedAt.toISOString()
@@ -16440,9 +16415,7 @@ async function runEvaluatorList(options) {
16440
16415
  ...scores[lastScoresIdx],
16441
16416
  score: negated.score,
16442
16417
  verdict: negated.verdict,
16443
- hits: [...negated.hits],
16444
- misses: [...negated.misses],
16445
- reasoning: negated.reasoning
16418
+ assertions: [...negated.assertions]
16446
16419
  };
16447
16420
  }
16448
16421
  }
@@ -16457,21 +16430,13 @@ async function runEvaluatorList(options) {
16457
16430
  const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
16458
16431
  scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
16459
16432
  ) : 0;
16460
- const hits = scored.flatMap((entry) => entry.score.hits);
16461
- const misses = scored.flatMap((entry) => entry.score.misses);
16462
- const expectedAspectCount = scored.reduce(
16463
- (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
16464
- 0
16465
- );
16466
- const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
16467
- const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
16433
+ const assertions = scored.flatMap((entry) => entry.score.assertions);
16434
+ const expectedAspectCount = assertions.length || 1;
16468
16435
  const score = {
16469
16436
  score: aggregateScore,
16470
16437
  verdict: scoreToVerdict(aggregateScore),
16471
- hits,
16472
- misses,
16473
- expectedAspectCount,
16474
- reasoning
16438
+ assertions,
16439
+ expectedAspectCount
16475
16440
  };
16476
16441
  return { score, scores };
16477
16442
  }
@@ -16575,9 +16540,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
16575
16540
  dataset: evalCase.dataset,
16576
16541
  conversationId: evalCase.conversation_id,
16577
16542
  score: 0,
16578
- hits: [],
16579
- misses: [`Error: ${message}`],
16580
- answer: `Error occurred: ${message}`,
16543
+ assertions: [{ text: `Error: ${message}`, passed: false }],
16544
+ outputText: `Error occurred: ${message}`,
16581
16545
  target: targetName,
16582
16546
  requests,
16583
16547
  input,
@@ -16686,9 +16650,7 @@ function mapChildResults(children) {
16686
16650
  score: child.score,
16687
16651
  weight: child.weight,
16688
16652
  verdict: child.verdict,
16689
- hits: child.hits,
16690
- misses: child.misses,
16691
- reasoning: child.reasoning,
16653
+ assertions: child.assertions,
16692
16654
  evaluatorProviderRequest: child.evaluatorRawRequest,
16693
16655
  scores: mapChildResults(child.scores),
16694
16656
  details: child.details,
@@ -17117,7 +17079,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
17117
17079
 
17118
17080
  // src/evaluation/baseline.ts
17119
17081
  var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
17120
- "answer",
17082
+ "outputText",
17121
17083
  "requests",
17122
17084
  "trace",
17123
17085
  "workspacePath",
@@ -17291,7 +17253,7 @@ var OtelTraceExporter = class {
17291
17253
  rootSpan.setAttribute("agentv.target", result.target);
17292
17254
  if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
17293
17255
  rootSpan.setAttribute("agentv.score", result.score);
17294
- if (captureContent) rootSpan.setAttribute("agentv.answer", result.answer);
17256
+ if (captureContent) rootSpan.setAttribute("agentv.output_text", result.outputText);
17295
17257
  if (result.durationMs != null)
17296
17258
  rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
17297
17259
  if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
@@ -17653,7 +17615,6 @@ export {
17653
17615
  freeformEvaluationSchema,
17654
17616
  generateRubrics,
17655
17617
  getAgentvHome,
17656
- getHitCount,
17657
17618
  getOutputFilenames,
17658
17619
  getSubagentsRoot,
17659
17620
  getTraceStateRoot,