@agentv/core 3.4.0 → 3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -5,7 +5,6 @@ import {
5
5
  extractLastAssistantContent,
6
6
  fileExists,
7
7
  findGitRoot,
8
- getHitCount,
9
8
  isAgentProvider,
10
9
  isEvaluatorKind,
11
10
  isJsonObject,
@@ -17,10 +16,10 @@ import {
17
16
  readTextFile,
18
17
  resolveFileReference,
19
18
  resolveTargetDefinition
20
- } from "./chunk-JO4HIAEF.js";
19
+ } from "./chunk-EFR4JHPL.js";
21
20
  import {
22
21
  AgentvProvider
23
- } from "./chunk-Q52FQPKQ.js";
22
+ } from "./chunk-W5YDZWT4.js";
24
23
  import {
25
24
  OtlpJsonFileExporter
26
25
  } from "./chunk-HFSYZHGF.js";
@@ -3752,7 +3751,7 @@ var AzureProvider = class {
3752
3751
  };
3753
3752
  this.retryConfig = config.retry;
3754
3753
  const azure = createAzure(buildAzureOptions(config));
3755
- this.model = azure(config.deploymentName);
3754
+ this.model = azure.chat(config.deploymentName);
3756
3755
  }
3757
3756
  id;
3758
3757
  kind = "azure";
@@ -9784,9 +9783,11 @@ function negateScore(score) {
9784
9783
  ...score,
9785
9784
  score: negatedScore,
9786
9785
  verdict: negatedVerdict,
9787
- reasoning: score.reasoning ? `[Negated] ${score.reasoning} (original score: ${score.score.toFixed(2)})` : `[Negated] Original score: ${score.score.toFixed(2)}`,
9788
- hits: score.misses,
9789
- misses: score.hits
9786
+ assertions: score.assertions.map((a) => ({
9787
+ ...a,
9788
+ passed: !a.passed,
9789
+ evidence: a.evidence ? `[Negated] ${a.evidence}` : void 0
9790
+ }))
9790
9791
  };
9791
9792
  }
9792
9793
 
@@ -10301,9 +10302,13 @@ var CodeEvaluator = class {
10301
10302
  );
10302
10303
  const parsed = parseJsonSafe(stdout);
10303
10304
  const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
10304
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
10305
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
10306
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
10305
+ const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
10306
+ (a) => typeof a === "object" && a !== null && typeof a.text === "string"
10307
+ ).map((a) => ({
10308
+ text: String(a.text),
10309
+ passed: Boolean(a.passed),
10310
+ ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
10311
+ })) : [];
10307
10312
  const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
10308
10313
  const proxyUsage = getProxyUsage?.();
10309
10314
  const evaluatorRawRequest = {
@@ -10319,10 +10324,8 @@ var CodeEvaluator = class {
10319
10324
  return {
10320
10325
  score,
10321
10326
  verdict: scoreToVerdict(score),
10322
- hits,
10323
- misses,
10324
- expectedAspectCount: hits.length + misses.length || 1,
10325
- reasoning,
10327
+ assertions,
10328
+ expectedAspectCount: assertions.length || 1,
10326
10329
  evaluatorRawRequest,
10327
10330
  ...details ? { details } : {},
10328
10331
  tokenUsage: proxyUsage?.tokenUsage
@@ -10333,10 +10336,8 @@ var CodeEvaluator = class {
10333
10336
  return {
10334
10337
  score: 0,
10335
10338
  verdict: "fail",
10336
- hits: [],
10337
- misses: [`Code evaluator failed: ${message}`],
10339
+ assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
10338
10340
  expectedAspectCount: 1,
10339
- reasoning: message,
10340
10341
  evaluatorRawRequest: {
10341
10342
  command: this.command,
10342
10343
  ...this.cwd ? { cwd: this.cwd } : {},
@@ -10444,9 +10445,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
10444
10445
  {{${TEMPLATE_VARIABLES.ANSWER}}}`;
10445
10446
  var freeformEvaluationSchema = z3.object({
10446
10447
  score: z3.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
10447
- hits: z3.array(z3.string()).describe("Brief specific achievements").optional(),
10448
- misses: z3.array(z3.string()).describe("Brief failures or omissions").optional(),
10449
- reasoning: z3.string().describe("Concise explanation (1-2 sentences)").optional()
10448
+ assertions: z3.array(
10449
+ z3.object({
10450
+ text: z3.string().describe("Brief description of what was checked"),
10451
+ passed: z3.boolean().describe("Whether this aspect was satisfied"),
10452
+ evidence: z3.string().describe("Concise evidence (1-2 sentences)").optional()
10453
+ })
10454
+ ).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
10450
10455
  });
10451
10456
  var rubricCheckResultSchema = z3.object({
10452
10457
  id: z3.string().describe("The ID of the rubric item being checked"),
@@ -10548,17 +10553,12 @@ ${context.fileChanges}`;
10548
10553
  schema: freeformEvaluationSchema
10549
10554
  });
10550
10555
  const score = clampScore(data.score);
10551
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
10552
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
10553
- const reasoning = data.reasoning;
10554
- const expectedAspectCount = Math.max(hits.length + misses.length, 1);
10556
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
10555
10557
  return {
10556
10558
  score,
10557
10559
  verdict: scoreToVerdict(score),
10558
- hits,
10559
- misses,
10560
- expectedAspectCount,
10561
- reasoning,
10560
+ assertions,
10561
+ expectedAspectCount: Math.max(assertions.length, 1),
10562
10562
  evaluatorRawRequest,
10563
10563
  tokenUsage
10564
10564
  };
@@ -10569,10 +10569,8 @@ ${context.fileChanges}`;
10569
10569
  return {
10570
10570
  score: 0,
10571
10571
  verdict: "skip",
10572
- hits: [],
10573
- misses: [`Grader parse failure after 3 attempts: ${message}`],
10572
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
10574
10573
  expectedAspectCount: 1,
10575
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
10576
10574
  evaluatorRawRequest
10577
10575
  };
10578
10576
  }
@@ -10602,14 +10600,12 @@ ${context.fileChanges}`;
10602
10600
  userPrompt: prompt,
10603
10601
  schema: rubricEvaluationSchema
10604
10602
  });
10605
- const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
10603
+ const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
10606
10604
  return {
10607
10605
  score,
10608
10606
  verdict,
10609
- hits,
10610
- misses,
10607
+ assertions,
10611
10608
  expectedAspectCount: rubrics.length,
10612
- reasoning: data.overall_reasoning,
10613
10609
  evaluatorRawRequest,
10614
10610
  tokenUsage
10615
10611
  };
@@ -10620,10 +10616,8 @@ ${context.fileChanges}`;
10620
10616
  return {
10621
10617
  score: 0,
10622
10618
  verdict: "skip",
10623
- hits: [],
10624
- misses: [`Grader parse failure after 3 attempts: ${message}`],
10619
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
10625
10620
  expectedAspectCount: rubrics.length,
10626
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
10627
10621
  evaluatorRawRequest
10628
10622
  };
10629
10623
  }
@@ -10648,14 +10642,12 @@ ${context.fileChanges}`;
10648
10642
  userPrompt: prompt,
10649
10643
  schema: scoreRangeEvaluationSchema
10650
10644
  });
10651
- const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
10645
+ const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
10652
10646
  return {
10653
10647
  score,
10654
10648
  verdict,
10655
- hits,
10656
- misses,
10649
+ assertions,
10657
10650
  expectedAspectCount: rubrics.length,
10658
- reasoning: data.overall_reasoning,
10659
10651
  evaluatorRawRequest,
10660
10652
  details,
10661
10653
  tokenUsage
@@ -10667,10 +10659,8 @@ ${context.fileChanges}`;
10667
10659
  return {
10668
10660
  score: 0,
10669
10661
  verdict: "skip",
10670
- hits: [],
10671
- misses: [`Grader parse failure after 3 attempts: ${message}`],
10662
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
10672
10663
  expectedAspectCount: rubrics.length,
10673
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
10674
10664
  evaluatorRawRequest
10675
10665
  };
10676
10666
  }
@@ -10727,8 +10717,7 @@ ${context.fileChanges}`;
10727
10717
  return {
10728
10718
  score: 0,
10729
10719
  verdict: "fail",
10730
- hits: [],
10731
- misses: [`llm-grader built-in evaluation failed: ${message}`],
10720
+ assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
10732
10721
  expectedAspectCount: 1,
10733
10722
  evaluatorRawRequest,
10734
10723
  details: { mode: "built-in", error: message }
@@ -10778,8 +10767,9 @@ ${context.fileChanges}`;
10778
10767
  return {
10779
10768
  score: 0,
10780
10769
  verdict: "fail",
10781
- hits: [],
10782
- misses: [`llm-grader ${modeLabel} returned no assistant response`],
10770
+ assertions: [
10771
+ { text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
10772
+ ],
10783
10773
  expectedAspectCount: 1,
10784
10774
  evaluatorRawRequest,
10785
10775
  details: { mode: modeLabel, grader_target: provider.targetName }
@@ -10797,8 +10787,9 @@ ${context.fileChanges}`;
10797
10787
  return {
10798
10788
  score: 0,
10799
10789
  verdict: "fail",
10800
- hits: [],
10801
- misses: [`llm-grader ${modeLabel} evaluation failed: ${message}`],
10790
+ assertions: [
10791
+ { text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
10792
+ ],
10802
10793
  expectedAspectCount: 1,
10803
10794
  evaluatorRawRequest,
10804
10795
  details: {
@@ -10950,29 +10941,24 @@ ${outputSchema}`;
10950
10941
  const parsed = parseJsonFromText(text);
10951
10942
  if (rubrics && rubrics.length > 0) {
10952
10943
  const data2 = rubricEvaluationSchema.parse(parsed);
10953
- const { score: score2, verdict, hits: hits2, misses: misses2 } = calculateRubricScore(data2, rubrics);
10944
+ const { score: score2, verdict, assertions: assertions2 } = calculateRubricScore(data2, rubrics);
10954
10945
  return {
10955
10946
  score: score2,
10956
10947
  verdict,
10957
- hits: hits2,
10958
- misses: misses2,
10948
+ assertions: assertions2,
10959
10949
  expectedAspectCount: rubrics.length,
10960
- reasoning: data2.overall_reasoning,
10961
10950
  evaluatorRawRequest,
10962
10951
  details
10963
10952
  };
10964
10953
  }
10965
10954
  const data = freeformEvaluationSchema.parse(parsed);
10966
10955
  const score = clampScore(data.score);
10967
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
10968
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
10956
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
10969
10957
  return {
10970
10958
  score,
10971
10959
  verdict: scoreToVerdict(score),
10972
- hits,
10973
- misses,
10974
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
10975
- reasoning: data.reasoning,
10960
+ assertions,
10961
+ expectedAspectCount: Math.max(assertions.length, 1),
10976
10962
  evaluatorRawRequest,
10977
10963
  details
10978
10964
  };
@@ -10980,8 +10966,12 @@ ${outputSchema}`;
10980
10966
  return {
10981
10967
  score: 0,
10982
10968
  verdict: "fail",
10983
- hits: [],
10984
- misses: ["Failed to parse llm-grader agent response as valid evaluation JSON"],
10969
+ assertions: [
10970
+ {
10971
+ text: "Failed to parse llm-grader agent response as valid evaluation JSON",
10972
+ passed: false
10973
+ }
10974
+ ],
10985
10975
  expectedAspectCount: 1,
10986
10976
  evaluatorRawRequest,
10987
10977
  details
@@ -11110,9 +11100,13 @@ function buildOutputSchema() {
11110
11100
  "",
11111
11101
  "{",
11112
11102
  ' "score": <number between 0.0 and 1.0>,',
11113
- ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
11114
- ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
11115
- ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
11103
+ ' "assertions": [',
11104
+ " {",
11105
+ ' "text": "<brief description of what was checked>",',
11106
+ ' "passed": <boolean>,',
11107
+ ' "evidence": "<concise evidence, 1-2 sentences, optional>"',
11108
+ " }",
11109
+ " ]",
11116
11110
  "}"
11117
11111
  ].join("\n");
11118
11112
  }
@@ -11137,8 +11131,7 @@ function substituteVariables(template, variables) {
11137
11131
  }
11138
11132
  function calculateRubricScore(result, rubrics) {
11139
11133
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
11140
- const hits = [];
11141
- const misses = [];
11134
+ const assertions = [];
11142
11135
  let totalWeight = 0;
11143
11136
  let earnedWeight = 0;
11144
11137
  let failedRequired = false;
@@ -11148,19 +11141,20 @@ function calculateRubricScore(result, rubrics) {
11148
11141
  continue;
11149
11142
  }
11150
11143
  totalWeight += rubric.weight;
11144
+ assertions.push({
11145
+ text: `[${rubric.id}] ${rubric.outcome}`,
11146
+ passed: check.satisfied,
11147
+ evidence: check.reasoning
11148
+ });
11151
11149
  if (check.satisfied) {
11152
11150
  earnedWeight += rubric.weight;
11153
- hits.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
11154
- } else {
11155
- misses.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
11156
- if (rubric.required) {
11157
- failedRequired = true;
11158
- }
11151
+ } else if (rubric.required) {
11152
+ failedRequired = true;
11159
11153
  }
11160
11154
  }
11161
11155
  const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
11162
11156
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
11163
- return { score, verdict, hits, misses };
11157
+ return { score, verdict, assertions };
11164
11158
  }
11165
11159
  function buildScoreRangeOutputSchema() {
11166
11160
  return `You are an expert evaluator. Score the candidate answer on each criterion.
@@ -11180,8 +11174,7 @@ Important: The "score" must be an integer from 0 to 10 that falls within one of
11180
11174
  }
11181
11175
  function calculateScoreRangeResult(result, rubrics) {
11182
11176
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
11183
- const hits = [];
11184
- const misses = [];
11177
+ const assertions = [];
11185
11178
  const rawScores = {};
11186
11179
  let totalWeight = 0;
11187
11180
  let weightedScoreSum = 0;
@@ -11207,24 +11200,22 @@ function calculateScoreRangeResult(result, rubrics) {
11207
11200
  );
11208
11201
  const rangeDescription = matchingRange?.outcome ?? "";
11209
11202
  const criterionLabel = rubric.outcome ?? rubric.id;
11210
- const reasoningText = check.reasoning ? `: ${check.reasoning}` : "";
11211
- const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
11203
+ const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
11212
11204
  if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
11213
11205
  failedRequired = true;
11214
- misses.push(scoreInfo);
11215
- } else if (rawScore >= 7) {
11216
- hits.push(scoreInfo);
11217
- } else {
11218
- misses.push(scoreInfo);
11219
11206
  }
11207
+ assertions.push({
11208
+ text: `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})`,
11209
+ passed,
11210
+ evidence: check.reasoning
11211
+ });
11220
11212
  }
11221
11213
  const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
11222
11214
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
11223
11215
  return {
11224
11216
  score,
11225
11217
  verdict,
11226
- hits,
11227
- misses,
11218
+ assertions,
11228
11219
  details: {
11229
11220
  raw_scores: rawScores,
11230
11221
  normalization: "score / 10",
@@ -11400,9 +11391,7 @@ var CompositeEvaluator = class {
11400
11391
  let totalWeight = 0;
11401
11392
  let weightedSum = 0;
11402
11393
  let evaluatedCount = 0;
11403
- const allHits = [];
11404
- const allMisses = [];
11405
- const reasoningParts = [];
11394
+ const allAssertions = [];
11406
11395
  const scores = [];
11407
11396
  for (const member of results) {
11408
11397
  const weight = weights?.[member.id] ?? 1;
@@ -11412,9 +11401,7 @@ var CompositeEvaluator = class {
11412
11401
  score: member.result.score,
11413
11402
  weight,
11414
11403
  verdict: member.result.verdict,
11415
- hits: [...member.result.hits],
11416
- misses: [...member.result.misses],
11417
- reasoning: member.result.reasoning,
11404
+ assertions: [...member.result.assertions],
11418
11405
  evaluatorRawRequest: member.result.evaluatorRawRequest,
11419
11406
  scores: member.result.scores,
11420
11407
  details: member.result.details,
@@ -11426,20 +11413,16 @@ var CompositeEvaluator = class {
11426
11413
  evaluatedCount++;
11427
11414
  totalWeight += weight;
11428
11415
  weightedSum += member.result.score * weight;
11429
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
11430
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
11431
- if (member.result.reasoning) {
11432
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
11433
- }
11416
+ allAssertions.push(
11417
+ ...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
11418
+ );
11434
11419
  }
11435
11420
  if (evaluatedCount === 0 && results.length > 0) {
11436
11421
  return {
11437
11422
  score: 0,
11438
11423
  verdict: "skip",
11439
- hits: [],
11440
- misses: [],
11424
+ assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
11441
11425
  expectedAspectCount: 1,
11442
- reasoning: "All evaluators skipped (infrastructure failure)",
11443
11426
  evaluatorRawRequest: {
11444
11427
  aggregator: "weighted_average",
11445
11428
  ...weights ? { weights } : {}
@@ -11451,10 +11434,8 @@ var CompositeEvaluator = class {
11451
11434
  return {
11452
11435
  score: clampScore(finalScore),
11453
11436
  verdict: scoreToVerdict(finalScore),
11454
- hits: allHits,
11455
- misses: allMisses,
11456
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
11457
- reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
11437
+ assertions: allAssertions,
11438
+ expectedAspectCount: allAssertions.length || 1,
11458
11439
  evaluatorRawRequest: {
11459
11440
  aggregator: "weighted_average",
11460
11441
  ...weights ? { weights } : {}
@@ -11464,11 +11445,8 @@ var CompositeEvaluator = class {
11464
11445
  }
11465
11446
  runThreshold(results, threshold) {
11466
11447
  const scores = [];
11467
- const allHits = [];
11468
- const allMisses = [];
11469
- const reasoningParts = [];
11448
+ const allAssertions = [];
11470
11449
  let passingCount = 0;
11471
- let borderlineCount = 0;
11472
11450
  let evaluatedCount = 0;
11473
11451
  for (const member of results) {
11474
11452
  scores.push({
@@ -11476,9 +11454,7 @@ var CompositeEvaluator = class {
11476
11454
  type: member.type,
11477
11455
  score: member.result.score,
11478
11456
  verdict: member.result.verdict,
11479
- hits: [...member.result.hits],
11480
- misses: [...member.result.misses],
11481
- reasoning: member.result.reasoning,
11457
+ assertions: [...member.result.assertions],
11482
11458
  evaluatorRawRequest: member.result.evaluatorRawRequest,
11483
11459
  scores: member.result.scores,
11484
11460
  details: member.result.details,
@@ -11491,24 +11467,17 @@ var CompositeEvaluator = class {
11491
11467
  const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
11492
11468
  if (isPassing) {
11493
11469
  passingCount++;
11494
- if (member.result.verdict === "borderline") {
11495
- borderlineCount++;
11496
- }
11497
- }
11498
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
11499
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
11500
- if (member.result.reasoning) {
11501
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
11502
11470
  }
11471
+ allAssertions.push(
11472
+ ...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
11473
+ );
11503
11474
  }
11504
11475
  if (evaluatedCount === 0 && results.length > 0) {
11505
11476
  return {
11506
11477
  score: 0,
11507
11478
  verdict: "skip",
11508
- hits: [],
11509
- misses: [],
11479
+ assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
11510
11480
  expectedAspectCount: 1,
11511
- reasoning: "All evaluators skipped (infrastructure failure)",
11512
11481
  evaluatorRawRequest: {
11513
11482
  aggregator: "threshold",
11514
11483
  threshold
@@ -11519,19 +11488,15 @@ var CompositeEvaluator = class {
11519
11488
  const totalCount = evaluatedCount;
11520
11489
  const score = totalCount > 0 ? passingCount / totalCount : 0;
11521
11490
  const pass = score >= threshold;
11522
- if (pass && borderlineCount > 0) {
11523
- reasoningParts.push(`Warning: ${borderlineCount} borderline evaluator(s) counted as passing`);
11524
- }
11525
- reasoningParts.unshift(
11526
- `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`
11527
- );
11491
+ allAssertions.unshift({
11492
+ text: `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`,
11493
+ passed: pass
11494
+ });
11528
11495
  return {
11529
11496
  score: clampScore(score),
11530
11497
  verdict: pass ? "pass" : "fail",
11531
- hits: allHits,
11532
- misses: allMisses,
11533
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
11534
- reasoning: reasoningParts.join("; "),
11498
+ assertions: allAssertions,
11499
+ expectedAspectCount: allAssertions.length || 1,
11535
11500
  evaluatorRawRequest: {
11536
11501
  aggregator: "threshold",
11537
11502
  threshold
@@ -11548,9 +11513,7 @@ var CompositeEvaluator = class {
11548
11513
  score: member.result.score,
11549
11514
  weight: weights?.[member.id] ?? 1,
11550
11515
  verdict: member.result.verdict,
11551
- hits: [...member.result.hits],
11552
- misses: [...member.result.misses],
11553
- reasoning: member.result.reasoning,
11516
+ assertions: [...member.result.assertions],
11554
11517
  evaluatorRawRequest: member.result.evaluatorRawRequest,
11555
11518
  scores: member.result.scores,
11556
11519
  details: member.result.details
@@ -11559,17 +11522,19 @@ var CompositeEvaluator = class {
11559
11522
  const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
11560
11523
  const parsed = parseJsonSafe(stdout);
11561
11524
  const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
11562
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
11563
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
11564
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
11525
+ const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
11526
+ (a) => typeof a === "object" && a !== null && typeof a.text === "string"
11527
+ ).map((a) => ({
11528
+ text: String(a.text),
11529
+ passed: Boolean(a.passed),
11530
+ ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
11531
+ })) : [];
11565
11532
  const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
11566
11533
  return {
11567
11534
  score,
11568
11535
  verdict,
11569
- hits,
11570
- misses,
11571
- expectedAspectCount: hits.length + misses.length || 1,
11572
- reasoning,
11536
+ assertions,
11537
+ expectedAspectCount: assertions.length || 1,
11573
11538
  evaluatorRawRequest: {
11574
11539
  aggregator: "code-grader",
11575
11540
  script: scriptPath
@@ -11581,10 +11546,8 @@ var CompositeEvaluator = class {
11581
11546
  return {
11582
11547
  score: 0,
11583
11548
  verdict: "fail",
11584
- hits: [],
11585
- misses: [`Code aggregator failed: ${message}`],
11549
+ assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
11586
11550
  expectedAspectCount: 1,
11587
- reasoning: message,
11588
11551
  evaluatorRawRequest: {
11589
11552
  aggregator: "code-grader",
11590
11553
  script: scriptPath,
@@ -11606,9 +11569,7 @@ var CompositeEvaluator = class {
11606
11569
  type: member.type,
11607
11570
  score: member.result.score,
11608
11571
  verdict: member.result.verdict,
11609
- hits: [...member.result.hits],
11610
- misses: [...member.result.misses],
11611
- reasoning: member.result.reasoning,
11572
+ assertions: [...member.result.assertions],
11612
11573
  evaluatorRawRequest: member.result.evaluatorRawRequest,
11613
11574
  scores: member.result.scores,
11614
11575
  details: member.result.details
@@ -11632,16 +11593,12 @@ var CompositeEvaluator = class {
11632
11593
  });
11633
11594
  const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
11634
11595
  const score2 = clampScore(data2.score);
11635
- const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
11636
- const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
11637
- const reasoning2 = data2.reasoning;
11596
+ const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
11638
11597
  return {
11639
11598
  score: score2,
11640
11599
  verdict: scoreToVerdict(score2),
11641
- hits: hits2,
11642
- misses: misses2,
11643
- expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
11644
- reasoning: reasoning2,
11600
+ assertions: assertions2,
11601
+ expectedAspectCount: Math.max(assertions2.length, 1),
11645
11602
  evaluatorRawRequest,
11646
11603
  scores
11647
11604
  };
@@ -11656,16 +11613,12 @@ var CompositeEvaluator = class {
11656
11613
  parseJsonFromText(extractLastAssistantContent(response.output))
11657
11614
  );
11658
11615
  const score = clampScore(data.score);
11659
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
11660
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
11661
- const reasoning = data.reasoning;
11616
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
11662
11617
  return {
11663
11618
  score,
11664
11619
  verdict: scoreToVerdict(score),
11665
- hits,
11666
- misses,
11667
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
11668
- reasoning,
11620
+ assertions,
11621
+ expectedAspectCount: Math.max(assertions.length, 1),
11669
11622
  evaluatorRawRequest,
11670
11623
  scores
11671
11624
  };
@@ -11673,8 +11626,7 @@ var CompositeEvaluator = class {
11673
11626
  return {
11674
11627
  score: 0,
11675
11628
  verdict: "fail",
11676
- hits: [],
11677
- misses: [],
11629
+ assertions: [{ text: "LLM aggregator failed", passed: false }],
11678
11630
  expectedAspectCount: 1,
11679
11631
  evaluatorRawRequest,
11680
11632
  scores
@@ -11697,10 +11649,8 @@ var CostEvaluator = class {
11697
11649
  return {
11698
11650
  score: 0,
11699
11651
  verdict: "fail",
11700
- hits: [],
11701
- misses: ["No cost data available in trace"],
11652
+ assertions: [{ text: "No cost data available in trace", passed: false }],
11702
11653
  expectedAspectCount: 1,
11703
- reasoning: "Execution cost not reported by provider",
11704
11654
  evaluatorRawRequest: {
11705
11655
  type: "cost",
11706
11656
  budget,
@@ -11714,10 +11664,10 @@ var CostEvaluator = class {
11714
11664
  return {
11715
11665
  score,
11716
11666
  verdict: passed ? "pass" : "fail",
11717
- hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
11718
- misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
11667
+ assertions: [
11668
+ passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
11669
+ ],
11719
11670
  expectedAspectCount: 1,
11720
- reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
11721
11671
  evaluatorRawRequest: {
11722
11672
  type: "cost",
11723
11673
  budget,
@@ -11750,10 +11700,8 @@ var ExecutionMetricsEvaluator = class {
11750
11700
  return {
11751
11701
  score: 0,
11752
11702
  verdict: "fail",
11753
- hits: [],
11754
- misses: ["No trace summary available"],
11703
+ assertions: [{ text: "No trace summary available", passed: false }],
11755
11704
  expectedAspectCount: 1,
11756
- reasoning: "Execution metrics not available - no trace summary provided",
11757
11705
  evaluatorRawRequest: {
11758
11706
  type: "execution-metrics",
11759
11707
  config: this.extractConfiguredThresholds(),
@@ -11762,116 +11710,114 @@ var ExecutionMetricsEvaluator = class {
11762
11710
  };
11763
11711
  }
11764
11712
  const narrowedTrace = trace;
11765
- const hits = [];
11766
- const misses = [];
11713
+ const assertions = [];
11767
11714
  const actualMetrics = {};
11768
11715
  if (max_tool_calls !== void 0 && narrowedTrace) {
11769
11716
  const toolCalls = narrowedTrace.eventCount;
11770
11717
  actualMetrics.tool_calls = toolCalls;
11771
11718
  if (toolCalls <= max_tool_calls) {
11772
- hits.push(`Tool calls ${toolCalls} <= ${max_tool_calls} max`);
11719
+ assertions.push({ text: `Tool calls ${toolCalls} <= ${max_tool_calls} max`, passed: true });
11773
11720
  } else {
11774
- misses.push(`Tool calls ${toolCalls} > ${max_tool_calls} max`);
11721
+ assertions.push({ text: `Tool calls ${toolCalls} > ${max_tool_calls} max`, passed: false });
11775
11722
  }
11776
11723
  }
11777
11724
  if (max_llm_calls !== void 0 && narrowedTrace) {
11778
11725
  const llmCalls = narrowedTrace.llmCallCount;
11779
11726
  if (llmCalls === void 0) {
11780
- misses.push("LLM call count data not available");
11727
+ assertions.push({ text: "LLM call count data not available", passed: false });
11781
11728
  } else {
11782
11729
  actualMetrics.llm_calls = llmCalls;
11783
11730
  if (llmCalls <= max_llm_calls) {
11784
- hits.push(`LLM calls ${llmCalls} <= ${max_llm_calls} max`);
11731
+ assertions.push({ text: `LLM calls ${llmCalls} <= ${max_llm_calls} max`, passed: true });
11785
11732
  } else {
11786
- misses.push(`LLM calls ${llmCalls} > ${max_llm_calls} max`);
11733
+ assertions.push({ text: `LLM calls ${llmCalls} > ${max_llm_calls} max`, passed: false });
11787
11734
  }
11788
11735
  }
11789
11736
  }
11790
11737
  if (max_tokens !== void 0) {
11791
11738
  if (!tokenUsage) {
11792
- misses.push("Token usage data not available");
11739
+ assertions.push({ text: "Token usage data not available", passed: false });
11793
11740
  } else {
11794
11741
  const totalTokens = tokenUsage.input + tokenUsage.output;
11795
11742
  actualMetrics.tokens = totalTokens;
11796
11743
  if (totalTokens <= max_tokens) {
11797
- hits.push(`Total tokens ${totalTokens} <= ${max_tokens} max`);
11744
+ assertions.push({
11745
+ text: `Total tokens ${totalTokens} <= ${max_tokens} max`,
11746
+ passed: true
11747
+ });
11798
11748
  } else {
11799
- misses.push(`Total tokens ${totalTokens} > ${max_tokens} max`);
11749
+ assertions.push({
11750
+ text: `Total tokens ${totalTokens} > ${max_tokens} max`,
11751
+ passed: false
11752
+ });
11800
11753
  }
11801
11754
  }
11802
11755
  }
11803
11756
  if (max_cost_usd !== void 0) {
11804
11757
  if (costUsd === void 0) {
11805
- misses.push("Cost data not available");
11758
+ assertions.push({ text: "Cost data not available", passed: false });
11806
11759
  } else {
11807
11760
  actualMetrics.cost_usd = costUsd;
11808
11761
  const formatCost = (n) => `$${n.toFixed(4)}`;
11809
11762
  if (costUsd <= max_cost_usd) {
11810
- hits.push(`Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`);
11763
+ assertions.push({
11764
+ text: `Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`,
11765
+ passed: true
11766
+ });
11811
11767
  } else {
11812
- misses.push(`Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`);
11768
+ assertions.push({
11769
+ text: `Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`,
11770
+ passed: false
11771
+ });
11813
11772
  }
11814
11773
  }
11815
11774
  }
11816
11775
  if (max_duration_ms !== void 0) {
11817
11776
  if (durationMs === void 0) {
11818
- misses.push("Duration data not available");
11777
+ assertions.push({ text: "Duration data not available", passed: false });
11819
11778
  } else {
11820
11779
  actualMetrics.duration_ms = durationMs;
11821
11780
  if (durationMs <= max_duration_ms) {
11822
- hits.push(`Duration ${durationMs}ms <= ${max_duration_ms}ms max`);
11781
+ assertions.push({
11782
+ text: `Duration ${durationMs}ms <= ${max_duration_ms}ms max`,
11783
+ passed: true
11784
+ });
11823
11785
  } else {
11824
- misses.push(`Duration ${durationMs}ms > ${max_duration_ms}ms max`);
11786
+ assertions.push({
11787
+ text: `Duration ${durationMs}ms > ${max_duration_ms}ms max`,
11788
+ passed: false
11789
+ });
11825
11790
  }
11826
11791
  }
11827
11792
  }
11828
11793
  if (target_exploration_ratio !== void 0 && narrowedTrace) {
11829
11794
  const ratio = explorationRatio(narrowedTrace);
11830
11795
  if (ratio === void 0) {
11831
- misses.push("Exploration ratio not available (no tool calls)");
11796
+ assertions.push({ text: "Exploration ratio not available (no tool calls)", passed: false });
11832
11797
  } else {
11833
11798
  actualMetrics.exploration_ratio = ratio;
11834
11799
  const diff = Math.abs(ratio - target_exploration_ratio);
11835
11800
  if (diff <= exploration_tolerance) {
11836
- hits.push(
11837
- `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`
11838
- );
11801
+ assertions.push({
11802
+ text: `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`,
11803
+ passed: true
11804
+ });
11839
11805
  } else {
11840
- misses.push(
11841
- `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`
11842
- );
11806
+ assertions.push({
11807
+ text: `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`,
11808
+ passed: false
11809
+ });
11843
11810
  }
11844
11811
  }
11845
11812
  }
11846
- const totalChecks = hits.length + misses.length;
11847
- const score = totalChecks > 0 ? hits.length / totalChecks : 0;
11848
- const reasoningParts = [];
11849
- if (actualMetrics.tool_calls !== void 0) {
11850
- reasoningParts.push(`tool_calls=${actualMetrics.tool_calls}`);
11851
- }
11852
- if (actualMetrics.llm_calls !== void 0) {
11853
- reasoningParts.push(`llm_calls=${actualMetrics.llm_calls}`);
11854
- }
11855
- if (actualMetrics.tokens !== void 0) {
11856
- reasoningParts.push(`tokens=${actualMetrics.tokens}`);
11857
- }
11858
- if (actualMetrics.cost_usd !== void 0) {
11859
- reasoningParts.push(`cost=$${actualMetrics.cost_usd.toFixed(4)}`);
11860
- }
11861
- if (actualMetrics.duration_ms !== void 0) {
11862
- reasoningParts.push(`duration=${actualMetrics.duration_ms}ms`);
11863
- }
11864
- if (actualMetrics.exploration_ratio !== void 0) {
11865
- reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
11866
- }
11867
- const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
11813
+ const totalChecks = assertions.length;
11814
+ const passedCount = assertions.filter((a) => a.passed).length;
11815
+ const score = totalChecks > 0 ? passedCount / totalChecks : 0;
11868
11816
  return {
11869
11817
  score,
11870
11818
  verdict: scoreToVerdict(score),
11871
- hits,
11872
- misses,
11819
+ assertions,
11873
11820
  expectedAspectCount: totalChecks || 1,
11874
- reasoning,
11875
11821
  evaluatorRawRequest: {
11876
11822
  type: "execution-metrics",
11877
11823
  config: this.extractConfiguredThresholds(),
@@ -11975,10 +11921,8 @@ var FieldAccuracyEvaluator = class {
11975
11921
  return {
11976
11922
  score: 0,
11977
11923
  verdict: "fail",
11978
- hits: [],
11979
- misses: ["Failed to parse candidate answer as JSON"],
11980
- expectedAspectCount: this.config.fields.length,
11981
- reasoning: "Candidate answer is not valid JSON"
11924
+ assertions: [{ text: "Failed to parse candidate answer as JSON", passed: false }],
11925
+ expectedAspectCount: this.config.fields.length
11982
11926
  };
11983
11927
  }
11984
11928
  const expectedData = this.extractExpectedData(evalCase.expected_output);
@@ -11986,10 +11930,8 @@ var FieldAccuracyEvaluator = class {
11986
11930
  return {
11987
11931
  score: 0,
11988
11932
  verdict: "fail",
11989
- hits: [],
11990
- misses: ["No expected data found in expected_output"],
11991
- expectedAspectCount: this.config.fields.length,
11992
- reasoning: "Could not extract expected data from expected_output"
11933
+ assertions: [{ text: "No expected data found in expected_output", passed: false }],
11934
+ expectedAspectCount: this.config.fields.length
11993
11935
  };
11994
11936
  }
11995
11937
  const fieldResults = [];
@@ -12207,18 +12149,14 @@ var FieldAccuracyEvaluator = class {
12207
12149
  */
12208
12150
  aggregateResults(results) {
12209
12151
  const aggregation = this.config.aggregation ?? "weighted_average";
12210
- const hits = [];
12211
- const misses = [];
12152
+ const assertions = [];
12212
12153
  for (const result of results) {
12213
- if (result.hit) {
12214
- hits.push(result.message);
12215
- } else {
12216
- misses.push(result.message);
12217
- }
12154
+ assertions.push({ text: result.message, passed: result.hit });
12218
12155
  }
12219
12156
  let score;
12220
12157
  if (aggregation === "all_or_nothing") {
12221
- score = misses.length === 0 ? 1 : 0;
12158
+ const hasFailed = assertions.some((a) => !a.passed);
12159
+ score = hasFailed ? 0 : 1;
12222
12160
  } else {
12223
12161
  const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
12224
12162
  if (totalWeight === 0) {
@@ -12228,15 +12166,11 @@ var FieldAccuracyEvaluator = class {
12228
12166
  score = weightedSum / totalWeight;
12229
12167
  }
12230
12168
  }
12231
- const reasoning = `${hits.length}/${results.length} fields matched`;
12232
12169
  return {
12233
12170
  score: clampScore(score),
12234
12171
  verdict: scoreToVerdict(score),
12235
- hits: hits.slice(0, 4),
12236
- // Cap at 4 to keep output concise
12237
- misses: misses.slice(0, 4),
12238
- expectedAspectCount: results.length,
12239
- reasoning
12172
+ assertions,
12173
+ expectedAspectCount: results.length
12240
12174
  };
12241
12175
  }
12242
12176
  };
@@ -12345,10 +12279,8 @@ var LatencyEvaluator = class {
12345
12279
  return {
12346
12280
  score: 0,
12347
12281
  verdict: "fail",
12348
- hits: [],
12349
- misses: ["No duration data available in trace"],
12282
+ assertions: [{ text: "No duration data available in trace", passed: false }],
12350
12283
  expectedAspectCount: 1,
12351
- reasoning: "Execution duration not reported by provider",
12352
12284
  evaluatorRawRequest: {
12353
12285
  type: "latency",
12354
12286
  threshold,
@@ -12361,10 +12293,10 @@ var LatencyEvaluator = class {
12361
12293
  return {
12362
12294
  score,
12363
12295
  verdict: passed ? "pass" : "fail",
12364
- hits: passed ? [`Duration ${durationMs}ms <= ${threshold}ms threshold`] : [],
12365
- misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
12296
+ assertions: [
12297
+ passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
12298
+ ],
12366
12299
  expectedAspectCount: 1,
12367
- reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
12368
12300
  evaluatorRawRequest: {
12369
12301
  type: "latency",
12370
12302
  threshold,
@@ -12440,23 +12372,25 @@ var SkillTriggerEvaluator = class {
12440
12372
  return {
12441
12373
  score: 1,
12442
12374
  verdict: "pass",
12443
- hits: [
12444
- shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`
12375
+ assertions: [
12376
+ {
12377
+ text: shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`,
12378
+ passed: true
12379
+ }
12445
12380
  ],
12446
- misses: [],
12447
- expectedAspectCount: 1,
12448
- reasoning: shouldTrigger ? "Skill triggered correctly" : "No false trigger"
12381
+ expectedAspectCount: 1
12449
12382
  };
12450
12383
  }
12451
12384
  return {
12452
12385
  score: 0,
12453
12386
  verdict: "fail",
12454
- hits: [],
12455
- misses: [
12456
- shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`
12387
+ assertions: [
12388
+ {
12389
+ text: shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`,
12390
+ passed: false
12391
+ }
12457
12392
  ],
12458
- expectedAspectCount: 1,
12459
- reasoning: shouldTrigger ? `Skill "${skillName}" was not triggered` : "False trigger: skill fired when it should not have"
12393
+ expectedAspectCount: 1
12460
12394
  };
12461
12395
  }
12462
12396
  };
@@ -12625,10 +12559,8 @@ var TokenUsageEvaluator = class {
12625
12559
  return {
12626
12560
  score: 0,
12627
12561
  verdict: "fail",
12628
- hits: [],
12629
- misses: ["No token usage data available in trace"],
12562
+ assertions: [{ text: "No token usage data available in trace", passed: false }],
12630
12563
  expectedAspectCount,
12631
- reasoning: "Token usage not reported by provider",
12632
12564
  evaluatorRawRequest: {
12633
12565
  type: "token-usage",
12634
12566
  max_total: maxTotal ?? null,
@@ -12642,37 +12574,34 @@ var TokenUsageEvaluator = class {
12642
12574
  const output = usage.output;
12643
12575
  const cached = usage.cached ?? 0;
12644
12576
  const total = input + output + cached;
12645
- const hits = [];
12646
- const misses = [];
12577
+ const assertions = [];
12647
12578
  if (typeof maxInput === "number") {
12648
12579
  if (input <= maxInput) {
12649
- hits.push(`Input tokens ${input} <= ${maxInput}`);
12580
+ assertions.push({ text: `Input tokens ${input} <= ${maxInput}`, passed: true });
12650
12581
  } else {
12651
- misses.push(`Input tokens ${input} > ${maxInput}`);
12582
+ assertions.push({ text: `Input tokens ${input} > ${maxInput}`, passed: false });
12652
12583
  }
12653
12584
  }
12654
12585
  if (typeof maxOutput === "number") {
12655
12586
  if (output <= maxOutput) {
12656
- hits.push(`Output tokens ${output} <= ${maxOutput}`);
12587
+ assertions.push({ text: `Output tokens ${output} <= ${maxOutput}`, passed: true });
12657
12588
  } else {
12658
- misses.push(`Output tokens ${output} > ${maxOutput}`);
12589
+ assertions.push({ text: `Output tokens ${output} > ${maxOutput}`, passed: false });
12659
12590
  }
12660
12591
  }
12661
12592
  if (typeof maxTotal === "number") {
12662
12593
  if (total <= maxTotal) {
12663
- hits.push(`Total tokens ${total} <= ${maxTotal}`);
12594
+ assertions.push({ text: `Total tokens ${total} <= ${maxTotal}`, passed: true });
12664
12595
  } else {
12665
- misses.push(`Total tokens ${total} > ${maxTotal}`);
12596
+ assertions.push({ text: `Total tokens ${total} > ${maxTotal}`, passed: false });
12666
12597
  }
12667
12598
  }
12668
- const passed = misses.length === 0;
12599
+ const passed = assertions.every((a) => a.passed);
12669
12600
  return {
12670
12601
  score: passed ? 1 : 0,
12671
12602
  verdict: passed ? "pass" : "fail",
12672
- hits,
12673
- misses,
12603
+ assertions,
12674
12604
  expectedAspectCount,
12675
- reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
12676
12605
  evaluatorRawRequest: {
12677
12606
  type: "token-usage",
12678
12607
  max_total: maxTotal ?? null,
@@ -12772,8 +12701,7 @@ var ToolTrajectoryEvaluator = class {
12772
12701
  return {
12773
12702
  score: 0,
12774
12703
  verdict: "fail",
12775
- hits: [],
12776
- misses: ["No trace available for evaluation"],
12704
+ assertions: [{ text: "No trace available for evaluation", passed: false }],
12777
12705
  expectedAspectCount: 1
12778
12706
  };
12779
12707
  }
@@ -12784,8 +12712,7 @@ var ToolTrajectoryEvaluator = class {
12784
12712
  return {
12785
12713
  score: 0,
12786
12714
  verdict: "fail",
12787
- hits: [],
12788
- misses: ["No trace available for evaluation"],
12715
+ assertions: [{ text: "No trace available for evaluation", passed: false }],
12789
12716
  expectedAspectCount: 1
12790
12717
  };
12791
12718
  }
@@ -12803,8 +12730,7 @@ var ToolTrajectoryEvaluator = class {
12803
12730
  return {
12804
12731
  score: 0,
12805
12732
  verdict: "fail",
12806
- hits: [],
12807
- misses: [`Unknown mode: ${this.config.mode}`],
12733
+ assertions: [{ text: `Unknown mode: ${this.config.mode}`, passed: false }],
12808
12734
  expectedAspectCount: 1
12809
12735
  };
12810
12736
  }
@@ -12853,28 +12779,32 @@ var ToolTrajectoryEvaluator = class {
12853
12779
  return {
12854
12780
  score: 1,
12855
12781
  verdict: "pass",
12856
- hits: ["No tool requirements specified"],
12857
- misses: [],
12782
+ assertions: [{ text: "No tool requirements specified", passed: true }],
12858
12783
  expectedAspectCount: 0
12859
12784
  };
12860
12785
  }
12861
- const hits = [];
12862
- const misses = [];
12786
+ const assertions = [];
12863
12787
  for (const toolName of toolNames) {
12864
12788
  const required = minimums[toolName];
12865
12789
  const actual = summary.toolCallsByName[toolName] ?? 0;
12866
12790
  if (actual >= required) {
12867
- hits.push(`${toolName}: called ${actual} times (required >=${required})`);
12791
+ assertions.push({
12792
+ text: `${toolName}: called ${actual} times (required >=${required})`,
12793
+ passed: true
12794
+ });
12868
12795
  } else {
12869
- misses.push(`${toolName}: called ${actual} times (required >=${required})`);
12796
+ assertions.push({
12797
+ text: `${toolName}: called ${actual} times (required >=${required})`,
12798
+ passed: false
12799
+ });
12870
12800
  }
12871
12801
  }
12872
- const score = hits.length / toolNames.length;
12802
+ const passedCount = assertions.filter((a) => a.passed).length;
12803
+ const score = passedCount / toolNames.length;
12873
12804
  return {
12874
12805
  score,
12875
12806
  verdict: scoreToVerdict(score),
12876
- hits,
12877
- misses,
12807
+ assertions,
12878
12808
  expectedAspectCount: toolNames.length
12879
12809
  };
12880
12810
  }
@@ -12884,13 +12814,11 @@ var ToolTrajectoryEvaluator = class {
12884
12814
  return {
12885
12815
  score: 1,
12886
12816
  verdict: "pass",
12887
- hits: ["No tool sequence specified"],
12888
- misses: [],
12817
+ assertions: [{ text: "No tool sequence specified", passed: true }],
12889
12818
  expectedAspectCount: 0
12890
12819
  };
12891
12820
  }
12892
- const hits = [];
12893
- const misses = [];
12821
+ const assertions = [];
12894
12822
  const warnings = [];
12895
12823
  let actualIndex = 0;
12896
12824
  let sequenceHits = 0;
@@ -12910,16 +12838,20 @@ var ToolTrajectoryEvaluator = class {
12910
12838
  const actualCall = toolCalls[actualIndex];
12911
12839
  if (actualCall.name === expectedTool) {
12912
12840
  if (argsMatch(expectedItem.args, actualCall.args, mode)) {
12913
- hits.push(`Found ${expectedTool} at position ${actualIndex}`);
12841
+ assertions.push({
12842
+ text: `Found ${expectedTool} at position ${actualIndex}`,
12843
+ passed: true
12844
+ });
12914
12845
  sequenceHits++;
12915
12846
  matchedCall = actualCall;
12916
12847
  actualIndex++;
12917
12848
  found = true;
12918
12849
  break;
12919
12850
  }
12920
- misses.push(
12921
- `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
12922
- );
12851
+ assertions.push({
12852
+ text: `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`,
12853
+ passed: false
12854
+ });
12923
12855
  actualIndex++;
12924
12856
  argsMismatch = true;
12925
12857
  break;
@@ -12927,7 +12859,10 @@ var ToolTrajectoryEvaluator = class {
12927
12859
  actualIndex++;
12928
12860
  }
12929
12861
  if (!found && !argsMismatch) {
12930
- misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
12862
+ assertions.push({
12863
+ text: `Expected ${expectedTool} at position ${i}, not found in remaining trace`,
12864
+ passed: false
12865
+ });
12931
12866
  }
12932
12867
  if (found && matchedCall) {
12933
12868
  const latencyResult = checkLatency(
@@ -12936,10 +12871,10 @@ var ToolTrajectoryEvaluator = class {
12936
12871
  matchedCall.durationMs
12937
12872
  );
12938
12873
  if (latencyResult.status === "pass") {
12939
- hits.push(latencyResult.message);
12874
+ assertions.push({ text: latencyResult.message, passed: true });
12940
12875
  latencyHits++;
12941
12876
  } else if (latencyResult.status === "fail") {
12942
- misses.push(latencyResult.message);
12877
+ assertions.push({ text: latencyResult.message, passed: false });
12943
12878
  } else if (latencyResult.message) {
12944
12879
  warnings.push(latencyResult.message);
12945
12880
  latencySkips++;
@@ -12955,8 +12890,7 @@ var ToolTrajectoryEvaluator = class {
12955
12890
  return {
12956
12891
  score,
12957
12892
  verdict: scoreToVerdict(score),
12958
- hits,
12959
- misses,
12893
+ assertions,
12960
12894
  expectedAspectCount: totalAssertions
12961
12895
  };
12962
12896
  }
@@ -12966,13 +12900,11 @@ var ToolTrajectoryEvaluator = class {
12966
12900
  return {
12967
12901
  score: 1,
12968
12902
  verdict: "pass",
12969
- hits: ["No tool sequence specified"],
12970
- misses: [],
12903
+ assertions: [{ text: "No tool sequence specified", passed: true }],
12971
12904
  expectedAspectCount: 0
12972
12905
  };
12973
12906
  }
12974
- const hits = [];
12975
- const misses = [];
12907
+ const assertions = [];
12976
12908
  const warnings = [];
12977
12909
  let sequenceHits = 0;
12978
12910
  let latencyHits = 0;
@@ -12981,7 +12913,10 @@ var ToolTrajectoryEvaluator = class {
12981
12913
  (item) => item.maxDurationMs !== void 0
12982
12914
  ).length;
12983
12915
  if (toolCalls.length !== expected.length) {
12984
- misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
12916
+ assertions.push({
12917
+ text: `Expected ${expected.length} tool calls, got ${toolCalls.length}`,
12918
+ passed: false
12919
+ });
12985
12920
  }
12986
12921
  const checkLength = Math.min(expected.length, toolCalls.length);
12987
12922
  for (let i = 0; i < checkLength; i++) {
@@ -12993,14 +12928,17 @@ var ToolTrajectoryEvaluator = class {
12993
12928
  let sequenceMatched = false;
12994
12929
  if (actualTool === expectedTool) {
12995
12930
  if (argsMatch(expectedItem.args, actualCall.args, mode)) {
12996
- hits.push(`Position ${i}: ${expectedTool}`);
12931
+ assertions.push({ text: `Position ${i}: ${expectedTool}`, passed: true });
12997
12932
  sequenceHits++;
12998
12933
  sequenceMatched = true;
12999
12934
  } else {
13000
- misses.push(`Position ${i}: ${expectedTool} args mismatch`);
12935
+ assertions.push({ text: `Position ${i}: ${expectedTool} args mismatch`, passed: false });
13001
12936
  }
13002
12937
  } else {
13003
- misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
12938
+ assertions.push({
12939
+ text: `Position ${i}: expected ${expectedTool}, got ${actualTool}`,
12940
+ passed: false
12941
+ });
13004
12942
  }
13005
12943
  if (sequenceMatched) {
13006
12944
  const latencyResult = checkLatency(
@@ -13009,10 +12947,10 @@ var ToolTrajectoryEvaluator = class {
13009
12947
  actualCall.durationMs
13010
12948
  );
13011
12949
  if (latencyResult.status === "pass") {
13012
- hits.push(latencyResult.message);
12950
+ assertions.push({ text: latencyResult.message, passed: true });
13013
12951
  latencyHits++;
13014
12952
  } else if (latencyResult.status === "fail") {
13015
- misses.push(latencyResult.message);
12953
+ assertions.push({ text: latencyResult.message, passed: false });
13016
12954
  } else if (latencyResult.message) {
13017
12955
  warnings.push(latencyResult.message);
13018
12956
  latencySkips++;
@@ -13020,7 +12958,10 @@ var ToolTrajectoryEvaluator = class {
13020
12958
  }
13021
12959
  }
13022
12960
  for (let i = checkLength; i < expected.length; i++) {
13023
- misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
12961
+ assertions.push({
12962
+ text: `Position ${i}: expected ${expected[i].tool}, got nothing`,
12963
+ passed: false
12964
+ });
13024
12965
  }
13025
12966
  for (const warning of warnings) {
13026
12967
  console.warn(`[tool-trajectory] ${warning}`);
@@ -13031,8 +12972,7 @@ var ToolTrajectoryEvaluator = class {
13031
12972
  return {
13032
12973
  score,
13033
12974
  verdict: scoreToVerdict(score),
13034
- hits,
13035
- misses,
12975
+ assertions,
13036
12976
  expectedAspectCount: totalAssertions
13037
12977
  };
13038
12978
  }
@@ -13047,13 +12987,11 @@ var ToolTrajectoryEvaluator = class {
13047
12987
  return {
13048
12988
  score: 1,
13049
12989
  verdict: "pass",
13050
- hits: ["No expected tools specified"],
13051
- misses: [],
12990
+ assertions: [{ text: "No expected tools specified", passed: true }],
13052
12991
  expectedAspectCount: 0
13053
12992
  };
13054
12993
  }
13055
- const hits = [];
13056
- const misses = [];
12994
+ const assertions = [];
13057
12995
  const consumed = /* @__PURE__ */ new Set();
13058
12996
  for (let i = 0; i < expected.length; i++) {
13059
12997
  const expectedItem = expected[i];
@@ -13064,22 +13002,25 @@ var ToolTrajectoryEvaluator = class {
13064
13002
  if (consumed.has(j)) continue;
13065
13003
  const actualCall = toolCalls[j];
13066
13004
  if (actualCall.name === expectedTool && argsMatch(expectedItem.args, actualCall.args, mode)) {
13067
- hits.push(`Found ${expectedTool} at position ${j}`);
13005
+ assertions.push({ text: `Found ${expectedTool} at position ${j}`, passed: true });
13068
13006
  consumed.add(j);
13069
13007
  found = true;
13070
13008
  break;
13071
13009
  }
13072
13010
  }
13073
13011
  if (!found) {
13074
- misses.push(`Expected ${expectedTool} not found in actual trajectory`);
13012
+ assertions.push({
13013
+ text: `Expected ${expectedTool} not found in actual trajectory`,
13014
+ passed: false
13015
+ });
13075
13016
  }
13076
13017
  }
13077
- const score = expected.length > 0 ? hits.length / expected.length : 1;
13018
+ const passedCount = assertions.filter((a) => a.passed).length;
13019
+ const score = expected.length > 0 ? passedCount / expected.length : 1;
13078
13020
  return {
13079
13021
  score,
13080
13022
  verdict: scoreToVerdict(score),
13081
- hits,
13082
- misses,
13023
+ assertions,
13083
13024
  expectedAspectCount: expected.length
13084
13025
  };
13085
13026
  }
@@ -13095,16 +13036,19 @@ var ToolTrajectoryEvaluator = class {
13095
13036
  return {
13096
13037
  score: 1,
13097
13038
  verdict: "pass",
13098
- hits: ["No tool calls and no expected tools"],
13099
- misses: [],
13039
+ assertions: [{ text: "No tool calls and no expected tools", passed: true }],
13100
13040
  expectedAspectCount: 0
13101
13041
  };
13102
13042
  }
13103
13043
  return {
13104
13044
  score: 0,
13105
13045
  verdict: "fail",
13106
- hits: [],
13107
- misses: [`${toolCalls.length} unexpected tool call(s) with empty allowed list`],
13046
+ assertions: [
13047
+ {
13048
+ text: `${toolCalls.length} unexpected tool call(s) with empty allowed list`,
13049
+ passed: false
13050
+ }
13051
+ ],
13108
13052
  expectedAspectCount: toolCalls.length
13109
13053
  };
13110
13054
  }
@@ -13112,13 +13056,11 @@ var ToolTrajectoryEvaluator = class {
13112
13056
  return {
13113
13057
  score: 1,
13114
13058
  verdict: "pass",
13115
- hits: ["No actual tool calls (trivially a subset)"],
13116
- misses: [],
13059
+ assertions: [{ text: "No actual tool calls (trivially a subset)", passed: true }],
13117
13060
  expectedAspectCount: 0
13118
13061
  };
13119
13062
  }
13120
- const hits = [];
13121
- const misses = [];
13063
+ const assertions = [];
13122
13064
  for (let i = 0; i < toolCalls.length; i++) {
13123
13065
  const actualCall = toolCalls[i];
13124
13066
  let allowed = false;
@@ -13130,17 +13072,23 @@ var ToolTrajectoryEvaluator = class {
13130
13072
  }
13131
13073
  }
13132
13074
  if (allowed) {
13133
- hits.push(`Position ${i}: ${actualCall.name} is in allowed set`);
13075
+ assertions.push({
13076
+ text: `Position ${i}: ${actualCall.name} is in allowed set`,
13077
+ passed: true
13078
+ });
13134
13079
  } else {
13135
- misses.push(`Position ${i}: ${actualCall.name} is not in allowed set`);
13080
+ assertions.push({
13081
+ text: `Position ${i}: ${actualCall.name} is not in allowed set`,
13082
+ passed: false
13083
+ });
13136
13084
  }
13137
13085
  }
13138
- const score = toolCalls.length > 0 ? hits.length / toolCalls.length : 1;
13086
+ const passedCount = assertions.filter((a) => a.passed).length;
13087
+ const score = toolCalls.length > 0 ? passedCount / toolCalls.length : 1;
13139
13088
  return {
13140
13089
  score,
13141
13090
  verdict: scoreToVerdict(score),
13142
- hits,
13143
- misses,
13091
+ assertions,
13144
13092
  expectedAspectCount: toolCalls.length
13145
13093
  };
13146
13094
  }
@@ -13151,8 +13099,12 @@ function runContainsAssertion(output, value) {
13151
13099
  const passed = output.includes(value);
13152
13100
  return {
13153
13101
  score: passed ? 1 : 0,
13154
- hits: passed ? [`Output contains "${value}"`] : [],
13155
- misses: passed ? [] : [`Output does not contain "${value}"`]
13102
+ assertions: [
13103
+ {
13104
+ text: passed ? `Output contains "${value}"` : `Output does not contain "${value}"`,
13105
+ passed
13106
+ }
13107
+ ]
13156
13108
  };
13157
13109
  }
13158
13110
  function runContainsAnyAssertion(output, values) {
@@ -13160,8 +13112,12 @@ function runContainsAnyAssertion(output, values) {
13160
13112
  const passed = matched.length > 0;
13161
13113
  return {
13162
13114
  score: passed ? 1 : 0,
13163
- hits: passed ? [`Output contains "${matched[0]}"`] : [],
13164
- misses: passed ? [] : [`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`]
13115
+ assertions: [
13116
+ {
13117
+ text: passed ? `Output contains "${matched[0]}"` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`,
13118
+ passed
13119
+ }
13120
+ ]
13165
13121
  };
13166
13122
  }
13167
13123
  function runContainsAllAssertion(output, values) {
@@ -13169,16 +13125,24 @@ function runContainsAllAssertion(output, values) {
13169
13125
  const passed = missing.length === 0;
13170
13126
  return {
13171
13127
  score: passed ? 1 : 0,
13172
- hits: passed ? [`Output contains all ${values.length} expected strings`] : [],
13173
- misses: passed ? [] : [`Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`]
13128
+ assertions: [
13129
+ {
13130
+ text: passed ? `Output contains all ${values.length} expected strings` : `Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`,
13131
+ passed
13132
+ }
13133
+ ]
13174
13134
  };
13175
13135
  }
13176
13136
  function runIcontainsAssertion(output, value) {
13177
13137
  const passed = output.toLowerCase().includes(value.toLowerCase());
13178
13138
  return {
13179
13139
  score: passed ? 1 : 0,
13180
- hits: passed ? [`Output contains "${value}" (case-insensitive)`] : [],
13181
- misses: passed ? [] : [`Output does not contain "${value}" (case-insensitive)`]
13140
+ assertions: [
13141
+ {
13142
+ text: passed ? `Output contains "${value}" (case-insensitive)` : `Output does not contain "${value}" (case-insensitive)`,
13143
+ passed
13144
+ }
13145
+ ]
13182
13146
  };
13183
13147
  }
13184
13148
  function runIcontainsAnyAssertion(output, values) {
@@ -13187,9 +13151,11 @@ function runIcontainsAnyAssertion(output, values) {
13187
13151
  const passed = matched.length > 0;
13188
13152
  return {
13189
13153
  score: passed ? 1 : 0,
13190
- hits: passed ? [`Output contains "${matched[0]}" (case-insensitive)`] : [],
13191
- misses: passed ? [] : [
13192
- `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`
13154
+ assertions: [
13155
+ {
13156
+ text: passed ? `Output contains "${matched[0]}" (case-insensitive)` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`,
13157
+ passed
13158
+ }
13193
13159
  ]
13194
13160
  };
13195
13161
  }
@@ -13199,24 +13165,36 @@ function runIcontainsAllAssertion(output, values) {
13199
13165
  const passed = missing.length === 0;
13200
13166
  return {
13201
13167
  score: passed ? 1 : 0,
13202
- hits: passed ? [`Output contains all ${values.length} expected strings (case-insensitive)`] : [],
13203
- misses: passed ? [] : [`Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`]
13168
+ assertions: [
13169
+ {
13170
+ text: passed ? `Output contains all ${values.length} expected strings (case-insensitive)` : `Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`,
13171
+ passed
13172
+ }
13173
+ ]
13204
13174
  };
13205
13175
  }
13206
13176
  function runStartsWithAssertion(output, value) {
13207
13177
  const passed = output.trim().startsWith(value.trim());
13208
13178
  return {
13209
13179
  score: passed ? 1 : 0,
13210
- hits: passed ? [`Output starts with "${value}"`] : [],
13211
- misses: passed ? [] : [`Output does not start with "${value}"`]
13180
+ assertions: [
13181
+ {
13182
+ text: passed ? `Output starts with "${value}"` : `Output does not start with "${value}"`,
13183
+ passed
13184
+ }
13185
+ ]
13212
13186
  };
13213
13187
  }
13214
13188
  function runEndsWithAssertion(output, value) {
13215
13189
  const passed = output.trim().endsWith(value.trim());
13216
13190
  return {
13217
13191
  score: passed ? 1 : 0,
13218
- hits: passed ? [`Output ends with "${value}"`] : [],
13219
- misses: passed ? [] : [`Output does not end with "${value}"`]
13192
+ assertions: [
13193
+ {
13194
+ text: passed ? `Output ends with "${value}"` : `Output does not end with "${value}"`,
13195
+ passed
13196
+ }
13197
+ ]
13220
13198
  };
13221
13199
  }
13222
13200
  function runRegexAssertion(output, pattern, flags) {
@@ -13225,8 +13203,12 @@ function runRegexAssertion(output, pattern, flags) {
13225
13203
  const flagsLabel = flags ? ` (flags: ${flags})` : "";
13226
13204
  return {
13227
13205
  score: passed ? 1 : 0,
13228
- hits: passed ? [`Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}`] : [],
13229
- misses: passed ? [] : [`Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`]
13206
+ assertions: [
13207
+ {
13208
+ text: passed ? `Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}` : `Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`,
13209
+ passed
13210
+ }
13211
+ ]
13230
13212
  };
13231
13213
  }
13232
13214
  function runIsJsonAssertion(output) {
@@ -13238,16 +13220,24 @@ function runIsJsonAssertion(output) {
13238
13220
  }
13239
13221
  return {
13240
13222
  score: passed ? 1 : 0,
13241
- hits: passed ? ["Output is valid JSON"] : [],
13242
- misses: passed ? [] : ["Output is not valid JSON"]
13223
+ assertions: [
13224
+ {
13225
+ text: passed ? "Output is valid JSON" : "Output is not valid JSON",
13226
+ passed
13227
+ }
13228
+ ]
13243
13229
  };
13244
13230
  }
13245
13231
  function runEqualsAssertion(output, value) {
13246
13232
  const passed = output.trim() === value.trim();
13247
13233
  return {
13248
13234
  score: passed ? 1 : 0,
13249
- hits: passed ? [`Output equals "${value}"`] : [],
13250
- misses: passed ? [] : [`Output does not equal "${value}"`]
13235
+ assertions: [
13236
+ {
13237
+ text: passed ? `Output equals "${value}"` : `Output does not equal "${value}"`,
13238
+ passed
13239
+ }
13240
+ ]
13251
13241
  };
13252
13242
  }
13253
13243
 
@@ -13460,10 +13450,8 @@ var InlineAssertEvaluator = class {
13460
13450
  return {
13461
13451
  score,
13462
13452
  verdict: scoreToVerdict(score),
13463
- hits: score >= 0.8 ? [result.name] : [],
13464
- misses: score < 0.5 ? [result.name] : [],
13453
+ assertions: [{ text: result.name, passed: score >= 0.5 }],
13465
13454
  expectedAspectCount: 1,
13466
- reasoning: void 0,
13467
13455
  details: result.metadata ? result.metadata : void 0
13468
13456
  };
13469
13457
  }
@@ -13656,9 +13644,7 @@ var containsFactory = (config) => {
13656
13644
  return {
13657
13645
  score: result.score,
13658
13646
  verdict: result.score === 1 ? "pass" : "fail",
13659
- hits: result.hits,
13660
- misses: result.misses,
13661
- reasoning: result.score === 1 ? `Output contains "${c.value}"` : `Output does not contain "${c.value}"`,
13647
+ assertions: result.assertions,
13662
13648
  expectedAspectCount: 1
13663
13649
  };
13664
13650
  });
@@ -13670,9 +13656,7 @@ var regexFactory = (config) => {
13670
13656
  return {
13671
13657
  score: result.score,
13672
13658
  verdict: result.score === 1 ? "pass" : "fail",
13673
- hits: result.hits,
13674
- misses: result.misses,
13675
- reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
13659
+ assertions: result.assertions,
13676
13660
  expectedAspectCount: 1
13677
13661
  };
13678
13662
  });
@@ -13683,9 +13667,7 @@ var isJsonFactory = () => {
13683
13667
  return {
13684
13668
  score: result.score,
13685
13669
  verdict: result.score === 1 ? "pass" : "fail",
13686
- hits: result.hits,
13687
- misses: result.misses,
13688
- reasoning: result.score === 1 ? "Output is valid JSON" : "Output is not valid JSON",
13670
+ assertions: result.assertions,
13689
13671
  expectedAspectCount: 1
13690
13672
  };
13691
13673
  });
@@ -13697,9 +13679,7 @@ var equalsFactory = (config) => {
13697
13679
  return {
13698
13680
  score: result.score,
13699
13681
  verdict: result.score === 1 ? "pass" : "fail",
13700
- hits: result.hits,
13701
- misses: result.misses,
13702
- reasoning: result.score === 1 ? `Output equals "${c.value}"` : `Output does not equal "${c.value}"`,
13682
+ assertions: result.assertions,
13703
13683
  expectedAspectCount: 1
13704
13684
  };
13705
13685
  });
@@ -13711,9 +13691,7 @@ var containsAnyFactory = (config) => {
13711
13691
  return {
13712
13692
  score: result.score,
13713
13693
  verdict: result.score === 1 ? "pass" : "fail",
13714
- hits: result.hits,
13715
- misses: result.misses,
13716
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
13694
+ assertions: result.assertions,
13717
13695
  expectedAspectCount: 1
13718
13696
  };
13719
13697
  });
@@ -13725,9 +13703,7 @@ var containsAllFactory = (config) => {
13725
13703
  return {
13726
13704
  score: result.score,
13727
13705
  verdict: result.score === 1 ? "pass" : "fail",
13728
- hits: result.hits,
13729
- misses: result.misses,
13730
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
13706
+ assertions: result.assertions,
13731
13707
  expectedAspectCount: 1
13732
13708
  };
13733
13709
  });
@@ -13739,9 +13715,7 @@ var icontainsFactory = (config) => {
13739
13715
  return {
13740
13716
  score: result.score,
13741
13717
  verdict: result.score === 1 ? "pass" : "fail",
13742
- hits: result.hits,
13743
- misses: result.misses,
13744
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
13718
+ assertions: result.assertions,
13745
13719
  expectedAspectCount: 1
13746
13720
  };
13747
13721
  });
@@ -13753,9 +13727,7 @@ var icontainsAnyFactory = (config) => {
13753
13727
  return {
13754
13728
  score: result.score,
13755
13729
  verdict: result.score === 1 ? "pass" : "fail",
13756
- hits: result.hits,
13757
- misses: result.misses,
13758
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
13730
+ assertions: result.assertions,
13759
13731
  expectedAspectCount: 1
13760
13732
  };
13761
13733
  });
@@ -13767,9 +13739,7 @@ var icontainsAllFactory = (config) => {
13767
13739
  return {
13768
13740
  score: result.score,
13769
13741
  verdict: result.score === 1 ? "pass" : "fail",
13770
- hits: result.hits,
13771
- misses: result.misses,
13772
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
13742
+ assertions: result.assertions,
13773
13743
  expectedAspectCount: 1
13774
13744
  };
13775
13745
  });
@@ -13781,9 +13751,7 @@ var startsWithFactory = (config) => {
13781
13751
  return {
13782
13752
  score: result.score,
13783
13753
  verdict: result.score === 1 ? "pass" : "fail",
13784
- hits: result.hits,
13785
- misses: result.misses,
13786
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
13754
+ assertions: result.assertions,
13787
13755
  expectedAspectCount: 1
13788
13756
  };
13789
13757
  });
@@ -13795,9 +13763,7 @@ var endsWithFactory = (config) => {
13795
13763
  return {
13796
13764
  score: result.score,
13797
13765
  verdict: result.score === 1 ? "pass" : "fail",
13798
- hits: result.hits,
13799
- misses: result.misses,
13800
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
13766
+ assertions: result.assertions,
13801
13767
  expectedAspectCount: 1
13802
13768
  };
13803
13769
  });
@@ -14868,7 +14834,7 @@ async function runEvaluation(options) {
14868
14834
  if (!cliModel) {
14869
14835
  throw new Error('--grader-target "agentv" requires --model (e.g., "openai:gpt-5-mini")');
14870
14836
  }
14871
- const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-HDSAUUEF.js");
14837
+ const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-NFFLXG5M.js");
14872
14838
  return new AgentvProvider2("agentv", { model: cliModel, temperature: 0 });
14873
14839
  }
14874
14840
  const overrideTarget = resolveTargetByName(cliGraderTarget);
@@ -15203,8 +15169,7 @@ async function runEvaluation(options) {
15203
15169
  testId: evalCase.id,
15204
15170
  dataset: evalCase.dataset,
15205
15171
  score: 0,
15206
- hits: [],
15207
- misses: [],
15172
+ assertions: [],
15208
15173
  answer: "",
15209
15174
  target: target.name,
15210
15175
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
@@ -15240,8 +15205,7 @@ async function runEvaluation(options) {
15240
15205
  testId: evalCase.id,
15241
15206
  dataset: evalCase.dataset,
15242
15207
  score: 0,
15243
- hits: [],
15244
- misses: [],
15208
+ assertions: [],
15245
15209
  answer: "",
15246
15210
  target: target.name,
15247
15211
  error: errorMsg,
@@ -16208,11 +16172,9 @@ async function evaluateCandidate(options) {
16208
16172
  dataset: evalCase.dataset,
16209
16173
  conversationId: evalCase.conversation_id,
16210
16174
  score: score.score,
16211
- hits: score.hits,
16212
- misses: score.misses,
16175
+ assertions: score.assertions,
16213
16176
  answer: candidate,
16214
16177
  target: target.name,
16215
- reasoning: score.reasoning,
16216
16178
  tokenUsage,
16217
16179
  costUsd,
16218
16180
  durationMs,
@@ -16386,9 +16348,7 @@ async function runEvaluatorList(options) {
16386
16348
  score: score2.score,
16387
16349
  weight,
16388
16350
  verdict: score2.verdict,
16389
- hits: score2.hits,
16390
- misses: score2.misses,
16391
- reasoning: score2.reasoning,
16351
+ assertions: score2.assertions,
16392
16352
  evaluatorProviderRequest: score2.evaluatorRawRequest,
16393
16353
  details: score2.details,
16394
16354
  scores: mapChildResults(score2.scores),
@@ -16403,10 +16363,10 @@ async function runEvaluatorList(options) {
16403
16363
  const fallbackScore = {
16404
16364
  score: 0,
16405
16365
  verdict: "fail",
16406
- hits: [],
16407
- misses: [`Evaluator '${evaluatorConfig.name}' failed: ${message}`],
16408
- expectedAspectCount: 1,
16409
- reasoning: message
16366
+ assertions: [
16367
+ { text: `Evaluator '${evaluatorConfig.name}' failed: ${message}`, passed: false }
16368
+ ],
16369
+ expectedAspectCount: 1
16410
16370
  };
16411
16371
  const weight = evaluatorConfig.weight ?? 1;
16412
16372
  scored.push({
@@ -16422,9 +16382,12 @@ async function runEvaluatorList(options) {
16422
16382
  score: 0,
16423
16383
  weight,
16424
16384
  verdict: "fail",
16425
- hits: [],
16426
- misses: [`Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`],
16427
- reasoning: message,
16385
+ assertions: [
16386
+ {
16387
+ text: `Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
16388
+ passed: false
16389
+ }
16390
+ ],
16428
16391
  durationMs: endedAt.getTime() - startedAt.getTime(),
16429
16392
  startedAt: startedAt.toISOString(),
16430
16393
  endedAt: endedAt.toISOString()
@@ -16440,9 +16403,7 @@ async function runEvaluatorList(options) {
16440
16403
  ...scores[lastScoresIdx],
16441
16404
  score: negated.score,
16442
16405
  verdict: negated.verdict,
16443
- hits: [...negated.hits],
16444
- misses: [...negated.misses],
16445
- reasoning: negated.reasoning
16406
+ assertions: [...negated.assertions]
16446
16407
  };
16447
16408
  }
16448
16409
  }
@@ -16457,21 +16418,13 @@ async function runEvaluatorList(options) {
16457
16418
  const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
16458
16419
  scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
16459
16420
  ) : 0;
16460
- const hits = scored.flatMap((entry) => entry.score.hits);
16461
- const misses = scored.flatMap((entry) => entry.score.misses);
16462
- const expectedAspectCount = scored.reduce(
16463
- (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
16464
- 0
16465
- );
16466
- const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
16467
- const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
16421
+ const assertions = scored.flatMap((entry) => entry.score.assertions);
16422
+ const expectedAspectCount = assertions.length || 1;
16468
16423
  const score = {
16469
16424
  score: aggregateScore,
16470
16425
  verdict: scoreToVerdict(aggregateScore),
16471
- hits,
16472
- misses,
16473
- expectedAspectCount,
16474
- reasoning
16426
+ assertions,
16427
+ expectedAspectCount
16475
16428
  };
16476
16429
  return { score, scores };
16477
16430
  }
@@ -16575,8 +16528,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
16575
16528
  dataset: evalCase.dataset,
16576
16529
  conversationId: evalCase.conversation_id,
16577
16530
  score: 0,
16578
- hits: [],
16579
- misses: [`Error: ${message}`],
16531
+ assertions: [{ text: `Error: ${message}`, passed: false }],
16580
16532
  answer: `Error occurred: ${message}`,
16581
16533
  target: targetName,
16582
16534
  requests,
@@ -16686,9 +16638,7 @@ function mapChildResults(children) {
16686
16638
  score: child.score,
16687
16639
  weight: child.weight,
16688
16640
  verdict: child.verdict,
16689
- hits: child.hits,
16690
- misses: child.misses,
16691
- reasoning: child.reasoning,
16641
+ assertions: child.assertions,
16692
16642
  evaluatorProviderRequest: child.evaluatorRawRequest,
16693
16643
  scores: mapChildResults(child.scores),
16694
16644
  details: child.details,
@@ -17653,7 +17603,6 @@ export {
17653
17603
  freeformEvaluationSchema,
17654
17604
  generateRubrics,
17655
17605
  getAgentvHome,
17656
- getHitCount,
17657
17606
  getOutputFilenames,
17658
17607
  getSubagentsRoot,
17659
17608
  getTraceStateRoot,