@agentv/core 3.4.0 → 3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -55,7 +55,7 @@ function createLanguageModel(modelString) {
55
55
  case "anthropic":
56
56
  return (0, import_anthropic.createAnthropic)()(modelName);
57
57
  case "azure":
58
- return (0, import_azure.createAzure)()(modelName);
58
+ return (0, import_azure.createAzure)().chat(modelName);
59
59
  case "google":
60
60
  return (0, import_google.createGoogleGenerativeAI)()(modelName);
61
61
  default:
@@ -1580,7 +1580,6 @@ __export(index_exports, {
1580
1580
  freeformEvaluationSchema: () => freeformEvaluationSchema,
1581
1581
  generateRubrics: () => generateRubrics,
1582
1582
  getAgentvHome: () => getAgentvHome,
1583
- getHitCount: () => getHitCount,
1584
1583
  getOutputFilenames: () => getOutputFilenames,
1585
1584
  getSubagentsRoot: () => getSubagentsRoot,
1586
1585
  getTraceStateRoot: () => getTraceStateRoot,
@@ -1730,9 +1729,6 @@ var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
1730
1729
  function isEvaluatorKind(value) {
1731
1730
  return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
1732
1731
  }
1733
- function getHitCount(result) {
1734
- return result.hits.length;
1735
- }
1736
1732
 
1737
1733
  // src/evaluation/trace.ts
1738
1734
  function computeTraceSummary(messages) {
@@ -5576,7 +5572,7 @@ var AzureProvider = class {
5576
5572
  };
5577
5573
  this.retryConfig = config.retry;
5578
5574
  const azure = (0, import_azure2.createAzure)(buildAzureOptions(config));
5579
- this.model = azure(config.deploymentName);
5575
+ this.model = azure.chat(config.deploymentName);
5580
5576
  }
5581
5577
  id;
5582
5578
  kind = "azure";
@@ -12807,9 +12803,11 @@ function negateScore(score) {
12807
12803
  ...score,
12808
12804
  score: negatedScore,
12809
12805
  verdict: negatedVerdict,
12810
- reasoning: score.reasoning ? `[Negated] ${score.reasoning} (original score: ${score.score.toFixed(2)})` : `[Negated] Original score: ${score.score.toFixed(2)}`,
12811
- hits: score.misses,
12812
- misses: score.hits
12806
+ assertions: score.assertions.map((a) => ({
12807
+ ...a,
12808
+ passed: !a.passed,
12809
+ evidence: a.evidence ? `[Negated] ${a.evidence}` : void 0
12810
+ }))
12813
12811
  };
12814
12812
  }
12815
12813
 
@@ -13324,9 +13322,13 @@ var CodeEvaluator = class {
13324
13322
  );
13325
13323
  const parsed = parseJsonSafe(stdout);
13326
13324
  const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
13327
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
13328
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
13329
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
13325
+ const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
13326
+ (a) => typeof a === "object" && a !== null && typeof a.text === "string"
13327
+ ).map((a) => ({
13328
+ text: String(a.text),
13329
+ passed: Boolean(a.passed),
13330
+ ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
13331
+ })) : [];
13330
13332
  const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
13331
13333
  const proxyUsage = getProxyUsage?.();
13332
13334
  const evaluatorRawRequest = {
@@ -13342,10 +13344,8 @@ var CodeEvaluator = class {
13342
13344
  return {
13343
13345
  score,
13344
13346
  verdict: scoreToVerdict(score),
13345
- hits,
13346
- misses,
13347
- expectedAspectCount: hits.length + misses.length || 1,
13348
- reasoning,
13347
+ assertions,
13348
+ expectedAspectCount: assertions.length || 1,
13349
13349
  evaluatorRawRequest,
13350
13350
  ...details ? { details } : {},
13351
13351
  tokenUsage: proxyUsage?.tokenUsage
@@ -13356,10 +13356,8 @@ var CodeEvaluator = class {
13356
13356
  return {
13357
13357
  score: 0,
13358
13358
  verdict: "fail",
13359
- hits: [],
13360
- misses: [`Code evaluator failed: ${message}`],
13359
+ assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
13361
13360
  expectedAspectCount: 1,
13362
- reasoning: message,
13363
13361
  evaluatorRawRequest: {
13364
13362
  command: this.command,
13365
13363
  ...this.cwd ? { cwd: this.cwd } : {},
@@ -13499,9 +13497,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
13499
13497
  {{${TEMPLATE_VARIABLES.ANSWER}}}`;
13500
13498
  var freeformEvaluationSchema = import_zod4.z.object({
13501
13499
  score: import_zod4.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
13502
- hits: import_zod4.z.array(import_zod4.z.string()).describe("Brief specific achievements").optional(),
13503
- misses: import_zod4.z.array(import_zod4.z.string()).describe("Brief failures or omissions").optional(),
13504
- reasoning: import_zod4.z.string().describe("Concise explanation (1-2 sentences)").optional()
13500
+ assertions: import_zod4.z.array(
13501
+ import_zod4.z.object({
13502
+ text: import_zod4.z.string().describe("Brief description of what was checked"),
13503
+ passed: import_zod4.z.boolean().describe("Whether this aspect was satisfied"),
13504
+ evidence: import_zod4.z.string().describe("Concise evidence (1-2 sentences)").optional()
13505
+ })
13506
+ ).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
13505
13507
  });
13506
13508
  var rubricCheckResultSchema = import_zod4.z.object({
13507
13509
  id: import_zod4.z.string().describe("The ID of the rubric item being checked"),
@@ -13603,17 +13605,12 @@ ${context2.fileChanges}`;
13603
13605
  schema: freeformEvaluationSchema
13604
13606
  });
13605
13607
  const score = clampScore(data.score);
13606
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
13607
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
13608
- const reasoning = data.reasoning;
13609
- const expectedAspectCount = Math.max(hits.length + misses.length, 1);
13608
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
13610
13609
  return {
13611
13610
  score,
13612
13611
  verdict: scoreToVerdict(score),
13613
- hits,
13614
- misses,
13615
- expectedAspectCount,
13616
- reasoning,
13612
+ assertions,
13613
+ expectedAspectCount: Math.max(assertions.length, 1),
13617
13614
  evaluatorRawRequest,
13618
13615
  tokenUsage
13619
13616
  };
@@ -13624,10 +13621,8 @@ ${context2.fileChanges}`;
13624
13621
  return {
13625
13622
  score: 0,
13626
13623
  verdict: "skip",
13627
- hits: [],
13628
- misses: [`Grader parse failure after 3 attempts: ${message}`],
13624
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
13629
13625
  expectedAspectCount: 1,
13630
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
13631
13626
  evaluatorRawRequest
13632
13627
  };
13633
13628
  }
@@ -13657,14 +13652,12 @@ ${context2.fileChanges}`;
13657
13652
  userPrompt: prompt,
13658
13653
  schema: rubricEvaluationSchema
13659
13654
  });
13660
- const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
13655
+ const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
13661
13656
  return {
13662
13657
  score,
13663
13658
  verdict,
13664
- hits,
13665
- misses,
13659
+ assertions,
13666
13660
  expectedAspectCount: rubrics.length,
13667
- reasoning: data.overall_reasoning,
13668
13661
  evaluatorRawRequest,
13669
13662
  tokenUsage
13670
13663
  };
@@ -13675,10 +13668,8 @@ ${context2.fileChanges}`;
13675
13668
  return {
13676
13669
  score: 0,
13677
13670
  verdict: "skip",
13678
- hits: [],
13679
- misses: [`Grader parse failure after 3 attempts: ${message}`],
13671
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
13680
13672
  expectedAspectCount: rubrics.length,
13681
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
13682
13673
  evaluatorRawRequest
13683
13674
  };
13684
13675
  }
@@ -13703,14 +13694,12 @@ ${context2.fileChanges}`;
13703
13694
  userPrompt: prompt,
13704
13695
  schema: scoreRangeEvaluationSchema
13705
13696
  });
13706
- const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
13697
+ const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
13707
13698
  return {
13708
13699
  score,
13709
13700
  verdict,
13710
- hits,
13711
- misses,
13701
+ assertions,
13712
13702
  expectedAspectCount: rubrics.length,
13713
- reasoning: data.overall_reasoning,
13714
13703
  evaluatorRawRequest,
13715
13704
  details,
13716
13705
  tokenUsage
@@ -13722,10 +13711,8 @@ ${context2.fileChanges}`;
13722
13711
  return {
13723
13712
  score: 0,
13724
13713
  verdict: "skip",
13725
- hits: [],
13726
- misses: [`Grader parse failure after 3 attempts: ${message}`],
13714
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
13727
13715
  expectedAspectCount: rubrics.length,
13728
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
13729
13716
  evaluatorRawRequest
13730
13717
  };
13731
13718
  }
@@ -13782,8 +13769,7 @@ ${context2.fileChanges}`;
13782
13769
  return {
13783
13770
  score: 0,
13784
13771
  verdict: "fail",
13785
- hits: [],
13786
- misses: [`llm-grader built-in evaluation failed: ${message}`],
13772
+ assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
13787
13773
  expectedAspectCount: 1,
13788
13774
  evaluatorRawRequest,
13789
13775
  details: { mode: "built-in", error: message }
@@ -13833,8 +13819,9 @@ ${context2.fileChanges}`;
13833
13819
  return {
13834
13820
  score: 0,
13835
13821
  verdict: "fail",
13836
- hits: [],
13837
- misses: [`llm-grader ${modeLabel} returned no assistant response`],
13822
+ assertions: [
13823
+ { text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
13824
+ ],
13838
13825
  expectedAspectCount: 1,
13839
13826
  evaluatorRawRequest,
13840
13827
  details: { mode: modeLabel, grader_target: provider.targetName }
@@ -13852,8 +13839,9 @@ ${context2.fileChanges}`;
13852
13839
  return {
13853
13840
  score: 0,
13854
13841
  verdict: "fail",
13855
- hits: [],
13856
- misses: [`llm-grader ${modeLabel} evaluation failed: ${message}`],
13842
+ assertions: [
13843
+ { text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
13844
+ ],
13857
13845
  expectedAspectCount: 1,
13858
13846
  evaluatorRawRequest,
13859
13847
  details: {
@@ -14005,29 +13993,24 @@ ${outputSchema}`;
14005
13993
  const parsed = parseJsonFromText(text);
14006
13994
  if (rubrics && rubrics.length > 0) {
14007
13995
  const data2 = rubricEvaluationSchema.parse(parsed);
14008
- const { score: score2, verdict, hits: hits2, misses: misses2 } = calculateRubricScore(data2, rubrics);
13996
+ const { score: score2, verdict, assertions: assertions2 } = calculateRubricScore(data2, rubrics);
14009
13997
  return {
14010
13998
  score: score2,
14011
13999
  verdict,
14012
- hits: hits2,
14013
- misses: misses2,
14000
+ assertions: assertions2,
14014
14001
  expectedAspectCount: rubrics.length,
14015
- reasoning: data2.overall_reasoning,
14016
14002
  evaluatorRawRequest,
14017
14003
  details
14018
14004
  };
14019
14005
  }
14020
14006
  const data = freeformEvaluationSchema.parse(parsed);
14021
14007
  const score = clampScore(data.score);
14022
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
14023
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
14008
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
14024
14009
  return {
14025
14010
  score,
14026
14011
  verdict: scoreToVerdict(score),
14027
- hits,
14028
- misses,
14029
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
14030
- reasoning: data.reasoning,
14012
+ assertions,
14013
+ expectedAspectCount: Math.max(assertions.length, 1),
14031
14014
  evaluatorRawRequest,
14032
14015
  details
14033
14016
  };
@@ -14035,8 +14018,12 @@ ${outputSchema}`;
14035
14018
  return {
14036
14019
  score: 0,
14037
14020
  verdict: "fail",
14038
- hits: [],
14039
- misses: ["Failed to parse llm-grader agent response as valid evaluation JSON"],
14021
+ assertions: [
14022
+ {
14023
+ text: "Failed to parse llm-grader agent response as valid evaluation JSON",
14024
+ passed: false
14025
+ }
14026
+ ],
14040
14027
  expectedAspectCount: 1,
14041
14028
  evaluatorRawRequest,
14042
14029
  details
@@ -14165,9 +14152,13 @@ function buildOutputSchema() {
14165
14152
  "",
14166
14153
  "{",
14167
14154
  ' "score": <number between 0.0 and 1.0>,',
14168
- ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
14169
- ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
14170
- ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
14155
+ ' "assertions": [',
14156
+ " {",
14157
+ ' "text": "<brief description of what was checked>",',
14158
+ ' "passed": <boolean>,',
14159
+ ' "evidence": "<concise evidence, 1-2 sentences, optional>"',
14160
+ " }",
14161
+ " ]",
14171
14162
  "}"
14172
14163
  ].join("\n");
14173
14164
  }
@@ -14192,8 +14183,7 @@ function substituteVariables(template, variables) {
14192
14183
  }
14193
14184
  function calculateRubricScore(result, rubrics) {
14194
14185
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
14195
- const hits = [];
14196
- const misses = [];
14186
+ const assertions = [];
14197
14187
  let totalWeight = 0;
14198
14188
  let earnedWeight = 0;
14199
14189
  let failedRequired = false;
@@ -14203,19 +14193,20 @@ function calculateRubricScore(result, rubrics) {
14203
14193
  continue;
14204
14194
  }
14205
14195
  totalWeight += rubric.weight;
14196
+ assertions.push({
14197
+ text: `[${rubric.id}] ${rubric.outcome}`,
14198
+ passed: check.satisfied,
14199
+ evidence: check.reasoning
14200
+ });
14206
14201
  if (check.satisfied) {
14207
14202
  earnedWeight += rubric.weight;
14208
- hits.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
14209
- } else {
14210
- misses.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
14211
- if (rubric.required) {
14212
- failedRequired = true;
14213
- }
14203
+ } else if (rubric.required) {
14204
+ failedRequired = true;
14214
14205
  }
14215
14206
  }
14216
14207
  const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
14217
14208
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
14218
- return { score, verdict, hits, misses };
14209
+ return { score, verdict, assertions };
14219
14210
  }
14220
14211
  function buildScoreRangeOutputSchema() {
14221
14212
  return `You are an expert evaluator. Score the candidate answer on each criterion.
@@ -14235,8 +14226,7 @@ Important: The "score" must be an integer from 0 to 10 that falls within one of
14235
14226
  }
14236
14227
  function calculateScoreRangeResult(result, rubrics) {
14237
14228
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
14238
- const hits = [];
14239
- const misses = [];
14229
+ const assertions = [];
14240
14230
  const rawScores = {};
14241
14231
  let totalWeight = 0;
14242
14232
  let weightedScoreSum = 0;
@@ -14262,24 +14252,22 @@ function calculateScoreRangeResult(result, rubrics) {
14262
14252
  );
14263
14253
  const rangeDescription = matchingRange?.outcome ?? "";
14264
14254
  const criterionLabel = rubric.outcome ?? rubric.id;
14265
- const reasoningText = check.reasoning ? `: ${check.reasoning}` : "";
14266
- const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
14255
+ const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
14267
14256
  if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
14268
14257
  failedRequired = true;
14269
- misses.push(scoreInfo);
14270
- } else if (rawScore >= 7) {
14271
- hits.push(scoreInfo);
14272
- } else {
14273
- misses.push(scoreInfo);
14274
14258
  }
14259
+ assertions.push({
14260
+ text: `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})`,
14261
+ passed,
14262
+ evidence: check.reasoning
14263
+ });
14275
14264
  }
14276
14265
  const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
14277
14266
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
14278
14267
  return {
14279
14268
  score,
14280
14269
  verdict,
14281
- hits,
14282
- misses,
14270
+ assertions,
14283
14271
  details: {
14284
14272
  raw_scores: rawScores,
14285
14273
  normalization: "score / 10",
@@ -14455,9 +14443,7 @@ var CompositeEvaluator = class {
14455
14443
  let totalWeight = 0;
14456
14444
  let weightedSum = 0;
14457
14445
  let evaluatedCount = 0;
14458
- const allHits = [];
14459
- const allMisses = [];
14460
- const reasoningParts = [];
14446
+ const allAssertions = [];
14461
14447
  const scores = [];
14462
14448
  for (const member of results) {
14463
14449
  const weight = weights?.[member.id] ?? 1;
@@ -14467,9 +14453,7 @@ var CompositeEvaluator = class {
14467
14453
  score: member.result.score,
14468
14454
  weight,
14469
14455
  verdict: member.result.verdict,
14470
- hits: [...member.result.hits],
14471
- misses: [...member.result.misses],
14472
- reasoning: member.result.reasoning,
14456
+ assertions: [...member.result.assertions],
14473
14457
  evaluatorRawRequest: member.result.evaluatorRawRequest,
14474
14458
  scores: member.result.scores,
14475
14459
  details: member.result.details,
@@ -14481,20 +14465,16 @@ var CompositeEvaluator = class {
14481
14465
  evaluatedCount++;
14482
14466
  totalWeight += weight;
14483
14467
  weightedSum += member.result.score * weight;
14484
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
14485
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
14486
- if (member.result.reasoning) {
14487
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
14488
- }
14468
+ allAssertions.push(
14469
+ ...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
14470
+ );
14489
14471
  }
14490
14472
  if (evaluatedCount === 0 && results.length > 0) {
14491
14473
  return {
14492
14474
  score: 0,
14493
14475
  verdict: "skip",
14494
- hits: [],
14495
- misses: [],
14476
+ assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
14496
14477
  expectedAspectCount: 1,
14497
- reasoning: "All evaluators skipped (infrastructure failure)",
14498
14478
  evaluatorRawRequest: {
14499
14479
  aggregator: "weighted_average",
14500
14480
  ...weights ? { weights } : {}
@@ -14506,10 +14486,8 @@ var CompositeEvaluator = class {
14506
14486
  return {
14507
14487
  score: clampScore(finalScore),
14508
14488
  verdict: scoreToVerdict(finalScore),
14509
- hits: allHits,
14510
- misses: allMisses,
14511
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
14512
- reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
14489
+ assertions: allAssertions,
14490
+ expectedAspectCount: allAssertions.length || 1,
14513
14491
  evaluatorRawRequest: {
14514
14492
  aggregator: "weighted_average",
14515
14493
  ...weights ? { weights } : {}
@@ -14519,11 +14497,8 @@ var CompositeEvaluator = class {
14519
14497
  }
14520
14498
  runThreshold(results, threshold) {
14521
14499
  const scores = [];
14522
- const allHits = [];
14523
- const allMisses = [];
14524
- const reasoningParts = [];
14500
+ const allAssertions = [];
14525
14501
  let passingCount = 0;
14526
- let borderlineCount = 0;
14527
14502
  let evaluatedCount = 0;
14528
14503
  for (const member of results) {
14529
14504
  scores.push({
@@ -14531,9 +14506,7 @@ var CompositeEvaluator = class {
14531
14506
  type: member.type,
14532
14507
  score: member.result.score,
14533
14508
  verdict: member.result.verdict,
14534
- hits: [...member.result.hits],
14535
- misses: [...member.result.misses],
14536
- reasoning: member.result.reasoning,
14509
+ assertions: [...member.result.assertions],
14537
14510
  evaluatorRawRequest: member.result.evaluatorRawRequest,
14538
14511
  scores: member.result.scores,
14539
14512
  details: member.result.details,
@@ -14546,24 +14519,17 @@ var CompositeEvaluator = class {
14546
14519
  const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
14547
14520
  if (isPassing) {
14548
14521
  passingCount++;
14549
- if (member.result.verdict === "borderline") {
14550
- borderlineCount++;
14551
- }
14552
- }
14553
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
14554
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
14555
- if (member.result.reasoning) {
14556
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
14557
14522
  }
14523
+ allAssertions.push(
14524
+ ...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
14525
+ );
14558
14526
  }
14559
14527
  if (evaluatedCount === 0 && results.length > 0) {
14560
14528
  return {
14561
14529
  score: 0,
14562
14530
  verdict: "skip",
14563
- hits: [],
14564
- misses: [],
14531
+ assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
14565
14532
  expectedAspectCount: 1,
14566
- reasoning: "All evaluators skipped (infrastructure failure)",
14567
14533
  evaluatorRawRequest: {
14568
14534
  aggregator: "threshold",
14569
14535
  threshold
@@ -14574,19 +14540,15 @@ var CompositeEvaluator = class {
14574
14540
  const totalCount = evaluatedCount;
14575
14541
  const score = totalCount > 0 ? passingCount / totalCount : 0;
14576
14542
  const pass = score >= threshold;
14577
- if (pass && borderlineCount > 0) {
14578
- reasoningParts.push(`Warning: ${borderlineCount} borderline evaluator(s) counted as passing`);
14579
- }
14580
- reasoningParts.unshift(
14581
- `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`
14582
- );
14543
+ allAssertions.unshift({
14544
+ text: `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`,
14545
+ passed: pass
14546
+ });
14583
14547
  return {
14584
14548
  score: clampScore(score),
14585
14549
  verdict: pass ? "pass" : "fail",
14586
- hits: allHits,
14587
- misses: allMisses,
14588
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
14589
- reasoning: reasoningParts.join("; "),
14550
+ assertions: allAssertions,
14551
+ expectedAspectCount: allAssertions.length || 1,
14590
14552
  evaluatorRawRequest: {
14591
14553
  aggregator: "threshold",
14592
14554
  threshold
@@ -14603,9 +14565,7 @@ var CompositeEvaluator = class {
14603
14565
  score: member.result.score,
14604
14566
  weight: weights?.[member.id] ?? 1,
14605
14567
  verdict: member.result.verdict,
14606
- hits: [...member.result.hits],
14607
- misses: [...member.result.misses],
14608
- reasoning: member.result.reasoning,
14568
+ assertions: [...member.result.assertions],
14609
14569
  evaluatorRawRequest: member.result.evaluatorRawRequest,
14610
14570
  scores: member.result.scores,
14611
14571
  details: member.result.details
@@ -14614,17 +14574,19 @@ var CompositeEvaluator = class {
14614
14574
  const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
14615
14575
  const parsed = parseJsonSafe(stdout);
14616
14576
  const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
14617
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
14618
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
14619
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
14577
+ const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
14578
+ (a) => typeof a === "object" && a !== null && typeof a.text === "string"
14579
+ ).map((a) => ({
14580
+ text: String(a.text),
14581
+ passed: Boolean(a.passed),
14582
+ ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
14583
+ })) : [];
14620
14584
  const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
14621
14585
  return {
14622
14586
  score,
14623
14587
  verdict,
14624
- hits,
14625
- misses,
14626
- expectedAspectCount: hits.length + misses.length || 1,
14627
- reasoning,
14588
+ assertions,
14589
+ expectedAspectCount: assertions.length || 1,
14628
14590
  evaluatorRawRequest: {
14629
14591
  aggregator: "code-grader",
14630
14592
  script: scriptPath
@@ -14636,10 +14598,8 @@ var CompositeEvaluator = class {
14636
14598
  return {
14637
14599
  score: 0,
14638
14600
  verdict: "fail",
14639
- hits: [],
14640
- misses: [`Code aggregator failed: ${message}`],
14601
+ assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
14641
14602
  expectedAspectCount: 1,
14642
- reasoning: message,
14643
14603
  evaluatorRawRequest: {
14644
14604
  aggregator: "code-grader",
14645
14605
  script: scriptPath,
@@ -14661,9 +14621,7 @@ var CompositeEvaluator = class {
14661
14621
  type: member.type,
14662
14622
  score: member.result.score,
14663
14623
  verdict: member.result.verdict,
14664
- hits: [...member.result.hits],
14665
- misses: [...member.result.misses],
14666
- reasoning: member.result.reasoning,
14624
+ assertions: [...member.result.assertions],
14667
14625
  evaluatorRawRequest: member.result.evaluatorRawRequest,
14668
14626
  scores: member.result.scores,
14669
14627
  details: member.result.details
@@ -14687,16 +14645,12 @@ var CompositeEvaluator = class {
14687
14645
  });
14688
14646
  const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
14689
14647
  const score2 = clampScore(data2.score);
14690
- const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
14691
- const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
14692
- const reasoning2 = data2.reasoning;
14648
+ const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
14693
14649
  return {
14694
14650
  score: score2,
14695
14651
  verdict: scoreToVerdict(score2),
14696
- hits: hits2,
14697
- misses: misses2,
14698
- expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
14699
- reasoning: reasoning2,
14652
+ assertions: assertions2,
14653
+ expectedAspectCount: Math.max(assertions2.length, 1),
14700
14654
  evaluatorRawRequest,
14701
14655
  scores
14702
14656
  };
@@ -14711,16 +14665,12 @@ var CompositeEvaluator = class {
14711
14665
  parseJsonFromText(extractLastAssistantContent2(response.output))
14712
14666
  );
14713
14667
  const score = clampScore(data.score);
14714
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
14715
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
14716
- const reasoning = data.reasoning;
14668
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
14717
14669
  return {
14718
14670
  score,
14719
14671
  verdict: scoreToVerdict(score),
14720
- hits,
14721
- misses,
14722
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
14723
- reasoning,
14672
+ assertions,
14673
+ expectedAspectCount: Math.max(assertions.length, 1),
14724
14674
  evaluatorRawRequest,
14725
14675
  scores
14726
14676
  };
@@ -14728,8 +14678,7 @@ var CompositeEvaluator = class {
14728
14678
  return {
14729
14679
  score: 0,
14730
14680
  verdict: "fail",
14731
- hits: [],
14732
- misses: [],
14681
+ assertions: [{ text: "LLM aggregator failed", passed: false }],
14733
14682
  expectedAspectCount: 1,
14734
14683
  evaluatorRawRequest,
14735
14684
  scores
@@ -14752,10 +14701,8 @@ var CostEvaluator = class {
14752
14701
  return {
14753
14702
  score: 0,
14754
14703
  verdict: "fail",
14755
- hits: [],
14756
- misses: ["No cost data available in trace"],
14704
+ assertions: [{ text: "No cost data available in trace", passed: false }],
14757
14705
  expectedAspectCount: 1,
14758
- reasoning: "Execution cost not reported by provider",
14759
14706
  evaluatorRawRequest: {
14760
14707
  type: "cost",
14761
14708
  budget,
@@ -14769,10 +14716,10 @@ var CostEvaluator = class {
14769
14716
  return {
14770
14717
  score,
14771
14718
  verdict: passed ? "pass" : "fail",
14772
- hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
14773
- misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
14719
+ assertions: [
14720
+ passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
14721
+ ],
14774
14722
  expectedAspectCount: 1,
14775
- reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
14776
14723
  evaluatorRawRequest: {
14777
14724
  type: "cost",
14778
14725
  budget,
@@ -14805,10 +14752,8 @@ var ExecutionMetricsEvaluator = class {
14805
14752
  return {
14806
14753
  score: 0,
14807
14754
  verdict: "fail",
14808
- hits: [],
14809
- misses: ["No trace summary available"],
14755
+ assertions: [{ text: "No trace summary available", passed: false }],
14810
14756
  expectedAspectCount: 1,
14811
- reasoning: "Execution metrics not available - no trace summary provided",
14812
14757
  evaluatorRawRequest: {
14813
14758
  type: "execution-metrics",
14814
14759
  config: this.extractConfiguredThresholds(),
@@ -14817,116 +14762,114 @@ var ExecutionMetricsEvaluator = class {
14817
14762
  };
14818
14763
  }
14819
14764
  const narrowedTrace = trace2;
14820
- const hits = [];
14821
- const misses = [];
14765
+ const assertions = [];
14822
14766
  const actualMetrics = {};
14823
14767
  if (max_tool_calls !== void 0 && narrowedTrace) {
14824
14768
  const toolCalls = narrowedTrace.eventCount;
14825
14769
  actualMetrics.tool_calls = toolCalls;
14826
14770
  if (toolCalls <= max_tool_calls) {
14827
- hits.push(`Tool calls ${toolCalls} <= ${max_tool_calls} max`);
14771
+ assertions.push({ text: `Tool calls ${toolCalls} <= ${max_tool_calls} max`, passed: true });
14828
14772
  } else {
14829
- misses.push(`Tool calls ${toolCalls} > ${max_tool_calls} max`);
14773
+ assertions.push({ text: `Tool calls ${toolCalls} > ${max_tool_calls} max`, passed: false });
14830
14774
  }
14831
14775
  }
14832
14776
  if (max_llm_calls !== void 0 && narrowedTrace) {
14833
14777
  const llmCalls = narrowedTrace.llmCallCount;
14834
14778
  if (llmCalls === void 0) {
14835
- misses.push("LLM call count data not available");
14779
+ assertions.push({ text: "LLM call count data not available", passed: false });
14836
14780
  } else {
14837
14781
  actualMetrics.llm_calls = llmCalls;
14838
14782
  if (llmCalls <= max_llm_calls) {
14839
- hits.push(`LLM calls ${llmCalls} <= ${max_llm_calls} max`);
14783
+ assertions.push({ text: `LLM calls ${llmCalls} <= ${max_llm_calls} max`, passed: true });
14840
14784
  } else {
14841
- misses.push(`LLM calls ${llmCalls} > ${max_llm_calls} max`);
14785
+ assertions.push({ text: `LLM calls ${llmCalls} > ${max_llm_calls} max`, passed: false });
14842
14786
  }
14843
14787
  }
14844
14788
  }
14845
14789
  if (max_tokens !== void 0) {
14846
14790
  if (!tokenUsage) {
14847
- misses.push("Token usage data not available");
14791
+ assertions.push({ text: "Token usage data not available", passed: false });
14848
14792
  } else {
14849
14793
  const totalTokens = tokenUsage.input + tokenUsage.output;
14850
14794
  actualMetrics.tokens = totalTokens;
14851
14795
  if (totalTokens <= max_tokens) {
14852
- hits.push(`Total tokens ${totalTokens} <= ${max_tokens} max`);
14796
+ assertions.push({
14797
+ text: `Total tokens ${totalTokens} <= ${max_tokens} max`,
14798
+ passed: true
14799
+ });
14853
14800
  } else {
14854
- misses.push(`Total tokens ${totalTokens} > ${max_tokens} max`);
14801
+ assertions.push({
14802
+ text: `Total tokens ${totalTokens} > ${max_tokens} max`,
14803
+ passed: false
14804
+ });
14855
14805
  }
14856
14806
  }
14857
14807
  }
14858
14808
  if (max_cost_usd !== void 0) {
14859
14809
  if (costUsd === void 0) {
14860
- misses.push("Cost data not available");
14810
+ assertions.push({ text: "Cost data not available", passed: false });
14861
14811
  } else {
14862
14812
  actualMetrics.cost_usd = costUsd;
14863
14813
  const formatCost = (n) => `$${n.toFixed(4)}`;
14864
14814
  if (costUsd <= max_cost_usd) {
14865
- hits.push(`Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`);
14815
+ assertions.push({
14816
+ text: `Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`,
14817
+ passed: true
14818
+ });
14866
14819
  } else {
14867
- misses.push(`Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`);
14820
+ assertions.push({
14821
+ text: `Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`,
14822
+ passed: false
14823
+ });
14868
14824
  }
14869
14825
  }
14870
14826
  }
14871
14827
  if (max_duration_ms !== void 0) {
14872
14828
  if (durationMs === void 0) {
14873
- misses.push("Duration data not available");
14829
+ assertions.push({ text: "Duration data not available", passed: false });
14874
14830
  } else {
14875
14831
  actualMetrics.duration_ms = durationMs;
14876
14832
  if (durationMs <= max_duration_ms) {
14877
- hits.push(`Duration ${durationMs}ms <= ${max_duration_ms}ms max`);
14833
+ assertions.push({
14834
+ text: `Duration ${durationMs}ms <= ${max_duration_ms}ms max`,
14835
+ passed: true
14836
+ });
14878
14837
  } else {
14879
- misses.push(`Duration ${durationMs}ms > ${max_duration_ms}ms max`);
14838
+ assertions.push({
14839
+ text: `Duration ${durationMs}ms > ${max_duration_ms}ms max`,
14840
+ passed: false
14841
+ });
14880
14842
  }
14881
14843
  }
14882
14844
  }
14883
14845
  if (target_exploration_ratio !== void 0 && narrowedTrace) {
14884
14846
  const ratio = explorationRatio(narrowedTrace);
14885
14847
  if (ratio === void 0) {
14886
- misses.push("Exploration ratio not available (no tool calls)");
14848
+ assertions.push({ text: "Exploration ratio not available (no tool calls)", passed: false });
14887
14849
  } else {
14888
14850
  actualMetrics.exploration_ratio = ratio;
14889
14851
  const diff = Math.abs(ratio - target_exploration_ratio);
14890
14852
  if (diff <= exploration_tolerance) {
14891
- hits.push(
14892
- `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`
14893
- );
14853
+ assertions.push({
14854
+ text: `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`,
14855
+ passed: true
14856
+ });
14894
14857
  } else {
14895
- misses.push(
14896
- `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`
14897
- );
14858
+ assertions.push({
14859
+ text: `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`,
14860
+ passed: false
14861
+ });
14898
14862
  }
14899
14863
  }
14900
14864
  }
14901
- const totalChecks = hits.length + misses.length;
14902
- const score = totalChecks > 0 ? hits.length / totalChecks : 0;
14903
- const reasoningParts = [];
14904
- if (actualMetrics.tool_calls !== void 0) {
14905
- reasoningParts.push(`tool_calls=${actualMetrics.tool_calls}`);
14906
- }
14907
- if (actualMetrics.llm_calls !== void 0) {
14908
- reasoningParts.push(`llm_calls=${actualMetrics.llm_calls}`);
14909
- }
14910
- if (actualMetrics.tokens !== void 0) {
14911
- reasoningParts.push(`tokens=${actualMetrics.tokens}`);
14912
- }
14913
- if (actualMetrics.cost_usd !== void 0) {
14914
- reasoningParts.push(`cost=$${actualMetrics.cost_usd.toFixed(4)}`);
14915
- }
14916
- if (actualMetrics.duration_ms !== void 0) {
14917
- reasoningParts.push(`duration=${actualMetrics.duration_ms}ms`);
14918
- }
14919
- if (actualMetrics.exploration_ratio !== void 0) {
14920
- reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
14921
- }
14922
- const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
14865
+ const totalChecks = assertions.length;
14866
+ const passedCount = assertions.filter((a) => a.passed).length;
14867
+ const score = totalChecks > 0 ? passedCount / totalChecks : 0;
14923
14868
  return {
14924
14869
  score,
14925
14870
  verdict: scoreToVerdict(score),
14926
- hits,
14927
- misses,
14871
+ assertions,
14928
14872
  expectedAspectCount: totalChecks || 1,
14929
- reasoning,
14930
14873
  evaluatorRawRequest: {
14931
14874
  type: "execution-metrics",
14932
14875
  config: this.extractConfiguredThresholds(),
@@ -15030,10 +14973,8 @@ var FieldAccuracyEvaluator = class {
15030
14973
  return {
15031
14974
  score: 0,
15032
14975
  verdict: "fail",
15033
- hits: [],
15034
- misses: ["Failed to parse candidate answer as JSON"],
15035
- expectedAspectCount: this.config.fields.length,
15036
- reasoning: "Candidate answer is not valid JSON"
14976
+ assertions: [{ text: "Failed to parse candidate answer as JSON", passed: false }],
14977
+ expectedAspectCount: this.config.fields.length
15037
14978
  };
15038
14979
  }
15039
14980
  const expectedData = this.extractExpectedData(evalCase.expected_output);
@@ -15041,10 +14982,8 @@ var FieldAccuracyEvaluator = class {
15041
14982
  return {
15042
14983
  score: 0,
15043
14984
  verdict: "fail",
15044
- hits: [],
15045
- misses: ["No expected data found in expected_output"],
15046
- expectedAspectCount: this.config.fields.length,
15047
- reasoning: "Could not extract expected data from expected_output"
14985
+ assertions: [{ text: "No expected data found in expected_output", passed: false }],
14986
+ expectedAspectCount: this.config.fields.length
15048
14987
  };
15049
14988
  }
15050
14989
  const fieldResults = [];
@@ -15262,18 +15201,14 @@ var FieldAccuracyEvaluator = class {
15262
15201
  */
15263
15202
  aggregateResults(results) {
15264
15203
  const aggregation = this.config.aggregation ?? "weighted_average";
15265
- const hits = [];
15266
- const misses = [];
15204
+ const assertions = [];
15267
15205
  for (const result of results) {
15268
- if (result.hit) {
15269
- hits.push(result.message);
15270
- } else {
15271
- misses.push(result.message);
15272
- }
15206
+ assertions.push({ text: result.message, passed: result.hit });
15273
15207
  }
15274
15208
  let score;
15275
15209
  if (aggregation === "all_or_nothing") {
15276
- score = misses.length === 0 ? 1 : 0;
15210
+ const hasFailed = assertions.some((a) => !a.passed);
15211
+ score = hasFailed ? 0 : 1;
15277
15212
  } else {
15278
15213
  const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
15279
15214
  if (totalWeight === 0) {
@@ -15283,15 +15218,11 @@ var FieldAccuracyEvaluator = class {
15283
15218
  score = weightedSum / totalWeight;
15284
15219
  }
15285
15220
  }
15286
- const reasoning = `${hits.length}/${results.length} fields matched`;
15287
15221
  return {
15288
15222
  score: clampScore(score),
15289
15223
  verdict: scoreToVerdict(score),
15290
- hits: hits.slice(0, 4),
15291
- // Cap at 4 to keep output concise
15292
- misses: misses.slice(0, 4),
15293
- expectedAspectCount: results.length,
15294
- reasoning
15224
+ assertions,
15225
+ expectedAspectCount: results.length
15295
15226
  };
15296
15227
  }
15297
15228
  };
@@ -15400,10 +15331,8 @@ var LatencyEvaluator = class {
15400
15331
  return {
15401
15332
  score: 0,
15402
15333
  verdict: "fail",
15403
- hits: [],
15404
- misses: ["No duration data available in trace"],
15334
+ assertions: [{ text: "No duration data available in trace", passed: false }],
15405
15335
  expectedAspectCount: 1,
15406
- reasoning: "Execution duration not reported by provider",
15407
15336
  evaluatorRawRequest: {
15408
15337
  type: "latency",
15409
15338
  threshold,
@@ -15416,10 +15345,10 @@ var LatencyEvaluator = class {
15416
15345
  return {
15417
15346
  score,
15418
15347
  verdict: passed ? "pass" : "fail",
15419
- hits: passed ? [`Duration ${durationMs}ms <= ${threshold}ms threshold`] : [],
15420
- misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
15348
+ assertions: [
15349
+ passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
15350
+ ],
15421
15351
  expectedAspectCount: 1,
15422
- reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
15423
15352
  evaluatorRawRequest: {
15424
15353
  type: "latency",
15425
15354
  threshold,
@@ -15495,23 +15424,25 @@ var SkillTriggerEvaluator = class {
15495
15424
  return {
15496
15425
  score: 1,
15497
15426
  verdict: "pass",
15498
- hits: [
15499
- shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`
15427
+ assertions: [
15428
+ {
15429
+ text: shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`,
15430
+ passed: true
15431
+ }
15500
15432
  ],
15501
- misses: [],
15502
- expectedAspectCount: 1,
15503
- reasoning: shouldTrigger ? "Skill triggered correctly" : "No false trigger"
15433
+ expectedAspectCount: 1
15504
15434
  };
15505
15435
  }
15506
15436
  return {
15507
15437
  score: 0,
15508
15438
  verdict: "fail",
15509
- hits: [],
15510
- misses: [
15511
- shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`
15439
+ assertions: [
15440
+ {
15441
+ text: shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`,
15442
+ passed: false
15443
+ }
15512
15444
  ],
15513
- expectedAspectCount: 1,
15514
- reasoning: shouldTrigger ? `Skill "${skillName}" was not triggered` : "False trigger: skill fired when it should not have"
15445
+ expectedAspectCount: 1
15515
15446
  };
15516
15447
  }
15517
15448
  };
@@ -15680,10 +15611,8 @@ var TokenUsageEvaluator = class {
15680
15611
  return {
15681
15612
  score: 0,
15682
15613
  verdict: "fail",
15683
- hits: [],
15684
- misses: ["No token usage data available in trace"],
15614
+ assertions: [{ text: "No token usage data available in trace", passed: false }],
15685
15615
  expectedAspectCount,
15686
- reasoning: "Token usage not reported by provider",
15687
15616
  evaluatorRawRequest: {
15688
15617
  type: "token-usage",
15689
15618
  max_total: maxTotal ?? null,
@@ -15697,37 +15626,34 @@ var TokenUsageEvaluator = class {
15697
15626
  const output = usage.output;
15698
15627
  const cached = usage.cached ?? 0;
15699
15628
  const total = input + output + cached;
15700
- const hits = [];
15701
- const misses = [];
15629
+ const assertions = [];
15702
15630
  if (typeof maxInput === "number") {
15703
15631
  if (input <= maxInput) {
15704
- hits.push(`Input tokens ${input} <= ${maxInput}`);
15632
+ assertions.push({ text: `Input tokens ${input} <= ${maxInput}`, passed: true });
15705
15633
  } else {
15706
- misses.push(`Input tokens ${input} > ${maxInput}`);
15634
+ assertions.push({ text: `Input tokens ${input} > ${maxInput}`, passed: false });
15707
15635
  }
15708
15636
  }
15709
15637
  if (typeof maxOutput === "number") {
15710
15638
  if (output <= maxOutput) {
15711
- hits.push(`Output tokens ${output} <= ${maxOutput}`);
15639
+ assertions.push({ text: `Output tokens ${output} <= ${maxOutput}`, passed: true });
15712
15640
  } else {
15713
- misses.push(`Output tokens ${output} > ${maxOutput}`);
15641
+ assertions.push({ text: `Output tokens ${output} > ${maxOutput}`, passed: false });
15714
15642
  }
15715
15643
  }
15716
15644
  if (typeof maxTotal === "number") {
15717
15645
  if (total <= maxTotal) {
15718
- hits.push(`Total tokens ${total} <= ${maxTotal}`);
15646
+ assertions.push({ text: `Total tokens ${total} <= ${maxTotal}`, passed: true });
15719
15647
  } else {
15720
- misses.push(`Total tokens ${total} > ${maxTotal}`);
15648
+ assertions.push({ text: `Total tokens ${total} > ${maxTotal}`, passed: false });
15721
15649
  }
15722
15650
  }
15723
- const passed = misses.length === 0;
15651
+ const passed = assertions.every((a) => a.passed);
15724
15652
  return {
15725
15653
  score: passed ? 1 : 0,
15726
15654
  verdict: passed ? "pass" : "fail",
15727
- hits,
15728
- misses,
15655
+ assertions,
15729
15656
  expectedAspectCount,
15730
- reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
15731
15657
  evaluatorRawRequest: {
15732
15658
  type: "token-usage",
15733
15659
  max_total: maxTotal ?? null,
@@ -15827,8 +15753,7 @@ var ToolTrajectoryEvaluator = class {
15827
15753
  return {
15828
15754
  score: 0,
15829
15755
  verdict: "fail",
15830
- hits: [],
15831
- misses: ["No trace available for evaluation"],
15756
+ assertions: [{ text: "No trace available for evaluation", passed: false }],
15832
15757
  expectedAspectCount: 1
15833
15758
  };
15834
15759
  }
@@ -15839,8 +15764,7 @@ var ToolTrajectoryEvaluator = class {
15839
15764
  return {
15840
15765
  score: 0,
15841
15766
  verdict: "fail",
15842
- hits: [],
15843
- misses: ["No trace available for evaluation"],
15767
+ assertions: [{ text: "No trace available for evaluation", passed: false }],
15844
15768
  expectedAspectCount: 1
15845
15769
  };
15846
15770
  }
@@ -15858,8 +15782,7 @@ var ToolTrajectoryEvaluator = class {
15858
15782
  return {
15859
15783
  score: 0,
15860
15784
  verdict: "fail",
15861
- hits: [],
15862
- misses: [`Unknown mode: ${this.config.mode}`],
15785
+ assertions: [{ text: `Unknown mode: ${this.config.mode}`, passed: false }],
15863
15786
  expectedAspectCount: 1
15864
15787
  };
15865
15788
  }
@@ -15908,28 +15831,32 @@ var ToolTrajectoryEvaluator = class {
15908
15831
  return {
15909
15832
  score: 1,
15910
15833
  verdict: "pass",
15911
- hits: ["No tool requirements specified"],
15912
- misses: [],
15834
+ assertions: [{ text: "No tool requirements specified", passed: true }],
15913
15835
  expectedAspectCount: 0
15914
15836
  };
15915
15837
  }
15916
- const hits = [];
15917
- const misses = [];
15838
+ const assertions = [];
15918
15839
  for (const toolName of toolNames) {
15919
15840
  const required = minimums[toolName];
15920
15841
  const actual = summary.toolCallsByName[toolName] ?? 0;
15921
15842
  if (actual >= required) {
15922
- hits.push(`${toolName}: called ${actual} times (required >=${required})`);
15843
+ assertions.push({
15844
+ text: `${toolName}: called ${actual} times (required >=${required})`,
15845
+ passed: true
15846
+ });
15923
15847
  } else {
15924
- misses.push(`${toolName}: called ${actual} times (required >=${required})`);
15848
+ assertions.push({
15849
+ text: `${toolName}: called ${actual} times (required >=${required})`,
15850
+ passed: false
15851
+ });
15925
15852
  }
15926
15853
  }
15927
- const score = hits.length / toolNames.length;
15854
+ const passedCount = assertions.filter((a) => a.passed).length;
15855
+ const score = passedCount / toolNames.length;
15928
15856
  return {
15929
15857
  score,
15930
15858
  verdict: scoreToVerdict(score),
15931
- hits,
15932
- misses,
15859
+ assertions,
15933
15860
  expectedAspectCount: toolNames.length
15934
15861
  };
15935
15862
  }
@@ -15939,13 +15866,11 @@ var ToolTrajectoryEvaluator = class {
15939
15866
  return {
15940
15867
  score: 1,
15941
15868
  verdict: "pass",
15942
- hits: ["No tool sequence specified"],
15943
- misses: [],
15869
+ assertions: [{ text: "No tool sequence specified", passed: true }],
15944
15870
  expectedAspectCount: 0
15945
15871
  };
15946
15872
  }
15947
- const hits = [];
15948
- const misses = [];
15873
+ const assertions = [];
15949
15874
  const warnings = [];
15950
15875
  let actualIndex = 0;
15951
15876
  let sequenceHits = 0;
@@ -15965,16 +15890,20 @@ var ToolTrajectoryEvaluator = class {
15965
15890
  const actualCall = toolCalls[actualIndex];
15966
15891
  if (actualCall.name === expectedTool) {
15967
15892
  if (argsMatch(expectedItem.args, actualCall.args, mode)) {
15968
- hits.push(`Found ${expectedTool} at position ${actualIndex}`);
15893
+ assertions.push({
15894
+ text: `Found ${expectedTool} at position ${actualIndex}`,
15895
+ passed: true
15896
+ });
15969
15897
  sequenceHits++;
15970
15898
  matchedCall = actualCall;
15971
15899
  actualIndex++;
15972
15900
  found = true;
15973
15901
  break;
15974
15902
  }
15975
- misses.push(
15976
- `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
15977
- );
15903
+ assertions.push({
15904
+ text: `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`,
15905
+ passed: false
15906
+ });
15978
15907
  actualIndex++;
15979
15908
  argsMismatch = true;
15980
15909
  break;
@@ -15982,7 +15911,10 @@ var ToolTrajectoryEvaluator = class {
15982
15911
  actualIndex++;
15983
15912
  }
15984
15913
  if (!found && !argsMismatch) {
15985
- misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
15914
+ assertions.push({
15915
+ text: `Expected ${expectedTool} at position ${i}, not found in remaining trace`,
15916
+ passed: false
15917
+ });
15986
15918
  }
15987
15919
  if (found && matchedCall) {
15988
15920
  const latencyResult = checkLatency(
@@ -15991,10 +15923,10 @@ var ToolTrajectoryEvaluator = class {
15991
15923
  matchedCall.durationMs
15992
15924
  );
15993
15925
  if (latencyResult.status === "pass") {
15994
- hits.push(latencyResult.message);
15926
+ assertions.push({ text: latencyResult.message, passed: true });
15995
15927
  latencyHits++;
15996
15928
  } else if (latencyResult.status === "fail") {
15997
- misses.push(latencyResult.message);
15929
+ assertions.push({ text: latencyResult.message, passed: false });
15998
15930
  } else if (latencyResult.message) {
15999
15931
  warnings.push(latencyResult.message);
16000
15932
  latencySkips++;
@@ -16010,8 +15942,7 @@ var ToolTrajectoryEvaluator = class {
16010
15942
  return {
16011
15943
  score,
16012
15944
  verdict: scoreToVerdict(score),
16013
- hits,
16014
- misses,
15945
+ assertions,
16015
15946
  expectedAspectCount: totalAssertions
16016
15947
  };
16017
15948
  }
@@ -16021,13 +15952,11 @@ var ToolTrajectoryEvaluator = class {
16021
15952
  return {
16022
15953
  score: 1,
16023
15954
  verdict: "pass",
16024
- hits: ["No tool sequence specified"],
16025
- misses: [],
15955
+ assertions: [{ text: "No tool sequence specified", passed: true }],
16026
15956
  expectedAspectCount: 0
16027
15957
  };
16028
15958
  }
16029
- const hits = [];
16030
- const misses = [];
15959
+ const assertions = [];
16031
15960
  const warnings = [];
16032
15961
  let sequenceHits = 0;
16033
15962
  let latencyHits = 0;
@@ -16036,7 +15965,10 @@ var ToolTrajectoryEvaluator = class {
16036
15965
  (item) => item.maxDurationMs !== void 0
16037
15966
  ).length;
16038
15967
  if (toolCalls.length !== expected.length) {
16039
- misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
15968
+ assertions.push({
15969
+ text: `Expected ${expected.length} tool calls, got ${toolCalls.length}`,
15970
+ passed: false
15971
+ });
16040
15972
  }
16041
15973
  const checkLength = Math.min(expected.length, toolCalls.length);
16042
15974
  for (let i = 0; i < checkLength; i++) {
@@ -16048,14 +15980,17 @@ var ToolTrajectoryEvaluator = class {
16048
15980
  let sequenceMatched = false;
16049
15981
  if (actualTool === expectedTool) {
16050
15982
  if (argsMatch(expectedItem.args, actualCall.args, mode)) {
16051
- hits.push(`Position ${i}: ${expectedTool}`);
15983
+ assertions.push({ text: `Position ${i}: ${expectedTool}`, passed: true });
16052
15984
  sequenceHits++;
16053
15985
  sequenceMatched = true;
16054
15986
  } else {
16055
- misses.push(`Position ${i}: ${expectedTool} args mismatch`);
15987
+ assertions.push({ text: `Position ${i}: ${expectedTool} args mismatch`, passed: false });
16056
15988
  }
16057
15989
  } else {
16058
- misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
15990
+ assertions.push({
15991
+ text: `Position ${i}: expected ${expectedTool}, got ${actualTool}`,
15992
+ passed: false
15993
+ });
16059
15994
  }
16060
15995
  if (sequenceMatched) {
16061
15996
  const latencyResult = checkLatency(
@@ -16064,10 +15999,10 @@ var ToolTrajectoryEvaluator = class {
16064
15999
  actualCall.durationMs
16065
16000
  );
16066
16001
  if (latencyResult.status === "pass") {
16067
- hits.push(latencyResult.message);
16002
+ assertions.push({ text: latencyResult.message, passed: true });
16068
16003
  latencyHits++;
16069
16004
  } else if (latencyResult.status === "fail") {
16070
- misses.push(latencyResult.message);
16005
+ assertions.push({ text: latencyResult.message, passed: false });
16071
16006
  } else if (latencyResult.message) {
16072
16007
  warnings.push(latencyResult.message);
16073
16008
  latencySkips++;
@@ -16075,7 +16010,10 @@ var ToolTrajectoryEvaluator = class {
16075
16010
  }
16076
16011
  }
16077
16012
  for (let i = checkLength; i < expected.length; i++) {
16078
- misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
16013
+ assertions.push({
16014
+ text: `Position ${i}: expected ${expected[i].tool}, got nothing`,
16015
+ passed: false
16016
+ });
16079
16017
  }
16080
16018
  for (const warning of warnings) {
16081
16019
  console.warn(`[tool-trajectory] ${warning}`);
@@ -16086,8 +16024,7 @@ var ToolTrajectoryEvaluator = class {
16086
16024
  return {
16087
16025
  score,
16088
16026
  verdict: scoreToVerdict(score),
16089
- hits,
16090
- misses,
16027
+ assertions,
16091
16028
  expectedAspectCount: totalAssertions
16092
16029
  };
16093
16030
  }
@@ -16102,13 +16039,11 @@ var ToolTrajectoryEvaluator = class {
16102
16039
  return {
16103
16040
  score: 1,
16104
16041
  verdict: "pass",
16105
- hits: ["No expected tools specified"],
16106
- misses: [],
16042
+ assertions: [{ text: "No expected tools specified", passed: true }],
16107
16043
  expectedAspectCount: 0
16108
16044
  };
16109
16045
  }
16110
- const hits = [];
16111
- const misses = [];
16046
+ const assertions = [];
16112
16047
  const consumed = /* @__PURE__ */ new Set();
16113
16048
  for (let i = 0; i < expected.length; i++) {
16114
16049
  const expectedItem = expected[i];
@@ -16119,22 +16054,25 @@ var ToolTrajectoryEvaluator = class {
16119
16054
  if (consumed.has(j)) continue;
16120
16055
  const actualCall = toolCalls[j];
16121
16056
  if (actualCall.name === expectedTool && argsMatch(expectedItem.args, actualCall.args, mode)) {
16122
- hits.push(`Found ${expectedTool} at position ${j}`);
16057
+ assertions.push({ text: `Found ${expectedTool} at position ${j}`, passed: true });
16123
16058
  consumed.add(j);
16124
16059
  found = true;
16125
16060
  break;
16126
16061
  }
16127
16062
  }
16128
16063
  if (!found) {
16129
- misses.push(`Expected ${expectedTool} not found in actual trajectory`);
16064
+ assertions.push({
16065
+ text: `Expected ${expectedTool} not found in actual trajectory`,
16066
+ passed: false
16067
+ });
16130
16068
  }
16131
16069
  }
16132
- const score = expected.length > 0 ? hits.length / expected.length : 1;
16070
+ const passedCount = assertions.filter((a) => a.passed).length;
16071
+ const score = expected.length > 0 ? passedCount / expected.length : 1;
16133
16072
  return {
16134
16073
  score,
16135
16074
  verdict: scoreToVerdict(score),
16136
- hits,
16137
- misses,
16075
+ assertions,
16138
16076
  expectedAspectCount: expected.length
16139
16077
  };
16140
16078
  }
@@ -16150,16 +16088,19 @@ var ToolTrajectoryEvaluator = class {
16150
16088
  return {
16151
16089
  score: 1,
16152
16090
  verdict: "pass",
16153
- hits: ["No tool calls and no expected tools"],
16154
- misses: [],
16091
+ assertions: [{ text: "No tool calls and no expected tools", passed: true }],
16155
16092
  expectedAspectCount: 0
16156
16093
  };
16157
16094
  }
16158
16095
  return {
16159
16096
  score: 0,
16160
16097
  verdict: "fail",
16161
- hits: [],
16162
- misses: [`${toolCalls.length} unexpected tool call(s) with empty allowed list`],
16098
+ assertions: [
16099
+ {
16100
+ text: `${toolCalls.length} unexpected tool call(s) with empty allowed list`,
16101
+ passed: false
16102
+ }
16103
+ ],
16163
16104
  expectedAspectCount: toolCalls.length
16164
16105
  };
16165
16106
  }
@@ -16167,13 +16108,11 @@ var ToolTrajectoryEvaluator = class {
16167
16108
  return {
16168
16109
  score: 1,
16169
16110
  verdict: "pass",
16170
- hits: ["No actual tool calls (trivially a subset)"],
16171
- misses: [],
16111
+ assertions: [{ text: "No actual tool calls (trivially a subset)", passed: true }],
16172
16112
  expectedAspectCount: 0
16173
16113
  };
16174
16114
  }
16175
- const hits = [];
16176
- const misses = [];
16115
+ const assertions = [];
16177
16116
  for (let i = 0; i < toolCalls.length; i++) {
16178
16117
  const actualCall = toolCalls[i];
16179
16118
  let allowed = false;
@@ -16185,17 +16124,23 @@ var ToolTrajectoryEvaluator = class {
16185
16124
  }
16186
16125
  }
16187
16126
  if (allowed) {
16188
- hits.push(`Position ${i}: ${actualCall.name} is in allowed set`);
16127
+ assertions.push({
16128
+ text: `Position ${i}: ${actualCall.name} is in allowed set`,
16129
+ passed: true
16130
+ });
16189
16131
  } else {
16190
- misses.push(`Position ${i}: ${actualCall.name} is not in allowed set`);
16132
+ assertions.push({
16133
+ text: `Position ${i}: ${actualCall.name} is not in allowed set`,
16134
+ passed: false
16135
+ });
16191
16136
  }
16192
16137
  }
16193
- const score = toolCalls.length > 0 ? hits.length / toolCalls.length : 1;
16138
+ const passedCount = assertions.filter((a) => a.passed).length;
16139
+ const score = toolCalls.length > 0 ? passedCount / toolCalls.length : 1;
16194
16140
  return {
16195
16141
  score,
16196
16142
  verdict: scoreToVerdict(score),
16197
- hits,
16198
- misses,
16143
+ assertions,
16199
16144
  expectedAspectCount: toolCalls.length
16200
16145
  };
16201
16146
  }
@@ -16206,8 +16151,12 @@ function runContainsAssertion(output, value) {
16206
16151
  const passed = output.includes(value);
16207
16152
  return {
16208
16153
  score: passed ? 1 : 0,
16209
- hits: passed ? [`Output contains "${value}"`] : [],
16210
- misses: passed ? [] : [`Output does not contain "${value}"`]
16154
+ assertions: [
16155
+ {
16156
+ text: passed ? `Output contains "${value}"` : `Output does not contain "${value}"`,
16157
+ passed
16158
+ }
16159
+ ]
16211
16160
  };
16212
16161
  }
16213
16162
  function runContainsAnyAssertion(output, values) {
@@ -16215,8 +16164,12 @@ function runContainsAnyAssertion(output, values) {
16215
16164
  const passed = matched.length > 0;
16216
16165
  return {
16217
16166
  score: passed ? 1 : 0,
16218
- hits: passed ? [`Output contains "${matched[0]}"`] : [],
16219
- misses: passed ? [] : [`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`]
16167
+ assertions: [
16168
+ {
16169
+ text: passed ? `Output contains "${matched[0]}"` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`,
16170
+ passed
16171
+ }
16172
+ ]
16220
16173
  };
16221
16174
  }
16222
16175
  function runContainsAllAssertion(output, values) {
@@ -16224,16 +16177,24 @@ function runContainsAllAssertion(output, values) {
16224
16177
  const passed = missing.length === 0;
16225
16178
  return {
16226
16179
  score: passed ? 1 : 0,
16227
- hits: passed ? [`Output contains all ${values.length} expected strings`] : [],
16228
- misses: passed ? [] : [`Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`]
16180
+ assertions: [
16181
+ {
16182
+ text: passed ? `Output contains all ${values.length} expected strings` : `Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`,
16183
+ passed
16184
+ }
16185
+ ]
16229
16186
  };
16230
16187
  }
16231
16188
  function runIcontainsAssertion(output, value) {
16232
16189
  const passed = output.toLowerCase().includes(value.toLowerCase());
16233
16190
  return {
16234
16191
  score: passed ? 1 : 0,
16235
- hits: passed ? [`Output contains "${value}" (case-insensitive)`] : [],
16236
- misses: passed ? [] : [`Output does not contain "${value}" (case-insensitive)`]
16192
+ assertions: [
16193
+ {
16194
+ text: passed ? `Output contains "${value}" (case-insensitive)` : `Output does not contain "${value}" (case-insensitive)`,
16195
+ passed
16196
+ }
16197
+ ]
16237
16198
  };
16238
16199
  }
16239
16200
  function runIcontainsAnyAssertion(output, values) {
@@ -16242,9 +16203,11 @@ function runIcontainsAnyAssertion(output, values) {
16242
16203
  const passed = matched.length > 0;
16243
16204
  return {
16244
16205
  score: passed ? 1 : 0,
16245
- hits: passed ? [`Output contains "${matched[0]}" (case-insensitive)`] : [],
16246
- misses: passed ? [] : [
16247
- `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`
16206
+ assertions: [
16207
+ {
16208
+ text: passed ? `Output contains "${matched[0]}" (case-insensitive)` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`,
16209
+ passed
16210
+ }
16248
16211
  ]
16249
16212
  };
16250
16213
  }
@@ -16254,24 +16217,36 @@ function runIcontainsAllAssertion(output, values) {
16254
16217
  const passed = missing.length === 0;
16255
16218
  return {
16256
16219
  score: passed ? 1 : 0,
16257
- hits: passed ? [`Output contains all ${values.length} expected strings (case-insensitive)`] : [],
16258
- misses: passed ? [] : [`Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`]
16220
+ assertions: [
16221
+ {
16222
+ text: passed ? `Output contains all ${values.length} expected strings (case-insensitive)` : `Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`,
16223
+ passed
16224
+ }
16225
+ ]
16259
16226
  };
16260
16227
  }
16261
16228
  function runStartsWithAssertion(output, value) {
16262
16229
  const passed = output.trim().startsWith(value.trim());
16263
16230
  return {
16264
16231
  score: passed ? 1 : 0,
16265
- hits: passed ? [`Output starts with "${value}"`] : [],
16266
- misses: passed ? [] : [`Output does not start with "${value}"`]
16232
+ assertions: [
16233
+ {
16234
+ text: passed ? `Output starts with "${value}"` : `Output does not start with "${value}"`,
16235
+ passed
16236
+ }
16237
+ ]
16267
16238
  };
16268
16239
  }
16269
16240
  function runEndsWithAssertion(output, value) {
16270
16241
  const passed = output.trim().endsWith(value.trim());
16271
16242
  return {
16272
16243
  score: passed ? 1 : 0,
16273
- hits: passed ? [`Output ends with "${value}"`] : [],
16274
- misses: passed ? [] : [`Output does not end with "${value}"`]
16244
+ assertions: [
16245
+ {
16246
+ text: passed ? `Output ends with "${value}"` : `Output does not end with "${value}"`,
16247
+ passed
16248
+ }
16249
+ ]
16275
16250
  };
16276
16251
  }
16277
16252
  function runRegexAssertion(output, pattern, flags) {
@@ -16280,8 +16255,12 @@ function runRegexAssertion(output, pattern, flags) {
16280
16255
  const flagsLabel = flags ? ` (flags: ${flags})` : "";
16281
16256
  return {
16282
16257
  score: passed ? 1 : 0,
16283
- hits: passed ? [`Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}`] : [],
16284
- misses: passed ? [] : [`Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`]
16258
+ assertions: [
16259
+ {
16260
+ text: passed ? `Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}` : `Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`,
16261
+ passed
16262
+ }
16263
+ ]
16285
16264
  };
16286
16265
  }
16287
16266
  function runIsJsonAssertion(output) {
@@ -16293,16 +16272,24 @@ function runIsJsonAssertion(output) {
16293
16272
  }
16294
16273
  return {
16295
16274
  score: passed ? 1 : 0,
16296
- hits: passed ? ["Output is valid JSON"] : [],
16297
- misses: passed ? [] : ["Output is not valid JSON"]
16275
+ assertions: [
16276
+ {
16277
+ text: passed ? "Output is valid JSON" : "Output is not valid JSON",
16278
+ passed
16279
+ }
16280
+ ]
16298
16281
  };
16299
16282
  }
16300
16283
  function runEqualsAssertion(output, value) {
16301
16284
  const passed = output.trim() === value.trim();
16302
16285
  return {
16303
16286
  score: passed ? 1 : 0,
16304
- hits: passed ? [`Output equals "${value}"`] : [],
16305
- misses: passed ? [] : [`Output does not equal "${value}"`]
16287
+ assertions: [
16288
+ {
16289
+ text: passed ? `Output equals "${value}"` : `Output does not equal "${value}"`,
16290
+ passed
16291
+ }
16292
+ ]
16306
16293
  };
16307
16294
  }
16308
16295
 
@@ -16515,10 +16502,8 @@ var InlineAssertEvaluator = class {
16515
16502
  return {
16516
16503
  score,
16517
16504
  verdict: scoreToVerdict(score),
16518
- hits: score >= 0.8 ? [result.name] : [],
16519
- misses: score < 0.5 ? [result.name] : [],
16505
+ assertions: [{ text: result.name, passed: score >= 0.5 }],
16520
16506
  expectedAspectCount: 1,
16521
- reasoning: void 0,
16522
16507
  details: result.metadata ? result.metadata : void 0
16523
16508
  };
16524
16509
  }
@@ -16711,9 +16696,7 @@ var containsFactory = (config) => {
16711
16696
  return {
16712
16697
  score: result.score,
16713
16698
  verdict: result.score === 1 ? "pass" : "fail",
16714
- hits: result.hits,
16715
- misses: result.misses,
16716
- reasoning: result.score === 1 ? `Output contains "${c.value}"` : `Output does not contain "${c.value}"`,
16699
+ assertions: result.assertions,
16717
16700
  expectedAspectCount: 1
16718
16701
  };
16719
16702
  });
@@ -16725,9 +16708,7 @@ var regexFactory = (config) => {
16725
16708
  return {
16726
16709
  score: result.score,
16727
16710
  verdict: result.score === 1 ? "pass" : "fail",
16728
- hits: result.hits,
16729
- misses: result.misses,
16730
- reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
16711
+ assertions: result.assertions,
16731
16712
  expectedAspectCount: 1
16732
16713
  };
16733
16714
  });
@@ -16738,9 +16719,7 @@ var isJsonFactory = () => {
16738
16719
  return {
16739
16720
  score: result.score,
16740
16721
  verdict: result.score === 1 ? "pass" : "fail",
16741
- hits: result.hits,
16742
- misses: result.misses,
16743
- reasoning: result.score === 1 ? "Output is valid JSON" : "Output is not valid JSON",
16722
+ assertions: result.assertions,
16744
16723
  expectedAspectCount: 1
16745
16724
  };
16746
16725
  });
@@ -16752,9 +16731,7 @@ var equalsFactory = (config) => {
16752
16731
  return {
16753
16732
  score: result.score,
16754
16733
  verdict: result.score === 1 ? "pass" : "fail",
16755
- hits: result.hits,
16756
- misses: result.misses,
16757
- reasoning: result.score === 1 ? `Output equals "${c.value}"` : `Output does not equal "${c.value}"`,
16734
+ assertions: result.assertions,
16758
16735
  expectedAspectCount: 1
16759
16736
  };
16760
16737
  });
@@ -16766,9 +16743,7 @@ var containsAnyFactory = (config) => {
16766
16743
  return {
16767
16744
  score: result.score,
16768
16745
  verdict: result.score === 1 ? "pass" : "fail",
16769
- hits: result.hits,
16770
- misses: result.misses,
16771
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
16746
+ assertions: result.assertions,
16772
16747
  expectedAspectCount: 1
16773
16748
  };
16774
16749
  });
@@ -16780,9 +16755,7 @@ var containsAllFactory = (config) => {
16780
16755
  return {
16781
16756
  score: result.score,
16782
16757
  verdict: result.score === 1 ? "pass" : "fail",
16783
- hits: result.hits,
16784
- misses: result.misses,
16785
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
16758
+ assertions: result.assertions,
16786
16759
  expectedAspectCount: 1
16787
16760
  };
16788
16761
  });
@@ -16794,9 +16767,7 @@ var icontainsFactory = (config) => {
16794
16767
  return {
16795
16768
  score: result.score,
16796
16769
  verdict: result.score === 1 ? "pass" : "fail",
16797
- hits: result.hits,
16798
- misses: result.misses,
16799
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
16770
+ assertions: result.assertions,
16800
16771
  expectedAspectCount: 1
16801
16772
  };
16802
16773
  });
@@ -16808,9 +16779,7 @@ var icontainsAnyFactory = (config) => {
16808
16779
  return {
16809
16780
  score: result.score,
16810
16781
  verdict: result.score === 1 ? "pass" : "fail",
16811
- hits: result.hits,
16812
- misses: result.misses,
16813
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
16782
+ assertions: result.assertions,
16814
16783
  expectedAspectCount: 1
16815
16784
  };
16816
16785
  });
@@ -16822,9 +16791,7 @@ var icontainsAllFactory = (config) => {
16822
16791
  return {
16823
16792
  score: result.score,
16824
16793
  verdict: result.score === 1 ? "pass" : "fail",
16825
- hits: result.hits,
16826
- misses: result.misses,
16827
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
16794
+ assertions: result.assertions,
16828
16795
  expectedAspectCount: 1
16829
16796
  };
16830
16797
  });
@@ -16836,9 +16803,7 @@ var startsWithFactory = (config) => {
16836
16803
  return {
16837
16804
  score: result.score,
16838
16805
  verdict: result.score === 1 ? "pass" : "fail",
16839
- hits: result.hits,
16840
- misses: result.misses,
16841
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
16806
+ assertions: result.assertions,
16842
16807
  expectedAspectCount: 1
16843
16808
  };
16844
16809
  });
@@ -16850,9 +16815,7 @@ var endsWithFactory = (config) => {
16850
16815
  return {
16851
16816
  score: result.score,
16852
16817
  verdict: result.score === 1 ? "pass" : "fail",
16853
- hits: result.hits,
16854
- misses: result.misses,
16855
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
16818
+ assertions: result.assertions,
16856
16819
  expectedAspectCount: 1
16857
16820
  };
16858
16821
  });
@@ -18258,8 +18221,7 @@ async function runEvaluation(options) {
18258
18221
  testId: evalCase.id,
18259
18222
  dataset: evalCase.dataset,
18260
18223
  score: 0,
18261
- hits: [],
18262
- misses: [],
18224
+ assertions: [],
18263
18225
  answer: "",
18264
18226
  target: target.name,
18265
18227
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
@@ -18295,8 +18257,7 @@ async function runEvaluation(options) {
18295
18257
  testId: evalCase.id,
18296
18258
  dataset: evalCase.dataset,
18297
18259
  score: 0,
18298
- hits: [],
18299
- misses: [],
18260
+ assertions: [],
18300
18261
  answer: "",
18301
18262
  target: target.name,
18302
18263
  error: errorMsg,
@@ -19263,11 +19224,9 @@ async function evaluateCandidate(options) {
19263
19224
  dataset: evalCase.dataset,
19264
19225
  conversationId: evalCase.conversation_id,
19265
19226
  score: score.score,
19266
- hits: score.hits,
19267
- misses: score.misses,
19227
+ assertions: score.assertions,
19268
19228
  answer: candidate,
19269
19229
  target: target.name,
19270
- reasoning: score.reasoning,
19271
19230
  tokenUsage,
19272
19231
  costUsd,
19273
19232
  durationMs,
@@ -19441,9 +19400,7 @@ async function runEvaluatorList(options) {
19441
19400
  score: score2.score,
19442
19401
  weight,
19443
19402
  verdict: score2.verdict,
19444
- hits: score2.hits,
19445
- misses: score2.misses,
19446
- reasoning: score2.reasoning,
19403
+ assertions: score2.assertions,
19447
19404
  evaluatorProviderRequest: score2.evaluatorRawRequest,
19448
19405
  details: score2.details,
19449
19406
  scores: mapChildResults(score2.scores),
@@ -19458,10 +19415,10 @@ async function runEvaluatorList(options) {
19458
19415
  const fallbackScore = {
19459
19416
  score: 0,
19460
19417
  verdict: "fail",
19461
- hits: [],
19462
- misses: [`Evaluator '${evaluatorConfig.name}' failed: ${message}`],
19463
- expectedAspectCount: 1,
19464
- reasoning: message
19418
+ assertions: [
19419
+ { text: `Evaluator '${evaluatorConfig.name}' failed: ${message}`, passed: false }
19420
+ ],
19421
+ expectedAspectCount: 1
19465
19422
  };
19466
19423
  const weight = evaluatorConfig.weight ?? 1;
19467
19424
  scored.push({
@@ -19477,9 +19434,12 @@ async function runEvaluatorList(options) {
19477
19434
  score: 0,
19478
19435
  weight,
19479
19436
  verdict: "fail",
19480
- hits: [],
19481
- misses: [`Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`],
19482
- reasoning: message,
19437
+ assertions: [
19438
+ {
19439
+ text: `Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
19440
+ passed: false
19441
+ }
19442
+ ],
19483
19443
  durationMs: endedAt.getTime() - startedAt.getTime(),
19484
19444
  startedAt: startedAt.toISOString(),
19485
19445
  endedAt: endedAt.toISOString()
@@ -19495,9 +19455,7 @@ async function runEvaluatorList(options) {
19495
19455
  ...scores[lastScoresIdx],
19496
19456
  score: negated.score,
19497
19457
  verdict: negated.verdict,
19498
- hits: [...negated.hits],
19499
- misses: [...negated.misses],
19500
- reasoning: negated.reasoning
19458
+ assertions: [...negated.assertions]
19501
19459
  };
19502
19460
  }
19503
19461
  }
@@ -19512,21 +19470,13 @@ async function runEvaluatorList(options) {
19512
19470
  const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
19513
19471
  scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
19514
19472
  ) : 0;
19515
- const hits = scored.flatMap((entry) => entry.score.hits);
19516
- const misses = scored.flatMap((entry) => entry.score.misses);
19517
- const expectedAspectCount = scored.reduce(
19518
- (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
19519
- 0
19520
- );
19521
- const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
19522
- const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
19473
+ const assertions = scored.flatMap((entry) => entry.score.assertions);
19474
+ const expectedAspectCount = assertions.length || 1;
19523
19475
  const score = {
19524
19476
  score: aggregateScore,
19525
19477
  verdict: scoreToVerdict(aggregateScore),
19526
- hits,
19527
- misses,
19528
- expectedAspectCount,
19529
- reasoning
19478
+ assertions,
19479
+ expectedAspectCount
19530
19480
  };
19531
19481
  return { score, scores };
19532
19482
  }
@@ -19630,8 +19580,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
19630
19580
  dataset: evalCase.dataset,
19631
19581
  conversationId: evalCase.conversation_id,
19632
19582
  score: 0,
19633
- hits: [],
19634
- misses: [`Error: ${message}`],
19583
+ assertions: [{ text: `Error: ${message}`, passed: false }],
19635
19584
  answer: `Error occurred: ${message}`,
19636
19585
  target: targetName,
19637
19586
  requests,
@@ -19741,9 +19690,7 @@ function mapChildResults(children) {
19741
19690
  score: child.score,
19742
19691
  weight: child.weight,
19743
19692
  verdict: child.verdict,
19744
- hits: child.hits,
19745
- misses: child.misses,
19746
- reasoning: child.reasoning,
19693
+ assertions: child.assertions,
19747
19694
  evaluatorProviderRequest: child.evaluatorRawRequest,
19748
19695
  scores: mapChildResults(child.scores),
19749
19696
  details: child.details,
@@ -20713,7 +20660,6 @@ function createAgentKernel() {
20713
20660
  freeformEvaluationSchema,
20714
20661
  generateRubrics,
20715
20662
  getAgentvHome,
20716
- getHitCount,
20717
20663
  getOutputFilenames,
20718
20664
  getSubagentsRoot,
20719
20665
  getTraceStateRoot,