agentv 3.4.0 → 3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -149,7 +149,7 @@ import {
149
149
  withUserAgentSuffix,
150
150
  withoutTrailingSlash,
151
151
  zodSchema
152
- } from "./chunk-AR3QEKXH.js";
152
+ } from "./chunk-BJV6MDBE.js";
153
153
  import {
154
154
  SpanStatusCode,
155
155
  context,
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
301
301
  }
302
302
  });
303
303
 
304
- // ../../packages/core/dist/chunk-JO4HIAEF.js
304
+ // ../../packages/core/dist/chunk-EFR4JHPL.js
305
305
  import { constants } from "node:fs";
306
306
  import { access, readFile } from "node:fs/promises";
307
307
  import path from "node:path";
@@ -419,7 +419,7 @@ __export(external_exports2, {
419
419
  void: () => voidType
420
420
  });
421
421
 
422
- // ../../packages/core/dist/chunk-JO4HIAEF.js
422
+ // ../../packages/core/dist/chunk-EFR4JHPL.js
423
423
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
424
424
  var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
425
425
  var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
@@ -498,9 +498,6 @@ var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
498
498
  function isEvaluatorKind(value) {
499
499
  return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
500
500
  }
501
- function getHitCount(result) {
502
- return result.hits.length;
503
- }
504
501
  async function fileExists(filePath) {
505
502
  try {
506
503
  await access(filePath, constants.F_OK);
@@ -17623,7 +17620,7 @@ var AzureProvider = class {
17623
17620
  };
17624
17621
  this.retryConfig = config.retry;
17625
17622
  const azure = createAzure(buildAzureOptions(config));
17626
- this.model = azure(config.deploymentName);
17623
+ this.model = azure.chat(config.deploymentName);
17627
17624
  }
17628
17625
  id;
17629
17626
  kind = "azure";
@@ -23483,9 +23480,11 @@ function negateScore(score) {
23483
23480
  ...score,
23484
23481
  score: negatedScore,
23485
23482
  verdict: negatedVerdict,
23486
- reasoning: score.reasoning ? `[Negated] ${score.reasoning} (original score: ${score.score.toFixed(2)})` : `[Negated] Original score: ${score.score.toFixed(2)}`,
23487
- hits: score.misses,
23488
- misses: score.hits
23483
+ assertions: score.assertions.map((a) => ({
23484
+ ...a,
23485
+ passed: !a.passed,
23486
+ evidence: a.evidence ? `[Negated] ${a.evidence}` : void 0
23487
+ }))
23489
23488
  };
23490
23489
  }
23491
23490
  function shellEscapePath(value) {
@@ -23985,9 +23984,13 @@ var CodeEvaluator = class {
23985
23984
  );
23986
23985
  const parsed = parseJsonSafe(stdout);
23987
23986
  const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
23988
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
23989
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
23990
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
23987
+ const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
23988
+ (a) => typeof a === "object" && a !== null && typeof a.text === "string"
23989
+ ).map((a) => ({
23990
+ text: String(a.text),
23991
+ passed: Boolean(a.passed),
23992
+ ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
23993
+ })) : [];
23991
23994
  const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
23992
23995
  const proxyUsage = getProxyUsage?.();
23993
23996
  const evaluatorRawRequest = {
@@ -24003,10 +24006,8 @@ var CodeEvaluator = class {
24003
24006
  return {
24004
24007
  score,
24005
24008
  verdict: scoreToVerdict(score),
24006
- hits,
24007
- misses,
24008
- expectedAspectCount: hits.length + misses.length || 1,
24009
- reasoning,
24009
+ assertions,
24010
+ expectedAspectCount: assertions.length || 1,
24010
24011
  evaluatorRawRequest,
24011
24012
  ...details ? { details } : {},
24012
24013
  tokenUsage: proxyUsage?.tokenUsage
@@ -24017,10 +24018,8 @@ var CodeEvaluator = class {
24017
24018
  return {
24018
24019
  score: 0,
24019
24020
  verdict: "fail",
24020
- hits: [],
24021
- misses: [`Code evaluator failed: ${message}`],
24021
+ assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
24022
24022
  expectedAspectCount: 1,
24023
- reasoning: message,
24024
24023
  evaluatorRawRequest: {
24025
24024
  command: this.command,
24026
24025
  ...this.cwd ? { cwd: this.cwd } : {},
@@ -24119,9 +24118,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
24119
24118
  {{${TEMPLATE_VARIABLES.ANSWER}}}`;
24120
24119
  var freeformEvaluationSchema = external_exports2.object({
24121
24120
  score: external_exports2.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
24122
- hits: external_exports2.array(external_exports2.string()).describe("Brief specific achievements").optional(),
24123
- misses: external_exports2.array(external_exports2.string()).describe("Brief failures or omissions").optional(),
24124
- reasoning: external_exports2.string().describe("Concise explanation (1-2 sentences)").optional()
24121
+ assertions: external_exports2.array(
24122
+ external_exports2.object({
24123
+ text: external_exports2.string().describe("Brief description of what was checked"),
24124
+ passed: external_exports2.boolean().describe("Whether this aspect was satisfied"),
24125
+ evidence: external_exports2.string().describe("Concise evidence (1-2 sentences)").optional()
24126
+ })
24127
+ ).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
24125
24128
  });
24126
24129
  var rubricCheckResultSchema = external_exports2.object({
24127
24130
  id: external_exports2.string().describe("The ID of the rubric item being checked"),
@@ -24223,17 +24226,12 @@ ${context2.fileChanges}`;
24223
24226
  schema: freeformEvaluationSchema
24224
24227
  });
24225
24228
  const score = clampScore(data.score);
24226
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
24227
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
24228
- const reasoning = data.reasoning;
24229
- const expectedAspectCount = Math.max(hits.length + misses.length, 1);
24229
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
24230
24230
  return {
24231
24231
  score,
24232
24232
  verdict: scoreToVerdict(score),
24233
- hits,
24234
- misses,
24235
- expectedAspectCount,
24236
- reasoning,
24233
+ assertions,
24234
+ expectedAspectCount: Math.max(assertions.length, 1),
24237
24235
  evaluatorRawRequest,
24238
24236
  tokenUsage
24239
24237
  };
@@ -24244,10 +24242,8 @@ ${context2.fileChanges}`;
24244
24242
  return {
24245
24243
  score: 0,
24246
24244
  verdict: "skip",
24247
- hits: [],
24248
- misses: [`Grader parse failure after 3 attempts: ${message}`],
24245
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
24249
24246
  expectedAspectCount: 1,
24250
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
24251
24247
  evaluatorRawRequest
24252
24248
  };
24253
24249
  }
@@ -24277,14 +24273,12 @@ ${context2.fileChanges}`;
24277
24273
  userPrompt: prompt,
24278
24274
  schema: rubricEvaluationSchema
24279
24275
  });
24280
- const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
24276
+ const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
24281
24277
  return {
24282
24278
  score,
24283
24279
  verdict,
24284
- hits,
24285
- misses,
24280
+ assertions,
24286
24281
  expectedAspectCount: rubrics.length,
24287
- reasoning: data.overall_reasoning,
24288
24282
  evaluatorRawRequest,
24289
24283
  tokenUsage
24290
24284
  };
@@ -24295,10 +24289,8 @@ ${context2.fileChanges}`;
24295
24289
  return {
24296
24290
  score: 0,
24297
24291
  verdict: "skip",
24298
- hits: [],
24299
- misses: [`Grader parse failure after 3 attempts: ${message}`],
24292
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
24300
24293
  expectedAspectCount: rubrics.length,
24301
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
24302
24294
  evaluatorRawRequest
24303
24295
  };
24304
24296
  }
@@ -24323,14 +24315,12 @@ ${context2.fileChanges}`;
24323
24315
  userPrompt: prompt,
24324
24316
  schema: scoreRangeEvaluationSchema
24325
24317
  });
24326
- const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
24318
+ const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
24327
24319
  return {
24328
24320
  score,
24329
24321
  verdict,
24330
- hits,
24331
- misses,
24322
+ assertions,
24332
24323
  expectedAspectCount: rubrics.length,
24333
- reasoning: data.overall_reasoning,
24334
24324
  evaluatorRawRequest,
24335
24325
  details,
24336
24326
  tokenUsage
@@ -24342,10 +24332,8 @@ ${context2.fileChanges}`;
24342
24332
  return {
24343
24333
  score: 0,
24344
24334
  verdict: "skip",
24345
- hits: [],
24346
- misses: [`Grader parse failure after 3 attempts: ${message}`],
24335
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
24347
24336
  expectedAspectCount: rubrics.length,
24348
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
24349
24337
  evaluatorRawRequest
24350
24338
  };
24351
24339
  }
@@ -24402,8 +24390,7 @@ ${context2.fileChanges}`;
24402
24390
  return {
24403
24391
  score: 0,
24404
24392
  verdict: "fail",
24405
- hits: [],
24406
- misses: [`llm-grader built-in evaluation failed: ${message}`],
24393
+ assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
24407
24394
  expectedAspectCount: 1,
24408
24395
  evaluatorRawRequest,
24409
24396
  details: { mode: "built-in", error: message }
@@ -24453,8 +24440,9 @@ ${context2.fileChanges}`;
24453
24440
  return {
24454
24441
  score: 0,
24455
24442
  verdict: "fail",
24456
- hits: [],
24457
- misses: [`llm-grader ${modeLabel} returned no assistant response`],
24443
+ assertions: [
24444
+ { text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
24445
+ ],
24458
24446
  expectedAspectCount: 1,
24459
24447
  evaluatorRawRequest,
24460
24448
  details: { mode: modeLabel, grader_target: provider.targetName }
@@ -24472,8 +24460,9 @@ ${context2.fileChanges}`;
24472
24460
  return {
24473
24461
  score: 0,
24474
24462
  verdict: "fail",
24475
- hits: [],
24476
- misses: [`llm-grader ${modeLabel} evaluation failed: ${message}`],
24463
+ assertions: [
24464
+ { text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
24465
+ ],
24477
24466
  expectedAspectCount: 1,
24478
24467
  evaluatorRawRequest,
24479
24468
  details: {
@@ -24625,29 +24614,24 @@ ${outputSchema2}`;
24625
24614
  const parsed = parseJsonFromText(text2);
24626
24615
  if (rubrics && rubrics.length > 0) {
24627
24616
  const data2 = rubricEvaluationSchema.parse(parsed);
24628
- const { score: score2, verdict, hits: hits2, misses: misses2 } = calculateRubricScore(data2, rubrics);
24617
+ const { score: score2, verdict, assertions: assertions2 } = calculateRubricScore(data2, rubrics);
24629
24618
  return {
24630
24619
  score: score2,
24631
24620
  verdict,
24632
- hits: hits2,
24633
- misses: misses2,
24621
+ assertions: assertions2,
24634
24622
  expectedAspectCount: rubrics.length,
24635
- reasoning: data2.overall_reasoning,
24636
24623
  evaluatorRawRequest,
24637
24624
  details
24638
24625
  };
24639
24626
  }
24640
24627
  const data = freeformEvaluationSchema.parse(parsed);
24641
24628
  const score = clampScore(data.score);
24642
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
24643
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
24629
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
24644
24630
  return {
24645
24631
  score,
24646
24632
  verdict: scoreToVerdict(score),
24647
- hits,
24648
- misses,
24649
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
24650
- reasoning: data.reasoning,
24633
+ assertions,
24634
+ expectedAspectCount: Math.max(assertions.length, 1),
24651
24635
  evaluatorRawRequest,
24652
24636
  details
24653
24637
  };
@@ -24655,8 +24639,12 @@ ${outputSchema2}`;
24655
24639
  return {
24656
24640
  score: 0,
24657
24641
  verdict: "fail",
24658
- hits: [],
24659
- misses: ["Failed to parse llm-grader agent response as valid evaluation JSON"],
24642
+ assertions: [
24643
+ {
24644
+ text: "Failed to parse llm-grader agent response as valid evaluation JSON",
24645
+ passed: false
24646
+ }
24647
+ ],
24660
24648
  expectedAspectCount: 1,
24661
24649
  evaluatorRawRequest,
24662
24650
  details
@@ -24785,9 +24773,13 @@ function buildOutputSchema() {
24785
24773
  "",
24786
24774
  "{",
24787
24775
  ' "score": <number between 0.0 and 1.0>,',
24788
- ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
24789
- ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
24790
- ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
24776
+ ' "assertions": [',
24777
+ " {",
24778
+ ' "text": "<brief description of what was checked>",',
24779
+ ' "passed": <boolean>,',
24780
+ ' "evidence": "<concise evidence, 1-2 sentences, optional>"',
24781
+ " }",
24782
+ " ]",
24791
24783
  "}"
24792
24784
  ].join("\n");
24793
24785
  }
@@ -24812,8 +24804,7 @@ function substituteVariables(template, variables) {
24812
24804
  }
24813
24805
  function calculateRubricScore(result, rubrics) {
24814
24806
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
24815
- const hits = [];
24816
- const misses = [];
24807
+ const assertions = [];
24817
24808
  let totalWeight = 0;
24818
24809
  let earnedWeight = 0;
24819
24810
  let failedRequired = false;
@@ -24823,19 +24814,20 @@ function calculateRubricScore(result, rubrics) {
24823
24814
  continue;
24824
24815
  }
24825
24816
  totalWeight += rubric.weight;
24817
+ assertions.push({
24818
+ text: `[${rubric.id}] ${rubric.outcome}`,
24819
+ passed: check.satisfied,
24820
+ evidence: check.reasoning
24821
+ });
24826
24822
  if (check.satisfied) {
24827
24823
  earnedWeight += rubric.weight;
24828
- hits.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
24829
- } else {
24830
- misses.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
24831
- if (rubric.required) {
24832
- failedRequired = true;
24833
- }
24824
+ } else if (rubric.required) {
24825
+ failedRequired = true;
24834
24826
  }
24835
24827
  }
24836
24828
  const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
24837
24829
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
24838
- return { score, verdict, hits, misses };
24830
+ return { score, verdict, assertions };
24839
24831
  }
24840
24832
  function buildScoreRangeOutputSchema() {
24841
24833
  return `You are an expert evaluator. Score the candidate answer on each criterion.
@@ -24855,8 +24847,7 @@ Important: The "score" must be an integer from 0 to 10 that falls within one of
24855
24847
  }
24856
24848
  function calculateScoreRangeResult(result, rubrics) {
24857
24849
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
24858
- const hits = [];
24859
- const misses = [];
24850
+ const assertions = [];
24860
24851
  const rawScores = {};
24861
24852
  let totalWeight = 0;
24862
24853
  let weightedScoreSum = 0;
@@ -24882,24 +24873,22 @@ function calculateScoreRangeResult(result, rubrics) {
24882
24873
  );
24883
24874
  const rangeDescription = matchingRange?.outcome ?? "";
24884
24875
  const criterionLabel = rubric.outcome ?? rubric.id;
24885
- const reasoningText = check.reasoning ? `: ${check.reasoning}` : "";
24886
- const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
24876
+ const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
24887
24877
  if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
24888
24878
  failedRequired = true;
24889
- misses.push(scoreInfo);
24890
- } else if (rawScore >= 7) {
24891
- hits.push(scoreInfo);
24892
- } else {
24893
- misses.push(scoreInfo);
24894
24879
  }
24880
+ assertions.push({
24881
+ text: `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})`,
24882
+ passed,
24883
+ evidence: check.reasoning
24884
+ });
24895
24885
  }
24896
24886
  const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
24897
24887
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
24898
24888
  return {
24899
24889
  score,
24900
24890
  verdict,
24901
- hits,
24902
- misses,
24891
+ assertions,
24903
24892
  details: {
24904
24893
  raw_scores: rawScores,
24905
24894
  normalization: "score / 10",
@@ -25073,9 +25062,7 @@ var CompositeEvaluator = class {
25073
25062
  let totalWeight = 0;
25074
25063
  let weightedSum = 0;
25075
25064
  let evaluatedCount = 0;
25076
- const allHits = [];
25077
- const allMisses = [];
25078
- const reasoningParts = [];
25065
+ const allAssertions = [];
25079
25066
  const scores = [];
25080
25067
  for (const member of results) {
25081
25068
  const weight = weights?.[member.id] ?? 1;
@@ -25085,9 +25072,7 @@ var CompositeEvaluator = class {
25085
25072
  score: member.result.score,
25086
25073
  weight,
25087
25074
  verdict: member.result.verdict,
25088
- hits: [...member.result.hits],
25089
- misses: [...member.result.misses],
25090
- reasoning: member.result.reasoning,
25075
+ assertions: [...member.result.assertions],
25091
25076
  evaluatorRawRequest: member.result.evaluatorRawRequest,
25092
25077
  scores: member.result.scores,
25093
25078
  details: member.result.details,
@@ -25099,20 +25084,16 @@ var CompositeEvaluator = class {
25099
25084
  evaluatedCount++;
25100
25085
  totalWeight += weight;
25101
25086
  weightedSum += member.result.score * weight;
25102
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
25103
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
25104
- if (member.result.reasoning) {
25105
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
25106
- }
25087
+ allAssertions.push(
25088
+ ...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
25089
+ );
25107
25090
  }
25108
25091
  if (evaluatedCount === 0 && results.length > 0) {
25109
25092
  return {
25110
25093
  score: 0,
25111
25094
  verdict: "skip",
25112
- hits: [],
25113
- misses: [],
25095
+ assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
25114
25096
  expectedAspectCount: 1,
25115
- reasoning: "All evaluators skipped (infrastructure failure)",
25116
25097
  evaluatorRawRequest: {
25117
25098
  aggregator: "weighted_average",
25118
25099
  ...weights ? { weights } : {}
@@ -25124,10 +25105,8 @@ var CompositeEvaluator = class {
25124
25105
  return {
25125
25106
  score: clampScore(finalScore),
25126
25107
  verdict: scoreToVerdict(finalScore),
25127
- hits: allHits,
25128
- misses: allMisses,
25129
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
25130
- reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
25108
+ assertions: allAssertions,
25109
+ expectedAspectCount: allAssertions.length || 1,
25131
25110
  evaluatorRawRequest: {
25132
25111
  aggregator: "weighted_average",
25133
25112
  ...weights ? { weights } : {}
@@ -25137,11 +25116,8 @@ var CompositeEvaluator = class {
25137
25116
  }
25138
25117
  runThreshold(results, threshold) {
25139
25118
  const scores = [];
25140
- const allHits = [];
25141
- const allMisses = [];
25142
- const reasoningParts = [];
25119
+ const allAssertions = [];
25143
25120
  let passingCount = 0;
25144
- let borderlineCount = 0;
25145
25121
  let evaluatedCount = 0;
25146
25122
  for (const member of results) {
25147
25123
  scores.push({
@@ -25149,9 +25125,7 @@ var CompositeEvaluator = class {
25149
25125
  type: member.type,
25150
25126
  score: member.result.score,
25151
25127
  verdict: member.result.verdict,
25152
- hits: [...member.result.hits],
25153
- misses: [...member.result.misses],
25154
- reasoning: member.result.reasoning,
25128
+ assertions: [...member.result.assertions],
25155
25129
  evaluatorRawRequest: member.result.evaluatorRawRequest,
25156
25130
  scores: member.result.scores,
25157
25131
  details: member.result.details,
@@ -25164,24 +25138,17 @@ var CompositeEvaluator = class {
25164
25138
  const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
25165
25139
  if (isPassing) {
25166
25140
  passingCount++;
25167
- if (member.result.verdict === "borderline") {
25168
- borderlineCount++;
25169
- }
25170
- }
25171
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
25172
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
25173
- if (member.result.reasoning) {
25174
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
25175
25141
  }
25142
+ allAssertions.push(
25143
+ ...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
25144
+ );
25176
25145
  }
25177
25146
  if (evaluatedCount === 0 && results.length > 0) {
25178
25147
  return {
25179
25148
  score: 0,
25180
25149
  verdict: "skip",
25181
- hits: [],
25182
- misses: [],
25150
+ assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
25183
25151
  expectedAspectCount: 1,
25184
- reasoning: "All evaluators skipped (infrastructure failure)",
25185
25152
  evaluatorRawRequest: {
25186
25153
  aggregator: "threshold",
25187
25154
  threshold
@@ -25192,19 +25159,15 @@ var CompositeEvaluator = class {
25192
25159
  const totalCount = evaluatedCount;
25193
25160
  const score = totalCount > 0 ? passingCount / totalCount : 0;
25194
25161
  const pass = score >= threshold;
25195
- if (pass && borderlineCount > 0) {
25196
- reasoningParts.push(`Warning: ${borderlineCount} borderline evaluator(s) counted as passing`);
25197
- }
25198
- reasoningParts.unshift(
25199
- `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`
25200
- );
25162
+ allAssertions.unshift({
25163
+ text: `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`,
25164
+ passed: pass
25165
+ });
25201
25166
  return {
25202
25167
  score: clampScore(score),
25203
25168
  verdict: pass ? "pass" : "fail",
25204
- hits: allHits,
25205
- misses: allMisses,
25206
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
25207
- reasoning: reasoningParts.join("; "),
25169
+ assertions: allAssertions,
25170
+ expectedAspectCount: allAssertions.length || 1,
25208
25171
  evaluatorRawRequest: {
25209
25172
  aggregator: "threshold",
25210
25173
  threshold
@@ -25221,9 +25184,7 @@ var CompositeEvaluator = class {
25221
25184
  score: member.result.score,
25222
25185
  weight: weights?.[member.id] ?? 1,
25223
25186
  verdict: member.result.verdict,
25224
- hits: [...member.result.hits],
25225
- misses: [...member.result.misses],
25226
- reasoning: member.result.reasoning,
25187
+ assertions: [...member.result.assertions],
25227
25188
  evaluatorRawRequest: member.result.evaluatorRawRequest,
25228
25189
  scores: member.result.scores,
25229
25190
  details: member.result.details
@@ -25232,17 +25193,19 @@ var CompositeEvaluator = class {
25232
25193
  const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
25233
25194
  const parsed = parseJsonSafe(stdout);
25234
25195
  const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
25235
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
25236
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
25237
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
25196
+ const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
25197
+ (a) => typeof a === "object" && a !== null && typeof a.text === "string"
25198
+ ).map((a) => ({
25199
+ text: String(a.text),
25200
+ passed: Boolean(a.passed),
25201
+ ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
25202
+ })) : [];
25238
25203
  const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
25239
25204
  return {
25240
25205
  score,
25241
25206
  verdict,
25242
- hits,
25243
- misses,
25244
- expectedAspectCount: hits.length + misses.length || 1,
25245
- reasoning,
25207
+ assertions,
25208
+ expectedAspectCount: assertions.length || 1,
25246
25209
  evaluatorRawRequest: {
25247
25210
  aggregator: "code-grader",
25248
25211
  script: scriptPath
@@ -25254,10 +25217,8 @@ var CompositeEvaluator = class {
25254
25217
  return {
25255
25218
  score: 0,
25256
25219
  verdict: "fail",
25257
- hits: [],
25258
- misses: [`Code aggregator failed: ${message}`],
25220
+ assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
25259
25221
  expectedAspectCount: 1,
25260
- reasoning: message,
25261
25222
  evaluatorRawRequest: {
25262
25223
  aggregator: "code-grader",
25263
25224
  script: scriptPath,
@@ -25279,9 +25240,7 @@ var CompositeEvaluator = class {
25279
25240
  type: member.type,
25280
25241
  score: member.result.score,
25281
25242
  verdict: member.result.verdict,
25282
- hits: [...member.result.hits],
25283
- misses: [...member.result.misses],
25284
- reasoning: member.result.reasoning,
25243
+ assertions: [...member.result.assertions],
25285
25244
  evaluatorRawRequest: member.result.evaluatorRawRequest,
25286
25245
  scores: member.result.scores,
25287
25246
  details: member.result.details
@@ -25305,16 +25264,12 @@ var CompositeEvaluator = class {
25305
25264
  });
25306
25265
  const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text2));
25307
25266
  const score2 = clampScore(data2.score);
25308
- const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
25309
- const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
25310
- const reasoning2 = data2.reasoning;
25267
+ const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
25311
25268
  return {
25312
25269
  score: score2,
25313
25270
  verdict: scoreToVerdict(score2),
25314
- hits: hits2,
25315
- misses: misses2,
25316
- expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
25317
- reasoning: reasoning2,
25271
+ assertions: assertions2,
25272
+ expectedAspectCount: Math.max(assertions2.length, 1),
25318
25273
  evaluatorRawRequest,
25319
25274
  scores
25320
25275
  };
@@ -25329,16 +25284,12 @@ var CompositeEvaluator = class {
25329
25284
  parseJsonFromText(extractLastAssistantContent(response.output))
25330
25285
  );
25331
25286
  const score = clampScore(data.score);
25332
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
25333
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
25334
- const reasoning = data.reasoning;
25287
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
25335
25288
  return {
25336
25289
  score,
25337
25290
  verdict: scoreToVerdict(score),
25338
- hits,
25339
- misses,
25340
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
25341
- reasoning,
25291
+ assertions,
25292
+ expectedAspectCount: Math.max(assertions.length, 1),
25342
25293
  evaluatorRawRequest,
25343
25294
  scores
25344
25295
  };
@@ -25346,8 +25297,7 @@ var CompositeEvaluator = class {
25346
25297
  return {
25347
25298
  score: 0,
25348
25299
  verdict: "fail",
25349
- hits: [],
25350
- misses: [],
25300
+ assertions: [{ text: "LLM aggregator failed", passed: false }],
25351
25301
  expectedAspectCount: 1,
25352
25302
  evaluatorRawRequest,
25353
25303
  scores
@@ -25368,10 +25318,8 @@ var CostEvaluator = class {
25368
25318
  return {
25369
25319
  score: 0,
25370
25320
  verdict: "fail",
25371
- hits: [],
25372
- misses: ["No cost data available in trace"],
25321
+ assertions: [{ text: "No cost data available in trace", passed: false }],
25373
25322
  expectedAspectCount: 1,
25374
- reasoning: "Execution cost not reported by provider",
25375
25323
  evaluatorRawRequest: {
25376
25324
  type: "cost",
25377
25325
  budget,
@@ -25385,10 +25333,10 @@ var CostEvaluator = class {
25385
25333
  return {
25386
25334
  score,
25387
25335
  verdict: passed ? "pass" : "fail",
25388
- hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
25389
- misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
25336
+ assertions: [
25337
+ passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
25338
+ ],
25390
25339
  expectedAspectCount: 1,
25391
- reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
25392
25340
  evaluatorRawRequest: {
25393
25341
  type: "cost",
25394
25342
  budget,
@@ -25419,10 +25367,8 @@ var ExecutionMetricsEvaluator = class {
25419
25367
  return {
25420
25368
  score: 0,
25421
25369
  verdict: "fail",
25422
- hits: [],
25423
- misses: ["No trace summary available"],
25370
+ assertions: [{ text: "No trace summary available", passed: false }],
25424
25371
  expectedAspectCount: 1,
25425
- reasoning: "Execution metrics not available - no trace summary provided",
25426
25372
  evaluatorRawRequest: {
25427
25373
  type: "execution-metrics",
25428
25374
  config: this.extractConfiguredThresholds(),
@@ -25431,116 +25377,114 @@ var ExecutionMetricsEvaluator = class {
25431
25377
  };
25432
25378
  }
25433
25379
  const narrowedTrace = trace2;
25434
- const hits = [];
25435
- const misses = [];
25380
+ const assertions = [];
25436
25381
  const actualMetrics = {};
25437
25382
  if (max_tool_calls !== void 0 && narrowedTrace) {
25438
25383
  const toolCalls = narrowedTrace.eventCount;
25439
25384
  actualMetrics.tool_calls = toolCalls;
25440
25385
  if (toolCalls <= max_tool_calls) {
25441
- hits.push(`Tool calls ${toolCalls} <= ${max_tool_calls} max`);
25386
+ assertions.push({ text: `Tool calls ${toolCalls} <= ${max_tool_calls} max`, passed: true });
25442
25387
  } else {
25443
- misses.push(`Tool calls ${toolCalls} > ${max_tool_calls} max`);
25388
+ assertions.push({ text: `Tool calls ${toolCalls} > ${max_tool_calls} max`, passed: false });
25444
25389
  }
25445
25390
  }
25446
25391
  if (max_llm_calls !== void 0 && narrowedTrace) {
25447
25392
  const llmCalls = narrowedTrace.llmCallCount;
25448
25393
  if (llmCalls === void 0) {
25449
- misses.push("LLM call count data not available");
25394
+ assertions.push({ text: "LLM call count data not available", passed: false });
25450
25395
  } else {
25451
25396
  actualMetrics.llm_calls = llmCalls;
25452
25397
  if (llmCalls <= max_llm_calls) {
25453
- hits.push(`LLM calls ${llmCalls} <= ${max_llm_calls} max`);
25398
+ assertions.push({ text: `LLM calls ${llmCalls} <= ${max_llm_calls} max`, passed: true });
25454
25399
  } else {
25455
- misses.push(`LLM calls ${llmCalls} > ${max_llm_calls} max`);
25400
+ assertions.push({ text: `LLM calls ${llmCalls} > ${max_llm_calls} max`, passed: false });
25456
25401
  }
25457
25402
  }
25458
25403
  }
25459
25404
  if (max_tokens !== void 0) {
25460
25405
  if (!tokenUsage) {
25461
- misses.push("Token usage data not available");
25406
+ assertions.push({ text: "Token usage data not available", passed: false });
25462
25407
  } else {
25463
25408
  const totalTokens = tokenUsage.input + tokenUsage.output;
25464
25409
  actualMetrics.tokens = totalTokens;
25465
25410
  if (totalTokens <= max_tokens) {
25466
- hits.push(`Total tokens ${totalTokens} <= ${max_tokens} max`);
25411
+ assertions.push({
25412
+ text: `Total tokens ${totalTokens} <= ${max_tokens} max`,
25413
+ passed: true
25414
+ });
25467
25415
  } else {
25468
- misses.push(`Total tokens ${totalTokens} > ${max_tokens} max`);
25416
+ assertions.push({
25417
+ text: `Total tokens ${totalTokens} > ${max_tokens} max`,
25418
+ passed: false
25419
+ });
25469
25420
  }
25470
25421
  }
25471
25422
  }
25472
25423
  if (max_cost_usd !== void 0) {
25473
25424
  if (costUsd === void 0) {
25474
- misses.push("Cost data not available");
25425
+ assertions.push({ text: "Cost data not available", passed: false });
25475
25426
  } else {
25476
25427
  actualMetrics.cost_usd = costUsd;
25477
25428
  const formatCost = (n) => `$${n.toFixed(4)}`;
25478
25429
  if (costUsd <= max_cost_usd) {
25479
- hits.push(`Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`);
25430
+ assertions.push({
25431
+ text: `Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`,
25432
+ passed: true
25433
+ });
25480
25434
  } else {
25481
- misses.push(`Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`);
25435
+ assertions.push({
25436
+ text: `Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`,
25437
+ passed: false
25438
+ });
25482
25439
  }
25483
25440
  }
25484
25441
  }
25485
25442
  if (max_duration_ms !== void 0) {
25486
25443
  if (durationMs === void 0) {
25487
- misses.push("Duration data not available");
25444
+ assertions.push({ text: "Duration data not available", passed: false });
25488
25445
  } else {
25489
25446
  actualMetrics.duration_ms = durationMs;
25490
25447
  if (durationMs <= max_duration_ms) {
25491
- hits.push(`Duration ${durationMs}ms <= ${max_duration_ms}ms max`);
25448
+ assertions.push({
25449
+ text: `Duration ${durationMs}ms <= ${max_duration_ms}ms max`,
25450
+ passed: true
25451
+ });
25492
25452
  } else {
25493
- misses.push(`Duration ${durationMs}ms > ${max_duration_ms}ms max`);
25453
+ assertions.push({
25454
+ text: `Duration ${durationMs}ms > ${max_duration_ms}ms max`,
25455
+ passed: false
25456
+ });
25494
25457
  }
25495
25458
  }
25496
25459
  }
25497
25460
  if (target_exploration_ratio !== void 0 && narrowedTrace) {
25498
25461
  const ratio = explorationRatio(narrowedTrace);
25499
25462
  if (ratio === void 0) {
25500
- misses.push("Exploration ratio not available (no tool calls)");
25463
+ assertions.push({ text: "Exploration ratio not available (no tool calls)", passed: false });
25501
25464
  } else {
25502
25465
  actualMetrics.exploration_ratio = ratio;
25503
25466
  const diff = Math.abs(ratio - target_exploration_ratio);
25504
25467
  if (diff <= exploration_tolerance) {
25505
- hits.push(
25506
- `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`
25507
- );
25468
+ assertions.push({
25469
+ text: `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`,
25470
+ passed: true
25471
+ });
25508
25472
  } else {
25509
- misses.push(
25510
- `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`
25511
- );
25473
+ assertions.push({
25474
+ text: `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`,
25475
+ passed: false
25476
+ });
25512
25477
  }
25513
25478
  }
25514
25479
  }
25515
- const totalChecks = hits.length + misses.length;
25516
- const score = totalChecks > 0 ? hits.length / totalChecks : 0;
25517
- const reasoningParts = [];
25518
- if (actualMetrics.tool_calls !== void 0) {
25519
- reasoningParts.push(`tool_calls=${actualMetrics.tool_calls}`);
25520
- }
25521
- if (actualMetrics.llm_calls !== void 0) {
25522
- reasoningParts.push(`llm_calls=${actualMetrics.llm_calls}`);
25523
- }
25524
- if (actualMetrics.tokens !== void 0) {
25525
- reasoningParts.push(`tokens=${actualMetrics.tokens}`);
25526
- }
25527
- if (actualMetrics.cost_usd !== void 0) {
25528
- reasoningParts.push(`cost=$${actualMetrics.cost_usd.toFixed(4)}`);
25529
- }
25530
- if (actualMetrics.duration_ms !== void 0) {
25531
- reasoningParts.push(`duration=${actualMetrics.duration_ms}ms`);
25532
- }
25533
- if (actualMetrics.exploration_ratio !== void 0) {
25534
- reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
25535
- }
25536
- const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
25480
+ const totalChecks = assertions.length;
25481
+ const passedCount = assertions.filter((a) => a.passed).length;
25482
+ const score = totalChecks > 0 ? passedCount / totalChecks : 0;
25537
25483
  return {
25538
25484
  score,
25539
25485
  verdict: scoreToVerdict(score),
25540
- hits,
25541
- misses,
25486
+ assertions,
25542
25487
  expectedAspectCount: totalChecks || 1,
25543
- reasoning,
25544
25488
  evaluatorRawRequest: {
25545
25489
  type: "execution-metrics",
25546
25490
  config: this.extractConfiguredThresholds(),
@@ -25642,10 +25586,8 @@ var FieldAccuracyEvaluator = class {
25642
25586
  return {
25643
25587
  score: 0,
25644
25588
  verdict: "fail",
25645
- hits: [],
25646
- misses: ["Failed to parse candidate answer as JSON"],
25647
- expectedAspectCount: this.config.fields.length,
25648
- reasoning: "Candidate answer is not valid JSON"
25589
+ assertions: [{ text: "Failed to parse candidate answer as JSON", passed: false }],
25590
+ expectedAspectCount: this.config.fields.length
25649
25591
  };
25650
25592
  }
25651
25593
  const expectedData = this.extractExpectedData(evalCase.expected_output);
@@ -25653,10 +25595,8 @@ var FieldAccuracyEvaluator = class {
25653
25595
  return {
25654
25596
  score: 0,
25655
25597
  verdict: "fail",
25656
- hits: [],
25657
- misses: ["No expected data found in expected_output"],
25658
- expectedAspectCount: this.config.fields.length,
25659
- reasoning: "Could not extract expected data from expected_output"
25598
+ assertions: [{ text: "No expected data found in expected_output", passed: false }],
25599
+ expectedAspectCount: this.config.fields.length
25660
25600
  };
25661
25601
  }
25662
25602
  const fieldResults = [];
@@ -25874,18 +25814,14 @@ var FieldAccuracyEvaluator = class {
25874
25814
  */
25875
25815
  aggregateResults(results) {
25876
25816
  const aggregation = this.config.aggregation ?? "weighted_average";
25877
- const hits = [];
25878
- const misses = [];
25817
+ const assertions = [];
25879
25818
  for (const result of results) {
25880
- if (result.hit) {
25881
- hits.push(result.message);
25882
- } else {
25883
- misses.push(result.message);
25884
- }
25819
+ assertions.push({ text: result.message, passed: result.hit });
25885
25820
  }
25886
25821
  let score;
25887
25822
  if (aggregation === "all_or_nothing") {
25888
- score = misses.length === 0 ? 1 : 0;
25823
+ const hasFailed = assertions.some((a) => !a.passed);
25824
+ score = hasFailed ? 0 : 1;
25889
25825
  } else {
25890
25826
  const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
25891
25827
  if (totalWeight === 0) {
@@ -25895,15 +25831,11 @@ var FieldAccuracyEvaluator = class {
25895
25831
  score = weightedSum / totalWeight;
25896
25832
  }
25897
25833
  }
25898
- const reasoning = `${hits.length}/${results.length} fields matched`;
25899
25834
  return {
25900
25835
  score: clampScore(score),
25901
25836
  verdict: scoreToVerdict(score),
25902
- hits: hits.slice(0, 4),
25903
- // Cap at 4 to keep output concise
25904
- misses: misses.slice(0, 4),
25905
- expectedAspectCount: results.length,
25906
- reasoning
25837
+ assertions,
25838
+ expectedAspectCount: results.length
25907
25839
  };
25908
25840
  }
25909
25841
  };
@@ -26010,10 +25942,8 @@ var LatencyEvaluator = class {
26010
25942
  return {
26011
25943
  score: 0,
26012
25944
  verdict: "fail",
26013
- hits: [],
26014
- misses: ["No duration data available in trace"],
25945
+ assertions: [{ text: "No duration data available in trace", passed: false }],
26015
25946
  expectedAspectCount: 1,
26016
- reasoning: "Execution duration not reported by provider",
26017
25947
  evaluatorRawRequest: {
26018
25948
  type: "latency",
26019
25949
  threshold,
@@ -26026,10 +25956,10 @@ var LatencyEvaluator = class {
26026
25956
  return {
26027
25957
  score,
26028
25958
  verdict: passed ? "pass" : "fail",
26029
- hits: passed ? [`Duration ${durationMs}ms <= ${threshold}ms threshold`] : [],
26030
- misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
25959
+ assertions: [
25960
+ passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
25961
+ ],
26031
25962
  expectedAspectCount: 1,
26032
- reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
26033
25963
  evaluatorRawRequest: {
26034
25964
  type: "latency",
26035
25965
  threshold,
@@ -26103,23 +26033,25 @@ var SkillTriggerEvaluator = class {
26103
26033
  return {
26104
26034
  score: 1,
26105
26035
  verdict: "pass",
26106
- hits: [
26107
- shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`
26036
+ assertions: [
26037
+ {
26038
+ text: shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`,
26039
+ passed: true
26040
+ }
26108
26041
  ],
26109
- misses: [],
26110
- expectedAspectCount: 1,
26111
- reasoning: shouldTrigger ? "Skill triggered correctly" : "No false trigger"
26042
+ expectedAspectCount: 1
26112
26043
  };
26113
26044
  }
26114
26045
  return {
26115
26046
  score: 0,
26116
26047
  verdict: "fail",
26117
- hits: [],
26118
- misses: [
26119
- shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`
26048
+ assertions: [
26049
+ {
26050
+ text: shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`,
26051
+ passed: false
26052
+ }
26120
26053
  ],
26121
- expectedAspectCount: 1,
26122
- reasoning: shouldTrigger ? `Skill "${skillName}" was not triggered` : "False trigger: skill fired when it should not have"
26054
+ expectedAspectCount: 1
26123
26055
  };
26124
26056
  }
26125
26057
  };
@@ -26284,10 +26216,8 @@ var TokenUsageEvaluator = class {
26284
26216
  return {
26285
26217
  score: 0,
26286
26218
  verdict: "fail",
26287
- hits: [],
26288
- misses: ["No token usage data available in trace"],
26219
+ assertions: [{ text: "No token usage data available in trace", passed: false }],
26289
26220
  expectedAspectCount,
26290
- reasoning: "Token usage not reported by provider",
26291
26221
  evaluatorRawRequest: {
26292
26222
  type: "token-usage",
26293
26223
  max_total: maxTotal ?? null,
@@ -26301,37 +26231,34 @@ var TokenUsageEvaluator = class {
26301
26231
  const output = usage.output;
26302
26232
  const cached = usage.cached ?? 0;
26303
26233
  const total = input + output + cached;
26304
- const hits = [];
26305
- const misses = [];
26234
+ const assertions = [];
26306
26235
  if (typeof maxInput === "number") {
26307
26236
  if (input <= maxInput) {
26308
- hits.push(`Input tokens ${input} <= ${maxInput}`);
26237
+ assertions.push({ text: `Input tokens ${input} <= ${maxInput}`, passed: true });
26309
26238
  } else {
26310
- misses.push(`Input tokens ${input} > ${maxInput}`);
26239
+ assertions.push({ text: `Input tokens ${input} > ${maxInput}`, passed: false });
26311
26240
  }
26312
26241
  }
26313
26242
  if (typeof maxOutput === "number") {
26314
26243
  if (output <= maxOutput) {
26315
- hits.push(`Output tokens ${output} <= ${maxOutput}`);
26244
+ assertions.push({ text: `Output tokens ${output} <= ${maxOutput}`, passed: true });
26316
26245
  } else {
26317
- misses.push(`Output tokens ${output} > ${maxOutput}`);
26246
+ assertions.push({ text: `Output tokens ${output} > ${maxOutput}`, passed: false });
26318
26247
  }
26319
26248
  }
26320
26249
  if (typeof maxTotal === "number") {
26321
26250
  if (total <= maxTotal) {
26322
- hits.push(`Total tokens ${total} <= ${maxTotal}`);
26251
+ assertions.push({ text: `Total tokens ${total} <= ${maxTotal}`, passed: true });
26323
26252
  } else {
26324
- misses.push(`Total tokens ${total} > ${maxTotal}`);
26253
+ assertions.push({ text: `Total tokens ${total} > ${maxTotal}`, passed: false });
26325
26254
  }
26326
26255
  }
26327
- const passed = misses.length === 0;
26256
+ const passed = assertions.every((a) => a.passed);
26328
26257
  return {
26329
26258
  score: passed ? 1 : 0,
26330
26259
  verdict: passed ? "pass" : "fail",
26331
- hits,
26332
- misses,
26260
+ assertions,
26333
26261
  expectedAspectCount,
26334
- reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
26335
26262
  evaluatorRawRequest: {
26336
26263
  type: "token-usage",
26337
26264
  max_total: maxTotal ?? null,
@@ -26429,8 +26356,7 @@ var ToolTrajectoryEvaluator = class {
26429
26356
  return {
26430
26357
  score: 0,
26431
26358
  verdict: "fail",
26432
- hits: [],
26433
- misses: ["No trace available for evaluation"],
26359
+ assertions: [{ text: "No trace available for evaluation", passed: false }],
26434
26360
  expectedAspectCount: 1
26435
26361
  };
26436
26362
  }
@@ -26441,8 +26367,7 @@ var ToolTrajectoryEvaluator = class {
26441
26367
  return {
26442
26368
  score: 0,
26443
26369
  verdict: "fail",
26444
- hits: [],
26445
- misses: ["No trace available for evaluation"],
26370
+ assertions: [{ text: "No trace available for evaluation", passed: false }],
26446
26371
  expectedAspectCount: 1
26447
26372
  };
26448
26373
  }
@@ -26460,8 +26385,7 @@ var ToolTrajectoryEvaluator = class {
26460
26385
  return {
26461
26386
  score: 0,
26462
26387
  verdict: "fail",
26463
- hits: [],
26464
- misses: [`Unknown mode: ${this.config.mode}`],
26388
+ assertions: [{ text: `Unknown mode: ${this.config.mode}`, passed: false }],
26465
26389
  expectedAspectCount: 1
26466
26390
  };
26467
26391
  }
@@ -26510,28 +26434,32 @@ var ToolTrajectoryEvaluator = class {
26510
26434
  return {
26511
26435
  score: 1,
26512
26436
  verdict: "pass",
26513
- hits: ["No tool requirements specified"],
26514
- misses: [],
26437
+ assertions: [{ text: "No tool requirements specified", passed: true }],
26515
26438
  expectedAspectCount: 0
26516
26439
  };
26517
26440
  }
26518
- const hits = [];
26519
- const misses = [];
26441
+ const assertions = [];
26520
26442
  for (const toolName of toolNames) {
26521
26443
  const required = minimums[toolName];
26522
26444
  const actual = summary.toolCallsByName[toolName] ?? 0;
26523
26445
  if (actual >= required) {
26524
- hits.push(`${toolName}: called ${actual} times (required >=${required})`);
26446
+ assertions.push({
26447
+ text: `${toolName}: called ${actual} times (required >=${required})`,
26448
+ passed: true
26449
+ });
26525
26450
  } else {
26526
- misses.push(`${toolName}: called ${actual} times (required >=${required})`);
26451
+ assertions.push({
26452
+ text: `${toolName}: called ${actual} times (required >=${required})`,
26453
+ passed: false
26454
+ });
26527
26455
  }
26528
26456
  }
26529
- const score = hits.length / toolNames.length;
26457
+ const passedCount = assertions.filter((a) => a.passed).length;
26458
+ const score = passedCount / toolNames.length;
26530
26459
  return {
26531
26460
  score,
26532
26461
  verdict: scoreToVerdict(score),
26533
- hits,
26534
- misses,
26462
+ assertions,
26535
26463
  expectedAspectCount: toolNames.length
26536
26464
  };
26537
26465
  }
@@ -26541,13 +26469,11 @@ var ToolTrajectoryEvaluator = class {
26541
26469
  return {
26542
26470
  score: 1,
26543
26471
  verdict: "pass",
26544
- hits: ["No tool sequence specified"],
26545
- misses: [],
26472
+ assertions: [{ text: "No tool sequence specified", passed: true }],
26546
26473
  expectedAspectCount: 0
26547
26474
  };
26548
26475
  }
26549
- const hits = [];
26550
- const misses = [];
26476
+ const assertions = [];
26551
26477
  const warnings = [];
26552
26478
  let actualIndex = 0;
26553
26479
  let sequenceHits = 0;
@@ -26567,16 +26493,20 @@ var ToolTrajectoryEvaluator = class {
26567
26493
  const actualCall = toolCalls[actualIndex];
26568
26494
  if (actualCall.name === expectedTool) {
26569
26495
  if (argsMatch(expectedItem.args, actualCall.args, mode)) {
26570
- hits.push(`Found ${expectedTool} at position ${actualIndex}`);
26496
+ assertions.push({
26497
+ text: `Found ${expectedTool} at position ${actualIndex}`,
26498
+ passed: true
26499
+ });
26571
26500
  sequenceHits++;
26572
26501
  matchedCall = actualCall;
26573
26502
  actualIndex++;
26574
26503
  found = true;
26575
26504
  break;
26576
26505
  }
26577
- misses.push(
26578
- `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
26579
- );
26506
+ assertions.push({
26507
+ text: `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`,
26508
+ passed: false
26509
+ });
26580
26510
  actualIndex++;
26581
26511
  argsMismatch = true;
26582
26512
  break;
@@ -26584,7 +26514,10 @@ var ToolTrajectoryEvaluator = class {
26584
26514
  actualIndex++;
26585
26515
  }
26586
26516
  if (!found && !argsMismatch) {
26587
- misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
26517
+ assertions.push({
26518
+ text: `Expected ${expectedTool} at position ${i}, not found in remaining trace`,
26519
+ passed: false
26520
+ });
26588
26521
  }
26589
26522
  if (found && matchedCall) {
26590
26523
  const latencyResult = checkLatency(
@@ -26593,10 +26526,10 @@ var ToolTrajectoryEvaluator = class {
26593
26526
  matchedCall.durationMs
26594
26527
  );
26595
26528
  if (latencyResult.status === "pass") {
26596
- hits.push(latencyResult.message);
26529
+ assertions.push({ text: latencyResult.message, passed: true });
26597
26530
  latencyHits++;
26598
26531
  } else if (latencyResult.status === "fail") {
26599
- misses.push(latencyResult.message);
26532
+ assertions.push({ text: latencyResult.message, passed: false });
26600
26533
  } else if (latencyResult.message) {
26601
26534
  warnings.push(latencyResult.message);
26602
26535
  latencySkips++;
@@ -26612,8 +26545,7 @@ var ToolTrajectoryEvaluator = class {
26612
26545
  return {
26613
26546
  score,
26614
26547
  verdict: scoreToVerdict(score),
26615
- hits,
26616
- misses,
26548
+ assertions,
26617
26549
  expectedAspectCount: totalAssertions
26618
26550
  };
26619
26551
  }
@@ -26623,13 +26555,11 @@ var ToolTrajectoryEvaluator = class {
26623
26555
  return {
26624
26556
  score: 1,
26625
26557
  verdict: "pass",
26626
- hits: ["No tool sequence specified"],
26627
- misses: [],
26558
+ assertions: [{ text: "No tool sequence specified", passed: true }],
26628
26559
  expectedAspectCount: 0
26629
26560
  };
26630
26561
  }
26631
- const hits = [];
26632
- const misses = [];
26562
+ const assertions = [];
26633
26563
  const warnings = [];
26634
26564
  let sequenceHits = 0;
26635
26565
  let latencyHits = 0;
@@ -26638,7 +26568,10 @@ var ToolTrajectoryEvaluator = class {
26638
26568
  (item) => item.maxDurationMs !== void 0
26639
26569
  ).length;
26640
26570
  if (toolCalls.length !== expected.length) {
26641
- misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
26571
+ assertions.push({
26572
+ text: `Expected ${expected.length} tool calls, got ${toolCalls.length}`,
26573
+ passed: false
26574
+ });
26642
26575
  }
26643
26576
  const checkLength = Math.min(expected.length, toolCalls.length);
26644
26577
  for (let i = 0; i < checkLength; i++) {
@@ -26650,14 +26583,17 @@ var ToolTrajectoryEvaluator = class {
26650
26583
  let sequenceMatched = false;
26651
26584
  if (actualTool === expectedTool) {
26652
26585
  if (argsMatch(expectedItem.args, actualCall.args, mode)) {
26653
- hits.push(`Position ${i}: ${expectedTool}`);
26586
+ assertions.push({ text: `Position ${i}: ${expectedTool}`, passed: true });
26654
26587
  sequenceHits++;
26655
26588
  sequenceMatched = true;
26656
26589
  } else {
26657
- misses.push(`Position ${i}: ${expectedTool} args mismatch`);
26590
+ assertions.push({ text: `Position ${i}: ${expectedTool} args mismatch`, passed: false });
26658
26591
  }
26659
26592
  } else {
26660
- misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
26593
+ assertions.push({
26594
+ text: `Position ${i}: expected ${expectedTool}, got ${actualTool}`,
26595
+ passed: false
26596
+ });
26661
26597
  }
26662
26598
  if (sequenceMatched) {
26663
26599
  const latencyResult = checkLatency(
@@ -26666,10 +26602,10 @@ var ToolTrajectoryEvaluator = class {
26666
26602
  actualCall.durationMs
26667
26603
  );
26668
26604
  if (latencyResult.status === "pass") {
26669
- hits.push(latencyResult.message);
26605
+ assertions.push({ text: latencyResult.message, passed: true });
26670
26606
  latencyHits++;
26671
26607
  } else if (latencyResult.status === "fail") {
26672
- misses.push(latencyResult.message);
26608
+ assertions.push({ text: latencyResult.message, passed: false });
26673
26609
  } else if (latencyResult.message) {
26674
26610
  warnings.push(latencyResult.message);
26675
26611
  latencySkips++;
@@ -26677,7 +26613,10 @@ var ToolTrajectoryEvaluator = class {
26677
26613
  }
26678
26614
  }
26679
26615
  for (let i = checkLength; i < expected.length; i++) {
26680
- misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
26616
+ assertions.push({
26617
+ text: `Position ${i}: expected ${expected[i].tool}, got nothing`,
26618
+ passed: false
26619
+ });
26681
26620
  }
26682
26621
  for (const warning of warnings) {
26683
26622
  console.warn(`[tool-trajectory] ${warning}`);
@@ -26688,8 +26627,7 @@ var ToolTrajectoryEvaluator = class {
26688
26627
  return {
26689
26628
  score,
26690
26629
  verdict: scoreToVerdict(score),
26691
- hits,
26692
- misses,
26630
+ assertions,
26693
26631
  expectedAspectCount: totalAssertions
26694
26632
  };
26695
26633
  }
@@ -26704,13 +26642,11 @@ var ToolTrajectoryEvaluator = class {
26704
26642
  return {
26705
26643
  score: 1,
26706
26644
  verdict: "pass",
26707
- hits: ["No expected tools specified"],
26708
- misses: [],
26645
+ assertions: [{ text: "No expected tools specified", passed: true }],
26709
26646
  expectedAspectCount: 0
26710
26647
  };
26711
26648
  }
26712
- const hits = [];
26713
- const misses = [];
26649
+ const assertions = [];
26714
26650
  const consumed = /* @__PURE__ */ new Set();
26715
26651
  for (let i = 0; i < expected.length; i++) {
26716
26652
  const expectedItem = expected[i];
@@ -26721,22 +26657,25 @@ var ToolTrajectoryEvaluator = class {
26721
26657
  if (consumed.has(j)) continue;
26722
26658
  const actualCall = toolCalls[j];
26723
26659
  if (actualCall.name === expectedTool && argsMatch(expectedItem.args, actualCall.args, mode)) {
26724
- hits.push(`Found ${expectedTool} at position ${j}`);
26660
+ assertions.push({ text: `Found ${expectedTool} at position ${j}`, passed: true });
26725
26661
  consumed.add(j);
26726
26662
  found = true;
26727
26663
  break;
26728
26664
  }
26729
26665
  }
26730
26666
  if (!found) {
26731
- misses.push(`Expected ${expectedTool} not found in actual trajectory`);
26667
+ assertions.push({
26668
+ text: `Expected ${expectedTool} not found in actual trajectory`,
26669
+ passed: false
26670
+ });
26732
26671
  }
26733
26672
  }
26734
- const score = expected.length > 0 ? hits.length / expected.length : 1;
26673
+ const passedCount = assertions.filter((a) => a.passed).length;
26674
+ const score = expected.length > 0 ? passedCount / expected.length : 1;
26735
26675
  return {
26736
26676
  score,
26737
26677
  verdict: scoreToVerdict(score),
26738
- hits,
26739
- misses,
26678
+ assertions,
26740
26679
  expectedAspectCount: expected.length
26741
26680
  };
26742
26681
  }
@@ -26752,16 +26691,19 @@ var ToolTrajectoryEvaluator = class {
26752
26691
  return {
26753
26692
  score: 1,
26754
26693
  verdict: "pass",
26755
- hits: ["No tool calls and no expected tools"],
26756
- misses: [],
26694
+ assertions: [{ text: "No tool calls and no expected tools", passed: true }],
26757
26695
  expectedAspectCount: 0
26758
26696
  };
26759
26697
  }
26760
26698
  return {
26761
26699
  score: 0,
26762
26700
  verdict: "fail",
26763
- hits: [],
26764
- misses: [`${toolCalls.length} unexpected tool call(s) with empty allowed list`],
26701
+ assertions: [
26702
+ {
26703
+ text: `${toolCalls.length} unexpected tool call(s) with empty allowed list`,
26704
+ passed: false
26705
+ }
26706
+ ],
26765
26707
  expectedAspectCount: toolCalls.length
26766
26708
  };
26767
26709
  }
@@ -26769,13 +26711,11 @@ var ToolTrajectoryEvaluator = class {
26769
26711
  return {
26770
26712
  score: 1,
26771
26713
  verdict: "pass",
26772
- hits: ["No actual tool calls (trivially a subset)"],
26773
- misses: [],
26714
+ assertions: [{ text: "No actual tool calls (trivially a subset)", passed: true }],
26774
26715
  expectedAspectCount: 0
26775
26716
  };
26776
26717
  }
26777
- const hits = [];
26778
- const misses = [];
26718
+ const assertions = [];
26779
26719
  for (let i = 0; i < toolCalls.length; i++) {
26780
26720
  const actualCall = toolCalls[i];
26781
26721
  let allowed = false;
@@ -26787,17 +26727,23 @@ var ToolTrajectoryEvaluator = class {
26787
26727
  }
26788
26728
  }
26789
26729
  if (allowed) {
26790
- hits.push(`Position ${i}: ${actualCall.name} is in allowed set`);
26730
+ assertions.push({
26731
+ text: `Position ${i}: ${actualCall.name} is in allowed set`,
26732
+ passed: true
26733
+ });
26791
26734
  } else {
26792
- misses.push(`Position ${i}: ${actualCall.name} is not in allowed set`);
26735
+ assertions.push({
26736
+ text: `Position ${i}: ${actualCall.name} is not in allowed set`,
26737
+ passed: false
26738
+ });
26793
26739
  }
26794
26740
  }
26795
- const score = toolCalls.length > 0 ? hits.length / toolCalls.length : 1;
26741
+ const passedCount = assertions.filter((a) => a.passed).length;
26742
+ const score = toolCalls.length > 0 ? passedCount / toolCalls.length : 1;
26796
26743
  return {
26797
26744
  score,
26798
26745
  verdict: scoreToVerdict(score),
26799
- hits,
26800
- misses,
26746
+ assertions,
26801
26747
  expectedAspectCount: toolCalls.length
26802
26748
  };
26803
26749
  }
@@ -26806,8 +26752,12 @@ function runContainsAssertion(output, value) {
26806
26752
  const passed = output.includes(value);
26807
26753
  return {
26808
26754
  score: passed ? 1 : 0,
26809
- hits: passed ? [`Output contains "${value}"`] : [],
26810
- misses: passed ? [] : [`Output does not contain "${value}"`]
26755
+ assertions: [
26756
+ {
26757
+ text: passed ? `Output contains "${value}"` : `Output does not contain "${value}"`,
26758
+ passed
26759
+ }
26760
+ ]
26811
26761
  };
26812
26762
  }
26813
26763
  function runContainsAnyAssertion(output, values) {
@@ -26815,8 +26765,12 @@ function runContainsAnyAssertion(output, values) {
26815
26765
  const passed = matched.length > 0;
26816
26766
  return {
26817
26767
  score: passed ? 1 : 0,
26818
- hits: passed ? [`Output contains "${matched[0]}"`] : [],
26819
- misses: passed ? [] : [`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`]
26768
+ assertions: [
26769
+ {
26770
+ text: passed ? `Output contains "${matched[0]}"` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`,
26771
+ passed
26772
+ }
26773
+ ]
26820
26774
  };
26821
26775
  }
26822
26776
  function runContainsAllAssertion(output, values) {
@@ -26824,16 +26778,24 @@ function runContainsAllAssertion(output, values) {
26824
26778
  const passed = missing.length === 0;
26825
26779
  return {
26826
26780
  score: passed ? 1 : 0,
26827
- hits: passed ? [`Output contains all ${values.length} expected strings`] : [],
26828
- misses: passed ? [] : [`Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`]
26781
+ assertions: [
26782
+ {
26783
+ text: passed ? `Output contains all ${values.length} expected strings` : `Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`,
26784
+ passed
26785
+ }
26786
+ ]
26829
26787
  };
26830
26788
  }
26831
26789
  function runIcontainsAssertion(output, value) {
26832
26790
  const passed = output.toLowerCase().includes(value.toLowerCase());
26833
26791
  return {
26834
26792
  score: passed ? 1 : 0,
26835
- hits: passed ? [`Output contains "${value}" (case-insensitive)`] : [],
26836
- misses: passed ? [] : [`Output does not contain "${value}" (case-insensitive)`]
26793
+ assertions: [
26794
+ {
26795
+ text: passed ? `Output contains "${value}" (case-insensitive)` : `Output does not contain "${value}" (case-insensitive)`,
26796
+ passed
26797
+ }
26798
+ ]
26837
26799
  };
26838
26800
  }
26839
26801
  function runIcontainsAnyAssertion(output, values) {
@@ -26842,9 +26804,11 @@ function runIcontainsAnyAssertion(output, values) {
26842
26804
  const passed = matched.length > 0;
26843
26805
  return {
26844
26806
  score: passed ? 1 : 0,
26845
- hits: passed ? [`Output contains "${matched[0]}" (case-insensitive)`] : [],
26846
- misses: passed ? [] : [
26847
- `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`
26807
+ assertions: [
26808
+ {
26809
+ text: passed ? `Output contains "${matched[0]}" (case-insensitive)` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`,
26810
+ passed
26811
+ }
26848
26812
  ]
26849
26813
  };
26850
26814
  }
@@ -26854,24 +26818,36 @@ function runIcontainsAllAssertion(output, values) {
26854
26818
  const passed = missing.length === 0;
26855
26819
  return {
26856
26820
  score: passed ? 1 : 0,
26857
- hits: passed ? [`Output contains all ${values.length} expected strings (case-insensitive)`] : [],
26858
- misses: passed ? [] : [`Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`]
26821
+ assertions: [
26822
+ {
26823
+ text: passed ? `Output contains all ${values.length} expected strings (case-insensitive)` : `Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`,
26824
+ passed
26825
+ }
26826
+ ]
26859
26827
  };
26860
26828
  }
26861
26829
  function runStartsWithAssertion(output, value) {
26862
26830
  const passed = output.trim().startsWith(value.trim());
26863
26831
  return {
26864
26832
  score: passed ? 1 : 0,
26865
- hits: passed ? [`Output starts with "${value}"`] : [],
26866
- misses: passed ? [] : [`Output does not start with "${value}"`]
26833
+ assertions: [
26834
+ {
26835
+ text: passed ? `Output starts with "${value}"` : `Output does not start with "${value}"`,
26836
+ passed
26837
+ }
26838
+ ]
26867
26839
  };
26868
26840
  }
26869
26841
  function runEndsWithAssertion(output, value) {
26870
26842
  const passed = output.trim().endsWith(value.trim());
26871
26843
  return {
26872
26844
  score: passed ? 1 : 0,
26873
- hits: passed ? [`Output ends with "${value}"`] : [],
26874
- misses: passed ? [] : [`Output does not end with "${value}"`]
26845
+ assertions: [
26846
+ {
26847
+ text: passed ? `Output ends with "${value}"` : `Output does not end with "${value}"`,
26848
+ passed
26849
+ }
26850
+ ]
26875
26851
  };
26876
26852
  }
26877
26853
  function runRegexAssertion(output, pattern, flags) {
@@ -26880,8 +26856,12 @@ function runRegexAssertion(output, pattern, flags) {
26880
26856
  const flagsLabel = flags ? ` (flags: ${flags})` : "";
26881
26857
  return {
26882
26858
  score: passed ? 1 : 0,
26883
- hits: passed ? [`Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}`] : [],
26884
- misses: passed ? [] : [`Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`]
26859
+ assertions: [
26860
+ {
26861
+ text: passed ? `Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}` : `Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`,
26862
+ passed
26863
+ }
26864
+ ]
26885
26865
  };
26886
26866
  }
26887
26867
  function runIsJsonAssertion(output) {
@@ -26893,16 +26873,24 @@ function runIsJsonAssertion(output) {
26893
26873
  }
26894
26874
  return {
26895
26875
  score: passed ? 1 : 0,
26896
- hits: passed ? ["Output is valid JSON"] : [],
26897
- misses: passed ? [] : ["Output is not valid JSON"]
26876
+ assertions: [
26877
+ {
26878
+ text: passed ? "Output is valid JSON" : "Output is not valid JSON",
26879
+ passed
26880
+ }
26881
+ ]
26898
26882
  };
26899
26883
  }
26900
26884
  function runEqualsAssertion(output, value) {
26901
26885
  const passed = output.trim() === value.trim();
26902
26886
  return {
26903
26887
  score: passed ? 1 : 0,
26904
- hits: passed ? [`Output equals "${value}"`] : [],
26905
- misses: passed ? [] : [`Output does not equal "${value}"`]
26888
+ assertions: [
26889
+ {
26890
+ text: passed ? `Output equals "${value}"` : `Output does not equal "${value}"`,
26891
+ passed
26892
+ }
26893
+ ]
26906
26894
  };
26907
26895
  }
26908
26896
  var Node = class {
@@ -27101,10 +27089,8 @@ var InlineAssertEvaluator = class {
27101
27089
  return {
27102
27090
  score,
27103
27091
  verdict: scoreToVerdict(score),
27104
- hits: score >= 0.8 ? [result.name] : [],
27105
- misses: score < 0.5 ? [result.name] : [],
27092
+ assertions: [{ text: result.name, passed: score >= 0.5 }],
27106
27093
  expectedAspectCount: 1,
27107
- reasoning: void 0,
27108
27094
  details: result.metadata ? result.metadata : void 0
27109
27095
  };
27110
27096
  }
@@ -27292,9 +27278,7 @@ var containsFactory = (config) => {
27292
27278
  return {
27293
27279
  score: result.score,
27294
27280
  verdict: result.score === 1 ? "pass" : "fail",
27295
- hits: result.hits,
27296
- misses: result.misses,
27297
- reasoning: result.score === 1 ? `Output contains "${c.value}"` : `Output does not contain "${c.value}"`,
27281
+ assertions: result.assertions,
27298
27282
  expectedAspectCount: 1
27299
27283
  };
27300
27284
  });
@@ -27306,9 +27290,7 @@ var regexFactory = (config) => {
27306
27290
  return {
27307
27291
  score: result.score,
27308
27292
  verdict: result.score === 1 ? "pass" : "fail",
27309
- hits: result.hits,
27310
- misses: result.misses,
27311
- reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
27293
+ assertions: result.assertions,
27312
27294
  expectedAspectCount: 1
27313
27295
  };
27314
27296
  });
@@ -27319,9 +27301,7 @@ var isJsonFactory = () => {
27319
27301
  return {
27320
27302
  score: result.score,
27321
27303
  verdict: result.score === 1 ? "pass" : "fail",
27322
- hits: result.hits,
27323
- misses: result.misses,
27324
- reasoning: result.score === 1 ? "Output is valid JSON" : "Output is not valid JSON",
27304
+ assertions: result.assertions,
27325
27305
  expectedAspectCount: 1
27326
27306
  };
27327
27307
  });
@@ -27333,9 +27313,7 @@ var equalsFactory = (config) => {
27333
27313
  return {
27334
27314
  score: result.score,
27335
27315
  verdict: result.score === 1 ? "pass" : "fail",
27336
- hits: result.hits,
27337
- misses: result.misses,
27338
- reasoning: result.score === 1 ? `Output equals "${c.value}"` : `Output does not equal "${c.value}"`,
27316
+ assertions: result.assertions,
27339
27317
  expectedAspectCount: 1
27340
27318
  };
27341
27319
  });
@@ -27347,9 +27325,7 @@ var containsAnyFactory = (config) => {
27347
27325
  return {
27348
27326
  score: result.score,
27349
27327
  verdict: result.score === 1 ? "pass" : "fail",
27350
- hits: result.hits,
27351
- misses: result.misses,
27352
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27328
+ assertions: result.assertions,
27353
27329
  expectedAspectCount: 1
27354
27330
  };
27355
27331
  });
@@ -27361,9 +27337,7 @@ var containsAllFactory = (config) => {
27361
27337
  return {
27362
27338
  score: result.score,
27363
27339
  verdict: result.score === 1 ? "pass" : "fail",
27364
- hits: result.hits,
27365
- misses: result.misses,
27366
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27340
+ assertions: result.assertions,
27367
27341
  expectedAspectCount: 1
27368
27342
  };
27369
27343
  });
@@ -27375,9 +27349,7 @@ var icontainsFactory = (config) => {
27375
27349
  return {
27376
27350
  score: result.score,
27377
27351
  verdict: result.score === 1 ? "pass" : "fail",
27378
- hits: result.hits,
27379
- misses: result.misses,
27380
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27352
+ assertions: result.assertions,
27381
27353
  expectedAspectCount: 1
27382
27354
  };
27383
27355
  });
@@ -27389,9 +27361,7 @@ var icontainsAnyFactory = (config) => {
27389
27361
  return {
27390
27362
  score: result.score,
27391
27363
  verdict: result.score === 1 ? "pass" : "fail",
27392
- hits: result.hits,
27393
- misses: result.misses,
27394
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27364
+ assertions: result.assertions,
27395
27365
  expectedAspectCount: 1
27396
27366
  };
27397
27367
  });
@@ -27403,9 +27373,7 @@ var icontainsAllFactory = (config) => {
27403
27373
  return {
27404
27374
  score: result.score,
27405
27375
  verdict: result.score === 1 ? "pass" : "fail",
27406
- hits: result.hits,
27407
- misses: result.misses,
27408
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27376
+ assertions: result.assertions,
27409
27377
  expectedAspectCount: 1
27410
27378
  };
27411
27379
  });
@@ -27417,9 +27385,7 @@ var startsWithFactory = (config) => {
27417
27385
  return {
27418
27386
  score: result.score,
27419
27387
  verdict: result.score === 1 ? "pass" : "fail",
27420
- hits: result.hits,
27421
- misses: result.misses,
27422
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27388
+ assertions: result.assertions,
27423
27389
  expectedAspectCount: 1
27424
27390
  };
27425
27391
  });
@@ -27431,9 +27397,7 @@ var endsWithFactory = (config) => {
27431
27397
  return {
27432
27398
  score: result.score,
27433
27399
  verdict: result.score === 1 ? "pass" : "fail",
27434
- hits: result.hits,
27435
- misses: result.misses,
27436
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27400
+ assertions: result.assertions,
27437
27401
  expectedAspectCount: 1
27438
27402
  };
27439
27403
  });
@@ -28462,7 +28426,7 @@ async function runEvaluation(options) {
28462
28426
  if (!cliModel) {
28463
28427
  throw new Error('--grader-target "agentv" requires --model (e.g., "openai:gpt-5-mini")');
28464
28428
  }
28465
- const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-HDSAUUEF-LUBMM7TH.js");
28429
+ const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-NFFLXG5M-TJAWCWCX.js");
28466
28430
  return new AgentvProvider2("agentv", { model: cliModel, temperature: 0 });
28467
28431
  }
28468
28432
  const overrideTarget = resolveTargetByName(cliGraderTarget);
@@ -28797,8 +28761,7 @@ async function runEvaluation(options) {
28797
28761
  testId: evalCase.id,
28798
28762
  dataset: evalCase.dataset,
28799
28763
  score: 0,
28800
- hits: [],
28801
- misses: [],
28764
+ assertions: [],
28802
28765
  answer: "",
28803
28766
  target: target.name,
28804
28767
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
@@ -28834,8 +28797,7 @@ async function runEvaluation(options) {
28834
28797
  testId: evalCase.id,
28835
28798
  dataset: evalCase.dataset,
28836
28799
  score: 0,
28837
- hits: [],
28838
- misses: [],
28800
+ assertions: [],
28839
28801
  answer: "",
28840
28802
  target: target.name,
28841
28803
  error: errorMsg,
@@ -29802,11 +29764,9 @@ async function evaluateCandidate(options) {
29802
29764
  dataset: evalCase.dataset,
29803
29765
  conversationId: evalCase.conversation_id,
29804
29766
  score: score.score,
29805
- hits: score.hits,
29806
- misses: score.misses,
29767
+ assertions: score.assertions,
29807
29768
  answer: candidate,
29808
29769
  target: target.name,
29809
- reasoning: score.reasoning,
29810
29770
  tokenUsage,
29811
29771
  costUsd,
29812
29772
  durationMs,
@@ -29980,9 +29940,7 @@ async function runEvaluatorList(options) {
29980
29940
  score: score2.score,
29981
29941
  weight,
29982
29942
  verdict: score2.verdict,
29983
- hits: score2.hits,
29984
- misses: score2.misses,
29985
- reasoning: score2.reasoning,
29943
+ assertions: score2.assertions,
29986
29944
  evaluatorProviderRequest: score2.evaluatorRawRequest,
29987
29945
  details: score2.details,
29988
29946
  scores: mapChildResults(score2.scores),
@@ -29997,10 +29955,10 @@ async function runEvaluatorList(options) {
29997
29955
  const fallbackScore = {
29998
29956
  score: 0,
29999
29957
  verdict: "fail",
30000
- hits: [],
30001
- misses: [`Evaluator '${evaluatorConfig.name}' failed: ${message}`],
30002
- expectedAspectCount: 1,
30003
- reasoning: message
29958
+ assertions: [
29959
+ { text: `Evaluator '${evaluatorConfig.name}' failed: ${message}`, passed: false }
29960
+ ],
29961
+ expectedAspectCount: 1
30004
29962
  };
30005
29963
  const weight = evaluatorConfig.weight ?? 1;
30006
29964
  scored.push({
@@ -30016,9 +29974,12 @@ async function runEvaluatorList(options) {
30016
29974
  score: 0,
30017
29975
  weight,
30018
29976
  verdict: "fail",
30019
- hits: [],
30020
- misses: [`Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`],
30021
- reasoning: message,
29977
+ assertions: [
29978
+ {
29979
+ text: `Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
29980
+ passed: false
29981
+ }
29982
+ ],
30022
29983
  durationMs: endedAt.getTime() - startedAt.getTime(),
30023
29984
  startedAt: startedAt.toISOString(),
30024
29985
  endedAt: endedAt.toISOString()
@@ -30034,9 +29995,7 @@ async function runEvaluatorList(options) {
30034
29995
  ...scores[lastScoresIdx],
30035
29996
  score: negated.score,
30036
29997
  verdict: negated.verdict,
30037
- hits: [...negated.hits],
30038
- misses: [...negated.misses],
30039
- reasoning: negated.reasoning
29998
+ assertions: [...negated.assertions]
30040
29999
  };
30041
30000
  }
30042
30001
  }
@@ -30051,21 +30010,13 @@ async function runEvaluatorList(options) {
30051
30010
  const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
30052
30011
  scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
30053
30012
  ) : 0;
30054
- const hits = scored.flatMap((entry) => entry.score.hits);
30055
- const misses = scored.flatMap((entry) => entry.score.misses);
30056
- const expectedAspectCount = scored.reduce(
30057
- (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
30058
- 0
30059
- );
30060
- const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
30061
- const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
30013
+ const assertions = scored.flatMap((entry) => entry.score.assertions);
30014
+ const expectedAspectCount = assertions.length || 1;
30062
30015
  const score = {
30063
30016
  score: aggregateScore,
30064
30017
  verdict: scoreToVerdict(aggregateScore),
30065
- hits,
30066
- misses,
30067
- expectedAspectCount,
30068
- reasoning
30018
+ assertions,
30019
+ expectedAspectCount
30069
30020
  };
30070
30021
  return { score, scores };
30071
30022
  }
@@ -30169,8 +30120,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
30169
30120
  dataset: evalCase.dataset,
30170
30121
  conversationId: evalCase.conversation_id,
30171
30122
  score: 0,
30172
- hits: [],
30173
- misses: [`Error: ${message}`],
30123
+ assertions: [{ text: `Error: ${message}`, passed: false }],
30174
30124
  answer: `Error occurred: ${message}`,
30175
30125
  target: targetName,
30176
30126
  requests,
@@ -30280,9 +30230,7 @@ function mapChildResults(children) {
30280
30230
  score: child.score,
30281
30231
  weight: child.weight,
30282
30232
  verdict: child.verdict,
30283
- hits: child.hits,
30284
- misses: child.misses,
30285
- reasoning: child.reasoning,
30233
+ assertions: child.assertions,
30286
30234
  evaluatorProviderRequest: child.evaluatorRawRequest,
30287
30235
  scores: mapChildResults(child.scores),
30288
30236
  details: child.details,
@@ -31150,7 +31098,6 @@ export {
31150
31098
  isJsonValue,
31151
31099
  isTestMessage,
31152
31100
  isEvaluatorKind,
31153
- getHitCount,
31154
31101
  fileExists,
31155
31102
  normalizeLineEndings,
31156
31103
  readTextFile,
@@ -31290,4 +31237,4 @@ export {
31290
31237
  OtelStreamingObserver,
31291
31238
  createAgentKernel
31292
31239
  };
31293
- //# sourceMappingURL=chunk-GOZV2HN2.js.map
31240
+ //# sourceMappingURL=chunk-D6G4N2H2.js.map