@agentv/core 3.4.0 → 3.6.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -55,7 +55,7 @@ function createLanguageModel(modelString) {
55
55
  case "anthropic":
56
56
  return (0, import_anthropic.createAnthropic)()(modelName);
57
57
  case "azure":
58
- return (0, import_azure.createAzure)()(modelName);
58
+ return (0, import_azure.createAzure)().chat(modelName);
59
59
  case "google":
60
60
  return (0, import_google.createGoogleGenerativeAI)()(modelName);
61
61
  default:
@@ -1580,7 +1580,6 @@ __export(index_exports, {
1580
1580
  freeformEvaluationSchema: () => freeformEvaluationSchema,
1581
1581
  generateRubrics: () => generateRubrics,
1582
1582
  getAgentvHome: () => getAgentvHome,
1583
- getHitCount: () => getHitCount,
1584
1583
  getOutputFilenames: () => getOutputFilenames,
1585
1584
  getSubagentsRoot: () => getSubagentsRoot,
1586
1585
  getTraceStateRoot: () => getTraceStateRoot,
@@ -1730,9 +1729,6 @@ var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
1730
1729
  function isEvaluatorKind(value) {
1731
1730
  return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
1732
1731
  }
1733
- function getHitCount(result) {
1734
- return result.hits.length;
1735
- }
1736
1732
 
1737
1733
  // src/evaluation/trace.ts
1738
1734
  function computeTraceSummary(messages) {
@@ -2449,14 +2445,8 @@ var import_promises5 = require("fs/promises");
2449
2445
 
2450
2446
  // src/evaluation/template-variables.ts
2451
2447
  var TEMPLATE_VARIABLES = {
2452
- /** @deprecated Use OUTPUT_TEXT instead */
2453
- ANSWER: "answer",
2454
2448
  EXPECTED_OUTPUT: "expected_output",
2455
- /** @deprecated Use INPUT_TEXT instead */
2456
- QUESTION: "question",
2457
2449
  CRITERIA: "criteria",
2458
- /** @deprecated Use EXPECTED_OUTPUT_TEXT instead */
2459
- REFERENCE_ANSWER: "reference_answer",
2460
2450
  INPUT: "input",
2461
2451
  OUTPUT: "output",
2462
2452
  FILE_CHANGES: "file_changes",
@@ -2466,9 +2456,8 @@ var TEMPLATE_VARIABLES = {
2466
2456
  };
2467
2457
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
2468
2458
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
2469
- TEMPLATE_VARIABLES.ANSWER,
2470
- TEMPLATE_VARIABLES.EXPECTED_OUTPUT,
2471
- TEMPLATE_VARIABLES.OUTPUT_TEXT
2459
+ TEMPLATE_VARIABLES.OUTPUT_TEXT,
2460
+ TEMPLATE_VARIABLES.EXPECTED_OUTPUT
2472
2461
  ]);
2473
2462
 
2474
2463
  // src/evaluation/validation/prompt-validator.ts
@@ -2491,13 +2480,13 @@ function validateTemplateVariables(content, source) {
2491
2480
  }
2492
2481
  match = variablePattern.exec(content);
2493
2482
  }
2494
- const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.ANSWER) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
2483
+ const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
2495
2484
  const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
2496
2485
  const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
2497
2486
  if (!hasRequiredFields) {
2498
2487
  throw new Error(
2499
2488
  `Missing required fields. Must include at least one of:
2500
- - {{ ${TEMPLATE_VARIABLES.ANSWER} }} or {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
2489
+ - {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
2501
2490
  - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
2502
2491
  );
2503
2492
  }
@@ -5576,7 +5565,7 @@ var AzureProvider = class {
5576
5565
  };
5577
5566
  this.retryConfig = config.retry;
5578
5567
  const azure = (0, import_azure2.createAzure)(buildAzureOptions(config));
5579
- this.model = azure(config.deploymentName);
5568
+ this.model = azure.chat(config.deploymentName);
5580
5569
  }
5581
5570
  id;
5582
5571
  kind = "azure";
@@ -5799,6 +5788,8 @@ async function invokeModel(options) {
5799
5788
  const { model, request, defaults, retryConfig, providerOptions } = options;
5800
5789
  const chatPrompt = buildChatPrompt(request);
5801
5790
  const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
5791
+ const startTime = (/* @__PURE__ */ new Date()).toISOString();
5792
+ const startMs = Date.now();
5802
5793
  const result = await withRetry(
5803
5794
  () => (0, import_ai.generateText)({
5804
5795
  model,
@@ -5812,9 +5803,11 @@ async function invokeModel(options) {
5812
5803
  retryConfig,
5813
5804
  request.signal
5814
5805
  );
5815
- return mapResponse(result);
5806
+ const endTime = (/* @__PURE__ */ new Date()).toISOString();
5807
+ const durationMs = Date.now() - startMs;
5808
+ return mapResponse(result, { durationMs, startTime, endTime });
5816
5809
  }
5817
- function mapResponse(result) {
5810
+ function mapResponse(result, timing) {
5818
5811
  const content = result.text ?? "";
5819
5812
  const rawUsage = result.totalUsage ?? result.usage;
5820
5813
  const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
@@ -5829,7 +5822,10 @@ function mapResponse(result) {
5829
5822
  raw: result,
5830
5823
  usage: toJsonObject(rawUsage),
5831
5824
  output: [{ role: "assistant", content }],
5832
- tokenUsage
5825
+ tokenUsage,
5826
+ durationMs: timing?.durationMs,
5827
+ startTime: timing?.startTime,
5828
+ endTime: timing?.endTime
5833
5829
  };
5834
5830
  }
5835
5831
  function toJsonObject(value) {
@@ -6707,10 +6703,12 @@ var ClaudeSdkProvider = class {
6707
6703
  if (usage) {
6708
6704
  const inputTokens = (usage.input_tokens ?? 0) + (usage.cache_read_input_tokens ?? 0) + (usage.cache_creation_input_tokens ?? 0);
6709
6705
  const outputTokens = usage.output_tokens ?? 0;
6706
+ const reasoningTokens = usage.reasoning_tokens ?? void 0;
6710
6707
  tokenUsage = {
6711
6708
  input: inputTokens,
6712
6709
  output: outputTokens,
6713
- cached: usage.cache_read_input_tokens ?? void 0
6710
+ cached: usage.cache_read_input_tokens ?? void 0,
6711
+ reasoning: reasoningTokens
6714
6712
  };
6715
6713
  request.streamCallbacks?.onLlmCallEnd?.(this.config.model ?? "claude", tokenUsage);
6716
6714
  }
@@ -7724,7 +7722,8 @@ ${basePrompt}` : basePrompt;
7724
7722
  onUsage({
7725
7723
  input: usage.input_tokens ?? 0,
7726
7724
  output: usage.output_tokens ?? 0,
7727
- cached: usage.cached_input_tokens ?? void 0
7725
+ cached: usage.cached_input_tokens ?? void 0,
7726
+ reasoning: usage.reasoning_tokens ?? void 0
7728
7727
  });
7729
7728
  }
7730
7729
  }
@@ -9739,10 +9738,12 @@ function extractTokenUsage(events) {
9739
9738
  output: output ?? 0
9740
9739
  };
9741
9740
  const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
9742
- if (cached !== void 0) {
9743
- return { ...result, cached };
9744
- }
9745
- return result;
9741
+ const reasoning = toFiniteNumber(u.reasoning_tokens ?? u.reasoningTokens ?? u.reasoning);
9742
+ return {
9743
+ ...result,
9744
+ ...cached !== void 0 ? { cached } : {},
9745
+ ...reasoning !== void 0 ? { reasoning } : {}
9746
+ };
9746
9747
  }
9747
9748
  }
9748
9749
  const messages = record.messages;
@@ -12807,9 +12808,11 @@ function negateScore(score) {
12807
12808
  ...score,
12808
12809
  score: negatedScore,
12809
12810
  verdict: negatedVerdict,
12810
- reasoning: score.reasoning ? `[Negated] ${score.reasoning} (original score: ${score.score.toFixed(2)})` : `[Negated] Original score: ${score.score.toFixed(2)}`,
12811
- hits: score.misses,
12812
- misses: score.hits
12811
+ assertions: score.assertions.map((a) => ({
12812
+ ...a,
12813
+ passed: !a.passed,
12814
+ evidence: a.evidence ? `[Negated] ${a.evidence}` : void 0
12815
+ }))
12813
12816
  };
12814
12817
  }
12815
12818
 
@@ -13267,11 +13270,9 @@ var CodeEvaluator = class {
13267
13270
  }
13268
13271
  }
13269
13272
  const payload = {
13270
- question: context2.evalCase.question,
13271
13273
  criteria: context2.evalCase.criteria,
13272
13274
  expectedOutput: context2.evalCase.expected_output,
13273
- referenceAnswer: context2.evalCase.reference_answer,
13274
- answer: context2.candidate,
13275
+ outputText: context2.candidate,
13275
13276
  output: outputForPayload,
13276
13277
  outputPath,
13277
13278
  guidelineFiles: context2.evalCase.guideline_paths,
@@ -13288,9 +13289,7 @@ var CodeEvaluator = class {
13288
13289
  fileChanges: context2.fileChanges ?? null,
13289
13290
  workspacePath: context2.workspacePath ?? null,
13290
13291
  config: this.config ?? null,
13291
- // Text convenience accessors (new names, always strings)
13292
13292
  inputText: context2.evalCase.question,
13293
- outputText: context2.candidate,
13294
13293
  expectedOutputText: context2.evalCase.reference_answer ?? ""
13295
13294
  };
13296
13295
  const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
@@ -13324,9 +13323,13 @@ var CodeEvaluator = class {
13324
13323
  );
13325
13324
  const parsed = parseJsonSafe(stdout);
13326
13325
  const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
13327
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
13328
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
13329
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
13326
+ const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
13327
+ (a) => typeof a === "object" && a !== null && typeof a.text === "string"
13328
+ ).map((a) => ({
13329
+ text: String(a.text),
13330
+ passed: Boolean(a.passed),
13331
+ ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
13332
+ })) : [];
13330
13333
  const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
13331
13334
  const proxyUsage = getProxyUsage?.();
13332
13335
  const evaluatorRawRequest = {
@@ -13342,10 +13345,8 @@ var CodeEvaluator = class {
13342
13345
  return {
13343
13346
  score,
13344
13347
  verdict: scoreToVerdict(score),
13345
- hits,
13346
- misses,
13347
- expectedAspectCount: hits.length + misses.length || 1,
13348
- reasoning,
13348
+ assertions,
13349
+ expectedAspectCount: assertions.length || 1,
13349
13350
  evaluatorRawRequest,
13350
13351
  ...details ? { details } : {},
13351
13352
  tokenUsage: proxyUsage?.tokenUsage
@@ -13356,10 +13357,8 @@ var CodeEvaluator = class {
13356
13357
  return {
13357
13358
  score: 0,
13358
13359
  verdict: "fail",
13359
- hits: [],
13360
- misses: [`Code evaluator failed: ${message}`],
13360
+ assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
13361
13361
  expectedAspectCount: 1,
13362
- reasoning: message,
13363
13362
  evaluatorRawRequest: {
13364
13363
  command: this.command,
13365
13364
  ...this.cwd ? { cwd: this.cwd } : {},
@@ -13490,18 +13489,22 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
13490
13489
  {{${TEMPLATE_VARIABLES.CRITERIA}}}
13491
13490
 
13492
13491
  [[ ## question ## ]]
13493
- {{${TEMPLATE_VARIABLES.QUESTION}}}
13492
+ {{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
13494
13493
 
13495
13494
  [[ ## reference_answer ## ]]
13496
- {{${TEMPLATE_VARIABLES.REFERENCE_ANSWER}}}
13495
+ {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
13497
13496
 
13498
13497
  [[ ## answer ## ]]
13499
- {{${TEMPLATE_VARIABLES.ANSWER}}}`;
13498
+ {{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
13500
13499
  var freeformEvaluationSchema = import_zod4.z.object({
13501
13500
  score: import_zod4.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
13502
- hits: import_zod4.z.array(import_zod4.z.string()).describe("Brief specific achievements").optional(),
13503
- misses: import_zod4.z.array(import_zod4.z.string()).describe("Brief failures or omissions").optional(),
13504
- reasoning: import_zod4.z.string().describe("Concise explanation (1-2 sentences)").optional()
13501
+ assertions: import_zod4.z.array(
13502
+ import_zod4.z.object({
13503
+ text: import_zod4.z.string().describe("Brief description of what was checked"),
13504
+ passed: import_zod4.z.boolean().describe("Whether this aspect was satisfied"),
13505
+ evidence: import_zod4.z.string().describe("Concise evidence (1-2 sentences)").optional()
13506
+ })
13507
+ ).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
13505
13508
  });
13506
13509
  var rubricCheckResultSchema = import_zod4.z.object({
13507
13510
  id: import_zod4.z.string().describe("The ID of the rubric item being checked"),
@@ -13570,12 +13573,8 @@ var LlmGraderEvaluator = class {
13570
13573
  2
13571
13574
  ),
13572
13575
  [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context2.output ?? [], null, 2),
13573
- [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
13574
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
13575
13576
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
13576
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
13577
13577
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
13578
- // Text convenience accessors (new names, always strings)
13579
13578
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
13580
13579
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
13581
13580
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
@@ -13603,17 +13602,12 @@ ${context2.fileChanges}`;
13603
13602
  schema: freeformEvaluationSchema
13604
13603
  });
13605
13604
  const score = clampScore(data.score);
13606
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
13607
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
13608
- const reasoning = data.reasoning;
13609
- const expectedAspectCount = Math.max(hits.length + misses.length, 1);
13605
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
13610
13606
  return {
13611
13607
  score,
13612
13608
  verdict: scoreToVerdict(score),
13613
- hits,
13614
- misses,
13615
- expectedAspectCount,
13616
- reasoning,
13609
+ assertions,
13610
+ expectedAspectCount: Math.max(assertions.length, 1),
13617
13611
  evaluatorRawRequest,
13618
13612
  tokenUsage
13619
13613
  };
@@ -13624,10 +13618,8 @@ ${context2.fileChanges}`;
13624
13618
  return {
13625
13619
  score: 0,
13626
13620
  verdict: "skip",
13627
- hits: [],
13628
- misses: [`Grader parse failure after 3 attempts: ${message}`],
13621
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
13629
13622
  expectedAspectCount: 1,
13630
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
13631
13623
  evaluatorRawRequest
13632
13624
  };
13633
13625
  }
@@ -13657,14 +13649,12 @@ ${context2.fileChanges}`;
13657
13649
  userPrompt: prompt,
13658
13650
  schema: rubricEvaluationSchema
13659
13651
  });
13660
- const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
13652
+ const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
13661
13653
  return {
13662
13654
  score,
13663
13655
  verdict,
13664
- hits,
13665
- misses,
13656
+ assertions,
13666
13657
  expectedAspectCount: rubrics.length,
13667
- reasoning: data.overall_reasoning,
13668
13658
  evaluatorRawRequest,
13669
13659
  tokenUsage
13670
13660
  };
@@ -13675,10 +13665,8 @@ ${context2.fileChanges}`;
13675
13665
  return {
13676
13666
  score: 0,
13677
13667
  verdict: "skip",
13678
- hits: [],
13679
- misses: [`Grader parse failure after 3 attempts: ${message}`],
13668
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
13680
13669
  expectedAspectCount: rubrics.length,
13681
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
13682
13670
  evaluatorRawRequest
13683
13671
  };
13684
13672
  }
@@ -13703,14 +13691,12 @@ ${context2.fileChanges}`;
13703
13691
  userPrompt: prompt,
13704
13692
  schema: scoreRangeEvaluationSchema
13705
13693
  });
13706
- const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
13694
+ const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
13707
13695
  return {
13708
13696
  score,
13709
13697
  verdict,
13710
- hits,
13711
- misses,
13698
+ assertions,
13712
13699
  expectedAspectCount: rubrics.length,
13713
- reasoning: data.overall_reasoning,
13714
13700
  evaluatorRawRequest,
13715
13701
  details,
13716
13702
  tokenUsage
@@ -13722,10 +13708,8 @@ ${context2.fileChanges}`;
13722
13708
  return {
13723
13709
  score: 0,
13724
13710
  verdict: "skip",
13725
- hits: [],
13726
- misses: [`Grader parse failure after 3 attempts: ${message}`],
13711
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
13727
13712
  expectedAspectCount: rubrics.length,
13728
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
13729
13713
  evaluatorRawRequest
13730
13714
  };
13731
13715
  }
@@ -13782,8 +13766,7 @@ ${context2.fileChanges}`;
13782
13766
  return {
13783
13767
  score: 0,
13784
13768
  verdict: "fail",
13785
- hits: [],
13786
- misses: [`llm-grader built-in evaluation failed: ${message}`],
13769
+ assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
13787
13770
  expectedAspectCount: 1,
13788
13771
  evaluatorRawRequest,
13789
13772
  details: { mode: "built-in", error: message }
@@ -13833,8 +13816,9 @@ ${context2.fileChanges}`;
13833
13816
  return {
13834
13817
  score: 0,
13835
13818
  verdict: "fail",
13836
- hits: [],
13837
- misses: [`llm-grader ${modeLabel} returned no assistant response`],
13819
+ assertions: [
13820
+ { text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
13821
+ ],
13838
13822
  expectedAspectCount: 1,
13839
13823
  evaluatorRawRequest,
13840
13824
  details: { mode: modeLabel, grader_target: provider.targetName }
@@ -13852,8 +13836,9 @@ ${context2.fileChanges}`;
13852
13836
  return {
13853
13837
  score: 0,
13854
13838
  verdict: "fail",
13855
- hits: [],
13856
- misses: [`llm-grader ${modeLabel} evaluation failed: ${message}`],
13839
+ assertions: [
13840
+ { text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
13841
+ ],
13857
13842
  expectedAspectCount: 1,
13858
13843
  evaluatorRawRequest,
13859
13844
  details: {
@@ -13894,10 +13879,10 @@ ${context2.fileChanges}`;
13894
13879
  buildAgentUserPrompt(context2) {
13895
13880
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
13896
13881
  const variables = {
13897
- [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
13898
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
13899
13882
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
13900
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
13883
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
13884
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
13885
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
13901
13886
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
13902
13887
  };
13903
13888
  if (this.evaluatorTemplate) {
@@ -13950,10 +13935,10 @@ ${context2.fileChanges}`;
13950
13935
  const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
13951
13936
  if (this.evaluatorTemplate) {
13952
13937
  const variables = {
13953
- [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
13954
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
13955
13938
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
13956
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
13939
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
13940
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
13941
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
13957
13942
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
13958
13943
  };
13959
13944
  const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
@@ -14005,29 +13990,24 @@ ${outputSchema}`;
14005
13990
  const parsed = parseJsonFromText(text);
14006
13991
  if (rubrics && rubrics.length > 0) {
14007
13992
  const data2 = rubricEvaluationSchema.parse(parsed);
14008
- const { score: score2, verdict, hits: hits2, misses: misses2 } = calculateRubricScore(data2, rubrics);
13993
+ const { score: score2, verdict, assertions: assertions2 } = calculateRubricScore(data2, rubrics);
14009
13994
  return {
14010
13995
  score: score2,
14011
13996
  verdict,
14012
- hits: hits2,
14013
- misses: misses2,
13997
+ assertions: assertions2,
14014
13998
  expectedAspectCount: rubrics.length,
14015
- reasoning: data2.overall_reasoning,
14016
13999
  evaluatorRawRequest,
14017
14000
  details
14018
14001
  };
14019
14002
  }
14020
14003
  const data = freeformEvaluationSchema.parse(parsed);
14021
14004
  const score = clampScore(data.score);
14022
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
14023
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
14005
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
14024
14006
  return {
14025
14007
  score,
14026
14008
  verdict: scoreToVerdict(score),
14027
- hits,
14028
- misses,
14029
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
14030
- reasoning: data.reasoning,
14009
+ assertions,
14010
+ expectedAspectCount: Math.max(assertions.length, 1),
14031
14011
  evaluatorRawRequest,
14032
14012
  details
14033
14013
  };
@@ -14035,8 +14015,12 @@ ${outputSchema}`;
14035
14015
  return {
14036
14016
  score: 0,
14037
14017
  verdict: "fail",
14038
- hits: [],
14039
- misses: ["Failed to parse llm-grader agent response as valid evaluation JSON"],
14018
+ assertions: [
14019
+ {
14020
+ text: "Failed to parse llm-grader agent response as valid evaluation JSON",
14021
+ passed: false
14022
+ }
14023
+ ],
14040
14024
  expectedAspectCount: 1,
14041
14025
  evaluatorRawRequest,
14042
14026
  details
@@ -14165,9 +14149,13 @@ function buildOutputSchema() {
14165
14149
  "",
14166
14150
  "{",
14167
14151
  ' "score": <number between 0.0 and 1.0>,',
14168
- ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
14169
- ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
14170
- ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
14152
+ ' "assertions": [',
14153
+ " {",
14154
+ ' "text": "<brief description of what was checked>",',
14155
+ ' "passed": <boolean>,',
14156
+ ' "evidence": "<concise evidence, 1-2 sentences, optional>"',
14157
+ " }",
14158
+ " ]",
14171
14159
  "}"
14172
14160
  ].join("\n");
14173
14161
  }
@@ -14192,8 +14180,7 @@ function substituteVariables(template, variables) {
14192
14180
  }
14193
14181
  function calculateRubricScore(result, rubrics) {
14194
14182
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
14195
- const hits = [];
14196
- const misses = [];
14183
+ const assertions = [];
14197
14184
  let totalWeight = 0;
14198
14185
  let earnedWeight = 0;
14199
14186
  let failedRequired = false;
@@ -14203,19 +14190,20 @@ function calculateRubricScore(result, rubrics) {
14203
14190
  continue;
14204
14191
  }
14205
14192
  totalWeight += rubric.weight;
14193
+ assertions.push({
14194
+ text: `[${rubric.id}] ${rubric.outcome}`,
14195
+ passed: check.satisfied,
14196
+ evidence: check.reasoning
14197
+ });
14206
14198
  if (check.satisfied) {
14207
14199
  earnedWeight += rubric.weight;
14208
- hits.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
14209
- } else {
14210
- misses.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
14211
- if (rubric.required) {
14212
- failedRequired = true;
14213
- }
14200
+ } else if (rubric.required) {
14201
+ failedRequired = true;
14214
14202
  }
14215
14203
  }
14216
14204
  const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
14217
14205
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
14218
- return { score, verdict, hits, misses };
14206
+ return { score, verdict, assertions };
14219
14207
  }
14220
14208
  function buildScoreRangeOutputSchema() {
14221
14209
  return `You are an expert evaluator. Score the candidate answer on each criterion.
@@ -14235,8 +14223,7 @@ Important: The "score" must be an integer from 0 to 10 that falls within one of
14235
14223
  }
14236
14224
  function calculateScoreRangeResult(result, rubrics) {
14237
14225
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
14238
- const hits = [];
14239
- const misses = [];
14226
+ const assertions = [];
14240
14227
  const rawScores = {};
14241
14228
  let totalWeight = 0;
14242
14229
  let weightedScoreSum = 0;
@@ -14262,24 +14249,22 @@ function calculateScoreRangeResult(result, rubrics) {
14262
14249
  );
14263
14250
  const rangeDescription = matchingRange?.outcome ?? "";
14264
14251
  const criterionLabel = rubric.outcome ?? rubric.id;
14265
- const reasoningText = check.reasoning ? `: ${check.reasoning}` : "";
14266
- const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
14252
+ const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
14267
14253
  if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
14268
14254
  failedRequired = true;
14269
- misses.push(scoreInfo);
14270
- } else if (rawScore >= 7) {
14271
- hits.push(scoreInfo);
14272
- } else {
14273
- misses.push(scoreInfo);
14274
14255
  }
14256
+ assertions.push({
14257
+ text: `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})`,
14258
+ passed,
14259
+ evidence: check.reasoning
14260
+ });
14275
14261
  }
14276
14262
  const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
14277
14263
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
14278
14264
  return {
14279
14265
  score,
14280
14266
  verdict,
14281
- hits,
14282
- misses,
14267
+ assertions,
14283
14268
  details: {
14284
14269
  raw_scores: rawScores,
14285
14270
  normalization: "score / 10",
@@ -14455,9 +14440,7 @@ var CompositeEvaluator = class {
14455
14440
  let totalWeight = 0;
14456
14441
  let weightedSum = 0;
14457
14442
  let evaluatedCount = 0;
14458
- const allHits = [];
14459
- const allMisses = [];
14460
- const reasoningParts = [];
14443
+ const allAssertions = [];
14461
14444
  const scores = [];
14462
14445
  for (const member of results) {
14463
14446
  const weight = weights?.[member.id] ?? 1;
@@ -14467,9 +14450,7 @@ var CompositeEvaluator = class {
14467
14450
  score: member.result.score,
14468
14451
  weight,
14469
14452
  verdict: member.result.verdict,
14470
- hits: [...member.result.hits],
14471
- misses: [...member.result.misses],
14472
- reasoning: member.result.reasoning,
14453
+ assertions: [...member.result.assertions],
14473
14454
  evaluatorRawRequest: member.result.evaluatorRawRequest,
14474
14455
  scores: member.result.scores,
14475
14456
  details: member.result.details,
@@ -14481,20 +14462,16 @@ var CompositeEvaluator = class {
14481
14462
  evaluatedCount++;
14482
14463
  totalWeight += weight;
14483
14464
  weightedSum += member.result.score * weight;
14484
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
14485
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
14486
- if (member.result.reasoning) {
14487
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
14488
- }
14465
+ allAssertions.push(
14466
+ ...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
14467
+ );
14489
14468
  }
14490
14469
  if (evaluatedCount === 0 && results.length > 0) {
14491
14470
  return {
14492
14471
  score: 0,
14493
14472
  verdict: "skip",
14494
- hits: [],
14495
- misses: [],
14473
+ assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
14496
14474
  expectedAspectCount: 1,
14497
- reasoning: "All evaluators skipped (infrastructure failure)",
14498
14475
  evaluatorRawRequest: {
14499
14476
  aggregator: "weighted_average",
14500
14477
  ...weights ? { weights } : {}
@@ -14506,10 +14483,8 @@ var CompositeEvaluator = class {
14506
14483
  return {
14507
14484
  score: clampScore(finalScore),
14508
14485
  verdict: scoreToVerdict(finalScore),
14509
- hits: allHits,
14510
- misses: allMisses,
14511
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
14512
- reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
14486
+ assertions: allAssertions,
14487
+ expectedAspectCount: allAssertions.length || 1,
14513
14488
  evaluatorRawRequest: {
14514
14489
  aggregator: "weighted_average",
14515
14490
  ...weights ? { weights } : {}
@@ -14519,11 +14494,8 @@ var CompositeEvaluator = class {
14519
14494
  }
14520
14495
  runThreshold(results, threshold) {
14521
14496
  const scores = [];
14522
- const allHits = [];
14523
- const allMisses = [];
14524
- const reasoningParts = [];
14497
+ const allAssertions = [];
14525
14498
  let passingCount = 0;
14526
- let borderlineCount = 0;
14527
14499
  let evaluatedCount = 0;
14528
14500
  for (const member of results) {
14529
14501
  scores.push({
@@ -14531,9 +14503,7 @@ var CompositeEvaluator = class {
14531
14503
  type: member.type,
14532
14504
  score: member.result.score,
14533
14505
  verdict: member.result.verdict,
14534
- hits: [...member.result.hits],
14535
- misses: [...member.result.misses],
14536
- reasoning: member.result.reasoning,
14506
+ assertions: [...member.result.assertions],
14537
14507
  evaluatorRawRequest: member.result.evaluatorRawRequest,
14538
14508
  scores: member.result.scores,
14539
14509
  details: member.result.details,
@@ -14546,24 +14516,17 @@ var CompositeEvaluator = class {
14546
14516
  const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
14547
14517
  if (isPassing) {
14548
14518
  passingCount++;
14549
- if (member.result.verdict === "borderline") {
14550
- borderlineCount++;
14551
- }
14552
- }
14553
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
14554
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
14555
- if (member.result.reasoning) {
14556
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
14557
14519
  }
14520
+ allAssertions.push(
14521
+ ...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
14522
+ );
14558
14523
  }
14559
14524
  if (evaluatedCount === 0 && results.length > 0) {
14560
14525
  return {
14561
14526
  score: 0,
14562
14527
  verdict: "skip",
14563
- hits: [],
14564
- misses: [],
14528
+ assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
14565
14529
  expectedAspectCount: 1,
14566
- reasoning: "All evaluators skipped (infrastructure failure)",
14567
14530
  evaluatorRawRequest: {
14568
14531
  aggregator: "threshold",
14569
14532
  threshold
@@ -14574,19 +14537,15 @@ var CompositeEvaluator = class {
14574
14537
  const totalCount = evaluatedCount;
14575
14538
  const score = totalCount > 0 ? passingCount / totalCount : 0;
14576
14539
  const pass = score >= threshold;
14577
- if (pass && borderlineCount > 0) {
14578
- reasoningParts.push(`Warning: ${borderlineCount} borderline evaluator(s) counted as passing`);
14579
- }
14580
- reasoningParts.unshift(
14581
- `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`
14582
- );
14540
+ allAssertions.unshift({
14541
+ text: `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`,
14542
+ passed: pass
14543
+ });
14583
14544
  return {
14584
14545
  score: clampScore(score),
14585
14546
  verdict: pass ? "pass" : "fail",
14586
- hits: allHits,
14587
- misses: allMisses,
14588
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
14589
- reasoning: reasoningParts.join("; "),
14547
+ assertions: allAssertions,
14548
+ expectedAspectCount: allAssertions.length || 1,
14590
14549
  evaluatorRawRequest: {
14591
14550
  aggregator: "threshold",
14592
14551
  threshold
@@ -14603,9 +14562,7 @@ var CompositeEvaluator = class {
14603
14562
  score: member.result.score,
14604
14563
  weight: weights?.[member.id] ?? 1,
14605
14564
  verdict: member.result.verdict,
14606
- hits: [...member.result.hits],
14607
- misses: [...member.result.misses],
14608
- reasoning: member.result.reasoning,
14565
+ assertions: [...member.result.assertions],
14609
14566
  evaluatorRawRequest: member.result.evaluatorRawRequest,
14610
14567
  scores: member.result.scores,
14611
14568
  details: member.result.details
@@ -14614,17 +14571,19 @@ var CompositeEvaluator = class {
14614
14571
  const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
14615
14572
  const parsed = parseJsonSafe(stdout);
14616
14573
  const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
14617
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
14618
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
14619
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
14574
+ const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
14575
+ (a) => typeof a === "object" && a !== null && typeof a.text === "string"
14576
+ ).map((a) => ({
14577
+ text: String(a.text),
14578
+ passed: Boolean(a.passed),
14579
+ ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
14580
+ })) : [];
14620
14581
  const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
14621
14582
  return {
14622
14583
  score,
14623
14584
  verdict,
14624
- hits,
14625
- misses,
14626
- expectedAspectCount: hits.length + misses.length || 1,
14627
- reasoning,
14585
+ assertions,
14586
+ expectedAspectCount: assertions.length || 1,
14628
14587
  evaluatorRawRequest: {
14629
14588
  aggregator: "code-grader",
14630
14589
  script: scriptPath
@@ -14636,10 +14595,8 @@ var CompositeEvaluator = class {
14636
14595
  return {
14637
14596
  score: 0,
14638
14597
  verdict: "fail",
14639
- hits: [],
14640
- misses: [`Code aggregator failed: ${message}`],
14598
+ assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
14641
14599
  expectedAspectCount: 1,
14642
- reasoning: message,
14643
14600
  evaluatorRawRequest: {
14644
14601
  aggregator: "code-grader",
14645
14602
  script: scriptPath,
@@ -14661,9 +14618,7 @@ var CompositeEvaluator = class {
14661
14618
  type: member.type,
14662
14619
  score: member.result.score,
14663
14620
  verdict: member.result.verdict,
14664
- hits: [...member.result.hits],
14665
- misses: [...member.result.misses],
14666
- reasoning: member.result.reasoning,
14621
+ assertions: [...member.result.assertions],
14667
14622
  evaluatorRawRequest: member.result.evaluatorRawRequest,
14668
14623
  scores: member.result.scores,
14669
14624
  details: member.result.details
@@ -14687,16 +14642,12 @@ var CompositeEvaluator = class {
14687
14642
  });
14688
14643
  const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
14689
14644
  const score2 = clampScore(data2.score);
14690
- const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
14691
- const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
14692
- const reasoning2 = data2.reasoning;
14645
+ const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
14693
14646
  return {
14694
14647
  score: score2,
14695
14648
  verdict: scoreToVerdict(score2),
14696
- hits: hits2,
14697
- misses: misses2,
14698
- expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
14699
- reasoning: reasoning2,
14649
+ assertions: assertions2,
14650
+ expectedAspectCount: Math.max(assertions2.length, 1),
14700
14651
  evaluatorRawRequest,
14701
14652
  scores
14702
14653
  };
@@ -14711,16 +14662,12 @@ var CompositeEvaluator = class {
14711
14662
  parseJsonFromText(extractLastAssistantContent2(response.output))
14712
14663
  );
14713
14664
  const score = clampScore(data.score);
14714
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
14715
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
14716
- const reasoning = data.reasoning;
14665
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
14717
14666
  return {
14718
14667
  score,
14719
14668
  verdict: scoreToVerdict(score),
14720
- hits,
14721
- misses,
14722
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
14723
- reasoning,
14669
+ assertions,
14670
+ expectedAspectCount: Math.max(assertions.length, 1),
14724
14671
  evaluatorRawRequest,
14725
14672
  scores
14726
14673
  };
@@ -14728,8 +14675,7 @@ var CompositeEvaluator = class {
14728
14675
  return {
14729
14676
  score: 0,
14730
14677
  verdict: "fail",
14731
- hits: [],
14732
- misses: [],
14678
+ assertions: [{ text: "LLM aggregator failed", passed: false }],
14733
14679
  expectedAspectCount: 1,
14734
14680
  evaluatorRawRequest,
14735
14681
  scores
@@ -14752,10 +14698,8 @@ var CostEvaluator = class {
14752
14698
  return {
14753
14699
  score: 0,
14754
14700
  verdict: "fail",
14755
- hits: [],
14756
- misses: ["No cost data available in trace"],
14701
+ assertions: [{ text: "No cost data available in trace", passed: false }],
14757
14702
  expectedAspectCount: 1,
14758
- reasoning: "Execution cost not reported by provider",
14759
14703
  evaluatorRawRequest: {
14760
14704
  type: "cost",
14761
14705
  budget,
@@ -14769,10 +14713,10 @@ var CostEvaluator = class {
14769
14713
  return {
14770
14714
  score,
14771
14715
  verdict: passed ? "pass" : "fail",
14772
- hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
14773
- misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
14716
+ assertions: [
14717
+ passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
14718
+ ],
14774
14719
  expectedAspectCount: 1,
14775
- reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
14776
14720
  evaluatorRawRequest: {
14777
14721
  type: "cost",
14778
14722
  budget,
@@ -14805,10 +14749,8 @@ var ExecutionMetricsEvaluator = class {
14805
14749
  return {
14806
14750
  score: 0,
14807
14751
  verdict: "fail",
14808
- hits: [],
14809
- misses: ["No trace summary available"],
14752
+ assertions: [{ text: "No trace summary available", passed: false }],
14810
14753
  expectedAspectCount: 1,
14811
- reasoning: "Execution metrics not available - no trace summary provided",
14812
14754
  evaluatorRawRequest: {
14813
14755
  type: "execution-metrics",
14814
14756
  config: this.extractConfiguredThresholds(),
@@ -14817,116 +14759,114 @@ var ExecutionMetricsEvaluator = class {
14817
14759
  };
14818
14760
  }
14819
14761
  const narrowedTrace = trace2;
14820
- const hits = [];
14821
- const misses = [];
14762
+ const assertions = [];
14822
14763
  const actualMetrics = {};
14823
14764
  if (max_tool_calls !== void 0 && narrowedTrace) {
14824
14765
  const toolCalls = narrowedTrace.eventCount;
14825
14766
  actualMetrics.tool_calls = toolCalls;
14826
14767
  if (toolCalls <= max_tool_calls) {
14827
- hits.push(`Tool calls ${toolCalls} <= ${max_tool_calls} max`);
14768
+ assertions.push({ text: `Tool calls ${toolCalls} <= ${max_tool_calls} max`, passed: true });
14828
14769
  } else {
14829
- misses.push(`Tool calls ${toolCalls} > ${max_tool_calls} max`);
14770
+ assertions.push({ text: `Tool calls ${toolCalls} > ${max_tool_calls} max`, passed: false });
14830
14771
  }
14831
14772
  }
14832
14773
  if (max_llm_calls !== void 0 && narrowedTrace) {
14833
14774
  const llmCalls = narrowedTrace.llmCallCount;
14834
14775
  if (llmCalls === void 0) {
14835
- misses.push("LLM call count data not available");
14776
+ assertions.push({ text: "LLM call count data not available", passed: false });
14836
14777
  } else {
14837
14778
  actualMetrics.llm_calls = llmCalls;
14838
14779
  if (llmCalls <= max_llm_calls) {
14839
- hits.push(`LLM calls ${llmCalls} <= ${max_llm_calls} max`);
14780
+ assertions.push({ text: `LLM calls ${llmCalls} <= ${max_llm_calls} max`, passed: true });
14840
14781
  } else {
14841
- misses.push(`LLM calls ${llmCalls} > ${max_llm_calls} max`);
14782
+ assertions.push({ text: `LLM calls ${llmCalls} > ${max_llm_calls} max`, passed: false });
14842
14783
  }
14843
14784
  }
14844
14785
  }
14845
14786
  if (max_tokens !== void 0) {
14846
14787
  if (!tokenUsage) {
14847
- misses.push("Token usage data not available");
14788
+ assertions.push({ text: "Token usage data not available", passed: false });
14848
14789
  } else {
14849
14790
  const totalTokens = tokenUsage.input + tokenUsage.output;
14850
14791
  actualMetrics.tokens = totalTokens;
14851
14792
  if (totalTokens <= max_tokens) {
14852
- hits.push(`Total tokens ${totalTokens} <= ${max_tokens} max`);
14793
+ assertions.push({
14794
+ text: `Total tokens ${totalTokens} <= ${max_tokens} max`,
14795
+ passed: true
14796
+ });
14853
14797
  } else {
14854
- misses.push(`Total tokens ${totalTokens} > ${max_tokens} max`);
14798
+ assertions.push({
14799
+ text: `Total tokens ${totalTokens} > ${max_tokens} max`,
14800
+ passed: false
14801
+ });
14855
14802
  }
14856
14803
  }
14857
14804
  }
14858
14805
  if (max_cost_usd !== void 0) {
14859
14806
  if (costUsd === void 0) {
14860
- misses.push("Cost data not available");
14807
+ assertions.push({ text: "Cost data not available", passed: false });
14861
14808
  } else {
14862
14809
  actualMetrics.cost_usd = costUsd;
14863
14810
  const formatCost = (n) => `$${n.toFixed(4)}`;
14864
14811
  if (costUsd <= max_cost_usd) {
14865
- hits.push(`Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`);
14812
+ assertions.push({
14813
+ text: `Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`,
14814
+ passed: true
14815
+ });
14866
14816
  } else {
14867
- misses.push(`Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`);
14817
+ assertions.push({
14818
+ text: `Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`,
14819
+ passed: false
14820
+ });
14868
14821
  }
14869
14822
  }
14870
14823
  }
14871
14824
  if (max_duration_ms !== void 0) {
14872
14825
  if (durationMs === void 0) {
14873
- misses.push("Duration data not available");
14826
+ assertions.push({ text: "Duration data not available", passed: false });
14874
14827
  } else {
14875
14828
  actualMetrics.duration_ms = durationMs;
14876
14829
  if (durationMs <= max_duration_ms) {
14877
- hits.push(`Duration ${durationMs}ms <= ${max_duration_ms}ms max`);
14830
+ assertions.push({
14831
+ text: `Duration ${durationMs}ms <= ${max_duration_ms}ms max`,
14832
+ passed: true
14833
+ });
14878
14834
  } else {
14879
- misses.push(`Duration ${durationMs}ms > ${max_duration_ms}ms max`);
14835
+ assertions.push({
14836
+ text: `Duration ${durationMs}ms > ${max_duration_ms}ms max`,
14837
+ passed: false
14838
+ });
14880
14839
  }
14881
14840
  }
14882
14841
  }
14883
14842
  if (target_exploration_ratio !== void 0 && narrowedTrace) {
14884
14843
  const ratio = explorationRatio(narrowedTrace);
14885
14844
  if (ratio === void 0) {
14886
- misses.push("Exploration ratio not available (no tool calls)");
14845
+ assertions.push({ text: "Exploration ratio not available (no tool calls)", passed: false });
14887
14846
  } else {
14888
14847
  actualMetrics.exploration_ratio = ratio;
14889
14848
  const diff = Math.abs(ratio - target_exploration_ratio);
14890
14849
  if (diff <= exploration_tolerance) {
14891
- hits.push(
14892
- `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`
14893
- );
14850
+ assertions.push({
14851
+ text: `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`,
14852
+ passed: true
14853
+ });
14894
14854
  } else {
14895
- misses.push(
14896
- `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`
14897
- );
14855
+ assertions.push({
14856
+ text: `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`,
14857
+ passed: false
14858
+ });
14898
14859
  }
14899
14860
  }
14900
14861
  }
14901
- const totalChecks = hits.length + misses.length;
14902
- const score = totalChecks > 0 ? hits.length / totalChecks : 0;
14903
- const reasoningParts = [];
14904
- if (actualMetrics.tool_calls !== void 0) {
14905
- reasoningParts.push(`tool_calls=${actualMetrics.tool_calls}`);
14906
- }
14907
- if (actualMetrics.llm_calls !== void 0) {
14908
- reasoningParts.push(`llm_calls=${actualMetrics.llm_calls}`);
14909
- }
14910
- if (actualMetrics.tokens !== void 0) {
14911
- reasoningParts.push(`tokens=${actualMetrics.tokens}`);
14912
- }
14913
- if (actualMetrics.cost_usd !== void 0) {
14914
- reasoningParts.push(`cost=$${actualMetrics.cost_usd.toFixed(4)}`);
14915
- }
14916
- if (actualMetrics.duration_ms !== void 0) {
14917
- reasoningParts.push(`duration=${actualMetrics.duration_ms}ms`);
14918
- }
14919
- if (actualMetrics.exploration_ratio !== void 0) {
14920
- reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
14921
- }
14922
- const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
14862
+ const totalChecks = assertions.length;
14863
+ const passedCount = assertions.filter((a) => a.passed).length;
14864
+ const score = totalChecks > 0 ? passedCount / totalChecks : 0;
14923
14865
  return {
14924
14866
  score,
14925
14867
  verdict: scoreToVerdict(score),
14926
- hits,
14927
- misses,
14868
+ assertions,
14928
14869
  expectedAspectCount: totalChecks || 1,
14929
- reasoning,
14930
14870
  evaluatorRawRequest: {
14931
14871
  type: "execution-metrics",
14932
14872
  config: this.extractConfiguredThresholds(),
@@ -15030,10 +14970,8 @@ var FieldAccuracyEvaluator = class {
15030
14970
  return {
15031
14971
  score: 0,
15032
14972
  verdict: "fail",
15033
- hits: [],
15034
- misses: ["Failed to parse candidate answer as JSON"],
15035
- expectedAspectCount: this.config.fields.length,
15036
- reasoning: "Candidate answer is not valid JSON"
14973
+ assertions: [{ text: "Failed to parse candidate answer as JSON", passed: false }],
14974
+ expectedAspectCount: this.config.fields.length
15037
14975
  };
15038
14976
  }
15039
14977
  const expectedData = this.extractExpectedData(evalCase.expected_output);
@@ -15041,10 +14979,8 @@ var FieldAccuracyEvaluator = class {
15041
14979
  return {
15042
14980
  score: 0,
15043
14981
  verdict: "fail",
15044
- hits: [],
15045
- misses: ["No expected data found in expected_output"],
15046
- expectedAspectCount: this.config.fields.length,
15047
- reasoning: "Could not extract expected data from expected_output"
14982
+ assertions: [{ text: "No expected data found in expected_output", passed: false }],
14983
+ expectedAspectCount: this.config.fields.length
15048
14984
  };
15049
14985
  }
15050
14986
  const fieldResults = [];
@@ -15262,18 +15198,14 @@ var FieldAccuracyEvaluator = class {
15262
15198
  */
15263
15199
  aggregateResults(results) {
15264
15200
  const aggregation = this.config.aggregation ?? "weighted_average";
15265
- const hits = [];
15266
- const misses = [];
15201
+ const assertions = [];
15267
15202
  for (const result of results) {
15268
- if (result.hit) {
15269
- hits.push(result.message);
15270
- } else {
15271
- misses.push(result.message);
15272
- }
15203
+ assertions.push({ text: result.message, passed: result.hit });
15273
15204
  }
15274
15205
  let score;
15275
15206
  if (aggregation === "all_or_nothing") {
15276
- score = misses.length === 0 ? 1 : 0;
15207
+ const hasFailed = assertions.some((a) => !a.passed);
15208
+ score = hasFailed ? 0 : 1;
15277
15209
  } else {
15278
15210
  const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
15279
15211
  if (totalWeight === 0) {
@@ -15283,15 +15215,11 @@ var FieldAccuracyEvaluator = class {
15283
15215
  score = weightedSum / totalWeight;
15284
15216
  }
15285
15217
  }
15286
- const reasoning = `${hits.length}/${results.length} fields matched`;
15287
15218
  return {
15288
15219
  score: clampScore(score),
15289
15220
  verdict: scoreToVerdict(score),
15290
- hits: hits.slice(0, 4),
15291
- // Cap at 4 to keep output concise
15292
- misses: misses.slice(0, 4),
15293
- expectedAspectCount: results.length,
15294
- reasoning
15221
+ assertions,
15222
+ expectedAspectCount: results.length
15295
15223
  };
15296
15224
  }
15297
15225
  };
@@ -15400,10 +15328,8 @@ var LatencyEvaluator = class {
15400
15328
  return {
15401
15329
  score: 0,
15402
15330
  verdict: "fail",
15403
- hits: [],
15404
- misses: ["No duration data available in trace"],
15331
+ assertions: [{ text: "No duration data available in trace", passed: false }],
15405
15332
  expectedAspectCount: 1,
15406
- reasoning: "Execution duration not reported by provider",
15407
15333
  evaluatorRawRequest: {
15408
15334
  type: "latency",
15409
15335
  threshold,
@@ -15416,10 +15342,10 @@ var LatencyEvaluator = class {
15416
15342
  return {
15417
15343
  score,
15418
15344
  verdict: passed ? "pass" : "fail",
15419
- hits: passed ? [`Duration ${durationMs}ms <= ${threshold}ms threshold`] : [],
15420
- misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
15345
+ assertions: [
15346
+ passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
15347
+ ],
15421
15348
  expectedAspectCount: 1,
15422
- reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
15423
15349
  evaluatorRawRequest: {
15424
15350
  type: "latency",
15425
15351
  threshold,
@@ -15440,7 +15366,10 @@ var COPILOT_MATCHER = {
15440
15366
  skillTools: ["Skill", "skill"],
15441
15367
  skillInputField: "skill",
15442
15368
  readTools: ["Read File", "readFile", "Read", "readTextFile"],
15443
- readInputField: "file_path"
15369
+ readInputField: "file_path",
15370
+ skillToolPrefixes: ["Using skill: "],
15371
+ readToolPrefixes: ["Viewing "],
15372
+ readInputFields: ["file_path", "path"]
15444
15373
  };
15445
15374
  var PROVIDER_TOOL_SEMANTICS = {
15446
15375
  claude: CLAUDE_MATCHER,
@@ -15482,12 +15411,22 @@ var SkillTriggerEvaluator = class {
15482
15411
  triggered = true;
15483
15412
  evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
15484
15413
  }
15414
+ } else if (matcher.skillToolPrefixes?.some(
15415
+ (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
15416
+ )) {
15417
+ triggered = true;
15418
+ evidence = `Skill tool invoked via tool name "${firstTool.tool}"`;
15485
15419
  } else if (matcher.readTools.includes(firstTool.tool)) {
15486
- const filePath = String(input[matcher.readInputField] ?? "");
15420
+ const filePath = this.readPathFromInput(input, matcher);
15487
15421
  if (filePath.includes(skillName)) {
15488
15422
  triggered = true;
15489
15423
  evidence = `Read tool loaded skill file: ${filePath}`;
15490
15424
  }
15425
+ } else if (matcher.readToolPrefixes?.some(
15426
+ (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
15427
+ )) {
15428
+ triggered = true;
15429
+ evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`;
15491
15430
  }
15492
15431
  }
15493
15432
  const pass = triggered === shouldTrigger;
@@ -15495,25 +15434,37 @@ var SkillTriggerEvaluator = class {
15495
15434
  return {
15496
15435
  score: 1,
15497
15436
  verdict: "pass",
15498
- hits: [
15499
- shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`
15437
+ assertions: [
15438
+ {
15439
+ text: shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`,
15440
+ passed: true
15441
+ }
15500
15442
  ],
15501
- misses: [],
15502
- expectedAspectCount: 1,
15503
- reasoning: shouldTrigger ? "Skill triggered correctly" : "No false trigger"
15443
+ expectedAspectCount: 1
15504
15444
  };
15505
15445
  }
15506
15446
  return {
15507
15447
  score: 0,
15508
15448
  verdict: "fail",
15509
- hits: [],
15510
- misses: [
15511
- shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`
15449
+ assertions: [
15450
+ {
15451
+ text: shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`,
15452
+ passed: false
15453
+ }
15512
15454
  ],
15513
- expectedAspectCount: 1,
15514
- reasoning: shouldTrigger ? `Skill "${skillName}" was not triggered` : "False trigger: skill fired when it should not have"
15455
+ expectedAspectCount: 1
15515
15456
  };
15516
15457
  }
15458
+ readPathFromInput(input, matcher) {
15459
+ const fields = matcher.readInputFields ?? [matcher.readInputField];
15460
+ for (const field of fields) {
15461
+ const value = input[field];
15462
+ if (value !== void 0 && value !== null) {
15463
+ return String(value);
15464
+ }
15465
+ }
15466
+ return "";
15467
+ }
15517
15468
  };
15518
15469
 
15519
15470
  // src/evaluation/evaluators/llm-grader-prompt.ts
@@ -15548,12 +15499,8 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
15548
15499
  [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2),
15549
15500
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
15550
15501
  [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
15551
- [TEMPLATE_VARIABLES.ANSWER]: candidate.trim(),
15552
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (evalCase.reference_answer ?? "").trim(),
15553
15502
  [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
15554
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
15555
15503
  [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
15556
- // Text convenience accessors (new names, always strings)
15557
15504
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
15558
15505
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
15559
15506
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
@@ -15680,10 +15627,8 @@ var TokenUsageEvaluator = class {
15680
15627
  return {
15681
15628
  score: 0,
15682
15629
  verdict: "fail",
15683
- hits: [],
15684
- misses: ["No token usage data available in trace"],
15630
+ assertions: [{ text: "No token usage data available in trace", passed: false }],
15685
15631
  expectedAspectCount,
15686
- reasoning: "Token usage not reported by provider",
15687
15632
  evaluatorRawRequest: {
15688
15633
  type: "token-usage",
15689
15634
  max_total: maxTotal ?? null,
@@ -15697,37 +15642,34 @@ var TokenUsageEvaluator = class {
15697
15642
  const output = usage.output;
15698
15643
  const cached = usage.cached ?? 0;
15699
15644
  const total = input + output + cached;
15700
- const hits = [];
15701
- const misses = [];
15645
+ const assertions = [];
15702
15646
  if (typeof maxInput === "number") {
15703
15647
  if (input <= maxInput) {
15704
- hits.push(`Input tokens ${input} <= ${maxInput}`);
15648
+ assertions.push({ text: `Input tokens ${input} <= ${maxInput}`, passed: true });
15705
15649
  } else {
15706
- misses.push(`Input tokens ${input} > ${maxInput}`);
15650
+ assertions.push({ text: `Input tokens ${input} > ${maxInput}`, passed: false });
15707
15651
  }
15708
15652
  }
15709
15653
  if (typeof maxOutput === "number") {
15710
15654
  if (output <= maxOutput) {
15711
- hits.push(`Output tokens ${output} <= ${maxOutput}`);
15655
+ assertions.push({ text: `Output tokens ${output} <= ${maxOutput}`, passed: true });
15712
15656
  } else {
15713
- misses.push(`Output tokens ${output} > ${maxOutput}`);
15657
+ assertions.push({ text: `Output tokens ${output} > ${maxOutput}`, passed: false });
15714
15658
  }
15715
15659
  }
15716
15660
  if (typeof maxTotal === "number") {
15717
15661
  if (total <= maxTotal) {
15718
- hits.push(`Total tokens ${total} <= ${maxTotal}`);
15662
+ assertions.push({ text: `Total tokens ${total} <= ${maxTotal}`, passed: true });
15719
15663
  } else {
15720
- misses.push(`Total tokens ${total} > ${maxTotal}`);
15664
+ assertions.push({ text: `Total tokens ${total} > ${maxTotal}`, passed: false });
15721
15665
  }
15722
15666
  }
15723
- const passed = misses.length === 0;
15667
+ const passed = assertions.every((a) => a.passed);
15724
15668
  return {
15725
15669
  score: passed ? 1 : 0,
15726
15670
  verdict: passed ? "pass" : "fail",
15727
- hits,
15728
- misses,
15671
+ assertions,
15729
15672
  expectedAspectCount,
15730
- reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
15731
15673
  evaluatorRawRequest: {
15732
15674
  type: "token-usage",
15733
15675
  max_total: maxTotal ?? null,
@@ -15827,8 +15769,7 @@ var ToolTrajectoryEvaluator = class {
15827
15769
  return {
15828
15770
  score: 0,
15829
15771
  verdict: "fail",
15830
- hits: [],
15831
- misses: ["No trace available for evaluation"],
15772
+ assertions: [{ text: "No trace available for evaluation", passed: false }],
15832
15773
  expectedAspectCount: 1
15833
15774
  };
15834
15775
  }
@@ -15839,8 +15780,7 @@ var ToolTrajectoryEvaluator = class {
15839
15780
  return {
15840
15781
  score: 0,
15841
15782
  verdict: "fail",
15842
- hits: [],
15843
- misses: ["No trace available for evaluation"],
15783
+ assertions: [{ text: "No trace available for evaluation", passed: false }],
15844
15784
  expectedAspectCount: 1
15845
15785
  };
15846
15786
  }
@@ -15858,8 +15798,7 @@ var ToolTrajectoryEvaluator = class {
15858
15798
  return {
15859
15799
  score: 0,
15860
15800
  verdict: "fail",
15861
- hits: [],
15862
- misses: [`Unknown mode: ${this.config.mode}`],
15801
+ assertions: [{ text: `Unknown mode: ${this.config.mode}`, passed: false }],
15863
15802
  expectedAspectCount: 1
15864
15803
  };
15865
15804
  }
@@ -15908,28 +15847,32 @@ var ToolTrajectoryEvaluator = class {
15908
15847
  return {
15909
15848
  score: 1,
15910
15849
  verdict: "pass",
15911
- hits: ["No tool requirements specified"],
15912
- misses: [],
15850
+ assertions: [{ text: "No tool requirements specified", passed: true }],
15913
15851
  expectedAspectCount: 0
15914
15852
  };
15915
15853
  }
15916
- const hits = [];
15917
- const misses = [];
15854
+ const assertions = [];
15918
15855
  for (const toolName of toolNames) {
15919
15856
  const required = minimums[toolName];
15920
15857
  const actual = summary.toolCallsByName[toolName] ?? 0;
15921
15858
  if (actual >= required) {
15922
- hits.push(`${toolName}: called ${actual} times (required >=${required})`);
15859
+ assertions.push({
15860
+ text: `${toolName}: called ${actual} times (required >=${required})`,
15861
+ passed: true
15862
+ });
15923
15863
  } else {
15924
- misses.push(`${toolName}: called ${actual} times (required >=${required})`);
15864
+ assertions.push({
15865
+ text: `${toolName}: called ${actual} times (required >=${required})`,
15866
+ passed: false
15867
+ });
15925
15868
  }
15926
15869
  }
15927
- const score = hits.length / toolNames.length;
15870
+ const passedCount = assertions.filter((a) => a.passed).length;
15871
+ const score = passedCount / toolNames.length;
15928
15872
  return {
15929
15873
  score,
15930
15874
  verdict: scoreToVerdict(score),
15931
- hits,
15932
- misses,
15875
+ assertions,
15933
15876
  expectedAspectCount: toolNames.length
15934
15877
  };
15935
15878
  }
@@ -15939,13 +15882,11 @@ var ToolTrajectoryEvaluator = class {
15939
15882
  return {
15940
15883
  score: 1,
15941
15884
  verdict: "pass",
15942
- hits: ["No tool sequence specified"],
15943
- misses: [],
15885
+ assertions: [{ text: "No tool sequence specified", passed: true }],
15944
15886
  expectedAspectCount: 0
15945
15887
  };
15946
15888
  }
15947
- const hits = [];
15948
- const misses = [];
15889
+ const assertions = [];
15949
15890
  const warnings = [];
15950
15891
  let actualIndex = 0;
15951
15892
  let sequenceHits = 0;
@@ -15965,16 +15906,20 @@ var ToolTrajectoryEvaluator = class {
15965
15906
  const actualCall = toolCalls[actualIndex];
15966
15907
  if (actualCall.name === expectedTool) {
15967
15908
  if (argsMatch(expectedItem.args, actualCall.args, mode)) {
15968
- hits.push(`Found ${expectedTool} at position ${actualIndex}`);
15909
+ assertions.push({
15910
+ text: `Found ${expectedTool} at position ${actualIndex}`,
15911
+ passed: true
15912
+ });
15969
15913
  sequenceHits++;
15970
15914
  matchedCall = actualCall;
15971
15915
  actualIndex++;
15972
15916
  found = true;
15973
15917
  break;
15974
15918
  }
15975
- misses.push(
15976
- `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
15977
- );
15919
+ assertions.push({
15920
+ text: `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`,
15921
+ passed: false
15922
+ });
15978
15923
  actualIndex++;
15979
15924
  argsMismatch = true;
15980
15925
  break;
@@ -15982,7 +15927,10 @@ var ToolTrajectoryEvaluator = class {
15982
15927
  actualIndex++;
15983
15928
  }
15984
15929
  if (!found && !argsMismatch) {
15985
- misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
15930
+ assertions.push({
15931
+ text: `Expected ${expectedTool} at position ${i}, not found in remaining trace`,
15932
+ passed: false
15933
+ });
15986
15934
  }
15987
15935
  if (found && matchedCall) {
15988
15936
  const latencyResult = checkLatency(
@@ -15991,10 +15939,10 @@ var ToolTrajectoryEvaluator = class {
15991
15939
  matchedCall.durationMs
15992
15940
  );
15993
15941
  if (latencyResult.status === "pass") {
15994
- hits.push(latencyResult.message);
15942
+ assertions.push({ text: latencyResult.message, passed: true });
15995
15943
  latencyHits++;
15996
15944
  } else if (latencyResult.status === "fail") {
15997
- misses.push(latencyResult.message);
15945
+ assertions.push({ text: latencyResult.message, passed: false });
15998
15946
  } else if (latencyResult.message) {
15999
15947
  warnings.push(latencyResult.message);
16000
15948
  latencySkips++;
@@ -16010,8 +15958,7 @@ var ToolTrajectoryEvaluator = class {
16010
15958
  return {
16011
15959
  score,
16012
15960
  verdict: scoreToVerdict(score),
16013
- hits,
16014
- misses,
15961
+ assertions,
16015
15962
  expectedAspectCount: totalAssertions
16016
15963
  };
16017
15964
  }
@@ -16021,13 +15968,11 @@ var ToolTrajectoryEvaluator = class {
16021
15968
  return {
16022
15969
  score: 1,
16023
15970
  verdict: "pass",
16024
- hits: ["No tool sequence specified"],
16025
- misses: [],
15971
+ assertions: [{ text: "No tool sequence specified", passed: true }],
16026
15972
  expectedAspectCount: 0
16027
15973
  };
16028
15974
  }
16029
- const hits = [];
16030
- const misses = [];
15975
+ const assertions = [];
16031
15976
  const warnings = [];
16032
15977
  let sequenceHits = 0;
16033
15978
  let latencyHits = 0;
@@ -16036,7 +15981,10 @@ var ToolTrajectoryEvaluator = class {
16036
15981
  (item) => item.maxDurationMs !== void 0
16037
15982
  ).length;
16038
15983
  if (toolCalls.length !== expected.length) {
16039
- misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
15984
+ assertions.push({
15985
+ text: `Expected ${expected.length} tool calls, got ${toolCalls.length}`,
15986
+ passed: false
15987
+ });
16040
15988
  }
16041
15989
  const checkLength = Math.min(expected.length, toolCalls.length);
16042
15990
  for (let i = 0; i < checkLength; i++) {
@@ -16048,14 +15996,17 @@ var ToolTrajectoryEvaluator = class {
16048
15996
  let sequenceMatched = false;
16049
15997
  if (actualTool === expectedTool) {
16050
15998
  if (argsMatch(expectedItem.args, actualCall.args, mode)) {
16051
- hits.push(`Position ${i}: ${expectedTool}`);
15999
+ assertions.push({ text: `Position ${i}: ${expectedTool}`, passed: true });
16052
16000
  sequenceHits++;
16053
16001
  sequenceMatched = true;
16054
16002
  } else {
16055
- misses.push(`Position ${i}: ${expectedTool} args mismatch`);
16003
+ assertions.push({ text: `Position ${i}: ${expectedTool} args mismatch`, passed: false });
16056
16004
  }
16057
16005
  } else {
16058
- misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
16006
+ assertions.push({
16007
+ text: `Position ${i}: expected ${expectedTool}, got ${actualTool}`,
16008
+ passed: false
16009
+ });
16059
16010
  }
16060
16011
  if (sequenceMatched) {
16061
16012
  const latencyResult = checkLatency(
@@ -16064,10 +16015,10 @@ var ToolTrajectoryEvaluator = class {
16064
16015
  actualCall.durationMs
16065
16016
  );
16066
16017
  if (latencyResult.status === "pass") {
16067
- hits.push(latencyResult.message);
16018
+ assertions.push({ text: latencyResult.message, passed: true });
16068
16019
  latencyHits++;
16069
16020
  } else if (latencyResult.status === "fail") {
16070
- misses.push(latencyResult.message);
16021
+ assertions.push({ text: latencyResult.message, passed: false });
16071
16022
  } else if (latencyResult.message) {
16072
16023
  warnings.push(latencyResult.message);
16073
16024
  latencySkips++;
@@ -16075,7 +16026,10 @@ var ToolTrajectoryEvaluator = class {
16075
16026
  }
16076
16027
  }
16077
16028
  for (let i = checkLength; i < expected.length; i++) {
16078
- misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
16029
+ assertions.push({
16030
+ text: `Position ${i}: expected ${expected[i].tool}, got nothing`,
16031
+ passed: false
16032
+ });
16079
16033
  }
16080
16034
  for (const warning of warnings) {
16081
16035
  console.warn(`[tool-trajectory] ${warning}`);
@@ -16086,8 +16040,7 @@ var ToolTrajectoryEvaluator = class {
16086
16040
  return {
16087
16041
  score,
16088
16042
  verdict: scoreToVerdict(score),
16089
- hits,
16090
- misses,
16043
+ assertions,
16091
16044
  expectedAspectCount: totalAssertions
16092
16045
  };
16093
16046
  }
@@ -16102,13 +16055,11 @@ var ToolTrajectoryEvaluator = class {
16102
16055
  return {
16103
16056
  score: 1,
16104
16057
  verdict: "pass",
16105
- hits: ["No expected tools specified"],
16106
- misses: [],
16058
+ assertions: [{ text: "No expected tools specified", passed: true }],
16107
16059
  expectedAspectCount: 0
16108
16060
  };
16109
16061
  }
16110
- const hits = [];
16111
- const misses = [];
16062
+ const assertions = [];
16112
16063
  const consumed = /* @__PURE__ */ new Set();
16113
16064
  for (let i = 0; i < expected.length; i++) {
16114
16065
  const expectedItem = expected[i];
@@ -16119,22 +16070,25 @@ var ToolTrajectoryEvaluator = class {
16119
16070
  if (consumed.has(j)) continue;
16120
16071
  const actualCall = toolCalls[j];
16121
16072
  if (actualCall.name === expectedTool && argsMatch(expectedItem.args, actualCall.args, mode)) {
16122
- hits.push(`Found ${expectedTool} at position ${j}`);
16073
+ assertions.push({ text: `Found ${expectedTool} at position ${j}`, passed: true });
16123
16074
  consumed.add(j);
16124
16075
  found = true;
16125
16076
  break;
16126
16077
  }
16127
16078
  }
16128
16079
  if (!found) {
16129
- misses.push(`Expected ${expectedTool} not found in actual trajectory`);
16080
+ assertions.push({
16081
+ text: `Expected ${expectedTool} not found in actual trajectory`,
16082
+ passed: false
16083
+ });
16130
16084
  }
16131
16085
  }
16132
- const score = expected.length > 0 ? hits.length / expected.length : 1;
16086
+ const passedCount = assertions.filter((a) => a.passed).length;
16087
+ const score = expected.length > 0 ? passedCount / expected.length : 1;
16133
16088
  return {
16134
16089
  score,
16135
16090
  verdict: scoreToVerdict(score),
16136
- hits,
16137
- misses,
16091
+ assertions,
16138
16092
  expectedAspectCount: expected.length
16139
16093
  };
16140
16094
  }
@@ -16150,16 +16104,19 @@ var ToolTrajectoryEvaluator = class {
16150
16104
  return {
16151
16105
  score: 1,
16152
16106
  verdict: "pass",
16153
- hits: ["No tool calls and no expected tools"],
16154
- misses: [],
16107
+ assertions: [{ text: "No tool calls and no expected tools", passed: true }],
16155
16108
  expectedAspectCount: 0
16156
16109
  };
16157
16110
  }
16158
16111
  return {
16159
16112
  score: 0,
16160
16113
  verdict: "fail",
16161
- hits: [],
16162
- misses: [`${toolCalls.length} unexpected tool call(s) with empty allowed list`],
16114
+ assertions: [
16115
+ {
16116
+ text: `${toolCalls.length} unexpected tool call(s) with empty allowed list`,
16117
+ passed: false
16118
+ }
16119
+ ],
16163
16120
  expectedAspectCount: toolCalls.length
16164
16121
  };
16165
16122
  }
@@ -16167,13 +16124,11 @@ var ToolTrajectoryEvaluator = class {
16167
16124
  return {
16168
16125
  score: 1,
16169
16126
  verdict: "pass",
16170
- hits: ["No actual tool calls (trivially a subset)"],
16171
- misses: [],
16127
+ assertions: [{ text: "No actual tool calls (trivially a subset)", passed: true }],
16172
16128
  expectedAspectCount: 0
16173
16129
  };
16174
16130
  }
16175
- const hits = [];
16176
- const misses = [];
16131
+ const assertions = [];
16177
16132
  for (let i = 0; i < toolCalls.length; i++) {
16178
16133
  const actualCall = toolCalls[i];
16179
16134
  let allowed = false;
@@ -16185,17 +16140,23 @@ var ToolTrajectoryEvaluator = class {
16185
16140
  }
16186
16141
  }
16187
16142
  if (allowed) {
16188
- hits.push(`Position ${i}: ${actualCall.name} is in allowed set`);
16143
+ assertions.push({
16144
+ text: `Position ${i}: ${actualCall.name} is in allowed set`,
16145
+ passed: true
16146
+ });
16189
16147
  } else {
16190
- misses.push(`Position ${i}: ${actualCall.name} is not in allowed set`);
16148
+ assertions.push({
16149
+ text: `Position ${i}: ${actualCall.name} is not in allowed set`,
16150
+ passed: false
16151
+ });
16191
16152
  }
16192
16153
  }
16193
- const score = toolCalls.length > 0 ? hits.length / toolCalls.length : 1;
16154
+ const passedCount = assertions.filter((a) => a.passed).length;
16155
+ const score = toolCalls.length > 0 ? passedCount / toolCalls.length : 1;
16194
16156
  return {
16195
16157
  score,
16196
16158
  verdict: scoreToVerdict(score),
16197
- hits,
16198
- misses,
16159
+ assertions,
16199
16160
  expectedAspectCount: toolCalls.length
16200
16161
  };
16201
16162
  }
@@ -16206,8 +16167,12 @@ function runContainsAssertion(output, value) {
16206
16167
  const passed = output.includes(value);
16207
16168
  return {
16208
16169
  score: passed ? 1 : 0,
16209
- hits: passed ? [`Output contains "${value}"`] : [],
16210
- misses: passed ? [] : [`Output does not contain "${value}"`]
16170
+ assertions: [
16171
+ {
16172
+ text: passed ? `Output contains "${value}"` : `Output does not contain "${value}"`,
16173
+ passed
16174
+ }
16175
+ ]
16211
16176
  };
16212
16177
  }
16213
16178
  function runContainsAnyAssertion(output, values) {
@@ -16215,8 +16180,12 @@ function runContainsAnyAssertion(output, values) {
16215
16180
  const passed = matched.length > 0;
16216
16181
  return {
16217
16182
  score: passed ? 1 : 0,
16218
- hits: passed ? [`Output contains "${matched[0]}"`] : [],
16219
- misses: passed ? [] : [`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`]
16183
+ assertions: [
16184
+ {
16185
+ text: passed ? `Output contains "${matched[0]}"` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`,
16186
+ passed
16187
+ }
16188
+ ]
16220
16189
  };
16221
16190
  }
16222
16191
  function runContainsAllAssertion(output, values) {
@@ -16224,16 +16193,24 @@ function runContainsAllAssertion(output, values) {
16224
16193
  const passed = missing.length === 0;
16225
16194
  return {
16226
16195
  score: passed ? 1 : 0,
16227
- hits: passed ? [`Output contains all ${values.length} expected strings`] : [],
16228
- misses: passed ? [] : [`Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`]
16196
+ assertions: [
16197
+ {
16198
+ text: passed ? `Output contains all ${values.length} expected strings` : `Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`,
16199
+ passed
16200
+ }
16201
+ ]
16229
16202
  };
16230
16203
  }
16231
16204
  function runIcontainsAssertion(output, value) {
16232
16205
  const passed = output.toLowerCase().includes(value.toLowerCase());
16233
16206
  return {
16234
16207
  score: passed ? 1 : 0,
16235
- hits: passed ? [`Output contains "${value}" (case-insensitive)`] : [],
16236
- misses: passed ? [] : [`Output does not contain "${value}" (case-insensitive)`]
16208
+ assertions: [
16209
+ {
16210
+ text: passed ? `Output contains "${value}" (case-insensitive)` : `Output does not contain "${value}" (case-insensitive)`,
16211
+ passed
16212
+ }
16213
+ ]
16237
16214
  };
16238
16215
  }
16239
16216
  function runIcontainsAnyAssertion(output, values) {
@@ -16242,9 +16219,11 @@ function runIcontainsAnyAssertion(output, values) {
16242
16219
  const passed = matched.length > 0;
16243
16220
  return {
16244
16221
  score: passed ? 1 : 0,
16245
- hits: passed ? [`Output contains "${matched[0]}" (case-insensitive)`] : [],
16246
- misses: passed ? [] : [
16247
- `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`
16222
+ assertions: [
16223
+ {
16224
+ text: passed ? `Output contains "${matched[0]}" (case-insensitive)` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`,
16225
+ passed
16226
+ }
16248
16227
  ]
16249
16228
  };
16250
16229
  }
@@ -16254,24 +16233,36 @@ function runIcontainsAllAssertion(output, values) {
16254
16233
  const passed = missing.length === 0;
16255
16234
  return {
16256
16235
  score: passed ? 1 : 0,
16257
- hits: passed ? [`Output contains all ${values.length} expected strings (case-insensitive)`] : [],
16258
- misses: passed ? [] : [`Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`]
16236
+ assertions: [
16237
+ {
16238
+ text: passed ? `Output contains all ${values.length} expected strings (case-insensitive)` : `Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`,
16239
+ passed
16240
+ }
16241
+ ]
16259
16242
  };
16260
16243
  }
16261
16244
  function runStartsWithAssertion(output, value) {
16262
16245
  const passed = output.trim().startsWith(value.trim());
16263
16246
  return {
16264
16247
  score: passed ? 1 : 0,
16265
- hits: passed ? [`Output starts with "${value}"`] : [],
16266
- misses: passed ? [] : [`Output does not start with "${value}"`]
16248
+ assertions: [
16249
+ {
16250
+ text: passed ? `Output starts with "${value}"` : `Output does not start with "${value}"`,
16251
+ passed
16252
+ }
16253
+ ]
16267
16254
  };
16268
16255
  }
16269
16256
  function runEndsWithAssertion(output, value) {
16270
16257
  const passed = output.trim().endsWith(value.trim());
16271
16258
  return {
16272
16259
  score: passed ? 1 : 0,
16273
- hits: passed ? [`Output ends with "${value}"`] : [],
16274
- misses: passed ? [] : [`Output does not end with "${value}"`]
16260
+ assertions: [
16261
+ {
16262
+ text: passed ? `Output ends with "${value}"` : `Output does not end with "${value}"`,
16263
+ passed
16264
+ }
16265
+ ]
16275
16266
  };
16276
16267
  }
16277
16268
  function runRegexAssertion(output, pattern, flags) {
@@ -16280,8 +16271,12 @@ function runRegexAssertion(output, pattern, flags) {
16280
16271
  const flagsLabel = flags ? ` (flags: ${flags})` : "";
16281
16272
  return {
16282
16273
  score: passed ? 1 : 0,
16283
- hits: passed ? [`Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}`] : [],
16284
- misses: passed ? [] : [`Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`]
16274
+ assertions: [
16275
+ {
16276
+ text: passed ? `Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}` : `Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`,
16277
+ passed
16278
+ }
16279
+ ]
16285
16280
  };
16286
16281
  }
16287
16282
  function runIsJsonAssertion(output) {
@@ -16293,16 +16288,24 @@ function runIsJsonAssertion(output) {
16293
16288
  }
16294
16289
  return {
16295
16290
  score: passed ? 1 : 0,
16296
- hits: passed ? ["Output is valid JSON"] : [],
16297
- misses: passed ? [] : ["Output is not valid JSON"]
16291
+ assertions: [
16292
+ {
16293
+ text: passed ? "Output is valid JSON" : "Output is not valid JSON",
16294
+ passed
16295
+ }
16296
+ ]
16298
16297
  };
16299
16298
  }
16300
16299
  function runEqualsAssertion(output, value) {
16301
16300
  const passed = output.trim() === value.trim();
16302
16301
  return {
16303
16302
  score: passed ? 1 : 0,
16304
- hits: passed ? [`Output equals "${value}"`] : [],
16305
- misses: passed ? [] : [`Output does not equal "${value}"`]
16303
+ assertions: [
16304
+ {
16305
+ text: passed ? `Output equals "${value}"` : `Output does not equal "${value}"`,
16306
+ passed
16307
+ }
16308
+ ]
16306
16309
  };
16307
16310
  }
16308
16311
 
@@ -16515,10 +16518,8 @@ var InlineAssertEvaluator = class {
16515
16518
  return {
16516
16519
  score,
16517
16520
  verdict: scoreToVerdict(score),
16518
- hits: score >= 0.8 ? [result.name] : [],
16519
- misses: score < 0.5 ? [result.name] : [],
16521
+ assertions: [{ text: result.name, passed: score >= 0.5 }],
16520
16522
  expectedAspectCount: 1,
16521
- reasoning: void 0,
16522
16523
  details: result.metadata ? result.metadata : void 0
16523
16524
  };
16524
16525
  }
@@ -16556,11 +16557,9 @@ async function resolveCustomPrompt(promptConfig, context2, timeoutMs) {
16556
16557
  }
16557
16558
  async function executePromptTemplate(script, context2, config, timeoutMs) {
16558
16559
  const payload = {
16559
- question: context2.evalCase.question,
16560
16560
  criteria: context2.evalCase.criteria,
16561
16561
  expectedOutput: context2.evalCase.expected_output,
16562
- referenceAnswer: context2.evalCase.reference_answer,
16563
- answer: context2.candidate,
16562
+ outputText: context2.candidate,
16564
16563
  output: context2.output ?? null,
16565
16564
  guidelineFiles: context2.evalCase.guideline_paths,
16566
16565
  inputFiles: context2.evalCase.file_paths.filter(
@@ -16571,9 +16570,7 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
16571
16570
  fileChanges: context2.fileChanges ?? null,
16572
16571
  workspacePath: context2.workspacePath ?? null,
16573
16572
  config: config ?? context2.config ?? null,
16574
- // Text convenience accessors (new names, always strings)
16575
16573
  inputText: context2.evalCase.question,
16576
- outputText: context2.candidate,
16577
16574
  expectedOutputText: context2.evalCase.reference_answer ?? ""
16578
16575
  };
16579
16576
  const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
@@ -16711,9 +16708,7 @@ var containsFactory = (config) => {
16711
16708
  return {
16712
16709
  score: result.score,
16713
16710
  verdict: result.score === 1 ? "pass" : "fail",
16714
- hits: result.hits,
16715
- misses: result.misses,
16716
- reasoning: result.score === 1 ? `Output contains "${c.value}"` : `Output does not contain "${c.value}"`,
16711
+ assertions: result.assertions,
16717
16712
  expectedAspectCount: 1
16718
16713
  };
16719
16714
  });
@@ -16725,9 +16720,7 @@ var regexFactory = (config) => {
16725
16720
  return {
16726
16721
  score: result.score,
16727
16722
  verdict: result.score === 1 ? "pass" : "fail",
16728
- hits: result.hits,
16729
- misses: result.misses,
16730
- reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
16723
+ assertions: result.assertions,
16731
16724
  expectedAspectCount: 1
16732
16725
  };
16733
16726
  });
@@ -16738,9 +16731,7 @@ var isJsonFactory = () => {
16738
16731
  return {
16739
16732
  score: result.score,
16740
16733
  verdict: result.score === 1 ? "pass" : "fail",
16741
- hits: result.hits,
16742
- misses: result.misses,
16743
- reasoning: result.score === 1 ? "Output is valid JSON" : "Output is not valid JSON",
16734
+ assertions: result.assertions,
16744
16735
  expectedAspectCount: 1
16745
16736
  };
16746
16737
  });
@@ -16752,9 +16743,7 @@ var equalsFactory = (config) => {
16752
16743
  return {
16753
16744
  score: result.score,
16754
16745
  verdict: result.score === 1 ? "pass" : "fail",
16755
- hits: result.hits,
16756
- misses: result.misses,
16757
- reasoning: result.score === 1 ? `Output equals "${c.value}"` : `Output does not equal "${c.value}"`,
16746
+ assertions: result.assertions,
16758
16747
  expectedAspectCount: 1
16759
16748
  };
16760
16749
  });
@@ -16766,9 +16755,7 @@ var containsAnyFactory = (config) => {
16766
16755
  return {
16767
16756
  score: result.score,
16768
16757
  verdict: result.score === 1 ? "pass" : "fail",
16769
- hits: result.hits,
16770
- misses: result.misses,
16771
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
16758
+ assertions: result.assertions,
16772
16759
  expectedAspectCount: 1
16773
16760
  };
16774
16761
  });
@@ -16780,9 +16767,7 @@ var containsAllFactory = (config) => {
16780
16767
  return {
16781
16768
  score: result.score,
16782
16769
  verdict: result.score === 1 ? "pass" : "fail",
16783
- hits: result.hits,
16784
- misses: result.misses,
16785
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
16770
+ assertions: result.assertions,
16786
16771
  expectedAspectCount: 1
16787
16772
  };
16788
16773
  });
@@ -16794,9 +16779,7 @@ var icontainsFactory = (config) => {
16794
16779
  return {
16795
16780
  score: result.score,
16796
16781
  verdict: result.score === 1 ? "pass" : "fail",
16797
- hits: result.hits,
16798
- misses: result.misses,
16799
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
16782
+ assertions: result.assertions,
16800
16783
  expectedAspectCount: 1
16801
16784
  };
16802
16785
  });
@@ -16808,9 +16791,7 @@ var icontainsAnyFactory = (config) => {
16808
16791
  return {
16809
16792
  score: result.score,
16810
16793
  verdict: result.score === 1 ? "pass" : "fail",
16811
- hits: result.hits,
16812
- misses: result.misses,
16813
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
16794
+ assertions: result.assertions,
16814
16795
  expectedAspectCount: 1
16815
16796
  };
16816
16797
  });
@@ -16822,9 +16803,7 @@ var icontainsAllFactory = (config) => {
16822
16803
  return {
16823
16804
  score: result.score,
16824
16805
  verdict: result.score === 1 ? "pass" : "fail",
16825
- hits: result.hits,
16826
- misses: result.misses,
16827
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
16806
+ assertions: result.assertions,
16828
16807
  expectedAspectCount: 1
16829
16808
  };
16830
16809
  });
@@ -16836,9 +16815,7 @@ var startsWithFactory = (config) => {
16836
16815
  return {
16837
16816
  score: result.score,
16838
16817
  verdict: result.score === 1 ? "pass" : "fail",
16839
- hits: result.hits,
16840
- misses: result.misses,
16841
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
16818
+ assertions: result.assertions,
16842
16819
  expectedAspectCount: 1
16843
16820
  };
16844
16821
  });
@@ -16850,9 +16827,7 @@ var endsWithFactory = (config) => {
16850
16827
  return {
16851
16828
  score: result.score,
16852
16829
  verdict: result.score === 1 ? "pass" : "fail",
16853
- hits: result.hits,
16854
- misses: result.misses,
16855
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
16830
+ assertions: result.assertions,
16856
16831
  expectedAspectCount: 1
16857
16832
  };
16858
16833
  });
@@ -18258,9 +18233,8 @@ async function runEvaluation(options) {
18258
18233
  testId: evalCase.id,
18259
18234
  dataset: evalCase.dataset,
18260
18235
  score: 0,
18261
- hits: [],
18262
- misses: [],
18263
- answer: "",
18236
+ assertions: [],
18237
+ outputText: "",
18264
18238
  target: target.name,
18265
18239
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
18266
18240
  budgetExceeded: true,
@@ -18295,9 +18269,8 @@ async function runEvaluation(options) {
18295
18269
  testId: evalCase.id,
18296
18270
  dataset: evalCase.dataset,
18297
18271
  score: 0,
18298
- hits: [],
18299
- misses: [],
18300
- answer: "",
18272
+ assertions: [],
18273
+ outputText: "",
18301
18274
  target: target.name,
18302
18275
  error: errorMsg,
18303
18276
  executionStatus: "execution_error",
@@ -19263,11 +19236,9 @@ async function evaluateCandidate(options) {
19263
19236
  dataset: evalCase.dataset,
19264
19237
  conversationId: evalCase.conversation_id,
19265
19238
  score: score.score,
19266
- hits: score.hits,
19267
- misses: score.misses,
19268
- answer: candidate,
19239
+ assertions: score.assertions,
19240
+ outputText: candidate,
19269
19241
  target: target.name,
19270
- reasoning: score.reasoning,
19271
19242
  tokenUsage,
19272
19243
  costUsd,
19273
19244
  durationMs,
@@ -19441,9 +19412,7 @@ async function runEvaluatorList(options) {
19441
19412
  score: score2.score,
19442
19413
  weight,
19443
19414
  verdict: score2.verdict,
19444
- hits: score2.hits,
19445
- misses: score2.misses,
19446
- reasoning: score2.reasoning,
19415
+ assertions: score2.assertions,
19447
19416
  evaluatorProviderRequest: score2.evaluatorRawRequest,
19448
19417
  details: score2.details,
19449
19418
  scores: mapChildResults(score2.scores),
@@ -19458,10 +19427,10 @@ async function runEvaluatorList(options) {
19458
19427
  const fallbackScore = {
19459
19428
  score: 0,
19460
19429
  verdict: "fail",
19461
- hits: [],
19462
- misses: [`Evaluator '${evaluatorConfig.name}' failed: ${message}`],
19463
- expectedAspectCount: 1,
19464
- reasoning: message
19430
+ assertions: [
19431
+ { text: `Evaluator '${evaluatorConfig.name}' failed: ${message}`, passed: false }
19432
+ ],
19433
+ expectedAspectCount: 1
19465
19434
  };
19466
19435
  const weight = evaluatorConfig.weight ?? 1;
19467
19436
  scored.push({
@@ -19477,9 +19446,12 @@ async function runEvaluatorList(options) {
19477
19446
  score: 0,
19478
19447
  weight,
19479
19448
  verdict: "fail",
19480
- hits: [],
19481
- misses: [`Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`],
19482
- reasoning: message,
19449
+ assertions: [
19450
+ {
19451
+ text: `Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
19452
+ passed: false
19453
+ }
19454
+ ],
19483
19455
  durationMs: endedAt.getTime() - startedAt.getTime(),
19484
19456
  startedAt: startedAt.toISOString(),
19485
19457
  endedAt: endedAt.toISOString()
@@ -19495,9 +19467,7 @@ async function runEvaluatorList(options) {
19495
19467
  ...scores[lastScoresIdx],
19496
19468
  score: negated.score,
19497
19469
  verdict: negated.verdict,
19498
- hits: [...negated.hits],
19499
- misses: [...negated.misses],
19500
- reasoning: negated.reasoning
19470
+ assertions: [...negated.assertions]
19501
19471
  };
19502
19472
  }
19503
19473
  }
@@ -19512,21 +19482,13 @@ async function runEvaluatorList(options) {
19512
19482
  const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
19513
19483
  scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
19514
19484
  ) : 0;
19515
- const hits = scored.flatMap((entry) => entry.score.hits);
19516
- const misses = scored.flatMap((entry) => entry.score.misses);
19517
- const expectedAspectCount = scored.reduce(
19518
- (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
19519
- 0
19520
- );
19521
- const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
19522
- const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
19485
+ const assertions = scored.flatMap((entry) => entry.score.assertions);
19486
+ const expectedAspectCount = assertions.length || 1;
19523
19487
  const score = {
19524
19488
  score: aggregateScore,
19525
19489
  verdict: scoreToVerdict(aggregateScore),
19526
- hits,
19527
- misses,
19528
- expectedAspectCount,
19529
- reasoning
19490
+ assertions,
19491
+ expectedAspectCount
19530
19492
  };
19531
19493
  return { score, scores };
19532
19494
  }
@@ -19630,9 +19592,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
19630
19592
  dataset: evalCase.dataset,
19631
19593
  conversationId: evalCase.conversation_id,
19632
19594
  score: 0,
19633
- hits: [],
19634
- misses: [`Error: ${message}`],
19635
- answer: `Error occurred: ${message}`,
19595
+ assertions: [{ text: `Error: ${message}`, passed: false }],
19596
+ outputText: `Error occurred: ${message}`,
19636
19597
  target: targetName,
19637
19598
  requests,
19638
19599
  input,
@@ -19741,9 +19702,7 @@ function mapChildResults(children) {
19741
19702
  score: child.score,
19742
19703
  weight: child.weight,
19743
19704
  verdict: child.verdict,
19744
- hits: child.hits,
19745
- misses: child.misses,
19746
- reasoning: child.reasoning,
19705
+ assertions: child.assertions,
19747
19706
  evaluatorProviderRequest: child.evaluatorRawRequest,
19748
19707
  scores: mapChildResults(child.scores),
19749
19708
  details: child.details,
@@ -20172,7 +20131,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
20172
20131
 
20173
20132
  // src/evaluation/baseline.ts
20174
20133
  var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
20175
- "answer",
20134
+ "outputText",
20176
20135
  "requests",
20177
20136
  "trace",
20178
20137
  "workspacePath",
@@ -20346,7 +20305,7 @@ var OtelTraceExporter = class {
20346
20305
  rootSpan.setAttribute("agentv.target", result.target);
20347
20306
  if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
20348
20307
  rootSpan.setAttribute("agentv.score", result.score);
20349
- if (captureContent) rootSpan.setAttribute("agentv.answer", result.answer);
20308
+ if (captureContent) rootSpan.setAttribute("agentv.output_text", result.outputText);
20350
20309
  if (result.durationMs != null)
20351
20310
  rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
20352
20311
  if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
@@ -20713,7 +20672,6 @@ function createAgentKernel() {
20713
20672
  freeformEvaluationSchema,
20714
20673
  generateRubrics,
20715
20674
  getAgentvHome,
20716
- getHitCount,
20717
20675
  getOutputFilenames,
20718
20676
  getSubagentsRoot,
20719
20677
  getTraceStateRoot,