agentv 3.4.0 → 3.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -149,7 +149,7 @@ import {
149
149
  withUserAgentSuffix,
150
150
  withoutTrailingSlash,
151
151
  zodSchema
152
- } from "./chunk-AR3QEKXH.js";
152
+ } from "./chunk-BJV6MDBE.js";
153
153
  import {
154
154
  SpanStatusCode,
155
155
  context,
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
301
301
  }
302
302
  });
303
303
 
304
- // ../../packages/core/dist/chunk-JO4HIAEF.js
304
+ // ../../packages/core/dist/chunk-2IZOTQ25.js
305
305
  import { constants } from "node:fs";
306
306
  import { access, readFile } from "node:fs/promises";
307
307
  import path from "node:path";
@@ -419,7 +419,7 @@ __export(external_exports2, {
419
419
  void: () => voidType
420
420
  });
421
421
 
422
- // ../../packages/core/dist/chunk-JO4HIAEF.js
422
+ // ../../packages/core/dist/chunk-2IZOTQ25.js
423
423
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
424
424
  var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
425
425
  var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
@@ -498,9 +498,6 @@ var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
498
498
  function isEvaluatorKind(value) {
499
499
  return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
500
500
  }
501
- function getHitCount(result) {
502
- return result.hits.length;
503
- }
504
501
  async function fileExists(filePath) {
505
502
  try {
506
503
  await access(filePath, constants.F_OK);
@@ -14658,14 +14655,8 @@ function logWarning(message) {
14658
14655
  console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET3}`);
14659
14656
  }
14660
14657
  var TEMPLATE_VARIABLES = {
14661
- /** @deprecated Use OUTPUT_TEXT instead */
14662
- ANSWER: "answer",
14663
14658
  EXPECTED_OUTPUT: "expected_output",
14664
- /** @deprecated Use INPUT_TEXT instead */
14665
- QUESTION: "question",
14666
14659
  CRITERIA: "criteria",
14667
- /** @deprecated Use EXPECTED_OUTPUT_TEXT instead */
14668
- REFERENCE_ANSWER: "reference_answer",
14669
14660
  INPUT: "input",
14670
14661
  OUTPUT: "output",
14671
14662
  FILE_CHANGES: "file_changes",
@@ -14675,9 +14666,8 @@ var TEMPLATE_VARIABLES = {
14675
14666
  };
14676
14667
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
14677
14668
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
14678
- TEMPLATE_VARIABLES.ANSWER,
14679
- TEMPLATE_VARIABLES.EXPECTED_OUTPUT,
14680
- TEMPLATE_VARIABLES.OUTPUT_TEXT
14669
+ TEMPLATE_VARIABLES.OUTPUT_TEXT,
14670
+ TEMPLATE_VARIABLES.EXPECTED_OUTPUT
14681
14671
  ]);
14682
14672
  var ANSI_YELLOW3 = "\x1B[33m";
14683
14673
  var ANSI_RESET4 = "\x1B[0m";
@@ -14698,13 +14688,13 @@ function validateTemplateVariables(content, source) {
14698
14688
  }
14699
14689
  match = variablePattern.exec(content);
14700
14690
  }
14701
- const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.ANSWER) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
14691
+ const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
14702
14692
  const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
14703
14693
  const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
14704
14694
  if (!hasRequiredFields) {
14705
14695
  throw new Error(
14706
14696
  `Missing required fields. Must include at least one of:
14707
- - {{ ${TEMPLATE_VARIABLES.ANSWER} }} or {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
14697
+ - {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
14708
14698
  - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
14709
14699
  );
14710
14700
  }
@@ -17623,7 +17613,7 @@ var AzureProvider = class {
17623
17613
  };
17624
17614
  this.retryConfig = config.retry;
17625
17615
  const azure = createAzure(buildAzureOptions(config));
17626
- this.model = azure(config.deploymentName);
17616
+ this.model = azure.chat(config.deploymentName);
17627
17617
  }
17628
17618
  id;
17629
17619
  kind = "azure";
@@ -17846,6 +17836,8 @@ async function invokeModel(options) {
17846
17836
  const { model, request, defaults, retryConfig, providerOptions } = options;
17847
17837
  const chatPrompt = buildChatPrompt(request);
17848
17838
  const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
17839
+ const startTime = (/* @__PURE__ */ new Date()).toISOString();
17840
+ const startMs = Date.now();
17849
17841
  const result = await withRetry(
17850
17842
  () => generateText({
17851
17843
  model,
@@ -17859,9 +17851,11 @@ async function invokeModel(options) {
17859
17851
  retryConfig,
17860
17852
  request.signal
17861
17853
  );
17862
- return mapResponse(result);
17854
+ const endTime = (/* @__PURE__ */ new Date()).toISOString();
17855
+ const durationMs = Date.now() - startMs;
17856
+ return mapResponse(result, { durationMs, startTime, endTime });
17863
17857
  }
17864
- function mapResponse(result) {
17858
+ function mapResponse(result, timing) {
17865
17859
  const content = result.text ?? "";
17866
17860
  const rawUsage = result.totalUsage ?? result.usage;
17867
17861
  const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
@@ -17876,7 +17870,10 @@ function mapResponse(result) {
17876
17870
  raw: result,
17877
17871
  usage: toJsonObject(rawUsage),
17878
17872
  output: [{ role: "assistant", content }],
17879
- tokenUsage
17873
+ tokenUsage,
17874
+ durationMs: timing?.durationMs,
17875
+ startTime: timing?.startTime,
17876
+ endTime: timing?.endTime
17880
17877
  };
17881
17878
  }
17882
17879
  function toJsonObject(value) {
@@ -18734,10 +18731,12 @@ var ClaudeSdkProvider = class {
18734
18731
  if (usage) {
18735
18732
  const inputTokens = (usage.input_tokens ?? 0) + (usage.cache_read_input_tokens ?? 0) + (usage.cache_creation_input_tokens ?? 0);
18736
18733
  const outputTokens = usage.output_tokens ?? 0;
18734
+ const reasoningTokens = usage.reasoning_tokens ?? void 0;
18737
18735
  tokenUsage = {
18738
18736
  input: inputTokens,
18739
18737
  output: outputTokens,
18740
- cached: usage.cache_read_input_tokens ?? void 0
18738
+ cached: usage.cache_read_input_tokens ?? void 0,
18739
+ reasoning: reasoningTokens
18741
18740
  };
18742
18741
  request.streamCallbacks?.onLlmCallEnd?.(this.config.model ?? "claude", tokenUsage);
18743
18742
  }
@@ -19733,7 +19732,8 @@ ${basePrompt}` : basePrompt;
19733
19732
  onUsage({
19734
19733
  input: usage.input_tokens ?? 0,
19735
19734
  output: usage.output_tokens ?? 0,
19736
- cached: usage.cached_input_tokens ?? void 0
19735
+ cached: usage.cached_input_tokens ?? void 0,
19736
+ reasoning: usage.reasoning_tokens ?? void 0
19737
19737
  });
19738
19738
  }
19739
19739
  }
@@ -21701,10 +21701,12 @@ function extractTokenUsage(events) {
21701
21701
  output: output ?? 0
21702
21702
  };
21703
21703
  const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
21704
- if (cached !== void 0) {
21705
- return { ...result, cached };
21706
- }
21707
- return result;
21704
+ const reasoning = toFiniteNumber(u.reasoning_tokens ?? u.reasoningTokens ?? u.reasoning);
21705
+ return {
21706
+ ...result,
21707
+ ...cached !== void 0 ? { cached } : {},
21708
+ ...reasoning !== void 0 ? { reasoning } : {}
21709
+ };
21708
21710
  }
21709
21711
  }
21710
21712
  const messages = record.messages;
@@ -23483,9 +23485,11 @@ function negateScore(score) {
23483
23485
  ...score,
23484
23486
  score: negatedScore,
23485
23487
  verdict: negatedVerdict,
23486
- reasoning: score.reasoning ? `[Negated] ${score.reasoning} (original score: ${score.score.toFixed(2)})` : `[Negated] Original score: ${score.score.toFixed(2)}`,
23487
- hits: score.misses,
23488
- misses: score.hits
23488
+ assertions: score.assertions.map((a) => ({
23489
+ ...a,
23490
+ passed: !a.passed,
23491
+ evidence: a.evidence ? `[Negated] ${a.evidence}` : void 0
23492
+ }))
23489
23493
  };
23490
23494
  }
23491
23495
  function shellEscapePath(value) {
@@ -23928,11 +23932,9 @@ var CodeEvaluator = class {
23928
23932
  }
23929
23933
  }
23930
23934
  const payload = {
23931
- question: context2.evalCase.question,
23932
23935
  criteria: context2.evalCase.criteria,
23933
23936
  expectedOutput: context2.evalCase.expected_output,
23934
- referenceAnswer: context2.evalCase.reference_answer,
23935
- answer: context2.candidate,
23937
+ outputText: context2.candidate,
23936
23938
  output: outputForPayload,
23937
23939
  outputPath,
23938
23940
  guidelineFiles: context2.evalCase.guideline_paths,
@@ -23949,9 +23951,7 @@ var CodeEvaluator = class {
23949
23951
  fileChanges: context2.fileChanges ?? null,
23950
23952
  workspacePath: context2.workspacePath ?? null,
23951
23953
  config: this.config ?? null,
23952
- // Text convenience accessors (new names, always strings)
23953
23954
  inputText: context2.evalCase.question,
23954
- outputText: context2.candidate,
23955
23955
  expectedOutputText: context2.evalCase.reference_answer ?? ""
23956
23956
  };
23957
23957
  const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
@@ -23985,9 +23985,13 @@ var CodeEvaluator = class {
23985
23985
  );
23986
23986
  const parsed = parseJsonSafe(stdout);
23987
23987
  const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
23988
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
23989
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
23990
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
23988
+ const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
23989
+ (a) => typeof a === "object" && a !== null && typeof a.text === "string"
23990
+ ).map((a) => ({
23991
+ text: String(a.text),
23992
+ passed: Boolean(a.passed),
23993
+ ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
23994
+ })) : [];
23991
23995
  const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
23992
23996
  const proxyUsage = getProxyUsage?.();
23993
23997
  const evaluatorRawRequest = {
@@ -24003,10 +24007,8 @@ var CodeEvaluator = class {
24003
24007
  return {
24004
24008
  score,
24005
24009
  verdict: scoreToVerdict(score),
24006
- hits,
24007
- misses,
24008
- expectedAspectCount: hits.length + misses.length || 1,
24009
- reasoning,
24010
+ assertions,
24011
+ expectedAspectCount: assertions.length || 1,
24010
24012
  evaluatorRawRequest,
24011
24013
  ...details ? { details } : {},
24012
24014
  tokenUsage: proxyUsage?.tokenUsage
@@ -24017,10 +24019,8 @@ var CodeEvaluator = class {
24017
24019
  return {
24018
24020
  score: 0,
24019
24021
  verdict: "fail",
24020
- hits: [],
24021
- misses: [`Code evaluator failed: ${message}`],
24022
+ assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
24022
24023
  expectedAspectCount: 1,
24023
- reasoning: message,
24024
24024
  evaluatorRawRequest: {
24025
24025
  command: this.command,
24026
24026
  ...this.cwd ? { cwd: this.cwd } : {},
@@ -24110,18 +24110,22 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
24110
24110
  {{${TEMPLATE_VARIABLES.CRITERIA}}}
24111
24111
 
24112
24112
  [[ ## question ## ]]
24113
- {{${TEMPLATE_VARIABLES.QUESTION}}}
24113
+ {{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
24114
24114
 
24115
24115
  [[ ## reference_answer ## ]]
24116
- {{${TEMPLATE_VARIABLES.REFERENCE_ANSWER}}}
24116
+ {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
24117
24117
 
24118
24118
  [[ ## answer ## ]]
24119
- {{${TEMPLATE_VARIABLES.ANSWER}}}`;
24119
+ {{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
24120
24120
  var freeformEvaluationSchema = external_exports2.object({
24121
24121
  score: external_exports2.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
24122
- hits: external_exports2.array(external_exports2.string()).describe("Brief specific achievements").optional(),
24123
- misses: external_exports2.array(external_exports2.string()).describe("Brief failures or omissions").optional(),
24124
- reasoning: external_exports2.string().describe("Concise explanation (1-2 sentences)").optional()
24122
+ assertions: external_exports2.array(
24123
+ external_exports2.object({
24124
+ text: external_exports2.string().describe("Brief description of what was checked"),
24125
+ passed: external_exports2.boolean().describe("Whether this aspect was satisfied"),
24126
+ evidence: external_exports2.string().describe("Concise evidence (1-2 sentences)").optional()
24127
+ })
24128
+ ).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
24125
24129
  });
24126
24130
  var rubricCheckResultSchema = external_exports2.object({
24127
24131
  id: external_exports2.string().describe("The ID of the rubric item being checked"),
@@ -24190,12 +24194,8 @@ var LlmGraderEvaluator = class {
24190
24194
  2
24191
24195
  ),
24192
24196
  [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context2.output ?? [], null, 2),
24193
- [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
24194
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
24195
24197
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
24196
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
24197
24198
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
24198
- // Text convenience accessors (new names, always strings)
24199
24199
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
24200
24200
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
24201
24201
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
@@ -24223,17 +24223,12 @@ ${context2.fileChanges}`;
24223
24223
  schema: freeformEvaluationSchema
24224
24224
  });
24225
24225
  const score = clampScore(data.score);
24226
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
24227
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
24228
- const reasoning = data.reasoning;
24229
- const expectedAspectCount = Math.max(hits.length + misses.length, 1);
24226
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
24230
24227
  return {
24231
24228
  score,
24232
24229
  verdict: scoreToVerdict(score),
24233
- hits,
24234
- misses,
24235
- expectedAspectCount,
24236
- reasoning,
24230
+ assertions,
24231
+ expectedAspectCount: Math.max(assertions.length, 1),
24237
24232
  evaluatorRawRequest,
24238
24233
  tokenUsage
24239
24234
  };
@@ -24244,10 +24239,8 @@ ${context2.fileChanges}`;
24244
24239
  return {
24245
24240
  score: 0,
24246
24241
  verdict: "skip",
24247
- hits: [],
24248
- misses: [`Grader parse failure after 3 attempts: ${message}`],
24242
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
24249
24243
  expectedAspectCount: 1,
24250
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
24251
24244
  evaluatorRawRequest
24252
24245
  };
24253
24246
  }
@@ -24277,14 +24270,12 @@ ${context2.fileChanges}`;
24277
24270
  userPrompt: prompt,
24278
24271
  schema: rubricEvaluationSchema
24279
24272
  });
24280
- const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
24273
+ const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
24281
24274
  return {
24282
24275
  score,
24283
24276
  verdict,
24284
- hits,
24285
- misses,
24277
+ assertions,
24286
24278
  expectedAspectCount: rubrics.length,
24287
- reasoning: data.overall_reasoning,
24288
24279
  evaluatorRawRequest,
24289
24280
  tokenUsage
24290
24281
  };
@@ -24295,10 +24286,8 @@ ${context2.fileChanges}`;
24295
24286
  return {
24296
24287
  score: 0,
24297
24288
  verdict: "skip",
24298
- hits: [],
24299
- misses: [`Grader parse failure after 3 attempts: ${message}`],
24289
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
24300
24290
  expectedAspectCount: rubrics.length,
24301
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
24302
24291
  evaluatorRawRequest
24303
24292
  };
24304
24293
  }
@@ -24323,14 +24312,12 @@ ${context2.fileChanges}`;
24323
24312
  userPrompt: prompt,
24324
24313
  schema: scoreRangeEvaluationSchema
24325
24314
  });
24326
- const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
24315
+ const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
24327
24316
  return {
24328
24317
  score,
24329
24318
  verdict,
24330
- hits,
24331
- misses,
24319
+ assertions,
24332
24320
  expectedAspectCount: rubrics.length,
24333
- reasoning: data.overall_reasoning,
24334
24321
  evaluatorRawRequest,
24335
24322
  details,
24336
24323
  tokenUsage
@@ -24342,10 +24329,8 @@ ${context2.fileChanges}`;
24342
24329
  return {
24343
24330
  score: 0,
24344
24331
  verdict: "skip",
24345
- hits: [],
24346
- misses: [`Grader parse failure after 3 attempts: ${message}`],
24332
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
24347
24333
  expectedAspectCount: rubrics.length,
24348
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
24349
24334
  evaluatorRawRequest
24350
24335
  };
24351
24336
  }
@@ -24402,8 +24387,7 @@ ${context2.fileChanges}`;
24402
24387
  return {
24403
24388
  score: 0,
24404
24389
  verdict: "fail",
24405
- hits: [],
24406
- misses: [`llm-grader built-in evaluation failed: ${message}`],
24390
+ assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
24407
24391
  expectedAspectCount: 1,
24408
24392
  evaluatorRawRequest,
24409
24393
  details: { mode: "built-in", error: message }
@@ -24453,8 +24437,9 @@ ${context2.fileChanges}`;
24453
24437
  return {
24454
24438
  score: 0,
24455
24439
  verdict: "fail",
24456
- hits: [],
24457
- misses: [`llm-grader ${modeLabel} returned no assistant response`],
24440
+ assertions: [
24441
+ { text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
24442
+ ],
24458
24443
  expectedAspectCount: 1,
24459
24444
  evaluatorRawRequest,
24460
24445
  details: { mode: modeLabel, grader_target: provider.targetName }
@@ -24472,8 +24457,9 @@ ${context2.fileChanges}`;
24472
24457
  return {
24473
24458
  score: 0,
24474
24459
  verdict: "fail",
24475
- hits: [],
24476
- misses: [`llm-grader ${modeLabel} evaluation failed: ${message}`],
24460
+ assertions: [
24461
+ { text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
24462
+ ],
24477
24463
  expectedAspectCount: 1,
24478
24464
  evaluatorRawRequest,
24479
24465
  details: {
@@ -24514,10 +24500,10 @@ ${context2.fileChanges}`;
24514
24500
  buildAgentUserPrompt(context2) {
24515
24501
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
24516
24502
  const variables = {
24517
- [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
24518
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
24519
24503
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
24520
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
24504
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
24505
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
24506
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
24521
24507
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
24522
24508
  };
24523
24509
  if (this.evaluatorTemplate) {
@@ -24570,10 +24556,10 @@ ${context2.fileChanges}`;
24570
24556
  const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
24571
24557
  if (this.evaluatorTemplate) {
24572
24558
  const variables = {
24573
- [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
24574
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
24575
24559
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
24576
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
24560
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
24561
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
24562
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
24577
24563
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
24578
24564
  };
24579
24565
  const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
@@ -24625,29 +24611,24 @@ ${outputSchema2}`;
24625
24611
  const parsed = parseJsonFromText(text2);
24626
24612
  if (rubrics && rubrics.length > 0) {
24627
24613
  const data2 = rubricEvaluationSchema.parse(parsed);
24628
- const { score: score2, verdict, hits: hits2, misses: misses2 } = calculateRubricScore(data2, rubrics);
24614
+ const { score: score2, verdict, assertions: assertions2 } = calculateRubricScore(data2, rubrics);
24629
24615
  return {
24630
24616
  score: score2,
24631
24617
  verdict,
24632
- hits: hits2,
24633
- misses: misses2,
24618
+ assertions: assertions2,
24634
24619
  expectedAspectCount: rubrics.length,
24635
- reasoning: data2.overall_reasoning,
24636
24620
  evaluatorRawRequest,
24637
24621
  details
24638
24622
  };
24639
24623
  }
24640
24624
  const data = freeformEvaluationSchema.parse(parsed);
24641
24625
  const score = clampScore(data.score);
24642
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
24643
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
24626
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
24644
24627
  return {
24645
24628
  score,
24646
24629
  verdict: scoreToVerdict(score),
24647
- hits,
24648
- misses,
24649
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
24650
- reasoning: data.reasoning,
24630
+ assertions,
24631
+ expectedAspectCount: Math.max(assertions.length, 1),
24651
24632
  evaluatorRawRequest,
24652
24633
  details
24653
24634
  };
@@ -24655,8 +24636,12 @@ ${outputSchema2}`;
24655
24636
  return {
24656
24637
  score: 0,
24657
24638
  verdict: "fail",
24658
- hits: [],
24659
- misses: ["Failed to parse llm-grader agent response as valid evaluation JSON"],
24639
+ assertions: [
24640
+ {
24641
+ text: "Failed to parse llm-grader agent response as valid evaluation JSON",
24642
+ passed: false
24643
+ }
24644
+ ],
24660
24645
  expectedAspectCount: 1,
24661
24646
  evaluatorRawRequest,
24662
24647
  details
@@ -24785,9 +24770,13 @@ function buildOutputSchema() {
24785
24770
  "",
24786
24771
  "{",
24787
24772
  ' "score": <number between 0.0 and 1.0>,',
24788
- ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
24789
- ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
24790
- ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
24773
+ ' "assertions": [',
24774
+ " {",
24775
+ ' "text": "<brief description of what was checked>",',
24776
+ ' "passed": <boolean>,',
24777
+ ' "evidence": "<concise evidence, 1-2 sentences, optional>"',
24778
+ " }",
24779
+ " ]",
24791
24780
  "}"
24792
24781
  ].join("\n");
24793
24782
  }
@@ -24812,8 +24801,7 @@ function substituteVariables(template, variables) {
24812
24801
  }
24813
24802
  function calculateRubricScore(result, rubrics) {
24814
24803
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
24815
- const hits = [];
24816
- const misses = [];
24804
+ const assertions = [];
24817
24805
  let totalWeight = 0;
24818
24806
  let earnedWeight = 0;
24819
24807
  let failedRequired = false;
@@ -24823,19 +24811,20 @@ function calculateRubricScore(result, rubrics) {
24823
24811
  continue;
24824
24812
  }
24825
24813
  totalWeight += rubric.weight;
24814
+ assertions.push({
24815
+ text: `[${rubric.id}] ${rubric.outcome}`,
24816
+ passed: check.satisfied,
24817
+ evidence: check.reasoning
24818
+ });
24826
24819
  if (check.satisfied) {
24827
24820
  earnedWeight += rubric.weight;
24828
- hits.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
24829
- } else {
24830
- misses.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
24831
- if (rubric.required) {
24832
- failedRequired = true;
24833
- }
24821
+ } else if (rubric.required) {
24822
+ failedRequired = true;
24834
24823
  }
24835
24824
  }
24836
24825
  const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
24837
24826
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
24838
- return { score, verdict, hits, misses };
24827
+ return { score, verdict, assertions };
24839
24828
  }
24840
24829
  function buildScoreRangeOutputSchema() {
24841
24830
  return `You are an expert evaluator. Score the candidate answer on each criterion.
@@ -24855,8 +24844,7 @@ Important: The "score" must be an integer from 0 to 10 that falls within one of
24855
24844
  }
24856
24845
  function calculateScoreRangeResult(result, rubrics) {
24857
24846
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
24858
- const hits = [];
24859
- const misses = [];
24847
+ const assertions = [];
24860
24848
  const rawScores = {};
24861
24849
  let totalWeight = 0;
24862
24850
  let weightedScoreSum = 0;
@@ -24882,24 +24870,22 @@ function calculateScoreRangeResult(result, rubrics) {
24882
24870
  );
24883
24871
  const rangeDescription = matchingRange?.outcome ?? "";
24884
24872
  const criterionLabel = rubric.outcome ?? rubric.id;
24885
- const reasoningText = check.reasoning ? `: ${check.reasoning}` : "";
24886
- const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
24873
+ const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
24887
24874
  if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
24888
24875
  failedRequired = true;
24889
- misses.push(scoreInfo);
24890
- } else if (rawScore >= 7) {
24891
- hits.push(scoreInfo);
24892
- } else {
24893
- misses.push(scoreInfo);
24894
24876
  }
24877
+ assertions.push({
24878
+ text: `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})`,
24879
+ passed,
24880
+ evidence: check.reasoning
24881
+ });
24895
24882
  }
24896
24883
  const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
24897
24884
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
24898
24885
  return {
24899
24886
  score,
24900
24887
  verdict,
24901
- hits,
24902
- misses,
24888
+ assertions,
24903
24889
  details: {
24904
24890
  raw_scores: rawScores,
24905
24891
  normalization: "score / 10",
@@ -25073,9 +25059,7 @@ var CompositeEvaluator = class {
25073
25059
  let totalWeight = 0;
25074
25060
  let weightedSum = 0;
25075
25061
  let evaluatedCount = 0;
25076
- const allHits = [];
25077
- const allMisses = [];
25078
- const reasoningParts = [];
25062
+ const allAssertions = [];
25079
25063
  const scores = [];
25080
25064
  for (const member of results) {
25081
25065
  const weight = weights?.[member.id] ?? 1;
@@ -25085,9 +25069,7 @@ var CompositeEvaluator = class {
25085
25069
  score: member.result.score,
25086
25070
  weight,
25087
25071
  verdict: member.result.verdict,
25088
- hits: [...member.result.hits],
25089
- misses: [...member.result.misses],
25090
- reasoning: member.result.reasoning,
25072
+ assertions: [...member.result.assertions],
25091
25073
  evaluatorRawRequest: member.result.evaluatorRawRequest,
25092
25074
  scores: member.result.scores,
25093
25075
  details: member.result.details,
@@ -25099,20 +25081,16 @@ var CompositeEvaluator = class {
25099
25081
  evaluatedCount++;
25100
25082
  totalWeight += weight;
25101
25083
  weightedSum += member.result.score * weight;
25102
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
25103
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
25104
- if (member.result.reasoning) {
25105
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
25106
- }
25084
+ allAssertions.push(
25085
+ ...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
25086
+ );
25107
25087
  }
25108
25088
  if (evaluatedCount === 0 && results.length > 0) {
25109
25089
  return {
25110
25090
  score: 0,
25111
25091
  verdict: "skip",
25112
- hits: [],
25113
- misses: [],
25092
+ assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
25114
25093
  expectedAspectCount: 1,
25115
- reasoning: "All evaluators skipped (infrastructure failure)",
25116
25094
  evaluatorRawRequest: {
25117
25095
  aggregator: "weighted_average",
25118
25096
  ...weights ? { weights } : {}
@@ -25124,10 +25102,8 @@ var CompositeEvaluator = class {
25124
25102
  return {
25125
25103
  score: clampScore(finalScore),
25126
25104
  verdict: scoreToVerdict(finalScore),
25127
- hits: allHits,
25128
- misses: allMisses,
25129
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
25130
- reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
25105
+ assertions: allAssertions,
25106
+ expectedAspectCount: allAssertions.length || 1,
25131
25107
  evaluatorRawRequest: {
25132
25108
  aggregator: "weighted_average",
25133
25109
  ...weights ? { weights } : {}
@@ -25137,11 +25113,8 @@ var CompositeEvaluator = class {
25137
25113
  }
25138
25114
  runThreshold(results, threshold) {
25139
25115
  const scores = [];
25140
- const allHits = [];
25141
- const allMisses = [];
25142
- const reasoningParts = [];
25116
+ const allAssertions = [];
25143
25117
  let passingCount = 0;
25144
- let borderlineCount = 0;
25145
25118
  let evaluatedCount = 0;
25146
25119
  for (const member of results) {
25147
25120
  scores.push({
@@ -25149,9 +25122,7 @@ var CompositeEvaluator = class {
25149
25122
  type: member.type,
25150
25123
  score: member.result.score,
25151
25124
  verdict: member.result.verdict,
25152
- hits: [...member.result.hits],
25153
- misses: [...member.result.misses],
25154
- reasoning: member.result.reasoning,
25125
+ assertions: [...member.result.assertions],
25155
25126
  evaluatorRawRequest: member.result.evaluatorRawRequest,
25156
25127
  scores: member.result.scores,
25157
25128
  details: member.result.details,
@@ -25164,24 +25135,17 @@ var CompositeEvaluator = class {
25164
25135
  const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
25165
25136
  if (isPassing) {
25166
25137
  passingCount++;
25167
- if (member.result.verdict === "borderline") {
25168
- borderlineCount++;
25169
- }
25170
- }
25171
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
25172
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
25173
- if (member.result.reasoning) {
25174
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
25175
25138
  }
25139
+ allAssertions.push(
25140
+ ...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
25141
+ );
25176
25142
  }
25177
25143
  if (evaluatedCount === 0 && results.length > 0) {
25178
25144
  return {
25179
25145
  score: 0,
25180
25146
  verdict: "skip",
25181
- hits: [],
25182
- misses: [],
25147
+ assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
25183
25148
  expectedAspectCount: 1,
25184
- reasoning: "All evaluators skipped (infrastructure failure)",
25185
25149
  evaluatorRawRequest: {
25186
25150
  aggregator: "threshold",
25187
25151
  threshold
@@ -25192,19 +25156,15 @@ var CompositeEvaluator = class {
25192
25156
  const totalCount = evaluatedCount;
25193
25157
  const score = totalCount > 0 ? passingCount / totalCount : 0;
25194
25158
  const pass = score >= threshold;
25195
- if (pass && borderlineCount > 0) {
25196
- reasoningParts.push(`Warning: ${borderlineCount} borderline evaluator(s) counted as passing`);
25197
- }
25198
- reasoningParts.unshift(
25199
- `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`
25200
- );
25159
+ allAssertions.unshift({
25160
+ text: `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`,
25161
+ passed: pass
25162
+ });
25201
25163
  return {
25202
25164
  score: clampScore(score),
25203
25165
  verdict: pass ? "pass" : "fail",
25204
- hits: allHits,
25205
- misses: allMisses,
25206
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
25207
- reasoning: reasoningParts.join("; "),
25166
+ assertions: allAssertions,
25167
+ expectedAspectCount: allAssertions.length || 1,
25208
25168
  evaluatorRawRequest: {
25209
25169
  aggregator: "threshold",
25210
25170
  threshold
@@ -25221,9 +25181,7 @@ var CompositeEvaluator = class {
25221
25181
  score: member.result.score,
25222
25182
  weight: weights?.[member.id] ?? 1,
25223
25183
  verdict: member.result.verdict,
25224
- hits: [...member.result.hits],
25225
- misses: [...member.result.misses],
25226
- reasoning: member.result.reasoning,
25184
+ assertions: [...member.result.assertions],
25227
25185
  evaluatorRawRequest: member.result.evaluatorRawRequest,
25228
25186
  scores: member.result.scores,
25229
25187
  details: member.result.details
@@ -25232,17 +25190,19 @@ var CompositeEvaluator = class {
25232
25190
  const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
25233
25191
  const parsed = parseJsonSafe(stdout);
25234
25192
  const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
25235
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
25236
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
25237
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
25193
+ const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
25194
+ (a) => typeof a === "object" && a !== null && typeof a.text === "string"
25195
+ ).map((a) => ({
25196
+ text: String(a.text),
25197
+ passed: Boolean(a.passed),
25198
+ ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
25199
+ })) : [];
25238
25200
  const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
25239
25201
  return {
25240
25202
  score,
25241
25203
  verdict,
25242
- hits,
25243
- misses,
25244
- expectedAspectCount: hits.length + misses.length || 1,
25245
- reasoning,
25204
+ assertions,
25205
+ expectedAspectCount: assertions.length || 1,
25246
25206
  evaluatorRawRequest: {
25247
25207
  aggregator: "code-grader",
25248
25208
  script: scriptPath
@@ -25254,10 +25214,8 @@ var CompositeEvaluator = class {
25254
25214
  return {
25255
25215
  score: 0,
25256
25216
  verdict: "fail",
25257
- hits: [],
25258
- misses: [`Code aggregator failed: ${message}`],
25217
+ assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
25259
25218
  expectedAspectCount: 1,
25260
- reasoning: message,
25261
25219
  evaluatorRawRequest: {
25262
25220
  aggregator: "code-grader",
25263
25221
  script: scriptPath,
@@ -25279,9 +25237,7 @@ var CompositeEvaluator = class {
25279
25237
  type: member.type,
25280
25238
  score: member.result.score,
25281
25239
  verdict: member.result.verdict,
25282
- hits: [...member.result.hits],
25283
- misses: [...member.result.misses],
25284
- reasoning: member.result.reasoning,
25240
+ assertions: [...member.result.assertions],
25285
25241
  evaluatorRawRequest: member.result.evaluatorRawRequest,
25286
25242
  scores: member.result.scores,
25287
25243
  details: member.result.details
@@ -25305,16 +25261,12 @@ var CompositeEvaluator = class {
25305
25261
  });
25306
25262
  const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text2));
25307
25263
  const score2 = clampScore(data2.score);
25308
- const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
25309
- const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
25310
- const reasoning2 = data2.reasoning;
25264
+ const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
25311
25265
  return {
25312
25266
  score: score2,
25313
25267
  verdict: scoreToVerdict(score2),
25314
- hits: hits2,
25315
- misses: misses2,
25316
- expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
25317
- reasoning: reasoning2,
25268
+ assertions: assertions2,
25269
+ expectedAspectCount: Math.max(assertions2.length, 1),
25318
25270
  evaluatorRawRequest,
25319
25271
  scores
25320
25272
  };
@@ -25329,16 +25281,12 @@ var CompositeEvaluator = class {
25329
25281
  parseJsonFromText(extractLastAssistantContent(response.output))
25330
25282
  );
25331
25283
  const score = clampScore(data.score);
25332
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
25333
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
25334
- const reasoning = data.reasoning;
25284
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
25335
25285
  return {
25336
25286
  score,
25337
25287
  verdict: scoreToVerdict(score),
25338
- hits,
25339
- misses,
25340
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
25341
- reasoning,
25288
+ assertions,
25289
+ expectedAspectCount: Math.max(assertions.length, 1),
25342
25290
  evaluatorRawRequest,
25343
25291
  scores
25344
25292
  };
@@ -25346,8 +25294,7 @@ var CompositeEvaluator = class {
25346
25294
  return {
25347
25295
  score: 0,
25348
25296
  verdict: "fail",
25349
- hits: [],
25350
- misses: [],
25297
+ assertions: [{ text: "LLM aggregator failed", passed: false }],
25351
25298
  expectedAspectCount: 1,
25352
25299
  evaluatorRawRequest,
25353
25300
  scores
@@ -25368,10 +25315,8 @@ var CostEvaluator = class {
25368
25315
  return {
25369
25316
  score: 0,
25370
25317
  verdict: "fail",
25371
- hits: [],
25372
- misses: ["No cost data available in trace"],
25318
+ assertions: [{ text: "No cost data available in trace", passed: false }],
25373
25319
  expectedAspectCount: 1,
25374
- reasoning: "Execution cost not reported by provider",
25375
25320
  evaluatorRawRequest: {
25376
25321
  type: "cost",
25377
25322
  budget,
@@ -25385,10 +25330,10 @@ var CostEvaluator = class {
25385
25330
  return {
25386
25331
  score,
25387
25332
  verdict: passed ? "pass" : "fail",
25388
- hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
25389
- misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
25333
+ assertions: [
25334
+ passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
25335
+ ],
25390
25336
  expectedAspectCount: 1,
25391
- reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
25392
25337
  evaluatorRawRequest: {
25393
25338
  type: "cost",
25394
25339
  budget,
@@ -25419,10 +25364,8 @@ var ExecutionMetricsEvaluator = class {
25419
25364
  return {
25420
25365
  score: 0,
25421
25366
  verdict: "fail",
25422
- hits: [],
25423
- misses: ["No trace summary available"],
25367
+ assertions: [{ text: "No trace summary available", passed: false }],
25424
25368
  expectedAspectCount: 1,
25425
- reasoning: "Execution metrics not available - no trace summary provided",
25426
25369
  evaluatorRawRequest: {
25427
25370
  type: "execution-metrics",
25428
25371
  config: this.extractConfiguredThresholds(),
@@ -25431,116 +25374,114 @@ var ExecutionMetricsEvaluator = class {
25431
25374
  };
25432
25375
  }
25433
25376
  const narrowedTrace = trace2;
25434
- const hits = [];
25435
- const misses = [];
25377
+ const assertions = [];
25436
25378
  const actualMetrics = {};
25437
25379
  if (max_tool_calls !== void 0 && narrowedTrace) {
25438
25380
  const toolCalls = narrowedTrace.eventCount;
25439
25381
  actualMetrics.tool_calls = toolCalls;
25440
25382
  if (toolCalls <= max_tool_calls) {
25441
- hits.push(`Tool calls ${toolCalls} <= ${max_tool_calls} max`);
25383
+ assertions.push({ text: `Tool calls ${toolCalls} <= ${max_tool_calls} max`, passed: true });
25442
25384
  } else {
25443
- misses.push(`Tool calls ${toolCalls} > ${max_tool_calls} max`);
25385
+ assertions.push({ text: `Tool calls ${toolCalls} > ${max_tool_calls} max`, passed: false });
25444
25386
  }
25445
25387
  }
25446
25388
  if (max_llm_calls !== void 0 && narrowedTrace) {
25447
25389
  const llmCalls = narrowedTrace.llmCallCount;
25448
25390
  if (llmCalls === void 0) {
25449
- misses.push("LLM call count data not available");
25391
+ assertions.push({ text: "LLM call count data not available", passed: false });
25450
25392
  } else {
25451
25393
  actualMetrics.llm_calls = llmCalls;
25452
25394
  if (llmCalls <= max_llm_calls) {
25453
- hits.push(`LLM calls ${llmCalls} <= ${max_llm_calls} max`);
25395
+ assertions.push({ text: `LLM calls ${llmCalls} <= ${max_llm_calls} max`, passed: true });
25454
25396
  } else {
25455
- misses.push(`LLM calls ${llmCalls} > ${max_llm_calls} max`);
25397
+ assertions.push({ text: `LLM calls ${llmCalls} > ${max_llm_calls} max`, passed: false });
25456
25398
  }
25457
25399
  }
25458
25400
  }
25459
25401
  if (max_tokens !== void 0) {
25460
25402
  if (!tokenUsage) {
25461
- misses.push("Token usage data not available");
25403
+ assertions.push({ text: "Token usage data not available", passed: false });
25462
25404
  } else {
25463
25405
  const totalTokens = tokenUsage.input + tokenUsage.output;
25464
25406
  actualMetrics.tokens = totalTokens;
25465
25407
  if (totalTokens <= max_tokens) {
25466
- hits.push(`Total tokens ${totalTokens} <= ${max_tokens} max`);
25408
+ assertions.push({
25409
+ text: `Total tokens ${totalTokens} <= ${max_tokens} max`,
25410
+ passed: true
25411
+ });
25467
25412
  } else {
25468
- misses.push(`Total tokens ${totalTokens} > ${max_tokens} max`);
25413
+ assertions.push({
25414
+ text: `Total tokens ${totalTokens} > ${max_tokens} max`,
25415
+ passed: false
25416
+ });
25469
25417
  }
25470
25418
  }
25471
25419
  }
25472
25420
  if (max_cost_usd !== void 0) {
25473
25421
  if (costUsd === void 0) {
25474
- misses.push("Cost data not available");
25422
+ assertions.push({ text: "Cost data not available", passed: false });
25475
25423
  } else {
25476
25424
  actualMetrics.cost_usd = costUsd;
25477
25425
  const formatCost = (n) => `$${n.toFixed(4)}`;
25478
25426
  if (costUsd <= max_cost_usd) {
25479
- hits.push(`Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`);
25427
+ assertions.push({
25428
+ text: `Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`,
25429
+ passed: true
25430
+ });
25480
25431
  } else {
25481
- misses.push(`Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`);
25432
+ assertions.push({
25433
+ text: `Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`,
25434
+ passed: false
25435
+ });
25482
25436
  }
25483
25437
  }
25484
25438
  }
25485
25439
  if (max_duration_ms !== void 0) {
25486
25440
  if (durationMs === void 0) {
25487
- misses.push("Duration data not available");
25441
+ assertions.push({ text: "Duration data not available", passed: false });
25488
25442
  } else {
25489
25443
  actualMetrics.duration_ms = durationMs;
25490
25444
  if (durationMs <= max_duration_ms) {
25491
- hits.push(`Duration ${durationMs}ms <= ${max_duration_ms}ms max`);
25445
+ assertions.push({
25446
+ text: `Duration ${durationMs}ms <= ${max_duration_ms}ms max`,
25447
+ passed: true
25448
+ });
25492
25449
  } else {
25493
- misses.push(`Duration ${durationMs}ms > ${max_duration_ms}ms max`);
25450
+ assertions.push({
25451
+ text: `Duration ${durationMs}ms > ${max_duration_ms}ms max`,
25452
+ passed: false
25453
+ });
25494
25454
  }
25495
25455
  }
25496
25456
  }
25497
25457
  if (target_exploration_ratio !== void 0 && narrowedTrace) {
25498
25458
  const ratio = explorationRatio(narrowedTrace);
25499
25459
  if (ratio === void 0) {
25500
- misses.push("Exploration ratio not available (no tool calls)");
25460
+ assertions.push({ text: "Exploration ratio not available (no tool calls)", passed: false });
25501
25461
  } else {
25502
25462
  actualMetrics.exploration_ratio = ratio;
25503
25463
  const diff = Math.abs(ratio - target_exploration_ratio);
25504
25464
  if (diff <= exploration_tolerance) {
25505
- hits.push(
25506
- `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`
25507
- );
25465
+ assertions.push({
25466
+ text: `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`,
25467
+ passed: true
25468
+ });
25508
25469
  } else {
25509
- misses.push(
25510
- `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`
25511
- );
25470
+ assertions.push({
25471
+ text: `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`,
25472
+ passed: false
25473
+ });
25512
25474
  }
25513
25475
  }
25514
25476
  }
25515
- const totalChecks = hits.length + misses.length;
25516
- const score = totalChecks > 0 ? hits.length / totalChecks : 0;
25517
- const reasoningParts = [];
25518
- if (actualMetrics.tool_calls !== void 0) {
25519
- reasoningParts.push(`tool_calls=${actualMetrics.tool_calls}`);
25520
- }
25521
- if (actualMetrics.llm_calls !== void 0) {
25522
- reasoningParts.push(`llm_calls=${actualMetrics.llm_calls}`);
25523
- }
25524
- if (actualMetrics.tokens !== void 0) {
25525
- reasoningParts.push(`tokens=${actualMetrics.tokens}`);
25526
- }
25527
- if (actualMetrics.cost_usd !== void 0) {
25528
- reasoningParts.push(`cost=$${actualMetrics.cost_usd.toFixed(4)}`);
25529
- }
25530
- if (actualMetrics.duration_ms !== void 0) {
25531
- reasoningParts.push(`duration=${actualMetrics.duration_ms}ms`);
25532
- }
25533
- if (actualMetrics.exploration_ratio !== void 0) {
25534
- reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
25535
- }
25536
- const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
25477
+ const totalChecks = assertions.length;
25478
+ const passedCount = assertions.filter((a) => a.passed).length;
25479
+ const score = totalChecks > 0 ? passedCount / totalChecks : 0;
25537
25480
  return {
25538
25481
  score,
25539
25482
  verdict: scoreToVerdict(score),
25540
- hits,
25541
- misses,
25483
+ assertions,
25542
25484
  expectedAspectCount: totalChecks || 1,
25543
- reasoning,
25544
25485
  evaluatorRawRequest: {
25545
25486
  type: "execution-metrics",
25546
25487
  config: this.extractConfiguredThresholds(),
@@ -25642,10 +25583,8 @@ var FieldAccuracyEvaluator = class {
25642
25583
  return {
25643
25584
  score: 0,
25644
25585
  verdict: "fail",
25645
- hits: [],
25646
- misses: ["Failed to parse candidate answer as JSON"],
25647
- expectedAspectCount: this.config.fields.length,
25648
- reasoning: "Candidate answer is not valid JSON"
25586
+ assertions: [{ text: "Failed to parse candidate answer as JSON", passed: false }],
25587
+ expectedAspectCount: this.config.fields.length
25649
25588
  };
25650
25589
  }
25651
25590
  const expectedData = this.extractExpectedData(evalCase.expected_output);
@@ -25653,10 +25592,8 @@ var FieldAccuracyEvaluator = class {
25653
25592
  return {
25654
25593
  score: 0,
25655
25594
  verdict: "fail",
25656
- hits: [],
25657
- misses: ["No expected data found in expected_output"],
25658
- expectedAspectCount: this.config.fields.length,
25659
- reasoning: "Could not extract expected data from expected_output"
25595
+ assertions: [{ text: "No expected data found in expected_output", passed: false }],
25596
+ expectedAspectCount: this.config.fields.length
25660
25597
  };
25661
25598
  }
25662
25599
  const fieldResults = [];
@@ -25874,18 +25811,14 @@ var FieldAccuracyEvaluator = class {
25874
25811
  */
25875
25812
  aggregateResults(results) {
25876
25813
  const aggregation = this.config.aggregation ?? "weighted_average";
25877
- const hits = [];
25878
- const misses = [];
25814
+ const assertions = [];
25879
25815
  for (const result of results) {
25880
- if (result.hit) {
25881
- hits.push(result.message);
25882
- } else {
25883
- misses.push(result.message);
25884
- }
25816
+ assertions.push({ text: result.message, passed: result.hit });
25885
25817
  }
25886
25818
  let score;
25887
25819
  if (aggregation === "all_or_nothing") {
25888
- score = misses.length === 0 ? 1 : 0;
25820
+ const hasFailed = assertions.some((a) => !a.passed);
25821
+ score = hasFailed ? 0 : 1;
25889
25822
  } else {
25890
25823
  const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
25891
25824
  if (totalWeight === 0) {
@@ -25895,15 +25828,11 @@ var FieldAccuracyEvaluator = class {
25895
25828
  score = weightedSum / totalWeight;
25896
25829
  }
25897
25830
  }
25898
- const reasoning = `${hits.length}/${results.length} fields matched`;
25899
25831
  return {
25900
25832
  score: clampScore(score),
25901
25833
  verdict: scoreToVerdict(score),
25902
- hits: hits.slice(0, 4),
25903
- // Cap at 4 to keep output concise
25904
- misses: misses.slice(0, 4),
25905
- expectedAspectCount: results.length,
25906
- reasoning
25834
+ assertions,
25835
+ expectedAspectCount: results.length
25907
25836
  };
25908
25837
  }
25909
25838
  };
@@ -26010,10 +25939,8 @@ var LatencyEvaluator = class {
26010
25939
  return {
26011
25940
  score: 0,
26012
25941
  verdict: "fail",
26013
- hits: [],
26014
- misses: ["No duration data available in trace"],
25942
+ assertions: [{ text: "No duration data available in trace", passed: false }],
26015
25943
  expectedAspectCount: 1,
26016
- reasoning: "Execution duration not reported by provider",
26017
25944
  evaluatorRawRequest: {
26018
25945
  type: "latency",
26019
25946
  threshold,
@@ -26026,10 +25953,10 @@ var LatencyEvaluator = class {
26026
25953
  return {
26027
25954
  score,
26028
25955
  verdict: passed ? "pass" : "fail",
26029
- hits: passed ? [`Duration ${durationMs}ms <= ${threshold}ms threshold`] : [],
26030
- misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
25956
+ assertions: [
25957
+ passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
25958
+ ],
26031
25959
  expectedAspectCount: 1,
26032
- reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
26033
25960
  evaluatorRawRequest: {
26034
25961
  type: "latency",
26035
25962
  threshold,
@@ -26048,7 +25975,10 @@ var COPILOT_MATCHER = {
26048
25975
  skillTools: ["Skill", "skill"],
26049
25976
  skillInputField: "skill",
26050
25977
  readTools: ["Read File", "readFile", "Read", "readTextFile"],
26051
- readInputField: "file_path"
25978
+ readInputField: "file_path",
25979
+ skillToolPrefixes: ["Using skill: "],
25980
+ readToolPrefixes: ["Viewing "],
25981
+ readInputFields: ["file_path", "path"]
26052
25982
  };
26053
25983
  var PROVIDER_TOOL_SEMANTICS = {
26054
25984
  claude: CLAUDE_MATCHER,
@@ -26090,12 +26020,22 @@ var SkillTriggerEvaluator = class {
26090
26020
  triggered = true;
26091
26021
  evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
26092
26022
  }
26023
+ } else if (matcher.skillToolPrefixes?.some(
26024
+ (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
26025
+ )) {
26026
+ triggered = true;
26027
+ evidence = `Skill tool invoked via tool name "${firstTool.tool}"`;
26093
26028
  } else if (matcher.readTools.includes(firstTool.tool)) {
26094
- const filePath = String(input[matcher.readInputField] ?? "");
26029
+ const filePath = this.readPathFromInput(input, matcher);
26095
26030
  if (filePath.includes(skillName)) {
26096
26031
  triggered = true;
26097
26032
  evidence = `Read tool loaded skill file: ${filePath}`;
26098
26033
  }
26034
+ } else if (matcher.readToolPrefixes?.some(
26035
+ (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
26036
+ )) {
26037
+ triggered = true;
26038
+ evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`;
26099
26039
  }
26100
26040
  }
26101
26041
  const pass = triggered === shouldTrigger;
@@ -26103,25 +26043,37 @@ var SkillTriggerEvaluator = class {
26103
26043
  return {
26104
26044
  score: 1,
26105
26045
  verdict: "pass",
26106
- hits: [
26107
- shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`
26046
+ assertions: [
26047
+ {
26048
+ text: shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`,
26049
+ passed: true
26050
+ }
26108
26051
  ],
26109
- misses: [],
26110
- expectedAspectCount: 1,
26111
- reasoning: shouldTrigger ? "Skill triggered correctly" : "No false trigger"
26052
+ expectedAspectCount: 1
26112
26053
  };
26113
26054
  }
26114
26055
  return {
26115
26056
  score: 0,
26116
26057
  verdict: "fail",
26117
- hits: [],
26118
- misses: [
26119
- shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`
26058
+ assertions: [
26059
+ {
26060
+ text: shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`,
26061
+ passed: false
26062
+ }
26120
26063
  ],
26121
- expectedAspectCount: 1,
26122
- reasoning: shouldTrigger ? `Skill "${skillName}" was not triggered` : "False trigger: skill fired when it should not have"
26064
+ expectedAspectCount: 1
26123
26065
  };
26124
26066
  }
26067
+ readPathFromInput(input, matcher) {
26068
+ const fields = matcher.readInputFields ?? [matcher.readInputField];
26069
+ for (const field of fields) {
26070
+ const value = input[field];
26071
+ if (value !== void 0 && value !== null) {
26072
+ return String(value);
26073
+ }
26074
+ }
26075
+ return "";
26076
+ }
26125
26077
  };
26126
26078
  function assembleLlmGraderPrompt(input) {
26127
26079
  const {
@@ -26154,12 +26106,8 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
26154
26106
  [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2),
26155
26107
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
26156
26108
  [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
26157
- [TEMPLATE_VARIABLES.ANSWER]: candidate.trim(),
26158
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (evalCase.reference_answer ?? "").trim(),
26159
26109
  [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
26160
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
26161
26110
  [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
26162
- // Text convenience accessors (new names, always strings)
26163
26111
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
26164
26112
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
26165
26113
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
@@ -26284,10 +26232,8 @@ var TokenUsageEvaluator = class {
26284
26232
  return {
26285
26233
  score: 0,
26286
26234
  verdict: "fail",
26287
- hits: [],
26288
- misses: ["No token usage data available in trace"],
26235
+ assertions: [{ text: "No token usage data available in trace", passed: false }],
26289
26236
  expectedAspectCount,
26290
- reasoning: "Token usage not reported by provider",
26291
26237
  evaluatorRawRequest: {
26292
26238
  type: "token-usage",
26293
26239
  max_total: maxTotal ?? null,
@@ -26301,37 +26247,34 @@ var TokenUsageEvaluator = class {
26301
26247
  const output = usage.output;
26302
26248
  const cached = usage.cached ?? 0;
26303
26249
  const total = input + output + cached;
26304
- const hits = [];
26305
- const misses = [];
26250
+ const assertions = [];
26306
26251
  if (typeof maxInput === "number") {
26307
26252
  if (input <= maxInput) {
26308
- hits.push(`Input tokens ${input} <= ${maxInput}`);
26253
+ assertions.push({ text: `Input tokens ${input} <= ${maxInput}`, passed: true });
26309
26254
  } else {
26310
- misses.push(`Input tokens ${input} > ${maxInput}`);
26255
+ assertions.push({ text: `Input tokens ${input} > ${maxInput}`, passed: false });
26311
26256
  }
26312
26257
  }
26313
26258
  if (typeof maxOutput === "number") {
26314
26259
  if (output <= maxOutput) {
26315
- hits.push(`Output tokens ${output} <= ${maxOutput}`);
26260
+ assertions.push({ text: `Output tokens ${output} <= ${maxOutput}`, passed: true });
26316
26261
  } else {
26317
- misses.push(`Output tokens ${output} > ${maxOutput}`);
26262
+ assertions.push({ text: `Output tokens ${output} > ${maxOutput}`, passed: false });
26318
26263
  }
26319
26264
  }
26320
26265
  if (typeof maxTotal === "number") {
26321
26266
  if (total <= maxTotal) {
26322
- hits.push(`Total tokens ${total} <= ${maxTotal}`);
26267
+ assertions.push({ text: `Total tokens ${total} <= ${maxTotal}`, passed: true });
26323
26268
  } else {
26324
- misses.push(`Total tokens ${total} > ${maxTotal}`);
26269
+ assertions.push({ text: `Total tokens ${total} > ${maxTotal}`, passed: false });
26325
26270
  }
26326
26271
  }
26327
- const passed = misses.length === 0;
26272
+ const passed = assertions.every((a) => a.passed);
26328
26273
  return {
26329
26274
  score: passed ? 1 : 0,
26330
26275
  verdict: passed ? "pass" : "fail",
26331
- hits,
26332
- misses,
26276
+ assertions,
26333
26277
  expectedAspectCount,
26334
- reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
26335
26278
  evaluatorRawRequest: {
26336
26279
  type: "token-usage",
26337
26280
  max_total: maxTotal ?? null,
@@ -26429,8 +26372,7 @@ var ToolTrajectoryEvaluator = class {
26429
26372
  return {
26430
26373
  score: 0,
26431
26374
  verdict: "fail",
26432
- hits: [],
26433
- misses: ["No trace available for evaluation"],
26375
+ assertions: [{ text: "No trace available for evaluation", passed: false }],
26434
26376
  expectedAspectCount: 1
26435
26377
  };
26436
26378
  }
@@ -26441,8 +26383,7 @@ var ToolTrajectoryEvaluator = class {
26441
26383
  return {
26442
26384
  score: 0,
26443
26385
  verdict: "fail",
26444
- hits: [],
26445
- misses: ["No trace available for evaluation"],
26386
+ assertions: [{ text: "No trace available for evaluation", passed: false }],
26446
26387
  expectedAspectCount: 1
26447
26388
  };
26448
26389
  }
@@ -26460,8 +26401,7 @@ var ToolTrajectoryEvaluator = class {
26460
26401
  return {
26461
26402
  score: 0,
26462
26403
  verdict: "fail",
26463
- hits: [],
26464
- misses: [`Unknown mode: ${this.config.mode}`],
26404
+ assertions: [{ text: `Unknown mode: ${this.config.mode}`, passed: false }],
26465
26405
  expectedAspectCount: 1
26466
26406
  };
26467
26407
  }
@@ -26510,28 +26450,32 @@ var ToolTrajectoryEvaluator = class {
26510
26450
  return {
26511
26451
  score: 1,
26512
26452
  verdict: "pass",
26513
- hits: ["No tool requirements specified"],
26514
- misses: [],
26453
+ assertions: [{ text: "No tool requirements specified", passed: true }],
26515
26454
  expectedAspectCount: 0
26516
26455
  };
26517
26456
  }
26518
- const hits = [];
26519
- const misses = [];
26457
+ const assertions = [];
26520
26458
  for (const toolName of toolNames) {
26521
26459
  const required = minimums[toolName];
26522
26460
  const actual = summary.toolCallsByName[toolName] ?? 0;
26523
26461
  if (actual >= required) {
26524
- hits.push(`${toolName}: called ${actual} times (required >=${required})`);
26462
+ assertions.push({
26463
+ text: `${toolName}: called ${actual} times (required >=${required})`,
26464
+ passed: true
26465
+ });
26525
26466
  } else {
26526
- misses.push(`${toolName}: called ${actual} times (required >=${required})`);
26467
+ assertions.push({
26468
+ text: `${toolName}: called ${actual} times (required >=${required})`,
26469
+ passed: false
26470
+ });
26527
26471
  }
26528
26472
  }
26529
- const score = hits.length / toolNames.length;
26473
+ const passedCount = assertions.filter((a) => a.passed).length;
26474
+ const score = passedCount / toolNames.length;
26530
26475
  return {
26531
26476
  score,
26532
26477
  verdict: scoreToVerdict(score),
26533
- hits,
26534
- misses,
26478
+ assertions,
26535
26479
  expectedAspectCount: toolNames.length
26536
26480
  };
26537
26481
  }
@@ -26541,13 +26485,11 @@ var ToolTrajectoryEvaluator = class {
26541
26485
  return {
26542
26486
  score: 1,
26543
26487
  verdict: "pass",
26544
- hits: ["No tool sequence specified"],
26545
- misses: [],
26488
+ assertions: [{ text: "No tool sequence specified", passed: true }],
26546
26489
  expectedAspectCount: 0
26547
26490
  };
26548
26491
  }
26549
- const hits = [];
26550
- const misses = [];
26492
+ const assertions = [];
26551
26493
  const warnings = [];
26552
26494
  let actualIndex = 0;
26553
26495
  let sequenceHits = 0;
@@ -26567,16 +26509,20 @@ var ToolTrajectoryEvaluator = class {
26567
26509
  const actualCall = toolCalls[actualIndex];
26568
26510
  if (actualCall.name === expectedTool) {
26569
26511
  if (argsMatch(expectedItem.args, actualCall.args, mode)) {
26570
- hits.push(`Found ${expectedTool} at position ${actualIndex}`);
26512
+ assertions.push({
26513
+ text: `Found ${expectedTool} at position ${actualIndex}`,
26514
+ passed: true
26515
+ });
26571
26516
  sequenceHits++;
26572
26517
  matchedCall = actualCall;
26573
26518
  actualIndex++;
26574
26519
  found = true;
26575
26520
  break;
26576
26521
  }
26577
- misses.push(
26578
- `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
26579
- );
26522
+ assertions.push({
26523
+ text: `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`,
26524
+ passed: false
26525
+ });
26580
26526
  actualIndex++;
26581
26527
  argsMismatch = true;
26582
26528
  break;
@@ -26584,7 +26530,10 @@ var ToolTrajectoryEvaluator = class {
26584
26530
  actualIndex++;
26585
26531
  }
26586
26532
  if (!found && !argsMismatch) {
26587
- misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
26533
+ assertions.push({
26534
+ text: `Expected ${expectedTool} at position ${i}, not found in remaining trace`,
26535
+ passed: false
26536
+ });
26588
26537
  }
26589
26538
  if (found && matchedCall) {
26590
26539
  const latencyResult = checkLatency(
@@ -26593,10 +26542,10 @@ var ToolTrajectoryEvaluator = class {
26593
26542
  matchedCall.durationMs
26594
26543
  );
26595
26544
  if (latencyResult.status === "pass") {
26596
- hits.push(latencyResult.message);
26545
+ assertions.push({ text: latencyResult.message, passed: true });
26597
26546
  latencyHits++;
26598
26547
  } else if (latencyResult.status === "fail") {
26599
- misses.push(latencyResult.message);
26548
+ assertions.push({ text: latencyResult.message, passed: false });
26600
26549
  } else if (latencyResult.message) {
26601
26550
  warnings.push(latencyResult.message);
26602
26551
  latencySkips++;
@@ -26612,8 +26561,7 @@ var ToolTrajectoryEvaluator = class {
26612
26561
  return {
26613
26562
  score,
26614
26563
  verdict: scoreToVerdict(score),
26615
- hits,
26616
- misses,
26564
+ assertions,
26617
26565
  expectedAspectCount: totalAssertions
26618
26566
  };
26619
26567
  }
@@ -26623,13 +26571,11 @@ var ToolTrajectoryEvaluator = class {
26623
26571
  return {
26624
26572
  score: 1,
26625
26573
  verdict: "pass",
26626
- hits: ["No tool sequence specified"],
26627
- misses: [],
26574
+ assertions: [{ text: "No tool sequence specified", passed: true }],
26628
26575
  expectedAspectCount: 0
26629
26576
  };
26630
26577
  }
26631
- const hits = [];
26632
- const misses = [];
26578
+ const assertions = [];
26633
26579
  const warnings = [];
26634
26580
  let sequenceHits = 0;
26635
26581
  let latencyHits = 0;
@@ -26638,7 +26584,10 @@ var ToolTrajectoryEvaluator = class {
26638
26584
  (item) => item.maxDurationMs !== void 0
26639
26585
  ).length;
26640
26586
  if (toolCalls.length !== expected.length) {
26641
- misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
26587
+ assertions.push({
26588
+ text: `Expected ${expected.length} tool calls, got ${toolCalls.length}`,
26589
+ passed: false
26590
+ });
26642
26591
  }
26643
26592
  const checkLength = Math.min(expected.length, toolCalls.length);
26644
26593
  for (let i = 0; i < checkLength; i++) {
@@ -26650,14 +26599,17 @@ var ToolTrajectoryEvaluator = class {
26650
26599
  let sequenceMatched = false;
26651
26600
  if (actualTool === expectedTool) {
26652
26601
  if (argsMatch(expectedItem.args, actualCall.args, mode)) {
26653
- hits.push(`Position ${i}: ${expectedTool}`);
26602
+ assertions.push({ text: `Position ${i}: ${expectedTool}`, passed: true });
26654
26603
  sequenceHits++;
26655
26604
  sequenceMatched = true;
26656
26605
  } else {
26657
- misses.push(`Position ${i}: ${expectedTool} args mismatch`);
26606
+ assertions.push({ text: `Position ${i}: ${expectedTool} args mismatch`, passed: false });
26658
26607
  }
26659
26608
  } else {
26660
- misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
26609
+ assertions.push({
26610
+ text: `Position ${i}: expected ${expectedTool}, got ${actualTool}`,
26611
+ passed: false
26612
+ });
26661
26613
  }
26662
26614
  if (sequenceMatched) {
26663
26615
  const latencyResult = checkLatency(
@@ -26666,10 +26618,10 @@ var ToolTrajectoryEvaluator = class {
26666
26618
  actualCall.durationMs
26667
26619
  );
26668
26620
  if (latencyResult.status === "pass") {
26669
- hits.push(latencyResult.message);
26621
+ assertions.push({ text: latencyResult.message, passed: true });
26670
26622
  latencyHits++;
26671
26623
  } else if (latencyResult.status === "fail") {
26672
- misses.push(latencyResult.message);
26624
+ assertions.push({ text: latencyResult.message, passed: false });
26673
26625
  } else if (latencyResult.message) {
26674
26626
  warnings.push(latencyResult.message);
26675
26627
  latencySkips++;
@@ -26677,7 +26629,10 @@ var ToolTrajectoryEvaluator = class {
26677
26629
  }
26678
26630
  }
26679
26631
  for (let i = checkLength; i < expected.length; i++) {
26680
- misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
26632
+ assertions.push({
26633
+ text: `Position ${i}: expected ${expected[i].tool}, got nothing`,
26634
+ passed: false
26635
+ });
26681
26636
  }
26682
26637
  for (const warning of warnings) {
26683
26638
  console.warn(`[tool-trajectory] ${warning}`);
@@ -26688,8 +26643,7 @@ var ToolTrajectoryEvaluator = class {
26688
26643
  return {
26689
26644
  score,
26690
26645
  verdict: scoreToVerdict(score),
26691
- hits,
26692
- misses,
26646
+ assertions,
26693
26647
  expectedAspectCount: totalAssertions
26694
26648
  };
26695
26649
  }
@@ -26704,13 +26658,11 @@ var ToolTrajectoryEvaluator = class {
26704
26658
  return {
26705
26659
  score: 1,
26706
26660
  verdict: "pass",
26707
- hits: ["No expected tools specified"],
26708
- misses: [],
26661
+ assertions: [{ text: "No expected tools specified", passed: true }],
26709
26662
  expectedAspectCount: 0
26710
26663
  };
26711
26664
  }
26712
- const hits = [];
26713
- const misses = [];
26665
+ const assertions = [];
26714
26666
  const consumed = /* @__PURE__ */ new Set();
26715
26667
  for (let i = 0; i < expected.length; i++) {
26716
26668
  const expectedItem = expected[i];
@@ -26721,22 +26673,25 @@ var ToolTrajectoryEvaluator = class {
26721
26673
  if (consumed.has(j)) continue;
26722
26674
  const actualCall = toolCalls[j];
26723
26675
  if (actualCall.name === expectedTool && argsMatch(expectedItem.args, actualCall.args, mode)) {
26724
- hits.push(`Found ${expectedTool} at position ${j}`);
26676
+ assertions.push({ text: `Found ${expectedTool} at position ${j}`, passed: true });
26725
26677
  consumed.add(j);
26726
26678
  found = true;
26727
26679
  break;
26728
26680
  }
26729
26681
  }
26730
26682
  if (!found) {
26731
- misses.push(`Expected ${expectedTool} not found in actual trajectory`);
26683
+ assertions.push({
26684
+ text: `Expected ${expectedTool} not found in actual trajectory`,
26685
+ passed: false
26686
+ });
26732
26687
  }
26733
26688
  }
26734
- const score = expected.length > 0 ? hits.length / expected.length : 1;
26689
+ const passedCount = assertions.filter((a) => a.passed).length;
26690
+ const score = expected.length > 0 ? passedCount / expected.length : 1;
26735
26691
  return {
26736
26692
  score,
26737
26693
  verdict: scoreToVerdict(score),
26738
- hits,
26739
- misses,
26694
+ assertions,
26740
26695
  expectedAspectCount: expected.length
26741
26696
  };
26742
26697
  }
@@ -26752,16 +26707,19 @@ var ToolTrajectoryEvaluator = class {
26752
26707
  return {
26753
26708
  score: 1,
26754
26709
  verdict: "pass",
26755
- hits: ["No tool calls and no expected tools"],
26756
- misses: [],
26710
+ assertions: [{ text: "No tool calls and no expected tools", passed: true }],
26757
26711
  expectedAspectCount: 0
26758
26712
  };
26759
26713
  }
26760
26714
  return {
26761
26715
  score: 0,
26762
26716
  verdict: "fail",
26763
- hits: [],
26764
- misses: [`${toolCalls.length} unexpected tool call(s) with empty allowed list`],
26717
+ assertions: [
26718
+ {
26719
+ text: `${toolCalls.length} unexpected tool call(s) with empty allowed list`,
26720
+ passed: false
26721
+ }
26722
+ ],
26765
26723
  expectedAspectCount: toolCalls.length
26766
26724
  };
26767
26725
  }
@@ -26769,13 +26727,11 @@ var ToolTrajectoryEvaluator = class {
26769
26727
  return {
26770
26728
  score: 1,
26771
26729
  verdict: "pass",
26772
- hits: ["No actual tool calls (trivially a subset)"],
26773
- misses: [],
26730
+ assertions: [{ text: "No actual tool calls (trivially a subset)", passed: true }],
26774
26731
  expectedAspectCount: 0
26775
26732
  };
26776
26733
  }
26777
- const hits = [];
26778
- const misses = [];
26734
+ const assertions = [];
26779
26735
  for (let i = 0; i < toolCalls.length; i++) {
26780
26736
  const actualCall = toolCalls[i];
26781
26737
  let allowed = false;
@@ -26787,17 +26743,23 @@ var ToolTrajectoryEvaluator = class {
26787
26743
  }
26788
26744
  }
26789
26745
  if (allowed) {
26790
- hits.push(`Position ${i}: ${actualCall.name} is in allowed set`);
26746
+ assertions.push({
26747
+ text: `Position ${i}: ${actualCall.name} is in allowed set`,
26748
+ passed: true
26749
+ });
26791
26750
  } else {
26792
- misses.push(`Position ${i}: ${actualCall.name} is not in allowed set`);
26751
+ assertions.push({
26752
+ text: `Position ${i}: ${actualCall.name} is not in allowed set`,
26753
+ passed: false
26754
+ });
26793
26755
  }
26794
26756
  }
26795
- const score = toolCalls.length > 0 ? hits.length / toolCalls.length : 1;
26757
+ const passedCount = assertions.filter((a) => a.passed).length;
26758
+ const score = toolCalls.length > 0 ? passedCount / toolCalls.length : 1;
26796
26759
  return {
26797
26760
  score,
26798
26761
  verdict: scoreToVerdict(score),
26799
- hits,
26800
- misses,
26762
+ assertions,
26801
26763
  expectedAspectCount: toolCalls.length
26802
26764
  };
26803
26765
  }
@@ -26806,8 +26768,12 @@ function runContainsAssertion(output, value) {
26806
26768
  const passed = output.includes(value);
26807
26769
  return {
26808
26770
  score: passed ? 1 : 0,
26809
- hits: passed ? [`Output contains "${value}"`] : [],
26810
- misses: passed ? [] : [`Output does not contain "${value}"`]
26771
+ assertions: [
26772
+ {
26773
+ text: passed ? `Output contains "${value}"` : `Output does not contain "${value}"`,
26774
+ passed
26775
+ }
26776
+ ]
26811
26777
  };
26812
26778
  }
26813
26779
  function runContainsAnyAssertion(output, values) {
@@ -26815,8 +26781,12 @@ function runContainsAnyAssertion(output, values) {
26815
26781
  const passed = matched.length > 0;
26816
26782
  return {
26817
26783
  score: passed ? 1 : 0,
26818
- hits: passed ? [`Output contains "${matched[0]}"`] : [],
26819
- misses: passed ? [] : [`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`]
26784
+ assertions: [
26785
+ {
26786
+ text: passed ? `Output contains "${matched[0]}"` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`,
26787
+ passed
26788
+ }
26789
+ ]
26820
26790
  };
26821
26791
  }
26822
26792
  function runContainsAllAssertion(output, values) {
@@ -26824,16 +26794,24 @@ function runContainsAllAssertion(output, values) {
26824
26794
  const passed = missing.length === 0;
26825
26795
  return {
26826
26796
  score: passed ? 1 : 0,
26827
- hits: passed ? [`Output contains all ${values.length} expected strings`] : [],
26828
- misses: passed ? [] : [`Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`]
26797
+ assertions: [
26798
+ {
26799
+ text: passed ? `Output contains all ${values.length} expected strings` : `Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`,
26800
+ passed
26801
+ }
26802
+ ]
26829
26803
  };
26830
26804
  }
26831
26805
  function runIcontainsAssertion(output, value) {
26832
26806
  const passed = output.toLowerCase().includes(value.toLowerCase());
26833
26807
  return {
26834
26808
  score: passed ? 1 : 0,
26835
- hits: passed ? [`Output contains "${value}" (case-insensitive)`] : [],
26836
- misses: passed ? [] : [`Output does not contain "${value}" (case-insensitive)`]
26809
+ assertions: [
26810
+ {
26811
+ text: passed ? `Output contains "${value}" (case-insensitive)` : `Output does not contain "${value}" (case-insensitive)`,
26812
+ passed
26813
+ }
26814
+ ]
26837
26815
  };
26838
26816
  }
26839
26817
  function runIcontainsAnyAssertion(output, values) {
@@ -26842,9 +26820,11 @@ function runIcontainsAnyAssertion(output, values) {
26842
26820
  const passed = matched.length > 0;
26843
26821
  return {
26844
26822
  score: passed ? 1 : 0,
26845
- hits: passed ? [`Output contains "${matched[0]}" (case-insensitive)`] : [],
26846
- misses: passed ? [] : [
26847
- `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`
26823
+ assertions: [
26824
+ {
26825
+ text: passed ? `Output contains "${matched[0]}" (case-insensitive)` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`,
26826
+ passed
26827
+ }
26848
26828
  ]
26849
26829
  };
26850
26830
  }
@@ -26854,24 +26834,36 @@ function runIcontainsAllAssertion(output, values) {
26854
26834
  const passed = missing.length === 0;
26855
26835
  return {
26856
26836
  score: passed ? 1 : 0,
26857
- hits: passed ? [`Output contains all ${values.length} expected strings (case-insensitive)`] : [],
26858
- misses: passed ? [] : [`Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`]
26837
+ assertions: [
26838
+ {
26839
+ text: passed ? `Output contains all ${values.length} expected strings (case-insensitive)` : `Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`,
26840
+ passed
26841
+ }
26842
+ ]
26859
26843
  };
26860
26844
  }
26861
26845
  function runStartsWithAssertion(output, value) {
26862
26846
  const passed = output.trim().startsWith(value.trim());
26863
26847
  return {
26864
26848
  score: passed ? 1 : 0,
26865
- hits: passed ? [`Output starts with "${value}"`] : [],
26866
- misses: passed ? [] : [`Output does not start with "${value}"`]
26849
+ assertions: [
26850
+ {
26851
+ text: passed ? `Output starts with "${value}"` : `Output does not start with "${value}"`,
26852
+ passed
26853
+ }
26854
+ ]
26867
26855
  };
26868
26856
  }
26869
26857
  function runEndsWithAssertion(output, value) {
26870
26858
  const passed = output.trim().endsWith(value.trim());
26871
26859
  return {
26872
26860
  score: passed ? 1 : 0,
26873
- hits: passed ? [`Output ends with "${value}"`] : [],
26874
- misses: passed ? [] : [`Output does not end with "${value}"`]
26861
+ assertions: [
26862
+ {
26863
+ text: passed ? `Output ends with "${value}"` : `Output does not end with "${value}"`,
26864
+ passed
26865
+ }
26866
+ ]
26875
26867
  };
26876
26868
  }
26877
26869
  function runRegexAssertion(output, pattern, flags) {
@@ -26880,8 +26872,12 @@ function runRegexAssertion(output, pattern, flags) {
26880
26872
  const flagsLabel = flags ? ` (flags: ${flags})` : "";
26881
26873
  return {
26882
26874
  score: passed ? 1 : 0,
26883
- hits: passed ? [`Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}`] : [],
26884
- misses: passed ? [] : [`Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`]
26875
+ assertions: [
26876
+ {
26877
+ text: passed ? `Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}` : `Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`,
26878
+ passed
26879
+ }
26880
+ ]
26885
26881
  };
26886
26882
  }
26887
26883
  function runIsJsonAssertion(output) {
@@ -26893,16 +26889,24 @@ function runIsJsonAssertion(output) {
26893
26889
  }
26894
26890
  return {
26895
26891
  score: passed ? 1 : 0,
26896
- hits: passed ? ["Output is valid JSON"] : [],
26897
- misses: passed ? [] : ["Output is not valid JSON"]
26892
+ assertions: [
26893
+ {
26894
+ text: passed ? "Output is valid JSON" : "Output is not valid JSON",
26895
+ passed
26896
+ }
26897
+ ]
26898
26898
  };
26899
26899
  }
26900
26900
  function runEqualsAssertion(output, value) {
26901
26901
  const passed = output.trim() === value.trim();
26902
26902
  return {
26903
26903
  score: passed ? 1 : 0,
26904
- hits: passed ? [`Output equals "${value}"`] : [],
26905
- misses: passed ? [] : [`Output does not equal "${value}"`]
26904
+ assertions: [
26905
+ {
26906
+ text: passed ? `Output equals "${value}"` : `Output does not equal "${value}"`,
26907
+ passed
26908
+ }
26909
+ ]
26906
26910
  };
26907
26911
  }
26908
26912
  var Node = class {
@@ -27101,10 +27105,8 @@ var InlineAssertEvaluator = class {
27101
27105
  return {
27102
27106
  score,
27103
27107
  verdict: scoreToVerdict(score),
27104
- hits: score >= 0.8 ? [result.name] : [],
27105
- misses: score < 0.5 ? [result.name] : [],
27108
+ assertions: [{ text: result.name, passed: score >= 0.5 }],
27106
27109
  expectedAspectCount: 1,
27107
- reasoning: void 0,
27108
27110
  details: result.metadata ? result.metadata : void 0
27109
27111
  };
27110
27112
  }
@@ -27139,11 +27141,9 @@ async function resolveCustomPrompt(promptConfig, context2, timeoutMs) {
27139
27141
  }
27140
27142
  async function executePromptTemplate(script, context2, config, timeoutMs) {
27141
27143
  const payload = {
27142
- question: context2.evalCase.question,
27143
27144
  criteria: context2.evalCase.criteria,
27144
27145
  expectedOutput: context2.evalCase.expected_output,
27145
- referenceAnswer: context2.evalCase.reference_answer,
27146
- answer: context2.candidate,
27146
+ outputText: context2.candidate,
27147
27147
  output: context2.output ?? null,
27148
27148
  guidelineFiles: context2.evalCase.guideline_paths,
27149
27149
  inputFiles: context2.evalCase.file_paths.filter(
@@ -27154,9 +27154,7 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
27154
27154
  fileChanges: context2.fileChanges ?? null,
27155
27155
  workspacePath: context2.workspacePath ?? null,
27156
27156
  config: config ?? context2.config ?? null,
27157
- // Text convenience accessors (new names, always strings)
27158
27157
  inputText: context2.evalCase.question,
27159
- outputText: context2.candidate,
27160
27158
  expectedOutputText: context2.evalCase.reference_answer ?? ""
27161
27159
  };
27162
27160
  const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
@@ -27292,9 +27290,7 @@ var containsFactory = (config) => {
27292
27290
  return {
27293
27291
  score: result.score,
27294
27292
  verdict: result.score === 1 ? "pass" : "fail",
27295
- hits: result.hits,
27296
- misses: result.misses,
27297
- reasoning: result.score === 1 ? `Output contains "${c.value}"` : `Output does not contain "${c.value}"`,
27293
+ assertions: result.assertions,
27298
27294
  expectedAspectCount: 1
27299
27295
  };
27300
27296
  });
@@ -27306,9 +27302,7 @@ var regexFactory = (config) => {
27306
27302
  return {
27307
27303
  score: result.score,
27308
27304
  verdict: result.score === 1 ? "pass" : "fail",
27309
- hits: result.hits,
27310
- misses: result.misses,
27311
- reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
27305
+ assertions: result.assertions,
27312
27306
  expectedAspectCount: 1
27313
27307
  };
27314
27308
  });
@@ -27319,9 +27313,7 @@ var isJsonFactory = () => {
27319
27313
  return {
27320
27314
  score: result.score,
27321
27315
  verdict: result.score === 1 ? "pass" : "fail",
27322
- hits: result.hits,
27323
- misses: result.misses,
27324
- reasoning: result.score === 1 ? "Output is valid JSON" : "Output is not valid JSON",
27316
+ assertions: result.assertions,
27325
27317
  expectedAspectCount: 1
27326
27318
  };
27327
27319
  });
@@ -27333,9 +27325,7 @@ var equalsFactory = (config) => {
27333
27325
  return {
27334
27326
  score: result.score,
27335
27327
  verdict: result.score === 1 ? "pass" : "fail",
27336
- hits: result.hits,
27337
- misses: result.misses,
27338
- reasoning: result.score === 1 ? `Output equals "${c.value}"` : `Output does not equal "${c.value}"`,
27328
+ assertions: result.assertions,
27339
27329
  expectedAspectCount: 1
27340
27330
  };
27341
27331
  });
@@ -27347,9 +27337,7 @@ var containsAnyFactory = (config) => {
27347
27337
  return {
27348
27338
  score: result.score,
27349
27339
  verdict: result.score === 1 ? "pass" : "fail",
27350
- hits: result.hits,
27351
- misses: result.misses,
27352
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27340
+ assertions: result.assertions,
27353
27341
  expectedAspectCount: 1
27354
27342
  };
27355
27343
  });
@@ -27361,9 +27349,7 @@ var containsAllFactory = (config) => {
27361
27349
  return {
27362
27350
  score: result.score,
27363
27351
  verdict: result.score === 1 ? "pass" : "fail",
27364
- hits: result.hits,
27365
- misses: result.misses,
27366
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27352
+ assertions: result.assertions,
27367
27353
  expectedAspectCount: 1
27368
27354
  };
27369
27355
  });
@@ -27375,9 +27361,7 @@ var icontainsFactory = (config) => {
27375
27361
  return {
27376
27362
  score: result.score,
27377
27363
  verdict: result.score === 1 ? "pass" : "fail",
27378
- hits: result.hits,
27379
- misses: result.misses,
27380
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27364
+ assertions: result.assertions,
27381
27365
  expectedAspectCount: 1
27382
27366
  };
27383
27367
  });
@@ -27389,9 +27373,7 @@ var icontainsAnyFactory = (config) => {
27389
27373
  return {
27390
27374
  score: result.score,
27391
27375
  verdict: result.score === 1 ? "pass" : "fail",
27392
- hits: result.hits,
27393
- misses: result.misses,
27394
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27376
+ assertions: result.assertions,
27395
27377
  expectedAspectCount: 1
27396
27378
  };
27397
27379
  });
@@ -27403,9 +27385,7 @@ var icontainsAllFactory = (config) => {
27403
27385
  return {
27404
27386
  score: result.score,
27405
27387
  verdict: result.score === 1 ? "pass" : "fail",
27406
- hits: result.hits,
27407
- misses: result.misses,
27408
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27388
+ assertions: result.assertions,
27409
27389
  expectedAspectCount: 1
27410
27390
  };
27411
27391
  });
@@ -27417,9 +27397,7 @@ var startsWithFactory = (config) => {
27417
27397
  return {
27418
27398
  score: result.score,
27419
27399
  verdict: result.score === 1 ? "pass" : "fail",
27420
- hits: result.hits,
27421
- misses: result.misses,
27422
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27400
+ assertions: result.assertions,
27423
27401
  expectedAspectCount: 1
27424
27402
  };
27425
27403
  });
@@ -27431,9 +27409,7 @@ var endsWithFactory = (config) => {
27431
27409
  return {
27432
27410
  score: result.score,
27433
27411
  verdict: result.score === 1 ? "pass" : "fail",
27434
- hits: result.hits,
27435
- misses: result.misses,
27436
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27412
+ assertions: result.assertions,
27437
27413
  expectedAspectCount: 1
27438
27414
  };
27439
27415
  });
@@ -28462,7 +28438,7 @@ async function runEvaluation(options) {
28462
28438
  if (!cliModel) {
28463
28439
  throw new Error('--grader-target "agentv" requires --model (e.g., "openai:gpt-5-mini")');
28464
28440
  }
28465
- const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-HDSAUUEF-LUBMM7TH.js");
28441
+ const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-NFFLXG5M-TJAWCWCX.js");
28466
28442
  return new AgentvProvider2("agentv", { model: cliModel, temperature: 0 });
28467
28443
  }
28468
28444
  const overrideTarget = resolveTargetByName(cliGraderTarget);
@@ -28797,9 +28773,8 @@ async function runEvaluation(options) {
28797
28773
  testId: evalCase.id,
28798
28774
  dataset: evalCase.dataset,
28799
28775
  score: 0,
28800
- hits: [],
28801
- misses: [],
28802
- answer: "",
28776
+ assertions: [],
28777
+ outputText: "",
28803
28778
  target: target.name,
28804
28779
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
28805
28780
  budgetExceeded: true,
@@ -28834,9 +28809,8 @@ async function runEvaluation(options) {
28834
28809
  testId: evalCase.id,
28835
28810
  dataset: evalCase.dataset,
28836
28811
  score: 0,
28837
- hits: [],
28838
- misses: [],
28839
- answer: "",
28812
+ assertions: [],
28813
+ outputText: "",
28840
28814
  target: target.name,
28841
28815
  error: errorMsg,
28842
28816
  executionStatus: "execution_error",
@@ -29802,11 +29776,9 @@ async function evaluateCandidate(options) {
29802
29776
  dataset: evalCase.dataset,
29803
29777
  conversationId: evalCase.conversation_id,
29804
29778
  score: score.score,
29805
- hits: score.hits,
29806
- misses: score.misses,
29807
- answer: candidate,
29779
+ assertions: score.assertions,
29780
+ outputText: candidate,
29808
29781
  target: target.name,
29809
- reasoning: score.reasoning,
29810
29782
  tokenUsage,
29811
29783
  costUsd,
29812
29784
  durationMs,
@@ -29980,9 +29952,7 @@ async function runEvaluatorList(options) {
29980
29952
  score: score2.score,
29981
29953
  weight,
29982
29954
  verdict: score2.verdict,
29983
- hits: score2.hits,
29984
- misses: score2.misses,
29985
- reasoning: score2.reasoning,
29955
+ assertions: score2.assertions,
29986
29956
  evaluatorProviderRequest: score2.evaluatorRawRequest,
29987
29957
  details: score2.details,
29988
29958
  scores: mapChildResults(score2.scores),
@@ -29997,10 +29967,10 @@ async function runEvaluatorList(options) {
29997
29967
  const fallbackScore = {
29998
29968
  score: 0,
29999
29969
  verdict: "fail",
30000
- hits: [],
30001
- misses: [`Evaluator '${evaluatorConfig.name}' failed: ${message}`],
30002
- expectedAspectCount: 1,
30003
- reasoning: message
29970
+ assertions: [
29971
+ { text: `Evaluator '${evaluatorConfig.name}' failed: ${message}`, passed: false }
29972
+ ],
29973
+ expectedAspectCount: 1
30004
29974
  };
30005
29975
  const weight = evaluatorConfig.weight ?? 1;
30006
29976
  scored.push({
@@ -30016,9 +29986,12 @@ async function runEvaluatorList(options) {
30016
29986
  score: 0,
30017
29987
  weight,
30018
29988
  verdict: "fail",
30019
- hits: [],
30020
- misses: [`Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`],
30021
- reasoning: message,
29989
+ assertions: [
29990
+ {
29991
+ text: `Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
29992
+ passed: false
29993
+ }
29994
+ ],
30022
29995
  durationMs: endedAt.getTime() - startedAt.getTime(),
30023
29996
  startedAt: startedAt.toISOString(),
30024
29997
  endedAt: endedAt.toISOString()
@@ -30034,9 +30007,7 @@ async function runEvaluatorList(options) {
30034
30007
  ...scores[lastScoresIdx],
30035
30008
  score: negated.score,
30036
30009
  verdict: negated.verdict,
30037
- hits: [...negated.hits],
30038
- misses: [...negated.misses],
30039
- reasoning: negated.reasoning
30010
+ assertions: [...negated.assertions]
30040
30011
  };
30041
30012
  }
30042
30013
  }
@@ -30051,21 +30022,13 @@ async function runEvaluatorList(options) {
30051
30022
  const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
30052
30023
  scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
30053
30024
  ) : 0;
30054
- const hits = scored.flatMap((entry) => entry.score.hits);
30055
- const misses = scored.flatMap((entry) => entry.score.misses);
30056
- const expectedAspectCount = scored.reduce(
30057
- (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
30058
- 0
30059
- );
30060
- const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
30061
- const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
30025
+ const assertions = scored.flatMap((entry) => entry.score.assertions);
30026
+ const expectedAspectCount = assertions.length || 1;
30062
30027
  const score = {
30063
30028
  score: aggregateScore,
30064
30029
  verdict: scoreToVerdict(aggregateScore),
30065
- hits,
30066
- misses,
30067
- expectedAspectCount,
30068
- reasoning
30030
+ assertions,
30031
+ expectedAspectCount
30069
30032
  };
30070
30033
  return { score, scores };
30071
30034
  }
@@ -30169,9 +30132,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
30169
30132
  dataset: evalCase.dataset,
30170
30133
  conversationId: evalCase.conversation_id,
30171
30134
  score: 0,
30172
- hits: [],
30173
- misses: [`Error: ${message}`],
30174
- answer: `Error occurred: ${message}`,
30135
+ assertions: [{ text: `Error: ${message}`, passed: false }],
30136
+ outputText: `Error occurred: ${message}`,
30175
30137
  target: targetName,
30176
30138
  requests,
30177
30139
  input,
@@ -30280,9 +30242,7 @@ function mapChildResults(children) {
30280
30242
  score: child.score,
30281
30243
  weight: child.weight,
30282
30244
  verdict: child.verdict,
30283
- hits: child.hits,
30284
- misses: child.misses,
30285
- reasoning: child.reasoning,
30245
+ assertions: child.assertions,
30286
30246
  evaluatorProviderRequest: child.evaluatorRawRequest,
30287
30247
  scores: mapChildResults(child.scores),
30288
30248
  details: child.details,
@@ -30690,7 +30650,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
30690
30650
  return false;
30691
30651
  }
30692
30652
  var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
30693
- "answer",
30653
+ "outputText",
30694
30654
  "requests",
30695
30655
  "trace",
30696
30656
  "workspacePath",
@@ -30862,7 +30822,7 @@ var OtelTraceExporter = class {
30862
30822
  rootSpan.setAttribute("agentv.target", result.target);
30863
30823
  if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
30864
30824
  rootSpan.setAttribute("agentv.score", result.score);
30865
- if (captureContent) rootSpan.setAttribute("agentv.answer", result.answer);
30825
+ if (captureContent) rootSpan.setAttribute("agentv.output_text", result.outputText);
30866
30826
  if (result.durationMs != null)
30867
30827
  rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
30868
30828
  if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
@@ -31150,7 +31110,6 @@ export {
31150
31110
  isJsonValue,
31151
31111
  isTestMessage,
31152
31112
  isEvaluatorKind,
31153
- getHitCount,
31154
31113
  fileExists,
31155
31114
  normalizeLineEndings,
31156
31115
  readTextFile,
@@ -31290,4 +31249,4 @@ export {
31290
31249
  OtelStreamingObserver,
31291
31250
  createAgentKernel
31292
31251
  };
31293
- //# sourceMappingURL=chunk-GOZV2HN2.js.map
31252
+ //# sourceMappingURL=chunk-K4RXLQWV.js.map