agentv 3.3.0 → 3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -149,7 +149,7 @@ import {
149
149
  withUserAgentSuffix,
150
150
  withoutTrailingSlash,
151
151
  zodSchema
152
- } from "./chunk-AR3QEKXH.js";
152
+ } from "./chunk-BJV6MDBE.js";
153
153
  import {
154
154
  SpanStatusCode,
155
155
  context,
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
301
301
  }
302
302
  });
303
303
 
304
- // ../../packages/core/dist/chunk-C4MKEQR5.js
304
+ // ../../packages/core/dist/chunk-EFR4JHPL.js
305
305
  import { constants } from "node:fs";
306
306
  import { access, readFile } from "node:fs/promises";
307
307
  import path from "node:path";
@@ -419,7 +419,7 @@ __export(external_exports2, {
419
419
  void: () => voidType
420
420
  });
421
421
 
422
- // ../../packages/core/dist/chunk-C4MKEQR5.js
422
+ // ../../packages/core/dist/chunk-EFR4JHPL.js
423
423
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
424
424
  var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
425
425
  var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
@@ -498,9 +498,6 @@ var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
498
498
  function isEvaluatorKind(value) {
499
499
  return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
500
500
  }
501
- function getHitCount(result) {
502
- return result.hits.length;
503
- }
504
501
  async function fileExists(filePath) {
505
502
  try {
506
503
  await access(filePath, constants.F_OK);
@@ -1810,6 +1807,7 @@ var AGENT_PROVIDER_KINDS = [
1810
1807
  "copilot-sdk",
1811
1808
  "copilot-cli",
1812
1809
  "pi-coding-agent",
1810
+ "pi-agent-sdk",
1813
1811
  "claude",
1814
1812
  "claude-cli",
1815
1813
  "claude-sdk",
@@ -17622,7 +17620,7 @@ var AzureProvider = class {
17622
17620
  };
17623
17621
  this.retryConfig = config.retry;
17624
17622
  const azure = createAzure(buildAzureOptions(config));
17625
- this.model = azure(config.deploymentName);
17623
+ this.model = azure.chat(config.deploymentName);
17626
17624
  }
17627
17625
  id;
17628
17626
  kind = "azure";
@@ -20912,6 +20910,29 @@ var MockProvider = class {
20912
20910
  return this.delayMs;
20913
20911
  }
20914
20912
  };
20913
+ function extractPiTextContent(content) {
20914
+ if (typeof content === "string") {
20915
+ return content;
20916
+ }
20917
+ if (!Array.isArray(content)) {
20918
+ return void 0;
20919
+ }
20920
+ const textParts = [];
20921
+ for (const part of content) {
20922
+ if (!part || typeof part !== "object") {
20923
+ continue;
20924
+ }
20925
+ const p = part;
20926
+ if (p.type === "text" && typeof p.text === "string") {
20927
+ textParts.push(p.text);
20928
+ }
20929
+ }
20930
+ return textParts.length > 0 ? textParts.join("\n") : void 0;
20931
+ }
20932
+ function toFiniteNumber(value) {
20933
+ if (typeof value === "number" && Number.isFinite(value)) return value;
20934
+ return void 0;
20935
+ }
20915
20936
  var piAgentModule = null;
20916
20937
  var piAiModule = null;
20917
20938
  async function loadPiModules() {
@@ -20952,7 +20973,8 @@ var PiAgentSdkProvider = class {
20952
20973
  throw new Error("Pi agent SDK request was aborted before execution");
20953
20974
  }
20954
20975
  const { Agent, getModel, getEnvApiKey } = await loadPiModules();
20955
- const startTime = Date.now();
20976
+ const startTimeIso = (/* @__PURE__ */ new Date()).toISOString();
20977
+ const startMs = Date.now();
20956
20978
  const providerName = this.config.provider ?? "anthropic";
20957
20979
  const modelId = this.config.model ?? "claude-sonnet-4-20250514";
20958
20980
  const model = getModel(providerName, modelId);
@@ -20969,16 +20991,73 @@ var PiAgentSdkProvider = class {
20969
20991
  return this.config.apiKey ?? getEnvApiKey(provider) ?? void 0;
20970
20992
  }
20971
20993
  });
20972
- const output = [];
20973
- let finalAssistantContent = "";
20994
+ let tokenUsage;
20995
+ let costUsd;
20996
+ const toolTrackers = /* @__PURE__ */ new Map();
20997
+ const completedToolResults = /* @__PURE__ */ new Map();
20974
20998
  const unsubscribe = agent.subscribe((event) => {
20975
- if (event.type === "message_end") {
20976
- const msg = event.message;
20977
- if (msg.role === "assistant") {
20978
- const content = extractTextContent3(msg.content);
20979
- if (content) {
20980
- finalAssistantContent = content;
20999
+ switch (event.type) {
21000
+ case "message_end": {
21001
+ const msg = event.message;
21002
+ if (msg && typeof msg === "object" && "role" in msg && msg.role === "assistant" && "usage" in msg) {
21003
+ const usage = msg.usage;
21004
+ if (usage && typeof usage === "object") {
21005
+ const u = usage;
21006
+ const input = toFiniteNumber(u.input);
21007
+ const output = toFiniteNumber(u.output);
21008
+ const cached = toFiniteNumber(u.cacheRead);
21009
+ let callDelta;
21010
+ if (input !== void 0 || output !== void 0) {
21011
+ callDelta = {
21012
+ input: input ?? 0,
21013
+ output: output ?? 0,
21014
+ ...cached !== void 0 ? { cached } : {}
21015
+ };
21016
+ tokenUsage = {
21017
+ input: (tokenUsage?.input ?? 0) + callDelta.input,
21018
+ output: (tokenUsage?.output ?? 0) + callDelta.output,
21019
+ ...cached !== void 0 ? { cached: (tokenUsage?.cached ?? 0) + cached } : tokenUsage?.cached !== void 0 ? { cached: tokenUsage.cached } : {}
21020
+ };
21021
+ }
21022
+ const cost = u.cost;
21023
+ if (cost && typeof cost === "object") {
21024
+ const total = toFiniteNumber(cost.total);
21025
+ if (total !== void 0) {
21026
+ costUsd = (costUsd ?? 0) + total;
21027
+ }
21028
+ }
21029
+ request.streamCallbacks?.onLlmCallEnd?.(modelId, callDelta);
21030
+ }
20981
21031
  }
21032
+ break;
21033
+ }
21034
+ case "tool_execution_start": {
21035
+ toolTrackers.set(event.toolCallId, {
21036
+ toolCallId: event.toolCallId,
21037
+ toolName: event.toolName,
21038
+ args: event.args,
21039
+ startMs: Date.now(),
21040
+ startTime: (/* @__PURE__ */ new Date()).toISOString()
21041
+ });
21042
+ request.streamCallbacks?.onToolCallStart?.(event.toolName, event.toolCallId);
21043
+ break;
21044
+ }
21045
+ case "tool_execution_end": {
21046
+ const tracker = toolTrackers.get(event.toolCallId);
21047
+ const durationMs = tracker ? Date.now() - tracker.startMs : 0;
21048
+ completedToolResults.set(event.toolCallId, {
21049
+ output: event.result,
21050
+ durationMs
21051
+ });
21052
+ request.streamCallbacks?.onToolCallEnd?.(
21053
+ event.toolName,
21054
+ tracker?.args,
21055
+ event.result,
21056
+ durationMs,
21057
+ event.toolCallId
21058
+ );
21059
+ toolTrackers.delete(event.toolCallId);
21060
+ break;
20982
21061
  }
20983
21062
  }
20984
21063
  });
@@ -20997,10 +21076,12 @@ var PiAgentSdkProvider = class {
20997
21076
  }
20998
21077
  await agent.waitForIdle();
20999
21078
  const agentMessages = agent.state.messages;
21079
+ const output = [];
21000
21080
  for (const msg of agentMessages) {
21001
- output.push(convertAgentMessage(msg));
21081
+ output.push(convertAgentMessage(msg, toolTrackers, completedToolResults));
21002
21082
  }
21003
- const durationMs = Date.now() - startTime;
21083
+ const endTimeIso = (/* @__PURE__ */ new Date()).toISOString();
21084
+ const durationMs = Date.now() - startMs;
21004
21085
  return {
21005
21086
  raw: {
21006
21087
  messages: agentMessages,
@@ -21009,49 +21090,54 @@ var PiAgentSdkProvider = class {
21009
21090
  provider: this.config.provider
21010
21091
  },
21011
21092
  output,
21012
- durationMs
21093
+ tokenUsage,
21094
+ costUsd,
21095
+ durationMs,
21096
+ startTime: startTimeIso,
21097
+ endTime: endTimeIso
21013
21098
  };
21014
21099
  } finally {
21015
21100
  unsubscribe();
21016
21101
  }
21017
21102
  }
21018
21103
  };
21019
- function extractTextContent3(content) {
21020
- if (typeof content === "string") {
21021
- return content;
21022
- }
21023
- if (!Array.isArray(content)) {
21024
- return void 0;
21025
- }
21026
- const textParts = [];
21027
- for (const part of content) {
21028
- if (!part || typeof part !== "object") {
21029
- continue;
21030
- }
21031
- const p = part;
21032
- if (p.type === "text" && typeof p.text === "string") {
21033
- textParts.push(p.text);
21034
- }
21035
- }
21036
- return textParts.length > 0 ? textParts.join("\n") : void 0;
21037
- }
21038
- function convertAgentMessage(message) {
21104
+ function convertAgentMessage(message, toolTrackers, completedToolResults) {
21039
21105
  if (!message || typeof message !== "object") {
21040
21106
  return { role: "unknown", content: String(message) };
21041
21107
  }
21042
21108
  const msg = message;
21043
21109
  const role = typeof msg.role === "string" ? msg.role : "unknown";
21044
- const content = extractTextContent3(msg.content);
21045
- const toolCalls = extractToolCalls3(msg.content);
21110
+ const content = extractPiTextContent(msg.content);
21111
+ const toolCalls = extractToolCalls3(msg.content, toolTrackers, completedToolResults);
21046
21112
  const startTime = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
21113
+ let msgTokenUsage;
21114
+ if (msg.usage && typeof msg.usage === "object") {
21115
+ const u = msg.usage;
21116
+ const input = toFiniteNumber(u.input);
21117
+ const output = toFiniteNumber(u.output);
21118
+ if (input !== void 0 || output !== void 0) {
21119
+ msgTokenUsage = {
21120
+ input: input ?? 0,
21121
+ output: output ?? 0,
21122
+ ...toFiniteNumber(u.cacheRead) !== void 0 ? { cached: toFiniteNumber(u.cacheRead) } : {}
21123
+ };
21124
+ }
21125
+ }
21126
+ const metadata = {};
21127
+ if (msg.api) metadata.api = msg.api;
21128
+ if (msg.provider) metadata.provider = msg.provider;
21129
+ if (msg.model) metadata.model = msg.model;
21130
+ if (msg.stopReason) metadata.stopReason = msg.stopReason;
21047
21131
  return {
21048
21132
  role,
21049
21133
  content,
21050
21134
  toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
21051
- startTime
21135
+ startTime,
21136
+ metadata: Object.keys(metadata).length > 0 ? metadata : void 0,
21137
+ tokenUsage: msgTokenUsage
21052
21138
  };
21053
21139
  }
21054
- function extractToolCalls3(content) {
21140
+ function extractToolCalls3(content, toolTrackers, completedToolResults) {
21055
21141
  if (!Array.isArray(content)) {
21056
21142
  return [];
21057
21143
  }
@@ -21062,10 +21148,17 @@ function extractToolCalls3(content) {
21062
21148
  }
21063
21149
  const p = part;
21064
21150
  if (p.type === "toolCall" && typeof p.name === "string") {
21151
+ const id = typeof p.id === "string" ? p.id : void 0;
21152
+ const tracker = id ? toolTrackers.get(id) : void 0;
21153
+ const completed = id ? completedToolResults.get(id) : void 0;
21065
21154
  toolCalls.push({
21066
21155
  tool: p.name,
21067
21156
  input: p.arguments,
21068
- id: typeof p.id === "string" ? p.id : void 0
21157
+ id,
21158
+ output: completed?.output,
21159
+ durationMs: completed?.durationMs,
21160
+ startTime: tracker?.startTime,
21161
+ endTime: tracker?.startTime && completed?.durationMs !== void 0 ? new Date(new Date(tracker.startTime).getTime() + completed.durationMs).toISOString() : void 0
21069
21162
  });
21070
21163
  }
21071
21164
  }
@@ -21597,14 +21690,14 @@ function extractTokenUsage(events) {
21597
21690
  const usage = record.usage;
21598
21691
  if (usage && typeof usage === "object") {
21599
21692
  const u = usage;
21600
- const input = toNumber(u.input_tokens ?? u.inputTokens ?? u.input);
21601
- const output = toNumber(u.output_tokens ?? u.outputTokens ?? u.output);
21693
+ const input = toFiniteNumber(u.input_tokens ?? u.inputTokens ?? u.input);
21694
+ const output = toFiniteNumber(u.output_tokens ?? u.outputTokens ?? u.output);
21602
21695
  if (input !== void 0 || output !== void 0) {
21603
21696
  const result = {
21604
21697
  input: input ?? 0,
21605
21698
  output: output ?? 0
21606
21699
  };
21607
- const cached = toNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
21700
+ const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
21608
21701
  if (cached !== void 0) {
21609
21702
  return { ...result, cached };
21610
21703
  }
@@ -21629,13 +21722,13 @@ function aggregateUsageFromMessages(messages) {
21629
21722
  const usage = m.usage;
21630
21723
  if (!usage || typeof usage !== "object") continue;
21631
21724
  const u = usage;
21632
- const input = toNumber(u.input_tokens ?? u.inputTokens ?? u.input);
21633
- const output = toNumber(u.output_tokens ?? u.outputTokens ?? u.output);
21725
+ const input = toFiniteNumber(u.input_tokens ?? u.inputTokens ?? u.input);
21726
+ const output = toFiniteNumber(u.output_tokens ?? u.outputTokens ?? u.output);
21634
21727
  if (input !== void 0 || output !== void 0) {
21635
21728
  found = true;
21636
21729
  totalInput += input ?? 0;
21637
21730
  totalOutput += output ?? 0;
21638
- const cached = toNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
21731
+ const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
21639
21732
  if (cached !== void 0) {
21640
21733
  totalCached = (totalCached ?? 0) + cached;
21641
21734
  }
@@ -21648,10 +21741,6 @@ function aggregateUsageFromMessages(messages) {
21648
21741
  }
21649
21742
  return result;
21650
21743
  }
21651
- function toNumber(value) {
21652
- if (typeof value === "number" && Number.isFinite(value)) return value;
21653
- return void 0;
21654
- }
21655
21744
  function convertPiMessage(message) {
21656
21745
  if (!message || typeof message !== "object") {
21657
21746
  return void 0;
@@ -21661,7 +21750,7 @@ function convertPiMessage(message) {
21661
21750
  if (typeof role !== "string") {
21662
21751
  return void 0;
21663
21752
  }
21664
- const content = extractTextContent4(msg.content);
21753
+ const content = extractPiTextContent(msg.content);
21665
21754
  const toolCalls = extractToolCalls4(msg.content);
21666
21755
  const startTime = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
21667
21756
  const metadata = {};
@@ -21678,25 +21767,6 @@ function convertPiMessage(message) {
21678
21767
  metadata: Object.keys(metadata).length > 0 ? metadata : void 0
21679
21768
  };
21680
21769
  }
21681
- function extractTextContent4(content) {
21682
- if (typeof content === "string") {
21683
- return content;
21684
- }
21685
- if (!Array.isArray(content)) {
21686
- return void 0;
21687
- }
21688
- const textParts = [];
21689
- for (const part of content) {
21690
- if (!part || typeof part !== "object") {
21691
- continue;
21692
- }
21693
- const p = part;
21694
- if (p.type === "text" && typeof p.text === "string") {
21695
- textParts.push(p.text);
21696
- }
21697
- }
21698
- return textParts.length > 0 ? textParts.join("\n") : void 0;
21699
- }
21700
21770
  function extractToolCalls4(content) {
21701
21771
  if (!Array.isArray(content)) {
21702
21772
  return [];
@@ -23410,9 +23480,11 @@ function negateScore(score) {
23410
23480
  ...score,
23411
23481
  score: negatedScore,
23412
23482
  verdict: negatedVerdict,
23413
- reasoning: score.reasoning ? `[Negated] ${score.reasoning} (original score: ${score.score.toFixed(2)})` : `[Negated] Original score: ${score.score.toFixed(2)}`,
23414
- hits: score.misses,
23415
- misses: score.hits
23483
+ assertions: score.assertions.map((a) => ({
23484
+ ...a,
23485
+ passed: !a.passed,
23486
+ evidence: a.evidence ? `[Negated] ${a.evidence}` : void 0
23487
+ }))
23416
23488
  };
23417
23489
  }
23418
23490
  function shellEscapePath(value) {
@@ -23912,9 +23984,13 @@ var CodeEvaluator = class {
23912
23984
  );
23913
23985
  const parsed = parseJsonSafe(stdout);
23914
23986
  const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
23915
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
23916
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
23917
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
23987
+ const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
23988
+ (a) => typeof a === "object" && a !== null && typeof a.text === "string"
23989
+ ).map((a) => ({
23990
+ text: String(a.text),
23991
+ passed: Boolean(a.passed),
23992
+ ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
23993
+ })) : [];
23918
23994
  const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
23919
23995
  const proxyUsage = getProxyUsage?.();
23920
23996
  const evaluatorRawRequest = {
@@ -23930,10 +24006,8 @@ var CodeEvaluator = class {
23930
24006
  return {
23931
24007
  score,
23932
24008
  verdict: scoreToVerdict(score),
23933
- hits,
23934
- misses,
23935
- expectedAspectCount: hits.length + misses.length || 1,
23936
- reasoning,
24009
+ assertions,
24010
+ expectedAspectCount: assertions.length || 1,
23937
24011
  evaluatorRawRequest,
23938
24012
  ...details ? { details } : {},
23939
24013
  tokenUsage: proxyUsage?.tokenUsage
@@ -23944,10 +24018,8 @@ var CodeEvaluator = class {
23944
24018
  return {
23945
24019
  score: 0,
23946
24020
  verdict: "fail",
23947
- hits: [],
23948
- misses: [`Code evaluator failed: ${message}`],
24021
+ assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
23949
24022
  expectedAspectCount: 1,
23950
- reasoning: message,
23951
24023
  evaluatorRawRequest: {
23952
24024
  command: this.command,
23953
24025
  ...this.cwd ? { cwd: this.cwd } : {},
@@ -24046,9 +24118,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
24046
24118
  {{${TEMPLATE_VARIABLES.ANSWER}}}`;
24047
24119
  var freeformEvaluationSchema = external_exports2.object({
24048
24120
  score: external_exports2.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
24049
- hits: external_exports2.array(external_exports2.string()).describe("Brief specific achievements").optional(),
24050
- misses: external_exports2.array(external_exports2.string()).describe("Brief failures or omissions").optional(),
24051
- reasoning: external_exports2.string().describe("Concise explanation (1-2 sentences)").optional()
24121
+ assertions: external_exports2.array(
24122
+ external_exports2.object({
24123
+ text: external_exports2.string().describe("Brief description of what was checked"),
24124
+ passed: external_exports2.boolean().describe("Whether this aspect was satisfied"),
24125
+ evidence: external_exports2.string().describe("Concise evidence (1-2 sentences)").optional()
24126
+ })
24127
+ ).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
24052
24128
  });
24053
24129
  var rubricCheckResultSchema = external_exports2.object({
24054
24130
  id: external_exports2.string().describe("The ID of the rubric item being checked"),
@@ -24150,17 +24226,12 @@ ${context2.fileChanges}`;
24150
24226
  schema: freeformEvaluationSchema
24151
24227
  });
24152
24228
  const score = clampScore(data.score);
24153
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
24154
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
24155
- const reasoning = data.reasoning;
24156
- const expectedAspectCount = Math.max(hits.length + misses.length, 1);
24229
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
24157
24230
  return {
24158
24231
  score,
24159
24232
  verdict: scoreToVerdict(score),
24160
- hits,
24161
- misses,
24162
- expectedAspectCount,
24163
- reasoning,
24233
+ assertions,
24234
+ expectedAspectCount: Math.max(assertions.length, 1),
24164
24235
  evaluatorRawRequest,
24165
24236
  tokenUsage
24166
24237
  };
@@ -24171,10 +24242,8 @@ ${context2.fileChanges}`;
24171
24242
  return {
24172
24243
  score: 0,
24173
24244
  verdict: "skip",
24174
- hits: [],
24175
- misses: [`Grader parse failure after 3 attempts: ${message}`],
24245
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
24176
24246
  expectedAspectCount: 1,
24177
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
24178
24247
  evaluatorRawRequest
24179
24248
  };
24180
24249
  }
@@ -24204,14 +24273,12 @@ ${context2.fileChanges}`;
24204
24273
  userPrompt: prompt,
24205
24274
  schema: rubricEvaluationSchema
24206
24275
  });
24207
- const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
24276
+ const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
24208
24277
  return {
24209
24278
  score,
24210
24279
  verdict,
24211
- hits,
24212
- misses,
24280
+ assertions,
24213
24281
  expectedAspectCount: rubrics.length,
24214
- reasoning: data.overall_reasoning,
24215
24282
  evaluatorRawRequest,
24216
24283
  tokenUsage
24217
24284
  };
@@ -24222,10 +24289,8 @@ ${context2.fileChanges}`;
24222
24289
  return {
24223
24290
  score: 0,
24224
24291
  verdict: "skip",
24225
- hits: [],
24226
- misses: [`Grader parse failure after 3 attempts: ${message}`],
24292
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
24227
24293
  expectedAspectCount: rubrics.length,
24228
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
24229
24294
  evaluatorRawRequest
24230
24295
  };
24231
24296
  }
@@ -24250,14 +24315,12 @@ ${context2.fileChanges}`;
24250
24315
  userPrompt: prompt,
24251
24316
  schema: scoreRangeEvaluationSchema
24252
24317
  });
24253
- const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
24318
+ const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
24254
24319
  return {
24255
24320
  score,
24256
24321
  verdict,
24257
- hits,
24258
- misses,
24322
+ assertions,
24259
24323
  expectedAspectCount: rubrics.length,
24260
- reasoning: data.overall_reasoning,
24261
24324
  evaluatorRawRequest,
24262
24325
  details,
24263
24326
  tokenUsage
@@ -24269,10 +24332,8 @@ ${context2.fileChanges}`;
24269
24332
  return {
24270
24333
  score: 0,
24271
24334
  verdict: "skip",
24272
- hits: [],
24273
- misses: [`Grader parse failure after 3 attempts: ${message}`],
24335
+ assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
24274
24336
  expectedAspectCount: rubrics.length,
24275
- reasoning: `Grader parse failure after 3 attempts: ${message}`,
24276
24337
  evaluatorRawRequest
24277
24338
  };
24278
24339
  }
@@ -24329,8 +24390,7 @@ ${context2.fileChanges}`;
24329
24390
  return {
24330
24391
  score: 0,
24331
24392
  verdict: "fail",
24332
- hits: [],
24333
- misses: [`llm-grader built-in evaluation failed: ${message}`],
24393
+ assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
24334
24394
  expectedAspectCount: 1,
24335
24395
  evaluatorRawRequest,
24336
24396
  details: { mode: "built-in", error: message }
@@ -24380,8 +24440,9 @@ ${context2.fileChanges}`;
24380
24440
  return {
24381
24441
  score: 0,
24382
24442
  verdict: "fail",
24383
- hits: [],
24384
- misses: [`llm-grader ${modeLabel} returned no assistant response`],
24443
+ assertions: [
24444
+ { text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
24445
+ ],
24385
24446
  expectedAspectCount: 1,
24386
24447
  evaluatorRawRequest,
24387
24448
  details: { mode: modeLabel, grader_target: provider.targetName }
@@ -24399,8 +24460,9 @@ ${context2.fileChanges}`;
24399
24460
  return {
24400
24461
  score: 0,
24401
24462
  verdict: "fail",
24402
- hits: [],
24403
- misses: [`llm-grader ${modeLabel} evaluation failed: ${message}`],
24463
+ assertions: [
24464
+ { text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
24465
+ ],
24404
24466
  expectedAspectCount: 1,
24405
24467
  evaluatorRawRequest,
24406
24468
  details: {
@@ -24552,29 +24614,24 @@ ${outputSchema2}`;
24552
24614
  const parsed = parseJsonFromText(text2);
24553
24615
  if (rubrics && rubrics.length > 0) {
24554
24616
  const data2 = rubricEvaluationSchema.parse(parsed);
24555
- const { score: score2, verdict, hits: hits2, misses: misses2 } = calculateRubricScore(data2, rubrics);
24617
+ const { score: score2, verdict, assertions: assertions2 } = calculateRubricScore(data2, rubrics);
24556
24618
  return {
24557
24619
  score: score2,
24558
24620
  verdict,
24559
- hits: hits2,
24560
- misses: misses2,
24621
+ assertions: assertions2,
24561
24622
  expectedAspectCount: rubrics.length,
24562
- reasoning: data2.overall_reasoning,
24563
24623
  evaluatorRawRequest,
24564
24624
  details
24565
24625
  };
24566
24626
  }
24567
24627
  const data = freeformEvaluationSchema.parse(parsed);
24568
24628
  const score = clampScore(data.score);
24569
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
24570
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
24629
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
24571
24630
  return {
24572
24631
  score,
24573
24632
  verdict: scoreToVerdict(score),
24574
- hits,
24575
- misses,
24576
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
24577
- reasoning: data.reasoning,
24633
+ assertions,
24634
+ expectedAspectCount: Math.max(assertions.length, 1),
24578
24635
  evaluatorRawRequest,
24579
24636
  details
24580
24637
  };
@@ -24582,8 +24639,12 @@ ${outputSchema2}`;
24582
24639
  return {
24583
24640
  score: 0,
24584
24641
  verdict: "fail",
24585
- hits: [],
24586
- misses: ["Failed to parse llm-grader agent response as valid evaluation JSON"],
24642
+ assertions: [
24643
+ {
24644
+ text: "Failed to parse llm-grader agent response as valid evaluation JSON",
24645
+ passed: false
24646
+ }
24647
+ ],
24587
24648
  expectedAspectCount: 1,
24588
24649
  evaluatorRawRequest,
24589
24650
  details
@@ -24712,9 +24773,13 @@ function buildOutputSchema() {
24712
24773
  "",
24713
24774
  "{",
24714
24775
  ' "score": <number between 0.0 and 1.0>,',
24715
- ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
24716
- ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
24717
- ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
24776
+ ' "assertions": [',
24777
+ " {",
24778
+ ' "text": "<brief description of what was checked>",',
24779
+ ' "passed": <boolean>,',
24780
+ ' "evidence": "<concise evidence, 1-2 sentences, optional>"',
24781
+ " }",
24782
+ " ]",
24718
24783
  "}"
24719
24784
  ].join("\n");
24720
24785
  }
@@ -24739,8 +24804,7 @@ function substituteVariables(template, variables) {
24739
24804
  }
24740
24805
  function calculateRubricScore(result, rubrics) {
24741
24806
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
24742
- const hits = [];
24743
- const misses = [];
24807
+ const assertions = [];
24744
24808
  let totalWeight = 0;
24745
24809
  let earnedWeight = 0;
24746
24810
  let failedRequired = false;
@@ -24750,19 +24814,20 @@ function calculateRubricScore(result, rubrics) {
24750
24814
  continue;
24751
24815
  }
24752
24816
  totalWeight += rubric.weight;
24817
+ assertions.push({
24818
+ text: `[${rubric.id}] ${rubric.outcome}`,
24819
+ passed: check.satisfied,
24820
+ evidence: check.reasoning
24821
+ });
24753
24822
  if (check.satisfied) {
24754
24823
  earnedWeight += rubric.weight;
24755
- hits.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
24756
- } else {
24757
- misses.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
24758
- if (rubric.required) {
24759
- failedRequired = true;
24760
- }
24824
+ } else if (rubric.required) {
24825
+ failedRequired = true;
24761
24826
  }
24762
24827
  }
24763
24828
  const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
24764
24829
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
24765
- return { score, verdict, hits, misses };
24830
+ return { score, verdict, assertions };
24766
24831
  }
24767
24832
  function buildScoreRangeOutputSchema() {
24768
24833
  return `You are an expert evaluator. Score the candidate answer on each criterion.
@@ -24782,8 +24847,7 @@ Important: The "score" must be an integer from 0 to 10 that falls within one of
24782
24847
  }
24783
24848
  function calculateScoreRangeResult(result, rubrics) {
24784
24849
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
24785
- const hits = [];
24786
- const misses = [];
24850
+ const assertions = [];
24787
24851
  const rawScores = {};
24788
24852
  let totalWeight = 0;
24789
24853
  let weightedScoreSum = 0;
@@ -24809,24 +24873,22 @@ function calculateScoreRangeResult(result, rubrics) {
24809
24873
  );
24810
24874
  const rangeDescription = matchingRange?.outcome ?? "";
24811
24875
  const criterionLabel = rubric.outcome ?? rubric.id;
24812
- const reasoningText = check.reasoning ? `: ${check.reasoning}` : "";
24813
- const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
24876
+ const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
24814
24877
  if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
24815
24878
  failedRequired = true;
24816
- misses.push(scoreInfo);
24817
- } else if (rawScore >= 7) {
24818
- hits.push(scoreInfo);
24819
- } else {
24820
- misses.push(scoreInfo);
24821
24879
  }
24880
+ assertions.push({
24881
+ text: `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})`,
24882
+ passed,
24883
+ evidence: check.reasoning
24884
+ });
24822
24885
  }
24823
24886
  const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
24824
24887
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
24825
24888
  return {
24826
24889
  score,
24827
24890
  verdict,
24828
- hits,
24829
- misses,
24891
+ assertions,
24830
24892
  details: {
24831
24893
  raw_scores: rawScores,
24832
24894
  normalization: "score / 10",
@@ -25000,9 +25062,7 @@ var CompositeEvaluator = class {
25000
25062
  let totalWeight = 0;
25001
25063
  let weightedSum = 0;
25002
25064
  let evaluatedCount = 0;
25003
- const allHits = [];
25004
- const allMisses = [];
25005
- const reasoningParts = [];
25065
+ const allAssertions = [];
25006
25066
  const scores = [];
25007
25067
  for (const member of results) {
25008
25068
  const weight = weights?.[member.id] ?? 1;
@@ -25012,9 +25072,7 @@ var CompositeEvaluator = class {
25012
25072
  score: member.result.score,
25013
25073
  weight,
25014
25074
  verdict: member.result.verdict,
25015
- hits: [...member.result.hits],
25016
- misses: [...member.result.misses],
25017
- reasoning: member.result.reasoning,
25075
+ assertions: [...member.result.assertions],
25018
25076
  evaluatorRawRequest: member.result.evaluatorRawRequest,
25019
25077
  scores: member.result.scores,
25020
25078
  details: member.result.details,
@@ -25026,20 +25084,16 @@ var CompositeEvaluator = class {
25026
25084
  evaluatedCount++;
25027
25085
  totalWeight += weight;
25028
25086
  weightedSum += member.result.score * weight;
25029
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
25030
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
25031
- if (member.result.reasoning) {
25032
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
25033
- }
25087
+ allAssertions.push(
25088
+ ...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
25089
+ );
25034
25090
  }
25035
25091
  if (evaluatedCount === 0 && results.length > 0) {
25036
25092
  return {
25037
25093
  score: 0,
25038
25094
  verdict: "skip",
25039
- hits: [],
25040
- misses: [],
25095
+ assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
25041
25096
  expectedAspectCount: 1,
25042
- reasoning: "All evaluators skipped (infrastructure failure)",
25043
25097
  evaluatorRawRequest: {
25044
25098
  aggregator: "weighted_average",
25045
25099
  ...weights ? { weights } : {}
@@ -25051,10 +25105,8 @@ var CompositeEvaluator = class {
25051
25105
  return {
25052
25106
  score: clampScore(finalScore),
25053
25107
  verdict: scoreToVerdict(finalScore),
25054
- hits: allHits,
25055
- misses: allMisses,
25056
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
25057
- reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
25108
+ assertions: allAssertions,
25109
+ expectedAspectCount: allAssertions.length || 1,
25058
25110
  evaluatorRawRequest: {
25059
25111
  aggregator: "weighted_average",
25060
25112
  ...weights ? { weights } : {}
@@ -25064,11 +25116,8 @@ var CompositeEvaluator = class {
25064
25116
  }
25065
25117
  runThreshold(results, threshold) {
25066
25118
  const scores = [];
25067
- const allHits = [];
25068
- const allMisses = [];
25069
- const reasoningParts = [];
25119
+ const allAssertions = [];
25070
25120
  let passingCount = 0;
25071
- let borderlineCount = 0;
25072
25121
  let evaluatedCount = 0;
25073
25122
  for (const member of results) {
25074
25123
  scores.push({
@@ -25076,9 +25125,7 @@ var CompositeEvaluator = class {
25076
25125
  type: member.type,
25077
25126
  score: member.result.score,
25078
25127
  verdict: member.result.verdict,
25079
- hits: [...member.result.hits],
25080
- misses: [...member.result.misses],
25081
- reasoning: member.result.reasoning,
25128
+ assertions: [...member.result.assertions],
25082
25129
  evaluatorRawRequest: member.result.evaluatorRawRequest,
25083
25130
  scores: member.result.scores,
25084
25131
  details: member.result.details,
@@ -25091,24 +25138,17 @@ var CompositeEvaluator = class {
25091
25138
  const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
25092
25139
  if (isPassing) {
25093
25140
  passingCount++;
25094
- if (member.result.verdict === "borderline") {
25095
- borderlineCount++;
25096
- }
25097
- }
25098
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
25099
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
25100
- if (member.result.reasoning) {
25101
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
25102
25141
  }
25142
+ allAssertions.push(
25143
+ ...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
25144
+ );
25103
25145
  }
25104
25146
  if (evaluatedCount === 0 && results.length > 0) {
25105
25147
  return {
25106
25148
  score: 0,
25107
25149
  verdict: "skip",
25108
- hits: [],
25109
- misses: [],
25150
+ assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
25110
25151
  expectedAspectCount: 1,
25111
- reasoning: "All evaluators skipped (infrastructure failure)",
25112
25152
  evaluatorRawRequest: {
25113
25153
  aggregator: "threshold",
25114
25154
  threshold
@@ -25119,19 +25159,15 @@ var CompositeEvaluator = class {
25119
25159
  const totalCount = evaluatedCount;
25120
25160
  const score = totalCount > 0 ? passingCount / totalCount : 0;
25121
25161
  const pass = score >= threshold;
25122
- if (pass && borderlineCount > 0) {
25123
- reasoningParts.push(`Warning: ${borderlineCount} borderline evaluator(s) counted as passing`);
25124
- }
25125
- reasoningParts.unshift(
25126
- `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`
25127
- );
25162
+ allAssertions.unshift({
25163
+ text: `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`,
25164
+ passed: pass
25165
+ });
25128
25166
  return {
25129
25167
  score: clampScore(score),
25130
25168
  verdict: pass ? "pass" : "fail",
25131
- hits: allHits,
25132
- misses: allMisses,
25133
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
25134
- reasoning: reasoningParts.join("; "),
25169
+ assertions: allAssertions,
25170
+ expectedAspectCount: allAssertions.length || 1,
25135
25171
  evaluatorRawRequest: {
25136
25172
  aggregator: "threshold",
25137
25173
  threshold
@@ -25148,9 +25184,7 @@ var CompositeEvaluator = class {
25148
25184
  score: member.result.score,
25149
25185
  weight: weights?.[member.id] ?? 1,
25150
25186
  verdict: member.result.verdict,
25151
- hits: [...member.result.hits],
25152
- misses: [...member.result.misses],
25153
- reasoning: member.result.reasoning,
25187
+ assertions: [...member.result.assertions],
25154
25188
  evaluatorRawRequest: member.result.evaluatorRawRequest,
25155
25189
  scores: member.result.scores,
25156
25190
  details: member.result.details
@@ -25159,17 +25193,19 @@ var CompositeEvaluator = class {
25159
25193
  const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
25160
25194
  const parsed = parseJsonSafe(stdout);
25161
25195
  const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
25162
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
25163
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
25164
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
25196
+ const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
25197
+ (a) => typeof a === "object" && a !== null && typeof a.text === "string"
25198
+ ).map((a) => ({
25199
+ text: String(a.text),
25200
+ passed: Boolean(a.passed),
25201
+ ...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
25202
+ })) : [];
25165
25203
  const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
25166
25204
  return {
25167
25205
  score,
25168
25206
  verdict,
25169
- hits,
25170
- misses,
25171
- expectedAspectCount: hits.length + misses.length || 1,
25172
- reasoning,
25207
+ assertions,
25208
+ expectedAspectCount: assertions.length || 1,
25173
25209
  evaluatorRawRequest: {
25174
25210
  aggregator: "code-grader",
25175
25211
  script: scriptPath
@@ -25181,10 +25217,8 @@ var CompositeEvaluator = class {
25181
25217
  return {
25182
25218
  score: 0,
25183
25219
  verdict: "fail",
25184
- hits: [],
25185
- misses: [`Code aggregator failed: ${message}`],
25220
+ assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
25186
25221
  expectedAspectCount: 1,
25187
- reasoning: message,
25188
25222
  evaluatorRawRequest: {
25189
25223
  aggregator: "code-grader",
25190
25224
  script: scriptPath,
@@ -25206,9 +25240,7 @@ var CompositeEvaluator = class {
25206
25240
  type: member.type,
25207
25241
  score: member.result.score,
25208
25242
  verdict: member.result.verdict,
25209
- hits: [...member.result.hits],
25210
- misses: [...member.result.misses],
25211
- reasoning: member.result.reasoning,
25243
+ assertions: [...member.result.assertions],
25212
25244
  evaluatorRawRequest: member.result.evaluatorRawRequest,
25213
25245
  scores: member.result.scores,
25214
25246
  details: member.result.details
@@ -25232,16 +25264,12 @@ var CompositeEvaluator = class {
25232
25264
  });
25233
25265
  const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text2));
25234
25266
  const score2 = clampScore(data2.score);
25235
- const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
25236
- const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
25237
- const reasoning2 = data2.reasoning;
25267
+ const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
25238
25268
  return {
25239
25269
  score: score2,
25240
25270
  verdict: scoreToVerdict(score2),
25241
- hits: hits2,
25242
- misses: misses2,
25243
- expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
25244
- reasoning: reasoning2,
25271
+ assertions: assertions2,
25272
+ expectedAspectCount: Math.max(assertions2.length, 1),
25245
25273
  evaluatorRawRequest,
25246
25274
  scores
25247
25275
  };
@@ -25256,16 +25284,12 @@ var CompositeEvaluator = class {
25256
25284
  parseJsonFromText(extractLastAssistantContent(response.output))
25257
25285
  );
25258
25286
  const score = clampScore(data.score);
25259
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
25260
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
25261
- const reasoning = data.reasoning;
25287
+ const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
25262
25288
  return {
25263
25289
  score,
25264
25290
  verdict: scoreToVerdict(score),
25265
- hits,
25266
- misses,
25267
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
25268
- reasoning,
25291
+ assertions,
25292
+ expectedAspectCount: Math.max(assertions.length, 1),
25269
25293
  evaluatorRawRequest,
25270
25294
  scores
25271
25295
  };
@@ -25273,8 +25297,7 @@ var CompositeEvaluator = class {
25273
25297
  return {
25274
25298
  score: 0,
25275
25299
  verdict: "fail",
25276
- hits: [],
25277
- misses: [],
25300
+ assertions: [{ text: "LLM aggregator failed", passed: false }],
25278
25301
  expectedAspectCount: 1,
25279
25302
  evaluatorRawRequest,
25280
25303
  scores
@@ -25295,10 +25318,8 @@ var CostEvaluator = class {
25295
25318
  return {
25296
25319
  score: 0,
25297
25320
  verdict: "fail",
25298
- hits: [],
25299
- misses: ["No cost data available in trace"],
25321
+ assertions: [{ text: "No cost data available in trace", passed: false }],
25300
25322
  expectedAspectCount: 1,
25301
- reasoning: "Execution cost not reported by provider",
25302
25323
  evaluatorRawRequest: {
25303
25324
  type: "cost",
25304
25325
  budget,
@@ -25312,10 +25333,10 @@ var CostEvaluator = class {
25312
25333
  return {
25313
25334
  score,
25314
25335
  verdict: passed ? "pass" : "fail",
25315
- hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
25316
- misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
25336
+ assertions: [
25337
+ passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
25338
+ ],
25317
25339
  expectedAspectCount: 1,
25318
- reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
25319
25340
  evaluatorRawRequest: {
25320
25341
  type: "cost",
25321
25342
  budget,
@@ -25346,10 +25367,8 @@ var ExecutionMetricsEvaluator = class {
25346
25367
  return {
25347
25368
  score: 0,
25348
25369
  verdict: "fail",
25349
- hits: [],
25350
- misses: ["No trace summary available"],
25370
+ assertions: [{ text: "No trace summary available", passed: false }],
25351
25371
  expectedAspectCount: 1,
25352
- reasoning: "Execution metrics not available - no trace summary provided",
25353
25372
  evaluatorRawRequest: {
25354
25373
  type: "execution-metrics",
25355
25374
  config: this.extractConfiguredThresholds(),
@@ -25358,116 +25377,114 @@ var ExecutionMetricsEvaluator = class {
25358
25377
  };
25359
25378
  }
25360
25379
  const narrowedTrace = trace2;
25361
- const hits = [];
25362
- const misses = [];
25380
+ const assertions = [];
25363
25381
  const actualMetrics = {};
25364
25382
  if (max_tool_calls !== void 0 && narrowedTrace) {
25365
25383
  const toolCalls = narrowedTrace.eventCount;
25366
25384
  actualMetrics.tool_calls = toolCalls;
25367
25385
  if (toolCalls <= max_tool_calls) {
25368
- hits.push(`Tool calls ${toolCalls} <= ${max_tool_calls} max`);
25386
+ assertions.push({ text: `Tool calls ${toolCalls} <= ${max_tool_calls} max`, passed: true });
25369
25387
  } else {
25370
- misses.push(`Tool calls ${toolCalls} > ${max_tool_calls} max`);
25388
+ assertions.push({ text: `Tool calls ${toolCalls} > ${max_tool_calls} max`, passed: false });
25371
25389
  }
25372
25390
  }
25373
25391
  if (max_llm_calls !== void 0 && narrowedTrace) {
25374
25392
  const llmCalls = narrowedTrace.llmCallCount;
25375
25393
  if (llmCalls === void 0) {
25376
- misses.push("LLM call count data not available");
25394
+ assertions.push({ text: "LLM call count data not available", passed: false });
25377
25395
  } else {
25378
25396
  actualMetrics.llm_calls = llmCalls;
25379
25397
  if (llmCalls <= max_llm_calls) {
25380
- hits.push(`LLM calls ${llmCalls} <= ${max_llm_calls} max`);
25398
+ assertions.push({ text: `LLM calls ${llmCalls} <= ${max_llm_calls} max`, passed: true });
25381
25399
  } else {
25382
- misses.push(`LLM calls ${llmCalls} > ${max_llm_calls} max`);
25400
+ assertions.push({ text: `LLM calls ${llmCalls} > ${max_llm_calls} max`, passed: false });
25383
25401
  }
25384
25402
  }
25385
25403
  }
25386
25404
  if (max_tokens !== void 0) {
25387
25405
  if (!tokenUsage) {
25388
- misses.push("Token usage data not available");
25406
+ assertions.push({ text: "Token usage data not available", passed: false });
25389
25407
  } else {
25390
25408
  const totalTokens = tokenUsage.input + tokenUsage.output;
25391
25409
  actualMetrics.tokens = totalTokens;
25392
25410
  if (totalTokens <= max_tokens) {
25393
- hits.push(`Total tokens ${totalTokens} <= ${max_tokens} max`);
25411
+ assertions.push({
25412
+ text: `Total tokens ${totalTokens} <= ${max_tokens} max`,
25413
+ passed: true
25414
+ });
25394
25415
  } else {
25395
- misses.push(`Total tokens ${totalTokens} > ${max_tokens} max`);
25416
+ assertions.push({
25417
+ text: `Total tokens ${totalTokens} > ${max_tokens} max`,
25418
+ passed: false
25419
+ });
25396
25420
  }
25397
25421
  }
25398
25422
  }
25399
25423
  if (max_cost_usd !== void 0) {
25400
25424
  if (costUsd === void 0) {
25401
- misses.push("Cost data not available");
25425
+ assertions.push({ text: "Cost data not available", passed: false });
25402
25426
  } else {
25403
25427
  actualMetrics.cost_usd = costUsd;
25404
25428
  const formatCost = (n) => `$${n.toFixed(4)}`;
25405
25429
  if (costUsd <= max_cost_usd) {
25406
- hits.push(`Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`);
25430
+ assertions.push({
25431
+ text: `Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`,
25432
+ passed: true
25433
+ });
25407
25434
  } else {
25408
- misses.push(`Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`);
25435
+ assertions.push({
25436
+ text: `Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`,
25437
+ passed: false
25438
+ });
25409
25439
  }
25410
25440
  }
25411
25441
  }
25412
25442
  if (max_duration_ms !== void 0) {
25413
25443
  if (durationMs === void 0) {
25414
- misses.push("Duration data not available");
25444
+ assertions.push({ text: "Duration data not available", passed: false });
25415
25445
  } else {
25416
25446
  actualMetrics.duration_ms = durationMs;
25417
25447
  if (durationMs <= max_duration_ms) {
25418
- hits.push(`Duration ${durationMs}ms <= ${max_duration_ms}ms max`);
25448
+ assertions.push({
25449
+ text: `Duration ${durationMs}ms <= ${max_duration_ms}ms max`,
25450
+ passed: true
25451
+ });
25419
25452
  } else {
25420
- misses.push(`Duration ${durationMs}ms > ${max_duration_ms}ms max`);
25453
+ assertions.push({
25454
+ text: `Duration ${durationMs}ms > ${max_duration_ms}ms max`,
25455
+ passed: false
25456
+ });
25421
25457
  }
25422
25458
  }
25423
25459
  }
25424
25460
  if (target_exploration_ratio !== void 0 && narrowedTrace) {
25425
25461
  const ratio = explorationRatio(narrowedTrace);
25426
25462
  if (ratio === void 0) {
25427
- misses.push("Exploration ratio not available (no tool calls)");
25463
+ assertions.push({ text: "Exploration ratio not available (no tool calls)", passed: false });
25428
25464
  } else {
25429
25465
  actualMetrics.exploration_ratio = ratio;
25430
25466
  const diff = Math.abs(ratio - target_exploration_ratio);
25431
25467
  if (diff <= exploration_tolerance) {
25432
- hits.push(
25433
- `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`
25434
- );
25468
+ assertions.push({
25469
+ text: `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`,
25470
+ passed: true
25471
+ });
25435
25472
  } else {
25436
- misses.push(
25437
- `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`
25438
- );
25473
+ assertions.push({
25474
+ text: `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`,
25475
+ passed: false
25476
+ });
25439
25477
  }
25440
25478
  }
25441
25479
  }
25442
- const totalChecks = hits.length + misses.length;
25443
- const score = totalChecks > 0 ? hits.length / totalChecks : 0;
25444
- const reasoningParts = [];
25445
- if (actualMetrics.tool_calls !== void 0) {
25446
- reasoningParts.push(`tool_calls=${actualMetrics.tool_calls}`);
25447
- }
25448
- if (actualMetrics.llm_calls !== void 0) {
25449
- reasoningParts.push(`llm_calls=${actualMetrics.llm_calls}`);
25450
- }
25451
- if (actualMetrics.tokens !== void 0) {
25452
- reasoningParts.push(`tokens=${actualMetrics.tokens}`);
25453
- }
25454
- if (actualMetrics.cost_usd !== void 0) {
25455
- reasoningParts.push(`cost=$${actualMetrics.cost_usd.toFixed(4)}`);
25456
- }
25457
- if (actualMetrics.duration_ms !== void 0) {
25458
- reasoningParts.push(`duration=${actualMetrics.duration_ms}ms`);
25459
- }
25460
- if (actualMetrics.exploration_ratio !== void 0) {
25461
- reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
25462
- }
25463
- const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
25480
+ const totalChecks = assertions.length;
25481
+ const passedCount = assertions.filter((a) => a.passed).length;
25482
+ const score = totalChecks > 0 ? passedCount / totalChecks : 0;
25464
25483
  return {
25465
25484
  score,
25466
25485
  verdict: scoreToVerdict(score),
25467
- hits,
25468
- misses,
25486
+ assertions,
25469
25487
  expectedAspectCount: totalChecks || 1,
25470
- reasoning,
25471
25488
  evaluatorRawRequest: {
25472
25489
  type: "execution-metrics",
25473
25490
  config: this.extractConfiguredThresholds(),
@@ -25569,10 +25586,8 @@ var FieldAccuracyEvaluator = class {
25569
25586
  return {
25570
25587
  score: 0,
25571
25588
  verdict: "fail",
25572
- hits: [],
25573
- misses: ["Failed to parse candidate answer as JSON"],
25574
- expectedAspectCount: this.config.fields.length,
25575
- reasoning: "Candidate answer is not valid JSON"
25589
+ assertions: [{ text: "Failed to parse candidate answer as JSON", passed: false }],
25590
+ expectedAspectCount: this.config.fields.length
25576
25591
  };
25577
25592
  }
25578
25593
  const expectedData = this.extractExpectedData(evalCase.expected_output);
@@ -25580,10 +25595,8 @@ var FieldAccuracyEvaluator = class {
25580
25595
  return {
25581
25596
  score: 0,
25582
25597
  verdict: "fail",
25583
- hits: [],
25584
- misses: ["No expected data found in expected_output"],
25585
- expectedAspectCount: this.config.fields.length,
25586
- reasoning: "Could not extract expected data from expected_output"
25598
+ assertions: [{ text: "No expected data found in expected_output", passed: false }],
25599
+ expectedAspectCount: this.config.fields.length
25587
25600
  };
25588
25601
  }
25589
25602
  const fieldResults = [];
@@ -25709,8 +25722,8 @@ var FieldAccuracyEvaluator = class {
25709
25722
  */
25710
25723
  compareNumericTolerance(path46, candidateValue, expectedValue, fieldConfig, weight) {
25711
25724
  const { tolerance = 0, relative = false } = fieldConfig;
25712
- const candidateNum = toNumber2(candidateValue);
25713
- const expectedNum = toNumber2(expectedValue);
25725
+ const candidateNum = toNumber(candidateValue);
25726
+ const expectedNum = toNumber(expectedValue);
25714
25727
  if (candidateNum === null || expectedNum === null) {
25715
25728
  return {
25716
25729
  path: path46,
@@ -25801,18 +25814,14 @@ var FieldAccuracyEvaluator = class {
25801
25814
  */
25802
25815
  aggregateResults(results) {
25803
25816
  const aggregation = this.config.aggregation ?? "weighted_average";
25804
- const hits = [];
25805
- const misses = [];
25817
+ const assertions = [];
25806
25818
  for (const result of results) {
25807
- if (result.hit) {
25808
- hits.push(result.message);
25809
- } else {
25810
- misses.push(result.message);
25811
- }
25819
+ assertions.push({ text: result.message, passed: result.hit });
25812
25820
  }
25813
25821
  let score;
25814
25822
  if (aggregation === "all_or_nothing") {
25815
- score = misses.length === 0 ? 1 : 0;
25823
+ const hasFailed = assertions.some((a) => !a.passed);
25824
+ score = hasFailed ? 0 : 1;
25816
25825
  } else {
25817
25826
  const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
25818
25827
  if (totalWeight === 0) {
@@ -25822,15 +25831,11 @@ var FieldAccuracyEvaluator = class {
25822
25831
  score = weightedSum / totalWeight;
25823
25832
  }
25824
25833
  }
25825
- const reasoning = `${hits.length}/${results.length} fields matched`;
25826
25834
  return {
25827
25835
  score: clampScore(score),
25828
25836
  verdict: scoreToVerdict(score),
25829
- hits: hits.slice(0, 4),
25830
- // Cap at 4 to keep output concise
25831
- misses: misses.slice(0, 4),
25832
- expectedAspectCount: results.length,
25833
- reasoning
25837
+ assertions,
25838
+ expectedAspectCount: results.length
25834
25839
  };
25835
25840
  }
25836
25841
  };
@@ -25856,7 +25861,7 @@ function resolvePath(obj, path46) {
25856
25861
  }
25857
25862
  return current;
25858
25863
  }
25859
- function toNumber2(value) {
25864
+ function toNumber(value) {
25860
25865
  if (typeof value === "number") {
25861
25866
  return value;
25862
25867
  }
@@ -25937,10 +25942,8 @@ var LatencyEvaluator = class {
25937
25942
  return {
25938
25943
  score: 0,
25939
25944
  verdict: "fail",
25940
- hits: [],
25941
- misses: ["No duration data available in trace"],
25945
+ assertions: [{ text: "No duration data available in trace", passed: false }],
25942
25946
  expectedAspectCount: 1,
25943
- reasoning: "Execution duration not reported by provider",
25944
25947
  evaluatorRawRequest: {
25945
25948
  type: "latency",
25946
25949
  threshold,
@@ -25953,10 +25956,10 @@ var LatencyEvaluator = class {
25953
25956
  return {
25954
25957
  score,
25955
25958
  verdict: passed ? "pass" : "fail",
25956
- hits: passed ? [`Duration ${durationMs}ms <= ${threshold}ms threshold`] : [],
25957
- misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
25959
+ assertions: [
25960
+ passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
25961
+ ],
25958
25962
  expectedAspectCount: 1,
25959
- reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
25960
25963
  evaluatorRawRequest: {
25961
25964
  type: "latency",
25962
25965
  threshold,
@@ -26030,23 +26033,25 @@ var SkillTriggerEvaluator = class {
26030
26033
  return {
26031
26034
  score: 1,
26032
26035
  verdict: "pass",
26033
- hits: [
26034
- shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`
26036
+ assertions: [
26037
+ {
26038
+ text: shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`,
26039
+ passed: true
26040
+ }
26035
26041
  ],
26036
- misses: [],
26037
- expectedAspectCount: 1,
26038
- reasoning: shouldTrigger ? "Skill triggered correctly" : "No false trigger"
26042
+ expectedAspectCount: 1
26039
26043
  };
26040
26044
  }
26041
26045
  return {
26042
26046
  score: 0,
26043
26047
  verdict: "fail",
26044
- hits: [],
26045
- misses: [
26046
- shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`
26048
+ assertions: [
26049
+ {
26050
+ text: shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`,
26051
+ passed: false
26052
+ }
26047
26053
  ],
26048
- expectedAspectCount: 1,
26049
- reasoning: shouldTrigger ? `Skill "${skillName}" was not triggered` : "False trigger: skill fired when it should not have"
26054
+ expectedAspectCount: 1
26050
26055
  };
26051
26056
  }
26052
26057
  };
@@ -26211,10 +26216,8 @@ var TokenUsageEvaluator = class {
26211
26216
  return {
26212
26217
  score: 0,
26213
26218
  verdict: "fail",
26214
- hits: [],
26215
- misses: ["No token usage data available in trace"],
26219
+ assertions: [{ text: "No token usage data available in trace", passed: false }],
26216
26220
  expectedAspectCount,
26217
- reasoning: "Token usage not reported by provider",
26218
26221
  evaluatorRawRequest: {
26219
26222
  type: "token-usage",
26220
26223
  max_total: maxTotal ?? null,
@@ -26228,37 +26231,34 @@ var TokenUsageEvaluator = class {
26228
26231
  const output = usage.output;
26229
26232
  const cached = usage.cached ?? 0;
26230
26233
  const total = input + output + cached;
26231
- const hits = [];
26232
- const misses = [];
26234
+ const assertions = [];
26233
26235
  if (typeof maxInput === "number") {
26234
26236
  if (input <= maxInput) {
26235
- hits.push(`Input tokens ${input} <= ${maxInput}`);
26237
+ assertions.push({ text: `Input tokens ${input} <= ${maxInput}`, passed: true });
26236
26238
  } else {
26237
- misses.push(`Input tokens ${input} > ${maxInput}`);
26239
+ assertions.push({ text: `Input tokens ${input} > ${maxInput}`, passed: false });
26238
26240
  }
26239
26241
  }
26240
26242
  if (typeof maxOutput === "number") {
26241
26243
  if (output <= maxOutput) {
26242
- hits.push(`Output tokens ${output} <= ${maxOutput}`);
26244
+ assertions.push({ text: `Output tokens ${output} <= ${maxOutput}`, passed: true });
26243
26245
  } else {
26244
- misses.push(`Output tokens ${output} > ${maxOutput}`);
26246
+ assertions.push({ text: `Output tokens ${output} > ${maxOutput}`, passed: false });
26245
26247
  }
26246
26248
  }
26247
26249
  if (typeof maxTotal === "number") {
26248
26250
  if (total <= maxTotal) {
26249
- hits.push(`Total tokens ${total} <= ${maxTotal}`);
26251
+ assertions.push({ text: `Total tokens ${total} <= ${maxTotal}`, passed: true });
26250
26252
  } else {
26251
- misses.push(`Total tokens ${total} > ${maxTotal}`);
26253
+ assertions.push({ text: `Total tokens ${total} > ${maxTotal}`, passed: false });
26252
26254
  }
26253
26255
  }
26254
- const passed = misses.length === 0;
26256
+ const passed = assertions.every((a) => a.passed);
26255
26257
  return {
26256
26258
  score: passed ? 1 : 0,
26257
26259
  verdict: passed ? "pass" : "fail",
26258
- hits,
26259
- misses,
26260
+ assertions,
26260
26261
  expectedAspectCount,
26261
- reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
26262
26262
  evaluatorRawRequest: {
26263
26263
  type: "token-usage",
26264
26264
  max_total: maxTotal ?? null,
@@ -26356,8 +26356,7 @@ var ToolTrajectoryEvaluator = class {
26356
26356
  return {
26357
26357
  score: 0,
26358
26358
  verdict: "fail",
26359
- hits: [],
26360
- misses: ["No trace available for evaluation"],
26359
+ assertions: [{ text: "No trace available for evaluation", passed: false }],
26361
26360
  expectedAspectCount: 1
26362
26361
  };
26363
26362
  }
@@ -26368,8 +26367,7 @@ var ToolTrajectoryEvaluator = class {
26368
26367
  return {
26369
26368
  score: 0,
26370
26369
  verdict: "fail",
26371
- hits: [],
26372
- misses: ["No trace available for evaluation"],
26370
+ assertions: [{ text: "No trace available for evaluation", passed: false }],
26373
26371
  expectedAspectCount: 1
26374
26372
  };
26375
26373
  }
@@ -26387,8 +26385,7 @@ var ToolTrajectoryEvaluator = class {
26387
26385
  return {
26388
26386
  score: 0,
26389
26387
  verdict: "fail",
26390
- hits: [],
26391
- misses: [`Unknown mode: ${this.config.mode}`],
26388
+ assertions: [{ text: `Unknown mode: ${this.config.mode}`, passed: false }],
26392
26389
  expectedAspectCount: 1
26393
26390
  };
26394
26391
  }
@@ -26437,28 +26434,32 @@ var ToolTrajectoryEvaluator = class {
26437
26434
  return {
26438
26435
  score: 1,
26439
26436
  verdict: "pass",
26440
- hits: ["No tool requirements specified"],
26441
- misses: [],
26437
+ assertions: [{ text: "No tool requirements specified", passed: true }],
26442
26438
  expectedAspectCount: 0
26443
26439
  };
26444
26440
  }
26445
- const hits = [];
26446
- const misses = [];
26441
+ const assertions = [];
26447
26442
  for (const toolName of toolNames) {
26448
26443
  const required = minimums[toolName];
26449
26444
  const actual = summary.toolCallsByName[toolName] ?? 0;
26450
26445
  if (actual >= required) {
26451
- hits.push(`${toolName}: called ${actual} times (required >=${required})`);
26446
+ assertions.push({
26447
+ text: `${toolName}: called ${actual} times (required >=${required})`,
26448
+ passed: true
26449
+ });
26452
26450
  } else {
26453
- misses.push(`${toolName}: called ${actual} times (required >=${required})`);
26451
+ assertions.push({
26452
+ text: `${toolName}: called ${actual} times (required >=${required})`,
26453
+ passed: false
26454
+ });
26454
26455
  }
26455
26456
  }
26456
- const score = hits.length / toolNames.length;
26457
+ const passedCount = assertions.filter((a) => a.passed).length;
26458
+ const score = passedCount / toolNames.length;
26457
26459
  return {
26458
26460
  score,
26459
26461
  verdict: scoreToVerdict(score),
26460
- hits,
26461
- misses,
26462
+ assertions,
26462
26463
  expectedAspectCount: toolNames.length
26463
26464
  };
26464
26465
  }
@@ -26468,13 +26469,11 @@ var ToolTrajectoryEvaluator = class {
26468
26469
  return {
26469
26470
  score: 1,
26470
26471
  verdict: "pass",
26471
- hits: ["No tool sequence specified"],
26472
- misses: [],
26472
+ assertions: [{ text: "No tool sequence specified", passed: true }],
26473
26473
  expectedAspectCount: 0
26474
26474
  };
26475
26475
  }
26476
- const hits = [];
26477
- const misses = [];
26476
+ const assertions = [];
26478
26477
  const warnings = [];
26479
26478
  let actualIndex = 0;
26480
26479
  let sequenceHits = 0;
@@ -26494,16 +26493,20 @@ var ToolTrajectoryEvaluator = class {
26494
26493
  const actualCall = toolCalls[actualIndex];
26495
26494
  if (actualCall.name === expectedTool) {
26496
26495
  if (argsMatch(expectedItem.args, actualCall.args, mode)) {
26497
- hits.push(`Found ${expectedTool} at position ${actualIndex}`);
26496
+ assertions.push({
26497
+ text: `Found ${expectedTool} at position ${actualIndex}`,
26498
+ passed: true
26499
+ });
26498
26500
  sequenceHits++;
26499
26501
  matchedCall = actualCall;
26500
26502
  actualIndex++;
26501
26503
  found = true;
26502
26504
  break;
26503
26505
  }
26504
- misses.push(
26505
- `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
26506
- );
26506
+ assertions.push({
26507
+ text: `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`,
26508
+ passed: false
26509
+ });
26507
26510
  actualIndex++;
26508
26511
  argsMismatch = true;
26509
26512
  break;
@@ -26511,7 +26514,10 @@ var ToolTrajectoryEvaluator = class {
26511
26514
  actualIndex++;
26512
26515
  }
26513
26516
  if (!found && !argsMismatch) {
26514
- misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
26517
+ assertions.push({
26518
+ text: `Expected ${expectedTool} at position ${i}, not found in remaining trace`,
26519
+ passed: false
26520
+ });
26515
26521
  }
26516
26522
  if (found && matchedCall) {
26517
26523
  const latencyResult = checkLatency(
@@ -26520,10 +26526,10 @@ var ToolTrajectoryEvaluator = class {
26520
26526
  matchedCall.durationMs
26521
26527
  );
26522
26528
  if (latencyResult.status === "pass") {
26523
- hits.push(latencyResult.message);
26529
+ assertions.push({ text: latencyResult.message, passed: true });
26524
26530
  latencyHits++;
26525
26531
  } else if (latencyResult.status === "fail") {
26526
- misses.push(latencyResult.message);
26532
+ assertions.push({ text: latencyResult.message, passed: false });
26527
26533
  } else if (latencyResult.message) {
26528
26534
  warnings.push(latencyResult.message);
26529
26535
  latencySkips++;
@@ -26539,8 +26545,7 @@ var ToolTrajectoryEvaluator = class {
26539
26545
  return {
26540
26546
  score,
26541
26547
  verdict: scoreToVerdict(score),
26542
- hits,
26543
- misses,
26548
+ assertions,
26544
26549
  expectedAspectCount: totalAssertions
26545
26550
  };
26546
26551
  }
@@ -26550,13 +26555,11 @@ var ToolTrajectoryEvaluator = class {
26550
26555
  return {
26551
26556
  score: 1,
26552
26557
  verdict: "pass",
26553
- hits: ["No tool sequence specified"],
26554
- misses: [],
26558
+ assertions: [{ text: "No tool sequence specified", passed: true }],
26555
26559
  expectedAspectCount: 0
26556
26560
  };
26557
26561
  }
26558
- const hits = [];
26559
- const misses = [];
26562
+ const assertions = [];
26560
26563
  const warnings = [];
26561
26564
  let sequenceHits = 0;
26562
26565
  let latencyHits = 0;
@@ -26565,7 +26568,10 @@ var ToolTrajectoryEvaluator = class {
26565
26568
  (item) => item.maxDurationMs !== void 0
26566
26569
  ).length;
26567
26570
  if (toolCalls.length !== expected.length) {
26568
- misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
26571
+ assertions.push({
26572
+ text: `Expected ${expected.length} tool calls, got ${toolCalls.length}`,
26573
+ passed: false
26574
+ });
26569
26575
  }
26570
26576
  const checkLength = Math.min(expected.length, toolCalls.length);
26571
26577
  for (let i = 0; i < checkLength; i++) {
@@ -26577,14 +26583,17 @@ var ToolTrajectoryEvaluator = class {
26577
26583
  let sequenceMatched = false;
26578
26584
  if (actualTool === expectedTool) {
26579
26585
  if (argsMatch(expectedItem.args, actualCall.args, mode)) {
26580
- hits.push(`Position ${i}: ${expectedTool}`);
26586
+ assertions.push({ text: `Position ${i}: ${expectedTool}`, passed: true });
26581
26587
  sequenceHits++;
26582
26588
  sequenceMatched = true;
26583
26589
  } else {
26584
- misses.push(`Position ${i}: ${expectedTool} args mismatch`);
26590
+ assertions.push({ text: `Position ${i}: ${expectedTool} args mismatch`, passed: false });
26585
26591
  }
26586
26592
  } else {
26587
- misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
26593
+ assertions.push({
26594
+ text: `Position ${i}: expected ${expectedTool}, got ${actualTool}`,
26595
+ passed: false
26596
+ });
26588
26597
  }
26589
26598
  if (sequenceMatched) {
26590
26599
  const latencyResult = checkLatency(
@@ -26593,10 +26602,10 @@ var ToolTrajectoryEvaluator = class {
26593
26602
  actualCall.durationMs
26594
26603
  );
26595
26604
  if (latencyResult.status === "pass") {
26596
- hits.push(latencyResult.message);
26605
+ assertions.push({ text: latencyResult.message, passed: true });
26597
26606
  latencyHits++;
26598
26607
  } else if (latencyResult.status === "fail") {
26599
- misses.push(latencyResult.message);
26608
+ assertions.push({ text: latencyResult.message, passed: false });
26600
26609
  } else if (latencyResult.message) {
26601
26610
  warnings.push(latencyResult.message);
26602
26611
  latencySkips++;
@@ -26604,7 +26613,10 @@ var ToolTrajectoryEvaluator = class {
26604
26613
  }
26605
26614
  }
26606
26615
  for (let i = checkLength; i < expected.length; i++) {
26607
- misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
26616
+ assertions.push({
26617
+ text: `Position ${i}: expected ${expected[i].tool}, got nothing`,
26618
+ passed: false
26619
+ });
26608
26620
  }
26609
26621
  for (const warning of warnings) {
26610
26622
  console.warn(`[tool-trajectory] ${warning}`);
@@ -26615,8 +26627,7 @@ var ToolTrajectoryEvaluator = class {
26615
26627
  return {
26616
26628
  score,
26617
26629
  verdict: scoreToVerdict(score),
26618
- hits,
26619
- misses,
26630
+ assertions,
26620
26631
  expectedAspectCount: totalAssertions
26621
26632
  };
26622
26633
  }
@@ -26631,13 +26642,11 @@ var ToolTrajectoryEvaluator = class {
26631
26642
  return {
26632
26643
  score: 1,
26633
26644
  verdict: "pass",
26634
- hits: ["No expected tools specified"],
26635
- misses: [],
26645
+ assertions: [{ text: "No expected tools specified", passed: true }],
26636
26646
  expectedAspectCount: 0
26637
26647
  };
26638
26648
  }
26639
- const hits = [];
26640
- const misses = [];
26649
+ const assertions = [];
26641
26650
  const consumed = /* @__PURE__ */ new Set();
26642
26651
  for (let i = 0; i < expected.length; i++) {
26643
26652
  const expectedItem = expected[i];
@@ -26648,22 +26657,25 @@ var ToolTrajectoryEvaluator = class {
26648
26657
  if (consumed.has(j)) continue;
26649
26658
  const actualCall = toolCalls[j];
26650
26659
  if (actualCall.name === expectedTool && argsMatch(expectedItem.args, actualCall.args, mode)) {
26651
- hits.push(`Found ${expectedTool} at position ${j}`);
26660
+ assertions.push({ text: `Found ${expectedTool} at position ${j}`, passed: true });
26652
26661
  consumed.add(j);
26653
26662
  found = true;
26654
26663
  break;
26655
26664
  }
26656
26665
  }
26657
26666
  if (!found) {
26658
- misses.push(`Expected ${expectedTool} not found in actual trajectory`);
26667
+ assertions.push({
26668
+ text: `Expected ${expectedTool} not found in actual trajectory`,
26669
+ passed: false
26670
+ });
26659
26671
  }
26660
26672
  }
26661
- const score = expected.length > 0 ? hits.length / expected.length : 1;
26673
+ const passedCount = assertions.filter((a) => a.passed).length;
26674
+ const score = expected.length > 0 ? passedCount / expected.length : 1;
26662
26675
  return {
26663
26676
  score,
26664
26677
  verdict: scoreToVerdict(score),
26665
- hits,
26666
- misses,
26678
+ assertions,
26667
26679
  expectedAspectCount: expected.length
26668
26680
  };
26669
26681
  }
@@ -26679,16 +26691,19 @@ var ToolTrajectoryEvaluator = class {
26679
26691
  return {
26680
26692
  score: 1,
26681
26693
  verdict: "pass",
26682
- hits: ["No tool calls and no expected tools"],
26683
- misses: [],
26694
+ assertions: [{ text: "No tool calls and no expected tools", passed: true }],
26684
26695
  expectedAspectCount: 0
26685
26696
  };
26686
26697
  }
26687
26698
  return {
26688
26699
  score: 0,
26689
26700
  verdict: "fail",
26690
- hits: [],
26691
- misses: [`${toolCalls.length} unexpected tool call(s) with empty allowed list`],
26701
+ assertions: [
26702
+ {
26703
+ text: `${toolCalls.length} unexpected tool call(s) with empty allowed list`,
26704
+ passed: false
26705
+ }
26706
+ ],
26692
26707
  expectedAspectCount: toolCalls.length
26693
26708
  };
26694
26709
  }
@@ -26696,13 +26711,11 @@ var ToolTrajectoryEvaluator = class {
26696
26711
  return {
26697
26712
  score: 1,
26698
26713
  verdict: "pass",
26699
- hits: ["No actual tool calls (trivially a subset)"],
26700
- misses: [],
26714
+ assertions: [{ text: "No actual tool calls (trivially a subset)", passed: true }],
26701
26715
  expectedAspectCount: 0
26702
26716
  };
26703
26717
  }
26704
- const hits = [];
26705
- const misses = [];
26718
+ const assertions = [];
26706
26719
  for (let i = 0; i < toolCalls.length; i++) {
26707
26720
  const actualCall = toolCalls[i];
26708
26721
  let allowed = false;
@@ -26714,17 +26727,23 @@ var ToolTrajectoryEvaluator = class {
26714
26727
  }
26715
26728
  }
26716
26729
  if (allowed) {
26717
- hits.push(`Position ${i}: ${actualCall.name} is in allowed set`);
26730
+ assertions.push({
26731
+ text: `Position ${i}: ${actualCall.name} is in allowed set`,
26732
+ passed: true
26733
+ });
26718
26734
  } else {
26719
- misses.push(`Position ${i}: ${actualCall.name} is not in allowed set`);
26735
+ assertions.push({
26736
+ text: `Position ${i}: ${actualCall.name} is not in allowed set`,
26737
+ passed: false
26738
+ });
26720
26739
  }
26721
26740
  }
26722
- const score = toolCalls.length > 0 ? hits.length / toolCalls.length : 1;
26741
+ const passedCount = assertions.filter((a) => a.passed).length;
26742
+ const score = toolCalls.length > 0 ? passedCount / toolCalls.length : 1;
26723
26743
  return {
26724
26744
  score,
26725
26745
  verdict: scoreToVerdict(score),
26726
- hits,
26727
- misses,
26746
+ assertions,
26728
26747
  expectedAspectCount: toolCalls.length
26729
26748
  };
26730
26749
  }
@@ -26733,8 +26752,12 @@ function runContainsAssertion(output, value) {
26733
26752
  const passed = output.includes(value);
26734
26753
  return {
26735
26754
  score: passed ? 1 : 0,
26736
- hits: passed ? [`Output contains "${value}"`] : [],
26737
- misses: passed ? [] : [`Output does not contain "${value}"`]
26755
+ assertions: [
26756
+ {
26757
+ text: passed ? `Output contains "${value}"` : `Output does not contain "${value}"`,
26758
+ passed
26759
+ }
26760
+ ]
26738
26761
  };
26739
26762
  }
26740
26763
  function runContainsAnyAssertion(output, values) {
@@ -26742,8 +26765,12 @@ function runContainsAnyAssertion(output, values) {
26742
26765
  const passed = matched.length > 0;
26743
26766
  return {
26744
26767
  score: passed ? 1 : 0,
26745
- hits: passed ? [`Output contains "${matched[0]}"`] : [],
26746
- misses: passed ? [] : [`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`]
26768
+ assertions: [
26769
+ {
26770
+ text: passed ? `Output contains "${matched[0]}"` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`,
26771
+ passed
26772
+ }
26773
+ ]
26747
26774
  };
26748
26775
  }
26749
26776
  function runContainsAllAssertion(output, values) {
@@ -26751,16 +26778,24 @@ function runContainsAllAssertion(output, values) {
26751
26778
  const passed = missing.length === 0;
26752
26779
  return {
26753
26780
  score: passed ? 1 : 0,
26754
- hits: passed ? [`Output contains all ${values.length} expected strings`] : [],
26755
- misses: passed ? [] : [`Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`]
26781
+ assertions: [
26782
+ {
26783
+ text: passed ? `Output contains all ${values.length} expected strings` : `Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`,
26784
+ passed
26785
+ }
26786
+ ]
26756
26787
  };
26757
26788
  }
26758
26789
  function runIcontainsAssertion(output, value) {
26759
26790
  const passed = output.toLowerCase().includes(value.toLowerCase());
26760
26791
  return {
26761
26792
  score: passed ? 1 : 0,
26762
- hits: passed ? [`Output contains "${value}" (case-insensitive)`] : [],
26763
- misses: passed ? [] : [`Output does not contain "${value}" (case-insensitive)`]
26793
+ assertions: [
26794
+ {
26795
+ text: passed ? `Output contains "${value}" (case-insensitive)` : `Output does not contain "${value}" (case-insensitive)`,
26796
+ passed
26797
+ }
26798
+ ]
26764
26799
  };
26765
26800
  }
26766
26801
  function runIcontainsAnyAssertion(output, values) {
@@ -26769,9 +26804,11 @@ function runIcontainsAnyAssertion(output, values) {
26769
26804
  const passed = matched.length > 0;
26770
26805
  return {
26771
26806
  score: passed ? 1 : 0,
26772
- hits: passed ? [`Output contains "${matched[0]}" (case-insensitive)`] : [],
26773
- misses: passed ? [] : [
26774
- `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`
26807
+ assertions: [
26808
+ {
26809
+ text: passed ? `Output contains "${matched[0]}" (case-insensitive)` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`,
26810
+ passed
26811
+ }
26775
26812
  ]
26776
26813
  };
26777
26814
  }
@@ -26781,24 +26818,36 @@ function runIcontainsAllAssertion(output, values) {
26781
26818
  const passed = missing.length === 0;
26782
26819
  return {
26783
26820
  score: passed ? 1 : 0,
26784
- hits: passed ? [`Output contains all ${values.length} expected strings (case-insensitive)`] : [],
26785
- misses: passed ? [] : [`Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`]
26821
+ assertions: [
26822
+ {
26823
+ text: passed ? `Output contains all ${values.length} expected strings (case-insensitive)` : `Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`,
26824
+ passed
26825
+ }
26826
+ ]
26786
26827
  };
26787
26828
  }
26788
26829
  function runStartsWithAssertion(output, value) {
26789
26830
  const passed = output.trim().startsWith(value.trim());
26790
26831
  return {
26791
26832
  score: passed ? 1 : 0,
26792
- hits: passed ? [`Output starts with "${value}"`] : [],
26793
- misses: passed ? [] : [`Output does not start with "${value}"`]
26833
+ assertions: [
26834
+ {
26835
+ text: passed ? `Output starts with "${value}"` : `Output does not start with "${value}"`,
26836
+ passed
26837
+ }
26838
+ ]
26794
26839
  };
26795
26840
  }
26796
26841
  function runEndsWithAssertion(output, value) {
26797
26842
  const passed = output.trim().endsWith(value.trim());
26798
26843
  return {
26799
26844
  score: passed ? 1 : 0,
26800
- hits: passed ? [`Output ends with "${value}"`] : [],
26801
- misses: passed ? [] : [`Output does not end with "${value}"`]
26845
+ assertions: [
26846
+ {
26847
+ text: passed ? `Output ends with "${value}"` : `Output does not end with "${value}"`,
26848
+ passed
26849
+ }
26850
+ ]
26802
26851
  };
26803
26852
  }
26804
26853
  function runRegexAssertion(output, pattern, flags) {
@@ -26807,8 +26856,12 @@ function runRegexAssertion(output, pattern, flags) {
26807
26856
  const flagsLabel = flags ? ` (flags: ${flags})` : "";
26808
26857
  return {
26809
26858
  score: passed ? 1 : 0,
26810
- hits: passed ? [`Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}`] : [],
26811
- misses: passed ? [] : [`Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`]
26859
+ assertions: [
26860
+ {
26861
+ text: passed ? `Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}` : `Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`,
26862
+ passed
26863
+ }
26864
+ ]
26812
26865
  };
26813
26866
  }
26814
26867
  function runIsJsonAssertion(output) {
@@ -26820,16 +26873,24 @@ function runIsJsonAssertion(output) {
26820
26873
  }
26821
26874
  return {
26822
26875
  score: passed ? 1 : 0,
26823
- hits: passed ? ["Output is valid JSON"] : [],
26824
- misses: passed ? [] : ["Output is not valid JSON"]
26876
+ assertions: [
26877
+ {
26878
+ text: passed ? "Output is valid JSON" : "Output is not valid JSON",
26879
+ passed
26880
+ }
26881
+ ]
26825
26882
  };
26826
26883
  }
26827
26884
  function runEqualsAssertion(output, value) {
26828
26885
  const passed = output.trim() === value.trim();
26829
26886
  return {
26830
26887
  score: passed ? 1 : 0,
26831
- hits: passed ? [`Output equals "${value}"`] : [],
26832
- misses: passed ? [] : [`Output does not equal "${value}"`]
26888
+ assertions: [
26889
+ {
26890
+ text: passed ? `Output equals "${value}"` : `Output does not equal "${value}"`,
26891
+ passed
26892
+ }
26893
+ ]
26833
26894
  };
26834
26895
  }
26835
26896
  var Node = class {
@@ -27028,10 +27089,8 @@ var InlineAssertEvaluator = class {
27028
27089
  return {
27029
27090
  score,
27030
27091
  verdict: scoreToVerdict(score),
27031
- hits: score >= 0.8 ? [result.name] : [],
27032
- misses: score < 0.5 ? [result.name] : [],
27092
+ assertions: [{ text: result.name, passed: score >= 0.5 }],
27033
27093
  expectedAspectCount: 1,
27034
- reasoning: void 0,
27035
27094
  details: result.metadata ? result.metadata : void 0
27036
27095
  };
27037
27096
  }
@@ -27219,9 +27278,7 @@ var containsFactory = (config) => {
27219
27278
  return {
27220
27279
  score: result.score,
27221
27280
  verdict: result.score === 1 ? "pass" : "fail",
27222
- hits: result.hits,
27223
- misses: result.misses,
27224
- reasoning: result.score === 1 ? `Output contains "${c.value}"` : `Output does not contain "${c.value}"`,
27281
+ assertions: result.assertions,
27225
27282
  expectedAspectCount: 1
27226
27283
  };
27227
27284
  });
@@ -27233,9 +27290,7 @@ var regexFactory = (config) => {
27233
27290
  return {
27234
27291
  score: result.score,
27235
27292
  verdict: result.score === 1 ? "pass" : "fail",
27236
- hits: result.hits,
27237
- misses: result.misses,
27238
- reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
27293
+ assertions: result.assertions,
27239
27294
  expectedAspectCount: 1
27240
27295
  };
27241
27296
  });
@@ -27246,9 +27301,7 @@ var isJsonFactory = () => {
27246
27301
  return {
27247
27302
  score: result.score,
27248
27303
  verdict: result.score === 1 ? "pass" : "fail",
27249
- hits: result.hits,
27250
- misses: result.misses,
27251
- reasoning: result.score === 1 ? "Output is valid JSON" : "Output is not valid JSON",
27304
+ assertions: result.assertions,
27252
27305
  expectedAspectCount: 1
27253
27306
  };
27254
27307
  });
@@ -27260,9 +27313,7 @@ var equalsFactory = (config) => {
27260
27313
  return {
27261
27314
  score: result.score,
27262
27315
  verdict: result.score === 1 ? "pass" : "fail",
27263
- hits: result.hits,
27264
- misses: result.misses,
27265
- reasoning: result.score === 1 ? `Output equals "${c.value}"` : `Output does not equal "${c.value}"`,
27316
+ assertions: result.assertions,
27266
27317
  expectedAspectCount: 1
27267
27318
  };
27268
27319
  });
@@ -27274,9 +27325,7 @@ var containsAnyFactory = (config) => {
27274
27325
  return {
27275
27326
  score: result.score,
27276
27327
  verdict: result.score === 1 ? "pass" : "fail",
27277
- hits: result.hits,
27278
- misses: result.misses,
27279
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27328
+ assertions: result.assertions,
27280
27329
  expectedAspectCount: 1
27281
27330
  };
27282
27331
  });
@@ -27288,9 +27337,7 @@ var containsAllFactory = (config) => {
27288
27337
  return {
27289
27338
  score: result.score,
27290
27339
  verdict: result.score === 1 ? "pass" : "fail",
27291
- hits: result.hits,
27292
- misses: result.misses,
27293
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27340
+ assertions: result.assertions,
27294
27341
  expectedAspectCount: 1
27295
27342
  };
27296
27343
  });
@@ -27302,9 +27349,7 @@ var icontainsFactory = (config) => {
27302
27349
  return {
27303
27350
  score: result.score,
27304
27351
  verdict: result.score === 1 ? "pass" : "fail",
27305
- hits: result.hits,
27306
- misses: result.misses,
27307
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27352
+ assertions: result.assertions,
27308
27353
  expectedAspectCount: 1
27309
27354
  };
27310
27355
  });
@@ -27316,9 +27361,7 @@ var icontainsAnyFactory = (config) => {
27316
27361
  return {
27317
27362
  score: result.score,
27318
27363
  verdict: result.score === 1 ? "pass" : "fail",
27319
- hits: result.hits,
27320
- misses: result.misses,
27321
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27364
+ assertions: result.assertions,
27322
27365
  expectedAspectCount: 1
27323
27366
  };
27324
27367
  });
@@ -27330,9 +27373,7 @@ var icontainsAllFactory = (config) => {
27330
27373
  return {
27331
27374
  score: result.score,
27332
27375
  verdict: result.score === 1 ? "pass" : "fail",
27333
- hits: result.hits,
27334
- misses: result.misses,
27335
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27376
+ assertions: result.assertions,
27336
27377
  expectedAspectCount: 1
27337
27378
  };
27338
27379
  });
@@ -27344,9 +27385,7 @@ var startsWithFactory = (config) => {
27344
27385
  return {
27345
27386
  score: result.score,
27346
27387
  verdict: result.score === 1 ? "pass" : "fail",
27347
- hits: result.hits,
27348
- misses: result.misses,
27349
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27388
+ assertions: result.assertions,
27350
27389
  expectedAspectCount: 1
27351
27390
  };
27352
27391
  });
@@ -27358,9 +27397,7 @@ var endsWithFactory = (config) => {
27358
27397
  return {
27359
27398
  score: result.score,
27360
27399
  verdict: result.score === 1 ? "pass" : "fail",
27361
- hits: result.hits,
27362
- misses: result.misses,
27363
- reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
27400
+ assertions: result.assertions,
27364
27401
  expectedAspectCount: 1
27365
27402
  };
27366
27403
  });
@@ -28389,7 +28426,7 @@ async function runEvaluation(options) {
28389
28426
  if (!cliModel) {
28390
28427
  throw new Error('--grader-target "agentv" requires --model (e.g., "openai:gpt-5-mini")');
28391
28428
  }
28392
- const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-HDSAUUEF-LUBMM7TH.js");
28429
+ const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-NFFLXG5M-TJAWCWCX.js");
28393
28430
  return new AgentvProvider2("agentv", { model: cliModel, temperature: 0 });
28394
28431
  }
28395
28432
  const overrideTarget = resolveTargetByName(cliGraderTarget);
@@ -28724,8 +28761,7 @@ async function runEvaluation(options) {
28724
28761
  testId: evalCase.id,
28725
28762
  dataset: evalCase.dataset,
28726
28763
  score: 0,
28727
- hits: [],
28728
- misses: [],
28764
+ assertions: [],
28729
28765
  answer: "",
28730
28766
  target: target.name,
28731
28767
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
@@ -28744,7 +28780,9 @@ async function runEvaluation(options) {
28744
28780
  testId: evalCase.id,
28745
28781
  status: "failed",
28746
28782
  completedAt: Date.now(),
28747
- error: budgetResult.error
28783
+ error: budgetResult.error,
28784
+ score: budgetResult.score,
28785
+ executionStatus: budgetResult.executionStatus
28748
28786
  });
28749
28787
  }
28750
28788
  if (onResult) {
@@ -28759,8 +28797,7 @@ async function runEvaluation(options) {
28759
28797
  testId: evalCase.id,
28760
28798
  dataset: evalCase.dataset,
28761
28799
  score: 0,
28762
- hits: [],
28763
- misses: [],
28800
+ assertions: [],
28764
28801
  answer: "",
28765
28802
  target: target.name,
28766
28803
  error: errorMsg,
@@ -28775,7 +28812,9 @@ async function runEvaluation(options) {
28775
28812
  testId: evalCase.id,
28776
28813
  status: "failed",
28777
28814
  completedAt: Date.now(),
28778
- error: haltResult.error
28815
+ error: haltResult.error,
28816
+ score: haltResult.score,
28817
+ executionStatus: haltResult.executionStatus
28779
28818
  });
28780
28819
  }
28781
28820
  if (onResult) {
@@ -28855,7 +28894,9 @@ async function runEvaluation(options) {
28855
28894
  startedAt: 0,
28856
28895
  // Not used for completed status
28857
28896
  completedAt: Date.now(),
28858
- error: result.error
28897
+ error: result.error,
28898
+ score: result.score,
28899
+ executionStatus: result.executionStatus
28859
28900
  });
28860
28901
  }
28861
28902
  if (onResult) {
@@ -29026,7 +29067,9 @@ async function runBatchEvaluation(options) {
29026
29067
  const merged = computed ? mergeExecutionMetrics(computed, {
29027
29068
  tokenUsage: providerResponse.tokenUsage,
29028
29069
  costUsd: providerResponse.costUsd,
29029
- durationMs: providerResponse.durationMs
29070
+ durationMs: providerResponse.durationMs,
29071
+ startTime: providerResponse.startTime,
29072
+ endTime: providerResponse.endTime
29030
29073
  }) : void 0;
29031
29074
  const trace2 = merged?.trace;
29032
29075
  const costUsd = merged?.costUsd;
@@ -29091,7 +29134,9 @@ async function runBatchEvaluation(options) {
29091
29134
  testId: evalCase.id,
29092
29135
  status: "failed",
29093
29136
  completedAt: Date.now(),
29094
- error: error instanceof Error ? error.message : String(error)
29137
+ error: error instanceof Error ? error.message : String(error),
29138
+ score: errorResult.score,
29139
+ executionStatus: errorResult.executionStatus
29095
29140
  });
29096
29141
  }
29097
29142
  continue;
@@ -29107,7 +29152,9 @@ async function runBatchEvaluation(options) {
29107
29152
  status: result.error ? "failed" : "completed",
29108
29153
  startedAt: 0,
29109
29154
  completedAt: Date.now(),
29110
- error: result.error
29155
+ error: result.error,
29156
+ score: result.score,
29157
+ executionStatus: result.executionStatus
29111
29158
  });
29112
29159
  }
29113
29160
  }
@@ -29417,7 +29464,9 @@ async function runEvalCase(options) {
29417
29464
  const merged = computed ? mergeExecutionMetrics(computed, {
29418
29465
  tokenUsage: providerResponse.tokenUsage,
29419
29466
  costUsd: providerResponse.costUsd,
29420
- durationMs: providerResponse.durationMs
29467
+ durationMs: providerResponse.durationMs,
29468
+ startTime: providerResponse.startTime,
29469
+ endTime: providerResponse.endTime
29421
29470
  }) : void 0;
29422
29471
  const trace2 = merged?.trace;
29423
29472
  const costUsd = merged?.costUsd;
@@ -29715,11 +29764,9 @@ async function evaluateCandidate(options) {
29715
29764
  dataset: evalCase.dataset,
29716
29765
  conversationId: evalCase.conversation_id,
29717
29766
  score: score.score,
29718
- hits: score.hits,
29719
- misses: score.misses,
29767
+ assertions: score.assertions,
29720
29768
  answer: candidate,
29721
29769
  target: target.name,
29722
- reasoning: score.reasoning,
29723
29770
  tokenUsage,
29724
29771
  costUsd,
29725
29772
  durationMs,
@@ -29893,9 +29940,7 @@ async function runEvaluatorList(options) {
29893
29940
  score: score2.score,
29894
29941
  weight,
29895
29942
  verdict: score2.verdict,
29896
- hits: score2.hits,
29897
- misses: score2.misses,
29898
- reasoning: score2.reasoning,
29943
+ assertions: score2.assertions,
29899
29944
  evaluatorProviderRequest: score2.evaluatorRawRequest,
29900
29945
  details: score2.details,
29901
29946
  scores: mapChildResults(score2.scores),
@@ -29910,10 +29955,10 @@ async function runEvaluatorList(options) {
29910
29955
  const fallbackScore = {
29911
29956
  score: 0,
29912
29957
  verdict: "fail",
29913
- hits: [],
29914
- misses: [`Evaluator '${evaluatorConfig.name}' failed: ${message}`],
29915
- expectedAspectCount: 1,
29916
- reasoning: message
29958
+ assertions: [
29959
+ { text: `Evaluator '${evaluatorConfig.name}' failed: ${message}`, passed: false }
29960
+ ],
29961
+ expectedAspectCount: 1
29917
29962
  };
29918
29963
  const weight = evaluatorConfig.weight ?? 1;
29919
29964
  scored.push({
@@ -29929,9 +29974,12 @@ async function runEvaluatorList(options) {
29929
29974
  score: 0,
29930
29975
  weight,
29931
29976
  verdict: "fail",
29932
- hits: [],
29933
- misses: [`Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`],
29934
- reasoning: message,
29977
+ assertions: [
29978
+ {
29979
+ text: `Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
29980
+ passed: false
29981
+ }
29982
+ ],
29935
29983
  durationMs: endedAt.getTime() - startedAt.getTime(),
29936
29984
  startedAt: startedAt.toISOString(),
29937
29985
  endedAt: endedAt.toISOString()
@@ -29947,9 +29995,7 @@ async function runEvaluatorList(options) {
29947
29995
  ...scores[lastScoresIdx],
29948
29996
  score: negated.score,
29949
29997
  verdict: negated.verdict,
29950
- hits: [...negated.hits],
29951
- misses: [...negated.misses],
29952
- reasoning: negated.reasoning
29998
+ assertions: [...negated.assertions]
29953
29999
  };
29954
30000
  }
29955
30001
  }
@@ -29964,21 +30010,13 @@ async function runEvaluatorList(options) {
29964
30010
  const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
29965
30011
  scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
29966
30012
  ) : 0;
29967
- const hits = scored.flatMap((entry) => entry.score.hits);
29968
- const misses = scored.flatMap((entry) => entry.score.misses);
29969
- const expectedAspectCount = scored.reduce(
29970
- (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
29971
- 0
29972
- );
29973
- const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
29974
- const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
30013
+ const assertions = scored.flatMap((entry) => entry.score.assertions);
30014
+ const expectedAspectCount = assertions.length || 1;
29975
30015
  const score = {
29976
30016
  score: aggregateScore,
29977
30017
  verdict: scoreToVerdict(aggregateScore),
29978
- hits,
29979
- misses,
29980
- expectedAspectCount,
29981
- reasoning
30018
+ assertions,
30019
+ expectedAspectCount
29982
30020
  };
29983
30021
  return { score, scores };
29984
30022
  }
@@ -30082,8 +30120,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
30082
30120
  dataset: evalCase.dataset,
30083
30121
  conversationId: evalCase.conversation_id,
30084
30122
  score: 0,
30085
- hits: [],
30086
- misses: [`Error: ${message}`],
30123
+ assertions: [{ text: `Error: ${message}`, passed: false }],
30087
30124
  answer: `Error occurred: ${message}`,
30088
30125
  target: targetName,
30089
30126
  requests,
@@ -30193,9 +30230,7 @@ function mapChildResults(children) {
30193
30230
  score: child.score,
30194
30231
  weight: child.weight,
30195
30232
  verdict: child.verdict,
30196
- hits: child.hits,
30197
- misses: child.misses,
30198
- reasoning: child.reasoning,
30233
+ assertions: child.assertions,
30199
30234
  evaluatorProviderRequest: child.evaluatorRawRequest,
30200
30235
  scores: mapChildResults(child.scores),
30201
30236
  details: child.details,
@@ -31063,7 +31098,6 @@ export {
31063
31098
  isJsonValue,
31064
31099
  isTestMessage,
31065
31100
  isEvaluatorKind,
31066
- getHitCount,
31067
31101
  fileExists,
31068
31102
  normalizeLineEndings,
31069
31103
  readTextFile,
@@ -31203,4 +31237,4 @@ export {
31203
31237
  OtelStreamingObserver,
31204
31238
  createAgentKernel
31205
31239
  };
31206
- //# sourceMappingURL=chunk-5M3K2DMV.js.map
31240
+ //# sourceMappingURL=chunk-D6G4N2H2.js.map