@agentv/core 3.10.3 → 3.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -19,7 +19,7 @@ import {
19
19
  readTextFile,
20
20
  resolveFileReference,
21
21
  resolveTargetDefinition
22
- } from "./chunk-VCFYWLFV.js";
22
+ } from "./chunk-AVTN5AB7.js";
23
23
  import {
24
24
  AgentvProvider
25
25
  } from "./chunk-W5YDZWT4.js";
@@ -154,6 +154,64 @@ import path7 from "node:path";
154
154
  import micromatch2 from "micromatch";
155
155
  import { parse as parse2 } from "yaml";
156
156
 
157
+ // src/evaluation/input-message-utils.ts
158
+ function flattenInputMessages(messages) {
159
+ return messages.flatMap((message) => extractContentSegments(message.content));
160
+ }
161
+ function collectResolvedInputFilePaths(messages) {
162
+ const filePaths = [];
163
+ for (const message of messages) {
164
+ if (!Array.isArray(message.content)) {
165
+ continue;
166
+ }
167
+ for (const segment of message.content) {
168
+ if (isJsonObject(segment) && segment.type === "file" && typeof segment.resolvedPath === "string") {
169
+ filePaths.push(segment.resolvedPath);
170
+ }
171
+ }
172
+ }
173
+ return filePaths;
174
+ }
175
+ function extractContentSegments(content) {
176
+ if (typeof content === "string") {
177
+ return content.trim().length > 0 ? [{ type: "text", value: content }] : [];
178
+ }
179
+ if (isJsonObject(content)) {
180
+ const rendered = JSON.stringify(content, null, 2);
181
+ return rendered.trim().length > 0 ? [{ type: "text", value: rendered }] : [];
182
+ }
183
+ if (!Array.isArray(content)) {
184
+ return [];
185
+ }
186
+ const segments = [];
187
+ for (const segment of content) {
188
+ if (!isJsonObject(segment)) {
189
+ continue;
190
+ }
191
+ segments.push(cloneJsonObject(segment));
192
+ }
193
+ return segments;
194
+ }
195
+ function cloneJsonObject(source) {
196
+ const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
197
+ return Object.fromEntries(entries);
198
+ }
199
+ function cloneJsonValue(value) {
200
+ if (value === null) {
201
+ return null;
202
+ }
203
+ if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
204
+ return value;
205
+ }
206
+ if (Array.isArray(value)) {
207
+ return value.map((item) => cloneJsonValue(item));
208
+ }
209
+ if (typeof value === "object") {
210
+ return cloneJsonObject(value);
211
+ }
212
+ return value;
213
+ }
214
+
157
215
  // src/evaluation/loaders/agent-skills-parser.ts
158
216
  import { readFile } from "node:fs/promises";
159
217
  import path from "node:path";
@@ -222,7 +280,6 @@ function parseAgentSkillsEvals(parsed, source = "evals.json", baseDir) {
222
280
  id: String(id),
223
281
  question: prompt,
224
282
  input: [{ role: "user", content: prompt }],
225
- input_segments: [{ type: "text", value: prompt }],
226
283
  expected_output: evalCase.expected_output ? [{ role: "assistant", content: evalCase.expected_output }] : [],
227
284
  reference_answer: evalCase.expected_output,
228
285
  file_paths: filePaths,
@@ -357,7 +414,7 @@ async function loadConfig(evalFilePath, repoRoot) {
357
414
  }
358
415
  try {
359
416
  const rawConfig = await readFile2(configPath, "utf8");
360
- const parsed = parse(rawConfig);
417
+ const parsed = interpolateEnv(parse(rawConfig), process.env);
361
418
  if (!isJsonObject(parsed)) {
362
419
  logWarning(`Invalid .agentv/config.yaml format at ${configPath}`);
363
420
  continue;
@@ -575,6 +632,27 @@ function parseExecutionDefaults(raw, configPath) {
575
632
  } else if (otelFile !== void 0) {
576
633
  logWarning(`Invalid execution.otel_file in ${configPath}, expected non-empty string`);
577
634
  }
635
+ if (typeof obj.export_otel === "boolean") {
636
+ result.export_otel = obj.export_otel;
637
+ } else if (obj.export_otel !== void 0) {
638
+ logWarning(`Invalid execution.export_otel in ${configPath}, expected boolean`);
639
+ }
640
+ const otelBackend = obj.otel_backend;
641
+ if (typeof otelBackend === "string" && otelBackend.trim().length > 0) {
642
+ result.otel_backend = otelBackend.trim();
643
+ } else if (otelBackend !== void 0) {
644
+ logWarning(`Invalid execution.otel_backend in ${configPath}, expected non-empty string`);
645
+ }
646
+ if (typeof obj.otel_capture_content === "boolean") {
647
+ result.otel_capture_content = obj.otel_capture_content;
648
+ } else if (obj.otel_capture_content !== void 0) {
649
+ logWarning(`Invalid execution.otel_capture_content in ${configPath}, expected boolean`);
650
+ }
651
+ if (typeof obj.otel_group_turns === "boolean") {
652
+ result.otel_group_turns = obj.otel_group_turns;
653
+ } else if (obj.otel_group_turns !== void 0) {
654
+ logWarning(`Invalid execution.otel_group_turns in ${configPath}, expected boolean`);
655
+ }
578
656
  if (typeof obj.pool_workspaces === "boolean") {
579
657
  result.pool_workspaces = obj.pool_workspaces;
580
658
  } else if (obj.pool_workspaces !== void 0) {
@@ -2045,27 +2123,28 @@ var ANSI_YELLOW4 = "\x1B[33m";
2045
2123
  var ANSI_RESET5 = "\x1B[0m";
2046
2124
  async function processMessages(options) {
2047
2125
  const { messages, searchRoots, repoRootPath, textParts, messageType, verbose } = options;
2048
- const segments = [];
2126
+ const processedMessages = [];
2049
2127
  for (const message of messages) {
2050
2128
  const content = message.content;
2051
2129
  if (typeof content === "string") {
2052
- segments.push({ type: "text", value: content });
2053
2130
  if (textParts) {
2054
2131
  textParts.push(content);
2055
2132
  }
2133
+ processedMessages.push({ ...message, content });
2056
2134
  continue;
2057
2135
  }
2058
2136
  if (isJsonObject(content)) {
2059
2137
  const rendered = JSON.stringify(content, null, 2);
2060
- segments.push({ type: "text", value: rendered });
2061
2138
  if (textParts) {
2062
2139
  textParts.push(rendered);
2063
2140
  }
2141
+ processedMessages.push({ ...message, content: cloneJsonObject(content) });
2064
2142
  continue;
2065
2143
  }
2066
2144
  if (!Array.isArray(content)) {
2067
2145
  continue;
2068
2146
  }
2147
+ const processedContent = [];
2069
2148
  for (const rawSegment of content) {
2070
2149
  if (!isJsonObject(rawSegment)) {
2071
2150
  continue;
@@ -2088,8 +2167,8 @@ async function processMessages(options) {
2088
2167
  }
2089
2168
  try {
2090
2169
  const fileContent = (await readFile4(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
2091
- segments.push({
2092
- type: "file",
2170
+ processedContent.push({
2171
+ ...cloneJsonObject(rawSegment),
2093
2172
  path: displayPath,
2094
2173
  text: fileContent,
2095
2174
  resolvedPath: path5.resolve(resolvedPath)
@@ -2106,37 +2185,19 @@ async function processMessages(options) {
2106
2185
  continue;
2107
2186
  }
2108
2187
  const clonedSegment = cloneJsonObject(rawSegment);
2109
- segments.push(clonedSegment);
2188
+ processedContent.push(clonedSegment);
2110
2189
  const inlineValue = clonedSegment.value;
2111
2190
  if (typeof inlineValue === "string" && textParts) {
2112
2191
  textParts.push(inlineValue);
2113
2192
  }
2114
2193
  }
2194
+ processedMessages.push({ ...message, content: processedContent });
2115
2195
  }
2116
- return segments;
2196
+ return processedMessages;
2117
2197
  }
2118
2198
  function asString3(value) {
2119
2199
  return typeof value === "string" ? value : void 0;
2120
2200
  }
2121
- function cloneJsonObject(source) {
2122
- const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
2123
- return Object.fromEntries(entries);
2124
- }
2125
- function cloneJsonValue(value) {
2126
- if (value === null) {
2127
- return null;
2128
- }
2129
- if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
2130
- return value;
2131
- }
2132
- if (Array.isArray(value)) {
2133
- return value.map((item) => cloneJsonValue(item));
2134
- }
2135
- if (typeof value === "object") {
2136
- return cloneJsonObject(value);
2137
- }
2138
- return value;
2139
- }
2140
2201
  function logWarning3(message, details) {
2141
2202
  if (details && details.length > 0) {
2142
2203
  const detailBlock = details.join("\n");
@@ -2385,10 +2446,10 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2385
2446
  );
2386
2447
  }
2387
2448
  }
2388
- const inputMessages = resolveInputMessages(evalcase);
2449
+ const rawInputMessages = resolveInputMessages(evalcase);
2389
2450
  const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
2390
2451
  const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
2391
- if (!id || !hasEvaluationSpec || !inputMessages || inputMessages.length === 0) {
2452
+ if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
2392
2453
  logError2(
2393
2454
  `Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
2394
2455
  );
@@ -2396,8 +2457,8 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2396
2457
  }
2397
2458
  const hasExpectedMessages = expectedMessages.length > 0;
2398
2459
  const inputTextParts = [];
2399
- const inputSegments = await processMessages({
2400
- messages: inputMessages,
2460
+ const inputMessages = await processMessages({
2461
+ messages: rawInputMessages,
2401
2462
  searchRoots,
2402
2463
  repoRootPath,
2403
2464
  textParts: inputTextParts,
@@ -2443,19 +2504,13 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2443
2504
  }
2444
2505
  }
2445
2506
  warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
2446
- const userFilePaths = [];
2447
- for (const segment of inputSegments) {
2448
- if (segment.type === "file" && typeof segment.resolvedPath === "string") {
2449
- userFilePaths.push(segment.resolvedPath);
2450
- }
2451
- }
2507
+ const userFilePaths = collectResolvedInputFilePaths(inputMessages);
2452
2508
  const testCase = {
2453
2509
  id,
2454
2510
  eval_set: evalSetName,
2455
2511
  conversation_id: conversationId,
2456
2512
  question,
2457
2513
  input: inputMessages,
2458
- input_segments: inputSegments,
2459
2514
  expected_output: outputSegments,
2460
2515
  reference_answer: referenceAnswer,
2461
2516
  file_paths: userFilePaths,
@@ -2521,50 +2576,9 @@ function parseMetadata(suite) {
2521
2576
 
2522
2577
  // src/evaluation/formatting/prompt-builder.ts
2523
2578
  async function buildPromptInputs(testCase, mode = "lm") {
2524
- const segmentsByMessage = [];
2525
- const fileContentsByPath = /* @__PURE__ */ new Map();
2526
- for (const segment of testCase.input_segments) {
2527
- if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
2528
- fileContentsByPath.set(segment.path, segment.text);
2529
- }
2530
- }
2531
- for (const message of testCase.input) {
2532
- const messageSegments = [];
2533
- if (typeof message.content === "string") {
2534
- if (message.content.trim().length > 0) {
2535
- messageSegments.push({ type: "text", value: message.content });
2536
- }
2537
- } else if (Array.isArray(message.content)) {
2538
- for (const segment of message.content) {
2539
- if (typeof segment === "string") {
2540
- if (segment.trim().length > 0) {
2541
- messageSegments.push({ type: "text", value: segment });
2542
- }
2543
- } else if (isJsonObject(segment)) {
2544
- const type = asString5(segment.type);
2545
- if (type === "file") {
2546
- const value = asString5(segment.value);
2547
- if (!value) continue;
2548
- const fileText = fileContentsByPath.get(value);
2549
- if (fileText !== void 0) {
2550
- messageSegments.push({ type: "file", text: fileText, path: value });
2551
- }
2552
- } else if (type === "text") {
2553
- const textValue = asString5(segment.value);
2554
- if (textValue && textValue.trim().length > 0) {
2555
- messageSegments.push({ type: "text", value: textValue });
2556
- }
2557
- }
2558
- }
2559
- }
2560
- } else if (isJsonObject(message.content)) {
2561
- const rendered = JSON.stringify(message.content, null, 2);
2562
- if (rendered.trim().length > 0) {
2563
- messageSegments.push({ type: "text", value: rendered });
2564
- }
2565
- }
2566
- segmentsByMessage.push(messageSegments);
2567
- }
2579
+ const segmentsByMessage = testCase.input.map(
2580
+ (message) => extractContentSegments(message.content)
2581
+ );
2568
2582
  const useRoleMarkers = needsRoleMarkers(testCase.input, segmentsByMessage);
2569
2583
  let question;
2570
2584
  if (useRoleMarkers) {
@@ -2592,7 +2606,7 @@ ${messageContent}`);
2592
2606
  question = messageParts.join("\n\n");
2593
2607
  } else {
2594
2608
  const questionParts = [];
2595
- for (const segment of testCase.input_segments) {
2609
+ for (const segment of flattenInputMessages(testCase.input)) {
2596
2610
  const formattedContent = formatSegment(segment, mode);
2597
2611
  if (formattedContent) {
2598
2612
  questionParts.push(formattedContent);
@@ -2679,9 +2693,6 @@ function buildChatPromptFromSegments(options) {
2679
2693
  }
2680
2694
  return chatPrompt.length > 0 ? chatPrompt : void 0;
2681
2695
  }
2682
- function asString5(value) {
2683
- return typeof value === "string" ? value : void 0;
2684
- }
2685
2696
 
2686
2697
  // src/evaluation/yaml-parser.ts
2687
2698
  var ANSI_YELLOW6 = "\x1B[33m";
@@ -2764,7 +2775,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2764
2775
  throw new Error(`Invalid test file format: ${evalFilePath}`);
2765
2776
  }
2766
2777
  const suite = interpolated;
2767
- const evalSetNameFromSuite = asString6(suite.name)?.trim();
2778
+ const evalSetNameFromSuite = asString5(suite.name)?.trim();
2768
2779
  const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
2769
2780
  const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
2770
2781
  const rawTestcases = resolveTests(suite);
@@ -2783,7 +2794,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2783
2794
  const suiteInputMessages = expandInputShorthand(suite.input);
2784
2795
  const suiteInputFiles = suite.input_files;
2785
2796
  const rawGlobalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
2786
- const _globalTarget = asString6(rawGlobalExecution?.target) ?? asString6(suite.target);
2797
+ const _globalTarget = asString5(rawGlobalExecution?.target) ?? asString5(suite.target);
2787
2798
  const suiteAssertions = suite.assertions ?? suite.assert;
2788
2799
  if (suite.assert !== void 0 && suite.assertions === void 0) {
2789
2800
  logWarning5("'assert' is deprecated at the suite level. Use 'assertions' instead.");
@@ -2796,17 +2807,17 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2796
2807
  continue;
2797
2808
  }
2798
2809
  const evalcase = rawEvalcase;
2799
- const id = asString6(evalcase.id);
2810
+ const id = asString5(evalcase.id);
2800
2811
  if (filterPattern && (!id || !micromatch2.isMatch(id, filterPattern))) {
2801
2812
  continue;
2802
2813
  }
2803
- const conversationId = asString6(evalcase.conversation_id);
2804
- let outcome = asString6(evalcase.criteria);
2814
+ const conversationId = asString5(evalcase.conversation_id);
2815
+ let outcome = asString5(evalcase.criteria);
2805
2816
  if (!outcome && evalcase.expected_outcome !== void 0) {
2806
- outcome = asString6(evalcase.expected_outcome);
2817
+ outcome = asString5(evalcase.expected_outcome);
2807
2818
  if (outcome) {
2808
2819
  logWarning5(
2809
- `Test '${asString6(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
2820
+ `Test '${asString5(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
2810
2821
  );
2811
2822
  }
2812
2823
  }
@@ -2823,10 +2834,9 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2823
2834
  continue;
2824
2835
  }
2825
2836
  const effectiveSuiteInputMessages = suiteInputMessages && !skipDefaults ? suiteInputMessages : void 0;
2826
- const inputMessages = effectiveSuiteInputMessages ? [...effectiveSuiteInputMessages, ...testInputMessages] : testInputMessages;
2827
2837
  const hasExpectedMessages = expectedMessages.length > 0;
2828
2838
  const inputTextParts = [];
2829
- const suiteInputSegments = effectiveSuiteInputMessages ? await processMessages({
2839
+ const suiteResolvedInputMessages = effectiveSuiteInputMessages ? await processMessages({
2830
2840
  messages: effectiveSuiteInputMessages,
2831
2841
  searchRoots,
2832
2842
  repoRootPath,
@@ -2834,7 +2844,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2834
2844
  messageType: "input",
2835
2845
  verbose
2836
2846
  }) : [];
2837
- const testInputSegments = await processMessages({
2847
+ const testResolvedInputMessages = await processMessages({
2838
2848
  messages: testInputMessages,
2839
2849
  searchRoots,
2840
2850
  repoRootPath,
@@ -2842,7 +2852,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2842
2852
  messageType: "input",
2843
2853
  verbose
2844
2854
  });
2845
- const inputSegments = [...suiteInputSegments, ...testInputSegments];
2855
+ const inputMessages = [...suiteResolvedInputMessages, ...testResolvedInputMessages];
2846
2856
  const outputSegments = hasExpectedMessages ? await processExpectedMessages({
2847
2857
  messages: expectedMessages,
2848
2858
  searchRoots,
@@ -2880,12 +2890,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2880
2890
  }
2881
2891
  }
2882
2892
  warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
2883
- const userFilePaths = [];
2884
- for (const segment of inputSegments) {
2885
- if (segment.type === "file" && typeof segment.resolvedPath === "string") {
2886
- userFilePaths.push(segment.resolvedPath);
2887
- }
2888
- }
2893
+ const userFilePaths = collectResolvedInputFilePaths(inputMessages);
2889
2894
  const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
2890
2895
  const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
2891
2896
  const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
@@ -2896,7 +2901,6 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2896
2901
  conversation_id: conversationId,
2897
2902
  question,
2898
2903
  input: inputMessages,
2899
- input_segments: inputSegments,
2900
2904
  expected_output: outputSegments,
2901
2905
  reference_answer: referenceAnswer,
2902
2906
  file_paths: userFilePaths,
@@ -3105,7 +3109,7 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
3105
3109
  path: caseLevel.path ?? suiteLevel.path
3106
3110
  };
3107
3111
  }
3108
- function asString6(value) {
3112
+ function asString5(value) {
3109
3113
  return typeof value === "string" ? value : void 0;
3110
3114
  }
3111
3115
  function logWarning5(message, details) {
@@ -6857,7 +6861,7 @@ var PiAgentSdkProvider = class {
6857
6861
  const { Agent, getModel, getEnvApiKey } = await loadPiModules();
6858
6862
  const startTimeIso = (/* @__PURE__ */ new Date()).toISOString();
6859
6863
  const startMs = Date.now();
6860
- const providerName = this.config.provider ?? "anthropic";
6864
+ const providerName = this.config.subprovider ?? "anthropic";
6861
6865
  const modelId = this.config.model ?? "claude-sonnet-4-20250514";
6862
6866
  const model = getModel(providerName, modelId);
6863
6867
  const systemPrompt = this.config.systemPrompt ?? "Answer directly and concisely.";
@@ -6969,7 +6973,7 @@ var PiAgentSdkProvider = class {
6969
6973
  messages: agentMessages,
6970
6974
  systemPrompt,
6971
6975
  model: this.config.model,
6972
- provider: this.config.provider
6976
+ subprovider: this.config.subprovider
6973
6977
  },
6974
6978
  output,
6975
6979
  tokenUsage,
@@ -7205,8 +7209,8 @@ var PiCodingAgentProvider = class {
7205
7209
  }
7206
7210
  buildPiArgs(prompt, inputFiles, _captureFileChanges) {
7207
7211
  const args = [];
7208
- if (this.config.provider) {
7209
- args.push("--provider", this.config.provider);
7212
+ if (this.config.subprovider) {
7213
+ args.push("--provider", this.config.subprovider);
7210
7214
  }
7211
7215
  if (this.config.model) {
7212
7216
  args.push("--model", this.config.model);
@@ -7264,7 +7268,7 @@ ${prompt}` : prompt;
7264
7268
  buildEnv() {
7265
7269
  const env = { ...process.env };
7266
7270
  if (this.config.apiKey) {
7267
- const provider = this.config.provider?.toLowerCase() ?? "google";
7271
+ const provider = this.config.subprovider?.toLowerCase() ?? "google";
7268
7272
  switch (provider) {
7269
7273
  case "google":
7270
7274
  case "gemini":
@@ -10110,7 +10114,8 @@ var freeformEvaluationSchema = z3.object({
10110
10114
  passed: z3.boolean().describe("Whether this aspect was satisfied"),
10111
10115
  evidence: z3.string().describe("Concise evidence (1-2 sentences)").optional()
10112
10116
  })
10113
- ).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
10117
+ ).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional(),
10118
+ details: z3.record(z3.unknown()).describe("Optional structured metadata for domain-specific metrics").optional()
10114
10119
  });
10115
10120
  var rubricCheckResultSchema = z3.object({
10116
10121
  id: z3.string().describe("The ID of the rubric item being checked"),
@@ -10172,7 +10177,7 @@ var LlmGraderEvaluator = class {
10172
10177
  async evaluateFreeform(context, graderProvider) {
10173
10178
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
10174
10179
  const variables = {
10175
- [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context.evalCase.input_segments, null, 2),
10180
+ [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context.evalCase.input, null, 2),
10176
10181
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(
10177
10182
  context.evalCase.expected_output,
10178
10183
  null,
@@ -10215,6 +10220,7 @@ ${context.fileChanges}`;
10215
10220
  expectedAspectCount: Math.max(assertions.length, 1),
10216
10221
  evaluatorRawRequest,
10217
10222
  graderTarget: graderProvider.targetName,
10223
+ details: data.details,
10218
10224
  tokenUsage
10219
10225
  };
10220
10226
  } catch (e) {
@@ -10634,7 +10640,7 @@ ${outputSchema}`;
10634
10640
  expectedAspectCount: Math.max(assertions.length, 1),
10635
10641
  evaluatorRawRequest,
10636
10642
  graderTarget,
10637
- details
10643
+ details: data.details && Object.keys(data.details).length > 0 ? { ...details, ...data.details } : details
10638
10644
  };
10639
10645
  } catch {
10640
10646
  return {
@@ -10781,7 +10787,8 @@ function buildOutputSchema() {
10781
10787
  ' "passed": <boolean>,',
10782
10788
  ' "evidence": "<concise evidence, 1-2 sentences, optional>"',
10783
10789
  " }",
10784
- " ]",
10790
+ " ],",
10791
+ ' "details": {<optional object with domain-specific structured metrics>}',
10785
10792
  "}"
10786
10793
  ].join("\n");
10787
10794
  }
@@ -12145,7 +12152,7 @@ function assembleLlmGraderPrompt(input) {
12145
12152
  function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evaluatorTemplateOverride) {
12146
12153
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
12147
12154
  const variables = {
12148
- [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2),
12155
+ [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input, null, 2),
12149
12156
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
12150
12157
  [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
12151
12158
  [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
@@ -14426,6 +14433,18 @@ var QUALITY_PASS_THRESHOLD = 0.8;
14426
14433
  function classifyQualityStatus(score) {
14427
14434
  return score >= QUALITY_PASS_THRESHOLD ? "ok" : "quality_failure";
14428
14435
  }
14436
+ function buildSkippedEvaluatorError(scores) {
14437
+ const skippedScores = scores?.filter((score) => score.verdict === "skip") ?? [];
14438
+ if (skippedScores.length === 0) {
14439
+ return void 0;
14440
+ }
14441
+ const messages = skippedScores.map((score) => {
14442
+ const label = score.name || score.type;
14443
+ const assertionMessage = score.assertions.find((assertion) => !assertion.passed)?.text ?? "Evaluator skipped";
14444
+ return `${label}: ${assertionMessage}`;
14445
+ });
14446
+ return messages.length === 1 ? messages[0] : `Evaluators skipped: ${messages.join(" | ")}`;
14447
+ }
14429
14448
  function usesFileReferencePrompt(provider) {
14430
14449
  return isAgentProvider(provider) || provider.kind === "cli";
14431
14450
  }
@@ -15690,7 +15709,8 @@ async function runEvalCase(options) {
15690
15709
  durationMs: totalDurationMs,
15691
15710
  ...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
15692
15711
  };
15693
- const executionStatus = providerError ? "execution_error" : classifyQualityStatus(result.score);
15712
+ const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
15713
+ const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score);
15694
15714
  const finalResult = providerError ? {
15695
15715
  ...result,
15696
15716
  evalRun,
@@ -15702,7 +15722,26 @@ async function runEvalCase(options) {
15702
15722
  beforeAllOutput,
15703
15723
  beforeEachOutput,
15704
15724
  afterEachOutput
15705
- } : { ...result, evalRun, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
15725
+ } : skippedEvaluatorError ? {
15726
+ ...result,
15727
+ score: 0,
15728
+ evalRun,
15729
+ error: skippedEvaluatorError,
15730
+ executionStatus,
15731
+ failureStage: "evaluator",
15732
+ failureReasonCode: "evaluator_error",
15733
+ executionError: { message: skippedEvaluatorError, stage: "evaluator" },
15734
+ beforeAllOutput,
15735
+ beforeEachOutput,
15736
+ afterEachOutput
15737
+ } : {
15738
+ ...result,
15739
+ evalRun,
15740
+ executionStatus,
15741
+ beforeAllOutput,
15742
+ beforeEachOutput,
15743
+ afterEachOutput
15744
+ };
15706
15745
  const isFailure = !!finalResult.error || finalResult.score < 0.5;
15707
15746
  if (workspacePath && !isSharedWorkspace) {
15708
15747
  if (forceCleanup) {
@@ -16447,11 +16486,6 @@ async function evaluate(config) {
16447
16486
  evalCases = (config.tests ?? []).map((test) => {
16448
16487
  const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
16449
16488
  const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
16450
- const inputSegments = input.map((m) => ({
16451
- type: "text",
16452
- value: typeof m.content === "string" ? m.content : JSON.stringify(m.content),
16453
- messageIndex: 0
16454
- }));
16455
16489
  const expectedOutputValue = test.expectedOutput ?? test.expected_output;
16456
16490
  const expectedOutput = expectedOutputValue ? [
16457
16491
  { role: "assistant", content: expectedOutputValue }
@@ -16480,7 +16514,6 @@ async function evaluate(config) {
16480
16514
  criteria: test.criteria ?? "",
16481
16515
  question: String(question),
16482
16516
  input,
16483
- input_segments: inputSegments,
16484
16517
  expected_output: expectedOutput,
16485
16518
  reference_answer: expectedOutputValue,
16486
16519
  file_paths: [],