@agentv/core 4.31.4-next.1 → 4.33.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,16 @@
1
1
  import {
2
2
  LLM_GRADER_CAPABLE_KINDS,
3
+ RUBRIC_OPERATOR_VALUES,
3
4
  buildDirectoryChain,
4
5
  expandFileReferences,
5
6
  extractLastAssistantContent,
6
7
  fileExists,
7
8
  findGitRoot,
9
+ getAgentvConfigDir,
10
+ getAgentvDataDir,
11
+ getSubagentsRoot,
12
+ getWorkspacePoolRoot,
13
+ getWorkspacesRoot,
8
14
  interpolateEnv,
9
15
  interpolateTemplateVars,
10
16
  isAgentProvider,
@@ -18,7 +24,7 @@ import {
18
24
  readTextFile,
19
25
  resolveDelegatedTargetDefinition,
20
26
  resolveTargetDefinition
21
- } from "./chunk-5RQMJZDJ.js";
27
+ } from "./chunk-EW5X2RGJ.js";
22
28
  import {
23
29
  execFileWithStdin,
24
30
  execShellWithStdin
@@ -41,6 +47,49 @@ import { existsSync as existsSync6 } from "node:fs";
41
47
  import path45 from "node:path";
42
48
  import micromatch4 from "micromatch";
43
49
 
50
+ // src/evaluation/cache/response-cache.ts
51
+ import { mkdir, readFile, writeFile } from "node:fs/promises";
52
+ import path from "node:path";
/** Cache directory used when the constructor is not given an explicit path. */
var DEFAULT_CACHE_PATH = ".agentv/cache";

/**
 * File-backed JSON cache. Each entry is stored as
 * `<cachePath>/<first two characters of key>/<key>.json`.
 */
var ResponseCache = class {
  cachePath;

  /**
   * @param {string} [cachePath] Root directory of the cache; falls back to
   *   DEFAULT_CACHE_PATH when null or undefined.
   */
  constructor(cachePath) {
    this.cachePath = cachePath ?? DEFAULT_CACHE_PATH;
  }

  /**
   * Read and JSON-parse the entry stored for `key`.
   *
   * @param {string} key Cache key.
   * @returns {Promise<unknown>} The parsed value, or `undefined` when the file
   *   cannot be read or does not contain valid JSON.
   */
  async get(key) {
    const filePath = this.keyToPath(key);
    try {
      const data = await readFile(filePath, "utf8");
      return JSON.parse(data);
    } catch {
      // Best effort: any read or parse failure is treated as a cache miss.
      return void 0;
    }
  }

  /**
   * Serialize `value` as pretty-printed JSON and write it to the entry for
   * `key`, creating the parent directory when needed.
   *
   * @param {string} key Cache key.
   * @param {unknown} value JSON-serializable value to store.
   * @returns {Promise<void>}
   */
  async set(key, value) {
    const filePath = this.keyToPath(key);
    const dir = path.dirname(filePath);
    await mkdir(dir, { recursive: true });
    await writeFile(filePath, JSON.stringify(value, null, 2), "utf8");
  }

  /**
   * Map a key to its file path under the cache root.
   *
   * @param {string} key Cache key.
   * @returns {string} `<cachePath>/<key[0..2]>/<key>.json`
   */
  keyToPath(key) {
    const prefix = key.slice(0, 2);
    return path.join(this.cachePath, prefix, `${key}.json`);
  }
};
/**
 * Decide whether caching is enabled from the available settings.
 *
 * The first matching rule wins:
 *   1. `cliNoCache` truthy  -> false
 *   2. `cliCache` truthy    -> true
 *   3. `yamlCache` defined  -> its value
 *   4. otherwise            -> `tsConfigCache === true`
 *
 * @param {{cliNoCache?: boolean, cliCache?: boolean, yamlCache?: boolean, tsConfigCache?: boolean}} params
 * @returns {boolean|*} The decision; for rule 3 the `yamlCache` value is returned as-is.
 */
function shouldEnableCache(params) {
  if (params.cliNoCache) {
    return false;
  }
  if (params.cliCache) {
    return true;
  }
  if (params.yamlCache !== void 0) {
    return params.yamlCache;
  }
  return params.tsConfigCache === true;
}
/**
 * Report whether caching should be skipped because the target is configured
 * with a positive numeric temperature.
 *
 * @param {{temperature?: unknown}} targetConfig Target configuration object.
 * @returns {boolean} `true` only when `temperature` is a number greater than 0.
 */
function shouldSkipCacheForTemperature(targetConfig) {
  const temp = targetConfig.temperature;
  return typeof temp === "number" && temp > 0;
}
92
+
44
93
  // src/evaluation/graders/scoring.ts
45
94
  var DEFAULT_THRESHOLD = 0.8;
46
95
  var PASS_THRESHOLD = DEFAULT_THRESHOLD;
@@ -133,7 +182,7 @@ function negateScore(score) {
133
182
  import { execFile as execFile3 } from "node:child_process";
134
183
  import { createHash as createHash2, randomUUID as randomUUID9 } from "node:crypto";
135
184
  import { existsSync as existsSync5 } from "node:fs";
136
- import { copyFile as copyFile2, mkdir as mkdir14, readdir as readdir8, stat as stat9 } from "node:fs/promises";
185
+ import { copyFile as copyFile2, mkdir as mkdir15, readdir as readdir8, stat as stat9 } from "node:fs/promises";
137
186
  import path44 from "node:path";
138
187
  import { promisify as promisify7 } from "node:util";
139
188
  import micromatch3 from "micromatch";
@@ -277,39 +326,8 @@ function validateConcurrency(concurrency) {
277
326
  }
278
327
  }
279
328
 
280
- // src/paths.ts
281
- import os from "node:os";
282
- import path from "node:path";
283
- var logged = false;
284
- function getAgentvConfigDir() {
285
- return path.join(os.homedir(), ".agentv");
286
- }
287
- function getAgentvHome() {
288
- const envHome = process.env.AGENTV_HOME;
289
- if (envHome && envHome !== "undefined") {
290
- if (!logged) {
291
- logged = true;
292
- console.log(`Using AGENTV_HOME: ${envHome}`);
293
- }
294
- return envHome;
295
- }
296
- return path.join(os.homedir(), ".agentv");
297
- }
298
- function getWorkspacesRoot() {
299
- return path.join(getAgentvHome(), "workspaces");
300
- }
301
- function getSubagentsRoot() {
302
- return path.join(getAgentvHome(), "subagents");
303
- }
304
- function getTraceStateRoot() {
305
- return path.join(getAgentvHome(), "trace-state");
306
- }
307
- function getWorkspacePoolRoot() {
308
- return path.join(getAgentvHome(), "workspace-pool");
309
- }
310
-
311
329
  // src/evaluation/graders/code-grader.ts
312
- import { mkdtemp, rm, writeFile } from "node:fs/promises";
330
+ import { mkdtemp, rm, writeFile as writeFile2 } from "node:fs/promises";
313
331
  import { tmpdir } from "node:os";
314
332
  import { dirname, join } from "node:path";
315
333
 
@@ -643,7 +661,7 @@ async function materializeContentForGrader(messages, getWorkDir) {
643
661
  const ext = mediaType.split("/")[1] === "jpeg" ? "jpg" : mediaType.split("/")[1] ?? "bin";
644
662
  const dir = await getWorkDir();
645
663
  const filePath = join(dir, `img-${counter++}.${ext}`);
646
- await writeFile(filePath, Buffer.from(base64Data, "base64"));
664
+ await writeFile2(filePath, Buffer.from(base64Data, "base64"));
647
665
  blocks.push({ type: "image", media_type: img.media_type, path: filePath });
648
666
  } else {
649
667
  blocks.push({ type: "image", media_type: img.media_type, path: img.source });
@@ -686,7 +704,7 @@ var CodeGrader = class {
686
704
  if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
687
705
  const tmpDir = await mkdtemp(join(tmpdir(), "agentv-grader-"));
688
706
  outputPath = join(tmpDir, "output.json");
689
- await writeFile(outputPath, serialized);
707
+ await writeFile2(outputPath, serialized);
690
708
  outputForPayload = null;
691
709
  }
692
710
  }
@@ -703,6 +721,7 @@ var CodeGrader = class {
703
721
  context.evalCase.input,
704
722
  getImageDir
705
723
  ),
724
+ metadata: context.evalCase.metadata ?? null,
706
725
  trace: context.trace ?? null,
707
726
  tokenUsage: context.tokenUsage ?? null,
708
727
  costUsd: context.costUsd ?? null,
@@ -875,7 +894,7 @@ import path3 from "node:path";
875
894
  import { z } from "zod";
876
895
 
877
896
  // src/evaluation/content-preprocessor.ts
878
- import { readFile } from "node:fs/promises";
897
+ import { readFile as readFile2 } from "node:fs/promises";
879
898
  import path2 from "node:path";
880
899
  import { fileURLToPath } from "node:url";
881
900
  var MIME_TYPE_ALIASES = {
@@ -944,7 +963,7 @@ async function preprocessContentFile(block, preprocessors, basePath) {
944
963
  return runContentPreprocessor(block, resolvedPath, preprocessor);
945
964
  }
946
965
  try {
947
- const buffer = await readFile(resolvedPath);
966
+ const buffer = await readFile2(resolvedPath);
948
967
  const text = buffer.toString("utf8").replace(/\r\n/g, "\n");
949
968
  if (buffer.includes(0) || text.includes(REPLACEMENT_CHAR)) {
950
969
  return {
@@ -1040,6 +1059,10 @@ ${text}`;
1040
1059
  var TEMPLATE_VARIABLES = {
1041
1060
  EXPECTED_OUTPUT: "expected_output",
1042
1061
  CRITERIA: "criteria",
1062
+ METADATA: "metadata",
1063
+ METADATA_JSON: "metadata_json",
1064
+ RUBRICS: "rubrics",
1065
+ RUBRICS_JSON: "rubrics_json",
1043
1066
  INPUT: "input",
1044
1067
  OUTPUT: "output",
1045
1068
  FILE_CHANGES: "file_changes",
@@ -1062,6 +1085,27 @@ var DEPRECATED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Map([
1062
1085
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT]
1063
1086
  ]);
1064
1087
 
1088
+ // src/evaluation/graders/rubric-operators.ts
/** Grader instructions keyed by rubric operator name. */
var OPERATOR_GUIDANCE = {
  correctness: "Correctness: mark satisfied only when the answer positively supports or fulfills the outcome. Omission or contradiction should not satisfy it.",
  contradiction: "Contradiction guard: mark satisfied when the answer does not make a claim that contradicts the outcome. Do not require the answer to mention the outcome; mark unsatisfied only for incompatible claims."
};

/**
 * Build the inline label appended to a rubric line.
 *
 * @param {string} [operator] Rubric operator name.
 * @returns {string} ` (operator: <name>)`, or an empty string when no operator is set.
 */
function formatRubricOperatorLabel(operator) {
  return operator ? ` (operator: ${operator})` : "";
}

/**
 * Collect the guidance text for every distinct operator used by `rubrics`,
 * in order of first appearance.
 *
 * Operators that have no entry in OPERATOR_GUIDANCE are skipped, so the
 * result never contains `undefined` (callers push these strings straight
 * into the grader prompt).
 *
 * @param {Array<{operator?: string}>} rubrics Rubric items.
 * @returns {string[]} Guidance strings; empty when no known operator is used.
 */
function formatRubricOperatorGuidance(rubrics) {
  const seen = /* @__PURE__ */ new Set();
  const guidance = [];
  for (const rubric of rubrics) {
    const operator = rubric.operator;
    if (!operator || seen.has(operator)) {
      continue;
    }
    seen.add(operator);
    // Own-property check keeps inherited keys such as "toString" out.
    if (Object.hasOwn(OPERATOR_GUIDANCE, operator)) {
      guidance.push(OPERATOR_GUIDANCE[operator]);
    }
  }
  return guidance;
}
1108
+
1065
1109
  // src/evaluation/graders/llm-grader.ts
1066
1110
  var DEFAULT_MAX_STEPS = 10;
1067
1111
  var MAX_STEPS_LIMIT = 50;
@@ -1144,6 +1188,32 @@ var scoreRangeEvaluationSchema = z.object({
1144
1188
  checks: z.array(scoreRangeCheckResultSchema).describe("Scores for each rubric criterion"),
1145
1189
  overall_reasoning: z.string().describe("Overall assessment summary (1-2 sentences)").optional()
1146
1190
  });
/**
 * Serialize a value as 2-space-indented JSON for use as a template variable.
 *
 * @param {unknown} value Value to serialize.
 * @returns {string} The JSON text, or an empty string when `value` is
 *   `undefined` or has no JSON representation (e.g. a function or symbol,
 *   for which `JSON.stringify` returns `undefined`).
 */
function stringifyPretty(value) {
  return JSON.stringify(value, null, 2) ?? "";
}
/**
 * Serialize a value as single-line JSON for use as a template variable.
 *
 * @param {unknown} value Value to serialize.
 * @returns {string} The JSON text, or an empty string when `value` is
 *   `undefined` or has no JSON representation (e.g. a function or symbol,
 *   for which `JSON.stringify` returns `undefined`).
 */
function stringifyCompact(value) {
  return JSON.stringify(value) ?? "";
}
/**
 * Build the variable map substituted into grader prompt templates.
 *
 * The question comes from `context.promptInputs.question` when it is
 * non-blank, otherwise from `context.evalCase.question`. Rubrics are taken
 * from the evaluator only when its type is "llm-grader". Metadata and rubrics
 * are each exposed twice: pretty-printed and compact JSON.
 *
 * @param {object} context Evaluation context (promptInputs, evalCase,
 *   candidate, evaluator, fileChanges, toolCalls).
 * @returns {Record<string, string>} Map from template variable name to value.
 */
function buildTemplateVariables(context) {
  const promptQuestion = context.promptInputs.question;
  const formattedQuestion = promptQuestion && promptQuestion.trim().length > 0 ? promptQuestion : context.evalCase.question;
  const rubrics = context.evaluator?.type === "llm-grader" ? context.evaluator.rubrics : void 0;
  const input = formattedQuestion.trim();
  const output = context.candidate.trim();
  const expectedOutput = (context.evalCase.reference_answer ?? "").trim();
  return {
    [TEMPLATE_VARIABLES.INPUT]: input,
    [TEMPLATE_VARIABLES.OUTPUT]: output,
    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: expectedOutput,
    [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
    [TEMPLATE_VARIABLES.METADATA]: stringifyPretty(context.evalCase.metadata),
    [TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact(context.evalCase.metadata),
    [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty(rubrics),
    [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact(rubrics),
    [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
    [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
    // Deprecated aliases — same values as the primary variables above
    [TEMPLATE_VARIABLES.INPUT_TEXT]: input,
    [TEMPLATE_VARIABLES.OUTPUT_TEXT]: output,
    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: expectedOutput
  };
}
1147
1217
  function resolveContentBasePath(context) {
1148
1218
  if (context.workspacePath) {
1149
1219
  return context.workspacePath;
@@ -1215,19 +1285,7 @@ var LlmGrader = class {
1215
1285
  // LLM mode (existing)
1216
1286
  // ---------------------------------------------------------------------------
1217
1287
  async evaluateFreeform(context, graderProvider) {
1218
- const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
1219
- const variables = {
1220
- [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
1221
- [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
1222
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
1223
- [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
1224
- [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
1225
- [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
1226
- // Deprecated aliases — same values as the primary variables above
1227
- [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
1228
- [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
1229
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
1230
- };
1288
+ const variables = buildTemplateVariables(context);
1231
1289
  const systemPrompt = buildOutputSchema();
1232
1290
  const graderTemplate = context.graderTemplateOverride ?? this.graderTemplate ?? DEFAULT_GRADER_TEMPLATE;
1233
1291
  warnDeprecatedTemplateVars(graderTemplate);
@@ -1294,7 +1352,7 @@ ${context.toolCalls}`;
1294
1352
  if (hasScoreRanges) {
1295
1353
  return this.evaluateWithScoreRanges(context, graderProvider, rubrics);
1296
1354
  }
1297
- const prompt = this.buildRubricPrompt(context, rubrics);
1355
+ const prompt = context.graderTemplateOverride || this.graderTemplate ? this.buildCustomPrompt(context) : this.buildRubricPrompt(context, rubrics);
1298
1356
  const systemPrompt = buildRubricOutputSchema();
1299
1357
  const graderRawRequest = {
1300
1358
  userPrompt: prompt,
@@ -1339,7 +1397,7 @@ ${context.toolCalls}`;
1339
1397
  * Each criterion is scored 0-10 and normalized to 0-1.
1340
1398
  */
1341
1399
  async evaluateWithScoreRanges(context, graderProvider, rubrics) {
1342
- const prompt = this.buildScoreRangePrompt(context, rubrics);
1400
+ const prompt = context.graderTemplateOverride || this.graderTemplate ? this.buildCustomPrompt(context) : this.buildScoreRangePrompt(context, rubrics);
1343
1401
  const systemPrompt = buildScoreRangeOutputSchema();
1344
1402
  const graderRawRequest = {
1345
1403
  userPrompt: prompt,
@@ -1558,21 +1616,11 @@ ${context.toolCalls}`;
1558
1616
  */
1559
1617
  buildAgentUserPrompt(context) {
1560
1618
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
1561
- const variables = {
1562
- [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
1563
- [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
1564
- [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
1565
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
1566
- [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
1567
- [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
1568
- // Deprecated aliases
1569
- [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
1570
- [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
1571
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
1572
- };
1573
- if (this.graderTemplate) {
1574
- warnDeprecatedTemplateVars(this.graderTemplate);
1575
- return substituteVariables(this.graderTemplate, variables);
1619
+ const variables = buildTemplateVariables(context);
1620
+ const template = context.graderTemplateOverride ?? this.graderTemplate;
1621
+ if (template) {
1622
+ warnDeprecatedTemplateVars(template);
1623
+ return substituteVariables(template, variables);
1576
1624
  }
1577
1625
  const config = context.evaluator;
1578
1626
  const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
@@ -1622,21 +1670,11 @@ ${context.toolCalls}`;
1622
1670
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
1623
1671
  const config = context.evaluator;
1624
1672
  const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
1625
- if (this.graderTemplate) {
1626
- const variables = {
1627
- [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
1628
- [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
1629
- [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
1630
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
1631
- [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
1632
- [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
1633
- // Deprecated aliases
1634
- [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
1635
- [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
1636
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
1637
- };
1638
- warnDeprecatedTemplateVars(this.graderTemplate);
1639
- const customPrompt = substituteVariables(this.graderTemplate, variables);
1673
+ const template = context.graderTemplateOverride ?? this.graderTemplate;
1674
+ if (template) {
1675
+ const variables = buildTemplateVariables(context);
1676
+ warnDeprecatedTemplateVars(template);
1677
+ const customPrompt = substituteVariables(template, variables);
1640
1678
  const outputSchema = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
1641
1679
  return `${customPrompt}
1642
1680
 
@@ -1762,6 +1800,9 @@ ${outputSchema}`;
1762
1800
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
1763
1801
  const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
1764
1802
  parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
1803
+ if (rubric.operator) {
1804
+ parts.push(`Operator: ${rubric.operator}`);
1805
+ }
1765
1806
  if (rubric.outcome) {
1766
1807
  parts.push(`Description: ${rubric.outcome}`);
1767
1808
  }
@@ -1774,12 +1815,21 @@ ${outputSchema}`;
1774
1815
  }
1775
1816
  }
1776
1817
  }
1818
+ const operatorGuidance = formatRubricOperatorGuidance(rubrics);
1819
+ if (operatorGuidance.length > 0) {
1820
+ parts.push("", ...operatorGuidance);
1821
+ }
1777
1822
  parts.push(
1778
1823
  "",
1779
1824
  "For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
1780
1825
  );
1781
1826
  return parts.join("\n");
1782
1827
  }
1828
+ buildCustomPrompt(context) {
1829
+ const template = context.graderTemplateOverride ?? this.graderTemplate ?? "";
1830
+ warnDeprecatedTemplateVars(template);
1831
+ return substituteVariables(template, buildTemplateVariables(context));
1832
+ }
1783
1833
  buildRubricPrompt(context, rubrics) {
1784
1834
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
1785
1835
  const parts = [
@@ -1803,10 +1853,21 @@ ${outputSchema}`;
1803
1853
  parts.push("[[ ## tool_calls ## ]]", context.toolCalls, "");
1804
1854
  }
1805
1855
  parts.push("[[ ## rubrics ## ]]");
1856
+ const operatorGuidance = formatRubricOperatorGuidance(rubrics);
1857
+ if (operatorGuidance.length > 0) {
1858
+ parts.push("", "Operator guidance:");
1859
+ for (const guidance of operatorGuidance) {
1860
+ parts.push(`- ${guidance}`);
1861
+ }
1862
+ parts.push("");
1863
+ }
1806
1864
  for (const rubric of rubrics) {
1807
1865
  const requiredLabel = rubric.required ? " (REQUIRED)" : "";
1808
1866
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
1809
- parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.outcome}`);
1867
+ const operatorLabel = formatRubricOperatorLabel(rubric.operator);
1868
+ parts.push(
1869
+ `- [${rubric.id}]${requiredLabel}${weightLabel}${operatorLabel}: ${rubric.outcome}`
1870
+ );
1810
1871
  }
1811
1872
  parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
1812
1873
  return parts.join("\n");
@@ -2538,6 +2599,385 @@ var CostGrader = class {
2538
2599
  };
2539
2600
 
2540
2601
  // src/evaluation/trace.ts
2602
+ import { z as z2 } from "zod";
/** Schema version literal carried by every normalized trajectory. */
var NORMALIZED_TRAJECTORY_SCHEMA_VERSION = "agentv.trace.v1";

/** Allowed values for a trace source's `kind`. */
var NORMALIZED_TRACE_SOURCE_KINDS = [
  "agentv_run",
  "otlp",
  "phoenix",
  "langfuse",
  "pi_session",
  "imported_transcript",
  "compact_transcript"
];

/** Allowed values for a trace event's `type`. */
var NORMALIZED_TRACE_EVENT_TYPES = [
  "message",
  "model_turn",
  "tool_call",
  "tool_result"
];

/** Allowed values for a tool record's `status`. */
var NORMALIZED_TOOL_STATUSES = ["ok", "error", "timeout", "cancelled", "unknown"];

/** Allowed values for a redaction state's `level`. */
var NORMALIZED_REDACTION_LEVELS = ["none", "partial", "full"];
/**
 * Return a shallow copy of `value` without the own enumerable properties
 * whose value is `undefined`. Other falsy values (null, 0, "", false) are kept.
 *
 * @param {object} value Source object (not mutated).
 * @returns {object} New object containing only the defined properties.
 */
function omitUndefinedProperties(value) {
  const definedEntries = Object.entries(value).filter(([, property]) => property !== void 0);
  return Object.fromEntries(definedEntries);
}
// Zod schemas for the snake_case wire representation of a normalized trajectory.

/** Free-form metadata: string keys, arbitrary values. */
var MetadataWireSchema = z2.record(z2.string(), z2.unknown());

/** Token counts; `input` and `output` are required. */
var TokenUsageWireSchema = z2.object({
  input: z2.number(),
  output: z2.number(),
  cached: z2.number().optional(),
  reasoning: z2.number().optional()
});

/** Redaction state attached to events, messages and tools. */
var NormalizedRedactionStateWireSchema = z2.object({
  level: z2.enum(NORMALIZED_REDACTION_LEVELS),
  fields: z2.array(z2.string()).optional(),
  reason: z2.string().optional()
});

/** Error details recorded on a tool. */
var NormalizedTraceErrorWireSchema = z2.object({
  message: z2.string(),
  name: z2.string().optional(),
  code: z2.string().optional(),
  stack: z2.string().optional(),
  metadata: MetadataWireSchema.optional()
});

/** Where the trajectory came from. */
var NormalizedTraceSourceWireSchema = z2.object({
  kind: z2.enum(NORMALIZED_TRACE_SOURCE_KINDS),
  path: z2.string().optional(),
  url: z2.string().optional(),
  provider: z2.string().optional(),
  format: z2.string().optional(),
  version: z2.string().optional(),
  metadata: MetadataWireSchema.optional()
});

/** Session-level identifiers and timestamps. */
var NormalizedTraceSessionWireSchema = z2.object({
  session_id: z2.string().optional(),
  conversation_id: z2.string().optional(),
  cwd: z2.string().optional(),
  started_at: z2.string().optional(),
  ended_at: z2.string().optional(),
  metadata: MetadataWireSchema.optional()
});

/** Branch selection: which events were included or omitted. */
var NormalizedTraceBranchWireSchema = z2.object({
  selected_leaf_id: z2.string().optional(),
  selected_path_ids: z2.array(z2.string()).optional(),
  included_event_ids: z2.array(z2.string()).optional(),
  omitted_event_ids: z2.array(z2.string()).optional(),
  selection_reason: z2.string().optional()
});

/** Pointer from an event back to its source record. */
var NormalizedTraceSourceRefWireSchema = z2.object({
  event_id: z2.string().optional(),
  message_id: z2.string().optional(),
  span_id: z2.string().optional(),
  trace_id: z2.string().optional(),
  raw_kind: z2.string().optional(),
  path: z2.string().optional(),
  line: z2.number().int().nonnegative().optional(),
  metadata: MetadataWireSchema.optional()
});

/** Raw evidence item attached to an event. */
var NormalizedRawEvidenceWireSchema = z2.object({
  kind: z2.string(),
  ref: z2.string().optional(),
  media_type: z2.string().optional(),
  content: z2.unknown().optional(),
  redacted: z2.boolean().optional(),
  metadata: MetadataWireSchema.optional()
});

/** Message payload of an event. */
var NormalizedTraceMessageWireSchema = z2.object({
  role: z2.string(),
  name: z2.string().optional(),
  content: z2.unknown().optional(),
  redaction: NormalizedRedactionStateWireSchema.optional(),
  token_usage: TokenUsageWireSchema.optional(),
  metadata: MetadataWireSchema.optional()
});

/** Model payload of an event. */
var NormalizedTraceModelWireSchema = z2.object({
  provider: z2.string().optional(),
  name: z2.string().optional(),
  invocation_id: z2.string().optional(),
  token_usage: TokenUsageWireSchema.optional(),
  metadata: MetadataWireSchema.optional()
});

/** Tool payload of an event. */
var NormalizedTraceToolWireSchema = z2.object({
  name: z2.string(),
  call_id: z2.string().optional(),
  input: z2.unknown().optional(),
  output: z2.unknown().optional(),
  status: z2.enum(NORMALIZED_TOOL_STATUSES).optional(),
  error: NormalizedTraceErrorWireSchema.optional(),
  redaction: NormalizedRedactionStateWireSchema.optional(),
  metadata: MetadataWireSchema.optional()
});

/** A single trajectory event; `event_id`, `ordinal` and `type` are required. */
var NormalizedTraceEventWireSchema = z2.object({
  event_id: z2.string(),
  parent_event_id: z2.string().optional(),
  ordinal: z2.number().int().nonnegative(),
  type: z2.enum(NORMALIZED_TRACE_EVENT_TYPES),
  timestamp: z2.string().optional(),
  duration_ms: z2.number().nonnegative().optional(),
  duration_inferred: z2.boolean().optional(),
  turn_index: z2.number().int().nonnegative().optional(),
  message: NormalizedTraceMessageWireSchema.optional(),
  model: NormalizedTraceModelWireSchema.optional(),
  tool: NormalizedTraceToolWireSchema.optional(),
  source_ref: NormalizedTraceSourceRefWireSchema.optional(),
  raw_evidence: z2.array(NormalizedRawEvidenceWireSchema).optional(),
  redaction: NormalizedRedactionStateWireSchema.optional(),
  metadata: MetadataWireSchema.optional()
});

/** Top-level trajectory document. */
var NormalizedTrajectoryWireSchema = z2.object({
  schema_version: z2.literal(NORMALIZED_TRAJECTORY_SCHEMA_VERSION),
  source: NormalizedTraceSourceWireSchema,
  session: NormalizedTraceSessionWireSchema,
  branch: NormalizedTraceBranchWireSchema.optional(),
  events: z2.array(NormalizedTraceEventWireSchema),
  token_usage: TokenUsageWireSchema.optional(),
  cost_usd: z2.number().optional(),
  duration_ms: z2.number().optional(),
  started_at: z2.string().optional(),
  ended_at: z2.string().optional(),
  metadata: MetadataWireSchema.optional()
});
/**
 * Convert a camelCase trajectory into its snake_case wire form and validate
 * it with NormalizedTrajectoryWireSchema.
 *
 * @param {object} trajectory In-memory trajectory.
 * @returns {object} The schema-parsed wire object; undefined top-level fields are omitted.
 * @throws When the converted object fails schema validation.
 */
function toNormalizedTrajectoryWire(trajectory) {
  const wire = omitUndefinedProperties({
    schema_version: trajectory.schemaVersion,
    source: toNormalizedTraceSourceWire(trajectory.source),
    session: toNormalizedTraceSessionWire(trajectory.session),
    branch: trajectory.branch ? toNormalizedTraceBranchWire(trajectory.branch) : void 0,
    events: trajectory.events.map(toNormalizedTraceEventWire),
    token_usage: trajectory.tokenUsage,
    cost_usd: trajectory.costUsd,
    duration_ms: trajectory.durationMs,
    started_at: trajectory.startedAt,
    ended_at: trajectory.endedAt,
    metadata: trajectory.metadata
  });
  return NormalizedTrajectoryWireSchema.parse(wire);
}
/**
 * Validate a wire-format trajectory with NormalizedTrajectoryWireSchema and
 * convert it to the camelCase in-memory form.
 *
 * @param {unknown} input Candidate wire object.
 * @returns {object} The camelCase trajectory.
 * @throws When `input` fails schema validation.
 */
function fromNormalizedTrajectoryWire(input) {
  const wire = NormalizedTrajectoryWireSchema.parse(input);
  return {
    schemaVersion: wire.schema_version,
    source: fromNormalizedTraceSourceWire(wire.source),
    session: fromNormalizedTraceSessionWire(wire.session),
    branch: wire.branch ? fromNormalizedTraceBranchWire(wire.branch) : void 0,
    events: wire.events.map(fromNormalizedTraceEventWire),
    tokenUsage: wire.token_usage,
    costUsd: wire.cost_usd,
    durationMs: wire.duration_ms,
    startedAt: wire.started_at,
    endedAt: wire.ended_at,
    metadata: wire.metadata
  };
}
/**
 * Convert a trace source to wire form, omitting undefined fields.
 * The field names are the same in both representations.
 *
 * @param {object} source In-memory trace source.
 * @returns {object} Wire-format source.
 */
function toNormalizedTraceSourceWire(source) {
  const { kind, path, url, provider, format, version, metadata } = source;
  return omitUndefinedProperties({ kind, path, url, provider, format, version, metadata });
}
/**
 * Convert a wire-format trace source to the in-memory form.
 * The field names are the same in both representations; absent fields are
 * present on the result with the value `undefined`.
 *
 * @param {object} source Wire-format source.
 * @returns {object} In-memory trace source.
 */
function fromNormalizedTraceSourceWire(source) {
  const { kind, path, url, provider, format, version, metadata } = source;
  return { kind, path, url, provider, format, version, metadata };
}
/**
 * Convert a camelCase session to snake_case wire form, omitting undefined fields.
 *
 * @param {object} session In-memory session.
 * @returns {object} Wire-format session.
 */
function toNormalizedTraceSessionWire(session) {
  const { sessionId, conversationId, cwd, startedAt, endedAt, metadata } = session;
  return omitUndefinedProperties({
    session_id: sessionId,
    conversation_id: conversationId,
    cwd,
    started_at: startedAt,
    ended_at: endedAt,
    metadata
  });
}
/**
 * Convert a snake_case wire session to the camelCase in-memory form.
 * Absent fields are present on the result with the value `undefined`.
 *
 * @param {object} session Wire-format session.
 * @returns {object} In-memory session.
 */
function fromNormalizedTraceSessionWire(session) {
  const { session_id, conversation_id, cwd, started_at, ended_at, metadata } = session;
  return {
    sessionId: session_id,
    conversationId: conversation_id,
    cwd,
    startedAt: started_at,
    endedAt: ended_at,
    metadata
  };
}
/**
 * Convert camelCase branch selection info to snake_case wire form,
 * omitting undefined fields.
 *
 * @param {object} branch In-memory branch info.
 * @returns {object} Wire-format branch info.
 */
function toNormalizedTraceBranchWire(branch) {
  const { selectedLeafId, selectedPathIds, includedEventIds, omittedEventIds, selectionReason } = branch;
  return omitUndefinedProperties({
    selected_leaf_id: selectedLeafId,
    selected_path_ids: selectedPathIds,
    included_event_ids: includedEventIds,
    omitted_event_ids: omittedEventIds,
    selection_reason: selectionReason
  });
}
/**
 * Convert snake_case wire branch info to the camelCase in-memory form.
 * Absent fields are present on the result with the value `undefined`.
 *
 * @param {object} branch Wire-format branch info.
 * @returns {object} In-memory branch info.
 */
function fromNormalizedTraceBranchWire(branch) {
  const { selected_leaf_id, selected_path_ids, included_event_ids, omitted_event_ids, selection_reason } = branch;
  return {
    selectedLeafId: selected_leaf_id,
    selectedPathIds: selected_path_ids,
    includedEventIds: included_event_ids,
    omittedEventIds: omitted_event_ids,
    selectionReason: selection_reason
  };
}
/**
 * Convert a camelCase trace event to snake_case wire form and validate it
 * with NormalizedTraceEventWireSchema. Nested message/model/tool/sourceRef
 * objects and rawEvidence items are converted with their own helpers;
 * `redaction` and `metadata` are passed through unchanged.
 *
 * @param {object} event In-memory trace event.
 * @returns {object} The schema-parsed wire event; undefined top-level fields are omitted.
 * @throws When the converted object fails schema validation.
 */
function toNormalizedTraceEventWire(event) {
  const wire = omitUndefinedProperties({
    event_id: event.eventId,
    parent_event_id: event.parentEventId,
    ordinal: event.ordinal,
    type: event.type,
    timestamp: event.timestamp,
    duration_ms: event.durationMs,
    duration_inferred: event.durationInferred,
    turn_index: event.turnIndex,
    message: event.message ? toNormalizedTraceMessageWire(event.message) : void 0,
    model: event.model ? toNormalizedTraceModelWire(event.model) : void 0,
    tool: event.tool ? toNormalizedTraceToolWire(event.tool) : void 0,
    source_ref: event.sourceRef ? toNormalizedTraceSourceRefWire(event.sourceRef) : void 0,
    raw_evidence: event.rawEvidence?.map(toNormalizedRawEvidenceWire),
    redaction: event.redaction,
    metadata: event.metadata
  });
  return NormalizedTraceEventWireSchema.parse(wire);
}
/**
 * Convert a snake_case wire event to the camelCase in-memory form.
 * Nested message/model/tool/source_ref objects and raw_evidence items are
 * converted with their own helpers; `redaction` and `metadata` are passed
 * through unchanged. No schema validation happens here.
 *
 * @param {object} event Wire-format trace event.
 * @returns {object} In-memory trace event.
 */
function fromNormalizedTraceEventWire(event) {
  return {
    eventId: event.event_id,
    parentEventId: event.parent_event_id,
    ordinal: event.ordinal,
    type: event.type,
    timestamp: event.timestamp,
    durationMs: event.duration_ms,
    durationInferred: event.duration_inferred,
    turnIndex: event.turn_index,
    message: event.message ? fromNormalizedTraceMessageWire(event.message) : void 0,
    model: event.model ? fromNormalizedTraceModelWire(event.model) : void 0,
    tool: event.tool ? fromNormalizedTraceToolWire(event.tool) : void 0,
    sourceRef: event.source_ref ? fromNormalizedTraceSourceRefWire(event.source_ref) : void 0,
    rawEvidence: event.raw_evidence?.map(fromNormalizedRawEvidenceWire),
    redaction: event.redaction,
    metadata: event.metadata
  };
}
/**
 * Convert a camelCase trace message to snake_case wire form, omitting
 * undefined fields. Only `tokenUsage` is renamed (to `token_usage`).
 *
 * @param {object} message In-memory message.
 * @returns {object} Wire-format message.
 */
function toNormalizedTraceMessageWire(message) {
  const { role, name, content, redaction, tokenUsage, metadata } = message;
  return omitUndefinedProperties({
    role,
    name,
    content,
    redaction,
    token_usage: tokenUsage,
    metadata
  });
}
/**
 * Convert a snake_case wire message to the camelCase in-memory form.
 * Only `token_usage` is renamed (to `tokenUsage`); absent fields are present
 * on the result with the value `undefined`.
 *
 * @param {object} message Wire-format message.
 * @returns {object} In-memory message.
 */
function fromNormalizedTraceMessageWire(message) {
  const { role, name, content, redaction, token_usage, metadata } = message;
  return {
    role,
    name,
    content,
    redaction,
    tokenUsage: token_usage,
    metadata
  };
}
/**
 * Convert camelCase model info to snake_case wire form, omitting undefined fields.
 *
 * @param {object} model In-memory model info.
 * @returns {object} Wire-format model info.
 */
function toNormalizedTraceModelWire(model) {
  const { provider, name, invocationId, tokenUsage, metadata } = model;
  return omitUndefinedProperties({
    provider,
    name,
    invocation_id: invocationId,
    token_usage: tokenUsage,
    metadata
  });
}
/**
 * Convert snake_case wire model info to the camelCase in-memory form.
 * Absent fields are present on the result with the value `undefined`.
 *
 * @param {object} model Wire-format model info.
 * @returns {object} In-memory model info.
 */
function fromNormalizedTraceModelWire(model) {
  const { provider, name, invocation_id, token_usage, metadata } = model;
  return {
    provider,
    name,
    invocationId: invocation_id,
    tokenUsage: token_usage,
    metadata
  };
}
/**
 * Convert a camelCase tool record to snake_case wire form, omitting
 * undefined fields. Only `callId` is renamed (to `call_id`).
 *
 * @param {object} tool In-memory tool record.
 * @returns {object} Wire-format tool record.
 */
function toNormalizedTraceToolWire(tool) {
  const { name, callId, input, output, status, error, redaction, metadata } = tool;
  return omitUndefinedProperties({
    name,
    call_id: callId,
    input,
    output,
    status,
    error,
    redaction,
    metadata
  });
}
/**
 * Convert a snake_case wire tool record to the camelCase in-memory form.
 * Only `call_id` is renamed (to `callId`); absent fields are present on the
 * result with the value `undefined`.
 *
 * @param {object} tool Wire-format tool record.
 * @returns {object} In-memory tool record.
 */
function fromNormalizedTraceToolWire(tool) {
  const { name, call_id, input, output, status, error, redaction, metadata } = tool;
  return {
    name,
    callId: call_id,
    input,
    output,
    status,
    error,
    redaction,
    metadata
  };
}
/**
 * Convert a camelCase source reference to snake_case wire form, omitting
 * undefined fields.
 *
 * @param {object} sourceRef In-memory source reference.
 * @returns {object} Wire-format source reference.
 */
function toNormalizedTraceSourceRefWire(sourceRef) {
  const { eventId, messageId, spanId, traceId, rawKind, path, line, metadata } = sourceRef;
  return omitUndefinedProperties({
    event_id: eventId,
    message_id: messageId,
    span_id: spanId,
    trace_id: traceId,
    raw_kind: rawKind,
    path,
    line,
    metadata
  });
}
/**
 * Convert a snake_case wire source reference to the camelCase in-memory form.
 * Absent fields are present on the result with the value `undefined`.
 *
 * @param {object} sourceRef Wire-format source reference.
 * @returns {object} In-memory source reference.
 */
function fromNormalizedTraceSourceRefWire(sourceRef) {
  const { event_id, message_id, span_id, trace_id, raw_kind, path, line, metadata } = sourceRef;
  return {
    eventId: event_id,
    messageId: message_id,
    spanId: span_id,
    traceId: trace_id,
    rawKind: raw_kind,
    path,
    line,
    metadata
  };
}
/**
 * Convert a camelCase raw-evidence item to snake_case wire form, omitting
 * undefined fields. Only `mediaType` is renamed (to `media_type`).
 *
 * @param {object} evidence In-memory raw-evidence item.
 * @returns {object} Wire-format raw-evidence item.
 */
function toNormalizedRawEvidenceWire(evidence) {
  const { kind, ref, mediaType, content, redacted, metadata } = evidence;
  return omitUndefinedProperties({
    kind,
    ref,
    media_type: mediaType,
    content,
    redacted,
    metadata
  });
}
/**
 * Convert a snake_case wire raw-evidence item to the camelCase in-memory form.
 * Only `media_type` is renamed (to `mediaType`); absent fields are present on
 * the result with the value `undefined`.
 *
 * @param {object} evidence Wire-format raw-evidence item.
 * @returns {object} In-memory raw-evidence item.
 */
function fromNormalizedRawEvidenceWire(evidence) {
  const { kind, ref, media_type, content, redacted, metadata } = evidence;
  return {
    kind,
    ref,
    mediaType: media_type,
    content,
    redacted,
    metadata
  };
}
2541
2981
  function computeTraceSummary(messages) {
2542
2982
  const toolCallCounts = {};
2543
2983
  const toolDurations = {};
@@ -2605,6 +3045,82 @@ function computeTraceSummary(messages) {
2605
3045
  endTime: latestEnd?.toISOString()
2606
3046
  };
2607
3047
  }
3048
+ function getSelectedTrajectoryEvents(trajectory) {
3049
+ if (!trajectory.branch?.includedEventIds || trajectory.branch.includedEventIds.length === 0) {
3050
+ return trajectory.events;
3051
+ }
3052
+ const includedIds = new Set(trajectory.branch.includedEventIds);
3053
+ return trajectory.events.filter((event) => includedIds.has(event.eventId));
3054
+ }
3055
+ function computeTraceSummaryFromTrajectory(trajectory) {
3056
+ const selectedEvents = getSelectedTrajectoryEvents(trajectory);
3057
+ const hasModelTurnEvents = selectedEvents.some((event) => event.type === "model_turn");
3058
+ const toolCallCounts = {};
3059
+ const toolDurations = {};
3060
+ let totalToolCalls = 0;
3061
+ let errorCount = 0;
3062
+ let llmCallCount = 0;
3063
+ let earliestStart;
3064
+ let latestEnd;
3065
+ let hasAnyDuration = false;
3066
+ for (const event of selectedEvents) {
3067
+ if (event.type === "model_turn" || !hasModelTurnEvents && event.type === "message" && event.message?.role === "assistant") {
3068
+ llmCallCount++;
3069
+ }
3070
+ const eventStart = parseTimestamp(event.timestamp);
3071
+ if (eventStart && (!earliestStart || eventStart < earliestStart)) {
3072
+ earliestStart = eventStart;
3073
+ }
3074
+ const eventEnd = deriveEventEnd(eventStart, event.durationMs);
3075
+ if (eventEnd && (!latestEnd || eventEnd > latestEnd)) {
3076
+ latestEnd = eventEnd;
3077
+ }
3078
+ if (event.type !== "tool_call" || !event.tool) {
3079
+ continue;
3080
+ }
3081
+ toolCallCounts[event.tool.name] = (toolCallCounts[event.tool.name] ?? 0) + 1;
3082
+ totalToolCalls++;
3083
+ if (isErrorToolEvent(event)) {
3084
+ errorCount++;
3085
+ }
3086
+ if (event.durationMs !== void 0) {
3087
+ hasAnyDuration = true;
3088
+ if (!toolDurations[event.tool.name]) {
3089
+ toolDurations[event.tool.name] = [];
3090
+ }
3091
+ toolDurations[event.tool.name].push(event.durationMs);
3092
+ }
3093
+ }
3094
+ return {
3095
+ trace: {
3096
+ eventCount: totalToolCalls,
3097
+ toolCalls: toolCallCounts,
3098
+ errorCount,
3099
+ llmCallCount,
3100
+ ...hasAnyDuration ? { toolDurations } : {}
3101
+ },
3102
+ tokenUsage: trajectory.tokenUsage,
3103
+ costUsd: trajectory.costUsd,
3104
+ durationMs: trajectory.durationMs,
3105
+ startTime: trajectory.startedAt ?? earliestStart?.toISOString(),
3106
+ endTime: trajectory.endedAt ?? latestEnd?.toISOString()
3107
+ };
3108
+ }
3109
+ function parseTimestamp(timestamp) {
3110
+ if (!timestamp) return void 0;
3111
+ const value = new Date(timestamp);
3112
+ return Number.isNaN(value.getTime()) ? void 0 : value;
3113
+ }
3114
+ function deriveEventEnd(start, durationMs) {
3115
+ if (!start) return void 0;
3116
+ if (durationMs === void 0) return start;
3117
+ return new Date(start.getTime() + durationMs);
3118
+ }
3119
+ function isErrorToolEvent(event) {
3120
+ return Boolean(
3121
+ event.tool?.error || event.tool?.status === "error" || event.tool?.status === "timeout" || event.tool?.status === "cancelled"
3122
+ );
3123
+ }
2608
3124
  var DEFAULT_EXPLORATION_TOOLS = [
2609
3125
  "read",
2610
3126
  "grep",
@@ -3401,6 +3917,30 @@ var SkillTriggerGrader = class {
3401
3917
  };
3402
3918
 
3403
3919
  // src/evaluation/graders/llm-grader-prompt.ts
3920
+ function stringifyPretty2(value) {
3921
+ return value === void 0 ? "" : JSON.stringify(value, null, 2);
3922
+ }
3923
+ function stringifyCompact2(value) {
3924
+ return value === void 0 ? "" : JSON.stringify(value);
3925
+ }
3926
+ function buildTemplateVariables2(input) {
3927
+ const formattedQuestion = input.promptInputs.question && input.promptInputs.question.trim().length > 0 ? input.promptInputs.question : input.evalCase.question;
3928
+ return {
3929
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
3930
+ [TEMPLATE_VARIABLES.OUTPUT]: input.candidate.trim(),
3931
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (input.evalCase.reference_answer ?? "").trim(),
3932
+ [TEMPLATE_VARIABLES.CRITERIA]: input.evalCase.criteria.trim(),
3933
+ [TEMPLATE_VARIABLES.METADATA]: stringifyPretty2(input.evalCase.metadata),
3934
+ [TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact2(input.evalCase.metadata),
3935
+ [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty2(input.rubrics),
3936
+ [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact2(input.rubrics),
3937
+ [TEMPLATE_VARIABLES.FILE_CHANGES]: input.fileChanges ?? "",
3938
+ [TEMPLATE_VARIABLES.TOOL_CALLS]: input.toolCalls ?? "",
3939
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
3940
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: input.candidate.trim(),
3941
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (input.evalCase.reference_answer ?? "").trim()
3942
+ };
3943
+ }
3404
3944
  function assembleLlmGraderPrompt(input) {
3405
3945
  const {
3406
3946
  evalCase,
@@ -3413,6 +3953,17 @@ function assembleLlmGraderPrompt(input) {
3413
3953
  } = input;
3414
3954
  const rubrics = evaluatorConfig?.rubrics;
3415
3955
  if (rubrics && rubrics.length > 0) {
3956
+ if (graderTemplateOverride) {
3957
+ return assembleCustom(
3958
+ evalCase,
3959
+ candidate,
3960
+ promptInputs,
3961
+ rubrics,
3962
+ fileChanges,
3963
+ toolCalls,
3964
+ graderTemplateOverride
3965
+ );
3966
+ }
3416
3967
  const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
3417
3968
  if (hasScoreRanges) {
3418
3969
  return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls);
@@ -3429,19 +3980,13 @@ function assembleLlmGraderPrompt(input) {
3429
3980
  );
3430
3981
  }
3431
3982
  function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, toolCalls, graderTemplateOverride) {
3432
- const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
3433
- const variables = {
3434
- [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
3435
- [TEMPLATE_VARIABLES.OUTPUT]: candidate.trim(),
3436
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? "").trim(),
3437
- [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
3438
- [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
3439
- [TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? "",
3440
- // Deprecated aliases
3441
- [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
3442
- [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
3443
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
3444
- };
3983
+ const variables = buildTemplateVariables2({
3984
+ evalCase,
3985
+ candidate,
3986
+ promptInputs,
3987
+ fileChanges,
3988
+ toolCalls
3989
+ });
3445
3990
  const systemPrompt = buildOutputSchema();
3446
3991
  const template = graderTemplateOverride ?? DEFAULT_GRADER_TEMPLATE;
3447
3992
  let userPrompt = substituteVariables(template, variables);
@@ -3464,6 +4009,27 @@ ${toolCalls}`;
3464
4009
  mode: "freeform"
3465
4010
  };
3466
4011
  }
4012
+ function assembleCustom(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls, graderTemplateOverride) {
4013
+ const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
4014
+ const systemPrompt = hasScoreRanges ? buildScoreRangeOutputSchema() : buildRubricOutputSchema();
4015
+ const userPrompt = substituteVariables(
4016
+ graderTemplateOverride,
4017
+ buildTemplateVariables2({
4018
+ evalCase,
4019
+ candidate,
4020
+ promptInputs,
4021
+ rubrics,
4022
+ fileChanges,
4023
+ toolCalls
4024
+ })
4025
+ );
4026
+ return {
4027
+ systemPrompt,
4028
+ userPrompt,
4029
+ responseSchema: systemPrompt,
4030
+ mode: hasScoreRanges ? "score_range" : "checklist"
4031
+ };
4032
+ }
3467
4033
  function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls) {
3468
4034
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
3469
4035
  const parts = [
@@ -3487,10 +4053,19 @@ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChang
3487
4053
  parts.push("[[ ## tool_calls ## ]]", toolCalls, "");
3488
4054
  }
3489
4055
  parts.push("[[ ## rubrics ## ]]");
4056
+ const operatorGuidance = formatRubricOperatorGuidance(rubrics);
4057
+ if (operatorGuidance.length > 0) {
4058
+ parts.push("", "Operator guidance:");
4059
+ for (const guidance of operatorGuidance) {
4060
+ parts.push(`- ${guidance}`);
4061
+ }
4062
+ parts.push("");
4063
+ }
3490
4064
  for (const rubric of rubrics) {
3491
4065
  const requiredLabel = rubric.required ? " (REQUIRED)" : "";
3492
4066
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
3493
- parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.outcome}`);
4067
+ const operatorLabel = formatRubricOperatorLabel(rubric.operator);
4068
+ parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}${operatorLabel}: ${rubric.outcome}`);
3494
4069
  }
3495
4070
  parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
3496
4071
  const systemPrompt = buildRubricOutputSchema();
@@ -3530,6 +4105,9 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
3530
4105
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
3531
4106
  const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
3532
4107
  parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
4108
+ if (rubric.operator) {
4109
+ parts.push(`Operator: ${rubric.operator}`);
4110
+ }
3533
4111
  if (rubric.outcome) {
3534
4112
  parts.push(`Description: ${rubric.outcome}`);
3535
4113
  }
@@ -3542,6 +4120,10 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
3542
4120
  }
3543
4121
  }
3544
4122
  }
4123
+ const operatorGuidance = formatRubricOperatorGuidance(rubrics);
4124
+ if (operatorGuidance.length > 0) {
4125
+ parts.push("", ...operatorGuidance);
4126
+ }
3545
4127
  parts.push(
3546
4128
  "",
3547
4129
  "For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
@@ -4260,7 +4842,7 @@ function runEqualsAssertion(output, value) {
4260
4842
  import { spawn } from "node:child_process";
4261
4843
  import { randomUUID } from "node:crypto";
4262
4844
  import { createWriteStream } from "node:fs";
4263
- import { mkdir } from "node:fs/promises";
4845
+ import { mkdir as mkdir2 } from "node:fs/promises";
4264
4846
  import path5 from "node:path";
4265
4847
 
4266
4848
  // src/runtime/child-tracker.ts
@@ -4760,7 +5342,7 @@ var ClaudeCliProvider = class {
4760
5342
  return void 0;
4761
5343
  }
4762
5344
  try {
4763
- await mkdir(logDir, { recursive: true });
5345
+ await mkdir2(logDir, { recursive: true });
4764
5346
  } catch (error) {
4765
5347
  const message = error instanceof Error ? error.message : String(error);
4766
5348
  console.warn(`Skipping Claude CLI stream logging (could not create ${logDir}): ${message}`);
@@ -5070,7 +5652,7 @@ function tryParseJson(line) {
5070
5652
  // src/evaluation/providers/claude-sdk.ts
5071
5653
  import { randomUUID as randomUUID2 } from "node:crypto";
5072
5654
  import { createWriteStream as createWriteStream2 } from "node:fs";
5073
- import { mkdir as mkdir2 } from "node:fs/promises";
5655
+ import { mkdir as mkdir3 } from "node:fs/promises";
5074
5656
  import path6 from "node:path";
5075
5657
  var claudeSdkModule = null;
5076
5658
  async function loadClaudeSdk() {
@@ -5255,7 +5837,7 @@ var ClaudeSdkProvider = class {
5255
5837
  return void 0;
5256
5838
  }
5257
5839
  try {
5258
- await mkdir2(logDir, { recursive: true });
5840
+ await mkdir3(logDir, { recursive: true });
5259
5841
  } catch (error) {
5260
5842
  const message = error instanceof Error ? error.message : String(error);
5261
5843
  console.warn(`Skipping Claude stream logging (could not create ${logDir}): ${message}`);
@@ -5450,44 +6032,44 @@ function formatElapsed2(startedAt) {
5450
6032
  // src/evaluation/providers/cli.ts
5451
6033
  import { exec as execWithCallback } from "node:child_process";
5452
6034
  import fs2 from "node:fs/promises";
5453
- import os2 from "node:os";
6035
+ import os from "node:os";
5454
6036
  import path7 from "node:path";
5455
6037
  import { promisify } from "node:util";
5456
- import { z as z2 } from "zod";
5457
- var ToolCallSchema = z2.object({
5458
- tool: z2.string(),
5459
- input: z2.unknown().optional(),
5460
- output: z2.unknown().optional(),
5461
- id: z2.string().optional(),
5462
- start_time: z2.string().optional(),
5463
- end_time: z2.string().optional(),
5464
- duration_ms: z2.number().optional()
6038
+ import { z as z3 } from "zod";
6039
+ var ToolCallSchema = z3.object({
6040
+ tool: z3.string(),
6041
+ input: z3.unknown().optional(),
6042
+ output: z3.unknown().optional(),
6043
+ id: z3.string().optional(),
6044
+ start_time: z3.string().optional(),
6045
+ end_time: z3.string().optional(),
6046
+ duration_ms: z3.number().optional()
5465
6047
  });
5466
- var MessageInputSchema = z2.object({
5467
- role: z2.string(),
5468
- name: z2.string().optional(),
5469
- content: z2.unknown().optional(),
5470
- tool_calls: z2.array(ToolCallSchema).optional(),
5471
- start_time: z2.string().optional(),
5472
- end_time: z2.string().optional(),
5473
- duration_ms: z2.number().optional(),
5474
- metadata: z2.record(z2.unknown()).optional()
6048
+ var MessageInputSchema = z3.object({
6049
+ role: z3.string(),
6050
+ name: z3.string().optional(),
6051
+ content: z3.unknown().optional(),
6052
+ tool_calls: z3.array(ToolCallSchema).optional(),
6053
+ start_time: z3.string().optional(),
6054
+ end_time: z3.string().optional(),
6055
+ duration_ms: z3.number().optional(),
6056
+ metadata: z3.record(z3.unknown()).optional()
5475
6057
  });
5476
- var TokenUsageSchema = z2.object({
5477
- input: z2.number(),
5478
- output: z2.number(),
5479
- cached: z2.number().optional()
6058
+ var TokenUsageSchema = z3.object({
6059
+ input: z3.number(),
6060
+ output: z3.number(),
6061
+ cached: z3.number().optional()
5480
6062
  });
5481
- var CliOutputSchema = z2.object({
5482
- text: z2.unknown().optional(),
5483
- output: z2.array(MessageInputSchema).optional(),
5484
- output_messages: z2.array(MessageInputSchema).optional(),
6063
+ var CliOutputSchema = z3.object({
6064
+ text: z3.unknown().optional(),
6065
+ output: z3.array(MessageInputSchema).optional(),
6066
+ output_messages: z3.array(MessageInputSchema).optional(),
5485
6067
  token_usage: TokenUsageSchema.optional(),
5486
- cost_usd: z2.number().optional(),
5487
- duration_ms: z2.number().optional()
6068
+ cost_usd: z3.number().optional(),
6069
+ duration_ms: z3.number().optional()
5488
6070
  });
5489
6071
  var CliJsonlRecordSchema = CliOutputSchema.extend({
5490
- id: z2.string().min(1)
6072
+ id: z3.string().min(1)
5491
6073
  });
5492
6074
  function validateMetrics(costUsd, durationMs, context) {
5493
6075
  let validCostUsd = costUsd;
@@ -5992,7 +6574,7 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
5992
6574
  const safeEvalId = evalCaseId || "unknown";
5993
6575
  const timestamp = Date.now();
5994
6576
  const random = Math.random().toString(36).substring(2, 9);
5995
- return path7.join(os2.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
6577
+ return path7.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
5996
6578
  }
5997
6579
  function formatTimeoutSuffix2(timeoutMs) {
5998
6580
  if (!timeoutMs || timeoutMs <= 0) {
@@ -6005,7 +6587,7 @@ function formatTimeoutSuffix2(timeoutMs) {
6005
6587
  // src/evaluation/providers/codex.ts
6006
6588
  import { randomUUID as randomUUID3 } from "node:crypto";
6007
6589
  import { createWriteStream as createWriteStream3 } from "node:fs";
6008
- import { mkdir as mkdir3 } from "node:fs/promises";
6590
+ import { mkdir as mkdir4 } from "node:fs/promises";
6009
6591
  import path8 from "node:path";
6010
6592
 
6011
6593
  // src/evaluation/providers/codex-log-tracker.ts
@@ -6098,6 +6680,9 @@ var CodexProvider = class {
6098
6680
  const startMs = Date.now();
6099
6681
  const logger = await this.createStreamLogger(request).catch(() => void 0);
6100
6682
  const codexOptions = {};
6683
+ if (this.config.executable) {
6684
+ codexOptions.codexPathOverride = this.config.executable;
6685
+ }
6101
6686
  if (this.config.model) {
6102
6687
  codexOptions.config = { model: this.config.model };
6103
6688
  }
@@ -6109,6 +6694,9 @@ var CodexProvider = class {
6109
6694
  if (cwd) {
6110
6695
  threadOptions.workingDirectory = cwd;
6111
6696
  }
6697
+ if (this.config.modelReasoningEffort) {
6698
+ threadOptions.modelReasoningEffort = this.config.modelReasoningEffort;
6699
+ }
6112
6700
  const thread = codex.startThread(threadOptions);
6113
6701
  const inputFiles = normalizeInputFiles(request.inputFiles);
6114
6702
  const basePrompt = buildPromptDocument(request, inputFiles);
@@ -6256,7 +6844,7 @@ ${basePrompt}` : basePrompt;
6256
6844
  }
6257
6845
  resolveLogDirectory() {
6258
6846
  const disabled = isCodexLogStreamingDisabled();
6259
- if (disabled) {
6847
+ if (disabled || this.config.streamLog === false) {
6260
6848
  return void 0;
6261
6849
  }
6262
6850
  if (this.config.logDir) {
@@ -6270,7 +6858,7 @@ ${basePrompt}` : basePrompt;
6270
6858
  return void 0;
6271
6859
  }
6272
6860
  try {
6273
- await mkdir3(logDir, { recursive: true });
6861
+ await mkdir4(logDir, { recursive: true });
6274
6862
  } catch (error) {
6275
6863
  const message = error instanceof Error ? error.message : String(error);
6276
6864
  console.warn(`Skipping Codex SDK stream logging (could not create ${logDir}): ${message}`);
@@ -6283,7 +6871,7 @@ ${basePrompt}` : basePrompt;
6283
6871
  targetName: this.targetName,
6284
6872
  evalCaseId: request.evalCaseId,
6285
6873
  attempt: request.attempt,
6286
- format: this.config.logFormat ?? "summary"
6874
+ format: this.config.streamLog === "raw" ? "json" : "summary"
6287
6875
  });
6288
6876
  recordCodexLogEntry({
6289
6877
  filePath,
@@ -6419,7 +7007,7 @@ function formatElapsed3(startedAt) {
6419
7007
 
6420
7008
  // src/evaluation/providers/copilot-cli.ts
6421
7009
  import { randomUUID as randomUUID5 } from "node:crypto";
6422
- import { mkdir as mkdir4 } from "node:fs/promises";
7010
+ import { mkdir as mkdir5 } from "node:fs/promises";
6423
7011
  import { homedir as homedir2 } from "node:os";
6424
7012
  import path11 from "node:path";
6425
7013
  import { Readable, Writable } from "node:stream";
@@ -6429,7 +7017,7 @@ import * as acp from "@agentclientprotocol/sdk";
6429
7017
  // src/evaluation/workspace/file-changes.ts
6430
7018
  import { exec as execCallback } from "node:child_process";
6431
7019
  import { readdirSync, statSync } from "node:fs";
6432
- import { readFile as readFile2, readdir, stat } from "node:fs/promises";
7020
+ import { readFile as readFile3, readdir, stat } from "node:fs/promises";
6433
7021
  import path9 from "node:path";
6434
7022
  import { promisify as promisify2 } from "node:util";
6435
7023
  var execAsync2 = promisify2(execCallback);
@@ -6504,7 +7092,7 @@ async function walkDir(rootDir, currentDir, snapshot) {
6504
7092
  if (fileStat.size > SNAPSHOT_MAX_FILE_BYTES) continue;
6505
7093
  let content;
6506
7094
  try {
6507
- content = await readFile2(fullPath, "utf8");
7095
+ content = await readFile3(fullPath, "utf8");
6508
7096
  if (content.includes("\0")) continue;
6509
7097
  } catch {
6510
7098
  continue;
@@ -6597,7 +7185,7 @@ import { arch, homedir, platform } from "node:os";
6597
7185
  import path10 from "node:path";
6598
7186
  import { fileURLToPath as fileURLToPath2 } from "node:url";
6599
7187
  function resolvePlatformCliPath() {
6600
- const os3 = platform();
7188
+ const os2 = platform();
6601
7189
  const cpu = arch();
6602
7190
  const platformMap = {
6603
7191
  linux: "linux",
@@ -6608,13 +7196,13 @@ function resolvePlatformCliPath() {
6608
7196
  x64: "x64",
6609
7197
  arm64: "arm64"
6610
7198
  };
6611
- const osPart = platformMap[os3];
7199
+ const osPart = platformMap[os2];
6612
7200
  const archPart = archMap[cpu];
6613
7201
  if (!osPart || !archPart) {
6614
7202
  return void 0;
6615
7203
  }
6616
7204
  const packageName = `@github/copilot-${osPart}-${archPart}`;
6617
- const binaryName = os3 === "win32" ? "copilot.exe" : "copilot";
7205
+ const binaryName = os2 === "win32" ? "copilot.exe" : "copilot";
6618
7206
  try {
6619
7207
  const resolved = import.meta.resolve(`${packageName}/package.json`);
6620
7208
  const packageJsonPath = resolved.startsWith("file:") ? fileURLToPath2(resolved) : resolved;
@@ -6682,9 +7270,9 @@ function resolvePlatformCliPath() {
6682
7270
  }
6683
7271
  function globalNpmRoots() {
6684
7272
  const roots = [];
6685
- const os3 = platform();
7273
+ const os2 = platform();
6686
7274
  const home = homedir();
6687
- if (os3 === "win32") {
7275
+ if (os2 === "win32") {
6688
7276
  if (process.env.APPDATA) {
6689
7277
  roots.push(path10.join(process.env.APPDATA, "npm", "node_modules"));
6690
7278
  }
@@ -6699,7 +7287,7 @@ function globalNpmRoots() {
6699
7287
  if (process.env.npm_config_prefix) {
6700
7288
  const prefix = process.env.npm_config_prefix;
6701
7289
  roots.push(
6702
- os3 === "win32" ? path10.join(prefix, "node_modules") : path10.join(prefix, "lib", "node_modules")
7290
+ os2 === "win32" ? path10.join(prefix, "node_modules") : path10.join(prefix, "lib", "node_modules")
6703
7291
  );
6704
7292
  }
6705
7293
  return Array.from(new Set(roots));
@@ -7120,7 +7708,7 @@ var CopilotCliProvider = class {
7120
7708
  return void 0;
7121
7709
  }
7122
7710
  try {
7123
- await mkdir4(logDir, { recursive: true });
7711
+ await mkdir5(logDir, { recursive: true });
7124
7712
  } catch (error) {
7125
7713
  const message = error instanceof Error ? error.message : String(error);
7126
7714
  console.warn(`Skipping Copilot CLI stream logging (could not create ${logDir}): ${message}`);
@@ -7228,7 +7816,7 @@ function summarizeAcpEvent(eventType, data) {
7228
7816
  }
7229
7817
 
7230
7818
  // src/evaluation/providers/copilot-log.ts
7231
- import { readFile as readFile4 } from "node:fs/promises";
7819
+ import { readFile as readFile5 } from "node:fs/promises";
7232
7820
  import { homedir as homedir4 } from "node:os";
7233
7821
  import path13 from "node:path";
7234
7822
 
@@ -7364,7 +7952,7 @@ function parseCopilotEvents(eventsJsonl) {
7364
7952
  }
7365
7953
 
7366
7954
  // src/evaluation/providers/copilot-session-discovery.ts
7367
- import { readFile as readFile3, readdir as readdir2, stat as stat2 } from "node:fs/promises";
7955
+ import { readFile as readFile4, readdir as readdir2, stat as stat2 } from "node:fs/promises";
7368
7956
  import { homedir as homedir3 } from "node:os";
7369
7957
  import path12 from "node:path";
7370
7958
  var DEFAULT_SESSION_STATE_DIR = () => path12.join(homedir3(), ".copilot", "session-state");
@@ -7383,7 +7971,7 @@ async function discoverCopilotSessions(opts) {
7383
7971
  const workspacePath = path12.join(sessionDir, "workspace.yaml");
7384
7972
  const eventsPath = path12.join(sessionDir, "events.jsonl");
7385
7973
  try {
7386
- const workspaceContent = await readFile3(workspacePath, "utf8");
7974
+ const workspaceContent = await readFile4(workspacePath, "utf8");
7387
7975
  const workspace = parseYamlValue(workspaceContent) ?? {};
7388
7976
  const cwd = String(workspace.cwd ?? "");
7389
7977
  let updatedAt;
@@ -7445,7 +8033,7 @@ var CopilotLogProvider = class {
7445
8033
  const eventsPath = path13.join(sessionDir, "events.jsonl");
7446
8034
  let eventsContent;
7447
8035
  try {
7448
- eventsContent = await readFile4(eventsPath, "utf8");
8036
+ eventsContent = await readFile5(eventsPath, "utf8");
7449
8037
  } catch (err) {
7450
8038
  throw new Error(
7451
8039
  `Failed to read Copilot session transcript at ${eventsPath}: ${err instanceof Error ? err.message : String(err)}`
@@ -7492,7 +8080,7 @@ var CopilotLogProvider = class {
7492
8080
  // src/evaluation/providers/copilot-sdk.ts
7493
8081
  import { randomUUID as randomUUID6 } from "node:crypto";
7494
8082
  import { existsSync as existsSync2 } from "node:fs";
7495
- import { mkdir as mkdir5 } from "node:fs/promises";
8083
+ import { mkdir as mkdir6 } from "node:fs/promises";
7496
8084
  import path14 from "node:path";
7497
8085
 
7498
8086
  // src/evaluation/providers/copilot-sdk-log-tracker.ts
@@ -7832,7 +8420,7 @@ var CopilotSdkProvider = class {
7832
8420
  return void 0;
7833
8421
  }
7834
8422
  try {
7835
- await mkdir5(logDir, { recursive: true });
8423
+ await mkdir6(logDir, { recursive: true });
7836
8424
  } catch (error) {
7837
8425
  const message = error instanceof Error ? error.message : String(error);
7838
8426
  console.warn(`Skipping Copilot SDK stream logging (could not create ${logDir}): ${message}`);
@@ -7958,7 +8546,7 @@ var MockProvider = class {
7958
8546
  import { execSync, spawn as spawn3 } from "node:child_process";
7959
8547
  import { randomUUID as randomUUID7 } from "node:crypto";
7960
8548
  import { accessSync, createWriteStream as createWriteStream5, readFileSync } from "node:fs";
7961
- import { mkdir as mkdir6, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile2 } from "node:fs/promises";
8549
+ import { mkdir as mkdir7, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile3 } from "node:fs/promises";
7962
8550
  import { tmpdir as tmpdir2 } from "node:os";
7963
8551
  import path15 from "node:path";
7964
8552
 
@@ -8167,7 +8755,7 @@ var PiCliProvider = class {
8167
8755
  const logger = await this.createStreamLogger(request).catch(() => void 0);
8168
8756
  try {
8169
8757
  const promptFile = path15.join(cwd, PROMPT_FILENAME);
8170
- await writeFile2(promptFile, request.question, "utf8");
8758
+ await writeFile3(promptFile, request.question, "utf8");
8171
8759
  const args = this.buildPiArgs(request.question, inputFiles);
8172
8760
  const result = await this.executePi(args, cwd, request.signal, logger);
8173
8761
  if (result.timedOut) {
@@ -8358,7 +8946,7 @@ ${prompt}` : prompt;
8358
8946
  return void 0;
8359
8947
  }
8360
8948
  try {
8361
- await mkdir6(logDir, { recursive: true });
8949
+ await mkdir7(logDir, { recursive: true });
8362
8950
  } catch (error) {
8363
8951
  const message = error instanceof Error ? error.message : String(error);
8364
8952
  console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
@@ -8921,7 +9509,7 @@ async function defaultPiRunner(options) {
8921
9509
  import { execSync as execSync2 } from "node:child_process";
8922
9510
  import { randomUUID as randomUUID8 } from "node:crypto";
8923
9511
  import { accessSync as accessSync2, createWriteStream as createWriteStream6, mkdirSync } from "node:fs";
8924
- import { mkdir as mkdir7 } from "node:fs/promises";
9512
+ import { mkdir as mkdir8 } from "node:fs/promises";
8925
9513
  import path16 from "node:path";
8926
9514
  import { createInterface } from "node:readline";
8927
9515
  import { fileURLToPath as fileURLToPath3, pathToFileURL } from "node:url";
@@ -8943,7 +9531,7 @@ async function promptInstall() {
8943
9531
  }
8944
9532
  }
8945
9533
  function findManagedSdkInstallRoot() {
8946
- return path16.join(getAgentvHome(), "deps", "pi-sdk");
9534
+ return path16.join(getAgentvDataDir(), "deps", "pi-sdk");
8947
9535
  }
8948
9536
  function resolveGlobalNpmRoot() {
8949
9537
  try {
@@ -9358,7 +9946,7 @@ ${fileList}`;
9358
9946
  return void 0;
9359
9947
  }
9360
9948
  try {
9361
- await mkdir7(logDir, { recursive: true });
9949
+ await mkdir8(logDir, { recursive: true });
9362
9950
  } catch (error) {
9363
9951
  const message = error instanceof Error ? error.message : String(error);
9364
9952
  console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
@@ -9583,12 +10171,12 @@ import path27 from "node:path";
9583
10171
  import { promisify as promisify4 } from "node:util";
9584
10172
 
9585
10173
  // src/evaluation/providers/vscode/dispatch/agentDispatch.ts
9586
- import { stat as stat5, writeFile as writeFile5 } from "node:fs/promises";
10174
+ import { stat as stat5, writeFile as writeFile6 } from "node:fs/promises";
9587
10175
  import path25 from "node:path";
9588
10176
 
9589
10177
  // src/evaluation/providers/vscode/utils/fs.ts
9590
10178
  import { constants } from "node:fs";
9591
- import { access, mkdir as mkdir8, readdir as readdir3, rm as rm3, stat as stat3 } from "node:fs/promises";
10179
+ import { access, mkdir as mkdir9, readdir as readdir3, rm as rm3, stat as stat3 } from "node:fs/promises";
9592
10180
  import path17 from "node:path";
9593
10181
  async function pathExists(target) {
9594
10182
  try {
@@ -9599,7 +10187,7 @@ async function pathExists(target) {
9599
10187
  }
9600
10188
  }
9601
10189
  async function ensureDir(target) {
9602
- await mkdir8(target, { recursive: true });
10190
+ await mkdir9(target, { recursive: true });
9603
10191
  }
9604
10192
  async function readDirEntries(target) {
9605
10193
  const entries = await readdir3(target, { withFileTypes: true });
@@ -9732,7 +10320,7 @@ function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateCont
9732
10320
  }
9733
10321
 
9734
10322
  // src/evaluation/providers/vscode/dispatch/responseWaiter.ts
9735
- import { readFile as readFile5 } from "node:fs/promises";
10323
+ import { readFile as readFile6 } from "node:fs/promises";
9736
10324
  import path20 from "node:path";
9737
10325
 
9738
10326
  // src/evaluation/providers/vscode/utils/time.ts
@@ -9771,7 +10359,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
9771
10359
  const maxAttempts = 10;
9772
10360
  while (attempts < maxAttempts) {
9773
10361
  try {
9774
- const content = await readFile5(responseFileFinal, { encoding: "utf8" });
10362
+ const content = await readFile6(responseFileFinal, { encoding: "utf8" });
9775
10363
  if (!silent) {
9776
10364
  process.stdout.write(`${content}
9777
10365
  `);
@@ -9828,7 +10416,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
9828
10416
  const maxAttempts = 10;
9829
10417
  while (attempts < maxAttempts) {
9830
10418
  try {
9831
- const content = await readFile5(file, { encoding: "utf8" });
10419
+ const content = await readFile6(file, { encoding: "utf8" });
9832
10420
  if (!silent) {
9833
10421
  process.stdout.write(`${content}
9834
10422
  `);
@@ -9851,7 +10439,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
9851
10439
 
9852
10440
  // src/evaluation/providers/vscode/dispatch/vscodeProcess.ts
9853
10441
  import { exec, spawn as spawn4 } from "node:child_process";
9854
- import { mkdir as mkdir9, writeFile as writeFile3 } from "node:fs/promises";
10442
+ import { mkdir as mkdir10, writeFile as writeFile4 } from "node:fs/promises";
9855
10443
  import path22 from "node:path";
9856
10444
  import { promisify as promisify3 } from "node:util";
9857
10445
 
@@ -9932,9 +10520,9 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
9932
10520
  const aliveFile = path22.join(subagentDir, DEFAULT_ALIVE_FILENAME);
9933
10521
  await removeIfExists(aliveFile);
9934
10522
  const githubAgentsDir = path22.join(subagentDir, ".github", "agents");
9935
- await mkdir9(githubAgentsDir, { recursive: true });
10523
+ await mkdir10(githubAgentsDir, { recursive: true });
9936
10524
  const wakeupDst = path22.join(githubAgentsDir, "wakeup.md");
9937
- await writeFile3(wakeupDst, DEFAULT_WAKEUP_CONTENT, "utf8");
10525
+ await writeFile4(wakeupDst, DEFAULT_WAKEUP_CONTENT, "utf8");
9938
10526
  const workspaceChild = spawnVsCode(vscodeCmd, [workspacePath], {
9939
10527
  label: "open-workspace"
9940
10528
  });
@@ -9963,9 +10551,9 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
9963
10551
  async function launchVsCodeWithChat(subagentDir, chatId, attachmentPaths, requestInstructions, timestamp, vscodeCmd) {
9964
10552
  const workspacePath = path22.join(subagentDir, `${path22.basename(subagentDir)}.code-workspace`);
9965
10553
  const messagesDir = path22.join(subagentDir, "messages");
9966
- await mkdir9(messagesDir, { recursive: true });
10554
+ await mkdir10(messagesDir, { recursive: true });
9967
10555
  const reqFile = path22.join(messagesDir, `${timestamp}_req.md`);
9968
- await writeFile3(reqFile, requestInstructions, { encoding: "utf8" });
10556
+ await writeFile4(reqFile, requestInstructions, { encoding: "utf8" });
9969
10557
  const reqUri = pathToFileUri2(reqFile);
9970
10558
  const chatArgs = ["-r", "chat", "-m", chatId];
9971
10559
  for (const attachment of attachmentPaths) {
@@ -9991,7 +10579,7 @@ async function launchVsCodeWithChat(subagentDir, chatId, attachmentPaths, reques
9991
10579
  async function launchVsCodeWithBatchChat(subagentDir, chatId, attachmentPaths, chatInstruction, vscodeCmd) {
9992
10580
  const workspacePath = path22.join(subagentDir, `${path22.basename(subagentDir)}.code-workspace`);
9993
10581
  const messagesDir = path22.join(subagentDir, "messages");
9994
- await mkdir9(messagesDir, { recursive: true });
10582
+ await mkdir10(messagesDir, { recursive: true });
9995
10583
  const chatArgs = ["-r", "chat", "-m", chatId];
9996
10584
  for (const attachment of attachmentPaths) {
9997
10585
  chatArgs.push("-a", attachment);
@@ -10014,7 +10602,7 @@ async function launchVsCodeWithBatchChat(subagentDir, chatId, attachmentPaths, c
10014
10602
  }
10015
10603
 
10016
10604
  // src/evaluation/providers/vscode/dispatch/workspaceManager.ts
10017
- import { copyFile, mkdir as mkdir10, readFile as readFile6, readdir as readdir4, stat as stat4, writeFile as writeFile4 } from "node:fs/promises";
10605
+ import { copyFile, mkdir as mkdir11, readFile as readFile7, readdir as readdir4, stat as stat4, writeFile as writeFile5 } from "node:fs/promises";
10018
10606
  import path24 from "node:path";
10019
10607
 
10020
10608
  // src/evaluation/providers/vscode/utils/workspace.ts
@@ -10131,7 +10719,7 @@ async function copyAgentConfig(subagentDir, workspaceTemplate, cwd) {
10131
10719
  if (!stats.isFile()) {
10132
10720
  throw new Error(`workspace template must be a file, not a directory: ${workspaceSrc}`);
10133
10721
  }
10134
- const templateText = await readFile6(workspaceSrc, "utf8");
10722
+ const templateText = await readFile7(workspaceSrc, "utf8");
10135
10723
  workspaceContent = JSON.parse(templateText);
10136
10724
  } else {
10137
10725
  workspaceContent = DEFAULT_WORKSPACE_TEMPLATE;
@@ -10150,9 +10738,9 @@ async function copyAgentConfig(subagentDir, workspaceTemplate, cwd) {
10150
10738
  transformedContent = JSON.stringify(parsed, null, 2);
10151
10739
  }
10152
10740
  }
10153
- await writeFile4(workspaceDst, transformedContent, "utf8");
10741
+ await writeFile5(workspaceDst, transformedContent, "utf8");
10154
10742
  const messagesDir = path24.join(subagentDir, "messages");
10155
- await mkdir10(messagesDir, { recursive: true });
10743
+ await mkdir11(messagesDir, { recursive: true });
10156
10744
  return { workspace: workspaceDst, messagesDir };
10157
10745
  }
10158
10746
  async function createSubagentLock(subagentDir) {
@@ -10175,7 +10763,7 @@ async function createSubagentLock(subagentDir) {
10175
10763
  );
10176
10764
  }
10177
10765
  const lockFile = path24.join(subagentDir, DEFAULT_LOCK_NAME);
10178
- await writeFile4(lockFile, "", { encoding: "utf8" });
10766
+ await writeFile5(lockFile, "", { encoding: "utf8" });
10179
10767
  return lockFile;
10180
10768
  }
10181
10769
  async function removeSubagentLock(subagentDir) {
@@ -10200,7 +10788,7 @@ async function prepareSubagentDirectory(subagentDir, promptFile, chatId, workspa
10200
10788
  }
10201
10789
  if (promptFile) {
10202
10790
  const githubAgentsDir = path24.join(subagentDir, ".github", "agents");
10203
- await mkdir10(githubAgentsDir, { recursive: true });
10791
+ await mkdir11(githubAgentsDir, { recursive: true });
10204
10792
  const agentFile = path24.join(githubAgentsDir, `${chatId}.md`);
10205
10793
  try {
10206
10794
  await copyFile(promptFile, agentFile);
@@ -10461,7 +11049,7 @@ async function dispatchBatchAgent(options) {
10461
11049
  const reqFile = requestFiles[index];
10462
11050
  const tmpFile = responseTmpFiles[index];
10463
11051
  const finalFile = responseFilesFinal[index];
10464
- return writeFile5(
11052
+ return writeFile6(
10465
11053
  reqFile,
10466
11054
  createBatchRequestPrompt(query, tmpFile, finalFile, batchRequestTemplateContent),
10467
11055
  { encoding: "utf8" }
@@ -10473,7 +11061,7 @@ async function dispatchBatchAgent(options) {
10473
11061
  responseFilesFinal,
10474
11062
  orchestratorTemplateContent
10475
11063
  );
10476
- await writeFile5(orchestratorFile, orchestratorContent, { encoding: "utf8" });
11064
+ await writeFile6(orchestratorFile, orchestratorContent, { encoding: "utf8" });
10477
11065
  }
10478
11066
  const chatAttachments = [orchestratorFile, ...attachments];
10479
11067
  const orchestratorUri = pathToFileUri2(orchestratorFile);
@@ -10539,7 +11127,7 @@ async function dispatchBatchAgent(options) {
10539
11127
  }
10540
11128
 
10541
11129
  // src/evaluation/providers/vscode/dispatch/provision.ts
10542
- import { writeFile as writeFile6 } from "node:fs/promises";
11130
+ import { writeFile as writeFile7 } from "node:fs/promises";
10543
11131
  import path26 from "node:path";
10544
11132
  var DEFAULT_WORKSPACE_TEMPLATE2 = {
10545
11133
  folders: [
@@ -10620,8 +11208,8 @@ async function provisionSubagents(options) {
10620
11208
  if (!dryRun) {
10621
11209
  await removeIfExists(lockFile);
10622
11210
  await ensureDir(githubAgentsDir);
10623
- await writeFile6(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
10624
- await writeFile6(wakeupDst, wakeupContent, "utf8");
11211
+ await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
11212
+ await writeFile7(wakeupDst, wakeupContent, "utf8");
10625
11213
  }
10626
11214
  created.push(subagentDir);
10627
11215
  lockedSubagents.delete(subagentDir);
@@ -10631,8 +11219,8 @@ async function provisionSubagents(options) {
10631
11219
  if (!isLocked && force) {
10632
11220
  if (!dryRun) {
10633
11221
  await ensureDir(githubAgentsDir);
10634
- await writeFile6(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
10635
- await writeFile6(wakeupDst, wakeupContent, "utf8");
11222
+ await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
11223
+ await writeFile7(wakeupDst, wakeupContent, "utf8");
10636
11224
  }
10637
11225
  created.push(subagentDir);
10638
11226
  subagentsProvisioned += 1;
@@ -10640,8 +11228,8 @@ async function provisionSubagents(options) {
10640
11228
  }
10641
11229
  if (!dryRun && !await pathExists(workspaceDst)) {
10642
11230
  await ensureDir(githubAgentsDir);
10643
- await writeFile6(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
10644
- await writeFile6(wakeupDst, wakeupContent, "utf8");
11231
+ await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
11232
+ await writeFile7(wakeupDst, wakeupContent, "utf8");
10645
11233
  }
10646
11234
  skippedExisting.push(subagentDir);
10647
11235
  subagentsProvisioned += 1;
@@ -10656,8 +11244,8 @@ async function provisionSubagents(options) {
10656
11244
  if (!dryRun) {
10657
11245
  await ensureDir(subagentDir);
10658
11246
  await ensureDir(githubAgentsDir);
10659
- await writeFile6(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
10660
- await writeFile6(wakeupDst, wakeupContent, "utf8");
11247
+ await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
11248
+ await writeFile7(wakeupDst, wakeupContent, "utf8");
10661
11249
  }
10662
11250
  created.push(subagentDir);
10663
11251
  subagentsProvisioned += 1;
@@ -10982,7 +11570,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
10982
11570
 
10983
11571
  // src/evaluation/providers/targets-file.ts
10984
11572
  import { constants as constants3 } from "node:fs";
10985
- import { access as access3, readFile as readFile7 } from "node:fs/promises";
11573
+ import { access as access3, readFile as readFile8 } from "node:fs/promises";
10986
11574
  import path28 from "node:path";
10987
11575
  function isRecord(value) {
10988
11576
  return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -11026,7 +11614,7 @@ async function readTargetDefinitions(filePath) {
11026
11614
  if (!await fileExists2(absolutePath)) {
11027
11615
  throw new Error(`targets.yaml not found at ${absolutePath}`);
11028
11616
  }
11029
- const raw = await readFile7(absolutePath, "utf8");
11617
+ const raw = await readFile8(absolutePath, "utf8");
11030
11618
  const parsed = parseYamlValue(raw);
11031
11619
  if (!isRecord(parsed)) {
11032
11620
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
@@ -11217,6 +11805,7 @@ async function executePromptTemplate(script, context, config, timeoutMs) {
11217
11805
  output: context.output ?? null,
11218
11806
  inputFiles: context.evalCase.file_paths,
11219
11807
  input: context.evalCase.input,
11808
+ metadata: context.evalCase.metadata ?? null,
11220
11809
  trace: context.trace ?? null,
11221
11810
  fileChanges: context.fileChanges ?? null,
11222
11811
  workspacePath: context.workspacePath ?? null,
@@ -11734,7 +12323,7 @@ function getTCritical(df) {
11734
12323
  }
11735
12324
 
11736
12325
  // src/evaluation/workspace/manager.ts
11737
- import { cp, mkdir as mkdir12, readdir as readdir5, rm as rm4, stat as stat6 } from "node:fs/promises";
12326
+ import { cp, mkdir as mkdir13, readdir as readdir5, rm as rm4, stat as stat6 } from "node:fs/promises";
11738
12327
  import path33 from "node:path";
11739
12328
  var TemplateNotFoundError = class extends Error {
11740
12329
  constructor(templatePath) {
@@ -11768,7 +12357,7 @@ function getWorkspacePath(evalRunId, caseId, workspaceRoot) {
11768
12357
  return path33.join(root, evalRunId, caseId);
11769
12358
  }
11770
12359
  async function copyDirectoryRecursive(src, dest) {
11771
- await mkdir12(dest, { recursive: true });
12360
+ await mkdir13(dest, { recursive: true });
11772
12361
  const entries = await readdir5(src, { withFileTypes: true });
11773
12362
  for (const entry of entries) {
11774
12363
  const srcPath = path33.join(src, entry.name);
@@ -11843,7 +12432,7 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
11843
12432
  import { execFile } from "node:child_process";
11844
12433
  import { createHash } from "node:crypto";
11845
12434
  import { existsSync as existsSync3 } from "node:fs";
11846
- import { cp as cp2, mkdir as mkdir13, readFile as readFile8, readdir as readdir6, rm as rm5, unlink, writeFile as writeFile7 } from "node:fs/promises";
12435
+ import { cp as cp2, mkdir as mkdir14, readFile as readFile9, readdir as readdir6, rm as rm5, unlink, writeFile as writeFile8 } from "node:fs/promises";
11847
12436
  import path34 from "node:path";
11848
12437
  import { promisify as promisify5 } from "node:util";
11849
12438
  var execFileAsync = promisify5(execFile);
@@ -11897,7 +12486,7 @@ function computeWorkspaceFingerprint(repos) {
11897
12486
  return createHash("sha256").update(JSON.stringify(canonical)).digest("hex");
11898
12487
  }
11899
12488
  async function copyDirectoryRecursive2(src, dest, skipDirs) {
11900
- await mkdir13(dest, { recursive: true });
12489
+ await mkdir14(dest, { recursive: true });
11901
12490
  const entries = await readdir6(src, { withFileTypes: true });
11902
12491
  for (const entry of entries) {
11903
12492
  const srcPath = path34.join(src, entry.name);
@@ -11935,7 +12524,7 @@ var WorkspacePoolManager = class {
11935
12524
  const { templatePath, repos, maxSlots, repoManager, poolReset } = options;
11936
12525
  const fingerprint = computeWorkspaceFingerprint(repos);
11937
12526
  const poolDir = path34.join(this.poolRoot, fingerprint);
11938
- await mkdir13(poolDir, { recursive: true });
12527
+ await mkdir14(poolDir, { recursive: true });
11939
12528
  const drifted = await this.checkDrift(poolDir, fingerprint);
11940
12529
  if (drifted) {
11941
12530
  console.warn(
@@ -11962,7 +12551,7 @@ var WorkspacePoolManager = class {
11962
12551
  poolDir
11963
12552
  };
11964
12553
  }
11965
- await mkdir13(slotPath, { recursive: true });
12554
+ await mkdir14(slotPath, { recursive: true });
11966
12555
  if (templatePath) {
11967
12556
  await copyDirectoryRecursive2(templatePath, slotPath);
11968
12557
  }
@@ -11999,14 +12588,14 @@ var WorkspacePoolManager = class {
11999
12588
  async tryLock(lockPath) {
12000
12589
  for (let attempt = 0; attempt < 3; attempt++) {
12001
12590
  try {
12002
- await writeFile7(lockPath, String(process.pid), { flag: "wx" });
12591
+ await writeFile8(lockPath, String(process.pid), { flag: "wx" });
12003
12592
  return true;
12004
12593
  } catch (err) {
12005
12594
  if (err.code !== "EEXIST") {
12006
12595
  throw err;
12007
12596
  }
12008
12597
  try {
12009
- const pidStr = await readFile8(lockPath, "utf-8");
12598
+ const pidStr = await readFile9(lockPath, "utf-8");
12010
12599
  const pid = Number.parseInt(pidStr.trim(), 10);
12011
12600
  if (!Number.isNaN(pid)) {
12012
12601
  try {
@@ -12033,7 +12622,7 @@ var WorkspacePoolManager = class {
12033
12622
  async checkDrift(poolDir, fingerprint) {
12034
12623
  const metadataPath = path34.join(poolDir, "metadata.json");
12035
12624
  try {
12036
- const raw = await readFile8(metadataPath, "utf-8");
12625
+ const raw = await readFile9(metadataPath, "utf-8");
12037
12626
  const metadata = JSON.parse(raw);
12038
12627
  return metadata.fingerprint !== fingerprint;
12039
12628
  } catch {
@@ -12048,7 +12637,7 @@ var WorkspacePoolManager = class {
12048
12637
  repos,
12049
12638
  createdAt: (/* @__PURE__ */ new Date()).toISOString()
12050
12639
  };
12051
- await writeFile7(path34.join(poolDir, "metadata.json"), JSON.stringify(metadata, null, 2));
12640
+ await writeFile8(path34.join(poolDir, "metadata.json"), JSON.stringify(metadata, null, 2));
12052
12641
  }
12053
12642
  /** Remove all slot directories and their lock files from a pool directory. */
12054
12643
  async removeAllSlots(poolDir) {
@@ -12058,7 +12647,7 @@ var WorkspacePoolManager = class {
12058
12647
  const lockPath = path34.join(poolDir, `${entry}.lock`);
12059
12648
  if (existsSync3(lockPath)) {
12060
12649
  try {
12061
- const pidStr = await readFile8(lockPath, "utf-8");
12650
+ const pidStr = await readFile9(lockPath, "utf-8");
12062
12651
  const pid = Number.parseInt(pidStr.trim(), 10);
12063
12652
  if (!Number.isNaN(pid)) {
12064
12653
  try {
@@ -12417,9 +13006,10 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
12417
13006
  }
12418
13007
 
12419
13008
  // src/evaluation/yaml-parser.ts
12420
- import { readFile as readFile15, stat as stat8 } from "node:fs/promises";
13009
+ import { readFile as readFile16, stat as stat8 } from "node:fs/promises";
12421
13010
  import path43 from "node:path";
12422
13011
  import micromatch2 from "micromatch";
13012
+ import { stringify as stringifyYaml } from "yaml";
12423
13013
 
12424
13014
  // src/evaluation/input-message-utils.ts
12425
13015
  function flattenInputMessages(messages) {
@@ -12486,7 +13076,7 @@ function cloneJsonValue(value) {
12486
13076
  }
12487
13077
 
12488
13078
  // src/evaluation/loaders/agent-skills-parser.ts
12489
- import { readFile as readFile9 } from "node:fs/promises";
13079
+ import { readFile as readFile10 } from "node:fs/promises";
12490
13080
  import path37 from "node:path";
12491
13081
  var ANSI_RED = "\x1B[31m";
12492
13082
  var ANSI_RESET2 = "\x1B[0m";
@@ -12499,7 +13089,7 @@ function isAgentSkillsFormat(parsed) {
12499
13089
  return Array.isArray(obj.evals);
12500
13090
  }
12501
13091
  async function loadTestsFromAgentSkills(filePath) {
12502
- const raw = await readFile9(filePath, "utf8");
13092
+ const raw = await readFile10(filePath, "utf8");
12503
13093
  let parsed;
12504
13094
  try {
12505
13095
  parsed = JSON.parse(raw);
@@ -12566,7 +13156,7 @@ function parseAgentSkillsEvals(parsed, source = "evals.json", baseDir) {
12566
13156
  }
12567
13157
 
12568
13158
  // src/evaluation/loaders/config-loader.ts
12569
- import { readFile as readFile10 } from "node:fs/promises";
13159
+ import { readFile as readFile11 } from "node:fs/promises";
12570
13160
  import path39 from "node:path";
12571
13161
 
12572
13162
  // src/evaluation/loaders/file-resolver.ts
@@ -12680,53 +13270,59 @@ var DEFAULT_EVAL_PATTERNS = [
12680
13270
  ];
12681
13271
  async function loadConfig(evalFilePath, repoRoot) {
12682
13272
  const directories = buildDirectoryChain2(evalFilePath, repoRoot);
13273
+ const globalConfigPath = path39.join(getAgentvConfigDir(), "config.yaml");
12683
13274
  for (const directory of directories) {
12684
13275
  const configPath = path39.join(directory, ".agentv", "config.yaml");
12685
13276
  if (!await fileExists3(configPath)) {
12686
13277
  continue;
12687
13278
  }
12688
- try {
12689
- const rawConfig = await readFile10(configPath, "utf8");
12690
- const parsed = interpolateEnv(parseYamlValue(rawConfig), process.env);
12691
- if (!isJsonObject(parsed)) {
12692
- logWarning(`Invalid .agentv/config.yaml format at ${configPath}`);
12693
- continue;
12694
- }
12695
- const config = parsed;
12696
- const requiredVersion = parsed.required_version;
12697
- if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
12698
- logWarning(`Invalid required_version in ${configPath}, expected string`);
12699
- continue;
12700
- }
12701
- const evalPatterns = config.eval_patterns;
12702
- if (evalPatterns !== void 0 && !Array.isArray(evalPatterns)) {
12703
- logWarning(`Invalid eval_patterns in ${configPath}, expected array`);
12704
- continue;
12705
- }
12706
- if (Array.isArray(evalPatterns) && !evalPatterns.every((p) => typeof p === "string")) {
12707
- logWarning(`Invalid eval_patterns in ${configPath}, all entries must be strings`);
12708
- continue;
12709
- }
12710
- const executionDefaults = parseExecutionDefaults(
12711
- parsed.execution,
12712
- configPath
12713
- );
12714
- const results = parseResultsConfig(parsed.results, configPath);
12715
- const hooks = parseHooksConfig(parsed.hooks, configPath);
12716
- return {
12717
- required_version: requiredVersion,
12718
- eval_patterns: evalPatterns,
12719
- execution: executionDefaults,
12720
- results,
12721
- ...hooks && { hooks }
12722
- };
12723
- } catch (error) {
12724
- logWarning(
12725
- `Could not read .agentv/config.yaml at ${configPath}: ${error.message}`
12726
- );
13279
+ const config = await readConfigFile(configPath);
13280
+ if (config) {
13281
+ return config;
12727
13282
  }
12728
13283
  }
12729
- return null;
13284
+ return await fileExists3(globalConfigPath) ? readConfigFile(globalConfigPath) : null;
13285
+ }
13286
+ async function readConfigFile(configPath) {
13287
+ try {
13288
+ const rawConfig = await readFile11(configPath, "utf8");
13289
+ const parsed = interpolateEnv(parseYamlValue(rawConfig), process.env);
13290
+ if (!isJsonObject(parsed)) {
13291
+ logWarning(`Invalid config.yaml format at ${configPath}`);
13292
+ return null;
13293
+ }
13294
+ const config = parsed;
13295
+ const requiredVersion = parsed.required_version;
13296
+ if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
13297
+ logWarning(`Invalid required_version in ${configPath}, expected string`);
13298
+ return null;
13299
+ }
13300
+ const evalPatterns = config.eval_patterns;
13301
+ if (evalPatterns !== void 0 && !Array.isArray(evalPatterns)) {
13302
+ logWarning(`Invalid eval_patterns in ${configPath}, expected array`);
13303
+ return null;
13304
+ }
13305
+ if (Array.isArray(evalPatterns) && !evalPatterns.every((p) => typeof p === "string")) {
13306
+ logWarning(`Invalid eval_patterns in ${configPath}, all entries must be strings`);
13307
+ return null;
13308
+ }
13309
+ const executionDefaults = parseExecutionDefaults(
13310
+ parsed.execution,
13311
+ configPath
13312
+ );
13313
+ const results = parseResultsConfig(parsed.results, configPath);
13314
+ const hooks = parseHooksConfig(parsed.hooks, configPath);
13315
+ return {
13316
+ required_version: requiredVersion,
13317
+ eval_patterns: evalPatterns,
13318
+ execution: executionDefaults,
13319
+ results,
13320
+ ...hooks && { hooks }
13321
+ };
13322
+ } catch (error) {
13323
+ logWarning(`Could not read config.yaml at ${configPath}: ${error.message}`);
13324
+ return null;
13325
+ }
12730
13326
  }
12731
13327
  function extractTargetFromSuite(suite) {
12732
13328
  const execution = suite.execution;
@@ -12902,7 +13498,10 @@ function extractCacheConfig(suite) {
12902
13498
  logWarning(`Invalid execution.cache: ${cache}. Must be a boolean. Ignoring.`);
12903
13499
  return void 0;
12904
13500
  }
12905
- const cachePath = executionObj.cache_path ?? executionObj.cachePath;
13501
+ if (executionObj.cachePath !== void 0) {
13502
+ logWarning("Invalid execution.cachePath: use snake_case execution.cache_path in YAML.");
13503
+ }
13504
+ const cachePath = executionObj.cache_path;
12906
13505
  const resolvedCachePath = typeof cachePath === "string" && cachePath.trim().length > 0 ? cachePath.trim() : void 0;
12907
13506
  return { enabled: cache, cachePath: resolvedCachePath };
12908
13507
  }
@@ -13071,6 +13670,12 @@ function parseResultsConfig(raw, configPath) {
13071
13670
  ...branchPrefix && { branch_prefix: branchPrefix }
13072
13671
  };
13073
13672
  }
13673
+ function resolveResultsConfigForProject(config, _projectId) {
13674
+ if (!config) {
13675
+ return void 0;
13676
+ }
13677
+ return config.results;
13678
+ }
13074
13679
  function parseHooksConfig(raw, configPath) {
13075
13680
  if (raw === void 0 || raw === null) {
13076
13681
  return void 0;
@@ -13095,15 +13700,15 @@ function logWarning(message) {
13095
13700
  }
13096
13701
 
13097
13702
  // src/evaluation/loaders/grader-parser.ts
13098
- import { readFile as readFile12 } from "node:fs/promises";
13703
+ import { readFile as readFile13 } from "node:fs/promises";
13099
13704
  import path40 from "node:path";
13100
13705
 
13101
13706
  // src/evaluation/validation/prompt-validator.ts
13102
- import { readFile as readFile11 } from "node:fs/promises";
13707
+ import { readFile as readFile12 } from "node:fs/promises";
13103
13708
  var ANSI_YELLOW3 = "\x1B[33m";
13104
13709
  var ANSI_RESET4 = "\x1B[0m";
13105
13710
  async function validateCustomPromptContent(promptPath) {
13106
- const content = await readFile11(promptPath, "utf8");
13711
+ const content = await readFile12(promptPath, "utf8");
13107
13712
  validateTemplateVariables(content, promptPath);
13108
13713
  }
13109
13714
  function validateTemplateVariables(content, source) {
@@ -13235,7 +13840,7 @@ ${resolved.attempted.map((attempt) => ` Tried: ${attempt}`).join("\n")}` : "";
13235
13840
  const cycle = [...includeContext.chain, resolved.resolvedPath].join(" -> ");
13236
13841
  throw new Error(`Assertion template cycle detected in '${evalId}': ${cycle}`);
13237
13842
  }
13238
- const content = await readFile12(resolved.resolvedPath, "utf8");
13843
+ const content = await readFile13(resolved.resolvedPath, "utf8");
13239
13844
  const parsed = interpolateEnv(parseYamlValue(content), process.env);
13240
13845
  if (!isJsonObject2(parsed)) {
13241
13846
  throw new Error(
@@ -13282,6 +13887,103 @@ async function expandGraderEntries(candidateEvaluators, searchRoots, evalId, inc
13282
13887
  }
13283
13888
  return expanded;
13284
13889
  }
13890
+ async function collectAssertionTemplateSourceReferences(rawEvalCase, globalExecution, searchRoots, evalId) {
13891
+ const execution = rawEvalCase.execution;
13892
+ const executionObject = isJsonObject2(execution) ? execution : void 0;
13893
+ const caseEvaluators = rawEvalCase.assertions ?? rawEvalCase.assert ?? (executionObject ? executionObject.evaluators : void 0) ?? rawEvalCase.evaluators;
13894
+ const skipDefaults = executionObject?.skip_defaults === true;
13895
+ const rootEvaluators = skipDefaults ? void 0 : globalExecution?.assertions ?? globalExecution?.assert ?? globalExecution?.evaluators;
13896
+ return [
13897
+ ...await collectAssertionTemplateReferencesFromValue(caseEvaluators, searchRoots, evalId),
13898
+ ...await collectAssertionTemplateReferencesFromValue(rootEvaluators, searchRoots, evalId)
13899
+ ];
13900
+ }
13901
+ async function collectAssertionTemplateReferencesFromValue(value, searchRoots, evalId, includeContext = { depth: 0, chain: [] }) {
13902
+ if (value === void 0) {
13903
+ return [];
13904
+ }
13905
+ const references = [];
13906
+ if (Array.isArray(value)) {
13907
+ for (const item of value) {
13908
+ if (isIncludeEntry(item)) {
13909
+ const nextDepth = includeContext.depth + 1;
13910
+ if (nextDepth > MAX_ASSERTION_INCLUDE_DEPTH) {
13911
+ const chain = [...includeContext.chain, item.include].join(" -> ");
13912
+ throw new Error(
13913
+ `Assertion template include depth exceeded ${MAX_ASSERTION_INCLUDE_DEPTH} in '${evalId}'. Include chain: ${chain}`
13914
+ );
13915
+ }
13916
+ const resolved = await resolveAssertionTemplateReference(item.include, searchRoots);
13917
+ references.push({
13918
+ kind: "assertion_template",
13919
+ displayPath: resolved.displayPath,
13920
+ ...resolved.resolvedPath ? { resolvedPath: path40.resolve(resolved.resolvedPath) } : {}
13921
+ });
13922
+ if (resolved.resolvedPath) {
13923
+ if (includeContext.chain.includes(resolved.resolvedPath)) {
13924
+ const cycle = [...includeContext.chain, resolved.resolvedPath].join(" -> ");
13925
+ throw new Error(`Assertion template cycle detected in '${evalId}': ${cycle}`);
13926
+ }
13927
+ const content = await readFile13(resolved.resolvedPath, "utf8");
13928
+ const parsed = interpolateEnv(parseYamlValue(content), process.env);
13929
+ if (isJsonObject2(parsed) && Array.isArray(parsed.assertions)) {
13930
+ const templateDir = path40.dirname(resolved.resolvedPath);
13931
+ const nestedSearchRoots = [
13932
+ templateDir,
13933
+ ...searchRoots.filter((root) => path40.resolve(root) !== templateDir)
13934
+ ];
13935
+ references.push(
13936
+ ...await collectAssertionTemplateReferencesFromValue(
13937
+ parsed.assertions,
13938
+ nestedSearchRoots,
13939
+ evalId,
13940
+ {
13941
+ depth: nextDepth,
13942
+ chain: [...includeContext.chain, resolved.resolvedPath]
13943
+ }
13944
+ )
13945
+ );
13946
+ }
13947
+ }
13948
+ continue;
13949
+ }
13950
+ if (isJsonObject2(item)) {
13951
+ references.push(
13952
+ ...await collectAssertionTemplateReferencesFromObject(
13953
+ item,
13954
+ searchRoots,
13955
+ evalId,
13956
+ includeContext
13957
+ )
13958
+ );
13959
+ }
13960
+ }
13961
+ } else if (isJsonObject2(value)) {
13962
+ references.push(
13963
+ ...await collectAssertionTemplateReferencesFromObject(
13964
+ value,
13965
+ searchRoots,
13966
+ evalId,
13967
+ includeContext
13968
+ )
13969
+ );
13970
+ }
13971
+ return references;
13972
+ }
13973
+ async function collectAssertionTemplateReferencesFromObject(value, searchRoots, evalId, includeContext) {
13974
+ const references = [];
13975
+ for (const key of ["assertions", "assert", "evaluators"]) {
13976
+ references.push(
13977
+ ...await collectAssertionTemplateReferencesFromValue(
13978
+ value[key],
13979
+ searchRoots,
13980
+ evalId,
13981
+ includeContext
13982
+ )
13983
+ );
13984
+ }
13985
+ return references;
13986
+ }
13285
13987
  async function parseGraderList(candidateEvaluators, searchRoots, evalId, defaultPreprocessors) {
13286
13988
  const expandedEvaluators = await expandGraderEntries(candidateEvaluators, searchRoots, evalId);
13287
13989
  if (!expandedEvaluators) {
@@ -13408,6 +14110,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
13408
14110
  continue;
13409
14111
  }
13410
14112
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
14113
+ const resolvedScriptPath = await resolveOptionalCommandSource(command, searchRoots);
13411
14114
  const cwd = asString(rawEvaluator.cwd);
13412
14115
  let resolvedCwd;
13413
14116
  if (cwd) {
@@ -13473,6 +14176,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
13473
14176
  name,
13474
14177
  type: "code-grader",
13475
14178
  command,
14179
+ ...resolvedScriptPath ? { resolvedScriptPath } : {},
13476
14180
  cwd,
13477
14181
  resolvedCwd,
13478
14182
  ...weight2 !== void 0 ? { weight: weight2 } : {},
@@ -14540,6 +15244,17 @@ function asStringArray(value, description) {
14540
15244
  }
14541
15245
  return result;
14542
15246
  }
15247
+ async function resolveOptionalCommandSource(command, searchRoots) {
15248
+ const candidate = command.at(-1);
15249
+ if (!candidate || !looksLikeFilePath(candidate)) {
15250
+ return void 0;
15251
+ }
15252
+ const resolved = await resolveFileReference(candidate, searchRoots);
15253
+ return resolved.resolvedPath ? path40.resolve(resolved.resolvedPath) : void 0;
15254
+ }
15255
+ function looksLikeFilePath(value) {
15256
+ return path40.isAbsolute(value) || value.startsWith(".") || value.includes("/") || value.includes("\\") || /\.[cm]?[jt]sx?$|\.py$|\.sh$|\.bash$|\.rb$|\.go$|\.rs$/i.test(value);
15257
+ }
14543
15258
  function parseCommandToArgv(command) {
14544
15259
  if (process.platform === "win32") {
14545
15260
  return ["cmd.exe", "/c", command];
@@ -14608,6 +15323,19 @@ var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average",
14608
15323
  function isValidFieldAggregationType(value) {
14609
15324
  return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
14610
15325
  }
15326
+ var VALID_RUBRIC_OPERATORS = new Set(RUBRIC_OPERATOR_VALUES);
15327
+ function parseRubricOperator(value, rubricId, evaluatorName, evalId) {
15328
+ if (value === void 0) {
15329
+ return void 0;
15330
+ }
15331
+ if (typeof value === "string" && VALID_RUBRIC_OPERATORS.has(value)) {
15332
+ return value;
15333
+ }
15334
+ logWarning2(
15335
+ `Ignoring invalid operator for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': must be one of ${RUBRIC_OPERATOR_VALUES.join(", ")}`
15336
+ );
15337
+ return void 0;
15338
+ }
14611
15339
  function parseRubricItems(rawRubrics, evaluatorName, evalId) {
14612
15340
  const items = [];
14613
15341
  for (const [index, rawRubric] of rawRubrics.entries()) {
@@ -14618,7 +15346,8 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
14618
15346
  continue;
14619
15347
  }
14620
15348
  const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
14621
- const expectedOutcome = asString(rawRubric.outcome) ?? "";
15349
+ const expectedOutcome = asString(rawRubric.outcome) ?? asString(rawRubric.criteria) ?? "";
15350
+ const operator = parseRubricOperator(rawRubric.operator, id, evaluatorName, evalId);
14622
15351
  const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
14623
15352
  let minScore;
14624
15353
  let requiredMinScore;
@@ -14662,6 +15391,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
14662
15391
  id,
14663
15392
  weight,
14664
15393
  ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
15394
+ ...operator !== void 0 ? { operator } : {},
14665
15395
  ...required !== void 0 ? { required } : {},
14666
15396
  ...minScore !== void 0 ? { min_score: minScore } : {},
14667
15397
  ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
@@ -14677,6 +15407,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
14677
15407
  items.push({
14678
15408
  id,
14679
15409
  outcome: expectedOutcome,
15410
+ ...operator !== void 0 ? { operator } : {},
14680
15411
  weight,
14681
15412
  // Default to required: true if not specified (backward compatibility)
14682
15413
  required: required ?? true,
@@ -14799,6 +15530,8 @@ function parseInlineRubrics(rawRubrics) {
14799
15530
  };
14800
15531
  }
14801
15532
  const expectedOutcome = asString(rubric.outcome) ?? "";
15533
+ const id = asString(rubric.id) ?? `rubric-${index + 1}`;
15534
+ const operator = parseRubricOperator(rubric.operator, id, "rubrics", "<inline>");
14802
15535
  const rawScoreRanges = rubric.score_ranges;
14803
15536
  const normalizedScoreRanges = rawScoreRanges !== void 0 ? normalizeScoreRangesShorthand(rawScoreRanges) : void 0;
14804
15537
  const scoreRanges = Array.isArray(normalizedScoreRanges) && normalizedScoreRanges.length > 0 ? normalizedScoreRanges.filter((r) => isJsonObject2(r)).map((range) => ({
@@ -14806,7 +15539,8 @@ function parseInlineRubrics(rawRubrics) {
14806
15539
  outcome: asString(range.outcome) ?? ""
14807
15540
  })).filter((r) => r.outcome.length > 0) : void 0;
14808
15541
  const baseRubric = {
14809
- id: asString(rubric.id) ?? `rubric-${index + 1}`,
15542
+ id,
15543
+ ...operator !== void 0 ? { operator } : {},
14810
15544
  weight: typeof rubric.weight === "number" ? rubric.weight : 1
14811
15545
  };
14812
15546
  let inlineMinScore;
@@ -14847,12 +15581,12 @@ function parseInlineRubrics(rawRubrics) {
14847
15581
  }
14848
15582
 
14849
15583
  // src/evaluation/loaders/jsonl-parser.ts
14850
- import { readFile as readFile14 } from "node:fs/promises";
15584
+ import { readFile as readFile15 } from "node:fs/promises";
14851
15585
  import path42 from "node:path";
14852
15586
  import micromatch from "micromatch";
14853
15587
 
14854
15588
  // src/evaluation/loaders/message-processor.ts
14855
- import { readFile as readFile13 } from "node:fs/promises";
15589
+ import { readFile as readFile14 } from "node:fs/promises";
14856
15590
  import path41 from "node:path";
14857
15591
 
14858
15592
  // src/evaluation/formatting/segment-formatter.ts
@@ -14979,7 +15713,7 @@ async function processMessages(options) {
14979
15713
  continue;
14980
15714
  }
14981
15715
  try {
14982
- const fileContent = (await readFile13(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
15716
+ const fileContent = (await readFile14(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
14983
15717
  processedContent.push({
14984
15718
  ...cloneJsonObject(rawSegment),
14985
15719
  path: displayPath,
@@ -15020,7 +15754,7 @@ async function processMessages(options) {
15020
15754
  continue;
15021
15755
  }
15022
15756
  try {
15023
- const imageBuffer = await readFile13(resolvedPath);
15757
+ const imageBuffer = await readFile14(resolvedPath);
15024
15758
  const base64 = imageBuffer.toString("base64");
15025
15759
  processedContent.push({
15026
15760
  type: "image",
@@ -15103,7 +15837,7 @@ async function processExpectedMessages(options) {
15103
15837
  continue;
15104
15838
  }
15105
15839
  try {
15106
- const fileContent = (await readFile13(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
15840
+ const fileContent = (await readFile14(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
15107
15841
  processedContent.push({
15108
15842
  type: "file",
15109
15843
  path: displayPath,
@@ -15143,7 +15877,7 @@ async function processExpectedMessages(options) {
15143
15877
  continue;
15144
15878
  }
15145
15879
  try {
15146
- const imageBuffer = await readFile13(resolvedPath);
15880
+ const imageBuffer = await readFile14(resolvedPath);
15147
15881
  const base64 = imageBuffer.toString("base64");
15148
15882
  processedContent.push({
15149
15883
  type: "image",
@@ -15185,6 +15919,12 @@ function expandInputShorthand(value) {
15185
15919
  if (typeof value === "string") {
15186
15920
  return [{ role: "user", content: value }];
15187
15921
  }
15922
+ if (isJsonObject(value)) {
15923
+ if ("role" in value) {
15924
+ return isTestMessage(value) ? [value] : void 0;
15925
+ }
15926
+ return [{ role: "user", content: value }];
15927
+ }
15188
15928
  if (Array.isArray(value)) {
15189
15929
  const messages = value.filter((msg) => isTestMessage(msg));
15190
15930
  return messages.length > 0 ? messages : void 0;
@@ -15272,7 +16012,7 @@ async function loadSidecarMetadata(jsonlPath, verbose) {
15272
16012
  return {};
15273
16013
  }
15274
16014
  try {
15275
- const content = await readFile14(sidecarPath, "utf8");
16015
+ const content = await readFile15(sidecarPath, "utf8");
15276
16016
  const parsed = interpolateEnv(parseYamlValue(content), process.env);
15277
16017
  if (!isJsonObject(parsed)) {
15278
16018
  logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
@@ -15317,7 +16057,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
15317
16057
  const repoRootPath = resolveToAbsolutePath(repoRoot);
15318
16058
  const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
15319
16059
  const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
15320
- const rawFile = await readFile14(absoluteTestPath, "utf8");
16060
+ const rawFile = await readFile15(absoluteTestPath, "utf8");
15321
16061
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
15322
16062
  const fallbackSuiteName = path42.basename(absoluteTestPath, ".jsonl") || "eval";
15323
16063
  const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
@@ -15454,16 +16194,16 @@ ${detailBlock}${ANSI_RESET7}`);
15454
16194
  }
15455
16195
 
15456
16196
  // src/evaluation/metadata.ts
15457
- import { z as z3 } from "zod";
15458
- var MetadataSchema = z3.object({
15459
- name: z3.string().min(1).max(64).regex(/^[a-z0-9-]+$/).optional(),
15460
- description: z3.string().min(1).max(1024).optional(),
15461
- version: z3.string().optional(),
15462
- author: z3.string().optional(),
15463
- tags: z3.array(z3.string()).optional(),
15464
- license: z3.string().optional(),
15465
- requires: z3.object({
15466
- agentv: z3.string().optional()
16197
+ import { z as z4 } from "zod";
16198
// Zod schema for evaluation metadata. Every field is optional:
//   name        - 1 to 64 characters, lowercase letters, digits and hyphens only
//   description - 1 to 1024 characters
//   version, author, license - free-form strings
//   tags        - array of strings
//   requires    - object with an optional `agentv` string
+ var MetadataSchema = z4.object({
16199
+ name: z4.string().min(1).max(64).regex(/^[a-z0-9-]+$/).optional(),
16200
+ description: z4.string().min(1).max(1024).optional(),
16201
+ version: z4.string().optional(),
16202
+ author: z4.string().optional(),
16203
+ tags: z4.array(z4.string()).optional(),
16204
+ license: z4.string().optional(),
16205
+ requires: z4.object({
16206
+ agentv: z4.string().optional()
15467
16207
  }).optional()
15468
16208
  });
15469
16209
  function parseMetadata(suite) {
@@ -15735,7 +16475,7 @@ function interpolateRawEvalCase(raw, vars) {
15735
16475
  async function readTestSuiteMetadata(testFilePath) {
15736
16476
  try {
15737
16477
  const absolutePath = path43.resolve(testFilePath);
15738
- const content = await readFile15(absolutePath, "utf8");
16478
+ const content = await readFile16(absolutePath, "utf8");
15739
16479
  const parsed = interpolateEnv(parseYamlValue(content), process.env);
15740
16480
  if (!isJsonObject(parsed)) {
15741
16481
  return {};
@@ -15759,7 +16499,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
15759
16499
  return { tests: await loadTestsFromAgentSkills(evalFilePath) };
15760
16500
  }
15761
16501
  if (format === "typescript") {
15762
- const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-FRQF6KHR.js");
16502
+ const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-EQJX3OLT.js");
15763
16503
  return loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
15764
16504
  }
15765
16505
  const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(
@@ -15794,7 +16534,7 @@ async function loadTests(evalFilePath, repoRoot, options) {
15794
16534
  return loadTestsFromAgentSkills(evalFilePath);
15795
16535
  }
15796
16536
  if (format === "typescript") {
15797
- const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-FRQF6KHR.js");
16537
+ const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-EQJX3OLT.js");
15798
16538
  const suite = await loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
15799
16539
  return suite.tests;
15800
16540
  }
@@ -15809,8 +16549,10 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
15809
16549
  const repoRootPath = resolveToAbsolutePath(repoRoot);
15810
16550
  const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
15811
16551
  const config = await loadConfig(absoluteTestPath, repoRootPath);
15812
- const rawFile = await readFile15(absoluteTestPath, "utf8");
15813
- const interpolated = interpolateEnv(parseYamlValue(rawFile), process.env);
16552
+ const rawFile = await readFile16(absoluteTestPath, "utf8");
16553
+ const rawParsed = parseYamlValue(rawFile);
16554
+ const rawCaseSnapshots = buildRawInlineTestSnapshots(rawParsed);
16555
+ const interpolated = interpolateEnv(rawParsed, process.env);
15814
16556
  if (!isJsonObject(interpolated)) {
15815
16557
  throw new Error(`Invalid test file format: ${evalFilePath}`);
15816
16558
  }
@@ -15847,7 +16589,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
15847
16589
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
15848
16590
  }
15849
16591
  const suiteWorkspace = await resolveWorkspaceConfig(suite.workspace, evalFileDir);
15850
- const suiteGovernance = extractSuiteGovernance(suite);
16592
+ const suiteMetadataPayload = extractSuiteMetadataPayload(suite);
15851
16593
  const rawSuiteInput = suite.input;
15852
16594
  const rawSuiteInputFiles = suite.input_files;
15853
16595
  const rawGlobalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
@@ -15949,6 +16691,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
15949
16691
  logError3(`Skipping test '${id}': ${message}`);
15950
16692
  continue;
15951
16693
  }
16694
+ const assertionTemplateReferences = await collectAssertionTemplateSourceReferences(
16695
+ renderedCase,
16696
+ globalExecution,
16697
+ searchRoots,
16698
+ id ?? "unknown"
16699
+ );
15952
16700
  const inlineRubrics = renderedCase.rubrics;
15953
16701
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
15954
16702
  const rubricEvaluator = parseInlineRubrics(inlineRubrics);
@@ -15961,8 +16709,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
15961
16709
  const caseWorkspace = await resolveWorkspaceConfig(renderedCase.workspace, evalFileDir);
15962
16710
  const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
15963
16711
  const rawCaseMetadata = isJsonObject(renderedCase.metadata) ? renderedCase.metadata : void 0;
15964
- const suitePayload = suiteGovernance !== void 0 ? { governance: suiteGovernance } : void 0;
15965
- const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suitePayload);
16712
+ const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suiteMetadataPayload);
15966
16713
  const caseTargets = extractTargetsFromTestCase(renderedCase);
15967
16714
  const dependsOn = Array.isArray(renderedCase.depends_on) ? renderedCase.depends_on.filter(
15968
16715
  (v) => typeof v === "string"
@@ -16001,12 +16748,245 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
16001
16748
  ...onTurnFailure ? { on_turn_failure: onTurnFailure } : {},
16002
16749
  ...windowSize !== void 0 ? { window_size: windowSize } : {},
16003
16750
  ...dependsOn && dependsOn.length > 0 ? { depends_on: dependsOn } : {},
16004
- ...onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {}
16751
+ ...onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {},
16752
+ source: buildEvalTestSource({
16753
+ evalFilePath,
16754
+ absoluteTestPath,
16755
+ repoRootPath,
16756
+ id,
16757
+ renderedCase,
16758
+ rawCaseSnapshots,
16759
+ inputMessages,
16760
+ evaluators,
16761
+ assertionTemplateReferences
16762
+ })
16005
16763
  };
16006
16764
  results.push(testCase);
16007
16765
  }
16008
16766
  return { tests: results, parsed: suite, suiteWorkspacePath: suiteWorkspace?.path };
16009
16767
  }
16768
// Case-insensitive pattern tested against object KEY names (not values) by
// sanitizeSourceValue. It is a substring match, so any key containing one of
// these words matches (for example a key that merely contains "token").
+ var SOURCE_SECRET_KEY_PATTERN = /(api[_-]?key|authorization|bearer|credential|password|private[_-]?key|secret|token)/i;
16769
// Placeholder returned by sanitizeSourceValue in place of a value whose key matched the pattern above.
+ var REDACTED_SOURCE_VALUE = "[redacted]";
16770
/**
 * Builds a lookup of sanitized YAML snapshots for the inline tests of a parsed
 * eval file, keyed by test id.
 *
 * The test list is read from `tests`, falling back to `eval_cases` and then
 * `evalcases`. Entries that are not objects or that lack a string `id` are
 * ignored; when two entries share an id, the later one wins.
 *
 * @param {unknown} rawParsed - Parsed eval file content.
 * @returns {Map<string, string>} Test id to YAML snapshot; empty when
 *   `rawParsed` is not an object or carries no test array.
 */
function buildRawInlineTestSnapshots(rawParsed) {
  const snapshotsById = /* @__PURE__ */ new Map();
  const candidateTests = isJsonObject(rawParsed)
    ? (rawParsed.tests ?? rawParsed.eval_cases ?? rawParsed.evalcases)
    : void 0;
  if (Array.isArray(candidateTests)) {
    for (const entry of candidateTests) {
      if (isJsonObject(entry) && typeof entry.id === "string") {
        snapshotsById.set(entry.id, stringifySourceYaml(entry));
      }
    }
  }
  return snapshotsById;
}
16787
/**
 * Assembles the `source` record attached to a loaded test case.
 *
 * @param {object} params
 * @param {string} params.evalFilePath - Eval file path, stored unchanged.
 * @param {string} params.absoluteTestPath - Absolute path of the eval file.
 * @param {string} params.repoRootPath - Root used to derive the repo-relative path.
 * @param {string} params.id - Test id; also the snapshot lookup key.
 * @param {object} params.renderedCase - Rendered test case, stringified only
 *   when no raw snapshot exists for `id`.
 * @param {Map<string, string>} params.rawCaseSnapshots - Snapshots keyed by test id.
 * @param {Array} params.inputMessages - Input messages scanned for file references.
 * @param {Array | undefined} params.evaluators - Grader configurations.
 * @param {Array} params.assertionTemplateReferences - Extra references to include.
 * @returns {object} Source record; `evalFileRepoPath` is present only when
 *   `toPortableRelativePath` returns a non-empty value.
 */
function buildEvalTestSource(params) {
  const { absoluteTestPath, evaluators, id } = params;
  const repoRelativePath = toPortableRelativePath(params.repoRootPath, absoluteTestPath);
  const snapshotYaml = params.rawCaseSnapshots.get(id) ?? stringifySourceYaml(params.renderedCase);
  const graderReferences = collectGraderSourceReferences(evaluators);
  const inputFileReferences = collectInputSourceReferences(params.inputMessages);
  // Input references come first so they win when the same reference repeats.
  const uniqueReferences = dedupeSourceReferences([
    ...inputFileReferences,
    ...graderReferences,
    ...params.assertionTemplateReferences
  ]);

  const source = {
    evalFilePath: params.evalFilePath,
    evalFileAbsolutePath: absoluteTestPath
  };
  if (repoRelativePath) {
    source.evalFileRepoPath = repoRelativePath;
  }
  source.testId = id;
  source.testSnapshotYaml = snapshotYaml;
  source.graderDefinitions = buildGraderSourceDefinitions(evaluators);
  source.references = uniqueReferences;
  return source;
}
16807
/**
 * Serializes a value to YAML for source snapshots.
 *
 * The value is first passed through `sanitizeSourceValue`, then stringified
 * with `lineWidth: 0`, and trailing whitespace is trimmed from the result.
 *
 * @param {unknown} value - Value to snapshot.
 * @returns {string} YAML text without trailing whitespace.
 */
function stringifySourceYaml(value) {
  const sanitized = sanitizeSourceValue(value);
  const yamlText = stringifyYaml(sanitized, { lineWidth: 0 });
  return yamlText.trimEnd();
}
16810
/**
 * Produces a copy of `value` that is safe to embed in source snapshots.
 *
 * Redaction is driven by KEY names: when `keyHint` matches
 * `SOURCE_SECRET_KEY_PATTERN`, the whole value (whatever its type, including
 * nested objects and arrays) is replaced with `REDACTED_SOURCE_VALUE`.
 * Array items are sanitized without a key hint, so they are only redacted
 * through keys nested inside them.
 *
 * `undefined` is passed through unchanged. It used to fall through to
 * `String(value)` and surface as the literal string "undefined", and a
 * secret-named key with no value was reported as "[redacted]".
 *
 * @param {unknown} value - Value to sanitize; it is not mutated.
 * @param {string} [keyHint] - Name of the property holding `value`, if any.
 * @returns {unknown} `null`, `undefined`, strings, numbers and booleans as-is;
 *   arrays and objects as sanitized copies; any other type (bigint, symbol,
 *   function) converted with `String()`.
 */
function sanitizeSourceValue(value, keyHint) {
  if (value === void 0) {
    // Nothing to snapshot and nothing to redact.
    return void 0;
  }
  if (keyHint && SOURCE_SECRET_KEY_PATTERN.test(keyHint)) {
    return REDACTED_SOURCE_VALUE;
  }
  if (value === null) {
    return value;
  }
  switch (typeof value) {
    case "string":
    case "number":
    case "boolean":
      return value;
    default:
      break;
  }
  if (Array.isArray(value)) {
    return value.map((item) => sanitizeSourceValue(item));
  }
  if (typeof value === "object") {
    return Object.fromEntries(
      Object.entries(value).map(([key, entryValue]) => [
        key,
        sanitizeSourceValue(entryValue, key)
      ])
    );
  }
  return String(value);
}
16832
/**
 * Converts grader configurations into source-definition records.
 *
 * Each record carries `name` and `type`, then `weight`, `required` and
 * `minScore` (taken from `min_score`) only when they are defined, followed by
 * `definition`, the result of `sanitizeGraderDefinition` for that grader.
 *
 * @param {Array<object> | null | undefined} evaluators - Grader configurations.
 * @returns {Array<object>} One record per grader; empty when `evaluators` is nullish.
 */
function buildGraderSourceDefinitions(evaluators) {
  const definitions = [];
  for (const grader of evaluators ?? []) {
    const record = { name: grader.name, type: grader.type };
    if (grader.weight !== void 0) {
      record.weight = grader.weight;
    }
    if (grader.required !== void 0) {
      record.required = grader.required;
    }
    if ("min_score" in grader && grader.min_score !== void 0) {
      record.minScore = grader.min_score;
    }
    record.definition = sanitizeGraderDefinition(grader);
    definitions.push(record);
  }
  return definitions;
}
16842
/**
 * Returns a grader configuration prepared for source metadata: first
 * sanitized with `sanitizeSourceValue`, then passed through
 * `stripRuntimeResolutionFields`.
 *
 * @param {object} evaluator - Grader configuration.
 * @returns {object} Sanitized copy without runtime-resolution fields.
 */
function sanitizeGraderDefinition(evaluator) {
  return stripRuntimeResolutionFields(sanitizeSourceValue(evaluator));
}
16846
/**
 * Returns a copy of `value` without the fields that hold runtime-resolved
 * locations: `resolvedPromptPath`, `promptPath`, `resolvedPromptScript`,
 * `resolvedScriptPath`, `resolvedCwd` and `resolvedCommand`.
 *
 * Nested objects, and objects that are direct items of an array, are stripped
 * recursively; every other value is copied by reference.
 *
 * @param {object} value - Object to copy.
 * @returns {object} New object without the runtime-resolution fields.
 */
function stripRuntimeResolutionFields(value) {
  const stripIfObject = (candidate) => isJsonObject(candidate) ? stripRuntimeResolutionFields(candidate) : candidate;
  const kept = {};
  for (const [key, entryValue] of Object.entries(value)) {
    switch (key) {
      case "resolvedPromptPath":
      case "promptPath":
      case "resolvedPromptScript":
      case "resolvedScriptPath":
      case "resolvedCwd":
      case "resolvedCommand":
        // Runtime-only field: leave it out of the copy.
        continue;
      default:
        break;
    }
    kept[key] = Array.isArray(entryValue) ? entryValue.map((item) => stripIfObject(item)) : stripIfObject(entryValue);
  }
  return kept;
}
16864
/**
 * Lists the file segments referenced by a test's input messages.
 *
 * Only messages whose `content` is an array are inspected, and only object
 * segments with `type === "file"` produce a reference. The display path is
 * `segment.path`, else `segment.value`, else the literal "input file".
 * A string `segment.resolvedPath` is stored after `path.resolve`.
 *
 * @param {Array<object>} inputMessages - Input messages of the test.
 * @returns {Array<object>} `input_file` references in message/segment order.
 */
function collectInputSourceReferences(inputMessages) {
  const toReference = (segment) => {
    let displayPath = "input file";
    if (typeof segment.path === "string") {
      displayPath = segment.path;
    } else if (typeof segment.value === "string") {
      displayPath = segment.value;
    }
    const reference = { kind: "input_file", displayPath };
    if (typeof segment.resolvedPath === "string") {
      reference.resolvedPath = path43.resolve(segment.resolvedPath);
    }
    return reference;
  };
  const collected = [];
  for (const { content } of inputMessages) {
    if (Array.isArray(content)) {
      const fileSegments = content.filter((segment) => isJsonObject(segment) && segment.type === "file");
      collected.push(...fileSegments.map(toReference));
    }
  }
  return collected;
}
16884
/**
 * Gathers the source references of every grader, in grader order.
 *
 * @param {Array<object> | null | undefined} evaluators - Grader configurations.
 * @returns {Array<object>} Concatenated results of
 *   `collectSingleGraderSourceReferences`; empty when `evaluators` is nullish.
 */
function collectGraderSourceReferences(evaluators) {
  const graders = evaluators ?? [];
  return graders.reduce((collected, grader) => {
    collected.push(...collectSingleGraderSourceReferences(grader));
    return collected;
  }, []);
}
16891
/**
 * Collects the file and command references that one grader configuration
 * points at.
 *
 * Reference kinds produced:
 * - `code-grader`: one `code_grader_command` (always) and one
 *   `code_grader_cwd` when `resolvedCwd` is set.
 * - `llm-grader`: `llm_grader_prompt` when a prompt path is known, and
 *   `prompt_script` when `resolvedPromptScript` is non-empty.
 * - any grader with `preprocessors`: one `preprocessor_command` per
 *   preprocessor that has a non-empty `resolvedCommand`.
 * - `composite`: the references of every member in `assertions` (recursively),
 *   plus one reference for a `code-grader` or `llm-grader` aggregator.
 *
 * This is metadata collection, so incomplete configurations are tolerated
 * instead of raising a TypeError: a command given as a single string is
 * treated as a one-element command, and a composite with no `assertions` or
 * no `aggregator` simply contributes nothing for the missing part.
 *
 * @param {object} evaluator - Grader configuration.
 * @returns {Array<object>} References in the order listed above.
 */
function collectSingleGraderSourceReferences(evaluator) {
  const references = [];
  const graderName = evaluator.name;

  if (evaluator.type === "code-grader") {
    const rawCommand = evaluator.command ?? evaluator.script ?? [];
    // `.join` below and the `command` field both expect an array.
    const command = Array.isArray(rawCommand) ? rawCommand : [String(rawCommand)];
    references.push({
      kind: "code_grader_command",
      displayPath: evaluator.resolvedScriptPath ?? command.join(" "),
      ...(evaluator.resolvedScriptPath ? { resolvedPath: evaluator.resolvedScriptPath } : {}),
      graderName,
      command
    });
    if (evaluator.resolvedCwd) {
      references.push({
        kind: "code_grader_cwd",
        displayPath: evaluator.cwd ?? evaluator.resolvedCwd,
        resolvedPath: evaluator.resolvedCwd,
        graderName
      });
    }
  }

  if (evaluator.type === "llm-grader") {
    const promptPath = evaluator.resolvedPromptPath ?? evaluator.promptPath;
    if (promptPath) {
      references.push({
        kind: "llm_grader_prompt",
        // Prefer the prompt text as written when it is a string.
        displayPath: typeof evaluator.prompt === "string" ? evaluator.prompt : promptPath,
        resolvedPath: promptPath,
        graderName
      });
    }
    const promptScript = evaluator.resolvedPromptScript;
    if (promptScript && promptScript.length > 0) {
      // The last element of the command is reported as the script path.
      const scriptPath = promptScript.at(-1);
      references.push({
        kind: "prompt_script",
        displayPath: scriptPath ?? graderName,
        resolvedPath: scriptPath,
        graderName,
        command: promptScript
      });
    }
  }

  const preprocessors = "preprocessors" in evaluator ? evaluator.preprocessors : void 0;
  for (const preprocessor of preprocessors ?? []) {
    const resolvedCommand = preprocessor.resolvedCommand;
    if (resolvedCommand && resolvedCommand.length > 0) {
      const commandPath = resolvedCommand.at(-1);
      references.push({
        kind: "preprocessor_command",
        displayPath: commandPath ?? preprocessor.type,
        resolvedPath: commandPath,
        graderName,
        command: resolvedCommand
      });
    }
  }

  if (evaluator.type === "composite") {
    for (const member of evaluator.assertions ?? []) {
      references.push(...collectSingleGraderSourceReferences(member));
    }
    const aggregator = evaluator.aggregator;
    if (aggregator?.type === "code-grader") {
      references.push({
        kind: "code_grader_command",
        displayPath: aggregator.path,
        resolvedPath: path43.resolve(aggregator.cwd ?? "", aggregator.path),
        graderName
      });
    } else if (aggregator?.type === "llm-grader" && aggregator.promptPath) {
      references.push({
        kind: "llm_grader_prompt",
        displayPath: aggregator.prompt ?? aggregator.promptPath,
        resolvedPath: aggregator.promptPath,
        graderName
      });
    }
  }

  return references;
}
16965
/**
 * Removes repeated source references while keeping first-seen order.
 *
 * Two references are considered the same when they agree on `kind`, on
 * `resolvedPath` (or `displayPath` when there is no resolved path), on
 * `graderName`, and on the full `command`.
 *
 * @param {Iterable<object>} references - References to filter.
 * @returns {Array<object>} The first occurrence of each distinct reference.
 */
function dedupeSourceReferences(references) {
  const firstByIdentity = /* @__PURE__ */ new Map();
  for (const reference of references) {
    const identity = JSON.stringify([
      reference.kind,
      reference.resolvedPath ?? reference.displayPath,
      reference.graderName ?? "",
      reference.command?.join("\0") ?? ""
    ]);
    if (!firstByIdentity.has(identity)) {
      firstByIdentity.set(identity, reference);
    }
  }
  // Map iteration follows insertion order, i.e. first-seen order.
  return [...firstByIdentity.values()];
}
16983
/**
 * Expresses `candidate` relative to `root`, using forward slashes.
 *
 * @param {string} root - Directory the result is relative to.
 * @param {string} candidate - Path expected to live inside `root`.
 * @returns {string | undefined} The relative path with "/" separators, or
 *   `undefined` when `candidate` equals `root`, lies outside `root`, or cannot
 *   be expressed relatively (`path.relative` returned an absolute path).
 */
function toPortableRelativePath(root, candidate) {
  const relative = path43.relative(root, candidate);
  if (relative === "" || path43.isAbsolute(relative)) {
    return void 0;
  }
  // Only a leading ".." SEGMENT means the path leaves root. A plain
  // startsWith("..") would also reject in-root names such as "..cache/x".
  const escapesRoot = relative === ".." || relative.startsWith(`..${path43.sep}`);
  if (escapesRoot) {
    return void 0;
  }
  return relative.split(path43.sep).join("/");
}
16010
16990
  async function loadTestById(evalFilePath, repoRoot, evalId) {
16011
16991
  const tests = await loadTests(evalFilePath, repoRoot);
16012
16992
  const match = tests.find((c) => c.id === evalId);
@@ -16099,7 +17079,7 @@ async function resolveWorkspaceConfig(raw, evalFileDir) {
16099
17079
  const workspaceFilePath = path43.resolve(evalFileDir, raw);
16100
17080
  let content;
16101
17081
  try {
16102
- content = await readFile15(workspaceFilePath, "utf8");
17082
+ content = await readFile16(workspaceFilePath, "utf8");
16103
17083
  } catch {
16104
17084
  throw new Error(`Workspace file not found: ${raw} (resolved to ${workspaceFilePath})`);
16105
17085
  }
@@ -16223,19 +17203,18 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
16223
17203
  function asString5(value) {
16224
17204
  return typeof value === "string" ? value : void 0;
16225
17205
  }
16226
/**
 * Builds the suite-level metadata payload that is merged into each test's
 * metadata.
 *
 * The payload starts as a shallow copy of `suite.metadata` (when that is an
 * object). A top-level `suite.governance` object replaces any `governance`
 * entry from the metadata; otherwise whatever `metadata.governance` holds is
 * carried over unchanged by the copy.
 *
 * @param {object} suite - Parsed suite definition.
 * @returns {object | undefined} The payload, or `undefined` when it would have no keys.
 */
function extractSuiteMetadataPayload(suite) {
  const payload = isJsonObject(suite.metadata) ? { ...suite.metadata } : {};
  if (isJsonObject(suite.governance)) {
    payload.governance = suite.governance;
  }
  const hasEntries = Object.keys(payload).length > 0;
  return hasEntries ? payload : void 0;
}
16240
17219
  function mergeSuiteMetadataPayload(caseMetadata, suitePayload) {
16241
17220
  if (!suitePayload) return caseMetadata;
@@ -16726,7 +17705,7 @@ async function runEvaluation(options) {
16726
17705
  const isEmpty = dirExists ? (await readdir8(configuredStaticPath)).length === 0 : false;
16727
17706
  if (isYamlConfiguredPath && (!dirExists || isEmpty)) {
16728
17707
  if (!dirExists) {
16729
- await mkdir14(configuredStaticPath, { recursive: true });
17708
+ await mkdir15(configuredStaticPath, { recursive: true });
16730
17709
  }
16731
17710
  if (workspaceTemplate) {
16732
17711
  await copyDirectoryRecursive(workspaceTemplate, configuredStaticPath);
@@ -16771,7 +17750,7 @@ async function runEvaluation(options) {
16771
17750
  }
16772
17751
  } else if (!isPerTestIsolation && (suiteWorkspace?.hooks || suiteWorkspace?.repos?.length)) {
16773
17752
  sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
16774
- await mkdir14(sharedWorkspacePath, { recursive: true });
17753
+ await mkdir15(sharedWorkspacePath, { recursive: true });
16775
17754
  setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`);
16776
17755
  }
16777
17756
  try {
@@ -17621,7 +18600,7 @@ async function runEvalCase(options) {
17621
18600
  }
17622
18601
  if (!workspacePath && (evalCase.workspace?.hooks || evalCase.workspace?.repos?.length) && evalRunId) {
17623
18602
  workspacePath = getWorkspacePath(evalRunId, evalCase.id);
17624
- await mkdir14(workspacePath, { recursive: true });
18603
+ await mkdir15(workspacePath, { recursive: true });
17625
18604
  }
17626
18605
  if (evalCase.workspace?.repos?.length && workspacePath) {
17627
18606
  const localPathErrors = RepoManager.validateLocalPaths(evalCase.workspace.repos);
@@ -17676,7 +18655,7 @@ async function runEvalCase(options) {
17676
18655
  const srcPath = path44.resolve(baseDir, relPath);
17677
18656
  const destPath = path44.resolve(workspacePath, relPath);
17678
18657
  try {
17679
- await mkdir14(path44.dirname(destPath), { recursive: true });
18658
+ await mkdir15(path44.dirname(destPath), { recursive: true });
17680
18659
  await copyFile2(srcPath, destPath);
17681
18660
  } catch (error) {
17682
18661
  const message = error instanceof Error ? error.message : String(error);
@@ -19244,6 +20223,12 @@ async function evaluate(config) {
19244
20223
  resolvedTarget = resolveTargetDefinition(targetDef);
19245
20224
  }
19246
20225
  const collectedResults = [];
20226
+ const cacheEnabled = shouldEnableCache({
20227
+ cliCache: config.cache === true,
20228
+ cliNoCache: false,
20229
+ yamlCache: config.cache === void 0 ? materialized.cache : void 0
20230
+ });
20231
+ const cache = cacheEnabled ? new ResponseCache(materialized.cachePath ? path45.resolve(materialized.cachePath) : void 0) : void 0;
19247
20232
  const results = await runEvaluation({
19248
20233
  testFilePath,
19249
20234
  repoRoot,
@@ -19256,6 +20241,8 @@ async function evaluate(config) {
19256
20241
  filter: config.filter,
19257
20242
  threshold: config.threshold,
19258
20243
  evalCases: materialized.tests,
20244
+ cache,
20245
+ useCache: !!cache && !shouldSkipCacheForTemperature(resolvedTarget.config),
19259
20246
  ...materialized.budgetUsd !== void 0 && { budgetUsd: materialized.budgetUsd },
19260
20247
  onResult: async (result) => {
19261
20248
  collectedResults.push(result);
@@ -19286,6 +20273,7 @@ async function materializeEvalConfig(config, options) {
19286
20273
  tests: tests2,
19287
20274
  workers: config.workers ?? suite.workers,
19288
20275
  cache: config.cache ?? suite.cacheConfig?.enabled,
20276
+ cachePath: config.cachePath ?? suite.cacheConfig?.cachePath,
19289
20277
  budgetUsd: config.budgetUsd ?? suite.budgetUsd,
19290
20278
  threshold: config.threshold ?? suite.threshold,
19291
20279
  metadata: config.metadata ?? suite.metadata,
@@ -19304,6 +20292,7 @@ async function materializeEvalConfig(config, options) {
19304
20292
  tests,
19305
20293
  workers: config.workers,
19306
20294
  cache: config.cache,
20295
+ cachePath: config.cachePath,
19307
20296
  budgetUsd: config.budgetUsd,
19308
20297
  threshold: config.threshold,
19309
20298
  metadata: config.metadata,
@@ -19421,9 +20410,11 @@ function mapAssertionType(type) {
19421
20410
  }
19422
20411
  function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
19423
20412
  const total = results.length;
20413
+ const qualityResults = results.filter((r) => r.executionStatus !== "execution_error");
20414
+ const executionErrors = total - qualityResults.length;
19424
20415
  let passed = 0;
19425
20416
  let scoreSum = 0;
19426
- for (const r of results) {
20417
+ for (const r of qualityResults) {
19427
20418
  scoreSum += r.score;
19428
20419
  if (r.score >= threshold) {
19429
20420
  passed++;
@@ -19432,9 +20423,10 @@ function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
19432
20423
  return {
19433
20424
  total,
19434
20425
  passed,
19435
- failed: total - passed,
20426
+ failed: qualityResults.length - passed,
20427
+ executionErrors,
19436
20428
  durationMs,
19437
- meanScore: total > 0 ? scoreSum / total : 0
20429
+ meanScore: qualityResults.length > 0 ? scoreSum / qualityResults.length : 0
19438
20430
  };
19439
20431
  }
19440
20432
  var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
@@ -19517,7 +20509,12 @@ async function loadTsEvalSuite(filePath, repoRoot, options) {
19517
20509
  return {
19518
20510
  tests: materialized.tests,
19519
20511
  ...materialized.workers !== void 0 && { workers: materialized.workers },
19520
- ...materialized.cache !== void 0 && { cacheConfig: { enabled: materialized.cache } },
20512
+ ...materialized.cache !== void 0 && {
20513
+ cacheConfig: {
20514
+ enabled: materialized.cache,
20515
+ ...materialized.cachePath !== void 0 && { cachePath: materialized.cachePath }
20516
+ }
20517
+ },
19521
20518
  ...materialized.budgetUsd !== void 0 && { budgetUsd: materialized.budgetUsd },
19522
20519
  ...materialized.threshold !== void 0 && { threshold: materialized.threshold },
19523
20520
  ...materialized.metadata !== void 0 && { metadata: materialized.metadata },
@@ -19540,7 +20537,28 @@ function isEvalConfigLike(value) {
19540
20537
  }
19541
20538
 
19542
20539
  export {
20540
+ NORMALIZED_TRAJECTORY_SCHEMA_VERSION,
20541
+ NORMALIZED_TRACE_SOURCE_KINDS,
20542
+ NORMALIZED_TRACE_EVENT_TYPES,
20543
+ NORMALIZED_TOOL_STATUSES,
20544
+ NORMALIZED_REDACTION_LEVELS,
20545
+ NormalizedRedactionStateWireSchema,
20546
+ NormalizedTraceErrorWireSchema,
20547
+ NormalizedTraceSourceWireSchema,
20548
+ NormalizedTraceSessionWireSchema,
20549
+ NormalizedTraceBranchWireSchema,
20550
+ NormalizedTraceSourceRefWireSchema,
20551
+ NormalizedRawEvidenceWireSchema,
20552
+ NormalizedTraceMessageWireSchema,
20553
+ NormalizedTraceModelWireSchema,
20554
+ NormalizedTraceToolWireSchema,
20555
+ NormalizedTraceEventWireSchema,
20556
+ NormalizedTrajectoryWireSchema,
20557
+ toNormalizedTrajectoryWire,
20558
+ fromNormalizedTrajectoryWire,
19543
20559
  computeTraceSummary,
20560
+ getSelectedTrajectoryEvents,
20561
+ computeTraceSummaryFromTrajectory,
19544
20562
  DEFAULT_EXPLORATION_TOOLS,
19545
20563
  explorationRatio,
19546
20564
  tokensPerTool,
@@ -19559,11 +20577,15 @@ export {
19559
20577
  extractCacheConfig,
19560
20578
  extractFailOnError,
19561
20579
  extractThreshold,
20580
+ resolveResultsConfigForProject,
19562
20581
  detectFormat,
19563
20582
  parseRepoSource,
19564
20583
  parseRepoCheckout,
19565
20584
  parseRepoClone,
19566
20585
  buildPromptInputs,
20586
+ ResponseCache,
20587
+ shouldEnableCache,
20588
+ shouldSkipCacheForTemperature,
19567
20589
  DEFAULT_THRESHOLD,
19568
20590
  PASS_THRESHOLD,
19569
20591
  scoreToVerdict,
@@ -19574,12 +20596,6 @@ export {
19574
20596
  parseJsonSafe,
19575
20597
  deepEqual,
19576
20598
  negateScore,
19577
- getAgentvConfigDir,
19578
- getAgentvHome,
19579
- getWorkspacesRoot,
19580
- getSubagentsRoot,
19581
- getTraceStateRoot,
19582
- getWorkspacePoolRoot,
19583
20599
  toSnakeCaseDeep,
19584
20600
  toCamelCaseDeep,
19585
20601
  CodeGrader,
@@ -19672,4 +20688,4 @@ export {
19672
20688
  loadTestById,
19673
20689
  loadEvalCaseById
19674
20690
  };
19675
- //# sourceMappingURL=chunk-575K7WRM.js.map
20691
+ //# sourceMappingURL=chunk-7QB53OPK.js.map