@agentv/core 1.5.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -32,15 +32,20 @@ var index_exports = {};
32
32
  __export(index_exports, {
33
33
  CodeEvaluator: () => CodeEvaluator,
34
34
  CompositeEvaluator: () => CompositeEvaluator,
35
+ CostEvaluator: () => CostEvaluator,
35
36
  DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
37
+ FieldAccuracyEvaluator: () => FieldAccuracyEvaluator,
38
+ LatencyEvaluator: () => LatencyEvaluator,
36
39
  LlmJudgeEvaluator: () => LlmJudgeEvaluator,
37
40
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
41
+ TokenUsageEvaluator: () => TokenUsageEvaluator,
38
42
  ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
39
43
  avgToolDurationMs: () => avgToolDurationMs,
40
44
  buildDirectoryChain: () => buildDirectoryChain2,
41
45
  buildPromptInputs: () => buildPromptInputs,
42
46
  buildSearchRoots: () => buildSearchRoots2,
43
47
  computeTraceSummary: () => computeTraceSummary,
48
+ consumeClaudeCodeLogEntries: () => consumeClaudeCodeLogEntries,
44
49
  consumeCodexLogEntries: () => consumeCodexLogEntries,
45
50
  consumePiLogEntries: () => consumePiLogEntries,
46
51
  createAgentKernel: () => createAgentKernel,
@@ -71,6 +76,7 @@ __export(index_exports, {
71
76
  resolveTargetDefinition: () => resolveTargetDefinition,
72
77
  runEvalCase: () => runEvalCase,
73
78
  runEvaluation: () => runEvaluation,
79
+ subscribeToClaudeCodeLogEntries: () => subscribeToClaudeCodeLogEntries,
74
80
  subscribeToCodexLogEntries: () => subscribeToCodexLogEntries,
75
81
  subscribeToPiLogEntries: () => subscribeToPiLogEntries,
76
82
  tokensPerTool: () => tokensPerTool
@@ -129,7 +135,11 @@ var EVALUATOR_KIND_VALUES = [
129
135
  "llm_judge",
130
136
  "rubric",
131
137
  "composite",
132
- "tool_trajectory"
138
+ "tool_trajectory",
139
+ "field_accuracy",
140
+ "latency",
141
+ "cost",
142
+ "token_usage"
133
143
  ];
134
144
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
135
145
  function isEvaluatorKind(value) {
@@ -551,7 +561,22 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
551
561
  continue;
552
562
  }
553
563
  if (typeValue === "code_judge") {
554
- const script = asString2(rawEvaluator.script);
564
+ let script;
565
+ const rawScript = rawEvaluator.script;
566
+ if (typeof rawScript === "string") {
567
+ const trimmed = rawScript.trim();
568
+ if (trimmed.length === 0) {
569
+ throw new Error(
570
+ `Invalid code_judge script for evaluator '${name}' in '${evalId}': script cannot be empty`
571
+ );
572
+ }
573
+ script = parseCommandToArgv(trimmed);
574
+ } else {
575
+ script = asStringArray(
576
+ rawScript,
577
+ `code_judge script for evaluator '${name}' in '${evalId}'`
578
+ );
579
+ }
555
580
  if (!script) {
556
581
  logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
557
582
  continue;
@@ -572,13 +597,21 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
572
597
  } else {
573
598
  resolvedCwd = searchRoots[0];
574
599
  }
600
+ const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
601
+ const config = {};
602
+ for (const [key, value] of Object.entries(rawEvaluator)) {
603
+ if (!knownProps.has(key) && value !== void 0) {
604
+ config[key] = value;
605
+ }
606
+ }
575
607
  evaluators.push({
576
608
  name,
577
609
  type: "code",
578
610
  script,
579
611
  cwd,
580
612
  resolvedCwd,
581
- ...weight2 !== void 0 ? { weight: weight2 } : {}
613
+ ...weight2 !== void 0 ? { weight: weight2 } : {},
614
+ ...Object.keys(config).length > 0 ? { config } : {}
582
615
  });
583
616
  continue;
584
617
  }
@@ -753,6 +786,140 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
753
786
  evaluators.push(config);
754
787
  continue;
755
788
  }
789
+ if (typeValue === "field_accuracy") {
790
+ const rawFields = rawEvaluator.fields;
791
+ if (!Array.isArray(rawFields)) {
792
+ logWarning2(
793
+ `Skipping field_accuracy evaluator '${name}' in '${evalId}': missing fields array`
794
+ );
795
+ continue;
796
+ }
797
+ if (rawFields.length === 0) {
798
+ logWarning2(
799
+ `Skipping field_accuracy evaluator '${name}' in '${evalId}': fields array is empty`
800
+ );
801
+ continue;
802
+ }
803
+ const fields = [];
804
+ for (const rawField of rawFields) {
805
+ if (!isJsonObject2(rawField)) {
806
+ logWarning2(
807
+ `Skipping invalid field entry in field_accuracy evaluator '${name}' (expected object)`
808
+ );
809
+ continue;
810
+ }
811
+ const fieldPath = asString2(rawField.path);
812
+ const match = asString2(rawField.match);
813
+ if (!fieldPath) {
814
+ logWarning2(
815
+ `Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
816
+ );
817
+ continue;
818
+ }
819
+ if (!match || !isValidFieldMatchType(match)) {
820
+ logWarning2(
821
+ `Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code_judge evaluator.`
822
+ );
823
+ continue;
824
+ }
825
+ const fieldConfig = {
826
+ path: fieldPath,
827
+ match,
828
+ ...typeof rawField.required === "boolean" ? { required: rawField.required } : {},
829
+ ...typeof rawField.weight === "number" ? { weight: rawField.weight } : {},
830
+ ...typeof rawField.tolerance === "number" ? { tolerance: rawField.tolerance } : {},
831
+ ...typeof rawField.relative === "boolean" ? { relative: rawField.relative } : {},
832
+ ...Array.isArray(rawField.formats) ? { formats: rawField.formats.filter((f) => typeof f === "string") } : {}
833
+ };
834
+ fields.push(fieldConfig);
835
+ }
836
+ if (fields.length === 0) {
837
+ logWarning2(
838
+ `Skipping field_accuracy evaluator '${name}' in '${evalId}': no valid fields found`
839
+ );
840
+ continue;
841
+ }
842
+ const aggregation = asString2(rawEvaluator.aggregation);
843
+ const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
844
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
845
+ evaluators.push({
846
+ name,
847
+ type: "field_accuracy",
848
+ fields,
849
+ ...validAggregation ? { aggregation: validAggregation } : {},
850
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
851
+ });
852
+ continue;
853
+ }
854
+ if (typeValue === "latency") {
855
+ const threshold = rawEvaluator.threshold;
856
+ if (typeof threshold !== "number" || threshold < 0) {
857
+ logWarning2(
858
+ `Skipping latency evaluator '${name}' in '${evalId}': threshold must be a non-negative number`
859
+ );
860
+ continue;
861
+ }
862
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
863
+ evaluators.push({
864
+ name,
865
+ type: "latency",
866
+ threshold,
867
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
868
+ });
869
+ continue;
870
+ }
871
+ if (typeValue === "cost") {
872
+ const budget = rawEvaluator.budget;
873
+ if (typeof budget !== "number" || budget < 0) {
874
+ logWarning2(
875
+ `Skipping cost evaluator '${name}' in '${evalId}': budget must be a non-negative number`
876
+ );
877
+ continue;
878
+ }
879
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
880
+ evaluators.push({
881
+ name,
882
+ type: "cost",
883
+ budget,
884
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
885
+ });
886
+ continue;
887
+ }
888
+ if (typeValue === "token_usage") {
889
+ const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
890
+ const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
891
+ const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
892
+ const limits = [
893
+ ["max_total", maxTotal],
894
+ ["max_input", maxInput],
895
+ ["max_output", maxOutput]
896
+ ];
897
+ const validLimits = {};
898
+ for (const [key, raw] of limits) {
899
+ if (raw === void 0) continue;
900
+ if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
901
+ logWarning2(
902
+ `Skipping token_usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
903
+ );
904
+ continue;
905
+ }
906
+ validLimits[key] = raw;
907
+ }
908
+ if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
909
+ logWarning2(
910
+ `Skipping token_usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
911
+ );
912
+ continue;
913
+ }
914
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
915
+ evaluators.push({
916
+ name,
917
+ type: "token_usage",
918
+ ...validLimits,
919
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
920
+ });
921
+ continue;
922
+ }
756
923
  const prompt = asString2(rawEvaluator.prompt);
757
924
  let promptPath;
758
925
  if (prompt) {
@@ -823,6 +990,34 @@ function coerceEvaluator(candidate, contextId) {
823
990
  function asString2(value) {
824
991
  return typeof value === "string" ? value : void 0;
825
992
  }
993
/**
 * Validates an optional argv-style list and returns a fresh copy of it.
 *
 * @param {unknown} value - Candidate list; `undefined` means "not provided".
 * @param {string} description - Label used as the prefix of every error message.
 * @returns {string[] | undefined} A new array with the same entries, or
 *   `undefined` when `value` is `undefined`.
 * @throws {Error} When `value` is not an array, is empty, or has an entry that
 *   is not a string or is blank after trimming.
 */
function asStringArray(value, description) {
  if (value === undefined) {
    return undefined;
  }
  if (!Array.isArray(value)) {
    throw new Error(`${description} must be an array of strings (argv tokens)`);
  }
  if (value.length === 0) {
    throw new Error(`${description} cannot be empty`);
  }
  // Array.from visits every index (holes included), so a sparse slot is
  // rejected as a non-string just like any other invalid entry.
  return Array.from(value, (token, position) => {
    if (typeof token !== "string") {
      throw new Error(`${description}[${position}] must be a string`);
    }
    if (token.trim() === "") {
      throw new Error(`${description}[${position}] cannot be empty`);
    }
    return token;
  });
}
1015
/**
 * Wraps a shell command string into an argv array that runs it through the
 * platform shell: `cmd.exe /c` on Windows, `sh -lc` everywhere else.
 *
 * @param {string} command - Command line to hand to the shell verbatim.
 * @returns {string[]} Three-element argv: shell, flag, command.
 */
function parseCommandToArgv(command) {
  const shellInvocation = process.platform === "win32" ? ["cmd.exe", "/c"] : ["sh", "-lc"];
  return [...shellInvocation, command];
}
826
1021
  function isJsonObject2(value) {
827
1022
  return typeof value === "object" && value !== null && !Array.isArray(value);
828
1023
  }
@@ -856,6 +1051,14 @@ function validateWeight(rawWeight, evaluatorName, evalId) {
856
1051
  }
857
1052
  return rawWeight;
858
1053
  }
1054
// Match strategies a field_accuracy field entry may declare.
var VALID_FIELD_MATCH_TYPES = /* @__PURE__ */ new Set(["exact", "numeric_tolerance", "date"]);
/**
 * Tells whether `value` is one of the supported field match type names.
 *
 * @param {unknown} value - Candidate match type.
 * @returns {boolean} `true` only for the strings listed in VALID_FIELD_MATCH_TYPES.
 */
function isValidFieldMatchType(value) {
  if (typeof value !== "string") {
    return false;
  }
  return VALID_FIELD_MATCH_TYPES.has(value);
}
1058
// Aggregation modes a field_accuracy evaluator may declare.
var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average", "all_or_nothing"]);
/**
 * Tells whether `value` is one of the supported field aggregation names.
 *
 * @param {unknown} value - Candidate aggregation type.
 * @returns {boolean} `true` only for the strings listed in VALID_FIELD_AGGREGATION_TYPES.
 */
function isValidFieldAggregationType(value) {
  if (typeof value !== "string") {
    return false;
  }
  return VALID_FIELD_AGGREGATION_TYPES.has(value);
}
859
1062
 
860
1063
  // src/evaluation/loaders/message-processor.ts
861
1064
  var import_promises4 = require("fs/promises");
@@ -1930,92 +2133,993 @@ async function withRetry(fn, retryConfig, signal) {
1930
2133
  throw lastError;
1931
2134
  }
1932
2135
 
1933
- // src/evaluation/providers/cli.ts
2136
+ // src/evaluation/providers/claude-code.ts
1934
2137
  var import_node_child_process = require("child_process");
1935
- var import_promises8 = __toESM(require("fs/promises"), 1);
1936
- var import_node_os = __toESM(require("os"), 1);
1937
- var import_node_path8 = __toESM(require("path"), 1);
1938
- var import_node_util = require("util");
1939
- var execAsync = (0, import_node_util.promisify)(import_node_child_process.exec);
1940
- var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
1941
- async function defaultCommandRunner(command, options) {
1942
- const execOptions = {
1943
- cwd: options.cwd,
1944
- env: options.env,
1945
- timeout: options.timeoutMs,
1946
- signal: options.signal,
1947
- maxBuffer: DEFAULT_MAX_BUFFER,
1948
- shell: process.platform === "win32" ? "powershell.exe" : void 0
2138
+ var import_node_crypto = require("crypto");
2139
+ var import_node_fs3 = require("fs");
2140
+ var import_promises8 = require("fs/promises");
2141
+ var import_node_os = require("os");
2142
+ var import_node_path9 = __toESM(require("path"), 1);
2143
+
2144
+ // src/evaluation/providers/claude-code-log-tracker.ts
2145
// Registry-wide symbols: Symbol.for returns the same symbol for the same key,
// so every copy of this module resolves to the same globalThis slots.
var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeCodeLogs");
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.claudeCodeLogSubscribers");
/**
 * Returns the shared array of Claude Code log entries stored on `globalThis`,
 * creating and registering an empty one on first use.
 *
 * @returns {Array} The live store (not a copy); mutations are visible to all users.
 */
function getClaudeCodeLogStore() {
  const registry = globalThis;
  if (!registry[GLOBAL_LOGS_KEY]) {
    registry[GLOBAL_LOGS_KEY] = [];
  }
  return registry[GLOBAL_LOGS_KEY];
}
2157
/**
 * Returns the shared Set of log-entry listeners stored on `globalThis` under
 * GLOBAL_SUBSCRIBERS_KEY, creating and registering an empty Set on first use.
 *
 * @returns {Set} The live listener set (not a copy).
 */
function getSubscriberStore() {
  const registry = globalThis;
  if (!registry[GLOBAL_SUBSCRIBERS_KEY]) {
    registry[GLOBAL_SUBSCRIBERS_KEY] = /* @__PURE__ */ new Set();
  }
  return registry[GLOBAL_SUBSCRIBERS_KEY];
}
2167
/**
 * Delivers a log entry to every registered listener.
 *
 * A listener that throws does not stop delivery to the others; its failure is
 * reported with `console.warn`.
 *
 * @param {object} entry - Log entry passed unchanged to each listener.
 */
function notifySubscribers(entry) {
  // Iterate over a snapshot so listeners may subscribe/unsubscribe while being notified.
  const snapshot = [...getSubscriberStore()];
  snapshot.forEach((notify) => {
    try {
      notify(entry);
    } catch (failure) {
      const reason = failure instanceof Error ? failure.message : String(failure);
      console.warn(`Claude Code log subscriber failed: ${reason}`);
    }
  });
}
2178
/**
 * Appends a log entry to the shared store, then notifies the subscribers.
 *
 * @param {object} entry - Log entry to store and broadcast.
 */
function recordClaudeCodeLogEntry(entry) {
  const pendingEntries = getClaudeCodeLogStore();
  pendingEntries.push(entry);
  notifySubscribers(entry);
}
2182
/**
 * Removes and returns every entry currently held in the shared log store.
 *
 * @returns {Array} The drained entries in insertion order; a fresh empty array
 *   when nothing is pending. The store itself is left empty.
 */
function consumeClaudeCodeLogEntries() {
  const pendingEntries = getClaudeCodeLogStore();
  // splice(0) empties the store in place and hands back what was removed.
  return pendingEntries.length > 0 ? pendingEntries.splice(0) : [];
}
2189
+ function subscribeToClaudeCodeLogEntries(listener) {
2190
+ const store = getSubscriberStore();
2191
+ store.add(listener);
2192
+ return () => {
2193
+ store.delete(listener);
1949
2194
  };
1950
- try {
1951
- const { stdout, stderr } = await execAsync(command, execOptions);
1952
- return {
1953
- stdout,
1954
- stderr,
1955
- exitCode: 0,
1956
- failed: false,
1957
- timedOut: false,
1958
- signal: null
1959
- };
1960
- } catch (error) {
1961
- const execError = error;
1962
- return {
1963
- stdout: execError.stdout ?? "",
1964
- stderr: execError.stderr ?? "",
1965
- exitCode: typeof execError.code === "number" ? execError.code : null,
1966
- failed: true,
1967
- timedOut: execError.timedOut === true || execError.killed === true,
1968
- signal: execError.signal ?? null
1969
- };
2195
+ }
2196
+
2197
+ // src/evaluation/providers/preread.ts
2198
+ var import_node_path8 = __toESM(require("path"), 1);
2199
/**
 * Assembles the prompt text sent to a provider: an optional preread block
 * listing guideline/input files, followed by the `user_query` section.
 *
 * @param {object} request - Reads `question` (trimmed) and, as a fallback,
 *   `guideline_patterns`.
 * @param {string[] | undefined} inputFiles - Files attached to the request.
 * @param {object} [options] - Optional `guidelinePatterns` (takes precedence
 *   over the request's patterns) and `guidelineOverrides`.
 * @returns {string} The assembled document with surrounding whitespace trimmed.
 */
function buildPromptDocument(request, inputFiles, options) {
  const guidelineFiles = collectGuidelineFiles(
    inputFiles,
    options?.guidelinePatterns ?? request.guideline_patterns,
    options?.guidelineOverrides
  );
  // Input files that are also guidelines are listed once, under guidelines.
  const guidelineLookup = new Set(guidelineFiles);
  const otherInputFiles = collectInputFiles(inputFiles).filter(
    (candidate) => !guidelineLookup.has(candidate)
  );
  const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, otherInputFiles);
  const segments = prereadBlock.length > 0 ? ["\n", prereadBlock] : [];
  segments.push("\n[[ ## user_query ## ]]\n", request.question.trim());
  return segments.join("\n").trim();
}
1972
- var CliProvider = class {
1973
- id;
1974
- kind = "cli";
1975
- targetName;
1976
- supportsBatch = true;
1977
- config;
1978
- runCommand;
1979
- verbose;
1980
- keepTempFiles;
1981
- healthcheckPromise;
1982
- constructor(targetName, config, runner = defaultCommandRunner) {
1983
- this.targetName = targetName;
1984
- this.id = `cli:${targetName}`;
1985
- this.config = config;
1986
- this.runCommand = runner;
1987
- this.verbose = config.verbose ?? false;
1988
- this.keepTempFiles = config.keepTempFiles ?? false;
2215
+ function normalizeInputFiles(inputFiles) {
2216
+ if (!inputFiles || inputFiles.length === 0) {
2217
+ return void 0;
1989
2218
  }
1990
- async invoke(request) {
1991
- if (request.signal?.aborted) {
1992
- throw new Error("CLI provider request was aborted before execution");
2219
+ const deduped = /* @__PURE__ */ new Map();
2220
+ for (const inputFile of inputFiles) {
2221
+ const absolutePath = import_node_path8.default.resolve(inputFile);
2222
+ if (!deduped.has(absolutePath)) {
2223
+ deduped.set(absolutePath, absolutePath);
1993
2224
  }
1994
- await this.ensureHealthy(request.signal);
1995
- const outputFilePath = generateOutputFilePath(request.evalCaseId);
1996
- const templateValues = buildTemplateValues(request, this.config, outputFilePath);
1997
- const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
1998
- if (this.verbose) {
1999
- console.log(
2000
- `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
2001
- );
2225
+ }
2226
+ return Array.from(deduped.values());
2227
+ }
2228
+ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
2229
+ if (!inputFiles || inputFiles.length === 0) {
2230
+ return [];
2231
+ }
2232
+ const unique = /* @__PURE__ */ new Map();
2233
+ for (const inputFile of inputFiles) {
2234
+ const absolutePath = import_node_path8.default.resolve(inputFile);
2235
+ if (overrides?.has(absolutePath)) {
2236
+ if (!unique.has(absolutePath)) {
2237
+ unique.set(absolutePath, absolutePath);
2238
+ }
2239
+ continue;
2002
2240
  }
2003
- const startTime = Date.now();
2004
- const result = await this.runCommand(renderedCommand, {
2005
- cwd: this.config.cwd,
2006
- env: process.env,
2007
- timeoutMs: this.config.timeoutMs,
2008
- signal: request.signal
2009
- });
2010
- const measuredDurationMs = Date.now() - startTime;
2011
- if (result.failed || (result.exitCode ?? 0) !== 0) {
2012
- if (request.signal?.aborted) {
2013
- throw new Error("CLI provider request was aborted");
2241
+ const normalized = absolutePath.split(import_node_path8.default.sep).join("/");
2242
+ if (isGuidelineFile(normalized, guidelinePatterns)) {
2243
+ if (!unique.has(absolutePath)) {
2244
+ unique.set(absolutePath, absolutePath);
2014
2245
  }
2015
- if (result.timedOut) {
2016
- throw new Error(
2017
- `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
2018
- );
2246
+ }
2247
+ }
2248
+ return Array.from(unique.values());
2249
+ }
2250
/**
 * Resolves every input file to an absolute path and removes duplicates,
 * keeping the position of each path's first occurrence.
 *
 * @param {string[] | undefined} inputFiles - Paths as supplied by the caller.
 * @returns {string[]} Unique absolute paths; empty when nothing was supplied.
 */
function collectInputFiles(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return [];
  }
  const resolvedPaths = /* @__PURE__ */ new Set();
  for (const candidate of inputFiles) {
    resolvedPaths.add(import_node_path8.default.resolve(candidate));
  }
  return [...resolvedPaths];
}
2263
/**
 * Builds the instruction block telling the agent which files to read before
 * answering. Each file is rendered as a markdown bullet linking its base name
 * to its file URI.
 *
 * @param {string[]} guidelineFiles - Absolute paths of guideline files.
 * @param {string[]} inputFiles - Absolute paths of the remaining input files.
 * @returns {string} The newline-joined block, or `""` when both lists are empty.
 */
function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
  if (guidelineFiles.length === 0 && inputFiles.length === 0) {
    return "";
  }
  const toBulletList = (files) => files.map(
    (absolutePath) => `* [${import_node_path8.default.basename(absolutePath)}](${pathToFileUri(absolutePath)})`
  ).join("\n");
  const sections = [];
  if (guidelineFiles.length > 0) {
    sections.push(`Read all guideline files:\n${toBulletList(guidelineFiles)}.`);
  }
  if (inputFiles.length > 0) {
    sections.push(`Read all input files:\n${toBulletList(inputFiles)}.`);
  }
  sections.push("If any file is missing, fail with ERROR: missing-file <filename> and stop.");
  sections.push("Then apply system_instructions on the user query below.");
  return sections.join("\n");
}
2287
/**
 * Converts a filesystem path to a `file://` URI string.
 *
 * Relative paths are resolved first, backslashes become forward slashes, and
 * a Windows drive path (`C:/...`) gets the extra leading slash
 * (`file:///C:/...`). No percent-encoding is applied.
 *
 * @param {string} filePath - Absolute or relative path.
 * @returns {string} The URI text.
 */
function pathToFileUri(filePath) {
  const pathApi = import_node_path8.default;
  let absolutePath = filePath;
  if (!pathApi.isAbsolute(filePath)) {
    absolutePath = pathApi.resolve(filePath);
  }
  const forwardSlashed = absolutePath.replace(/\\/g, "/");
  const startsWithDrive = /^[a-zA-Z]:\//.test(forwardSlashed);
  return startsWithDrive ? `file:///${forwardSlashed}` : `file://${forwardSlashed}`;
}
2295
+
2296
+ // src/evaluation/providers/claude-code.ts
2297
+ var WORKSPACE_PREFIX = "agentv-claude-code-";
2298
+ var PROMPT_FILENAME = "prompt.md";
2299
+ var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
2300
+ - Do NOT create any additional output files in the workspace.
2301
+ - All intended file outputs/changes MUST be written in your response.
2302
+ - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
2303
+ This is required for evaluation scoring.`;
2304
/**
 * Provider that answers a request by running the Claude Code CLI through an
 * injectable runner and converting its stdout into output messages and usage.
 *
 * Each `invoke` creates a temporary workspace (removed again before `invoke`
 * settles) and, unless disabled, a stream log file that mirrors the CLI's
 * stdout/stderr. Logging is best-effort: failures to create or close the log
 * are reported with `console.warn` and never fail the request.
 */
var ClaudeCodeProvider = class {
  id;
  kind = "claude-code";
  targetName;
  supportsBatch = false;
  config;
  runClaudeCode;
  /**
   * @param {string} targetName - Target name; also used in `id` and log file names.
   * @param {object} config - Reads `executable`, `model`, `args`, `systemPrompt`,
   *   `cwd`, `timeoutMs`, `logDir` and `logFormat`.
   * @param {Function} [runner] - Function that actually launches the CLI.
   */
  constructor(targetName, config, runner = defaultClaudeCodeRunner) {
    this.id = `claude-code:${targetName}`;
    this.targetName = targetName;
    this.config = config;
    this.runClaudeCode = runner;
  }
  /**
   * Runs the CLI for one request.
   *
   * @param {object} request - Reads `question`, `inputFiles`, `signal`,
   *   `evalCaseId` and `attempt`.
   * @returns {Promise<object>} `{ raw, outputMessages, usage }`. Note that
   *   `raw.workspace` and `raw.promptFile` point into the temporary workspace,
   *   which has already been removed when the promise resolves.
   * @throws {Error} When the request is already aborted, the CLI times out,
   *   exits non-zero, or the executable is missing.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("Claude Code request was aborted before execution");
    }
    const inputFiles = normalizeInputFiles(request.inputFiles);
    const workspaceRoot = await this.createWorkspace();
    const logger = await this.createStreamLogger(request).catch(() => void 0);
    try {
      // The prompt is persisted in the workspace; the CLI itself receives the
      // prompt text as its final argument (see buildClaudeCodeArgs).
      const promptFile = import_node_path9.default.join(workspaceRoot, PROMPT_FILENAME);
      await (0, import_promises8.writeFile)(promptFile, request.question, "utf8");
      const args = this.buildClaudeCodeArgs(request.question, inputFiles);
      const cwd = this.resolveCwd();
      const result = await this.executeClaudeCode(args, cwd, request.signal, logger);
      if (result.timedOut) {
        throw new Error(
          `Claude Code CLI timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
        );
      }
      if (result.exitCode !== 0) {
        const detail = pickDetail(result.stderr, result.stdout);
        const prefix = `Claude Code CLI exited with code ${result.exitCode}`;
        if (isNestedClaudeCodeAuthError(result.stdout)) {
          throw new Error(
            `${prefix}: Claude Code detected a nested session and requires API key authentication. Set ANTHROPIC_API_KEY environment variable or run AgentV outside of a Claude Code session.`
          );
        }
        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
      }
      const parsed = parseClaudeCodeJsonl(result.stdout);
      const outputMessages = extractOutputMessages(parsed);
      const usage = extractUsage(parsed);
      return {
        raw: {
          response: parsed,
          stdout: result.stdout,
          stderr: result.stderr,
          exitCode: result.exitCode,
          args,
          executable: this.config.executable,
          promptFile,
          workspace: workspaceRoot,
          inputFiles,
          logFile: logger?.filePath
        },
        outputMessages,
        usage
      };
    } finally {
      // Closing the log must neither replace the request's own outcome nor
      // prevent the workspace from being removed, so it is isolated here.
      try {
        await logger?.close();
      } catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.warn(`Failed to close Claude Code stream log: ${message}`);
      } finally {
        await this.cleanupWorkspace(workspaceRoot);
      }
    }
  }
  /** Returns the configured working directory as an absolute path, or the process cwd. */
  resolveCwd() {
    if (!this.config.cwd) {
      return process.cwd();
    }
    return import_node_path9.default.resolve(this.config.cwd);
  }
  /**
   * Builds the CLI argument list: fixed streaming flags, optional model and
   * extra args from config, then the full prompt as the last argument.
   *
   * @param {string} prompt - The user question.
   * @param {string[] | undefined} inputFiles - Paths listed under "## Input Files".
   * @returns {string[]} argv (without the executable).
   */
  buildClaudeCodeArgs(prompt, inputFiles) {
    const args = [];
    args.push("--output-format", "stream-json");
    args.push("--verbose");
    args.push("-p");
    if (this.config.model) {
      args.push("--model", this.config.model);
    }
    if (this.config.args && this.config.args.length > 0) {
      args.push(...this.config.args);
    }
    const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
    const fullPrompt = `${systemPrompt}\n\n${prompt}`;
    let finalPrompt = fullPrompt;
    if (inputFiles && inputFiles.length > 0) {
      const filesContext = inputFiles.map((f) => `[File: ${f}]`).join("\n");
      finalPrompt = `${fullPrompt}\n\n## Input Files\n${filesContext}`;
    }
    args.push(finalPrompt);
    return args;
  }
  /**
   * Returns a copy of `process.env` with CLAUDECODE and CLAUDE_CODE_ENTRYPOINT
   * set to `undefined` — presumably so the child CLI does not see itself as
   * running inside another Claude Code session (TODO confirm against the runner).
   */
  buildEnv() {
    const env = { ...process.env };
    env.CLAUDECODE = void 0;
    env.CLAUDE_CODE_ENTRYPOINT = void 0;
    return env;
  }
  /**
   * Calls the runner with the assembled options and translates a missing
   * executable (`ENOENT`) into an actionable error; other errors are rethrown.
   */
  async executeClaudeCode(args, cwd, signal, logger) {
    try {
      return await this.runClaudeCode({
        executable: this.config.executable,
        args,
        cwd,
        timeoutMs: this.config.timeoutMs,
        env: this.buildEnv(),
        signal,
        onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
        onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
      });
    } catch (error) {
      const err = error;
      if (err.code === "ENOENT") {
        throw new Error(
          `Claude Code executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
        );
      }
      throw error;
    }
  }
  /** Creates a uniquely named temporary directory under the OS temp dir. */
  async createWorkspace() {
    return await (0, import_promises8.mkdtemp)(import_node_path9.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
  }
  /** Removes the workspace recursively; removal failures are deliberately ignored. */
  async cleanupWorkspace(workspaceRoot) {
    try {
      await (0, import_promises8.rm)(workspaceRoot, { recursive: true, force: true });
    } catch {
      // Best-effort: a leftover temp directory must not fail the request.
    }
  }
  /**
   * Picks the directory for stream logs: `undefined` when streaming is
   * disabled, else `config.logDir` (made absolute), else
   * `<cwd>/.agentv/logs/claude-code`.
   */
  resolveLogDirectory() {
    const disabled = isClaudeCodeLogStreamingDisabled();
    if (disabled) {
      return void 0;
    }
    if (this.config.logDir) {
      return import_node_path9.default.resolve(this.config.logDir);
    }
    return import_node_path9.default.join(process.cwd(), ".agentv", "logs", "claude-code");
  }
  /**
   * Creates the stream logger for a request and records its log entry.
   *
   * @returns {Promise<object | undefined>} The logger, or `undefined` when
   *   logging is disabled or could not be set up (a warning is printed).
   */
  async createStreamLogger(request) {
    const logDir = this.resolveLogDirectory();
    if (!logDir) {
      return void 0;
    }
    try {
      await (0, import_promises8.mkdir)(logDir, { recursive: true });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Claude Code stream logging (could not create ${logDir}): ${message}`);
      return void 0;
    }
    const filePath = import_node_path9.default.join(logDir, buildLogFilename(request, this.targetName));
    try {
      const logger = await ClaudeCodeStreamLogger.create({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt,
        format: this.config.logFormat ?? "summary"
      });
      recordClaudeCodeLogEntry({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt
      });
      return logger;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Claude Code stream logging for ${filePath}: ${message}`);
      return void 0;
    }
  }
};
2483
/**
 * Appends a line-oriented, timestamped mirror of the Claude Code CLI's
 * stdout/stderr to a log file.
 *
 * Chunks are buffered per source until a newline arrives; each complete,
 * non-blank line is written as `[+elapsed] [source] message`. `close()` also
 * writes whatever partial line is still buffered.
 */
var ClaudeCodeStreamLogger = class _ClaudeCodeStreamLogger {
  filePath;
  stream;
  startedAt = Date.now();
  stdoutBuffer = "";
  stderrBuffer = "";
  format;
  streamErrorReported = false;
  /**
   * @param {string} filePath - Log file to append to.
   * @param {string} format - `"json"` pretty-prints JSON lines; any other
   *   value uses the summary formatter.
   */
  constructor(filePath, format) {
    this.filePath = filePath;
    this.format = format;
    this.stream = (0, import_node_fs3.createWriteStream)(filePath, { flags: "a" });
    // The file is opened asynchronously, so an open/write failure surfaces as
    // an 'error' event. Without a listener that event would be thrown as an
    // uncaught exception; logging is best-effort, so report it once instead.
    this.stream.on("error", (error) => {
      if (this.streamErrorReported) {
        return;
      }
      this.streamErrorReported = true;
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Claude Code stream log ${filePath} failed: ${message}`);
    });
  }
  /**
   * Creates a logger and writes the header (title, target, optional eval id,
   * optional 1-based attempt, start time) followed by a blank separator line.
   *
   * @param {object} options - `filePath`, `targetName`, `evalCaseId`,
   *   `attempt` and `format`.
   * @returns {Promise<_ClaudeCodeStreamLogger>}
   */
  static async create(options) {
    const logger = new _ClaudeCodeStreamLogger(options.filePath, options.format);
    const header = [
      "# Claude Code CLI stream log",
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
      ""
      // Drop only the omitted optional lines; the trailing "" is the separator
      // and must survive the filter.
    ].filter((line) => line !== void 0);
    logger.writeLines(header);
    return logger;
  }
  /** Buffers a stdout chunk and writes any lines it completes. */
  handleStdoutChunk(chunk) {
    this.stdoutBuffer += chunk;
    this.flushBuffer("stdout");
  }
  /** Buffers a stderr chunk and writes any lines it completes. */
  handleStderrChunk(chunk) {
    this.stderrBuffer += chunk;
    this.flushBuffer("stderr");
  }
  /**
   * Writes all buffered content and ends the stream.
   *
   * @returns {Promise<void>} Resolves once the stream has ended; rejects when
   *   the stream emits an error while ending.
   */
  async close() {
    this.flushBuffer("stdout");
    this.flushBuffer("stderr");
    this.flushRemainder();
    await new Promise((resolve, reject) => {
      this.stream.once("error", reject);
      this.stream.end(() => resolve());
    });
  }
  /** Writes each given line followed by a newline, without any formatting. */
  writeLines(lines) {
    for (const line of lines) {
      this.stream.write(`${line}\n`);
    }
  }
  /**
   * Writes every complete line held in the given source's buffer and keeps the
   * trailing partial line buffered.
   */
  flushBuffer(source) {
    const buffer = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
    const lines = buffer.split(/\r?\n/);
    const remainder = lines.pop() ?? "";
    if (source === "stdout") {
      this.stdoutBuffer = remainder;
    } else {
      this.stderrBuffer = remainder;
    }
    for (const line of lines) {
      this.writeFormattedLine(line, source);
    }
  }
  /**
   * Formats one raw line for the log.
   *
   * @returns {string | undefined} The formatted line, or `undefined` for a blank line.
   */
  formatLine(rawLine, source) {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return void 0;
    }
    const message = this.format === "json" ? formatClaudeCodeJsonLog(trimmed) : formatClaudeCodeLogMessage(trimmed, source);
    return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`;
  }
  /** Writes the unterminated tail of both buffers (stdout first) and clears them. */
  flushRemainder() {
    this.writeFormattedLine(this.stdoutBuffer, "stdout");
    this.writeFormattedLine(this.stderrBuffer, "stderr");
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
  }
  /** Formats a raw line and, unless it is blank, writes it plus a newline. */
  writeFormattedLine(rawLine, source) {
    const formatted = this.formatLine(rawLine, source);
    if (formatted) {
      this.stream.write(formatted);
      this.stream.write("\n");
    }
  }
};
2577
/**
 * Reports whether stream logging was switched off through the
 * AGENTV_CLAUDE_CODE_STREAM_LOGS environment variable.
 *
 * @returns {boolean} `true` when the variable, trimmed and lower-cased, is
 *   `"false"`, `"0"` or `"off"`; `false` when it is unset, empty, or anything else.
 */
function isClaudeCodeLogStreamingDisabled() {
  const rawSetting = process.env.AGENTV_CLAUDE_CODE_STREAM_LOGS;
  if (!rawSetting) {
    return false;
  }
  const offValues = ["false", "0", "off"];
  return offValues.includes(rawSetting.trim().toLowerCase());
}
2585
/**
 * Builds a unique log file name of the form
 * `<timestamp>_<target>_<evalId>[_attempt-N]_<8 hex chars>.log`.
 *
 * @param {object} request - Reads `evalCaseId` (falls back to `"claude-code"`)
 *   and `attempt` (0-based; rendered 1-based, omitted when undefined).
 * @param {string} targetName - Target name, sanitized for file-name use.
 * @returns {string} The file name (no directory).
 */
function buildLogFilename(request, targetName) {
  // ':' and '.' in the ISO timestamp are replaced so the name is valid everywhere.
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
  const evalPart = sanitizeForFilename(request.evalCaseId ?? "claude-code");
  let attemptPart = "";
  if (request.attempt !== void 0) {
    attemptPart = `_attempt-${request.attempt + 1}`;
  }
  const targetPart = sanitizeForFilename(targetName);
  const uniquePart = (0, import_node_crypto.randomUUID)().slice(0, 8);
  return [stamp, targetPart, `${evalPart}${attemptPart}`, uniquePart].join("_") + ".log";
}
2592
/**
 * Makes a string safe for use inside a file name by collapsing every run of
 * characters outside `A-Z a-z 0-9 . _ -` into a single underscore.
 *
 * @param {string} value - Text to sanitize.
 * @returns {string} The sanitized text, or `"claude-code"` when it would be empty.
 */
function sanitizeForFilename(value) {
  const cleaned = value.replace(/[^A-Za-z0-9._-]+/g, "_");
  return cleaned || "claude-code";
}
2596
/**
 * Formats the whole seconds elapsed since `startedAt` as `MM:SS`, or as
 * `HH:MM:SS` once at least one hour has passed.
 *
 * @param {number} startedAt - Start time in epoch milliseconds.
 * @returns {string} Zero-padded elapsed time.
 */
function formatElapsed(startedAt) {
  const twoDigits = (amount) => amount.toString().padStart(2, "0");
  const totalSeconds = Math.floor((Date.now() - startedAt) / 1e3);
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor(totalSeconds % 3600 / 60);
  const seconds = totalSeconds % 60;
  const fields = hours > 0 ? [hours, minutes, seconds] : [minutes, seconds];
  return fields.map((field) => twoDigits(field)).join(":");
}
2606
/**
 * Produces the one-line, human-readable form of a raw Claude Code output line.
 *
 * @param {string} rawLine - A single line captured from the CLI.
 * @param {string} source - Stream the line came from; "stderr" lines that
 *   cannot be summarized are prefixed with "stderr: ".
 * @returns {string} The event summary when the line parses as JSON and a
 *   summary is available; otherwise the (possibly prefixed) raw line.
 */
function formatClaudeCodeLogMessage(rawLine, source) {
  const event = tryParseJsonValue(rawLine);
  const summary = event ? summarizeClaudeCodeEvent(event) : void 0;
  if (summary) {
    return summary;
  }
  return source === "stderr" ? `stderr: ${rawLine}` : rawLine;
}
2619
/**
 * Pretty-prints a raw Claude Code output line when it is JSON.
 *
 * @param {string} rawLine - A single line captured from the CLI.
 * @returns {string} The value re-serialized with two-space indentation, or the
 *   line unchanged when it does not parse, parses to a falsy value, or cannot
 *   be re-serialized.
 */
function formatClaudeCodeJsonLog(rawLine) {
  const value = tryParseJsonValue(rawLine);
  if (value) {
    try {
      return JSON.stringify(value, null, 2);
    } catch {
      // Fall through and hand back the untouched line.
    }
  }
  return rawLine;
}
2630
/**
 * Condenses one parsed Claude Code stream event into a short label.
 *
 * Only the first entry of `message.content` is inspected for "assistant" and
 * "user" events.
 *
 * @param {unknown} event - Parsed JSON value from the CLI output.
 * @returns {string|undefined} A label such as "system: init",
 *   "assistant: tool_use (<name>)", "assistant: <first 50 chars>...",
 *   "user: tool_result (<id>)" or "result: $<cost>, <ms>ms"; the bare type for
 *   unrecognized events; undefined when the value is not an object with a
 *   non-empty string `type`.
 */
function summarizeClaudeCodeEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const kind = typeof event.type === "string" ? event.type : void 0;
  if (!kind) {
    return void 0;
  }
  // First content block of the event's message, if there is one.
  const leadingBlock = () => {
    const message = event.message;
    if (!message) {
      return void 0;
    }
    const blocks = message.content;
    return Array.isArray(blocks) && blocks.length > 0 ? blocks[0] : void 0;
  };
  if (kind === "system") {
    return "system: init";
  }
  if (kind === "assistant") {
    const block = leadingBlock();
    if (block?.type === "tool_use") {
      return `assistant: tool_use (${block.name})`;
    }
    if (block?.type === "text" && typeof block.text === "string") {
      const text = block.text;
      // Keep log lines short: show at most the first 50 characters.
      const preview = text.length > 50 ? `${text.slice(0, 50)}...` : text;
      return `assistant: ${preview}`;
    }
    return "assistant";
  }
  if (kind === "user") {
    const block = leadingBlock();
    return block?.type === "tool_result" ? `user: tool_result (${block.tool_use_id})` : "user";
  }
  if (kind === "result") {
    const { cost_usd: cost, duration_ms: duration } = event;
    if (typeof cost === "number" && typeof duration === "number") {
      return `result: $${cost.toFixed(4)}, ${Math.round(duration)}ms`;
    }
    return "result";
  }
  return kind;
}
2687
/**
 * Parses a line as JSON without throwing.
 *
 * @param {string} rawLine - Candidate JSON text.
 * @returns {unknown} The parsed value, or undefined when parsing fails.
 */
function tryParseJsonValue(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    value = void 0;
  }
  return value;
}
2694
/**
 * Parses the newline-delimited JSON emitted by the Claude Code CLI.
 *
 * Blank lines and lines that are not valid JSON are skipped.
 *
 * @param {string} output - Complete captured stdout.
 * @returns {unknown[]} Parsed values in input order.
 * @throws {Error} When the output is empty/whitespace, or when no line parses.
 */
function parseClaudeCodeJsonl(output) {
  const text = output.trim();
  if (text.length === 0) {
    throw new Error("Claude Code CLI produced no output");
  }
  const events = [];
  for (const rawLine of text.split(/\r?\n/)) {
    const candidate = rawLine.trim();
    if (candidate.length === 0) {
      continue;
    }
    try {
      events.push(JSON.parse(candidate));
    } catch {
      // Non-JSON noise between events is tolerated and dropped.
    }
  }
  if (events.length === 0) {
    throw new Error("Claude Code CLI produced no valid JSON output");
  }
  return events;
}
2712
/**
 * Collects the conversation messages carried by "assistant" and "user" events.
 *
 * @param {Iterable<unknown>} events - Parsed CLI events.
 * @returns {object[]} Converted messages in event order; events of other
 *   types, non-object entries, and events without a `message` are ignored.
 */
function extractOutputMessages(events) {
  const collected = [];
  for (const event of events) {
    const isRecord = Boolean(event) && typeof event === "object";
    if (!isRecord) {
      continue;
    }
    const { type, message } = event;
    const isConversational = type === "assistant" || type === "user";
    if (!isConversational || !message) {
      continue;
    }
    const converted = convertClaudeCodeMessage(message, type);
    if (converted) {
      collected.push(converted);
    }
  }
  return collected;
}
2732
/**
 * Converts a Claude Code message payload into an output-message record.
 *
 * @param {object} message - The event's `message`; its `content` is read.
 * @param {string} type - Event type; "assistant" maps to the assistant role,
 *   anything else to "user".
 * @returns {{role: string, content: (string|undefined), toolCalls: (object[]|undefined)}}
 *   `toolCalls` is undefined when the content holds no tool blocks.
 */
function convertClaudeCodeMessage(message, type) {
  const text = extractTextContent(message.content);
  const calls = extractToolCalls(message.content);
  const converted = {
    role: type === "assistant" ? "assistant" : "user",
    content: text,
    toolCalls: void 0
  };
  if (calls.length > 0) {
    converted.toolCalls = calls;
  }
  return converted;
}
2742
/**
 * Extracts the plain text of a message's content.
 *
 * @param {unknown} content - Either a string or an array of content blocks.
 * @returns {string|undefined} The string itself; or the `text` of every
 *   `{type: "text"}` block joined with newlines; or undefined when the content
 *   is neither a string nor an array, or holds no text blocks.
 */
function extractTextContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const isTextBlock = (part) =>
    Boolean(part) && typeof part === "object" && part.type === "text" && typeof part.text === "string";
  const pieces = content.filter(isTextBlock).map((part) => part.text);
  return pieces.length === 0 ? void 0 : pieces.join("\n");
}
2761
/**
 * Extracts tool activity from a message's content blocks.
 *
 * @param {unknown} content - Message content; anything but an array yields [].
 * @returns {object[]} One entry per qualifying block, in order:
 *   `tool_use` blocks with a string `name` become `{tool, input, id}` (`id` is
 *   undefined unless it is a string); `tool_result` blocks with a string
 *   `tool_use_id` become `{tool: "tool_result", output, id}`.
 */
function extractToolCalls(content) {
  if (!Array.isArray(content)) {
    return [];
  }
  return content.flatMap((part) => {
    if (!part || typeof part !== "object") {
      return [];
    }
    if (part.type === "tool_use" && typeof part.name === "string") {
      const id = typeof part.id === "string" ? part.id : void 0;
      return [{ tool: part.name, input: part.input, id }];
    }
    if (part.type === "tool_result" && typeof part.tool_use_id === "string") {
      return [{ tool: "tool_result", output: part.content, id: part.tool_use_id }];
    }
    return [];
  });
}
2788
/**
 * Reads usage figures from the last "result" event.
 *
 * Scans from the end and stops at the first "result" event found, even when
 * that event carries none of the recognized fields.
 *
 * @param {unknown[]} events - Parsed CLI events.
 * @returns {object|undefined} An object holding whichever of `cost_usd`,
 *   `duration_ms`, `duration_api_ms`, `input_tokens`, `output_tokens`
 *   (numbers) and `session_id` (string) are present with the right type;
 *   undefined when there is no "result" event or it has none of them.
 */
function extractUsage(events) {
  const numericFields = ["cost_usd", "duration_ms", "duration_api_ms", "input_tokens", "output_tokens"];
  let index = events.length;
  while (index-- > 0) {
    const candidate = events[index];
    if (!candidate || typeof candidate !== "object" || candidate.type !== "result") {
      continue;
    }
    const usage = {};
    for (const field of numericFields) {
      if (typeof candidate[field] === "number") {
        usage[field] = candidate[field];
      }
    }
    if (typeof candidate.session_id === "string") {
      usage.session_id = candidate.session_id;
    }
    return Object.keys(usage).length > 0 ? usage : void 0;
  }
  return void 0;
}
2821
/**
 * Chooses the text to use as failure detail, preferring stderr over stdout.
 *
 * @param {string} stderr - Captured standard error.
 * @param {string} stdout - Captured standard output; consulted only when the
 *   trimmed stderr is empty.
 * @returns {string|undefined} The first non-empty trimmed stream, or undefined
 *   when both are blank.
 */
function pickDetail(stderr, stdout) {
  for (const stream of [stderr, stdout]) {
    const text = stream.trim();
    if (text.length > 0) {
      return text;
    }
  }
  return void 0;
}
2829
/**
 * Builds the " after Ns" suffix appended to timeout error messages.
 *
 * @param {number|undefined} timeoutMs - Configured timeout in milliseconds.
 * @returns {string} " after <seconds>s" with seconds rounded up, or "" when
 *   the timeout is missing, zero, or negative.
 */
function formatTimeoutSuffix(timeoutMs) {
  const hasTimeout = Boolean(timeoutMs) && !(timeoutMs <= 0);
  return hasTimeout ? ` after ${Math.ceil(timeoutMs / 1e3)}s` : "";
}
2836
/**
 * Heuristically detects an authentication failure in Claude Code output.
 *
 * Both of these must appear somewhere in the JSON lines of `stdout`:
 *   - a "system" event whose `apiKeySource` is "ANTHROPIC_API_KEY";
 *   - an event with `error === "authentication_failed"`, or a "result" event
 *     with a truthy `is_error`.
 * Lines that are blank, not JSON, or not objects are ignored.
 *
 * @param {string} stdout - Captured standard output of the CLI.
 * @returns {boolean} true when both markers were seen; false otherwise,
 *   including when `stdout` cannot be processed at all.
 */
function isNestedClaudeCodeAuthError(stdout) {
  let sawApiKeySource = false;
  let sawAuthFailure = false;
  try {
    for (const rawLine of stdout.split("\n")) {
      const candidate = rawLine.trim();
      if (candidate === "") {
        continue;
      }
      try {
        const event = JSON.parse(candidate);
        const isResult = event.type === "result";
        if (event.type === "system" && event.apiKeySource === "ANTHROPIC_API_KEY") {
          sawApiKeySource = true;
        }
        if (event.error === "authentication_failed" || (isResult && event.is_error)) {
          sawAuthFailure = true;
        }
      } catch {
        // Unparsable or non-object line: skip it and keep scanning.
      }
    }
  } catch {
    return false;
  }
  return sawApiKeySource && sawAuthFailure;
}
2860
/**
 * Quotes a string for use as a single POSIX-shell argument.
 *
 * The value is wrapped in single quotes; each embedded single quote is
 * rewritten as `'\''` (close quote, escaped quote, reopen quote).
 *
 * @param {string} arg - Raw argument text.
 * @returns {string} The single-quoted argument.
 */
function escapeShellArg(arg) {
  const escaped = arg.split("'").join("'\\''");
  return `'${escaped}'`;
}
2863
/**
 * Default runner: executes the Claude Code CLI using temp files for its
 * stdout, stderr, exit code and pid, and always removes those files afterwards.
 *
 * @param {object} options - Run options, forwarded unchanged to
 *   `runClaudeCodeWithTempFiles`.
 * @returns {Promise<object>} Whatever `runClaudeCodeWithTempFiles` resolves to.
 */
async function defaultClaudeCodeRunner(options) {
  // One random id ties the four temp files of this run together.
  const runId = (0, import_node_crypto.randomUUID)();
  const tempFile = (suffix) =>
    import_node_path9.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${runId}-${suffix}`);
  const stdoutFile = tempFile("stdout");
  const stderrFile = tempFile("stderr");
  const exitFile = tempFile("exit");
  const pidFile = tempFile("pid");
  try {
    return await runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile);
  } finally {
    // Best-effort cleanup: a file that cannot be removed must not mask the result.
    for (const file of [stdoutFile, stderrFile, exitFile, pidFile]) {
      await (0, import_promises8.rm)(file, { force: true }).catch(() => {
      });
    }
  }
}
2880
/**
 * Runs the Claude Code CLI in a detached `setsid bash -c` wrapper and collects
 * its results through files, polling until the exit file appears.
 *
 * The wrapper script unsets CLAUDECODE / CLAUDE_CODE_ENTRYPOINT, starts the
 * command in the background with stdout/stderr redirected to the given files,
 * writes the child's pid to `pidFile`, waits for it, and writes its exit
 * status to `exitFile`.
 *
 * @param {object} options - Run options:
 *   `executable` (string, split on whitespace into program + leading args),
 *   `args` (string[]), `cwd`, `env`, `timeoutMs` (optional),
 *   `signal` (optional AbortSignal),
 *   `onStdoutChunk` (optional, called with newly appended stdout text while
 *   polling and once more at the end), `onStderrChunk` (optional, called once
 *   with the full stderr at the end).
 * @param {string} stdoutFile - File receiving the command's stdout.
 * @param {string} stderrFile - File receiving the command's stderr.
 * @param {string} exitFile - File receiving the command's exit status.
 * @param {string} pidFile - File receiving the command's pid.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number, timedOut: boolean}>}
 *   `exitCode` is -1 when no usable exit status was recorded (timeout, abort,
 *   or failure to start the wrapper).
 */
async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile) {
  // Check for cancellation BEFORE spawning: once the process exists, the pid
  // file may not be written yet and the child could not be killed reliably.
  if (options.signal?.aborted) {
    return { stdout: "", stderr: "Aborted", exitCode: -1, timedOut: false };
  }
  // Resolve the fs helpers once instead of re-importing on every poll.
  const { readFile, access } = await import("fs/promises");
  const [executable, ...executableArgs] = options.executable.split(/\s+/);
  const fullCommand = [executable, ...executableArgs, ...options.args].map((arg) => escapeShellArg(arg)).join(" ");
  const bashScript = `
    unset CLAUDECODE CLAUDE_CODE_ENTRYPOINT 2>/dev/null
    ${fullCommand} >${escapeShellArg(stdoutFile)} 2>${escapeShellArg(stderrFile)} &
    CHILD_PID=$!
    echo $CHILD_PID > ${escapeShellArg(pidFile)}
    wait $CHILD_PID
    echo $? > ${escapeShellArg(exitFile)}
  `;
  const child = (0, import_node_child_process.spawn)("setsid", ["bash", "-c", bashScript], {
    cwd: options.cwd,
    env: options.env,
    detached: true,
    stdio: "ignore"
  });
  // Without a listener a spawn failure (e.g. `setsid` missing, bad cwd) is an
  // unhandled 'error' event, and the poll loop would never see an exit file.
  let spawnError;
  child.once("error", (error) => {
    spawnError = error;
  });
  child.unref();
  const pollInterval = 100;
  const startTime = Date.now();
  let timedOut = false;
  let lastStdoutSize = 0;
  const readFileIfExists = async (filePath) => {
    try {
      return await readFile(filePath, "utf8");
    } catch {
      return "";
    }
  };
  const fileExists4 = async (filePath) => {
    try {
      await access(filePath);
      return true;
    } catch {
      return false;
    }
  };
  const killProcess = async () => {
    const pid = Number.parseInt((await readFileIfExists(pidFile)).trim(), 10);
    // Only signal a real pid: 0 or a negative value would target process groups.
    if (!Number.isInteger(pid) || pid <= 0) {
      return;
    }
    try {
      process.kill(pid, "SIGTERM");
    } catch {
      // The process already exited; nothing left to stop.
    }
  };
  const abortHandler = () => {
    killProcess().catch(() => {
    });
  };
  options.signal?.addEventListener("abort", abortHandler, { once: true });
  try {
    while (true) {
      if (spawnError) {
        break;
      }
      if (options.timeoutMs && Date.now() - startTime > options.timeoutMs) {
        timedOut = true;
        await killProcess();
        break;
      }
      if (options.signal?.aborted) {
        await killProcess();
        break;
      }
      if (options.onStdoutChunk) {
        const currentStdout = await readFileIfExists(stdoutFile);
        if (currentStdout.length > lastStdoutSize) {
          options.onStdoutChunk(currentStdout.slice(lastStdoutSize));
          lastStdoutSize = currentStdout.length;
        }
      }
      if (await fileExists4(exitFile)) {
        break;
      }
      await new Promise((resolve) => setTimeout(resolve, pollInterval));
    }
    if (spawnError) {
      return {
        stdout: "",
        stderr: `Failed to start Claude Code CLI: ${spawnError.message}`,
        exitCode: -1,
        timedOut: false
      };
    }
    const stdout = await readFileIfExists(stdoutFile);
    const stderr = await readFileIfExists(stderrFile);
    const exitCodeStr = await readFileIfExists(exitFile);
    const parsedExitCode = Number.parseInt(exitCodeStr.trim(), 10);
    // A missing or garbled exit file is reported as -1 rather than NaN.
    const exitCode = Number.isNaN(parsedExitCode) ? -1 : parsedExitCode;
    if (options.onStdoutChunk && stdout.length > lastStdoutSize) {
      options.onStdoutChunk(stdout.slice(lastStdoutSize));
    }
    if (options.onStderrChunk && stderr) {
      options.onStderrChunk(stderr);
    }
    return { stdout, stderr, exitCode, timedOut };
  } finally {
    options.signal?.removeEventListener("abort", abortHandler);
  }
}
2979
+
2980
+ // src/evaluation/providers/cli.ts
2981
+ var import_node_child_process2 = require("child_process");
2982
+ var import_promises9 = __toESM(require("fs/promises"), 1);
2983
+ var import_node_os2 = __toESM(require("os"), 1);
2984
+ var import_node_path10 = __toESM(require("path"), 1);
2985
+ var import_node_util = require("util");
2986
+ var import_zod = require("zod");
2987
// Zod schemas for validating parsed CLI output (snake_case field names).
var ToolCallSchema;
var OutputMessageInputSchema;
var TokenUsageSchema;
var CliOutputSchema;
var CliJsonlRecordSchema;
{
  const { z } = import_zod;
  const optionalString = () => z.string().optional();
  const optionalNumber = () => z.number().optional();
  const optionalUnknown = () => z.unknown().optional();
  // One tool invocation attached to an output message.
  ToolCallSchema = z.object({
    tool: z.string(),
    input: optionalUnknown(),
    output: optionalUnknown(),
    id: optionalString(),
    timestamp: optionalString()
  });
  // One message of the conversation reported by the CLI.
  OutputMessageInputSchema = z.object({
    role: z.string(),
    name: optionalString(),
    content: optionalUnknown(),
    tool_calls: z.array(ToolCallSchema).optional(),
    timestamp: optionalString(),
    metadata: z.record(z.unknown()).optional()
  });
  // Token counts; `input` and `output` are required, `cached` is optional.
  TokenUsageSchema = z.object({
    input: z.number(),
    output: z.number(),
    cached: optionalNumber()
  });
  // Whole output document: every field is optional.
  CliOutputSchema = z.object({
    text: optionalUnknown(),
    output_messages: z.array(OutputMessageInputSchema).optional(),
    token_usage: TokenUsageSchema.optional(),
    cost_usd: optionalNumber(),
    duration_ms: optionalNumber()
  });
  // Batch (JSONL) record: the output document plus a non-empty string `id`.
  CliJsonlRecordSchema = CliOutputSchema.extend({
    id: z.string().min(1)
  });
}
3017
/**
 * Discards negative cost/duration metrics, warning about each one dropped.
 *
 * @param {number|undefined} costUsd - Reported cost in USD.
 * @param {number|undefined} durationMs - Reported duration in milliseconds.
 * @param {string} context - Label included in the warning text.
 * @returns {{costUsd: (number|undefined), durationMs: (number|undefined)}}
 *   Each value unchanged, or undefined when it was negative.
 */
function validateMetrics(costUsd, durationMs, context) {
  const dropIfNegative = (value, field) => {
    if (value === void 0 || !(value < 0)) {
      return value;
    }
    console.warn(`[cli-provider] ${context}: ignoring negative ${field} value (${value})`);
    return void 0;
  };
  const validCostUsd = dropIfNegative(costUsd, "cost_usd");
  const validDurationMs = dropIfNegative(durationMs, "duration_ms");
  return { costUsd: validCostUsd, durationMs: validDurationMs };
}
3030
/**
 * Converts validated snake_case output messages to the camelCase shape,
 * renaming `tool_calls` to `toolCalls` and copying the other fields as-is.
 *
 * @param {object[]|undefined} messages - Messages from the CLI output.
 * @returns {object[]|undefined} Converted messages, or undefined when the
 *   input is missing or empty.
 */
function convertOutputMessages(messages) {
  const hasMessages = Boolean(messages) && messages.length !== 0;
  if (!hasMessages) {
    return void 0;
  }
  return messages.map(({ role, name, content, tool_calls: toolCalls, timestamp, metadata }) => ({
    role,
    name,
    content,
    toolCalls,
    timestamp,
    metadata
  }));
}
3043
// Upper bound (10 MiB) on the stdout/stderr captured from a CLI command.
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
var execAsync = (0, import_node_util.promisify)(import_node_child_process2.exec);
/**
 * Default command runner: executes a shell command and reports its outcome
 * as a plain result object instead of throwing.
 *
 * @param {string} command - Fully rendered command line.
 * @param {object} options - `cwd`, `env`, `timeoutMs` and `signal`, passed to
 *   `exec` (on Windows the shell is "powershell.exe").
 * @returns {Promise<{stdout: string, stderr: string, exitCode: (number|null),
 *   failed: boolean, timedOut: boolean, signal: (string|null)}>}
 *   On success `exitCode` is 0 and `failed` is false. On failure `failed` is
 *   true, `exitCode` is the error's numeric `code` (null otherwise), and
 *   `timedOut` is true when the error reports `timedOut` or `killed`.
 */
async function defaultCommandRunner(command, options) {
  const { cwd, env, timeoutMs: timeout, signal } = options;
  const shell = process.platform === "win32" ? "powershell.exe" : void 0;
  try {
    const completed = await execAsync(command, {
      cwd,
      env,
      timeout,
      signal,
      maxBuffer: DEFAULT_MAX_BUFFER,
      shell
    });
    return {
      stdout: completed.stdout,
      stderr: completed.stderr,
      exitCode: 0,
      failed: false,
      timedOut: false,
      signal: null
    };
  } catch (error) {
    const failure = error;
    const wasKilled = failure.timedOut === true || failure.killed === true;
    const numericCode = typeof failure.code === "number" ? failure.code : null;
    return {
      stdout: failure.stdout ?? "",
      stderr: failure.stderr ?? "",
      exitCode: numericCode,
      failed: true,
      timedOut: wasKilled,
      signal: failure.signal ?? null
    };
  }
}
3076
+ var CliProvider = class {
3077
+ id;
3078
+ kind = "cli";
3079
+ targetName;
3080
+ supportsBatch = true;
3081
+ config;
3082
+ runCommand;
3083
+ verbose;
3084
+ keepTempFiles;
3085
+ healthcheckPromise;
3086
+ constructor(targetName, config, runner = defaultCommandRunner) {
3087
+ this.targetName = targetName;
3088
+ this.id = `cli:${targetName}`;
3089
+ this.config = config;
3090
+ this.runCommand = runner;
3091
+ this.verbose = config.verbose ?? false;
3092
+ this.keepTempFiles = config.keepTempFiles ?? false;
3093
+ }
3094
+ async invoke(request) {
3095
+ if (request.signal?.aborted) {
3096
+ throw new Error("CLI provider request was aborted before execution");
3097
+ }
3098
+ await this.ensureHealthy(request.signal);
3099
+ const outputFilePath = generateOutputFilePath(request.evalCaseId);
3100
+ const templateValues = buildTemplateValues(request, this.config, outputFilePath);
3101
+ const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
3102
+ if (this.verbose) {
3103
+ console.log(
3104
+ `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
3105
+ );
3106
+ }
3107
+ const startTime = Date.now();
3108
+ const result = await this.runCommand(renderedCommand, {
3109
+ cwd: this.config.cwd,
3110
+ env: process.env,
3111
+ timeoutMs: this.config.timeoutMs,
3112
+ signal: request.signal
3113
+ });
3114
+ const measuredDurationMs = Date.now() - startTime;
3115
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
3116
+ if (request.signal?.aborted) {
3117
+ throw new Error("CLI provider request was aborted");
3118
+ }
3119
+ if (result.timedOut) {
3120
+ throw new Error(
3121
+ `CLI provider timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
3122
+ );
2019
3123
  }
2020
3124
  const codeText = result.exitCode !== null ? result.exitCode : "unknown";
2021
3125
  const detail = result.stderr.trim() || result.stdout.trim();
@@ -2090,7 +3194,7 @@ var CliProvider = class {
2090
3194
  }
2091
3195
  if (result.timedOut) {
2092
3196
  throw new Error(
2093
- `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
3197
+ `CLI provider timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
2094
3198
  );
2095
3199
  }
2096
3200
  const codeText = result.exitCode !== null ? result.exitCode : "unknown";
@@ -2100,11 +3204,6 @@ var CliProvider = class {
2100
3204
  }
2101
3205
  const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
2102
3206
  const recordsById = this.parseJsonlBatchOutput(responseContent);
2103
- const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
2104
- const missingIds = requestedIds.filter((id) => !recordsById.has(id));
2105
- if (missingIds.length > 0) {
2106
- throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
2107
- }
2108
3207
  const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
2109
3208
  const responses = requests.map((request) => {
2110
3209
  const evalCaseId = request.evalCaseId;
@@ -2123,15 +3222,20 @@ var CliProvider = class {
2123
3222
  }
2124
3223
  const parsed = recordsById.get(evalCaseId);
2125
3224
  if (!parsed) {
3225
+ const errorMessage = `Batch output missing id '${evalCaseId}'`;
3226
+ if (this.verbose) {
3227
+ console.warn(`[cli-provider:${this.targetName}] ${errorMessage}`);
3228
+ }
2126
3229
  return {
2127
- outputMessages: [],
3230
+ outputMessages: [{ role: "assistant", content: `Error: ${errorMessage}` }],
2128
3231
  durationMs: perRequestFallbackMs,
2129
3232
  raw: {
2130
3233
  command: renderedCommand,
2131
3234
  stderr: result.stderr,
2132
3235
  exitCode: result.exitCode ?? 0,
2133
3236
  cwd: this.config.cwd,
2134
- outputFile: outputFilePath
3237
+ outputFile: outputFilePath,
3238
+ error: errorMessage
2135
3239
  }
2136
3240
  };
2137
3241
  }
@@ -2164,101 +3268,37 @@ var CliProvider = class {
2164
3268
  * - duration_ms: number
2165
3269
  */
2166
3270
  parseOutputContent(content) {
3271
+ let parsed;
2167
3272
  try {
2168
- const parsed = JSON.parse(content);
2169
- if (typeof parsed === "object" && parsed !== null) {
2170
- const obj = parsed;
2171
- const tokenUsage = this.parseTokenUsage(obj.token_usage);
2172
- const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
2173
- const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
2174
- const outputMessages = this.parseOutputMessages(obj.output_messages);
2175
- if (outputMessages && outputMessages.length > 0) {
2176
- return { outputMessages, tokenUsage, costUsd, durationMs };
2177
- }
2178
- if ("text" in obj) {
2179
- const text = typeof obj.text === "string" ? obj.text : String(obj.text);
2180
- return {
2181
- outputMessages: [{ role: "assistant", content: text }],
2182
- tokenUsage,
2183
- costUsd,
2184
- durationMs
2185
- };
2186
- }
2187
- }
3273
+ parsed = JSON.parse(content);
2188
3274
  } catch {
3275
+ return { outputMessages: [{ role: "assistant", content }] };
2189
3276
  }
2190
- return { outputMessages: [{ role: "assistant", content }] };
2191
- }
2192
- /**
2193
- * Parse token_usage from CLI output.
2194
- */
2195
- parseTokenUsage(tokenUsage) {
2196
- if (typeof tokenUsage !== "object" || tokenUsage === null) {
2197
- return void 0;
2198
- }
2199
- const obj = tokenUsage;
2200
- if (typeof obj.input !== "number" || typeof obj.output !== "number") {
2201
- return void 0;
2202
- }
2203
- return {
2204
- input: obj.input,
2205
- output: obj.output,
2206
- cached: typeof obj.cached === "number" ? obj.cached : void 0
2207
- };
2208
- }
2209
- /**
2210
- * Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
2211
- */
2212
- parseOutputMessages(outputMessages) {
2213
- if (!Array.isArray(outputMessages)) {
2214
- return void 0;
3277
+ const result = CliOutputSchema.safeParse(parsed);
3278
+ if (!result.success) {
3279
+ return { outputMessages: [{ role: "assistant", content }] };
2215
3280
  }
2216
- const messages = [];
2217
- for (const msg of outputMessages) {
2218
- if (typeof msg !== "object" || msg === null) {
2219
- continue;
2220
- }
2221
- const rawMsg = msg;
2222
- if (typeof rawMsg.role !== "string") {
2223
- continue;
2224
- }
2225
- const message = {
2226
- role: rawMsg.role,
2227
- name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
2228
- content: rawMsg.content,
2229
- toolCalls: this.parseToolCalls(rawMsg.tool_calls),
2230
- timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
2231
- metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
3281
+ const obj = result.data;
3282
+ const metrics = validateMetrics(obj.cost_usd, obj.duration_ms, "parsing output");
3283
+ const outputMessages = convertOutputMessages(obj.output_messages);
3284
+ if (outputMessages && outputMessages.length > 0) {
3285
+ return {
3286
+ outputMessages,
3287
+ tokenUsage: obj.token_usage,
3288
+ costUsd: metrics.costUsd,
3289
+ durationMs: metrics.durationMs
2232
3290
  };
2233
- messages.push(message);
2234
- }
2235
- return messages.length > 0 ? messages : void 0;
2236
- }
2237
- /**
2238
- * Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
2239
- */
2240
- parseToolCalls(toolCalls) {
2241
- if (!Array.isArray(toolCalls)) {
2242
- return void 0;
2243
3291
  }
2244
- const calls = [];
2245
- for (const call of toolCalls) {
2246
- if (typeof call !== "object" || call === null) {
2247
- continue;
2248
- }
2249
- const rawCall = call;
2250
- if (typeof rawCall.tool !== "string") {
2251
- continue;
2252
- }
2253
- calls.push({
2254
- tool: rawCall.tool,
2255
- input: rawCall.input,
2256
- output: rawCall.output,
2257
- id: typeof rawCall.id === "string" ? rawCall.id : void 0,
2258
- timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
2259
- });
3292
+ if (obj.text !== void 0) {
3293
+ const text = typeof obj.text === "string" ? obj.text : String(obj.text);
3294
+ return {
3295
+ outputMessages: [{ role: "assistant", content: text }],
3296
+ tokenUsage: obj.token_usage,
3297
+ costUsd: metrics.costUsd,
3298
+ durationMs: metrics.durationMs
3299
+ };
2260
3300
  }
2261
- return calls.length > 0 ? calls : void 0;
3301
+ return { outputMessages: [{ role: "assistant", content }] };
2262
3302
  }
2263
3303
  parseJsonlBatchOutput(content) {
2264
3304
  const records = /* @__PURE__ */ new Map();
@@ -2271,33 +3311,32 @@ var CliProvider = class {
2271
3311
  const reason = error instanceof Error ? error.message : String(error);
2272
3312
  throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
2273
3313
  }
2274
- if (typeof parsed !== "object" || parsed === null) {
3314
+ const result = CliJsonlRecordSchema.safeParse(parsed);
3315
+ if (!result.success) {
3316
+ const firstError = result.error.errors[0];
3317
+ if (firstError?.path.includes("id")) {
3318
+ throw new Error("CLI batch output JSONL line missing required string field: id");
3319
+ }
2275
3320
  throw new Error("CLI batch output JSONL line must be an object");
2276
3321
  }
2277
- const obj = parsed;
2278
- const id = typeof obj.id === "string" ? obj.id : void 0;
2279
- if (!id || id.trim().length === 0) {
2280
- throw new Error("CLI batch output JSONL line missing required string field: id");
2281
- }
2282
- if (records.has(id)) {
2283
- throw new Error(`CLI batch output contains duplicate id: ${id}`);
2284
- }
2285
- const tokenUsage = this.parseTokenUsage(obj.token_usage);
2286
- const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
2287
- const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
2288
- const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
2289
- let outputMessages;
2290
- if (parsedOutputMessages && parsedOutputMessages.length > 0) {
2291
- outputMessages = parsedOutputMessages;
3322
+ const obj = result.data;
3323
+ if (records.has(obj.id)) {
3324
+ throw new Error(`CLI batch output contains duplicate id: ${obj.id}`);
3325
+ }
3326
+ const outputMessages = convertOutputMessages(obj.output_messages);
3327
+ let finalOutputMessages;
3328
+ if (outputMessages && outputMessages.length > 0) {
3329
+ finalOutputMessages = outputMessages;
2292
3330
  } else {
2293
3331
  const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
2294
- outputMessages = text ? [{ role: "assistant", content: text }] : [];
2295
- }
2296
- records.set(id, {
2297
- outputMessages,
2298
- tokenUsage,
2299
- costUsd,
2300
- durationMs
3332
+ finalOutputMessages = text ? [{ role: "assistant", content: text }] : [];
3333
+ }
3334
+ const metrics = validateMetrics(obj.cost_usd, obj.duration_ms, `batch record '${obj.id}'`);
3335
+ records.set(obj.id, {
3336
+ outputMessages: finalOutputMessages,
3337
+ tokenUsage: obj.token_usage,
3338
+ costUsd: metrics.costUsd,
3339
+ durationMs: metrics.durationMs
2301
3340
  });
2302
3341
  }
2303
3342
  return records;
@@ -2311,7 +3350,7 @@ var CliProvider = class {
2311
3350
  throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
2312
3351
  } finally {
2313
3352
  if (!this.keepTempFiles) {
2314
- await import_promises8.default.unlink(filePath).catch(() => {
3353
+ await import_promises9.default.unlink(filePath).catch(() => {
2315
3354
  });
2316
3355
  }
2317
3356
  }
@@ -2383,7 +3422,7 @@ var CliProvider = class {
2383
3422
  }
2384
3423
  };
2385
3424
  function buildTemplateValues(request, config, outputFilePath) {
2386
- const inputFiles = normalizeInputFiles(request.inputFiles);
3425
+ const inputFiles = normalizeInputFiles2(request.inputFiles);
2387
3426
  return {
2388
3427
  PROMPT: shellEscape(request.question ?? ""),
2389
3428
  GUIDELINES: shellEscape(request.guidelines ?? ""),
@@ -2393,13 +3432,13 @@ function buildTemplateValues(request, config, outputFilePath) {
2393
3432
  OUTPUT_FILE: shellEscape(outputFilePath)
2394
3433
  };
2395
3434
  }
2396
- function normalizeInputFiles(inputFiles) {
3435
+ function normalizeInputFiles2(inputFiles) {
2397
3436
  if (!inputFiles || inputFiles.length === 0) {
2398
3437
  return void 0;
2399
3438
  }
2400
3439
  const unique = /* @__PURE__ */ new Map();
2401
3440
  for (const inputFile of inputFiles) {
2402
- const absolutePath = import_node_path8.default.resolve(inputFile);
3441
+ const absolutePath = import_node_path10.default.resolve(inputFile);
2403
3442
  if (!unique.has(absolutePath)) {
2404
3443
  unique.set(absolutePath, absolutePath);
2405
3444
  }
@@ -2413,7 +3452,7 @@ function formatFileList(files, template) {
2413
3452
  const formatter = template ?? "{path}";
2414
3453
  return files.map((filePath) => {
2415
3454
  const escapedPath = shellEscape(filePath);
2416
- const escapedName = shellEscape(import_node_path8.default.basename(filePath));
3455
+ const escapedName = shellEscape(import_node_path10.default.basename(filePath));
2417
3456
  return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
2418
3457
  }).join(" ");
2419
3458
  }
@@ -2437,9 +3476,9 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
2437
3476
  const safeEvalId = evalCaseId || "unknown";
2438
3477
  const timestamp = Date.now();
2439
3478
  const random = Math.random().toString(36).substring(2, 9);
2440
- return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
3479
+ return import_node_path10.default.join(import_node_os2.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
2441
3480
  }
2442
- function formatTimeoutSuffix(timeoutMs) {
3481
+ function formatTimeoutSuffix2(timeoutMs) {
2443
3482
  if (!timeoutMs || timeoutMs <= 0) {
2444
3483
  return "";
2445
3484
  }
@@ -2448,39 +3487,39 @@ function formatTimeoutSuffix(timeoutMs) {
2448
3487
  }
2449
3488
 
2450
3489
  // src/evaluation/providers/codex.ts
2451
- var import_node_child_process2 = require("child_process");
2452
- var import_node_crypto = require("crypto");
2453
- var import_node_fs3 = require("fs");
2454
- var import_promises9 = require("fs/promises");
2455
- var import_node_os2 = require("os");
2456
- var import_node_path10 = __toESM(require("path"), 1);
3490
+ var import_node_child_process3 = require("child_process");
3491
+ var import_node_crypto2 = require("crypto");
3492
+ var import_node_fs4 = require("fs");
3493
+ var import_promises10 = require("fs/promises");
3494
+ var import_node_os3 = require("os");
3495
+ var import_node_path11 = __toESM(require("path"), 1);
2457
3496
  var import_node_util2 = require("util");
2458
3497
 
2459
3498
  // src/evaluation/providers/codex-log-tracker.ts
2460
- var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
2461
- var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
3499
+ var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.codexLogs");
3500
+ var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.codexLogSubscribers");
2462
3501
  function getCodexLogStore() {
2463
3502
  const globalObject = globalThis;
2464
- const existing = globalObject[GLOBAL_LOGS_KEY];
3503
+ const existing = globalObject[GLOBAL_LOGS_KEY2];
2465
3504
  if (existing) {
2466
3505
  return existing;
2467
3506
  }
2468
3507
  const created = [];
2469
- globalObject[GLOBAL_LOGS_KEY] = created;
3508
+ globalObject[GLOBAL_LOGS_KEY2] = created;
2470
3509
  return created;
2471
3510
  }
2472
- function getSubscriberStore() {
3511
+ function getSubscriberStore2() {
2473
3512
  const globalObject = globalThis;
2474
- const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
3513
+ const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
2475
3514
  if (existing) {
2476
3515
  return existing;
2477
3516
  }
2478
3517
  const created = /* @__PURE__ */ new Set();
2479
- globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
3518
+ globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
2480
3519
  return created;
2481
3520
  }
2482
- function notifySubscribers(entry) {
2483
- const subscribers = Array.from(getSubscriberStore());
3521
+ function notifySubscribers2(entry) {
3522
+ const subscribers = Array.from(getSubscriberStore2());
2484
3523
  for (const listener of subscribers) {
2485
3524
  try {
2486
3525
  listener(entry);
@@ -2492,128 +3531,29 @@ function notifySubscribers(entry) {
2492
3531
  }
2493
3532
  function recordCodexLogEntry(entry) {
2494
3533
  getCodexLogStore().push(entry);
2495
- notifySubscribers(entry);
2496
- }
2497
- function consumeCodexLogEntries() {
2498
- const store = getCodexLogStore();
2499
- if (store.length === 0) {
2500
- return [];
2501
- }
2502
- return store.splice(0, store.length);
2503
- }
2504
- function subscribeToCodexLogEntries(listener) {
2505
- const store = getSubscriberStore();
2506
- store.add(listener);
2507
- return () => {
2508
- store.delete(listener);
2509
- };
2510
- }
2511
-
2512
- // src/evaluation/providers/preread.ts
2513
- var import_node_path9 = __toESM(require("path"), 1);
2514
- function buildPromptDocument(request, inputFiles, options) {
2515
- const parts = [];
2516
- const guidelineFiles = collectGuidelineFiles(
2517
- inputFiles,
2518
- options?.guidelinePatterns ?? request.guideline_patterns,
2519
- options?.guidelineOverrides
2520
- );
2521
- const inputFilesList = collectInputFiles(inputFiles);
2522
- const nonGuidelineInputFiles = inputFilesList.filter((file) => !guidelineFiles.includes(file));
2523
- const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
2524
- if (prereadBlock.length > 0) {
2525
- parts.push("\n", prereadBlock);
2526
- }
2527
- parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
2528
- return parts.join("\n").trim();
2529
- }
2530
- function normalizeInputFiles2(inputFiles) {
2531
- if (!inputFiles || inputFiles.length === 0) {
2532
- return void 0;
2533
- }
2534
- const deduped = /* @__PURE__ */ new Map();
2535
- for (const inputFile of inputFiles) {
2536
- const absolutePath = import_node_path9.default.resolve(inputFile);
2537
- if (!deduped.has(absolutePath)) {
2538
- deduped.set(absolutePath, absolutePath);
2539
- }
2540
- }
2541
- return Array.from(deduped.values());
2542
- }
2543
- function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
2544
- if (!inputFiles || inputFiles.length === 0) {
2545
- return [];
2546
- }
2547
- const unique = /* @__PURE__ */ new Map();
2548
- for (const inputFile of inputFiles) {
2549
- const absolutePath = import_node_path9.default.resolve(inputFile);
2550
- if (overrides?.has(absolutePath)) {
2551
- if (!unique.has(absolutePath)) {
2552
- unique.set(absolutePath, absolutePath);
2553
- }
2554
- continue;
2555
- }
2556
- const normalized = absolutePath.split(import_node_path9.default.sep).join("/");
2557
- if (isGuidelineFile(normalized, guidelinePatterns)) {
2558
- if (!unique.has(absolutePath)) {
2559
- unique.set(absolutePath, absolutePath);
2560
- }
2561
- }
2562
- }
2563
- return Array.from(unique.values());
2564
- }
2565
- function collectInputFiles(inputFiles) {
2566
- if (!inputFiles || inputFiles.length === 0) {
2567
- return [];
2568
- }
2569
- const unique = /* @__PURE__ */ new Map();
2570
- for (const inputFile of inputFiles) {
2571
- const absolutePath = import_node_path9.default.resolve(inputFile);
2572
- if (!unique.has(absolutePath)) {
2573
- unique.set(absolutePath, absolutePath);
2574
- }
2575
- }
2576
- return Array.from(unique.values());
2577
- }
2578
- function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
2579
- if (guidelineFiles.length === 0 && inputFiles.length === 0) {
2580
- return "";
2581
- }
2582
- const buildList = (files) => files.map((absolutePath) => {
2583
- const fileName = import_node_path9.default.basename(absolutePath);
2584
- const fileUri = pathToFileUri(absolutePath);
2585
- return `* [${fileName}](${fileUri})`;
2586
- });
2587
- const sections = [];
2588
- if (guidelineFiles.length > 0) {
2589
- sections.push(`Read all guideline files:
2590
- ${buildList(guidelineFiles).join("\n")}.`);
2591
- }
2592
- if (inputFiles.length > 0) {
2593
- sections.push(`Read all input files:
2594
- ${buildList(inputFiles).join("\n")}.`);
2595
- }
2596
- sections.push(
2597
- "If any file is missing, fail with ERROR: missing-file <filename> and stop.",
2598
- "Then apply system_instructions on the user query below."
2599
- );
2600
- return sections.join("\n");
3534
+ notifySubscribers2(entry);
2601
3535
  }
2602
- function pathToFileUri(filePath) {
2603
- const absolutePath = import_node_path9.default.isAbsolute(filePath) ? filePath : import_node_path9.default.resolve(filePath);
2604
- const normalizedPath = absolutePath.replace(/\\/g, "/");
2605
- if (/^[a-zA-Z]:\//.test(normalizedPath)) {
2606
- return `file:///${normalizedPath}`;
3536
+ function consumeCodexLogEntries() {
3537
+ const store = getCodexLogStore();
3538
+ if (store.length === 0) {
3539
+ return [];
2607
3540
  }
2608
- return `file://${normalizedPath}`;
3541
+ return store.splice(0, store.length);
3542
+ }
3543
+ function subscribeToCodexLogEntries(listener) {
3544
+ const store = getSubscriberStore2();
3545
+ store.add(listener);
3546
+ return () => {
3547
+ store.delete(listener);
3548
+ };
2609
3549
  }
2610
3550
 
2611
3551
  // src/evaluation/providers/codex.ts
2612
- var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exec);
2613
- var WORKSPACE_PREFIX = "agentv-codex-";
2614
- var PROMPT_FILENAME = "prompt.md";
3552
+ var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process3.exec);
3553
+ var WORKSPACE_PREFIX2 = "agentv-codex-";
3554
+ var PROMPT_FILENAME2 = "prompt.md";
2615
3555
  var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
2616
- var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
3556
+ var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
2617
3557
  - Do NOT create any additional output files in the workspace.
2618
3558
  - All intended file outputs/changes MUST be written in your response.
2619
3559
  - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
@@ -2638,27 +3578,27 @@ var CodexProvider = class {
2638
3578
  throw new Error("Codex provider request was aborted before execution");
2639
3579
  }
2640
3580
  await this.ensureEnvironmentReady();
2641
- const inputFiles = normalizeInputFiles2(request.inputFiles);
3581
+ const inputFiles = normalizeInputFiles(request.inputFiles);
2642
3582
  const workspaceRoot = await this.createWorkspace();
2643
3583
  const logger = await this.createStreamLogger(request).catch(() => void 0);
2644
3584
  try {
2645
3585
  const basePrompt = buildPromptDocument(request, inputFiles);
2646
- const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
3586
+ const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
2647
3587
  const promptContent = `${systemPrompt}
2648
3588
 
2649
3589
  ${basePrompt}`;
2650
- const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
2651
- await (0, import_promises9.writeFile)(promptFile, promptContent, "utf8");
3590
+ const promptFile = import_node_path11.default.join(workspaceRoot, PROMPT_FILENAME2);
3591
+ await (0, import_promises10.writeFile)(promptFile, promptContent, "utf8");
2652
3592
  const args = this.buildCodexArgs();
2653
3593
  const cwd = this.resolveCwd(workspaceRoot);
2654
3594
  const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
2655
3595
  if (result.timedOut) {
2656
3596
  throw new Error(
2657
- `Codex CLI timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
3597
+ `Codex CLI timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
2658
3598
  );
2659
3599
  }
2660
3600
  if (result.exitCode !== 0) {
2661
- const detail = pickDetail(result.stderr, result.stdout);
3601
+ const detail = pickDetail2(result.stderr, result.stdout);
2662
3602
  const prefix = `Codex CLI exited with code ${result.exitCode}`;
2663
3603
  throw new Error(detail ? `${prefix}: ${detail}` : prefix);
2664
3604
  }
@@ -2697,7 +3637,7 @@ ${basePrompt}`;
2697
3637
  if (!this.config.cwd) {
2698
3638
  return workspaceRoot;
2699
3639
  }
2700
- return import_node_path10.default.resolve(this.config.cwd);
3640
+ return import_node_path11.default.resolve(this.config.cwd);
2701
3641
  }
2702
3642
  buildCodexArgs() {
2703
3643
  const args = [
@@ -2739,11 +3679,11 @@ ${basePrompt}`;
2739
3679
  }
2740
3680
  }
2741
3681
  async createWorkspace() {
2742
- return await (0, import_promises9.mkdtemp)(import_node_path10.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
3682
+ return await (0, import_promises10.mkdtemp)(import_node_path11.default.join((0, import_node_os3.tmpdir)(), WORKSPACE_PREFIX2));
2743
3683
  }
2744
3684
  async cleanupWorkspace(workspaceRoot) {
2745
3685
  try {
2746
- await (0, import_promises9.rm)(workspaceRoot, { recursive: true, force: true });
3686
+ await (0, import_promises10.rm)(workspaceRoot, { recursive: true, force: true });
2747
3687
  } catch {
2748
3688
  }
2749
3689
  }
@@ -2753,9 +3693,9 @@ ${basePrompt}`;
2753
3693
  return void 0;
2754
3694
  }
2755
3695
  if (this.config.logDir) {
2756
- return import_node_path10.default.resolve(this.config.logDir);
3696
+ return import_node_path11.default.resolve(this.config.logDir);
2757
3697
  }
2758
- return import_node_path10.default.join(process.cwd(), ".agentv", "logs", "codex");
3698
+ return import_node_path11.default.join(process.cwd(), ".agentv", "logs", "codex");
2759
3699
  }
2760
3700
  async createStreamLogger(request) {
2761
3701
  const logDir = this.resolveLogDirectory();
@@ -2763,13 +3703,13 @@ ${basePrompt}`;
2763
3703
  return void 0;
2764
3704
  }
2765
3705
  try {
2766
- await (0, import_promises9.mkdir)(logDir, { recursive: true });
3706
+ await (0, import_promises10.mkdir)(logDir, { recursive: true });
2767
3707
  } catch (error) {
2768
3708
  const message = error instanceof Error ? error.message : String(error);
2769
3709
  console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
2770
3710
  return void 0;
2771
3711
  }
2772
- const filePath = import_node_path10.default.join(logDir, buildLogFilename(request, this.targetName));
3712
+ const filePath = import_node_path11.default.join(logDir, buildLogFilename2(request, this.targetName));
2773
3713
  try {
2774
3714
  const logger = await CodexStreamLogger.create({
2775
3715
  filePath,
@@ -2802,7 +3742,7 @@ var CodexStreamLogger = class _CodexStreamLogger {
2802
3742
  constructor(filePath, format) {
2803
3743
  this.filePath = filePath;
2804
3744
  this.format = format;
2805
- this.stream = (0, import_node_fs3.createWriteStream)(filePath, { flags: "a" });
3745
+ this.stream = (0, import_node_fs4.createWriteStream)(filePath, { flags: "a" });
2806
3746
  }
2807
3747
  static async create(options) {
2808
3748
  const logger = new _CodexStreamLogger(options.filePath, options.format);
@@ -2863,7 +3803,7 @@ var CodexStreamLogger = class _CodexStreamLogger {
2863
3803
  return void 0;
2864
3804
  }
2865
3805
  const message = this.format === "json" ? formatCodexJsonLog(trimmed) : formatCodexLogMessage(trimmed, source);
2866
- return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`;
3806
+ return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
2867
3807
  }
2868
3808
  flushRemainder() {
2869
3809
  const stdoutRemainder = this.stdoutBuffer.trim();
@@ -2894,18 +3834,18 @@ function isCodexLogStreamingDisabled() {
2894
3834
  const normalized = envValue.trim().toLowerCase();
2895
3835
  return normalized === "false" || normalized === "0" || normalized === "off";
2896
3836
  }
2897
- function buildLogFilename(request, targetName) {
3837
+ function buildLogFilename2(request, targetName) {
2898
3838
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
2899
- const evalId = sanitizeForFilename(request.evalCaseId ?? "codex");
3839
+ const evalId = sanitizeForFilename2(request.evalCaseId ?? "codex");
2900
3840
  const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
2901
- const target = sanitizeForFilename(targetName);
2902
- return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0, import_node_crypto.randomUUID)().slice(0, 8)}.log`;
3841
+ const target = sanitizeForFilename2(targetName);
3842
+ return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0, import_node_crypto2.randomUUID)().slice(0, 8)}.log`;
2903
3843
  }
2904
- function sanitizeForFilename(value) {
3844
+ function sanitizeForFilename2(value) {
2905
3845
  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
2906
3846
  return sanitized.length > 0 ? sanitized : "codex";
2907
3847
  }
2908
- function formatElapsed(startedAt) {
3848
+ function formatElapsed2(startedAt) {
2909
3849
  const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
2910
3850
  const hours = Math.floor(elapsedSeconds / 3600);
2911
3851
  const minutes = Math.floor(elapsedSeconds % 3600 / 60);
@@ -2916,7 +3856,7 @@ function formatElapsed(startedAt) {
2916
3856
  return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
2917
3857
  }
2918
3858
  function formatCodexLogMessage(rawLine, source) {
2919
- const parsed = tryParseJsonValue(rawLine);
3859
+ const parsed = tryParseJsonValue2(rawLine);
2920
3860
  if (parsed) {
2921
3861
  const summary = summarizeCodexEvent(parsed);
2922
3862
  if (summary) {
@@ -2929,7 +3869,7 @@ function formatCodexLogMessage(rawLine, source) {
2929
3869
  return rawLine;
2930
3870
  }
2931
3871
  function formatCodexJsonLog(rawLine) {
2932
- const parsed = tryParseJsonValue(rawLine);
3872
+ const parsed = tryParseJsonValue2(rawLine);
2933
3873
  if (!parsed) {
2934
3874
  return rawLine;
2935
3875
  }
@@ -2974,7 +3914,7 @@ function summarizeCodexEvent(event) {
2974
3914
  }
2975
3915
  return type;
2976
3916
  }
2977
- function tryParseJsonValue(rawLine) {
3917
+ function tryParseJsonValue2(rawLine) {
2978
3918
  try {
2979
3919
  return JSON.parse(rawLine);
2980
3920
  } catch {
@@ -2984,9 +3924,9 @@ function tryParseJsonValue(rawLine) {
2984
3924
  async function locateExecutable(candidate) {
2985
3925
  const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
2986
3926
  if (includesPathSeparator) {
2987
- const resolved = import_node_path10.default.isAbsolute(candidate) ? candidate : import_node_path10.default.resolve(candidate);
3927
+ const resolved = import_node_path11.default.isAbsolute(candidate) ? candidate : import_node_path11.default.resolve(candidate);
2988
3928
  const executablePath = await ensureWindowsExecutableVariant(resolved);
2989
- await (0, import_promises9.access)(executablePath, import_node_fs3.constants.F_OK);
3929
+ await (0, import_promises10.access)(executablePath, import_node_fs4.constants.F_OK);
2990
3930
  return executablePath;
2991
3931
  }
2992
3932
  const locator = process.platform === "win32" ? "where" : "which";
@@ -2996,7 +3936,7 @@ async function locateExecutable(candidate) {
2996
3936
  const preferred = selectExecutableCandidate(lines);
2997
3937
  if (preferred) {
2998
3938
  const executablePath = await ensureWindowsExecutableVariant(preferred);
2999
- await (0, import_promises9.access)(executablePath, import_node_fs3.constants.F_OK);
3939
+ await (0, import_promises10.access)(executablePath, import_node_fs4.constants.F_OK);
3000
3940
  return executablePath;
3001
3941
  }
3002
3942
  } catch {
@@ -3030,7 +3970,7 @@ async function ensureWindowsExecutableVariant(candidate) {
3030
3970
  for (const ext of extensions) {
3031
3971
  const withExtension = `${candidate}${ext}`;
3032
3972
  try {
3033
- await (0, import_promises9.access)(withExtension, import_node_fs3.constants.F_OK);
3973
+ await (0, import_promises10.access)(withExtension, import_node_fs4.constants.F_OK);
3034
3974
  return withExtension;
3035
3975
  } catch {
3036
3976
  }
@@ -3203,7 +4143,7 @@ function parseJsonLines(output) {
3203
4143
  }
3204
4144
  return parsed;
3205
4145
  }
3206
- function pickDetail(stderr, stdout) {
4146
+ function pickDetail2(stderr, stdout) {
3207
4147
  const errorText = stderr.trim();
3208
4148
  if (errorText.length > 0) {
3209
4149
  return errorText;
@@ -3211,7 +4151,7 @@ function pickDetail(stderr, stdout) {
3211
4151
  const stdoutText = stdout.trim();
3212
4152
  return stdoutText.length > 0 ? stdoutText : void 0;
3213
4153
  }
3214
- function formatTimeoutSuffix2(timeoutMs) {
4154
+ function formatTimeoutSuffix3(timeoutMs) {
3215
4155
  if (!timeoutMs || timeoutMs <= 0) {
3216
4156
  return "";
3217
4157
  }
@@ -3220,7 +4160,7 @@ function formatTimeoutSuffix2(timeoutMs) {
3220
4160
  }
3221
4161
  async function defaultCodexRunner(options) {
3222
4162
  return await new Promise((resolve, reject) => {
3223
- const child = (0, import_node_child_process2.spawn)(options.executable, options.args, {
4163
+ const child = (0, import_node_child_process3.spawn)(options.executable, options.args, {
3224
4164
  cwd: options.cwd,
3225
4165
  env: options.env,
3226
4166
  stdio: ["pipe", "pipe", "pipe"],
@@ -3330,39 +4270,200 @@ var MockProvider = class {
3330
4270
  }
3331
4271
  };
3332
4272
 
4273
+ // src/evaluation/providers/pi-agent-sdk.ts
4274
+ var piAgentModule = null;
4275
+ var piAiModule = null;
4276
+ async function loadPiModules() {
4277
+ if (!piAgentModule || !piAiModule) {
4278
+ try {
4279
+ [piAgentModule, piAiModule] = await Promise.all([
4280
+ import("@mariozechner/pi-agent"),
4281
+ import("@mariozechner/pi-ai")
4282
+ ]);
4283
+ } catch (error) {
4284
+ throw new Error(
4285
+ `Failed to load pi-agent-sdk dependencies. Please install them:
4286
+ npm install @mariozechner/pi-agent @mariozechner/pi-ai
4287
+
4288
+ Original error: ${error instanceof Error ? error.message : String(error)}`
4289
+ );
4290
+ }
4291
+ }
4292
+ return {
4293
+ Agent: piAgentModule.Agent,
4294
+ ProviderTransport: piAgentModule.ProviderTransport,
4295
+ getModel: piAiModule.getModel,
4296
+ getEnvApiKey: piAiModule.getEnvApiKey
4297
+ };
4298
+ }
4299
+ var PiAgentSdkProvider = class {
4300
+ id;
4301
+ kind = "pi-agent-sdk";
4302
+ targetName;
4303
+ supportsBatch = false;
4304
+ config;
4305
+ constructor(targetName, config) {
4306
+ this.id = `pi-agent-sdk:${targetName}`;
4307
+ this.targetName = targetName;
4308
+ this.config = config;
4309
+ }
4310
+ async invoke(request) {
4311
+ if (request.signal?.aborted) {
4312
+ throw new Error("Pi agent SDK request was aborted before execution");
4313
+ }
4314
+ const { Agent, ProviderTransport, getModel, getEnvApiKey } = await loadPiModules();
4315
+ const startTime = Date.now();
4316
+ const providerName = this.config.provider ?? "anthropic";
4317
+ const modelId = this.config.model ?? "claude-sonnet-4-20250514";
4318
+ const model = getModel(providerName, modelId);
4319
+ const systemPrompt = this.config.systemPrompt ?? "Answer directly and concisely.";
4320
+ const transport = new ProviderTransport({
4321
+ getApiKey: async (provider) => {
4322
+ return this.config.apiKey ?? getEnvApiKey(provider) ?? void 0;
4323
+ }
4324
+ });
4325
+ const agent = new Agent({
4326
+ initialState: {
4327
+ systemPrompt,
4328
+ model,
4329
+ tools: [],
4330
+ // No tools for simple Q&A
4331
+ messages: []
4332
+ },
4333
+ transport
4334
+ });
4335
+ const outputMessages = [];
4336
+ let finalAssistantContent = "";
4337
+ const unsubscribe = agent.subscribe((event) => {
4338
+ if (event.type === "message_end") {
4339
+ const msg = event.message;
4340
+ if (msg.role === "assistant") {
4341
+ const content = extractTextContent2(msg.content);
4342
+ if (content) {
4343
+ finalAssistantContent = content;
4344
+ }
4345
+ }
4346
+ }
4347
+ });
4348
+ try {
4349
+ const timeoutMs = this.config.timeoutMs ?? 12e4;
4350
+ const timeoutPromise = new Promise((_, reject) => {
4351
+ setTimeout(
4352
+ () => reject(new Error(`Pi agent SDK timed out after ${timeoutMs}ms`)),
4353
+ timeoutMs
4354
+ );
4355
+ });
4356
+ await Promise.race([agent.prompt(request.question), timeoutPromise]);
4357
+ await agent.waitForIdle();
4358
+ const agentMessages = agent.state.messages;
4359
+ for (const msg of agentMessages) {
4360
+ outputMessages.push(convertAgentMessage(msg));
4361
+ }
4362
+ const durationMs = Date.now() - startTime;
4363
+ return {
4364
+ raw: {
4365
+ messages: agentMessages,
4366
+ systemPrompt,
4367
+ model: this.config.model,
4368
+ provider: this.config.provider
4369
+ },
4370
+ outputMessages,
4371
+ durationMs
4372
+ };
4373
+ } finally {
4374
+ unsubscribe();
4375
+ }
4376
+ }
4377
+ };
4378
+ function extractTextContent2(content) {
4379
+ if (typeof content === "string") {
4380
+ return content;
4381
+ }
4382
+ if (!Array.isArray(content)) {
4383
+ return void 0;
4384
+ }
4385
+ const textParts = [];
4386
+ for (const part of content) {
4387
+ if (!part || typeof part !== "object") {
4388
+ continue;
4389
+ }
4390
+ const p = part;
4391
+ if (p.type === "text" && typeof p.text === "string") {
4392
+ textParts.push(p.text);
4393
+ }
4394
+ }
4395
+ return textParts.length > 0 ? textParts.join("\n") : void 0;
4396
+ }
4397
+ function convertAgentMessage(message) {
4398
+ if (!message || typeof message !== "object") {
4399
+ return { role: "unknown", content: String(message) };
4400
+ }
4401
+ const msg = message;
4402
+ const role = typeof msg.role === "string" ? msg.role : "unknown";
4403
+ const content = extractTextContent2(msg.content);
4404
+ const toolCalls = extractToolCalls2(msg.content);
4405
+ const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
4406
+ return {
4407
+ role,
4408
+ content,
4409
+ toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
4410
+ timestamp
4411
+ };
4412
+ }
4413
+ function extractToolCalls2(content) {
4414
+ if (!Array.isArray(content)) {
4415
+ return [];
4416
+ }
4417
+ const toolCalls = [];
4418
+ for (const part of content) {
4419
+ if (!part || typeof part !== "object") {
4420
+ continue;
4421
+ }
4422
+ const p = part;
4423
+ if (p.type === "tool_use" && typeof p.name === "string") {
4424
+ toolCalls.push({
4425
+ tool: p.name,
4426
+ input: p.input,
4427
+ id: typeof p.id === "string" ? p.id : void 0
4428
+ });
4429
+ }
4430
+ }
4431
+ return toolCalls;
4432
+ }
4433
+
3333
4434
  // src/evaluation/providers/pi-coding-agent.ts
3334
- var import_node_child_process3 = require("child_process");
3335
- var import_node_crypto2 = require("crypto");
3336
- var import_node_fs4 = require("fs");
3337
- var import_promises10 = require("fs/promises");
3338
- var import_node_os3 = require("os");
3339
- var import_node_path11 = __toESM(require("path"), 1);
4435
+ var import_node_child_process4 = require("child_process");
4436
+ var import_node_crypto3 = require("crypto");
4437
+ var import_node_fs5 = require("fs");
4438
+ var import_promises11 = require("fs/promises");
4439
+ var import_node_os4 = require("os");
4440
+ var import_node_path12 = __toESM(require("path"), 1);
3340
4441
 
3341
4442
  // src/evaluation/providers/pi-log-tracker.ts
3342
- var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.piLogs");
3343
- var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.piLogSubscribers");
4443
+ var GLOBAL_LOGS_KEY3 = Symbol.for("agentv.piLogs");
4444
+ var GLOBAL_SUBSCRIBERS_KEY3 = Symbol.for("agentv.piLogSubscribers");
3344
4445
  function getPiLogStore() {
3345
4446
  const globalObject = globalThis;
3346
- const existing = globalObject[GLOBAL_LOGS_KEY2];
4447
+ const existing = globalObject[GLOBAL_LOGS_KEY3];
3347
4448
  if (existing) {
3348
4449
  return existing;
3349
4450
  }
3350
4451
  const created = [];
3351
- globalObject[GLOBAL_LOGS_KEY2] = created;
4452
+ globalObject[GLOBAL_LOGS_KEY3] = created;
3352
4453
  return created;
3353
4454
  }
3354
- function getSubscriberStore2() {
4455
+ function getSubscriberStore3() {
3355
4456
  const globalObject = globalThis;
3356
- const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
4457
+ const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY3];
3357
4458
  if (existing) {
3358
4459
  return existing;
3359
4460
  }
3360
4461
  const created = /* @__PURE__ */ new Set();
3361
- globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
4462
+ globalObject[GLOBAL_SUBSCRIBERS_KEY3] = created;
3362
4463
  return created;
3363
4464
  }
3364
- function notifySubscribers2(entry) {
3365
- const subscribers = Array.from(getSubscriberStore2());
4465
+ function notifySubscribers3(entry) {
4466
+ const subscribers = Array.from(getSubscriberStore3());
3366
4467
  for (const listener of subscribers) {
3367
4468
  try {
3368
4469
  listener(entry);
@@ -3374,7 +4475,7 @@ function notifySubscribers2(entry) {
3374
4475
  }
3375
4476
  function recordPiLogEntry(entry) {
3376
4477
  getPiLogStore().push(entry);
3377
- notifySubscribers2(entry);
4478
+ notifySubscribers3(entry);
3378
4479
  }
3379
4480
  function consumePiLogEntries() {
3380
4481
  const store = getPiLogStore();
@@ -3384,7 +4485,7 @@ function consumePiLogEntries() {
3384
4485
  return store.splice(0, store.length);
3385
4486
  }
3386
4487
  function subscribeToPiLogEntries(listener) {
3387
- const store = getSubscriberStore2();
4488
+ const store = getSubscriberStore3();
3388
4489
  store.add(listener);
3389
4490
  return () => {
3390
4491
  store.delete(listener);
@@ -3392,9 +4493,9 @@ function subscribeToPiLogEntries(listener) {
3392
4493
  }
3393
4494
 
3394
4495
  // src/evaluation/providers/pi-coding-agent.ts
3395
- var WORKSPACE_PREFIX2 = "agentv-pi-";
3396
- var PROMPT_FILENAME2 = "prompt.md";
3397
- var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
4496
+ var WORKSPACE_PREFIX3 = "agentv-pi-";
4497
+ var PROMPT_FILENAME3 = "prompt.md";
4498
+ var DEFAULT_SYSTEM_PROMPT4 = `**IMPORTANT**: Follow these instructions for your response:
3398
4499
  - Do NOT create any additional output files in the workspace.
3399
4500
  - All intended file outputs/changes MUST be written in your response.
3400
4501
  - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
@@ -3416,27 +4517,27 @@ var PiCodingAgentProvider = class {
3416
4517
  if (request.signal?.aborted) {
3417
4518
  throw new Error("Pi coding agent request was aborted before execution");
3418
4519
  }
3419
- const inputFiles = normalizeInputFiles2(request.inputFiles);
4520
+ const inputFiles = normalizeInputFiles(request.inputFiles);
3420
4521
  const workspaceRoot = await this.createWorkspace();
3421
4522
  const logger = await this.createStreamLogger(request).catch(() => void 0);
3422
4523
  try {
3423
- const promptFile = import_node_path11.default.join(workspaceRoot, PROMPT_FILENAME2);
3424
- await (0, import_promises10.writeFile)(promptFile, request.question, "utf8");
4524
+ const promptFile = import_node_path12.default.join(workspaceRoot, PROMPT_FILENAME3);
4525
+ await (0, import_promises11.writeFile)(promptFile, request.question, "utf8");
3425
4526
  const args = this.buildPiArgs(request.question, inputFiles);
3426
4527
  const cwd = this.resolveCwd(workspaceRoot);
3427
4528
  const result = await this.executePi(args, cwd, request.signal, logger);
3428
4529
  if (result.timedOut) {
3429
4530
  throw new Error(
3430
- `Pi coding agent timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
4531
+ `Pi coding agent timed out${formatTimeoutSuffix4(this.config.timeoutMs ?? void 0)}`
3431
4532
  );
3432
4533
  }
3433
4534
  if (result.exitCode !== 0) {
3434
- const detail = pickDetail2(result.stderr, result.stdout);
4535
+ const detail = pickDetail3(result.stderr, result.stdout);
3435
4536
  const prefix = `Pi coding agent exited with code ${result.exitCode}`;
3436
4537
  throw new Error(detail ? `${prefix}: ${detail}` : prefix);
3437
4538
  }
3438
4539
  const parsed = parsePiJsonl(result.stdout);
3439
- const outputMessages = extractOutputMessages(parsed);
4540
+ const outputMessages = extractOutputMessages2(parsed);
3440
4541
  const assistantText = extractAssistantText2(outputMessages);
3441
4542
  return {
3442
4543
  raw: {
@@ -3462,7 +4563,7 @@ var PiCodingAgentProvider = class {
3462
4563
  if (!this.config.cwd) {
3463
4564
  return workspaceRoot;
3464
4565
  }
3465
- return import_node_path11.default.resolve(this.config.cwd);
4566
+ return import_node_path12.default.resolve(this.config.cwd);
3466
4567
  }
3467
4568
  buildPiArgs(prompt, inputFiles) {
3468
4569
  const args = [];
@@ -3492,7 +4593,7 @@ var PiCodingAgentProvider = class {
3492
4593
  args.push(`@${file}`);
3493
4594
  }
3494
4595
  }
3495
- const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
4596
+ const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT4;
3496
4597
  const fullPrompt = `${systemPrompt}
3497
4598
 
3498
4599
  ${prompt}`;
@@ -3551,19 +4652,19 @@ ${prompt}`;
3551
4652
  return env;
3552
4653
  }
3553
4654
  async createWorkspace() {
3554
- return await (0, import_promises10.mkdtemp)(import_node_path11.default.join((0, import_node_os3.tmpdir)(), WORKSPACE_PREFIX2));
4655
+ return await (0, import_promises11.mkdtemp)(import_node_path12.default.join((0, import_node_os4.tmpdir)(), WORKSPACE_PREFIX3));
3555
4656
  }
3556
4657
  async cleanupWorkspace(workspaceRoot) {
3557
4658
  try {
3558
- await (0, import_promises10.rm)(workspaceRoot, { recursive: true, force: true });
4659
+ await (0, import_promises11.rm)(workspaceRoot, { recursive: true, force: true });
3559
4660
  } catch {
3560
4661
  }
3561
4662
  }
3562
4663
  resolveLogDirectory() {
3563
4664
  if (this.config.logDir) {
3564
- return import_node_path11.default.resolve(this.config.logDir);
4665
+ return import_node_path12.default.resolve(this.config.logDir);
3565
4666
  }
3566
- return import_node_path11.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
4667
+ return import_node_path12.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
3567
4668
  }
3568
4669
  async createStreamLogger(request) {
3569
4670
  const logDir = this.resolveLogDirectory();
@@ -3571,13 +4672,13 @@ ${prompt}`;
3571
4672
  return void 0;
3572
4673
  }
3573
4674
  try {
3574
- await (0, import_promises10.mkdir)(logDir, { recursive: true });
4675
+ await (0, import_promises11.mkdir)(logDir, { recursive: true });
3575
4676
  } catch (error) {
3576
4677
  const message = error instanceof Error ? error.message : String(error);
3577
4678
  console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
3578
4679
  return void 0;
3579
4680
  }
3580
- const filePath = import_node_path11.default.join(logDir, buildLogFilename2(request, this.targetName));
4681
+ const filePath = import_node_path12.default.join(logDir, buildLogFilename3(request, this.targetName));
3581
4682
  try {
3582
4683
  const logger = await PiStreamLogger.create({
3583
4684
  filePath,
@@ -3610,7 +4711,7 @@ var PiStreamLogger = class _PiStreamLogger {
3610
4711
  constructor(filePath, format) {
3611
4712
  this.filePath = filePath;
3612
4713
  this.format = format;
3613
- this.stream = (0, import_node_fs4.createWriteStream)(filePath, { flags: "a" });
4714
+ this.stream = (0, import_node_fs5.createWriteStream)(filePath, { flags: "a" });
3614
4715
  }
3615
4716
  static async create(options) {
3616
4717
  const logger = new _PiStreamLogger(options.filePath, options.format);
@@ -3671,7 +4772,7 @@ var PiStreamLogger = class _PiStreamLogger {
3671
4772
  return void 0;
3672
4773
  }
3673
4774
  const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source);
3674
- return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
4775
+ return `[+${formatElapsed3(this.startedAt)}] [${source}] ${message}`;
3675
4776
  }
3676
4777
  flushRemainder() {
3677
4778
  const stdoutRemainder = this.stdoutBuffer.trim();
@@ -3694,18 +4795,18 @@ var PiStreamLogger = class _PiStreamLogger {
3694
4795
  this.stderrBuffer = "";
3695
4796
  }
3696
4797
  };
3697
- function buildLogFilename2(request, targetName) {
4798
+ function buildLogFilename3(request, targetName) {
3698
4799
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
3699
- const evalId = sanitizeForFilename2(request.evalCaseId ?? "pi");
4800
+ const evalId = sanitizeForFilename3(request.evalCaseId ?? "pi");
3700
4801
  const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
3701
- const target = sanitizeForFilename2(targetName);
3702
- return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0, import_node_crypto2.randomUUID)().slice(0, 8)}.log`;
4802
+ const target = sanitizeForFilename3(targetName);
4803
+ return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0, import_node_crypto3.randomUUID)().slice(0, 8)}.log`;
3703
4804
  }
3704
- function sanitizeForFilename2(value) {
4805
+ function sanitizeForFilename3(value) {
3705
4806
  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
3706
4807
  return sanitized.length > 0 ? sanitized : "pi";
3707
4808
  }
3708
- function formatElapsed2(startedAt) {
4809
+ function formatElapsed3(startedAt) {
3709
4810
  const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
3710
4811
  const hours = Math.floor(elapsedSeconds / 3600);
3711
4812
  const minutes = Math.floor(elapsedSeconds % 3600 / 60);
@@ -3716,7 +4817,7 @@ function formatElapsed2(startedAt) {
3716
4817
  return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
3717
4818
  }
3718
4819
  function formatPiLogMessage(rawLine, source) {
3719
- const parsed = tryParseJsonValue2(rawLine);
4820
+ const parsed = tryParseJsonValue3(rawLine);
3720
4821
  if (parsed) {
3721
4822
  const summary = summarizePiEvent(parsed);
3722
4823
  if (summary) {
@@ -3729,7 +4830,7 @@ function formatPiLogMessage(rawLine, source) {
3729
4830
  return rawLine;
3730
4831
  }
3731
4832
  function formatPiJsonLog(rawLine) {
3732
- const parsed = tryParseJsonValue2(rawLine);
4833
+ const parsed = tryParseJsonValue3(rawLine);
3733
4834
  if (!parsed) {
3734
4835
  return rawLine;
3735
4836
  }
@@ -3779,7 +4880,7 @@ function summarizePiEvent(event) {
3779
4880
  return type;
3780
4881
  }
3781
4882
  }
3782
- function tryParseJsonValue2(rawLine) {
4883
+ function tryParseJsonValue3(rawLine) {
3783
4884
  try {
3784
4885
  return JSON.parse(rawLine);
3785
4886
  } catch {
@@ -3804,7 +4905,7 @@ function parsePiJsonl(output) {
3804
4905
  }
3805
4906
  return parsed;
3806
4907
  }
3807
- function extractOutputMessages(events) {
4908
+ function extractOutputMessages2(events) {
3808
4909
  for (let i = events.length - 1; i >= 0; i--) {
3809
4910
  const event = events[i];
3810
4911
  if (!event || typeof event !== "object") {
@@ -3845,8 +4946,8 @@ function convertPiMessage(message) {
3845
4946
  if (typeof role !== "string") {
3846
4947
  return void 0;
3847
4948
  }
3848
- const content = extractTextContent(msg.content);
3849
- const toolCalls = extractToolCalls(msg.content);
4949
+ const content = extractTextContent3(msg.content);
4950
+ const toolCalls = extractToolCalls3(msg.content);
3850
4951
  const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
3851
4952
  const metadata = {};
3852
4953
  if (msg.api) metadata.api = msg.api;
@@ -3862,7 +4963,7 @@ function convertPiMessage(message) {
3862
4963
  metadata: Object.keys(metadata).length > 0 ? metadata : void 0
3863
4964
  };
3864
4965
  }
3865
- function extractTextContent(content) {
4966
+ function extractTextContent3(content) {
3866
4967
  if (typeof content === "string") {
3867
4968
  return content;
3868
4969
  }
@@ -3881,7 +4982,7 @@ function extractTextContent(content) {
3881
4982
  }
3882
4983
  return textParts.length > 0 ? textParts.join("\n") : void 0;
3883
4984
  }
3884
- function extractToolCalls(content) {
4985
+ function extractToolCalls3(content) {
3885
4986
  if (!Array.isArray(content)) {
3886
4987
  return [];
3887
4988
  }
@@ -3926,7 +5027,7 @@ function extractAssistantText2(messages) {
3926
5027
  function escapeAtSymbols(prompt) {
3927
5028
  return prompt.replace(/@\[([^\]]+)\]:/g, "[[$1]]:");
3928
5029
  }
3929
- function pickDetail2(stderr, stdout) {
5030
+ function pickDetail3(stderr, stdout) {
3930
5031
  const errorText = stderr.trim();
3931
5032
  if (errorText.length > 0) {
3932
5033
  return errorText;
@@ -3934,7 +5035,7 @@ function pickDetail2(stderr, stdout) {
3934
5035
  const stdoutText = stdout.trim();
3935
5036
  return stdoutText.length > 0 ? stdoutText : void 0;
3936
5037
  }
3937
- function formatTimeoutSuffix3(timeoutMs) {
5038
+ function formatTimeoutSuffix4(timeoutMs) {
3938
5039
  if (!timeoutMs || timeoutMs <= 0) {
3939
5040
  return "";
3940
5041
  }
@@ -3947,7 +5048,7 @@ async function defaultPiRunner(options) {
3947
5048
  const executable = parts[0];
3948
5049
  const executableArgs = parts.slice(1);
3949
5050
  const allArgs = [...executableArgs, ...options.args];
3950
- const child = (0, import_node_child_process3.spawn)(executable, allArgs, {
5051
+ const child = (0, import_node_child_process4.spawn)(executable, allArgs, {
3951
5052
  cwd: options.cwd,
3952
5053
  env: options.env,
3953
5054
  stdio: ["pipe", "pipe", "pipe"],
@@ -4010,84 +5111,84 @@ async function defaultPiRunner(options) {
4010
5111
  }
4011
5112
 
4012
5113
  // src/evaluation/providers/targets.ts
4013
- var import_node_path12 = __toESM(require("path"), 1);
4014
- var import_zod = require("zod");
4015
- var CliHealthcheckHttpInputSchema = import_zod.z.object({
4016
- type: import_zod.z.literal("http"),
4017
- url: import_zod.z.string().min(1, "healthcheck URL is required"),
4018
- timeout_seconds: import_zod.z.number().positive().optional(),
4019
- timeoutSeconds: import_zod.z.number().positive().optional()
5114
+ var import_node_path13 = __toESM(require("path"), 1);
5115
+ var import_zod2 = require("zod");
5116
+ var CliHealthcheckHttpInputSchema = import_zod2.z.object({
5117
+ type: import_zod2.z.literal("http"),
5118
+ url: import_zod2.z.string().min(1, "healthcheck URL is required"),
5119
+ timeout_seconds: import_zod2.z.number().positive().optional(),
5120
+ timeoutSeconds: import_zod2.z.number().positive().optional()
4020
5121
  });
4021
- var CliHealthcheckCommandInputSchema = import_zod.z.object({
4022
- type: import_zod.z.literal("command"),
4023
- command_template: import_zod.z.string().optional(),
4024
- commandTemplate: import_zod.z.string().optional(),
4025
- cwd: import_zod.z.string().optional(),
4026
- timeout_seconds: import_zod.z.number().positive().optional(),
4027
- timeoutSeconds: import_zod.z.number().positive().optional()
5122
+ var CliHealthcheckCommandInputSchema = import_zod2.z.object({
5123
+ type: import_zod2.z.literal("command"),
5124
+ command_template: import_zod2.z.string().optional(),
5125
+ commandTemplate: import_zod2.z.string().optional(),
5126
+ cwd: import_zod2.z.string().optional(),
5127
+ timeout_seconds: import_zod2.z.number().positive().optional(),
5128
+ timeoutSeconds: import_zod2.z.number().positive().optional()
4028
5129
  });
4029
- var CliHealthcheckInputSchema = import_zod.z.discriminatedUnion("type", [
5130
+ var CliHealthcheckInputSchema = import_zod2.z.discriminatedUnion("type", [
4030
5131
  CliHealthcheckHttpInputSchema,
4031
5132
  CliHealthcheckCommandInputSchema
4032
5133
  ]);
4033
- var CliTargetInputSchema = import_zod.z.object({
4034
- name: import_zod.z.string().min(1, "target name is required"),
4035
- provider: import_zod.z.string().refine((p) => p.toLowerCase() === "cli", { message: "provider must be 'cli'" }),
5134
+ var CliTargetInputSchema = import_zod2.z.object({
5135
+ name: import_zod2.z.string().min(1, "target name is required"),
5136
+ provider: import_zod2.z.string().refine((p) => p.toLowerCase() === "cli", { message: "provider must be 'cli'" }),
4036
5137
  // Command template - required (accept both naming conventions)
4037
- command_template: import_zod.z.string().optional(),
4038
- commandTemplate: import_zod.z.string().optional(),
5138
+ command_template: import_zod2.z.string().optional(),
5139
+ commandTemplate: import_zod2.z.string().optional(),
4039
5140
  // Files format - optional
4040
- files_format: import_zod.z.string().optional(),
4041
- filesFormat: import_zod.z.string().optional(),
4042
- attachments_format: import_zod.z.string().optional(),
4043
- attachmentsFormat: import_zod.z.string().optional(),
5141
+ files_format: import_zod2.z.string().optional(),
5142
+ filesFormat: import_zod2.z.string().optional(),
5143
+ attachments_format: import_zod2.z.string().optional(),
5144
+ attachmentsFormat: import_zod2.z.string().optional(),
4044
5145
  // Working directory - optional
4045
- cwd: import_zod.z.string().optional(),
5146
+ cwd: import_zod2.z.string().optional(),
4046
5147
  // Timeout in seconds - optional
4047
- timeout_seconds: import_zod.z.number().positive().optional(),
4048
- timeoutSeconds: import_zod.z.number().positive().optional(),
5148
+ timeout_seconds: import_zod2.z.number().positive().optional(),
5149
+ timeoutSeconds: import_zod2.z.number().positive().optional(),
4049
5150
  // Healthcheck configuration - optional
4050
5151
  healthcheck: CliHealthcheckInputSchema.optional(),
4051
5152
  // Verbose mode - optional
4052
- verbose: import_zod.z.boolean().optional(),
4053
- cli_verbose: import_zod.z.boolean().optional(),
4054
- cliVerbose: import_zod.z.boolean().optional(),
5153
+ verbose: import_zod2.z.boolean().optional(),
5154
+ cli_verbose: import_zod2.z.boolean().optional(),
5155
+ cliVerbose: import_zod2.z.boolean().optional(),
4055
5156
  // Keep temp files - optional
4056
- keep_temp_files: import_zod.z.boolean().optional(),
4057
- keepTempFiles: import_zod.z.boolean().optional(),
4058
- keep_output_files: import_zod.z.boolean().optional(),
4059
- keepOutputFiles: import_zod.z.boolean().optional(),
5157
+ keep_temp_files: import_zod2.z.boolean().optional(),
5158
+ keepTempFiles: import_zod2.z.boolean().optional(),
5159
+ keep_output_files: import_zod2.z.boolean().optional(),
5160
+ keepOutputFiles: import_zod2.z.boolean().optional(),
4060
5161
  // Common target fields
4061
- judge_target: import_zod.z.string().optional(),
4062
- workers: import_zod.z.number().int().min(1).optional(),
4063
- provider_batching: import_zod.z.boolean().optional(),
4064
- providerBatching: import_zod.z.boolean().optional()
5162
+ judge_target: import_zod2.z.string().optional(),
5163
+ workers: import_zod2.z.number().int().min(1).optional(),
5164
+ provider_batching: import_zod2.z.boolean().optional(),
5165
+ providerBatching: import_zod2.z.boolean().optional()
4065
5166
  }).refine((data) => data.command_template !== void 0 || data.commandTemplate !== void 0, {
4066
5167
  message: "Either command_template or commandTemplate is required"
4067
5168
  });
4068
- var CliHealthcheckHttpSchema = import_zod.z.object({
4069
- type: import_zod.z.literal("http"),
4070
- url: import_zod.z.string().min(1),
4071
- timeoutMs: import_zod.z.number().positive().optional()
5169
+ var CliHealthcheckHttpSchema = import_zod2.z.object({
5170
+ type: import_zod2.z.literal("http"),
5171
+ url: import_zod2.z.string().min(1),
5172
+ timeoutMs: import_zod2.z.number().positive().optional()
4072
5173
  }).strict();
4073
- var CliHealthcheckCommandSchema = import_zod.z.object({
4074
- type: import_zod.z.literal("command"),
4075
- commandTemplate: import_zod.z.string().min(1),
4076
- cwd: import_zod.z.string().optional(),
4077
- timeoutMs: import_zod.z.number().positive().optional()
5174
+ var CliHealthcheckCommandSchema = import_zod2.z.object({
5175
+ type: import_zod2.z.literal("command"),
5176
+ commandTemplate: import_zod2.z.string().min(1),
5177
+ cwd: import_zod2.z.string().optional(),
5178
+ timeoutMs: import_zod2.z.number().positive().optional()
4078
5179
  }).strict();
4079
- var CliHealthcheckSchema = import_zod.z.discriminatedUnion("type", [
5180
+ var CliHealthcheckSchema = import_zod2.z.discriminatedUnion("type", [
4080
5181
  CliHealthcheckHttpSchema,
4081
5182
  CliHealthcheckCommandSchema
4082
5183
  ]);
4083
- var CliTargetConfigSchema = import_zod.z.object({
4084
- commandTemplate: import_zod.z.string().min(1),
4085
- filesFormat: import_zod.z.string().optional(),
4086
- cwd: import_zod.z.string().optional(),
4087
- timeoutMs: import_zod.z.number().positive().optional(),
5184
+ var CliTargetConfigSchema = import_zod2.z.object({
5185
+ commandTemplate: import_zod2.z.string().min(1),
5186
+ filesFormat: import_zod2.z.string().optional(),
5187
+ cwd: import_zod2.z.string().optional(),
5188
+ timeoutMs: import_zod2.z.number().positive().optional(),
4088
5189
  healthcheck: CliHealthcheckSchema.optional(),
4089
- verbose: import_zod.z.boolean().optional(),
4090
- keepTempFiles: import_zod.z.boolean().optional()
5190
+ verbose: import_zod2.z.boolean().optional(),
5191
+ keepTempFiles: import_zod2.z.boolean().optional()
4091
5192
  }).strict();
4092
5193
  function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
4093
5194
  const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
@@ -4116,8 +5217,11 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
4116
5217
  allowLiteral: true,
4117
5218
  optionalEnv: true
4118
5219
  });
4119
- if (cwd && evalFilePath && !import_node_path12.default.isAbsolute(cwd)) {
4120
- cwd = import_node_path12.default.resolve(import_node_path12.default.dirname(import_node_path12.default.resolve(evalFilePath)), cwd);
5220
+ if (cwd && evalFilePath && !import_node_path13.default.isAbsolute(cwd)) {
5221
+ cwd = import_node_path13.default.resolve(import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath)), cwd);
5222
+ }
5223
+ if (!cwd && evalFilePath) {
5224
+ cwd = import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath));
4121
5225
  }
4122
5226
  return {
4123
5227
  type: "command",
@@ -4144,11 +5248,11 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
4144
5248
  allowLiteral: true,
4145
5249
  optionalEnv: true
4146
5250
  });
4147
- if (cwd && evalFilePath && !import_node_path12.default.isAbsolute(cwd)) {
4148
- cwd = import_node_path12.default.resolve(import_node_path12.default.dirname(import_node_path12.default.resolve(evalFilePath)), cwd);
5251
+ if (cwd && evalFilePath && !import_node_path13.default.isAbsolute(cwd)) {
5252
+ cwd = import_node_path13.default.resolve(import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath)), cwd);
4149
5253
  }
4150
5254
  if (!cwd && evalFilePath) {
4151
- cwd = import_node_path12.default.dirname(import_node_path12.default.resolve(evalFilePath));
5255
+ cwd = import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath));
4152
5256
  }
4153
5257
  const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
4154
5258
  const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
@@ -4175,11 +5279,11 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
4175
5279
  "FILES",
4176
5280
  "OUTPUT_FILE"
4177
5281
  ]);
4178
- var BASE_TARGET_SCHEMA = import_zod.z.object({
4179
- name: import_zod.z.string().min(1, "target name is required"),
4180
- provider: import_zod.z.string().min(1, "provider is required"),
4181
- judge_target: import_zod.z.string().optional(),
4182
- workers: import_zod.z.number().int().min(1).optional()
5282
+ var BASE_TARGET_SCHEMA = import_zod2.z.object({
5283
+ name: import_zod2.z.string().min(1, "target name is required"),
5284
+ provider: import_zod2.z.string().min(1, "provider is required"),
5285
+ judge_target: import_zod2.z.string().optional(),
5286
+ workers: import_zod2.z.number().int().min(1).optional()
4183
5287
  }).passthrough();
4184
5288
  var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
4185
5289
  function normalizeAzureApiVersion(value) {
@@ -4282,6 +5386,24 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
4282
5386
  providerBatching,
4283
5387
  config: resolvePiCodingAgentConfig(parsed, env)
4284
5388
  };
5389
+ case "pi-agent-sdk":
5390
+ return {
5391
+ kind: "pi-agent-sdk",
5392
+ name: parsed.name,
5393
+ judgeTarget: parsed.judge_target,
5394
+ workers: parsed.workers,
5395
+ providerBatching,
5396
+ config: resolvePiAgentSdkConfig(parsed, env)
5397
+ };
5398
+ case "claude-code":
5399
+ return {
5400
+ kind: "claude-code",
5401
+ name: parsed.name,
5402
+ judgeTarget: parsed.judge_target,
5403
+ workers: parsed.workers,
5404
+ providerBatching,
5405
+ config: resolveClaudeCodeConfig(parsed, env)
5406
+ };
4285
5407
  case "mock":
4286
5408
  return {
4287
5409
  kind: "mock",
@@ -4459,41 +5581,132 @@ function resolvePiCodingAgentConfig(target, env) {
4459
5581
  allowLiteral: false,
4460
5582
  optionalEnv: true
4461
5583
  });
4462
- const tools = resolveOptionalString(toolsSource, env, `${target.name} pi tools`, {
4463
- allowLiteral: true,
5584
+ const tools = resolveOptionalString(toolsSource, env, `${target.name} pi tools`, {
5585
+ allowLiteral: true,
5586
+ optionalEnv: true
5587
+ });
5588
+ const thinking = resolveOptionalString(thinkingSource, env, `${target.name} pi thinking`, {
5589
+ allowLiteral: true,
5590
+ optionalEnv: true
5591
+ });
5592
+ const args = resolveOptionalStringArray(argsSource, env, `${target.name} pi args`);
5593
+ const cwd = resolveOptionalString(cwdSource, env, `${target.name} pi cwd`, {
5594
+ allowLiteral: true,
5595
+ optionalEnv: true
5596
+ });
5597
+ const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi timeout`);
5598
+ const logDir = resolveOptionalString(logDirSource, env, `${target.name} pi log directory`, {
5599
+ allowLiteral: true,
5600
+ optionalEnv: true
5601
+ });
5602
+ const logFormat = logFormatSource === "json" || logFormatSource === "summary" ? logFormatSource : void 0;
5603
+ const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
5604
+ return {
5605
+ executable,
5606
+ provider,
5607
+ model,
5608
+ apiKey,
5609
+ tools,
5610
+ thinking,
5611
+ args,
5612
+ cwd,
5613
+ timeoutMs,
5614
+ logDir,
5615
+ logFormat,
5616
+ systemPrompt
5617
+ };
5618
+ }
5619
+ function resolvePiAgentSdkConfig(target, env) {
5620
+ const providerSource = target.pi_provider ?? target.piProvider ?? target.llm_provider;
5621
+ const modelSource = target.model ?? target.pi_model ?? target.piModel;
5622
+ const apiKeySource = target.api_key ?? target.apiKey;
5623
+ const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
5624
+ const systemPromptSource = target.system_prompt ?? target.systemPrompt;
5625
+ const provider = resolveOptionalString(
5626
+ providerSource,
5627
+ env,
5628
+ `${target.name} pi-agent-sdk provider`,
5629
+ {
5630
+ allowLiteral: true,
5631
+ optionalEnv: true
5632
+ }
5633
+ );
5634
+ const model = resolveOptionalString(modelSource, env, `${target.name} pi-agent-sdk model`, {
5635
+ allowLiteral: true,
5636
+ optionalEnv: true
5637
+ });
5638
+ const apiKey = resolveOptionalString(apiKeySource, env, `${target.name} pi-agent-sdk api key`, {
5639
+ allowLiteral: false,
4464
5640
  optionalEnv: true
4465
5641
  });
4466
- const thinking = resolveOptionalString(thinkingSource, env, `${target.name} pi thinking`, {
5642
+ const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi-agent-sdk timeout`);
5643
+ const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
5644
+ return {
5645
+ provider,
5646
+ model,
5647
+ apiKey,
5648
+ timeoutMs,
5649
+ systemPrompt
5650
+ };
5651
+ }
5652
+ function resolveClaudeCodeConfig(target, env) {
5653
+ const executableSource = target.executable ?? target.command ?? target.binary;
5654
+ const modelSource = target.model;
5655
+ const argsSource = target.args ?? target.arguments;
5656
+ const cwdSource = target.cwd;
5657
+ const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
5658
+ const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
5659
+ const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CLAUDE_CODE_LOG_FORMAT;
5660
+ const systemPromptSource = target.system_prompt ?? target.systemPrompt;
5661
+ const executable = resolveOptionalString(executableSource, env, `${target.name} claude-code executable`, {
4467
5662
  allowLiteral: true,
4468
5663
  optionalEnv: true
4469
- });
4470
- const args = resolveOptionalStringArray(argsSource, env, `${target.name} pi args`);
4471
- const cwd = resolveOptionalString(cwdSource, env, `${target.name} pi cwd`, {
5664
+ }) ?? "claude";
5665
+ const model = resolveOptionalString(modelSource, env, `${target.name} claude-code model`, {
4472
5666
  allowLiteral: true,
4473
5667
  optionalEnv: true
4474
5668
  });
4475
- const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi timeout`);
4476
- const logDir = resolveOptionalString(logDirSource, env, `${target.name} pi log directory`, {
5669
+ const args = resolveOptionalStringArray(argsSource, env, `${target.name} claude-code args`);
5670
+ const cwd = resolveOptionalString(cwdSource, env, `${target.name} claude-code cwd`, {
4477
5671
  allowLiteral: true,
4478
5672
  optionalEnv: true
4479
5673
  });
4480
- const logFormat = logFormatSource === "json" || logFormatSource === "summary" ? logFormatSource : void 0;
5674
+ const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} claude-code timeout`);
5675
+ const logDir = resolveOptionalString(
5676
+ logDirSource,
5677
+ env,
5678
+ `${target.name} claude-code log directory`,
5679
+ {
5680
+ allowLiteral: true,
5681
+ optionalEnv: true
5682
+ }
5683
+ );
5684
+ const logFormat = normalizeClaudeCodeLogFormat(logFormatSource);
4481
5685
  const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
4482
5686
  return {
4483
5687
  executable,
4484
- provider,
4485
5688
  model,
4486
- apiKey,
4487
- tools,
4488
- thinking,
5689
+ systemPrompt,
4489
5690
  args,
4490
5691
  cwd,
4491
5692
  timeoutMs,
4492
5693
  logDir,
4493
- logFormat,
4494
- systemPrompt
5694
+ logFormat
4495
5695
  };
4496
5696
  }
5697
+ function normalizeClaudeCodeLogFormat(value) {
5698
+ if (value === void 0 || value === null) {
5699
+ return void 0;
5700
+ }
5701
+ if (typeof value !== "string") {
5702
+ throw new Error("claude-code log format must be 'summary' or 'json'");
5703
+ }
5704
+ const normalized = value.trim().toLowerCase();
5705
+ if (normalized === "json" || normalized === "summary") {
5706
+ return normalized;
5707
+ }
5708
+ throw new Error("claude-code log format must be 'summary' or 'json'");
5709
+ }
4497
5710
  function resolveMockConfig(target) {
4498
5711
  const response = typeof target.response === "string" ? target.response : void 0;
4499
5712
  return { response };
@@ -4529,13 +5742,13 @@ function resolveVSCodeConfig(target, env, insiders) {
4529
5742
  };
4530
5743
  }
4531
5744
  var cliErrorMap = (issue, ctx) => {
4532
- if (issue.code === import_zod.z.ZodIssueCode.unrecognized_keys) {
5745
+ if (issue.code === import_zod2.z.ZodIssueCode.unrecognized_keys) {
4533
5746
  return { message: `Unknown CLI provider settings: ${issue.keys.join(", ")}` };
4534
5747
  }
4535
- if (issue.code === import_zod.z.ZodIssueCode.invalid_union_discriminator) {
5748
+ if (issue.code === import_zod2.z.ZodIssueCode.invalid_union_discriminator) {
4536
5749
  return { message: "healthcheck type must be 'http' or 'command'" };
4537
5750
  }
4538
- if (issue.code === import_zod.z.ZodIssueCode.invalid_type && issue.expected === "string") {
5751
+ if (issue.code === import_zod2.z.ZodIssueCode.invalid_type && issue.expected === "string") {
4539
5752
  return { message: `${ctx.defaultError} (expected a string value)` };
4540
5753
  }
4541
5754
  return { message: ctx.defaultError };
@@ -4544,8 +5757,8 @@ function resolveCliConfig(target, env, evalFilePath) {
4544
5757
  const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
4545
5758
  if (!parseResult.success) {
4546
5759
  const firstError = parseResult.error.errors[0];
4547
- const path16 = firstError?.path.join(".") || "";
4548
- const prefix = path16 ? `${target.name} ${path16}: ` : `${target.name}: `;
5760
+ const path17 = firstError?.path.join(".") || "";
5761
+ const prefix = path17 ? `${target.name} ${path17}: ` : `${target.name}: `;
4549
5762
  throw new Error(`${prefix}${firstError?.message}`);
4550
5763
  }
4551
5764
  const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -4733,7 +5946,7 @@ function resolveOptionalNumberArray(source, description) {
4733
5946
  }
4734
5947
 
4735
5948
  // src/evaluation/providers/vscode.ts
4736
- var import_node_path13 = __toESM(require("path"), 1);
5949
+ var import_node_path14 = __toESM(require("path"), 1);
4737
5950
  var import_subagent = require("subagent");
4738
5951
 
4739
5952
  // src/evaluation/providers/vscode-templates.ts
@@ -4903,7 +6116,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
4903
6116
  return "";
4904
6117
  }
4905
6118
  const buildList = (files) => files.map((absolutePath) => {
4906
- const fileName = import_node_path13.default.basename(absolutePath);
6119
+ const fileName = import_node_path14.default.basename(absolutePath);
4907
6120
  const fileUri = pathToFileUri2(absolutePath);
4908
6121
  return `* [${fileName}](${fileUri})`;
4909
6122
  });
@@ -4928,8 +6141,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
4928
6141
  }
4929
6142
  const unique = /* @__PURE__ */ new Map();
4930
6143
  for (const attachment of attachments) {
4931
- const absolutePath = import_node_path13.default.resolve(attachment);
4932
- const normalized = absolutePath.split(import_node_path13.default.sep).join("/");
6144
+ const absolutePath = import_node_path14.default.resolve(attachment);
6145
+ const normalized = absolutePath.split(import_node_path14.default.sep).join("/");
4933
6146
  if (isGuidelineFile(normalized, guidelinePatterns)) {
4934
6147
  if (!unique.has(absolutePath)) {
4935
6148
  unique.set(absolutePath, absolutePath);
@@ -4944,7 +6157,7 @@ function collectAttachmentFiles(attachments) {
4944
6157
  }
4945
6158
  const unique = /* @__PURE__ */ new Map();
4946
6159
  for (const attachment of attachments) {
4947
- const absolutePath = import_node_path13.default.resolve(attachment);
6160
+ const absolutePath = import_node_path14.default.resolve(attachment);
4948
6161
  if (!unique.has(absolutePath)) {
4949
6162
  unique.set(absolutePath, absolutePath);
4950
6163
  }
@@ -4952,7 +6165,7 @@ function collectAttachmentFiles(attachments) {
4952
6165
  return Array.from(unique.values());
4953
6166
  }
4954
6167
  function pathToFileUri2(filePath) {
4955
- const absolutePath = import_node_path13.default.isAbsolute(filePath) ? filePath : import_node_path13.default.resolve(filePath);
6168
+ const absolutePath = import_node_path14.default.isAbsolute(filePath) ? filePath : import_node_path14.default.resolve(filePath);
4956
6169
  const normalizedPath = absolutePath.replace(/\\/g, "/");
4957
6170
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
4958
6171
  return `file:///${normalizedPath}`;
@@ -4965,7 +6178,7 @@ function normalizeAttachments(attachments) {
4965
6178
  }
4966
6179
  const deduped = /* @__PURE__ */ new Set();
4967
6180
  for (const attachment of attachments) {
4968
- deduped.add(import_node_path13.default.resolve(attachment));
6181
+ deduped.add(import_node_path14.default.resolve(attachment));
4969
6182
  }
4970
6183
  return Array.from(deduped);
4971
6184
  }
@@ -4974,7 +6187,7 @@ function mergeAttachments(all) {
4974
6187
  for (const list of all) {
4975
6188
  if (!list) continue;
4976
6189
  for (const inputFile of list) {
4977
- deduped.add(import_node_path13.default.resolve(inputFile));
6190
+ deduped.add(import_node_path14.default.resolve(inputFile));
4978
6191
  }
4979
6192
  }
4980
6193
  return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -5021,9 +6234,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
5021
6234
  }
5022
6235
 
5023
6236
  // src/evaluation/providers/targets-file.ts
5024
- var import_node_fs5 = require("fs");
5025
- var import_promises11 = require("fs/promises");
5026
- var import_node_path14 = __toESM(require("path"), 1);
6237
+ var import_node_fs6 = require("fs");
6238
+ var import_promises12 = require("fs/promises");
6239
+ var import_node_path15 = __toESM(require("path"), 1);
5027
6240
  var import_yaml3 = require("yaml");
5028
6241
  function isRecord(value) {
5029
6242
  return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -5053,18 +6266,18 @@ function assertTargetDefinition(value, index, filePath) {
5053
6266
  }
5054
6267
  async function fileExists3(filePath) {
5055
6268
  try {
5056
- await (0, import_promises11.access)(filePath, import_node_fs5.constants.F_OK);
6269
+ await (0, import_promises12.access)(filePath, import_node_fs6.constants.F_OK);
5057
6270
  return true;
5058
6271
  } catch {
5059
6272
  return false;
5060
6273
  }
5061
6274
  }
5062
6275
  async function readTargetDefinitions(filePath) {
5063
- const absolutePath = import_node_path14.default.resolve(filePath);
6276
+ const absolutePath = import_node_path15.default.resolve(filePath);
5064
6277
  if (!await fileExists3(absolutePath)) {
5065
6278
  throw new Error(`targets.yaml not found at ${absolutePath}`);
5066
6279
  }
5067
- const raw = await (0, import_promises11.readFile)(absolutePath, "utf8");
6280
+ const raw = await (0, import_promises12.readFile)(absolutePath, "utf8");
5068
6281
  const parsed = (0, import_yaml3.parse)(raw);
5069
6282
  if (!isRecord(parsed)) {
5070
6283
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
@@ -5094,6 +6307,10 @@ function createProvider(target) {
5094
6307
  return new CodexProvider(target.name, target.config);
5095
6308
  case "pi-coding-agent":
5096
6309
  return new PiCodingAgentProvider(target.name, target.config);
6310
+ case "pi-agent-sdk":
6311
+ return new PiAgentSdkProvider(target.name, target.config);
6312
+ case "claude-code":
6313
+ return new ClaudeCodeProvider(target.name, target.config);
5097
6314
  case "mock":
5098
6315
  return new MockProvider(target.name, target.config);
5099
6316
  case "vscode":
@@ -5112,78 +6329,176 @@ function resolveAndCreateProvider(definition, env = process.env) {
5112
6329
 
5113
6330
  // src/evaluation/evaluators.ts
5114
6331
  var import_ai2 = require("ai");
5115
- var import_zod2 = require("zod");
6332
+ var import_zod3 = require("zod");
5116
6333
 
5117
6334
  // src/runtime/exec.ts
5118
- function getBunSpawn() {
5119
- const bunSpawn = globalThis.Bun?.spawn;
5120
- return typeof bunSpawn === "function" ? bunSpawn : void 0;
6335
/**
 * Quote `value` so it can be embedded as a single argument in a shell command.
 * On win32 the value is wrapped in double quotes with inner `"` doubled;
 * elsewhere it is wrapped in single quotes with inner `'` rewritten as `'"'"'`.
 */
function shellEscapePath(value) {
  const onWindows = process.platform === "win32";
  if (!onWindows) {
    // Close the quote, emit a double-quoted single quote, then reopen the quote.
    const escaped = value.split("'").join(`'"'"'`);
    return "'" + escaped + "'";
  }
  const doubled = value.split('"').join('""');
  return '"' + doubled + '"';
}
5122
- async function execShellWithStdin(command, stdinPayload, options = {}) {
5123
- const bunSpawn = getBunSpawn();
5124
- if (bunSpawn) {
5125
- const encoder = new TextEncoder();
5126
- const proc = bunSpawn({
5127
- cmd: ["sh", "-c", command],
5128
- cwd: options.cwd,
5129
- stdin: encoder.encode(stdinPayload),
5130
- stdout: "pipe",
5131
- stderr: "pipe"
5132
- });
5133
- const timeout = options.timeoutMs ? setTimeout(() => {
5134
- proc.kill();
5135
- }, options.timeoutMs) : void 0;
5136
- try {
5137
- const stdout = await new Response(proc.stdout).text();
5138
- const stderr = await new Response(proc.stderr).text();
5139
- const exitCode = await proc.exited;
5140
- return { stdout, stderr, exitCode };
5141
- } finally {
5142
- if (timeout !== void 0) {
5143
- clearTimeout(timeout);
5144
- }
6341
/**
 * Run an executable given as an argv array, feeding `stdinPayload` to its stdin.
 * Delegates to the Bun implementation when the `Bun` global exists, otherwise to
 * the Node implementation.
 *
 * @param {string[]} argv - Executable followed by its arguments; must not be empty.
 * @param {string} stdinPayload - Text written to the child's stdin.
 * @param {{cwd?: string, timeoutMs?: number}} [options]
 * @throws {Error} When `argv` is empty.
 */
async function execFileWithStdin(argv, stdinPayload, options = {}) {
  if (argv.length === 0) {
    throw new Error("Executable argv must include at least one entry");
  }
  const runningOnBun = typeof Bun !== "undefined";
  return runningOnBun
    ? execFileWithStdinBun(argv, stdinPayload, options)
    : execFileWithStdinNode(argv, stdinPayload, options);
}
6350
/**
 * Bun implementation of argv execution with a stdin payload.
 * Spawns via `Bun.spawn`, collects stdout/stderr, and normalizes CRLF to LF.
 * If `options.timeoutMs` is set and elapses, the process is sent SIGKILL and
 * the call rejects once the process has finished.
 */
async function execFileWithStdinBun(argv, stdinPayload, options) {
  const normalizeNewlines = (text) => text.replace(/\r\n/g, "\n");
  const readAll = (stream) => (stream ? new Response(stream).text() : Promise.resolve(""));
  const proc = Bun.spawn([...argv], {
    cwd: options.cwd,
    stdin: new TextEncoder().encode(stdinPayload),
    stdout: "pipe",
    stderr: "pipe"
  });
  let timedOut = false;
  let timer;
  if (options.timeoutMs !== void 0) {
    timer = setTimeout(() => {
      timedOut = true;
      proc.kill("SIGKILL");
    }, options.timeoutMs);
  }
  try {
    const [stdout, stderr, exitCode] = await Promise.all([
      readAll(proc.stdout),
      readAll(proc.stderr),
      proc.exited
    ]);
    if (timedOut) {
      throw new Error(`Process timed out after ${options.timeoutMs}ms`);
    }
    return {
      stdout: normalizeNewlines(stdout),
      stderr: normalizeNewlines(stderr),
      exitCode
    };
  } finally {
    if (timer !== void 0) {
      clearTimeout(timer);
    }
  }
}
6386
/**
 * Node implementation of argv execution with a stdin payload.
 *
 * Spawns `argv[0]` with the remaining entries as arguments (no shell), writes
 * `stdinPayload` to the child's stdin, and resolves once the child's stdio has
 * closed.
 *
 * @param {string[]} argv - Executable followed by its arguments.
 * @param {string} stdinPayload - Text written to the child's stdin.
 * @param {{cwd?: string, timeoutMs?: number}} options
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 *   Output with CRLF normalized to LF; a null exit code is reported as 0.
 * @throws {Error} Rejects on spawn failure or when `timeoutMs` elapses.
 */
async function execFileWithStdinNode(argv, stdinPayload, options) {
  const { spawn: spawn4 } = await import("child_process");
  return new Promise((resolve, reject) => {
    const [cmd, ...args] = argv;
    const child = spawn4(cmd, args, {
      cwd: options.cwd,
      stdio: ["pipe", "pipe", "pipe"]
    });
    const stdoutChunks = [];
    const stderrChunks = [];
    child.stdout?.on("data", (chunk) => stdoutChunks.push(chunk));
    child.stderr?.on("data", (chunk) => stderrChunks.push(chunk));
    let timedOut = false;
    const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
      timedOut = true;
      child.kill("SIGKILL");
    }, options.timeoutMs) : void 0;
    const decode = (chunks) => Buffer.concat(chunks).toString("utf8").replace(/\r\n/g, "\n");
    child.on("error", (error) => {
      if (timeout !== void 0) clearTimeout(timeout);
      reject(error);
    });
    // "close" (not "exit") so that all buffered stdout/stderr has been delivered.
    child.on("close", (code) => {
      if (timeout !== void 0) clearTimeout(timeout);
      if (timedOut) {
        reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
        return;
      }
      resolve({
        stdout: decode(stdoutChunks),
        stderr: decode(stderrChunks),
        exitCode: code ?? 0
      });
    });
    if (child.stdin) {
      // A child that exits (or closes stdin) before consuming the payload makes
      // the pipe emit an error such as EPIPE. Without a listener that becomes an
      // uncaught exception; the outcome is still reported via "close"/"error".
      child.stdin.on("error", () => {
      });
      child.stdin.write(stdinPayload);
      child.stdin.end();
    }
  });
}
6427
/**
 * Run `command` through the system shell, feeding `stdinPayload` on stdin.
 *
 * stdin/stdout/stderr are redirected through files in a fresh temporary
 * directory rather than pipes; the directory is removed afterwards whether the
 * run succeeds, fails, or times out.
 *
 * @param {string} command - Shell command line (interpreted by the shell).
 * @param {string} stdinPayload - Text made available on the command's stdin.
 * @param {{cwd?: string, timeoutMs?: number}} [options] - A falsy `timeoutMs`
 *   (including 0) disables the timeout.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 *   Output with CRLF normalized to LF; a null exit code is reported as 0.
 * @throws {Error} Rejects on spawn failure or when the timeout elapses.
 */
async function execShellWithStdin(command, stdinPayload, options = {}) {
  const [fsPromises, osModule, pathModule, cryptoModule, childProcess] = await Promise.all([
    import("fs/promises"),
    import("os"),
    import("path"),
    import("crypto"),
    import("child_process")
  ]);
  const dir = pathModule.join(osModule.tmpdir(), `agentv-exec-${cryptoModule.randomUUID()}`);
  const stdinPath = pathModule.join(dir, "stdin.txt");
  const stdoutPath = pathModule.join(dir, "stdout.txt");
  const stderrPath = pathModule.join(dir, "stderr.txt");
  // The redirection syntax is the same for cmd.exe and POSIX shells; only the
  // path quoting differs, and shellEscapePath handles that.
  const wrappedCommand = `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
  try {
    // Created inside the try so a failed write still triggers cleanup below.
    await fsPromises.mkdir(dir, { recursive: true });
    await fsPromises.writeFile(stdinPath, stdinPayload, "utf8");
    const exitCode = await new Promise((resolve, reject) => {
      const child = childProcess.spawn(wrappedCommand, {
        shell: true,
        cwd: options.cwd,
        stdio: ["ignore", "ignore", "ignore"]
      });
      const timeout = options.timeoutMs ? setTimeout(() => {
        child.kill();
        reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
      }, options.timeoutMs) : void 0;
      const clearTimer = () => {
        if (timeout !== void 0) {
          clearTimeout(timeout);
        }
      };
      child.on("error", (error) => {
        clearTimer();
        reject(error);
      });
      child.on("exit", (code) => {
        clearTimer();
        resolve(code ?? 0);
      });
    });
    const stdout = (await fsPromises.readFile(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
    const stderr = (await fsPromises.readFile(stderrPath, "utf8")).replace(/\r\n/g, "\n");
    return { stdout, stderr, exitCode };
  } finally {
    await fsPromises.rm(dir, { recursive: true, force: true });
  }
}
6471
+
6472
+ // src/evaluation/case-conversion.ts
6473
/**
 * Convert a camelCase identifier to snake_case.
 * Strings that start with an ASCII uppercase letter are returned unchanged;
 * otherwise every ASCII uppercase letter becomes `_` plus its lowercase form.
 */
function toSnakeCase(str) {
  const startsUpper = /^[A-Z]/.test(str);
  if (startsUpper) {
    return str;
  }
  let converted = "";
  for (let i = 0; i < str.length; i++) {
    const code = str.charCodeAt(i);
    const isUpperAscii = code >= 65 && code <= 90;
    converted += isUpperAscii ? "_" + str[i].toLowerCase() : str[i];
  }
  return converted;
}
6479
/**
 * Recursively convert object keys to snake_case (via `toSnakeCase`).
 *
 * Arrays are mapped element by element; primitives, `null` and `undefined`
 * are returned as-is. Objects that provide their own `toJSON` (for example
 * `Date`) are returned untouched: rebuilding them from their enumerable
 * entries would turn them into `{}` and lose their JSON representation.
 *
 * @param {unknown} obj - Value to convert.
 * @returns {unknown} A converted copy for plain objects/arrays, otherwise `obj`.
 */
function toSnakeCaseDeep(obj) {
  if (obj === null || obj === void 0) {
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map((item) => toSnakeCaseDeep(item));
  }
  if (typeof obj !== "object") {
    return obj;
  }
  if (typeof obj.toJSON === "function") {
    return obj;
  }
  const result = {};
  for (const [key, value] of Object.entries(obj)) {
    result[toSnakeCase(key)] = toSnakeCaseDeep(value);
  }
  return result;
}
5182
6496
 
5183
6497
  // src/evaluation/providers/types.ts
5184
6498
  var AGENT_PROVIDER_KINDS = [
5185
6499
  "codex",
5186
6500
  "pi-coding-agent",
6501
+ "claude-code",
5187
6502
  "vscode",
5188
6503
  "vscode-insiders"
5189
6504
  ];
@@ -5224,20 +6539,20 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
5224
6539
 
5225
6540
  [[ ## candidate_answer ## ]]
5226
6541
  {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
5227
// Schema for a freeform judge verdict: a 0..1 score plus optional hits, misses and reasoning.
var freeformEvaluationSchema = import_zod3.z.object({
  score: import_zod3.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
  hits: import_zod3.z.array(import_zod3.z.string()).describe("Brief specific achievements").optional(),
  misses: import_zod3.z.array(import_zod3.z.string()).describe("Brief failures or omissions").optional(),
  reasoning: import_zod3.z.string().describe("Concise explanation (1-2 sentences)").optional()
});

// Schema for the outcome of checking a single rubric item.
var rubricCheckResultSchema = import_zod3.z.object({
  id: import_zod3.z.string().describe("The ID of the rubric item being checked"),
  satisfied: import_zod3.z.boolean().describe("Whether this rubric requirement is met"),
  reasoning: import_zod3.z.string().describe("Brief explanation (1-2 sentences) for this check")
});

// Schema for a full rubric evaluation: one check per rubric item plus an overall summary.
var rubricEvaluationSchema = import_zod3.z.object({
  checks: import_zod3.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
  overall_reasoning: import_zod3.z.string().describe("Overall assessment summary (1-2 sentences)")
});
5242
6557
  var LlmJudgeEvaluator = class {
5243
6558
  kind = "llm_judge";
@@ -5473,30 +6788,30 @@ var CodeEvaluator = class {
5473
6788
  script;
5474
6789
  cwd;
5475
6790
  agentTimeoutMs;
6791
+ config;
5476
6792
  constructor(options) {
5477
6793
  this.script = options.script;
5478
6794
  this.cwd = options.cwd;
5479
6795
  this.agentTimeoutMs = options.agentTimeoutMs;
6796
+ this.config = options.config;
5480
6797
  }
5481
6798
  async evaluate(context) {
5482
- const inputPayload = JSON.stringify(
5483
- {
5484
- question: context.evalCase.question,
5485
- expectedOutcome: context.evalCase.expected_outcome,
5486
- expectedMessages: context.evalCase.expected_messages,
5487
- referenceAnswer: context.evalCase.reference_answer,
5488
- candidateAnswer: context.candidate,
5489
- outputMessages: context.outputMessages ?? null,
5490
- guidelineFiles: context.evalCase.guideline_paths,
5491
- inputFiles: context.evalCase.file_paths.filter(
5492
- (path16) => !context.evalCase.guideline_paths.includes(path16)
5493
- ),
5494
- inputMessages: context.evalCase.input_messages,
5495
- traceSummary: context.traceSummary ?? null
5496
- },
5497
- null,
5498
- 2
5499
- );
6799
+ const payload = {
6800
+ question: context.evalCase.question,
6801
+ expectedOutcome: context.evalCase.expected_outcome,
6802
+ expectedMessages: context.evalCase.expected_messages,
6803
+ referenceAnswer: context.evalCase.reference_answer,
6804
+ candidateAnswer: context.candidate,
6805
+ outputMessages: context.outputMessages ?? null,
6806
+ guidelineFiles: context.evalCase.guideline_paths,
6807
+ inputFiles: context.evalCase.file_paths.filter(
6808
+ (path17) => !context.evalCase.guideline_paths.includes(path17)
6809
+ ),
6810
+ inputMessages: context.evalCase.input_messages,
6811
+ traceSummary: context.traceSummary ?? null,
6812
+ config: this.config ?? null
6813
+ };
6814
+ const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
5500
6815
  try {
5501
6816
  const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
5502
6817
  const parsed = parseJsonSafe(stdout);
@@ -5562,18 +6877,25 @@ function calculateRubricScore(result, rubrics) {
5562
6877
  return { score, verdict, hits, misses };
5563
6878
  }
5564
6879
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
5565
- const { stdout, stderr, exitCode } = await execShellWithStdin(scriptPath, input, {
5566
- cwd,
5567
- timeoutMs: agentTimeoutMs
5568
- });
6880
+ const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs });
5569
6881
  if (exitCode !== 0) {
5570
- const trimmedErr = stderr.trim();
6882
+ const trimmedErr = formatStderr(stderr);
5571
6883
  throw new Error(
5572
6884
  trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
5573
6885
  );
5574
6886
  }
5575
6887
  return stdout.trim();
5576
6888
  }
6889
/**
 * Trim stderr text for use in an error message.
 * Text longer than 2000 characters is cut to its last 2000 characters and
 * prefixed with a truncation notice on its own line.
 */
function formatStderr(stderr) {
  const maxLength = 2e3;
  const text = stderr.trim();
  const overflow = text.length - maxLength;
  if (overflow <= 0) {
    return text;
  }
  return `...(truncated, last ${maxLength} chars)\n${text.slice(overflow)}`;
}
5577
6899
  function parseJsonSafe(payload) {
5578
6900
  try {
5579
6901
  return JSON.parse(payload);
@@ -5805,22 +7127,438 @@ var ToolTrajectoryEvaluator = class {
5805
7127
  misses.push(`Position ${i}: ${expectedTool} args mismatch`);
5806
7128
  }
5807
7129
  } else {
5808
- misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
7130
+ misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
7131
+ }
7132
+ }
7133
+ for (let i = checkLength; i < expected.length; i++) {
7134
+ misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
7135
+ }
7136
+ const score = hits.length / expected.length;
7137
+ return {
7138
+ score,
7139
+ verdict: scoreToVerdict(score),
7140
+ hits,
7141
+ misses,
7142
+ expectedAspectCount: expected.length
7143
+ };
7144
+ }
7145
+ };
7146
// Date format patterns used when a field config does not supply its own `formats`.
var DEFAULT_DATE_FORMATS = [
  "YYYY-MM-DDTHH:mm:ssZ", // ISO with timezone
  "YYYY-MM-DDTHH:mm:ss", // ISO with time
  "YYYY-MM-DD", // ISO date
  "DD-MMM-YYYY", // Localized (e.g., "15-JAN-2025")
  "MM/DD/YYYY", // US format
  "DD/MM/YYYY", // EU format
  "MM-DD-YYYY", // US with dashes
  "DD-MM-YYYY" // EU with dashes
];
// Lowercase English month names and abbreviations mapped to 0-based month indexes.
var MONTH_NAMES = {
  jan: 0, january: 0,
  feb: 1, february: 1,
  mar: 2, march: 2,
  apr: 3, april: 3,
  may: 4,
  jun: 5, june: 5,
  jul: 6, july: 6,
  aug: 7, august: 7,
  sep: 8, sept: 8, september: 8,
  oct: 9, october: 9,
  nov: 10, november: 10,
  dec: 11, december: 11
};
7190
/**
 * Evaluator that compares individual fields of a JSON candidate answer with
 * the expected data taken from the eval case's expected_messages.
 *
 * `config.fields` lists the fields to check (path, match type, and optional
 * `required`, `weight`, `tolerance`, `relative`, `formats`);
 * `config.aggregation` selects "weighted_average" (default) or "all_or_nothing".
 */
var FieldAccuracyEvaluator = class {
  kind = "field_accuracy";
  config;
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Parse the candidate, locate the expected data, score every configured
   * field, and aggregate. Returns a failing result when the candidate is not
   * parseable JSON or no expected data can be found.
   */
  evaluate(context) {
    const { evalCase, candidate } = context;
    let candidateData;
    try {
      candidateData = parseJsonFromTextSafe(candidate);
    } catch {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["Failed to parse candidate answer as JSON"],
        expectedAspectCount: this.config.fields.length,
        reasoning: "Candidate answer is not valid JSON"
      };
    }
    const expectedData = this.extractExpectedData(evalCase.expected_messages);
    if (!expectedData) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No expected data found in expected_messages"],
        expectedAspectCount: this.config.fields.length,
        reasoning: "Could not extract expected data from expected_messages"
      };
    }
    const fieldResults = this.config.fields.map(
      (fieldConfig) => this.evaluateField(fieldConfig, candidateData, expectedData)
    );
    return this.aggregateResults(fieldResults);
  }
  /**
   * Extract expected data from expected_messages array.
   * Scans from the end for an assistant message with content: object content
   * is returned directly, string content is parsed as JSON. A string that does
   * not parse is skipped and the scan continues with earlier messages.
   */
  extractExpectedData(expectedMessages) {
    for (let i = expectedMessages.length - 1; i >= 0; i--) {
      const message = expectedMessages[i];
      if (message.role !== "assistant" || !message.content) {
        continue;
      }
      if (typeof message.content === "object" && message.content !== null) {
        return message.content;
      }
      if (typeof message.content === "string") {
        try {
          return parseJsonFromTextSafe(message.content);
        } catch {
          // Not JSON: deliberately keep looking at earlier messages.
        }
      }
    }
    return void 0;
  }
  /**
   * Evaluate a single field against the expected value.
   * A field with no expected value counts as a hit; a missing optional field
   * counts as a hit with weight 0 so it does not influence the weighted score.
   */
  evaluateField(fieldConfig, candidateData, expectedData) {
    const { path: path17, match, required = true, weight = 1 } = fieldConfig;
    const candidateValue = resolvePath(candidateData, path17);
    const expectedValue = resolvePath(expectedData, path17);
    if (expectedValue === void 0) {
      // No expected value means no comparison needed
      return { path: path17, score: 1, weight, hit: true, message: `${path17}: no expected value` };
    }
    if (candidateValue === void 0) {
      if (required) {
        return { path: path17, score: 0, weight, hit: false, message: `${path17} (required, missing)` };
      }
      // Don't penalize missing optional fields: zero weight keeps them out of the score.
      return { path: path17, score: 1, weight: 0, hit: true, message: `${path17}: optional field missing` };
    }
    switch (match) {
      case "exact":
        return this.compareExact(path17, candidateValue, expectedValue, weight);
      case "numeric_tolerance":
        return this.compareNumericTolerance(path17, candidateValue, expectedValue, fieldConfig, weight);
      case "date":
        return this.compareDate(path17, candidateValue, expectedValue, fieldConfig, weight);
      default:
        return {
          path: path17,
          score: 0,
          weight,
          hit: false,
          message: `${path17}: unknown match type "${match}"`
        };
    }
  }
  /**
   * Exact equality comparison (deep equality), with a dedicated message when
   * the two values differ in `typeof`.
   */
  compareExact(path17, candidateValue, expectedValue, weight) {
    if (deepEqual(candidateValue, expectedValue)) {
      return { path: path17, score: 1, weight, hit: true, message: path17 };
    }
    const message = typeof candidateValue !== typeof expectedValue ? `${path17} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})` : `${path17} (value mismatch)`;
    return { path: path17, score: 0, weight, hit: false, message };
  }
  /**
   * Numeric comparison with absolute or relative tolerance.
   * In relative mode the difference is divided by |expected|, except when the
   * expected value is 0, where the absolute difference is used.
   */
  compareNumericTolerance(path17, candidateValue, expectedValue, fieldConfig, weight) {
    const { tolerance = 0, relative = false } = fieldConfig;
    const candidateNum = toNumber(candidateValue);
    const expectedNum = toNumber(expectedValue);
    const miss = (message) => ({ path: path17, score: 0, weight, hit: false, message });
    if (candidateNum === null || expectedNum === null) {
      return miss(`${path17} (non-numeric value)`);
    }
    if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
      return miss(`${path17} (invalid numeric value)`);
    }
    const diff = Math.abs(candidateNum - expectedNum);
    const measured = relative && expectedNum !== 0 ? diff / Math.abs(expectedNum) : diff;
    if (measured <= tolerance) {
      return {
        path: path17,
        score: 1,
        weight,
        hit: true,
        message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
      };
    }
    return miss(`${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`);
  }
  /**
   * Date comparison with format normalization.
   * Both values are stringified and parsed; they match when year, month and
   * day (read with the local-time getters) are equal.
   */
  compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
    const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
    const candidateDate = parseDate(String(candidateValue), formats);
    const expectedDate = parseDate(String(expectedValue), formats);
    const miss = (message) => ({ path: path17, score: 0, weight, hit: false, message });
    if (candidateDate === null) {
      return miss(`${path17} (unparseable candidate date)`);
    }
    if (expectedDate === null) {
      return miss(`${path17} (unparseable expected date)`);
    }
    const sameDay = candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate();
    if (sameDay) {
      return { path: path17, score: 1, weight, hit: true, message: path17 };
    }
    return miss(
      `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
    );
  }
  /**
   * Aggregate field results using configured strategy.
   * "all_or_nothing" yields 1 only when there are no misses; otherwise the
   * score is the weight-normalized average. When the total weight is 0 (no
   * results, or only zero-weight results such as missing optional fields) the
   * score is 1 unless some result is a miss, so unpenalized fields cannot turn
   * an otherwise clean evaluation into a failure.
   */
  aggregateResults(results) {
    const aggregation = this.config.aggregation ?? "weighted_average";
    const hits = [];
    const misses = [];
    for (const result of results) {
      (result.hit ? hits : misses).push(result.message);
    }
    const noMisses = misses.length === 0;
    let score;
    if (aggregation === "all_or_nothing") {
      score = noMisses ? 1 : 0;
    } else {
      const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
      if (totalWeight === 0) {
        score = noMisses ? 1 : 0;
      } else {
        const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
        score = weightedSum / totalWeight;
      }
    }
    const reasoning = `${hits.length}/${results.length} fields matched`;
    return {
      score: clampScore(score),
      verdict: scoreToVerdict(score),
      // Only the first four messages of each kind are reported.
      hits: hits.slice(0, 4),
      misses: misses.slice(0, 4),
      expectedAspectCount: results.length,
      reasoning
    };
  }
};
7469
/**
 * Look up a nested value by a path such as `a.b[0].c`.
 * The path is split on `.`, `[` and `]`; all-digit segments index into arrays.
 * Returns `undefined` for an empty path, a falsy root, or when a non-object is
 * reached before the path is exhausted.
 */
function resolvePath(obj, path17) {
  if (!path17 || !obj) {
    return void 0;
  }
  const segments = path17.split(/\.|\[|\]/).filter((segment) => segment.length > 0);
  let cursor = obj;
  for (const segment of segments) {
    const traversable = cursor !== null && cursor !== void 0 && typeof cursor === "object";
    if (!traversable) {
      return void 0;
    }
    const useIndex = /^\d+$/.test(segment) && Array.isArray(cursor);
    cursor = useIndex ? cursor[Number.parseInt(segment, 10)] : cursor[segment];
  }
  return cursor;
}
7491
/**
 * Coerce a value to a number for tolerance comparison.
 * Numbers are returned unchanged, strings go through `Number.parseFloat`
 * (yielding `null` when that produces NaN), and anything else yields `null`.
 */
function toNumber(value) {
  switch (typeof value) {
    case "number":
      return value;
    case "string": {
      const parsed = Number.parseFloat(value);
      return Number.isNaN(parsed) ? null : parsed;
    }
    default:
      return null;
  }
}
7501
/**
 * Parse a date string into a Date, honoring the configured format hints.
 *
 * Order of attempts:
 *  1. ISO date-only (`YYYY-MM-DD`), built as a local-time date. The native
 *     parser treats this form as UTC, which would disagree with the
 *     local-time dates built by the other branches.
 *  2. Localized `DD-MMM-YYYY` (month name looked up in MONTH_NAMES).
 *  3. Numeric `N/N/YYYY` or `N-N-YYYY`: read as US (month first) or EU (day
 *     first) when `formats` contains only one of the two families; otherwise a
 *     value above 12 identifies the day, and fully ambiguous input is read as US.
 *  4. The native `Date` parser, for ISO date-times and anything else.
 *
 * The explicit patterns run before the native parser because the native parser
 * accepts ambiguous numeric dates on its own terms and would otherwise bypass
 * the `formats` configuration.
 *
 * @param {string} dateStr - Text to parse.
 * @param {string[]} formats - Format patterns; only used to pick US vs EU order.
 * @returns {Date|null} The parsed date, or null when nothing matches.
 */
function parseDate(dateStr, formats) {
  if (!dateStr) return null;
  const trimmed = dateStr.trim();
  const formatList = Array.isArray(formats) ? formats : [];
  const inRange = (month, day) => month >= 0 && month <= 11 && day >= 1 && day <= 31;
  const isoDateOnly = trimmed.match(/^(\d{4})-(\d{2})-(\d{2})$/);
  if (isoDateOnly) {
    const year = Number.parseInt(isoDateOnly[1], 10);
    const month = Number.parseInt(isoDateOnly[2], 10) - 1;
    const day = Number.parseInt(isoDateOnly[3], 10);
    return inRange(month, day) ? new Date(year, month, day) : null;
  }
  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
  if (localizedMatch) {
    const day = Number.parseInt(localizedMatch[1], 10);
    const monthName = localizedMatch[2].toLowerCase();
    const year = Number.parseInt(localizedMatch[3], 10);
    const month = MONTH_NAMES[monthName];
    if (month !== void 0) {
      return new Date(year, month, day);
    }
  }
  const numericMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
  if (numericMatch) {
    const num1 = Number.parseInt(numericMatch[1], 10);
    const num2 = Number.parseInt(numericMatch[2], 10);
    const year = Number.parseInt(numericMatch[3], 10);
    const hasUSFormat = formatList.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
    const hasEUFormat = formatList.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
    if (hasUSFormat && !hasEUFormat) {
      if (inRange(num1 - 1, num2)) {
        return new Date(year, num1 - 1, num2);
      }
    } else if (hasEUFormat && !hasUSFormat) {
      if (inRange(num2 - 1, num1)) {
        return new Date(year, num2 - 1, num1);
      }
    } else {
      if (num1 > 12 && num2 <= 12) {
        return new Date(year, num2 - 1, num1);
      }
      if (num2 > 12 && num1 <= 12) {
        return new Date(year, num1 - 1, num2);
      }
      if (num1 <= 12 && num2 <= 31) {
        return new Date(year, num1 - 1, num2);
      }
    }
  }
  // Last resort: let the runtime parse ISO date-times and other free-form text.
  const nativeDate = new Date(trimmed);
  return Number.isNaN(nativeDate.getTime()) ? null : nativeDate;
}
7553
/**
 * Format a Date as `YYYY-MM-DD` using its local-time calendar fields.
 *
 * Local getters are used rather than `toISOString()`, which reports the UTC
 * day: for a date built at local midnight that can be the previous day in
 * timezones ahead of UTC.
 *
 * @param {Date} date - A valid Date.
 * @returns {string} The local calendar date, zero-padded.
 */
function formatDateISO(date) {
  const year = String(date.getFullYear()).padStart(4, "0");
  const month = String(date.getMonth() + 1).padStart(2, "0");
  const day = String(date.getDate()).padStart(2, "0");
  return `${year}-${month}-${day}`;
}
7556
/**
 * Parse JSON out of free text such as a model reply.
 *
 * Markdown code fences are stripped, then the span from the first `{` to the
 * last `}` is parsed (so prose around an object is tolerated). If that slice
 * is not valid JSON, the whole cleaned text is tried: a top-level array holding
 * several objects is valid JSON, but its brace-to-brace slice is not.
 *
 * @param {unknown} text - Input text; non-strings are treated as empty.
 * @returns {unknown} The parsed JSON value.
 * @throws {SyntaxError} When neither the slice nor the whole text is valid JSON.
 */
function parseJsonFromTextSafe(text) {
  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
  const match = cleaned.match(/\{[\s\S]*\}/);
  const blob = match?.[0] ?? cleaned;
  try {
    return JSON.parse(blob);
  } catch (sliceError) {
    if (blob === cleaned) {
      throw sliceError;
    }
    try {
      return JSON.parse(cleaned);
    } catch {
      // Report the failure of the primary attempt, as before.
      throw sliceError;
    }
  }
}
5824
7562
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
5825
7563
  {{EVALUATOR_RESULTS_JSON}}
5826
7564
 
@@ -6045,11 +7783,175 @@ var CompositeEvaluator = class {
6045
7783
  }
6046
7784
  }
6047
7785
  };
7786
/**
 * Evaluator that passes when the traced execution duration is at or below
 * `config.threshold` (milliseconds).
 */
var LatencyEvaluator = class {
  kind = "latency";
  config;
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Compare `context.traceSummary.durationMs` with the threshold.
   * A missing duration (undefined or null) is a failure: null must not be
   * compared, because it would coerce to 0 and silently pass.
   */
  evaluate(context) {
    const { threshold } = this.config;
    const durationMs = context.traceSummary?.durationMs;
    if (durationMs === void 0 || durationMs === null) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No duration data available in trace"],
        expectedAspectCount: 1,
        reasoning: "Execution duration not reported by provider",
        evaluatorRawRequest: {
          type: "latency",
          threshold,
          durationMs: null
        }
      };
    }
    const passed = durationMs <= threshold;
    return {
      score: passed ? 1 : 0,
      verdict: passed ? "pass" : "fail",
      hits: passed ? [`Duration ${durationMs}ms <= ${threshold}ms threshold`] : [],
      misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
      expectedAspectCount: 1,
      reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
      evaluatorRawRequest: {
        type: "latency",
        threshold,
        durationMs
      }
    };
  }
};
7827
/**
 * Evaluator that passes when the traced execution cost (USD) is at or below
 * `config.budget`.
 */
var CostEvaluator = class {
  kind = "cost";
  config;
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Compare `context.traceSummary.costUsd` with the budget.
   * A missing cost (undefined or null) is a failure: null must not reach the
   * comparison, where it would pass as 0 and then throw in `toFixed`.
   */
  evaluate(context) {
    const { budget } = this.config;
    const costUsd = context.traceSummary?.costUsd;
    if (costUsd === void 0 || costUsd === null) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No cost data available in trace"],
        expectedAspectCount: 1,
        reasoning: "Execution cost not reported by provider",
        evaluatorRawRequest: {
          type: "cost",
          budget,
          costUsd: null
        }
      };
    }
    const passed = costUsd <= budget;
    const formatCost = (n) => `$${n.toFixed(4)}`;
    return {
      score: passed ? 1 : 0,
      verdict: passed ? "pass" : "fail",
      hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
      misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
      expectedAspectCount: 1,
      reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
      evaluatorRawRequest: {
        type: "cost",
        budget,
        costUsd
      }
    };
  }
};
7869
/**
 * Evaluator that checks traced token usage against the optional limits
 * `config.max_input`, `config.max_output` and `config.max_total`.
 */
var TokenUsageEvaluator = class {
  kind = "token_usage";
  config;
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Passes only when every configured limit is respected. The total is
   * input + output + cached (cached defaults to 0). Fails when the trace
   * carries no token usage.
   */
  evaluate(context) {
    const usage = context.traceSummary?.tokenUsage;
    const { max_total: maxTotal, max_input: maxInput, max_output: maxOutput } = this.config;
    const configuredLimits = [maxTotal, maxInput, maxOutput].filter((v) => typeof v === "number");
    const expectedAspectCount = Math.max(configuredLimits.length, 1);
    const limitsForReport = {
      max_total: maxTotal ?? null,
      max_input: maxInput ?? null,
      max_output: maxOutput ?? null
    };
    if (!usage) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No token usage data available in trace"],
        expectedAspectCount,
        reasoning: "Token usage not reported by provider",
        evaluatorRawRequest: {
          type: "token_usage",
          ...limitsForReport,
          tokenUsage: null
        }
      };
    }
    const input = usage.input;
    const output = usage.output;
    const cached = usage.cached ?? 0;
    const total = input + output + cached;
    const hits = [];
    const misses = [];
    // Checked in this order so messages are reported as input, output, total.
    const checks = [
      ["Input", input, maxInput],
      ["Output", output, maxOutput],
      ["Total", total, maxTotal]
    ];
    for (const [label, actual, limit] of checks) {
      if (typeof limit !== "number") {
        continue;
      }
      if (actual <= limit) {
        hits.push(`${label} tokens ${actual} <= ${limit}`);
      } else {
        misses.push(`${label} tokens ${actual} > ${limit}`);
      }
    }
    const passed = misses.length === 0;
    return {
      score: passed ? 1 : 0,
      verdict: passed ? "pass" : "fail",
      hits,
      misses,
      expectedAspectCount,
      reasoning: `token_usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
      evaluatorRawRequest: {
        type: "token_usage",
        ...limitsForReport,
        tokenUsage: {
          input,
          output,
          cached,
          total
        }
      }
    };
  }
};
6048
7951
 
6049
7952
  // src/evaluation/orchestrator.ts
6050
- var import_node_crypto3 = require("crypto");
6051
- var import_promises12 = require("fs/promises");
6052
- var import_node_path15 = __toESM(require("path"), 1);
7953
+ var import_node_crypto4 = require("crypto");
7954
+ var import_node_path16 = __toESM(require("path"), 1);
6053
7955
 
6054
7956
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
6055
7957
  var Node = class {
@@ -6191,6 +8093,9 @@ function validateConcurrency(concurrency) {
6191
8093
  }
6192
8094
 
6193
8095
  // src/evaluation/orchestrator.ts
8096
+ function usesFileReferencePrompt(provider) {
8097
+ return isAgentProvider(provider) || provider.kind === "cli";
8098
+ }
6194
8099
  async function runEvaluation(options) {
6195
8100
  const {
6196
8101
  testFilePath: evalFilePath,
@@ -6202,7 +8107,6 @@ async function runEvaluation(options) {
6202
8107
  evaluators,
6203
8108
  maxRetries,
6204
8109
  agentTimeoutMs,
6205
- promptDumpDir,
6206
8110
  cache,
6207
8111
  useCache,
6208
8112
  now,
@@ -6282,7 +8186,6 @@ async function runEvaluation(options) {
6282
8186
  provider: primaryProvider,
6283
8187
  target,
6284
8188
  evaluatorRegistry,
6285
- promptDumpDir,
6286
8189
  nowFn: now ?? (() => /* @__PURE__ */ new Date()),
6287
8190
  onProgress,
6288
8191
  onResult,
@@ -6324,7 +8227,6 @@ async function runEvaluation(options) {
6324
8227
  evaluators: evaluatorRegistry,
6325
8228
  maxRetries,
6326
8229
  agentTimeoutMs,
6327
- promptDumpDir,
6328
8230
  cache,
6329
8231
  useCache,
6330
8232
  now,
@@ -6367,7 +8269,8 @@ async function runEvaluation(options) {
6367
8269
  results.push(outcome.value);
6368
8270
  } else {
6369
8271
  const evalCase = filteredEvalCases[i];
6370
- const promptInputs = await buildPromptInputs(evalCase);
8272
+ const formattingMode = usesFileReferencePrompt(primaryProvider) ? "agent" : "lm";
8273
+ const promptInputs = await buildPromptInputs(evalCase, formattingMode);
6371
8274
  const errorResult = buildErrorResult(
6372
8275
  evalCase,
6373
8276
  target.name,
@@ -6390,7 +8293,6 @@ async function runBatchEvaluation(options) {
6390
8293
  provider,
6391
8294
  target,
6392
8295
  evaluatorRegistry,
6393
- promptDumpDir,
6394
8296
  nowFn,
6395
8297
  onProgress,
6396
8298
  onResult,
@@ -6398,12 +8300,9 @@ async function runBatchEvaluation(options) {
6398
8300
  agentTimeoutMs
6399
8301
  } = options;
6400
8302
  const promptInputsList = [];
6401
- const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
8303
+ const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
6402
8304
  for (const evalCase of evalCases) {
6403
8305
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
6404
- if (promptDumpDir) {
6405
- await dumpPrompt(promptDumpDir, evalCase, promptInputs);
6406
- }
6407
8306
  promptInputsList.push(promptInputs);
6408
8307
  }
6409
8308
  const batchRequests = evalCases.map((evalCase, index) => {
@@ -6445,13 +8344,20 @@ async function runBatchEvaluation(options) {
6445
8344
  const promptInputs = promptInputsList[i];
6446
8345
  const providerResponse = batchResponse[i];
6447
8346
  const outputMessages = providerResponse.outputMessages;
6448
- const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
8347
+ const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
8348
+ const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : hasExecutionMetrics ? {
8349
+ eventCount: 0,
8350
+ toolNames: [],
8351
+ toolCallsByName: {},
8352
+ errorCount: 0
8353
+ } : void 0;
6449
8354
  const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
6450
8355
  tokenUsage: providerResponse.tokenUsage,
6451
8356
  costUsd: providerResponse.costUsd,
6452
8357
  durationMs: providerResponse.durationMs
6453
8358
  }) : void 0;
6454
8359
  const candidate = extractLastAssistantContent(outputMessages);
8360
+ const providerError = extractProviderError(providerResponse);
6455
8361
  let result;
6456
8362
  try {
6457
8363
  result = await evaluateCandidate({
@@ -6468,6 +8374,9 @@ async function runBatchEvaluation(options) {
6468
8374
  outputMessages,
6469
8375
  traceSummary
6470
8376
  });
8377
+ if (providerError) {
8378
+ result = { ...result, error: providerError };
8379
+ }
6471
8380
  } catch (error) {
6472
8381
  const errorResult = buildErrorResult(
6473
8382
  evalCase,
@@ -6500,9 +8409,10 @@ async function runBatchEvaluation(options) {
6500
8409
  await onProgress({
6501
8410
  workerId: 1,
6502
8411
  evalId: evalCase.id,
6503
- status: "completed",
8412
+ status: result.error ? "failed" : "completed",
6504
8413
  startedAt: 0,
6505
- completedAt: Date.now()
8414
+ completedAt: Date.now(),
8415
+ error: result.error
6506
8416
  });
6507
8417
  }
6508
8418
  }
@@ -6517,17 +8427,13 @@ async function runEvalCase(options) {
6517
8427
  now,
6518
8428
  maxRetries,
6519
8429
  agentTimeoutMs,
6520
- promptDumpDir,
6521
8430
  cache,
6522
8431
  useCache,
6523
8432
  signal,
6524
8433
  judgeProvider
6525
8434
  } = options;
6526
- const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
8435
+ const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
6527
8436
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
6528
- if (promptDumpDir) {
6529
- await dumpPrompt(promptDumpDir, evalCase, promptInputs);
6530
- }
6531
8437
  const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
6532
8438
  let cachedResponse;
6533
8439
  if (cacheKey && cache) {
@@ -6571,15 +8477,22 @@ async function runEvalCase(options) {
6571
8477
  await cache.set(cacheKey, providerResponse);
6572
8478
  }
6573
8479
  const outputMessages = providerResponse.outputMessages;
6574
- const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
8480
+ const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
8481
+ const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : hasExecutionMetrics ? {
8482
+ eventCount: 0,
8483
+ toolNames: [],
8484
+ toolCallsByName: {},
8485
+ errorCount: 0
8486
+ } : void 0;
6575
8487
  const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
6576
8488
  tokenUsage: providerResponse.tokenUsage,
6577
8489
  costUsd: providerResponse.costUsd,
6578
8490
  durationMs: providerResponse.durationMs
6579
8491
  }) : void 0;
6580
8492
  const candidate = extractLastAssistantContent(outputMessages);
8493
+ const providerError = extractProviderError(providerResponse);
6581
8494
  try {
6582
- return await evaluateCandidate({
8495
+ const result = await evaluateCandidate({
6583
8496
  evalCase,
6584
8497
  candidate,
6585
8498
  target,
@@ -6593,6 +8506,7 @@ async function runEvalCase(options) {
6593
8506
  outputMessages,
6594
8507
  traceSummary
6595
8508
  });
8509
+ return providerError ? { ...result, error: providerError } : result;
6596
8510
  } catch (error) {
6597
8511
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
6598
8512
  }
@@ -6658,7 +8572,6 @@ async function evaluateCandidate(options) {
6658
8572
  candidateAnswer: candidate,
6659
8573
  target: target.name,
6660
8574
  reasoning: score.reasoning,
6661
- rawAspects: score.rawAspects,
6662
8575
  agentProviderRequest,
6663
8576
  lmProviderRequest,
6664
8577
  evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
@@ -6768,7 +8681,8 @@ async function runEvaluatorList(options) {
6768
8681
  const codeEvaluator = new CodeEvaluator({
6769
8682
  script: evaluator.script,
6770
8683
  cwd: evaluator.resolvedCwd ?? evaluator.cwd,
6771
- agentTimeoutMs
8684
+ agentTimeoutMs,
8685
+ config: evaluator.config
6772
8686
  });
6773
8687
  const score2 = await codeEvaluator.evaluate({
6774
8688
  evalCase,
@@ -6796,7 +8710,7 @@ async function runEvaluatorList(options) {
6796
8710
  });
6797
8711
  }
6798
8712
  if (evaluator.type === "composite") {
6799
- const evalFileDir = evalCase.guideline_paths[0] ? import_node_path15.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
8713
+ const evalFileDir = evalCase.guideline_paths[0] ? import_node_path16.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
6800
8714
  const createEvaluator = (memberConfig) => {
6801
8715
  switch (memberConfig.type) {
6802
8716
  case "llm_judge":
@@ -6805,7 +8719,8 @@ async function runEvaluatorList(options) {
6805
8719
  return new CodeEvaluator({
6806
8720
  script: memberConfig.script,
6807
8721
  cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
6808
- agentTimeoutMs
8722
+ agentTimeoutMs,
8723
+ config: memberConfig.config
6809
8724
  });
6810
8725
  case "composite":
6811
8726
  return new CompositeEvaluator({
@@ -6817,6 +8732,22 @@ async function runEvaluatorList(options) {
6817
8732
  return new ToolTrajectoryEvaluator({
6818
8733
  config: memberConfig
6819
8734
  });
8735
+ case "field_accuracy":
8736
+ return new FieldAccuracyEvaluator({
8737
+ config: memberConfig
8738
+ });
8739
+ case "latency":
8740
+ return new LatencyEvaluator({
8741
+ config: memberConfig
8742
+ });
8743
+ case "cost":
8744
+ return new CostEvaluator({
8745
+ config: memberConfig
8746
+ });
8747
+ case "token_usage":
8748
+ return new TokenUsageEvaluator({
8749
+ config: memberConfig
8750
+ });
6820
8751
  default: {
6821
8752
  const unknownConfig = memberConfig;
6822
8753
  throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -6836,7 +8767,9 @@ async function runEvaluatorList(options) {
6836
8767
  attempt,
6837
8768
  promptInputs,
6838
8769
  now,
6839
- judgeProvider
8770
+ judgeProvider,
8771
+ outputMessages,
8772
+ traceSummary
6840
8773
  });
6841
8774
  const weight = evaluator.weight ?? 1;
6842
8775
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -6881,6 +8814,118 @@ async function runEvaluatorList(options) {
6881
8814
  reasoning: score2.reasoning
6882
8815
  });
6883
8816
  }
8817
+ if (evaluator.type === "field_accuracy") {
8818
+ const fieldAccuracyEvaluator = new FieldAccuracyEvaluator({
8819
+ config: evaluator
8820
+ });
8821
+ const score2 = fieldAccuracyEvaluator.evaluate({
8822
+ evalCase,
8823
+ candidate,
8824
+ target,
8825
+ provider,
8826
+ attempt,
8827
+ promptInputs,
8828
+ now,
8829
+ outputMessages,
8830
+ traceSummary
8831
+ });
8832
+ const weight = evaluator.weight ?? 1;
8833
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
8834
+ evaluatorResults.push({
8835
+ name: evaluator.name,
8836
+ type: evaluator.type,
8837
+ score: score2.score,
8838
+ weight,
8839
+ verdict: score2.verdict,
8840
+ hits: score2.hits,
8841
+ misses: score2.misses,
8842
+ reasoning: score2.reasoning
8843
+ });
8844
+ }
8845
+ if (evaluator.type === "latency") {
8846
+ const latencyEvaluator = new LatencyEvaluator({
8847
+ config: evaluator
8848
+ });
8849
+ const score2 = latencyEvaluator.evaluate({
8850
+ evalCase,
8851
+ candidate,
8852
+ target,
8853
+ provider,
8854
+ attempt,
8855
+ promptInputs,
8856
+ now,
8857
+ outputMessages,
8858
+ traceSummary
8859
+ });
8860
+ const weight = evaluator.weight ?? 1;
8861
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
8862
+ evaluatorResults.push({
8863
+ name: evaluator.name,
8864
+ type: evaluator.type,
8865
+ score: score2.score,
8866
+ weight,
8867
+ verdict: score2.verdict,
8868
+ hits: score2.hits,
8869
+ misses: score2.misses,
8870
+ reasoning: score2.reasoning
8871
+ });
8872
+ }
8873
+ if (evaluator.type === "cost") {
8874
+ const costEvaluator = new CostEvaluator({
8875
+ config: evaluator
8876
+ });
8877
+ const score2 = costEvaluator.evaluate({
8878
+ evalCase,
8879
+ candidate,
8880
+ target,
8881
+ provider,
8882
+ attempt,
8883
+ promptInputs,
8884
+ now,
8885
+ outputMessages,
8886
+ traceSummary
8887
+ });
8888
+ const weight = evaluator.weight ?? 1;
8889
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
8890
+ evaluatorResults.push({
8891
+ name: evaluator.name,
8892
+ type: evaluator.type,
8893
+ score: score2.score,
8894
+ weight,
8895
+ verdict: score2.verdict,
8896
+ hits: score2.hits,
8897
+ misses: score2.misses,
8898
+ reasoning: score2.reasoning
8899
+ });
8900
+ }
8901
+ if (evaluator.type === "token_usage") {
8902
+ const tokenUsageEvaluator = new TokenUsageEvaluator({
8903
+ config: evaluator
8904
+ });
8905
+ const score2 = tokenUsageEvaluator.evaluate({
8906
+ evalCase,
8907
+ candidate,
8908
+ target,
8909
+ provider,
8910
+ attempt,
8911
+ promptInputs,
8912
+ now,
8913
+ outputMessages,
8914
+ traceSummary
8915
+ });
8916
+ const weight = evaluator.weight ?? 1;
8917
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
8918
+ evaluatorResults.push({
8919
+ name: evaluator.name,
8920
+ type: evaluator.type,
8921
+ score: score2.score,
8922
+ weight,
8923
+ verdict: score2.verdict,
8924
+ hits: score2.hits,
8925
+ misses: score2.misses,
8926
+ reasoning: score2.reasoning
8927
+ });
8928
+ }
6884
8929
  } catch (error) {
6885
8930
  const message = error instanceof Error ? error.message : String(error);
6886
8931
  const fallbackScore = {
@@ -6920,7 +8965,6 @@ async function runEvaluatorList(options) {
6920
8965
  (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
6921
8966
  0
6922
8967
  );
6923
- const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
6924
8968
  const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
6925
8969
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
6926
8970
  const score = {
@@ -6929,8 +8973,7 @@ async function runEvaluatorList(options) {
6929
8973
  hits,
6930
8974
  misses,
6931
8975
  expectedAspectCount,
6932
- reasoning,
6933
- rawAspects: rawAspects.length > 0 ? rawAspects : void 0
8976
+ reasoning
6934
8977
  };
6935
8978
  return { score, evaluatorResults };
6936
8979
  }
@@ -7005,26 +9048,6 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
7005
9048
  llm_judge: llmJudge
7006
9049
  };
7007
9050
  }
7008
- async function dumpPrompt(directory, evalCase, promptInputs) {
7009
- const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
7010
- const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
7011
- const filePath = import_node_path15.default.resolve(directory, filename);
7012
- await (0, import_promises12.mkdir)(import_node_path15.default.dirname(filePath), { recursive: true });
7013
- const payload = {
7014
- eval_id: evalCase.id,
7015
- question: promptInputs.question,
7016
- guidelines: promptInputs.guidelines,
7017
- guideline_paths: evalCase.guideline_paths
7018
- };
7019
- await (0, import_promises12.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
7020
- }
7021
- function sanitizeFilename(value) {
7022
- if (!value) {
7023
- return "prompt";
7024
- }
7025
- const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
7026
- return sanitized.length > 0 ? sanitized : (0, import_node_crypto3.randomUUID)();
7027
- }
7028
9051
  async function invokeProvider(provider, options) {
7029
9052
  const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
7030
9053
  const controller = new AbortController();
@@ -7088,14 +9111,25 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
7088
9111
  misses: [`Error: ${message}`],
7089
9112
  candidateAnswer: `Error occurred: ${message}`,
7090
9113
  target: targetName,
7091
- rawAspects: [],
7092
9114
  agentProviderRequest,
7093
9115
  lmProviderRequest,
7094
9116
  error: message
7095
9117
  };
7096
9118
  }
9119
+ function extractProviderError(response) {
9120
+ const raw = response.raw;
9121
+ if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
9122
+ return void 0;
9123
+ }
9124
+ const error = raw.error;
9125
+ if (typeof error !== "string") {
9126
+ return void 0;
9127
+ }
9128
+ const trimmed = error.trim();
9129
+ return trimmed.length > 0 ? trimmed : void 0;
9130
+ }
7097
9131
  function createCacheKey(provider, target, evalCase, promptInputs) {
7098
- const hash = (0, import_node_crypto3.createHash)("sha256");
9132
+ const hash = (0, import_node_crypto4.createHash)("sha256");
7099
9133
  hash.update(provider.id);
7100
9134
  hash.update(target.name);
7101
9135
  hash.update(evalCase.id);
@@ -7152,15 +9186,15 @@ function computeWeightedMean(entries) {
7152
9186
 
7153
9187
  // src/evaluation/generators/rubric-generator.ts
7154
9188
  var import_ai3 = require("ai");
7155
- var import_zod3 = require("zod");
7156
- var rubricItemSchema = import_zod3.z.object({
7157
- id: import_zod3.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
7158
- description: import_zod3.z.string().describe("What this rubric checks for"),
7159
- weight: import_zod3.z.number().default(1).describe("Relative importance (default 1.0)"),
7160
- required: import_zod3.z.boolean().default(true).describe("Whether this is a mandatory requirement")
9189
+ var import_zod4 = require("zod");
9190
+ var rubricItemSchema = import_zod4.z.object({
9191
+ id: import_zod4.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
9192
+ description: import_zod4.z.string().describe("What this rubric checks for"),
9193
+ weight: import_zod4.z.number().default(1).describe("Relative importance (default 1.0)"),
9194
+ required: import_zod4.z.boolean().default(true).describe("Whether this is a mandatory requirement")
7161
9195
  });
7162
- var rubricGenerationSchema = import_zod3.z.object({
7163
- rubrics: import_zod3.z.array(rubricItemSchema).describe("List of evaluation rubrics")
9196
+ var rubricGenerationSchema = import_zod4.z.object({
9197
+ rubrics: import_zod4.z.array(rubricItemSchema).describe("List of evaluation rubrics")
7164
9198
  });
7165
9199
  async function generateRubrics(options) {
7166
9200
  const { expectedOutcome, question, referenceAnswer, provider } = options;
@@ -7238,15 +9272,20 @@ function createAgentKernel() {
7238
9272
  0 && (module.exports = {
7239
9273
  CodeEvaluator,
7240
9274
  CompositeEvaluator,
9275
+ CostEvaluator,
7241
9276
  DEFAULT_EXPLORATION_TOOLS,
9277
+ FieldAccuracyEvaluator,
9278
+ LatencyEvaluator,
7242
9279
  LlmJudgeEvaluator,
7243
9280
  TEST_MESSAGE_ROLES,
9281
+ TokenUsageEvaluator,
7244
9282
  ToolTrajectoryEvaluator,
7245
9283
  avgToolDurationMs,
7246
9284
  buildDirectoryChain,
7247
9285
  buildPromptInputs,
7248
9286
  buildSearchRoots,
7249
9287
  computeTraceSummary,
9288
+ consumeClaudeCodeLogEntries,
7250
9289
  consumeCodexLogEntries,
7251
9290
  consumePiLogEntries,
7252
9291
  createAgentKernel,
@@ -7277,6 +9316,7 @@ function createAgentKernel() {
7277
9316
  resolveTargetDefinition,
7278
9317
  runEvalCase,
7279
9318
  runEvaluation,
9319
+ subscribeToClaudeCodeLogEntries,
7280
9320
  subscribeToCodexLogEntries,
7281
9321
  subscribeToPiLogEntries,
7282
9322
  tokensPerTool