@agentv/core 1.5.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -32,15 +32,20 @@ var index_exports = {};
32
32
  __export(index_exports, {
33
33
  CodeEvaluator: () => CodeEvaluator,
34
34
  CompositeEvaluator: () => CompositeEvaluator,
35
+ CostEvaluator: () => CostEvaluator,
35
36
  DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
37
+ FieldAccuracyEvaluator: () => FieldAccuracyEvaluator,
38
+ LatencyEvaluator: () => LatencyEvaluator,
36
39
  LlmJudgeEvaluator: () => LlmJudgeEvaluator,
37
40
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
41
+ TokenUsageEvaluator: () => TokenUsageEvaluator,
38
42
  ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
39
43
  avgToolDurationMs: () => avgToolDurationMs,
40
44
  buildDirectoryChain: () => buildDirectoryChain2,
41
45
  buildPromptInputs: () => buildPromptInputs,
42
46
  buildSearchRoots: () => buildSearchRoots2,
43
47
  computeTraceSummary: () => computeTraceSummary,
48
+ consumeClaudeCodeLogEntries: () => consumeClaudeCodeLogEntries,
44
49
  consumeCodexLogEntries: () => consumeCodexLogEntries,
45
50
  consumePiLogEntries: () => consumePiLogEntries,
46
51
  createAgentKernel: () => createAgentKernel,
@@ -62,6 +67,8 @@ __export(index_exports, {
62
67
  loadEvalCases: () => loadEvalCases,
63
68
  mergeExecutionMetrics: () => mergeExecutionMetrics,
64
69
  normalizeLineEndings: () => normalizeLineEndings,
70
+ parseCodeJudgePayload: () => parseCodeJudgePayload,
71
+ readCodeJudgePayload: () => readCodeJudgePayload,
65
72
  readJsonFile: () => readJsonFile,
66
73
  readTargetDefinitions: () => readTargetDefinitions,
67
74
  readTestSuiteMetadata: () => readTestSuiteMetadata,
@@ -71,6 +78,7 @@ __export(index_exports, {
71
78
  resolveTargetDefinition: () => resolveTargetDefinition,
72
79
  runEvalCase: () => runEvalCase,
73
80
  runEvaluation: () => runEvaluation,
81
+ subscribeToClaudeCodeLogEntries: () => subscribeToClaudeCodeLogEntries,
74
82
  subscribeToCodexLogEntries: () => subscribeToCodexLogEntries,
75
83
  subscribeToPiLogEntries: () => subscribeToPiLogEntries,
76
84
  tokensPerTool: () => tokensPerTool
@@ -129,7 +137,11 @@ var EVALUATOR_KIND_VALUES = [
129
137
  "llm_judge",
130
138
  "rubric",
131
139
  "composite",
132
- "tool_trajectory"
140
+ "tool_trajectory",
141
+ "field_accuracy",
142
+ "latency",
143
+ "cost",
144
+ "token_usage"
133
145
  ];
134
146
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
135
147
  function isEvaluatorKind(value) {
@@ -551,7 +563,22 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
551
563
  continue;
552
564
  }
553
565
  if (typeValue === "code_judge") {
554
- const script = asString2(rawEvaluator.script);
566
+ let script;
567
+ const rawScript = rawEvaluator.script;
568
+ if (typeof rawScript === "string") {
569
+ const trimmed = rawScript.trim();
570
+ if (trimmed.length === 0) {
571
+ throw new Error(
572
+ `Invalid code_judge script for evaluator '${name}' in '${evalId}': script cannot be empty`
573
+ );
574
+ }
575
+ script = parseCommandToArgv(trimmed);
576
+ } else {
577
+ script = asStringArray(
578
+ rawScript,
579
+ `code_judge script for evaluator '${name}' in '${evalId}'`
580
+ );
581
+ }
555
582
  if (!script) {
556
583
  logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
557
584
  continue;
@@ -572,13 +599,21 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
572
599
  } else {
573
600
  resolvedCwd = searchRoots[0];
574
601
  }
602
+ const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
603
+ const config = {};
604
+ for (const [key, value] of Object.entries(rawEvaluator)) {
605
+ if (!knownProps.has(key) && value !== void 0) {
606
+ config[key] = value;
607
+ }
608
+ }
575
609
  evaluators.push({
576
610
  name,
577
611
  type: "code",
578
612
  script,
579
613
  cwd,
580
614
  resolvedCwd,
581
- ...weight2 !== void 0 ? { weight: weight2 } : {}
615
+ ...weight2 !== void 0 ? { weight: weight2 } : {},
616
+ ...Object.keys(config).length > 0 ? { config } : {}
582
617
  });
583
618
  continue;
584
619
  }
@@ -753,6 +788,140 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
753
788
  evaluators.push(config);
754
789
  continue;
755
790
  }
791
+ if (typeValue === "field_accuracy") {
792
+ const rawFields = rawEvaluator.fields;
793
+ if (!Array.isArray(rawFields)) {
794
+ logWarning2(
795
+ `Skipping field_accuracy evaluator '${name}' in '${evalId}': missing fields array`
796
+ );
797
+ continue;
798
+ }
799
+ if (rawFields.length === 0) {
800
+ logWarning2(
801
+ `Skipping field_accuracy evaluator '${name}' in '${evalId}': fields array is empty`
802
+ );
803
+ continue;
804
+ }
805
+ const fields = [];
806
+ for (const rawField of rawFields) {
807
+ if (!isJsonObject2(rawField)) {
808
+ logWarning2(
809
+ `Skipping invalid field entry in field_accuracy evaluator '${name}' (expected object)`
810
+ );
811
+ continue;
812
+ }
813
+ const fieldPath = asString2(rawField.path);
814
+ const match = asString2(rawField.match);
815
+ if (!fieldPath) {
816
+ logWarning2(
817
+ `Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
818
+ );
819
+ continue;
820
+ }
821
+ if (!match || !isValidFieldMatchType(match)) {
822
+ logWarning2(
823
+ `Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code_judge evaluator.`
824
+ );
825
+ continue;
826
+ }
827
+ const fieldConfig = {
828
+ path: fieldPath,
829
+ match,
830
+ ...typeof rawField.required === "boolean" ? { required: rawField.required } : {},
831
+ ...typeof rawField.weight === "number" ? { weight: rawField.weight } : {},
832
+ ...typeof rawField.tolerance === "number" ? { tolerance: rawField.tolerance } : {},
833
+ ...typeof rawField.relative === "boolean" ? { relative: rawField.relative } : {},
834
+ ...Array.isArray(rawField.formats) ? { formats: rawField.formats.filter((f) => typeof f === "string") } : {}
835
+ };
836
+ fields.push(fieldConfig);
837
+ }
838
+ if (fields.length === 0) {
839
+ logWarning2(
840
+ `Skipping field_accuracy evaluator '${name}' in '${evalId}': no valid fields found`
841
+ );
842
+ continue;
843
+ }
844
+ const aggregation = asString2(rawEvaluator.aggregation);
845
+ const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
846
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
847
+ evaluators.push({
848
+ name,
849
+ type: "field_accuracy",
850
+ fields,
851
+ ...validAggregation ? { aggregation: validAggregation } : {},
852
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
853
+ });
854
+ continue;
855
+ }
856
+ if (typeValue === "latency") {
857
+ const threshold = rawEvaluator.threshold;
858
+ if (typeof threshold !== "number" || threshold < 0) {
859
+ logWarning2(
860
+ `Skipping latency evaluator '${name}' in '${evalId}': threshold must be a non-negative number`
861
+ );
862
+ continue;
863
+ }
864
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
865
+ evaluators.push({
866
+ name,
867
+ type: "latency",
868
+ threshold,
869
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
870
+ });
871
+ continue;
872
+ }
873
+ if (typeValue === "cost") {
874
+ const budget = rawEvaluator.budget;
875
+ if (typeof budget !== "number" || budget < 0) {
876
+ logWarning2(
877
+ `Skipping cost evaluator '${name}' in '${evalId}': budget must be a non-negative number`
878
+ );
879
+ continue;
880
+ }
881
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
882
+ evaluators.push({
883
+ name,
884
+ type: "cost",
885
+ budget,
886
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
887
+ });
888
+ continue;
889
+ }
890
+ if (typeValue === "token_usage") {
891
+ const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
892
+ const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
893
+ const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
894
+ const limits = [
895
+ ["max_total", maxTotal],
896
+ ["max_input", maxInput],
897
+ ["max_output", maxOutput]
898
+ ];
899
+ const validLimits = {};
900
+ for (const [key, raw] of limits) {
901
+ if (raw === void 0) continue;
902
+ if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
903
+ logWarning2(
904
+ `Skipping token_usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
905
+ );
906
+ continue;
907
+ }
908
+ validLimits[key] = raw;
909
+ }
910
+ if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
911
+ logWarning2(
912
+ `Skipping token_usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
913
+ );
914
+ continue;
915
+ }
916
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
917
+ evaluators.push({
918
+ name,
919
+ type: "token_usage",
920
+ ...validLimits,
921
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
922
+ });
923
+ continue;
924
+ }
756
925
  const prompt = asString2(rawEvaluator.prompt);
757
926
  let promptPath;
758
927
  if (prompt) {
@@ -823,6 +992,34 @@ function coerceEvaluator(candidate, contextId) {
823
992
  function asString2(value) {
824
993
  return typeof value === "string" ? value : void 0;
825
994
  }
995
/**
 * Validate that `value` is a non-empty array of non-blank strings.
 *
 * @param {unknown} value - Raw value to validate; `undefined` is passed through.
 * @param {string} description - Human-readable label used as the prefix of error messages.
 * @returns {string[] | undefined} A new array with the validated entries, or `undefined`
 *   when `value` is `undefined`.
 * @throws {Error} When `value` is not an array, is empty, or contains an entry that is
 *   not a string or is blank after trimming.
 */
function asStringArray(value, description) {
  if (value === void 0) {
    return void 0;
  }
  if (!Array.isArray(value)) {
    throw new Error(`${description} must be an array of strings (argv tokens)`);
  }
  if (value.length === 0) {
    throw new Error(`${description} cannot be empty`);
  }
  const tokens = [];
  // Index loop (rather than map/forEach) so that array holes are visited too.
  for (let index = 0; index < value.length; index += 1) {
    const entry = value[index];
    if (typeof entry !== "string") {
      throw new Error(`${description}[${index}] must be a string`);
    }
    if (entry.trim().length === 0) {
      throw new Error(`${description}[${index}] cannot be empty`);
    }
    tokens.push(entry);
  }
  return tokens;
}
1017
/**
 * Wrap a shell command string into an argv array that runs it through the platform shell.
 *
 * @param {string} command - Command line to execute.
 * @returns {string[]} `["cmd.exe", "/c", command]` on win32, `["sh", "-lc", command]` elsewhere.
 */
function parseCommandToArgv(command) {
  const shellInvocation = process.platform === "win32" ? ["cmd.exe", "/c"] : ["sh", "-lc"];
  return [...shellInvocation, command];
}
826
1023
  function isJsonObject2(value) {
827
1024
  return typeof value === "object" && value !== null && !Array.isArray(value);
828
1025
  }
@@ -856,6 +1053,14 @@ function validateWeight(rawWeight, evaluatorName, evalId) {
856
1053
  }
857
1054
  return rawWeight;
858
1055
  }
1056
// Match strategies recognised by isValidFieldMatchType.
var VALID_FIELD_MATCH_TYPES = /* @__PURE__ */ new Set(["exact", "numeric_tolerance", "date"]);
/**
 * Check whether a value names one of the supported field match types.
 *
 * @param {unknown} value - Candidate match type.
 * @returns {boolean} `true` only for the strings listed in VALID_FIELD_MATCH_TYPES.
 */
function isValidFieldMatchType(value) {
  if (typeof value !== "string") {
    return false;
  }
  return VALID_FIELD_MATCH_TYPES.has(value);
}
1060
// Aggregation modes recognised by isValidFieldAggregationType.
var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average", "all_or_nothing"]);
/**
 * Check whether a value names one of the supported field aggregation modes.
 *
 * @param {unknown} value - Candidate aggregation mode.
 * @returns {boolean} `true` only for the strings listed in VALID_FIELD_AGGREGATION_TYPES.
 */
function isValidFieldAggregationType(value) {
  if (typeof value !== "string") {
    return false;
  }
  return VALID_FIELD_AGGREGATION_TYPES.has(value);
}
859
1064
 
860
1065
  // src/evaluation/loaders/message-processor.ts
861
1066
  var import_promises4 = require("fs/promises");
@@ -1930,92 +2135,993 @@ async function withRetry(fn, retryConfig, signal) {
1930
2135
  throw lastError;
1931
2136
  }
1932
2137
 
1933
- // src/evaluation/providers/cli.ts
2138
+ // src/evaluation/providers/claude-code.ts
1934
2139
  var import_node_child_process = require("child_process");
1935
- var import_promises8 = __toESM(require("fs/promises"), 1);
1936
- var import_node_os = __toESM(require("os"), 1);
1937
- var import_node_path8 = __toESM(require("path"), 1);
1938
- var import_node_util = require("util");
1939
- var execAsync = (0, import_node_util.promisify)(import_node_child_process.exec);
1940
- var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
1941
- async function defaultCommandRunner(command, options) {
1942
- const execOptions = {
1943
- cwd: options.cwd,
1944
- env: options.env,
1945
- timeout: options.timeoutMs,
1946
- signal: options.signal,
1947
- maxBuffer: DEFAULT_MAX_BUFFER,
1948
- shell: process.platform === "win32" ? "powershell.exe" : void 0
2140
+ var import_node_crypto = require("crypto");
2141
+ var import_node_fs3 = require("fs");
2142
+ var import_promises8 = require("fs/promises");
2143
+ var import_node_os = require("os");
2144
+ var import_node_path9 = __toESM(require("path"), 1);
2145
+
2146
+ // src/evaluation/providers/claude-code-log-tracker.ts
2147
+ var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeCodeLogs");
2148
+ var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.claudeCodeLogSubscribers");
2149
/**
 * Return the array of log entries stored on `globalThis` under GLOBAL_LOGS_KEY,
 * creating and registering an empty array when no truthy value is stored yet.
 *
 * @returns {Array} The shared log-entry array.
 */
function getClaudeCodeLogStore() {
  const registry = globalThis;
  // `||=` keeps any existing truthy store and only installs a fresh array otherwise.
  registry[GLOBAL_LOGS_KEY] ||= [];
  return registry[GLOBAL_LOGS_KEY];
}
2159
/**
 * Return the Set of subscribers stored on `globalThis` under GLOBAL_SUBSCRIBERS_KEY,
 * creating and registering an empty Set when no truthy value is stored yet.
 *
 * @returns {Set} The shared subscriber set.
 */
function getSubscriberStore() {
  const registry = globalThis;
  // `||=` keeps any existing truthy store and only installs a fresh Set otherwise.
  registry[GLOBAL_SUBSCRIBERS_KEY] ||= /* @__PURE__ */ new Set();
  return registry[GLOBAL_SUBSCRIBERS_KEY];
}
2169
/**
 * Call every registered subscriber with a log entry.
 * A subscriber that throws is reported through `console.warn` and does not
 * prevent the remaining subscribers from being called.
 *
 * @param {object} entry - Log entry handed to each listener.
 * @returns {void}
 */
function notifySubscribers(entry) {
  // Iterate over a snapshot so changes to the store during dispatch do not affect this round.
  const snapshot = [...getSubscriberStore()];
  snapshot.forEach((listener) => {
    try {
      listener(entry);
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      console.warn(`Claude Code log subscriber failed: ${reason}`);
    }
  });
}
2180
/**
 * Append a log entry to the shared store, then notify subscribers about it.
 *
 * @param {object} entry - Log entry to record.
 * @returns {void}
 */
function recordClaudeCodeLogEntry(entry) {
  const store = getClaudeCodeLogStore();
  store.push(entry);
  notifySubscribers(entry);
}
2184
/**
 * Remove and return every entry currently held in the shared log store.
 *
 * @returns {Array} The drained entries in insertion order; a new empty array when
 *   the store holds nothing.
 */
function consumeClaudeCodeLogEntries() {
  const pending = getClaudeCodeLogStore();
  // splice(0) empties the store in place and hands back the removed entries.
  return pending.length > 0 ? pending.splice(0) : [];
}
2191
+ function subscribeToClaudeCodeLogEntries(listener) {
2192
+ const store = getSubscriberStore();
2193
+ store.add(listener);
2194
+ return () => {
2195
+ store.delete(listener);
1949
2196
  };
1950
- try {
1951
- const { stdout, stderr } = await execAsync(command, execOptions);
1952
- return {
1953
- stdout,
1954
- stderr,
1955
- exitCode: 0,
1956
- failed: false,
1957
- timedOut: false,
1958
- signal: null
1959
- };
1960
- } catch (error) {
1961
- const execError = error;
1962
- return {
1963
- stdout: execError.stdout ?? "",
1964
- stderr: execError.stderr ?? "",
1965
- exitCode: typeof execError.code === "number" ? execError.code : null,
1966
- failed: true,
1967
- timedOut: execError.timedOut === true || execError.killed === true,
1968
- signal: execError.signal ?? null
1969
- };
2197
+ }
2198
+
2199
+ // src/evaluation/providers/preread.ts
2200
+ var import_node_path8 = __toESM(require("path"), 1);
2201
/**
 * Assemble the prompt text: an optional preread block listing guideline and input
 * files, followed by a `[[ ## user_query ## ]]` marker and the trimmed question.
 *
 * @param {object} request - Request providing `question` and, as a fallback, `guideline_patterns`.
 * @param {string[] | undefined} inputFiles - Files to split into guideline and other input files.
 * @param {object} [options] - Optional `guidelinePatterns` (takes precedence over the
 *   request's patterns) and `guidelineOverrides`.
 * @returns {string} The assembled, trimmed prompt document.
 */
function buildPromptDocument(request, inputFiles, options) {
  const patterns = options?.guidelinePatterns ?? request.guideline_patterns;
  const guidelineFiles = collectGuidelineFiles(inputFiles, patterns, options?.guidelineOverrides);
  const guidelineLookup = new Set(guidelineFiles);
  // Files already listed as guidelines are not repeated in the input-file list.
  const remainingInputFiles = collectInputFiles(inputFiles).filter(
    (file) => !guidelineLookup.has(file)
  );
  const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, remainingInputFiles);
  const parts = prereadBlock.length > 0 ? ["\n", prereadBlock] : [];
  parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
  return parts.join("\n").trim();
}
1972
- var CliProvider = class {
1973
- id;
1974
- kind = "cli";
1975
- targetName;
1976
- supportsBatch = true;
1977
- config;
1978
- runCommand;
1979
- verbose;
1980
- keepTempFiles;
1981
- healthcheckPromise;
1982
- constructor(targetName, config, runner = defaultCommandRunner) {
1983
- this.targetName = targetName;
1984
- this.id = `cli:${targetName}`;
1985
- this.config = config;
1986
- this.runCommand = runner;
1987
- this.verbose = config.verbose ?? false;
1988
- this.keepTempFiles = config.keepTempFiles ?? false;
2217
+ function normalizeInputFiles(inputFiles) {
2218
+ if (!inputFiles || inputFiles.length === 0) {
2219
+ return void 0;
1989
2220
  }
1990
- async invoke(request) {
1991
- if (request.signal?.aborted) {
1992
- throw new Error("CLI provider request was aborted before execution");
2221
+ const deduped = /* @__PURE__ */ new Map();
2222
+ for (const inputFile of inputFiles) {
2223
+ const absolutePath = import_node_path8.default.resolve(inputFile);
2224
+ if (!deduped.has(absolutePath)) {
2225
+ deduped.set(absolutePath, absolutePath);
1993
2226
  }
1994
- await this.ensureHealthy(request.signal);
1995
- const outputFilePath = generateOutputFilePath(request.evalCaseId);
1996
- const templateValues = buildTemplateValues(request, this.config, outputFilePath);
1997
- const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
1998
- if (this.verbose) {
1999
- console.log(
2000
- `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
2001
- );
2227
+ }
2228
+ return Array.from(deduped.values());
2229
+ }
2230
+ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
2231
+ if (!inputFiles || inputFiles.length === 0) {
2232
+ return [];
2233
+ }
2234
+ const unique = /* @__PURE__ */ new Map();
2235
+ for (const inputFile of inputFiles) {
2236
+ const absolutePath = import_node_path8.default.resolve(inputFile);
2237
+ if (overrides?.has(absolutePath)) {
2238
+ if (!unique.has(absolutePath)) {
2239
+ unique.set(absolutePath, absolutePath);
2240
+ }
2241
+ continue;
2002
2242
  }
2003
- const startTime = Date.now();
2004
- const result = await this.runCommand(renderedCommand, {
2005
- cwd: this.config.cwd,
2006
- env: process.env,
2007
- timeoutMs: this.config.timeoutMs,
2008
- signal: request.signal
2009
- });
2010
- const measuredDurationMs = Date.now() - startTime;
2011
- if (result.failed || (result.exitCode ?? 0) !== 0) {
2012
- if (request.signal?.aborted) {
2013
- throw new Error("CLI provider request was aborted");
2243
+ const normalized = absolutePath.split(import_node_path8.default.sep).join("/");
2244
+ if (isGuidelineFile(normalized, guidelinePatterns)) {
2245
+ if (!unique.has(absolutePath)) {
2246
+ unique.set(absolutePath, absolutePath);
2014
2247
  }
2015
- if (result.timedOut) {
2016
- throw new Error(
2017
- `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
2018
- );
2248
+ }
2249
+ }
2250
+ return Array.from(unique.values());
2251
+ }
2252
/**
 * Resolve input file paths to absolute paths and drop duplicates, keeping first-seen order.
 *
 * @param {string[] | undefined} inputFiles - Paths to resolve.
 * @returns {string[]} Unique absolute paths; an empty array when nothing was supplied.
 */
function collectInputFiles(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return [];
  }
  const absolutePaths = Array.from(inputFiles, (inputFile) => import_node_path8.default.resolve(inputFile));
  // A Set keeps insertion order, so the first occurrence of each path wins.
  return Array.from(new Set(absolutePaths));
}
2265
/**
 * Build the instruction block telling the agent which files to read before answering.
 *
 * @param {string[]} guidelineFiles - Paths listed under "Read all guideline files:".
 * @param {string[]} inputFiles - Paths listed under "Read all input files:".
 * @returns {string} The newline-joined block, or `""` when both lists are empty.
 */
function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
  const hasGuidelines = guidelineFiles.length > 0;
  const hasInputs = inputFiles.length > 0;
  if (!hasGuidelines && !hasInputs) {
    return "";
  }
  // Render a heading followed by one markdown link bullet per file, terminated by a period.
  const renderSection = (heading, files) => {
    const bullets = files.map(
      (absolutePath) => `* [${import_node_path8.default.basename(absolutePath)}](${pathToFileUri(absolutePath)})`
    );
    return `${heading}\n${bullets.join("\n")}.`;
  };
  const sections = [];
  if (hasGuidelines) {
    sections.push(renderSection("Read all guideline files:", guidelineFiles));
  }
  if (hasInputs) {
    sections.push(renderSection("Read all input files:", inputFiles));
  }
  sections.push(
    "If any file is missing, fail with ERROR: missing-file <filename> and stop.",
    "Then apply system_instructions on the user query below."
  );
  return sections.join("\n");
}
2289
/**
 * Convert a filesystem path into a `file://` URI.
 *
 * Relative paths are resolved against the current working directory, and
 * backslashes are normalised to forward slashes. The path is percent-encoded so
 * that characters such as spaces, `#`, `?`, `%` and brackets do not produce an
 * invalid URI or break the markdown link the URI is embedded in. Paths made only
 * of URI-safe characters produce the same output as before.
 *
 * @param {string} filePath - Absolute or relative filesystem path.
 * @returns {string} `file:///C:/...` for drive-letter paths, `file://` + path otherwise.
 */
function pathToFileUri(filePath) {
  const absolutePath = import_node_path8.default.isAbsolute(filePath) ? filePath : import_node_path8.default.resolve(filePath);
  const normalizedPath = absolutePath.replace(/\\/g, "/");
  let encodedPath;
  try {
    // encodeURI leaves "/" and ":" intact but also leaves "?" and "#", which would
    // otherwise be read as the start of a query or fragment.
    encodedPath = encodeURI(normalizedPath).replace(/[?#]/g, (char) => encodeURIComponent(char));
  } catch {
    // encodeURI throws URIError on lone surrogates; fall back to the raw path
    // rather than failing prompt construction.
    encodedPath = normalizedPath;
  }
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
    return `file:///${encodedPath}`;
  }
  return `file://${encodedPath}`;
}
2297
+
2298
+ // src/evaluation/providers/claude-code.ts
2299
+ var WORKSPACE_PREFIX = "agentv-claude-code-";
2300
+ var PROMPT_FILENAME = "prompt.md";
2301
+ var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
2302
+ - Do NOT create any additional output files in the workspace.
2303
+ - All intended file outputs/changes MUST be written in your response.
2304
+ - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
2305
+ This is required for evaluation scoring.`;
2306
/**
 * Provider that answers a request by running the Claude Code CLI once per
 * invocation and parsing its `stream-json` output.
 *
 * The process runner is injectable through the constructor's third argument.
 */
var ClaudeCodeProvider = class {
  id;
  kind = "claude-code";
  targetName;
  supportsBatch = false;
  config;
  runClaudeCode;
  /**
   * @param {string} targetName - Target name; also used to build `id`.
   * @param {object} config - Provider settings read by the methods below
   *   (`executable`, `model`, `args`, `systemPrompt`, `cwd`, `timeoutMs`, `logDir`, `logFormat`).
   * @param {Function} [runner] - Function that launches the CLI and resolves with
   *   `{ stdout, stderr, exitCode, timedOut }`.
   */
  constructor(targetName, config, runner = defaultClaudeCodeRunner) {
    this.id = `claude-code:${targetName}`;
    this.targetName = targetName;
    this.config = config;
    this.runClaudeCode = runner;
  }
  /**
   * Run the CLI for one request.
   *
   * @param {object} request - Reads `question`, `inputFiles`, `signal`, `evalCaseId`, `attempt`.
   * @returns {Promise<object>} `{ raw, outputMessages, usage }`.
   * @throws {Error} When the request is already aborted, the CLI times out, or it
   *   exits with a non-zero code.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("Claude Code request was aborted before execution");
    }
    const inputFiles = normalizeInputFiles(request.inputFiles);
    const workspaceRoot = await this.createWorkspace();
    // Stream logging is best-effort: a failure to set it up must not fail the request.
    const logger = await this.createStreamLogger(request).catch(() => void 0);
    try {
      const promptFile = import_node_path9.default.join(workspaceRoot, PROMPT_FILENAME);
      await (0, import_promises8.writeFile)(promptFile, request.question, "utf8");
      const args = this.buildClaudeCodeArgs(request.question, inputFiles);
      const cwd = this.resolveCwd();
      const result = await this.executeClaudeCode(args, cwd, request.signal, logger);
      if (result.timedOut) {
        throw new Error(
          `Claude Code CLI timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
        );
      }
      if (result.exitCode !== 0) {
        const detail = pickDetail(result.stderr, result.stdout);
        const prefix = `Claude Code CLI exited with code ${result.exitCode}`;
        if (isNestedClaudeCodeAuthError(result.stdout)) {
          throw new Error(
            `${prefix}: Claude Code detected a nested session and requires API key authentication. Set ANTHROPIC_API_KEY environment variable or run AgentV outside of a Claude Code session.`
          );
        }
        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
      }
      const parsed = parseClaudeCodeJsonl(result.stdout);
      const outputMessages = extractOutputMessages(parsed);
      const usage = extractUsage(parsed);
      return {
        raw: {
          response: parsed,
          stdout: result.stdout,
          stderr: result.stderr,
          exitCode: result.exitCode,
          args,
          executable: this.config.executable,
          promptFile,
          workspace: workspaceRoot,
          inputFiles,
          logFile: logger?.filePath
        },
        outputMessages,
        usage
      };
    } finally {
      // A failing log close must neither skip workspace cleanup nor replace the
      // result or error produced above, so it is reported and otherwise ignored.
      try {
        await logger?.close();
      } catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.warn(`Failed to close Claude Code stream log ${logger.filePath}: ${message}`);
      }
      await this.cleanupWorkspace(workspaceRoot);
    }
  }
  /**
   * @returns {string} Absolute form of `config.cwd`, or the process working
   *   directory when `config.cwd` is not set.
   */
  resolveCwd() {
    if (!this.config.cwd) {
      return process.cwd();
    }
    return import_node_path9.default.resolve(this.config.cwd);
  }
  /**
   * Build the CLI argument list; the final argument is the full prompt text.
   *
   * @param {string} prompt - The user question.
   * @param {string[] | undefined} inputFiles - Paths appended under an "## Input Files" heading.
   * @returns {string[]} Arguments for the Claude Code executable.
   */
  buildClaudeCodeArgs(prompt, inputFiles) {
    const args = [];
    args.push("--output-format", "stream-json");
    args.push("--verbose");
    args.push("-p");
    if (this.config.model) {
      args.push("--model", this.config.model);
    }
    if (this.config.args && this.config.args.length > 0) {
      args.push(...this.config.args);
    }
    const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
    const fullPrompt = `${systemPrompt}\n\n${prompt}`;
    let finalPrompt = fullPrompt;
    if (inputFiles && inputFiles.length > 0) {
      const filesContext = inputFiles.map((f) => `[File: ${f}]`).join("\n");
      finalPrompt = `${fullPrompt}\n\n## Input Files\n${filesContext}`;
    }
    args.push(finalPrompt);
    return args;
  }
  /**
   * @returns {object} A copy of `process.env` with `CLAUDECODE` and
   *   `CLAUDE_CODE_ENTRYPOINT` set to `undefined`.
   */
  buildEnv() {
    const env = { ...process.env };
    env.CLAUDECODE = void 0;
    env.CLAUDE_CODE_ENTRYPOINT = void 0;
    return env;
  }
  /**
   * Invoke the runner, turning a missing executable (`ENOENT`) into a descriptive error.
   *
   * @param {string[]} args - CLI arguments.
   * @param {string} cwd - Working directory for the CLI.
   * @param {AbortSignal | undefined} signal - Abort signal forwarded to the runner.
   * @param {object | undefined} logger - Optional stream logger that receives output chunks.
   * @returns {Promise<object>} The runner's result.
   */
  async executeClaudeCode(args, cwd, signal, logger) {
    try {
      return await this.runClaudeCode({
        executable: this.config.executable,
        args,
        cwd,
        timeoutMs: this.config.timeoutMs,
        env: this.buildEnv(),
        signal,
        onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
        onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
      });
    } catch (error) {
      const err = error;
      if (err.code === "ENOENT") {
        throw new Error(
          `Claude Code executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
        );
      }
      throw error;
    }
  }
  /** @returns {Promise<string>} Path of a newly created temporary directory. */
  async createWorkspace() {
    return await (0, import_promises8.mkdtemp)(import_node_path9.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
  }
  /**
   * Remove the temporary workspace. Failures are deliberately ignored (best effort).
   *
   * @param {string} workspaceRoot - Directory to delete recursively.
   */
  async cleanupWorkspace(workspaceRoot) {
    try {
      await (0, import_promises8.rm)(workspaceRoot, { recursive: true, force: true });
    } catch {
    }
  }
  /**
   * @returns {string | undefined} Directory for stream logs, or `undefined` when
   *   log streaming is disabled.
   */
  resolveLogDirectory() {
    const disabled = isClaudeCodeLogStreamingDisabled();
    if (disabled) {
      return void 0;
    }
    if (this.config.logDir) {
      return import_node_path9.default.resolve(this.config.logDir);
    }
    return import_node_path9.default.join(process.cwd(), ".agentv", "logs", "claude-code");
  }
  /**
   * Create the stream logger for a request and record its log entry.
   *
   * @param {object} request - Reads `evalCaseId` and `attempt`.
   * @returns {Promise<object | undefined>} The logger, or `undefined` when logging
   *   is disabled or could not be set up (a warning is printed in the latter case).
   */
  async createStreamLogger(request) {
    const logDir = this.resolveLogDirectory();
    if (!logDir) {
      return void 0;
    }
    try {
      await (0, import_promises8.mkdir)(logDir, { recursive: true });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Claude Code stream logging (could not create ${logDir}): ${message}`);
      return void 0;
    }
    const filePath = import_node_path9.default.join(logDir, buildLogFilename(request, this.targetName));
    try {
      const logger = await ClaudeCodeStreamLogger.create({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt,
        format: this.config.logFormat ?? "summary"
      });
      recordClaudeCodeLogEntry({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt
      });
      return logger;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Claude Code stream logging for ${filePath}: ${message}`);
      return void 0;
    }
  }
};
2485
/**
 * Appends a line-oriented log of a Claude Code CLI run to a file.
 *
 * Output chunks are buffered per source (stdout/stderr), split into lines, and
 * each non-blank line is written with an elapsed-time and source prefix.
 * Instances are created through the async `create` factory.
 */
var ClaudeCodeStreamLogger = class _ClaudeCodeStreamLogger {
  filePath;
  stream;
  startedAt = Date.now();
  stdoutBuffer = "";
  stderrBuffer = "";
  format;
  // First error emitted by the write stream, if any.
  streamError;
  /**
   * @param {string} filePath - Log file, opened in append mode.
   * @param {string} format - `"json"` pretty-prints JSON lines; any other value uses summaries.
   */
  constructor(filePath, format) {
    this.filePath = filePath;
    this.format = format;
    this.stream = (0, import_node_fs3.createWriteStream)(filePath, { flags: "a" });
    // Without a listener, a stream 'error' event is thrown as an uncaught exception.
    this.stream.on("error", (error) => {
      this.streamError ??= error;
    });
  }
  /**
   * Open the log file and write the header.
   *
   * @param {object} options - `filePath`, `targetName`, optional `evalCaseId` and
   *   `attempt` (logged as `attempt + 1`), and `format`.
   * @returns {Promise<object>} The ready logger.
   * @throws {Error} When the log file cannot be opened.
   */
  static async create(options) {
    const logger = new _ClaudeCodeStreamLogger(options.filePath, options.format);
    // Surface open failures here, where callers already handle setup errors.
    await logger.waitUntilOpen();
    const header = [
      "# Claude Code CLI stream log",
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
      ""
      // Drop only the omitted optional lines; the trailing "" is the blank
      // separator between the header and the log body and must be kept.
    ].filter((line) => line !== void 0);
    logger.writeLines(header);
    return logger;
  }
  /** Resolve once the underlying file is open; reject if opening fails. */
  waitUntilOpen() {
    return new Promise((resolve, reject) => {
      const onOpen = () => {
        this.stream.off("error", onError);
        resolve();
      };
      const onError = (error) => {
        this.stream.off("open", onOpen);
        reject(error);
      };
      this.stream.once("open", onOpen);
      this.stream.once("error", onError);
    });
  }
  /** @param {string} chunk - Text received from the CLI's stdout. */
  handleStdoutChunk(chunk) {
    this.stdoutBuffer += chunk;
    this.flushBuffer("stdout");
  }
  /** @param {string} chunk - Text received from the CLI's stderr. */
  handleStderrChunk(chunk) {
    this.stderrBuffer += chunk;
    this.flushBuffer("stderr");
  }
  /**
   * Write any buffered output and end the stream.
   *
   * @returns {Promise<void>} Resolves when the stream has ended.
   * @throws {Error} When the stream reported an error before or during closing.
   */
  async close() {
    this.flushBuffer("stdout");
    this.flushBuffer("stderr");
    this.flushRemainder();
    await new Promise((resolve, reject) => {
      if (this.streamError) {
        reject(this.streamError);
        return;
      }
      this.stream.once("error", reject);
      this.stream.end(() => resolve());
    });
  }
  /** Write one line followed by a newline; a no-op once the stream has failed. */
  writeLine(text) {
    if (this.streamError) {
      return;
    }
    this.stream.write(`${text}\n`);
  }
  /** @param {string[]} lines - Lines to write, each followed by a newline. */
  writeLines(lines) {
    for (const line of lines) {
      this.writeLine(line);
    }
  }
  /**
   * Write every complete line buffered for `source`, keeping the trailing partial line.
   *
   * @param {"stdout" | "stderr"} source - Which buffer to flush.
   */
  flushBuffer(source) {
    const buffer = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
    const lines = buffer.split(/\r?\n/);
    // The last element is the text after the final newline, i.e. an unfinished line.
    const remainder = lines.pop() ?? "";
    if (source === "stdout") {
      this.stdoutBuffer = remainder;
    } else {
      this.stderrBuffer = remainder;
    }
    for (const line of lines) {
      const formatted = this.formatLine(line, source);
      if (formatted) {
        this.writeLine(formatted);
      }
    }
  }
  /**
   * @param {string} rawLine - One line of CLI output.
   * @param {"stdout" | "stderr"} source - Origin of the line.
   * @returns {string | undefined} The prefixed log line, or `undefined` for blank input.
   */
  formatLine(rawLine, source) {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return void 0;
    }
    const message = this.format === "json" ? formatClaudeCodeJsonLog(trimmed) : formatClaudeCodeLogMessage(trimmed, source);
    return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`;
  }
  /** Write the unfinished trailing line of each buffer (stdout first), then clear both buffers. */
  flushRemainder() {
    for (const source of ["stdout", "stderr"]) {
      const remainder = (source === "stdout" ? this.stdoutBuffer : this.stderrBuffer).trim();
      if (remainder.length > 0) {
        const formatted = this.formatLine(remainder, source);
        if (formatted) {
          this.writeLine(formatted);
        }
      }
    }
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
  }
};
2579
/**
 * Report whether stream logging is switched off through the
 * `AGENTV_CLAUDE_CODE_STREAM_LOGS` environment variable.
 *
 * @returns {boolean} `true` when the variable, trimmed and lower-cased, is
 *   `"false"`, `"0"` or `"off"`; `false` when it is unset, empty, or any other value.
 */
function isClaudeCodeLogStreamingDisabled() {
  const rawSetting = process.env.AGENTV_CLAUDE_CODE_STREAM_LOGS;
  if (!rawSetting) {
    return false;
  }
  return ["false", "0", "off"].includes(rawSetting.trim().toLowerCase());
}
2587
/**
 * Build a log file name of the form
 * `<timestamp>_<target>_<evalId>[_attempt-N]_<8 random chars>.log`.
 *
 * @param {object} request - Reads `evalCaseId` (defaults to `"claude-code"`) and
 *   `attempt` (written as `attempt + 1`).
 * @param {string} targetName - Target name, sanitised for use in a file name.
 * @returns {string} The file name (no directory part).
 */
function buildLogFilename(request, targetName) {
  // ISO timestamp with ":" and "." replaced by "-".
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
  const evalSegment = sanitizeForFilename(request.evalCaseId ?? "claude-code");
  const attemptSegment = request.attempt === void 0 ? "" : `_attempt-${request.attempt + 1}`;
  const targetSegment = sanitizeForFilename(targetName);
  const uniqueSuffix = (0, import_node_crypto.randomUUID)().slice(0, 8);
  return [stamp, targetSegment, `${evalSegment}${attemptSegment}`, `${uniqueSuffix}.log`].join("_");
}
2594
/**
 * Replace every run of characters outside `A-Za-z0-9._-` with a single underscore.
 *
 * @param {string} value - Text to sanitise.
 * @returns {string} The sanitised text, or `"claude-code"` when the result is empty.
 */
function sanitizeForFilename(value) {
  // An empty string is falsy, so `||` supplies the fallback exactly when nothing is left.
  return value.replace(/[^A-Za-z0-9._-]+/g, "_") || "claude-code";
}
2598
/**
 * Formats the wall-clock time elapsed since `startedAt` as `MM:SS`, or
 * `HH:MM:SS` once at least one hour has passed.
 *
 * @param {number} startedAt - Start time in epoch milliseconds (same clock as Date.now()).
 * @returns {string} Zero-padded elapsed time.
 */
function formatElapsed(startedAt) {
  const elapsedMs = Date.now() - startedAt;
  // A start time in the future (clock adjustment) or a missing/invalid start
  // time would otherwise render as "-1:-5" or "NaN:NaN"; report zero instead.
  const elapsedSeconds = Number.isFinite(elapsedMs) && elapsedMs > 0 ? Math.floor(elapsedMs / 1e3) : 0;
  const pad = (amount) => amount.toString().padStart(2, "0");
  const hours = Math.floor(elapsedSeconds / 3600);
  const minutes = Math.floor(elapsedSeconds % 3600 / 60);
  const seconds = elapsedSeconds % 60;
  if (hours > 0) {
    return `${pad(hours)}:${pad(minutes)}:${pad(seconds)}`;
  }
  return `${pad(minutes)}:${pad(seconds)}`;
}
2608
/**
 * Produces the human-readable text for one raw output line in summary mode.
 *
 * If the line is JSON and can be summarized as a Claude Code event, the
 * summary is used. Otherwise the raw line is returned, prefixed with
 * "stderr: " when it came from stderr.
 *
 * @param {string} rawLine - Trimmed line of process output.
 * @param {string} source - Stream label; only "stderr" changes the output.
 * @returns {string} Text to place in the log entry.
 */
function formatClaudeCodeLogMessage(rawLine, source) {
  const event = tryParseJsonValue(rawLine);
  const summary = event ? summarizeClaudeCodeEvent(event) : void 0;
  if (summary) {
    return summary;
  }
  return source === "stderr" ? `stderr: ${rawLine}` : rawLine;
}
2621
/**
 * Pretty-prints a raw JSON line with two-space indentation for "json" log mode.
 *
 * @param {string} rawLine - Trimmed line of process output.
 * @returns {string} Indented JSON, or the line unchanged when it does not
 *   parse to a truthy value or cannot be re-serialized.
 */
function formatClaudeCodeJsonLog(rawLine) {
  const value = tryParseJsonValue(rawLine);
  if (value) {
    try {
      return JSON.stringify(value, null, 2);
    } catch {
      // Fall through and emit the raw line.
    }
  }
  return rawLine;
}
2632
/**
 * Condenses one parsed Claude Code stream event into a short single-line label
 * for the log.
 *
 * Only the first content part of assistant/user messages is inspected.
 *
 * @param {unknown} event - Parsed JSON value from one output line.
 * @returns {string | undefined} Summary text, or undefined when the value is
 *   not an object with a non-empty string `type`.
 */
function summarizeClaudeCodeEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const record = event;
  const type = typeof record.type === "string" ? record.type : void 0;
  if (!type) {
    return void 0;
  }
  // First entry of message.content, when the message carries a non-empty content array.
  const firstContentPart = () => {
    const content = record.message ? record.message.content : void 0;
    return Array.isArray(content) && content.length > 0 ? content[0] : void 0;
  };
  // Avoids printing the literal "undefined" when an identifier is absent.
  const labelOrUnknown = (value) => typeof value === "string" && value.length > 0 ? value : "unknown";
  switch (type) {
    case "system":
      return "system: init";
    case "assistant": {
      const first = firstContentPart();
      if (first?.type === "tool_use") {
        return `assistant: tool_use (${labelOrUnknown(first.name)})`;
      }
      if (first?.type === "text" && typeof first.text === "string") {
        const text = first.text;
        // Keep log lines short: at most 50 characters of assistant text.
        const preview = text.length > 50 ? `${text.slice(0, 50)}...` : text;
        return `assistant: ${preview}`;
      }
      return "assistant";
    }
    case "user": {
      const first = firstContentPart();
      if (first?.type === "tool_result") {
        return `user: tool_result (${labelOrUnknown(first.tool_use_id)})`;
      }
      return "user";
    }
    case "result": {
      // Result events may report cost as `total_cost_usd` rather than `cost_usd`;
      // accept either so the cost/duration summary is not lost.
      const cost = typeof record.cost_usd === "number" ? record.cost_usd : record.total_cost_usd;
      const duration = record.duration_ms;
      if (typeof cost === "number" && typeof duration === "number") {
        return `result: $${cost.toFixed(4)}, ${Math.round(duration)}ms`;
      }
      return "result";
    }
    default:
      return type;
  }
}
2689
/**
 * Parses a string as JSON without throwing.
 *
 * @param {string} rawLine - Candidate JSON text.
 * @returns {unknown} The parsed value, or undefined when the text is not valid JSON.
 */
function tryParseJsonValue(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    value = void 0;
  }
  return value;
}
2696
/**
 * Parses newline-delimited JSON emitted by the Claude Code CLI.
 *
 * Blank lines and lines that are not valid JSON are skipped on purpose, so
 * stray non-JSON output does not abort parsing.
 *
 * @param {string} output - Complete stdout text.
 * @returns {unknown[]} Parsed events in their original order.
 * @throws {Error} When the output is empty or contains no valid JSON line.
 */
function parseClaudeCodeJsonl(output) {
  const body = output.trim();
  if (body.length === 0) {
    throw new Error("Claude Code CLI produced no output");
  }
  const events = [];
  for (const rawLine of body.split(/\r?\n/)) {
    const candidate = rawLine.trim();
    if (candidate.length === 0) {
      continue;
    }
    try {
      events.push(JSON.parse(candidate));
    } catch {
      // Not JSON: ignore this line and keep going.
    }
  }
  if (events.length === 0) {
    throw new Error("Claude Code CLI produced no valid JSON output");
  }
  return events;
}
2714
/**
 * Collects the conversation messages from a list of parsed stream events.
 *
 * Only events of type "assistant" or "user" that carry a truthy `message`
 * are converted; everything else is ignored.
 *
 * @param {unknown[]} events - Parsed stream events.
 * @returns {object[]} Converted messages, in event order.
 */
function extractOutputMessages(events) {
  const collected = [];
  for (const event of events) {
    if (!event || typeof event !== "object") {
      continue;
    }
    const { type, message } = event;
    const isConversationEvent = type === "assistant" || type === "user";
    if (!isConversationEvent || !message) {
      continue;
    }
    const converted = convertClaudeCodeMessage(message, type);
    if (converted) {
      collected.push(converted);
    }
  }
  return collected;
}
2734
/**
 * Converts one Claude Code message payload into an output-message object.
 *
 * @param {object} message - Message whose `content` is read.
 * @param {string} type - Event type; "assistant" maps to role "assistant", anything else to "user".
 * @returns {{role: string, content: (string|undefined), toolCalls: (object[]|undefined)}}
 *   `toolCalls` is undefined when the message contains none.
 */
function convertClaudeCodeMessage(message, type) {
  const rawContent = message.content;
  const text = extractTextContent(rawContent);
  const calls = extractToolCalls(rawContent);
  let role = "user";
  if (type === "assistant") {
    role = "assistant";
  }
  return {
    role,
    content: text,
    toolCalls: calls.length > 0 ? calls : void 0
  };
}
2744
/**
 * Pulls the plain text out of a message `content` value.
 *
 * @param {unknown} content - Either a string or an array of content parts.
 * @returns {string | undefined} The string itself, or the `text` of every
 *   `{type: "text"}` part joined with newlines; undefined when there is no text.
 */
function extractTextContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const isTextPart = (part) => Boolean(part) && typeof part === "object" && part.type === "text" && typeof part.text === "string";
  const texts = content.filter(isTextPart).map((part) => part.text);
  return texts.length > 0 ? texts.join("\n") : void 0;
}
2763
/**
 * Extracts tool activity from a message `content` array.
 *
 * `tool_use` parts with a string `name` become `{tool, input, id}`.
 * `tool_result` parts with a string `tool_use_id` become
 * `{tool: "tool_result", output, id}`.
 *
 * @param {unknown} content - Message content; anything other than an array yields no calls.
 * @returns {object[]} Tool call records in content order (possibly empty).
 */
function extractToolCalls(content) {
  if (!Array.isArray(content)) {
    return [];
  }
  const calls = [];
  for (const part of content) {
    if (!part || typeof part !== "object") {
      continue;
    }
    switch (part.type) {
      case "tool_use":
        if (typeof part.name === "string") {
          calls.push({
            tool: part.name,
            input: part.input,
            id: typeof part.id === "string" ? part.id : void 0
          });
        }
        break;
      case "tool_result":
        if (typeof part.tool_use_id === "string") {
          calls.push({
            tool: "tool_result",
            output: part.content,
            id: part.tool_use_id
          });
        }
        break;
      default:
        break;
    }
  }
  return calls;
}
2790
/**
 * Reads cost, timing, token and session metadata from the last "result"
 * event in a list of parsed stream events.
 *
 * Besides the flat `cost_usd` / `input_tokens` / `output_tokens` fields, this
 * also accepts `total_cost_usd` and a nested `usage` object as fallbacks.
 * Returned key names are the same either way.
 *
 * @param {unknown[]} events - Parsed stream events.
 * @returns {object | undefined} Object with any of `cost_usd`, `duration_ms`,
 *   `duration_api_ms`, `input_tokens`, `output_tokens`, `session_id`;
 *   undefined when there is no result event or it carries none of these.
 */
function extractUsage(events) {
  // Walk backwards: only the most recent result event is considered.
  for (let i = events.length - 1; i >= 0; i--) {
    const event = events[i];
    if (!event || typeof event !== "object" || event.type !== "result") {
      continue;
    }
    const record = event;
    const nestedUsage = record.usage && typeof record.usage === "object" ? record.usage : {};
    // For each output key, candidate source values in priority order.
    const numericSources = {
      cost_usd: [record.cost_usd, record.total_cost_usd],
      duration_ms: [record.duration_ms],
      duration_api_ms: [record.duration_api_ms],
      input_tokens: [record.input_tokens, nestedUsage.input_tokens],
      output_tokens: [record.output_tokens, nestedUsage.output_tokens]
    };
    const usage = {};
    for (const [key, candidates] of Object.entries(numericSources)) {
      const value = candidates.find((candidate) => typeof candidate === "number");
      if (value !== void 0) {
        usage[key] = value;
      }
    }
    if (typeof record.session_id === "string") {
      usage.session_id = record.session_id;
    }
    return Object.keys(usage).length > 0 ? usage : void 0;
  }
  return void 0;
}
2823
/**
 * Chooses the text to show as failure detail: stderr when it has content,
 * otherwise stdout.
 *
 * @param {string} stderr - Captured standard error.
 * @param {string} stdout - Captured standard output.
 * @returns {string | undefined} Trimmed detail text, or undefined when both are blank.
 */
function pickDetail(stderr, stdout) {
  // An empty trimmed string is falsy, so `||` falls through to the next source.
  return stderr.trim() || stdout.trim() || void 0;
}
2831
/**
 * Builds the " after Ns" suffix used in timeout error messages.
 *
 * @param {number | undefined} timeoutMs - Configured timeout in milliseconds.
 * @returns {string} Suffix with the timeout rounded up to whole seconds, or
 *   "" when no positive timeout is configured.
 */
function formatTimeoutSuffix(timeoutMs) {
  const hasTimeout = Boolean(timeoutMs) && timeoutMs > 0;
  return hasTimeout ? ` after ${Math.ceil(timeoutMs / 1e3)}s` : "";
}
2838
/**
 * Scans JSONL stdout for the combination of (a) a system event whose
 * `apiKeySource` is "ANTHROPIC_API_KEY" and (b) an authentication failure or
 * an error result event.
 *
 * @param {string} stdout - Raw stdout of the CLI run.
 * @returns {boolean} true only when both markers are present; false on any
 *   unexpected input.
 */
function isNestedClaudeCodeAuthError(stdout) {
  try {
    const seen = { apiKeySource: false, authError: false };
    for (const rawLine of stdout.split("\n")) {
      const text = rawLine.trim();
      if (!text) {
        continue;
      }
      try {
        const event = JSON.parse(text);
        const isEnvKeySystemEvent = event.type === "system" && event.apiKeySource === "ANTHROPIC_API_KEY";
        const isErrorResult = event.type === "result" && event.is_error;
        if (isEnvKeySystemEvent) {
          seen.apiKeySource = true;
        }
        if (event.error === "authentication_failed" || isErrorResult) {
          seen.authError = true;
        }
      } catch {
        // Non-JSON (or null) line: not an event, ignore it.
      }
    }
    return seen.apiKeySource && seen.authError;
  } catch {
    return false;
  }
}
2862
/**
 * Quotes a string for safe use as a single POSIX shell word.
 *
 * The value is wrapped in single quotes; each embedded single quote is
 * written as `'\''` (close quote, escaped quote, reopen quote).
 *
 * @param {string} arg - Raw argument.
 * @returns {string} Single-quoted shell word.
 */
function escapeShellArg(arg) {
  const escaped = arg.split("'").join("'\\''");
  return `'${escaped}'`;
}
2865
/**
 * Default runner: executes the Claude Code CLI with its stdout, stderr, exit
 * code and pid captured in per-run temp files, and always removes those files
 * afterwards.
 *
 * @param {object} options - Passed through unchanged to runClaudeCodeWithTempFiles.
 * @returns {Promise<object>} Whatever runClaudeCodeWithTempFiles resolves to.
 */
async function defaultClaudeCodeRunner(options) {
  const runId = (0, import_node_crypto.randomUUID)();
  const tempFileFor = (suffix) => import_node_path9.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${runId}-${suffix}`);
  const stdoutFile = tempFileFor("stdout");
  const stderrFile = tempFileFor("stderr");
  const exitFile = tempFileFor("exit");
  const pidFile = tempFileFor("pid");
  // Best-effort delete: a failed removal must not mask the run's outcome.
  const removeQuietly = async (file) => {
    try {
      await (0, import_promises8.rm)(file, { force: true });
    } catch {
    }
  };
  try {
    return await runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile);
  } finally {
    for (const file of [stdoutFile, stderrFile, exitFile, pidFile]) {
      await removeQuietly(file);
    }
  }
}
2882
/**
 * Runs the Claude Code CLI in a detached bash session and polls temp files
 * for its output and exit code.
 *
 * The command's stdout/stderr are redirected to `stdoutFile`/`stderrFile`,
 * its pid is written to `pidFile`, and its exit status to `exitFile` once it
 * finishes. This function polls every 100 ms until the exit file appears,
 * the timeout elapses, the abort signal fires, or the launcher fails to start.
 *
 * @param {object} options - Reads `executable` (may include leading args
 *   separated by whitespace), `args`, `cwd`, `env`, `timeoutMs`, `signal`,
 *   `onStdoutChunk`, `onStderrChunk`.
 * @param {string} stdoutFile - Path receiving the command's stdout.
 * @param {string} stderrFile - Path receiving the command's stderr.
 * @param {string} exitFile - Path receiving the command's exit status.
 * @param {string} pidFile - Path receiving the command's pid.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number, timedOut: boolean}>}
 *   `exitCode` is -1 when no exit status was recorded (timeout, abort, launch failure).
 */
async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile) {
  // Honour cancellation BEFORE launching: once the detached process exists but
  // its pid file is not written yet, there is nothing we could kill, and the
  // process would be orphaned.
  if (options.signal?.aborted) {
    return { stdout: "", stderr: "Aborted", exitCode: -1, timedOut: false };
  }
  // Resolve the fs helpers once instead of re-importing on every poll.
  const { readFile: readFile8, access: access5 } = await import("fs/promises");
  const [executable, ...executableArgs] = options.executable.split(/\s+/);
  const fullCommand = [executable, ...executableArgs, ...options.args].map((part) => escapeShellArg(part)).join(" ");
  const bashScript = `
unset CLAUDECODE CLAUDE_CODE_ENTRYPOINT 2>/dev/null
${fullCommand} >${escapeShellArg(stdoutFile)} 2>${escapeShellArg(stderrFile)} &
CHILD_PID=$!
echo $CHILD_PID > ${escapeShellArg(pidFile)}
wait $CHILD_PID
echo $? > ${escapeShellArg(exitFile)}
`;
  const child = (0, import_node_child_process.spawn)("setsid", ["bash", "-c", bashScript], {
    cwd: options.cwd,
    env: options.env,
    detached: true,
    stdio: "ignore"
  });
  // Without a listener, a failed launch (e.g. `setsid` not installed) raises an
  // unhandled 'error' event; with no timeout the poll loop would never end.
  let spawnError;
  child.once("error", (error) => {
    spawnError = error;
  });
  child.unref();
  const pollInterval = 100;
  const startTime = Date.now();
  let timedOut = false;
  let lastStdoutSize = 0;
  const readFileIfExists = async (filePath) => {
    try {
      return await readFile8(filePath, "utf8");
    } catch {
      return "";
    }
  };
  const fileExists4 = async (filePath) => {
    try {
      await access5(filePath);
      return true;
    } catch {
      return false;
    }
  };
  // Best-effort SIGTERM to the pid recorded by the bash wrapper.
  const killProcess = async () => {
    try {
      const pid = await readFileIfExists(pidFile);
      if (pid.trim()) {
        process.kill(Number.parseInt(pid.trim(), 10), "SIGTERM");
      }
    } catch {
    }
  };
  const abortHandler = () => {
    killProcess().catch(() => {
    });
  };
  options.signal?.addEventListener("abort", abortHandler, { once: true });
  try {
    while (true) {
      if (spawnError) {
        break;
      }
      if (options.timeoutMs && Date.now() - startTime > options.timeoutMs) {
        timedOut = true;
        await killProcess();
        break;
      }
      if (options.signal?.aborted) {
        await killProcess();
        break;
      }
      if (options.onStdoutChunk) {
        // Forward only the part of stdout not reported yet.
        const currentStdout = await readFileIfExists(stdoutFile);
        if (currentStdout.length > lastStdoutSize) {
          options.onStdoutChunk(currentStdout.slice(lastStdoutSize));
          lastStdoutSize = currentStdout.length;
        }
      }
      if (await fileExists4(exitFile)) {
        break;
      }
      await new Promise((resolve) => setTimeout(resolve, pollInterval));
    }
    const stdout = await readFileIfExists(stdoutFile);
    let stderr = await readFileIfExists(stderrFile);
    if (spawnError && !stderr) {
      // Surface the launch failure through the normal result shape.
      stderr = `Failed to launch Claude Code CLI: ${spawnError.message}`;
    }
    const exitCodeStr = await readFileIfExists(exitFile);
    const exitCode = exitCodeStr.trim() ? Number.parseInt(exitCodeStr.trim(), 10) : -1;
    if (options.onStdoutChunk && stdout.length > lastStdoutSize) {
      options.onStdoutChunk(stdout.slice(lastStdoutSize));
    }
    if (options.onStderrChunk && stderr) {
      options.onStderrChunk(stderr);
    }
    return { stdout, stderr, exitCode, timedOut };
  } finally {
    options.signal?.removeEventListener("abort", abortHandler);
  }
}
2981
+
2982
+ // src/evaluation/providers/cli.ts
2983
+ var import_node_child_process2 = require("child_process");
2984
+ var import_promises9 = __toESM(require("fs/promises"), 1);
2985
+ var import_node_os2 = __toESM(require("os"), 1);
2986
+ var import_node_path10 = __toESM(require("path"), 1);
2987
+ var import_node_util = require("util");
2988
+ var import_zod = require("zod");
2989
+ var ToolCallSchema = import_zod.z.object({
2990
+ tool: import_zod.z.string(),
2991
+ input: import_zod.z.unknown().optional(),
2992
+ output: import_zod.z.unknown().optional(),
2993
+ id: import_zod.z.string().optional(),
2994
+ timestamp: import_zod.z.string().optional()
2995
+ });
2996
+ var OutputMessageInputSchema = import_zod.z.object({
2997
+ role: import_zod.z.string(),
2998
+ name: import_zod.z.string().optional(),
2999
+ content: import_zod.z.unknown().optional(),
3000
+ tool_calls: import_zod.z.array(ToolCallSchema).optional(),
3001
+ timestamp: import_zod.z.string().optional(),
3002
+ metadata: import_zod.z.record(import_zod.z.unknown()).optional()
3003
+ });
3004
+ var TokenUsageSchema = import_zod.z.object({
3005
+ input: import_zod.z.number(),
3006
+ output: import_zod.z.number(),
3007
+ cached: import_zod.z.number().optional()
3008
+ });
3009
+ var CliOutputSchema = import_zod.z.object({
3010
+ text: import_zod.z.unknown().optional(),
3011
+ output_messages: import_zod.z.array(OutputMessageInputSchema).optional(),
3012
+ token_usage: TokenUsageSchema.optional(),
3013
+ cost_usd: import_zod.z.number().optional(),
3014
+ duration_ms: import_zod.z.number().optional()
3015
+ });
3016
+ var CliJsonlRecordSchema = CliOutputSchema.extend({
3017
+ id: import_zod.z.string().min(1)
3018
+ });
3019
/**
 * Sanity-checks reported cost and duration metrics.
 *
 * Negative values are dropped (replaced by undefined) with a console warning;
 * everything else passes through untouched.
 *
 * @param {number | undefined} costUsd - Reported cost in USD.
 * @param {number | undefined} durationMs - Reported duration in milliseconds.
 * @param {string} context - Short label included in warning messages.
 * @returns {{costUsd: (number|undefined), durationMs: (number|undefined)}}
 */
function validateMetrics(costUsd, durationMs, context) {
  const checked = { costUsd, durationMs };
  if (costUsd !== void 0 && costUsd < 0) {
    console.warn(`[cli-provider] ${context}: ignoring negative cost_usd value (${costUsd})`);
    checked.costUsd = void 0;
  }
  if (durationMs !== void 0 && durationMs < 0) {
    console.warn(`[cli-provider] ${context}: ignoring negative duration_ms value (${durationMs})`);
    checked.durationMs = void 0;
  }
  return checked;
}
3032
/**
 * Maps validated CLI output messages (snake_case `tool_calls`) to the
 * internal message shape (camelCase `toolCalls`).
 *
 * @param {object[] | undefined} messages - Messages from the CLI output.
 * @returns {object[] | undefined} Converted messages, or undefined when the
 *   input is missing or empty.
 */
function convertOutputMessages(messages) {
  if (!messages || messages.length === 0) {
    return void 0;
  }
  return messages.map(({ role, name, content, tool_calls, timestamp, metadata }) => ({
    role,
    name,
    content,
    toolCalls: tool_calls,
    timestamp,
    metadata
  }));
}
3045
+ var execAsync = (0, import_node_util.promisify)(import_node_child_process2.exec);
3046
+ var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
3047
/**
 * Default command runner for the CLI provider: executes a shell command via
 * the promisified `exec` and normalizes the outcome into a result object
 * instead of throwing.
 *
 * On Windows the command is run through powershell.exe; elsewhere the
 * platform default shell is used.
 *
 * @param {string} command - Fully rendered command line.
 * @param {object} options - Reads `cwd`, `env`, `timeoutMs`, `signal`.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: (number|null),
 *   failed: boolean, timedOut: boolean, signal: (string|null)}>}
 */
async function defaultCommandRunner(command, options) {
  const { cwd, env, timeoutMs, signal } = options;
  const shell = process.platform === "win32" ? "powershell.exe" : void 0;
  const execOptions = { cwd, env, timeout: timeoutMs, signal, maxBuffer: DEFAULT_MAX_BUFFER, shell };
  let completed;
  try {
    completed = await execAsync(command, execOptions);
  } catch (error) {
    const failure = error;
    // A numeric `code` is the process exit status; anything else is reported as null.
    const exitCode = typeof failure.code === "number" ? failure.code : null;
    return {
      stdout: failure.stdout ?? "",
      stderr: failure.stderr ?? "",
      exitCode,
      failed: true,
      timedOut: failure.timedOut === true || failure.killed === true,
      signal: failure.signal ?? null
    };
  }
  const { stdout, stderr } = completed;
  return {
    stdout,
    stderr,
    exitCode: 0,
    failed: false,
    timedOut: false,
    signal: null
  };
}
3078
+ var CliProvider = class {
3079
+ id;
3080
+ kind = "cli";
3081
+ targetName;
3082
+ supportsBatch = true;
3083
+ config;
3084
+ runCommand;
3085
+ verbose;
3086
+ keepTempFiles;
3087
+ healthcheckPromise;
3088
+ constructor(targetName, config, runner = defaultCommandRunner) {
3089
+ this.targetName = targetName;
3090
+ this.id = `cli:${targetName}`;
3091
+ this.config = config;
3092
+ this.runCommand = runner;
3093
+ this.verbose = config.verbose ?? false;
3094
+ this.keepTempFiles = config.keepTempFiles ?? false;
3095
+ }
3096
+ async invoke(request) {
3097
+ if (request.signal?.aborted) {
3098
+ throw new Error("CLI provider request was aborted before execution");
3099
+ }
3100
+ await this.ensureHealthy(request.signal);
3101
+ const outputFilePath = generateOutputFilePath(request.evalCaseId);
3102
+ const templateValues = buildTemplateValues(request, this.config, outputFilePath);
3103
+ const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
3104
+ if (this.verbose) {
3105
+ console.log(
3106
+ `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
3107
+ );
3108
+ }
3109
+ const startTime = Date.now();
3110
+ const result = await this.runCommand(renderedCommand, {
3111
+ cwd: this.config.cwd,
3112
+ env: process.env,
3113
+ timeoutMs: this.config.timeoutMs,
3114
+ signal: request.signal
3115
+ });
3116
+ const measuredDurationMs = Date.now() - startTime;
3117
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
3118
+ if (request.signal?.aborted) {
3119
+ throw new Error("CLI provider request was aborted");
3120
+ }
3121
+ if (result.timedOut) {
3122
+ throw new Error(
3123
+ `CLI provider timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
3124
+ );
2019
3125
  }
2020
3126
  const codeText = result.exitCode !== null ? result.exitCode : "unknown";
2021
3127
  const detail = result.stderr.trim() || result.stdout.trim();
@@ -2090,7 +3196,7 @@ var CliProvider = class {
2090
3196
  }
2091
3197
  if (result.timedOut) {
2092
3198
  throw new Error(
2093
- `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
3199
+ `CLI provider timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
2094
3200
  );
2095
3201
  }
2096
3202
  const codeText = result.exitCode !== null ? result.exitCode : "unknown";
@@ -2100,11 +3206,6 @@ var CliProvider = class {
2100
3206
  }
2101
3207
  const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
2102
3208
  const recordsById = this.parseJsonlBatchOutput(responseContent);
2103
- const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
2104
- const missingIds = requestedIds.filter((id) => !recordsById.has(id));
2105
- if (missingIds.length > 0) {
2106
- throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
2107
- }
2108
3209
  const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
2109
3210
  const responses = requests.map((request) => {
2110
3211
  const evalCaseId = request.evalCaseId;
@@ -2123,15 +3224,20 @@ var CliProvider = class {
2123
3224
  }
2124
3225
  const parsed = recordsById.get(evalCaseId);
2125
3226
  if (!parsed) {
3227
+ const errorMessage = `Batch output missing id '${evalCaseId}'`;
3228
+ if (this.verbose) {
3229
+ console.warn(`[cli-provider:${this.targetName}] ${errorMessage}`);
3230
+ }
2126
3231
  return {
2127
- outputMessages: [],
3232
+ outputMessages: [{ role: "assistant", content: `Error: ${errorMessage}` }],
2128
3233
  durationMs: perRequestFallbackMs,
2129
3234
  raw: {
2130
3235
  command: renderedCommand,
2131
3236
  stderr: result.stderr,
2132
3237
  exitCode: result.exitCode ?? 0,
2133
3238
  cwd: this.config.cwd,
2134
- outputFile: outputFilePath
3239
+ outputFile: outputFilePath,
3240
+ error: errorMessage
2135
3241
  }
2136
3242
  };
2137
3243
  }
@@ -2164,101 +3270,37 @@ var CliProvider = class {
2164
3270
  * - duration_ms: number
2165
3271
  */
2166
3272
  parseOutputContent(content) {
3273
+ let parsed;
2167
3274
  try {
2168
- const parsed = JSON.parse(content);
2169
- if (typeof parsed === "object" && parsed !== null) {
2170
- const obj = parsed;
2171
- const tokenUsage = this.parseTokenUsage(obj.token_usage);
2172
- const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
2173
- const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
2174
- const outputMessages = this.parseOutputMessages(obj.output_messages);
2175
- if (outputMessages && outputMessages.length > 0) {
2176
- return { outputMessages, tokenUsage, costUsd, durationMs };
2177
- }
2178
- if ("text" in obj) {
2179
- const text = typeof obj.text === "string" ? obj.text : String(obj.text);
2180
- return {
2181
- outputMessages: [{ role: "assistant", content: text }],
2182
- tokenUsage,
2183
- costUsd,
2184
- durationMs
2185
- };
2186
- }
2187
- }
3275
+ parsed = JSON.parse(content);
2188
3276
  } catch {
3277
+ return { outputMessages: [{ role: "assistant", content }] };
2189
3278
  }
2190
- return { outputMessages: [{ role: "assistant", content }] };
2191
- }
2192
- /**
2193
- * Parse token_usage from CLI output.
2194
- */
2195
- parseTokenUsage(tokenUsage) {
2196
- if (typeof tokenUsage !== "object" || tokenUsage === null) {
2197
- return void 0;
2198
- }
2199
- const obj = tokenUsage;
2200
- if (typeof obj.input !== "number" || typeof obj.output !== "number") {
2201
- return void 0;
2202
- }
2203
- return {
2204
- input: obj.input,
2205
- output: obj.output,
2206
- cached: typeof obj.cached === "number" ? obj.cached : void 0
2207
- };
2208
- }
2209
- /**
2210
- * Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
2211
- */
2212
- parseOutputMessages(outputMessages) {
2213
- if (!Array.isArray(outputMessages)) {
2214
- return void 0;
2215
- }
2216
- const messages = [];
2217
- for (const msg of outputMessages) {
2218
- if (typeof msg !== "object" || msg === null) {
2219
- continue;
2220
- }
2221
- const rawMsg = msg;
2222
- if (typeof rawMsg.role !== "string") {
2223
- continue;
2224
- }
2225
- const message = {
2226
- role: rawMsg.role,
2227
- name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
2228
- content: rawMsg.content,
2229
- toolCalls: this.parseToolCalls(rawMsg.tool_calls),
2230
- timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
2231
- metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
2232
- };
2233
- messages.push(message);
2234
- }
2235
- return messages.length > 0 ? messages : void 0;
2236
- }
2237
- /**
2238
- * Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
2239
- */
2240
- parseToolCalls(toolCalls) {
2241
- if (!Array.isArray(toolCalls)) {
2242
- return void 0;
3279
+ const result = CliOutputSchema.safeParse(parsed);
3280
+ if (!result.success) {
3281
+ return { outputMessages: [{ role: "assistant", content }] };
2243
3282
  }
2244
- const calls = [];
2245
- for (const call of toolCalls) {
2246
- if (typeof call !== "object" || call === null) {
2247
- continue;
2248
- }
2249
- const rawCall = call;
2250
- if (typeof rawCall.tool !== "string") {
2251
- continue;
2252
- }
2253
- calls.push({
2254
- tool: rawCall.tool,
2255
- input: rawCall.input,
2256
- output: rawCall.output,
2257
- id: typeof rawCall.id === "string" ? rawCall.id : void 0,
2258
- timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
2259
- });
3283
+ const obj = result.data;
3284
+ const metrics = validateMetrics(obj.cost_usd, obj.duration_ms, "parsing output");
3285
+ const outputMessages = convertOutputMessages(obj.output_messages);
3286
+ if (outputMessages && outputMessages.length > 0) {
3287
+ return {
3288
+ outputMessages,
3289
+ tokenUsage: obj.token_usage,
3290
+ costUsd: metrics.costUsd,
3291
+ durationMs: metrics.durationMs
3292
+ };
3293
+ }
3294
+ if (obj.text !== void 0) {
3295
+ const text = typeof obj.text === "string" ? obj.text : String(obj.text);
3296
+ return {
3297
+ outputMessages: [{ role: "assistant", content: text }],
3298
+ tokenUsage: obj.token_usage,
3299
+ costUsd: metrics.costUsd,
3300
+ durationMs: metrics.durationMs
3301
+ };
2260
3302
  }
2261
- return calls.length > 0 ? calls : void 0;
3303
+ return { outputMessages: [{ role: "assistant", content }] };
2262
3304
  }
2263
3305
  parseJsonlBatchOutput(content) {
2264
3306
  const records = /* @__PURE__ */ new Map();
@@ -2271,33 +3313,32 @@ var CliProvider = class {
2271
3313
  const reason = error instanceof Error ? error.message : String(error);
2272
3314
  throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
2273
3315
  }
2274
- if (typeof parsed !== "object" || parsed === null) {
3316
+ const result = CliJsonlRecordSchema.safeParse(parsed);
3317
+ if (!result.success) {
3318
+ const firstError = result.error.errors[0];
3319
+ if (firstError?.path.includes("id")) {
3320
+ throw new Error("CLI batch output JSONL line missing required string field: id");
3321
+ }
2275
3322
  throw new Error("CLI batch output JSONL line must be an object");
2276
3323
  }
2277
- const obj = parsed;
2278
- const id = typeof obj.id === "string" ? obj.id : void 0;
2279
- if (!id || id.trim().length === 0) {
2280
- throw new Error("CLI batch output JSONL line missing required string field: id");
2281
- }
2282
- if (records.has(id)) {
2283
- throw new Error(`CLI batch output contains duplicate id: ${id}`);
2284
- }
2285
- const tokenUsage = this.parseTokenUsage(obj.token_usage);
2286
- const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
2287
- const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
2288
- const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
2289
- let outputMessages;
2290
- if (parsedOutputMessages && parsedOutputMessages.length > 0) {
2291
- outputMessages = parsedOutputMessages;
3324
+ const obj = result.data;
3325
+ if (records.has(obj.id)) {
3326
+ throw new Error(`CLI batch output contains duplicate id: ${obj.id}`);
3327
+ }
3328
+ const outputMessages = convertOutputMessages(obj.output_messages);
3329
+ let finalOutputMessages;
3330
+ if (outputMessages && outputMessages.length > 0) {
3331
+ finalOutputMessages = outputMessages;
2292
3332
  } else {
2293
3333
  const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
2294
- outputMessages = text ? [{ role: "assistant", content: text }] : [];
2295
- }
2296
- records.set(id, {
2297
- outputMessages,
2298
- tokenUsage,
2299
- costUsd,
2300
- durationMs
3334
+ finalOutputMessages = text ? [{ role: "assistant", content: text }] : [];
3335
+ }
3336
+ const metrics = validateMetrics(obj.cost_usd, obj.duration_ms, `batch record '${obj.id}'`);
3337
+ records.set(obj.id, {
3338
+ outputMessages: finalOutputMessages,
3339
+ tokenUsage: obj.token_usage,
3340
+ costUsd: metrics.costUsd,
3341
+ durationMs: metrics.durationMs
2301
3342
  });
2302
3343
  }
2303
3344
  return records;
@@ -2311,7 +3352,7 @@ var CliProvider = class {
2311
3352
  throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
2312
3353
  } finally {
2313
3354
  if (!this.keepTempFiles) {
2314
- await import_promises8.default.unlink(filePath).catch(() => {
3355
+ await import_promises9.default.unlink(filePath).catch(() => {
2315
3356
  });
2316
3357
  }
2317
3358
  }
@@ -2383,7 +3424,7 @@ var CliProvider = class {
2383
3424
  }
2384
3425
  };
2385
3426
  function buildTemplateValues(request, config, outputFilePath) {
2386
- const inputFiles = normalizeInputFiles(request.inputFiles);
3427
+ const inputFiles = normalizeInputFiles2(request.inputFiles);
2387
3428
  return {
2388
3429
  PROMPT: shellEscape(request.question ?? ""),
2389
3430
  GUIDELINES: shellEscape(request.guidelines ?? ""),
@@ -2393,13 +3434,13 @@ function buildTemplateValues(request, config, outputFilePath) {
2393
3434
  OUTPUT_FILE: shellEscape(outputFilePath)
2394
3435
  };
2395
3436
  }
2396
- function normalizeInputFiles(inputFiles) {
3437
+ function normalizeInputFiles2(inputFiles) {
2397
3438
  if (!inputFiles || inputFiles.length === 0) {
2398
3439
  return void 0;
2399
3440
  }
2400
3441
  const unique = /* @__PURE__ */ new Map();
2401
3442
  for (const inputFile of inputFiles) {
2402
- const absolutePath = import_node_path8.default.resolve(inputFile);
3443
+ const absolutePath = import_node_path10.default.resolve(inputFile);
2403
3444
  if (!unique.has(absolutePath)) {
2404
3445
  unique.set(absolutePath, absolutePath);
2405
3446
  }
@@ -2413,7 +3454,7 @@ function formatFileList(files, template) {
2413
3454
  const formatter = template ?? "{path}";
2414
3455
  return files.map((filePath) => {
2415
3456
  const escapedPath = shellEscape(filePath);
2416
- const escapedName = shellEscape(import_node_path8.default.basename(filePath));
3457
+ const escapedName = shellEscape(import_node_path10.default.basename(filePath));
2417
3458
  return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
2418
3459
  }).join(" ");
2419
3460
  }
@@ -2437,9 +3478,9 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
2437
3478
  const safeEvalId = evalCaseId || "unknown";
2438
3479
  const timestamp = Date.now();
2439
3480
  const random = Math.random().toString(36).substring(2, 9);
2440
- return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
3481
+ return import_node_path10.default.join(import_node_os2.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
2441
3482
  }
2442
- function formatTimeoutSuffix(timeoutMs) {
3483
+ function formatTimeoutSuffix2(timeoutMs) {
2443
3484
  if (!timeoutMs || timeoutMs <= 0) {
2444
3485
  return "";
2445
3486
  }
@@ -2448,39 +3489,39 @@ function formatTimeoutSuffix(timeoutMs) {
2448
3489
  }
2449
3490
 
2450
3491
  // src/evaluation/providers/codex.ts
2451
- var import_node_child_process2 = require("child_process");
2452
- var import_node_crypto = require("crypto");
2453
- var import_node_fs3 = require("fs");
2454
- var import_promises9 = require("fs/promises");
2455
- var import_node_os2 = require("os");
2456
- var import_node_path10 = __toESM(require("path"), 1);
3492
+ var import_node_child_process3 = require("child_process");
3493
+ var import_node_crypto2 = require("crypto");
3494
+ var import_node_fs4 = require("fs");
3495
+ var import_promises10 = require("fs/promises");
3496
+ var import_node_os3 = require("os");
3497
+ var import_node_path11 = __toESM(require("path"), 1);
2457
3498
  var import_node_util2 = require("util");
2458
3499
 
2459
3500
  // src/evaluation/providers/codex-log-tracker.ts
2460
- var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
2461
- var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
3501
+ var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.codexLogs");
3502
+ var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.codexLogSubscribers");
2462
3503
  function getCodexLogStore() {
2463
3504
  const globalObject = globalThis;
2464
- const existing = globalObject[GLOBAL_LOGS_KEY];
3505
+ const existing = globalObject[GLOBAL_LOGS_KEY2];
2465
3506
  if (existing) {
2466
3507
  return existing;
2467
3508
  }
2468
3509
  const created = [];
2469
- globalObject[GLOBAL_LOGS_KEY] = created;
3510
+ globalObject[GLOBAL_LOGS_KEY2] = created;
2470
3511
  return created;
2471
3512
  }
2472
- function getSubscriberStore() {
3513
+ function getSubscriberStore2() {
2473
3514
  const globalObject = globalThis;
2474
- const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
3515
+ const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
2475
3516
  if (existing) {
2476
3517
  return existing;
2477
3518
  }
2478
3519
  const created = /* @__PURE__ */ new Set();
2479
- globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
3520
+ globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
2480
3521
  return created;
2481
3522
  }
2482
- function notifySubscribers(entry) {
2483
- const subscribers = Array.from(getSubscriberStore());
3523
+ function notifySubscribers2(entry) {
3524
+ const subscribers = Array.from(getSubscriberStore2());
2484
3525
  for (const listener of subscribers) {
2485
3526
  try {
2486
3527
  listener(entry);
@@ -2492,7 +3533,7 @@ function notifySubscribers(entry) {
2492
3533
  }
2493
3534
  function recordCodexLogEntry(entry) {
2494
3535
  getCodexLogStore().push(entry);
2495
- notifySubscribers(entry);
3536
+ notifySubscribers2(entry);
2496
3537
  }
2497
3538
  function consumeCodexLogEntries() {
2498
3539
  const store = getCodexLogStore();
@@ -2502,118 +3543,19 @@ function consumeCodexLogEntries() {
2502
3543
  return store.splice(0, store.length);
2503
3544
  }
2504
3545
  function subscribeToCodexLogEntries(listener) {
2505
- const store = getSubscriberStore();
3546
+ const store = getSubscriberStore2();
2506
3547
  store.add(listener);
2507
3548
  return () => {
2508
3549
  store.delete(listener);
2509
3550
  };
2510
3551
  }
2511
3552
 
2512
- // src/evaluation/providers/preread.ts
2513
- var import_node_path9 = __toESM(require("path"), 1);
2514
- function buildPromptDocument(request, inputFiles, options) {
2515
- const parts = [];
2516
- const guidelineFiles = collectGuidelineFiles(
2517
- inputFiles,
2518
- options?.guidelinePatterns ?? request.guideline_patterns,
2519
- options?.guidelineOverrides
2520
- );
2521
- const inputFilesList = collectInputFiles(inputFiles);
2522
- const nonGuidelineInputFiles = inputFilesList.filter((file) => !guidelineFiles.includes(file));
2523
- const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
2524
- if (prereadBlock.length > 0) {
2525
- parts.push("\n", prereadBlock);
2526
- }
2527
- parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
2528
- return parts.join("\n").trim();
2529
- }
2530
- function normalizeInputFiles2(inputFiles) {
2531
- if (!inputFiles || inputFiles.length === 0) {
2532
- return void 0;
2533
- }
2534
- const deduped = /* @__PURE__ */ new Map();
2535
- for (const inputFile of inputFiles) {
2536
- const absolutePath = import_node_path9.default.resolve(inputFile);
2537
- if (!deduped.has(absolutePath)) {
2538
- deduped.set(absolutePath, absolutePath);
2539
- }
2540
- }
2541
- return Array.from(deduped.values());
2542
- }
2543
- function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
2544
- if (!inputFiles || inputFiles.length === 0) {
2545
- return [];
2546
- }
2547
- const unique = /* @__PURE__ */ new Map();
2548
- for (const inputFile of inputFiles) {
2549
- const absolutePath = import_node_path9.default.resolve(inputFile);
2550
- if (overrides?.has(absolutePath)) {
2551
- if (!unique.has(absolutePath)) {
2552
- unique.set(absolutePath, absolutePath);
2553
- }
2554
- continue;
2555
- }
2556
- const normalized = absolutePath.split(import_node_path9.default.sep).join("/");
2557
- if (isGuidelineFile(normalized, guidelinePatterns)) {
2558
- if (!unique.has(absolutePath)) {
2559
- unique.set(absolutePath, absolutePath);
2560
- }
2561
- }
2562
- }
2563
- return Array.from(unique.values());
2564
- }
2565
- function collectInputFiles(inputFiles) {
2566
- if (!inputFiles || inputFiles.length === 0) {
2567
- return [];
2568
- }
2569
- const unique = /* @__PURE__ */ new Map();
2570
- for (const inputFile of inputFiles) {
2571
- const absolutePath = import_node_path9.default.resolve(inputFile);
2572
- if (!unique.has(absolutePath)) {
2573
- unique.set(absolutePath, absolutePath);
2574
- }
2575
- }
2576
- return Array.from(unique.values());
2577
- }
2578
- function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
2579
- if (guidelineFiles.length === 0 && inputFiles.length === 0) {
2580
- return "";
2581
- }
2582
- const buildList = (files) => files.map((absolutePath) => {
2583
- const fileName = import_node_path9.default.basename(absolutePath);
2584
- const fileUri = pathToFileUri(absolutePath);
2585
- return `* [${fileName}](${fileUri})`;
2586
- });
2587
- const sections = [];
2588
- if (guidelineFiles.length > 0) {
2589
- sections.push(`Read all guideline files:
2590
- ${buildList(guidelineFiles).join("\n")}.`);
2591
- }
2592
- if (inputFiles.length > 0) {
2593
- sections.push(`Read all input files:
2594
- ${buildList(inputFiles).join("\n")}.`);
2595
- }
2596
- sections.push(
2597
- "If any file is missing, fail with ERROR: missing-file <filename> and stop.",
2598
- "Then apply system_instructions on the user query below."
2599
- );
2600
- return sections.join("\n");
2601
- }
2602
- function pathToFileUri(filePath) {
2603
- const absolutePath = import_node_path9.default.isAbsolute(filePath) ? filePath : import_node_path9.default.resolve(filePath);
2604
- const normalizedPath = absolutePath.replace(/\\/g, "/");
2605
- if (/^[a-zA-Z]:\//.test(normalizedPath)) {
2606
- return `file:///${normalizedPath}`;
2607
- }
2608
- return `file://${normalizedPath}`;
2609
- }
2610
-
2611
3553
  // src/evaluation/providers/codex.ts
2612
- var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exec);
2613
- var WORKSPACE_PREFIX = "agentv-codex-";
2614
- var PROMPT_FILENAME = "prompt.md";
3554
+ var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process3.exec);
3555
+ var WORKSPACE_PREFIX2 = "agentv-codex-";
3556
+ var PROMPT_FILENAME2 = "prompt.md";
2615
3557
  var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
2616
- var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
3558
+ var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
2617
3559
  - Do NOT create any additional output files in the workspace.
2618
3560
  - All intended file outputs/changes MUST be written in your response.
2619
3561
  - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
@@ -2638,27 +3580,27 @@ var CodexProvider = class {
2638
3580
  throw new Error("Codex provider request was aborted before execution");
2639
3581
  }
2640
3582
  await this.ensureEnvironmentReady();
2641
- const inputFiles = normalizeInputFiles2(request.inputFiles);
3583
+ const inputFiles = normalizeInputFiles(request.inputFiles);
2642
3584
  const workspaceRoot = await this.createWorkspace();
2643
3585
  const logger = await this.createStreamLogger(request).catch(() => void 0);
2644
3586
  try {
2645
3587
  const basePrompt = buildPromptDocument(request, inputFiles);
2646
- const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
3588
+ const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
2647
3589
  const promptContent = `${systemPrompt}
2648
3590
 
2649
3591
  ${basePrompt}`;
2650
- const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
2651
- await (0, import_promises9.writeFile)(promptFile, promptContent, "utf8");
3592
+ const promptFile = import_node_path11.default.join(workspaceRoot, PROMPT_FILENAME2);
3593
+ await (0, import_promises10.writeFile)(promptFile, promptContent, "utf8");
2652
3594
  const args = this.buildCodexArgs();
2653
3595
  const cwd = this.resolveCwd(workspaceRoot);
2654
3596
  const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
2655
3597
  if (result.timedOut) {
2656
3598
  throw new Error(
2657
- `Codex CLI timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
3599
+ `Codex CLI timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
2658
3600
  );
2659
3601
  }
2660
3602
  if (result.exitCode !== 0) {
2661
- const detail = pickDetail(result.stderr, result.stdout);
3603
+ const detail = pickDetail2(result.stderr, result.stdout);
2662
3604
  const prefix = `Codex CLI exited with code ${result.exitCode}`;
2663
3605
  throw new Error(detail ? `${prefix}: ${detail}` : prefix);
2664
3606
  }
@@ -2697,7 +3639,7 @@ ${basePrompt}`;
2697
3639
  if (!this.config.cwd) {
2698
3640
  return workspaceRoot;
2699
3641
  }
2700
- return import_node_path10.default.resolve(this.config.cwd);
3642
+ return import_node_path11.default.resolve(this.config.cwd);
2701
3643
  }
2702
3644
  buildCodexArgs() {
2703
3645
  const args = [
@@ -2739,11 +3681,11 @@ ${basePrompt}`;
2739
3681
  }
2740
3682
  }
2741
3683
  async createWorkspace() {
2742
- return await (0, import_promises9.mkdtemp)(import_node_path10.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
3684
+ return await (0, import_promises10.mkdtemp)(import_node_path11.default.join((0, import_node_os3.tmpdir)(), WORKSPACE_PREFIX2));
2743
3685
  }
2744
3686
  async cleanupWorkspace(workspaceRoot) {
2745
3687
  try {
2746
- await (0, import_promises9.rm)(workspaceRoot, { recursive: true, force: true });
3688
+ await (0, import_promises10.rm)(workspaceRoot, { recursive: true, force: true });
2747
3689
  } catch {
2748
3690
  }
2749
3691
  }
@@ -2753,9 +3695,9 @@ ${basePrompt}`;
2753
3695
  return void 0;
2754
3696
  }
2755
3697
  if (this.config.logDir) {
2756
- return import_node_path10.default.resolve(this.config.logDir);
3698
+ return import_node_path11.default.resolve(this.config.logDir);
2757
3699
  }
2758
- return import_node_path10.default.join(process.cwd(), ".agentv", "logs", "codex");
3700
+ return import_node_path11.default.join(process.cwd(), ".agentv", "logs", "codex");
2759
3701
  }
2760
3702
  async createStreamLogger(request) {
2761
3703
  const logDir = this.resolveLogDirectory();
@@ -2763,13 +3705,13 @@ ${basePrompt}`;
2763
3705
  return void 0;
2764
3706
  }
2765
3707
  try {
2766
- await (0, import_promises9.mkdir)(logDir, { recursive: true });
3708
+ await (0, import_promises10.mkdir)(logDir, { recursive: true });
2767
3709
  } catch (error) {
2768
3710
  const message = error instanceof Error ? error.message : String(error);
2769
3711
  console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
2770
3712
  return void 0;
2771
3713
  }
2772
- const filePath = import_node_path10.default.join(logDir, buildLogFilename(request, this.targetName));
3714
+ const filePath = import_node_path11.default.join(logDir, buildLogFilename2(request, this.targetName));
2773
3715
  try {
2774
3716
  const logger = await CodexStreamLogger.create({
2775
3717
  filePath,
@@ -2802,7 +3744,7 @@ var CodexStreamLogger = class _CodexStreamLogger {
2802
3744
  constructor(filePath, format) {
2803
3745
  this.filePath = filePath;
2804
3746
  this.format = format;
2805
- this.stream = (0, import_node_fs3.createWriteStream)(filePath, { flags: "a" });
3747
+ this.stream = (0, import_node_fs4.createWriteStream)(filePath, { flags: "a" });
2806
3748
  }
2807
3749
  static async create(options) {
2808
3750
  const logger = new _CodexStreamLogger(options.filePath, options.format);
@@ -2863,7 +3805,7 @@ var CodexStreamLogger = class _CodexStreamLogger {
2863
3805
  return void 0;
2864
3806
  }
2865
3807
  const message = this.format === "json" ? formatCodexJsonLog(trimmed) : formatCodexLogMessage(trimmed, source);
2866
- return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`;
3808
+ return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
2867
3809
  }
2868
3810
  flushRemainder() {
2869
3811
  const stdoutRemainder = this.stdoutBuffer.trim();
@@ -2894,18 +3836,18 @@ function isCodexLogStreamingDisabled() {
2894
3836
  const normalized = envValue.trim().toLowerCase();
2895
3837
  return normalized === "false" || normalized === "0" || normalized === "off";
2896
3838
  }
2897
- function buildLogFilename(request, targetName) {
3839
+ function buildLogFilename2(request, targetName) {
2898
3840
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
2899
- const evalId = sanitizeForFilename(request.evalCaseId ?? "codex");
3841
+ const evalId = sanitizeForFilename2(request.evalCaseId ?? "codex");
2900
3842
  const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
2901
- const target = sanitizeForFilename(targetName);
2902
- return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0, import_node_crypto.randomUUID)().slice(0, 8)}.log`;
3843
+ const target = sanitizeForFilename2(targetName);
3844
+ return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0, import_node_crypto2.randomUUID)().slice(0, 8)}.log`;
2903
3845
  }
2904
- function sanitizeForFilename(value) {
3846
+ function sanitizeForFilename2(value) {
2905
3847
  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
2906
3848
  return sanitized.length > 0 ? sanitized : "codex";
2907
3849
  }
2908
- function formatElapsed(startedAt) {
3850
+ function formatElapsed2(startedAt) {
2909
3851
  const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
2910
3852
  const hours = Math.floor(elapsedSeconds / 3600);
2911
3853
  const minutes = Math.floor(elapsedSeconds % 3600 / 60);
@@ -2916,7 +3858,7 @@ function formatElapsed(startedAt) {
2916
3858
  return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
2917
3859
  }
2918
3860
  function formatCodexLogMessage(rawLine, source) {
2919
- const parsed = tryParseJsonValue(rawLine);
3861
+ const parsed = tryParseJsonValue2(rawLine);
2920
3862
  if (parsed) {
2921
3863
  const summary = summarizeCodexEvent(parsed);
2922
3864
  if (summary) {
@@ -2929,7 +3871,7 @@ function formatCodexLogMessage(rawLine, source) {
2929
3871
  return rawLine;
2930
3872
  }
2931
3873
  function formatCodexJsonLog(rawLine) {
2932
- const parsed = tryParseJsonValue(rawLine);
3874
+ const parsed = tryParseJsonValue2(rawLine);
2933
3875
  if (!parsed) {
2934
3876
  return rawLine;
2935
3877
  }
@@ -2974,7 +3916,7 @@ function summarizeCodexEvent(event) {
2974
3916
  }
2975
3917
  return type;
2976
3918
  }
2977
- function tryParseJsonValue(rawLine) {
3919
+ function tryParseJsonValue2(rawLine) {
2978
3920
  try {
2979
3921
  return JSON.parse(rawLine);
2980
3922
  } catch {
@@ -2984,9 +3926,9 @@ function tryParseJsonValue(rawLine) {
2984
3926
  async function locateExecutable(candidate) {
2985
3927
  const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
2986
3928
  if (includesPathSeparator) {
2987
- const resolved = import_node_path10.default.isAbsolute(candidate) ? candidate : import_node_path10.default.resolve(candidate);
3929
+ const resolved = import_node_path11.default.isAbsolute(candidate) ? candidate : import_node_path11.default.resolve(candidate);
2988
3930
  const executablePath = await ensureWindowsExecutableVariant(resolved);
2989
- await (0, import_promises9.access)(executablePath, import_node_fs3.constants.F_OK);
3931
+ await (0, import_promises10.access)(executablePath, import_node_fs4.constants.F_OK);
2990
3932
  return executablePath;
2991
3933
  }
2992
3934
  const locator = process.platform === "win32" ? "where" : "which";
@@ -2996,7 +3938,7 @@ async function locateExecutable(candidate) {
2996
3938
  const preferred = selectExecutableCandidate(lines);
2997
3939
  if (preferred) {
2998
3940
  const executablePath = await ensureWindowsExecutableVariant(preferred);
2999
- await (0, import_promises9.access)(executablePath, import_node_fs3.constants.F_OK);
3941
+ await (0, import_promises10.access)(executablePath, import_node_fs4.constants.F_OK);
3000
3942
  return executablePath;
3001
3943
  }
3002
3944
  } catch {
@@ -3030,7 +3972,7 @@ async function ensureWindowsExecutableVariant(candidate) {
3030
3972
  for (const ext of extensions) {
3031
3973
  const withExtension = `${candidate}${ext}`;
3032
3974
  try {
3033
- await (0, import_promises9.access)(withExtension, import_node_fs3.constants.F_OK);
3975
+ await (0, import_promises10.access)(withExtension, import_node_fs4.constants.F_OK);
3034
3976
  return withExtension;
3035
3977
  } catch {
3036
3978
  }
@@ -3203,7 +4145,7 @@ function parseJsonLines(output) {
3203
4145
  }
3204
4146
  return parsed;
3205
4147
  }
3206
- function pickDetail(stderr, stdout) {
4148
+ function pickDetail2(stderr, stdout) {
3207
4149
  const errorText = stderr.trim();
3208
4150
  if (errorText.length > 0) {
3209
4151
  return errorText;
@@ -3211,7 +4153,7 @@ function pickDetail(stderr, stdout) {
3211
4153
  const stdoutText = stdout.trim();
3212
4154
  return stdoutText.length > 0 ? stdoutText : void 0;
3213
4155
  }
3214
- function formatTimeoutSuffix2(timeoutMs) {
4156
+ function formatTimeoutSuffix3(timeoutMs) {
3215
4157
  if (!timeoutMs || timeoutMs <= 0) {
3216
4158
  return "";
3217
4159
  }
@@ -3220,7 +4162,7 @@ function formatTimeoutSuffix2(timeoutMs) {
3220
4162
  }
3221
4163
  async function defaultCodexRunner(options) {
3222
4164
  return await new Promise((resolve, reject) => {
3223
- const child = (0, import_node_child_process2.spawn)(options.executable, options.args, {
4165
+ const child = (0, import_node_child_process3.spawn)(options.executable, options.args, {
3224
4166
  cwd: options.cwd,
3225
4167
  env: options.env,
3226
4168
  stdio: ["pipe", "pipe", "pipe"],
@@ -3331,38 +4273,38 @@ var MockProvider = class {
3331
4273
  };
3332
4274
 
3333
4275
  // src/evaluation/providers/pi-coding-agent.ts
3334
- var import_node_child_process3 = require("child_process");
3335
- var import_node_crypto2 = require("crypto");
3336
- var import_node_fs4 = require("fs");
3337
- var import_promises10 = require("fs/promises");
3338
- var import_node_os3 = require("os");
3339
- var import_node_path11 = __toESM(require("path"), 1);
4276
+ var import_node_child_process4 = require("child_process");
4277
+ var import_node_crypto3 = require("crypto");
4278
+ var import_node_fs5 = require("fs");
4279
+ var import_promises11 = require("fs/promises");
4280
+ var import_node_os4 = require("os");
4281
+ var import_node_path12 = __toESM(require("path"), 1);
3340
4282
 
3341
4283
  // src/evaluation/providers/pi-log-tracker.ts
3342
- var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.piLogs");
3343
- var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.piLogSubscribers");
4284
+ var GLOBAL_LOGS_KEY3 = Symbol.for("agentv.piLogs");
4285
+ var GLOBAL_SUBSCRIBERS_KEY3 = Symbol.for("agentv.piLogSubscribers");
3344
4286
  function getPiLogStore() {
3345
4287
  const globalObject = globalThis;
3346
- const existing = globalObject[GLOBAL_LOGS_KEY2];
4288
+ const existing = globalObject[GLOBAL_LOGS_KEY3];
3347
4289
  if (existing) {
3348
4290
  return existing;
3349
4291
  }
3350
4292
  const created = [];
3351
- globalObject[GLOBAL_LOGS_KEY2] = created;
4293
+ globalObject[GLOBAL_LOGS_KEY3] = created;
3352
4294
  return created;
3353
4295
  }
3354
- function getSubscriberStore2() {
4296
+ function getSubscriberStore3() {
3355
4297
  const globalObject = globalThis;
3356
- const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
4298
+ const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY3];
3357
4299
  if (existing) {
3358
4300
  return existing;
3359
4301
  }
3360
4302
  const created = /* @__PURE__ */ new Set();
3361
- globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
4303
+ globalObject[GLOBAL_SUBSCRIBERS_KEY3] = created;
3362
4304
  return created;
3363
4305
  }
3364
- function notifySubscribers2(entry) {
3365
- const subscribers = Array.from(getSubscriberStore2());
4306
+ function notifySubscribers3(entry) {
4307
+ const subscribers = Array.from(getSubscriberStore3());
3366
4308
  for (const listener of subscribers) {
3367
4309
  try {
3368
4310
  listener(entry);
@@ -3374,7 +4316,7 @@ function notifySubscribers2(entry) {
3374
4316
  }
3375
4317
  function recordPiLogEntry(entry) {
3376
4318
  getPiLogStore().push(entry);
3377
- notifySubscribers2(entry);
4319
+ notifySubscribers3(entry);
3378
4320
  }
3379
4321
  function consumePiLogEntries() {
3380
4322
  const store = getPiLogStore();
@@ -3384,7 +4326,7 @@ function consumePiLogEntries() {
3384
4326
  return store.splice(0, store.length);
3385
4327
  }
3386
4328
  function subscribeToPiLogEntries(listener) {
3387
- const store = getSubscriberStore2();
4329
+ const store = getSubscriberStore3();
3388
4330
  store.add(listener);
3389
4331
  return () => {
3390
4332
  store.delete(listener);
@@ -3392,9 +4334,9 @@ function subscribeToPiLogEntries(listener) {
3392
4334
  }
3393
4335
 
3394
4336
  // src/evaluation/providers/pi-coding-agent.ts
3395
- var WORKSPACE_PREFIX2 = "agentv-pi-";
3396
- var PROMPT_FILENAME2 = "prompt.md";
3397
- var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
4337
+ var WORKSPACE_PREFIX3 = "agentv-pi-";
4338
+ var PROMPT_FILENAME3 = "prompt.md";
4339
+ var DEFAULT_SYSTEM_PROMPT4 = `**IMPORTANT**: Follow these instructions for your response:
3398
4340
  - Do NOT create any additional output files in the workspace.
3399
4341
  - All intended file outputs/changes MUST be written in your response.
3400
4342
  - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
@@ -3416,27 +4358,27 @@ var PiCodingAgentProvider = class {
3416
4358
  if (request.signal?.aborted) {
3417
4359
  throw new Error("Pi coding agent request was aborted before execution");
3418
4360
  }
3419
- const inputFiles = normalizeInputFiles2(request.inputFiles);
4361
+ const inputFiles = normalizeInputFiles(request.inputFiles);
3420
4362
  const workspaceRoot = await this.createWorkspace();
3421
4363
  const logger = await this.createStreamLogger(request).catch(() => void 0);
3422
4364
  try {
3423
- const promptFile = import_node_path11.default.join(workspaceRoot, PROMPT_FILENAME2);
3424
- await (0, import_promises10.writeFile)(promptFile, request.question, "utf8");
4365
+ const promptFile = import_node_path12.default.join(workspaceRoot, PROMPT_FILENAME3);
4366
+ await (0, import_promises11.writeFile)(promptFile, request.question, "utf8");
3425
4367
  const args = this.buildPiArgs(request.question, inputFiles);
3426
4368
  const cwd = this.resolveCwd(workspaceRoot);
3427
4369
  const result = await this.executePi(args, cwd, request.signal, logger);
3428
4370
  if (result.timedOut) {
3429
4371
  throw new Error(
3430
- `Pi coding agent timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
4372
+ `Pi coding agent timed out${formatTimeoutSuffix4(this.config.timeoutMs ?? void 0)}`
3431
4373
  );
3432
4374
  }
3433
4375
  if (result.exitCode !== 0) {
3434
- const detail = pickDetail2(result.stderr, result.stdout);
4376
+ const detail = pickDetail3(result.stderr, result.stdout);
3435
4377
  const prefix = `Pi coding agent exited with code ${result.exitCode}`;
3436
4378
  throw new Error(detail ? `${prefix}: ${detail}` : prefix);
3437
4379
  }
3438
4380
  const parsed = parsePiJsonl(result.stdout);
3439
- const outputMessages = extractOutputMessages(parsed);
4381
+ const outputMessages = extractOutputMessages2(parsed);
3440
4382
  const assistantText = extractAssistantText2(outputMessages);
3441
4383
  return {
3442
4384
  raw: {
@@ -3462,7 +4404,7 @@ var PiCodingAgentProvider = class {
3462
4404
  if (!this.config.cwd) {
3463
4405
  return workspaceRoot;
3464
4406
  }
3465
- return import_node_path11.default.resolve(this.config.cwd);
4407
+ return import_node_path12.default.resolve(this.config.cwd);
3466
4408
  }
3467
4409
  buildPiArgs(prompt, inputFiles) {
3468
4410
  const args = [];
@@ -3492,7 +4434,7 @@ var PiCodingAgentProvider = class {
3492
4434
  args.push(`@${file}`);
3493
4435
  }
3494
4436
  }
3495
- const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
4437
+ const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT4;
3496
4438
  const fullPrompt = `${systemPrompt}
3497
4439
 
3498
4440
  ${prompt}`;
@@ -3551,19 +4493,19 @@ ${prompt}`;
3551
4493
  return env;
3552
4494
  }
3553
4495
  async createWorkspace() {
3554
- return await (0, import_promises10.mkdtemp)(import_node_path11.default.join((0, import_node_os3.tmpdir)(), WORKSPACE_PREFIX2));
4496
+ return await (0, import_promises11.mkdtemp)(import_node_path12.default.join((0, import_node_os4.tmpdir)(), WORKSPACE_PREFIX3));
3555
4497
  }
3556
4498
  async cleanupWorkspace(workspaceRoot) {
3557
4499
  try {
3558
- await (0, import_promises10.rm)(workspaceRoot, { recursive: true, force: true });
4500
+ await (0, import_promises11.rm)(workspaceRoot, { recursive: true, force: true });
3559
4501
  } catch {
3560
4502
  }
3561
4503
  }
3562
4504
  resolveLogDirectory() {
3563
4505
  if (this.config.logDir) {
3564
- return import_node_path11.default.resolve(this.config.logDir);
4506
+ return import_node_path12.default.resolve(this.config.logDir);
3565
4507
  }
3566
- return import_node_path11.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
4508
+ return import_node_path12.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
3567
4509
  }
3568
4510
  async createStreamLogger(request) {
3569
4511
  const logDir = this.resolveLogDirectory();
@@ -3571,13 +4513,13 @@ ${prompt}`;
3571
4513
  return void 0;
3572
4514
  }
3573
4515
  try {
3574
- await (0, import_promises10.mkdir)(logDir, { recursive: true });
4516
+ await (0, import_promises11.mkdir)(logDir, { recursive: true });
3575
4517
  } catch (error) {
3576
4518
  const message = error instanceof Error ? error.message : String(error);
3577
4519
  console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
3578
4520
  return void 0;
3579
4521
  }
3580
- const filePath = import_node_path11.default.join(logDir, buildLogFilename2(request, this.targetName));
4522
+ const filePath = import_node_path12.default.join(logDir, buildLogFilename3(request, this.targetName));
3581
4523
  try {
3582
4524
  const logger = await PiStreamLogger.create({
3583
4525
  filePath,
@@ -3610,7 +4552,7 @@ var PiStreamLogger = class _PiStreamLogger {
3610
4552
  constructor(filePath, format) {
3611
4553
  this.filePath = filePath;
3612
4554
  this.format = format;
3613
- this.stream = (0, import_node_fs4.createWriteStream)(filePath, { flags: "a" });
4555
+ this.stream = (0, import_node_fs5.createWriteStream)(filePath, { flags: "a" });
3614
4556
  }
3615
4557
  static async create(options) {
3616
4558
  const logger = new _PiStreamLogger(options.filePath, options.format);
@@ -3671,7 +4613,7 @@ var PiStreamLogger = class _PiStreamLogger {
3671
4613
  return void 0;
3672
4614
  }
3673
4615
  const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source);
3674
- return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
4616
+ return `[+${formatElapsed3(this.startedAt)}] [${source}] ${message}`;
3675
4617
  }
3676
4618
  flushRemainder() {
3677
4619
  const stdoutRemainder = this.stdoutBuffer.trim();
@@ -3694,18 +4636,18 @@ var PiStreamLogger = class _PiStreamLogger {
3694
4636
  this.stderrBuffer = "";
3695
4637
  }
3696
4638
  };
3697
- function buildLogFilename2(request, targetName) {
4639
+ function buildLogFilename3(request, targetName) {
3698
4640
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
3699
- const evalId = sanitizeForFilename2(request.evalCaseId ?? "pi");
4641
+ const evalId = sanitizeForFilename3(request.evalCaseId ?? "pi");
3700
4642
  const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
3701
- const target = sanitizeForFilename2(targetName);
3702
- return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0, import_node_crypto2.randomUUID)().slice(0, 8)}.log`;
4643
+ const target = sanitizeForFilename3(targetName);
4644
+ return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0, import_node_crypto3.randomUUID)().slice(0, 8)}.log`;
3703
4645
  }
3704
- function sanitizeForFilename2(value) {
4646
+ function sanitizeForFilename3(value) {
3705
4647
  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
3706
4648
  return sanitized.length > 0 ? sanitized : "pi";
3707
4649
  }
3708
- function formatElapsed2(startedAt) {
4650
+ function formatElapsed3(startedAt) {
3709
4651
  const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
3710
4652
  const hours = Math.floor(elapsedSeconds / 3600);
3711
4653
  const minutes = Math.floor(elapsedSeconds % 3600 / 60);
@@ -3716,7 +4658,7 @@ function formatElapsed2(startedAt) {
3716
4658
  return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
3717
4659
  }
3718
4660
  function formatPiLogMessage(rawLine, source) {
3719
- const parsed = tryParseJsonValue2(rawLine);
4661
+ const parsed = tryParseJsonValue3(rawLine);
3720
4662
  if (parsed) {
3721
4663
  const summary = summarizePiEvent(parsed);
3722
4664
  if (summary) {
@@ -3729,7 +4671,7 @@ function formatPiLogMessage(rawLine, source) {
3729
4671
  return rawLine;
3730
4672
  }
3731
4673
  function formatPiJsonLog(rawLine) {
3732
- const parsed = tryParseJsonValue2(rawLine);
4674
+ const parsed = tryParseJsonValue3(rawLine);
3733
4675
  if (!parsed) {
3734
4676
  return rawLine;
3735
4677
  }
@@ -3779,7 +4721,7 @@ function summarizePiEvent(event) {
3779
4721
  return type;
3780
4722
  }
3781
4723
  }
3782
- function tryParseJsonValue2(rawLine) {
4724
+ function tryParseJsonValue3(rawLine) {
3783
4725
  try {
3784
4726
  return JSON.parse(rawLine);
3785
4727
  } catch {
@@ -3804,7 +4746,7 @@ function parsePiJsonl(output) {
3804
4746
  }
3805
4747
  return parsed;
3806
4748
  }
3807
- function extractOutputMessages(events) {
4749
+ function extractOutputMessages2(events) {
3808
4750
  for (let i = events.length - 1; i >= 0; i--) {
3809
4751
  const event = events[i];
3810
4752
  if (!event || typeof event !== "object") {
@@ -3845,8 +4787,8 @@ function convertPiMessage(message) {
3845
4787
  if (typeof role !== "string") {
3846
4788
  return void 0;
3847
4789
  }
3848
- const content = extractTextContent(msg.content);
3849
- const toolCalls = extractToolCalls(msg.content);
4790
+ const content = extractTextContent2(msg.content);
4791
+ const toolCalls = extractToolCalls2(msg.content);
3850
4792
  const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
3851
4793
  const metadata = {};
3852
4794
  if (msg.api) metadata.api = msg.api;
@@ -3862,7 +4804,7 @@ function convertPiMessage(message) {
3862
4804
  metadata: Object.keys(metadata).length > 0 ? metadata : void 0
3863
4805
  };
3864
4806
  }
3865
- function extractTextContent(content) {
4807
+ function extractTextContent2(content) {
3866
4808
  if (typeof content === "string") {
3867
4809
  return content;
3868
4810
  }
@@ -3881,7 +4823,7 @@ function extractTextContent(content) {
3881
4823
  }
3882
4824
  return textParts.length > 0 ? textParts.join("\n") : void 0;
3883
4825
  }
3884
- function extractToolCalls(content) {
4826
+ function extractToolCalls2(content) {
3885
4827
  if (!Array.isArray(content)) {
3886
4828
  return [];
3887
4829
  }
@@ -3926,7 +4868,7 @@ function extractAssistantText2(messages) {
3926
4868
  function escapeAtSymbols(prompt) {
3927
4869
  return prompt.replace(/@\[([^\]]+)\]:/g, "[[$1]]:");
3928
4870
  }
3929
- function pickDetail2(stderr, stdout) {
4871
+ function pickDetail3(stderr, stdout) {
3930
4872
  const errorText = stderr.trim();
3931
4873
  if (errorText.length > 0) {
3932
4874
  return errorText;
@@ -3934,7 +4876,7 @@ function pickDetail2(stderr, stdout) {
3934
4876
  const stdoutText = stdout.trim();
3935
4877
  return stdoutText.length > 0 ? stdoutText : void 0;
3936
4878
  }
3937
- function formatTimeoutSuffix3(timeoutMs) {
4879
+ function formatTimeoutSuffix4(timeoutMs) {
3938
4880
  if (!timeoutMs || timeoutMs <= 0) {
3939
4881
  return "";
3940
4882
  }
@@ -3947,7 +4889,7 @@ async function defaultPiRunner(options) {
3947
4889
  const executable = parts[0];
3948
4890
  const executableArgs = parts.slice(1);
3949
4891
  const allArgs = [...executableArgs, ...options.args];
3950
- const child = (0, import_node_child_process3.spawn)(executable, allArgs, {
4892
+ const child = (0, import_node_child_process4.spawn)(executable, allArgs, {
3951
4893
  cwd: options.cwd,
3952
4894
  env: options.env,
3953
4895
  stdio: ["pipe", "pipe", "pipe"],
@@ -4010,84 +4952,84 @@ async function defaultPiRunner(options) {
4010
4952
  }
4011
4953
 
4012
4954
  // src/evaluation/providers/targets.ts
4013
- var import_node_path12 = __toESM(require("path"), 1);
4014
- var import_zod = require("zod");
4015
- var CliHealthcheckHttpInputSchema = import_zod.z.object({
4016
- type: import_zod.z.literal("http"),
4017
- url: import_zod.z.string().min(1, "healthcheck URL is required"),
4018
- timeout_seconds: import_zod.z.number().positive().optional(),
4019
- timeoutSeconds: import_zod.z.number().positive().optional()
4955
+ var import_node_path13 = __toESM(require("path"), 1);
4956
+ var import_zod2 = require("zod");
4957
+ var CliHealthcheckHttpInputSchema = import_zod2.z.object({
4958
+ type: import_zod2.z.literal("http"),
4959
+ url: import_zod2.z.string().min(1, "healthcheck URL is required"),
4960
+ timeout_seconds: import_zod2.z.number().positive().optional(),
4961
+ timeoutSeconds: import_zod2.z.number().positive().optional()
4020
4962
  });
4021
- var CliHealthcheckCommandInputSchema = import_zod.z.object({
4022
- type: import_zod.z.literal("command"),
4023
- command_template: import_zod.z.string().optional(),
4024
- commandTemplate: import_zod.z.string().optional(),
4025
- cwd: import_zod.z.string().optional(),
4026
- timeout_seconds: import_zod.z.number().positive().optional(),
4027
- timeoutSeconds: import_zod.z.number().positive().optional()
4963
+ var CliHealthcheckCommandInputSchema = import_zod2.z.object({
4964
+ type: import_zod2.z.literal("command"),
4965
+ command_template: import_zod2.z.string().optional(),
4966
+ commandTemplate: import_zod2.z.string().optional(),
4967
+ cwd: import_zod2.z.string().optional(),
4968
+ timeout_seconds: import_zod2.z.number().positive().optional(),
4969
+ timeoutSeconds: import_zod2.z.number().positive().optional()
4028
4970
  });
4029
- var CliHealthcheckInputSchema = import_zod.z.discriminatedUnion("type", [
4971
+ var CliHealthcheckInputSchema = import_zod2.z.discriminatedUnion("type", [
4030
4972
  CliHealthcheckHttpInputSchema,
4031
4973
  CliHealthcheckCommandInputSchema
4032
4974
  ]);
4033
- var CliTargetInputSchema = import_zod.z.object({
4034
- name: import_zod.z.string().min(1, "target name is required"),
4035
- provider: import_zod.z.string().refine((p) => p.toLowerCase() === "cli", { message: "provider must be 'cli'" }),
4975
+ var CliTargetInputSchema = import_zod2.z.object({
4976
+ name: import_zod2.z.string().min(1, "target name is required"),
4977
+ provider: import_zod2.z.string().refine((p) => p.toLowerCase() === "cli", { message: "provider must be 'cli'" }),
4036
4978
  // Command template - required (accept both naming conventions)
4037
- command_template: import_zod.z.string().optional(),
4038
- commandTemplate: import_zod.z.string().optional(),
4979
+ command_template: import_zod2.z.string().optional(),
4980
+ commandTemplate: import_zod2.z.string().optional(),
4039
4981
  // Files format - optional
4040
- files_format: import_zod.z.string().optional(),
4041
- filesFormat: import_zod.z.string().optional(),
4042
- attachments_format: import_zod.z.string().optional(),
4043
- attachmentsFormat: import_zod.z.string().optional(),
4982
+ files_format: import_zod2.z.string().optional(),
4983
+ filesFormat: import_zod2.z.string().optional(),
4984
+ attachments_format: import_zod2.z.string().optional(),
4985
+ attachmentsFormat: import_zod2.z.string().optional(),
4044
4986
  // Working directory - optional
4045
- cwd: import_zod.z.string().optional(),
4987
+ cwd: import_zod2.z.string().optional(),
4046
4988
  // Timeout in seconds - optional
4047
- timeout_seconds: import_zod.z.number().positive().optional(),
4048
- timeoutSeconds: import_zod.z.number().positive().optional(),
4989
+ timeout_seconds: import_zod2.z.number().positive().optional(),
4990
+ timeoutSeconds: import_zod2.z.number().positive().optional(),
4049
4991
  // Healthcheck configuration - optional
4050
4992
  healthcheck: CliHealthcheckInputSchema.optional(),
4051
4993
  // Verbose mode - optional
4052
- verbose: import_zod.z.boolean().optional(),
4053
- cli_verbose: import_zod.z.boolean().optional(),
4054
- cliVerbose: import_zod.z.boolean().optional(),
4994
+ verbose: import_zod2.z.boolean().optional(),
4995
+ cli_verbose: import_zod2.z.boolean().optional(),
4996
+ cliVerbose: import_zod2.z.boolean().optional(),
4055
4997
  // Keep temp files - optional
4056
- keep_temp_files: import_zod.z.boolean().optional(),
4057
- keepTempFiles: import_zod.z.boolean().optional(),
4058
- keep_output_files: import_zod.z.boolean().optional(),
4059
- keepOutputFiles: import_zod.z.boolean().optional(),
4998
+ keep_temp_files: import_zod2.z.boolean().optional(),
4999
+ keepTempFiles: import_zod2.z.boolean().optional(),
5000
+ keep_output_files: import_zod2.z.boolean().optional(),
5001
+ keepOutputFiles: import_zod2.z.boolean().optional(),
4060
5002
  // Common target fields
4061
- judge_target: import_zod.z.string().optional(),
4062
- workers: import_zod.z.number().int().min(1).optional(),
4063
- provider_batching: import_zod.z.boolean().optional(),
4064
- providerBatching: import_zod.z.boolean().optional()
5003
+ judge_target: import_zod2.z.string().optional(),
5004
+ workers: import_zod2.z.number().int().min(1).optional(),
5005
+ provider_batching: import_zod2.z.boolean().optional(),
5006
+ providerBatching: import_zod2.z.boolean().optional()
4065
5007
  }).refine((data) => data.command_template !== void 0 || data.commandTemplate !== void 0, {
4066
5008
  message: "Either command_template or commandTemplate is required"
4067
5009
  });
4068
- var CliHealthcheckHttpSchema = import_zod.z.object({
4069
- type: import_zod.z.literal("http"),
4070
- url: import_zod.z.string().min(1),
4071
- timeoutMs: import_zod.z.number().positive().optional()
5010
+ var CliHealthcheckHttpSchema = import_zod2.z.object({
5011
+ type: import_zod2.z.literal("http"),
5012
+ url: import_zod2.z.string().min(1),
5013
+ timeoutMs: import_zod2.z.number().positive().optional()
4072
5014
  }).strict();
4073
- var CliHealthcheckCommandSchema = import_zod.z.object({
4074
- type: import_zod.z.literal("command"),
4075
- commandTemplate: import_zod.z.string().min(1),
4076
- cwd: import_zod.z.string().optional(),
4077
- timeoutMs: import_zod.z.number().positive().optional()
5015
+ var CliHealthcheckCommandSchema = import_zod2.z.object({
5016
+ type: import_zod2.z.literal("command"),
5017
+ commandTemplate: import_zod2.z.string().min(1),
5018
+ cwd: import_zod2.z.string().optional(),
5019
+ timeoutMs: import_zod2.z.number().positive().optional()
4078
5020
  }).strict();
4079
- var CliHealthcheckSchema = import_zod.z.discriminatedUnion("type", [
5021
+ var CliHealthcheckSchema = import_zod2.z.discriminatedUnion("type", [
4080
5022
  CliHealthcheckHttpSchema,
4081
5023
  CliHealthcheckCommandSchema
4082
5024
  ]);
4083
- var CliTargetConfigSchema = import_zod.z.object({
4084
- commandTemplate: import_zod.z.string().min(1),
4085
- filesFormat: import_zod.z.string().optional(),
4086
- cwd: import_zod.z.string().optional(),
4087
- timeoutMs: import_zod.z.number().positive().optional(),
5025
+ var CliTargetConfigSchema = import_zod2.z.object({
5026
+ commandTemplate: import_zod2.z.string().min(1),
5027
+ filesFormat: import_zod2.z.string().optional(),
5028
+ cwd: import_zod2.z.string().optional(),
5029
+ timeoutMs: import_zod2.z.number().positive().optional(),
4088
5030
  healthcheck: CliHealthcheckSchema.optional(),
4089
- verbose: import_zod.z.boolean().optional(),
4090
- keepTempFiles: import_zod.z.boolean().optional()
5031
+ verbose: import_zod2.z.boolean().optional(),
5032
+ keepTempFiles: import_zod2.z.boolean().optional()
4091
5033
  }).strict();
4092
5034
  function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
4093
5035
  const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
@@ -4116,8 +5058,11 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
4116
5058
  allowLiteral: true,
4117
5059
  optionalEnv: true
4118
5060
  });
4119
- if (cwd && evalFilePath && !import_node_path12.default.isAbsolute(cwd)) {
4120
- cwd = import_node_path12.default.resolve(import_node_path12.default.dirname(import_node_path12.default.resolve(evalFilePath)), cwd);
5061
+ if (cwd && evalFilePath && !import_node_path13.default.isAbsolute(cwd)) {
5062
+ cwd = import_node_path13.default.resolve(import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath)), cwd);
5063
+ }
5064
+ if (!cwd && evalFilePath) {
5065
+ cwd = import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath));
4121
5066
  }
4122
5067
  return {
4123
5068
  type: "command",
@@ -4144,11 +5089,11 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
4144
5089
  allowLiteral: true,
4145
5090
  optionalEnv: true
4146
5091
  });
4147
- if (cwd && evalFilePath && !import_node_path12.default.isAbsolute(cwd)) {
4148
- cwd = import_node_path12.default.resolve(import_node_path12.default.dirname(import_node_path12.default.resolve(evalFilePath)), cwd);
5092
+ if (cwd && evalFilePath && !import_node_path13.default.isAbsolute(cwd)) {
5093
+ cwd = import_node_path13.default.resolve(import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath)), cwd);
4149
5094
  }
4150
5095
  if (!cwd && evalFilePath) {
4151
- cwd = import_node_path12.default.dirname(import_node_path12.default.resolve(evalFilePath));
5096
+ cwd = import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath));
4152
5097
  }
4153
5098
  const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
4154
5099
  const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
@@ -4175,11 +5120,11 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
4175
5120
  "FILES",
4176
5121
  "OUTPUT_FILE"
4177
5122
  ]);
4178
- var BASE_TARGET_SCHEMA = import_zod.z.object({
4179
- name: import_zod.z.string().min(1, "target name is required"),
4180
- provider: import_zod.z.string().min(1, "provider is required"),
4181
- judge_target: import_zod.z.string().optional(),
4182
- workers: import_zod.z.number().int().min(1).optional()
5123
+ var BASE_TARGET_SCHEMA = import_zod2.z.object({
5124
+ name: import_zod2.z.string().min(1, "target name is required"),
5125
+ provider: import_zod2.z.string().min(1, "provider is required"),
5126
+ judge_target: import_zod2.z.string().optional(),
5127
+ workers: import_zod2.z.number().int().min(1).optional()
4183
5128
  }).passthrough();
4184
5129
  var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
4185
5130
  function normalizeAzureApiVersion(value) {
@@ -4282,6 +5227,15 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
4282
5227
  providerBatching,
4283
5228
  config: resolvePiCodingAgentConfig(parsed, env)
4284
5229
  };
5230
+ case "claude-code":
5231
+ return {
5232
+ kind: "claude-code",
5233
+ name: parsed.name,
5234
+ judgeTarget: parsed.judge_target,
5235
+ workers: parsed.workers,
5236
+ providerBatching,
5237
+ config: resolveClaudeCodeConfig(parsed, env)
5238
+ };
4285
5239
  case "mock":
4286
5240
  return {
4287
5241
  kind: "mock",
@@ -4466,34 +5420,92 @@ function resolvePiCodingAgentConfig(target, env) {
4466
5420
  const thinking = resolveOptionalString(thinkingSource, env, `${target.name} pi thinking`, {
4467
5421
  allowLiteral: true,
4468
5422
  optionalEnv: true
4469
- });
4470
- const args = resolveOptionalStringArray(argsSource, env, `${target.name} pi args`);
4471
- const cwd = resolveOptionalString(cwdSource, env, `${target.name} pi cwd`, {
5423
+ });
5424
+ const args = resolveOptionalStringArray(argsSource, env, `${target.name} pi args`);
5425
+ const cwd = resolveOptionalString(cwdSource, env, `${target.name} pi cwd`, {
5426
+ allowLiteral: true,
5427
+ optionalEnv: true
5428
+ });
5429
+ const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi timeout`);
5430
+ const logDir = resolveOptionalString(logDirSource, env, `${target.name} pi log directory`, {
5431
+ allowLiteral: true,
5432
+ optionalEnv: true
5433
+ });
5434
+ const logFormat = logFormatSource === "json" || logFormatSource === "summary" ? logFormatSource : void 0;
5435
+ const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
5436
+ return {
5437
+ executable,
5438
+ provider,
5439
+ model,
5440
+ apiKey,
5441
+ tools,
5442
+ thinking,
5443
+ args,
5444
+ cwd,
5445
+ timeoutMs,
5446
+ logDir,
5447
+ logFormat,
5448
+ systemPrompt
5449
+ };
5450
+ }
5451
+ function resolveClaudeCodeConfig(target, env) {
5452
+ const executableSource = target.executable ?? target.command ?? target.binary;
5453
+ const modelSource = target.model;
5454
+ const argsSource = target.args ?? target.arguments;
5455
+ const cwdSource = target.cwd;
5456
+ const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
5457
+ const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
5458
+ const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CLAUDE_CODE_LOG_FORMAT;
5459
+ const systemPromptSource = target.system_prompt ?? target.systemPrompt;
5460
+ const executable = resolveOptionalString(executableSource, env, `${target.name} claude-code executable`, {
5461
+ allowLiteral: true,
5462
+ optionalEnv: true
5463
+ }) ?? "claude";
5464
+ const model = resolveOptionalString(modelSource, env, `${target.name} claude-code model`, {
4472
5465
  allowLiteral: true,
4473
5466
  optionalEnv: true
4474
5467
  });
4475
- const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi timeout`);
4476
- const logDir = resolveOptionalString(logDirSource, env, `${target.name} pi log directory`, {
5468
+ const args = resolveOptionalStringArray(argsSource, env, `${target.name} claude-code args`);
5469
+ const cwd = resolveOptionalString(cwdSource, env, `${target.name} claude-code cwd`, {
4477
5470
  allowLiteral: true,
4478
5471
  optionalEnv: true
4479
5472
  });
4480
- const logFormat = logFormatSource === "json" || logFormatSource === "summary" ? logFormatSource : void 0;
5473
+ const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} claude-code timeout`);
5474
+ const logDir = resolveOptionalString(
5475
+ logDirSource,
5476
+ env,
5477
+ `${target.name} claude-code log directory`,
5478
+ {
5479
+ allowLiteral: true,
5480
+ optionalEnv: true
5481
+ }
5482
+ );
5483
+ const logFormat = normalizeClaudeCodeLogFormat(logFormatSource);
4481
5484
  const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
4482
5485
  return {
4483
5486
  executable,
4484
- provider,
4485
5487
  model,
4486
- apiKey,
4487
- tools,
4488
- thinking,
5488
+ systemPrompt,
4489
5489
  args,
4490
5490
  cwd,
4491
5491
  timeoutMs,
4492
5492
  logDir,
4493
- logFormat,
4494
- systemPrompt
5493
+ logFormat
4495
5494
  };
4496
5495
  }
5496
+ function normalizeClaudeCodeLogFormat(value) {
5497
+ if (value === void 0 || value === null) {
5498
+ return void 0;
5499
+ }
5500
+ if (typeof value !== "string") {
5501
+ throw new Error("claude-code log format must be 'summary' or 'json'");
5502
+ }
5503
+ const normalized = value.trim().toLowerCase();
5504
+ if (normalized === "json" || normalized === "summary") {
5505
+ return normalized;
5506
+ }
5507
+ throw new Error("claude-code log format must be 'summary' or 'json'");
5508
+ }
4497
5509
  function resolveMockConfig(target) {
4498
5510
  const response = typeof target.response === "string" ? target.response : void 0;
4499
5511
  return { response };
@@ -4529,13 +5541,13 @@ function resolveVSCodeConfig(target, env, insiders) {
4529
5541
  };
4530
5542
  }
4531
5543
  var cliErrorMap = (issue, ctx) => {
4532
- if (issue.code === import_zod.z.ZodIssueCode.unrecognized_keys) {
5544
+ if (issue.code === import_zod2.z.ZodIssueCode.unrecognized_keys) {
4533
5545
  return { message: `Unknown CLI provider settings: ${issue.keys.join(", ")}` };
4534
5546
  }
4535
- if (issue.code === import_zod.z.ZodIssueCode.invalid_union_discriminator) {
5547
+ if (issue.code === import_zod2.z.ZodIssueCode.invalid_union_discriminator) {
4536
5548
  return { message: "healthcheck type must be 'http' or 'command'" };
4537
5549
  }
4538
- if (issue.code === import_zod.z.ZodIssueCode.invalid_type && issue.expected === "string") {
5550
+ if (issue.code === import_zod2.z.ZodIssueCode.invalid_type && issue.expected === "string") {
4539
5551
  return { message: `${ctx.defaultError} (expected a string value)` };
4540
5552
  }
4541
5553
  return { message: ctx.defaultError };
@@ -4544,8 +5556,8 @@ function resolveCliConfig(target, env, evalFilePath) {
4544
5556
  const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
4545
5557
  if (!parseResult.success) {
4546
5558
  const firstError = parseResult.error.errors[0];
4547
- const path16 = firstError?.path.join(".") || "";
4548
- const prefix = path16 ? `${target.name} ${path16}: ` : `${target.name}: `;
5559
+ const path17 = firstError?.path.join(".") || "";
5560
+ const prefix = path17 ? `${target.name} ${path17}: ` : `${target.name}: `;
4549
5561
  throw new Error(`${prefix}${firstError?.message}`);
4550
5562
  }
4551
5563
  const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -4733,7 +5745,7 @@ function resolveOptionalNumberArray(source, description) {
4733
5745
  }
4734
5746
 
4735
5747
  // src/evaluation/providers/vscode.ts
4736
- var import_node_path13 = __toESM(require("path"), 1);
5748
+ var import_node_path14 = __toESM(require("path"), 1);
4737
5749
  var import_subagent = require("subagent");
4738
5750
 
4739
5751
  // src/evaluation/providers/vscode-templates.ts
@@ -4903,7 +5915,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
4903
5915
  return "";
4904
5916
  }
4905
5917
  const buildList = (files) => files.map((absolutePath) => {
4906
- const fileName = import_node_path13.default.basename(absolutePath);
5918
+ const fileName = import_node_path14.default.basename(absolutePath);
4907
5919
  const fileUri = pathToFileUri2(absolutePath);
4908
5920
  return `* [${fileName}](${fileUri})`;
4909
5921
  });
@@ -4928,8 +5940,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
4928
5940
  }
4929
5941
  const unique = /* @__PURE__ */ new Map();
4930
5942
  for (const attachment of attachments) {
4931
- const absolutePath = import_node_path13.default.resolve(attachment);
4932
- const normalized = absolutePath.split(import_node_path13.default.sep).join("/");
5943
+ const absolutePath = import_node_path14.default.resolve(attachment);
5944
+ const normalized = absolutePath.split(import_node_path14.default.sep).join("/");
4933
5945
  if (isGuidelineFile(normalized, guidelinePatterns)) {
4934
5946
  if (!unique.has(absolutePath)) {
4935
5947
  unique.set(absolutePath, absolutePath);
@@ -4944,7 +5956,7 @@ function collectAttachmentFiles(attachments) {
4944
5956
  }
4945
5957
  const unique = /* @__PURE__ */ new Map();
4946
5958
  for (const attachment of attachments) {
4947
- const absolutePath = import_node_path13.default.resolve(attachment);
5959
+ const absolutePath = import_node_path14.default.resolve(attachment);
4948
5960
  if (!unique.has(absolutePath)) {
4949
5961
  unique.set(absolutePath, absolutePath);
4950
5962
  }
@@ -4952,7 +5964,7 @@ function collectAttachmentFiles(attachments) {
4952
5964
  return Array.from(unique.values());
4953
5965
  }
4954
5966
  function pathToFileUri2(filePath) {
4955
- const absolutePath = import_node_path13.default.isAbsolute(filePath) ? filePath : import_node_path13.default.resolve(filePath);
5967
+ const absolutePath = import_node_path14.default.isAbsolute(filePath) ? filePath : import_node_path14.default.resolve(filePath);
4956
5968
  const normalizedPath = absolutePath.replace(/\\/g, "/");
4957
5969
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
4958
5970
  return `file:///${normalizedPath}`;
@@ -4965,7 +5977,7 @@ function normalizeAttachments(attachments) {
4965
5977
  }
4966
5978
  const deduped = /* @__PURE__ */ new Set();
4967
5979
  for (const attachment of attachments) {
4968
- deduped.add(import_node_path13.default.resolve(attachment));
5980
+ deduped.add(import_node_path14.default.resolve(attachment));
4969
5981
  }
4970
5982
  return Array.from(deduped);
4971
5983
  }
@@ -4974,7 +5986,7 @@ function mergeAttachments(all) {
4974
5986
  for (const list of all) {
4975
5987
  if (!list) continue;
4976
5988
  for (const inputFile of list) {
4977
- deduped.add(import_node_path13.default.resolve(inputFile));
5989
+ deduped.add(import_node_path14.default.resolve(inputFile));
4978
5990
  }
4979
5991
  }
4980
5992
  return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -5021,9 +6033,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
5021
6033
  }
5022
6034
 
5023
6035
  // src/evaluation/providers/targets-file.ts
5024
- var import_node_fs5 = require("fs");
5025
- var import_promises11 = require("fs/promises");
5026
- var import_node_path14 = __toESM(require("path"), 1);
6036
+ var import_node_fs6 = require("fs");
6037
+ var import_promises12 = require("fs/promises");
6038
+ var import_node_path15 = __toESM(require("path"), 1);
5027
6039
  var import_yaml3 = require("yaml");
5028
6040
  function isRecord(value) {
5029
6041
  return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -5053,18 +6065,18 @@ function assertTargetDefinition(value, index, filePath) {
5053
6065
  }
5054
6066
  async function fileExists3(filePath) {
5055
6067
  try {
5056
- await (0, import_promises11.access)(filePath, import_node_fs5.constants.F_OK);
6068
+ await (0, import_promises12.access)(filePath, import_node_fs6.constants.F_OK);
5057
6069
  return true;
5058
6070
  } catch {
5059
6071
  return false;
5060
6072
  }
5061
6073
  }
5062
6074
  async function readTargetDefinitions(filePath) {
5063
- const absolutePath = import_node_path14.default.resolve(filePath);
6075
+ const absolutePath = import_node_path15.default.resolve(filePath);
5064
6076
  if (!await fileExists3(absolutePath)) {
5065
6077
  throw new Error(`targets.yaml not found at ${absolutePath}`);
5066
6078
  }
5067
- const raw = await (0, import_promises11.readFile)(absolutePath, "utf8");
6079
+ const raw = await (0, import_promises12.readFile)(absolutePath, "utf8");
5068
6080
  const parsed = (0, import_yaml3.parse)(raw);
5069
6081
  if (!isRecord(parsed)) {
5070
6082
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
@@ -5094,6 +6106,8 @@ function createProvider(target) {
5094
6106
  return new CodexProvider(target.name, target.config);
5095
6107
  case "pi-coding-agent":
5096
6108
  return new PiCodingAgentProvider(target.name, target.config);
6109
+ case "claude-code":
6110
+ return new ClaudeCodeProvider(target.name, target.config);
5097
6111
  case "mock":
5098
6112
  return new MockProvider(target.name, target.config);
5099
6113
  case "vscode":
@@ -5112,78 +6126,199 @@ function resolveAndCreateProvider(definition, env = process.env) {
5112
6126
 
5113
6127
  // src/evaluation/evaluators.ts
5114
6128
  var import_ai2 = require("ai");
5115
- var import_zod2 = require("zod");
6129
+ var import_zod3 = require("zod");
5116
6130
 
5117
6131
  // src/runtime/exec.ts
5118
- function getBunSpawn() {
5119
- const bunSpawn = globalThis.Bun?.spawn;
5120
- return typeof bunSpawn === "function" ? bunSpawn : void 0;
6132
+ function shellEscapePath(value) {
6133
+ if (process.platform === "win32") {
6134
+ return `"${value.replaceAll('"', '""')}"`;
6135
+ }
6136
+ return `'${value.replaceAll("'", `'"'"'`)}'`;
5121
6137
  }
5122
- async function execShellWithStdin(command, stdinPayload, options = {}) {
5123
- const bunSpawn = getBunSpawn();
5124
- if (bunSpawn) {
5125
- const encoder = new TextEncoder();
5126
- const proc = bunSpawn({
5127
- cmd: ["sh", "-c", command],
5128
- cwd: options.cwd,
5129
- stdin: encoder.encode(stdinPayload),
5130
- stdout: "pipe",
5131
- stderr: "pipe"
5132
- });
5133
- const timeout = options.timeoutMs ? setTimeout(() => {
5134
- proc.kill();
5135
- }, options.timeoutMs) : void 0;
5136
- try {
5137
- const stdout = await new Response(proc.stdout).text();
5138
- const stderr = await new Response(proc.stderr).text();
5139
- const exitCode = await proc.exited;
5140
- return { stdout, stderr, exitCode };
5141
- } finally {
5142
- if (timeout !== void 0) {
5143
- clearTimeout(timeout);
5144
- }
6138
+ async function execFileWithStdin(argv, stdinPayload, options = {}) {
6139
+ if (argv.length === 0) {
6140
+ throw new Error("Executable argv must include at least one entry");
6141
+ }
6142
+ if (typeof Bun !== "undefined") {
6143
+ return execFileWithStdinBun(argv, stdinPayload, options);
6144
+ }
6145
+ return execFileWithStdinNode(argv, stdinPayload, options);
6146
+ }
6147
+ async function execFileWithStdinBun(argv, stdinPayload, options) {
6148
+ const command = [...argv];
6149
+ const encoder = new TextEncoder();
6150
+ const proc = Bun.spawn(command, {
6151
+ cwd: options.cwd,
6152
+ stdin: encoder.encode(stdinPayload),
6153
+ stdout: "pipe",
6154
+ stderr: "pipe"
6155
+ });
6156
+ let timedOut = false;
6157
+ const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
6158
+ timedOut = true;
6159
+ proc.kill("SIGKILL");
6160
+ }, options.timeoutMs) : void 0;
6161
+ try {
6162
+ const stdoutPromise = proc.stdout ? new Response(proc.stdout).text() : Promise.resolve("");
6163
+ const stderrPromise = proc.stderr ? new Response(proc.stderr).text() : Promise.resolve("");
6164
+ const [stdout, stderr, exitCode] = await Promise.all([
6165
+ stdoutPromise,
6166
+ stderrPromise,
6167
+ proc.exited
6168
+ ]);
6169
+ if (timedOut) {
6170
+ throw new Error(`Process timed out after ${options.timeoutMs}ms`);
6171
+ }
6172
+ return {
6173
+ stdout: stdout.replace(/\r\n/g, "\n"),
6174
+ stderr: stderr.replace(/\r\n/g, "\n"),
6175
+ exitCode
6176
+ };
6177
+ } finally {
6178
+ if (timeout !== void 0) {
6179
+ clearTimeout(timeout);
5145
6180
  }
5146
6181
  }
5147
- const { spawn: spawn3 } = await import("child_process");
5148
- return await new Promise((resolve, reject) => {
5149
- const child = spawn3(command, {
5150
- shell: true,
6182
+ }
6183
+ async function execFileWithStdinNode(argv, stdinPayload, options) {
6184
+ const { spawn: spawn4 } = await import("child_process");
6185
+ return new Promise((resolve, reject) => {
6186
+ const [cmd, ...args] = argv;
6187
+ const child = spawn4(cmd, args, {
5151
6188
  cwd: options.cwd,
5152
6189
  stdio: ["pipe", "pipe", "pipe"]
5153
6190
  });
5154
- let stdout = "";
5155
- let stderr = "";
5156
- const timeout = options.timeoutMs ? setTimeout(() => {
5157
- child.kill();
5158
- reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
6191
+ const stdoutChunks = [];
6192
+ const stderrChunks = [];
6193
+ child.stdout?.on("data", (chunk) => stdoutChunks.push(chunk));
6194
+ child.stderr?.on("data", (chunk) => stderrChunks.push(chunk));
6195
+ let timedOut = false;
6196
+ const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
6197
+ timedOut = true;
6198
+ child.kill("SIGKILL");
5159
6199
  }, options.timeoutMs) : void 0;
5160
- child.stdout?.on("data", (data) => {
5161
- stdout += data.toString();
5162
- });
5163
- child.stderr?.on("data", (data) => {
5164
- stderr += data.toString();
5165
- });
5166
6200
  child.on("error", (error) => {
5167
- if (timeout !== void 0) {
5168
- clearTimeout(timeout);
5169
- }
6201
+ if (timeout !== void 0) clearTimeout(timeout);
5170
6202
  reject(error);
5171
6203
  });
5172
- child.on("exit", (code) => {
5173
- if (timeout !== void 0) {
5174
- clearTimeout(timeout);
6204
+ child.on("close", (code) => {
6205
+ if (timeout !== void 0) clearTimeout(timeout);
6206
+ if (timedOut) {
6207
+ reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
6208
+ return;
5175
6209
  }
5176
- resolve({ stdout, stderr, exitCode: code ?? 0 });
6210
+ const stdout = Buffer.concat(stdoutChunks).toString("utf8").replace(/\r\n/g, "\n");
6211
+ const stderr = Buffer.concat(stderrChunks).toString("utf8").replace(/\r\n/g, "\n");
6212
+ resolve({
6213
+ stdout,
6214
+ stderr,
6215
+ exitCode: code ?? 0
6216
+ });
5177
6217
  });
5178
- child.stdin?.write(stdinPayload);
5179
- child.stdin?.end();
6218
+ if (child.stdin) {
6219
+ child.stdin.write(stdinPayload);
6220
+ child.stdin.end();
6221
+ }
5180
6222
  });
5181
6223
  }
6224
+ async function execShellWithStdin(command, stdinPayload, options = {}) {
6225
+ const { mkdir: mkdir4, readFile: readFile8, rm: rm4, writeFile: writeFile4 } = await import("fs/promises");
6226
+ const { tmpdir: tmpdir4 } = await import("os");
6227
+ const path17 = await import("path");
6228
+ const { randomUUID: randomUUID4 } = await import("crypto");
6229
+ const dir = path17.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
6230
+ await mkdir4(dir, { recursive: true });
6231
+ const stdinPath = path17.join(dir, "stdin.txt");
6232
+ const stdoutPath = path17.join(dir, "stdout.txt");
6233
+ const stderrPath = path17.join(dir, "stderr.txt");
6234
+ await writeFile4(stdinPath, stdinPayload, "utf8");
6235
+ const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
6236
+ const { spawn: spawn4 } = await import("child_process");
6237
+ try {
6238
+ const exitCode = await new Promise((resolve, reject) => {
6239
+ const child = spawn4(wrappedCommand, {
6240
+ shell: true,
6241
+ cwd: options.cwd,
6242
+ stdio: ["ignore", "ignore", "ignore"]
6243
+ });
6244
+ const timeout = options.timeoutMs ? setTimeout(() => {
6245
+ child.kill();
6246
+ reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
6247
+ }, options.timeoutMs) : void 0;
6248
+ child.on("error", (error) => {
6249
+ if (timeout !== void 0) {
6250
+ clearTimeout(timeout);
6251
+ }
6252
+ reject(error);
6253
+ });
6254
+ child.on("exit", (code) => {
6255
+ if (timeout !== void 0) {
6256
+ clearTimeout(timeout);
6257
+ }
6258
+ resolve(code ?? 0);
6259
+ });
6260
+ });
6261
+ const stdout = (await readFile8(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
6262
+ const stderr = (await readFile8(stderrPath, "utf8")).replace(/\r\n/g, "\n");
6263
+ return { stdout, stderr, exitCode };
6264
+ } finally {
6265
+ await rm4(dir, { recursive: true, force: true });
6266
+ }
6267
+ }
6268
+
6269
+ // src/evaluation/case-conversion.ts
6270
+ function toSnakeCase(str) {
6271
+ if (/^[A-Z]/.test(str)) {
6272
+ return str;
6273
+ }
6274
+ return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
6275
+ }
6276
+ function toCamelCase(str) {
6277
+ if (/^[A-Z]/.test(str)) {
6278
+ return str;
6279
+ }
6280
+ return str.replace(/_([a-z0-9])/g, (_, letter) => letter.toUpperCase());
6281
+ }
6282
+ function toSnakeCaseDeep(obj) {
6283
+ if (obj === null || obj === void 0) {
6284
+ return obj;
6285
+ }
6286
+ if (Array.isArray(obj)) {
6287
+ return obj.map((item) => toSnakeCaseDeep(item));
6288
+ }
6289
+ if (typeof obj === "object") {
6290
+ const result = {};
6291
+ for (const [key, value] of Object.entries(obj)) {
6292
+ const snakeKey = toSnakeCase(key);
6293
+ result[snakeKey] = toSnakeCaseDeep(value);
6294
+ }
6295
+ return result;
6296
+ }
6297
+ return obj;
6298
+ }
6299
+ function toCamelCaseDeep(obj) {
6300
+ if (obj === null || obj === void 0) {
6301
+ return obj;
6302
+ }
6303
+ if (Array.isArray(obj)) {
6304
+ return obj.map((item) => toCamelCaseDeep(item));
6305
+ }
6306
+ if (typeof obj === "object") {
6307
+ const result = {};
6308
+ for (const [key, value] of Object.entries(obj)) {
6309
+ const camelKey = toCamelCase(key);
6310
+ result[camelKey] = toCamelCaseDeep(value);
6311
+ }
6312
+ return result;
6313
+ }
6314
+ return obj;
6315
+ }
5182
6316
 
5183
6317
  // src/evaluation/providers/types.ts
5184
6318
  var AGENT_PROVIDER_KINDS = [
5185
6319
  "codex",
5186
6320
  "pi-coding-agent",
6321
+ "claude-code",
5187
6322
  "vscode",
5188
6323
  "vscode-insiders"
5189
6324
  ];
@@ -5224,20 +6359,20 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
5224
6359
 
5225
6360
  [[ ## candidate_answer ## ]]
5226
6361
  {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
5227
- var freeformEvaluationSchema = import_zod2.z.object({
5228
- score: import_zod2.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
5229
- hits: import_zod2.z.array(import_zod2.z.string()).describe("Brief specific achievements").optional(),
5230
- misses: import_zod2.z.array(import_zod2.z.string()).describe("Brief failures or omissions").optional(),
5231
- reasoning: import_zod2.z.string().describe("Concise explanation (1-2 sentences)").optional()
6362
+ var freeformEvaluationSchema = import_zod3.z.object({
6363
+ score: import_zod3.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
6364
+ hits: import_zod3.z.array(import_zod3.z.string()).describe("Brief specific achievements").optional(),
6365
+ misses: import_zod3.z.array(import_zod3.z.string()).describe("Brief failures or omissions").optional(),
6366
+ reasoning: import_zod3.z.string().describe("Concise explanation (1-2 sentences)").optional()
5232
6367
  });
5233
- var rubricCheckResultSchema = import_zod2.z.object({
5234
- id: import_zod2.z.string().describe("The ID of the rubric item being checked"),
5235
- satisfied: import_zod2.z.boolean().describe("Whether this rubric requirement is met"),
5236
- reasoning: import_zod2.z.string().describe("Brief explanation (1-2 sentences) for this check")
6368
+ var rubricCheckResultSchema = import_zod3.z.object({
6369
+ id: import_zod3.z.string().describe("The ID of the rubric item being checked"),
6370
+ satisfied: import_zod3.z.boolean().describe("Whether this rubric requirement is met"),
6371
+ reasoning: import_zod3.z.string().describe("Brief explanation (1-2 sentences) for this check")
5237
6372
  });
5238
- var rubricEvaluationSchema = import_zod2.z.object({
5239
- checks: import_zod2.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
5240
- overall_reasoning: import_zod2.z.string().describe("Overall assessment summary (1-2 sentences)")
6373
+ var rubricEvaluationSchema = import_zod3.z.object({
6374
+ checks: import_zod3.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
6375
+ overall_reasoning: import_zod3.z.string().describe("Overall assessment summary (1-2 sentences)")
5241
6376
  });
5242
6377
  var LlmJudgeEvaluator = class {
5243
6378
  kind = "llm_judge";
@@ -5473,30 +6608,30 @@ var CodeEvaluator = class {
5473
6608
  script;
5474
6609
  cwd;
5475
6610
  agentTimeoutMs;
6611
+ config;
5476
6612
  constructor(options) {
5477
6613
  this.script = options.script;
5478
6614
  this.cwd = options.cwd;
5479
6615
  this.agentTimeoutMs = options.agentTimeoutMs;
6616
+ this.config = options.config;
5480
6617
  }
5481
6618
  async evaluate(context) {
5482
- const inputPayload = JSON.stringify(
5483
- {
5484
- question: context.evalCase.question,
5485
- expectedOutcome: context.evalCase.expected_outcome,
5486
- expectedMessages: context.evalCase.expected_messages,
5487
- referenceAnswer: context.evalCase.reference_answer,
5488
- candidateAnswer: context.candidate,
5489
- outputMessages: context.outputMessages ?? null,
5490
- guidelineFiles: context.evalCase.guideline_paths,
5491
- inputFiles: context.evalCase.file_paths.filter(
5492
- (path16) => !context.evalCase.guideline_paths.includes(path16)
5493
- ),
5494
- inputMessages: context.evalCase.input_messages,
5495
- traceSummary: context.traceSummary ?? null
5496
- },
5497
- null,
5498
- 2
5499
- );
6619
+ const payload = {
6620
+ question: context.evalCase.question,
6621
+ expectedOutcome: context.evalCase.expected_outcome,
6622
+ expectedMessages: context.evalCase.expected_messages,
6623
+ referenceAnswer: context.evalCase.reference_answer,
6624
+ candidateAnswer: context.candidate,
6625
+ outputMessages: context.outputMessages ?? null,
6626
+ guidelineFiles: context.evalCase.guideline_paths,
6627
+ inputFiles: context.evalCase.file_paths.filter(
6628
+ (path17) => !context.evalCase.guideline_paths.includes(path17)
6629
+ ),
6630
+ inputMessages: context.evalCase.input_messages,
6631
+ traceSummary: context.traceSummary ?? null,
6632
+ config: this.config ?? null
6633
+ };
6634
+ const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
5500
6635
  try {
5501
6636
  const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
5502
6637
  const parsed = parseJsonSafe(stdout);
@@ -5562,18 +6697,25 @@ function calculateRubricScore(result, rubrics) {
5562
6697
  return { score, verdict, hits, misses };
5563
6698
  }
5564
6699
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
5565
- const { stdout, stderr, exitCode } = await execShellWithStdin(scriptPath, input, {
5566
- cwd,
5567
- timeoutMs: agentTimeoutMs
5568
- });
6700
+ const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs });
5569
6701
  if (exitCode !== 0) {
5570
- const trimmedErr = stderr.trim();
6702
+ const trimmedErr = formatStderr(stderr);
5571
6703
  throw new Error(
5572
6704
  trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
5573
6705
  );
5574
6706
  }
5575
6707
  return stdout.trim();
5576
6708
  }
6709
/**
 * Trim stderr output and cap its length for inclusion in an error message.
 *
 * When the trimmed text is longer than the limit, only its tail is kept,
 * prefixed with a one-line truncation notice.
 *
 * @param {string} stderr - Raw stderr text.
 * @param {number} [maxLength=2000] - Maximum number of trailing characters to keep.
 * @returns {string} The trimmed (and possibly truncated) text.
 */
function formatStderr(stderr, maxLength = 2e3) {
  const trimmed = stderr.trim();
  if (trimmed.length <= maxLength) {
    return trimmed;
  }
  // Compute the start index explicitly: `slice(-0)` would return the whole string.
  const keep = Math.max(0, maxLength);
  const tail = trimmed.slice(trimmed.length - keep);
  return `...(truncated, last ${maxLength} chars)\n${tail}`;
}
5577
6719
  function parseJsonSafe(payload) {
5578
6720
  try {
5579
6721
  return JSON.parse(payload);
@@ -5805,22 +6947,438 @@ var ToolTrajectoryEvaluator = class {
5805
6947
  misses.push(`Position ${i}: ${expectedTool} args mismatch`);
5806
6948
  }
5807
6949
  } else {
5808
- misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
6950
+ misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
6951
+ }
6952
+ }
6953
+ for (let i = checkLength; i < expected.length; i++) {
6954
+ misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
6955
+ }
6956
+ const score = hits.length / expected.length;
6957
+ return {
6958
+ score,
6959
+ verdict: scoreToVerdict(score),
6960
+ hits,
6961
+ misses,
6962
+ expectedAspectCount: expected.length
6963
+ };
6964
+ }
6965
+ };
6966
// Date format patterns assumed when a field config does not list its own.
var DEFAULT_DATE_FORMATS = [
  "YYYY-MM-DDTHH:mm:ssZ", // ISO with timezone
  "YYYY-MM-DDTHH:mm:ss", // ISO with time
  "YYYY-MM-DD", // ISO date
  "DD-MMM-YYYY", // Localized (e.g., "15-JAN-2025")
  "MM/DD/YYYY", // US format
  "DD/MM/YYYY", // EU format
  "MM-DD-YYYY", // US with dashes
  "DD-MM-YYYY" // EU with dashes
];
6984
// Lowercase English month names and abbreviations mapped to the zero-based
// month index expected by the `Date` constructor.
var MONTH_NAMES = {
  jan: 0, january: 0,
  feb: 1, february: 1,
  mar: 2, march: 2,
  apr: 3, april: 3,
  may: 4,
  jun: 5, june: 5,
  jul: 6, july: 6,
  aug: 7, august: 7,
  sep: 8, sept: 8, september: 8,
  oct: 9, october: 9,
  nov: 10, november: 10,
  dec: 11, december: 11
};
7010
/**
 * Build one per-field result record. Scores are binary: 1 on a hit, 0 on a miss.
 */
function buildFieldResult(path, hit, weight, message) {
  return { path, score: hit ? 1 : 0, weight, hit, message };
}

/**
 * Evaluator that compares individual fields of a JSON candidate answer with
 * the same fields of the expected assistant message.
 *
 * Each entry of `config.fields` names a path and a match strategy ("exact",
 * "numeric_tolerance" or "date"); the per-field results are combined using
 * `config.aggregation` ("weighted_average" by default, or "all_or_nothing").
 */
var FieldAccuracyEvaluator = class {
  kind = "field_accuracy";
  config;
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Score `context.candidate` against `context.evalCase.expected_messages`.
   * Returns a failing result (score 0) when the candidate is not parseable as
   * JSON or when no expected data can be extracted.
   */
  evaluate(context) {
    const { evalCase, candidate } = context;
    let candidateData;
    try {
      candidateData = parseJsonFromTextSafe(candidate);
    } catch {
      return this.failure(
        "Failed to parse candidate answer as JSON",
        "Candidate answer is not valid JSON"
      );
    }
    const expectedData = this.extractExpectedData(evalCase.expected_messages);
    if (!expectedData) {
      return this.failure(
        "No expected data found in expected_messages",
        "Could not extract expected data from expected_messages"
      );
    }
    const fieldResults = this.config.fields.map(
      (fieldConfig) => this.evaluateField(fieldConfig, candidateData, expectedData)
    );
    return this.aggregateResults(fieldResults);
  }
  /**
   * Build the failing result used when evaluation cannot start at all.
   */
  failure(miss, reasoning) {
    return {
      score: 0,
      verdict: "fail",
      hits: [],
      misses: [miss],
      expectedAspectCount: this.config.fields.length,
      reasoning
    };
  }
  /**
   * Extract expected data from expected_messages array.
   * Looks for the last assistant message with content. Object content is
   * returned directly; string content is parsed as JSON. Returns undefined
   * when the argument is not an array or no message yields data.
   */
  extractExpectedData(expectedMessages) {
    if (!Array.isArray(expectedMessages)) {
      return void 0;
    }
    for (let i = expectedMessages.length - 1; i >= 0; i--) {
      const message = expectedMessages[i];
      if (message?.role !== "assistant" || !message.content) {
        continue;
      }
      if (typeof message.content === "object") {
        return message.content;
      }
      if (typeof message.content === "string") {
        try {
          return parseJsonFromTextSafe(message.content);
        } catch {
          // Deliberate best effort: this message is not JSON, so keep
          // scanning earlier assistant messages.
        }
      }
    }
    return void 0;
  }
  /**
   * Evaluate a single field against the expected value.
   * A field with no expected value counts as a hit; a missing candidate value
   * is a miss when the field is required and a zero-weight hit otherwise.
   */
  evaluateField(fieldConfig, candidateData, expectedData) {
    const { path: fieldPath, match, required = true, weight = 1 } = fieldConfig;
    const candidateValue = resolvePath(candidateData, fieldPath);
    const expectedValue = resolvePath(expectedData, fieldPath);
    if (expectedValue === void 0) {
      // No expected value means no comparison needed
      return buildFieldResult(fieldPath, true, weight, `${fieldPath}: no expected value`);
    }
    if (candidateValue === void 0) {
      if (required) {
        return buildFieldResult(fieldPath, false, weight, `${fieldPath} (required, missing)`);
      }
      // Don't penalize missing optional fields: zero weight keeps them out of the score.
      return buildFieldResult(fieldPath, true, 0, `${fieldPath}: optional field missing`);
    }
    switch (match) {
      case "exact":
        return this.compareExact(fieldPath, candidateValue, expectedValue, weight);
      case "numeric_tolerance":
        return this.compareNumericTolerance(
          fieldPath,
          candidateValue,
          expectedValue,
          fieldConfig,
          weight
        );
      case "date":
        return this.compareDate(fieldPath, candidateValue, expectedValue, fieldConfig, weight);
      default:
        return buildFieldResult(fieldPath, false, weight, `${fieldPath}: unknown match type "${match}"`);
    }
  }
  /**
   * Exact equality comparison.
   */
  compareExact(path, candidateValue, expectedValue, weight) {
    if (deepEqual(candidateValue, expectedValue)) {
      return buildFieldResult(path, true, weight, path);
    }
    if (typeof candidateValue !== typeof expectedValue) {
      return buildFieldResult(
        path,
        false,
        weight,
        `${path} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
      );
    }
    return buildFieldResult(path, false, weight, `${path} (value mismatch)`);
  }
  /**
   * Numeric comparison with absolute or relative tolerance.
   * With `relative: true` the difference is divided by |expected|; when the
   * expected value is 0 the absolute difference is used instead.
   */
  compareNumericTolerance(path, candidateValue, expectedValue, fieldConfig, weight) {
    const { tolerance = 0, relative = false } = fieldConfig;
    const candidateNum = toNumber(candidateValue);
    const expectedNum = toNumber(expectedValue);
    if (candidateNum === null || expectedNum === null) {
      return buildFieldResult(path, false, weight, `${path} (non-numeric value)`);
    }
    if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
      return buildFieldResult(path, false, weight, `${path} (invalid numeric value)`);
    }
    const diff = Math.abs(candidateNum - expectedNum);
    const measured = relative && expectedNum !== 0 ? diff / Math.abs(expectedNum) : diff;
    if (measured <= tolerance) {
      return buildFieldResult(path, true, weight, `${path} (within tolerance: diff=${diff.toFixed(2)})`);
    }
    return buildFieldResult(
      path,
      false,
      weight,
      `${path} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
    );
  }
  /**
   * Date comparison with format normalization.
   * Only the calendar date (year, month, day) is compared.
   */
  compareDate(path, candidateValue, expectedValue, fieldConfig, weight) {
    const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
    const candidateDate = parseDate(String(candidateValue), formats);
    const expectedDate = parseDate(String(expectedValue), formats);
    if (candidateDate === null) {
      return buildFieldResult(path, false, weight, `${path} (unparseable candidate date)`);
    }
    if (expectedDate === null) {
      return buildFieldResult(path, false, weight, `${path} (unparseable expected date)`);
    }
    const sameDay = candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate();
    if (sameDay) {
      return buildFieldResult(path, true, weight, path);
    }
    return buildFieldResult(
      path,
      false,
      weight,
      `${path} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
    );
  }
  /**
   * Aggregate field results using configured strategy.
   * At most four hit and four miss messages are reported.
   */
  aggregateResults(results) {
    const aggregation = this.config.aggregation ?? "weighted_average";
    const hits = [];
    const misses = [];
    for (const result of results) {
      (result.hit ? hits : misses).push(result.message);
    }
    const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
    let score;
    if (aggregation === "all_or_nothing" || totalWeight === 0) {
      // With zero total weight (no fields, or only skipped optional fields)
      // there is nothing to average, so pass unless something actually missed.
      score = misses.length === 0 ? 1 : 0;
    } else {
      const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
      score = weightedSum / totalWeight;
    }
    // Derive the verdict from the same clamped value that is reported as the score.
    const finalScore = clampScore(score);
    const reasoning = `${hits.length}/${results.length} fields matched`;
    return {
      score: finalScore,
      verdict: scoreToVerdict(finalScore),
      hits: hits.slice(0, 4),
      misses: misses.slice(0, 4),
      expectedAspectCount: results.length,
      reasoning
    };
  }
};
7289
/**
 * Resolve a dotted / bracketed path (e.g. "invoice.items[0].total") in a
 * nested object or array.
 *
 * Only own properties are followed, so inherited members such as
 * `constructor` or `toString` are reported as missing instead of leaking
 * prototype values into comparisons.
 *
 * @param {*} obj - Root value to walk.
 * @param {string} path17 - Path using "." separators and "[n]" indices.
 * @returns {*} The value at the path, or undefined when the path is empty,
 *   the root is falsy, or any segment is missing.
 */
function resolvePath(obj, path17) {
  if (!path17 || !obj) {
    return void 0;
  }
  const segments = path17.split(/\.|\[|\]/).filter((segment) => segment.length > 0);
  let current = obj;
  for (const segment of segments) {
    if (current === null || current === void 0 || typeof current !== "object") {
      return void 0;
    }
    if (!Object.hasOwn(current, segment)) {
      return void 0;
    }
    // Array indices arrive as digit strings, which index arrays directly.
    current = current[segment];
  }
  return current;
}
7311
/**
 * Coerce a value to a number for tolerance comparisons.
 *
 * Numbers are returned unchanged (including NaN and infinities); strings go
 * through `Number.parseFloat`, which accepts a leading numeric prefix.
 *
 * @param {*} value - Value to coerce.
 * @returns {number|null} The number, or null when the value is neither a
 *   number nor a string with a numeric prefix.
 */
function toNumber(value) {
  switch (typeof value) {
    case "number":
      return value;
    case "string": {
      const parsed = Number.parseFloat(value);
      return Number.isNaN(parsed) ? null : parsed;
    }
    default:
      return null;
  }
}
7321
/**
 * Build a local-time calendar date, rejecting out-of-range parts instead of
 * letting the Date constructor roll them over into a neighbouring month.
 */
function makeLocalDate(year, monthIndex, day) {
  if (monthIndex < 0 || monthIndex > 11 || day < 1 || day > 31) {
    return null;
  }
  return new Date(year, monthIndex, day);
}

/**
 * Parse a date string into a Date.
 *
 * Recognized shapes, in order:
 *  1. ISO date-only ("YYYY-MM-DD"), built as a local calendar date so its
 *     local year/month/day match the written date in every timezone.
 *  2. Localized "DD-MMM-YYYY" using English month names or abbreviations.
 *  3. Numeric "a/b/YYYY" or "a-b-YYYY". When `formats` lists only US
 *     (MM/DD, MM-DD) or only EU (DD/MM, DD-MM) patterns, that order is used;
 *     otherwise a first number above 12 is read as the day, else US order.
 *  4. Anything else (e.g. ISO date-times) via the native Date parser.
 *
 * The explicit patterns run before the native parser because the native
 * parser reads ambiguous numeric dates in US order regardless of `formats`.
 *
 * @param {string} dateStr - Text to parse.
 * @param {string[]} formats - Accepted format patterns; used only to settle
 *   day/month order for numeric dates.
 * @returns {Date|null} The parsed date, or null when nothing matched.
 */
function parseDate(dateStr, formats) {
  if (!dateStr) {
    return null;
  }
  const trimmed = dateStr.trim();
  const knownFormats = Array.isArray(formats) ? formats : [];

  const isoDateOnly = trimmed.match(/^(\d{4})-(\d{2})-(\d{2})$/);
  if (isoDateOnly) {
    const parsed = makeLocalDate(
      Number.parseInt(isoDateOnly[1], 10),
      Number.parseInt(isoDateOnly[2], 10) - 1,
      Number.parseInt(isoDateOnly[3], 10)
    );
    if (parsed) {
      return parsed;
    }
  }

  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
  if (localizedMatch) {
    const month = MONTH_NAMES[localizedMatch[2].toLowerCase()];
    if (month !== void 0) {
      const parsed = makeLocalDate(
        Number.parseInt(localizedMatch[3], 10),
        month,
        Number.parseInt(localizedMatch[1], 10)
      );
      if (parsed) {
        return parsed;
      }
    }
  }

  const numericMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
  if (numericMatch) {
    const first = Number.parseInt(numericMatch[1], 10);
    const second = Number.parseInt(numericMatch[2], 10);
    const year = Number.parseInt(numericMatch[3], 10);
    const hasUSFormat = knownFormats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
    const hasEUFormat = knownFormats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
    let dayFirst;
    if (hasUSFormat !== hasEUFormat) {
      // Exactly one convention is configured: honour it.
      dayFirst = hasEUFormat;
    } else {
      // Ambiguous configuration: only an impossible month forces day-first.
      dayFirst = first > 12 && second <= 12;
    }
    const parsed = dayFirst ? makeLocalDate(year, second - 1, first) : makeLocalDate(year, first - 1, second);
    if (parsed) {
      return parsed;
    }
  }

  const nativeDate = new Date(trimmed);
  return Number.isNaN(nativeDate.getTime()) ? null : nativeDate;
}
7373
/**
 * Format a Date as "YYYY-MM-DD" using its local calendar fields.
 *
 * Local fields are used (rather than `toISOString()`, which prints the UTC
 * date) so the text matches the local year/month/day that date comparisons
 * in this file are based on, regardless of the host timezone.
 *
 * @param {Date} date - Date to format.
 * @returns {string} The local calendar date as "YYYY-MM-DD".
 */
function formatDateISO(date) {
  const year = String(date.getFullYear()).padStart(4, "0");
  const month = String(date.getMonth() + 1).padStart(2, "0");
  const day = String(date.getDate()).padStart(2, "0");
  return `${year}-${month}-${day}`;
}
7376
/**
 * Parse JSON out of free-form text such as a model answer.
 *
 * Markdown code fences (``` and ```json) are stripped first. The cleaned text
 * is parsed as a whole when possible, so top-level arrays and other valid JSON
 * documents survive intact; otherwise the span from the first "{" to the last
 * "}" is extracted and parsed.
 *
 * @param {*} text - Text to parse; non-string input is treated as empty.
 * @returns {*} The parsed JSON value.
 * @throws {SyntaxError} When no parseable JSON is found.
 */
function parseJsonFromTextSafe(text) {
  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
  try {
    return JSON.parse(cleaned);
  } catch {
    // Not pure JSON: fall through and look for an embedded object.
  }
  const match = cleaned.match(/\{[\s\S]*\}/);
  const blob = match?.[0] ?? cleaned;
  return JSON.parse(blob);
}
5824
7382
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
5825
7383
  {{EVALUATOR_RESULTS_JSON}}
5826
7384
 
@@ -6045,11 +7603,175 @@ var CompositeEvaluator = class {
6045
7603
  }
6046
7604
  }
6047
7605
  };
7606
/**
 * Evaluator that passes when the execution duration recorded in the trace
 * summary is at or below `config.threshold` (milliseconds).
 */
var LatencyEvaluator = class {
  kind = "latency";
  config;
  constructor(options) {
    this.config = options.config;
  }
  /**
   * @param {object} context - Reads `context.traceSummary?.durationMs`.
   * @returns {object} Score 1 / "pass" when within the threshold, otherwise
   *   score 0 / "fail". Missing duration data is a failure.
   */
  evaluate(context) {
    const { threshold } = this.config;
    const durationMs = context.traceSummary?.durationMs;
    // Treat null / NaN / non-numeric the same as absent: comparing them would
    // coerce (null counts as 0) and report a bogus pass.
    if (typeof durationMs !== "number" || Number.isNaN(durationMs)) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No duration data available in trace"],
        expectedAspectCount: 1,
        reasoning: "Execution duration not reported by provider",
        evaluatorRawRequest: {
          type: "latency",
          threshold,
          durationMs: null
        }
      };
    }
    const passed = durationMs <= threshold;
    const comparison = passed ? "<=" : ">";
    const detail = `Duration ${durationMs}ms ${comparison} ${threshold}ms threshold`;
    return {
      score: passed ? 1 : 0,
      verdict: passed ? "pass" : "fail",
      hits: passed ? [detail] : [],
      misses: passed ? [] : [detail],
      expectedAspectCount: 1,
      reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
      evaluatorRawRequest: {
        type: "latency",
        threshold,
        durationMs
      }
    };
  }
};
7647
/**
 * Evaluator that passes when the execution cost recorded in the trace summary
 * is at or below `config.budget` (US dollars).
 */
var CostEvaluator = class {
  kind = "cost";
  config;
  constructor(options) {
    this.config = options.config;
  }
  /**
   * @param {object} context - Reads `context.traceSummary?.costUsd`.
   * @returns {object} Score 1 / "pass" when within budget, otherwise
   *   score 0 / "fail". Missing cost data is a failure.
   */
  evaluate(context) {
    const { budget } = this.config;
    const costUsd = context.traceSummary?.costUsd;
    // Treat null / NaN / non-numeric the same as absent: `null` would make
    // `toFixed` throw and NaN would yield a meaningless comparison.
    if (typeof costUsd !== "number" || Number.isNaN(costUsd)) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No cost data available in trace"],
        expectedAspectCount: 1,
        reasoning: "Execution cost not reported by provider",
        evaluatorRawRequest: {
          type: "cost",
          budget,
          costUsd: null
        }
      };
    }
    const formatCost = (n) => `$${n.toFixed(4)}`;
    const passed = costUsd <= budget;
    const comparison = passed ? "<=" : ">";
    const detail = `Cost ${formatCost(costUsd)} ${comparison} ${formatCost(budget)} budget`;
    return {
      score: passed ? 1 : 0,
      verdict: passed ? "pass" : "fail",
      hits: passed ? [detail] : [],
      misses: passed ? [] : [detail],
      expectedAspectCount: 1,
      reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
      evaluatorRawRequest: {
        type: "cost",
        budget,
        costUsd
      }
    };
  }
};
7689
/**
 * Evaluator that checks token usage from the trace summary against the
 * optional limits `config.max_input`, `config.max_output` and
 * `config.max_total`. The total is input + output + cached tokens.
 */
var TokenUsageEvaluator = class {
  kind = "token_usage";
  config;
  constructor(options) {
    this.config = options.config;
  }
  /**
   * @param {object} context - Reads `context.traceSummary?.tokenUsage`.
   * @returns {object} Score 1 / "pass" when every configured limit holds,
   *   otherwise score 0 / "fail". Missing usage data is a failure.
   */
  evaluate(context) {
    const usage = context.traceSummary?.tokenUsage;
    const maxTotal = this.config.max_total;
    const maxInput = this.config.max_input;
    const maxOutput = this.config.max_output;
    // One aspect per configured limit, but never fewer than one.
    const expectedAspectCount = Math.max(
      [maxTotal, maxInput, maxOutput].filter((v) => typeof v === "number").length,
      1
    );
    const rawRequest = (tokenUsage) => ({
      type: "token_usage",
      max_total: maxTotal ?? null,
      max_input: maxInput ?? null,
      max_output: maxOutput ?? null,
      tokenUsage
    });
    if (!usage) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No token usage data available in trace"],
        expectedAspectCount,
        reasoning: "Token usage not reported by provider",
        evaluatorRawRequest: rawRequest(null)
      };
    }
    // Default absent counters to 0 (as already done for `cached`) so a
    // partially reported usage object cannot turn the total into NaN.
    const input = usage.input ?? 0;
    const output = usage.output ?? 0;
    const cached = usage.cached ?? 0;
    const total = input + output + cached;
    const hits = [];
    const misses = [];
    const checks = [
      ["Input", input, maxInput],
      ["Output", output, maxOutput],
      ["Total", total, maxTotal]
    ];
    for (const [label, actual, limit] of checks) {
      if (typeof limit !== "number") {
        continue;
      }
      if (actual <= limit) {
        hits.push(`${label} tokens ${actual} <= ${limit}`);
      } else {
        misses.push(`${label} tokens ${actual} > ${limit}`);
      }
    }
    const passed = misses.length === 0;
    return {
      score: passed ? 1 : 0,
      verdict: passed ? "pass" : "fail",
      hits,
      misses,
      expectedAspectCount,
      reasoning: `token_usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
      evaluatorRawRequest: rawRequest({
        input,
        output,
        cached,
        total
      })
    };
  }
};
6048
7771
 
6049
7772
  // src/evaluation/orchestrator.ts
6050
- var import_node_crypto3 = require("crypto");
6051
- var import_promises12 = require("fs/promises");
6052
- var import_node_path15 = __toESM(require("path"), 1);
7773
+ var import_node_crypto4 = require("crypto");
7774
+ var import_node_path16 = __toESM(require("path"), 1);
6053
7775
 
6054
7776
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
6055
7777
  var Node = class {
@@ -6191,6 +7913,9 @@ function validateConcurrency(concurrency) {
6191
7913
  }
6192
7914
 
6193
7915
  // src/evaluation/orchestrator.ts
7916
/**
 * Report whether prompts for this provider use the "agent" formatting mode
 * rather than "lm": true for agent providers and for providers whose kind
 * is "cli".
 */
function usesFileReferencePrompt(provider) {
  if (isAgentProvider(provider)) {
    return true;
  }
  return provider.kind === "cli";
}
6194
7919
  async function runEvaluation(options) {
6195
7920
  const {
6196
7921
  testFilePath: evalFilePath,
@@ -6202,7 +7927,6 @@ async function runEvaluation(options) {
6202
7927
  evaluators,
6203
7928
  maxRetries,
6204
7929
  agentTimeoutMs,
6205
- promptDumpDir,
6206
7930
  cache,
6207
7931
  useCache,
6208
7932
  now,
@@ -6282,7 +8006,6 @@ async function runEvaluation(options) {
6282
8006
  provider: primaryProvider,
6283
8007
  target,
6284
8008
  evaluatorRegistry,
6285
- promptDumpDir,
6286
8009
  nowFn: now ?? (() => /* @__PURE__ */ new Date()),
6287
8010
  onProgress,
6288
8011
  onResult,
@@ -6324,7 +8047,6 @@ async function runEvaluation(options) {
6324
8047
  evaluators: evaluatorRegistry,
6325
8048
  maxRetries,
6326
8049
  agentTimeoutMs,
6327
- promptDumpDir,
6328
8050
  cache,
6329
8051
  useCache,
6330
8052
  now,
@@ -6367,7 +8089,8 @@ async function runEvaluation(options) {
6367
8089
  results.push(outcome.value);
6368
8090
  } else {
6369
8091
  const evalCase = filteredEvalCases[i];
6370
- const promptInputs = await buildPromptInputs(evalCase);
8092
+ const formattingMode = usesFileReferencePrompt(primaryProvider) ? "agent" : "lm";
8093
+ const promptInputs = await buildPromptInputs(evalCase, formattingMode);
6371
8094
  const errorResult = buildErrorResult(
6372
8095
  evalCase,
6373
8096
  target.name,
@@ -6390,7 +8113,6 @@ async function runBatchEvaluation(options) {
6390
8113
  provider,
6391
8114
  target,
6392
8115
  evaluatorRegistry,
6393
- promptDumpDir,
6394
8116
  nowFn,
6395
8117
  onProgress,
6396
8118
  onResult,
@@ -6398,12 +8120,9 @@ async function runBatchEvaluation(options) {
6398
8120
  agentTimeoutMs
6399
8121
  } = options;
6400
8122
  const promptInputsList = [];
6401
- const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
8123
+ const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
6402
8124
  for (const evalCase of evalCases) {
6403
8125
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
6404
- if (promptDumpDir) {
6405
- await dumpPrompt(promptDumpDir, evalCase, promptInputs);
6406
- }
6407
8126
  promptInputsList.push(promptInputs);
6408
8127
  }
6409
8128
  const batchRequests = evalCases.map((evalCase, index) => {
@@ -6445,13 +8164,20 @@ async function runBatchEvaluation(options) {
6445
8164
  const promptInputs = promptInputsList[i];
6446
8165
  const providerResponse = batchResponse[i];
6447
8166
  const outputMessages = providerResponse.outputMessages;
6448
- const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
8167
+ const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
8168
+ const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : hasExecutionMetrics ? {
8169
+ eventCount: 0,
8170
+ toolNames: [],
8171
+ toolCallsByName: {},
8172
+ errorCount: 0
8173
+ } : void 0;
6449
8174
  const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
6450
8175
  tokenUsage: providerResponse.tokenUsage,
6451
8176
  costUsd: providerResponse.costUsd,
6452
8177
  durationMs: providerResponse.durationMs
6453
8178
  }) : void 0;
6454
8179
  const candidate = extractLastAssistantContent(outputMessages);
8180
+ const providerError = extractProviderError(providerResponse);
6455
8181
  let result;
6456
8182
  try {
6457
8183
  result = await evaluateCandidate({
@@ -6468,6 +8194,9 @@ async function runBatchEvaluation(options) {
6468
8194
  outputMessages,
6469
8195
  traceSummary
6470
8196
  });
8197
+ if (providerError) {
8198
+ result = { ...result, error: providerError };
8199
+ }
6471
8200
  } catch (error) {
6472
8201
  const errorResult = buildErrorResult(
6473
8202
  evalCase,
@@ -6500,9 +8229,10 @@ async function runBatchEvaluation(options) {
6500
8229
  await onProgress({
6501
8230
  workerId: 1,
6502
8231
  evalId: evalCase.id,
6503
- status: "completed",
8232
+ status: result.error ? "failed" : "completed",
6504
8233
  startedAt: 0,
6505
- completedAt: Date.now()
8234
+ completedAt: Date.now(),
8235
+ error: result.error
6506
8236
  });
6507
8237
  }
6508
8238
  }
@@ -6517,17 +8247,13 @@ async function runEvalCase(options) {
6517
8247
  now,
6518
8248
  maxRetries,
6519
8249
  agentTimeoutMs,
6520
- promptDumpDir,
6521
8250
  cache,
6522
8251
  useCache,
6523
8252
  signal,
6524
8253
  judgeProvider
6525
8254
  } = options;
6526
- const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
8255
+ const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
6527
8256
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
6528
- if (promptDumpDir) {
6529
- await dumpPrompt(promptDumpDir, evalCase, promptInputs);
6530
- }
6531
8257
  const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
6532
8258
  let cachedResponse;
6533
8259
  if (cacheKey && cache) {
@@ -6571,15 +8297,22 @@ async function runEvalCase(options) {
6571
8297
  await cache.set(cacheKey, providerResponse);
6572
8298
  }
6573
8299
  const outputMessages = providerResponse.outputMessages;
6574
- const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
8300
+ const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
8301
+ const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : hasExecutionMetrics ? {
8302
+ eventCount: 0,
8303
+ toolNames: [],
8304
+ toolCallsByName: {},
8305
+ errorCount: 0
8306
+ } : void 0;
6575
8307
  const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
6576
8308
  tokenUsage: providerResponse.tokenUsage,
6577
8309
  costUsd: providerResponse.costUsd,
6578
8310
  durationMs: providerResponse.durationMs
6579
8311
  }) : void 0;
6580
8312
  const candidate = extractLastAssistantContent(outputMessages);
8313
+ const providerError = extractProviderError(providerResponse);
6581
8314
  try {
6582
- return await evaluateCandidate({
8315
+ const result = await evaluateCandidate({
6583
8316
  evalCase,
6584
8317
  candidate,
6585
8318
  target,
@@ -6593,6 +8326,7 @@ async function runEvalCase(options) {
6593
8326
  outputMessages,
6594
8327
  traceSummary
6595
8328
  });
8329
+ return providerError ? { ...result, error: providerError } : result;
6596
8330
  } catch (error) {
6597
8331
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
6598
8332
  }
@@ -6658,7 +8392,6 @@ async function evaluateCandidate(options) {
6658
8392
  candidateAnswer: candidate,
6659
8393
  target: target.name,
6660
8394
  reasoning: score.reasoning,
6661
- rawAspects: score.rawAspects,
6662
8395
  agentProviderRequest,
6663
8396
  lmProviderRequest,
6664
8397
  evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
@@ -6768,7 +8501,8 @@ async function runEvaluatorList(options) {
6768
8501
  const codeEvaluator = new CodeEvaluator({
6769
8502
  script: evaluator.script,
6770
8503
  cwd: evaluator.resolvedCwd ?? evaluator.cwd,
6771
- agentTimeoutMs
8504
+ agentTimeoutMs,
8505
+ config: evaluator.config
6772
8506
  });
6773
8507
  const score2 = await codeEvaluator.evaluate({
6774
8508
  evalCase,
@@ -6796,7 +8530,7 @@ async function runEvaluatorList(options) {
6796
8530
  });
6797
8531
  }
6798
8532
  if (evaluator.type === "composite") {
6799
- const evalFileDir = evalCase.guideline_paths[0] ? import_node_path15.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
8533
+ const evalFileDir = evalCase.guideline_paths[0] ? import_node_path16.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
6800
8534
  const createEvaluator = (memberConfig) => {
6801
8535
  switch (memberConfig.type) {
6802
8536
  case "llm_judge":
@@ -6805,7 +8539,8 @@ async function runEvaluatorList(options) {
6805
8539
  return new CodeEvaluator({
6806
8540
  script: memberConfig.script,
6807
8541
  cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
6808
- agentTimeoutMs
8542
+ agentTimeoutMs,
8543
+ config: memberConfig.config
6809
8544
  });
6810
8545
  case "composite":
6811
8546
  return new CompositeEvaluator({
@@ -6817,6 +8552,22 @@ async function runEvaluatorList(options) {
6817
8552
  return new ToolTrajectoryEvaluator({
6818
8553
  config: memberConfig
6819
8554
  });
8555
+ case "field_accuracy":
8556
+ return new FieldAccuracyEvaluator({
8557
+ config: memberConfig
8558
+ });
8559
+ case "latency":
8560
+ return new LatencyEvaluator({
8561
+ config: memberConfig
8562
+ });
8563
+ case "cost":
8564
+ return new CostEvaluator({
8565
+ config: memberConfig
8566
+ });
8567
+ case "token_usage":
8568
+ return new TokenUsageEvaluator({
8569
+ config: memberConfig
8570
+ });
6820
8571
  default: {
6821
8572
  const unknownConfig = memberConfig;
6822
8573
  throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -6836,7 +8587,9 @@ async function runEvaluatorList(options) {
6836
8587
  attempt,
6837
8588
  promptInputs,
6838
8589
  now,
6839
- judgeProvider
8590
+ judgeProvider,
8591
+ outputMessages,
8592
+ traceSummary
6840
8593
  });
6841
8594
  const weight = evaluator.weight ?? 1;
6842
8595
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -6881,6 +8634,118 @@ async function runEvaluatorList(options) {
6881
8634
  reasoning: score2.reasoning
6882
8635
  });
6883
8636
  }
8637
+ if (evaluator.type === "field_accuracy") {
8638
+ const fieldAccuracyEvaluator = new FieldAccuracyEvaluator({
8639
+ config: evaluator
8640
+ });
8641
+ const score2 = fieldAccuracyEvaluator.evaluate({
8642
+ evalCase,
8643
+ candidate,
8644
+ target,
8645
+ provider,
8646
+ attempt,
8647
+ promptInputs,
8648
+ now,
8649
+ outputMessages,
8650
+ traceSummary
8651
+ });
8652
+ const weight = evaluator.weight ?? 1;
8653
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
8654
+ evaluatorResults.push({
8655
+ name: evaluator.name,
8656
+ type: evaluator.type,
8657
+ score: score2.score,
8658
+ weight,
8659
+ verdict: score2.verdict,
8660
+ hits: score2.hits,
8661
+ misses: score2.misses,
8662
+ reasoning: score2.reasoning
8663
+ });
8664
+ }
8665
+ if (evaluator.type === "latency") {
8666
+ const latencyEvaluator = new LatencyEvaluator({
8667
+ config: evaluator
8668
+ });
8669
+ const score2 = latencyEvaluator.evaluate({
8670
+ evalCase,
8671
+ candidate,
8672
+ target,
8673
+ provider,
8674
+ attempt,
8675
+ promptInputs,
8676
+ now,
8677
+ outputMessages,
8678
+ traceSummary
8679
+ });
8680
+ const weight = evaluator.weight ?? 1;
8681
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
8682
+ evaluatorResults.push({
8683
+ name: evaluator.name,
8684
+ type: evaluator.type,
8685
+ score: score2.score,
8686
+ weight,
8687
+ verdict: score2.verdict,
8688
+ hits: score2.hits,
8689
+ misses: score2.misses,
8690
+ reasoning: score2.reasoning
8691
+ });
8692
+ }
8693
+ if (evaluator.type === "cost") {
8694
+ const costEvaluator = new CostEvaluator({
8695
+ config: evaluator
8696
+ });
8697
+ const score2 = costEvaluator.evaluate({
8698
+ evalCase,
8699
+ candidate,
8700
+ target,
8701
+ provider,
8702
+ attempt,
8703
+ promptInputs,
8704
+ now,
8705
+ outputMessages,
8706
+ traceSummary
8707
+ });
8708
+ const weight = evaluator.weight ?? 1;
8709
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
8710
+ evaluatorResults.push({
8711
+ name: evaluator.name,
8712
+ type: evaluator.type,
8713
+ score: score2.score,
8714
+ weight,
8715
+ verdict: score2.verdict,
8716
+ hits: score2.hits,
8717
+ misses: score2.misses,
8718
+ reasoning: score2.reasoning
8719
+ });
8720
+ }
8721
+ if (evaluator.type === "token_usage") {
8722
+ const tokenUsageEvaluator = new TokenUsageEvaluator({
8723
+ config: evaluator
8724
+ });
8725
+ const score2 = tokenUsageEvaluator.evaluate({
8726
+ evalCase,
8727
+ candidate,
8728
+ target,
8729
+ provider,
8730
+ attempt,
8731
+ promptInputs,
8732
+ now,
8733
+ outputMessages,
8734
+ traceSummary
8735
+ });
8736
+ const weight = evaluator.weight ?? 1;
8737
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
8738
+ evaluatorResults.push({
8739
+ name: evaluator.name,
8740
+ type: evaluator.type,
8741
+ score: score2.score,
8742
+ weight,
8743
+ verdict: score2.verdict,
8744
+ hits: score2.hits,
8745
+ misses: score2.misses,
8746
+ reasoning: score2.reasoning
8747
+ });
8748
+ }
6884
8749
  } catch (error) {
6885
8750
  const message = error instanceof Error ? error.message : String(error);
6886
8751
  const fallbackScore = {
@@ -6920,7 +8785,6 @@ async function runEvaluatorList(options) {
6920
8785
  (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
6921
8786
  0
6922
8787
  );
6923
- const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
6924
8788
  const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
6925
8789
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
6926
8790
  const score = {
@@ -6929,8 +8793,7 @@ async function runEvaluatorList(options) {
6929
8793
  hits,
6930
8794
  misses,
6931
8795
  expectedAspectCount,
6932
- reasoning,
6933
- rawAspects: rawAspects.length > 0 ? rawAspects : void 0
8796
+ reasoning
6934
8797
  };
6935
8798
  return { score, evaluatorResults };
6936
8799
  }
@@ -7005,26 +8868,6 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
7005
8868
  llm_judge: llmJudge
7006
8869
  };
7007
8870
  }
7008
- async function dumpPrompt(directory, evalCase, promptInputs) {
7009
- const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
7010
- const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
7011
- const filePath = import_node_path15.default.resolve(directory, filename);
7012
- await (0, import_promises12.mkdir)(import_node_path15.default.dirname(filePath), { recursive: true });
7013
- const payload = {
7014
- eval_id: evalCase.id,
7015
- question: promptInputs.question,
7016
- guidelines: promptInputs.guidelines,
7017
- guideline_paths: evalCase.guideline_paths
7018
- };
7019
- await (0, import_promises12.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
7020
- }
7021
- function sanitizeFilename(value) {
7022
- if (!value) {
7023
- return "prompt";
7024
- }
7025
- const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
7026
- return sanitized.length > 0 ? sanitized : (0, import_node_crypto3.randomUUID)();
7027
- }
7028
8871
  async function invokeProvider(provider, options) {
7029
8872
  const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
7030
8873
  const controller = new AbortController();
@@ -7088,14 +8931,25 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
7088
8931
  misses: [`Error: ${message}`],
7089
8932
  candidateAnswer: `Error occurred: ${message}`,
7090
8933
  target: targetName,
7091
- rawAspects: [],
7092
8934
  agentProviderRequest,
7093
8935
  lmProviderRequest,
7094
8936
  error: message
7095
8937
  };
7096
8938
  }
8939
+ function extractProviderError(response) {
8940
+ const raw = response.raw;
8941
+ if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
8942
+ return void 0;
8943
+ }
8944
+ const error = raw.error;
8945
+ if (typeof error !== "string") {
8946
+ return void 0;
8947
+ }
8948
+ const trimmed = error.trim();
8949
+ return trimmed.length > 0 ? trimmed : void 0;
8950
+ }
7097
8951
  function createCacheKey(provider, target, evalCase, promptInputs) {
7098
- const hash = (0, import_node_crypto3.createHash)("sha256");
8952
+ const hash = (0, import_node_crypto4.createHash)("sha256");
7099
8953
  hash.update(provider.id);
7100
8954
  hash.update(target.name);
7101
8955
  hash.update(evalCase.id);
@@ -7152,15 +9006,15 @@ function computeWeightedMean(entries) {
7152
9006
 
7153
9007
  // src/evaluation/generators/rubric-generator.ts
7154
9008
  var import_ai3 = require("ai");
7155
- var import_zod3 = require("zod");
7156
- var rubricItemSchema = import_zod3.z.object({
7157
- id: import_zod3.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
7158
- description: import_zod3.z.string().describe("What this rubric checks for"),
7159
- weight: import_zod3.z.number().default(1).describe("Relative importance (default 1.0)"),
7160
- required: import_zod3.z.boolean().default(true).describe("Whether this is a mandatory requirement")
9009
+ var import_zod4 = require("zod");
9010
+ var rubricItemSchema = import_zod4.z.object({
9011
+ id: import_zod4.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
9012
+ description: import_zod4.z.string().describe("What this rubric checks for"),
9013
+ weight: import_zod4.z.number().default(1).describe("Relative importance (default 1.0)"),
9014
+ required: import_zod4.z.boolean().default(true).describe("Whether this is a mandatory requirement")
7161
9015
  });
7162
- var rubricGenerationSchema = import_zod3.z.object({
7163
- rubrics: import_zod3.z.array(rubricItemSchema).describe("List of evaluation rubrics")
9016
+ var rubricGenerationSchema = import_zod4.z.object({
9017
+ rubrics: import_zod4.z.array(rubricItemSchema).describe("List of evaluation rubrics")
7164
9018
  });
7165
9019
  async function generateRubrics(options) {
7166
9020
  const { expectedOutcome, question, referenceAnswer, provider } = options;
@@ -7230,6 +9084,17 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
7230
9084
  return parts.join("\n");
7231
9085
  }
7232
9086
 
9087
+ // src/evaluation/code-judge-sdk.ts
9088
+ var import_node_fs7 = require("fs");
9089
+ function parseCodeJudgePayload(payload) {
9090
+ const parsed = JSON.parse(payload);
9091
+ return toCamelCaseDeep(parsed);
9092
+ }
9093
+ function readCodeJudgePayload() {
9094
+ const stdin = (0, import_node_fs7.readFileSync)(0, "utf8");
9095
+ return parseCodeJudgePayload(stdin);
9096
+ }
9097
+
7233
9098
  // src/index.ts
7234
9099
  function createAgentKernel() {
7235
9100
  return { status: "stub" };
@@ -7238,15 +9103,20 @@ function createAgentKernel() {
7238
9103
  0 && (module.exports = {
7239
9104
  CodeEvaluator,
7240
9105
  CompositeEvaluator,
9106
+ CostEvaluator,
7241
9107
  DEFAULT_EXPLORATION_TOOLS,
9108
+ FieldAccuracyEvaluator,
9109
+ LatencyEvaluator,
7242
9110
  LlmJudgeEvaluator,
7243
9111
  TEST_MESSAGE_ROLES,
9112
+ TokenUsageEvaluator,
7244
9113
  ToolTrajectoryEvaluator,
7245
9114
  avgToolDurationMs,
7246
9115
  buildDirectoryChain,
7247
9116
  buildPromptInputs,
7248
9117
  buildSearchRoots,
7249
9118
  computeTraceSummary,
9119
+ consumeClaudeCodeLogEntries,
7250
9120
  consumeCodexLogEntries,
7251
9121
  consumePiLogEntries,
7252
9122
  createAgentKernel,
@@ -7268,6 +9138,8 @@ function createAgentKernel() {
7268
9138
  loadEvalCases,
7269
9139
  mergeExecutionMetrics,
7270
9140
  normalizeLineEndings,
9141
+ parseCodeJudgePayload,
9142
+ readCodeJudgePayload,
7271
9143
  readJsonFile,
7272
9144
  readTargetDefinitions,
7273
9145
  readTestSuiteMetadata,
@@ -7277,6 +9149,7 @@ function createAgentKernel() {
7277
9149
  resolveTargetDefinition,
7278
9150
  runEvalCase,
7279
9151
  runEvaluation,
9152
+ subscribeToClaudeCodeLogEntries,
7280
9153
  subscribeToCodexLogEntries,
7281
9154
  subscribeToPiLogEntries,
7282
9155
  tokensPerTool