@agentv/core 1.5.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -10,7 +10,7 @@ import {
10
10
  readTextFile,
11
11
  resolveFileReference,
12
12
  resolveTargetDefinition
13
- } from "./chunk-E2VSU4WZ.js";
13
+ } from "./chunk-KDEP4I7G.js";
14
14
 
15
15
  // src/evaluation/types.ts
16
16
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -64,7 +64,11 @@ var EVALUATOR_KIND_VALUES = [
64
64
  "llm_judge",
65
65
  "rubric",
66
66
  "composite",
67
- "tool_trajectory"
67
+ "tool_trajectory",
68
+ "field_accuracy",
69
+ "latency",
70
+ "cost",
71
+ "token_usage"
68
72
  ];
69
73
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
70
74
  function isEvaluatorKind(value) {
@@ -486,7 +490,22 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
486
490
  continue;
487
491
  }
488
492
  if (typeValue === "code_judge") {
489
- const script = asString2(rawEvaluator.script);
493
+ let script;
494
+ const rawScript = rawEvaluator.script;
495
+ if (typeof rawScript === "string") {
496
+ const trimmed = rawScript.trim();
497
+ if (trimmed.length === 0) {
498
+ throw new Error(
499
+ `Invalid code_judge script for evaluator '${name}' in '${evalId}': script cannot be empty`
500
+ );
501
+ }
502
+ script = parseCommandToArgv(trimmed);
503
+ } else {
504
+ script = asStringArray(
505
+ rawScript,
506
+ `code_judge script for evaluator '${name}' in '${evalId}'`
507
+ );
508
+ }
490
509
  if (!script) {
491
510
  logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
492
511
  continue;
@@ -507,13 +526,21 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
507
526
  } else {
508
527
  resolvedCwd = searchRoots[0];
509
528
  }
529
+ const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
530
+ const config = {};
531
+ for (const [key, value] of Object.entries(rawEvaluator)) {
532
+ if (!knownProps.has(key) && value !== void 0) {
533
+ config[key] = value;
534
+ }
535
+ }
510
536
  evaluators.push({
511
537
  name,
512
538
  type: "code",
513
539
  script,
514
540
  cwd,
515
541
  resolvedCwd,
516
- ...weight2 !== void 0 ? { weight: weight2 } : {}
542
+ ...weight2 !== void 0 ? { weight: weight2 } : {},
543
+ ...Object.keys(config).length > 0 ? { config } : {}
517
544
  });
518
545
  continue;
519
546
  }
@@ -688,6 +715,140 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
688
715
  evaluators.push(config);
689
716
  continue;
690
717
  }
718
+ if (typeValue === "field_accuracy") {
719
+ const rawFields = rawEvaluator.fields;
720
+ if (!Array.isArray(rawFields)) {
721
+ logWarning2(
722
+ `Skipping field_accuracy evaluator '${name}' in '${evalId}': missing fields array`
723
+ );
724
+ continue;
725
+ }
726
+ if (rawFields.length === 0) {
727
+ logWarning2(
728
+ `Skipping field_accuracy evaluator '${name}' in '${evalId}': fields array is empty`
729
+ );
730
+ continue;
731
+ }
732
+ const fields = [];
733
+ for (const rawField of rawFields) {
734
+ if (!isJsonObject2(rawField)) {
735
+ logWarning2(
736
+ `Skipping invalid field entry in field_accuracy evaluator '${name}' (expected object)`
737
+ );
738
+ continue;
739
+ }
740
+ const fieldPath = asString2(rawField.path);
741
+ const match = asString2(rawField.match);
742
+ if (!fieldPath) {
743
+ logWarning2(
744
+ `Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
745
+ );
746
+ continue;
747
+ }
748
+ if (!match || !isValidFieldMatchType(match)) {
749
+ logWarning2(
750
+ `Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code_judge evaluator.`
751
+ );
752
+ continue;
753
+ }
754
+ const fieldConfig = {
755
+ path: fieldPath,
756
+ match,
757
+ ...typeof rawField.required === "boolean" ? { required: rawField.required } : {},
758
+ ...typeof rawField.weight === "number" ? { weight: rawField.weight } : {},
759
+ ...typeof rawField.tolerance === "number" ? { tolerance: rawField.tolerance } : {},
760
+ ...typeof rawField.relative === "boolean" ? { relative: rawField.relative } : {},
761
+ ...Array.isArray(rawField.formats) ? { formats: rawField.formats.filter((f) => typeof f === "string") } : {}
762
+ };
763
+ fields.push(fieldConfig);
764
+ }
765
+ if (fields.length === 0) {
766
+ logWarning2(
767
+ `Skipping field_accuracy evaluator '${name}' in '${evalId}': no valid fields found`
768
+ );
769
+ continue;
770
+ }
771
+ const aggregation = asString2(rawEvaluator.aggregation);
772
+ const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
773
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
774
+ evaluators.push({
775
+ name,
776
+ type: "field_accuracy",
777
+ fields,
778
+ ...validAggregation ? { aggregation: validAggregation } : {},
779
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
780
+ });
781
+ continue;
782
+ }
783
+ if (typeValue === "latency") {
784
+ const threshold = rawEvaluator.threshold;
785
+ if (typeof threshold !== "number" || threshold < 0) {
786
+ logWarning2(
787
+ `Skipping latency evaluator '${name}' in '${evalId}': threshold must be a non-negative number`
788
+ );
789
+ continue;
790
+ }
791
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
792
+ evaluators.push({
793
+ name,
794
+ type: "latency",
795
+ threshold,
796
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
797
+ });
798
+ continue;
799
+ }
800
+ if (typeValue === "cost") {
801
+ const budget = rawEvaluator.budget;
802
+ if (typeof budget !== "number" || budget < 0) {
803
+ logWarning2(
804
+ `Skipping cost evaluator '${name}' in '${evalId}': budget must be a non-negative number`
805
+ );
806
+ continue;
807
+ }
808
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
809
+ evaluators.push({
810
+ name,
811
+ type: "cost",
812
+ budget,
813
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
814
+ });
815
+ continue;
816
+ }
817
+ if (typeValue === "token_usage") {
818
+ const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
819
+ const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
820
+ const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
821
+ const limits = [
822
+ ["max_total", maxTotal],
823
+ ["max_input", maxInput],
824
+ ["max_output", maxOutput]
825
+ ];
826
+ const validLimits = {};
827
+ for (const [key, raw] of limits) {
828
+ if (raw === void 0) continue;
829
+ if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
830
+ logWarning2(
831
+ `Skipping token_usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
832
+ );
833
+ continue;
834
+ }
835
+ validLimits[key] = raw;
836
+ }
837
+ if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
838
+ logWarning2(
839
+ `Skipping token_usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
840
+ );
841
+ continue;
842
+ }
843
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
844
+ evaluators.push({
845
+ name,
846
+ type: "token_usage",
847
+ ...validLimits,
848
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
849
+ });
850
+ continue;
851
+ }
691
852
  const prompt = asString2(rawEvaluator.prompt);
692
853
  let promptPath;
693
854
  if (prompt) {
@@ -758,6 +919,34 @@ function coerceEvaluator(candidate, contextId) {
758
919
  function asString2(value) {
759
920
  return typeof value === "string" ? value : void 0;
760
921
  }
922
/**
 * Validates that `value` is a non-empty array of non-blank strings.
 *
 * @param {unknown} value - Candidate value; `undefined` is passed through untouched.
 * @param {string} description - Human-readable label used as the prefix of error messages.
 * @returns {string[] | undefined} A fresh copy of the tokens, or `undefined` when `value` is `undefined`.
 * @throws {Error} When `value` is not an array, is empty, or contains a non-string / blank entry.
 */
function asStringArray(value, description) {
  if (value === void 0) {
    return void 0;
  }
  if (!Array.isArray(value)) {
    throw new Error(`${description} must be an array of strings (argv tokens)`);
  }
  if (value.length === 0) {
    throw new Error(`${description} cannot be empty`);
  }
  // Array.from visits every index (holes included), matching iteration over entries().
  return Array.from(value, (token, position) => {
    if (typeof token !== "string") {
      throw new Error(`${description}[${position}] must be a string`);
    }
    if (token.trim().length === 0) {
      throw new Error(`${description}[${position}] cannot be empty`);
    }
    return token;
  });
}
944
/**
 * Wraps a shell command string into an argv array that runs it through the
 * platform shell: `cmd.exe /c` on Windows, `sh -lc` everywhere else.
 *
 * @param {string} command - Command line to execute.
 * @returns {string[]} Argv tokens: shell executable, shell flag, then the command.
 */
function parseCommandToArgv(command) {
  const shellPrefix = process.platform === "win32" ? ["cmd.exe", "/c"] : ["sh", "-lc"];
  return [...shellPrefix, command];
}
761
950
  function isJsonObject2(value) {
762
951
  return typeof value === "object" && value !== null && !Array.isArray(value);
763
952
  }
@@ -791,6 +980,14 @@ function validateWeight(rawWeight, evaluatorName, evalId) {
791
980
  }
792
981
  return rawWeight;
793
982
  }
983
// Match strategies accepted for a field_accuracy field entry.
var VALID_FIELD_MATCH_TYPES = /* @__PURE__ */ new Set(["exact", "numeric_tolerance", "date"]);
/**
 * Tells whether `value` is one of the supported field match types.
 *
 * @param {unknown} value - Candidate match type.
 * @returns {boolean} `true` only for the strings listed in VALID_FIELD_MATCH_TYPES.
 */
function isValidFieldMatchType(value) {
  if (typeof value !== "string") {
    return false;
  }
  return VALID_FIELD_MATCH_TYPES.has(value);
}
987
// Aggregation modes accepted for a field_accuracy evaluator.
var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average", "all_or_nothing"]);
/**
 * Tells whether `value` is one of the supported field aggregation types.
 *
 * @param {unknown} value - Candidate aggregation type.
 * @returns {boolean} `true` only for the strings listed in VALID_FIELD_AGGREGATION_TYPES.
 */
function isValidFieldAggregationType(value) {
  if (typeof value !== "string") {
    return false;
  }
  return VALID_FIELD_AGGREGATION_TYPES.has(value);
}
794
991
 
795
992
  // src/evaluation/loaders/message-processor.ts
796
993
  import { readFile as readFile3 } from "node:fs/promises";
@@ -1750,91 +1947,992 @@ async function withRetry(fn, retryConfig, signal) {
1750
1947
  throw lastError;
1751
1948
  }
1752
1949
 
1753
- // src/evaluation/providers/cli.ts
1754
- import { exec as execWithCallback } from "node:child_process";
1755
- import fs from "node:fs/promises";
1756
- import os from "node:os";
1757
- import path7 from "node:path";
1758
- import { promisify } from "node:util";
1759
- var execAsync = promisify(execWithCallback);
1760
- var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
1761
- async function defaultCommandRunner(command, options) {
1762
- const execOptions = {
1763
- cwd: options.cwd,
1764
- env: options.env,
1765
- timeout: options.timeoutMs,
1766
- signal: options.signal,
1767
- maxBuffer: DEFAULT_MAX_BUFFER,
1768
- shell: process.platform === "win32" ? "powershell.exe" : void 0
1950
+ // src/evaluation/providers/claude-code.ts
1951
+ import { spawn } from "node:child_process";
1952
+ import { randomUUID } from "node:crypto";
1953
+ import { createWriteStream } from "node:fs";
1954
+ import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
1955
+ import { tmpdir } from "node:os";
1956
+ import path8 from "node:path";
1957
+
1958
+ // src/evaluation/providers/claude-code-log-tracker.ts
1959
// Registry-wide symbols so every copy of this module loaded in the process
// resolves to the same slots on globalThis.
var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeCodeLogs");
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.claudeCodeLogSubscribers");
/**
 * Returns the process-wide array of recorded Claude Code log entries,
 * creating and registering it on globalThis on first use.
 *
 * @returns {Array} The shared, mutable log-entry array.
 */
function getClaudeCodeLogStore() {
  const registry = globalThis;
  if (!registry[GLOBAL_LOGS_KEY]) {
    registry[GLOBAL_LOGS_KEY] = [];
  }
  return registry[GLOBAL_LOGS_KEY];
}
1971
/**
 * Returns the process-wide Set of log-entry listeners, creating and
 * registering it on globalThis on first use.
 *
 * @returns {Set} The shared, mutable listener set.
 */
function getSubscriberStore() {
  const registry = globalThis;
  if (!registry[GLOBAL_SUBSCRIBERS_KEY]) {
    registry[GLOBAL_SUBSCRIBERS_KEY] = /* @__PURE__ */ new Set();
  }
  return registry[GLOBAL_SUBSCRIBERS_KEY];
}
1981
/**
 * Delivers `entry` to every registered log listener. A listener that throws
 * is reported with console.warn and does not stop delivery to the others.
 *
 * @param {object} entry - Log entry handed to each listener.
 */
function notifySubscribers(entry) {
  // Copy the set first so listeners added or removed during dispatch do not affect this round.
  [...getSubscriberStore()].forEach((notify) => {
    try {
      notify(entry);
    } catch (failure) {
      const reason = failure instanceof Error ? failure.message : String(failure);
      console.warn(`Claude Code log subscriber failed: ${reason}`);
    }
  });
}
1992
/**
 * Appends `entry` to the shared log store, then notifies all subscribers.
 *
 * @param {object} entry - Log entry to record.
 */
function recordClaudeCodeLogEntry(entry) {
  const store = getClaudeCodeLogStore();
  store.push(entry);
  notifySubscribers(entry);
}
1996
/**
 * Removes and returns every entry currently held in the shared log store.
 *
 * @returns {Array} The drained entries in recording order (a new array; empty when nothing was stored).
 */
function consumeClaudeCodeLogEntries() {
  // splice(0) empties the store in place and hands back the removed entries.
  const drained = getClaudeCodeLogStore().splice(0);
  return drained;
}
2003
+ function subscribeToClaudeCodeLogEntries(listener) {
2004
+ const store = getSubscriberStore();
2005
+ store.add(listener);
2006
+ return () => {
2007
+ store.delete(listener);
1769
2008
  };
1770
- try {
1771
- const { stdout, stderr } = await execAsync(command, execOptions);
1772
- return {
1773
- stdout,
1774
- stderr,
1775
- exitCode: 0,
1776
- failed: false,
1777
- timedOut: false,
1778
- signal: null
1779
- };
1780
- } catch (error) {
1781
- const execError = error;
1782
- return {
1783
- stdout: execError.stdout ?? "",
1784
- stderr: execError.stderr ?? "",
1785
- exitCode: typeof execError.code === "number" ? execError.code : null,
1786
- failed: true,
1787
- timedOut: execError.timedOut === true || execError.killed === true,
1788
- signal: execError.signal ?? null
1789
- };
2009
+ }
2010
+
2011
+ // src/evaluation/providers/preread.ts
2012
+ import path7 from "node:path";
2013
/**
 * Builds the prompt text sent to an agent: an optional mandatory pre-read
 * block listing guideline and input files, followed by the user query section.
 *
 * @param {object} request - Request carrying `question` and optionally `guideline_patterns`.
 * @param {string[] | undefined} inputFiles - Paths of files attached to the request.
 * @param {object} [options] - Optional `guidelinePatterns` (takes precedence over the request's) and `guidelineOverrides`.
 * @returns {string} The assembled, trimmed prompt document.
 */
function buildPromptDocument(request, inputFiles, options) {
  const patterns = options?.guidelinePatterns ?? request.guideline_patterns;
  const guidelineFiles = collectGuidelineFiles(inputFiles, patterns, options?.guidelineOverrides);
  const guidelineLookup = new Set(guidelineFiles);
  // Files already listed as guidelines are not repeated in the input-file list.
  const nonGuidelineInputFiles = collectInputFiles(inputFiles).filter(
    (file) => !guidelineLookup.has(file)
  );
  const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
  const segments = prereadBlock.length > 0 ? ["\n", prereadBlock] : [];
  segments.push("\n[[ ## user_query ## ]]\n", request.question.trim());
  return segments.join("\n").trim();
}
1792
- var CliProvider = class {
1793
- id;
1794
- kind = "cli";
1795
- targetName;
1796
- supportsBatch = true;
1797
- config;
1798
- runCommand;
1799
- verbose;
1800
- keepTempFiles;
1801
- healthcheckPromise;
1802
- constructor(targetName, config, runner = defaultCommandRunner) {
1803
- this.targetName = targetName;
1804
- this.id = `cli:${targetName}`;
1805
- this.config = config;
1806
- this.runCommand = runner;
1807
- this.verbose = config.verbose ?? false;
1808
- this.keepTempFiles = config.keepTempFiles ?? false;
2029
+ function normalizeInputFiles(inputFiles) {
2030
+ if (!inputFiles || inputFiles.length === 0) {
2031
+ return void 0;
1809
2032
  }
1810
- async invoke(request) {
1811
- if (request.signal?.aborted) {
1812
- throw new Error("CLI provider request was aborted before execution");
2033
+ const deduped = /* @__PURE__ */ new Map();
2034
+ for (const inputFile of inputFiles) {
2035
+ const absolutePath = path7.resolve(inputFile);
2036
+ if (!deduped.has(absolutePath)) {
2037
+ deduped.set(absolutePath, absolutePath);
1813
2038
  }
1814
- await this.ensureHealthy(request.signal);
1815
- const outputFilePath = generateOutputFilePath(request.evalCaseId);
1816
- const templateValues = buildTemplateValues(request, this.config, outputFilePath);
1817
- const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
1818
- if (this.verbose) {
1819
- console.log(
1820
- `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
1821
- );
2039
+ }
2040
+ return Array.from(deduped.values());
2041
+ }
2042
+ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
2043
+ if (!inputFiles || inputFiles.length === 0) {
2044
+ return [];
2045
+ }
2046
+ const unique = /* @__PURE__ */ new Map();
2047
+ for (const inputFile of inputFiles) {
2048
+ const absolutePath = path7.resolve(inputFile);
2049
+ if (overrides?.has(absolutePath)) {
2050
+ if (!unique.has(absolutePath)) {
2051
+ unique.set(absolutePath, absolutePath);
2052
+ }
2053
+ continue;
1822
2054
  }
1823
- const startTime = Date.now();
1824
- const result = await this.runCommand(renderedCommand, {
1825
- cwd: this.config.cwd,
1826
- env: process.env,
1827
- timeoutMs: this.config.timeoutMs,
1828
- signal: request.signal
1829
- });
1830
- const measuredDurationMs = Date.now() - startTime;
1831
- if (result.failed || (result.exitCode ?? 0) !== 0) {
1832
- if (request.signal?.aborted) {
1833
- throw new Error("CLI provider request was aborted");
2055
+ const normalized = absolutePath.split(path7.sep).join("/");
2056
+ if (isGuidelineFile(normalized, guidelinePatterns)) {
2057
+ if (!unique.has(absolutePath)) {
2058
+ unique.set(absolutePath, absolutePath);
2059
+ }
2060
+ }
2061
+ }
2062
+ return Array.from(unique.values());
2063
+ }
2064
/**
 * Resolves every input file to an absolute path and removes duplicates,
 * keeping the first occurrence order.
 *
 * @param {string[] | undefined} inputFiles - Paths to normalize.
 * @returns {string[]} Unique absolute paths; empty when nothing was supplied.
 */
function collectInputFiles(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return [];
  }
  // A Set keeps insertion order, so the first occurrence of each path wins.
  const resolved = Array.from(inputFiles, (inputFile) => path7.resolve(inputFile));
  return [...new Set(resolved)];
}
2077
/**
 * Renders the "read these files first" instructions as Markdown.
 *
 * @param {string[]} guidelineFiles - Absolute paths of guideline files.
 * @param {string[]} inputFiles - Absolute paths of the remaining input files.
 * @returns {string} The instruction block, or "" when both lists are empty.
 */
function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
  if (guidelineFiles.length === 0 && inputFiles.length === 0) {
    return "";
  }
  // One Markdown bullet per file: "* [basename](file-uri)".
  const toBulletList = (files) => files.map((absolutePath) => {
    const label = path7.basename(absolutePath);
    const target = pathToFileUri(absolutePath);
    return `* [${label}](${target})`;
  }).join("\n");
  const lines = [];
  if (guidelineFiles.length > 0) {
    lines.push(`Read all guideline files:\n${toBulletList(guidelineFiles)}.`);
  }
  if (inputFiles.length > 0) {
    lines.push(`Read all input files:\n${toBulletList(inputFiles)}.`);
  }
  lines.push("If any file is missing, fail with ERROR: missing-file <filename> and stop.");
  lines.push("Then apply system_instructions on the user query below.");
  return lines.join("\n");
}
2101
/**
 * Converts a filesystem path into a `file://` URI.
 *
 * Relative paths are resolved against the current working directory and
 * backslashes are normalized to forward slashes. The path is percent-encoded
 * so that characters such as spaces, `#`, `?` and `%` neither terminate the
 * URI early nor break the Markdown links the URI is embedded in.
 *
 * @param {string} filePath - Absolute or relative filesystem path.
 * @returns {string} A `file://` URI (`file:///C:/...` for Windows drive paths).
 */
function pathToFileUri(filePath) {
  const absolutePath = path7.isAbsolute(filePath) ? filePath : path7.resolve(filePath);
  const normalizedPath = absolutePath.replace(/\\/g, "/");
  let encodedPath;
  try {
    // encodeURI keeps "/" and ":" but leaves "?" and "#" alone, so those two
    // are escaped separately: inside a file path they are literal characters.
    encodedPath = encodeURI(normalizedPath).replace(/[?#]/g, (ch) => encodeURIComponent(ch));
  } catch {
    // encodeURI throws on malformed UTF-16 (lone surrogates); fall back to the raw path.
    encodedPath = normalizedPath;
  }
  // Windows drive paths need the extra slash that stands in for the empty host.
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
    return `file:///${encodedPath}`;
  }
  return `file://${encodedPath}`;
}
2109
+
2110
+ // src/evaluation/providers/claude-code.ts
2111
+ var WORKSPACE_PREFIX = "agentv-claude-code-";
2112
+ var PROMPT_FILENAME = "prompt.md";
2113
+ var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
2114
+ - Do NOT create any additional output files in the workspace.
2115
+ - All intended file outputs/changes MUST be written in your response.
2116
+ - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
2117
+ This is required for evaluation scoring.`;
2118
+ var ClaudeCodeProvider = class {
2119
+ id;
2120
+ kind = "claude-code";
2121
+ targetName;
2122
+ supportsBatch = false;
2123
+ config;
2124
+ runClaudeCode;
2125
+ constructor(targetName, config, runner = defaultClaudeCodeRunner) {
2126
+ this.id = `claude-code:${targetName}`;
2127
+ this.targetName = targetName;
2128
+ this.config = config;
2129
+ this.runClaudeCode = runner;
2130
+ }
2131
+ async invoke(request) {
2132
+ if (request.signal?.aborted) {
2133
+ throw new Error("Claude Code request was aborted before execution");
2134
+ }
2135
+ const inputFiles = normalizeInputFiles(request.inputFiles);
2136
+ const workspaceRoot = await this.createWorkspace();
2137
+ const logger = await this.createStreamLogger(request).catch(() => void 0);
2138
+ try {
2139
+ const promptFile = path8.join(workspaceRoot, PROMPT_FILENAME);
2140
+ await writeFile(promptFile, request.question, "utf8");
2141
+ const args = this.buildClaudeCodeArgs(request.question, inputFiles);
2142
+ const cwd = this.resolveCwd();
2143
+ const result = await this.executeClaudeCode(args, cwd, request.signal, logger);
2144
+ if (result.timedOut) {
2145
+ throw new Error(
2146
+ `Claude Code CLI timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
2147
+ );
2148
+ }
2149
+ if (result.exitCode !== 0) {
2150
+ const detail = pickDetail(result.stderr, result.stdout);
2151
+ const prefix = `Claude Code CLI exited with code ${result.exitCode}`;
2152
+ if (isNestedClaudeCodeAuthError(result.stdout)) {
2153
+ throw new Error(
2154
+ `${prefix}: Claude Code detected a nested session and requires API key authentication. Set ANTHROPIC_API_KEY environment variable or run AgentV outside of a Claude Code session.`
2155
+ );
2156
+ }
2157
+ throw new Error(detail ? `${prefix}: ${detail}` : prefix);
2158
+ }
2159
+ const parsed = parseClaudeCodeJsonl(result.stdout);
2160
+ const outputMessages = extractOutputMessages(parsed);
2161
+ const usage = extractUsage(parsed);
2162
+ return {
2163
+ raw: {
2164
+ response: parsed,
2165
+ stdout: result.stdout,
2166
+ stderr: result.stderr,
2167
+ exitCode: result.exitCode,
2168
+ args,
2169
+ executable: this.config.executable,
2170
+ promptFile,
2171
+ workspace: workspaceRoot,
2172
+ inputFiles,
2173
+ logFile: logger?.filePath
2174
+ },
2175
+ outputMessages,
2176
+ usage
2177
+ };
2178
+ } finally {
2179
+ await logger?.close();
2180
+ await this.cleanupWorkspace(workspaceRoot);
2181
+ }
2182
+ }
2183
+ resolveCwd() {
2184
+ if (!this.config.cwd) {
2185
+ return process.cwd();
2186
+ }
2187
+ return path8.resolve(this.config.cwd);
2188
+ }
2189
+ buildClaudeCodeArgs(prompt, inputFiles) {
2190
+ const args = [];
2191
+ args.push("--output-format", "stream-json");
2192
+ args.push("--verbose");
2193
+ args.push("-p");
2194
+ if (this.config.model) {
2195
+ args.push("--model", this.config.model);
2196
+ }
2197
+ if (this.config.args && this.config.args.length > 0) {
2198
+ args.push(...this.config.args);
2199
+ }
2200
+ const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
2201
+ const fullPrompt = `${systemPrompt}
2202
+
2203
+ ${prompt}`;
2204
+ let finalPrompt = fullPrompt;
2205
+ if (inputFiles && inputFiles.length > 0) {
2206
+ const filesContext = inputFiles.map((f) => `[File: ${f}]`).join("\n");
2207
+ finalPrompt = `${fullPrompt}
2208
+
2209
+ ## Input Files
2210
+ ${filesContext}`;
2211
+ }
2212
+ args.push(finalPrompt);
2213
+ return args;
2214
+ }
2215
+ buildEnv() {
2216
+ const env = { ...process.env };
2217
+ env.CLAUDECODE = void 0;
2218
+ env.CLAUDE_CODE_ENTRYPOINT = void 0;
2219
+ return env;
2220
+ }
2221
+ async executeClaudeCode(args, cwd, signal, logger) {
2222
+ try {
2223
+ return await this.runClaudeCode({
2224
+ executable: this.config.executable,
2225
+ args,
2226
+ cwd,
2227
+ timeoutMs: this.config.timeoutMs,
2228
+ env: this.buildEnv(),
2229
+ signal,
2230
+ onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
2231
+ onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
2232
+ });
2233
+ } catch (error) {
2234
+ const err = error;
2235
+ if (err.code === "ENOENT") {
2236
+ throw new Error(
2237
+ `Claude Code executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
2238
+ );
2239
+ }
2240
+ throw error;
2241
+ }
2242
+ }
2243
+ async createWorkspace() {
2244
+ return await mkdtemp(path8.join(tmpdir(), WORKSPACE_PREFIX));
2245
+ }
2246
+ async cleanupWorkspace(workspaceRoot) {
2247
+ try {
2248
+ await rm(workspaceRoot, { recursive: true, force: true });
2249
+ } catch {
2250
+ }
2251
+ }
2252
+ resolveLogDirectory() {
2253
+ const disabled = isClaudeCodeLogStreamingDisabled();
2254
+ if (disabled) {
2255
+ return void 0;
2256
+ }
2257
+ if (this.config.logDir) {
2258
+ return path8.resolve(this.config.logDir);
2259
+ }
2260
+ return path8.join(process.cwd(), ".agentv", "logs", "claude-code");
2261
+ }
2262
+ async createStreamLogger(request) {
2263
+ const logDir = this.resolveLogDirectory();
2264
+ if (!logDir) {
2265
+ return void 0;
2266
+ }
2267
+ try {
2268
+ await mkdir(logDir, { recursive: true });
2269
+ } catch (error) {
2270
+ const message = error instanceof Error ? error.message : String(error);
2271
+ console.warn(`Skipping Claude Code stream logging (could not create ${logDir}): ${message}`);
2272
+ return void 0;
2273
+ }
2274
+ const filePath = path8.join(logDir, buildLogFilename(request, this.targetName));
2275
+ try {
2276
+ const logger = await ClaudeCodeStreamLogger.create({
2277
+ filePath,
2278
+ targetName: this.targetName,
2279
+ evalCaseId: request.evalCaseId,
2280
+ attempt: request.attempt,
2281
+ format: this.config.logFormat ?? "summary"
2282
+ });
2283
+ recordClaudeCodeLogEntry({
2284
+ filePath,
2285
+ targetName: this.targetName,
2286
+ evalCaseId: request.evalCaseId,
2287
+ attempt: request.attempt
2288
+ });
2289
+ return logger;
2290
+ } catch (error) {
2291
+ const message = error instanceof Error ? error.message : String(error);
2292
+ console.warn(`Skipping Claude Code stream logging for ${filePath}: ${message}`);
2293
+ return void 0;
2294
+ }
2295
+ }
2296
+ };
2297
/**
 * Appends a human-readable, timestamped transcript of a Claude Code CLI run
 * to a log file. Output chunks are buffered per source and written one
 * complete line at a time.
 *
 * Logging is best-effort: a failing write stream is reported once with
 * console.warn and further output is dropped instead of crashing the process.
 */
var ClaudeCodeStreamLogger = class _ClaudeCodeStreamLogger {
  filePath;
  stream;
  startedAt = Date.now();
  stdoutBuffer = "";
  stderrBuffer = "";
  format;
  // Set once the underlying stream has reported an error; later writes are skipped.
  streamFailed = false;
  /**
   * @param {string} filePath - Log file to append to.
   * @param {string} format - "json" for pretty-printed events, anything else for one-line summaries.
   */
  constructor(filePath, format) {
    this.filePath = filePath;
    this.format = format;
    this.stream = createWriteStream(filePath, { flags: "a" });
    // Without a persistent listener, an open/write failure would surface as an
    // unhandled 'error' event and terminate the process.
    this.stream.on("error", (error) => {
      if (this.streamFailed) {
        return;
      }
      this.streamFailed = true;
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Claude Code stream logging failed for ${filePath}: ${message}`);
    });
  }
  /**
   * Creates a logger and writes the header block.
   *
   * @param {object} options - `filePath`, `targetName`, `format`, and optional `evalCaseId` / `attempt`.
   * @returns {Promise<_ClaudeCodeStreamLogger>} The ready-to-use logger.
   */
  static async create(options) {
    const logger = new _ClaudeCodeStreamLogger(options.filePath, options.format);
    const header = [
      "# Claude Code CLI stream log",
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
      ""
      // Drop only the omitted optional lines; the trailing "" is the blank
      // separator between header and transcript and must survive.
    ].filter((line) => line !== void 0);
    logger.writeLines(header);
    return logger;
  }
  /** Buffers a stdout chunk and writes every complete line it finishes. */
  handleStdoutChunk(chunk) {
    this.stdoutBuffer += chunk;
    this.flushBuffer("stdout");
  }
  /** Buffers a stderr chunk and writes every complete line it finishes. */
  handleStderrChunk(chunk) {
    this.stderrBuffer += chunk;
    this.flushBuffer("stderr");
  }
  /**
   * Writes any buffered output (including unterminated last lines) and ends
   * the stream. Resolves once the stream has finished or failed; it does not
   * reject, because logging is best-effort.
   */
  async close() {
    this.flushBuffer("stdout");
    this.flushBuffer("stderr");
    this.flushRemainder();
    if (this.streamFailed || this.stream.destroyed) {
      return;
    }
    await new Promise((resolve) => {
      // Whichever of these fires first settles the promise; extra calls are no-ops.
      this.stream.once("error", () => resolve());
      this.stream.once("close", () => resolve());
      this.stream.end(() => resolve());
    });
  }
  /** Writes each given line followed by a newline. */
  writeLines(lines) {
    for (const line of lines) {
      this.writeToStream(`${line}\n`);
    }
  }
  /** Writes raw text unless the stream has already failed or been destroyed. */
  writeToStream(text) {
    if (this.streamFailed || this.stream.destroyed) {
      return;
    }
    this.stream.write(text);
  }
  /** Writes all complete lines of the given source's buffer and keeps the unterminated tail buffered. */
  flushBuffer(source) {
    const buffer = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
    const lines = buffer.split(/\r?\n/);
    // The last element is the text after the final newline (possibly ""): not a full line yet.
    const remainder = lines.pop() ?? "";
    if (source === "stdout") {
      this.stdoutBuffer = remainder;
    } else {
      this.stderrBuffer = remainder;
    }
    for (const line of lines) {
      const formatted = this.formatLine(line, source);
      if (formatted) {
        this.writeToStream(formatted);
        this.writeToStream("\n");
      }
    }
  }
  /** Formats one raw line as "[+elapsed] [source] message"; returns `undefined` for blank lines. */
  formatLine(rawLine, source) {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return void 0;
    }
    const message = this.format === "json" ? formatClaudeCodeJsonLog(trimmed) : formatClaudeCodeLogMessage(trimmed, source);
    return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`;
  }
  /** Writes whatever unterminated text is left in both buffers (stdout first) and clears them. */
  flushRemainder() {
    for (const source of ["stdout", "stderr"]) {
      const pending = (source === "stdout" ? this.stdoutBuffer : this.stderrBuffer).trim();
      if (pending.length === 0) {
        continue;
      }
      const formatted = this.formatLine(pending, source);
      if (formatted) {
        this.writeToStream(formatted);
        this.writeToStream("\n");
      }
    }
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
  }
};
2391
/**
 * Reads AGENTV_CLAUDE_CODE_STREAM_LOGS and reports whether it switches
 * stream logging off. Only "false", "0" and "off" (case-insensitive,
 * surrounding whitespace ignored) disable logging; unset or empty keeps it on.
 *
 * @returns {boolean} `true` when stream logging is disabled.
 */
function isClaudeCodeLogStreamingDisabled() {
  const raw = process.env.AGENTV_CLAUDE_CODE_STREAM_LOGS;
  if (!raw) {
    return false;
  }
  return ["false", "0", "off"].includes(raw.trim().toLowerCase());
}
2399
/**
 * Builds a unique log file name of the form
 * `<timestamp>_<target>_<evalId>[_attempt-N]_<8 hex chars>.log`.
 *
 * @param {object} request - Request providing optional `evalCaseId` and zero-based `attempt`.
 * @param {string} targetName - Target name embedded in the file name.
 * @returns {string} The file name (no directory part).
 */
function buildLogFilename(request, targetName) {
  // ":" and "." in the ISO timestamp are swapped for "-" in the file name.
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
  const safeEvalId = sanitizeForFilename(request.evalCaseId ?? "claude-code");
  let attemptPart = "";
  if (request.attempt !== void 0) {
    // Attempts are zero-based internally and shown one-based.
    attemptPart = `_attempt-${request.attempt + 1}`;
  }
  const safeTarget = sanitizeForFilename(targetName);
  const uniqueTag = randomUUID().slice(0, 8);
  return `${stamp}_${safeTarget}_${safeEvalId}${attemptPart}_${uniqueTag}.log`;
}
2406
/**
 * Makes a string safe for use in a file name by collapsing every run of
 * characters outside `A-Z a-z 0-9 . _ -` into a single underscore.
 *
 * @param {string} value - Text to sanitize.
 * @returns {string} The sanitized text, or "claude-code" when the result would be empty.
 */
function sanitizeForFilename(value) {
  // An empty result is falsy, so the fallback name applies only in that case.
  return value.replace(/[^A-Za-z0-9._-]+/g, "_") || "claude-code";
}
2410
/**
 * Formats the time elapsed since `startedAt` as `MM:SS`, or `HH:MM:SS` once
 * at least one hour has passed.
 *
 * @param {number} startedAt - Start time in epoch milliseconds.
 * @returns {string} Zero-padded elapsed time.
 */
function formatElapsed(startedAt) {
  const twoDigits = (amount) => amount.toString().padStart(2, "0");
  const totalSeconds = Math.floor((Date.now() - startedAt) / 1e3);
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor(totalSeconds % 3600 / 60);
  const seconds = totalSeconds % 60;
  const clock = `${twoDigits(minutes)}:${twoDigits(seconds)}`;
  return hours > 0 ? `${twoDigits(hours)}:${clock}` : clock;
}
2420
/**
 * Produces the summary-format log text for one output line: the event
 * summary when the line is a recognizable JSON event, otherwise the raw line
 * (prefixed with "stderr: " for stderr output).
 *
 * @param {string} rawLine - One trimmed line of CLI output.
 * @param {string} source - "stdout" or "stderr".
 * @returns {string} Text to log for the line.
 */
function formatClaudeCodeLogMessage(rawLine, source) {
  const event = tryParseJsonValue(rawLine);
  const summary = event ? summarizeClaudeCodeEvent(event) : void 0;
  if (summary) {
    return summary;
  }
  return source === "stderr" ? `stderr: ${rawLine}` : rawLine;
}
2433
/**
 * Produces the json-format log text for one output line: the pretty-printed
 * JSON (2-space indent) when the line parses to a truthy value, otherwise the
 * raw line unchanged.
 *
 * @param {string} rawLine - One trimmed line of CLI output.
 * @returns {string} Text to log for the line.
 */
function formatClaudeCodeJsonLog(rawLine) {
  const value = tryParseJsonValue(rawLine);
  if (value) {
    try {
      return JSON.stringify(value, null, 2);
    } catch {
      // Fall through to the raw line when the value cannot be serialized.
    }
  }
  return rawLine;
}
2444
/**
 * Produces a short one-line summary of a parsed Claude Code stream event.
 *
 * @param {unknown} event - Parsed JSON value of one output line.
 * @returns {string | undefined} The summary, or `undefined` when `event` is not
 *   an object or has no string `type`.
 */
function summarizeClaudeCodeEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const record = event;
  const type = typeof record.type === "string" ? record.type : void 0;
  if (!type) {
    return void 0;
  }
  switch (type) {
    case "system":
      return "system: init";
    case "assistant": {
      // Only the first content part is summarized.
      const message = record.message;
      if (message) {
        const content = message.content;
        if (Array.isArray(content) && content.length > 0) {
          const first = content[0];
          if (first?.type === "tool_use") {
            return `assistant: tool_use (${first.name})`;
          }
          if (first?.type === "text") {
            const text = first.text;
            if (typeof text === "string") {
              const preview = text.length > 50 ? `${text.slice(0, 50)}...` : text;
              return `assistant: ${preview}`;
            }
          }
        }
      }
      return "assistant";
    }
    case "user": {
      const message = record.message;
      if (message) {
        const content = message.content;
        if (Array.isArray(content) && content.length > 0) {
          const first = content[0];
          if (first?.type === "tool_result") {
            return `user: tool_result (${first.tool_use_id})`;
          }
        }
      }
      return "user";
    }
    case "result": {
      // Result events report the cost as `total_cost_usd`; `cost_usd` is kept
      // as a fallback for output that still uses the older field name.
      const cost = typeof record.total_cost_usd === "number" ? record.total_cost_usd : record.cost_usd;
      const duration = record.duration_ms;
      if (typeof cost === "number" && typeof duration === "number") {
        return `result: $${cost.toFixed(4)}, ${Math.round(duration)}ms`;
      }
      return "result";
    }
    default:
      return type;
  }
}
2501
/**
 * Parse a line as JSON without throwing.
 *
 * @param {string} rawLine - Text to parse.
 * @returns {unknown} The parsed value, or undefined when parsing fails.
 */
function tryParseJsonValue(rawLine) {
  let value = void 0;
  try {
    value = JSON.parse(rawLine);
  } catch {
    value = void 0;
  }
  return value;
}
2508
/**
 * Parse JSONL output from the Claude Code CLI into an array of events.
 *
 * Blank lines and lines that are not valid JSON are skipped.
 *
 * @param {string} output - Captured stdout of the CLI.
 * @returns {unknown[]} One parsed value per valid JSON line, in order.
 * @throws {Error} When the output is empty or contains no valid JSON line.
 */
function parseClaudeCodeJsonl(output) {
  const body = output.trim();
  if (body.length === 0) {
    throw new Error("Claude Code CLI produced no output");
  }
  const events = [];
  for (const rawLine of body.split(/\r?\n/)) {
    const candidate = rawLine.trim();
    if (candidate.length === 0) {
      continue;
    }
    try {
      events.push(JSON.parse(candidate));
    } catch {
      // Non-JSON noise in the stream is ignored on purpose.
    }
  }
  if (events.length === 0) {
    throw new Error("Claude Code CLI produced no valid JSON output");
  }
  return events;
}
2526
/**
 * Collect conversation messages from parsed Claude Code events.
 *
 * Only `assistant` and `user` events that carry a truthy `message` are
 * converted (via `convertClaudeCodeMessage`); everything else is ignored.
 *
 * @param {unknown[]} events - Parsed events, in stream order.
 * @returns {object[]} Converted messages in the same order.
 */
function extractOutputMessages(events) {
  const collected = [];
  for (const event of events) {
    if (!event || typeof event !== "object") {
      continue;
    }
    const kind = event.type;
    if (kind !== "assistant" && kind !== "user") {
      continue;
    }
    const payload = event.message;
    if (!payload) {
      continue;
    }
    const converted = convertClaudeCodeMessage(payload, kind);
    if (converted) {
      collected.push(converted);
    }
  }
  return collected;
}
2546
/**
 * Convert one Claude Code message into the provider's output-message shape.
 *
 * @param {{ content?: unknown }} message - The event's `message` payload.
 * @param {string} type - Event type; anything other than "assistant" maps to
 *   the "user" role.
 * @returns {{ role: string, content: string | undefined, toolCalls: object[] | undefined }}
 *   `toolCalls` is undefined when the message contains none.
 */
function convertClaudeCodeMessage(message, type) {
  const text = extractTextContent(message.content);
  const calls = extractToolCalls(message.content);
  return {
    role: type === "assistant" ? "assistant" : "user",
    content: text,
    toolCalls: calls.length > 0 ? calls : void 0
  };
}
2556
/**
 * Extract plain text from a message's `content`.
 *
 * @param {unknown} content - Either a string or an array of content parts.
 * @returns {string | undefined} The string itself, or the `text` of every
 *   `{ type: "text" }` part joined with newlines; undefined when `content` is
 *   neither a string nor an array, or holds no text parts.
 */
function extractTextContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const pieces = content
    .filter((part) => part && typeof part === "object" && part.type === "text" && typeof part.text === "string")
    .map((part) => part.text);
  return pieces.length === 0 ? void 0 : pieces.join("\n");
}
2575
/**
 * Extract tool activity from a message's `content` parts.
 *
 * `tool_use` parts with a string `name` become `{ tool, input, id }`.
 * `tool_result` parts with a string `tool_use_id` become
 * `{ tool: "tool_result", output, id }`. All other parts are ignored.
 *
 * @param {unknown} content - Message content; non-arrays yield an empty list.
 * @returns {object[]} Tool call records in content order.
 */
function extractToolCalls(content) {
  if (!Array.isArray(content)) {
    return [];
  }
  const calls = [];
  content.forEach((part) => {
    if (!part || typeof part !== "object") {
      return;
    }
    switch (part.type) {
      case "tool_use":
        if (typeof part.name === "string") {
          calls.push({
            tool: part.name,
            input: part.input,
            id: typeof part.id === "string" ? part.id : void 0
          });
        }
        break;
      case "tool_result":
        if (typeof part.tool_use_id === "string") {
          calls.push({
            tool: "tool_result",
            output: part.content,
            id: part.tool_use_id
          });
        }
        break;
      default:
        break;
    }
  });
  return calls;
}
2602
/**
 * Extract cost, timing and token usage from the last `result` event.
 *
 * Events are scanned from the end; only the last `result` event is examined,
 * even when it carries no usable fields.
 *
 * Both result layouts are understood. Top-level `cost_usd`, `input_tokens`
 * and `output_tokens` keep precedence for backward compatibility. When they
 * are absent, `total_cost_usd` and the nested `usage.input_tokens` /
 * `usage.output_tokens` reported by current Claude Code releases are used.
 *
 * @param {unknown[]} events - Parsed events, in stream order.
 * @returns {object | undefined} An object holding any of `cost_usd`,
 *   `duration_ms`, `duration_api_ms`, `input_tokens`, `output_tokens`,
 *   `session_id`; undefined when there is no result event or it has none of
 *   these fields.
 */
function extractUsage(events) {
  // First candidate that is actually a number, else undefined.
  const firstNumber = (...candidates) => candidates.find((value) => typeof value === "number");
  for (let i = events.length - 1; i >= 0; i--) {
    const event = events[i];
    if (!event || typeof event !== "object" || event.type !== "result") {
      continue;
    }
    const record = event;
    const nested = record.usage && typeof record.usage === "object" ? record.usage : {};
    const usage = {};
    const cost = firstNumber(record.cost_usd, record.total_cost_usd);
    if (cost !== void 0) {
      usage.cost_usd = cost;
    }
    if (typeof record.duration_ms === "number") {
      usage.duration_ms = record.duration_ms;
    }
    if (typeof record.duration_api_ms === "number") {
      usage.duration_api_ms = record.duration_api_ms;
    }
    const inputTokens = firstNumber(record.input_tokens, nested.input_tokens);
    if (inputTokens !== void 0) {
      usage.input_tokens = inputTokens;
    }
    const outputTokens = firstNumber(record.output_tokens, nested.output_tokens);
    if (outputTokens !== void 0) {
      usage.output_tokens = outputTokens;
    }
    if (typeof record.session_id === "string") {
      usage.session_id = record.session_id;
    }
    return Object.keys(usage).length > 0 ? usage : void 0;
  }
  return void 0;
}
2635
/**
 * Choose the most useful diagnostic text from a finished process.
 *
 * @param {string} stderr - Captured stderr.
 * @param {string} stdout - Captured stdout; only consulted when stderr is
 *   blank.
 * @returns {string | undefined} Trimmed stderr if non-empty, else trimmed
 *   stdout if non-empty, else undefined.
 */
function pickDetail(stderr, stdout) {
  return stderr.trim() || stdout.trim() || void 0;
}
2643
/**
 * Format a timeout as a suffix for error messages.
 *
 * @param {number | undefined} timeoutMs - Timeout in milliseconds.
 * @returns {string} " after Ns" with the seconds rounded up, or an empty
 *   string when the timeout is missing, zero or negative.
 */
function formatTimeoutSuffix(timeoutMs) {
  if (timeoutMs && !(timeoutMs <= 0)) {
    return ` after ${Math.ceil(timeoutMs / 1e3)}s`;
  }
  return "";
}
2650
/**
 * Detect the stdout pattern this module treats as a nested Claude Code
 * authentication failure.
 *
 * The output must contain both a `system` event whose `apiKeySource` is
 * "ANTHROPIC_API_KEY" and a failure event: `error === "authentication_failed"`
 * or a `result` event with a truthy `is_error`. Lines that are blank, not
 * JSON, or not objects are ignored.
 *
 * @param {string} stdout - Captured JSONL stdout.
 * @returns {boolean} True when both markers are present; false otherwise,
 *   including when `stdout` cannot be processed at all.
 */
function isNestedClaudeCodeAuthError(stdout) {
  try {
    let sawEnvApiKey = false;
    let sawFailure = false;
    for (const rawLine of stdout.split("\n")) {
      const text = rawLine.trim();
      if (!text) continue;
      try {
        const event = JSON.parse(text);
        const isEnvKeyInit = event.type === "system" && event.apiKeySource === "ANTHROPIC_API_KEY";
        const isFailure = event.error === "authentication_failed" || event.type === "result" && event.is_error;
        if (isEnvKeyInit) {
          sawEnvApiKey = true;
        }
        if (isFailure) {
          sawFailure = true;
        }
      } catch {
        // Unparseable or non-object line: skip it and keep scanning.
      }
    }
    return sawEnvApiKey && sawFailure;
  } catch {
    return false;
  }
}
2674
/**
 * Quote a string as a single-quoted POSIX shell word.
 *
 * Each embedded single quote is replaced by the sequence '\'' (close the
 * quote, emit an escaped quote, reopen the quote).
 *
 * @param {string} arg - Raw argument text.
 * @returns {string} The argument wrapped in single quotes.
 */
function escapeShellArg(arg) {
  const escaped = arg.split("'").join("'\\''");
  return "'" + escaped + "'";
}
2677
/**
 * Default runner for the Claude Code CLI.
 *
 * Allocates four uniquely named scratch paths in the OS temp directory
 * (stdout, stderr, exit code, pid), delegates to
 * `runClaudeCodeWithTempFiles`, and removes the files afterwards whether the
 * run succeeded or threw. Removal failures are ignored.
 *
 * @param {object} options - Run options, passed through unchanged.
 * @returns {Promise<object>} Whatever `runClaudeCodeWithTempFiles` resolves to.
 */
async function defaultClaudeCodeRunner(options) {
  const runId = randomUUID();
  const scratchPath = (suffix) => path8.join(tmpdir(), `agentv-cc-${runId}-${suffix}`);
  const stdoutFile = scratchPath("stdout");
  const stderrFile = scratchPath("stderr");
  const exitFile = scratchPath("exit");
  const pidFile = scratchPath("pid");
  try {
    return await runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile);
  } finally {
    for (const file of [stdoutFile, stderrFile, exitFile, pidFile]) {
      // Best-effort cleanup: a leftover temp file must not mask the run result.
      await rm(file, { force: true }).catch(() => {
      });
    }
  }
}
2694
/**
 * Run the Claude Code CLI detached from this process, exchanging data through
 * temp files instead of pipes.
 *
 * The command is wrapped in a bash script started via `setsid bash -c`. The
 * script redirects the CLI's stdout/stderr into `stdoutFile`/`stderrFile`,
 * writes the CLI's pid to `pidFile`, waits for it, then writes its exit status
 * to `exitFile`. This function polls every 100ms until the exit file appears,
 * the timeout elapses, or the abort signal fires.
 *
 * @param {object} options
 * @param {string} options.executable - Command to run; split on whitespace, so
 *   extra leading arguments may be embedded in it.
 * @param {string[]} options.args - Arguments appended after the executable.
 * @param {string} [options.cwd] - Working directory for the wrapper.
 * @param {object} [options.env] - Environment for the wrapper.
 * @param {number} [options.timeoutMs] - Kill the CLI after this many ms.
 * @param {AbortSignal} [options.signal] - Aborting sends SIGTERM to the CLI.
 * @param {(chunk: string) => void} [options.onStdoutChunk] - Receives newly
 *   appended stdout text while polling and once more at the end.
 * @param {(chunk: string) => void} [options.onStderrChunk] - Receives the full
 *   stderr text once, after the run.
 * @param {string} stdoutFile - Path the CLI's stdout is redirected to.
 * @param {string} stderrFile - Path the CLI's stderr is redirected to.
 * @param {string} exitFile - Path the wrapper writes the exit status to.
 * @param {string} pidFile - Path the wrapper writes the CLI's pid to.
 * @returns {Promise<{ stdout: string, stderr: string, exitCode: number, timedOut: boolean }>}
 *   `exitCode` is -1 when no exit status was recorded (timeout, abort, or the
 *   wrapper could not be started).
 */
async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile) {
  const parts = options.executable.split(/\s+/);
  const executable = parts[0];
  const executableArgs = parts.slice(1);
  const allArgs = [...executableArgs, ...options.args];
  const escapedArgs = allArgs.map((arg) => escapeShellArg(arg));
  const fullCommand = [escapeShellArg(executable), ...escapedArgs].join(" ");
  // NOTE(review): unsetting CLAUDECODE / CLAUDE_CODE_ENTRYPOINT presumably keeps
  // the CLI from detecting that it runs inside another Claude Code session.
  const bashScript = `
    unset CLAUDECODE CLAUDE_CODE_ENTRYPOINT 2>/dev/null
    ${fullCommand} >${escapeShellArg(stdoutFile)} 2>${escapeShellArg(stderrFile)} &
    CHILD_PID=$!
    echo $CHILD_PID > ${escapeShellArg(pidFile)}
    wait $CHILD_PID
    echo $? > ${escapeShellArg(exitFile)}
  `;
  const child = spawn("setsid", ["bash", "-c", bashScript], {
    cwd: options.cwd,
    env: options.env,
    detached: true,
    stdio: "ignore"
  });
  // Without a listener, a failed spawn (e.g. `setsid` not installed) raises an
  // unhandled 'error' event. The exit file would also never be written, so the
  // poll loop below could only end through a timeout or an abort.
  let spawnError;
  child.once("error", (error) => {
    spawnError = error;
  });
  child.unref();
  // Resolve the fs helpers once instead of on every poll iteration.
  const { readFile: readFile7, access: access4 } = await import("node:fs/promises");
  const pollInterval = 100;
  const startTime = Date.now();
  let timedOut = false;
  let lastStdoutSize = 0;
  const readFileIfExists = async (filePath) => {
    try {
      return await readFile7(filePath, "utf8");
    } catch {
      return "";
    }
  };
  const fileExists4 = async (filePath) => {
    try {
      await access4(filePath);
      return true;
    } catch {
      return false;
    }
  };
  // Sends SIGTERM to the pid recorded by the wrapper; a missing pid file or an
  // already-dead process is ignored.
  const killProcess = async () => {
    try {
      const pid = await readFileIfExists(pidFile);
      if (pid.trim()) {
        process.kill(Number.parseInt(pid.trim(), 10), "SIGTERM");
      }
    } catch {
    }
  };
  if (options.signal?.aborted) {
    await killProcess();
    return { stdout: "", stderr: "Aborted", exitCode: -1, timedOut: false };
  }
  const abortHandler = () => {
    killProcess().catch(() => {
    });
  };
  options.signal?.addEventListener("abort", abortHandler, { once: true });
  try {
    while (true) {
      if (spawnError) {
        const reason = spawnError instanceof Error ? spawnError.message : String(spawnError);
        return {
          stdout: "",
          stderr: `Failed to start Claude Code process: ${reason}`,
          exitCode: -1,
          timedOut: false
        };
      }
      if (options.timeoutMs && Date.now() - startTime > options.timeoutMs) {
        timedOut = true;
        await killProcess();
        break;
      }
      if (options.signal?.aborted) {
        await killProcess();
        break;
      }
      if (options.onStdoutChunk) {
        // Stream only the text appended since the previous poll.
        const currentStdout = await readFileIfExists(stdoutFile);
        if (currentStdout.length > lastStdoutSize) {
          options.onStdoutChunk(currentStdout.slice(lastStdoutSize));
          lastStdoutSize = currentStdout.length;
        }
      }
      if (await fileExists4(exitFile)) {
        break;
      }
      await new Promise((resolve) => setTimeout(resolve, pollInterval));
    }
    const stdout = await readFileIfExists(stdoutFile);
    const stderr = await readFileIfExists(stderrFile);
    const exitCodeStr = await readFileIfExists(exitFile);
    const exitCode = exitCodeStr.trim() ? Number.parseInt(exitCodeStr.trim(), 10) : -1;
    // Flush whatever was written between the last poll and process exit.
    if (options.onStdoutChunk && stdout.length > lastStdoutSize) {
      options.onStdoutChunk(stdout.slice(lastStdoutSize));
    }
    if (options.onStderrChunk && stderr) {
      options.onStderrChunk(stderr);
    }
    return { stdout, stderr, exitCode, timedOut };
  } finally {
    options.signal?.removeEventListener("abort", abortHandler);
  }
}
2793
+
2794
+ // src/evaluation/providers/cli.ts
2795
+ import { exec as execWithCallback } from "node:child_process";
2796
+ import fs from "node:fs/promises";
2797
+ import os from "node:os";
2798
+ import path9 from "node:path";
2799
+ import { promisify } from "node:util";
2800
+ import { z } from "zod";
2801
// One tool invocation inside an output message. Only `tool` is required.
var ToolCallSchema = z.object({
  tool: z.string(),
  input: z.unknown().optional(),
  output: z.unknown().optional(),
  id: z.string().optional(),
  timestamp: z.string().optional()
});

// One message as emitted by the CLI (snake_case keys). Only `role` is required.
var OutputMessageInputSchema = z.object({
  role: z.string(),
  name: z.string().optional(),
  content: z.unknown().optional(),
  tool_calls: z.array(ToolCallSchema).optional(),
  timestamp: z.string().optional(),
  metadata: z.record(z.unknown()).optional()
});

// Token counts reported by the CLI; `cached` is optional.
var TokenUsageSchema = z.object({
  input: z.number(),
  output: z.number(),
  cached: z.number().optional()
});

// Shape of a single-run output document. Every field is optional.
var CliOutputSchema = z.object({
  text: z.unknown().optional(),
  output_messages: z.array(OutputMessageInputSchema).optional(),
  token_usage: TokenUsageSchema.optional(),
  cost_usd: z.number().optional(),
  duration_ms: z.number().optional()
});

// One line of batch (JSONL) output: the single-run shape plus a non-empty `id`.
var CliJsonlRecordSchema = CliOutputSchema.extend({
  id: z.string().min(1)
});
2831
/**
 * Drop negative cost/duration values reported by a CLI target.
 *
 * Each negative value is replaced by undefined and a warning naming `context`
 * is written with `console.warn`. Undefined and non-negative values pass
 * through untouched.
 *
 * @param {number | undefined} costUsd - Reported cost.
 * @param {number | undefined} durationMs - Reported duration.
 * @param {string} context - Label included in warning messages.
 * @returns {{ costUsd: number | undefined, durationMs: number | undefined }}
 */
function validateMetrics(costUsd, durationMs, context) {
  const metrics = { costUsd, durationMs };
  if (costUsd !== void 0 && costUsd < 0) {
    console.warn(`[cli-provider] ${context}: ignoring negative cost_usd value (${costUsd})`);
    metrics.costUsd = void 0;
  }
  if (durationMs !== void 0 && durationMs < 0) {
    console.warn(`[cli-provider] ${context}: ignoring negative duration_ms value (${durationMs})`);
    metrics.durationMs = void 0;
  }
  return metrics;
}
2844
/**
 * Convert CLI output messages (snake_case) to the internal camelCase shape.
 *
 * The only key that changes is `tool_calls`, which becomes `toolCalls`.
 *
 * @param {object[] | undefined} messages - Messages as emitted by the CLI.
 * @returns {object[] | undefined} Converted messages, or undefined when the
 *   input is missing or empty.
 */
function convertOutputMessages(messages) {
  if (!messages || messages.length === 0) {
    return void 0;
  }
  return messages.map(({ role, name, content, tool_calls, timestamp, metadata }) => ({
    role,
    name,
    content,
    toolCalls: tool_calls,
    timestamp,
    metadata
  }));
}
2857
// Promise-returning wrapper around child_process.exec.
var execAsync = promisify(execWithCallback);
// Upper bound, in bytes, for captured stdout/stderr (10 MiB).
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;

/**
 * Run a shell command and report the outcome without throwing.
 *
 * On Windows the command runs under powershell.exe; elsewhere exec's default
 * shell is used.
 *
 * @param {string} command - Full command line.
 * @param {object} options
 * @param {string} [options.cwd] - Working directory.
 * @param {object} [options.env] - Environment variables.
 * @param {number} [options.timeoutMs] - Passed to exec as `timeout`.
 * @param {AbortSignal} [options.signal] - Passed to exec as `signal`.
 * @returns {Promise<{ stdout: string, stderr: string, exitCode: number | null,
 *   failed: boolean, timedOut: boolean, signal: string | null }>}
 *   On failure `exitCode` is the numeric exit code when exec reports one, and
 *   `timedOut` is true when the error is flagged `timedOut` or `killed`.
 */
async function defaultCommandRunner(command, options) {
  const { cwd, env, timeoutMs, signal } = options;
  const shell = process.platform === "win32" ? "powershell.exe" : void 0;
  try {
    const { stdout, stderr } = await execAsync(command, {
      cwd,
      env,
      timeout: timeoutMs,
      signal,
      maxBuffer: DEFAULT_MAX_BUFFER,
      shell
    });
    return { stdout, stderr, exitCode: 0, failed: false, timedOut: false, signal: null };
  } catch (error) {
    const execError = error;
    const hasNumericCode = typeof execError.code === "number";
    const wasKilled = execError.timedOut === true || execError.killed === true;
    return {
      stdout: execError.stdout ?? "",
      stderr: execError.stderr ?? "",
      exitCode: hasNumericCode ? execError.code : null,
      failed: true,
      timedOut: wasKilled,
      signal: execError.signal ?? null
    };
  }
}
2890
+ var CliProvider = class {
2891
+ id;
2892
+ kind = "cli";
2893
+ targetName;
2894
+ supportsBatch = true;
2895
+ config;
2896
+ runCommand;
2897
+ verbose;
2898
+ keepTempFiles;
2899
+ healthcheckPromise;
2900
+ constructor(targetName, config, runner = defaultCommandRunner) {
2901
+ this.targetName = targetName;
2902
+ this.id = `cli:${targetName}`;
2903
+ this.config = config;
2904
+ this.runCommand = runner;
2905
+ this.verbose = config.verbose ?? false;
2906
+ this.keepTempFiles = config.keepTempFiles ?? false;
2907
+ }
2908
+ async invoke(request) {
2909
+ if (request.signal?.aborted) {
2910
+ throw new Error("CLI provider request was aborted before execution");
2911
+ }
2912
+ await this.ensureHealthy(request.signal);
2913
+ const outputFilePath = generateOutputFilePath(request.evalCaseId);
2914
+ const templateValues = buildTemplateValues(request, this.config, outputFilePath);
2915
+ const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
2916
+ if (this.verbose) {
2917
+ console.log(
2918
+ `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
2919
+ );
2920
+ }
2921
+ const startTime = Date.now();
2922
+ const result = await this.runCommand(renderedCommand, {
2923
+ cwd: this.config.cwd,
2924
+ env: process.env,
2925
+ timeoutMs: this.config.timeoutMs,
2926
+ signal: request.signal
2927
+ });
2928
+ const measuredDurationMs = Date.now() - startTime;
2929
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
2930
+ if (request.signal?.aborted) {
2931
+ throw new Error("CLI provider request was aborted");
1834
2932
  }
1835
2933
  if (result.timedOut) {
1836
2934
  throw new Error(
1837
- `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
2935
+ `CLI provider timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
1838
2936
  );
1839
2937
  }
1840
2938
  const codeText = result.exitCode !== null ? result.exitCode : "unknown";
@@ -1910,7 +3008,7 @@ var CliProvider = class {
1910
3008
  }
1911
3009
  if (result.timedOut) {
1912
3010
  throw new Error(
1913
- `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
3011
+ `CLI provider timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
1914
3012
  );
1915
3013
  }
1916
3014
  const codeText = result.exitCode !== null ? result.exitCode : "unknown";
@@ -1920,11 +3018,6 @@ var CliProvider = class {
1920
3018
  }
1921
3019
  const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
1922
3020
  const recordsById = this.parseJsonlBatchOutput(responseContent);
1923
- const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
1924
- const missingIds = requestedIds.filter((id) => !recordsById.has(id));
1925
- if (missingIds.length > 0) {
1926
- throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
1927
- }
1928
3021
  const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
1929
3022
  const responses = requests.map((request) => {
1930
3023
  const evalCaseId = request.evalCaseId;
@@ -1943,15 +3036,20 @@ var CliProvider = class {
1943
3036
  }
1944
3037
  const parsed = recordsById.get(evalCaseId);
1945
3038
  if (!parsed) {
3039
+ const errorMessage = `Batch output missing id '${evalCaseId}'`;
3040
+ if (this.verbose) {
3041
+ console.warn(`[cli-provider:${this.targetName}] ${errorMessage}`);
3042
+ }
1946
3043
  return {
1947
- outputMessages: [],
3044
+ outputMessages: [{ role: "assistant", content: `Error: ${errorMessage}` }],
1948
3045
  durationMs: perRequestFallbackMs,
1949
3046
  raw: {
1950
3047
  command: renderedCommand,
1951
3048
  stderr: result.stderr,
1952
3049
  exitCode: result.exitCode ?? 0,
1953
3050
  cwd: this.config.cwd,
1954
- outputFile: outputFilePath
3051
+ outputFile: outputFilePath,
3052
+ error: errorMessage
1955
3053
  }
1956
3054
  };
1957
3055
  }
@@ -1984,101 +3082,37 @@ var CliProvider = class {
1984
3082
  * - duration_ms: number
1985
3083
  */
1986
3084
  parseOutputContent(content) {
3085
+ let parsed;
1987
3086
  try {
1988
- const parsed = JSON.parse(content);
1989
- if (typeof parsed === "object" && parsed !== null) {
1990
- const obj = parsed;
1991
- const tokenUsage = this.parseTokenUsage(obj.token_usage);
1992
- const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
1993
- const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
1994
- const outputMessages = this.parseOutputMessages(obj.output_messages);
1995
- if (outputMessages && outputMessages.length > 0) {
1996
- return { outputMessages, tokenUsage, costUsd, durationMs };
1997
- }
1998
- if ("text" in obj) {
1999
- const text = typeof obj.text === "string" ? obj.text : String(obj.text);
2000
- return {
2001
- outputMessages: [{ role: "assistant", content: text }],
2002
- tokenUsage,
2003
- costUsd,
2004
- durationMs
2005
- };
2006
- }
2007
- }
3087
+ parsed = JSON.parse(content);
2008
3088
  } catch {
3089
+ return { outputMessages: [{ role: "assistant", content }] };
2009
3090
  }
2010
- return { outputMessages: [{ role: "assistant", content }] };
2011
- }
2012
- /**
2013
- * Parse token_usage from CLI output.
2014
- */
2015
- parseTokenUsage(tokenUsage) {
2016
- if (typeof tokenUsage !== "object" || tokenUsage === null) {
2017
- return void 0;
2018
- }
2019
- const obj = tokenUsage;
2020
- if (typeof obj.input !== "number" || typeof obj.output !== "number") {
2021
- return void 0;
2022
- }
2023
- return {
2024
- input: obj.input,
2025
- output: obj.output,
2026
- cached: typeof obj.cached === "number" ? obj.cached : void 0
2027
- };
2028
- }
2029
- /**
2030
- * Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
2031
- */
2032
- parseOutputMessages(outputMessages) {
2033
- if (!Array.isArray(outputMessages)) {
2034
- return void 0;
3091
+ const result = CliOutputSchema.safeParse(parsed);
3092
+ if (!result.success) {
3093
+ return { outputMessages: [{ role: "assistant", content }] };
2035
3094
  }
2036
- const messages = [];
2037
- for (const msg of outputMessages) {
2038
- if (typeof msg !== "object" || msg === null) {
2039
- continue;
2040
- }
2041
- const rawMsg = msg;
2042
- if (typeof rawMsg.role !== "string") {
2043
- continue;
2044
- }
2045
- const message = {
2046
- role: rawMsg.role,
2047
- name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
2048
- content: rawMsg.content,
2049
- toolCalls: this.parseToolCalls(rawMsg.tool_calls),
2050
- timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
2051
- metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
3095
+ const obj = result.data;
3096
+ const metrics = validateMetrics(obj.cost_usd, obj.duration_ms, "parsing output");
3097
+ const outputMessages = convertOutputMessages(obj.output_messages);
3098
+ if (outputMessages && outputMessages.length > 0) {
3099
+ return {
3100
+ outputMessages,
3101
+ tokenUsage: obj.token_usage,
3102
+ costUsd: metrics.costUsd,
3103
+ durationMs: metrics.durationMs
2052
3104
  };
2053
- messages.push(message);
2054
- }
2055
- return messages.length > 0 ? messages : void 0;
2056
- }
2057
- /**
2058
- * Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
2059
- */
2060
- parseToolCalls(toolCalls) {
2061
- if (!Array.isArray(toolCalls)) {
2062
- return void 0;
2063
3105
  }
2064
- const calls = [];
2065
- for (const call of toolCalls) {
2066
- if (typeof call !== "object" || call === null) {
2067
- continue;
2068
- }
2069
- const rawCall = call;
2070
- if (typeof rawCall.tool !== "string") {
2071
- continue;
2072
- }
2073
- calls.push({
2074
- tool: rawCall.tool,
2075
- input: rawCall.input,
2076
- output: rawCall.output,
2077
- id: typeof rawCall.id === "string" ? rawCall.id : void 0,
2078
- timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
2079
- });
3106
+ if (obj.text !== void 0) {
3107
+ const text = typeof obj.text === "string" ? obj.text : String(obj.text);
3108
+ return {
3109
+ outputMessages: [{ role: "assistant", content: text }],
3110
+ tokenUsage: obj.token_usage,
3111
+ costUsd: metrics.costUsd,
3112
+ durationMs: metrics.durationMs
3113
+ };
2080
3114
  }
2081
- return calls.length > 0 ? calls : void 0;
3115
+ return { outputMessages: [{ role: "assistant", content }] };
2082
3116
  }
2083
3117
  parseJsonlBatchOutput(content) {
2084
3118
  const records = /* @__PURE__ */ new Map();
@@ -2091,33 +3125,32 @@ var CliProvider = class {
2091
3125
  const reason = error instanceof Error ? error.message : String(error);
2092
3126
  throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
2093
3127
  }
2094
- if (typeof parsed !== "object" || parsed === null) {
3128
+ const result = CliJsonlRecordSchema.safeParse(parsed);
3129
+ if (!result.success) {
3130
+ const firstError = result.error.errors[0];
3131
+ if (firstError?.path.includes("id")) {
3132
+ throw new Error("CLI batch output JSONL line missing required string field: id");
3133
+ }
2095
3134
  throw new Error("CLI batch output JSONL line must be an object");
2096
3135
  }
2097
- const obj = parsed;
2098
- const id = typeof obj.id === "string" ? obj.id : void 0;
2099
- if (!id || id.trim().length === 0) {
2100
- throw new Error("CLI batch output JSONL line missing required string field: id");
2101
- }
2102
- if (records.has(id)) {
2103
- throw new Error(`CLI batch output contains duplicate id: ${id}`);
2104
- }
2105
- const tokenUsage = this.parseTokenUsage(obj.token_usage);
2106
- const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
2107
- const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
2108
- const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
2109
- let outputMessages;
2110
- if (parsedOutputMessages && parsedOutputMessages.length > 0) {
2111
- outputMessages = parsedOutputMessages;
3136
+ const obj = result.data;
3137
+ if (records.has(obj.id)) {
3138
+ throw new Error(`CLI batch output contains duplicate id: ${obj.id}`);
3139
+ }
3140
+ const outputMessages = convertOutputMessages(obj.output_messages);
3141
+ let finalOutputMessages;
3142
+ if (outputMessages && outputMessages.length > 0) {
3143
+ finalOutputMessages = outputMessages;
2112
3144
  } else {
2113
3145
  const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
2114
- outputMessages = text ? [{ role: "assistant", content: text }] : [];
2115
- }
2116
- records.set(id, {
2117
- outputMessages,
2118
- tokenUsage,
2119
- costUsd,
2120
- durationMs
3146
+ finalOutputMessages = text ? [{ role: "assistant", content: text }] : [];
3147
+ }
3148
+ const metrics = validateMetrics(obj.cost_usd, obj.duration_ms, `batch record '${obj.id}'`);
3149
+ records.set(obj.id, {
3150
+ outputMessages: finalOutputMessages,
3151
+ tokenUsage: obj.token_usage,
3152
+ costUsd: metrics.costUsd,
3153
+ durationMs: metrics.durationMs
2121
3154
  });
2122
3155
  }
2123
3156
  return records;
@@ -2203,7 +3236,7 @@ var CliProvider = class {
2203
3236
  }
2204
3237
  };
2205
3238
  function buildTemplateValues(request, config, outputFilePath) {
2206
- const inputFiles = normalizeInputFiles(request.inputFiles);
3239
+ const inputFiles = normalizeInputFiles2(request.inputFiles);
2207
3240
  return {
2208
3241
  PROMPT: shellEscape(request.question ?? ""),
2209
3242
  GUIDELINES: shellEscape(request.guidelines ?? ""),
@@ -2213,13 +3246,13 @@ function buildTemplateValues(request, config, outputFilePath) {
2213
3246
  OUTPUT_FILE: shellEscape(outputFilePath)
2214
3247
  };
2215
3248
  }
2216
- function normalizeInputFiles(inputFiles) {
3249
+ function normalizeInputFiles2(inputFiles) {
2217
3250
  if (!inputFiles || inputFiles.length === 0) {
2218
3251
  return void 0;
2219
3252
  }
2220
3253
  const unique = /* @__PURE__ */ new Map();
2221
3254
  for (const inputFile of inputFiles) {
2222
- const absolutePath = path7.resolve(inputFile);
3255
+ const absolutePath = path9.resolve(inputFile);
2223
3256
  if (!unique.has(absolutePath)) {
2224
3257
  unique.set(absolutePath, absolutePath);
2225
3258
  }
@@ -2233,7 +3266,7 @@ function formatFileList(files, template) {
2233
3266
  const formatter = template ?? "{path}";
2234
3267
  return files.map((filePath) => {
2235
3268
  const escapedPath = shellEscape(filePath);
2236
- const escapedName = shellEscape(path7.basename(filePath));
3269
+ const escapedName = shellEscape(path9.basename(filePath));
2237
3270
  return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
2238
3271
  }).join(" ");
2239
3272
  }
@@ -2257,9 +3290,9 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
2257
3290
  const safeEvalId = evalCaseId || "unknown";
2258
3291
  const timestamp = Date.now();
2259
3292
  const random = Math.random().toString(36).substring(2, 9);
2260
- return path7.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
3293
+ return path9.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
2261
3294
  }
2262
- function formatTimeoutSuffix(timeoutMs) {
3295
+ function formatTimeoutSuffix2(timeoutMs) {
2263
3296
  if (!timeoutMs || timeoutMs <= 0) {
2264
3297
  return "";
2265
3298
  }
@@ -2268,39 +3301,39 @@ function formatTimeoutSuffix(timeoutMs) {
2268
3301
  }
2269
3302
 
2270
3303
  // src/evaluation/providers/codex.ts
2271
- import { exec as execCallback, spawn } from "node:child_process";
2272
- import { randomUUID } from "node:crypto";
2273
- import { constants as constants2, createWriteStream } from "node:fs";
2274
- import { access as access2, mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
2275
- import { tmpdir } from "node:os";
2276
- import path9 from "node:path";
3304
+ import { exec as execCallback, spawn as spawn2 } from "node:child_process";
3305
+ import { randomUUID as randomUUID2 } from "node:crypto";
3306
+ import { constants as constants2, createWriteStream as createWriteStream2 } from "node:fs";
3307
+ import { access as access2, mkdir as mkdir2, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile2 } from "node:fs/promises";
3308
+ import { tmpdir as tmpdir2 } from "node:os";
3309
+ import path10 from "node:path";
2277
3310
  import { promisify as promisify2 } from "node:util";
2278
3311
 
2279
3312
  // src/evaluation/providers/codex-log-tracker.ts
2280
- var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
2281
- var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
3313
+ var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.codexLogs");
3314
+ var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.codexLogSubscribers");
2282
3315
  function getCodexLogStore() {
2283
3316
  const globalObject = globalThis;
2284
- const existing = globalObject[GLOBAL_LOGS_KEY];
3317
+ const existing = globalObject[GLOBAL_LOGS_KEY2];
2285
3318
  if (existing) {
2286
3319
  return existing;
2287
3320
  }
2288
3321
  const created = [];
2289
- globalObject[GLOBAL_LOGS_KEY] = created;
3322
+ globalObject[GLOBAL_LOGS_KEY2] = created;
2290
3323
  return created;
2291
3324
  }
2292
- function getSubscriberStore() {
3325
+ function getSubscriberStore2() {
2293
3326
  const globalObject = globalThis;
2294
- const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
3327
+ const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
2295
3328
  if (existing) {
2296
3329
  return existing;
2297
3330
  }
2298
3331
  const created = /* @__PURE__ */ new Set();
2299
- globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
3332
+ globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
2300
3333
  return created;
2301
3334
  }
2302
- function notifySubscribers(entry) {
2303
- const subscribers = Array.from(getSubscriberStore());
3335
+ function notifySubscribers2(entry) {
3336
+ const subscribers = Array.from(getSubscriberStore2());
2304
3337
  for (const listener of subscribers) {
2305
3338
  try {
2306
3339
  listener(entry);
@@ -2312,7 +3345,7 @@ function notifySubscribers(entry) {
2312
3345
  }
2313
3346
  function recordCodexLogEntry(entry) {
2314
3347
  getCodexLogStore().push(entry);
2315
- notifySubscribers(entry);
3348
+ notifySubscribers2(entry);
2316
3349
  }
2317
3350
  function consumeCodexLogEntries() {
2318
3351
  const store = getCodexLogStore();
@@ -2322,118 +3355,19 @@ function consumeCodexLogEntries() {
2322
3355
  return store.splice(0, store.length);
2323
3356
  }
2324
3357
  function subscribeToCodexLogEntries(listener) {
2325
- const store = getSubscriberStore();
3358
+ const store = getSubscriberStore2();
2326
3359
  store.add(listener);
2327
3360
  return () => {
2328
3361
  store.delete(listener);
2329
3362
  };
2330
3363
  }
2331
3364
 
2332
- // src/evaluation/providers/preread.ts
2333
- import path8 from "node:path";
2334
- function buildPromptDocument(request, inputFiles, options) {
2335
- const parts = [];
2336
- const guidelineFiles = collectGuidelineFiles(
2337
- inputFiles,
2338
- options?.guidelinePatterns ?? request.guideline_patterns,
2339
- options?.guidelineOverrides
2340
- );
2341
- const inputFilesList = collectInputFiles(inputFiles);
2342
- const nonGuidelineInputFiles = inputFilesList.filter((file) => !guidelineFiles.includes(file));
2343
- const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
2344
- if (prereadBlock.length > 0) {
2345
- parts.push("\n", prereadBlock);
2346
- }
2347
- parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
2348
- return parts.join("\n").trim();
2349
- }
2350
- function normalizeInputFiles2(inputFiles) {
2351
- if (!inputFiles || inputFiles.length === 0) {
2352
- return void 0;
2353
- }
2354
- const deduped = /* @__PURE__ */ new Map();
2355
- for (const inputFile of inputFiles) {
2356
- const absolutePath = path8.resolve(inputFile);
2357
- if (!deduped.has(absolutePath)) {
2358
- deduped.set(absolutePath, absolutePath);
2359
- }
2360
- }
2361
- return Array.from(deduped.values());
2362
- }
2363
- function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
2364
- if (!inputFiles || inputFiles.length === 0) {
2365
- return [];
2366
- }
2367
- const unique = /* @__PURE__ */ new Map();
2368
- for (const inputFile of inputFiles) {
2369
- const absolutePath = path8.resolve(inputFile);
2370
- if (overrides?.has(absolutePath)) {
2371
- if (!unique.has(absolutePath)) {
2372
- unique.set(absolutePath, absolutePath);
2373
- }
2374
- continue;
2375
- }
2376
- const normalized = absolutePath.split(path8.sep).join("/");
2377
- if (isGuidelineFile(normalized, guidelinePatterns)) {
2378
- if (!unique.has(absolutePath)) {
2379
- unique.set(absolutePath, absolutePath);
2380
- }
2381
- }
2382
- }
2383
- return Array.from(unique.values());
2384
- }
2385
- function collectInputFiles(inputFiles) {
2386
- if (!inputFiles || inputFiles.length === 0) {
2387
- return [];
2388
- }
2389
- const unique = /* @__PURE__ */ new Map();
2390
- for (const inputFile of inputFiles) {
2391
- const absolutePath = path8.resolve(inputFile);
2392
- if (!unique.has(absolutePath)) {
2393
- unique.set(absolutePath, absolutePath);
2394
- }
2395
- }
2396
- return Array.from(unique.values());
2397
- }
2398
- function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
2399
- if (guidelineFiles.length === 0 && inputFiles.length === 0) {
2400
- return "";
2401
- }
2402
- const buildList = (files) => files.map((absolutePath) => {
2403
- const fileName = path8.basename(absolutePath);
2404
- const fileUri = pathToFileUri(absolutePath);
2405
- return `* [${fileName}](${fileUri})`;
2406
- });
2407
- const sections = [];
2408
- if (guidelineFiles.length > 0) {
2409
- sections.push(`Read all guideline files:
2410
- ${buildList(guidelineFiles).join("\n")}.`);
2411
- }
2412
- if (inputFiles.length > 0) {
2413
- sections.push(`Read all input files:
2414
- ${buildList(inputFiles).join("\n")}.`);
2415
- }
2416
- sections.push(
2417
- "If any file is missing, fail with ERROR: missing-file <filename> and stop.",
2418
- "Then apply system_instructions on the user query below."
2419
- );
2420
- return sections.join("\n");
2421
- }
2422
- function pathToFileUri(filePath) {
2423
- const absolutePath = path8.isAbsolute(filePath) ? filePath : path8.resolve(filePath);
2424
- const normalizedPath = absolutePath.replace(/\\/g, "/");
2425
- if (/^[a-zA-Z]:\//.test(normalizedPath)) {
2426
- return `file:///${normalizedPath}`;
2427
- }
2428
- return `file://${normalizedPath}`;
2429
- }
2430
-
2431
3365
  // src/evaluation/providers/codex.ts
2432
3366
  var execAsync2 = promisify2(execCallback);
2433
- var WORKSPACE_PREFIX = "agentv-codex-";
2434
- var PROMPT_FILENAME = "prompt.md";
3367
+ var WORKSPACE_PREFIX2 = "agentv-codex-";
3368
+ var PROMPT_FILENAME2 = "prompt.md";
2435
3369
  var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
2436
- var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
3370
+ var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
2437
3371
  - Do NOT create any additional output files in the workspace.
2438
3372
  - All intended file outputs/changes MUST be written in your response.
2439
3373
  - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
@@ -2458,27 +3392,27 @@ var CodexProvider = class {
2458
3392
  throw new Error("Codex provider request was aborted before execution");
2459
3393
  }
2460
3394
  await this.ensureEnvironmentReady();
2461
- const inputFiles = normalizeInputFiles2(request.inputFiles);
3395
+ const inputFiles = normalizeInputFiles(request.inputFiles);
2462
3396
  const workspaceRoot = await this.createWorkspace();
2463
3397
  const logger = await this.createStreamLogger(request).catch(() => void 0);
2464
3398
  try {
2465
3399
  const basePrompt = buildPromptDocument(request, inputFiles);
2466
- const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
3400
+ const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
2467
3401
  const promptContent = `${systemPrompt}
2468
3402
 
2469
3403
  ${basePrompt}`;
2470
- const promptFile = path9.join(workspaceRoot, PROMPT_FILENAME);
2471
- await writeFile(promptFile, promptContent, "utf8");
3404
+ const promptFile = path10.join(workspaceRoot, PROMPT_FILENAME2);
3405
+ await writeFile2(promptFile, promptContent, "utf8");
2472
3406
  const args = this.buildCodexArgs();
2473
3407
  const cwd = this.resolveCwd(workspaceRoot);
2474
3408
  const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
2475
3409
  if (result.timedOut) {
2476
3410
  throw new Error(
2477
- `Codex CLI timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
3411
+ `Codex CLI timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
2478
3412
  );
2479
3413
  }
2480
3414
  if (result.exitCode !== 0) {
2481
- const detail = pickDetail(result.stderr, result.stdout);
3415
+ const detail = pickDetail2(result.stderr, result.stdout);
2482
3416
  const prefix = `Codex CLI exited with code ${result.exitCode}`;
2483
3417
  throw new Error(detail ? `${prefix}: ${detail}` : prefix);
2484
3418
  }
@@ -2517,7 +3451,7 @@ ${basePrompt}`;
2517
3451
  if (!this.config.cwd) {
2518
3452
  return workspaceRoot;
2519
3453
  }
2520
- return path9.resolve(this.config.cwd);
3454
+ return path10.resolve(this.config.cwd);
2521
3455
  }
2522
3456
  buildCodexArgs() {
2523
3457
  const args = [
@@ -2559,11 +3493,11 @@ ${basePrompt}`;
2559
3493
  }
2560
3494
  }
2561
3495
  async createWorkspace() {
2562
- return await mkdtemp(path9.join(tmpdir(), WORKSPACE_PREFIX));
3496
+ return await mkdtemp2(path10.join(tmpdir2(), WORKSPACE_PREFIX2));
2563
3497
  }
2564
3498
  async cleanupWorkspace(workspaceRoot) {
2565
3499
  try {
2566
- await rm(workspaceRoot, { recursive: true, force: true });
3500
+ await rm2(workspaceRoot, { recursive: true, force: true });
2567
3501
  } catch {
2568
3502
  }
2569
3503
  }
@@ -2573,9 +3507,9 @@ ${basePrompt}`;
2573
3507
  return void 0;
2574
3508
  }
2575
3509
  if (this.config.logDir) {
2576
- return path9.resolve(this.config.logDir);
3510
+ return path10.resolve(this.config.logDir);
2577
3511
  }
2578
- return path9.join(process.cwd(), ".agentv", "logs", "codex");
3512
+ return path10.join(process.cwd(), ".agentv", "logs", "codex");
2579
3513
  }
2580
3514
  async createStreamLogger(request) {
2581
3515
  const logDir = this.resolveLogDirectory();
@@ -2583,13 +3517,13 @@ ${basePrompt}`;
2583
3517
  return void 0;
2584
3518
  }
2585
3519
  try {
2586
- await mkdir(logDir, { recursive: true });
3520
+ await mkdir2(logDir, { recursive: true });
2587
3521
  } catch (error) {
2588
3522
  const message = error instanceof Error ? error.message : String(error);
2589
3523
  console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
2590
3524
  return void 0;
2591
3525
  }
2592
- const filePath = path9.join(logDir, buildLogFilename(request, this.targetName));
3526
+ const filePath = path10.join(logDir, buildLogFilename2(request, this.targetName));
2593
3527
  try {
2594
3528
  const logger = await CodexStreamLogger.create({
2595
3529
  filePath,
@@ -2622,7 +3556,7 @@ var CodexStreamLogger = class _CodexStreamLogger {
2622
3556
  constructor(filePath, format) {
2623
3557
  this.filePath = filePath;
2624
3558
  this.format = format;
2625
- this.stream = createWriteStream(filePath, { flags: "a" });
3559
+ this.stream = createWriteStream2(filePath, { flags: "a" });
2626
3560
  }
2627
3561
  static async create(options) {
2628
3562
  const logger = new _CodexStreamLogger(options.filePath, options.format);
@@ -2683,7 +3617,7 @@ var CodexStreamLogger = class _CodexStreamLogger {
2683
3617
  return void 0;
2684
3618
  }
2685
3619
  const message = this.format === "json" ? formatCodexJsonLog(trimmed) : formatCodexLogMessage(trimmed, source);
2686
- return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`;
3620
+ return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
2687
3621
  }
2688
3622
  flushRemainder() {
2689
3623
  const stdoutRemainder = this.stdoutBuffer.trim();
@@ -2714,18 +3648,18 @@ function isCodexLogStreamingDisabled() {
2714
3648
  const normalized = envValue.trim().toLowerCase();
2715
3649
  return normalized === "false" || normalized === "0" || normalized === "off";
2716
3650
  }
2717
- function buildLogFilename(request, targetName) {
3651
+ function buildLogFilename2(request, targetName) {
2718
3652
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
2719
- const evalId = sanitizeForFilename(request.evalCaseId ?? "codex");
3653
+ const evalId = sanitizeForFilename2(request.evalCaseId ?? "codex");
2720
3654
  const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
2721
- const target = sanitizeForFilename(targetName);
2722
- return `${timestamp}_${target}_${evalId}${attemptSuffix}_${randomUUID().slice(0, 8)}.log`;
3655
+ const target = sanitizeForFilename2(targetName);
3656
+ return `${timestamp}_${target}_${evalId}${attemptSuffix}_${randomUUID2().slice(0, 8)}.log`;
2723
3657
  }
2724
- function sanitizeForFilename(value) {
3658
+ function sanitizeForFilename2(value) {
2725
3659
  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
2726
3660
  return sanitized.length > 0 ? sanitized : "codex";
2727
3661
  }
2728
- function formatElapsed(startedAt) {
3662
+ function formatElapsed2(startedAt) {
2729
3663
  const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
2730
3664
  const hours = Math.floor(elapsedSeconds / 3600);
2731
3665
  const minutes = Math.floor(elapsedSeconds % 3600 / 60);
@@ -2736,7 +3670,7 @@ function formatElapsed(startedAt) {
2736
3670
  return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
2737
3671
  }
2738
3672
  function formatCodexLogMessage(rawLine, source) {
2739
- const parsed = tryParseJsonValue(rawLine);
3673
+ const parsed = tryParseJsonValue2(rawLine);
2740
3674
  if (parsed) {
2741
3675
  const summary = summarizeCodexEvent(parsed);
2742
3676
  if (summary) {
@@ -2749,7 +3683,7 @@ function formatCodexLogMessage(rawLine, source) {
2749
3683
  return rawLine;
2750
3684
  }
2751
3685
  function formatCodexJsonLog(rawLine) {
2752
- const parsed = tryParseJsonValue(rawLine);
3686
+ const parsed = tryParseJsonValue2(rawLine);
2753
3687
  if (!parsed) {
2754
3688
  return rawLine;
2755
3689
  }
@@ -2794,7 +3728,7 @@ function summarizeCodexEvent(event) {
2794
3728
  }
2795
3729
  return type;
2796
3730
  }
2797
- function tryParseJsonValue(rawLine) {
3731
+ function tryParseJsonValue2(rawLine) {
2798
3732
  try {
2799
3733
  return JSON.parse(rawLine);
2800
3734
  } catch {
@@ -2804,7 +3738,7 @@ function tryParseJsonValue(rawLine) {
2804
3738
  async function locateExecutable(candidate) {
2805
3739
  const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
2806
3740
  if (includesPathSeparator) {
2807
- const resolved = path9.isAbsolute(candidate) ? candidate : path9.resolve(candidate);
3741
+ const resolved = path10.isAbsolute(candidate) ? candidate : path10.resolve(candidate);
2808
3742
  const executablePath = await ensureWindowsExecutableVariant(resolved);
2809
3743
  await access2(executablePath, constants2.F_OK);
2810
3744
  return executablePath;
@@ -3023,7 +3957,7 @@ function parseJsonLines(output) {
3023
3957
  }
3024
3958
  return parsed;
3025
3959
  }
3026
- function pickDetail(stderr, stdout) {
3960
+ function pickDetail2(stderr, stdout) {
3027
3961
  const errorText = stderr.trim();
3028
3962
  if (errorText.length > 0) {
3029
3963
  return errorText;
@@ -3031,7 +3965,7 @@ function pickDetail(stderr, stdout) {
3031
3965
  const stdoutText = stdout.trim();
3032
3966
  return stdoutText.length > 0 ? stdoutText : void 0;
3033
3967
  }
3034
- function formatTimeoutSuffix2(timeoutMs) {
3968
+ function formatTimeoutSuffix3(timeoutMs) {
3035
3969
  if (!timeoutMs || timeoutMs <= 0) {
3036
3970
  return "";
3037
3971
  }
@@ -3040,7 +3974,7 @@ function formatTimeoutSuffix2(timeoutMs) {
3040
3974
  }
3041
3975
  async function defaultCodexRunner(options) {
3042
3976
  return await new Promise((resolve, reject) => {
3043
- const child = spawn(options.executable, options.args, {
3977
+ const child = spawn2(options.executable, options.args, {
3044
3978
  cwd: options.cwd,
3045
3979
  env: options.env,
3046
3980
  stdio: ["pipe", "pipe", "pipe"],
@@ -3146,43 +4080,204 @@ var MockProvider = class {
3146
4080
  const max = Math.max(min, this.delayMaxMs);
3147
4081
  return Math.floor(Math.random() * (max - min + 1)) + min;
3148
4082
  }
3149
- return this.delayMs;
4083
+ return this.delayMs;
4084
+ }
4085
+ };
4086
+
4087
+ // src/evaluation/providers/pi-agent-sdk.ts
4088
+ var piAgentModule = null;
4089
+ var piAiModule = null;
4090
+ async function loadPiModules() {
4091
+ if (!piAgentModule || !piAiModule) {
4092
+ try {
4093
+ [piAgentModule, piAiModule] = await Promise.all([
4094
+ import("@mariozechner/pi-agent"),
4095
+ import("@mariozechner/pi-ai")
4096
+ ]);
4097
+ } catch (error) {
4098
+ throw new Error(
4099
+ `Failed to load pi-agent-sdk dependencies. Please install them:
4100
+ npm install @mariozechner/pi-agent @mariozechner/pi-ai
4101
+
4102
+ Original error: ${error instanceof Error ? error.message : String(error)}`
4103
+ );
4104
+ }
4105
+ }
4106
+ return {
4107
+ Agent: piAgentModule.Agent,
4108
+ ProviderTransport: piAgentModule.ProviderTransport,
4109
+ getModel: piAiModule.getModel,
4110
+ getEnvApiKey: piAiModule.getEnvApiKey
4111
+ };
4112
+ }
4113
+ var PiAgentSdkProvider = class {
4114
+ id;
4115
+ kind = "pi-agent-sdk";
4116
+ targetName;
4117
+ supportsBatch = false;
4118
+ config;
4119
+ constructor(targetName, config) {
4120
+ this.id = `pi-agent-sdk:${targetName}`;
4121
+ this.targetName = targetName;
4122
+ this.config = config;
4123
+ }
4124
+ async invoke(request) {
4125
+ if (request.signal?.aborted) {
4126
+ throw new Error("Pi agent SDK request was aborted before execution");
4127
+ }
4128
+ const { Agent, ProviderTransport, getModel, getEnvApiKey } = await loadPiModules();
4129
+ const startTime = Date.now();
4130
+ const providerName = this.config.provider ?? "anthropic";
4131
+ const modelId = this.config.model ?? "claude-sonnet-4-20250514";
4132
+ const model = getModel(providerName, modelId);
4133
+ const systemPrompt = this.config.systemPrompt ?? "Answer directly and concisely.";
4134
+ const transport = new ProviderTransport({
4135
+ getApiKey: async (provider) => {
4136
+ return this.config.apiKey ?? getEnvApiKey(provider) ?? void 0;
4137
+ }
4138
+ });
4139
+ const agent = new Agent({
4140
+ initialState: {
4141
+ systemPrompt,
4142
+ model,
4143
+ tools: [],
4144
+ // No tools for simple Q&A
4145
+ messages: []
4146
+ },
4147
+ transport
4148
+ });
4149
+ const outputMessages = [];
4150
+ let finalAssistantContent = "";
4151
+ const unsubscribe = agent.subscribe((event) => {
4152
+ if (event.type === "message_end") {
4153
+ const msg = event.message;
4154
+ if (msg.role === "assistant") {
4155
+ const content = extractTextContent2(msg.content);
4156
+ if (content) {
4157
+ finalAssistantContent = content;
4158
+ }
4159
+ }
4160
+ }
4161
+ });
4162
+ try {
4163
+ const timeoutMs = this.config.timeoutMs ?? 12e4;
4164
+ const timeoutPromise = new Promise((_, reject) => {
4165
+ setTimeout(
4166
+ () => reject(new Error(`Pi agent SDK timed out after ${timeoutMs}ms`)),
4167
+ timeoutMs
4168
+ );
4169
+ });
4170
+ await Promise.race([agent.prompt(request.question), timeoutPromise]);
4171
+ await agent.waitForIdle();
4172
+ const agentMessages = agent.state.messages;
4173
+ for (const msg of agentMessages) {
4174
+ outputMessages.push(convertAgentMessage(msg));
4175
+ }
4176
+ const durationMs = Date.now() - startTime;
4177
+ return {
4178
+ raw: {
4179
+ messages: agentMessages,
4180
+ systemPrompt,
4181
+ model: this.config.model,
4182
+ provider: this.config.provider
4183
+ },
4184
+ outputMessages,
4185
+ durationMs
4186
+ };
4187
+ } finally {
4188
+ unsubscribe();
4189
+ }
4190
+ }
4191
+ };
4192
+ function extractTextContent2(content) {
4193
+ if (typeof content === "string") {
4194
+ return content;
4195
+ }
4196
+ if (!Array.isArray(content)) {
4197
+ return void 0;
4198
+ }
4199
+ const textParts = [];
4200
+ for (const part of content) {
4201
+ if (!part || typeof part !== "object") {
4202
+ continue;
4203
+ }
4204
+ const p = part;
4205
+ if (p.type === "text" && typeof p.text === "string") {
4206
+ textParts.push(p.text);
4207
+ }
4208
+ }
4209
+ return textParts.length > 0 ? textParts.join("\n") : void 0;
4210
+ }
4211
+ function convertAgentMessage(message) {
4212
+ if (!message || typeof message !== "object") {
4213
+ return { role: "unknown", content: String(message) };
4214
+ }
4215
+ const msg = message;
4216
+ const role = typeof msg.role === "string" ? msg.role : "unknown";
4217
+ const content = extractTextContent2(msg.content);
4218
+ const toolCalls = extractToolCalls2(msg.content);
4219
+ const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
4220
+ return {
4221
+ role,
4222
+ content,
4223
+ toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
4224
+ timestamp
4225
+ };
4226
+ }
4227
+ function extractToolCalls2(content) {
4228
+ if (!Array.isArray(content)) {
4229
+ return [];
4230
+ }
4231
+ const toolCalls = [];
4232
+ for (const part of content) {
4233
+ if (!part || typeof part !== "object") {
4234
+ continue;
4235
+ }
4236
+ const p = part;
4237
+ if (p.type === "tool_use" && typeof p.name === "string") {
4238
+ toolCalls.push({
4239
+ tool: p.name,
4240
+ input: p.input,
4241
+ id: typeof p.id === "string" ? p.id : void 0
4242
+ });
4243
+ }
3150
4244
  }
3151
- };
4245
+ return toolCalls;
4246
+ }
3152
4247
 
3153
4248
  // src/evaluation/providers/pi-coding-agent.ts
3154
- import { spawn as spawn2 } from "node:child_process";
3155
- import { randomUUID as randomUUID2 } from "node:crypto";
3156
- import { createWriteStream as createWriteStream2 } from "node:fs";
3157
- import { mkdir as mkdir2, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile2 } from "node:fs/promises";
3158
- import { tmpdir as tmpdir2 } from "node:os";
3159
- import path10 from "node:path";
4249
+ import { spawn as spawn3 } from "node:child_process";
4250
+ import { randomUUID as randomUUID3 } from "node:crypto";
4251
+ import { createWriteStream as createWriteStream3 } from "node:fs";
4252
+ import { mkdir as mkdir3, mkdtemp as mkdtemp3, rm as rm3, writeFile as writeFile3 } from "node:fs/promises";
4253
+ import { tmpdir as tmpdir3 } from "node:os";
4254
+ import path11 from "node:path";
3160
4255
 
3161
4256
  // src/evaluation/providers/pi-log-tracker.ts
3162
- var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.piLogs");
3163
- var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.piLogSubscribers");
4257
+ var GLOBAL_LOGS_KEY3 = Symbol.for("agentv.piLogs");
4258
+ var GLOBAL_SUBSCRIBERS_KEY3 = Symbol.for("agentv.piLogSubscribers");
3164
4259
  function getPiLogStore() {
3165
4260
  const globalObject = globalThis;
3166
- const existing = globalObject[GLOBAL_LOGS_KEY2];
4261
+ const existing = globalObject[GLOBAL_LOGS_KEY3];
3167
4262
  if (existing) {
3168
4263
  return existing;
3169
4264
  }
3170
4265
  const created = [];
3171
- globalObject[GLOBAL_LOGS_KEY2] = created;
4266
+ globalObject[GLOBAL_LOGS_KEY3] = created;
3172
4267
  return created;
3173
4268
  }
3174
- function getSubscriberStore2() {
4269
+ function getSubscriberStore3() {
3175
4270
  const globalObject = globalThis;
3176
- const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
4271
+ const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY3];
3177
4272
  if (existing) {
3178
4273
  return existing;
3179
4274
  }
3180
4275
  const created = /* @__PURE__ */ new Set();
3181
- globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
4276
+ globalObject[GLOBAL_SUBSCRIBERS_KEY3] = created;
3182
4277
  return created;
3183
4278
  }
3184
- function notifySubscribers2(entry) {
3185
- const subscribers = Array.from(getSubscriberStore2());
4279
+ function notifySubscribers3(entry) {
4280
+ const subscribers = Array.from(getSubscriberStore3());
3186
4281
  for (const listener of subscribers) {
3187
4282
  try {
3188
4283
  listener(entry);
@@ -3194,7 +4289,7 @@ function notifySubscribers2(entry) {
3194
4289
  }
3195
4290
  function recordPiLogEntry(entry) {
3196
4291
  getPiLogStore().push(entry);
3197
- notifySubscribers2(entry);
4292
+ notifySubscribers3(entry);
3198
4293
  }
3199
4294
  function consumePiLogEntries() {
3200
4295
  const store = getPiLogStore();
@@ -3204,7 +4299,7 @@ function consumePiLogEntries() {
3204
4299
  return store.splice(0, store.length);
3205
4300
  }
3206
4301
  function subscribeToPiLogEntries(listener) {
3207
- const store = getSubscriberStore2();
4302
+ const store = getSubscriberStore3();
3208
4303
  store.add(listener);
3209
4304
  return () => {
3210
4305
  store.delete(listener);
@@ -3212,9 +4307,9 @@ function subscribeToPiLogEntries(listener) {
3212
4307
  }
3213
4308
 
3214
4309
  // src/evaluation/providers/pi-coding-agent.ts
3215
- var WORKSPACE_PREFIX2 = "agentv-pi-";
3216
- var PROMPT_FILENAME2 = "prompt.md";
3217
- var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
4310
+ var WORKSPACE_PREFIX3 = "agentv-pi-";
4311
+ var PROMPT_FILENAME3 = "prompt.md";
4312
+ var DEFAULT_SYSTEM_PROMPT4 = `**IMPORTANT**: Follow these instructions for your response:
3218
4313
  - Do NOT create any additional output files in the workspace.
3219
4314
  - All intended file outputs/changes MUST be written in your response.
3220
4315
  - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
@@ -3236,27 +4331,27 @@ var PiCodingAgentProvider = class {
3236
4331
  if (request.signal?.aborted) {
3237
4332
  throw new Error("Pi coding agent request was aborted before execution");
3238
4333
  }
3239
- const inputFiles = normalizeInputFiles2(request.inputFiles);
4334
+ const inputFiles = normalizeInputFiles(request.inputFiles);
3240
4335
  const workspaceRoot = await this.createWorkspace();
3241
4336
  const logger = await this.createStreamLogger(request).catch(() => void 0);
3242
4337
  try {
3243
- const promptFile = path10.join(workspaceRoot, PROMPT_FILENAME2);
3244
- await writeFile2(promptFile, request.question, "utf8");
4338
+ const promptFile = path11.join(workspaceRoot, PROMPT_FILENAME3);
4339
+ await writeFile3(promptFile, request.question, "utf8");
3245
4340
  const args = this.buildPiArgs(request.question, inputFiles);
3246
4341
  const cwd = this.resolveCwd(workspaceRoot);
3247
4342
  const result = await this.executePi(args, cwd, request.signal, logger);
3248
4343
  if (result.timedOut) {
3249
4344
  throw new Error(
3250
- `Pi coding agent timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
4345
+ `Pi coding agent timed out${formatTimeoutSuffix4(this.config.timeoutMs ?? void 0)}`
3251
4346
  );
3252
4347
  }
3253
4348
  if (result.exitCode !== 0) {
3254
- const detail = pickDetail2(result.stderr, result.stdout);
4349
+ const detail = pickDetail3(result.stderr, result.stdout);
3255
4350
  const prefix = `Pi coding agent exited with code ${result.exitCode}`;
3256
4351
  throw new Error(detail ? `${prefix}: ${detail}` : prefix);
3257
4352
  }
3258
4353
  const parsed = parsePiJsonl(result.stdout);
3259
- const outputMessages = extractOutputMessages(parsed);
4354
+ const outputMessages = extractOutputMessages2(parsed);
3260
4355
  const assistantText = extractAssistantText2(outputMessages);
3261
4356
  return {
3262
4357
  raw: {
@@ -3282,7 +4377,7 @@ var PiCodingAgentProvider = class {
3282
4377
  if (!this.config.cwd) {
3283
4378
  return workspaceRoot;
3284
4379
  }
3285
- return path10.resolve(this.config.cwd);
4380
+ return path11.resolve(this.config.cwd);
3286
4381
  }
3287
4382
  buildPiArgs(prompt, inputFiles) {
3288
4383
  const args = [];
@@ -3312,7 +4407,7 @@ var PiCodingAgentProvider = class {
3312
4407
  args.push(`@${file}`);
3313
4408
  }
3314
4409
  }
3315
- const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
4410
+ const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT4;
3316
4411
  const fullPrompt = `${systemPrompt}
3317
4412
 
3318
4413
  ${prompt}`;
@@ -3371,19 +4466,19 @@ ${prompt}`;
3371
4466
  return env;
3372
4467
  }
3373
4468
  async createWorkspace() {
3374
- return await mkdtemp2(path10.join(tmpdir2(), WORKSPACE_PREFIX2));
4469
+ return await mkdtemp3(path11.join(tmpdir3(), WORKSPACE_PREFIX3));
3375
4470
  }
3376
4471
  async cleanupWorkspace(workspaceRoot) {
3377
4472
  try {
3378
- await rm2(workspaceRoot, { recursive: true, force: true });
4473
+ await rm3(workspaceRoot, { recursive: true, force: true });
3379
4474
  } catch {
3380
4475
  }
3381
4476
  }
3382
4477
  resolveLogDirectory() {
3383
4478
  if (this.config.logDir) {
3384
- return path10.resolve(this.config.logDir);
4479
+ return path11.resolve(this.config.logDir);
3385
4480
  }
3386
- return path10.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
4481
+ return path11.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
3387
4482
  }
3388
4483
  async createStreamLogger(request) {
3389
4484
  const logDir = this.resolveLogDirectory();
@@ -3391,13 +4486,13 @@ ${prompt}`;
3391
4486
  return void 0;
3392
4487
  }
3393
4488
  try {
3394
- await mkdir2(logDir, { recursive: true });
4489
+ await mkdir3(logDir, { recursive: true });
3395
4490
  } catch (error) {
3396
4491
  const message = error instanceof Error ? error.message : String(error);
3397
4492
  console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
3398
4493
  return void 0;
3399
4494
  }
3400
- const filePath = path10.join(logDir, buildLogFilename2(request, this.targetName));
4495
+ const filePath = path11.join(logDir, buildLogFilename3(request, this.targetName));
3401
4496
  try {
3402
4497
  const logger = await PiStreamLogger.create({
3403
4498
  filePath,
@@ -3430,7 +4525,7 @@ var PiStreamLogger = class _PiStreamLogger {
3430
4525
  constructor(filePath, format) {
3431
4526
  this.filePath = filePath;
3432
4527
  this.format = format;
3433
- this.stream = createWriteStream2(filePath, { flags: "a" });
4528
+ this.stream = createWriteStream3(filePath, { flags: "a" });
3434
4529
  }
3435
4530
  static async create(options) {
3436
4531
  const logger = new _PiStreamLogger(options.filePath, options.format);
@@ -3491,7 +4586,7 @@ var PiStreamLogger = class _PiStreamLogger {
3491
4586
  return void 0;
3492
4587
  }
3493
4588
  const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source);
3494
- return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
4589
+ return `[+${formatElapsed3(this.startedAt)}] [${source}] ${message}`;
3495
4590
  }
3496
4591
  flushRemainder() {
3497
4592
  const stdoutRemainder = this.stdoutBuffer.trim();
@@ -3514,18 +4609,18 @@ var PiStreamLogger = class _PiStreamLogger {
3514
4609
  this.stderrBuffer = "";
3515
4610
  }
3516
4611
  };
3517
- function buildLogFilename2(request, targetName) {
4612
+ function buildLogFilename3(request, targetName) {
3518
4613
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
3519
- const evalId = sanitizeForFilename2(request.evalCaseId ?? "pi");
4614
+ const evalId = sanitizeForFilename3(request.evalCaseId ?? "pi");
3520
4615
  const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
3521
- const target = sanitizeForFilename2(targetName);
3522
- return `${timestamp}_${target}_${evalId}${attemptSuffix}_${randomUUID2().slice(0, 8)}.log`;
4616
+ const target = sanitizeForFilename3(targetName);
4617
+ return `${timestamp}_${target}_${evalId}${attemptSuffix}_${randomUUID3().slice(0, 8)}.log`;
3523
4618
  }
3524
- function sanitizeForFilename2(value) {
4619
+ function sanitizeForFilename3(value) {
3525
4620
  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
3526
4621
  return sanitized.length > 0 ? sanitized : "pi";
3527
4622
  }
3528
- function formatElapsed2(startedAt) {
4623
+ function formatElapsed3(startedAt) {
3529
4624
  const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
3530
4625
  const hours = Math.floor(elapsedSeconds / 3600);
3531
4626
  const minutes = Math.floor(elapsedSeconds % 3600 / 60);
@@ -3536,7 +4631,7 @@ function formatElapsed2(startedAt) {
3536
4631
  return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
3537
4632
  }
3538
4633
  function formatPiLogMessage(rawLine, source) {
3539
- const parsed = tryParseJsonValue2(rawLine);
4634
+ const parsed = tryParseJsonValue3(rawLine);
3540
4635
  if (parsed) {
3541
4636
  const summary = summarizePiEvent(parsed);
3542
4637
  if (summary) {
@@ -3549,7 +4644,7 @@ function formatPiLogMessage(rawLine, source) {
3549
4644
  return rawLine;
3550
4645
  }
3551
4646
  function formatPiJsonLog(rawLine) {
3552
- const parsed = tryParseJsonValue2(rawLine);
4647
+ const parsed = tryParseJsonValue3(rawLine);
3553
4648
  if (!parsed) {
3554
4649
  return rawLine;
3555
4650
  }
@@ -3599,7 +4694,7 @@ function summarizePiEvent(event) {
3599
4694
  return type;
3600
4695
  }
3601
4696
  }
3602
- function tryParseJsonValue2(rawLine) {
4697
+ function tryParseJsonValue3(rawLine) {
3603
4698
  try {
3604
4699
  return JSON.parse(rawLine);
3605
4700
  } catch {
@@ -3624,7 +4719,7 @@ function parsePiJsonl(output) {
3624
4719
  }
3625
4720
  return parsed;
3626
4721
  }
3627
- function extractOutputMessages(events) {
4722
+ function extractOutputMessages2(events) {
3628
4723
  for (let i = events.length - 1; i >= 0; i--) {
3629
4724
  const event = events[i];
3630
4725
  if (!event || typeof event !== "object") {
@@ -3665,8 +4760,8 @@ function convertPiMessage(message) {
3665
4760
  if (typeof role !== "string") {
3666
4761
  return void 0;
3667
4762
  }
3668
- const content = extractTextContent(msg.content);
3669
- const toolCalls = extractToolCalls(msg.content);
4763
+ const content = extractTextContent3(msg.content);
4764
+ const toolCalls = extractToolCalls3(msg.content);
3670
4765
  const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
3671
4766
  const metadata = {};
3672
4767
  if (msg.api) metadata.api = msg.api;
@@ -3682,7 +4777,7 @@ function convertPiMessage(message) {
3682
4777
  metadata: Object.keys(metadata).length > 0 ? metadata : void 0
3683
4778
  };
3684
4779
  }
3685
- function extractTextContent(content) {
4780
+ function extractTextContent3(content) {
3686
4781
  if (typeof content === "string") {
3687
4782
  return content;
3688
4783
  }
@@ -3701,7 +4796,7 @@ function extractTextContent(content) {
3701
4796
  }
3702
4797
  return textParts.length > 0 ? textParts.join("\n") : void 0;
3703
4798
  }
3704
- function extractToolCalls(content) {
4799
+ function extractToolCalls3(content) {
3705
4800
  if (!Array.isArray(content)) {
3706
4801
  return [];
3707
4802
  }
@@ -3746,7 +4841,7 @@ function extractAssistantText2(messages) {
3746
4841
  function escapeAtSymbols(prompt) {
3747
4842
  return prompt.replace(/@\[([^\]]+)\]:/g, "[[$1]]:");
3748
4843
  }
3749
- function pickDetail2(stderr, stdout) {
4844
+ function pickDetail3(stderr, stdout) {
3750
4845
  const errorText = stderr.trim();
3751
4846
  if (errorText.length > 0) {
3752
4847
  return errorText;
@@ -3754,7 +4849,7 @@ function pickDetail2(stderr, stdout) {
3754
4849
  const stdoutText = stdout.trim();
3755
4850
  return stdoutText.length > 0 ? stdoutText : void 0;
3756
4851
  }
3757
- function formatTimeoutSuffix3(timeoutMs) {
4852
+ function formatTimeoutSuffix4(timeoutMs) {
3758
4853
  if (!timeoutMs || timeoutMs <= 0) {
3759
4854
  return "";
3760
4855
  }
@@ -3767,7 +4862,7 @@ async function defaultPiRunner(options) {
3767
4862
  const executable = parts[0];
3768
4863
  const executableArgs = parts.slice(1);
3769
4864
  const allArgs = [...executableArgs, ...options.args];
3770
- const child = spawn2(executable, allArgs, {
4865
+ const child = spawn3(executable, allArgs, {
3771
4866
  cwd: options.cwd,
3772
4867
  env: options.env,
3773
4868
  stdio: ["pipe", "pipe", "pipe"],
@@ -3830,7 +4925,7 @@ async function defaultPiRunner(options) {
3830
4925
  }
3831
4926
 
3832
4927
  // src/evaluation/providers/vscode.ts
3833
- import path11 from "node:path";
4928
+ import path12 from "node:path";
3834
4929
  import {
3835
4930
  dispatchAgentSession,
3836
4931
  dispatchBatchAgent,
@@ -4005,7 +5100,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
4005
5100
  return "";
4006
5101
  }
4007
5102
  const buildList = (files) => files.map((absolutePath) => {
4008
- const fileName = path11.basename(absolutePath);
5103
+ const fileName = path12.basename(absolutePath);
4009
5104
  const fileUri = pathToFileUri2(absolutePath);
4010
5105
  return `* [${fileName}](${fileUri})`;
4011
5106
  });
@@ -4030,8 +5125,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
4030
5125
  }
4031
5126
  const unique = /* @__PURE__ */ new Map();
4032
5127
  for (const attachment of attachments) {
4033
- const absolutePath = path11.resolve(attachment);
4034
- const normalized = absolutePath.split(path11.sep).join("/");
5128
+ const absolutePath = path12.resolve(attachment);
5129
+ const normalized = absolutePath.split(path12.sep).join("/");
4035
5130
  if (isGuidelineFile(normalized, guidelinePatterns)) {
4036
5131
  if (!unique.has(absolutePath)) {
4037
5132
  unique.set(absolutePath, absolutePath);
@@ -4046,7 +5141,7 @@ function collectAttachmentFiles(attachments) {
4046
5141
  }
4047
5142
  const unique = /* @__PURE__ */ new Map();
4048
5143
  for (const attachment of attachments) {
4049
- const absolutePath = path11.resolve(attachment);
5144
+ const absolutePath = path12.resolve(attachment);
4050
5145
  if (!unique.has(absolutePath)) {
4051
5146
  unique.set(absolutePath, absolutePath);
4052
5147
  }
@@ -4054,7 +5149,7 @@ function collectAttachmentFiles(attachments) {
4054
5149
  return Array.from(unique.values());
4055
5150
  }
4056
5151
  function pathToFileUri2(filePath) {
4057
- const absolutePath = path11.isAbsolute(filePath) ? filePath : path11.resolve(filePath);
5152
+ const absolutePath = path12.isAbsolute(filePath) ? filePath : path12.resolve(filePath);
4058
5153
  const normalizedPath = absolutePath.replace(/\\/g, "/");
4059
5154
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
4060
5155
  return `file:///${normalizedPath}`;
@@ -4067,7 +5162,7 @@ function normalizeAttachments(attachments) {
4067
5162
  }
4068
5163
  const deduped = /* @__PURE__ */ new Set();
4069
5164
  for (const attachment of attachments) {
4070
- deduped.add(path11.resolve(attachment));
5165
+ deduped.add(path12.resolve(attachment));
4071
5166
  }
4072
5167
  return Array.from(deduped);
4073
5168
  }
@@ -4076,7 +5171,7 @@ function mergeAttachments(all) {
4076
5171
  for (const list of all) {
4077
5172
  if (!list) continue;
4078
5173
  for (const inputFile of list) {
4079
- deduped.add(path11.resolve(inputFile));
5174
+ deduped.add(path12.resolve(inputFile));
4080
5175
  }
4081
5176
  }
4082
5177
  return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -4125,7 +5220,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
4125
5220
  // src/evaluation/providers/targets-file.ts
4126
5221
  import { constants as constants3 } from "node:fs";
4127
5222
  import { access as access3, readFile as readFile6 } from "node:fs/promises";
4128
- import path12 from "node:path";
5223
+ import path13 from "node:path";
4129
5224
  import { parse as parse3 } from "yaml";
4130
5225
  function isRecord(value) {
4131
5226
  return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -4162,7 +5257,7 @@ async function fileExists3(filePath) {
4162
5257
  }
4163
5258
  }
4164
5259
  async function readTargetDefinitions(filePath) {
4165
- const absolutePath = path12.resolve(filePath);
5260
+ const absolutePath = path13.resolve(filePath);
4166
5261
  if (!await fileExists3(absolutePath)) {
4167
5262
  throw new Error(`targets.yaml not found at ${absolutePath}`);
4168
5263
  }
@@ -4196,6 +5291,10 @@ function createProvider(target) {
4196
5291
  return new CodexProvider(target.name, target.config);
4197
5292
  case "pi-coding-agent":
4198
5293
  return new PiCodingAgentProvider(target.name, target.config);
5294
+ case "pi-agent-sdk":
5295
+ return new PiAgentSdkProvider(target.name, target.config);
5296
+ case "claude-code":
5297
+ return new ClaudeCodeProvider(target.name, target.config);
4199
5298
  case "mock":
4200
5299
  return new MockProvider(target.name, target.config);
4201
5300
  case "vscode":
@@ -4214,73 +5313,170 @@ function resolveAndCreateProvider(definition, env = process.env) {
4214
5313
 
4215
5314
  // src/evaluation/evaluators.ts
4216
5315
  import { generateText as generateText2 } from "ai";
4217
- import { z } from "zod";
5316
+ import { z as z2 } from "zod";
4218
5317
 
4219
5318
  // src/runtime/exec.ts
4220
- function getBunSpawn() {
4221
- const bunSpawn = globalThis.Bun?.spawn;
4222
- return typeof bunSpawn === "function" ? bunSpawn : void 0;
5319
/**
 * Quote a filesystem path so it can be embedded in a shell command line.
 *
 * On win32 the value is wrapped in double quotes with embedded double quotes
 * doubled; on every other platform it is wrapped in single quotes and each
 * embedded single quote is replaced with the `'"'"'` sequence.
 *
 * @param {string} value - Path to quote.
 * @returns {string} The quoted path.
 */
function shellEscapePath(value) {
  const onWindows = process.platform === "win32";
  if (!onWindows) {
    // Close the quote, emit a double-quoted single quote, reopen the quote.
    const escaped = value.split("'").join(`'"'"'`);
    return `'${escaped}'`;
  }
  const doubled = value.split('"').join('""');
  return `"${doubled}"`;
}
4224
- async function execShellWithStdin(command, stdinPayload, options = {}) {
4225
- const bunSpawn = getBunSpawn();
4226
- if (bunSpawn) {
4227
- const encoder = new TextEncoder();
4228
- const proc = bunSpawn({
4229
- cmd: ["sh", "-c", command],
4230
- cwd: options.cwd,
4231
- stdin: encoder.encode(stdinPayload),
4232
- stdout: "pipe",
4233
- stderr: "pipe"
4234
- });
4235
- const timeout = options.timeoutMs ? setTimeout(() => {
4236
- proc.kill();
4237
- }, options.timeoutMs) : void 0;
4238
- try {
4239
- const stdout = await new Response(proc.stdout).text();
4240
- const stderr = await new Response(proc.stderr).text();
4241
- const exitCode = await proc.exited;
4242
- return { stdout, stderr, exitCode };
4243
- } finally {
4244
- if (timeout !== void 0) {
4245
- clearTimeout(timeout);
4246
- }
5325
/**
 * Run an executable given as an argv array and feed it `stdinPayload`.
 *
 * Dispatches to the Bun implementation when a global `Bun` exists and to the
 * Node implementation otherwise.
 *
 * @param {string[]} argv - Executable followed by its arguments.
 * @param {string} stdinPayload - Text handed to the runner as stdin content.
 * @param {{cwd?: string, timeoutMs?: number}} [options] - Passed through to the runner.
 * @returns {Promise<object>} Whatever the selected runner resolves with.
 * @throws {Error} When `argv` is empty.
 */
async function execFileWithStdin(argv, stdinPayload, options = {}) {
  if (argv.length === 0) {
    throw new Error("Executable argv must include at least one entry");
  }
  const runningOnBun = typeof Bun !== "undefined";
  const runner = runningOnBun ? execFileWithStdinBun : execFileWithStdinNode;
  return runner(argv, stdinPayload, options);
}
5334
+ async function execFileWithStdinBun(argv, stdinPayload, options) {
5335
+ const command = [...argv];
5336
+ const encoder = new TextEncoder();
5337
+ const proc = Bun.spawn(command, {
5338
+ cwd: options.cwd,
5339
+ stdin: encoder.encode(stdinPayload),
5340
+ stdout: "pipe",
5341
+ stderr: "pipe"
5342
+ });
5343
+ let timedOut = false;
5344
+ const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
5345
+ timedOut = true;
5346
+ proc.kill("SIGKILL");
5347
+ }, options.timeoutMs) : void 0;
5348
+ try {
5349
+ const stdoutPromise = proc.stdout ? new Response(proc.stdout).text() : Promise.resolve("");
5350
+ const stderrPromise = proc.stderr ? new Response(proc.stderr).text() : Promise.resolve("");
5351
+ const [stdout, stderr, exitCode] = await Promise.all([
5352
+ stdoutPromise,
5353
+ stderrPromise,
5354
+ proc.exited
5355
+ ]);
5356
+ if (timedOut) {
5357
+ throw new Error(`Process timed out after ${options.timeoutMs}ms`);
5358
+ }
5359
+ return {
5360
+ stdout: stdout.replace(/\r\n/g, "\n"),
5361
+ stderr: stderr.replace(/\r\n/g, "\n"),
5362
+ exitCode
5363
+ };
5364
+ } finally {
5365
+ if (timeout !== void 0) {
5366
+ clearTimeout(timeout);
4247
5367
  }
4248
5368
  }
4249
- const { spawn: spawn3 } = await import("node:child_process");
4250
- return await new Promise((resolve, reject) => {
4251
- const child = spawn3(command, {
4252
- shell: true,
5369
+ }
5370
+ async function execFileWithStdinNode(argv, stdinPayload, options) {
5371
+ const { spawn: spawn4 } = await import("node:child_process");
5372
+ return new Promise((resolve, reject) => {
5373
+ const [cmd, ...args] = argv;
5374
+ const child = spawn4(cmd, args, {
4253
5375
  cwd: options.cwd,
4254
5376
  stdio: ["pipe", "pipe", "pipe"]
4255
5377
  });
4256
- let stdout = "";
4257
- let stderr = "";
4258
- const timeout = options.timeoutMs ? setTimeout(() => {
4259
- child.kill();
4260
- reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
5378
+ const stdoutChunks = [];
5379
+ const stderrChunks = [];
5380
+ child.stdout?.on("data", (chunk) => stdoutChunks.push(chunk));
5381
+ child.stderr?.on("data", (chunk) => stderrChunks.push(chunk));
5382
+ let timedOut = false;
5383
+ const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
5384
+ timedOut = true;
5385
+ child.kill("SIGKILL");
4261
5386
  }, options.timeoutMs) : void 0;
4262
- child.stdout?.on("data", (data) => {
4263
- stdout += data.toString();
4264
- });
4265
- child.stderr?.on("data", (data) => {
4266
- stderr += data.toString();
4267
- });
4268
5387
  child.on("error", (error) => {
4269
- if (timeout !== void 0) {
4270
- clearTimeout(timeout);
4271
- }
5388
+ if (timeout !== void 0) clearTimeout(timeout);
4272
5389
  reject(error);
4273
5390
  });
4274
- child.on("exit", (code) => {
4275
- if (timeout !== void 0) {
4276
- clearTimeout(timeout);
5391
+ child.on("close", (code) => {
5392
+ if (timeout !== void 0) clearTimeout(timeout);
5393
+ if (timedOut) {
5394
+ reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
5395
+ return;
4277
5396
  }
4278
- resolve({ stdout, stderr, exitCode: code ?? 0 });
5397
+ const stdout = Buffer.concat(stdoutChunks).toString("utf8").replace(/\r\n/g, "\n");
5398
+ const stderr = Buffer.concat(stderrChunks).toString("utf8").replace(/\r\n/g, "\n");
5399
+ resolve({
5400
+ stdout,
5401
+ stderr,
5402
+ exitCode: code ?? 0
5403
+ });
4279
5404
  });
4280
- child.stdin?.write(stdinPayload);
4281
- child.stdin?.end();
5405
+ if (child.stdin) {
5406
+ child.stdin.write(stdinPayload);
5407
+ child.stdin.end();
5408
+ }
4282
5409
  });
4283
5410
  }
5411
/**
 * Run `command` through the system shell with `stdinPayload` as its stdin.
 *
 * The payload is written to a file in a fresh temporary directory and the
 * command is wrapped so that stdin, stdout and stderr are all redirected to
 * files in that directory; the child itself is spawned with all stdio ignored.
 * After the child exits, the captured files are read back with CRLF normalized
 * to LF. The temporary directory is always removed, including when writing
 * the stdin file or spawning fails.
 *
 * @param {string} command - Shell command line to execute.
 * @param {string} stdinPayload - Text made available to the command on stdin.
 * @param {{cwd?: string, timeoutMs?: number}} [options] - Working directory and
 *   optional timeout; a falsy `timeoutMs` disables the timeout.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 * @throws {Error} When the process cannot be spawned or the timeout elapses.
 */
async function execShellWithStdin(command, stdinPayload, options = {}) {
  const { mkdir, readFile, rm, writeFile } = await import("node:fs/promises");
  const { tmpdir } = await import("node:os");
  const nodePath = await import("node:path");
  const { randomUUID } = await import("node:crypto");
  const { spawn } = await import("node:child_process");

  const dir = nodePath.join(tmpdir(), `agentv-exec-${randomUUID()}`);
  await mkdir(dir, { recursive: true });
  try {
    const stdinPath = nodePath.join(dir, "stdin.txt");
    const stdoutPath = nodePath.join(dir, "stdout.txt");
    const stderrPath = nodePath.join(dir, "stderr.txt");
    // Written inside the try so a failed write still cleans up the directory.
    await writeFile(stdinPath, stdinPayload, "utf8");

    // The redirection syntax is the same on every platform; only the quoting
    // (handled by shellEscapePath) differs.
    const wrappedCommand = `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;

    const exitCode = await new Promise((resolve, reject) => {
      const child = spawn(wrappedCommand, {
        shell: true,
        cwd: options.cwd,
        stdio: ["ignore", "ignore", "ignore"]
      });
      const timeout = options.timeoutMs ? setTimeout(() => {
        child.kill();
        reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
      }, options.timeoutMs) : void 0;
      const clearPendingTimeout = () => {
        if (timeout !== void 0) {
          clearTimeout(timeout);
        }
      };
      child.on("error", (error) => {
        clearPendingTimeout();
        reject(error);
      });
      child.on("exit", (code) => {
        clearPendingTimeout();
        // A null code is reported as 0, matching the previous behavior.
        resolve(code ?? 0);
      });
    });

    const stdout = (await readFile(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
    const stderr = (await readFile(stderrPath, "utf8")).replace(/\r\n/g, "\n");
    return { stdout, stderr, exitCode };
  } finally {
    await rm(dir, { recursive: true, force: true });
  }
}
5455
+
5456
+ // src/evaluation/case-conversion.ts
5457
/**
 * Convert a camelCase identifier to snake_case.
 *
 * Strings that begin with an uppercase ASCII letter are returned untouched;
 * otherwise every uppercase ASCII letter becomes an underscore followed by
 * its lowercase form.
 *
 * @param {string} str - Identifier to convert.
 * @returns {string} The converted identifier.
 */
function toSnakeCase(str) {
  const startsUppercase = /^[A-Z]/.test(str);
  return startsUppercase
    ? str
    : str.replace(/[A-Z]/g, (upper) => `_${upper.toLowerCase()}`);
}
5463
/**
 * Recursively rewrite object keys with `toSnakeCase`.
 *
 * Arrays are mapped element by element, objects are rebuilt as plain objects
 * with converted keys and recursively converted values, and every other value
 * (including null and undefined) is returned as-is.
 *
 * @param {*} obj - Value to convert.
 * @returns {*} A converted copy for arrays/objects, otherwise the input value.
 */
function toSnakeCaseDeep(obj) {
  if (Array.isArray(obj)) {
    return obj.map((item) => toSnakeCaseDeep(item));
  }
  const isObject = obj !== null && obj !== void 0 && typeof obj === "object";
  if (!isObject) {
    return obj;
  }
  const converted = {};
  Object.entries(obj).forEach(([key, value]) => {
    converted[toSnakeCase(key)] = toSnakeCaseDeep(value);
  });
  return converted;
}
4284
5480
 
4285
5481
  // src/evaluation/evaluators.ts
4286
5482
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
@@ -4300,20 +5496,20 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
4300
5496
 
4301
5497
  [[ ## candidate_answer ## ]]
4302
5498
  {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
4303
- var freeformEvaluationSchema = z.object({
4304
- score: z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
4305
- hits: z.array(z.string()).describe("Brief specific achievements").optional(),
4306
- misses: z.array(z.string()).describe("Brief failures or omissions").optional(),
4307
- reasoning: z.string().describe("Concise explanation (1-2 sentences)").optional()
5499
+ var freeformEvaluationSchema = z2.object({
5500
+ score: z2.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
5501
+ hits: z2.array(z2.string()).describe("Brief specific achievements").optional(),
5502
+ misses: z2.array(z2.string()).describe("Brief failures or omissions").optional(),
5503
+ reasoning: z2.string().describe("Concise explanation (1-2 sentences)").optional()
4308
5504
  });
4309
- var rubricCheckResultSchema = z.object({
4310
- id: z.string().describe("The ID of the rubric item being checked"),
4311
- satisfied: z.boolean().describe("Whether this rubric requirement is met"),
4312
- reasoning: z.string().describe("Brief explanation (1-2 sentences) for this check")
5505
+ var rubricCheckResultSchema = z2.object({
5506
+ id: z2.string().describe("The ID of the rubric item being checked"),
5507
+ satisfied: z2.boolean().describe("Whether this rubric requirement is met"),
5508
+ reasoning: z2.string().describe("Brief explanation (1-2 sentences) for this check")
4313
5509
  });
4314
- var rubricEvaluationSchema = z.object({
4315
- checks: z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
4316
- overall_reasoning: z.string().describe("Overall assessment summary (1-2 sentences)")
5510
+ var rubricEvaluationSchema = z2.object({
5511
+ checks: z2.array(rubricCheckResultSchema).describe("Results for each rubric item"),
5512
+ overall_reasoning: z2.string().describe("Overall assessment summary (1-2 sentences)")
4317
5513
  });
4318
5514
  var LlmJudgeEvaluator = class {
4319
5515
  kind = "llm_judge";
@@ -4549,30 +5745,30 @@ var CodeEvaluator = class {
4549
5745
  script;
4550
5746
  cwd;
4551
5747
  agentTimeoutMs;
5748
+ config;
4552
5749
  constructor(options) {
4553
5750
  this.script = options.script;
4554
5751
  this.cwd = options.cwd;
4555
5752
  this.agentTimeoutMs = options.agentTimeoutMs;
5753
+ this.config = options.config;
4556
5754
  }
4557
5755
  async evaluate(context) {
4558
- const inputPayload = JSON.stringify(
4559
- {
4560
- question: context.evalCase.question,
4561
- expectedOutcome: context.evalCase.expected_outcome,
4562
- expectedMessages: context.evalCase.expected_messages,
4563
- referenceAnswer: context.evalCase.reference_answer,
4564
- candidateAnswer: context.candidate,
4565
- outputMessages: context.outputMessages ?? null,
4566
- guidelineFiles: context.evalCase.guideline_paths,
4567
- inputFiles: context.evalCase.file_paths.filter(
4568
- (path14) => !context.evalCase.guideline_paths.includes(path14)
4569
- ),
4570
- inputMessages: context.evalCase.input_messages,
4571
- traceSummary: context.traceSummary ?? null
4572
- },
4573
- null,
4574
- 2
4575
- );
5756
+ const payload = {
5757
+ question: context.evalCase.question,
5758
+ expectedOutcome: context.evalCase.expected_outcome,
5759
+ expectedMessages: context.evalCase.expected_messages,
5760
+ referenceAnswer: context.evalCase.reference_answer,
5761
+ candidateAnswer: context.candidate,
5762
+ outputMessages: context.outputMessages ?? null,
5763
+ guidelineFiles: context.evalCase.guideline_paths,
5764
+ inputFiles: context.evalCase.file_paths.filter(
5765
+ (path15) => !context.evalCase.guideline_paths.includes(path15)
5766
+ ),
5767
+ inputMessages: context.evalCase.input_messages,
5768
+ traceSummary: context.traceSummary ?? null,
5769
+ config: this.config ?? null
5770
+ };
5771
+ const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
4576
5772
  try {
4577
5773
  const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
4578
5774
  const parsed = parseJsonSafe(stdout);
@@ -4638,18 +5834,25 @@ function calculateRubricScore(result, rubrics) {
4638
5834
  return { score, verdict, hits, misses };
4639
5835
  }
4640
5836
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
4641
- const { stdout, stderr, exitCode } = await execShellWithStdin(scriptPath, input, {
4642
- cwd,
4643
- timeoutMs: agentTimeoutMs
4644
- });
5837
+ const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs });
4645
5838
  if (exitCode !== 0) {
4646
- const trimmedErr = stderr.trim();
5839
+ const trimmedErr = formatStderr(stderr);
4647
5840
  throw new Error(
4648
5841
  trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
4649
5842
  );
4650
5843
  }
4651
5844
  return stdout.trim();
4652
5845
  }
5846
/**
 * Trim stderr text and cap it for inclusion in an error message.
 *
 * Output of at most 2000 characters is returned trimmed; longer output is
 * reduced to its last 2000 characters, preceded by a truncation notice line.
 *
 * @param {string} stderr - Raw stderr text.
 * @returns {string} Trimmed, possibly truncated text.
 */
function formatStderr(stderr) {
  const maxLength = 2e3;
  const text = stderr.trim();
  return text.length > maxLength
    ? `...(truncated, last ${maxLength} chars)\n${text.slice(-maxLength)}`
    : text;
}
4653
5856
  function parseJsonSafe(payload) {
4654
5857
  try {
4655
5858
  return JSON.parse(payload);
@@ -4881,22 +6084,438 @@ var ToolTrajectoryEvaluator = class {
4881
6084
  misses.push(`Position ${i}: ${expectedTool} args mismatch`);
4882
6085
  }
4883
6086
  } else {
4884
- misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
6087
+ misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
6088
+ }
6089
+ }
6090
+ for (let i = checkLength; i < expected.length; i++) {
6091
+ misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
6092
+ }
6093
+ const score = hits.length / expected.length;
6094
+ return {
6095
+ score,
6096
+ verdict: scoreToVerdict(score),
6097
+ hits,
6098
+ misses,
6099
+ expectedAspectCount: expected.length
6100
+ };
6101
+ }
6102
+ };
6103
// Date layouts used by the field_accuracy date comparison when a field
// configuration does not provide its own `formats` list.
var DEFAULT_DATE_FORMATS = [
  "YYYY-MM-DDTHH:mm:ssZ", // ISO with timezone
  "YYYY-MM-DDTHH:mm:ss", // ISO with time
  "YYYY-MM-DD", // ISO date
  "DD-MMM-YYYY", // Localized (e.g., "15-JAN-2025")
  "MM/DD/YYYY", // US format
  "DD/MM/YYYY", // EU format
  "MM-DD-YYYY", // US with dashes
  "DD-MM-YYYY" // EU with dashes
];
6121
// Lowercase English month names and abbreviations mapped to their zero-based
// month index (jan -> 0 ... dec -> 11). Each inner list holds the spellings of
// one month; the list position is the month index.
var MONTH_NAMES = Object.fromEntries(
  [
    ["jan", "january"],
    ["feb", "february"],
    ["mar", "march"],
    ["apr", "april"],
    ["may"],
    ["jun", "june"],
    ["jul", "july"],
    ["aug", "august"],
    ["sep", "sept", "september"],
    ["oct", "october"],
    ["nov", "november"],
    ["dec", "december"]
  ].flatMap((spellings, monthIndex) => spellings.map((name) => [name, monthIndex]))
);
6147
+ var FieldAccuracyEvaluator = class {
6148
+ kind = "field_accuracy";
6149
+ config;
6150
+ constructor(options) {
6151
+ this.config = options.config;
6152
+ }
6153
+ evaluate(context) {
6154
+ const { evalCase, candidate } = context;
6155
+ let candidateData;
6156
+ try {
6157
+ candidateData = parseJsonFromTextSafe(candidate);
6158
+ } catch {
6159
+ return {
6160
+ score: 0,
6161
+ verdict: "fail",
6162
+ hits: [],
6163
+ misses: ["Failed to parse candidate answer as JSON"],
6164
+ expectedAspectCount: this.config.fields.length,
6165
+ reasoning: "Candidate answer is not valid JSON"
6166
+ };
6167
+ }
6168
+ const expectedData = this.extractExpectedData(evalCase.expected_messages);
6169
+ if (!expectedData) {
6170
+ return {
6171
+ score: 0,
6172
+ verdict: "fail",
6173
+ hits: [],
6174
+ misses: ["No expected data found in expected_messages"],
6175
+ expectedAspectCount: this.config.fields.length,
6176
+ reasoning: "Could not extract expected data from expected_messages"
6177
+ };
6178
+ }
6179
+ const fieldResults = [];
6180
+ for (const fieldConfig of this.config.fields) {
6181
+ const result = this.evaluateField(fieldConfig, candidateData, expectedData);
6182
+ fieldResults.push(result);
6183
+ }
6184
+ return this.aggregateResults(fieldResults);
6185
+ }
6186
+ /**
6187
+ * Extract expected data from expected_messages array.
6188
+ * Looks for the last assistant message with content.
6189
+ */
6190
+ extractExpectedData(expectedMessages) {
6191
+ for (let i = expectedMessages.length - 1; i >= 0; i--) {
6192
+ const message = expectedMessages[i];
6193
+ if (message.role === "assistant" && message.content) {
6194
+ if (typeof message.content === "object" && message.content !== null) {
6195
+ return message.content;
6196
+ }
6197
+ if (typeof message.content === "string") {
6198
+ try {
6199
+ return parseJsonFromTextSafe(message.content);
6200
+ } catch {
6201
+ }
6202
+ }
6203
+ }
6204
+ }
6205
+ return void 0;
6206
+ }
6207
+ /**
6208
+ * Evaluate a single field against the expected value.
6209
+ */
6210
+ evaluateField(fieldConfig, candidateData, expectedData) {
6211
+ const { path: path15, match, required = true, weight = 1 } = fieldConfig;
6212
+ const candidateValue = resolvePath(candidateData, path15);
6213
+ const expectedValue = resolvePath(expectedData, path15);
6214
+ if (expectedValue === void 0) {
6215
+ return {
6216
+ path: path15,
6217
+ score: 1,
6218
+ // No expected value means no comparison needed
6219
+ weight,
6220
+ hit: true,
6221
+ message: `${path15}: no expected value`
6222
+ };
6223
+ }
6224
+ if (candidateValue === void 0) {
6225
+ if (required) {
6226
+ return {
6227
+ path: path15,
6228
+ score: 0,
6229
+ weight,
6230
+ hit: false,
6231
+ message: `${path15} (required, missing)`
6232
+ };
6233
+ }
6234
+ return {
6235
+ path: path15,
6236
+ score: 1,
6237
+ // Don't penalize missing optional fields
6238
+ weight: 0,
6239
+ // Zero weight means it won't affect the score
6240
+ hit: true,
6241
+ message: `${path15}: optional field missing`
6242
+ };
6243
+ }
6244
+ switch (match) {
6245
+ case "exact":
6246
+ return this.compareExact(path15, candidateValue, expectedValue, weight);
6247
+ case "numeric_tolerance":
6248
+ return this.compareNumericTolerance(
6249
+ path15,
6250
+ candidateValue,
6251
+ expectedValue,
6252
+ fieldConfig,
6253
+ weight
6254
+ );
6255
+ case "date":
6256
+ return this.compareDate(path15, candidateValue, expectedValue, fieldConfig, weight);
6257
+ default:
6258
+ return {
6259
+ path: path15,
6260
+ score: 0,
6261
+ weight,
6262
+ hit: false,
6263
+ message: `${path15}: unknown match type "${match}"`
6264
+ };
6265
+ }
6266
+ }
6267
+ /**
6268
+ * Exact equality comparison.
6269
+ */
6270
+ compareExact(path15, candidateValue, expectedValue, weight) {
6271
+ if (deepEqual(candidateValue, expectedValue)) {
6272
+ return {
6273
+ path: path15,
6274
+ score: 1,
6275
+ weight,
6276
+ hit: true,
6277
+ message: path15
6278
+ };
6279
+ }
6280
+ if (typeof candidateValue !== typeof expectedValue) {
6281
+ return {
6282
+ path: path15,
6283
+ score: 0,
6284
+ weight,
6285
+ hit: false,
6286
+ message: `${path15} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
6287
+ };
6288
+ }
6289
+ return {
6290
+ path: path15,
6291
+ score: 0,
6292
+ weight,
6293
+ hit: false,
6294
+ message: `${path15} (value mismatch)`
6295
+ };
6296
+ }
6297
+ /**
6298
+ * Numeric comparison with absolute or relative tolerance.
6299
+ */
6300
+ compareNumericTolerance(path15, candidateValue, expectedValue, fieldConfig, weight) {
6301
+ const { tolerance = 0, relative = false } = fieldConfig;
6302
+ const candidateNum = toNumber(candidateValue);
6303
+ const expectedNum = toNumber(expectedValue);
6304
+ if (candidateNum === null || expectedNum === null) {
6305
+ return {
6306
+ path: path15,
6307
+ score: 0,
6308
+ weight,
6309
+ hit: false,
6310
+ message: `${path15} (non-numeric value)`
6311
+ };
6312
+ }
6313
+ if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
6314
+ return {
6315
+ path: path15,
6316
+ score: 0,
6317
+ weight,
6318
+ hit: false,
6319
+ message: `${path15} (invalid numeric value)`
6320
+ };
6321
+ }
6322
+ const diff = Math.abs(candidateNum - expectedNum);
6323
+ let withinTolerance;
6324
+ if (relative) {
6325
+ const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
6326
+ withinTolerance = relativeDiff <= tolerance;
6327
+ } else {
6328
+ withinTolerance = diff <= tolerance;
6329
+ }
6330
+ if (withinTolerance) {
6331
+ return {
6332
+ path: path15,
6333
+ score: 1,
6334
+ weight,
6335
+ hit: true,
6336
+ message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
6337
+ };
6338
+ }
6339
+ return {
6340
+ path: path15,
6341
+ score: 0,
6342
+ weight,
6343
+ hit: false,
6344
+ message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
6345
+ };
6346
+ }
6347
+ /**
6348
+ * Date comparison with format normalization.
6349
+ */
6350
+ compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
6351
+ const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
6352
+ const candidateDate = parseDate(String(candidateValue), formats);
6353
+ const expectedDate = parseDate(String(expectedValue), formats);
6354
+ if (candidateDate === null) {
6355
+ return {
6356
+ path: path15,
6357
+ score: 0,
6358
+ weight,
6359
+ hit: false,
6360
+ message: `${path15} (unparseable candidate date)`
6361
+ };
6362
+ }
6363
+ if (expectedDate === null) {
6364
+ return {
6365
+ path: path15,
6366
+ score: 0,
6367
+ weight,
6368
+ hit: false,
6369
+ message: `${path15} (unparseable expected date)`
6370
+ };
6371
+ }
6372
+ if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
6373
+ return {
6374
+ path: path15,
6375
+ score: 1,
6376
+ weight,
6377
+ hit: true,
6378
+ message: path15
6379
+ };
6380
+ }
6381
+ return {
6382
+ path: path15,
6383
+ score: 0,
6384
+ weight,
6385
+ hit: false,
6386
+ message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
6387
+ };
6388
+ }
6389
+ /**
6390
+ * Aggregate field results using configured strategy.
6391
+ */
6392
+ aggregateResults(results) {
6393
+ const aggregation = this.config.aggregation ?? "weighted_average";
6394
+ const hits = [];
6395
+ const misses = [];
6396
+ for (const result of results) {
6397
+ if (result.hit) {
6398
+ hits.push(result.message);
6399
+ } else {
6400
+ misses.push(result.message);
4885
6401
  }
4886
6402
  }
4887
- for (let i = checkLength; i < expected.length; i++) {
4888
- misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
6403
+ let score;
6404
+ if (aggregation === "all_or_nothing") {
6405
+ score = misses.length === 0 ? 1 : 0;
6406
+ } else {
6407
+ const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
6408
+ if (totalWeight === 0) {
6409
+ score = results.length === 0 ? 1 : 0;
6410
+ } else {
6411
+ const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
6412
+ score = weightedSum / totalWeight;
6413
+ }
4889
6414
  }
4890
- const score = hits.length / expected.length;
6415
+ const reasoning = `${hits.length}/${results.length} fields matched`;
4891
6416
  return {
4892
- score,
6417
+ score: clampScore(score),
4893
6418
  verdict: scoreToVerdict(score),
4894
- hits,
4895
- misses,
4896
- expectedAspectCount: expected.length
6419
+ hits: hits.slice(0, 4),
6420
+ misses: misses.slice(0, 4),
6421
+ expectedAspectCount: results.length,
6422
+ reasoning
4897
6423
  };
4898
6424
  }
4899
6425
  };
6426
/**
 * Look up a nested value by a dot/bracket path such as `invoice.items[0].total`.
 *
 * The path is split on `.`, `[` and `]`; empty segments are dropped. Only own
 * properties are followed, so names that exist solely on a prototype
 * (`toString`, `constructor`, `__proto__`, ...) resolve to undefined instead
 * of leaking built-ins into field comparisons.
 *
 * @param {*} obj - Root value to search; falsy roots yield undefined.
 * @param {string} path15 - Path expression; an empty path yields undefined.
 * @returns {*} The value at the path, or undefined when any step is missing
 *   or a non-object is reached before the path is exhausted.
 */
function resolvePath(obj, path15) {
  if (!path15 || !obj) {
    return void 0;
  }
  const parts = path15.split(/\.|\[|\]/).filter((p) => p.length > 0);
  let current = obj;
  for (const part of parts) {
    if (current === null || typeof current !== "object") {
      return void 0;
    }
    // Numeric segments on arrays are normalized ("01" -> 1) as before.
    const key = /^\d+$/.test(part) && Array.isArray(current) ? String(Number.parseInt(part, 10)) : part;
    if (!Object.prototype.hasOwnProperty.call(current, key)) {
      return void 0;
    }
    current = current[key];
  }
  return current;
}
6448
/**
 * Coerce a value to a number for numeric comparison.
 *
 * Numbers are returned unchanged (including NaN and infinities); strings are
 * read with `Number.parseFloat`, yielding null when that produces NaN; every
 * other type yields null.
 *
 * @param {*} value - Value to coerce.
 * @returns {number|null} The numeric value, or null when not numeric.
 */
function toNumber(value) {
  switch (typeof value) {
    case "number":
      return value;
    case "string": {
      const parsed = Number.parseFloat(value);
      return Number.isNaN(parsed) ? null : parsed;
    }
    default:
      return null;
  }
}
6458
/**
 * Parse a date string into a Date whose local calendar fields hold the parsed
 * day, so results can be compared with getFullYear/getMonth/getDate.
 *
 * Recognized shapes, tried in this order:
 *   1. `YYYY-MM-DD`            - built as a local date (the native parser
 *                                would treat it as UTC midnight, shifting the
 *                                local day in zones behind UTC).
 *   2. `D-MMM-YYYY`            - month name looked up in MONTH_NAMES.
 *   3. `N/N/YYYY` or `N-N-YYYY` - day/month order decided by `formats`:
 *        only US-style entries -> month first; only EU-style -> day first;
 *        both or neither -> a value above 12 identifies the day, and fully
 *        ambiguous input is read month first.
 *   4. Anything else           - handed to the native Date parser.
 *
 * Explicit patterns run before the native parser because the native parser
 * reads `05/06/2025` month-first, which would override an EU-only `formats`
 * list. Impossible calendar days (e.g. 31 February) return null instead of
 * rolling over into the next month.
 *
 * @param {string} dateStr - Text to parse.
 * @param {string[]} formats - Accepted layout names (e.g. "DD/MM/YYYY").
 * @returns {Date|null} The parsed date, or null when it cannot be parsed.
 */
function parseDate(dateStr, formats) {
  if (!dateStr) return null;
  const trimmed = dateStr.trim();

  // Build a local date and reject values the Date constructor had to normalize.
  const buildLocalDate = (year, monthIndex, day) => {
    const date = new Date(year, monthIndex, day);
    const roundTrips = date.getFullYear() === year && date.getMonth() === monthIndex && date.getDate() === day;
    return roundTrips ? date : null;
  };

  const isoDateMatch = trimmed.match(/^(\d{4})-(\d{2})-(\d{2})$/);
  if (isoDateMatch) {
    return buildLocalDate(
      Number.parseInt(isoDateMatch[1], 10),
      Number.parseInt(isoDateMatch[2], 10) - 1,
      Number.parseInt(isoDateMatch[3], 10)
    );
  }

  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
  if (localizedMatch) {
    const month = MONTH_NAMES[localizedMatch[2].toLowerCase()];
    if (month !== void 0) {
      return buildLocalDate(
        Number.parseInt(localizedMatch[3], 10),
        month,
        Number.parseInt(localizedMatch[1], 10)
      );
    }
  }

  const numericMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
  if (numericMatch) {
    const first = Number.parseInt(numericMatch[1], 10);
    const second = Number.parseInt(numericMatch[2], 10);
    const year = Number.parseInt(numericMatch[3], 10);
    const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
    const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
    let dayFirst;
    if (hasUSFormat !== hasEUFormat) {
      // Exactly one convention is configured: honor it.
      dayFirst = hasEUFormat;
    } else {
      // Ambiguous configuration: a leading value above 12 can only be a day.
      dayFirst = first > 12 && second <= 12;
    }
    const month = dayFirst ? second : first;
    const day = dayFirst ? first : second;
    return buildLocalDate(year, month - 1, day);
  }

  // Remaining shapes (ISO date-times, RFC-style strings) use the native parser.
  const fallback = new Date(trimmed);
  return Number.isNaN(fallback.getTime()) ? null : fallback;
}
6510
/**
 * Format a Date as `YYYY-MM-DD` using its local calendar fields.
 *
 * Local getters are used rather than `toISOString()` (which formats in UTC):
 * a date built with `new Date(year, month, day)` is local midnight, and its
 * UTC day is the previous day in any zone ahead of UTC.
 *
 * @param {Date} date - Date to format.
 * @returns {string} Local date in `YYYY-MM-DD` form.
 */
function formatDateISO(date) {
  const year = String(date.getFullYear()).padStart(4, "0");
  const month = String(date.getMonth() + 1).padStart(2, "0");
  const day = String(date.getDate()).padStart(2, "0");
  return `${year}-${month}-${day}`;
}
6513
/**
 * Parse JSON out of model output that may be wrapped in Markdown code fences
 * or surrounded by prose.
 *
 * Code-fence markers are stripped first. The cleaned text is then parsed as a
 * whole, so valid top-level arrays and primitives are returned intact; only
 * when that fails is the span from the first `{` to the last `}` extracted
 * and parsed. Non-string input is treated as empty text.
 *
 * @param {*} text - Candidate text.
 * @returns {*} The parsed JSON value.
 * @throws {SyntaxError} When no parseable JSON is found.
 */
function parseJsonFromTextSafe(text) {
  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
  try {
    return JSON.parse(cleaned);
  } catch {
    // Not pure JSON; fall back to extracting an embedded object below.
  }
  const match = cleaned.match(/\{[\s\S]*\}/);
  return JSON.parse(match?.[0] ?? cleaned);
}
4900
6519
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
4901
6520
  {{EVALUATOR_RESULTS_JSON}}
4902
6521
 
@@ -5121,11 +6740,175 @@ var CompositeEvaluator = class {
5121
6740
  }
5122
6741
  }
5123
6742
  };
6743
/**
 * Evaluator that passes when the execution duration recorded in the trace
 * summary does not exceed `config.threshold` (milliseconds).
 */
var LatencyEvaluator = class {
  kind = "latency";
  config;
  /**
   * @param {{ config: { threshold: number } }} options - Evaluator options;
   *   only `options.config` is stored.
   */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Scores the run against the latency threshold.
   *
   * @param {{ traceSummary?: { durationMs?: number | null } }} context -
   *   Evaluation context; only `traceSummary.durationMs` is read.
   * @returns {object} A score object with `score` (1 or 0), `verdict`
   *   ("pass" or "fail"), `hits`, `misses`, `expectedAspectCount`,
   *   `reasoning` and `evaluatorRawRequest`.
   */
  evaluate(context) {
    const { threshold } = this.config;
    const durationMs = context.traceSummary?.durationMs;
    // Anything that is not a real number counts as "not reported". Without
    // this, a null duration would coerce to 0 in the comparison below and be
    // reported as a pass with "Duration nullms".
    if (typeof durationMs !== "number" || Number.isNaN(durationMs)) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No duration data available in trace"],
        expectedAspectCount: 1,
        reasoning: "Execution duration not reported by provider",
        evaluatorRawRequest: {
          type: "latency",
          threshold,
          durationMs: null
        }
      };
    }
    const passed = durationMs <= threshold;
    return {
      score: passed ? 1 : 0,
      verdict: passed ? "pass" : "fail",
      hits: passed ? [`Duration ${durationMs}ms <= ${threshold}ms threshold`] : [],
      misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
      expectedAspectCount: 1,
      reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
      evaluatorRawRequest: {
        type: "latency",
        threshold,
        durationMs
      }
    };
  }
};
6784
/**
 * Evaluator that passes when the execution cost recorded in the trace
 * summary does not exceed `config.budget` (both formatted as dollar amounts).
 */
var CostEvaluator = class {
  kind = "cost";
  config;
  /**
   * @param {{ config: { budget: number } }} options - Evaluator options;
   *   only `options.config` is stored.
   */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Scores the run against the cost budget.
   *
   * @param {{ traceSummary?: { costUsd?: number | null } }} context -
   *   Evaluation context; only `traceSummary.costUsd` is read.
   * @returns {object} A score object with `score` (1 or 0), `verdict`
   *   ("pass" or "fail"), `hits`, `misses`, `expectedAspectCount`,
   *   `reasoning` and `evaluatorRawRequest`.
   */
  evaluate(context) {
    const { budget } = this.config;
    const costUsd = context.traceSummary?.costUsd;
    // Anything that is not a real number counts as "not reported". Without
    // this, a null cost would pass the comparison (null coerces to 0) and
    // then crash in formatCost, because null has no toFixed method.
    if (typeof costUsd !== "number" || Number.isNaN(costUsd)) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No cost data available in trace"],
        expectedAspectCount: 1,
        reasoning: "Execution cost not reported by provider",
        evaluatorRawRequest: {
          type: "cost",
          budget,
          costUsd: null
        }
      };
    }
    const passed = costUsd <= budget;
    // Four decimal places so that sub-cent costs remain visible.
    const formatCost = (n) => `$${n.toFixed(4)}`;
    return {
      score: passed ? 1 : 0,
      verdict: passed ? "pass" : "fail",
      hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
      misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
      expectedAspectCount: 1,
      reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
      evaluatorRawRequest: {
        type: "cost",
        budget,
        costUsd
      }
    };
  }
};
6826
/**
 * Evaluator that checks the token usage recorded in the trace summary against
 * the optional `max_input`, `max_output` and `max_total` limits in its config.
 * Only limits that are numbers are checked; the run passes when none is exceeded.
 */
var TokenUsageEvaluator = class {
  kind = "token_usage";
  config;
  /**
   * @param {{ config: { max_total?: number, max_input?: number, max_output?: number } }} options -
   *   Evaluator options; only `options.config` is stored.
   */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Scores the run against the configured token limits.
   *
   * @param {{ traceSummary?: { tokenUsage?: { input: number, output: number, cached?: number } } }} context -
   *   Evaluation context; only `traceSummary.tokenUsage` is read.
   * @returns {object} A score object with `score` (1 or 0), `verdict`,
   *   `hits`, `misses`, `expectedAspectCount`, `reasoning` and
   *   `evaluatorRawRequest`.
   */
  evaluate(context) {
    const usage = context.traceSummary?.tokenUsage;
    const { max_total: maxTotal, max_input: maxInput, max_output: maxOutput } = this.config;

    // One expected aspect per configured numeric limit, never fewer than one.
    let configuredLimits = 0;
    for (const limit of [maxTotal, maxInput, maxOutput]) {
      if (typeof limit === "number") {
        configuredLimits += 1;
      }
    }
    const expectedAspectCount = configuredLimits > 1 ? configuredLimits : 1;

    // Echo of the configured limits, shared by both result shapes below.
    const limitsEcho = {
      type: "token_usage",
      max_total: maxTotal ?? null,
      max_input: maxInput ?? null,
      max_output: maxOutput ?? null
    };

    if (!usage) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No token usage data available in trace"],
        expectedAspectCount,
        reasoning: "Token usage not reported by provider",
        evaluatorRawRequest: { ...limitsEcho, tokenUsage: null }
      };
    }

    const { input, output } = usage;
    const cached = usage.cached ?? 0;
    // The total counts cached tokens in addition to input and output.
    const total = input + output + cached;

    const hits = [];
    const misses = [];
    // Checked in a fixed order: input, output, then total.
    const checks = [
      ["Input tokens", input, maxInput],
      ["Output tokens", output, maxOutput],
      ["Total tokens", total, maxTotal]
    ];
    for (const [label, actual, limit] of checks) {
      if (typeof limit !== "number") {
        continue;
      }
      if (actual <= limit) {
        hits.push(`${label} ${actual} <= ${limit}`);
      } else {
        misses.push(`${label} ${actual} > ${limit}`);
      }
    }

    const withinLimits = misses.length === 0;
    return {
      score: withinLimits ? 1 : 0,
      verdict: withinLimits ? "pass" : "fail",
      hits,
      misses,
      expectedAspectCount,
      reasoning: `token_usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
      evaluatorRawRequest: {
        ...limitsEcho,
        tokenUsage: { input, output, cached, total }
      }
    };
  }
};
5124
6908
 
5125
6909
  // src/evaluation/orchestrator.ts
5126
- import { createHash, randomUUID as randomUUID3 } from "node:crypto";
5127
- import { mkdir as mkdir3, writeFile as writeFile3 } from "node:fs/promises";
5128
- import path13 from "node:path";
6910
+ import { createHash } from "node:crypto";
6911
+ import path14 from "node:path";
5129
6912
 
5130
6913
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
5131
6914
  var Node = class {
@@ -5267,6 +7050,9 @@ function validateConcurrency(concurrency) {
5267
7050
  }
5268
7051
 
5269
7052
  // src/evaluation/orchestrator.ts
7053
/**
 * Decides whether prompts for this provider should use the "agent"
 * formatting mode: true for agent providers and for providers whose
 * `kind` is "cli".
 *
 * @param {{ kind: string }} provider - Provider to inspect.
 * @returns {boolean} Truthy when the provider is an agent provider or a CLI provider.
 */
function usesFileReferencePrompt(provider) {
  const agentLike = isAgentProvider(provider);
  if (agentLike) {
    return agentLike;
  }
  return provider.kind === "cli";
}
5270
7056
  async function runEvaluation(options) {
5271
7057
  const {
5272
7058
  testFilePath: evalFilePath,
@@ -5278,7 +7064,6 @@ async function runEvaluation(options) {
5278
7064
  evaluators,
5279
7065
  maxRetries,
5280
7066
  agentTimeoutMs,
5281
- promptDumpDir,
5282
7067
  cache,
5283
7068
  useCache,
5284
7069
  now,
@@ -5358,7 +7143,6 @@ async function runEvaluation(options) {
5358
7143
  provider: primaryProvider,
5359
7144
  target,
5360
7145
  evaluatorRegistry,
5361
- promptDumpDir,
5362
7146
  nowFn: now ?? (() => /* @__PURE__ */ new Date()),
5363
7147
  onProgress,
5364
7148
  onResult,
@@ -5400,7 +7184,6 @@ async function runEvaluation(options) {
5400
7184
  evaluators: evaluatorRegistry,
5401
7185
  maxRetries,
5402
7186
  agentTimeoutMs,
5403
- promptDumpDir,
5404
7187
  cache,
5405
7188
  useCache,
5406
7189
  now,
@@ -5443,7 +7226,8 @@ async function runEvaluation(options) {
5443
7226
  results.push(outcome.value);
5444
7227
  } else {
5445
7228
  const evalCase = filteredEvalCases[i];
5446
- const promptInputs = await buildPromptInputs(evalCase);
7229
+ const formattingMode = usesFileReferencePrompt(primaryProvider) ? "agent" : "lm";
7230
+ const promptInputs = await buildPromptInputs(evalCase, formattingMode);
5447
7231
  const errorResult = buildErrorResult(
5448
7232
  evalCase,
5449
7233
  target.name,
@@ -5466,7 +7250,6 @@ async function runBatchEvaluation(options) {
5466
7250
  provider,
5467
7251
  target,
5468
7252
  evaluatorRegistry,
5469
- promptDumpDir,
5470
7253
  nowFn,
5471
7254
  onProgress,
5472
7255
  onResult,
@@ -5474,12 +7257,9 @@ async function runBatchEvaluation(options) {
5474
7257
  agentTimeoutMs
5475
7258
  } = options;
5476
7259
  const promptInputsList = [];
5477
- const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
7260
+ const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
5478
7261
  for (const evalCase of evalCases) {
5479
7262
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
5480
- if (promptDumpDir) {
5481
- await dumpPrompt(promptDumpDir, evalCase, promptInputs);
5482
- }
5483
7263
  promptInputsList.push(promptInputs);
5484
7264
  }
5485
7265
  const batchRequests = evalCases.map((evalCase, index) => {
@@ -5521,13 +7301,20 @@ async function runBatchEvaluation(options) {
5521
7301
  const promptInputs = promptInputsList[i];
5522
7302
  const providerResponse = batchResponse[i];
5523
7303
  const outputMessages = providerResponse.outputMessages;
5524
- const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
7304
+ const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
7305
+ const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : hasExecutionMetrics ? {
7306
+ eventCount: 0,
7307
+ toolNames: [],
7308
+ toolCallsByName: {},
7309
+ errorCount: 0
7310
+ } : void 0;
5525
7311
  const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
5526
7312
  tokenUsage: providerResponse.tokenUsage,
5527
7313
  costUsd: providerResponse.costUsd,
5528
7314
  durationMs: providerResponse.durationMs
5529
7315
  }) : void 0;
5530
7316
  const candidate = extractLastAssistantContent(outputMessages);
7317
+ const providerError = extractProviderError(providerResponse);
5531
7318
  let result;
5532
7319
  try {
5533
7320
  result = await evaluateCandidate({
@@ -5544,6 +7331,9 @@ async function runBatchEvaluation(options) {
5544
7331
  outputMessages,
5545
7332
  traceSummary
5546
7333
  });
7334
+ if (providerError) {
7335
+ result = { ...result, error: providerError };
7336
+ }
5547
7337
  } catch (error) {
5548
7338
  const errorResult = buildErrorResult(
5549
7339
  evalCase,
@@ -5576,9 +7366,10 @@ async function runBatchEvaluation(options) {
5576
7366
  await onProgress({
5577
7367
  workerId: 1,
5578
7368
  evalId: evalCase.id,
5579
- status: "completed",
7369
+ status: result.error ? "failed" : "completed",
5580
7370
  startedAt: 0,
5581
- completedAt: Date.now()
7371
+ completedAt: Date.now(),
7372
+ error: result.error
5582
7373
  });
5583
7374
  }
5584
7375
  }
@@ -5593,17 +7384,13 @@ async function runEvalCase(options) {
5593
7384
  now,
5594
7385
  maxRetries,
5595
7386
  agentTimeoutMs,
5596
- promptDumpDir,
5597
7387
  cache,
5598
7388
  useCache,
5599
7389
  signal,
5600
7390
  judgeProvider
5601
7391
  } = options;
5602
- const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
7392
+ const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
5603
7393
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
5604
- if (promptDumpDir) {
5605
- await dumpPrompt(promptDumpDir, evalCase, promptInputs);
5606
- }
5607
7394
  const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
5608
7395
  let cachedResponse;
5609
7396
  if (cacheKey && cache) {
@@ -5647,15 +7434,22 @@ async function runEvalCase(options) {
5647
7434
  await cache.set(cacheKey, providerResponse);
5648
7435
  }
5649
7436
  const outputMessages = providerResponse.outputMessages;
5650
- const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
7437
+ const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
7438
+ const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : hasExecutionMetrics ? {
7439
+ eventCount: 0,
7440
+ toolNames: [],
7441
+ toolCallsByName: {},
7442
+ errorCount: 0
7443
+ } : void 0;
5651
7444
  const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
5652
7445
  tokenUsage: providerResponse.tokenUsage,
5653
7446
  costUsd: providerResponse.costUsd,
5654
7447
  durationMs: providerResponse.durationMs
5655
7448
  }) : void 0;
5656
7449
  const candidate = extractLastAssistantContent(outputMessages);
7450
+ const providerError = extractProviderError(providerResponse);
5657
7451
  try {
5658
- return await evaluateCandidate({
7452
+ const result = await evaluateCandidate({
5659
7453
  evalCase,
5660
7454
  candidate,
5661
7455
  target,
@@ -5669,6 +7463,7 @@ async function runEvalCase(options) {
5669
7463
  outputMessages,
5670
7464
  traceSummary
5671
7465
  });
7466
+ return providerError ? { ...result, error: providerError } : result;
5672
7467
  } catch (error) {
5673
7468
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
5674
7469
  }
@@ -5734,7 +7529,6 @@ async function evaluateCandidate(options) {
5734
7529
  candidateAnswer: candidate,
5735
7530
  target: target.name,
5736
7531
  reasoning: score.reasoning,
5737
- rawAspects: score.rawAspects,
5738
7532
  agentProviderRequest,
5739
7533
  lmProviderRequest,
5740
7534
  evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
@@ -5844,7 +7638,8 @@ async function runEvaluatorList(options) {
5844
7638
  const codeEvaluator = new CodeEvaluator({
5845
7639
  script: evaluator.script,
5846
7640
  cwd: evaluator.resolvedCwd ?? evaluator.cwd,
5847
- agentTimeoutMs
7641
+ agentTimeoutMs,
7642
+ config: evaluator.config
5848
7643
  });
5849
7644
  const score2 = await codeEvaluator.evaluate({
5850
7645
  evalCase,
@@ -5872,7 +7667,7 @@ async function runEvaluatorList(options) {
5872
7667
  });
5873
7668
  }
5874
7669
  if (evaluator.type === "composite") {
5875
- const evalFileDir = evalCase.guideline_paths[0] ? path13.dirname(evalCase.guideline_paths[0]) : process.cwd();
7670
+ const evalFileDir = evalCase.guideline_paths[0] ? path14.dirname(evalCase.guideline_paths[0]) : process.cwd();
5876
7671
  const createEvaluator = (memberConfig) => {
5877
7672
  switch (memberConfig.type) {
5878
7673
  case "llm_judge":
@@ -5881,7 +7676,8 @@ async function runEvaluatorList(options) {
5881
7676
  return new CodeEvaluator({
5882
7677
  script: memberConfig.script,
5883
7678
  cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
5884
- agentTimeoutMs
7679
+ agentTimeoutMs,
7680
+ config: memberConfig.config
5885
7681
  });
5886
7682
  case "composite":
5887
7683
  return new CompositeEvaluator({
@@ -5893,6 +7689,22 @@ async function runEvaluatorList(options) {
5893
7689
  return new ToolTrajectoryEvaluator({
5894
7690
  config: memberConfig
5895
7691
  });
7692
+ case "field_accuracy":
7693
+ return new FieldAccuracyEvaluator({
7694
+ config: memberConfig
7695
+ });
7696
+ case "latency":
7697
+ return new LatencyEvaluator({
7698
+ config: memberConfig
7699
+ });
7700
+ case "cost":
7701
+ return new CostEvaluator({
7702
+ config: memberConfig
7703
+ });
7704
+ case "token_usage":
7705
+ return new TokenUsageEvaluator({
7706
+ config: memberConfig
7707
+ });
5896
7708
  default: {
5897
7709
  const unknownConfig = memberConfig;
5898
7710
  throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -5912,7 +7724,9 @@ async function runEvaluatorList(options) {
5912
7724
  attempt,
5913
7725
  promptInputs,
5914
7726
  now,
5915
- judgeProvider
7727
+ judgeProvider,
7728
+ outputMessages,
7729
+ traceSummary
5916
7730
  });
5917
7731
  const weight = evaluator.weight ?? 1;
5918
7732
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -5957,6 +7771,118 @@ async function runEvaluatorList(options) {
5957
7771
  reasoning: score2.reasoning
5958
7772
  });
5959
7773
  }
7774
+ if (evaluator.type === "field_accuracy") {
7775
+ const fieldAccuracyEvaluator = new FieldAccuracyEvaluator({
7776
+ config: evaluator
7777
+ });
7778
+ const score2 = fieldAccuracyEvaluator.evaluate({
7779
+ evalCase,
7780
+ candidate,
7781
+ target,
7782
+ provider,
7783
+ attempt,
7784
+ promptInputs,
7785
+ now,
7786
+ outputMessages,
7787
+ traceSummary
7788
+ });
7789
+ const weight = evaluator.weight ?? 1;
7790
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
7791
+ evaluatorResults.push({
7792
+ name: evaluator.name,
7793
+ type: evaluator.type,
7794
+ score: score2.score,
7795
+ weight,
7796
+ verdict: score2.verdict,
7797
+ hits: score2.hits,
7798
+ misses: score2.misses,
7799
+ reasoning: score2.reasoning
7800
+ });
7801
+ }
7802
+ if (evaluator.type === "latency") {
7803
+ const latencyEvaluator = new LatencyEvaluator({
7804
+ config: evaluator
7805
+ });
7806
+ const score2 = latencyEvaluator.evaluate({
7807
+ evalCase,
7808
+ candidate,
7809
+ target,
7810
+ provider,
7811
+ attempt,
7812
+ promptInputs,
7813
+ now,
7814
+ outputMessages,
7815
+ traceSummary
7816
+ });
7817
+ const weight = evaluator.weight ?? 1;
7818
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
7819
+ evaluatorResults.push({
7820
+ name: evaluator.name,
7821
+ type: evaluator.type,
7822
+ score: score2.score,
7823
+ weight,
7824
+ verdict: score2.verdict,
7825
+ hits: score2.hits,
7826
+ misses: score2.misses,
7827
+ reasoning: score2.reasoning
7828
+ });
7829
+ }
7830
+ if (evaluator.type === "cost") {
7831
+ const costEvaluator = new CostEvaluator({
7832
+ config: evaluator
7833
+ });
7834
+ const score2 = costEvaluator.evaluate({
7835
+ evalCase,
7836
+ candidate,
7837
+ target,
7838
+ provider,
7839
+ attempt,
7840
+ promptInputs,
7841
+ now,
7842
+ outputMessages,
7843
+ traceSummary
7844
+ });
7845
+ const weight = evaluator.weight ?? 1;
7846
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
7847
+ evaluatorResults.push({
7848
+ name: evaluator.name,
7849
+ type: evaluator.type,
7850
+ score: score2.score,
7851
+ weight,
7852
+ verdict: score2.verdict,
7853
+ hits: score2.hits,
7854
+ misses: score2.misses,
7855
+ reasoning: score2.reasoning
7856
+ });
7857
+ }
7858
+ if (evaluator.type === "token_usage") {
7859
+ const tokenUsageEvaluator = new TokenUsageEvaluator({
7860
+ config: evaluator
7861
+ });
7862
+ const score2 = tokenUsageEvaluator.evaluate({
7863
+ evalCase,
7864
+ candidate,
7865
+ target,
7866
+ provider,
7867
+ attempt,
7868
+ promptInputs,
7869
+ now,
7870
+ outputMessages,
7871
+ traceSummary
7872
+ });
7873
+ const weight = evaluator.weight ?? 1;
7874
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
7875
+ evaluatorResults.push({
7876
+ name: evaluator.name,
7877
+ type: evaluator.type,
7878
+ score: score2.score,
7879
+ weight,
7880
+ verdict: score2.verdict,
7881
+ hits: score2.hits,
7882
+ misses: score2.misses,
7883
+ reasoning: score2.reasoning
7884
+ });
7885
+ }
5960
7886
  } catch (error) {
5961
7887
  const message = error instanceof Error ? error.message : String(error);
5962
7888
  const fallbackScore = {
@@ -5996,7 +7922,6 @@ async function runEvaluatorList(options) {
5996
7922
  (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
5997
7923
  0
5998
7924
  );
5999
- const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
6000
7925
  const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
6001
7926
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
6002
7927
  const score = {
@@ -6005,8 +7930,7 @@ async function runEvaluatorList(options) {
6005
7930
  hits,
6006
7931
  misses,
6007
7932
  expectedAspectCount,
6008
- reasoning,
6009
- rawAspects: rawAspects.length > 0 ? rawAspects : void 0
7933
+ reasoning
6010
7934
  };
6011
7935
  return { score, evaluatorResults };
6012
7936
  }
@@ -6081,26 +8005,6 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
6081
8005
  llm_judge: llmJudge
6082
8006
  };
6083
8007
  }
6084
- async function dumpPrompt(directory, evalCase, promptInputs) {
6085
- const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
6086
- const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
6087
- const filePath = path13.resolve(directory, filename);
6088
- await mkdir3(path13.dirname(filePath), { recursive: true });
6089
- const payload = {
6090
- eval_id: evalCase.id,
6091
- question: promptInputs.question,
6092
- guidelines: promptInputs.guidelines,
6093
- guideline_paths: evalCase.guideline_paths
6094
- };
6095
- await writeFile3(filePath, JSON.stringify(payload, null, 2), "utf8");
6096
- }
6097
- function sanitizeFilename(value) {
6098
- if (!value) {
6099
- return "prompt";
6100
- }
6101
- const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
6102
- return sanitized.length > 0 ? sanitized : randomUUID3();
6103
- }
6104
8008
  async function invokeProvider(provider, options) {
6105
8009
  const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
6106
8010
  const controller = new AbortController();
@@ -6164,12 +8068,23 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
6164
8068
  misses: [`Error: ${message}`],
6165
8069
  candidateAnswer: `Error occurred: ${message}`,
6166
8070
  target: targetName,
6167
- rawAspects: [],
6168
8071
  agentProviderRequest,
6169
8072
  lmProviderRequest,
6170
8073
  error: message
6171
8074
  };
6172
8075
  }
8076
/**
 * Pulls a provider-reported error message out of a provider response.
 *
 * The message is taken from `response.raw.error` when `raw` is a non-array
 * object and `error` is a string that is non-empty after trimming.
 *
 * @param {{ raw?: unknown }} response - Provider response to inspect.
 * @returns {string | undefined} The trimmed error message, or undefined when
 *   no usable message is present.
 */
function extractProviderError(response) {
  const { raw } = response;
  const isRecord = Boolean(raw) && typeof raw === "object" && !Array.isArray(raw);
  if (!isRecord) {
    return undefined;
  }
  const candidate = raw.error;
  if (typeof candidate === "string") {
    const message = candidate.trim();
    if (message.length > 0) {
      return message;
    }
  }
  return undefined;
}
6173
8088
  function createCacheKey(provider, target, evalCase, promptInputs) {
6174
8089
  const hash = createHash("sha256");
6175
8090
  hash.update(provider.id);
@@ -6228,15 +8143,15 @@ function computeWeightedMean(entries) {
6228
8143
 
6229
8144
  // src/evaluation/generators/rubric-generator.ts
6230
8145
  import { generateText as generateText3 } from "ai";
6231
- import { z as z2 } from "zod";
6232
- var rubricItemSchema = z2.object({
6233
- id: z2.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
6234
- description: z2.string().describe("What this rubric checks for"),
6235
- weight: z2.number().default(1).describe("Relative importance (default 1.0)"),
6236
- required: z2.boolean().default(true).describe("Whether this is a mandatory requirement")
8146
+ import { z as z3 } from "zod";
8147
+ var rubricItemSchema = z3.object({
8148
+ id: z3.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
8149
+ description: z3.string().describe("What this rubric checks for"),
8150
+ weight: z3.number().default(1).describe("Relative importance (default 1.0)"),
8151
+ required: z3.boolean().default(true).describe("Whether this is a mandatory requirement")
6237
8152
  });
6238
- var rubricGenerationSchema = z2.object({
6239
- rubrics: z2.array(rubricItemSchema).describe("List of evaluation rubrics")
8153
+ var rubricGenerationSchema = z3.object({
8154
+ rubrics: z3.array(rubricItemSchema).describe("List of evaluation rubrics")
6240
8155
  });
6241
8156
  async function generateRubrics(options) {
6242
8157
  const { expectedOutcome, question, referenceAnswer, provider } = options;
@@ -6313,15 +8228,20 @@ function createAgentKernel() {
6313
8228
  export {
6314
8229
  CodeEvaluator,
6315
8230
  CompositeEvaluator,
8231
+ CostEvaluator,
6316
8232
  DEFAULT_EXPLORATION_TOOLS,
8233
+ FieldAccuracyEvaluator,
8234
+ LatencyEvaluator,
6317
8235
  LlmJudgeEvaluator,
6318
8236
  TEST_MESSAGE_ROLES,
8237
+ TokenUsageEvaluator,
6319
8238
  ToolTrajectoryEvaluator,
6320
8239
  avgToolDurationMs,
6321
8240
  buildDirectoryChain,
6322
8241
  buildPromptInputs,
6323
8242
  buildSearchRoots,
6324
8243
  computeTraceSummary,
8244
+ consumeClaudeCodeLogEntries,
6325
8245
  consumeCodexLogEntries,
6326
8246
  consumePiLogEntries,
6327
8247
  createAgentKernel,
@@ -6352,6 +8272,7 @@ export {
6352
8272
  resolveTargetDefinition,
6353
8273
  runEvalCase,
6354
8274
  runEvaluation,
8275
+ subscribeToClaudeCodeLogEntries,
6355
8276
  subscribeToCodexLogEntries,
6356
8277
  subscribeToPiLogEntries,
6357
8278
  tokensPerTool