@agentv/core 1.5.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -77
- package/dist/{chunk-E2VSU4WZ.js → chunk-IBTKEEOT.js} +73 -1
- package/dist/chunk-IBTKEEOT.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +2536 -663
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +221 -10
- package/dist/index.d.ts +221 -10
- package/dist/index.js +2362 -568
- package/dist/index.js.map +1 -1
- package/package.json +5 -2
- package/dist/chunk-E2VSU4WZ.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -32,15 +32,20 @@ var index_exports = {};
|
|
|
32
32
|
__export(index_exports, {
|
|
33
33
|
CodeEvaluator: () => CodeEvaluator,
|
|
34
34
|
CompositeEvaluator: () => CompositeEvaluator,
|
|
35
|
+
CostEvaluator: () => CostEvaluator,
|
|
35
36
|
DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
|
|
37
|
+
FieldAccuracyEvaluator: () => FieldAccuracyEvaluator,
|
|
38
|
+
LatencyEvaluator: () => LatencyEvaluator,
|
|
36
39
|
LlmJudgeEvaluator: () => LlmJudgeEvaluator,
|
|
37
40
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
41
|
+
TokenUsageEvaluator: () => TokenUsageEvaluator,
|
|
38
42
|
ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
|
|
39
43
|
avgToolDurationMs: () => avgToolDurationMs,
|
|
40
44
|
buildDirectoryChain: () => buildDirectoryChain2,
|
|
41
45
|
buildPromptInputs: () => buildPromptInputs,
|
|
42
46
|
buildSearchRoots: () => buildSearchRoots2,
|
|
43
47
|
computeTraceSummary: () => computeTraceSummary,
|
|
48
|
+
consumeClaudeCodeLogEntries: () => consumeClaudeCodeLogEntries,
|
|
44
49
|
consumeCodexLogEntries: () => consumeCodexLogEntries,
|
|
45
50
|
consumePiLogEntries: () => consumePiLogEntries,
|
|
46
51
|
createAgentKernel: () => createAgentKernel,
|
|
@@ -62,6 +67,8 @@ __export(index_exports, {
|
|
|
62
67
|
loadEvalCases: () => loadEvalCases,
|
|
63
68
|
mergeExecutionMetrics: () => mergeExecutionMetrics,
|
|
64
69
|
normalizeLineEndings: () => normalizeLineEndings,
|
|
70
|
+
parseCodeJudgePayload: () => parseCodeJudgePayload,
|
|
71
|
+
readCodeJudgePayload: () => readCodeJudgePayload,
|
|
65
72
|
readJsonFile: () => readJsonFile,
|
|
66
73
|
readTargetDefinitions: () => readTargetDefinitions,
|
|
67
74
|
readTestSuiteMetadata: () => readTestSuiteMetadata,
|
|
@@ -71,6 +78,7 @@ __export(index_exports, {
|
|
|
71
78
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
72
79
|
runEvalCase: () => runEvalCase,
|
|
73
80
|
runEvaluation: () => runEvaluation,
|
|
81
|
+
subscribeToClaudeCodeLogEntries: () => subscribeToClaudeCodeLogEntries,
|
|
74
82
|
subscribeToCodexLogEntries: () => subscribeToCodexLogEntries,
|
|
75
83
|
subscribeToPiLogEntries: () => subscribeToPiLogEntries,
|
|
76
84
|
tokensPerTool: () => tokensPerTool
|
|
@@ -129,7 +137,11 @@ var EVALUATOR_KIND_VALUES = [
|
|
|
129
137
|
"llm_judge",
|
|
130
138
|
"rubric",
|
|
131
139
|
"composite",
|
|
132
|
-
"tool_trajectory"
|
|
140
|
+
"tool_trajectory",
|
|
141
|
+
"field_accuracy",
|
|
142
|
+
"latency",
|
|
143
|
+
"cost",
|
|
144
|
+
"token_usage"
|
|
133
145
|
];
|
|
134
146
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
135
147
|
function isEvaluatorKind(value) {
|
|
@@ -551,7 +563,22 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
551
563
|
continue;
|
|
552
564
|
}
|
|
553
565
|
if (typeValue === "code_judge") {
|
|
554
|
-
|
|
566
|
+
let script;
|
|
567
|
+
const rawScript = rawEvaluator.script;
|
|
568
|
+
if (typeof rawScript === "string") {
|
|
569
|
+
const trimmed = rawScript.trim();
|
|
570
|
+
if (trimmed.length === 0) {
|
|
571
|
+
throw new Error(
|
|
572
|
+
`Invalid code_judge script for evaluator '${name}' in '${evalId}': script cannot be empty`
|
|
573
|
+
);
|
|
574
|
+
}
|
|
575
|
+
script = parseCommandToArgv(trimmed);
|
|
576
|
+
} else {
|
|
577
|
+
script = asStringArray(
|
|
578
|
+
rawScript,
|
|
579
|
+
`code_judge script for evaluator '${name}' in '${evalId}'`
|
|
580
|
+
);
|
|
581
|
+
}
|
|
555
582
|
if (!script) {
|
|
556
583
|
logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
|
|
557
584
|
continue;
|
|
@@ -572,13 +599,21 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
572
599
|
} else {
|
|
573
600
|
resolvedCwd = searchRoots[0];
|
|
574
601
|
}
|
|
602
|
+
const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
|
|
603
|
+
const config = {};
|
|
604
|
+
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
605
|
+
if (!knownProps.has(key) && value !== void 0) {
|
|
606
|
+
config[key] = value;
|
|
607
|
+
}
|
|
608
|
+
}
|
|
575
609
|
evaluators.push({
|
|
576
610
|
name,
|
|
577
611
|
type: "code",
|
|
578
612
|
script,
|
|
579
613
|
cwd,
|
|
580
614
|
resolvedCwd,
|
|
581
|
-
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
615
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
616
|
+
...Object.keys(config).length > 0 ? { config } : {}
|
|
582
617
|
});
|
|
583
618
|
continue;
|
|
584
619
|
}
|
|
@@ -753,6 +788,140 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
753
788
|
evaluators.push(config);
|
|
754
789
|
continue;
|
|
755
790
|
}
|
|
791
|
+
if (typeValue === "field_accuracy") {
|
|
792
|
+
const rawFields = rawEvaluator.fields;
|
|
793
|
+
if (!Array.isArray(rawFields)) {
|
|
794
|
+
logWarning2(
|
|
795
|
+
`Skipping field_accuracy evaluator '${name}' in '${evalId}': missing fields array`
|
|
796
|
+
);
|
|
797
|
+
continue;
|
|
798
|
+
}
|
|
799
|
+
if (rawFields.length === 0) {
|
|
800
|
+
logWarning2(
|
|
801
|
+
`Skipping field_accuracy evaluator '${name}' in '${evalId}': fields array is empty`
|
|
802
|
+
);
|
|
803
|
+
continue;
|
|
804
|
+
}
|
|
805
|
+
const fields = [];
|
|
806
|
+
for (const rawField of rawFields) {
|
|
807
|
+
if (!isJsonObject2(rawField)) {
|
|
808
|
+
logWarning2(
|
|
809
|
+
`Skipping invalid field entry in field_accuracy evaluator '${name}' (expected object)`
|
|
810
|
+
);
|
|
811
|
+
continue;
|
|
812
|
+
}
|
|
813
|
+
const fieldPath = asString2(rawField.path);
|
|
814
|
+
const match = asString2(rawField.match);
|
|
815
|
+
if (!fieldPath) {
|
|
816
|
+
logWarning2(
|
|
817
|
+
`Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
|
|
818
|
+
);
|
|
819
|
+
continue;
|
|
820
|
+
}
|
|
821
|
+
if (!match || !isValidFieldMatchType(match)) {
|
|
822
|
+
logWarning2(
|
|
823
|
+
`Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code_judge evaluator.`
|
|
824
|
+
);
|
|
825
|
+
continue;
|
|
826
|
+
}
|
|
827
|
+
const fieldConfig = {
|
|
828
|
+
path: fieldPath,
|
|
829
|
+
match,
|
|
830
|
+
...typeof rawField.required === "boolean" ? { required: rawField.required } : {},
|
|
831
|
+
...typeof rawField.weight === "number" ? { weight: rawField.weight } : {},
|
|
832
|
+
...typeof rawField.tolerance === "number" ? { tolerance: rawField.tolerance } : {},
|
|
833
|
+
...typeof rawField.relative === "boolean" ? { relative: rawField.relative } : {},
|
|
834
|
+
...Array.isArray(rawField.formats) ? { formats: rawField.formats.filter((f) => typeof f === "string") } : {}
|
|
835
|
+
};
|
|
836
|
+
fields.push(fieldConfig);
|
|
837
|
+
}
|
|
838
|
+
if (fields.length === 0) {
|
|
839
|
+
logWarning2(
|
|
840
|
+
`Skipping field_accuracy evaluator '${name}' in '${evalId}': no valid fields found`
|
|
841
|
+
);
|
|
842
|
+
continue;
|
|
843
|
+
}
|
|
844
|
+
const aggregation = asString2(rawEvaluator.aggregation);
|
|
845
|
+
const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
|
|
846
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
847
|
+
evaluators.push({
|
|
848
|
+
name,
|
|
849
|
+
type: "field_accuracy",
|
|
850
|
+
fields,
|
|
851
|
+
...validAggregation ? { aggregation: validAggregation } : {},
|
|
852
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
853
|
+
});
|
|
854
|
+
continue;
|
|
855
|
+
}
|
|
856
|
+
if (typeValue === "latency") {
|
|
857
|
+
const threshold = rawEvaluator.threshold;
|
|
858
|
+
if (typeof threshold !== "number" || threshold < 0) {
|
|
859
|
+
logWarning2(
|
|
860
|
+
`Skipping latency evaluator '${name}' in '${evalId}': threshold must be a non-negative number`
|
|
861
|
+
);
|
|
862
|
+
continue;
|
|
863
|
+
}
|
|
864
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
865
|
+
evaluators.push({
|
|
866
|
+
name,
|
|
867
|
+
type: "latency",
|
|
868
|
+
threshold,
|
|
869
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
870
|
+
});
|
|
871
|
+
continue;
|
|
872
|
+
}
|
|
873
|
+
if (typeValue === "cost") {
|
|
874
|
+
const budget = rawEvaluator.budget;
|
|
875
|
+
if (typeof budget !== "number" || budget < 0) {
|
|
876
|
+
logWarning2(
|
|
877
|
+
`Skipping cost evaluator '${name}' in '${evalId}': budget must be a non-negative number`
|
|
878
|
+
);
|
|
879
|
+
continue;
|
|
880
|
+
}
|
|
881
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
882
|
+
evaluators.push({
|
|
883
|
+
name,
|
|
884
|
+
type: "cost",
|
|
885
|
+
budget,
|
|
886
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
887
|
+
});
|
|
888
|
+
continue;
|
|
889
|
+
}
|
|
890
|
+
if (typeValue === "token_usage") {
|
|
891
|
+
const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
|
|
892
|
+
const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
|
|
893
|
+
const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
|
|
894
|
+
const limits = [
|
|
895
|
+
["max_total", maxTotal],
|
|
896
|
+
["max_input", maxInput],
|
|
897
|
+
["max_output", maxOutput]
|
|
898
|
+
];
|
|
899
|
+
const validLimits = {};
|
|
900
|
+
for (const [key, raw] of limits) {
|
|
901
|
+
if (raw === void 0) continue;
|
|
902
|
+
if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
|
|
903
|
+
logWarning2(
|
|
904
|
+
`Skipping token_usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
|
|
905
|
+
);
|
|
906
|
+
continue;
|
|
907
|
+
}
|
|
908
|
+
validLimits[key] = raw;
|
|
909
|
+
}
|
|
910
|
+
if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
|
|
911
|
+
logWarning2(
|
|
912
|
+
`Skipping token_usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
|
|
913
|
+
);
|
|
914
|
+
continue;
|
|
915
|
+
}
|
|
916
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
917
|
+
evaluators.push({
|
|
918
|
+
name,
|
|
919
|
+
type: "token_usage",
|
|
920
|
+
...validLimits,
|
|
921
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
922
|
+
});
|
|
923
|
+
continue;
|
|
924
|
+
}
|
|
756
925
|
const prompt = asString2(rawEvaluator.prompt);
|
|
757
926
|
let promptPath;
|
|
758
927
|
if (prompt) {
|
|
@@ -823,6 +992,34 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
823
992
|
/**
 * Narrow an arbitrary value to a string.
 *
 * @param {unknown} value - Candidate value.
 * @returns {string | undefined} `value` itself when it is a string primitive
 *   (including the empty string); `undefined` for everything else.
 */
function asString2(value) {
  if (typeof value === "string") {
    return value;
  }
  return void 0;
}
|
|
995
|
+
/**
 * Validate that `value` is a non-empty array of non-blank strings and return a copy.
 *
 * @param {unknown} value - Raw value to validate; `undefined` is passed through.
 * @param {string} description - Human-readable label used as the prefix of error messages.
 * @returns {string[] | undefined} A new array with the same entries, or `undefined`
 *   when `value` is `undefined`.
 * @throws {Error} When `value` is not an array, is empty, or has an entry that is
 *   not a string or is whitespace-only.
 */
function asStringArray(value, description) {
  if (value === void 0) {
    return void 0;
  }
  if (!Array.isArray(value)) {
    throw new Error(`${description} must be an array of strings (argv tokens)`);
  }
  if (value.length === 0) {
    throw new Error(`${description} cannot be empty`);
  }
  // Array.from visits every index (holes included), so entries are checked in order.
  return Array.from(value, (entry, index) => {
    if (typeof entry !== "string") {
      throw new Error(`${description}[${index}] must be a string`);
    }
    if (entry.trim().length === 0) {
      throw new Error(`${description}[${index}] cannot be empty`);
    }
    return entry;
  });
}
|
|
1017
|
+
/**
 * Wrap a shell command line in an argv that runs it through the platform shell.
 *
 * @param {string} command - Command line, passed to the shell as a single argument.
 * @returns {string[]} `["cmd.exe", "/c", command]` when `process.platform` is
 *   `"win32"`, otherwise `["sh", "-lc", command]`.
 */
function parseCommandToArgv(command) {
  const shellPrefix = process.platform === "win32" ? ["cmd.exe", "/c"] : ["sh", "-lc"];
  return [...shellPrefix, command];
}
|
|
826
1023
|
/**
 * Check whether a value is a non-null, non-array object.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` only when `typeof value` is `"object"`, the value is
 *   not `null`, and it is not an array.
 */
function isJsonObject2(value) {
  if (typeof value !== "object") {
    return false;
  }
  if (value === null) {
    return false;
  }
  return !Array.isArray(value);
}
|
|
@@ -856,6 +1053,14 @@ function validateWeight(rawWeight, evaluatorName, evalId) {
|
|
|
856
1053
|
}
|
|
857
1054
|
return rawWeight;
|
|
858
1055
|
}
|
|
1056
|
+
// Match strategies accepted for a field_accuracy field entry.
var VALID_FIELD_MATCH_TYPES = /* @__PURE__ */ new Set(["exact", "numeric_tolerance", "date"]);

/**
 * Check whether a value names a supported field match type.
 *
 * @param {unknown} value - Candidate match type.
 * @returns {boolean} `true` when `value` is a string contained in
 *   `VALID_FIELD_MATCH_TYPES`.
 */
function isValidFieldMatchType(value) {
  if (typeof value !== "string") {
    return false;
  }
  return VALID_FIELD_MATCH_TYPES.has(value);
}
|
|
1060
|
+
// Aggregation modes accepted for a field_accuracy evaluator.
var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average", "all_or_nothing"]);

/**
 * Check whether a value names a supported field aggregation type.
 *
 * @param {unknown} value - Candidate aggregation type.
 * @returns {boolean} `true` when `value` is a string contained in
 *   `VALID_FIELD_AGGREGATION_TYPES`.
 */
function isValidFieldAggregationType(value) {
  if (typeof value !== "string") {
    return false;
  }
  return VALID_FIELD_AGGREGATION_TYPES.has(value);
}
|
|
859
1064
|
|
|
860
1065
|
// src/evaluation/loaders/message-processor.ts
|
|
861
1066
|
var import_promises4 = require("fs/promises");
|
|
@@ -1930,92 +2135,993 @@ async function withRetry(fn, retryConfig, signal) {
|
|
|
1930
2135
|
throw lastError;
|
|
1931
2136
|
}
|
|
1932
2137
|
|
|
1933
|
-
// src/evaluation/providers/
|
|
2138
|
+
// src/evaluation/providers/claude-code.ts
|
|
1934
2139
|
var import_node_child_process = require("child_process");
|
|
1935
|
-
var
|
|
1936
|
-
var
|
|
1937
|
-
var
|
|
1938
|
-
var
|
|
1939
|
-
var
|
|
1940
|
-
|
|
1941
|
-
|
|
1942
|
-
|
|
1943
|
-
|
|
1944
|
-
|
|
1945
|
-
|
|
1946
|
-
|
|
1947
|
-
|
|
1948
|
-
|
|
2140
|
+
var import_node_crypto = require("crypto");
|
|
2141
|
+
var import_node_fs3 = require("fs");
|
|
2142
|
+
var import_promises8 = require("fs/promises");
|
|
2143
|
+
var import_node_os = require("os");
|
|
2144
|
+
var import_node_path9 = __toESM(require("path"), 1);
|
|
2145
|
+
|
|
2146
|
+
// src/evaluation/providers/claude-code-log-tracker.ts
|
|
2147
|
+
// Registry-wide symbols under which the log entries and the subscriber set are
// stored on globalThis, so every reader of these keys sees the same stores.
var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeCodeLogs");
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.claudeCodeLogSubscribers");

/**
 * Return the array of recorded log entries, creating and registering it on
 * globalThis the first time it is requested.
 *
 * @returns {Array<object>} The live entry array (not a copy).
 */
function getClaudeCodeLogStore() {
  const current = globalThis[GLOBAL_LOGS_KEY];
  if (!current) {
    const fresh = [];
    globalThis[GLOBAL_LOGS_KEY] = fresh;
    return fresh;
  }
  return current;
}

/**
 * Return the set of subscribed listeners, creating and registering it on
 * globalThis the first time it is requested.
 *
 * @returns {Set<Function>} The live listener set (not a copy).
 */
function getSubscriberStore() {
  const current = globalThis[GLOBAL_SUBSCRIBERS_KEY];
  if (!current) {
    const fresh = /* @__PURE__ */ new Set();
    globalThis[GLOBAL_SUBSCRIBERS_KEY] = fresh;
    return fresh;
  }
  return current;
}

/**
 * Call every subscribed listener with `entry`. A listener that throws is
 * reported with console.warn and does not stop the remaining listeners.
 *
 * @param {object} entry - Log entry handed to each listener.
 */
function notifySubscribers(entry) {
  // Iterate over a snapshot so listeners may subscribe/unsubscribe while being notified.
  const snapshot = [...getSubscriberStore()];
  snapshot.forEach((listener) => {
    try {
      listener(entry);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Claude Code log subscriber failed: ${message}`);
    }
  });
}

/**
 * Append `entry` to the shared store, then notify all subscribers.
 *
 * @param {object} entry - Log entry to record.
 */
function recordClaudeCodeLogEntry(entry) {
  const store = getClaudeCodeLogStore();
  store.push(entry);
  notifySubscribers(entry);
}

/**
 * Remove and return every entry currently held in the shared store.
 *
 * @returns {Array<object>} The drained entries in insertion order; an empty
 *   array when nothing was recorded.
 */
function consumeClaudeCodeLogEntries() {
  const store = getClaudeCodeLogStore();
  return store.length === 0 ? [] : store.splice(0, store.length);
}

/**
 * Register a listener that is called for each subsequently recorded entry.
 *
 * @param {Function} listener - Callback receiving the recorded entry.
 * @returns {Function} Function that removes the listener again.
 */
function subscribeToClaudeCodeLogEntries(listener) {
  const subscribers = getSubscriberStore();
  subscribers.add(listener);
  const unsubscribe = () => {
    subscribers.delete(listener);
  };
  return unsubscribe;
}
|
|
2198
|
+
|
|
2199
|
+
// src/evaluation/providers/preread.ts
|
|
2200
|
+
var import_node_path8 = __toESM(require("path"), 1);
|
|
2201
|
+
/**
 * Assemble the prompt text: an optional pre-read block listing guideline and
 * input files, followed by the `user_query` section holding the trimmed question.
 *
 * @param {object} request - Must provide `question`; `guideline_patterns` is used
 *   when `options.guidelinePatterns` is nullish.
 * @param {string[] | undefined} inputFiles - Files referenced by the request.
 * @param {object} [options] - Optional `guidelinePatterns` and `guidelineOverrides`.
 * @returns {string} The joined, trimmed prompt document.
 */
function buildPromptDocument(request, inputFiles, options) {
  const patterns = options?.guidelinePatterns ?? request.guideline_patterns;
  const guidelineFiles = collectGuidelineFiles(inputFiles, patterns, options?.guidelineOverrides);
  // Input files that were classified as guidelines are listed only once, as guidelines.
  const nonGuidelineInputFiles = collectInputFiles(inputFiles).filter(
    (file) => !guidelineFiles.includes(file)
  );
  const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
  const sections = prereadBlock.length > 0 ? ["\n", prereadBlock] : [];
  sections.push("\n[[ ## user_query ## ]]\n", request.question.trim());
  return sections.join("\n").trim();
}
|
|
1972
|
-
|
|
1973
|
-
|
|
1974
|
-
|
|
1975
|
-
targetName;
|
|
1976
|
-
supportsBatch = true;
|
|
1977
|
-
config;
|
|
1978
|
-
runCommand;
|
|
1979
|
-
verbose;
|
|
1980
|
-
keepTempFiles;
|
|
1981
|
-
healthcheckPromise;
|
|
1982
|
-
constructor(targetName, config, runner = defaultCommandRunner) {
|
|
1983
|
-
this.targetName = targetName;
|
|
1984
|
-
this.id = `cli:${targetName}`;
|
|
1985
|
-
this.config = config;
|
|
1986
|
-
this.runCommand = runner;
|
|
1987
|
-
this.verbose = config.verbose ?? false;
|
|
1988
|
-
this.keepTempFiles = config.keepTempFiles ?? false;
|
|
2217
|
+
/**
 * Resolve input files to absolute paths and drop duplicates, keeping first-seen order.
 *
 * @param {string[] | undefined} inputFiles - Paths, possibly relative or repeated.
 * @returns {string[] | undefined} Unique absolute paths, or `undefined` when
 *   `inputFiles` is missing or empty.
 */
function normalizeInputFiles(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return void 0;
  }
  const uniquePaths = /* @__PURE__ */ new Set();
  for (const inputFile of inputFiles) {
    uniquePaths.add(import_node_path8.default.resolve(inputFile));
  }
  return [...uniquePaths];
}
|
|
2230
|
+
/**
 * Pick the input files that count as guideline files.
 *
 * A file qualifies when its absolute path is contained in `overrides`, or when
 * `isGuidelineFile` accepts its forward-slash path for `guidelinePatterns`.
 *
 * @param {string[] | undefined} inputFiles - Candidate paths.
 * @param {unknown} guidelinePatterns - Passed through to `isGuidelineFile`.
 * @param {{ has(path: string): boolean } | undefined} overrides - Absolute paths
 *   forced to be guidelines; checked before any pattern matching.
 * @returns {string[]} Unique absolute guideline paths in first-seen order.
 */
function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
  if (!inputFiles || inputFiles.length === 0) {
    return [];
  }
  const matched = /* @__PURE__ */ new Set();
  for (const inputFile of inputFiles) {
    const absolutePath = import_node_path8.default.resolve(inputFile);
    // Short-circuit keeps isGuidelineFile from running for overridden paths.
    const qualifies = overrides?.has(absolutePath) || isGuidelineFile(
      absolutePath.split(import_node_path8.default.sep).join("/"),
      guidelinePatterns
    );
    if (qualifies) {
      matched.add(absolutePath);
    }
  }
  return [...matched];
}
|
|
2252
|
+
/**
 * Resolve input files to absolute paths and drop duplicates, keeping first-seen order.
 *
 * @param {string[] | undefined} inputFiles - Paths, possibly relative or repeated.
 * @returns {string[]} Unique absolute paths; an empty array when `inputFiles`
 *   is missing or empty.
 */
function collectInputFiles(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return [];
  }
  const uniquePaths = /* @__PURE__ */ new Set();
  for (const inputFile of inputFiles) {
    uniquePaths.add(import_node_path8.default.resolve(inputFile));
  }
  return [...uniquePaths];
}
|
|
2265
|
+
/**
 * Build the instruction block that tells the agent which files to read first.
 *
 * @param {string[]} guidelineFiles - Absolute paths of guideline files.
 * @param {string[]} inputFiles - Absolute paths of the remaining input files.
 * @returns {string} Newline-joined instructions with one markdown bullet link
 *   per file, or an empty string when both lists are empty.
 */
function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
  const hasGuidelines = guidelineFiles.length > 0;
  const hasInputs = inputFiles.length > 0;
  if (!hasGuidelines && !hasInputs) {
    return "";
  }
  // One "* [basename](file-uri)" bullet per path, joined by newlines.
  const toBulletList = (files) => files.map(
    (absolutePath) => `* [${import_node_path8.default.basename(absolutePath)}](${pathToFileUri(absolutePath)})`
  ).join("\n");
  const sections = [];
  if (hasGuidelines) {
    sections.push(`Read all guideline files:\n${toBulletList(guidelineFiles)}.`);
  }
  if (hasInputs) {
    sections.push(`Read all input files:\n${toBulletList(inputFiles)}.`);
  }
  sections.push("If any file is missing, fail with ERROR: missing-file <filename> and stop.");
  sections.push("Then apply system_instructions on the user query below.");
  return sections.join("\n");
}
|
|
2289
|
+
/**
 * Turn a filesystem path into a `file://` URI string.
 *
 * Relative paths are resolved first and backslashes become forward slashes.
 * No percent-encoding is applied to the path.
 *
 * @param {string} filePath - Absolute or relative path.
 * @returns {string} `file:///<path>` when the normalized path starts with a
 *   drive letter (`C:/…`), otherwise `file://<path>`.
 */
function pathToFileUri(filePath) {
  const pathModule = import_node_path8.default;
  let absolutePath = filePath;
  if (!pathModule.isAbsolute(filePath)) {
    absolutePath = pathModule.resolve(filePath);
  }
  const forwardSlashed = absolutePath.split("\\").join("/");
  // Drive-letter paths have no leading slash, so they need the third one added.
  const scheme = /^[a-zA-Z]:\//.test(forwardSlashed) ? "file:///" : "file://";
  return `${scheme}${forwardSlashed}`;
}
|
|
2297
|
+
|
|
2298
|
+
// src/evaluation/providers/claude-code.ts
|
|
2299
|
+
// Prefix of the temporary workspace directory created for each invocation.
var WORKSPACE_PREFIX = "agentv-claude-code-";
// File inside the workspace that receives a copy of the request question.
var PROMPT_FILENAME = "prompt.md";
// System prompt prepended to the question when the target config sets none.
var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
- Do NOT create any additional output files in the workspace.
- All intended file outputs/changes MUST be written in your response.
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
This is required for evaluation scoring.`;

/**
 * Provider that answers a request by running the Claude Code CLI once.
 *
 * Each invocation creates a temporary workspace, optionally streams the CLI
 * output to a log file, runs the CLI through the injected runner, and parses
 * its stdout into output messages and usage.
 */
var ClaudeCodeProvider = class {
  id;
  kind = "claude-code";
  targetName;
  supportsBatch = false;
  config;
  runClaudeCode;

  /**
   * @param {string} targetName - Target name; also used in `id` and log metadata.
   * @param {object} config - Target settings read by this class: `executable`,
   *   `model`, `args`, `systemPrompt`, `cwd`, `timeoutMs`, `logDir`, `logFormat`.
   * @param {Function} [runner] - Async function that executes the CLI; defaults
   *   to `defaultClaudeCodeRunner`.
   */
  constructor(targetName, config, runner = defaultClaudeCodeRunner) {
    this.id = `claude-code:${targetName}`;
    this.targetName = targetName;
    this.config = config;
    this.runClaudeCode = runner;
  }

  /**
   * Run the CLI for one request and return the parsed response.
   *
   * @param {object} request - Reads `question`, `inputFiles`, `signal`,
   *   `evalCaseId` and `attempt`.
   * @returns {Promise<object>} `{ raw, outputMessages, usage }`. Note that
   *   `raw.workspace` and `raw.promptFile` point into the temporary workspace,
   *   which has already been removed when the promise settles.
   * @throws {Error} When the request is already aborted, the CLI times out,
   *   or the CLI exits with a non-zero code.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("Claude Code request was aborted before execution");
    }
    const inputFiles = normalizeInputFiles(request.inputFiles);
    const workspaceRoot = await this.createWorkspace();
    // Stream logging is best-effort: any failure to set it up disables it.
    const logger = await this.createStreamLogger(request).catch(() => void 0);
    try {
      const promptFile = import_node_path9.default.join(workspaceRoot, PROMPT_FILENAME);
      await (0, import_promises8.writeFile)(promptFile, request.question, "utf8");
      const args = this.buildClaudeCodeArgs(request.question, inputFiles);
      const cwd = this.resolveCwd();
      const result = await this.executeClaudeCode(args, cwd, request.signal, logger);
      if (result.timedOut) {
        throw new Error(
          `Claude Code CLI timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
        );
      }
      if (result.exitCode !== 0) {
        const detail = pickDetail(result.stderr, result.stdout);
        const prefix = `Claude Code CLI exited with code ${result.exitCode}`;
        if (isNestedClaudeCodeAuthError(result.stdout)) {
          throw new Error(
            `${prefix}: Claude Code detected a nested session and requires API key authentication. Set ANTHROPIC_API_KEY environment variable or run AgentV outside of a Claude Code session.`
          );
        }
        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
      }
      const parsed = parseClaudeCodeJsonl(result.stdout);
      const outputMessages = extractOutputMessages(parsed);
      const usage = extractUsage(parsed);
      return {
        raw: {
          response: parsed,
          stdout: result.stdout,
          stderr: result.stderr,
          exitCode: result.exitCode,
          args,
          executable: this.config.executable,
          promptFile,
          workspace: workspaceRoot,
          inputFiles,
          logFile: logger?.filePath
        },
        outputMessages,
        usage
      };
    } finally {
      // A failing log close must neither skip workspace cleanup nor replace
      // the real outcome of the invocation, so it is downgraded to a warning.
      try {
        await logger?.close();
      } catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.warn(`Failed to close Claude Code stream log: ${message}`);
      } finally {
        await this.cleanupWorkspace(workspaceRoot);
      }
    }
  }

  /**
   * @returns {string} The resolved `config.cwd`, or the process working
   *   directory when no cwd is configured.
   */
  resolveCwd() {
    if (!this.config.cwd) {
      return process.cwd();
    }
    return import_node_path9.default.resolve(this.config.cwd);
  }

  /**
   * Build the CLI argument list; the final element is the full prompt.
   *
   * @param {string} prompt - The request question.
   * @param {string[] | undefined} inputFiles - Paths appended to the prompt
   *   under an "## Input Files" heading.
   * @returns {string[]} Arguments in the order: output format flags, `-p`,
   *   optional `--model`, configured extra args, prompt text.
   */
  buildClaudeCodeArgs(prompt, inputFiles) {
    const args = [];
    args.push("--output-format", "stream-json");
    args.push("--verbose");
    args.push("-p");
    if (this.config.model) {
      args.push("--model", this.config.model);
    }
    if (this.config.args && this.config.args.length > 0) {
      args.push(...this.config.args);
    }
    const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
    const fullPrompt = `${systemPrompt}

${prompt}`;
    let finalPrompt = fullPrompt;
    if (inputFiles && inputFiles.length > 0) {
      const filesContext = inputFiles.map((f) => `[File: ${f}]`).join("\n");
      finalPrompt = `${fullPrompt}

## Input Files
${filesContext}`;
    }
    args.push(finalPrompt);
    return args;
  }

  /**
   * @returns {object} A copy of `process.env` with `CLAUDECODE` and
   *   `CLAUDE_CODE_ENTRYPOINT` set to `undefined`.
   */
  buildEnv() {
    const env = { ...process.env };
    // NOTE(review): presumably cleared so the child CLI does not see itself as
    // nested inside another Claude Code session — confirm against the runner.
    env.CLAUDECODE = void 0;
    env.CLAUDE_CODE_ENTRYPOINT = void 0;
    return env;
  }

  /**
   * Run the CLI through the injected runner.
   *
   * @param {string[]} args - CLI arguments.
   * @param {string} cwd - Working directory for the CLI.
   * @param {AbortSignal | undefined} signal - Forwarded to the runner.
   * @param {object | undefined} logger - Receives stdout/stderr chunks when present.
   * @returns {Promise<object>} The runner's result.
   * @throws {Error} A descriptive error when the runner fails with code
   *   `ENOENT`; any other runner error is rethrown unchanged.
   */
  async executeClaudeCode(args, cwd, signal, logger) {
    try {
      return await this.runClaudeCode({
        executable: this.config.executable,
        args,
        cwd,
        timeoutMs: this.config.timeoutMs,
        env: this.buildEnv(),
        signal,
        onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
        onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
      });
    } catch (error) {
      // Optional chaining: a runner may reject with a non-object value.
      if (error?.code === "ENOENT") {
        throw new Error(
          `Claude Code executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`,
          { cause: error }
        );
      }
      throw error;
    }
  }

  /** @returns {Promise<string>} Path of a new temporary directory under the OS temp dir. */
  async createWorkspace() {
    return await (0, import_promises8.mkdtemp)(import_node_path9.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
  }

  /**
   * Remove the workspace directory recursively. Failures are ignored on
   * purpose: cleanup is best-effort and must not fail the invocation.
   *
   * @param {string} workspaceRoot - Directory returned by `createWorkspace`.
   */
  async cleanupWorkspace(workspaceRoot) {
    try {
      await (0, import_promises8.rm)(workspaceRoot, { recursive: true, force: true });
    } catch {
    }
  }

  /**
   * @returns {string | undefined} Directory for stream logs: `undefined` when
   *   streaming is disabled, the resolved `config.logDir` when set, otherwise
   *   `.agentv/logs/claude-code` under the process working directory.
   */
  resolveLogDirectory() {
    const disabled = isClaudeCodeLogStreamingDisabled();
    if (disabled) {
      return void 0;
    }
    if (this.config.logDir) {
      return import_node_path9.default.resolve(this.config.logDir);
    }
    return import_node_path9.default.join(process.cwd(), ".agentv", "logs", "claude-code");
  }

  /**
   * Create the stream logger for a request and record its log entry.
   *
   * @param {object} request - Reads `evalCaseId` and `attempt`.
   * @returns {Promise<object | undefined>} The logger, or `undefined` when
   *   logging is disabled or could not be set up (a warning is printed).
   */
  async createStreamLogger(request) {
    const logDir = this.resolveLogDirectory();
    if (!logDir) {
      return void 0;
    }
    try {
      await (0, import_promises8.mkdir)(logDir, { recursive: true });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Claude Code stream logging (could not create ${logDir}): ${message}`);
      return void 0;
    }
    const filePath = import_node_path9.default.join(logDir, buildLogFilename(request, this.targetName));
    try {
      const logger = await ClaudeCodeStreamLogger.create({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt,
        format: this.config.logFormat ?? "summary"
      });
      recordClaudeCodeLogEntry({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt
      });
      return logger;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Claude Code stream logging for ${filePath}: ${message}`);
      return void 0;
    }
  }
};
|
|
2485
|
+
/**
 * Appends a timestamped transcript of a Claude Code CLI run to a log file.
 *
 * Raw stdout/stderr chunks are buffered until a full line is available; each
 * non-empty line is formatted (summary or pretty-printed JSON, depending on
 * `format`) and written with an elapsed-time prefix.
 *
 * Logging is best-effort: a failure of the underlying write stream is recorded
 * in `streamError`, reported once via `console.warn`, and further writes are
 * skipped instead of raising an unhandled `'error'` event.
 */
var ClaudeCodeStreamLogger = class _ClaudeCodeStreamLogger {
  filePath;
  stream;
  startedAt = Date.now();
  stdoutBuffer = "";
  stderrBuffer = "";
  format;
  // First error emitted by the write stream, if any.
  streamError = void 0;
  /**
   * @param {string} filePath - Log file to append to.
   * @param {string} format - "json" for pretty-printed events, anything else for summaries.
   */
  constructor(filePath, format) {
    this.filePath = filePath;
    this.format = format;
    this.stream = (0, import_node_fs3.createWriteStream)(filePath, { flags: "a" });
    // Stream errors (failed open, failed write) are emitted asynchronously;
    // without a listener they would surface as an uncaught exception.
    this.stream.on("error", (error) => {
      if (this.streamError !== void 0) {
        return;
      }
      this.streamError = error;
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Claude Code stream logging failed for ${this.filePath}: ${message}`);
    });
  }
  /**
   * Creates a logger and writes the header block followed by a blank separator line.
   *
   * @param {{filePath: string, format: string, targetName: string, evalCaseId?: string, attempt?: number}} options
   * @returns {Promise<_ClaudeCodeStreamLogger>}
   */
  static async create(options) {
    const logger = new _ClaudeCodeStreamLogger(options.filePath, options.format);
    const header = [
      "# Claude Code CLI stream log",
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
      ""
      // Drop only the omitted optional entries; the trailing "" must survive
      // so the header is separated from the log body by a blank line.
    ].filter((line) => line !== void 0);
    logger.writeLines(header);
    return logger;
  }
  /** Buffers a stdout chunk and writes every completed line. */
  handleStdoutChunk(chunk) {
    this.stdoutBuffer += chunk;
    this.flushBuffer("stdout");
  }
  /** Buffers a stderr chunk and writes every completed line. */
  handleStderrChunk(chunk) {
    this.stderrBuffer += chunk;
    this.flushBuffer("stderr");
  }
  /**
   * Flushes all buffered text (including unterminated trailing lines) and
   * ends the stream. Rejects if the stream errors while closing.
   */
  async close() {
    this.flushBuffer("stdout");
    this.flushBuffer("stderr");
    this.flushRemainder();
    await new Promise((resolve, reject) => {
      this.stream.once("error", reject);
      this.stream.end(() => resolve());
    });
  }
  /** Writes each entry of `lines` verbatim, newline-terminated. */
  writeLines(lines) {
    if (this.streamError !== void 0) {
      return;
    }
    for (const line of lines) {
      this.stream.write(`${line}\n`);
    }
  }
  /** Writes all complete lines of the given source's buffer, keeping the unterminated tail buffered. */
  flushBuffer(source) {
    const buffer = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
    const lines = buffer.split(/\r?\n/);
    const remainder = lines.pop() ?? "";
    if (source === "stdout") {
      this.stdoutBuffer = remainder;
    } else {
      this.stderrBuffer = remainder;
    }
    for (const line of lines) {
      this.writeFormatted(line, source);
    }
  }
  /**
   * Formats one raw line with elapsed time and source tag.
   *
   * @returns {string | undefined} The formatted line, or undefined for blank input.
   */
  formatLine(rawLine, source) {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return void 0;
    }
    const message = this.format === "json" ? formatClaudeCodeJsonLog(trimmed) : formatClaudeCodeLogMessage(trimmed, source);
    return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`;
  }
  /** Writes whatever unterminated text is left in both buffers, then clears them. */
  flushRemainder() {
    this.writeFormatted(this.stdoutBuffer, "stdout");
    this.writeFormatted(this.stderrBuffer, "stderr");
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
  }
  /** Formats `rawLine` and appends it to the stream; blank lines and post-error writes are skipped. */
  writeFormatted(rawLine, source) {
    if (this.streamError !== void 0) {
      return;
    }
    const formatted = this.formatLine(rawLine, source);
    if (formatted) {
      this.stream.write(formatted);
      this.stream.write("\n");
    }
  }
};
|
|
2579
|
+
/**
 * Reports whether stream logging has been switched off through the
 * AGENTV_CLAUDE_CODE_STREAM_LOGS environment variable.
 *
 * @returns {boolean} true only when the variable is set to "false", "0" or
 *   "off" (case-insensitive, surrounding whitespace ignored).
 */
function isClaudeCodeLogStreamingDisabled() {
  const raw = process.env.AGENTV_CLAUDE_CODE_STREAM_LOGS;
  if (!raw) {
    return false;
  }
  const disablingValues = ["false", "0", "off"];
  return disablingValues.includes(raw.trim().toLowerCase());
}
|
|
2587
|
+
/**
 * Builds a unique log file name of the form
 * `<timestamp>_<target>_<evalId>[_attempt-N]_<8 hex chars>.log`.
 *
 * @param {{evalCaseId?: string, attempt?: number}} request - Attempt is zero-based; the name shows it one-based.
 * @param {string} targetName
 * @returns {string}
 */
function buildLogFilename(request, targetName) {
  // ISO timestamps contain ":" and ".", which are replaced with "-".
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
  const evalPart = sanitizeForFilename(request.evalCaseId ?? "claude-code");
  let attemptPart = "";
  if (request.attempt !== void 0) {
    attemptPart = `_attempt-${request.attempt + 1}`;
  }
  const targetPart = sanitizeForFilename(targetName);
  const uniqueSuffix = (0, import_node_crypto.randomUUID)().slice(0, 8);
  return `${stamp}_${targetPart}_${evalPart}${attemptPart}_${uniqueSuffix}.log`;
}
|
|
2594
|
+
/**
 * Collapses every run of characters outside `A-Za-z0-9._-` into a single "_".
 *
 * @param {string} value
 * @returns {string} The sanitized text, or "claude-code" when the result is empty.
 */
function sanitizeForFilename(value) {
  const cleaned = value.replace(/[^A-Za-z0-9._-]+/g, "_");
  if (cleaned.length === 0) {
    return "claude-code";
  }
  return cleaned;
}
|
|
2598
|
+
/**
 * Renders the whole seconds elapsed since `startedAt` as `MM:SS`, or as
 * `HH:MM:SS` once at least one hour has passed.
 *
 * @param {number} startedAt - Epoch milliseconds, as returned by Date.now().
 * @returns {string}
 */
function formatElapsed(startedAt) {
  const twoDigits = (value) => value.toString().padStart(2, "0");
  const totalSeconds = Math.floor((Date.now() - startedAt) / 1e3);
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor(totalSeconds % 3600 / 60);
  const seconds = totalSeconds % 60;
  const minuteSecond = `${twoDigits(minutes)}:${twoDigits(seconds)}`;
  return hours > 0 ? `${twoDigits(hours)}:${minuteSecond}` : minuteSecond;
}
|
|
2608
|
+
/**
 * Produces the summary-format text for one log line.
 *
 * When the line parses as JSON and can be summarized, the summary is used.
 * Otherwise the raw line is returned, prefixed with "stderr: " for stderr.
 *
 * @param {string} rawLine
 * @param {string} source - "stdout" or "stderr".
 * @returns {string}
 */
function formatClaudeCodeLogMessage(rawLine, source) {
  const event = tryParseJsonValue(rawLine);
  const summary = event ? summarizeClaudeCodeEvent(event) : void 0;
  if (summary) {
    return summary;
  }
  return source === "stderr" ? `stderr: ${rawLine}` : rawLine;
}
|
|
2621
|
+
/**
 * Pretty-prints a JSON log line with two-space indentation.
 *
 * @param {string} rawLine
 * @returns {string} The indented JSON, or `rawLine` unchanged when it does not
 *   parse to a truthy value or cannot be re-serialized.
 */
function formatClaudeCodeJsonLog(rawLine) {
  const value = tryParseJsonValue(rawLine);
  if (value) {
    try {
      return JSON.stringify(value, null, 2);
    } catch {
      return rawLine;
    }
  }
  return rawLine;
}
|
|
2632
|
+
/**
 * Builds a one-line summary of a parsed Claude Code stream event.
 *
 * - "system"    -> "system: init"
 * - "assistant" -> tool name or a 50-character text preview of the first content part
 * - "user"      -> tool_use_id of the first content part when it is a tool_result
 * - "result"    -> cost and duration when both are numeric
 * - any other string type is returned as-is
 *
 * The result cost is read from `cost_usd`, falling back to `total_cost_usd`,
 * so events using either field name get the cost/duration summary.
 *
 * @param {unknown} event - Parsed JSON value.
 * @returns {string | undefined} undefined when `event` is not an object with a string `type`.
 */
function summarizeClaudeCodeEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const record = event;
  const type = typeof record.type === "string" ? record.type : void 0;
  if (!type) {
    return void 0;
  }
  // Only the first content part is inspected for assistant/user summaries.
  const content = record.message ? record.message.content : void 0;
  const first = Array.isArray(content) && content.length > 0 ? content[0] : void 0;
  switch (type) {
    case "system":
      return "system: init";
    case "assistant": {
      if (first?.type === "tool_use") {
        return `assistant: tool_use (${first.name})`;
      }
      if (first?.type === "text" && typeof first.text === "string") {
        const text = first.text;
        const preview = text.length > 50 ? `${text.slice(0, 50)}...` : text;
        return `assistant: ${preview}`;
      }
      return "assistant";
    }
    case "user": {
      if (first?.type === "tool_result") {
        return `user: tool_result (${first.tool_use_id})`;
      }
      return "user";
    }
    case "result": {
      const cost = typeof record.cost_usd === "number" ? record.cost_usd : record.total_cost_usd;
      const duration = record.duration_ms;
      if (typeof cost === "number" && typeof duration === "number") {
        return `result: $${cost.toFixed(4)}, ${Math.round(duration)}ms`;
      }
      return "result";
    }
    default:
      return type;
  }
}
|
|
2689
|
+
/**
 * Parses `rawLine` as JSON without throwing.
 *
 * @param {string} rawLine
 * @returns {unknown} The parsed value, or undefined when the text is not valid JSON.
 */
function tryParseJsonValue(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    value = void 0;
  }
  return value;
}
|
|
2696
|
+
/**
 * Parses newline-delimited JSON emitted by the Claude Code CLI.
 *
 * Blank lines and lines that are not valid JSON are skipped on purpose, so
 * stray non-JSON output does not discard the whole run.
 *
 * @param {string} output - Raw CLI stdout.
 * @returns {unknown[]} Parsed events in input order.
 * @throws {Error} When the output is empty or contains no valid JSON line.
 */
function parseClaudeCodeJsonl(output) {
  const body = output.trim();
  if (body.length === 0) {
    throw new Error("Claude Code CLI produced no output");
  }
  const events = [];
  for (const rawLine of body.split(/\r?\n/)) {
    const candidate = rawLine.trim();
    if (candidate.length === 0) {
      continue;
    }
    try {
      events.push(JSON.parse(candidate));
    } catch {
      // Not JSON: ignore this line and keep going.
    }
  }
  if (events.length === 0) {
    throw new Error("Claude Code CLI produced no valid JSON output");
  }
  return events;
}
|
|
2714
|
+
/**
 * Converts the "assistant" and "user" events of a Claude Code run into
 * output messages, preserving event order.
 *
 * @param {unknown[]} events - Parsed stream events.
 * @returns {object[]} One converted message per assistant/user event that carries a `message`.
 */
function extractOutputMessages(events) {
  const collected = [];
  for (const event of events) {
    const isRecord = Boolean(event) && typeof event === "object";
    if (!isRecord) {
      continue;
    }
    const { type, message } = event;
    const isConversational = type === "assistant" || type === "user";
    if (!isConversational || !message) {
      continue;
    }
    const converted = convertClaudeCodeMessage(message, type);
    if (converted) {
      collected.push(converted);
    }
  }
  return collected;
}
|
|
2734
|
+
/**
 * Maps a Claude Code message payload to `{ role, content, toolCalls }`.
 *
 * @param {{content?: unknown}} message - The event's `message` field.
 * @param {string} type - Event type; "assistant" yields role "assistant", anything else "user".
 * @returns {{role: string, content: string | undefined, toolCalls: object[] | undefined}}
 *   `toolCalls` is undefined when the message has none.
 */
function convertClaudeCodeMessage(message, type) {
  const calls = extractToolCalls(message.content);
  return {
    role: type === "assistant" ? "assistant" : "user",
    content: extractTextContent(message.content),
    toolCalls: calls.length > 0 ? calls : void 0
  };
}
|
|
2744
|
+
/**
 * Extracts the plain text of a message's content.
 *
 * @param {unknown} content - A string, or an array of content parts.
 * @returns {string | undefined} The string itself; or the `text` of all
 *   `{type: "text"}` parts joined with newlines; or undefined when there is no text.
 */
function extractTextContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const texts = content
    .filter((part) => part && typeof part === "object" && part.type === "text" && typeof part.text === "string")
    .map((part) => part.text);
  return texts.length > 0 ? texts.join("\n") : void 0;
}
|
|
2763
|
+
/**
 * Collects tool activity from a message's content parts.
 *
 * `tool_use` parts with a string `name` become `{ tool, input, id }`;
 * `tool_result` parts with a string `tool_use_id` become
 * `{ tool: "tool_result", output, id }`. All other parts are ignored.
 *
 * @param {unknown} content
 * @returns {object[]} Tool call entries in content order; empty when `content` is not an array.
 */
function extractToolCalls(content) {
  if (!Array.isArray(content)) {
    return [];
  }
  const calls = [];
  for (const part of content) {
    if (!part || typeof part !== "object") {
      continue;
    }
    switch (part.type) {
      case "tool_use":
        if (typeof part.name === "string") {
          calls.push({
            tool: part.name,
            input: part.input,
            id: typeof part.id === "string" ? part.id : void 0
          });
        }
        break;
      case "tool_result":
        if (typeof part.tool_use_id === "string") {
          calls.push({
            tool: "tool_result",
            output: part.content,
            id: part.tool_use_id
          });
        }
        break;
      default:
        break;
    }
  }
  return calls;
}
|
|
2790
|
+
/**
 * Extracts cost, timing and token metrics from the last "result" event.
 *
 * Each metric is read from its flat field on the event. As a fallback, cost
 * is also read from `total_cost_usd`, and token counts from a nested
 * `usage` object, so result events using either layout are understood. The
 * returned keys are the same in both cases.
 *
 * @param {unknown[]} events - Parsed stream events.
 * @returns {{cost_usd?: number, duration_ms?: number, duration_api_ms?: number,
 *   input_tokens?: number, output_tokens?: number, session_id?: string} | undefined}
 *   undefined when there is no result event or it carries none of these fields.
 */
function extractUsage(events) {
  // First numeric candidate wins; undefined when none is a number.
  const firstNumber = (...candidates) => candidates.find((value) => typeof value === "number");
  for (let i = events.length - 1; i >= 0; i--) {
    const event = events[i];
    if (!event || typeof event !== "object") {
      continue;
    }
    const record = event;
    if (record.type !== "result") {
      continue;
    }
    const nested = record.usage && typeof record.usage === "object" ? record.usage : {};
    const usage = {};
    const cost = firstNumber(record.cost_usd, record.total_cost_usd);
    if (cost !== void 0) {
      usage.cost_usd = cost;
    }
    if (typeof record.duration_ms === "number") {
      usage.duration_ms = record.duration_ms;
    }
    if (typeof record.duration_api_ms === "number") {
      usage.duration_api_ms = record.duration_api_ms;
    }
    const inputTokens = firstNumber(record.input_tokens, nested.input_tokens);
    if (inputTokens !== void 0) {
      usage.input_tokens = inputTokens;
    }
    const outputTokens = firstNumber(record.output_tokens, nested.output_tokens);
    if (outputTokens !== void 0) {
      usage.output_tokens = outputTokens;
    }
    if (typeof record.session_id === "string") {
      usage.session_id = record.session_id;
    }
    // Only the most recent result event is considered, even if it is empty.
    return Object.keys(usage).length > 0 ? usage : void 0;
  }
  return void 0;
}
|
|
2823
|
+
/**
 * Chooses the text to attach to an error message: trimmed stderr when it has
 * content, otherwise trimmed stdout.
 *
 * @param {string} stderr
 * @param {string} stdout
 * @returns {string | undefined} undefined when both streams are blank.
 */
function pickDetail(stderr, stdout) {
  for (const candidate of [stderr.trim(), stdout.trim()]) {
    if (candidate.length > 0) {
      return candidate;
    }
  }
  return void 0;
}
|
|
2831
|
+
/**
 * Formats a timeout as a message suffix such as " after 30s".
 *
 * @param {number | undefined} timeoutMs
 * @returns {string} Empty when the timeout is missing, zero or negative;
 *   otherwise the timeout rounded up to whole seconds.
 */
function formatTimeoutSuffix(timeoutMs) {
  const noTimeout = !timeoutMs || timeoutMs <= 0;
  return noTimeout ? "" : ` after ${Math.ceil(timeoutMs / 1e3)}s`;
}
|
|
2838
|
+
/**
 * Scans JSONL stdout for the combination of (a) a "system" event whose
 * `apiKeySource` is "ANTHROPIC_API_KEY" and (b) either an event with
 * `error === "authentication_failed"` or a "result" event flagged `is_error`.
 *
 * Lines that are blank, not JSON, or not objects are ignored.
 *
 * @param {string} stdout - Raw CLI stdout.
 * @returns {boolean} true only when both conditions were seen; false on any unexpected failure.
 */
function isNestedClaudeCodeAuthError(stdout) {
  try {
    let sawApiKeySource = false;
    let sawAuthFailure = false;
    for (const rawLine of stdout.split("\n")) {
      const candidate = rawLine.trim();
      if (candidate.length === 0) {
        continue;
      }
      try {
        const event = JSON.parse(candidate);
        const fromApiKey = event.type === "system" && event.apiKeySource === "ANTHROPIC_API_KEY";
        const failed = event.error === "authentication_failed" || event.type === "result" && event.is_error;
        if (fromApiKey) {
          sawApiKeySource = true;
        }
        if (failed) {
          sawAuthFailure = true;
        }
      } catch {
        // Unparseable or non-object line: skip it.
      }
    }
    return sawApiKeySource && sawAuthFailure;
  } catch {
    return false;
  }
}
|
|
2862
|
+
/**
 * Wraps `arg` in single quotes for use in a POSIX shell command, rewriting
 * each embedded single quote as `'\''`.
 *
 * @param {string} arg
 * @returns {string}
 */
function escapeShellArg(arg) {
  const escaped = arg.replace(/'/g, "'\\''");
  return `'${escaped}'`;
}
|
|
2865
|
+
/**
 * Runs the Claude Code CLI using four uniquely named scratch files in the OS
 * temp directory (stdout, stderr, exit code, pid) and removes them afterwards
 * whether the run succeeds or throws.
 *
 * @param {object} options - Passed through to runClaudeCodeWithTempFiles.
 * @returns {Promise<object>} Whatever runClaudeCodeWithTempFiles resolves to.
 */
async function defaultClaudeCodeRunner(options) {
  const tempId = (0, import_node_crypto.randomUUID)();
  const scratchPath = (suffix) => import_node_path9.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-${suffix}`);
  const stdoutFile = scratchPath("stdout");
  const stderrFile = scratchPath("stderr");
  const exitFile = scratchPath("exit");
  const pidFile = scratchPath("pid");
  try {
    return await runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile);
  } finally {
    const scratchFiles = [stdoutFile, stderrFile, exitFile, pidFile];
    for (const file of scratchFiles) {
      try {
        await (0, import_promises8.rm)(file, { force: true });
      } catch {
        // Cleanup is best-effort; a leftover temp file must not mask the run's outcome.
      }
    }
  }
}
|
|
2882
|
+
/**
 * Launches the Claude Code CLI in a detached bash session and follows it by
 * polling files rather than pipes: the child's stdout/stderr are redirected
 * to `stdoutFile`/`stderrFile`, its pid is written to `pidFile`, and its exit
 * status to `exitFile` once it finishes.
 *
 * Behavior notes:
 * - If the signal is already aborted, nothing is spawned.
 * - On timeout or abort the recorded pid is sent SIGTERM, polling stops, and
 *   the exit code is -1 unless the exit file had already been written.
 * - `onStdoutChunk` receives newly appended stdout text on each poll;
 *   `onStderrChunk` receives the complete stderr once, at the end.
 *
 * @param {{executable: string, args: string[], cwd?: string, env?: object,
 *   timeoutMs?: number, signal?: AbortSignal,
 *   onStdoutChunk?: (chunk: string) => void,
 *   onStderrChunk?: (chunk: string) => void}} options
 *   `executable` is split on whitespace, so it may carry leading arguments.
 * @param {string} stdoutFile
 * @param {string} stderrFile
 * @param {string} exitFile
 * @param {string} pidFile
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number, timedOut: boolean}>}
 * @throws {Error} When the wrapper process itself cannot be started.
 */
async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile) {
  // Resolve the fs helpers once instead of re-importing on every poll.
  const { access, readFile } = await import("fs/promises");
  // Check for cancellation BEFORE spawning: right after spawn the pid file
  // does not exist yet, so an already-aborted run could not be killed.
  if (options.signal?.aborted) {
    return { stdout: "", stderr: "Aborted", exitCode: -1, timedOut: false };
  }
  const parts = options.executable.split(/\s+/);
  const executable = parts[0];
  const executableArgs = parts.slice(1);
  const allArgs = [...executableArgs, ...options.args];
  const escapedArgs = allArgs.map((arg) => escapeShellArg(arg));
  const fullCommand = [escapeShellArg(executable), ...escapedArgs].join(" ");
  const bashScript = `
unset CLAUDECODE CLAUDE_CODE_ENTRYPOINT 2>/dev/null
${fullCommand} >${escapeShellArg(stdoutFile)} 2>${escapeShellArg(stderrFile)} &
CHILD_PID=$!
echo $CHILD_PID > ${escapeShellArg(pidFile)}
wait $CHILD_PID
echo $? > ${escapeShellArg(exitFile)}
`;
  const child = (0, import_node_child_process.spawn)("setsid", ["bash", "-c", bashScript], {
    cwd: options.cwd,
    env: options.env,
    detached: true,
    stdio: "ignore"
  });
  // A failed spawn (e.g. `setsid` or `bash` not on PATH) is reported through
  // an 'error' event; without a listener it becomes an uncaught exception.
  let spawnError;
  child.once("error", (error) => {
    spawnError = error;
  });
  child.unref();
  const pollInterval = 100;
  const startTime = Date.now();
  let timedOut = false;
  let lastStdoutSize = 0;
  // Missing/unreadable files read as "" because they may not exist yet.
  const readFileIfExists = async (filePath) => {
    try {
      return await readFile(filePath, "utf8");
    } catch {
      return "";
    }
  };
  const fileExists4 = async (filePath) => {
    try {
      await access(filePath);
      return true;
    } catch {
      return false;
    }
  };
  // Best-effort SIGTERM to the pid recorded by the wrapper script.
  const killProcess = async () => {
    try {
      const pid = await readFileIfExists(pidFile);
      if (pid.trim()) {
        process.kill(Number.parseInt(pid.trim(), 10), "SIGTERM");
      }
    } catch {
      // The process may already be gone; nothing more to do.
    }
  };
  const abortHandler = () => {
    killProcess().catch(() => {
    });
  };
  options.signal?.addEventListener("abort", abortHandler, { once: true });
  try {
    while (true) {
      if (spawnError) {
        const reason = spawnError instanceof Error ? spawnError.message : String(spawnError);
        throw new Error(`Failed to start Claude Code CLI: ${reason}`, { cause: spawnError });
      }
      if (options.timeoutMs && Date.now() - startTime > options.timeoutMs) {
        timedOut = true;
        await killProcess();
        break;
      }
      if (options.signal?.aborted) {
        await killProcess();
        break;
      }
      if (options.onStdoutChunk) {
        const currentStdout = await readFileIfExists(stdoutFile);
        if (currentStdout.length > lastStdoutSize) {
          options.onStdoutChunk(currentStdout.slice(lastStdoutSize));
          lastStdoutSize = currentStdout.length;
        }
      }
      // The exit file is written last by the wrapper, so its presence means the run is over.
      if (await fileExists4(exitFile)) {
        break;
      }
      await new Promise((resolve) => setTimeout(resolve, pollInterval));
    }
    const stdout = await readFileIfExists(stdoutFile);
    const stderr = await readFileIfExists(stderrFile);
    const exitCodeStr = await readFileIfExists(exitFile);
    const exitCode = exitCodeStr.trim() ? Number.parseInt(exitCodeStr.trim(), 10) : -1;
    // Deliver any stdout written between the last poll and process exit.
    if (options.onStdoutChunk && stdout.length > lastStdoutSize) {
      options.onStdoutChunk(stdout.slice(lastStdoutSize));
    }
    if (options.onStderrChunk && stderr) {
      options.onStderrChunk(stderr);
    }
    return { stdout, stderr, exitCode, timedOut };
  } finally {
    options.signal?.removeEventListener("abort", abortHandler);
  }
}
|
|
2981
|
+
|
|
2982
|
+
// src/evaluation/providers/cli.ts
|
|
2983
|
+
var import_node_child_process2 = require("child_process");
|
|
2984
|
+
var import_promises9 = __toESM(require("fs/promises"), 1);
|
|
2985
|
+
var import_node_os2 = __toESM(require("os"), 1);
|
|
2986
|
+
var import_node_path10 = __toESM(require("path"), 1);
|
|
2987
|
+
var import_node_util = require("util");
|
|
2988
|
+
var import_zod = require("zod");
|
|
2989
|
+
// Zod schemas describing the JSON a CLI target may write to its output file.
// Field names are snake_case as they appear in that JSON.

// One tool invocation recorded inside an output message; only `tool` is required.
var ToolCallSchema = import_zod.z.object({
tool: import_zod.z.string(),
input: import_zod.z.unknown().optional(),
output: import_zod.z.unknown().optional(),
id: import_zod.z.string().optional(),
timestamp: import_zod.z.string().optional()
});
// One message of the conversation trace; only `role` is required.
// `tool_calls` is mapped to `toolCalls` by convertOutputMessages.
var OutputMessageInputSchema = import_zod.z.object({
role: import_zod.z.string(),
name: import_zod.z.string().optional(),
content: import_zod.z.unknown().optional(),
tool_calls: import_zod.z.array(ToolCallSchema).optional(),
timestamp: import_zod.z.string().optional(),
metadata: import_zod.z.record(import_zod.z.unknown()).optional()
});
// Token counts; `input` and `output` are required when the object is present.
var TokenUsageSchema = import_zod.z.object({
input: import_zod.z.number(),
output: import_zod.z.number(),
cached: import_zod.z.number().optional()
});
// Top-level shape of a single CLI response; every field is optional.
var CliOutputSchema = import_zod.z.object({
text: import_zod.z.unknown().optional(),
output_messages: import_zod.z.array(OutputMessageInputSchema).optional(),
token_usage: TokenUsageSchema.optional(),
cost_usd: import_zod.z.number().optional(),
duration_ms: import_zod.z.number().optional()
});
// One line of batch (JSONL) output: a CLI response plus a non-empty string `id`.
var CliJsonlRecordSchema = CliOutputSchema.extend({
id: import_zod.z.string().min(1)
});
|
|
3019
|
+
/**
 * Discards negative cost/duration metrics, logging a warning for each one dropped.
 *
 * @param {number | undefined} costUsd
 * @param {number | undefined} durationMs
 * @param {string} context - Included in the warning text.
 * @returns {{costUsd: number | undefined, durationMs: number | undefined}}
 *   The inputs, with any negative value replaced by undefined.
 */
function validateMetrics(costUsd, durationMs, context) {
  const isNegative = (value) => value !== void 0 && value < 0;
  const dropCost = isNegative(costUsd);
  if (dropCost) {
    console.warn(`[cli-provider] ${context}: ignoring negative cost_usd value (${costUsd})`);
  }
  const dropDuration = isNegative(durationMs);
  if (dropDuration) {
    console.warn(`[cli-provider] ${context}: ignoring negative duration_ms value (${durationMs})`);
  }
  return {
    costUsd: dropCost ? void 0 : costUsd,
    durationMs: dropDuration ? void 0 : durationMs
  };
}
|
|
3032
|
+
/**
 * Converts snake_case output messages into the camelCase message shape by
 * renaming `tool_calls` to `toolCalls`; all other fields are copied as-is.
 *
 * @param {object[] | undefined} messages
 * @returns {object[] | undefined} undefined when `messages` is missing or empty.
 */
function convertOutputMessages(messages) {
  const hasMessages = Boolean(messages) && messages.length !== 0;
  if (!hasMessages) {
    return void 0;
  }
  return messages.map((msg) => {
    const { role, name, content, timestamp, metadata } = msg;
    return { role, name, content, toolCalls: msg.tool_calls, timestamp, metadata };
  });
}
|
|
3045
|
+
// Promise-returning form of child_process.exec.
var execAsync = (0, import_node_util.promisify)(import_node_child_process2.exec);
// Cap on captured stdout/stderr: 10 MiB.
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
/**
 * Runs `command` through the shell and reports the outcome as a plain result
 * object instead of throwing.
 *
 * @param {string} command - Full command line.
 * @param {{cwd?: string, env?: object, timeoutMs?: number, signal?: AbortSignal}} options
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number | null,
 *   failed: boolean, timedOut: boolean, signal: string | null}>}
 *   On failure `exitCode` is the numeric exit status when available (else
 *   null) and `timedOut` is true when the error is flagged `timedOut` or `killed`.
 */
async function defaultCommandRunner(command, options) {
  const { cwd, env, timeoutMs, signal } = options;
  // On Windows the command is run through PowerShell; elsewhere the default shell is used.
  const shell = process.platform === "win32" ? "powershell.exe" : void 0;
  let output;
  try {
    output = await execAsync(command, {
      cwd,
      env,
      timeout: timeoutMs,
      signal,
      maxBuffer: DEFAULT_MAX_BUFFER,
      shell
    });
  } catch (error) {
    const execError = error;
    const wasKilled = execError.timedOut === true || execError.killed === true;
    return {
      stdout: execError.stdout ?? "",
      stderr: execError.stderr ?? "",
      exitCode: typeof execError.code === "number" ? execError.code : null,
      failed: true,
      timedOut: wasKilled,
      signal: execError.signal ?? null
    };
  }
  return {
    stdout: output.stdout,
    stderr: output.stderr,
    exitCode: 0,
    failed: false,
    timedOut: false,
    signal: null
  };
}
|
|
3078
|
+
var CliProvider = class {
|
|
3079
|
+
id;
|
|
3080
|
+
kind = "cli";
|
|
3081
|
+
targetName;
|
|
3082
|
+
supportsBatch = true;
|
|
3083
|
+
config;
|
|
3084
|
+
runCommand;
|
|
3085
|
+
verbose;
|
|
3086
|
+
keepTempFiles;
|
|
3087
|
+
healthcheckPromise;
|
|
3088
|
+
constructor(targetName, config, runner = defaultCommandRunner) {
|
|
3089
|
+
this.targetName = targetName;
|
|
3090
|
+
this.id = `cli:${targetName}`;
|
|
3091
|
+
this.config = config;
|
|
3092
|
+
this.runCommand = runner;
|
|
3093
|
+
this.verbose = config.verbose ?? false;
|
|
3094
|
+
this.keepTempFiles = config.keepTempFiles ?? false;
|
|
3095
|
+
}
|
|
3096
|
+
async invoke(request) {
|
|
3097
|
+
if (request.signal?.aborted) {
|
|
3098
|
+
throw new Error("CLI provider request was aborted before execution");
|
|
3099
|
+
}
|
|
3100
|
+
await this.ensureHealthy(request.signal);
|
|
3101
|
+
const outputFilePath = generateOutputFilePath(request.evalCaseId);
|
|
3102
|
+
const templateValues = buildTemplateValues(request, this.config, outputFilePath);
|
|
3103
|
+
const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
|
|
3104
|
+
if (this.verbose) {
|
|
3105
|
+
console.log(
|
|
3106
|
+
`[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
3107
|
+
);
|
|
3108
|
+
}
|
|
3109
|
+
const startTime = Date.now();
|
|
3110
|
+
const result = await this.runCommand(renderedCommand, {
|
|
3111
|
+
cwd: this.config.cwd,
|
|
3112
|
+
env: process.env,
|
|
3113
|
+
timeoutMs: this.config.timeoutMs,
|
|
3114
|
+
signal: request.signal
|
|
3115
|
+
});
|
|
3116
|
+
const measuredDurationMs = Date.now() - startTime;
|
|
3117
|
+
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
3118
|
+
if (request.signal?.aborted) {
|
|
3119
|
+
throw new Error("CLI provider request was aborted");
|
|
3120
|
+
}
|
|
3121
|
+
if (result.timedOut) {
|
|
3122
|
+
throw new Error(
|
|
3123
|
+
`CLI provider timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
|
|
3124
|
+
);
|
|
2019
3125
|
}
|
|
2020
3126
|
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
2021
3127
|
const detail = result.stderr.trim() || result.stdout.trim();
|
|
@@ -2090,7 +3196,7 @@ var CliProvider = class {
|
|
|
2090
3196
|
}
|
|
2091
3197
|
if (result.timedOut) {
|
|
2092
3198
|
throw new Error(
|
|
2093
|
-
`CLI provider timed out${
|
|
3199
|
+
`CLI provider timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
|
|
2094
3200
|
);
|
|
2095
3201
|
}
|
|
2096
3202
|
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
@@ -2100,11 +3206,6 @@ var CliProvider = class {
|
|
|
2100
3206
|
}
|
|
2101
3207
|
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
2102
3208
|
const recordsById = this.parseJsonlBatchOutput(responseContent);
|
|
2103
|
-
const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
|
|
2104
|
-
const missingIds = requestedIds.filter((id) => !recordsById.has(id));
|
|
2105
|
-
if (missingIds.length > 0) {
|
|
2106
|
-
throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
|
|
2107
|
-
}
|
|
2108
3209
|
const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
|
|
2109
3210
|
const responses = requests.map((request) => {
|
|
2110
3211
|
const evalCaseId = request.evalCaseId;
|
|
@@ -2123,15 +3224,20 @@ var CliProvider = class {
|
|
|
2123
3224
|
}
|
|
2124
3225
|
const parsed = recordsById.get(evalCaseId);
|
|
2125
3226
|
if (!parsed) {
|
|
3227
|
+
const errorMessage = `Batch output missing id '${evalCaseId}'`;
|
|
3228
|
+
if (this.verbose) {
|
|
3229
|
+
console.warn(`[cli-provider:${this.targetName}] ${errorMessage}`);
|
|
3230
|
+
}
|
|
2126
3231
|
return {
|
|
2127
|
-
outputMessages: [],
|
|
3232
|
+
outputMessages: [{ role: "assistant", content: `Error: ${errorMessage}` }],
|
|
2128
3233
|
durationMs: perRequestFallbackMs,
|
|
2129
3234
|
raw: {
|
|
2130
3235
|
command: renderedCommand,
|
|
2131
3236
|
stderr: result.stderr,
|
|
2132
3237
|
exitCode: result.exitCode ?? 0,
|
|
2133
3238
|
cwd: this.config.cwd,
|
|
2134
|
-
outputFile: outputFilePath
|
|
3239
|
+
outputFile: outputFilePath,
|
|
3240
|
+
error: errorMessage
|
|
2135
3241
|
}
|
|
2136
3242
|
};
|
|
2137
3243
|
}
|
|
@@ -2164,101 +3270,37 @@ var CliProvider = class {
|
|
|
2164
3270
|
* - duration_ms: number
|
|
2165
3271
|
*/
|
|
2166
3272
|
parseOutputContent(content) {
|
|
3273
|
+
let parsed;
|
|
2167
3274
|
try {
|
|
2168
|
-
|
|
2169
|
-
if (typeof parsed === "object" && parsed !== null) {
|
|
2170
|
-
const obj = parsed;
|
|
2171
|
-
const tokenUsage = this.parseTokenUsage(obj.token_usage);
|
|
2172
|
-
const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
|
|
2173
|
-
const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
|
|
2174
|
-
const outputMessages = this.parseOutputMessages(obj.output_messages);
|
|
2175
|
-
if (outputMessages && outputMessages.length > 0) {
|
|
2176
|
-
return { outputMessages, tokenUsage, costUsd, durationMs };
|
|
2177
|
-
}
|
|
2178
|
-
if ("text" in obj) {
|
|
2179
|
-
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
2180
|
-
return {
|
|
2181
|
-
outputMessages: [{ role: "assistant", content: text }],
|
|
2182
|
-
tokenUsage,
|
|
2183
|
-
costUsd,
|
|
2184
|
-
durationMs
|
|
2185
|
-
};
|
|
2186
|
-
}
|
|
2187
|
-
}
|
|
3275
|
+
parsed = JSON.parse(content);
|
|
2188
3276
|
} catch {
|
|
3277
|
+
return { outputMessages: [{ role: "assistant", content }] };
|
|
2189
3278
|
}
|
|
2190
|
-
|
|
2191
|
-
|
|
2192
|
-
|
|
2193
|
-
* Parse token_usage from CLI output.
|
|
2194
|
-
*/
|
|
2195
|
-
parseTokenUsage(tokenUsage) {
|
|
2196
|
-
if (typeof tokenUsage !== "object" || tokenUsage === null) {
|
|
2197
|
-
return void 0;
|
|
2198
|
-
}
|
|
2199
|
-
const obj = tokenUsage;
|
|
2200
|
-
if (typeof obj.input !== "number" || typeof obj.output !== "number") {
|
|
2201
|
-
return void 0;
|
|
2202
|
-
}
|
|
2203
|
-
return {
|
|
2204
|
-
input: obj.input,
|
|
2205
|
-
output: obj.output,
|
|
2206
|
-
cached: typeof obj.cached === "number" ? obj.cached : void 0
|
|
2207
|
-
};
|
|
2208
|
-
}
|
|
2209
|
-
/**
|
|
2210
|
-
* Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
|
|
2211
|
-
*/
|
|
2212
|
-
parseOutputMessages(outputMessages) {
|
|
2213
|
-
if (!Array.isArray(outputMessages)) {
|
|
2214
|
-
return void 0;
|
|
2215
|
-
}
|
|
2216
|
-
const messages = [];
|
|
2217
|
-
for (const msg of outputMessages) {
|
|
2218
|
-
if (typeof msg !== "object" || msg === null) {
|
|
2219
|
-
continue;
|
|
2220
|
-
}
|
|
2221
|
-
const rawMsg = msg;
|
|
2222
|
-
if (typeof rawMsg.role !== "string") {
|
|
2223
|
-
continue;
|
|
2224
|
-
}
|
|
2225
|
-
const message = {
|
|
2226
|
-
role: rawMsg.role,
|
|
2227
|
-
name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
|
|
2228
|
-
content: rawMsg.content,
|
|
2229
|
-
toolCalls: this.parseToolCalls(rawMsg.tool_calls),
|
|
2230
|
-
timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
|
|
2231
|
-
metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
|
|
2232
|
-
};
|
|
2233
|
-
messages.push(message);
|
|
2234
|
-
}
|
|
2235
|
-
return messages.length > 0 ? messages : void 0;
|
|
2236
|
-
}
|
|
2237
|
-
/**
|
|
2238
|
-
* Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
|
|
2239
|
-
*/
|
|
2240
|
-
parseToolCalls(toolCalls) {
|
|
2241
|
-
if (!Array.isArray(toolCalls)) {
|
|
2242
|
-
return void 0;
|
|
3279
|
+
const result = CliOutputSchema.safeParse(parsed);
|
|
3280
|
+
if (!result.success) {
|
|
3281
|
+
return { outputMessages: [{ role: "assistant", content }] };
|
|
2243
3282
|
}
|
|
2244
|
-
const
|
|
2245
|
-
|
|
2246
|
-
|
|
2247
|
-
|
|
2248
|
-
|
|
2249
|
-
|
|
2250
|
-
|
|
2251
|
-
|
|
2252
|
-
|
|
2253
|
-
|
|
2254
|
-
|
|
2255
|
-
|
|
2256
|
-
|
|
2257
|
-
|
|
2258
|
-
|
|
2259
|
-
|
|
3283
|
+
const obj = result.data;
|
|
3284
|
+
const metrics = validateMetrics(obj.cost_usd, obj.duration_ms, "parsing output");
|
|
3285
|
+
const outputMessages = convertOutputMessages(obj.output_messages);
|
|
3286
|
+
if (outputMessages && outputMessages.length > 0) {
|
|
3287
|
+
return {
|
|
3288
|
+
outputMessages,
|
|
3289
|
+
tokenUsage: obj.token_usage,
|
|
3290
|
+
costUsd: metrics.costUsd,
|
|
3291
|
+
durationMs: metrics.durationMs
|
|
3292
|
+
};
|
|
3293
|
+
}
|
|
3294
|
+
if (obj.text !== void 0) {
|
|
3295
|
+
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
3296
|
+
return {
|
|
3297
|
+
outputMessages: [{ role: "assistant", content: text }],
|
|
3298
|
+
tokenUsage: obj.token_usage,
|
|
3299
|
+
costUsd: metrics.costUsd,
|
|
3300
|
+
durationMs: metrics.durationMs
|
|
3301
|
+
};
|
|
2260
3302
|
}
|
|
2261
|
-
return
|
|
3303
|
+
return { outputMessages: [{ role: "assistant", content }] };
|
|
2262
3304
|
}
|
|
2263
3305
|
parseJsonlBatchOutput(content) {
|
|
2264
3306
|
const records = /* @__PURE__ */ new Map();
|
|
@@ -2271,33 +3313,32 @@ var CliProvider = class {
|
|
|
2271
3313
|
const reason = error instanceof Error ? error.message : String(error);
|
|
2272
3314
|
throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
|
|
2273
3315
|
}
|
|
2274
|
-
|
|
3316
|
+
const result = CliJsonlRecordSchema.safeParse(parsed);
|
|
3317
|
+
if (!result.success) {
|
|
3318
|
+
const firstError = result.error.errors[0];
|
|
3319
|
+
if (firstError?.path.includes("id")) {
|
|
3320
|
+
throw new Error("CLI batch output JSONL line missing required string field: id");
|
|
3321
|
+
}
|
|
2275
3322
|
throw new Error("CLI batch output JSONL line must be an object");
|
|
2276
3323
|
}
|
|
2277
|
-
const obj =
|
|
2278
|
-
|
|
2279
|
-
|
|
2280
|
-
|
|
2281
|
-
|
|
2282
|
-
|
|
2283
|
-
|
|
2284
|
-
|
|
2285
|
-
const tokenUsage = this.parseTokenUsage(obj.token_usage);
|
|
2286
|
-
const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
|
|
2287
|
-
const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
|
|
2288
|
-
const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
|
|
2289
|
-
let outputMessages;
|
|
2290
|
-
if (parsedOutputMessages && parsedOutputMessages.length > 0) {
|
|
2291
|
-
outputMessages = parsedOutputMessages;
|
|
3324
|
+
const obj = result.data;
|
|
3325
|
+
if (records.has(obj.id)) {
|
|
3326
|
+
throw new Error(`CLI batch output contains duplicate id: ${obj.id}`);
|
|
3327
|
+
}
|
|
3328
|
+
const outputMessages = convertOutputMessages(obj.output_messages);
|
|
3329
|
+
let finalOutputMessages;
|
|
3330
|
+
if (outputMessages && outputMessages.length > 0) {
|
|
3331
|
+
finalOutputMessages = outputMessages;
|
|
2292
3332
|
} else {
|
|
2293
3333
|
const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
|
|
2294
|
-
|
|
2295
|
-
}
|
|
2296
|
-
|
|
2297
|
-
|
|
2298
|
-
|
|
2299
|
-
|
|
2300
|
-
|
|
3334
|
+
finalOutputMessages = text ? [{ role: "assistant", content: text }] : [];
|
|
3335
|
+
}
|
|
3336
|
+
const metrics = validateMetrics(obj.cost_usd, obj.duration_ms, `batch record '${obj.id}'`);
|
|
3337
|
+
records.set(obj.id, {
|
|
3338
|
+
outputMessages: finalOutputMessages,
|
|
3339
|
+
tokenUsage: obj.token_usage,
|
|
3340
|
+
costUsd: metrics.costUsd,
|
|
3341
|
+
durationMs: metrics.durationMs
|
|
2301
3342
|
});
|
|
2302
3343
|
}
|
|
2303
3344
|
return records;
|
|
@@ -2311,7 +3352,7 @@ var CliProvider = class {
|
|
|
2311
3352
|
throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
|
|
2312
3353
|
} finally {
|
|
2313
3354
|
if (!this.keepTempFiles) {
|
|
2314
|
-
await
|
|
3355
|
+
await import_promises9.default.unlink(filePath).catch(() => {
|
|
2315
3356
|
});
|
|
2316
3357
|
}
|
|
2317
3358
|
}
|
|
@@ -2383,7 +3424,7 @@ var CliProvider = class {
|
|
|
2383
3424
|
}
|
|
2384
3425
|
};
|
|
2385
3426
|
function buildTemplateValues(request, config, outputFilePath) {
|
|
2386
|
-
const inputFiles =
|
|
3427
|
+
const inputFiles = normalizeInputFiles2(request.inputFiles);
|
|
2387
3428
|
return {
|
|
2388
3429
|
PROMPT: shellEscape(request.question ?? ""),
|
|
2389
3430
|
GUIDELINES: shellEscape(request.guidelines ?? ""),
|
|
@@ -2393,13 +3434,13 @@ function buildTemplateValues(request, config, outputFilePath) {
|
|
|
2393
3434
|
OUTPUT_FILE: shellEscape(outputFilePath)
|
|
2394
3435
|
};
|
|
2395
3436
|
}
|
|
2396
|
-
function
|
|
3437
|
+
function normalizeInputFiles2(inputFiles) {
|
|
2397
3438
|
if (!inputFiles || inputFiles.length === 0) {
|
|
2398
3439
|
return void 0;
|
|
2399
3440
|
}
|
|
2400
3441
|
const unique = /* @__PURE__ */ new Map();
|
|
2401
3442
|
for (const inputFile of inputFiles) {
|
|
2402
|
-
const absolutePath =
|
|
3443
|
+
const absolutePath = import_node_path10.default.resolve(inputFile);
|
|
2403
3444
|
if (!unique.has(absolutePath)) {
|
|
2404
3445
|
unique.set(absolutePath, absolutePath);
|
|
2405
3446
|
}
|
|
@@ -2413,7 +3454,7 @@ function formatFileList(files, template) {
|
|
|
2413
3454
|
const formatter = template ?? "{path}";
|
|
2414
3455
|
return files.map((filePath) => {
|
|
2415
3456
|
const escapedPath = shellEscape(filePath);
|
|
2416
|
-
const escapedName = shellEscape(
|
|
3457
|
+
const escapedName = shellEscape(import_node_path10.default.basename(filePath));
|
|
2417
3458
|
return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
|
|
2418
3459
|
}).join(" ");
|
|
2419
3460
|
}
|
|
@@ -2437,9 +3478,9 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
|
|
|
2437
3478
|
const safeEvalId = evalCaseId || "unknown";
|
|
2438
3479
|
const timestamp = Date.now();
|
|
2439
3480
|
const random = Math.random().toString(36).substring(2, 9);
|
|
2440
|
-
return
|
|
3481
|
+
return import_node_path10.default.join(import_node_os2.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
|
|
2441
3482
|
}
|
|
2442
|
-
function
|
|
3483
|
+
function formatTimeoutSuffix2(timeoutMs) {
|
|
2443
3484
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
2444
3485
|
return "";
|
|
2445
3486
|
}
|
|
@@ -2448,39 +3489,39 @@ function formatTimeoutSuffix(timeoutMs) {
|
|
|
2448
3489
|
}
|
|
2449
3490
|
|
|
2450
3491
|
// src/evaluation/providers/codex.ts
|
|
2451
|
-
var
|
|
2452
|
-
var
|
|
2453
|
-
var
|
|
2454
|
-
var
|
|
2455
|
-
var
|
|
2456
|
-
var
|
|
3492
|
+
var import_node_child_process3 = require("child_process");
|
|
3493
|
+
var import_node_crypto2 = require("crypto");
|
|
3494
|
+
var import_node_fs4 = require("fs");
|
|
3495
|
+
var import_promises10 = require("fs/promises");
|
|
3496
|
+
var import_node_os3 = require("os");
|
|
3497
|
+
var import_node_path11 = __toESM(require("path"), 1);
|
|
2457
3498
|
var import_node_util2 = require("util");
|
|
2458
3499
|
|
|
2459
3500
|
// src/evaluation/providers/codex-log-tracker.ts
|
|
2460
|
-
var
|
|
2461
|
-
var
|
|
3501
|
+
var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.codexLogs");
|
|
3502
|
+
var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.codexLogSubscribers");
|
|
2462
3503
|
function getCodexLogStore() {
|
|
2463
3504
|
const globalObject = globalThis;
|
|
2464
|
-
const existing = globalObject[
|
|
3505
|
+
const existing = globalObject[GLOBAL_LOGS_KEY2];
|
|
2465
3506
|
if (existing) {
|
|
2466
3507
|
return existing;
|
|
2467
3508
|
}
|
|
2468
3509
|
const created = [];
|
|
2469
|
-
globalObject[
|
|
3510
|
+
globalObject[GLOBAL_LOGS_KEY2] = created;
|
|
2470
3511
|
return created;
|
|
2471
3512
|
}
|
|
2472
|
-
function
|
|
3513
|
+
function getSubscriberStore2() {
|
|
2473
3514
|
const globalObject = globalThis;
|
|
2474
|
-
const existing = globalObject[
|
|
3515
|
+
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
|
|
2475
3516
|
if (existing) {
|
|
2476
3517
|
return existing;
|
|
2477
3518
|
}
|
|
2478
3519
|
const created = /* @__PURE__ */ new Set();
|
|
2479
|
-
globalObject[
|
|
3520
|
+
globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
|
|
2480
3521
|
return created;
|
|
2481
3522
|
}
|
|
2482
|
-
function
|
|
2483
|
-
const subscribers = Array.from(
|
|
3523
|
+
function notifySubscribers2(entry) {
|
|
3524
|
+
const subscribers = Array.from(getSubscriberStore2());
|
|
2484
3525
|
for (const listener of subscribers) {
|
|
2485
3526
|
try {
|
|
2486
3527
|
listener(entry);
|
|
@@ -2492,7 +3533,7 @@ function notifySubscribers(entry) {
|
|
|
2492
3533
|
}
|
|
2493
3534
|
function recordCodexLogEntry(entry) {
|
|
2494
3535
|
getCodexLogStore().push(entry);
|
|
2495
|
-
|
|
3536
|
+
notifySubscribers2(entry);
|
|
2496
3537
|
}
|
|
2497
3538
|
function consumeCodexLogEntries() {
|
|
2498
3539
|
const store = getCodexLogStore();
|
|
@@ -2502,118 +3543,19 @@ function consumeCodexLogEntries() {
|
|
|
2502
3543
|
return store.splice(0, store.length);
|
|
2503
3544
|
}
|
|
2504
3545
|
function subscribeToCodexLogEntries(listener) {
|
|
2505
|
-
const store =
|
|
3546
|
+
const store = getSubscriberStore2();
|
|
2506
3547
|
store.add(listener);
|
|
2507
3548
|
return () => {
|
|
2508
3549
|
store.delete(listener);
|
|
2509
3550
|
};
|
|
2510
3551
|
}
|
|
2511
3552
|
|
|
2512
|
-
// src/evaluation/providers/preread.ts
|
|
2513
|
-
var import_node_path9 = __toESM(require("path"), 1);
|
|
2514
|
-
function buildPromptDocument(request, inputFiles, options) {
|
|
2515
|
-
const parts = [];
|
|
2516
|
-
const guidelineFiles = collectGuidelineFiles(
|
|
2517
|
-
inputFiles,
|
|
2518
|
-
options?.guidelinePatterns ?? request.guideline_patterns,
|
|
2519
|
-
options?.guidelineOverrides
|
|
2520
|
-
);
|
|
2521
|
-
const inputFilesList = collectInputFiles(inputFiles);
|
|
2522
|
-
const nonGuidelineInputFiles = inputFilesList.filter((file) => !guidelineFiles.includes(file));
|
|
2523
|
-
const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
|
|
2524
|
-
if (prereadBlock.length > 0) {
|
|
2525
|
-
parts.push("\n", prereadBlock);
|
|
2526
|
-
}
|
|
2527
|
-
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
2528
|
-
return parts.join("\n").trim();
|
|
2529
|
-
}
|
|
2530
|
-
function normalizeInputFiles2(inputFiles) {
|
|
2531
|
-
if (!inputFiles || inputFiles.length === 0) {
|
|
2532
|
-
return void 0;
|
|
2533
|
-
}
|
|
2534
|
-
const deduped = /* @__PURE__ */ new Map();
|
|
2535
|
-
for (const inputFile of inputFiles) {
|
|
2536
|
-
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
2537
|
-
if (!deduped.has(absolutePath)) {
|
|
2538
|
-
deduped.set(absolutePath, absolutePath);
|
|
2539
|
-
}
|
|
2540
|
-
}
|
|
2541
|
-
return Array.from(deduped.values());
|
|
2542
|
-
}
|
|
2543
|
-
function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
|
|
2544
|
-
if (!inputFiles || inputFiles.length === 0) {
|
|
2545
|
-
return [];
|
|
2546
|
-
}
|
|
2547
|
-
const unique = /* @__PURE__ */ new Map();
|
|
2548
|
-
for (const inputFile of inputFiles) {
|
|
2549
|
-
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
2550
|
-
if (overrides?.has(absolutePath)) {
|
|
2551
|
-
if (!unique.has(absolutePath)) {
|
|
2552
|
-
unique.set(absolutePath, absolutePath);
|
|
2553
|
-
}
|
|
2554
|
-
continue;
|
|
2555
|
-
}
|
|
2556
|
-
const normalized = absolutePath.split(import_node_path9.default.sep).join("/");
|
|
2557
|
-
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
2558
|
-
if (!unique.has(absolutePath)) {
|
|
2559
|
-
unique.set(absolutePath, absolutePath);
|
|
2560
|
-
}
|
|
2561
|
-
}
|
|
2562
|
-
}
|
|
2563
|
-
return Array.from(unique.values());
|
|
2564
|
-
}
|
|
2565
|
-
function collectInputFiles(inputFiles) {
|
|
2566
|
-
if (!inputFiles || inputFiles.length === 0) {
|
|
2567
|
-
return [];
|
|
2568
|
-
}
|
|
2569
|
-
const unique = /* @__PURE__ */ new Map();
|
|
2570
|
-
for (const inputFile of inputFiles) {
|
|
2571
|
-
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
2572
|
-
if (!unique.has(absolutePath)) {
|
|
2573
|
-
unique.set(absolutePath, absolutePath);
|
|
2574
|
-
}
|
|
2575
|
-
}
|
|
2576
|
-
return Array.from(unique.values());
|
|
2577
|
-
}
|
|
2578
|
-
function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
|
|
2579
|
-
if (guidelineFiles.length === 0 && inputFiles.length === 0) {
|
|
2580
|
-
return "";
|
|
2581
|
-
}
|
|
2582
|
-
const buildList = (files) => files.map((absolutePath) => {
|
|
2583
|
-
const fileName = import_node_path9.default.basename(absolutePath);
|
|
2584
|
-
const fileUri = pathToFileUri(absolutePath);
|
|
2585
|
-
return `* [${fileName}](${fileUri})`;
|
|
2586
|
-
});
|
|
2587
|
-
const sections = [];
|
|
2588
|
-
if (guidelineFiles.length > 0) {
|
|
2589
|
-
sections.push(`Read all guideline files:
|
|
2590
|
-
${buildList(guidelineFiles).join("\n")}.`);
|
|
2591
|
-
}
|
|
2592
|
-
if (inputFiles.length > 0) {
|
|
2593
|
-
sections.push(`Read all input files:
|
|
2594
|
-
${buildList(inputFiles).join("\n")}.`);
|
|
2595
|
-
}
|
|
2596
|
-
sections.push(
|
|
2597
|
-
"If any file is missing, fail with ERROR: missing-file <filename> and stop.",
|
|
2598
|
-
"Then apply system_instructions on the user query below."
|
|
2599
|
-
);
|
|
2600
|
-
return sections.join("\n");
|
|
2601
|
-
}
|
|
2602
|
-
function pathToFileUri(filePath) {
|
|
2603
|
-
const absolutePath = import_node_path9.default.isAbsolute(filePath) ? filePath : import_node_path9.default.resolve(filePath);
|
|
2604
|
-
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
2605
|
-
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
2606
|
-
return `file:///${normalizedPath}`;
|
|
2607
|
-
}
|
|
2608
|
-
return `file://${normalizedPath}`;
|
|
2609
|
-
}
|
|
2610
|
-
|
|
2611
3553
|
// src/evaluation/providers/codex.ts
|
|
2612
|
-
var execAsync2 = (0, import_node_util2.promisify)(
|
|
2613
|
-
var
|
|
2614
|
-
var
|
|
3554
|
+
var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process3.exec);
|
|
3555
|
+
var WORKSPACE_PREFIX2 = "agentv-codex-";
|
|
3556
|
+
var PROMPT_FILENAME2 = "prompt.md";
|
|
2615
3557
|
var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
|
|
2616
|
-
var
|
|
3558
|
+
var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
|
|
2617
3559
|
- Do NOT create any additional output files in the workspace.
|
|
2618
3560
|
- All intended file outputs/changes MUST be written in your response.
|
|
2619
3561
|
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
@@ -2638,27 +3580,27 @@ var CodexProvider = class {
|
|
|
2638
3580
|
throw new Error("Codex provider request was aborted before execution");
|
|
2639
3581
|
}
|
|
2640
3582
|
await this.ensureEnvironmentReady();
|
|
2641
|
-
const inputFiles =
|
|
3583
|
+
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
2642
3584
|
const workspaceRoot = await this.createWorkspace();
|
|
2643
3585
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
2644
3586
|
try {
|
|
2645
3587
|
const basePrompt = buildPromptDocument(request, inputFiles);
|
|
2646
|
-
const systemPrompt = this.config.systemPrompt ??
|
|
3588
|
+
const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
|
|
2647
3589
|
const promptContent = `${systemPrompt}
|
|
2648
3590
|
|
|
2649
3591
|
${basePrompt}`;
|
|
2650
|
-
const promptFile =
|
|
2651
|
-
await (0,
|
|
3592
|
+
const promptFile = import_node_path11.default.join(workspaceRoot, PROMPT_FILENAME2);
|
|
3593
|
+
await (0, import_promises10.writeFile)(promptFile, promptContent, "utf8");
|
|
2652
3594
|
const args = this.buildCodexArgs();
|
|
2653
3595
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
2654
3596
|
const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
|
|
2655
3597
|
if (result.timedOut) {
|
|
2656
3598
|
throw new Error(
|
|
2657
|
-
`Codex CLI timed out${
|
|
3599
|
+
`Codex CLI timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
|
|
2658
3600
|
);
|
|
2659
3601
|
}
|
|
2660
3602
|
if (result.exitCode !== 0) {
|
|
2661
|
-
const detail =
|
|
3603
|
+
const detail = pickDetail2(result.stderr, result.stdout);
|
|
2662
3604
|
const prefix = `Codex CLI exited with code ${result.exitCode}`;
|
|
2663
3605
|
throw new Error(detail ? `${prefix}: ${detail}` : prefix);
|
|
2664
3606
|
}
|
|
@@ -2697,7 +3639,7 @@ ${basePrompt}`;
|
|
|
2697
3639
|
if (!this.config.cwd) {
|
|
2698
3640
|
return workspaceRoot;
|
|
2699
3641
|
}
|
|
2700
|
-
return
|
|
3642
|
+
return import_node_path11.default.resolve(this.config.cwd);
|
|
2701
3643
|
}
|
|
2702
3644
|
buildCodexArgs() {
|
|
2703
3645
|
const args = [
|
|
@@ -2739,11 +3681,11 @@ ${basePrompt}`;
|
|
|
2739
3681
|
}
|
|
2740
3682
|
}
|
|
2741
3683
|
async createWorkspace() {
|
|
2742
|
-
return await (0,
|
|
3684
|
+
return await (0, import_promises10.mkdtemp)(import_node_path11.default.join((0, import_node_os3.tmpdir)(), WORKSPACE_PREFIX2));
|
|
2743
3685
|
}
|
|
2744
3686
|
async cleanupWorkspace(workspaceRoot) {
|
|
2745
3687
|
try {
|
|
2746
|
-
await (0,
|
|
3688
|
+
await (0, import_promises10.rm)(workspaceRoot, { recursive: true, force: true });
|
|
2747
3689
|
} catch {
|
|
2748
3690
|
}
|
|
2749
3691
|
}
|
|
@@ -2753,9 +3695,9 @@ ${basePrompt}`;
|
|
|
2753
3695
|
return void 0;
|
|
2754
3696
|
}
|
|
2755
3697
|
if (this.config.logDir) {
|
|
2756
|
-
return
|
|
3698
|
+
return import_node_path11.default.resolve(this.config.logDir);
|
|
2757
3699
|
}
|
|
2758
|
-
return
|
|
3700
|
+
return import_node_path11.default.join(process.cwd(), ".agentv", "logs", "codex");
|
|
2759
3701
|
}
|
|
2760
3702
|
async createStreamLogger(request) {
|
|
2761
3703
|
const logDir = this.resolveLogDirectory();
|
|
@@ -2763,13 +3705,13 @@ ${basePrompt}`;
|
|
|
2763
3705
|
return void 0;
|
|
2764
3706
|
}
|
|
2765
3707
|
try {
|
|
2766
|
-
await (0,
|
|
3708
|
+
await (0, import_promises10.mkdir)(logDir, { recursive: true });
|
|
2767
3709
|
} catch (error) {
|
|
2768
3710
|
const message = error instanceof Error ? error.message : String(error);
|
|
2769
3711
|
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
2770
3712
|
return void 0;
|
|
2771
3713
|
}
|
|
2772
|
-
const filePath =
|
|
3714
|
+
const filePath = import_node_path11.default.join(logDir, buildLogFilename2(request, this.targetName));
|
|
2773
3715
|
try {
|
|
2774
3716
|
const logger = await CodexStreamLogger.create({
|
|
2775
3717
|
filePath,
|
|
@@ -2802,7 +3744,7 @@ var CodexStreamLogger = class _CodexStreamLogger {
|
|
|
2802
3744
|
constructor(filePath, format) {
|
|
2803
3745
|
this.filePath = filePath;
|
|
2804
3746
|
this.format = format;
|
|
2805
|
-
this.stream = (0,
|
|
3747
|
+
this.stream = (0, import_node_fs4.createWriteStream)(filePath, { flags: "a" });
|
|
2806
3748
|
}
|
|
2807
3749
|
static async create(options) {
|
|
2808
3750
|
const logger = new _CodexStreamLogger(options.filePath, options.format);
|
|
@@ -2863,7 +3805,7 @@ var CodexStreamLogger = class _CodexStreamLogger {
|
|
|
2863
3805
|
return void 0;
|
|
2864
3806
|
}
|
|
2865
3807
|
const message = this.format === "json" ? formatCodexJsonLog(trimmed) : formatCodexLogMessage(trimmed, source);
|
|
2866
|
-
return `[+${
|
|
3808
|
+
return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
|
|
2867
3809
|
}
|
|
2868
3810
|
flushRemainder() {
|
|
2869
3811
|
const stdoutRemainder = this.stdoutBuffer.trim();
|
|
@@ -2894,18 +3836,18 @@ function isCodexLogStreamingDisabled() {
|
|
|
2894
3836
|
const normalized = envValue.trim().toLowerCase();
|
|
2895
3837
|
return normalized === "false" || normalized === "0" || normalized === "off";
|
|
2896
3838
|
}
|
|
2897
|
-
function
|
|
3839
|
+
function buildLogFilename2(request, targetName) {
|
|
2898
3840
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
2899
|
-
const evalId =
|
|
3841
|
+
const evalId = sanitizeForFilename2(request.evalCaseId ?? "codex");
|
|
2900
3842
|
const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
|
|
2901
|
-
const target =
|
|
2902
|
-
return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0,
|
|
3843
|
+
const target = sanitizeForFilename2(targetName);
|
|
3844
|
+
return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0, import_node_crypto2.randomUUID)().slice(0, 8)}.log`;
|
|
2903
3845
|
}
|
|
2904
|
-
function
|
|
3846
|
+
function sanitizeForFilename2(value) {
|
|
2905
3847
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
2906
3848
|
return sanitized.length > 0 ? sanitized : "codex";
|
|
2907
3849
|
}
|
|
2908
|
-
function
|
|
3850
|
+
function formatElapsed2(startedAt) {
|
|
2909
3851
|
const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
|
|
2910
3852
|
const hours = Math.floor(elapsedSeconds / 3600);
|
|
2911
3853
|
const minutes = Math.floor(elapsedSeconds % 3600 / 60);
|
|
@@ -2916,7 +3858,7 @@ function formatElapsed(startedAt) {
|
|
|
2916
3858
|
return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
|
|
2917
3859
|
}
|
|
2918
3860
|
function formatCodexLogMessage(rawLine, source) {
|
|
2919
|
-
const parsed =
|
|
3861
|
+
const parsed = tryParseJsonValue2(rawLine);
|
|
2920
3862
|
if (parsed) {
|
|
2921
3863
|
const summary = summarizeCodexEvent(parsed);
|
|
2922
3864
|
if (summary) {
|
|
@@ -2929,7 +3871,7 @@ function formatCodexLogMessage(rawLine, source) {
|
|
|
2929
3871
|
return rawLine;
|
|
2930
3872
|
}
|
|
2931
3873
|
function formatCodexJsonLog(rawLine) {
|
|
2932
|
-
const parsed =
|
|
3874
|
+
const parsed = tryParseJsonValue2(rawLine);
|
|
2933
3875
|
if (!parsed) {
|
|
2934
3876
|
return rawLine;
|
|
2935
3877
|
}
|
|
@@ -2974,7 +3916,7 @@ function summarizeCodexEvent(event) {
|
|
|
2974
3916
|
}
|
|
2975
3917
|
return type;
|
|
2976
3918
|
}
|
|
2977
|
-
function
|
|
3919
|
+
function tryParseJsonValue2(rawLine) {
|
|
2978
3920
|
try {
|
|
2979
3921
|
return JSON.parse(rawLine);
|
|
2980
3922
|
} catch {
|
|
@@ -2984,9 +3926,9 @@ function tryParseJsonValue(rawLine) {
|
|
|
2984
3926
|
async function locateExecutable(candidate) {
|
|
2985
3927
|
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
2986
3928
|
if (includesPathSeparator) {
|
|
2987
|
-
const resolved =
|
|
3929
|
+
const resolved = import_node_path11.default.isAbsolute(candidate) ? candidate : import_node_path11.default.resolve(candidate);
|
|
2988
3930
|
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
2989
|
-
await (0,
|
|
3931
|
+
await (0, import_promises10.access)(executablePath, import_node_fs4.constants.F_OK);
|
|
2990
3932
|
return executablePath;
|
|
2991
3933
|
}
|
|
2992
3934
|
const locator = process.platform === "win32" ? "where" : "which";
|
|
@@ -2996,7 +3938,7 @@ async function locateExecutable(candidate) {
|
|
|
2996
3938
|
const preferred = selectExecutableCandidate(lines);
|
|
2997
3939
|
if (preferred) {
|
|
2998
3940
|
const executablePath = await ensureWindowsExecutableVariant(preferred);
|
|
2999
|
-
await (0,
|
|
3941
|
+
await (0, import_promises10.access)(executablePath, import_node_fs4.constants.F_OK);
|
|
3000
3942
|
return executablePath;
|
|
3001
3943
|
}
|
|
3002
3944
|
} catch {
|
|
@@ -3030,7 +3972,7 @@ async function ensureWindowsExecutableVariant(candidate) {
|
|
|
3030
3972
|
for (const ext of extensions) {
|
|
3031
3973
|
const withExtension = `${candidate}${ext}`;
|
|
3032
3974
|
try {
|
|
3033
|
-
await (0,
|
|
3975
|
+
await (0, import_promises10.access)(withExtension, import_node_fs4.constants.F_OK);
|
|
3034
3976
|
return withExtension;
|
|
3035
3977
|
} catch {
|
|
3036
3978
|
}
|
|
@@ -3203,7 +4145,7 @@ function parseJsonLines(output) {
|
|
|
3203
4145
|
}
|
|
3204
4146
|
return parsed;
|
|
3205
4147
|
}
|
|
3206
|
-
function
|
|
4148
|
+
function pickDetail2(stderr, stdout) {
|
|
3207
4149
|
const errorText = stderr.trim();
|
|
3208
4150
|
if (errorText.length > 0) {
|
|
3209
4151
|
return errorText;
|
|
@@ -3211,7 +4153,7 @@ function pickDetail(stderr, stdout) {
|
|
|
3211
4153
|
const stdoutText = stdout.trim();
|
|
3212
4154
|
return stdoutText.length > 0 ? stdoutText : void 0;
|
|
3213
4155
|
}
|
|
3214
|
-
function
|
|
4156
|
+
function formatTimeoutSuffix3(timeoutMs) {
|
|
3215
4157
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
3216
4158
|
return "";
|
|
3217
4159
|
}
|
|
@@ -3220,7 +4162,7 @@ function formatTimeoutSuffix2(timeoutMs) {
|
|
|
3220
4162
|
}
|
|
3221
4163
|
async function defaultCodexRunner(options) {
|
|
3222
4164
|
return await new Promise((resolve, reject) => {
|
|
3223
|
-
const child = (0,
|
|
4165
|
+
const child = (0, import_node_child_process3.spawn)(options.executable, options.args, {
|
|
3224
4166
|
cwd: options.cwd,
|
|
3225
4167
|
env: options.env,
|
|
3226
4168
|
stdio: ["pipe", "pipe", "pipe"],
|
|
@@ -3331,38 +4273,38 @@ var MockProvider = class {
|
|
|
3331
4273
|
};
|
|
3332
4274
|
|
|
3333
4275
|
// src/evaluation/providers/pi-coding-agent.ts
|
|
3334
|
-
var
|
|
3335
|
-
var
|
|
3336
|
-
var
|
|
3337
|
-
var
|
|
3338
|
-
var
|
|
3339
|
-
var
|
|
4276
|
+
var import_node_child_process4 = require("child_process");
|
|
4277
|
+
var import_node_crypto3 = require("crypto");
|
|
4278
|
+
var import_node_fs5 = require("fs");
|
|
4279
|
+
var import_promises11 = require("fs/promises");
|
|
4280
|
+
var import_node_os4 = require("os");
|
|
4281
|
+
var import_node_path12 = __toESM(require("path"), 1);
|
|
3340
4282
|
|
|
3341
4283
|
// src/evaluation/providers/pi-log-tracker.ts
|
|
3342
|
-
var
|
|
3343
|
-
var
|
|
4284
|
+
var GLOBAL_LOGS_KEY3 = Symbol.for("agentv.piLogs");
|
|
4285
|
+
var GLOBAL_SUBSCRIBERS_KEY3 = Symbol.for("agentv.piLogSubscribers");
|
|
3344
4286
|
function getPiLogStore() {
|
|
3345
4287
|
const globalObject = globalThis;
|
|
3346
|
-
const existing = globalObject[
|
|
4288
|
+
const existing = globalObject[GLOBAL_LOGS_KEY3];
|
|
3347
4289
|
if (existing) {
|
|
3348
4290
|
return existing;
|
|
3349
4291
|
}
|
|
3350
4292
|
const created = [];
|
|
3351
|
-
globalObject[
|
|
4293
|
+
globalObject[GLOBAL_LOGS_KEY3] = created;
|
|
3352
4294
|
return created;
|
|
3353
4295
|
}
|
|
3354
|
-
function
|
|
4296
|
+
function getSubscriberStore3() {
|
|
3355
4297
|
const globalObject = globalThis;
|
|
3356
|
-
const existing = globalObject[
|
|
4298
|
+
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY3];
|
|
3357
4299
|
if (existing) {
|
|
3358
4300
|
return existing;
|
|
3359
4301
|
}
|
|
3360
4302
|
const created = /* @__PURE__ */ new Set();
|
|
3361
|
-
globalObject[
|
|
4303
|
+
globalObject[GLOBAL_SUBSCRIBERS_KEY3] = created;
|
|
3362
4304
|
return created;
|
|
3363
4305
|
}
|
|
3364
|
-
function
|
|
3365
|
-
const subscribers = Array.from(
|
|
4306
|
+
function notifySubscribers3(entry) {
|
|
4307
|
+
const subscribers = Array.from(getSubscriberStore3());
|
|
3366
4308
|
for (const listener of subscribers) {
|
|
3367
4309
|
try {
|
|
3368
4310
|
listener(entry);
|
|
@@ -3374,7 +4316,7 @@ function notifySubscribers2(entry) {
|
|
|
3374
4316
|
}
|
|
3375
4317
|
function recordPiLogEntry(entry) {
|
|
3376
4318
|
getPiLogStore().push(entry);
|
|
3377
|
-
|
|
4319
|
+
notifySubscribers3(entry);
|
|
3378
4320
|
}
|
|
3379
4321
|
function consumePiLogEntries() {
|
|
3380
4322
|
const store = getPiLogStore();
|
|
@@ -3384,7 +4326,7 @@ function consumePiLogEntries() {
|
|
|
3384
4326
|
return store.splice(0, store.length);
|
|
3385
4327
|
}
|
|
3386
4328
|
function subscribeToPiLogEntries(listener) {
|
|
3387
|
-
const store =
|
|
4329
|
+
const store = getSubscriberStore3();
|
|
3388
4330
|
store.add(listener);
|
|
3389
4331
|
return () => {
|
|
3390
4332
|
store.delete(listener);
|
|
@@ -3392,9 +4334,9 @@ function subscribeToPiLogEntries(listener) {
|
|
|
3392
4334
|
}
|
|
3393
4335
|
|
|
3394
4336
|
// src/evaluation/providers/pi-coding-agent.ts
|
|
3395
|
-
var
|
|
3396
|
-
var
|
|
3397
|
-
var
|
|
4337
|
+
var WORKSPACE_PREFIX3 = "agentv-pi-";
|
|
4338
|
+
var PROMPT_FILENAME3 = "prompt.md";
|
|
4339
|
+
var DEFAULT_SYSTEM_PROMPT4 = `**IMPORTANT**: Follow these instructions for your response:
|
|
3398
4340
|
- Do NOT create any additional output files in the workspace.
|
|
3399
4341
|
- All intended file outputs/changes MUST be written in your response.
|
|
3400
4342
|
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
@@ -3416,27 +4358,27 @@ var PiCodingAgentProvider = class {
|
|
|
3416
4358
|
if (request.signal?.aborted) {
|
|
3417
4359
|
throw new Error("Pi coding agent request was aborted before execution");
|
|
3418
4360
|
}
|
|
3419
|
-
const inputFiles =
|
|
4361
|
+
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
3420
4362
|
const workspaceRoot = await this.createWorkspace();
|
|
3421
4363
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
3422
4364
|
try {
|
|
3423
|
-
const promptFile =
|
|
3424
|
-
await (0,
|
|
4365
|
+
const promptFile = import_node_path12.default.join(workspaceRoot, PROMPT_FILENAME3);
|
|
4366
|
+
await (0, import_promises11.writeFile)(promptFile, request.question, "utf8");
|
|
3425
4367
|
const args = this.buildPiArgs(request.question, inputFiles);
|
|
3426
4368
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
3427
4369
|
const result = await this.executePi(args, cwd, request.signal, logger);
|
|
3428
4370
|
if (result.timedOut) {
|
|
3429
4371
|
throw new Error(
|
|
3430
|
-
`Pi coding agent timed out${
|
|
4372
|
+
`Pi coding agent timed out${formatTimeoutSuffix4(this.config.timeoutMs ?? void 0)}`
|
|
3431
4373
|
);
|
|
3432
4374
|
}
|
|
3433
4375
|
if (result.exitCode !== 0) {
|
|
3434
|
-
const detail =
|
|
4376
|
+
const detail = pickDetail3(result.stderr, result.stdout);
|
|
3435
4377
|
const prefix = `Pi coding agent exited with code ${result.exitCode}`;
|
|
3436
4378
|
throw new Error(detail ? `${prefix}: ${detail}` : prefix);
|
|
3437
4379
|
}
|
|
3438
4380
|
const parsed = parsePiJsonl(result.stdout);
|
|
3439
|
-
const outputMessages =
|
|
4381
|
+
const outputMessages = extractOutputMessages2(parsed);
|
|
3440
4382
|
const assistantText = extractAssistantText2(outputMessages);
|
|
3441
4383
|
return {
|
|
3442
4384
|
raw: {
|
|
@@ -3462,7 +4404,7 @@ var PiCodingAgentProvider = class {
|
|
|
3462
4404
|
if (!this.config.cwd) {
|
|
3463
4405
|
return workspaceRoot;
|
|
3464
4406
|
}
|
|
3465
|
-
return
|
|
4407
|
+
return import_node_path12.default.resolve(this.config.cwd);
|
|
3466
4408
|
}
|
|
3467
4409
|
buildPiArgs(prompt, inputFiles) {
|
|
3468
4410
|
const args = [];
|
|
@@ -3492,7 +4434,7 @@ var PiCodingAgentProvider = class {
|
|
|
3492
4434
|
args.push(`@${file}`);
|
|
3493
4435
|
}
|
|
3494
4436
|
}
|
|
3495
|
-
const systemPrompt = this.config.systemPrompt ??
|
|
4437
|
+
const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT4;
|
|
3496
4438
|
const fullPrompt = `${systemPrompt}
|
|
3497
4439
|
|
|
3498
4440
|
${prompt}`;
|
|
@@ -3551,19 +4493,19 @@ ${prompt}`;
|
|
|
3551
4493
|
return env;
|
|
3552
4494
|
}
|
|
3553
4495
|
async createWorkspace() {
|
|
3554
|
-
return await (0,
|
|
4496
|
+
return await (0, import_promises11.mkdtemp)(import_node_path12.default.join((0, import_node_os4.tmpdir)(), WORKSPACE_PREFIX3));
|
|
3555
4497
|
}
|
|
3556
4498
|
async cleanupWorkspace(workspaceRoot) {
|
|
3557
4499
|
try {
|
|
3558
|
-
await (0,
|
|
4500
|
+
await (0, import_promises11.rm)(workspaceRoot, { recursive: true, force: true });
|
|
3559
4501
|
} catch {
|
|
3560
4502
|
}
|
|
3561
4503
|
}
|
|
3562
4504
|
resolveLogDirectory() {
|
|
3563
4505
|
if (this.config.logDir) {
|
|
3564
|
-
return
|
|
4506
|
+
return import_node_path12.default.resolve(this.config.logDir);
|
|
3565
4507
|
}
|
|
3566
|
-
return
|
|
4508
|
+
return import_node_path12.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
|
|
3567
4509
|
}
|
|
3568
4510
|
async createStreamLogger(request) {
|
|
3569
4511
|
const logDir = this.resolveLogDirectory();
|
|
@@ -3571,13 +4513,13 @@ ${prompt}`;
|
|
|
3571
4513
|
return void 0;
|
|
3572
4514
|
}
|
|
3573
4515
|
try {
|
|
3574
|
-
await (0,
|
|
4516
|
+
await (0, import_promises11.mkdir)(logDir, { recursive: true });
|
|
3575
4517
|
} catch (error) {
|
|
3576
4518
|
const message = error instanceof Error ? error.message : String(error);
|
|
3577
4519
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
3578
4520
|
return void 0;
|
|
3579
4521
|
}
|
|
3580
|
-
const filePath =
|
|
4522
|
+
const filePath = import_node_path12.default.join(logDir, buildLogFilename3(request, this.targetName));
|
|
3581
4523
|
try {
|
|
3582
4524
|
const logger = await PiStreamLogger.create({
|
|
3583
4525
|
filePath,
|
|
@@ -3610,7 +4552,7 @@ var PiStreamLogger = class _PiStreamLogger {
|
|
|
3610
4552
|
constructor(filePath, format) {
|
|
3611
4553
|
this.filePath = filePath;
|
|
3612
4554
|
this.format = format;
|
|
3613
|
-
this.stream = (0,
|
|
4555
|
+
this.stream = (0, import_node_fs5.createWriteStream)(filePath, { flags: "a" });
|
|
3614
4556
|
}
|
|
3615
4557
|
static async create(options) {
|
|
3616
4558
|
const logger = new _PiStreamLogger(options.filePath, options.format);
|
|
@@ -3671,7 +4613,7 @@ var PiStreamLogger = class _PiStreamLogger {
|
|
|
3671
4613
|
return void 0;
|
|
3672
4614
|
}
|
|
3673
4615
|
const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source);
|
|
3674
|
-
return `[+${
|
|
4616
|
+
return `[+${formatElapsed3(this.startedAt)}] [${source}] ${message}`;
|
|
3675
4617
|
}
|
|
3676
4618
|
flushRemainder() {
|
|
3677
4619
|
const stdoutRemainder = this.stdoutBuffer.trim();
|
|
@@ -3694,18 +4636,18 @@ var PiStreamLogger = class _PiStreamLogger {
|
|
|
3694
4636
|
this.stderrBuffer = "";
|
|
3695
4637
|
}
|
|
3696
4638
|
};
|
|
3697
|
-
function
|
|
4639
|
+
function buildLogFilename3(request, targetName) {
|
|
3698
4640
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
3699
|
-
const evalId =
|
|
4641
|
+
const evalId = sanitizeForFilename3(request.evalCaseId ?? "pi");
|
|
3700
4642
|
const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
|
|
3701
|
-
const target =
|
|
3702
|
-
return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0,
|
|
4643
|
+
const target = sanitizeForFilename3(targetName);
|
|
4644
|
+
return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0, import_node_crypto3.randomUUID)().slice(0, 8)}.log`;
|
|
3703
4645
|
}
|
|
3704
|
-
function
|
|
4646
|
+
function sanitizeForFilename3(value) {
|
|
3705
4647
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
3706
4648
|
return sanitized.length > 0 ? sanitized : "pi";
|
|
3707
4649
|
}
|
|
3708
|
-
function
|
|
4650
|
+
function formatElapsed3(startedAt) {
|
|
3709
4651
|
const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
|
|
3710
4652
|
const hours = Math.floor(elapsedSeconds / 3600);
|
|
3711
4653
|
const minutes = Math.floor(elapsedSeconds % 3600 / 60);
|
|
@@ -3716,7 +4658,7 @@ function formatElapsed2(startedAt) {
|
|
|
3716
4658
|
return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
|
|
3717
4659
|
}
|
|
3718
4660
|
function formatPiLogMessage(rawLine, source) {
|
|
3719
|
-
const parsed =
|
|
4661
|
+
const parsed = tryParseJsonValue3(rawLine);
|
|
3720
4662
|
if (parsed) {
|
|
3721
4663
|
const summary = summarizePiEvent(parsed);
|
|
3722
4664
|
if (summary) {
|
|
@@ -3729,7 +4671,7 @@ function formatPiLogMessage(rawLine, source) {
|
|
|
3729
4671
|
return rawLine;
|
|
3730
4672
|
}
|
|
3731
4673
|
function formatPiJsonLog(rawLine) {
|
|
3732
|
-
const parsed =
|
|
4674
|
+
const parsed = tryParseJsonValue3(rawLine);
|
|
3733
4675
|
if (!parsed) {
|
|
3734
4676
|
return rawLine;
|
|
3735
4677
|
}
|
|
@@ -3779,7 +4721,7 @@ function summarizePiEvent(event) {
|
|
|
3779
4721
|
return type;
|
|
3780
4722
|
}
|
|
3781
4723
|
}
|
|
3782
|
-
function
|
|
4724
|
+
function tryParseJsonValue3(rawLine) {
|
|
3783
4725
|
try {
|
|
3784
4726
|
return JSON.parse(rawLine);
|
|
3785
4727
|
} catch {
|
|
@@ -3804,7 +4746,7 @@ function parsePiJsonl(output) {
|
|
|
3804
4746
|
}
|
|
3805
4747
|
return parsed;
|
|
3806
4748
|
}
|
|
3807
|
-
function
|
|
4749
|
+
function extractOutputMessages2(events) {
|
|
3808
4750
|
for (let i = events.length - 1; i >= 0; i--) {
|
|
3809
4751
|
const event = events[i];
|
|
3810
4752
|
if (!event || typeof event !== "object") {
|
|
@@ -3845,8 +4787,8 @@ function convertPiMessage(message) {
|
|
|
3845
4787
|
if (typeof role !== "string") {
|
|
3846
4788
|
return void 0;
|
|
3847
4789
|
}
|
|
3848
|
-
const content =
|
|
3849
|
-
const toolCalls =
|
|
4790
|
+
const content = extractTextContent2(msg.content);
|
|
4791
|
+
const toolCalls = extractToolCalls2(msg.content);
|
|
3850
4792
|
const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
|
|
3851
4793
|
const metadata = {};
|
|
3852
4794
|
if (msg.api) metadata.api = msg.api;
|
|
@@ -3862,7 +4804,7 @@ function convertPiMessage(message) {
|
|
|
3862
4804
|
metadata: Object.keys(metadata).length > 0 ? metadata : void 0
|
|
3863
4805
|
};
|
|
3864
4806
|
}
|
|
3865
|
-
function
|
|
4807
|
+
function extractTextContent2(content) {
|
|
3866
4808
|
if (typeof content === "string") {
|
|
3867
4809
|
return content;
|
|
3868
4810
|
}
|
|
@@ -3881,7 +4823,7 @@ function extractTextContent(content) {
|
|
|
3881
4823
|
}
|
|
3882
4824
|
return textParts.length > 0 ? textParts.join("\n") : void 0;
|
|
3883
4825
|
}
|
|
3884
|
-
function
|
|
4826
|
+
function extractToolCalls2(content) {
|
|
3885
4827
|
if (!Array.isArray(content)) {
|
|
3886
4828
|
return [];
|
|
3887
4829
|
}
|
|
@@ -3926,7 +4868,7 @@ function extractAssistantText2(messages) {
|
|
|
3926
4868
|
function escapeAtSymbols(prompt) {
|
|
3927
4869
|
return prompt.replace(/@\[([^\]]+)\]:/g, "[[$1]]:");
|
|
3928
4870
|
}
|
|
3929
|
-
function
|
|
4871
|
+
function pickDetail3(stderr, stdout) {
|
|
3930
4872
|
const errorText = stderr.trim();
|
|
3931
4873
|
if (errorText.length > 0) {
|
|
3932
4874
|
return errorText;
|
|
@@ -3934,7 +4876,7 @@ function pickDetail2(stderr, stdout) {
|
|
|
3934
4876
|
const stdoutText = stdout.trim();
|
|
3935
4877
|
return stdoutText.length > 0 ? stdoutText : void 0;
|
|
3936
4878
|
}
|
|
3937
|
-
function
|
|
4879
|
+
function formatTimeoutSuffix4(timeoutMs) {
|
|
3938
4880
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
3939
4881
|
return "";
|
|
3940
4882
|
}
|
|
@@ -3947,7 +4889,7 @@ async function defaultPiRunner(options) {
|
|
|
3947
4889
|
const executable = parts[0];
|
|
3948
4890
|
const executableArgs = parts.slice(1);
|
|
3949
4891
|
const allArgs = [...executableArgs, ...options.args];
|
|
3950
|
-
const child = (0,
|
|
4892
|
+
const child = (0, import_node_child_process4.spawn)(executable, allArgs, {
|
|
3951
4893
|
cwd: options.cwd,
|
|
3952
4894
|
env: options.env,
|
|
3953
4895
|
stdio: ["pipe", "pipe", "pipe"],
|
|
@@ -4010,84 +4952,84 @@ async function defaultPiRunner(options) {
|
|
|
4010
4952
|
}
|
|
4011
4953
|
|
|
4012
4954
|
// src/evaluation/providers/targets.ts
|
|
4013
|
-
var
|
|
4014
|
-
var
|
|
4015
|
-
var CliHealthcheckHttpInputSchema =
|
|
4016
|
-
type:
|
|
4017
|
-
url:
|
|
4018
|
-
timeout_seconds:
|
|
4019
|
-
timeoutSeconds:
|
|
4955
|
+
var import_node_path13 = __toESM(require("path"), 1);
|
|
4956
|
+
var import_zod2 = require("zod");
|
|
4957
|
+
var CliHealthcheckHttpInputSchema = import_zod2.z.object({
|
|
4958
|
+
type: import_zod2.z.literal("http"),
|
|
4959
|
+
url: import_zod2.z.string().min(1, "healthcheck URL is required"),
|
|
4960
|
+
timeout_seconds: import_zod2.z.number().positive().optional(),
|
|
4961
|
+
timeoutSeconds: import_zod2.z.number().positive().optional()
|
|
4020
4962
|
});
|
|
4021
|
-
var CliHealthcheckCommandInputSchema =
|
|
4022
|
-
type:
|
|
4023
|
-
command_template:
|
|
4024
|
-
commandTemplate:
|
|
4025
|
-
cwd:
|
|
4026
|
-
timeout_seconds:
|
|
4027
|
-
timeoutSeconds:
|
|
4963
|
+
var CliHealthcheckCommandInputSchema = import_zod2.z.object({
|
|
4964
|
+
type: import_zod2.z.literal("command"),
|
|
4965
|
+
command_template: import_zod2.z.string().optional(),
|
|
4966
|
+
commandTemplate: import_zod2.z.string().optional(),
|
|
4967
|
+
cwd: import_zod2.z.string().optional(),
|
|
4968
|
+
timeout_seconds: import_zod2.z.number().positive().optional(),
|
|
4969
|
+
timeoutSeconds: import_zod2.z.number().positive().optional()
|
|
4028
4970
|
});
|
|
4029
|
-
var CliHealthcheckInputSchema =
|
|
4971
|
+
var CliHealthcheckInputSchema = import_zod2.z.discriminatedUnion("type", [
|
|
4030
4972
|
CliHealthcheckHttpInputSchema,
|
|
4031
4973
|
CliHealthcheckCommandInputSchema
|
|
4032
4974
|
]);
|
|
4033
|
-
var CliTargetInputSchema =
|
|
4034
|
-
name:
|
|
4035
|
-
provider:
|
|
4975
|
+
var CliTargetInputSchema = import_zod2.z.object({
|
|
4976
|
+
name: import_zod2.z.string().min(1, "target name is required"),
|
|
4977
|
+
provider: import_zod2.z.string().refine((p) => p.toLowerCase() === "cli", { message: "provider must be 'cli'" }),
|
|
4036
4978
|
// Command template - required (accept both naming conventions)
|
|
4037
|
-
command_template:
|
|
4038
|
-
commandTemplate:
|
|
4979
|
+
command_template: import_zod2.z.string().optional(),
|
|
4980
|
+
commandTemplate: import_zod2.z.string().optional(),
|
|
4039
4981
|
// Files format - optional
|
|
4040
|
-
files_format:
|
|
4041
|
-
filesFormat:
|
|
4042
|
-
attachments_format:
|
|
4043
|
-
attachmentsFormat:
|
|
4982
|
+
files_format: import_zod2.z.string().optional(),
|
|
4983
|
+
filesFormat: import_zod2.z.string().optional(),
|
|
4984
|
+
attachments_format: import_zod2.z.string().optional(),
|
|
4985
|
+
attachmentsFormat: import_zod2.z.string().optional(),
|
|
4044
4986
|
// Working directory - optional
|
|
4045
|
-
cwd:
|
|
4987
|
+
cwd: import_zod2.z.string().optional(),
|
|
4046
4988
|
// Timeout in seconds - optional
|
|
4047
|
-
timeout_seconds:
|
|
4048
|
-
timeoutSeconds:
|
|
4989
|
+
timeout_seconds: import_zod2.z.number().positive().optional(),
|
|
4990
|
+
timeoutSeconds: import_zod2.z.number().positive().optional(),
|
|
4049
4991
|
// Healthcheck configuration - optional
|
|
4050
4992
|
healthcheck: CliHealthcheckInputSchema.optional(),
|
|
4051
4993
|
// Verbose mode - optional
|
|
4052
|
-
verbose:
|
|
4053
|
-
cli_verbose:
|
|
4054
|
-
cliVerbose:
|
|
4994
|
+
verbose: import_zod2.z.boolean().optional(),
|
|
4995
|
+
cli_verbose: import_zod2.z.boolean().optional(),
|
|
4996
|
+
cliVerbose: import_zod2.z.boolean().optional(),
|
|
4055
4997
|
// Keep temp files - optional
|
|
4056
|
-
keep_temp_files:
|
|
4057
|
-
keepTempFiles:
|
|
4058
|
-
keep_output_files:
|
|
4059
|
-
keepOutputFiles:
|
|
4998
|
+
keep_temp_files: import_zod2.z.boolean().optional(),
|
|
4999
|
+
keepTempFiles: import_zod2.z.boolean().optional(),
|
|
5000
|
+
keep_output_files: import_zod2.z.boolean().optional(),
|
|
5001
|
+
keepOutputFiles: import_zod2.z.boolean().optional(),
|
|
4060
5002
|
// Common target fields
|
|
4061
|
-
judge_target:
|
|
4062
|
-
workers:
|
|
4063
|
-
provider_batching:
|
|
4064
|
-
providerBatching:
|
|
5003
|
+
judge_target: import_zod2.z.string().optional(),
|
|
5004
|
+
workers: import_zod2.z.number().int().min(1).optional(),
|
|
5005
|
+
provider_batching: import_zod2.z.boolean().optional(),
|
|
5006
|
+
providerBatching: import_zod2.z.boolean().optional()
|
|
4065
5007
|
}).refine((data) => data.command_template !== void 0 || data.commandTemplate !== void 0, {
|
|
4066
5008
|
message: "Either command_template or commandTemplate is required"
|
|
4067
5009
|
});
|
|
4068
|
-
var CliHealthcheckHttpSchema =
|
|
4069
|
-
type:
|
|
4070
|
-
url:
|
|
4071
|
-
timeoutMs:
|
|
5010
|
+
var CliHealthcheckHttpSchema = import_zod2.z.object({
|
|
5011
|
+
type: import_zod2.z.literal("http"),
|
|
5012
|
+
url: import_zod2.z.string().min(1),
|
|
5013
|
+
timeoutMs: import_zod2.z.number().positive().optional()
|
|
4072
5014
|
}).strict();
|
|
4073
|
-
var CliHealthcheckCommandSchema =
|
|
4074
|
-
type:
|
|
4075
|
-
commandTemplate:
|
|
4076
|
-
cwd:
|
|
4077
|
-
timeoutMs:
|
|
5015
|
+
var CliHealthcheckCommandSchema = import_zod2.z.object({
|
|
5016
|
+
type: import_zod2.z.literal("command"),
|
|
5017
|
+
commandTemplate: import_zod2.z.string().min(1),
|
|
5018
|
+
cwd: import_zod2.z.string().optional(),
|
|
5019
|
+
timeoutMs: import_zod2.z.number().positive().optional()
|
|
4078
5020
|
}).strict();
|
|
4079
|
-
var CliHealthcheckSchema =
|
|
5021
|
+
var CliHealthcheckSchema = import_zod2.z.discriminatedUnion("type", [
|
|
4080
5022
|
CliHealthcheckHttpSchema,
|
|
4081
5023
|
CliHealthcheckCommandSchema
|
|
4082
5024
|
]);
|
|
4083
|
-
var CliTargetConfigSchema =
|
|
4084
|
-
commandTemplate:
|
|
4085
|
-
filesFormat:
|
|
4086
|
-
cwd:
|
|
4087
|
-
timeoutMs:
|
|
5025
|
+
var CliTargetConfigSchema = import_zod2.z.object({
|
|
5026
|
+
commandTemplate: import_zod2.z.string().min(1),
|
|
5027
|
+
filesFormat: import_zod2.z.string().optional(),
|
|
5028
|
+
cwd: import_zod2.z.string().optional(),
|
|
5029
|
+
timeoutMs: import_zod2.z.number().positive().optional(),
|
|
4088
5030
|
healthcheck: CliHealthcheckSchema.optional(),
|
|
4089
|
-
verbose:
|
|
4090
|
-
keepTempFiles:
|
|
5031
|
+
verbose: import_zod2.z.boolean().optional(),
|
|
5032
|
+
keepTempFiles: import_zod2.z.boolean().optional()
|
|
4091
5033
|
}).strict();
|
|
4092
5034
|
function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
|
|
4093
5035
|
const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
|
|
@@ -4116,8 +5058,11 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
|
|
|
4116
5058
|
allowLiteral: true,
|
|
4117
5059
|
optionalEnv: true
|
|
4118
5060
|
});
|
|
4119
|
-
if (cwd && evalFilePath && !
|
|
4120
|
-
cwd =
|
|
5061
|
+
if (cwd && evalFilePath && !import_node_path13.default.isAbsolute(cwd)) {
|
|
5062
|
+
cwd = import_node_path13.default.resolve(import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath)), cwd);
|
|
5063
|
+
}
|
|
5064
|
+
if (!cwd && evalFilePath) {
|
|
5065
|
+
cwd = import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath));
|
|
4121
5066
|
}
|
|
4122
5067
|
return {
|
|
4123
5068
|
type: "command",
|
|
@@ -4144,11 +5089,11 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
|
|
|
4144
5089
|
allowLiteral: true,
|
|
4145
5090
|
optionalEnv: true
|
|
4146
5091
|
});
|
|
4147
|
-
if (cwd && evalFilePath && !
|
|
4148
|
-
cwd =
|
|
5092
|
+
if (cwd && evalFilePath && !import_node_path13.default.isAbsolute(cwd)) {
|
|
5093
|
+
cwd = import_node_path13.default.resolve(import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath)), cwd);
|
|
4149
5094
|
}
|
|
4150
5095
|
if (!cwd && evalFilePath) {
|
|
4151
|
-
cwd =
|
|
5096
|
+
cwd = import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath));
|
|
4152
5097
|
}
|
|
4153
5098
|
const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
|
|
4154
5099
|
const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
|
|
@@ -4175,11 +5120,11 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
|
|
|
4175
5120
|
"FILES",
|
|
4176
5121
|
"OUTPUT_FILE"
|
|
4177
5122
|
]);
|
|
4178
|
-
var BASE_TARGET_SCHEMA =
|
|
4179
|
-
name:
|
|
4180
|
-
provider:
|
|
4181
|
-
judge_target:
|
|
4182
|
-
workers:
|
|
5123
|
+
var BASE_TARGET_SCHEMA = import_zod2.z.object({
|
|
5124
|
+
name: import_zod2.z.string().min(1, "target name is required"),
|
|
5125
|
+
provider: import_zod2.z.string().min(1, "provider is required"),
|
|
5126
|
+
judge_target: import_zod2.z.string().optional(),
|
|
5127
|
+
workers: import_zod2.z.number().int().min(1).optional()
|
|
4183
5128
|
}).passthrough();
|
|
4184
5129
|
var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
|
|
4185
5130
|
function normalizeAzureApiVersion(value) {
|
|
@@ -4282,6 +5227,15 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
4282
5227
|
providerBatching,
|
|
4283
5228
|
config: resolvePiCodingAgentConfig(parsed, env)
|
|
4284
5229
|
};
|
|
5230
|
+
case "claude-code":
|
|
5231
|
+
return {
|
|
5232
|
+
kind: "claude-code",
|
|
5233
|
+
name: parsed.name,
|
|
5234
|
+
judgeTarget: parsed.judge_target,
|
|
5235
|
+
workers: parsed.workers,
|
|
5236
|
+
providerBatching,
|
|
5237
|
+
config: resolveClaudeCodeConfig(parsed, env)
|
|
5238
|
+
};
|
|
4285
5239
|
case "mock":
|
|
4286
5240
|
return {
|
|
4287
5241
|
kind: "mock",
|
|
@@ -4466,34 +5420,92 @@ function resolvePiCodingAgentConfig(target, env) {
|
|
|
4466
5420
|
const thinking = resolveOptionalString(thinkingSource, env, `${target.name} pi thinking`, {
|
|
4467
5421
|
allowLiteral: true,
|
|
4468
5422
|
optionalEnv: true
|
|
4469
|
-
});
|
|
4470
|
-
const args = resolveOptionalStringArray(argsSource, env, `${target.name} pi args`);
|
|
4471
|
-
const cwd = resolveOptionalString(cwdSource, env, `${target.name} pi cwd`, {
|
|
5423
|
+
});
|
|
5424
|
+
const args = resolveOptionalStringArray(argsSource, env, `${target.name} pi args`);
|
|
5425
|
+
const cwd = resolveOptionalString(cwdSource, env, `${target.name} pi cwd`, {
|
|
5426
|
+
allowLiteral: true,
|
|
5427
|
+
optionalEnv: true
|
|
5428
|
+
});
|
|
5429
|
+
const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi timeout`);
|
|
5430
|
+
const logDir = resolveOptionalString(logDirSource, env, `${target.name} pi log directory`, {
|
|
5431
|
+
allowLiteral: true,
|
|
5432
|
+
optionalEnv: true
|
|
5433
|
+
});
|
|
5434
|
+
const logFormat = logFormatSource === "json" || logFormatSource === "summary" ? logFormatSource : void 0;
|
|
5435
|
+
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
5436
|
+
return {
|
|
5437
|
+
executable,
|
|
5438
|
+
provider,
|
|
5439
|
+
model,
|
|
5440
|
+
apiKey,
|
|
5441
|
+
tools,
|
|
5442
|
+
thinking,
|
|
5443
|
+
args,
|
|
5444
|
+
cwd,
|
|
5445
|
+
timeoutMs,
|
|
5446
|
+
logDir,
|
|
5447
|
+
logFormat,
|
|
5448
|
+
systemPrompt
|
|
5449
|
+
};
|
|
5450
|
+
}
|
|
5451
|
+
function resolveClaudeCodeConfig(target, env) {
|
|
5452
|
+
const executableSource = target.executable ?? target.command ?? target.binary;
|
|
5453
|
+
const modelSource = target.model;
|
|
5454
|
+
const argsSource = target.args ?? target.arguments;
|
|
5455
|
+
const cwdSource = target.cwd;
|
|
5456
|
+
const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
|
|
5457
|
+
const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
|
|
5458
|
+
const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CLAUDE_CODE_LOG_FORMAT;
|
|
5459
|
+
const systemPromptSource = target.system_prompt ?? target.systemPrompt;
|
|
5460
|
+
const executable = resolveOptionalString(executableSource, env, `${target.name} claude-code executable`, {
|
|
5461
|
+
allowLiteral: true,
|
|
5462
|
+
optionalEnv: true
|
|
5463
|
+
}) ?? "claude";
|
|
5464
|
+
const model = resolveOptionalString(modelSource, env, `${target.name} claude-code model`, {
|
|
4472
5465
|
allowLiteral: true,
|
|
4473
5466
|
optionalEnv: true
|
|
4474
5467
|
});
|
|
4475
|
-
const
|
|
4476
|
-
const
|
|
5468
|
+
const args = resolveOptionalStringArray(argsSource, env, `${target.name} claude-code args`);
|
|
5469
|
+
const cwd = resolveOptionalString(cwdSource, env, `${target.name} claude-code cwd`, {
|
|
4477
5470
|
allowLiteral: true,
|
|
4478
5471
|
optionalEnv: true
|
|
4479
5472
|
});
|
|
4480
|
-
const
|
|
5473
|
+
const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} claude-code timeout`);
|
|
5474
|
+
const logDir = resolveOptionalString(
|
|
5475
|
+
logDirSource,
|
|
5476
|
+
env,
|
|
5477
|
+
`${target.name} claude-code log directory`,
|
|
5478
|
+
{
|
|
5479
|
+
allowLiteral: true,
|
|
5480
|
+
optionalEnv: true
|
|
5481
|
+
}
|
|
5482
|
+
);
|
|
5483
|
+
const logFormat = normalizeClaudeCodeLogFormat(logFormatSource);
|
|
4481
5484
|
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
4482
5485
|
return {
|
|
4483
5486
|
executable,
|
|
4484
|
-
provider,
|
|
4485
5487
|
model,
|
|
4486
|
-
|
|
4487
|
-
tools,
|
|
4488
|
-
thinking,
|
|
5488
|
+
systemPrompt,
|
|
4489
5489
|
args,
|
|
4490
5490
|
cwd,
|
|
4491
5491
|
timeoutMs,
|
|
4492
5492
|
logDir,
|
|
4493
|
-
logFormat
|
|
4494
|
-
systemPrompt
|
|
5493
|
+
logFormat
|
|
4495
5494
|
};
|
|
4496
5495
|
}
|
|
5496
|
+
function normalizeClaudeCodeLogFormat(value) {
|
|
5497
|
+
if (value === void 0 || value === null) {
|
|
5498
|
+
return void 0;
|
|
5499
|
+
}
|
|
5500
|
+
if (typeof value !== "string") {
|
|
5501
|
+
throw new Error("claude-code log format must be 'summary' or 'json'");
|
|
5502
|
+
}
|
|
5503
|
+
const normalized = value.trim().toLowerCase();
|
|
5504
|
+
if (normalized === "json" || normalized === "summary") {
|
|
5505
|
+
return normalized;
|
|
5506
|
+
}
|
|
5507
|
+
throw new Error("claude-code log format must be 'summary' or 'json'");
|
|
5508
|
+
}
|
|
4497
5509
|
function resolveMockConfig(target) {
|
|
4498
5510
|
const response = typeof target.response === "string" ? target.response : void 0;
|
|
4499
5511
|
return { response };
|
|
@@ -4529,13 +5541,13 @@ function resolveVSCodeConfig(target, env, insiders) {
|
|
|
4529
5541
|
};
|
|
4530
5542
|
}
|
|
4531
5543
|
var cliErrorMap = (issue, ctx) => {
|
|
4532
|
-
if (issue.code ===
|
|
5544
|
+
if (issue.code === import_zod2.z.ZodIssueCode.unrecognized_keys) {
|
|
4533
5545
|
return { message: `Unknown CLI provider settings: ${issue.keys.join(", ")}` };
|
|
4534
5546
|
}
|
|
4535
|
-
if (issue.code ===
|
|
5547
|
+
if (issue.code === import_zod2.z.ZodIssueCode.invalid_union_discriminator) {
|
|
4536
5548
|
return { message: "healthcheck type must be 'http' or 'command'" };
|
|
4537
5549
|
}
|
|
4538
|
-
if (issue.code ===
|
|
5550
|
+
if (issue.code === import_zod2.z.ZodIssueCode.invalid_type && issue.expected === "string") {
|
|
4539
5551
|
return { message: `${ctx.defaultError} (expected a string value)` };
|
|
4540
5552
|
}
|
|
4541
5553
|
return { message: ctx.defaultError };
|
|
@@ -4544,8 +5556,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
4544
5556
|
const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
|
|
4545
5557
|
if (!parseResult.success) {
|
|
4546
5558
|
const firstError = parseResult.error.errors[0];
|
|
4547
|
-
const
|
|
4548
|
-
const prefix =
|
|
5559
|
+
const path17 = firstError?.path.join(".") || "";
|
|
5560
|
+
const prefix = path17 ? `${target.name} ${path17}: ` : `${target.name}: `;
|
|
4549
5561
|
throw new Error(`${prefix}${firstError?.message}`);
|
|
4550
5562
|
}
|
|
4551
5563
|
const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
|
|
@@ -4733,7 +5745,7 @@ function resolveOptionalNumberArray(source, description) {
|
|
|
4733
5745
|
}
|
|
4734
5746
|
|
|
4735
5747
|
// src/evaluation/providers/vscode.ts
|
|
4736
|
-
var
|
|
5748
|
+
var import_node_path14 = __toESM(require("path"), 1);
|
|
4737
5749
|
var import_subagent = require("subagent");
|
|
4738
5750
|
|
|
4739
5751
|
// src/evaluation/providers/vscode-templates.ts
|
|
@@ -4903,7 +5915,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
4903
5915
|
return "";
|
|
4904
5916
|
}
|
|
4905
5917
|
const buildList = (files) => files.map((absolutePath) => {
|
|
4906
|
-
const fileName =
|
|
5918
|
+
const fileName = import_node_path14.default.basename(absolutePath);
|
|
4907
5919
|
const fileUri = pathToFileUri2(absolutePath);
|
|
4908
5920
|
return `* [${fileName}](${fileUri})`;
|
|
4909
5921
|
});
|
|
@@ -4928,8 +5940,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
4928
5940
|
}
|
|
4929
5941
|
const unique = /* @__PURE__ */ new Map();
|
|
4930
5942
|
for (const attachment of attachments) {
|
|
4931
|
-
const absolutePath =
|
|
4932
|
-
const normalized = absolutePath.split(
|
|
5943
|
+
const absolutePath = import_node_path14.default.resolve(attachment);
|
|
5944
|
+
const normalized = absolutePath.split(import_node_path14.default.sep).join("/");
|
|
4933
5945
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
4934
5946
|
if (!unique.has(absolutePath)) {
|
|
4935
5947
|
unique.set(absolutePath, absolutePath);
|
|
@@ -4944,7 +5956,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
4944
5956
|
}
|
|
4945
5957
|
const unique = /* @__PURE__ */ new Map();
|
|
4946
5958
|
for (const attachment of attachments) {
|
|
4947
|
-
const absolutePath =
|
|
5959
|
+
const absolutePath = import_node_path14.default.resolve(attachment);
|
|
4948
5960
|
if (!unique.has(absolutePath)) {
|
|
4949
5961
|
unique.set(absolutePath, absolutePath);
|
|
4950
5962
|
}
|
|
@@ -4952,7 +5964,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
4952
5964
|
return Array.from(unique.values());
|
|
4953
5965
|
}
|
|
4954
5966
|
function pathToFileUri2(filePath) {
|
|
4955
|
-
const absolutePath =
|
|
5967
|
+
const absolutePath = import_node_path14.default.isAbsolute(filePath) ? filePath : import_node_path14.default.resolve(filePath);
|
|
4956
5968
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
4957
5969
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
4958
5970
|
return `file:///${normalizedPath}`;
|
|
@@ -4965,7 +5977,7 @@ function normalizeAttachments(attachments) {
|
|
|
4965
5977
|
}
|
|
4966
5978
|
const deduped = /* @__PURE__ */ new Set();
|
|
4967
5979
|
for (const attachment of attachments) {
|
|
4968
|
-
deduped.add(
|
|
5980
|
+
deduped.add(import_node_path14.default.resolve(attachment));
|
|
4969
5981
|
}
|
|
4970
5982
|
return Array.from(deduped);
|
|
4971
5983
|
}
|
|
@@ -4974,7 +5986,7 @@ function mergeAttachments(all) {
|
|
|
4974
5986
|
for (const list of all) {
|
|
4975
5987
|
if (!list) continue;
|
|
4976
5988
|
for (const inputFile of list) {
|
|
4977
|
-
deduped.add(
|
|
5989
|
+
deduped.add(import_node_path14.default.resolve(inputFile));
|
|
4978
5990
|
}
|
|
4979
5991
|
}
|
|
4980
5992
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -5021,9 +6033,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
5021
6033
|
}
|
|
5022
6034
|
|
|
5023
6035
|
// src/evaluation/providers/targets-file.ts
|
|
5024
|
-
var
|
|
5025
|
-
var
|
|
5026
|
-
var
|
|
6036
|
+
var import_node_fs6 = require("fs");
|
|
6037
|
+
var import_promises12 = require("fs/promises");
|
|
6038
|
+
var import_node_path15 = __toESM(require("path"), 1);
|
|
5027
6039
|
var import_yaml3 = require("yaml");
|
|
5028
6040
|
function isRecord(value) {
|
|
5029
6041
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -5053,18 +6065,18 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
5053
6065
|
}
|
|
5054
6066
|
async function fileExists3(filePath) {
|
|
5055
6067
|
try {
|
|
5056
|
-
await (0,
|
|
6068
|
+
await (0, import_promises12.access)(filePath, import_node_fs6.constants.F_OK);
|
|
5057
6069
|
return true;
|
|
5058
6070
|
} catch {
|
|
5059
6071
|
return false;
|
|
5060
6072
|
}
|
|
5061
6073
|
}
|
|
5062
6074
|
async function readTargetDefinitions(filePath) {
|
|
5063
|
-
const absolutePath =
|
|
6075
|
+
const absolutePath = import_node_path15.default.resolve(filePath);
|
|
5064
6076
|
if (!await fileExists3(absolutePath)) {
|
|
5065
6077
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
5066
6078
|
}
|
|
5067
|
-
const raw = await (0,
|
|
6079
|
+
const raw = await (0, import_promises12.readFile)(absolutePath, "utf8");
|
|
5068
6080
|
const parsed = (0, import_yaml3.parse)(raw);
|
|
5069
6081
|
if (!isRecord(parsed)) {
|
|
5070
6082
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
@@ -5094,6 +6106,8 @@ function createProvider(target) {
|
|
|
5094
6106
|
return new CodexProvider(target.name, target.config);
|
|
5095
6107
|
case "pi-coding-agent":
|
|
5096
6108
|
return new PiCodingAgentProvider(target.name, target.config);
|
|
6109
|
+
case "claude-code":
|
|
6110
|
+
return new ClaudeCodeProvider(target.name, target.config);
|
|
5097
6111
|
case "mock":
|
|
5098
6112
|
return new MockProvider(target.name, target.config);
|
|
5099
6113
|
case "vscode":
|
|
@@ -5112,78 +6126,199 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
5112
6126
|
|
|
5113
6127
|
// src/evaluation/evaluators.ts
|
|
5114
6128
|
var import_ai2 = require("ai");
|
|
5115
|
-
var
|
|
6129
|
+
var import_zod3 = require("zod");
|
|
5116
6130
|
|
|
5117
6131
|
// src/runtime/exec.ts
|
|
5118
|
-
function
|
|
5119
|
-
|
|
5120
|
-
|
|
6132
|
+
function shellEscapePath(value) {
|
|
6133
|
+
if (process.platform === "win32") {
|
|
6134
|
+
return `"${value.replaceAll('"', '""')}"`;
|
|
6135
|
+
}
|
|
6136
|
+
return `'${value.replaceAll("'", `'"'"'`)}'`;
|
|
5121
6137
|
}
|
|
5122
|
-
async function
|
|
5123
|
-
|
|
5124
|
-
|
|
5125
|
-
|
|
5126
|
-
|
|
5127
|
-
|
|
5128
|
-
|
|
5129
|
-
|
|
5130
|
-
|
|
5131
|
-
|
|
5132
|
-
|
|
5133
|
-
|
|
5134
|
-
|
|
5135
|
-
|
|
5136
|
-
|
|
5137
|
-
|
|
5138
|
-
|
|
5139
|
-
|
|
5140
|
-
|
|
5141
|
-
|
|
5142
|
-
|
|
5143
|
-
|
|
5144
|
-
|
|
6138
|
+
async function execFileWithStdin(argv, stdinPayload, options = {}) {
|
|
6139
|
+
if (argv.length === 0) {
|
|
6140
|
+
throw new Error("Executable argv must include at least one entry");
|
|
6141
|
+
}
|
|
6142
|
+
if (typeof Bun !== "undefined") {
|
|
6143
|
+
return execFileWithStdinBun(argv, stdinPayload, options);
|
|
6144
|
+
}
|
|
6145
|
+
return execFileWithStdinNode(argv, stdinPayload, options);
|
|
6146
|
+
}
|
|
6147
|
+
async function execFileWithStdinBun(argv, stdinPayload, options) {
|
|
6148
|
+
const command = [...argv];
|
|
6149
|
+
const encoder = new TextEncoder();
|
|
6150
|
+
const proc = Bun.spawn(command, {
|
|
6151
|
+
cwd: options.cwd,
|
|
6152
|
+
stdin: encoder.encode(stdinPayload),
|
|
6153
|
+
stdout: "pipe",
|
|
6154
|
+
stderr: "pipe"
|
|
6155
|
+
});
|
|
6156
|
+
let timedOut = false;
|
|
6157
|
+
const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
|
|
6158
|
+
timedOut = true;
|
|
6159
|
+
proc.kill("SIGKILL");
|
|
6160
|
+
}, options.timeoutMs) : void 0;
|
|
6161
|
+
try {
|
|
6162
|
+
const stdoutPromise = proc.stdout ? new Response(proc.stdout).text() : Promise.resolve("");
|
|
6163
|
+
const stderrPromise = proc.stderr ? new Response(proc.stderr).text() : Promise.resolve("");
|
|
6164
|
+
const [stdout, stderr, exitCode] = await Promise.all([
|
|
6165
|
+
stdoutPromise,
|
|
6166
|
+
stderrPromise,
|
|
6167
|
+
proc.exited
|
|
6168
|
+
]);
|
|
6169
|
+
if (timedOut) {
|
|
6170
|
+
throw new Error(`Process timed out after ${options.timeoutMs}ms`);
|
|
6171
|
+
}
|
|
6172
|
+
return {
|
|
6173
|
+
stdout: stdout.replace(/\r\n/g, "\n"),
|
|
6174
|
+
stderr: stderr.replace(/\r\n/g, "\n"),
|
|
6175
|
+
exitCode
|
|
6176
|
+
};
|
|
6177
|
+
} finally {
|
|
6178
|
+
if (timeout !== void 0) {
|
|
6179
|
+
clearTimeout(timeout);
|
|
5145
6180
|
}
|
|
5146
6181
|
}
|
|
5147
|
-
|
|
5148
|
-
|
|
5149
|
-
|
|
5150
|
-
|
|
6182
|
+
}
|
|
6183
|
+
/**
 * Spawns `argv` (no shell) with Node's `child_process.spawn`, writes
 * `stdinPayload` to the child's stdin and collects its stdout/stderr.
 *
 * @param {string[]} argv - Executable followed by its arguments.
 * @param {string|Uint8Array} stdinPayload - Data written to the child's stdin.
 * @param {{cwd?: string, timeoutMs?: number}} options - Working directory and
 *   optional timeout; on timeout the child is sent SIGKILL.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 *   Output decoded as UTF-8 with CRLF normalised to LF. A child terminated by
 *   a signal (other than our own timeout kill) is reported with exit code 1.
 * @throws {Error} Rejects on spawn failure, on timeout, or on a stdin write
 *   error other than the child closing its stdin early.
 */
async function execFileWithStdinNode(argv, stdinPayload, options) {
  const { spawn: spawn4 } = await import("child_process");
  return new Promise((resolve, reject) => {
    const [cmd, ...args] = argv;
    const child = spawn4(cmd, args, {
      cwd: options.cwd,
      stdio: ["pipe", "pipe", "pipe"]
    });
    const stdoutChunks = [];
    const stderrChunks = [];
    child.stdout?.on("data", (chunk) => stdoutChunks.push(chunk));
    child.stderr?.on("data", (chunk) => stderrChunks.push(chunk));
    let timedOut = false;
    const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
      timedOut = true;
      child.kill("SIGKILL");
    }, options.timeoutMs) : void 0;
    const stopTimer = () => {
      if (timeout !== void 0) clearTimeout(timeout);
    };
    child.on("error", (error) => {
      stopTimer();
      reject(error);
    });
    child.on("close", (code, signal) => {
      stopTimer();
      if (timedOut) {
        reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
        return;
      }
      const stdout = Buffer.concat(stdoutChunks).toString("utf8").replace(/\r\n/g, "\n");
      const stderr = Buffer.concat(stderrChunks).toString("utf8").replace(/\r\n/g, "\n");
      resolve({
        stdout,
        stderr,
        // `code` is null when the child was terminated by a signal; that must
        // not be reported as a successful (0) exit.
        exitCode: code ?? (signal ? 1 : 0)
      });
    });
    if (child.stdin) {
      // Without a listener, a child that exits before draining stdin raises an
      // unhandled stream error. Early-close errors are ignored because the
      // "close" handler still reports the child's outcome.
      child.stdin.on("error", (error) => {
        if (error?.code === "EPIPE" || error?.code === "ERR_STREAM_DESTROYED") {
          return;
        }
        stopTimer();
        reject(error);
      });
      child.stdin.write(stdinPayload);
      child.stdin.end();
    }
  });
}
|
|
6224
|
+
/**
 * Runs `command` through a shell with stdin, stdout and stderr redirected to
 * files inside a fresh temporary directory, then returns the captured output.
 *
 * @param {string} command - Shell command line; it is wrapped in parentheses
 *   and combined with `<`, `>` and `2>` redirections.
 * @param {string} stdinPayload - Text written to the stdin file (UTF-8).
 * @param {{cwd?: string, timeoutMs?: number}} [options] - Working directory
 *   and optional timeout. A falsy `timeoutMs` (including 0) disables the timer.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 *   Captured output with CRLF normalised to LF; a null exit code maps to 0.
 * @throws {Error} On spawn failure, timeout, or file-system errors.
 */
async function execShellWithStdin(command, stdinPayload, options = {}) {
  const { mkdir: mkdir4, readFile: readFile8, rm: rm4, writeFile: writeFile4 } = await import("fs/promises");
  const { tmpdir: tmpdir4 } = await import("os");
  const path17 = await import("path");
  const { randomUUID: randomUUID4 } = await import("crypto");
  const { spawn: spawn4 } = await import("child_process");
  const dir = path17.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
  const stdinPath = path17.join(dir, "stdin.txt");
  const stdoutPath = path17.join(dir, "stdout.txt");
  const stderrPath = path17.join(dir, "stderr.txt");
  // Directory creation and the stdin write live inside the try so the
  // directory is removed even when one of them fails.
  try {
    await mkdir4(dir, { recursive: true });
    await writeFile4(stdinPath, stdinPayload, "utf8");
    // shellEscapePath is defined elsewhere in this file; presumably it quotes
    // a path for the shell — verify there. The same redirection syntax is used
    // on every platform.
    const wrappedCommand = `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
    const exitCode = await new Promise((resolve, reject) => {
      const child = spawn4(wrappedCommand, {
        shell: true,
        cwd: options.cwd,
        stdio: ["ignore", "ignore", "ignore"]
      });
      const timeout = options.timeoutMs ? setTimeout(() => {
        child.kill();
        reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
      }, options.timeoutMs) : void 0;
      child.on("error", (error) => {
        if (timeout !== void 0) {
          clearTimeout(timeout);
        }
        reject(error);
      });
      child.on("exit", (code) => {
        if (timeout !== void 0) {
          clearTimeout(timeout);
        }
        resolve(code ?? 0);
      });
    });
    const stdout = (await readFile8(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
    const stderr = (await readFile8(stderrPath, "utf8")).replace(/\r\n/g, "\n");
    return { stdout, stderr, exitCode };
  } finally {
    await rm4(dir, { recursive: true, force: true });
  }
}
|
|
6268
|
+
|
|
6269
|
+
// src/evaluation/case-conversion.ts
|
|
6270
|
+
/**
 * Converts a camelCase identifier to snake_case.
 *
 * Strings that begin with an uppercase ASCII letter are returned unchanged;
 * otherwise every uppercase ASCII letter becomes `_` plus its lowercase form.
 *
 * @param {string} str
 * @returns {string}
 */
function toSnakeCase(str) {
  const startsUppercase = /^[A-Z]/.test(str);
  return startsUppercase
    ? str
    : str.replace(/[A-Z]/g, (upper) => `_${upper.toLowerCase()}`);
}
|
|
6276
|
+
/**
 * Converts a snake_case identifier to camelCase.
 *
 * Strings that begin with an uppercase ASCII letter are returned unchanged;
 * otherwise each `_` followed by a lowercase letter or digit is replaced by
 * that character upper-cased.
 *
 * @param {string} str
 * @returns {string}
 */
function toCamelCase(str) {
  const startsUppercase = /^[A-Z]/.test(str);
  return startsUppercase
    ? str
    : str.replace(/_([a-z0-9])/g, (_whole, next) => next.toUpperCase());
}
|
|
6282
|
+
/**
 * Recursively rewrites object keys with `toSnakeCase`.
 *
 * Arrays are mapped element by element. `null`, `undefined` and primitives
 * are returned as-is. Built-in value objects (Date, RegExp, Error, Map, Set,
 * typed arrays and other ArrayBuffer views) are also returned untouched:
 * enumerating them with `Object.entries` would replace them with an empty or
 * index-keyed plain object and lose their data.
 *
 * @param {*} obj
 * @returns {*} A new structure with converted keys.
 */
function toSnakeCaseDeep(obj) {
  if (obj === null || obj === void 0) {
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map((item) => toSnakeCaseDeep(item));
  }
  if (typeof obj !== "object") {
    return obj;
  }
  const isValueObject = obj instanceof Date || obj instanceof RegExp || obj instanceof Error || obj instanceof Map || obj instanceof Set || ArrayBuffer.isView(obj);
  if (isValueObject) {
    return obj;
  }
  const result = {};
  for (const [key, value] of Object.entries(obj)) {
    result[toSnakeCase(key)] = toSnakeCaseDeep(value);
  }
  return result;
}
|
|
6299
|
+
/**
 * Recursively rewrites object keys with `toCamelCase`.
 *
 * Arrays are mapped element by element. `null`, `undefined` and primitives
 * are returned as-is. Built-in value objects (Date, RegExp, Error, Map, Set,
 * typed arrays and other ArrayBuffer views) are also returned untouched:
 * enumerating them with `Object.entries` would replace them with an empty or
 * index-keyed plain object and lose their data.
 *
 * @param {*} obj
 * @returns {*} A new structure with converted keys.
 */
function toCamelCaseDeep(obj) {
  if (obj === null || obj === void 0) {
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map((item) => toCamelCaseDeep(item));
  }
  if (typeof obj !== "object") {
    return obj;
  }
  const isValueObject = obj instanceof Date || obj instanceof RegExp || obj instanceof Error || obj instanceof Map || obj instanceof Set || ArrayBuffer.isView(obj);
  if (isValueObject) {
    return obj;
  }
  const result = {};
  for (const [key, value] of Object.entries(obj)) {
    result[toCamelCase(key)] = toCamelCaseDeep(value);
  }
  return result;
}
|
|
5182
6316
|
|
|
5183
6317
|
// src/evaluation/providers/types.ts
|
|
5184
6318
|
/** Provider kind identifiers grouped under the "agent provider" label. */
var AGENT_PROVIDER_KINDS = ["codex", "pi-coding-agent", "claude-code", "vscode", "vscode-insiders"];
|
|
@@ -5224,20 +6359,20 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
5224
6359
|
|
|
5225
6360
|
[[ ## candidate_answer ## ]]
|
|
5226
6361
|
{{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
|
|
5227
|
-
var freeformEvaluationSchema =
|
|
5228
|
-
score:
|
|
5229
|
-
hits:
|
|
5230
|
-
misses:
|
|
5231
|
-
reasoning:
|
|
6362
|
+
/**
 * Zod schema for a free-form evaluation result: a required score in [0, 1]
 * plus optional hits, misses and reasoning.
 */
var freeformEvaluationSchema = import_zod3.z.object({
  score: import_zod3.z
    .number()
    .min(0)
    .max(1)
    .describe("Score between 0.0 and 1.0"),
  hits: import_zod3.z
    .array(import_zod3.z.string())
    .describe("Brief specific achievements")
    .optional(),
  misses: import_zod3.z
    .array(import_zod3.z.string())
    .describe("Brief failures or omissions")
    .optional(),
  reasoning: import_zod3.z
    .string()
    .describe("Concise explanation (1-2 sentences)")
    .optional()
});
|
|
5233
|
-
var rubricCheckResultSchema =
|
|
5234
|
-
id:
|
|
5235
|
-
satisfied:
|
|
5236
|
-
reasoning:
|
|
6368
|
+
/**
 * Zod schema for the outcome of checking one rubric item: its id, whether it
 * was satisfied, and a short reasoning string (all required).
 */
var rubricCheckResultSchema = import_zod3.z.object({
  id: import_zod3.z
    .string()
    .describe("The ID of the rubric item being checked"),
  satisfied: import_zod3.z
    .boolean()
    .describe("Whether this rubric requirement is met"),
  reasoning: import_zod3.z
    .string()
    .describe("Brief explanation (1-2 sentences) for this check")
});
|
|
5238
|
-
var rubricEvaluationSchema =
|
|
5239
|
-
checks:
|
|
5240
|
-
overall_reasoning:
|
|
6373
|
+
/**
 * Zod schema for a full rubric evaluation: the per-item check results and an
 * overall reasoning summary (both required).
 */
var rubricEvaluationSchema = import_zod3.z.object({
  checks: import_zod3.z
    .array(rubricCheckResultSchema)
    .describe("Results for each rubric item"),
  overall_reasoning: import_zod3.z
    .string()
    .describe("Overall assessment summary (1-2 sentences)")
});
|
|
5242
6377
|
var LlmJudgeEvaluator = class {
|
|
5243
6378
|
kind = "llm_judge";
|
|
@@ -5473,30 +6608,30 @@ var CodeEvaluator = class {
|
|
|
5473
6608
|
script;
|
|
5474
6609
|
cwd;
|
|
5475
6610
|
agentTimeoutMs;
|
|
6611
|
+
config;
|
|
5476
6612
|
constructor(options) {
|
|
5477
6613
|
this.script = options.script;
|
|
5478
6614
|
this.cwd = options.cwd;
|
|
5479
6615
|
this.agentTimeoutMs = options.agentTimeoutMs;
|
|
6616
|
+
this.config = options.config;
|
|
5480
6617
|
}
|
|
5481
6618
|
async evaluate(context) {
|
|
5482
|
-
const
|
|
5483
|
-
|
|
5484
|
-
|
|
5485
|
-
|
|
5486
|
-
|
|
5487
|
-
|
|
5488
|
-
|
|
5489
|
-
|
|
5490
|
-
|
|
5491
|
-
|
|
5492
|
-
|
|
5493
|
-
|
|
5494
|
-
|
|
5495
|
-
|
|
5496
|
-
|
|
5497
|
-
|
|
5498
|
-
2
|
|
5499
|
-
);
|
|
6619
|
+
const payload = {
|
|
6620
|
+
question: context.evalCase.question,
|
|
6621
|
+
expectedOutcome: context.evalCase.expected_outcome,
|
|
6622
|
+
expectedMessages: context.evalCase.expected_messages,
|
|
6623
|
+
referenceAnswer: context.evalCase.reference_answer,
|
|
6624
|
+
candidateAnswer: context.candidate,
|
|
6625
|
+
outputMessages: context.outputMessages ?? null,
|
|
6626
|
+
guidelineFiles: context.evalCase.guideline_paths,
|
|
6627
|
+
inputFiles: context.evalCase.file_paths.filter(
|
|
6628
|
+
(path17) => !context.evalCase.guideline_paths.includes(path17)
|
|
6629
|
+
),
|
|
6630
|
+
inputMessages: context.evalCase.input_messages,
|
|
6631
|
+
traceSummary: context.traceSummary ?? null,
|
|
6632
|
+
config: this.config ?? null
|
|
6633
|
+
};
|
|
6634
|
+
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
5500
6635
|
try {
|
|
5501
6636
|
const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
|
|
5502
6637
|
const parsed = parseJsonSafe(stdout);
|
|
@@ -5562,18 +6697,25 @@ function calculateRubricScore(result, rubrics) {
|
|
|
5562
6697
|
return { score, verdict, hits, misses };
|
|
5563
6698
|
}
|
|
5564
6699
|
/**
 * Runs a code-evaluator script, feeding it `input` on stdin.
 *
 * A string `scriptPath` is executed via `execShellWithStdin`; any other value
 * is passed to `execFileWithStdin`.
 *
 * @param {string|string[]} scriptPath - Shell command string, or a non-string
 *   value handed to `execFileWithStdin` (presumably an argv array — confirm).
 * @param {string} input - Payload written to the script's stdin.
 * @param {number|undefined} agentTimeoutMs - Forwarded as `timeoutMs`.
 * @param {string|undefined} cwd - Working directory for the script.
 * @returns {Promise<string>} The script's trimmed stdout.
 * @throws {Error} When the script exits with a non-zero code.
 */
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
  let result;
  if (typeof scriptPath === "string") {
    result = await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs });
  } else {
    result = await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs });
  }
  const { stdout, stderr, exitCode } = result;
  if (exitCode === 0) {
    return stdout.trim();
  }
  const trimmedErr = formatStderr(stderr);
  if (trimmedErr.length > 0) {
    throw new Error(`Code evaluator exited with code ${exitCode}: ${trimmedErr}`);
  }
  throw new Error(`Code evaluator exited with code ${exitCode}`);
}
|
|
6709
|
+
/**
 * Trims stderr text and, when it is longer than `maxLength`, keeps only the
 * last `maxLength` characters behind a truncation notice.
 *
 * @param {string} stderr - Raw stderr text.
 * @param {number} [maxLength=2000] - Maximum number of characters to keep.
 * @returns {string} The trimmed text, or the notice plus the retained tail.
 */
function formatStderr(stderr, maxLength = 2e3) {
  const trimmed = stderr.trim();
  if (trimmed.length <= maxLength) {
    return trimmed;
  }
  // Slice from an explicit start index: slice(-0) would return the whole string.
  const tail = trimmed.slice(trimmed.length - maxLength);
  return `...(truncated, last ${maxLength} chars)\n${tail}`;
}
|
|
5577
6719
|
function parseJsonSafe(payload) {
|
|
5578
6720
|
try {
|
|
5579
6721
|
return JSON.parse(payload);
|
|
@@ -5805,22 +6947,438 @@ var ToolTrajectoryEvaluator = class {
|
|
|
5805
6947
|
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
5806
6948
|
}
|
|
5807
6949
|
} else {
|
|
5808
|
-
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
6950
|
+
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
6951
|
+
}
|
|
6952
|
+
}
|
|
6953
|
+
for (let i = checkLength; i < expected.length; i++) {
|
|
6954
|
+
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
6955
|
+
}
|
|
6956
|
+
const score = hits.length / expected.length;
|
|
6957
|
+
return {
|
|
6958
|
+
score,
|
|
6959
|
+
verdict: scoreToVerdict(score),
|
|
6960
|
+
hits,
|
|
6961
|
+
misses,
|
|
6962
|
+
expectedAspectCount: expected.length
|
|
6963
|
+
};
|
|
6964
|
+
}
|
|
6965
|
+
};
|
|
6966
|
+
/**
 * Date format patterns used when a field config supplies no `formats` list.
 * Order: ISO with timezone, ISO with time, ISO date, localized
 * (e.g. "15-JAN-2025"), US slashes, EU slashes, US dashes, EU dashes.
 */
var DEFAULT_DATE_FORMATS = [
  "YYYY-MM-DDTHH:mm:ssZ",
  "YYYY-MM-DDTHH:mm:ss",
  "YYYY-MM-DD",
  "DD-MMM-YYYY",
  "MM/DD/YYYY",
  "DD/MM/YYYY",
  "MM-DD-YYYY",
  "DD-MM-YYYY"
];
|
|
6984
|
+
/**
 * Maps lowercase English month names and abbreviations to zero-based month
 * indexes (jan/january -> 0 ... dec/december -> 11).
 */
var MONTH_NAMES = Object.fromEntries(
  [
    ["jan", "january"],
    ["feb", "february"],
    ["mar", "march"],
    ["apr", "april"],
    ["may"],
    ["jun", "june"],
    ["jul", "july"],
    ["aug", "august"],
    ["sep", "sept", "september"],
    ["oct", "october"],
    ["nov", "november"],
    ["dec", "december"]
  ].flatMap((names, monthIndex) => names.map((name) => [name, monthIndex]))
);
|
|
7010
|
+
/**
 * Evaluator that parses the candidate answer as JSON and compares configured
 * fields against expected data taken from the eval case's expected_messages.
 */
var FieldAccuracyEvaluator = class {
  kind = "field_accuracy";
  config;
  /**
   * @param {{config: object}} options - `config.fields` lists the field checks;
   *   `config.aggregation` optionally selects the scoring strategy.
   */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Scores the candidate answer field by field.
   *
   * Returns a failing result when the candidate is not parseable JSON or when
   * no expected data can be extracted; otherwise aggregates per-field results.
   */
  evaluate(context) {
    const { evalCase, candidate } = context;
    let candidateData;
    try {
      candidateData = parseJsonFromTextSafe(candidate);
    } catch {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["Failed to parse candidate answer as JSON"],
        expectedAspectCount: this.config.fields.length,
        reasoning: "Candidate answer is not valid JSON"
      };
    }
    const expectedData = this.extractExpectedData(evalCase.expected_messages);
    if (!expectedData) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No expected data found in expected_messages"],
        expectedAspectCount: this.config.fields.length,
        reasoning: "Could not extract expected data from expected_messages"
      };
    }
    const fieldResults = [];
    for (const fieldConfig of this.config.fields) {
      const result = this.evaluateField(fieldConfig, candidateData, expectedData);
      fieldResults.push(result);
    }
    return this.aggregateResults(fieldResults);
  }
  /**
   * Extract expected data from expected_messages array.
   * Looks for the last assistant message with content: object content is
   * returned directly, string content is parsed as JSON (unparseable strings
   * are skipped). Returns undefined when nothing usable is found, including
   * when expected_messages is not an array.
   */
  extractExpectedData(expectedMessages) {
    if (!Array.isArray(expectedMessages)) {
      return void 0;
    }
    for (let i = expectedMessages.length - 1; i >= 0; i--) {
      const message = expectedMessages[i];
      if (message.role === "assistant" && message.content) {
        if (typeof message.content === "object" && message.content !== null) {
          return message.content;
        }
        if (typeof message.content === "string") {
          try {
            return parseJsonFromTextSafe(message.content);
          } catch {
            // Not JSON: keep scanning earlier messages.
          }
        }
      }
    }
    return void 0;
  }
  /**
   * Evaluate a single field against the expected value.
   * A field without an expected value counts as a hit; a missing candidate
   * value is a miss when required and a zero-weight hit when optional.
   */
  evaluateField(fieldConfig, candidateData, expectedData) {
    const { path: path17, match, required = true, weight = 1 } = fieldConfig;
    const candidateValue = resolvePath(candidateData, path17);
    const expectedValue = resolvePath(expectedData, path17);
    if (expectedValue === void 0) {
      return {
        path: path17,
        score: 1,
        // No expected value means no comparison needed
        weight,
        hit: true,
        message: `${path17}: no expected value`
      };
    }
    if (candidateValue === void 0) {
      if (required) {
        return {
          path: path17,
          score: 0,
          weight,
          hit: false,
          message: `${path17} (required, missing)`
        };
      }
      return {
        path: path17,
        score: 1,
        // Don't penalize missing optional fields
        weight: 0,
        // Zero weight means it won't affect the score
        hit: true,
        message: `${path17}: optional field missing`
      };
    }
    switch (match) {
      case "exact":
        return this.compareExact(path17, candidateValue, expectedValue, weight);
      case "numeric_tolerance":
        return this.compareNumericTolerance(
          path17,
          candidateValue,
          expectedValue,
          fieldConfig,
          weight
        );
      case "date":
        return this.compareDate(path17, candidateValue, expectedValue, fieldConfig, weight);
      default:
        return {
          path: path17,
          score: 0,
          weight,
          hit: false,
          message: `${path17}: unknown match type "${match}"`
        };
    }
  }
  /**
   * Exact equality comparison (via deepEqual); the miss message distinguishes
   * a type mismatch from a value mismatch.
   */
  compareExact(path17, candidateValue, expectedValue, weight) {
    if (deepEqual(candidateValue, expectedValue)) {
      return {
        path: path17,
        score: 1,
        weight,
        hit: true,
        message: path17
      };
    }
    if (typeof candidateValue !== typeof expectedValue) {
      return {
        path: path17,
        score: 0,
        weight,
        hit: false,
        message: `${path17} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
      };
    }
    return {
      path: path17,
      score: 0,
      weight,
      hit: false,
      message: `${path17} (value mismatch)`
    };
  }
  /**
   * Numeric comparison with absolute or relative tolerance.
   * With `relative`, the difference is divided by |expected|; when expected is
   * 0 the absolute difference is used instead.
   */
  compareNumericTolerance(path17, candidateValue, expectedValue, fieldConfig, weight) {
    const { tolerance = 0, relative = false } = fieldConfig;
    const candidateNum = toNumber(candidateValue);
    const expectedNum = toNumber(expectedValue);
    if (candidateNum === null || expectedNum === null) {
      return {
        path: path17,
        score: 0,
        weight,
        hit: false,
        message: `${path17} (non-numeric value)`
      };
    }
    if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
      return {
        path: path17,
        score: 0,
        weight,
        hit: false,
        message: `${path17} (invalid numeric value)`
      };
    }
    const diff = Math.abs(candidateNum - expectedNum);
    let withinTolerance;
    if (relative) {
      const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
      withinTolerance = relativeDiff <= tolerance;
    } else {
      withinTolerance = diff <= tolerance;
    }
    if (withinTolerance) {
      return {
        path: path17,
        score: 1,
        weight,
        hit: true,
        message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
      };
    }
    return {
      path: path17,
      score: 0,
      weight,
      hit: false,
      message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
    };
  }
  /**
   * Date comparison with format normalization.
   * Both values are stringified and parsed; they match when their local
   * year, month and day are equal.
   */
  compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
    const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
    const candidateDate = parseDate(String(candidateValue), formats);
    const expectedDate = parseDate(String(expectedValue), formats);
    if (candidateDate === null) {
      return {
        path: path17,
        score: 0,
        weight,
        hit: false,
        message: `${path17} (unparseable candidate date)`
      };
    }
    if (expectedDate === null) {
      return {
        path: path17,
        score: 0,
        weight,
        hit: false,
        message: `${path17} (unparseable expected date)`
      };
    }
    if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
      return {
        path: path17,
        score: 1,
        weight,
        hit: true,
        message: path17
      };
    }
    return {
      path: path17,
      score: 0,
      weight,
      hit: false,
      message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
    };
  }
  /**
   * Aggregate field results using configured strategy.
   * "all_or_nothing" scores 1 only when there are no misses; otherwise a
   * weighted average of field scores is used. Only the first four hits and
   * misses are reported.
   */
  aggregateResults(results) {
    const aggregation = this.config.aggregation ?? "weighted_average";
    const hits = [];
    const misses = [];
    for (const result of results) {
      if (result.hit) {
        hits.push(result.message);
      } else {
        misses.push(result.message);
      }
    }
    let score;
    if (aggregation === "all_or_nothing") {
      score = misses.length === 0 ? 1 : 0;
    } else {
      const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
      if (totalWeight === 0) {
        // No field carries weight (e.g. every field was optional and missing,
        // which is recorded as a zero-weight hit). Scoring that as 0 would
        // penalize exactly the case the zero weight is meant to excuse, so
        // fall back to "no misses means full score".
        score = misses.length === 0 ? 1 : 0;
      } else {
        const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
        score = weightedSum / totalWeight;
      }
    }
    const reasoning = `${hits.length}/${results.length} fields matched`;
    return {
      score: clampScore(score),
      verdict: scoreToVerdict(score),
      hits: hits.slice(0, 4),
      misses: misses.slice(0, 4),
      expectedAspectCount: results.length,
      reasoning
    };
  }
};
|
|
7289
|
+
/**
 * Looks up a nested value by a path such as "a.b[0].c".
 *
 * The path is split on ".", "[" and "]"; empty segments are dropped. All-digit
 * segments index into arrays numerically, everything else is a property key.
 *
 * @param {*} obj - Root value to walk.
 * @param {string} path17 - Dot/bracket path.
 * @returns {*} The resolved value, or undefined when the path or root is
 *   falsy or a non-object is reached before the path is exhausted.
 */
function resolvePath(obj, path17) {
  if (!path17 || !obj) {
    return void 0;
  }
  const segments = path17.split(/\.|\[|\]/).filter((p) => p.length > 0);
  let cursor = obj;
  for (const segment of segments) {
    const walkable = cursor !== null && cursor !== void 0 && typeof cursor === "object";
    if (!walkable) {
      return void 0;
    }
    const useArrayIndex = /^\d+$/.test(segment) && Array.isArray(cursor);
    cursor = useArrayIndex ? cursor[Number.parseInt(segment, 10)] : cursor[segment];
  }
  return cursor;
}
|
|
7311
|
+
/**
 * Coerces a value to a number for numeric comparison.
 *
 * Numbers are returned unchanged (including NaN and Infinity). Strings are
 * parsed with `Number.parseFloat`, yielding null when the result is NaN.
 * Every other type yields null.
 *
 * @param {*} value
 * @returns {number|null}
 */
function toNumber(value) {
  switch (typeof value) {
    case "number":
      return value;
    case "string": {
      const parsed = Number.parseFloat(value);
      return Number.isNaN(parsed) ? null : parsed;
    }
    default:
      return null;
  }
}
|
|
7321
|
+
/**
 * Parses a date string into a Date.
 *
 * Parsing order:
 *  1. Date-only ISO ("YYYY-MM-DD") is built as a local calendar date. Native
 *     parsing would read it as UTC midnight, which can land on the previous
 *     local day when local year/month/day are read back.
 *  2. Localized "DD-MMM-YYYY" (month resolved through MONTH_NAMES).
 *  3. Numeric "N/N/YYYY" or "N-N-YYYY": the order comes from `formats` when it
 *     lists only US (MM first) or only EU (DD first) patterns; otherwise a
 *     value above 12 settles the order and a fully ambiguous input is read as
 *     US. This runs before native parsing, which always assumes US order.
 *  4. Anything else (e.g. ISO with a time part) falls back to `new Date()`.
 *
 * Impossible calendar dates (e.g. 31 February) are rejected, not rolled over.
 *
 * @param {string} dateStr - Text to parse.
 * @param {string[]} [formats] - Accepted format patterns; only used to decide
 *   between US and EU ordering.
 * @returns {Date|null} The parsed date, or null when unparseable.
 */
function parseDate(dateStr, formats = []) {
  if (!dateStr) return null;
  const trimmed = dateStr.trim();
  // Builds a local date, rejecting day/month overflow that Date would roll over.
  const makeLocalDate = (year, monthIndex, day) => {
    const date = new Date(year, monthIndex, day);
    const unchanged = date.getFullYear() === year && date.getMonth() === monthIndex && date.getDate() === day;
    return unchanged ? date : null;
  };
  const isoDateOnly = trimmed.match(/^(\d{4})-(\d{2})-(\d{2})$/);
  if (isoDateOnly) {
    return makeLocalDate(
      Number.parseInt(isoDateOnly[1], 10),
      Number.parseInt(isoDateOnly[2], 10) - 1,
      Number.parseInt(isoDateOnly[3], 10)
    );
  }
  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
  if (localizedMatch) {
    const day = Number.parseInt(localizedMatch[1], 10);
    const monthName = localizedMatch[2].toLowerCase();
    const year = Number.parseInt(localizedMatch[3], 10);
    const month = MONTH_NAMES[monthName];
    if (month !== void 0) {
      return makeLocalDate(year, month, day);
    }
  }
  const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
  if (usMatch) {
    const num1 = Number.parseInt(usMatch[1], 10);
    const num2 = Number.parseInt(usMatch[2], 10);
    const year = Number.parseInt(usMatch[3], 10);
    const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
    const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
    if (hasUSFormat && !hasEUFormat) {
      return makeLocalDate(year, num1 - 1, num2);
    }
    if (hasEUFormat && !hasUSFormat) {
      return makeLocalDate(year, num2 - 1, num1);
    }
    // Both or neither configured: a value above 12 can only be the day.
    if (num1 > 12 && num2 <= 12) {
      return makeLocalDate(year, num2 - 1, num1);
    }
    if (num1 <= 12) {
      return makeLocalDate(year, num1 - 1, num2);
    }
    return null;
  }
  const nativeDate = new Date(trimmed);
  return Number.isNaN(nativeDate.getTime()) ? null : nativeDate;
}
|
|
7373
|
+
/**
 * Formats a Date as "YYYY-MM-DD" using its local calendar components.
 *
 * `toISOString()` reports the UTC date, which can differ by a day from the
 * local year/month/day that date comparisons in this file read, making
 * mismatch messages show a different date than the one actually compared.
 *
 * @param {Date} date
 * @returns {string}
 */
function formatDateISO(date) {
  const year = String(date.getFullYear()).padStart(4, "0");
  const month = String(date.getMonth() + 1).padStart(2, "0");
  const day = String(date.getDate()).padStart(2, "0");
  return `${year}-${month}-${day}`;
}
|
|
7376
|
+
/**
 * Parses JSON embedded in free text (e.g. a model reply).
 *
 * Markdown code fences are stripped first. If the whole cleaned text parses
 * to an object or array it is returned directly, so top-level arrays are not
 * cut down to their first "{" ... last "}" span. Otherwise the first "{"
 * through the last "}" is extracted and parsed, with the cleaned text as the
 * last resort.
 *
 * @param {*} text - Input text; non-strings are treated as empty.
 * @returns {*} The parsed JSON value.
 * @throws {SyntaxError} When no parseable JSON is found.
 */
function parseJsonFromTextSafe(text) {
  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
  try {
    const whole = JSON.parse(cleaned);
    if (typeof whole === "object" && whole !== null) {
      return whole;
    }
  } catch {
    // Not pure JSON: fall through to extracting an embedded object.
  }
  const match = cleaned.match(/\{[\s\S]*\}/);
  const blob = match?.[0] ?? cleaned;
  return JSON.parse(blob);
}
|
|
5824
7382
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
5825
7383
|
{{EVALUATOR_RESULTS_JSON}}
|
|
5826
7384
|
|
|
@@ -6045,11 +7603,175 @@ var CompositeEvaluator = class {
|
|
|
6045
7603
|
}
|
|
6046
7604
|
}
|
|
6047
7605
|
};
|
|
7606
|
+
/**
 * Evaluator that passes when the trace's reported duration is at or below the
 * configured threshold (milliseconds).
 */
var LatencyEvaluator = class {
  kind = "latency";
  config;
  /** @param {{config: {threshold: number}}} options */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * @param {{traceSummary?: {durationMs?: number|null}}} context
   * @returns {object} Score 1/"pass" when durationMs <= threshold, otherwise
   *   score 0/"fail"; fails with an explanatory miss when no numeric duration
   *   is available.
   */
  evaluate(context) {
    const { threshold } = this.config;
    const durationMs = context.traceSummary?.durationMs;
    // Treat null/NaN like "not reported": `null <= threshold` coerces null to
    // 0 and would otherwise count as a pass.
    if (typeof durationMs !== "number" || Number.isNaN(durationMs)) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No duration data available in trace"],
        expectedAspectCount: 1,
        reasoning: "Execution duration not reported by provider",
        evaluatorRawRequest: {
          type: "latency",
          threshold,
          durationMs: null
        }
      };
    }
    const passed = durationMs <= threshold;
    const score = passed ? 1 : 0;
    return {
      score,
      verdict: passed ? "pass" : "fail",
      hits: passed ? [`Duration ${durationMs}ms <= ${threshold}ms threshold`] : [],
      misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
      expectedAspectCount: 1,
      reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
      evaluatorRawRequest: {
        type: "latency",
        threshold,
        durationMs
      }
    };
  }
};
|
|
7647
|
+
/**
 * Evaluator that passes when the trace's reported cost (USD) is at or below
 * the configured budget.
 */
var CostEvaluator = class {
  kind = "cost";
  config;
  /** @param {{config: {budget: number}}} options */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * @param {{traceSummary?: {costUsd?: number|null}}} context
   * @returns {object} Score 1/"pass" when costUsd <= budget, otherwise score
   *   0/"fail"; fails with an explanatory miss when no numeric cost is
   *   available.
   */
  evaluate(context) {
    const { budget } = this.config;
    const costUsd = context.traceSummary?.costUsd;
    // Treat null/NaN like "not reported": a null cost would pass the
    // comparison (null coerces to 0) and then crash in toFixed below.
    if (typeof costUsd !== "number" || Number.isNaN(costUsd)) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No cost data available in trace"],
        expectedAspectCount: 1,
        reasoning: "Execution cost not reported by provider",
        evaluatorRawRequest: {
          type: "cost",
          budget,
          costUsd: null
        }
      };
    }
    const passed = costUsd <= budget;
    const score = passed ? 1 : 0;
    const formatCost = (n) => `$${n.toFixed(4)}`;
    return {
      score,
      verdict: passed ? "pass" : "fail",
      hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
      misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
      expectedAspectCount: 1,
      reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
      evaluatorRawRequest: {
        type: "cost",
        budget,
        costUsd
      }
    };
  }
};
|
|
7689
|
+
/**
 * Evaluator that checks reported token usage against optional limits
 * (`max_input`, `max_output`, `max_total`). Total is input + output + cached.
 * The result passes only when no configured limit is exceeded.
 */
var TokenUsageEvaluator = class {
  kind = "token_usage";
  config;
  /** @param {{config: {max_total?: number, max_input?: number, max_output?: number}}} options */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * @param {{traceSummary?: {tokenUsage?: {input: number, output: number, cached?: number}}}} context
   * @returns {object} Evaluation result; fails when no usage data is present.
   */
  evaluate(context) {
    const usage = context.traceSummary?.tokenUsage;
    const { max_total: maxTotal, max_input: maxInput, max_output: maxOutput } = this.config;
    // One aspect per numeric limit, but never fewer than one.
    const configuredLimits = [maxTotal, maxInput, maxOutput].filter((v) => typeof v === "number");
    const expectedAspectCount = Math.max(configuredLimits.length, 1);
    const limitsEcho = {
      max_total: maxTotal ?? null,
      max_input: maxInput ?? null,
      max_output: maxOutput ?? null
    };
    if (!usage) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No token usage data available in trace"],
        expectedAspectCount,
        reasoning: "Token usage not reported by provider",
        evaluatorRawRequest: {
          type: "token_usage",
          ...limitsEcho,
          tokenUsage: null
        }
      };
    }
    const { input, output } = usage;
    const cached = usage.cached ?? 0;
    const total = input + output + cached;
    const hits = [];
    const misses = [];
    // Checked in a fixed order: input, output, total.
    const checks = [
      ["Input tokens", input, maxInput],
      ["Output tokens", output, maxOutput],
      ["Total tokens", total, maxTotal]
    ];
    for (const [label, actual, limit] of checks) {
      if (typeof limit !== "number") {
        continue;
      }
      if (actual <= limit) {
        hits.push(`${label} ${actual} <= ${limit}`);
      } else {
        misses.push(`${label} ${actual} > ${limit}`);
      }
    }
    const passed = misses.length === 0;
    return {
      score: passed ? 1 : 0,
      verdict: passed ? "pass" : "fail",
      hits,
      misses,
      expectedAspectCount,
      reasoning: `token_usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
      evaluatorRawRequest: {
        type: "token_usage",
        ...limitsEcho,
        tokenUsage: {
          input,
          output,
          cached,
          total
        }
      }
    };
  }
};
|
|
6048
7771
|
|
|
6049
7772
|
// src/evaluation/orchestrator.ts
|
|
6050
|
-
var
|
|
6051
|
-
var
|
|
6052
|
-
var import_node_path15 = __toESM(require("path"), 1);
|
|
7773
|
+
var import_node_crypto4 = require("crypto");
|
|
7774
|
+
var import_node_path16 = __toESM(require("path"), 1);
|
|
6053
7775
|
|
|
6054
7776
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
6055
7777
|
var Node = class {
|
|
@@ -6191,6 +7913,9 @@ function validateConcurrency(concurrency) {
|
|
|
6191
7913
|
}
|
|
6192
7914
|
|
|
6193
7915
|
// src/evaluation/orchestrator.ts
|
|
7916
|
+
/**
 * Reports whether a provider should receive file-reference style prompts.
 *
 * Truthy when `isAgentProvider(provider)` is truthy or when the provider's
 * `kind` is exactly "cli". Callers in this file pick the "agent" formatting
 * mode on a truthy result and "lm" otherwise.
 *
 * @param {object} provider - Provider instance; its `kind` property is read.
 * @returns {*} The truthy result of `isAgentProvider`, or a boolean for the
 *   "cli" check.
 */
function usesFileReferencePrompt(provider) {
  const agentLike = isAgentProvider(provider);
  if (agentLike) {
    return agentLike;
  }
  return provider.kind === "cli";
}
|
|
6194
7919
|
async function runEvaluation(options) {
|
|
6195
7920
|
const {
|
|
6196
7921
|
testFilePath: evalFilePath,
|
|
@@ -6202,7 +7927,6 @@ async function runEvaluation(options) {
|
|
|
6202
7927
|
evaluators,
|
|
6203
7928
|
maxRetries,
|
|
6204
7929
|
agentTimeoutMs,
|
|
6205
|
-
promptDumpDir,
|
|
6206
7930
|
cache,
|
|
6207
7931
|
useCache,
|
|
6208
7932
|
now,
|
|
@@ -6282,7 +8006,6 @@ async function runEvaluation(options) {
|
|
|
6282
8006
|
provider: primaryProvider,
|
|
6283
8007
|
target,
|
|
6284
8008
|
evaluatorRegistry,
|
|
6285
|
-
promptDumpDir,
|
|
6286
8009
|
nowFn: now ?? (() => /* @__PURE__ */ new Date()),
|
|
6287
8010
|
onProgress,
|
|
6288
8011
|
onResult,
|
|
@@ -6324,7 +8047,6 @@ async function runEvaluation(options) {
|
|
|
6324
8047
|
evaluators: evaluatorRegistry,
|
|
6325
8048
|
maxRetries,
|
|
6326
8049
|
agentTimeoutMs,
|
|
6327
|
-
promptDumpDir,
|
|
6328
8050
|
cache,
|
|
6329
8051
|
useCache,
|
|
6330
8052
|
now,
|
|
@@ -6367,7 +8089,8 @@ async function runEvaluation(options) {
|
|
|
6367
8089
|
results.push(outcome.value);
|
|
6368
8090
|
} else {
|
|
6369
8091
|
const evalCase = filteredEvalCases[i];
|
|
6370
|
-
const
|
|
8092
|
+
const formattingMode = usesFileReferencePrompt(primaryProvider) ? "agent" : "lm";
|
|
8093
|
+
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
6371
8094
|
const errorResult = buildErrorResult(
|
|
6372
8095
|
evalCase,
|
|
6373
8096
|
target.name,
|
|
@@ -6390,7 +8113,6 @@ async function runBatchEvaluation(options) {
|
|
|
6390
8113
|
provider,
|
|
6391
8114
|
target,
|
|
6392
8115
|
evaluatorRegistry,
|
|
6393
|
-
promptDumpDir,
|
|
6394
8116
|
nowFn,
|
|
6395
8117
|
onProgress,
|
|
6396
8118
|
onResult,
|
|
@@ -6398,12 +8120,9 @@ async function runBatchEvaluation(options) {
|
|
|
6398
8120
|
agentTimeoutMs
|
|
6399
8121
|
} = options;
|
|
6400
8122
|
const promptInputsList = [];
|
|
6401
|
-
const formattingMode =
|
|
8123
|
+
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
6402
8124
|
for (const evalCase of evalCases) {
|
|
6403
8125
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
6404
|
-
if (promptDumpDir) {
|
|
6405
|
-
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
6406
|
-
}
|
|
6407
8126
|
promptInputsList.push(promptInputs);
|
|
6408
8127
|
}
|
|
6409
8128
|
const batchRequests = evalCases.map((evalCase, index) => {
|
|
@@ -6445,13 +8164,20 @@ async function runBatchEvaluation(options) {
|
|
|
6445
8164
|
const promptInputs = promptInputsList[i];
|
|
6446
8165
|
const providerResponse = batchResponse[i];
|
|
6447
8166
|
const outputMessages = providerResponse.outputMessages;
|
|
6448
|
-
const
|
|
8167
|
+
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
8168
|
+
const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : hasExecutionMetrics ? {
|
|
8169
|
+
eventCount: 0,
|
|
8170
|
+
toolNames: [],
|
|
8171
|
+
toolCallsByName: {},
|
|
8172
|
+
errorCount: 0
|
|
8173
|
+
} : void 0;
|
|
6449
8174
|
const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
6450
8175
|
tokenUsage: providerResponse.tokenUsage,
|
|
6451
8176
|
costUsd: providerResponse.costUsd,
|
|
6452
8177
|
durationMs: providerResponse.durationMs
|
|
6453
8178
|
}) : void 0;
|
|
6454
8179
|
const candidate = extractLastAssistantContent(outputMessages);
|
|
8180
|
+
const providerError = extractProviderError(providerResponse);
|
|
6455
8181
|
let result;
|
|
6456
8182
|
try {
|
|
6457
8183
|
result = await evaluateCandidate({
|
|
@@ -6468,6 +8194,9 @@ async function runBatchEvaluation(options) {
|
|
|
6468
8194
|
outputMessages,
|
|
6469
8195
|
traceSummary
|
|
6470
8196
|
});
|
|
8197
|
+
if (providerError) {
|
|
8198
|
+
result = { ...result, error: providerError };
|
|
8199
|
+
}
|
|
6471
8200
|
} catch (error) {
|
|
6472
8201
|
const errorResult = buildErrorResult(
|
|
6473
8202
|
evalCase,
|
|
@@ -6500,9 +8229,10 @@ async function runBatchEvaluation(options) {
|
|
|
6500
8229
|
await onProgress({
|
|
6501
8230
|
workerId: 1,
|
|
6502
8231
|
evalId: evalCase.id,
|
|
6503
|
-
status: "completed",
|
|
8232
|
+
status: result.error ? "failed" : "completed",
|
|
6504
8233
|
startedAt: 0,
|
|
6505
|
-
completedAt: Date.now()
|
|
8234
|
+
completedAt: Date.now(),
|
|
8235
|
+
error: result.error
|
|
6506
8236
|
});
|
|
6507
8237
|
}
|
|
6508
8238
|
}
|
|
@@ -6517,17 +8247,13 @@ async function runEvalCase(options) {
|
|
|
6517
8247
|
now,
|
|
6518
8248
|
maxRetries,
|
|
6519
8249
|
agentTimeoutMs,
|
|
6520
|
-
promptDumpDir,
|
|
6521
8250
|
cache,
|
|
6522
8251
|
useCache,
|
|
6523
8252
|
signal,
|
|
6524
8253
|
judgeProvider
|
|
6525
8254
|
} = options;
|
|
6526
|
-
const formattingMode =
|
|
8255
|
+
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
6527
8256
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
6528
|
-
if (promptDumpDir) {
|
|
6529
|
-
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
6530
|
-
}
|
|
6531
8257
|
const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
|
|
6532
8258
|
let cachedResponse;
|
|
6533
8259
|
if (cacheKey && cache) {
|
|
@@ -6571,15 +8297,22 @@ async function runEvalCase(options) {
|
|
|
6571
8297
|
await cache.set(cacheKey, providerResponse);
|
|
6572
8298
|
}
|
|
6573
8299
|
const outputMessages = providerResponse.outputMessages;
|
|
6574
|
-
const
|
|
8300
|
+
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
8301
|
+
const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : hasExecutionMetrics ? {
|
|
8302
|
+
eventCount: 0,
|
|
8303
|
+
toolNames: [],
|
|
8304
|
+
toolCallsByName: {},
|
|
8305
|
+
errorCount: 0
|
|
8306
|
+
} : void 0;
|
|
6575
8307
|
const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
6576
8308
|
tokenUsage: providerResponse.tokenUsage,
|
|
6577
8309
|
costUsd: providerResponse.costUsd,
|
|
6578
8310
|
durationMs: providerResponse.durationMs
|
|
6579
8311
|
}) : void 0;
|
|
6580
8312
|
const candidate = extractLastAssistantContent(outputMessages);
|
|
8313
|
+
const providerError = extractProviderError(providerResponse);
|
|
6581
8314
|
try {
|
|
6582
|
-
|
|
8315
|
+
const result = await evaluateCandidate({
|
|
6583
8316
|
evalCase,
|
|
6584
8317
|
candidate,
|
|
6585
8318
|
target,
|
|
@@ -6593,6 +8326,7 @@ async function runEvalCase(options) {
|
|
|
6593
8326
|
outputMessages,
|
|
6594
8327
|
traceSummary
|
|
6595
8328
|
});
|
|
8329
|
+
return providerError ? { ...result, error: providerError } : result;
|
|
6596
8330
|
} catch (error) {
|
|
6597
8331
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
6598
8332
|
}
|
|
@@ -6658,7 +8392,6 @@ async function evaluateCandidate(options) {
|
|
|
6658
8392
|
candidateAnswer: candidate,
|
|
6659
8393
|
target: target.name,
|
|
6660
8394
|
reasoning: score.reasoning,
|
|
6661
|
-
rawAspects: score.rawAspects,
|
|
6662
8395
|
agentProviderRequest,
|
|
6663
8396
|
lmProviderRequest,
|
|
6664
8397
|
evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
@@ -6768,7 +8501,8 @@ async function runEvaluatorList(options) {
|
|
|
6768
8501
|
const codeEvaluator = new CodeEvaluator({
|
|
6769
8502
|
script: evaluator.script,
|
|
6770
8503
|
cwd: evaluator.resolvedCwd ?? evaluator.cwd,
|
|
6771
|
-
agentTimeoutMs
|
|
8504
|
+
agentTimeoutMs,
|
|
8505
|
+
config: evaluator.config
|
|
6772
8506
|
});
|
|
6773
8507
|
const score2 = await codeEvaluator.evaluate({
|
|
6774
8508
|
evalCase,
|
|
@@ -6796,7 +8530,7 @@ async function runEvaluatorList(options) {
|
|
|
6796
8530
|
});
|
|
6797
8531
|
}
|
|
6798
8532
|
if (evaluator.type === "composite") {
|
|
6799
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
8533
|
+
const evalFileDir = evalCase.guideline_paths[0] ? import_node_path16.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
6800
8534
|
const createEvaluator = (memberConfig) => {
|
|
6801
8535
|
switch (memberConfig.type) {
|
|
6802
8536
|
case "llm_judge":
|
|
@@ -6805,7 +8539,8 @@ async function runEvaluatorList(options) {
|
|
|
6805
8539
|
return new CodeEvaluator({
|
|
6806
8540
|
script: memberConfig.script,
|
|
6807
8541
|
cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
|
|
6808
|
-
agentTimeoutMs
|
|
8542
|
+
agentTimeoutMs,
|
|
8543
|
+
config: memberConfig.config
|
|
6809
8544
|
});
|
|
6810
8545
|
case "composite":
|
|
6811
8546
|
return new CompositeEvaluator({
|
|
@@ -6817,6 +8552,22 @@ async function runEvaluatorList(options) {
|
|
|
6817
8552
|
return new ToolTrajectoryEvaluator({
|
|
6818
8553
|
config: memberConfig
|
|
6819
8554
|
});
|
|
8555
|
+
case "field_accuracy":
|
|
8556
|
+
return new FieldAccuracyEvaluator({
|
|
8557
|
+
config: memberConfig
|
|
8558
|
+
});
|
|
8559
|
+
case "latency":
|
|
8560
|
+
return new LatencyEvaluator({
|
|
8561
|
+
config: memberConfig
|
|
8562
|
+
});
|
|
8563
|
+
case "cost":
|
|
8564
|
+
return new CostEvaluator({
|
|
8565
|
+
config: memberConfig
|
|
8566
|
+
});
|
|
8567
|
+
case "token_usage":
|
|
8568
|
+
return new TokenUsageEvaluator({
|
|
8569
|
+
config: memberConfig
|
|
8570
|
+
});
|
|
6820
8571
|
default: {
|
|
6821
8572
|
const unknownConfig = memberConfig;
|
|
6822
8573
|
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
@@ -6836,7 +8587,9 @@ async function runEvaluatorList(options) {
|
|
|
6836
8587
|
attempt,
|
|
6837
8588
|
promptInputs,
|
|
6838
8589
|
now,
|
|
6839
|
-
judgeProvider
|
|
8590
|
+
judgeProvider,
|
|
8591
|
+
outputMessages,
|
|
8592
|
+
traceSummary
|
|
6840
8593
|
});
|
|
6841
8594
|
const weight = evaluator.weight ?? 1;
|
|
6842
8595
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -6881,6 +8634,118 @@ async function runEvaluatorList(options) {
|
|
|
6881
8634
|
reasoning: score2.reasoning
|
|
6882
8635
|
});
|
|
6883
8636
|
}
|
|
8637
|
+
if (evaluator.type === "field_accuracy") {
|
|
8638
|
+
const fieldAccuracyEvaluator = new FieldAccuracyEvaluator({
|
|
8639
|
+
config: evaluator
|
|
8640
|
+
});
|
|
8641
|
+
const score2 = fieldAccuracyEvaluator.evaluate({
|
|
8642
|
+
evalCase,
|
|
8643
|
+
candidate,
|
|
8644
|
+
target,
|
|
8645
|
+
provider,
|
|
8646
|
+
attempt,
|
|
8647
|
+
promptInputs,
|
|
8648
|
+
now,
|
|
8649
|
+
outputMessages,
|
|
8650
|
+
traceSummary
|
|
8651
|
+
});
|
|
8652
|
+
const weight = evaluator.weight ?? 1;
|
|
8653
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
8654
|
+
evaluatorResults.push({
|
|
8655
|
+
name: evaluator.name,
|
|
8656
|
+
type: evaluator.type,
|
|
8657
|
+
score: score2.score,
|
|
8658
|
+
weight,
|
|
8659
|
+
verdict: score2.verdict,
|
|
8660
|
+
hits: score2.hits,
|
|
8661
|
+
misses: score2.misses,
|
|
8662
|
+
reasoning: score2.reasoning
|
|
8663
|
+
});
|
|
8664
|
+
}
|
|
8665
|
+
if (evaluator.type === "latency") {
|
|
8666
|
+
const latencyEvaluator = new LatencyEvaluator({
|
|
8667
|
+
config: evaluator
|
|
8668
|
+
});
|
|
8669
|
+
const score2 = latencyEvaluator.evaluate({
|
|
8670
|
+
evalCase,
|
|
8671
|
+
candidate,
|
|
8672
|
+
target,
|
|
8673
|
+
provider,
|
|
8674
|
+
attempt,
|
|
8675
|
+
promptInputs,
|
|
8676
|
+
now,
|
|
8677
|
+
outputMessages,
|
|
8678
|
+
traceSummary
|
|
8679
|
+
});
|
|
8680
|
+
const weight = evaluator.weight ?? 1;
|
|
8681
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
8682
|
+
evaluatorResults.push({
|
|
8683
|
+
name: evaluator.name,
|
|
8684
|
+
type: evaluator.type,
|
|
8685
|
+
score: score2.score,
|
|
8686
|
+
weight,
|
|
8687
|
+
verdict: score2.verdict,
|
|
8688
|
+
hits: score2.hits,
|
|
8689
|
+
misses: score2.misses,
|
|
8690
|
+
reasoning: score2.reasoning
|
|
8691
|
+
});
|
|
8692
|
+
}
|
|
8693
|
+
if (evaluator.type === "cost") {
|
|
8694
|
+
const costEvaluator = new CostEvaluator({
|
|
8695
|
+
config: evaluator
|
|
8696
|
+
});
|
|
8697
|
+
const score2 = costEvaluator.evaluate({
|
|
8698
|
+
evalCase,
|
|
8699
|
+
candidate,
|
|
8700
|
+
target,
|
|
8701
|
+
provider,
|
|
8702
|
+
attempt,
|
|
8703
|
+
promptInputs,
|
|
8704
|
+
now,
|
|
8705
|
+
outputMessages,
|
|
8706
|
+
traceSummary
|
|
8707
|
+
});
|
|
8708
|
+
const weight = evaluator.weight ?? 1;
|
|
8709
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
8710
|
+
evaluatorResults.push({
|
|
8711
|
+
name: evaluator.name,
|
|
8712
|
+
type: evaluator.type,
|
|
8713
|
+
score: score2.score,
|
|
8714
|
+
weight,
|
|
8715
|
+
verdict: score2.verdict,
|
|
8716
|
+
hits: score2.hits,
|
|
8717
|
+
misses: score2.misses,
|
|
8718
|
+
reasoning: score2.reasoning
|
|
8719
|
+
});
|
|
8720
|
+
}
|
|
8721
|
+
if (evaluator.type === "token_usage") {
|
|
8722
|
+
const tokenUsageEvaluator = new TokenUsageEvaluator({
|
|
8723
|
+
config: evaluator
|
|
8724
|
+
});
|
|
8725
|
+
const score2 = tokenUsageEvaluator.evaluate({
|
|
8726
|
+
evalCase,
|
|
8727
|
+
candidate,
|
|
8728
|
+
target,
|
|
8729
|
+
provider,
|
|
8730
|
+
attempt,
|
|
8731
|
+
promptInputs,
|
|
8732
|
+
now,
|
|
8733
|
+
outputMessages,
|
|
8734
|
+
traceSummary
|
|
8735
|
+
});
|
|
8736
|
+
const weight = evaluator.weight ?? 1;
|
|
8737
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
8738
|
+
evaluatorResults.push({
|
|
8739
|
+
name: evaluator.name,
|
|
8740
|
+
type: evaluator.type,
|
|
8741
|
+
score: score2.score,
|
|
8742
|
+
weight,
|
|
8743
|
+
verdict: score2.verdict,
|
|
8744
|
+
hits: score2.hits,
|
|
8745
|
+
misses: score2.misses,
|
|
8746
|
+
reasoning: score2.reasoning
|
|
8747
|
+
});
|
|
8748
|
+
}
|
|
6884
8749
|
} catch (error) {
|
|
6885
8750
|
const message = error instanceof Error ? error.message : String(error);
|
|
6886
8751
|
const fallbackScore = {
|
|
@@ -6920,7 +8785,6 @@ async function runEvaluatorList(options) {
|
|
|
6920
8785
|
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
6921
8786
|
0
|
|
6922
8787
|
);
|
|
6923
|
-
const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
|
|
6924
8788
|
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
|
|
6925
8789
|
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
6926
8790
|
const score = {
|
|
@@ -6929,8 +8793,7 @@ async function runEvaluatorList(options) {
|
|
|
6929
8793
|
hits,
|
|
6930
8794
|
misses,
|
|
6931
8795
|
expectedAspectCount,
|
|
6932
|
-
reasoning
|
|
6933
|
-
rawAspects: rawAspects.length > 0 ? rawAspects : void 0
|
|
8796
|
+
reasoning
|
|
6934
8797
|
};
|
|
6935
8798
|
return { score, evaluatorResults };
|
|
6936
8799
|
}
|
|
@@ -7005,26 +8868,6 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
7005
8868
|
llm_judge: llmJudge
|
|
7006
8869
|
};
|
|
7007
8870
|
}
|
|
7008
|
-
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
7009
|
-
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
7010
|
-
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
7011
|
-
const filePath = import_node_path15.default.resolve(directory, filename);
|
|
7012
|
-
await (0, import_promises12.mkdir)(import_node_path15.default.dirname(filePath), { recursive: true });
|
|
7013
|
-
const payload = {
|
|
7014
|
-
eval_id: evalCase.id,
|
|
7015
|
-
question: promptInputs.question,
|
|
7016
|
-
guidelines: promptInputs.guidelines,
|
|
7017
|
-
guideline_paths: evalCase.guideline_paths
|
|
7018
|
-
};
|
|
7019
|
-
await (0, import_promises12.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
7020
|
-
}
|
|
7021
|
-
function sanitizeFilename(value) {
|
|
7022
|
-
if (!value) {
|
|
7023
|
-
return "prompt";
|
|
7024
|
-
}
|
|
7025
|
-
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
7026
|
-
return sanitized.length > 0 ? sanitized : (0, import_node_crypto3.randomUUID)();
|
|
7027
|
-
}
|
|
7028
8871
|
async function invokeProvider(provider, options) {
|
|
7029
8872
|
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
7030
8873
|
const controller = new AbortController();
|
|
@@ -7088,14 +8931,25 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
7088
8931
|
misses: [`Error: ${message}`],
|
|
7089
8932
|
candidateAnswer: `Error occurred: ${message}`,
|
|
7090
8933
|
target: targetName,
|
|
7091
|
-
rawAspects: [],
|
|
7092
8934
|
agentProviderRequest,
|
|
7093
8935
|
lmProviderRequest,
|
|
7094
8936
|
error: message
|
|
7095
8937
|
};
|
|
7096
8938
|
}
|
|
8939
|
+
/**
 * Pulls a provider-reported error message out of a provider response.
 *
 * Only `response.raw.error` is inspected, and only when `raw` is a non-array
 * object and `error` is a string. The message is trimmed; blank messages are
 * treated as "no error".
 *
 * @param {object} response - Provider response whose `raw` field is read.
 * @returns {string | undefined} The trimmed error text, or undefined when no
 *   usable error string is present.
 */
function extractProviderError(response) {
  const payload = response.raw;
  const isRecord = Boolean(payload) && typeof payload === "object" && !Array.isArray(payload);
  if (!isRecord) {
    return void 0;
  }
  const reported = payload.error;
  if (typeof reported === "string") {
    const message = reported.trim();
    if (message.length > 0) {
      return message;
    }
  }
  return void 0;
}
|
|
7097
8951
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
7098
|
-
const hash = (0,
|
|
8952
|
+
const hash = (0, import_node_crypto4.createHash)("sha256");
|
|
7099
8953
|
hash.update(provider.id);
|
|
7100
8954
|
hash.update(target.name);
|
|
7101
8955
|
hash.update(evalCase.id);
|
|
@@ -7152,15 +9006,15 @@ function computeWeightedMean(entries) {
|
|
|
7152
9006
|
|
|
7153
9007
|
// src/evaluation/generators/rubric-generator.ts
|
|
7154
9008
|
var import_ai3 = require("ai");
|
|
7155
|
-
var
|
|
7156
|
-
var rubricItemSchema =
|
|
7157
|
-
id:
|
|
7158
|
-
description:
|
|
7159
|
-
weight:
|
|
7160
|
-
required:
|
|
9009
|
+
var import_zod4 = require("zod");
|
|
9010
|
+
var rubricItemSchema = import_zod4.z.object({
|
|
9011
|
+
id: import_zod4.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
9012
|
+
description: import_zod4.z.string().describe("What this rubric checks for"),
|
|
9013
|
+
weight: import_zod4.z.number().default(1).describe("Relative importance (default 1.0)"),
|
|
9014
|
+
required: import_zod4.z.boolean().default(true).describe("Whether this is a mandatory requirement")
|
|
7161
9015
|
});
|
|
7162
|
-
var rubricGenerationSchema =
|
|
7163
|
-
rubrics:
|
|
9016
|
+
var rubricGenerationSchema = import_zod4.z.object({
|
|
9017
|
+
rubrics: import_zod4.z.array(rubricItemSchema).describe("List of evaluation rubrics")
|
|
7164
9018
|
});
|
|
7165
9019
|
async function generateRubrics(options) {
|
|
7166
9020
|
const { expectedOutcome, question, referenceAnswer, provider } = options;
|
|
@@ -7230,6 +9084,17 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
|
|
|
7230
9084
|
return parts.join("\n");
|
|
7231
9085
|
}
|
|
7232
9086
|
|
|
9087
|
+
// src/evaluation/code-judge-sdk.ts
|
|
9088
|
+
var import_node_fs7 = require("fs");
|
|
9089
|
+
/**
 * Parses a code-judge payload from its JSON text form.
 *
 * The text is decoded with `JSON.parse` and the result is handed to
 * `toCamelCaseDeep` (presumably a recursive key-case conversion; confirm
 * against its definition elsewhere in this file).
 *
 * @param {string} payload - JSON text to decode.
 * @returns {*} Whatever `toCamelCaseDeep` returns for the decoded value.
 * @throws {SyntaxError} When `payload` is not valid JSON.
 */
function parseCodeJudgePayload(payload) {
  return toCamelCaseDeep(JSON.parse(payload));
}
|
|
9093
|
+
/**
 * Reads the code-judge payload from standard input and parses it.
 *
 * File descriptor 0 is read synchronously, in full, as UTF-8 text, and the
 * text is passed to `parseCodeJudgePayload`.
 *
 * @returns {*} The parsed payload as produced by `parseCodeJudgePayload`.
 */
function readCodeJudgePayload() {
  const STDIN_FD = 0;
  return parseCodeJudgePayload((0, import_node_fs7.readFileSync)(STDIN_FD, "utf8"));
}
|
|
9097
|
+
|
|
7233
9098
|
// src/index.ts
|
|
7234
9099
|
function createAgentKernel() {
|
|
7235
9100
|
return { status: "stub" };
|
|
@@ -7238,15 +9103,20 @@ function createAgentKernel() {
|
|
|
7238
9103
|
0 && (module.exports = {
|
|
7239
9104
|
CodeEvaluator,
|
|
7240
9105
|
CompositeEvaluator,
|
|
9106
|
+
CostEvaluator,
|
|
7241
9107
|
DEFAULT_EXPLORATION_TOOLS,
|
|
9108
|
+
FieldAccuracyEvaluator,
|
|
9109
|
+
LatencyEvaluator,
|
|
7242
9110
|
LlmJudgeEvaluator,
|
|
7243
9111
|
TEST_MESSAGE_ROLES,
|
|
9112
|
+
TokenUsageEvaluator,
|
|
7244
9113
|
ToolTrajectoryEvaluator,
|
|
7245
9114
|
avgToolDurationMs,
|
|
7246
9115
|
buildDirectoryChain,
|
|
7247
9116
|
buildPromptInputs,
|
|
7248
9117
|
buildSearchRoots,
|
|
7249
9118
|
computeTraceSummary,
|
|
9119
|
+
consumeClaudeCodeLogEntries,
|
|
7250
9120
|
consumeCodexLogEntries,
|
|
7251
9121
|
consumePiLogEntries,
|
|
7252
9122
|
createAgentKernel,
|
|
@@ -7268,6 +9138,8 @@ function createAgentKernel() {
|
|
|
7268
9138
|
loadEvalCases,
|
|
7269
9139
|
mergeExecutionMetrics,
|
|
7270
9140
|
normalizeLineEndings,
|
|
9141
|
+
parseCodeJudgePayload,
|
|
9142
|
+
readCodeJudgePayload,
|
|
7271
9143
|
readJsonFile,
|
|
7272
9144
|
readTargetDefinitions,
|
|
7273
9145
|
readTestSuiteMetadata,
|
|
@@ -7277,6 +9149,7 @@ function createAgentKernel() {
|
|
|
7277
9149
|
resolveTargetDefinition,
|
|
7278
9150
|
runEvalCase,
|
|
7279
9151
|
runEvaluation,
|
|
9152
|
+
subscribeToClaudeCodeLogEntries,
|
|
7280
9153
|
subscribeToCodexLogEntries,
|
|
7281
9154
|
subscribeToPiLogEntries,
|
|
7282
9155
|
tokensPerTool
|