@agentv/core 1.5.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -77
- package/dist/{chunk-E2VSU4WZ.js → chunk-KDEP4I7G.js} +116 -1
- package/dist/chunk-KDEP4I7G.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +2 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +2715 -675
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +207 -10
- package/dist/index.d.ts +207 -10
- package/dist/index.js +2491 -570
- package/dist/index.js.map +1 -1
- package/package.json +8 -2
- package/dist/chunk-E2VSU4WZ.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -32,15 +32,20 @@ var index_exports = {};
|
|
|
32
32
|
__export(index_exports, {
|
|
33
33
|
CodeEvaluator: () => CodeEvaluator,
|
|
34
34
|
CompositeEvaluator: () => CompositeEvaluator,
|
|
35
|
+
CostEvaluator: () => CostEvaluator,
|
|
35
36
|
DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
|
|
37
|
+
FieldAccuracyEvaluator: () => FieldAccuracyEvaluator,
|
|
38
|
+
LatencyEvaluator: () => LatencyEvaluator,
|
|
36
39
|
LlmJudgeEvaluator: () => LlmJudgeEvaluator,
|
|
37
40
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
41
|
+
TokenUsageEvaluator: () => TokenUsageEvaluator,
|
|
38
42
|
ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
|
|
39
43
|
avgToolDurationMs: () => avgToolDurationMs,
|
|
40
44
|
buildDirectoryChain: () => buildDirectoryChain2,
|
|
41
45
|
buildPromptInputs: () => buildPromptInputs,
|
|
42
46
|
buildSearchRoots: () => buildSearchRoots2,
|
|
43
47
|
computeTraceSummary: () => computeTraceSummary,
|
|
48
|
+
consumeClaudeCodeLogEntries: () => consumeClaudeCodeLogEntries,
|
|
44
49
|
consumeCodexLogEntries: () => consumeCodexLogEntries,
|
|
45
50
|
consumePiLogEntries: () => consumePiLogEntries,
|
|
46
51
|
createAgentKernel: () => createAgentKernel,
|
|
@@ -71,6 +76,7 @@ __export(index_exports, {
|
|
|
71
76
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
72
77
|
runEvalCase: () => runEvalCase,
|
|
73
78
|
runEvaluation: () => runEvaluation,
|
|
79
|
+
subscribeToClaudeCodeLogEntries: () => subscribeToClaudeCodeLogEntries,
|
|
74
80
|
subscribeToCodexLogEntries: () => subscribeToCodexLogEntries,
|
|
75
81
|
subscribeToPiLogEntries: () => subscribeToPiLogEntries,
|
|
76
82
|
tokensPerTool: () => tokensPerTool
|
|
@@ -129,7 +135,11 @@ var EVALUATOR_KIND_VALUES = [
|
|
|
129
135
|
"llm_judge",
|
|
130
136
|
"rubric",
|
|
131
137
|
"composite",
|
|
132
|
-
"tool_trajectory"
|
|
138
|
+
"tool_trajectory",
|
|
139
|
+
"field_accuracy",
|
|
140
|
+
"latency",
|
|
141
|
+
"cost",
|
|
142
|
+
"token_usage"
|
|
133
143
|
];
|
|
134
144
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
135
145
|
function isEvaluatorKind(value) {
|
|
@@ -551,7 +561,22 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
551
561
|
continue;
|
|
552
562
|
}
|
|
553
563
|
if (typeValue === "code_judge") {
|
|
554
|
-
|
|
564
|
+
let script;
|
|
565
|
+
const rawScript = rawEvaluator.script;
|
|
566
|
+
if (typeof rawScript === "string") {
|
|
567
|
+
const trimmed = rawScript.trim();
|
|
568
|
+
if (trimmed.length === 0) {
|
|
569
|
+
throw new Error(
|
|
570
|
+
`Invalid code_judge script for evaluator '${name}' in '${evalId}': script cannot be empty`
|
|
571
|
+
);
|
|
572
|
+
}
|
|
573
|
+
script = parseCommandToArgv(trimmed);
|
|
574
|
+
} else {
|
|
575
|
+
script = asStringArray(
|
|
576
|
+
rawScript,
|
|
577
|
+
`code_judge script for evaluator '${name}' in '${evalId}'`
|
|
578
|
+
);
|
|
579
|
+
}
|
|
555
580
|
if (!script) {
|
|
556
581
|
logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
|
|
557
582
|
continue;
|
|
@@ -572,13 +597,21 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
572
597
|
} else {
|
|
573
598
|
resolvedCwd = searchRoots[0];
|
|
574
599
|
}
|
|
600
|
+
const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
|
|
601
|
+
const config = {};
|
|
602
|
+
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
603
|
+
if (!knownProps.has(key) && value !== void 0) {
|
|
604
|
+
config[key] = value;
|
|
605
|
+
}
|
|
606
|
+
}
|
|
575
607
|
evaluators.push({
|
|
576
608
|
name,
|
|
577
609
|
type: "code",
|
|
578
610
|
script,
|
|
579
611
|
cwd,
|
|
580
612
|
resolvedCwd,
|
|
581
|
-
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
613
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
614
|
+
...Object.keys(config).length > 0 ? { config } : {}
|
|
582
615
|
});
|
|
583
616
|
continue;
|
|
584
617
|
}
|
|
@@ -753,6 +786,140 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
753
786
|
evaluators.push(config);
|
|
754
787
|
continue;
|
|
755
788
|
}
|
|
789
|
+
if (typeValue === "field_accuracy") {
|
|
790
|
+
const rawFields = rawEvaluator.fields;
|
|
791
|
+
if (!Array.isArray(rawFields)) {
|
|
792
|
+
logWarning2(
|
|
793
|
+
`Skipping field_accuracy evaluator '${name}' in '${evalId}': missing fields array`
|
|
794
|
+
);
|
|
795
|
+
continue;
|
|
796
|
+
}
|
|
797
|
+
if (rawFields.length === 0) {
|
|
798
|
+
logWarning2(
|
|
799
|
+
`Skipping field_accuracy evaluator '${name}' in '${evalId}': fields array is empty`
|
|
800
|
+
);
|
|
801
|
+
continue;
|
|
802
|
+
}
|
|
803
|
+
const fields = [];
|
|
804
|
+
for (const rawField of rawFields) {
|
|
805
|
+
if (!isJsonObject2(rawField)) {
|
|
806
|
+
logWarning2(
|
|
807
|
+
`Skipping invalid field entry in field_accuracy evaluator '${name}' (expected object)`
|
|
808
|
+
);
|
|
809
|
+
continue;
|
|
810
|
+
}
|
|
811
|
+
const fieldPath = asString2(rawField.path);
|
|
812
|
+
const match = asString2(rawField.match);
|
|
813
|
+
if (!fieldPath) {
|
|
814
|
+
logWarning2(
|
|
815
|
+
`Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
|
|
816
|
+
);
|
|
817
|
+
continue;
|
|
818
|
+
}
|
|
819
|
+
if (!match || !isValidFieldMatchType(match)) {
|
|
820
|
+
logWarning2(
|
|
821
|
+
`Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code_judge evaluator.`
|
|
822
|
+
);
|
|
823
|
+
continue;
|
|
824
|
+
}
|
|
825
|
+
const fieldConfig = {
|
|
826
|
+
path: fieldPath,
|
|
827
|
+
match,
|
|
828
|
+
...typeof rawField.required === "boolean" ? { required: rawField.required } : {},
|
|
829
|
+
...typeof rawField.weight === "number" ? { weight: rawField.weight } : {},
|
|
830
|
+
...typeof rawField.tolerance === "number" ? { tolerance: rawField.tolerance } : {},
|
|
831
|
+
...typeof rawField.relative === "boolean" ? { relative: rawField.relative } : {},
|
|
832
|
+
...Array.isArray(rawField.formats) ? { formats: rawField.formats.filter((f) => typeof f === "string") } : {}
|
|
833
|
+
};
|
|
834
|
+
fields.push(fieldConfig);
|
|
835
|
+
}
|
|
836
|
+
if (fields.length === 0) {
|
|
837
|
+
logWarning2(
|
|
838
|
+
`Skipping field_accuracy evaluator '${name}' in '${evalId}': no valid fields found`
|
|
839
|
+
);
|
|
840
|
+
continue;
|
|
841
|
+
}
|
|
842
|
+
const aggregation = asString2(rawEvaluator.aggregation);
|
|
843
|
+
const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
|
|
844
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
845
|
+
evaluators.push({
|
|
846
|
+
name,
|
|
847
|
+
type: "field_accuracy",
|
|
848
|
+
fields,
|
|
849
|
+
...validAggregation ? { aggregation: validAggregation } : {},
|
|
850
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
851
|
+
});
|
|
852
|
+
continue;
|
|
853
|
+
}
|
|
854
|
+
if (typeValue === "latency") {
|
|
855
|
+
const threshold = rawEvaluator.threshold;
|
|
856
|
+
if (typeof threshold !== "number" || threshold < 0) {
|
|
857
|
+
logWarning2(
|
|
858
|
+
`Skipping latency evaluator '${name}' in '${evalId}': threshold must be a non-negative number`
|
|
859
|
+
);
|
|
860
|
+
continue;
|
|
861
|
+
}
|
|
862
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
863
|
+
evaluators.push({
|
|
864
|
+
name,
|
|
865
|
+
type: "latency",
|
|
866
|
+
threshold,
|
|
867
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
868
|
+
});
|
|
869
|
+
continue;
|
|
870
|
+
}
|
|
871
|
+
if (typeValue === "cost") {
|
|
872
|
+
const budget = rawEvaluator.budget;
|
|
873
|
+
if (typeof budget !== "number" || budget < 0) {
|
|
874
|
+
logWarning2(
|
|
875
|
+
`Skipping cost evaluator '${name}' in '${evalId}': budget must be a non-negative number`
|
|
876
|
+
);
|
|
877
|
+
continue;
|
|
878
|
+
}
|
|
879
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
880
|
+
evaluators.push({
|
|
881
|
+
name,
|
|
882
|
+
type: "cost",
|
|
883
|
+
budget,
|
|
884
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
885
|
+
});
|
|
886
|
+
continue;
|
|
887
|
+
}
|
|
888
|
+
if (typeValue === "token_usage") {
|
|
889
|
+
const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
|
|
890
|
+
const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
|
|
891
|
+
const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
|
|
892
|
+
const limits = [
|
|
893
|
+
["max_total", maxTotal],
|
|
894
|
+
["max_input", maxInput],
|
|
895
|
+
["max_output", maxOutput]
|
|
896
|
+
];
|
|
897
|
+
const validLimits = {};
|
|
898
|
+
for (const [key, raw] of limits) {
|
|
899
|
+
if (raw === void 0) continue;
|
|
900
|
+
if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
|
|
901
|
+
logWarning2(
|
|
902
|
+
`Skipping token_usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
|
|
903
|
+
);
|
|
904
|
+
continue;
|
|
905
|
+
}
|
|
906
|
+
validLimits[key] = raw;
|
|
907
|
+
}
|
|
908
|
+
if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
|
|
909
|
+
logWarning2(
|
|
910
|
+
`Skipping token_usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
|
|
911
|
+
);
|
|
912
|
+
continue;
|
|
913
|
+
}
|
|
914
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
915
|
+
evaluators.push({
|
|
916
|
+
name,
|
|
917
|
+
type: "token_usage",
|
|
918
|
+
...validLimits,
|
|
919
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
920
|
+
});
|
|
921
|
+
continue;
|
|
922
|
+
}
|
|
756
923
|
const prompt = asString2(rawEvaluator.prompt);
|
|
757
924
|
let promptPath;
|
|
758
925
|
if (prompt) {
|
|
@@ -823,6 +990,34 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
823
990
|
function asString2(value) {
|
|
824
991
|
return typeof value === "string" ? value : void 0;
|
|
825
992
|
}
|
|
993
|
+
function asStringArray(value, description) {
|
|
994
|
+
if (value === void 0) {
|
|
995
|
+
return void 0;
|
|
996
|
+
}
|
|
997
|
+
if (!Array.isArray(value)) {
|
|
998
|
+
throw new Error(`${description} must be an array of strings (argv tokens)`);
|
|
999
|
+
}
|
|
1000
|
+
if (value.length === 0) {
|
|
1001
|
+
throw new Error(`${description} cannot be empty`);
|
|
1002
|
+
}
|
|
1003
|
+
const result = [];
|
|
1004
|
+
for (const [index, entry] of value.entries()) {
|
|
1005
|
+
if (typeof entry !== "string") {
|
|
1006
|
+
throw new Error(`${description}[${index}] must be a string`);
|
|
1007
|
+
}
|
|
1008
|
+
if (entry.trim().length === 0) {
|
|
1009
|
+
throw new Error(`${description}[${index}] cannot be empty`);
|
|
1010
|
+
}
|
|
1011
|
+
result.push(entry);
|
|
1012
|
+
}
|
|
1013
|
+
return result;
|
|
1014
|
+
}
|
|
1015
|
+
function parseCommandToArgv(command) {
|
|
1016
|
+
if (process.platform === "win32") {
|
|
1017
|
+
return ["cmd.exe", "/c", command];
|
|
1018
|
+
}
|
|
1019
|
+
return ["sh", "-lc", command];
|
|
1020
|
+
}
|
|
826
1021
|
function isJsonObject2(value) {
|
|
827
1022
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
828
1023
|
}
|
|
@@ -856,6 +1051,14 @@ function validateWeight(rawWeight, evaluatorName, evalId) {
|
|
|
856
1051
|
}
|
|
857
1052
|
return rawWeight;
|
|
858
1053
|
}
|
|
1054
|
+
var VALID_FIELD_MATCH_TYPES = /* @__PURE__ */ new Set(["exact", "numeric_tolerance", "date"]);
|
|
1055
|
+
function isValidFieldMatchType(value) {
|
|
1056
|
+
return typeof value === "string" && VALID_FIELD_MATCH_TYPES.has(value);
|
|
1057
|
+
}
|
|
1058
|
+
var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average", "all_or_nothing"]);
|
|
1059
|
+
function isValidFieldAggregationType(value) {
|
|
1060
|
+
return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
|
|
1061
|
+
}
|
|
859
1062
|
|
|
860
1063
|
// src/evaluation/loaders/message-processor.ts
|
|
861
1064
|
var import_promises4 = require("fs/promises");
|
|
@@ -1930,92 +2133,993 @@ async function withRetry(fn, retryConfig, signal) {
|
|
|
1930
2133
|
throw lastError;
|
|
1931
2134
|
}
|
|
1932
2135
|
|
|
1933
|
-
// src/evaluation/providers/
|
|
2136
|
+
// src/evaluation/providers/claude-code.ts
|
|
1934
2137
|
var import_node_child_process = require("child_process");
|
|
1935
|
-
var
|
|
1936
|
-
var
|
|
1937
|
-
var
|
|
1938
|
-
var
|
|
1939
|
-
var
|
|
1940
|
-
|
|
1941
|
-
|
|
1942
|
-
|
|
1943
|
-
|
|
1944
|
-
|
|
1945
|
-
|
|
1946
|
-
|
|
1947
|
-
|
|
1948
|
-
|
|
2138
|
+
var import_node_crypto = require("crypto");
|
|
2139
|
+
var import_node_fs3 = require("fs");
|
|
2140
|
+
var import_promises8 = require("fs/promises");
|
|
2141
|
+
var import_node_os = require("os");
|
|
2142
|
+
var import_node_path9 = __toESM(require("path"), 1);
|
|
2143
|
+
|
|
2144
|
+
// src/evaluation/providers/claude-code-log-tracker.ts
|
|
2145
|
+
var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeCodeLogs");
|
|
2146
|
+
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.claudeCodeLogSubscribers");
|
|
2147
|
+
function getClaudeCodeLogStore() {
|
|
2148
|
+
const globalObject = globalThis;
|
|
2149
|
+
const existing = globalObject[GLOBAL_LOGS_KEY];
|
|
2150
|
+
if (existing) {
|
|
2151
|
+
return existing;
|
|
2152
|
+
}
|
|
2153
|
+
const created = [];
|
|
2154
|
+
globalObject[GLOBAL_LOGS_KEY] = created;
|
|
2155
|
+
return created;
|
|
2156
|
+
}
|
|
2157
|
+
function getSubscriberStore() {
|
|
2158
|
+
const globalObject = globalThis;
|
|
2159
|
+
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
|
|
2160
|
+
if (existing) {
|
|
2161
|
+
return existing;
|
|
2162
|
+
}
|
|
2163
|
+
const created = /* @__PURE__ */ new Set();
|
|
2164
|
+
globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
|
|
2165
|
+
return created;
|
|
2166
|
+
}
|
|
2167
|
+
function notifySubscribers(entry) {
|
|
2168
|
+
const subscribers = Array.from(getSubscriberStore());
|
|
2169
|
+
for (const listener of subscribers) {
|
|
2170
|
+
try {
|
|
2171
|
+
listener(entry);
|
|
2172
|
+
} catch (error) {
|
|
2173
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
2174
|
+
console.warn(`Claude Code log subscriber failed: ${message}`);
|
|
2175
|
+
}
|
|
2176
|
+
}
|
|
2177
|
+
}
|
|
2178
|
+
function recordClaudeCodeLogEntry(entry) {
|
|
2179
|
+
getClaudeCodeLogStore().push(entry);
|
|
2180
|
+
notifySubscribers(entry);
|
|
2181
|
+
}
|
|
2182
|
+
function consumeClaudeCodeLogEntries() {
|
|
2183
|
+
const store = getClaudeCodeLogStore();
|
|
2184
|
+
if (store.length === 0) {
|
|
2185
|
+
return [];
|
|
2186
|
+
}
|
|
2187
|
+
return store.splice(0, store.length);
|
|
2188
|
+
}
|
|
2189
|
+
function subscribeToClaudeCodeLogEntries(listener) {
|
|
2190
|
+
const store = getSubscriberStore();
|
|
2191
|
+
store.add(listener);
|
|
2192
|
+
return () => {
|
|
2193
|
+
store.delete(listener);
|
|
1949
2194
|
};
|
|
1950
|
-
|
|
1951
|
-
|
|
1952
|
-
|
|
1953
|
-
|
|
1954
|
-
|
|
1955
|
-
|
|
1956
|
-
|
|
1957
|
-
|
|
1958
|
-
|
|
1959
|
-
|
|
1960
|
-
|
|
1961
|
-
|
|
1962
|
-
|
|
1963
|
-
|
|
1964
|
-
|
|
1965
|
-
|
|
1966
|
-
failed: true,
|
|
1967
|
-
timedOut: execError.timedOut === true || execError.killed === true,
|
|
1968
|
-
signal: execError.signal ?? null
|
|
1969
|
-
};
|
|
2195
|
+
}
|
|
2196
|
+
|
|
2197
|
+
// src/evaluation/providers/preread.ts
|
|
2198
|
+
var import_node_path8 = __toESM(require("path"), 1);
|
|
2199
|
+
function buildPromptDocument(request, inputFiles, options) {
|
|
2200
|
+
const parts = [];
|
|
2201
|
+
const guidelineFiles = collectGuidelineFiles(
|
|
2202
|
+
inputFiles,
|
|
2203
|
+
options?.guidelinePatterns ?? request.guideline_patterns,
|
|
2204
|
+
options?.guidelineOverrides
|
|
2205
|
+
);
|
|
2206
|
+
const inputFilesList = collectInputFiles(inputFiles);
|
|
2207
|
+
const nonGuidelineInputFiles = inputFilesList.filter((file) => !guidelineFiles.includes(file));
|
|
2208
|
+
const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
|
|
2209
|
+
if (prereadBlock.length > 0) {
|
|
2210
|
+
parts.push("\n", prereadBlock);
|
|
1970
2211
|
}
|
|
2212
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
2213
|
+
return parts.join("\n").trim();
|
|
1971
2214
|
}
|
|
1972
|
-
|
|
1973
|
-
|
|
1974
|
-
|
|
1975
|
-
targetName;
|
|
1976
|
-
supportsBatch = true;
|
|
1977
|
-
config;
|
|
1978
|
-
runCommand;
|
|
1979
|
-
verbose;
|
|
1980
|
-
keepTempFiles;
|
|
1981
|
-
healthcheckPromise;
|
|
1982
|
-
constructor(targetName, config, runner = defaultCommandRunner) {
|
|
1983
|
-
this.targetName = targetName;
|
|
1984
|
-
this.id = `cli:${targetName}`;
|
|
1985
|
-
this.config = config;
|
|
1986
|
-
this.runCommand = runner;
|
|
1987
|
-
this.verbose = config.verbose ?? false;
|
|
1988
|
-
this.keepTempFiles = config.keepTempFiles ?? false;
|
|
2215
|
+
function normalizeInputFiles(inputFiles) {
|
|
2216
|
+
if (!inputFiles || inputFiles.length === 0) {
|
|
2217
|
+
return void 0;
|
|
1989
2218
|
}
|
|
1990
|
-
|
|
1991
|
-
|
|
1992
|
-
|
|
2219
|
+
const deduped = /* @__PURE__ */ new Map();
|
|
2220
|
+
for (const inputFile of inputFiles) {
|
|
2221
|
+
const absolutePath = import_node_path8.default.resolve(inputFile);
|
|
2222
|
+
if (!deduped.has(absolutePath)) {
|
|
2223
|
+
deduped.set(absolutePath, absolutePath);
|
|
1993
2224
|
}
|
|
1994
|
-
|
|
1995
|
-
|
|
1996
|
-
|
|
1997
|
-
|
|
1998
|
-
|
|
1999
|
-
|
|
2000
|
-
|
|
2001
|
-
|
|
2225
|
+
}
|
|
2226
|
+
return Array.from(deduped.values());
|
|
2227
|
+
}
|
|
2228
|
+
function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
|
|
2229
|
+
if (!inputFiles || inputFiles.length === 0) {
|
|
2230
|
+
return [];
|
|
2231
|
+
}
|
|
2232
|
+
const unique = /* @__PURE__ */ new Map();
|
|
2233
|
+
for (const inputFile of inputFiles) {
|
|
2234
|
+
const absolutePath = import_node_path8.default.resolve(inputFile);
|
|
2235
|
+
if (overrides?.has(absolutePath)) {
|
|
2236
|
+
if (!unique.has(absolutePath)) {
|
|
2237
|
+
unique.set(absolutePath, absolutePath);
|
|
2238
|
+
}
|
|
2239
|
+
continue;
|
|
2002
2240
|
}
|
|
2003
|
-
const
|
|
2004
|
-
|
|
2005
|
-
|
|
2006
|
-
|
|
2007
|
-
timeoutMs: this.config.timeoutMs,
|
|
2008
|
-
signal: request.signal
|
|
2009
|
-
});
|
|
2010
|
-
const measuredDurationMs = Date.now() - startTime;
|
|
2011
|
-
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
2012
|
-
if (request.signal?.aborted) {
|
|
2013
|
-
throw new Error("CLI provider request was aborted");
|
|
2241
|
+
const normalized = absolutePath.split(import_node_path8.default.sep).join("/");
|
|
2242
|
+
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
2243
|
+
if (!unique.has(absolutePath)) {
|
|
2244
|
+
unique.set(absolutePath, absolutePath);
|
|
2014
2245
|
}
|
|
2015
|
-
|
|
2016
|
-
|
|
2017
|
-
|
|
2018
|
-
|
|
2246
|
+
}
|
|
2247
|
+
}
|
|
2248
|
+
return Array.from(unique.values());
|
|
2249
|
+
}
|
|
2250
|
+
function collectInputFiles(inputFiles) {
|
|
2251
|
+
if (!inputFiles || inputFiles.length === 0) {
|
|
2252
|
+
return [];
|
|
2253
|
+
}
|
|
2254
|
+
const unique = /* @__PURE__ */ new Map();
|
|
2255
|
+
for (const inputFile of inputFiles) {
|
|
2256
|
+
const absolutePath = import_node_path8.default.resolve(inputFile);
|
|
2257
|
+
if (!unique.has(absolutePath)) {
|
|
2258
|
+
unique.set(absolutePath, absolutePath);
|
|
2259
|
+
}
|
|
2260
|
+
}
|
|
2261
|
+
return Array.from(unique.values());
|
|
2262
|
+
}
|
|
2263
|
+
function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
|
|
2264
|
+
if (guidelineFiles.length === 0 && inputFiles.length === 0) {
|
|
2265
|
+
return "";
|
|
2266
|
+
}
|
|
2267
|
+
const buildList = (files) => files.map((absolutePath) => {
|
|
2268
|
+
const fileName = import_node_path8.default.basename(absolutePath);
|
|
2269
|
+
const fileUri = pathToFileUri(absolutePath);
|
|
2270
|
+
return `* [${fileName}](${fileUri})`;
|
|
2271
|
+
});
|
|
2272
|
+
const sections = [];
|
|
2273
|
+
if (guidelineFiles.length > 0) {
|
|
2274
|
+
sections.push(`Read all guideline files:
|
|
2275
|
+
${buildList(guidelineFiles).join("\n")}.`);
|
|
2276
|
+
}
|
|
2277
|
+
if (inputFiles.length > 0) {
|
|
2278
|
+
sections.push(`Read all input files:
|
|
2279
|
+
${buildList(inputFiles).join("\n")}.`);
|
|
2280
|
+
}
|
|
2281
|
+
sections.push(
|
|
2282
|
+
"If any file is missing, fail with ERROR: missing-file <filename> and stop.",
|
|
2283
|
+
"Then apply system_instructions on the user query below."
|
|
2284
|
+
);
|
|
2285
|
+
return sections.join("\n");
|
|
2286
|
+
}
|
|
2287
|
+
function pathToFileUri(filePath) {
|
|
2288
|
+
const absolutePath = import_node_path8.default.isAbsolute(filePath) ? filePath : import_node_path8.default.resolve(filePath);
|
|
2289
|
+
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
2290
|
+
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
2291
|
+
return `file:///${normalizedPath}`;
|
|
2292
|
+
}
|
|
2293
|
+
return `file://${normalizedPath}`;
|
|
2294
|
+
}
|
|
2295
|
+
|
|
2296
|
+
// src/evaluation/providers/claude-code.ts
|
|
2297
|
+
var WORKSPACE_PREFIX = "agentv-claude-code-";
|
|
2298
|
+
var PROMPT_FILENAME = "prompt.md";
|
|
2299
|
+
var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
|
|
2300
|
+
- Do NOT create any additional output files in the workspace.
|
|
2301
|
+
- All intended file outputs/changes MUST be written in your response.
|
|
2302
|
+
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
2303
|
+
This is required for evaluation scoring.`;
|
|
2304
|
+
var ClaudeCodeProvider = class {
|
|
2305
|
+
id;
|
|
2306
|
+
kind = "claude-code";
|
|
2307
|
+
targetName;
|
|
2308
|
+
supportsBatch = false;
|
|
2309
|
+
config;
|
|
2310
|
+
runClaudeCode;
|
|
2311
|
+
constructor(targetName, config, runner = defaultClaudeCodeRunner) {
|
|
2312
|
+
this.id = `claude-code:${targetName}`;
|
|
2313
|
+
this.targetName = targetName;
|
|
2314
|
+
this.config = config;
|
|
2315
|
+
this.runClaudeCode = runner;
|
|
2316
|
+
}
|
|
2317
|
+
async invoke(request) {
|
|
2318
|
+
if (request.signal?.aborted) {
|
|
2319
|
+
throw new Error("Claude Code request was aborted before execution");
|
|
2320
|
+
}
|
|
2321
|
+
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
2322
|
+
const workspaceRoot = await this.createWorkspace();
|
|
2323
|
+
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
2324
|
+
try {
|
|
2325
|
+
const promptFile = import_node_path9.default.join(workspaceRoot, PROMPT_FILENAME);
|
|
2326
|
+
await (0, import_promises8.writeFile)(promptFile, request.question, "utf8");
|
|
2327
|
+
const args = this.buildClaudeCodeArgs(request.question, inputFiles);
|
|
2328
|
+
const cwd = this.resolveCwd();
|
|
2329
|
+
const result = await this.executeClaudeCode(args, cwd, request.signal, logger);
|
|
2330
|
+
if (result.timedOut) {
|
|
2331
|
+
throw new Error(
|
|
2332
|
+
`Claude Code CLI timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
|
|
2333
|
+
);
|
|
2334
|
+
}
|
|
2335
|
+
if (result.exitCode !== 0) {
|
|
2336
|
+
const detail = pickDetail(result.stderr, result.stdout);
|
|
2337
|
+
const prefix = `Claude Code CLI exited with code ${result.exitCode}`;
|
|
2338
|
+
if (isNestedClaudeCodeAuthError(result.stdout)) {
|
|
2339
|
+
throw new Error(
|
|
2340
|
+
`${prefix}: Claude Code detected a nested session and requires API key authentication. Set ANTHROPIC_API_KEY environment variable or run AgentV outside of a Claude Code session.`
|
|
2341
|
+
);
|
|
2342
|
+
}
|
|
2343
|
+
throw new Error(detail ? `${prefix}: ${detail}` : prefix);
|
|
2344
|
+
}
|
|
2345
|
+
const parsed = parseClaudeCodeJsonl(result.stdout);
|
|
2346
|
+
const outputMessages = extractOutputMessages(parsed);
|
|
2347
|
+
const usage = extractUsage(parsed);
|
|
2348
|
+
return {
|
|
2349
|
+
raw: {
|
|
2350
|
+
response: parsed,
|
|
2351
|
+
stdout: result.stdout,
|
|
2352
|
+
stderr: result.stderr,
|
|
2353
|
+
exitCode: result.exitCode,
|
|
2354
|
+
args,
|
|
2355
|
+
executable: this.config.executable,
|
|
2356
|
+
promptFile,
|
|
2357
|
+
workspace: workspaceRoot,
|
|
2358
|
+
inputFiles,
|
|
2359
|
+
logFile: logger?.filePath
|
|
2360
|
+
},
|
|
2361
|
+
outputMessages,
|
|
2362
|
+
usage
|
|
2363
|
+
};
|
|
2364
|
+
} finally {
|
|
2365
|
+
await logger?.close();
|
|
2366
|
+
await this.cleanupWorkspace(workspaceRoot);
|
|
2367
|
+
}
|
|
2368
|
+
}
|
|
2369
|
+
resolveCwd() {
|
|
2370
|
+
if (!this.config.cwd) {
|
|
2371
|
+
return process.cwd();
|
|
2372
|
+
}
|
|
2373
|
+
return import_node_path9.default.resolve(this.config.cwd);
|
|
2374
|
+
}
|
|
2375
|
+
buildClaudeCodeArgs(prompt, inputFiles) {
|
|
2376
|
+
const args = [];
|
|
2377
|
+
args.push("--output-format", "stream-json");
|
|
2378
|
+
args.push("--verbose");
|
|
2379
|
+
args.push("-p");
|
|
2380
|
+
if (this.config.model) {
|
|
2381
|
+
args.push("--model", this.config.model);
|
|
2382
|
+
}
|
|
2383
|
+
if (this.config.args && this.config.args.length > 0) {
|
|
2384
|
+
args.push(...this.config.args);
|
|
2385
|
+
}
|
|
2386
|
+
const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
|
|
2387
|
+
const fullPrompt = `${systemPrompt}
|
|
2388
|
+
|
|
2389
|
+
${prompt}`;
|
|
2390
|
+
let finalPrompt = fullPrompt;
|
|
2391
|
+
if (inputFiles && inputFiles.length > 0) {
|
|
2392
|
+
const filesContext = inputFiles.map((f) => `[File: ${f}]`).join("\n");
|
|
2393
|
+
finalPrompt = `${fullPrompt}
|
|
2394
|
+
|
|
2395
|
+
## Input Files
|
|
2396
|
+
${filesContext}`;
|
|
2397
|
+
}
|
|
2398
|
+
args.push(finalPrompt);
|
|
2399
|
+
return args;
|
|
2400
|
+
}
|
|
2401
|
+
buildEnv() {
|
|
2402
|
+
const env = { ...process.env };
|
|
2403
|
+
env.CLAUDECODE = void 0;
|
|
2404
|
+
env.CLAUDE_CODE_ENTRYPOINT = void 0;
|
|
2405
|
+
return env;
|
|
2406
|
+
}
|
|
2407
|
+
async executeClaudeCode(args, cwd, signal, logger) {
|
|
2408
|
+
try {
|
|
2409
|
+
return await this.runClaudeCode({
|
|
2410
|
+
executable: this.config.executable,
|
|
2411
|
+
args,
|
|
2412
|
+
cwd,
|
|
2413
|
+
timeoutMs: this.config.timeoutMs,
|
|
2414
|
+
env: this.buildEnv(),
|
|
2415
|
+
signal,
|
|
2416
|
+
onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
|
|
2417
|
+
onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
|
|
2418
|
+
});
|
|
2419
|
+
} catch (error) {
|
|
2420
|
+
const err = error;
|
|
2421
|
+
if (err.code === "ENOENT") {
|
|
2422
|
+
throw new Error(
|
|
2423
|
+
`Claude Code executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
|
|
2424
|
+
);
|
|
2425
|
+
}
|
|
2426
|
+
throw error;
|
|
2427
|
+
}
|
|
2428
|
+
}
|
|
2429
|
+
async createWorkspace() {
|
|
2430
|
+
return await (0, import_promises8.mkdtemp)(import_node_path9.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
|
|
2431
|
+
}
|
|
2432
|
+
async cleanupWorkspace(workspaceRoot) {
|
|
2433
|
+
try {
|
|
2434
|
+
await (0, import_promises8.rm)(workspaceRoot, { recursive: true, force: true });
|
|
2435
|
+
} catch {
|
|
2436
|
+
}
|
|
2437
|
+
}
|
|
2438
|
+
resolveLogDirectory() {
|
|
2439
|
+
const disabled = isClaudeCodeLogStreamingDisabled();
|
|
2440
|
+
if (disabled) {
|
|
2441
|
+
return void 0;
|
|
2442
|
+
}
|
|
2443
|
+
if (this.config.logDir) {
|
|
2444
|
+
return import_node_path9.default.resolve(this.config.logDir);
|
|
2445
|
+
}
|
|
2446
|
+
return import_node_path9.default.join(process.cwd(), ".agentv", "logs", "claude-code");
|
|
2447
|
+
}
|
|
2448
|
+
async createStreamLogger(request) {
|
|
2449
|
+
const logDir = this.resolveLogDirectory();
|
|
2450
|
+
if (!logDir) {
|
|
2451
|
+
return void 0;
|
|
2452
|
+
}
|
|
2453
|
+
try {
|
|
2454
|
+
await (0, import_promises8.mkdir)(logDir, { recursive: true });
|
|
2455
|
+
} catch (error) {
|
|
2456
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
2457
|
+
console.warn(`Skipping Claude Code stream logging (could not create ${logDir}): ${message}`);
|
|
2458
|
+
return void 0;
|
|
2459
|
+
}
|
|
2460
|
+
const filePath = import_node_path9.default.join(logDir, buildLogFilename(request, this.targetName));
|
|
2461
|
+
try {
|
|
2462
|
+
const logger = await ClaudeCodeStreamLogger.create({
|
|
2463
|
+
filePath,
|
|
2464
|
+
targetName: this.targetName,
|
|
2465
|
+
evalCaseId: request.evalCaseId,
|
|
2466
|
+
attempt: request.attempt,
|
|
2467
|
+
format: this.config.logFormat ?? "summary"
|
|
2468
|
+
});
|
|
2469
|
+
recordClaudeCodeLogEntry({
|
|
2470
|
+
filePath,
|
|
2471
|
+
targetName: this.targetName,
|
|
2472
|
+
evalCaseId: request.evalCaseId,
|
|
2473
|
+
attempt: request.attempt
|
|
2474
|
+
});
|
|
2475
|
+
return logger;
|
|
2476
|
+
} catch (error) {
|
|
2477
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
2478
|
+
console.warn(`Skipping Claude Code stream logging for ${filePath}: ${message}`);
|
|
2479
|
+
return void 0;
|
|
2480
|
+
}
|
|
2481
|
+
}
|
|
2482
|
+
};
|
|
2483
|
+
/**
 * Line-oriented logger that appends the stdout/stderr of a Claude Code CLI run
 * to a log file.
 *
 * Incoming chunks are buffered per source until a full line is available. Each
 * non-empty line is then written as `[+<elapsed>] [<source>] <message>`, where
 * the message is either a pretty-printed JSON document (`format === "json"`)
 * or a short summary produced by `formatClaudeCodeLogMessage`.
 */
var ClaudeCodeStreamLogger = class _ClaudeCodeStreamLogger {
  filePath;
  stream;
  startedAt = Date.now();
  stdoutBuffer = "";
  stderrBuffer = "";
  format;
  /**
   * @param {string} filePath - Log file to append to (opened with flag "a").
   * @param {string} format - "json" for pretty-printed JSON; any other value
   *   selects the summarized format.
   */
  constructor(filePath, format) {
    this.filePath = filePath;
    this.format = format;
    this.stream = (0, import_node_fs3.createWriteStream)(filePath, { flags: "a" });
  }
  /**
   * Creates a logger and writes the header block describing the run.
   *
   * @param {object} options - Reads `filePath`, `format`, `targetName`,
   *   `evalCaseId` (optional) and `attempt` (optional, zero-based).
   * @returns {Promise<_ClaudeCodeStreamLogger>} The ready-to-use logger.
   */
  static async create(options) {
    const logger = new _ClaudeCodeStreamLogger(options.filePath, options.format);
    const header = [
      "# Claude Code CLI stream log",
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
      ""
      // Drop only the omitted optional entries. Filtering on truthiness would
      // also discard the trailing "" that separates the header from the body.
    ].filter((line) => line !== void 0);
    logger.writeLines(header);
    return logger;
  }
  /** Buffers a stdout chunk and writes every complete line it finishes. */
  handleStdoutChunk(chunk) {
    this.stdoutBuffer += chunk;
    this.flushBuffer("stdout");
  }
  /** Buffers a stderr chunk and writes every complete line it finishes. */
  handleStderrChunk(chunk) {
    this.stderrBuffer += chunk;
    this.flushBuffer("stderr");
  }
  /**
   * Writes everything still buffered (including unterminated last lines) and
   * ends the underlying stream.
   *
   * @returns {Promise<void>} Resolves once the stream has ended; rejects if the
   *   stream emits "error" while closing.
   */
  async close() {
    this.flushBuffer("stdout");
    this.flushBuffer("stderr");
    this.flushRemainder();
    await new Promise((resolve, reject) => {
      this.stream.once("error", reject);
      this.stream.end(() => resolve());
    });
  }
  /** Writes each given line verbatim, followed by a newline. */
  writeLines(lines) {
    for (const line of lines) {
      this.stream.write(`${line}
`);
    }
  }
  /**
   * Formats one raw line and, when it is not blank, writes it followed by a
   * newline.
   */
  writeFormatted(rawLine, source) {
    const formatted = this.formatLine(rawLine, source);
    if (formatted) {
      this.stream.write(formatted);
      this.stream.write("\n");
    }
  }
  /**
   * Writes all complete lines buffered for `source` and keeps the trailing
   * partial line (text after the last newline) in the buffer.
   */
  flushBuffer(source) {
    const buffer = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
    const lines = buffer.split(/\r?\n/);
    const remainder = lines.pop() ?? "";
    if (source === "stdout") {
      this.stdoutBuffer = remainder;
    } else {
      this.stderrBuffer = remainder;
    }
    for (const line of lines) {
      this.writeFormatted(line, source);
    }
  }
  /**
   * Builds the log entry for one raw line.
   *
   * @returns {string | undefined} The formatted entry, or undefined for a
   *   blank line.
   */
  formatLine(rawLine, source) {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return void 0;
    }
    const message = this.format === "json" ? formatClaudeCodeJsonLog(trimmed) : formatClaudeCodeLogMessage(trimmed, source);
    return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`;
  }
  /** Writes unterminated trailing text of both buffers, then clears them. */
  flushRemainder() {
    const stdoutRemainder = this.stdoutBuffer.trim();
    if (stdoutRemainder.length > 0) {
      this.writeFormatted(stdoutRemainder, "stdout");
    }
    const stderrRemainder = this.stderrBuffer.trim();
    if (stderrRemainder.length > 0) {
      this.writeFormatted(stderrRemainder, "stderr");
    }
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
  }
};
|
|
2577
|
+
/**
 * Reports whether stream logging was switched off through the
 * AGENTV_CLAUDE_CODE_STREAM_LOGS environment variable.
 *
 * @returns {boolean} true only when the variable is set to "false", "0" or
 *   "off" (case-insensitive, surrounding whitespace ignored).
 */
function isClaudeCodeLogStreamingDisabled() {
  const raw = process.env.AGENTV_CLAUDE_CODE_STREAM_LOGS;
  if (!raw) {
    return false;
  }
  switch (raw.trim().toLowerCase()) {
    case "false":
    case "0":
    case "off":
      return true;
    default:
      return false;
  }
}
|
|
2585
|
+
/**
 * Builds a unique log file name of the form
 * `<timestamp>_<target>_<evalId>[_attempt-N]_<8 hex chars>.log`.
 *
 * @param {object} request - Reads `evalCaseId` (falls back to "claude-code")
 *   and the zero-based `attempt` (printed one-based when present).
 * @param {string} targetName - Target name; sanitized before use.
 * @returns {string} The file name (no directory part).
 */
function buildLogFilename(request, targetName) {
  // ":" and "." in the ISO timestamp are replaced with "-".
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
  const safeTarget = sanitizeForFilename(targetName);
  const safeEvalId = sanitizeForFilename(request.evalCaseId ?? "claude-code");
  const attemptPart = request.attempt === void 0 ? "" : `_attempt-${request.attempt + 1}`;
  const uniqueTag = (0, import_node_crypto.randomUUID)().slice(0, 8);
  return `${stamp}_${safeTarget}_${safeEvalId}${attemptPart}_${uniqueTag}.log`;
}
|
|
2592
|
+
/**
 * Makes a value safe to embed in a file name by collapsing every run of
 * characters outside `[A-Za-z0-9._-]` into a single underscore.
 *
 * @param {string} value - Text to sanitize.
 * @returns {string} The sanitized text, or "claude-code" when it is empty.
 */
function sanitizeForFilename(value) {
  const cleaned = value.replace(/[^A-Za-z0-9._-]+/g, "_");
  if (cleaned.length === 0) {
    return "claude-code";
  }
  return cleaned;
}
|
|
2596
|
+
/**
 * Formats the time elapsed since `startedAt` as "MM:SS", or "HH:MM:SS" once at
 * least one hour has passed.
 *
 * @param {number} startedAt - Start time in epoch milliseconds.
 * @returns {string} Zero-padded elapsed time (whole seconds, rounded down).
 */
function formatElapsed(startedAt) {
  const twoDigits = (amount) => amount.toString().padStart(2, "0");
  const totalSeconds = Math.floor((Date.now() - startedAt) / 1e3);
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor(totalSeconds % 3600 / 60);
  const seconds = totalSeconds % 60;
  const fields = hours > 0 ? [hours, minutes, seconds] : [minutes, seconds];
  return fields.map((field) => twoDigits(field)).join(":");
}
|
|
2606
|
+
/**
 * Produces the summarized log text for one raw output line.
 *
 * A line that parses as JSON and yields a summary is replaced by that summary.
 * Anything else is logged as-is, with a "stderr: " prefix for stderr lines.
 *
 * @param {string} rawLine - One trimmed line of CLI output.
 * @param {string} source - "stdout" or "stderr".
 * @returns {string} The text to log.
 */
function formatClaudeCodeLogMessage(rawLine, source) {
  const event = tryParseJsonValue(rawLine);
  const summary = event ? summarizeClaudeCodeEvent(event) : void 0;
  if (summary) {
    return summary;
  }
  return source === "stderr" ? `stderr: ${rawLine}` : rawLine;
}
|
|
2619
|
+
/**
 * Pretty-prints a raw output line when it is JSON.
 *
 * @param {string} rawLine - One trimmed line of CLI output.
 * @returns {string} The JSON re-serialized with 2-space indentation, or the
 *   line unchanged when it does not parse, parses to a falsy value, or cannot
 *   be serialized.
 */
function formatClaudeCodeJsonLog(rawLine) {
  const value = tryParseJsonValue(rawLine);
  if (value) {
    try {
      return JSON.stringify(value, null, 2);
    } catch {
      return rawLine;
    }
  }
  return rawLine;
}
|
|
2630
|
+
/**
 * Produces a short, single-line summary of one parsed Claude Code stream event.
 *
 * @param {unknown} event - A parsed JSON value from the CLI output.
 * @returns {string | undefined} The summary, or undefined when the value is
 *   not an object with a string `type`.
 *
 * Summaries by event type:
 * - "system": always "system: init".
 * - "assistant": names the tool when the first content part is a tool_use,
 *   or shows the first 50 characters when it is text; otherwise "assistant".
 * - "user": names the tool_use_id when the first content part is a
 *   tool_result; otherwise "user".
 * - "result": cost and duration when both are numbers; otherwise "result".
 * - anything else: the type itself.
 */
function summarizeClaudeCodeEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const record = event;
  const type = typeof record.type === "string" ? record.type : void 0;
  if (!type) {
    return void 0;
  }
  // Only the first content part of a message is summarized.
  const firstContentPart = () => {
    const content = record.message?.content;
    return Array.isArray(content) && content.length > 0 ? content[0] : void 0;
  };
  // Avoids printing the literal "undefined" when an identifier is missing.
  const label = (value) => typeof value === "string" ? value : "unknown";
  switch (type) {
    case "system":
      return "system: init";
    case "assistant": {
      const first = firstContentPart();
      if (first?.type === "tool_use") {
        return `assistant: tool_use (${label(first.name)})`;
      }
      if (first?.type === "text" && typeof first.text === "string") {
        const text = first.text;
        const preview = text.length > 50 ? `${text.slice(0, 50)}...` : text;
        return `assistant: ${preview}`;
      }
      return "assistant";
    }
    case "user": {
      const first = firstContentPart();
      if (first?.type === "tool_result") {
        return `user: tool_result (${label(first.tool_use_id)})`;
      }
      return "user";
    }
    case "result": {
      // `cost_usd` keeps precedence for backward compatibility; result events
      // that only carry `total_cost_usd` are summarized with that value.
      const cost = typeof record.cost_usd === "number" ? record.cost_usd : record.total_cost_usd;
      const duration = record.duration_ms;
      if (typeof cost === "number" && typeof duration === "number") {
        return `result: $${cost.toFixed(4)}, ${Math.round(duration)}ms`;
      }
      return "result";
    }
    default:
      return type;
  }
}
|
|
2687
|
+
/**
 * Parses a line as JSON without throwing.
 *
 * @param {string} rawLine - Candidate JSON text.
 * @returns {unknown} The parsed value, or undefined when the text is not
 *   valid JSON.
 */
function tryParseJsonValue(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    value = void 0;
  }
  return value;
}
|
|
2694
|
+
/**
 * Parses the JSONL output of the Claude Code CLI into a list of events.
 *
 * Blank lines and lines that are not valid JSON are skipped.
 *
 * @param {string} output - Complete stdout of the CLI.
 * @returns {unknown[]} The parsed values, in output order.
 * @throws {Error} When the output is empty or contains no valid JSON line.
 */
function parseClaudeCodeJsonl(output) {
  const text = output.trim();
  if (text.length === 0) {
    throw new Error("Claude Code CLI produced no output");
  }
  const events = text.split(/\r?\n/).reduce((collected, rawLine) => {
    const candidate = rawLine.trim();
    if (candidate.length > 0) {
      try {
        collected.push(JSON.parse(candidate));
      } catch {
        // Non-JSON lines are deliberately ignored.
      }
    }
    return collected;
  }, []);
  if (events.length === 0) {
    throw new Error("Claude Code CLI produced no valid JSON output");
  }
  return events;
}
|
|
2712
|
+
/**
 * Converts the "assistant" and "user" events of a run into output messages.
 *
 * @param {Iterable<unknown>} events - Parsed stream events.
 * @returns {object[]} One converted message per qualifying event that carries
 *   a truthy `message`, in event order.
 */
function extractOutputMessages(events) {
  const collected = [];
  for (const event of events) {
    if (!event || typeof event !== "object") {
      continue;
    }
    const kind = event.type;
    if (kind !== "assistant" && kind !== "user") {
      continue;
    }
    const payload = event.message;
    if (!payload) {
      continue;
    }
    const converted = convertClaudeCodeMessage(payload, kind);
    if (converted) {
      collected.push(converted);
    }
  }
  return collected;
}
|
|
2732
|
+
/**
 * Converts one Claude Code message into the output-message shape.
 *
 * @param {object} message - The event's `message`; only `content` is read.
 * @param {string} type - Event type; "assistant" maps to the assistant role,
 *   anything else to "user".
 * @returns {{ role: string, content: unknown, toolCalls: object[] | undefined }}
 *   `toolCalls` is undefined when the message contains none.
 */
function convertClaudeCodeMessage(message, type) {
  const rawContent = message.content;
  const text = extractTextContent(rawContent);
  const calls = extractToolCalls(rawContent);
  return {
    role: type === "assistant" ? "assistant" : "user",
    content: text,
    toolCalls: calls.length > 0 ? calls : void 0
  };
}
|
|
2742
|
+
/**
 * Extracts the plain text of a message's content.
 *
 * @param {unknown} content - Either a string or an array of content parts.
 * @returns {string | undefined} The string itself; or the `text` of all
 *   `type: "text"` parts joined with newlines; or undefined when there is no
 *   text (or the content is neither a string nor an array).
 */
function extractTextContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const pieces = content.filter((part) => part && typeof part === "object" && part.type === "text" && typeof part.text === "string").map((part) => part.text);
  return pieces.length === 0 ? void 0 : pieces.join("\n");
}
|
|
2761
|
+
/**
 * Collects tool activity from a message's content parts.
 *
 * @param {unknown} content - Message content; anything but an array yields [].
 * @returns {object[]} In part order: `{ tool, input, id }` for each tool_use
 *   part with a string name, and `{ tool: "tool_result", output, id }` for
 *   each tool_result part with a string tool_use_id.
 */
function extractToolCalls(content) {
  if (!Array.isArray(content)) {
    return [];
  }
  const calls = [];
  for (const part of content) {
    if (!part || typeof part !== "object") {
      continue;
    }
    switch (part.type) {
      case "tool_use":
        if (typeof part.name === "string") {
          calls.push({
            tool: part.name,
            input: part.input,
            id: typeof part.id === "string" ? part.id : void 0
          });
        }
        break;
      case "tool_result":
        if (typeof part.tool_use_id === "string") {
          calls.push({
            tool: "tool_result",
            output: part.content,
            id: part.tool_use_id
          });
        }
        break;
      default:
        break;
    }
  }
  return calls;
}
|
|
2788
|
+
/**
 * Extracts usage metrics from the last "result" event of a run.
 *
 * Each metric is read from the flat field the result event may carry
 * (`cost_usd`, `input_tokens`, `output_tokens`). When that field is absent it
 * falls back to `total_cost_usd` and to the nested `usage.input_tokens` /
 * `usage.output_tokens`, so result events that report those names are not
 * silently dropped.
 *
 * @param {unknown[]} events - Parsed stream events, in output order.
 * @returns {object | undefined} An object with any of `cost_usd`,
 *   `duration_ms`, `duration_api_ms`, `input_tokens`, `output_tokens`
 *   (numbers) and `session_id` (string); undefined when there is no result
 *   event or the last one carries none of these fields.
 */
function extractUsage(events) {
  // First candidate that is a number, or undefined.
  const firstNumber = (...candidates) => candidates.find((candidate) => typeof candidate === "number");
  for (let i = events.length - 1; i >= 0; i--) {
    const event = events[i];
    if (!event || typeof event !== "object") {
      continue;
    }
    const record = event;
    if (record.type !== "result") {
      continue;
    }
    const nested = record.usage && typeof record.usage === "object" ? record.usage : {};
    const numericFields = {
      cost_usd: firstNumber(record.cost_usd, record.total_cost_usd),
      duration_ms: firstNumber(record.duration_ms),
      duration_api_ms: firstNumber(record.duration_api_ms),
      input_tokens: firstNumber(record.input_tokens, nested.input_tokens),
      output_tokens: firstNumber(record.output_tokens, nested.output_tokens)
    };
    const usage = {};
    for (const [key, value] of Object.entries(numericFields)) {
      if (value !== void 0) {
        usage[key] = value;
      }
    }
    if (typeof record.session_id === "string") {
      usage.session_id = record.session_id;
    }
    // Only the last result event is considered, even when it is empty.
    return Object.keys(usage).length > 0 ? usage : void 0;
  }
  return void 0;
}
|
|
2821
|
+
/**
 * Chooses the text that best explains a failed run.
 *
 * @param {string} stderr - Captured stderr.
 * @param {string} stdout - Captured stdout.
 * @returns {string | undefined} Trimmed stderr when it is non-empty, else
 *   trimmed stdout when that is non-empty, else undefined.
 */
function pickDetail(stderr, stdout) {
  for (const candidate of [stderr, stdout]) {
    const text = candidate.trim();
    if (text.length > 0) {
      return text;
    }
  }
  return void 0;
}
|
|
2829
|
+
/**
 * Builds the " after Ns" suffix used in timeout error messages.
 *
 * @param {number | undefined} timeoutMs - Configured timeout in milliseconds.
 * @returns {string} " after <seconds>s" (seconds rounded up), or "" when no
 *   positive timeout is configured.
 */
function formatTimeoutSuffix(timeoutMs) {
  const noTimeout = !timeoutMs || timeoutMs <= 0;
  return noTimeout ? "" : ` after ${Math.ceil(timeoutMs / 1e3)}s`;
}
|
|
2836
|
+
/**
 * Detects the failure signature checked for nested Claude Code runs: the
 * output contains both a system event whose `apiKeySource` is
 * "ANTHROPIC_API_KEY" and an error event (`error === "authentication_failed"`,
 * or a result event with a truthy `is_error`).
 *
 * Lines that are blank, not JSON, or not inspectable are ignored.
 *
 * @param {string} stdout - Complete stdout of the CLI.
 * @returns {boolean} true when both conditions were seen; false otherwise,
 *   including when the input cannot be processed at all.
 */
function isNestedClaudeCodeAuthError(stdout) {
  try {
    let sawApiKeySource = false;
    let sawAuthFailure = false;
    const candidates = stdout.split("\n").map((rawLine) => rawLine.trim()).filter((text) => text);
    for (const text of candidates) {
      try {
        const event = JSON.parse(text);
        const isKeyedSystemEvent = event.type === "system" && event.apiKeySource === "ANTHROPIC_API_KEY";
        if (isKeyedSystemEvent) {
          sawApiKeySource = true;
        }
        const isFailure = event.error === "authentication_failed" || event.type === "result" && event.is_error;
        if (isFailure) {
          sawAuthFailure = true;
        }
      } catch {
        // Unparseable or non-object lines do not affect the verdict.
      }
    }
    return sawApiKeySource && sawAuthFailure;
  } catch {
    return false;
  }
}
|
|
2860
|
+
/**
 * Quotes a string for use as a single argument in a POSIX shell command.
 *
 * The value is wrapped in single quotes; each embedded single quote is
 * written as `'\''` (close quote, escaped quote, reopen quote).
 *
 * @param {string} arg - Raw argument.
 * @returns {string} The single-quoted argument.
 */
function escapeShellArg(arg) {
  const escaped = arg.split("'").join("'\\''");
  return `'${escaped}'`;
}
|
|
2863
|
+
/**
 * Default runner for the Claude Code CLI.
 *
 * Allocates four uniquely named files in the OS temp directory (stdout,
 * stderr, exit code, pid), delegates the run to
 * `runClaudeCodeWithTempFiles`, and removes the files afterwards whether the
 * run succeeded or threw.
 *
 * @param {object} options - Run options, forwarded unchanged.
 * @returns {Promise<object>} Whatever `runClaudeCodeWithTempFiles` returns.
 */
async function defaultClaudeCodeRunner(options) {
  const runId = (0, import_node_crypto.randomUUID)();
  const tempPath = (suffix) => import_node_path9.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${runId}-${suffix}`);
  const stdoutFile = tempPath("stdout");
  const stderrFile = tempPath("stderr");
  const exitFile = tempPath("exit");
  const pidFile = tempPath("pid");
  const tempFiles = [stdoutFile, stderrFile, exitFile, pidFile];
  try {
    return await runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile);
  } finally {
    for (const tempFile of tempFiles) {
      try {
        await (0, import_promises8.rm)(tempFile, { force: true });
      } catch {
        // Cleanup is best-effort; a leftover temp file must not mask the result.
      }
    }
  }
}
|
|
2880
|
+
/**
 * Runs the Claude Code CLI in a detached shell and collects its result through
 * temp files rather than pipes.
 *
 * A `bash` script (started via `setsid`) launches the command in the
 * background with stdout/stderr redirected to files, records the command's pid
 * in `pidFile`, waits for it, and finally writes its exit status to
 * `exitFile`. This function polls every 100 ms until the exit file appears,
 * the timeout elapses, the abort signal fires, or the shell fails to start.
 *
 * @param {object} options - Reads `executable` (may contain leading arguments
 *   separated by whitespace), `args`, `cwd`, `env`, `timeoutMs`, `signal`,
 *   `onStdoutChunk` and `onStderrChunk`.
 * @param {string} stdoutFile - File receiving the command's stdout.
 * @param {string} stderrFile - File receiving the command's stderr.
 * @param {string} exitFile - File receiving the command's exit status.
 * @param {string} pidFile - File receiving the command's pid.
 * @returns {Promise<{ stdout: string, stderr: string, exitCode: number, timedOut: boolean }>}
 *   `exitCode` is -1 when no exit status was recorded (timeout, abort, or
 *   failure to start).
 */
async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile) {
  const parts = options.executable.split(/\s+/);
  const executable = parts[0];
  const executableArgs = parts.slice(1);
  const allArgs = [...executableArgs, ...options.args];
  const escapedArgs = allArgs.map((arg) => escapeShellArg(arg));
  const fullCommand = [escapeShellArg(executable), ...escapedArgs].join(" ");
  const bashScript = `
unset CLAUDECODE CLAUDE_CODE_ENTRYPOINT 2>/dev/null
${fullCommand} >${escapeShellArg(stdoutFile)} 2>${escapeShellArg(stderrFile)} &
CHILD_PID=$!
echo $CHILD_PID > ${escapeShellArg(pidFile)}
wait $CHILD_PID
echo $? > ${escapeShellArg(exitFile)}
`;
  const child = (0, import_node_child_process.spawn)("setsid", ["bash", "-c", bashScript], {
    cwd: options.cwd,
    env: options.env,
    detached: true,
    stdio: "ignore"
  });
  // A ChildProcess without an "error" listener turns a failed spawn (e.g.
  // `setsid` not installed, invalid cwd) into an uncaught exception, and the
  // exit file would never appear. Record the failure and report it instead.
  let spawnError;
  child.once("error", (error) => {
    spawnError = error;
  });
  child.unref();
  const pollInterval = 100;
  const startTime = Date.now();
  let timedOut = false;
  let lastStdoutSize = 0;
  // Missing or unreadable files read as "".
  const readFileIfExists = async (filePath) => {
    try {
      const { readFile: readFile8 } = await import("fs/promises");
      return await readFile8(filePath, "utf8");
    } catch {
      return "";
    }
  };
  const fileExists4 = async (filePath) => {
    try {
      const { access: access5 } = await import("fs/promises");
      await access5(filePath);
      return true;
    } catch {
      return false;
    }
  };
  // Best-effort SIGTERM to the pid recorded by the script; does nothing when
  // the pid file has not been written yet or the process is already gone.
  const killProcess = async () => {
    try {
      const pid = await readFileIfExists(pidFile);
      if (pid.trim()) {
        process.kill(Number.parseInt(pid.trim(), 10), "SIGTERM");
      }
    } catch {
    }
  };
  if (options.signal?.aborted) {
    await killProcess();
    return { stdout: "", stderr: "Aborted", exitCode: -1, timedOut: false };
  }
  const abortHandler = () => {
    killProcess().catch(() => {
    });
  };
  options.signal?.addEventListener("abort", abortHandler, { once: true });
  try {
    while (true) {
      if (spawnError) {
        break;
      }
      if (options.timeoutMs && Date.now() - startTime > options.timeoutMs) {
        timedOut = true;
        await killProcess();
        break;
      }
      if (options.signal?.aborted) {
        await killProcess();
        break;
      }
      if (options.onStdoutChunk) {
        // Forward only the part of stdout that was not delivered yet.
        const currentStdout = await readFileIfExists(stdoutFile);
        if (currentStdout.length > lastStdoutSize) {
          options.onStdoutChunk(currentStdout.slice(lastStdoutSize));
          lastStdoutSize = currentStdout.length;
        }
      }
      if (await fileExists4(exitFile)) {
        break;
      }
      await new Promise((resolve) => setTimeout(resolve, pollInterval));
    }
    const stdout = await readFileIfExists(stdoutFile);
    if (spawnError) {
      const reason = spawnError instanceof Error ? spawnError.message : String(spawnError);
      const failure = `Failed to start Claude Code CLI: ${reason}`;
      if (options.onStderrChunk) {
        options.onStderrChunk(failure);
      }
      return { stdout, stderr: failure, exitCode: -1, timedOut: false };
    }
    const stderr = await readFileIfExists(stderrFile);
    const exitCodeStr = await readFileIfExists(exitFile);
    const exitCode = exitCodeStr.trim() ? Number.parseInt(exitCodeStr.trim(), 10) : -1;
    if (options.onStdoutChunk && stdout.length > lastStdoutSize) {
      options.onStdoutChunk(stdout.slice(lastStdoutSize));
    }
    // stderr is delivered once, in full, after the run has ended.
    if (options.onStderrChunk && stderr) {
      options.onStderrChunk(stderr);
    }
    return { stdout, stderr, exitCode, timedOut };
  } finally {
    options.signal?.removeEventListener("abort", abortHandler);
  }
}
|
|
2979
|
+
|
|
2980
|
+
// src/evaluation/providers/cli.ts
|
|
2981
|
+
var import_node_child_process2 = require("child_process");
|
|
2982
|
+
var import_promises9 = __toESM(require("fs/promises"), 1);
|
|
2983
|
+
var import_node_os2 = __toESM(require("os"), 1);
|
|
2984
|
+
var import_node_path10 = __toESM(require("path"), 1);
|
|
2985
|
+
var import_node_util = require("util");
|
|
2986
|
+
var import_zod = require("zod");
|
|
2987
|
+
// Schema of one tool call inside an output message. Only `tool` is required.
var ToolCallSchema = import_zod.z.object({
  tool: import_zod.z.string(),
  input: import_zod.z.unknown().optional(),
  output: import_zod.z.unknown().optional(),
  id: import_zod.z.string().optional(),
  timestamp: import_zod.z.string().optional()
});
// Schema of one output message as it appears in CLI output (snake_case keys,
// e.g. `tool_calls`). Only `role` is required.
var OutputMessageInputSchema = import_zod.z.object({
  role: import_zod.z.string(),
  name: import_zod.z.string().optional(),
  content: import_zod.z.unknown().optional(),
  tool_calls: import_zod.z.array(ToolCallSchema).optional(),
  timestamp: import_zod.z.string().optional(),
  metadata: import_zod.z.record(import_zod.z.unknown()).optional()
});
// Schema of the token usage object: numeric `input` and `output` are required,
// `cached` is optional.
var TokenUsageSchema = import_zod.z.object({
  input: import_zod.z.number(),
  output: import_zod.z.number(),
  cached: import_zod.z.number().optional()
});
// Schema of a complete CLI output document. Every field is optional.
var CliOutputSchema = import_zod.z.object({
  text: import_zod.z.unknown().optional(),
  output_messages: import_zod.z.array(OutputMessageInputSchema).optional(),
  token_usage: TokenUsageSchema.optional(),
  cost_usd: import_zod.z.number().optional(),
  duration_ms: import_zod.z.number().optional()
});
// Schema of one JSONL record: a CLI output document plus a required,
// non-empty string `id`.
var CliJsonlRecordSchema = CliOutputSchema.extend({
  id: import_zod.z.string().min(1)
});
|
|
3017
|
+
/**
 * Discards negative cost/duration metrics reported by a CLI.
 *
 * @param {number | undefined} costUsd - Reported cost in USD.
 * @param {number | undefined} durationMs - Reported duration in milliseconds.
 * @param {string} context - Label included in the warning message.
 * @returns {{ costUsd: number | undefined, durationMs: number | undefined }}
 *   Each value unchanged, or undefined (after a console warning) when it was
 *   negative.
 */
function validateMetrics(costUsd, durationMs, context) {
  const dropIfNegative = (value, fieldName) => {
    if (value !== void 0 && value < 0) {
      console.warn(`[cli-provider] ${context}: ignoring negative ${fieldName} value (${value})`);
      return void 0;
    }
    return value;
  };
  return {
    costUsd: dropIfNegative(costUsd, "cost_usd"),
    durationMs: dropIfNegative(durationMs, "duration_ms")
  };
}
|
|
3030
|
+
/**
 * Converts output messages from their CLI shape (`tool_calls`) to the
 * internal shape (`toolCalls`).
 *
 * @param {object[] | undefined} messages - Messages parsed from CLI output.
 * @returns {object[] | undefined} The converted messages, or undefined when
 *   the input is missing or empty.
 */
function convertOutputMessages(messages) {
  if (!messages || messages.length === 0) {
    return void 0;
  }
  const converted = [];
  for (const message of messages) {
    const { role, name, content, tool_calls: toolCalls, timestamp, metadata } = message;
    converted.push({ role, name, content, toolCalls, timestamp, metadata });
  }
  return converted;
}
|
|
3043
|
+
// Promise-returning wrapper around child_process.exec.
var execAsync = (0, import_node_util.promisify)(import_node_child_process2.exec);
// Passed as exec's `maxBuffer` by defaultCommandRunner: 10 MiB.
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
|
|
3045
|
+
/**
 * Default command runner for the CLI provider: executes a shell command and
 * reports its outcome without throwing.
 *
 * On Windows the command is run through `powershell.exe`; elsewhere exec's
 * default shell is used.
 *
 * @param {string} command - Fully rendered command line.
 * @param {object} options - Reads `cwd`, `env`, `timeoutMs` and `signal`.
 * @returns {Promise<object>} `{ stdout, stderr, exitCode, failed, timedOut,
 *   signal }`. On failure `exitCode` is the numeric error code or null, and
 *   `timedOut` is true when the error is flagged `timedOut` or `killed`.
 */
async function defaultCommandRunner(command, options) {
  const { cwd, env, timeoutMs, signal } = options;
  const execOptions = {
    cwd,
    env,
    timeout: timeoutMs,
    signal,
    maxBuffer: DEFAULT_MAX_BUFFER,
    shell: process.platform === "win32" ? "powershell.exe" : void 0
  };
  let completed;
  try {
    completed = await execAsync(command, execOptions);
  } catch (error) {
    const failure = error;
    const wasKilled = failure.timedOut === true || failure.killed === true;
    return {
      stdout: failure.stdout ?? "",
      stderr: failure.stderr ?? "",
      exitCode: typeof failure.code === "number" ? failure.code : null,
      failed: true,
      timedOut: wasKilled,
      signal: failure.signal ?? null
    };
  }
  return {
    stdout: completed.stdout,
    stderr: completed.stderr,
    exitCode: 0,
    failed: false,
    timedOut: false,
    signal: null
  };
}
|
|
3076
|
+
var CliProvider = class {
|
|
3077
|
+
id;
|
|
3078
|
+
kind = "cli";
|
|
3079
|
+
targetName;
|
|
3080
|
+
supportsBatch = true;
|
|
3081
|
+
config;
|
|
3082
|
+
runCommand;
|
|
3083
|
+
verbose;
|
|
3084
|
+
keepTempFiles;
|
|
3085
|
+
healthcheckPromise;
|
|
3086
|
+
constructor(targetName, config, runner = defaultCommandRunner) {
|
|
3087
|
+
this.targetName = targetName;
|
|
3088
|
+
this.id = `cli:${targetName}`;
|
|
3089
|
+
this.config = config;
|
|
3090
|
+
this.runCommand = runner;
|
|
3091
|
+
this.verbose = config.verbose ?? false;
|
|
3092
|
+
this.keepTempFiles = config.keepTempFiles ?? false;
|
|
3093
|
+
}
|
|
3094
|
+
async invoke(request) {
|
|
3095
|
+
if (request.signal?.aborted) {
|
|
3096
|
+
throw new Error("CLI provider request was aborted before execution");
|
|
3097
|
+
}
|
|
3098
|
+
await this.ensureHealthy(request.signal);
|
|
3099
|
+
const outputFilePath = generateOutputFilePath(request.evalCaseId);
|
|
3100
|
+
const templateValues = buildTemplateValues(request, this.config, outputFilePath);
|
|
3101
|
+
const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
|
|
3102
|
+
if (this.verbose) {
|
|
3103
|
+
console.log(
|
|
3104
|
+
`[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
3105
|
+
);
|
|
3106
|
+
}
|
|
3107
|
+
const startTime = Date.now();
|
|
3108
|
+
const result = await this.runCommand(renderedCommand, {
|
|
3109
|
+
cwd: this.config.cwd,
|
|
3110
|
+
env: process.env,
|
|
3111
|
+
timeoutMs: this.config.timeoutMs,
|
|
3112
|
+
signal: request.signal
|
|
3113
|
+
});
|
|
3114
|
+
const measuredDurationMs = Date.now() - startTime;
|
|
3115
|
+
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
3116
|
+
if (request.signal?.aborted) {
|
|
3117
|
+
throw new Error("CLI provider request was aborted");
|
|
3118
|
+
}
|
|
3119
|
+
if (result.timedOut) {
|
|
3120
|
+
throw new Error(
|
|
3121
|
+
`CLI provider timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
|
|
3122
|
+
);
|
|
2019
3123
|
}
|
|
2020
3124
|
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
2021
3125
|
const detail = result.stderr.trim() || result.stdout.trim();
|
|
@@ -2090,7 +3194,7 @@ var CliProvider = class {
|
|
|
2090
3194
|
}
|
|
2091
3195
|
if (result.timedOut) {
|
|
2092
3196
|
throw new Error(
|
|
2093
|
-
`CLI provider timed out${
|
|
3197
|
+
`CLI provider timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
|
|
2094
3198
|
);
|
|
2095
3199
|
}
|
|
2096
3200
|
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
@@ -2100,11 +3204,6 @@ var CliProvider = class {
|
|
|
2100
3204
|
}
|
|
2101
3205
|
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
2102
3206
|
const recordsById = this.parseJsonlBatchOutput(responseContent);
|
|
2103
|
-
const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
|
|
2104
|
-
const missingIds = requestedIds.filter((id) => !recordsById.has(id));
|
|
2105
|
-
if (missingIds.length > 0) {
|
|
2106
|
-
throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
|
|
2107
|
-
}
|
|
2108
3207
|
const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
|
|
2109
3208
|
const responses = requests.map((request) => {
|
|
2110
3209
|
const evalCaseId = request.evalCaseId;
|
|
@@ -2123,15 +3222,20 @@ var CliProvider = class {
|
|
|
2123
3222
|
}
|
|
2124
3223
|
const parsed = recordsById.get(evalCaseId);
|
|
2125
3224
|
if (!parsed) {
|
|
3225
|
+
const errorMessage = `Batch output missing id '${evalCaseId}'`;
|
|
3226
|
+
if (this.verbose) {
|
|
3227
|
+
console.warn(`[cli-provider:${this.targetName}] ${errorMessage}`);
|
|
3228
|
+
}
|
|
2126
3229
|
return {
|
|
2127
|
-
outputMessages: [],
|
|
3230
|
+
outputMessages: [{ role: "assistant", content: `Error: ${errorMessage}` }],
|
|
2128
3231
|
durationMs: perRequestFallbackMs,
|
|
2129
3232
|
raw: {
|
|
2130
3233
|
command: renderedCommand,
|
|
2131
3234
|
stderr: result.stderr,
|
|
2132
3235
|
exitCode: result.exitCode ?? 0,
|
|
2133
3236
|
cwd: this.config.cwd,
|
|
2134
|
-
outputFile: outputFilePath
|
|
3237
|
+
outputFile: outputFilePath,
|
|
3238
|
+
error: errorMessage
|
|
2135
3239
|
}
|
|
2136
3240
|
};
|
|
2137
3241
|
}
|
|
@@ -2164,101 +3268,37 @@ var CliProvider = class {
|
|
|
2164
3268
|
* - duration_ms: number
|
|
2165
3269
|
*/
|
|
2166
3270
|
parseOutputContent(content) {
|
|
3271
|
+
let parsed;
|
|
2167
3272
|
try {
|
|
2168
|
-
|
|
2169
|
-
if (typeof parsed === "object" && parsed !== null) {
|
|
2170
|
-
const obj = parsed;
|
|
2171
|
-
const tokenUsage = this.parseTokenUsage(obj.token_usage);
|
|
2172
|
-
const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
|
|
2173
|
-
const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
|
|
2174
|
-
const outputMessages = this.parseOutputMessages(obj.output_messages);
|
|
2175
|
-
if (outputMessages && outputMessages.length > 0) {
|
|
2176
|
-
return { outputMessages, tokenUsage, costUsd, durationMs };
|
|
2177
|
-
}
|
|
2178
|
-
if ("text" in obj) {
|
|
2179
|
-
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
2180
|
-
return {
|
|
2181
|
-
outputMessages: [{ role: "assistant", content: text }],
|
|
2182
|
-
tokenUsage,
|
|
2183
|
-
costUsd,
|
|
2184
|
-
durationMs
|
|
2185
|
-
};
|
|
2186
|
-
}
|
|
2187
|
-
}
|
|
3273
|
+
parsed = JSON.parse(content);
|
|
2188
3274
|
} catch {
|
|
3275
|
+
return { outputMessages: [{ role: "assistant", content }] };
|
|
2189
3276
|
}
|
|
2190
|
-
|
|
2191
|
-
|
|
2192
|
-
|
|
2193
|
-
* Parse token_usage from CLI output.
|
|
2194
|
-
*/
|
|
2195
|
-
parseTokenUsage(tokenUsage) {
|
|
2196
|
-
if (typeof tokenUsage !== "object" || tokenUsage === null) {
|
|
2197
|
-
return void 0;
|
|
2198
|
-
}
|
|
2199
|
-
const obj = tokenUsage;
|
|
2200
|
-
if (typeof obj.input !== "number" || typeof obj.output !== "number") {
|
|
2201
|
-
return void 0;
|
|
2202
|
-
}
|
|
2203
|
-
return {
|
|
2204
|
-
input: obj.input,
|
|
2205
|
-
output: obj.output,
|
|
2206
|
-
cached: typeof obj.cached === "number" ? obj.cached : void 0
|
|
2207
|
-
};
|
|
2208
|
-
}
|
|
2209
|
-
/**
|
|
2210
|
-
* Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
|
|
2211
|
-
*/
|
|
2212
|
-
parseOutputMessages(outputMessages) {
|
|
2213
|
-
if (!Array.isArray(outputMessages)) {
|
|
2214
|
-
return void 0;
|
|
3277
|
+
const result = CliOutputSchema.safeParse(parsed);
|
|
3278
|
+
if (!result.success) {
|
|
3279
|
+
return { outputMessages: [{ role: "assistant", content }] };
|
|
2215
3280
|
}
|
|
2216
|
-
const
|
|
2217
|
-
|
|
2218
|
-
|
|
2219
|
-
|
|
2220
|
-
|
|
2221
|
-
|
|
2222
|
-
|
|
2223
|
-
|
|
2224
|
-
|
|
2225
|
-
const message = {
|
|
2226
|
-
role: rawMsg.role,
|
|
2227
|
-
name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
|
|
2228
|
-
content: rawMsg.content,
|
|
2229
|
-
toolCalls: this.parseToolCalls(rawMsg.tool_calls),
|
|
2230
|
-
timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
|
|
2231
|
-
metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
|
|
3281
|
+
const obj = result.data;
|
|
3282
|
+
const metrics = validateMetrics(obj.cost_usd, obj.duration_ms, "parsing output");
|
|
3283
|
+
const outputMessages = convertOutputMessages(obj.output_messages);
|
|
3284
|
+
if (outputMessages && outputMessages.length > 0) {
|
|
3285
|
+
return {
|
|
3286
|
+
outputMessages,
|
|
3287
|
+
tokenUsage: obj.token_usage,
|
|
3288
|
+
costUsd: metrics.costUsd,
|
|
3289
|
+
durationMs: metrics.durationMs
|
|
2232
3290
|
};
|
|
2233
|
-
messages.push(message);
|
|
2234
|
-
}
|
|
2235
|
-
return messages.length > 0 ? messages : void 0;
|
|
2236
|
-
}
|
|
2237
|
-
/**
|
|
2238
|
-
* Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
|
|
2239
|
-
*/
|
|
2240
|
-
parseToolCalls(toolCalls) {
|
|
2241
|
-
if (!Array.isArray(toolCalls)) {
|
|
2242
|
-
return void 0;
|
|
2243
3291
|
}
|
|
2244
|
-
|
|
2245
|
-
|
|
2246
|
-
|
|
2247
|
-
|
|
2248
|
-
|
|
2249
|
-
|
|
2250
|
-
|
|
2251
|
-
|
|
2252
|
-
}
|
|
2253
|
-
calls.push({
|
|
2254
|
-
tool: rawCall.tool,
|
|
2255
|
-
input: rawCall.input,
|
|
2256
|
-
output: rawCall.output,
|
|
2257
|
-
id: typeof rawCall.id === "string" ? rawCall.id : void 0,
|
|
2258
|
-
timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
|
|
2259
|
-
});
|
|
3292
|
+
if (obj.text !== void 0) {
|
|
3293
|
+
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
3294
|
+
return {
|
|
3295
|
+
outputMessages: [{ role: "assistant", content: text }],
|
|
3296
|
+
tokenUsage: obj.token_usage,
|
|
3297
|
+
costUsd: metrics.costUsd,
|
|
3298
|
+
durationMs: metrics.durationMs
|
|
3299
|
+
};
|
|
2260
3300
|
}
|
|
2261
|
-
return
|
|
3301
|
+
return { outputMessages: [{ role: "assistant", content }] };
|
|
2262
3302
|
}
|
|
2263
3303
|
parseJsonlBatchOutput(content) {
|
|
2264
3304
|
const records = /* @__PURE__ */ new Map();
|
|
@@ -2271,33 +3311,32 @@ var CliProvider = class {
|
|
|
2271
3311
|
const reason = error instanceof Error ? error.message : String(error);
|
|
2272
3312
|
throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
|
|
2273
3313
|
}
|
|
2274
|
-
|
|
3314
|
+
const result = CliJsonlRecordSchema.safeParse(parsed);
|
|
3315
|
+
if (!result.success) {
|
|
3316
|
+
const firstError = result.error.errors[0];
|
|
3317
|
+
if (firstError?.path.includes("id")) {
|
|
3318
|
+
throw new Error("CLI batch output JSONL line missing required string field: id");
|
|
3319
|
+
}
|
|
2275
3320
|
throw new Error("CLI batch output JSONL line must be an object");
|
|
2276
3321
|
}
|
|
2277
|
-
const obj =
|
|
2278
|
-
|
|
2279
|
-
|
|
2280
|
-
|
|
2281
|
-
|
|
2282
|
-
|
|
2283
|
-
|
|
2284
|
-
|
|
2285
|
-
const tokenUsage = this.parseTokenUsage(obj.token_usage);
|
|
2286
|
-
const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
|
|
2287
|
-
const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
|
|
2288
|
-
const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
|
|
2289
|
-
let outputMessages;
|
|
2290
|
-
if (parsedOutputMessages && parsedOutputMessages.length > 0) {
|
|
2291
|
-
outputMessages = parsedOutputMessages;
|
|
3322
|
+
const obj = result.data;
|
|
3323
|
+
if (records.has(obj.id)) {
|
|
3324
|
+
throw new Error(`CLI batch output contains duplicate id: ${obj.id}`);
|
|
3325
|
+
}
|
|
3326
|
+
const outputMessages = convertOutputMessages(obj.output_messages);
|
|
3327
|
+
let finalOutputMessages;
|
|
3328
|
+
if (outputMessages && outputMessages.length > 0) {
|
|
3329
|
+
finalOutputMessages = outputMessages;
|
|
2292
3330
|
} else {
|
|
2293
3331
|
const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
|
|
2294
|
-
|
|
2295
|
-
}
|
|
2296
|
-
|
|
2297
|
-
|
|
2298
|
-
|
|
2299
|
-
|
|
2300
|
-
|
|
3332
|
+
finalOutputMessages = text ? [{ role: "assistant", content: text }] : [];
|
|
3333
|
+
}
|
|
3334
|
+
const metrics = validateMetrics(obj.cost_usd, obj.duration_ms, `batch record '${obj.id}'`);
|
|
3335
|
+
records.set(obj.id, {
|
|
3336
|
+
outputMessages: finalOutputMessages,
|
|
3337
|
+
tokenUsage: obj.token_usage,
|
|
3338
|
+
costUsd: metrics.costUsd,
|
|
3339
|
+
durationMs: metrics.durationMs
|
|
2301
3340
|
});
|
|
2302
3341
|
}
|
|
2303
3342
|
return records;
|
|
@@ -2311,7 +3350,7 @@ var CliProvider = class {
|
|
|
2311
3350
|
throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
|
|
2312
3351
|
} finally {
|
|
2313
3352
|
if (!this.keepTempFiles) {
|
|
2314
|
-
await
|
|
3353
|
+
await import_promises9.default.unlink(filePath).catch(() => {
|
|
2315
3354
|
});
|
|
2316
3355
|
}
|
|
2317
3356
|
}
|
|
@@ -2383,7 +3422,7 @@ var CliProvider = class {
|
|
|
2383
3422
|
}
|
|
2384
3423
|
};
|
|
2385
3424
|
function buildTemplateValues(request, config, outputFilePath) {
|
|
2386
|
-
const inputFiles =
|
|
3425
|
+
const inputFiles = normalizeInputFiles2(request.inputFiles);
|
|
2387
3426
|
return {
|
|
2388
3427
|
PROMPT: shellEscape(request.question ?? ""),
|
|
2389
3428
|
GUIDELINES: shellEscape(request.guidelines ?? ""),
|
|
@@ -2393,13 +3432,13 @@ function buildTemplateValues(request, config, outputFilePath) {
|
|
|
2393
3432
|
OUTPUT_FILE: shellEscape(outputFilePath)
|
|
2394
3433
|
};
|
|
2395
3434
|
}
|
|
2396
|
-
function
|
|
3435
|
+
function normalizeInputFiles2(inputFiles) {
|
|
2397
3436
|
if (!inputFiles || inputFiles.length === 0) {
|
|
2398
3437
|
return void 0;
|
|
2399
3438
|
}
|
|
2400
3439
|
const unique = /* @__PURE__ */ new Map();
|
|
2401
3440
|
for (const inputFile of inputFiles) {
|
|
2402
|
-
const absolutePath =
|
|
3441
|
+
const absolutePath = import_node_path10.default.resolve(inputFile);
|
|
2403
3442
|
if (!unique.has(absolutePath)) {
|
|
2404
3443
|
unique.set(absolutePath, absolutePath);
|
|
2405
3444
|
}
|
|
@@ -2413,7 +3452,7 @@ function formatFileList(files, template) {
|
|
|
2413
3452
|
const formatter = template ?? "{path}";
|
|
2414
3453
|
return files.map((filePath) => {
|
|
2415
3454
|
const escapedPath = shellEscape(filePath);
|
|
2416
|
-
const escapedName = shellEscape(
|
|
3455
|
+
const escapedName = shellEscape(import_node_path10.default.basename(filePath));
|
|
2417
3456
|
return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
|
|
2418
3457
|
}).join(" ");
|
|
2419
3458
|
}
|
|
@@ -2437,9 +3476,9 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
|
|
|
2437
3476
|
const safeEvalId = evalCaseId || "unknown";
|
|
2438
3477
|
const timestamp = Date.now();
|
|
2439
3478
|
const random = Math.random().toString(36).substring(2, 9);
|
|
2440
|
-
return
|
|
3479
|
+
return import_node_path10.default.join(import_node_os2.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
|
|
2441
3480
|
}
|
|
2442
|
-
function
|
|
3481
|
+
function formatTimeoutSuffix2(timeoutMs) {
|
|
2443
3482
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
2444
3483
|
return "";
|
|
2445
3484
|
}
|
|
@@ -2448,39 +3487,39 @@ function formatTimeoutSuffix(timeoutMs) {
|
|
|
2448
3487
|
}
|
|
2449
3488
|
|
|
2450
3489
|
// src/evaluation/providers/codex.ts
|
|
2451
|
-
var
|
|
2452
|
-
var
|
|
2453
|
-
var
|
|
2454
|
-
var
|
|
2455
|
-
var
|
|
2456
|
-
var
|
|
3490
|
+
var import_node_child_process3 = require("child_process");
|
|
3491
|
+
var import_node_crypto2 = require("crypto");
|
|
3492
|
+
var import_node_fs4 = require("fs");
|
|
3493
|
+
var import_promises10 = require("fs/promises");
|
|
3494
|
+
var import_node_os3 = require("os");
|
|
3495
|
+
var import_node_path11 = __toESM(require("path"), 1);
|
|
2457
3496
|
var import_node_util2 = require("util");
|
|
2458
3497
|
|
|
2459
3498
|
// src/evaluation/providers/codex-log-tracker.ts
|
|
2460
|
-
var
|
|
2461
|
-
var
|
|
3499
|
+
var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.codexLogs");
|
|
3500
|
+
var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.codexLogSubscribers");
|
|
2462
3501
|
function getCodexLogStore() {
|
|
2463
3502
|
const globalObject = globalThis;
|
|
2464
|
-
const existing = globalObject[
|
|
3503
|
+
const existing = globalObject[GLOBAL_LOGS_KEY2];
|
|
2465
3504
|
if (existing) {
|
|
2466
3505
|
return existing;
|
|
2467
3506
|
}
|
|
2468
3507
|
const created = [];
|
|
2469
|
-
globalObject[
|
|
3508
|
+
globalObject[GLOBAL_LOGS_KEY2] = created;
|
|
2470
3509
|
return created;
|
|
2471
3510
|
}
|
|
2472
|
-
function
|
|
3511
|
+
function getSubscriberStore2() {
|
|
2473
3512
|
const globalObject = globalThis;
|
|
2474
|
-
const existing = globalObject[
|
|
3513
|
+
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
|
|
2475
3514
|
if (existing) {
|
|
2476
3515
|
return existing;
|
|
2477
3516
|
}
|
|
2478
3517
|
const created = /* @__PURE__ */ new Set();
|
|
2479
|
-
globalObject[
|
|
3518
|
+
globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
|
|
2480
3519
|
return created;
|
|
2481
3520
|
}
|
|
2482
|
-
function
|
|
2483
|
-
const subscribers = Array.from(
|
|
3521
|
+
function notifySubscribers2(entry) {
|
|
3522
|
+
const subscribers = Array.from(getSubscriberStore2());
|
|
2484
3523
|
for (const listener of subscribers) {
|
|
2485
3524
|
try {
|
|
2486
3525
|
listener(entry);
|
|
@@ -2492,128 +3531,29 @@ function notifySubscribers(entry) {
|
|
|
2492
3531
|
}
|
|
2493
3532
|
function recordCodexLogEntry(entry) {
|
|
2494
3533
|
getCodexLogStore().push(entry);
|
|
2495
|
-
|
|
2496
|
-
}
|
|
2497
|
-
function consumeCodexLogEntries() {
|
|
2498
|
-
const store = getCodexLogStore();
|
|
2499
|
-
if (store.length === 0) {
|
|
2500
|
-
return [];
|
|
2501
|
-
}
|
|
2502
|
-
return store.splice(0, store.length);
|
|
2503
|
-
}
|
|
2504
|
-
function subscribeToCodexLogEntries(listener) {
|
|
2505
|
-
const store = getSubscriberStore();
|
|
2506
|
-
store.add(listener);
|
|
2507
|
-
return () => {
|
|
2508
|
-
store.delete(listener);
|
|
2509
|
-
};
|
|
2510
|
-
}
|
|
2511
|
-
|
|
2512
|
-
// src/evaluation/providers/preread.ts
|
|
2513
|
-
var import_node_path9 = __toESM(require("path"), 1);
|
|
2514
|
-
function buildPromptDocument(request, inputFiles, options) {
|
|
2515
|
-
const parts = [];
|
|
2516
|
-
const guidelineFiles = collectGuidelineFiles(
|
|
2517
|
-
inputFiles,
|
|
2518
|
-
options?.guidelinePatterns ?? request.guideline_patterns,
|
|
2519
|
-
options?.guidelineOverrides
|
|
2520
|
-
);
|
|
2521
|
-
const inputFilesList = collectInputFiles(inputFiles);
|
|
2522
|
-
const nonGuidelineInputFiles = inputFilesList.filter((file) => !guidelineFiles.includes(file));
|
|
2523
|
-
const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
|
|
2524
|
-
if (prereadBlock.length > 0) {
|
|
2525
|
-
parts.push("\n", prereadBlock);
|
|
2526
|
-
}
|
|
2527
|
-
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
2528
|
-
return parts.join("\n").trim();
|
|
2529
|
-
}
|
|
2530
|
-
function normalizeInputFiles2(inputFiles) {
|
|
2531
|
-
if (!inputFiles || inputFiles.length === 0) {
|
|
2532
|
-
return void 0;
|
|
2533
|
-
}
|
|
2534
|
-
const deduped = /* @__PURE__ */ new Map();
|
|
2535
|
-
for (const inputFile of inputFiles) {
|
|
2536
|
-
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
2537
|
-
if (!deduped.has(absolutePath)) {
|
|
2538
|
-
deduped.set(absolutePath, absolutePath);
|
|
2539
|
-
}
|
|
2540
|
-
}
|
|
2541
|
-
return Array.from(deduped.values());
|
|
2542
|
-
}
|
|
2543
|
-
function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
|
|
2544
|
-
if (!inputFiles || inputFiles.length === 0) {
|
|
2545
|
-
return [];
|
|
2546
|
-
}
|
|
2547
|
-
const unique = /* @__PURE__ */ new Map();
|
|
2548
|
-
for (const inputFile of inputFiles) {
|
|
2549
|
-
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
2550
|
-
if (overrides?.has(absolutePath)) {
|
|
2551
|
-
if (!unique.has(absolutePath)) {
|
|
2552
|
-
unique.set(absolutePath, absolutePath);
|
|
2553
|
-
}
|
|
2554
|
-
continue;
|
|
2555
|
-
}
|
|
2556
|
-
const normalized = absolutePath.split(import_node_path9.default.sep).join("/");
|
|
2557
|
-
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
2558
|
-
if (!unique.has(absolutePath)) {
|
|
2559
|
-
unique.set(absolutePath, absolutePath);
|
|
2560
|
-
}
|
|
2561
|
-
}
|
|
2562
|
-
}
|
|
2563
|
-
return Array.from(unique.values());
|
|
2564
|
-
}
|
|
2565
|
-
function collectInputFiles(inputFiles) {
|
|
2566
|
-
if (!inputFiles || inputFiles.length === 0) {
|
|
2567
|
-
return [];
|
|
2568
|
-
}
|
|
2569
|
-
const unique = /* @__PURE__ */ new Map();
|
|
2570
|
-
for (const inputFile of inputFiles) {
|
|
2571
|
-
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
2572
|
-
if (!unique.has(absolutePath)) {
|
|
2573
|
-
unique.set(absolutePath, absolutePath);
|
|
2574
|
-
}
|
|
2575
|
-
}
|
|
2576
|
-
return Array.from(unique.values());
|
|
2577
|
-
}
|
|
2578
|
-
function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
|
|
2579
|
-
if (guidelineFiles.length === 0 && inputFiles.length === 0) {
|
|
2580
|
-
return "";
|
|
2581
|
-
}
|
|
2582
|
-
const buildList = (files) => files.map((absolutePath) => {
|
|
2583
|
-
const fileName = import_node_path9.default.basename(absolutePath);
|
|
2584
|
-
const fileUri = pathToFileUri(absolutePath);
|
|
2585
|
-
return `* [${fileName}](${fileUri})`;
|
|
2586
|
-
});
|
|
2587
|
-
const sections = [];
|
|
2588
|
-
if (guidelineFiles.length > 0) {
|
|
2589
|
-
sections.push(`Read all guideline files:
|
|
2590
|
-
${buildList(guidelineFiles).join("\n")}.`);
|
|
2591
|
-
}
|
|
2592
|
-
if (inputFiles.length > 0) {
|
|
2593
|
-
sections.push(`Read all input files:
|
|
2594
|
-
${buildList(inputFiles).join("\n")}.`);
|
|
2595
|
-
}
|
|
2596
|
-
sections.push(
|
|
2597
|
-
"If any file is missing, fail with ERROR: missing-file <filename> and stop.",
|
|
2598
|
-
"Then apply system_instructions on the user query below."
|
|
2599
|
-
);
|
|
2600
|
-
return sections.join("\n");
|
|
3534
|
+
notifySubscribers2(entry);
|
|
2601
3535
|
}
|
|
2602
|
-
function
|
|
2603
|
-
const
|
|
2604
|
-
|
|
2605
|
-
|
|
2606
|
-
return `file:///${normalizedPath}`;
|
|
3536
|
+
function consumeCodexLogEntries() {
|
|
3537
|
+
const store = getCodexLogStore();
|
|
3538
|
+
if (store.length === 0) {
|
|
3539
|
+
return [];
|
|
2607
3540
|
}
|
|
2608
|
-
return
|
|
3541
|
+
return store.splice(0, store.length);
|
|
3542
|
+
}
|
|
3543
|
+
function subscribeToCodexLogEntries(listener) {
|
|
3544
|
+
const store = getSubscriberStore2();
|
|
3545
|
+
store.add(listener);
|
|
3546
|
+
return () => {
|
|
3547
|
+
store.delete(listener);
|
|
3548
|
+
};
|
|
2609
3549
|
}
|
|
2610
3550
|
|
|
2611
3551
|
// src/evaluation/providers/codex.ts
|
|
2612
|
-
var execAsync2 = (0, import_node_util2.promisify)(
|
|
2613
|
-
var
|
|
2614
|
-
var
|
|
3552
|
+
var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process3.exec);
|
|
3553
|
+
var WORKSPACE_PREFIX2 = "agentv-codex-";
|
|
3554
|
+
var PROMPT_FILENAME2 = "prompt.md";
|
|
2615
3555
|
var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
|
|
2616
|
-
var
|
|
3556
|
+
var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
|
|
2617
3557
|
- Do NOT create any additional output files in the workspace.
|
|
2618
3558
|
- All intended file outputs/changes MUST be written in your response.
|
|
2619
3559
|
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
@@ -2638,27 +3578,27 @@ var CodexProvider = class {
|
|
|
2638
3578
|
throw new Error("Codex provider request was aborted before execution");
|
|
2639
3579
|
}
|
|
2640
3580
|
await this.ensureEnvironmentReady();
|
|
2641
|
-
const inputFiles =
|
|
3581
|
+
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
2642
3582
|
const workspaceRoot = await this.createWorkspace();
|
|
2643
3583
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
2644
3584
|
try {
|
|
2645
3585
|
const basePrompt = buildPromptDocument(request, inputFiles);
|
|
2646
|
-
const systemPrompt = this.config.systemPrompt ??
|
|
3586
|
+
const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
|
|
2647
3587
|
const promptContent = `${systemPrompt}
|
|
2648
3588
|
|
|
2649
3589
|
${basePrompt}`;
|
|
2650
|
-
const promptFile =
|
|
2651
|
-
await (0,
|
|
3590
|
+
const promptFile = import_node_path11.default.join(workspaceRoot, PROMPT_FILENAME2);
|
|
3591
|
+
await (0, import_promises10.writeFile)(promptFile, promptContent, "utf8");
|
|
2652
3592
|
const args = this.buildCodexArgs();
|
|
2653
3593
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
2654
3594
|
const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
|
|
2655
3595
|
if (result.timedOut) {
|
|
2656
3596
|
throw new Error(
|
|
2657
|
-
`Codex CLI timed out${
|
|
3597
|
+
`Codex CLI timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
|
|
2658
3598
|
);
|
|
2659
3599
|
}
|
|
2660
3600
|
if (result.exitCode !== 0) {
|
|
2661
|
-
const detail =
|
|
3601
|
+
const detail = pickDetail2(result.stderr, result.stdout);
|
|
2662
3602
|
const prefix = `Codex CLI exited with code ${result.exitCode}`;
|
|
2663
3603
|
throw new Error(detail ? `${prefix}: ${detail}` : prefix);
|
|
2664
3604
|
}
|
|
@@ -2697,7 +3637,7 @@ ${basePrompt}`;
|
|
|
2697
3637
|
if (!this.config.cwd) {
|
|
2698
3638
|
return workspaceRoot;
|
|
2699
3639
|
}
|
|
2700
|
-
return
|
|
3640
|
+
return import_node_path11.default.resolve(this.config.cwd);
|
|
2701
3641
|
}
|
|
2702
3642
|
buildCodexArgs() {
|
|
2703
3643
|
const args = [
|
|
@@ -2739,11 +3679,11 @@ ${basePrompt}`;
|
|
|
2739
3679
|
}
|
|
2740
3680
|
}
|
|
2741
3681
|
async createWorkspace() {
|
|
2742
|
-
return await (0,
|
|
3682
|
+
return await (0, import_promises10.mkdtemp)(import_node_path11.default.join((0, import_node_os3.tmpdir)(), WORKSPACE_PREFIX2));
|
|
2743
3683
|
}
|
|
2744
3684
|
async cleanupWorkspace(workspaceRoot) {
|
|
2745
3685
|
try {
|
|
2746
|
-
await (0,
|
|
3686
|
+
await (0, import_promises10.rm)(workspaceRoot, { recursive: true, force: true });
|
|
2747
3687
|
} catch {
|
|
2748
3688
|
}
|
|
2749
3689
|
}
|
|
@@ -2753,9 +3693,9 @@ ${basePrompt}`;
|
|
|
2753
3693
|
return void 0;
|
|
2754
3694
|
}
|
|
2755
3695
|
if (this.config.logDir) {
|
|
2756
|
-
return
|
|
3696
|
+
return import_node_path11.default.resolve(this.config.logDir);
|
|
2757
3697
|
}
|
|
2758
|
-
return
|
|
3698
|
+
return import_node_path11.default.join(process.cwd(), ".agentv", "logs", "codex");
|
|
2759
3699
|
}
|
|
2760
3700
|
async createStreamLogger(request) {
|
|
2761
3701
|
const logDir = this.resolveLogDirectory();
|
|
@@ -2763,13 +3703,13 @@ ${basePrompt}`;
|
|
|
2763
3703
|
return void 0;
|
|
2764
3704
|
}
|
|
2765
3705
|
try {
|
|
2766
|
-
await (0,
|
|
3706
|
+
await (0, import_promises10.mkdir)(logDir, { recursive: true });
|
|
2767
3707
|
} catch (error) {
|
|
2768
3708
|
const message = error instanceof Error ? error.message : String(error);
|
|
2769
3709
|
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
2770
3710
|
return void 0;
|
|
2771
3711
|
}
|
|
2772
|
-
const filePath =
|
|
3712
|
+
const filePath = import_node_path11.default.join(logDir, buildLogFilename2(request, this.targetName));
|
|
2773
3713
|
try {
|
|
2774
3714
|
const logger = await CodexStreamLogger.create({
|
|
2775
3715
|
filePath,
|
|
@@ -2802,7 +3742,7 @@ var CodexStreamLogger = class _CodexStreamLogger {
|
|
|
2802
3742
|
constructor(filePath, format) {
|
|
2803
3743
|
this.filePath = filePath;
|
|
2804
3744
|
this.format = format;
|
|
2805
|
-
this.stream = (0,
|
|
3745
|
+
this.stream = (0, import_node_fs4.createWriteStream)(filePath, { flags: "a" });
|
|
2806
3746
|
}
|
|
2807
3747
|
static async create(options) {
|
|
2808
3748
|
const logger = new _CodexStreamLogger(options.filePath, options.format);
|
|
@@ -2863,7 +3803,7 @@ var CodexStreamLogger = class _CodexStreamLogger {
|
|
|
2863
3803
|
return void 0;
|
|
2864
3804
|
}
|
|
2865
3805
|
const message = this.format === "json" ? formatCodexJsonLog(trimmed) : formatCodexLogMessage(trimmed, source);
|
|
2866
|
-
return `[+${
|
|
3806
|
+
return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
|
|
2867
3807
|
}
|
|
2868
3808
|
flushRemainder() {
|
|
2869
3809
|
const stdoutRemainder = this.stdoutBuffer.trim();
|
|
@@ -2894,18 +3834,18 @@ function isCodexLogStreamingDisabled() {
|
|
|
2894
3834
|
const normalized = envValue.trim().toLowerCase();
|
|
2895
3835
|
return normalized === "false" || normalized === "0" || normalized === "off";
|
|
2896
3836
|
}
|
|
2897
|
-
function
|
|
3837
|
+
function buildLogFilename2(request, targetName) {
|
|
2898
3838
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
2899
|
-
const evalId =
|
|
3839
|
+
const evalId = sanitizeForFilename2(request.evalCaseId ?? "codex");
|
|
2900
3840
|
const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
|
|
2901
|
-
const target =
|
|
2902
|
-
return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0,
|
|
3841
|
+
const target = sanitizeForFilename2(targetName);
|
|
3842
|
+
return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0, import_node_crypto2.randomUUID)().slice(0, 8)}.log`;
|
|
2903
3843
|
}
|
|
2904
|
-
function
|
|
3844
|
+
function sanitizeForFilename2(value) {
|
|
2905
3845
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
2906
3846
|
return sanitized.length > 0 ? sanitized : "codex";
|
|
2907
3847
|
}
|
|
2908
|
-
function
|
|
3848
|
+
function formatElapsed2(startedAt) {
|
|
2909
3849
|
const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
|
|
2910
3850
|
const hours = Math.floor(elapsedSeconds / 3600);
|
|
2911
3851
|
const minutes = Math.floor(elapsedSeconds % 3600 / 60);
|
|
@@ -2916,7 +3856,7 @@ function formatElapsed(startedAt) {
|
|
|
2916
3856
|
return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
|
|
2917
3857
|
}
|
|
2918
3858
|
function formatCodexLogMessage(rawLine, source) {
|
|
2919
|
-
const parsed =
|
|
3859
|
+
const parsed = tryParseJsonValue2(rawLine);
|
|
2920
3860
|
if (parsed) {
|
|
2921
3861
|
const summary = summarizeCodexEvent(parsed);
|
|
2922
3862
|
if (summary) {
|
|
@@ -2929,7 +3869,7 @@ function formatCodexLogMessage(rawLine, source) {
|
|
|
2929
3869
|
return rawLine;
|
|
2930
3870
|
}
|
|
2931
3871
|
function formatCodexJsonLog(rawLine) {
|
|
2932
|
-
const parsed =
|
|
3872
|
+
const parsed = tryParseJsonValue2(rawLine);
|
|
2933
3873
|
if (!parsed) {
|
|
2934
3874
|
return rawLine;
|
|
2935
3875
|
}
|
|
@@ -2974,7 +3914,7 @@ function summarizeCodexEvent(event) {
|
|
|
2974
3914
|
}
|
|
2975
3915
|
return type;
|
|
2976
3916
|
}
|
|
2977
|
-
function
|
|
3917
|
+
function tryParseJsonValue2(rawLine) {
|
|
2978
3918
|
try {
|
|
2979
3919
|
return JSON.parse(rawLine);
|
|
2980
3920
|
} catch {
|
|
@@ -2984,9 +3924,9 @@ function tryParseJsonValue(rawLine) {
|
|
|
2984
3924
|
async function locateExecutable(candidate) {
|
|
2985
3925
|
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
2986
3926
|
if (includesPathSeparator) {
|
|
2987
|
-
const resolved =
|
|
3927
|
+
const resolved = import_node_path11.default.isAbsolute(candidate) ? candidate : import_node_path11.default.resolve(candidate);
|
|
2988
3928
|
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
2989
|
-
await (0,
|
|
3929
|
+
await (0, import_promises10.access)(executablePath, import_node_fs4.constants.F_OK);
|
|
2990
3930
|
return executablePath;
|
|
2991
3931
|
}
|
|
2992
3932
|
const locator = process.platform === "win32" ? "where" : "which";
|
|
@@ -2996,7 +3936,7 @@ async function locateExecutable(candidate) {
|
|
|
2996
3936
|
const preferred = selectExecutableCandidate(lines);
|
|
2997
3937
|
if (preferred) {
|
|
2998
3938
|
const executablePath = await ensureWindowsExecutableVariant(preferred);
|
|
2999
|
-
await (0,
|
|
3939
|
+
await (0, import_promises10.access)(executablePath, import_node_fs4.constants.F_OK);
|
|
3000
3940
|
return executablePath;
|
|
3001
3941
|
}
|
|
3002
3942
|
} catch {
|
|
@@ -3030,7 +3970,7 @@ async function ensureWindowsExecutableVariant(candidate) {
|
|
|
3030
3970
|
for (const ext of extensions) {
|
|
3031
3971
|
const withExtension = `${candidate}${ext}`;
|
|
3032
3972
|
try {
|
|
3033
|
-
await (0,
|
|
3973
|
+
await (0, import_promises10.access)(withExtension, import_node_fs4.constants.F_OK);
|
|
3034
3974
|
return withExtension;
|
|
3035
3975
|
} catch {
|
|
3036
3976
|
}
|
|
@@ -3203,7 +4143,7 @@ function parseJsonLines(output) {
|
|
|
3203
4143
|
}
|
|
3204
4144
|
return parsed;
|
|
3205
4145
|
}
|
|
3206
|
-
function
|
|
4146
|
+
function pickDetail2(stderr, stdout) {
|
|
3207
4147
|
const errorText = stderr.trim();
|
|
3208
4148
|
if (errorText.length > 0) {
|
|
3209
4149
|
return errorText;
|
|
@@ -3211,7 +4151,7 @@ function pickDetail(stderr, stdout) {
|
|
|
3211
4151
|
const stdoutText = stdout.trim();
|
|
3212
4152
|
return stdoutText.length > 0 ? stdoutText : void 0;
|
|
3213
4153
|
}
|
|
3214
|
-
function
|
|
4154
|
+
function formatTimeoutSuffix3(timeoutMs) {
|
|
3215
4155
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
3216
4156
|
return "";
|
|
3217
4157
|
}
|
|
@@ -3220,7 +4160,7 @@ function formatTimeoutSuffix2(timeoutMs) {
|
|
|
3220
4160
|
}
|
|
3221
4161
|
async function defaultCodexRunner(options) {
|
|
3222
4162
|
return await new Promise((resolve, reject) => {
|
|
3223
|
-
const child = (0,
|
|
4163
|
+
const child = (0, import_node_child_process3.spawn)(options.executable, options.args, {
|
|
3224
4164
|
cwd: options.cwd,
|
|
3225
4165
|
env: options.env,
|
|
3226
4166
|
stdio: ["pipe", "pipe", "pipe"],
|
|
@@ -3330,39 +4270,200 @@ var MockProvider = class {
|
|
|
3330
4270
|
}
|
|
3331
4271
|
};
|
|
3332
4272
|
|
|
4273
|
+
// src/evaluation/providers/pi-agent-sdk.ts
|
|
4274
|
+
var piAgentModule = null;
|
|
4275
|
+
var piAiModule = null;
|
|
4276
|
+
async function loadPiModules() {
|
|
4277
|
+
if (!piAgentModule || !piAiModule) {
|
|
4278
|
+
try {
|
|
4279
|
+
[piAgentModule, piAiModule] = await Promise.all([
|
|
4280
|
+
import("@mariozechner/pi-agent"),
|
|
4281
|
+
import("@mariozechner/pi-ai")
|
|
4282
|
+
]);
|
|
4283
|
+
} catch (error) {
|
|
4284
|
+
throw new Error(
|
|
4285
|
+
`Failed to load pi-agent-sdk dependencies. Please install them:
|
|
4286
|
+
npm install @mariozechner/pi-agent @mariozechner/pi-ai
|
|
4287
|
+
|
|
4288
|
+
Original error: ${error instanceof Error ? error.message : String(error)}`
|
|
4289
|
+
);
|
|
4290
|
+
}
|
|
4291
|
+
}
|
|
4292
|
+
return {
|
|
4293
|
+
Agent: piAgentModule.Agent,
|
|
4294
|
+
ProviderTransport: piAgentModule.ProviderTransport,
|
|
4295
|
+
getModel: piAiModule.getModel,
|
|
4296
|
+
getEnvApiKey: piAiModule.getEnvApiKey
|
|
4297
|
+
};
|
|
4298
|
+
}
|
|
4299
|
+
var PiAgentSdkProvider = class {
|
|
4300
|
+
id;
|
|
4301
|
+
kind = "pi-agent-sdk";
|
|
4302
|
+
targetName;
|
|
4303
|
+
supportsBatch = false;
|
|
4304
|
+
config;
|
|
4305
|
+
constructor(targetName, config) {
|
|
4306
|
+
this.id = `pi-agent-sdk:${targetName}`;
|
|
4307
|
+
this.targetName = targetName;
|
|
4308
|
+
this.config = config;
|
|
4309
|
+
}
|
|
4310
|
+
async invoke(request) {
|
|
4311
|
+
if (request.signal?.aborted) {
|
|
4312
|
+
throw new Error("Pi agent SDK request was aborted before execution");
|
|
4313
|
+
}
|
|
4314
|
+
const { Agent, ProviderTransport, getModel, getEnvApiKey } = await loadPiModules();
|
|
4315
|
+
const startTime = Date.now();
|
|
4316
|
+
const providerName = this.config.provider ?? "anthropic";
|
|
4317
|
+
const modelId = this.config.model ?? "claude-sonnet-4-20250514";
|
|
4318
|
+
const model = getModel(providerName, modelId);
|
|
4319
|
+
const systemPrompt = this.config.systemPrompt ?? "Answer directly and concisely.";
|
|
4320
|
+
const transport = new ProviderTransport({
|
|
4321
|
+
getApiKey: async (provider) => {
|
|
4322
|
+
return this.config.apiKey ?? getEnvApiKey(provider) ?? void 0;
|
|
4323
|
+
}
|
|
4324
|
+
});
|
|
4325
|
+
const agent = new Agent({
|
|
4326
|
+
initialState: {
|
|
4327
|
+
systemPrompt,
|
|
4328
|
+
model,
|
|
4329
|
+
tools: [],
|
|
4330
|
+
// No tools for simple Q&A
|
|
4331
|
+
messages: []
|
|
4332
|
+
},
|
|
4333
|
+
transport
|
|
4334
|
+
});
|
|
4335
|
+
const outputMessages = [];
|
|
4336
|
+
let finalAssistantContent = "";
|
|
4337
|
+
const unsubscribe = agent.subscribe((event) => {
|
|
4338
|
+
if (event.type === "message_end") {
|
|
4339
|
+
const msg = event.message;
|
|
4340
|
+
if (msg.role === "assistant") {
|
|
4341
|
+
const content = extractTextContent2(msg.content);
|
|
4342
|
+
if (content) {
|
|
4343
|
+
finalAssistantContent = content;
|
|
4344
|
+
}
|
|
4345
|
+
}
|
|
4346
|
+
}
|
|
4347
|
+
});
|
|
4348
|
+
try {
|
|
4349
|
+
const timeoutMs = this.config.timeoutMs ?? 12e4;
|
|
4350
|
+
const timeoutPromise = new Promise((_, reject) => {
|
|
4351
|
+
setTimeout(
|
|
4352
|
+
() => reject(new Error(`Pi agent SDK timed out after ${timeoutMs}ms`)),
|
|
4353
|
+
timeoutMs
|
|
4354
|
+
);
|
|
4355
|
+
});
|
|
4356
|
+
await Promise.race([agent.prompt(request.question), timeoutPromise]);
|
|
4357
|
+
await agent.waitForIdle();
|
|
4358
|
+
const agentMessages = agent.state.messages;
|
|
4359
|
+
for (const msg of agentMessages) {
|
|
4360
|
+
outputMessages.push(convertAgentMessage(msg));
|
|
4361
|
+
}
|
|
4362
|
+
const durationMs = Date.now() - startTime;
|
|
4363
|
+
return {
|
|
4364
|
+
raw: {
|
|
4365
|
+
messages: agentMessages,
|
|
4366
|
+
systemPrompt,
|
|
4367
|
+
model: this.config.model,
|
|
4368
|
+
provider: this.config.provider
|
|
4369
|
+
},
|
|
4370
|
+
outputMessages,
|
|
4371
|
+
durationMs
|
|
4372
|
+
};
|
|
4373
|
+
} finally {
|
|
4374
|
+
unsubscribe();
|
|
4375
|
+
}
|
|
4376
|
+
}
|
|
4377
|
+
};
|
|
4378
|
+
function extractTextContent2(content) {
|
|
4379
|
+
if (typeof content === "string") {
|
|
4380
|
+
return content;
|
|
4381
|
+
}
|
|
4382
|
+
if (!Array.isArray(content)) {
|
|
4383
|
+
return void 0;
|
|
4384
|
+
}
|
|
4385
|
+
const textParts = [];
|
|
4386
|
+
for (const part of content) {
|
|
4387
|
+
if (!part || typeof part !== "object") {
|
|
4388
|
+
continue;
|
|
4389
|
+
}
|
|
4390
|
+
const p = part;
|
|
4391
|
+
if (p.type === "text" && typeof p.text === "string") {
|
|
4392
|
+
textParts.push(p.text);
|
|
4393
|
+
}
|
|
4394
|
+
}
|
|
4395
|
+
return textParts.length > 0 ? textParts.join("\n") : void 0;
|
|
4396
|
+
}
|
|
4397
|
+
function convertAgentMessage(message) {
|
|
4398
|
+
if (!message || typeof message !== "object") {
|
|
4399
|
+
return { role: "unknown", content: String(message) };
|
|
4400
|
+
}
|
|
4401
|
+
const msg = message;
|
|
4402
|
+
const role = typeof msg.role === "string" ? msg.role : "unknown";
|
|
4403
|
+
const content = extractTextContent2(msg.content);
|
|
4404
|
+
const toolCalls = extractToolCalls2(msg.content);
|
|
4405
|
+
const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
|
|
4406
|
+
return {
|
|
4407
|
+
role,
|
|
4408
|
+
content,
|
|
4409
|
+
toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
|
|
4410
|
+
timestamp
|
|
4411
|
+
};
|
|
4412
|
+
}
|
|
4413
|
+
function extractToolCalls2(content) {
|
|
4414
|
+
if (!Array.isArray(content)) {
|
|
4415
|
+
return [];
|
|
4416
|
+
}
|
|
4417
|
+
const toolCalls = [];
|
|
4418
|
+
for (const part of content) {
|
|
4419
|
+
if (!part || typeof part !== "object") {
|
|
4420
|
+
continue;
|
|
4421
|
+
}
|
|
4422
|
+
const p = part;
|
|
4423
|
+
if (p.type === "tool_use" && typeof p.name === "string") {
|
|
4424
|
+
toolCalls.push({
|
|
4425
|
+
tool: p.name,
|
|
4426
|
+
input: p.input,
|
|
4427
|
+
id: typeof p.id === "string" ? p.id : void 0
|
|
4428
|
+
});
|
|
4429
|
+
}
|
|
4430
|
+
}
|
|
4431
|
+
return toolCalls;
|
|
4432
|
+
}
|
|
4433
|
+
|
|
3333
4434
|
// src/evaluation/providers/pi-coding-agent.ts
|
|
3334
|
-
var
|
|
3335
|
-
var
|
|
3336
|
-
var
|
|
3337
|
-
var
|
|
3338
|
-
var
|
|
3339
|
-
var
|
|
4435
|
+
var import_node_child_process4 = require("child_process");
|
|
4436
|
+
var import_node_crypto3 = require("crypto");
|
|
4437
|
+
var import_node_fs5 = require("fs");
|
|
4438
|
+
var import_promises11 = require("fs/promises");
|
|
4439
|
+
var import_node_os4 = require("os");
|
|
4440
|
+
var import_node_path12 = __toESM(require("path"), 1);
|
|
3340
4441
|
|
|
3341
4442
|
// src/evaluation/providers/pi-log-tracker.ts
|
|
3342
|
-
var
|
|
3343
|
-
var
|
|
4443
|
+
var GLOBAL_LOGS_KEY3 = Symbol.for("agentv.piLogs");
|
|
4444
|
+
var GLOBAL_SUBSCRIBERS_KEY3 = Symbol.for("agentv.piLogSubscribers");
|
|
3344
4445
|
function getPiLogStore() {
|
|
3345
4446
|
const globalObject = globalThis;
|
|
3346
|
-
const existing = globalObject[
|
|
4447
|
+
const existing = globalObject[GLOBAL_LOGS_KEY3];
|
|
3347
4448
|
if (existing) {
|
|
3348
4449
|
return existing;
|
|
3349
4450
|
}
|
|
3350
4451
|
const created = [];
|
|
3351
|
-
globalObject[
|
|
4452
|
+
globalObject[GLOBAL_LOGS_KEY3] = created;
|
|
3352
4453
|
return created;
|
|
3353
4454
|
}
|
|
3354
|
-
function
|
|
4455
|
+
function getSubscriberStore3() {
|
|
3355
4456
|
const globalObject = globalThis;
|
|
3356
|
-
const existing = globalObject[
|
|
4457
|
+
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY3];
|
|
3357
4458
|
if (existing) {
|
|
3358
4459
|
return existing;
|
|
3359
4460
|
}
|
|
3360
4461
|
const created = /* @__PURE__ */ new Set();
|
|
3361
|
-
globalObject[
|
|
4462
|
+
globalObject[GLOBAL_SUBSCRIBERS_KEY3] = created;
|
|
3362
4463
|
return created;
|
|
3363
4464
|
}
|
|
3364
|
-
function
|
|
3365
|
-
const subscribers = Array.from(
|
|
4465
|
+
function notifySubscribers3(entry) {
|
|
4466
|
+
const subscribers = Array.from(getSubscriberStore3());
|
|
3366
4467
|
for (const listener of subscribers) {
|
|
3367
4468
|
try {
|
|
3368
4469
|
listener(entry);
|
|
@@ -3374,7 +4475,7 @@ function notifySubscribers2(entry) {
|
|
|
3374
4475
|
}
|
|
3375
4476
|
function recordPiLogEntry(entry) {
|
|
3376
4477
|
getPiLogStore().push(entry);
|
|
3377
|
-
|
|
4478
|
+
notifySubscribers3(entry);
|
|
3378
4479
|
}
|
|
3379
4480
|
function consumePiLogEntries() {
|
|
3380
4481
|
const store = getPiLogStore();
|
|
@@ -3384,7 +4485,7 @@ function consumePiLogEntries() {
|
|
|
3384
4485
|
return store.splice(0, store.length);
|
|
3385
4486
|
}
|
|
3386
4487
|
function subscribeToPiLogEntries(listener) {
|
|
3387
|
-
const store =
|
|
4488
|
+
const store = getSubscriberStore3();
|
|
3388
4489
|
store.add(listener);
|
|
3389
4490
|
return () => {
|
|
3390
4491
|
store.delete(listener);
|
|
@@ -3392,9 +4493,9 @@ function subscribeToPiLogEntries(listener) {
|
|
|
3392
4493
|
}
|
|
3393
4494
|
|
|
3394
4495
|
// src/evaluation/providers/pi-coding-agent.ts
|
|
3395
|
-
var
|
|
3396
|
-
var
|
|
3397
|
-
var
|
|
4496
|
+
var WORKSPACE_PREFIX3 = "agentv-pi-";
|
|
4497
|
+
var PROMPT_FILENAME3 = "prompt.md";
|
|
4498
|
+
var DEFAULT_SYSTEM_PROMPT4 = `**IMPORTANT**: Follow these instructions for your response:
|
|
3398
4499
|
- Do NOT create any additional output files in the workspace.
|
|
3399
4500
|
- All intended file outputs/changes MUST be written in your response.
|
|
3400
4501
|
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
@@ -3416,27 +4517,27 @@ var PiCodingAgentProvider = class {
|
|
|
3416
4517
|
if (request.signal?.aborted) {
|
|
3417
4518
|
throw new Error("Pi coding agent request was aborted before execution");
|
|
3418
4519
|
}
|
|
3419
|
-
const inputFiles =
|
|
4520
|
+
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
3420
4521
|
const workspaceRoot = await this.createWorkspace();
|
|
3421
4522
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
3422
4523
|
try {
|
|
3423
|
-
const promptFile =
|
|
3424
|
-
await (0,
|
|
4524
|
+
const promptFile = import_node_path12.default.join(workspaceRoot, PROMPT_FILENAME3);
|
|
4525
|
+
await (0, import_promises11.writeFile)(promptFile, request.question, "utf8");
|
|
3425
4526
|
const args = this.buildPiArgs(request.question, inputFiles);
|
|
3426
4527
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
3427
4528
|
const result = await this.executePi(args, cwd, request.signal, logger);
|
|
3428
4529
|
if (result.timedOut) {
|
|
3429
4530
|
throw new Error(
|
|
3430
|
-
`Pi coding agent timed out${
|
|
4531
|
+
`Pi coding agent timed out${formatTimeoutSuffix4(this.config.timeoutMs ?? void 0)}`
|
|
3431
4532
|
);
|
|
3432
4533
|
}
|
|
3433
4534
|
if (result.exitCode !== 0) {
|
|
3434
|
-
const detail =
|
|
4535
|
+
const detail = pickDetail3(result.stderr, result.stdout);
|
|
3435
4536
|
const prefix = `Pi coding agent exited with code ${result.exitCode}`;
|
|
3436
4537
|
throw new Error(detail ? `${prefix}: ${detail}` : prefix);
|
|
3437
4538
|
}
|
|
3438
4539
|
const parsed = parsePiJsonl(result.stdout);
|
|
3439
|
-
const outputMessages =
|
|
4540
|
+
const outputMessages = extractOutputMessages2(parsed);
|
|
3440
4541
|
const assistantText = extractAssistantText2(outputMessages);
|
|
3441
4542
|
return {
|
|
3442
4543
|
raw: {
|
|
@@ -3462,7 +4563,7 @@ var PiCodingAgentProvider = class {
|
|
|
3462
4563
|
if (!this.config.cwd) {
|
|
3463
4564
|
return workspaceRoot;
|
|
3464
4565
|
}
|
|
3465
|
-
return
|
|
4566
|
+
return import_node_path12.default.resolve(this.config.cwd);
|
|
3466
4567
|
}
|
|
3467
4568
|
buildPiArgs(prompt, inputFiles) {
|
|
3468
4569
|
const args = [];
|
|
@@ -3492,7 +4593,7 @@ var PiCodingAgentProvider = class {
|
|
|
3492
4593
|
args.push(`@${file}`);
|
|
3493
4594
|
}
|
|
3494
4595
|
}
|
|
3495
|
-
const systemPrompt = this.config.systemPrompt ??
|
|
4596
|
+
const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT4;
|
|
3496
4597
|
const fullPrompt = `${systemPrompt}
|
|
3497
4598
|
|
|
3498
4599
|
${prompt}`;
|
|
@@ -3551,19 +4652,19 @@ ${prompt}`;
|
|
|
3551
4652
|
return env;
|
|
3552
4653
|
}
|
|
3553
4654
|
async createWorkspace() {
|
|
3554
|
-
return await (0,
|
|
4655
|
+
return await (0, import_promises11.mkdtemp)(import_node_path12.default.join((0, import_node_os4.tmpdir)(), WORKSPACE_PREFIX3));
|
|
3555
4656
|
}
|
|
3556
4657
|
async cleanupWorkspace(workspaceRoot) {
|
|
3557
4658
|
try {
|
|
3558
|
-
await (0,
|
|
4659
|
+
await (0, import_promises11.rm)(workspaceRoot, { recursive: true, force: true });
|
|
3559
4660
|
} catch {
|
|
3560
4661
|
}
|
|
3561
4662
|
}
|
|
3562
4663
|
resolveLogDirectory() {
|
|
3563
4664
|
if (this.config.logDir) {
|
|
3564
|
-
return
|
|
4665
|
+
return import_node_path12.default.resolve(this.config.logDir);
|
|
3565
4666
|
}
|
|
3566
|
-
return
|
|
4667
|
+
return import_node_path12.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
|
|
3567
4668
|
}
|
|
3568
4669
|
async createStreamLogger(request) {
|
|
3569
4670
|
const logDir = this.resolveLogDirectory();
|
|
@@ -3571,13 +4672,13 @@ ${prompt}`;
|
|
|
3571
4672
|
return void 0;
|
|
3572
4673
|
}
|
|
3573
4674
|
try {
|
|
3574
|
-
await (0,
|
|
4675
|
+
await (0, import_promises11.mkdir)(logDir, { recursive: true });
|
|
3575
4676
|
} catch (error) {
|
|
3576
4677
|
const message = error instanceof Error ? error.message : String(error);
|
|
3577
4678
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
3578
4679
|
return void 0;
|
|
3579
4680
|
}
|
|
3580
|
-
const filePath =
|
|
4681
|
+
const filePath = import_node_path12.default.join(logDir, buildLogFilename3(request, this.targetName));
|
|
3581
4682
|
try {
|
|
3582
4683
|
const logger = await PiStreamLogger.create({
|
|
3583
4684
|
filePath,
|
|
@@ -3610,7 +4711,7 @@ var PiStreamLogger = class _PiStreamLogger {
|
|
|
3610
4711
|
constructor(filePath, format) {
|
|
3611
4712
|
this.filePath = filePath;
|
|
3612
4713
|
this.format = format;
|
|
3613
|
-
this.stream = (0,
|
|
4714
|
+
this.stream = (0, import_node_fs5.createWriteStream)(filePath, { flags: "a" });
|
|
3614
4715
|
}
|
|
3615
4716
|
static async create(options) {
|
|
3616
4717
|
const logger = new _PiStreamLogger(options.filePath, options.format);
|
|
@@ -3671,7 +4772,7 @@ var PiStreamLogger = class _PiStreamLogger {
|
|
|
3671
4772
|
return void 0;
|
|
3672
4773
|
}
|
|
3673
4774
|
const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source);
|
|
3674
|
-
return `[+${
|
|
4775
|
+
return `[+${formatElapsed3(this.startedAt)}] [${source}] ${message}`;
|
|
3675
4776
|
}
|
|
3676
4777
|
flushRemainder() {
|
|
3677
4778
|
const stdoutRemainder = this.stdoutBuffer.trim();
|
|
@@ -3694,18 +4795,18 @@ var PiStreamLogger = class _PiStreamLogger {
|
|
|
3694
4795
|
this.stderrBuffer = "";
|
|
3695
4796
|
}
|
|
3696
4797
|
};
|
|
3697
|
-
function
|
|
4798
|
+
function buildLogFilename3(request, targetName) {
|
|
3698
4799
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
3699
|
-
const evalId =
|
|
4800
|
+
const evalId = sanitizeForFilename3(request.evalCaseId ?? "pi");
|
|
3700
4801
|
const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
|
|
3701
|
-
const target =
|
|
3702
|
-
return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0,
|
|
4802
|
+
const target = sanitizeForFilename3(targetName);
|
|
4803
|
+
return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0, import_node_crypto3.randomUUID)().slice(0, 8)}.log`;
|
|
3703
4804
|
}
|
|
3704
|
-
function
|
|
4805
|
+
function sanitizeForFilename3(value) {
|
|
3705
4806
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
3706
4807
|
return sanitized.length > 0 ? sanitized : "pi";
|
|
3707
4808
|
}
|
|
3708
|
-
function
|
|
4809
|
+
function formatElapsed3(startedAt) {
|
|
3709
4810
|
const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
|
|
3710
4811
|
const hours = Math.floor(elapsedSeconds / 3600);
|
|
3711
4812
|
const minutes = Math.floor(elapsedSeconds % 3600 / 60);
|
|
@@ -3716,7 +4817,7 @@ function formatElapsed2(startedAt) {
|
|
|
3716
4817
|
return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
|
|
3717
4818
|
}
|
|
3718
4819
|
function formatPiLogMessage(rawLine, source) {
|
|
3719
|
-
const parsed =
|
|
4820
|
+
const parsed = tryParseJsonValue3(rawLine);
|
|
3720
4821
|
if (parsed) {
|
|
3721
4822
|
const summary = summarizePiEvent(parsed);
|
|
3722
4823
|
if (summary) {
|
|
@@ -3729,7 +4830,7 @@ function formatPiLogMessage(rawLine, source) {
|
|
|
3729
4830
|
return rawLine;
|
|
3730
4831
|
}
|
|
3731
4832
|
function formatPiJsonLog(rawLine) {
|
|
3732
|
-
const parsed =
|
|
4833
|
+
const parsed = tryParseJsonValue3(rawLine);
|
|
3733
4834
|
if (!parsed) {
|
|
3734
4835
|
return rawLine;
|
|
3735
4836
|
}
|
|
@@ -3779,7 +4880,7 @@ function summarizePiEvent(event) {
|
|
|
3779
4880
|
return type;
|
|
3780
4881
|
}
|
|
3781
4882
|
}
|
|
3782
|
-
function
|
|
4883
|
+
function tryParseJsonValue3(rawLine) {
|
|
3783
4884
|
try {
|
|
3784
4885
|
return JSON.parse(rawLine);
|
|
3785
4886
|
} catch {
|
|
@@ -3804,7 +4905,7 @@ function parsePiJsonl(output) {
|
|
|
3804
4905
|
}
|
|
3805
4906
|
return parsed;
|
|
3806
4907
|
}
|
|
3807
|
-
function
|
|
4908
|
+
function extractOutputMessages2(events) {
|
|
3808
4909
|
for (let i = events.length - 1; i >= 0; i--) {
|
|
3809
4910
|
const event = events[i];
|
|
3810
4911
|
if (!event || typeof event !== "object") {
|
|
@@ -3845,8 +4946,8 @@ function convertPiMessage(message) {
|
|
|
3845
4946
|
if (typeof role !== "string") {
|
|
3846
4947
|
return void 0;
|
|
3847
4948
|
}
|
|
3848
|
-
const content =
|
|
3849
|
-
const toolCalls =
|
|
4949
|
+
const content = extractTextContent3(msg.content);
|
|
4950
|
+
const toolCalls = extractToolCalls3(msg.content);
|
|
3850
4951
|
const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
|
|
3851
4952
|
const metadata = {};
|
|
3852
4953
|
if (msg.api) metadata.api = msg.api;
|
|
@@ -3862,7 +4963,7 @@ function convertPiMessage(message) {
|
|
|
3862
4963
|
metadata: Object.keys(metadata).length > 0 ? metadata : void 0
|
|
3863
4964
|
};
|
|
3864
4965
|
}
|
|
3865
|
-
function
|
|
4966
|
+
function extractTextContent3(content) {
|
|
3866
4967
|
if (typeof content === "string") {
|
|
3867
4968
|
return content;
|
|
3868
4969
|
}
|
|
@@ -3881,7 +4982,7 @@ function extractTextContent(content) {
|
|
|
3881
4982
|
}
|
|
3882
4983
|
return textParts.length > 0 ? textParts.join("\n") : void 0;
|
|
3883
4984
|
}
|
|
3884
|
-
function
|
|
4985
|
+
function extractToolCalls3(content) {
|
|
3885
4986
|
if (!Array.isArray(content)) {
|
|
3886
4987
|
return [];
|
|
3887
4988
|
}
|
|
@@ -3926,7 +5027,7 @@ function extractAssistantText2(messages) {
|
|
|
3926
5027
|
function escapeAtSymbols(prompt) {
|
|
3927
5028
|
return prompt.replace(/@\[([^\]]+)\]:/g, "[[$1]]:");
|
|
3928
5029
|
}
|
|
3929
|
-
function
|
|
5030
|
+
function pickDetail3(stderr, stdout) {
|
|
3930
5031
|
const errorText = stderr.trim();
|
|
3931
5032
|
if (errorText.length > 0) {
|
|
3932
5033
|
return errorText;
|
|
@@ -3934,7 +5035,7 @@ function pickDetail2(stderr, stdout) {
|
|
|
3934
5035
|
const stdoutText = stdout.trim();
|
|
3935
5036
|
return stdoutText.length > 0 ? stdoutText : void 0;
|
|
3936
5037
|
}
|
|
3937
|
-
function
|
|
5038
|
+
function formatTimeoutSuffix4(timeoutMs) {
|
|
3938
5039
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
3939
5040
|
return "";
|
|
3940
5041
|
}
|
|
@@ -3947,7 +5048,7 @@ async function defaultPiRunner(options) {
|
|
|
3947
5048
|
const executable = parts[0];
|
|
3948
5049
|
const executableArgs = parts.slice(1);
|
|
3949
5050
|
const allArgs = [...executableArgs, ...options.args];
|
|
3950
|
-
const child = (0,
|
|
5051
|
+
const child = (0, import_node_child_process4.spawn)(executable, allArgs, {
|
|
3951
5052
|
cwd: options.cwd,
|
|
3952
5053
|
env: options.env,
|
|
3953
5054
|
stdio: ["pipe", "pipe", "pipe"],
|
|
@@ -4010,84 +5111,84 @@ async function defaultPiRunner(options) {
|
|
|
4010
5111
|
}
|
|
4011
5112
|
|
|
4012
5113
|
// src/evaluation/providers/targets.ts
|
|
4013
|
-
var
|
|
4014
|
-
var
|
|
4015
|
-
var CliHealthcheckHttpInputSchema =
|
|
4016
|
-
type:
|
|
4017
|
-
url:
|
|
4018
|
-
timeout_seconds:
|
|
4019
|
-
timeoutSeconds:
|
|
5114
|
+
var import_node_path13 = __toESM(require("path"), 1);
|
|
5115
|
+
var import_zod2 = require("zod");
|
|
5116
|
+
var CliHealthcheckHttpInputSchema = import_zod2.z.object({
|
|
5117
|
+
type: import_zod2.z.literal("http"),
|
|
5118
|
+
url: import_zod2.z.string().min(1, "healthcheck URL is required"),
|
|
5119
|
+
timeout_seconds: import_zod2.z.number().positive().optional(),
|
|
5120
|
+
timeoutSeconds: import_zod2.z.number().positive().optional()
|
|
4020
5121
|
});
|
|
4021
|
-
var CliHealthcheckCommandInputSchema =
|
|
4022
|
-
type:
|
|
4023
|
-
command_template:
|
|
4024
|
-
commandTemplate:
|
|
4025
|
-
cwd:
|
|
4026
|
-
timeout_seconds:
|
|
4027
|
-
timeoutSeconds:
|
|
5122
|
+
var CliHealthcheckCommandInputSchema = import_zod2.z.object({
|
|
5123
|
+
type: import_zod2.z.literal("command"),
|
|
5124
|
+
command_template: import_zod2.z.string().optional(),
|
|
5125
|
+
commandTemplate: import_zod2.z.string().optional(),
|
|
5126
|
+
cwd: import_zod2.z.string().optional(),
|
|
5127
|
+
timeout_seconds: import_zod2.z.number().positive().optional(),
|
|
5128
|
+
timeoutSeconds: import_zod2.z.number().positive().optional()
|
|
4028
5129
|
});
|
|
4029
|
-
var CliHealthcheckInputSchema =
|
|
5130
|
+
var CliHealthcheckInputSchema = import_zod2.z.discriminatedUnion("type", [
|
|
4030
5131
|
CliHealthcheckHttpInputSchema,
|
|
4031
5132
|
CliHealthcheckCommandInputSchema
|
|
4032
5133
|
]);
|
|
4033
|
-
var CliTargetInputSchema =
|
|
4034
|
-
name:
|
|
4035
|
-
provider:
|
|
5134
|
+
var CliTargetInputSchema = import_zod2.z.object({
|
|
5135
|
+
name: import_zod2.z.string().min(1, "target name is required"),
|
|
5136
|
+
provider: import_zod2.z.string().refine((p) => p.toLowerCase() === "cli", { message: "provider must be 'cli'" }),
|
|
4036
5137
|
// Command template - required (accept both naming conventions)
|
|
4037
|
-
command_template:
|
|
4038
|
-
commandTemplate:
|
|
5138
|
+
command_template: import_zod2.z.string().optional(),
|
|
5139
|
+
commandTemplate: import_zod2.z.string().optional(),
|
|
4039
5140
|
// Files format - optional
|
|
4040
|
-
files_format:
|
|
4041
|
-
filesFormat:
|
|
4042
|
-
attachments_format:
|
|
4043
|
-
attachmentsFormat:
|
|
5141
|
+
files_format: import_zod2.z.string().optional(),
|
|
5142
|
+
filesFormat: import_zod2.z.string().optional(),
|
|
5143
|
+
attachments_format: import_zod2.z.string().optional(),
|
|
5144
|
+
attachmentsFormat: import_zod2.z.string().optional(),
|
|
4044
5145
|
// Working directory - optional
|
|
4045
|
-
cwd:
|
|
5146
|
+
cwd: import_zod2.z.string().optional(),
|
|
4046
5147
|
// Timeout in seconds - optional
|
|
4047
|
-
timeout_seconds:
|
|
4048
|
-
timeoutSeconds:
|
|
5148
|
+
timeout_seconds: import_zod2.z.number().positive().optional(),
|
|
5149
|
+
timeoutSeconds: import_zod2.z.number().positive().optional(),
|
|
4049
5150
|
// Healthcheck configuration - optional
|
|
4050
5151
|
healthcheck: CliHealthcheckInputSchema.optional(),
|
|
4051
5152
|
// Verbose mode - optional
|
|
4052
|
-
verbose:
|
|
4053
|
-
cli_verbose:
|
|
4054
|
-
cliVerbose:
|
|
5153
|
+
verbose: import_zod2.z.boolean().optional(),
|
|
5154
|
+
cli_verbose: import_zod2.z.boolean().optional(),
|
|
5155
|
+
cliVerbose: import_zod2.z.boolean().optional(),
|
|
4055
5156
|
// Keep temp files - optional
|
|
4056
|
-
keep_temp_files:
|
|
4057
|
-
keepTempFiles:
|
|
4058
|
-
keep_output_files:
|
|
4059
|
-
keepOutputFiles:
|
|
5157
|
+
keep_temp_files: import_zod2.z.boolean().optional(),
|
|
5158
|
+
keepTempFiles: import_zod2.z.boolean().optional(),
|
|
5159
|
+
keep_output_files: import_zod2.z.boolean().optional(),
|
|
5160
|
+
keepOutputFiles: import_zod2.z.boolean().optional(),
|
|
4060
5161
|
// Common target fields
|
|
4061
|
-
judge_target:
|
|
4062
|
-
workers:
|
|
4063
|
-
provider_batching:
|
|
4064
|
-
providerBatching:
|
|
5162
|
+
judge_target: import_zod2.z.string().optional(),
|
|
5163
|
+
workers: import_zod2.z.number().int().min(1).optional(),
|
|
5164
|
+
provider_batching: import_zod2.z.boolean().optional(),
|
|
5165
|
+
providerBatching: import_zod2.z.boolean().optional()
|
|
4065
5166
|
}).refine((data) => data.command_template !== void 0 || data.commandTemplate !== void 0, {
|
|
4066
5167
|
message: "Either command_template or commandTemplate is required"
|
|
4067
5168
|
});
|
|
4068
|
-
var CliHealthcheckHttpSchema =
|
|
4069
|
-
type:
|
|
4070
|
-
url:
|
|
4071
|
-
timeoutMs:
|
|
5169
|
+
var CliHealthcheckHttpSchema = import_zod2.z.object({
|
|
5170
|
+
type: import_zod2.z.literal("http"),
|
|
5171
|
+
url: import_zod2.z.string().min(1),
|
|
5172
|
+
timeoutMs: import_zod2.z.number().positive().optional()
|
|
4072
5173
|
}).strict();
|
|
4073
|
-
var CliHealthcheckCommandSchema =
|
|
4074
|
-
type:
|
|
4075
|
-
commandTemplate:
|
|
4076
|
-
cwd:
|
|
4077
|
-
timeoutMs:
|
|
5174
|
+
var CliHealthcheckCommandSchema = import_zod2.z.object({
|
|
5175
|
+
type: import_zod2.z.literal("command"),
|
|
5176
|
+
commandTemplate: import_zod2.z.string().min(1),
|
|
5177
|
+
cwd: import_zod2.z.string().optional(),
|
|
5178
|
+
timeoutMs: import_zod2.z.number().positive().optional()
|
|
4078
5179
|
}).strict();
|
|
4079
|
-
var CliHealthcheckSchema =
|
|
5180
|
+
var CliHealthcheckSchema = import_zod2.z.discriminatedUnion("type", [
|
|
4080
5181
|
CliHealthcheckHttpSchema,
|
|
4081
5182
|
CliHealthcheckCommandSchema
|
|
4082
5183
|
]);
|
|
4083
|
-
var CliTargetConfigSchema =
|
|
4084
|
-
commandTemplate:
|
|
4085
|
-
filesFormat:
|
|
4086
|
-
cwd:
|
|
4087
|
-
timeoutMs:
|
|
5184
|
+
var CliTargetConfigSchema = import_zod2.z.object({
|
|
5185
|
+
commandTemplate: import_zod2.z.string().min(1),
|
|
5186
|
+
filesFormat: import_zod2.z.string().optional(),
|
|
5187
|
+
cwd: import_zod2.z.string().optional(),
|
|
5188
|
+
timeoutMs: import_zod2.z.number().positive().optional(),
|
|
4088
5189
|
healthcheck: CliHealthcheckSchema.optional(),
|
|
4089
|
-
verbose:
|
|
4090
|
-
keepTempFiles:
|
|
5190
|
+
verbose: import_zod2.z.boolean().optional(),
|
|
5191
|
+
keepTempFiles: import_zod2.z.boolean().optional()
|
|
4091
5192
|
}).strict();
|
|
4092
5193
|
function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
|
|
4093
5194
|
const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
|
|
@@ -4116,8 +5217,11 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
|
|
|
4116
5217
|
allowLiteral: true,
|
|
4117
5218
|
optionalEnv: true
|
|
4118
5219
|
});
|
|
4119
|
-
if (cwd && evalFilePath && !
|
|
4120
|
-
cwd =
|
|
5220
|
+
if (cwd && evalFilePath && !import_node_path13.default.isAbsolute(cwd)) {
|
|
5221
|
+
cwd = import_node_path13.default.resolve(import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath)), cwd);
|
|
5222
|
+
}
|
|
5223
|
+
if (!cwd && evalFilePath) {
|
|
5224
|
+
cwd = import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath));
|
|
4121
5225
|
}
|
|
4122
5226
|
return {
|
|
4123
5227
|
type: "command",
|
|
@@ -4144,11 +5248,11 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
|
|
|
4144
5248
|
allowLiteral: true,
|
|
4145
5249
|
optionalEnv: true
|
|
4146
5250
|
});
|
|
4147
|
-
if (cwd && evalFilePath && !
|
|
4148
|
-
cwd =
|
|
5251
|
+
if (cwd && evalFilePath && !import_node_path13.default.isAbsolute(cwd)) {
|
|
5252
|
+
cwd = import_node_path13.default.resolve(import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath)), cwd);
|
|
4149
5253
|
}
|
|
4150
5254
|
if (!cwd && evalFilePath) {
|
|
4151
|
-
cwd =
|
|
5255
|
+
cwd = import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath));
|
|
4152
5256
|
}
|
|
4153
5257
|
const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
|
|
4154
5258
|
const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
|
|
@@ -4175,11 +5279,11 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
|
|
|
4175
5279
|
"FILES",
|
|
4176
5280
|
"OUTPUT_FILE"
|
|
4177
5281
|
]);
|
|
4178
|
-
var BASE_TARGET_SCHEMA =
|
|
4179
|
-
name:
|
|
4180
|
-
provider:
|
|
4181
|
-
judge_target:
|
|
4182
|
-
workers:
|
|
5282
|
+
var BASE_TARGET_SCHEMA = import_zod2.z.object({
|
|
5283
|
+
name: import_zod2.z.string().min(1, "target name is required"),
|
|
5284
|
+
provider: import_zod2.z.string().min(1, "provider is required"),
|
|
5285
|
+
judge_target: import_zod2.z.string().optional(),
|
|
5286
|
+
workers: import_zod2.z.number().int().min(1).optional()
|
|
4183
5287
|
}).passthrough();
|
|
4184
5288
|
var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
|
|
4185
5289
|
function normalizeAzureApiVersion(value) {
|
|
@@ -4282,6 +5386,24 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
4282
5386
|
providerBatching,
|
|
4283
5387
|
config: resolvePiCodingAgentConfig(parsed, env)
|
|
4284
5388
|
};
|
|
5389
|
+
case "pi-agent-sdk":
|
|
5390
|
+
return {
|
|
5391
|
+
kind: "pi-agent-sdk",
|
|
5392
|
+
name: parsed.name,
|
|
5393
|
+
judgeTarget: parsed.judge_target,
|
|
5394
|
+
workers: parsed.workers,
|
|
5395
|
+
providerBatching,
|
|
5396
|
+
config: resolvePiAgentSdkConfig(parsed, env)
|
|
5397
|
+
};
|
|
5398
|
+
case "claude-code":
|
|
5399
|
+
return {
|
|
5400
|
+
kind: "claude-code",
|
|
5401
|
+
name: parsed.name,
|
|
5402
|
+
judgeTarget: parsed.judge_target,
|
|
5403
|
+
workers: parsed.workers,
|
|
5404
|
+
providerBatching,
|
|
5405
|
+
config: resolveClaudeCodeConfig(parsed, env)
|
|
5406
|
+
};
|
|
4285
5407
|
case "mock":
|
|
4286
5408
|
return {
|
|
4287
5409
|
kind: "mock",
|
|
@@ -4459,41 +5581,132 @@ function resolvePiCodingAgentConfig(target, env) {
|
|
|
4459
5581
|
allowLiteral: false,
|
|
4460
5582
|
optionalEnv: true
|
|
4461
5583
|
});
|
|
4462
|
-
const tools = resolveOptionalString(toolsSource, env, `${target.name} pi tools`, {
|
|
4463
|
-
allowLiteral: true,
|
|
5584
|
+
const tools = resolveOptionalString(toolsSource, env, `${target.name} pi tools`, {
|
|
5585
|
+
allowLiteral: true,
|
|
5586
|
+
optionalEnv: true
|
|
5587
|
+
});
|
|
5588
|
+
const thinking = resolveOptionalString(thinkingSource, env, `${target.name} pi thinking`, {
|
|
5589
|
+
allowLiteral: true,
|
|
5590
|
+
optionalEnv: true
|
|
5591
|
+
});
|
|
5592
|
+
const args = resolveOptionalStringArray(argsSource, env, `${target.name} pi args`);
|
|
5593
|
+
const cwd = resolveOptionalString(cwdSource, env, `${target.name} pi cwd`, {
|
|
5594
|
+
allowLiteral: true,
|
|
5595
|
+
optionalEnv: true
|
|
5596
|
+
});
|
|
5597
|
+
const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi timeout`);
|
|
5598
|
+
const logDir = resolveOptionalString(logDirSource, env, `${target.name} pi log directory`, {
|
|
5599
|
+
allowLiteral: true,
|
|
5600
|
+
optionalEnv: true
|
|
5601
|
+
});
|
|
5602
|
+
const logFormat = logFormatSource === "json" || logFormatSource === "summary" ? logFormatSource : void 0;
|
|
5603
|
+
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
5604
|
+
return {
|
|
5605
|
+
executable,
|
|
5606
|
+
provider,
|
|
5607
|
+
model,
|
|
5608
|
+
apiKey,
|
|
5609
|
+
tools,
|
|
5610
|
+
thinking,
|
|
5611
|
+
args,
|
|
5612
|
+
cwd,
|
|
5613
|
+
timeoutMs,
|
|
5614
|
+
logDir,
|
|
5615
|
+
logFormat,
|
|
5616
|
+
systemPrompt
|
|
5617
|
+
};
|
|
5618
|
+
}
|
|
5619
|
+
function resolvePiAgentSdkConfig(target, env) {
|
|
5620
|
+
const providerSource = target.pi_provider ?? target.piProvider ?? target.llm_provider;
|
|
5621
|
+
const modelSource = target.model ?? target.pi_model ?? target.piModel;
|
|
5622
|
+
const apiKeySource = target.api_key ?? target.apiKey;
|
|
5623
|
+
const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
|
|
5624
|
+
const systemPromptSource = target.system_prompt ?? target.systemPrompt;
|
|
5625
|
+
const provider = resolveOptionalString(
|
|
5626
|
+
providerSource,
|
|
5627
|
+
env,
|
|
5628
|
+
`${target.name} pi-agent-sdk provider`,
|
|
5629
|
+
{
|
|
5630
|
+
allowLiteral: true,
|
|
5631
|
+
optionalEnv: true
|
|
5632
|
+
}
|
|
5633
|
+
);
|
|
5634
|
+
const model = resolveOptionalString(modelSource, env, `${target.name} pi-agent-sdk model`, {
|
|
5635
|
+
allowLiteral: true,
|
|
5636
|
+
optionalEnv: true
|
|
5637
|
+
});
|
|
5638
|
+
const apiKey = resolveOptionalString(apiKeySource, env, `${target.name} pi-agent-sdk api key`, {
|
|
5639
|
+
allowLiteral: false,
|
|
4464
5640
|
optionalEnv: true
|
|
4465
5641
|
});
|
|
4466
|
-
const
|
|
5642
|
+
const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi-agent-sdk timeout`);
|
|
5643
|
+
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
5644
|
+
return {
|
|
5645
|
+
provider,
|
|
5646
|
+
model,
|
|
5647
|
+
apiKey,
|
|
5648
|
+
timeoutMs,
|
|
5649
|
+
systemPrompt
|
|
5650
|
+
};
|
|
5651
|
+
}
|
|
5652
|
+
function resolveClaudeCodeConfig(target, env) {
|
|
5653
|
+
const executableSource = target.executable ?? target.command ?? target.binary;
|
|
5654
|
+
const modelSource = target.model;
|
|
5655
|
+
const argsSource = target.args ?? target.arguments;
|
|
5656
|
+
const cwdSource = target.cwd;
|
|
5657
|
+
const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
|
|
5658
|
+
const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
|
|
5659
|
+
const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CLAUDE_CODE_LOG_FORMAT;
|
|
5660
|
+
const systemPromptSource = target.system_prompt ?? target.systemPrompt;
|
|
5661
|
+
const executable = resolveOptionalString(executableSource, env, `${target.name} claude-code executable`, {
|
|
4467
5662
|
allowLiteral: true,
|
|
4468
5663
|
optionalEnv: true
|
|
4469
|
-
});
|
|
4470
|
-
const
|
|
4471
|
-
const cwd = resolveOptionalString(cwdSource, env, `${target.name} pi cwd`, {
|
|
5664
|
+
}) ?? "claude";
|
|
5665
|
+
const model = resolveOptionalString(modelSource, env, `${target.name} claude-code model`, {
|
|
4472
5666
|
allowLiteral: true,
|
|
4473
5667
|
optionalEnv: true
|
|
4474
5668
|
});
|
|
4475
|
-
const
|
|
4476
|
-
const
|
|
5669
|
+
const args = resolveOptionalStringArray(argsSource, env, `${target.name} claude-code args`);
|
|
5670
|
+
const cwd = resolveOptionalString(cwdSource, env, `${target.name} claude-code cwd`, {
|
|
4477
5671
|
allowLiteral: true,
|
|
4478
5672
|
optionalEnv: true
|
|
4479
5673
|
});
|
|
4480
|
-
const
|
|
5674
|
+
const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} claude-code timeout`);
|
|
5675
|
+
const logDir = resolveOptionalString(
|
|
5676
|
+
logDirSource,
|
|
5677
|
+
env,
|
|
5678
|
+
`${target.name} claude-code log directory`,
|
|
5679
|
+
{
|
|
5680
|
+
allowLiteral: true,
|
|
5681
|
+
optionalEnv: true
|
|
5682
|
+
}
|
|
5683
|
+
);
|
|
5684
|
+
const logFormat = normalizeClaudeCodeLogFormat(logFormatSource);
|
|
4481
5685
|
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
4482
5686
|
return {
|
|
4483
5687
|
executable,
|
|
4484
|
-
provider,
|
|
4485
5688
|
model,
|
|
4486
|
-
|
|
4487
|
-
tools,
|
|
4488
|
-
thinking,
|
|
5689
|
+
systemPrompt,
|
|
4489
5690
|
args,
|
|
4490
5691
|
cwd,
|
|
4491
5692
|
timeoutMs,
|
|
4492
5693
|
logDir,
|
|
4493
|
-
logFormat
|
|
4494
|
-
systemPrompt
|
|
5694
|
+
logFormat
|
|
4495
5695
|
};
|
|
4496
5696
|
}
|
|
5697
|
+
function normalizeClaudeCodeLogFormat(value) {
|
|
5698
|
+
if (value === void 0 || value === null) {
|
|
5699
|
+
return void 0;
|
|
5700
|
+
}
|
|
5701
|
+
if (typeof value !== "string") {
|
|
5702
|
+
throw new Error("claude-code log format must be 'summary' or 'json'");
|
|
5703
|
+
}
|
|
5704
|
+
const normalized = value.trim().toLowerCase();
|
|
5705
|
+
if (normalized === "json" || normalized === "summary") {
|
|
5706
|
+
return normalized;
|
|
5707
|
+
}
|
|
5708
|
+
throw new Error("claude-code log format must be 'summary' or 'json'");
|
|
5709
|
+
}
|
|
4497
5710
|
function resolveMockConfig(target) {
|
|
4498
5711
|
const response = typeof target.response === "string" ? target.response : void 0;
|
|
4499
5712
|
return { response };
|
|
@@ -4529,13 +5742,13 @@ function resolveVSCodeConfig(target, env, insiders) {
|
|
|
4529
5742
|
};
|
|
4530
5743
|
}
|
|
4531
5744
|
var cliErrorMap = (issue, ctx) => {
|
|
4532
|
-
if (issue.code ===
|
|
5745
|
+
if (issue.code === import_zod2.z.ZodIssueCode.unrecognized_keys) {
|
|
4533
5746
|
return { message: `Unknown CLI provider settings: ${issue.keys.join(", ")}` };
|
|
4534
5747
|
}
|
|
4535
|
-
if (issue.code ===
|
|
5748
|
+
if (issue.code === import_zod2.z.ZodIssueCode.invalid_union_discriminator) {
|
|
4536
5749
|
return { message: "healthcheck type must be 'http' or 'command'" };
|
|
4537
5750
|
}
|
|
4538
|
-
if (issue.code ===
|
|
5751
|
+
if (issue.code === import_zod2.z.ZodIssueCode.invalid_type && issue.expected === "string") {
|
|
4539
5752
|
return { message: `${ctx.defaultError} (expected a string value)` };
|
|
4540
5753
|
}
|
|
4541
5754
|
return { message: ctx.defaultError };
|
|
@@ -4544,8 +5757,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
4544
5757
|
const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
|
|
4545
5758
|
if (!parseResult.success) {
|
|
4546
5759
|
const firstError = parseResult.error.errors[0];
|
|
4547
|
-
const
|
|
4548
|
-
const prefix =
|
|
5760
|
+
const path17 = firstError?.path.join(".") || "";
|
|
5761
|
+
const prefix = path17 ? `${target.name} ${path17}: ` : `${target.name}: `;
|
|
4549
5762
|
throw new Error(`${prefix}${firstError?.message}`);
|
|
4550
5763
|
}
|
|
4551
5764
|
const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
|
|
@@ -4733,7 +5946,7 @@ function resolveOptionalNumberArray(source, description) {
|
|
|
4733
5946
|
}
|
|
4734
5947
|
|
|
4735
5948
|
// src/evaluation/providers/vscode.ts
|
|
4736
|
-
var
|
|
5949
|
+
var import_node_path14 = __toESM(require("path"), 1);
|
|
4737
5950
|
var import_subagent = require("subagent");
|
|
4738
5951
|
|
|
4739
5952
|
// src/evaluation/providers/vscode-templates.ts
|
|
@@ -4903,7 +6116,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
4903
6116
|
return "";
|
|
4904
6117
|
}
|
|
4905
6118
|
const buildList = (files) => files.map((absolutePath) => {
|
|
4906
|
-
const fileName =
|
|
6119
|
+
const fileName = import_node_path14.default.basename(absolutePath);
|
|
4907
6120
|
const fileUri = pathToFileUri2(absolutePath);
|
|
4908
6121
|
return `* [${fileName}](${fileUri})`;
|
|
4909
6122
|
});
|
|
@@ -4928,8 +6141,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
4928
6141
|
}
|
|
4929
6142
|
const unique = /* @__PURE__ */ new Map();
|
|
4930
6143
|
for (const attachment of attachments) {
|
|
4931
|
-
const absolutePath =
|
|
4932
|
-
const normalized = absolutePath.split(
|
|
6144
|
+
const absolutePath = import_node_path14.default.resolve(attachment);
|
|
6145
|
+
const normalized = absolutePath.split(import_node_path14.default.sep).join("/");
|
|
4933
6146
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
4934
6147
|
if (!unique.has(absolutePath)) {
|
|
4935
6148
|
unique.set(absolutePath, absolutePath);
|
|
@@ -4944,7 +6157,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
4944
6157
|
}
|
|
4945
6158
|
const unique = /* @__PURE__ */ new Map();
|
|
4946
6159
|
for (const attachment of attachments) {
|
|
4947
|
-
const absolutePath =
|
|
6160
|
+
const absolutePath = import_node_path14.default.resolve(attachment);
|
|
4948
6161
|
if (!unique.has(absolutePath)) {
|
|
4949
6162
|
unique.set(absolutePath, absolutePath);
|
|
4950
6163
|
}
|
|
@@ -4952,7 +6165,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
4952
6165
|
return Array.from(unique.values());
|
|
4953
6166
|
}
|
|
4954
6167
|
function pathToFileUri2(filePath) {
|
|
4955
|
-
const absolutePath =
|
|
6168
|
+
const absolutePath = import_node_path14.default.isAbsolute(filePath) ? filePath : import_node_path14.default.resolve(filePath);
|
|
4956
6169
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
4957
6170
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
4958
6171
|
return `file:///${normalizedPath}`;
|
|
@@ -4965,7 +6178,7 @@ function normalizeAttachments(attachments) {
|
|
|
4965
6178
|
}
|
|
4966
6179
|
const deduped = /* @__PURE__ */ new Set();
|
|
4967
6180
|
for (const attachment of attachments) {
|
|
4968
|
-
deduped.add(
|
|
6181
|
+
deduped.add(import_node_path14.default.resolve(attachment));
|
|
4969
6182
|
}
|
|
4970
6183
|
return Array.from(deduped);
|
|
4971
6184
|
}
|
|
@@ -4974,7 +6187,7 @@ function mergeAttachments(all) {
|
|
|
4974
6187
|
for (const list of all) {
|
|
4975
6188
|
if (!list) continue;
|
|
4976
6189
|
for (const inputFile of list) {
|
|
4977
|
-
deduped.add(
|
|
6190
|
+
deduped.add(import_node_path14.default.resolve(inputFile));
|
|
4978
6191
|
}
|
|
4979
6192
|
}
|
|
4980
6193
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -5021,9 +6234,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
5021
6234
|
}
|
|
5022
6235
|
|
|
5023
6236
|
// src/evaluation/providers/targets-file.ts
|
|
5024
|
-
var
|
|
5025
|
-
var
|
|
5026
|
-
var
|
|
6237
|
+
var import_node_fs6 = require("fs");
|
|
6238
|
+
var import_promises12 = require("fs/promises");
|
|
6239
|
+
var import_node_path15 = __toESM(require("path"), 1);
|
|
5027
6240
|
var import_yaml3 = require("yaml");
|
|
5028
6241
|
function isRecord(value) {
|
|
5029
6242
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -5053,18 +6266,18 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
5053
6266
|
}
|
|
5054
6267
|
/**
 * Check whether a path exists.
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} `true` when the access check succeeds, `false`
 *   when it fails for any reason.
 */
async function fileExists3(filePath) {
  let exists = true;
  try {
    await (0, import_promises12.access)(filePath, import_node_fs6.constants.F_OK);
  } catch {
    // Any access failure is reported as "does not exist".
    exists = false;
  }
  return exists;
}
|
|
5062
6275
|
async function readTargetDefinitions(filePath) {
|
|
5063
|
-
const absolutePath =
|
|
6276
|
+
const absolutePath = import_node_path15.default.resolve(filePath);
|
|
5064
6277
|
if (!await fileExists3(absolutePath)) {
|
|
5065
6278
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
5066
6279
|
}
|
|
5067
|
-
const raw = await (0,
|
|
6280
|
+
const raw = await (0, import_promises12.readFile)(absolutePath, "utf8");
|
|
5068
6281
|
const parsed = (0, import_yaml3.parse)(raw);
|
|
5069
6282
|
if (!isRecord(parsed)) {
|
|
5070
6283
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
@@ -5094,6 +6307,10 @@ function createProvider(target) {
|
|
|
5094
6307
|
return new CodexProvider(target.name, target.config);
|
|
5095
6308
|
case "pi-coding-agent":
|
|
5096
6309
|
return new PiCodingAgentProvider(target.name, target.config);
|
|
6310
|
+
case "pi-agent-sdk":
|
|
6311
|
+
return new PiAgentSdkProvider(target.name, target.config);
|
|
6312
|
+
case "claude-code":
|
|
6313
|
+
return new ClaudeCodeProvider(target.name, target.config);
|
|
5097
6314
|
case "mock":
|
|
5098
6315
|
return new MockProvider(target.name, target.config);
|
|
5099
6316
|
case "vscode":
|
|
@@ -5112,78 +6329,176 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
5112
6329
|
|
|
5113
6330
|
// src/evaluation/evaluators.ts
|
|
5114
6331
|
var import_ai2 = require("ai");
|
|
5115
|
-
var
|
|
6332
|
+
var import_zod3 = require("zod");
|
|
5116
6333
|
|
|
5117
6334
|
// src/runtime/exec.ts
|
|
5118
|
-
function
|
|
5119
|
-
|
|
5120
|
-
|
|
6335
|
+
/**
 * Quote a path so it can be embedded in a shell command line.
 *
 * On Windows the value is wrapped in double quotes and embedded double quotes
 * are doubled; elsewhere it is wrapped in single quotes and every embedded
 * single quote is replaced by the `'"'"'` sequence.
 *
 * @param {string} value - Path to quote.
 * @returns {string} The quoted path.
 */
function shellEscapePath(value) {
  if (process.platform !== "win32") {
    const escaped = value.split("'").join(`'"'"'`);
    return `'${escaped}'`;
  }
  const escaped = value.split('"').join('""');
  return `"${escaped}"`;
}
|
|
5122
|
-
async function
|
|
5123
|
-
|
|
5124
|
-
|
|
5125
|
-
|
|
5126
|
-
|
|
5127
|
-
|
|
5128
|
-
|
|
5129
|
-
|
|
5130
|
-
|
|
5131
|
-
|
|
5132
|
-
|
|
5133
|
-
|
|
5134
|
-
|
|
5135
|
-
|
|
5136
|
-
|
|
5137
|
-
|
|
5138
|
-
|
|
5139
|
-
|
|
5140
|
-
|
|
5141
|
-
|
|
5142
|
-
|
|
5143
|
-
|
|
5144
|
-
|
|
6341
|
+
/**
 * Run an executable given as an argv array, feeding `stdinPayload` to its
 * standard input. Delegates to the Bun implementation when the `Bun` global
 * exists and to the Node implementation otherwise.
 *
 * @param {string[]} argv - Executable followed by its arguments; must not be empty.
 * @param {string} stdinPayload - Text written to the child's stdin.
 * @param {{cwd?: string, timeoutMs?: number}} [options] - Passed through to the runtime-specific implementation.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 * @throws {Error} When `argv` is empty.
 */
async function execFileWithStdin(argv, stdinPayload, options = {}) {
  if (argv.length === 0) {
    throw new Error("Executable argv must include at least one entry");
  }
  const runner = typeof Bun !== "undefined" ? execFileWithStdinBun : execFileWithStdinNode;
  return runner(argv, stdinPayload, options);
}
|
|
6350
|
+
/**
 * Bun implementation of {@link execFileWithStdin}: spawns `argv` with
 * `Bun.spawn`, passes the UTF-8 encoded payload as stdin and collects both
 * output streams as text with CRLF normalized to LF.
 *
 * @param {string[]} argv - Executable followed by its arguments.
 * @param {string} stdinPayload - Text passed to the child's stdin.
 * @param {{cwd?: string, timeoutMs?: number}} options - Working directory and optional timeout.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 * @throws {Error} When the timeout fired before the process finished.
 */
async function execFileWithStdinBun(argv, stdinPayload, options) {
  const proc = Bun.spawn([...argv], {
    cwd: options.cwd,
    stdin: new TextEncoder().encode(stdinPayload),
    stdout: "pipe",
    stderr: "pipe"
  });
  const readAll = (stream) => stream ? new Response(stream).text() : Promise.resolve("");
  const toLf = (text) => text.replace(/\r\n/g, "\n");
  let timedOut = false;
  let timer;
  if (options.timeoutMs !== void 0) {
    timer = setTimeout(() => {
      timedOut = true;
      proc.kill("SIGKILL");
    }, options.timeoutMs);
  }
  try {
    const [rawStdout, rawStderr, exitCode] = await Promise.all([
      readAll(proc.stdout),
      readAll(proc.stderr),
      proc.exited
    ]);
    // The kill above lets the awaits settle; report the timeout instead of the output.
    if (timedOut) {
      throw new Error(`Process timed out after ${options.timeoutMs}ms`);
    }
    return { stdout: toLf(rawStdout), stderr: toLf(rawStderr), exitCode };
  } finally {
    if (timer !== void 0) {
      clearTimeout(timer);
    }
  }
}
|
|
6386
|
+
/**
 * Node implementation of {@link execFileWithStdin}: spawns `argv` with
 * `child_process.spawn` (no shell), writes `stdinPayload` to the child's
 * stdin and collects stdout/stderr as UTF-8 text with CRLF normalized to LF.
 *
 * @param {string[]} argv - Executable followed by its arguments.
 * @param {string} stdinPayload - Text written to the child's stdin.
 * @param {{cwd?: string, timeoutMs?: number}} options - Working directory and optional timeout.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 *   Rejects when the process cannot be spawned or the timeout fired.
 */
async function execFileWithStdinNode(argv, stdinPayload, options) {
  const { spawn: spawn4 } = await import("child_process");
  return new Promise((resolve, reject) => {
    const [cmd, ...args] = argv;
    const child = spawn4(cmd, args, {
      cwd: options.cwd,
      stdio: ["pipe", "pipe", "pipe"]
    });
    // Collect raw buffers and decode once at the end so multi-byte characters
    // split across chunks are not corrupted.
    const stdoutChunks = [];
    const stderrChunks = [];
    child.stdout?.on("data", (chunk) => stdoutChunks.push(chunk));
    child.stderr?.on("data", (chunk) => stderrChunks.push(chunk));
    let timedOut = false;
    const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
      timedOut = true;
      child.kill("SIGKILL");
    }, options.timeoutMs) : void 0;
    child.on("error", (error) => {
      if (timeout !== void 0) clearTimeout(timeout);
      reject(error);
    });
    child.on("close", (code) => {
      if (timeout !== void 0) clearTimeout(timeout);
      if (timedOut) {
        reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
        return;
      }
      const stdout = Buffer.concat(stdoutChunks).toString("utf8").replace(/\r\n/g, "\n");
      const stderr = Buffer.concat(stderrChunks).toString("utf8").replace(/\r\n/g, "\n");
      resolve({
        stdout,
        stderr,
        // NOTE(review): `code` is null when the child was terminated by a
        // signal, which is reported here as 0 — confirm that is intended.
        exitCode: code ?? 0
      });
    });
    if (child.stdin) {
      // A child that exits (or fails to spawn) before draining stdin makes the
      // write fail with EPIPE. Without a listener that stream error is an
      // unhandled "error" event and crashes the host process. The outcome is
      // already reported through the "error"/"close" handlers above, so the
      // stdin failure itself is deliberately ignored.
      child.stdin.on("error", () => {
      });
      child.stdin.end(stdinPayload);
    }
  });
}
|
|
6427
|
+
/**
 * Run `command` through the system shell, feeding `stdinPayload` on stdin.
 *
 * The payload and both output streams go through files in a fresh temporary
 * directory (`agentv-exec-<uuid>` under the OS temp dir) using shell
 * redirection. The child itself runs with all stdio ignored. Output is read
 * back as UTF-8 with CRLF normalized to LF, and the temporary directory is
 * always removed afterwards.
 *
 * @param {string} command - Shell command line to execute.
 * @param {string} stdinPayload - Text made available on the command's stdin.
 * @param {{cwd?: string, timeoutMs?: number}} [options] - Working directory and
 *   optional timeout; a falsy `timeoutMs` (including 0) disables the timeout.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 *   Rejects when the shell cannot be spawned, the timeout fires, or a
 *   temporary file cannot be written or read.
 */
async function execShellWithStdin(command, stdinPayload, options = {}) {
  const { mkdir: mkdir4, readFile: readFile8, rm: rm4, writeFile: writeFile4 } = await import("fs/promises");
  const { tmpdir: tmpdir4 } = await import("os");
  const path17 = await import("path");
  const { randomUUID: randomUUID4 } = await import("crypto");
  const { spawn: spawn4 } = await import("child_process");
  const dir = path17.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
  const stdinPath = path17.join(dir, "stdin.txt");
  const stdoutPath = path17.join(dir, "stdout.txt");
  const stderrPath = path17.join(dir, "stderr.txt");
  // The wrapped command is the same on every platform; only the quoting done
  // by shellEscapePath differs.
  const wrappedCommand = `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
  await mkdir4(dir, { recursive: true });
  try {
    // Staging stdin happens inside the try block so the directory is removed
    // even when this write fails.
    await writeFile4(stdinPath, stdinPayload, "utf8");
    const exitCode = await new Promise((resolve, reject) => {
      const child = spawn4(wrappedCommand, {
        shell: true,
        cwd: options.cwd,
        stdio: ["ignore", "ignore", "ignore"]
      });
      const timeout = options.timeoutMs ? setTimeout(() => {
        child.kill();
        reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
      }, options.timeoutMs) : void 0;
      child.on("error", (error) => {
        if (timeout !== void 0) {
          clearTimeout(timeout);
        }
        reject(error);
      });
      child.on("exit", (code) => {
        if (timeout !== void 0) {
          clearTimeout(timeout);
        }
        resolve(code ?? 0);
      });
    });
    const stdout = (await readFile8(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
    const stderr = (await readFile8(stderrPath, "utf8")).replace(/\r\n/g, "\n");
    return { stdout, stderr, exitCode };
  } finally {
    await rm4(dir, { recursive: true, force: true });
  }
}
|
|
6471
|
+
|
|
6472
|
+
// src/evaluation/case-conversion.ts
|
|
6473
|
+
/**
 * Convert a camelCase identifier to snake_case.
 *
 * Strings that start with an uppercase ASCII letter are returned unchanged;
 * otherwise every uppercase ASCII letter becomes `_` plus its lowercase form.
 *
 * @param {string} str - Identifier to convert.
 * @returns {string} The converted identifier.
 */
function toSnakeCase(str) {
  const startsUppercase = /^[A-Z]/.test(str);
  return startsUppercase ? str : str.replace(/[A-Z]/g, (upper) => `_${upper.toLowerCase()}`);
}
|
|
6479
|
+
/**
 * Recursively rewrite the keys of objects (including objects nested in
 * arrays) with `toSnakeCase`. `null`, `undefined` and non-object values are
 * returned as-is; arrays and objects are rebuilt as new values.
 *
 * @param {*} obj - Value to convert.
 * @returns {*} The converted value.
 */
function toSnakeCaseDeep(obj) {
  if (obj === null || obj === void 0) {
    return obj;
  }
  if (typeof obj !== "object") {
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map((element) => toSnakeCaseDeep(element));
  }
  return Object.entries(obj).reduce((converted, [key, value]) => {
    converted[toSnakeCase(key)] = toSnakeCaseDeep(value);
    return converted;
  }, {});
}
|
|
5182
6496
|
|
|
5183
6497
|
// src/evaluation/providers/types.ts
|
|
5184
6498
|
// Provider kind identifiers that are treated as agent providers.
// NOTE(review): createProvider also handles "pi-agent-sdk", which is not listed
// here — confirm whether that omission is intentional.
var AGENT_PROVIDER_KINDS = ["codex", "pi-coding-agent", "claude-code", "vscode", "vscode-insiders"];
|
|
@@ -5224,20 +6539,20 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
5224
6539
|
|
|
5225
6540
|
[[ ## candidate_answer ## ]]
|
|
5226
6541
|
{{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
|
|
5227
|
-
var freeformEvaluationSchema =
|
|
5228
|
-
score:
|
|
5229
|
-
hits:
|
|
5230
|
-
misses:
|
|
5231
|
-
reasoning:
|
|
6542
|
+
// Schema for a free-form judgement: a required score in [0, 1] plus optional
// hit/miss lists and an optional short explanation.
var freeformEvaluationSchema = import_zod3.z.object({
  score: import_zod3.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
  hits: import_zod3.z.array(import_zod3.z.string()).describe("Brief specific achievements").optional(),
  misses: import_zod3.z.array(import_zod3.z.string()).describe("Brief failures or omissions").optional(),
  reasoning: import_zod3.z.string().describe("Concise explanation (1-2 sentences)").optional()
});

// Schema for the verdict on a single rubric item.
var rubricCheckResultSchema = import_zod3.z.object({
  id: import_zod3.z.string().describe("The ID of the rubric item being checked"),
  satisfied: import_zod3.z.boolean().describe("Whether this rubric requirement is met"),
  reasoning: import_zod3.z.string().describe("Brief explanation (1-2 sentences) for this check")
});

// Schema for a full rubric evaluation: one check per rubric item plus an
// overall summary.
var rubricEvaluationSchema = import_zod3.z.object({
  checks: import_zod3.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
  overall_reasoning: import_zod3.z.string().describe("Overall assessment summary (1-2 sentences)")
});
|
|
5242
6557
|
var LlmJudgeEvaluator = class {
|
|
5243
6558
|
kind = "llm_judge";
|
|
@@ -5473,30 +6788,30 @@ var CodeEvaluator = class {
|
|
|
5473
6788
|
script;
|
|
5474
6789
|
cwd;
|
|
5475
6790
|
agentTimeoutMs;
|
|
6791
|
+
config;
|
|
5476
6792
|
constructor(options) {
|
|
5477
6793
|
this.script = options.script;
|
|
5478
6794
|
this.cwd = options.cwd;
|
|
5479
6795
|
this.agentTimeoutMs = options.agentTimeoutMs;
|
|
6796
|
+
this.config = options.config;
|
|
5480
6797
|
}
|
|
5481
6798
|
async evaluate(context) {
|
|
5482
|
-
const
|
|
5483
|
-
|
|
5484
|
-
|
|
5485
|
-
|
|
5486
|
-
|
|
5487
|
-
|
|
5488
|
-
|
|
5489
|
-
|
|
5490
|
-
|
|
5491
|
-
|
|
5492
|
-
|
|
5493
|
-
|
|
5494
|
-
|
|
5495
|
-
|
|
5496
|
-
|
|
5497
|
-
|
|
5498
|
-
2
|
|
5499
|
-
);
|
|
6799
|
+
const payload = {
|
|
6800
|
+
question: context.evalCase.question,
|
|
6801
|
+
expectedOutcome: context.evalCase.expected_outcome,
|
|
6802
|
+
expectedMessages: context.evalCase.expected_messages,
|
|
6803
|
+
referenceAnswer: context.evalCase.reference_answer,
|
|
6804
|
+
candidateAnswer: context.candidate,
|
|
6805
|
+
outputMessages: context.outputMessages ?? null,
|
|
6806
|
+
guidelineFiles: context.evalCase.guideline_paths,
|
|
6807
|
+
inputFiles: context.evalCase.file_paths.filter(
|
|
6808
|
+
(path17) => !context.evalCase.guideline_paths.includes(path17)
|
|
6809
|
+
),
|
|
6810
|
+
inputMessages: context.evalCase.input_messages,
|
|
6811
|
+
traceSummary: context.traceSummary ?? null,
|
|
6812
|
+
config: this.config ?? null
|
|
6813
|
+
};
|
|
6814
|
+
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
5500
6815
|
try {
|
|
5501
6816
|
const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
|
|
5502
6817
|
const parsed = parseJsonSafe(stdout);
|
|
@@ -5562,18 +6877,25 @@ function calculateRubricScore(result, rubrics) {
|
|
|
5562
6877
|
return { score, verdict, hits, misses };
|
|
5563
6878
|
}
|
|
5564
6879
|
/**
 * Run a code-evaluator script with `input` on stdin and return its trimmed
 * stdout.
 *
 * A string `scriptPath` is executed through the shell (`execShellWithStdin`);
 * any other value is executed as an argv array (`execFileWithStdin`).
 *
 * @param {string|string[]} scriptPath - Shell command line or argv array.
 * @param {string} input - Payload passed on stdin.
 * @param {number} [agentTimeoutMs] - Timeout forwarded as `timeoutMs`.
 * @param {string} [cwd] - Working directory for the script.
 * @returns {Promise<string>} Trimmed stdout of the script.
 * @throws {Error} When the script exits with a non-zero code; the message
 *   includes the formatted stderr when there is any.
 */
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
  const execOptions = { cwd, timeoutMs: agentTimeoutMs };
  const run = typeof scriptPath === "string" ? execShellWithStdin : execFileWithStdin;
  const { stdout, stderr, exitCode } = await run(scriptPath, input, execOptions);
  if (exitCode === 0) {
    return stdout.trim();
  }
  const details = formatStderr(stderr);
  const suffix = details.length > 0 ? `: ${details}` : "";
  throw new Error(`Code evaluator exited with code ${exitCode}${suffix}`);
}
|
|
6889
|
+
/**
 * Trim stderr output for inclusion in an error message. Output longer than
 * 2000 characters is cut down to its last 2000 characters and prefixed with a
 * truncation notice.
 *
 * @param {string} stderr - Raw stderr text.
 * @returns {string} Trimmed (and possibly truncated) text.
 */
function formatStderr(stderr) {
  const limit = 2e3;
  const text = stderr.trim();
  // Keep the tail: the end of the output is what gets reported.
  return text.length > limit ? `...(truncated, last ${limit} chars)\n${text.slice(-limit)}` : text;
}
|
|
5577
6899
|
function parseJsonSafe(payload) {
|
|
5578
6900
|
try {
|
|
5579
6901
|
return JSON.parse(payload);
|
|
@@ -5805,22 +7127,438 @@ var ToolTrajectoryEvaluator = class {
|
|
|
5805
7127
|
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
5806
7128
|
}
|
|
5807
7129
|
} else {
|
|
5808
|
-
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
7130
|
+
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
7131
|
+
}
|
|
7132
|
+
}
|
|
7133
|
+
for (let i = checkLength; i < expected.length; i++) {
|
|
7134
|
+
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
7135
|
+
}
|
|
7136
|
+
const score = hits.length / expected.length;
|
|
7137
|
+
return {
|
|
7138
|
+
score,
|
|
7139
|
+
verdict: scoreToVerdict(score),
|
|
7140
|
+
hits,
|
|
7141
|
+
misses,
|
|
7142
|
+
expectedAspectCount: expected.length
|
|
7143
|
+
};
|
|
7144
|
+
}
|
|
7145
|
+
};
|
|
7146
|
+
// Date format hints used by FieldAccuracyEvaluator.compareDate when a field
// does not configure its own `formats`.
var DEFAULT_DATE_FORMATS = [
  "YYYY-MM-DDTHH:mm:ssZ", // ISO date-time with timezone
  "YYYY-MM-DDTHH:mm:ss", // ISO date-time
  "YYYY-MM-DD", // ISO date
  "DD-MMM-YYYY", // day, month name, year (e.g. "15-JAN-2025")
  "MM/DD/YYYY", // US, slashes
  "DD/MM/YYYY", // EU, slashes
  "MM-DD-YYYY", // US, dashes
  "DD-MM-YYYY" // EU, dashes
];
|
|
7164
|
+
// Lowercase English month names and abbreviations mapped to zero-based month
// indexes (January = 0), as expected by the Date constructor.
var MONTH_NAMES = {
  jan: 0, january: 0,
  feb: 1, february: 1,
  mar: 2, march: 2,
  apr: 3, april: 3,
  may: 4,
  jun: 5, june: 5,
  jul: 6, july: 6,
  aug: 7, august: 7,
  sep: 8, sept: 8, september: 8,
  oct: 9, october: 9,
  nov: 10, november: 10,
  dec: 11, december: 11
};
|
|
7190
|
+
/**
 * Evaluator that scores a JSON candidate answer field by field against the
 * expected data found in the eval case's `expected_messages`.
 *
 * Each entry of `config.fields` names a `path` and a `match` strategy
 * ("exact", "numeric_tolerance" or "date"). Per-field results are combined
 * according to `config.aggregation` ("weighted_average" by default, or
 * "all_or_nothing").
 */
var FieldAccuracyEvaluator = class {
  kind = "field_accuracy";
  config;
  /**
   * @param options Object whose `config` provides `fields` and, optionally,
   *   `aggregation`.
   */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Score `context.candidate` against the expected data of `context.evalCase`.
   * Returns a failing result (score 0) when the candidate cannot be parsed as
   * JSON or no expected data can be extracted.
   */
  evaluate(context) {
    const { evalCase, candidate } = context;
    let candidateData;
    try {
      candidateData = parseJsonFromTextSafe(candidate);
    } catch {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["Failed to parse candidate answer as JSON"],
        expectedAspectCount: this.config.fields.length,
        reasoning: "Candidate answer is not valid JSON"
      };
    }
    const expectedData = this.extractExpectedData(evalCase.expected_messages);
    if (!expectedData) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No expected data found in expected_messages"],
        expectedAspectCount: this.config.fields.length,
        reasoning: "Could not extract expected data from expected_messages"
      };
    }
    const fieldResults = this.config.fields.map(
      (fieldConfig) => this.evaluateField(fieldConfig, candidateData, expectedData)
    );
    return this.aggregateResults(fieldResults);
  }
  /**
   * Extract expected data from the expected_messages array.
   * Scans from the last message backwards for an assistant message whose
   * content is an object, or a string that parses as JSON.
   * Returns `undefined` when no such message exists.
   */
  extractExpectedData(expectedMessages) {
    for (let i = expectedMessages.length - 1; i >= 0; i--) {
      const message = expectedMessages[i];
      if (message.role === "assistant" && message.content) {
        if (typeof message.content === "object" && message.content !== null) {
          return message.content;
        }
        if (typeof message.content === "string") {
          try {
            return parseJsonFromTextSafe(message.content);
          } catch {
            // Not JSON: keep scanning earlier messages.
          }
        }
      }
    }
    return void 0;
  }
  /**
   * Evaluate a single configured field against the expected value.
   * A field without an expected value counts as a hit. A missing candidate
   * value is a miss when the field is required (the default), and a
   * zero-weight hit otherwise.
   */
  evaluateField(fieldConfig, candidateData, expectedData) {
    const { path: path17, match, required = true, weight = 1 } = fieldConfig;
    const candidateValue = resolvePath(candidateData, path17);
    const expectedValue = resolvePath(expectedData, path17);
    if (expectedValue === void 0) {
      // No expected value means no comparison is needed.
      return { path: path17, score: 1, weight, hit: true, message: `${path17}: no expected value` };
    }
    if (candidateValue === void 0) {
      if (required) {
        return { path: path17, score: 0, weight, hit: false, message: `${path17} (required, missing)` };
      }
      // Missing optional fields are not penalized: zero weight keeps them out
      // of the weighted score.
      return { path: path17, score: 1, weight: 0, hit: true, message: `${path17}: optional field missing` };
    }
    switch (match) {
      case "exact":
        return this.compareExact(path17, candidateValue, expectedValue, weight);
      case "numeric_tolerance":
        return this.compareNumericTolerance(path17, candidateValue, expectedValue, fieldConfig, weight);
      case "date":
        return this.compareDate(path17, candidateValue, expectedValue, fieldConfig, weight);
      default:
        return { path: path17, score: 0, weight, hit: false, message: `${path17}: unknown match type "${match}"` };
    }
  }
  /**
   * Exact equality comparison (via `deepEqual`). A mismatch is reported as a
   * type mismatch when the `typeof` of both values differs.
   */
  compareExact(path17, candidateValue, expectedValue, weight) {
    if (deepEqual(candidateValue, expectedValue)) {
      return { path: path17, score: 1, weight, hit: true, message: path17 };
    }
    if (typeof candidateValue !== typeof expectedValue) {
      return {
        path: path17,
        score: 0,
        weight,
        hit: false,
        message: `${path17} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
      };
    }
    return { path: path17, score: 0, weight, hit: false, message: `${path17} (value mismatch)` };
  }
  /**
   * Numeric comparison with absolute or relative tolerance
   * (`fieldConfig.tolerance`, default 0; `fieldConfig.relative`, default false).
   */
  compareNumericTolerance(path17, candidateValue, expectedValue, fieldConfig, weight) {
    const { tolerance = 0, relative = false } = fieldConfig;
    const candidateNum = toNumber(candidateValue);
    const expectedNum = toNumber(expectedValue);
    if (candidateNum === null || expectedNum === null) {
      return { path: path17, score: 0, weight, hit: false, message: `${path17} (non-numeric value)` };
    }
    if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
      return { path: path17, score: 0, weight, hit: false, message: `${path17} (invalid numeric value)` };
    }
    const diff = Math.abs(candidateNum - expectedNum);
    let withinTolerance;
    if (relative) {
      // With an expected value of 0 the relative difference is undefined, so
      // the absolute difference is used instead.
      const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
      withinTolerance = relativeDiff <= tolerance;
    } else {
      withinTolerance = diff <= tolerance;
    }
    if (withinTolerance) {
      return {
        path: path17,
        score: 1,
        weight,
        hit: true,
        message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
      };
    }
    return {
      path: path17,
      score: 0,
      weight,
      hit: false,
      message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
    };
  }
  /**
   * Date comparison with format normalization. Both values are stringified and
   * parsed with `parseDate`; two dates match when their local year, month and
   * day are equal.
   */
  compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
    const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
    const candidateDate = parseDate(String(candidateValue), formats);
    const expectedDate = parseDate(String(expectedValue), formats);
    if (candidateDate === null) {
      return { path: path17, score: 0, weight, hit: false, message: `${path17} (unparseable candidate date)` };
    }
    if (expectedDate === null) {
      return { path: path17, score: 0, weight, hit: false, message: `${path17} (unparseable expected date)` };
    }
    const sameDay = candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate();
    if (sameDay) {
      return { path: path17, score: 1, weight, hit: true, message: path17 };
    }
    return {
      path: path17,
      score: 0,
      weight,
      hit: false,
      message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
    };
  }
  /**
   * Aggregate field results using the configured strategy.
   * "all_or_nothing" yields 1 only when there are no misses; otherwise the
   * score is the weight-averaged field score. At most four hits and four
   * misses are reported.
   */
  aggregateResults(results) {
    const aggregation = this.config.aggregation ?? "weighted_average";
    const hits = [];
    const misses = [];
    for (const result of results) {
      (result.hit ? hits : misses).push(result.message);
    }
    let score;
    if (aggregation === "all_or_nothing") {
      score = misses.length === 0 ? 1 : 0;
    } else {
      const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
      if (totalWeight === 0) {
        // Nothing carries weight (no fields, or only skipped optional fields).
        // That must not count as a failure unless a zero-weight field actually
        // missed; otherwise missing optional fields would be penalized.
        score = misses.length === 0 ? 1 : 0;
      } else {
        const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
        score = weightedSum / totalWeight;
      }
    }
    const reasoning = `${hits.length}/${results.length} fields matched`;
    return {
      score: clampScore(score),
      verdict: scoreToVerdict(score),
      hits: hits.slice(0, 4),
      misses: misses.slice(0, 4),
      expectedAspectCount: results.length,
      reasoning
    };
  }
};
|
|
7469
|
+
/**
 * Look up a nested value by a dotted/bracketed path such as `a.b[0].c`.
 *
 * @param {*} obj - Root value to read from.
 * @param {string} path17 - Path made of property names and array indexes
 *   separated by `.`, `[` or `]`.
 * @returns {*} The value at the path, or `undefined` when the root or path is
 *   falsy or a non-object is reached before the path is exhausted.
 */
function resolvePath(obj, path17) {
  if (!path17 || !obj) {
    return void 0;
  }
  let cursor = obj;
  for (const segment of path17.split(/\.|\[|\]/)) {
    // Splitting on the separators leaves empty pieces (e.g. between "]" and ".").
    if (segment.length === 0) {
      continue;
    }
    if (cursor === null || cursor === void 0 || typeof cursor !== "object") {
      return void 0;
    }
    const useIndex = Array.isArray(cursor) && /^\d+$/.test(segment);
    cursor = useIndex ? cursor[Number.parseInt(segment, 10)] : cursor[segment];
  }
  return cursor;
}
|
|
7491
|
+
/**
 * Coerce a value to a number for numeric comparison.
 *
 * @param {*} value - Value to convert.
 * @returns {number|null} Numbers are returned unchanged; strings are parsed
 *   with `Number.parseFloat` (`null` when that yields NaN); every other type
 *   yields `null`.
 */
function toNumber(value) {
  switch (typeof value) {
    case "number":
      return value;
    case "string": {
      const parsed = Number.parseFloat(value);
      return Number.isNaN(parsed) ? null : parsed;
    }
    default:
      return null;
  }
}
|
|
7501
|
+
/**
 * Parse a date string into a Date, honouring the caller's format hints.
 *
 * Explicit patterns are tried first, so that the engine's own (US-biased,
 * implementation-defined) parsing of non-ISO strings cannot override the
 * `formats` hints:
 *   1. `YYYY-MM-DD`: built as a local calendar date. Callers compare local
 *      year/month/day, while `new Date("YYYY-MM-DD")` would be UTC midnight.
 *   2. `D-MonthName-YYYY` (e.g. "15-JAN-2025"), using MONTH_NAMES.
 *   3. `N/N/YYYY` or `N-N-YYYY`: month-first when only US formats are
 *      configured, day-first when only EU formats are configured. Otherwise
 *      day-first only if the first number cannot be a month, else month-first.
 *   4. Anything else falls back to `new Date(string)` (e.g. ISO date-times).
 *
 * Impossible calendar dates (month 13, February 31, ...) yield `null` instead
 * of silently rolling over into another month.
 *
 * @param {string} dateStr - Text to parse.
 * @param {string[]} formats - Format hints such as "MM/DD/YYYY" or "DD-MM-YYYY".
 * @returns {Date|null} The parsed date, or `null` when it cannot be parsed.
 */
function parseDate(dateStr, formats) {
  if (!dateStr) return null;
  const trimmed = dateStr.trim();
  // Build a local-midnight date and reject components that rolled over.
  const buildLocalDate = (year, monthIndex, day) => {
    const date = new Date(2e3, 0, 1);
    // setFullYear avoids the constructor's 0-99 => 1900-1999 year mapping.
    date.setFullYear(year, monthIndex, day);
    const isExact = date.getFullYear() === year && date.getMonth() === monthIndex && date.getDate() === day;
    return isExact ? date : null;
  };
  const isoDateMatch = trimmed.match(/^(\d{4})-(\d{2})-(\d{2})$/);
  if (isoDateMatch) {
    return buildLocalDate(
      Number.parseInt(isoDateMatch[1], 10),
      Number.parseInt(isoDateMatch[2], 10) - 1,
      Number.parseInt(isoDateMatch[3], 10)
    );
  }
  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
  if (localizedMatch) {
    const month = MONTH_NAMES[localizedMatch[2].toLowerCase()];
    if (month !== void 0) {
      return buildLocalDate(
        Number.parseInt(localizedMatch[3], 10),
        month,
        Number.parseInt(localizedMatch[1], 10)
      );
    }
    // Unknown month name: fall through to the generic parser below.
  }
  const numericMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
  if (numericMatch) {
    const first = Number.parseInt(numericMatch[1], 10);
    const second = Number.parseInt(numericMatch[2], 10);
    const year = Number.parseInt(numericMatch[3], 10);
    const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
    const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
    let monthFirst;
    if (hasUSFormat && !hasEUFormat) {
      monthFirst = true;
    } else if (hasEUFormat && !hasUSFormat) {
      monthFirst = false;
    } else {
      // Ambiguous configuration: day-first only when the first number cannot
      // be a month; otherwise default to month-first.
      monthFirst = !(first > 12 && second <= 12);
    }
    return monthFirst ? buildLocalDate(year, first - 1, second) : buildLocalDate(year, second - 1, first);
  }
  const fallback = new Date(trimmed);
  return Number.isNaN(fallback.getTime()) ? null : fallback;
}
|
|
7553
|
+
/**
 * Renders the calendar-date portion of a `Date` as seen in UTC.
 *
 * @param {Date} date - Date to format; `toISOString()` throws for an invalid date.
 * @returns {string} Everything before the `T` of `date.toISOString()`,
 *   e.g. `"2024-01-15"`.
 */
function formatDateISO(date) {
  const [datePortion] = date.toISOString().split("T");
  return datePortion;
}
|
|
7556
|
+
/**
 * Extracts and parses a JSON object embedded in free-form text.
 *
 * Markdown code-fence markers are stripped first; the span from the first `{`
 * to the last `}` is then parsed. When no braces are present the whole
 * cleaned text is parsed instead. Non-string input is treated as empty text.
 *
 * @param {*} text - Text that may contain a JSON object.
 * @returns {*} The parsed JSON value.
 * @throws {SyntaxError} Propagated from `JSON.parse` when the text is not valid JSON.
 */
function parseJsonFromTextSafe(text) {
  let stripped = "";
  if (typeof text === "string") {
    stripped = text.replace(/```json\n?|```/g, "").trim();
  }
  const objectSpan = stripped.match(/\{[\s\S]*\}/);
  return JSON.parse(objectSpan?.[0] ?? stripped);
}
|
|
5824
7562
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
5825
7563
|
{{EVALUATOR_RESULTS_JSON}}
|
|
5826
7564
|
|
|
@@ -6045,11 +7783,175 @@ var CompositeEvaluator = class {
|
|
|
6045
7783
|
}
|
|
6046
7784
|
}
|
|
6047
7785
|
};
|
|
7786
|
+
/**
 * Evaluator that passes when the trace's reported duration is within a
 * configured threshold (both in milliseconds).
 */
var LatencyEvaluator = class {
  kind = "latency";
  config;
  /**
   * @param {{ config: { threshold: number } }} options - Evaluator options;
   *   `config.threshold` is the maximum allowed duration.
   */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Compares `context.traceSummary.durationMs` against the threshold.
   *
   * @param {object} context - Evaluation context; only `traceSummary` is read.
   * @returns {object} Score object: score 1 / verdict "pass" when the duration
   *   is at or below the threshold, otherwise score 0 / verdict "fail"
   *   (including when no duration was reported).
   */
  evaluate(context) {
    const limit = this.config.threshold;
    const elapsed = context.traceSummary?.durationMs;
    const evaluatorRawRequest = {
      type: "latency",
      threshold: limit,
      durationMs: elapsed === undefined ? null : elapsed
    };
    // Without a reported duration there is nothing to compare against.
    if (elapsed === undefined) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No duration data available in trace"],
        expectedAspectCount: 1,
        reasoning: "Execution duration not reported by provider",
        evaluatorRawRequest
      };
    }
    const withinLimit = elapsed <= limit;
    const hits = [];
    const misses = [];
    if (withinLimit) {
      hits.push(`Duration ${elapsed}ms <= ${limit}ms threshold`);
    } else {
      misses.push(`Duration ${elapsed}ms > ${limit}ms threshold`);
    }
    return {
      score: withinLimit ? 1 : 0,
      verdict: withinLimit ? "pass" : "fail",
      hits,
      misses,
      expectedAspectCount: 1,
      reasoning: `Execution took ${elapsed}ms (threshold: ${limit}ms)`,
      evaluatorRawRequest
    };
  }
};
|
|
7827
|
+
/**
 * Evaluator that passes when the trace's reported cost (USD) is within a
 * configured budget.
 */
var CostEvaluator = class {
  kind = "cost";
  config;
  /**
   * @param {{ config: { budget: number } }} options - Evaluator options;
   *   `config.budget` is the maximum allowed cost.
   */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Compares `context.traceSummary.costUsd` against the budget.
   *
   * @param {object} context - Evaluation context; only `traceSummary` is read.
   * @returns {object} Score object: score 1 / verdict "pass" when the cost is
   *   at or below the budget, otherwise score 0 / verdict "fail" (including
   *   when no cost was reported).
   */
  evaluate(context) {
    const limit = this.config.budget;
    const spent = context.traceSummary?.costUsd;
    const evaluatorRawRequest = {
      type: "cost",
      budget: limit,
      costUsd: spent === undefined ? null : spent
    };
    // Without a reported cost there is nothing to compare against.
    if (spent === undefined) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No cost data available in trace"],
        expectedAspectCount: 1,
        reasoning: "Execution cost not reported by provider",
        evaluatorRawRequest
      };
    }
    // Amounts are shown with four decimal places.
    const asDollars = (amount) => `$${amount.toFixed(4)}`;
    const spentText = asDollars(spent);
    const limitText = asDollars(limit);
    const withinBudget = spent <= limit;
    const hits = [];
    const misses = [];
    if (withinBudget) {
      hits.push(`Cost ${spentText} <= ${limitText} budget`);
    } else {
      misses.push(`Cost ${spentText} > ${limitText} budget`);
    }
    return {
      score: withinBudget ? 1 : 0,
      verdict: withinBudget ? "pass" : "fail",
      hits,
      misses,
      expectedAspectCount: 1,
      reasoning: `Execution cost ${spentText} (budget: ${limitText})`,
      evaluatorRawRequest
    };
  }
};
|
|
7869
|
+
/**
 * Evaluator that checks the trace's token usage against optional input,
 * output and total limits (`max_input`, `max_output`, `max_total`).
 */
var TokenUsageEvaluator = class {
  kind = "token_usage";
  config;
  /**
   * @param {{ config: { max_total?: number, max_input?: number, max_output?: number } }} options
   *   Evaluator options; only limits whose value is a number are enforced.
   */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Compares `context.traceSummary.tokenUsage` against the configured limits.
   * The total is computed as input + output + cached (cached defaults to 0).
   *
   * @param {object} context - Evaluation context; only `traceSummary` is read.
   * @returns {object} Score object: score 1 / verdict "pass" when no limit is
   *   exceeded, otherwise score 0 / verdict "fail" (including when no token
   *   usage was reported).
   */
  evaluate(context) {
    const usage = context.traceSummary?.tokenUsage;
    const { max_total: maxTotal, max_input: maxInput, max_output: maxOutput } = this.config;
    // At least one aspect is always reported, even when no limit is configured.
    const configuredLimits = [maxTotal, maxInput, maxOutput].filter((limit) => typeof limit === "number");
    const expectedAspectCount = configuredLimits.length > 0 ? configuredLimits.length : 1;
    const requestBase = {
      type: "token_usage",
      max_total: maxTotal ?? null,
      max_input: maxInput ?? null,
      max_output: maxOutput ?? null
    };
    if (!usage) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No token usage data available in trace"],
        expectedAspectCount,
        reasoning: "Token usage not reported by provider",
        evaluatorRawRequest: { ...requestBase, tokenUsage: null }
      };
    }
    const { input, output } = usage;
    const cached = usage.cached ?? 0;
    const total = input + output + cached;
    const hits = [];
    const misses = [];
    // Records one outcome per configured limit; unconfigured limits are skipped.
    const record = (limit, actual, okText, overText) => {
      if (typeof limit !== "number") {
        return;
      }
      if (actual <= limit) {
        hits.push(okText);
      } else {
        misses.push(overText);
      }
    };
    record(maxInput, input, `Input tokens ${input} <= ${maxInput}`, `Input tokens ${input} > ${maxInput}`);
    record(maxOutput, output, `Output tokens ${output} <= ${maxOutput}`, `Output tokens ${output} > ${maxOutput}`);
    record(maxTotal, total, `Total tokens ${total} <= ${maxTotal}`, `Total tokens ${total} > ${maxTotal}`);
    const withinLimits = misses.length === 0;
    return {
      score: withinLimits ? 1 : 0,
      verdict: withinLimits ? "pass" : "fail",
      hits,
      misses,
      expectedAspectCount,
      reasoning: `token_usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
      evaluatorRawRequest: {
        ...requestBase,
        tokenUsage: { input, output, cached, total }
      }
    };
  }
};
|
|
6048
7951
|
|
|
6049
7952
|
// src/evaluation/orchestrator.ts
|
|
6050
|
-
var
|
|
6051
|
-
var
|
|
6052
|
-
var import_node_path15 = __toESM(require("path"), 1);
|
|
7953
|
+
var import_node_crypto4 = require("crypto");
|
|
7954
|
+
var import_node_path16 = __toESM(require("path"), 1);
|
|
6053
7955
|
|
|
6054
7956
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
6055
7957
|
var Node = class {
|
|
@@ -6191,6 +8093,9 @@ function validateConcurrency(concurrency) {
|
|
|
6191
8093
|
}
|
|
6192
8094
|
|
|
6193
8095
|
// src/evaluation/orchestrator.ts
|
|
8096
|
+
/**
 * Tells whether prompts for `provider` should use the "agent" formatting mode
 * rather than "lm" (the call sites pick the mode from this result).
 *
 * @param {object} provider - Provider instance; `kind` is inspected.
 * @returns {*} The truthy result of `isAgentProvider(provider)` when there is
 *   one, otherwise whether `provider.kind` is `"cli"`.
 */
function usesFileReferencePrompt(provider) {
  const agentBacked = isAgentProvider(provider);
  if (agentBacked) {
    return agentBacked;
  }
  return provider.kind === "cli";
}
|
|
6194
8099
|
async function runEvaluation(options) {
|
|
6195
8100
|
const {
|
|
6196
8101
|
testFilePath: evalFilePath,
|
|
@@ -6202,7 +8107,6 @@ async function runEvaluation(options) {
|
|
|
6202
8107
|
evaluators,
|
|
6203
8108
|
maxRetries,
|
|
6204
8109
|
agentTimeoutMs,
|
|
6205
|
-
promptDumpDir,
|
|
6206
8110
|
cache,
|
|
6207
8111
|
useCache,
|
|
6208
8112
|
now,
|
|
@@ -6282,7 +8186,6 @@ async function runEvaluation(options) {
|
|
|
6282
8186
|
provider: primaryProvider,
|
|
6283
8187
|
target,
|
|
6284
8188
|
evaluatorRegistry,
|
|
6285
|
-
promptDumpDir,
|
|
6286
8189
|
nowFn: now ?? (() => /* @__PURE__ */ new Date()),
|
|
6287
8190
|
onProgress,
|
|
6288
8191
|
onResult,
|
|
@@ -6324,7 +8227,6 @@ async function runEvaluation(options) {
|
|
|
6324
8227
|
evaluators: evaluatorRegistry,
|
|
6325
8228
|
maxRetries,
|
|
6326
8229
|
agentTimeoutMs,
|
|
6327
|
-
promptDumpDir,
|
|
6328
8230
|
cache,
|
|
6329
8231
|
useCache,
|
|
6330
8232
|
now,
|
|
@@ -6367,7 +8269,8 @@ async function runEvaluation(options) {
|
|
|
6367
8269
|
results.push(outcome.value);
|
|
6368
8270
|
} else {
|
|
6369
8271
|
const evalCase = filteredEvalCases[i];
|
|
6370
|
-
const
|
|
8272
|
+
const formattingMode = usesFileReferencePrompt(primaryProvider) ? "agent" : "lm";
|
|
8273
|
+
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
6371
8274
|
const errorResult = buildErrorResult(
|
|
6372
8275
|
evalCase,
|
|
6373
8276
|
target.name,
|
|
@@ -6390,7 +8293,6 @@ async function runBatchEvaluation(options) {
|
|
|
6390
8293
|
provider,
|
|
6391
8294
|
target,
|
|
6392
8295
|
evaluatorRegistry,
|
|
6393
|
-
promptDumpDir,
|
|
6394
8296
|
nowFn,
|
|
6395
8297
|
onProgress,
|
|
6396
8298
|
onResult,
|
|
@@ -6398,12 +8300,9 @@ async function runBatchEvaluation(options) {
|
|
|
6398
8300
|
agentTimeoutMs
|
|
6399
8301
|
} = options;
|
|
6400
8302
|
const promptInputsList = [];
|
|
6401
|
-
const formattingMode =
|
|
8303
|
+
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
6402
8304
|
for (const evalCase of evalCases) {
|
|
6403
8305
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
6404
|
-
if (promptDumpDir) {
|
|
6405
|
-
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
6406
|
-
}
|
|
6407
8306
|
promptInputsList.push(promptInputs);
|
|
6408
8307
|
}
|
|
6409
8308
|
const batchRequests = evalCases.map((evalCase, index) => {
|
|
@@ -6445,13 +8344,20 @@ async function runBatchEvaluation(options) {
|
|
|
6445
8344
|
const promptInputs = promptInputsList[i];
|
|
6446
8345
|
const providerResponse = batchResponse[i];
|
|
6447
8346
|
const outputMessages = providerResponse.outputMessages;
|
|
6448
|
-
const
|
|
8347
|
+
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
8348
|
+
const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : hasExecutionMetrics ? {
|
|
8349
|
+
eventCount: 0,
|
|
8350
|
+
toolNames: [],
|
|
8351
|
+
toolCallsByName: {},
|
|
8352
|
+
errorCount: 0
|
|
8353
|
+
} : void 0;
|
|
6449
8354
|
const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
6450
8355
|
tokenUsage: providerResponse.tokenUsage,
|
|
6451
8356
|
costUsd: providerResponse.costUsd,
|
|
6452
8357
|
durationMs: providerResponse.durationMs
|
|
6453
8358
|
}) : void 0;
|
|
6454
8359
|
const candidate = extractLastAssistantContent(outputMessages);
|
|
8360
|
+
const providerError = extractProviderError(providerResponse);
|
|
6455
8361
|
let result;
|
|
6456
8362
|
try {
|
|
6457
8363
|
result = await evaluateCandidate({
|
|
@@ -6468,6 +8374,9 @@ async function runBatchEvaluation(options) {
|
|
|
6468
8374
|
outputMessages,
|
|
6469
8375
|
traceSummary
|
|
6470
8376
|
});
|
|
8377
|
+
if (providerError) {
|
|
8378
|
+
result = { ...result, error: providerError };
|
|
8379
|
+
}
|
|
6471
8380
|
} catch (error) {
|
|
6472
8381
|
const errorResult = buildErrorResult(
|
|
6473
8382
|
evalCase,
|
|
@@ -6500,9 +8409,10 @@ async function runBatchEvaluation(options) {
|
|
|
6500
8409
|
await onProgress({
|
|
6501
8410
|
workerId: 1,
|
|
6502
8411
|
evalId: evalCase.id,
|
|
6503
|
-
status: "completed",
|
|
8412
|
+
status: result.error ? "failed" : "completed",
|
|
6504
8413
|
startedAt: 0,
|
|
6505
|
-
completedAt: Date.now()
|
|
8414
|
+
completedAt: Date.now(),
|
|
8415
|
+
error: result.error
|
|
6506
8416
|
});
|
|
6507
8417
|
}
|
|
6508
8418
|
}
|
|
@@ -6517,17 +8427,13 @@ async function runEvalCase(options) {
|
|
|
6517
8427
|
now,
|
|
6518
8428
|
maxRetries,
|
|
6519
8429
|
agentTimeoutMs,
|
|
6520
|
-
promptDumpDir,
|
|
6521
8430
|
cache,
|
|
6522
8431
|
useCache,
|
|
6523
8432
|
signal,
|
|
6524
8433
|
judgeProvider
|
|
6525
8434
|
} = options;
|
|
6526
|
-
const formattingMode =
|
|
8435
|
+
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
6527
8436
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
6528
|
-
if (promptDumpDir) {
|
|
6529
|
-
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
6530
|
-
}
|
|
6531
8437
|
const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
|
|
6532
8438
|
let cachedResponse;
|
|
6533
8439
|
if (cacheKey && cache) {
|
|
@@ -6571,15 +8477,22 @@ async function runEvalCase(options) {
|
|
|
6571
8477
|
await cache.set(cacheKey, providerResponse);
|
|
6572
8478
|
}
|
|
6573
8479
|
const outputMessages = providerResponse.outputMessages;
|
|
6574
|
-
const
|
|
8480
|
+
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
8481
|
+
const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : hasExecutionMetrics ? {
|
|
8482
|
+
eventCount: 0,
|
|
8483
|
+
toolNames: [],
|
|
8484
|
+
toolCallsByName: {},
|
|
8485
|
+
errorCount: 0
|
|
8486
|
+
} : void 0;
|
|
6575
8487
|
const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
6576
8488
|
tokenUsage: providerResponse.tokenUsage,
|
|
6577
8489
|
costUsd: providerResponse.costUsd,
|
|
6578
8490
|
durationMs: providerResponse.durationMs
|
|
6579
8491
|
}) : void 0;
|
|
6580
8492
|
const candidate = extractLastAssistantContent(outputMessages);
|
|
8493
|
+
const providerError = extractProviderError(providerResponse);
|
|
6581
8494
|
try {
|
|
6582
|
-
|
|
8495
|
+
const result = await evaluateCandidate({
|
|
6583
8496
|
evalCase,
|
|
6584
8497
|
candidate,
|
|
6585
8498
|
target,
|
|
@@ -6593,6 +8506,7 @@ async function runEvalCase(options) {
|
|
|
6593
8506
|
outputMessages,
|
|
6594
8507
|
traceSummary
|
|
6595
8508
|
});
|
|
8509
|
+
return providerError ? { ...result, error: providerError } : result;
|
|
6596
8510
|
} catch (error) {
|
|
6597
8511
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
6598
8512
|
}
|
|
@@ -6658,7 +8572,6 @@ async function evaluateCandidate(options) {
|
|
|
6658
8572
|
candidateAnswer: candidate,
|
|
6659
8573
|
target: target.name,
|
|
6660
8574
|
reasoning: score.reasoning,
|
|
6661
|
-
rawAspects: score.rawAspects,
|
|
6662
8575
|
agentProviderRequest,
|
|
6663
8576
|
lmProviderRequest,
|
|
6664
8577
|
evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
@@ -6768,7 +8681,8 @@ async function runEvaluatorList(options) {
|
|
|
6768
8681
|
const codeEvaluator = new CodeEvaluator({
|
|
6769
8682
|
script: evaluator.script,
|
|
6770
8683
|
cwd: evaluator.resolvedCwd ?? evaluator.cwd,
|
|
6771
|
-
agentTimeoutMs
|
|
8684
|
+
agentTimeoutMs,
|
|
8685
|
+
config: evaluator.config
|
|
6772
8686
|
});
|
|
6773
8687
|
const score2 = await codeEvaluator.evaluate({
|
|
6774
8688
|
evalCase,
|
|
@@ -6796,7 +8710,7 @@ async function runEvaluatorList(options) {
|
|
|
6796
8710
|
});
|
|
6797
8711
|
}
|
|
6798
8712
|
if (evaluator.type === "composite") {
|
|
6799
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
8713
|
+
const evalFileDir = evalCase.guideline_paths[0] ? import_node_path16.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
6800
8714
|
const createEvaluator = (memberConfig) => {
|
|
6801
8715
|
switch (memberConfig.type) {
|
|
6802
8716
|
case "llm_judge":
|
|
@@ -6805,7 +8719,8 @@ async function runEvaluatorList(options) {
|
|
|
6805
8719
|
return new CodeEvaluator({
|
|
6806
8720
|
script: memberConfig.script,
|
|
6807
8721
|
cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
|
|
6808
|
-
agentTimeoutMs
|
|
8722
|
+
agentTimeoutMs,
|
|
8723
|
+
config: memberConfig.config
|
|
6809
8724
|
});
|
|
6810
8725
|
case "composite":
|
|
6811
8726
|
return new CompositeEvaluator({
|
|
@@ -6817,6 +8732,22 @@ async function runEvaluatorList(options) {
|
|
|
6817
8732
|
return new ToolTrajectoryEvaluator({
|
|
6818
8733
|
config: memberConfig
|
|
6819
8734
|
});
|
|
8735
|
+
case "field_accuracy":
|
|
8736
|
+
return new FieldAccuracyEvaluator({
|
|
8737
|
+
config: memberConfig
|
|
8738
|
+
});
|
|
8739
|
+
case "latency":
|
|
8740
|
+
return new LatencyEvaluator({
|
|
8741
|
+
config: memberConfig
|
|
8742
|
+
});
|
|
8743
|
+
case "cost":
|
|
8744
|
+
return new CostEvaluator({
|
|
8745
|
+
config: memberConfig
|
|
8746
|
+
});
|
|
8747
|
+
case "token_usage":
|
|
8748
|
+
return new TokenUsageEvaluator({
|
|
8749
|
+
config: memberConfig
|
|
8750
|
+
});
|
|
6820
8751
|
default: {
|
|
6821
8752
|
const unknownConfig = memberConfig;
|
|
6822
8753
|
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
@@ -6836,7 +8767,9 @@ async function runEvaluatorList(options) {
|
|
|
6836
8767
|
attempt,
|
|
6837
8768
|
promptInputs,
|
|
6838
8769
|
now,
|
|
6839
|
-
judgeProvider
|
|
8770
|
+
judgeProvider,
|
|
8771
|
+
outputMessages,
|
|
8772
|
+
traceSummary
|
|
6840
8773
|
});
|
|
6841
8774
|
const weight = evaluator.weight ?? 1;
|
|
6842
8775
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -6881,6 +8814,118 @@ async function runEvaluatorList(options) {
|
|
|
6881
8814
|
reasoning: score2.reasoning
|
|
6882
8815
|
});
|
|
6883
8816
|
}
|
|
8817
|
+
if (evaluator.type === "field_accuracy") {
|
|
8818
|
+
const fieldAccuracyEvaluator = new FieldAccuracyEvaluator({
|
|
8819
|
+
config: evaluator
|
|
8820
|
+
});
|
|
8821
|
+
const score2 = fieldAccuracyEvaluator.evaluate({
|
|
8822
|
+
evalCase,
|
|
8823
|
+
candidate,
|
|
8824
|
+
target,
|
|
8825
|
+
provider,
|
|
8826
|
+
attempt,
|
|
8827
|
+
promptInputs,
|
|
8828
|
+
now,
|
|
8829
|
+
outputMessages,
|
|
8830
|
+
traceSummary
|
|
8831
|
+
});
|
|
8832
|
+
const weight = evaluator.weight ?? 1;
|
|
8833
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
8834
|
+
evaluatorResults.push({
|
|
8835
|
+
name: evaluator.name,
|
|
8836
|
+
type: evaluator.type,
|
|
8837
|
+
score: score2.score,
|
|
8838
|
+
weight,
|
|
8839
|
+
verdict: score2.verdict,
|
|
8840
|
+
hits: score2.hits,
|
|
8841
|
+
misses: score2.misses,
|
|
8842
|
+
reasoning: score2.reasoning
|
|
8843
|
+
});
|
|
8844
|
+
}
|
|
8845
|
+
if (evaluator.type === "latency") {
|
|
8846
|
+
const latencyEvaluator = new LatencyEvaluator({
|
|
8847
|
+
config: evaluator
|
|
8848
|
+
});
|
|
8849
|
+
const score2 = latencyEvaluator.evaluate({
|
|
8850
|
+
evalCase,
|
|
8851
|
+
candidate,
|
|
8852
|
+
target,
|
|
8853
|
+
provider,
|
|
8854
|
+
attempt,
|
|
8855
|
+
promptInputs,
|
|
8856
|
+
now,
|
|
8857
|
+
outputMessages,
|
|
8858
|
+
traceSummary
|
|
8859
|
+
});
|
|
8860
|
+
const weight = evaluator.weight ?? 1;
|
|
8861
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
8862
|
+
evaluatorResults.push({
|
|
8863
|
+
name: evaluator.name,
|
|
8864
|
+
type: evaluator.type,
|
|
8865
|
+
score: score2.score,
|
|
8866
|
+
weight,
|
|
8867
|
+
verdict: score2.verdict,
|
|
8868
|
+
hits: score2.hits,
|
|
8869
|
+
misses: score2.misses,
|
|
8870
|
+
reasoning: score2.reasoning
|
|
8871
|
+
});
|
|
8872
|
+
}
|
|
8873
|
+
if (evaluator.type === "cost") {
|
|
8874
|
+
const costEvaluator = new CostEvaluator({
|
|
8875
|
+
config: evaluator
|
|
8876
|
+
});
|
|
8877
|
+
const score2 = costEvaluator.evaluate({
|
|
8878
|
+
evalCase,
|
|
8879
|
+
candidate,
|
|
8880
|
+
target,
|
|
8881
|
+
provider,
|
|
8882
|
+
attempt,
|
|
8883
|
+
promptInputs,
|
|
8884
|
+
now,
|
|
8885
|
+
outputMessages,
|
|
8886
|
+
traceSummary
|
|
8887
|
+
});
|
|
8888
|
+
const weight = evaluator.weight ?? 1;
|
|
8889
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
8890
|
+
evaluatorResults.push({
|
|
8891
|
+
name: evaluator.name,
|
|
8892
|
+
type: evaluator.type,
|
|
8893
|
+
score: score2.score,
|
|
8894
|
+
weight,
|
|
8895
|
+
verdict: score2.verdict,
|
|
8896
|
+
hits: score2.hits,
|
|
8897
|
+
misses: score2.misses,
|
|
8898
|
+
reasoning: score2.reasoning
|
|
8899
|
+
});
|
|
8900
|
+
}
|
|
8901
|
+
if (evaluator.type === "token_usage") {
|
|
8902
|
+
const tokenUsageEvaluator = new TokenUsageEvaluator({
|
|
8903
|
+
config: evaluator
|
|
8904
|
+
});
|
|
8905
|
+
const score2 = tokenUsageEvaluator.evaluate({
|
|
8906
|
+
evalCase,
|
|
8907
|
+
candidate,
|
|
8908
|
+
target,
|
|
8909
|
+
provider,
|
|
8910
|
+
attempt,
|
|
8911
|
+
promptInputs,
|
|
8912
|
+
now,
|
|
8913
|
+
outputMessages,
|
|
8914
|
+
traceSummary
|
|
8915
|
+
});
|
|
8916
|
+
const weight = evaluator.weight ?? 1;
|
|
8917
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
8918
|
+
evaluatorResults.push({
|
|
8919
|
+
name: evaluator.name,
|
|
8920
|
+
type: evaluator.type,
|
|
8921
|
+
score: score2.score,
|
|
8922
|
+
weight,
|
|
8923
|
+
verdict: score2.verdict,
|
|
8924
|
+
hits: score2.hits,
|
|
8925
|
+
misses: score2.misses,
|
|
8926
|
+
reasoning: score2.reasoning
|
|
8927
|
+
});
|
|
8928
|
+
}
|
|
6884
8929
|
} catch (error) {
|
|
6885
8930
|
const message = error instanceof Error ? error.message : String(error);
|
|
6886
8931
|
const fallbackScore = {
|
|
@@ -6920,7 +8965,6 @@ async function runEvaluatorList(options) {
|
|
|
6920
8965
|
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
6921
8966
|
0
|
|
6922
8967
|
);
|
|
6923
|
-
const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
|
|
6924
8968
|
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
|
|
6925
8969
|
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
6926
8970
|
const score = {
|
|
@@ -6929,8 +8973,7 @@ async function runEvaluatorList(options) {
|
|
|
6929
8973
|
hits,
|
|
6930
8974
|
misses,
|
|
6931
8975
|
expectedAspectCount,
|
|
6932
|
-
reasoning
|
|
6933
|
-
rawAspects: rawAspects.length > 0 ? rawAspects : void 0
|
|
8976
|
+
reasoning
|
|
6934
8977
|
};
|
|
6935
8978
|
return { score, evaluatorResults };
|
|
6936
8979
|
}
|
|
@@ -7005,26 +9048,6 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
7005
9048
|
llm_judge: llmJudge
|
|
7006
9049
|
};
|
|
7007
9050
|
}
|
|
7008
|
-
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
7009
|
-
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
7010
|
-
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
7011
|
-
const filePath = import_node_path15.default.resolve(directory, filename);
|
|
7012
|
-
await (0, import_promises12.mkdir)(import_node_path15.default.dirname(filePath), { recursive: true });
|
|
7013
|
-
const payload = {
|
|
7014
|
-
eval_id: evalCase.id,
|
|
7015
|
-
question: promptInputs.question,
|
|
7016
|
-
guidelines: promptInputs.guidelines,
|
|
7017
|
-
guideline_paths: evalCase.guideline_paths
|
|
7018
|
-
};
|
|
7019
|
-
await (0, import_promises12.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
7020
|
-
}
|
|
7021
|
-
function sanitizeFilename(value) {
|
|
7022
|
-
if (!value) {
|
|
7023
|
-
return "prompt";
|
|
7024
|
-
}
|
|
7025
|
-
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
7026
|
-
return sanitized.length > 0 ? sanitized : (0, import_node_crypto3.randomUUID)();
|
|
7027
|
-
}
|
|
7028
9051
|
async function invokeProvider(provider, options) {
|
|
7029
9052
|
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
7030
9053
|
const controller = new AbortController();
|
|
@@ -7088,14 +9111,25 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
7088
9111
|
misses: [`Error: ${message}`],
|
|
7089
9112
|
candidateAnswer: `Error occurred: ${message}`,
|
|
7090
9113
|
target: targetName,
|
|
7091
|
-
rawAspects: [],
|
|
7092
9114
|
agentProviderRequest,
|
|
7093
9115
|
lmProviderRequest,
|
|
7094
9116
|
error: message
|
|
7095
9117
|
};
|
|
7096
9118
|
}
|
|
9119
|
+
/**
 * Pulls a provider-reported error message out of a provider response.
 *
 * @param {{ raw?: * }} response - Provider response; `raw.error` is inspected.
 * @returns {string|undefined} The trimmed `raw.error` string when `raw` is a
 *   non-array object and `error` is a non-blank string; otherwise `undefined`.
 */
function extractProviderError(response) {
  const payload = response.raw;
  const isRecord = Boolean(payload) && typeof payload === "object" && !Array.isArray(payload);
  const message = isRecord ? payload.error : undefined;
  if (typeof message !== "string") {
    return undefined;
  }
  const text = message.trim();
  return text.length === 0 ? undefined : text;
}
|
|
7097
9131
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
7098
|
-
const hash = (0,
|
|
9132
|
+
const hash = (0, import_node_crypto4.createHash)("sha256");
|
|
7099
9133
|
hash.update(provider.id);
|
|
7100
9134
|
hash.update(target.name);
|
|
7101
9135
|
hash.update(evalCase.id);
|
|
@@ -7152,15 +9186,15 @@ function computeWeightedMean(entries) {
|
|
|
7152
9186
|
|
|
7153
9187
|
// src/evaluation/generators/rubric-generator.ts
|
|
7154
9188
|
var import_ai3 = require("ai");
|
|
7155
|
-
var
|
|
7156
|
-
var rubricItemSchema =
|
|
7157
|
-
id:
|
|
7158
|
-
description:
|
|
7159
|
-
weight:
|
|
7160
|
-
required:
|
|
9189
|
+
var import_zod4 = require("zod");
|
|
9190
|
+
var rubricItemSchema = import_zod4.z.object({
|
|
9191
|
+
id: import_zod4.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
9192
|
+
description: import_zod4.z.string().describe("What this rubric checks for"),
|
|
9193
|
+
weight: import_zod4.z.number().default(1).describe("Relative importance (default 1.0)"),
|
|
9194
|
+
required: import_zod4.z.boolean().default(true).describe("Whether this is a mandatory requirement")
|
|
7161
9195
|
});
|
|
7162
|
-
var rubricGenerationSchema =
|
|
7163
|
-
rubrics:
|
|
9196
|
+
var rubricGenerationSchema = import_zod4.z.object({
|
|
9197
|
+
rubrics: import_zod4.z.array(rubricItemSchema).describe("List of evaluation rubrics")
|
|
7164
9198
|
});
|
|
7165
9199
|
async function generateRubrics(options) {
|
|
7166
9200
|
const { expectedOutcome, question, referenceAnswer, provider } = options;
|
|
@@ -7238,15 +9272,20 @@ function createAgentKernel() {
|
|
|
7238
9272
|
0 && (module.exports = {
|
|
7239
9273
|
CodeEvaluator,
|
|
7240
9274
|
CompositeEvaluator,
|
|
9275
|
+
CostEvaluator,
|
|
7241
9276
|
DEFAULT_EXPLORATION_TOOLS,
|
|
9277
|
+
FieldAccuracyEvaluator,
|
|
9278
|
+
LatencyEvaluator,
|
|
7242
9279
|
LlmJudgeEvaluator,
|
|
7243
9280
|
TEST_MESSAGE_ROLES,
|
|
9281
|
+
TokenUsageEvaluator,
|
|
7244
9282
|
ToolTrajectoryEvaluator,
|
|
7245
9283
|
avgToolDurationMs,
|
|
7246
9284
|
buildDirectoryChain,
|
|
7247
9285
|
buildPromptInputs,
|
|
7248
9286
|
buildSearchRoots,
|
|
7249
9287
|
computeTraceSummary,
|
|
9288
|
+
consumeClaudeCodeLogEntries,
|
|
7250
9289
|
consumeCodexLogEntries,
|
|
7251
9290
|
consumePiLogEntries,
|
|
7252
9291
|
createAgentKernel,
|
|
@@ -7277,6 +9316,7 @@ function createAgentKernel() {
|
|
|
7277
9316
|
resolveTargetDefinition,
|
|
7278
9317
|
runEvalCase,
|
|
7279
9318
|
runEvaluation,
|
|
9319
|
+
subscribeToClaudeCodeLogEntries,
|
|
7280
9320
|
subscribeToCodexLogEntries,
|
|
7281
9321
|
subscribeToPiLogEntries,
|
|
7282
9322
|
tokensPerTool
|