@agentv/core 1.5.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -77
- package/dist/{chunk-E2VSU4WZ.js → chunk-IBTKEEOT.js} +73 -1
- package/dist/chunk-IBTKEEOT.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +2536 -663
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +221 -10
- package/dist/index.d.ts +221 -10
- package/dist/index.js +2362 -568
- package/dist/index.js.map +1 -1
- package/package.json +5 -2
- package/dist/chunk-E2VSU4WZ.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -10,7 +10,7 @@ import {
|
|
|
10
10
|
readTextFile,
|
|
11
11
|
resolveFileReference,
|
|
12
12
|
resolveTargetDefinition
|
|
13
|
-
} from "./chunk-
|
|
13
|
+
} from "./chunk-IBTKEEOT.js";
|
|
14
14
|
|
|
15
15
|
// src/evaluation/types.ts
|
|
16
16
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -64,7 +64,11 @@ var EVALUATOR_KIND_VALUES = [
|
|
|
64
64
|
"llm_judge",
|
|
65
65
|
"rubric",
|
|
66
66
|
"composite",
|
|
67
|
-
"tool_trajectory"
|
|
67
|
+
"tool_trajectory",
|
|
68
|
+
"field_accuracy",
|
|
69
|
+
"latency",
|
|
70
|
+
"cost",
|
|
71
|
+
"token_usage"
|
|
68
72
|
];
|
|
69
73
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
70
74
|
function isEvaluatorKind(value) {
|
|
@@ -486,7 +490,22 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
486
490
|
continue;
|
|
487
491
|
}
|
|
488
492
|
if (typeValue === "code_judge") {
|
|
489
|
-
|
|
493
|
+
let script;
|
|
494
|
+
const rawScript = rawEvaluator.script;
|
|
495
|
+
if (typeof rawScript === "string") {
|
|
496
|
+
const trimmed = rawScript.trim();
|
|
497
|
+
if (trimmed.length === 0) {
|
|
498
|
+
throw new Error(
|
|
499
|
+
`Invalid code_judge script for evaluator '${name}' in '${evalId}': script cannot be empty`
|
|
500
|
+
);
|
|
501
|
+
}
|
|
502
|
+
script = parseCommandToArgv(trimmed);
|
|
503
|
+
} else {
|
|
504
|
+
script = asStringArray(
|
|
505
|
+
rawScript,
|
|
506
|
+
`code_judge script for evaluator '${name}' in '${evalId}'`
|
|
507
|
+
);
|
|
508
|
+
}
|
|
490
509
|
if (!script) {
|
|
491
510
|
logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
|
|
492
511
|
continue;
|
|
@@ -507,13 +526,21 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
507
526
|
} else {
|
|
508
527
|
resolvedCwd = searchRoots[0];
|
|
509
528
|
}
|
|
529
|
+
const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
|
|
530
|
+
const config = {};
|
|
531
|
+
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
532
|
+
if (!knownProps.has(key) && value !== void 0) {
|
|
533
|
+
config[key] = value;
|
|
534
|
+
}
|
|
535
|
+
}
|
|
510
536
|
evaluators.push({
|
|
511
537
|
name,
|
|
512
538
|
type: "code",
|
|
513
539
|
script,
|
|
514
540
|
cwd,
|
|
515
541
|
resolvedCwd,
|
|
516
|
-
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
542
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
543
|
+
...Object.keys(config).length > 0 ? { config } : {}
|
|
517
544
|
});
|
|
518
545
|
continue;
|
|
519
546
|
}
|
|
@@ -688,6 +715,140 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
688
715
|
evaluators.push(config);
|
|
689
716
|
continue;
|
|
690
717
|
}
|
|
718
|
+
if (typeValue === "field_accuracy") {
|
|
719
|
+
const rawFields = rawEvaluator.fields;
|
|
720
|
+
if (!Array.isArray(rawFields)) {
|
|
721
|
+
logWarning2(
|
|
722
|
+
`Skipping field_accuracy evaluator '${name}' in '${evalId}': missing fields array`
|
|
723
|
+
);
|
|
724
|
+
continue;
|
|
725
|
+
}
|
|
726
|
+
if (rawFields.length === 0) {
|
|
727
|
+
logWarning2(
|
|
728
|
+
`Skipping field_accuracy evaluator '${name}' in '${evalId}': fields array is empty`
|
|
729
|
+
);
|
|
730
|
+
continue;
|
|
731
|
+
}
|
|
732
|
+
const fields = [];
|
|
733
|
+
for (const rawField of rawFields) {
|
|
734
|
+
if (!isJsonObject2(rawField)) {
|
|
735
|
+
logWarning2(
|
|
736
|
+
`Skipping invalid field entry in field_accuracy evaluator '${name}' (expected object)`
|
|
737
|
+
);
|
|
738
|
+
continue;
|
|
739
|
+
}
|
|
740
|
+
const fieldPath = asString2(rawField.path);
|
|
741
|
+
const match = asString2(rawField.match);
|
|
742
|
+
if (!fieldPath) {
|
|
743
|
+
logWarning2(
|
|
744
|
+
`Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
|
|
745
|
+
);
|
|
746
|
+
continue;
|
|
747
|
+
}
|
|
748
|
+
if (!match || !isValidFieldMatchType(match)) {
|
|
749
|
+
logWarning2(
|
|
750
|
+
`Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code_judge evaluator.`
|
|
751
|
+
);
|
|
752
|
+
continue;
|
|
753
|
+
}
|
|
754
|
+
const fieldConfig = {
|
|
755
|
+
path: fieldPath,
|
|
756
|
+
match,
|
|
757
|
+
...typeof rawField.required === "boolean" ? { required: rawField.required } : {},
|
|
758
|
+
...typeof rawField.weight === "number" ? { weight: rawField.weight } : {},
|
|
759
|
+
...typeof rawField.tolerance === "number" ? { tolerance: rawField.tolerance } : {},
|
|
760
|
+
...typeof rawField.relative === "boolean" ? { relative: rawField.relative } : {},
|
|
761
|
+
...Array.isArray(rawField.formats) ? { formats: rawField.formats.filter((f) => typeof f === "string") } : {}
|
|
762
|
+
};
|
|
763
|
+
fields.push(fieldConfig);
|
|
764
|
+
}
|
|
765
|
+
if (fields.length === 0) {
|
|
766
|
+
logWarning2(
|
|
767
|
+
`Skipping field_accuracy evaluator '${name}' in '${evalId}': no valid fields found`
|
|
768
|
+
);
|
|
769
|
+
continue;
|
|
770
|
+
}
|
|
771
|
+
const aggregation = asString2(rawEvaluator.aggregation);
|
|
772
|
+
const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
|
|
773
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
774
|
+
evaluators.push({
|
|
775
|
+
name,
|
|
776
|
+
type: "field_accuracy",
|
|
777
|
+
fields,
|
|
778
|
+
...validAggregation ? { aggregation: validAggregation } : {},
|
|
779
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
780
|
+
});
|
|
781
|
+
continue;
|
|
782
|
+
}
|
|
783
|
+
if (typeValue === "latency") {
|
|
784
|
+
const threshold = rawEvaluator.threshold;
|
|
785
|
+
if (typeof threshold !== "number" || threshold < 0) {
|
|
786
|
+
logWarning2(
|
|
787
|
+
`Skipping latency evaluator '${name}' in '${evalId}': threshold must be a non-negative number`
|
|
788
|
+
);
|
|
789
|
+
continue;
|
|
790
|
+
}
|
|
791
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
792
|
+
evaluators.push({
|
|
793
|
+
name,
|
|
794
|
+
type: "latency",
|
|
795
|
+
threshold,
|
|
796
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
797
|
+
});
|
|
798
|
+
continue;
|
|
799
|
+
}
|
|
800
|
+
if (typeValue === "cost") {
|
|
801
|
+
const budget = rawEvaluator.budget;
|
|
802
|
+
if (typeof budget !== "number" || budget < 0) {
|
|
803
|
+
logWarning2(
|
|
804
|
+
`Skipping cost evaluator '${name}' in '${evalId}': budget must be a non-negative number`
|
|
805
|
+
);
|
|
806
|
+
continue;
|
|
807
|
+
}
|
|
808
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
809
|
+
evaluators.push({
|
|
810
|
+
name,
|
|
811
|
+
type: "cost",
|
|
812
|
+
budget,
|
|
813
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
814
|
+
});
|
|
815
|
+
continue;
|
|
816
|
+
}
|
|
817
|
+
if (typeValue === "token_usage") {
|
|
818
|
+
const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
|
|
819
|
+
const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
|
|
820
|
+
const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
|
|
821
|
+
const limits = [
|
|
822
|
+
["max_total", maxTotal],
|
|
823
|
+
["max_input", maxInput],
|
|
824
|
+
["max_output", maxOutput]
|
|
825
|
+
];
|
|
826
|
+
const validLimits = {};
|
|
827
|
+
for (const [key, raw] of limits) {
|
|
828
|
+
if (raw === void 0) continue;
|
|
829
|
+
if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
|
|
830
|
+
logWarning2(
|
|
831
|
+
`Skipping token_usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
|
|
832
|
+
);
|
|
833
|
+
continue;
|
|
834
|
+
}
|
|
835
|
+
validLimits[key] = raw;
|
|
836
|
+
}
|
|
837
|
+
if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
|
|
838
|
+
logWarning2(
|
|
839
|
+
`Skipping token_usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
|
|
840
|
+
);
|
|
841
|
+
continue;
|
|
842
|
+
}
|
|
843
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
844
|
+
evaluators.push({
|
|
845
|
+
name,
|
|
846
|
+
type: "token_usage",
|
|
847
|
+
...validLimits,
|
|
848
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
849
|
+
});
|
|
850
|
+
continue;
|
|
851
|
+
}
|
|
691
852
|
const prompt = asString2(rawEvaluator.prompt);
|
|
692
853
|
let promptPath;
|
|
693
854
|
if (prompt) {
|
|
@@ -758,6 +919,34 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
758
919
|
function asString2(value) {
|
|
759
920
|
return typeof value === "string" ? value : void 0;
|
|
760
921
|
}
|
|
922
|
+
/**
 * Validates that `value` is a non-empty array of non-blank strings.
 *
 * @param {unknown} value - Candidate argv array; `undefined` is passed through.
 * @param {string} description - Label used as the prefix of every error message.
 * @returns {string[] | undefined} A new array holding the original (untrimmed)
 *   entries, or `undefined` when `value` is `undefined`.
 * @throws {Error} When `value` is not an array, is empty, or holds an entry
 *   that is not a string or is blank after trimming.
 */
function asStringArray(value, description) {
  if (value === undefined) {
    return undefined;
  }
  if (!Array.isArray(value)) {
    throw new Error(`${description} must be an array of strings (argv tokens)`);
  }
  if (value.length === 0) {
    throw new Error(`${description} cannot be empty`);
  }
  // Array.from walks the array through its iterator, so holes in a sparse
  // array are visited as `undefined` and rejected like any other non-string.
  return Array.from(value, (token, position) => {
    if (typeof token !== "string") {
      throw new Error(`${description}[${position}] must be a string`);
    }
    if (token.trim().length === 0) {
      throw new Error(`${description}[${position}] cannot be empty`);
    }
    return token;
  });
}
|
|
944
|
+
/**
 * Wraps a command string in a platform shell invocation.
 *
 * @param {string} command - Command line to hand to the shell as one argument.
 * @returns {string[]} `["cmd.exe", "/c", command]` when `process.platform` is
 *   `"win32"`, otherwise `["sh", "-lc", command]`.
 */
function parseCommandToArgv(command) {
  const shellPrefix = process.platform === "win32" ? ["cmd.exe", "/c"] : ["sh", "-lc"];
  return [...shellPrefix, command];
}
|
|
761
950
|
function isJsonObject2(value) {
|
|
762
951
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
763
952
|
}
|
|
@@ -791,6 +980,14 @@ function validateWeight(rawWeight, evaluatorName, evalId) {
|
|
|
791
980
|
}
|
|
792
981
|
return rawWeight;
|
|
793
982
|
}
|
|
983
|
+
// Match strategies accepted for a field entry of a field_accuracy evaluator.
var VALID_FIELD_MATCH_TYPES = /* @__PURE__ */ new Set(["exact", "numeric_tolerance", "date"]);
/**
 * Reports whether `value` is one of the supported field match types.
 *
 * @param {unknown} value - Candidate match type.
 * @returns {boolean} `true` only for a string contained in VALID_FIELD_MATCH_TYPES.
 */
function isValidFieldMatchType(value) {
  if (typeof value !== "string") {
    return false;
  }
  return VALID_FIELD_MATCH_TYPES.has(value);
}
|
|
987
|
+
// Aggregation modes accepted on a field_accuracy evaluator.
var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average", "all_or_nothing"]);
/**
 * Reports whether `value` is one of the supported aggregation types.
 *
 * @param {unknown} value - Candidate aggregation type.
 * @returns {boolean} `true` only for a string contained in VALID_FIELD_AGGREGATION_TYPES.
 */
function isValidFieldAggregationType(value) {
  if (typeof value !== "string") {
    return false;
  }
  return VALID_FIELD_AGGREGATION_TYPES.has(value);
}
|
|
794
991
|
|
|
795
992
|
// src/evaluation/loaders/message-processor.ts
|
|
796
993
|
import { readFile as readFile3 } from "node:fs/promises";
|
|
@@ -1750,91 +1947,992 @@ async function withRetry(fn, retryConfig, signal) {
|
|
|
1750
1947
|
throw lastError;
|
|
1751
1948
|
}
|
|
1752
1949
|
|
|
1753
|
-
// src/evaluation/providers/
|
|
1754
|
-
import {
|
|
1755
|
-
import
|
|
1756
|
-
import
|
|
1757
|
-
import
|
|
1758
|
-
import {
|
|
1759
|
-
|
|
1760
|
-
|
|
1761
|
-
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
|
|
1765
|
-
|
|
1766
|
-
|
|
1767
|
-
|
|
1768
|
-
|
|
1950
|
+
// src/evaluation/providers/claude-code.ts
|
|
1951
|
+
import { spawn } from "node:child_process";
|
|
1952
|
+
import { randomUUID } from "node:crypto";
|
|
1953
|
+
import { createWriteStream } from "node:fs";
|
|
1954
|
+
import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
|
|
1955
|
+
import { tmpdir } from "node:os";
|
|
1956
|
+
import path8 from "node:path";
|
|
1957
|
+
|
|
1958
|
+
// src/evaluation/providers/claude-code-log-tracker.ts
|
|
1959
|
+
// Registry-wide symbols (Symbol.for) under which the log entry list and the
// subscriber set are stored on globalThis.
var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeCodeLogs");
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.claudeCodeLogSubscribers");
/**
 * Returns the log entry array stored on globalThis, creating it on first use.
 *
 * @returns {Array} The same array instance on every call.
 */
function getClaudeCodeLogStore() {
  const registry = globalThis;
  if (!registry[GLOBAL_LOGS_KEY]) {
    registry[GLOBAL_LOGS_KEY] = [];
  }
  return registry[GLOBAL_LOGS_KEY];
}
|
|
1971
|
+
/**
 * Returns the subscriber Set stored on globalThis under GLOBAL_SUBSCRIBERS_KEY,
 * creating it on first use.
 *
 * @returns {Set} The same Set instance on every call.
 */
function getSubscriberStore() {
  const registry = globalThis;
  if (!registry[GLOBAL_SUBSCRIBERS_KEY]) {
    registry[GLOBAL_SUBSCRIBERS_KEY] = /* @__PURE__ */ new Set();
  }
  return registry[GLOBAL_SUBSCRIBERS_KEY];
}
|
|
1981
|
+
/**
 * Calls every registered log subscriber with `entry`.
 *
 * The subscriber set is copied first, so listeners added or removed while
 * notifying do not affect this round. A listener that throws is reported via
 * console.warn and does not stop the remaining listeners.
 *
 * @param {unknown} entry - Log entry passed to each listener.
 */
function notifySubscribers(entry) {
  const snapshot = [...getSubscriberStore()];
  snapshot.forEach((notify) => {
    try {
      notify(entry);
    } catch (failure) {
      const reason = failure instanceof Error ? failure.message : String(failure);
      console.warn(`Claude Code log subscriber failed: ${reason}`);
    }
  });
}
|
|
1992
|
+
/**
 * Appends `entry` to the shared log store, then notifies subscribers.
 *
 * @param {unknown} entry - Log entry to record.
 */
function recordClaudeCodeLogEntry(entry) {
  const entries = getClaudeCodeLogStore();
  entries.push(entry);
  notifySubscribers(entry);
}
|
|
1996
|
+
/**
 * Removes and returns every entry currently held in the shared log store.
 *
 * @returns {Array} The drained entries in insertion order; an empty array when
 *   the store is empty. The store itself is left empty.
 */
function consumeClaudeCodeLogEntries() {
  const entries = getClaudeCodeLogStore();
  return entries.length === 0 ? [] : entries.splice(0, entries.length);
}
|
|
2003
|
+
function subscribeToClaudeCodeLogEntries(listener) {
|
|
2004
|
+
const store = getSubscriberStore();
|
|
2005
|
+
store.add(listener);
|
|
2006
|
+
return () => {
|
|
2007
|
+
store.delete(listener);
|
|
1769
2008
|
};
|
|
1770
|
-
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
|
|
1774
|
-
|
|
1775
|
-
|
|
1776
|
-
|
|
1777
|
-
|
|
1778
|
-
|
|
1779
|
-
|
|
1780
|
-
|
|
1781
|
-
|
|
1782
|
-
|
|
1783
|
-
|
|
1784
|
-
|
|
1785
|
-
|
|
1786
|
-
failed: true,
|
|
1787
|
-
timedOut: execError.timedOut === true || execError.killed === true,
|
|
1788
|
-
signal: execError.signal ?? null
|
|
1789
|
-
};
|
|
2009
|
+
}
|
|
2010
|
+
|
|
2011
|
+
// src/evaluation/providers/preread.ts
|
|
2012
|
+
import path7 from "node:path";
|
|
2013
|
+
function buildPromptDocument(request, inputFiles, options) {
|
|
2014
|
+
const parts = [];
|
|
2015
|
+
const guidelineFiles = collectGuidelineFiles(
|
|
2016
|
+
inputFiles,
|
|
2017
|
+
options?.guidelinePatterns ?? request.guideline_patterns,
|
|
2018
|
+
options?.guidelineOverrides
|
|
2019
|
+
);
|
|
2020
|
+
const inputFilesList = collectInputFiles(inputFiles);
|
|
2021
|
+
const nonGuidelineInputFiles = inputFilesList.filter((file) => !guidelineFiles.includes(file));
|
|
2022
|
+
const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
|
|
2023
|
+
if (prereadBlock.length > 0) {
|
|
2024
|
+
parts.push("\n", prereadBlock);
|
|
1790
2025
|
}
|
|
2026
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
2027
|
+
return parts.join("\n").trim();
|
|
1791
2028
|
}
|
|
1792
|
-
|
|
1793
|
-
|
|
1794
|
-
|
|
1795
|
-
targetName;
|
|
1796
|
-
supportsBatch = true;
|
|
1797
|
-
config;
|
|
1798
|
-
runCommand;
|
|
1799
|
-
verbose;
|
|
1800
|
-
keepTempFiles;
|
|
1801
|
-
healthcheckPromise;
|
|
1802
|
-
constructor(targetName, config, runner = defaultCommandRunner) {
|
|
1803
|
-
this.targetName = targetName;
|
|
1804
|
-
this.id = `cli:${targetName}`;
|
|
1805
|
-
this.config = config;
|
|
1806
|
-
this.runCommand = runner;
|
|
1807
|
-
this.verbose = config.verbose ?? false;
|
|
1808
|
-
this.keepTempFiles = config.keepTempFiles ?? false;
|
|
2029
|
+
function normalizeInputFiles(inputFiles) {
|
|
2030
|
+
if (!inputFiles || inputFiles.length === 0) {
|
|
2031
|
+
return void 0;
|
|
1809
2032
|
}
|
|
1810
|
-
|
|
1811
|
-
|
|
1812
|
-
|
|
2033
|
+
const deduped = /* @__PURE__ */ new Map();
|
|
2034
|
+
for (const inputFile of inputFiles) {
|
|
2035
|
+
const absolutePath = path7.resolve(inputFile);
|
|
2036
|
+
if (!deduped.has(absolutePath)) {
|
|
2037
|
+
deduped.set(absolutePath, absolutePath);
|
|
1813
2038
|
}
|
|
1814
|
-
|
|
1815
|
-
|
|
1816
|
-
|
|
1817
|
-
|
|
1818
|
-
|
|
1819
|
-
|
|
1820
|
-
|
|
1821
|
-
|
|
2039
|
+
}
|
|
2040
|
+
return Array.from(deduped.values());
|
|
2041
|
+
}
|
|
2042
|
+
function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
|
|
2043
|
+
if (!inputFiles || inputFiles.length === 0) {
|
|
2044
|
+
return [];
|
|
2045
|
+
}
|
|
2046
|
+
const unique = /* @__PURE__ */ new Map();
|
|
2047
|
+
for (const inputFile of inputFiles) {
|
|
2048
|
+
const absolutePath = path7.resolve(inputFile);
|
|
2049
|
+
if (overrides?.has(absolutePath)) {
|
|
2050
|
+
if (!unique.has(absolutePath)) {
|
|
2051
|
+
unique.set(absolutePath, absolutePath);
|
|
2052
|
+
}
|
|
2053
|
+
continue;
|
|
1822
2054
|
}
|
|
1823
|
-
const
|
|
1824
|
-
|
|
1825
|
-
|
|
1826
|
-
|
|
1827
|
-
|
|
1828
|
-
|
|
1829
|
-
|
|
1830
|
-
|
|
1831
|
-
|
|
1832
|
-
|
|
1833
|
-
|
|
2055
|
+
const normalized = absolutePath.split(path7.sep).join("/");
|
|
2056
|
+
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
2057
|
+
if (!unique.has(absolutePath)) {
|
|
2058
|
+
unique.set(absolutePath, absolutePath);
|
|
2059
|
+
}
|
|
2060
|
+
}
|
|
2061
|
+
}
|
|
2062
|
+
return Array.from(unique.values());
|
|
2063
|
+
}
|
|
2064
|
+
/**
 * Resolves input file paths to absolute paths and drops duplicates.
 *
 * @param {string[] | undefined} inputFiles - Paths to resolve; may be missing or empty.
 * @returns {string[]} Unique absolute paths in first-seen order; `[]` when no
 *   input files were given.
 */
function collectInputFiles(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return [];
  }
  const seen = /* @__PURE__ */ new Set();
  for (const candidate of inputFiles) {
    // A Set keeps first-insertion order, so duplicates are ignored on re-add.
    seen.add(path7.resolve(candidate));
  }
  return [...seen];
}
|
|
2077
|
+
/**
 * Builds the instruction text telling the agent which files to read before
 * answering.
 *
 * @param {string[]} guidelineFiles - Paths listed under "Read all guideline files:".
 * @param {string[]} inputFiles - Paths listed under "Read all input files:".
 * @returns {string} Newline-joined instructions, or `""` when both lists are
 *   empty. Each non-empty list becomes a heading followed by markdown bullets
 *   (`* [basename](file URI)`), with a period after the last bullet.
 */
function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
  if (guidelineFiles.length + inputFiles.length === 0) {
    return "";
  }
  const toBullets = (files) => files.map((filePath) => `* [${path7.basename(filePath)}](${pathToFileUri(filePath)})`).join("\n");
  const groups = [
    ["Read all guideline files:", guidelineFiles],
    ["Read all input files:", inputFiles]
  ];
  const sections = groups.filter(([, files]) => files.length > 0).map(([heading, files]) => `${heading}\n${toBullets(files)}.`);
  return [
    ...sections,
    "If any file is missing, fail with ERROR: missing-file <filename> and stop.",
    "Then apply system_instructions on the user query below."
  ].join("\n");
}
|
|
2101
|
+
/**
 * Converts a filesystem path to a `file://` URI.
 *
 * Relative paths are resolved against the current working directory and
 * backslashes are turned into forward slashes. The path is percent-encoded so
 * that spaces, `%`, `#` and `?` cannot break the URI or the markdown link it is
 * embedded in.
 *
 * @param {string} filePath - Absolute or relative path.
 * @returns {string} `file:///C:/...` for drive-letter paths, otherwise
 *   `file://` followed by the (already slash-prefixed) absolute path.
 */
function pathToFileUri(filePath) {
  const absolutePath = path7.isAbsolute(filePath) ? filePath : path7.resolve(filePath);
  const normalizedPath = absolutePath.replace(/\\/g, "/");
  let encodedPath;
  try {
    // encodeURI keeps "/" and ":" intact but leaves "#" and "?" alone, so
    // those two are escaped explicitly: in a URI they would otherwise start
    // the fragment or query part.
    encodedPath = encodeURI(normalizedPath).replace(/[?#]/g, (char) => encodeURIComponent(char));
  } catch {
    // encodeURI throws URIError on lone surrogates; fall back to the raw
    // path rather than failing prompt construction.
    encodedPath = normalizedPath;
  }
  if (/^[a-zA-Z]:\//.test(encodedPath)) {
    return `file:///${encodedPath}`;
  }
  return `file://${encodedPath}`;
}
|
|
2109
|
+
|
|
2110
|
+
// src/evaluation/providers/claude-code.ts
|
|
2111
|
+
// Name prefix of the per-invocation temp directory created under os.tmpdir().
var WORKSPACE_PREFIX = "agentv-claude-code-";
|
|
2112
|
+
// File name, inside the temp workspace, to which the request question is written.
var PROMPT_FILENAME = "prompt.md";
|
|
2113
|
+
// System prompt prepended to the question when config.systemPrompt is not set.
var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
|
|
2114
|
+
- Do NOT create any additional output files in the workspace.
|
|
2115
|
+
- All intended file outputs/changes MUST be written in your response.
|
|
2116
|
+
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
2117
|
+
This is required for evaluation scoring.`;
|
|
2118
|
+
/**
 * Provider that answers a request by running the Claude Code CLI as a child
 * process and parsing its `stream-json` output.
 *
 * Config fields read by this class: `executable`, `model`, `args`,
 * `systemPrompt`, `cwd`, `timeoutMs`, `logDir`, `logFormat`.
 */
var ClaudeCodeProvider = class {
  id;
  kind = "claude-code";
  targetName;
  supportsBatch = false;
  config;
  runClaudeCode;
  /**
   * @param {string} targetName - Target name; also used to build `id`.
   * @param {object} config - Resolved target settings (see class comment).
   * @param {Function} [runner] - Function that actually runs the CLI;
   *   defaults to `defaultClaudeCodeRunner`.
   */
  constructor(targetName, config, runner = defaultClaudeCodeRunner) {
    this.id = `claude-code:${targetName}`;
    this.targetName = targetName;
    this.config = config;
    this.runClaudeCode = runner;
  }
  /**
   * Runs one request through the CLI.
   *
   * @param {object} request - Reads `question`, `inputFiles`, `signal`,
   *   `evalCaseId` and `attempt`.
   * @returns {Promise<object>} `{ raw, outputMessages, usage }`.
   * @throws {Error} When the request is already aborted, the CLI times out,
   *   or it exits with a non-zero code.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("Claude Code request was aborted before execution");
    }
    const inputFiles = normalizeInputFiles(request.inputFiles);
    const workspaceRoot = await this.createWorkspace();
    // Stream logging is best-effort: a logger that cannot be created is skipped.
    const logger = await this.createStreamLogger(request).catch(() => void 0);
    try {
      const promptFile = path8.join(workspaceRoot, PROMPT_FILENAME);
      await writeFile(promptFile, request.question, "utf8");
      const args = this.buildClaudeCodeArgs(request.question, inputFiles);
      const cwd = this.resolveCwd();
      const result = await this.executeClaudeCode(args, cwd, request.signal, logger);
      if (result.timedOut) {
        throw new Error(
          `Claude Code CLI timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
        );
      }
      if (result.exitCode !== 0) {
        const detail = pickDetail(result.stderr, result.stdout);
        const prefix = `Claude Code CLI exited with code ${result.exitCode}`;
        if (isNestedClaudeCodeAuthError(result.stdout)) {
          throw new Error(
            `${prefix}: Claude Code detected a nested session and requires API key authentication. Set ANTHROPIC_API_KEY environment variable or run AgentV outside of a Claude Code session.`
          );
        }
        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
      }
      const parsed = parseClaudeCodeJsonl(result.stdout);
      const outputMessages = extractOutputMessages(parsed);
      const usage = extractUsage(parsed);
      return {
        raw: {
          response: parsed,
          stdout: result.stdout,
          stderr: result.stderr,
          exitCode: result.exitCode,
          args,
          executable: this.config.executable,
          // NOTE: the workspace is removed in the finally block below, so
          // promptFile/workspace are informational paths only.
          promptFile,
          workspace: workspaceRoot,
          inputFiles,
          logFile: logger?.filePath
        },
        outputMessages,
        usage
      };
    } finally {
      // A failure to close the log must neither skip workspace cleanup nor
      // replace the real result/error of the invocation.
      try {
        await logger?.close();
      } catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.warn(`Failed to close Claude Code stream log: ${message}`);
      } finally {
        await this.cleanupWorkspace(workspaceRoot);
      }
    }
  }
  /** Returns the directory the CLI runs in: `config.cwd` resolved, else `process.cwd()`. */
  resolveCwd() {
    if (!this.config.cwd) {
      return process.cwd();
    }
    return path8.resolve(this.config.cwd);
  }
  /**
   * Builds the CLI argument list. The final argument is the full prompt:
   * system prompt, blank line, question and, when input files are given, an
   * "## Input Files" section with one `[File: <path>]` line per file.
   *
   * @param {string} prompt - The request question.
   * @param {string[] | undefined} inputFiles - Paths to list in the prompt.
   * @returns {string[]} Arguments to pass to the executable.
   */
  buildClaudeCodeArgs(prompt, inputFiles) {
    const args = [];
    args.push("--output-format", "stream-json");
    args.push("--verbose");
    args.push("-p");
    if (this.config.model) {
      args.push("--model", this.config.model);
    }
    if (this.config.args && this.config.args.length > 0) {
      args.push(...this.config.args);
    }
    const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
    const fullPrompt = `${systemPrompt}

${prompt}`;
    let finalPrompt = fullPrompt;
    if (inputFiles && inputFiles.length > 0) {
      const filesContext = inputFiles.map((f) => `[File: ${f}]`).join("\n");
      finalPrompt = `${fullPrompt}

## Input Files
${filesContext}`;
    }
    args.push(finalPrompt);
    return args;
  }
  /**
   * Returns a copy of `process.env` with CLAUDECODE and CLAUDE_CODE_ENTRYPOINT
   * set to `undefined`. The keys are kept (not deleted) on purpose so they
   * still override inherited values if a runner merges this object over
   * `process.env`.
   */
  buildEnv() {
    const env = { ...process.env };
    env.CLAUDECODE = void 0;
    env.CLAUDE_CODE_ENTRYPOINT = void 0;
    return env;
  }
  /**
   * Invokes the configured runner, translating a missing executable (ENOENT)
   * into a descriptive error. Other errors are rethrown unchanged.
   */
  async executeClaudeCode(args, cwd, signal, logger) {
    try {
      return await this.runClaudeCode({
        executable: this.config.executable,
        args,
        cwd,
        timeoutMs: this.config.timeoutMs,
        env: this.buildEnv(),
        signal,
        onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
        onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
      });
    } catch (error) {
      const err = error;
      if (err.code === "ENOENT") {
        throw new Error(
          `Claude Code executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
        );
      }
      throw error;
    }
  }
  /** Creates a fresh temp directory named with WORKSPACE_PREFIX under os.tmpdir(). */
  async createWorkspace() {
    return await mkdtemp(path8.join(tmpdir(), WORKSPACE_PREFIX));
  }
  /** Removes the temp workspace; failures are deliberately ignored (best-effort cleanup). */
  async cleanupWorkspace(workspaceRoot) {
    try {
      await rm(workspaceRoot, { recursive: true, force: true });
    } catch {
    }
  }
  /**
   * Returns the directory for stream logs, or `undefined` when log streaming
   * is disabled. Uses `config.logDir` when set, else `.agentv/logs/claude-code`
   * under the current working directory.
   */
  resolveLogDirectory() {
    const disabled = isClaudeCodeLogStreamingDisabled();
    if (disabled) {
      return void 0;
    }
    if (this.config.logDir) {
      return path8.resolve(this.config.logDir);
    }
    return path8.join(process.cwd(), ".agentv", "logs", "claude-code");
  }
  /**
   * Creates the stream logger for this request and records its log entry.
   *
   * @returns {Promise<object | undefined>} The logger, or `undefined` when
   *   logging is disabled or the directory/logger could not be created (a
   *   warning is printed in the failure cases).
   */
  async createStreamLogger(request) {
    const logDir = this.resolveLogDirectory();
    if (!logDir) {
      return void 0;
    }
    try {
      await mkdir(logDir, { recursive: true });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Claude Code stream logging (could not create ${logDir}): ${message}`);
      return void 0;
    }
    const filePath = path8.join(logDir, buildLogFilename(request, this.targetName));
    try {
      const logger = await ClaudeCodeStreamLogger.create({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt,
        format: this.config.logFormat ?? "summary"
      });
      recordClaudeCodeLogEntry({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt
      });
      return logger;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Claude Code stream logging for ${filePath}: ${message}`);
      return void 0;
    }
  }
};
|
|
2297
|
+
/**
 * Appends a line-oriented log of the CLI's stdout/stderr to a file.
 *
 * Chunks are buffered per source until a newline arrives; each complete line
 * is formatted and written with an elapsed-time and source prefix. Logging is
 * best-effort: if the underlying stream fails, a single warning is printed and
 * further output is dropped instead of crashing the process.
 */
var ClaudeCodeStreamLogger = class _ClaudeCodeStreamLogger {
  filePath;
  stream;
  startedAt = Date.now();
  stdoutBuffer = "";
  stderrBuffer = "";
  format;
  // Becomes true once the write stream has emitted 'error'.
  streamFailed = false;
  /**
   * @param {string} filePath - Log file, opened in append mode.
   * @param {string} format - `"json"` selects formatClaudeCodeJsonLog; any
   *   other value selects formatClaudeCodeLogMessage.
   */
  constructor(filePath, format) {
    this.filePath = filePath;
    this.format = format;
    this.stream = createWriteStream(filePath, { flags: "a" });
    // Without a listener, a stream 'error' (failed open, failed write) is an
    // unhandled event that terminates the process.
    this.stream.on("error", (error) => {
      if (this.streamFailed) {
        return;
      }
      this.streamFailed = true;
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Claude Code stream logging to ${filePath} failed: ${message}`);
    });
  }
  /**
   * Creates a logger and writes the header lines.
   *
   * @param {object} options - Reads `filePath`, `format`, `targetName`,
   *   `evalCaseId` and `attempt` (zero-based; written one-based).
   * @returns {Promise<_ClaudeCodeStreamLogger>}
   */
  static async create(options) {
    const logger = new _ClaudeCodeStreamLogger(options.filePath, options.format);
    // Falsy entries (absent eval id / attempt) are dropped from the header.
    const header = [
      "# Claude Code CLI stream log",
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`
    ].filter((line) => Boolean(line));
    logger.writeLines(header);
    return logger;
  }
  /** Buffers a stdout chunk and writes any lines it completes. */
  handleStdoutChunk(chunk) {
    this.stdoutBuffer += chunk;
    this.flushBuffer("stdout");
  }
  /** Buffers a stderr chunk and writes any lines it completes. */
  handleStderrChunk(chunk) {
    this.stderrBuffer += chunk;
    this.flushBuffer("stderr");
  }
  /**
   * Writes whatever is still buffered and ends the stream. Resolves when the
   * stream has finished or failed; never rejects for a logging failure.
   */
  async close() {
    this.flushBuffer("stdout");
    this.flushBuffer("stderr");
    this.flushRemainder();
    if (!this.canWrite()) {
      // The stream already failed or was destroyed; there is nothing to end.
      return;
    }
    await new Promise((resolve) => {
      this.stream.once("error", () => resolve());
      this.stream.end(() => resolve());
    });
  }
  /** True while the stream is still usable. */
  canWrite() {
    return !this.streamFailed && !this.stream.destroyed;
  }
  /** Writes `text` unless the stream has failed. */
  writeText(text) {
    if (this.canWrite()) {
      this.stream.write(text);
    }
  }
  /** Writes each entry of `lines` followed by a newline. */
  writeLines(lines) {
    for (const line of lines) {
      this.writeText(`${line}
`);
    }
  }
  /**
   * Writes every complete line held in the buffer for `source` and keeps the
   * trailing partial line buffered.
   *
   * @param {"stdout" | "stderr"} source
   */
  flushBuffer(source) {
    const buffer = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
    const lines = buffer.split(/\r?\n/);
    // The last element is the text after the final newline (possibly empty).
    const remainder = lines.pop() ?? "";
    if (source === "stdout") {
      this.stdoutBuffer = remainder;
    } else {
      this.stderrBuffer = remainder;
    }
    for (const line of lines) {
      const formatted = this.formatLine(line, source);
      if (formatted) {
        this.writeText(formatted);
        this.writeText("\n");
      }
    }
  }
  /**
   * Formats one raw line as `[+<elapsed>] [<source>] <message>`.
   *
   * @returns {string | undefined} `undefined` for blank lines.
   */
  formatLine(rawLine, source) {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return void 0;
    }
    const message = this.format === "json" ? formatClaudeCodeJsonLog(trimmed) : formatClaudeCodeLogMessage(trimmed, source);
    return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`;
  }
  /** Writes the unterminated tail of both buffers (stdout first), then clears them. */
  flushRemainder() {
    const stdoutRemainder = this.stdoutBuffer.trim();
    if (stdoutRemainder.length > 0) {
      const formatted = this.formatLine(stdoutRemainder, "stdout");
      if (formatted) {
        this.writeText(formatted);
        this.writeText("\n");
      }
    }
    const stderrRemainder = this.stderrBuffer.trim();
    if (stderrRemainder.length > 0) {
      const formatted = this.formatLine(stderrRemainder, "stderr");
      if (formatted) {
        this.writeText(formatted);
        this.writeText("\n");
      }
    }
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
  }
};
|
|
2391
|
+
/**
 * Reports whether stream logging has been switched off via the
 * AGENTV_CLAUDE_CODE_STREAM_LOGS environment variable.
 *
 * @returns {boolean} `true` when the variable, trimmed and lower-cased, is
 *   `"false"`, `"0"` or `"off"`; `false` when it is unset, empty or anything else.
 */
function isClaudeCodeLogStreamingDisabled() {
  const flag = process.env.AGENTV_CLAUDE_CODE_STREAM_LOGS;
  if (!flag) {
    return false;
  }
  return ["false", "0", "off"].includes(flag.trim().toLowerCase());
}
|
|
2399
|
+
/**
 * Build a log file name of the form
 * `<iso-timestamp>_<target>_<evalId>[_attempt-N]_<8 uuid chars>.log`.
 *
 * @param {object} request - Read for `evalCaseId` (falls back to "claude-code"
 *   when null/undefined) and `attempt` (printed 1-based when defined).
 * @param {string} targetName - Target name; sanitized before use.
 * @returns {string} The file name (no directory component).
 */
function buildLogFilename(request, targetName) {
  // ':' and '.' in the ISO timestamp are swapped for '-'.
  const stamp = new Date().toISOString().replace(/[:.]/g, "-");
  const caseLabel = sanitizeForFilename(request.evalCaseId ?? "claude-code");
  let attemptLabel = "";
  if (request.attempt !== void 0) {
    attemptLabel = `_attempt-${request.attempt + 1}`;
  }
  const targetLabel = sanitizeForFilename(targetName);
  const uniqueTag = randomUUID().slice(0, 8);
  return `${[stamp, targetLabel, `${caseLabel}${attemptLabel}`, uniqueTag].join("_")}.log`;
}
|
|
2406
|
+
/**
 * Replace every run of characters outside `[A-Za-z0-9._-]` with a single "_".
 *
 * @param {string} value - Text to sanitize.
 * @returns {string} The sanitized text, or "claude-code" when the result is empty.
 */
function sanitizeForFilename(value) {
  const cleaned = value.replace(/[^A-Za-z0-9._-]+/g, "_");
  if (cleaned.length > 0) {
    return cleaned;
  }
  return "claude-code";
}
|
|
2410
|
+
/**
 * Format the whole seconds elapsed since `startedAt` as "MM:SS", or as
 * "HH:MM:SS" once at least one hour has passed.
 *
 * @param {number} startedAt - Start time in epoch milliseconds (compared
 *   against `Date.now()`).
 * @returns {string} Zero-padded elapsed time.
 */
function formatElapsed(startedAt) {
  const totalSeconds = Math.floor((Date.now() - startedAt) / 1e3);
  const twoDigits = (amount) => amount.toString().padStart(2, "0");
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor((totalSeconds % 3600) / 60);
  const seconds = totalSeconds % 60;
  const fields = hours > 0 ? [hours, minutes, seconds] : [minutes, seconds];
  return fields.map((field) => twoDigits(field)).join(":");
}
|
|
2420
|
+
/**
 * Produce the summary-mode log text for one output line.
 *
 * When the line parses as JSON and the event summarizer returns a non-empty
 * summary, that summary is used. Otherwise the raw line is returned, prefixed
 * with "stderr: " for stderr lines.
 *
 * @param {string} rawLine - The (already trimmed) line text.
 * @param {string} source - "stderr" triggers the prefix on the fallback path.
 * @returns {string} Text to log.
 */
function formatClaudeCodeLogMessage(rawLine, source) {
  const event = tryParseJsonValue(rawLine);
  const summary = event ? summarizeClaudeCodeEvent(event) : void 0;
  if (summary) {
    return summary;
  }
  return source === "stderr" ? `stderr: ${rawLine}` : rawLine;
}
|
|
2433
|
+
/**
 * Pretty-print a JSON line with two-space indentation.
 *
 * @param {string} rawLine - Candidate JSON text.
 * @returns {string} The re-serialized JSON, or `rawLine` unchanged when it is
 *   not valid JSON, parses to a falsy value, or cannot be serialized.
 */
function formatClaudeCodeJsonLog(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    return rawLine;
  }
  // Falsy JSON values (null, 0, false, "") are passed through untouched.
  if (!value) {
    return rawLine;
  }
  try {
    return JSON.stringify(value, null, 2);
  } catch {
    return rawLine;
  }
}
|
|
2444
|
+
/**
 * Build a short one-line description of a parsed event object.
 *
 * Only the first entry of `message.content` is inspected for "assistant" and
 * "user" events. Unknown event types are summarized as the type string itself.
 *
 * @param {unknown} event - Parsed JSON value.
 * @returns {string|undefined} The summary, or undefined when `event` is not an
 *   object or has no non-empty string `type`.
 */
function summarizeClaudeCodeEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const type = typeof event.type === "string" ? event.type : void 0;
  if (!type) {
    return void 0;
  }

  // First element of message.content, when content is a non-empty array.
  const firstContentPart = () => {
    const message = event.message;
    if (!message) {
      return void 0;
    }
    const content = message.content;
    return Array.isArray(content) && content.length > 0 ? content[0] : void 0;
  };

  if (type === "system") {
    return "system: init";
  }

  if (type === "assistant") {
    const first = firstContentPart();
    if (first?.type === "tool_use") {
      return `assistant: tool_use (${first.name})`;
    }
    if (first?.type === "text") {
      const text = first.text;
      if (typeof text === "string") {
        // Long text is cut to a 50-character preview.
        const preview = text.length > 50 ? `${text.slice(0, 50)}...` : text;
        return `assistant: ${preview}`;
      }
    }
    return "assistant";
  }

  if (type === "user") {
    const first = firstContentPart();
    if (first?.type === "tool_result") {
      return `user: tool_result (${first.tool_use_id})`;
    }
    return "user";
  }

  if (type === "result") {
    const cost = event.cost_usd;
    const duration = event.duration_ms;
    const hasMetrics = typeof cost === "number" && typeof duration === "number";
    return hasMetrics ? `result: $${cost.toFixed(4)}, ${Math.round(duration)}ms` : "result";
  }

  return type;
}
|
|
2501
|
+
/**
 * Parse JSON without throwing.
 *
 * @param {string} rawLine - Candidate JSON text.
 * @returns {unknown} The parsed value, or undefined when parsing fails.
 */
function tryParseJsonValue(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    value = void 0;
  }
  return value;
}
|
|
2508
|
+
/**
 * Parse JSON Lines output into an array of values.
 *
 * Blank lines and lines that are not valid JSON are skipped silently.
 *
 * @param {string} output - Raw output text.
 * @returns {unknown[]} Parsed values in line order.
 * @throws {Error} When the output is empty/whitespace, or no line is valid JSON.
 */
function parseClaudeCodeJsonl(output) {
  const body = output.trim();
  if (body.length === 0) {
    throw new Error("Claude Code CLI produced no output");
  }
  const events = [];
  for (const rawLine of body.split(/\r?\n/)) {
    const candidate = rawLine.trim();
    if (candidate.length === 0) {
      continue;
    }
    try {
      events.push(JSON.parse(candidate));
    } catch {
      // Non-JSON lines are deliberately ignored.
    }
  }
  if (events.length === 0) {
    throw new Error("Claude Code CLI produced no valid JSON output");
  }
  return events;
}
|
|
2526
|
+
/**
 * Convert the "assistant" and "user" events that carry a `message` into
 * output messages, preserving event order.
 *
 * @param {unknown[]} events - Parsed events.
 * @returns {object[]} Converted messages (possibly empty).
 */
function extractOutputMessages(events) {
  const collected = [];
  for (const event of events) {
    if (!event || typeof event !== "object") {
      continue;
    }
    const kind = event.type;
    if (kind !== "assistant" && kind !== "user") {
      continue;
    }
    const payload = event.message;
    if (!payload) {
      continue;
    }
    const converted = convertClaudeCodeMessage(payload, kind);
    if (converted) {
      collected.push(converted);
    }
  }
  return collected;
}
|
|
2546
|
+
/**
 * Map one event message to `{ role, content, toolCalls }`.
 *
 * @param {object} message - Message whose `content` is read.
 * @param {string} type - Event type; "assistant" maps to role "assistant",
 *   anything else to "user".
 * @returns {{role: string, content: (string|undefined), toolCalls: (object[]|undefined)}}
 *   `toolCalls` is undefined when no tool calls were found.
 */
function convertClaudeCodeMessage(message, type) {
  const text = extractTextContent(message.content);
  const calls = extractToolCalls(message.content);
  const hasCalls = calls.length > 0;
  return {
    role: type === "assistant" ? "assistant" : "user",
    content: text,
    toolCalls: hasCalls ? calls : void 0
  };
}
|
|
2556
|
+
/**
 * Collect the text of a message's content.
 *
 * @param {unknown} content - A string, or an array of content parts.
 * @returns {string|undefined} The string itself; or the `text` of every
 *   `{ type: "text" }` part joined with "\n"; or undefined when there is no
 *   text part or `content` is neither a string nor an array.
 */
function extractTextContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const pieces = content
    .filter((part) => part && typeof part === "object" && part.type === "text" && typeof part.text === "string")
    .map((part) => part.text);
  return pieces.length > 0 ? pieces.join("\n") : void 0;
}
|
|
2575
|
+
/**
 * Extract tool-call records from an array of content parts.
 *
 * `tool_use` parts with a string `name` become `{ tool, input, id }`;
 * `tool_result` parts with a string `tool_use_id` become
 * `{ tool: "tool_result", output, id }`. Everything else is ignored.
 *
 * @param {unknown} content - Content parts; non-arrays yield an empty list.
 * @returns {object[]} Tool-call records in content order.
 */
function extractToolCalls(content) {
  if (!Array.isArray(content)) {
    return [];
  }
  const calls = [];
  for (const part of content) {
    if (!part || typeof part !== "object") {
      continue;
    }
    switch (part.type) {
      case "tool_use":
        if (typeof part.name === "string") {
          calls.push({
            tool: part.name,
            input: part.input,
            id: typeof part.id === "string" ? part.id : void 0
          });
        }
        break;
      case "tool_result":
        if (typeof part.tool_use_id === "string") {
          calls.push({
            tool: "tool_result",
            output: part.content,
            id: part.tool_use_id
          });
        }
        break;
      default:
        break;
    }
  }
  return calls;
}
|
|
2602
|
+
/**
 * Read usage metrics from the last event whose `type` is "result".
 *
 * Numeric `cost_usd`, `duration_ms`, `duration_api_ms`, `input_tokens`,
 * `output_tokens` and a string `session_id` are copied when present with the
 * expected type. Only the last "result" event is considered.
 *
 * @param {unknown[]} events - Parsed events.
 * @returns {object|undefined} The collected fields, or undefined when there is
 *   no "result" event or it carries none of the fields.
 */
function extractUsage(events) {
  const numericFields = ["cost_usd", "duration_ms", "duration_api_ms", "input_tokens", "output_tokens"];
  let index = events.length;
  while (index-- > 0) {
    const event = events[index];
    if (!event || typeof event !== "object" || event.type !== "result") {
      continue;
    }
    const usage = {};
    for (const field of numericFields) {
      if (typeof event[field] === "number") {
        usage[field] = event[field];
      }
    }
    if (typeof event.session_id === "string") {
      usage.session_id = event.session_id;
    }
    // The search stops at the last "result" event even if it had no usable fields.
    return Object.keys(usage).length > 0 ? usage : void 0;
  }
  return void 0;
}
|
|
2635
|
+
/**
 * Choose the text to report as failure detail, preferring stderr.
 *
 * @param {string} stderr - Captured stderr.
 * @param {string} stdout - Captured stdout.
 * @returns {string|undefined} Trimmed stderr if non-empty, else trimmed stdout
 *   if non-empty, else undefined.
 */
function pickDetail(stderr, stdout) {
  const fromStderr = stderr.trim();
  if (fromStderr.length > 0) {
    return fromStderr;
  }
  const fromStdout = stdout.trim();
  if (fromStdout.length > 0) {
    return fromStdout;
  }
  return void 0;
}
|
|
2643
|
+
/**
 * Render a timeout as a message suffix such as " after 30s".
 *
 * @param {number|undefined} timeoutMs - Timeout in milliseconds.
 * @returns {string} " after <N>s" with N rounded up to whole seconds, or ""
 *   when the timeout is missing, zero, or negative.
 */
function formatTimeoutSuffix(timeoutMs) {
  const unusable = !timeoutMs || timeoutMs <= 0;
  return unusable ? "" : ` after ${Math.ceil(timeoutMs / 1e3)}s`;
}
|
|
2650
|
+
/**
 * Scan JSON Lines stdout for the combination of (a) a "system" event whose
 * `apiKeySource` is "ANTHROPIC_API_KEY" and (b) an event with
 * `error === "authentication_failed"` or a "result" event with a truthy
 * `is_error`.
 *
 * Lines that are blank, not JSON, or fail while being inspected are skipped.
 *
 * @param {string} stdout - Captured stdout.
 * @returns {boolean} true only when both conditions were seen; false on any
 *   unexpected failure.
 */
function isNestedClaudeCodeAuthError(stdout) {
  try {
    let sawApiKeySource = false;
    let sawAuthFailure = false;
    for (const rawLine of stdout.split("\n")) {
      const candidate = rawLine.trim();
      if (!candidate) continue;
      try {
        const event = JSON.parse(candidate);
        const isKeySourceEvent = event.type === "system" && event.apiKeySource === "ANTHROPIC_API_KEY";
        const isFailureEvent = event.error === "authentication_failed" || (event.type === "result" && event.is_error);
        if (isKeySourceEvent) {
          sawApiKeySource = true;
        }
        if (isFailureEvent) {
          sawAuthFailure = true;
        }
      } catch {
        // Unparseable or non-object lines do not affect the verdict.
      }
    }
    return sawApiKeySource && sawAuthFailure;
  } catch {
    return false;
  }
}
|
|
2674
|
+
/**
 * Wrap a string in single quotes, rewriting each embedded single quote as
 * `'\''` (close quote, escaped quote, reopen quote).
 *
 * @param {string} arg - Argument text.
 * @returns {string} The single-quoted argument.
 */
function escapeShellArg(arg) {
  const escaped = arg.split("'").join("'\\''");
  return `'${escaped}'`;
}
|
|
2677
|
+
/**
 * Run the command described by `options`, using four uniquely named temp
 * files (stdout, stderr, exit code, pid) under the OS temp directory, and
 * remove those files afterwards whether or not the run succeeded.
 *
 * @param {object} options - Passed through to `runClaudeCodeWithTempFiles`.
 * @returns {Promise<object>} The result of `runClaudeCodeWithTempFiles`.
 */
async function defaultClaudeCodeRunner(options) {
  const runId = randomUUID();
  const [stdoutFile, stderrFile, exitFile, pidFile] = ["stdout", "stderr", "exit", "pid"].map(
    (suffix) => path8.join(tmpdir(), `agentv-cc-${runId}-${suffix}`)
  );
  try {
    return await runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile);
  } finally {
    // Best-effort cleanup: a failure to delete one file must not mask the result.
    for (const file of [stdoutFile, stderrFile, exitFile, pidFile]) {
      try {
        await rm(file, { force: true });
      } catch {
      }
    }
  }
}
|
|
2694
|
+
/**
 * Run a command in a detached bash session and collect its output via files.
 *
 * A bash script starts the command in the background with stdout/stderr
 * redirected to `stdoutFile`/`stderrFile`, writes the child's PID to
 * `pidFile`, waits for it, and writes its exit status to `exitFile`. This
 * function then polls every 100 ms until the exit file appears, the timeout
 * elapses, or the abort signal fires.
 *
 * @param {object} options
 * @param {string} options.executable - Command; split on whitespace so it may
 *   carry leading arguments.
 * @param {string[]} options.args - Additional arguments.
 * @param {string} [options.cwd] - Working directory for the child.
 * @param {object} [options.env] - Environment for the child.
 * @param {number} [options.timeoutMs] - Kill the child after this many ms.
 * @param {AbortSignal} [options.signal] - Aborting kills the child.
 * @param {(chunk: string) => void} [options.onStdoutChunk] - Receives newly
 *   appended stdout text on each poll.
 * @param {(chunk: string) => void} [options.onStderrChunk] - Receives the
 *   whole stderr text once, after the run ends.
 * @param {string} stdoutFile - File receiving the child's stdout.
 * @param {string} stderrFile - File receiving the child's stderr.
 * @param {string} exitFile - File receiving the child's exit status.
 * @param {string} pidFile - File receiving the child's PID.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number, timedOut: boolean}>}
 *   `exitCode` is -1 when no exit status was recorded.
 */
async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile) {
  // Check for an already-aborted signal BEFORE spawning. Once the detached
  // child exists it can only be killed through the PID file, which it has
  // usually not written yet at this point, so aborting afterwards would leave
  // the process running unattended.
  if (options.signal?.aborted) {
    return { stdout: "", stderr: "Aborted", exitCode: -1, timedOut: false };
  }
  const parts = options.executable.split(/\s+/);
  const executable = parts[0];
  const executableArgs = parts.slice(1);
  const allArgs = [...executableArgs, ...options.args];
  const escapedArgs = allArgs.map((arg) => escapeShellArg(arg));
  const fullCommand = [escapeShellArg(executable), ...escapedArgs].join(" ");
  const bashScript = `
unset CLAUDECODE CLAUDE_CODE_ENTRYPOINT 2>/dev/null
${fullCommand} >${escapeShellArg(stdoutFile)} 2>${escapeShellArg(stderrFile)} &
CHILD_PID=$!
echo $CHILD_PID > ${escapeShellArg(pidFile)}
wait $CHILD_PID
echo $? > ${escapeShellArg(exitFile)}
`;
  // The child is fully detached (own session, no stdio pipes); all
  // communication happens through the four files.
  const child = spawn("setsid", ["bash", "-c", bashScript], {
    cwd: options.cwd,
    env: options.env,
    detached: true,
    stdio: "ignore"
  });
  child.unref();
  const pollInterval = 100;
  const startTime = Date.now();
  let timedOut = false;
  let lastStdoutSize = 0;
  // Missing or unreadable files read as the empty string.
  const readFileIfExists = async (filePath) => {
    try {
      const { readFile: readFile7 } = await import("node:fs/promises");
      return await readFile7(filePath, "utf8");
    } catch {
      return "";
    }
  };
  const fileExists4 = async (filePath) => {
    try {
      const { access: access4 } = await import("node:fs/promises");
      await access4(filePath);
      return true;
    } catch {
      return false;
    }
  };
  // Best-effort SIGTERM to the PID recorded by the bash script; any failure
  // (no PID yet, process already gone) is ignored.
  const killProcess = async () => {
    try {
      const pid = await readFileIfExists(pidFile);
      if (pid.trim()) {
        process.kill(Number.parseInt(pid.trim(), 10), "SIGTERM");
      }
    } catch {
    }
  };
  const abortHandler = () => {
    killProcess().catch(() => {
    });
  };
  options.signal?.addEventListener("abort", abortHandler, { once: true });
  try {
    while (true) {
      if (options.timeoutMs && Date.now() - startTime > options.timeoutMs) {
        timedOut = true;
        await killProcess();
        break;
      }
      if (options.signal?.aborted) {
        await killProcess();
        break;
      }
      if (options.onStdoutChunk) {
        // Forward only the text appended since the previous poll.
        const currentStdout = await readFileIfExists(stdoutFile);
        if (currentStdout.length > lastStdoutSize) {
          options.onStdoutChunk(currentStdout.slice(lastStdoutSize));
          lastStdoutSize = currentStdout.length;
        }
      }
      // The exit file is written last by the script, so its presence means the run is over.
      if (await fileExists4(exitFile)) {
        break;
      }
      await new Promise((resolve) => setTimeout(resolve, pollInterval));
    }
    const stdout = await readFileIfExists(stdoutFile);
    const stderr = await readFileIfExists(stderrFile);
    const exitCodeStr = await readFileIfExists(exitFile);
    const exitCode = exitCodeStr.trim() ? Number.parseInt(exitCodeStr.trim(), 10) : -1;
    // Deliver any stdout written between the last poll and process exit.
    if (options.onStdoutChunk && stdout.length > lastStdoutSize) {
      options.onStdoutChunk(stdout.slice(lastStdoutSize));
    }
    if (options.onStderrChunk && stderr) {
      options.onStderrChunk(stderr);
    }
    return { stdout, stderr, exitCode, timedOut };
  } finally {
    options.signal?.removeEventListener("abort", abortHandler);
  }
}
|
|
2793
|
+
|
|
2794
|
+
// src/evaluation/providers/cli.ts
|
|
2795
|
+
import { exec as execWithCallback } from "node:child_process";
|
|
2796
|
+
import fs from "node:fs/promises";
|
|
2797
|
+
import os from "node:os";
|
|
2798
|
+
import path9 from "node:path";
|
|
2799
|
+
import { promisify } from "node:util";
|
|
2800
|
+
import { z } from "zod";
|
|
2801
|
+
// Schema for one tool-call entry: `tool` is a required string; `input`,
// `output`, `id` and `timestamp` are optional.
var ToolCallSchema = z.object({
  tool: z.string(),
  input: z.unknown().optional(),
  output: z.unknown().optional(),
  id: z.string().optional(),
  timestamp: z.string().optional()
});
// Schema for one output message as it appears in CLI output (snake_case
// `tool_calls`): only `role` is required.
var OutputMessageInputSchema = z.object({
  role: z.string(),
  name: z.string().optional(),
  content: z.unknown().optional(),
  tool_calls: z.array(ToolCallSchema).optional(),
  timestamp: z.string().optional(),
  metadata: z.record(z.unknown()).optional()
});
// Schema for token usage: numeric `input` and `output` are required, `cached` is optional.
var TokenUsageSchema = z.object({
  input: z.number(),
  output: z.number(),
  cached: z.number().optional()
});
// Schema for a CLI output object; every field is optional.
var CliOutputSchema = z.object({
  text: z.unknown().optional(),
  output_messages: z.array(OutputMessageInputSchema).optional(),
  token_usage: TokenUsageSchema.optional(),
  cost_usd: z.number().optional(),
  duration_ms: z.number().optional()
});
// Schema for one JSONL batch record: the CLI output fields plus a required,
// non-empty string `id`.
var CliJsonlRecordSchema = CliOutputSchema.extend({
  id: z.string().min(1)
});
|
|
2831
|
+
/**
 * Drop negative cost/duration metrics, warning on the console for each one
 * that is discarded.
 *
 * @param {number|undefined} costUsd - Reported cost.
 * @param {number|undefined} durationMs - Reported duration.
 * @param {string} context - Label included in the warning text.
 * @returns {{costUsd: (number|undefined), durationMs: (number|undefined)}}
 *   The inputs, with negative values replaced by undefined.
 */
function validateMetrics(costUsd, durationMs, context) {
  const dropIfNegative = (value, fieldName) => {
    if (value !== void 0 && value < 0) {
      console.warn(`[cli-provider] ${context}: ignoring negative ${fieldName} value (${value})`);
      return void 0;
    }
    return value;
  };
  return {
    costUsd: dropIfNegative(costUsd, "cost_usd"),
    durationMs: dropIfNegative(durationMs, "duration_ms")
  };
}
|
|
2844
|
+
/**
 * Convert snake_case output messages to the camelCase shape by renaming
 * `tool_calls` to `toolCalls`; all other fields are copied as-is.
 *
 * @param {object[]|undefined} messages - Messages to convert.
 * @returns {object[]|undefined} Converted messages, or undefined when the
 *   input is missing or empty.
 */
function convertOutputMessages(messages) {
  if (!messages || messages.length === 0) {
    return void 0;
  }
  return messages.map(({ role, name, content, tool_calls: toolCalls, timestamp, metadata }) => ({
    role,
    name,
    content,
    toolCalls,
    timestamp,
    metadata
  }));
}
|
|
2857
|
+
// Promise-returning wrapper around child_process.exec.
var execAsync = promisify(execWithCallback);
// Cap on captured stdout/stderr passed to exec: 10 MiB.
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
/**
 * Run a shell command and report its outcome without throwing.
 *
 * On Windows the command runs under powershell.exe; elsewhere the default
 * shell of `exec` is used.
 *
 * @param {string} command - Command line to execute.
 * @param {object} options - `cwd`, `env`, `timeoutMs` and `signal` are
 *   forwarded to `exec`.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: (number|null),
 *   failed: boolean, timedOut: boolean, signal: (string|null)}>}
 *   On failure `exitCode` is the error's numeric `code` (else null) and
 *   `timedOut` is true when the error has `timedOut` or `killed` set.
 */
async function defaultCommandRunner(command, options) {
  const shell = process.platform === "win32" ? "powershell.exe" : void 0;
  const execOptions = {
    cwd: options.cwd,
    env: options.env,
    timeout: options.timeoutMs,
    signal: options.signal,
    maxBuffer: DEFAULT_MAX_BUFFER,
    shell
  };
  let completed;
  try {
    completed = await execAsync(command, execOptions);
  } catch (error) {
    const hasNumericCode = typeof error.code === "number";
    return {
      stdout: error.stdout ?? "",
      stderr: error.stderr ?? "",
      exitCode: hasNumericCode ? error.code : null,
      failed: true,
      timedOut: error.timedOut === true || error.killed === true,
      signal: error.signal ?? null
    };
  }
  return {
    stdout: completed.stdout,
    stderr: completed.stderr,
    exitCode: 0,
    failed: false,
    timedOut: false,
    signal: null
  };
}
|
|
2890
|
+
var CliProvider = class {
|
|
2891
|
+
id;
|
|
2892
|
+
kind = "cli";
|
|
2893
|
+
targetName;
|
|
2894
|
+
supportsBatch = true;
|
|
2895
|
+
config;
|
|
2896
|
+
runCommand;
|
|
2897
|
+
verbose;
|
|
2898
|
+
keepTempFiles;
|
|
2899
|
+
healthcheckPromise;
|
|
2900
|
+
constructor(targetName, config, runner = defaultCommandRunner) {
|
|
2901
|
+
this.targetName = targetName;
|
|
2902
|
+
this.id = `cli:${targetName}`;
|
|
2903
|
+
this.config = config;
|
|
2904
|
+
this.runCommand = runner;
|
|
2905
|
+
this.verbose = config.verbose ?? false;
|
|
2906
|
+
this.keepTempFiles = config.keepTempFiles ?? false;
|
|
2907
|
+
}
|
|
2908
|
+
async invoke(request) {
|
|
2909
|
+
if (request.signal?.aborted) {
|
|
2910
|
+
throw new Error("CLI provider request was aborted before execution");
|
|
2911
|
+
}
|
|
2912
|
+
await this.ensureHealthy(request.signal);
|
|
2913
|
+
const outputFilePath = generateOutputFilePath(request.evalCaseId);
|
|
2914
|
+
const templateValues = buildTemplateValues(request, this.config, outputFilePath);
|
|
2915
|
+
const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
|
|
2916
|
+
if (this.verbose) {
|
|
2917
|
+
console.log(
|
|
2918
|
+
`[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
2919
|
+
);
|
|
2920
|
+
}
|
|
2921
|
+
const startTime = Date.now();
|
|
2922
|
+
const result = await this.runCommand(renderedCommand, {
|
|
2923
|
+
cwd: this.config.cwd,
|
|
2924
|
+
env: process.env,
|
|
2925
|
+
timeoutMs: this.config.timeoutMs,
|
|
2926
|
+
signal: request.signal
|
|
2927
|
+
});
|
|
2928
|
+
const measuredDurationMs = Date.now() - startTime;
|
|
2929
|
+
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
2930
|
+
if (request.signal?.aborted) {
|
|
2931
|
+
throw new Error("CLI provider request was aborted");
|
|
1834
2932
|
}
|
|
1835
2933
|
if (result.timedOut) {
|
|
1836
2934
|
throw new Error(
|
|
1837
|
-
`CLI provider timed out${
|
|
2935
|
+
`CLI provider timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
|
|
1838
2936
|
);
|
|
1839
2937
|
}
|
|
1840
2938
|
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
@@ -1910,7 +3008,7 @@ var CliProvider = class {
|
|
|
1910
3008
|
}
|
|
1911
3009
|
if (result.timedOut) {
|
|
1912
3010
|
throw new Error(
|
|
1913
|
-
`CLI provider timed out${
|
|
3011
|
+
`CLI provider timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
|
|
1914
3012
|
);
|
|
1915
3013
|
}
|
|
1916
3014
|
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
@@ -1920,11 +3018,6 @@ var CliProvider = class {
|
|
|
1920
3018
|
}
|
|
1921
3019
|
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
1922
3020
|
const recordsById = this.parseJsonlBatchOutput(responseContent);
|
|
1923
|
-
const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
|
|
1924
|
-
const missingIds = requestedIds.filter((id) => !recordsById.has(id));
|
|
1925
|
-
if (missingIds.length > 0) {
|
|
1926
|
-
throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
|
|
1927
|
-
}
|
|
1928
3021
|
const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
|
|
1929
3022
|
const responses = requests.map((request) => {
|
|
1930
3023
|
const evalCaseId = request.evalCaseId;
|
|
@@ -1943,15 +3036,20 @@ var CliProvider = class {
|
|
|
1943
3036
|
}
|
|
1944
3037
|
const parsed = recordsById.get(evalCaseId);
|
|
1945
3038
|
if (!parsed) {
|
|
3039
|
+
const errorMessage = `Batch output missing id '${evalCaseId}'`;
|
|
3040
|
+
if (this.verbose) {
|
|
3041
|
+
console.warn(`[cli-provider:${this.targetName}] ${errorMessage}`);
|
|
3042
|
+
}
|
|
1946
3043
|
return {
|
|
1947
|
-
outputMessages: [],
|
|
3044
|
+
outputMessages: [{ role: "assistant", content: `Error: ${errorMessage}` }],
|
|
1948
3045
|
durationMs: perRequestFallbackMs,
|
|
1949
3046
|
raw: {
|
|
1950
3047
|
command: renderedCommand,
|
|
1951
3048
|
stderr: result.stderr,
|
|
1952
3049
|
exitCode: result.exitCode ?? 0,
|
|
1953
3050
|
cwd: this.config.cwd,
|
|
1954
|
-
outputFile: outputFilePath
|
|
3051
|
+
outputFile: outputFilePath,
|
|
3052
|
+
error: errorMessage
|
|
1955
3053
|
}
|
|
1956
3054
|
};
|
|
1957
3055
|
}
|
|
@@ -1984,101 +3082,37 @@ var CliProvider = class {
|
|
|
1984
3082
|
* - duration_ms: number
|
|
1985
3083
|
*/
|
|
1986
3084
|
parseOutputContent(content) {
|
|
3085
|
+
let parsed;
|
|
1987
3086
|
try {
|
|
1988
|
-
|
|
1989
|
-
if (typeof parsed === "object" && parsed !== null) {
|
|
1990
|
-
const obj = parsed;
|
|
1991
|
-
const tokenUsage = this.parseTokenUsage(obj.token_usage);
|
|
1992
|
-
const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
|
|
1993
|
-
const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
|
|
1994
|
-
const outputMessages = this.parseOutputMessages(obj.output_messages);
|
|
1995
|
-
if (outputMessages && outputMessages.length > 0) {
|
|
1996
|
-
return { outputMessages, tokenUsage, costUsd, durationMs };
|
|
1997
|
-
}
|
|
1998
|
-
if ("text" in obj) {
|
|
1999
|
-
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
2000
|
-
return {
|
|
2001
|
-
outputMessages: [{ role: "assistant", content: text }],
|
|
2002
|
-
tokenUsage,
|
|
2003
|
-
costUsd,
|
|
2004
|
-
durationMs
|
|
2005
|
-
};
|
|
2006
|
-
}
|
|
2007
|
-
}
|
|
3087
|
+
parsed = JSON.parse(content);
|
|
2008
3088
|
} catch {
|
|
3089
|
+
return { outputMessages: [{ role: "assistant", content }] };
|
|
2009
3090
|
}
|
|
2010
|
-
|
|
2011
|
-
|
|
2012
|
-
|
|
2013
|
-
* Parse token_usage from CLI output.
|
|
2014
|
-
*/
|
|
2015
|
-
parseTokenUsage(tokenUsage) {
|
|
2016
|
-
if (typeof tokenUsage !== "object" || tokenUsage === null) {
|
|
2017
|
-
return void 0;
|
|
2018
|
-
}
|
|
2019
|
-
const obj = tokenUsage;
|
|
2020
|
-
if (typeof obj.input !== "number" || typeof obj.output !== "number") {
|
|
2021
|
-
return void 0;
|
|
2022
|
-
}
|
|
2023
|
-
return {
|
|
2024
|
-
input: obj.input,
|
|
2025
|
-
output: obj.output,
|
|
2026
|
-
cached: typeof obj.cached === "number" ? obj.cached : void 0
|
|
2027
|
-
};
|
|
2028
|
-
}
|
|
2029
|
-
/**
|
|
2030
|
-
* Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
|
|
2031
|
-
*/
|
|
2032
|
-
parseOutputMessages(outputMessages) {
|
|
2033
|
-
if (!Array.isArray(outputMessages)) {
|
|
2034
|
-
return void 0;
|
|
3091
|
+
const result = CliOutputSchema.safeParse(parsed);
|
|
3092
|
+
if (!result.success) {
|
|
3093
|
+
return { outputMessages: [{ role: "assistant", content }] };
|
|
2035
3094
|
}
|
|
2036
|
-
const
|
|
2037
|
-
|
|
2038
|
-
|
|
2039
|
-
|
|
2040
|
-
|
|
2041
|
-
|
|
2042
|
-
|
|
2043
|
-
|
|
2044
|
-
|
|
2045
|
-
const message = {
|
|
2046
|
-
role: rawMsg.role,
|
|
2047
|
-
name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
|
|
2048
|
-
content: rawMsg.content,
|
|
2049
|
-
toolCalls: this.parseToolCalls(rawMsg.tool_calls),
|
|
2050
|
-
timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
|
|
2051
|
-
metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
|
|
3095
|
+
const obj = result.data;
|
|
3096
|
+
const metrics = validateMetrics(obj.cost_usd, obj.duration_ms, "parsing output");
|
|
3097
|
+
const outputMessages = convertOutputMessages(obj.output_messages);
|
|
3098
|
+
if (outputMessages && outputMessages.length > 0) {
|
|
3099
|
+
return {
|
|
3100
|
+
outputMessages,
|
|
3101
|
+
tokenUsage: obj.token_usage,
|
|
3102
|
+
costUsd: metrics.costUsd,
|
|
3103
|
+
durationMs: metrics.durationMs
|
|
2052
3104
|
};
|
|
2053
|
-
messages.push(message);
|
|
2054
|
-
}
|
|
2055
|
-
return messages.length > 0 ? messages : void 0;
|
|
2056
|
-
}
|
|
2057
|
-
/**
|
|
2058
|
-
* Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
|
|
2059
|
-
*/
|
|
2060
|
-
parseToolCalls(toolCalls) {
|
|
2061
|
-
if (!Array.isArray(toolCalls)) {
|
|
2062
|
-
return void 0;
|
|
2063
3105
|
}
|
|
2064
|
-
|
|
2065
|
-
|
|
2066
|
-
|
|
2067
|
-
|
|
2068
|
-
|
|
2069
|
-
|
|
2070
|
-
|
|
2071
|
-
|
|
2072
|
-
}
|
|
2073
|
-
calls.push({
|
|
2074
|
-
tool: rawCall.tool,
|
|
2075
|
-
input: rawCall.input,
|
|
2076
|
-
output: rawCall.output,
|
|
2077
|
-
id: typeof rawCall.id === "string" ? rawCall.id : void 0,
|
|
2078
|
-
timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
|
|
2079
|
-
});
|
|
3106
|
+
if (obj.text !== void 0) {
|
|
3107
|
+
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
3108
|
+
return {
|
|
3109
|
+
outputMessages: [{ role: "assistant", content: text }],
|
|
3110
|
+
tokenUsage: obj.token_usage,
|
|
3111
|
+
costUsd: metrics.costUsd,
|
|
3112
|
+
durationMs: metrics.durationMs
|
|
3113
|
+
};
|
|
2080
3114
|
}
|
|
2081
|
-
return
|
|
3115
|
+
return { outputMessages: [{ role: "assistant", content }] };
|
|
2082
3116
|
}
|
|
2083
3117
|
parseJsonlBatchOutput(content) {
|
|
2084
3118
|
const records = /* @__PURE__ */ new Map();
|
|
@@ -2091,33 +3125,32 @@ var CliProvider = class {
|
|
|
2091
3125
|
const reason = error instanceof Error ? error.message : String(error);
|
|
2092
3126
|
throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
|
|
2093
3127
|
}
|
|
2094
|
-
|
|
3128
|
+
const result = CliJsonlRecordSchema.safeParse(parsed);
|
|
3129
|
+
if (!result.success) {
|
|
3130
|
+
const firstError = result.error.errors[0];
|
|
3131
|
+
if (firstError?.path.includes("id")) {
|
|
3132
|
+
throw new Error("CLI batch output JSONL line missing required string field: id");
|
|
3133
|
+
}
|
|
2095
3134
|
throw new Error("CLI batch output JSONL line must be an object");
|
|
2096
3135
|
}
|
|
2097
|
-
const obj =
|
|
2098
|
-
|
|
2099
|
-
|
|
2100
|
-
|
|
2101
|
-
|
|
2102
|
-
|
|
2103
|
-
|
|
2104
|
-
|
|
2105
|
-
const tokenUsage = this.parseTokenUsage(obj.token_usage);
|
|
2106
|
-
const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
|
|
2107
|
-
const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
|
|
2108
|
-
const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
|
|
2109
|
-
let outputMessages;
|
|
2110
|
-
if (parsedOutputMessages && parsedOutputMessages.length > 0) {
|
|
2111
|
-
outputMessages = parsedOutputMessages;
|
|
3136
|
+
const obj = result.data;
|
|
3137
|
+
if (records.has(obj.id)) {
|
|
3138
|
+
throw new Error(`CLI batch output contains duplicate id: ${obj.id}`);
|
|
3139
|
+
}
|
|
3140
|
+
const outputMessages = convertOutputMessages(obj.output_messages);
|
|
3141
|
+
let finalOutputMessages;
|
|
3142
|
+
if (outputMessages && outputMessages.length > 0) {
|
|
3143
|
+
finalOutputMessages = outputMessages;
|
|
2112
3144
|
} else {
|
|
2113
3145
|
const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
|
|
2114
|
-
|
|
2115
|
-
}
|
|
2116
|
-
|
|
2117
|
-
|
|
2118
|
-
|
|
2119
|
-
|
|
2120
|
-
|
|
3146
|
+
finalOutputMessages = text ? [{ role: "assistant", content: text }] : [];
|
|
3147
|
+
}
|
|
3148
|
+
const metrics = validateMetrics(obj.cost_usd, obj.duration_ms, `batch record '${obj.id}'`);
|
|
3149
|
+
records.set(obj.id, {
|
|
3150
|
+
outputMessages: finalOutputMessages,
|
|
3151
|
+
tokenUsage: obj.token_usage,
|
|
3152
|
+
costUsd: metrics.costUsd,
|
|
3153
|
+
durationMs: metrics.durationMs
|
|
2121
3154
|
});
|
|
2122
3155
|
}
|
|
2123
3156
|
return records;
|
|
@@ -2203,7 +3236,7 @@ var CliProvider = class {
|
|
|
2203
3236
|
}
|
|
2204
3237
|
};
|
|
2205
3238
|
function buildTemplateValues(request, config, outputFilePath) {
|
|
2206
|
-
const inputFiles =
|
|
3239
|
+
const inputFiles = normalizeInputFiles2(request.inputFiles);
|
|
2207
3240
|
return {
|
|
2208
3241
|
PROMPT: shellEscape(request.question ?? ""),
|
|
2209
3242
|
GUIDELINES: shellEscape(request.guidelines ?? ""),
|
|
@@ -2213,13 +3246,13 @@ function buildTemplateValues(request, config, outputFilePath) {
|
|
|
2213
3246
|
OUTPUT_FILE: shellEscape(outputFilePath)
|
|
2214
3247
|
};
|
|
2215
3248
|
}
|
|
2216
|
-
function
|
|
3249
|
+
function normalizeInputFiles2(inputFiles) {
|
|
2217
3250
|
if (!inputFiles || inputFiles.length === 0) {
|
|
2218
3251
|
return void 0;
|
|
2219
3252
|
}
|
|
2220
3253
|
const unique = /* @__PURE__ */ new Map();
|
|
2221
3254
|
for (const inputFile of inputFiles) {
|
|
2222
|
-
const absolutePath =
|
|
3255
|
+
const absolutePath = path9.resolve(inputFile);
|
|
2223
3256
|
if (!unique.has(absolutePath)) {
|
|
2224
3257
|
unique.set(absolutePath, absolutePath);
|
|
2225
3258
|
}
|
|
@@ -2233,7 +3266,7 @@ function formatFileList(files, template) {
|
|
|
2233
3266
|
const formatter = template ?? "{path}";
|
|
2234
3267
|
return files.map((filePath) => {
|
|
2235
3268
|
const escapedPath = shellEscape(filePath);
|
|
2236
|
-
const escapedName = shellEscape(
|
|
3269
|
+
const escapedName = shellEscape(path9.basename(filePath));
|
|
2237
3270
|
return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
|
|
2238
3271
|
}).join(" ");
|
|
2239
3272
|
}
|
|
@@ -2257,9 +3290,9 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
|
|
|
2257
3290
|
const safeEvalId = evalCaseId || "unknown";
|
|
2258
3291
|
const timestamp = Date.now();
|
|
2259
3292
|
const random = Math.random().toString(36).substring(2, 9);
|
|
2260
|
-
return
|
|
3293
|
+
return path9.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
|
|
2261
3294
|
}
|
|
2262
|
-
function
|
|
3295
|
+
function formatTimeoutSuffix2(timeoutMs) {
|
|
2263
3296
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
2264
3297
|
return "";
|
|
2265
3298
|
}
|
|
@@ -2268,39 +3301,39 @@ function formatTimeoutSuffix(timeoutMs) {
|
|
|
2268
3301
|
}
|
|
2269
3302
|
|
|
2270
3303
|
// src/evaluation/providers/codex.ts
|
|
2271
|
-
import { exec as execCallback, spawn } from "node:child_process";
|
|
2272
|
-
import { randomUUID } from "node:crypto";
|
|
2273
|
-
import { constants as constants2, createWriteStream } from "node:fs";
|
|
2274
|
-
import { access as access2, mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
|
|
2275
|
-
import { tmpdir } from "node:os";
|
|
2276
|
-
import
|
|
3304
|
+
import { exec as execCallback, spawn as spawn2 } from "node:child_process";
|
|
3305
|
+
import { randomUUID as randomUUID2 } from "node:crypto";
|
|
3306
|
+
import { constants as constants2, createWriteStream as createWriteStream2 } from "node:fs";
|
|
3307
|
+
import { access as access2, mkdir as mkdir2, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile2 } from "node:fs/promises";
|
|
3308
|
+
import { tmpdir as tmpdir2 } from "node:os";
|
|
3309
|
+
import path10 from "node:path";
|
|
2277
3310
|
import { promisify as promisify2 } from "node:util";
|
|
2278
3311
|
|
|
2279
3312
|
// src/evaluation/providers/codex-log-tracker.ts
|
|
2280
|
-
var
|
|
2281
|
-
var
|
|
3313
|
+
var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.codexLogs");
|
|
3314
|
+
var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.codexLogSubscribers");
|
|
2282
3315
|
function getCodexLogStore() {
|
|
2283
3316
|
const globalObject = globalThis;
|
|
2284
|
-
const existing = globalObject[
|
|
3317
|
+
const existing = globalObject[GLOBAL_LOGS_KEY2];
|
|
2285
3318
|
if (existing) {
|
|
2286
3319
|
return existing;
|
|
2287
3320
|
}
|
|
2288
3321
|
const created = [];
|
|
2289
|
-
globalObject[
|
|
3322
|
+
globalObject[GLOBAL_LOGS_KEY2] = created;
|
|
2290
3323
|
return created;
|
|
2291
3324
|
}
|
|
2292
|
-
function
|
|
3325
|
+
function getSubscriberStore2() {
|
|
2293
3326
|
const globalObject = globalThis;
|
|
2294
|
-
const existing = globalObject[
|
|
3327
|
+
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
|
|
2295
3328
|
if (existing) {
|
|
2296
3329
|
return existing;
|
|
2297
3330
|
}
|
|
2298
3331
|
const created = /* @__PURE__ */ new Set();
|
|
2299
|
-
globalObject[
|
|
3332
|
+
globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
|
|
2300
3333
|
return created;
|
|
2301
3334
|
}
|
|
2302
|
-
function
|
|
2303
|
-
const subscribers = Array.from(
|
|
3335
|
+
function notifySubscribers2(entry) {
|
|
3336
|
+
const subscribers = Array.from(getSubscriberStore2());
|
|
2304
3337
|
for (const listener of subscribers) {
|
|
2305
3338
|
try {
|
|
2306
3339
|
listener(entry);
|
|
@@ -2312,7 +3345,7 @@ function notifySubscribers(entry) {
|
|
|
2312
3345
|
}
|
|
2313
3346
|
function recordCodexLogEntry(entry) {
|
|
2314
3347
|
getCodexLogStore().push(entry);
|
|
2315
|
-
|
|
3348
|
+
notifySubscribers2(entry);
|
|
2316
3349
|
}
|
|
2317
3350
|
function consumeCodexLogEntries() {
|
|
2318
3351
|
const store = getCodexLogStore();
|
|
@@ -2322,118 +3355,19 @@ function consumeCodexLogEntries() {
|
|
|
2322
3355
|
return store.splice(0, store.length);
|
|
2323
3356
|
}
|
|
2324
3357
|
function subscribeToCodexLogEntries(listener) {
|
|
2325
|
-
const store =
|
|
3358
|
+
const store = getSubscriberStore2();
|
|
2326
3359
|
store.add(listener);
|
|
2327
3360
|
return () => {
|
|
2328
3361
|
store.delete(listener);
|
|
2329
3362
|
};
|
|
2330
3363
|
}
|
|
2331
3364
|
|
|
2332
|
-
// src/evaluation/providers/preread.ts
|
|
2333
|
-
import path8 from "node:path";
|
|
2334
|
-
function buildPromptDocument(request, inputFiles, options) {
|
|
2335
|
-
const parts = [];
|
|
2336
|
-
const guidelineFiles = collectGuidelineFiles(
|
|
2337
|
-
inputFiles,
|
|
2338
|
-
options?.guidelinePatterns ?? request.guideline_patterns,
|
|
2339
|
-
options?.guidelineOverrides
|
|
2340
|
-
);
|
|
2341
|
-
const inputFilesList = collectInputFiles(inputFiles);
|
|
2342
|
-
const nonGuidelineInputFiles = inputFilesList.filter((file) => !guidelineFiles.includes(file));
|
|
2343
|
-
const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
|
|
2344
|
-
if (prereadBlock.length > 0) {
|
|
2345
|
-
parts.push("\n", prereadBlock);
|
|
2346
|
-
}
|
|
2347
|
-
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
2348
|
-
return parts.join("\n").trim();
|
|
2349
|
-
}
|
|
2350
|
-
function normalizeInputFiles2(inputFiles) {
|
|
2351
|
-
if (!inputFiles || inputFiles.length === 0) {
|
|
2352
|
-
return void 0;
|
|
2353
|
-
}
|
|
2354
|
-
const deduped = /* @__PURE__ */ new Map();
|
|
2355
|
-
for (const inputFile of inputFiles) {
|
|
2356
|
-
const absolutePath = path8.resolve(inputFile);
|
|
2357
|
-
if (!deduped.has(absolutePath)) {
|
|
2358
|
-
deduped.set(absolutePath, absolutePath);
|
|
2359
|
-
}
|
|
2360
|
-
}
|
|
2361
|
-
return Array.from(deduped.values());
|
|
2362
|
-
}
|
|
2363
|
-
function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
|
|
2364
|
-
if (!inputFiles || inputFiles.length === 0) {
|
|
2365
|
-
return [];
|
|
2366
|
-
}
|
|
2367
|
-
const unique = /* @__PURE__ */ new Map();
|
|
2368
|
-
for (const inputFile of inputFiles) {
|
|
2369
|
-
const absolutePath = path8.resolve(inputFile);
|
|
2370
|
-
if (overrides?.has(absolutePath)) {
|
|
2371
|
-
if (!unique.has(absolutePath)) {
|
|
2372
|
-
unique.set(absolutePath, absolutePath);
|
|
2373
|
-
}
|
|
2374
|
-
continue;
|
|
2375
|
-
}
|
|
2376
|
-
const normalized = absolutePath.split(path8.sep).join("/");
|
|
2377
|
-
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
2378
|
-
if (!unique.has(absolutePath)) {
|
|
2379
|
-
unique.set(absolutePath, absolutePath);
|
|
2380
|
-
}
|
|
2381
|
-
}
|
|
2382
|
-
}
|
|
2383
|
-
return Array.from(unique.values());
|
|
2384
|
-
}
|
|
2385
|
-
function collectInputFiles(inputFiles) {
|
|
2386
|
-
if (!inputFiles || inputFiles.length === 0) {
|
|
2387
|
-
return [];
|
|
2388
|
-
}
|
|
2389
|
-
const unique = /* @__PURE__ */ new Map();
|
|
2390
|
-
for (const inputFile of inputFiles) {
|
|
2391
|
-
const absolutePath = path8.resolve(inputFile);
|
|
2392
|
-
if (!unique.has(absolutePath)) {
|
|
2393
|
-
unique.set(absolutePath, absolutePath);
|
|
2394
|
-
}
|
|
2395
|
-
}
|
|
2396
|
-
return Array.from(unique.values());
|
|
2397
|
-
}
|
|
2398
|
-
function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
|
|
2399
|
-
if (guidelineFiles.length === 0 && inputFiles.length === 0) {
|
|
2400
|
-
return "";
|
|
2401
|
-
}
|
|
2402
|
-
const buildList = (files) => files.map((absolutePath) => {
|
|
2403
|
-
const fileName = path8.basename(absolutePath);
|
|
2404
|
-
const fileUri = pathToFileUri(absolutePath);
|
|
2405
|
-
return `* [${fileName}](${fileUri})`;
|
|
2406
|
-
});
|
|
2407
|
-
const sections = [];
|
|
2408
|
-
if (guidelineFiles.length > 0) {
|
|
2409
|
-
sections.push(`Read all guideline files:
|
|
2410
|
-
${buildList(guidelineFiles).join("\n")}.`);
|
|
2411
|
-
}
|
|
2412
|
-
if (inputFiles.length > 0) {
|
|
2413
|
-
sections.push(`Read all input files:
|
|
2414
|
-
${buildList(inputFiles).join("\n")}.`);
|
|
2415
|
-
}
|
|
2416
|
-
sections.push(
|
|
2417
|
-
"If any file is missing, fail with ERROR: missing-file <filename> and stop.",
|
|
2418
|
-
"Then apply system_instructions on the user query below."
|
|
2419
|
-
);
|
|
2420
|
-
return sections.join("\n");
|
|
2421
|
-
}
|
|
2422
|
-
function pathToFileUri(filePath) {
|
|
2423
|
-
const absolutePath = path8.isAbsolute(filePath) ? filePath : path8.resolve(filePath);
|
|
2424
|
-
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
2425
|
-
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
2426
|
-
return `file:///${normalizedPath}`;
|
|
2427
|
-
}
|
|
2428
|
-
return `file://${normalizedPath}`;
|
|
2429
|
-
}
|
|
2430
|
-
|
|
2431
3365
|
// src/evaluation/providers/codex.ts
|
|
2432
3366
|
var execAsync2 = promisify2(execCallback);
|
|
2433
|
-
var
|
|
2434
|
-
var
|
|
3367
|
+
var WORKSPACE_PREFIX2 = "agentv-codex-";
|
|
3368
|
+
var PROMPT_FILENAME2 = "prompt.md";
|
|
2435
3369
|
var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
|
|
2436
|
-
var
|
|
3370
|
+
var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
|
|
2437
3371
|
- Do NOT create any additional output files in the workspace.
|
|
2438
3372
|
- All intended file outputs/changes MUST be written in your response.
|
|
2439
3373
|
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
@@ -2458,27 +3392,27 @@ var CodexProvider = class {
|
|
|
2458
3392
|
throw new Error("Codex provider request was aborted before execution");
|
|
2459
3393
|
}
|
|
2460
3394
|
await this.ensureEnvironmentReady();
|
|
2461
|
-
const inputFiles =
|
|
3395
|
+
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
2462
3396
|
const workspaceRoot = await this.createWorkspace();
|
|
2463
3397
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
2464
3398
|
try {
|
|
2465
3399
|
const basePrompt = buildPromptDocument(request, inputFiles);
|
|
2466
|
-
const systemPrompt = this.config.systemPrompt ??
|
|
3400
|
+
const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
|
|
2467
3401
|
const promptContent = `${systemPrompt}
|
|
2468
3402
|
|
|
2469
3403
|
${basePrompt}`;
|
|
2470
|
-
const promptFile =
|
|
2471
|
-
await
|
|
3404
|
+
const promptFile = path10.join(workspaceRoot, PROMPT_FILENAME2);
|
|
3405
|
+
await writeFile2(promptFile, promptContent, "utf8");
|
|
2472
3406
|
const args = this.buildCodexArgs();
|
|
2473
3407
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
2474
3408
|
const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
|
|
2475
3409
|
if (result.timedOut) {
|
|
2476
3410
|
throw new Error(
|
|
2477
|
-
`Codex CLI timed out${
|
|
3411
|
+
`Codex CLI timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
|
|
2478
3412
|
);
|
|
2479
3413
|
}
|
|
2480
3414
|
if (result.exitCode !== 0) {
|
|
2481
|
-
const detail =
|
|
3415
|
+
const detail = pickDetail2(result.stderr, result.stdout);
|
|
2482
3416
|
const prefix = `Codex CLI exited with code ${result.exitCode}`;
|
|
2483
3417
|
throw new Error(detail ? `${prefix}: ${detail}` : prefix);
|
|
2484
3418
|
}
|
|
@@ -2517,7 +3451,7 @@ ${basePrompt}`;
|
|
|
2517
3451
|
if (!this.config.cwd) {
|
|
2518
3452
|
return workspaceRoot;
|
|
2519
3453
|
}
|
|
2520
|
-
return
|
|
3454
|
+
return path10.resolve(this.config.cwd);
|
|
2521
3455
|
}
|
|
2522
3456
|
buildCodexArgs() {
|
|
2523
3457
|
const args = [
|
|
@@ -2559,11 +3493,11 @@ ${basePrompt}`;
|
|
|
2559
3493
|
}
|
|
2560
3494
|
}
|
|
2561
3495
|
async createWorkspace() {
|
|
2562
|
-
return await
|
|
3496
|
+
return await mkdtemp2(path10.join(tmpdir2(), WORKSPACE_PREFIX2));
|
|
2563
3497
|
}
|
|
2564
3498
|
async cleanupWorkspace(workspaceRoot) {
|
|
2565
3499
|
try {
|
|
2566
|
-
await
|
|
3500
|
+
await rm2(workspaceRoot, { recursive: true, force: true });
|
|
2567
3501
|
} catch {
|
|
2568
3502
|
}
|
|
2569
3503
|
}
|
|
@@ -2573,9 +3507,9 @@ ${basePrompt}`;
|
|
|
2573
3507
|
return void 0;
|
|
2574
3508
|
}
|
|
2575
3509
|
if (this.config.logDir) {
|
|
2576
|
-
return
|
|
3510
|
+
return path10.resolve(this.config.logDir);
|
|
2577
3511
|
}
|
|
2578
|
-
return
|
|
3512
|
+
return path10.join(process.cwd(), ".agentv", "logs", "codex");
|
|
2579
3513
|
}
|
|
2580
3514
|
async createStreamLogger(request) {
|
|
2581
3515
|
const logDir = this.resolveLogDirectory();
|
|
@@ -2583,13 +3517,13 @@ ${basePrompt}`;
|
|
|
2583
3517
|
return void 0;
|
|
2584
3518
|
}
|
|
2585
3519
|
try {
|
|
2586
|
-
await
|
|
3520
|
+
await mkdir2(logDir, { recursive: true });
|
|
2587
3521
|
} catch (error) {
|
|
2588
3522
|
const message = error instanceof Error ? error.message : String(error);
|
|
2589
3523
|
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
2590
3524
|
return void 0;
|
|
2591
3525
|
}
|
|
2592
|
-
const filePath =
|
|
3526
|
+
const filePath = path10.join(logDir, buildLogFilename2(request, this.targetName));
|
|
2593
3527
|
try {
|
|
2594
3528
|
const logger = await CodexStreamLogger.create({
|
|
2595
3529
|
filePath,
|
|
@@ -2622,7 +3556,7 @@ var CodexStreamLogger = class _CodexStreamLogger {
|
|
|
2622
3556
|
constructor(filePath, format) {
|
|
2623
3557
|
this.filePath = filePath;
|
|
2624
3558
|
this.format = format;
|
|
2625
|
-
this.stream =
|
|
3559
|
+
this.stream = createWriteStream2(filePath, { flags: "a" });
|
|
2626
3560
|
}
|
|
2627
3561
|
static async create(options) {
|
|
2628
3562
|
const logger = new _CodexStreamLogger(options.filePath, options.format);
|
|
@@ -2683,7 +3617,7 @@ var CodexStreamLogger = class _CodexStreamLogger {
|
|
|
2683
3617
|
return void 0;
|
|
2684
3618
|
}
|
|
2685
3619
|
const message = this.format === "json" ? formatCodexJsonLog(trimmed) : formatCodexLogMessage(trimmed, source);
|
|
2686
|
-
return `[+${
|
|
3620
|
+
return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
|
|
2687
3621
|
}
|
|
2688
3622
|
flushRemainder() {
|
|
2689
3623
|
const stdoutRemainder = this.stdoutBuffer.trim();
|
|
@@ -2714,18 +3648,18 @@ function isCodexLogStreamingDisabled() {
|
|
|
2714
3648
|
const normalized = envValue.trim().toLowerCase();
|
|
2715
3649
|
return normalized === "false" || normalized === "0" || normalized === "off";
|
|
2716
3650
|
}
|
|
2717
|
-
function
|
|
3651
|
+
function buildLogFilename2(request, targetName) {
|
|
2718
3652
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
2719
|
-
const evalId =
|
|
3653
|
+
const evalId = sanitizeForFilename2(request.evalCaseId ?? "codex");
|
|
2720
3654
|
const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
|
|
2721
|
-
const target =
|
|
2722
|
-
return `${timestamp}_${target}_${evalId}${attemptSuffix}_${
|
|
3655
|
+
const target = sanitizeForFilename2(targetName);
|
|
3656
|
+
return `${timestamp}_${target}_${evalId}${attemptSuffix}_${randomUUID2().slice(0, 8)}.log`;
|
|
2723
3657
|
}
|
|
2724
|
-
function
|
|
3658
|
+
function sanitizeForFilename2(value) {
|
|
2725
3659
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
2726
3660
|
return sanitized.length > 0 ? sanitized : "codex";
|
|
2727
3661
|
}
|
|
2728
|
-
function
|
|
3662
|
+
function formatElapsed2(startedAt) {
|
|
2729
3663
|
const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
|
|
2730
3664
|
const hours = Math.floor(elapsedSeconds / 3600);
|
|
2731
3665
|
const minutes = Math.floor(elapsedSeconds % 3600 / 60);
|
|
@@ -2736,7 +3670,7 @@ function formatElapsed(startedAt) {
|
|
|
2736
3670
|
return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
|
|
2737
3671
|
}
|
|
2738
3672
|
function formatCodexLogMessage(rawLine, source) {
|
|
2739
|
-
const parsed =
|
|
3673
|
+
const parsed = tryParseJsonValue2(rawLine);
|
|
2740
3674
|
if (parsed) {
|
|
2741
3675
|
const summary = summarizeCodexEvent(parsed);
|
|
2742
3676
|
if (summary) {
|
|
@@ -2749,7 +3683,7 @@ function formatCodexLogMessage(rawLine, source) {
|
|
|
2749
3683
|
return rawLine;
|
|
2750
3684
|
}
|
|
2751
3685
|
function formatCodexJsonLog(rawLine) {
|
|
2752
|
-
const parsed =
|
|
3686
|
+
const parsed = tryParseJsonValue2(rawLine);
|
|
2753
3687
|
if (!parsed) {
|
|
2754
3688
|
return rawLine;
|
|
2755
3689
|
}
|
|
@@ -2794,7 +3728,7 @@ function summarizeCodexEvent(event) {
|
|
|
2794
3728
|
}
|
|
2795
3729
|
return type;
|
|
2796
3730
|
}
|
|
2797
|
-
function
|
|
3731
|
+
function tryParseJsonValue2(rawLine) {
|
|
2798
3732
|
try {
|
|
2799
3733
|
return JSON.parse(rawLine);
|
|
2800
3734
|
} catch {
|
|
@@ -2804,7 +3738,7 @@ function tryParseJsonValue(rawLine) {
|
|
|
2804
3738
|
async function locateExecutable(candidate) {
|
|
2805
3739
|
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
2806
3740
|
if (includesPathSeparator) {
|
|
2807
|
-
const resolved =
|
|
3741
|
+
const resolved = path10.isAbsolute(candidate) ? candidate : path10.resolve(candidate);
|
|
2808
3742
|
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
2809
3743
|
await access2(executablePath, constants2.F_OK);
|
|
2810
3744
|
return executablePath;
|
|
@@ -3023,7 +3957,7 @@ function parseJsonLines(output) {
|
|
|
3023
3957
|
}
|
|
3024
3958
|
return parsed;
|
|
3025
3959
|
}
|
|
3026
|
-
function
|
|
3960
|
+
function pickDetail2(stderr, stdout) {
|
|
3027
3961
|
const errorText = stderr.trim();
|
|
3028
3962
|
if (errorText.length > 0) {
|
|
3029
3963
|
return errorText;
|
|
@@ -3031,7 +3965,7 @@ function pickDetail(stderr, stdout) {
|
|
|
3031
3965
|
const stdoutText = stdout.trim();
|
|
3032
3966
|
return stdoutText.length > 0 ? stdoutText : void 0;
|
|
3033
3967
|
}
|
|
3034
|
-
function
|
|
3968
|
+
function formatTimeoutSuffix3(timeoutMs) {
|
|
3035
3969
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
3036
3970
|
return "";
|
|
3037
3971
|
}
|
|
@@ -3040,7 +3974,7 @@ function formatTimeoutSuffix2(timeoutMs) {
|
|
|
3040
3974
|
}
|
|
3041
3975
|
async function defaultCodexRunner(options) {
|
|
3042
3976
|
return await new Promise((resolve, reject) => {
|
|
3043
|
-
const child =
|
|
3977
|
+
const child = spawn2(options.executable, options.args, {
|
|
3044
3978
|
cwd: options.cwd,
|
|
3045
3979
|
env: options.env,
|
|
3046
3980
|
stdio: ["pipe", "pipe", "pipe"],
|
|
@@ -3151,38 +4085,38 @@ var MockProvider = class {
|
|
|
3151
4085
|
};
|
|
3152
4086
|
|
|
3153
4087
|
// src/evaluation/providers/pi-coding-agent.ts
|
|
3154
|
-
import { spawn as
|
|
3155
|
-
import { randomUUID as
|
|
3156
|
-
import { createWriteStream as
|
|
3157
|
-
import { mkdir as
|
|
3158
|
-
import { tmpdir as
|
|
3159
|
-
import
|
|
4088
|
+
import { spawn as spawn3 } from "node:child_process";
|
|
4089
|
+
import { randomUUID as randomUUID3 } from "node:crypto";
|
|
4090
|
+
import { createWriteStream as createWriteStream3 } from "node:fs";
|
|
4091
|
+
import { mkdir as mkdir3, mkdtemp as mkdtemp3, rm as rm3, writeFile as writeFile3 } from "node:fs/promises";
|
|
4092
|
+
import { tmpdir as tmpdir3 } from "node:os";
|
|
4093
|
+
import path11 from "node:path";
|
|
3160
4094
|
|
|
3161
4095
|
// src/evaluation/providers/pi-log-tracker.ts
|
|
3162
|
-
var
|
|
3163
|
-
var
|
|
4096
|
+
var GLOBAL_LOGS_KEY3 = Symbol.for("agentv.piLogs");
|
|
4097
|
+
var GLOBAL_SUBSCRIBERS_KEY3 = Symbol.for("agentv.piLogSubscribers");
|
|
3164
4098
|
function getPiLogStore() {
|
|
3165
4099
|
const globalObject = globalThis;
|
|
3166
|
-
const existing = globalObject[
|
|
4100
|
+
const existing = globalObject[GLOBAL_LOGS_KEY3];
|
|
3167
4101
|
if (existing) {
|
|
3168
4102
|
return existing;
|
|
3169
4103
|
}
|
|
3170
4104
|
const created = [];
|
|
3171
|
-
globalObject[
|
|
4105
|
+
globalObject[GLOBAL_LOGS_KEY3] = created;
|
|
3172
4106
|
return created;
|
|
3173
4107
|
}
|
|
3174
|
-
function
|
|
4108
|
+
function getSubscriberStore3() {
|
|
3175
4109
|
const globalObject = globalThis;
|
|
3176
|
-
const existing = globalObject[
|
|
4110
|
+
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY3];
|
|
3177
4111
|
if (existing) {
|
|
3178
4112
|
return existing;
|
|
3179
4113
|
}
|
|
3180
4114
|
const created = /* @__PURE__ */ new Set();
|
|
3181
|
-
globalObject[
|
|
4115
|
+
globalObject[GLOBAL_SUBSCRIBERS_KEY3] = created;
|
|
3182
4116
|
return created;
|
|
3183
4117
|
}
|
|
3184
|
-
function
|
|
3185
|
-
const subscribers = Array.from(
|
|
4118
|
+
function notifySubscribers3(entry) {
|
|
4119
|
+
const subscribers = Array.from(getSubscriberStore3());
|
|
3186
4120
|
for (const listener of subscribers) {
|
|
3187
4121
|
try {
|
|
3188
4122
|
listener(entry);
|
|
@@ -3194,7 +4128,7 @@ function notifySubscribers2(entry) {
|
|
|
3194
4128
|
}
|
|
3195
4129
|
function recordPiLogEntry(entry) {
|
|
3196
4130
|
getPiLogStore().push(entry);
|
|
3197
|
-
|
|
4131
|
+
notifySubscribers3(entry);
|
|
3198
4132
|
}
|
|
3199
4133
|
function consumePiLogEntries() {
|
|
3200
4134
|
const store = getPiLogStore();
|
|
@@ -3204,7 +4138,7 @@ function consumePiLogEntries() {
|
|
|
3204
4138
|
return store.splice(0, store.length);
|
|
3205
4139
|
}
|
|
3206
4140
|
function subscribeToPiLogEntries(listener) {
|
|
3207
|
-
const store =
|
|
4141
|
+
const store = getSubscriberStore3();
|
|
3208
4142
|
store.add(listener);
|
|
3209
4143
|
return () => {
|
|
3210
4144
|
store.delete(listener);
|
|
@@ -3212,9 +4146,9 @@ function subscribeToPiLogEntries(listener) {
|
|
|
3212
4146
|
}
|
|
3213
4147
|
|
|
3214
4148
|
// src/evaluation/providers/pi-coding-agent.ts
|
|
3215
|
-
var
|
|
3216
|
-
var
|
|
3217
|
-
var
|
|
4149
|
+
var WORKSPACE_PREFIX3 = "agentv-pi-";
|
|
4150
|
+
var PROMPT_FILENAME3 = "prompt.md";
|
|
4151
|
+
var DEFAULT_SYSTEM_PROMPT4 = `**IMPORTANT**: Follow these instructions for your response:
|
|
3218
4152
|
- Do NOT create any additional output files in the workspace.
|
|
3219
4153
|
- All intended file outputs/changes MUST be written in your response.
|
|
3220
4154
|
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
@@ -3236,27 +4170,27 @@ var PiCodingAgentProvider = class {
|
|
|
3236
4170
|
if (request.signal?.aborted) {
|
|
3237
4171
|
throw new Error("Pi coding agent request was aborted before execution");
|
|
3238
4172
|
}
|
|
3239
|
-
const inputFiles =
|
|
4173
|
+
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
3240
4174
|
const workspaceRoot = await this.createWorkspace();
|
|
3241
4175
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
3242
4176
|
try {
|
|
3243
|
-
const promptFile =
|
|
3244
|
-
await
|
|
4177
|
+
const promptFile = path11.join(workspaceRoot, PROMPT_FILENAME3);
|
|
4178
|
+
await writeFile3(promptFile, request.question, "utf8");
|
|
3245
4179
|
const args = this.buildPiArgs(request.question, inputFiles);
|
|
3246
4180
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
3247
4181
|
const result = await this.executePi(args, cwd, request.signal, logger);
|
|
3248
4182
|
if (result.timedOut) {
|
|
3249
4183
|
throw new Error(
|
|
3250
|
-
`Pi coding agent timed out${
|
|
4184
|
+
`Pi coding agent timed out${formatTimeoutSuffix4(this.config.timeoutMs ?? void 0)}`
|
|
3251
4185
|
);
|
|
3252
4186
|
}
|
|
3253
4187
|
if (result.exitCode !== 0) {
|
|
3254
|
-
const detail =
|
|
4188
|
+
const detail = pickDetail3(result.stderr, result.stdout);
|
|
3255
4189
|
const prefix = `Pi coding agent exited with code ${result.exitCode}`;
|
|
3256
4190
|
throw new Error(detail ? `${prefix}: ${detail}` : prefix);
|
|
3257
4191
|
}
|
|
3258
4192
|
const parsed = parsePiJsonl(result.stdout);
|
|
3259
|
-
const outputMessages =
|
|
4193
|
+
const outputMessages = extractOutputMessages2(parsed);
|
|
3260
4194
|
const assistantText = extractAssistantText2(outputMessages);
|
|
3261
4195
|
return {
|
|
3262
4196
|
raw: {
|
|
@@ -3282,7 +4216,7 @@ var PiCodingAgentProvider = class {
|
|
|
3282
4216
|
if (!this.config.cwd) {
|
|
3283
4217
|
return workspaceRoot;
|
|
3284
4218
|
}
|
|
3285
|
-
return
|
|
4219
|
+
return path11.resolve(this.config.cwd);
|
|
3286
4220
|
}
|
|
3287
4221
|
buildPiArgs(prompt, inputFiles) {
|
|
3288
4222
|
const args = [];
|
|
@@ -3312,7 +4246,7 @@ var PiCodingAgentProvider = class {
|
|
|
3312
4246
|
args.push(`@${file}`);
|
|
3313
4247
|
}
|
|
3314
4248
|
}
|
|
3315
|
-
const systemPrompt = this.config.systemPrompt ??
|
|
4249
|
+
const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT4;
|
|
3316
4250
|
const fullPrompt = `${systemPrompt}
|
|
3317
4251
|
|
|
3318
4252
|
${prompt}`;
|
|
@@ -3371,19 +4305,19 @@ ${prompt}`;
|
|
|
3371
4305
|
return env;
|
|
3372
4306
|
}
|
|
3373
4307
|
async createWorkspace() {
|
|
3374
|
-
return await
|
|
4308
|
+
return await mkdtemp3(path11.join(tmpdir3(), WORKSPACE_PREFIX3));
|
|
3375
4309
|
}
|
|
3376
4310
|
async cleanupWorkspace(workspaceRoot) {
|
|
3377
4311
|
try {
|
|
3378
|
-
await
|
|
4312
|
+
await rm3(workspaceRoot, { recursive: true, force: true });
|
|
3379
4313
|
} catch {
|
|
3380
4314
|
}
|
|
3381
4315
|
}
|
|
3382
4316
|
resolveLogDirectory() {
|
|
3383
4317
|
if (this.config.logDir) {
|
|
3384
|
-
return
|
|
4318
|
+
return path11.resolve(this.config.logDir);
|
|
3385
4319
|
}
|
|
3386
|
-
return
|
|
4320
|
+
return path11.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
|
|
3387
4321
|
}
|
|
3388
4322
|
async createStreamLogger(request) {
|
|
3389
4323
|
const logDir = this.resolveLogDirectory();
|
|
@@ -3391,13 +4325,13 @@ ${prompt}`;
|
|
|
3391
4325
|
return void 0;
|
|
3392
4326
|
}
|
|
3393
4327
|
try {
|
|
3394
|
-
await
|
|
4328
|
+
await mkdir3(logDir, { recursive: true });
|
|
3395
4329
|
} catch (error) {
|
|
3396
4330
|
const message = error instanceof Error ? error.message : String(error);
|
|
3397
4331
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
3398
4332
|
return void 0;
|
|
3399
4333
|
}
|
|
3400
|
-
const filePath =
|
|
4334
|
+
const filePath = path11.join(logDir, buildLogFilename3(request, this.targetName));
|
|
3401
4335
|
try {
|
|
3402
4336
|
const logger = await PiStreamLogger.create({
|
|
3403
4337
|
filePath,
|
|
@@ -3430,7 +4364,7 @@ var PiStreamLogger = class _PiStreamLogger {
|
|
|
3430
4364
|
constructor(filePath, format) {
|
|
3431
4365
|
this.filePath = filePath;
|
|
3432
4366
|
this.format = format;
|
|
3433
|
-
this.stream =
|
|
4367
|
+
this.stream = createWriteStream3(filePath, { flags: "a" });
|
|
3434
4368
|
}
|
|
3435
4369
|
static async create(options) {
|
|
3436
4370
|
const logger = new _PiStreamLogger(options.filePath, options.format);
|
|
@@ -3491,7 +4425,7 @@ var PiStreamLogger = class _PiStreamLogger {
|
|
|
3491
4425
|
return void 0;
|
|
3492
4426
|
}
|
|
3493
4427
|
const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source);
|
|
3494
|
-
return `[+${
|
|
4428
|
+
return `[+${formatElapsed3(this.startedAt)}] [${source}] ${message}`;
|
|
3495
4429
|
}
|
|
3496
4430
|
flushRemainder() {
|
|
3497
4431
|
const stdoutRemainder = this.stdoutBuffer.trim();
|
|
@@ -3514,18 +4448,18 @@ var PiStreamLogger = class _PiStreamLogger {
|
|
|
3514
4448
|
this.stderrBuffer = "";
|
|
3515
4449
|
}
|
|
3516
4450
|
};
|
|
3517
|
-
function
|
|
4451
|
+
function buildLogFilename3(request, targetName) {
|
|
3518
4452
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
3519
|
-
const evalId =
|
|
4453
|
+
const evalId = sanitizeForFilename3(request.evalCaseId ?? "pi");
|
|
3520
4454
|
const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
|
|
3521
|
-
const target =
|
|
3522
|
-
return `${timestamp}_${target}_${evalId}${attemptSuffix}_${
|
|
4455
|
+
const target = sanitizeForFilename3(targetName);
|
|
4456
|
+
return `${timestamp}_${target}_${evalId}${attemptSuffix}_${randomUUID3().slice(0, 8)}.log`;
|
|
3523
4457
|
}
|
|
3524
|
-
function
|
|
4458
|
+
function sanitizeForFilename3(value) {
|
|
3525
4459
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
3526
4460
|
return sanitized.length > 0 ? sanitized : "pi";
|
|
3527
4461
|
}
|
|
3528
|
-
function
|
|
4462
|
+
function formatElapsed3(startedAt) {
|
|
3529
4463
|
const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
|
|
3530
4464
|
const hours = Math.floor(elapsedSeconds / 3600);
|
|
3531
4465
|
const minutes = Math.floor(elapsedSeconds % 3600 / 60);
|
|
@@ -3536,7 +4470,7 @@ function formatElapsed2(startedAt) {
|
|
|
3536
4470
|
return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
|
|
3537
4471
|
}
|
|
3538
4472
|
function formatPiLogMessage(rawLine, source) {
|
|
3539
|
-
const parsed =
|
|
4473
|
+
const parsed = tryParseJsonValue3(rawLine);
|
|
3540
4474
|
if (parsed) {
|
|
3541
4475
|
const summary = summarizePiEvent(parsed);
|
|
3542
4476
|
if (summary) {
|
|
@@ -3549,7 +4483,7 @@ function formatPiLogMessage(rawLine, source) {
|
|
|
3549
4483
|
return rawLine;
|
|
3550
4484
|
}
|
|
3551
4485
|
function formatPiJsonLog(rawLine) {
|
|
3552
|
-
const parsed =
|
|
4486
|
+
const parsed = tryParseJsonValue3(rawLine);
|
|
3553
4487
|
if (!parsed) {
|
|
3554
4488
|
return rawLine;
|
|
3555
4489
|
}
|
|
@@ -3599,7 +4533,7 @@ function summarizePiEvent(event) {
|
|
|
3599
4533
|
return type;
|
|
3600
4534
|
}
|
|
3601
4535
|
}
|
|
3602
|
-
function
|
|
4536
|
+
function tryParseJsonValue3(rawLine) {
|
|
3603
4537
|
try {
|
|
3604
4538
|
return JSON.parse(rawLine);
|
|
3605
4539
|
} catch {
|
|
@@ -3624,7 +4558,7 @@ function parsePiJsonl(output) {
|
|
|
3624
4558
|
}
|
|
3625
4559
|
return parsed;
|
|
3626
4560
|
}
|
|
3627
|
-
function
|
|
4561
|
+
function extractOutputMessages2(events) {
|
|
3628
4562
|
for (let i = events.length - 1; i >= 0; i--) {
|
|
3629
4563
|
const event = events[i];
|
|
3630
4564
|
if (!event || typeof event !== "object") {
|
|
@@ -3665,8 +4599,8 @@ function convertPiMessage(message) {
|
|
|
3665
4599
|
if (typeof role !== "string") {
|
|
3666
4600
|
return void 0;
|
|
3667
4601
|
}
|
|
3668
|
-
const content =
|
|
3669
|
-
const toolCalls =
|
|
4602
|
+
const content = extractTextContent2(msg.content);
|
|
4603
|
+
const toolCalls = extractToolCalls2(msg.content);
|
|
3670
4604
|
const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
|
|
3671
4605
|
const metadata = {};
|
|
3672
4606
|
if (msg.api) metadata.api = msg.api;
|
|
@@ -3682,7 +4616,7 @@ function convertPiMessage(message) {
|
|
|
3682
4616
|
metadata: Object.keys(metadata).length > 0 ? metadata : void 0
|
|
3683
4617
|
};
|
|
3684
4618
|
}
|
|
3685
|
-
function
|
|
4619
|
+
function extractTextContent2(content) {
|
|
3686
4620
|
if (typeof content === "string") {
|
|
3687
4621
|
return content;
|
|
3688
4622
|
}
|
|
@@ -3701,7 +4635,7 @@ function extractTextContent(content) {
|
|
|
3701
4635
|
}
|
|
3702
4636
|
return textParts.length > 0 ? textParts.join("\n") : void 0;
|
|
3703
4637
|
}
|
|
3704
|
-
function
|
|
4638
|
+
function extractToolCalls2(content) {
|
|
3705
4639
|
if (!Array.isArray(content)) {
|
|
3706
4640
|
return [];
|
|
3707
4641
|
}
|
|
@@ -3746,7 +4680,7 @@ function extractAssistantText2(messages) {
|
|
|
3746
4680
|
function escapeAtSymbols(prompt) {
|
|
3747
4681
|
return prompt.replace(/@\[([^\]]+)\]:/g, "[[$1]]:");
|
|
3748
4682
|
}
|
|
3749
|
-
function
|
|
4683
|
+
function pickDetail3(stderr, stdout) {
|
|
3750
4684
|
const errorText = stderr.trim();
|
|
3751
4685
|
if (errorText.length > 0) {
|
|
3752
4686
|
return errorText;
|
|
@@ -3754,7 +4688,7 @@ function pickDetail2(stderr, stdout) {
|
|
|
3754
4688
|
const stdoutText = stdout.trim();
|
|
3755
4689
|
return stdoutText.length > 0 ? stdoutText : void 0;
|
|
3756
4690
|
}
|
|
3757
|
-
function
|
|
4691
|
+
function formatTimeoutSuffix4(timeoutMs) {
|
|
3758
4692
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
3759
4693
|
return "";
|
|
3760
4694
|
}
|
|
@@ -3767,7 +4701,7 @@ async function defaultPiRunner(options) {
|
|
|
3767
4701
|
const executable = parts[0];
|
|
3768
4702
|
const executableArgs = parts.slice(1);
|
|
3769
4703
|
const allArgs = [...executableArgs, ...options.args];
|
|
3770
|
-
const child =
|
|
4704
|
+
const child = spawn3(executable, allArgs, {
|
|
3771
4705
|
cwd: options.cwd,
|
|
3772
4706
|
env: options.env,
|
|
3773
4707
|
stdio: ["pipe", "pipe", "pipe"],
|
|
@@ -3830,7 +4764,7 @@ async function defaultPiRunner(options) {
|
|
|
3830
4764
|
}
|
|
3831
4765
|
|
|
3832
4766
|
// src/evaluation/providers/vscode.ts
|
|
3833
|
-
import
|
|
4767
|
+
import path12 from "node:path";
|
|
3834
4768
|
import {
|
|
3835
4769
|
dispatchAgentSession,
|
|
3836
4770
|
dispatchBatchAgent,
|
|
@@ -4005,7 +4939,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
4005
4939
|
return "";
|
|
4006
4940
|
}
|
|
4007
4941
|
const buildList = (files) => files.map((absolutePath) => {
|
|
4008
|
-
const fileName =
|
|
4942
|
+
const fileName = path12.basename(absolutePath);
|
|
4009
4943
|
const fileUri = pathToFileUri2(absolutePath);
|
|
4010
4944
|
return `* [${fileName}](${fileUri})`;
|
|
4011
4945
|
});
|
|
@@ -4030,8 +4964,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
4030
4964
|
}
|
|
4031
4965
|
const unique = /* @__PURE__ */ new Map();
|
|
4032
4966
|
for (const attachment of attachments) {
|
|
4033
|
-
const absolutePath =
|
|
4034
|
-
const normalized = absolutePath.split(
|
|
4967
|
+
const absolutePath = path12.resolve(attachment);
|
|
4968
|
+
const normalized = absolutePath.split(path12.sep).join("/");
|
|
4035
4969
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
4036
4970
|
if (!unique.has(absolutePath)) {
|
|
4037
4971
|
unique.set(absolutePath, absolutePath);
|
|
@@ -4046,7 +4980,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
4046
4980
|
}
|
|
4047
4981
|
const unique = /* @__PURE__ */ new Map();
|
|
4048
4982
|
for (const attachment of attachments) {
|
|
4049
|
-
const absolutePath =
|
|
4983
|
+
const absolutePath = path12.resolve(attachment);
|
|
4050
4984
|
if (!unique.has(absolutePath)) {
|
|
4051
4985
|
unique.set(absolutePath, absolutePath);
|
|
4052
4986
|
}
|
|
@@ -4054,7 +4988,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
4054
4988
|
return Array.from(unique.values());
|
|
4055
4989
|
}
|
|
4056
4990
|
function pathToFileUri2(filePath) {
|
|
4057
|
-
const absolutePath =
|
|
4991
|
+
const absolutePath = path12.isAbsolute(filePath) ? filePath : path12.resolve(filePath);
|
|
4058
4992
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
4059
4993
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
4060
4994
|
return `file:///${normalizedPath}`;
|
|
@@ -4067,7 +5001,7 @@ function normalizeAttachments(attachments) {
|
|
|
4067
5001
|
}
|
|
4068
5002
|
const deduped = /* @__PURE__ */ new Set();
|
|
4069
5003
|
for (const attachment of attachments) {
|
|
4070
|
-
deduped.add(
|
|
5004
|
+
deduped.add(path12.resolve(attachment));
|
|
4071
5005
|
}
|
|
4072
5006
|
return Array.from(deduped);
|
|
4073
5007
|
}
|
|
@@ -4076,7 +5010,7 @@ function mergeAttachments(all) {
|
|
|
4076
5010
|
for (const list of all) {
|
|
4077
5011
|
if (!list) continue;
|
|
4078
5012
|
for (const inputFile of list) {
|
|
4079
|
-
deduped.add(
|
|
5013
|
+
deduped.add(path12.resolve(inputFile));
|
|
4080
5014
|
}
|
|
4081
5015
|
}
|
|
4082
5016
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -4125,7 +5059,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
4125
5059
|
// src/evaluation/providers/targets-file.ts
|
|
4126
5060
|
import { constants as constants3 } from "node:fs";
|
|
4127
5061
|
import { access as access3, readFile as readFile6 } from "node:fs/promises";
|
|
4128
|
-
import
|
|
5062
|
+
import path13 from "node:path";
|
|
4129
5063
|
import { parse as parse3 } from "yaml";
|
|
4130
5064
|
function isRecord(value) {
|
|
4131
5065
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -4162,7 +5096,7 @@ async function fileExists3(filePath) {
|
|
|
4162
5096
|
}
|
|
4163
5097
|
}
|
|
4164
5098
|
async function readTargetDefinitions(filePath) {
|
|
4165
|
-
const absolutePath =
|
|
5099
|
+
const absolutePath = path13.resolve(filePath);
|
|
4166
5100
|
if (!await fileExists3(absolutePath)) {
|
|
4167
5101
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
4168
5102
|
}
|
|
@@ -4196,6 +5130,8 @@ function createProvider(target) {
|
|
|
4196
5130
|
return new CodexProvider(target.name, target.config);
|
|
4197
5131
|
case "pi-coding-agent":
|
|
4198
5132
|
return new PiCodingAgentProvider(target.name, target.config);
|
|
5133
|
+
case "claude-code":
|
|
5134
|
+
return new ClaudeCodeProvider(target.name, target.config);
|
|
4199
5135
|
case "mock":
|
|
4200
5136
|
return new MockProvider(target.name, target.config);
|
|
4201
5137
|
case "vscode":
|
|
@@ -4214,73 +5150,193 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
4214
5150
|
|
|
4215
5151
|
// src/evaluation/evaluators.ts
|
|
4216
5152
|
import { generateText as generateText2 } from "ai";
|
|
4217
|
-
import { z } from "zod";
|
|
5153
|
+
import { z as z2 } from "zod";
|
|
4218
5154
|
|
|
4219
5155
|
// src/runtime/exec.ts
|
|
4220
|
-
function
|
|
4221
|
-
|
|
4222
|
-
|
|
5156
|
+
function shellEscapePath(value) {
|
|
5157
|
+
if (process.platform === "win32") {
|
|
5158
|
+
return `"${value.replaceAll('"', '""')}"`;
|
|
5159
|
+
}
|
|
5160
|
+
return `'${value.replaceAll("'", `'"'"'`)}'`;
|
|
4223
5161
|
}
|
|
4224
|
-
async function
|
|
4225
|
-
|
|
4226
|
-
|
|
4227
|
-
|
|
4228
|
-
|
|
4229
|
-
|
|
4230
|
-
|
|
4231
|
-
|
|
4232
|
-
|
|
4233
|
-
|
|
4234
|
-
|
|
4235
|
-
|
|
4236
|
-
|
|
4237
|
-
|
|
4238
|
-
|
|
4239
|
-
|
|
4240
|
-
|
|
4241
|
-
|
|
4242
|
-
|
|
4243
|
-
|
|
4244
|
-
|
|
4245
|
-
|
|
4246
|
-
|
|
5162
|
+
async function execFileWithStdin(argv, stdinPayload, options = {}) {
|
|
5163
|
+
if (argv.length === 0) {
|
|
5164
|
+
throw new Error("Executable argv must include at least one entry");
|
|
5165
|
+
}
|
|
5166
|
+
if (typeof Bun !== "undefined") {
|
|
5167
|
+
return execFileWithStdinBun(argv, stdinPayload, options);
|
|
5168
|
+
}
|
|
5169
|
+
return execFileWithStdinNode(argv, stdinPayload, options);
|
|
5170
|
+
}
|
|
5171
|
+
async function execFileWithStdinBun(argv, stdinPayload, options) {
|
|
5172
|
+
const command = [...argv];
|
|
5173
|
+
const encoder = new TextEncoder();
|
|
5174
|
+
const proc = Bun.spawn(command, {
|
|
5175
|
+
cwd: options.cwd,
|
|
5176
|
+
stdin: encoder.encode(stdinPayload),
|
|
5177
|
+
stdout: "pipe",
|
|
5178
|
+
stderr: "pipe"
|
|
5179
|
+
});
|
|
5180
|
+
let timedOut = false;
|
|
5181
|
+
const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
|
|
5182
|
+
timedOut = true;
|
|
5183
|
+
proc.kill("SIGKILL");
|
|
5184
|
+
}, options.timeoutMs) : void 0;
|
|
5185
|
+
try {
|
|
5186
|
+
const stdoutPromise = proc.stdout ? new Response(proc.stdout).text() : Promise.resolve("");
|
|
5187
|
+
const stderrPromise = proc.stderr ? new Response(proc.stderr).text() : Promise.resolve("");
|
|
5188
|
+
const [stdout, stderr, exitCode] = await Promise.all([
|
|
5189
|
+
stdoutPromise,
|
|
5190
|
+
stderrPromise,
|
|
5191
|
+
proc.exited
|
|
5192
|
+
]);
|
|
5193
|
+
if (timedOut) {
|
|
5194
|
+
throw new Error(`Process timed out after ${options.timeoutMs}ms`);
|
|
5195
|
+
}
|
|
5196
|
+
return {
|
|
5197
|
+
stdout: stdout.replace(/\r\n/g, "\n"),
|
|
5198
|
+
stderr: stderr.replace(/\r\n/g, "\n"),
|
|
5199
|
+
exitCode
|
|
5200
|
+
};
|
|
5201
|
+
} finally {
|
|
5202
|
+
if (timeout !== void 0) {
|
|
5203
|
+
clearTimeout(timeout);
|
|
4247
5204
|
}
|
|
4248
5205
|
}
|
|
4249
|
-
|
|
4250
|
-
|
|
4251
|
-
|
|
4252
|
-
|
|
5206
|
+
}
|
|
5207
|
+
async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
5208
|
+
const { spawn: spawn4 } = await import("node:child_process");
|
|
5209
|
+
return new Promise((resolve, reject) => {
|
|
5210
|
+
const [cmd, ...args] = argv;
|
|
5211
|
+
const child = spawn4(cmd, args, {
|
|
4253
5212
|
cwd: options.cwd,
|
|
4254
5213
|
stdio: ["pipe", "pipe", "pipe"]
|
|
4255
5214
|
});
|
|
4256
|
-
|
|
4257
|
-
|
|
4258
|
-
|
|
4259
|
-
|
|
4260
|
-
|
|
5215
|
+
const stdoutChunks = [];
|
|
5216
|
+
const stderrChunks = [];
|
|
5217
|
+
child.stdout?.on("data", (chunk) => stdoutChunks.push(chunk));
|
|
5218
|
+
child.stderr?.on("data", (chunk) => stderrChunks.push(chunk));
|
|
5219
|
+
let timedOut = false;
|
|
5220
|
+
const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
|
|
5221
|
+
timedOut = true;
|
|
5222
|
+
child.kill("SIGKILL");
|
|
4261
5223
|
}, options.timeoutMs) : void 0;
|
|
4262
|
-
child.stdout?.on("data", (data) => {
|
|
4263
|
-
stdout += data.toString();
|
|
4264
|
-
});
|
|
4265
|
-
child.stderr?.on("data", (data) => {
|
|
4266
|
-
stderr += data.toString();
|
|
4267
|
-
});
|
|
4268
5224
|
child.on("error", (error) => {
|
|
4269
|
-
if (timeout !== void 0)
|
|
4270
|
-
clearTimeout(timeout);
|
|
4271
|
-
}
|
|
5225
|
+
if (timeout !== void 0) clearTimeout(timeout);
|
|
4272
5226
|
reject(error);
|
|
4273
5227
|
});
|
|
4274
|
-
child.on("
|
|
4275
|
-
if (timeout !== void 0)
|
|
4276
|
-
|
|
5228
|
+
child.on("close", (code) => {
|
|
5229
|
+
if (timeout !== void 0) clearTimeout(timeout);
|
|
5230
|
+
if (timedOut) {
|
|
5231
|
+
reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
|
|
5232
|
+
return;
|
|
4277
5233
|
}
|
|
4278
|
-
|
|
5234
|
+
const stdout = Buffer.concat(stdoutChunks).toString("utf8").replace(/\r\n/g, "\n");
|
|
5235
|
+
const stderr = Buffer.concat(stderrChunks).toString("utf8").replace(/\r\n/g, "\n");
|
|
5236
|
+
resolve({
|
|
5237
|
+
stdout,
|
|
5238
|
+
stderr,
|
|
5239
|
+
exitCode: code ?? 0
|
|
5240
|
+
});
|
|
4279
5241
|
});
|
|
4280
|
-
child.stdin
|
|
4281
|
-
|
|
5242
|
+
if (child.stdin) {
|
|
5243
|
+
child.stdin.write(stdinPayload);
|
|
5244
|
+
child.stdin.end();
|
|
5245
|
+
}
|
|
4282
5246
|
});
|
|
4283
5247
|
}
|
|
5248
|
+
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
5249
|
+
const { mkdir: mkdir4, readFile: readFile7, rm: rm4, writeFile: writeFile4 } = await import("node:fs/promises");
|
|
5250
|
+
const { tmpdir: tmpdir4 } = await import("node:os");
|
|
5251
|
+
const path15 = await import("node:path");
|
|
5252
|
+
const { randomUUID: randomUUID4 } = await import("node:crypto");
|
|
5253
|
+
const dir = path15.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
|
|
5254
|
+
await mkdir4(dir, { recursive: true });
|
|
5255
|
+
const stdinPath = path15.join(dir, "stdin.txt");
|
|
5256
|
+
const stdoutPath = path15.join(dir, "stdout.txt");
|
|
5257
|
+
const stderrPath = path15.join(dir, "stderr.txt");
|
|
5258
|
+
await writeFile4(stdinPath, stdinPayload, "utf8");
|
|
5259
|
+
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
5260
|
+
const { spawn: spawn4 } = await import("node:child_process");
|
|
5261
|
+
try {
|
|
5262
|
+
const exitCode = await new Promise((resolve, reject) => {
|
|
5263
|
+
const child = spawn4(wrappedCommand, {
|
|
5264
|
+
shell: true,
|
|
5265
|
+
cwd: options.cwd,
|
|
5266
|
+
stdio: ["ignore", "ignore", "ignore"]
|
|
5267
|
+
});
|
|
5268
|
+
const timeout = options.timeoutMs ? setTimeout(() => {
|
|
5269
|
+
child.kill();
|
|
5270
|
+
reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
|
|
5271
|
+
}, options.timeoutMs) : void 0;
|
|
5272
|
+
child.on("error", (error) => {
|
|
5273
|
+
if (timeout !== void 0) {
|
|
5274
|
+
clearTimeout(timeout);
|
|
5275
|
+
}
|
|
5276
|
+
reject(error);
|
|
5277
|
+
});
|
|
5278
|
+
child.on("exit", (code) => {
|
|
5279
|
+
if (timeout !== void 0) {
|
|
5280
|
+
clearTimeout(timeout);
|
|
5281
|
+
}
|
|
5282
|
+
resolve(code ?? 0);
|
|
5283
|
+
});
|
|
5284
|
+
});
|
|
5285
|
+
const stdout = (await readFile7(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
|
|
5286
|
+
const stderr = (await readFile7(stderrPath, "utf8")).replace(/\r\n/g, "\n");
|
|
5287
|
+
return { stdout, stderr, exitCode };
|
|
5288
|
+
} finally {
|
|
5289
|
+
await rm4(dir, { recursive: true, force: true });
|
|
5290
|
+
}
|
|
5291
|
+
}
|
|
5292
|
+
|
|
5293
|
+
// src/evaluation/case-conversion.ts
|
|
5294
|
+
function toSnakeCase(str) {
|
|
5295
|
+
if (/^[A-Z]/.test(str)) {
|
|
5296
|
+
return str;
|
|
5297
|
+
}
|
|
5298
|
+
return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
|
|
5299
|
+
}
|
|
5300
|
+
function toCamelCase(str) {
|
|
5301
|
+
if (/^[A-Z]/.test(str)) {
|
|
5302
|
+
return str;
|
|
5303
|
+
}
|
|
5304
|
+
return str.replace(/_([a-z0-9])/g, (_, letter) => letter.toUpperCase());
|
|
5305
|
+
}
|
|
5306
|
+
function toSnakeCaseDeep(obj) {
|
|
5307
|
+
if (obj === null || obj === void 0) {
|
|
5308
|
+
return obj;
|
|
5309
|
+
}
|
|
5310
|
+
if (Array.isArray(obj)) {
|
|
5311
|
+
return obj.map((item) => toSnakeCaseDeep(item));
|
|
5312
|
+
}
|
|
5313
|
+
if (typeof obj === "object") {
|
|
5314
|
+
const result = {};
|
|
5315
|
+
for (const [key, value] of Object.entries(obj)) {
|
|
5316
|
+
const snakeKey = toSnakeCase(key);
|
|
5317
|
+
result[snakeKey] = toSnakeCaseDeep(value);
|
|
5318
|
+
}
|
|
5319
|
+
return result;
|
|
5320
|
+
}
|
|
5321
|
+
return obj;
|
|
5322
|
+
}
|
|
5323
|
+
function toCamelCaseDeep(obj) {
|
|
5324
|
+
if (obj === null || obj === void 0) {
|
|
5325
|
+
return obj;
|
|
5326
|
+
}
|
|
5327
|
+
if (Array.isArray(obj)) {
|
|
5328
|
+
return obj.map((item) => toCamelCaseDeep(item));
|
|
5329
|
+
}
|
|
5330
|
+
if (typeof obj === "object") {
|
|
5331
|
+
const result = {};
|
|
5332
|
+
for (const [key, value] of Object.entries(obj)) {
|
|
5333
|
+
const camelKey = toCamelCase(key);
|
|
5334
|
+
result[camelKey] = toCamelCaseDeep(value);
|
|
5335
|
+
}
|
|
5336
|
+
return result;
|
|
5337
|
+
}
|
|
5338
|
+
return obj;
|
|
5339
|
+
}
|
|
4284
5340
|
|
|
4285
5341
|
// src/evaluation/evaluators.ts
|
|
4286
5342
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
@@ -4300,20 +5356,20 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
4300
5356
|
|
|
4301
5357
|
[[ ## candidate_answer ## ]]
|
|
4302
5358
|
{{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
|
|
4303
|
-
var freeformEvaluationSchema =
|
|
4304
|
-
score:
|
|
4305
|
-
hits:
|
|
4306
|
-
misses:
|
|
4307
|
-
reasoning:
|
|
5359
|
+
var freeformEvaluationSchema = z2.object({
|
|
5360
|
+
score: z2.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
|
|
5361
|
+
hits: z2.array(z2.string()).describe("Brief specific achievements").optional(),
|
|
5362
|
+
misses: z2.array(z2.string()).describe("Brief failures or omissions").optional(),
|
|
5363
|
+
reasoning: z2.string().describe("Concise explanation (1-2 sentences)").optional()
|
|
4308
5364
|
});
|
|
4309
|
-
var rubricCheckResultSchema =
|
|
4310
|
-
id:
|
|
4311
|
-
satisfied:
|
|
4312
|
-
reasoning:
|
|
5365
|
+
var rubricCheckResultSchema = z2.object({
|
|
5366
|
+
id: z2.string().describe("The ID of the rubric item being checked"),
|
|
5367
|
+
satisfied: z2.boolean().describe("Whether this rubric requirement is met"),
|
|
5368
|
+
reasoning: z2.string().describe("Brief explanation (1-2 sentences) for this check")
|
|
4313
5369
|
});
|
|
4314
|
-
var rubricEvaluationSchema =
|
|
4315
|
-
checks:
|
|
4316
|
-
overall_reasoning:
|
|
5370
|
+
var rubricEvaluationSchema = z2.object({
|
|
5371
|
+
checks: z2.array(rubricCheckResultSchema).describe("Results for each rubric item"),
|
|
5372
|
+
overall_reasoning: z2.string().describe("Overall assessment summary (1-2 sentences)")
|
|
4317
5373
|
});
|
|
4318
5374
|
var LlmJudgeEvaluator = class {
|
|
4319
5375
|
kind = "llm_judge";
|
|
@@ -4549,30 +5605,30 @@ var CodeEvaluator = class {
|
|
|
4549
5605
|
script;
|
|
4550
5606
|
cwd;
|
|
4551
5607
|
agentTimeoutMs;
|
|
5608
|
+
config;
|
|
4552
5609
|
constructor(options) {
|
|
4553
5610
|
this.script = options.script;
|
|
4554
5611
|
this.cwd = options.cwd;
|
|
4555
5612
|
this.agentTimeoutMs = options.agentTimeoutMs;
|
|
5613
|
+
this.config = options.config;
|
|
4556
5614
|
}
|
|
4557
5615
|
async evaluate(context) {
|
|
4558
|
-
const
|
|
4559
|
-
|
|
4560
|
-
|
|
4561
|
-
|
|
4562
|
-
|
|
4563
|
-
|
|
4564
|
-
|
|
4565
|
-
|
|
4566
|
-
|
|
4567
|
-
|
|
4568
|
-
|
|
4569
|
-
|
|
4570
|
-
|
|
4571
|
-
|
|
4572
|
-
|
|
4573
|
-
|
|
4574
|
-
2
|
|
4575
|
-
);
|
|
5616
|
+
const payload = {
|
|
5617
|
+
question: context.evalCase.question,
|
|
5618
|
+
expectedOutcome: context.evalCase.expected_outcome,
|
|
5619
|
+
expectedMessages: context.evalCase.expected_messages,
|
|
5620
|
+
referenceAnswer: context.evalCase.reference_answer,
|
|
5621
|
+
candidateAnswer: context.candidate,
|
|
5622
|
+
outputMessages: context.outputMessages ?? null,
|
|
5623
|
+
guidelineFiles: context.evalCase.guideline_paths,
|
|
5624
|
+
inputFiles: context.evalCase.file_paths.filter(
|
|
5625
|
+
(path15) => !context.evalCase.guideline_paths.includes(path15)
|
|
5626
|
+
),
|
|
5627
|
+
inputMessages: context.evalCase.input_messages,
|
|
5628
|
+
traceSummary: context.traceSummary ?? null,
|
|
5629
|
+
config: this.config ?? null
|
|
5630
|
+
};
|
|
5631
|
+
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
4576
5632
|
try {
|
|
4577
5633
|
const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
|
|
4578
5634
|
const parsed = parseJsonSafe(stdout);
|
|
@@ -4638,18 +5694,25 @@ function calculateRubricScore(result, rubrics) {
|
|
|
4638
5694
|
return { score, verdict, hits, misses };
|
|
4639
5695
|
}
|
|
4640
5696
|
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
|
|
4641
|
-
const { stdout, stderr, exitCode } = await execShellWithStdin(scriptPath, input, {
|
|
4642
|
-
cwd,
|
|
4643
|
-
timeoutMs: agentTimeoutMs
|
|
4644
|
-
});
|
|
5697
|
+
const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs });
|
|
4645
5698
|
if (exitCode !== 0) {
|
|
4646
|
-
const trimmedErr = stderr
|
|
5699
|
+
const trimmedErr = formatStderr(stderr);
|
|
4647
5700
|
throw new Error(
|
|
4648
5701
|
trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
|
|
4649
5702
|
);
|
|
4650
5703
|
}
|
|
4651
5704
|
return stdout.trim();
|
|
4652
5705
|
}
|
|
5706
|
+
function formatStderr(stderr) {
|
|
5707
|
+
const trimmed = stderr.trim();
|
|
5708
|
+
const maxLength = 2e3;
|
|
5709
|
+
if (trimmed.length <= maxLength) {
|
|
5710
|
+
return trimmed;
|
|
5711
|
+
}
|
|
5712
|
+
const tail = trimmed.slice(-maxLength);
|
|
5713
|
+
return `...(truncated, last ${maxLength} chars)
|
|
5714
|
+
${tail}`;
|
|
5715
|
+
}
|
|
4653
5716
|
function parseJsonSafe(payload) {
|
|
4654
5717
|
try {
|
|
4655
5718
|
return JSON.parse(payload);
|
|
@@ -4881,22 +5944,438 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4881
5944
|
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
4882
5945
|
}
|
|
4883
5946
|
} else {
|
|
4884
|
-
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
5947
|
+
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
5948
|
+
}
|
|
5949
|
+
}
|
|
5950
|
+
for (let i = checkLength; i < expected.length; i++) {
|
|
5951
|
+
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
5952
|
+
}
|
|
5953
|
+
const score = hits.length / expected.length;
|
|
5954
|
+
return {
|
|
5955
|
+
score,
|
|
5956
|
+
verdict: scoreToVerdict(score),
|
|
5957
|
+
hits,
|
|
5958
|
+
misses,
|
|
5959
|
+
expectedAspectCount: expected.length
|
|
5960
|
+
};
|
|
5961
|
+
}
|
|
5962
|
+
};
|
|
5963
|
+
var DEFAULT_DATE_FORMATS = [
|
|
5964
|
+
"YYYY-MM-DDTHH:mm:ssZ",
|
|
5965
|
+
// ISO with timezone
|
|
5966
|
+
"YYYY-MM-DDTHH:mm:ss",
|
|
5967
|
+
// ISO with time
|
|
5968
|
+
"YYYY-MM-DD",
|
|
5969
|
+
// ISO date
|
|
5970
|
+
"DD-MMM-YYYY",
|
|
5971
|
+
// Localized (e.g., "15-JAN-2025")
|
|
5972
|
+
"MM/DD/YYYY",
|
|
5973
|
+
// US format
|
|
5974
|
+
"DD/MM/YYYY",
|
|
5975
|
+
// EU format
|
|
5976
|
+
"MM-DD-YYYY",
|
|
5977
|
+
// US with dashes
|
|
5978
|
+
"DD-MM-YYYY"
|
|
5979
|
+
// EU with dashes
|
|
5980
|
+
];
|
|
5981
|
+
var MONTH_NAMES = {
|
|
5982
|
+
jan: 0,
|
|
5983
|
+
january: 0,
|
|
5984
|
+
feb: 1,
|
|
5985
|
+
february: 1,
|
|
5986
|
+
mar: 2,
|
|
5987
|
+
march: 2,
|
|
5988
|
+
apr: 3,
|
|
5989
|
+
april: 3,
|
|
5990
|
+
may: 4,
|
|
5991
|
+
jun: 5,
|
|
5992
|
+
june: 5,
|
|
5993
|
+
jul: 6,
|
|
5994
|
+
july: 6,
|
|
5995
|
+
aug: 7,
|
|
5996
|
+
august: 7,
|
|
5997
|
+
sep: 8,
|
|
5998
|
+
sept: 8,
|
|
5999
|
+
september: 8,
|
|
6000
|
+
oct: 9,
|
|
6001
|
+
october: 9,
|
|
6002
|
+
nov: 10,
|
|
6003
|
+
november: 10,
|
|
6004
|
+
dec: 11,
|
|
6005
|
+
december: 11
|
|
6006
|
+
};
|
|
6007
|
+
var FieldAccuracyEvaluator = class {
|
|
6008
|
+
kind = "field_accuracy";
|
|
6009
|
+
config;
|
|
6010
|
+
constructor(options) {
|
|
6011
|
+
this.config = options.config;
|
|
6012
|
+
}
|
|
6013
|
+
evaluate(context) {
|
|
6014
|
+
const { evalCase, candidate } = context;
|
|
6015
|
+
let candidateData;
|
|
6016
|
+
try {
|
|
6017
|
+
candidateData = parseJsonFromTextSafe(candidate);
|
|
6018
|
+
} catch {
|
|
6019
|
+
return {
|
|
6020
|
+
score: 0,
|
|
6021
|
+
verdict: "fail",
|
|
6022
|
+
hits: [],
|
|
6023
|
+
misses: ["Failed to parse candidate answer as JSON"],
|
|
6024
|
+
expectedAspectCount: this.config.fields.length,
|
|
6025
|
+
reasoning: "Candidate answer is not valid JSON"
|
|
6026
|
+
};
|
|
6027
|
+
}
|
|
6028
|
+
const expectedData = this.extractExpectedData(evalCase.expected_messages);
|
|
6029
|
+
if (!expectedData) {
|
|
6030
|
+
return {
|
|
6031
|
+
score: 0,
|
|
6032
|
+
verdict: "fail",
|
|
6033
|
+
hits: [],
|
|
6034
|
+
misses: ["No expected data found in expected_messages"],
|
|
6035
|
+
expectedAspectCount: this.config.fields.length,
|
|
6036
|
+
reasoning: "Could not extract expected data from expected_messages"
|
|
6037
|
+
};
|
|
6038
|
+
}
|
|
6039
|
+
const fieldResults = [];
|
|
6040
|
+
for (const fieldConfig of this.config.fields) {
|
|
6041
|
+
const result = this.evaluateField(fieldConfig, candidateData, expectedData);
|
|
6042
|
+
fieldResults.push(result);
|
|
6043
|
+
}
|
|
6044
|
+
return this.aggregateResults(fieldResults);
|
|
6045
|
+
}
|
|
6046
|
+
/**
|
|
6047
|
+
* Extract expected data from expected_messages array.
|
|
6048
|
+
* Looks for the last assistant message with content.
|
|
6049
|
+
*/
|
|
6050
|
+
extractExpectedData(expectedMessages) {
|
|
6051
|
+
for (let i = expectedMessages.length - 1; i >= 0; i--) {
|
|
6052
|
+
const message = expectedMessages[i];
|
|
6053
|
+
if (message.role === "assistant" && message.content) {
|
|
6054
|
+
if (typeof message.content === "object" && message.content !== null) {
|
|
6055
|
+
return message.content;
|
|
6056
|
+
}
|
|
6057
|
+
if (typeof message.content === "string") {
|
|
6058
|
+
try {
|
|
6059
|
+
return parseJsonFromTextSafe(message.content);
|
|
6060
|
+
} catch {
|
|
6061
|
+
}
|
|
6062
|
+
}
|
|
6063
|
+
}
|
|
6064
|
+
}
|
|
6065
|
+
return void 0;
|
|
6066
|
+
}
|
|
6067
|
+
/**
|
|
6068
|
+
* Evaluate a single field against the expected value.
|
|
6069
|
+
*/
|
|
6070
|
+
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
6071
|
+
const { path: path15, match, required = true, weight = 1 } = fieldConfig;
|
|
6072
|
+
const candidateValue = resolvePath(candidateData, path15);
|
|
6073
|
+
const expectedValue = resolvePath(expectedData, path15);
|
|
6074
|
+
if (expectedValue === void 0) {
|
|
6075
|
+
return {
|
|
6076
|
+
path: path15,
|
|
6077
|
+
score: 1,
|
|
6078
|
+
// No expected value means no comparison needed
|
|
6079
|
+
weight,
|
|
6080
|
+
hit: true,
|
|
6081
|
+
message: `${path15}: no expected value`
|
|
6082
|
+
};
|
|
6083
|
+
}
|
|
6084
|
+
if (candidateValue === void 0) {
|
|
6085
|
+
if (required) {
|
|
6086
|
+
return {
|
|
6087
|
+
path: path15,
|
|
6088
|
+
score: 0,
|
|
6089
|
+
weight,
|
|
6090
|
+
hit: false,
|
|
6091
|
+
message: `${path15} (required, missing)`
|
|
6092
|
+
};
|
|
6093
|
+
}
|
|
6094
|
+
return {
|
|
6095
|
+
path: path15,
|
|
6096
|
+
score: 1,
|
|
6097
|
+
// Don't penalize missing optional fields
|
|
6098
|
+
weight: 0,
|
|
6099
|
+
// Zero weight means it won't affect the score
|
|
6100
|
+
hit: true,
|
|
6101
|
+
message: `${path15}: optional field missing`
|
|
6102
|
+
};
|
|
6103
|
+
}
|
|
6104
|
+
switch (match) {
|
|
6105
|
+
case "exact":
|
|
6106
|
+
return this.compareExact(path15, candidateValue, expectedValue, weight);
|
|
6107
|
+
case "numeric_tolerance":
|
|
6108
|
+
return this.compareNumericTolerance(
|
|
6109
|
+
path15,
|
|
6110
|
+
candidateValue,
|
|
6111
|
+
expectedValue,
|
|
6112
|
+
fieldConfig,
|
|
6113
|
+
weight
|
|
6114
|
+
);
|
|
6115
|
+
case "date":
|
|
6116
|
+
return this.compareDate(path15, candidateValue, expectedValue, fieldConfig, weight);
|
|
6117
|
+
default:
|
|
6118
|
+
return {
|
|
6119
|
+
path: path15,
|
|
6120
|
+
score: 0,
|
|
6121
|
+
weight,
|
|
6122
|
+
hit: false,
|
|
6123
|
+
message: `${path15}: unknown match type "${match}"`
|
|
6124
|
+
};
|
|
6125
|
+
}
|
|
6126
|
+
}
|
|
6127
|
+
/**
|
|
6128
|
+
* Exact equality comparison.
|
|
6129
|
+
*/
|
|
6130
|
+
compareExact(path15, candidateValue, expectedValue, weight) {
|
|
6131
|
+
if (deepEqual(candidateValue, expectedValue)) {
|
|
6132
|
+
return {
|
|
6133
|
+
path: path15,
|
|
6134
|
+
score: 1,
|
|
6135
|
+
weight,
|
|
6136
|
+
hit: true,
|
|
6137
|
+
message: path15
|
|
6138
|
+
};
|
|
6139
|
+
}
|
|
6140
|
+
if (typeof candidateValue !== typeof expectedValue) {
|
|
6141
|
+
return {
|
|
6142
|
+
path: path15,
|
|
6143
|
+
score: 0,
|
|
6144
|
+
weight,
|
|
6145
|
+
hit: false,
|
|
6146
|
+
message: `${path15} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
6147
|
+
};
|
|
6148
|
+
}
|
|
6149
|
+
return {
|
|
6150
|
+
path: path15,
|
|
6151
|
+
score: 0,
|
|
6152
|
+
weight,
|
|
6153
|
+
hit: false,
|
|
6154
|
+
message: `${path15} (value mismatch)`
|
|
6155
|
+
};
|
|
6156
|
+
}
|
|
6157
|
+
/**
|
|
6158
|
+
* Numeric comparison with absolute or relative tolerance.
|
|
6159
|
+
*/
|
|
6160
|
+
compareNumericTolerance(path15, candidateValue, expectedValue, fieldConfig, weight) {
|
|
6161
|
+
const { tolerance = 0, relative = false } = fieldConfig;
|
|
6162
|
+
const candidateNum = toNumber(candidateValue);
|
|
6163
|
+
const expectedNum = toNumber(expectedValue);
|
|
6164
|
+
if (candidateNum === null || expectedNum === null) {
|
|
6165
|
+
return {
|
|
6166
|
+
path: path15,
|
|
6167
|
+
score: 0,
|
|
6168
|
+
weight,
|
|
6169
|
+
hit: false,
|
|
6170
|
+
message: `${path15} (non-numeric value)`
|
|
6171
|
+
};
|
|
6172
|
+
}
|
|
6173
|
+
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
6174
|
+
return {
|
|
6175
|
+
path: path15,
|
|
6176
|
+
score: 0,
|
|
6177
|
+
weight,
|
|
6178
|
+
hit: false,
|
|
6179
|
+
message: `${path15} (invalid numeric value)`
|
|
6180
|
+
};
|
|
6181
|
+
}
|
|
6182
|
+
const diff = Math.abs(candidateNum - expectedNum);
|
|
6183
|
+
let withinTolerance;
|
|
6184
|
+
if (relative) {
|
|
6185
|
+
const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
|
|
6186
|
+
withinTolerance = relativeDiff <= tolerance;
|
|
6187
|
+
} else {
|
|
6188
|
+
withinTolerance = diff <= tolerance;
|
|
6189
|
+
}
|
|
6190
|
+
if (withinTolerance) {
|
|
6191
|
+
return {
|
|
6192
|
+
path: path15,
|
|
6193
|
+
score: 1,
|
|
6194
|
+
weight,
|
|
6195
|
+
hit: true,
|
|
6196
|
+
message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
|
|
6197
|
+
};
|
|
6198
|
+
}
|
|
6199
|
+
return {
|
|
6200
|
+
path: path15,
|
|
6201
|
+
score: 0,
|
|
6202
|
+
weight,
|
|
6203
|
+
hit: false,
|
|
6204
|
+
message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
6205
|
+
};
|
|
6206
|
+
}
|
|
6207
|
+
/**
|
|
6208
|
+
* Date comparison with format normalization.
|
|
6209
|
+
*/
|
|
6210
|
+
compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
|
|
6211
|
+
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
6212
|
+
const candidateDate = parseDate(String(candidateValue), formats);
|
|
6213
|
+
const expectedDate = parseDate(String(expectedValue), formats);
|
|
6214
|
+
if (candidateDate === null) {
|
|
6215
|
+
return {
|
|
6216
|
+
path: path15,
|
|
6217
|
+
score: 0,
|
|
6218
|
+
weight,
|
|
6219
|
+
hit: false,
|
|
6220
|
+
message: `${path15} (unparseable candidate date)`
|
|
6221
|
+
};
|
|
6222
|
+
}
|
|
6223
|
+
if (expectedDate === null) {
|
|
6224
|
+
return {
|
|
6225
|
+
path: path15,
|
|
6226
|
+
score: 0,
|
|
6227
|
+
weight,
|
|
6228
|
+
hit: false,
|
|
6229
|
+
message: `${path15} (unparseable expected date)`
|
|
6230
|
+
};
|
|
6231
|
+
}
|
|
6232
|
+
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
6233
|
+
return {
|
|
6234
|
+
path: path15,
|
|
6235
|
+
score: 1,
|
|
6236
|
+
weight,
|
|
6237
|
+
hit: true,
|
|
6238
|
+
message: path15
|
|
6239
|
+
};
|
|
6240
|
+
}
|
|
6241
|
+
return {
|
|
6242
|
+
path: path15,
|
|
6243
|
+
score: 0,
|
|
6244
|
+
weight,
|
|
6245
|
+
hit: false,
|
|
6246
|
+
message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
6247
|
+
};
|
|
6248
|
+
}
|
|
6249
|
+
/**
|
|
6250
|
+
* Aggregate field results using configured strategy.
|
|
6251
|
+
*/
|
|
6252
|
+
aggregateResults(results) {
|
|
6253
|
+
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
6254
|
+
const hits = [];
|
|
6255
|
+
const misses = [];
|
|
6256
|
+
for (const result of results) {
|
|
6257
|
+
if (result.hit) {
|
|
6258
|
+
hits.push(result.message);
|
|
6259
|
+
} else {
|
|
6260
|
+
misses.push(result.message);
|
|
4885
6261
|
}
|
|
4886
6262
|
}
|
|
4887
|
-
|
|
4888
|
-
|
|
6263
|
+
let score;
|
|
6264
|
+
if (aggregation === "all_or_nothing") {
|
|
6265
|
+
score = misses.length === 0 ? 1 : 0;
|
|
6266
|
+
} else {
|
|
6267
|
+
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
6268
|
+
if (totalWeight === 0) {
|
|
6269
|
+
score = results.length === 0 ? 1 : 0;
|
|
6270
|
+
} else {
|
|
6271
|
+
const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
|
|
6272
|
+
score = weightedSum / totalWeight;
|
|
6273
|
+
}
|
|
4889
6274
|
}
|
|
4890
|
-
const
|
|
6275
|
+
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
4891
6276
|
return {
|
|
4892
|
-
score,
|
|
6277
|
+
score: clampScore(score),
|
|
4893
6278
|
verdict: scoreToVerdict(score),
|
|
4894
|
-
hits,
|
|
4895
|
-
misses,
|
|
4896
|
-
expectedAspectCount:
|
|
6279
|
+
hits: hits.slice(0, 4),
|
|
6280
|
+
misses: misses.slice(0, 4),
|
|
6281
|
+
expectedAspectCount: results.length,
|
|
6282
|
+
reasoning
|
|
4897
6283
|
};
|
|
4898
6284
|
}
|
|
4899
6285
|
};
|
|
6286
|
+
/**
 * Look up a nested value by a dotted / bracketed path such as
 * "invoice.items[0].sku" or "invoice.items.0.sku".
 *
 * Only own properties are followed, so a path segment like "constructor",
 * "toString" or "__proto__" resolves to undefined (field missing) instead of
 * leaking a built-in from the prototype chain into a field comparison.
 *
 * @param obj Root value to walk; falsy roots yield undefined.
 * @param path15 Path string; "." "[" and "]" all act as separators and empty
 *   segments are ignored. An empty path yields undefined.
 * @returns The value found at the path, or undefined when any step is missing
 *   or passes through a non-object.
 */
function resolvePath(obj, path15) {
  if (!path15 || !obj) {
    return void 0;
  }
  const segments = path15.split(/\.|\[|\]/).filter((segment) => segment.length > 0);
  let current = obj;
  for (const segment of segments) {
    if (current === null || current === void 0 || typeof current !== "object") {
      return void 0;
    }
    if (Array.isArray(current) && /^\d+$/.test(segment)) {
      // Numeric segment on an array: index by number so "007" means element 7.
      current = current[Number.parseInt(segment, 10)];
      continue;
    }
    if (!Object.prototype.hasOwnProperty.call(current, segment)) {
      return void 0;
    }
    current = current[segment];
  }
  return current;
}
|
|
6308
|
+
/**
 * Coerce a value to a number for numeric comparison.
 *
 * Numbers are returned unchanged (including NaN). Strings go through
 * Number.parseFloat, so a leading numeric prefix is accepted ("12abc" -> 12).
 *
 * @param value Any value.
 * @returns The number, or null when the value is neither a number nor a
 *   string that parseFloat can read.
 */
function toNumber(value) {
  switch (typeof value) {
    case "number":
      return value;
    case "string": {
      const parsed = Number.parseFloat(value);
      return Number.isNaN(parsed) ? null : parsed;
    }
    default:
      return null;
  }
}
|
|
6318
|
+
/**
 * Parse a date string into a Date, or return null when it cannot be read.
 *
 * Recognised shapes, tried in this order:
 *  1. Date-only ISO "YYYY-MM-DD" - built as a LOCAL calendar date. The Date
 *     constructor would treat it as UTC midnight, which shifts the day when
 *     read back with local getters in negative-offset time zones.
 *  2. "D-Mon-YYYY" with an English month name looked up (lowercased) in
 *     MONTH_NAMES; the looked-up value is used as the Date month index.
 *  3. Numeric "A/B/YYYY" or "A-B-YYYY", disambiguated with `formats`:
 *     only MM/DD-style formats -> month first; only DD/MM-style -> day first;
 *     otherwise day first only when A > 12 and B <= 12, else month first.
 *     This runs BEFORE the generic Date fallback so the configured formats
 *     are honoured instead of the engine's own slash-date parsing.
 *  4. Anything else is handed to `new Date(string)` (e.g. full timestamps).
 *
 * Impossible calendar dates in shapes 1-3 (e.g. 31 February, month 13) give
 * null instead of silently rolling over into another month.
 *
 * @param dateStr Text to parse; empty/falsy input gives null.
 * @param formats Format hints such as "DD/MM/YYYY"; a non-array is treated as
 *   no hints.
 * @returns A Date, or null when the text is not a recognisable/valid date.
 */
function parseDate(dateStr, formats) {
  if (!dateStr) return null;
  const trimmed = dateStr.trim();
  // Local-midnight date that rejects components Date would roll over.
  const makeLocalDate = (year, monthIndex, day) => {
    const date = new Date(year, monthIndex, day);
    const isExact = date.getFullYear() === year && date.getMonth() === monthIndex && date.getDate() === day;
    return isExact ? date : null;
  };
  const isoMatch = trimmed.match(/^(\d{4})-(\d{2})-(\d{2})$/);
  if (isoMatch) {
    return makeLocalDate(
      Number.parseInt(isoMatch[1], 10),
      Number.parseInt(isoMatch[2], 10) - 1,
      Number.parseInt(isoMatch[3], 10)
    );
  }
  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
  if (localizedMatch) {
    const month = MONTH_NAMES[localizedMatch[2].toLowerCase()];
    if (month !== void 0) {
      return makeLocalDate(
        Number.parseInt(localizedMatch[3], 10),
        month,
        Number.parseInt(localizedMatch[1], 10)
      );
    }
  }
  const numericMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
  if (numericMatch) {
    const knownFormats = Array.isArray(formats) ? formats : [];
    const hasUSFormat = knownFormats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
    const hasEUFormat = knownFormats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
    const first = Number.parseInt(numericMatch[1], 10);
    const second = Number.parseInt(numericMatch[2], 10);
    const year = Number.parseInt(numericMatch[3], 10);
    let monthFirst;
    if (hasUSFormat && !hasEUFormat) {
      monthFirst = true;
    } else if (hasEUFormat && !hasUSFormat) {
      monthFirst = false;
    } else {
      // No usable hint: only read as day-first when the first number cannot be a month.
      monthFirst = !(first > 12 && second <= 12);
    }
    return monthFirst ? makeLocalDate(year, first - 1, second) : makeLocalDate(year, second - 1, first);
  }
  const fallback = new Date(trimmed);
  return Number.isNaN(fallback.getTime()) ? null : fallback;
}
|
|
6370
|
+
/**
 * Format a Date as "YYYY-MM-DD" using its LOCAL calendar fields.
 *
 * The date comparison in this file uses getFullYear/getMonth/getDate (local
 * time), so the text shown in mismatch messages is built from the same
 * fields. Using toISOString() would render in UTC and could display the
 * previous or next day for a date created at local midnight.
 *
 * @param date Date to format.
 * @returns The local calendar date as "YYYY-MM-DD".
 * @throws RangeError when the date is invalid (same error type toISOString raised).
 */
function formatDateISO(date) {
  if (Number.isNaN(date.getTime())) {
    throw new RangeError("Invalid time value");
  }
  const year = String(date.getFullYear()).padStart(4, "0");
  const month = String(date.getMonth() + 1).padStart(2, "0");
  const day = String(date.getDate()).padStart(2, "0");
  return `${year}-${month}-${day}`;
}
|
|
6373
|
+
/**
 * Extract and parse JSON from free-form text.
 *
 * Markdown code-fence markers ("```json" / "```") are stripped first. If the
 * remaining text contains a "{ ... }" span (first "{" to last "}"), only that
 * span is parsed; otherwise the whole cleaned text is parsed. Non-string
 * input is treated as an empty string.
 *
 * @param text Text that may contain a JSON object.
 * @returns The parsed JSON value.
 * @throws SyntaxError from JSON.parse when the text is not valid JSON
 *   (despite the name, parse errors are not caught here).
 */
function parseJsonFromTextSafe(text) {
  let stripped = "";
  if (typeof text === "string") {
    stripped = text.replace(/```json\n?|```/g, "").trim();
  }
  const objectSpan = stripped.match(/\{[\s\S]*\}/);
  const jsonText = objectSpan === null ? stripped : objectSpan[0];
  return JSON.parse(jsonText);
}
|
|
4900
6379
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
4901
6380
|
{{EVALUATOR_RESULTS_JSON}}
|
|
4902
6381
|
|
|
@@ -5121,11 +6600,175 @@ var CompositeEvaluator = class {
|
|
|
5121
6600
|
}
|
|
5122
6601
|
}
|
|
5123
6602
|
};
|
|
6603
|
+
/**
 * Pass/fail evaluator that checks the execution duration recorded in
 * `context.traceSummary.durationMs` against `config.threshold` (milliseconds).
 *
 * The score is 1 when the duration is <= the threshold and 0 otherwise. When
 * the trace has no duration (undefined) the result is a failure that reports
 * the missing data.
 */
var LatencyEvaluator = class {
  kind = "latency";
  config;
  /** @param options Object whose `config` carries the `threshold`. */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * @param context Evaluation context; only `traceSummary?.durationMs` is read.
   * @returns Score object with verdict, hit/miss message, reasoning and the
   *   raw request (type, threshold, durationMs).
   */
  evaluate(context) {
    const limitMs = this.config.threshold;
    const elapsedMs = context.traceSummary?.durationMs;
    if (elapsedMs === void 0) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No duration data available in trace"],
        expectedAspectCount: 1,
        reasoning: "Execution duration not reported by provider",
        evaluatorRawRequest: { type: "latency", threshold: limitMs, durationMs: null }
      };
    }
    const withinLimit = elapsedMs <= limitMs;
    // One message describes the comparison; it lands in hits or misses.
    const comparison = `Duration ${elapsedMs}ms ${withinLimit ? "<=" : ">"} ${limitMs}ms threshold`;
    return {
      score: withinLimit ? 1 : 0,
      verdict: withinLimit ? "pass" : "fail",
      hits: withinLimit ? [comparison] : [],
      misses: withinLimit ? [] : [comparison],
      expectedAspectCount: 1,
      reasoning: `Execution took ${elapsedMs}ms (threshold: ${limitMs}ms)`,
      evaluatorRawRequest: { type: "latency", threshold: limitMs, durationMs: elapsedMs }
    };
  }
};
|
|
6644
|
+
/**
 * Pass/fail evaluator that checks the execution cost recorded in
 * `context.traceSummary.costUsd` against `config.budget`.
 *
 * The score is 1 when the cost is <= the budget and 0 otherwise. When the
 * trace has no cost (undefined) the result is a failure that reports the
 * missing data. Amounts in messages are rendered as "$" plus four decimals.
 */
var CostEvaluator = class {
  kind = "cost";
  config;
  /** @param options Object whose `config` carries the `budget`. */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * @param context Evaluation context; only `traceSummary?.costUsd` is read.
   * @returns Score object with verdict, hit/miss message, reasoning and the
   *   raw request (type, budget, costUsd).
   */
  evaluate(context) {
    const limitUsd = this.config.budget;
    const spentUsd = context.traceSummary?.costUsd;
    if (spentUsd === void 0) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No cost data available in trace"],
        expectedAspectCount: 1,
        reasoning: "Execution cost not reported by provider",
        evaluatorRawRequest: { type: "cost", budget: limitUsd, costUsd: null }
      };
    }
    const withinBudget = spentUsd <= limitUsd;
    const asDollars = (amount) => `$${amount.toFixed(4)}`;
    const spentText = asDollars(spentUsd);
    const limitText = asDollars(limitUsd);
    const comparison = `Cost ${spentText} ${withinBudget ? "<=" : ">"} ${limitText} budget`;
    return {
      score: withinBudget ? 1 : 0,
      verdict: withinBudget ? "pass" : "fail",
      hits: withinBudget ? [comparison] : [],
      misses: withinBudget ? [] : [comparison],
      expectedAspectCount: 1,
      reasoning: `Execution cost ${spentText} (budget: ${limitText})`,
      evaluatorRawRequest: { type: "cost", budget: limitUsd, costUsd: spentUsd }
    };
  }
};
|
|
6686
|
+
/**
 * Pass/fail evaluator that checks `context.traceSummary.tokenUsage` against
 * the optional limits `config.max_input`, `config.max_output` and
 * `config.max_total`.
 *
 * Total is computed as input + output + cached (cached defaults to 0). Only
 * limits that are numbers are checked; the result passes when no checked
 * limit is exceeded. Missing token usage yields a failure.
 */
var TokenUsageEvaluator = class {
  kind = "token_usage";
  config;
  /** @param options Object whose `config` carries the optional limits. */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * @param context Evaluation context; only `traceSummary?.tokenUsage` is read.
   * @returns Score object; `expectedAspectCount` is the number of configured
   *   numeric limits, with a minimum of 1.
   */
  evaluate(context) {
    const usage = context.traceSummary?.tokenUsage;
    const { max_total: maxTotal, max_input: maxInput, max_output: maxOutput } = this.config;
    const configuredLimits = [maxTotal, maxInput, maxOutput].reduce(
      (count, limit) => typeof limit === "number" ? count + 1 : count,
      0
    );
    const expectedAspectCount = Math.max(configuredLimits, 1);
    // Limits as echoed in the raw request: unset limits are reported as null.
    const reportedLimits = {
      max_total: maxTotal ?? null,
      max_input: maxInput ?? null,
      max_output: maxOutput ?? null
    };
    if (!usage) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No token usage data available in trace"],
        expectedAspectCount,
        reasoning: "Token usage not reported by provider",
        evaluatorRawRequest: { type: "token_usage", ...reportedLimits, tokenUsage: null }
      };
    }
    const { input, output } = usage;
    const cached = usage.cached ?? 0;
    const total = input + output + cached;
    const hits = [];
    const misses = [];
    // Checked in the order input, output, total so messages keep that order.
    const checks = [
      ["Input", input, maxInput],
      ["Output", output, maxOutput],
      ["Total", total, maxTotal]
    ];
    for (const [label, actual, limit] of checks) {
      if (typeof limit !== "number") {
        continue;
      }
      if (actual <= limit) {
        hits.push(`${label} tokens ${actual} <= ${limit}`);
      } else {
        misses.push(`${label} tokens ${actual} > ${limit}`);
      }
    }
    const passed = misses.length === 0;
    return {
      score: passed ? 1 : 0,
      verdict: passed ? "pass" : "fail",
      hits,
      misses,
      expectedAspectCount,
      reasoning: `token_usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
      evaluatorRawRequest: {
        type: "token_usage",
        ...reportedLimits,
        tokenUsage: { input, output, cached, total }
      }
    };
  }
};
|
|
5124
6768
|
|
|
5125
6769
|
// src/evaluation/orchestrator.ts
|
|
5126
|
-
import { createHash
|
|
5127
|
-
import
|
|
5128
|
-
import path13 from "node:path";
|
|
6770
|
+
import { createHash } from "node:crypto";
|
|
6771
|
+
import path14 from "node:path";
|
|
5129
6772
|
|
|
5130
6773
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
5131
6774
|
var Node = class {
|
|
@@ -5267,6 +6910,9 @@ function validateConcurrency(concurrency) {
|
|
|
5267
6910
|
}
|
|
5268
6911
|
|
|
5269
6912
|
// src/evaluation/orchestrator.ts
|
|
6913
|
+
/**
 * Decide whether a provider is treated like an agent when building prompts.
 *
 * Truthy when `isAgentProvider(provider)` is truthy or the provider's `kind`
 * is "cli". Callers in this file use the result to pick the "agent"
 * formatting mode over "lm".
 *
 * @param provider Provider object; `kind` is read when it is not an agent provider.
 * @returns The truthy result of isAgentProvider, or whether kind is "cli".
 */
function usesFileReferencePrompt(provider) {
  const agentLike = isAgentProvider(provider);
  if (agentLike) {
    return agentLike;
  }
  return provider.kind === "cli";
}
|
|
5270
6916
|
async function runEvaluation(options) {
|
|
5271
6917
|
const {
|
|
5272
6918
|
testFilePath: evalFilePath,
|
|
@@ -5278,7 +6924,6 @@ async function runEvaluation(options) {
|
|
|
5278
6924
|
evaluators,
|
|
5279
6925
|
maxRetries,
|
|
5280
6926
|
agentTimeoutMs,
|
|
5281
|
-
promptDumpDir,
|
|
5282
6927
|
cache,
|
|
5283
6928
|
useCache,
|
|
5284
6929
|
now,
|
|
@@ -5358,7 +7003,6 @@ async function runEvaluation(options) {
|
|
|
5358
7003
|
provider: primaryProvider,
|
|
5359
7004
|
target,
|
|
5360
7005
|
evaluatorRegistry,
|
|
5361
|
-
promptDumpDir,
|
|
5362
7006
|
nowFn: now ?? (() => /* @__PURE__ */ new Date()),
|
|
5363
7007
|
onProgress,
|
|
5364
7008
|
onResult,
|
|
@@ -5400,7 +7044,6 @@ async function runEvaluation(options) {
|
|
|
5400
7044
|
evaluators: evaluatorRegistry,
|
|
5401
7045
|
maxRetries,
|
|
5402
7046
|
agentTimeoutMs,
|
|
5403
|
-
promptDumpDir,
|
|
5404
7047
|
cache,
|
|
5405
7048
|
useCache,
|
|
5406
7049
|
now,
|
|
@@ -5443,7 +7086,8 @@ async function runEvaluation(options) {
|
|
|
5443
7086
|
results.push(outcome.value);
|
|
5444
7087
|
} else {
|
|
5445
7088
|
const evalCase = filteredEvalCases[i];
|
|
5446
|
-
const
|
|
7089
|
+
const formattingMode = usesFileReferencePrompt(primaryProvider) ? "agent" : "lm";
|
|
7090
|
+
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
5447
7091
|
const errorResult = buildErrorResult(
|
|
5448
7092
|
evalCase,
|
|
5449
7093
|
target.name,
|
|
@@ -5466,7 +7110,6 @@ async function runBatchEvaluation(options) {
|
|
|
5466
7110
|
provider,
|
|
5467
7111
|
target,
|
|
5468
7112
|
evaluatorRegistry,
|
|
5469
|
-
promptDumpDir,
|
|
5470
7113
|
nowFn,
|
|
5471
7114
|
onProgress,
|
|
5472
7115
|
onResult,
|
|
@@ -5474,12 +7117,9 @@ async function runBatchEvaluation(options) {
|
|
|
5474
7117
|
agentTimeoutMs
|
|
5475
7118
|
} = options;
|
|
5476
7119
|
const promptInputsList = [];
|
|
5477
|
-
const formattingMode =
|
|
7120
|
+
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
5478
7121
|
for (const evalCase of evalCases) {
|
|
5479
7122
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
5480
|
-
if (promptDumpDir) {
|
|
5481
|
-
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
5482
|
-
}
|
|
5483
7123
|
promptInputsList.push(promptInputs);
|
|
5484
7124
|
}
|
|
5485
7125
|
const batchRequests = evalCases.map((evalCase, index) => {
|
|
@@ -5521,13 +7161,20 @@ async function runBatchEvaluation(options) {
|
|
|
5521
7161
|
const promptInputs = promptInputsList[i];
|
|
5522
7162
|
const providerResponse = batchResponse[i];
|
|
5523
7163
|
const outputMessages = providerResponse.outputMessages;
|
|
5524
|
-
const
|
|
7164
|
+
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
7165
|
+
const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : hasExecutionMetrics ? {
|
|
7166
|
+
eventCount: 0,
|
|
7167
|
+
toolNames: [],
|
|
7168
|
+
toolCallsByName: {},
|
|
7169
|
+
errorCount: 0
|
|
7170
|
+
} : void 0;
|
|
5525
7171
|
const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
5526
7172
|
tokenUsage: providerResponse.tokenUsage,
|
|
5527
7173
|
costUsd: providerResponse.costUsd,
|
|
5528
7174
|
durationMs: providerResponse.durationMs
|
|
5529
7175
|
}) : void 0;
|
|
5530
7176
|
const candidate = extractLastAssistantContent(outputMessages);
|
|
7177
|
+
const providerError = extractProviderError(providerResponse);
|
|
5531
7178
|
let result;
|
|
5532
7179
|
try {
|
|
5533
7180
|
result = await evaluateCandidate({
|
|
@@ -5544,6 +7191,9 @@ async function runBatchEvaluation(options) {
|
|
|
5544
7191
|
outputMessages,
|
|
5545
7192
|
traceSummary
|
|
5546
7193
|
});
|
|
7194
|
+
if (providerError) {
|
|
7195
|
+
result = { ...result, error: providerError };
|
|
7196
|
+
}
|
|
5547
7197
|
} catch (error) {
|
|
5548
7198
|
const errorResult = buildErrorResult(
|
|
5549
7199
|
evalCase,
|
|
@@ -5576,9 +7226,10 @@ async function runBatchEvaluation(options) {
|
|
|
5576
7226
|
await onProgress({
|
|
5577
7227
|
workerId: 1,
|
|
5578
7228
|
evalId: evalCase.id,
|
|
5579
|
-
status: "completed",
|
|
7229
|
+
status: result.error ? "failed" : "completed",
|
|
5580
7230
|
startedAt: 0,
|
|
5581
|
-
completedAt: Date.now()
|
|
7231
|
+
completedAt: Date.now(),
|
|
7232
|
+
error: result.error
|
|
5582
7233
|
});
|
|
5583
7234
|
}
|
|
5584
7235
|
}
|
|
@@ -5593,17 +7244,13 @@ async function runEvalCase(options) {
|
|
|
5593
7244
|
now,
|
|
5594
7245
|
maxRetries,
|
|
5595
7246
|
agentTimeoutMs,
|
|
5596
|
-
promptDumpDir,
|
|
5597
7247
|
cache,
|
|
5598
7248
|
useCache,
|
|
5599
7249
|
signal,
|
|
5600
7250
|
judgeProvider
|
|
5601
7251
|
} = options;
|
|
5602
|
-
const formattingMode =
|
|
7252
|
+
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
5603
7253
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
5604
|
-
if (promptDumpDir) {
|
|
5605
|
-
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
5606
|
-
}
|
|
5607
7254
|
const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
|
|
5608
7255
|
let cachedResponse;
|
|
5609
7256
|
if (cacheKey && cache) {
|
|
@@ -5647,15 +7294,22 @@ async function runEvalCase(options) {
|
|
|
5647
7294
|
await cache.set(cacheKey, providerResponse);
|
|
5648
7295
|
}
|
|
5649
7296
|
const outputMessages = providerResponse.outputMessages;
|
|
5650
|
-
const
|
|
7297
|
+
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
7298
|
+
const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : hasExecutionMetrics ? {
|
|
7299
|
+
eventCount: 0,
|
|
7300
|
+
toolNames: [],
|
|
7301
|
+
toolCallsByName: {},
|
|
7302
|
+
errorCount: 0
|
|
7303
|
+
} : void 0;
|
|
5651
7304
|
const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
5652
7305
|
tokenUsage: providerResponse.tokenUsage,
|
|
5653
7306
|
costUsd: providerResponse.costUsd,
|
|
5654
7307
|
durationMs: providerResponse.durationMs
|
|
5655
7308
|
}) : void 0;
|
|
5656
7309
|
const candidate = extractLastAssistantContent(outputMessages);
|
|
7310
|
+
const providerError = extractProviderError(providerResponse);
|
|
5657
7311
|
try {
|
|
5658
|
-
|
|
7312
|
+
const result = await evaluateCandidate({
|
|
5659
7313
|
evalCase,
|
|
5660
7314
|
candidate,
|
|
5661
7315
|
target,
|
|
@@ -5669,6 +7323,7 @@ async function runEvalCase(options) {
|
|
|
5669
7323
|
outputMessages,
|
|
5670
7324
|
traceSummary
|
|
5671
7325
|
});
|
|
7326
|
+
return providerError ? { ...result, error: providerError } : result;
|
|
5672
7327
|
} catch (error) {
|
|
5673
7328
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
5674
7329
|
}
|
|
@@ -5734,7 +7389,6 @@ async function evaluateCandidate(options) {
|
|
|
5734
7389
|
candidateAnswer: candidate,
|
|
5735
7390
|
target: target.name,
|
|
5736
7391
|
reasoning: score.reasoning,
|
|
5737
|
-
rawAspects: score.rawAspects,
|
|
5738
7392
|
agentProviderRequest,
|
|
5739
7393
|
lmProviderRequest,
|
|
5740
7394
|
evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
@@ -5844,7 +7498,8 @@ async function runEvaluatorList(options) {
|
|
|
5844
7498
|
const codeEvaluator = new CodeEvaluator({
|
|
5845
7499
|
script: evaluator.script,
|
|
5846
7500
|
cwd: evaluator.resolvedCwd ?? evaluator.cwd,
|
|
5847
|
-
agentTimeoutMs
|
|
7501
|
+
agentTimeoutMs,
|
|
7502
|
+
config: evaluator.config
|
|
5848
7503
|
});
|
|
5849
7504
|
const score2 = await codeEvaluator.evaluate({
|
|
5850
7505
|
evalCase,
|
|
@@ -5872,7 +7527,7 @@ async function runEvaluatorList(options) {
|
|
|
5872
7527
|
});
|
|
5873
7528
|
}
|
|
5874
7529
|
if (evaluator.type === "composite") {
|
|
5875
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
7530
|
+
const evalFileDir = evalCase.guideline_paths[0] ? path14.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
5876
7531
|
const createEvaluator = (memberConfig) => {
|
|
5877
7532
|
switch (memberConfig.type) {
|
|
5878
7533
|
case "llm_judge":
|
|
@@ -5881,7 +7536,8 @@ async function runEvaluatorList(options) {
|
|
|
5881
7536
|
return new CodeEvaluator({
|
|
5882
7537
|
script: memberConfig.script,
|
|
5883
7538
|
cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
|
|
5884
|
-
agentTimeoutMs
|
|
7539
|
+
agentTimeoutMs,
|
|
7540
|
+
config: memberConfig.config
|
|
5885
7541
|
});
|
|
5886
7542
|
case "composite":
|
|
5887
7543
|
return new CompositeEvaluator({
|
|
@@ -5893,6 +7549,22 @@ async function runEvaluatorList(options) {
|
|
|
5893
7549
|
return new ToolTrajectoryEvaluator({
|
|
5894
7550
|
config: memberConfig
|
|
5895
7551
|
});
|
|
7552
|
+
case "field_accuracy":
|
|
7553
|
+
return new FieldAccuracyEvaluator({
|
|
7554
|
+
config: memberConfig
|
|
7555
|
+
});
|
|
7556
|
+
case "latency":
|
|
7557
|
+
return new LatencyEvaluator({
|
|
7558
|
+
config: memberConfig
|
|
7559
|
+
});
|
|
7560
|
+
case "cost":
|
|
7561
|
+
return new CostEvaluator({
|
|
7562
|
+
config: memberConfig
|
|
7563
|
+
});
|
|
7564
|
+
case "token_usage":
|
|
7565
|
+
return new TokenUsageEvaluator({
|
|
7566
|
+
config: memberConfig
|
|
7567
|
+
});
|
|
5896
7568
|
default: {
|
|
5897
7569
|
const unknownConfig = memberConfig;
|
|
5898
7570
|
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
@@ -5912,7 +7584,9 @@ async function runEvaluatorList(options) {
|
|
|
5912
7584
|
attempt,
|
|
5913
7585
|
promptInputs,
|
|
5914
7586
|
now,
|
|
5915
|
-
judgeProvider
|
|
7587
|
+
judgeProvider,
|
|
7588
|
+
outputMessages,
|
|
7589
|
+
traceSummary
|
|
5916
7590
|
});
|
|
5917
7591
|
const weight = evaluator.weight ?? 1;
|
|
5918
7592
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -5957,6 +7631,118 @@ async function runEvaluatorList(options) {
|
|
|
5957
7631
|
reasoning: score2.reasoning
|
|
5958
7632
|
});
|
|
5959
7633
|
}
|
|
7634
|
+
if (evaluator.type === "field_accuracy") {
|
|
7635
|
+
const fieldAccuracyEvaluator = new FieldAccuracyEvaluator({
|
|
7636
|
+
config: evaluator
|
|
7637
|
+
});
|
|
7638
|
+
const score2 = fieldAccuracyEvaluator.evaluate({
|
|
7639
|
+
evalCase,
|
|
7640
|
+
candidate,
|
|
7641
|
+
target,
|
|
7642
|
+
provider,
|
|
7643
|
+
attempt,
|
|
7644
|
+
promptInputs,
|
|
7645
|
+
now,
|
|
7646
|
+
outputMessages,
|
|
7647
|
+
traceSummary
|
|
7648
|
+
});
|
|
7649
|
+
const weight = evaluator.weight ?? 1;
|
|
7650
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
7651
|
+
evaluatorResults.push({
|
|
7652
|
+
name: evaluator.name,
|
|
7653
|
+
type: evaluator.type,
|
|
7654
|
+
score: score2.score,
|
|
7655
|
+
weight,
|
|
7656
|
+
verdict: score2.verdict,
|
|
7657
|
+
hits: score2.hits,
|
|
7658
|
+
misses: score2.misses,
|
|
7659
|
+
reasoning: score2.reasoning
|
|
7660
|
+
});
|
|
7661
|
+
}
|
|
7662
|
+
if (evaluator.type === "latency") {
|
|
7663
|
+
const latencyEvaluator = new LatencyEvaluator({
|
|
7664
|
+
config: evaluator
|
|
7665
|
+
});
|
|
7666
|
+
const score2 = latencyEvaluator.evaluate({
|
|
7667
|
+
evalCase,
|
|
7668
|
+
candidate,
|
|
7669
|
+
target,
|
|
7670
|
+
provider,
|
|
7671
|
+
attempt,
|
|
7672
|
+
promptInputs,
|
|
7673
|
+
now,
|
|
7674
|
+
outputMessages,
|
|
7675
|
+
traceSummary
|
|
7676
|
+
});
|
|
7677
|
+
const weight = evaluator.weight ?? 1;
|
|
7678
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
7679
|
+
evaluatorResults.push({
|
|
7680
|
+
name: evaluator.name,
|
|
7681
|
+
type: evaluator.type,
|
|
7682
|
+
score: score2.score,
|
|
7683
|
+
weight,
|
|
7684
|
+
verdict: score2.verdict,
|
|
7685
|
+
hits: score2.hits,
|
|
7686
|
+
misses: score2.misses,
|
|
7687
|
+
reasoning: score2.reasoning
|
|
7688
|
+
});
|
|
7689
|
+
}
|
|
7690
|
+
if (evaluator.type === "cost") {
|
|
7691
|
+
const costEvaluator = new CostEvaluator({
|
|
7692
|
+
config: evaluator
|
|
7693
|
+
});
|
|
7694
|
+
const score2 = costEvaluator.evaluate({
|
|
7695
|
+
evalCase,
|
|
7696
|
+
candidate,
|
|
7697
|
+
target,
|
|
7698
|
+
provider,
|
|
7699
|
+
attempt,
|
|
7700
|
+
promptInputs,
|
|
7701
|
+
now,
|
|
7702
|
+
outputMessages,
|
|
7703
|
+
traceSummary
|
|
7704
|
+
});
|
|
7705
|
+
const weight = evaluator.weight ?? 1;
|
|
7706
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
7707
|
+
evaluatorResults.push({
|
|
7708
|
+
name: evaluator.name,
|
|
7709
|
+
type: evaluator.type,
|
|
7710
|
+
score: score2.score,
|
|
7711
|
+
weight,
|
|
7712
|
+
verdict: score2.verdict,
|
|
7713
|
+
hits: score2.hits,
|
|
7714
|
+
misses: score2.misses,
|
|
7715
|
+
reasoning: score2.reasoning
|
|
7716
|
+
});
|
|
7717
|
+
}
|
|
7718
|
+
if (evaluator.type === "token_usage") {
|
|
7719
|
+
const tokenUsageEvaluator = new TokenUsageEvaluator({
|
|
7720
|
+
config: evaluator
|
|
7721
|
+
});
|
|
7722
|
+
const score2 = tokenUsageEvaluator.evaluate({
|
|
7723
|
+
evalCase,
|
|
7724
|
+
candidate,
|
|
7725
|
+
target,
|
|
7726
|
+
provider,
|
|
7727
|
+
attempt,
|
|
7728
|
+
promptInputs,
|
|
7729
|
+
now,
|
|
7730
|
+
outputMessages,
|
|
7731
|
+
traceSummary
|
|
7732
|
+
});
|
|
7733
|
+
const weight = evaluator.weight ?? 1;
|
|
7734
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
7735
|
+
evaluatorResults.push({
|
|
7736
|
+
name: evaluator.name,
|
|
7737
|
+
type: evaluator.type,
|
|
7738
|
+
score: score2.score,
|
|
7739
|
+
weight,
|
|
7740
|
+
verdict: score2.verdict,
|
|
7741
|
+
hits: score2.hits,
|
|
7742
|
+
misses: score2.misses,
|
|
7743
|
+
reasoning: score2.reasoning
|
|
7744
|
+
});
|
|
7745
|
+
}
|
|
5960
7746
|
} catch (error) {
|
|
5961
7747
|
const message = error instanceof Error ? error.message : String(error);
|
|
5962
7748
|
const fallbackScore = {
|
|
@@ -5996,7 +7782,6 @@ async function runEvaluatorList(options) {
|
|
|
5996
7782
|
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
5997
7783
|
0
|
|
5998
7784
|
);
|
|
5999
|
-
const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
|
|
6000
7785
|
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
|
|
6001
7786
|
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
6002
7787
|
const score = {
|
|
@@ -6005,8 +7790,7 @@ async function runEvaluatorList(options) {
|
|
|
6005
7790
|
hits,
|
|
6006
7791
|
misses,
|
|
6007
7792
|
expectedAspectCount,
|
|
6008
|
-
reasoning
|
|
6009
|
-
rawAspects: rawAspects.length > 0 ? rawAspects : void 0
|
|
7793
|
+
reasoning
|
|
6010
7794
|
};
|
|
6011
7795
|
return { score, evaluatorResults };
|
|
6012
7796
|
}
|
|
@@ -6081,26 +7865,6 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
6081
7865
|
llm_judge: llmJudge
|
|
6082
7866
|
};
|
|
6083
7867
|
}
|
|
6084
|
-
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
6085
|
-
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
6086
|
-
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
6087
|
-
const filePath = path13.resolve(directory, filename);
|
|
6088
|
-
await mkdir3(path13.dirname(filePath), { recursive: true });
|
|
6089
|
-
const payload = {
|
|
6090
|
-
eval_id: evalCase.id,
|
|
6091
|
-
question: promptInputs.question,
|
|
6092
|
-
guidelines: promptInputs.guidelines,
|
|
6093
|
-
guideline_paths: evalCase.guideline_paths
|
|
6094
|
-
};
|
|
6095
|
-
await writeFile3(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
6096
|
-
}
|
|
6097
|
-
function sanitizeFilename(value) {
|
|
6098
|
-
if (!value) {
|
|
6099
|
-
return "prompt";
|
|
6100
|
-
}
|
|
6101
|
-
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
6102
|
-
return sanitized.length > 0 ? sanitized : randomUUID3();
|
|
6103
|
-
}
|
|
6104
7868
|
async function invokeProvider(provider, options) {
|
|
6105
7869
|
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
6106
7870
|
const controller = new AbortController();
|
|
@@ -6164,12 +7928,23 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
6164
7928
|
misses: [`Error: ${message}`],
|
|
6165
7929
|
candidateAnswer: `Error occurred: ${message}`,
|
|
6166
7930
|
target: targetName,
|
|
6167
|
-
rawAspects: [],
|
|
6168
7931
|
agentProviderRequest,
|
|
6169
7932
|
lmProviderRequest,
|
|
6170
7933
|
error: message
|
|
6171
7934
|
};
|
|
6172
7935
|
}
|
|
7936
|
+
/**
 * Pull a provider-reported error message out of a provider response.
 *
 * Looks at `response.raw.error` and returns it, trimmed, only when `raw` is a
 * non-array object and `error` is a string that is non-empty after trimming.
 *
 * @param response Provider response; only `raw` is read.
 * @returns The trimmed error text, or undefined when there is none.
 */
function extractProviderError(response) {
  const payload = response.raw;
  const isRecord = Boolean(payload) && typeof payload === "object" && !Array.isArray(payload);
  if (!isRecord) {
    return void 0;
  }
  const reported = payload.error;
  const message = typeof reported === "string" ? reported.trim() : "";
  return message.length > 0 ? message : void 0;
}
|
|
6173
7948
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
6174
7949
|
const hash = createHash("sha256");
|
|
6175
7950
|
hash.update(provider.id);
|
|
@@ -6228,15 +8003,15 @@ function computeWeightedMean(entries) {
|
|
|
6228
8003
|
|
|
6229
8004
|
// src/evaluation/generators/rubric-generator.ts
|
|
6230
8005
|
import { generateText as generateText3 } from "ai";
|
|
6231
|
-
import { z as
|
|
6232
|
-
var rubricItemSchema =
|
|
6233
|
-
id:
|
|
6234
|
-
description:
|
|
6235
|
-
weight:
|
|
6236
|
-
required:
|
|
8006
|
+
import { z as z3 } from "zod";
|
|
8007
|
+
var rubricItemSchema = z3.object({
|
|
8008
|
+
id: z3.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
8009
|
+
description: z3.string().describe("What this rubric checks for"),
|
|
8010
|
+
weight: z3.number().default(1).describe("Relative importance (default 1.0)"),
|
|
8011
|
+
required: z3.boolean().default(true).describe("Whether this is a mandatory requirement")
|
|
6237
8012
|
});
|
|
6238
|
-
var rubricGenerationSchema =
|
|
6239
|
-
rubrics:
|
|
8013
|
+
var rubricGenerationSchema = z3.object({
|
|
8014
|
+
rubrics: z3.array(rubricItemSchema).describe("List of evaluation rubrics")
|
|
6240
8015
|
});
|
|
6241
8016
|
async function generateRubrics(options) {
|
|
6242
8017
|
const { expectedOutcome, question, referenceAnswer, provider } = options;
|
|
@@ -6306,6 +8081,17 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
|
|
|
6306
8081
|
return parts.join("\n");
|
|
6307
8082
|
}
|
|
6308
8083
|
|
|
8084
|
+
// src/evaluation/code-judge-sdk.ts
|
|
8085
|
+
import { readFileSync } from "node:fs";
|
|
8086
|
+
/**
 * Parse a code-judge payload from its JSON text.
 *
 * The text is parsed with JSON.parse and the result is passed through
 * toCamelCaseDeep before being returned.
 *
 * @param payload JSON text.
 * @returns The value returned by toCamelCaseDeep for the parsed JSON.
 * @throws SyntaxError when the text is not valid JSON.
 */
function parseCodeJudgePayload(payload) {
  return toCamelCaseDeep(JSON.parse(payload));
}
|
|
8090
|
+
/**
 * Read a code-judge payload from standard input.
 *
 * Synchronously reads file descriptor 0 as UTF-8 text and hands it to
 * parseCodeJudgePayload.
 *
 * @returns The parsed payload.
 */
function readCodeJudgePayload() {
  return parseCodeJudgePayload(readFileSync(0, "utf8"));
}
|
|
8094
|
+
|
|
6309
8095
|
// src/index.ts
|
|
6310
8096
|
function createAgentKernel() {
|
|
6311
8097
|
return { status: "stub" };
|
|
@@ -6313,15 +8099,20 @@ function createAgentKernel() {
|
|
|
6313
8099
|
export {
|
|
6314
8100
|
CodeEvaluator,
|
|
6315
8101
|
CompositeEvaluator,
|
|
8102
|
+
CostEvaluator,
|
|
6316
8103
|
DEFAULT_EXPLORATION_TOOLS,
|
|
8104
|
+
FieldAccuracyEvaluator,
|
|
8105
|
+
LatencyEvaluator,
|
|
6317
8106
|
LlmJudgeEvaluator,
|
|
6318
8107
|
TEST_MESSAGE_ROLES,
|
|
8108
|
+
TokenUsageEvaluator,
|
|
6319
8109
|
ToolTrajectoryEvaluator,
|
|
6320
8110
|
avgToolDurationMs,
|
|
6321
8111
|
buildDirectoryChain,
|
|
6322
8112
|
buildPromptInputs,
|
|
6323
8113
|
buildSearchRoots,
|
|
6324
8114
|
computeTraceSummary,
|
|
8115
|
+
consumeClaudeCodeLogEntries,
|
|
6325
8116
|
consumeCodexLogEntries,
|
|
6326
8117
|
consumePiLogEntries,
|
|
6327
8118
|
createAgentKernel,
|
|
@@ -6343,6 +8134,8 @@ export {
|
|
|
6343
8134
|
loadEvalCases,
|
|
6344
8135
|
mergeExecutionMetrics,
|
|
6345
8136
|
normalizeLineEndings,
|
|
8137
|
+
parseCodeJudgePayload,
|
|
8138
|
+
readCodeJudgePayload,
|
|
6346
8139
|
readJsonFile,
|
|
6347
8140
|
readTargetDefinitions,
|
|
6348
8141
|
readTestSuiteMetadata,
|
|
@@ -6352,6 +8145,7 @@ export {
|
|
|
6352
8145
|
resolveTargetDefinition,
|
|
6353
8146
|
runEvalCase,
|
|
6354
8147
|
runEvaluation,
|
|
8148
|
+
subscribeToClaudeCodeLogEntries,
|
|
6355
8149
|
subscribeToCodexLogEntries,
|
|
6356
8150
|
subscribeToPiLogEntries,
|
|
6357
8151
|
tokensPerTool
|