@agentv/core 1.5.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -77
- package/dist/{chunk-E2VSU4WZ.js → chunk-KDEP4I7G.js} +116 -1
- package/dist/chunk-KDEP4I7G.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +2 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +2715 -675
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +207 -10
- package/dist/index.d.ts +207 -10
- package/dist/index.js +2491 -570
- package/dist/index.js.map +1 -1
- package/package.json +8 -2
- package/dist/chunk-E2VSU4WZ.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -10,7 +10,7 @@ import {
|
|
|
10
10
|
readTextFile,
|
|
11
11
|
resolveFileReference,
|
|
12
12
|
resolveTargetDefinition
|
|
13
|
-
} from "./chunk-
|
|
13
|
+
} from "./chunk-KDEP4I7G.js";
|
|
14
14
|
|
|
15
15
|
// src/evaluation/types.ts
|
|
16
16
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -64,7 +64,11 @@ var EVALUATOR_KIND_VALUES = [
|
|
|
64
64
|
"llm_judge",
|
|
65
65
|
"rubric",
|
|
66
66
|
"composite",
|
|
67
|
-
"tool_trajectory"
|
|
67
|
+
"tool_trajectory",
|
|
68
|
+
"field_accuracy",
|
|
69
|
+
"latency",
|
|
70
|
+
"cost",
|
|
71
|
+
"token_usage"
|
|
68
72
|
];
|
|
69
73
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
70
74
|
function isEvaluatorKind(value) {
|
|
@@ -486,7 +490,22 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
486
490
|
continue;
|
|
487
491
|
}
|
|
488
492
|
if (typeValue === "code_judge") {
|
|
489
|
-
|
|
493
|
+
let script;
|
|
494
|
+
const rawScript = rawEvaluator.script;
|
|
495
|
+
if (typeof rawScript === "string") {
|
|
496
|
+
const trimmed = rawScript.trim();
|
|
497
|
+
if (trimmed.length === 0) {
|
|
498
|
+
throw new Error(
|
|
499
|
+
`Invalid code_judge script for evaluator '${name}' in '${evalId}': script cannot be empty`
|
|
500
|
+
);
|
|
501
|
+
}
|
|
502
|
+
script = parseCommandToArgv(trimmed);
|
|
503
|
+
} else {
|
|
504
|
+
script = asStringArray(
|
|
505
|
+
rawScript,
|
|
506
|
+
`code_judge script for evaluator '${name}' in '${evalId}'`
|
|
507
|
+
);
|
|
508
|
+
}
|
|
490
509
|
if (!script) {
|
|
491
510
|
logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
|
|
492
511
|
continue;
|
|
@@ -507,13 +526,21 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
507
526
|
} else {
|
|
508
527
|
resolvedCwd = searchRoots[0];
|
|
509
528
|
}
|
|
529
|
+
const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
|
|
530
|
+
const config = {};
|
|
531
|
+
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
532
|
+
if (!knownProps.has(key) && value !== void 0) {
|
|
533
|
+
config[key] = value;
|
|
534
|
+
}
|
|
535
|
+
}
|
|
510
536
|
evaluators.push({
|
|
511
537
|
name,
|
|
512
538
|
type: "code",
|
|
513
539
|
script,
|
|
514
540
|
cwd,
|
|
515
541
|
resolvedCwd,
|
|
516
|
-
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
542
|
+
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
543
|
+
...Object.keys(config).length > 0 ? { config } : {}
|
|
517
544
|
});
|
|
518
545
|
continue;
|
|
519
546
|
}
|
|
@@ -688,6 +715,140 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
688
715
|
evaluators.push(config);
|
|
689
716
|
continue;
|
|
690
717
|
}
|
|
718
|
+
if (typeValue === "field_accuracy") {
|
|
719
|
+
const rawFields = rawEvaluator.fields;
|
|
720
|
+
if (!Array.isArray(rawFields)) {
|
|
721
|
+
logWarning2(
|
|
722
|
+
`Skipping field_accuracy evaluator '${name}' in '${evalId}': missing fields array`
|
|
723
|
+
);
|
|
724
|
+
continue;
|
|
725
|
+
}
|
|
726
|
+
if (rawFields.length === 0) {
|
|
727
|
+
logWarning2(
|
|
728
|
+
`Skipping field_accuracy evaluator '${name}' in '${evalId}': fields array is empty`
|
|
729
|
+
);
|
|
730
|
+
continue;
|
|
731
|
+
}
|
|
732
|
+
const fields = [];
|
|
733
|
+
for (const rawField of rawFields) {
|
|
734
|
+
if (!isJsonObject2(rawField)) {
|
|
735
|
+
logWarning2(
|
|
736
|
+
`Skipping invalid field entry in field_accuracy evaluator '${name}' (expected object)`
|
|
737
|
+
);
|
|
738
|
+
continue;
|
|
739
|
+
}
|
|
740
|
+
const fieldPath = asString2(rawField.path);
|
|
741
|
+
const match = asString2(rawField.match);
|
|
742
|
+
if (!fieldPath) {
|
|
743
|
+
logWarning2(
|
|
744
|
+
`Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
|
|
745
|
+
);
|
|
746
|
+
continue;
|
|
747
|
+
}
|
|
748
|
+
if (!match || !isValidFieldMatchType(match)) {
|
|
749
|
+
logWarning2(
|
|
750
|
+
`Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code_judge evaluator.`
|
|
751
|
+
);
|
|
752
|
+
continue;
|
|
753
|
+
}
|
|
754
|
+
const fieldConfig = {
|
|
755
|
+
path: fieldPath,
|
|
756
|
+
match,
|
|
757
|
+
...typeof rawField.required === "boolean" ? { required: rawField.required } : {},
|
|
758
|
+
...typeof rawField.weight === "number" ? { weight: rawField.weight } : {},
|
|
759
|
+
...typeof rawField.tolerance === "number" ? { tolerance: rawField.tolerance } : {},
|
|
760
|
+
...typeof rawField.relative === "boolean" ? { relative: rawField.relative } : {},
|
|
761
|
+
...Array.isArray(rawField.formats) ? { formats: rawField.formats.filter((f) => typeof f === "string") } : {}
|
|
762
|
+
};
|
|
763
|
+
fields.push(fieldConfig);
|
|
764
|
+
}
|
|
765
|
+
if (fields.length === 0) {
|
|
766
|
+
logWarning2(
|
|
767
|
+
`Skipping field_accuracy evaluator '${name}' in '${evalId}': no valid fields found`
|
|
768
|
+
);
|
|
769
|
+
continue;
|
|
770
|
+
}
|
|
771
|
+
const aggregation = asString2(rawEvaluator.aggregation);
|
|
772
|
+
const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
|
|
773
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
774
|
+
evaluators.push({
|
|
775
|
+
name,
|
|
776
|
+
type: "field_accuracy",
|
|
777
|
+
fields,
|
|
778
|
+
...validAggregation ? { aggregation: validAggregation } : {},
|
|
779
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
780
|
+
});
|
|
781
|
+
continue;
|
|
782
|
+
}
|
|
783
|
+
if (typeValue === "latency") {
|
|
784
|
+
const threshold = rawEvaluator.threshold;
|
|
785
|
+
if (typeof threshold !== "number" || threshold < 0) {
|
|
786
|
+
logWarning2(
|
|
787
|
+
`Skipping latency evaluator '${name}' in '${evalId}': threshold must be a non-negative number`
|
|
788
|
+
);
|
|
789
|
+
continue;
|
|
790
|
+
}
|
|
791
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
792
|
+
evaluators.push({
|
|
793
|
+
name,
|
|
794
|
+
type: "latency",
|
|
795
|
+
threshold,
|
|
796
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
797
|
+
});
|
|
798
|
+
continue;
|
|
799
|
+
}
|
|
800
|
+
if (typeValue === "cost") {
|
|
801
|
+
const budget = rawEvaluator.budget;
|
|
802
|
+
if (typeof budget !== "number" || budget < 0) {
|
|
803
|
+
logWarning2(
|
|
804
|
+
`Skipping cost evaluator '${name}' in '${evalId}': budget must be a non-negative number`
|
|
805
|
+
);
|
|
806
|
+
continue;
|
|
807
|
+
}
|
|
808
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
809
|
+
evaluators.push({
|
|
810
|
+
name,
|
|
811
|
+
type: "cost",
|
|
812
|
+
budget,
|
|
813
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
814
|
+
});
|
|
815
|
+
continue;
|
|
816
|
+
}
|
|
817
|
+
if (typeValue === "token_usage") {
|
|
818
|
+
const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
|
|
819
|
+
const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
|
|
820
|
+
const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
|
|
821
|
+
const limits = [
|
|
822
|
+
["max_total", maxTotal],
|
|
823
|
+
["max_input", maxInput],
|
|
824
|
+
["max_output", maxOutput]
|
|
825
|
+
];
|
|
826
|
+
const validLimits = {};
|
|
827
|
+
for (const [key, raw] of limits) {
|
|
828
|
+
if (raw === void 0) continue;
|
|
829
|
+
if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
|
|
830
|
+
logWarning2(
|
|
831
|
+
`Skipping token_usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
|
|
832
|
+
);
|
|
833
|
+
continue;
|
|
834
|
+
}
|
|
835
|
+
validLimits[key] = raw;
|
|
836
|
+
}
|
|
837
|
+
if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
|
|
838
|
+
logWarning2(
|
|
839
|
+
`Skipping token_usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
|
|
840
|
+
);
|
|
841
|
+
continue;
|
|
842
|
+
}
|
|
843
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
844
|
+
evaluators.push({
|
|
845
|
+
name,
|
|
846
|
+
type: "token_usage",
|
|
847
|
+
...validLimits,
|
|
848
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
849
|
+
});
|
|
850
|
+
continue;
|
|
851
|
+
}
|
|
691
852
|
const prompt = asString2(rawEvaluator.prompt);
|
|
692
853
|
let promptPath;
|
|
693
854
|
if (prompt) {
|
|
@@ -758,6 +919,34 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
758
919
|
/**
 * Narrow an arbitrary value to a string.
 *
 * @param {unknown} value - Candidate value.
 * @returns {string | undefined} The value itself when it is a string, otherwise `undefined`.
 */
function asString2(value) {
  if (typeof value !== "string") {
    return void 0;
  }
  return value;
}
|
|
922
|
+
/**
 * Validate that a value is a non-empty array of non-blank strings and return a copy of it.
 *
 * @param {unknown} value - Candidate value; `undefined` is passed through untouched.
 * @param {string} description - Human-readable label used as the prefix of every error message.
 * @returns {string[] | undefined} A new array with the same entries, or `undefined` when `value` is `undefined`.
 * @throws {Error} When `value` is not an array, is empty, or holds a non-string or blank entry.
 */
function asStringArray(value, description) {
  if (value === void 0) {
    return void 0;
  }
  if (!Array.isArray(value)) {
    throw new Error(`${description} must be an array of strings (argv tokens)`);
  }
  if (value.length === 0) {
    throw new Error(`${description} cannot be empty`);
  }
  const tokens = [];
  // Indexed loop on purpose: holes in sparse arrays read as undefined and must be rejected.
  for (let index = 0; index < value.length; index += 1) {
    const entry = value[index];
    if (typeof entry !== "string") {
      throw new Error(`${description}[${index}] must be a string`);
    }
    const isBlank = entry.trim().length === 0;
    if (isBlank) {
      throw new Error(`${description}[${index}] cannot be empty`);
    }
    tokens.push(entry);
  }
  return tokens;
}
|
|
944
|
+
/**
 * Wrap a shell command string into an argv array that runs it through the platform shell.
 *
 * @param {string} command - Command line to execute.
 * @returns {string[]} `["cmd.exe", "/c", command]` on win32, otherwise `["sh", "-lc", command]`.
 */
function parseCommandToArgv(command) {
  const shellPrefix = process.platform === "win32" ? ["cmd.exe", "/c"] : ["sh", "-lc"];
  return [...shellPrefix, command];
}
|
|
761
950
|
/**
 * Check whether a value is a non-null, non-array object.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` for objects that are neither `null` nor arrays.
 */
function isJsonObject2(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
|
@@ -791,6 +980,14 @@ function validateWeight(rawWeight, evaluatorName, evalId) {
|
|
|
791
980
|
}
|
|
792
981
|
return rawWeight;
|
|
793
982
|
}
|
|
983
|
+
// Match strategies accepted for a field_accuracy field entry.
var VALID_FIELD_MATCH_TYPES = /* @__PURE__ */ new Set(["exact", "numeric_tolerance", "date"]);
/**
 * Check whether a value names a supported field match type.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` only for the strings in `VALID_FIELD_MATCH_TYPES`.
 */
function isValidFieldMatchType(value) {
  if (typeof value !== "string") {
    return false;
  }
  return VALID_FIELD_MATCH_TYPES.has(value);
}
|
|
987
|
+
// Aggregation strategies accepted for a field_accuracy evaluator.
var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average", "all_or_nothing"]);
/**
 * Check whether a value names a supported field aggregation type.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` only for the strings in `VALID_FIELD_AGGREGATION_TYPES`.
 */
function isValidFieldAggregationType(value) {
  if (typeof value !== "string") {
    return false;
  }
  return VALID_FIELD_AGGREGATION_TYPES.has(value);
}
|
|
794
991
|
|
|
795
992
|
// src/evaluation/loaders/message-processor.ts
|
|
796
993
|
import { readFile as readFile3 } from "node:fs/promises";
|
|
@@ -1750,91 +1947,992 @@ async function withRetry(fn, retryConfig, signal) {
|
|
|
1750
1947
|
throw lastError;
|
|
1751
1948
|
}
|
|
1752
1949
|
|
|
1753
|
-
// src/evaluation/providers/
|
|
1754
|
-
import {
|
|
1755
|
-
import
|
|
1756
|
-
import
|
|
1757
|
-
import
|
|
1758
|
-
import {
|
|
1759
|
-
|
|
1760
|
-
|
|
1761
|
-
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
|
|
1765
|
-
|
|
1766
|
-
|
|
1767
|
-
|
|
1768
|
-
|
|
1950
|
+
// src/evaluation/providers/claude-code.ts
|
|
1951
|
+
import { spawn } from "node:child_process";
|
|
1952
|
+
import { randomUUID } from "node:crypto";
|
|
1953
|
+
import { createWriteStream } from "node:fs";
|
|
1954
|
+
import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
|
|
1955
|
+
import { tmpdir } from "node:os";
|
|
1956
|
+
import path8 from "node:path";
|
|
1957
|
+
|
|
1958
|
+
// src/evaluation/providers/claude-code-log-tracker.ts
|
|
1959
|
+
var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeCodeLogs");
|
|
1960
|
+
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.claudeCodeLogSubscribers");
|
|
1961
|
+
/**
 * Return the shared array of Claude Code log entries stored on `globalThis`
 * under `GLOBAL_LOGS_KEY`, creating it on first use.
 *
 * @returns {Array} The same array instance on every call.
 */
function getClaudeCodeLogStore() {
  // `||=` mirrors the truthiness check: any falsy slot is replaced by a fresh array.
  return globalThis[GLOBAL_LOGS_KEY] ||= [];
}
|
|
1971
|
+
/**
 * Return the shared set of log subscribers stored on `globalThis`
 * under `GLOBAL_SUBSCRIBERS_KEY`, creating it on first use.
 *
 * @returns {Set} The same set instance on every call.
 */
function getSubscriberStore() {
  // `||=` mirrors the truthiness check: any falsy slot is replaced by a fresh set.
  return globalThis[GLOBAL_SUBSCRIBERS_KEY] ||= /* @__PURE__ */ new Set();
}
|
|
1981
|
+
/**
 * Deliver a log entry to every registered subscriber.
 *
 * The subscriber set is snapshotted first, and a listener that throws is
 * reported with `console.warn` without stopping delivery to the others.
 *
 * @param {object} entry - Log entry passed to each listener.
 */
function notifySubscribers(entry) {
  const snapshot = [...getSubscriberStore()];
  snapshot.forEach((listener) => {
    try {
      listener(entry);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Claude Code log subscriber failed: ${message}`);
    }
  });
}
|
|
1992
|
+
/**
 * Append a log entry to the shared store, then notify subscribers about it.
 *
 * @param {object} entry - Log entry to record.
 */
function recordClaudeCodeLogEntry(entry) {
  const store = getClaudeCodeLogStore();
  store.push(entry);
  notifySubscribers(entry);
}
|
|
1996
|
+
/**
 * Remove and return every entry currently held in the shared log store.
 *
 * @returns {Array} The drained entries in insertion order; an empty array when the store is empty.
 */
function consumeClaudeCodeLogEntries() {
  const store = getClaudeCodeLogStore();
  if (store.length > 0) {
    // splice empties the shared array in place and hands back the removed entries.
    return store.splice(0, store.length);
  }
  return [];
}
|
|
2003
|
+
function subscribeToClaudeCodeLogEntries(listener) {
|
|
2004
|
+
const store = getSubscriberStore();
|
|
2005
|
+
store.add(listener);
|
|
2006
|
+
return () => {
|
|
2007
|
+
store.delete(listener);
|
|
1769
2008
|
};
|
|
1770
|
-
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
|
|
1774
|
-
|
|
1775
|
-
|
|
1776
|
-
|
|
1777
|
-
|
|
1778
|
-
|
|
1779
|
-
|
|
1780
|
-
|
|
1781
|
-
|
|
1782
|
-
|
|
1783
|
-
|
|
1784
|
-
|
|
1785
|
-
|
|
1786
|
-
failed: true,
|
|
1787
|
-
timedOut: execError.timedOut === true || execError.killed === true,
|
|
1788
|
-
signal: execError.signal ?? null
|
|
1789
|
-
};
|
|
2009
|
+
}
|
|
2010
|
+
|
|
2011
|
+
// src/evaluation/providers/preread.ts
|
|
2012
|
+
import path7 from "node:path";
|
|
2013
|
+
function buildPromptDocument(request, inputFiles, options) {
|
|
2014
|
+
const parts = [];
|
|
2015
|
+
const guidelineFiles = collectGuidelineFiles(
|
|
2016
|
+
inputFiles,
|
|
2017
|
+
options?.guidelinePatterns ?? request.guideline_patterns,
|
|
2018
|
+
options?.guidelineOverrides
|
|
2019
|
+
);
|
|
2020
|
+
const inputFilesList = collectInputFiles(inputFiles);
|
|
2021
|
+
const nonGuidelineInputFiles = inputFilesList.filter((file) => !guidelineFiles.includes(file));
|
|
2022
|
+
const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
|
|
2023
|
+
if (prereadBlock.length > 0) {
|
|
2024
|
+
parts.push("\n", prereadBlock);
|
|
1790
2025
|
}
|
|
2026
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
2027
|
+
return parts.join("\n").trim();
|
|
1791
2028
|
}
|
|
1792
|
-
|
|
1793
|
-
|
|
1794
|
-
|
|
1795
|
-
targetName;
|
|
1796
|
-
supportsBatch = true;
|
|
1797
|
-
config;
|
|
1798
|
-
runCommand;
|
|
1799
|
-
verbose;
|
|
1800
|
-
keepTempFiles;
|
|
1801
|
-
healthcheckPromise;
|
|
1802
|
-
constructor(targetName, config, runner = defaultCommandRunner) {
|
|
1803
|
-
this.targetName = targetName;
|
|
1804
|
-
this.id = `cli:${targetName}`;
|
|
1805
|
-
this.config = config;
|
|
1806
|
-
this.runCommand = runner;
|
|
1807
|
-
this.verbose = config.verbose ?? false;
|
|
1808
|
-
this.keepTempFiles = config.keepTempFiles ?? false;
|
|
2029
|
+
function normalizeInputFiles(inputFiles) {
|
|
2030
|
+
if (!inputFiles || inputFiles.length === 0) {
|
|
2031
|
+
return void 0;
|
|
1809
2032
|
}
|
|
1810
|
-
|
|
1811
|
-
|
|
1812
|
-
|
|
2033
|
+
const deduped = /* @__PURE__ */ new Map();
|
|
2034
|
+
for (const inputFile of inputFiles) {
|
|
2035
|
+
const absolutePath = path7.resolve(inputFile);
|
|
2036
|
+
if (!deduped.has(absolutePath)) {
|
|
2037
|
+
deduped.set(absolutePath, absolutePath);
|
|
1813
2038
|
}
|
|
1814
|
-
|
|
1815
|
-
|
|
1816
|
-
|
|
1817
|
-
|
|
1818
|
-
|
|
1819
|
-
|
|
1820
|
-
|
|
1821
|
-
|
|
2039
|
+
}
|
|
2040
|
+
return Array.from(deduped.values());
|
|
2041
|
+
}
|
|
2042
|
+
function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
|
|
2043
|
+
if (!inputFiles || inputFiles.length === 0) {
|
|
2044
|
+
return [];
|
|
2045
|
+
}
|
|
2046
|
+
const unique = /* @__PURE__ */ new Map();
|
|
2047
|
+
for (const inputFile of inputFiles) {
|
|
2048
|
+
const absolutePath = path7.resolve(inputFile);
|
|
2049
|
+
if (overrides?.has(absolutePath)) {
|
|
2050
|
+
if (!unique.has(absolutePath)) {
|
|
2051
|
+
unique.set(absolutePath, absolutePath);
|
|
2052
|
+
}
|
|
2053
|
+
continue;
|
|
1822
2054
|
}
|
|
1823
|
-
const
|
|
1824
|
-
|
|
1825
|
-
|
|
1826
|
-
|
|
1827
|
-
|
|
1828
|
-
|
|
1829
|
-
|
|
1830
|
-
|
|
1831
|
-
|
|
1832
|
-
|
|
1833
|
-
|
|
2055
|
+
const normalized = absolutePath.split(path7.sep).join("/");
|
|
2056
|
+
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
2057
|
+
if (!unique.has(absolutePath)) {
|
|
2058
|
+
unique.set(absolutePath, absolutePath);
|
|
2059
|
+
}
|
|
2060
|
+
}
|
|
2061
|
+
}
|
|
2062
|
+
return Array.from(unique.values());
|
|
2063
|
+
}
|
|
2064
|
+
/**
 * Resolve input file paths to absolute paths and drop duplicates.
 *
 * @param {string[] | undefined} inputFiles - Paths to resolve with `path.resolve`.
 * @returns {string[]} Unique absolute paths in first-seen order; `[]` when nothing was supplied.
 */
function collectInputFiles(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return [];
  }
  // A Set keeps insertion order, so the first occurrence of each path wins.
  const seen = /* @__PURE__ */ new Set();
  for (const inputFile of inputFiles) {
    seen.add(path7.resolve(inputFile));
  }
  return [...seen];
}
|
|
2077
|
+
/**
 * Build the "read these files first" instruction block placed ahead of the user query.
 *
 * Each file is rendered as a markdown bullet linking its basename to a file URI.
 *
 * @param {string[]} guidelineFiles - Paths listed under "Read all guideline files".
 * @param {string[]} inputFiles - Paths listed under "Read all input files".
 * @returns {string} Newline-joined instruction text, or `""` when both lists are empty.
 */
function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
  const hasGuidelines = guidelineFiles.length > 0;
  const hasInputs = inputFiles.length > 0;
  if (!hasGuidelines && !hasInputs) {
    return "";
  }
  const toBullets = (files) => {
    const bullets = [];
    for (const absolutePath of files) {
      const fileName = path7.basename(absolutePath);
      const fileUri = pathToFileUri(absolutePath);
      bullets.push(`* [${fileName}](${fileUri})`);
    }
    return bullets.join("\n");
  };
  const sections = [];
  if (hasGuidelines) {
    sections.push(`Read all guideline files:\n${toBullets(guidelineFiles)}.`);
  }
  if (hasInputs) {
    sections.push(`Read all input files:\n${toBullets(inputFiles)}.`);
  }
  const closing = [
    "If any file is missing, fail with ERROR: missing-file <filename> and stop.",
    "Then apply system_instructions on the user query below."
  ];
  return [...sections, ...closing].join("\n");
}
|
|
2101
|
+
/**
 * Convert a filesystem path into a `file://` URI string.
 *
 * Relative paths are resolved first and backslashes become forward slashes.
 * Paths that start with a drive letter (`C:/...`) get the three-slash form.
 * No percent-encoding is applied.
 *
 * @param {string} filePath - Absolute or relative path.
 * @returns {string} The `file://` URI.
 */
function pathToFileUri(filePath) {
  const resolved = path7.isAbsolute(filePath) ? filePath : path7.resolve(filePath);
  const forwardSlashed = resolved.split("\\").join("/");
  const startsWithDrive = /^[a-zA-Z]:\//.test(forwardSlashed);
  const scheme = startsWithDrive ? "file:///" : "file://";
  return `${scheme}${forwardSlashed}`;
}
|
|
2109
|
+
|
|
2110
|
+
// src/evaluation/providers/claude-code.ts
|
|
2111
|
+
var WORKSPACE_PREFIX = "agentv-claude-code-";
|
|
2112
|
+
var PROMPT_FILENAME = "prompt.md";
|
|
2113
|
+
var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
|
|
2114
|
+
- Do NOT create any additional output files in the workspace.
|
|
2115
|
+
- All intended file outputs/changes MUST be written in your response.
|
|
2116
|
+
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
2117
|
+
This is required for evaluation scoring.`;
|
|
2118
|
+
/**
 * Provider that answers an evaluation request by running the Claude Code CLI
 * as a child process and parsing its `stream-json` output.
 *
 * The process itself is started by the injected `runner`, which defaults to
 * `defaultClaudeCodeRunner`. Stream logging is best-effort: a failure to set
 * up or close the log is reported with `console.warn` and never fails a request.
 */
var ClaudeCodeProvider = class {
  id;
  kind = "claude-code";
  targetName;
  supportsBatch = false;
  config;
  runClaudeCode;
  /**
   * @param {string} targetName - Target name; also used to build `id` (`claude-code:<targetName>`).
   * @param {object} config - Target settings read by this class: `executable`, `model`, `args`,
   *   `systemPrompt`, `cwd`, `timeoutMs`, `logDir`, `logFormat`.
   * @param {Function} [runner] - Function that executes the CLI and resolves to
   *   `{ stdout, stderr, exitCode, timedOut }`.
   */
  constructor(targetName, config, runner = defaultClaudeCodeRunner) {
    this.id = `claude-code:${targetName}`;
    this.targetName = targetName;
    this.config = config;
    this.runClaudeCode = runner;
  }
  /**
   * Run one request through the Claude Code CLI.
   *
   * @param {object} request - Reads `question`, `inputFiles`, `signal`, `evalCaseId`, `attempt`.
   * @returns {Promise<object>} `{ raw, outputMessages, usage }` built from the parsed CLI output.
   * @throws {Error} When the request is already aborted, the CLI times out, or it exits non-zero.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("Claude Code request was aborted before execution");
    }
    const inputFiles = normalizeInputFiles(request.inputFiles);
    const workspaceRoot = await this.createWorkspace();
    const logger = await this.createStreamLogger(request).catch(() => void 0);
    try {
      const promptFile = path8.join(workspaceRoot, PROMPT_FILENAME);
      await writeFile(promptFile, request.question, "utf8");
      const args = this.buildClaudeCodeArgs(request.question, inputFiles);
      const cwd = this.resolveCwd();
      const result = await this.executeClaudeCode(args, cwd, request.signal, logger);
      if (result.timedOut) {
        throw new Error(
          `Claude Code CLI timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
        );
      }
      if (result.exitCode !== 0) {
        const detail = pickDetail(result.stderr, result.stdout);
        const prefix = `Claude Code CLI exited with code ${result.exitCode}`;
        if (isNestedClaudeCodeAuthError(result.stdout)) {
          throw new Error(
            `${prefix}: Claude Code detected a nested session and requires API key authentication. Set ANTHROPIC_API_KEY environment variable or run AgentV outside of a Claude Code session.`
          );
        }
        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
      }
      const parsed = parseClaudeCodeJsonl(result.stdout);
      const outputMessages = extractOutputMessages(parsed);
      const usage = extractUsage(parsed);
      return {
        raw: {
          response: parsed,
          stdout: result.stdout,
          stderr: result.stderr,
          exitCode: result.exitCode,
          args,
          executable: this.config.executable,
          promptFile,
          workspace: workspaceRoot,
          inputFiles,
          logFile: logger?.filePath
        },
        outputMessages,
        usage
      };
    } finally {
      // Closing the log is best-effort: a failure must neither mask the
      // request outcome nor prevent the temp workspace from being removed.
      try {
        await logger?.close();
      } catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.warn(`Failed to close Claude Code stream log: ${message}`);
      } finally {
        await this.cleanupWorkspace(workspaceRoot);
      }
    }
  }
  /**
   * @returns {string} Absolute working directory for the CLI: `config.cwd` resolved, or `process.cwd()`.
   */
  resolveCwd() {
    if (!this.config.cwd) {
      return process.cwd();
    }
    return path8.resolve(this.config.cwd);
  }
  /**
   * Build the CLI argument list. The prompt is always the last argument.
   *
   * @param {string} prompt - User question appended after the system prompt.
   * @param {string[] | undefined} inputFiles - Paths listed under an "Input Files" heading.
   * @returns {string[]} Arguments for the Claude Code executable.
   */
  buildClaudeCodeArgs(prompt, inputFiles) {
    const args = [];
    args.push("--output-format", "stream-json");
    args.push("--verbose");
    args.push("-p");
    if (this.config.model) {
      args.push("--model", this.config.model);
    }
    if (this.config.args && this.config.args.length > 0) {
      args.push(...this.config.args);
    }
    const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
    const fullPrompt = `${systemPrompt}

${prompt}`;
    let finalPrompt = fullPrompt;
    if (inputFiles && inputFiles.length > 0) {
      const filesContext = inputFiles.map((f) => `[File: ${f}]`).join("\n");
      finalPrompt = `${fullPrompt}

## Input Files
${filesContext}`;
    }
    args.push(finalPrompt);
    return args;
  }
  /**
   * Copy `process.env` and blank out the Claude Code session markers.
   *
   * The keys are set to `undefined` rather than deleted so they still win if
   * the runner merges this object over `process.env`.
   * NOTE(review): this relies on the runner dropping undefined env values, as
   * `child_process.spawn` does. Confirm for custom runners.
   *
   * @returns {object} Environment for the child process.
   */
  buildEnv() {
    const env = { ...process.env };
    env.CLAUDECODE = void 0;
    env.CLAUDE_CODE_ENTRYPOINT = void 0;
    return env;
  }
  /**
   * Invoke the runner and turn a missing executable (`ENOENT`) into an actionable error.
   *
   * @param {string[]} args - CLI arguments.
   * @param {string} cwd - Working directory.
   * @param {AbortSignal | undefined} signal - Forwarded to the runner.
   * @param {object | undefined} logger - Stream logger that receives stdout and stderr chunks.
   * @returns {Promise<object>} The runner result.
   */
  async executeClaudeCode(args, cwd, signal, logger) {
    try {
      return await this.runClaudeCode({
        executable: this.config.executable,
        args,
        cwd,
        timeoutMs: this.config.timeoutMs,
        env: this.buildEnv(),
        signal,
        onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
        onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
      });
    } catch (error) {
      const err = error;
      if (err.code === "ENOENT") {
        throw new Error(
          `Claude Code executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
        );
      }
      throw error;
    }
  }
  /**
   * @returns {Promise<string>} Path of a fresh temp directory under `os.tmpdir()`.
   */
  async createWorkspace() {
    return await mkdtemp(path8.join(tmpdir(), WORKSPACE_PREFIX));
  }
  /**
   * Remove the temp workspace; failures are ignored on purpose (best-effort cleanup).
   *
   * @param {string} workspaceRoot - Directory created by `createWorkspace`.
   */
  async cleanupWorkspace(workspaceRoot) {
    try {
      await rm(workspaceRoot, { recursive: true, force: true });
    } catch {
    }
  }
  /**
   * @returns {string | undefined} Directory for stream logs, or `undefined` when streaming is disabled.
   */
  resolveLogDirectory() {
    const disabled = isClaudeCodeLogStreamingDisabled();
    if (disabled) {
      return void 0;
    }
    if (this.config.logDir) {
      return path8.resolve(this.config.logDir);
    }
    return path8.join(process.cwd(), ".agentv", "logs", "claude-code");
  }
  /**
   * Create the per-request stream logger and record its log file entry.
   *
   * @param {object} request - Reads `evalCaseId` and `attempt`.
   * @returns {Promise<object | undefined>} The logger, or `undefined` when logging is disabled or setup fails.
   */
  async createStreamLogger(request) {
    const logDir = this.resolveLogDirectory();
    if (!logDir) {
      return void 0;
    }
    try {
      await mkdir(logDir, { recursive: true });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Claude Code stream logging (could not create ${logDir}): ${message}`);
      return void 0;
    }
    const filePath = path8.join(logDir, buildLogFilename(request, this.targetName));
    try {
      const logger = await ClaudeCodeStreamLogger.create({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt,
        format: this.config.logFormat ?? "summary"
      });
      recordClaudeCodeLogEntry({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt
      });
      return logger;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Claude Code stream logging for ${filePath}: ${message}`);
      return void 0;
    }
  }
};
|
|
2297
|
+
/**
 * Appends a timestamped, line-oriented log of the Claude Code CLI's stdout and
 * stderr to a file.
 *
 * Chunks are buffered per source and written one complete line at a time;
 * whatever is left without a trailing newline is written on `close()`.
 */
var ClaudeCodeStreamLogger = class _ClaudeCodeStreamLogger {
  filePath;
  stream;
  startedAt = Date.now();
  stdoutBuffer = "";
  stderrBuffer = "";
  format;
  // First error emitted by the write stream, if any; surfaced from close().
  streamError;
  /**
   * @param {string} filePath - Log file, opened in append mode.
   * @param {string} format - `"json"` selects `formatClaudeCodeJsonLog`; any other value
   *   selects `formatClaudeCodeLogMessage`.
   */
  constructor(filePath, format) {
    this.filePath = filePath;
    this.format = format;
    this.stream = createWriteStream(filePath, { flags: "a" });
    // Without a permanent listener, an asynchronous open or write failure would
    // be an unhandled 'error' event and take down the whole process.
    this.stream.on("error", (error) => {
      this.streamError ??= error;
    });
  }
  /**
   * Create a logger and write the header comment lines.
   *
   * @param {object} options - `filePath`, `targetName`, `format`, optional `evalCaseId` and `attempt`.
   * @returns {Promise<object>} The ready logger.
   */
  static async create(options) {
    const logger = new _ClaudeCodeStreamLogger(options.filePath, options.format);
    const header = [
      "# Claude Code CLI stream log",
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
      ""
      // Drop only the omitted optional lines; the trailing "" is the blank
      // separator after the header and must survive the filter.
    ].filter((line) => line !== void 0);
    logger.writeLines(header);
    return logger;
  }
  /** @param {string} chunk - Raw stdout text; complete lines are written immediately. */
  handleStdoutChunk(chunk) {
    this.stdoutBuffer += chunk;
    this.flushBuffer("stdout");
  }
  /** @param {string} chunk - Raw stderr text; complete lines are written immediately. */
  handleStderrChunk(chunk) {
    this.stderrBuffer += chunk;
    this.flushBuffer("stderr");
  }
  /**
   * Write any buffered text and end the stream.
   *
   * @returns {Promise<void>} Resolves once the stream has ended.
   * @throws {Error} The stream error, if the stream failed earlier or fails while ending.
   */
  async close() {
    this.flushBuffer("stdout");
    this.flushBuffer("stderr");
    this.flushRemainder();
    if (this.streamError) {
      throw this.streamError;
    }
    await new Promise((resolve, reject) => {
      this.stream.once("error", reject);
      this.stream.end(() => resolve());
    });
  }
  /** @param {string[]} lines - Lines written verbatim, each followed by a newline. */
  writeLines(lines) {
    for (const line of lines) {
      this.writeRaw(`${line}
`);
    }
  }
  /**
   * Write text to the stream unless the stream has already failed.
   *
   * @param {string} text - Text to append.
   */
  writeRaw(text) {
    if (this.streamError) {
      return;
    }
    this.stream.write(text);
  }
  /**
   * Format one line and write it followed by a newline; blank lines are skipped.
   *
   * @param {string} line - Raw line.
   * @param {"stdout" | "stderr"} source - Origin of the line.
   */
  writeFormatted(line, source) {
    const formatted = this.formatLine(line, source);
    if (formatted) {
      this.writeRaw(formatted);
      this.writeRaw("\n");
    }
  }
  /**
   * Write every complete line buffered for `source` and keep the unterminated tail.
   *
   * @param {"stdout" | "stderr"} source - Which buffer to drain.
   */
  flushBuffer(source) {
    const buffer = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
    const lines = buffer.split(/\r?\n/);
    // The last element is the text after the final newline (possibly empty).
    const remainder = lines.pop() ?? "";
    if (source === "stdout") {
      this.stdoutBuffer = remainder;
    } else {
      this.stderrBuffer = remainder;
    }
    for (const line of lines) {
      this.writeFormatted(line, source);
    }
  }
  /**
   * Prefix a line with the elapsed time and its source.
   *
   * @param {string} rawLine - Line to format.
   * @param {"stdout" | "stderr"} source - Origin of the line.
   * @returns {string | undefined} The formatted line, or `undefined` for a blank line.
   */
  formatLine(rawLine, source) {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return void 0;
    }
    const message = this.format === "json" ? formatClaudeCodeJsonLog(trimmed) : formatClaudeCodeLogMessage(trimmed, source);
    return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`;
  }
  /** Write the unterminated tails of both buffers (stdout first) and clear them. */
  flushRemainder() {
    const tails = [
      ["stdout", this.stdoutBuffer.trim()],
      ["stderr", this.stderrBuffer.trim()]
    ];
    for (const [source, tail] of tails) {
      if (tail.length > 0) {
        this.writeFormatted(tail, source);
      }
    }
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
  }
};
|
|
2391
|
+
/**
 * Report whether stream logging is switched off via `AGENTV_CLAUDE_CODE_STREAM_LOGS`.
 *
 * @returns {boolean} `true` when the variable, trimmed and lower-cased, is `"false"`, `"0"` or `"off"`;
 *   `false` when it is unset, empty, or any other value.
 */
function isClaudeCodeLogStreamingDisabled() {
  const raw = process.env.AGENTV_CLAUDE_CODE_STREAM_LOGS;
  if (!raw) {
    return false;
  }
  const offValues = ["false", "0", "off"];
  return offValues.includes(raw.trim().toLowerCase());
}
|
|
2399
|
+
/**
 * Builds a log file name of the form
 * `<iso-timestamp>_<target>_<evalId>[_attempt-N]_<8 uuid chars>.log`.
 *
 * @param {{evalCaseId?: string, attempt?: number}} request - Supplies the eval
 *   case id (falls back to "claude-code") and the zero-based attempt index.
 * @param {string} targetName - Target name, sanitized before use.
 * @returns {string} The generated file name.
 */
function buildLogFilename(request, targetName) {
  // ":" and "." in the ISO timestamp are swapped for "-".
  const stamp = new Date().toISOString().replace(/[:.]/g, "-");
  const evalPart = sanitizeForFilename(request.evalCaseId ?? "claude-code");
  // Attempts are zero-based on the request but shown one-based in the name.
  const attemptPart = request.attempt === void 0 ? "" : `_attempt-${request.attempt + 1}`;
  const targetPart = sanitizeForFilename(targetName);
  const nonce = randomUUID().slice(0, 8);
  return `${stamp}_${targetPart}_${evalPart}${attemptPart}_${nonce}.log`;
}
|
|
2406
|
+
/**
 * Replaces each run of characters outside `A-Z a-z 0-9 . _ -` with a single
 * underscore.
 *
 * @param {string} value - Text to sanitize.
 * @returns {string} The sanitized text, or "claude-code" when the result is
 *   empty.
 */
function sanitizeForFilename(value) {
  const cleaned = value.replace(/[^A-Za-z0-9._-]+/g, "_");
  if (cleaned.length === 0) {
    return "claude-code";
  }
  return cleaned;
}
|
|
2410
|
+
/**
 * Formats the whole seconds elapsed since `startedAt` as `MM:SS`, or as
 * `HH:MM:SS` once at least one hour has passed.
 *
 * @param {number} startedAt - Start time in epoch milliseconds.
 * @returns {string} Zero-padded elapsed time.
 */
function formatElapsed(startedAt) {
  const totalSeconds = Math.floor((Date.now() - startedAt) / 1e3);
  const twoDigits = (amount) => amount.toString().padStart(2, "0");
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor(totalSeconds % 3600 / 60);
  const seconds = totalSeconds % 60;
  const fields = hours > 0 ? [hours, minutes, seconds] : [minutes, seconds];
  return fields.map((field) => twoDigits(field)).join(":");
}
|
|
2420
|
+
/**
 * Produces the summary-mode text for one output line.
 *
 * @param {string} rawLine - Line of process output.
 * @param {string} source - "stderr" lines that cannot be summarized get a
 *   `stderr: ` prefix; any other source is returned unchanged.
 * @returns {string} The event summary when the line parses as JSON and a
 *   summary is available; otherwise the (possibly prefixed) raw line.
 */
function formatClaudeCodeLogMessage(rawLine, source) {
  const event = tryParseJsonValue(rawLine);
  const summary = event ? summarizeClaudeCodeEvent(event) : void 0;
  if (summary) {
    return summary;
  }
  return source === "stderr" ? `stderr: ${rawLine}` : rawLine;
}
|
|
2433
|
+
/**
 * Pretty-prints a JSON output line with two-space indentation.
 *
 * @param {string} rawLine - Line of process output.
 * @returns {string} The re-serialized JSON, or `rawLine` unchanged when it
 *   does not parse, parses to a falsy value, or cannot be serialized.
 */
function formatClaudeCodeJsonLog(rawLine) {
  const value = tryParseJsonValue(rawLine);
  if (value) {
    try {
      return JSON.stringify(value, null, 2);
    } catch {
      // Serialization failed; fall through to the raw line.
    }
  }
  return rawLine;
}
|
|
2444
|
+
/**
 * Builds a short one-line description of a parsed stream event.
 *
 * Only the first content block of an assistant/user message is inspected.
 *
 * @param {unknown} event - Parsed JSON value.
 * @returns {string|undefined} undefined when `event` is not an object or has
 *   no non-empty string `type`; otherwise a summary:
 *   - "system"    -> "system: init"
 *   - "assistant" -> tool name, a text preview capped at 50 chars, or "assistant"
 *   - "user"      -> tool_result id, or "user"
 *   - "result"    -> cost and duration when both are numbers, else "result"
 *   - anything else -> the type string itself
 */
function summarizeClaudeCodeEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const kind = typeof event.type === "string" ? event.type : void 0;
  if (!kind) {
    return void 0;
  }
  if (kind === "system") {
    return "system: init";
  }
  if (kind === "result") {
    const { cost_usd: cost, duration_ms: duration } = event;
    if (typeof cost === "number" && typeof duration === "number") {
      return `result: $${cost.toFixed(4)}, ${Math.round(duration)}ms`;
    }
    return "result";
  }
  if (kind !== "assistant" && kind !== "user") {
    return kind;
  }

  // Both message-bearing kinds are summarized from their first content block.
  const payload = event.message;
  const blocks = payload ? payload.content : void 0;
  const head = Array.isArray(blocks) && blocks.length > 0 ? blocks[0] : void 0;

  if (kind === "assistant") {
    if (head?.type === "tool_use") {
      return `assistant: tool_use (${head.name})`;
    }
    if (head?.type === "text" && typeof head.text === "string") {
      const preview = head.text.length > 50 ? `${head.text.slice(0, 50)}...` : head.text;
      return `assistant: ${preview}`;
    }
    return "assistant";
  }

  if (head?.type === "tool_result") {
    return `user: tool_result (${head.tool_use_id})`;
  }
  return "user";
}
|
|
2501
|
+
/**
 * Parses `rawLine` as JSON without throwing.
 *
 * @param {string} rawLine - Candidate JSON text.
 * @returns {unknown} The parsed value, or undefined when parsing fails.
 */
function tryParseJsonValue(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    value = void 0;
  }
  return value;
}
|
|
2508
|
+
/**
 * Parses newline-delimited JSON output into an array of values.
 *
 * Blank lines and lines that are not valid JSON are skipped.
 *
 * @param {string} output - Raw CLI output.
 * @returns {unknown[]} Parsed values in input order.
 * @throws {Error} When the output is blank, or when no line parses as JSON.
 */
function parseClaudeCodeJsonl(output) {
  const body = output.trim();
  if (body.length === 0) {
    throw new Error("Claude Code CLI produced no output");
  }
  const events = [];
  for (const rawLine of body.split(/\r?\n/)) {
    const candidate = rawLine.trim();
    if (candidate.length === 0) {
      continue;
    }
    try {
      events.push(JSON.parse(candidate));
    } catch {
      // Non-JSON lines are ignored on purpose.
    }
  }
  if (events.length === 0) {
    throw new Error("Claude Code CLI produced no valid JSON output");
  }
  return events;
}
|
|
2526
|
+
/**
 * Converts the assistant and user events of a parsed stream into output
 * messages, preserving event order.
 *
 * @param {unknown[]} events - Parsed stream events.
 * @returns {object[]} One converted message per assistant/user event that
 *   carries a truthy `message`.
 */
function extractOutputMessages(events) {
  const collected = [];
  for (const event of events) {
    if (!event || typeof event !== "object") {
      continue;
    }
    const { type, message } = event;
    if (type !== "assistant" && type !== "user") {
      continue;
    }
    if (!message) {
      continue;
    }
    const converted = convertClaudeCodeMessage(message, type);
    if (converted) {
      collected.push(converted);
    }
  }
  return collected;
}
|
|
2546
|
+
/**
 * Maps one stream message to `{ role, content, toolCalls }`.
 *
 * @param {{content?: unknown}} message - Message payload of the event.
 * @param {string} type - Event type; anything other than "assistant" yields
 *   the "user" role.
 * @returns {{role: string, content: (string|undefined), toolCalls: (object[]|undefined)}}
 *   `toolCalls` is undefined when no tool calls were found.
 */
function convertClaudeCodeMessage(message, type) {
  const text = extractTextContent(message.content);
  const calls = extractToolCalls(message.content);
  return {
    role: type === "assistant" ? "assistant" : "user",
    content: text,
    toolCalls: calls.length > 0 ? calls : void 0
  };
}
|
|
2556
|
+
/**
 * Extracts the text of a message's content.
 *
 * @param {unknown} content - Either a string or an array of content blocks.
 * @returns {string|undefined} The string itself; or the `text` of every
 *   `{type: "text"}` block joined with newlines; or undefined when `content`
 *   is neither a string nor an array, or holds no text blocks.
 */
function extractTextContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const pieces = content
    .filter((block) => block && typeof block === "object" && block.type === "text" && typeof block.text === "string")
    .map((block) => block.text);
  return pieces.length > 0 ? pieces.join("\n") : void 0;
}
|
|
2575
|
+
/**
 * Collects tool-related blocks from a message's content array.
 *
 * @param {unknown} content - Message content; non-arrays yield no calls.
 * @returns {object[]} In block order:
 *   - for each `tool_use` block with a string `name`:
 *     `{ tool, input, id }` (`id` is undefined unless it is a string);
 *   - for each `tool_result` block with a string `tool_use_id`:
 *     `{ tool: "tool_result", output, id }`.
 */
function extractToolCalls(content) {
  if (!Array.isArray(content)) {
    return [];
  }
  const calls = [];
  for (const block of content) {
    if (!block || typeof block !== "object") {
      continue;
    }
    switch (block.type) {
      case "tool_use":
        if (typeof block.name === "string") {
          calls.push({
            tool: block.name,
            input: block.input,
            id: typeof block.id === "string" ? block.id : void 0
          });
        }
        break;
      case "tool_result":
        if (typeof block.tool_use_id === "string") {
          calls.push({
            tool: "tool_result",
            output: block.content,
            id: block.tool_use_id
          });
        }
        break;
      default:
        break;
    }
  }
  return calls;
}
|
|
2602
|
+
/**
 * Reads usage figures from the last `result` event in the stream.
 *
 * Only that last `result` event is consulted; earlier ones are never used,
 * even when the last one carries no recognised fields.
 *
 * @param {unknown[]} events - Parsed stream events.
 * @returns {object|undefined} An object holding whichever of `cost_usd`,
 *   `duration_ms`, `duration_api_ms`, `input_tokens`, `output_tokens`
 *   (numbers) and `session_id` (string) are present, or undefined when there
 *   is no `result` event or it has none of those fields.
 */
function extractUsage(events) {
  const numericFields = ["cost_usd", "duration_ms", "duration_api_ms", "input_tokens", "output_tokens"];
  for (let idx = events.length - 1; idx >= 0; idx--) {
    const candidate = events[idx];
    if (!candidate || typeof candidate !== "object" || candidate.type !== "result") {
      continue;
    }
    const usage = {};
    for (const field of numericFields) {
      if (typeof candidate[field] === "number") {
        usage[field] = candidate[field];
      }
    }
    if (typeof candidate.session_id === "string") {
      usage.session_id = candidate.session_id;
    }
    return Object.keys(usage).length > 0 ? usage : void 0;
  }
  return void 0;
}
|
|
2635
|
+
/**
 * Chooses the text to show as failure detail, preferring stderr.
 *
 * @param {string} stderr - Captured standard error.
 * @param {string} stdout - Captured standard output; consulted only when
 *   stderr is blank.
 * @returns {string|undefined} The first non-blank stream, trimmed, or
 *   undefined when both are blank.
 */
function pickDetail(stderr, stdout) {
  for (const stream of [stderr, stdout]) {
    const text = stream.trim();
    if (text.length > 0) {
      return text;
    }
  }
  return void 0;
}
|
|
2643
|
+
/**
 * Formats a timeout as a message suffix such as " after 30s".
 *
 * @param {number|undefined} timeoutMs - Timeout in milliseconds.
 * @returns {string} " after Ns" with the seconds rounded up, or "" when the
 *   timeout is falsy or not positive.
 */
function formatTimeoutSuffix(timeoutMs) {
  if (timeoutMs && !(timeoutMs <= 0)) {
    return ` after ${Math.ceil(timeoutMs / 1e3)}s`;
  }
  return "";
}
|
|
2650
|
+
/**
 * Scans JSONL output for the combination of a `system` event whose
 * `apiKeySource` is "ANTHROPIC_API_KEY" and an error indication (an event
 * with `error === "authentication_failed"`, or a `result` event with a truthy
 * `is_error`).
 *
 * @param {string} stdout - Raw CLI standard output.
 * @returns {boolean} true only when both signals appear; false otherwise,
 *   including when `stdout` cannot be processed at all.
 */
function isNestedClaudeCodeAuthError(stdout) {
  try {
    let sawApiKeySource = false;
    let sawAuthFailure = false;
    for (const rawLine of stdout.split("\n")) {
      const text = rawLine.trim();
      if (!text) continue;
      let event;
      try {
        event = JSON.parse(text);
      } catch {
        // Not JSON: irrelevant to detection.
        continue;
      }
      // A literal `null` line has no properties to inspect.
      if (event === null) continue;
      if (event.type === "system" && event.apiKeySource === "ANTHROPIC_API_KEY") {
        sawApiKeySource = true;
      }
      if (event.error === "authentication_failed" || event.type === "result" && event.is_error) {
        sawAuthFailure = true;
      }
    }
    return sawApiKeySource && sawAuthFailure;
  } catch {
    return false;
  }
}
|
|
2674
|
+
/**
 * Wraps `arg` in single quotes for use in a shell command line, rewriting each
 * embedded single quote as `'\''` (close quote, escaped quote, reopen quote).
 *
 * @param {string} arg - Argument text.
 * @returns {string} The single-quoted argument.
 */
function escapeShellArg(arg) {
  const escaped = arg.replaceAll("'", "'\\''");
  return `'${escaped}'`;
}
|
|
2677
|
+
/**
 * Runs the CLI through `runClaudeCodeWithTempFiles`, using four uniquely named
 * files in the OS temp directory (stdout, stderr, exit, pid) and removing
 * them afterwards whether or not the run succeeded.
 *
 * @param {object} options - Passed through unchanged to the runner.
 * @returns {Promise<object>} Whatever `runClaudeCodeWithTempFiles` resolves to.
 */
async function defaultClaudeCodeRunner(options) {
  const runId = randomUUID();
  const tempPathFor = (suffix) => path8.join(tmpdir(), `agentv-cc-${runId}-${suffix}`);
  const stdoutFile = tempPathFor("stdout");
  const stderrFile = tempPathFor("stderr");
  const exitFile = tempPathFor("exit");
  const pidFile = tempPathFor("pid");
  const tempFiles = [stdoutFile, stderrFile, exitFile, pidFile];
  try {
    return await runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile);
  } finally {
    for (const tempFile of tempFiles) {
      try {
        await rm(tempFile, { force: true });
      } catch {
        // Cleanup is best-effort; a failed removal must not mask the run result.
      }
    }
  }
}
|
|
2694
|
+
/**
 * Launches the CLI as a detached `setsid bash -c <script>` process and polls
 * temp files for its output and exit status.
 *
 * The bash script unsets CLAUDECODE and CLAUDE_CODE_ENTRYPOINT, starts the
 * command in the background with stdout/stderr redirected to `stdoutFile` /
 * `stderrFile`, writes the background PID to `pidFile`, waits for it, and
 * finally writes its exit status to `exitFile`. The presence of `exitFile`
 * is what signals completion to the polling loop.
 *
 * @param {object} options
 * @param {string} options.executable - Command, split on whitespace into the
 *   program and any leading arguments.
 * @param {string[]} options.args - Further arguments.
 * @param {string} [options.cwd] - Working directory for the process.
 * @param {object} [options.env] - Environment for the process.
 * @param {number} [options.timeoutMs] - Kill the process after this long.
 * @param {AbortSignal} [options.signal] - Kill the process when aborted.
 * @param {(chunk: string) => void} [options.onStdoutChunk] - Receives newly
 *   appended stdout text on each poll.
 * @param {(chunk: string) => void} [options.onStderrChunk] - Receives the
 *   whole stderr text once, after the loop ends.
 * @param {string} stdoutFile - File receiving the command's stdout.
 * @param {string} stderrFile - File receiving the command's stderr.
 * @param {string} exitFile - File receiving the command's exit status.
 * @param {string} pidFile - File receiving the command's PID.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number, timedOut: boolean}>}
 *   `exitCode` is -1 when no exit status was recorded.
 * @throws {Error} The spawn error, when the wrapper process cannot be started.
 */
async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile) {
  // Check for cancellation BEFORE spawning. Once the detached process exists
  // it can only be stopped through the PID file, which is normally not
  // written yet this early, so an abort seen after spawning leaves it running.
  if (options.signal?.aborted) {
    return { stdout: "", stderr: "Aborted", exitCode: -1, timedOut: false };
  }

  // Resolved once instead of on every poll iteration.
  const { readFile, access } = await import("node:fs/promises");

  const [executable, ...executableArgs] = options.executable.split(/\s+/);
  const fullCommand = [executable, ...executableArgs, ...options.args].map((arg) => escapeShellArg(arg)).join(" ");
  const bashScript = [
    "unset CLAUDECODE CLAUDE_CODE_ENTRYPOINT 2>/dev/null",
    `${fullCommand} >${escapeShellArg(stdoutFile)} 2>${escapeShellArg(stderrFile)} &`,
    "CHILD_PID=$!",
    `echo $CHILD_PID > ${escapeShellArg(pidFile)}`,
    "wait $CHILD_PID",
    `echo $? > ${escapeShellArg(exitFile)}`
  ].join("\n");

  const child = spawn("setsid", ["bash", "-c", bashScript], {
    cwd: options.cwd,
    env: options.env,
    detached: true,
    stdio: "ignore"
  });
  // A failed spawn is reported through the 'error' event; without a listener
  // Node raises it as an uncaught exception. Capture it so the polling loop
  // can surface it as a normal rejection.
  let spawnError;
  child.once("error", (error) => {
    spawnError = error;
  });
  child.unref();

  const pollInterval = 100;
  const startTime = Date.now();
  let timedOut = false;
  let lastStdoutSize = 0;

  // Missing or unreadable files are treated as empty.
  const readFileIfExists = async (filePath) => {
    try {
      return await readFile(filePath, "utf8");
    } catch {
      return "";
    }
  };
  const fileExists4 = async (filePath) => {
    try {
      await access(filePath);
      return true;
    } catch {
      return false;
    }
  };
  const killProcess = async () => {
    try {
      const pid = await readFileIfExists(pidFile);
      if (pid.trim()) {
        process.kill(Number.parseInt(pid.trim(), 10), "SIGTERM");
      }
    } catch {
      // Best-effort: the process may already have exited.
    }
  };

  const abortHandler = () => {
    killProcess().catch(() => {
    });
  };
  options.signal?.addEventListener("abort", abortHandler, { once: true });
  try {
    while (true) {
      if (spawnError) {
        throw spawnError;
      }
      if (options.timeoutMs && Date.now() - startTime > options.timeoutMs) {
        timedOut = true;
        await killProcess();
        break;
      }
      if (options.signal?.aborted) {
        await killProcess();
        break;
      }
      if (options.onStdoutChunk) {
        // Forward only the text appended since the previous poll.
        const currentStdout = await readFileIfExists(stdoutFile);
        if (currentStdout.length > lastStdoutSize) {
          options.onStdoutChunk(currentStdout.slice(lastStdoutSize));
          lastStdoutSize = currentStdout.length;
        }
      }
      if (await fileExists4(exitFile)) {
        break;
      }
      await new Promise((resolve) => setTimeout(resolve, pollInterval));
    }
    const stdout = await readFileIfExists(stdoutFile);
    const stderr = await readFileIfExists(stderrFile);
    const exitCodeStr = await readFileIfExists(exitFile);
    const exitCode = exitCodeStr.trim() ? Number.parseInt(exitCodeStr.trim(), 10) : -1;
    // Deliver any stdout written between the last poll and completion.
    if (options.onStdoutChunk && stdout.length > lastStdoutSize) {
      options.onStdoutChunk(stdout.slice(lastStdoutSize));
    }
    if (options.onStderrChunk && stderr) {
      options.onStderrChunk(stderr);
    }
    return { stdout, stderr, exitCode, timedOut };
  } finally {
    options.signal?.removeEventListener("abort", abortHandler);
  }
}
|
|
2793
|
+
|
|
2794
|
+
// src/evaluation/providers/cli.ts
|
|
2795
|
+
import { exec as execWithCallback } from "node:child_process";
|
|
2796
|
+
import fs from "node:fs/promises";
|
|
2797
|
+
import os from "node:os";
|
|
2798
|
+
import path9 from "node:path";
|
|
2799
|
+
import { promisify } from "node:util";
|
|
2800
|
+
import { z } from "zod";
|
|
2801
|
+
// Schema for one tool call entry: `tool` is a required string; `input` and
// `output` accept any value; `id` and `timestamp` are optional strings.
var ToolCallSchema = z.object({
|
|
2802
|
+
tool: z.string(),
|
|
2803
|
+
input: z.unknown().optional(),
|
|
2804
|
+
output: z.unknown().optional(),
|
|
2805
|
+
id: z.string().optional(),
|
|
2806
|
+
timestamp: z.string().optional()
|
|
2807
|
+
});
|
|
2808
|
+
// Schema for one output message in its snake_case input form: `role` is the
// only required field; `tool_calls` is an optional array of ToolCallSchema.
var OutputMessageInputSchema = z.object({
|
|
2809
|
+
role: z.string(),
|
|
2810
|
+
name: z.string().optional(),
|
|
2811
|
+
content: z.unknown().optional(),
|
|
2812
|
+
tool_calls: z.array(ToolCallSchema).optional(),
|
|
2813
|
+
timestamp: z.string().optional(),
|
|
2814
|
+
metadata: z.record(z.unknown()).optional()
|
|
2815
|
+
});
|
|
2816
|
+
// Schema for token counts: `input` and `output` are required numbers,
// `cached` is optional.
var TokenUsageSchema = z.object({
|
|
2817
|
+
input: z.number(),
|
|
2818
|
+
output: z.number(),
|
|
2819
|
+
cached: z.number().optional()
|
|
2820
|
+
});
|
|
2821
|
+
// Schema for a whole CLI output object; every field is optional.
var CliOutputSchema = z.object({
|
|
2822
|
+
text: z.unknown().optional(),
|
|
2823
|
+
output_messages: z.array(OutputMessageInputSchema).optional(),
|
|
2824
|
+
token_usage: TokenUsageSchema.optional(),
|
|
2825
|
+
cost_usd: z.number().optional(),
|
|
2826
|
+
duration_ms: z.number().optional()
|
|
2827
|
+
});
|
|
2828
|
+
// Schema for one batch JSONL record: CliOutputSchema plus a required
// non-empty string `id`.
var CliJsonlRecordSchema = CliOutputSchema.extend({
|
|
2829
|
+
id: z.string().min(1)
|
|
2830
|
+
});
|
|
2831
|
+
/**
 * Discards negative cost/duration figures, logging a warning for each one
 * dropped.
 *
 * @param {number|undefined} costUsd - Reported cost.
 * @param {number|undefined} durationMs - Reported duration.
 * @param {string} context - Text included in the warning to identify the
 *   record being checked.
 * @returns {{costUsd: (number|undefined), durationMs: (number|undefined)}}
 *   The inputs, with any negative value replaced by undefined.
 */
function validateMetrics(costUsd, durationMs, context) {
  const costIsNegative = costUsd !== void 0 && costUsd < 0;
  if (costIsNegative) {
    console.warn(`[cli-provider] ${context}: ignoring negative cost_usd value (${costUsd})`);
  }
  const durationIsNegative = durationMs !== void 0 && durationMs < 0;
  if (durationIsNegative) {
    console.warn(`[cli-provider] ${context}: ignoring negative duration_ms value (${durationMs})`);
  }
  return {
    costUsd: costIsNegative ? void 0 : costUsd,
    durationMs: durationIsNegative ? void 0 : durationMs
  };
}
|
|
2844
|
+
/**
 * Converts validated output messages from their snake_case input form to the
 * camelCase form (`tool_calls` becomes `toolCalls`; other fields keep their
 * names).
 *
 * @param {object[]|undefined} messages - Messages to convert.
 * @returns {object[]|undefined} Converted messages, or undefined when the
 *   input is missing or empty.
 */
function convertOutputMessages(messages) {
  if (!messages || messages.length === 0) {
    return void 0;
  }
  return messages.map(({ role, name, content, tool_calls: toolCalls, timestamp, metadata }) => ({
    role,
    name,
    content,
    toolCalls,
    timestamp,
    metadata
  }));
}
|
|
2857
|
+
// Promise-returning form of child_process.exec, used by defaultCommandRunner.
var execAsync = promisify(execWithCallback);
|
|
2858
|
+
// 10 MiB; passed to exec as `maxBuffer` by defaultCommandRunner.
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
|
|
2859
|
+
/**
 * Runs a shell command through `execAsync` and reports the outcome as a plain
 * result object instead of throwing.
 *
 * On Windows the command is run with "powershell.exe" as the shell; elsewhere
 * the shell option is left undefined.
 *
 * @param {string} command - Command line to execute.
 * @param {{cwd?: string, env?: object, timeoutMs?: number, signal?: AbortSignal}} options
 *   Execution settings forwarded to exec.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: (number|null), failed: boolean, timedOut: boolean, signal: (string|null)}>}
 *   On failure `exitCode` is the error's numeric `code` (else null) and
 *   `timedOut` is true when the error is flagged `timedOut` or `killed`.
 */
async function defaultCommandRunner(command, options) {
  const { cwd, env, timeoutMs, signal } = options;
  const shell = process.platform === "win32" ? "powershell.exe" : void 0;
  try {
    const completed = await execAsync(command, {
      cwd,
      env,
      timeout: timeoutMs,
      signal,
      maxBuffer: DEFAULT_MAX_BUFFER,
      shell
    });
    return {
      stdout: completed.stdout,
      stderr: completed.stderr,
      exitCode: 0,
      failed: false,
      timedOut: false,
      signal: null
    };
  } catch (error) {
    const { stdout, stderr, code, timedOut, killed, signal: terminatingSignal } = error;
    return {
      stdout: stdout ?? "",
      stderr: stderr ?? "",
      exitCode: typeof code === "number" ? code : null,
      failed: true,
      timedOut: timedOut === true || killed === true,
      signal: terminatingSignal ?? null
    };
  }
}
|
|
2890
|
+
var CliProvider = class {
|
|
2891
|
+
id;
|
|
2892
|
+
kind = "cli";
|
|
2893
|
+
targetName;
|
|
2894
|
+
supportsBatch = true;
|
|
2895
|
+
config;
|
|
2896
|
+
runCommand;
|
|
2897
|
+
verbose;
|
|
2898
|
+
keepTempFiles;
|
|
2899
|
+
healthcheckPromise;
|
|
2900
|
+
constructor(targetName, config, runner = defaultCommandRunner) {
|
|
2901
|
+
this.targetName = targetName;
|
|
2902
|
+
this.id = `cli:${targetName}`;
|
|
2903
|
+
this.config = config;
|
|
2904
|
+
this.runCommand = runner;
|
|
2905
|
+
this.verbose = config.verbose ?? false;
|
|
2906
|
+
this.keepTempFiles = config.keepTempFiles ?? false;
|
|
2907
|
+
}
|
|
2908
|
+
async invoke(request) {
|
|
2909
|
+
if (request.signal?.aborted) {
|
|
2910
|
+
throw new Error("CLI provider request was aborted before execution");
|
|
2911
|
+
}
|
|
2912
|
+
await this.ensureHealthy(request.signal);
|
|
2913
|
+
const outputFilePath = generateOutputFilePath(request.evalCaseId);
|
|
2914
|
+
const templateValues = buildTemplateValues(request, this.config, outputFilePath);
|
|
2915
|
+
const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
|
|
2916
|
+
if (this.verbose) {
|
|
2917
|
+
console.log(
|
|
2918
|
+
`[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
2919
|
+
);
|
|
2920
|
+
}
|
|
2921
|
+
const startTime = Date.now();
|
|
2922
|
+
const result = await this.runCommand(renderedCommand, {
|
|
2923
|
+
cwd: this.config.cwd,
|
|
2924
|
+
env: process.env,
|
|
2925
|
+
timeoutMs: this.config.timeoutMs,
|
|
2926
|
+
signal: request.signal
|
|
2927
|
+
});
|
|
2928
|
+
const measuredDurationMs = Date.now() - startTime;
|
|
2929
|
+
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
2930
|
+
if (request.signal?.aborted) {
|
|
2931
|
+
throw new Error("CLI provider request was aborted");
|
|
1834
2932
|
}
|
|
1835
2933
|
if (result.timedOut) {
|
|
1836
2934
|
throw new Error(
|
|
1837
|
-
`CLI provider timed out${
|
|
2935
|
+
`CLI provider timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
|
|
1838
2936
|
);
|
|
1839
2937
|
}
|
|
1840
2938
|
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
@@ -1910,7 +3008,7 @@ var CliProvider = class {
|
|
|
1910
3008
|
}
|
|
1911
3009
|
if (result.timedOut) {
|
|
1912
3010
|
throw new Error(
|
|
1913
|
-
`CLI provider timed out${
|
|
3011
|
+
`CLI provider timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
|
|
1914
3012
|
);
|
|
1915
3013
|
}
|
|
1916
3014
|
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
@@ -1920,11 +3018,6 @@ var CliProvider = class {
|
|
|
1920
3018
|
}
|
|
1921
3019
|
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
1922
3020
|
const recordsById = this.parseJsonlBatchOutput(responseContent);
|
|
1923
|
-
const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
|
|
1924
|
-
const missingIds = requestedIds.filter((id) => !recordsById.has(id));
|
|
1925
|
-
if (missingIds.length > 0) {
|
|
1926
|
-
throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
|
|
1927
|
-
}
|
|
1928
3021
|
const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
|
|
1929
3022
|
const responses = requests.map((request) => {
|
|
1930
3023
|
const evalCaseId = request.evalCaseId;
|
|
@@ -1943,15 +3036,20 @@ var CliProvider = class {
|
|
|
1943
3036
|
}
|
|
1944
3037
|
const parsed = recordsById.get(evalCaseId);
|
|
1945
3038
|
if (!parsed) {
|
|
3039
|
+
const errorMessage = `Batch output missing id '${evalCaseId}'`;
|
|
3040
|
+
if (this.verbose) {
|
|
3041
|
+
console.warn(`[cli-provider:${this.targetName}] ${errorMessage}`);
|
|
3042
|
+
}
|
|
1946
3043
|
return {
|
|
1947
|
-
outputMessages: [],
|
|
3044
|
+
outputMessages: [{ role: "assistant", content: `Error: ${errorMessage}` }],
|
|
1948
3045
|
durationMs: perRequestFallbackMs,
|
|
1949
3046
|
raw: {
|
|
1950
3047
|
command: renderedCommand,
|
|
1951
3048
|
stderr: result.stderr,
|
|
1952
3049
|
exitCode: result.exitCode ?? 0,
|
|
1953
3050
|
cwd: this.config.cwd,
|
|
1954
|
-
outputFile: outputFilePath
|
|
3051
|
+
outputFile: outputFilePath,
|
|
3052
|
+
error: errorMessage
|
|
1955
3053
|
}
|
|
1956
3054
|
};
|
|
1957
3055
|
}
|
|
@@ -1984,101 +3082,37 @@ var CliProvider = class {
|
|
|
1984
3082
|
* - duration_ms: number
|
|
1985
3083
|
*/
|
|
1986
3084
|
parseOutputContent(content) {
|
|
3085
|
+
let parsed;
|
|
1987
3086
|
try {
|
|
1988
|
-
|
|
1989
|
-
if (typeof parsed === "object" && parsed !== null) {
|
|
1990
|
-
const obj = parsed;
|
|
1991
|
-
const tokenUsage = this.parseTokenUsage(obj.token_usage);
|
|
1992
|
-
const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
|
|
1993
|
-
const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
|
|
1994
|
-
const outputMessages = this.parseOutputMessages(obj.output_messages);
|
|
1995
|
-
if (outputMessages && outputMessages.length > 0) {
|
|
1996
|
-
return { outputMessages, tokenUsage, costUsd, durationMs };
|
|
1997
|
-
}
|
|
1998
|
-
if ("text" in obj) {
|
|
1999
|
-
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
2000
|
-
return {
|
|
2001
|
-
outputMessages: [{ role: "assistant", content: text }],
|
|
2002
|
-
tokenUsage,
|
|
2003
|
-
costUsd,
|
|
2004
|
-
durationMs
|
|
2005
|
-
};
|
|
2006
|
-
}
|
|
2007
|
-
}
|
|
3087
|
+
parsed = JSON.parse(content);
|
|
2008
3088
|
} catch {
|
|
3089
|
+
return { outputMessages: [{ role: "assistant", content }] };
|
|
2009
3090
|
}
|
|
2010
|
-
|
|
2011
|
-
|
|
2012
|
-
|
|
2013
|
-
* Parse token_usage from CLI output.
|
|
2014
|
-
*/
|
|
2015
|
-
parseTokenUsage(tokenUsage) {
|
|
2016
|
-
if (typeof tokenUsage !== "object" || tokenUsage === null) {
|
|
2017
|
-
return void 0;
|
|
2018
|
-
}
|
|
2019
|
-
const obj = tokenUsage;
|
|
2020
|
-
if (typeof obj.input !== "number" || typeof obj.output !== "number") {
|
|
2021
|
-
return void 0;
|
|
2022
|
-
}
|
|
2023
|
-
return {
|
|
2024
|
-
input: obj.input,
|
|
2025
|
-
output: obj.output,
|
|
2026
|
-
cached: typeof obj.cached === "number" ? obj.cached : void 0
|
|
2027
|
-
};
|
|
2028
|
-
}
|
|
2029
|
-
/**
|
|
2030
|
-
* Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
|
|
2031
|
-
*/
|
|
2032
|
-
parseOutputMessages(outputMessages) {
|
|
2033
|
-
if (!Array.isArray(outputMessages)) {
|
|
2034
|
-
return void 0;
|
|
3091
|
+
const result = CliOutputSchema.safeParse(parsed);
|
|
3092
|
+
if (!result.success) {
|
|
3093
|
+
return { outputMessages: [{ role: "assistant", content }] };
|
|
2035
3094
|
}
|
|
2036
|
-
const
|
|
2037
|
-
|
|
2038
|
-
|
|
2039
|
-
|
|
2040
|
-
|
|
2041
|
-
|
|
2042
|
-
|
|
2043
|
-
|
|
2044
|
-
|
|
2045
|
-
const message = {
|
|
2046
|
-
role: rawMsg.role,
|
|
2047
|
-
name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
|
|
2048
|
-
content: rawMsg.content,
|
|
2049
|
-
toolCalls: this.parseToolCalls(rawMsg.tool_calls),
|
|
2050
|
-
timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
|
|
2051
|
-
metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
|
|
3095
|
+
const obj = result.data;
|
|
3096
|
+
const metrics = validateMetrics(obj.cost_usd, obj.duration_ms, "parsing output");
|
|
3097
|
+
const outputMessages = convertOutputMessages(obj.output_messages);
|
|
3098
|
+
if (outputMessages && outputMessages.length > 0) {
|
|
3099
|
+
return {
|
|
3100
|
+
outputMessages,
|
|
3101
|
+
tokenUsage: obj.token_usage,
|
|
3102
|
+
costUsd: metrics.costUsd,
|
|
3103
|
+
durationMs: metrics.durationMs
|
|
2052
3104
|
};
|
|
2053
|
-
messages.push(message);
|
|
2054
|
-
}
|
|
2055
|
-
return messages.length > 0 ? messages : void 0;
|
|
2056
|
-
}
|
|
2057
|
-
/**
|
|
2058
|
-
* Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
|
|
2059
|
-
*/
|
|
2060
|
-
parseToolCalls(toolCalls) {
|
|
2061
|
-
if (!Array.isArray(toolCalls)) {
|
|
2062
|
-
return void 0;
|
|
2063
3105
|
}
|
|
2064
|
-
|
|
2065
|
-
|
|
2066
|
-
|
|
2067
|
-
|
|
2068
|
-
|
|
2069
|
-
|
|
2070
|
-
|
|
2071
|
-
|
|
2072
|
-
}
|
|
2073
|
-
calls.push({
|
|
2074
|
-
tool: rawCall.tool,
|
|
2075
|
-
input: rawCall.input,
|
|
2076
|
-
output: rawCall.output,
|
|
2077
|
-
id: typeof rawCall.id === "string" ? rawCall.id : void 0,
|
|
2078
|
-
timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
|
|
2079
|
-
});
|
|
3106
|
+
if (obj.text !== void 0) {
|
|
3107
|
+
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
3108
|
+
return {
|
|
3109
|
+
outputMessages: [{ role: "assistant", content: text }],
|
|
3110
|
+
tokenUsage: obj.token_usage,
|
|
3111
|
+
costUsd: metrics.costUsd,
|
|
3112
|
+
durationMs: metrics.durationMs
|
|
3113
|
+
};
|
|
2080
3114
|
}
|
|
2081
|
-
return
|
|
3115
|
+
return { outputMessages: [{ role: "assistant", content }] };
|
|
2082
3116
|
}
|
|
2083
3117
|
parseJsonlBatchOutput(content) {
|
|
2084
3118
|
const records = /* @__PURE__ */ new Map();
|
|
@@ -2091,33 +3125,32 @@ var CliProvider = class {
|
|
|
2091
3125
|
const reason = error instanceof Error ? error.message : String(error);
|
|
2092
3126
|
throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
|
|
2093
3127
|
}
|
|
2094
|
-
|
|
3128
|
+
const result = CliJsonlRecordSchema.safeParse(parsed);
|
|
3129
|
+
if (!result.success) {
|
|
3130
|
+
const firstError = result.error.errors[0];
|
|
3131
|
+
if (firstError?.path.includes("id")) {
|
|
3132
|
+
throw new Error("CLI batch output JSONL line missing required string field: id");
|
|
3133
|
+
}
|
|
2095
3134
|
throw new Error("CLI batch output JSONL line must be an object");
|
|
2096
3135
|
}
|
|
2097
|
-
const obj =
|
|
2098
|
-
|
|
2099
|
-
|
|
2100
|
-
|
|
2101
|
-
|
|
2102
|
-
|
|
2103
|
-
|
|
2104
|
-
|
|
2105
|
-
const tokenUsage = this.parseTokenUsage(obj.token_usage);
|
|
2106
|
-
const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
|
|
2107
|
-
const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
|
|
2108
|
-
const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
|
|
2109
|
-
let outputMessages;
|
|
2110
|
-
if (parsedOutputMessages && parsedOutputMessages.length > 0) {
|
|
2111
|
-
outputMessages = parsedOutputMessages;
|
|
3136
|
+
const obj = result.data;
|
|
3137
|
+
if (records.has(obj.id)) {
|
|
3138
|
+
throw new Error(`CLI batch output contains duplicate id: ${obj.id}`);
|
|
3139
|
+
}
|
|
3140
|
+
const outputMessages = convertOutputMessages(obj.output_messages);
|
|
3141
|
+
let finalOutputMessages;
|
|
3142
|
+
if (outputMessages && outputMessages.length > 0) {
|
|
3143
|
+
finalOutputMessages = outputMessages;
|
|
2112
3144
|
} else {
|
|
2113
3145
|
const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
|
|
2114
|
-
|
|
2115
|
-
}
|
|
2116
|
-
|
|
2117
|
-
|
|
2118
|
-
|
|
2119
|
-
|
|
2120
|
-
|
|
3146
|
+
finalOutputMessages = text ? [{ role: "assistant", content: text }] : [];
|
|
3147
|
+
}
|
|
3148
|
+
const metrics = validateMetrics(obj.cost_usd, obj.duration_ms, `batch record '${obj.id}'`);
|
|
3149
|
+
records.set(obj.id, {
|
|
3150
|
+
outputMessages: finalOutputMessages,
|
|
3151
|
+
tokenUsage: obj.token_usage,
|
|
3152
|
+
costUsd: metrics.costUsd,
|
|
3153
|
+
durationMs: metrics.durationMs
|
|
2121
3154
|
});
|
|
2122
3155
|
}
|
|
2123
3156
|
return records;
|
|
@@ -2203,7 +3236,7 @@ var CliProvider = class {
|
|
|
2203
3236
|
}
|
|
2204
3237
|
};
|
|
2205
3238
|
function buildTemplateValues(request, config, outputFilePath) {
|
|
2206
|
-
const inputFiles =
|
|
3239
|
+
const inputFiles = normalizeInputFiles2(request.inputFiles);
|
|
2207
3240
|
return {
|
|
2208
3241
|
PROMPT: shellEscape(request.question ?? ""),
|
|
2209
3242
|
GUIDELINES: shellEscape(request.guidelines ?? ""),
|
|
@@ -2213,13 +3246,13 @@ function buildTemplateValues(request, config, outputFilePath) {
|
|
|
2213
3246
|
OUTPUT_FILE: shellEscape(outputFilePath)
|
|
2214
3247
|
};
|
|
2215
3248
|
}
|
|
2216
|
-
function
|
|
3249
|
+
function normalizeInputFiles2(inputFiles) {
|
|
2217
3250
|
if (!inputFiles || inputFiles.length === 0) {
|
|
2218
3251
|
return void 0;
|
|
2219
3252
|
}
|
|
2220
3253
|
const unique = /* @__PURE__ */ new Map();
|
|
2221
3254
|
for (const inputFile of inputFiles) {
|
|
2222
|
-
const absolutePath =
|
|
3255
|
+
const absolutePath = path9.resolve(inputFile);
|
|
2223
3256
|
if (!unique.has(absolutePath)) {
|
|
2224
3257
|
unique.set(absolutePath, absolutePath);
|
|
2225
3258
|
}
|
|
@@ -2233,7 +3266,7 @@ function formatFileList(files, template) {
|
|
|
2233
3266
|
const formatter = template ?? "{path}";
|
|
2234
3267
|
return files.map((filePath) => {
|
|
2235
3268
|
const escapedPath = shellEscape(filePath);
|
|
2236
|
-
const escapedName = shellEscape(
|
|
3269
|
+
const escapedName = shellEscape(path9.basename(filePath));
|
|
2237
3270
|
return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
|
|
2238
3271
|
}).join(" ");
|
|
2239
3272
|
}
|
|
@@ -2257,9 +3290,9 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
|
|
|
2257
3290
|
const safeEvalId = evalCaseId || "unknown";
|
|
2258
3291
|
const timestamp = Date.now();
|
|
2259
3292
|
const random = Math.random().toString(36).substring(2, 9);
|
|
2260
|
-
return
|
|
3293
|
+
return path9.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
|
|
2261
3294
|
}
|
|
2262
|
-
function
|
|
3295
|
+
function formatTimeoutSuffix2(timeoutMs) {
|
|
2263
3296
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
2264
3297
|
return "";
|
|
2265
3298
|
}
|
|
@@ -2268,39 +3301,39 @@ function formatTimeoutSuffix(timeoutMs) {
|
|
|
2268
3301
|
}
|
|
2269
3302
|
|
|
2270
3303
|
// src/evaluation/providers/codex.ts
|
|
2271
|
-
import { exec as execCallback, spawn } from "node:child_process";
|
|
2272
|
-
import { randomUUID } from "node:crypto";
|
|
2273
|
-
import { constants as constants2, createWriteStream } from "node:fs";
|
|
2274
|
-
import { access as access2, mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
|
|
2275
|
-
import { tmpdir } from "node:os";
|
|
2276
|
-
import
|
|
3304
|
+
import { exec as execCallback, spawn as spawn2 } from "node:child_process";
|
|
3305
|
+
import { randomUUID as randomUUID2 } from "node:crypto";
|
|
3306
|
+
import { constants as constants2, createWriteStream as createWriteStream2 } from "node:fs";
|
|
3307
|
+
import { access as access2, mkdir as mkdir2, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile2 } from "node:fs/promises";
|
|
3308
|
+
import { tmpdir as tmpdir2 } from "node:os";
|
|
3309
|
+
import path10 from "node:path";
|
|
2277
3310
|
import { promisify as promisify2 } from "node:util";
|
|
2278
3311
|
|
|
2279
3312
|
// src/evaluation/providers/codex-log-tracker.ts
|
|
2280
|
-
var
|
|
2281
|
-
var
|
|
3313
|
+
var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.codexLogs");
|
|
3314
|
+
var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.codexLogSubscribers");
|
|
2282
3315
|
function getCodexLogStore() {
|
|
2283
3316
|
const globalObject = globalThis;
|
|
2284
|
-
const existing = globalObject[
|
|
3317
|
+
const existing = globalObject[GLOBAL_LOGS_KEY2];
|
|
2285
3318
|
if (existing) {
|
|
2286
3319
|
return existing;
|
|
2287
3320
|
}
|
|
2288
3321
|
const created = [];
|
|
2289
|
-
globalObject[
|
|
3322
|
+
globalObject[GLOBAL_LOGS_KEY2] = created;
|
|
2290
3323
|
return created;
|
|
2291
3324
|
}
|
|
2292
|
-
function
|
|
3325
|
+
function getSubscriberStore2() {
|
|
2293
3326
|
const globalObject = globalThis;
|
|
2294
|
-
const existing = globalObject[
|
|
3327
|
+
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
|
|
2295
3328
|
if (existing) {
|
|
2296
3329
|
return existing;
|
|
2297
3330
|
}
|
|
2298
3331
|
const created = /* @__PURE__ */ new Set();
|
|
2299
|
-
globalObject[
|
|
3332
|
+
globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
|
|
2300
3333
|
return created;
|
|
2301
3334
|
}
|
|
2302
|
-
function
|
|
2303
|
-
const subscribers = Array.from(
|
|
3335
|
+
function notifySubscribers2(entry) {
|
|
3336
|
+
const subscribers = Array.from(getSubscriberStore2());
|
|
2304
3337
|
for (const listener of subscribers) {
|
|
2305
3338
|
try {
|
|
2306
3339
|
listener(entry);
|
|
@@ -2312,7 +3345,7 @@ function notifySubscribers(entry) {
|
|
|
2312
3345
|
}
|
|
2313
3346
|
function recordCodexLogEntry(entry) {
|
|
2314
3347
|
getCodexLogStore().push(entry);
|
|
2315
|
-
|
|
3348
|
+
notifySubscribers2(entry);
|
|
2316
3349
|
}
|
|
2317
3350
|
function consumeCodexLogEntries() {
|
|
2318
3351
|
const store = getCodexLogStore();
|
|
@@ -2322,118 +3355,19 @@ function consumeCodexLogEntries() {
|
|
|
2322
3355
|
return store.splice(0, store.length);
|
|
2323
3356
|
}
|
|
2324
3357
|
function subscribeToCodexLogEntries(listener) {
|
|
2325
|
-
const store =
|
|
3358
|
+
const store = getSubscriberStore2();
|
|
2326
3359
|
store.add(listener);
|
|
2327
3360
|
return () => {
|
|
2328
3361
|
store.delete(listener);
|
|
2329
3362
|
};
|
|
2330
3363
|
}
|
|
2331
3364
|
|
|
2332
|
-
// src/evaluation/providers/preread.ts
|
|
2333
|
-
import path8 from "node:path";
|
|
2334
|
-
function buildPromptDocument(request, inputFiles, options) {
|
|
2335
|
-
const parts = [];
|
|
2336
|
-
const guidelineFiles = collectGuidelineFiles(
|
|
2337
|
-
inputFiles,
|
|
2338
|
-
options?.guidelinePatterns ?? request.guideline_patterns,
|
|
2339
|
-
options?.guidelineOverrides
|
|
2340
|
-
);
|
|
2341
|
-
const inputFilesList = collectInputFiles(inputFiles);
|
|
2342
|
-
const nonGuidelineInputFiles = inputFilesList.filter((file) => !guidelineFiles.includes(file));
|
|
2343
|
-
const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
|
|
2344
|
-
if (prereadBlock.length > 0) {
|
|
2345
|
-
parts.push("\n", prereadBlock);
|
|
2346
|
-
}
|
|
2347
|
-
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
2348
|
-
return parts.join("\n").trim();
|
|
2349
|
-
}
|
|
2350
|
-
function normalizeInputFiles2(inputFiles) {
|
|
2351
|
-
if (!inputFiles || inputFiles.length === 0) {
|
|
2352
|
-
return void 0;
|
|
2353
|
-
}
|
|
2354
|
-
const deduped = /* @__PURE__ */ new Map();
|
|
2355
|
-
for (const inputFile of inputFiles) {
|
|
2356
|
-
const absolutePath = path8.resolve(inputFile);
|
|
2357
|
-
if (!deduped.has(absolutePath)) {
|
|
2358
|
-
deduped.set(absolutePath, absolutePath);
|
|
2359
|
-
}
|
|
2360
|
-
}
|
|
2361
|
-
return Array.from(deduped.values());
|
|
2362
|
-
}
|
|
2363
|
-
function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
|
|
2364
|
-
if (!inputFiles || inputFiles.length === 0) {
|
|
2365
|
-
return [];
|
|
2366
|
-
}
|
|
2367
|
-
const unique = /* @__PURE__ */ new Map();
|
|
2368
|
-
for (const inputFile of inputFiles) {
|
|
2369
|
-
const absolutePath = path8.resolve(inputFile);
|
|
2370
|
-
if (overrides?.has(absolutePath)) {
|
|
2371
|
-
if (!unique.has(absolutePath)) {
|
|
2372
|
-
unique.set(absolutePath, absolutePath);
|
|
2373
|
-
}
|
|
2374
|
-
continue;
|
|
2375
|
-
}
|
|
2376
|
-
const normalized = absolutePath.split(path8.sep).join("/");
|
|
2377
|
-
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
2378
|
-
if (!unique.has(absolutePath)) {
|
|
2379
|
-
unique.set(absolutePath, absolutePath);
|
|
2380
|
-
}
|
|
2381
|
-
}
|
|
2382
|
-
}
|
|
2383
|
-
return Array.from(unique.values());
|
|
2384
|
-
}
|
|
2385
|
-
function collectInputFiles(inputFiles) {
|
|
2386
|
-
if (!inputFiles || inputFiles.length === 0) {
|
|
2387
|
-
return [];
|
|
2388
|
-
}
|
|
2389
|
-
const unique = /* @__PURE__ */ new Map();
|
|
2390
|
-
for (const inputFile of inputFiles) {
|
|
2391
|
-
const absolutePath = path8.resolve(inputFile);
|
|
2392
|
-
if (!unique.has(absolutePath)) {
|
|
2393
|
-
unique.set(absolutePath, absolutePath);
|
|
2394
|
-
}
|
|
2395
|
-
}
|
|
2396
|
-
return Array.from(unique.values());
|
|
2397
|
-
}
|
|
2398
|
-
function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
|
|
2399
|
-
if (guidelineFiles.length === 0 && inputFiles.length === 0) {
|
|
2400
|
-
return "";
|
|
2401
|
-
}
|
|
2402
|
-
const buildList = (files) => files.map((absolutePath) => {
|
|
2403
|
-
const fileName = path8.basename(absolutePath);
|
|
2404
|
-
const fileUri = pathToFileUri(absolutePath);
|
|
2405
|
-
return `* [${fileName}](${fileUri})`;
|
|
2406
|
-
});
|
|
2407
|
-
const sections = [];
|
|
2408
|
-
if (guidelineFiles.length > 0) {
|
|
2409
|
-
sections.push(`Read all guideline files:
|
|
2410
|
-
${buildList(guidelineFiles).join("\n")}.`);
|
|
2411
|
-
}
|
|
2412
|
-
if (inputFiles.length > 0) {
|
|
2413
|
-
sections.push(`Read all input files:
|
|
2414
|
-
${buildList(inputFiles).join("\n")}.`);
|
|
2415
|
-
}
|
|
2416
|
-
sections.push(
|
|
2417
|
-
"If any file is missing, fail with ERROR: missing-file <filename> and stop.",
|
|
2418
|
-
"Then apply system_instructions on the user query below."
|
|
2419
|
-
);
|
|
2420
|
-
return sections.join("\n");
|
|
2421
|
-
}
|
|
2422
|
-
function pathToFileUri(filePath) {
|
|
2423
|
-
const absolutePath = path8.isAbsolute(filePath) ? filePath : path8.resolve(filePath);
|
|
2424
|
-
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
2425
|
-
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
2426
|
-
return `file:///${normalizedPath}`;
|
|
2427
|
-
}
|
|
2428
|
-
return `file://${normalizedPath}`;
|
|
2429
|
-
}
|
|
2430
|
-
|
|
2431
3365
|
// src/evaluation/providers/codex.ts
|
|
2432
3366
|
var execAsync2 = promisify2(execCallback);
|
|
2433
|
-
var
|
|
2434
|
-
var
|
|
3367
|
+
var WORKSPACE_PREFIX2 = "agentv-codex-";
|
|
3368
|
+
var PROMPT_FILENAME2 = "prompt.md";
|
|
2435
3369
|
var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
|
|
2436
|
-
var
|
|
3370
|
+
var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
|
|
2437
3371
|
- Do NOT create any additional output files in the workspace.
|
|
2438
3372
|
- All intended file outputs/changes MUST be written in your response.
|
|
2439
3373
|
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
@@ -2458,27 +3392,27 @@ var CodexProvider = class {
|
|
|
2458
3392
|
throw new Error("Codex provider request was aborted before execution");
|
|
2459
3393
|
}
|
|
2460
3394
|
await this.ensureEnvironmentReady();
|
|
2461
|
-
const inputFiles =
|
|
3395
|
+
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
2462
3396
|
const workspaceRoot = await this.createWorkspace();
|
|
2463
3397
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
2464
3398
|
try {
|
|
2465
3399
|
const basePrompt = buildPromptDocument(request, inputFiles);
|
|
2466
|
-
const systemPrompt = this.config.systemPrompt ??
|
|
3400
|
+
const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
|
|
2467
3401
|
const promptContent = `${systemPrompt}
|
|
2468
3402
|
|
|
2469
3403
|
${basePrompt}`;
|
|
2470
|
-
const promptFile =
|
|
2471
|
-
await
|
|
3404
|
+
const promptFile = path10.join(workspaceRoot, PROMPT_FILENAME2);
|
|
3405
|
+
await writeFile2(promptFile, promptContent, "utf8");
|
|
2472
3406
|
const args = this.buildCodexArgs();
|
|
2473
3407
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
2474
3408
|
const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
|
|
2475
3409
|
if (result.timedOut) {
|
|
2476
3410
|
throw new Error(
|
|
2477
|
-
`Codex CLI timed out${
|
|
3411
|
+
`Codex CLI timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
|
|
2478
3412
|
);
|
|
2479
3413
|
}
|
|
2480
3414
|
if (result.exitCode !== 0) {
|
|
2481
|
-
const detail =
|
|
3415
|
+
const detail = pickDetail2(result.stderr, result.stdout);
|
|
2482
3416
|
const prefix = `Codex CLI exited with code ${result.exitCode}`;
|
|
2483
3417
|
throw new Error(detail ? `${prefix}: ${detail}` : prefix);
|
|
2484
3418
|
}
|
|
@@ -2517,7 +3451,7 @@ ${basePrompt}`;
|
|
|
2517
3451
|
if (!this.config.cwd) {
|
|
2518
3452
|
return workspaceRoot;
|
|
2519
3453
|
}
|
|
2520
|
-
return
|
|
3454
|
+
return path10.resolve(this.config.cwd);
|
|
2521
3455
|
}
|
|
2522
3456
|
buildCodexArgs() {
|
|
2523
3457
|
const args = [
|
|
@@ -2559,11 +3493,11 @@ ${basePrompt}`;
|
|
|
2559
3493
|
}
|
|
2560
3494
|
}
|
|
2561
3495
|
async createWorkspace() {
|
|
2562
|
-
return await
|
|
3496
|
+
return await mkdtemp2(path10.join(tmpdir2(), WORKSPACE_PREFIX2));
|
|
2563
3497
|
}
|
|
2564
3498
|
async cleanupWorkspace(workspaceRoot) {
|
|
2565
3499
|
try {
|
|
2566
|
-
await
|
|
3500
|
+
await rm2(workspaceRoot, { recursive: true, force: true });
|
|
2567
3501
|
} catch {
|
|
2568
3502
|
}
|
|
2569
3503
|
}
|
|
@@ -2573,9 +3507,9 @@ ${basePrompt}`;
|
|
|
2573
3507
|
return void 0;
|
|
2574
3508
|
}
|
|
2575
3509
|
if (this.config.logDir) {
|
|
2576
|
-
return
|
|
3510
|
+
return path10.resolve(this.config.logDir);
|
|
2577
3511
|
}
|
|
2578
|
-
return
|
|
3512
|
+
return path10.join(process.cwd(), ".agentv", "logs", "codex");
|
|
2579
3513
|
}
|
|
2580
3514
|
async createStreamLogger(request) {
|
|
2581
3515
|
const logDir = this.resolveLogDirectory();
|
|
@@ -2583,13 +3517,13 @@ ${basePrompt}`;
|
|
|
2583
3517
|
return void 0;
|
|
2584
3518
|
}
|
|
2585
3519
|
try {
|
|
2586
|
-
await
|
|
3520
|
+
await mkdir2(logDir, { recursive: true });
|
|
2587
3521
|
} catch (error) {
|
|
2588
3522
|
const message = error instanceof Error ? error.message : String(error);
|
|
2589
3523
|
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
2590
3524
|
return void 0;
|
|
2591
3525
|
}
|
|
2592
|
-
const filePath =
|
|
3526
|
+
const filePath = path10.join(logDir, buildLogFilename2(request, this.targetName));
|
|
2593
3527
|
try {
|
|
2594
3528
|
const logger = await CodexStreamLogger.create({
|
|
2595
3529
|
filePath,
|
|
@@ -2622,7 +3556,7 @@ var CodexStreamLogger = class _CodexStreamLogger {
|
|
|
2622
3556
|
constructor(filePath, format) {
|
|
2623
3557
|
this.filePath = filePath;
|
|
2624
3558
|
this.format = format;
|
|
2625
|
-
this.stream =
|
|
3559
|
+
this.stream = createWriteStream2(filePath, { flags: "a" });
|
|
2626
3560
|
}
|
|
2627
3561
|
static async create(options) {
|
|
2628
3562
|
const logger = new _CodexStreamLogger(options.filePath, options.format);
|
|
@@ -2683,7 +3617,7 @@ var CodexStreamLogger = class _CodexStreamLogger {
|
|
|
2683
3617
|
return void 0;
|
|
2684
3618
|
}
|
|
2685
3619
|
const message = this.format === "json" ? formatCodexJsonLog(trimmed) : formatCodexLogMessage(trimmed, source);
|
|
2686
|
-
return `[+${
|
|
3620
|
+
return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
|
|
2687
3621
|
}
|
|
2688
3622
|
flushRemainder() {
|
|
2689
3623
|
const stdoutRemainder = this.stdoutBuffer.trim();
|
|
@@ -2714,18 +3648,18 @@ function isCodexLogStreamingDisabled() {
|
|
|
2714
3648
|
const normalized = envValue.trim().toLowerCase();
|
|
2715
3649
|
return normalized === "false" || normalized === "0" || normalized === "off";
|
|
2716
3650
|
}
|
|
2717
|
-
function
|
|
3651
|
+
function buildLogFilename2(request, targetName) {
|
|
2718
3652
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
2719
|
-
const evalId =
|
|
3653
|
+
const evalId = sanitizeForFilename2(request.evalCaseId ?? "codex");
|
|
2720
3654
|
const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
|
|
2721
|
-
const target =
|
|
2722
|
-
return `${timestamp}_${target}_${evalId}${attemptSuffix}_${
|
|
3655
|
+
const target = sanitizeForFilename2(targetName);
|
|
3656
|
+
return `${timestamp}_${target}_${evalId}${attemptSuffix}_${randomUUID2().slice(0, 8)}.log`;
|
|
2723
3657
|
}
|
|
2724
|
-
function
|
|
3658
|
+
function sanitizeForFilename2(value) {
|
|
2725
3659
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
2726
3660
|
return sanitized.length > 0 ? sanitized : "codex";
|
|
2727
3661
|
}
|
|
2728
|
-
function
|
|
3662
|
+
function formatElapsed2(startedAt) {
|
|
2729
3663
|
const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
|
|
2730
3664
|
const hours = Math.floor(elapsedSeconds / 3600);
|
|
2731
3665
|
const minutes = Math.floor(elapsedSeconds % 3600 / 60);
|
|
@@ -2736,7 +3670,7 @@ function formatElapsed(startedAt) {
|
|
|
2736
3670
|
return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
|
|
2737
3671
|
}
|
|
2738
3672
|
function formatCodexLogMessage(rawLine, source) {
|
|
2739
|
-
const parsed =
|
|
3673
|
+
const parsed = tryParseJsonValue2(rawLine);
|
|
2740
3674
|
if (parsed) {
|
|
2741
3675
|
const summary = summarizeCodexEvent(parsed);
|
|
2742
3676
|
if (summary) {
|
|
@@ -2749,7 +3683,7 @@ function formatCodexLogMessage(rawLine, source) {
|
|
|
2749
3683
|
return rawLine;
|
|
2750
3684
|
}
|
|
2751
3685
|
function formatCodexJsonLog(rawLine) {
|
|
2752
|
-
const parsed =
|
|
3686
|
+
const parsed = tryParseJsonValue2(rawLine);
|
|
2753
3687
|
if (!parsed) {
|
|
2754
3688
|
return rawLine;
|
|
2755
3689
|
}
|
|
@@ -2794,7 +3728,7 @@ function summarizeCodexEvent(event) {
|
|
|
2794
3728
|
}
|
|
2795
3729
|
return type;
|
|
2796
3730
|
}
|
|
2797
|
-
function
|
|
3731
|
+
function tryParseJsonValue2(rawLine) {
|
|
2798
3732
|
try {
|
|
2799
3733
|
return JSON.parse(rawLine);
|
|
2800
3734
|
} catch {
|
|
@@ -2804,7 +3738,7 @@ function tryParseJsonValue(rawLine) {
|
|
|
2804
3738
|
async function locateExecutable(candidate) {
|
|
2805
3739
|
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
2806
3740
|
if (includesPathSeparator) {
|
|
2807
|
-
const resolved =
|
|
3741
|
+
const resolved = path10.isAbsolute(candidate) ? candidate : path10.resolve(candidate);
|
|
2808
3742
|
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
2809
3743
|
await access2(executablePath, constants2.F_OK);
|
|
2810
3744
|
return executablePath;
|
|
@@ -3023,7 +3957,7 @@ function parseJsonLines(output) {
|
|
|
3023
3957
|
}
|
|
3024
3958
|
return parsed;
|
|
3025
3959
|
}
|
|
3026
|
-
function
|
|
3960
|
+
function pickDetail2(stderr, stdout) {
|
|
3027
3961
|
const errorText = stderr.trim();
|
|
3028
3962
|
if (errorText.length > 0) {
|
|
3029
3963
|
return errorText;
|
|
@@ -3031,7 +3965,7 @@ function pickDetail(stderr, stdout) {
|
|
|
3031
3965
|
const stdoutText = stdout.trim();
|
|
3032
3966
|
return stdoutText.length > 0 ? stdoutText : void 0;
|
|
3033
3967
|
}
|
|
3034
|
-
function
|
|
3968
|
+
function formatTimeoutSuffix3(timeoutMs) {
|
|
3035
3969
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
3036
3970
|
return "";
|
|
3037
3971
|
}
|
|
@@ -3040,7 +3974,7 @@ function formatTimeoutSuffix2(timeoutMs) {
|
|
|
3040
3974
|
}
|
|
3041
3975
|
async function defaultCodexRunner(options) {
|
|
3042
3976
|
return await new Promise((resolve, reject) => {
|
|
3043
|
-
const child =
|
|
3977
|
+
const child = spawn2(options.executable, options.args, {
|
|
3044
3978
|
cwd: options.cwd,
|
|
3045
3979
|
env: options.env,
|
|
3046
3980
|
stdio: ["pipe", "pipe", "pipe"],
|
|
@@ -3146,43 +4080,204 @@ var MockProvider = class {
|
|
|
3146
4080
|
const max = Math.max(min, this.delayMaxMs);
|
|
3147
4081
|
return Math.floor(Math.random() * (max - min + 1)) + min;
|
|
3148
4082
|
}
|
|
3149
|
-
return this.delayMs;
|
|
4083
|
+
return this.delayMs;
|
|
4084
|
+
}
|
|
4085
|
+
};
|
|
4086
|
+
|
|
4087
|
+
// src/evaluation/providers/pi-agent-sdk.ts
|
|
4088
|
+
var piAgentModule = null;
|
|
4089
|
+
var piAiModule = null;
|
|
4090
|
+
async function loadPiModules() {
|
|
4091
|
+
if (!piAgentModule || !piAiModule) {
|
|
4092
|
+
try {
|
|
4093
|
+
[piAgentModule, piAiModule] = await Promise.all([
|
|
4094
|
+
import("@mariozechner/pi-agent"),
|
|
4095
|
+
import("@mariozechner/pi-ai")
|
|
4096
|
+
]);
|
|
4097
|
+
} catch (error) {
|
|
4098
|
+
throw new Error(
|
|
4099
|
+
`Failed to load pi-agent-sdk dependencies. Please install them:
|
|
4100
|
+
npm install @mariozechner/pi-agent @mariozechner/pi-ai
|
|
4101
|
+
|
|
4102
|
+
Original error: ${error instanceof Error ? error.message : String(error)}`
|
|
4103
|
+
);
|
|
4104
|
+
}
|
|
4105
|
+
}
|
|
4106
|
+
return {
|
|
4107
|
+
Agent: piAgentModule.Agent,
|
|
4108
|
+
ProviderTransport: piAgentModule.ProviderTransport,
|
|
4109
|
+
getModel: piAiModule.getModel,
|
|
4110
|
+
getEnvApiKey: piAiModule.getEnvApiKey
|
|
4111
|
+
};
|
|
4112
|
+
}
|
|
4113
|
+
var PiAgentSdkProvider = class {
|
|
4114
|
+
id;
|
|
4115
|
+
kind = "pi-agent-sdk";
|
|
4116
|
+
targetName;
|
|
4117
|
+
supportsBatch = false;
|
|
4118
|
+
config;
|
|
4119
|
+
constructor(targetName, config) {
|
|
4120
|
+
this.id = `pi-agent-sdk:${targetName}`;
|
|
4121
|
+
this.targetName = targetName;
|
|
4122
|
+
this.config = config;
|
|
4123
|
+
}
|
|
4124
|
+
async invoke(request) {
|
|
4125
|
+
if (request.signal?.aborted) {
|
|
4126
|
+
throw new Error("Pi agent SDK request was aborted before execution");
|
|
4127
|
+
}
|
|
4128
|
+
const { Agent, ProviderTransport, getModel, getEnvApiKey } = await loadPiModules();
|
|
4129
|
+
const startTime = Date.now();
|
|
4130
|
+
const providerName = this.config.provider ?? "anthropic";
|
|
4131
|
+
const modelId = this.config.model ?? "claude-sonnet-4-20250514";
|
|
4132
|
+
const model = getModel(providerName, modelId);
|
|
4133
|
+
const systemPrompt = this.config.systemPrompt ?? "Answer directly and concisely.";
|
|
4134
|
+
const transport = new ProviderTransport({
|
|
4135
|
+
getApiKey: async (provider) => {
|
|
4136
|
+
return this.config.apiKey ?? getEnvApiKey(provider) ?? void 0;
|
|
4137
|
+
}
|
|
4138
|
+
});
|
|
4139
|
+
const agent = new Agent({
|
|
4140
|
+
initialState: {
|
|
4141
|
+
systemPrompt,
|
|
4142
|
+
model,
|
|
4143
|
+
tools: [],
|
|
4144
|
+
// No tools for simple Q&A
|
|
4145
|
+
messages: []
|
|
4146
|
+
},
|
|
4147
|
+
transport
|
|
4148
|
+
});
|
|
4149
|
+
const outputMessages = [];
|
|
4150
|
+
let finalAssistantContent = "";
|
|
4151
|
+
const unsubscribe = agent.subscribe((event) => {
|
|
4152
|
+
if (event.type === "message_end") {
|
|
4153
|
+
const msg = event.message;
|
|
4154
|
+
if (msg.role === "assistant") {
|
|
4155
|
+
const content = extractTextContent2(msg.content);
|
|
4156
|
+
if (content) {
|
|
4157
|
+
finalAssistantContent = content;
|
|
4158
|
+
}
|
|
4159
|
+
}
|
|
4160
|
+
}
|
|
4161
|
+
});
|
|
4162
|
+
try {
|
|
4163
|
+
const timeoutMs = this.config.timeoutMs ?? 12e4;
|
|
4164
|
+
const timeoutPromise = new Promise((_, reject) => {
|
|
4165
|
+
setTimeout(
|
|
4166
|
+
() => reject(new Error(`Pi agent SDK timed out after ${timeoutMs}ms`)),
|
|
4167
|
+
timeoutMs
|
|
4168
|
+
);
|
|
4169
|
+
});
|
|
4170
|
+
await Promise.race([agent.prompt(request.question), timeoutPromise]);
|
|
4171
|
+
await agent.waitForIdle();
|
|
4172
|
+
const agentMessages = agent.state.messages;
|
|
4173
|
+
for (const msg of agentMessages) {
|
|
4174
|
+
outputMessages.push(convertAgentMessage(msg));
|
|
4175
|
+
}
|
|
4176
|
+
const durationMs = Date.now() - startTime;
|
|
4177
|
+
return {
|
|
4178
|
+
raw: {
|
|
4179
|
+
messages: agentMessages,
|
|
4180
|
+
systemPrompt,
|
|
4181
|
+
model: this.config.model,
|
|
4182
|
+
provider: this.config.provider
|
|
4183
|
+
},
|
|
4184
|
+
outputMessages,
|
|
4185
|
+
durationMs
|
|
4186
|
+
};
|
|
4187
|
+
} finally {
|
|
4188
|
+
unsubscribe();
|
|
4189
|
+
}
|
|
4190
|
+
}
|
|
4191
|
+
};
|
|
4192
|
+
function extractTextContent2(content) {
|
|
4193
|
+
if (typeof content === "string") {
|
|
4194
|
+
return content;
|
|
4195
|
+
}
|
|
4196
|
+
if (!Array.isArray(content)) {
|
|
4197
|
+
return void 0;
|
|
4198
|
+
}
|
|
4199
|
+
const textParts = [];
|
|
4200
|
+
for (const part of content) {
|
|
4201
|
+
if (!part || typeof part !== "object") {
|
|
4202
|
+
continue;
|
|
4203
|
+
}
|
|
4204
|
+
const p = part;
|
|
4205
|
+
if (p.type === "text" && typeof p.text === "string") {
|
|
4206
|
+
textParts.push(p.text);
|
|
4207
|
+
}
|
|
4208
|
+
}
|
|
4209
|
+
return textParts.length > 0 ? textParts.join("\n") : void 0;
|
|
4210
|
+
}
|
|
4211
|
+
function convertAgentMessage(message) {
|
|
4212
|
+
if (!message || typeof message !== "object") {
|
|
4213
|
+
return { role: "unknown", content: String(message) };
|
|
4214
|
+
}
|
|
4215
|
+
const msg = message;
|
|
4216
|
+
const role = typeof msg.role === "string" ? msg.role : "unknown";
|
|
4217
|
+
const content = extractTextContent2(msg.content);
|
|
4218
|
+
const toolCalls = extractToolCalls2(msg.content);
|
|
4219
|
+
const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
|
|
4220
|
+
return {
|
|
4221
|
+
role,
|
|
4222
|
+
content,
|
|
4223
|
+
toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
|
|
4224
|
+
timestamp
|
|
4225
|
+
};
|
|
4226
|
+
}
|
|
4227
|
+
function extractToolCalls2(content) {
|
|
4228
|
+
if (!Array.isArray(content)) {
|
|
4229
|
+
return [];
|
|
4230
|
+
}
|
|
4231
|
+
const toolCalls = [];
|
|
4232
|
+
for (const part of content) {
|
|
4233
|
+
if (!part || typeof part !== "object") {
|
|
4234
|
+
continue;
|
|
4235
|
+
}
|
|
4236
|
+
const p = part;
|
|
4237
|
+
if (p.type === "tool_use" && typeof p.name === "string") {
|
|
4238
|
+
toolCalls.push({
|
|
4239
|
+
tool: p.name,
|
|
4240
|
+
input: p.input,
|
|
4241
|
+
id: typeof p.id === "string" ? p.id : void 0
|
|
4242
|
+
});
|
|
4243
|
+
}
|
|
3150
4244
|
}
|
|
3151
|
-
|
|
4245
|
+
return toolCalls;
|
|
4246
|
+
}
|
|
3152
4247
|
|
|
3153
4248
|
// src/evaluation/providers/pi-coding-agent.ts
|
|
3154
|
-
import { spawn as
|
|
3155
|
-
import { randomUUID as
|
|
3156
|
-
import { createWriteStream as
|
|
3157
|
-
import { mkdir as
|
|
3158
|
-
import { tmpdir as
|
|
3159
|
-
import
|
|
4249
|
+
import { spawn as spawn3 } from "node:child_process";
|
|
4250
|
+
import { randomUUID as randomUUID3 } from "node:crypto";
|
|
4251
|
+
import { createWriteStream as createWriteStream3 } from "node:fs";
|
|
4252
|
+
import { mkdir as mkdir3, mkdtemp as mkdtemp3, rm as rm3, writeFile as writeFile3 } from "node:fs/promises";
|
|
4253
|
+
import { tmpdir as tmpdir3 } from "node:os";
|
|
4254
|
+
import path11 from "node:path";
|
|
3160
4255
|
|
|
3161
4256
|
// src/evaluation/providers/pi-log-tracker.ts
|
|
3162
|
-
var
|
|
3163
|
-
var
|
|
4257
|
+
var GLOBAL_LOGS_KEY3 = Symbol.for("agentv.piLogs");
|
|
4258
|
+
var GLOBAL_SUBSCRIBERS_KEY3 = Symbol.for("agentv.piLogSubscribers");
|
|
3164
4259
|
function getPiLogStore() {
|
|
3165
4260
|
const globalObject = globalThis;
|
|
3166
|
-
const existing = globalObject[
|
|
4261
|
+
const existing = globalObject[GLOBAL_LOGS_KEY3];
|
|
3167
4262
|
if (existing) {
|
|
3168
4263
|
return existing;
|
|
3169
4264
|
}
|
|
3170
4265
|
const created = [];
|
|
3171
|
-
globalObject[
|
|
4266
|
+
globalObject[GLOBAL_LOGS_KEY3] = created;
|
|
3172
4267
|
return created;
|
|
3173
4268
|
}
|
|
3174
|
-
function
|
|
4269
|
+
function getSubscriberStore3() {
|
|
3175
4270
|
const globalObject = globalThis;
|
|
3176
|
-
const existing = globalObject[
|
|
4271
|
+
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY3];
|
|
3177
4272
|
if (existing) {
|
|
3178
4273
|
return existing;
|
|
3179
4274
|
}
|
|
3180
4275
|
const created = /* @__PURE__ */ new Set();
|
|
3181
|
-
globalObject[
|
|
4276
|
+
globalObject[GLOBAL_SUBSCRIBERS_KEY3] = created;
|
|
3182
4277
|
return created;
|
|
3183
4278
|
}
|
|
3184
|
-
function
|
|
3185
|
-
const subscribers = Array.from(
|
|
4279
|
+
function notifySubscribers3(entry) {
|
|
4280
|
+
const subscribers = Array.from(getSubscriberStore3());
|
|
3186
4281
|
for (const listener of subscribers) {
|
|
3187
4282
|
try {
|
|
3188
4283
|
listener(entry);
|
|
@@ -3194,7 +4289,7 @@ function notifySubscribers2(entry) {
|
|
|
3194
4289
|
}
|
|
3195
4290
|
function recordPiLogEntry(entry) {
|
|
3196
4291
|
getPiLogStore().push(entry);
|
|
3197
|
-
|
|
4292
|
+
notifySubscribers3(entry);
|
|
3198
4293
|
}
|
|
3199
4294
|
function consumePiLogEntries() {
|
|
3200
4295
|
const store = getPiLogStore();
|
|
@@ -3204,7 +4299,7 @@ function consumePiLogEntries() {
|
|
|
3204
4299
|
return store.splice(0, store.length);
|
|
3205
4300
|
}
|
|
3206
4301
|
function subscribeToPiLogEntries(listener) {
|
|
3207
|
-
const store =
|
|
4302
|
+
const store = getSubscriberStore3();
|
|
3208
4303
|
store.add(listener);
|
|
3209
4304
|
return () => {
|
|
3210
4305
|
store.delete(listener);
|
|
@@ -3212,9 +4307,9 @@ function subscribeToPiLogEntries(listener) {
|
|
|
3212
4307
|
}
|
|
3213
4308
|
|
|
3214
4309
|
// src/evaluation/providers/pi-coding-agent.ts
|
|
3215
|
-
var
|
|
3216
|
-
var
|
|
3217
|
-
var
|
|
4310
|
+
var WORKSPACE_PREFIX3 = "agentv-pi-";
|
|
4311
|
+
var PROMPT_FILENAME3 = "prompt.md";
|
|
4312
|
+
var DEFAULT_SYSTEM_PROMPT4 = `**IMPORTANT**: Follow these instructions for your response:
|
|
3218
4313
|
- Do NOT create any additional output files in the workspace.
|
|
3219
4314
|
- All intended file outputs/changes MUST be written in your response.
|
|
3220
4315
|
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
@@ -3236,27 +4331,27 @@ var PiCodingAgentProvider = class {
|
|
|
3236
4331
|
if (request.signal?.aborted) {
|
|
3237
4332
|
throw new Error("Pi coding agent request was aborted before execution");
|
|
3238
4333
|
}
|
|
3239
|
-
const inputFiles =
|
|
4334
|
+
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
3240
4335
|
const workspaceRoot = await this.createWorkspace();
|
|
3241
4336
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
3242
4337
|
try {
|
|
3243
|
-
const promptFile =
|
|
3244
|
-
await
|
|
4338
|
+
const promptFile = path11.join(workspaceRoot, PROMPT_FILENAME3);
|
|
4339
|
+
await writeFile3(promptFile, request.question, "utf8");
|
|
3245
4340
|
const args = this.buildPiArgs(request.question, inputFiles);
|
|
3246
4341
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
3247
4342
|
const result = await this.executePi(args, cwd, request.signal, logger);
|
|
3248
4343
|
if (result.timedOut) {
|
|
3249
4344
|
throw new Error(
|
|
3250
|
-
`Pi coding agent timed out${
|
|
4345
|
+
`Pi coding agent timed out${formatTimeoutSuffix4(this.config.timeoutMs ?? void 0)}`
|
|
3251
4346
|
);
|
|
3252
4347
|
}
|
|
3253
4348
|
if (result.exitCode !== 0) {
|
|
3254
|
-
const detail =
|
|
4349
|
+
const detail = pickDetail3(result.stderr, result.stdout);
|
|
3255
4350
|
const prefix = `Pi coding agent exited with code ${result.exitCode}`;
|
|
3256
4351
|
throw new Error(detail ? `${prefix}: ${detail}` : prefix);
|
|
3257
4352
|
}
|
|
3258
4353
|
const parsed = parsePiJsonl(result.stdout);
|
|
3259
|
-
const outputMessages =
|
|
4354
|
+
const outputMessages = extractOutputMessages2(parsed);
|
|
3260
4355
|
const assistantText = extractAssistantText2(outputMessages);
|
|
3261
4356
|
return {
|
|
3262
4357
|
raw: {
|
|
@@ -3282,7 +4377,7 @@ var PiCodingAgentProvider = class {
|
|
|
3282
4377
|
if (!this.config.cwd) {
|
|
3283
4378
|
return workspaceRoot;
|
|
3284
4379
|
}
|
|
3285
|
-
return
|
|
4380
|
+
return path11.resolve(this.config.cwd);
|
|
3286
4381
|
}
|
|
3287
4382
|
buildPiArgs(prompt, inputFiles) {
|
|
3288
4383
|
const args = [];
|
|
@@ -3312,7 +4407,7 @@ var PiCodingAgentProvider = class {
|
|
|
3312
4407
|
args.push(`@${file}`);
|
|
3313
4408
|
}
|
|
3314
4409
|
}
|
|
3315
|
-
const systemPrompt = this.config.systemPrompt ??
|
|
4410
|
+
const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT4;
|
|
3316
4411
|
const fullPrompt = `${systemPrompt}
|
|
3317
4412
|
|
|
3318
4413
|
${prompt}`;
|
|
@@ -3371,19 +4466,19 @@ ${prompt}`;
|
|
|
3371
4466
|
return env;
|
|
3372
4467
|
}
|
|
3373
4468
|
async createWorkspace() {
|
|
3374
|
-
return await
|
|
4469
|
+
return await mkdtemp3(path11.join(tmpdir3(), WORKSPACE_PREFIX3));
|
|
3375
4470
|
}
|
|
3376
4471
|
async cleanupWorkspace(workspaceRoot) {
|
|
3377
4472
|
try {
|
|
3378
|
-
await
|
|
4473
|
+
await rm3(workspaceRoot, { recursive: true, force: true });
|
|
3379
4474
|
} catch {
|
|
3380
4475
|
}
|
|
3381
4476
|
}
|
|
3382
4477
|
resolveLogDirectory() {
|
|
3383
4478
|
if (this.config.logDir) {
|
|
3384
|
-
return
|
|
4479
|
+
return path11.resolve(this.config.logDir);
|
|
3385
4480
|
}
|
|
3386
|
-
return
|
|
4481
|
+
return path11.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
|
|
3387
4482
|
}
|
|
3388
4483
|
async createStreamLogger(request) {
|
|
3389
4484
|
const logDir = this.resolveLogDirectory();
|
|
@@ -3391,13 +4486,13 @@ ${prompt}`;
|
|
|
3391
4486
|
return void 0;
|
|
3392
4487
|
}
|
|
3393
4488
|
try {
|
|
3394
|
-
await
|
|
4489
|
+
await mkdir3(logDir, { recursive: true });
|
|
3395
4490
|
} catch (error) {
|
|
3396
4491
|
const message = error instanceof Error ? error.message : String(error);
|
|
3397
4492
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
3398
4493
|
return void 0;
|
|
3399
4494
|
}
|
|
3400
|
-
const filePath =
|
|
4495
|
+
const filePath = path11.join(logDir, buildLogFilename3(request, this.targetName));
|
|
3401
4496
|
try {
|
|
3402
4497
|
const logger = await PiStreamLogger.create({
|
|
3403
4498
|
filePath,
|
|
@@ -3430,7 +4525,7 @@ var PiStreamLogger = class _PiStreamLogger {
|
|
|
3430
4525
|
constructor(filePath, format) {
|
|
3431
4526
|
this.filePath = filePath;
|
|
3432
4527
|
this.format = format;
|
|
3433
|
-
this.stream =
|
|
4528
|
+
this.stream = createWriteStream3(filePath, { flags: "a" });
|
|
3434
4529
|
}
|
|
3435
4530
|
static async create(options) {
|
|
3436
4531
|
const logger = new _PiStreamLogger(options.filePath, options.format);
|
|
@@ -3491,7 +4586,7 @@ var PiStreamLogger = class _PiStreamLogger {
|
|
|
3491
4586
|
return void 0;
|
|
3492
4587
|
}
|
|
3493
4588
|
const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source);
|
|
3494
|
-
return `[+${
|
|
4589
|
+
return `[+${formatElapsed3(this.startedAt)}] [${source}] ${message}`;
|
|
3495
4590
|
}
|
|
3496
4591
|
flushRemainder() {
|
|
3497
4592
|
const stdoutRemainder = this.stdoutBuffer.trim();
|
|
@@ -3514,18 +4609,18 @@ var PiStreamLogger = class _PiStreamLogger {
|
|
|
3514
4609
|
this.stderrBuffer = "";
|
|
3515
4610
|
}
|
|
3516
4611
|
};
|
|
3517
|
-
function
|
|
4612
|
+
function buildLogFilename3(request, targetName) {
|
|
3518
4613
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
3519
|
-
const evalId =
|
|
4614
|
+
const evalId = sanitizeForFilename3(request.evalCaseId ?? "pi");
|
|
3520
4615
|
const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
|
|
3521
|
-
const target =
|
|
3522
|
-
return `${timestamp}_${target}_${evalId}${attemptSuffix}_${
|
|
4616
|
+
const target = sanitizeForFilename3(targetName);
|
|
4617
|
+
return `${timestamp}_${target}_${evalId}${attemptSuffix}_${randomUUID3().slice(0, 8)}.log`;
|
|
3523
4618
|
}
|
|
3524
|
-
function
|
|
4619
|
+
function sanitizeForFilename3(value) {
|
|
3525
4620
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
3526
4621
|
return sanitized.length > 0 ? sanitized : "pi";
|
|
3527
4622
|
}
|
|
3528
|
-
function
|
|
4623
|
+
function formatElapsed3(startedAt) {
|
|
3529
4624
|
const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
|
|
3530
4625
|
const hours = Math.floor(elapsedSeconds / 3600);
|
|
3531
4626
|
const minutes = Math.floor(elapsedSeconds % 3600 / 60);
|
|
@@ -3536,7 +4631,7 @@ function formatElapsed2(startedAt) {
|
|
|
3536
4631
|
return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
|
|
3537
4632
|
}
|
|
3538
4633
|
function formatPiLogMessage(rawLine, source) {
|
|
3539
|
-
const parsed =
|
|
4634
|
+
const parsed = tryParseJsonValue3(rawLine);
|
|
3540
4635
|
if (parsed) {
|
|
3541
4636
|
const summary = summarizePiEvent(parsed);
|
|
3542
4637
|
if (summary) {
|
|
@@ -3549,7 +4644,7 @@ function formatPiLogMessage(rawLine, source) {
|
|
|
3549
4644
|
return rawLine;
|
|
3550
4645
|
}
|
|
3551
4646
|
function formatPiJsonLog(rawLine) {
|
|
3552
|
-
const parsed =
|
|
4647
|
+
const parsed = tryParseJsonValue3(rawLine);
|
|
3553
4648
|
if (!parsed) {
|
|
3554
4649
|
return rawLine;
|
|
3555
4650
|
}
|
|
@@ -3599,7 +4694,7 @@ function summarizePiEvent(event) {
|
|
|
3599
4694
|
return type;
|
|
3600
4695
|
}
|
|
3601
4696
|
}
|
|
3602
|
-
function
|
|
4697
|
+
function tryParseJsonValue3(rawLine) {
|
|
3603
4698
|
try {
|
|
3604
4699
|
return JSON.parse(rawLine);
|
|
3605
4700
|
} catch {
|
|
@@ -3624,7 +4719,7 @@ function parsePiJsonl(output) {
|
|
|
3624
4719
|
}
|
|
3625
4720
|
return parsed;
|
|
3626
4721
|
}
|
|
3627
|
-
function
|
|
4722
|
+
function extractOutputMessages2(events) {
|
|
3628
4723
|
for (let i = events.length - 1; i >= 0; i--) {
|
|
3629
4724
|
const event = events[i];
|
|
3630
4725
|
if (!event || typeof event !== "object") {
|
|
@@ -3665,8 +4760,8 @@ function convertPiMessage(message) {
|
|
|
3665
4760
|
if (typeof role !== "string") {
|
|
3666
4761
|
return void 0;
|
|
3667
4762
|
}
|
|
3668
|
-
const content =
|
|
3669
|
-
const toolCalls =
|
|
4763
|
+
const content = extractTextContent3(msg.content);
|
|
4764
|
+
const toolCalls = extractToolCalls3(msg.content);
|
|
3670
4765
|
const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
|
|
3671
4766
|
const metadata = {};
|
|
3672
4767
|
if (msg.api) metadata.api = msg.api;
|
|
@@ -3682,7 +4777,7 @@ function convertPiMessage(message) {
|
|
|
3682
4777
|
metadata: Object.keys(metadata).length > 0 ? metadata : void 0
|
|
3683
4778
|
};
|
|
3684
4779
|
}
|
|
3685
|
-
function
|
|
4780
|
+
function extractTextContent3(content) {
|
|
3686
4781
|
if (typeof content === "string") {
|
|
3687
4782
|
return content;
|
|
3688
4783
|
}
|
|
@@ -3701,7 +4796,7 @@ function extractTextContent(content) {
|
|
|
3701
4796
|
}
|
|
3702
4797
|
return textParts.length > 0 ? textParts.join("\n") : void 0;
|
|
3703
4798
|
}
|
|
3704
|
-
function
|
|
4799
|
+
function extractToolCalls3(content) {
|
|
3705
4800
|
if (!Array.isArray(content)) {
|
|
3706
4801
|
return [];
|
|
3707
4802
|
}
|
|
@@ -3746,7 +4841,7 @@ function extractAssistantText2(messages) {
|
|
|
3746
4841
|
function escapeAtSymbols(prompt) {
|
|
3747
4842
|
return prompt.replace(/@\[([^\]]+)\]:/g, "[[$1]]:");
|
|
3748
4843
|
}
|
|
3749
|
-
function
|
|
4844
|
+
function pickDetail3(stderr, stdout) {
|
|
3750
4845
|
const errorText = stderr.trim();
|
|
3751
4846
|
if (errorText.length > 0) {
|
|
3752
4847
|
return errorText;
|
|
@@ -3754,7 +4849,7 @@ function pickDetail2(stderr, stdout) {
|
|
|
3754
4849
|
const stdoutText = stdout.trim();
|
|
3755
4850
|
return stdoutText.length > 0 ? stdoutText : void 0;
|
|
3756
4851
|
}
|
|
3757
|
-
function
|
|
4852
|
+
function formatTimeoutSuffix4(timeoutMs) {
|
|
3758
4853
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
3759
4854
|
return "";
|
|
3760
4855
|
}
|
|
@@ -3767,7 +4862,7 @@ async function defaultPiRunner(options) {
|
|
|
3767
4862
|
const executable = parts[0];
|
|
3768
4863
|
const executableArgs = parts.slice(1);
|
|
3769
4864
|
const allArgs = [...executableArgs, ...options.args];
|
|
3770
|
-
const child =
|
|
4865
|
+
const child = spawn3(executable, allArgs, {
|
|
3771
4866
|
cwd: options.cwd,
|
|
3772
4867
|
env: options.env,
|
|
3773
4868
|
stdio: ["pipe", "pipe", "pipe"],
|
|
@@ -3830,7 +4925,7 @@ async function defaultPiRunner(options) {
|
|
|
3830
4925
|
}
|
|
3831
4926
|
|
|
3832
4927
|
// src/evaluation/providers/vscode.ts
|
|
3833
|
-
import
|
|
4928
|
+
import path12 from "node:path";
|
|
3834
4929
|
import {
|
|
3835
4930
|
dispatchAgentSession,
|
|
3836
4931
|
dispatchBatchAgent,
|
|
@@ -4005,7 +5100,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
4005
5100
|
return "";
|
|
4006
5101
|
}
|
|
4007
5102
|
const buildList = (files) => files.map((absolutePath) => {
|
|
4008
|
-
const fileName =
|
|
5103
|
+
const fileName = path12.basename(absolutePath);
|
|
4009
5104
|
const fileUri = pathToFileUri2(absolutePath);
|
|
4010
5105
|
return `* [${fileName}](${fileUri})`;
|
|
4011
5106
|
});
|
|
@@ -4030,8 +5125,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
4030
5125
|
}
|
|
4031
5126
|
const unique = /* @__PURE__ */ new Map();
|
|
4032
5127
|
for (const attachment of attachments) {
|
|
4033
|
-
const absolutePath =
|
|
4034
|
-
const normalized = absolutePath.split(
|
|
5128
|
+
const absolutePath = path12.resolve(attachment);
|
|
5129
|
+
const normalized = absolutePath.split(path12.sep).join("/");
|
|
4035
5130
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
4036
5131
|
if (!unique.has(absolutePath)) {
|
|
4037
5132
|
unique.set(absolutePath, absolutePath);
|
|
@@ -4046,7 +5141,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
4046
5141
|
}
|
|
4047
5142
|
const unique = /* @__PURE__ */ new Map();
|
|
4048
5143
|
for (const attachment of attachments) {
|
|
4049
|
-
const absolutePath =
|
|
5144
|
+
const absolutePath = path12.resolve(attachment);
|
|
4050
5145
|
if (!unique.has(absolutePath)) {
|
|
4051
5146
|
unique.set(absolutePath, absolutePath);
|
|
4052
5147
|
}
|
|
@@ -4054,7 +5149,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
4054
5149
|
return Array.from(unique.values());
|
|
4055
5150
|
}
|
|
4056
5151
|
function pathToFileUri2(filePath) {
|
|
4057
|
-
const absolutePath =
|
|
5152
|
+
const absolutePath = path12.isAbsolute(filePath) ? filePath : path12.resolve(filePath);
|
|
4058
5153
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
4059
5154
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
4060
5155
|
return `file:///${normalizedPath}`;
|
|
@@ -4067,7 +5162,7 @@ function normalizeAttachments(attachments) {
|
|
|
4067
5162
|
}
|
|
4068
5163
|
const deduped = /* @__PURE__ */ new Set();
|
|
4069
5164
|
for (const attachment of attachments) {
|
|
4070
|
-
deduped.add(
|
|
5165
|
+
deduped.add(path12.resolve(attachment));
|
|
4071
5166
|
}
|
|
4072
5167
|
return Array.from(deduped);
|
|
4073
5168
|
}
|
|
@@ -4076,7 +5171,7 @@ function mergeAttachments(all) {
|
|
|
4076
5171
|
for (const list of all) {
|
|
4077
5172
|
if (!list) continue;
|
|
4078
5173
|
for (const inputFile of list) {
|
|
4079
|
-
deduped.add(
|
|
5174
|
+
deduped.add(path12.resolve(inputFile));
|
|
4080
5175
|
}
|
|
4081
5176
|
}
|
|
4082
5177
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -4125,7 +5220,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
4125
5220
|
// src/evaluation/providers/targets-file.ts
|
|
4126
5221
|
import { constants as constants3 } from "node:fs";
|
|
4127
5222
|
import { access as access3, readFile as readFile6 } from "node:fs/promises";
|
|
4128
|
-
import
|
|
5223
|
+
import path13 from "node:path";
|
|
4129
5224
|
import { parse as parse3 } from "yaml";
|
|
4130
5225
|
function isRecord(value) {
|
|
4131
5226
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -4162,7 +5257,7 @@ async function fileExists3(filePath) {
|
|
|
4162
5257
|
}
|
|
4163
5258
|
}
|
|
4164
5259
|
async function readTargetDefinitions(filePath) {
|
|
4165
|
-
const absolutePath =
|
|
5260
|
+
const absolutePath = path13.resolve(filePath);
|
|
4166
5261
|
if (!await fileExists3(absolutePath)) {
|
|
4167
5262
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
4168
5263
|
}
|
|
@@ -4196,6 +5291,10 @@ function createProvider(target) {
|
|
|
4196
5291
|
return new CodexProvider(target.name, target.config);
|
|
4197
5292
|
case "pi-coding-agent":
|
|
4198
5293
|
return new PiCodingAgentProvider(target.name, target.config);
|
|
5294
|
+
case "pi-agent-sdk":
|
|
5295
|
+
return new PiAgentSdkProvider(target.name, target.config);
|
|
5296
|
+
case "claude-code":
|
|
5297
|
+
return new ClaudeCodeProvider(target.name, target.config);
|
|
4199
5298
|
case "mock":
|
|
4200
5299
|
return new MockProvider(target.name, target.config);
|
|
4201
5300
|
case "vscode":
|
|
@@ -4214,73 +5313,170 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
4214
5313
|
|
|
4215
5314
|
// src/evaluation/evaluators.ts
|
|
4216
5315
|
import { generateText as generateText2 } from "ai";
|
|
4217
|
-
import { z } from "zod";
|
|
5316
|
+
import { z as z2 } from "zod";
|
|
4218
5317
|
|
|
4219
5318
|
// src/runtime/exec.ts
|
|
4220
|
-
function
|
|
4221
|
-
|
|
4222
|
-
|
|
5319
|
+
function shellEscapePath(value) {
|
|
5320
|
+
if (process.platform === "win32") {
|
|
5321
|
+
return `"${value.replaceAll('"', '""')}"`;
|
|
5322
|
+
}
|
|
5323
|
+
return `'${value.replaceAll("'", `'"'"'`)}'`;
|
|
4223
5324
|
}
|
|
4224
|
-
async function
|
|
4225
|
-
|
|
4226
|
-
|
|
4227
|
-
|
|
4228
|
-
|
|
4229
|
-
|
|
4230
|
-
|
|
4231
|
-
|
|
4232
|
-
|
|
4233
|
-
|
|
4234
|
-
|
|
4235
|
-
|
|
4236
|
-
|
|
4237
|
-
|
|
4238
|
-
|
|
4239
|
-
|
|
4240
|
-
|
|
4241
|
-
|
|
4242
|
-
|
|
4243
|
-
|
|
4244
|
-
|
|
4245
|
-
|
|
4246
|
-
|
|
5325
|
+
async function execFileWithStdin(argv, stdinPayload, options = {}) {
|
|
5326
|
+
if (argv.length === 0) {
|
|
5327
|
+
throw new Error("Executable argv must include at least one entry");
|
|
5328
|
+
}
|
|
5329
|
+
if (typeof Bun !== "undefined") {
|
|
5330
|
+
return execFileWithStdinBun(argv, stdinPayload, options);
|
|
5331
|
+
}
|
|
5332
|
+
return execFileWithStdinNode(argv, stdinPayload, options);
|
|
5333
|
+
}
|
|
5334
|
+
async function execFileWithStdinBun(argv, stdinPayload, options) {
|
|
5335
|
+
const command = [...argv];
|
|
5336
|
+
const encoder = new TextEncoder();
|
|
5337
|
+
const proc = Bun.spawn(command, {
|
|
5338
|
+
cwd: options.cwd,
|
|
5339
|
+
stdin: encoder.encode(stdinPayload),
|
|
5340
|
+
stdout: "pipe",
|
|
5341
|
+
stderr: "pipe"
|
|
5342
|
+
});
|
|
5343
|
+
let timedOut = false;
|
|
5344
|
+
const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
|
|
5345
|
+
timedOut = true;
|
|
5346
|
+
proc.kill("SIGKILL");
|
|
5347
|
+
}, options.timeoutMs) : void 0;
|
|
5348
|
+
try {
|
|
5349
|
+
const stdoutPromise = proc.stdout ? new Response(proc.stdout).text() : Promise.resolve("");
|
|
5350
|
+
const stderrPromise = proc.stderr ? new Response(proc.stderr).text() : Promise.resolve("");
|
|
5351
|
+
const [stdout, stderr, exitCode] = await Promise.all([
|
|
5352
|
+
stdoutPromise,
|
|
5353
|
+
stderrPromise,
|
|
5354
|
+
proc.exited
|
|
5355
|
+
]);
|
|
5356
|
+
if (timedOut) {
|
|
5357
|
+
throw new Error(`Process timed out after ${options.timeoutMs}ms`);
|
|
5358
|
+
}
|
|
5359
|
+
return {
|
|
5360
|
+
stdout: stdout.replace(/\r\n/g, "\n"),
|
|
5361
|
+
stderr: stderr.replace(/\r\n/g, "\n"),
|
|
5362
|
+
exitCode
|
|
5363
|
+
};
|
|
5364
|
+
} finally {
|
|
5365
|
+
if (timeout !== void 0) {
|
|
5366
|
+
clearTimeout(timeout);
|
|
4247
5367
|
}
|
|
4248
5368
|
}
|
|
4249
|
-
|
|
4250
|
-
|
|
4251
|
-
|
|
4252
|
-
|
|
5369
|
+
}
|
|
5370
|
+
async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
5371
|
+
const { spawn: spawn4 } = await import("node:child_process");
|
|
5372
|
+
return new Promise((resolve, reject) => {
|
|
5373
|
+
const [cmd, ...args] = argv;
|
|
5374
|
+
const child = spawn4(cmd, args, {
|
|
4253
5375
|
cwd: options.cwd,
|
|
4254
5376
|
stdio: ["pipe", "pipe", "pipe"]
|
|
4255
5377
|
});
|
|
4256
|
-
|
|
4257
|
-
|
|
4258
|
-
|
|
4259
|
-
|
|
4260
|
-
|
|
5378
|
+
const stdoutChunks = [];
|
|
5379
|
+
const stderrChunks = [];
|
|
5380
|
+
child.stdout?.on("data", (chunk) => stdoutChunks.push(chunk));
|
|
5381
|
+
child.stderr?.on("data", (chunk) => stderrChunks.push(chunk));
|
|
5382
|
+
let timedOut = false;
|
|
5383
|
+
const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
|
|
5384
|
+
timedOut = true;
|
|
5385
|
+
child.kill("SIGKILL");
|
|
4261
5386
|
}, options.timeoutMs) : void 0;
|
|
4262
|
-
child.stdout?.on("data", (data) => {
|
|
4263
|
-
stdout += data.toString();
|
|
4264
|
-
});
|
|
4265
|
-
child.stderr?.on("data", (data) => {
|
|
4266
|
-
stderr += data.toString();
|
|
4267
|
-
});
|
|
4268
5387
|
child.on("error", (error) => {
|
|
4269
|
-
if (timeout !== void 0)
|
|
4270
|
-
clearTimeout(timeout);
|
|
4271
|
-
}
|
|
5388
|
+
if (timeout !== void 0) clearTimeout(timeout);
|
|
4272
5389
|
reject(error);
|
|
4273
5390
|
});
|
|
4274
|
-
child.on("
|
|
4275
|
-
if (timeout !== void 0)
|
|
4276
|
-
|
|
5391
|
+
child.on("close", (code) => {
|
|
5392
|
+
if (timeout !== void 0) clearTimeout(timeout);
|
|
5393
|
+
if (timedOut) {
|
|
5394
|
+
reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
|
|
5395
|
+
return;
|
|
4277
5396
|
}
|
|
4278
|
-
|
|
5397
|
+
const stdout = Buffer.concat(stdoutChunks).toString("utf8").replace(/\r\n/g, "\n");
|
|
5398
|
+
const stderr = Buffer.concat(stderrChunks).toString("utf8").replace(/\r\n/g, "\n");
|
|
5399
|
+
resolve({
|
|
5400
|
+
stdout,
|
|
5401
|
+
stderr,
|
|
5402
|
+
exitCode: code ?? 0
|
|
5403
|
+
});
|
|
4279
5404
|
});
|
|
4280
|
-
child.stdin
|
|
4281
|
-
|
|
5405
|
+
if (child.stdin) {
|
|
5406
|
+
child.stdin.write(stdinPayload);
|
|
5407
|
+
child.stdin.end();
|
|
5408
|
+
}
|
|
4282
5409
|
});
|
|
4283
5410
|
}
|
|
5411
|
+
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
5412
|
+
const { mkdir: mkdir4, readFile: readFile7, rm: rm4, writeFile: writeFile4 } = await import("node:fs/promises");
|
|
5413
|
+
const { tmpdir: tmpdir4 } = await import("node:os");
|
|
5414
|
+
const path15 = await import("node:path");
|
|
5415
|
+
const { randomUUID: randomUUID4 } = await import("node:crypto");
|
|
5416
|
+
const dir = path15.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
|
|
5417
|
+
await mkdir4(dir, { recursive: true });
|
|
5418
|
+
const stdinPath = path15.join(dir, "stdin.txt");
|
|
5419
|
+
const stdoutPath = path15.join(dir, "stdout.txt");
|
|
5420
|
+
const stderrPath = path15.join(dir, "stderr.txt");
|
|
5421
|
+
await writeFile4(stdinPath, stdinPayload, "utf8");
|
|
5422
|
+
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
5423
|
+
const { spawn: spawn4 } = await import("node:child_process");
|
|
5424
|
+
try {
|
|
5425
|
+
const exitCode = await new Promise((resolve, reject) => {
|
|
5426
|
+
const child = spawn4(wrappedCommand, {
|
|
5427
|
+
shell: true,
|
|
5428
|
+
cwd: options.cwd,
|
|
5429
|
+
stdio: ["ignore", "ignore", "ignore"]
|
|
5430
|
+
});
|
|
5431
|
+
const timeout = options.timeoutMs ? setTimeout(() => {
|
|
5432
|
+
child.kill();
|
|
5433
|
+
reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
|
|
5434
|
+
}, options.timeoutMs) : void 0;
|
|
5435
|
+
child.on("error", (error) => {
|
|
5436
|
+
if (timeout !== void 0) {
|
|
5437
|
+
clearTimeout(timeout);
|
|
5438
|
+
}
|
|
5439
|
+
reject(error);
|
|
5440
|
+
});
|
|
5441
|
+
child.on("exit", (code) => {
|
|
5442
|
+
if (timeout !== void 0) {
|
|
5443
|
+
clearTimeout(timeout);
|
|
5444
|
+
}
|
|
5445
|
+
resolve(code ?? 0);
|
|
5446
|
+
});
|
|
5447
|
+
});
|
|
5448
|
+
const stdout = (await readFile7(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
|
|
5449
|
+
const stderr = (await readFile7(stderrPath, "utf8")).replace(/\r\n/g, "\n");
|
|
5450
|
+
return { stdout, stderr, exitCode };
|
|
5451
|
+
} finally {
|
|
5452
|
+
await rm4(dir, { recursive: true, force: true });
|
|
5453
|
+
}
|
|
5454
|
+
}
|
|
5455
|
+
|
|
5456
|
+
// src/evaluation/case-conversion.ts
|
|
5457
|
+
function toSnakeCase(str) {
|
|
5458
|
+
if (/^[A-Z]/.test(str)) {
|
|
5459
|
+
return str;
|
|
5460
|
+
}
|
|
5461
|
+
return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
|
|
5462
|
+
}
|
|
5463
|
+
function toSnakeCaseDeep(obj) {
|
|
5464
|
+
if (obj === null || obj === void 0) {
|
|
5465
|
+
return obj;
|
|
5466
|
+
}
|
|
5467
|
+
if (Array.isArray(obj)) {
|
|
5468
|
+
return obj.map((item) => toSnakeCaseDeep(item));
|
|
5469
|
+
}
|
|
5470
|
+
if (typeof obj === "object") {
|
|
5471
|
+
const result = {};
|
|
5472
|
+
for (const [key, value] of Object.entries(obj)) {
|
|
5473
|
+
const snakeKey = toSnakeCase(key);
|
|
5474
|
+
result[snakeKey] = toSnakeCaseDeep(value);
|
|
5475
|
+
}
|
|
5476
|
+
return result;
|
|
5477
|
+
}
|
|
5478
|
+
return obj;
|
|
5479
|
+
}
|
|
4284
5480
|
|
|
4285
5481
|
// src/evaluation/evaluators.ts
|
|
4286
5482
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
@@ -4300,20 +5496,20 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
4300
5496
|
|
|
4301
5497
|
[[ ## candidate_answer ## ]]
|
|
4302
5498
|
{{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
|
|
4303
|
-
var freeformEvaluationSchema =
|
|
4304
|
-
score:
|
|
4305
|
-
hits:
|
|
4306
|
-
misses:
|
|
4307
|
-
reasoning:
|
|
5499
|
+
var freeformEvaluationSchema = z2.object({
|
|
5500
|
+
score: z2.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
|
|
5501
|
+
hits: z2.array(z2.string()).describe("Brief specific achievements").optional(),
|
|
5502
|
+
misses: z2.array(z2.string()).describe("Brief failures or omissions").optional(),
|
|
5503
|
+
reasoning: z2.string().describe("Concise explanation (1-2 sentences)").optional()
|
|
4308
5504
|
});
|
|
4309
|
-
var rubricCheckResultSchema =
|
|
4310
|
-
id:
|
|
4311
|
-
satisfied:
|
|
4312
|
-
reasoning:
|
|
5505
|
+
var rubricCheckResultSchema = z2.object({
|
|
5506
|
+
id: z2.string().describe("The ID of the rubric item being checked"),
|
|
5507
|
+
satisfied: z2.boolean().describe("Whether this rubric requirement is met"),
|
|
5508
|
+
reasoning: z2.string().describe("Brief explanation (1-2 sentences) for this check")
|
|
4313
5509
|
});
|
|
4314
|
-
var rubricEvaluationSchema =
|
|
4315
|
-
checks:
|
|
4316
|
-
overall_reasoning:
|
|
5510
|
+
var rubricEvaluationSchema = z2.object({
|
|
5511
|
+
checks: z2.array(rubricCheckResultSchema).describe("Results for each rubric item"),
|
|
5512
|
+
overall_reasoning: z2.string().describe("Overall assessment summary (1-2 sentences)")
|
|
4317
5513
|
});
|
|
4318
5514
|
var LlmJudgeEvaluator = class {
|
|
4319
5515
|
kind = "llm_judge";
|
|
@@ -4549,30 +5745,30 @@ var CodeEvaluator = class {
|
|
|
4549
5745
|
script;
|
|
4550
5746
|
cwd;
|
|
4551
5747
|
agentTimeoutMs;
|
|
5748
|
+
config;
|
|
4552
5749
|
constructor(options) {
|
|
4553
5750
|
this.script = options.script;
|
|
4554
5751
|
this.cwd = options.cwd;
|
|
4555
5752
|
this.agentTimeoutMs = options.agentTimeoutMs;
|
|
5753
|
+
this.config = options.config;
|
|
4556
5754
|
}
|
|
4557
5755
|
async evaluate(context) {
|
|
4558
|
-
const
|
|
4559
|
-
|
|
4560
|
-
|
|
4561
|
-
|
|
4562
|
-
|
|
4563
|
-
|
|
4564
|
-
|
|
4565
|
-
|
|
4566
|
-
|
|
4567
|
-
|
|
4568
|
-
|
|
4569
|
-
|
|
4570
|
-
|
|
4571
|
-
|
|
4572
|
-
|
|
4573
|
-
|
|
4574
|
-
2
|
|
4575
|
-
);
|
|
5756
|
+
const payload = {
|
|
5757
|
+
question: context.evalCase.question,
|
|
5758
|
+
expectedOutcome: context.evalCase.expected_outcome,
|
|
5759
|
+
expectedMessages: context.evalCase.expected_messages,
|
|
5760
|
+
referenceAnswer: context.evalCase.reference_answer,
|
|
5761
|
+
candidateAnswer: context.candidate,
|
|
5762
|
+
outputMessages: context.outputMessages ?? null,
|
|
5763
|
+
guidelineFiles: context.evalCase.guideline_paths,
|
|
5764
|
+
inputFiles: context.evalCase.file_paths.filter(
|
|
5765
|
+
(path15) => !context.evalCase.guideline_paths.includes(path15)
|
|
5766
|
+
),
|
|
5767
|
+
inputMessages: context.evalCase.input_messages,
|
|
5768
|
+
traceSummary: context.traceSummary ?? null,
|
|
5769
|
+
config: this.config ?? null
|
|
5770
|
+
};
|
|
5771
|
+
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
4576
5772
|
try {
|
|
4577
5773
|
const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
|
|
4578
5774
|
const parsed = parseJsonSafe(stdout);
|
|
@@ -4638,18 +5834,25 @@ function calculateRubricScore(result, rubrics) {
|
|
|
4638
5834
|
return { score, verdict, hits, misses };
|
|
4639
5835
|
}
|
|
4640
5836
|
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
|
|
4641
|
-
const { stdout, stderr, exitCode } = await execShellWithStdin(scriptPath, input, {
|
|
4642
|
-
cwd,
|
|
4643
|
-
timeoutMs: agentTimeoutMs
|
|
4644
|
-
});
|
|
5837
|
+
const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs });
|
|
4645
5838
|
if (exitCode !== 0) {
|
|
4646
|
-
const trimmedErr = stderr
|
|
5839
|
+
const trimmedErr = formatStderr(stderr);
|
|
4647
5840
|
throw new Error(
|
|
4648
5841
|
trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
|
|
4649
5842
|
);
|
|
4650
5843
|
}
|
|
4651
5844
|
return stdout.trim();
|
|
4652
5845
|
}
|
|
5846
|
+
function formatStderr(stderr) {
|
|
5847
|
+
const trimmed = stderr.trim();
|
|
5848
|
+
const maxLength = 2e3;
|
|
5849
|
+
if (trimmed.length <= maxLength) {
|
|
5850
|
+
return trimmed;
|
|
5851
|
+
}
|
|
5852
|
+
const tail = trimmed.slice(-maxLength);
|
|
5853
|
+
return `...(truncated, last ${maxLength} chars)
|
|
5854
|
+
${tail}`;
|
|
5855
|
+
}
|
|
4653
5856
|
function parseJsonSafe(payload) {
|
|
4654
5857
|
try {
|
|
4655
5858
|
return JSON.parse(payload);
|
|
@@ -4881,22 +6084,438 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4881
6084
|
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
4882
6085
|
}
|
|
4883
6086
|
} else {
|
|
4884
|
-
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
6087
|
+
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
6088
|
+
}
|
|
6089
|
+
}
|
|
6090
|
+
for (let i = checkLength; i < expected.length; i++) {
|
|
6091
|
+
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
6092
|
+
}
|
|
6093
|
+
const score = hits.length / expected.length;
|
|
6094
|
+
return {
|
|
6095
|
+
score,
|
|
6096
|
+
verdict: scoreToVerdict(score),
|
|
6097
|
+
hits,
|
|
6098
|
+
misses,
|
|
6099
|
+
expectedAspectCount: expected.length
|
|
6100
|
+
};
|
|
6101
|
+
}
|
|
6102
|
+
};
|
|
6103
|
+
var DEFAULT_DATE_FORMATS = [
|
|
6104
|
+
"YYYY-MM-DDTHH:mm:ssZ",
|
|
6105
|
+
// ISO with timezone
|
|
6106
|
+
"YYYY-MM-DDTHH:mm:ss",
|
|
6107
|
+
// ISO with time
|
|
6108
|
+
"YYYY-MM-DD",
|
|
6109
|
+
// ISO date
|
|
6110
|
+
"DD-MMM-YYYY",
|
|
6111
|
+
// Localized (e.g., "15-JAN-2025")
|
|
6112
|
+
"MM/DD/YYYY",
|
|
6113
|
+
// US format
|
|
6114
|
+
"DD/MM/YYYY",
|
|
6115
|
+
// EU format
|
|
6116
|
+
"MM-DD-YYYY",
|
|
6117
|
+
// US with dashes
|
|
6118
|
+
"DD-MM-YYYY"
|
|
6119
|
+
// EU with dashes
|
|
6120
|
+
];
|
|
6121
|
+
var MONTH_NAMES = {
|
|
6122
|
+
jan: 0,
|
|
6123
|
+
january: 0,
|
|
6124
|
+
feb: 1,
|
|
6125
|
+
february: 1,
|
|
6126
|
+
mar: 2,
|
|
6127
|
+
march: 2,
|
|
6128
|
+
apr: 3,
|
|
6129
|
+
april: 3,
|
|
6130
|
+
may: 4,
|
|
6131
|
+
jun: 5,
|
|
6132
|
+
june: 5,
|
|
6133
|
+
jul: 6,
|
|
6134
|
+
july: 6,
|
|
6135
|
+
aug: 7,
|
|
6136
|
+
august: 7,
|
|
6137
|
+
sep: 8,
|
|
6138
|
+
sept: 8,
|
|
6139
|
+
september: 8,
|
|
6140
|
+
oct: 9,
|
|
6141
|
+
october: 9,
|
|
6142
|
+
nov: 10,
|
|
6143
|
+
november: 10,
|
|
6144
|
+
dec: 11,
|
|
6145
|
+
december: 11
|
|
6146
|
+
};
|
|
6147
|
+
var FieldAccuracyEvaluator = class {
|
|
6148
|
+
kind = "field_accuracy";
|
|
6149
|
+
config;
|
|
6150
|
+
constructor(options) {
|
|
6151
|
+
this.config = options.config;
|
|
6152
|
+
}
|
|
6153
|
+
evaluate(context) {
|
|
6154
|
+
const { evalCase, candidate } = context;
|
|
6155
|
+
let candidateData;
|
|
6156
|
+
try {
|
|
6157
|
+
candidateData = parseJsonFromTextSafe(candidate);
|
|
6158
|
+
} catch {
|
|
6159
|
+
return {
|
|
6160
|
+
score: 0,
|
|
6161
|
+
verdict: "fail",
|
|
6162
|
+
hits: [],
|
|
6163
|
+
misses: ["Failed to parse candidate answer as JSON"],
|
|
6164
|
+
expectedAspectCount: this.config.fields.length,
|
|
6165
|
+
reasoning: "Candidate answer is not valid JSON"
|
|
6166
|
+
};
|
|
6167
|
+
}
|
|
6168
|
+
const expectedData = this.extractExpectedData(evalCase.expected_messages);
|
|
6169
|
+
if (!expectedData) {
|
|
6170
|
+
return {
|
|
6171
|
+
score: 0,
|
|
6172
|
+
verdict: "fail",
|
|
6173
|
+
hits: [],
|
|
6174
|
+
misses: ["No expected data found in expected_messages"],
|
|
6175
|
+
expectedAspectCount: this.config.fields.length,
|
|
6176
|
+
reasoning: "Could not extract expected data from expected_messages"
|
|
6177
|
+
};
|
|
6178
|
+
}
|
|
6179
|
+
const fieldResults = [];
|
|
6180
|
+
for (const fieldConfig of this.config.fields) {
|
|
6181
|
+
const result = this.evaluateField(fieldConfig, candidateData, expectedData);
|
|
6182
|
+
fieldResults.push(result);
|
|
6183
|
+
}
|
|
6184
|
+
return this.aggregateResults(fieldResults);
|
|
6185
|
+
}
|
|
6186
|
+
/**
|
|
6187
|
+
* Extract expected data from expected_messages array.
|
|
6188
|
+
* Looks for the last assistant message with content.
|
|
6189
|
+
*/
|
|
6190
|
+
extractExpectedData(expectedMessages) {
|
|
6191
|
+
for (let i = expectedMessages.length - 1; i >= 0; i--) {
|
|
6192
|
+
const message = expectedMessages[i];
|
|
6193
|
+
if (message.role === "assistant" && message.content) {
|
|
6194
|
+
if (typeof message.content === "object" && message.content !== null) {
|
|
6195
|
+
return message.content;
|
|
6196
|
+
}
|
|
6197
|
+
if (typeof message.content === "string") {
|
|
6198
|
+
try {
|
|
6199
|
+
return parseJsonFromTextSafe(message.content);
|
|
6200
|
+
} catch {
|
|
6201
|
+
}
|
|
6202
|
+
}
|
|
6203
|
+
}
|
|
6204
|
+
}
|
|
6205
|
+
return void 0;
|
|
6206
|
+
}
|
|
6207
|
+
/**
|
|
6208
|
+
* Evaluate a single field against the expected value.
|
|
6209
|
+
*/
|
|
6210
|
+
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
6211
|
+
const { path: path15, match, required = true, weight = 1 } = fieldConfig;
|
|
6212
|
+
const candidateValue = resolvePath(candidateData, path15);
|
|
6213
|
+
const expectedValue = resolvePath(expectedData, path15);
|
|
6214
|
+
if (expectedValue === void 0) {
|
|
6215
|
+
return {
|
|
6216
|
+
path: path15,
|
|
6217
|
+
score: 1,
|
|
6218
|
+
// No expected value means no comparison needed
|
|
6219
|
+
weight,
|
|
6220
|
+
hit: true,
|
|
6221
|
+
message: `${path15}: no expected value`
|
|
6222
|
+
};
|
|
6223
|
+
}
|
|
6224
|
+
if (candidateValue === void 0) {
|
|
6225
|
+
if (required) {
|
|
6226
|
+
return {
|
|
6227
|
+
path: path15,
|
|
6228
|
+
score: 0,
|
|
6229
|
+
weight,
|
|
6230
|
+
hit: false,
|
|
6231
|
+
message: `${path15} (required, missing)`
|
|
6232
|
+
};
|
|
6233
|
+
}
|
|
6234
|
+
return {
|
|
6235
|
+
path: path15,
|
|
6236
|
+
score: 1,
|
|
6237
|
+
// Don't penalize missing optional fields
|
|
6238
|
+
weight: 0,
|
|
6239
|
+
// Zero weight means it won't affect the score
|
|
6240
|
+
hit: true,
|
|
6241
|
+
message: `${path15}: optional field missing`
|
|
6242
|
+
};
|
|
6243
|
+
}
|
|
6244
|
+
switch (match) {
|
|
6245
|
+
case "exact":
|
|
6246
|
+
return this.compareExact(path15, candidateValue, expectedValue, weight);
|
|
6247
|
+
case "numeric_tolerance":
|
|
6248
|
+
return this.compareNumericTolerance(
|
|
6249
|
+
path15,
|
|
6250
|
+
candidateValue,
|
|
6251
|
+
expectedValue,
|
|
6252
|
+
fieldConfig,
|
|
6253
|
+
weight
|
|
6254
|
+
);
|
|
6255
|
+
case "date":
|
|
6256
|
+
return this.compareDate(path15, candidateValue, expectedValue, fieldConfig, weight);
|
|
6257
|
+
default:
|
|
6258
|
+
return {
|
|
6259
|
+
path: path15,
|
|
6260
|
+
score: 0,
|
|
6261
|
+
weight,
|
|
6262
|
+
hit: false,
|
|
6263
|
+
message: `${path15}: unknown match type "${match}"`
|
|
6264
|
+
};
|
|
6265
|
+
}
|
|
6266
|
+
}
|
|
6267
|
+
/**
|
|
6268
|
+
* Exact equality comparison.
|
|
6269
|
+
*/
|
|
6270
|
+
compareExact(path15, candidateValue, expectedValue, weight) {
|
|
6271
|
+
if (deepEqual(candidateValue, expectedValue)) {
|
|
6272
|
+
return {
|
|
6273
|
+
path: path15,
|
|
6274
|
+
score: 1,
|
|
6275
|
+
weight,
|
|
6276
|
+
hit: true,
|
|
6277
|
+
message: path15
|
|
6278
|
+
};
|
|
6279
|
+
}
|
|
6280
|
+
if (typeof candidateValue !== typeof expectedValue) {
|
|
6281
|
+
return {
|
|
6282
|
+
path: path15,
|
|
6283
|
+
score: 0,
|
|
6284
|
+
weight,
|
|
6285
|
+
hit: false,
|
|
6286
|
+
message: `${path15} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
6287
|
+
};
|
|
6288
|
+
}
|
|
6289
|
+
return {
|
|
6290
|
+
path: path15,
|
|
6291
|
+
score: 0,
|
|
6292
|
+
weight,
|
|
6293
|
+
hit: false,
|
|
6294
|
+
message: `${path15} (value mismatch)`
|
|
6295
|
+
};
|
|
6296
|
+
}
|
|
6297
|
+
/**
|
|
6298
|
+
* Numeric comparison with absolute or relative tolerance.
|
|
6299
|
+
*/
|
|
6300
|
+
compareNumericTolerance(path15, candidateValue, expectedValue, fieldConfig, weight) {
|
|
6301
|
+
const { tolerance = 0, relative = false } = fieldConfig;
|
|
6302
|
+
const candidateNum = toNumber(candidateValue);
|
|
6303
|
+
const expectedNum = toNumber(expectedValue);
|
|
6304
|
+
if (candidateNum === null || expectedNum === null) {
|
|
6305
|
+
return {
|
|
6306
|
+
path: path15,
|
|
6307
|
+
score: 0,
|
|
6308
|
+
weight,
|
|
6309
|
+
hit: false,
|
|
6310
|
+
message: `${path15} (non-numeric value)`
|
|
6311
|
+
};
|
|
6312
|
+
}
|
|
6313
|
+
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
6314
|
+
return {
|
|
6315
|
+
path: path15,
|
|
6316
|
+
score: 0,
|
|
6317
|
+
weight,
|
|
6318
|
+
hit: false,
|
|
6319
|
+
message: `${path15} (invalid numeric value)`
|
|
6320
|
+
};
|
|
6321
|
+
}
|
|
6322
|
+
const diff = Math.abs(candidateNum - expectedNum);
|
|
6323
|
+
let withinTolerance;
|
|
6324
|
+
if (relative) {
|
|
6325
|
+
const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
|
|
6326
|
+
withinTolerance = relativeDiff <= tolerance;
|
|
6327
|
+
} else {
|
|
6328
|
+
withinTolerance = diff <= tolerance;
|
|
6329
|
+
}
|
|
6330
|
+
if (withinTolerance) {
|
|
6331
|
+
return {
|
|
6332
|
+
path: path15,
|
|
6333
|
+
score: 1,
|
|
6334
|
+
weight,
|
|
6335
|
+
hit: true,
|
|
6336
|
+
message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
|
|
6337
|
+
};
|
|
6338
|
+
}
|
|
6339
|
+
return {
|
|
6340
|
+
path: path15,
|
|
6341
|
+
score: 0,
|
|
6342
|
+
weight,
|
|
6343
|
+
hit: false,
|
|
6344
|
+
message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
6345
|
+
};
|
|
6346
|
+
}
|
|
6347
|
+
/**
|
|
6348
|
+
* Date comparison with format normalization.
|
|
6349
|
+
*/
|
|
6350
|
+
compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
|
|
6351
|
+
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
6352
|
+
const candidateDate = parseDate(String(candidateValue), formats);
|
|
6353
|
+
const expectedDate = parseDate(String(expectedValue), formats);
|
|
6354
|
+
if (candidateDate === null) {
|
|
6355
|
+
return {
|
|
6356
|
+
path: path15,
|
|
6357
|
+
score: 0,
|
|
6358
|
+
weight,
|
|
6359
|
+
hit: false,
|
|
6360
|
+
message: `${path15} (unparseable candidate date)`
|
|
6361
|
+
};
|
|
6362
|
+
}
|
|
6363
|
+
if (expectedDate === null) {
|
|
6364
|
+
return {
|
|
6365
|
+
path: path15,
|
|
6366
|
+
score: 0,
|
|
6367
|
+
weight,
|
|
6368
|
+
hit: false,
|
|
6369
|
+
message: `${path15} (unparseable expected date)`
|
|
6370
|
+
};
|
|
6371
|
+
}
|
|
6372
|
+
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
6373
|
+
return {
|
|
6374
|
+
path: path15,
|
|
6375
|
+
score: 1,
|
|
6376
|
+
weight,
|
|
6377
|
+
hit: true,
|
|
6378
|
+
message: path15
|
|
6379
|
+
};
|
|
6380
|
+
}
|
|
6381
|
+
return {
|
|
6382
|
+
path: path15,
|
|
6383
|
+
score: 0,
|
|
6384
|
+
weight,
|
|
6385
|
+
hit: false,
|
|
6386
|
+
message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
6387
|
+
};
|
|
6388
|
+
}
|
|
6389
|
+
/**
|
|
6390
|
+
* Aggregate field results using configured strategy.
|
|
6391
|
+
*/
|
|
6392
|
+
aggregateResults(results) {
|
|
6393
|
+
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
6394
|
+
const hits = [];
|
|
6395
|
+
const misses = [];
|
|
6396
|
+
for (const result of results) {
|
|
6397
|
+
if (result.hit) {
|
|
6398
|
+
hits.push(result.message);
|
|
6399
|
+
} else {
|
|
6400
|
+
misses.push(result.message);
|
|
4885
6401
|
}
|
|
4886
6402
|
}
|
|
4887
|
-
|
|
4888
|
-
|
|
6403
|
+
let score;
|
|
6404
|
+
if (aggregation === "all_or_nothing") {
|
|
6405
|
+
score = misses.length === 0 ? 1 : 0;
|
|
6406
|
+
} else {
|
|
6407
|
+
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
6408
|
+
if (totalWeight === 0) {
|
|
6409
|
+
score = results.length === 0 ? 1 : 0;
|
|
6410
|
+
} else {
|
|
6411
|
+
const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
|
|
6412
|
+
score = weightedSum / totalWeight;
|
|
6413
|
+
}
|
|
4889
6414
|
}
|
|
4890
|
-
const
|
|
6415
|
+
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
4891
6416
|
return {
|
|
4892
|
-
score,
|
|
6417
|
+
score: clampScore(score),
|
|
4893
6418
|
verdict: scoreToVerdict(score),
|
|
4894
|
-
hits,
|
|
4895
|
-
misses,
|
|
4896
|
-
expectedAspectCount:
|
|
6419
|
+
hits: hits.slice(0, 4),
|
|
6420
|
+
misses: misses.slice(0, 4),
|
|
6421
|
+
expectedAspectCount: results.length,
|
|
6422
|
+
reasoning
|
|
4897
6423
|
};
|
|
4898
6424
|
}
|
|
4899
6425
|
};
|
|
6426
|
+
/**
 * Look up a nested value by a dotted / bracketed path such as "a.b[0].c".
 *
 * The path is split on ".", "[" and "]"; empty segments are dropped. An
 * all-digit segment is used as a numeric index when the current value is an
 * array, and as a plain property key otherwise.
 *
 * @param {unknown} obj Root value to traverse.
 * @param {string} path15 Path expression.
 * @returns {unknown} The resolved value, or undefined when the root or path is
 *   falsy or the walk reaches a null, undefined or non-object value before the
 *   path is exhausted.
 */
function resolvePath(obj, path15) {
  if (!path15 || !obj) {
    return undefined;
  }
  const segments = path15.split(/\.|\[|\]/).filter((segment) => segment.length > 0);
  let cursor = obj;
  for (const segment of segments) {
    const canDescend = cursor !== null && cursor !== undefined && typeof cursor === "object";
    if (!canDescend) {
      return undefined;
    }
    const key = /^\d+$/.test(segment) && Array.isArray(cursor)
      ? Number.parseInt(segment, 10)
      : segment;
    cursor = cursor[key];
  }
  return cursor;
}
|
|
6448
|
+
/**
 * Coerce a value to a number for numeric comparison.
 *
 * Numbers are returned unchanged (including NaN and ±Infinity). Strings are
 * parsed with Number.parseFloat, so a leading numeric prefix is accepted.
 *
 * @param {unknown} value Value to convert.
 * @returns {number|null} The number, or null for unparseable strings and all other types.
 */
function toNumber(value) {
  switch (typeof value) {
    case "number":
      return value;
    case "string": {
      const parsed = Number.parseFloat(value);
      return Number.isNaN(parsed) ? null : parsed;
    }
    default:
      return null;
  }
}
|
|
6458
|
+
/**
 * Parse a date string into a Date at local midnight where possible.
 *
 * Recognized shapes, tried in this order:
 * 1. Date-only ISO "YYYY-MM-DD". Built from local components, because the Date
 *    constructor would read it as UTC midnight, which can land on the previous
 *    local calendar day.
 * 2. "D-Mon-YYYY" / "D-Month-YYYY", with the month looked up in MONTH_NAMES.
 * 3. Numeric "N/N/YYYY" or "N-N-YYYY". `formats` decides the order: month-first
 *    when only MM/DD-style formats are listed, day-first when only DD/MM-style
 *    formats are listed. Otherwise a value above 12 disambiguates, and a fully
 *    ambiguous value is read month-first.
 * 4. Anything else, or a structured shape that failed validation, goes to the
 *    Date constructor.
 *
 * The structured shapes are tried before the Date constructor so that a
 * configured day-first format is honoured for ambiguous values such as
 * "05/01/2024", which the constructor would otherwise accept as month-first.
 *
 * @param {string} dateStr Text to parse.
 * @param {string[]} formats Accepted format hints, e.g. "DD/MM/YYYY".
 * @returns {Date|null} The parsed date, or null when the text is empty or unparseable.
 */
function parseDate(dateStr, formats) {
  if (!dateStr) return null;
  const trimmed = dateStr.trim();
  const isValidMonthDay = (monthIndex, day) =>
    monthIndex >= 0 && monthIndex <= 11 && day >= 1 && day <= 31;

  const isoDateOnly = trimmed.match(/^(\d{4})-(\d{2})-(\d{2})$/);
  if (isoDateOnly) {
    const year = Number.parseInt(isoDateOnly[1], 10);
    const month = Number.parseInt(isoDateOnly[2], 10) - 1;
    const day = Number.parseInt(isoDateOnly[3], 10);
    if (isValidMonthDay(month, day)) {
      return new Date(year, month, day);
    }
  }

  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
  if (localizedMatch) {
    const day = Number.parseInt(localizedMatch[1], 10);
    const monthName = localizedMatch[2].toLowerCase();
    const year = Number.parseInt(localizedMatch[3], 10);
    const month = MONTH_NAMES[monthName];
    if (month !== void 0) {
      return new Date(year, month, day);
    }
  }

  const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
  if (usMatch) {
    const num1 = Number.parseInt(usMatch[1], 10);
    const num2 = Number.parseInt(usMatch[2], 10);
    const year = Number.parseInt(usMatch[3], 10);
    const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
    const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
    if (hasUSFormat && !hasEUFormat) {
      if (isValidMonthDay(num1 - 1, num2)) {
        return new Date(year, num1 - 1, num2);
      }
    } else if (hasEUFormat && !hasUSFormat) {
      if (isValidMonthDay(num2 - 1, num1)) {
        return new Date(year, num2 - 1, num1);
      }
    } else {
      // No single convention configured: a number above 12 can only be the day.
      if (num1 > 12 && num2 <= 12) {
        return new Date(year, num2 - 1, num1);
      }
      if (num2 > 12 && num1 <= 12) {
        return new Date(year, num1 - 1, num2);
      }
      // Fully ambiguous values default to month-first.
      if (num1 <= 12 && num2 <= 31) {
        return new Date(year, num1 - 1, num2);
      }
    }
  }

  // Last resort: full ISO timestamps and whatever else the engine can parse.
  const fallback = new Date(trimmed);
  return Number.isNaN(fallback.getTime()) ? null : fallback;
}
|
|
6510
|
+
/**
 * Format a Date as "YYYY-MM-DD" using its local calendar fields.
 *
 * Local fields are used on purpose: the date comparison in this file matches
 * on getFullYear/getMonth/getDate, so rendering the UTC date (as toISOString
 * does) could print a different day than the one that was actually compared.
 *
 * @param {Date} date Date to format.
 * @returns {string} The zero-padded local date.
 */
function formatDateISO(date) {
  const year = String(date.getFullYear()).padStart(4, "0");
  const month = String(date.getMonth() + 1).padStart(2, "0");
  const day = String(date.getDate()).padStart(2, "0");
  return `${year}-${month}-${day}`;
}
|
|
6513
|
+
/**
 * Parse JSON embedded in free text, e.g. a fenced ```json block or an object
 * surrounded by prose.
 *
 * Code-fence markers are stripped first. The span from the first "{" to the
 * last "}" is tried first, which keeps the long-standing behaviour for objects
 * wrapped in prose. If that span is not valid JSON (for example the text is a
 * top-level array holding several objects), the whole cleaned text is parsed
 * instead.
 *
 * @param {unknown} text Text to parse; non-strings are treated as empty text.
 * @returns {unknown} The parsed JSON value.
 * @throws {SyntaxError} When neither the extracted span nor the full text is valid JSON.
 */
function parseJsonFromTextSafe(text) {
  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
  const objectBlob = cleaned.match(/\{[\s\S]*\}/)?.[0];
  if (objectBlob === undefined || objectBlob === cleaned) {
    return JSON.parse(cleaned);
  }
  try {
    return JSON.parse(objectBlob);
  } catch (objectError) {
    try {
      return JSON.parse(cleaned);
    } catch {
      // Report the original failure so callers see the same error as before.
      throw objectError;
    }
  }
}
|
|
4900
6519
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
4901
6520
|
{{EVALUATOR_RESULTS_JSON}}
|
|
4902
6521
|
|
|
@@ -5121,11 +6740,175 @@ var CompositeEvaluator = class {
|
|
|
5121
6740
|
}
|
|
5122
6741
|
}
|
|
5123
6742
|
};
|
|
6743
|
+
/**
 * Pass/fail evaluator that checks the reported execution duration against
 * `config.threshold` (both printed in milliseconds).
 *
 * The duration is read from `context.traceSummary.durationMs`. When it is
 * undefined the evaluation fails with a "no data" miss.
 */
var LatencyEvaluator = class {
  kind = "latency";
  config;

  /** @param {{config: {threshold: number}}} options */
  constructor(options) {
    this.config = options.config;
  }

  /**
   * @param {{traceSummary?: {durationMs?: number}}} context Evaluation context.
   * @returns {object} Score object; the score is 1 when durationMs <= threshold, otherwise 0.
   */
  evaluate(context) {
    const threshold = this.config.threshold;
    const durationMs = context.traceSummary?.durationMs;
    const rawRequest = (measured) => ({
      type: "latency",
      threshold,
      durationMs: measured
    });

    if (durationMs === undefined) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No duration data available in trace"],
        expectedAspectCount: 1,
        reasoning: "Execution duration not reported by provider",
        evaluatorRawRequest: rawRequest(null)
      };
    }

    const withinThreshold = durationMs <= threshold;
    const hits = [];
    const misses = [];
    if (withinThreshold) {
      hits.push(`Duration ${durationMs}ms <= ${threshold}ms threshold`);
    } else {
      misses.push(`Duration ${durationMs}ms > ${threshold}ms threshold`);
    }

    return {
      score: withinThreshold ? 1 : 0,
      verdict: withinThreshold ? "pass" : "fail",
      hits,
      misses,
      expectedAspectCount: 1,
      reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
      evaluatorRawRequest: rawRequest(durationMs)
    };
  }
};
|
|
6784
|
+
/**
 * Pass/fail evaluator that checks the reported execution cost against
 * `config.budget`. Amounts are printed as dollars with four decimals.
 *
 * The cost is read from `context.traceSummary.costUsd`. When it is undefined
 * the evaluation fails with a "no data" miss.
 */
var CostEvaluator = class {
  kind = "cost";
  config;

  /** @param {{config: {budget: number}}} options */
  constructor(options) {
    this.config = options.config;
  }

  /**
   * @param {{traceSummary?: {costUsd?: number}}} context Evaluation context.
   * @returns {object} Score object; the score is 1 when costUsd <= budget, otherwise 0.
   */
  evaluate(context) {
    const budget = this.config.budget;
    const costUsd = context.traceSummary?.costUsd;
    const rawRequest = (measured) => ({
      type: "cost",
      budget,
      costUsd: measured
    });

    if (costUsd === undefined) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No cost data available in trace"],
        expectedAspectCount: 1,
        reasoning: "Execution cost not reported by provider",
        evaluatorRawRequest: rawRequest(null)
      };
    }

    const formatCost = (n) => `$${n.toFixed(4)}`;
    const withinBudget = costUsd <= budget;
    const hits = [];
    const misses = [];
    if (withinBudget) {
      hits.push(`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`);
    } else {
      misses.push(`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`);
    }

    return {
      score: withinBudget ? 1 : 0,
      verdict: withinBudget ? "pass" : "fail",
      hits,
      misses,
      expectedAspectCount: 1,
      reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
      evaluatorRawRequest: rawRequest(costUsd)
    };
  }
};
|
|
6826
|
+
/**
 * Pass/fail evaluator that checks reported token usage against the optional
 * limits `config.max_input`, `config.max_output` and `config.max_total`.
 *
 * Usage is read from `context.traceSummary.tokenUsage`. The total is computed
 * as input + output + cached, with cached defaulting to 0. Only limits that
 * are numbers are checked, and the evaluation passes when none is exceeded.
 */
var TokenUsageEvaluator = class {
  kind = "token_usage";
  config;

  /** @param {{config: {max_total?: number, max_input?: number, max_output?: number}}} options */
  constructor(options) {
    this.config = options.config;
  }

  /**
   * @param {{traceSummary?: {tokenUsage?: {input: number, output: number, cached?: number}}}} context
   * @returns {object} Score object; the score is 1 when no configured limit is exceeded, otherwise 0.
   */
  evaluate(context) {
    const usage = context.traceSummary?.tokenUsage;
    const {
      max_total: maxTotal,
      max_input: maxInput,
      max_output: maxOutput
    } = this.config;

    // Each configured limit counts as one aspect, with a floor of one.
    const configuredLimits = [maxTotal, maxInput, maxOutput].filter((v) => typeof v === "number");
    const expectedAspectCount = Math.max(configuredLimits.length, 1);
    const rawRequest = (tokenUsage) => ({
      type: "token_usage",
      max_total: maxTotal ?? null,
      max_input: maxInput ?? null,
      max_output: maxOutput ?? null,
      tokenUsage
    });

    if (!usage) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No token usage data available in trace"],
        expectedAspectCount,
        reasoning: "Token usage not reported by provider",
        evaluatorRawRequest: rawRequest(null)
      };
    }

    const { input, output } = usage;
    const cached = usage.cached ?? 0;
    const total = input + output + cached;

    const hits = [];
    const misses = [];
    const record = (withinLimit, passText, failText) => {
      if (withinLimit) {
        hits.push(passText);
      } else {
        misses.push(failText);
      }
    };
    if (typeof maxInput === "number") {
      record(input <= maxInput, `Input tokens ${input} <= ${maxInput}`, `Input tokens ${input} > ${maxInput}`);
    }
    if (typeof maxOutput === "number") {
      record(output <= maxOutput, `Output tokens ${output} <= ${maxOutput}`, `Output tokens ${output} > ${maxOutput}`);
    }
    if (typeof maxTotal === "number") {
      record(total <= maxTotal, `Total tokens ${total} <= ${maxTotal}`, `Total tokens ${total} > ${maxTotal}`);
    }

    const passed = misses.length === 0;
    return {
      score: passed ? 1 : 0,
      verdict: passed ? "pass" : "fail",
      hits,
      misses,
      expectedAspectCount,
      reasoning: `token_usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
      evaluatorRawRequest: rawRequest({ input, output, cached, total })
    };
  }
};
|
|
5124
6908
|
|
|
5125
6909
|
// src/evaluation/orchestrator.ts
|
|
5126
|
-
import { createHash
|
|
5127
|
-
import
|
|
5128
|
-
import path13 from "node:path";
|
|
6910
|
+
import { createHash } from "node:crypto";
|
|
6911
|
+
import path14 from "node:path";
|
|
5129
6912
|
|
|
5130
6913
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
5131
6914
|
var Node = class {
|
|
@@ -5267,6 +7050,9 @@ function validateConcurrency(concurrency) {
|
|
|
5267
7050
|
}
|
|
5268
7051
|
|
|
5269
7052
|
// src/evaluation/orchestrator.ts
|
|
7053
|
+
/**
 * Report whether prompts for this provider use the "agent" formatting mode.
 *
 * True for providers that isAgentProvider accepts, and for providers whose
 * `kind` is "cli".
 *
 * @param {{kind: string}} provider Provider to inspect.
 * @returns {boolean}
 */
function usesFileReferencePrompt(provider) {
  if (isAgentProvider(provider)) {
    return true;
  }
  return provider.kind === "cli";
}
|
|
5270
7056
|
async function runEvaluation(options) {
|
|
5271
7057
|
const {
|
|
5272
7058
|
testFilePath: evalFilePath,
|
|
@@ -5278,7 +7064,6 @@ async function runEvaluation(options) {
|
|
|
5278
7064
|
evaluators,
|
|
5279
7065
|
maxRetries,
|
|
5280
7066
|
agentTimeoutMs,
|
|
5281
|
-
promptDumpDir,
|
|
5282
7067
|
cache,
|
|
5283
7068
|
useCache,
|
|
5284
7069
|
now,
|
|
@@ -5358,7 +7143,6 @@ async function runEvaluation(options) {
|
|
|
5358
7143
|
provider: primaryProvider,
|
|
5359
7144
|
target,
|
|
5360
7145
|
evaluatorRegistry,
|
|
5361
|
-
promptDumpDir,
|
|
5362
7146
|
nowFn: now ?? (() => /* @__PURE__ */ new Date()),
|
|
5363
7147
|
onProgress,
|
|
5364
7148
|
onResult,
|
|
@@ -5400,7 +7184,6 @@ async function runEvaluation(options) {
|
|
|
5400
7184
|
evaluators: evaluatorRegistry,
|
|
5401
7185
|
maxRetries,
|
|
5402
7186
|
agentTimeoutMs,
|
|
5403
|
-
promptDumpDir,
|
|
5404
7187
|
cache,
|
|
5405
7188
|
useCache,
|
|
5406
7189
|
now,
|
|
@@ -5443,7 +7226,8 @@ async function runEvaluation(options) {
|
|
|
5443
7226
|
results.push(outcome.value);
|
|
5444
7227
|
} else {
|
|
5445
7228
|
const evalCase = filteredEvalCases[i];
|
|
5446
|
-
const
|
|
7229
|
+
const formattingMode = usesFileReferencePrompt(primaryProvider) ? "agent" : "lm";
|
|
7230
|
+
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
5447
7231
|
const errorResult = buildErrorResult(
|
|
5448
7232
|
evalCase,
|
|
5449
7233
|
target.name,
|
|
@@ -5466,7 +7250,6 @@ async function runBatchEvaluation(options) {
|
|
|
5466
7250
|
provider,
|
|
5467
7251
|
target,
|
|
5468
7252
|
evaluatorRegistry,
|
|
5469
|
-
promptDumpDir,
|
|
5470
7253
|
nowFn,
|
|
5471
7254
|
onProgress,
|
|
5472
7255
|
onResult,
|
|
@@ -5474,12 +7257,9 @@ async function runBatchEvaluation(options) {
|
|
|
5474
7257
|
agentTimeoutMs
|
|
5475
7258
|
} = options;
|
|
5476
7259
|
const promptInputsList = [];
|
|
5477
|
-
const formattingMode =
|
|
7260
|
+
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
5478
7261
|
for (const evalCase of evalCases) {
|
|
5479
7262
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
5480
|
-
if (promptDumpDir) {
|
|
5481
|
-
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
5482
|
-
}
|
|
5483
7263
|
promptInputsList.push(promptInputs);
|
|
5484
7264
|
}
|
|
5485
7265
|
const batchRequests = evalCases.map((evalCase, index) => {
|
|
@@ -5521,13 +7301,20 @@ async function runBatchEvaluation(options) {
|
|
|
5521
7301
|
const promptInputs = promptInputsList[i];
|
|
5522
7302
|
const providerResponse = batchResponse[i];
|
|
5523
7303
|
const outputMessages = providerResponse.outputMessages;
|
|
5524
|
-
const
|
|
7304
|
+
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
7305
|
+
const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : hasExecutionMetrics ? {
|
|
7306
|
+
eventCount: 0,
|
|
7307
|
+
toolNames: [],
|
|
7308
|
+
toolCallsByName: {},
|
|
7309
|
+
errorCount: 0
|
|
7310
|
+
} : void 0;
|
|
5525
7311
|
const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
5526
7312
|
tokenUsage: providerResponse.tokenUsage,
|
|
5527
7313
|
costUsd: providerResponse.costUsd,
|
|
5528
7314
|
durationMs: providerResponse.durationMs
|
|
5529
7315
|
}) : void 0;
|
|
5530
7316
|
const candidate = extractLastAssistantContent(outputMessages);
|
|
7317
|
+
const providerError = extractProviderError(providerResponse);
|
|
5531
7318
|
let result;
|
|
5532
7319
|
try {
|
|
5533
7320
|
result = await evaluateCandidate({
|
|
@@ -5544,6 +7331,9 @@ async function runBatchEvaluation(options) {
|
|
|
5544
7331
|
outputMessages,
|
|
5545
7332
|
traceSummary
|
|
5546
7333
|
});
|
|
7334
|
+
if (providerError) {
|
|
7335
|
+
result = { ...result, error: providerError };
|
|
7336
|
+
}
|
|
5547
7337
|
} catch (error) {
|
|
5548
7338
|
const errorResult = buildErrorResult(
|
|
5549
7339
|
evalCase,
|
|
@@ -5576,9 +7366,10 @@ async function runBatchEvaluation(options) {
|
|
|
5576
7366
|
await onProgress({
|
|
5577
7367
|
workerId: 1,
|
|
5578
7368
|
evalId: evalCase.id,
|
|
5579
|
-
status: "completed",
|
|
7369
|
+
status: result.error ? "failed" : "completed",
|
|
5580
7370
|
startedAt: 0,
|
|
5581
|
-
completedAt: Date.now()
|
|
7371
|
+
completedAt: Date.now(),
|
|
7372
|
+
error: result.error
|
|
5582
7373
|
});
|
|
5583
7374
|
}
|
|
5584
7375
|
}
|
|
@@ -5593,17 +7384,13 @@ async function runEvalCase(options) {
|
|
|
5593
7384
|
now,
|
|
5594
7385
|
maxRetries,
|
|
5595
7386
|
agentTimeoutMs,
|
|
5596
|
-
promptDumpDir,
|
|
5597
7387
|
cache,
|
|
5598
7388
|
useCache,
|
|
5599
7389
|
signal,
|
|
5600
7390
|
judgeProvider
|
|
5601
7391
|
} = options;
|
|
5602
|
-
const formattingMode =
|
|
7392
|
+
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
5603
7393
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
5604
|
-
if (promptDumpDir) {
|
|
5605
|
-
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
5606
|
-
}
|
|
5607
7394
|
const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
|
|
5608
7395
|
let cachedResponse;
|
|
5609
7396
|
if (cacheKey && cache) {
|
|
@@ -5647,15 +7434,22 @@ async function runEvalCase(options) {
|
|
|
5647
7434
|
await cache.set(cacheKey, providerResponse);
|
|
5648
7435
|
}
|
|
5649
7436
|
const outputMessages = providerResponse.outputMessages;
|
|
5650
|
-
const
|
|
7437
|
+
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
7438
|
+
const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : hasExecutionMetrics ? {
|
|
7439
|
+
eventCount: 0,
|
|
7440
|
+
toolNames: [],
|
|
7441
|
+
toolCallsByName: {},
|
|
7442
|
+
errorCount: 0
|
|
7443
|
+
} : void 0;
|
|
5651
7444
|
const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
5652
7445
|
tokenUsage: providerResponse.tokenUsage,
|
|
5653
7446
|
costUsd: providerResponse.costUsd,
|
|
5654
7447
|
durationMs: providerResponse.durationMs
|
|
5655
7448
|
}) : void 0;
|
|
5656
7449
|
const candidate = extractLastAssistantContent(outputMessages);
|
|
7450
|
+
const providerError = extractProviderError(providerResponse);
|
|
5657
7451
|
try {
|
|
5658
|
-
|
|
7452
|
+
const result = await evaluateCandidate({
|
|
5659
7453
|
evalCase,
|
|
5660
7454
|
candidate,
|
|
5661
7455
|
target,
|
|
@@ -5669,6 +7463,7 @@ async function runEvalCase(options) {
|
|
|
5669
7463
|
outputMessages,
|
|
5670
7464
|
traceSummary
|
|
5671
7465
|
});
|
|
7466
|
+
return providerError ? { ...result, error: providerError } : result;
|
|
5672
7467
|
} catch (error) {
|
|
5673
7468
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
5674
7469
|
}
|
|
@@ -5734,7 +7529,6 @@ async function evaluateCandidate(options) {
|
|
|
5734
7529
|
candidateAnswer: candidate,
|
|
5735
7530
|
target: target.name,
|
|
5736
7531
|
reasoning: score.reasoning,
|
|
5737
|
-
rawAspects: score.rawAspects,
|
|
5738
7532
|
agentProviderRequest,
|
|
5739
7533
|
lmProviderRequest,
|
|
5740
7534
|
evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
@@ -5844,7 +7638,8 @@ async function runEvaluatorList(options) {
|
|
|
5844
7638
|
const codeEvaluator = new CodeEvaluator({
|
|
5845
7639
|
script: evaluator.script,
|
|
5846
7640
|
cwd: evaluator.resolvedCwd ?? evaluator.cwd,
|
|
5847
|
-
agentTimeoutMs
|
|
7641
|
+
agentTimeoutMs,
|
|
7642
|
+
config: evaluator.config
|
|
5848
7643
|
});
|
|
5849
7644
|
const score2 = await codeEvaluator.evaluate({
|
|
5850
7645
|
evalCase,
|
|
@@ -5872,7 +7667,7 @@ async function runEvaluatorList(options) {
|
|
|
5872
7667
|
});
|
|
5873
7668
|
}
|
|
5874
7669
|
if (evaluator.type === "composite") {
|
|
5875
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
7670
|
+
const evalFileDir = evalCase.guideline_paths[0] ? path14.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
5876
7671
|
const createEvaluator = (memberConfig) => {
|
|
5877
7672
|
switch (memberConfig.type) {
|
|
5878
7673
|
case "llm_judge":
|
|
@@ -5881,7 +7676,8 @@ async function runEvaluatorList(options) {
|
|
|
5881
7676
|
return new CodeEvaluator({
|
|
5882
7677
|
script: memberConfig.script,
|
|
5883
7678
|
cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
|
|
5884
|
-
agentTimeoutMs
|
|
7679
|
+
agentTimeoutMs,
|
|
7680
|
+
config: memberConfig.config
|
|
5885
7681
|
});
|
|
5886
7682
|
case "composite":
|
|
5887
7683
|
return new CompositeEvaluator({
|
|
@@ -5893,6 +7689,22 @@ async function runEvaluatorList(options) {
|
|
|
5893
7689
|
return new ToolTrajectoryEvaluator({
|
|
5894
7690
|
config: memberConfig
|
|
5895
7691
|
});
|
|
7692
|
+
case "field_accuracy":
|
|
7693
|
+
return new FieldAccuracyEvaluator({
|
|
7694
|
+
config: memberConfig
|
|
7695
|
+
});
|
|
7696
|
+
case "latency":
|
|
7697
|
+
return new LatencyEvaluator({
|
|
7698
|
+
config: memberConfig
|
|
7699
|
+
});
|
|
7700
|
+
case "cost":
|
|
7701
|
+
return new CostEvaluator({
|
|
7702
|
+
config: memberConfig
|
|
7703
|
+
});
|
|
7704
|
+
case "token_usage":
|
|
7705
|
+
return new TokenUsageEvaluator({
|
|
7706
|
+
config: memberConfig
|
|
7707
|
+
});
|
|
5896
7708
|
default: {
|
|
5897
7709
|
const unknownConfig = memberConfig;
|
|
5898
7710
|
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
@@ -5912,7 +7724,9 @@ async function runEvaluatorList(options) {
|
|
|
5912
7724
|
attempt,
|
|
5913
7725
|
promptInputs,
|
|
5914
7726
|
now,
|
|
5915
|
-
judgeProvider
|
|
7727
|
+
judgeProvider,
|
|
7728
|
+
outputMessages,
|
|
7729
|
+
traceSummary
|
|
5916
7730
|
});
|
|
5917
7731
|
const weight = evaluator.weight ?? 1;
|
|
5918
7732
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -5957,6 +7771,118 @@ async function runEvaluatorList(options) {
|
|
|
5957
7771
|
reasoning: score2.reasoning
|
|
5958
7772
|
});
|
|
5959
7773
|
}
|
|
7774
|
+
if (evaluator.type === "field_accuracy") {
|
|
7775
|
+
const fieldAccuracyEvaluator = new FieldAccuracyEvaluator({
|
|
7776
|
+
config: evaluator
|
|
7777
|
+
});
|
|
7778
|
+
const score2 = fieldAccuracyEvaluator.evaluate({
|
|
7779
|
+
evalCase,
|
|
7780
|
+
candidate,
|
|
7781
|
+
target,
|
|
7782
|
+
provider,
|
|
7783
|
+
attempt,
|
|
7784
|
+
promptInputs,
|
|
7785
|
+
now,
|
|
7786
|
+
outputMessages,
|
|
7787
|
+
traceSummary
|
|
7788
|
+
});
|
|
7789
|
+
const weight = evaluator.weight ?? 1;
|
|
7790
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
7791
|
+
evaluatorResults.push({
|
|
7792
|
+
name: evaluator.name,
|
|
7793
|
+
type: evaluator.type,
|
|
7794
|
+
score: score2.score,
|
|
7795
|
+
weight,
|
|
7796
|
+
verdict: score2.verdict,
|
|
7797
|
+
hits: score2.hits,
|
|
7798
|
+
misses: score2.misses,
|
|
7799
|
+
reasoning: score2.reasoning
|
|
7800
|
+
});
|
|
7801
|
+
}
|
|
7802
|
+
if (evaluator.type === "latency") {
|
|
7803
|
+
const latencyEvaluator = new LatencyEvaluator({
|
|
7804
|
+
config: evaluator
|
|
7805
|
+
});
|
|
7806
|
+
const score2 = latencyEvaluator.evaluate({
|
|
7807
|
+
evalCase,
|
|
7808
|
+
candidate,
|
|
7809
|
+
target,
|
|
7810
|
+
provider,
|
|
7811
|
+
attempt,
|
|
7812
|
+
promptInputs,
|
|
7813
|
+
now,
|
|
7814
|
+
outputMessages,
|
|
7815
|
+
traceSummary
|
|
7816
|
+
});
|
|
7817
|
+
const weight = evaluator.weight ?? 1;
|
|
7818
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
7819
|
+
evaluatorResults.push({
|
|
7820
|
+
name: evaluator.name,
|
|
7821
|
+
type: evaluator.type,
|
|
7822
|
+
score: score2.score,
|
|
7823
|
+
weight,
|
|
7824
|
+
verdict: score2.verdict,
|
|
7825
|
+
hits: score2.hits,
|
|
7826
|
+
misses: score2.misses,
|
|
7827
|
+
reasoning: score2.reasoning
|
|
7828
|
+
});
|
|
7829
|
+
}
|
|
7830
|
+
if (evaluator.type === "cost") {
|
|
7831
|
+
const costEvaluator = new CostEvaluator({
|
|
7832
|
+
config: evaluator
|
|
7833
|
+
});
|
|
7834
|
+
const score2 = costEvaluator.evaluate({
|
|
7835
|
+
evalCase,
|
|
7836
|
+
candidate,
|
|
7837
|
+
target,
|
|
7838
|
+
provider,
|
|
7839
|
+
attempt,
|
|
7840
|
+
promptInputs,
|
|
7841
|
+
now,
|
|
7842
|
+
outputMessages,
|
|
7843
|
+
traceSummary
|
|
7844
|
+
});
|
|
7845
|
+
const weight = evaluator.weight ?? 1;
|
|
7846
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
7847
|
+
evaluatorResults.push({
|
|
7848
|
+
name: evaluator.name,
|
|
7849
|
+
type: evaluator.type,
|
|
7850
|
+
score: score2.score,
|
|
7851
|
+
weight,
|
|
7852
|
+
verdict: score2.verdict,
|
|
7853
|
+
hits: score2.hits,
|
|
7854
|
+
misses: score2.misses,
|
|
7855
|
+
reasoning: score2.reasoning
|
|
7856
|
+
});
|
|
7857
|
+
}
|
|
7858
|
+
if (evaluator.type === "token_usage") {
|
|
7859
|
+
const tokenUsageEvaluator = new TokenUsageEvaluator({
|
|
7860
|
+
config: evaluator
|
|
7861
|
+
});
|
|
7862
|
+
const score2 = tokenUsageEvaluator.evaluate({
|
|
7863
|
+
evalCase,
|
|
7864
|
+
candidate,
|
|
7865
|
+
target,
|
|
7866
|
+
provider,
|
|
7867
|
+
attempt,
|
|
7868
|
+
promptInputs,
|
|
7869
|
+
now,
|
|
7870
|
+
outputMessages,
|
|
7871
|
+
traceSummary
|
|
7872
|
+
});
|
|
7873
|
+
const weight = evaluator.weight ?? 1;
|
|
7874
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
7875
|
+
evaluatorResults.push({
|
|
7876
|
+
name: evaluator.name,
|
|
7877
|
+
type: evaluator.type,
|
|
7878
|
+
score: score2.score,
|
|
7879
|
+
weight,
|
|
7880
|
+
verdict: score2.verdict,
|
|
7881
|
+
hits: score2.hits,
|
|
7882
|
+
misses: score2.misses,
|
|
7883
|
+
reasoning: score2.reasoning
|
|
7884
|
+
});
|
|
7885
|
+
}
|
|
5960
7886
|
} catch (error) {
|
|
5961
7887
|
const message = error instanceof Error ? error.message : String(error);
|
|
5962
7888
|
const fallbackScore = {
|
|
@@ -5996,7 +7922,6 @@ async function runEvaluatorList(options) {
|
|
|
5996
7922
|
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
5997
7923
|
0
|
|
5998
7924
|
);
|
|
5999
|
-
const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
|
|
6000
7925
|
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
|
|
6001
7926
|
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
6002
7927
|
const score = {
|
|
@@ -6005,8 +7930,7 @@ async function runEvaluatorList(options) {
|
|
|
6005
7930
|
hits,
|
|
6006
7931
|
misses,
|
|
6007
7932
|
expectedAspectCount,
|
|
6008
|
-
reasoning
|
|
6009
|
-
rawAspects: rawAspects.length > 0 ? rawAspects : void 0
|
|
7933
|
+
reasoning
|
|
6010
7934
|
};
|
|
6011
7935
|
return { score, evaluatorResults };
|
|
6012
7936
|
}
|
|
@@ -6081,26 +8005,6 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
6081
8005
|
llm_judge: llmJudge
|
|
6082
8006
|
};
|
|
6083
8007
|
}
|
|
6084
|
-
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
6085
|
-
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
6086
|
-
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
6087
|
-
const filePath = path13.resolve(directory, filename);
|
|
6088
|
-
await mkdir3(path13.dirname(filePath), { recursive: true });
|
|
6089
|
-
const payload = {
|
|
6090
|
-
eval_id: evalCase.id,
|
|
6091
|
-
question: promptInputs.question,
|
|
6092
|
-
guidelines: promptInputs.guidelines,
|
|
6093
|
-
guideline_paths: evalCase.guideline_paths
|
|
6094
|
-
};
|
|
6095
|
-
await writeFile3(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
6096
|
-
}
|
|
6097
|
-
function sanitizeFilename(value) {
|
|
6098
|
-
if (!value) {
|
|
6099
|
-
return "prompt";
|
|
6100
|
-
}
|
|
6101
|
-
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
6102
|
-
return sanitized.length > 0 ? sanitized : randomUUID3();
|
|
6103
|
-
}
|
|
6104
8008
|
async function invokeProvider(provider, options) {
|
|
6105
8009
|
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
6106
8010
|
const controller = new AbortController();
|
|
@@ -6164,12 +8068,23 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
6164
8068
|
misses: [`Error: ${message}`],
|
|
6165
8069
|
candidateAnswer: `Error occurred: ${message}`,
|
|
6166
8070
|
target: targetName,
|
|
6167
|
-
rawAspects: [],
|
|
6168
8071
|
agentProviderRequest,
|
|
6169
8072
|
lmProviderRequest,
|
|
6170
8073
|
error: message
|
|
6171
8074
|
};
|
|
6172
8075
|
}
|
|
8076
|
+
function extractProviderError(response) {
|
|
8077
|
+
const raw = response.raw;
|
|
8078
|
+
if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
|
|
8079
|
+
return void 0;
|
|
8080
|
+
}
|
|
8081
|
+
const error = raw.error;
|
|
8082
|
+
if (typeof error !== "string") {
|
|
8083
|
+
return void 0;
|
|
8084
|
+
}
|
|
8085
|
+
const trimmed = error.trim();
|
|
8086
|
+
return trimmed.length > 0 ? trimmed : void 0;
|
|
8087
|
+
}
|
|
6173
8088
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
6174
8089
|
const hash = createHash("sha256");
|
|
6175
8090
|
hash.update(provider.id);
|
|
@@ -6228,15 +8143,15 @@ function computeWeightedMean(entries) {
|
|
|
6228
8143
|
|
|
6229
8144
|
// src/evaluation/generators/rubric-generator.ts
|
|
6230
8145
|
import { generateText as generateText3 } from "ai";
|
|
6231
|
-
import { z as
|
|
6232
|
-
var rubricItemSchema =
|
|
6233
|
-
id:
|
|
6234
|
-
description:
|
|
6235
|
-
weight:
|
|
6236
|
-
required:
|
|
8146
|
+
import { z as z3 } from "zod";
|
|
8147
|
+
var rubricItemSchema = z3.object({
|
|
8148
|
+
id: z3.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
8149
|
+
description: z3.string().describe("What this rubric checks for"),
|
|
8150
|
+
weight: z3.number().default(1).describe("Relative importance (default 1.0)"),
|
|
8151
|
+
required: z3.boolean().default(true).describe("Whether this is a mandatory requirement")
|
|
6237
8152
|
});
|
|
6238
|
-
var rubricGenerationSchema =
|
|
6239
|
-
rubrics:
|
|
8153
|
+
var rubricGenerationSchema = z3.object({
|
|
8154
|
+
rubrics: z3.array(rubricItemSchema).describe("List of evaluation rubrics")
|
|
6240
8155
|
});
|
|
6241
8156
|
async function generateRubrics(options) {
|
|
6242
8157
|
const { expectedOutcome, question, referenceAnswer, provider } = options;
|
|
@@ -6313,15 +8228,20 @@ function createAgentKernel() {
|
|
|
6313
8228
|
export {
|
|
6314
8229
|
CodeEvaluator,
|
|
6315
8230
|
CompositeEvaluator,
|
|
8231
|
+
CostEvaluator,
|
|
6316
8232
|
DEFAULT_EXPLORATION_TOOLS,
|
|
8233
|
+
FieldAccuracyEvaluator,
|
|
8234
|
+
LatencyEvaluator,
|
|
6317
8235
|
LlmJudgeEvaluator,
|
|
6318
8236
|
TEST_MESSAGE_ROLES,
|
|
8237
|
+
TokenUsageEvaluator,
|
|
6319
8238
|
ToolTrajectoryEvaluator,
|
|
6320
8239
|
avgToolDurationMs,
|
|
6321
8240
|
buildDirectoryChain,
|
|
6322
8241
|
buildPromptInputs,
|
|
6323
8242
|
buildSearchRoots,
|
|
6324
8243
|
computeTraceSummary,
|
|
8244
|
+
consumeClaudeCodeLogEntries,
|
|
6325
8245
|
consumeCodexLogEntries,
|
|
6326
8246
|
consumePiLogEntries,
|
|
6327
8247
|
createAgentKernel,
|
|
@@ -6352,6 +8272,7 @@ export {
|
|
|
6352
8272
|
resolveTargetDefinition,
|
|
6353
8273
|
runEvalCase,
|
|
6354
8274
|
runEvaluation,
|
|
8275
|
+
subscribeToClaudeCodeLogEntries,
|
|
6355
8276
|
subscribeToCodexLogEntries,
|
|
6356
8277
|
subscribeToPiLogEntries,
|
|
6357
8278
|
tokensPerTool
|