agentv 3.14.0 → 3.14.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-3TBDSUYD.js → chunk-ELQEFMGO.js} +115 -20
- package/dist/chunk-ELQEFMGO.js.map +1 -0
- package/dist/{chunk-W6CGDNQR.js → chunk-FSNRKR7X.js} +405 -46
- package/dist/chunk-FSNRKR7X.js.map +1 -0
- package/dist/{chunk-YYECEMUV.js → chunk-VYDUBNCD.js} +5 -8
- package/dist/chunk-VYDUBNCD.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-ZGLENPVH.js → dist-5EEXTTC3.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-AI75XY3X.js → interactive-LBVPF2CG.js} +3 -3
- package/package.json +2 -5
- package/dist/chunk-3TBDSUYD.js.map +0 -1
- package/dist/chunk-W6CGDNQR.js.map +0 -1
- package/dist/chunk-YYECEMUV.js.map +0 -1
- /package/dist/{dist-ZGLENPVH.js.map → dist-5EEXTTC3.js.map} +0 -0
- /package/dist/{interactive-AI75XY3X.js.map → interactive-LBVPF2CG.js.map} +0 -0
|
@@ -22,7 +22,7 @@ import {
|
|
|
22
22
|
validateFileReferences,
|
|
23
23
|
validateTargetsFile,
|
|
24
24
|
writeArtifactsFromResults
|
|
25
|
-
} from "./chunk-YYECEMUV.js";
|
|
25
|
+
} from "./chunk-VYDUBNCD.js";
|
|
26
26
|
import {
|
|
27
27
|
createBuiltinRegistry,
|
|
28
28
|
executeScript,
|
|
@@ -39,7 +39,7 @@ import {
|
|
|
39
39
|
toSnakeCaseDeep as toSnakeCaseDeep2,
|
|
40
40
|
transpileEvalYamlFile,
|
|
41
41
|
trimBaselineResult
|
|
42
|
-
} from "./chunk-3TBDSUYD.js";
|
|
42
|
+
} from "./chunk-ELQEFMGO.js";
|
|
43
43
|
import {
|
|
44
44
|
__commonJS,
|
|
45
45
|
__esm,
|
|
@@ -4185,7 +4185,7 @@ var evalRunCommand = command({
|
|
|
4185
4185
|
},
|
|
4186
4186
|
handler: async (args) => {
|
|
4187
4187
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4188
|
-
const { launchInteractiveWizard } = await import("./interactive-AI75XY3X.js");
|
|
4188
|
+
const { launchInteractiveWizard } = await import("./interactive-LBVPF2CG.js");
|
|
4189
4189
|
await launchInteractiveWizard();
|
|
4190
4190
|
return;
|
|
4191
4191
|
}
|
|
@@ -4408,13 +4408,23 @@ var evalBenchCommand = command({
|
|
|
4408
4408
|
type: string,
|
|
4409
4409
|
displayName: "export-dir",
|
|
4410
4410
|
description: "Export directory from pipeline input/grade"
|
|
4411
|
+
}),
|
|
4412
|
+
llmScores: option({
|
|
4413
|
+
type: optional(string),
|
|
4414
|
+
long: "llm-scores",
|
|
4415
|
+
description: "Path to LLM scores JSON file (reads from stdin if omitted)"
|
|
4411
4416
|
})
|
|
4412
4417
|
},
|
|
4413
|
-
handler: async ({ exportDir }) => {
|
|
4418
|
+
handler: async ({ exportDir, llmScores: llmScoresPath }) => {
|
|
4414
4419
|
const manifest = JSON.parse(await readFile(join(exportDir, "manifest.json"), "utf8"));
|
|
4415
4420
|
const testIds = manifest.test_ids;
|
|
4416
4421
|
const targetName = manifest.target?.name ?? "unknown";
|
|
4417
|
-
const stdinData = await readStdin();
|
|
4422
|
+
let stdinData;
|
|
4423
|
+
if (llmScoresPath) {
|
|
4424
|
+
stdinData = await readFile(llmScoresPath, "utf8");
|
|
4425
|
+
} else {
|
|
4426
|
+
stdinData = await readStdin();
|
|
4427
|
+
}
|
|
4418
4428
|
const llmScores = stdinData ? JSON.parse(stdinData) : {};
|
|
4419
4429
|
const indexLines = [];
|
|
4420
4430
|
const allPassRates = [];
|
|
@@ -4814,6 +4824,354 @@ async function writeJson(filePath, data) {
|
|
|
4814
4824
|
`, "utf8");
|
|
4815
4825
|
}
|
|
4816
4826
|
|
|
4827
|
+
// src/commands/pipeline/run.ts
|
|
4828
|
+
import { execSync } from "node:child_process";
|
|
4829
|
+
import { existsSync as existsSync2, readFileSync as readFileSync4, unlinkSync } from "node:fs";
|
|
4830
|
+
import { mkdir as mkdir4, readFile as readFile4, readdir as readdir3, writeFile as writeFile5 } from "node:fs/promises";
|
|
4831
|
+
import { tmpdir } from "node:os";
|
|
4832
|
+
import { dirname as dirname2, join as join4, resolve as resolve2 } from "node:path";
|
|
4833
|
+
/**
 * Find the nearest `.env` file, starting at `dir` and walking up to the
 * filesystem root, and parse it into a plain key/value object.
 *
 * Parsing rules:
 * - blank lines and lines starting with `#` are ignored;
 * - lines without `=` or with an empty key are ignored;
 * - an optional leading `export ` is dropped from the key;
 * - the value is everything after the first `=`, trimmed, with one pair of
 *   matching surrounding single or double quotes removed.
 *
 * Only the first `.env` found is read; files further up are not merged.
 *
 * @param {string} dir - Directory to start searching from.
 * @returns {Record<string, string>} Parsed variables, or `{}` when no `.env`
 *   exists between `dir` and the filesystem root.
 */
function loadEnvFile(dir) {
  let current = resolve2(dir);
  while (true) {
    const candidate = join4(current, ".env");
    if (existsSync2(candidate)) {
      const env3 = {};
      for (const line of readFileSync4(candidate, "utf8").split("\n")) {
        // trim() also removes the "\r" left behind by CRLF line endings.
        const trimmed = line.trim();
        if (!trimmed || trimmed.startsWith("#")) continue;
        const eqIdx = trimmed.indexOf("=");
        if (eqIdx === -1) continue;
        // Accept shell-style `export KEY=value` lines.
        const key = trimmed.slice(0, eqIdx).trim().replace(/^export\s+/, "");
        if (!key) continue;
        let value = trimmed.slice(eqIdx + 1).trim();
        const quote = value[0];
        // Strip one pair of matching quotes so KEY="a b" yields `a b`, not `"a b"`.
        if (value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote)) {
          value = value.slice(1, -1);
        }
        env3[key] = value;
      }
      return env3;
    }
    const parent = dirname2(current);
    // dirname() of the root is the root itself: nothing left to search.
    if (parent === current) break;
    current = parent;
  }
  return {};
}
|
|
4854
|
+
/**
 * `pipeline run` subcommand: extract every test of an eval file into an output
 * directory, invoke the CLI target once per test, then run the code graders.
 *
 * Files written under `--out`:
 * - `manifest.json` (eval file path, timestamp, target name/kind, test ids);
 * - `<test-id>/input.json`, `invoke.json`, `criteria.md`, plus
 *   `expected_output.json` when the test has expected output or a non-empty
 *   reference answer;
 * - grader configs via `writeGraderConfigs2`;
 * - `<test-id>/response.md` and `timing.json` for CLI targets;
 * - `<test-id>/code_grader_results/<grader>.json` for every code grader run.
 *
 * When target selection fails or the selected target is not of kind "cli",
 * the command records an "agent" invocation and skips the CLI step.
 *
 * Arguments:
 * - `eval-path` (positional): path to the eval YAML file;
 * - `--out`: output directory;
 * - `--workers`: maximum number of target commands running concurrently
 *   (defaults to the number of tests, clamped to at least 1).
 */
var evalRunCommand2 = command({
  name: "run",
  description: "Extract inputs, invoke CLI targets, and run code graders in one step",
  args: {
    evalPath: positional({
      type: string,
      displayName: "eval-path",
      description: "Path to eval YAML file"
    }),
    out: option({
      type: string,
      long: "out",
      description: "Output directory for results"
    }),
    workers: option({
      type: optional(number),
      long: "workers",
      description: "Parallel workers for target invocation (default: all tests)"
    })
  },
  handler: async ({ evalPath, out, workers }) => {
    const resolvedEvalPath = resolve2(evalPath);
    const outDir = resolve2(out);
    const evalDir = dirname2(resolvedEvalPath);
    const repoRoot = await findRepoRoot(evalDir);
    const suite = await loadTestSuite(resolvedEvalPath, repoRoot);
    const tests = suite.tests;
    if (tests.length === 0) {
      console.error("No tests found in eval file.");
      process.exit(1);
    }

    // ---- Step 1: resolve the target and extract per-test inputs ----
    let targetInfo = null;
    let targetName = "agent";
    let targetKind = "agent";
    try {
      const selection = await selectTarget({
        testFilePath: resolvedEvalPath,
        repoRoot,
        cwd: evalDir,
        dryRun: false,
        dryRunDelay: 0,
        dryRunDelayMin: 0,
        dryRunDelayMax: 0,
        env: process.env
      });
      targetName = selection.targetName;
      if (selection.resolvedTarget.kind === "cli") {
        targetKind = "cli";
        const config = selection.resolvedTarget.config;
        targetInfo = {
          kind: "cli",
          command: config.command,
          cwd: config.cwd ?? evalDir,
          timeoutMs: config.timeoutMs ?? 3e4
        };
      }
    } catch {
      // Best effort: if no target can be selected, keep the "agent" defaults
      // assigned above and continue in agent-as-target mode.
    }

    const testIds = [];
    for (const test of tests) {
      const testDir = join4(outDir, test.id);
      await mkdir4(testDir, { recursive: true });
      testIds.push(test.id);
      const inputMessages = test.input.map((m) => ({
        role: m.role,
        content: m.content
      }));
      await writeJson2(join4(testDir, "input.json"), {
        input_text: test.question,
        input_messages: inputMessages,
        file_paths: test.file_paths,
        metadata: test.metadata ?? {}
      });
      if (targetInfo) {
        await writeJson2(join4(testDir, "invoke.json"), {
          kind: "cli",
          command: targetInfo.command,
          cwd: targetInfo.cwd,
          timeout_ms: targetInfo.timeoutMs,
          env: {}
        });
      } else {
        await writeJson2(join4(testDir, "invoke.json"), {
          kind: "agent",
          instructions: "Execute this task in the current workspace. The agent IS the target."
        });
      }
      await writeFile5(join4(testDir, "criteria.md"), test.criteria ?? "", "utf8");
      if (test.expected_output.length > 0 || test.reference_answer !== void 0 && test.reference_answer !== "") {
        await writeJson2(join4(testDir, "expected_output.json"), {
          expected_output: test.expected_output,
          reference_answer: test.reference_answer ?? ""
        });
      }
      await writeGraderConfigs2(testDir, test.assertions ?? [], evalDir);
    }
    await writeJson2(join4(outDir, "manifest.json"), {
      eval_file: resolvedEvalPath,
      timestamp: (/* @__PURE__ */ new Date()).toISOString(),
      target: { name: targetName, kind: targetKind },
      test_ids: testIds
    });
    console.log(`Extracted ${testIds.length} test(s) to ${outDir}`);

    // ---- Step 2: invoke the CLI target for every test ----
    if (targetInfo) {
      const envVars = loadEnvFile(evalDir);
      if (!process.env.AGENTV_RUN_TIMESTAMP) {
        process.env.AGENTV_RUN_TIMESTAMP = (/* @__PURE__ */ new Date()).toISOString().replace(/:/g, "-").replace(/\./g, "-");
      }
      const mergedEnv = { ...process.env, ...envVars };
      // Clamp the requested worker count into [1, number of tests].
      const requestedWorkers = Math.floor(workers ?? testIds.length) || 1;
      const maxWorkers = Math.max(1, Math.min(requestedWorkers, testIds.length));
      console.log(`Invoking ${testIds.length} CLI target(s) (${maxWorkers} workers)...`);

      // Asynchronous exec keeps the event loop free, so several targets can
      // really run at the same time (execSync would serialize them).
      const { exec } = await import("node:child_process");
      const runShell = (commandLine, options) => new Promise((resolveRun, rejectRun) => {
        const child = exec(commandLine, options, (error) => {
          if (error) rejectRun(error);
          else resolveRun();
        });
        // Close stdin right away so a target that reads stdin sees EOF
        // instead of waiting until the timeout.
        child.stdin?.end();
      });

      const invokeTarget = async (testId) => {
        const testDir = join4(outDir, testId);
        const invoke = JSON.parse(await readFile4(join4(testDir, "invoke.json"), "utf8"));
        if (invoke.kind !== "cli") return;
        const inputData = JSON.parse(await readFile4(join4(testDir, "input.json"), "utf8"));
        const timeoutMs = invoke.timeout_ms ?? 12e4;
        const promptFile = join4(tmpdir(), `agentv-prompt-${testId}-${Date.now()}.txt`);
        const outputFile = join4(tmpdir(), `agentv-output-${testId}-${Date.now()}.txt`);
        await writeFile5(promptFile, inputData.input_text, "utf8");
        // Function replacers insert the value verbatim: a string replacement
        // would expand `$&`, `$1`, `$$` sequences that occur in the prompt.
        // NOTE(review): the prompt text is placed into a shell command line
        // unescaped; quoting is left to the command template.
        const rendered = invoke.command
          .replaceAll("{PROMPT_FILE}", () => promptFile)
          .replaceAll("{OUTPUT_FILE}", () => outputFile)
          .replaceAll("{PROMPT}", () => inputData.input_text);
        const start = performance.now();
        const writeTiming = (durationMs) => writeJson2(join4(testDir, "timing.json"), {
          duration_ms: durationMs,
          // Seconds rounded to two decimals.
          total_duration_seconds: Math.round(durationMs / 10) / 100
        });
        try {
          await runShell(rendered, {
            cwd: invoke.cwd,
            timeout: timeoutMs,
            env: mergedEnv,
            maxBuffer: 10 * 1024 * 1024
          });
          const durationMs = Math.round(performance.now() - start);
          const response = existsSync2(outputFile) ? readFileSync4(outputFile, "utf8") : "ERROR: No output file generated";
          await writeFile5(join4(testDir, "response.md"), response, "utf8");
          await writeTiming(durationMs);
          console.log(`  ${testId}: OK (${durationMs}ms, ${response.length} chars)`);
        } catch (error) {
          const durationMs = Math.round(performance.now() - start);
          const message = error instanceof Error ? error.message : String(error);
          const response = `ERROR: target failed \u2014 ${message}`;
          await writeFile5(join4(testDir, "response.md"), response, "utf8");
          await writeTiming(durationMs);
          console.error(`  ${testId}: FAILED (${durationMs}ms) \u2014 ${message.slice(0, 200)}`);
        } finally {
          try {
            if (existsSync2(promptFile)) unlinkSync(promptFile);
            if (existsSync2(outputFile)) unlinkSync(outputFile);
          } catch {
            // Temp-file cleanup is best effort.
          }
        }
      };

      // Bounded pool: each worker pulls the next unclaimed test id.
      let nextIndex = 0;
      const runWorker = async () => {
        while (nextIndex < testIds.length) {
          const testId = testIds[nextIndex++];
          await invokeTarget(testId);
        }
      };
      await Promise.all(Array.from({ length: maxWorkers }, () => runWorker()));
    } else {
      console.log("Agent-as-target mode \u2014 skipping CLI invocation.");
    }

    // ---- Step 3: run code graders against the recorded responses ----
    let totalGraders = 0;
    let totalPassed = 0;
    for (const testId of testIds) {
      const testDir = join4(outDir, testId);
      const codeGradersDir = join4(testDir, "code_graders");
      const resultsDir = join4(testDir, "code_grader_results");
      let graderFiles;
      try {
        graderFiles = (await readdir3(codeGradersDir)).filter((f) => f.endsWith(".json"));
      } catch {
        // No code_graders directory for this test.
        continue;
      }
      if (graderFiles.length === 0) continue;
      let responseText;
      try {
        responseText = await readFile4(join4(testDir, "response.md"), "utf8");
      } catch (error) {
        if (error?.code !== "ENOENT") throw error;
        // Nothing was invoked for this test (agent-as-target mode), so there
        // is no response to grade yet.
        console.log(`  ${testId}: no response.md \u2014 skipping code graders`);
        continue;
      }
      await mkdir4(resultsDir, { recursive: true });
      const inputData = JSON.parse(await readFile4(join4(testDir, "input.json"), "utf8"));
      for (const graderFile of graderFiles) {
        const graderConfig = JSON.parse(await readFile4(join4(codeGradersDir, graderFile), "utf8"));
        const graderName = graderConfig.name;
        const resultPath = join4(resultsDir, `${graderName}.json`);
        const payload = JSON.stringify({
          output: [{ role: "assistant", content: responseText }],
          input: inputData.input_messages,
          question: inputData.input_text,
          criteria: "",
          expected_output: [],
          reference_answer: "",
          input_files: [],
          trace: null,
          token_usage: null,
          cost_usd: null,
          duration_ms: null,
          start_time: null,
          end_time: null,
          file_changes: null,
          workspace_path: null,
          config: graderConfig.config ?? null,
          metadata: {},
          input_text: inputData.input_text,
          output_text: responseText,
          expected_output_text: ""
        });
        try {
          const stdout = await executeScript(
            graderConfig.command,
            payload,
            void 0,
            graderConfig.cwd
          );
          const parsed = JSON.parse(stdout);
          // Missing or malformed fields fall back to a failing score and no assertions.
          const score = typeof parsed.score === "number" ? parsed.score : 0;
          const assertions = Array.isArray(parsed.assertions) ? parsed.assertions : [];
          await writeJson2(resultPath, {
            name: graderName,
            type: "code-grader",
            score,
            weight: graderConfig.weight ?? 1,
            assertions,
            details: parsed.details ?? {}
          });
          totalGraders++;
          if (score >= 0.5) totalPassed++;
        } catch (error) {
          // A grader that crashes or prints invalid JSON is recorded as score 0.
          const message = error instanceof Error ? error.message : String(error);
          console.error(`  ${testId}/${graderName}: ERROR \u2014 ${message}`);
          await writeJson2(resultPath, {
            name: graderName,
            type: "code-grader",
            score: 0,
            weight: graderConfig.weight ?? 1,
            assertions: [{ text: `Error: ${message}`, passed: false }],
            details: { error: message }
          });
          totalGraders++;
        }
      }
    }
    console.log(`Graded ${totalGraders} code-grader(s): ${totalPassed} passed`);
    console.log(`
Done. Agent can now perform LLM grading on responses in ${outDir}`);
  }
});
|
|
5125
|
+
/**
 * Write `data` to `filePath` as UTF-8 JSON, pretty-printed with a two-space
 * indent and terminated by a single newline.
 *
 * @param {string} filePath - Destination file path.
 * @param {unknown} data - Value to serialize with `JSON.stringify`.
 * @returns {Promise<void>}
 */
async function writeJson2(filePath, data) {
  const serialized = JSON.stringify(data, null, 2);
  await writeFile5(filePath, serialized + "\n", "utf8");
}
|
|
5129
|
+
/**
 * Write one JSON config file per grader assertion of a test.
 *
 * - "code-grader" assertions go to `<testDir>/code_graders/<name>.json` with
 *   the grader command, working directory, weight and config.
 * - "llm-grader" assertions go to `<testDir>/llm_graders/<name>.json` with the
 *   prompt text, weight, a fixed threshold of 0.5 and an empty config.
 * - Assertions of any other type are ignored.
 *
 * Each directory is created only when the first assertion of its kind is seen.
 *
 * @param {string} testDir - Output directory of the test.
 * @param {Array<object>} assertions - Assertions declared on the test.
 * @param {string} evalDir - Fallback working directory for code graders.
 * @returns {Promise<void>}
 */
async function writeGraderConfigs2(testDir, assertions, evalDir) {
  const codeGradersDir = join4(testDir, "code_graders");
  const llmGradersDir = join4(testDir, "llm_graders");
  const createdDirs = new Set();
  // Create a grader directory the first time it is needed, then remember it.
  const ensureDir = async (dirPath) => {
    if (createdDirs.has(dirPath)) return;
    await mkdir4(dirPath, { recursive: true });
    createdDirs.add(dirPath);
  };
  for (const grader of assertions) {
    switch (grader.type) {
      case "code-grader": {
        await ensureDir(codeGradersDir);
        await writeJson2(join4(codeGradersDir, `${grader.name}.json`), {
          name: grader.name,
          command: grader.command,
          cwd: grader.resolvedCwd ?? grader.cwd ?? evalDir,
          weight: grader.weight ?? 1,
          config: grader.config ?? {}
        });
        break;
      }
      case "llm-grader": {
        await ensureDir(llmGradersDir);
        // The inline prompt is used when there is no prompt file, or when
        // reading the prompt file fails.
        const inlinePrompt = typeof grader.prompt === "string" ? grader.prompt : "";
        let promptContent = inlinePrompt;
        if (grader.resolvedPromptPath) {
          try {
            promptContent = readFileSync4(grader.resolvedPromptPath, "utf8");
          } catch {
            promptContent = inlinePrompt;
          }
        }
        await writeJson2(join4(llmGradersDir, `${grader.name}.json`), {
          name: grader.name,
          prompt_content: promptContent,
          weight: grader.weight ?? 1,
          threshold: 0.5,
          config: {}
        });
        break;
      }
      default:
        break;
    }
  }
}
|
|
5174
|
+
|
|
4817
5175
|
// src/commands/pipeline/index.ts
|
|
4818
5176
|
var pipelineCommand = subcommands({
|
|
4819
5177
|
name: "pipeline",
|
|
@@ -4821,7 +5179,8 @@ var pipelineCommand = subcommands({
|
|
|
4821
5179
|
cmds: {
|
|
4822
5180
|
input: evalInputCommand,
|
|
4823
5181
|
grade: evalGradeCommand,
|
|
4824
|
-
bench: evalBenchCommand
|
|
5182
|
+
bench: evalBenchCommand,
|
|
5183
|
+
run: evalRunCommand2
|
|
4825
5184
|
}
|
|
4826
5185
|
});
|
|
4827
5186
|
|
|
@@ -4829,10 +5188,10 @@ var pipelineCommand = subcommands({
|
|
|
4829
5188
|
import path7 from "node:path";
|
|
4830
5189
|
|
|
4831
5190
|
// src/commands/results/shared.ts
|
|
4832
|
-
import { existsSync as
|
|
5191
|
+
import { existsSync as existsSync3 } from "node:fs";
|
|
4833
5192
|
|
|
4834
5193
|
// src/commands/trace/utils.ts
|
|
4835
|
-
import { readFileSync as
|
|
5194
|
+
import { readFileSync as readFileSync5, readdirSync as readdirSync2, statSync as statSync2 } from "node:fs";
|
|
4836
5195
|
import path6 from "node:path";
|
|
4837
5196
|
var colors2 = {
|
|
4838
5197
|
reset: "\x1B[0m",
|
|
@@ -4872,7 +5231,7 @@ function resolveTraceResultPath(filePath) {
|
|
|
4872
5231
|
return resolveWorkspaceOrFilePath(filePath);
|
|
4873
5232
|
}
|
|
4874
5233
|
function loadJsonlRecords(filePath) {
|
|
4875
|
-
const content =
|
|
5234
|
+
const content = readFileSync5(filePath, "utf8");
|
|
4876
5235
|
const lines = content.trim().split("\n").filter((line) => line.trim());
|
|
4877
5236
|
return lines.map((line, i) => {
|
|
4878
5237
|
const record = JSON.parse(line);
|
|
@@ -4925,7 +5284,7 @@ function toRawResult(result) {
|
|
|
4925
5284
|
};
|
|
4926
5285
|
}
|
|
4927
5286
|
function loadOtlpTraceFile(filePath) {
|
|
4928
|
-
const parsed = JSON.parse(
|
|
5287
|
+
const parsed = JSON.parse(readFileSync5(filePath, "utf8"));
|
|
4929
5288
|
const spans = parsed.resourceSpans?.flatMap((resource) => resource.scopeSpans ?? []).flatMap((scope) => scope.spans ?? []);
|
|
4930
5289
|
if (!spans || spans.length === 0) {
|
|
4931
5290
|
return [];
|
|
@@ -5243,14 +5602,14 @@ async function resolveSourceFile(source, cwd) {
|
|
|
5243
5602
|
let sourceFile;
|
|
5244
5603
|
if (source) {
|
|
5245
5604
|
sourceFile = resolveResultSourcePath(source, cwd);
|
|
5246
|
-
if (!
|
|
5605
|
+
if (!existsSync3(sourceFile)) {
|
|
5247
5606
|
console.error(`Error: File not found: ${sourceFile}`);
|
|
5248
5607
|
process.exit(1);
|
|
5249
5608
|
}
|
|
5250
5609
|
} else {
|
|
5251
5610
|
const cache = await loadRunCache(cwd);
|
|
5252
5611
|
const cachedFile = cache ? resolveRunCacheFile(cache) : "";
|
|
5253
|
-
if (cachedFile &&
|
|
5612
|
+
if (cachedFile && existsSync3(cachedFile)) {
|
|
5254
5613
|
sourceFile = cachedFile;
|
|
5255
5614
|
} else {
|
|
5256
5615
|
const metas = listResultFiles(cwd, 1);
|
|
@@ -5462,7 +5821,7 @@ var resultsShowCommand = command({
|
|
|
5462
5821
|
});
|
|
5463
5822
|
|
|
5464
5823
|
// src/commands/results/summary.ts
|
|
5465
|
-
import { existsSync as
|
|
5824
|
+
import { existsSync as existsSync4, readFileSync as readFileSync6 } from "node:fs";
|
|
5466
5825
|
function formatSummary(results, grading) {
|
|
5467
5826
|
const total = results.length;
|
|
5468
5827
|
let passed;
|
|
@@ -5513,9 +5872,9 @@ var resultsSummaryCommand = command({
|
|
|
5513
5872
|
const { results, sourceFile } = await loadResults(source, cwd);
|
|
5514
5873
|
let grading;
|
|
5515
5874
|
const gradingPath = sourceFile.replace(/\.jsonl$/, ".grading.json");
|
|
5516
|
-
if (
|
|
5875
|
+
if (existsSync4(gradingPath)) {
|
|
5517
5876
|
try {
|
|
5518
|
-
grading = JSON.parse(
|
|
5877
|
+
grading = JSON.parse(readFileSync6(gradingPath, "utf8"));
|
|
5519
5878
|
} catch {
|
|
5520
5879
|
}
|
|
5521
5880
|
}
|
|
@@ -5540,7 +5899,7 @@ var resultsCommand = subcommands({
|
|
|
5540
5899
|
});
|
|
5541
5900
|
|
|
5542
5901
|
// src/commands/results/serve.ts
|
|
5543
|
-
import { existsSync as
|
|
5902
|
+
import { existsSync as existsSync5, readFileSync as readFileSync7, writeFileSync as writeFileSync3 } from "node:fs";
|
|
5544
5903
|
import path8 from "node:path";
|
|
5545
5904
|
import { Hono } from "hono";
|
|
5546
5905
|
function feedbackPath(resultDir) {
|
|
@@ -5548,11 +5907,11 @@ function feedbackPath(resultDir) {
|
|
|
5548
5907
|
}
|
|
5549
5908
|
function readFeedback(cwd) {
|
|
5550
5909
|
const fp = feedbackPath(cwd);
|
|
5551
|
-
if (!
|
|
5910
|
+
if (!existsSync5(fp)) {
|
|
5552
5911
|
return { reviews: [] };
|
|
5553
5912
|
}
|
|
5554
5913
|
try {
|
|
5555
|
-
return JSON.parse(
|
|
5914
|
+
return JSON.parse(readFileSync7(fp, "utf8"));
|
|
5556
5915
|
} catch (err2) {
|
|
5557
5916
|
console.error(`Warning: could not parse ${fp}, starting fresh: ${err2.message}`);
|
|
5558
5917
|
return { reviews: [] };
|
|
@@ -6352,7 +6711,7 @@ var resultsServeCommand = command({
|
|
|
6352
6711
|
let sourceFile;
|
|
6353
6712
|
if (source) {
|
|
6354
6713
|
const resolved = resolveResultSourcePath(source, cwd);
|
|
6355
|
-
if (!
|
|
6714
|
+
if (!existsSync5(resolved)) {
|
|
6356
6715
|
console.error(`Error: Source file not found: ${resolved}`);
|
|
6357
6716
|
process.exit(1);
|
|
6358
6717
|
}
|
|
@@ -6361,7 +6720,7 @@ var resultsServeCommand = command({
|
|
|
6361
6720
|
} else {
|
|
6362
6721
|
const cache = await loadRunCache(cwd);
|
|
6363
6722
|
const cachedFile = cache ? resolveRunCacheFile(cache) : "";
|
|
6364
|
-
if (cachedFile &&
|
|
6723
|
+
if (cachedFile && existsSync5(cachedFile)) {
|
|
6365
6724
|
sourceFile = cachedFile;
|
|
6366
6725
|
results = patchTestIds(loadManifestResults(cachedFile));
|
|
6367
6726
|
} else {
|
|
@@ -6411,7 +6770,7 @@ function detectPackageManager() {
|
|
|
6411
6770
|
return detectPackageManagerFromPath(process.argv[1] ?? "");
|
|
6412
6771
|
}
|
|
6413
6772
|
function runCommand(cmd, args) {
|
|
6414
|
-
return new Promise((
|
|
6773
|
+
return new Promise((resolve3, reject) => {
|
|
6415
6774
|
const child = spawn(cmd, args, { stdio: ["inherit", "pipe", "inherit"], shell: true });
|
|
6416
6775
|
let stdout = "";
|
|
6417
6776
|
child.stdout?.on("data", (data) => {
|
|
@@ -6419,7 +6778,7 @@ function runCommand(cmd, args) {
|
|
|
6419
6778
|
stdout += data.toString();
|
|
6420
6779
|
});
|
|
6421
6780
|
child.on("error", reject);
|
|
6422
|
-
child.on("close", (code) =>
|
|
6781
|
+
child.on("close", (code) => resolve3({ exitCode: code ?? 1, stdout }));
|
|
6423
6782
|
});
|
|
6424
6783
|
}
|
|
6425
6784
|
var updateCommand = command({
|
|
@@ -7327,7 +7686,7 @@ var transpileCommand = command({
|
|
|
7327
7686
|
});
|
|
7328
7687
|
|
|
7329
7688
|
// src/commands/trim/index.ts
|
|
7330
|
-
import { readFileSync as
|
|
7689
|
+
import { readFileSync as readFileSync8, writeFileSync as writeFileSync5 } from "node:fs";
|
|
7331
7690
|
var trimCommand = command({
|
|
7332
7691
|
name: "trim",
|
|
7333
7692
|
description: "Trim evaluation results for baseline storage (strips debug/audit fields)",
|
|
@@ -7346,7 +7705,7 @@ var trimCommand = command({
|
|
|
7346
7705
|
},
|
|
7347
7706
|
handler: async ({ input, out }) => {
|
|
7348
7707
|
try {
|
|
7349
|
-
const content =
|
|
7708
|
+
const content = readFileSync8(input, "utf8");
|
|
7350
7709
|
const lines = content.trim().split("\n").filter((line) => line.trim());
|
|
7351
7710
|
const trimmedLines = lines.map((line) => {
|
|
7352
7711
|
const record = JSON.parse(line);
|
|
@@ -7452,7 +7811,7 @@ function isTTY() {
|
|
|
7452
7811
|
|
|
7453
7812
|
// src/commands/validate/validate-files.ts
|
|
7454
7813
|
import { constants } from "node:fs";
|
|
7455
|
-
import { access, readdir as
|
|
7814
|
+
import { access, readdir as readdir4, stat } from "node:fs/promises";
|
|
7456
7815
|
import path10 from "node:path";
|
|
7457
7816
|
async function validateFiles(paths) {
|
|
7458
7817
|
const filePaths = await expandPaths(paths);
|
|
@@ -7518,7 +7877,7 @@ async function expandPaths(paths) {
|
|
|
7518
7877
|
async function findYamlFiles(dirPath) {
|
|
7519
7878
|
const results = [];
|
|
7520
7879
|
try {
|
|
7521
|
-
const entries2 = await
|
|
7880
|
+
const entries2 = await readdir4(dirPath, { withFileTypes: true });
|
|
7522
7881
|
for (const entry of entries2) {
|
|
7523
7882
|
const fullPath = path10.join(dirPath, entry.name);
|
|
7524
7883
|
if (entry.isDirectory()) {
|
|
@@ -7575,14 +7934,14 @@ var validateCommand = command({
|
|
|
7575
7934
|
});
|
|
7576
7935
|
|
|
7577
7936
|
// src/commands/workspace/clean.ts
|
|
7578
|
-
import { existsSync as
|
|
7579
|
-
import { readFile as
|
|
7937
|
+
import { existsSync as existsSync6 } from "node:fs";
|
|
7938
|
+
import { readFile as readFile5, readdir as readdir5, rm } from "node:fs/promises";
|
|
7580
7939
|
import path11 from "node:path";
|
|
7581
7940
|
async function confirm(message) {
|
|
7582
7941
|
const readline2 = await import("node:readline");
|
|
7583
7942
|
const rl = readline2.createInterface({ input: process.stdin, output: process.stdout });
|
|
7584
|
-
const answer = await new Promise((
|
|
7585
|
-
rl.question(`${message} [y/N] `,
|
|
7943
|
+
const answer = await new Promise((resolve3) => {
|
|
7944
|
+
rl.question(`${message} [y/N] `, resolve3);
|
|
7586
7945
|
});
|
|
7587
7946
|
rl.close();
|
|
7588
7947
|
return answer.toLowerCase() === "y";
|
|
@@ -7604,19 +7963,19 @@ var cleanCommand = command({
|
|
|
7604
7963
|
},
|
|
7605
7964
|
handler: async ({ repo, force }) => {
|
|
7606
7965
|
const poolRoot = getWorkspacePoolRoot();
|
|
7607
|
-
if (!
|
|
7966
|
+
if (!existsSync6(poolRoot)) {
|
|
7608
7967
|
console.log("No workspace pool entries found.");
|
|
7609
7968
|
return;
|
|
7610
7969
|
}
|
|
7611
7970
|
if (repo) {
|
|
7612
|
-
const entries2 = await
|
|
7971
|
+
const entries2 = await readdir5(poolRoot, { withFileTypes: true });
|
|
7613
7972
|
const poolDirs = entries2.filter((e) => e.isDirectory());
|
|
7614
7973
|
const matchingDirs = [];
|
|
7615
7974
|
for (const dir of poolDirs) {
|
|
7616
7975
|
const poolDir = path11.join(poolRoot, dir.name);
|
|
7617
7976
|
const metadataPath = path11.join(poolDir, "metadata.json");
|
|
7618
7977
|
try {
|
|
7619
|
-
const raw = await
|
|
7978
|
+
const raw = await readFile5(metadataPath, "utf-8");
|
|
7620
7979
|
const metadata = JSON.parse(raw);
|
|
7621
7980
|
const hasRepo = metadata.repos?.some((r) => {
|
|
7622
7981
|
if (r.source.type === "git" && r.source.url) {
|
|
@@ -7663,13 +8022,13 @@ var cleanCommand = command({
|
|
|
7663
8022
|
});
|
|
7664
8023
|
|
|
7665
8024
|
// src/commands/workspace/list.ts
|
|
7666
|
-
import { existsSync as
|
|
7667
|
-
import { readFile as
|
|
8025
|
+
import { existsSync as existsSync7 } from "node:fs";
|
|
8026
|
+
import { readFile as readFile6, readdir as readdir6, stat as stat2 } from "node:fs/promises";
|
|
7668
8027
|
import path12 from "node:path";
|
|
7669
8028
|
async function getDirectorySize(dirPath) {
|
|
7670
8029
|
let totalSize = 0;
|
|
7671
8030
|
try {
|
|
7672
|
-
const entries2 = await
|
|
8031
|
+
const entries2 = await readdir6(dirPath, { withFileTypes: true });
|
|
7673
8032
|
for (const entry of entries2) {
|
|
7674
8033
|
const fullPath = path12.join(dirPath, entry.name);
|
|
7675
8034
|
if (entry.isDirectory()) {
|
|
@@ -7695,11 +8054,11 @@ var listCommand = command({
|
|
|
7695
8054
|
args: {},
|
|
7696
8055
|
handler: async () => {
|
|
7697
8056
|
const poolRoot = getWorkspacePoolRoot();
|
|
7698
|
-
if (!
|
|
8057
|
+
if (!existsSync7(poolRoot)) {
|
|
7699
8058
|
console.log("No workspace pool entries found.");
|
|
7700
8059
|
return;
|
|
7701
8060
|
}
|
|
7702
|
-
const entries2 = await
|
|
8061
|
+
const entries2 = await readdir6(poolRoot, { withFileTypes: true });
|
|
7703
8062
|
const poolDirs = entries2.filter((e) => e.isDirectory());
|
|
7704
8063
|
if (poolDirs.length === 0) {
|
|
7705
8064
|
console.log("No workspace pool entries found.");
|
|
@@ -7708,12 +8067,12 @@ var listCommand = command({
|
|
|
7708
8067
|
for (const dir of poolDirs) {
|
|
7709
8068
|
const poolDir = path12.join(poolRoot, dir.name);
|
|
7710
8069
|
const fingerprint = dir.name;
|
|
7711
|
-
const poolEntries = await
|
|
8070
|
+
const poolEntries = await readdir6(poolDir, { withFileTypes: true });
|
|
7712
8071
|
const slots = poolEntries.filter((e) => e.isDirectory() && e.name.startsWith("slot-"));
|
|
7713
8072
|
const metadataPath = path12.join(poolDir, "metadata.json");
|
|
7714
8073
|
let metadata = null;
|
|
7715
8074
|
try {
|
|
7716
|
-
const raw = await
|
|
8075
|
+
const raw = await readFile6(metadataPath, "utf-8");
|
|
7717
8076
|
metadata = JSON.parse(raw);
|
|
7718
8077
|
} catch {
|
|
7719
8078
|
}
|
|
@@ -7750,16 +8109,16 @@ var workspaceCommand = subcommands({
|
|
|
7750
8109
|
|
|
7751
8110
|
// src/update-check.ts
|
|
7752
8111
|
import { spawn as spawn2 } from "node:child_process";
|
|
7753
|
-
import { readFile as
|
|
7754
|
-
import { join as
|
|
8112
|
+
import { readFile as readFile7 } from "node:fs/promises";
|
|
8113
|
+
import { join as join5 } from "node:path";
|
|
7755
8114
|
var CHECK_INTERVAL_MS = 24 * 60 * 60 * 1e3;
|
|
7756
8115
|
var AGENTV_DIR = getAgentvHome();
|
|
7757
8116
|
var CACHE_FILE = "version-check.json";
|
|
7758
8117
|
var NPM_REGISTRY_URL = "https://registry.npmjs.org/agentv/latest";
|
|
7759
8118
|
async function getCachedUpdateInfo(path13) {
|
|
7760
|
-
const filePath = path13 ??
|
|
8119
|
+
const filePath = path13 ?? join5(AGENTV_DIR, CACHE_FILE);
|
|
7761
8120
|
try {
|
|
7762
|
-
const raw = await
|
|
8121
|
+
const raw = await readFile7(filePath, "utf-8");
|
|
7763
8122
|
const data = JSON.parse(raw);
|
|
7764
8123
|
if (typeof data.latestVersion === "string" && typeof data.lastCheckedAt === "string") {
|
|
7765
8124
|
return data;
|
|
@@ -7791,7 +8150,7 @@ function buildNotice(currentVersion, latestVersion) {
|
|
|
7791
8150
|
}
|
|
7792
8151
|
function backgroundUpdateCheck() {
|
|
7793
8152
|
const dir = AGENTV_DIR;
|
|
7794
|
-
const filePath =
|
|
8153
|
+
const filePath = join5(dir, CACHE_FILE);
|
|
7795
8154
|
const script = `
|
|
7796
8155
|
const https = require('https');
|
|
7797
8156
|
const fs = require('fs');
|
|
@@ -7914,4 +8273,4 @@ export {
|
|
|
7914
8273
|
preprocessArgv,
|
|
7915
8274
|
runCli
|
|
7916
8275
|
};
|
|
7917
|
-
//# sourceMappingURL=chunk-W6CGDNQR.js.map
|
|
8276
|
+
//# sourceMappingURL=chunk-FSNRKR7X.js.map
|