agentv 3.14.2 → 3.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-3UW7KUQ3.js → chunk-3NLBBQX6.js} +314 -59
- package/dist/chunk-3NLBBQX6.js.map +1 -0
- package/dist/{chunk-75PQBKLR.js → chunk-SAPEYQ5U.js} +3 -2
- package/dist/{chunk-75PQBKLR.js.map → chunk-SAPEYQ5U.js.map} +1 -1
- package/dist/cli.js +2 -2
- package/dist/index.js +2 -2
- package/dist/{interactive-Q563ULAR.js → interactive-PGZ55VHT.js} +2 -2
- package/package.json +1 -1
- package/dist/chunk-3UW7KUQ3.js.map +0 -1
- package/dist/{interactive-Q563ULAR.js.map → interactive-PGZ55VHT.js.map} +0 -0
|
@@ -3,6 +3,7 @@ import {
|
|
|
3
3
|
HtmlWriter,
|
|
4
4
|
RESULT_INDEX_FILENAME,
|
|
5
5
|
RESULT_RUNS_DIRNAME,
|
|
6
|
+
buildDefaultRunDir,
|
|
6
7
|
detectFileType,
|
|
7
8
|
findRepoRoot,
|
|
8
9
|
loadLightweightResults,
|
|
@@ -22,7 +23,7 @@ import {
|
|
|
22
23
|
validateFileReferences,
|
|
23
24
|
validateTargetsFile,
|
|
24
25
|
writeArtifactsFromResults
|
|
25
|
-
} from "./chunk-75PQBKLR.js";
|
|
26
|
+
} from "./chunk-SAPEYQ5U.js";
|
|
26
27
|
import {
|
|
27
28
|
createBuiltinRegistry,
|
|
28
29
|
executeScript,
|
|
@@ -4185,7 +4186,7 @@ var evalRunCommand = command({
|
|
|
4185
4186
|
},
|
|
4186
4187
|
handler: async (args) => {
|
|
4187
4188
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4188
|
-
const { launchInteractiveWizard } = await import("./interactive-Q563ULAR.js");
|
|
4189
|
+
const { launchInteractiveWizard } = await import("./interactive-PGZ55VHT.js");
|
|
4189
4190
|
await launchInteractiveWizard();
|
|
4190
4191
|
return;
|
|
4191
4192
|
}
|
|
@@ -4398,6 +4399,7 @@ var initCmdTsCommand = command({
|
|
|
4398
4399
|
});
|
|
4399
4400
|
|
|
4400
4401
|
// src/commands/pipeline/bench.ts
|
|
4402
|
+
import { existsSync as existsSync2 } from "node:fs";
|
|
4401
4403
|
import { readFile, readdir, writeFile as writeFile2 } from "node:fs/promises";
|
|
4402
4404
|
import { join } from "node:path";
|
|
4403
4405
|
var evalBenchCommand = command({
|
|
@@ -4497,14 +4499,41 @@ var evalBenchCommand = command({
|
|
|
4497
4499
|
`,
|
|
4498
4500
|
"utf8"
|
|
4499
4501
|
);
|
|
4502
|
+
const scores = evaluators.map((e) => ({
|
|
4503
|
+
name: e.name,
|
|
4504
|
+
type: e.type,
|
|
4505
|
+
score: e.score,
|
|
4506
|
+
weight: e.weight,
|
|
4507
|
+
verdict: e.score >= 0.5 ? "pass" : "fail",
|
|
4508
|
+
assertions: e.assertions.map((a) => ({
|
|
4509
|
+
text: a.text,
|
|
4510
|
+
passed: a.passed,
|
|
4511
|
+
evidence: a.evidence ?? ""
|
|
4512
|
+
}))
|
|
4513
|
+
}));
|
|
4514
|
+
let executionStatus = "ok";
|
|
4515
|
+
const timingPath = join(testDir, "timing.json");
|
|
4516
|
+
if (existsSync2(timingPath)) {
|
|
4517
|
+
try {
|
|
4518
|
+
const timing = JSON.parse(await readFile(timingPath, "utf8"));
|
|
4519
|
+
if (typeof timing.execution_status === "string") {
|
|
4520
|
+
executionStatus = timing.execution_status;
|
|
4521
|
+
}
|
|
4522
|
+
} catch {
|
|
4523
|
+
}
|
|
4524
|
+
}
|
|
4525
|
+
const hasResponse = existsSync2(join(testDir, "response.md"));
|
|
4500
4526
|
indexLines.push(
|
|
4501
4527
|
JSON.stringify({
|
|
4502
4528
|
timestamp: manifest.timestamp,
|
|
4503
4529
|
test_id: testId,
|
|
4504
4530
|
score: Math.round(weightedScore * 1e3) / 1e3,
|
|
4505
4531
|
target: targetName,
|
|
4532
|
+
scores,
|
|
4533
|
+
execution_status: executionStatus,
|
|
4506
4534
|
grading_path: `${testId}/grading.json`,
|
|
4507
|
-
timing_path: `${testId}/timing.json
|
|
4535
|
+
timing_path: `${testId}/timing.json`,
|
|
4536
|
+
response_path: hasResponse ? `${testId}/response.md` : null
|
|
4508
4537
|
})
|
|
4509
4538
|
);
|
|
4510
4539
|
}
|
|
@@ -4680,14 +4709,14 @@ var evalInputCommand = command({
|
|
|
4680
4709
|
description: "Path to eval YAML file"
|
|
4681
4710
|
}),
|
|
4682
4711
|
out: option({
|
|
4683
|
-
type: string,
|
|
4712
|
+
type: optional(string),
|
|
4684
4713
|
long: "out",
|
|
4685
|
-
description: "Output directory for extracted inputs"
|
|
4714
|
+
description: "Output directory for extracted inputs (default: .agentv/results/runs/eval_<timestamp>)"
|
|
4686
4715
|
})
|
|
4687
4716
|
},
|
|
4688
4717
|
handler: async ({ evalPath, out }) => {
|
|
4689
4718
|
const resolvedEvalPath = resolve(evalPath);
|
|
4690
|
-
const outDir = resolve(out);
|
|
4719
|
+
const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
|
|
4691
4720
|
const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
|
|
4692
4721
|
const evalDir = dirname(resolvedEvalPath);
|
|
4693
4722
|
const suite = await loadTestSuite(resolvedEvalPath, repoRoot);
|
|
@@ -4826,7 +4855,7 @@ async function writeJson(filePath, data) {
|
|
|
4826
4855
|
|
|
4827
4856
|
// src/commands/pipeline/run.ts
|
|
4828
4857
|
import { execSync } from "node:child_process";
|
|
4829
|
-
import { existsSync as existsSync2, readFileSync as readFileSync4, unlinkSync } from "node:fs";
|
|
4858
|
+
import { existsSync as existsSync3, readFileSync as readFileSync4, unlinkSync } from "node:fs";
|
|
4830
4859
|
import { mkdir as mkdir4, readFile as readFile4, readdir as readdir3, writeFile as writeFile5 } from "node:fs/promises";
|
|
4831
4860
|
import { tmpdir } from "node:os";
|
|
4832
4861
|
import { dirname as dirname2, join as join4, resolve as resolve2 } from "node:path";
|
|
@@ -4834,7 +4863,7 @@ function loadEnvFile(dir) {
|
|
|
4834
4863
|
let current = resolve2(dir);
|
|
4835
4864
|
while (true) {
|
|
4836
4865
|
const candidate = join4(current, ".env");
|
|
4837
|
-
if (existsSync2(candidate)) {
|
|
4866
|
+
if (existsSync3(candidate)) {
|
|
4838
4867
|
const env3 = {};
|
|
4839
4868
|
for (const line of readFileSync4(candidate, "utf8").split("\n")) {
|
|
4840
4869
|
const trimmed = line.trim();
|
|
@@ -4861,9 +4890,9 @@ var evalRunCommand2 = command({
|
|
|
4861
4890
|
description: "Path to eval YAML file"
|
|
4862
4891
|
}),
|
|
4863
4892
|
out: option({
|
|
4864
|
-
type: string,
|
|
4893
|
+
type: optional(string),
|
|
4865
4894
|
long: "out",
|
|
4866
|
-
description: "Output directory for results"
|
|
4895
|
+
description: "Output directory for results (default: .agentv/results/runs/eval_<timestamp>)"
|
|
4867
4896
|
}),
|
|
4868
4897
|
workers: option({
|
|
4869
4898
|
type: optional(number),
|
|
@@ -4873,7 +4902,7 @@ var evalRunCommand2 = command({
|
|
|
4873
4902
|
},
|
|
4874
4903
|
handler: async ({ evalPath, out, workers }) => {
|
|
4875
4904
|
const resolvedEvalPath = resolve2(evalPath);
|
|
4876
|
-
const outDir = resolve2(out);
|
|
4905
|
+
const outDir = resolve2(out ?? buildDefaultRunDir(process.cwd()));
|
|
4877
4906
|
const repoRoot = await findRepoRoot(dirname2(resolvedEvalPath));
|
|
4878
4907
|
const evalDir = dirname2(resolvedEvalPath);
|
|
4879
4908
|
const suite = await loadTestSuite(resolvedEvalPath, repoRoot);
|
|
@@ -4957,6 +4986,9 @@ var evalRunCommand2 = command({
|
|
|
4957
4986
|
console.log(`Extracted ${testIds.length} test(s) to ${outDir}`);
|
|
4958
4987
|
if (targetInfo) {
|
|
4959
4988
|
const envVars = loadEnvFile(evalDir);
|
|
4989
|
+
if (!process.env.AGENTV_RUN_TIMESTAMP) {
|
|
4990
|
+
process.env.AGENTV_RUN_TIMESTAMP = (/* @__PURE__ */ new Date()).toISOString().replace(/:/g, "-").replace(/\./g, "-");
|
|
4991
|
+
}
|
|
4960
4992
|
const mergedEnv = { ...process.env, ...envVars };
|
|
4961
4993
|
const maxWorkers = workers ?? testIds.length;
|
|
4962
4994
|
console.log(`Invoking ${testIds.length} CLI target(s) (${maxWorkers} workers)...`);
|
|
@@ -4986,7 +5018,7 @@ var evalRunCommand2 = command({
|
|
|
4986
5018
|
});
|
|
4987
5019
|
const durationMs = Math.round(performance.now() - start);
|
|
4988
5020
|
let response;
|
|
4989
|
-
if (existsSync2(outputFile)) {
|
|
5021
|
+
if (existsSync3(outputFile)) {
|
|
4990
5022
|
response = readFileSync4(outputFile, "utf8");
|
|
4991
5023
|
} else {
|
|
4992
5024
|
response = "ERROR: No output file generated";
|
|
@@ -4994,7 +5026,8 @@ var evalRunCommand2 = command({
|
|
|
4994
5026
|
await writeFile5(join4(testDir, "response.md"), response, "utf8");
|
|
4995
5027
|
await writeJson2(join4(testDir, "timing.json"), {
|
|
4996
5028
|
duration_ms: durationMs,
|
|
4997
|
-
total_duration_seconds: Math.round(durationMs / 10) / 100
|
|
5029
|
+
total_duration_seconds: Math.round(durationMs / 10) / 100,
|
|
5030
|
+
execution_status: "ok"
|
|
4998
5031
|
});
|
|
4999
5032
|
console.log(` ${testId}: OK (${durationMs}ms, ${response.length} chars)`);
|
|
5000
5033
|
} catch (error) {
|
|
@@ -5004,19 +5037,29 @@ var evalRunCommand2 = command({
|
|
|
5004
5037
|
await writeFile5(join4(testDir, "response.md"), response, "utf8");
|
|
5005
5038
|
await writeJson2(join4(testDir, "timing.json"), {
|
|
5006
5039
|
duration_ms: durationMs,
|
|
5007
|
-
total_duration_seconds: Math.round(durationMs / 10) / 100
|
|
5040
|
+
total_duration_seconds: Math.round(durationMs / 10) / 100,
|
|
5041
|
+
execution_status: "execution_error"
|
|
5008
5042
|
});
|
|
5009
5043
|
console.error(` ${testId}: FAILED (${durationMs}ms) \u2014 ${message.slice(0, 200)}`);
|
|
5010
5044
|
} finally {
|
|
5011
5045
|
try {
|
|
5012
|
-
if (
|
|
5013
|
-
if (
|
|
5046
|
+
if (existsSync3(promptFile)) unlinkSync(promptFile);
|
|
5047
|
+
if (existsSync3(outputFile)) unlinkSync(outputFile);
|
|
5014
5048
|
} catch {
|
|
5015
5049
|
}
|
|
5016
5050
|
}
|
|
5017
5051
|
};
|
|
5018
|
-
const
|
|
5019
|
-
|
|
5052
|
+
const pending = /* @__PURE__ */ new Set();
|
|
5053
|
+
for (const testId of testIds) {
|
|
5054
|
+
const task = invokeTarget(testId).then(() => {
|
|
5055
|
+
pending.delete(task);
|
|
5056
|
+
});
|
|
5057
|
+
pending.add(task);
|
|
5058
|
+
if (pending.size >= maxWorkers) {
|
|
5059
|
+
await Promise.race(pending);
|
|
5060
|
+
}
|
|
5061
|
+
}
|
|
5062
|
+
await Promise.all(pending);
|
|
5020
5063
|
} else {
|
|
5021
5064
|
console.log("Agent-as-target mode \u2014 skipping CLI invocation.");
|
|
5022
5065
|
}
|
|
@@ -5185,7 +5228,7 @@ var pipelineCommand = subcommands({
|
|
|
5185
5228
|
import path7 from "node:path";
|
|
5186
5229
|
|
|
5187
5230
|
// src/commands/results/shared.ts
|
|
5188
|
-
import { existsSync as
|
|
5231
|
+
import { existsSync as existsSync4 } from "node:fs";
|
|
5189
5232
|
|
|
5190
5233
|
// src/commands/trace/utils.ts
|
|
5191
5234
|
import { readFileSync as readFileSync5, readdirSync as readdirSync2, statSync as statSync2 } from "node:fs";
|
|
@@ -5599,14 +5642,14 @@ async function resolveSourceFile(source, cwd) {
|
|
|
5599
5642
|
let sourceFile;
|
|
5600
5643
|
if (source) {
|
|
5601
5644
|
sourceFile = resolveResultSourcePath(source, cwd);
|
|
5602
|
-
if (!
|
|
5645
|
+
if (!existsSync4(sourceFile)) {
|
|
5603
5646
|
console.error(`Error: File not found: ${sourceFile}`);
|
|
5604
5647
|
process.exit(1);
|
|
5605
5648
|
}
|
|
5606
5649
|
} else {
|
|
5607
5650
|
const cache = await loadRunCache(cwd);
|
|
5608
5651
|
const cachedFile = cache ? resolveRunCacheFile(cache) : "";
|
|
5609
|
-
if (cachedFile &&
|
|
5652
|
+
if (cachedFile && existsSync4(cachedFile)) {
|
|
5610
5653
|
sourceFile = cachedFile;
|
|
5611
5654
|
} else {
|
|
5612
5655
|
const metas = listResultFiles(cwd, 1);
|
|
@@ -5818,7 +5861,7 @@ var resultsShowCommand = command({
|
|
|
5818
5861
|
});
|
|
5819
5862
|
|
|
5820
5863
|
// src/commands/results/summary.ts
|
|
5821
|
-
import { existsSync as
|
|
5864
|
+
import { existsSync as existsSync5, readFileSync as readFileSync6 } from "node:fs";
|
|
5822
5865
|
function formatSummary(results, grading) {
|
|
5823
5866
|
const total = results.length;
|
|
5824
5867
|
let passed;
|
|
@@ -5869,7 +5912,7 @@ var resultsSummaryCommand = command({
|
|
|
5869
5912
|
const { results, sourceFile } = await loadResults(source, cwd);
|
|
5870
5913
|
let grading;
|
|
5871
5914
|
const gradingPath = sourceFile.replace(/\.jsonl$/, ".grading.json");
|
|
5872
|
-
if (
|
|
5915
|
+
if (existsSync5(gradingPath)) {
|
|
5873
5916
|
try {
|
|
5874
5917
|
grading = JSON.parse(readFileSync6(gradingPath, "utf8"));
|
|
5875
5918
|
} catch {
|
|
@@ -5883,6 +5926,217 @@ var resultsSummaryCommand = command({
|
|
|
5883
5926
|
}
|
|
5884
5927
|
});
|
|
5885
5928
|
|
|
5929
|
+
// src/commands/results/validate.ts
|
|
5930
|
+
import { existsSync as existsSync6, readFileSync as readFileSync7, statSync as statSync3 } from "node:fs";
|
|
5931
|
+
import path8 from "node:path";
|
|
5932
|
+
function checkDirectoryNaming(runDir) {
|
|
5933
|
+
const dirName = path8.basename(runDir);
|
|
5934
|
+
const parentName = path8.basename(path8.dirname(runDir));
|
|
5935
|
+
const diagnostics = [];
|
|
5936
|
+
if (parentName !== "runs") {
|
|
5937
|
+
diagnostics.push({
|
|
5938
|
+
severity: "warning",
|
|
5939
|
+
message: `Directory is not under a 'runs/' parent (found '${parentName}/'). Expected: .agentv/results/runs/<run-dir>`
|
|
5940
|
+
});
|
|
5941
|
+
}
|
|
5942
|
+
if (!/^eval_\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z$/.test(dirName)) {
|
|
5943
|
+
diagnostics.push({
|
|
5944
|
+
severity: "warning",
|
|
5945
|
+
message: `Directory name '${dirName}' does not match the expected pattern 'eval_<ISO-timestamp>'. Example: eval_2026-03-27T12-42-24-429Z`
|
|
5946
|
+
});
|
|
5947
|
+
}
|
|
5948
|
+
return diagnostics;
|
|
5949
|
+
}
|
|
5950
|
+
function checkIndexJsonl(runDir) {
|
|
5951
|
+
const indexPath = path8.join(runDir, "index.jsonl");
|
|
5952
|
+
const diagnostics = [];
|
|
5953
|
+
const entries2 = [];
|
|
5954
|
+
if (!existsSync6(indexPath)) {
|
|
5955
|
+
diagnostics.push({ severity: "error", message: "index.jsonl is missing" });
|
|
5956
|
+
return { diagnostics, entries: entries2 };
|
|
5957
|
+
}
|
|
5958
|
+
const content = readFileSync7(indexPath, "utf8");
|
|
5959
|
+
const lines = content.split("\n").filter((l) => l.trim().length > 0);
|
|
5960
|
+
if (lines.length === 0) {
|
|
5961
|
+
diagnostics.push({ severity: "error", message: "index.jsonl is empty" });
|
|
5962
|
+
return { diagnostics, entries: entries2 };
|
|
5963
|
+
}
|
|
5964
|
+
for (let i = 0; i < lines.length; i++) {
|
|
5965
|
+
try {
|
|
5966
|
+
const entry = JSON.parse(lines[i]);
|
|
5967
|
+
entries2.push(entry);
|
|
5968
|
+
if (!entry.test_id) {
|
|
5969
|
+
diagnostics.push({
|
|
5970
|
+
severity: "error",
|
|
5971
|
+
message: `index.jsonl line ${i + 1}: missing 'test_id'`
|
|
5972
|
+
});
|
|
5973
|
+
}
|
|
5974
|
+
if (entry.score === void 0 || entry.score === null) {
|
|
5975
|
+
diagnostics.push({
|
|
5976
|
+
severity: "error",
|
|
5977
|
+
message: `index.jsonl line ${i + 1} (${entry.test_id ?? "?"}): missing 'score'`
|
|
5978
|
+
});
|
|
5979
|
+
} else if (typeof entry.score !== "number" || entry.score < 0 || entry.score > 1) {
|
|
5980
|
+
diagnostics.push({
|
|
5981
|
+
severity: "error",
|
|
5982
|
+
message: `index.jsonl line ${i + 1} (${entry.test_id ?? "?"}): score ${entry.score} is outside [0, 1]`
|
|
5983
|
+
});
|
|
5984
|
+
}
|
|
5985
|
+
if (!entry.target) {
|
|
5986
|
+
diagnostics.push({
|
|
5987
|
+
severity: "error",
|
|
5988
|
+
message: `index.jsonl line ${i + 1} (${entry.test_id ?? "?"}): missing 'target'`
|
|
5989
|
+
});
|
|
5990
|
+
}
|
|
5991
|
+
if (!entry.grading_path) {
|
|
5992
|
+
diagnostics.push({
|
|
5993
|
+
severity: "warning",
|
|
5994
|
+
message: `index.jsonl line ${i + 1} (${entry.test_id ?? "?"}): missing 'grading_path'`
|
|
5995
|
+
});
|
|
5996
|
+
}
|
|
5997
|
+
if (!entry.scores || !Array.isArray(entry.scores) || entry.scores.length === 0) {
|
|
5998
|
+
diagnostics.push({
|
|
5999
|
+
severity: "warning",
|
|
6000
|
+
message: `index.jsonl line ${i + 1} (${entry.test_id ?? "?"}): missing 'scores[]' array \u2014 dashboard may not show per-evaluator breakdown`
|
|
6001
|
+
});
|
|
6002
|
+
} else {
|
|
6003
|
+
for (let j = 0; j < entry.scores.length; j++) {
|
|
6004
|
+
const s = entry.scores[j];
|
|
6005
|
+
if (!s || typeof s !== "object") {
|
|
6006
|
+
diagnostics.push({
|
|
6007
|
+
severity: "error",
|
|
6008
|
+
message: `index.jsonl line ${i + 1} (${entry.test_id ?? "?"}): scores[${j}] is not an object`
|
|
6009
|
+
});
|
|
6010
|
+
continue;
|
|
6011
|
+
}
|
|
6012
|
+
const missing = [];
|
|
6013
|
+
if (typeof s.name !== "string") missing.push("name");
|
|
6014
|
+
if (typeof s.type !== "string") missing.push("type");
|
|
6015
|
+
if (typeof s.score !== "number") missing.push("score");
|
|
6016
|
+
if (typeof s.verdict !== "string") missing.push("verdict");
|
|
6017
|
+
if (missing.length > 0) {
|
|
6018
|
+
diagnostics.push({
|
|
6019
|
+
severity: "warning",
|
|
6020
|
+
message: `index.jsonl line ${i + 1} (${entry.test_id ?? "?"}): scores[${j}] missing fields: ${missing.join(", ")}`
|
|
6021
|
+
});
|
|
6022
|
+
}
|
|
6023
|
+
}
|
|
6024
|
+
}
|
|
6025
|
+
if (!entry.execution_status) {
|
|
6026
|
+
diagnostics.push({
|
|
6027
|
+
severity: "warning",
|
|
6028
|
+
message: `index.jsonl line ${i + 1} (${entry.test_id ?? "?"}): missing 'execution_status'`
|
|
6029
|
+
});
|
|
6030
|
+
} else if (!["ok", "quality_failure", "execution_error"].includes(entry.execution_status)) {
|
|
6031
|
+
diagnostics.push({
|
|
6032
|
+
severity: "warning",
|
|
6033
|
+
message: `index.jsonl line ${i + 1} (${entry.test_id ?? "?"}): unknown execution_status '${entry.execution_status}' (expected: ok, quality_failure, execution_error)`
|
|
6034
|
+
});
|
|
6035
|
+
}
|
|
6036
|
+
} catch {
|
|
6037
|
+
diagnostics.push({
|
|
6038
|
+
severity: "error",
|
|
6039
|
+
message: `index.jsonl line ${i + 1}: invalid JSON`
|
|
6040
|
+
});
|
|
6041
|
+
}
|
|
6042
|
+
}
|
|
6043
|
+
return { diagnostics, entries: entries2 };
|
|
6044
|
+
}
|
|
6045
|
+
function checkArtifactFiles(runDir, entries2) {
|
|
6046
|
+
const diagnostics = [];
|
|
6047
|
+
for (const entry of entries2) {
|
|
6048
|
+
const testId = entry.test_id ?? "?";
|
|
6049
|
+
if (entry.grading_path) {
|
|
6050
|
+
const gradingPath = path8.join(runDir, entry.grading_path);
|
|
6051
|
+
if (!existsSync6(gradingPath)) {
|
|
6052
|
+
diagnostics.push({
|
|
6053
|
+
severity: "error",
|
|
6054
|
+
message: `${testId}: grading.json not found at '${entry.grading_path}'`
|
|
6055
|
+
});
|
|
6056
|
+
} else {
|
|
6057
|
+
try {
|
|
6058
|
+
const grading = JSON.parse(readFileSync7(gradingPath, "utf8"));
|
|
6059
|
+
if (!grading.assertions || !Array.isArray(grading.assertions)) {
|
|
6060
|
+
diagnostics.push({
|
|
6061
|
+
severity: "error",
|
|
6062
|
+
message: `${testId}: grading.json missing 'assertions' array`
|
|
6063
|
+
});
|
|
6064
|
+
}
|
|
6065
|
+
if (!grading.summary) {
|
|
6066
|
+
diagnostics.push({
|
|
6067
|
+
severity: "warning",
|
|
6068
|
+
message: `${testId}: grading.json missing 'summary' object`
|
|
6069
|
+
});
|
|
6070
|
+
}
|
|
6071
|
+
} catch {
|
|
6072
|
+
diagnostics.push({
|
|
6073
|
+
severity: "error",
|
|
6074
|
+
message: `${testId}: grading.json is not valid JSON`
|
|
6075
|
+
});
|
|
6076
|
+
}
|
|
6077
|
+
}
|
|
6078
|
+
}
|
|
6079
|
+
if (entry.timing_path) {
|
|
6080
|
+
const timingPath = path8.join(runDir, entry.timing_path);
|
|
6081
|
+
if (!existsSync6(timingPath)) {
|
|
6082
|
+
diagnostics.push({
|
|
6083
|
+
severity: "warning",
|
|
6084
|
+
message: `${testId}: timing.json not found at '${entry.timing_path}'`
|
|
6085
|
+
});
|
|
6086
|
+
}
|
|
6087
|
+
}
|
|
6088
|
+
}
|
|
6089
|
+
const benchmarkPath = path8.join(runDir, "benchmark.json");
|
|
6090
|
+
if (!existsSync6(benchmarkPath)) {
|
|
6091
|
+
diagnostics.push({ severity: "warning", message: "benchmark.json is missing" });
|
|
6092
|
+
}
|
|
6093
|
+
return diagnostics;
|
|
6094
|
+
}
|
|
6095
|
+
var resultsValidateCommand = command({
|
|
6096
|
+
name: "validate",
|
|
6097
|
+
description: "Validate that a run directory contains well-formed result artifacts",
|
|
6098
|
+
args: {
|
|
6099
|
+
runDir: positional({
|
|
6100
|
+
type: string,
|
|
6101
|
+
displayName: "run-dir",
|
|
6102
|
+
description: "Path to the run directory to validate"
|
|
6103
|
+
})
|
|
6104
|
+
},
|
|
6105
|
+
handler: async ({ runDir }) => {
|
|
6106
|
+
const resolvedDir = path8.resolve(runDir);
|
|
6107
|
+
if (!existsSync6(resolvedDir) || !statSync3(resolvedDir).isDirectory()) {
|
|
6108
|
+
console.error(`Error: '${runDir}' is not a directory`);
|
|
6109
|
+
process.exit(1);
|
|
6110
|
+
}
|
|
6111
|
+
const allDiagnostics = [];
|
|
6112
|
+
allDiagnostics.push(...checkDirectoryNaming(resolvedDir));
|
|
6113
|
+
const { diagnostics: indexDiags, entries: entries2 } = checkIndexJsonl(resolvedDir);
|
|
6114
|
+
allDiagnostics.push(...indexDiags);
|
|
6115
|
+
if (entries2.length > 0) {
|
|
6116
|
+
allDiagnostics.push(...checkArtifactFiles(resolvedDir, entries2));
|
|
6117
|
+
}
|
|
6118
|
+
const errors = allDiagnostics.filter((d) => d.severity === "error");
|
|
6119
|
+
const warnings = allDiagnostics.filter((d) => d.severity === "warning");
|
|
6120
|
+
if (allDiagnostics.length === 0) {
|
|
6121
|
+
console.log(`\u2713 Valid run directory: ${entries2.length} test(s), no issues found`);
|
|
6122
|
+
return;
|
|
6123
|
+
}
|
|
6124
|
+
for (const d of errors) {
|
|
6125
|
+
console.error(` ERROR: ${d.message}`);
|
|
6126
|
+
}
|
|
6127
|
+
for (const d of warnings) {
|
|
6128
|
+
console.warn(` WARN: ${d.message}`);
|
|
6129
|
+
}
|
|
6130
|
+
console.log(
|
|
6131
|
+
`
|
|
6132
|
+
${entries2.length} test(s), ${errors.length} error(s), ${warnings.length} warning(s)`
|
|
6133
|
+
);
|
|
6134
|
+
if (errors.length > 0) {
|
|
6135
|
+
process.exit(1);
|
|
6136
|
+
}
|
|
6137
|
+
}
|
|
6138
|
+
});
|
|
6139
|
+
|
|
5886
6140
|
// src/commands/results/index.ts
|
|
5887
6141
|
var resultsCommand = subcommands({
|
|
5888
6142
|
name: "results",
|
|
@@ -5891,24 +6145,25 @@ var resultsCommand = subcommands({
|
|
|
5891
6145
|
export: resultsExportCommand,
|
|
5892
6146
|
summary: resultsSummaryCommand,
|
|
5893
6147
|
failures: resultsFailuresCommand,
|
|
5894
|
-
show: resultsShowCommand
|
|
6148
|
+
show: resultsShowCommand,
|
|
6149
|
+
validate: resultsValidateCommand
|
|
5895
6150
|
}
|
|
5896
6151
|
});
|
|
5897
6152
|
|
|
5898
6153
|
// src/commands/results/serve.ts
|
|
5899
|
-
import { existsSync as
|
|
5900
|
-
import
|
|
6154
|
+
import { existsSync as existsSync7, readFileSync as readFileSync8, writeFileSync as writeFileSync3 } from "node:fs";
|
|
6155
|
+
import path9 from "node:path";
|
|
5901
6156
|
import { Hono } from "hono";
|
|
5902
6157
|
function feedbackPath(resultDir) {
|
|
5903
|
-
return
|
|
6158
|
+
return path9.join(resultDir, "feedback.json");
|
|
5904
6159
|
}
|
|
5905
6160
|
function readFeedback(cwd) {
|
|
5906
6161
|
const fp = feedbackPath(cwd);
|
|
5907
|
-
if (!
|
|
6162
|
+
if (!existsSync7(fp)) {
|
|
5908
6163
|
return { reviews: [] };
|
|
5909
6164
|
}
|
|
5910
6165
|
try {
|
|
5911
|
-
return JSON.parse(
|
|
6166
|
+
return JSON.parse(readFileSync8(fp, "utf8"));
|
|
5912
6167
|
} catch (err2) {
|
|
5913
6168
|
console.error(`Warning: could not parse ${fp}, starting fresh: ${err2.message}`);
|
|
5914
6169
|
return { reviews: [] };
|
|
@@ -6047,7 +6302,7 @@ ${SERVE_STYLES}
|
|
|
6047
6302
|
<main id="app"></main>
|
|
6048
6303
|
<script>
|
|
6049
6304
|
var DATA = ${dataJson};
|
|
6050
|
-
var INITIAL_SOURCE = ${sourceFile ? JSON.stringify(
|
|
6305
|
+
var INITIAL_SOURCE = ${sourceFile ? JSON.stringify(path9.basename(sourceFile)).replace(/</g, "\\u003c").replace(/>/g, "\\u003e") : "null"};
|
|
6051
6306
|
${SERVE_SCRIPT}
|
|
6052
6307
|
</script>
|
|
6053
6308
|
</body>
|
|
@@ -6708,7 +6963,7 @@ var resultsServeCommand = command({
|
|
|
6708
6963
|
let sourceFile;
|
|
6709
6964
|
if (source) {
|
|
6710
6965
|
const resolved = resolveResultSourcePath(source, cwd);
|
|
6711
|
-
if (!
|
|
6966
|
+
if (!existsSync7(resolved)) {
|
|
6712
6967
|
console.error(`Error: Source file not found: ${resolved}`);
|
|
6713
6968
|
process.exit(1);
|
|
6714
6969
|
}
|
|
@@ -6717,7 +6972,7 @@ var resultsServeCommand = command({
|
|
|
6717
6972
|
} else {
|
|
6718
6973
|
const cache = await loadRunCache(cwd);
|
|
6719
6974
|
const cachedFile = cache ? resolveRunCacheFile(cache) : "";
|
|
6720
|
-
if (cachedFile &&
|
|
6975
|
+
if (cachedFile && existsSync7(cachedFile)) {
|
|
6721
6976
|
sourceFile = cachedFile;
|
|
6722
6977
|
results = patchTestIds(loadManifestResults(cachedFile));
|
|
6723
6978
|
} else {
|
|
@@ -6728,7 +6983,7 @@ var resultsServeCommand = command({
|
|
|
6728
6983
|
}
|
|
6729
6984
|
}
|
|
6730
6985
|
}
|
|
6731
|
-
const resultDir = sourceFile ?
|
|
6986
|
+
const resultDir = sourceFile ? path9.dirname(path9.resolve(sourceFile)) : cwd;
|
|
6732
6987
|
const app2 = createApp(results, resultDir, cwd, sourceFile);
|
|
6733
6988
|
if (results.length > 0 && sourceFile) {
|
|
6734
6989
|
console.log(`Serving ${results.length} result(s) from ${sourceFile}`);
|
|
@@ -7622,7 +7877,7 @@ var traceCommand = subcommands({
|
|
|
7622
7877
|
|
|
7623
7878
|
// src/commands/transpile/index.ts
|
|
7624
7879
|
import { writeFileSync as writeFileSync4 } from "node:fs";
|
|
7625
|
-
import
|
|
7880
|
+
import path10 from "node:path";
|
|
7626
7881
|
var transpileCommand = command({
|
|
7627
7882
|
name: "transpile",
|
|
7628
7883
|
description: "Convert an EVAL.yaml file to Agent Skills evals.json format",
|
|
@@ -7646,7 +7901,7 @@ var transpileCommand = command({
|
|
|
7646
7901
|
handler: async ({ input, outDir, stdout }) => {
|
|
7647
7902
|
let result;
|
|
7648
7903
|
try {
|
|
7649
|
-
result = transpileEvalYamlFile(
|
|
7904
|
+
result = transpileEvalYamlFile(path10.resolve(input));
|
|
7650
7905
|
} catch (error) {
|
|
7651
7906
|
console.error(`Error: ${error.message}`);
|
|
7652
7907
|
process.exit(1);
|
|
@@ -7670,11 +7925,11 @@ var transpileCommand = command({
|
|
|
7670
7925
|
process.stdout.write("\n");
|
|
7671
7926
|
return;
|
|
7672
7927
|
}
|
|
7673
|
-
const outputDir = outDir ?
|
|
7928
|
+
const outputDir = outDir ? path10.resolve(outDir) : path10.dirname(path10.resolve(input));
|
|
7674
7929
|
const fileNames = getOutputFilenames(result);
|
|
7675
7930
|
for (const [skill, evalsJson] of result.files) {
|
|
7676
7931
|
const fileName = fileNames.get(skill) ?? "evals.json";
|
|
7677
|
-
const outputPath =
|
|
7932
|
+
const outputPath = path10.join(outputDir, fileName);
|
|
7678
7933
|
writeFileSync4(outputPath, `${JSON.stringify(evalsJson, null, 2)}
|
|
7679
7934
|
`);
|
|
7680
7935
|
console.log(`Transpiled to ${outputPath}`);
|
|
@@ -7683,7 +7938,7 @@ var transpileCommand = command({
|
|
|
7683
7938
|
});
|
|
7684
7939
|
|
|
7685
7940
|
// src/commands/trim/index.ts
|
|
7686
|
-
import { readFileSync as
|
|
7941
|
+
import { readFileSync as readFileSync9, writeFileSync as writeFileSync5 } from "node:fs";
|
|
7687
7942
|
var trimCommand = command({
|
|
7688
7943
|
name: "trim",
|
|
7689
7944
|
description: "Trim evaluation results for baseline storage (strips debug/audit fields)",
|
|
@@ -7702,7 +7957,7 @@ var trimCommand = command({
|
|
|
7702
7957
|
},
|
|
7703
7958
|
handler: async ({ input, out }) => {
|
|
7704
7959
|
try {
|
|
7705
|
-
const content =
|
|
7960
|
+
const content = readFileSync9(input, "utf8");
|
|
7706
7961
|
const lines = content.trim().split("\n").filter((line) => line.trim());
|
|
7707
7962
|
const trimmedLines = lines.map((line) => {
|
|
7708
7963
|
const record = JSON.parse(line);
|
|
@@ -7809,7 +8064,7 @@ function isTTY() {
|
|
|
7809
8064
|
// src/commands/validate/validate-files.ts
|
|
7810
8065
|
import { constants } from "node:fs";
|
|
7811
8066
|
import { access, readdir as readdir4, stat } from "node:fs/promises";
|
|
7812
|
-
import
|
|
8067
|
+
import path11 from "node:path";
|
|
7813
8068
|
async function validateFiles(paths) {
|
|
7814
8069
|
const filePaths = await expandPaths(paths);
|
|
7815
8070
|
const results = [];
|
|
@@ -7827,7 +8082,7 @@ async function validateFiles(paths) {
|
|
|
7827
8082
|
};
|
|
7828
8083
|
}
|
|
7829
8084
|
async function validateSingleFile(filePath) {
|
|
7830
|
-
const absolutePath =
|
|
8085
|
+
const absolutePath = path11.resolve(filePath);
|
|
7831
8086
|
const fileType = await detectFileType(absolutePath);
|
|
7832
8087
|
let result;
|
|
7833
8088
|
if (fileType === "eval") {
|
|
@@ -7852,7 +8107,7 @@ async function validateSingleFile(filePath) {
|
|
|
7852
8107
|
async function expandPaths(paths) {
|
|
7853
8108
|
const expanded = [];
|
|
7854
8109
|
for (const inputPath of paths) {
|
|
7855
|
-
const absolutePath =
|
|
8110
|
+
const absolutePath = path11.resolve(inputPath);
|
|
7856
8111
|
try {
|
|
7857
8112
|
await access(absolutePath, constants.F_OK);
|
|
7858
8113
|
} catch {
|
|
@@ -7876,7 +8131,7 @@ async function findYamlFiles(dirPath) {
|
|
|
7876
8131
|
try {
|
|
7877
8132
|
const entries2 = await readdir4(dirPath, { withFileTypes: true });
|
|
7878
8133
|
for (const entry of entries2) {
|
|
7879
|
-
const fullPath =
|
|
8134
|
+
const fullPath = path11.join(dirPath, entry.name);
|
|
7880
8135
|
if (entry.isDirectory()) {
|
|
7881
8136
|
if (entry.name === "node_modules" || entry.name.startsWith(".")) {
|
|
7882
8137
|
continue;
|
|
@@ -7893,7 +8148,7 @@ async function findYamlFiles(dirPath) {
|
|
|
7893
8148
|
return results;
|
|
7894
8149
|
}
|
|
7895
8150
|
function isYamlFile(filePath) {
|
|
7896
|
-
const ext =
|
|
8151
|
+
const ext = path11.extname(filePath).toLowerCase();
|
|
7897
8152
|
return ext === ".yaml" || ext === ".yml";
|
|
7898
8153
|
}
|
|
7899
8154
|
|
|
@@ -7931,9 +8186,9 @@ var validateCommand = command({
|
|
|
7931
8186
|
});
|
|
7932
8187
|
|
|
7933
8188
|
// src/commands/workspace/clean.ts
|
|
7934
|
-
import { existsSync as
|
|
8189
|
+
import { existsSync as existsSync8 } from "node:fs";
|
|
7935
8190
|
import { readFile as readFile5, readdir as readdir5, rm } from "node:fs/promises";
|
|
7936
|
-
import
|
|
8191
|
+
import path12 from "node:path";
|
|
7937
8192
|
async function confirm(message) {
|
|
7938
8193
|
const readline2 = await import("node:readline");
|
|
7939
8194
|
const rl = readline2.createInterface({ input: process.stdin, output: process.stdout });
|
|
@@ -7960,7 +8215,7 @@ var cleanCommand = command({
|
|
|
7960
8215
|
},
|
|
7961
8216
|
handler: async ({ repo, force }) => {
|
|
7962
8217
|
const poolRoot = getWorkspacePoolRoot();
|
|
7963
|
-
if (!
|
|
8218
|
+
if (!existsSync8(poolRoot)) {
|
|
7964
8219
|
console.log("No workspace pool entries found.");
|
|
7965
8220
|
return;
|
|
7966
8221
|
}
|
|
@@ -7969,8 +8224,8 @@ var cleanCommand = command({
|
|
|
7969
8224
|
const poolDirs = entries2.filter((e) => e.isDirectory());
|
|
7970
8225
|
const matchingDirs = [];
|
|
7971
8226
|
for (const dir of poolDirs) {
|
|
7972
|
-
const poolDir =
|
|
7973
|
-
const metadataPath =
|
|
8227
|
+
const poolDir = path12.join(poolRoot, dir.name);
|
|
8228
|
+
const metadataPath = path12.join(poolDir, "metadata.json");
|
|
7974
8229
|
try {
|
|
7975
8230
|
const raw = await readFile5(metadataPath, "utf-8");
|
|
7976
8231
|
const metadata = JSON.parse(raw);
|
|
@@ -8001,7 +8256,7 @@ var cleanCommand = command({
|
|
|
8001
8256
|
}
|
|
8002
8257
|
for (const dir of matchingDirs) {
|
|
8003
8258
|
await rm(dir, { recursive: true, force: true });
|
|
8004
|
-
console.log(`Removed: ${
|
|
8259
|
+
console.log(`Removed: ${path12.basename(dir).slice(0, 12)}...`);
|
|
8005
8260
|
}
|
|
8006
8261
|
console.log("Done.");
|
|
8007
8262
|
} else {
|
|
@@ -8019,15 +8274,15 @@ var cleanCommand = command({
|
|
|
8019
8274
|
});
|
|
8020
8275
|
|
|
8021
8276
|
// src/commands/workspace/list.ts
|
|
8022
|
-
import { existsSync as
|
|
8277
|
+
import { existsSync as existsSync9 } from "node:fs";
|
|
8023
8278
|
import { readFile as readFile6, readdir as readdir6, stat as stat2 } from "node:fs/promises";
|
|
8024
|
-
import
|
|
8279
|
+
import path13 from "node:path";
|
|
8025
8280
|
async function getDirectorySize(dirPath) {
|
|
8026
8281
|
let totalSize = 0;
|
|
8027
8282
|
try {
|
|
8028
8283
|
const entries2 = await readdir6(dirPath, { withFileTypes: true });
|
|
8029
8284
|
for (const entry of entries2) {
|
|
8030
|
-
const fullPath =
|
|
8285
|
+
const fullPath = path13.join(dirPath, entry.name);
|
|
8031
8286
|
if (entry.isDirectory()) {
|
|
8032
8287
|
totalSize += await getDirectorySize(fullPath);
|
|
8033
8288
|
} else {
|
|
@@ -8051,7 +8306,7 @@ var listCommand = command({
|
|
|
8051
8306
|
args: {},
|
|
8052
8307
|
handler: async () => {
|
|
8053
8308
|
const poolRoot = getWorkspacePoolRoot();
|
|
8054
|
-
if (!
|
|
8309
|
+
if (!existsSync9(poolRoot)) {
|
|
8055
8310
|
console.log("No workspace pool entries found.");
|
|
8056
8311
|
return;
|
|
8057
8312
|
}
|
|
@@ -8062,11 +8317,11 @@ var listCommand = command({
|
|
|
8062
8317
|
return;
|
|
8063
8318
|
}
|
|
8064
8319
|
for (const dir of poolDirs) {
|
|
8065
|
-
const poolDir =
|
|
8320
|
+
const poolDir = path13.join(poolRoot, dir.name);
|
|
8066
8321
|
const fingerprint = dir.name;
|
|
8067
8322
|
const poolEntries = await readdir6(poolDir, { withFileTypes: true });
|
|
8068
8323
|
const slots = poolEntries.filter((e) => e.isDirectory() && e.name.startsWith("slot-"));
|
|
8069
|
-
const metadataPath =
|
|
8324
|
+
const metadataPath = path13.join(poolDir, "metadata.json");
|
|
8070
8325
|
let metadata = null;
|
|
8071
8326
|
try {
|
|
8072
8327
|
const raw = await readFile6(metadataPath, "utf-8");
|
|
@@ -8112,8 +8367,8 @@ var CHECK_INTERVAL_MS = 24 * 60 * 60 * 1e3;
|
|
|
8112
8367
|
var AGENTV_DIR = getAgentvHome();
|
|
8113
8368
|
var CACHE_FILE = "version-check.json";
|
|
8114
8369
|
var NPM_REGISTRY_URL = "https://registry.npmjs.org/agentv/latest";
|
|
8115
|
-
async function getCachedUpdateInfo(
|
|
8116
|
-
const filePath =
|
|
8370
|
+
async function getCachedUpdateInfo(path14) {
|
|
8371
|
+
const filePath = path14 ?? join5(AGENTV_DIR, CACHE_FILE);
|
|
8117
8372
|
try {
|
|
8118
8373
|
const raw = await readFile7(filePath, "utf-8");
|
|
8119
8374
|
const data = JSON.parse(raw);
|
|
@@ -8270,4 +8525,4 @@ export {
|
|
|
8270
8525
|
preprocessArgv,
|
|
8271
8526
|
runCli
|
|
8272
8527
|
};
|
|
8273
|
-
//# sourceMappingURL=chunk-3UW7KUQ3.js.map
|
|
8528
|
+
//# sourceMappingURL=chunk-3NLBBQX6.js.map
|