agentv 3.14.3 → 3.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-FSNRKR7X.js → chunk-3NLBBQX6.js} +311 -59
- package/dist/chunk-3NLBBQX6.js.map +1 -0
- package/dist/{chunk-VYDUBNCD.js → chunk-SAPEYQ5U.js} +3 -2
- package/dist/{chunk-VYDUBNCD.js.map → chunk-SAPEYQ5U.js.map} +1 -1
- package/dist/cli.js +2 -2
- package/dist/index.js +2 -2
- package/dist/{interactive-LBVPF2CG.js → interactive-PGZ55VHT.js} +2 -2
- package/package.json +1 -1
- package/dist/chunk-FSNRKR7X.js.map +0 -1
- /package/dist/{interactive-LBVPF2CG.js.map → interactive-PGZ55VHT.js.map} +0 -0
|
@@ -3,6 +3,7 @@ import {
|
|
|
3
3
|
HtmlWriter,
|
|
4
4
|
RESULT_INDEX_FILENAME,
|
|
5
5
|
RESULT_RUNS_DIRNAME,
|
|
6
|
+
buildDefaultRunDir,
|
|
6
7
|
detectFileType,
|
|
7
8
|
findRepoRoot,
|
|
8
9
|
loadLightweightResults,
|
|
@@ -22,7 +23,7 @@ import {
|
|
|
22
23
|
validateFileReferences,
|
|
23
24
|
validateTargetsFile,
|
|
24
25
|
writeArtifactsFromResults
|
|
25
|
-
} from "./chunk-
|
|
26
|
+
} from "./chunk-SAPEYQ5U.js";
|
|
26
27
|
import {
|
|
27
28
|
createBuiltinRegistry,
|
|
28
29
|
executeScript,
|
|
@@ -4185,7 +4186,7 @@ var evalRunCommand = command({
|
|
|
4185
4186
|
},
|
|
4186
4187
|
handler: async (args) => {
|
|
4187
4188
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4188
|
-
const { launchInteractiveWizard } = await import("./interactive-
|
|
4189
|
+
const { launchInteractiveWizard } = await import("./interactive-PGZ55VHT.js");
|
|
4189
4190
|
await launchInteractiveWizard();
|
|
4190
4191
|
return;
|
|
4191
4192
|
}
|
|
@@ -4398,6 +4399,7 @@ var initCmdTsCommand = command({
|
|
|
4398
4399
|
});
|
|
4399
4400
|
|
|
4400
4401
|
// src/commands/pipeline/bench.ts
|
|
4402
|
+
import { existsSync as existsSync2 } from "node:fs";
|
|
4401
4403
|
import { readFile, readdir, writeFile as writeFile2 } from "node:fs/promises";
|
|
4402
4404
|
import { join } from "node:path";
|
|
4403
4405
|
var evalBenchCommand = command({
|
|
@@ -4497,14 +4499,41 @@ var evalBenchCommand = command({
|
|
|
4497
4499
|
`,
|
|
4498
4500
|
"utf8"
|
|
4499
4501
|
);
|
|
4502
|
+
const scores = evaluators.map((e) => ({
|
|
4503
|
+
name: e.name,
|
|
4504
|
+
type: e.type,
|
|
4505
|
+
score: e.score,
|
|
4506
|
+
weight: e.weight,
|
|
4507
|
+
verdict: e.score >= 0.5 ? "pass" : "fail",
|
|
4508
|
+
assertions: e.assertions.map((a) => ({
|
|
4509
|
+
text: a.text,
|
|
4510
|
+
passed: a.passed,
|
|
4511
|
+
evidence: a.evidence ?? ""
|
|
4512
|
+
}))
|
|
4513
|
+
}));
|
|
4514
|
+
let executionStatus = "ok";
|
|
4515
|
+
const timingPath = join(testDir, "timing.json");
|
|
4516
|
+
if (existsSync2(timingPath)) {
|
|
4517
|
+
try {
|
|
4518
|
+
const timing = JSON.parse(await readFile(timingPath, "utf8"));
|
|
4519
|
+
if (typeof timing.execution_status === "string") {
|
|
4520
|
+
executionStatus = timing.execution_status;
|
|
4521
|
+
}
|
|
4522
|
+
} catch {
|
|
4523
|
+
}
|
|
4524
|
+
}
|
|
4525
|
+
const hasResponse = existsSync2(join(testDir, "response.md"));
|
|
4500
4526
|
indexLines.push(
|
|
4501
4527
|
JSON.stringify({
|
|
4502
4528
|
timestamp: manifest.timestamp,
|
|
4503
4529
|
test_id: testId,
|
|
4504
4530
|
score: Math.round(weightedScore * 1e3) / 1e3,
|
|
4505
4531
|
target: targetName,
|
|
4532
|
+
scores,
|
|
4533
|
+
execution_status: executionStatus,
|
|
4506
4534
|
grading_path: `${testId}/grading.json`,
|
|
4507
|
-
timing_path: `${testId}/timing.json
|
|
4535
|
+
timing_path: `${testId}/timing.json`,
|
|
4536
|
+
response_path: hasResponse ? `${testId}/response.md` : null
|
|
4508
4537
|
})
|
|
4509
4538
|
);
|
|
4510
4539
|
}
|
|
@@ -4680,14 +4709,14 @@ var evalInputCommand = command({
|
|
|
4680
4709
|
description: "Path to eval YAML file"
|
|
4681
4710
|
}),
|
|
4682
4711
|
out: option({
|
|
4683
|
-
type: string,
|
|
4712
|
+
type: optional(string),
|
|
4684
4713
|
long: "out",
|
|
4685
|
-
description: "Output directory for extracted inputs"
|
|
4714
|
+
description: "Output directory for extracted inputs (default: .agentv/results/runs/eval_<timestamp>)"
|
|
4686
4715
|
})
|
|
4687
4716
|
},
|
|
4688
4717
|
handler: async ({ evalPath, out }) => {
|
|
4689
4718
|
const resolvedEvalPath = resolve(evalPath);
|
|
4690
|
-
const outDir = resolve(out);
|
|
4719
|
+
const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
|
|
4691
4720
|
const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
|
|
4692
4721
|
const evalDir = dirname(resolvedEvalPath);
|
|
4693
4722
|
const suite = await loadTestSuite(resolvedEvalPath, repoRoot);
|
|
@@ -4826,7 +4855,7 @@ async function writeJson(filePath, data) {
|
|
|
4826
4855
|
|
|
4827
4856
|
// src/commands/pipeline/run.ts
|
|
4828
4857
|
import { execSync } from "node:child_process";
|
|
4829
|
-
import { existsSync as
|
|
4858
|
+
import { existsSync as existsSync3, readFileSync as readFileSync4, unlinkSync } from "node:fs";
|
|
4830
4859
|
import { mkdir as mkdir4, readFile as readFile4, readdir as readdir3, writeFile as writeFile5 } from "node:fs/promises";
|
|
4831
4860
|
import { tmpdir } from "node:os";
|
|
4832
4861
|
import { dirname as dirname2, join as join4, resolve as resolve2 } from "node:path";
|
|
@@ -4834,7 +4863,7 @@ function loadEnvFile(dir) {
|
|
|
4834
4863
|
let current = resolve2(dir);
|
|
4835
4864
|
while (true) {
|
|
4836
4865
|
const candidate = join4(current, ".env");
|
|
4837
|
-
if (
|
|
4866
|
+
if (existsSync3(candidate)) {
|
|
4838
4867
|
const env3 = {};
|
|
4839
4868
|
for (const line of readFileSync4(candidate, "utf8").split("\n")) {
|
|
4840
4869
|
const trimmed = line.trim();
|
|
@@ -4861,9 +4890,9 @@ var evalRunCommand2 = command({
|
|
|
4861
4890
|
description: "Path to eval YAML file"
|
|
4862
4891
|
}),
|
|
4863
4892
|
out: option({
|
|
4864
|
-
type: string,
|
|
4893
|
+
type: optional(string),
|
|
4865
4894
|
long: "out",
|
|
4866
|
-
description: "Output directory for results"
|
|
4895
|
+
description: "Output directory for results (default: .agentv/results/runs/eval_<timestamp>)"
|
|
4867
4896
|
}),
|
|
4868
4897
|
workers: option({
|
|
4869
4898
|
type: optional(number),
|
|
@@ -4873,7 +4902,7 @@ var evalRunCommand2 = command({
|
|
|
4873
4902
|
},
|
|
4874
4903
|
handler: async ({ evalPath, out, workers }) => {
|
|
4875
4904
|
const resolvedEvalPath = resolve2(evalPath);
|
|
4876
|
-
const outDir = resolve2(out);
|
|
4905
|
+
const outDir = resolve2(out ?? buildDefaultRunDir(process.cwd()));
|
|
4877
4906
|
const repoRoot = await findRepoRoot(dirname2(resolvedEvalPath));
|
|
4878
4907
|
const evalDir = dirname2(resolvedEvalPath);
|
|
4879
4908
|
const suite = await loadTestSuite(resolvedEvalPath, repoRoot);
|
|
@@ -4989,7 +5018,7 @@ var evalRunCommand2 = command({
|
|
|
4989
5018
|
});
|
|
4990
5019
|
const durationMs = Math.round(performance.now() - start);
|
|
4991
5020
|
let response;
|
|
4992
|
-
if (
|
|
5021
|
+
if (existsSync3(outputFile)) {
|
|
4993
5022
|
response = readFileSync4(outputFile, "utf8");
|
|
4994
5023
|
} else {
|
|
4995
5024
|
response = "ERROR: No output file generated";
|
|
@@ -4997,7 +5026,8 @@ var evalRunCommand2 = command({
|
|
|
4997
5026
|
await writeFile5(join4(testDir, "response.md"), response, "utf8");
|
|
4998
5027
|
await writeJson2(join4(testDir, "timing.json"), {
|
|
4999
5028
|
duration_ms: durationMs,
|
|
5000
|
-
total_duration_seconds: Math.round(durationMs / 10) / 100
|
|
5029
|
+
total_duration_seconds: Math.round(durationMs / 10) / 100,
|
|
5030
|
+
execution_status: "ok"
|
|
5001
5031
|
});
|
|
5002
5032
|
console.log(` ${testId}: OK (${durationMs}ms, ${response.length} chars)`);
|
|
5003
5033
|
} catch (error) {
|
|
@@ -5007,19 +5037,29 @@ var evalRunCommand2 = command({
|
|
|
5007
5037
|
await writeFile5(join4(testDir, "response.md"), response, "utf8");
|
|
5008
5038
|
await writeJson2(join4(testDir, "timing.json"), {
|
|
5009
5039
|
duration_ms: durationMs,
|
|
5010
|
-
total_duration_seconds: Math.round(durationMs / 10) / 100
|
|
5040
|
+
total_duration_seconds: Math.round(durationMs / 10) / 100,
|
|
5041
|
+
execution_status: "execution_error"
|
|
5011
5042
|
});
|
|
5012
5043
|
console.error(` ${testId}: FAILED (${durationMs}ms) \u2014 ${message.slice(0, 200)}`);
|
|
5013
5044
|
} finally {
|
|
5014
5045
|
try {
|
|
5015
|
-
if (
|
|
5016
|
-
if (
|
|
5046
|
+
if (existsSync3(promptFile)) unlinkSync(promptFile);
|
|
5047
|
+
if (existsSync3(outputFile)) unlinkSync(outputFile);
|
|
5017
5048
|
} catch {
|
|
5018
5049
|
}
|
|
5019
5050
|
}
|
|
5020
5051
|
};
|
|
5021
|
-
const
|
|
5022
|
-
|
|
5052
|
+
const pending = /* @__PURE__ */ new Set();
|
|
5053
|
+
for (const testId of testIds) {
|
|
5054
|
+
const task = invokeTarget(testId).then(() => {
|
|
5055
|
+
pending.delete(task);
|
|
5056
|
+
});
|
|
5057
|
+
pending.add(task);
|
|
5058
|
+
if (pending.size >= maxWorkers) {
|
|
5059
|
+
await Promise.race(pending);
|
|
5060
|
+
}
|
|
5061
|
+
}
|
|
5062
|
+
await Promise.all(pending);
|
|
5023
5063
|
} else {
|
|
5024
5064
|
console.log("Agent-as-target mode \u2014 skipping CLI invocation.");
|
|
5025
5065
|
}
|
|
@@ -5188,7 +5228,7 @@ var pipelineCommand = subcommands({
|
|
|
5188
5228
|
import path7 from "node:path";
|
|
5189
5229
|
|
|
5190
5230
|
// src/commands/results/shared.ts
|
|
5191
|
-
import { existsSync as
|
|
5231
|
+
import { existsSync as existsSync4 } from "node:fs";
|
|
5192
5232
|
|
|
5193
5233
|
// src/commands/trace/utils.ts
|
|
5194
5234
|
import { readFileSync as readFileSync5, readdirSync as readdirSync2, statSync as statSync2 } from "node:fs";
|
|
@@ -5602,14 +5642,14 @@ async function resolveSourceFile(source, cwd) {
|
|
|
5602
5642
|
let sourceFile;
|
|
5603
5643
|
if (source) {
|
|
5604
5644
|
sourceFile = resolveResultSourcePath(source, cwd);
|
|
5605
|
-
if (!
|
|
5645
|
+
if (!existsSync4(sourceFile)) {
|
|
5606
5646
|
console.error(`Error: File not found: ${sourceFile}`);
|
|
5607
5647
|
process.exit(1);
|
|
5608
5648
|
}
|
|
5609
5649
|
} else {
|
|
5610
5650
|
const cache = await loadRunCache(cwd);
|
|
5611
5651
|
const cachedFile = cache ? resolveRunCacheFile(cache) : "";
|
|
5612
|
-
if (cachedFile &&
|
|
5652
|
+
if (cachedFile && existsSync4(cachedFile)) {
|
|
5613
5653
|
sourceFile = cachedFile;
|
|
5614
5654
|
} else {
|
|
5615
5655
|
const metas = listResultFiles(cwd, 1);
|
|
@@ -5821,7 +5861,7 @@ var resultsShowCommand = command({
|
|
|
5821
5861
|
});
|
|
5822
5862
|
|
|
5823
5863
|
// src/commands/results/summary.ts
|
|
5824
|
-
import { existsSync as
|
|
5864
|
+
import { existsSync as existsSync5, readFileSync as readFileSync6 } from "node:fs";
|
|
5825
5865
|
function formatSummary(results, grading) {
|
|
5826
5866
|
const total = results.length;
|
|
5827
5867
|
let passed;
|
|
@@ -5872,7 +5912,7 @@ var resultsSummaryCommand = command({
|
|
|
5872
5912
|
const { results, sourceFile } = await loadResults(source, cwd);
|
|
5873
5913
|
let grading;
|
|
5874
5914
|
const gradingPath = sourceFile.replace(/\.jsonl$/, ".grading.json");
|
|
5875
|
-
if (
|
|
5915
|
+
if (existsSync5(gradingPath)) {
|
|
5876
5916
|
try {
|
|
5877
5917
|
grading = JSON.parse(readFileSync6(gradingPath, "utf8"));
|
|
5878
5918
|
} catch {
|
|
@@ -5886,6 +5926,217 @@ var resultsSummaryCommand = command({
|
|
|
5886
5926
|
}
|
|
5887
5927
|
});
|
|
5888
5928
|
|
|
5929
|
+
// src/commands/results/validate.ts
|
|
5930
|
+
import { existsSync as existsSync6, readFileSync as readFileSync7, statSync as statSync3 } from "node:fs";
|
|
5931
|
+
import path8 from "node:path";
|
|
5932
|
+
function checkDirectoryNaming(runDir) {
|
|
5933
|
+
const dirName = path8.basename(runDir);
|
|
5934
|
+
const parentName = path8.basename(path8.dirname(runDir));
|
|
5935
|
+
const diagnostics = [];
|
|
5936
|
+
if (parentName !== "runs") {
|
|
5937
|
+
diagnostics.push({
|
|
5938
|
+
severity: "warning",
|
|
5939
|
+
message: `Directory is not under a 'runs/' parent (found '${parentName}/'). Expected: .agentv/results/runs/<run-dir>`
|
|
5940
|
+
});
|
|
5941
|
+
}
|
|
5942
|
+
if (!/^eval_\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z$/.test(dirName)) {
|
|
5943
|
+
diagnostics.push({
|
|
5944
|
+
severity: "warning",
|
|
5945
|
+
message: `Directory name '${dirName}' does not match the expected pattern 'eval_<ISO-timestamp>'. Example: eval_2026-03-27T12-42-24-429Z`
|
|
5946
|
+
});
|
|
5947
|
+
}
|
|
5948
|
+
return diagnostics;
|
|
5949
|
+
}
|
|
5950
|
+
function checkIndexJsonl(runDir) {
|
|
5951
|
+
const indexPath = path8.join(runDir, "index.jsonl");
|
|
5952
|
+
const diagnostics = [];
|
|
5953
|
+
const entries2 = [];
|
|
5954
|
+
if (!existsSync6(indexPath)) {
|
|
5955
|
+
diagnostics.push({ severity: "error", message: "index.jsonl is missing" });
|
|
5956
|
+
return { diagnostics, entries: entries2 };
|
|
5957
|
+
}
|
|
5958
|
+
const content = readFileSync7(indexPath, "utf8");
|
|
5959
|
+
const lines = content.split("\n").filter((l) => l.trim().length > 0);
|
|
5960
|
+
if (lines.length === 0) {
|
|
5961
|
+
diagnostics.push({ severity: "error", message: "index.jsonl is empty" });
|
|
5962
|
+
return { diagnostics, entries: entries2 };
|
|
5963
|
+
}
|
|
5964
|
+
for (let i = 0; i < lines.length; i++) {
|
|
5965
|
+
try {
|
|
5966
|
+
const entry = JSON.parse(lines[i]);
|
|
5967
|
+
entries2.push(entry);
|
|
5968
|
+
if (!entry.test_id) {
|
|
5969
|
+
diagnostics.push({
|
|
5970
|
+
severity: "error",
|
|
5971
|
+
message: `index.jsonl line ${i + 1}: missing 'test_id'`
|
|
5972
|
+
});
|
|
5973
|
+
}
|
|
5974
|
+
if (entry.score === void 0 || entry.score === null) {
|
|
5975
|
+
diagnostics.push({
|
|
5976
|
+
severity: "error",
|
|
5977
|
+
message: `index.jsonl line ${i + 1} (${entry.test_id ?? "?"}): missing 'score'`
|
|
5978
|
+
});
|
|
5979
|
+
} else if (typeof entry.score !== "number" || entry.score < 0 || entry.score > 1) {
|
|
5980
|
+
diagnostics.push({
|
|
5981
|
+
severity: "error",
|
|
5982
|
+
message: `index.jsonl line ${i + 1} (${entry.test_id ?? "?"}): score ${entry.score} is outside [0, 1]`
|
|
5983
|
+
});
|
|
5984
|
+
}
|
|
5985
|
+
if (!entry.target) {
|
|
5986
|
+
diagnostics.push({
|
|
5987
|
+
severity: "error",
|
|
5988
|
+
message: `index.jsonl line ${i + 1} (${entry.test_id ?? "?"}): missing 'target'`
|
|
5989
|
+
});
|
|
5990
|
+
}
|
|
5991
|
+
if (!entry.grading_path) {
|
|
5992
|
+
diagnostics.push({
|
|
5993
|
+
severity: "warning",
|
|
5994
|
+
message: `index.jsonl line ${i + 1} (${entry.test_id ?? "?"}): missing 'grading_path'`
|
|
5995
|
+
});
|
|
5996
|
+
}
|
|
5997
|
+
if (!entry.scores || !Array.isArray(entry.scores) || entry.scores.length === 0) {
|
|
5998
|
+
diagnostics.push({
|
|
5999
|
+
severity: "warning",
|
|
6000
|
+
message: `index.jsonl line ${i + 1} (${entry.test_id ?? "?"}): missing 'scores[]' array \u2014 dashboard may not show per-evaluator breakdown`
|
|
6001
|
+
});
|
|
6002
|
+
} else {
|
|
6003
|
+
for (let j = 0; j < entry.scores.length; j++) {
|
|
6004
|
+
const s = entry.scores[j];
|
|
6005
|
+
if (!s || typeof s !== "object") {
|
|
6006
|
+
diagnostics.push({
|
|
6007
|
+
severity: "error",
|
|
6008
|
+
message: `index.jsonl line ${i + 1} (${entry.test_id ?? "?"}): scores[${j}] is not an object`
|
|
6009
|
+
});
|
|
6010
|
+
continue;
|
|
6011
|
+
}
|
|
6012
|
+
const missing = [];
|
|
6013
|
+
if (typeof s.name !== "string") missing.push("name");
|
|
6014
|
+
if (typeof s.type !== "string") missing.push("type");
|
|
6015
|
+
if (typeof s.score !== "number") missing.push("score");
|
|
6016
|
+
if (typeof s.verdict !== "string") missing.push("verdict");
|
|
6017
|
+
if (missing.length > 0) {
|
|
6018
|
+
diagnostics.push({
|
|
6019
|
+
severity: "warning",
|
|
6020
|
+
message: `index.jsonl line ${i + 1} (${entry.test_id ?? "?"}): scores[${j}] missing fields: ${missing.join(", ")}`
|
|
6021
|
+
});
|
|
6022
|
+
}
|
|
6023
|
+
}
|
|
6024
|
+
}
|
|
6025
|
+
if (!entry.execution_status) {
|
|
6026
|
+
diagnostics.push({
|
|
6027
|
+
severity: "warning",
|
|
6028
|
+
message: `index.jsonl line ${i + 1} (${entry.test_id ?? "?"}): missing 'execution_status'`
|
|
6029
|
+
});
|
|
6030
|
+
} else if (!["ok", "quality_failure", "execution_error"].includes(entry.execution_status)) {
|
|
6031
|
+
diagnostics.push({
|
|
6032
|
+
severity: "warning",
|
|
6033
|
+
message: `index.jsonl line ${i + 1} (${entry.test_id ?? "?"}): unknown execution_status '${entry.execution_status}' (expected: ok, quality_failure, execution_error)`
|
|
6034
|
+
});
|
|
6035
|
+
}
|
|
6036
|
+
} catch {
|
|
6037
|
+
diagnostics.push({
|
|
6038
|
+
severity: "error",
|
|
6039
|
+
message: `index.jsonl line ${i + 1}: invalid JSON`
|
|
6040
|
+
});
|
|
6041
|
+
}
|
|
6042
|
+
}
|
|
6043
|
+
return { diagnostics, entries: entries2 };
|
|
6044
|
+
}
|
|
6045
|
+
function checkArtifactFiles(runDir, entries2) {
|
|
6046
|
+
const diagnostics = [];
|
|
6047
|
+
for (const entry of entries2) {
|
|
6048
|
+
const testId = entry.test_id ?? "?";
|
|
6049
|
+
if (entry.grading_path) {
|
|
6050
|
+
const gradingPath = path8.join(runDir, entry.grading_path);
|
|
6051
|
+
if (!existsSync6(gradingPath)) {
|
|
6052
|
+
diagnostics.push({
|
|
6053
|
+
severity: "error",
|
|
6054
|
+
message: `${testId}: grading.json not found at '${entry.grading_path}'`
|
|
6055
|
+
});
|
|
6056
|
+
} else {
|
|
6057
|
+
try {
|
|
6058
|
+
const grading = JSON.parse(readFileSync7(gradingPath, "utf8"));
|
|
6059
|
+
if (!grading.assertions || !Array.isArray(grading.assertions)) {
|
|
6060
|
+
diagnostics.push({
|
|
6061
|
+
severity: "error",
|
|
6062
|
+
message: `${testId}: grading.json missing 'assertions' array`
|
|
6063
|
+
});
|
|
6064
|
+
}
|
|
6065
|
+
if (!grading.summary) {
|
|
6066
|
+
diagnostics.push({
|
|
6067
|
+
severity: "warning",
|
|
6068
|
+
message: `${testId}: grading.json missing 'summary' object`
|
|
6069
|
+
});
|
|
6070
|
+
}
|
|
6071
|
+
} catch {
|
|
6072
|
+
diagnostics.push({
|
|
6073
|
+
severity: "error",
|
|
6074
|
+
message: `${testId}: grading.json is not valid JSON`
|
|
6075
|
+
});
|
|
6076
|
+
}
|
|
6077
|
+
}
|
|
6078
|
+
}
|
|
6079
|
+
if (entry.timing_path) {
|
|
6080
|
+
const timingPath = path8.join(runDir, entry.timing_path);
|
|
6081
|
+
if (!existsSync6(timingPath)) {
|
|
6082
|
+
diagnostics.push({
|
|
6083
|
+
severity: "warning",
|
|
6084
|
+
message: `${testId}: timing.json not found at '${entry.timing_path}'`
|
|
6085
|
+
});
|
|
6086
|
+
}
|
|
6087
|
+
}
|
|
6088
|
+
}
|
|
6089
|
+
const benchmarkPath = path8.join(runDir, "benchmark.json");
|
|
6090
|
+
if (!existsSync6(benchmarkPath)) {
|
|
6091
|
+
diagnostics.push({ severity: "warning", message: "benchmark.json is missing" });
|
|
6092
|
+
}
|
|
6093
|
+
return diagnostics;
|
|
6094
|
+
}
|
|
6095
|
+
var resultsValidateCommand = command({
|
|
6096
|
+
name: "validate",
|
|
6097
|
+
description: "Validate that a run directory contains well-formed result artifacts",
|
|
6098
|
+
args: {
|
|
6099
|
+
runDir: positional({
|
|
6100
|
+
type: string,
|
|
6101
|
+
displayName: "run-dir",
|
|
6102
|
+
description: "Path to the run directory to validate"
|
|
6103
|
+
})
|
|
6104
|
+
},
|
|
6105
|
+
handler: async ({ runDir }) => {
|
|
6106
|
+
const resolvedDir = path8.resolve(runDir);
|
|
6107
|
+
if (!existsSync6(resolvedDir) || !statSync3(resolvedDir).isDirectory()) {
|
|
6108
|
+
console.error(`Error: '${runDir}' is not a directory`);
|
|
6109
|
+
process.exit(1);
|
|
6110
|
+
}
|
|
6111
|
+
const allDiagnostics = [];
|
|
6112
|
+
allDiagnostics.push(...checkDirectoryNaming(resolvedDir));
|
|
6113
|
+
const { diagnostics: indexDiags, entries: entries2 } = checkIndexJsonl(resolvedDir);
|
|
6114
|
+
allDiagnostics.push(...indexDiags);
|
|
6115
|
+
if (entries2.length > 0) {
|
|
6116
|
+
allDiagnostics.push(...checkArtifactFiles(resolvedDir, entries2));
|
|
6117
|
+
}
|
|
6118
|
+
const errors = allDiagnostics.filter((d) => d.severity === "error");
|
|
6119
|
+
const warnings = allDiagnostics.filter((d) => d.severity === "warning");
|
|
6120
|
+
if (allDiagnostics.length === 0) {
|
|
6121
|
+
console.log(`\u2713 Valid run directory: ${entries2.length} test(s), no issues found`);
|
|
6122
|
+
return;
|
|
6123
|
+
}
|
|
6124
|
+
for (const d of errors) {
|
|
6125
|
+
console.error(` ERROR: ${d.message}`);
|
|
6126
|
+
}
|
|
6127
|
+
for (const d of warnings) {
|
|
6128
|
+
console.warn(` WARN: ${d.message}`);
|
|
6129
|
+
}
|
|
6130
|
+
console.log(
|
|
6131
|
+
`
|
|
6132
|
+
${entries2.length} test(s), ${errors.length} error(s), ${warnings.length} warning(s)`
|
|
6133
|
+
);
|
|
6134
|
+
if (errors.length > 0) {
|
|
6135
|
+
process.exit(1);
|
|
6136
|
+
}
|
|
6137
|
+
}
|
|
6138
|
+
});
|
|
6139
|
+
|
|
5889
6140
|
// src/commands/results/index.ts
|
|
5890
6141
|
var resultsCommand = subcommands({
|
|
5891
6142
|
name: "results",
|
|
@@ -5894,24 +6145,25 @@ var resultsCommand = subcommands({
|
|
|
5894
6145
|
export: resultsExportCommand,
|
|
5895
6146
|
summary: resultsSummaryCommand,
|
|
5896
6147
|
failures: resultsFailuresCommand,
|
|
5897
|
-
show: resultsShowCommand
|
|
6148
|
+
show: resultsShowCommand,
|
|
6149
|
+
validate: resultsValidateCommand
|
|
5898
6150
|
}
|
|
5899
6151
|
});
|
|
5900
6152
|
|
|
5901
6153
|
// src/commands/results/serve.ts
|
|
5902
|
-
import { existsSync as
|
|
5903
|
-
import
|
|
6154
|
+
import { existsSync as existsSync7, readFileSync as readFileSync8, writeFileSync as writeFileSync3 } from "node:fs";
|
|
6155
|
+
import path9 from "node:path";
|
|
5904
6156
|
import { Hono } from "hono";
|
|
5905
6157
|
function feedbackPath(resultDir) {
|
|
5906
|
-
return
|
|
6158
|
+
return path9.join(resultDir, "feedback.json");
|
|
5907
6159
|
}
|
|
5908
6160
|
function readFeedback(cwd) {
|
|
5909
6161
|
const fp = feedbackPath(cwd);
|
|
5910
|
-
if (!
|
|
6162
|
+
if (!existsSync7(fp)) {
|
|
5911
6163
|
return { reviews: [] };
|
|
5912
6164
|
}
|
|
5913
6165
|
try {
|
|
5914
|
-
return JSON.parse(
|
|
6166
|
+
return JSON.parse(readFileSync8(fp, "utf8"));
|
|
5915
6167
|
} catch (err2) {
|
|
5916
6168
|
console.error(`Warning: could not parse ${fp}, starting fresh: ${err2.message}`);
|
|
5917
6169
|
return { reviews: [] };
|
|
@@ -6050,7 +6302,7 @@ ${SERVE_STYLES}
|
|
|
6050
6302
|
<main id="app"></main>
|
|
6051
6303
|
<script>
|
|
6052
6304
|
var DATA = ${dataJson};
|
|
6053
|
-
var INITIAL_SOURCE = ${sourceFile ? JSON.stringify(
|
|
6305
|
+
var INITIAL_SOURCE = ${sourceFile ? JSON.stringify(path9.basename(sourceFile)).replace(/</g, "\\u003c").replace(/>/g, "\\u003e") : "null"};
|
|
6054
6306
|
${SERVE_SCRIPT}
|
|
6055
6307
|
</script>
|
|
6056
6308
|
</body>
|
|
@@ -6711,7 +6963,7 @@ var resultsServeCommand = command({
|
|
|
6711
6963
|
let sourceFile;
|
|
6712
6964
|
if (source) {
|
|
6713
6965
|
const resolved = resolveResultSourcePath(source, cwd);
|
|
6714
|
-
if (!
|
|
6966
|
+
if (!existsSync7(resolved)) {
|
|
6715
6967
|
console.error(`Error: Source file not found: ${resolved}`);
|
|
6716
6968
|
process.exit(1);
|
|
6717
6969
|
}
|
|
@@ -6720,7 +6972,7 @@ var resultsServeCommand = command({
|
|
|
6720
6972
|
} else {
|
|
6721
6973
|
const cache = await loadRunCache(cwd);
|
|
6722
6974
|
const cachedFile = cache ? resolveRunCacheFile(cache) : "";
|
|
6723
|
-
if (cachedFile &&
|
|
6975
|
+
if (cachedFile && existsSync7(cachedFile)) {
|
|
6724
6976
|
sourceFile = cachedFile;
|
|
6725
6977
|
results = patchTestIds(loadManifestResults(cachedFile));
|
|
6726
6978
|
} else {
|
|
@@ -6731,7 +6983,7 @@ var resultsServeCommand = command({
|
|
|
6731
6983
|
}
|
|
6732
6984
|
}
|
|
6733
6985
|
}
|
|
6734
|
-
const resultDir = sourceFile ?
|
|
6986
|
+
const resultDir = sourceFile ? path9.dirname(path9.resolve(sourceFile)) : cwd;
|
|
6735
6987
|
const app2 = createApp(results, resultDir, cwd, sourceFile);
|
|
6736
6988
|
if (results.length > 0 && sourceFile) {
|
|
6737
6989
|
console.log(`Serving ${results.length} result(s) from ${sourceFile}`);
|
|
@@ -7625,7 +7877,7 @@ var traceCommand = subcommands({
|
|
|
7625
7877
|
|
|
7626
7878
|
// src/commands/transpile/index.ts
|
|
7627
7879
|
import { writeFileSync as writeFileSync4 } from "node:fs";
|
|
7628
|
-
import
|
|
7880
|
+
import path10 from "node:path";
|
|
7629
7881
|
var transpileCommand = command({
|
|
7630
7882
|
name: "transpile",
|
|
7631
7883
|
description: "Convert an EVAL.yaml file to Agent Skills evals.json format",
|
|
@@ -7649,7 +7901,7 @@ var transpileCommand = command({
|
|
|
7649
7901
|
handler: async ({ input, outDir, stdout }) => {
|
|
7650
7902
|
let result;
|
|
7651
7903
|
try {
|
|
7652
|
-
result = transpileEvalYamlFile(
|
|
7904
|
+
result = transpileEvalYamlFile(path10.resolve(input));
|
|
7653
7905
|
} catch (error) {
|
|
7654
7906
|
console.error(`Error: ${error.message}`);
|
|
7655
7907
|
process.exit(1);
|
|
@@ -7673,11 +7925,11 @@ var transpileCommand = command({
|
|
|
7673
7925
|
process.stdout.write("\n");
|
|
7674
7926
|
return;
|
|
7675
7927
|
}
|
|
7676
|
-
const outputDir = outDir ?
|
|
7928
|
+
const outputDir = outDir ? path10.resolve(outDir) : path10.dirname(path10.resolve(input));
|
|
7677
7929
|
const fileNames = getOutputFilenames(result);
|
|
7678
7930
|
for (const [skill, evalsJson] of result.files) {
|
|
7679
7931
|
const fileName = fileNames.get(skill) ?? "evals.json";
|
|
7680
|
-
const outputPath =
|
|
7932
|
+
const outputPath = path10.join(outputDir, fileName);
|
|
7681
7933
|
writeFileSync4(outputPath, `${JSON.stringify(evalsJson, null, 2)}
|
|
7682
7934
|
`);
|
|
7683
7935
|
console.log(`Transpiled to ${outputPath}`);
|
|
@@ -7686,7 +7938,7 @@ var transpileCommand = command({
|
|
|
7686
7938
|
});
|
|
7687
7939
|
|
|
7688
7940
|
// src/commands/trim/index.ts
|
|
7689
|
-
import { readFileSync as
|
|
7941
|
+
import { readFileSync as readFileSync9, writeFileSync as writeFileSync5 } from "node:fs";
|
|
7690
7942
|
var trimCommand = command({
|
|
7691
7943
|
name: "trim",
|
|
7692
7944
|
description: "Trim evaluation results for baseline storage (strips debug/audit fields)",
|
|
@@ -7705,7 +7957,7 @@ var trimCommand = command({
|
|
|
7705
7957
|
},
|
|
7706
7958
|
handler: async ({ input, out }) => {
|
|
7707
7959
|
try {
|
|
7708
|
-
const content =
|
|
7960
|
+
const content = readFileSync9(input, "utf8");
|
|
7709
7961
|
const lines = content.trim().split("\n").filter((line) => line.trim());
|
|
7710
7962
|
const trimmedLines = lines.map((line) => {
|
|
7711
7963
|
const record = JSON.parse(line);
|
|
@@ -7812,7 +8064,7 @@ function isTTY() {
|
|
|
7812
8064
|
// src/commands/validate/validate-files.ts
|
|
7813
8065
|
import { constants } from "node:fs";
|
|
7814
8066
|
import { access, readdir as readdir4, stat } from "node:fs/promises";
|
|
7815
|
-
import
|
|
8067
|
+
import path11 from "node:path";
|
|
7816
8068
|
async function validateFiles(paths) {
|
|
7817
8069
|
const filePaths = await expandPaths(paths);
|
|
7818
8070
|
const results = [];
|
|
@@ -7830,7 +8082,7 @@ async function validateFiles(paths) {
|
|
|
7830
8082
|
};
|
|
7831
8083
|
}
|
|
7832
8084
|
async function validateSingleFile(filePath) {
|
|
7833
|
-
const absolutePath =
|
|
8085
|
+
const absolutePath = path11.resolve(filePath);
|
|
7834
8086
|
const fileType = await detectFileType(absolutePath);
|
|
7835
8087
|
let result;
|
|
7836
8088
|
if (fileType === "eval") {
|
|
@@ -7855,7 +8107,7 @@ async function validateSingleFile(filePath) {
|
|
|
7855
8107
|
async function expandPaths(paths) {
|
|
7856
8108
|
const expanded = [];
|
|
7857
8109
|
for (const inputPath of paths) {
|
|
7858
|
-
const absolutePath =
|
|
8110
|
+
const absolutePath = path11.resolve(inputPath);
|
|
7859
8111
|
try {
|
|
7860
8112
|
await access(absolutePath, constants.F_OK);
|
|
7861
8113
|
} catch {
|
|
@@ -7879,7 +8131,7 @@ async function findYamlFiles(dirPath) {
|
|
|
7879
8131
|
try {
|
|
7880
8132
|
const entries2 = await readdir4(dirPath, { withFileTypes: true });
|
|
7881
8133
|
for (const entry of entries2) {
|
|
7882
|
-
const fullPath =
|
|
8134
|
+
const fullPath = path11.join(dirPath, entry.name);
|
|
7883
8135
|
if (entry.isDirectory()) {
|
|
7884
8136
|
if (entry.name === "node_modules" || entry.name.startsWith(".")) {
|
|
7885
8137
|
continue;
|
|
@@ -7896,7 +8148,7 @@ async function findYamlFiles(dirPath) {
|
|
|
7896
8148
|
return results;
|
|
7897
8149
|
}
|
|
7898
8150
|
function isYamlFile(filePath) {
|
|
7899
|
-
const ext =
|
|
8151
|
+
const ext = path11.extname(filePath).toLowerCase();
|
|
7900
8152
|
return ext === ".yaml" || ext === ".yml";
|
|
7901
8153
|
}
|
|
7902
8154
|
|
|
@@ -7934,9 +8186,9 @@ var validateCommand = command({
|
|
|
7934
8186
|
});
|
|
7935
8187
|
|
|
7936
8188
|
// src/commands/workspace/clean.ts
|
|
7937
|
-
import { existsSync as
|
|
8189
|
+
import { existsSync as existsSync8 } from "node:fs";
|
|
7938
8190
|
import { readFile as readFile5, readdir as readdir5, rm } from "node:fs/promises";
|
|
7939
|
-
import
|
|
8191
|
+
import path12 from "node:path";
|
|
7940
8192
|
async function confirm(message) {
|
|
7941
8193
|
const readline2 = await import("node:readline");
|
|
7942
8194
|
const rl = readline2.createInterface({ input: process.stdin, output: process.stdout });
|
|
@@ -7963,7 +8215,7 @@ var cleanCommand = command({
|
|
|
7963
8215
|
},
|
|
7964
8216
|
handler: async ({ repo, force }) => {
|
|
7965
8217
|
const poolRoot = getWorkspacePoolRoot();
|
|
7966
|
-
if (!
|
|
8218
|
+
if (!existsSync8(poolRoot)) {
|
|
7967
8219
|
console.log("No workspace pool entries found.");
|
|
7968
8220
|
return;
|
|
7969
8221
|
}
|
|
@@ -7972,8 +8224,8 @@ var cleanCommand = command({
|
|
|
7972
8224
|
const poolDirs = entries2.filter((e) => e.isDirectory());
|
|
7973
8225
|
const matchingDirs = [];
|
|
7974
8226
|
for (const dir of poolDirs) {
|
|
7975
|
-
const poolDir =
|
|
7976
|
-
const metadataPath =
|
|
8227
|
+
const poolDir = path12.join(poolRoot, dir.name);
|
|
8228
|
+
const metadataPath = path12.join(poolDir, "metadata.json");
|
|
7977
8229
|
try {
|
|
7978
8230
|
const raw = await readFile5(metadataPath, "utf-8");
|
|
7979
8231
|
const metadata = JSON.parse(raw);
|
|
@@ -8004,7 +8256,7 @@ var cleanCommand = command({
|
|
|
8004
8256
|
}
|
|
8005
8257
|
for (const dir of matchingDirs) {
|
|
8006
8258
|
await rm(dir, { recursive: true, force: true });
|
|
8007
|
-
console.log(`Removed: ${
|
|
8259
|
+
console.log(`Removed: ${path12.basename(dir).slice(0, 12)}...`);
|
|
8008
8260
|
}
|
|
8009
8261
|
console.log("Done.");
|
|
8010
8262
|
} else {
|
|
@@ -8022,15 +8274,15 @@ var cleanCommand = command({
|
|
|
8022
8274
|
});
|
|
8023
8275
|
|
|
8024
8276
|
// src/commands/workspace/list.ts
|
|
8025
|
-
import { existsSync as
|
|
8277
|
+
import { existsSync as existsSync9 } from "node:fs";
|
|
8026
8278
|
import { readFile as readFile6, readdir as readdir6, stat as stat2 } from "node:fs/promises";
|
|
8027
|
-
import
|
|
8279
|
+
import path13 from "node:path";
|
|
8028
8280
|
async function getDirectorySize(dirPath) {
|
|
8029
8281
|
let totalSize = 0;
|
|
8030
8282
|
try {
|
|
8031
8283
|
const entries2 = await readdir6(dirPath, { withFileTypes: true });
|
|
8032
8284
|
for (const entry of entries2) {
|
|
8033
|
-
const fullPath =
|
|
8285
|
+
const fullPath = path13.join(dirPath, entry.name);
|
|
8034
8286
|
if (entry.isDirectory()) {
|
|
8035
8287
|
totalSize += await getDirectorySize(fullPath);
|
|
8036
8288
|
} else {
|
|
@@ -8054,7 +8306,7 @@ var listCommand = command({
|
|
|
8054
8306
|
args: {},
|
|
8055
8307
|
handler: async () => {
|
|
8056
8308
|
const poolRoot = getWorkspacePoolRoot();
|
|
8057
|
-
if (!
|
|
8309
|
+
if (!existsSync9(poolRoot)) {
|
|
8058
8310
|
console.log("No workspace pool entries found.");
|
|
8059
8311
|
return;
|
|
8060
8312
|
}
|
|
@@ -8065,11 +8317,11 @@ var listCommand = command({
|
|
|
8065
8317
|
return;
|
|
8066
8318
|
}
|
|
8067
8319
|
for (const dir of poolDirs) {
|
|
8068
|
-
const poolDir =
|
|
8320
|
+
const poolDir = path13.join(poolRoot, dir.name);
|
|
8069
8321
|
const fingerprint = dir.name;
|
|
8070
8322
|
const poolEntries = await readdir6(poolDir, { withFileTypes: true });
|
|
8071
8323
|
const slots = poolEntries.filter((e) => e.isDirectory() && e.name.startsWith("slot-"));
|
|
8072
|
-
const metadataPath =
|
|
8324
|
+
const metadataPath = path13.join(poolDir, "metadata.json");
|
|
8073
8325
|
let metadata = null;
|
|
8074
8326
|
try {
|
|
8075
8327
|
const raw = await readFile6(metadataPath, "utf-8");
|
|
@@ -8115,8 +8367,8 @@ var CHECK_INTERVAL_MS = 24 * 60 * 60 * 1e3;
|
|
|
8115
8367
|
var AGENTV_DIR = getAgentvHome();
|
|
8116
8368
|
var CACHE_FILE = "version-check.json";
|
|
8117
8369
|
var NPM_REGISTRY_URL = "https://registry.npmjs.org/agentv/latest";
|
|
8118
|
-
async function getCachedUpdateInfo(
|
|
8119
|
-
const filePath =
|
|
8370
|
+
async function getCachedUpdateInfo(path14) {
|
|
8371
|
+
const filePath = path14 ?? join5(AGENTV_DIR, CACHE_FILE);
|
|
8120
8372
|
try {
|
|
8121
8373
|
const raw = await readFile7(filePath, "utf-8");
|
|
8122
8374
|
const data = JSON.parse(raw);
|
|
@@ -8273,4 +8525,4 @@ export {
|
|
|
8273
8525
|
preprocessArgv,
|
|
8274
8526
|
runCli
|
|
8275
8527
|
};
|
|
8276
|
-
//# sourceMappingURL=chunk-
|
|
8528
|
+
//# sourceMappingURL=chunk-3NLBBQX6.js.map
|