agentv 3.10.2 → 3.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-6UE665XI.js → chunk-ETMDLQ72.js} +1141 -60
- package/dist/chunk-ETMDLQ72.js.map +1 -0
- package/dist/{chunk-KGK5NUFG.js → chunk-EZGWZVVK.js} +377 -163
- package/dist/chunk-EZGWZVVK.js.map +1 -0
- package/dist/{chunk-F7LAJMTO.js → chunk-JEW3FEO7.js} +68 -32
- package/dist/chunk-JEW3FEO7.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-3QUJEJUT.js → dist-QERRYDSC.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-EO6AR2R3.js → interactive-AD4PRYDN.js} +3 -3
- package/package.json +3 -1
- package/dist/chunk-6UE665XI.js.map +0 -1
- package/dist/chunk-F7LAJMTO.js.map +0 -1
- package/dist/chunk-KGK5NUFG.js.map +0 -1
- package/dist/templates/.agents/skills/agentv-chat-to-eval/README.md +0 -84
- package/dist/templates/.agents/skills/agentv-chat-to-eval/SKILL.md +0 -144
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-json.md +0 -67
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-markdown.md +0 -101
- package/dist/templates/.agents/skills/agentv-eval-builder/SKILL.md +0 -458
- package/dist/templates/.agents/skills/agentv-eval-builder/references/config-schema.json +0 -36
- package/dist/templates/.agents/skills/agentv-eval-builder/references/custom-evaluators.md +0 -118
- package/dist/templates/.agents/skills/agentv-eval-builder/references/eval-schema.json +0 -12753
- package/dist/templates/.agents/skills/agentv-eval-builder/references/rubric-evaluator.md +0 -77
- package/dist/templates/.agents/skills/agentv-eval-orchestrator/SKILL.md +0 -50
- package/dist/templates/.agents/skills/agentv-prompt-optimizer/SKILL.md +0 -78
- package/dist/templates/.agentv/.env.example +0 -25
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +0 -177
- package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md +0 -316
- package/dist/templates/.claude/skills/agentv-eval-builder/references/compare-command.md +0 -137
- package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +0 -215
- package/dist/templates/.claude/skills/agentv-eval-builder/references/config-schema.json +0 -27
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +0 -115
- package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +0 -278
- package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +0 -333
- package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +0 -79
- package/dist/templates/.claude/skills/agentv-eval-builder/references/structured-data-evaluators.md +0 -121
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +0 -298
- package/dist/templates/.claude/skills/agentv-prompt-optimizer/SKILL.md +0 -78
- package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +0 -5
- package/dist/templates/.github/prompts/agentv-optimize.prompt.md +0 -4
- /package/dist/{dist-3QUJEJUT.js.map → dist-QERRYDSC.js.map} +0 -0
- /package/dist/{interactive-EO6AR2R3.js.map → interactive-AD4PRYDN.js.map} +0 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
|
|
2
2
|
import {
|
|
3
3
|
HtmlWriter,
|
|
4
|
+
buildAggregateGradingArtifact,
|
|
4
5
|
buildBenchmarkArtifact,
|
|
5
6
|
buildGradingArtifact,
|
|
6
7
|
buildTimingArtifact,
|
|
@@ -10,6 +11,7 @@ import {
|
|
|
10
11
|
package_default,
|
|
11
12
|
parseJsonlResults,
|
|
12
13
|
resolveEvalPaths,
|
|
14
|
+
resolveRunCacheFile,
|
|
13
15
|
runEvalCommand,
|
|
14
16
|
selectTarget,
|
|
15
17
|
toSnakeCaseDeep,
|
|
@@ -17,7 +19,7 @@ import {
|
|
|
17
19
|
validateEvalFile,
|
|
18
20
|
validateFileReferences,
|
|
19
21
|
validateTargetsFile
|
|
20
|
-
} from "./chunk-
|
|
22
|
+
} from "./chunk-JEW3FEO7.js";
|
|
21
23
|
import {
|
|
22
24
|
createBuiltinRegistry,
|
|
23
25
|
createProvider,
|
|
@@ -35,7 +37,7 @@ import {
|
|
|
35
37
|
toSnakeCaseDeep as toSnakeCaseDeep2,
|
|
36
38
|
transpileEvalYamlFile,
|
|
37
39
|
trimBaselineResult
|
|
38
|
-
} from "./chunk-
|
|
40
|
+
} from "./chunk-EZGWZVVK.js";
|
|
39
41
|
import {
|
|
40
42
|
__commonJS,
|
|
41
43
|
__esm,
|
|
@@ -3711,7 +3713,7 @@ async function listPromptEvalTestIds(evalPath) {
|
|
|
3711
3713
|
async function getPromptEvalInput(evalPath, testId) {
|
|
3712
3714
|
const repoRoot = await findRepoRoot(process.cwd());
|
|
3713
3715
|
const evalCase = await loadTestById(evalPath, repoRoot, testId);
|
|
3714
|
-
const fileMap = buildFileMap(evalCase.
|
|
3716
|
+
const fileMap = buildFileMap(evalCase.input, evalCase.file_paths);
|
|
3715
3717
|
return {
|
|
3716
3718
|
test_id: evalCase.id,
|
|
3717
3719
|
input: resolveMessages(evalCase.input, fileMap),
|
|
@@ -3732,7 +3734,7 @@ async function getPromptEvalExpectedOutput(evalPath, testId) {
|
|
|
3732
3734
|
async function getPromptEvalGradingBrief(evalPath, testId) {
|
|
3733
3735
|
const repoRoot = await findRepoRoot(process.cwd());
|
|
3734
3736
|
const evalCase = await loadTestById(evalPath, repoRoot, testId);
|
|
3735
|
-
const fileMap = buildFileMap(evalCase.
|
|
3737
|
+
const fileMap = buildFileMap(evalCase.input, evalCase.file_paths);
|
|
3736
3738
|
const resolvedInput = resolveMessages(evalCase.input, fileMap);
|
|
3737
3739
|
const lines = [];
|
|
3738
3740
|
const inputText = extractTextFromMessages(resolvedInput);
|
|
@@ -3795,11 +3797,14 @@ function extractTextFromMessages(messages) {
|
|
|
3795
3797
|
}
|
|
3796
3798
|
return "";
|
|
3797
3799
|
}
|
|
3798
|
-
function buildFileMap(
|
|
3800
|
+
function buildFileMap(inputMessages, allFilePaths) {
|
|
3799
3801
|
const map = /* @__PURE__ */ new Map();
|
|
3800
|
-
for (const
|
|
3801
|
-
if (
|
|
3802
|
-
|
|
3802
|
+
for (const message of inputMessages) {
|
|
3803
|
+
if (!Array.isArray(message.content)) {
|
|
3804
|
+
continue;
|
|
3805
|
+
}
|
|
3806
|
+
for (const segment of message.content) {
|
|
3807
|
+
registerResolvedFileSegment(map, segment);
|
|
3803
3808
|
}
|
|
3804
3809
|
}
|
|
3805
3810
|
return {
|
|
@@ -3813,6 +3818,17 @@ function buildFileMap(inputSegments, allFilePaths) {
|
|
|
3813
3818
|
}
|
|
3814
3819
|
};
|
|
3815
3820
|
}
|
|
3821
|
+
function registerResolvedFileSegment(map, segment) {
|
|
3822
|
+
if (segment.type !== "file" || typeof segment.resolvedPath !== "string") {
|
|
3823
|
+
return;
|
|
3824
|
+
}
|
|
3825
|
+
const aliases = [segment.value, segment.path].filter(
|
|
3826
|
+
(alias) => typeof alias === "string"
|
|
3827
|
+
);
|
|
3828
|
+
for (const alias of aliases) {
|
|
3829
|
+
map.set(alias, segment.resolvedPath);
|
|
3830
|
+
}
|
|
3831
|
+
}
|
|
3816
3832
|
function resolveMessages(messages, fileMap) {
|
|
3817
3833
|
return messages.map((message) => {
|
|
3818
3834
|
if (typeof message.content === "string") {
|
|
@@ -4187,7 +4203,7 @@ var evalRunCommand = command({
|
|
|
4187
4203
|
},
|
|
4188
4204
|
handler: async (args) => {
|
|
4189
4205
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4190
|
-
const { launchInteractiveWizard } = await import("./interactive-
|
|
4206
|
+
const { launchInteractiveWizard } = await import("./interactive-AD4PRYDN.js");
|
|
4191
4207
|
await launchInteractiveWizard();
|
|
4192
4208
|
return;
|
|
4193
4209
|
}
|
|
@@ -4620,37 +4636,65 @@ function loadResultFile(filePath) {
|
|
|
4620
4636
|
});
|
|
4621
4637
|
}
|
|
4622
4638
|
function listResultFiles(cwd, limit) {
|
|
4623
|
-
const
|
|
4624
|
-
|
|
4639
|
+
const baseDir = path7.join(cwd, ".agentv", "results");
|
|
4640
|
+
const rawDir = path7.join(baseDir, "raw");
|
|
4641
|
+
const files = [];
|
|
4625
4642
|
try {
|
|
4626
|
-
|
|
4643
|
+
const entries2 = readdirSync2(rawDir, { withFileTypes: true });
|
|
4644
|
+
for (const entry of entries2) {
|
|
4645
|
+
if (entry.isDirectory()) {
|
|
4646
|
+
const jsonlPath = path7.join(rawDir, entry.name, "results.jsonl");
|
|
4647
|
+
try {
|
|
4648
|
+
statSync2(jsonlPath);
|
|
4649
|
+
files.push({ filePath: jsonlPath, displayName: entry.name });
|
|
4650
|
+
} catch {
|
|
4651
|
+
}
|
|
4652
|
+
}
|
|
4653
|
+
}
|
|
4654
|
+
for (const entry of entries2) {
|
|
4655
|
+
if (!entry.isDirectory() && entry.name.endsWith(".jsonl")) {
|
|
4656
|
+
files.push({ filePath: path7.join(rawDir, entry.name), displayName: entry.name });
|
|
4657
|
+
}
|
|
4658
|
+
}
|
|
4627
4659
|
} catch {
|
|
4628
|
-
return [];
|
|
4629
4660
|
}
|
|
4630
|
-
|
|
4631
|
-
|
|
4632
|
-
|
|
4661
|
+
try {
|
|
4662
|
+
const entries2 = readdirSync2(baseDir).filter((f) => f.endsWith(".jsonl"));
|
|
4663
|
+
for (const entry of entries2) {
|
|
4664
|
+
files.push({ filePath: path7.join(baseDir, entry), displayName: entry });
|
|
4665
|
+
}
|
|
4666
|
+
} catch {
|
|
4633
4667
|
}
|
|
4668
|
+
const seen = /* @__PURE__ */ new Set();
|
|
4669
|
+
const uniqueFiles = [];
|
|
4670
|
+
for (const file of files) {
|
|
4671
|
+
const key = file.displayName.replace(/\.jsonl$/, "");
|
|
4672
|
+
if (!seen.has(key)) {
|
|
4673
|
+
seen.add(key);
|
|
4674
|
+
uniqueFiles.push(file);
|
|
4675
|
+
}
|
|
4676
|
+
}
|
|
4677
|
+
uniqueFiles.sort((a, b) => b.displayName.localeCompare(a.displayName));
|
|
4678
|
+
const limited = limit !== void 0 && limit > 0 ? uniqueFiles.slice(0, limit) : uniqueFiles;
|
|
4634
4679
|
const metas = [];
|
|
4635
|
-
for (const
|
|
4636
|
-
const filePath = path7.join(resultsDir, filename);
|
|
4680
|
+
for (const { filePath, displayName } of limited) {
|
|
4637
4681
|
try {
|
|
4638
|
-
const
|
|
4682
|
+
const fileStat = statSync2(filePath);
|
|
4639
4683
|
const results = loadResultFile(filePath);
|
|
4640
4684
|
const testCount = results.length;
|
|
4641
4685
|
const passCount = results.filter((r) => r.score >= 1).length;
|
|
4642
4686
|
const passRate = testCount > 0 ? passCount / testCount : 0;
|
|
4643
4687
|
const avgScore = testCount > 0 ? results.reduce((sum, r) => sum + r.score, 0) / testCount : 0;
|
|
4644
|
-
const filenameTimestamp = extractTimestampFromFilename(
|
|
4688
|
+
const filenameTimestamp = extractTimestampFromFilename(displayName);
|
|
4645
4689
|
const timestamp = filenameTimestamp ?? results[0]?.timestamp ?? "unknown";
|
|
4646
4690
|
metas.push({
|
|
4647
4691
|
path: filePath,
|
|
4648
|
-
filename,
|
|
4692
|
+
filename: displayName,
|
|
4649
4693
|
timestamp,
|
|
4650
4694
|
testCount,
|
|
4651
4695
|
passRate,
|
|
4652
4696
|
avgScore,
|
|
4653
|
-
sizeBytes:
|
|
4697
|
+
sizeBytes: fileStat.size
|
|
4654
4698
|
});
|
|
4655
4699
|
} catch {
|
|
4656
4700
|
}
|
|
@@ -4704,6 +4748,12 @@ function exportResults(sourceFile, content, outputDir) {
|
|
|
4704
4748
|
const timing = buildTimingArtifact(patched);
|
|
4705
4749
|
writeFileSync3(path8.join(outputDir, "timing.json"), `${JSON.stringify(timing, null, 2)}
|
|
4706
4750
|
`);
|
|
4751
|
+
const aggregateGrading = buildAggregateGradingArtifact(patched);
|
|
4752
|
+
writeFileSync3(
|
|
4753
|
+
path8.join(outputDir, "grading.json"),
|
|
4754
|
+
`${JSON.stringify(aggregateGrading, null, 2)}
|
|
4755
|
+
`
|
|
4756
|
+
);
|
|
4707
4757
|
const gradingDir = path8.join(outputDir, "grading");
|
|
4708
4758
|
mkdirSync2(gradingDir, { recursive: true });
|
|
4709
4759
|
for (const result of patched) {
|
|
@@ -4717,18 +4767,46 @@ function exportResults(sourceFile, content, outputDir) {
|
|
|
4717
4767
|
for (const result of patched) {
|
|
4718
4768
|
if (result.output && result.output.length > 0) {
|
|
4719
4769
|
const id = safeTestId(result);
|
|
4720
|
-
|
|
4770
|
+
const md = formatOutputMarkdown(result.output);
|
|
4771
|
+
writeFileSync3(path8.join(outputsDir, `${id}.md`), md);
|
|
4772
|
+
}
|
|
4773
|
+
}
|
|
4774
|
+
const inputsDir = path8.join(outputDir, "inputs");
|
|
4775
|
+
mkdirSync2(inputsDir, { recursive: true });
|
|
4776
|
+
for (const result of patched) {
|
|
4777
|
+
const id = safeTestId(result);
|
|
4778
|
+
const input = extractInput(result);
|
|
4779
|
+
if (input) {
|
|
4780
|
+
writeFileSync3(path8.join(inputsDir, `${id}.md`), input);
|
|
4721
4781
|
}
|
|
4722
4782
|
}
|
|
4723
4783
|
}
|
|
4784
|
+
function formatOutputMarkdown(output) {
|
|
4785
|
+
return output.map((msg) => `@[${msg.role}]:
|
|
4786
|
+
${String(msg.content ?? "")}`).join("\n\n");
|
|
4787
|
+
}
|
|
4788
|
+
function extractInput(result) {
|
|
4789
|
+
const input = result.input;
|
|
4790
|
+
if (!input) return null;
|
|
4791
|
+
if (typeof input === "string") return input;
|
|
4792
|
+
if (Array.isArray(input) && input.length > 0) {
|
|
4793
|
+
return formatOutputMarkdown(input);
|
|
4794
|
+
}
|
|
4795
|
+
return null;
|
|
4796
|
+
}
|
|
4724
4797
|
function safeTestId(result) {
|
|
4725
4798
|
const raw = result.testId ?? result.evalId ?? "unknown";
|
|
4726
4799
|
return String(raw).replace(/[/\\:*?"<>|]/g, "_");
|
|
4727
4800
|
}
|
|
4728
4801
|
function deriveOutputDir(cwd, sourceFile) {
|
|
4802
|
+
const parentDir = path8.basename(path8.dirname(sourceFile));
|
|
4803
|
+
if (parentDir.startsWith("eval_")) {
|
|
4804
|
+
const dirName2 = parentDir.slice(5);
|
|
4805
|
+
return path8.join(cwd, ".agentv", "results", "export", dirName2);
|
|
4806
|
+
}
|
|
4729
4807
|
const basename = path8.basename(sourceFile, ".jsonl");
|
|
4730
4808
|
const dirName = basename.startsWith("eval_") ? basename.slice(5) : basename;
|
|
4731
|
-
return path8.join(cwd, ".agentv", "results", dirName);
|
|
4809
|
+
return path8.join(cwd, ".agentv", "results", "export", dirName);
|
|
4732
4810
|
}
|
|
4733
4811
|
var resultsExportCommand = command({
|
|
4734
4812
|
name: "export",
|
|
@@ -4743,7 +4821,7 @@ var resultsExportCommand = command({
|
|
|
4743
4821
|
type: optional(string),
|
|
4744
4822
|
long: "out",
|
|
4745
4823
|
short: "o",
|
|
4746
|
-
description: "Output directory (defaults to .agentv/results/<run-timestamp>/)"
|
|
4824
|
+
description: "Output directory (defaults to .agentv/results/export/<run-timestamp>/)"
|
|
4747
4825
|
}),
|
|
4748
4826
|
dir: option({
|
|
4749
4827
|
type: optional(string),
|
|
@@ -4760,8 +4838,9 @@ var resultsExportCommand = command({
|
|
|
4760
4838
|
sourceFile = path8.isAbsolute(source) ? source : path8.resolve(cwd, source);
|
|
4761
4839
|
} else {
|
|
4762
4840
|
const cache = await loadRunCache(cwd);
|
|
4763
|
-
|
|
4764
|
-
|
|
4841
|
+
const cachedFile = cache ? resolveRunCacheFile(cache) : "";
|
|
4842
|
+
if (cachedFile && existsSync2(cachedFile)) {
|
|
4843
|
+
sourceFile = cachedFile;
|
|
4765
4844
|
} else {
|
|
4766
4845
|
const metas = listResultFiles(cwd, 1);
|
|
4767
4846
|
if (metas.length === 0) {
|
|
@@ -4788,12 +4867,1013 @@ var resultsExportCommand = command({
|
|
|
4788
4867
|
}
|
|
4789
4868
|
});
|
|
4790
4869
|
|
|
4870
|
+
// src/commands/results/shared.ts
|
|
4871
|
+
import { existsSync as existsSync3, readFileSync as readFileSync7 } from "node:fs";
|
|
4872
|
+
import path9 from "node:path";
|
|
4873
|
+
var sourceArg = positional({
|
|
4874
|
+
type: optional(string),
|
|
4875
|
+
displayName: "source",
|
|
4876
|
+
description: "JSONL result file (defaults to most recent in .agentv/results/)"
|
|
4877
|
+
});
|
|
4878
|
+
async function resolveSourceFile(source, cwd) {
|
|
4879
|
+
let sourceFile;
|
|
4880
|
+
if (source) {
|
|
4881
|
+
sourceFile = path9.isAbsolute(source) ? source : path9.resolve(cwd, source);
|
|
4882
|
+
if (!existsSync3(sourceFile)) {
|
|
4883
|
+
console.error(`Error: File not found: ${sourceFile}`);
|
|
4884
|
+
process.exit(1);
|
|
4885
|
+
}
|
|
4886
|
+
} else {
|
|
4887
|
+
const cache = await loadRunCache(cwd);
|
|
4888
|
+
const cachedFile = cache ? resolveRunCacheFile(cache) : "";
|
|
4889
|
+
if (cachedFile && existsSync3(cachedFile)) {
|
|
4890
|
+
sourceFile = cachedFile;
|
|
4891
|
+
} else {
|
|
4892
|
+
const metas = listResultFiles(cwd, 1);
|
|
4893
|
+
if (metas.length === 0) {
|
|
4894
|
+
console.error("Error: No result files found in .agentv/results/");
|
|
4895
|
+
console.error("Run an evaluation first: agentv eval <eval-file>");
|
|
4896
|
+
process.exit(1);
|
|
4897
|
+
}
|
|
4898
|
+
sourceFile = metas[0].path;
|
|
4899
|
+
}
|
|
4900
|
+
}
|
|
4901
|
+
const content = readFileSync7(sourceFile, "utf8");
|
|
4902
|
+
return { sourceFile, content };
|
|
4903
|
+
}
|
|
4904
|
+
async function loadResults(source, cwd) {
|
|
4905
|
+
const { sourceFile, content } = await resolveSourceFile(source, cwd);
|
|
4906
|
+
const results = parseJsonlResults(content);
|
|
4907
|
+
if (results.length === 0) {
|
|
4908
|
+
console.error(`No results found in ${sourceFile}`);
|
|
4909
|
+
process.exit(1);
|
|
4910
|
+
}
|
|
4911
|
+
return { results: patchTestIds(results), sourceFile };
|
|
4912
|
+
}
|
|
4913
|
+
function patchTestIds(results) {
|
|
4914
|
+
return results.map((r) => {
|
|
4915
|
+
if (!r.testId && r.evalId) {
|
|
4916
|
+
return { ...r, testId: String(r.evalId) };
|
|
4917
|
+
}
|
|
4918
|
+
return r;
|
|
4919
|
+
});
|
|
4920
|
+
}
|
|
4921
|
+
|
|
4922
|
+
// src/commands/results/failures.ts
|
|
4923
|
+
function formatFailures(results) {
|
|
4924
|
+
return results.filter((r) => r.score < 1).map((r) => {
|
|
4925
|
+
let assertions = (r.assertions ?? []).map((a) => ({
|
|
4926
|
+
text: a.text,
|
|
4927
|
+
passed: a.passed,
|
|
4928
|
+
evidence: a.evidence
|
|
4929
|
+
}));
|
|
4930
|
+
if (assertions.length === 0 && r.scores) {
|
|
4931
|
+
assertions = r.scores.flatMap(
|
|
4932
|
+
(s) => (s.assertions ?? []).map((a) => ({
|
|
4933
|
+
text: a.text,
|
|
4934
|
+
passed: a.passed,
|
|
4935
|
+
evidence: a.evidence
|
|
4936
|
+
}))
|
|
4937
|
+
);
|
|
4938
|
+
}
|
|
4939
|
+
return { test_id: r.testId, score: r.score, assertions };
|
|
4940
|
+
});
|
|
4941
|
+
}
|
|
4942
|
+
var resultsFailuresCommand = command({
|
|
4943
|
+
name: "failures",
|
|
4944
|
+
description: "Show only failed tests with assertion evidence",
|
|
4945
|
+
args: {
|
|
4946
|
+
source: sourceArg,
|
|
4947
|
+
dir: option({
|
|
4948
|
+
type: optional(string),
|
|
4949
|
+
long: "dir",
|
|
4950
|
+
short: "d",
|
|
4951
|
+
description: "Working directory (default: current directory)"
|
|
4952
|
+
})
|
|
4953
|
+
},
|
|
4954
|
+
handler: async ({ source, dir }) => {
|
|
4955
|
+
const cwd = dir ?? process.cwd();
|
|
4956
|
+
try {
|
|
4957
|
+
const { results } = await loadResults(source, cwd);
|
|
4958
|
+
console.log(JSON.stringify(formatFailures(results), null, 2));
|
|
4959
|
+
} catch (error) {
|
|
4960
|
+
console.error(`Error: ${error.message}`);
|
|
4961
|
+
process.exit(1);
|
|
4962
|
+
}
|
|
4963
|
+
}
|
|
4964
|
+
});
|
|
4965
|
+
|
|
4966
|
+
// src/commands/results/show.ts
|
|
4967
|
+
function findResult(results, testId) {
|
|
4968
|
+
return results.find((r) => r.testId === testId);
|
|
4969
|
+
}
|
|
4970
|
+
function formatInput(result) {
|
|
4971
|
+
const input = result.input;
|
|
4972
|
+
if (!input) return "(no input)";
|
|
4973
|
+
if (typeof input === "string") return input;
|
|
4974
|
+
if (Array.isArray(input)) {
|
|
4975
|
+
return input.map((msg) => String(msg.content ?? "")).join("\n");
|
|
4976
|
+
}
|
|
4977
|
+
return "(no input)";
|
|
4978
|
+
}
|
|
4979
|
+
function formatOutput(result) {
|
|
4980
|
+
if (!result.output || result.output.length === 0) return "(no output)";
|
|
4981
|
+
return result.output.map((msg) => String(msg.content ?? "")).join("\n");
|
|
4982
|
+
}
|
|
4983
|
+
function formatShow(result) {
|
|
4984
|
+
const usage = result.tokenUsage;
|
|
4985
|
+
let allAssertions = (result.assertions ?? []).map((a) => ({
|
|
4986
|
+
text: a.text,
|
|
4987
|
+
passed: a.passed,
|
|
4988
|
+
evidence: a.evidence
|
|
4989
|
+
}));
|
|
4990
|
+
if (allAssertions.length === 0 && result.scores) {
|
|
4991
|
+
allAssertions = result.scores.flatMap(
|
|
4992
|
+
(s) => (s.assertions ?? []).map((a) => ({
|
|
4993
|
+
text: a.text,
|
|
4994
|
+
passed: a.passed,
|
|
4995
|
+
evidence: a.evidence
|
|
4996
|
+
}))
|
|
4997
|
+
);
|
|
4998
|
+
}
|
|
4999
|
+
const totalTokens = usage ? (usage.input ?? 0) + (usage.output ?? 0) : void 0;
|
|
5000
|
+
return {
|
|
5001
|
+
test_id: result.testId,
|
|
5002
|
+
score: result.score,
|
|
5003
|
+
duration_ms: result.durationMs,
|
|
5004
|
+
total_tokens: totalTokens,
|
|
5005
|
+
input: formatInput(result),
|
|
5006
|
+
assertions: allAssertions,
|
|
5007
|
+
response: formatOutput(result)
|
|
5008
|
+
};
|
|
5009
|
+
}
|
|
5010
|
+
var resultsShowCommand = command({
|
|
5011
|
+
name: "show",
|
|
5012
|
+
description: "Show full detail for a single test result",
|
|
5013
|
+
args: {
|
|
5014
|
+
source: sourceArg,
|
|
5015
|
+
testId: option({
|
|
5016
|
+
type: string,
|
|
5017
|
+
long: "test-id",
|
|
5018
|
+
short: "t",
|
|
5019
|
+
description: "Test ID to display"
|
|
5020
|
+
}),
|
|
5021
|
+
dir: option({
|
|
5022
|
+
type: optional(string),
|
|
5023
|
+
long: "dir",
|
|
5024
|
+
short: "d",
|
|
5025
|
+
description: "Working directory (default: current directory)"
|
|
5026
|
+
})
|
|
5027
|
+
},
|
|
5028
|
+
handler: async ({ source, testId, dir }) => {
|
|
5029
|
+
const cwd = dir ?? process.cwd();
|
|
5030
|
+
try {
|
|
5031
|
+
const { results } = await loadResults(source, cwd);
|
|
5032
|
+
const result = findResult(results, testId);
|
|
5033
|
+
if (!result) {
|
|
5034
|
+
const available = results.map((r) => r.testId).join(", ");
|
|
5035
|
+
console.error(`Error: Test ID "${testId}" not found.`);
|
|
5036
|
+
console.error(`Available test IDs: ${available}`);
|
|
5037
|
+
process.exit(1);
|
|
5038
|
+
}
|
|
5039
|
+
console.log(JSON.stringify(formatShow(result), null, 2));
|
|
5040
|
+
} catch (error) {
|
|
5041
|
+
console.error(`Error: ${error.message}`);
|
|
5042
|
+
process.exit(1);
|
|
5043
|
+
}
|
|
5044
|
+
}
|
|
5045
|
+
});
|
|
5046
|
+
|
|
5047
|
+
// src/commands/results/summary.ts
|
|
5048
|
+
import { existsSync as existsSync4, readFileSync as readFileSync8 } from "node:fs";
|
|
5049
|
+
function formatSummary(results, grading) {
|
|
5050
|
+
const total = results.length;
|
|
5051
|
+
let passed;
|
|
5052
|
+
let failed;
|
|
5053
|
+
let passRate;
|
|
5054
|
+
if (grading) {
|
|
5055
|
+
passed = grading.summary.passed;
|
|
5056
|
+
failed = grading.summary.failed;
|
|
5057
|
+
passRate = grading.summary.pass_rate;
|
|
5058
|
+
} else {
|
|
5059
|
+
passed = results.filter((r) => r.score >= 1).length;
|
|
5060
|
+
failed = total - passed;
|
|
5061
|
+
passRate = total > 0 ? Math.round(results.reduce((s, r) => s + r.score, 0) / total * 1e3) / 1e3 : 0;
|
|
5062
|
+
}
|
|
5063
|
+
let totalDurationMs = 0;
|
|
5064
|
+
let totalTokens = 0;
|
|
5065
|
+
for (const r of results) {
|
|
5066
|
+
if (r.durationMs != null) totalDurationMs += r.durationMs;
|
|
5067
|
+
const usage = r.tokenUsage;
|
|
5068
|
+
if (usage) totalTokens += (usage.input ?? 0) + (usage.output ?? 0);
|
|
5069
|
+
}
|
|
5070
|
+
const failedTestIds = results.filter((r) => r.score < 1).map((r) => r.testId);
|
|
5071
|
+
return {
|
|
5072
|
+
total,
|
|
5073
|
+
passed,
|
|
5074
|
+
failed,
|
|
5075
|
+
pass_rate: { mean: passRate },
|
|
5076
|
+
total_duration_ms: totalDurationMs,
|
|
5077
|
+
total_tokens: totalTokens,
|
|
5078
|
+
failed_test_ids: failedTestIds
|
|
5079
|
+
};
|
|
5080
|
+
}
|
|
5081
|
+
var resultsSummaryCommand = command({
|
|
5082
|
+
name: "summary",
|
|
5083
|
+
description: "Show compact pass/fail summary of eval results",
|
|
5084
|
+
args: {
|
|
5085
|
+
source: sourceArg,
|
|
5086
|
+
dir: option({
|
|
5087
|
+
type: optional(string),
|
|
5088
|
+
long: "dir",
|
|
5089
|
+
short: "d",
|
|
5090
|
+
description: "Working directory (default: current directory)"
|
|
5091
|
+
})
|
|
5092
|
+
},
|
|
5093
|
+
handler: async ({ source, dir }) => {
|
|
5094
|
+
const cwd = dir ?? process.cwd();
|
|
5095
|
+
try {
|
|
5096
|
+
const { results, sourceFile } = await loadResults(source, cwd);
|
|
5097
|
+
let grading;
|
|
5098
|
+
const gradingPath = sourceFile.replace(/\.jsonl$/, ".grading.json");
|
|
5099
|
+
if (existsSync4(gradingPath)) {
|
|
5100
|
+
try {
|
|
5101
|
+
grading = JSON.parse(readFileSync8(gradingPath, "utf8"));
|
|
5102
|
+
} catch {
|
|
5103
|
+
}
|
|
5104
|
+
}
|
|
5105
|
+
console.log(JSON.stringify(formatSummary(results, grading), null, 2));
|
|
5106
|
+
} catch (error) {
|
|
5107
|
+
console.error(`Error: ${error.message}`);
|
|
5108
|
+
process.exit(1);
|
|
5109
|
+
}
|
|
5110
|
+
}
|
|
5111
|
+
});
|
|
5112
|
+
|
|
4791
5113
|
// src/commands/results/index.ts
|
|
4792
5114
|
var resultsCommand = subcommands({
|
|
4793
5115
|
name: "results",
|
|
4794
5116
|
description: "Inspect, export, and manage evaluation results",
|
|
4795
5117
|
cmds: {
|
|
4796
|
-
export: resultsExportCommand
|
|
5118
|
+
export: resultsExportCommand,
|
|
5119
|
+
summary: resultsSummaryCommand,
|
|
5120
|
+
failures: resultsFailuresCommand,
|
|
5121
|
+
show: resultsShowCommand
|
|
5122
|
+
}
|
|
5123
|
+
});
|
|
5124
|
+
|
|
5125
|
+
// src/commands/results/serve.ts
|
|
5126
|
+
import { existsSync as existsSync5, readFileSync as readFileSync9, writeFileSync as writeFileSync4 } from "node:fs";
|
|
5127
|
+
import path10 from "node:path";
|
|
5128
|
+
import { Hono } from "hono";
|
|
5129
|
+
async function resolveSourceFile2(source, cwd) {
|
|
5130
|
+
if (source) {
|
|
5131
|
+
const resolved = path10.isAbsolute(source) ? source : path10.resolve(cwd, source);
|
|
5132
|
+
if (!existsSync5(resolved)) {
|
|
5133
|
+
throw new Error(`Source file not found: ${resolved}`);
|
|
5134
|
+
}
|
|
5135
|
+
return resolved;
|
|
5136
|
+
}
|
|
5137
|
+
const cache = await loadRunCache(cwd);
|
|
5138
|
+
const cachedFile = cache ? resolveRunCacheFile(cache) : "";
|
|
5139
|
+
if (cachedFile && existsSync5(cachedFile)) {
|
|
5140
|
+
return cachedFile;
|
|
5141
|
+
}
|
|
5142
|
+
const metas = listResultFiles(cwd, 10);
|
|
5143
|
+
if (metas.length === 0) {
|
|
5144
|
+
throw new Error(
|
|
5145
|
+
"No result files found in .agentv/results/\nRun an evaluation first: agentv eval <eval-file>"
|
|
5146
|
+
);
|
|
5147
|
+
}
|
|
5148
|
+
if (metas.length > 1) {
|
|
5149
|
+
console.log("Available result files:");
|
|
5150
|
+
for (const m of metas) {
|
|
5151
|
+
console.log(` ${m.path}`);
|
|
5152
|
+
}
|
|
5153
|
+
console.log(`
|
|
5154
|
+
Serving most recent: ${metas[0].path}
|
|
5155
|
+
`);
|
|
5156
|
+
}
|
|
5157
|
+
return metas[0].path;
|
|
5158
|
+
}
|
|
5159
|
+
function loadResults2(content) {
|
|
5160
|
+
const results = parseJsonlResults(content);
|
|
5161
|
+
if (results.length === 0) {
|
|
5162
|
+
throw new Error("No valid results found in JSONL content");
|
|
5163
|
+
}
|
|
5164
|
+
return results.map((r) => {
|
|
5165
|
+
if (!r.testId && r.evalId) {
|
|
5166
|
+
return { ...r, testId: String(r.evalId) };
|
|
5167
|
+
}
|
|
5168
|
+
return r;
|
|
5169
|
+
});
|
|
5170
|
+
}
|
|
5171
|
+
function feedbackPath(cwd) {
|
|
5172
|
+
return path10.join(cwd, "feedback.json");
|
|
5173
|
+
}
|
|
5174
|
+
function readFeedback(cwd) {
|
|
5175
|
+
const fp = feedbackPath(cwd);
|
|
5176
|
+
if (!existsSync5(fp)) {
|
|
5177
|
+
return { reviews: [] };
|
|
5178
|
+
}
|
|
5179
|
+
try {
|
|
5180
|
+
return JSON.parse(readFileSync9(fp, "utf8"));
|
|
5181
|
+
} catch (err2) {
|
|
5182
|
+
console.error(`Warning: could not parse ${fp}, starting fresh: ${err2.message}`);
|
|
5183
|
+
return { reviews: [] };
|
|
5184
|
+
}
|
|
5185
|
+
}
|
|
5186
|
+
function writeFeedback(cwd, data) {
|
|
5187
|
+
writeFileSync4(feedbackPath(cwd), `${JSON.stringify(data, null, 2)}
|
|
5188
|
+
`, "utf8");
|
|
5189
|
+
}
|
|
5190
|
+
function createApp(results, cwd) {
|
|
5191
|
+
const app2 = new Hono();
|
|
5192
|
+
app2.get("/", (c3) => {
|
|
5193
|
+
return c3.html(generateServeHtml(results));
|
|
5194
|
+
});
|
|
5195
|
+
app2.get("/api/feedback", (c3) => {
|
|
5196
|
+
const data = readFeedback(cwd);
|
|
5197
|
+
return c3.json(data);
|
|
5198
|
+
});
|
|
5199
|
+
app2.post("/api/feedback", async (c3) => {
|
|
5200
|
+
let body;
|
|
5201
|
+
try {
|
|
5202
|
+
body = await c3.req.json();
|
|
5203
|
+
} catch {
|
|
5204
|
+
return c3.json({ error: "Invalid JSON" }, 400);
|
|
5205
|
+
}
|
|
5206
|
+
if (!body || typeof body !== "object") {
|
|
5207
|
+
return c3.json({ error: "Invalid payload" }, 400);
|
|
5208
|
+
}
|
|
5209
|
+
const payload = body;
|
|
5210
|
+
if (!Array.isArray(payload.reviews)) {
|
|
5211
|
+
return c3.json({ error: "Missing reviews array" }, 400);
|
|
5212
|
+
}
|
|
5213
|
+
const incoming = payload.reviews;
|
|
5214
|
+
for (const review of incoming) {
|
|
5215
|
+
if (typeof review.test_id !== "string" || typeof review.comment !== "string") {
|
|
5216
|
+
return c3.json({ error: "Each review must have test_id and comment strings" }, 400);
|
|
5217
|
+
}
|
|
5218
|
+
}
|
|
5219
|
+
const existing = readFeedback(cwd);
|
|
5220
|
+
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
5221
|
+
for (const review of incoming) {
|
|
5222
|
+
const newReview = {
|
|
5223
|
+
test_id: review.test_id,
|
|
5224
|
+
comment: review.comment,
|
|
5225
|
+
updated_at: now
|
|
5226
|
+
};
|
|
5227
|
+
const idx = existing.reviews.findIndex((r) => r.test_id === newReview.test_id);
|
|
5228
|
+
if (idx >= 0) {
|
|
5229
|
+
existing.reviews[idx] = newReview;
|
|
5230
|
+
} else {
|
|
5231
|
+
existing.reviews.push(newReview);
|
|
5232
|
+
}
|
|
5233
|
+
}
|
|
5234
|
+
writeFeedback(cwd, existing);
|
|
5235
|
+
return c3.json(existing);
|
|
5236
|
+
});
|
|
5237
|
+
return app2;
|
|
5238
|
+
}
|
|
5239
|
+
function escapeHtml(s) {
|
|
5240
|
+
return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
|
|
5241
|
+
}
|
|
5242
|
+
function generateServeHtml(results) {
|
|
5243
|
+
const lightResults = results.map((r) => {
|
|
5244
|
+
const { requests, trace, ...rest } = r;
|
|
5245
|
+
const toolCalls = trace?.toolCalls && Object.keys(trace.toolCalls).length > 0 ? trace.toolCalls : void 0;
|
|
5246
|
+
const graderDurationMs = (r.scores ?? []).reduce((sum, s) => sum + (s.durationMs ?? 0), 0);
|
|
5247
|
+
return {
|
|
5248
|
+
...rest,
|
|
5249
|
+
...toolCalls && { _toolCalls: toolCalls },
|
|
5250
|
+
...graderDurationMs > 0 && { _graderDurationMs: graderDurationMs }
|
|
5251
|
+
};
|
|
5252
|
+
});
|
|
5253
|
+
const dataJson = JSON.stringify(lightResults).replace(/</g, "\\u003c").replace(/>/g, "\\u003e").replace(/\u2028/g, "\\u2028").replace(/\u2029/g, "\\u2029");
|
|
5254
|
+
return `<!DOCTYPE html>
|
|
5255
|
+
<html lang="en">
|
|
5256
|
+
<head>
|
|
5257
|
+
<meta charset="utf-8">
|
|
5258
|
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
5259
|
+
<title>AgentV Results Review</title>
|
|
5260
|
+
<style>
|
|
5261
|
+
${SERVE_STYLES}
|
|
5262
|
+
</style>
|
|
5263
|
+
</head>
|
|
5264
|
+
<body>
|
|
5265
|
+
<header class="header">
|
|
5266
|
+
<div class="header-left">
|
|
5267
|
+
<h1 class="header-title">AgentV</h1>
|
|
5268
|
+
<span class="header-subtitle">Results Review</span>
|
|
5269
|
+
</div>
|
|
5270
|
+
<div class="header-right">
|
|
5271
|
+
<span class="timestamp">${escapeHtml((/* @__PURE__ */ new Date()).toISOString())}</span>
|
|
5272
|
+
</div>
|
|
5273
|
+
</header>
|
|
5274
|
+
<nav class="tabs" id="tabs">
|
|
5275
|
+
<button class="tab active" data-tab="overview">Overview</button>
|
|
5276
|
+
<button class="tab" data-tab="tests">Test Cases</button>
|
|
5277
|
+
</nav>
|
|
5278
|
+
<main id="app"></main>
|
|
5279
|
+
<script>
|
|
5280
|
+
var DATA = ${dataJson};
|
|
5281
|
+
${SERVE_SCRIPT}
|
|
5282
|
+
</script>
|
|
5283
|
+
</body>
|
|
5284
|
+
</html>`;
|
|
5285
|
+
}
|
|
5286
|
+
/**
 * Stylesheet inlined into the dashboard page's <style> element.
 *
 * The histogram bars are rendered as `<div class="hist-bar score-*">`, so the
 * `.hist-bar.score-*` rules below give them a background; the bare `.score-*`
 * classes only set text color, which would leave the bars invisible.
 */
var SERVE_STYLES = `
*{margin:0;padding:0;box-sizing:border-box}
:root{
--bg:#f6f8fa;--surface:#fff;--border:#d0d7de;--border-light:#e8ebee;
--text:#1f2328;--text-muted:#656d76;
--primary:#0969da;--primary-bg:#ddf4ff;
--success:#1a7f37;--success-bg:#dafbe1;
--danger:#cf222e;--danger-bg:#ffebe9;
--warning:#9a6700;--warning-bg:#fff8c5;
--radius:6px;
--shadow:0 1px 3px rgba(31,35,40,.04),0 1px 2px rgba(31,35,40,.06);
--font:-apple-system,BlinkMacSystemFont,"Segoe UI","Noto Sans",Helvetica,Arial,sans-serif;
--mono:ui-monospace,SFMono-Regular,"SF Mono",Menlo,Consolas,monospace;
}
body{font-family:var(--font);background:var(--bg);color:var(--text);line-height:1.5;font-size:14px}

/* Header */
.header{background:var(--surface);border-bottom:1px solid var(--border);padding:12px 24px;display:flex;align-items:center;justify-content:space-between}
.header-left{display:flex;align-items:baseline;gap:12px}
.header-title{font-size:18px;font-weight:600}
.header-subtitle{font-size:14px;color:var(--text-muted)}
.timestamp{font-size:12px;color:var(--text-muted);font-family:var(--mono)}

/* Tabs */
.tabs{background:var(--surface);border-bottom:1px solid var(--border);padding:0 24px;display:flex}
.tab{background:none;border:none;padding:10px 16px;font-size:14px;color:var(--text-muted);cursor:pointer;border-bottom:2px solid transparent;font-family:var(--font);transition:color .15s,border-color .15s}
.tab:hover{color:var(--text)}
.tab.active{color:var(--text);font-weight:600;border-bottom-color:var(--primary)}

#app{max-width:1280px;margin:0 auto;padding:24px}

/* Stat cards */
.stats-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));gap:12px;margin-bottom:24px}
.stat-card{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:16px;text-align:center;box-shadow:var(--shadow)}
.stat-card.pass .stat-value{color:var(--success)}
.stat-card.fail .stat-value{color:var(--danger)}
.stat-card.error .stat-value{color:var(--danger)}
.stat-card.warn .stat-value{color:var(--warning)}
.stat-card.total .stat-value{color:var(--primary)}
.stat-value{font-size:28px;font-weight:700;line-height:1.2}
.stat-label{font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.5px;margin-top:4px}

/* Sections */
.section{margin-bottom:24px}
.section-title{font-size:16px;font-weight:600;margin-bottom:12px}

/* Tables */
.table-wrap{overflow-x:auto;background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);box-shadow:var(--shadow)}
.data-table{width:100%;border-collapse:collapse;font-size:13px}
.data-table th{background:var(--bg);border-bottom:1px solid var(--border);padding:8px 12px;text-align:left;font-weight:600;font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.3px;white-space:nowrap}
.data-table th.sortable{cursor:pointer;user-select:none}
.data-table th.sortable:hover{color:var(--text)}
.data-table td{padding:8px 12px;border-bottom:1px solid var(--border-light);vertical-align:middle}
.data-table tbody tr:last-child td{border-bottom:none}

/* Status icons */
.status-icon{display:inline-flex;align-items:center;justify-content:center;width:22px;height:22px;border-radius:50%;font-size:12px;font-weight:700}
.status-icon.pass{background:var(--success-bg);color:var(--success)}
.status-icon.fail{background:var(--danger-bg);color:var(--danger)}
.status-icon.error{background:var(--warning-bg);color:var(--warning)}

/* Score colors */
.score-high{color:var(--success);font-weight:600}
.score-mid{color:var(--warning);font-weight:600}
.score-low{color:var(--danger);font-weight:600}

/* Pass-rate bar */
.bar-bg{width:100px;height:8px;background:var(--border-light);border-radius:4px;overflow:hidden}
.bar-fill{height:100%;border-radius:4px;transition:width .3s}
.bar-fill.score-high{background:var(--success)}
.bar-fill.score-mid{background:var(--warning)}
.bar-fill.score-low{background:var(--danger)}

/* Histogram */
.histogram{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:16px;box-shadow:var(--shadow)}
.hist-row{display:flex;align-items:center;gap:12px;margin-bottom:8px}
.hist-row:last-child{margin-bottom:0}
.hist-label{width:60px;font-size:12px;color:var(--text-muted);text-align:right;flex-shrink:0}
.hist-bar-bg{flex:1;height:20px;background:var(--border-light);border-radius:3px;overflow:hidden}
.hist-bar{height:100%;border-radius:3px;transition:width .3s}
.hist-bar.score-high{background:var(--success)}
.hist-bar.score-mid{background:var(--warning)}
.hist-bar.score-low{background:var(--danger)}
.hist-count{width:30px;font-size:12px;color:var(--text-muted);text-align:right;flex-shrink:0}

/* Filters */
.filter-bar{display:flex;gap:8px;margin-bottom:16px;align-items:center;flex-wrap:wrap}
.filter-select,.filter-search{padding:6px 10px;border:1px solid var(--border);border-radius:var(--radius);font-size:13px;background:var(--surface);color:var(--text);font-family:var(--font)}
.filter-search{flex:1;min-width:200px}
.filter-count{font-size:12px;color:var(--text-muted);margin-left:auto}

/* Test rows */
.test-row{cursor:pointer;transition:background .1s}
.test-row:hover{background:var(--bg)!important}
.test-row.expanded{background:var(--primary-bg)!important}
.expand-col{width:32px;text-align:center}
.expand-icon{color:var(--text-muted);font-size:12px}
.fw-medium{font-weight:500}
.text-pass{color:var(--success)}.text-fail{color:var(--danger)}.text-error{color:var(--warning)}

/* Detail panel */
.detail-row td{padding:0!important;background:var(--bg)!important}
.detail-panel{padding:16px 24px}
.detail-grid{display:grid;grid-template-columns:1fr 1fr;gap:16px;margin-bottom:16px}
.detail-block h4{font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.3px;margin-bottom:6px}
.detail-pre{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:12px;font-family:var(--mono);font-size:12px;white-space:pre-wrap;word-break:break-word;max-height:300px;overflow-y:auto;line-height:1.6}
.detail-panel h4{font-size:13px;font-weight:600;margin:16px 0 8px}
.eval-table{width:100%;border-collapse:collapse;font-size:13px;background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);margin-bottom:12px}
.eval-table th{background:var(--bg);padding:6px 10px;text-align:left;font-size:11px;font-weight:600;color:var(--text-muted);text-transform:uppercase;border-bottom:1px solid var(--border)}
.eval-table td{padding:8px 10px;border-bottom:1px solid var(--border-light)}
.reasoning-cell{max-width:500px;font-size:12px;color:var(--text-muted)}
.expect-list{list-style:none;padding:0;margin-bottom:12px}
.expect-list li{padding:4px 8px 4px 24px;position:relative;font-size:13px}
.expect-list.pass li::before{content:"\\2713";position:absolute;left:4px;color:var(--success);font-weight:700}
.expect-list.fail li::before{content:"\\2717";position:absolute;left:4px;color:var(--danger);font-weight:700}
.error-box{background:var(--danger-bg);border:1px solid var(--danger);border-radius:var(--radius);padding:12px;margin-bottom:12px}
.error-box h4{color:var(--danger);margin:0 0 6px}
.error-box pre{font-family:var(--mono);font-size:12px;white-space:pre-wrap;word-break:break-word}
.detail-meta{font-size:12px;color:var(--text-muted);margin-top:12px;padding-top:12px;border-top:1px solid var(--border-light)}
.tool-calls{display:flex;flex-wrap:wrap;gap:6px;margin-bottom:12px}
.tool-tag{display:inline-block;padding:2px 10px;font-size:12px;font-family:var(--mono);background:var(--primary-bg);color:var(--primary);border:1px solid var(--border);border-radius:12px}
.empty-state{text-align:center;padding:48px 24px;color:var(--text-muted)}
.empty-state h3{font-size:16px;margin-bottom:8px;color:var(--text)}

/* Feedback */
.feedback-section{margin-top:16px;padding-top:16px;border-top:1px solid var(--border-light)}
.feedback-input{width:100%;min-height:80px;padding:8px 12px;border:1px solid var(--border);border-radius:var(--radius);font-family:var(--font);font-size:13px;resize:vertical;background:var(--surface);color:var(--text)}
.feedback-input:focus{outline:none;border-color:var(--primary);box-shadow:0 0 0 3px var(--primary-bg)}
.feedback-submit{margin-top:8px;padding:6px 16px;background:var(--primary);color:#fff;border:none;border-radius:var(--radius);font-size:13px;cursor:pointer;font-family:var(--font)}
.feedback-submit:hover{opacity:.9}
.feedback-submit:disabled{opacity:.5;cursor:default}
.feedback-status{margin-left:8px;font-size:12px;color:var(--success)}
`;
|
|
5416
|
+
/**
 * Client-side script inlined into the dashboard page after `var DATA = ...;`.
 *
 * It is a single IIFE that reads the global `DATA` array, renders the
 * "Overview" and "Test Cases" tabs into `#app`, and loads/saves reviewer
 * feedback through `/api/feedback`. The script is written in ES5 style and
 * emitted verbatim, so backslashes that must reach the browser are doubled
 * here (e.g. `\\u2713`).
 */
var SERVE_SCRIPT = `
(function(){
  /* ---- helpers ---- */
  /* HTML-escape a value for element content and double-quoted attributes. */
  function esc(s){
    if(s==null)return"";
    return String(s).replace(/&/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;").replace(/"/g,"&quot;");
  }
  function getStatus(r){
    if(r.executionStatus==="execution_error")return"error";
    if(r.executionStatus==="quality_failure")return"fail";
    if(r.executionStatus==="ok")return"pass";
    if(r.error)return"error";
    return r.score>=0.5?"pass":"fail";
  }
  function sIcon(s){
    if(s==="pass")return'<span class="status-icon pass">\\u2713</span>';
    if(s==="fail")return'<span class="status-icon fail">\\u2717</span>';
    return'<span class="status-icon error">!</span>';
  }
  function fmtDur(ms){
    if(ms==null)return"\\u2014";
    if(ms<1000)return ms+"ms";
    if(ms<60000)return(ms/1000).toFixed(1)+"s";
    /* Round to whole seconds first so the seconds part can never show 60. */
    var secs=Math.round(ms/1000);
    return Math.floor(secs/60)+"m "+(secs%60)+"s";
  }
  function fmtTok(n){
    if(n==null)return"\\u2014";
    if(n>=1e6)return(n/1e6).toFixed(1)+"M";
    if(n>=1e3)return(n/1e3).toFixed(1)+"K";
    return String(n);
  }
  function fmtCost(u){if(u==null)return"\\u2014";if(u<0.01)return"<$0.01";return"$"+u.toFixed(2);}
  function fmtPct(v){if(v==null)return"\\u2014";return(v*100).toFixed(1)+"%";}
  function sCls(v){if(v==null)return"";if(v>=0.9)return"score-high";if(v>=0.5)return"score-mid";return"score-low";}

  /* ---- feedback state ---- */
  var feedbackCache={};

  function loadFeedback(){
    /* Best effort: the dashboard stays usable when feedback cannot be loaded. */
    fetch("/api/feedback").then(function(r){return r.json();}).then(function(d){
      if(d&&d.reviews){
        for(var i=0;i<d.reviews.length;i++){
          feedbackCache[d.reviews[i].test_id]=d.reviews[i].comment;
        }
        populateFeedbackTextareas();
      }
    }).catch(function(){});
  }

  function populateFeedbackTextareas(){
    var areas=document.querySelectorAll(".feedback-input");
    for(var i=0;i<areas.length;i++){
      var tid=areas[i].getAttribute("data-test-id");
      if(tid&&feedbackCache[tid]!=null){
        areas[i].value=feedbackCache[tid];
      }
    }
  }

  function saveFeedback(testId,comment,statusEl,btn){
    btn.disabled=true;
    statusEl.textContent="Saving...";
    statusEl.style.color="var(--text-muted)";
    fetch("/api/feedback",{
      method:"POST",
      headers:{"Content-Type":"application/json"},
      body:JSON.stringify({reviews:[{test_id:testId,comment:comment}]})
    }).then(function(r){
      /* fetch only rejects on network failure; treat HTTP errors as failures too. */
      if(!r.ok)throw new Error("HTTP "+r.status);
      return r.json();
    }).then(function(){
      feedbackCache[testId]=comment;
      statusEl.textContent="Saved";
      statusEl.style.color="var(--success)";
      btn.disabled=false;
      setTimeout(function(){statusEl.textContent="";},2000);
    }).catch(function(){
      statusEl.textContent="Error saving";
      statusEl.style.color="var(--danger)";
      btn.disabled=false;
    });
  }

  /* ---- compute stats ---- */
  function computeStats(d){
    var t=d.length,p=0,f=0,e=0,dur=0,ti=0,to=0,cost=0,sc=[],tc=0;
    for(var i=0;i<d.length;i++){
      var r=d[i],s=getStatus(r);
      if(s==="pass")p++;else if(s==="fail")f++;else e++;
      if(r.durationMs)dur+=r.durationMs;
      if(r.tokenUsage){ti+=(r.tokenUsage.input||0);to+=(r.tokenUsage.output||0);}
      if(r.costUsd)cost+=r.costUsd;
      if(s!=="error")sc.push(r.score);
      if(r._toolCalls){for(var k in r._toolCalls)tc+=r._toolCalls[k];}
    }
    /* Pass rate is computed over graded (non-error) results only. */
    var g=t-e;
    return{total:t,passed:p,failed:f,errors:e,passRate:g>0?p/g:0,dur:dur,tokens:ti+to,inTok:ti,outTok:to,cost:cost,scores:sc,toolCalls:tc};
  }
  function computeTargets(d){
    var m={};
    for(var i=0;i<d.length;i++){
      var r=d[i],tgt=r.target||"unknown";
      if(!m[tgt])m[tgt]={target:tgt,results:[],p:0,f:0,e:0,ts:0,sc:0,dur:0,tok:0,cost:0};
      var o=m[tgt];o.results.push(r);
      var s=getStatus(r);
      if(s==="pass")o.p++;else if(s==="fail")o.f++;else o.e++;
      if(s!=="error"){o.ts+=r.score;o.sc++;}
      if(r.durationMs)o.dur+=r.durationMs;
      if(r.tokenUsage)o.tok+=(r.tokenUsage.input||0)+(r.tokenUsage.output||0);
      if(r.costUsd)o.cost+=r.costUsd;
    }
    var a=[];for(var k in m)a.push(m[k]);return a;
  }
  function getEvalNames(){
    var n={};
    for(var i=0;i<DATA.length;i++){
      var sc=DATA[i].scores;
      if(sc)for(var j=0;j<sc.length;j++)n[sc[j].name]=true;
    }
    return Object.keys(n);
  }
  function getEvalScore(r,name){
    if(!r.scores)return null;
    for(var i=0;i<r.scores.length;i++)if(r.scores[i].name===name)return r.scores[i].score;
    return null;
  }

  var stats=computeStats(DATA);
  var tgtStats=computeTargets(DATA);
  var tgtNames=tgtStats.map(function(t){return t.target;});

  /* ---- state ---- */
  var state={tab:"overview",filter:{status:"all",target:"all",search:""},sort:{col:"testId",dir:"asc"},expanded:{}};

  /* ---- DOM refs ---- */
  var app=document.getElementById("app");
  var tabBtns=document.querySelectorAll(".tab");

  /* ---- tabs ---- */
  function setTab(t){
    state.tab=t;
    for(var i=0;i<tabBtns.length;i++)tabBtns[i].classList.toggle("active",tabBtns[i].getAttribute("data-tab")===t);
    render();
  }
  for(var i=0;i<tabBtns.length;i++){
    tabBtns[i].addEventListener("click",(function(b){return function(){setTab(b.getAttribute("data-tab"));};})(tabBtns[i]));
  }

  /* ---- render ---- */
  function render(){
    if(DATA.length===0){app.innerHTML='<div class="empty-state"><h3>No results</h3><p>No evaluation results to display.</p></div>';return;}
    if(state.tab==="overview")renderOverview();else renderTests();
  }

  /* ---- stat card helper ---- */
  function card(label,value,type){
    return'<div class="stat-card '+type+'"><div class="stat-value">'+value+'</div><div class="stat-label">'+label+"</div></div>";
  }

  /* ---- overview ---- */
  function renderOverview(){
    var h='<div class="stats-grid">';
    h+=card("Total Tests",stats.total,"total");
    h+=card("Passed",stats.passed,"pass");
    h+=card("Failed",stats.failed,"fail");
    h+=card("Errors",stats.errors,"error");
    var prCls=stats.passRate>=0.9?"pass":stats.passRate>=0.5?"warn":"fail";
    h+=card("Pass Rate",fmtPct(stats.passRate),prCls);
    h+=card("Duration",fmtDur(stats.dur),"neutral");
    h+=card("Tokens",fmtTok(stats.tokens),"neutral");
    h+=card("Est. Cost",fmtCost(stats.cost),"neutral");
    if(stats.toolCalls>0)h+=card("Tool Calls",fmtTok(stats.toolCalls),"neutral");
    h+="</div>";

    /* targets table */
    if(tgtStats.length>1){
      h+='<div class="section"><h2 class="section-title">Targets</h2><div class="table-wrap"><table class="data-table">';
      h+="<thead><tr><th>Target</th><th>Pass Rate</th><th></th><th>Passed</th><th>Failed</th><th>Errors</th><th>Avg Score</th><th>Duration</th><th>Tokens</th><th>Cost</th></tr></thead><tbody>";
      for(var i=0;i<tgtStats.length;i++){
        var t=tgtStats[i],g=t.p+t.f,pr=g>0?t.p/g:0,avg=t.sc>0?t.ts/t.sc:0;
        h+="<tr><td class=\\"fw-medium\\">"+esc(t.target)+"</td><td>"+fmtPct(pr)+'</td><td><div class="bar-bg"><div class="bar-fill '+sCls(pr)+'" style="width:'+(pr*100)+'%"></div></div></td>';
        h+='<td class="text-pass">'+t.p+'</td><td class="text-fail">'+t.f+'</td><td class="text-error">'+t.e+"</td>";
        h+='<td class="'+sCls(avg)+'">'+fmtPct(avg)+"</td><td>"+fmtDur(t.dur)+"</td><td>"+fmtTok(t.tok)+"</td><td>"+fmtCost(t.cost)+"</td></tr>";
      }
      h+="</tbody></table></div></div>";
    }

    /* histogram */
    if(stats.scores.length>0){
      var bk=[0,0,0,0,0];
      for(var i=0;i<stats.scores.length;i++){
        var sv=stats.scores[i];
        /* Skip missing scores and clamp so every index lands in a real bucket. */
        if(typeof sv!=="number"||!isFinite(sv))continue;
        bk[Math.max(0,Math.min(Math.floor(sv*5),4))]++;
      }
      var mx=Math.max.apply(null,bk);
      var lb=["0\\u201320%","20\\u201340%","40\\u201360%","60\\u201380%","80\\u2013100%"];
      h+='<div class="section"><h2 class="section-title">Score Distribution</h2><div class="histogram">';
      for(var i=0;i<bk.length;i++){
        var pct=mx>0?(bk[i]/mx*100):0;
        h+='<div class="hist-row"><span class="hist-label">'+lb[i]+'</span><div class="hist-bar-bg"><div class="hist-bar '+(i>=4?"score-high":i>=2?"score-mid":"score-low")+'" style="width:'+pct+'%"></div></div><span class="hist-count">'+bk[i]+"</span></div>";
      }
      h+="</div></div>";
    }
    app.innerHTML=h;
  }

  /* ---- test cases ---- */
  function renderTests(){
    var evalNames=getEvalNames();
    var h='<div class="filter-bar">';
    h+='<select id="flt-status" class="filter-select"><option value="all">All Status</option><option value="pass">Passed</option><option value="fail">Failed</option><option value="error">Errors</option></select>';
    if(tgtNames.length>1){
      h+='<select id="flt-target" class="filter-select"><option value="all">All Targets</option>';
      for(var i=0;i<tgtNames.length;i++)h+='<option value="'+esc(tgtNames[i])+'">'+esc(tgtNames[i])+"</option>";
      h+="</select>";
    }
    h+='<input type="text" id="flt-search" class="filter-search" placeholder="Search tests..." value="'+esc(state.filter.search)+'">';
    h+='<span class="filter-count" id="flt-count"></span></div>';

    h+='<div class="table-wrap"><table class="data-table" id="test-tbl"><thead><tr>';
    h+='<th class="expand-col"></th>';
    h+=sHdr("Status","status");
    h+=sHdr("Test ID","testId");
    if(tgtNames.length>1)h+=sHdr("Target","target");
    h+=sHdr("Score","score");
    for(var i=0;i<evalNames.length;i++)h+="<th>"+esc(evalNames[i])+"</th>";
    h+=sHdr("Duration","durationMs");
    h+=sHdr("Cost","costUsd");
    h+="</tr></thead><tbody id=\\"test-body\\"></tbody></table></div>";
    app.innerHTML=h;

    /* wire events */
    var selS=document.getElementById("flt-status");
    selS.value=state.filter.status;
    selS.addEventListener("change",function(e){state.filter.status=e.target.value;renderRows();});
    var selT=document.getElementById("flt-target");
    if(selT){selT.value=state.filter.target;selT.addEventListener("change",function(e){state.filter.target=e.target.value;renderRows();});}
    document.getElementById("flt-search").addEventListener("input",function(e){state.filter.search=e.target.value;renderRows();});
    var ths=document.querySelectorAll("th[data-sort]");
    for(var i=0;i<ths.length;i++){
      ths[i].addEventListener("click",(function(th){return function(){
        var c=th.getAttribute("data-sort");
        if(state.sort.col===c)state.sort.dir=state.sort.dir==="asc"?"desc":"asc";
        else{state.sort.col=c;state.sort.dir="asc";}
        renderTests();
      };})(ths[i]));
    }
    renderRows();
  }

  function sHdr(label,col){
    var arrow="";
    if(state.sort.col===col)arrow=state.sort.dir==="asc"?" \\u2191":" \\u2193";
    return'<th class="sortable" data-sort="'+col+'">'+label+arrow+"</th>";
  }

  function filtered(){
    var out=[];
    for(var i=0;i<DATA.length;i++){
      var r=DATA[i],s=getStatus(r);
      if(state.filter.status!=="all"&&s!==state.filter.status)continue;
      /* Results without a target are grouped as "unknown" in the dropdown; match that label here. */
      if(state.filter.target!=="all"&&(r.target||"unknown")!==state.filter.target)continue;
      if(state.filter.search&&(r.testId||"").toLowerCase().indexOf(state.filter.search.toLowerCase())===-1)continue;
      out.push(r);
    }
    var col=state.sort.col,dir=state.sort.dir==="asc"?1:-1;
    out.sort(function(a,b){
      var va=col==="status"?getStatus(a):a[col],vb=col==="status"?getStatus(b):b[col];
      /* Missing values always sort last, regardless of direction. */
      if(va==null&&vb==null)return 0;if(va==null)return 1;if(vb==null)return-1;
      if(typeof va==="string")return va.localeCompare(vb)*dir;
      return(va-vb)*dir;
    });
    return out;
  }

  function renderRows(){
    var rows=filtered(),evalNames=getEvalNames();
    var tbody=document.getElementById("test-body");
    var colSpan=5+evalNames.length+(tgtNames.length>1?1:0);
    document.getElementById("flt-count").textContent=rows.length+" of "+DATA.length+" tests";
    var h="";
    for(var i=0;i<rows.length;i++){
      var r=rows[i],s=getStatus(r),key=r.testId+":"+r.target,exp=!!state.expanded[key];
      h+='<tr class="test-row '+s+(exp?" expanded":"")+'" data-key="'+esc(key)+'" data-test-id="'+esc(r.testId)+'">';
      h+='<td class="expand-col"><span class="expand-icon">'+(exp?"\\u25BE":"\\u25B8")+"</span></td>";
      h+="<td>"+sIcon(s)+"</td>";
      h+='<td class="fw-medium">'+esc(r.testId)+"</td>";
      if(tgtNames.length>1)h+="<td>"+esc(r.target)+"</td>";
      h+='<td class="'+sCls(r.score)+'">'+fmtPct(r.score)+"</td>";
      for(var j=0;j<evalNames.length;j++){
        var es=getEvalScore(r,evalNames[j]);
        h+='<td class="'+sCls(es)+'">'+(es!=null?fmtPct(es):"\\u2014")+"</td>";
      }
      h+="<td>"+fmtDur(r.durationMs)+"</td><td>"+fmtCost(r.costUsd)+"</td></tr>";
      if(exp)h+='<tr class="detail-row"><td colspan="'+colSpan+'">'+renderDetail(r)+"</td></tr>";
    }
    if(rows.length===0)h+='<tr><td colspan="'+colSpan+'" class="empty-state">No matching tests</td></tr>';
    tbody.innerHTML=h;

    /* row click */
    var trs=tbody.querySelectorAll(".test-row");
    for(var k=0;k<trs.length;k++){
      trs[k].addEventListener("click",(function(tr){return function(){
        var key=tr.getAttribute("data-key");
        state.expanded[key]=!state.expanded[key];
        renderRows();
      };})(trs[k]));
    }

    /* wire feedback buttons */
    var btns=tbody.querySelectorAll(".feedback-submit");
    for(var k=0;k<btns.length;k++){
      btns[k].addEventListener("click",(function(btn){return function(ev){
        ev.stopPropagation();
        var tid=btn.getAttribute("data-test-id");
        var sec=btn.closest(".feedback-section");
        var ta=sec.querySelector(".feedback-input");
        var st=sec.querySelector(".feedback-status");
        saveFeedback(tid,ta.value,st,btn);
      };})(btns[k]));
    }

    /* prevent textarea clicks from toggling row */
    var tas=tbody.querySelectorAll(".feedback-input");
    for(var k=0;k<tas.length;k++){
      tas[k].addEventListener("click",function(ev){ev.stopPropagation();});
    }

    populateFeedbackTextareas();
  }

  /* ---- detail panel ---- */
  function renderDetail(r){
    var h='<div class="detail-panel">';

    /* input / output */
    h+='<div class="detail-grid">';
    if(r.input!=null){
      h+='<div class="detail-block"><h4>Input</h4><pre class="detail-pre">'+esc(JSON.stringify(r.input,null,2))+"</pre></div>";
    }
    h+='<div class="detail-block"><h4>Output</h4><pre class="detail-pre">'+esc(r.output?JSON.stringify(r.output,null,2):"")+"</pre></div>";
    h+="</div>";

    /* evaluator results */
    if(r.scores&&r.scores.length>0){
      h+="<h4>Evaluator Results</h4>";
      h+='<table class="eval-table"><thead><tr><th>Evaluator</th><th>Score</th><th>Status</th><th>Assertions</th></tr></thead><tbody>';
      for(var i=0;i<r.scores.length;i++){
        var ev=r.scores[i],evS=ev.score>=0.5?"pass":"fail";
        var evAssertions=ev.assertions||[];
        var evSummary=evAssertions.map(function(a){return (a.passed?"\\u2713 ":"\\u2717 ")+a.text;}).join("; ");
        h+="<tr><td class=\\"fw-medium\\">"+esc(ev.name)+'</td><td class="'+sCls(ev.score)+'">'+fmtPct(ev.score)+"</td><td>"+sIcon(evS)+'</td><td class="reasoning-cell">'+esc(evSummary)+"</td></tr>";
      }
      h+="</tbody></table>";
    }

    /* assertions */
    var passedA=r.assertions?r.assertions.filter(function(a){return a.passed;}):[];
    var failedA=r.assertions?r.assertions.filter(function(a){return !a.passed;}):[];
    if(passedA.length>0){
      h+='<h4>Passed Assertions</h4><ul class="expect-list pass">';
      for(var i=0;i<passedA.length;i++)h+="<li>"+esc(passedA[i].text)+(passedA[i].evidence?" <span class=\\"reasoning-cell\\">("+esc(passedA[i].evidence)+")</span>":"")+"</li>";
      h+="</ul>";
    }
    if(failedA.length>0){
      h+='<h4>Failed Assertions</h4><ul class="expect-list fail">';
      for(var i=0;i<failedA.length;i++)h+="<li>"+esc(failedA[i].text)+(failedA[i].evidence?" <span class=\\"reasoning-cell\\">("+esc(failedA[i].evidence)+")</span>":"")+"</li>";
      h+="</ul>";
    }

    /* tool calls */
    if(r._toolCalls){
      var tc=r._toolCalls,tcArr=[];
      for(var k in tc)tcArr.push({name:k,count:tc[k]});
      tcArr.sort(function(a,b){return b.count-a.count;});
      h+='<h4>Tool Calls</h4><div class="tool-calls">';
      for(var i=0;i<tcArr.length;i++)h+='<span class="tool-tag">'+esc(tcArr[i].name)+": "+tcArr[i].count+"</span>";
      h+="</div>";
    }

    /* error */
    if(r.error)h+='<div class="error-box"><h4>Error</h4><pre>'+esc(r.error)+"</pre></div>";

    /* metadata */
    h+='<div class="detail-meta">';
    var m=[];
    if(r.tokenUsage)m.push(fmtTok(r.tokenUsage.input)+" in / "+fmtTok(r.tokenUsage.output)+" out tokens");
    if(r.durationMs){
      if(r._graderDurationMs>0){
        /* Executor time is whatever remains after subtracting grader time, floored at 0. */
        var execMs=r.durationMs-r._graderDurationMs;
        m.push(fmtDur(execMs>0?execMs:0)+" executor + "+fmtDur(r._graderDurationMs)+" grader");
      }else{
        m.push(fmtDur(r.durationMs));
      }
    }
    if(r.target)m.push(r.target);
    if(r.costUsd)m.push(fmtCost(r.costUsd));
    if(r.timestamp)m.push(r.timestamp);
    h+=esc(m.join(" \\u00B7 "));
    h+="</div>";

    /* feedback section */
    var tid=r.testId||"";
    var existingComment=feedbackCache[tid]||"";
    h+='<div class="feedback-section">';
    h+='<h4>Feedback</h4>';
    h+='<textarea class="feedback-input" data-test-id="'+esc(tid)+'" placeholder="Add feedback for this test..." onclick="event.stopPropagation()">'+esc(existingComment)+'</textarea>';
    h+='<div style="display:flex;align-items:center">';
    h+='<button class="feedback-submit" data-test-id="'+esc(tid)+'">Save Feedback</button>';
    h+='<span class="feedback-status"></span>';
    h+='</div></div>';

    h+="</div>";
    return h;
  }

  /* ---- init ---- */
  loadFeedback();
  render();
})();
`;
|
|
5831
|
+
/**
 * `serve` subcommand: loads a JSONL results file, builds the dashboard app via
 * `createApp`, and serves it over HTTP until the process is interrupted.
 *
 * Arguments:
 *  - source (positional, optional): JSONL result file; resolution of the
 *    default is delegated to `resolveSourceFile2`.
 *  - --port / -p (optional): port to listen on, 3117 when omitted.
 *  - --dir / -d (optional): working directory, `process.cwd()` when omitted.
 *
 * Any failure is reported on stderr as `Error: <message>` and the process
 * exits with status 1.
 */
var resultsServeCommand = command({
  name: "serve",
  description: "Start a local HTTP server to review evaluation results",
  args: {
    source: positional({
      type: optional(string),
      displayName: "source",
      description: "JSONL result file to serve (defaults to most recent in .agentv/results/)"
    }),
    port: option({
      type: optional(number),
      long: "port",
      short: "p",
      description: "Port to listen on (default: 3117)"
    }),
    dir: option({
      type: optional(string),
      long: "dir",
      short: "d",
      description: "Working directory (default: current directory)"
    })
  },
  handler: async ({ source, port, dir }) => {
    const cwd = dir ?? process.cwd();
    const listenPort = port ?? 3117;
    // Thrown values are not guaranteed to be Error instances, so fall back to
    // String() instead of printing "Error: undefined".
    const fail = (error) => {
      console.error(`Error: ${error instanceof Error ? error.message : String(error)}`);
      process.exit(1);
    };
    try {
      const sourceFile = await resolveSourceFile2(source, cwd);
      const content = readFileSync9(sourceFile, "utf8");
      const results = loadResults2(content);
      const app2 = createApp(results, cwd);
      console.log(`Serving ${results.length} result(s) from ${sourceFile}`);
      console.log(`Dashboard: http://localhost:${listenPort}`);
      console.log(`Feedback API: http://localhost:${listenPort}/api/feedback`);
      console.log(`Feedback file: ${feedbackPath(cwd)}`);
      console.log("Press Ctrl+C to stop");
      const { serve: startServer } = await import("@hono/node-server");
      const server = startServer({
        fetch: app2.fetch,
        port: listenPort
      });
      // Listen failures (e.g. the port is already in use) are emitted
      // asynchronously on the server object, outside this try/catch; route
      // them through the same error path instead of an uncaught exception.
      server.on("error", fail);
      // Never resolves: keeps the handler pending so the CLI stays alive
      // until the user interrupts the process.
      await new Promise(() => {
      });
    } catch (error) {
      fail(error);
    }
  }
});
|
|
4799
5879
|
|
|
@@ -5043,7 +6123,6 @@ function buildEvalTest(raw) {
|
|
|
5043
6123
|
id: raw.test_id ?? "unknown",
|
|
5044
6124
|
question: "",
|
|
5045
6125
|
input: [],
|
|
5046
|
-
input_segments: [],
|
|
5047
6126
|
expected_output: [],
|
|
5048
6127
|
file_paths: [],
|
|
5049
6128
|
criteria: ""
|
|
@@ -5640,8 +6719,8 @@ var traceCommand = subcommands({
|
|
|
5640
6719
|
});
|
|
5641
6720
|
|
|
5642
6721
|
// src/commands/transpile/index.ts
|
|
5643
|
-
import { writeFileSync as
|
|
5644
|
-
import
|
|
6722
|
+
import { writeFileSync as writeFileSync5 } from "node:fs";
|
|
6723
|
+
import path11 from "node:path";
|
|
5645
6724
|
var transpileCommand = command({
|
|
5646
6725
|
name: "transpile",
|
|
5647
6726
|
description: "Convert an EVAL.yaml file to Agent Skills evals.json format",
|
|
@@ -5665,7 +6744,7 @@ var transpileCommand = command({
|
|
|
5665
6744
|
handler: async ({ input, outDir, stdout }) => {
|
|
5666
6745
|
let result;
|
|
5667
6746
|
try {
|
|
5668
|
-
result = transpileEvalYamlFile(
|
|
6747
|
+
result = transpileEvalYamlFile(path11.resolve(input));
|
|
5669
6748
|
} catch (error) {
|
|
5670
6749
|
console.error(`Error: ${error.message}`);
|
|
5671
6750
|
process.exit(1);
|
|
@@ -5689,12 +6768,12 @@ var transpileCommand = command({
|
|
|
5689
6768
|
process.stdout.write("\n");
|
|
5690
6769
|
return;
|
|
5691
6770
|
}
|
|
5692
|
-
const outputDir = outDir ?
|
|
6771
|
+
const outputDir = outDir ? path11.resolve(outDir) : path11.dirname(path11.resolve(input));
|
|
5693
6772
|
const fileNames = getOutputFilenames(result);
|
|
5694
6773
|
for (const [skill, evalsJson] of result.files) {
|
|
5695
6774
|
const fileName = fileNames.get(skill) ?? "evals.json";
|
|
5696
|
-
const outputPath =
|
|
5697
|
-
|
|
6775
|
+
const outputPath = path11.join(outputDir, fileName);
|
|
6776
|
+
writeFileSync5(outputPath, `${JSON.stringify(evalsJson, null, 2)}
|
|
5698
6777
|
`);
|
|
5699
6778
|
console.log(`Transpiled to ${outputPath}`);
|
|
5700
6779
|
}
|
|
@@ -5702,7 +6781,7 @@ var transpileCommand = command({
|
|
|
5702
6781
|
});
|
|
5703
6782
|
|
|
5704
6783
|
// src/commands/trim/index.ts
|
|
5705
|
-
import { readFileSync as
|
|
6784
|
+
import { readFileSync as readFileSync10, writeFileSync as writeFileSync6 } from "node:fs";
|
|
5706
6785
|
var trimCommand = command({
|
|
5707
6786
|
name: "trim",
|
|
5708
6787
|
description: "Trim evaluation results for baseline storage (strips debug/audit fields)",
|
|
@@ -5721,7 +6800,7 @@ var trimCommand = command({
|
|
|
5721
6800
|
},
|
|
5722
6801
|
handler: async ({ input, out }) => {
|
|
5723
6802
|
try {
|
|
5724
|
-
const content =
|
|
6803
|
+
const content = readFileSync10(input, "utf8");
|
|
5725
6804
|
const lines = content.trim().split("\n").filter((line) => line.trim());
|
|
5726
6805
|
const trimmedLines = lines.map((line) => {
|
|
5727
6806
|
const record = JSON.parse(line);
|
|
@@ -5733,7 +6812,7 @@ var trimCommand = command({
|
|
|
5733
6812
|
const output = `${trimmedLines.join("\n")}
|
|
5734
6813
|
`;
|
|
5735
6814
|
if (out) {
|
|
5736
|
-
|
|
6815
|
+
writeFileSync6(out, output, "utf8");
|
|
5737
6816
|
console.error(`Trimmed ${lines.length} record(s) \u2192 ${out}`);
|
|
5738
6817
|
} else {
|
|
5739
6818
|
process.stdout.write(output);
|
|
@@ -5752,7 +6831,7 @@ var ANSI_GREEN = "\x1B[32m";
|
|
|
5752
6831
|
var ANSI_CYAN = "\x1B[36m";
|
|
5753
6832
|
var ANSI_BOLD = "\x1B[1m";
|
|
5754
6833
|
var ANSI_RESET = "\x1B[0m";
|
|
5755
|
-
function
|
|
6834
|
+
function formatSummary2(summary, useColors) {
|
|
5756
6835
|
const lines = [];
|
|
5757
6836
|
lines.push("");
|
|
5758
6837
|
lines.push(formatHeader("Validation Summary", useColors));
|
|
@@ -5828,7 +6907,7 @@ function isTTY() {
|
|
|
5828
6907
|
// src/commands/validate/validate-files.ts
|
|
5829
6908
|
import { constants } from "node:fs";
|
|
5830
6909
|
import { access, readdir, stat } from "node:fs/promises";
|
|
5831
|
-
import
|
|
6910
|
+
import path12 from "node:path";
|
|
5832
6911
|
async function validateFiles(paths) {
|
|
5833
6912
|
const filePaths = await expandPaths(paths);
|
|
5834
6913
|
const results = [];
|
|
@@ -5846,7 +6925,7 @@ async function validateFiles(paths) {
|
|
|
5846
6925
|
};
|
|
5847
6926
|
}
|
|
5848
6927
|
async function validateSingleFile(filePath) {
|
|
5849
|
-
const absolutePath =
|
|
6928
|
+
const absolutePath = path12.resolve(filePath);
|
|
5850
6929
|
const fileType = await detectFileType(absolutePath);
|
|
5851
6930
|
let result;
|
|
5852
6931
|
if (fileType === "eval") {
|
|
@@ -5871,7 +6950,7 @@ async function validateSingleFile(filePath) {
|
|
|
5871
6950
|
async function expandPaths(paths) {
|
|
5872
6951
|
const expanded = [];
|
|
5873
6952
|
for (const inputPath of paths) {
|
|
5874
|
-
const absolutePath =
|
|
6953
|
+
const absolutePath = path12.resolve(inputPath);
|
|
5875
6954
|
try {
|
|
5876
6955
|
await access(absolutePath, constants.F_OK);
|
|
5877
6956
|
} catch {
|
|
@@ -5895,7 +6974,7 @@ async function findYamlFiles(dirPath) {
|
|
|
5895
6974
|
try {
|
|
5896
6975
|
const entries2 = await readdir(dirPath, { withFileTypes: true });
|
|
5897
6976
|
for (const entry of entries2) {
|
|
5898
|
-
const fullPath =
|
|
6977
|
+
const fullPath = path12.join(dirPath, entry.name);
|
|
5899
6978
|
if (entry.isDirectory()) {
|
|
5900
6979
|
if (entry.name === "node_modules" || entry.name.startsWith(".")) {
|
|
5901
6980
|
continue;
|
|
@@ -5912,7 +6991,7 @@ async function findYamlFiles(dirPath) {
|
|
|
5912
6991
|
return results;
|
|
5913
6992
|
}
|
|
5914
6993
|
function isYamlFile(filePath) {
|
|
5915
|
-
const ext =
|
|
6994
|
+
const ext = path12.extname(filePath).toLowerCase();
|
|
5916
6995
|
return ext === ".yaml" || ext === ".yml";
|
|
5917
6996
|
}
|
|
5918
6997
|
|
|
@@ -5924,7 +7003,7 @@ async function runValidateCommand(paths) {
|
|
|
5924
7003
|
}
|
|
5925
7004
|
const summary = await validateFiles(paths);
|
|
5926
7005
|
const useColors = isTTY();
|
|
5927
|
-
console.log(
|
|
7006
|
+
console.log(formatSummary2(summary, useColors));
|
|
5928
7007
|
if (summary.invalidFiles > 0) {
|
|
5929
7008
|
process.exit(1);
|
|
5930
7009
|
}
|
|
@@ -5950,9 +7029,9 @@ var validateCommand = command({
|
|
|
5950
7029
|
});
|
|
5951
7030
|
|
|
5952
7031
|
// src/commands/workspace/clean.ts
|
|
5953
|
-
import { existsSync as
|
|
7032
|
+
import { existsSync as existsSync6 } from "node:fs";
|
|
5954
7033
|
import { readFile as readFile2, readdir as readdir2, rm } from "node:fs/promises";
|
|
5955
|
-
import
|
|
7034
|
+
import path13 from "node:path";
|
|
5956
7035
|
async function confirm(message) {
|
|
5957
7036
|
const readline2 = await import("node:readline");
|
|
5958
7037
|
const rl = readline2.createInterface({ input: process.stdin, output: process.stdout });
|
|
@@ -5979,7 +7058,7 @@ var cleanCommand = command({
|
|
|
5979
7058
|
},
|
|
5980
7059
|
handler: async ({ repo, force }) => {
|
|
5981
7060
|
const poolRoot = getWorkspacePoolRoot();
|
|
5982
|
-
if (!
|
|
7061
|
+
if (!existsSync6(poolRoot)) {
|
|
5983
7062
|
console.log("No workspace pool entries found.");
|
|
5984
7063
|
return;
|
|
5985
7064
|
}
|
|
@@ -5988,8 +7067,8 @@ var cleanCommand = command({
|
|
|
5988
7067
|
const poolDirs = entries2.filter((e) => e.isDirectory());
|
|
5989
7068
|
const matchingDirs = [];
|
|
5990
7069
|
for (const dir of poolDirs) {
|
|
5991
|
-
const poolDir =
|
|
5992
|
-
const metadataPath =
|
|
7070
|
+
const poolDir = path13.join(poolRoot, dir.name);
|
|
7071
|
+
const metadataPath = path13.join(poolDir, "metadata.json");
|
|
5993
7072
|
try {
|
|
5994
7073
|
const raw = await readFile2(metadataPath, "utf-8");
|
|
5995
7074
|
const metadata = JSON.parse(raw);
|
|
@@ -6020,7 +7099,7 @@ var cleanCommand = command({
|
|
|
6020
7099
|
}
|
|
6021
7100
|
for (const dir of matchingDirs) {
|
|
6022
7101
|
await rm(dir, { recursive: true, force: true });
|
|
6023
|
-
console.log(`Removed: ${
|
|
7102
|
+
console.log(`Removed: ${path13.basename(dir).slice(0, 12)}...`);
|
|
6024
7103
|
}
|
|
6025
7104
|
console.log("Done.");
|
|
6026
7105
|
} else {
|
|
@@ -6038,15 +7117,15 @@ var cleanCommand = command({
|
|
|
6038
7117
|
});
|
|
6039
7118
|
|
|
6040
7119
|
// src/commands/workspace/list.ts
|
|
6041
|
-
import { existsSync as
|
|
7120
|
+
import { existsSync as existsSync7 } from "node:fs";
|
|
6042
7121
|
import { readFile as readFile3, readdir as readdir3, stat as stat2 } from "node:fs/promises";
|
|
6043
|
-
import
|
|
7122
|
+
import path14 from "node:path";
|
|
6044
7123
|
async function getDirectorySize(dirPath) {
|
|
6045
7124
|
let totalSize = 0;
|
|
6046
7125
|
try {
|
|
6047
7126
|
const entries2 = await readdir3(dirPath, { withFileTypes: true });
|
|
6048
7127
|
for (const entry of entries2) {
|
|
6049
|
-
const fullPath =
|
|
7128
|
+
const fullPath = path14.join(dirPath, entry.name);
|
|
6050
7129
|
if (entry.isDirectory()) {
|
|
6051
7130
|
totalSize += await getDirectorySize(fullPath);
|
|
6052
7131
|
} else {
|
|
@@ -6070,7 +7149,7 @@ var listCommand = command({
|
|
|
6070
7149
|
args: {},
|
|
6071
7150
|
handler: async () => {
|
|
6072
7151
|
const poolRoot = getWorkspacePoolRoot();
|
|
6073
|
-
if (!
|
|
7152
|
+
if (!existsSync7(poolRoot)) {
|
|
6074
7153
|
console.log("No workspace pool entries found.");
|
|
6075
7154
|
return;
|
|
6076
7155
|
}
|
|
@@ -6081,11 +7160,11 @@ var listCommand = command({
|
|
|
6081
7160
|
return;
|
|
6082
7161
|
}
|
|
6083
7162
|
for (const dir of poolDirs) {
|
|
6084
|
-
const poolDir =
|
|
7163
|
+
const poolDir = path14.join(poolRoot, dir.name);
|
|
6085
7164
|
const fingerprint = dir.name;
|
|
6086
7165
|
const poolEntries = await readdir3(poolDir, { withFileTypes: true });
|
|
6087
7166
|
const slots = poolEntries.filter((e) => e.isDirectory() && e.name.startsWith("slot-"));
|
|
6088
|
-
const metadataPath =
|
|
7167
|
+
const metadataPath = path14.join(poolDir, "metadata.json");
|
|
6089
7168
|
let metadata = null;
|
|
6090
7169
|
try {
|
|
6091
7170
|
const raw = await readFile3(metadataPath, "utf-8");
|
|
@@ -6131,8 +7210,8 @@ var CHECK_INTERVAL_MS = 24 * 60 * 60 * 1e3;
|
|
|
6131
7210
|
var AGENTV_DIR = getAgentvHome();
|
|
6132
7211
|
var CACHE_FILE = "version-check.json";
|
|
6133
7212
|
var NPM_REGISTRY_URL = "https://registry.npmjs.org/agentv/latest";
|
|
6134
|
-
async function getCachedUpdateInfo(
|
|
6135
|
-
const filePath =
|
|
7213
|
+
async function getCachedUpdateInfo(path15) {
|
|
7214
|
+
const filePath = path15 ?? join(AGENTV_DIR, CACHE_FILE);
|
|
6136
7215
|
try {
|
|
6137
7216
|
const raw = await readFile4(filePath, "utf-8");
|
|
6138
7217
|
const data = JSON.parse(raw);
|
|
@@ -6224,6 +7303,7 @@ var app = subcommands({
|
|
|
6224
7303
|
init: initCmdTsCommand,
|
|
6225
7304
|
results: resultsCommand,
|
|
6226
7305
|
self: selfCommand,
|
|
7306
|
+
serve: resultsServeCommand,
|
|
6227
7307
|
trace: traceCommand,
|
|
6228
7308
|
transpile: transpileCommand,
|
|
6229
7309
|
trim: trimCommand,
|
|
@@ -6241,6 +7321,7 @@ var TOP_LEVEL_COMMANDS = /* @__PURE__ */ new Set([
|
|
|
6241
7321
|
"init",
|
|
6242
7322
|
"results",
|
|
6243
7323
|
"self",
|
|
7324
|
+
"serve",
|
|
6244
7325
|
"trace",
|
|
6245
7326
|
"transpile",
|
|
6246
7327
|
"trim",
|
|
@@ -6287,4 +7368,4 @@ export {
|
|
|
6287
7368
|
preprocessArgv,
|
|
6288
7369
|
runCli
|
|
6289
7370
|
};
|
|
6290
|
-
//# sourceMappingURL=chunk-
|
|
7371
|
+
//# sourceMappingURL=chunk-ETMDLQ72.js.map
|