agentv 3.14.6 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +59 -533
- package/dist/{chunk-CQRWNXVG.js → chunk-2W5JKKXC.js} +537 -727
- package/dist/chunk-2W5JKKXC.js.map +1 -0
- package/dist/{chunk-Y25VL7PX.js → chunk-4Z326WWF.js} +40 -17
- package/dist/chunk-4Z326WWF.js.map +1 -0
- package/dist/{chunk-ELQEFMGO.js → chunk-XEAW7OQT.js} +594 -296
- package/dist/chunk-XEAW7OQT.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-5EEXTTC3.js → dist-2JUUJ6PT.js} +18 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-5ESM5DWV.js → interactive-7ZYS6IOC.js} +4 -11
- package/dist/interactive-7ZYS6IOC.js.map +1 -0
- package/dist/studio/assets/index-CDGReinH.js +71 -0
- package/dist/studio/assets/index-DofvSOmX.js +11 -0
- package/dist/studio/assets/index-izxfmBKC.css +1 -0
- package/dist/studio/index.html +13 -0
- package/package.json +1 -1
- package/dist/chunk-CQRWNXVG.js.map +0 -1
- package/dist/chunk-ELQEFMGO.js.map +0 -1
- package/dist/chunk-Y25VL7PX.js.map +0 -1
- package/dist/interactive-5ESM5DWV.js.map +0 -1
- /package/dist/{dist-5EEXTTC3.js.map → dist-2JUUJ6PT.js.map} +0 -0
|
@@ -10,6 +10,7 @@ import {
|
|
|
10
10
|
loadManifestResults,
|
|
11
11
|
loadRunCache,
|
|
12
12
|
package_default,
|
|
13
|
+
parseResultManifest,
|
|
13
14
|
resolveEvalPaths,
|
|
14
15
|
resolveExistingRunPrimaryPath,
|
|
15
16
|
resolveResultSourcePath,
|
|
@@ -23,9 +24,11 @@ import {
|
|
|
23
24
|
validateFileReferences,
|
|
24
25
|
validateTargetsFile,
|
|
25
26
|
writeArtifactsFromResults
|
|
26
|
-
} from "./chunk-
|
|
27
|
+
} from "./chunk-4Z326WWF.js";
|
|
27
28
|
import {
|
|
29
|
+
DEFAULT_CATEGORY,
|
|
28
30
|
createBuiltinRegistry,
|
|
31
|
+
deriveCategory,
|
|
29
32
|
executeScript,
|
|
30
33
|
getAgentvHome,
|
|
31
34
|
getOutputFilenames,
|
|
@@ -40,7 +43,7 @@ import {
|
|
|
40
43
|
toSnakeCaseDeep as toSnakeCaseDeep2,
|
|
41
44
|
transpileEvalYamlFile,
|
|
42
45
|
trimBaselineResult
|
|
43
|
-
} from "./chunk-
|
|
46
|
+
} from "./chunk-XEAW7OQT.js";
|
|
44
47
|
import {
|
|
45
48
|
__commonJS,
|
|
46
49
|
__esm,
|
|
@@ -3479,9 +3482,23 @@ var ASSERTION_TEMPLATES = {
|
|
|
3479
3482
|
default: `#!/usr/bin/env bun
|
|
3480
3483
|
import { defineAssertion } from '@agentv/eval';
|
|
3481
3484
|
|
|
3482
|
-
|
|
3485
|
+
/** Extract text from the last message with the given role. */
|
|
3486
|
+
function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string {
|
|
3487
|
+
for (let i = messages.length - 1; i >= 0; i--) {
|
|
3488
|
+
const msg = messages[i];
|
|
3489
|
+
if (msg.role !== role) continue;
|
|
3490
|
+
if (typeof msg.content === 'string') return msg.content;
|
|
3491
|
+
if (Array.isArray(msg.content)) {
|
|
3492
|
+
return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n');
|
|
3493
|
+
}
|
|
3494
|
+
}
|
|
3495
|
+
return '';
|
|
3496
|
+
}
|
|
3497
|
+
|
|
3498
|
+
export default defineAssertion(({ output }) => {
|
|
3483
3499
|
// TODO: Implement your assertion logic
|
|
3484
|
-
const
|
|
3500
|
+
const text = getMessageText(output ?? []);
|
|
3501
|
+
const pass = text.length > 0;
|
|
3485
3502
|
return {
|
|
3486
3503
|
pass,
|
|
3487
3504
|
reasoning: pass ? 'Output has content' : 'Output is empty',
|
|
@@ -3491,9 +3508,23 @@ export default defineAssertion(({ outputText }) => {
|
|
|
3491
3508
|
score: `#!/usr/bin/env bun
|
|
3492
3509
|
import { defineAssertion } from '@agentv/eval';
|
|
3493
3510
|
|
|
3494
|
-
|
|
3511
|
+
/** Extract text from the last message with the given role. */
|
|
3512
|
+
function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string {
|
|
3513
|
+
for (let i = messages.length - 1; i >= 0; i--) {
|
|
3514
|
+
const msg = messages[i];
|
|
3515
|
+
if (msg.role !== role) continue;
|
|
3516
|
+
if (typeof msg.content === 'string') return msg.content;
|
|
3517
|
+
if (Array.isArray(msg.content)) {
|
|
3518
|
+
return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n');
|
|
3519
|
+
}
|
|
3520
|
+
}
|
|
3521
|
+
return '';
|
|
3522
|
+
}
|
|
3523
|
+
|
|
3524
|
+
export default defineAssertion(({ output }) => {
|
|
3495
3525
|
// TODO: Implement your scoring logic (0.0 to 1.0)
|
|
3496
|
-
const
|
|
3526
|
+
const text = getMessageText(output ?? []);
|
|
3527
|
+
const score = text.length > 0 ? 1.0 : 0.0;
|
|
3497
3528
|
return {
|
|
3498
3529
|
pass: score >= 0.5,
|
|
3499
3530
|
score,
|
|
@@ -4186,7 +4217,7 @@ var evalRunCommand = command({
|
|
|
4186
4217
|
},
|
|
4187
4218
|
handler: async (args) => {
|
|
4188
4219
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4189
|
-
const { launchInteractiveWizard } = await import("./interactive-
|
|
4220
|
+
const { launchInteractiveWizard } = await import("./interactive-7ZYS6IOC.js");
|
|
4190
4221
|
await launchInteractiveWizard();
|
|
4191
4222
|
return;
|
|
4192
4223
|
}
|
|
@@ -4421,7 +4452,8 @@ var evalBenchCommand = command({
|
|
|
4421
4452
|
const manifest = JSON.parse(await readFile(join(exportDir, "manifest.json"), "utf8"));
|
|
4422
4453
|
const testIds = manifest.test_ids;
|
|
4423
4454
|
const targetName = manifest.target?.name ?? "unknown";
|
|
4424
|
-
const evalSet = manifest.
|
|
4455
|
+
const evalSet = manifest.dataset ?? "";
|
|
4456
|
+
const experiment = manifest.experiment;
|
|
4425
4457
|
const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
|
|
4426
4458
|
let stdinData;
|
|
4427
4459
|
if (llmScoresPath) {
|
|
@@ -4531,7 +4563,8 @@ var evalBenchCommand = command({
|
|
|
4531
4563
|
JSON.stringify({
|
|
4532
4564
|
timestamp: manifest.timestamp,
|
|
4533
4565
|
test_id: testId,
|
|
4534
|
-
|
|
4566
|
+
dataset: evalSet || void 0,
|
|
4567
|
+
experiment: experiment || void 0,
|
|
4535
4568
|
score: Math.round(weightedScore * 1e3) / 1e3,
|
|
4536
4569
|
target: targetName,
|
|
4537
4570
|
scores,
|
|
@@ -4553,6 +4586,7 @@ var evalBenchCommand = command({
|
|
|
4553
4586
|
metadata: {
|
|
4554
4587
|
eval_file: manifest.eval_file,
|
|
4555
4588
|
timestamp: manifest.timestamp,
|
|
4589
|
+
experiment: experiment || void 0,
|
|
4556
4590
|
targets: [targetName],
|
|
4557
4591
|
tests_run: testIds
|
|
4558
4592
|
},
|
|
@@ -4594,6 +4628,12 @@ function computeStats(values) {
|
|
|
4594
4628
|
// src/commands/pipeline/grade.ts
|
|
4595
4629
|
import { mkdir as mkdir2, readFile as readFile2, readdir as readdir2, writeFile as writeFile3 } from "node:fs/promises";
|
|
4596
4630
|
import { join as join2 } from "node:path";
|
|
4631
|
+
function extractInputText(input) {
|
|
4632
|
+
if (!input || input.length === 0) return "";
|
|
4633
|
+
if (input.length === 1) return input[0].content;
|
|
4634
|
+
return input.map((m) => `@[${m.role}]:
|
|
4635
|
+
${m.content}`).join("\n\n");
|
|
4636
|
+
}
|
|
4597
4637
|
var evalGradeCommand = command({
|
|
4598
4638
|
name: "grade",
|
|
4599
4639
|
description: "Run code-grader assertions on responses in an export directory",
|
|
@@ -4608,7 +4648,7 @@ var evalGradeCommand = command({
|
|
|
4608
4648
|
const manifestPath = join2(exportDir, "manifest.json");
|
|
4609
4649
|
const manifest = JSON.parse(await readFile2(manifestPath, "utf8"));
|
|
4610
4650
|
const testIds = manifest.test_ids;
|
|
4611
|
-
const evalSet = manifest.
|
|
4651
|
+
const evalSet = manifest.dataset ?? "";
|
|
4612
4652
|
const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
|
|
4613
4653
|
let totalGraders = 0;
|
|
4614
4654
|
let totalPassed = 0;
|
|
@@ -4630,14 +4670,13 @@ var evalGradeCommand = command({
|
|
|
4630
4670
|
for (const graderFile of graderFiles) {
|
|
4631
4671
|
const graderConfig = JSON.parse(await readFile2(join2(codeGradersDir, graderFile), "utf8"));
|
|
4632
4672
|
const graderName = graderConfig.name;
|
|
4673
|
+
const inputText = extractInputText(inputData.input);
|
|
4633
4674
|
const payload = JSON.stringify({
|
|
4634
4675
|
output: [{ role: "assistant", content: responseText }],
|
|
4635
|
-
input: inputData.
|
|
4636
|
-
question: inputData.input_text,
|
|
4676
|
+
input: inputData.input,
|
|
4637
4677
|
criteria: "",
|
|
4638
4678
|
expected_output: [],
|
|
4639
|
-
|
|
4640
|
-
input_files: [],
|
|
4679
|
+
input_files: inputData.input_files ?? [],
|
|
4641
4680
|
trace: null,
|
|
4642
4681
|
token_usage: null,
|
|
4643
4682
|
cost_usd: null,
|
|
@@ -4647,8 +4686,8 @@ var evalGradeCommand = command({
|
|
|
4647
4686
|
file_changes: null,
|
|
4648
4687
|
workspace_path: null,
|
|
4649
4688
|
config: graderConfig.config ?? null,
|
|
4650
|
-
metadata: {},
|
|
4651
|
-
input_text:
|
|
4689
|
+
metadata: inputData.metadata ?? {},
|
|
4690
|
+
input_text: inputText,
|
|
4652
4691
|
output_text: responseText,
|
|
4653
4692
|
expected_output_text: ""
|
|
4654
4693
|
});
|
|
@@ -4706,7 +4745,7 @@ var evalGradeCommand = command({
|
|
|
4706
4745
|
// src/commands/pipeline/input.ts
|
|
4707
4746
|
import { readFile as readFile3 } from "node:fs/promises";
|
|
4708
4747
|
import { mkdir as mkdir3, writeFile as writeFile4 } from "node:fs/promises";
|
|
4709
|
-
import { dirname, join as join3, resolve } from "node:path";
|
|
4748
|
+
import { dirname, join as join3, relative, resolve } from "node:path";
|
|
4710
4749
|
var evalInputCommand = command({
|
|
4711
4750
|
name: "input",
|
|
4712
4751
|
description: "Extract eval inputs, target commands, and grader prompts for subagent-mode runs",
|
|
@@ -4720,14 +4759,20 @@ var evalInputCommand = command({
|
|
|
4720
4759
|
type: optional(string),
|
|
4721
4760
|
long: "out",
|
|
4722
4761
|
description: "Output directory for extracted inputs (default: .agentv/results/runs/<timestamp>)"
|
|
4762
|
+
}),
|
|
4763
|
+
experiment: option({
|
|
4764
|
+
type: optional(string),
|
|
4765
|
+
long: "experiment",
|
|
4766
|
+
description: "Experiment label (e.g. with_skills, without_skills)"
|
|
4723
4767
|
})
|
|
4724
4768
|
},
|
|
4725
|
-
handler: async ({ evalPath, out }) => {
|
|
4769
|
+
handler: async ({ evalPath, out, experiment }) => {
|
|
4726
4770
|
const resolvedEvalPath = resolve(evalPath);
|
|
4727
4771
|
const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
|
|
4728
4772
|
const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
|
|
4729
4773
|
const evalDir = dirname(resolvedEvalPath);
|
|
4730
|
-
const
|
|
4774
|
+
const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
|
|
4775
|
+
const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
|
|
4731
4776
|
const tests = suite.tests;
|
|
4732
4777
|
if (tests.length === 0) {
|
|
4733
4778
|
console.error("No tests found in eval file.");
|
|
@@ -4736,6 +4781,7 @@ var evalInputCommand = command({
|
|
|
4736
4781
|
let targetInfo = null;
|
|
4737
4782
|
let targetName = "agent";
|
|
4738
4783
|
let targetKind = "agent";
|
|
4784
|
+
let subagentModeAllowed = true;
|
|
4739
4785
|
try {
|
|
4740
4786
|
const selection = await selectTarget({
|
|
4741
4787
|
testFilePath: resolvedEvalPath,
|
|
@@ -4748,15 +4794,20 @@ var evalInputCommand = command({
|
|
|
4748
4794
|
env: process.env
|
|
4749
4795
|
});
|
|
4750
4796
|
targetName = selection.targetName;
|
|
4751
|
-
|
|
4797
|
+
const resolved = selection.resolvedTarget;
|
|
4798
|
+
subagentModeAllowed = resolved.subagentModeAllowed !== false;
|
|
4799
|
+
if (resolved.kind === "cli") {
|
|
4752
4800
|
targetKind = "cli";
|
|
4753
|
-
|
|
4801
|
+
subagentModeAllowed = false;
|
|
4802
|
+
const config = resolved.config;
|
|
4754
4803
|
targetInfo = {
|
|
4755
4804
|
kind: "cli",
|
|
4756
4805
|
command: config.command,
|
|
4757
4806
|
cwd: config.cwd ?? evalDir,
|
|
4758
4807
|
timeoutMs: config.timeoutMs ?? 3e4
|
|
4759
4808
|
};
|
|
4809
|
+
} else {
|
|
4810
|
+
targetKind = resolved.kind;
|
|
4760
4811
|
}
|
|
4761
4812
|
} catch {
|
|
4762
4813
|
}
|
|
@@ -4768,15 +4819,13 @@ var evalInputCommand = command({
|
|
|
4768
4819
|
const testDir = join3(outDir, ...subpath);
|
|
4769
4820
|
await mkdir3(testDir, { recursive: true });
|
|
4770
4821
|
testIds.push(test.id);
|
|
4771
|
-
const inputText = test.question;
|
|
4772
4822
|
const inputMessages = test.input.map((m) => ({
|
|
4773
4823
|
role: m.role,
|
|
4774
4824
|
content: typeof m.content === "string" ? m.content : m.content
|
|
4775
4825
|
}));
|
|
4776
4826
|
await writeJson(join3(testDir, "input.json"), {
|
|
4777
|
-
|
|
4778
|
-
|
|
4779
|
-
file_paths: test.file_paths,
|
|
4827
|
+
input: inputMessages,
|
|
4828
|
+
input_files: test.file_paths,
|
|
4780
4829
|
metadata: test.metadata ?? {}
|
|
4781
4830
|
});
|
|
4782
4831
|
if (targetInfo) {
|
|
@@ -4804,11 +4853,13 @@ var evalInputCommand = command({
|
|
|
4804
4853
|
}
|
|
4805
4854
|
await writeJson(join3(outDir, "manifest.json"), {
|
|
4806
4855
|
eval_file: resolvedEvalPath,
|
|
4807
|
-
|
|
4856
|
+
dataset: evalSetName || void 0,
|
|
4857
|
+
experiment: experiment || void 0,
|
|
4808
4858
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
4809
4859
|
target: {
|
|
4810
4860
|
name: targetName,
|
|
4811
|
-
kind: targetKind
|
|
4861
|
+
kind: targetKind,
|
|
4862
|
+
subagent_mode_allowed: subagentModeAllowed
|
|
4812
4863
|
},
|
|
4813
4864
|
test_ids: testIds
|
|
4814
4865
|
});
|
|
@@ -4870,7 +4921,13 @@ import { execSync } from "node:child_process";
|
|
|
4870
4921
|
import { existsSync as existsSync3, readFileSync as readFileSync4, unlinkSync } from "node:fs";
|
|
4871
4922
|
import { mkdir as mkdir4, readFile as readFile4, readdir as readdir3, writeFile as writeFile5 } from "node:fs/promises";
|
|
4872
4923
|
import { tmpdir } from "node:os";
|
|
4873
|
-
import { dirname as dirname2, join as join4, resolve as resolve2 } from "node:path";
|
|
4924
|
+
import { dirname as dirname2, join as join4, relative as relative2, resolve as resolve2 } from "node:path";
|
|
4925
|
+
function extractInputText2(input) {
|
|
4926
|
+
if (!input || input.length === 0) return "";
|
|
4927
|
+
if (input.length === 1) return input[0].content;
|
|
4928
|
+
return input.map((m) => `@[${m.role}]:
|
|
4929
|
+
${m.content}`).join("\n\n");
|
|
4930
|
+
}
|
|
4874
4931
|
function loadEnvFile(dir) {
|
|
4875
4932
|
let current = resolve2(dir);
|
|
4876
4933
|
while (true) {
|
|
@@ -4910,14 +4967,25 @@ var evalRunCommand2 = command({
|
|
|
4910
4967
|
type: optional(number),
|
|
4911
4968
|
long: "workers",
|
|
4912
4969
|
description: "Parallel workers for target invocation (default: all tests)"
|
|
4970
|
+
}),
|
|
4971
|
+
experiment: option({
|
|
4972
|
+
type: optional(string),
|
|
4973
|
+
long: "experiment",
|
|
4974
|
+
description: "Experiment label (e.g. with_skills, without_skills)"
|
|
4975
|
+
}),
|
|
4976
|
+
graderType: option({
|
|
4977
|
+
type: optional(oneOf(["code", "none"])),
|
|
4978
|
+
long: "grader-type",
|
|
4979
|
+
description: 'Which grading phase to run: "code" runs code-graders inline, omit to skip grading (use pipeline grade separately)'
|
|
4913
4980
|
})
|
|
4914
4981
|
},
|
|
4915
|
-
handler: async ({ evalPath, out, workers }) => {
|
|
4982
|
+
handler: async ({ evalPath, out, workers, experiment, graderType }) => {
|
|
4916
4983
|
const resolvedEvalPath = resolve2(evalPath);
|
|
4917
4984
|
const outDir = resolve2(out ?? buildDefaultRunDir(process.cwd()));
|
|
4918
4985
|
const repoRoot = await findRepoRoot(dirname2(resolvedEvalPath));
|
|
4919
4986
|
const evalDir = dirname2(resolvedEvalPath);
|
|
4920
|
-
const
|
|
4987
|
+
const category = deriveCategory(relative2(process.cwd(), resolvedEvalPath));
|
|
4988
|
+
const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
|
|
4921
4989
|
const tests = suite.tests;
|
|
4922
4990
|
if (tests.length === 0) {
|
|
4923
4991
|
console.error("No tests found in eval file.");
|
|
@@ -4958,15 +5026,13 @@ var evalRunCommand2 = command({
|
|
|
4958
5026
|
const testDir = join4(outDir, ...subpath);
|
|
4959
5027
|
await mkdir4(testDir, { recursive: true });
|
|
4960
5028
|
testIds.push(test.id);
|
|
4961
|
-
const inputText = test.question;
|
|
4962
5029
|
const inputMessages = test.input.map((m) => ({
|
|
4963
5030
|
role: m.role,
|
|
4964
5031
|
content: typeof m.content === "string" ? m.content : m.content
|
|
4965
5032
|
}));
|
|
4966
5033
|
await writeJson2(join4(testDir, "input.json"), {
|
|
4967
|
-
|
|
4968
|
-
|
|
4969
|
-
file_paths: test.file_paths,
|
|
5034
|
+
input: inputMessages,
|
|
5035
|
+
input_files: test.file_paths,
|
|
4970
5036
|
metadata: test.metadata ?? {}
|
|
4971
5037
|
});
|
|
4972
5038
|
if (targetInfo) {
|
|
@@ -4994,7 +5060,8 @@ var evalRunCommand2 = command({
|
|
|
4994
5060
|
}
|
|
4995
5061
|
await writeJson2(join4(outDir, "manifest.json"), {
|
|
4996
5062
|
eval_file: resolvedEvalPath,
|
|
4997
|
-
|
|
5063
|
+
dataset: evalSetName || void 0,
|
|
5064
|
+
experiment: experiment || void 0,
|
|
4998
5065
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
4999
5066
|
target: { name: targetName, kind: targetKind },
|
|
5000
5067
|
test_ids: testIds
|
|
@@ -5019,11 +5086,12 @@ var evalRunCommand2 = command({
|
|
|
5019
5086
|
const timeoutMs = invoke.timeout_ms ?? 12e4;
|
|
5020
5087
|
const promptFile = join4(tmpdir(), `agentv-prompt-${testId}-${Date.now()}.txt`);
|
|
5021
5088
|
const outputFile = join4(tmpdir(), `agentv-output-${testId}-${Date.now()}.txt`);
|
|
5022
|
-
|
|
5089
|
+
const inputText = extractInputText2(inputData.input);
|
|
5090
|
+
await writeFile5(promptFile, inputText, "utf8");
|
|
5023
5091
|
let rendered = template;
|
|
5024
5092
|
rendered = rendered.replace("{PROMPT_FILE}", promptFile);
|
|
5025
5093
|
rendered = rendered.replace("{OUTPUT_FILE}", outputFile);
|
|
5026
|
-
rendered = rendered.replace("{PROMPT}",
|
|
5094
|
+
rendered = rendered.replace("{PROMPT}", inputText);
|
|
5027
5095
|
const start = performance.now();
|
|
5028
5096
|
try {
|
|
5029
5097
|
execSync(rendered, {
|
|
@@ -5080,6 +5148,12 @@ var evalRunCommand2 = command({
|
|
|
5080
5148
|
} else {
|
|
5081
5149
|
console.log("Subagent-as-target mode \u2014 skipping CLI invocation.");
|
|
5082
5150
|
}
|
|
5151
|
+
if (graderType !== "code") {
|
|
5152
|
+
console.log(`
|
|
5153
|
+
Done. Results in ${outDir}`);
|
|
5154
|
+
console.log("To run code graders: agentv pipeline grade <run-dir> (or re-run with --grader-type code)");
|
|
5155
|
+
return;
|
|
5156
|
+
}
|
|
5083
5157
|
let totalGraders = 0;
|
|
5084
5158
|
let totalPassed = 0;
|
|
5085
5159
|
for (const testId of testIds) {
|
|
@@ -5100,14 +5174,13 @@ var evalRunCommand2 = command({
|
|
|
5100
5174
|
for (const graderFile of graderFiles) {
|
|
5101
5175
|
const graderConfig = JSON.parse(await readFile4(join4(codeGradersDir, graderFile), "utf8"));
|
|
5102
5176
|
const graderName = graderConfig.name;
|
|
5177
|
+
const inputText = extractInputText2(inputData.input);
|
|
5103
5178
|
const payload = JSON.stringify({
|
|
5104
5179
|
output: [{ role: "assistant", content: responseText }],
|
|
5105
|
-
input: inputData.
|
|
5106
|
-
question: inputData.input_text,
|
|
5180
|
+
input: inputData.input,
|
|
5107
5181
|
criteria: "",
|
|
5108
5182
|
expected_output: [],
|
|
5109
|
-
|
|
5110
|
-
input_files: [],
|
|
5183
|
+
input_files: inputData.input_files ?? [],
|
|
5111
5184
|
trace: null,
|
|
5112
5185
|
token_usage: null,
|
|
5113
5186
|
cost_usd: null,
|
|
@@ -5117,8 +5190,8 @@ var evalRunCommand2 = command({
|
|
|
5117
5190
|
file_changes: null,
|
|
5118
5191
|
workspace_path: null,
|
|
5119
5192
|
config: graderConfig.config ?? null,
|
|
5120
|
-
metadata: {},
|
|
5121
|
-
input_text:
|
|
5193
|
+
metadata: inputData.metadata ?? {},
|
|
5194
|
+
input_text: inputText,
|
|
5122
5195
|
output_text: responseText,
|
|
5123
5196
|
expected_output_text: ""
|
|
5124
5197
|
});
|
|
@@ -5306,7 +5379,7 @@ function toRawResult(result) {
|
|
|
5306
5379
|
return {
|
|
5307
5380
|
timestamp: result.timestamp,
|
|
5308
5381
|
test_id: result.testId,
|
|
5309
|
-
|
|
5382
|
+
dataset: result.dataset,
|
|
5310
5383
|
conversation_id: result.conversationId,
|
|
5311
5384
|
score: result.score,
|
|
5312
5385
|
assertions: result.assertions?.map((assertion) => ({
|
|
@@ -5429,7 +5502,7 @@ function loadOtlpTraceFile(filePath) {
|
|
|
5429
5502
|
}
|
|
5430
5503
|
return {
|
|
5431
5504
|
test_id: stringAttr(rootAttrs.agentv_test_id) ?? stringAttr(rootAttrs.agentv_eval_id) ?? `trace-${index + 1}`,
|
|
5432
|
-
|
|
5505
|
+
dataset: stringAttr(rootAttrs.agentv_dataset),
|
|
5433
5506
|
target: stringAttr(rootAttrs.agentv_target),
|
|
5434
5507
|
score,
|
|
5435
5508
|
error: root.status?.code === 2 ? root.status.message : void 0,
|
|
@@ -6173,8 +6246,9 @@ var resultsCommand = subcommands({
|
|
|
6173
6246
|
});
|
|
6174
6247
|
|
|
6175
6248
|
// src/commands/results/serve.ts
|
|
6176
|
-
import { existsSync as existsSync7, readFileSync as readFileSync8, writeFileSync as writeFileSync3 } from "node:fs";
|
|
6249
|
+
import { existsSync as existsSync7, readFileSync as readFileSync8, readdirSync as readdirSync3, statSync as statSync4, writeFileSync as writeFileSync3 } from "node:fs";
|
|
6177
6250
|
import path9 from "node:path";
|
|
6251
|
+
import { fileURLToPath as fileURLToPath2 } from "node:url";
|
|
6178
6252
|
import { Hono } from "hono";
|
|
6179
6253
|
function feedbackPath(resultDir) {
|
|
6180
6254
|
return path9.join(resultDir, "feedback.json");
|
|
@@ -6195,24 +6269,46 @@ function writeFeedback(cwd, data) {
|
|
|
6195
6269
|
writeFileSync3(feedbackPath(cwd), `${JSON.stringify(data, null, 2)}
|
|
6196
6270
|
`, "utf8");
|
|
6197
6271
|
}
|
|
6198
|
-
function createApp(results, resultDir, cwd, sourceFile) {
|
|
6272
|
+
function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
6199
6273
|
const searchDir = cwd ?? resultDir;
|
|
6200
6274
|
const app2 = new Hono();
|
|
6275
|
+
const studioDistPath = options?.studioDir ?? resolveStudioDistDir();
|
|
6276
|
+
if (!studioDistPath || !existsSync7(path9.join(studioDistPath, "index.html"))) {
|
|
6277
|
+
throw new Error('Studio dist not found. Run "bun run build" in apps/studio/ to build the SPA.');
|
|
6278
|
+
}
|
|
6201
6279
|
app2.get("/", (c3) => {
|
|
6202
|
-
|
|
6280
|
+
const indexPath = path9.join(studioDistPath, "index.html");
|
|
6281
|
+
if (existsSync7(indexPath)) {
|
|
6282
|
+
return c3.html(readFileSync8(indexPath, "utf8"));
|
|
6283
|
+
}
|
|
6284
|
+
return c3.notFound();
|
|
6203
6285
|
});
|
|
6204
6286
|
app2.get("/api/runs", (c3) => {
|
|
6205
6287
|
const metas = listResultFiles(searchDir);
|
|
6206
6288
|
return c3.json({
|
|
6207
|
-
runs: metas.map((m) =>
|
|
6208
|
-
|
|
6209
|
-
|
|
6210
|
-
|
|
6211
|
-
|
|
6212
|
-
|
|
6213
|
-
|
|
6214
|
-
|
|
6215
|
-
|
|
6289
|
+
runs: metas.map((m) => {
|
|
6290
|
+
let target;
|
|
6291
|
+
let experiment;
|
|
6292
|
+
try {
|
|
6293
|
+
const records = loadLightweightResults(m.path);
|
|
6294
|
+
if (records.length > 0) {
|
|
6295
|
+
target = records[0].target;
|
|
6296
|
+
experiment = records[0].experiment;
|
|
6297
|
+
}
|
|
6298
|
+
} catch {
|
|
6299
|
+
}
|
|
6300
|
+
return {
|
|
6301
|
+
filename: m.filename,
|
|
6302
|
+
path: m.path,
|
|
6303
|
+
timestamp: m.timestamp,
|
|
6304
|
+
test_count: m.testCount,
|
|
6305
|
+
pass_rate: m.passRate,
|
|
6306
|
+
avg_score: m.avgScore,
|
|
6307
|
+
size_bytes: m.sizeBytes,
|
|
6308
|
+
...target && { target },
|
|
6309
|
+
...experiment && { experiment }
|
|
6310
|
+
};
|
|
6311
|
+
})
|
|
6216
6312
|
});
|
|
6217
6313
|
});
|
|
6218
6314
|
app2.get("/api/runs/:filename", (c3) => {
|
|
@@ -6272,692 +6368,406 @@ function createApp(results, resultDir, cwd, sourceFile) {
|
|
|
6272
6368
|
writeFeedback(resultDir, existing);
|
|
6273
6369
|
return c3.json(existing);
|
|
6274
6370
|
});
|
|
6275
|
-
|
|
6276
|
-
|
|
6277
|
-
|
|
6278
|
-
|
|
6279
|
-
|
|
6280
|
-
|
|
6281
|
-
|
|
6282
|
-
|
|
6283
|
-
|
|
6284
|
-
|
|
6285
|
-
|
|
6286
|
-
|
|
6371
|
+
app2.get("/api/runs/:filename/datasets", (c3) => {
|
|
6372
|
+
const filename = c3.req.param("filename");
|
|
6373
|
+
const metas = listResultFiles(searchDir);
|
|
6374
|
+
const meta = metas.find((m) => m.filename === filename);
|
|
6375
|
+
if (!meta) {
|
|
6376
|
+
return c3.json({ error: "Run not found" }, 404);
|
|
6377
|
+
}
|
|
6378
|
+
try {
|
|
6379
|
+
const loaded = patchTestIds(loadManifestResults(meta.path));
|
|
6380
|
+
const datasetMap = /* @__PURE__ */ new Map();
|
|
6381
|
+
for (const r of loaded) {
|
|
6382
|
+
const ds = r.dataset ?? r.target ?? "default";
|
|
6383
|
+
const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
|
|
6384
|
+
entry.total++;
|
|
6385
|
+
if (r.score >= 1) entry.passed++;
|
|
6386
|
+
entry.scoreSum += r.score;
|
|
6387
|
+
datasetMap.set(ds, entry);
|
|
6388
|
+
}
|
|
6389
|
+
const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
|
|
6390
|
+
name,
|
|
6391
|
+
total: entry.total,
|
|
6392
|
+
passed: entry.passed,
|
|
6393
|
+
failed: entry.total - entry.passed,
|
|
6394
|
+
avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0
|
|
6395
|
+
}));
|
|
6396
|
+
return c3.json({ datasets });
|
|
6397
|
+
} catch {
|
|
6398
|
+
return c3.json({ error: "Failed to load datasets" }, 500);
|
|
6399
|
+
}
|
|
6287
6400
|
});
|
|
6288
|
-
|
|
6289
|
-
|
|
6290
|
-
|
|
6291
|
-
|
|
6292
|
-
|
|
6293
|
-
|
|
6294
|
-
const dataJson = JSON.stringify(lightResults).replace(/</g, "\\u003c").replace(/>/g, "\\u003e").replace(/\u2028/g, "\\u2028").replace(/\u2029/g, "\\u2029");
|
|
6295
|
-
return `<!DOCTYPE html>
|
|
6296
|
-
<html lang="en">
|
|
6297
|
-
<head>
|
|
6298
|
-
<meta charset="utf-8">
|
|
6299
|
-
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
6300
|
-
<title>AgentV Results Review</title>
|
|
6301
|
-
<style>
|
|
6302
|
-
${SERVE_STYLES}
|
|
6303
|
-
</style>
|
|
6304
|
-
</head>
|
|
6305
|
-
<body>
|
|
6306
|
-
<header class="header">
|
|
6307
|
-
<div class="header-left">
|
|
6308
|
-
<h1 class="header-title">AgentV</h1>
|
|
6309
|
-
<span class="header-subtitle">Results Review</span>
|
|
6310
|
-
</div>
|
|
6311
|
-
<div class="header-center">
|
|
6312
|
-
<select id="run-picker" class="run-picker" title="Switch result file">
|
|
6313
|
-
<option value="">Loading runs...</option>
|
|
6314
|
-
</select>
|
|
6315
|
-
</div>
|
|
6316
|
-
<div class="header-right">
|
|
6317
|
-
<span class="timestamp">${escapeHtml((/* @__PURE__ */ new Date()).toISOString())}</span>
|
|
6318
|
-
</div>
|
|
6319
|
-
</header>
|
|
6320
|
-
<nav class="tabs" id="tabs">
|
|
6321
|
-
<button class="tab active" data-tab="overview">Overview</button>
|
|
6322
|
-
<button class="tab" data-tab="tests">Test Cases</button>
|
|
6323
|
-
</nav>
|
|
6324
|
-
<main id="app"></main>
|
|
6325
|
-
<script>
|
|
6326
|
-
var DATA = ${dataJson};
|
|
6327
|
-
var INITIAL_SOURCE = ${sourceFile ? JSON.stringify(path9.basename(sourceFile)).replace(/</g, "\\u003c").replace(/>/g, "\\u003e") : "null"};
|
|
6328
|
-
${SERVE_SCRIPT}
|
|
6329
|
-
</script>
|
|
6330
|
-
</body>
|
|
6331
|
-
</html>`;
|
|
6332
|
-
}
|
|
6333
|
-
var SERVE_STYLES = `
|
|
6334
|
-
*{margin:0;padding:0;box-sizing:border-box}
|
|
6335
|
-
:root{
|
|
6336
|
-
--bg:#f6f8fa;--surface:#fff;--border:#d0d7de;--border-light:#e8ebee;
|
|
6337
|
-
--text:#1f2328;--text-muted:#656d76;
|
|
6338
|
-
--primary:#0969da;--primary-bg:#ddf4ff;
|
|
6339
|
-
--success:#1a7f37;--success-bg:#dafbe1;
|
|
6340
|
-
--danger:#cf222e;--danger-bg:#ffebe9;
|
|
6341
|
-
--warning:#9a6700;--warning-bg:#fff8c5;
|
|
6342
|
-
--radius:6px;
|
|
6343
|
-
--shadow:0 1px 3px rgba(31,35,40,.04),0 1px 2px rgba(31,35,40,.06);
|
|
6344
|
-
--font:-apple-system,BlinkMacSystemFont,"Segoe UI","Noto Sans",Helvetica,Arial,sans-serif;
|
|
6345
|
-
--mono:ui-monospace,SFMono-Regular,"SF Mono",Menlo,Consolas,monospace;
|
|
6346
|
-
}
|
|
6347
|
-
body{font-family:var(--font);background:var(--bg);color:var(--text);line-height:1.5;font-size:14px}
|
|
6348
|
-
|
|
6349
|
-
/* Header */
|
|
6350
|
-
.header{background:var(--surface);border-bottom:1px solid var(--border);padding:12px 24px;display:flex;align-items:center;justify-content:space-between}
|
|
6351
|
-
.header-left{display:flex;align-items:baseline;gap:12px}
|
|
6352
|
-
.header-title{font-size:18px;font-weight:600}
|
|
6353
|
-
.header-subtitle{font-size:14px;color:var(--text-muted)}
|
|
6354
|
-
.header-center{flex:1;display:flex;justify-content:center;padding:0 16px}
|
|
6355
|
-
.run-picker{padding:6px 10px;border:1px solid var(--border);border-radius:var(--radius);font-size:13px;background:var(--surface);color:var(--text);font-family:var(--font);max-width:400px;width:100%;cursor:pointer}
|
|
6356
|
-
.run-picker:hover{border-color:var(--primary)}
|
|
6357
|
-
.run-picker:focus{outline:none;border-color:var(--primary);box-shadow:0 0 0 3px var(--primary-bg)}
|
|
6358
|
-
.timestamp{font-size:12px;color:var(--text-muted);font-family:var(--mono)}
|
|
6359
|
-
|
|
6360
|
-
/* Tabs */
|
|
6361
|
-
.tabs{background:var(--surface);border-bottom:1px solid var(--border);padding:0 24px;display:flex}
|
|
6362
|
-
.tab{background:none;border:none;padding:10px 16px;font-size:14px;color:var(--text-muted);cursor:pointer;border-bottom:2px solid transparent;font-family:var(--font);transition:color .15s,border-color .15s}
|
|
6363
|
-
.tab:hover{color:var(--text)}
|
|
6364
|
-
.tab.active{color:var(--text);font-weight:600;border-bottom-color:var(--primary)}
|
|
6365
|
-
|
|
6366
|
-
#app{max-width:1280px;margin:0 auto;padding:24px}
|
|
6367
|
-
|
|
6368
|
-
/* Stat cards */
|
|
6369
|
-
.stats-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));gap:12px;margin-bottom:24px}
|
|
6370
|
-
.stat-card{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:16px;text-align:center;box-shadow:var(--shadow)}
|
|
6371
|
-
.stat-card.pass .stat-value{color:var(--success)}
|
|
6372
|
-
.stat-card.fail .stat-value{color:var(--danger)}
|
|
6373
|
-
.stat-card.error .stat-value{color:var(--danger)}
|
|
6374
|
-
.stat-card.warn .stat-value{color:var(--warning)}
|
|
6375
|
-
.stat-card.total .stat-value{color:var(--primary)}
|
|
6376
|
-
.stat-value{font-size:28px;font-weight:700;line-height:1.2}
|
|
6377
|
-
.stat-label{font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.5px;margin-top:4px}
|
|
6378
|
-
|
|
6379
|
-
/* Sections */
|
|
6380
|
-
.section{margin-bottom:24px}
|
|
6381
|
-
.section-title{font-size:16px;font-weight:600;margin-bottom:12px}
|
|
6382
|
-
|
|
6383
|
-
/* Tables */
|
|
6384
|
-
.table-wrap{overflow-x:auto;background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);box-shadow:var(--shadow)}
|
|
6385
|
-
.data-table{width:100%;border-collapse:collapse;font-size:13px}
|
|
6386
|
-
.data-table th{background:var(--bg);border-bottom:1px solid var(--border);padding:8px 12px;text-align:left;font-weight:600;font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.3px;white-space:nowrap}
|
|
6387
|
-
.data-table th.sortable{cursor:pointer;user-select:none}
|
|
6388
|
-
.data-table th.sortable:hover{color:var(--text)}
|
|
6389
|
-
.data-table td{padding:8px 12px;border-bottom:1px solid var(--border-light);vertical-align:middle}
|
|
6390
|
-
.data-table tbody tr:last-child td{border-bottom:none}
|
|
6391
|
-
|
|
6392
|
-
/* Status icons */
|
|
6393
|
-
.status-icon{display:inline-flex;align-items:center;justify-content:center;width:22px;height:22px;border-radius:50%;font-size:12px;font-weight:700}
|
|
6394
|
-
.status-icon.pass{background:var(--success-bg);color:var(--success)}
|
|
6395
|
-
.status-icon.fail{background:var(--danger-bg);color:var(--danger)}
|
|
6396
|
-
.status-icon.error{background:var(--warning-bg);color:var(--warning)}
|
|
6397
|
-
|
|
6398
|
-
/* Score colors */
|
|
6399
|
-
.score-high{color:var(--success);font-weight:600}
|
|
6400
|
-
.score-mid{color:var(--warning);font-weight:600}
|
|
6401
|
-
.score-low{color:var(--danger);font-weight:600}
|
|
6402
|
-
|
|
6403
|
-
/* Pass-rate bar */
|
|
6404
|
-
.bar-bg{width:100px;height:8px;background:var(--border-light);border-radius:4px;overflow:hidden}
|
|
6405
|
-
.bar-fill{height:100%;border-radius:4px;transition:width .3s}
|
|
6406
|
-
.bar-fill.score-high{background:var(--success)}
|
|
6407
|
-
.bar-fill.score-mid{background:var(--warning)}
|
|
6408
|
-
.bar-fill.score-low{background:var(--danger)}
|
|
6409
|
-
|
|
6410
|
-
/* Histogram */
|
|
6411
|
-
.histogram{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:16px;box-shadow:var(--shadow)}
|
|
6412
|
-
.hist-row{display:flex;align-items:center;gap:12px;margin-bottom:8px}
|
|
6413
|
-
.hist-row:last-child{margin-bottom:0}
|
|
6414
|
-
.hist-label{width:60px;font-size:12px;color:var(--text-muted);text-align:right;flex-shrink:0}
|
|
6415
|
-
.hist-bar-bg{flex:1;height:20px;background:var(--border-light);border-radius:3px;overflow:hidden}
|
|
6416
|
-
.hist-bar{height:100%;border-radius:3px;transition:width .3s}
|
|
6417
|
-
.hist-count{width:30px;font-size:12px;color:var(--text-muted);text-align:right;flex-shrink:0}
|
|
6418
|
-
|
|
6419
|
-
/* Filters */
|
|
6420
|
-
.filter-bar{display:flex;gap:8px;margin-bottom:16px;align-items:center;flex-wrap:wrap}
|
|
6421
|
-
.filter-select,.filter-search{padding:6px 10px;border:1px solid var(--border);border-radius:var(--radius);font-size:13px;background:var(--surface);color:var(--text);font-family:var(--font)}
|
|
6422
|
-
.filter-search{flex:1;min-width:200px}
|
|
6423
|
-
.filter-count{font-size:12px;color:var(--text-muted);margin-left:auto}
|
|
6424
|
-
|
|
6425
|
-
/* Test rows */
|
|
6426
|
-
.test-row{cursor:pointer;transition:background .1s}
|
|
6427
|
-
.test-row:hover{background:var(--bg)!important}
|
|
6428
|
-
.test-row.expanded{background:var(--primary-bg)!important}
|
|
6429
|
-
.expand-col{width:32px;text-align:center}
|
|
6430
|
-
.expand-icon{color:var(--text-muted);font-size:12px}
|
|
6431
|
-
.fw-medium{font-weight:500}
|
|
6432
|
-
.text-pass{color:var(--success)}.text-fail{color:var(--danger)}.text-error{color:var(--warning)}
|
|
6433
|
-
|
|
6434
|
-
/* Detail panel */
|
|
6435
|
-
.detail-row td{padding:0!important;background:var(--bg)!important}
|
|
6436
|
-
.detail-panel{padding:16px 24px}
|
|
6437
|
-
.detail-grid{display:grid;grid-template-columns:1fr 1fr;gap:16px;margin-bottom:16px}
|
|
6438
|
-
.detail-block h4{font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.3px;margin-bottom:6px}
|
|
6439
|
-
.detail-pre{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:12px;font-family:var(--mono);font-size:12px;white-space:pre-wrap;word-break:break-word;max-height:300px;overflow-y:auto;line-height:1.6}
|
|
6440
|
-
.detail-panel h4{font-size:13px;font-weight:600;margin:16px 0 8px}
|
|
6441
|
-
.eval-table{width:100%;border-collapse:collapse;font-size:13px;background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);margin-bottom:12px}
|
|
6442
|
-
.eval-table th{background:var(--bg);padding:6px 10px;text-align:left;font-size:11px;font-weight:600;color:var(--text-muted);text-transform:uppercase;border-bottom:1px solid var(--border)}
|
|
6443
|
-
.eval-table td{padding:8px 10px;border-bottom:1px solid var(--border-light)}
|
|
6444
|
-
.reasoning-cell{max-width:500px;font-size:12px;color:var(--text-muted)}
|
|
6445
|
-
.expect-list{list-style:none;padding:0;margin-bottom:12px}
|
|
6446
|
-
.expect-list li{padding:4px 8px 4px 24px;position:relative;font-size:13px}
|
|
6447
|
-
.expect-list.pass li::before{content:"\\2713";position:absolute;left:4px;color:var(--success);font-weight:700}
|
|
6448
|
-
.expect-list.fail li::before{content:"\\2717";position:absolute;left:4px;color:var(--danger);font-weight:700}
|
|
6449
|
-
.error-box{background:var(--danger-bg);border:1px solid var(--danger);border-radius:var(--radius);padding:12px;margin-bottom:12px}
|
|
6450
|
-
.error-box h4{color:var(--danger);margin:0 0 6px}
|
|
6451
|
-
.error-box pre{font-family:var(--mono);font-size:12px;white-space:pre-wrap;word-break:break-word}
|
|
6452
|
-
.detail-meta{font-size:12px;color:var(--text-muted);margin-top:12px;padding-top:12px;border-top:1px solid var(--border-light)}
|
|
6453
|
-
.tool-calls{display:flex;flex-wrap:wrap;gap:6px;margin-bottom:12px}
|
|
6454
|
-
.tool-tag{display:inline-block;padding:2px 10px;font-size:12px;font-family:var(--mono);background:var(--primary-bg);color:var(--primary);border:1px solid var(--border);border-radius:12px}
|
|
6455
|
-
.empty-state{text-align:center;padding:48px 24px;color:var(--text-muted)}
|
|
6456
|
-
.empty-state h3{font-size:16px;margin-bottom:8px;color:var(--text)}
|
|
6457
|
-
.welcome-state{text-align:center;padding:80px 24px;color:var(--text-muted)}
|
|
6458
|
-
.welcome-state h2{font-size:24px;margin-bottom:12px;color:var(--text);font-weight:600}
|
|
6459
|
-
.welcome-state p{font-size:15px;margin-bottom:8px;max-width:500px;margin-left:auto;margin-right:auto}
|
|
6460
|
-
.welcome-state code{font-family:var(--mono);background:var(--surface);border:1px solid var(--border);border-radius:3px;padding:2px 6px;font-size:13px}
|
|
6461
|
-
.welcome-state .hint{margin-top:24px;font-size:13px;color:var(--text-muted)}
|
|
6462
|
-
|
|
6463
|
-
/* Feedback */
|
|
6464
|
-
.feedback-section{margin-top:16px;padding-top:16px;border-top:1px solid var(--border-light)}
|
|
6465
|
-
.feedback-input{width:100%;min-height:80px;padding:8px 12px;border:1px solid var(--border);border-radius:var(--radius);font-family:var(--font);font-size:13px;resize:vertical;background:var(--surface);color:var(--text)}
|
|
6466
|
-
.feedback-input:focus{outline:none;border-color:var(--primary);box-shadow:0 0 0 3px var(--primary-bg)}
|
|
6467
|
-
.feedback-submit{margin-top:8px;padding:6px 16px;background:var(--primary);color:#fff;border:none;border-radius:var(--radius);font-size:13px;cursor:pointer;font-family:var(--font)}
|
|
6468
|
-
.feedback-submit:hover{opacity:.9}
|
|
6469
|
-
.feedback-submit:disabled{opacity:.5;cursor:default}
|
|
6470
|
-
.feedback-status{margin-left:8px;font-size:12px;color:var(--success)}
|
|
6471
|
-
`;
|
|
6472
|
-
var SERVE_SCRIPT = `
|
|
6473
|
-
(function(){
|
|
6474
|
-
/* ---- helpers ---- */
|
|
6475
|
-
function esc(s){
|
|
6476
|
-
if(s==null)return"";
|
|
6477
|
-
return String(s).replace(/&/g,"&").replace(/</g,"<").replace(/>/g,">").replace(/"/g,""");
|
|
6478
|
-
}
|
|
6479
|
-
function getStatus(r){
|
|
6480
|
-
if(r.executionStatus==="execution_error")return"error";
|
|
6481
|
-
if(r.executionStatus==="quality_failure")return"fail";
|
|
6482
|
-
if(r.executionStatus==="ok")return"pass";
|
|
6483
|
-
if(r.error)return"error";
|
|
6484
|
-
return r.score>=0.5?"pass":"fail";
|
|
6485
|
-
}
|
|
6486
|
-
function sIcon(s){
|
|
6487
|
-
if(s==="pass")return'<span class="status-icon pass">\\u2713</span>';
|
|
6488
|
-
if(s==="fail")return'<span class="status-icon fail">\\u2717</span>';
|
|
6489
|
-
return'<span class="status-icon error">!</span>';
|
|
6490
|
-
}
|
|
6491
|
-
function fmtDur(ms){
|
|
6492
|
-
if(ms==null)return"\\u2014";
|
|
6493
|
-
if(ms<1000)return ms+"ms";
|
|
6494
|
-
if(ms<60000)return(ms/1000).toFixed(1)+"s";
|
|
6495
|
-
return Math.floor(ms/60000)+"m "+Math.round((ms%60000)/1000)+"s";
|
|
6496
|
-
}
|
|
6497
|
-
function fmtTok(n){
|
|
6498
|
-
if(n==null)return"\\u2014";
|
|
6499
|
-
if(n>=1e6)return(n/1e6).toFixed(1)+"M";
|
|
6500
|
-
if(n>=1e3)return(n/1e3).toFixed(1)+"K";
|
|
6501
|
-
return String(n);
|
|
6502
|
-
}
|
|
6503
|
-
function fmtCost(u){if(u==null)return"\\u2014";if(u<0.01)return"<$0.01";return"$"+u.toFixed(2);}
|
|
6504
|
-
function fmtPct(v){if(v==null)return"\\u2014";return(v*100).toFixed(1)+"%";}
|
|
6505
|
-
function sCls(v){if(v==null)return"";if(v>=0.9)return"score-high";if(v>=0.5)return"score-mid";return"score-low";}
|
|
6506
|
-
|
|
6507
|
-
/* ---- feedback state ---- */
|
|
6508
|
-
var feedbackCache={};
|
|
6509
|
-
|
|
6510
|
-
function loadFeedback(){
|
|
6511
|
-
fetch("/api/feedback").then(function(r){return r.json();}).then(function(d){
|
|
6512
|
-
if(d&&d.reviews){
|
|
6513
|
-
for(var i=0;i<d.reviews.length;i++){
|
|
6514
|
-
feedbackCache[d.reviews[i].test_id]=d.reviews[i].comment;
|
|
6515
|
-
}
|
|
6516
|
-
populateFeedbackTextareas();
|
|
6517
|
-
}
|
|
6518
|
-
}).catch(function(){});
|
|
6519
|
-
}
|
|
6520
|
-
|
|
6521
|
-
function populateFeedbackTextareas(){
|
|
6522
|
-
var areas=document.querySelectorAll(".feedback-input");
|
|
6523
|
-
for(var i=0;i<areas.length;i++){
|
|
6524
|
-
var tid=areas[i].getAttribute("data-test-id");
|
|
6525
|
-
if(tid&&feedbackCache[tid]!=null){
|
|
6526
|
-
areas[i].value=feedbackCache[tid];
|
|
6527
|
-
}
|
|
6401
|
+
app2.get("/api/runs/:filename/categories", (c3) => {
|
|
6402
|
+
const filename = c3.req.param("filename");
|
|
6403
|
+
const metas = listResultFiles(searchDir);
|
|
6404
|
+
const meta = metas.find((m) => m.filename === filename);
|
|
6405
|
+
if (!meta) {
|
|
6406
|
+
return c3.json({ error: "Run not found" }, 404);
|
|
6528
6407
|
}
|
|
6529
|
-
|
|
6530
|
-
|
|
6531
|
-
|
|
6532
|
-
|
|
6533
|
-
|
|
6534
|
-
|
|
6535
|
-
|
|
6536
|
-
|
|
6537
|
-
|
|
6538
|
-
|
|
6539
|
-
|
|
6540
|
-
|
|
6541
|
-
|
|
6542
|
-
|
|
6543
|
-
|
|
6544
|
-
|
|
6545
|
-
|
|
6546
|
-
|
|
6547
|
-
|
|
6548
|
-
|
|
6549
|
-
|
|
6550
|
-
|
|
6551
|
-
|
|
6552
|
-
|
|
6553
|
-
|
|
6554
|
-
|
|
6555
|
-
|
|
6556
|
-
|
|
6557
|
-
if(s==="pass")p++;else if(s==="fail")f++;else e++;
|
|
6558
|
-
if(r.durationMs)dur+=r.durationMs;
|
|
6559
|
-
if(r.tokenUsage){ti+=(r.tokenUsage.input||0);to+=(r.tokenUsage.output||0);}
|
|
6560
|
-
if(r.costUsd)cost+=r.costUsd;
|
|
6561
|
-
if(s!=="error")sc.push(r.score);
|
|
6562
|
-
if(r._toolCalls){for(var k in r._toolCalls)tc+=r._toolCalls[k];}
|
|
6563
|
-
}
|
|
6564
|
-
var g=t-e;
|
|
6565
|
-
return{total:t,passed:p,failed:f,errors:e,passRate:g>0?p/g:0,dur:dur,tokens:ti+to,inTok:ti,outTok:to,cost:cost,scores:sc,toolCalls:tc};
|
|
6566
|
-
}
|
|
6567
|
-
function computeTargets(d){
|
|
6568
|
-
var m={};
|
|
6569
|
-
for(var i=0;i<d.length;i++){
|
|
6570
|
-
var r=d[i],tgt=r.target||"unknown";
|
|
6571
|
-
if(!m[tgt])m[tgt]={target:tgt,results:[],p:0,f:0,e:0,ts:0,sc:0,dur:0,tok:0,cost:0};
|
|
6572
|
-
var o=m[tgt];o.results.push(r);
|
|
6573
|
-
var s=getStatus(r);
|
|
6574
|
-
if(s==="pass")o.p++;else if(s==="fail")o.f++;else o.e++;
|
|
6575
|
-
if(s!=="error"){o.ts+=r.score;o.sc++;}
|
|
6576
|
-
if(r.durationMs)o.dur+=r.durationMs;
|
|
6577
|
-
if(r.tokenUsage)o.tok+=(r.tokenUsage.input||0)+(r.tokenUsage.output||0);
|
|
6578
|
-
if(r.costUsd)o.cost+=r.costUsd;
|
|
6579
|
-
}
|
|
6580
|
-
var a=[];for(var k in m)a.push(m[k]);return a;
|
|
6581
|
-
}
|
|
6582
|
-
function getEvalNames(){
|
|
6583
|
-
var n={};
|
|
6584
|
-
for(var i=0;i<DATA.length;i++){
|
|
6585
|
-
var sc=DATA[i].scores;
|
|
6586
|
-
if(sc)for(var j=0;j<sc.length;j++)n[sc[j].name]=true;
|
|
6587
|
-
}
|
|
6588
|
-
return Object.keys(n);
|
|
6589
|
-
}
|
|
6590
|
-
function getEvalScore(r,name){
|
|
6591
|
-
if(!r.scores)return null;
|
|
6592
|
-
for(var i=0;i<r.scores.length;i++)if(r.scores[i].name===name)return r.scores[i].score;
|
|
6593
|
-
return null;
|
|
6594
|
-
}
|
|
6595
|
-
|
|
6596
|
-
var stats=computeStats(DATA);
|
|
6597
|
-
var tgtStats=computeTargets(DATA);
|
|
6598
|
-
var tgtNames=tgtStats.map(function(t){return t.target;});
|
|
6599
|
-
|
|
6600
|
-
/* ---- state ---- */
|
|
6601
|
-
var state={tab:"overview",filter:{status:"all",target:"all",search:""},sort:{col:"testId",dir:"asc"},expanded:{}};
|
|
6602
|
-
|
|
6603
|
-
/* ---- DOM refs ---- */
|
|
6604
|
-
var app=document.getElementById("app");
|
|
6605
|
-
var tabBtns=document.querySelectorAll(".tab");
|
|
6606
|
-
|
|
6607
|
-
/* ---- tabs ---- */
|
|
6608
|
-
function setTab(t){
|
|
6609
|
-
state.tab=t;
|
|
6610
|
-
for(var i=0;i<tabBtns.length;i++)tabBtns[i].classList.toggle("active",tabBtns[i].getAttribute("data-tab")===t);
|
|
6611
|
-
render();
|
|
6612
|
-
}
|
|
6613
|
-
for(var i=0;i<tabBtns.length;i++){
|
|
6614
|
-
tabBtns[i].addEventListener("click",(function(b){return function(){setTab(b.getAttribute("data-tab"));};})(tabBtns[i]));
|
|
6615
|
-
}
|
|
6616
|
-
|
|
6617
|
-
/* ---- render ---- */
|
|
6618
|
-
function render(){
|
|
6619
|
-
if(DATA.length===0){
|
|
6620
|
-
app.innerHTML='<div class="welcome-state">'
|
|
6621
|
-
+'<h2>No results yet</h2>'
|
|
6622
|
-
+'<p>Run an evaluation or mount a results directory to see results here.</p>'
|
|
6623
|
-
+'<p><code>agentv eval <eval-file></code></p>'
|
|
6624
|
-
+'<p class="hint">The dashboard will automatically detect new result files.</p>'
|
|
6625
|
-
+'</div>';
|
|
6626
|
-
return;
|
|
6408
|
+
try {
|
|
6409
|
+
const loaded = patchTestIds(loadManifestResults(meta.path));
|
|
6410
|
+
const categoryMap = /* @__PURE__ */ new Map();
|
|
6411
|
+
for (const r of loaded) {
|
|
6412
|
+
const cat = r.category ?? DEFAULT_CATEGORY;
|
|
6413
|
+
const entry = categoryMap.get(cat) ?? {
|
|
6414
|
+
total: 0,
|
|
6415
|
+
passed: 0,
|
|
6416
|
+
scoreSum: 0,
|
|
6417
|
+
datasets: /* @__PURE__ */ new Set()
|
|
6418
|
+
};
|
|
6419
|
+
entry.total++;
|
|
6420
|
+
if (r.score >= 1) entry.passed++;
|
|
6421
|
+
entry.scoreSum += r.score;
|
|
6422
|
+
entry.datasets.add(r.dataset ?? r.target ?? "default");
|
|
6423
|
+
categoryMap.set(cat, entry);
|
|
6424
|
+
}
|
|
6425
|
+
const categories = [...categoryMap.entries()].map(([name, entry]) => ({
|
|
6426
|
+
name,
|
|
6427
|
+
total: entry.total,
|
|
6428
|
+
passed: entry.passed,
|
|
6429
|
+
failed: entry.total - entry.passed,
|
|
6430
|
+
avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0,
|
|
6431
|
+
dataset_count: entry.datasets.size
|
|
6432
|
+
}));
|
|
6433
|
+
return c3.json({ categories });
|
|
6434
|
+
} catch {
|
|
6435
|
+
return c3.json({ error: "Failed to load categories" }, 500);
|
|
6627
6436
|
}
|
|
6628
|
-
|
|
6629
|
-
|
|
6630
|
-
|
|
6631
|
-
|
|
6632
|
-
|
|
6633
|
-
|
|
6634
|
-
|
|
6635
|
-
|
|
6636
|
-
/* ---- overview ---- */
|
|
6637
|
-
function renderOverview(){
|
|
6638
|
-
var h='<div class="stats-grid">';
|
|
6639
|
-
h+=card("Total Tests",stats.total,"total");
|
|
6640
|
-
h+=card("Passed",stats.passed,"pass");
|
|
6641
|
-
h+=card("Failed",stats.failed,"fail");
|
|
6642
|
-
h+=card("Errors",stats.errors,"error");
|
|
6643
|
-
var prCls=stats.passRate>=0.9?"pass":stats.passRate>=0.5?"warn":"fail";
|
|
6644
|
-
h+=card("Pass Rate",fmtPct(stats.passRate),prCls);
|
|
6645
|
-
h+=card("Duration",fmtDur(stats.dur),"neutral");
|
|
6646
|
-
h+=card("Tokens",fmtTok(stats.tokens),"neutral");
|
|
6647
|
-
h+=card("Est. Cost",fmtCost(stats.cost),"neutral");
|
|
6648
|
-
if(stats.toolCalls>0)h+=card("Tool Calls",fmtTok(stats.toolCalls),"neutral");
|
|
6649
|
-
h+="</div>";
|
|
6650
|
-
|
|
6651
|
-
/* targets table */
|
|
6652
|
-
if(tgtStats.length>1){
|
|
6653
|
-
h+='<div class="section"><h2 class="section-title">Targets</h2><div class="table-wrap"><table class="data-table">';
|
|
6654
|
-
h+="<thead><tr><th>Target</th><th>Pass Rate</th><th></th><th>Passed</th><th>Failed</th><th>Errors</th><th>Avg Score</th><th>Duration</th><th>Tokens</th><th>Cost</th></tr></thead><tbody>";
|
|
6655
|
-
for(var i=0;i<tgtStats.length;i++){
|
|
6656
|
-
var t=tgtStats[i],g=t.p+t.f,pr=g>0?t.p/g:0,avg=t.sc>0?t.ts/t.sc:0;
|
|
6657
|
-
h+="<tr><td class=\\"fw-medium\\">"+esc(t.target)+"</td><td>"+fmtPct(pr)+'</td><td><div class="bar-bg"><div class="bar-fill '+sCls(pr)+'" style="width:'+(pr*100)+'%"></div></div></td>';
|
|
6658
|
-
h+='<td class="text-pass">'+t.p+'</td><td class="text-fail">'+t.f+'</td><td class="text-error">'+t.e+"</td>";
|
|
6659
|
-
h+='<td class="'+sCls(avg)+'">'+fmtPct(avg)+"</td><td>"+fmtDur(t.dur)+"</td><td>"+fmtTok(t.tok)+"</td><td>"+fmtCost(t.cost)+"</td></tr>";
|
|
6660
|
-
}
|
|
6661
|
-
h+="</tbody></table></div></div>";
|
|
6437
|
+
});
|
|
6438
|
+
app2.get("/api/runs/:filename/categories/:category/datasets", (c3) => {
|
|
6439
|
+
const filename = c3.req.param("filename");
|
|
6440
|
+
const category = decodeURIComponent(c3.req.param("category"));
|
|
6441
|
+
const metas = listResultFiles(searchDir);
|
|
6442
|
+
const meta = metas.find((m) => m.filename === filename);
|
|
6443
|
+
if (!meta) {
|
|
6444
|
+
return c3.json({ error: "Run not found" }, 404);
|
|
6662
6445
|
}
|
|
6663
|
-
|
|
6664
|
-
|
|
6665
|
-
|
|
6666
|
-
|
|
6667
|
-
for(
|
|
6668
|
-
|
|
6669
|
-
|
|
6670
|
-
|
|
6671
|
-
|
|
6672
|
-
|
|
6673
|
-
|
|
6446
|
+
try {
|
|
6447
|
+
const loaded = patchTestIds(loadManifestResults(meta.path));
|
|
6448
|
+
const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category);
|
|
6449
|
+
const datasetMap = /* @__PURE__ */ new Map();
|
|
6450
|
+
for (const r of filtered) {
|
|
6451
|
+
const ds = r.dataset ?? r.target ?? "default";
|
|
6452
|
+
const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
|
|
6453
|
+
entry.total++;
|
|
6454
|
+
if (r.score >= 1) entry.passed++;
|
|
6455
|
+
entry.scoreSum += r.score;
|
|
6456
|
+
datasetMap.set(ds, entry);
|
|
6457
|
+
}
|
|
6458
|
+
const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
|
|
6459
|
+
name,
|
|
6460
|
+
total: entry.total,
|
|
6461
|
+
passed: entry.passed,
|
|
6462
|
+
failed: entry.total - entry.passed,
|
|
6463
|
+
avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0
|
|
6464
|
+
}));
|
|
6465
|
+
return c3.json({ datasets });
|
|
6466
|
+
} catch {
|
|
6467
|
+
return c3.json({ error: "Failed to load datasets" }, 500);
|
|
6468
|
+
}
|
|
6469
|
+
});
|
|
6470
|
+
app2.get("/api/runs/:filename/evals/:evalId", (c3) => {
|
|
6471
|
+
const filename = c3.req.param("filename");
|
|
6472
|
+
const evalId = c3.req.param("evalId");
|
|
6473
|
+
const metas = listResultFiles(searchDir);
|
|
6474
|
+
const meta = metas.find((m) => m.filename === filename);
|
|
6475
|
+
if (!meta) {
|
|
6476
|
+
return c3.json({ error: "Run not found" }, 404);
|
|
6477
|
+
}
|
|
6478
|
+
try {
|
|
6479
|
+
const loaded = patchTestIds(loadManifestResults(meta.path));
|
|
6480
|
+
const result = loaded.find((r) => r.testId === evalId);
|
|
6481
|
+
if (!result) {
|
|
6482
|
+
return c3.json({ error: "Eval not found" }, 404);
|
|
6674
6483
|
}
|
|
6675
|
-
|
|
6484
|
+
return c3.json({ eval: result });
|
|
6485
|
+
} catch {
|
|
6486
|
+
return c3.json({ error: "Failed to load eval" }, 500);
|
|
6676
6487
|
}
|
|
6677
|
-
|
|
6678
|
-
|
|
6679
|
-
|
|
6680
|
-
|
|
6681
|
-
|
|
6682
|
-
|
|
6683
|
-
|
|
6684
|
-
|
|
6685
|
-
|
|
6686
|
-
|
|
6687
|
-
|
|
6688
|
-
|
|
6689
|
-
|
|
6690
|
-
|
|
6691
|
-
|
|
6692
|
-
|
|
6693
|
-
|
|
6694
|
-
|
|
6695
|
-
|
|
6696
|
-
|
|
6697
|
-
|
|
6698
|
-
|
|
6699
|
-
|
|
6700
|
-
|
|
6701
|
-
|
|
6702
|
-
|
|
6703
|
-
|
|
6704
|
-
|
|
6705
|
-
|
|
6706
|
-
|
|
6707
|
-
|
|
6708
|
-
|
|
6709
|
-
|
|
6710
|
-
|
|
6711
|
-
|
|
6712
|
-
|
|
6713
|
-
|
|
6714
|
-
|
|
6715
|
-
|
|
6716
|
-
|
|
6717
|
-
|
|
6718
|
-
renderTests();
|
|
6719
|
-
};})(ths[i]));
|
|
6720
|
-
}
|
|
6721
|
-
renderRows();
|
|
6722
|
-
}
|
|
6723
|
-
|
|
6724
|
-
function sHdr(label,col){
|
|
6725
|
-
var arrow="";
|
|
6726
|
-
if(state.sort.col===col)arrow=state.sort.dir==="asc"?" \\u2191":" \\u2193";
|
|
6727
|
-
return'<th class="sortable" data-sort="'+col+'">'+label+arrow+"</th>";
|
|
6728
|
-
}
|
|
6729
|
-
|
|
6730
|
-
function filtered(){
|
|
6731
|
-
var out=[];
|
|
6732
|
-
for(var i=0;i<DATA.length;i++){
|
|
6733
|
-
var r=DATA[i],s=getStatus(r);
|
|
6734
|
-
if(state.filter.status!=="all"&&s!==state.filter.status)continue;
|
|
6735
|
-
if(state.filter.target!=="all"&&r.target!==state.filter.target)continue;
|
|
6736
|
-
if(state.filter.search&&(r.testId||"").toLowerCase().indexOf(state.filter.search.toLowerCase())===-1)continue;
|
|
6737
|
-
out.push(r);
|
|
6738
|
-
}
|
|
6739
|
-
var col=state.sort.col,dir=state.sort.dir==="asc"?1:-1;
|
|
6740
|
-
out.sort(function(a,b){
|
|
6741
|
-
var va=col==="status"?getStatus(a):a[col],vb=col==="status"?getStatus(b):b[col];
|
|
6742
|
-
if(va==null&&vb==null)return 0;if(va==null)return 1;if(vb==null)return-1;
|
|
6743
|
-
if(typeof va==="string")return va.localeCompare(vb)*dir;
|
|
6744
|
-
return(va-vb)*dir;
|
|
6488
|
+
});
|
|
6489
|
+
app2.get("/api/index", (c3) => {
|
|
6490
|
+
const metas = listResultFiles(searchDir);
|
|
6491
|
+
const entries2 = metas.map((m) => {
|
|
6492
|
+
let totalCostUsd = 0;
|
|
6493
|
+
try {
|
|
6494
|
+
const loaded = patchTestIds(loadManifestResults(m.path));
|
|
6495
|
+
totalCostUsd = loaded.reduce((sum, r) => sum + (r.costUsd ?? 0), 0);
|
|
6496
|
+
} catch {
|
|
6497
|
+
}
|
|
6498
|
+
return {
|
|
6499
|
+
run_filename: m.filename,
|
|
6500
|
+
test_count: m.testCount,
|
|
6501
|
+
pass_rate: m.passRate,
|
|
6502
|
+
avg_score: m.avgScore,
|
|
6503
|
+
total_cost_usd: totalCostUsd,
|
|
6504
|
+
timestamp: m.timestamp
|
|
6505
|
+
};
|
|
6506
|
+
});
|
|
6507
|
+
return c3.json({ entries: entries2 });
|
|
6508
|
+
});
|
|
6509
|
+
function buildFileTree(dirPath, relativeTo) {
|
|
6510
|
+
if (!existsSync7(dirPath) || !statSync4(dirPath).isDirectory()) {
|
|
6511
|
+
return [];
|
|
6512
|
+
}
|
|
6513
|
+
const entries2 = readdirSync3(dirPath, { withFileTypes: true });
|
|
6514
|
+
return entries2.sort((a, b) => {
|
|
6515
|
+
if (a.isDirectory() !== b.isDirectory()) return a.isDirectory() ? -1 : 1;
|
|
6516
|
+
return a.name.localeCompare(b.name);
|
|
6517
|
+
}).map((entry) => {
|
|
6518
|
+
const fullPath = path9.join(dirPath, entry.name);
|
|
6519
|
+
const relPath = path9.relative(relativeTo, fullPath);
|
|
6520
|
+
if (entry.isDirectory()) {
|
|
6521
|
+
return {
|
|
6522
|
+
name: entry.name,
|
|
6523
|
+
path: relPath,
|
|
6524
|
+
type: "dir",
|
|
6525
|
+
children: buildFileTree(fullPath, relativeTo)
|
|
6526
|
+
};
|
|
6527
|
+
}
|
|
6528
|
+
return { name: entry.name, path: relPath, type: "file" };
|
|
6745
6529
|
});
|
|
6746
|
-
return out;
|
|
6747
6530
|
}
|
|
6748
|
-
|
|
6749
|
-
|
|
6750
|
-
|
|
6751
|
-
|
|
6752
|
-
|
|
6753
|
-
|
|
6754
|
-
|
|
6755
|
-
|
|
6756
|
-
|
|
6757
|
-
|
|
6758
|
-
|
|
6759
|
-
|
|
6760
|
-
|
|
6761
|
-
|
|
6762
|
-
|
|
6763
|
-
|
|
6764
|
-
|
|
6765
|
-
|
|
6766
|
-
|
|
6767
|
-
|
|
6768
|
-
|
|
6769
|
-
|
|
6770
|
-
|
|
6771
|
-
|
|
6772
|
-
|
|
6773
|
-
|
|
6774
|
-
|
|
6775
|
-
|
|
6776
|
-
|
|
6777
|
-
|
|
6778
|
-
|
|
6779
|
-
|
|
6780
|
-
|
|
6531
|
+
function inferLanguage(filePath) {
|
|
6532
|
+
const ext = path9.extname(filePath).toLowerCase();
|
|
6533
|
+
const langMap = {
|
|
6534
|
+
".json": "json",
|
|
6535
|
+
".jsonl": "json",
|
|
6536
|
+
".ts": "typescript",
|
|
6537
|
+
".tsx": "typescript",
|
|
6538
|
+
".js": "javascript",
|
|
6539
|
+
".jsx": "javascript",
|
|
6540
|
+
".md": "markdown",
|
|
6541
|
+
".yaml": "yaml",
|
|
6542
|
+
".yml": "yaml",
|
|
6543
|
+
".log": "plaintext",
|
|
6544
|
+
".txt": "plaintext",
|
|
6545
|
+
".py": "python",
|
|
6546
|
+
".sh": "shell",
|
|
6547
|
+
".bash": "shell",
|
|
6548
|
+
".css": "css",
|
|
6549
|
+
".html": "html",
|
|
6550
|
+
".xml": "xml",
|
|
6551
|
+
".svg": "xml",
|
|
6552
|
+
".toml": "toml",
|
|
6553
|
+
".diff": "diff",
|
|
6554
|
+
".patch": "diff"
|
|
6555
|
+
};
|
|
6556
|
+
return langMap[ext] ?? "plaintext";
|
|
6557
|
+
}
|
|
6558
|
+
app2.get("/api/runs/:filename/evals/:evalId/files", (c3) => {
|
|
6559
|
+
const filename = c3.req.param("filename");
|
|
6560
|
+
const evalId = c3.req.param("evalId");
|
|
6561
|
+
const metas = listResultFiles(searchDir);
|
|
6562
|
+
const meta = metas.find((m) => m.filename === filename);
|
|
6563
|
+
if (!meta) {
|
|
6564
|
+
return c3.json({ error: "Run not found" }, 404);
|
|
6781
6565
|
}
|
|
6782
|
-
|
|
6783
|
-
|
|
6784
|
-
|
|
6785
|
-
|
|
6786
|
-
|
|
6787
|
-
|
|
6788
|
-
|
|
6789
|
-
|
|
6790
|
-
|
|
6791
|
-
|
|
6792
|
-
|
|
6793
|
-
|
|
6566
|
+
try {
|
|
6567
|
+
const content = readFileSync8(meta.path, "utf8");
|
|
6568
|
+
const records = parseResultManifest(content);
|
|
6569
|
+
const record = records.find((r) => (r.test_id ?? r.eval_id) === evalId);
|
|
6570
|
+
if (!record) {
|
|
6571
|
+
return c3.json({ error: "Eval not found" }, 404);
|
|
6572
|
+
}
|
|
6573
|
+
const baseDir = path9.dirname(meta.path);
|
|
6574
|
+
const knownPaths = [
|
|
6575
|
+
record.grading_path,
|
|
6576
|
+
record.timing_path,
|
|
6577
|
+
record.input_path,
|
|
6578
|
+
record.output_path,
|
|
6579
|
+
record.response_path
|
|
6580
|
+
].filter((p) => !!p);
|
|
6581
|
+
if (knownPaths.length === 0) {
|
|
6582
|
+
return c3.json({ files: [] });
|
|
6583
|
+
}
|
|
6584
|
+
const artifactDirs = knownPaths.map((p) => path9.dirname(p));
|
|
6585
|
+
let commonDir = artifactDirs[0];
|
|
6586
|
+
for (const dir of artifactDirs) {
|
|
6587
|
+
while (!dir.startsWith(commonDir)) {
|
|
6588
|
+
commonDir = path9.dirname(commonDir);
|
|
6589
|
+
}
|
|
6590
|
+
}
|
|
6591
|
+
const artifactAbsDir = path9.join(baseDir, commonDir);
|
|
6592
|
+
const files = buildFileTree(artifactAbsDir, baseDir);
|
|
6593
|
+
return c3.json({ files });
|
|
6594
|
+
} catch {
|
|
6595
|
+
return c3.json({ error: "Failed to load file tree" }, 500);
|
|
6794
6596
|
}
|
|
6795
|
-
|
|
6796
|
-
|
|
6797
|
-
|
|
6798
|
-
|
|
6799
|
-
|
|
6597
|
+
});
|
|
6598
|
+
app2.get("/api/runs/:filename/evals/:evalId/files/*", (c3) => {
|
|
6599
|
+
const filename = c3.req.param("filename");
|
|
6600
|
+
const evalId = c3.req.param("evalId");
|
|
6601
|
+
const metas = listResultFiles(searchDir);
|
|
6602
|
+
const meta = metas.find((m) => m.filename === filename);
|
|
6603
|
+
if (!meta) {
|
|
6604
|
+
return c3.json({ error: "Run not found" }, 404);
|
|
6800
6605
|
}
|
|
6801
|
-
|
|
6802
|
-
|
|
6803
|
-
|
|
6804
|
-
|
|
6805
|
-
|
|
6806
|
-
function renderDetail(r){
|
|
6807
|
-
var h='<div class="detail-panel">';
|
|
6808
|
-
|
|
6809
|
-
/* input / output */
|
|
6810
|
-
h+='<div class="detail-grid">';
|
|
6811
|
-
if(r.input!=null){
|
|
6812
|
-
h+='<div class="detail-block"><h4>Input</h4><pre class="detail-pre">'+esc(JSON.stringify(r.input,null,2))+"</pre></div>";
|
|
6606
|
+
const requestPath = c3.req.path;
|
|
6607
|
+
const prefix = `/api/runs/${filename}/evals/${evalId}/files/`;
|
|
6608
|
+
const filePath = requestPath.slice(prefix.length);
|
|
6609
|
+
if (!filePath) {
|
|
6610
|
+
return c3.json({ error: "No file path specified" }, 400);
|
|
6813
6611
|
}
|
|
6814
|
-
|
|
6815
|
-
|
|
6816
|
-
|
|
6817
|
-
|
|
6818
|
-
if(r.scores&&r.scores.length>0){
|
|
6819
|
-
h+="<h4>Evaluator Results</h4>";
|
|
6820
|
-
h+='<table class="eval-table"><thead><tr><th>Evaluator</th><th>Score</th><th>Status</th><th>Assertions</th></tr></thead><tbody>';
|
|
6821
|
-
for(var i=0;i<r.scores.length;i++){
|
|
6822
|
-
var ev=r.scores[i],evS=ev.score>=0.5?"pass":"fail";
|
|
6823
|
-
var evAssertions=ev.assertions||[];
|
|
6824
|
-
var evSummary=evAssertions.map(function(a){return (a.passed?"\\u2713 ":"\\u2717 ")+a.text;}).join("; ");
|
|
6825
|
-
h+="<tr><td class=\\"fw-medium\\">"+esc(ev.name)+'</td><td class="'+sCls(ev.score)+'">'+fmtPct(ev.score)+"</td><td>"+sIcon(evS)+'</td><td class="reasoning-cell">'+esc(evSummary)+"</td></tr>";
|
|
6826
|
-
}
|
|
6827
|
-
h+="</tbody></table>";
|
|
6612
|
+
const baseDir = path9.dirname(meta.path);
|
|
6613
|
+
const absolutePath = path9.resolve(baseDir, filePath);
|
|
6614
|
+
if (!absolutePath.startsWith(path9.resolve(baseDir) + path9.sep) && absolutePath !== path9.resolve(baseDir)) {
|
|
6615
|
+
return c3.json({ error: "Path traversal not allowed" }, 403);
|
|
6828
6616
|
}
|
|
6829
|
-
|
|
6830
|
-
|
|
6831
|
-
var passedA=r.assertions?r.assertions.filter(function(a){return a.passed;}):[];
|
|
6832
|
-
var failedA=r.assertions?r.assertions.filter(function(a){return !a.passed;}):[];
|
|
6833
|
-
if(passedA.length>0){
|
|
6834
|
-
h+='<h4>Passed Assertions</h4><ul class="expect-list pass">';
|
|
6835
|
-
for(var i=0;i<passedA.length;i++)h+="<li>"+esc(passedA[i].text)+(passedA[i].evidence?" <span class=\\"reasoning-cell\\">("+esc(passedA[i].evidence)+")</span>":"")+"</li>";
|
|
6836
|
-
h+="</ul>";
|
|
6837
|
-
}
|
|
6838
|
-
if(failedA.length>0){
|
|
6839
|
-
h+='<h4>Failed Assertions</h4><ul class="expect-list fail">';
|
|
6840
|
-
for(var i=0;i<failedA.length;i++)h+="<li>"+esc(failedA[i].text)+(failedA[i].evidence?" <span class=\\"reasoning-cell\\">("+esc(failedA[i].evidence)+")</span>":"")+"</li>";
|
|
6841
|
-
h+="</ul>";
|
|
6617
|
+
if (!existsSync7(absolutePath) || !statSync4(absolutePath).isFile()) {
|
|
6618
|
+
return c3.json({ error: "File not found" }, 404);
|
|
6842
6619
|
}
|
|
6843
|
-
|
|
6844
|
-
|
|
6845
|
-
|
|
6846
|
-
|
|
6847
|
-
|
|
6848
|
-
|
|
6849
|
-
h+='<h4>Tool Calls</h4><div class="tool-calls">';
|
|
6850
|
-
for(var i=0;i<tcArr.length;i++)h+='<span class="tool-tag">'+esc(tcArr[i].name)+": "+tcArr[i].count+"</span>";
|
|
6851
|
-
h+="</div>";
|
|
6620
|
+
try {
|
|
6621
|
+
const fileContent = readFileSync8(absolutePath, "utf8");
|
|
6622
|
+
const language = inferLanguage(absolutePath);
|
|
6623
|
+
return c3.json({ content: fileContent, language });
|
|
6624
|
+
} catch {
|
|
6625
|
+
return c3.json({ error: "Failed to read file" }, 500);
|
|
6852
6626
|
}
|
|
6853
|
-
|
|
6854
|
-
|
|
6855
|
-
|
|
6856
|
-
|
|
6857
|
-
|
|
6858
|
-
|
|
6859
|
-
|
|
6860
|
-
|
|
6861
|
-
|
|
6862
|
-
|
|
6863
|
-
|
|
6864
|
-
|
|
6865
|
-
|
|
6866
|
-
|
|
6867
|
-
|
|
6868
|
-
|
|
6869
|
-
|
|
6870
|
-
|
|
6871
|
-
|
|
6872
|
-
|
|
6873
|
-
|
|
6874
|
-
|
|
6875
|
-
|
|
6876
|
-
|
|
6877
|
-
var existingComment=feedbackCache[tid]||"";
|
|
6878
|
-
h+='<div class="feedback-section">';
|
|
6879
|
-
h+='<h4>Feedback</h4>';
|
|
6880
|
-
h+='<textarea class="feedback-input" data-test-id="'+esc(tid)+'" placeholder="Add feedback for this test..." onclick="event.stopPropagation()">'+esc(existingComment)+'</textarea>';
|
|
6881
|
-
h+='<div style="display:flex;align-items:center">';
|
|
6882
|
-
h+='<button class="feedback-submit" data-test-id="'+esc(tid)+'">Save Feedback</button>';
|
|
6883
|
-
h+='<span class="feedback-status"></span>';
|
|
6884
|
-
h+='</div></div>';
|
|
6885
|
-
|
|
6886
|
-
h+="</div>";
|
|
6887
|
-
return h;
|
|
6888
|
-
}
|
|
6889
|
-
|
|
6890
|
-
/* ---- run picker ---- */
|
|
6891
|
-
var runPicker=document.getElementById("run-picker");
|
|
6892
|
-
var knownRunFilenames=[];
|
|
6893
|
-
|
|
6894
|
-
function refreshRunList(){
|
|
6895
|
-
fetch("/api/runs").then(function(r){return r.json();}).then(function(d){
|
|
6896
|
-
if(!d||!d.runs)return;
|
|
6897
|
-
var runs=d.runs;
|
|
6898
|
-
var newFilenames=runs.map(function(r){return r.filename;});
|
|
6899
|
-
|
|
6900
|
-
/* Detect new runs that appeared since last poll */
|
|
6901
|
-
if(knownRunFilenames.length>0){
|
|
6902
|
-
var hasNew=newFilenames.some(function(f){return knownRunFilenames.indexOf(f)===-1;});
|
|
6903
|
-
if(hasNew&&DATA.length===0){
|
|
6904
|
-
/* Auto-load the first (most recent) run when starting from empty state */
|
|
6905
|
-
loadRun(runs[0].filename);
|
|
6627
|
+
});
|
|
6628
|
+
app2.get("/api/experiments", (c3) => {
|
|
6629
|
+
const metas = listResultFiles(searchDir);
|
|
6630
|
+
const experimentMap = /* @__PURE__ */ new Map();
|
|
6631
|
+
for (const m of metas) {
|
|
6632
|
+
try {
|
|
6633
|
+
const records = loadLightweightResults(m.path);
|
|
6634
|
+
for (const r of records) {
|
|
6635
|
+
const experiment = r.experiment ?? "default";
|
|
6636
|
+
const entry = experimentMap.get(experiment) ?? {
|
|
6637
|
+
targets: /* @__PURE__ */ new Set(),
|
|
6638
|
+
runFilenames: /* @__PURE__ */ new Set(),
|
|
6639
|
+
evalCount: 0,
|
|
6640
|
+
passedCount: 0,
|
|
6641
|
+
lastTimestamp: ""
|
|
6642
|
+
};
|
|
6643
|
+
entry.runFilenames.add(m.filename);
|
|
6644
|
+
if (r.target) entry.targets.add(r.target);
|
|
6645
|
+
entry.evalCount++;
|
|
6646
|
+
if (r.score >= 1) entry.passedCount++;
|
|
6647
|
+
if (r.timestamp && r.timestamp > entry.lastTimestamp) {
|
|
6648
|
+
entry.lastTimestamp = r.timestamp;
|
|
6649
|
+
}
|
|
6650
|
+
experimentMap.set(experiment, entry);
|
|
6906
6651
|
}
|
|
6652
|
+
} catch {
|
|
6907
6653
|
}
|
|
6908
|
-
|
|
6909
|
-
|
|
6910
|
-
|
|
6911
|
-
|
|
6912
|
-
|
|
6913
|
-
|
|
6914
|
-
|
|
6915
|
-
|
|
6916
|
-
|
|
6917
|
-
|
|
6918
|
-
|
|
6654
|
+
}
|
|
6655
|
+
const experiments = [...experimentMap.entries()].map(([name, entry]) => ({
|
|
6656
|
+
name,
|
|
6657
|
+
run_count: entry.runFilenames.size,
|
|
6658
|
+
target_count: entry.targets.size,
|
|
6659
|
+
eval_count: entry.evalCount,
|
|
6660
|
+
passed_count: entry.passedCount,
|
|
6661
|
+
pass_rate: entry.evalCount > 0 ? entry.passedCount / entry.evalCount : 0,
|
|
6662
|
+
last_run: entry.lastTimestamp || null
|
|
6663
|
+
}));
|
|
6664
|
+
return c3.json({ experiments });
|
|
6665
|
+
});
|
|
6666
|
+
app2.get("/api/targets", (c3) => {
|
|
6667
|
+
const metas = listResultFiles(searchDir);
|
|
6668
|
+
const targetMap = /* @__PURE__ */ new Map();
|
|
6669
|
+
for (const m of metas) {
|
|
6670
|
+
try {
|
|
6671
|
+
const records = loadLightweightResults(m.path);
|
|
6672
|
+
for (const r of records) {
|
|
6673
|
+
const target = r.target ?? "default";
|
|
6674
|
+
const entry = targetMap.get(target) ?? {
|
|
6675
|
+
experiments: /* @__PURE__ */ new Set(),
|
|
6676
|
+
runFilenames: /* @__PURE__ */ new Set(),
|
|
6677
|
+
evalCount: 0,
|
|
6678
|
+
passedCount: 0
|
|
6679
|
+
};
|
|
6680
|
+
entry.runFilenames.add(m.filename);
|
|
6681
|
+
if (r.experiment) entry.experiments.add(r.experiment);
|
|
6682
|
+
entry.evalCount++;
|
|
6683
|
+
if (r.score >= 1) entry.passedCount++;
|
|
6684
|
+
targetMap.set(target, entry);
|
|
6685
|
+
}
|
|
6686
|
+
} catch {
|
|
6919
6687
|
}
|
|
6920
|
-
|
|
6921
|
-
|
|
6922
|
-
|
|
6923
|
-
|
|
6688
|
+
}
|
|
6689
|
+
const targets = [...targetMap.entries()].map(([name, entry]) => ({
|
|
6690
|
+
name,
|
|
6691
|
+
run_count: entry.runFilenames.size,
|
|
6692
|
+
experiment_count: entry.experiments.size,
|
|
6693
|
+
eval_count: entry.evalCount,
|
|
6694
|
+
passed_count: entry.passedCount,
|
|
6695
|
+
pass_rate: entry.evalCount > 0 ? entry.passedCount / entry.evalCount : 0
|
|
6696
|
+
}));
|
|
6697
|
+
return c3.json({ targets });
|
|
6698
|
+
});
|
|
6699
|
+
app2.get("/assets/*", (c3) => {
|
|
6700
|
+
const assetPath = c3.req.path;
|
|
6701
|
+
const filePath = path9.join(studioDistPath, assetPath);
|
|
6702
|
+
if (!existsSync7(filePath)) {
|
|
6703
|
+
return c3.notFound();
|
|
6704
|
+
}
|
|
6705
|
+
const content = readFileSync8(filePath);
|
|
6706
|
+
const ext = path9.extname(filePath);
|
|
6707
|
+
const mimeTypes = {
|
|
6708
|
+
".js": "application/javascript",
|
|
6709
|
+
".css": "text/css",
|
|
6710
|
+
".html": "text/html",
|
|
6711
|
+
".json": "application/json",
|
|
6712
|
+
".svg": "image/svg+xml",
|
|
6713
|
+
".png": "image/png",
|
|
6714
|
+
".woff2": "font/woff2",
|
|
6715
|
+
".woff": "font/woff"
|
|
6716
|
+
};
|
|
6717
|
+
const contentType = mimeTypes[ext] ?? "application/octet-stream";
|
|
6718
|
+
return new Response(content, {
|
|
6719
|
+
headers: {
|
|
6720
|
+
"Content-Type": contentType,
|
|
6721
|
+
"Cache-Control": "public, max-age=31536000, immutable"
|
|
6924
6722
|
}
|
|
6925
|
-
})
|
|
6926
|
-
}
|
|
6927
|
-
|
|
6928
|
-
|
|
6929
|
-
|
|
6930
|
-
|
|
6931
|
-
|
|
6932
|
-
|
|
6933
|
-
|
|
6934
|
-
|
|
6935
|
-
|
|
6936
|
-
|
|
6937
|
-
|
|
6938
|
-
|
|
6939
|
-
|
|
6940
|
-
|
|
6941
|
-
|
|
6723
|
+
});
|
|
6724
|
+
});
|
|
6725
|
+
app2.get("*", (c3) => {
|
|
6726
|
+
if (c3.req.path.startsWith("/api/")) {
|
|
6727
|
+
return c3.json({ error: "Not found" }, 404);
|
|
6728
|
+
}
|
|
6729
|
+
const indexPath = path9.join(studioDistPath, "index.html");
|
|
6730
|
+
if (existsSync7(indexPath)) {
|
|
6731
|
+
return c3.html(readFileSync8(indexPath, "utf8"));
|
|
6732
|
+
}
|
|
6733
|
+
return c3.notFound();
|
|
6734
|
+
});
|
|
6735
|
+
return app2;
|
|
6736
|
+
}
|
|
6737
|
+
function resolveStudioDistDir() {
|
|
6738
|
+
const currentDir = typeof __dirname !== "undefined" ? __dirname : path9.dirname(fileURLToPath2(import.meta.url));
|
|
6739
|
+
const candidates = [
|
|
6740
|
+
// From src/commands/results/ → sibling apps/studio/dist
|
|
6741
|
+
path9.resolve(currentDir, "../../../../studio/dist"),
|
|
6742
|
+
// From dist/ → sibling apps/studio/dist (monorepo dev)
|
|
6743
|
+
path9.resolve(currentDir, "../../studio/dist"),
|
|
6744
|
+
// Bundled inside CLI dist (published package)
|
|
6745
|
+
path9.resolve(currentDir, "../studio"),
|
|
6746
|
+
// From dist/ in monorepo root context
|
|
6747
|
+
path9.resolve(currentDir, "../../../apps/studio/dist")
|
|
6748
|
+
];
|
|
6749
|
+
for (const candidate of candidates) {
|
|
6750
|
+
if (existsSync7(candidate) && existsSync7(path9.join(candidate, "index.html"))) {
|
|
6751
|
+
return candidate;
|
|
6752
|
+
}
|
|
6942
6753
|
}
|
|
6943
|
-
|
|
6944
|
-
|
|
6945
|
-
|
|
6946
|
-
|
|
6754
|
+
return void 0;
|
|
6755
|
+
}
|
|
6756
|
+
function stripHeavyFields(results) {
|
|
6757
|
+
return results.map((r) => {
|
|
6758
|
+
const { requests, trace, ...rest } = r;
|
|
6759
|
+
const toolCalls = trace?.toolCalls && Object.keys(trace.toolCalls).length > 0 ? trace.toolCalls : void 0;
|
|
6760
|
+
const graderDurationMs = (r.scores ?? []).reduce((sum, s) => sum + (s.durationMs ?? 0), 0);
|
|
6761
|
+
return {
|
|
6762
|
+
...rest,
|
|
6763
|
+
...toolCalls && { _toolCalls: toolCalls },
|
|
6764
|
+
...graderDurationMs > 0 && { _graderDurationMs: graderDurationMs }
|
|
6765
|
+
};
|
|
6947
6766
|
});
|
|
6948
|
-
|
|
6949
|
-
/* Poll for new result files every 5 seconds */
|
|
6950
|
-
refreshRunList();
|
|
6951
|
-
setInterval(refreshRunList,5000);
|
|
6952
|
-
|
|
6953
|
-
/* ---- init ---- */
|
|
6954
|
-
loadFeedback();
|
|
6955
|
-
render();
|
|
6956
|
-
})();
|
|
6957
|
-
`;
|
|
6767
|
+
}
|
|
6958
6768
|
var resultsServeCommand = command({
|
|
6959
|
-
name: "
|
|
6960
|
-
description: "Start a local
|
|
6769
|
+
name: "studio",
|
|
6770
|
+
description: "Start AgentV Studio \u2014 a local dashboard for reviewing evaluation results",
|
|
6961
6771
|
args: {
|
|
6962
6772
|
source: positional({
|
|
6963
6773
|
type: optional(string),
|
|
@@ -7594,7 +7404,7 @@ function formatResultDetail(result, index, tree) {
|
|
|
7594
7404
|
}
|
|
7595
7405
|
const scoreColor = result.score >= 0.9 ? c2.green : result.score >= 0.5 ? c2.yellow : c2.red;
|
|
7596
7406
|
lines.push(
|
|
7597
|
-
`${c2.bold}${testId}${c2.reset} ${scoreColor}${formatScore(result.score)}${c2.reset}${result.target ? ` ${c2.dim}target: ${result.target}${c2.reset}` : ""}${result.
|
|
7407
|
+
`${c2.bold}${testId}${c2.reset} ${scoreColor}${formatScore(result.score)}${c2.reset}${result.target ? ` ${c2.dim}target: ${result.target}${c2.reset}` : ""}${result.dataset ? ` ${c2.dim}dataset: ${result.dataset}${c2.reset}` : ""}`
|
|
7598
7408
|
);
|
|
7599
7409
|
if (result.error) {
|
|
7600
7410
|
lines.push(` ${c2.red}Error: ${result.error}${c2.reset}`);
|
|
@@ -7768,8 +7578,8 @@ function groupResults(results, groupBy2) {
|
|
|
7768
7578
|
case "target":
|
|
7769
7579
|
key = result.target ?? "unknown";
|
|
7770
7580
|
break;
|
|
7771
|
-
case "
|
|
7772
|
-
key = result.
|
|
7581
|
+
case "dataset":
|
|
7582
|
+
key = result.dataset ?? "unknown";
|
|
7773
7583
|
break;
|
|
7774
7584
|
case "test-id":
|
|
7775
7585
|
key = result.test_id ?? result.eval_id ?? "unknown";
|
|
@@ -8482,7 +8292,7 @@ var app = subcommands({
|
|
|
8482
8292
|
pipeline: pipelineCommand,
|
|
8483
8293
|
results: resultsCommand,
|
|
8484
8294
|
self: selfCommand,
|
|
8485
|
-
|
|
8295
|
+
studio: resultsServeCommand,
|
|
8486
8296
|
trace: traceCommand,
|
|
8487
8297
|
transpile: transpileCommand,
|
|
8488
8298
|
trim: trimCommand,
|
|
@@ -8500,7 +8310,7 @@ var TOP_LEVEL_COMMANDS = /* @__PURE__ */ new Set([
|
|
|
8500
8310
|
"pipeline",
|
|
8501
8311
|
"results",
|
|
8502
8312
|
"self",
|
|
8503
|
-
"
|
|
8313
|
+
"studio",
|
|
8504
8314
|
"trace",
|
|
8505
8315
|
"transpile",
|
|
8506
8316
|
"trim",
|
|
@@ -8547,4 +8357,4 @@ export {
|
|
|
8547
8357
|
preprocessArgv,
|
|
8548
8358
|
runCli
|
|
8549
8359
|
};
|
|
8550
|
-
//# sourceMappingURL=chunk-
|
|
8360
|
+
//# sourceMappingURL=chunk-2W5JKKXC.js.map
|