agentv 3.14.6 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-CQRWNXVG.js → chunk-E3VSJJI4.js} +534 -62
- package/dist/chunk-E3VSJJI4.js.map +1 -0
- package/dist/{chunk-Y25VL7PX.js → chunk-OT2J474N.js} +43 -17
- package/dist/chunk-OT2J474N.js.map +1 -0
- package/dist/{chunk-ELQEFMGO.js → chunk-OXBBWZOY.js} +592 -295
- package/dist/chunk-OXBBWZOY.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-5EEXTTC3.js → dist-3Z22B6SU.js} +18 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-5ESM5DWV.js → interactive-D5UTP72M.js} +4 -11
- package/dist/interactive-D5UTP72M.js.map +1 -0
- package/dist/studio/assets/index-BuKVkxFj.css +1 -0
- package/dist/studio/assets/index-CE3-mmv0.js +11 -0
- package/dist/studio/assets/index-DBU720Fm.js +71 -0
- package/dist/studio/index.html +13 -0
- package/package.json +1 -1
- package/dist/chunk-CQRWNXVG.js.map +0 -1
- package/dist/chunk-ELQEFMGO.js.map +0 -1
- package/dist/chunk-Y25VL7PX.js.map +0 -1
- package/dist/interactive-5ESM5DWV.js.map +0 -1
- /package/dist/{dist-5EEXTTC3.js.map → dist-3Z22B6SU.js.map} +0 -0
|
@@ -10,6 +10,7 @@ import {
|
|
|
10
10
|
loadManifestResults,
|
|
11
11
|
loadRunCache,
|
|
12
12
|
package_default,
|
|
13
|
+
parseResultManifest,
|
|
13
14
|
resolveEvalPaths,
|
|
14
15
|
resolveExistingRunPrimaryPath,
|
|
15
16
|
resolveResultSourcePath,
|
|
@@ -23,9 +24,11 @@ import {
|
|
|
23
24
|
validateFileReferences,
|
|
24
25
|
validateTargetsFile,
|
|
25
26
|
writeArtifactsFromResults
|
|
26
|
-
} from "./chunk-
|
|
27
|
+
} from "./chunk-OT2J474N.js";
|
|
27
28
|
import {
|
|
29
|
+
DEFAULT_CATEGORY,
|
|
28
30
|
createBuiltinRegistry,
|
|
31
|
+
deriveCategory,
|
|
29
32
|
executeScript,
|
|
30
33
|
getAgentvHome,
|
|
31
34
|
getOutputFilenames,
|
|
@@ -40,7 +43,7 @@ import {
|
|
|
40
43
|
toSnakeCaseDeep as toSnakeCaseDeep2,
|
|
41
44
|
transpileEvalYamlFile,
|
|
42
45
|
trimBaselineResult
|
|
43
|
-
} from "./chunk-
|
|
46
|
+
} from "./chunk-OXBBWZOY.js";
|
|
44
47
|
import {
|
|
45
48
|
__commonJS,
|
|
46
49
|
__esm,
|
|
@@ -3479,9 +3482,23 @@ var ASSERTION_TEMPLATES = {
|
|
|
3479
3482
|
default: `#!/usr/bin/env bun
|
|
3480
3483
|
import { defineAssertion } from '@agentv/eval';
|
|
3481
3484
|
|
|
3482
|
-
|
|
3485
|
+
/** Extract text from the last message with the given role. */
|
|
3486
|
+
function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string {
|
|
3487
|
+
for (let i = messages.length - 1; i >= 0; i--) {
|
|
3488
|
+
const msg = messages[i];
|
|
3489
|
+
if (msg.role !== role) continue;
|
|
3490
|
+
if (typeof msg.content === 'string') return msg.content;
|
|
3491
|
+
if (Array.isArray(msg.content)) {
|
|
3492
|
+
return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n');
|
|
3493
|
+
}
|
|
3494
|
+
}
|
|
3495
|
+
return '';
|
|
3496
|
+
}
|
|
3497
|
+
|
|
3498
|
+
export default defineAssertion(({ output }) => {
|
|
3483
3499
|
// TODO: Implement your assertion logic
|
|
3484
|
-
const
|
|
3500
|
+
const text = getMessageText(output ?? []);
|
|
3501
|
+
const pass = text.length > 0;
|
|
3485
3502
|
return {
|
|
3486
3503
|
pass,
|
|
3487
3504
|
reasoning: pass ? 'Output has content' : 'Output is empty',
|
|
@@ -3491,9 +3508,23 @@ export default defineAssertion(({ outputText }) => {
|
|
|
3491
3508
|
score: `#!/usr/bin/env bun
|
|
3492
3509
|
import { defineAssertion } from '@agentv/eval';
|
|
3493
3510
|
|
|
3494
|
-
|
|
3511
|
+
/** Extract text from the last message with the given role. */
|
|
3512
|
+
function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string {
|
|
3513
|
+
for (let i = messages.length - 1; i >= 0; i--) {
|
|
3514
|
+
const msg = messages[i];
|
|
3515
|
+
if (msg.role !== role) continue;
|
|
3516
|
+
if (typeof msg.content === 'string') return msg.content;
|
|
3517
|
+
if (Array.isArray(msg.content)) {
|
|
3518
|
+
return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n');
|
|
3519
|
+
}
|
|
3520
|
+
}
|
|
3521
|
+
return '';
|
|
3522
|
+
}
|
|
3523
|
+
|
|
3524
|
+
export default defineAssertion(({ output }) => {
|
|
3495
3525
|
// TODO: Implement your scoring logic (0.0 to 1.0)
|
|
3496
|
-
const
|
|
3526
|
+
const text = getMessageText(output ?? []);
|
|
3527
|
+
const score = text.length > 0 ? 1.0 : 0.0;
|
|
3497
3528
|
return {
|
|
3498
3529
|
pass: score >= 0.5,
|
|
3499
3530
|
score,
|
|
@@ -4186,7 +4217,7 @@ var evalRunCommand = command({
|
|
|
4186
4217
|
},
|
|
4187
4218
|
handler: async (args) => {
|
|
4188
4219
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4189
|
-
const { launchInteractiveWizard } = await import("./interactive-
|
|
4220
|
+
const { launchInteractiveWizard } = await import("./interactive-D5UTP72M.js");
|
|
4190
4221
|
await launchInteractiveWizard();
|
|
4191
4222
|
return;
|
|
4192
4223
|
}
|
|
@@ -4421,7 +4452,8 @@ var evalBenchCommand = command({
|
|
|
4421
4452
|
const manifest = JSON.parse(await readFile(join(exportDir, "manifest.json"), "utf8"));
|
|
4422
4453
|
const testIds = manifest.test_ids;
|
|
4423
4454
|
const targetName = manifest.target?.name ?? "unknown";
|
|
4424
|
-
const evalSet = manifest.
|
|
4455
|
+
const evalSet = manifest.dataset ?? "";
|
|
4456
|
+
const experiment = manifest.experiment;
|
|
4425
4457
|
const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
|
|
4426
4458
|
let stdinData;
|
|
4427
4459
|
if (llmScoresPath) {
|
|
@@ -4531,7 +4563,8 @@ var evalBenchCommand = command({
|
|
|
4531
4563
|
JSON.stringify({
|
|
4532
4564
|
timestamp: manifest.timestamp,
|
|
4533
4565
|
test_id: testId,
|
|
4534
|
-
|
|
4566
|
+
dataset: evalSet || void 0,
|
|
4567
|
+
experiment: experiment || void 0,
|
|
4535
4568
|
score: Math.round(weightedScore * 1e3) / 1e3,
|
|
4536
4569
|
target: targetName,
|
|
4537
4570
|
scores,
|
|
@@ -4553,6 +4586,7 @@ var evalBenchCommand = command({
|
|
|
4553
4586
|
metadata: {
|
|
4554
4587
|
eval_file: manifest.eval_file,
|
|
4555
4588
|
timestamp: manifest.timestamp,
|
|
4589
|
+
experiment: experiment || void 0,
|
|
4556
4590
|
targets: [targetName],
|
|
4557
4591
|
tests_run: testIds
|
|
4558
4592
|
},
|
|
@@ -4594,6 +4628,12 @@ function computeStats(values) {
|
|
|
4594
4628
|
// src/commands/pipeline/grade.ts
|
|
4595
4629
|
import { mkdir as mkdir2, readFile as readFile2, readdir as readdir2, writeFile as writeFile3 } from "node:fs/promises";
|
|
4596
4630
|
import { join as join2 } from "node:path";
|
|
4631
|
+
function extractInputText(input) {
|
|
4632
|
+
if (!input || input.length === 0) return "";
|
|
4633
|
+
if (input.length === 1) return input[0].content;
|
|
4634
|
+
return input.map((m) => `@[${m.role}]:
|
|
4635
|
+
${m.content}`).join("\n\n");
|
|
4636
|
+
}
|
|
4597
4637
|
var evalGradeCommand = command({
|
|
4598
4638
|
name: "grade",
|
|
4599
4639
|
description: "Run code-grader assertions on responses in an export directory",
|
|
@@ -4608,7 +4648,7 @@ var evalGradeCommand = command({
|
|
|
4608
4648
|
const manifestPath = join2(exportDir, "manifest.json");
|
|
4609
4649
|
const manifest = JSON.parse(await readFile2(manifestPath, "utf8"));
|
|
4610
4650
|
const testIds = manifest.test_ids;
|
|
4611
|
-
const evalSet = manifest.
|
|
4651
|
+
const evalSet = manifest.dataset ?? "";
|
|
4612
4652
|
const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
|
|
4613
4653
|
let totalGraders = 0;
|
|
4614
4654
|
let totalPassed = 0;
|
|
@@ -4630,14 +4670,13 @@ var evalGradeCommand = command({
|
|
|
4630
4670
|
for (const graderFile of graderFiles) {
|
|
4631
4671
|
const graderConfig = JSON.parse(await readFile2(join2(codeGradersDir, graderFile), "utf8"));
|
|
4632
4672
|
const graderName = graderConfig.name;
|
|
4673
|
+
const inputText = extractInputText(inputData.input);
|
|
4633
4674
|
const payload = JSON.stringify({
|
|
4634
4675
|
output: [{ role: "assistant", content: responseText }],
|
|
4635
|
-
input: inputData.
|
|
4636
|
-
question: inputData.input_text,
|
|
4676
|
+
input: inputData.input,
|
|
4637
4677
|
criteria: "",
|
|
4638
4678
|
expected_output: [],
|
|
4639
|
-
|
|
4640
|
-
input_files: [],
|
|
4679
|
+
input_files: inputData.input_files ?? [],
|
|
4641
4680
|
trace: null,
|
|
4642
4681
|
token_usage: null,
|
|
4643
4682
|
cost_usd: null,
|
|
@@ -4647,8 +4686,8 @@ var evalGradeCommand = command({
|
|
|
4647
4686
|
file_changes: null,
|
|
4648
4687
|
workspace_path: null,
|
|
4649
4688
|
config: graderConfig.config ?? null,
|
|
4650
|
-
metadata: {},
|
|
4651
|
-
input_text:
|
|
4689
|
+
metadata: inputData.metadata ?? {},
|
|
4690
|
+
input_text: inputText,
|
|
4652
4691
|
output_text: responseText,
|
|
4653
4692
|
expected_output_text: ""
|
|
4654
4693
|
});
|
|
@@ -4706,7 +4745,7 @@ var evalGradeCommand = command({
|
|
|
4706
4745
|
// src/commands/pipeline/input.ts
|
|
4707
4746
|
import { readFile as readFile3 } from "node:fs/promises";
|
|
4708
4747
|
import { mkdir as mkdir3, writeFile as writeFile4 } from "node:fs/promises";
|
|
4709
|
-
import { dirname, join as join3, resolve } from "node:path";
|
|
4748
|
+
import { dirname, join as join3, relative, resolve } from "node:path";
|
|
4710
4749
|
var evalInputCommand = command({
|
|
4711
4750
|
name: "input",
|
|
4712
4751
|
description: "Extract eval inputs, target commands, and grader prompts for subagent-mode runs",
|
|
@@ -4720,14 +4759,20 @@ var evalInputCommand = command({
|
|
|
4720
4759
|
type: optional(string),
|
|
4721
4760
|
long: "out",
|
|
4722
4761
|
description: "Output directory for extracted inputs (default: .agentv/results/runs/<timestamp>)"
|
|
4762
|
+
}),
|
|
4763
|
+
experiment: option({
|
|
4764
|
+
type: optional(string),
|
|
4765
|
+
long: "experiment",
|
|
4766
|
+
description: "Experiment label (e.g. with_skills, without_skills)"
|
|
4723
4767
|
})
|
|
4724
4768
|
},
|
|
4725
|
-
handler: async ({ evalPath, out }) => {
|
|
4769
|
+
handler: async ({ evalPath, out, experiment }) => {
|
|
4726
4770
|
const resolvedEvalPath = resolve(evalPath);
|
|
4727
4771
|
const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
|
|
4728
4772
|
const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
|
|
4729
4773
|
const evalDir = dirname(resolvedEvalPath);
|
|
4730
|
-
const
|
|
4774
|
+
const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
|
|
4775
|
+
const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
|
|
4731
4776
|
const tests = suite.tests;
|
|
4732
4777
|
if (tests.length === 0) {
|
|
4733
4778
|
console.error("No tests found in eval file.");
|
|
@@ -4736,6 +4781,7 @@ var evalInputCommand = command({
|
|
|
4736
4781
|
let targetInfo = null;
|
|
4737
4782
|
let targetName = "agent";
|
|
4738
4783
|
let targetKind = "agent";
|
|
4784
|
+
let subagentModeAllowed = true;
|
|
4739
4785
|
try {
|
|
4740
4786
|
const selection = await selectTarget({
|
|
4741
4787
|
testFilePath: resolvedEvalPath,
|
|
@@ -4748,15 +4794,20 @@ var evalInputCommand = command({
|
|
|
4748
4794
|
env: process.env
|
|
4749
4795
|
});
|
|
4750
4796
|
targetName = selection.targetName;
|
|
4751
|
-
|
|
4797
|
+
const resolved = selection.resolvedTarget;
|
|
4798
|
+
subagentModeAllowed = resolved.subagentModeAllowed !== false;
|
|
4799
|
+
if (resolved.kind === "cli") {
|
|
4752
4800
|
targetKind = "cli";
|
|
4753
|
-
|
|
4801
|
+
subagentModeAllowed = false;
|
|
4802
|
+
const config = resolved.config;
|
|
4754
4803
|
targetInfo = {
|
|
4755
4804
|
kind: "cli",
|
|
4756
4805
|
command: config.command,
|
|
4757
4806
|
cwd: config.cwd ?? evalDir,
|
|
4758
4807
|
timeoutMs: config.timeoutMs ?? 3e4
|
|
4759
4808
|
};
|
|
4809
|
+
} else {
|
|
4810
|
+
targetKind = resolved.kind;
|
|
4760
4811
|
}
|
|
4761
4812
|
} catch {
|
|
4762
4813
|
}
|
|
@@ -4768,15 +4819,13 @@ var evalInputCommand = command({
|
|
|
4768
4819
|
const testDir = join3(outDir, ...subpath);
|
|
4769
4820
|
await mkdir3(testDir, { recursive: true });
|
|
4770
4821
|
testIds.push(test.id);
|
|
4771
|
-
const inputText = test.question;
|
|
4772
4822
|
const inputMessages = test.input.map((m) => ({
|
|
4773
4823
|
role: m.role,
|
|
4774
4824
|
content: typeof m.content === "string" ? m.content : m.content
|
|
4775
4825
|
}));
|
|
4776
4826
|
await writeJson(join3(testDir, "input.json"), {
|
|
4777
|
-
|
|
4778
|
-
|
|
4779
|
-
file_paths: test.file_paths,
|
|
4827
|
+
input: inputMessages,
|
|
4828
|
+
input_files: test.file_paths,
|
|
4780
4829
|
metadata: test.metadata ?? {}
|
|
4781
4830
|
});
|
|
4782
4831
|
if (targetInfo) {
|
|
@@ -4804,11 +4853,13 @@ var evalInputCommand = command({
|
|
|
4804
4853
|
}
|
|
4805
4854
|
await writeJson(join3(outDir, "manifest.json"), {
|
|
4806
4855
|
eval_file: resolvedEvalPath,
|
|
4807
|
-
|
|
4856
|
+
dataset: evalSetName || void 0,
|
|
4857
|
+
experiment: experiment || void 0,
|
|
4808
4858
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
4809
4859
|
target: {
|
|
4810
4860
|
name: targetName,
|
|
4811
|
-
kind: targetKind
|
|
4861
|
+
kind: targetKind,
|
|
4862
|
+
subagent_mode_allowed: subagentModeAllowed
|
|
4812
4863
|
},
|
|
4813
4864
|
test_ids: testIds
|
|
4814
4865
|
});
|
|
@@ -4870,7 +4921,13 @@ import { execSync } from "node:child_process";
|
|
|
4870
4921
|
import { existsSync as existsSync3, readFileSync as readFileSync4, unlinkSync } from "node:fs";
|
|
4871
4922
|
import { mkdir as mkdir4, readFile as readFile4, readdir as readdir3, writeFile as writeFile5 } from "node:fs/promises";
|
|
4872
4923
|
import { tmpdir } from "node:os";
|
|
4873
|
-
import { dirname as dirname2, join as join4, resolve as resolve2 } from "node:path";
|
|
4924
|
+
import { dirname as dirname2, join as join4, relative as relative2, resolve as resolve2 } from "node:path";
|
|
4925
|
+
function extractInputText2(input) {
|
|
4926
|
+
if (!input || input.length === 0) return "";
|
|
4927
|
+
if (input.length === 1) return input[0].content;
|
|
4928
|
+
return input.map((m) => `@[${m.role}]:
|
|
4929
|
+
${m.content}`).join("\n\n");
|
|
4930
|
+
}
|
|
4874
4931
|
function loadEnvFile(dir) {
|
|
4875
4932
|
let current = resolve2(dir);
|
|
4876
4933
|
while (true) {
|
|
@@ -4910,14 +4967,20 @@ var evalRunCommand2 = command({
|
|
|
4910
4967
|
type: optional(number),
|
|
4911
4968
|
long: "workers",
|
|
4912
4969
|
description: "Parallel workers for target invocation (default: all tests)"
|
|
4970
|
+
}),
|
|
4971
|
+
experiment: option({
|
|
4972
|
+
type: optional(string),
|
|
4973
|
+
long: "experiment",
|
|
4974
|
+
description: "Experiment label (e.g. with_skills, without_skills)"
|
|
4913
4975
|
})
|
|
4914
4976
|
},
|
|
4915
|
-
handler: async ({ evalPath, out, workers }) => {
|
|
4977
|
+
handler: async ({ evalPath, out, workers, experiment }) => {
|
|
4916
4978
|
const resolvedEvalPath = resolve2(evalPath);
|
|
4917
4979
|
const outDir = resolve2(out ?? buildDefaultRunDir(process.cwd()));
|
|
4918
4980
|
const repoRoot = await findRepoRoot(dirname2(resolvedEvalPath));
|
|
4919
4981
|
const evalDir = dirname2(resolvedEvalPath);
|
|
4920
|
-
const
|
|
4982
|
+
const category = deriveCategory(relative2(process.cwd(), resolvedEvalPath));
|
|
4983
|
+
const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
|
|
4921
4984
|
const tests = suite.tests;
|
|
4922
4985
|
if (tests.length === 0) {
|
|
4923
4986
|
console.error("No tests found in eval file.");
|
|
@@ -4958,15 +5021,13 @@ var evalRunCommand2 = command({
|
|
|
4958
5021
|
const testDir = join4(outDir, ...subpath);
|
|
4959
5022
|
await mkdir4(testDir, { recursive: true });
|
|
4960
5023
|
testIds.push(test.id);
|
|
4961
|
-
const inputText = test.question;
|
|
4962
5024
|
const inputMessages = test.input.map((m) => ({
|
|
4963
5025
|
role: m.role,
|
|
4964
5026
|
content: typeof m.content === "string" ? m.content : m.content
|
|
4965
5027
|
}));
|
|
4966
5028
|
await writeJson2(join4(testDir, "input.json"), {
|
|
4967
|
-
|
|
4968
|
-
|
|
4969
|
-
file_paths: test.file_paths,
|
|
5029
|
+
input: inputMessages,
|
|
5030
|
+
input_files: test.file_paths,
|
|
4970
5031
|
metadata: test.metadata ?? {}
|
|
4971
5032
|
});
|
|
4972
5033
|
if (targetInfo) {
|
|
@@ -4994,7 +5055,8 @@ var evalRunCommand2 = command({
|
|
|
4994
5055
|
}
|
|
4995
5056
|
await writeJson2(join4(outDir, "manifest.json"), {
|
|
4996
5057
|
eval_file: resolvedEvalPath,
|
|
4997
|
-
|
|
5058
|
+
dataset: evalSetName || void 0,
|
|
5059
|
+
experiment: experiment || void 0,
|
|
4998
5060
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
4999
5061
|
target: { name: targetName, kind: targetKind },
|
|
5000
5062
|
test_ids: testIds
|
|
@@ -5019,11 +5081,12 @@ var evalRunCommand2 = command({
|
|
|
5019
5081
|
const timeoutMs = invoke.timeout_ms ?? 12e4;
|
|
5020
5082
|
const promptFile = join4(tmpdir(), `agentv-prompt-${testId}-${Date.now()}.txt`);
|
|
5021
5083
|
const outputFile = join4(tmpdir(), `agentv-output-${testId}-${Date.now()}.txt`);
|
|
5022
|
-
|
|
5084
|
+
const inputText = extractInputText2(inputData.input);
|
|
5085
|
+
await writeFile5(promptFile, inputText, "utf8");
|
|
5023
5086
|
let rendered = template;
|
|
5024
5087
|
rendered = rendered.replace("{PROMPT_FILE}", promptFile);
|
|
5025
5088
|
rendered = rendered.replace("{OUTPUT_FILE}", outputFile);
|
|
5026
|
-
rendered = rendered.replace("{PROMPT}",
|
|
5089
|
+
rendered = rendered.replace("{PROMPT}", inputText);
|
|
5027
5090
|
const start = performance.now();
|
|
5028
5091
|
try {
|
|
5029
5092
|
execSync(rendered, {
|
|
@@ -5100,14 +5163,13 @@ var evalRunCommand2 = command({
|
|
|
5100
5163
|
for (const graderFile of graderFiles) {
|
|
5101
5164
|
const graderConfig = JSON.parse(await readFile4(join4(codeGradersDir, graderFile), "utf8"));
|
|
5102
5165
|
const graderName = graderConfig.name;
|
|
5166
|
+
const inputText = extractInputText2(inputData.input);
|
|
5103
5167
|
const payload = JSON.stringify({
|
|
5104
5168
|
output: [{ role: "assistant", content: responseText }],
|
|
5105
|
-
input: inputData.
|
|
5106
|
-
question: inputData.input_text,
|
|
5169
|
+
input: inputData.input,
|
|
5107
5170
|
criteria: "",
|
|
5108
5171
|
expected_output: [],
|
|
5109
|
-
|
|
5110
|
-
input_files: [],
|
|
5172
|
+
input_files: inputData.input_files ?? [],
|
|
5111
5173
|
trace: null,
|
|
5112
5174
|
token_usage: null,
|
|
5113
5175
|
cost_usd: null,
|
|
@@ -5117,8 +5179,8 @@ var evalRunCommand2 = command({
|
|
|
5117
5179
|
file_changes: null,
|
|
5118
5180
|
workspace_path: null,
|
|
5119
5181
|
config: graderConfig.config ?? null,
|
|
5120
|
-
metadata: {},
|
|
5121
|
-
input_text:
|
|
5182
|
+
metadata: inputData.metadata ?? {},
|
|
5183
|
+
input_text: inputText,
|
|
5122
5184
|
output_text: responseText,
|
|
5123
5185
|
expected_output_text: ""
|
|
5124
5186
|
});
|
|
@@ -5306,7 +5368,7 @@ function toRawResult(result) {
|
|
|
5306
5368
|
return {
|
|
5307
5369
|
timestamp: result.timestamp,
|
|
5308
5370
|
test_id: result.testId,
|
|
5309
|
-
|
|
5371
|
+
dataset: result.dataset,
|
|
5310
5372
|
conversation_id: result.conversationId,
|
|
5311
5373
|
score: result.score,
|
|
5312
5374
|
assertions: result.assertions?.map((assertion) => ({
|
|
@@ -5429,7 +5491,7 @@ function loadOtlpTraceFile(filePath) {
|
|
|
5429
5491
|
}
|
|
5430
5492
|
return {
|
|
5431
5493
|
test_id: stringAttr(rootAttrs.agentv_test_id) ?? stringAttr(rootAttrs.agentv_eval_id) ?? `trace-${index + 1}`,
|
|
5432
|
-
|
|
5494
|
+
dataset: stringAttr(rootAttrs.agentv_dataset),
|
|
5433
5495
|
target: stringAttr(rootAttrs.agentv_target),
|
|
5434
5496
|
score,
|
|
5435
5497
|
error: root.status?.code === 2 ? root.status.message : void 0,
|
|
@@ -6173,8 +6235,9 @@ var resultsCommand = subcommands({
|
|
|
6173
6235
|
});
|
|
6174
6236
|
|
|
6175
6237
|
// src/commands/results/serve.ts
|
|
6176
|
-
import { existsSync as existsSync7, readFileSync as readFileSync8, writeFileSync as writeFileSync3 } from "node:fs";
|
|
6238
|
+
import { existsSync as existsSync7, readFileSync as readFileSync8, readdirSync as readdirSync3, statSync as statSync4, writeFileSync as writeFileSync3 } from "node:fs";
|
|
6177
6239
|
import path9 from "node:path";
|
|
6240
|
+
import { fileURLToPath as fileURLToPath2 } from "node:url";
|
|
6178
6241
|
import { Hono } from "hono";
|
|
6179
6242
|
function feedbackPath(resultDir) {
|
|
6180
6243
|
return path9.join(resultDir, "feedback.json");
|
|
@@ -6195,24 +6258,45 @@ function writeFeedback(cwd, data) {
|
|
|
6195
6258
|
writeFileSync3(feedbackPath(cwd), `${JSON.stringify(data, null, 2)}
|
|
6196
6259
|
`, "utf8");
|
|
6197
6260
|
}
|
|
6198
|
-
function createApp(results, resultDir, cwd, sourceFile) {
|
|
6261
|
+
function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
6199
6262
|
const searchDir = cwd ?? resultDir;
|
|
6200
6263
|
const app2 = new Hono();
|
|
6264
|
+
const studioDistPath = options?.studioDir === false ? void 0 : options?.studioDir ?? resolveStudioDistDir();
|
|
6201
6265
|
app2.get("/", (c3) => {
|
|
6266
|
+
if (studioDistPath) {
|
|
6267
|
+
const indexPath = path9.join(studioDistPath, "index.html");
|
|
6268
|
+
if (existsSync7(indexPath)) {
|
|
6269
|
+
return c3.html(readFileSync8(indexPath, "utf8"));
|
|
6270
|
+
}
|
|
6271
|
+
}
|
|
6202
6272
|
return c3.html(generateServeHtml(results, sourceFile));
|
|
6203
6273
|
});
|
|
6204
6274
|
app2.get("/api/runs", (c3) => {
|
|
6205
6275
|
const metas = listResultFiles(searchDir);
|
|
6206
6276
|
return c3.json({
|
|
6207
|
-
runs: metas.map((m) =>
|
|
6208
|
-
|
|
6209
|
-
|
|
6210
|
-
|
|
6211
|
-
|
|
6212
|
-
|
|
6213
|
-
|
|
6214
|
-
|
|
6215
|
-
|
|
6277
|
+
runs: metas.map((m) => {
|
|
6278
|
+
let target;
|
|
6279
|
+
let experiment;
|
|
6280
|
+
try {
|
|
6281
|
+
const records = loadLightweightResults(m.path);
|
|
6282
|
+
if (records.length > 0) {
|
|
6283
|
+
target = records[0].target;
|
|
6284
|
+
experiment = records[0].experiment;
|
|
6285
|
+
}
|
|
6286
|
+
} catch {
|
|
6287
|
+
}
|
|
6288
|
+
return {
|
|
6289
|
+
filename: m.filename,
|
|
6290
|
+
path: m.path,
|
|
6291
|
+
timestamp: m.timestamp,
|
|
6292
|
+
test_count: m.testCount,
|
|
6293
|
+
pass_rate: m.passRate,
|
|
6294
|
+
avg_score: m.avgScore,
|
|
6295
|
+
size_bytes: m.sizeBytes,
|
|
6296
|
+
...target && { target },
|
|
6297
|
+
...experiment && { experiment }
|
|
6298
|
+
};
|
|
6299
|
+
})
|
|
6216
6300
|
});
|
|
6217
6301
|
});
|
|
6218
6302
|
app2.get("/api/runs/:filename", (c3) => {
|
|
@@ -6272,8 +6356,393 @@ function createApp(results, resultDir, cwd, sourceFile) {
|
|
|
6272
6356
|
writeFeedback(resultDir, existing);
|
|
6273
6357
|
return c3.json(existing);
|
|
6274
6358
|
});
|
|
6359
|
+
app2.get("/api/runs/:filename/datasets", (c3) => {
|
|
6360
|
+
const filename = c3.req.param("filename");
|
|
6361
|
+
const metas = listResultFiles(searchDir);
|
|
6362
|
+
const meta = metas.find((m) => m.filename === filename);
|
|
6363
|
+
if (!meta) {
|
|
6364
|
+
return c3.json({ error: "Run not found" }, 404);
|
|
6365
|
+
}
|
|
6366
|
+
try {
|
|
6367
|
+
const loaded = patchTestIds(loadManifestResults(meta.path));
|
|
6368
|
+
const datasetMap = /* @__PURE__ */ new Map();
|
|
6369
|
+
for (const r of loaded) {
|
|
6370
|
+
const ds = r.dataset ?? r.target ?? "default";
|
|
6371
|
+
const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
|
|
6372
|
+
entry.total++;
|
|
6373
|
+
if (r.score >= 1) entry.passed++;
|
|
6374
|
+
entry.scoreSum += r.score;
|
|
6375
|
+
datasetMap.set(ds, entry);
|
|
6376
|
+
}
|
|
6377
|
+
const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
|
|
6378
|
+
name,
|
|
6379
|
+
total: entry.total,
|
|
6380
|
+
passed: entry.passed,
|
|
6381
|
+
failed: entry.total - entry.passed,
|
|
6382
|
+
avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0
|
|
6383
|
+
}));
|
|
6384
|
+
return c3.json({ datasets });
|
|
6385
|
+
} catch {
|
|
6386
|
+
return c3.json({ error: "Failed to load datasets" }, 500);
|
|
6387
|
+
}
|
|
6388
|
+
});
|
|
6389
|
+
app2.get("/api/runs/:filename/categories", (c3) => {
|
|
6390
|
+
const filename = c3.req.param("filename");
|
|
6391
|
+
const metas = listResultFiles(searchDir);
|
|
6392
|
+
const meta = metas.find((m) => m.filename === filename);
|
|
6393
|
+
if (!meta) {
|
|
6394
|
+
return c3.json({ error: "Run not found" }, 404);
|
|
6395
|
+
}
|
|
6396
|
+
try {
|
|
6397
|
+
const loaded = patchTestIds(loadManifestResults(meta.path));
|
|
6398
|
+
const categoryMap = /* @__PURE__ */ new Map();
|
|
6399
|
+
for (const r of loaded) {
|
|
6400
|
+
const cat = r.category ?? DEFAULT_CATEGORY;
|
|
6401
|
+
const entry = categoryMap.get(cat) ?? {
|
|
6402
|
+
total: 0,
|
|
6403
|
+
passed: 0,
|
|
6404
|
+
scoreSum: 0,
|
|
6405
|
+
datasets: /* @__PURE__ */ new Set()
|
|
6406
|
+
};
|
|
6407
|
+
entry.total++;
|
|
6408
|
+
if (r.score >= 1) entry.passed++;
|
|
6409
|
+
entry.scoreSum += r.score;
|
|
6410
|
+
entry.datasets.add(r.dataset ?? r.target ?? "default");
|
|
6411
|
+
categoryMap.set(cat, entry);
|
|
6412
|
+
}
|
|
6413
|
+
const categories = [...categoryMap.entries()].map(([name, entry]) => ({
|
|
6414
|
+
name,
|
|
6415
|
+
total: entry.total,
|
|
6416
|
+
passed: entry.passed,
|
|
6417
|
+
failed: entry.total - entry.passed,
|
|
6418
|
+
avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0,
|
|
6419
|
+
dataset_count: entry.datasets.size
|
|
6420
|
+
}));
|
|
6421
|
+
return c3.json({ categories });
|
|
6422
|
+
} catch {
|
|
6423
|
+
return c3.json({ error: "Failed to load categories" }, 500);
|
|
6424
|
+
}
|
|
6425
|
+
});
|
|
6426
|
+
app2.get("/api/runs/:filename/categories/:category/datasets", (c3) => {
|
|
6427
|
+
const filename = c3.req.param("filename");
|
|
6428
|
+
const category = decodeURIComponent(c3.req.param("category"));
|
|
6429
|
+
const metas = listResultFiles(searchDir);
|
|
6430
|
+
const meta = metas.find((m) => m.filename === filename);
|
|
6431
|
+
if (!meta) {
|
|
6432
|
+
return c3.json({ error: "Run not found" }, 404);
|
|
6433
|
+
}
|
|
6434
|
+
try {
|
|
6435
|
+
const loaded = patchTestIds(loadManifestResults(meta.path));
|
|
6436
|
+
const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category);
|
|
6437
|
+
const datasetMap = /* @__PURE__ */ new Map();
|
|
6438
|
+
for (const r of filtered) {
|
|
6439
|
+
const ds = r.dataset ?? r.target ?? "default";
|
|
6440
|
+
const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
|
|
6441
|
+
entry.total++;
|
|
6442
|
+
if (r.score >= 1) entry.passed++;
|
|
6443
|
+
entry.scoreSum += r.score;
|
|
6444
|
+
datasetMap.set(ds, entry);
|
|
6445
|
+
}
|
|
6446
|
+
const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
|
|
6447
|
+
name,
|
|
6448
|
+
total: entry.total,
|
|
6449
|
+
passed: entry.passed,
|
|
6450
|
+
failed: entry.total - entry.passed,
|
|
6451
|
+
avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0
|
|
6452
|
+
}));
|
|
6453
|
+
return c3.json({ datasets });
|
|
6454
|
+
} catch {
|
|
6455
|
+
return c3.json({ error: "Failed to load datasets" }, 500);
|
|
6456
|
+
}
|
|
6457
|
+
});
|
|
6458
|
+
app2.get("/api/runs/:filename/evals/:evalId", (c3) => {
|
|
6459
|
+
const filename = c3.req.param("filename");
|
|
6460
|
+
const evalId = c3.req.param("evalId");
|
|
6461
|
+
const metas = listResultFiles(searchDir);
|
|
6462
|
+
const meta = metas.find((m) => m.filename === filename);
|
|
6463
|
+
if (!meta) {
|
|
6464
|
+
return c3.json({ error: "Run not found" }, 404);
|
|
6465
|
+
}
|
|
6466
|
+
try {
|
|
6467
|
+
const loaded = patchTestIds(loadManifestResults(meta.path));
|
|
6468
|
+
const result = loaded.find((r) => r.testId === evalId);
|
|
6469
|
+
if (!result) {
|
|
6470
|
+
return c3.json({ error: "Eval not found" }, 404);
|
|
6471
|
+
}
|
|
6472
|
+
return c3.json({ eval: result });
|
|
6473
|
+
} catch {
|
|
6474
|
+
return c3.json({ error: "Failed to load eval" }, 500);
|
|
6475
|
+
}
|
|
6476
|
+
});
|
|
6477
|
+
app2.get("/api/index", (c3) => {
|
|
6478
|
+
const metas = listResultFiles(searchDir);
|
|
6479
|
+
const entries2 = metas.map((m) => {
|
|
6480
|
+
let totalCostUsd = 0;
|
|
6481
|
+
try {
|
|
6482
|
+
const loaded = patchTestIds(loadManifestResults(m.path));
|
|
6483
|
+
totalCostUsd = loaded.reduce((sum, r) => sum + (r.costUsd ?? 0), 0);
|
|
6484
|
+
} catch {
|
|
6485
|
+
}
|
|
6486
|
+
return {
|
|
6487
|
+
run_filename: m.filename,
|
|
6488
|
+
test_count: m.testCount,
|
|
6489
|
+
pass_rate: m.passRate,
|
|
6490
|
+
avg_score: m.avgScore,
|
|
6491
|
+
total_cost_usd: totalCostUsd,
|
|
6492
|
+
timestamp: m.timestamp
|
|
6493
|
+
};
|
|
6494
|
+
});
|
|
6495
|
+
return c3.json({ entries: entries2 });
|
|
6496
|
+
});
|
|
6497
|
+
function buildFileTree(dirPath, relativeTo) {
|
|
6498
|
+
if (!existsSync7(dirPath) || !statSync4(dirPath).isDirectory()) {
|
|
6499
|
+
return [];
|
|
6500
|
+
}
|
|
6501
|
+
const entries2 = readdirSync3(dirPath, { withFileTypes: true });
|
|
6502
|
+
return entries2.sort((a, b) => {
|
|
6503
|
+
if (a.isDirectory() !== b.isDirectory()) return a.isDirectory() ? -1 : 1;
|
|
6504
|
+
return a.name.localeCompare(b.name);
|
|
6505
|
+
}).map((entry) => {
|
|
6506
|
+
const fullPath = path9.join(dirPath, entry.name);
|
|
6507
|
+
const relPath = path9.relative(relativeTo, fullPath);
|
|
6508
|
+
if (entry.isDirectory()) {
|
|
6509
|
+
return {
|
|
6510
|
+
name: entry.name,
|
|
6511
|
+
path: relPath,
|
|
6512
|
+
type: "dir",
|
|
6513
|
+
children: buildFileTree(fullPath, relativeTo)
|
|
6514
|
+
};
|
|
6515
|
+
}
|
|
6516
|
+
return { name: entry.name, path: relPath, type: "file" };
|
|
6517
|
+
});
|
|
6518
|
+
}
|
|
6519
|
+
function inferLanguage(filePath) {
|
|
6520
|
+
const ext = path9.extname(filePath).toLowerCase();
|
|
6521
|
+
const langMap = {
|
|
6522
|
+
".json": "json",
|
|
6523
|
+
".jsonl": "json",
|
|
6524
|
+
".ts": "typescript",
|
|
6525
|
+
".tsx": "typescript",
|
|
6526
|
+
".js": "javascript",
|
|
6527
|
+
".jsx": "javascript",
|
|
6528
|
+
".md": "markdown",
|
|
6529
|
+
".yaml": "yaml",
|
|
6530
|
+
".yml": "yaml",
|
|
6531
|
+
".log": "plaintext",
|
|
6532
|
+
".txt": "plaintext",
|
|
6533
|
+
".py": "python",
|
|
6534
|
+
".sh": "shell",
|
|
6535
|
+
".bash": "shell",
|
|
6536
|
+
".css": "css",
|
|
6537
|
+
".html": "html",
|
|
6538
|
+
".xml": "xml",
|
|
6539
|
+
".svg": "xml",
|
|
6540
|
+
".toml": "toml",
|
|
6541
|
+
".diff": "diff",
|
|
6542
|
+
".patch": "diff"
|
|
6543
|
+
};
|
|
6544
|
+
return langMap[ext] ?? "plaintext";
|
|
6545
|
+
}
|
|
6546
|
+
/**
 * GET /api/runs/:filename/evals/:evalId/files
 *
 * Responds with `{ files }`, the result of `buildFileTree` for the directory
 * that contains all artifact paths recorded for one eval in a run manifest.
 *
 * Responses:
 *  - 404 `{ error: "Run not found" }` when no result file matches `filename`.
 *  - 404 `{ error: "Eval not found" }` when no manifest record has a matching
 *    `test_id` (or `eval_id` when `test_id` is nullish).
 *  - 200 `{ files: [] }` when the record lists none of the known artifact paths.
 *  - 500 `{ error: "Failed to load file tree" }` when reading/parsing throws.
 */
app2.get("/api/runs/:filename/evals/:evalId/files", (c3) => {
  const filename = c3.req.param("filename");
  const evalId = c3.req.param("evalId");
  const metas = listResultFiles(searchDir);
  const meta = metas.find((m) => m.filename === filename);
  if (!meta) {
    return c3.json({ error: "Run not found" }, 404);
  }
  try {
    const content = readFileSync8(meta.path, "utf8");
    const records = parseResultManifest(content);
    const record = records.find((r) => (r.test_id ?? r.eval_id) === evalId);
    if (!record) {
      return c3.json({ error: "Eval not found" }, 404);
    }
    const baseDir = path9.dirname(meta.path);
    const knownPaths = [
      record.grading_path,
      record.timing_path,
      record.input_path,
      record.output_path,
      record.response_path
    ].filter((p) => !!p);
    if (knownPaths.length === 0) {
      return c3.json({ files: [] });
    }
    // Path-aware containment test. A plain string `startsWith` would treat
    // "run-10" as living inside "run-1", so compare via path.relative instead.
    const isSameOrAncestor = (ancestor, dir) => {
      const rel = path9.relative(ancestor, dir);
      return rel === "" || rel !== ".." && !rel.startsWith(`..${path9.sep}`) && !path9.isAbsolute(rel);
    };
    const artifactDirs = knownPaths.map((p) => path9.dirname(p));
    let commonDir = artifactDirs[0];
    for (const dir of artifactDirs) {
      while (!isSameOrAncestor(commonDir, dir)) {
        const parent = path9.dirname(commonDir);
        // path.dirname reaches a fixed point ("." for relative paths, the root
        // for absolute ones); stop there instead of looping forever.
        if (parent === commonDir) {
          break;
        }
        commonDir = parent;
      }
    }
    const artifactAbsDir = path9.join(baseDir, commonDir);
    const files = buildFileTree(artifactAbsDir, baseDir);
    return c3.json({ files });
  } catch {
    return c3.json({ error: "Failed to load file tree" }, 500);
  }
});
|
|
6586
|
+
/**
 * GET /api/runs/:filename/evals/:evalId/files/*
 *
 * Responds with `{ content, language }` for a single file addressed by the
 * wildcard portion of the URL, resolved against the directory that holds the
 * run's result file.
 *
 * Responses:
 *  - 404 `{ error: "Run not found" }` when no result file matches `filename`.
 *  - 400 `{ error: "No file path specified" }` when the wildcard part is empty.
 *  - 403 `{ error: "Path traversal not allowed" }` when the resolved path is
 *    neither the base directory itself nor located beneath it.
 *  - 404 `{ error: "File not found" }` when the path is missing or not a file.
 *  - 500 `{ error: "Failed to read file" }` when reading throws.
 */
app2.get("/api/runs/:filename/evals/:evalId/files/*", (c3) => {
  const filename = c3.req.param("filename");
  const evalId = c3.req.param("evalId");
  const meta = listResultFiles(searchDir).find((candidate) => candidate.filename === filename);
  if (!meta) {
    return c3.json({ error: "Run not found" }, 404);
  }

  // Everything after the fixed route prefix is the requested relative path.
  const routePrefix = `/api/runs/${filename}/evals/${evalId}/files/`;
  const filePath = c3.req.path.slice(routePrefix.length);
  if (!filePath) {
    return c3.json({ error: "No file path specified" }, 400);
  }

  const baseDir = path9.dirname(meta.path);
  const resolvedBase = path9.resolve(baseDir);
  const absolutePath = path9.resolve(baseDir, filePath);
  const staysInsideBase = absolutePath === resolvedBase || absolutePath.startsWith(resolvedBase + path9.sep);
  if (!staysInsideBase) {
    return c3.json({ error: "Path traversal not allowed" }, 403);
  }

  const isRegularFile = existsSync7(absolutePath) && statSync4(absolutePath).isFile();
  if (!isRegularFile) {
    return c3.json({ error: "File not found" }, 404);
  }

  try {
    const fileContent = readFileSync8(absolutePath, "utf8");
    return c3.json({ content: fileContent, language: inferLanguage(absolutePath) });
  } catch {
    return c3.json({ error: "Failed to read file" }, 500);
  }
});
|
|
6616
|
+
/**
 * GET /api/experiments
 *
 * Aggregates every record of every result file by its `experiment` field
 * (records without one are grouped under "default") and responds with
 * `{ experiments }`, one summary object per experiment name.
 *
 * A record counts as passed when its `score` is >= 1. Result files whose
 * loading or iteration throws are skipped from that point on (best effort).
 */
app2.get("/api/experiments", (c3) => {
  const summaries = new Map();
  for (const resultFile of listResultFiles(searchDir)) {
    try {
      for (const row of loadLightweightResults(resultFile.path)) {
        const experimentName = row.experiment ?? "default";
        let summary = summaries.get(experimentName);
        if (summary === null || summary === undefined) {
          summary = {
            targets: new Set(),
            runFilenames: new Set(),
            evalCount: 0,
            passedCount: 0,
            lastTimestamp: ""
          };
        }
        summary.runFilenames.add(resultFile.filename);
        if (row.target) {
          summary.targets.add(row.target);
        }
        summary.evalCount += 1;
        if (row.score >= 1) {
          summary.passedCount += 1;
        }
        // Keep the greatest timestamp seen for this experiment.
        if (row.timestamp && row.timestamp > summary.lastTimestamp) {
          summary.lastTimestamp = row.timestamp;
        }
        summaries.set(experimentName, summary);
      }
    } catch {
      // Intentionally ignored: a file that cannot be processed is skipped.
    }
  }
  const experiments = Array.from(summaries, ([name, summary]) => {
    const passRate = summary.evalCount > 0 ? summary.passedCount / summary.evalCount : 0;
    return {
      name,
      run_count: summary.runFilenames.size,
      target_count: summary.targets.size,
      eval_count: summary.evalCount,
      passed_count: summary.passedCount,
      pass_rate: passRate,
      last_run: summary.lastTimestamp || null
    };
  });
  return c3.json({ experiments });
});
|
|
6654
|
+
/**
 * GET /api/targets
 *
 * Aggregates every record of every result file by its `target` field
 * (records without one are grouped under "default") and responds with
 * `{ targets }`, one summary object per target name.
 *
 * A record counts as passed when its `score` is >= 1. Result files whose
 * loading or iteration throws are skipped from that point on (best effort).
 */
app2.get("/api/targets", (c3) => {
  const summaries = new Map();
  for (const resultFile of listResultFiles(searchDir)) {
    try {
      for (const row of loadLightweightResults(resultFile.path)) {
        const targetName = row.target ?? "default";
        let summary = summaries.get(targetName);
        if (summary === null || summary === undefined) {
          summary = {
            experiments: new Set(),
            runFilenames: new Set(),
            evalCount: 0,
            passedCount: 0
          };
        }
        summary.runFilenames.add(resultFile.filename);
        if (row.experiment) {
          summary.experiments.add(row.experiment);
        }
        summary.evalCount += 1;
        if (row.score >= 1) {
          summary.passedCount += 1;
        }
        summaries.set(targetName, summary);
      }
    } catch {
      // Intentionally ignored: a file that cannot be processed is skipped.
    }
  }
  const targets = Array.from(summaries, ([name, summary]) => {
    const passRate = summary.evalCount > 0 ? summary.passedCount / summary.evalCount : 0;
    return {
      name,
      run_count: summary.runFilenames.size,
      experiment_count: summary.experiments.size,
      eval_count: summary.evalCount,
      passed_count: summary.passedCount,
      pass_rate: passRate
    };
  });
  return c3.json({ targets });
});
|
|
6687
|
+
// Static hosting for the Studio front-end; only registered when a Studio
// build directory was supplied.
if (studioDistPath) {
  // GET /assets/* — serve a built asset from the Studio dist directory with a
  // long-lived immutable cache header.
  app2.get("/assets/*", (c3) => {
    const assetPath = c3.req.path;
    const distRoot = path9.resolve(studioDistPath);
    const filePath = path9.join(studioDistPath, assetPath);
    // Refuse anything that resolves outside the dist directory, mirroring the
    // containment check used by the eval file-content route.
    if (!path9.resolve(filePath).startsWith(distRoot + path9.sep)) {
      return c3.notFound();
    }
    // existsSync is also true for directories (e.g. "/assets/"), and reading
    // one would throw EISDIR, so require a regular file.
    if (!existsSync7(filePath) || !statSync4(filePath).isFile()) {
      return c3.notFound();
    }
    const content = readFileSync8(filePath);
    // Lower-cased so the lookup is case-insensitive, like inferLanguage.
    const ext = path9.extname(filePath).toLowerCase();
    const mimeTypes = {
      ".js": "application/javascript",
      ".css": "text/css",
      ".html": "text/html",
      ".json": "application/json",
      ".svg": "image/svg+xml",
      ".png": "image/png",
      ".woff2": "font/woff2",
      ".woff": "font/woff"
    };
    const contentType = mimeTypes[ext] ?? "application/octet-stream";
    return new Response(content, {
      headers: {
        "Content-Type": contentType,
        "Cache-Control": "public, max-age=31536000, immutable"
      }
    });
  });
  // Catch-all: unknown /api/ paths get a JSON 404; every other path receives
  // index.html when it exists.
  app2.get("*", (c3) => {
    if (c3.req.path.startsWith("/api/")) {
      return c3.json({ error: "Not found" }, 404);
    }
    const indexPath = path9.join(studioDistPath, "index.html");
    if (existsSync7(indexPath)) {
      return c3.html(readFileSync8(indexPath, "utf8"));
    }
    return c3.notFound();
  });
}
|
|
6275
6725
|
return app2;
|
|
6276
6726
|
}
|
|
6727
|
+
/**
 * Locate the Studio front-end build directory relative to this module.
 *
 * Candidate locations are probed in order; the first one that exists and
 * contains an `index.html` wins.
 *
 * @returns The absolute path of the first matching directory, or `undefined`
 *   when none of the candidates qualifies.
 */
function resolveStudioDistDir() {
  let currentDir;
  if (typeof __dirname !== "undefined") {
    currentDir = __dirname;
  } else {
    currentDir = path9.dirname(fileURLToPath2(import.meta.url));
  }
  const relativeCandidates = [
    // From src/commands/results/ → sibling apps/studio/dist
    "../../../../studio/dist",
    // From dist/ → sibling apps/studio/dist (monorepo dev)
    "../../studio/dist",
    // Bundled inside CLI dist (published package)
    "../studio",
    // From dist/ in monorepo root context
    "../../../apps/studio/dist"
  ];
  return relativeCandidates
    .map((relativeDir) => path9.resolve(currentDir, relativeDir))
    .find((dir) => existsSync7(dir) && existsSync7(path9.join(dir, "index.html")));
}
|
|
6277
6746
|
function stripHeavyFields(results) {
|
|
6278
6747
|
return results.map((r) => {
|
|
6279
6748
|
const { requests, trace, ...rest } = r;
|
|
@@ -6956,8 +7425,8 @@ var SERVE_SCRIPT = `
|
|
|
6956
7425
|
})();
|
|
6957
7426
|
`;
|
|
6958
7427
|
var resultsServeCommand = command({
|
|
6959
|
-
name: "
|
|
6960
|
-
description: "Start a local
|
|
7428
|
+
name: "studio",
|
|
7429
|
+
description: "Start AgentV Studio \u2014 a local dashboard for reviewing evaluation results",
|
|
6961
7430
|
args: {
|
|
6962
7431
|
source: positional({
|
|
6963
7432
|
type: optional(string),
|
|
@@ -7594,7 +8063,7 @@ function formatResultDetail(result, index, tree) {
|
|
|
7594
8063
|
}
|
|
7595
8064
|
const scoreColor = result.score >= 0.9 ? c2.green : result.score >= 0.5 ? c2.yellow : c2.red;
|
|
7596
8065
|
lines.push(
|
|
7597
|
-
`${c2.bold}${testId}${c2.reset} ${scoreColor}${formatScore(result.score)}${c2.reset}${result.target ? ` ${c2.dim}target: ${result.target}${c2.reset}` : ""}${result.
|
|
8066
|
+
`${c2.bold}${testId}${c2.reset} ${scoreColor}${formatScore(result.score)}${c2.reset}${result.target ? ` ${c2.dim}target: ${result.target}${c2.reset}` : ""}${result.dataset ? ` ${c2.dim}dataset: ${result.dataset}${c2.reset}` : ""}`
|
|
7598
8067
|
);
|
|
7599
8068
|
if (result.error) {
|
|
7600
8069
|
lines.push(` ${c2.red}Error: ${result.error}${c2.reset}`);
|
|
@@ -7768,8 +8237,8 @@ function groupResults(results, groupBy2) {
|
|
|
7768
8237
|
case "target":
|
|
7769
8238
|
key = result.target ?? "unknown";
|
|
7770
8239
|
break;
|
|
7771
|
-
case "
|
|
7772
|
-
key = result.
|
|
8240
|
+
case "dataset":
|
|
8241
|
+
key = result.dataset ?? "unknown";
|
|
7773
8242
|
break;
|
|
7774
8243
|
case "test-id":
|
|
7775
8244
|
key = result.test_id ?? result.eval_id ?? "unknown";
|
|
@@ -8482,7 +8951,9 @@ var app = subcommands({
|
|
|
8482
8951
|
pipeline: pipelineCommand,
|
|
8483
8952
|
results: resultsCommand,
|
|
8484
8953
|
self: selfCommand,
|
|
8954
|
+
studio: resultsServeCommand,
|
|
8485
8955
|
serve: resultsServeCommand,
|
|
8956
|
+
// hidden alias for backward compatibility
|
|
8486
8957
|
trace: traceCommand,
|
|
8487
8958
|
transpile: transpileCommand,
|
|
8488
8959
|
trim: trimCommand,
|
|
@@ -8501,6 +8972,7 @@ var TOP_LEVEL_COMMANDS = /* @__PURE__ */ new Set([
|
|
|
8501
8972
|
"results",
|
|
8502
8973
|
"self",
|
|
8503
8974
|
"serve",
|
|
8975
|
+
"studio",
|
|
8504
8976
|
"trace",
|
|
8505
8977
|
"transpile",
|
|
8506
8978
|
"trim",
|
|
@@ -8547,4 +9019,4 @@ export {
|
|
|
8547
9019
|
preprocessArgv,
|
|
8548
9020
|
runCli
|
|
8549
9021
|
};
|
|
8550
|
-
//# sourceMappingURL=chunk-
|
|
9022
|
+
//# sourceMappingURL=chunk-E3VSJJI4.js.map
|