agentv 3.14.5 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-UBLKP2F4.js → chunk-E3VSJJI4.js} +568 -74
- package/dist/chunk-E3VSJJI4.js.map +1 -0
- package/dist/{chunk-GUXXTOYK.js → chunk-OT2J474N.js} +44 -18
- package/dist/chunk-OT2J474N.js.map +1 -0
- package/dist/{chunk-ELQEFMGO.js → chunk-OXBBWZOY.js} +592 -295
- package/dist/chunk-OXBBWZOY.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-5EEXTTC3.js → dist-3Z22B6SU.js} +18 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-WUIEXGWM.js → interactive-D5UTP72M.js} +4 -11
- package/dist/interactive-D5UTP72M.js.map +1 -0
- package/dist/studio/assets/index-BuKVkxFj.css +1 -0
- package/dist/studio/assets/index-CE3-mmv0.js +11 -0
- package/dist/studio/assets/index-DBU720Fm.js +71 -0
- package/dist/studio/index.html +13 -0
- package/dist/templates/.env.example +0 -3
- package/package.json +1 -1
- package/dist/chunk-ELQEFMGO.js.map +0 -1
- package/dist/chunk-GUXXTOYK.js.map +0 -1
- package/dist/chunk-UBLKP2F4.js.map +0 -1
- package/dist/interactive-WUIEXGWM.js.map +0 -1
- /package/dist/{dist-5EEXTTC3.js.map → dist-3Z22B6SU.js.map} +0 -0
|
@@ -10,6 +10,7 @@ import {
|
|
|
10
10
|
loadManifestResults,
|
|
11
11
|
loadRunCache,
|
|
12
12
|
package_default,
|
|
13
|
+
parseResultManifest,
|
|
13
14
|
resolveEvalPaths,
|
|
14
15
|
resolveExistingRunPrimaryPath,
|
|
15
16
|
resolveResultSourcePath,
|
|
@@ -23,9 +24,11 @@ import {
|
|
|
23
24
|
validateFileReferences,
|
|
24
25
|
validateTargetsFile,
|
|
25
26
|
writeArtifactsFromResults
|
|
26
|
-
} from "./chunk-
|
|
27
|
+
} from "./chunk-OT2J474N.js";
|
|
27
28
|
import {
|
|
29
|
+
DEFAULT_CATEGORY,
|
|
28
30
|
createBuiltinRegistry,
|
|
31
|
+
deriveCategory,
|
|
29
32
|
executeScript,
|
|
30
33
|
getAgentvHome,
|
|
31
34
|
getOutputFilenames,
|
|
@@ -40,7 +43,7 @@ import {
|
|
|
40
43
|
toSnakeCaseDeep as toSnakeCaseDeep2,
|
|
41
44
|
transpileEvalYamlFile,
|
|
42
45
|
trimBaselineResult
|
|
43
|
-
} from "./chunk-
|
|
46
|
+
} from "./chunk-OXBBWZOY.js";
|
|
44
47
|
import {
|
|
45
48
|
__commonJS,
|
|
46
49
|
__esm,
|
|
@@ -3479,9 +3482,23 @@ var ASSERTION_TEMPLATES = {
|
|
|
3479
3482
|
default: `#!/usr/bin/env bun
|
|
3480
3483
|
import { defineAssertion } from '@agentv/eval';
|
|
3481
3484
|
|
|
3482
|
-
|
|
3485
|
+
/** Extract text from the last message with the given role. */
|
|
3486
|
+
function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string {
|
|
3487
|
+
for (let i = messages.length - 1; i >= 0; i--) {
|
|
3488
|
+
const msg = messages[i];
|
|
3489
|
+
if (msg.role !== role) continue;
|
|
3490
|
+
if (typeof msg.content === 'string') return msg.content;
|
|
3491
|
+
if (Array.isArray(msg.content)) {
|
|
3492
|
+
return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n');
|
|
3493
|
+
}
|
|
3494
|
+
}
|
|
3495
|
+
return '';
|
|
3496
|
+
}
|
|
3497
|
+
|
|
3498
|
+
export default defineAssertion(({ output }) => {
|
|
3483
3499
|
// TODO: Implement your assertion logic
|
|
3484
|
-
const
|
|
3500
|
+
const text = getMessageText(output ?? []);
|
|
3501
|
+
const pass = text.length > 0;
|
|
3485
3502
|
return {
|
|
3486
3503
|
pass,
|
|
3487
3504
|
reasoning: pass ? 'Output has content' : 'Output is empty',
|
|
@@ -3491,9 +3508,23 @@ export default defineAssertion(({ outputText }) => {
|
|
|
3491
3508
|
score: `#!/usr/bin/env bun
|
|
3492
3509
|
import { defineAssertion } from '@agentv/eval';
|
|
3493
3510
|
|
|
3494
|
-
|
|
3511
|
+
/** Extract text from the last message with the given role. */
|
|
3512
|
+
function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string {
|
|
3513
|
+
for (let i = messages.length - 1; i >= 0; i--) {
|
|
3514
|
+
const msg = messages[i];
|
|
3515
|
+
if (msg.role !== role) continue;
|
|
3516
|
+
if (typeof msg.content === 'string') return msg.content;
|
|
3517
|
+
if (Array.isArray(msg.content)) {
|
|
3518
|
+
return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n');
|
|
3519
|
+
}
|
|
3520
|
+
}
|
|
3521
|
+
return '';
|
|
3522
|
+
}
|
|
3523
|
+
|
|
3524
|
+
export default defineAssertion(({ output }) => {
|
|
3495
3525
|
// TODO: Implement your scoring logic (0.0 to 1.0)
|
|
3496
|
-
const
|
|
3526
|
+
const text = getMessageText(output ?? []);
|
|
3527
|
+
const score = text.length > 0 ? 1.0 : 0.0;
|
|
3497
3528
|
return {
|
|
3498
3529
|
pass: score >= 0.5,
|
|
3499
3530
|
score,
|
|
@@ -4186,7 +4217,7 @@ var evalRunCommand = command({
|
|
|
4186
4217
|
},
|
|
4187
4218
|
handler: async (args) => {
|
|
4188
4219
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4189
|
-
const { launchInteractiveWizard } = await import("./interactive-
|
|
4220
|
+
const { launchInteractiveWizard } = await import("./interactive-D5UTP72M.js");
|
|
4190
4221
|
await launchInteractiveWizard();
|
|
4191
4222
|
return;
|
|
4192
4223
|
}
|
|
@@ -4421,6 +4452,9 @@ var evalBenchCommand = command({
|
|
|
4421
4452
|
const manifest = JSON.parse(await readFile(join(exportDir, "manifest.json"), "utf8"));
|
|
4422
4453
|
const testIds = manifest.test_ids;
|
|
4423
4454
|
const targetName = manifest.target?.name ?? "unknown";
|
|
4455
|
+
const evalSet = manifest.dataset ?? "";
|
|
4456
|
+
const experiment = manifest.experiment;
|
|
4457
|
+
const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
|
|
4424
4458
|
let stdinData;
|
|
4425
4459
|
if (llmScoresPath) {
|
|
4426
4460
|
stdinData = await readFile(llmScoresPath, "utf8");
|
|
@@ -4431,7 +4465,9 @@ var evalBenchCommand = command({
|
|
|
4431
4465
|
const indexLines = [];
|
|
4432
4466
|
const allPassRates = [];
|
|
4433
4467
|
for (const testId of testIds) {
|
|
4434
|
-
const
|
|
4468
|
+
const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
|
|
4469
|
+
const testDir = join(exportDir, ...subpath);
|
|
4470
|
+
const artifactSubdir = subpath.join("/");
|
|
4435
4471
|
const evaluators = [];
|
|
4436
4472
|
const allAssertions = [];
|
|
4437
4473
|
const codeResultsDir = join(testDir, "code_grader_results");
|
|
@@ -4527,13 +4563,15 @@ var evalBenchCommand = command({
|
|
|
4527
4563
|
JSON.stringify({
|
|
4528
4564
|
timestamp: manifest.timestamp,
|
|
4529
4565
|
test_id: testId,
|
|
4566
|
+
dataset: evalSet || void 0,
|
|
4567
|
+
experiment: experiment || void 0,
|
|
4530
4568
|
score: Math.round(weightedScore * 1e3) / 1e3,
|
|
4531
4569
|
target: targetName,
|
|
4532
4570
|
scores,
|
|
4533
4571
|
execution_status: executionStatus,
|
|
4534
|
-
grading_path: `${
|
|
4535
|
-
timing_path: `${
|
|
4536
|
-
response_path: hasResponse ? `${
|
|
4572
|
+
grading_path: `${artifactSubdir}/grading.json`,
|
|
4573
|
+
timing_path: `${artifactSubdir}/timing.json`,
|
|
4574
|
+
response_path: hasResponse ? `${artifactSubdir}/response.md` : void 0
|
|
4537
4575
|
})
|
|
4538
4576
|
);
|
|
4539
4577
|
}
|
|
@@ -4548,6 +4586,7 @@ var evalBenchCommand = command({
|
|
|
4548
4586
|
metadata: {
|
|
4549
4587
|
eval_file: manifest.eval_file,
|
|
4550
4588
|
timestamp: manifest.timestamp,
|
|
4589
|
+
experiment: experiment || void 0,
|
|
4551
4590
|
targets: [targetName],
|
|
4552
4591
|
tests_run: testIds
|
|
4553
4592
|
},
|
|
@@ -4589,6 +4628,12 @@ function computeStats(values) {
|
|
|
4589
4628
|
// src/commands/pipeline/grade.ts
|
|
4590
4629
|
import { mkdir as mkdir2, readFile as readFile2, readdir as readdir2, writeFile as writeFile3 } from "node:fs/promises";
|
|
4591
4630
|
import { join as join2 } from "node:path";
|
|
4631
|
+
function extractInputText(input) {
|
|
4632
|
+
if (!input || input.length === 0) return "";
|
|
4633
|
+
if (input.length === 1) return input[0].content;
|
|
4634
|
+
return input.map((m) => `@[${m.role}]:
|
|
4635
|
+
${m.content}`).join("\n\n");
|
|
4636
|
+
}
|
|
4592
4637
|
var evalGradeCommand = command({
|
|
4593
4638
|
name: "grade",
|
|
4594
4639
|
description: "Run code-grader assertions on responses in an export directory",
|
|
@@ -4603,10 +4648,13 @@ var evalGradeCommand = command({
|
|
|
4603
4648
|
const manifestPath = join2(exportDir, "manifest.json");
|
|
4604
4649
|
const manifest = JSON.parse(await readFile2(manifestPath, "utf8"));
|
|
4605
4650
|
const testIds = manifest.test_ids;
|
|
4651
|
+
const evalSet = manifest.dataset ?? "";
|
|
4652
|
+
const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
|
|
4606
4653
|
let totalGraders = 0;
|
|
4607
4654
|
let totalPassed = 0;
|
|
4608
4655
|
for (const testId of testIds) {
|
|
4609
|
-
const
|
|
4656
|
+
const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
|
|
4657
|
+
const testDir = join2(exportDir, ...subpath);
|
|
4610
4658
|
const codeGradersDir = join2(testDir, "code_graders");
|
|
4611
4659
|
const resultsDir = join2(testDir, "code_grader_results");
|
|
4612
4660
|
let graderFiles;
|
|
@@ -4622,14 +4670,13 @@ var evalGradeCommand = command({
|
|
|
4622
4670
|
for (const graderFile of graderFiles) {
|
|
4623
4671
|
const graderConfig = JSON.parse(await readFile2(join2(codeGradersDir, graderFile), "utf8"));
|
|
4624
4672
|
const graderName = graderConfig.name;
|
|
4673
|
+
const inputText = extractInputText(inputData.input);
|
|
4625
4674
|
const payload = JSON.stringify({
|
|
4626
4675
|
output: [{ role: "assistant", content: responseText }],
|
|
4627
|
-
input: inputData.
|
|
4628
|
-
question: inputData.input_text,
|
|
4676
|
+
input: inputData.input,
|
|
4629
4677
|
criteria: "",
|
|
4630
4678
|
expected_output: [],
|
|
4631
|
-
|
|
4632
|
-
input_files: [],
|
|
4679
|
+
input_files: inputData.input_files ?? [],
|
|
4633
4680
|
trace: null,
|
|
4634
4681
|
token_usage: null,
|
|
4635
4682
|
cost_usd: null,
|
|
@@ -4639,8 +4686,8 @@ var evalGradeCommand = command({
|
|
|
4639
4686
|
file_changes: null,
|
|
4640
4687
|
workspace_path: null,
|
|
4641
4688
|
config: graderConfig.config ?? null,
|
|
4642
|
-
metadata: {},
|
|
4643
|
-
input_text:
|
|
4689
|
+
metadata: inputData.metadata ?? {},
|
|
4690
|
+
input_text: inputText,
|
|
4644
4691
|
output_text: responseText,
|
|
4645
4692
|
expected_output_text: ""
|
|
4646
4693
|
});
|
|
@@ -4698,10 +4745,10 @@ var evalGradeCommand = command({
|
|
|
4698
4745
|
// src/commands/pipeline/input.ts
|
|
4699
4746
|
import { readFile as readFile3 } from "node:fs/promises";
|
|
4700
4747
|
import { mkdir as mkdir3, writeFile as writeFile4 } from "node:fs/promises";
|
|
4701
|
-
import { dirname, join as join3, resolve } from "node:path";
|
|
4748
|
+
import { dirname, join as join3, relative, resolve } from "node:path";
|
|
4702
4749
|
var evalInputCommand = command({
|
|
4703
4750
|
name: "input",
|
|
4704
|
-
description: "Extract eval inputs, target commands, and grader prompts for
|
|
4751
|
+
description: "Extract eval inputs, target commands, and grader prompts for subagent-mode runs",
|
|
4705
4752
|
args: {
|
|
4706
4753
|
evalPath: positional({
|
|
4707
4754
|
type: string,
|
|
@@ -4711,15 +4758,21 @@ var evalInputCommand = command({
|
|
|
4711
4758
|
out: option({
|
|
4712
4759
|
type: optional(string),
|
|
4713
4760
|
long: "out",
|
|
4714
|
-
description: "Output directory for extracted inputs (default: .agentv/results/runs
|
|
4761
|
+
description: "Output directory for extracted inputs (default: .agentv/results/runs/<timestamp>)"
|
|
4762
|
+
}),
|
|
4763
|
+
experiment: option({
|
|
4764
|
+
type: optional(string),
|
|
4765
|
+
long: "experiment",
|
|
4766
|
+
description: "Experiment label (e.g. with_skills, without_skills)"
|
|
4715
4767
|
})
|
|
4716
4768
|
},
|
|
4717
|
-
handler: async ({ evalPath, out }) => {
|
|
4769
|
+
handler: async ({ evalPath, out, experiment }) => {
|
|
4718
4770
|
const resolvedEvalPath = resolve(evalPath);
|
|
4719
4771
|
const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
|
|
4720
4772
|
const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
|
|
4721
4773
|
const evalDir = dirname(resolvedEvalPath);
|
|
4722
|
-
const
|
|
4774
|
+
const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
|
|
4775
|
+
const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
|
|
4723
4776
|
const tests = suite.tests;
|
|
4724
4777
|
if (tests.length === 0) {
|
|
4725
4778
|
console.error("No tests found in eval file.");
|
|
@@ -4728,6 +4781,7 @@ var evalInputCommand = command({
|
|
|
4728
4781
|
let targetInfo = null;
|
|
4729
4782
|
let targetName = "agent";
|
|
4730
4783
|
let targetKind = "agent";
|
|
4784
|
+
let subagentModeAllowed = true;
|
|
4731
4785
|
try {
|
|
4732
4786
|
const selection = await selectTarget({
|
|
4733
4787
|
testFilePath: resolvedEvalPath,
|
|
@@ -4740,32 +4794,38 @@ var evalInputCommand = command({
|
|
|
4740
4794
|
env: process.env
|
|
4741
4795
|
});
|
|
4742
4796
|
targetName = selection.targetName;
|
|
4743
|
-
|
|
4797
|
+
const resolved = selection.resolvedTarget;
|
|
4798
|
+
subagentModeAllowed = resolved.subagentModeAllowed !== false;
|
|
4799
|
+
if (resolved.kind === "cli") {
|
|
4744
4800
|
targetKind = "cli";
|
|
4745
|
-
|
|
4801
|
+
subagentModeAllowed = false;
|
|
4802
|
+
const config = resolved.config;
|
|
4746
4803
|
targetInfo = {
|
|
4747
4804
|
kind: "cli",
|
|
4748
4805
|
command: config.command,
|
|
4749
4806
|
cwd: config.cwd ?? evalDir,
|
|
4750
4807
|
timeoutMs: config.timeoutMs ?? 3e4
|
|
4751
4808
|
};
|
|
4809
|
+
} else {
|
|
4810
|
+
targetKind = resolved.kind;
|
|
4752
4811
|
}
|
|
4753
4812
|
} catch {
|
|
4754
4813
|
}
|
|
4814
|
+
const evalSetName = suite.metadata?.name?.trim() ?? "";
|
|
4815
|
+
const safeEvalSet = evalSetName ? evalSetName.replace(/[\/\\:*?"<>|]/g, "_") : "";
|
|
4755
4816
|
const testIds = [];
|
|
4756
4817
|
for (const test of tests) {
|
|
4757
|
-
const
|
|
4818
|
+
const subpath = safeEvalSet ? [safeEvalSet, test.id] : [test.id];
|
|
4819
|
+
const testDir = join3(outDir, ...subpath);
|
|
4758
4820
|
await mkdir3(testDir, { recursive: true });
|
|
4759
4821
|
testIds.push(test.id);
|
|
4760
|
-
const inputText = test.question;
|
|
4761
4822
|
const inputMessages = test.input.map((m) => ({
|
|
4762
4823
|
role: m.role,
|
|
4763
4824
|
content: typeof m.content === "string" ? m.content : m.content
|
|
4764
4825
|
}));
|
|
4765
4826
|
await writeJson(join3(testDir, "input.json"), {
|
|
4766
|
-
|
|
4767
|
-
|
|
4768
|
-
file_paths: test.file_paths,
|
|
4827
|
+
input: inputMessages,
|
|
4828
|
+
input_files: test.file_paths,
|
|
4769
4829
|
metadata: test.metadata ?? {}
|
|
4770
4830
|
});
|
|
4771
4831
|
if (targetInfo) {
|
|
@@ -4793,10 +4853,13 @@ var evalInputCommand = command({
|
|
|
4793
4853
|
}
|
|
4794
4854
|
await writeJson(join3(outDir, "manifest.json"), {
|
|
4795
4855
|
eval_file: resolvedEvalPath,
|
|
4856
|
+
dataset: evalSetName || void 0,
|
|
4857
|
+
experiment: experiment || void 0,
|
|
4796
4858
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
4797
4859
|
target: {
|
|
4798
4860
|
name: targetName,
|
|
4799
|
-
kind: targetKind
|
|
4861
|
+
kind: targetKind,
|
|
4862
|
+
subagent_mode_allowed: subagentModeAllowed
|
|
4800
4863
|
},
|
|
4801
4864
|
test_ids: testIds
|
|
4802
4865
|
});
|
|
@@ -4858,7 +4921,13 @@ import { execSync } from "node:child_process";
|
|
|
4858
4921
|
import { existsSync as existsSync3, readFileSync as readFileSync4, unlinkSync } from "node:fs";
|
|
4859
4922
|
import { mkdir as mkdir4, readFile as readFile4, readdir as readdir3, writeFile as writeFile5 } from "node:fs/promises";
|
|
4860
4923
|
import { tmpdir } from "node:os";
|
|
4861
|
-
import { dirname as dirname2, join as join4, resolve as resolve2 } from "node:path";
|
|
4924
|
+
import { dirname as dirname2, join as join4, relative as relative2, resolve as resolve2 } from "node:path";
|
|
4925
|
+
function extractInputText2(input) {
|
|
4926
|
+
if (!input || input.length === 0) return "";
|
|
4927
|
+
if (input.length === 1) return input[0].content;
|
|
4928
|
+
return input.map((m) => `@[${m.role}]:
|
|
4929
|
+
${m.content}`).join("\n\n");
|
|
4930
|
+
}
|
|
4862
4931
|
function loadEnvFile(dir) {
|
|
4863
4932
|
let current = resolve2(dir);
|
|
4864
4933
|
while (true) {
|
|
@@ -4892,20 +4961,26 @@ var evalRunCommand2 = command({
|
|
|
4892
4961
|
out: option({
|
|
4893
4962
|
type: optional(string),
|
|
4894
4963
|
long: "out",
|
|
4895
|
-
description: "Output directory for results (default: .agentv/results/runs
|
|
4964
|
+
description: "Output directory for results (default: .agentv/results/runs/<timestamp>)"
|
|
4896
4965
|
}),
|
|
4897
4966
|
workers: option({
|
|
4898
4967
|
type: optional(number),
|
|
4899
4968
|
long: "workers",
|
|
4900
4969
|
description: "Parallel workers for target invocation (default: all tests)"
|
|
4970
|
+
}),
|
|
4971
|
+
experiment: option({
|
|
4972
|
+
type: optional(string),
|
|
4973
|
+
long: "experiment",
|
|
4974
|
+
description: "Experiment label (e.g. with_skills, without_skills)"
|
|
4901
4975
|
})
|
|
4902
4976
|
},
|
|
4903
|
-
handler: async ({ evalPath, out, workers }) => {
|
|
4977
|
+
handler: async ({ evalPath, out, workers, experiment }) => {
|
|
4904
4978
|
const resolvedEvalPath = resolve2(evalPath);
|
|
4905
4979
|
const outDir = resolve2(out ?? buildDefaultRunDir(process.cwd()));
|
|
4906
4980
|
const repoRoot = await findRepoRoot(dirname2(resolvedEvalPath));
|
|
4907
4981
|
const evalDir = dirname2(resolvedEvalPath);
|
|
4908
|
-
const
|
|
4982
|
+
const category = deriveCategory(relative2(process.cwd(), resolvedEvalPath));
|
|
4983
|
+
const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
|
|
4909
4984
|
const tests = suite.tests;
|
|
4910
4985
|
if (tests.length === 0) {
|
|
4911
4986
|
console.error("No tests found in eval file.");
|
|
@@ -4938,20 +5013,21 @@ var evalRunCommand2 = command({
|
|
|
4938
5013
|
}
|
|
4939
5014
|
} catch {
|
|
4940
5015
|
}
|
|
5016
|
+
const evalSetName = suite.metadata?.name?.trim() ?? "";
|
|
5017
|
+
const safeEvalSet = evalSetName ? evalSetName.replace(/[\/\\:*?"<>|]/g, "_") : "";
|
|
4941
5018
|
const testIds = [];
|
|
4942
5019
|
for (const test of tests) {
|
|
4943
|
-
const
|
|
5020
|
+
const subpath = safeEvalSet ? [safeEvalSet, test.id] : [test.id];
|
|
5021
|
+
const testDir = join4(outDir, ...subpath);
|
|
4944
5022
|
await mkdir4(testDir, { recursive: true });
|
|
4945
5023
|
testIds.push(test.id);
|
|
4946
|
-
const inputText = test.question;
|
|
4947
5024
|
const inputMessages = test.input.map((m) => ({
|
|
4948
5025
|
role: m.role,
|
|
4949
5026
|
content: typeof m.content === "string" ? m.content : m.content
|
|
4950
5027
|
}));
|
|
4951
5028
|
await writeJson2(join4(testDir, "input.json"), {
|
|
4952
|
-
|
|
4953
|
-
|
|
4954
|
-
file_paths: test.file_paths,
|
|
5029
|
+
input: inputMessages,
|
|
5030
|
+
input_files: test.file_paths,
|
|
4955
5031
|
metadata: test.metadata ?? {}
|
|
4956
5032
|
});
|
|
4957
5033
|
if (targetInfo) {
|
|
@@ -4979,6 +5055,8 @@ var evalRunCommand2 = command({
|
|
|
4979
5055
|
}
|
|
4980
5056
|
await writeJson2(join4(outDir, "manifest.json"), {
|
|
4981
5057
|
eval_file: resolvedEvalPath,
|
|
5058
|
+
dataset: evalSetName || void 0,
|
|
5059
|
+
experiment: experiment || void 0,
|
|
4982
5060
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
4983
5061
|
target: { name: targetName, kind: targetKind },
|
|
4984
5062
|
test_ids: testIds
|
|
@@ -4993,7 +5071,8 @@ var evalRunCommand2 = command({
|
|
|
4993
5071
|
const maxWorkers = workers ?? testIds.length;
|
|
4994
5072
|
console.log(`Invoking ${testIds.length} CLI target(s) (${maxWorkers} workers)...`);
|
|
4995
5073
|
const invokeTarget = async (testId) => {
|
|
4996
|
-
const
|
|
5074
|
+
const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
|
|
5075
|
+
const testDir = join4(outDir, ...subpath);
|
|
4997
5076
|
const invoke = JSON.parse(await readFile4(join4(testDir, "invoke.json"), "utf8"));
|
|
4998
5077
|
if (invoke.kind !== "cli") return;
|
|
4999
5078
|
const inputData = JSON.parse(await readFile4(join4(testDir, "input.json"), "utf8"));
|
|
@@ -5002,11 +5081,12 @@ var evalRunCommand2 = command({
|
|
|
5002
5081
|
const timeoutMs = invoke.timeout_ms ?? 12e4;
|
|
5003
5082
|
const promptFile = join4(tmpdir(), `agentv-prompt-${testId}-${Date.now()}.txt`);
|
|
5004
5083
|
const outputFile = join4(tmpdir(), `agentv-output-${testId}-${Date.now()}.txt`);
|
|
5005
|
-
|
|
5084
|
+
const inputText = extractInputText2(inputData.input);
|
|
5085
|
+
await writeFile5(promptFile, inputText, "utf8");
|
|
5006
5086
|
let rendered = template;
|
|
5007
5087
|
rendered = rendered.replace("{PROMPT_FILE}", promptFile);
|
|
5008
5088
|
rendered = rendered.replace("{OUTPUT_FILE}", outputFile);
|
|
5009
|
-
rendered = rendered.replace("{PROMPT}",
|
|
5089
|
+
rendered = rendered.replace("{PROMPT}", inputText);
|
|
5010
5090
|
const start = performance.now();
|
|
5011
5091
|
try {
|
|
5012
5092
|
execSync(rendered, {
|
|
@@ -5061,12 +5141,13 @@ var evalRunCommand2 = command({
|
|
|
5061
5141
|
}
|
|
5062
5142
|
await Promise.all(pending);
|
|
5063
5143
|
} else {
|
|
5064
|
-
console.log("
|
|
5144
|
+
console.log("Subagent-as-target mode \u2014 skipping CLI invocation.");
|
|
5065
5145
|
}
|
|
5066
5146
|
let totalGraders = 0;
|
|
5067
5147
|
let totalPassed = 0;
|
|
5068
5148
|
for (const testId of testIds) {
|
|
5069
|
-
const
|
|
5149
|
+
const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
|
|
5150
|
+
const testDir = join4(outDir, ...subpath);
|
|
5070
5151
|
const codeGradersDir = join4(testDir, "code_graders");
|
|
5071
5152
|
const resultsDir = join4(testDir, "code_grader_results");
|
|
5072
5153
|
let graderFiles;
|
|
@@ -5082,14 +5163,13 @@ var evalRunCommand2 = command({
|
|
|
5082
5163
|
for (const graderFile of graderFiles) {
|
|
5083
5164
|
const graderConfig = JSON.parse(await readFile4(join4(codeGradersDir, graderFile), "utf8"));
|
|
5084
5165
|
const graderName = graderConfig.name;
|
|
5166
|
+
const inputText = extractInputText2(inputData.input);
|
|
5085
5167
|
const payload = JSON.stringify({
|
|
5086
5168
|
output: [{ role: "assistant", content: responseText }],
|
|
5087
|
-
input: inputData.
|
|
5088
|
-
question: inputData.input_text,
|
|
5169
|
+
input: inputData.input,
|
|
5089
5170
|
criteria: "",
|
|
5090
5171
|
expected_output: [],
|
|
5091
|
-
|
|
5092
|
-
input_files: [],
|
|
5172
|
+
input_files: inputData.input_files ?? [],
|
|
5093
5173
|
trace: null,
|
|
5094
5174
|
token_usage: null,
|
|
5095
5175
|
cost_usd: null,
|
|
@@ -5099,8 +5179,8 @@ var evalRunCommand2 = command({
|
|
|
5099
5179
|
file_changes: null,
|
|
5100
5180
|
workspace_path: null,
|
|
5101
5181
|
config: graderConfig.config ?? null,
|
|
5102
|
-
metadata: {},
|
|
5103
|
-
input_text:
|
|
5182
|
+
metadata: inputData.metadata ?? {},
|
|
5183
|
+
input_text: inputText,
|
|
5104
5184
|
output_text: responseText,
|
|
5105
5185
|
expected_output_text: ""
|
|
5106
5186
|
});
|
|
@@ -5288,7 +5368,7 @@ function toRawResult(result) {
|
|
|
5288
5368
|
return {
|
|
5289
5369
|
timestamp: result.timestamp,
|
|
5290
5370
|
test_id: result.testId,
|
|
5291
|
-
|
|
5371
|
+
dataset: result.dataset,
|
|
5292
5372
|
conversation_id: result.conversationId,
|
|
5293
5373
|
score: result.score,
|
|
5294
5374
|
assertions: result.assertions?.map((assertion) => ({
|
|
@@ -5411,7 +5491,7 @@ function loadOtlpTraceFile(filePath) {
|
|
|
5411
5491
|
}
|
|
5412
5492
|
return {
|
|
5413
5493
|
test_id: stringAttr(rootAttrs.agentv_test_id) ?? stringAttr(rootAttrs.agentv_eval_id) ?? `trace-${index + 1}`,
|
|
5414
|
-
|
|
5494
|
+
dataset: stringAttr(rootAttrs.agentv_dataset),
|
|
5415
5495
|
target: stringAttr(rootAttrs.agentv_target),
|
|
5416
5496
|
score,
|
|
5417
5497
|
error: root.status?.code === 2 ? root.status.message : void 0,
|
|
@@ -5684,9 +5764,11 @@ function patchTestIds(results) {
|
|
|
5684
5764
|
// src/commands/results/export.ts
|
|
5685
5765
|
function deriveOutputDir(cwd, sourceFile) {
|
|
5686
5766
|
const parentDir = path7.basename(path7.dirname(sourceFile));
|
|
5767
|
+
if (/^\d{4}-\d{2}-\d{2}T/.test(parentDir)) {
|
|
5768
|
+
return path7.join(cwd, ".agentv", "results", "export", parentDir);
|
|
5769
|
+
}
|
|
5687
5770
|
if (parentDir.startsWith("eval_")) {
|
|
5688
|
-
|
|
5689
|
-
return path7.join(cwd, ".agentv", "results", "export", dirName2);
|
|
5771
|
+
return path7.join(cwd, ".agentv", "results", "export", parentDir.slice(5));
|
|
5690
5772
|
}
|
|
5691
5773
|
const basename = path7.basename(sourceFile, ".jsonl");
|
|
5692
5774
|
const dirName = basename.startsWith("eval_") ? basename.slice(5) : basename;
|
|
@@ -5939,10 +6021,12 @@ function checkDirectoryNaming(runDir) {
|
|
|
5939
6021
|
message: `Directory is not under a 'runs/' parent (found '${parentName}/'). Expected: .agentv/results/runs/<run-dir>`
|
|
5940
6022
|
});
|
|
5941
6023
|
}
|
|
5942
|
-
|
|
6024
|
+
const isNewFormat = /^\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z$/.test(dirName);
|
|
6025
|
+
const isLegacyFormat = /^eval_\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z$/.test(dirName);
|
|
6026
|
+
if (!isNewFormat && !isLegacyFormat) {
|
|
5943
6027
|
diagnostics.push({
|
|
5944
6028
|
severity: "warning",
|
|
5945
|
-
message: `Directory name '${dirName}' does not match the expected pattern '
|
|
6029
|
+
message: `Directory name '${dirName}' does not match the expected pattern '<ISO-timestamp>'. Example: 2026-03-27T12-42-24-429Z`
|
|
5946
6030
|
});
|
|
5947
6031
|
}
|
|
5948
6032
|
return diagnostics;
|
|
@@ -6151,8 +6235,9 @@ var resultsCommand = subcommands({
|
|
|
6151
6235
|
});
|
|
6152
6236
|
|
|
6153
6237
|
// src/commands/results/serve.ts
|
|
6154
|
-
import { existsSync as existsSync7, readFileSync as readFileSync8, writeFileSync as writeFileSync3 } from "node:fs";
|
|
6238
|
+
import { existsSync as existsSync7, readFileSync as readFileSync8, readdirSync as readdirSync3, statSync as statSync4, writeFileSync as writeFileSync3 } from "node:fs";
|
|
6155
6239
|
import path9 from "node:path";
|
|
6240
|
+
import { fileURLToPath as fileURLToPath2 } from "node:url";
|
|
6156
6241
|
import { Hono } from "hono";
|
|
6157
6242
|
function feedbackPath(resultDir) {
|
|
6158
6243
|
return path9.join(resultDir, "feedback.json");
|
|
@@ -6173,24 +6258,45 @@ function writeFeedback(cwd, data) {
|
|
|
6173
6258
|
writeFileSync3(feedbackPath(cwd), `${JSON.stringify(data, null, 2)}
|
|
6174
6259
|
`, "utf8");
|
|
6175
6260
|
}
|
|
6176
|
-
function createApp(results, resultDir, cwd, sourceFile) {
|
|
6261
|
+
function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
6177
6262
|
const searchDir = cwd ?? resultDir;
|
|
6178
6263
|
const app2 = new Hono();
|
|
6264
|
+
const studioDistPath = options?.studioDir === false ? void 0 : options?.studioDir ?? resolveStudioDistDir();
|
|
6179
6265
|
app2.get("/", (c3) => {
|
|
6266
|
+
if (studioDistPath) {
|
|
6267
|
+
const indexPath = path9.join(studioDistPath, "index.html");
|
|
6268
|
+
if (existsSync7(indexPath)) {
|
|
6269
|
+
return c3.html(readFileSync8(indexPath, "utf8"));
|
|
6270
|
+
}
|
|
6271
|
+
}
|
|
6180
6272
|
return c3.html(generateServeHtml(results, sourceFile));
|
|
6181
6273
|
});
|
|
6182
6274
|
app2.get("/api/runs", (c3) => {
|
|
6183
6275
|
const metas = listResultFiles(searchDir);
|
|
6184
6276
|
return c3.json({
|
|
6185
|
-
runs: metas.map((m) =>
|
|
6186
|
-
|
|
6187
|
-
|
|
6188
|
-
|
|
6189
|
-
|
|
6190
|
-
|
|
6191
|
-
|
|
6192
|
-
|
|
6193
|
-
|
|
6277
|
+
runs: metas.map((m) => {
|
|
6278
|
+
let target;
|
|
6279
|
+
let experiment;
|
|
6280
|
+
try {
|
|
6281
|
+
const records = loadLightweightResults(m.path);
|
|
6282
|
+
if (records.length > 0) {
|
|
6283
|
+
target = records[0].target;
|
|
6284
|
+
experiment = records[0].experiment;
|
|
6285
|
+
}
|
|
6286
|
+
} catch {
|
|
6287
|
+
}
|
|
6288
|
+
return {
|
|
6289
|
+
filename: m.filename,
|
|
6290
|
+
path: m.path,
|
|
6291
|
+
timestamp: m.timestamp,
|
|
6292
|
+
test_count: m.testCount,
|
|
6293
|
+
pass_rate: m.passRate,
|
|
6294
|
+
avg_score: m.avgScore,
|
|
6295
|
+
size_bytes: m.sizeBytes,
|
|
6296
|
+
...target && { target },
|
|
6297
|
+
...experiment && { experiment }
|
|
6298
|
+
};
|
|
6299
|
+
})
|
|
6194
6300
|
});
|
|
6195
6301
|
});
|
|
6196
6302
|
app2.get("/api/runs/:filename", (c3) => {
|
|
@@ -6250,8 +6356,393 @@ function createApp(results, resultDir, cwd, sourceFile) {
|
|
|
6250
6356
|
writeFeedback(resultDir, existing);
|
|
6251
6357
|
return c3.json(existing);
|
|
6252
6358
|
});
|
|
6359
|
+
app2.get("/api/runs/:filename/datasets", (c3) => {
|
|
6360
|
+
const filename = c3.req.param("filename");
|
|
6361
|
+
const metas = listResultFiles(searchDir);
|
|
6362
|
+
const meta = metas.find((m) => m.filename === filename);
|
|
6363
|
+
if (!meta) {
|
|
6364
|
+
return c3.json({ error: "Run not found" }, 404);
|
|
6365
|
+
}
|
|
6366
|
+
try {
|
|
6367
|
+
const loaded = patchTestIds(loadManifestResults(meta.path));
|
|
6368
|
+
const datasetMap = /* @__PURE__ */ new Map();
|
|
6369
|
+
for (const r of loaded) {
|
|
6370
|
+
const ds = r.dataset ?? r.target ?? "default";
|
|
6371
|
+
const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
|
|
6372
|
+
entry.total++;
|
|
6373
|
+
if (r.score >= 1) entry.passed++;
|
|
6374
|
+
entry.scoreSum += r.score;
|
|
6375
|
+
datasetMap.set(ds, entry);
|
|
6376
|
+
}
|
|
6377
|
+
const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
|
|
6378
|
+
name,
|
|
6379
|
+
total: entry.total,
|
|
6380
|
+
passed: entry.passed,
|
|
6381
|
+
failed: entry.total - entry.passed,
|
|
6382
|
+
avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0
|
|
6383
|
+
}));
|
|
6384
|
+
return c3.json({ datasets });
|
|
6385
|
+
} catch {
|
|
6386
|
+
return c3.json({ error: "Failed to load datasets" }, 500);
|
|
6387
|
+
}
|
|
6388
|
+
});
|
|
6389
|
+
app2.get("/api/runs/:filename/categories", (c3) => {
|
|
6390
|
+
const filename = c3.req.param("filename");
|
|
6391
|
+
const metas = listResultFiles(searchDir);
|
|
6392
|
+
const meta = metas.find((m) => m.filename === filename);
|
|
6393
|
+
if (!meta) {
|
|
6394
|
+
return c3.json({ error: "Run not found" }, 404);
|
|
6395
|
+
}
|
|
6396
|
+
try {
|
|
6397
|
+
const loaded = patchTestIds(loadManifestResults(meta.path));
|
|
6398
|
+
const categoryMap = /* @__PURE__ */ new Map();
|
|
6399
|
+
for (const r of loaded) {
|
|
6400
|
+
const cat = r.category ?? DEFAULT_CATEGORY;
|
|
6401
|
+
const entry = categoryMap.get(cat) ?? {
|
|
6402
|
+
total: 0,
|
|
6403
|
+
passed: 0,
|
|
6404
|
+
scoreSum: 0,
|
|
6405
|
+
datasets: /* @__PURE__ */ new Set()
|
|
6406
|
+
};
|
|
6407
|
+
entry.total++;
|
|
6408
|
+
if (r.score >= 1) entry.passed++;
|
|
6409
|
+
entry.scoreSum += r.score;
|
|
6410
|
+
entry.datasets.add(r.dataset ?? r.target ?? "default");
|
|
6411
|
+
categoryMap.set(cat, entry);
|
|
6412
|
+
}
|
|
6413
|
+
const categories = [...categoryMap.entries()].map(([name, entry]) => ({
|
|
6414
|
+
name,
|
|
6415
|
+
total: entry.total,
|
|
6416
|
+
passed: entry.passed,
|
|
6417
|
+
failed: entry.total - entry.passed,
|
|
6418
|
+
avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0,
|
|
6419
|
+
dataset_count: entry.datasets.size
|
|
6420
|
+
}));
|
|
6421
|
+
return c3.json({ categories });
|
|
6422
|
+
} catch {
|
|
6423
|
+
return c3.json({ error: "Failed to load categories" }, 500);
|
|
6424
|
+
}
|
|
6425
|
+
});
|
|
6426
|
+
// GET /api/runs/:filename/categories/:category/datasets
// Lists the datasets that appear within one category of a run, with per-dataset
// totals, pass counts (score >= 1), failure counts and mean score.
// Responds 404 when the run is not listed, 500 when loading/aggregation throws.
app2.get("/api/runs/:filename/categories/:category/datasets", (c3) => {
  const filename = c3.req.param("filename");
  const rawCategory = c3.req.param("category");
  // The param is decoded here as before, but a value containing a bare "%"
  // (e.g. "100%", which is what a router that has already decoded the segment
  // would hand over for "100%25") makes decodeURIComponent throw URIError.
  // That used to escape the handler entirely; now the value is used as-is.
  let category = rawCategory;
  try {
    category = decodeURIComponent(rawCategory);
  } catch {
    category = rawCategory;
  }
  const metas = listResultFiles(searchDir);
  const meta = metas.find((m) => m.filename === filename);
  if (!meta) {
    return c3.json({ error: "Run not found" }, 404);
  }
  try {
    const loaded = patchTestIds(loadManifestResults(meta.path));
    // Results without a category are counted under DEFAULT_CATEGORY.
    const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category);
    const datasetMap = /* @__PURE__ */ new Map();
    for (const r of filtered) {
      // The dataset label falls back to the target, then to a fixed "default".
      const ds = r.dataset ?? r.target ?? "default";
      const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
      entry.total++;
      if (r.score >= 1) entry.passed++;
      entry.scoreSum += r.score;
      datasetMap.set(ds, entry);
    }
    const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
      name,
      total: entry.total,
      passed: entry.passed,
      failed: entry.total - entry.passed,
      avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0
    }));
    return c3.json({ datasets });
  } catch {
    return c3.json({ error: "Failed to load datasets" }, 500);
  }
});
|
|
6458
|
+
// GET /api/runs/:filename/evals/:evalId
// Returns the single result of a run whose testId equals :evalId.
// Responds 404 when the run or the eval is missing, 500 when loading throws.
app2.get("/api/runs/:filename/evals/:evalId", (c3) => {
  const requestedRun = c3.req.param("filename");
  const requestedEval = c3.req.param("evalId");
  const run = listResultFiles(searchDir).find((candidate) => candidate.filename === requestedRun);
  if (!run) {
    return c3.json({ error: "Run not found" }, 404);
  }
  try {
    const results = patchTestIds(loadManifestResults(run.path));
    const match = results.find((candidate) => candidate.testId === requestedEval);
    return match ? c3.json({ eval: match }) : c3.json({ error: "Eval not found" }, 404);
  } catch {
    return c3.json({ error: "Failed to load eval" }, 500);
  }
});
|
|
6477
|
+
// GET /api/index
// Returns one summary entry per listed result file: the metadata already held
// by the listing plus the summed costUsd of the run's results.
app2.get("/api/index", (c3) => {
  // Best effort: a run whose results cannot be loaded reports a cost of 0.
  const costOfRun = (runPath) => {
    try {
      const results = patchTestIds(loadManifestResults(runPath));
      return results.reduce((sum, r) => sum + (r.costUsd ?? 0), 0);
    } catch {
      return 0;
    }
  };
  const entries2 = listResultFiles(searchDir).map((run) => {
    const totalCostUsd = costOfRun(run.path);
    return {
      run_filename: run.filename,
      test_count: run.testCount,
      pass_rate: run.passRate,
      avg_score: run.avgScore,
      total_cost_usd: totalCostUsd,
      timestamp: run.timestamp
    };
  });
  return c3.json({ entries: entries2 });
});
|
|
6497
|
+
/**
 * Recursively lists a directory as a tree of plain nodes.
 *
 * Directories come first, then everything else; each group is ordered by
 * `localeCompare` on the entry name. Every node carries `name`, `path`
 * (relative to `relativeTo`) and `type` ("dir" or "file"); directory nodes
 * also carry `children`.
 *
 * @param dirPath    Directory to list.
 * @param relativeTo Base directory that node paths are made relative to.
 * @returns The node list, or an empty array when `dirPath` does not exist or
 *          is not a directory.
 */
function buildFileTree(dirPath, relativeTo) {
  const isListable = existsSync7(dirPath) && statSync4(dirPath).isDirectory();
  if (!isListable) {
    return [];
  }
  const dirents = readdirSync3(dirPath, { withFileTypes: true });
  const byName = (left, right) => left.name.localeCompare(right.name);
  const folders = dirents.filter((dirent) => dirent.isDirectory()).sort(byName);
  const others = dirents.filter((dirent) => !dirent.isDirectory()).sort(byName);
  const nodes = [];
  for (const dirent of [...folders, ...others]) {
    const absolute = path9.join(dirPath, dirent.name);
    const relative = path9.relative(relativeTo, absolute);
    if (dirent.isDirectory()) {
      nodes.push({
        name: dirent.name,
        path: relative,
        type: "dir",
        children: buildFileTree(absolute, relativeTo)
      });
    } else {
      nodes.push({ name: dirent.name, path: relative, type: "file" });
    }
  }
  return nodes;
}
|
|
6519
|
+
/**
 * Maps a file path to a language identifier based on its extension.
 *
 * The extension is compared case-insensitively; anything not listed below
 * (including paths with no extension) is reported as "plaintext".
 *
 * @param filePath Path or file name to classify.
 * @returns The language identifier string.
 */
function inferLanguage(filePath) {
  switch (path9.extname(filePath).toLowerCase()) {
    case ".json":
    case ".jsonl":
      return "json";
    case ".ts":
    case ".tsx":
      return "typescript";
    case ".js":
    case ".jsx":
      return "javascript";
    case ".md":
      return "markdown";
    case ".yaml":
    case ".yml":
      return "yaml";
    case ".log":
    case ".txt":
      return "plaintext";
    case ".py":
      return "python";
    case ".sh":
    case ".bash":
      return "shell";
    case ".css":
      return "css";
    case ".html":
      return "html";
    case ".xml":
    case ".svg":
      return "xml";
    case ".toml":
      return "toml";
    case ".diff":
    case ".patch":
      return "diff";
    default:
      return "plaintext";
  }
}
|
|
6546
|
+
// GET /api/runs/:filename/evals/:evalId/files
// Returns the file tree of the directory that contains all artifact paths
// recorded for one eval in the run manifest (grading, timing, input, output,
// response). Paths in the tree are relative to the manifest's directory.
// Responds 404 when the run or the eval is missing, 500 when anything throws.
app2.get("/api/runs/:filename/evals/:evalId/files", (c3) => {
  const filename = c3.req.param("filename");
  const evalId = c3.req.param("evalId");
  const metas = listResultFiles(searchDir);
  const meta = metas.find((m) => m.filename === filename);
  if (!meta) {
    return c3.json({ error: "Run not found" }, 404);
  }
  try {
    const content = readFileSync8(meta.path, "utf8");
    const records = parseResultManifest(content);
    const record = records.find((r) => (r.test_id ?? r.eval_id) === evalId);
    if (!record) {
      return c3.json({ error: "Eval not found" }, 404);
    }
    const baseDir = path9.dirname(meta.path);
    const knownPaths = [
      record.grading_path,
      record.timing_path,
      record.input_path,
      record.output_path,
      record.response_path
    ].filter((p) => !!p);
    if (knownPaths.length === 0) {
      return c3.json({ files: [] });
    }
    // Normalise so that every directory uses the platform separator and has no
    // "./" or duplicate-separator noise before the directories are compared.
    const artifactDirs = knownPaths.map((p) => path9.normalize(path9.dirname(p)));
    // True when `dir` equals `ancestor` or lies beneath it. The comparison is
    // made on a separator boundary so "a/b" is not mistaken for a parent of
    // "a/bc"; "." is accepted as the parent of everything.
    const isSameOrInside = (ancestor, dir) => {
      if (ancestor === "." || dir === ancestor) return true;
      const prefix = ancestor.endsWith(path9.sep) ? ancestor : ancestor + path9.sep;
      return dir.startsWith(prefix);
    };
    let commonDir = artifactDirs[0];
    for (const dir of artifactDirs) {
      while (!isSameOrInside(commonDir, dir)) {
        const parent = path9.dirname(commonDir);
        // dirname() is a fixed point at "." and at a filesystem root; stop
        // there instead of looping forever when the directories share nothing.
        if (parent === commonDir) break;
        commonDir = parent;
      }
    }
    const artifactAbsDir = path9.join(baseDir, commonDir);
    const files = buildFileTree(artifactAbsDir, baseDir);
    return c3.json({ files });
  } catch {
    return c3.json({ error: "Failed to load file tree" }, 500);
  }
});
|
|
6586
|
+
// GET /api/runs/:filename/evals/:evalId/files/*
// Returns the UTF-8 content of one file below the run manifest's directory,
// together with the language inferred from its extension.
// Responds 404 (run or file missing), 400 (no sub-path), 403 (path leaves the
// manifest directory) or 500 (read failure).
app2.get("/api/runs/:filename/evals/:evalId/files/*", (c3) => {
  const filename = c3.req.param("filename");
  const metas = listResultFiles(searchDir);
  const meta = metas.find((m) => m.filename === filename);
  if (!meta) {
    return c3.json({ error: "Run not found" }, 404);
  }
  // Take everything after the seventh "/"-separated segment:
  //   ["", "api", "runs", <filename>, "evals", <evalId>, "files", ...rest]
  // Cutting by segment position instead of by the length of a prefix rebuilt
  // from the route params keeps the result correct when the params and the
  // request path differ in how their characters are escaped.
  const filePath = c3.req.path.split("/").slice(7).join("/");
  if (!filePath) {
    return c3.json({ error: "No file path specified" }, 400);
  }
  const baseDir = path9.resolve(path9.dirname(meta.path));
  const isInsideBase = (candidate) => candidate === baseDir || candidate.startsWith(baseDir + path9.sep);
  const requestedPath = path9.resolve(baseDir, filePath);
  if (!isInsideBase(requestedPath)) {
    return c3.json({ error: "Path traversal not allowed" }, 403);
  }
  // The path is tried exactly as received first. If the request path was not
  // percent-decoded upstream, the decoded form is tried as a second candidate
  // so that names with spaces or non-ASCII characters resolve; a decoded form
  // that would leave the base directory is never considered.
  const candidates = [requestedPath];
  try {
    const decodedPath = path9.resolve(baseDir, decodeURIComponent(filePath));
    if (decodedPath !== requestedPath && isInsideBase(decodedPath)) {
      candidates.push(decodedPath);
    }
  } catch {
    // Malformed escape sequence: only the literal path is tried.
  }
  try {
    const absolutePath = candidates.find((candidate) => existsSync7(candidate) && statSync4(candidate).isFile());
    if (!absolutePath) {
      return c3.json({ error: "File not found" }, 404);
    }
    const fileContent = readFileSync8(absolutePath, "utf8");
    const language = inferLanguage(absolutePath);
    return c3.json({ content: fileContent, language });
  } catch {
    return c3.json({ error: "Failed to read file" }, 500);
  }
});
|
|
6616
|
+
// GET /api/experiments
// Aggregates the lightweight records of every listed run by experiment name
// (records without one are counted under "default"). Per experiment it reports
// the number of distinct runs and targets, the record count, how many records
// scored >= 1, the resulting pass rate and the greatest timestamp seen.
// Runs whose records cannot be loaded are skipped.
app2.get("/api/experiments", (c3) => {
  const statsByExperiment = /* @__PURE__ */ new Map();
  for (const run of listResultFiles(searchDir)) {
    try {
      for (const record of loadLightweightResults(run.path)) {
        const experimentName = record.experiment ?? "default";
        let stats = statsByExperiment.get(experimentName);
        if (!stats) {
          stats = {
            targets: /* @__PURE__ */ new Set(),
            runFilenames: /* @__PURE__ */ new Set(),
            evalCount: 0,
            passedCount: 0,
            lastTimestamp: ""
          };
          statsByExperiment.set(experimentName, stats);
        }
        stats.runFilenames.add(run.filename);
        if (record.target) {
          stats.targets.add(record.target);
        }
        stats.evalCount += 1;
        if (record.score >= 1) {
          stats.passedCount += 1;
        }
        // Timestamps are compared with ">" as-is; the empty string is the floor.
        const isNewer = record.timestamp && record.timestamp > stats.lastTimestamp;
        if (isNewer) {
          stats.lastTimestamp = record.timestamp;
        }
      }
    } catch {
    }
  }
  const experiments = Array.from(statsByExperiment, ([name, stats]) => {
    const passRate = stats.evalCount > 0 ? stats.passedCount / stats.evalCount : 0;
    return {
      name,
      run_count: stats.runFilenames.size,
      target_count: stats.targets.size,
      eval_count: stats.evalCount,
      passed_count: stats.passedCount,
      pass_rate: passRate,
      last_run: stats.lastTimestamp || null
    };
  });
  return c3.json({ experiments });
});
|
|
6654
|
+
// GET /api/targets
// Aggregates the lightweight records of every listed run by target name
// (records without one are counted under "default"). Per target it reports the
// number of distinct runs and experiments, the record count, how many records
// scored >= 1 and the resulting pass rate.
// Runs whose records cannot be loaded are skipped.
app2.get("/api/targets", (c3) => {
  const statsByTarget = /* @__PURE__ */ new Map();
  for (const run of listResultFiles(searchDir)) {
    try {
      for (const record of loadLightweightResults(run.path)) {
        const targetName = record.target ?? "default";
        let stats = statsByTarget.get(targetName);
        if (!stats) {
          stats = {
            experiments: /* @__PURE__ */ new Set(),
            runFilenames: /* @__PURE__ */ new Set(),
            evalCount: 0,
            passedCount: 0
          };
          statsByTarget.set(targetName, stats);
        }
        stats.runFilenames.add(run.filename);
        if (record.experiment) {
          stats.experiments.add(record.experiment);
        }
        stats.evalCount += 1;
        if (record.score >= 1) {
          stats.passedCount += 1;
        }
      }
    } catch {
    }
  }
  const targets = Array.from(statsByTarget, ([name, stats]) => {
    const passRate = stats.evalCount > 0 ? stats.passedCount / stats.evalCount : 0;
    return {
      name,
      run_count: stats.runFilenames.size,
      experiment_count: stats.experiments.size,
      eval_count: stats.evalCount,
      passed_count: stats.passedCount,
      pass_rate: passRate
    };
  });
  return c3.json({ targets });
});
|
|
6687
|
+
// Static serving for the bundled Studio front end; registered only when a
// Studio dist directory was supplied.
if (studioDistPath) {
  const studioRoot = path9.resolve(studioDistPath);
  const assetsRoot = path9.join(studioRoot, "assets");
  // GET /assets/* — serves a file from <studio dist>/assets with a long-lived
  // immutable cache header.
  app2.get("/assets/*", (c3) => {
    const assetPath = c3.req.path;
    const filePath = path9.resolve(path9.join(studioRoot, assetPath));
    // The request path is caller-controlled: after resolving "." / ".."
    // segments the target must still lie inside the assets directory,
    // otherwise the request could read arbitrary files on disk.
    if (!filePath.startsWith(assetsRoot + path9.sep)) {
      return c3.notFound();
    }
    let content;
    try {
      // Directories satisfy existsSync but cannot be read as a file.
      if (!existsSync7(filePath) || !statSync4(filePath).isFile()) {
        return c3.notFound();
      }
      content = readFileSync8(filePath);
    } catch {
      return c3.notFound();
    }
    const ext = path9.extname(filePath);
    const mimeTypes = {
      ".js": "application/javascript",
      ".css": "text/css",
      ".html": "text/html",
      ".json": "application/json",
      ".svg": "image/svg+xml",
      ".png": "image/png",
      ".woff2": "font/woff2",
      ".woff": "font/woff"
    };
    const contentType = mimeTypes[ext] ?? "application/octet-stream";
    return new Response(content, {
      headers: {
        "Content-Type": contentType,
        "Cache-Control": "public, max-age=31536000, immutable"
      }
    });
  });
  // Catch-all: unknown /api/ paths get a JSON 404; every other path receives
  // index.html (when present) so the front end can handle the route itself.
  app2.get("*", (c3) => {
    if (c3.req.path.startsWith("/api/")) {
      return c3.json({ error: "Not found" }, 404);
    }
    const indexPath = path9.join(studioDistPath, "index.html");
    if (existsSync7(indexPath)) {
      return c3.html(readFileSync8(indexPath, "utf8"));
    }
    return c3.notFound();
  });
}
|
|
6253
6725
|
return app2;
|
|
6254
6726
|
}
|
|
6727
|
+
/**
 * Locates the Studio front-end build directory relative to this module.
 *
 * Candidate locations are probed in a fixed order and the first one that
 * exists and contains an `index.html` wins.
 *
 * @returns The matching directory path, or `undefined` when none qualifies.
 */
function resolveStudioDistDir() {
  // Use __dirname when it is defined; otherwise derive the directory from the
  // module URL.
  let currentDir;
  if (typeof __dirname !== "undefined") {
    currentDir = __dirname;
  } else {
    currentDir = path9.dirname(fileURLToPath2(import.meta.url));
  }
  const relativeCandidates = [
    // From src/commands/results/ → sibling apps/studio/dist
    "../../../../studio/dist",
    // From dist/ → sibling apps/studio/dist (monorepo dev)
    "../../studio/dist",
    // Bundled inside CLI dist (published package)
    "../studio",
    // From dist/ in monorepo root context
    "../../../apps/studio/dist"
  ];
  const hasIndexHtml = (dir) => existsSync7(dir) && existsSync7(path9.join(dir, "index.html"));
  return relativeCandidates.map((relative) => path9.resolve(currentDir, relative)).find(hasIndexHtml);
}
|
|
6255
6746
|
function stripHeavyFields(results) {
|
|
6256
6747
|
return results.map((r) => {
|
|
6257
6748
|
const { requests, trace, ...rest } = r;
|
|
@@ -6934,8 +7425,8 @@ var SERVE_SCRIPT = `
|
|
|
6934
7425
|
})();
|
|
6935
7426
|
`;
|
|
6936
7427
|
var resultsServeCommand = command({
|
|
6937
|
-
name: "
|
|
6938
|
-
description: "Start a local
|
|
7428
|
+
name: "studio",
|
|
7429
|
+
description: "Start AgentV Studio \u2014 a local dashboard for reviewing evaluation results",
|
|
6939
7430
|
args: {
|
|
6940
7431
|
source: positional({
|
|
6941
7432
|
type: optional(string),
|
|
@@ -7572,7 +8063,7 @@ function formatResultDetail(result, index, tree) {
|
|
|
7572
8063
|
}
|
|
7573
8064
|
const scoreColor = result.score >= 0.9 ? c2.green : result.score >= 0.5 ? c2.yellow : c2.red;
|
|
7574
8065
|
lines.push(
|
|
7575
|
-
`${c2.bold}${testId}${c2.reset} ${scoreColor}${formatScore(result.score)}${c2.reset}${result.target ? ` ${c2.dim}target: ${result.target}${c2.reset}` : ""}${result.
|
|
8066
|
+
`${c2.bold}${testId}${c2.reset} ${scoreColor}${formatScore(result.score)}${c2.reset}${result.target ? ` ${c2.dim}target: ${result.target}${c2.reset}` : ""}${result.dataset ? ` ${c2.dim}dataset: ${result.dataset}${c2.reset}` : ""}`
|
|
7576
8067
|
);
|
|
7577
8068
|
if (result.error) {
|
|
7578
8069
|
lines.push(` ${c2.red}Error: ${result.error}${c2.reset}`);
|
|
@@ -7746,8 +8237,8 @@ function groupResults(results, groupBy2) {
|
|
|
7746
8237
|
case "target":
|
|
7747
8238
|
key = result.target ?? "unknown";
|
|
7748
8239
|
break;
|
|
7749
|
-
case "
|
|
7750
|
-
key = result.
|
|
8240
|
+
case "dataset":
|
|
8241
|
+
key = result.dataset ?? "unknown";
|
|
7751
8242
|
break;
|
|
7752
8243
|
case "test-id":
|
|
7753
8244
|
key = result.test_id ?? result.eval_id ?? "unknown";
|
|
@@ -8460,7 +8951,9 @@ var app = subcommands({
|
|
|
8460
8951
|
pipeline: pipelineCommand,
|
|
8461
8952
|
results: resultsCommand,
|
|
8462
8953
|
self: selfCommand,
|
|
8954
|
+
studio: resultsServeCommand,
|
|
8463
8955
|
serve: resultsServeCommand,
|
|
8956
|
+
// hidden alias for backward compatibility
|
|
8464
8957
|
trace: traceCommand,
|
|
8465
8958
|
transpile: transpileCommand,
|
|
8466
8959
|
trim: trimCommand,
|
|
@@ -8479,6 +8972,7 @@ var TOP_LEVEL_COMMANDS = /* @__PURE__ */ new Set([
|
|
|
8479
8972
|
"results",
|
|
8480
8973
|
"self",
|
|
8481
8974
|
"serve",
|
|
8975
|
+
"studio",
|
|
8482
8976
|
"trace",
|
|
8483
8977
|
"transpile",
|
|
8484
8978
|
"trim",
|
|
@@ -8525,4 +9019,4 @@ export {
|
|
|
8525
9019
|
preprocessArgv,
|
|
8526
9020
|
runCli
|
|
8527
9021
|
};
|
|
8528
|
-
//# sourceMappingURL=chunk-
|
|
9022
|
+
//# sourceMappingURL=chunk-E3VSJJI4.js.map
|