agentv 3.14.5 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,7 @@ import {
10
10
  loadManifestResults,
11
11
  loadRunCache,
12
12
  package_default,
13
+ parseResultManifest,
13
14
  resolveEvalPaths,
14
15
  resolveExistingRunPrimaryPath,
15
16
  resolveResultSourcePath,
@@ -23,9 +24,11 @@ import {
23
24
  validateFileReferences,
24
25
  validateTargetsFile,
25
26
  writeArtifactsFromResults
26
- } from "./chunk-GUXXTOYK.js";
27
+ } from "./chunk-OT2J474N.js";
27
28
  import {
29
+ DEFAULT_CATEGORY,
28
30
  createBuiltinRegistry,
31
+ deriveCategory,
29
32
  executeScript,
30
33
  getAgentvHome,
31
34
  getOutputFilenames,
@@ -40,7 +43,7 @@ import {
40
43
  toSnakeCaseDeep as toSnakeCaseDeep2,
41
44
  transpileEvalYamlFile,
42
45
  trimBaselineResult
43
- } from "./chunk-ELQEFMGO.js";
46
+ } from "./chunk-OXBBWZOY.js";
44
47
  import {
45
48
  __commonJS,
46
49
  __esm,
@@ -3479,9 +3482,23 @@ var ASSERTION_TEMPLATES = {
3479
3482
  default: `#!/usr/bin/env bun
3480
3483
  import { defineAssertion } from '@agentv/eval';
3481
3484
 
3482
- export default defineAssertion(({ outputText }) => {
3485
+ /** Extract text from the last message with the given role. */
3486
+ function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string {
3487
+ for (let i = messages.length - 1; i >= 0; i--) {
3488
+ const msg = messages[i];
3489
+ if (msg.role !== role) continue;
3490
+ if (typeof msg.content === 'string') return msg.content;
3491
+ if (Array.isArray(msg.content)) {
3492
+ return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n');
3493
+ }
3494
+ }
3495
+ return '';
3496
+ }
3497
+
3498
+ export default defineAssertion(({ output }) => {
3483
3499
  // TODO: Implement your assertion logic
3484
- const pass = outputText.length > 0;
3500
+ const text = getMessageText(output ?? []);
3501
+ const pass = text.length > 0;
3485
3502
  return {
3486
3503
  pass,
3487
3504
  reasoning: pass ? 'Output has content' : 'Output is empty',
@@ -3491,9 +3508,23 @@ export default defineAssertion(({ outputText }) => {
3491
3508
  score: `#!/usr/bin/env bun
3492
3509
  import { defineAssertion } from '@agentv/eval';
3493
3510
 
3494
- export default defineAssertion(({ outputText }) => {
3511
+ /** Extract text from the last message with the given role. */
3512
+ function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string {
3513
+ for (let i = messages.length - 1; i >= 0; i--) {
3514
+ const msg = messages[i];
3515
+ if (msg.role !== role) continue;
3516
+ if (typeof msg.content === 'string') return msg.content;
3517
+ if (Array.isArray(msg.content)) {
3518
+ return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n');
3519
+ }
3520
+ }
3521
+ return '';
3522
+ }
3523
+
3524
+ export default defineAssertion(({ output }) => {
3495
3525
  // TODO: Implement your scoring logic (0.0 to 1.0)
3496
- const score = outputText.length > 0 ? 1.0 : 0.0;
3526
+ const text = getMessageText(output ?? []);
3527
+ const score = text.length > 0 ? 1.0 : 0.0;
3497
3528
  return {
3498
3529
  pass: score >= 0.5,
3499
3530
  score,
@@ -4186,7 +4217,7 @@ var evalRunCommand = command({
4186
4217
  },
4187
4218
  handler: async (args) => {
4188
4219
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
4189
- const { launchInteractiveWizard } = await import("./interactive-WUIEXGWM.js");
4220
+ const { launchInteractiveWizard } = await import("./interactive-D5UTP72M.js");
4190
4221
  await launchInteractiveWizard();
4191
4222
  return;
4192
4223
  }
@@ -4421,6 +4452,9 @@ var evalBenchCommand = command({
4421
4452
  const manifest = JSON.parse(await readFile(join(exportDir, "manifest.json"), "utf8"));
4422
4453
  const testIds = manifest.test_ids;
4423
4454
  const targetName = manifest.target?.name ?? "unknown";
4455
+ const evalSet = manifest.dataset ?? "";
4456
+ const experiment = manifest.experiment;
4457
+ const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
4424
4458
  let stdinData;
4425
4459
  if (llmScoresPath) {
4426
4460
  stdinData = await readFile(llmScoresPath, "utf8");
@@ -4431,7 +4465,9 @@ var evalBenchCommand = command({
4431
4465
  const indexLines = [];
4432
4466
  const allPassRates = [];
4433
4467
  for (const testId of testIds) {
4434
- const testDir = join(exportDir, testId);
4468
+ const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
4469
+ const testDir = join(exportDir, ...subpath);
4470
+ const artifactSubdir = subpath.join("/");
4435
4471
  const evaluators = [];
4436
4472
  const allAssertions = [];
4437
4473
  const codeResultsDir = join(testDir, "code_grader_results");
@@ -4527,13 +4563,15 @@ var evalBenchCommand = command({
4527
4563
  JSON.stringify({
4528
4564
  timestamp: manifest.timestamp,
4529
4565
  test_id: testId,
4566
+ dataset: evalSet || void 0,
4567
+ experiment: experiment || void 0,
4530
4568
  score: Math.round(weightedScore * 1e3) / 1e3,
4531
4569
  target: targetName,
4532
4570
  scores,
4533
4571
  execution_status: executionStatus,
4534
- grading_path: `${testId}/grading.json`,
4535
- timing_path: `${testId}/timing.json`,
4536
- response_path: hasResponse ? `${testId}/response.md` : null
4572
+ grading_path: `${artifactSubdir}/grading.json`,
4573
+ timing_path: `${artifactSubdir}/timing.json`,
4574
+ response_path: hasResponse ? `${artifactSubdir}/response.md` : void 0
4537
4575
  })
4538
4576
  );
4539
4577
  }
@@ -4548,6 +4586,7 @@ var evalBenchCommand = command({
4548
4586
  metadata: {
4549
4587
  eval_file: manifest.eval_file,
4550
4588
  timestamp: manifest.timestamp,
4589
+ experiment: experiment || void 0,
4551
4590
  targets: [targetName],
4552
4591
  tests_run: testIds
4553
4592
  },
@@ -4589,6 +4628,12 @@ function computeStats(values) {
4589
4628
  // src/commands/pipeline/grade.ts
4590
4629
  import { mkdir as mkdir2, readFile as readFile2, readdir as readdir2, writeFile as writeFile3 } from "node:fs/promises";
4591
4630
  import { join as join2 } from "node:path";
4631
+ function extractInputText(input) {
4632
+ if (!input || input.length === 0) return "";
4633
+ if (input.length === 1) return input[0].content;
4634
+ return input.map((m) => `@[${m.role}]:
4635
+ ${m.content}`).join("\n\n");
4636
+ }
4592
4637
  var evalGradeCommand = command({
4593
4638
  name: "grade",
4594
4639
  description: "Run code-grader assertions on responses in an export directory",
@@ -4603,10 +4648,13 @@ var evalGradeCommand = command({
4603
4648
  const manifestPath = join2(exportDir, "manifest.json");
4604
4649
  const manifest = JSON.parse(await readFile2(manifestPath, "utf8"));
4605
4650
  const testIds = manifest.test_ids;
4651
+ const evalSet = manifest.dataset ?? "";
4652
+ const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
4606
4653
  let totalGraders = 0;
4607
4654
  let totalPassed = 0;
4608
4655
  for (const testId of testIds) {
4609
- const testDir = join2(exportDir, testId);
4656
+ const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
4657
+ const testDir = join2(exportDir, ...subpath);
4610
4658
  const codeGradersDir = join2(testDir, "code_graders");
4611
4659
  const resultsDir = join2(testDir, "code_grader_results");
4612
4660
  let graderFiles;
@@ -4622,14 +4670,13 @@ var evalGradeCommand = command({
4622
4670
  for (const graderFile of graderFiles) {
4623
4671
  const graderConfig = JSON.parse(await readFile2(join2(codeGradersDir, graderFile), "utf8"));
4624
4672
  const graderName = graderConfig.name;
4673
+ const inputText = extractInputText(inputData.input);
4625
4674
  const payload = JSON.stringify({
4626
4675
  output: [{ role: "assistant", content: responseText }],
4627
- input: inputData.input_messages,
4628
- question: inputData.input_text,
4676
+ input: inputData.input,
4629
4677
  criteria: "",
4630
4678
  expected_output: [],
4631
- reference_answer: "",
4632
- input_files: [],
4679
+ input_files: inputData.input_files ?? [],
4633
4680
  trace: null,
4634
4681
  token_usage: null,
4635
4682
  cost_usd: null,
@@ -4639,8 +4686,8 @@ var evalGradeCommand = command({
4639
4686
  file_changes: null,
4640
4687
  workspace_path: null,
4641
4688
  config: graderConfig.config ?? null,
4642
- metadata: {},
4643
- input_text: inputData.input_text,
4689
+ metadata: inputData.metadata ?? {},
4690
+ input_text: inputText,
4644
4691
  output_text: responseText,
4645
4692
  expected_output_text: ""
4646
4693
  });
@@ -4698,10 +4745,10 @@ var evalGradeCommand = command({
4698
4745
  // src/commands/pipeline/input.ts
4699
4746
  import { readFile as readFile3 } from "node:fs/promises";
4700
4747
  import { mkdir as mkdir3, writeFile as writeFile4 } from "node:fs/promises";
4701
- import { dirname, join as join3, resolve } from "node:path";
4748
+ import { dirname, join as join3, relative, resolve } from "node:path";
4702
4749
  var evalInputCommand = command({
4703
4750
  name: "input",
4704
- description: "Extract eval inputs, target commands, and grader prompts for agent-mode runs",
4751
+ description: "Extract eval inputs, target commands, and grader prompts for subagent-mode runs",
4705
4752
  args: {
4706
4753
  evalPath: positional({
4707
4754
  type: string,
@@ -4711,15 +4758,21 @@ var evalInputCommand = command({
4711
4758
  out: option({
4712
4759
  type: optional(string),
4713
4760
  long: "out",
4714
- description: "Output directory for extracted inputs (default: .agentv/results/runs/eval_<timestamp>)"
4761
+ description: "Output directory for extracted inputs (default: .agentv/results/runs/<timestamp>)"
4762
+ }),
4763
+ experiment: option({
4764
+ type: optional(string),
4765
+ long: "experiment",
4766
+ description: "Experiment label (e.g. with_skills, without_skills)"
4715
4767
  })
4716
4768
  },
4717
- handler: async ({ evalPath, out }) => {
4769
+ handler: async ({ evalPath, out, experiment }) => {
4718
4770
  const resolvedEvalPath = resolve(evalPath);
4719
4771
  const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
4720
4772
  const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
4721
4773
  const evalDir = dirname(resolvedEvalPath);
4722
- const suite = await loadTestSuite(resolvedEvalPath, repoRoot);
4774
+ const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
4775
+ const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
4723
4776
  const tests = suite.tests;
4724
4777
  if (tests.length === 0) {
4725
4778
  console.error("No tests found in eval file.");
@@ -4728,6 +4781,7 @@ var evalInputCommand = command({
4728
4781
  let targetInfo = null;
4729
4782
  let targetName = "agent";
4730
4783
  let targetKind = "agent";
4784
+ let subagentModeAllowed = true;
4731
4785
  try {
4732
4786
  const selection = await selectTarget({
4733
4787
  testFilePath: resolvedEvalPath,
@@ -4740,32 +4794,38 @@ var evalInputCommand = command({
4740
4794
  env: process.env
4741
4795
  });
4742
4796
  targetName = selection.targetName;
4743
- if (selection.resolvedTarget.kind === "cli") {
4797
+ const resolved = selection.resolvedTarget;
4798
+ subagentModeAllowed = resolved.subagentModeAllowed !== false;
4799
+ if (resolved.kind === "cli") {
4744
4800
  targetKind = "cli";
4745
- const config = selection.resolvedTarget.config;
4801
+ subagentModeAllowed = false;
4802
+ const config = resolved.config;
4746
4803
  targetInfo = {
4747
4804
  kind: "cli",
4748
4805
  command: config.command,
4749
4806
  cwd: config.cwd ?? evalDir,
4750
4807
  timeoutMs: config.timeoutMs ?? 3e4
4751
4808
  };
4809
+ } else {
4810
+ targetKind = resolved.kind;
4752
4811
  }
4753
4812
  } catch {
4754
4813
  }
4814
+ const evalSetName = suite.metadata?.name?.trim() ?? "";
4815
+ const safeEvalSet = evalSetName ? evalSetName.replace(/[\/\\:*?"<>|]/g, "_") : "";
4755
4816
  const testIds = [];
4756
4817
  for (const test of tests) {
4757
- const testDir = join3(outDir, test.id);
4818
+ const subpath = safeEvalSet ? [safeEvalSet, test.id] : [test.id];
4819
+ const testDir = join3(outDir, ...subpath);
4758
4820
  await mkdir3(testDir, { recursive: true });
4759
4821
  testIds.push(test.id);
4760
- const inputText = test.question;
4761
4822
  const inputMessages = test.input.map((m) => ({
4762
4823
  role: m.role,
4763
4824
  content: typeof m.content === "string" ? m.content : m.content
4764
4825
  }));
4765
4826
  await writeJson(join3(testDir, "input.json"), {
4766
- input_text: inputText,
4767
- input_messages: inputMessages,
4768
- file_paths: test.file_paths,
4827
+ input: inputMessages,
4828
+ input_files: test.file_paths,
4769
4829
  metadata: test.metadata ?? {}
4770
4830
  });
4771
4831
  if (targetInfo) {
@@ -4793,10 +4853,13 @@ var evalInputCommand = command({
4793
4853
  }
4794
4854
  await writeJson(join3(outDir, "manifest.json"), {
4795
4855
  eval_file: resolvedEvalPath,
4856
+ dataset: evalSetName || void 0,
4857
+ experiment: experiment || void 0,
4796
4858
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
4797
4859
  target: {
4798
4860
  name: targetName,
4799
- kind: targetKind
4861
+ kind: targetKind,
4862
+ subagent_mode_allowed: subagentModeAllowed
4800
4863
  },
4801
4864
  test_ids: testIds
4802
4865
  });
@@ -4858,7 +4921,13 @@ import { execSync } from "node:child_process";
4858
4921
  import { existsSync as existsSync3, readFileSync as readFileSync4, unlinkSync } from "node:fs";
4859
4922
  import { mkdir as mkdir4, readFile as readFile4, readdir as readdir3, writeFile as writeFile5 } from "node:fs/promises";
4860
4923
  import { tmpdir } from "node:os";
4861
- import { dirname as dirname2, join as join4, resolve as resolve2 } from "node:path";
4924
+ import { dirname as dirname2, join as join4, relative as relative2, resolve as resolve2 } from "node:path";
4925
+ function extractInputText2(input) {
4926
+ if (!input || input.length === 0) return "";
4927
+ if (input.length === 1) return input[0].content;
4928
+ return input.map((m) => `@[${m.role}]:
4929
+ ${m.content}`).join("\n\n");
4930
+ }
4862
4931
  function loadEnvFile(dir) {
4863
4932
  let current = resolve2(dir);
4864
4933
  while (true) {
@@ -4892,20 +4961,26 @@ var evalRunCommand2 = command({
4892
4961
  out: option({
4893
4962
  type: optional(string),
4894
4963
  long: "out",
4895
- description: "Output directory for results (default: .agentv/results/runs/eval_<timestamp>)"
4964
+ description: "Output directory for results (default: .agentv/results/runs/<timestamp>)"
4896
4965
  }),
4897
4966
  workers: option({
4898
4967
  type: optional(number),
4899
4968
  long: "workers",
4900
4969
  description: "Parallel workers for target invocation (default: all tests)"
4970
+ }),
4971
+ experiment: option({
4972
+ type: optional(string),
4973
+ long: "experiment",
4974
+ description: "Experiment label (e.g. with_skills, without_skills)"
4901
4975
  })
4902
4976
  },
4903
- handler: async ({ evalPath, out, workers }) => {
4977
+ handler: async ({ evalPath, out, workers, experiment }) => {
4904
4978
  const resolvedEvalPath = resolve2(evalPath);
4905
4979
  const outDir = resolve2(out ?? buildDefaultRunDir(process.cwd()));
4906
4980
  const repoRoot = await findRepoRoot(dirname2(resolvedEvalPath));
4907
4981
  const evalDir = dirname2(resolvedEvalPath);
4908
- const suite = await loadTestSuite(resolvedEvalPath, repoRoot);
4982
+ const category = deriveCategory(relative2(process.cwd(), resolvedEvalPath));
4983
+ const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
4909
4984
  const tests = suite.tests;
4910
4985
  if (tests.length === 0) {
4911
4986
  console.error("No tests found in eval file.");
@@ -4938,20 +5013,21 @@ var evalRunCommand2 = command({
4938
5013
  }
4939
5014
  } catch {
4940
5015
  }
5016
+ const evalSetName = suite.metadata?.name?.trim() ?? "";
5017
+ const safeEvalSet = evalSetName ? evalSetName.replace(/[\/\\:*?"<>|]/g, "_") : "";
4941
5018
  const testIds = [];
4942
5019
  for (const test of tests) {
4943
- const testDir = join4(outDir, test.id);
5020
+ const subpath = safeEvalSet ? [safeEvalSet, test.id] : [test.id];
5021
+ const testDir = join4(outDir, ...subpath);
4944
5022
  await mkdir4(testDir, { recursive: true });
4945
5023
  testIds.push(test.id);
4946
- const inputText = test.question;
4947
5024
  const inputMessages = test.input.map((m) => ({
4948
5025
  role: m.role,
4949
5026
  content: typeof m.content === "string" ? m.content : m.content
4950
5027
  }));
4951
5028
  await writeJson2(join4(testDir, "input.json"), {
4952
- input_text: inputText,
4953
- input_messages: inputMessages,
4954
- file_paths: test.file_paths,
5029
+ input: inputMessages,
5030
+ input_files: test.file_paths,
4955
5031
  metadata: test.metadata ?? {}
4956
5032
  });
4957
5033
  if (targetInfo) {
@@ -4979,6 +5055,8 @@ var evalRunCommand2 = command({
4979
5055
  }
4980
5056
  await writeJson2(join4(outDir, "manifest.json"), {
4981
5057
  eval_file: resolvedEvalPath,
5058
+ dataset: evalSetName || void 0,
5059
+ experiment: experiment || void 0,
4982
5060
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
4983
5061
  target: { name: targetName, kind: targetKind },
4984
5062
  test_ids: testIds
@@ -4993,7 +5071,8 @@ var evalRunCommand2 = command({
4993
5071
  const maxWorkers = workers ?? testIds.length;
4994
5072
  console.log(`Invoking ${testIds.length} CLI target(s) (${maxWorkers} workers)...`);
4995
5073
  const invokeTarget = async (testId) => {
4996
- const testDir = join4(outDir, testId);
5074
+ const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
5075
+ const testDir = join4(outDir, ...subpath);
4997
5076
  const invoke = JSON.parse(await readFile4(join4(testDir, "invoke.json"), "utf8"));
4998
5077
  if (invoke.kind !== "cli") return;
4999
5078
  const inputData = JSON.parse(await readFile4(join4(testDir, "input.json"), "utf8"));
@@ -5002,11 +5081,12 @@ var evalRunCommand2 = command({
5002
5081
  const timeoutMs = invoke.timeout_ms ?? 12e4;
5003
5082
  const promptFile = join4(tmpdir(), `agentv-prompt-${testId}-${Date.now()}.txt`);
5004
5083
  const outputFile = join4(tmpdir(), `agentv-output-${testId}-${Date.now()}.txt`);
5005
- await writeFile5(promptFile, inputData.input_text, "utf8");
5084
+ const inputText = extractInputText2(inputData.input);
5085
+ await writeFile5(promptFile, inputText, "utf8");
5006
5086
  let rendered = template;
5007
5087
  rendered = rendered.replace("{PROMPT_FILE}", promptFile);
5008
5088
  rendered = rendered.replace("{OUTPUT_FILE}", outputFile);
5009
- rendered = rendered.replace("{PROMPT}", inputData.input_text);
5089
+ rendered = rendered.replace("{PROMPT}", inputText);
5010
5090
  const start = performance.now();
5011
5091
  try {
5012
5092
  execSync(rendered, {
@@ -5061,12 +5141,13 @@ var evalRunCommand2 = command({
5061
5141
  }
5062
5142
  await Promise.all(pending);
5063
5143
  } else {
5064
- console.log("Agent-as-target mode \u2014 skipping CLI invocation.");
5144
+ console.log("Subagent-as-target mode \u2014 skipping CLI invocation.");
5065
5145
  }
5066
5146
  let totalGraders = 0;
5067
5147
  let totalPassed = 0;
5068
5148
  for (const testId of testIds) {
5069
- const testDir = join4(outDir, testId);
5149
+ const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
5150
+ const testDir = join4(outDir, ...subpath);
5070
5151
  const codeGradersDir = join4(testDir, "code_graders");
5071
5152
  const resultsDir = join4(testDir, "code_grader_results");
5072
5153
  let graderFiles;
@@ -5082,14 +5163,13 @@ var evalRunCommand2 = command({
5082
5163
  for (const graderFile of graderFiles) {
5083
5164
  const graderConfig = JSON.parse(await readFile4(join4(codeGradersDir, graderFile), "utf8"));
5084
5165
  const graderName = graderConfig.name;
5166
+ const inputText = extractInputText2(inputData.input);
5085
5167
  const payload = JSON.stringify({
5086
5168
  output: [{ role: "assistant", content: responseText }],
5087
- input: inputData.input_messages,
5088
- question: inputData.input_text,
5169
+ input: inputData.input,
5089
5170
  criteria: "",
5090
5171
  expected_output: [],
5091
- reference_answer: "",
5092
- input_files: [],
5172
+ input_files: inputData.input_files ?? [],
5093
5173
  trace: null,
5094
5174
  token_usage: null,
5095
5175
  cost_usd: null,
@@ -5099,8 +5179,8 @@ var evalRunCommand2 = command({
5099
5179
  file_changes: null,
5100
5180
  workspace_path: null,
5101
5181
  config: graderConfig.config ?? null,
5102
- metadata: {},
5103
- input_text: inputData.input_text,
5182
+ metadata: inputData.metadata ?? {},
5183
+ input_text: inputText,
5104
5184
  output_text: responseText,
5105
5185
  expected_output_text: ""
5106
5186
  });
@@ -5288,7 +5368,7 @@ function toRawResult(result) {
5288
5368
  return {
5289
5369
  timestamp: result.timestamp,
5290
5370
  test_id: result.testId,
5291
- eval_set: result.eval_set,
5371
+ dataset: result.dataset,
5292
5372
  conversation_id: result.conversationId,
5293
5373
  score: result.score,
5294
5374
  assertions: result.assertions?.map((assertion) => ({
@@ -5411,7 +5491,7 @@ function loadOtlpTraceFile(filePath) {
5411
5491
  }
5412
5492
  return {
5413
5493
  test_id: stringAttr(rootAttrs.agentv_test_id) ?? stringAttr(rootAttrs.agentv_eval_id) ?? `trace-${index + 1}`,
5414
- eval_set: stringAttr(rootAttrs.agentv_eval_set),
5494
+ dataset: stringAttr(rootAttrs.agentv_dataset),
5415
5495
  target: stringAttr(rootAttrs.agentv_target),
5416
5496
  score,
5417
5497
  error: root.status?.code === 2 ? root.status.message : void 0,
@@ -5684,9 +5764,11 @@ function patchTestIds(results) {
5684
5764
  // src/commands/results/export.ts
5685
5765
  function deriveOutputDir(cwd, sourceFile) {
5686
5766
  const parentDir = path7.basename(path7.dirname(sourceFile));
5767
+ if (/^\d{4}-\d{2}-\d{2}T/.test(parentDir)) {
5768
+ return path7.join(cwd, ".agentv", "results", "export", parentDir);
5769
+ }
5687
5770
  if (parentDir.startsWith("eval_")) {
5688
- const dirName2 = parentDir.slice(5);
5689
- return path7.join(cwd, ".agentv", "results", "export", dirName2);
5771
+ return path7.join(cwd, ".agentv", "results", "export", parentDir.slice(5));
5690
5772
  }
5691
5773
  const basename = path7.basename(sourceFile, ".jsonl");
5692
5774
  const dirName = basename.startsWith("eval_") ? basename.slice(5) : basename;
@@ -5939,10 +6021,12 @@ function checkDirectoryNaming(runDir) {
5939
6021
  message: `Directory is not under a 'runs/' parent (found '${parentName}/'). Expected: .agentv/results/runs/<run-dir>`
5940
6022
  });
5941
6023
  }
5942
- if (!/^eval_\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z$/.test(dirName)) {
6024
+ const isNewFormat = /^\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z$/.test(dirName);
6025
+ const isLegacyFormat = /^eval_\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z$/.test(dirName);
6026
+ if (!isNewFormat && !isLegacyFormat) {
5943
6027
  diagnostics.push({
5944
6028
  severity: "warning",
5945
- message: `Directory name '${dirName}' does not match the expected pattern 'eval_<ISO-timestamp>'. Example: eval_2026-03-27T12-42-24-429Z`
6029
+ message: `Directory name '${dirName}' does not match the expected pattern '<ISO-timestamp>'. Example: 2026-03-27T12-42-24-429Z`
5946
6030
  });
5947
6031
  }
5948
6032
  return diagnostics;
@@ -6151,8 +6235,9 @@ var resultsCommand = subcommands({
6151
6235
  });
6152
6236
 
6153
6237
  // src/commands/results/serve.ts
6154
- import { existsSync as existsSync7, readFileSync as readFileSync8, writeFileSync as writeFileSync3 } from "node:fs";
6238
+ import { existsSync as existsSync7, readFileSync as readFileSync8, readdirSync as readdirSync3, statSync as statSync4, writeFileSync as writeFileSync3 } from "node:fs";
6155
6239
  import path9 from "node:path";
6240
+ import { fileURLToPath as fileURLToPath2 } from "node:url";
6156
6241
  import { Hono } from "hono";
6157
6242
  function feedbackPath(resultDir) {
6158
6243
  return path9.join(resultDir, "feedback.json");
@@ -6173,24 +6258,45 @@ function writeFeedback(cwd, data) {
6173
6258
  writeFileSync3(feedbackPath(cwd), `${JSON.stringify(data, null, 2)}
6174
6259
  `, "utf8");
6175
6260
  }
6176
- function createApp(results, resultDir, cwd, sourceFile) {
6261
+ function createApp(results, resultDir, cwd, sourceFile, options) {
6177
6262
  const searchDir = cwd ?? resultDir;
6178
6263
  const app2 = new Hono();
6264
+ const studioDistPath = options?.studioDir === false ? void 0 : options?.studioDir ?? resolveStudioDistDir();
6179
6265
  app2.get("/", (c3) => {
6266
+ if (studioDistPath) {
6267
+ const indexPath = path9.join(studioDistPath, "index.html");
6268
+ if (existsSync7(indexPath)) {
6269
+ return c3.html(readFileSync8(indexPath, "utf8"));
6270
+ }
6271
+ }
6180
6272
  return c3.html(generateServeHtml(results, sourceFile));
6181
6273
  });
6182
6274
  app2.get("/api/runs", (c3) => {
6183
6275
  const metas = listResultFiles(searchDir);
6184
6276
  return c3.json({
6185
- runs: metas.map((m) => ({
6186
- filename: m.filename,
6187
- path: m.path,
6188
- timestamp: m.timestamp,
6189
- test_count: m.testCount,
6190
- pass_rate: m.passRate,
6191
- avg_score: m.avgScore,
6192
- size_bytes: m.sizeBytes
6193
- }))
6277
+ runs: metas.map((m) => {
6278
+ let target;
6279
+ let experiment;
6280
+ try {
6281
+ const records = loadLightweightResults(m.path);
6282
+ if (records.length > 0) {
6283
+ target = records[0].target;
6284
+ experiment = records[0].experiment;
6285
+ }
6286
+ } catch {
6287
+ }
6288
+ return {
6289
+ filename: m.filename,
6290
+ path: m.path,
6291
+ timestamp: m.timestamp,
6292
+ test_count: m.testCount,
6293
+ pass_rate: m.passRate,
6294
+ avg_score: m.avgScore,
6295
+ size_bytes: m.sizeBytes,
6296
+ ...target && { target },
6297
+ ...experiment && { experiment }
6298
+ };
6299
+ })
6194
6300
  });
6195
6301
  });
6196
6302
  app2.get("/api/runs/:filename", (c3) => {
@@ -6250,8 +6356,393 @@ function createApp(results, resultDir, cwd, sourceFile) {
6250
6356
  writeFeedback(resultDir, existing);
6251
6357
  return c3.json(existing);
6252
6358
  });
6359
// GET /api/runs/:filename/datasets — roll the results of one run up per
// dataset. A record's dataset falls back to its target, then to "default".
// A record counts as passed when its score is at least 1.
app2.get("/api/runs/:filename/datasets", (c3) => {
  const runName = c3.req.param("filename");
  const run = listResultFiles(searchDir).find((candidate) => candidate.filename === runName);
  if (!run) {
    return c3.json({ error: "Run not found" }, 404);
  }
  try {
    const tallies = new Map();
    for (const record of patchTestIds(loadManifestResults(run.path))) {
      const key = record.dataset ?? record.target ?? "default";
      let tally = tallies.get(key);
      if (!tally) {
        tally = { total: 0, passed: 0, scoreSum: 0 };
        tallies.set(key, tally);
      }
      tally.total += 1;
      if (record.score >= 1) tally.passed += 1;
      tally.scoreSum += record.score;
    }
    const datasets = Array.from(tallies, ([name, tally]) => ({
      name,
      total: tally.total,
      passed: tally.passed,
      failed: tally.total - tally.passed,
      avg_score: tally.total > 0 ? tally.scoreSum / tally.total : 0
    }));
    return c3.json({ datasets });
  } catch {
    return c3.json({ error: "Failed to load datasets" }, 500);
  }
});
6389
// GET /api/runs/:filename/categories — roll the results of one run up per
// category (records without one are filed under DEFAULT_CATEGORY) and report
// how many distinct datasets each category touches.
app2.get("/api/runs/:filename/categories", (c3) => {
  const runName = c3.req.param("filename");
  const run = listResultFiles(searchDir).find((candidate) => candidate.filename === runName);
  if (!run) {
    return c3.json({ error: "Run not found" }, 404);
  }
  try {
    const rollups = new Map();
    for (const record of patchTestIds(loadManifestResults(run.path))) {
      const key = record.category ?? DEFAULT_CATEGORY;
      let rollup = rollups.get(key);
      if (!rollup) {
        rollup = { total: 0, passed: 0, scoreSum: 0, datasets: new Set() };
        rollups.set(key, rollup);
      }
      rollup.total += 1;
      if (record.score >= 1) rollup.passed += 1;
      rollup.scoreSum += record.score;
      // Same dataset fallback chain as the per-dataset endpoints.
      rollup.datasets.add(record.dataset ?? record.target ?? "default");
    }
    const categories = Array.from(rollups, ([name, rollup]) => ({
      name,
      total: rollup.total,
      passed: rollup.passed,
      failed: rollup.total - rollup.passed,
      avg_score: rollup.total > 0 ? rollup.scoreSum / rollup.total : 0,
      dataset_count: rollup.datasets.size
    }));
    return c3.json({ categories });
  } catch {
    return c3.json({ error: "Failed to load categories" }, 500);
  }
});
6426
// GET /api/runs/:filename/categories/:category/datasets — per-dataset rollup
// restricted to the records of one category.
//
// The route parameter is matched verbatim first. NOTE(review): Hono's
// `c.req.param()` is expected to return an already percent-decoded value, so
// decoding it a second time unconditionally (as this handler used to) threw an
// uncaught URIError for categories containing a literal "%" and mis-decoded
// names containing a literal escape such as "%20". The extra decode is kept
// only as a guarded fallback for callers that relied on it.
app2.get("/api/runs/:filename/categories/:category/datasets", (c3) => {
  const filename = c3.req.param("filename");
  const requestedCategory = c3.req.param("category");
  const metas = listResultFiles(searchDir);
  const meta = metas.find((m) => m.filename === filename);
  if (!meta) {
    return c3.json({ error: "Run not found" }, 404);
  }
  try {
    const loaded = patchTestIds(loadManifestResults(meta.path));
    const categoryOf = (r) => r.category ?? DEFAULT_CATEGORY;
    let category = requestedCategory;
    if (!loaded.some((r) => categoryOf(r) === requestedCategory)) {
      try {
        category = decodeURIComponent(requestedCategory);
      } catch {
        // Malformed escape sequence (e.g. a bare "%"): keep the value as received.
      }
    }
    const filtered = loaded.filter((r) => categoryOf(r) === category);
    const datasetMap = new Map();
    for (const r of filtered) {
      // Dataset falls back to the target name, then to "default".
      const ds = r.dataset ?? r.target ?? "default";
      const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
      entry.total++;
      // A record passes when its score is at least 1.
      if (r.score >= 1) entry.passed++;
      entry.scoreSum += r.score;
      datasetMap.set(ds, entry);
    }
    const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
      name,
      total: entry.total,
      passed: entry.passed,
      failed: entry.total - entry.passed,
      avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0
    }));
    return c3.json({ datasets });
  } catch {
    return c3.json({ error: "Failed to load datasets" }, 500);
  }
});
6458
// GET /api/runs/:filename/evals/:evalId — return the full result record whose
// testId equals :evalId within the named run.
app2.get("/api/runs/:filename/evals/:evalId", (c3) => {
  const runName = c3.req.param("filename");
  const wantedId = c3.req.param("evalId");
  const run = listResultFiles(searchDir).find((candidate) => candidate.filename === runName);
  if (!run) {
    return c3.json({ error: "Run not found" }, 404);
  }
  try {
    const match = patchTestIds(loadManifestResults(run.path)).find((record) => record.testId === wantedId);
    return match ? c3.json({ eval: match }) : c3.json({ error: "Eval not found" }, 404);
  } catch {
    return c3.json({ error: "Failed to load eval" }, 500);
  }
});
6477
// GET /api/index — one summary entry per run, including the summed `costUsd`
// of its records. A run whose records cannot be loaded reports a cost of 0.
app2.get("/api/index", (c3) => {
  const totalCostOf = (runPath) => {
    try {
      let total = 0;
      for (const record of patchTestIds(loadManifestResults(runPath))) {
        total += record.costUsd ?? 0;
      }
      return total;
    } catch {
      // Best effort: keep the run in the index even if its records are unreadable.
      return 0;
    }
  };
  const entries2 = listResultFiles(searchDir).map((run) => ({
    run_filename: run.filename,
    test_count: run.testCount,
    pass_rate: run.passRate,
    avg_score: run.avgScore,
    total_cost_usd: totalCostOf(run.path),
    timestamp: run.timestamp
  }));
  return c3.json({ entries: entries2 });
});
6497
/**
 * Recursively describe the contents of a directory as a tree of plain objects.
 *
 * Directories are listed before files and each group is ordered by name
 * (locale comparison). Every node carries `name`, `path` (relative to
 * `relativeTo`) and `type` ("dir" or "file"); directory nodes also carry
 * `children`.
 *
 * @param dirPath    Directory to walk.
 * @param relativeTo Directory that node paths are expressed relative to.
 * @returns The node list, or an empty array when `dirPath` does not exist or
 *          is not a directory.
 */
function buildFileTree(dirPath, relativeTo) {
  const walkable = existsSync7(dirPath) && statSync4(dirPath).isDirectory();
  if (!walkable) {
    return [];
  }
  const dirsFirstThenByName = (left, right) => {
    const leftIsDir = left.isDirectory();
    if (leftIsDir !== right.isDirectory()) {
      return leftIsDir ? -1 : 1;
    }
    return left.name.localeCompare(right.name);
  };
  const nodes = [];
  for (const dirent of readdirSync3(dirPath, { withFileTypes: true }).sort(dirsFirstThenByName)) {
    const absolute = path9.join(dirPath, dirent.name);
    const relative = path9.relative(relativeTo, absolute);
    nodes.push(
      dirent.isDirectory()
        ? { name: dirent.name, path: relative, type: "dir", children: buildFileTree(absolute, relativeTo) }
        : { name: dirent.name, path: relative, type: "file" }
    );
  }
  return nodes;
}
6519
/**
 * Map a file path to a language identifier based on its extension
 * (compared case-insensitively).
 *
 * @param filePath Path or file name to inspect.
 * @returns The language identifier for known extensions, "plaintext" otherwise.
 */
function inferLanguage(filePath) {
  switch (path9.extname(filePath).toLowerCase()) {
    case ".json":
    case ".jsonl":
      return "json";
    case ".ts":
    case ".tsx":
      return "typescript";
    case ".js":
    case ".jsx":
      return "javascript";
    case ".md":
      return "markdown";
    case ".yaml":
    case ".yml":
      return "yaml";
    case ".py":
      return "python";
    case ".sh":
    case ".bash":
      return "shell";
    case ".css":
      return "css";
    case ".html":
      return "html";
    case ".xml":
    case ".svg":
      return "xml";
    case ".toml":
      return "toml";
    case ".diff":
    case ".patch":
      return "diff";
    // ".log", ".txt" and every unrecognised extension share the fallback.
    default:
      return "plaintext";
  }
}
6546
// GET /api/runs/:filename/evals/:evalId/files — return the file tree of the
// artifact directory belonging to one eval.
//
// The manifest record (matched on test_id, falling back to eval_id) lists up
// to five artifact paths; the tree is built from the deepest directory shared
// by all of them, with node paths relative to the manifest's directory.
app2.get("/api/runs/:filename/evals/:evalId/files", (c3) => {
  const filename = c3.req.param("filename");
  const evalId = c3.req.param("evalId");
  const metas = listResultFiles(searchDir);
  const meta = metas.find((m) => m.filename === filename);
  if (!meta) {
    return c3.json({ error: "Run not found" }, 404);
  }
  try {
    const content = readFileSync8(meta.path, "utf8");
    const records = parseResultManifest(content);
    const record = records.find((r) => (r.test_id ?? r.eval_id) === evalId);
    if (!record) {
      return c3.json({ error: "Eval not found" }, 404);
    }
    const baseDir = path9.dirname(meta.path);
    const knownPaths = [
      record.grading_path,
      record.timing_path,
      record.input_path,
      record.output_path,
      record.response_path
    ].filter((p) => !!p);
    if (knownPaths.length === 0) {
      return c3.json({ files: [] });
    }
    // Compare directories segment by segment. The previous string-prefix loop
    // (`while (!dir.startsWith(commonDir)) commonDir = dirname(commonDir)`)
    // never terminated when two directories shared no prefix, because
    // dirname(".") is "." again, and it also treated "run/a" as an ancestor
    // of "run/ab".
    const directorySegments = (p) => path9.normalize(path9.dirname(p)).split(path9.sep).filter((segment) => segment !== "" && segment !== ".");
    let commonSegments = directorySegments(knownPaths[0]);
    for (const artifactPath of knownPaths.slice(1)) {
      const segments = directorySegments(artifactPath);
      let shared = 0;
      while (shared < commonSegments.length && shared < segments.length && commonSegments[shared] === segments[shared]) {
        shared++;
      }
      commonSegments = commonSegments.slice(0, shared);
    }
    // With no shared segments this resolves to baseDir itself.
    const artifactAbsDir = path9.join(baseDir, ...commonSegments);
    const files = buildFileTree(artifactAbsDir, baseDir);
    return c3.json({ files });
  } catch {
    return c3.json({ error: "Failed to load file tree" }, 500);
  }
});
6586
// GET /api/runs/:filename/evals/:evalId/files/* — return the UTF-8 content of
// one file plus a language identifier inferred from its extension. The
// wildcard part of the URL is resolved against the directory containing the
// run's result file, and anything resolving outside that directory is refused.
app2.get("/api/runs/:filename/evals/:evalId/files/*", (c3) => {
  const runName = c3.req.param("filename");
  const evalName = c3.req.param("evalId");
  const run = listResultFiles(searchDir).find((candidate) => candidate.filename === runName);
  if (!run) {
    return c3.json({ error: "Run not found" }, 404);
  }
  // Everything after the fixed route prefix is the requested relative file path.
  const routePrefix = `/api/runs/${runName}/evals/${evalName}/files/`;
  const relativeFile = c3.req.path.slice(routePrefix.length);
  if (!relativeFile) {
    return c3.json({ error: "No file path specified" }, 400);
  }
  const rootDir = path9.resolve(path9.dirname(run.path));
  const target = path9.resolve(rootDir, relativeFile);
  const insideRoot = target === rootDir || target.startsWith(rootDir + path9.sep);
  if (!insideRoot) {
    return c3.json({ error: "Path traversal not allowed" }, 403);
  }
  const isReadableFile = existsSync7(target) && statSync4(target).isFile();
  if (!isReadableFile) {
    return c3.json({ error: "File not found" }, 404);
  }
  try {
    return c3.json({ content: readFileSync8(target, "utf8"), language: inferLanguage(target) });
  } catch {
    return c3.json({ error: "Failed to read file" }, 500);
  }
});
6616
// GET /api/experiments — aggregate every record of every run by experiment
// name (records without one are grouped under "default"): distinct runs and
// targets, eval and pass counts, pass rate, and the latest record timestamp.
app2.get("/api/experiments", (c3) => {
  const newStats = () => ({
    targets: new Set(),
    runFilenames: new Set(),
    evalCount: 0,
    passedCount: 0,
    lastTimestamp: ""
  });
  const statsByExperiment = new Map();
  for (const run of listResultFiles(searchDir)) {
    try {
      for (const record of loadLightweightResults(run.path)) {
        const key = record.experiment ?? "default";
        const stats = statsByExperiment.get(key) ?? newStats();
        stats.runFilenames.add(run.filename);
        if (record.target) stats.targets.add(record.target);
        stats.evalCount += 1;
        if (record.score >= 1) stats.passedCount += 1;
        // Keep the greatest timestamp seen for this experiment.
        if (record.timestamp && record.timestamp > stats.lastTimestamp) {
          stats.lastTimestamp = record.timestamp;
        }
        statsByExperiment.set(key, stats);
      }
    } catch {
      // Best effort: a run that fails to load is left out of the aggregation.
    }
  }
  const experiments = Array.from(statsByExperiment, ([name, stats]) => ({
    name,
    run_count: stats.runFilenames.size,
    target_count: stats.targets.size,
    eval_count: stats.evalCount,
    passed_count: stats.passedCount,
    pass_rate: stats.evalCount > 0 ? stats.passedCount / stats.evalCount : 0,
    last_run: stats.lastTimestamp || null
  }));
  return c3.json({ experiments });
});
6654
// GET /api/targets — aggregate every record of every run by target name
// (records without one are grouped under "default"): distinct runs and
// experiments, eval and pass counts, and pass rate.
app2.get("/api/targets", (c3) => {
  const newStats = () => ({
    experiments: new Set(),
    runFilenames: new Set(),
    evalCount: 0,
    passedCount: 0
  });
  const statsByTarget = new Map();
  for (const run of listResultFiles(searchDir)) {
    try {
      for (const record of loadLightweightResults(run.path)) {
        const key = record.target ?? "default";
        const stats = statsByTarget.get(key) ?? newStats();
        stats.runFilenames.add(run.filename);
        if (record.experiment) stats.experiments.add(record.experiment);
        stats.evalCount += 1;
        if (record.score >= 1) stats.passedCount += 1;
        statsByTarget.set(key, stats);
      }
    } catch {
      // Best effort: a run that fails to load is left out of the aggregation.
    }
  }
  const targets = Array.from(statsByTarget, ([name, stats]) => ({
    name,
    run_count: stats.runFilenames.size,
    experiment_count: stats.experiments.size,
    eval_count: stats.evalCount,
    passed_count: stats.passedCount,
    pass_rate: stats.evalCount > 0 ? stats.passedCount / stats.evalCount : 0
  }));
  return c3.json({ targets });
});
6687
// Static hosting for the Studio single-page app. These routes are registered
// only when a Studio directory is configured.
if (studioDistPath) {
  const studioRoot = path9.resolve(studioDistPath);
  // Prefix used for the containment check; avoids doubling the separator when
  // the root already ends with one.
  const studioRootPrefix = studioRoot.endsWith(path9.sep) ? studioRoot : studioRoot + path9.sep;
  const assetMimeTypes = {
    ".js": "application/javascript",
    ".css": "text/css",
    ".html": "text/html",
    ".json": "application/json",
    ".svg": "image/svg+xml",
    ".png": "image/png",
    ".woff2": "font/woff2",
    ".woff": "font/woff"
  };
  // GET /assets/* — serve a file from the Studio directory.
  app2.get("/assets/*", (c3) => {
    // path.join normalises any ".." segments, so the result can be compared
    // against the root to refuse requests that would escape it.
    const filePath = path9.join(studioRoot, c3.req.path);
    if (!filePath.startsWith(studioRootPrefix)) {
      return c3.notFound();
    }
    // Require a regular file: a directory passes existsSync but would make
    // readFileSync throw.
    if (!existsSync7(filePath) || !statSync4(filePath).isFile()) {
      return c3.notFound();
    }
    const content = readFileSync8(filePath);
    const contentType = assetMimeTypes[path9.extname(filePath)] ?? "application/octet-stream";
    return new Response(content, {
      headers: {
        "Content-Type": contentType,
        "Cache-Control": "public, max-age=31536000, immutable"
      }
    });
  });
  // Catch-all: unknown /api/ paths get a JSON 404; every other path receives
  // the SPA shell (index.html) when it exists.
  app2.get("*", (c3) => {
    if (c3.req.path.startsWith("/api/")) {
      return c3.json({ error: "Not found" }, 404);
    }
    const indexPath = path9.join(studioDistPath, "index.html");
    if (existsSync7(indexPath)) {
      return c3.html(readFileSync8(indexPath, "utf8"));
    }
    return c3.notFound();
  });
}
6253
6725
  return app2;
6254
6726
  }
6727
/**
 * Locate the built Studio front-end by probing a fixed list of directories
 * relative to this module's own directory.
 *
 * @returns The first candidate directory that exists and contains an
 *          `index.html`, or `undefined` when none does.
 */
function resolveStudioDistDir() {
  // Use __dirname when it is defined; otherwise derive the directory from import.meta.url.
  const here = typeof __dirname !== "undefined" ? __dirname : path9.dirname(fileURLToPath2(import.meta.url));
  const relativeCandidates = [
    // From src/commands/results/ to the sibling apps/studio/dist
    "../../../../studio/dist",
    // From dist/ to the sibling apps/studio/dist (monorepo dev)
    "../../studio/dist",
    // Bundled inside the CLI dist (published package)
    "../studio",
    // From dist/ in the monorepo root context
    "../../../apps/studio/dist"
  ];
  return relativeCandidates
    .map((relative) => path9.resolve(here, relative))
    .find((dir) => existsSync7(dir) && existsSync7(path9.join(dir, "index.html")));
}
6255
6746
  function stripHeavyFields(results) {
6256
6747
  return results.map((r) => {
6257
6748
  const { requests, trace, ...rest } = r;
@@ -6934,8 +7425,8 @@ var SERVE_SCRIPT = `
6934
7425
  })();
6935
7426
  `;
6936
7427
  var resultsServeCommand = command({
6937
- name: "serve",
6938
- description: "Start a local HTTP server to review evaluation results",
7428
+ name: "studio",
7429
+ description: "Start AgentV Studio \u2014 a local dashboard for reviewing evaluation results",
6939
7430
  args: {
6940
7431
  source: positional({
6941
7432
  type: optional(string),
@@ -7572,7 +8063,7 @@ function formatResultDetail(result, index, tree) {
7572
8063
  }
7573
8064
  const scoreColor = result.score >= 0.9 ? c2.green : result.score >= 0.5 ? c2.yellow : c2.red;
7574
8065
  lines.push(
7575
- `${c2.bold}${testId}${c2.reset} ${scoreColor}${formatScore(result.score)}${c2.reset}${result.target ? ` ${c2.dim}target: ${result.target}${c2.reset}` : ""}${result.eval_set ? ` ${c2.dim}eval-set: ${result.eval_set}${c2.reset}` : ""}`
8066
+ `${c2.bold}${testId}${c2.reset} ${scoreColor}${formatScore(result.score)}${c2.reset}${result.target ? ` ${c2.dim}target: ${result.target}${c2.reset}` : ""}${result.dataset ? ` ${c2.dim}dataset: ${result.dataset}${c2.reset}` : ""}`
7576
8067
  );
7577
8068
  if (result.error) {
7578
8069
  lines.push(` ${c2.red}Error: ${result.error}${c2.reset}`);
@@ -7746,8 +8237,8 @@ function groupResults(results, groupBy2) {
7746
8237
  case "target":
7747
8238
  key = result.target ?? "unknown";
7748
8239
  break;
7749
- case "eval-set":
7750
- key = result.eval_set ?? "unknown";
8240
+ case "dataset":
8241
+ key = result.dataset ?? "unknown";
7751
8242
  break;
7752
8243
  case "test-id":
7753
8244
  key = result.test_id ?? result.eval_id ?? "unknown";
@@ -8460,7 +8951,9 @@ var app = subcommands({
8460
8951
  pipeline: pipelineCommand,
8461
8952
  results: resultsCommand,
8462
8953
  self: selfCommand,
8954
+ studio: resultsServeCommand,
8463
8955
  serve: resultsServeCommand,
8956
+ // hidden alias for backward compatibility
8464
8957
  trace: traceCommand,
8465
8958
  transpile: transpileCommand,
8466
8959
  trim: trimCommand,
@@ -8479,6 +8972,7 @@ var TOP_LEVEL_COMMANDS = /* @__PURE__ */ new Set([
8479
8972
  "results",
8480
8973
  "self",
8481
8974
  "serve",
8975
+ "studio",
8482
8976
  "trace",
8483
8977
  "transpile",
8484
8978
  "trim",
@@ -8525,4 +9019,4 @@ export {
8525
9019
  preprocessArgv,
8526
9020
  runCli
8527
9021
  };
8528
- //# sourceMappingURL=chunk-UBLKP2F4.js.map
9022
+ //# sourceMappingURL=chunk-E3VSJJI4.js.map