agentv 3.14.6 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,7 @@ import {
10
10
  loadManifestResults,
11
11
  loadRunCache,
12
12
  package_default,
13
+ parseResultManifest,
13
14
  resolveEvalPaths,
14
15
  resolveExistingRunPrimaryPath,
15
16
  resolveResultSourcePath,
@@ -23,9 +24,11 @@ import {
23
24
  validateFileReferences,
24
25
  validateTargetsFile,
25
26
  writeArtifactsFromResults
26
- } from "./chunk-Y25VL7PX.js";
27
+ } from "./chunk-OT2J474N.js";
27
28
  import {
29
+ DEFAULT_CATEGORY,
28
30
  createBuiltinRegistry,
31
+ deriveCategory,
29
32
  executeScript,
30
33
  getAgentvHome,
31
34
  getOutputFilenames,
@@ -40,7 +43,7 @@ import {
40
43
  toSnakeCaseDeep as toSnakeCaseDeep2,
41
44
  transpileEvalYamlFile,
42
45
  trimBaselineResult
43
- } from "./chunk-ELQEFMGO.js";
46
+ } from "./chunk-OXBBWZOY.js";
44
47
  import {
45
48
  __commonJS,
46
49
  __esm,
@@ -3479,9 +3482,23 @@ var ASSERTION_TEMPLATES = {
3479
3482
  default: `#!/usr/bin/env bun
3480
3483
  import { defineAssertion } from '@agentv/eval';
3481
3484
 
3482
- export default defineAssertion(({ outputText }) => {
3485
+ /** Extract text from the last message with the given role. */
3486
+ function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string {
3487
+ for (let i = messages.length - 1; i >= 0; i--) {
3488
+ const msg = messages[i];
3489
+ if (msg.role !== role) continue;
3490
+ if (typeof msg.content === 'string') return msg.content;
3491
+ if (Array.isArray(msg.content)) {
3492
+ return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n');
3493
+ }
3494
+ }
3495
+ return '';
3496
+ }
3497
+
3498
+ export default defineAssertion(({ output }) => {
3483
3499
  // TODO: Implement your assertion logic
3484
- const pass = outputText.length > 0;
3500
+ const text = getMessageText(output ?? []);
3501
+ const pass = text.length > 0;
3485
3502
  return {
3486
3503
  pass,
3487
3504
  reasoning: pass ? 'Output has content' : 'Output is empty',
@@ -3491,9 +3508,23 @@ export default defineAssertion(({ outputText }) => {
3491
3508
  score: `#!/usr/bin/env bun
3492
3509
  import { defineAssertion } from '@agentv/eval';
3493
3510
 
3494
- export default defineAssertion(({ outputText }) => {
3511
+ /** Extract text from the last message with the given role. */
3512
+ function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string {
3513
+ for (let i = messages.length - 1; i >= 0; i--) {
3514
+ const msg = messages[i];
3515
+ if (msg.role !== role) continue;
3516
+ if (typeof msg.content === 'string') return msg.content;
3517
+ if (Array.isArray(msg.content)) {
3518
+ return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n');
3519
+ }
3520
+ }
3521
+ return '';
3522
+ }
3523
+
3524
+ export default defineAssertion(({ output }) => {
3495
3525
  // TODO: Implement your scoring logic (0.0 to 1.0)
3496
- const score = outputText.length > 0 ? 1.0 : 0.0;
3526
+ const text = getMessageText(output ?? []);
3527
+ const score = text.length > 0 ? 1.0 : 0.0;
3497
3528
  return {
3498
3529
  pass: score >= 0.5,
3499
3530
  score,
@@ -4186,7 +4217,7 @@ var evalRunCommand = command({
4186
4217
  },
4187
4218
  handler: async (args) => {
4188
4219
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
4189
- const { launchInteractiveWizard } = await import("./interactive-5ESM5DWV.js");
4220
+ const { launchInteractiveWizard } = await import("./interactive-D5UTP72M.js");
4190
4221
  await launchInteractiveWizard();
4191
4222
  return;
4192
4223
  }
@@ -4421,7 +4452,8 @@ var evalBenchCommand = command({
4421
4452
  const manifest = JSON.parse(await readFile(join(exportDir, "manifest.json"), "utf8"));
4422
4453
  const testIds = manifest.test_ids;
4423
4454
  const targetName = manifest.target?.name ?? "unknown";
4424
- const evalSet = manifest.eval_set ?? "";
4455
+ const evalSet = manifest.dataset ?? "";
4456
+ const experiment = manifest.experiment;
4425
4457
  const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
4426
4458
  let stdinData;
4427
4459
  if (llmScoresPath) {
@@ -4531,7 +4563,8 @@ var evalBenchCommand = command({
4531
4563
  JSON.stringify({
4532
4564
  timestamp: manifest.timestamp,
4533
4565
  test_id: testId,
4534
- eval_set: evalSet || void 0,
4566
+ dataset: evalSet || void 0,
4567
+ experiment: experiment || void 0,
4535
4568
  score: Math.round(weightedScore * 1e3) / 1e3,
4536
4569
  target: targetName,
4537
4570
  scores,
@@ -4553,6 +4586,7 @@ var evalBenchCommand = command({
4553
4586
  metadata: {
4554
4587
  eval_file: manifest.eval_file,
4555
4588
  timestamp: manifest.timestamp,
4589
+ experiment: experiment || void 0,
4556
4590
  targets: [targetName],
4557
4591
  tests_run: testIds
4558
4592
  },
@@ -4594,6 +4628,12 @@ function computeStats(values) {
4594
4628
  // src/commands/pipeline/grade.ts
4595
4629
  import { mkdir as mkdir2, readFile as readFile2, readdir as readdir2, writeFile as writeFile3 } from "node:fs/promises";
4596
4630
  import { join as join2 } from "node:path";
4631
/**
 * Flatten the `input` message list read from an exported `input.json` into a
 * single prompt string.
 *
 * - No messages (or a missing list): returns "".
 * - Exactly one message: returns that message's text with no role header.
 * - Several messages: each one is rendered as `@[role]:` followed by its text
 *   on the next line, with a blank line between messages.
 *
 * Message `content` is not guaranteed to be a string: the exporter in this
 * file passes it through unchanged, so it may be an array of content blocks.
 * Returning such a value as-is, or interpolating it into the template, would
 * produce a non-string result or the text "[object Object]". Content is
 * therefore always normalized to text first.
 *
 * @param input Array of `{ role, content }` messages, or null/undefined.
 * @returns The flattened prompt text; always a string.
 */
function extractInputText(input) {
  if (!input || input.length === 0) return "";
  // Normalize one message's content to plain text.
  const contentToText = (content) => {
    if (typeof content === "string") return content;
    if (content == null) return "";
    if (Array.isArray(content)) {
      // Keep bare strings and the text of `type: "text"` blocks, joined by
      // newlines (same rule as the getMessageText helper in the assertion
      // templates). Other block kinds contribute no prompt text.
      const parts = [];
      for (const block of content) {
        if (typeof block === "string") {
          parts.push(block);
        } else if (block && block.type === "text" && typeof block.text === "string") {
          parts.push(block.text);
        }
      }
      return parts.join("\n");
    }
    // Unexpected shapes: serialize rather than emit "[object Object]".
    return typeof content === "object" ? JSON.stringify(content) : String(content);
  };
  if (input.length === 1) return contentToText(input[0].content);
  return input.map((m) => `@[${m.role}]:\n${contentToText(m.content)}`).join("\n\n");
}
4597
4637
  var evalGradeCommand = command({
4598
4638
  name: "grade",
4599
4639
  description: "Run code-grader assertions on responses in an export directory",
@@ -4608,7 +4648,7 @@ var evalGradeCommand = command({
4608
4648
  const manifestPath = join2(exportDir, "manifest.json");
4609
4649
  const manifest = JSON.parse(await readFile2(manifestPath, "utf8"));
4610
4650
  const testIds = manifest.test_ids;
4611
- const evalSet = manifest.eval_set ?? "";
4651
+ const evalSet = manifest.dataset ?? "";
4612
4652
  const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
4613
4653
  let totalGraders = 0;
4614
4654
  let totalPassed = 0;
@@ -4630,14 +4670,13 @@ var evalGradeCommand = command({
4630
4670
  for (const graderFile of graderFiles) {
4631
4671
  const graderConfig = JSON.parse(await readFile2(join2(codeGradersDir, graderFile), "utf8"));
4632
4672
  const graderName = graderConfig.name;
4673
+ const inputText = extractInputText(inputData.input);
4633
4674
  const payload = JSON.stringify({
4634
4675
  output: [{ role: "assistant", content: responseText }],
4635
- input: inputData.input_messages,
4636
- question: inputData.input_text,
4676
+ input: inputData.input,
4637
4677
  criteria: "",
4638
4678
  expected_output: [],
4639
- reference_answer: "",
4640
- input_files: [],
4679
+ input_files: inputData.input_files ?? [],
4641
4680
  trace: null,
4642
4681
  token_usage: null,
4643
4682
  cost_usd: null,
@@ -4647,8 +4686,8 @@ var evalGradeCommand = command({
4647
4686
  file_changes: null,
4648
4687
  workspace_path: null,
4649
4688
  config: graderConfig.config ?? null,
4650
- metadata: {},
4651
- input_text: inputData.input_text,
4689
+ metadata: inputData.metadata ?? {},
4690
+ input_text: inputText,
4652
4691
  output_text: responseText,
4653
4692
  expected_output_text: ""
4654
4693
  });
@@ -4706,7 +4745,7 @@ var evalGradeCommand = command({
4706
4745
  // src/commands/pipeline/input.ts
4707
4746
  import { readFile as readFile3 } from "node:fs/promises";
4708
4747
  import { mkdir as mkdir3, writeFile as writeFile4 } from "node:fs/promises";
4709
- import { dirname, join as join3, resolve } from "node:path";
4748
+ import { dirname, join as join3, relative, resolve } from "node:path";
4710
4749
  var evalInputCommand = command({
4711
4750
  name: "input",
4712
4751
  description: "Extract eval inputs, target commands, and grader prompts for subagent-mode runs",
@@ -4720,14 +4759,20 @@ var evalInputCommand = command({
4720
4759
  type: optional(string),
4721
4760
  long: "out",
4722
4761
  description: "Output directory for extracted inputs (default: .agentv/results/runs/<timestamp>)"
4762
+ }),
4763
+ experiment: option({
4764
+ type: optional(string),
4765
+ long: "experiment",
4766
+ description: "Experiment label (e.g. with_skills, without_skills)"
4723
4767
  })
4724
4768
  },
4725
- handler: async ({ evalPath, out }) => {
4769
+ handler: async ({ evalPath, out, experiment }) => {
4726
4770
  const resolvedEvalPath = resolve(evalPath);
4727
4771
  const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
4728
4772
  const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
4729
4773
  const evalDir = dirname(resolvedEvalPath);
4730
- const suite = await loadTestSuite(resolvedEvalPath, repoRoot);
4774
+ const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
4775
+ const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
4731
4776
  const tests = suite.tests;
4732
4777
  if (tests.length === 0) {
4733
4778
  console.error("No tests found in eval file.");
@@ -4736,6 +4781,7 @@ var evalInputCommand = command({
4736
4781
  let targetInfo = null;
4737
4782
  let targetName = "agent";
4738
4783
  let targetKind = "agent";
4784
+ let subagentModeAllowed = true;
4739
4785
  try {
4740
4786
  const selection = await selectTarget({
4741
4787
  testFilePath: resolvedEvalPath,
@@ -4748,15 +4794,20 @@ var evalInputCommand = command({
4748
4794
  env: process.env
4749
4795
  });
4750
4796
  targetName = selection.targetName;
4751
- if (selection.resolvedTarget.kind === "cli") {
4797
+ const resolved = selection.resolvedTarget;
4798
+ subagentModeAllowed = resolved.subagentModeAllowed !== false;
4799
+ if (resolved.kind === "cli") {
4752
4800
  targetKind = "cli";
4753
- const config = selection.resolvedTarget.config;
4801
+ subagentModeAllowed = false;
4802
+ const config = resolved.config;
4754
4803
  targetInfo = {
4755
4804
  kind: "cli",
4756
4805
  command: config.command,
4757
4806
  cwd: config.cwd ?? evalDir,
4758
4807
  timeoutMs: config.timeoutMs ?? 3e4
4759
4808
  };
4809
+ } else {
4810
+ targetKind = resolved.kind;
4760
4811
  }
4761
4812
  } catch {
4762
4813
  }
@@ -4768,15 +4819,13 @@ var evalInputCommand = command({
4768
4819
  const testDir = join3(outDir, ...subpath);
4769
4820
  await mkdir3(testDir, { recursive: true });
4770
4821
  testIds.push(test.id);
4771
- const inputText = test.question;
4772
4822
  const inputMessages = test.input.map((m) => ({
4773
4823
  role: m.role,
4774
4824
  content: typeof m.content === "string" ? m.content : m.content
4775
4825
  }));
4776
4826
  await writeJson(join3(testDir, "input.json"), {
4777
- input_text: inputText,
4778
- input_messages: inputMessages,
4779
- file_paths: test.file_paths,
4827
+ input: inputMessages,
4828
+ input_files: test.file_paths,
4780
4829
  metadata: test.metadata ?? {}
4781
4830
  });
4782
4831
  if (targetInfo) {
@@ -4804,11 +4853,13 @@ var evalInputCommand = command({
4804
4853
  }
4805
4854
  await writeJson(join3(outDir, "manifest.json"), {
4806
4855
  eval_file: resolvedEvalPath,
4807
- eval_set: evalSetName || void 0,
4856
+ dataset: evalSetName || void 0,
4857
+ experiment: experiment || void 0,
4808
4858
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
4809
4859
  target: {
4810
4860
  name: targetName,
4811
- kind: targetKind
4861
+ kind: targetKind,
4862
+ subagent_mode_allowed: subagentModeAllowed
4812
4863
  },
4813
4864
  test_ids: testIds
4814
4865
  });
@@ -4870,7 +4921,13 @@ import { execSync } from "node:child_process";
4870
4921
  import { existsSync as existsSync3, readFileSync as readFileSync4, unlinkSync } from "node:fs";
4871
4922
  import { mkdir as mkdir4, readFile as readFile4, readdir as readdir3, writeFile as writeFile5 } from "node:fs/promises";
4872
4923
  import { tmpdir } from "node:os";
4873
- import { dirname as dirname2, join as join4, resolve as resolve2 } from "node:path";
4924
+ import { dirname as dirname2, join as join4, relative as relative2, resolve as resolve2 } from "node:path";
4925
/**
 * Flatten the `input` message list read from an exported `input.json` into a
 * single prompt string. The result is written to the prompt file and
 * substituted into the target command template, so it must be a string.
 *
 * - No messages (or a missing list): returns "".
 * - Exactly one message: returns that message's text with no role header.
 * - Several messages: each one is rendered as `@[role]:` followed by its text
 *   on the next line, with a blank line between messages.
 *
 * Message `content` is not guaranteed to be a string: the exporter in this
 * file passes it through unchanged, so it may be an array of content blocks.
 * Returning such a value as-is, or interpolating it into the template, would
 * produce a non-string result or the text "[object Object]". Content is
 * therefore always normalized to text first.
 *
 * @param input Array of `{ role, content }` messages, or null/undefined.
 * @returns The flattened prompt text; always a string.
 */
function extractInputText2(input) {
  if (!input || input.length === 0) return "";
  // Normalize one message's content to plain text.
  const contentToText = (content) => {
    if (typeof content === "string") return content;
    if (content == null) return "";
    if (Array.isArray(content)) {
      // Keep bare strings and the text of `type: "text"` blocks, joined by
      // newlines (same rule as the getMessageText helper in the assertion
      // templates). Other block kinds contribute no prompt text.
      const parts = [];
      for (const block of content) {
        if (typeof block === "string") {
          parts.push(block);
        } else if (block && block.type === "text" && typeof block.text === "string") {
          parts.push(block.text);
        }
      }
      return parts.join("\n");
    }
    // Unexpected shapes: serialize rather than emit "[object Object]".
    return typeof content === "object" ? JSON.stringify(content) : String(content);
  };
  if (input.length === 1) return contentToText(input[0].content);
  return input.map((m) => `@[${m.role}]:\n${contentToText(m.content)}`).join("\n\n");
}
4874
4931
  function loadEnvFile(dir) {
4875
4932
  let current = resolve2(dir);
4876
4933
  while (true) {
@@ -4910,14 +4967,20 @@ var evalRunCommand2 = command({
4910
4967
  type: optional(number),
4911
4968
  long: "workers",
4912
4969
  description: "Parallel workers for target invocation (default: all tests)"
4970
+ }),
4971
+ experiment: option({
4972
+ type: optional(string),
4973
+ long: "experiment",
4974
+ description: "Experiment label (e.g. with_skills, without_skills)"
4913
4975
  })
4914
4976
  },
4915
- handler: async ({ evalPath, out, workers }) => {
4977
+ handler: async ({ evalPath, out, workers, experiment }) => {
4916
4978
  const resolvedEvalPath = resolve2(evalPath);
4917
4979
  const outDir = resolve2(out ?? buildDefaultRunDir(process.cwd()));
4918
4980
  const repoRoot = await findRepoRoot(dirname2(resolvedEvalPath));
4919
4981
  const evalDir = dirname2(resolvedEvalPath);
4920
- const suite = await loadTestSuite(resolvedEvalPath, repoRoot);
4982
+ const category = deriveCategory(relative2(process.cwd(), resolvedEvalPath));
4983
+ const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
4921
4984
  const tests = suite.tests;
4922
4985
  if (tests.length === 0) {
4923
4986
  console.error("No tests found in eval file.");
@@ -4958,15 +5021,13 @@ var evalRunCommand2 = command({
4958
5021
  const testDir = join4(outDir, ...subpath);
4959
5022
  await mkdir4(testDir, { recursive: true });
4960
5023
  testIds.push(test.id);
4961
- const inputText = test.question;
4962
5024
  const inputMessages = test.input.map((m) => ({
4963
5025
  role: m.role,
4964
5026
  content: typeof m.content === "string" ? m.content : m.content
4965
5027
  }));
4966
5028
  await writeJson2(join4(testDir, "input.json"), {
4967
- input_text: inputText,
4968
- input_messages: inputMessages,
4969
- file_paths: test.file_paths,
5029
+ input: inputMessages,
5030
+ input_files: test.file_paths,
4970
5031
  metadata: test.metadata ?? {}
4971
5032
  });
4972
5033
  if (targetInfo) {
@@ -4994,7 +5055,8 @@ var evalRunCommand2 = command({
4994
5055
  }
4995
5056
  await writeJson2(join4(outDir, "manifest.json"), {
4996
5057
  eval_file: resolvedEvalPath,
4997
- eval_set: evalSetName || void 0,
5058
+ dataset: evalSetName || void 0,
5059
+ experiment: experiment || void 0,
4998
5060
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
4999
5061
  target: { name: targetName, kind: targetKind },
5000
5062
  test_ids: testIds
@@ -5019,11 +5081,12 @@ var evalRunCommand2 = command({
5019
5081
  const timeoutMs = invoke.timeout_ms ?? 12e4;
5020
5082
  const promptFile = join4(tmpdir(), `agentv-prompt-${testId}-${Date.now()}.txt`);
5021
5083
  const outputFile = join4(tmpdir(), `agentv-output-${testId}-${Date.now()}.txt`);
5022
- await writeFile5(promptFile, inputData.input_text, "utf8");
5084
+ const inputText = extractInputText2(inputData.input);
5085
+ await writeFile5(promptFile, inputText, "utf8");
5023
5086
  let rendered = template;
5024
5087
  rendered = rendered.replace("{PROMPT_FILE}", promptFile);
5025
5088
  rendered = rendered.replace("{OUTPUT_FILE}", outputFile);
5026
- rendered = rendered.replace("{PROMPT}", inputData.input_text);
5089
+ rendered = rendered.replace("{PROMPT}", inputText);
5027
5090
  const start = performance.now();
5028
5091
  try {
5029
5092
  execSync(rendered, {
@@ -5100,14 +5163,13 @@ var evalRunCommand2 = command({
5100
5163
  for (const graderFile of graderFiles) {
5101
5164
  const graderConfig = JSON.parse(await readFile4(join4(codeGradersDir, graderFile), "utf8"));
5102
5165
  const graderName = graderConfig.name;
5166
+ const inputText = extractInputText2(inputData.input);
5103
5167
  const payload = JSON.stringify({
5104
5168
  output: [{ role: "assistant", content: responseText }],
5105
- input: inputData.input_messages,
5106
- question: inputData.input_text,
5169
+ input: inputData.input,
5107
5170
  criteria: "",
5108
5171
  expected_output: [],
5109
- reference_answer: "",
5110
- input_files: [],
5172
+ input_files: inputData.input_files ?? [],
5111
5173
  trace: null,
5112
5174
  token_usage: null,
5113
5175
  cost_usd: null,
@@ -5117,8 +5179,8 @@ var evalRunCommand2 = command({
5117
5179
  file_changes: null,
5118
5180
  workspace_path: null,
5119
5181
  config: graderConfig.config ?? null,
5120
- metadata: {},
5121
- input_text: inputData.input_text,
5182
+ metadata: inputData.metadata ?? {},
5183
+ input_text: inputText,
5122
5184
  output_text: responseText,
5123
5185
  expected_output_text: ""
5124
5186
  });
@@ -5306,7 +5368,7 @@ function toRawResult(result) {
5306
5368
  return {
5307
5369
  timestamp: result.timestamp,
5308
5370
  test_id: result.testId,
5309
- eval_set: result.eval_set,
5371
+ dataset: result.dataset,
5310
5372
  conversation_id: result.conversationId,
5311
5373
  score: result.score,
5312
5374
  assertions: result.assertions?.map((assertion) => ({
@@ -5429,7 +5491,7 @@ function loadOtlpTraceFile(filePath) {
5429
5491
  }
5430
5492
  return {
5431
5493
  test_id: stringAttr(rootAttrs.agentv_test_id) ?? stringAttr(rootAttrs.agentv_eval_id) ?? `trace-${index + 1}`,
5432
- eval_set: stringAttr(rootAttrs.agentv_eval_set),
5494
+ dataset: stringAttr(rootAttrs.agentv_dataset),
5433
5495
  target: stringAttr(rootAttrs.agentv_target),
5434
5496
  score,
5435
5497
  error: root.status?.code === 2 ? root.status.message : void 0,
@@ -6173,8 +6235,9 @@ var resultsCommand = subcommands({
6173
6235
  });
6174
6236
 
6175
6237
  // src/commands/results/serve.ts
6176
- import { existsSync as existsSync7, readFileSync as readFileSync8, writeFileSync as writeFileSync3 } from "node:fs";
6238
+ import { existsSync as existsSync7, readFileSync as readFileSync8, readdirSync as readdirSync3, statSync as statSync4, writeFileSync as writeFileSync3 } from "node:fs";
6177
6239
  import path9 from "node:path";
6240
+ import { fileURLToPath as fileURLToPath2 } from "node:url";
6178
6241
  import { Hono } from "hono";
6179
6242
  function feedbackPath(resultDir) {
6180
6243
  return path9.join(resultDir, "feedback.json");
@@ -6195,24 +6258,45 @@ function writeFeedback(cwd, data) {
6195
6258
  writeFileSync3(feedbackPath(cwd), `${JSON.stringify(data, null, 2)}
6196
6259
  `, "utf8");
6197
6260
  }
6198
- function createApp(results, resultDir, cwd, sourceFile) {
6261
+ function createApp(results, resultDir, cwd, sourceFile, options) {
6199
6262
  const searchDir = cwd ?? resultDir;
6200
6263
  const app2 = new Hono();
6264
+ const studioDistPath = options?.studioDir === false ? void 0 : options?.studioDir ?? resolveStudioDistDir();
6201
6265
  app2.get("/", (c3) => {
6266
+ if (studioDistPath) {
6267
+ const indexPath = path9.join(studioDistPath, "index.html");
6268
+ if (existsSync7(indexPath)) {
6269
+ return c3.html(readFileSync8(indexPath, "utf8"));
6270
+ }
6271
+ }
6202
6272
  return c3.html(generateServeHtml(results, sourceFile));
6203
6273
  });
6204
6274
  app2.get("/api/runs", (c3) => {
6205
6275
  const metas = listResultFiles(searchDir);
6206
6276
  return c3.json({
6207
- runs: metas.map((m) => ({
6208
- filename: m.filename,
6209
- path: m.path,
6210
- timestamp: m.timestamp,
6211
- test_count: m.testCount,
6212
- pass_rate: m.passRate,
6213
- avg_score: m.avgScore,
6214
- size_bytes: m.sizeBytes
6215
- }))
6277
+ runs: metas.map((m) => {
6278
+ let target;
6279
+ let experiment;
6280
+ try {
6281
+ const records = loadLightweightResults(m.path);
6282
+ if (records.length > 0) {
6283
+ target = records[0].target;
6284
+ experiment = records[0].experiment;
6285
+ }
6286
+ } catch {
6287
+ }
6288
+ return {
6289
+ filename: m.filename,
6290
+ path: m.path,
6291
+ timestamp: m.timestamp,
6292
+ test_count: m.testCount,
6293
+ pass_rate: m.passRate,
6294
+ avg_score: m.avgScore,
6295
+ size_bytes: m.sizeBytes,
6296
+ ...target && { target },
6297
+ ...experiment && { experiment }
6298
+ };
6299
+ })
6216
6300
  });
6217
6301
  });
6218
6302
  app2.get("/api/runs/:filename", (c3) => {
@@ -6272,8 +6356,393 @@ function createApp(results, resultDir, cwd, sourceFile) {
6272
6356
  writeFeedback(resultDir, existing);
6273
6357
  return c3.json(existing);
6274
6358
  });
6359
+ app2.get("/api/runs/:filename/datasets", (c3) => {
6360
+ const filename = c3.req.param("filename");
6361
+ const metas = listResultFiles(searchDir);
6362
+ const meta = metas.find((m) => m.filename === filename);
6363
+ if (!meta) {
6364
+ return c3.json({ error: "Run not found" }, 404);
6365
+ }
6366
+ try {
6367
+ const loaded = patchTestIds(loadManifestResults(meta.path));
6368
+ const datasetMap = /* @__PURE__ */ new Map();
6369
+ for (const r of loaded) {
6370
+ const ds = r.dataset ?? r.target ?? "default";
6371
+ const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
6372
+ entry.total++;
6373
+ if (r.score >= 1) entry.passed++;
6374
+ entry.scoreSum += r.score;
6375
+ datasetMap.set(ds, entry);
6376
+ }
6377
+ const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
6378
+ name,
6379
+ total: entry.total,
6380
+ passed: entry.passed,
6381
+ failed: entry.total - entry.passed,
6382
+ avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0
6383
+ }));
6384
+ return c3.json({ datasets });
6385
+ } catch {
6386
+ return c3.json({ error: "Failed to load datasets" }, 500);
6387
+ }
6388
+ });
6389
+ app2.get("/api/runs/:filename/categories", (c3) => {
6390
+ const filename = c3.req.param("filename");
6391
+ const metas = listResultFiles(searchDir);
6392
+ const meta = metas.find((m) => m.filename === filename);
6393
+ if (!meta) {
6394
+ return c3.json({ error: "Run not found" }, 404);
6395
+ }
6396
+ try {
6397
+ const loaded = patchTestIds(loadManifestResults(meta.path));
6398
+ const categoryMap = /* @__PURE__ */ new Map();
6399
+ for (const r of loaded) {
6400
+ const cat = r.category ?? DEFAULT_CATEGORY;
6401
+ const entry = categoryMap.get(cat) ?? {
6402
+ total: 0,
6403
+ passed: 0,
6404
+ scoreSum: 0,
6405
+ datasets: /* @__PURE__ */ new Set()
6406
+ };
6407
+ entry.total++;
6408
+ if (r.score >= 1) entry.passed++;
6409
+ entry.scoreSum += r.score;
6410
+ entry.datasets.add(r.dataset ?? r.target ?? "default");
6411
+ categoryMap.set(cat, entry);
6412
+ }
6413
+ const categories = [...categoryMap.entries()].map(([name, entry]) => ({
6414
+ name,
6415
+ total: entry.total,
6416
+ passed: entry.passed,
6417
+ failed: entry.total - entry.passed,
6418
+ avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0,
6419
+ dataset_count: entry.datasets.size
6420
+ }));
6421
+ return c3.json({ categories });
6422
+ } catch {
6423
+ return c3.json({ error: "Failed to load categories" }, 500);
6424
+ }
6425
+ });
6426
// GET /api/runs/:filename/categories/:category/datasets
// Per-dataset totals (total / passed / failed / avg_score) for the results of
// one run that belong to the requested category. A result counts as passed
// when its score is >= 1. Results without a category fall under
// DEFAULT_CATEGORY; results without a dataset are keyed by target, then
// "default".
app2.get("/api/runs/:filename/categories/:category/datasets", (c3) => {
  const filename = c3.req.param("filename");
  const rawCategory = c3.req.param("category");
  // The router may already hand back a decoded value. Decoding a value that
  // holds a literal "%" throws URIError, and this runs before the try block
  // below, so fall back to the raw value instead of failing the request.
  let category = rawCategory;
  try {
    category = decodeURIComponent(rawCategory);
  } catch {
    category = rawCategory;
  }
  const metas = listResultFiles(searchDir);
  const meta = metas.find((m) => m.filename === filename);
  if (!meta) {
    return c3.json({ error: "Run not found" }, 404);
  }
  try {
    const loaded = patchTestIds(loadManifestResults(meta.path));
    const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category);
    const datasetMap = /* @__PURE__ */ new Map();
    for (const r of filtered) {
      const ds = r.dataset ?? r.target ?? "default";
      const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
      entry.total++;
      if (r.score >= 1) entry.passed++;
      entry.scoreSum += r.score;
      datasetMap.set(ds, entry);
    }
    const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
      name,
      total: entry.total,
      passed: entry.passed,
      failed: entry.total - entry.passed,
      avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0
    }));
    return c3.json({ datasets });
  } catch {
    return c3.json({ error: "Failed to load datasets" }, 500);
  }
});
6458
+ app2.get("/api/runs/:filename/evals/:evalId", (c3) => {
6459
+ const filename = c3.req.param("filename");
6460
+ const evalId = c3.req.param("evalId");
6461
+ const metas = listResultFiles(searchDir);
6462
+ const meta = metas.find((m) => m.filename === filename);
6463
+ if (!meta) {
6464
+ return c3.json({ error: "Run not found" }, 404);
6465
+ }
6466
+ try {
6467
+ const loaded = patchTestIds(loadManifestResults(meta.path));
6468
+ const result = loaded.find((r) => r.testId === evalId);
6469
+ if (!result) {
6470
+ return c3.json({ error: "Eval not found" }, 404);
6471
+ }
6472
+ return c3.json({ eval: result });
6473
+ } catch {
6474
+ return c3.json({ error: "Failed to load eval" }, 500);
6475
+ }
6476
+ });
6477
+ app2.get("/api/index", (c3) => {
6478
+ const metas = listResultFiles(searchDir);
6479
+ const entries2 = metas.map((m) => {
6480
+ let totalCostUsd = 0;
6481
+ try {
6482
+ const loaded = patchTestIds(loadManifestResults(m.path));
6483
+ totalCostUsd = loaded.reduce((sum, r) => sum + (r.costUsd ?? 0), 0);
6484
+ } catch {
6485
+ }
6486
+ return {
6487
+ run_filename: m.filename,
6488
+ test_count: m.testCount,
6489
+ pass_rate: m.passRate,
6490
+ avg_score: m.avgScore,
6491
+ total_cost_usd: totalCostUsd,
6492
+ timestamp: m.timestamp
6493
+ };
6494
+ });
6495
+ return c3.json({ entries: entries2 });
6496
+ });
6497
/**
 * Recursively list a directory as a tree of `{ name, path, type }` nodes.
 *
 * Directories are listed before files; within each group entries are ordered
 * by `localeCompare` on their names. Directory nodes carry `type: "dir"` and a
 * `children` array; everything else is reported as `type: "file"`.
 *
 * @param dirPath    Directory to list.
 * @param relativeTo Base directory that every node's `path` is relative to.
 * @returns The node list, or [] when `dirPath` is missing or not a directory.
 */
function buildFileTree(dirPath, relativeTo) {
  const isListable = existsSync7(dirPath) && statSync4(dirPath).isDirectory();
  if (!isListable) return [];
  const dirents = readdirSync3(dirPath, { withFileTypes: true });
  dirents.sort((left, right) => {
    const leftIsDir = left.isDirectory();
    if (leftIsDir === right.isDirectory()) {
      return left.name.localeCompare(right.name);
    }
    return leftIsDir ? -1 : 1;
  });
  const nodes = [];
  for (const dirent of dirents) {
    const absolute = path9.join(dirPath, dirent.name);
    const fromBase = path9.relative(relativeTo, absolute);
    nodes.push(
      dirent.isDirectory()
        ? {
            name: dirent.name,
            path: fromBase,
            type: "dir",
            children: buildFileTree(absolute, relativeTo)
          }
        : { name: dirent.name, path: fromBase, type: "file" }
    );
  }
  return nodes;
}
6519
/**
 * Map a file path to a language identifier based on its extension.
 *
 * The extension is compared case-insensitively. Paths with no extension or an
 * unrecognized one yield "plaintext".
 *
 * @param filePath Path or file name to classify.
 * @returns The language identifier for the file's extension.
 */
function inferLanguage(filePath) {
  switch (path9.extname(filePath).toLowerCase()) {
    case ".json":
    case ".jsonl":
      return "json";
    case ".ts":
    case ".tsx":
      return "typescript";
    case ".js":
    case ".jsx":
      return "javascript";
    case ".md":
      return "markdown";
    case ".yaml":
    case ".yml":
      return "yaml";
    case ".py":
      return "python";
    case ".sh":
    case ".bash":
      return "shell";
    case ".css":
      return "css";
    case ".html":
      return "html";
    case ".xml":
    case ".svg":
      return "xml";
    case ".toml":
      return "toml";
    case ".diff":
    case ".patch":
      return "diff";
    case ".log":
    case ".txt":
    default:
      return "plaintext";
  }
}
6546
// GET /api/runs/:filename/evals/:evalId/files
// Returns the file tree of the artifact directory for one eval of one run.
// The directory is the deepest common ancestor of the record's known artifact
// paths (grading, timing, input, output, response), which are taken relative
// to the directory that holds the run's manifest file. Node paths in the
// response are relative to that manifest directory.
app2.get("/api/runs/:filename/evals/:evalId/files", (c3) => {
  const filename = c3.req.param("filename");
  const evalId = c3.req.param("evalId");
  const metas = listResultFiles(searchDir);
  const meta = metas.find((m) => m.filename === filename);
  if (!meta) {
    return c3.json({ error: "Run not found" }, 404);
  }
  // True when `child` is `parent` itself or lies beneath it. Compares whole
  // path segments, so "a/bc" is not treated as being inside "a/b".
  const isSameOrInside = (parent, child) => {
    const rel = path9.relative(parent, child);
    return rel === "" || rel !== ".." && !rel.startsWith(`..${path9.sep}`) && !path9.isAbsolute(rel);
  };
  try {
    const content = readFileSync8(meta.path, "utf8");
    const records = parseResultManifest(content);
    const record = records.find((r) => (r.test_id ?? r.eval_id) === evalId);
    if (!record) {
      return c3.json({ error: "Eval not found" }, 404);
    }
    const baseDir = path9.dirname(meta.path);
    const knownPaths = [
      record.grading_path,
      record.timing_path,
      record.input_path,
      record.output_path,
      record.response_path
    ].filter((p) => !!p);
    if (knownPaths.length === 0) {
      return c3.json({ files: [] });
    }
    // Work on absolute directories so the upward walk has a fixed point (the
    // filesystem root). On relative paths, dirname(".") is "." again, so
    // artifact directories with no shared prefix would loop forever.
    const artifactDirs = knownPaths.map(
      (p) => path9.resolve(path9.join(baseDir, path9.dirname(p)))
    );
    let artifactAbsDir = artifactDirs[0];
    for (const dir of artifactDirs) {
      while (!isSameOrInside(artifactAbsDir, dir)) {
        const parent = path9.dirname(artifactAbsDir);
        if (parent === artifactAbsDir) break; // reached the root; nothing higher to try
        artifactAbsDir = parent;
      }
    }
    const files = buildFileTree(artifactAbsDir, baseDir);
    return c3.json({ files });
  } catch {
    return c3.json({ error: "Failed to load file tree" }, 500);
  }
});
6586
+ app2.get("/api/runs/:filename/evals/:evalId/files/*", (c3) => {
6587
+ const filename = c3.req.param("filename");
6588
+ const evalId = c3.req.param("evalId");
6589
+ const metas = listResultFiles(searchDir);
6590
+ const meta = metas.find((m) => m.filename === filename);
6591
+ if (!meta) {
6592
+ return c3.json({ error: "Run not found" }, 404);
6593
+ }
6594
+ const requestPath = c3.req.path;
6595
+ const prefix = `/api/runs/${filename}/evals/${evalId}/files/`;
6596
+ const filePath = requestPath.slice(prefix.length);
6597
+ if (!filePath) {
6598
+ return c3.json({ error: "No file path specified" }, 400);
6599
+ }
6600
+ const baseDir = path9.dirname(meta.path);
6601
+ const absolutePath = path9.resolve(baseDir, filePath);
6602
+ if (!absolutePath.startsWith(path9.resolve(baseDir) + path9.sep) && absolutePath !== path9.resolve(baseDir)) {
6603
+ return c3.json({ error: "Path traversal not allowed" }, 403);
6604
+ }
6605
+ if (!existsSync7(absolutePath) || !statSync4(absolutePath).isFile()) {
6606
+ return c3.json({ error: "File not found" }, 404);
6607
+ }
6608
+ try {
6609
+ const fileContent = readFileSync8(absolutePath, "utf8");
6610
+ const language = inferLanguage(absolutePath);
6611
+ return c3.json({ content: fileContent, language });
6612
+ } catch {
6613
+ return c3.json({ error: "Failed to read file" }, 500);
6614
+ }
6615
+ });
6616
+ app2.get("/api/experiments", (c3) => {
6617
+ const metas = listResultFiles(searchDir);
6618
+ const experimentMap = /* @__PURE__ */ new Map();
6619
+ for (const m of metas) {
6620
+ try {
6621
+ const records = loadLightweightResults(m.path);
6622
+ for (const r of records) {
6623
+ const experiment = r.experiment ?? "default";
6624
+ const entry = experimentMap.get(experiment) ?? {
6625
+ targets: /* @__PURE__ */ new Set(),
6626
+ runFilenames: /* @__PURE__ */ new Set(),
6627
+ evalCount: 0,
6628
+ passedCount: 0,
6629
+ lastTimestamp: ""
6630
+ };
6631
+ entry.runFilenames.add(m.filename);
6632
+ if (r.target) entry.targets.add(r.target);
6633
+ entry.evalCount++;
6634
+ if (r.score >= 1) entry.passedCount++;
6635
+ if (r.timestamp && r.timestamp > entry.lastTimestamp) {
6636
+ entry.lastTimestamp = r.timestamp;
6637
+ }
6638
+ experimentMap.set(experiment, entry);
6639
+ }
6640
+ } catch {
6641
+ }
6642
+ }
6643
+ const experiments = [...experimentMap.entries()].map(([name, entry]) => ({
6644
+ name,
6645
+ run_count: entry.runFilenames.size,
6646
+ target_count: entry.targets.size,
6647
+ eval_count: entry.evalCount,
6648
+ passed_count: entry.passedCount,
6649
+ pass_rate: entry.evalCount > 0 ? entry.passedCount / entry.evalCount : 0,
6650
+ last_run: entry.lastTimestamp || null
6651
+ }));
6652
+ return c3.json({ experiments });
6653
+ });
6654
+ app2.get("/api/targets", (c3) => {
6655
+ const metas = listResultFiles(searchDir);
6656
+ const targetMap = /* @__PURE__ */ new Map();
6657
+ for (const m of metas) {
6658
+ try {
6659
+ const records = loadLightweightResults(m.path);
6660
+ for (const r of records) {
6661
+ const target = r.target ?? "default";
6662
+ const entry = targetMap.get(target) ?? {
6663
+ experiments: /* @__PURE__ */ new Set(),
6664
+ runFilenames: /* @__PURE__ */ new Set(),
6665
+ evalCount: 0,
6666
+ passedCount: 0
6667
+ };
6668
+ entry.runFilenames.add(m.filename);
6669
+ if (r.experiment) entry.experiments.add(r.experiment);
6670
+ entry.evalCount++;
6671
+ if (r.score >= 1) entry.passedCount++;
6672
+ targetMap.set(target, entry);
6673
+ }
6674
+ } catch {
6675
+ }
6676
+ }
6677
+ const targets = [...targetMap.entries()].map(([name, entry]) => ({
6678
+ name,
6679
+ run_count: entry.runFilenames.size,
6680
+ experiment_count: entry.experiments.size,
6681
+ eval_count: entry.evalCount,
6682
+ passed_count: entry.passedCount,
6683
+ pass_rate: entry.evalCount > 0 ? entry.passedCount / entry.evalCount : 0
6684
+ }));
6685
+ return c3.json({ targets });
6686
+ });
6687
// Static hosting for the Studio bundle. Registered only when a bundle
// directory is configured or was resolved.
if (studioDistPath) {
  // Serve hashed build assets with a long-lived immutable cache header.
  app2.get("/assets/*", (c3) => {
    const assetPath = c3.req.path;
    const studioRoot = path9.resolve(studioDistPath);
    const filePath = path9.join(studioRoot, assetPath);
    // join() collapses ".." segments, so a crafted request path could resolve
    // outside the bundle directory. Only serve paths that stay strictly inside.
    const rel = path9.relative(studioRoot, filePath);
    const escapesRoot = rel === "" || rel === ".." || rel.startsWith(`..${path9.sep}`) || path9.isAbsolute(rel);
    // Also require a regular file: reading a directory would throw EISDIR.
    if (escapesRoot || !existsSync7(filePath) || !statSync4(filePath).isFile()) {
      return c3.notFound();
    }
    const content = readFileSync8(filePath);
    const ext = path9.extname(filePath);
    const mimeTypes = {
      ".js": "application/javascript",
      ".css": "text/css",
      ".html": "text/html",
      ".json": "application/json",
      ".svg": "image/svg+xml",
      ".png": "image/png",
      ".woff2": "font/woff2",
      ".woff": "font/woff"
    };
    const contentType = mimeTypes[ext] ?? "application/octet-stream";
    return new Response(content, {
      headers: {
        "Content-Type": contentType,
        "Cache-Control": "public, max-age=31536000, immutable"
      }
    });
  });
  // Catch-all: unknown /api/ paths get a JSON 404; any other path gets the
  // bundle's index.html so client-side routes can load.
  app2.get("*", (c3) => {
    if (c3.req.path.startsWith("/api/")) {
      return c3.json({ error: "Not found" }, 404);
    }
    const indexPath = path9.join(studioDistPath, "index.html");
    if (existsSync7(indexPath)) {
      return c3.html(readFileSync8(indexPath, "utf8"));
    }
    return c3.notFound();
  });
}
6275
6725
  return app2;
6276
6726
  }
6727
/**
 * Locate the built Studio bundle on disk.
 *
 * Probes a fixed list of locations relative to this module's directory, in
 * order, and returns the first one that exists and contains an `index.html`.
 *
 * @returns The bundle directory, or undefined when no candidate qualifies.
 */
function resolveStudioDistDir() {
  let moduleDir;
  if (typeof __dirname !== "undefined") {
    moduleDir = __dirname;
  } else {
    moduleDir = path9.dirname(fileURLToPath2(import.meta.url));
  }
  const relativeLocations = [
    // src/commands/results/ -> sibling apps/studio/dist
    "../../../../studio/dist",
    // dist/ -> sibling apps/studio/dist (monorepo dev)
    "../../studio/dist",
    // bundled inside the CLI dist (published package)
    "../studio",
    // dist/ in monorepo root context
    "../../../apps/studio/dist"
  ];
  const hasIndexHtml = (dir) => existsSync7(dir) && existsSync7(path9.join(dir, "index.html"));
  return relativeLocations.map((location) => path9.resolve(moduleDir, location)).find((dir) => hasIndexHtml(dir));
}
6277
6746
  function stripHeavyFields(results) {
6278
6747
  return results.map((r) => {
6279
6748
  const { requests, trace, ...rest } = r;
@@ -6956,8 +7425,8 @@ var SERVE_SCRIPT = `
6956
7425
  })();
6957
7426
  `;
6958
7427
  var resultsServeCommand = command({
6959
- name: "serve",
6960
- description: "Start a local HTTP server to review evaluation results",
7428
+ name: "studio",
7429
+ description: "Start AgentV Studio \u2014 a local dashboard for reviewing evaluation results",
6961
7430
  args: {
6962
7431
  source: positional({
6963
7432
  type: optional(string),
@@ -7594,7 +8063,7 @@ function formatResultDetail(result, index, tree) {
7594
8063
  }
7595
8064
  const scoreColor = result.score >= 0.9 ? c2.green : result.score >= 0.5 ? c2.yellow : c2.red;
7596
8065
  lines.push(
7597
- `${c2.bold}${testId}${c2.reset} ${scoreColor}${formatScore(result.score)}${c2.reset}${result.target ? ` ${c2.dim}target: ${result.target}${c2.reset}` : ""}${result.eval_set ? ` ${c2.dim}eval-set: ${result.eval_set}${c2.reset}` : ""}`
8066
+ `${c2.bold}${testId}${c2.reset} ${scoreColor}${formatScore(result.score)}${c2.reset}${result.target ? ` ${c2.dim}target: ${result.target}${c2.reset}` : ""}${result.dataset ? ` ${c2.dim}dataset: ${result.dataset}${c2.reset}` : ""}`
7598
8067
  );
7599
8068
  if (result.error) {
7600
8069
  lines.push(` ${c2.red}Error: ${result.error}${c2.reset}`);
@@ -7768,8 +8237,8 @@ function groupResults(results, groupBy2) {
7768
8237
  case "target":
7769
8238
  key = result.target ?? "unknown";
7770
8239
  break;
7771
- case "eval-set":
7772
- key = result.eval_set ?? "unknown";
8240
+ case "dataset":
8241
+ key = result.dataset ?? "unknown";
7773
8242
  break;
7774
8243
  case "test-id":
7775
8244
  key = result.test_id ?? result.eval_id ?? "unknown";
@@ -8482,7 +8951,9 @@ var app = subcommands({
8482
8951
  pipeline: pipelineCommand,
8483
8952
  results: resultsCommand,
8484
8953
  self: selfCommand,
8954
+ studio: resultsServeCommand,
8485
8955
  serve: resultsServeCommand,
8956
+ // hidden alias for backward compatibility
8486
8957
  trace: traceCommand,
8487
8958
  transpile: transpileCommand,
8488
8959
  trim: trimCommand,
@@ -8501,6 +8972,7 @@ var TOP_LEVEL_COMMANDS = /* @__PURE__ */ new Set([
8501
8972
  "results",
8502
8973
  "self",
8503
8974
  "serve",
8975
+ "studio",
8504
8976
  "trace",
8505
8977
  "transpile",
8506
8978
  "trim",
@@ -8547,4 +9019,4 @@ export {
8547
9019
  preprocessArgv,
8548
9020
  runCli
8549
9021
  };
8550
- //# sourceMappingURL=chunk-CQRWNXVG.js.map
9022
+ //# sourceMappingURL=chunk-E3VSJJI4.js.map