agentv 3.14.6 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,7 @@ import {
10
10
  loadManifestResults,
11
11
  loadRunCache,
12
12
  package_default,
13
+ parseResultManifest,
13
14
  resolveEvalPaths,
14
15
  resolveExistingRunPrimaryPath,
15
16
  resolveResultSourcePath,
@@ -23,9 +24,11 @@ import {
23
24
  validateFileReferences,
24
25
  validateTargetsFile,
25
26
  writeArtifactsFromResults
26
- } from "./chunk-Y25VL7PX.js";
27
+ } from "./chunk-4Z326WWF.js";
27
28
  import {
29
+ DEFAULT_CATEGORY,
28
30
  createBuiltinRegistry,
31
+ deriveCategory,
29
32
  executeScript,
30
33
  getAgentvHome,
31
34
  getOutputFilenames,
@@ -40,7 +43,7 @@ import {
40
43
  toSnakeCaseDeep as toSnakeCaseDeep2,
41
44
  transpileEvalYamlFile,
42
45
  trimBaselineResult
43
- } from "./chunk-ELQEFMGO.js";
46
+ } from "./chunk-XEAW7OQT.js";
44
47
  import {
45
48
  __commonJS,
46
49
  __esm,
@@ -3479,9 +3482,23 @@ var ASSERTION_TEMPLATES = {
3479
3482
  default: `#!/usr/bin/env bun
3480
3483
  import { defineAssertion } from '@agentv/eval';
3481
3484
 
3482
- export default defineAssertion(({ outputText }) => {
3485
+ /** Extract text from the last message with the given role. */
3486
+ function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string {
3487
+ for (let i = messages.length - 1; i >= 0; i--) {
3488
+ const msg = messages[i];
3489
+ if (msg.role !== role) continue;
3490
+ if (typeof msg.content === 'string') return msg.content;
3491
+ if (Array.isArray(msg.content)) {
3492
+ return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n');
3493
+ }
3494
+ }
3495
+ return '';
3496
+ }
3497
+
3498
+ export default defineAssertion(({ output }) => {
3483
3499
  // TODO: Implement your assertion logic
3484
- const pass = outputText.length > 0;
3500
+ const text = getMessageText(output ?? []);
3501
+ const pass = text.length > 0;
3485
3502
  return {
3486
3503
  pass,
3487
3504
  reasoning: pass ? 'Output has content' : 'Output is empty',
@@ -3491,9 +3508,23 @@ export default defineAssertion(({ outputText }) => {
3491
3508
  score: `#!/usr/bin/env bun
3492
3509
  import { defineAssertion } from '@agentv/eval';
3493
3510
 
3494
- export default defineAssertion(({ outputText }) => {
3511
+ /** Extract text from the last message with the given role. */
3512
+ function getMessageText(messages: Array<{ role: string; content?: unknown }>, role = 'assistant'): string {
3513
+ for (let i = messages.length - 1; i >= 0; i--) {
3514
+ const msg = messages[i];
3515
+ if (msg.role !== role) continue;
3516
+ if (typeof msg.content === 'string') return msg.content;
3517
+ if (Array.isArray(msg.content)) {
3518
+ return msg.content.filter((b: any) => b.type === 'text').map((b: any) => b.text).join('\\n');
3519
+ }
3520
+ }
3521
+ return '';
3522
+ }
3523
+
3524
+ export default defineAssertion(({ output }) => {
3495
3525
  // TODO: Implement your scoring logic (0.0 to 1.0)
3496
- const score = outputText.length > 0 ? 1.0 : 0.0;
3526
+ const text = getMessageText(output ?? []);
3527
+ const score = text.length > 0 ? 1.0 : 0.0;
3497
3528
  return {
3498
3529
  pass: score >= 0.5,
3499
3530
  score,
@@ -4186,7 +4217,7 @@ var evalRunCommand = command({
4186
4217
  },
4187
4218
  handler: async (args) => {
4188
4219
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
4189
- const { launchInteractiveWizard } = await import("./interactive-5ESM5DWV.js");
4220
+ const { launchInteractiveWizard } = await import("./interactive-7ZYS6IOC.js");
4190
4221
  await launchInteractiveWizard();
4191
4222
  return;
4192
4223
  }
@@ -4421,7 +4452,8 @@ var evalBenchCommand = command({
4421
4452
  const manifest = JSON.parse(await readFile(join(exportDir, "manifest.json"), "utf8"));
4422
4453
  const testIds = manifest.test_ids;
4423
4454
  const targetName = manifest.target?.name ?? "unknown";
4424
- const evalSet = manifest.eval_set ?? "";
4455
+ const evalSet = manifest.dataset ?? "";
4456
+ const experiment = manifest.experiment;
4425
4457
  const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
4426
4458
  let stdinData;
4427
4459
  if (llmScoresPath) {
@@ -4531,7 +4563,8 @@ var evalBenchCommand = command({
4531
4563
  JSON.stringify({
4532
4564
  timestamp: manifest.timestamp,
4533
4565
  test_id: testId,
4534
- eval_set: evalSet || void 0,
4566
+ dataset: evalSet || void 0,
4567
+ experiment: experiment || void 0,
4535
4568
  score: Math.round(weightedScore * 1e3) / 1e3,
4536
4569
  target: targetName,
4537
4570
  scores,
@@ -4553,6 +4586,7 @@ var evalBenchCommand = command({
4553
4586
  metadata: {
4554
4587
  eval_file: manifest.eval_file,
4555
4588
  timestamp: manifest.timestamp,
4589
+ experiment: experiment || void 0,
4556
4590
  targets: [targetName],
4557
4591
  tests_run: testIds
4558
4592
  },
@@ -4594,6 +4628,12 @@ function computeStats(values) {
4594
4628
  // src/commands/pipeline/grade.ts
4595
4629
  import { mkdir as mkdir2, readFile as readFile2, readdir as readdir2, writeFile as writeFile3 } from "node:fs/promises";
4596
4630
  import { join as join2 } from "node:path";
4631
+ function extractInputText(input) {
4632
+ if (!input || input.length === 0) return "";
4633
+ if (input.length === 1) return input[0].content;
4634
+ return input.map((m) => `@[${m.role}]:
4635
+ ${m.content}`).join("\n\n");
4636
+ }
4597
4637
  var evalGradeCommand = command({
4598
4638
  name: "grade",
4599
4639
  description: "Run code-grader assertions on responses in an export directory",
@@ -4608,7 +4648,7 @@ var evalGradeCommand = command({
4608
4648
  const manifestPath = join2(exportDir, "manifest.json");
4609
4649
  const manifest = JSON.parse(await readFile2(manifestPath, "utf8"));
4610
4650
  const testIds = manifest.test_ids;
4611
- const evalSet = manifest.eval_set ?? "";
4651
+ const evalSet = manifest.dataset ?? "";
4612
4652
  const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
4613
4653
  let totalGraders = 0;
4614
4654
  let totalPassed = 0;
@@ -4630,14 +4670,13 @@ var evalGradeCommand = command({
4630
4670
  for (const graderFile of graderFiles) {
4631
4671
  const graderConfig = JSON.parse(await readFile2(join2(codeGradersDir, graderFile), "utf8"));
4632
4672
  const graderName = graderConfig.name;
4673
+ const inputText = extractInputText(inputData.input);
4633
4674
  const payload = JSON.stringify({
4634
4675
  output: [{ role: "assistant", content: responseText }],
4635
- input: inputData.input_messages,
4636
- question: inputData.input_text,
4676
+ input: inputData.input,
4637
4677
  criteria: "",
4638
4678
  expected_output: [],
4639
- reference_answer: "",
4640
- input_files: [],
4679
+ input_files: inputData.input_files ?? [],
4641
4680
  trace: null,
4642
4681
  token_usage: null,
4643
4682
  cost_usd: null,
@@ -4647,8 +4686,8 @@ var evalGradeCommand = command({
4647
4686
  file_changes: null,
4648
4687
  workspace_path: null,
4649
4688
  config: graderConfig.config ?? null,
4650
- metadata: {},
4651
- input_text: inputData.input_text,
4689
+ metadata: inputData.metadata ?? {},
4690
+ input_text: inputText,
4652
4691
  output_text: responseText,
4653
4692
  expected_output_text: ""
4654
4693
  });
@@ -4706,7 +4745,7 @@ var evalGradeCommand = command({
4706
4745
  // src/commands/pipeline/input.ts
4707
4746
  import { readFile as readFile3 } from "node:fs/promises";
4708
4747
  import { mkdir as mkdir3, writeFile as writeFile4 } from "node:fs/promises";
4709
- import { dirname, join as join3, resolve } from "node:path";
4748
+ import { dirname, join as join3, relative, resolve } from "node:path";
4710
4749
  var evalInputCommand = command({
4711
4750
  name: "input",
4712
4751
  description: "Extract eval inputs, target commands, and grader prompts for subagent-mode runs",
@@ -4720,14 +4759,20 @@ var evalInputCommand = command({
4720
4759
  type: optional(string),
4721
4760
  long: "out",
4722
4761
  description: "Output directory for extracted inputs (default: .agentv/results/runs/<timestamp>)"
4762
+ }),
4763
+ experiment: option({
4764
+ type: optional(string),
4765
+ long: "experiment",
4766
+ description: "Experiment label (e.g. with_skills, without_skills)"
4723
4767
  })
4724
4768
  },
4725
- handler: async ({ evalPath, out }) => {
4769
+ handler: async ({ evalPath, out, experiment }) => {
4726
4770
  const resolvedEvalPath = resolve(evalPath);
4727
4771
  const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
4728
4772
  const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
4729
4773
  const evalDir = dirname(resolvedEvalPath);
4730
- const suite = await loadTestSuite(resolvedEvalPath, repoRoot);
4774
+ const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
4775
+ const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
4731
4776
  const tests = suite.tests;
4732
4777
  if (tests.length === 0) {
4733
4778
  console.error("No tests found in eval file.");
@@ -4736,6 +4781,7 @@ var evalInputCommand = command({
4736
4781
  let targetInfo = null;
4737
4782
  let targetName = "agent";
4738
4783
  let targetKind = "agent";
4784
+ let subagentModeAllowed = true;
4739
4785
  try {
4740
4786
  const selection = await selectTarget({
4741
4787
  testFilePath: resolvedEvalPath,
@@ -4748,15 +4794,20 @@ var evalInputCommand = command({
4748
4794
  env: process.env
4749
4795
  });
4750
4796
  targetName = selection.targetName;
4751
- if (selection.resolvedTarget.kind === "cli") {
4797
+ const resolved = selection.resolvedTarget;
4798
+ subagentModeAllowed = resolved.subagentModeAllowed !== false;
4799
+ if (resolved.kind === "cli") {
4752
4800
  targetKind = "cli";
4753
- const config = selection.resolvedTarget.config;
4801
+ subagentModeAllowed = false;
4802
+ const config = resolved.config;
4754
4803
  targetInfo = {
4755
4804
  kind: "cli",
4756
4805
  command: config.command,
4757
4806
  cwd: config.cwd ?? evalDir,
4758
4807
  timeoutMs: config.timeoutMs ?? 3e4
4759
4808
  };
4809
+ } else {
4810
+ targetKind = resolved.kind;
4760
4811
  }
4761
4812
  } catch {
4762
4813
  }
@@ -4768,15 +4819,13 @@ var evalInputCommand = command({
4768
4819
  const testDir = join3(outDir, ...subpath);
4769
4820
  await mkdir3(testDir, { recursive: true });
4770
4821
  testIds.push(test.id);
4771
- const inputText = test.question;
4772
4822
  const inputMessages = test.input.map((m) => ({
4773
4823
  role: m.role,
4774
4824
  content: typeof m.content === "string" ? m.content : m.content
4775
4825
  }));
4776
4826
  await writeJson(join3(testDir, "input.json"), {
4777
- input_text: inputText,
4778
- input_messages: inputMessages,
4779
- file_paths: test.file_paths,
4827
+ input: inputMessages,
4828
+ input_files: test.file_paths,
4780
4829
  metadata: test.metadata ?? {}
4781
4830
  });
4782
4831
  if (targetInfo) {
@@ -4804,11 +4853,13 @@ var evalInputCommand = command({
4804
4853
  }
4805
4854
  await writeJson(join3(outDir, "manifest.json"), {
4806
4855
  eval_file: resolvedEvalPath,
4807
- eval_set: evalSetName || void 0,
4856
+ dataset: evalSetName || void 0,
4857
+ experiment: experiment || void 0,
4808
4858
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
4809
4859
  target: {
4810
4860
  name: targetName,
4811
- kind: targetKind
4861
+ kind: targetKind,
4862
+ subagent_mode_allowed: subagentModeAllowed
4812
4863
  },
4813
4864
  test_ids: testIds
4814
4865
  });
@@ -4870,7 +4921,13 @@ import { execSync } from "node:child_process";
4870
4921
  import { existsSync as existsSync3, readFileSync as readFileSync4, unlinkSync } from "node:fs";
4871
4922
  import { mkdir as mkdir4, readFile as readFile4, readdir as readdir3, writeFile as writeFile5 } from "node:fs/promises";
4872
4923
  import { tmpdir } from "node:os";
4873
- import { dirname as dirname2, join as join4, resolve as resolve2 } from "node:path";
4924
+ import { dirname as dirname2, join as join4, relative as relative2, resolve as resolve2 } from "node:path";
4925
+ function extractInputText2(input) {
4926
+ if (!input || input.length === 0) return "";
4927
+ if (input.length === 1) return input[0].content;
4928
+ return input.map((m) => `@[${m.role}]:
4929
+ ${m.content}`).join("\n\n");
4930
+ }
4874
4931
  function loadEnvFile(dir) {
4875
4932
  let current = resolve2(dir);
4876
4933
  while (true) {
@@ -4910,14 +4967,25 @@ var evalRunCommand2 = command({
4910
4967
  type: optional(number),
4911
4968
  long: "workers",
4912
4969
  description: "Parallel workers for target invocation (default: all tests)"
4970
+ }),
4971
+ experiment: option({
4972
+ type: optional(string),
4973
+ long: "experiment",
4974
+ description: "Experiment label (e.g. with_skills, without_skills)"
4975
+ }),
4976
+ graderType: option({
4977
+ type: optional(oneOf(["code", "none"])),
4978
+ long: "grader-type",
4979
+ description: 'Which grading phase to run: "code" runs code-graders inline, omit to skip grading (use pipeline grade separately)'
4913
4980
  })
4914
4981
  },
4915
- handler: async ({ evalPath, out, workers }) => {
4982
+ handler: async ({ evalPath, out, workers, experiment, graderType }) => {
4916
4983
  const resolvedEvalPath = resolve2(evalPath);
4917
4984
  const outDir = resolve2(out ?? buildDefaultRunDir(process.cwd()));
4918
4985
  const repoRoot = await findRepoRoot(dirname2(resolvedEvalPath));
4919
4986
  const evalDir = dirname2(resolvedEvalPath);
4920
- const suite = await loadTestSuite(resolvedEvalPath, repoRoot);
4987
+ const category = deriveCategory(relative2(process.cwd(), resolvedEvalPath));
4988
+ const suite = await loadTestSuite(resolvedEvalPath, repoRoot, { category });
4921
4989
  const tests = suite.tests;
4922
4990
  if (tests.length === 0) {
4923
4991
  console.error("No tests found in eval file.");
@@ -4958,15 +5026,13 @@ var evalRunCommand2 = command({
4958
5026
  const testDir = join4(outDir, ...subpath);
4959
5027
  await mkdir4(testDir, { recursive: true });
4960
5028
  testIds.push(test.id);
4961
- const inputText = test.question;
4962
5029
  const inputMessages = test.input.map((m) => ({
4963
5030
  role: m.role,
4964
5031
  content: typeof m.content === "string" ? m.content : m.content
4965
5032
  }));
4966
5033
  await writeJson2(join4(testDir, "input.json"), {
4967
- input_text: inputText,
4968
- input_messages: inputMessages,
4969
- file_paths: test.file_paths,
5034
+ input: inputMessages,
5035
+ input_files: test.file_paths,
4970
5036
  metadata: test.metadata ?? {}
4971
5037
  });
4972
5038
  if (targetInfo) {
@@ -4994,7 +5060,8 @@ var evalRunCommand2 = command({
4994
5060
  }
4995
5061
  await writeJson2(join4(outDir, "manifest.json"), {
4996
5062
  eval_file: resolvedEvalPath,
4997
- eval_set: evalSetName || void 0,
5063
+ dataset: evalSetName || void 0,
5064
+ experiment: experiment || void 0,
4998
5065
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
4999
5066
  target: { name: targetName, kind: targetKind },
5000
5067
  test_ids: testIds
@@ -5019,11 +5086,12 @@ var evalRunCommand2 = command({
5019
5086
  const timeoutMs = invoke.timeout_ms ?? 12e4;
5020
5087
  const promptFile = join4(tmpdir(), `agentv-prompt-${testId}-${Date.now()}.txt`);
5021
5088
  const outputFile = join4(tmpdir(), `agentv-output-${testId}-${Date.now()}.txt`);
5022
- await writeFile5(promptFile, inputData.input_text, "utf8");
5089
+ const inputText = extractInputText2(inputData.input);
5090
+ await writeFile5(promptFile, inputText, "utf8");
5023
5091
  let rendered = template;
5024
5092
  rendered = rendered.replace("{PROMPT_FILE}", promptFile);
5025
5093
  rendered = rendered.replace("{OUTPUT_FILE}", outputFile);
5026
- rendered = rendered.replace("{PROMPT}", inputData.input_text);
5094
+ rendered = rendered.replace("{PROMPT}", inputText);
5027
5095
  const start = performance.now();
5028
5096
  try {
5029
5097
  execSync(rendered, {
@@ -5080,6 +5148,12 @@ var evalRunCommand2 = command({
5080
5148
  } else {
5081
5149
  console.log("Subagent-as-target mode \u2014 skipping CLI invocation.");
5082
5150
  }
5151
+ if (graderType !== "code") {
5152
+ console.log(`
5153
+ Done. Results in ${outDir}`);
5154
+ console.log("To run code graders: agentv pipeline grade <run-dir> (or re-run with --grader-type code)");
5155
+ return;
5156
+ }
5083
5157
  let totalGraders = 0;
5084
5158
  let totalPassed = 0;
5085
5159
  for (const testId of testIds) {
@@ -5100,14 +5174,13 @@ var evalRunCommand2 = command({
5100
5174
  for (const graderFile of graderFiles) {
5101
5175
  const graderConfig = JSON.parse(await readFile4(join4(codeGradersDir, graderFile), "utf8"));
5102
5176
  const graderName = graderConfig.name;
5177
+ const inputText = extractInputText2(inputData.input);
5103
5178
  const payload = JSON.stringify({
5104
5179
  output: [{ role: "assistant", content: responseText }],
5105
- input: inputData.input_messages,
5106
- question: inputData.input_text,
5180
+ input: inputData.input,
5107
5181
  criteria: "",
5108
5182
  expected_output: [],
5109
- reference_answer: "",
5110
- input_files: [],
5183
+ input_files: inputData.input_files ?? [],
5111
5184
  trace: null,
5112
5185
  token_usage: null,
5113
5186
  cost_usd: null,
@@ -5117,8 +5190,8 @@ var evalRunCommand2 = command({
5117
5190
  file_changes: null,
5118
5191
  workspace_path: null,
5119
5192
  config: graderConfig.config ?? null,
5120
- metadata: {},
5121
- input_text: inputData.input_text,
5193
+ metadata: inputData.metadata ?? {},
5194
+ input_text: inputText,
5122
5195
  output_text: responseText,
5123
5196
  expected_output_text: ""
5124
5197
  });
@@ -5306,7 +5379,7 @@ function toRawResult(result) {
5306
5379
  return {
5307
5380
  timestamp: result.timestamp,
5308
5381
  test_id: result.testId,
5309
- eval_set: result.eval_set,
5382
+ dataset: result.dataset,
5310
5383
  conversation_id: result.conversationId,
5311
5384
  score: result.score,
5312
5385
  assertions: result.assertions?.map((assertion) => ({
@@ -5429,7 +5502,7 @@ function loadOtlpTraceFile(filePath) {
5429
5502
  }
5430
5503
  return {
5431
5504
  test_id: stringAttr(rootAttrs.agentv_test_id) ?? stringAttr(rootAttrs.agentv_eval_id) ?? `trace-${index + 1}`,
5432
- eval_set: stringAttr(rootAttrs.agentv_eval_set),
5505
+ dataset: stringAttr(rootAttrs.agentv_dataset),
5433
5506
  target: stringAttr(rootAttrs.agentv_target),
5434
5507
  score,
5435
5508
  error: root.status?.code === 2 ? root.status.message : void 0,
@@ -6173,8 +6246,9 @@ var resultsCommand = subcommands({
6173
6246
  });
6174
6247
 
6175
6248
  // src/commands/results/serve.ts
6176
- import { existsSync as existsSync7, readFileSync as readFileSync8, writeFileSync as writeFileSync3 } from "node:fs";
6249
+ import { existsSync as existsSync7, readFileSync as readFileSync8, readdirSync as readdirSync3, statSync as statSync4, writeFileSync as writeFileSync3 } from "node:fs";
6177
6250
  import path9 from "node:path";
6251
+ import { fileURLToPath as fileURLToPath2 } from "node:url";
6178
6252
  import { Hono } from "hono";
6179
6253
  function feedbackPath(resultDir) {
6180
6254
  return path9.join(resultDir, "feedback.json");
@@ -6195,24 +6269,46 @@ function writeFeedback(cwd, data) {
6195
6269
  writeFileSync3(feedbackPath(cwd), `${JSON.stringify(data, null, 2)}
6196
6270
  `, "utf8");
6197
6271
  }
6198
- function createApp(results, resultDir, cwd, sourceFile) {
6272
+ function createApp(results, resultDir, cwd, sourceFile, options) {
6199
6273
  const searchDir = cwd ?? resultDir;
6200
6274
  const app2 = new Hono();
6275
+ const studioDistPath = options?.studioDir ?? resolveStudioDistDir();
6276
+ if (!studioDistPath || !existsSync7(path9.join(studioDistPath, "index.html"))) {
6277
+ throw new Error('Studio dist not found. Run "bun run build" in apps/studio/ to build the SPA.');
6278
+ }
6201
6279
  app2.get("/", (c3) => {
6202
- return c3.html(generateServeHtml(results, sourceFile));
6280
+ const indexPath = path9.join(studioDistPath, "index.html");
6281
+ if (existsSync7(indexPath)) {
6282
+ return c3.html(readFileSync8(indexPath, "utf8"));
6283
+ }
6284
+ return c3.notFound();
6203
6285
  });
6204
6286
  app2.get("/api/runs", (c3) => {
6205
6287
  const metas = listResultFiles(searchDir);
6206
6288
  return c3.json({
6207
- runs: metas.map((m) => ({
6208
- filename: m.filename,
6209
- path: m.path,
6210
- timestamp: m.timestamp,
6211
- test_count: m.testCount,
6212
- pass_rate: m.passRate,
6213
- avg_score: m.avgScore,
6214
- size_bytes: m.sizeBytes
6215
- }))
6289
+ runs: metas.map((m) => {
6290
+ let target;
6291
+ let experiment;
6292
+ try {
6293
+ const records = loadLightweightResults(m.path);
6294
+ if (records.length > 0) {
6295
+ target = records[0].target;
6296
+ experiment = records[0].experiment;
6297
+ }
6298
+ } catch {
6299
+ }
6300
+ return {
6301
+ filename: m.filename,
6302
+ path: m.path,
6303
+ timestamp: m.timestamp,
6304
+ test_count: m.testCount,
6305
+ pass_rate: m.passRate,
6306
+ avg_score: m.avgScore,
6307
+ size_bytes: m.sizeBytes,
6308
+ ...target && { target },
6309
+ ...experiment && { experiment }
6310
+ };
6311
+ })
6216
6312
  });
6217
6313
  });
6218
6314
  app2.get("/api/runs/:filename", (c3) => {
@@ -6272,692 +6368,406 @@ function createApp(results, resultDir, cwd, sourceFile) {
6272
6368
  writeFeedback(resultDir, existing);
6273
6369
  return c3.json(existing);
6274
6370
  });
6275
- return app2;
6276
- }
6277
- function stripHeavyFields(results) {
6278
- return results.map((r) => {
6279
- const { requests, trace, ...rest } = r;
6280
- const toolCalls = trace?.toolCalls && Object.keys(trace.toolCalls).length > 0 ? trace.toolCalls : void 0;
6281
- const graderDurationMs = (r.scores ?? []).reduce((sum, s) => sum + (s.durationMs ?? 0), 0);
6282
- return {
6283
- ...rest,
6284
- ...toolCalls && { _toolCalls: toolCalls },
6285
- ...graderDurationMs > 0 && { _graderDurationMs: graderDurationMs }
6286
- };
6371
+ app2.get("/api/runs/:filename/datasets", (c3) => {
6372
+ const filename = c3.req.param("filename");
6373
+ const metas = listResultFiles(searchDir);
6374
+ const meta = metas.find((m) => m.filename === filename);
6375
+ if (!meta) {
6376
+ return c3.json({ error: "Run not found" }, 404);
6377
+ }
6378
+ try {
6379
+ const loaded = patchTestIds(loadManifestResults(meta.path));
6380
+ const datasetMap = /* @__PURE__ */ new Map();
6381
+ for (const r of loaded) {
6382
+ const ds = r.dataset ?? r.target ?? "default";
6383
+ const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
6384
+ entry.total++;
6385
+ if (r.score >= 1) entry.passed++;
6386
+ entry.scoreSum += r.score;
6387
+ datasetMap.set(ds, entry);
6388
+ }
6389
+ const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
6390
+ name,
6391
+ total: entry.total,
6392
+ passed: entry.passed,
6393
+ failed: entry.total - entry.passed,
6394
+ avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0
6395
+ }));
6396
+ return c3.json({ datasets });
6397
+ } catch {
6398
+ return c3.json({ error: "Failed to load datasets" }, 500);
6399
+ }
6287
6400
  });
6288
- }
6289
- function escapeHtml(s) {
6290
- return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
6291
- }
6292
- function generateServeHtml(results, sourceFile) {
6293
- const lightResults = stripHeavyFields(results);
6294
- const dataJson = JSON.stringify(lightResults).replace(/</g, "\\u003c").replace(/>/g, "\\u003e").replace(/\u2028/g, "\\u2028").replace(/\u2029/g, "\\u2029");
6295
- return `<!DOCTYPE html>
6296
- <html lang="en">
6297
- <head>
6298
- <meta charset="utf-8">
6299
- <meta name="viewport" content="width=device-width, initial-scale=1">
6300
- <title>AgentV Results Review</title>
6301
- <style>
6302
- ${SERVE_STYLES}
6303
- </style>
6304
- </head>
6305
- <body>
6306
- <header class="header">
6307
- <div class="header-left">
6308
- <h1 class="header-title">AgentV</h1>
6309
- <span class="header-subtitle">Results Review</span>
6310
- </div>
6311
- <div class="header-center">
6312
- <select id="run-picker" class="run-picker" title="Switch result file">
6313
- <option value="">Loading runs...</option>
6314
- </select>
6315
- </div>
6316
- <div class="header-right">
6317
- <span class="timestamp">${escapeHtml((/* @__PURE__ */ new Date()).toISOString())}</span>
6318
- </div>
6319
- </header>
6320
- <nav class="tabs" id="tabs">
6321
- <button class="tab active" data-tab="overview">Overview</button>
6322
- <button class="tab" data-tab="tests">Test Cases</button>
6323
- </nav>
6324
- <main id="app"></main>
6325
- <script>
6326
- var DATA = ${dataJson};
6327
- var INITIAL_SOURCE = ${sourceFile ? JSON.stringify(path9.basename(sourceFile)).replace(/</g, "\\u003c").replace(/>/g, "\\u003e") : "null"};
6328
- ${SERVE_SCRIPT}
6329
- </script>
6330
- </body>
6331
- </html>`;
6332
- }
6333
- var SERVE_STYLES = `
6334
- *{margin:0;padding:0;box-sizing:border-box}
6335
- :root{
6336
- --bg:#f6f8fa;--surface:#fff;--border:#d0d7de;--border-light:#e8ebee;
6337
- --text:#1f2328;--text-muted:#656d76;
6338
- --primary:#0969da;--primary-bg:#ddf4ff;
6339
- --success:#1a7f37;--success-bg:#dafbe1;
6340
- --danger:#cf222e;--danger-bg:#ffebe9;
6341
- --warning:#9a6700;--warning-bg:#fff8c5;
6342
- --radius:6px;
6343
- --shadow:0 1px 3px rgba(31,35,40,.04),0 1px 2px rgba(31,35,40,.06);
6344
- --font:-apple-system,BlinkMacSystemFont,"Segoe UI","Noto Sans",Helvetica,Arial,sans-serif;
6345
- --mono:ui-monospace,SFMono-Regular,"SF Mono",Menlo,Consolas,monospace;
6346
- }
6347
- body{font-family:var(--font);background:var(--bg);color:var(--text);line-height:1.5;font-size:14px}
6348
-
6349
- /* Header */
6350
- .header{background:var(--surface);border-bottom:1px solid var(--border);padding:12px 24px;display:flex;align-items:center;justify-content:space-between}
6351
- .header-left{display:flex;align-items:baseline;gap:12px}
6352
- .header-title{font-size:18px;font-weight:600}
6353
- .header-subtitle{font-size:14px;color:var(--text-muted)}
6354
- .header-center{flex:1;display:flex;justify-content:center;padding:0 16px}
6355
- .run-picker{padding:6px 10px;border:1px solid var(--border);border-radius:var(--radius);font-size:13px;background:var(--surface);color:var(--text);font-family:var(--font);max-width:400px;width:100%;cursor:pointer}
6356
- .run-picker:hover{border-color:var(--primary)}
6357
- .run-picker:focus{outline:none;border-color:var(--primary);box-shadow:0 0 0 3px var(--primary-bg)}
6358
- .timestamp{font-size:12px;color:var(--text-muted);font-family:var(--mono)}
6359
-
6360
- /* Tabs */
6361
- .tabs{background:var(--surface);border-bottom:1px solid var(--border);padding:0 24px;display:flex}
6362
- .tab{background:none;border:none;padding:10px 16px;font-size:14px;color:var(--text-muted);cursor:pointer;border-bottom:2px solid transparent;font-family:var(--font);transition:color .15s,border-color .15s}
6363
- .tab:hover{color:var(--text)}
6364
- .tab.active{color:var(--text);font-weight:600;border-bottom-color:var(--primary)}
6365
-
6366
- #app{max-width:1280px;margin:0 auto;padding:24px}
6367
-
6368
- /* Stat cards */
6369
- .stats-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));gap:12px;margin-bottom:24px}
6370
- .stat-card{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:16px;text-align:center;box-shadow:var(--shadow)}
6371
- .stat-card.pass .stat-value{color:var(--success)}
6372
- .stat-card.fail .stat-value{color:var(--danger)}
6373
- .stat-card.error .stat-value{color:var(--danger)}
6374
- .stat-card.warn .stat-value{color:var(--warning)}
6375
- .stat-card.total .stat-value{color:var(--primary)}
6376
- .stat-value{font-size:28px;font-weight:700;line-height:1.2}
6377
- .stat-label{font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.5px;margin-top:4px}
6378
-
6379
- /* Sections */
6380
- .section{margin-bottom:24px}
6381
- .section-title{font-size:16px;font-weight:600;margin-bottom:12px}
6382
-
6383
- /* Tables */
6384
- .table-wrap{overflow-x:auto;background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);box-shadow:var(--shadow)}
6385
- .data-table{width:100%;border-collapse:collapse;font-size:13px}
6386
- .data-table th{background:var(--bg);border-bottom:1px solid var(--border);padding:8px 12px;text-align:left;font-weight:600;font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.3px;white-space:nowrap}
6387
- .data-table th.sortable{cursor:pointer;user-select:none}
6388
- .data-table th.sortable:hover{color:var(--text)}
6389
- .data-table td{padding:8px 12px;border-bottom:1px solid var(--border-light);vertical-align:middle}
6390
- .data-table tbody tr:last-child td{border-bottom:none}
6391
-
6392
- /* Status icons */
6393
- .status-icon{display:inline-flex;align-items:center;justify-content:center;width:22px;height:22px;border-radius:50%;font-size:12px;font-weight:700}
6394
- .status-icon.pass{background:var(--success-bg);color:var(--success)}
6395
- .status-icon.fail{background:var(--danger-bg);color:var(--danger)}
6396
- .status-icon.error{background:var(--warning-bg);color:var(--warning)}
6397
-
6398
- /* Score colors */
6399
- .score-high{color:var(--success);font-weight:600}
6400
- .score-mid{color:var(--warning);font-weight:600}
6401
- .score-low{color:var(--danger);font-weight:600}
6402
-
6403
- /* Pass-rate bar */
6404
- .bar-bg{width:100px;height:8px;background:var(--border-light);border-radius:4px;overflow:hidden}
6405
- .bar-fill{height:100%;border-radius:4px;transition:width .3s}
6406
- .bar-fill.score-high{background:var(--success)}
6407
- .bar-fill.score-mid{background:var(--warning)}
6408
- .bar-fill.score-low{background:var(--danger)}
6409
-
6410
- /* Histogram */
6411
- .histogram{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:16px;box-shadow:var(--shadow)}
6412
- .hist-row{display:flex;align-items:center;gap:12px;margin-bottom:8px}
6413
- .hist-row:last-child{margin-bottom:0}
6414
- .hist-label{width:60px;font-size:12px;color:var(--text-muted);text-align:right;flex-shrink:0}
6415
- .hist-bar-bg{flex:1;height:20px;background:var(--border-light);border-radius:3px;overflow:hidden}
6416
- .hist-bar{height:100%;border-radius:3px;transition:width .3s}
6417
- .hist-count{width:30px;font-size:12px;color:var(--text-muted);text-align:right;flex-shrink:0}
6418
-
6419
- /* Filters */
6420
- .filter-bar{display:flex;gap:8px;margin-bottom:16px;align-items:center;flex-wrap:wrap}
6421
- .filter-select,.filter-search{padding:6px 10px;border:1px solid var(--border);border-radius:var(--radius);font-size:13px;background:var(--surface);color:var(--text);font-family:var(--font)}
6422
- .filter-search{flex:1;min-width:200px}
6423
- .filter-count{font-size:12px;color:var(--text-muted);margin-left:auto}
6424
-
6425
- /* Test rows */
6426
- .test-row{cursor:pointer;transition:background .1s}
6427
- .test-row:hover{background:var(--bg)!important}
6428
- .test-row.expanded{background:var(--primary-bg)!important}
6429
- .expand-col{width:32px;text-align:center}
6430
- .expand-icon{color:var(--text-muted);font-size:12px}
6431
- .fw-medium{font-weight:500}
6432
- .text-pass{color:var(--success)}.text-fail{color:var(--danger)}.text-error{color:var(--warning)}
6433
-
6434
- /* Detail panel */
6435
- .detail-row td{padding:0!important;background:var(--bg)!important}
6436
- .detail-panel{padding:16px 24px}
6437
- .detail-grid{display:grid;grid-template-columns:1fr 1fr;gap:16px;margin-bottom:16px}
6438
- .detail-block h4{font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.3px;margin-bottom:6px}
6439
- .detail-pre{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:12px;font-family:var(--mono);font-size:12px;white-space:pre-wrap;word-break:break-word;max-height:300px;overflow-y:auto;line-height:1.6}
6440
- .detail-panel h4{font-size:13px;font-weight:600;margin:16px 0 8px}
6441
- .eval-table{width:100%;border-collapse:collapse;font-size:13px;background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);margin-bottom:12px}
6442
- .eval-table th{background:var(--bg);padding:6px 10px;text-align:left;font-size:11px;font-weight:600;color:var(--text-muted);text-transform:uppercase;border-bottom:1px solid var(--border)}
6443
- .eval-table td{padding:8px 10px;border-bottom:1px solid var(--border-light)}
6444
- .reasoning-cell{max-width:500px;font-size:12px;color:var(--text-muted)}
6445
- .expect-list{list-style:none;padding:0;margin-bottom:12px}
6446
- .expect-list li{padding:4px 8px 4px 24px;position:relative;font-size:13px}
6447
- .expect-list.pass li::before{content:"\\2713";position:absolute;left:4px;color:var(--success);font-weight:700}
6448
- .expect-list.fail li::before{content:"\\2717";position:absolute;left:4px;color:var(--danger);font-weight:700}
6449
- .error-box{background:var(--danger-bg);border:1px solid var(--danger);border-radius:var(--radius);padding:12px;margin-bottom:12px}
6450
- .error-box h4{color:var(--danger);margin:0 0 6px}
6451
- .error-box pre{font-family:var(--mono);font-size:12px;white-space:pre-wrap;word-break:break-word}
6452
- .detail-meta{font-size:12px;color:var(--text-muted);margin-top:12px;padding-top:12px;border-top:1px solid var(--border-light)}
6453
- .tool-calls{display:flex;flex-wrap:wrap;gap:6px;margin-bottom:12px}
6454
- .tool-tag{display:inline-block;padding:2px 10px;font-size:12px;font-family:var(--mono);background:var(--primary-bg);color:var(--primary);border:1px solid var(--border);border-radius:12px}
6455
- .empty-state{text-align:center;padding:48px 24px;color:var(--text-muted)}
6456
- .empty-state h3{font-size:16px;margin-bottom:8px;color:var(--text)}
6457
- .welcome-state{text-align:center;padding:80px 24px;color:var(--text-muted)}
6458
- .welcome-state h2{font-size:24px;margin-bottom:12px;color:var(--text);font-weight:600}
6459
- .welcome-state p{font-size:15px;margin-bottom:8px;max-width:500px;margin-left:auto;margin-right:auto}
6460
- .welcome-state code{font-family:var(--mono);background:var(--surface);border:1px solid var(--border);border-radius:3px;padding:2px 6px;font-size:13px}
6461
- .welcome-state .hint{margin-top:24px;font-size:13px;color:var(--text-muted)}
6462
-
6463
- /* Feedback */
6464
- .feedback-section{margin-top:16px;padding-top:16px;border-top:1px solid var(--border-light)}
6465
- .feedback-input{width:100%;min-height:80px;padding:8px 12px;border:1px solid var(--border);border-radius:var(--radius);font-family:var(--font);font-size:13px;resize:vertical;background:var(--surface);color:var(--text)}
6466
- .feedback-input:focus{outline:none;border-color:var(--primary);box-shadow:0 0 0 3px var(--primary-bg)}
6467
- .feedback-submit{margin-top:8px;padding:6px 16px;background:var(--primary);color:#fff;border:none;border-radius:var(--radius);font-size:13px;cursor:pointer;font-family:var(--font)}
6468
- .feedback-submit:hover{opacity:.9}
6469
- .feedback-submit:disabled{opacity:.5;cursor:default}
6470
- .feedback-status{margin-left:8px;font-size:12px;color:var(--success)}
6471
- `;
6472
- var SERVE_SCRIPT = `
6473
- (function(){
6474
- /* ---- helpers ---- */
6475
- function esc(s){
6476
- if(s==null)return"";
6477
- return String(s).replace(/&/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;").replace(/"/g,"&quot;");
6478
- }
6479
- function getStatus(r){
6480
- if(r.executionStatus==="execution_error")return"error";
6481
- if(r.executionStatus==="quality_failure")return"fail";
6482
- if(r.executionStatus==="ok")return"pass";
6483
- if(r.error)return"error";
6484
- return r.score>=0.5?"pass":"fail";
6485
- }
6486
- function sIcon(s){
6487
- if(s==="pass")return'<span class="status-icon pass">\\u2713</span>';
6488
- if(s==="fail")return'<span class="status-icon fail">\\u2717</span>';
6489
- return'<span class="status-icon error">!</span>';
6490
- }
6491
- function fmtDur(ms){
6492
- if(ms==null)return"\\u2014";
6493
- if(ms<1000)return ms+"ms";
6494
- if(ms<60000)return(ms/1000).toFixed(1)+"s";
6495
- return Math.floor(ms/60000)+"m "+Math.round((ms%60000)/1000)+"s";
6496
- }
6497
- function fmtTok(n){
6498
- if(n==null)return"\\u2014";
6499
- if(n>=1e6)return(n/1e6).toFixed(1)+"M";
6500
- if(n>=1e3)return(n/1e3).toFixed(1)+"K";
6501
- return String(n);
6502
- }
6503
- function fmtCost(u){if(u==null)return"\\u2014";if(u<0.01)return"<$0.01";return"$"+u.toFixed(2);}
6504
- function fmtPct(v){if(v==null)return"\\u2014";return(v*100).toFixed(1)+"%";}
6505
- function sCls(v){if(v==null)return"";if(v>=0.9)return"score-high";if(v>=0.5)return"score-mid";return"score-low";}
6506
-
6507
- /* ---- feedback state ---- */
6508
- var feedbackCache={};
6509
-
6510
- function loadFeedback(){
6511
- fetch("/api/feedback").then(function(r){return r.json();}).then(function(d){
6512
- if(d&&d.reviews){
6513
- for(var i=0;i<d.reviews.length;i++){
6514
- feedbackCache[d.reviews[i].test_id]=d.reviews[i].comment;
6515
- }
6516
- populateFeedbackTextareas();
6517
- }
6518
- }).catch(function(){});
6519
- }
6520
-
6521
- function populateFeedbackTextareas(){
6522
- var areas=document.querySelectorAll(".feedback-input");
6523
- for(var i=0;i<areas.length;i++){
6524
- var tid=areas[i].getAttribute("data-test-id");
6525
- if(tid&&feedbackCache[tid]!=null){
6526
- areas[i].value=feedbackCache[tid];
6527
- }
6401
+ app2.get("/api/runs/:filename/categories", (c3) => {
6402
+ const filename = c3.req.param("filename");
6403
+ const metas = listResultFiles(searchDir);
6404
+ const meta = metas.find((m) => m.filename === filename);
6405
+ if (!meta) {
6406
+ return c3.json({ error: "Run not found" }, 404);
6528
6407
  }
6529
- }
6530
-
6531
- function saveFeedback(testId,comment,statusEl,btn){
6532
- btn.disabled=true;
6533
- statusEl.textContent="Saving...";
6534
- statusEl.style.color="var(--text-muted)";
6535
- fetch("/api/feedback",{
6536
- method:"POST",
6537
- headers:{"Content-Type":"application/json"},
6538
- body:JSON.stringify({reviews:[{test_id:testId,comment:comment}]})
6539
- }).then(function(r){return r.json();}).then(function(){
6540
- feedbackCache[testId]=comment;
6541
- statusEl.textContent="Saved";
6542
- statusEl.style.color="var(--success)";
6543
- btn.disabled=false;
6544
- setTimeout(function(){statusEl.textContent="";},2000);
6545
- }).catch(function(){
6546
- statusEl.textContent="Error saving";
6547
- statusEl.style.color="var(--danger)";
6548
- btn.disabled=false;
6549
- });
6550
- }
6551
-
6552
- /* ---- compute stats ---- */
6553
- function computeStats(d){
6554
- var t=d.length,p=0,f=0,e=0,dur=0,ti=0,to=0,cost=0,sc=[],tc=0;
6555
- for(var i=0;i<d.length;i++){
6556
- var r=d[i],s=getStatus(r);
6557
- if(s==="pass")p++;else if(s==="fail")f++;else e++;
6558
- if(r.durationMs)dur+=r.durationMs;
6559
- if(r.tokenUsage){ti+=(r.tokenUsage.input||0);to+=(r.tokenUsage.output||0);}
6560
- if(r.costUsd)cost+=r.costUsd;
6561
- if(s!=="error")sc.push(r.score);
6562
- if(r._toolCalls){for(var k in r._toolCalls)tc+=r._toolCalls[k];}
6563
- }
6564
- var g=t-e;
6565
- return{total:t,passed:p,failed:f,errors:e,passRate:g>0?p/g:0,dur:dur,tokens:ti+to,inTok:ti,outTok:to,cost:cost,scores:sc,toolCalls:tc};
6566
- }
6567
- function computeTargets(d){
6568
- var m={};
6569
- for(var i=0;i<d.length;i++){
6570
- var r=d[i],tgt=r.target||"unknown";
6571
- if(!m[tgt])m[tgt]={target:tgt,results:[],p:0,f:0,e:0,ts:0,sc:0,dur:0,tok:0,cost:0};
6572
- var o=m[tgt];o.results.push(r);
6573
- var s=getStatus(r);
6574
- if(s==="pass")o.p++;else if(s==="fail")o.f++;else o.e++;
6575
- if(s!=="error"){o.ts+=r.score;o.sc++;}
6576
- if(r.durationMs)o.dur+=r.durationMs;
6577
- if(r.tokenUsage)o.tok+=(r.tokenUsage.input||0)+(r.tokenUsage.output||0);
6578
- if(r.costUsd)o.cost+=r.costUsd;
6579
- }
6580
- var a=[];for(var k in m)a.push(m[k]);return a;
6581
- }
6582
- function getEvalNames(){
6583
- var n={};
6584
- for(var i=0;i<DATA.length;i++){
6585
- var sc=DATA[i].scores;
6586
- if(sc)for(var j=0;j<sc.length;j++)n[sc[j].name]=true;
6587
- }
6588
- return Object.keys(n);
6589
- }
6590
- function getEvalScore(r,name){
6591
- if(!r.scores)return null;
6592
- for(var i=0;i<r.scores.length;i++)if(r.scores[i].name===name)return r.scores[i].score;
6593
- return null;
6594
- }
6595
-
6596
- var stats=computeStats(DATA);
6597
- var tgtStats=computeTargets(DATA);
6598
- var tgtNames=tgtStats.map(function(t){return t.target;});
6599
-
6600
- /* ---- state ---- */
6601
- var state={tab:"overview",filter:{status:"all",target:"all",search:""},sort:{col:"testId",dir:"asc"},expanded:{}};
6602
-
6603
- /* ---- DOM refs ---- */
6604
- var app=document.getElementById("app");
6605
- var tabBtns=document.querySelectorAll(".tab");
6606
-
6607
- /* ---- tabs ---- */
6608
- function setTab(t){
6609
- state.tab=t;
6610
- for(var i=0;i<tabBtns.length;i++)tabBtns[i].classList.toggle("active",tabBtns[i].getAttribute("data-tab")===t);
6611
- render();
6612
- }
6613
- for(var i=0;i<tabBtns.length;i++){
6614
- tabBtns[i].addEventListener("click",(function(b){return function(){setTab(b.getAttribute("data-tab"));};})(tabBtns[i]));
6615
- }
6616
-
6617
- /* ---- render ---- */
6618
- function render(){
6619
- if(DATA.length===0){
6620
- app.innerHTML='<div class="welcome-state">'
6621
- +'<h2>No results yet</h2>'
6622
- +'<p>Run an evaluation or mount a results directory to see results here.</p>'
6623
- +'<p><code>agentv eval &lt;eval-file&gt;</code></p>'
6624
- +'<p class="hint">The dashboard will automatically detect new result files.</p>'
6625
- +'</div>';
6626
- return;
6408
+ try {
6409
+ const loaded = patchTestIds(loadManifestResults(meta.path));
6410
+ const categoryMap = /* @__PURE__ */ new Map();
6411
+ for (const r of loaded) {
6412
+ const cat = r.category ?? DEFAULT_CATEGORY;
6413
+ const entry = categoryMap.get(cat) ?? {
6414
+ total: 0,
6415
+ passed: 0,
6416
+ scoreSum: 0,
6417
+ datasets: /* @__PURE__ */ new Set()
6418
+ };
6419
+ entry.total++;
6420
+ if (r.score >= 1) entry.passed++;
6421
+ entry.scoreSum += r.score;
6422
+ entry.datasets.add(r.dataset ?? r.target ?? "default");
6423
+ categoryMap.set(cat, entry);
6424
+ }
6425
+ const categories = [...categoryMap.entries()].map(([name, entry]) => ({
6426
+ name,
6427
+ total: entry.total,
6428
+ passed: entry.passed,
6429
+ failed: entry.total - entry.passed,
6430
+ avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0,
6431
+ dataset_count: entry.datasets.size
6432
+ }));
6433
+ return c3.json({ categories });
6434
+ } catch {
6435
+ return c3.json({ error: "Failed to load categories" }, 500);
6627
6436
  }
6628
- if(state.tab==="overview")renderOverview();else renderTests();
6629
- }
6630
-
6631
- /* ---- stat card helper ---- */
6632
- function card(label,value,type){
6633
- return'<div class="stat-card '+type+'"><div class="stat-value">'+value+'</div><div class="stat-label">'+label+"</div></div>";
6634
- }
6635
-
6636
- /* ---- overview ---- */
6637
- function renderOverview(){
6638
- var h='<div class="stats-grid">';
6639
- h+=card("Total Tests",stats.total,"total");
6640
- h+=card("Passed",stats.passed,"pass");
6641
- h+=card("Failed",stats.failed,"fail");
6642
- h+=card("Errors",stats.errors,"error");
6643
- var prCls=stats.passRate>=0.9?"pass":stats.passRate>=0.5?"warn":"fail";
6644
- h+=card("Pass Rate",fmtPct(stats.passRate),prCls);
6645
- h+=card("Duration",fmtDur(stats.dur),"neutral");
6646
- h+=card("Tokens",fmtTok(stats.tokens),"neutral");
6647
- h+=card("Est. Cost",fmtCost(stats.cost),"neutral");
6648
- if(stats.toolCalls>0)h+=card("Tool Calls",fmtTok(stats.toolCalls),"neutral");
6649
- h+="</div>";
6650
-
6651
- /* targets table */
6652
- if(tgtStats.length>1){
6653
- h+='<div class="section"><h2 class="section-title">Targets</h2><div class="table-wrap"><table class="data-table">';
6654
- h+="<thead><tr><th>Target</th><th>Pass Rate</th><th></th><th>Passed</th><th>Failed</th><th>Errors</th><th>Avg Score</th><th>Duration</th><th>Tokens</th><th>Cost</th></tr></thead><tbody>";
6655
- for(var i=0;i<tgtStats.length;i++){
6656
- var t=tgtStats[i],g=t.p+t.f,pr=g>0?t.p/g:0,avg=t.sc>0?t.ts/t.sc:0;
6657
- h+="<tr><td class=\\"fw-medium\\">"+esc(t.target)+"</td><td>"+fmtPct(pr)+'</td><td><div class="bar-bg"><div class="bar-fill '+sCls(pr)+'" style="width:'+(pr*100)+'%"></div></div></td>';
6658
- h+='<td class="text-pass">'+t.p+'</td><td class="text-fail">'+t.f+'</td><td class="text-error">'+t.e+"</td>";
6659
- h+='<td class="'+sCls(avg)+'">'+fmtPct(avg)+"</td><td>"+fmtDur(t.dur)+"</td><td>"+fmtTok(t.tok)+"</td><td>"+fmtCost(t.cost)+"</td></tr>";
6660
- }
6661
- h+="</tbody></table></div></div>";
6437
+ });
6438
+ app2.get("/api/runs/:filename/categories/:category/datasets", (c3) => {
6439
+ const filename = c3.req.param("filename");
6440
+ const category = decodeURIComponent(c3.req.param("category"));
6441
+ const metas = listResultFiles(searchDir);
6442
+ const meta = metas.find((m) => m.filename === filename);
6443
+ if (!meta) {
6444
+ return c3.json({ error: "Run not found" }, 404);
6662
6445
  }
6663
-
6664
- /* histogram */
6665
- if(stats.scores.length>0){
6666
- var bk=[0,0,0,0,0];
6667
- for(var i=0;i<stats.scores.length;i++){var idx=Math.min(Math.floor(stats.scores[i]*5),4);bk[idx]++;}
6668
- var mx=Math.max.apply(null,bk);
6669
- var lb=["0\\u201320%","20\\u201340%","40\\u201360%","60\\u201380%","80\\u2013100%"];
6670
- h+='<div class="section"><h2 class="section-title">Score Distribution</h2><div class="histogram">';
6671
- for(var i=0;i<bk.length;i++){
6672
- var pct=mx>0?(bk[i]/mx*100):0;
6673
- h+='<div class="hist-row"><span class="hist-label">'+lb[i]+'</span><div class="hist-bar-bg"><div class="hist-bar '+(i>=4?"score-high":i>=2?"score-mid":"score-low")+'" style="width:'+pct+'%"></div></div><span class="hist-count">'+bk[i]+"</span></div>";
6446
+ try {
6447
+ const loaded = patchTestIds(loadManifestResults(meta.path));
6448
+ const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category);
6449
+ const datasetMap = /* @__PURE__ */ new Map();
6450
+ for (const r of filtered) {
6451
+ const ds = r.dataset ?? r.target ?? "default";
6452
+ const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
6453
+ entry.total++;
6454
+ if (r.score >= 1) entry.passed++;
6455
+ entry.scoreSum += r.score;
6456
+ datasetMap.set(ds, entry);
6457
+ }
6458
+ const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
6459
+ name,
6460
+ total: entry.total,
6461
+ passed: entry.passed,
6462
+ failed: entry.total - entry.passed,
6463
+ avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0
6464
+ }));
6465
+ return c3.json({ datasets });
6466
+ } catch {
6467
+ return c3.json({ error: "Failed to load datasets" }, 500);
6468
+ }
6469
+ });
6470
+ app2.get("/api/runs/:filename/evals/:evalId", (c3) => {
6471
+ const filename = c3.req.param("filename");
6472
+ const evalId = c3.req.param("evalId");
6473
+ const metas = listResultFiles(searchDir);
6474
+ const meta = metas.find((m) => m.filename === filename);
6475
+ if (!meta) {
6476
+ return c3.json({ error: "Run not found" }, 404);
6477
+ }
6478
+ try {
6479
+ const loaded = patchTestIds(loadManifestResults(meta.path));
6480
+ const result = loaded.find((r) => r.testId === evalId);
6481
+ if (!result) {
6482
+ return c3.json({ error: "Eval not found" }, 404);
6674
6483
  }
6675
- h+="</div></div>";
6484
+ return c3.json({ eval: result });
6485
+ } catch {
6486
+ return c3.json({ error: "Failed to load eval" }, 500);
6676
6487
  }
6677
- app.innerHTML=h;
6678
- }
6679
-
6680
- /* ---- test cases ---- */
6681
- function renderTests(){
6682
- var evalNames=getEvalNames();
6683
- var h='<div class="filter-bar">';
6684
- h+='<select id="flt-status" class="filter-select"><option value="all">All Status</option><option value="pass">Passed</option><option value="fail">Failed</option><option value="error">Errors</option></select>';
6685
- if(tgtNames.length>1){
6686
- h+='<select id="flt-target" class="filter-select"><option value="all">All Targets</option>';
6687
- for(var i=0;i<tgtNames.length;i++)h+='<option value="'+esc(tgtNames[i])+'">'+esc(tgtNames[i])+"</option>";
6688
- h+="</select>";
6689
- }
6690
- h+='<input type="text" id="flt-search" class="filter-search" placeholder="Search tests..." value="'+esc(state.filter.search)+'">';
6691
- h+='<span class="filter-count" id="flt-count"></span></div>';
6692
-
6693
- h+='<div class="table-wrap"><table class="data-table" id="test-tbl"><thead><tr>';
6694
- h+='<th class="expand-col"></th>';
6695
- h+=sHdr("Status","status");
6696
- h+=sHdr("Test ID","testId");
6697
- if(tgtNames.length>1)h+=sHdr("Target","target");
6698
- h+=sHdr("Score","score");
6699
- for(var i=0;i<evalNames.length;i++)h+="<th>"+esc(evalNames[i])+"</th>";
6700
- h+=sHdr("Duration","durationMs");
6701
- h+=sHdr("Cost","costUsd");
6702
- h+="</tr></thead><tbody id=\\"test-body\\"></tbody></table></div>";
6703
- app.innerHTML=h;
6704
-
6705
- /* wire events */
6706
- var selS=document.getElementById("flt-status");
6707
- selS.value=state.filter.status;
6708
- selS.addEventListener("change",function(e){state.filter.status=e.target.value;renderRows();});
6709
- var selT=document.getElementById("flt-target");
6710
- if(selT){selT.value=state.filter.target;selT.addEventListener("change",function(e){state.filter.target=e.target.value;renderRows();});}
6711
- document.getElementById("flt-search").addEventListener("input",function(e){state.filter.search=e.target.value;renderRows();});
6712
- var ths=document.querySelectorAll("th[data-sort]");
6713
- for(var i=0;i<ths.length;i++){
6714
- ths[i].addEventListener("click",(function(th){return function(){
6715
- var c=th.getAttribute("data-sort");
6716
- if(state.sort.col===c)state.sort.dir=state.sort.dir==="asc"?"desc":"asc";
6717
- else{state.sort.col=c;state.sort.dir="asc";}
6718
- renderTests();
6719
- };})(ths[i]));
6720
- }
6721
- renderRows();
6722
- }
6723
-
6724
- function sHdr(label,col){
6725
- var arrow="";
6726
- if(state.sort.col===col)arrow=state.sort.dir==="asc"?" \\u2191":" \\u2193";
6727
- return'<th class="sortable" data-sort="'+col+'">'+label+arrow+"</th>";
6728
- }
6729
-
6730
- function filtered(){
6731
- var out=[];
6732
- for(var i=0;i<DATA.length;i++){
6733
- var r=DATA[i],s=getStatus(r);
6734
- if(state.filter.status!=="all"&&s!==state.filter.status)continue;
6735
- if(state.filter.target!=="all"&&r.target!==state.filter.target)continue;
6736
- if(state.filter.search&&(r.testId||"").toLowerCase().indexOf(state.filter.search.toLowerCase())===-1)continue;
6737
- out.push(r);
6738
- }
6739
- var col=state.sort.col,dir=state.sort.dir==="asc"?1:-1;
6740
- out.sort(function(a,b){
6741
- var va=col==="status"?getStatus(a):a[col],vb=col==="status"?getStatus(b):b[col];
6742
- if(va==null&&vb==null)return 0;if(va==null)return 1;if(vb==null)return-1;
6743
- if(typeof va==="string")return va.localeCompare(vb)*dir;
6744
- return(va-vb)*dir;
6488
+ });
6489
+ app2.get("/api/index", (c3) => {
6490
+ const metas = listResultFiles(searchDir);
6491
+ const entries2 = metas.map((m) => {
6492
+ let totalCostUsd = 0;
6493
+ try {
6494
+ const loaded = patchTestIds(loadManifestResults(m.path));
6495
+ totalCostUsd = loaded.reduce((sum, r) => sum + (r.costUsd ?? 0), 0);
6496
+ } catch {
6497
+ }
6498
+ return {
6499
+ run_filename: m.filename,
6500
+ test_count: m.testCount,
6501
+ pass_rate: m.passRate,
6502
+ avg_score: m.avgScore,
6503
+ total_cost_usd: totalCostUsd,
6504
+ timestamp: m.timestamp
6505
+ };
6506
+ });
6507
+ return c3.json({ entries: entries2 });
6508
+ });
6509
+ function buildFileTree(dirPath, relativeTo) {
6510
+ if (!existsSync7(dirPath) || !statSync4(dirPath).isDirectory()) {
6511
+ return [];
6512
+ }
6513
+ const entries2 = readdirSync3(dirPath, { withFileTypes: true });
6514
+ return entries2.sort((a, b) => {
6515
+ if (a.isDirectory() !== b.isDirectory()) return a.isDirectory() ? -1 : 1;
6516
+ return a.name.localeCompare(b.name);
6517
+ }).map((entry) => {
6518
+ const fullPath = path9.join(dirPath, entry.name);
6519
+ const relPath = path9.relative(relativeTo, fullPath);
6520
+ if (entry.isDirectory()) {
6521
+ return {
6522
+ name: entry.name,
6523
+ path: relPath,
6524
+ type: "dir",
6525
+ children: buildFileTree(fullPath, relativeTo)
6526
+ };
6527
+ }
6528
+ return { name: entry.name, path: relPath, type: "file" };
6745
6529
  });
6746
- return out;
6747
6530
  }
6748
-
6749
- function renderRows(){
6750
- var rows=filtered(),evalNames=getEvalNames();
6751
- var tbody=document.getElementById("test-body");
6752
- var colSpan=5+evalNames.length+(tgtNames.length>1?1:0);
6753
- document.getElementById("flt-count").textContent=rows.length+" of "+DATA.length+" tests";
6754
- var h="";
6755
- for(var i=0;i<rows.length;i++){
6756
- var r=rows[i],s=getStatus(r),key=r.testId+":"+r.target,exp=!!state.expanded[key];
6757
- h+='<tr class="test-row '+s+(exp?" expanded":"")+'" data-key="'+esc(key)+'" data-test-id="'+esc(r.testId)+'">';
6758
- h+='<td class="expand-col"><span class="expand-icon">'+(exp?"\\u25BE":"\\u25B8")+"</span></td>";
6759
- h+="<td>"+sIcon(s)+"</td>";
6760
- h+='<td class="fw-medium">'+esc(r.testId)+"</td>";
6761
- if(tgtNames.length>1)h+="<td>"+esc(r.target)+"</td>";
6762
- h+='<td class="'+sCls(r.score)+'">'+fmtPct(r.score)+"</td>";
6763
- for(var j=0;j<evalNames.length;j++){
6764
- var es=getEvalScore(r,evalNames[j]);
6765
- h+='<td class="'+sCls(es)+'">'+(es!=null?fmtPct(es):"\\u2014")+"</td>";
6766
- }
6767
- h+="<td>"+fmtDur(r.durationMs)+"</td><td>"+fmtCost(r.costUsd)+"</td></tr>";
6768
- if(exp)h+='<tr class="detail-row"><td colspan="'+colSpan+'">'+renderDetail(r)+"</td></tr>";
6769
- }
6770
- if(rows.length===0)h+='<tr><td colspan="'+colSpan+'" class="empty-state">No matching tests</td></tr>';
6771
- tbody.innerHTML=h;
6772
-
6773
- /* row click */
6774
- var trs=tbody.querySelectorAll(".test-row");
6775
- for(var k=0;k<trs.length;k++){
6776
- trs[k].addEventListener("click",(function(tr){return function(){
6777
- var key=tr.getAttribute("data-key");
6778
- state.expanded[key]=!state.expanded[key];
6779
- renderRows();
6780
- };})(trs[k]));
6531
+ function inferLanguage(filePath) {
6532
+ const ext = path9.extname(filePath).toLowerCase();
6533
+ const langMap = {
6534
+ ".json": "json",
6535
+ ".jsonl": "json",
6536
+ ".ts": "typescript",
6537
+ ".tsx": "typescript",
6538
+ ".js": "javascript",
6539
+ ".jsx": "javascript",
6540
+ ".md": "markdown",
6541
+ ".yaml": "yaml",
6542
+ ".yml": "yaml",
6543
+ ".log": "plaintext",
6544
+ ".txt": "plaintext",
6545
+ ".py": "python",
6546
+ ".sh": "shell",
6547
+ ".bash": "shell",
6548
+ ".css": "css",
6549
+ ".html": "html",
6550
+ ".xml": "xml",
6551
+ ".svg": "xml",
6552
+ ".toml": "toml",
6553
+ ".diff": "diff",
6554
+ ".patch": "diff"
6555
+ };
6556
+ return langMap[ext] ?? "plaintext";
6557
+ }
6558
+ app2.get("/api/runs/:filename/evals/:evalId/files", (c3) => {
6559
+ const filename = c3.req.param("filename");
6560
+ const evalId = c3.req.param("evalId");
6561
+ const metas = listResultFiles(searchDir);
6562
+ const meta = metas.find((m) => m.filename === filename);
6563
+ if (!meta) {
6564
+ return c3.json({ error: "Run not found" }, 404);
6781
6565
  }
6782
-
6783
- /* wire feedback buttons */
6784
- var btns=tbody.querySelectorAll(".feedback-submit");
6785
- for(var k=0;k<btns.length;k++){
6786
- btns[k].addEventListener("click",(function(btn){return function(ev){
6787
- ev.stopPropagation();
6788
- var tid=btn.getAttribute("data-test-id");
6789
- var sec=btn.closest(".feedback-section");
6790
- var ta=sec.querySelector(".feedback-input");
6791
- var st=sec.querySelector(".feedback-status");
6792
- saveFeedback(tid,ta.value,st,btn);
6793
- };})(btns[k]));
6566
+ try {
6567
+ const content = readFileSync8(meta.path, "utf8");
6568
+ const records = parseResultManifest(content);
6569
+ const record = records.find((r) => (r.test_id ?? r.eval_id) === evalId);
6570
+ if (!record) {
6571
+ return c3.json({ error: "Eval not found" }, 404);
6572
+ }
6573
+ const baseDir = path9.dirname(meta.path);
6574
+ const knownPaths = [
6575
+ record.grading_path,
6576
+ record.timing_path,
6577
+ record.input_path,
6578
+ record.output_path,
6579
+ record.response_path
6580
+ ].filter((p) => !!p);
6581
+ if (knownPaths.length === 0) {
6582
+ return c3.json({ files: [] });
6583
+ }
6584
+ const artifactDirs = knownPaths.map((p) => path9.dirname(p));
6585
+ let commonDir = artifactDirs[0];
6586
+ for (const dir of artifactDirs) {
6587
+ while (!dir.startsWith(commonDir)) {
6588
+ commonDir = path9.dirname(commonDir);
6589
+ }
6590
+ }
6591
+ const artifactAbsDir = path9.join(baseDir, commonDir);
6592
+ const files = buildFileTree(artifactAbsDir, baseDir);
6593
+ return c3.json({ files });
6594
+ } catch {
6595
+ return c3.json({ error: "Failed to load file tree" }, 500);
6794
6596
  }
6795
-
6796
- /* prevent textarea clicks from toggling row */
6797
- var tas=tbody.querySelectorAll(".feedback-input");
6798
- for(var k=0;k<tas.length;k++){
6799
- tas[k].addEventListener("click",function(ev){ev.stopPropagation();});
6597
+ });
6598
+ app2.get("/api/runs/:filename/evals/:evalId/files/*", (c3) => {
6599
+ const filename = c3.req.param("filename");
6600
+ const evalId = c3.req.param("evalId");
6601
+ const metas = listResultFiles(searchDir);
6602
+ const meta = metas.find((m) => m.filename === filename);
6603
+ if (!meta) {
6604
+ return c3.json({ error: "Run not found" }, 404);
6800
6605
  }
6801
-
6802
- populateFeedbackTextareas();
6803
- }
6804
-
6805
- /* ---- detail panel ---- */
6806
- function renderDetail(r){
6807
- var h='<div class="detail-panel">';
6808
-
6809
- /* input / output */
6810
- h+='<div class="detail-grid">';
6811
- if(r.input!=null){
6812
- h+='<div class="detail-block"><h4>Input</h4><pre class="detail-pre">'+esc(JSON.stringify(r.input,null,2))+"</pre></div>";
6606
+ const requestPath = c3.req.path;
6607
+ const prefix = `/api/runs/${filename}/evals/${evalId}/files/`;
6608
+ const filePath = requestPath.slice(prefix.length);
6609
+ if (!filePath) {
6610
+ return c3.json({ error: "No file path specified" }, 400);
6813
6611
  }
6814
- h+='<div class="detail-block"><h4>Output</h4><pre class="detail-pre">'+esc(r.output?JSON.stringify(r.output,null,2):"")+"</pre></div>";
6815
- h+="</div>";
6816
-
6817
- /* evaluator results */
6818
- if(r.scores&&r.scores.length>0){
6819
- h+="<h4>Evaluator Results</h4>";
6820
- h+='<table class="eval-table"><thead><tr><th>Evaluator</th><th>Score</th><th>Status</th><th>Assertions</th></tr></thead><tbody>';
6821
- for(var i=0;i<r.scores.length;i++){
6822
- var ev=r.scores[i],evS=ev.score>=0.5?"pass":"fail";
6823
- var evAssertions=ev.assertions||[];
6824
- var evSummary=evAssertions.map(function(a){return (a.passed?"\\u2713 ":"\\u2717 ")+a.text;}).join("; ");
6825
- h+="<tr><td class=\\"fw-medium\\">"+esc(ev.name)+'</td><td class="'+sCls(ev.score)+'">'+fmtPct(ev.score)+"</td><td>"+sIcon(evS)+'</td><td class="reasoning-cell">'+esc(evSummary)+"</td></tr>";
6826
- }
6827
- h+="</tbody></table>";
6612
+ const baseDir = path9.dirname(meta.path);
6613
+ const absolutePath = path9.resolve(baseDir, filePath);
6614
+ if (!absolutePath.startsWith(path9.resolve(baseDir) + path9.sep) && absolutePath !== path9.resolve(baseDir)) {
6615
+ return c3.json({ error: "Path traversal not allowed" }, 403);
6828
6616
  }
6829
-
6830
- /* assertions */
6831
- var passedA=r.assertions?r.assertions.filter(function(a){return a.passed;}):[];
6832
- var failedA=r.assertions?r.assertions.filter(function(a){return !a.passed;}):[];
6833
- if(passedA.length>0){
6834
- h+='<h4>Passed Assertions</h4><ul class="expect-list pass">';
6835
- for(var i=0;i<passedA.length;i++)h+="<li>"+esc(passedA[i].text)+(passedA[i].evidence?" <span class=\\"reasoning-cell\\">("+esc(passedA[i].evidence)+")</span>":"")+"</li>";
6836
- h+="</ul>";
6837
- }
6838
- if(failedA.length>0){
6839
- h+='<h4>Failed Assertions</h4><ul class="expect-list fail">';
6840
- for(var i=0;i<failedA.length;i++)h+="<li>"+esc(failedA[i].text)+(failedA[i].evidence?" <span class=\\"reasoning-cell\\">("+esc(failedA[i].evidence)+")</span>":"")+"</li>";
6841
- h+="</ul>";
6617
+ if (!existsSync7(absolutePath) || !statSync4(absolutePath).isFile()) {
6618
+ return c3.json({ error: "File not found" }, 404);
6842
6619
  }
6843
-
6844
- /* tool calls */
6845
- if(r._toolCalls){
6846
- var tc=r._toolCalls,tcArr=[];
6847
- for(var k in tc)tcArr.push({name:k,count:tc[k]});
6848
- tcArr.sort(function(a,b){return b.count-a.count;});
6849
- h+='<h4>Tool Calls</h4><div class="tool-calls">';
6850
- for(var i=0;i<tcArr.length;i++)h+='<span class="tool-tag">'+esc(tcArr[i].name)+": "+tcArr[i].count+"</span>";
6851
- h+="</div>";
6620
+ try {
6621
+ const fileContent = readFileSync8(absolutePath, "utf8");
6622
+ const language = inferLanguage(absolutePath);
6623
+ return c3.json({ content: fileContent, language });
6624
+ } catch {
6625
+ return c3.json({ error: "Failed to read file" }, 500);
6852
6626
  }
6853
-
6854
- /* error */
6855
- if(r.error)h+='<div class="error-box"><h4>Error</h4><pre>'+esc(r.error)+"</pre></div>";
6856
-
6857
- /* metadata */
6858
- h+='<div class="detail-meta">';
6859
- var m=[];
6860
- if(r.tokenUsage)m.push(fmtTok(r.tokenUsage.input)+" in / "+fmtTok(r.tokenUsage.output)+" out tokens");
6861
- if(r.durationMs){
6862
- if(r._graderDurationMs>0){
6863
- var execMs=r.durationMs-r._graderDurationMs;
6864
- m.push(fmtDur(execMs>0?execMs:0)+" executor + "+fmtDur(r._graderDurationMs)+" grader");
6865
- }else{
6866
- m.push(fmtDur(r.durationMs));
6867
- }
6868
- }
6869
- if(r.target)m.push(r.target);
6870
- if(r.costUsd)m.push(fmtCost(r.costUsd));
6871
- if(r.timestamp)m.push(r.timestamp);
6872
- h+=esc(m.join(" \\u00B7 "));
6873
- h+="</div>";
6874
-
6875
- /* feedback section */
6876
- var tid=r.testId||"";
6877
- var existingComment=feedbackCache[tid]||"";
6878
- h+='<div class="feedback-section">';
6879
- h+='<h4>Feedback</h4>';
6880
- h+='<textarea class="feedback-input" data-test-id="'+esc(tid)+'" placeholder="Add feedback for this test..." onclick="event.stopPropagation()">'+esc(existingComment)+'</textarea>';
6881
- h+='<div style="display:flex;align-items:center">';
6882
- h+='<button class="feedback-submit" data-test-id="'+esc(tid)+'">Save Feedback</button>';
6883
- h+='<span class="feedback-status"></span>';
6884
- h+='</div></div>';
6885
-
6886
- h+="</div>";
6887
- return h;
6888
- }
6889
-
6890
- /* ---- run picker ---- */
6891
- var runPicker=document.getElementById("run-picker");
6892
- var knownRunFilenames=[];
6893
-
6894
- function refreshRunList(){
6895
- fetch("/api/runs").then(function(r){return r.json();}).then(function(d){
6896
- if(!d||!d.runs)return;
6897
- var runs=d.runs;
6898
- var newFilenames=runs.map(function(r){return r.filename;});
6899
-
6900
- /* Detect new runs that appeared since last poll */
6901
- if(knownRunFilenames.length>0){
6902
- var hasNew=newFilenames.some(function(f){return knownRunFilenames.indexOf(f)===-1;});
6903
- if(hasNew&&DATA.length===0){
6904
- /* Auto-load the first (most recent) run when starting from empty state */
6905
- loadRun(runs[0].filename);
6627
+ });
6628
+ app2.get("/api/experiments", (c3) => {
6629
+ const metas = listResultFiles(searchDir);
6630
+ const experimentMap = /* @__PURE__ */ new Map();
6631
+ for (const m of metas) {
6632
+ try {
6633
+ const records = loadLightweightResults(m.path);
6634
+ for (const r of records) {
6635
+ const experiment = r.experiment ?? "default";
6636
+ const entry = experimentMap.get(experiment) ?? {
6637
+ targets: /* @__PURE__ */ new Set(),
6638
+ runFilenames: /* @__PURE__ */ new Set(),
6639
+ evalCount: 0,
6640
+ passedCount: 0,
6641
+ lastTimestamp: ""
6642
+ };
6643
+ entry.runFilenames.add(m.filename);
6644
+ if (r.target) entry.targets.add(r.target);
6645
+ entry.evalCount++;
6646
+ if (r.score >= 1) entry.passedCount++;
6647
+ if (r.timestamp && r.timestamp > entry.lastTimestamp) {
6648
+ entry.lastTimestamp = r.timestamp;
6649
+ }
6650
+ experimentMap.set(experiment, entry);
6906
6651
  }
6652
+ } catch {
6907
6653
  }
6908
- knownRunFilenames=newFilenames;
6909
-
6910
- /* Rebuild picker options */
6911
- var h='<option value="">Select a result file...</option>';
6912
- if(runs.length===0){
6913
- h='<option value="">No result files</option>';
6914
- }
6915
- for(var i=0;i<runs.length;i++){
6916
- var r=runs[i];
6917
- var label=r.filename+" ("+r.test_count+" tests, "+(r.pass_rate*100).toFixed(0)+"% pass)";
6918
- h+='<option value="'+esc(r.filename)+'">'+esc(label)+"</option>";
6654
+ }
6655
+ const experiments = [...experimentMap.entries()].map(([name, entry]) => ({
6656
+ name,
6657
+ run_count: entry.runFilenames.size,
6658
+ target_count: entry.targets.size,
6659
+ eval_count: entry.evalCount,
6660
+ passed_count: entry.passedCount,
6661
+ pass_rate: entry.evalCount > 0 ? entry.passedCount / entry.evalCount : 0,
6662
+ last_run: entry.lastTimestamp || null
6663
+ }));
6664
+ return c3.json({ experiments });
6665
+ });
6666
+ app2.get("/api/targets", (c3) => {
6667
+ const metas = listResultFiles(searchDir);
6668
+ const targetMap = /* @__PURE__ */ new Map();
6669
+ for (const m of metas) {
6670
+ try {
6671
+ const records = loadLightweightResults(m.path);
6672
+ for (const r of records) {
6673
+ const target = r.target ?? "default";
6674
+ const entry = targetMap.get(target) ?? {
6675
+ experiments: /* @__PURE__ */ new Set(),
6676
+ runFilenames: /* @__PURE__ */ new Set(),
6677
+ evalCount: 0,
6678
+ passedCount: 0
6679
+ };
6680
+ entry.runFilenames.add(m.filename);
6681
+ if (r.experiment) entry.experiments.add(r.experiment);
6682
+ entry.evalCount++;
6683
+ if (r.score >= 1) entry.passedCount++;
6684
+ targetMap.set(target, entry);
6685
+ }
6686
+ } catch {
6919
6687
  }
6920
- runPicker.innerHTML=h;
6921
- /* Pre-select the initially loaded run */
6922
- if(INITIAL_SOURCE&&runs.length>0){
6923
- runPicker.value=INITIAL_SOURCE;
6688
+ }
6689
+ const targets = [...targetMap.entries()].map(([name, entry]) => ({
6690
+ name,
6691
+ run_count: entry.runFilenames.size,
6692
+ experiment_count: entry.experiments.size,
6693
+ eval_count: entry.evalCount,
6694
+ passed_count: entry.passedCount,
6695
+ pass_rate: entry.evalCount > 0 ? entry.passedCount / entry.evalCount : 0
6696
+ }));
6697
+ return c3.json({ targets });
6698
+ });
6699
+ app2.get("/assets/*", (c3) => {
6700
+ const assetPath = c3.req.path;
6701
+ const filePath = path9.join(studioDistPath, assetPath);
6702
+ if (!existsSync7(filePath)) {
6703
+ return c3.notFound();
6704
+ }
6705
+ const content = readFileSync8(filePath);
6706
+ const ext = path9.extname(filePath);
6707
+ const mimeTypes = {
6708
+ ".js": "application/javascript",
6709
+ ".css": "text/css",
6710
+ ".html": "text/html",
6711
+ ".json": "application/json",
6712
+ ".svg": "image/svg+xml",
6713
+ ".png": "image/png",
6714
+ ".woff2": "font/woff2",
6715
+ ".woff": "font/woff"
6716
+ };
6717
+ const contentType = mimeTypes[ext] ?? "application/octet-stream";
6718
+ return new Response(content, {
6719
+ headers: {
6720
+ "Content-Type": contentType,
6721
+ "Cache-Control": "public, max-age=31536000, immutable"
6924
6722
  }
6925
- }).catch(function(err){console.warn("Failed to refresh run list:",err);});
6926
- }
6927
-
6928
- function loadRun(filename){
6929
- fetch("/api/runs/"+encodeURIComponent(filename)).then(function(r){return r.json();}).then(function(d){
6930
- if(d.error){console.error(d.error);return;}
6931
- DATA=d.results;
6932
- stats=computeStats(DATA);
6933
- tgtStats=computeTargets(DATA);
6934
- tgtNames=tgtStats.map(function(t){return t.target;});
6935
- state.expanded={};
6936
- feedbackCache={};
6937
- loadFeedback();
6938
- render();
6939
- /* Update picker selection */
6940
- runPicker.value=filename;
6941
- }).catch(function(err){console.error("Failed to load run:",err);});
6723
+ });
6724
+ });
6725
+ app2.get("*", (c3) => {
6726
+ if (c3.req.path.startsWith("/api/")) {
6727
+ return c3.json({ error: "Not found" }, 404);
6728
+ }
6729
+ const indexPath = path9.join(studioDistPath, "index.html");
6730
+ if (existsSync7(indexPath)) {
6731
+ return c3.html(readFileSync8(indexPath, "utf8"));
6732
+ }
6733
+ return c3.notFound();
6734
+ });
6735
+ return app2;
6736
+ }
6737
+ function resolveStudioDistDir() {
6738
+ const currentDir = typeof __dirname !== "undefined" ? __dirname : path9.dirname(fileURLToPath2(import.meta.url));
6739
+ const candidates = [
6740
+ // From src/commands/results/ → sibling apps/studio/dist
6741
+ path9.resolve(currentDir, "../../../../studio/dist"),
6742
+ // From dist/ → sibling apps/studio/dist (monorepo dev)
6743
+ path9.resolve(currentDir, "../../studio/dist"),
6744
+ // Bundled inside CLI dist (published package)
6745
+ path9.resolve(currentDir, "../studio"),
6746
+ // From dist/ in monorepo root context
6747
+ path9.resolve(currentDir, "../../../apps/studio/dist")
6748
+ ];
6749
+ for (const candidate of candidates) {
6750
+ if (existsSync7(candidate) && existsSync7(path9.join(candidate, "index.html"))) {
6751
+ return candidate;
6752
+ }
6942
6753
  }
6943
-
6944
- runPicker.addEventListener("change",function(){
6945
- var val=runPicker.value;
6946
- if(val)loadRun(val);
6754
+ return void 0;
6755
+ }
6756
+ function stripHeavyFields(results) {
6757
+ return results.map((r) => {
6758
+ const { requests, trace, ...rest } = r;
6759
+ const toolCalls = trace?.toolCalls && Object.keys(trace.toolCalls).length > 0 ? trace.toolCalls : void 0;
6760
+ const graderDurationMs = (r.scores ?? []).reduce((sum, s) => sum + (s.durationMs ?? 0), 0);
6761
+ return {
6762
+ ...rest,
6763
+ ...toolCalls && { _toolCalls: toolCalls },
6764
+ ...graderDurationMs > 0 && { _graderDurationMs: graderDurationMs }
6765
+ };
6947
6766
  });
6948
-
6949
- /* Poll for new result files every 5 seconds */
6950
- refreshRunList();
6951
- setInterval(refreshRunList,5000);
6952
-
6953
- /* ---- init ---- */
6954
- loadFeedback();
6955
- render();
6956
- })();
6957
- `;
6767
+ }
6958
6768
  var resultsServeCommand = command({
6959
- name: "serve",
6960
- description: "Start a local HTTP server to review evaluation results",
6769
+ name: "studio",
6770
+ description: "Start AgentV Studio \u2014 a local dashboard for reviewing evaluation results",
6961
6771
  args: {
6962
6772
  source: positional({
6963
6773
  type: optional(string),
@@ -7594,7 +7404,7 @@ function formatResultDetail(result, index, tree) {
7594
7404
  }
7595
7405
  const scoreColor = result.score >= 0.9 ? c2.green : result.score >= 0.5 ? c2.yellow : c2.red;
7596
7406
  lines.push(
7597
- `${c2.bold}${testId}${c2.reset} ${scoreColor}${formatScore(result.score)}${c2.reset}${result.target ? ` ${c2.dim}target: ${result.target}${c2.reset}` : ""}${result.eval_set ? ` ${c2.dim}eval-set: ${result.eval_set}${c2.reset}` : ""}`
7407
+ `${c2.bold}${testId}${c2.reset} ${scoreColor}${formatScore(result.score)}${c2.reset}${result.target ? ` ${c2.dim}target: ${result.target}${c2.reset}` : ""}${result.dataset ? ` ${c2.dim}dataset: ${result.dataset}${c2.reset}` : ""}`
7598
7408
  );
7599
7409
  if (result.error) {
7600
7410
  lines.push(` ${c2.red}Error: ${result.error}${c2.reset}`);
@@ -7768,8 +7578,8 @@ function groupResults(results, groupBy2) {
7768
7578
  case "target":
7769
7579
  key = result.target ?? "unknown";
7770
7580
  break;
7771
- case "eval-set":
7772
- key = result.eval_set ?? "unknown";
7581
+ case "dataset":
7582
+ key = result.dataset ?? "unknown";
7773
7583
  break;
7774
7584
  case "test-id":
7775
7585
  key = result.test_id ?? result.eval_id ?? "unknown";
@@ -8482,7 +8292,7 @@ var app = subcommands({
8482
8292
  pipeline: pipelineCommand,
8483
8293
  results: resultsCommand,
8484
8294
  self: selfCommand,
8485
- serve: resultsServeCommand,
8295
+ studio: resultsServeCommand,
8486
8296
  trace: traceCommand,
8487
8297
  transpile: transpileCommand,
8488
8298
  trim: trimCommand,
@@ -8500,7 +8310,7 @@ var TOP_LEVEL_COMMANDS = /* @__PURE__ */ new Set([
8500
8310
  "pipeline",
8501
8311
  "results",
8502
8312
  "self",
8503
- "serve",
8313
+ "studio",
8504
8314
  "trace",
8505
8315
  "transpile",
8506
8316
  "trim",
@@ -8547,4 +8357,4 @@ export {
8547
8357
  preprocessArgv,
8548
8358
  runCli
8549
8359
  };
8550
- //# sourceMappingURL=chunk-CQRWNXVG.js.map
8360
+ //# sourceMappingURL=chunk-2W5JKKXC.js.map