agentv 3.4.0 → 3.6.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -16,7 +16,7 @@ import {
16
16
  validateEvalFile,
17
17
  validateFileReferences,
18
18
  validateTargetsFile
19
- } from "./chunk-RE5I3U2S.js";
19
+ } from "./chunk-UU5N43YS.js";
20
20
  import {
21
21
  createBuiltinRegistry,
22
22
  createProvider,
@@ -34,7 +34,7 @@ import {
34
34
  toSnakeCaseDeep as toSnakeCaseDeep2,
35
35
  transpileEvalYamlFile,
36
36
  trimBaselineResult
37
- } from "./chunk-GOZV2HN2.js";
37
+ } from "./chunk-K4RXLQWV.js";
38
38
  import {
39
39
  __commonJS,
40
40
  __esm,
@@ -3493,9 +3493,9 @@ var ASSERTION_TEMPLATES = {
3493
3493
  default: `#!/usr/bin/env bun
3494
3494
  import { defineAssertion } from '@agentv/eval';
3495
3495
 
3496
- export default defineAssertion(({ answer }) => {
3496
+ export default defineAssertion(({ outputText }) => {
3497
3497
  // TODO: Implement your assertion logic
3498
- const pass = answer.length > 0;
3498
+ const pass = outputText.length > 0;
3499
3499
  return {
3500
3500
  pass,
3501
3501
  reasoning: pass ? 'Output has content' : 'Output is empty',
@@ -3505,9 +3505,9 @@ export default defineAssertion(({ answer }) => {
3505
3505
  score: `#!/usr/bin/env bun
3506
3506
  import { defineAssertion } from '@agentv/eval';
3507
3507
 
3508
- export default defineAssertion(({ answer }) => {
3508
+ export default defineAssertion(({ outputText }) => {
3509
3509
  // TODO: Implement your scoring logic (0.0 to 1.0)
3510
- const score = answer.length > 0 ? 1.0 : 0.0;
3510
+ const score = outputText.length > 0 ? 1.0 : 0.0;
3511
3511
  return {
3512
3512
  pass: score >= 0.5,
3513
3513
  score,
@@ -3967,7 +3967,6 @@ var evalAssertCommand = command({
3967
3967
  }
3968
3968
  const payload = JSON.stringify(
3969
3969
  {
3970
- answer: resolvedOutput,
3971
3970
  output: [{ role: "assistant", content: resolvedOutput }],
3972
3971
  input: [{ role: "user", content: resolvedInput }],
3973
3972
  question: resolvedInput,
@@ -4185,7 +4184,7 @@ var evalRunCommand = command({
4185
4184
  },
4186
4185
  handler: async (args) => {
4187
4186
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
4188
- const { launchInteractiveWizard } = await import("./interactive-WXXTZ7PD.js");
4187
+ const { launchInteractiveWizard } = await import("./interactive-5S4ILY2Y.js");
4189
4188
  await launchInteractiveWizard();
4190
4189
  return;
4191
4190
  }
@@ -4707,10 +4706,10 @@ function exportResults(sourceFile, content, outputDir) {
4707
4706
  const outputsDir = path8.join(outputDir, "outputs");
4708
4707
  mkdirSync2(outputsDir, { recursive: true });
4709
4708
  for (const result of patched) {
4710
- const answer = result.answer;
4711
- if (answer) {
4709
+ const outputText = result.outputText;
4710
+ if (outputText) {
4712
4711
  const id = safeTestId(result);
4713
- writeFileSync3(path8.join(outputsDir, `${id}.txt`), answer);
4712
+ writeFileSync3(path8.join(outputsDir, `${id}.txt`), outputText);
4714
4713
  }
4715
4714
  }
4716
4715
  }
@@ -5022,7 +5021,7 @@ function toTraceSummary(raw) {
5022
5021
  return toCamelCaseDeep(raw.trace);
5023
5022
  }
5024
5023
  function extractCandidate(raw) {
5025
- if (raw.answer !== void 0) return raw.answer;
5024
+ if (raw.output_text !== void 0) return raw.output_text;
5026
5025
  if (raw.output !== void 0)
5027
5026
  return typeof raw.output === "string" ? raw.output : JSON.stringify(raw.output);
5028
5027
  return "";
@@ -5089,9 +5088,7 @@ async function runScore(results, evaluatorConfig, testIdFilter) {
5089
5088
  originalScore: raw.score,
5090
5089
  newScore: score.score,
5091
5090
  verdict: score.verdict,
5092
- hits: score.hits,
5093
- misses: score.misses,
5094
- reasoning: score.reasoning
5091
+ assertions: score.assertions
5095
5092
  });
5096
5093
  }
5097
5094
  return scored;
@@ -5110,7 +5107,9 @@ function renderTable(scored, assertSpec) {
5110
5107
  lines.push(cols.map((col) => "\u2500".repeat(col.width)).join("\u2500\u2500"));
5111
5108
  for (const r of scored) {
5112
5109
  const verdictColor = r.verdict === "pass" ? c2.green : c2.red;
5113
- const detail = r.misses.length > 0 ? r.misses[0].slice(0, 48) : r.hits.length > 0 ? r.hits[0].slice(0, 48) : r.reasoning?.slice(0, 48) ?? "";
5110
+ const failed = r.assertions.filter((a) => !a.passed);
5111
+ const passed = r.assertions.filter((a) => a.passed);
5112
+ const detail = failed.length > 0 ? failed[0].text.slice(0, 48) : passed.length > 0 ? passed[0].text.slice(0, 48) : "";
5114
5113
  const row = [
5115
5114
  padRight2(r.testId.slice(0, 24), cols[0].width),
5116
5115
  padLeft2(formatScore(r.originalScore), cols[1].width),
@@ -5332,11 +5331,17 @@ function formatResultDetail(result, index, tree) {
5332
5331
  if (result.error) {
5333
5332
  lines.push(` ${c2.red}Error: ${result.error}${c2.reset}`);
5334
5333
  }
5335
- if (result.hits && result.hits.length > 0) {
5336
- lines.push(` ${c2.green}\u2713 Hits:${c2.reset} ${result.hits.join(", ")}`);
5337
- }
5338
- if (result.misses && result.misses.length > 0) {
5339
- lines.push(` ${c2.red}\u2717 Misses:${c2.reset} ${result.misses.join(", ")}`);
5334
+ if (result.assertions && result.assertions.length > 0) {
5335
+ const passed = result.assertions.filter((a) => a.passed);
5336
+ const failed = result.assertions.filter((a) => !a.passed);
5337
+ if (passed.length > 0)
5338
+ lines.push(
5339
+ ` ${c2.green}\u2713 Passed:${c2.reset} ${passed.map((a) => a.text).join(", ")}`
5340
+ );
5341
+ if (failed.length > 0)
5342
+ lines.push(
5343
+ ` ${c2.red}\u2717 Failed:${c2.reset} ${failed.map((a) => a.text).join(", ")}`
5344
+ );
5340
5345
  }
5341
5346
  if (result.scores && result.scores.length > 0) {
5342
5347
  lines.push(` ${c2.dim}Scores:${c2.reset} ${renderScores(result.scores)}`);
@@ -5344,10 +5349,14 @@ function formatResultDetail(result, index, tree) {
5344
5349
  if (result.trace || result.duration_ms !== void 0 || result.cost_usd !== void 0) {
5345
5350
  lines.push(` ${c2.dim}Trace:${c2.reset} ${renderFlatTrace(result)}`);
5346
5351
  }
5347
- if (result.reasoning) {
5348
- const maxLen = 200;
5349
- const truncated = result.reasoning.length > maxLen ? `${result.reasoning.slice(0, maxLen)}...` : result.reasoning;
5350
- lines.push(` ${c2.dim}Reasoning: ${truncated}${c2.reset}`);
5352
+ if (result.assertions && result.assertions.length > 0) {
5353
+ const withEvidence = result.assertions.filter((a) => a.evidence);
5354
+ if (withEvidence.length > 0) {
5355
+ const maxLen = 200;
5356
+ const evidence = withEvidence[0].evidence;
5357
+ const truncated = evidence.length > maxLen ? `${evidence.slice(0, maxLen)}...` : evidence;
5358
+ lines.push(` ${c2.dim}Evidence: ${truncated}${c2.reset}`);
5359
+ }
5351
5360
  }
5352
5361
  return lines.join("\n");
5353
5362
  }
@@ -6268,4 +6277,4 @@ export {
6268
6277
  preprocessArgv,
6269
6278
  runCli
6270
6279
  };
6271
- //# sourceMappingURL=chunk-A7ZDUB46.js.map
6280
+ //# sourceMappingURL=chunk-IP5BO54H.js.map