agentv 3.8.0 → 3.9.1

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -164,7 +164,7 @@ For large-scale evaluations, AgentV supports JSONL (JSON Lines) format as an alt
164
164
  Optional sidecar YAML metadata file (`dataset.eval.yaml` alongside `dataset.jsonl`):
165
165
  ```yaml
166
166
  description: Math evaluation dataset
167
- dataset: math-tests
167
+ name: math-tests
168
168
  execution:
169
169
  target: azure-llm
170
170
  assertions:
@@ -16,7 +16,7 @@ import {
16
16
  validateEvalFile,
17
17
  validateFileReferences,
18
18
  validateTargetsFile
19
- } from "./chunk-ASYRKFAI.js";
19
+ } from "./chunk-FRA6PDLZ.js";
20
20
  import {
21
21
  createBuiltinRegistry,
22
22
  createProvider,
@@ -34,7 +34,7 @@ import {
34
34
  toSnakeCaseDeep as toSnakeCaseDeep2,
35
35
  transpileEvalYamlFile,
36
36
  trimBaselineResult
37
- } from "./chunk-F4UDJ7LG.js";
37
+ } from "./chunk-X24J6HCV.js";
38
38
  import {
39
39
  __commonJS,
40
40
  __esm,
@@ -3714,7 +3714,6 @@ async function getPromptEvalInput(evalPath, testId) {
3714
3714
  return {
3715
3715
  test_id: evalCase.id,
3716
3716
  input: resolveMessages(evalCase.input, fileMap),
3717
- guideline_paths: evalCase.guideline_paths,
3718
3717
  criteria: evalCase.criteria
3719
3718
  };
3720
3719
  }
@@ -3739,9 +3738,8 @@ async function getPromptEvalGradingBrief(evalPath, testId) {
3739
3738
  if (inputText) {
3740
3739
  lines.push(`Input: "${inputText}"`);
3741
3740
  }
3742
- const filePaths = evalCase.file_paths.filter((p) => !evalCase.guideline_paths.includes(p));
3743
- if (filePaths.length > 0) {
3744
- lines.push(`Files: ${filePaths.join(", ")}`);
3741
+ if (evalCase.file_paths.length > 0) {
3742
+ lines.push(`Files: ${evalCase.file_paths.join(", ")}`);
3745
3743
  }
3746
3744
  if (evalCase.reference_answer) {
3747
3745
  lines.push(`Expected: "${evalCase.reference_answer}"`);
@@ -3973,7 +3971,6 @@ var evalAssertCommand = command({
3973
3971
  criteria: "",
3974
3972
  expected_output: [],
3975
3973
  reference_answer: "",
3976
- guideline_files: [],
3977
3974
  input_files: [],
3978
3975
  trace: null,
3979
3976
  token_usage: null,
@@ -4189,7 +4186,7 @@ var evalRunCommand = command({
4189
4186
  },
4190
4187
  handler: async (args) => {
4191
4188
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
4192
- const { launchInteractiveWizard } = await import("./interactive-OPQGDF77.js");
4189
+ const { launchInteractiveWizard } = await import("./interactive-O7HENH55.js");
4193
4190
  await launchInteractiveWizard();
4194
4191
  return;
4195
4192
  }
@@ -5037,7 +5034,6 @@ function buildEvalTest(raw) {
5037
5034
  input: [],
5038
5035
  input_segments: [],
5039
5036
  expected_output: [],
5040
- guideline_paths: [],
5041
5037
  file_paths: [],
5042
5038
  criteria: ""
5043
5039
  };
@@ -5075,7 +5071,7 @@ async function runScore(results, evaluatorConfig, testIdFilter) {
5075
5071
  target: { kind: "custom", name: raw.target ?? "unknown", config: {} },
5076
5072
  provider: stubProvider,
5077
5073
  attempt: 1,
5078
- promptInputs: { question: "", guidelines: "" },
5074
+ promptInputs: { question: "" },
5079
5075
  now: /* @__PURE__ */ new Date(),
5080
5076
  output: Array.isArray(output) ? output : void 0,
5081
5077
  trace,
@@ -5329,7 +5325,7 @@ function formatResultDetail(result, index, tree) {
5329
5325
  }
5330
5326
  const scoreColor = result.score >= 0.9 ? c2.green : result.score >= 0.5 ? c2.yellow : c2.red;
5331
5327
  lines.push(
5332
- `${c2.bold}${testId}${c2.reset} ${scoreColor}${formatScore(result.score)}${c2.reset}${result.target ? ` ${c2.dim}target: ${result.target}${c2.reset}` : ""}${result.dataset ? ` ${c2.dim}dataset: ${result.dataset}${c2.reset}` : ""}`
5328
+ `${c2.bold}${testId}${c2.reset} ${scoreColor}${formatScore(result.score)}${c2.reset}${result.target ? ` ${c2.dim}target: ${result.target}${c2.reset}` : ""}${result.eval_set ? ` ${c2.dim}eval-set: ${result.eval_set}${c2.reset}` : ""}`
5333
5329
  );
5334
5330
  if (result.error) {
5335
5331
  lines.push(` ${c2.red}Error: ${result.error}${c2.reset}`);
@@ -5503,8 +5499,8 @@ function groupResults(results, groupBy2) {
5503
5499
  case "target":
5504
5500
  key = result.target ?? "unknown";
5505
5501
  break;
5506
- case "dataset":
5507
- key = result.dataset ?? "unknown";
5502
+ case "eval-set":
5503
+ key = result.eval_set ?? "unknown";
5508
5504
  break;
5509
5505
  case "test-id":
5510
5506
  key = result.test_id ?? result.eval_id ?? "unknown";
@@ -5586,10 +5582,10 @@ var traceStatsCommand = command({
5586
5582
  description: "Path to JSONL result file"
5587
5583
  }),
5588
5584
  groupBy: option({
5589
- type: optional(oneOf(["target", "dataset", "test-id"])),
5585
+ type: optional(oneOf(["target", "eval-set", "test-id"])),
5590
5586
  long: "group-by",
5591
5587
  short: "g",
5592
- description: "Group statistics by: target, dataset, or test-id"
5588
+ description: "Group statistics by: target, eval-set, or test-id"
5593
5589
  }),
5594
5590
  format: option({
5595
5591
  type: optional(oneOf(["table", "json"])),
@@ -6280,4 +6276,4 @@ export {
6280
6276
  preprocessArgv,
6281
6277
  runCli
6282
6278
  };
6283
- //# sourceMappingURL=chunk-YZRGQ6ZS.js.map
6279
+ //# sourceMappingURL=chunk-FNIEABNM.js.map