agentv 2.12.0 → 2.14.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@ import { createRequire } from 'node:module'; const require = createRequire(impor
2
2
  import {
3
3
  detectFileType,
4
4
  findRepoRoot,
5
+ package_default,
5
6
  resolveEvalPaths,
6
7
  runEvalCommand,
7
8
  selectTarget,
@@ -10,7 +11,7 @@ import {
10
11
  validateEvalFile,
11
12
  validateFileReferences,
12
13
  validateTargetsFile
13
- } from "./chunk-YBJX5CP6.js";
14
+ } from "./chunk-K2APOWTE.js";
14
15
  import {
15
16
  RepoManager,
16
17
  assembleLlmJudgePrompt,
@@ -25,7 +26,7 @@ import {
25
26
  toCamelCaseDeep,
26
27
  toSnakeCaseDeep,
27
28
  trimBaselineResult
28
- } from "./chunk-LUHCYBMD.js";
29
+ } from "./chunk-OQN2GDEU.js";
29
30
  import {
30
31
  __commonJS,
31
32
  __esm,
@@ -2875,56 +2876,6 @@ function oneOf(literals) {
2875
2876
  };
2876
2877
  }
2877
2878
 
2878
- // package.json
2879
- var package_default = {
2880
- name: "agentv",
2881
- version: "2.12.0",
2882
- description: "CLI entry point for AgentV",
2883
- type: "module",
2884
- repository: {
2885
- type: "git",
2886
- url: "https://github.com/EntityProcess/agentv.git"
2887
- },
2888
- homepage: "https://github.com/EntityProcess/agentv#readme",
2889
- bugs: {
2890
- url: "https://github.com/EntityProcess/agentv/issues"
2891
- },
2892
- bin: {
2893
- agentv: "./dist/cli.js"
2894
- },
2895
- files: ["dist", "README.md"],
2896
- scripts: {
2897
- dev: "bun src/cli.ts",
2898
- build: "tsup && bun run copy-readme",
2899
- "copy-readme": `bun -e "import { cpSync } from 'fs'; cpSync('../../README.md', 'README.md')"`,
2900
- prepublishOnly: "bun run copy-readme",
2901
- typecheck: "tsc --noEmit",
2902
- lint: "biome check .",
2903
- format: "biome format --write .",
2904
- fix: "biome check --write .",
2905
- test: "bun test",
2906
- "test:watch": "bun test --watch"
2907
- },
2908
- dependencies: {
2909
- "@anthropic-ai/claude-agent-sdk": "^0.2.49",
2910
- "@github/copilot-sdk": "^0.1.25",
2911
- "@inquirer/prompts": "^8.2.1",
2912
- "@mariozechner/pi-agent-core": "^0.54.2",
2913
- "@mariozechner/pi-ai": "^0.54.2",
2914
- "@openai/codex-sdk": "^0.104.0",
2915
- "cmd-ts": "^0.14.3",
2916
- dotenv: "^16.4.5",
2917
- "fast-glob": "^3.3.3",
2918
- json5: "^2.2.3",
2919
- micromatch: "^4.0.8",
2920
- yaml: "^2.6.1"
2921
- },
2922
- devDependencies: {
2923
- "@agentv/core": "workspace:*",
2924
- execa: "^9.3.0"
2925
- }
2926
- };
2927
-
2928
2879
  // src/commands/cache/add.ts
2929
2880
  import { existsSync } from "node:fs";
2930
2881
  import { join, resolve } from "node:path";
@@ -3533,7 +3484,7 @@ tests:
3533
3484
  input: "Hello, how are you?"
3534
3485
  expected_output: "I'm doing well, thank you for asking!"
3535
3486
  assert:
3536
- - type: llm_judge
3487
+ - type: llm-judge
3537
3488
  rubric:
3538
3489
  accuracy:
3539
3490
  weight: 0.6
@@ -3812,7 +3763,7 @@ var evalPromptJudgeCommand = command({
3812
3763
  });
3813
3764
  outputs.push({
3814
3765
  name: "default_llm_judge",
3815
- type: "llm_judge",
3766
+ type: "llm-judge",
3816
3767
  status: "prompt_ready",
3817
3768
  prompt: {
3818
3769
  system_prompt: assembly.systemPrompt,
@@ -3830,7 +3781,7 @@ var evalPromptJudgeCommand = command({
3830
3781
  });
3831
3782
  async function processEvaluator(config, evalCase, candidate, promptInputs) {
3832
3783
  switch (config.type) {
3833
- case "code": {
3784
+ case "code-judge": {
3834
3785
  const codeConfig = config;
3835
3786
  const script = codeConfig.command ?? codeConfig.script ?? [];
3836
3787
  const scriptCwd = codeConfig.resolvedCwd ?? codeConfig.cwd;
@@ -3855,14 +3806,14 @@ async function processEvaluator(config, evalCase, candidate, promptInputs) {
3855
3806
  const parsed = JSON.parse(stdout);
3856
3807
  return {
3857
3808
  name: codeConfig.name,
3858
- type: "code_judge",
3809
+ type: "code-judge",
3859
3810
  status: "completed",
3860
3811
  result: parsed
3861
3812
  };
3862
3813
  } catch (error) {
3863
3814
  return {
3864
3815
  name: codeConfig.name,
3865
- type: "code_judge",
3816
+ type: "code-judge",
3866
3817
  status: "completed",
3867
3818
  result: {
3868
3819
  score: 0,
@@ -3871,7 +3822,7 @@ async function processEvaluator(config, evalCase, candidate, promptInputs) {
3871
3822
  };
3872
3823
  }
3873
3824
  }
3874
- case "llm_judge": {
3825
+ case "llm-judge": {
3875
3826
  const llmConfig = config;
3876
3827
  const assembly = assembleLlmJudgePrompt({
3877
3828
  evalCase,
@@ -3881,7 +3832,7 @@ async function processEvaluator(config, evalCase, candidate, promptInputs) {
3881
3832
  });
3882
3833
  return {
3883
3834
  name: llmConfig.name,
3884
- type: "llm_judge",
3835
+ type: "llm-judge",
3885
3836
  status: "prompt_ready",
3886
3837
  prompt: {
3887
3838
  system_prompt: assembly.systemPrompt,
@@ -3947,7 +3898,7 @@ var evalPromptOverviewCommand = command({
3947
3898
  "",
3948
3899
  "The output contains an `evaluators` array. Each evaluator has a `status`:",
3949
3900
  "",
3950
- '- **`"completed"`** \u2014 Score is final (code_judge ran deterministically). Read `result.score` (0.0\u20131.0).',
3901
+ '- **`"completed"`** \u2014 Score is final (code-judge ran deterministically). Read `result.score` (0.0\u20131.0).',
3951
3902
  '- **`"prompt_ready"`** \u2014 LLM grading required. Send `prompt.system_prompt` as system and',
3952
3903
  " `prompt.user_prompt` as user to your LLM. Parse the JSON response to get `score`, `hits`, `misses`.",
3953
3904
  ""
@@ -4123,11 +4074,20 @@ var evalRunCommand = command({
4123
4074
  otelGroupTurns: flag({
4124
4075
  long: "otel-group-turns",
4125
4076
  description: "Group messages into turn spans for multi-turn evaluations (requires --export-otel)"
4077
+ }),
4078
+ retryErrors: option({
4079
+ type: optional(string),
4080
+ long: "retry-errors",
4081
+ description: "Path to previous output JSONL \u2014 re-run only execution_error test cases"
4082
+ }),
4083
+ strict: flag({
4084
+ long: "strict",
4085
+ description: "Exit with error on version mismatch (instead of warning)"
4126
4086
  })
4127
4087
  },
4128
4088
  handler: async (args) => {
4129
4089
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
4130
- const { launchInteractiveWizard } = await import("./interactive-TOUKPSHP.js");
4090
+ const { launchInteractiveWizard } = await import("./interactive-WF6UO63B.js");
4131
4091
  await launchInteractiveWizard();
4132
4092
  return;
4133
4093
  }
@@ -4157,7 +4117,9 @@ var evalRunCommand = command({
4157
4117
  exportOtel: args.exportOtel,
4158
4118
  otelBackend: args.otelBackend,
4159
4119
  otelCaptureContent: args.otelCaptureContent,
4160
- otelGroupTurns: args.otelGroupTurns
4120
+ otelGroupTurns: args.otelGroupTurns,
4121
+ retryErrors: args.retryErrors,
4122
+ strict: args.strict
4161
4123
  };
4162
4124
  await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
4163
4125
  }
@@ -4758,12 +4720,12 @@ var traceListCommand = command({
4758
4720
  var SUPPORTED_TYPES = [
4759
4721
  "contains",
4760
4722
  "regex",
4761
- "is_json",
4723
+ "is-json",
4762
4724
  "equals",
4763
4725
  "latency",
4764
4726
  "cost",
4765
- "token_usage",
4766
- "execution_metrics"
4727
+ "token-usage",
4728
+ "execution-metrics"
4767
4729
  ];
4768
4730
  function parseKeyValues(s) {
4769
4731
  const result = {};
@@ -4777,7 +4739,7 @@ function parseKeyValues(s) {
4777
4739
  }
4778
4740
  function parseAssertSpec(spec) {
4779
4741
  const colonIdx = spec.indexOf(":");
4780
- const type = colonIdx === -1 ? spec : spec.slice(0, colonIdx);
4742
+ const type = (colonIdx === -1 ? spec : spec.slice(0, colonIdx)).replace(/_/g, "-");
4781
4743
  const params = colonIdx === -1 ? "" : spec.slice(colonIdx + 1);
4782
4744
  switch (type) {
4783
4745
  case "contains":
@@ -4786,8 +4748,8 @@ function parseAssertSpec(spec) {
4786
4748
  case "regex":
4787
4749
  if (!params) throw new Error("regex requires a pattern: regex:<pattern>");
4788
4750
  return { name: "regex", type: "regex", value: params };
4789
- case "is_json":
4790
- return { name: "is_json", type: "is_json" };
4751
+ case "is-json":
4752
+ return { name: "is-json", type: "is-json" };
4791
4753
  case "equals":
4792
4754
  if (!params) throw new Error("equals requires a value: equals:<value>");
4793
4755
  return { name: "equals", type: "equals", value: params };
@@ -4803,19 +4765,19 @@ function parseAssertSpec(spec) {
4803
4765
  throw new Error("cost requires a budget in USD: cost:<usd>");
4804
4766
  return { name: "cost", type: "cost", budget };
4805
4767
  }
4806
- case "token_usage": {
4768
+ case "token-usage": {
4807
4769
  const kv = parseKeyValues(params);
4808
- const config = { name: "token_usage", type: "token_usage" };
4770
+ const config = { name: "token-usage", type: "token-usage" };
4809
4771
  if (kv.max_total) config.max_total = Number(kv.max_total);
4810
4772
  if (kv.max_input) config.max_input = Number(kv.max_input);
4811
4773
  if (kv.max_output) config.max_output = Number(kv.max_output);
4812
4774
  return config;
4813
4775
  }
4814
- case "execution_metrics": {
4776
+ case "execution-metrics": {
4815
4777
  const kv = parseKeyValues(params);
4816
4778
  const config = {
4817
- name: "execution_metrics",
4818
- type: "execution_metrics"
4779
+ name: "execution-metrics",
4780
+ type: "execution-metrics"
4819
4781
  };
4820
4782
  if (kv.max_tool_calls) config.max_tool_calls = Number(kv.max_tool_calls);
4821
4783
  if (kv.max_llm_calls) config.max_llm_calls = Number(kv.max_llm_calls);
@@ -4861,7 +4823,7 @@ var stubProvider = {
4861
4823
  }
4862
4824
  };
4863
4825
  var stubLlmJudge = {
4864
- kind: "llm_judge",
4826
+ kind: "llm-judge",
4865
4827
  evaluate() {
4866
4828
  throw new Error("trace score does not support LLM-based evaluators");
4867
4829
  }
@@ -4955,7 +4917,7 @@ var traceScoreCommand = command({
4955
4917
  type: string,
4956
4918
  long: "assert",
4957
4919
  short: "a",
4958
- description: "Evaluator spec: contains:<val>, regex:<pat>, is_json, equals:<val>, latency:<ms>, cost:<usd>, token_usage:<params>, execution_metrics:<params>"
4920
+ description: "Evaluator spec: contains:<val>, regex:<pat>, is-json, equals:<val>, latency:<ms>, cost:<usd>, token-usage:<params>, execution-metrics:<params>"
4959
4921
  }),
4960
4922
  testId: option({
4961
4923
  type: optional(string),
@@ -4990,7 +4952,7 @@ var traceScoreCommand = command({
4990
4952
  console.error(`${c2.yellow}Warning:${c2.reset} No results found in ${file}`);
4991
4953
  process.exit(0);
4992
4954
  }
4993
- const traceRequired = ["latency", "cost", "token_usage", "execution_metrics"].includes(
4955
+ const traceRequired = ["latency", "cost", "token-usage", "execution-metrics"].includes(
4994
4956
  evaluatorConfig.type
4995
4957
  );
4996
4958
  if (traceRequired) {
@@ -5831,4 +5793,4 @@ export {
5831
5793
  preprocessArgv,
5832
5794
  runCli
5833
5795
  };
5834
- //# sourceMappingURL=chunk-6KU2ZUFJ.js.map
5796
+ //# sourceMappingURL=chunk-ZSSGXZX6.js.map