agentv 2.8.0-next.1 → 2.9.0-next.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. package/README.md +28 -2
  2. package/dist/{chunk-RCFB5QFS.js → chunk-3INJ7ISP.js} +46 -85
  3. package/dist/chunk-3INJ7ISP.js.map +1 -0
  4. package/dist/{chunk-2SXGPQVR.js → chunk-PC3FAOHT.js} +4 -4
  5. package/dist/chunk-PC3FAOHT.js.map +1 -0
  6. package/dist/{chunk-DJCMBVB3.js → chunk-RJWTL3VS.js} +166 -75
  7. package/dist/chunk-RJWTL3VS.js.map +1 -0
  8. package/dist/cli.js +3 -3
  9. package/dist/{dist-T7REAXNS.js → dist-BGRU67HI.js} +2 -2
  10. package/dist/index.js +3 -3
  11. package/dist/{interactive-TE5SJPJW.js → interactive-7KFUCBIP.js} +3 -3
  12. package/dist/templates/.agents/skills/agentv-eval-builder/SKILL.md +1 -1
  13. package/dist/templates/.agents/skills/agentv-eval-builder/references/custom-evaluators.md +8 -8
  14. package/dist/templates/.agents/skills/agentv-eval-builder/references/eval-schema.json +12683 -237
  15. package/dist/templates/.agentv/config.yaml +1 -1
  16. package/dist/templates/.agentv/targets.yaml +10 -13
  17. package/package.json +2 -2
  18. package/dist/chunk-2SXGPQVR.js.map +0 -1
  19. package/dist/chunk-DJCMBVB3.js.map +0 -1
  20. package/dist/chunk-RCFB5QFS.js.map +0 -1
  21. package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +0 -202
  22. package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md +0 -316
  23. package/dist/templates/.claude/skills/agentv-eval-builder/references/compare-command.md +0 -137
  24. package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +0 -215
  25. package/dist/templates/.claude/skills/agentv-eval-builder/references/config-schema.json +0 -27
  26. package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +0 -118
  27. package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +0 -278
  28. package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +0 -333
  29. package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +0 -77
  30. package/dist/templates/.claude/skills/agentv-eval-builder/references/structured-data-evaluators.md +0 -121
  31. package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +0 -298
  32. package/dist/templates/.claude/skills/agentv-prompt-optimizer/SKILL.md +0 -78
  33. package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +0 -5
  34. package/dist/templates/.github/prompts/agentv-optimize.prompt.md +0 -4
  35. /package/dist/{dist-T7REAXNS.js.map → dist-BGRU67HI.js.map} +0 -0
  36. /package/dist/{interactive-TE5SJPJW.js.map → interactive-7KFUCBIP.js.map} +0 -0
package/README.md CHANGED
@@ -6,6 +6,29 @@ AgentV evaluates your agents locally with multi-objective scoring (correctness,
6
6
 
7
7
  ## Installation
8
8
 
9
+ ### All Agents Plugin Manager
10
+
11
+ **1. Add AgentV marketplace source:**
12
+ ```bash
13
+ npx allagents plugin marketplace add EntityProcess/agentv
14
+ ```
15
+
16
+ **2. Ask Claude to set up AgentV in your current repository**
17
+ Example prompt:
18
+ ```text
19
+ Set up AgentV in this repo.
20
+ ```
21
+
22
+ The `agentv-onboarding` skill bootstraps setup automatically:
23
+ - verifies `agentv` CLI availability
24
+ - installs the CLI if needed
25
+ - runs `agentv init`
26
+ - verifies setup artifacts
27
+
28
+ ### CLI-Only Setup (Fallback)
29
+
30
+ If you are not using Claude plugins, use the CLI directly.
31
+
9
32
  **1. Install:**
10
33
  ```bash
11
34
  npm install -g agentv
@@ -54,7 +77,7 @@ Learn more in the [examples/](examples/README.md) directory. For a detailed comp
54
77
 
55
78
  | Feature | AgentV | [LangWatch](https://github.com/langwatch/langwatch) | [LangSmith](https://github.com/langchain-ai/langsmith-sdk) | [LangFuse](https://github.com/langfuse/langfuse) |
56
79
  |---------|--------|-----------|-----------|----------|
57
- | **Setup** | `npm install` | Cloud account + API key | Cloud account + API key | Cloud account + API key |
80
+ | **Setup** | `npm install agentv` | Cloud account + API key | Cloud account + API key | Cloud account + API key |
58
81
  | **Server** | None (local) | Managed cloud | Managed cloud | Managed cloud |
59
82
  | **Privacy** | All local | Cloud-hosted | Cloud-hosted | Cloud-hosted |
60
83
  | **CLI-first** | ✓ | ✗ | Limited | Limited |
@@ -132,7 +155,10 @@ description: Math evaluation dataset
132
155
  dataset: math-tests
133
156
  execution:
134
157
  target: azure_base
135
- evaluator: llm_judge
158
+ assert:
159
+ - name: correctness
160
+ type: llm_judge
161
+ prompt: ./judges/correctness.md
136
162
  ```
137
163
 
138
164
  Benefits: Streaming-friendly, Git-friendly diffs, programmatic generation, industry standard (DeepEval, LangWatch, Hugging Face).
@@ -10,7 +10,7 @@ import {
10
10
  validateEvalFile,
11
11
  validateFileReferences,
12
12
  validateTargetsFile
13
- } from "./chunk-2SXGPQVR.js";
13
+ } from "./chunk-PC3FAOHT.js";
14
14
  import {
15
15
  assembleLlmJudgePrompt,
16
16
  buildPromptInputs,
@@ -24,7 +24,7 @@ import {
24
24
  toCamelCaseDeep,
25
25
  toSnakeCaseDeep as toSnakeCaseDeep2,
26
26
  trimBaselineResult
27
- } from "./chunk-DJCMBVB3.js";
27
+ } from "./chunk-RJWTL3VS.js";
28
28
  import {
29
29
  __commonJS,
30
30
  __esm,
@@ -2877,7 +2877,7 @@ function oneOf(literals) {
2877
2877
  // package.json
2878
2878
  var package_default = {
2879
2879
  name: "agentv",
2880
- version: "2.8.0-next.1",
2880
+ version: "2.9.0-next.2",
2881
2881
  description: "CLI entry point for AgentV",
2882
2882
  type: "module",
2883
2883
  repository: {
@@ -4042,7 +4042,7 @@ var evalRunCommand = command({
4042
4042
  },
4043
4043
  handler: async (args) => {
4044
4044
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
4045
- const { launchInteractiveWizard } = await import("./interactive-TE5SJPJW.js");
4045
+ const { launchInteractiveWizard } = await import("./interactive-7KFUCBIP.js");
4046
4046
  await launchInteractiveWizard();
4047
4047
  return;
4048
4048
  }
@@ -4271,26 +4271,6 @@ import { fileURLToPath } from "node:url";
4271
4271
  function getAgentvTemplates() {
4272
4272
  return getTemplatesFromDir(".agentv");
4273
4273
  }
4274
- function getAgentsTemplates() {
4275
- if (isDistRuntime()) {
4276
- return getTemplatesFromDir(".agents");
4277
- }
4278
- const repoRoot = getRepoRootFromDev();
4279
- const skillsRoot = path4.join(repoRoot, "plugins", "agentv-dev", "skills");
4280
- const skillsToInclude = [
4281
- "agentv-chat-to-eval",
4282
- "agentv-eval-builder",
4283
- "agentv-eval-orchestrator",
4284
- "agentv-prompt-optimizer"
4285
- ];
4286
- const templates = [];
4287
- for (const skill of skillsToInclude) {
4288
- const skillDir = path4.join(skillsRoot, skill);
4289
- const skillTemplates = readTemplatesRecursively(skillDir, path4.join("skills", skill));
4290
- templates.push(...skillTemplates);
4291
- }
4292
- return templates;
4293
- }
4294
4274
  function getTemplatesFromDir(subdir) {
4295
4275
  const currentDir = path4.dirname(fileURLToPath(import.meta.url));
4296
4276
  let templatesDir;
@@ -4301,14 +4281,6 @@ function getTemplatesFromDir(subdir) {
4301
4281
  }
4302
4282
  return readTemplatesRecursively(templatesDir, "");
4303
4283
  }
4304
- function isDistRuntime() {
4305
- const currentDir = path4.dirname(fileURLToPath(import.meta.url));
4306
- return currentDir.includes(`${path4.sep}dist`);
4307
- }
4308
- function getRepoRootFromDev() {
4309
- const currentDir = path4.dirname(fileURLToPath(import.meta.url));
4310
- return path4.resolve(currentDir, "..", "..", "..", "..");
4311
- }
4312
4284
  function readTemplatesRecursively(dir, relativePath) {
4313
4285
  const templates = [];
4314
4286
  const entries2 = readdirSync(dir);
@@ -4331,6 +4303,12 @@ function readTemplatesRecursively(dir, relativePath) {
4331
4303
  }
4332
4304
 
4333
4305
  // src/commands/init/index.ts
4306
+ function printSkillFirstInstructions() {
4307
+ console.log("\nAI-skills-first setup (recommended):");
4308
+ console.log(" npx allagents plugin marketplace add EntityProcess/agentv");
4309
+ console.log(" npx allagents plugin install agentv-dev@agentv");
4310
+ console.log(' Then ask your agent: "Set up AgentV in this repo."');
4311
+ }
4334
4312
  async function promptYesNo(message) {
4335
4313
  const rl = readline.createInterface({
4336
4314
  input: process.stdin,
@@ -4346,9 +4324,7 @@ async function promptYesNo(message) {
4346
4324
  async function initCommand(options = {}) {
4347
4325
  const targetPath = path5.resolve(options.targetPath ?? ".");
4348
4326
  const agentvDir = path5.join(targetPath, ".agentv");
4349
- const agentsDir = path5.join(targetPath, ".agents");
4350
4327
  const agentvTemplates = getAgentvTemplates();
4351
- const agentsTemplates = getAgentsTemplates();
4352
4328
  const envTemplate = agentvTemplates.find((t) => t.path === ".env.example");
4353
4329
  const otherAgentvTemplates = agentvTemplates.filter((t) => t.path !== ".env.example");
4354
4330
  const existingFiles = [];
@@ -4366,14 +4342,6 @@ async function initCommand(options = {}) {
4366
4342
  }
4367
4343
  }
4368
4344
  }
4369
- if (existsSync(agentsDir)) {
4370
- for (const template of agentsTemplates) {
4371
- const targetFilePath = path5.join(agentsDir, template.path);
4372
- if (existsSync(targetFilePath)) {
4373
- existingFiles.push(path5.relative(targetPath, targetFilePath));
4374
- }
4375
- }
4376
- }
4377
4345
  if (existingFiles.length > 0) {
4378
4346
  console.log("We detected an existing setup:");
4379
4347
  for (const file of existingFiles) {
@@ -4383,6 +4351,7 @@ async function initCommand(options = {}) {
4383
4351
  const shouldReplace = await promptYesNo("Do you want to replace these files?");
4384
4352
  if (!shouldReplace) {
4385
4353
  console.log("\nInit cancelled. No files were changed.");
4354
+ printSkillFirstInstructions();
4386
4355
  return;
4387
4356
  }
4388
4357
  console.log();
@@ -4390,9 +4359,6 @@ async function initCommand(options = {}) {
4390
4359
  if (!existsSync(agentvDir)) {
4391
4360
  mkdirSync(agentvDir, { recursive: true });
4392
4361
  }
4393
- if (!existsSync(agentsDir)) {
4394
- mkdirSync(agentsDir, { recursive: true });
4395
- }
4396
4362
  if (envTemplate) {
4397
4363
  const envFilePath = path5.join(targetPath, ".env.example");
4398
4364
  writeFileSync2(envFilePath, envTemplate.content, "utf-8");
@@ -4407,15 +4373,6 @@ async function initCommand(options = {}) {
4407
4373
  writeFileSync2(targetFilePath, template.content, "utf-8");
4408
4374
  console.log(`Created ${path5.relative(targetPath, targetFilePath)}`);
4409
4375
  }
4410
- for (const template of agentsTemplates) {
4411
- const targetFilePath = path5.join(agentsDir, template.path);
4412
- const targetDirPath = path5.dirname(targetFilePath);
4413
- if (!existsSync(targetDirPath)) {
4414
- mkdirSync(targetDirPath, { recursive: true });
4415
- }
4416
- writeFileSync2(targetFilePath, template.content, "utf-8");
4417
- console.log(`Created ${path5.relative(targetPath, targetFilePath)}`);
4418
- }
4419
4376
  console.log("\nAgentV initialized successfully!");
4420
4377
  console.log("\nFiles installed to root:");
4421
4378
  if (envTemplate) {
@@ -4426,19 +4383,15 @@ Files installed to ${path5.relative(targetPath, agentvDir)}:`);
4426
4383
  for (const t of otherAgentvTemplates) {
4427
4384
  console.log(` - ${t.path}`);
4428
4385
  }
4429
- console.log(`
4430
- Files installed to ${path5.relative(targetPath, agentsDir)}:`);
4431
- for (const t of agentsTemplates) {
4432
- console.log(` - ${t.path}`);
4433
- }
4434
4386
  console.log("\nYou can now:");
4435
4387
  console.log(" 1. Copy .env.example to .env and add your API credentials");
4436
4388
  console.log(" 2. Configure targets in .agentv/targets.yaml");
4437
- console.log(" 3. Create eval files using the schema and prompt templates");
4389
+ console.log(" 3. Use AI skills to create and run evals");
4390
+ printSkillFirstInstructions();
4438
4391
  }
4439
4392
  var initCmdTsCommand = command({
4440
4393
  name: "init",
4441
- description: "Initialize AgentV in your project (installs config files and skills)",
4394
+ description: "Initialize AgentV bootstrap files in your project",
4442
4395
  args: {
4443
4396
  path: option({
4444
4397
  type: optional(string),
@@ -4469,7 +4422,7 @@ function detectPackageManager() {
4469
4422
  }
4470
4423
  function runCommand(cmd, args) {
4471
4424
  return new Promise((resolve, reject) => {
4472
- const child = spawn(cmd, args, { stdio: ["inherit", "pipe", "inherit"] });
4425
+ const child = spawn(cmd, args, { stdio: ["inherit", "pipe", "inherit"], shell: true });
4473
4426
  let stdout = "";
4474
4427
  child.stdout?.on("data", (data) => {
4475
4428
  process.stdout.write(data);
@@ -4850,7 +4803,12 @@ async function runScore(results, evaluatorConfig, testIdFilter) {
4850
4803
  promptInputs: { question: "", guidelines: "" },
4851
4804
  now: /* @__PURE__ */ new Date(),
4852
4805
  output: Array.isArray(output) ? output : void 0,
4853
- trace
4806
+ trace,
4807
+ tokenUsage: raw.token_usage ? toCamelCaseDeep(raw.token_usage) : void 0,
4808
+ costUsd: raw.cost_usd,
4809
+ durationMs: raw.duration_ms,
4810
+ startTime: raw.start_time,
4811
+ endTime: raw.end_time
4854
4812
  };
4855
4813
  const score = await evaluator.evaluate(evalContext);
4856
4814
  scored.push({
@@ -4951,7 +4909,9 @@ var traceScoreCommand = command({
4951
4909
  evaluatorConfig.type
4952
4910
  );
4953
4911
  if (traceRequired) {
4954
- const hasTrace = results.some((r) => r.trace);
4912
+ const hasTrace = results.some(
4913
+ (r) => r.trace || r.cost_usd !== void 0 || r.duration_ms !== void 0 || r.token_usage !== void 0
4914
+ );
4955
4915
  if (!hasTrace) {
4956
4916
  console.error(
4957
4917
  `${c2.red}Error:${c2.reset} Result file lacks trace data. Re-run eval with ${c2.bold}--trace${c2.reset} to capture trace summaries.`
@@ -4986,26 +4946,27 @@ var traceScoreCommand = command({
4986
4946
  });
4987
4947
 
4988
4948
  // src/commands/trace/show.ts
4989
- function renderFlatTrace(trace) {
4949
+ function renderFlatTrace(result) {
4950
+ const trace = result.trace;
4990
4951
  const parts = [];
4991
- if (trace.tool_names && trace.tool_names.length > 0) {
4952
+ if (trace?.tool_names && trace.tool_names.length > 0) {
4992
4953
  const toolParts = trace.tool_names.map((name) => {
4993
4954
  const count = trace.tool_calls_by_name?.[name] ?? 0;
4994
4955
  return count > 1 ? `${name} \xD7${count}` : name;
4995
4956
  });
4996
4957
  parts.push(`Tools: ${toolParts.join(", ")}`);
4997
4958
  }
4998
- if (trace.duration_ms !== void 0) {
4999
- parts.push(`Duration: ${formatDuration(trace.duration_ms)}`);
4959
+ if (result.duration_ms !== void 0) {
4960
+ parts.push(`Duration: ${formatDuration(result.duration_ms)}`);
5000
4961
  }
5001
- if (trace.token_usage) {
5002
- const total = trace.token_usage.input + trace.token_usage.output;
4962
+ if (result.token_usage) {
4963
+ const total = result.token_usage.input + result.token_usage.output;
5003
4964
  parts.push(`Tokens: ${formatNumber(total)}`);
5004
4965
  }
5005
- if (trace.cost_usd !== void 0) {
5006
- parts.push(`Cost: ${formatCost(trace.cost_usd)}`);
4966
+ if (result.cost_usd !== void 0) {
4967
+ parts.push(`Cost: ${formatCost(result.cost_usd)}`);
5007
4968
  }
5008
- if (trace.llm_call_count !== void 0) {
4969
+ if (trace?.llm_call_count !== void 0) {
5009
4970
  parts.push(`LLM calls: ${trace.llm_call_count}`);
5010
4971
  }
5011
4972
  return parts.join(" | ");
@@ -5019,19 +4980,19 @@ function renderScores(scores) {
5019
4980
  function renderTree(result) {
5020
4981
  const messages = result.output;
5021
4982
  if (!messages || messages.length === 0) {
5022
- if (result.trace) {
5023
- return renderFlatTrace(result.trace);
4983
+ if (result.trace || result.duration_ms !== void 0 || result.cost_usd !== void 0) {
4984
+ return renderFlatTrace(result);
5024
4985
  }
5025
4986
  return `${c2.dim}No trace data available${c2.reset}`;
5026
4987
  }
5027
4988
  const lines = [];
5028
4989
  const testId = result.test_id ?? result.eval_id ?? "unknown";
5029
- const totalDuration = result.trace?.duration_ms;
5030
- const totalTokens = result.trace?.token_usage ? result.trace.token_usage.input + result.trace.token_usage.output : void 0;
4990
+ const totalDuration = result.duration_ms;
4991
+ const totalTokens = result.token_usage ? result.token_usage.input + result.token_usage.output : void 0;
5031
4992
  const rootParts = [testId];
5032
4993
  if (totalDuration !== void 0) rootParts.push(formatDuration(totalDuration));
5033
4994
  if (totalTokens !== void 0) rootParts.push(`${formatNumber(totalTokens)} tok`);
5034
- if (result.trace?.cost_usd !== void 0) rootParts.push(formatCost(result.trace.cost_usd));
4995
+ if (result.cost_usd !== void 0) rootParts.push(formatCost(result.cost_usd));
5035
4996
  lines.push(`${c2.bold}${rootParts.join(", ")}${c2.reset}`);
5036
4997
  const steps = [];
5037
4998
  for (let i = 0; i < messages.length; i++) {
@@ -5108,8 +5069,8 @@ function formatResultDetail(result, index, tree) {
5108
5069
  if (result.scores && result.scores.length > 0) {
5109
5070
  lines.push(` ${c2.dim}Scores:${c2.reset} ${renderScores(result.scores)}`);
5110
5071
  }
5111
- if (result.trace) {
5112
- lines.push(` ${c2.dim}Trace:${c2.reset} ${renderFlatTrace(result.trace)}`);
5072
+ if (result.trace || result.duration_ms !== void 0 || result.cost_usd !== void 0) {
5073
+ lines.push(` ${c2.dim}Trace:${c2.reset} ${renderFlatTrace(result)}`);
5113
5074
  }
5114
5075
  if (result.reasoning) {
5115
5076
  const maxLen = 200;
@@ -5216,7 +5177,7 @@ function collectMetrics(results) {
5216
5177
  if (scores.length > 0) {
5217
5178
  rows.push({ name: "score", values: scores, formatter: (n) => n.toFixed(2) });
5218
5179
  }
5219
- const latencies = results.map((r) => r.trace?.duration_ms).filter((v) => v !== void 0);
5180
+ const latencies = results.map((r) => r.duration_ms).filter((v) => v !== void 0);
5220
5181
  if (latencies.length > 0) {
5221
5182
  rows.push({
5222
5183
  name: "latency_s",
@@ -5224,13 +5185,13 @@ function collectMetrics(results) {
5224
5185
  formatter: (n) => n.toFixed(1)
5225
5186
  });
5226
5187
  }
5227
- const costs = results.map((r) => r.trace?.cost_usd).filter((v) => v !== void 0);
5188
+ const costs = results.map((r) => r.cost_usd).filter((v) => v !== void 0);
5228
5189
  if (costs.length > 0) {
5229
5190
  rows.push({ name: "cost_usd", values: costs, formatter: (n) => formatCost(n) });
5230
5191
  }
5231
5192
  const tokens = results.map((r) => {
5232
- if (!r.trace?.token_usage) return void 0;
5233
- return r.trace.token_usage.input + r.trace.token_usage.output;
5193
+ if (!r.token_usage) return void 0;
5194
+ return r.token_usage.input + r.token_usage.output;
5234
5195
  }).filter((v) => v !== void 0);
5235
5196
  if (tokens.length > 0) {
5236
5197
  rows.push({
@@ -5688,4 +5649,4 @@ export {
5688
5649
  preprocessArgv,
5689
5650
  runCli
5690
5651
  };
5691
- //# sourceMappingURL=chunk-RCFB5QFS.js.map
5652
+ //# sourceMappingURL=chunk-3INJ7ISP.js.map