agentv 2.10.0 → 2.11.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -375,6 +375,69 @@ For complete examples and patterns, see:
375
375
  - [custom-evaluators](https://agentv.dev/evaluators/custom-evaluators/)
376
376
  - [code-judge-sdk example](examples/features/code-judge-sdk)
377
377
 
378
+ ### Deterministic Assertions
379
+
380
+ Built-in assertion types for common text-matching patterns — no LLM judge or code_judge needed:
381
+
382
+ | Type | Value | Behavior |
383
+ |------|-------|----------|
384
+ | `contains` | `string` | Pass if output includes the substring |
385
+ | `contains_any` | `string[]` | Pass if output includes ANY of the strings |
386
+ | `contains_all` | `string[]` | Pass if output includes ALL of the strings |
387
+ | `icontains` | `string` | Case-insensitive `contains` |
388
+ | `icontains_any` | `string[]` | Case-insensitive `contains_any` |
389
+ | `icontains_all` | `string[]` | Case-insensitive `contains_all` |
390
+ | `starts_with` | `string` | Pass if output starts with value (trimmed) |
391
+ | `ends_with` | `string` | Pass if output ends with value (trimmed) |
392
+ | `regex` | `string` | Pass if output matches regex (optional `flags: "i"`) |
393
+ | `equals` | `string` | Pass if output exactly equals value (trimmed) |
394
+ | `is_json` | — | Pass if output is valid JSON |
395
+
396
+ All assertions support `weight`, `required`, and `negate` flags. Use `negate: true` to invert (no `not_` prefix needed).
397
+
398
+ ```yaml
399
+ assert:
400
+   # Case-insensitive matching for natural language variation
401
+   - type: icontains_any
402
+     value: ["missing rule code", "need rule code", "provide rule code"]
403
+     required: true
404
+
405
+   # Multiple required terms
406
+   - type: icontains_all
407
+     value: ["country code", "rule codes"]
408
+
409
+   # Case-insensitive regex
410
+   - type: regex
411
+     value: "[a-z]+@[a-z]+\\.[a-z]+"
412
+     flags: "i"
413
+ ```
414
+
415
+ See the [assert-extended example](examples/features/assert-extended) for complete patterns.
416
+
417
+ ### Target Configuration: `judge_target`
418
+
419
+ Agent provider targets (`codex`, `copilot`, `claude`, `vscode`) **must** specify `judge_target` when using `llm_judge` or `rubrics` evaluators. Without it, AgentV errors at startup — agent providers can't return structured JSON for judging.
420
+
421
+ ```yaml
422
+ targets:
423
+   # Agent target — requires judge_target for LLM-based evaluation
424
+   - name: codex_local
425
+     provider: codex
426
+     judge_target: azure_base # Required: LLM provider for judging
427
+
428
+   # LLM target — no judge_target needed (judges itself)
429
+   - name: azure_base
430
+     provider: azure
431
+ ```
432
+
433
+ ### Agentic Eval Patterns
434
+
435
+ When agents respond via tool calls instead of text, use `tool_trajectory` instead of text assertions:
436
+
437
+ - **Agent takes workspace actions** (creates files, runs commands) → `tool_trajectory` evaluator
438
+ - **Agent responds in text** (answers questions, asks for info) → `contains`/`icontains_any`/`llm_judge`
439
+ - **Agent does both** → `composite` evaluator combining both
440
+
378
441
  ### LLM Judges
379
442
 
380
443
  Create markdown judge files with evaluation criteria and scoring guidelines:
@@ -5,13 +5,14 @@ import {
5
5
  resolveEvalPaths,
6
6
  runEvalCommand,
7
7
  selectTarget,
8
- toSnakeCaseDeep,
8
+ toSnakeCaseDeep as toSnakeCaseDeep2,
9
9
  validateConfigFile,
10
10
  validateEvalFile,
11
11
  validateFileReferences,
12
12
  validateTargetsFile
13
- } from "./chunk-PC3FAOHT.js";
13
+ } from "./chunk-GO7OTNQ4.js";
14
14
  import {
15
+ RepoManager,
15
16
  assembleLlmJudgePrompt,
16
17
  buildPromptInputs,
17
18
  createBuiltinRegistry,
@@ -22,9 +23,9 @@ import {
22
23
  loadTests,
23
24
  normalizeLineEndings,
24
25
  toCamelCaseDeep,
25
- toSnakeCaseDeep as toSnakeCaseDeep2,
26
+ toSnakeCaseDeep,
26
27
  trimBaselineResult
27
- } from "./chunk-RJWTL3VS.js";
28
+ } from "./chunk-EXJWRKKL.js";
28
29
  import {
29
30
  __commonJS,
30
31
  __esm,
@@ -2877,7 +2878,7 @@ function oneOf(literals) {
2877
2878
  // package.json
2878
2879
  var package_default = {
2879
2880
  name: "agentv",
2880
- version: "2.10.0",
2881
+ version: "2.11.0",
2881
2882
  description: "CLI entry point for AgentV",
2882
2883
  type: "module",
2883
2884
  repository: {
@@ -2924,6 +2925,43 @@ var package_default = {
2924
2925
  }
2925
2926
  };
2926
2927
 
2928
+ // src/commands/cache/index.ts
2929
+ var cleanCommand = command({
2930
+ name: "clean",
2931
+ description: "Remove all cached git repositories",
2932
+ args: {
2933
+ force: flag({
2934
+ long: "force",
2935
+ short: "f",
2936
+ description: "Skip confirmation prompt"
2937
+ })
2938
+ },
2939
+ handler: async ({ force }) => {
2940
+ if (!force) {
2941
+ const readline2 = await import("node:readline");
2942
+ const rl = readline2.createInterface({ input: process.stdin, output: process.stdout });
2943
+ const answer = await new Promise((resolve) => {
2944
+ rl.question("Remove all cached git repos from ~/.agentv/git-cache/? [y/N] ", resolve);
2945
+ });
2946
+ rl.close();
2947
+ if (answer.toLowerCase() !== "y") {
2948
+ console.log("Cancelled.");
2949
+ return;
2950
+ }
2951
+ }
2952
+ const manager = new RepoManager();
2953
+ await manager.cleanCache();
2954
+ console.log("Cache cleaned.");
2955
+ }
2956
+ });
2957
+ var cacheCommand = subcommands({
2958
+ name: "cache",
2959
+ description: "Manage AgentV cache",
2960
+ cmds: {
2961
+ clean: cleanCommand
2962
+ }
2963
+ });
2964
+
2927
2965
  // src/commands/compare/index.ts
2928
2966
  import { readFileSync } from "node:fs";
2929
2967
  var colors = {
@@ -3267,7 +3305,7 @@ var compareCommand = command({
3267
3305
  const results2 = loadJsonlResults(results[1]);
3268
3306
  const comparison = compareResults(results1, results2, effectiveThreshold);
3269
3307
  if (outputFormat === "json") {
3270
- console.log(JSON.stringify(toSnakeCaseDeep(comparison), null, 2));
3308
+ console.log(JSON.stringify(toSnakeCaseDeep2(comparison), null, 2));
3271
3309
  } else {
3272
3310
  console.log(formatTable(comparison, results[0], results[1]));
3273
3311
  }
@@ -3313,7 +3351,7 @@ var compareCommand = command({
3313
3351
  }
3314
3352
  const comparison = compareResults(baselineResults, candidateResults, effectiveThreshold);
3315
3353
  if (outputFormat === "json") {
3316
- console.log(JSON.stringify(toSnakeCaseDeep(comparison), null, 2));
3354
+ console.log(JSON.stringify(toSnakeCaseDeep2(comparison), null, 2));
3317
3355
  } else {
3318
3356
  console.log(formatTable(comparison, baseline, candidate));
3319
3357
  }
@@ -3322,7 +3360,7 @@ var compareCommand = command({
3322
3360
  } else {
3323
3361
  const matrixOutput = compareMatrix(groups, effectiveThreshold);
3324
3362
  if (outputFormat === "json") {
3325
- console.log(JSON.stringify(toSnakeCaseDeep(matrixOutput), null, 2));
3363
+ console.log(JSON.stringify(toSnakeCaseDeep2(matrixOutput), null, 2));
3326
3364
  } else {
3327
3365
  console.log(formatMatrix(matrixOutput, baseline));
3328
3366
  }
@@ -3765,7 +3803,7 @@ async function processEvaluator(config, evalCase, candidate, promptInputs) {
3765
3803
  config: codeConfig.config ?? null
3766
3804
  };
3767
3805
  try {
3768
- const inputPayload = JSON.stringify(toSnakeCaseDeep2(payload), null, 2);
3806
+ const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
3769
3807
  const stdout = await executeScript(script, inputPayload, 6e4, scriptCwd);
3770
3808
  const parsed = JSON.parse(stdout);
3771
3809
  return {
@@ -4042,7 +4080,7 @@ var evalRunCommand = command({
4042
4080
  },
4043
4081
  handler: async (args) => {
4044
4082
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
4045
- const { launchInteractiveWizard } = await import("./interactive-7KFUCBIP.js");
4083
+ const { launchInteractiveWizard } = await import("./interactive-V4A3RRU3.js");
4046
4084
  await launchInteractiveWizard();
4047
4085
  return;
4048
4086
  }
@@ -4658,7 +4696,7 @@ var traceListCommand = command({
4658
4696
  try {
4659
4697
  const metas = listResultFiles(cwd, limit);
4660
4698
  if (outputFormat === "json") {
4661
- console.log(JSON.stringify(toSnakeCaseDeep(metas), null, 2));
4699
+ console.log(JSON.stringify(toSnakeCaseDeep2(metas), null, 2));
4662
4700
  } else {
4663
4701
  console.log(formatListTable(metas));
4664
4702
  }
@@ -5325,7 +5363,7 @@ var traceStatsCommand = command({
5325
5363
  const groups = groupResults(results, groupBy2);
5326
5364
  if (outputFormat === "json") {
5327
5365
  const statsJson = computeStatsJson(groups, file);
5328
- console.log(JSON.stringify(toSnakeCaseDeep(statsJson), null, 2));
5366
+ console.log(JSON.stringify(toSnakeCaseDeep2(statsJson), null, 2));
5329
5367
  } else {
5330
5368
  console.log(formatStatsTable(groups, file));
5331
5369
  }
@@ -5374,7 +5412,7 @@ var trimCommand = command({
5374
5412
  const record = JSON.parse(line);
5375
5413
  const camel = toCamelCaseDeep(record);
5376
5414
  const trimmed = trimBaselineResult(camel);
5377
- const snake = toSnakeCaseDeep2(trimmed);
5415
+ const snake = toSnakeCaseDeep(trimmed);
5378
5416
  return JSON.stringify(snake);
5379
5417
  });
5380
5418
  const output = `${trimmedLines.join("\n")}
@@ -5596,12 +5634,100 @@ var validateCommand = command({
5596
5634
  }
5597
5635
  });
5598
5636
 
5637
+ // src/update-check.ts
5638
+ import { spawn as spawn2 } from "node:child_process";
5639
+ import { readFile as readFile3 } from "node:fs/promises";
5640
+ import { homedir } from "node:os";
5641
+ import { join } from "node:path";
5642
+ var CHECK_INTERVAL_MS = 24 * 60 * 60 * 1e3;
5643
+ var AGENTV_DIR = join(homedir(), ".agentv");
5644
+ var CACHE_FILE = "version-check.json";
5645
+ var NPM_REGISTRY_URL = "https://registry.npmjs.org/agentv/latest";
5646
+ async function getCachedUpdateInfo(path8) {
5647
+ const filePath = path8 ?? join(AGENTV_DIR, CACHE_FILE);
5648
+ try {
5649
+ const raw = await readFile3(filePath, "utf-8");
5650
+ const data = JSON.parse(raw);
5651
+ if (typeof data.latestVersion === "string" && typeof data.lastCheckedAt === "string") {
5652
+ return data;
5653
+ }
5654
+ return null;
5655
+ } catch {
5656
+ return null;
5657
+ }
5658
+ }
5659
+ function shouldCheck(cache) {
5660
+ if (!cache) return true;
5661
+ const elapsed = Date.now() - new Date(cache.lastCheckedAt).getTime();
5662
+ return elapsed > CHECK_INTERVAL_MS;
5663
+ }
5664
+ function isNewer(a, b) {
5665
+ const pa = a.split(".").map((s) => Number(s.replace(/-.*$/, "")));
5666
+ const pb = b.split(".").map((s) => Number(s.replace(/-.*$/, "")));
5667
+ for (let i = 0; i < 3; i++) {
5668
+ if ((pa[i] ?? 0) > (pb[i] ?? 0)) return true;
5669
+ if ((pa[i] ?? 0) < (pb[i] ?? 0)) return false;
5670
+ }
5671
+ return false;
5672
+ }
5673
+ function buildNotice(currentVersion, latestVersion) {
5674
+ if (!latestVersion) return null;
5675
+ if (!isNewer(latestVersion, currentVersion)) return null;
5676
+ return ` Update available: ${currentVersion} \u2192 ${latestVersion}
5677
+ Run \`agentv self update\` to upgrade.`;
5678
+ }
5679
+ function backgroundUpdateCheck() {
5680
+ const dir = AGENTV_DIR;
5681
+ const filePath = join(dir, CACHE_FILE);
5682
+ const script = `
5683
+ const https = require('https');
5684
+ const fs = require('fs');
5685
+ const dir = ${JSON.stringify(dir)};
5686
+ const filePath = ${JSON.stringify(filePath)};
5687
+ https.get(${JSON.stringify(NPM_REGISTRY_URL)}, { timeout: 5000 }, (res) => {
5688
+ if (res.statusCode !== 200) { res.resume(); process.exit(); }
5689
+ let body = '';
5690
+ res.on('data', (c) => body += c);
5691
+ res.on('end', () => {
5692
+ try {
5693
+ const v = JSON.parse(body).version;
5694
+ if (typeof v === 'string') {
5695
+ fs.mkdirSync(dir, { recursive: true });
5696
+ fs.writeFileSync(filePath, JSON.stringify({ latestVersion: v, lastCheckedAt: new Date().toISOString() }, null, 2));
5697
+ }
5698
+ } catch {}
5699
+ process.exit();
5700
+ });
5701
+ }).on('error', () => process.exit()).on('timeout', function() { this.destroy(); process.exit(); });
5702
+ `;
5703
+ try {
5704
+ const child = spawn2(process.execPath, ["-e", script], {
5705
+ detached: true,
5706
+ stdio: "ignore",
5707
+ windowsHide: true
5708
+ });
5709
+ child.unref();
5710
+ } catch {
5711
+ }
5712
+ }
5713
+ async function getUpdateNotice(currentVersion) {
5714
+ if (process.env.AGENTV_NO_UPDATE_CHECK === "1" || process.env.CI === "true") {
5715
+ return null;
5716
+ }
5717
+ const cache = await getCachedUpdateInfo();
5718
+ if (shouldCheck(cache)) {
5719
+ backgroundUpdateCheck();
5720
+ }
5721
+ return buildNotice(currentVersion, cache?.latestVersion ?? null);
5722
+ }
5723
+
5599
5724
  // src/index.ts
5600
5725
  var app = subcommands({
5601
5726
  name: "agentv",
5602
5727
  description: "AgentV CLI",
5603
5728
  version: package_default.version,
5604
5729
  cmds: {
5730
+ cache: cacheCommand,
5605
5731
  eval: evalRunCommand,
5606
5732
  prompt: evalPromptCommand,
5607
5733
  compare: compareCommand,
@@ -5640,6 +5766,15 @@ function preprocessArgv(argv) {
5640
5766
  return result;
5641
5767
  }
5642
5768
  async function runCli(argv = process.argv) {
5769
+ let updateNotice = null;
5770
+ process.on("exit", () => {
5771
+ if (updateNotice) process.stderr.write(`
5772
+ ${updateNotice}
5773
+ `);
5774
+ });
5775
+ getUpdateNotice(package_default.version).then((n) => {
5776
+ updateNotice = n;
5777
+ });
5643
5778
  const processedArgv = preprocessArgv(argv);
5644
5779
  await run(binary(app), processedArgv);
5645
5780
  }
@@ -5649,4 +5784,4 @@ export {
5649
5784
  preprocessArgv,
5650
5785
  runCli
5651
5786
  };
5652
- //# sourceMappingURL=chunk-G3OTPFYX.js.map
5787
+ //# sourceMappingURL=chunk-CVC3VMZ3.js.map