agentv 2.9.0-next.2 → 2.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +63 -0
- package/dist/{chunk-3INJ7ISP.js → chunk-CVC3VMZ3.js} +149 -14
- package/dist/chunk-CVC3VMZ3.js.map +1 -0
- package/dist/{chunk-RJWTL3VS.js → chunk-EXJWRKKL.js} +741 -176
- package/dist/chunk-EXJWRKKL.js.map +1 -0
- package/dist/{chunk-PC3FAOHT.js → chunk-GO7OTNQ4.js} +109 -9
- package/dist/chunk-GO7OTNQ4.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-BGRU67HI.js → dist-NYXYDALF.js} +18 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-7KFUCBIP.js → interactive-V4A3RRU3.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-3INJ7ISP.js.map +0 -1
- package/dist/chunk-PC3FAOHT.js.map +0 -1
- package/dist/chunk-RJWTL3VS.js.map +0 -1
- /package/dist/{dist-BGRU67HI.js.map → dist-NYXYDALF.js.map} +0 -0
- /package/dist/{interactive-7KFUCBIP.js.map → interactive-V4A3RRU3.js.map} +0 -0
package/README.md
CHANGED
|
@@ -375,6 +375,69 @@ For complete examples and patterns, see:
|
|
|
375
375
|
- [custom-evaluators](https://agentv.dev/evaluators/custom-evaluators/)
|
|
376
376
|
- [code-judge-sdk example](examples/features/code-judge-sdk)
|
|
377
377
|
|
|
378
|
+
### Deterministic Assertions
|
|
379
|
+
|
|
380
|
+
Built-in assertion types for common text-matching patterns — no LLM judge or code_judge needed:
|
|
381
|
+
|
|
382
|
+
| Type | Value | Behavior |
|
|
383
|
+
|------|-------|----------|
|
|
384
|
+
| `contains` | `string` | Pass if output includes the substring |
|
|
385
|
+
| `contains_any` | `string[]` | Pass if output includes ANY of the strings |
|
|
386
|
+
| `contains_all` | `string[]` | Pass if output includes ALL of the strings |
|
|
387
|
+
| `icontains` | `string` | Case-insensitive `contains` |
|
|
388
|
+
| `icontains_any` | `string[]` | Case-insensitive `contains_any` |
|
|
389
|
+
| `icontains_all` | `string[]` | Case-insensitive `contains_all` |
|
|
390
|
+
| `starts_with` | `string` | Pass if output starts with value (trimmed) |
|
|
391
|
+
| `ends_with` | `string` | Pass if output ends with value (trimmed) |
|
|
392
|
+
| `regex` | `string` | Pass if output matches regex (optional `flags: "i"`) |
|
|
393
|
+
| `equals` | `string` | Pass if output exactly equals value (trimmed) |
|
|
394
|
+
| `is_json` | — | Pass if output is valid JSON |
|
|
395
|
+
|
|
396
|
+
All assertions support the `weight`, `required`, and `negate` options. Set `negate: true` to invert an assertion's result (no `not_`-prefixed types are needed).
|
|
397
|
+
|
|
398
|
+
```yaml
|
|
399
|
+
assert:
|
|
400
|
+
# Case-insensitive matching for natural language variation
|
|
401
|
+
- type: icontains_any
|
|
402
|
+
value: ["missing rule code", "need rule code", "provide rule code"]
|
|
403
|
+
required: true
|
|
404
|
+
|
|
405
|
+
# Multiple required terms
|
|
406
|
+
- type: icontains_all
|
|
407
|
+
value: ["country code", "rule codes"]
|
|
408
|
+
|
|
409
|
+
# Case-insensitive regex
|
|
410
|
+
- type: regex
|
|
411
|
+
value: "[a-z]+@[a-z]+\\.[a-z]+"
|
|
412
|
+
flags: "i"
|
|
413
|
+
```
|
|
414
|
+
|
|
415
|
+
See the [assert-extended example](examples/features/assert-extended) for complete patterns.
|
|
416
|
+
|
|
417
|
+
### Target Configuration: `judge_target`
|
|
418
|
+
|
|
419
|
+
Agent provider targets (`codex`, `copilot`, `claude`, `vscode`) **must** specify `judge_target` when using `llm_judge` or `rubrics` evaluators. Without it, AgentV reports an error at startup, because agent providers cannot return the structured JSON that judging requires.
|
|
420
|
+
|
|
421
|
+
```yaml
|
|
422
|
+
targets:
|
|
423
|
+
# Agent target — requires judge_target for LLM-based evaluation
|
|
424
|
+
- name: codex_local
|
|
425
|
+
provider: codex
|
|
426
|
+
judge_target: azure_base # Required: LLM provider for judging
|
|
427
|
+
|
|
428
|
+
# LLM target — no judge_target needed (judges itself)
|
|
429
|
+
- name: azure_base
|
|
430
|
+
provider: azure
|
|
431
|
+
```
|
|
432
|
+
|
|
433
|
+
### Agentic Eval Patterns
|
|
434
|
+
|
|
435
|
+
When an agent responds via tool calls rather than text, evaluate it with `tool_trajectory` instead of text assertions:
|
|
436
|
+
|
|
437
|
+
- **Agent takes workspace actions** (creates files, runs commands) → `tool_trajectory` evaluator
|
|
438
|
+
- **Agent responds in text** (answers questions, asks for info) → `contains`/`icontains_any`/`llm_judge`
|
|
439
|
+
- **Agent does both** → `composite` evaluator combining both
|
|
440
|
+
|
|
378
441
|
### LLM Judges
|
|
379
442
|
|
|
380
443
|
Create markdown judge files with evaluation criteria and scoring guidelines:
|
|
@@ -5,13 +5,14 @@ import {
|
|
|
5
5
|
resolveEvalPaths,
|
|
6
6
|
runEvalCommand,
|
|
7
7
|
selectTarget,
|
|
8
|
-
toSnakeCaseDeep,
|
|
8
|
+
toSnakeCaseDeep as toSnakeCaseDeep2,
|
|
9
9
|
validateConfigFile,
|
|
10
10
|
validateEvalFile,
|
|
11
11
|
validateFileReferences,
|
|
12
12
|
validateTargetsFile
|
|
13
|
-
} from "./chunk-
|
|
13
|
+
} from "./chunk-GO7OTNQ4.js";
|
|
14
14
|
import {
|
|
15
|
+
RepoManager,
|
|
15
16
|
assembleLlmJudgePrompt,
|
|
16
17
|
buildPromptInputs,
|
|
17
18
|
createBuiltinRegistry,
|
|
@@ -22,9 +23,9 @@ import {
|
|
|
22
23
|
loadTests,
|
|
23
24
|
normalizeLineEndings,
|
|
24
25
|
toCamelCaseDeep,
|
|
25
|
-
toSnakeCaseDeep
|
|
26
|
+
toSnakeCaseDeep,
|
|
26
27
|
trimBaselineResult
|
|
27
|
-
} from "./chunk-
|
|
28
|
+
} from "./chunk-EXJWRKKL.js";
|
|
28
29
|
import {
|
|
29
30
|
__commonJS,
|
|
30
31
|
__esm,
|
|
@@ -2877,7 +2878,7 @@ function oneOf(literals) {
|
|
|
2877
2878
|
// package.json
|
|
2878
2879
|
var package_default = {
|
|
2879
2880
|
name: "agentv",
|
|
2880
|
-
version: "2.
|
|
2881
|
+
version: "2.11.0",
|
|
2881
2882
|
description: "CLI entry point for AgentV",
|
|
2882
2883
|
type: "module",
|
|
2883
2884
|
repository: {
|
|
@@ -2924,6 +2925,43 @@ var package_default = {
|
|
|
2924
2925
|
}
|
|
2925
2926
|
};
|
|
2926
2927
|
|
|
2928
|
+
// src/commands/cache/index.ts
|
|
2929
|
+
/**
 * `clean` subcommand: removes all cached git repositories via
 * `RepoManager.cleanCache()`.
 *
 * Unless `--force` / `-f` is given, the user is asked to confirm first.
 * Only an explicit "y" or "yes" (case-insensitive, surrounding whitespace
 * ignored) proceeds; anything else, including end-of-input, cancels.
 */
var cleanCommand = command({
  name: "clean",
  description: "Remove all cached git repositories",
  args: {
    force: flag({
      long: "force",
      short: "f",
      description: "Skip confirmation prompt"
    })
  },
  handler: async ({ force }) => {
    if (!force) {
      const readline2 = await import("node:readline");
      const rl = readline2.createInterface({ input: process.stdin, output: process.stdout });
      let answer;
      try {
        answer = await new Promise((resolve) => {
          // If stdin ends before an answer arrives, the question callback is
          // never invoked; resolving on "close" keeps the promise from hanging
          // and is treated as "no" below.
          rl.once("close", () => resolve(""));
          rl.question("Remove all cached git repos from ~/.agentv/git-cache/? [y/N] ", resolve);
        });
      } finally {
        // Always release stdin, even if prompting throws.
        rl.close();
      }
      const normalized = String(answer).trim().toLowerCase();
      if (normalized !== "y" && normalized !== "yes") {
        console.log("Cancelled.");
        return;
      }
    }
    const manager = new RepoManager();
    await manager.cleanCache();
    console.log("Cache cleaned.");
  }
});
|
|
2957
|
+
// Command group registered as `cache` on the CLI; it currently exposes a
// single subcommand, `clean`.
var cacheCommand = subcommands({
  description: "Manage AgentV cache",
  name: "cache",
  cmds: { clean: cleanCommand }
});
|
|
2964
|
+
|
|
2927
2965
|
// src/commands/compare/index.ts
|
|
2928
2966
|
import { readFileSync } from "node:fs";
|
|
2929
2967
|
var colors = {
|
|
@@ -3267,7 +3305,7 @@ var compareCommand = command({
|
|
|
3267
3305
|
const results2 = loadJsonlResults(results[1]);
|
|
3268
3306
|
const comparison = compareResults(results1, results2, effectiveThreshold);
|
|
3269
3307
|
if (outputFormat === "json") {
|
|
3270
|
-
console.log(JSON.stringify(
|
|
3308
|
+
console.log(JSON.stringify(toSnakeCaseDeep2(comparison), null, 2));
|
|
3271
3309
|
} else {
|
|
3272
3310
|
console.log(formatTable(comparison, results[0], results[1]));
|
|
3273
3311
|
}
|
|
@@ -3313,7 +3351,7 @@ var compareCommand = command({
|
|
|
3313
3351
|
}
|
|
3314
3352
|
const comparison = compareResults(baselineResults, candidateResults, effectiveThreshold);
|
|
3315
3353
|
if (outputFormat === "json") {
|
|
3316
|
-
console.log(JSON.stringify(
|
|
3354
|
+
console.log(JSON.stringify(toSnakeCaseDeep2(comparison), null, 2));
|
|
3317
3355
|
} else {
|
|
3318
3356
|
console.log(formatTable(comparison, baseline, candidate));
|
|
3319
3357
|
}
|
|
@@ -3322,7 +3360,7 @@ var compareCommand = command({
|
|
|
3322
3360
|
} else {
|
|
3323
3361
|
const matrixOutput = compareMatrix(groups, effectiveThreshold);
|
|
3324
3362
|
if (outputFormat === "json") {
|
|
3325
|
-
console.log(JSON.stringify(
|
|
3363
|
+
console.log(JSON.stringify(toSnakeCaseDeep2(matrixOutput), null, 2));
|
|
3326
3364
|
} else {
|
|
3327
3365
|
console.log(formatMatrix(matrixOutput, baseline));
|
|
3328
3366
|
}
|
|
@@ -3765,7 +3803,7 @@ async function processEvaluator(config, evalCase, candidate, promptInputs) {
|
|
|
3765
3803
|
config: codeConfig.config ?? null
|
|
3766
3804
|
};
|
|
3767
3805
|
try {
|
|
3768
|
-
const inputPayload = JSON.stringify(
|
|
3806
|
+
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
3769
3807
|
const stdout = await executeScript(script, inputPayload, 6e4, scriptCwd);
|
|
3770
3808
|
const parsed = JSON.parse(stdout);
|
|
3771
3809
|
return {
|
|
@@ -4042,7 +4080,7 @@ var evalRunCommand = command({
|
|
|
4042
4080
|
},
|
|
4043
4081
|
handler: async (args) => {
|
|
4044
4082
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4045
|
-
const { launchInteractiveWizard } = await import("./interactive-
|
|
4083
|
+
const { launchInteractiveWizard } = await import("./interactive-V4A3RRU3.js");
|
|
4046
4084
|
await launchInteractiveWizard();
|
|
4047
4085
|
return;
|
|
4048
4086
|
}
|
|
@@ -4658,7 +4696,7 @@ var traceListCommand = command({
|
|
|
4658
4696
|
try {
|
|
4659
4697
|
const metas = listResultFiles(cwd, limit);
|
|
4660
4698
|
if (outputFormat === "json") {
|
|
4661
|
-
console.log(JSON.stringify(
|
|
4699
|
+
console.log(JSON.stringify(toSnakeCaseDeep2(metas), null, 2));
|
|
4662
4700
|
} else {
|
|
4663
4701
|
console.log(formatListTable(metas));
|
|
4664
4702
|
}
|
|
@@ -5325,7 +5363,7 @@ var traceStatsCommand = command({
|
|
|
5325
5363
|
const groups = groupResults(results, groupBy2);
|
|
5326
5364
|
if (outputFormat === "json") {
|
|
5327
5365
|
const statsJson = computeStatsJson(groups, file);
|
|
5328
|
-
console.log(JSON.stringify(
|
|
5366
|
+
console.log(JSON.stringify(toSnakeCaseDeep2(statsJson), null, 2));
|
|
5329
5367
|
} else {
|
|
5330
5368
|
console.log(formatStatsTable(groups, file));
|
|
5331
5369
|
}
|
|
@@ -5374,7 +5412,7 @@ var trimCommand = command({
|
|
|
5374
5412
|
const record = JSON.parse(line);
|
|
5375
5413
|
const camel = toCamelCaseDeep(record);
|
|
5376
5414
|
const trimmed = trimBaselineResult(camel);
|
|
5377
|
-
const snake =
|
|
5415
|
+
const snake = toSnakeCaseDeep(trimmed);
|
|
5378
5416
|
return JSON.stringify(snake);
|
|
5379
5417
|
});
|
|
5380
5418
|
const output = `${trimmedLines.join("\n")}
|
|
@@ -5596,12 +5634,100 @@ var validateCommand = command({
|
|
|
5596
5634
|
}
|
|
5597
5635
|
});
|
|
5598
5636
|
|
|
5637
|
+
// src/update-check.ts
|
|
5638
|
+
import { spawn as spawn2 } from "node:child_process";
|
|
5639
|
+
import { readFile as readFile3 } from "node:fs/promises";
|
|
5640
|
+
import { homedir } from "node:os";
|
|
5641
|
+
import { join } from "node:path";
|
|
5642
|
+
// Minimum time between two registry lookups: 24 hours, in milliseconds.
var CHECK_INTERVAL_MS = 1e3 * 60 * 60 * 24;
// Per-user AgentV directory (`~/.agentv`) where the version cache is stored.
var AGENTV_DIR = join(homedir(), ".agentv");
// File name (inside AGENTV_DIR) holding the last known latest version.
var CACHE_FILE = "version-check.json";
// npm registry endpoint queried for the latest published `agentv` version.
var NPM_REGISTRY_URL = "https://registry.npmjs.org/agentv/latest";
|
|
5646
|
+
/**
 * Reads the cached update-check record from disk.
 *
 * @param {string} [path8] - Cache file to read; defaults to
 *   `CACHE_FILE` inside `AGENTV_DIR`.
 * @returns {Promise<object|null>} The parsed record when both
 *   `latestVersion` and `lastCheckedAt` are strings; `null` when the file
 *   cannot be read or parsed, or does not have that shape.
 */
async function getCachedUpdateInfo(path8) {
  const cachePath = path8 ?? join(AGENTV_DIR, CACHE_FILE);
  try {
    const parsed = JSON.parse(await readFile3(cachePath, "utf-8"));
    // Property access stays inside the try so that a non-object payload
    // (e.g. JSON `null`) is reported as "no cache" rather than throwing.
    const hasVersion = typeof parsed.latestVersion === "string";
    const hasTimestamp = typeof parsed.lastCheckedAt === "string";
    return hasVersion && hasTimestamp ? parsed : null;
  } catch {
    // Best-effort read: any failure means there is no usable cache.
    return null;
  }
}
|
|
5659
|
+
/**
 * Decides whether a fresh registry lookup is due.
 *
 * @param {{ lastCheckedAt: string } | null | undefined} cache - Cached
 *   update record, or a falsy value when none exists.
 * @param {number} [intervalMs=CHECK_INTERVAL_MS] - Minimum time between
 *   checks, in milliseconds.
 * @param {number} [now=Date.now()] - Current time as epoch milliseconds.
 * @returns {boolean} `true` when there is no cache, when its timestamp
 *   cannot be parsed or lies in the future, or when more than `intervalMs`
 *   has elapsed since it.
 */
function shouldCheck(cache, intervalMs = CHECK_INTERVAL_MS, now = Date.now()) {
  if (!cache) return true;
  const lastCheckedMs = new Date(cache.lastCheckedAt).getTime();
  // An unparseable timestamp yields NaN, and `NaN > intervalMs` is false, so
  // without this guard a corrupted cache would suppress checks indefinitely.
  if (Number.isNaN(lastCheckedMs)) return true;
  const elapsed = now - lastCheckedMs;
  // A timestamp in the future (e.g. written while the clock was wrong) is
  // treated as stale so the next check can rewrite it.
  return elapsed < 0 || elapsed > intervalMs;
}
|
|
5664
|
+
/**
 * Reports whether version `a` is strictly newer than version `b`.
 *
 * The numeric core (major.minor.patch) is compared first; missing or
 * non-numeric segments count as 0. When the cores are equal, a version
 * without a prerelease tag outranks one with a tag, and two prerelease tags
 * are compared identifier by identifier (numeric identifiers numerically,
 * others lexically, numeric before non-numeric, shorter list first).
 * A leading "v" and any "+build" suffix are ignored.
 *
 * @param {string} a - Candidate version.
 * @param {string} b - Version to compare against.
 * @returns {boolean} `true` only when `a` has higher precedence than `b`.
 */
function isNewer(a, b) {
  const parseVersion = (version) => {
    const [withoutBuild] = String(version).trim().replace(/^v/i, "").split("+");
    // Split the prerelease tag off BEFORE splitting on ".", otherwise dotted
    // prerelease identifiers ("next.2") leak into the numeric core.
    const dashIndex = withoutBuild.indexOf("-");
    const core = dashIndex === -1 ? withoutBuild : withoutBuild.slice(0, dashIndex);
    const prerelease = dashIndex === -1 ? "" : withoutBuild.slice(dashIndex + 1);
    const numbers = core.split(".").map((segment) => {
      const value = Number.parseInt(segment, 10);
      return Number.isNaN(value) ? 0 : value;
    });
    return { numbers, prerelease };
  };

  const comparePrerelease = (left, right) => {
    const leftIds = left.split(".");
    const rightIds = right.split(".");
    const count = Math.max(leftIds.length, rightIds.length);
    for (let i = 0; i < count; i++) {
      // When all shared identifiers are equal, the shorter list ranks lower.
      if (leftIds[i] === undefined) return -1;
      if (rightIds[i] === undefined) return 1;
      const leftNumeric = /^\d+$/.test(leftIds[i]);
      const rightNumeric = /^\d+$/.test(rightIds[i]);
      if (leftNumeric && rightNumeric) {
        const diff = Number(leftIds[i]) - Number(rightIds[i]);
        if (diff !== 0) return diff;
      } else if (leftNumeric !== rightNumeric) {
        // Numeric identifiers rank below alphanumeric ones.
        return leftNumeric ? -1 : 1;
      } else if (leftIds[i] !== rightIds[i]) {
        return leftIds[i] < rightIds[i] ? -1 : 1;
      }
    }
    return 0;
  };

  const va = parseVersion(a);
  const vb = parseVersion(b);
  for (let i = 0; i < 3; i++) {
    const left = va.numbers[i] ?? 0;
    const right = vb.numbers[i] ?? 0;
    if (left > right) return true;
    if (left < right) return false;
  }
  // Same numeric core: a stable release is newer than any of its prereleases.
  if (va.prerelease === vb.prerelease) return false;
  if (va.prerelease === "") return true;
  if (vb.prerelease === "") return false;
  return comparePrerelease(va.prerelease, vb.prerelease) > 0;
}
|
|
5673
|
+
/**
 * Builds the "update available" message shown to the user.
 *
 * @param {string} currentVersion - Version of the running CLI.
 * @param {string|null} latestVersion - Latest known published version.
 * @returns {string|null} The notice text, or `null` when `latestVersion`
 *   is falsy or is not newer than `currentVersion` according to `isNewer`.
 */
function buildNotice(currentVersion, latestVersion) {
  const updateAvailable = Boolean(latestVersion) && isNewer(latestVersion, currentVersion);
  if (!updateAvailable) {
    return null;
  }
  return ` Update available: ${currentVersion} \u2192 ${latestVersion}
Run \`agentv self update\` to upgrade.`;
}
|
|
5679
|
+
/**
 * Starts a detached child process (the current Node executable with `-e`)
 * that fetches `NPM_REGISTRY_URL` and, on a 200 response carrying a string
 * `version`, writes `{ latestVersion, lastCheckedAt }` to the cache file
 * under `AGENTV_DIR`.
 *
 * The child is unref'd and its stdio ignored; spawn failures are ignored
 * as well, so this function itself never throws.
 */
function backgroundUpdateCheck() {
  const cacheDir = AGENTV_DIR;
  const cachePath = join(cacheDir, CACHE_FILE);
  // Values are embedded with JSON.stringify so they appear as valid,
  // correctly escaped string literals in the generated script.
  const script = `
const https = require('https');
const fs = require('fs');
const dir = ${JSON.stringify(cacheDir)};
const filePath = ${JSON.stringify(cachePath)};
https.get(${JSON.stringify(NPM_REGISTRY_URL)}, { timeout: 5000 }, (res) => {
if (res.statusCode !== 200) { res.resume(); process.exit(); }
let body = '';
res.on('data', (c) => body += c);
res.on('end', () => {
try {
const v = JSON.parse(body).version;
if (typeof v === 'string') {
fs.mkdirSync(dir, { recursive: true });
fs.writeFileSync(filePath, JSON.stringify({ latestVersion: v, lastCheckedAt: new Date().toISOString() }, null, 2));
}
} catch {}
process.exit();
});
}).on('error', () => process.exit()).on('timeout', function() { this.destroy(); process.exit(); });
`;
  const spawnOptions = {
    detached: true,
    stdio: "ignore",
    windowsHide: true
  };
  try {
    spawn2(process.execPath, ["-e", script], spawnOptions).unref();
  } catch {
    // Deliberately ignored: the update check is best-effort.
  }
}
|
|
5713
|
+
/**
 * Returns the update notice to display for the running CLI version, if any.
 *
 * Returns `null` immediately when `AGENTV_NO_UPDATE_CHECK` is "1" or `CI`
 * is "true". Otherwise it reads the cached update record, starts a
 * background refresh when `shouldCheck` says one is due, and builds the
 * notice from the cached value (the refresh result is not awaited).
 *
 * @param {string} currentVersion - Version of the running CLI.
 * @returns {Promise<string|null>} Notice text, or `null` when there is
 *   nothing to show.
 */
async function getUpdateNotice(currentVersion) {
  const { AGENTV_NO_UPDATE_CHECK: optOut, CI: ci } = process.env;
  const checksDisabled = optOut === "1" || ci === "true";
  if (checksDisabled) {
    return null;
  }
  const cached = await getCachedUpdateInfo();
  if (shouldCheck(cached)) backgroundUpdateCheck();
  const latestKnown = cached?.latestVersion ?? null;
  return buildNotice(currentVersion, latestKnown);
}
|
|
5723
|
+
|
|
5599
5724
|
// src/index.ts
|
|
5600
5725
|
var app = subcommands({
|
|
5601
5726
|
name: "agentv",
|
|
5602
5727
|
description: "AgentV CLI",
|
|
5603
5728
|
version: package_default.version,
|
|
5604
5729
|
cmds: {
|
|
5730
|
+
cache: cacheCommand,
|
|
5605
5731
|
eval: evalRunCommand,
|
|
5606
5732
|
prompt: evalPromptCommand,
|
|
5607
5733
|
compare: compareCommand,
|
|
@@ -5640,6 +5766,15 @@ function preprocessArgv(argv) {
|
|
|
5640
5766
|
return result;
|
|
5641
5767
|
}
|
|
5642
5768
|
/**
 * CLI entry point: preprocesses `argv` and runs the `app` command tree.
 *
 * In parallel it looks up an update notice for the current package
 * version; if one has been resolved by the time the process exits, it is
 * written to stderr surrounded by blank lines.
 *
 * @param {string[]} [argv=process.argv] - Raw process arguments.
 * @returns {Promise<void>} Settles when the selected command has finished.
 */
async function runCli(argv = process.argv) {
  let updateNotice = null;
  // "exit" handlers must be synchronous, so the notice is only printed if
  // the lookup below has already completed.
  process.on("exit", () => {
    if (updateNotice) process.stderr.write(`\n${updateNotice}\n`);
  });
  // Intentionally not awaited so the lookup never delays the command. The
  // catch keeps a failed lookup from surfacing as an unhandled rejection,
  // which would abort the CLI over a purely informational feature.
  getUpdateNotice(package_default.version).then((notice) => {
    updateNotice = notice;
  }).catch(() => {
  });
  const processedArgv = preprocessArgv(argv);
  await run(binary(app), processedArgv);
}
|
|
@@ -5649,4 +5784,4 @@ export {
|
|
|
5649
5784
|
preprocessArgv,
|
|
5650
5785
|
runCli
|
|
5651
5786
|
};
|
|
5652
|
-
//# sourceMappingURL=chunk-
|
|
5787
|
+
//# sourceMappingURL=chunk-CVC3VMZ3.js.map
|