agentv 2.12.0 → 2.14.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -14
- package/dist/{chunk-YBJX5CP6.js → chunk-K2APOWTE.js} +213 -29
- package/dist/chunk-K2APOWTE.js.map +1 -0
- package/dist/{chunk-LUHCYBMD.js → chunk-OQN2GDEU.js} +251 -164
- package/dist/chunk-OQN2GDEU.js.map +1 -0
- package/dist/{chunk-6KU2ZUFJ.js → chunk-ZSSGXZX6.js} +39 -77
- package/dist/chunk-ZSSGXZX6.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-OPPA4P5R.js → dist-QR5OZ4DH.js} +4 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-TOUKPSHP.js → interactive-WF6UO63B.js} +3 -3
- package/package.json +4 -2
- package/dist/chunk-6KU2ZUFJ.js.map +0 -1
- package/dist/chunk-LUHCYBMD.js.map +0 -1
- package/dist/chunk-YBJX5CP6.js.map +0 -1
- /package/dist/{dist-OPPA4P5R.js.map → dist-QR5OZ4DH.js.map} +0 -0
- /package/dist/{interactive-TOUKPSHP.js.map → interactive-WF6UO63B.js.map} +0 -0
|
@@ -2,6 +2,7 @@ import { createRequire } from 'node:module'; const require = createRequire(impor
|
|
|
2
2
|
import {
|
|
3
3
|
detectFileType,
|
|
4
4
|
findRepoRoot,
|
|
5
|
+
package_default,
|
|
5
6
|
resolveEvalPaths,
|
|
6
7
|
runEvalCommand,
|
|
7
8
|
selectTarget,
|
|
@@ -10,7 +11,7 @@ import {
|
|
|
10
11
|
validateEvalFile,
|
|
11
12
|
validateFileReferences,
|
|
12
13
|
validateTargetsFile
|
|
13
|
-
} from "./chunk-YBJX5CP6.js";
|
|
14
|
+
} from "./chunk-K2APOWTE.js";
|
|
14
15
|
import {
|
|
15
16
|
RepoManager,
|
|
16
17
|
assembleLlmJudgePrompt,
|
|
@@ -25,7 +26,7 @@ import {
|
|
|
25
26
|
toCamelCaseDeep,
|
|
26
27
|
toSnakeCaseDeep,
|
|
27
28
|
trimBaselineResult
|
|
28
|
-
} from "./chunk-LUHCYBMD.js";
|
|
29
|
+
} from "./chunk-OQN2GDEU.js";
|
|
29
30
|
import {
|
|
30
31
|
__commonJS,
|
|
31
32
|
__esm,
|
|
@@ -2875,56 +2876,6 @@ function oneOf(literals) {
|
|
|
2875
2876
|
};
|
|
2876
2877
|
}
|
|
2877
2878
|
|
|
2878
|
-
// package.json
|
|
2879
|
-
var package_default = {
|
|
2880
|
-
name: "agentv",
|
|
2881
|
-
version: "2.12.0",
|
|
2882
|
-
description: "CLI entry point for AgentV",
|
|
2883
|
-
type: "module",
|
|
2884
|
-
repository: {
|
|
2885
|
-
type: "git",
|
|
2886
|
-
url: "https://github.com/EntityProcess/agentv.git"
|
|
2887
|
-
},
|
|
2888
|
-
homepage: "https://github.com/EntityProcess/agentv#readme",
|
|
2889
|
-
bugs: {
|
|
2890
|
-
url: "https://github.com/EntityProcess/agentv/issues"
|
|
2891
|
-
},
|
|
2892
|
-
bin: {
|
|
2893
|
-
agentv: "./dist/cli.js"
|
|
2894
|
-
},
|
|
2895
|
-
files: ["dist", "README.md"],
|
|
2896
|
-
scripts: {
|
|
2897
|
-
dev: "bun src/cli.ts",
|
|
2898
|
-
build: "tsup && bun run copy-readme",
|
|
2899
|
-
"copy-readme": `bun -e "import { cpSync } from 'fs'; cpSync('../../README.md', 'README.md')"`,
|
|
2900
|
-
prepublishOnly: "bun run copy-readme",
|
|
2901
|
-
typecheck: "tsc --noEmit",
|
|
2902
|
-
lint: "biome check .",
|
|
2903
|
-
format: "biome format --write .",
|
|
2904
|
-
fix: "biome check --write .",
|
|
2905
|
-
test: "bun test",
|
|
2906
|
-
"test:watch": "bun test --watch"
|
|
2907
|
-
},
|
|
2908
|
-
dependencies: {
|
|
2909
|
-
"@anthropic-ai/claude-agent-sdk": "^0.2.49",
|
|
2910
|
-
"@github/copilot-sdk": "^0.1.25",
|
|
2911
|
-
"@inquirer/prompts": "^8.2.1",
|
|
2912
|
-
"@mariozechner/pi-agent-core": "^0.54.2",
|
|
2913
|
-
"@mariozechner/pi-ai": "^0.54.2",
|
|
2914
|
-
"@openai/codex-sdk": "^0.104.0",
|
|
2915
|
-
"cmd-ts": "^0.14.3",
|
|
2916
|
-
dotenv: "^16.4.5",
|
|
2917
|
-
"fast-glob": "^3.3.3",
|
|
2918
|
-
json5: "^2.2.3",
|
|
2919
|
-
micromatch: "^4.0.8",
|
|
2920
|
-
yaml: "^2.6.1"
|
|
2921
|
-
},
|
|
2922
|
-
devDependencies: {
|
|
2923
|
-
"@agentv/core": "workspace:*",
|
|
2924
|
-
execa: "^9.3.0"
|
|
2925
|
-
}
|
|
2926
|
-
};
|
|
2927
|
-
|
|
2928
2879
|
// src/commands/cache/add.ts
|
|
2929
2880
|
import { existsSync } from "node:fs";
|
|
2930
2881
|
import { join, resolve } from "node:path";
|
|
@@ -3533,7 +3484,7 @@ tests:
|
|
|
3533
3484
|
input: "Hello, how are you?"
|
|
3534
3485
|
expected_output: "I'm doing well, thank you for asking!"
|
|
3535
3486
|
assert:
|
|
3536
|
-
- type:
|
|
3487
|
+
- type: llm-judge
|
|
3537
3488
|
rubric:
|
|
3538
3489
|
accuracy:
|
|
3539
3490
|
weight: 0.6
|
|
@@ -3812,7 +3763,7 @@ var evalPromptJudgeCommand = command({
|
|
|
3812
3763
|
});
|
|
3813
3764
|
outputs.push({
|
|
3814
3765
|
name: "default_llm_judge",
|
|
3815
|
-
type: "
|
|
3766
|
+
type: "llm-judge",
|
|
3816
3767
|
status: "prompt_ready",
|
|
3817
3768
|
prompt: {
|
|
3818
3769
|
system_prompt: assembly.systemPrompt,
|
|
@@ -3830,7 +3781,7 @@ var evalPromptJudgeCommand = command({
|
|
|
3830
3781
|
});
|
|
3831
3782
|
async function processEvaluator(config, evalCase, candidate, promptInputs) {
|
|
3832
3783
|
switch (config.type) {
|
|
3833
|
-
case "code": {
|
|
3784
|
+
case "code-judge": {
|
|
3834
3785
|
const codeConfig = config;
|
|
3835
3786
|
const script = codeConfig.command ?? codeConfig.script ?? [];
|
|
3836
3787
|
const scriptCwd = codeConfig.resolvedCwd ?? codeConfig.cwd;
|
|
@@ -3855,14 +3806,14 @@ async function processEvaluator(config, evalCase, candidate, promptInputs) {
|
|
|
3855
3806
|
const parsed = JSON.parse(stdout);
|
|
3856
3807
|
return {
|
|
3857
3808
|
name: codeConfig.name,
|
|
3858
|
-
type: "
|
|
3809
|
+
type: "code-judge",
|
|
3859
3810
|
status: "completed",
|
|
3860
3811
|
result: parsed
|
|
3861
3812
|
};
|
|
3862
3813
|
} catch (error) {
|
|
3863
3814
|
return {
|
|
3864
3815
|
name: codeConfig.name,
|
|
3865
|
-
type: "
|
|
3816
|
+
type: "code-judge",
|
|
3866
3817
|
status: "completed",
|
|
3867
3818
|
result: {
|
|
3868
3819
|
score: 0,
|
|
@@ -3871,7 +3822,7 @@ async function processEvaluator(config, evalCase, candidate, promptInputs) {
|
|
|
3871
3822
|
};
|
|
3872
3823
|
}
|
|
3873
3824
|
}
|
|
3874
|
-
case "
|
|
3825
|
+
case "llm-judge": {
|
|
3875
3826
|
const llmConfig = config;
|
|
3876
3827
|
const assembly = assembleLlmJudgePrompt({
|
|
3877
3828
|
evalCase,
|
|
@@ -3881,7 +3832,7 @@ async function processEvaluator(config, evalCase, candidate, promptInputs) {
|
|
|
3881
3832
|
});
|
|
3882
3833
|
return {
|
|
3883
3834
|
name: llmConfig.name,
|
|
3884
|
-
type: "
|
|
3835
|
+
type: "llm-judge",
|
|
3885
3836
|
status: "prompt_ready",
|
|
3886
3837
|
prompt: {
|
|
3887
3838
|
system_prompt: assembly.systemPrompt,
|
|
@@ -3947,7 +3898,7 @@ var evalPromptOverviewCommand = command({
|
|
|
3947
3898
|
"",
|
|
3948
3899
|
"The output contains an `evaluators` array. Each evaluator has a `status`:",
|
|
3949
3900
|
"",
|
|
3950
|
-
'- **`"completed"`** \u2014 Score is final (
|
|
3901
|
+
'- **`"completed"`** \u2014 Score is final (code-judge ran deterministically). Read `result.score` (0.0\u20131.0).',
|
|
3951
3902
|
'- **`"prompt_ready"`** \u2014 LLM grading required. Send `prompt.system_prompt` as system and',
|
|
3952
3903
|
" `prompt.user_prompt` as user to your LLM. Parse the JSON response to get `score`, `hits`, `misses`.",
|
|
3953
3904
|
""
|
|
@@ -4123,11 +4074,20 @@ var evalRunCommand = command({
|
|
|
4123
4074
|
otelGroupTurns: flag({
|
|
4124
4075
|
long: "otel-group-turns",
|
|
4125
4076
|
description: "Group messages into turn spans for multi-turn evaluations (requires --export-otel)"
|
|
4077
|
+
}),
|
|
4078
|
+
retryErrors: option({
|
|
4079
|
+
type: optional(string),
|
|
4080
|
+
long: "retry-errors",
|
|
4081
|
+
description: "Path to previous output JSONL \u2014 re-run only execution_error test cases"
|
|
4082
|
+
}),
|
|
4083
|
+
strict: flag({
|
|
4084
|
+
long: "strict",
|
|
4085
|
+
description: "Exit with error on version mismatch (instead of warning)"
|
|
4126
4086
|
})
|
|
4127
4087
|
},
|
|
4128
4088
|
handler: async (args) => {
|
|
4129
4089
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4130
|
-
const { launchInteractiveWizard } = await import("./interactive-TOUKPSHP.js");
|
|
4090
|
+
const { launchInteractiveWizard } = await import("./interactive-WF6UO63B.js");
|
|
4131
4091
|
await launchInteractiveWizard();
|
|
4132
4092
|
return;
|
|
4133
4093
|
}
|
|
@@ -4157,7 +4117,9 @@ var evalRunCommand = command({
|
|
|
4157
4117
|
exportOtel: args.exportOtel,
|
|
4158
4118
|
otelBackend: args.otelBackend,
|
|
4159
4119
|
otelCaptureContent: args.otelCaptureContent,
|
|
4160
|
-
otelGroupTurns: args.otelGroupTurns
|
|
4120
|
+
otelGroupTurns: args.otelGroupTurns,
|
|
4121
|
+
retryErrors: args.retryErrors,
|
|
4122
|
+
strict: args.strict
|
|
4161
4123
|
};
|
|
4162
4124
|
await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
|
|
4163
4125
|
}
|
|
@@ -4758,12 +4720,12 @@ var traceListCommand = command({
|
|
|
4758
4720
|
var SUPPORTED_TYPES = [
|
|
4759
4721
|
"contains",
|
|
4760
4722
|
"regex",
|
|
4761
|
-
"
|
|
4723
|
+
"is-json",
|
|
4762
4724
|
"equals",
|
|
4763
4725
|
"latency",
|
|
4764
4726
|
"cost",
|
|
4765
|
-
"
|
|
4766
|
-
"
|
|
4727
|
+
"token-usage",
|
|
4728
|
+
"execution-metrics"
|
|
4767
4729
|
];
|
|
4768
4730
|
function parseKeyValues(s) {
|
|
4769
4731
|
const result = {};
|
|
@@ -4777,7 +4739,7 @@ function parseKeyValues(s) {
|
|
|
4777
4739
|
}
|
|
4778
4740
|
function parseAssertSpec(spec) {
|
|
4779
4741
|
const colonIdx = spec.indexOf(":");
|
|
4780
|
-
const type = colonIdx === -1 ? spec : spec.slice(0, colonIdx);
|
|
4742
|
+
const type = (colonIdx === -1 ? spec : spec.slice(0, colonIdx)).replace(/_/g, "-");
|
|
4781
4743
|
const params = colonIdx === -1 ? "" : spec.slice(colonIdx + 1);
|
|
4782
4744
|
switch (type) {
|
|
4783
4745
|
case "contains":
|
|
@@ -4786,8 +4748,8 @@ function parseAssertSpec(spec) {
|
|
|
4786
4748
|
case "regex":
|
|
4787
4749
|
if (!params) throw new Error("regex requires a pattern: regex:<pattern>");
|
|
4788
4750
|
return { name: "regex", type: "regex", value: params };
|
|
4789
|
-
case "
|
|
4790
|
-
return { name: "
|
|
4751
|
+
case "is-json":
|
|
4752
|
+
return { name: "is-json", type: "is-json" };
|
|
4791
4753
|
case "equals":
|
|
4792
4754
|
if (!params) throw new Error("equals requires a value: equals:<value>");
|
|
4793
4755
|
return { name: "equals", type: "equals", value: params };
|
|
@@ -4803,19 +4765,19 @@ function parseAssertSpec(spec) {
|
|
|
4803
4765
|
throw new Error("cost requires a budget in USD: cost:<usd>");
|
|
4804
4766
|
return { name: "cost", type: "cost", budget };
|
|
4805
4767
|
}
|
|
4806
|
-
case "
|
|
4768
|
+
case "token-usage": {
|
|
4807
4769
|
const kv = parseKeyValues(params);
|
|
4808
|
-
const config = { name: "
|
|
4770
|
+
const config = { name: "token-usage", type: "token-usage" };
|
|
4809
4771
|
if (kv.max_total) config.max_total = Number(kv.max_total);
|
|
4810
4772
|
if (kv.max_input) config.max_input = Number(kv.max_input);
|
|
4811
4773
|
if (kv.max_output) config.max_output = Number(kv.max_output);
|
|
4812
4774
|
return config;
|
|
4813
4775
|
}
|
|
4814
|
-
case "
|
|
4776
|
+
case "execution-metrics": {
|
|
4815
4777
|
const kv = parseKeyValues(params);
|
|
4816
4778
|
const config = {
|
|
4817
|
-
name: "
|
|
4818
|
-
type: "
|
|
4779
|
+
name: "execution-metrics",
|
|
4780
|
+
type: "execution-metrics"
|
|
4819
4781
|
};
|
|
4820
4782
|
if (kv.max_tool_calls) config.max_tool_calls = Number(kv.max_tool_calls);
|
|
4821
4783
|
if (kv.max_llm_calls) config.max_llm_calls = Number(kv.max_llm_calls);
|
|
@@ -4861,7 +4823,7 @@ var stubProvider = {
|
|
|
4861
4823
|
}
|
|
4862
4824
|
};
|
|
4863
4825
|
var stubLlmJudge = {
|
|
4864
|
-
kind: "
|
|
4826
|
+
kind: "llm-judge",
|
|
4865
4827
|
evaluate() {
|
|
4866
4828
|
throw new Error("trace score does not support LLM-based evaluators");
|
|
4867
4829
|
}
|
|
@@ -4955,7 +4917,7 @@ var traceScoreCommand = command({
|
|
|
4955
4917
|
type: string,
|
|
4956
4918
|
long: "assert",
|
|
4957
4919
|
short: "a",
|
|
4958
|
-
description: "Evaluator spec: contains:<val>, regex:<pat>,
|
|
4920
|
+
description: "Evaluator spec: contains:<val>, regex:<pat>, is-json, equals:<val>, latency:<ms>, cost:<usd>, token-usage:<params>, execution-metrics:<params>"
|
|
4959
4921
|
}),
|
|
4960
4922
|
testId: option({
|
|
4961
4923
|
type: optional(string),
|
|
@@ -4990,7 +4952,7 @@ var traceScoreCommand = command({
|
|
|
4990
4952
|
console.error(`${c2.yellow}Warning:${c2.reset} No results found in ${file}`);
|
|
4991
4953
|
process.exit(0);
|
|
4992
4954
|
}
|
|
4993
|
-
const traceRequired = ["latency", "cost", "
|
|
4955
|
+
const traceRequired = ["latency", "cost", "token-usage", "execution-metrics"].includes(
|
|
4994
4956
|
evaluatorConfig.type
|
|
4995
4957
|
);
|
|
4996
4958
|
if (traceRequired) {
|
|
@@ -5831,4 +5793,4 @@ export {
|
|
|
5831
5793
|
preprocessArgv,
|
|
5832
5794
|
runCli
|
|
5833
5795
|
};
|
|
5834
|
-
//# sourceMappingURL=chunk-6KU2ZUFJ.js.map
|
|
5796
|
+
//# sourceMappingURL=chunk-ZSSGXZX6.js.map
|