agentv 2.13.0 → 2.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -14
- package/dist/{chunk-UWDI4UVN.js → chunk-5646K2XJ.js} +15 -14
- package/dist/{chunk-UWDI4UVN.js.map → chunk-5646K2XJ.js.map} +1 -1
- package/dist/{chunk-FSBZM3HT.js → chunk-OQN2GDEU.js} +188 -162
- package/dist/chunk-OQN2GDEU.js.map +1 -0
- package/dist/{chunk-M6JYP6A6.js → chunk-YVWP4Z3W.js} +26 -26
- package/dist/chunk-YVWP4Z3W.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-CCUHG3SN.js → dist-QR5OZ4DH.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-P3D5O673.js → interactive-Z6ZV5OGM.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-FSBZM3HT.js.map +0 -1
- package/dist/chunk-M6JYP6A6.js.map +0 -1
- /package/dist/{dist-CCUHG3SN.js.map → dist-QR5OZ4DH.js.map} +0 -0
- /package/dist/{interactive-P3D5O673.js.map → interactive-Z6ZV5OGM.js.map} +0 -0
|
@@ -11,7 +11,7 @@ import {
|
|
|
11
11
|
validateEvalFile,
|
|
12
12
|
validateFileReferences,
|
|
13
13
|
validateTargetsFile
|
|
14
|
-
} from "./chunk-UWDI4UVN.js";
|
|
14
|
+
} from "./chunk-5646K2XJ.js";
|
|
15
15
|
import {
|
|
16
16
|
RepoManager,
|
|
17
17
|
assembleLlmJudgePrompt,
|
|
@@ -26,7 +26,7 @@ import {
|
|
|
26
26
|
toCamelCaseDeep,
|
|
27
27
|
toSnakeCaseDeep,
|
|
28
28
|
trimBaselineResult
|
|
29
|
-
} from "./chunk-FSBZM3HT.js";
|
|
29
|
+
} from "./chunk-OQN2GDEU.js";
|
|
30
30
|
import {
|
|
31
31
|
__commonJS,
|
|
32
32
|
__esm,
|
|
@@ -3484,7 +3484,7 @@ tests:
|
|
|
3484
3484
|
input: "Hello, how are you?"
|
|
3485
3485
|
expected_output: "I'm doing well, thank you for asking!"
|
|
3486
3486
|
assert:
|
|
3487
|
-
- type:
|
|
3487
|
+
- type: llm-judge
|
|
3488
3488
|
rubric:
|
|
3489
3489
|
accuracy:
|
|
3490
3490
|
weight: 0.6
|
|
@@ -3763,7 +3763,7 @@ var evalPromptJudgeCommand = command({
|
|
|
3763
3763
|
});
|
|
3764
3764
|
outputs.push({
|
|
3765
3765
|
name: "default_llm_judge",
|
|
3766
|
-
type: "
|
|
3766
|
+
type: "llm-judge",
|
|
3767
3767
|
status: "prompt_ready",
|
|
3768
3768
|
prompt: {
|
|
3769
3769
|
system_prompt: assembly.systemPrompt,
|
|
@@ -3781,7 +3781,7 @@ var evalPromptJudgeCommand = command({
|
|
|
3781
3781
|
});
|
|
3782
3782
|
async function processEvaluator(config, evalCase, candidate, promptInputs) {
|
|
3783
3783
|
switch (config.type) {
|
|
3784
|
-
case "code": {
|
|
3784
|
+
case "code-judge": {
|
|
3785
3785
|
const codeConfig = config;
|
|
3786
3786
|
const script = codeConfig.command ?? codeConfig.script ?? [];
|
|
3787
3787
|
const scriptCwd = codeConfig.resolvedCwd ?? codeConfig.cwd;
|
|
@@ -3806,14 +3806,14 @@ async function processEvaluator(config, evalCase, candidate, promptInputs) {
|
|
|
3806
3806
|
const parsed = JSON.parse(stdout);
|
|
3807
3807
|
return {
|
|
3808
3808
|
name: codeConfig.name,
|
|
3809
|
-
type: "
|
|
3809
|
+
type: "code-judge",
|
|
3810
3810
|
status: "completed",
|
|
3811
3811
|
result: parsed
|
|
3812
3812
|
};
|
|
3813
3813
|
} catch (error) {
|
|
3814
3814
|
return {
|
|
3815
3815
|
name: codeConfig.name,
|
|
3816
|
-
type: "
|
|
3816
|
+
type: "code-judge",
|
|
3817
3817
|
status: "completed",
|
|
3818
3818
|
result: {
|
|
3819
3819
|
score: 0,
|
|
@@ -3822,7 +3822,7 @@ async function processEvaluator(config, evalCase, candidate, promptInputs) {
|
|
|
3822
3822
|
};
|
|
3823
3823
|
}
|
|
3824
3824
|
}
|
|
3825
|
-
case "
|
|
3825
|
+
case "llm-judge": {
|
|
3826
3826
|
const llmConfig = config;
|
|
3827
3827
|
const assembly = assembleLlmJudgePrompt({
|
|
3828
3828
|
evalCase,
|
|
@@ -3832,7 +3832,7 @@ async function processEvaluator(config, evalCase, candidate, promptInputs) {
|
|
|
3832
3832
|
});
|
|
3833
3833
|
return {
|
|
3834
3834
|
name: llmConfig.name,
|
|
3835
|
-
type: "
|
|
3835
|
+
type: "llm-judge",
|
|
3836
3836
|
status: "prompt_ready",
|
|
3837
3837
|
prompt: {
|
|
3838
3838
|
system_prompt: assembly.systemPrompt,
|
|
@@ -3898,7 +3898,7 @@ var evalPromptOverviewCommand = command({
|
|
|
3898
3898
|
"",
|
|
3899
3899
|
"The output contains an `evaluators` array. Each evaluator has a `status`:",
|
|
3900
3900
|
"",
|
|
3901
|
-
'- **`"completed"`** \u2014 Score is final (
|
|
3901
|
+
'- **`"completed"`** \u2014 Score is final (code-judge ran deterministically). Read `result.score` (0.0\u20131.0).',
|
|
3902
3902
|
'- **`"prompt_ready"`** \u2014 LLM grading required. Send `prompt.system_prompt` as system and',
|
|
3903
3903
|
" `prompt.user_prompt` as user to your LLM. Parse the JSON response to get `score`, `hits`, `misses`.",
|
|
3904
3904
|
""
|
|
@@ -4087,7 +4087,7 @@ var evalRunCommand = command({
|
|
|
4087
4087
|
},
|
|
4088
4088
|
handler: async (args) => {
|
|
4089
4089
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4090
|
-
const { launchInteractiveWizard } = await import("./interactive-P3D5O673.js");
|
|
4090
|
+
const { launchInteractiveWizard } = await import("./interactive-Z6ZV5OGM.js");
|
|
4091
4091
|
await launchInteractiveWizard();
|
|
4092
4092
|
return;
|
|
4093
4093
|
}
|
|
@@ -4720,12 +4720,12 @@ var traceListCommand = command({
|
|
|
4720
4720
|
var SUPPORTED_TYPES = [
|
|
4721
4721
|
"contains",
|
|
4722
4722
|
"regex",
|
|
4723
|
-
"
|
|
4723
|
+
"is-json",
|
|
4724
4724
|
"equals",
|
|
4725
4725
|
"latency",
|
|
4726
4726
|
"cost",
|
|
4727
|
-
"
|
|
4728
|
-
"
|
|
4727
|
+
"token-usage",
|
|
4728
|
+
"execution-metrics"
|
|
4729
4729
|
];
|
|
4730
4730
|
function parseKeyValues(s) {
|
|
4731
4731
|
const result = {};
|
|
@@ -4739,7 +4739,7 @@ function parseKeyValues(s) {
|
|
|
4739
4739
|
}
|
|
4740
4740
|
function parseAssertSpec(spec) {
|
|
4741
4741
|
const colonIdx = spec.indexOf(":");
|
|
4742
|
-
const type = colonIdx === -1 ? spec : spec.slice(0, colonIdx);
|
|
4742
|
+
const type = (colonIdx === -1 ? spec : spec.slice(0, colonIdx)).replace(/_/g, "-");
|
|
4743
4743
|
const params = colonIdx === -1 ? "" : spec.slice(colonIdx + 1);
|
|
4744
4744
|
switch (type) {
|
|
4745
4745
|
case "contains":
|
|
@@ -4748,8 +4748,8 @@ function parseAssertSpec(spec) {
|
|
|
4748
4748
|
case "regex":
|
|
4749
4749
|
if (!params) throw new Error("regex requires a pattern: regex:<pattern>");
|
|
4750
4750
|
return { name: "regex", type: "regex", value: params };
|
|
4751
|
-
case "
|
|
4752
|
-
return { name: "
|
|
4751
|
+
case "is-json":
|
|
4752
|
+
return { name: "is-json", type: "is-json" };
|
|
4753
4753
|
case "equals":
|
|
4754
4754
|
if (!params) throw new Error("equals requires a value: equals:<value>");
|
|
4755
4755
|
return { name: "equals", type: "equals", value: params };
|
|
@@ -4765,19 +4765,19 @@ function parseAssertSpec(spec) {
|
|
|
4765
4765
|
throw new Error("cost requires a budget in USD: cost:<usd>");
|
|
4766
4766
|
return { name: "cost", type: "cost", budget };
|
|
4767
4767
|
}
|
|
4768
|
-
case "
|
|
4768
|
+
case "token-usage": {
|
|
4769
4769
|
const kv = parseKeyValues(params);
|
|
4770
|
-
const config = { name: "
|
|
4770
|
+
const config = { name: "token-usage", type: "token-usage" };
|
|
4771
4771
|
if (kv.max_total) config.max_total = Number(kv.max_total);
|
|
4772
4772
|
if (kv.max_input) config.max_input = Number(kv.max_input);
|
|
4773
4773
|
if (kv.max_output) config.max_output = Number(kv.max_output);
|
|
4774
4774
|
return config;
|
|
4775
4775
|
}
|
|
4776
|
-
case "
|
|
4776
|
+
case "execution-metrics": {
|
|
4777
4777
|
const kv = parseKeyValues(params);
|
|
4778
4778
|
const config = {
|
|
4779
|
-
name: "
|
|
4780
|
-
type: "
|
|
4779
|
+
name: "execution-metrics",
|
|
4780
|
+
type: "execution-metrics"
|
|
4781
4781
|
};
|
|
4782
4782
|
if (kv.max_tool_calls) config.max_tool_calls = Number(kv.max_tool_calls);
|
|
4783
4783
|
if (kv.max_llm_calls) config.max_llm_calls = Number(kv.max_llm_calls);
|
|
@@ -4823,7 +4823,7 @@ var stubProvider = {
|
|
|
4823
4823
|
}
|
|
4824
4824
|
};
|
|
4825
4825
|
var stubLlmJudge = {
|
|
4826
|
-
kind: "
|
|
4826
|
+
kind: "llm-judge",
|
|
4827
4827
|
evaluate() {
|
|
4828
4828
|
throw new Error("trace score does not support LLM-based evaluators");
|
|
4829
4829
|
}
|
|
@@ -4917,7 +4917,7 @@ var traceScoreCommand = command({
|
|
|
4917
4917
|
type: string,
|
|
4918
4918
|
long: "assert",
|
|
4919
4919
|
short: "a",
|
|
4920
|
-
description: "Evaluator spec: contains:<val>, regex:<pat>,
|
|
4920
|
+
description: "Evaluator spec: contains:<val>, regex:<pat>, is-json, equals:<val>, latency:<ms>, cost:<usd>, token-usage:<params>, execution-metrics:<params>"
|
|
4921
4921
|
}),
|
|
4922
4922
|
testId: option({
|
|
4923
4923
|
type: optional(string),
|
|
@@ -4952,7 +4952,7 @@ var traceScoreCommand = command({
|
|
|
4952
4952
|
console.error(`${c2.yellow}Warning:${c2.reset} No results found in ${file}`);
|
|
4953
4953
|
process.exit(0);
|
|
4954
4954
|
}
|
|
4955
|
-
const traceRequired = ["latency", "cost", "
|
|
4955
|
+
const traceRequired = ["latency", "cost", "token-usage", "execution-metrics"].includes(
|
|
4956
4956
|
evaluatorConfig.type
|
|
4957
4957
|
);
|
|
4958
4958
|
if (traceRequired) {
|
|
@@ -5793,4 +5793,4 @@ export {
|
|
|
5793
5793
|
preprocessArgv,
|
|
5794
5794
|
runCli
|
|
5795
5795
|
};
|
|
5796
|
-
//# sourceMappingURL=chunk-M6JYP6A6.js.map
|
|
5796
|
+
//# sourceMappingURL=chunk-YVWP4Z3W.js.map
|