agentv 3.13.0 → 3.13.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/dist/{chunk-7OHZAFND.js → chunk-K747KGDP.js} +47 -34
- package/dist/chunk-K747KGDP.js.map +1 -0
- package/dist/{chunk-6H4IAXQH.js → chunk-LSXO22CF.js} +8 -6
- package/dist/chunk-LSXO22CF.js.map +1 -0
- package/dist/{chunk-DJU4C6NS.js → chunk-UK7UMQOX.js} +20 -17
- package/dist/chunk-UK7UMQOX.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-SMKOBBFB.js → dist-LCZDS36N.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-RV664PCR.js → interactive-76ZJVPI7.js} +3 -3
- package/dist/templates/.agentv/.env.example +23 -0
- package/dist/templates/.agentv/config.yaml +13 -4
- package/dist/templates/.agentv/targets.yaml +16 -0
- package/package.json +1 -1
- package/dist/chunk-6H4IAXQH.js.map +0 -1
- package/dist/chunk-7OHZAFND.js.map +0 -1
- package/dist/chunk-DJU4C6NS.js.map +0 -1
- /package/dist/{dist-SMKOBBFB.js.map → dist-LCZDS36N.js.map} +0 -0
- /package/dist/{interactive-RV664PCR.js.map → interactive-76ZJVPI7.js.map} +0 -0
package/README.md
CHANGED
|
@@ -221,13 +221,13 @@ agentv eval evals/my-eval.yaml -o results.xml
|
|
|
221
221
|
|
|
222
222
|
The HTML report auto-refreshes every 2 seconds during a live run, then locks once the run completes.
|
|
223
223
|
|
|
224
|
-
By default, `agentv eval` creates a run workspace under `.agentv/results/
|
|
224
|
+
By default, `agentv eval` creates a run workspace under `.agentv/results/runs/<run>/`
|
|
225
225
|
with `index.jsonl` as the machine-facing manifest.
|
|
226
226
|
|
|
227
227
|
You can also convert an existing manifest to HTML after the fact:
|
|
228
228
|
|
|
229
229
|
```bash
|
|
230
|
-
agentv convert .agentv/results/
|
|
230
|
+
agentv convert .agentv/results/runs/eval_<timestamp>/index.jsonl -o report.html
|
|
231
231
|
```
|
|
232
232
|
|
|
233
233
|
#### Timeouts
|
|
@@ -358,7 +358,7 @@ agentv create eval my-eval # → evals/my-eval.eval.yaml + .cases.jsonl
|
|
|
358
358
|
Compare a combined results file across all targets (N-way matrix):
|
|
359
359
|
|
|
360
360
|
```bash
|
|
361
|
-
agentv compare .agentv/results/
|
|
361
|
+
agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl
|
|
362
362
|
```
|
|
363
363
|
|
|
364
364
|
```
|
|
@@ -379,8 +379,8 @@ Pairwise Summary:
|
|
|
379
379
|
Designate a baseline for CI regression gating, or compare two specific targets:
|
|
380
380
|
|
|
381
381
|
```bash
|
|
382
|
-
agentv compare .agentv/results/
|
|
383
|
-
agentv compare .agentv/results/
|
|
382
|
+
agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl --baseline gpt-4.1
|
|
383
|
+
agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini
|
|
384
384
|
agentv compare before.jsonl after.jsonl # two-file pairwise
|
|
385
385
|
```
|
|
386
386
|
|
|
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
|
|
|
301
301
|
}
|
|
302
302
|
});
|
|
303
303
|
|
|
304
|
-
// ../../packages/core/dist/chunk-
|
|
304
|
+
// ../../packages/core/dist/chunk-ZB3AUPES.js
|
|
305
305
|
import { constants } from "node:fs";
|
|
306
306
|
import { access, readFile } from "node:fs/promises";
|
|
307
307
|
import path from "node:path";
|
|
@@ -419,7 +419,7 @@ __export(external_exports2, {
|
|
|
419
419
|
void: () => voidType
|
|
420
420
|
});
|
|
421
421
|
|
|
422
|
-
// ../../packages/core/dist/chunk-
|
|
422
|
+
// ../../packages/core/dist/chunk-ZB3AUPES.js
|
|
423
423
|
import { readFile as readFile2 } from "node:fs/promises";
|
|
424
424
|
import path3 from "node:path";
|
|
425
425
|
import fg from "fast-glob";
|
|
@@ -473,8 +473,6 @@ function isTestMessage(value) {
|
|
|
473
473
|
var EVALUATOR_KIND_VALUES = [
|
|
474
474
|
"code-grader",
|
|
475
475
|
"llm-grader",
|
|
476
|
-
"code-judge",
|
|
477
|
-
"llm-judge",
|
|
478
476
|
"rubric",
|
|
479
477
|
"composite",
|
|
480
478
|
"tool-trajectory",
|
|
@@ -14960,6 +14958,9 @@ var ANSI_RESET4 = "\x1B[0m";
|
|
|
14960
14958
|
function normalizeEvaluatorType(type) {
|
|
14961
14959
|
return type.replace(/_/g, "-");
|
|
14962
14960
|
}
|
|
14961
|
+
function isDeprecatedJudgeType(type) {
|
|
14962
|
+
return type === "code-judge" || type === "llm-judge";
|
|
14963
|
+
}
|
|
14963
14964
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
14964
14965
|
const execution = rawEvalCase.execution;
|
|
14965
14966
|
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
@@ -15022,6 +15023,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15022
15023
|
const rawName = asString(rawEvaluator.name);
|
|
15023
15024
|
const rawType = rawEvaluator.type;
|
|
15024
15025
|
const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
|
|
15026
|
+
if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
|
|
15027
|
+
logWarning2(
|
|
15028
|
+
`Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
|
|
15029
|
+
);
|
|
15030
|
+
continue;
|
|
15031
|
+
}
|
|
15025
15032
|
const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
|
|
15026
15033
|
if (typeof typeValue !== "string") {
|
|
15027
15034
|
logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
|
|
@@ -15054,7 +15061,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15054
15061
|
});
|
|
15055
15062
|
continue;
|
|
15056
15063
|
}
|
|
15057
|
-
if (typeValue === "code-grader"
|
|
15064
|
+
if (typeValue === "code-grader") {
|
|
15058
15065
|
let command;
|
|
15059
15066
|
if (rawEvaluator.script !== void 0 && rawEvaluator.command === void 0) {
|
|
15060
15067
|
console.warn(
|
|
@@ -15164,7 +15171,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15164
15171
|
continue;
|
|
15165
15172
|
}
|
|
15166
15173
|
const aggregatorType = asString(rawAggregator.type);
|
|
15167
|
-
|
|
15174
|
+
const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeEvaluatorType(aggregatorType) : aggregatorType;
|
|
15175
|
+
if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
|
|
15176
|
+
logWarning2(
|
|
15177
|
+
`Skipping composite evaluator '${name21}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
|
|
15178
|
+
);
|
|
15179
|
+
continue;
|
|
15180
|
+
}
|
|
15181
|
+
if (normalizedAggregatorType !== "weighted_average" && normalizedAggregatorType !== "code-grader" && normalizedAggregatorType !== "llm-grader" && normalizedAggregatorType !== "threshold") {
|
|
15168
15182
|
logWarning2(
|
|
15169
15183
|
`Skipping composite evaluator '${name21}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
15170
15184
|
);
|
|
@@ -15199,7 +15213,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15199
15213
|
continue;
|
|
15200
15214
|
}
|
|
15201
15215
|
let aggregator;
|
|
15202
|
-
if (
|
|
15216
|
+
if (normalizedAggregatorType === "weighted_average") {
|
|
15203
15217
|
const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
|
|
15204
15218
|
const parsedWeights = {};
|
|
15205
15219
|
if (weights) {
|
|
@@ -15213,7 +15227,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15213
15227
|
type: "weighted_average",
|
|
15214
15228
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
15215
15229
|
};
|
|
15216
|
-
} else if (
|
|
15230
|
+
} else if (normalizedAggregatorType === "code-grader") {
|
|
15217
15231
|
const aggregatorPath = asString(rawAggregator.path);
|
|
15218
15232
|
if (!aggregatorPath) {
|
|
15219
15233
|
logWarning2(
|
|
@@ -15226,7 +15240,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15226
15240
|
path: aggregatorPath,
|
|
15227
15241
|
cwd: searchRoots[0]
|
|
15228
15242
|
};
|
|
15229
|
-
} else if (
|
|
15243
|
+
} else if (normalizedAggregatorType === "threshold") {
|
|
15230
15244
|
const thresholdValue = rawAggregator.threshold;
|
|
15231
15245
|
if (typeof thresholdValue !== "number" || thresholdValue < 0 || thresholdValue > 1) {
|
|
15232
15246
|
logWarning2(
|
|
@@ -15974,10 +15988,15 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
15974
15988
|
return void 0;
|
|
15975
15989
|
}
|
|
15976
15990
|
const normalized = normalizeEvaluatorType(candidate);
|
|
15991
|
+
if (isDeprecatedJudgeType(normalized)) {
|
|
15992
|
+
throw new Error(
|
|
15993
|
+
`Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
|
|
15994
|
+
);
|
|
15995
|
+
}
|
|
15977
15996
|
if (isEvaluatorKind(normalized)) {
|
|
15978
15997
|
return normalized;
|
|
15979
15998
|
}
|
|
15980
|
-
logWarning2(`Unknown
|
|
15999
|
+
logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
|
|
15981
16000
|
return void 0;
|
|
15982
16001
|
}
|
|
15983
16002
|
function asString(value) {
|
|
@@ -17380,9 +17399,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
17380
17399
|
case "ends_with":
|
|
17381
17400
|
return `Output ends with '${entry.value}'`;
|
|
17382
17401
|
case "llm-grader":
|
|
17383
|
-
case "llm_grader":
|
|
17384
|
-
case "llm-judge":
|
|
17385
|
-
case "llm_judge": {
|
|
17402
|
+
case "llm_grader": {
|
|
17386
17403
|
if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
|
|
17387
17404
|
return null;
|
|
17388
17405
|
}
|
|
@@ -17395,9 +17412,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
17395
17412
|
return tools ? `Agent called tools in order: ${tools}` : "Agent followed expected tool trajectory";
|
|
17396
17413
|
}
|
|
17397
17414
|
case "code-grader":
|
|
17398
|
-
case "code_grader":
|
|
17399
|
-
case "code-judge":
|
|
17400
|
-
case "code_judge": {
|
|
17415
|
+
case "code_grader": {
|
|
17401
17416
|
const graderName = entry.name ?? deriveGraderNameFromCommand(entry.command) ?? "code-grader";
|
|
17402
17417
|
const desc = typeof entry.description === "string" ? entry.description : void 0;
|
|
17403
17418
|
return codeGraderInstruction(graderName, desc);
|
|
@@ -17428,7 +17443,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
17428
17443
|
}
|
|
17429
17444
|
}
|
|
17430
17445
|
function assertionToNaturalLanguageList(entry) {
|
|
17431
|
-
if (entry.type === "llm-grader" || entry.type === "llm_grader"
|
|
17446
|
+
if (entry.type === "llm-grader" || entry.type === "llm_grader") {
|
|
17432
17447
|
if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
|
|
17433
17448
|
return entry.rubrics.map((r) => r.outcome ?? r.criteria ?? r.id).filter((s) => typeof s === "string");
|
|
17434
17449
|
}
|
|
@@ -24084,7 +24099,7 @@ function toCamelCaseDeep(obj) {
|
|
|
24084
24099
|
}
|
|
24085
24100
|
var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
|
|
24086
24101
|
var CodeEvaluator = class {
|
|
24087
|
-
kind = "code-
|
|
24102
|
+
kind = "code-grader";
|
|
24088
24103
|
command;
|
|
24089
24104
|
cwd;
|
|
24090
24105
|
agentTimeoutMs;
|
|
@@ -24103,7 +24118,7 @@ var CodeEvaluator = class {
|
|
|
24103
24118
|
if (outputForPayload) {
|
|
24104
24119
|
const serialized = JSON.stringify(outputForPayload);
|
|
24105
24120
|
if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
|
|
24106
|
-
const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-
|
|
24121
|
+
const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-grader-"));
|
|
24107
24122
|
outputPath = join(tmpDir, "output.json");
|
|
24108
24123
|
await writeFile6(outputPath, serialized);
|
|
24109
24124
|
outputForPayload = null;
|
|
@@ -24352,7 +24367,7 @@ var LlmGraderEvaluator = class {
|
|
|
24352
24367
|
return this.evaluateWithDelegatedAgent(context2, graderProvider);
|
|
24353
24368
|
}
|
|
24354
24369
|
const config = context2.evaluator;
|
|
24355
|
-
if (
|
|
24370
|
+
if (config?.type === "llm-grader" && config.rubrics && config.rubrics.length > 0) {
|
|
24356
24371
|
return this.evaluateWithRubrics(context2, graderProvider, config.rubrics);
|
|
24357
24372
|
}
|
|
24358
24373
|
return this.evaluateFreeform(context2, graderProvider);
|
|
@@ -24537,7 +24552,7 @@ ${context2.fileChanges}`;
|
|
|
24537
24552
|
const systemPrompt = this.buildAgentSystemPrompt(context2);
|
|
24538
24553
|
const userPrompt = this.buildAgentUserPrompt(context2);
|
|
24539
24554
|
const config = context2.evaluator;
|
|
24540
|
-
const rubrics = config?.type === "llm-grader"
|
|
24555
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
24541
24556
|
const fsTools = createFilesystemTools(workspacePath);
|
|
24542
24557
|
const evaluatorRawRequest = {
|
|
24543
24558
|
mode: "built-in",
|
|
@@ -24633,7 +24648,7 @@ ${context2.fileChanges}`;
|
|
|
24633
24648
|
};
|
|
24634
24649
|
}
|
|
24635
24650
|
const config = context2.evaluator;
|
|
24636
|
-
const rubrics = config?.type === "llm-grader"
|
|
24651
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
24637
24652
|
const details = {
|
|
24638
24653
|
mode: modeLabel,
|
|
24639
24654
|
grader_target: provider.targetName
|
|
@@ -24673,7 +24688,7 @@ ${context2.fileChanges}`;
|
|
|
24673
24688
|
*/
|
|
24674
24689
|
buildAgentSystemPrompt(context2) {
|
|
24675
24690
|
const config = context2.evaluator;
|
|
24676
|
-
const rubrics = config?.type === "llm-grader"
|
|
24691
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
24677
24692
|
const parts = [
|
|
24678
24693
|
"You are an expert evaluator with access to the workspace filesystem.",
|
|
24679
24694
|
"Use the provided tools to investigate the workspace and verify the criteria are met.",
|
|
@@ -24704,7 +24719,7 @@ ${context2.fileChanges}`;
|
|
|
24704
24719
|
return substituteVariables(this.evaluatorTemplate, variables);
|
|
24705
24720
|
}
|
|
24706
24721
|
const config = context2.evaluator;
|
|
24707
|
-
const rubrics = config?.type === "llm-grader"
|
|
24722
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
24708
24723
|
const parts = [
|
|
24709
24724
|
"Evaluate the candidate answer by investigating the workspace.",
|
|
24710
24725
|
"",
|
|
@@ -24747,7 +24762,7 @@ ${context2.fileChanges}`;
|
|
|
24747
24762
|
buildDelegatedPrompt(context2) {
|
|
24748
24763
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
24749
24764
|
const config = context2.evaluator;
|
|
24750
|
-
const rubrics = config?.type === "llm-grader"
|
|
24765
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
24751
24766
|
if (this.evaluatorTemplate) {
|
|
24752
24767
|
const variables = {
|
|
24753
24768
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
@@ -25242,10 +25257,8 @@ var CompositeEvaluator = class {
|
|
|
25242
25257
|
const aggregator = this.config.aggregator;
|
|
25243
25258
|
switch (aggregator.type) {
|
|
25244
25259
|
case "code-grader":
|
|
25245
|
-
case "code-judge":
|
|
25246
25260
|
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
25247
25261
|
case "llm-grader":
|
|
25248
|
-
case "llm-judge":
|
|
25249
25262
|
return this.runLlmAggregator(results, context2, aggregator);
|
|
25250
25263
|
case "threshold":
|
|
25251
25264
|
return this.runThreshold(results, aggregator.threshold);
|
|
@@ -27630,7 +27643,7 @@ var endsWithFactory = (config) => {
|
|
|
27630
27643
|
};
|
|
27631
27644
|
function createBuiltinRegistry() {
|
|
27632
27645
|
const registry = new EvaluatorRegistry();
|
|
27633
|
-
registry.register("llm-grader", llmGraderFactory).register("
|
|
27646
|
+
registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
|
|
27634
27647
|
const fn = config[INLINE_ASSERT_FN];
|
|
27635
27648
|
if (!fn) {
|
|
27636
27649
|
throw new Error(
|
|
@@ -30306,7 +30319,7 @@ function filterEvalCases(evalCases, filter2) {
|
|
|
30306
30319
|
return evalCases.filter((evalCase) => micromatch3.isMatch(evalCase.id, filter2));
|
|
30307
30320
|
}
|
|
30308
30321
|
function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
30309
|
-
const llmGrader = overrides?.["llm-grader"] ??
|
|
30322
|
+
const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
|
|
30310
30323
|
resolveGraderProvider: async (context2) => {
|
|
30311
30324
|
if (context2.graderProvider) {
|
|
30312
30325
|
return context2.graderProvider;
|
|
@@ -31127,10 +31140,10 @@ var OtelTraceExporter = class {
|
|
|
31127
31140
|
}
|
|
31128
31141
|
if (result.scores) {
|
|
31129
31142
|
for (const score of result.scores) {
|
|
31130
|
-
rootSpan.addEvent(`agentv.
|
|
31131
|
-
"agentv.
|
|
31132
|
-
"agentv.
|
|
31133
|
-
...score.verdict ? { "agentv.
|
|
31143
|
+
rootSpan.addEvent(`agentv.grader.${score.name}`, {
|
|
31144
|
+
"agentv.grader.score": score.score,
|
|
31145
|
+
"agentv.grader.type": score.type,
|
|
31146
|
+
...score.verdict ? { "agentv.grader.verdict": score.verdict } : {}
|
|
31134
31147
|
});
|
|
31135
31148
|
}
|
|
31136
31149
|
}
|
|
@@ -31590,4 +31603,4 @@ export {
|
|
|
31590
31603
|
OtelStreamingObserver,
|
|
31591
31604
|
createAgentKernel
|
|
31592
31605
|
};
|
|
31593
|
-
//# sourceMappingURL=chunk-
|
|
31606
|
+
//# sourceMappingURL=chunk-K747KGDP.js.map
|