agentv 0.25.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/dist/{chunk-ZVSFP6NK.js → chunk-RIJO5WBF.js} +94 -33
- package/dist/chunk-RIJO5WBF.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +37 -20
- package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +94 -2
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +8 -8
- package/package.json +1 -1
- package/dist/chunk-ZVSFP6NK.js.map +0 -1
package/README.md
CHANGED
|
@@ -253,9 +253,9 @@ Code evaluators receive input via stdin and write output to stdout as JSON.
|
|
|
253
253
|
"expected_outcome": "expected outcome description",
|
|
254
254
|
"reference_answer": "gold standard answer (optional)",
|
|
255
255
|
"candidate_answer": "generated code/text from the agent",
|
|
256
|
-
"
|
|
257
|
-
"input_files": ["
|
|
258
|
-
"
|
|
256
|
+
"guideline_files": ["path/to/guideline1.md", "path/to/guideline2.md"],
|
|
257
|
+
"input_files": ["path/to/data.json", "path/to/config.yaml"],
|
|
258
|
+
"input_messages": [{"role": "user", "content": "..."}]
|
|
259
259
|
}
|
|
260
260
|
```
|
|
261
261
|
|
|
@@ -164,7 +164,7 @@ import { access as access6, mkdir as mkdir7 } from "node:fs/promises";
|
|
|
164
164
|
import path19 from "node:path";
|
|
165
165
|
import { pathToFileURL } from "node:url";
|
|
166
166
|
|
|
167
|
-
// ../../packages/core/dist/chunk-
|
|
167
|
+
// ../../packages/core/dist/chunk-V3JCB3HI.js
|
|
168
168
|
import { constants } from "node:fs";
|
|
169
169
|
import { access, readFile } from "node:fs/promises";
|
|
170
170
|
import path from "node:path";
|
|
@@ -4211,7 +4211,7 @@ var coerce = {
|
|
|
4211
4211
|
};
|
|
4212
4212
|
var NEVER = INVALID;
|
|
4213
4213
|
|
|
4214
|
-
// ../../packages/core/dist/chunk-
|
|
4214
|
+
// ../../packages/core/dist/chunk-V3JCB3HI.js
|
|
4215
4215
|
async function fileExists(filePath) {
|
|
4216
4216
|
try {
|
|
4217
4217
|
await access(filePath, constants.F_OK);
|
|
@@ -4470,7 +4470,10 @@ function resolveAzureConfig(target, env) {
|
|
|
4470
4470
|
const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
|
|
4471
4471
|
const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
|
|
4472
4472
|
const version2 = normalizeAzureApiVersion(
|
|
4473
|
-
resolveOptionalString(versionSource, env, `${target.name} api version
|
|
4473
|
+
resolveOptionalString(versionSource, env, `${target.name} api version`, {
|
|
4474
|
+
allowLiteral: true,
|
|
4475
|
+
optionalEnv: true
|
|
4476
|
+
})
|
|
4474
4477
|
);
|
|
4475
4478
|
const temperature = resolveOptionalNumber(temperatureSource, `${target.name} temperature`);
|
|
4476
4479
|
const maxOutputTokens = resolveOptionalNumber(
|
|
@@ -34575,7 +34578,7 @@ var EVALUATOR_KIND_VALUES = [
|
|
|
34575
34578
|
"rubric",
|
|
34576
34579
|
"composite",
|
|
34577
34580
|
"tool_trajectory",
|
|
34578
|
-
"
|
|
34581
|
+
"expected_tool_calls"
|
|
34579
34582
|
];
|
|
34580
34583
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
34581
34584
|
function isEvaluatorKind(value) {
|
|
@@ -34928,6 +34931,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
34928
34931
|
logWarning2(`Skipping code_judge evaluator '${name16}' in '${evalId}': missing script`);
|
|
34929
34932
|
continue;
|
|
34930
34933
|
}
|
|
34934
|
+
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
34931
34935
|
const cwd = asString2(rawEvaluator.cwd);
|
|
34932
34936
|
let resolvedCwd;
|
|
34933
34937
|
if (cwd) {
|
|
@@ -34948,7 +34952,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
34948
34952
|
type: "code",
|
|
34949
34953
|
script,
|
|
34950
34954
|
cwd,
|
|
34951
|
-
resolvedCwd
|
|
34955
|
+
resolvedCwd,
|
|
34956
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
34952
34957
|
});
|
|
34953
34958
|
continue;
|
|
34954
34959
|
}
|
|
@@ -35043,18 +35048,22 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35043
35048
|
...promptPath2 ? { promptPath: promptPath2 } : {}
|
|
35044
35049
|
};
|
|
35045
35050
|
}
|
|
35051
|
+
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35046
35052
|
evaluators.push({
|
|
35047
35053
|
name: name16,
|
|
35048
35054
|
type: "composite",
|
|
35049
35055
|
evaluators: memberEvaluators,
|
|
35050
|
-
aggregator
|
|
35056
|
+
aggregator,
|
|
35057
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
35051
35058
|
});
|
|
35052
35059
|
continue;
|
|
35053
35060
|
}
|
|
35054
|
-
if (typeValue === "
|
|
35061
|
+
if (typeValue === "expected_tool_calls") {
|
|
35062
|
+
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35055
35063
|
evaluators.push({
|
|
35056
35064
|
name: name16,
|
|
35057
|
-
type: "
|
|
35065
|
+
type: "expected_tool_calls",
|
|
35066
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
35058
35067
|
});
|
|
35059
35068
|
continue;
|
|
35060
35069
|
}
|
|
@@ -35110,12 +35119,14 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35110
35119
|
);
|
|
35111
35120
|
continue;
|
|
35112
35121
|
}
|
|
35122
|
+
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35113
35123
|
const config2 = {
|
|
35114
35124
|
name: name16,
|
|
35115
35125
|
type: "tool_trajectory",
|
|
35116
35126
|
mode,
|
|
35117
35127
|
...minimums ? { minimums } : {},
|
|
35118
|
-
...expected ? { expected } : {}
|
|
35128
|
+
...expected ? { expected } : {},
|
|
35129
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
35119
35130
|
};
|
|
35120
35131
|
evaluators.push(config2);
|
|
35121
35132
|
continue;
|
|
@@ -35156,19 +35167,23 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35156
35167
|
logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': no valid rubrics found`);
|
|
35157
35168
|
continue;
|
|
35158
35169
|
}
|
|
35170
|
+
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35159
35171
|
evaluators.push({
|
|
35160
35172
|
name: name16,
|
|
35161
35173
|
type: "llm_judge",
|
|
35162
|
-
rubrics: parsedRubrics
|
|
35174
|
+
rubrics: parsedRubrics,
|
|
35175
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
35163
35176
|
});
|
|
35164
35177
|
continue;
|
|
35165
35178
|
}
|
|
35179
|
+
const weight = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35166
35180
|
evaluators.push({
|
|
35167
35181
|
name: name16,
|
|
35168
35182
|
type: "llm_judge",
|
|
35169
35183
|
prompt,
|
|
35170
35184
|
promptPath,
|
|
35171
|
-
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
|
|
35185
|
+
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
|
|
35186
|
+
...weight !== void 0 ? { weight } : {}
|
|
35172
35187
|
});
|
|
35173
35188
|
}
|
|
35174
35189
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
@@ -35198,6 +35213,27 @@ ${detailBlock}${ANSI_RESET3}`);
|
|
|
35198
35213
|
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
|
|
35199
35214
|
}
|
|
35200
35215
|
}
|
|
35216
|
+
function validateWeight(rawWeight, evaluatorName, evalId) {
|
|
35217
|
+
if (rawWeight === void 0) {
|
|
35218
|
+
return void 0;
|
|
35219
|
+
}
|
|
35220
|
+
if (typeof rawWeight !== "number") {
|
|
35221
|
+
throw new Error(
|
|
35222
|
+
`Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be a number`
|
|
35223
|
+
);
|
|
35224
|
+
}
|
|
35225
|
+
if (!Number.isFinite(rawWeight)) {
|
|
35226
|
+
throw new Error(
|
|
35227
|
+
`Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be finite (got ${rawWeight})`
|
|
35228
|
+
);
|
|
35229
|
+
}
|
|
35230
|
+
if (rawWeight < 0) {
|
|
35231
|
+
throw new Error(
|
|
35232
|
+
`Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be non-negative (got ${rawWeight})`
|
|
35233
|
+
);
|
|
35234
|
+
}
|
|
35235
|
+
return rawWeight;
|
|
35236
|
+
}
|
|
35201
35237
|
var ANSI_YELLOW4 = "\x1B[33m";
|
|
35202
35238
|
var ANSI_RESET4 = "\x1B[0m";
|
|
35203
35239
|
async function processMessages(options) {
|
|
@@ -37906,9 +37942,11 @@ var CodeEvaluator = class {
|
|
|
37906
37942
|
expected_outcome: context.evalCase.expected_outcome,
|
|
37907
37943
|
reference_answer: context.evalCase.reference_answer,
|
|
37908
37944
|
candidate_answer: context.candidate,
|
|
37909
|
-
|
|
37910
|
-
input_files: context.evalCase.file_paths
|
|
37911
|
-
|
|
37945
|
+
guideline_files: context.evalCase.guideline_paths,
|
|
37946
|
+
input_files: context.evalCase.file_paths.filter(
|
|
37947
|
+
(path132) => !context.evalCase.guideline_paths.includes(path132)
|
|
37948
|
+
),
|
|
37949
|
+
input_messages: context.evalCase.input_messages
|
|
37912
37950
|
},
|
|
37913
37951
|
null,
|
|
37914
37952
|
2
|
|
@@ -38174,8 +38212,8 @@ var ToolTrajectoryEvaluator = class {
|
|
|
38174
38212
|
};
|
|
38175
38213
|
}
|
|
38176
38214
|
};
|
|
38177
|
-
var
|
|
38178
|
-
kind = "
|
|
38215
|
+
var ExpectedToolCallsEvaluator = class {
|
|
38216
|
+
kind = "expected_tool_calls";
|
|
38179
38217
|
evaluate(context) {
|
|
38180
38218
|
const { candidateTrace, evalCase } = context;
|
|
38181
38219
|
const expectedSegments = evalCase.expected_segments;
|
|
@@ -39070,14 +39108,12 @@ async function evaluateCandidate(options) {
|
|
|
39070
39108
|
} else {
|
|
39071
39109
|
if (promptInputs.chatPrompt) {
|
|
39072
39110
|
lmProviderRequest = {
|
|
39073
|
-
chat_prompt: promptInputs.chatPrompt
|
|
39074
|
-
guideline_paths: evalCase.guideline_paths
|
|
39111
|
+
chat_prompt: promptInputs.chatPrompt
|
|
39075
39112
|
};
|
|
39076
39113
|
} else {
|
|
39077
39114
|
lmProviderRequest = {
|
|
39078
39115
|
question: promptInputs.question,
|
|
39079
|
-
guidelines: promptInputs.guidelines
|
|
39080
|
-
guideline_paths: evalCase.guideline_paths
|
|
39116
|
+
guidelines: promptInputs.guidelines
|
|
39081
39117
|
};
|
|
39082
39118
|
}
|
|
39083
39119
|
}
|
|
@@ -39184,11 +39220,13 @@ async function runEvaluatorList(options) {
|
|
|
39184
39220
|
now,
|
|
39185
39221
|
judgeProvider
|
|
39186
39222
|
});
|
|
39187
|
-
|
|
39223
|
+
const weight = evaluator.weight ?? 1;
|
|
39224
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
39188
39225
|
evaluatorResults.push({
|
|
39189
39226
|
name: evaluator.name,
|
|
39190
39227
|
type: evaluator.type,
|
|
39191
39228
|
score: score2.score,
|
|
39229
|
+
weight,
|
|
39192
39230
|
verdict: score2.verdict,
|
|
39193
39231
|
hits: score2.hits,
|
|
39194
39232
|
misses: score2.misses,
|
|
@@ -39211,11 +39249,13 @@ async function runEvaluatorList(options) {
|
|
|
39211
39249
|
promptInputs,
|
|
39212
39250
|
now
|
|
39213
39251
|
});
|
|
39214
|
-
|
|
39252
|
+
const weight = evaluator.weight ?? 1;
|
|
39253
|
+
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
39215
39254
|
evaluatorResults.push({
|
|
39216
39255
|
name: evaluator.name,
|
|
39217
39256
|
type: "code_judge",
|
|
39218
39257
|
score: score2.score,
|
|
39258
|
+
weight,
|
|
39219
39259
|
verdict: score2.verdict,
|
|
39220
39260
|
hits: score2.hits,
|
|
39221
39261
|
misses: score2.misses,
|
|
@@ -39245,8 +39285,8 @@ async function runEvaluatorList(options) {
|
|
|
39245
39285
|
return new ToolTrajectoryEvaluator({
|
|
39246
39286
|
config: memberConfig
|
|
39247
39287
|
});
|
|
39248
|
-
case "
|
|
39249
|
-
return new
|
|
39288
|
+
case "expected_tool_calls":
|
|
39289
|
+
return new ExpectedToolCallsEvaluator();
|
|
39250
39290
|
default: {
|
|
39251
39291
|
const unknownConfig = memberConfig;
|
|
39252
39292
|
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
@@ -39268,11 +39308,13 @@ async function runEvaluatorList(options) {
|
|
|
39268
39308
|
now,
|
|
39269
39309
|
judgeProvider
|
|
39270
39310
|
});
|
|
39271
|
-
|
|
39311
|
+
const weight = evaluator.weight ?? 1;
|
|
39312
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
39272
39313
|
evaluatorResults.push({
|
|
39273
39314
|
name: evaluator.name,
|
|
39274
39315
|
type: evaluator.type,
|
|
39275
39316
|
score: score2.score,
|
|
39317
|
+
weight,
|
|
39276
39318
|
verdict: score2.verdict,
|
|
39277
39319
|
hits: score2.hits,
|
|
39278
39320
|
misses: score2.misses,
|
|
@@ -39296,20 +39338,22 @@ async function runEvaluatorList(options) {
|
|
|
39296
39338
|
candidateTrace,
|
|
39297
39339
|
candidateTraceSummary
|
|
39298
39340
|
});
|
|
39299
|
-
|
|
39341
|
+
const weight = evaluator.weight ?? 1;
|
|
39342
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
39300
39343
|
evaluatorResults.push({
|
|
39301
39344
|
name: evaluator.name,
|
|
39302
39345
|
type: evaluator.type,
|
|
39303
39346
|
score: score2.score,
|
|
39347
|
+
weight,
|
|
39304
39348
|
verdict: score2.verdict,
|
|
39305
39349
|
hits: score2.hits,
|
|
39306
39350
|
misses: score2.misses,
|
|
39307
39351
|
reasoning: score2.reasoning
|
|
39308
39352
|
});
|
|
39309
39353
|
}
|
|
39310
|
-
if (evaluator.type === "
|
|
39311
|
-
const
|
|
39312
|
-
const score2 =
|
|
39354
|
+
if (evaluator.type === "expected_tool_calls") {
|
|
39355
|
+
const expectedToolCallsEvaluator = new ExpectedToolCallsEvaluator();
|
|
39356
|
+
const score2 = expectedToolCallsEvaluator.evaluate({
|
|
39313
39357
|
evalCase,
|
|
39314
39358
|
candidate,
|
|
39315
39359
|
target,
|
|
@@ -39320,11 +39364,13 @@ async function runEvaluatorList(options) {
|
|
|
39320
39364
|
candidateTrace,
|
|
39321
39365
|
candidateTraceSummary
|
|
39322
39366
|
});
|
|
39323
|
-
|
|
39367
|
+
const weight = evaluator.weight ?? 1;
|
|
39368
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
39324
39369
|
evaluatorResults.push({
|
|
39325
39370
|
name: evaluator.name,
|
|
39326
39371
|
type: evaluator.type,
|
|
39327
39372
|
score: score2.score,
|
|
39373
|
+
weight,
|
|
39328
39374
|
verdict: score2.verdict,
|
|
39329
39375
|
hits: score2.hits,
|
|
39330
39376
|
misses: score2.misses,
|
|
@@ -39342,15 +39388,18 @@ async function runEvaluatorList(options) {
|
|
|
39342
39388
|
reasoning: message
|
|
39343
39389
|
};
|
|
39344
39390
|
const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
|
|
39391
|
+
const weight = evaluator.weight ?? 1;
|
|
39345
39392
|
scored.push({
|
|
39346
39393
|
score: fallbackScore,
|
|
39347
39394
|
name: evaluator.name ?? "unknown",
|
|
39348
|
-
type: resultType ?? "llm_judge"
|
|
39395
|
+
type: resultType ?? "llm_judge",
|
|
39396
|
+
weight
|
|
39349
39397
|
});
|
|
39350
39398
|
evaluatorResults.push({
|
|
39351
39399
|
name: evaluator.name ?? "unknown",
|
|
39352
39400
|
type: resultType ?? "llm_judge",
|
|
39353
39401
|
score: 0,
|
|
39402
|
+
weight,
|
|
39354
39403
|
verdict: "fail",
|
|
39355
39404
|
hits: [],
|
|
39356
39405
|
misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
|
|
@@ -39358,7 +39407,9 @@ async function runEvaluatorList(options) {
|
|
|
39358
39407
|
});
|
|
39359
39408
|
}
|
|
39360
39409
|
}
|
|
39361
|
-
const aggregateScore = scored.length > 0 ?
|
|
39410
|
+
const aggregateScore = scored.length > 0 ? computeWeightedMean(
|
|
39411
|
+
scored.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
|
|
39412
|
+
) : 0;
|
|
39362
39413
|
const hits = scored.flatMap((entry) => entry.score.hits);
|
|
39363
39414
|
const misses = scored.flatMap((entry) => entry.score.misses);
|
|
39364
39415
|
const expectedAspectCount = scored.reduce(
|
|
@@ -39584,6 +39635,16 @@ function mapChildResults(children) {
|
|
|
39584
39635
|
evaluator_results: mapChildResults(child.evaluatorResults)
|
|
39585
39636
|
}));
|
|
39586
39637
|
}
|
|
39638
|
+
function computeWeightedMean(entries) {
|
|
39639
|
+
let totalWeight = 0;
|
|
39640
|
+
let weightedSum = 0;
|
|
39641
|
+
for (const entry of entries) {
|
|
39642
|
+
const weight = entry.weight ?? 1;
|
|
39643
|
+
totalWeight += weight;
|
|
39644
|
+
weightedSum += entry.score * weight;
|
|
39645
|
+
}
|
|
39646
|
+
return totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
39647
|
+
}
|
|
39587
39648
|
var rubricItemSchema = external_exports.object({
|
|
39588
39649
|
id: external_exports.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
39589
39650
|
description: external_exports.string().describe("What this rubric checks for"),
|
|
@@ -42647,4 +42708,4 @@ export {
|
|
|
42647
42708
|
app,
|
|
42648
42709
|
runCli
|
|
42649
42710
|
};
|
|
42650
|
-
//# sourceMappingURL=chunk-
|
|
42711
|
+
//# sourceMappingURL=chunk-RIJO5WBF.js.map
|