agentv 3.13.0 → 3.13.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/dist/{chunk-6H4IAXQH.js → chunk-4Z5E5CYT.js} +54 -22
- package/dist/chunk-4Z5E5CYT.js.map +1 -0
- package/dist/{chunk-7OHZAFND.js → chunk-D3LNJUUB.js} +67 -35
- package/dist/chunk-D3LNJUUB.js.map +1 -0
- package/dist/{chunk-DJU4C6NS.js → chunk-X2343WOK.js} +31 -19
- package/dist/chunk-X2343WOK.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-SMKOBBFB.js → dist-KPMR7RBT.js} +4 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-RV664PCR.js → interactive-HVKLYGRX.js} +3 -3
- package/dist/templates/.agentv/.env.example +23 -0
- package/dist/templates/.agentv/config.yaml +13 -4
- package/dist/templates/.agentv/targets.yaml +16 -0
- package/package.json +1 -1
- package/dist/chunk-6H4IAXQH.js.map +0 -1
- package/dist/chunk-7OHZAFND.js.map +0 -1
- package/dist/chunk-DJU4C6NS.js.map +0 -1
- /package/dist/{dist-SMKOBBFB.js.map → dist-KPMR7RBT.js.map} +0 -0
- /package/dist/{interactive-RV664PCR.js.map → interactive-HVKLYGRX.js.map} +0 -0
|
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
|
|
|
301
301
|
}
|
|
302
302
|
});
|
|
303
303
|
|
|
304
|
-
// ../../packages/core/dist/chunk-
|
|
304
|
+
// ../../packages/core/dist/chunk-ZB3AUPES.js
|
|
305
305
|
import { constants } from "node:fs";
|
|
306
306
|
import { access, readFile } from "node:fs/promises";
|
|
307
307
|
import path from "node:path";
|
|
@@ -419,7 +419,7 @@ __export(external_exports2, {
|
|
|
419
419
|
void: () => voidType
|
|
420
420
|
});
|
|
421
421
|
|
|
422
|
-
// ../../packages/core/dist/chunk-
|
|
422
|
+
// ../../packages/core/dist/chunk-ZB3AUPES.js
|
|
423
423
|
import { readFile as readFile2 } from "node:fs/promises";
|
|
424
424
|
import path3 from "node:path";
|
|
425
425
|
import fg from "fast-glob";
|
|
@@ -473,8 +473,6 @@ function isTestMessage(value) {
|
|
|
473
473
|
var EVALUATOR_KIND_VALUES = [
|
|
474
474
|
"code-grader",
|
|
475
475
|
"llm-grader",
|
|
476
|
-
"code-judge",
|
|
477
|
-
"llm-judge",
|
|
478
476
|
"rubric",
|
|
479
477
|
"composite",
|
|
480
478
|
"tool-trajectory",
|
|
@@ -14845,6 +14843,22 @@ function extractFailOnError(suite) {
|
|
|
14845
14843
|
logWarning(`Invalid execution.fail_on_error: ${raw}. Must be true or false. Ignoring.`);
|
|
14846
14844
|
return void 0;
|
|
14847
14845
|
}
|
|
14846
|
+
function extractThreshold(suite) {
|
|
14847
|
+
const execution = suite.execution;
|
|
14848
|
+
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
14849
|
+
return void 0;
|
|
14850
|
+
}
|
|
14851
|
+
const executionObj = execution;
|
|
14852
|
+
const raw = executionObj.threshold;
|
|
14853
|
+
if (raw === void 0 || raw === null) {
|
|
14854
|
+
return void 0;
|
|
14855
|
+
}
|
|
14856
|
+
if (typeof raw === "number" && raw >= 0 && raw <= 1) {
|
|
14857
|
+
return raw;
|
|
14858
|
+
}
|
|
14859
|
+
logWarning(`Invalid execution.threshold: ${raw}. Must be a number between 0 and 1. Ignoring.`);
|
|
14860
|
+
return void 0;
|
|
14861
|
+
}
|
|
14848
14862
|
function parseExecutionDefaults(raw, configPath) {
|
|
14849
14863
|
if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
|
|
14850
14864
|
return void 0;
|
|
@@ -14960,6 +14974,9 @@ var ANSI_RESET4 = "\x1B[0m";
|
|
|
14960
14974
|
function normalizeEvaluatorType(type) {
|
|
14961
14975
|
return type.replace(/_/g, "-");
|
|
14962
14976
|
}
|
|
14977
|
+
function isDeprecatedJudgeType(type) {
|
|
14978
|
+
return type === "code-judge" || type === "llm-judge";
|
|
14979
|
+
}
|
|
14963
14980
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
14964
14981
|
const execution = rawEvalCase.execution;
|
|
14965
14982
|
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
@@ -15022,6 +15039,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15022
15039
|
const rawName = asString(rawEvaluator.name);
|
|
15023
15040
|
const rawType = rawEvaluator.type;
|
|
15024
15041
|
const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
|
|
15042
|
+
if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
|
|
15043
|
+
logWarning2(
|
|
15044
|
+
`Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
|
|
15045
|
+
);
|
|
15046
|
+
continue;
|
|
15047
|
+
}
|
|
15025
15048
|
const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
|
|
15026
15049
|
if (typeof typeValue !== "string") {
|
|
15027
15050
|
logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
|
|
@@ -15054,7 +15077,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15054
15077
|
});
|
|
15055
15078
|
continue;
|
|
15056
15079
|
}
|
|
15057
|
-
if (typeValue === "code-grader"
|
|
15080
|
+
if (typeValue === "code-grader") {
|
|
15058
15081
|
let command;
|
|
15059
15082
|
if (rawEvaluator.script !== void 0 && rawEvaluator.command === void 0) {
|
|
15060
15083
|
console.warn(
|
|
@@ -15164,7 +15187,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15164
15187
|
continue;
|
|
15165
15188
|
}
|
|
15166
15189
|
const aggregatorType = asString(rawAggregator.type);
|
|
15167
|
-
|
|
15190
|
+
const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeEvaluatorType(aggregatorType) : aggregatorType;
|
|
15191
|
+
if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
|
|
15192
|
+
logWarning2(
|
|
15193
|
+
`Skipping composite evaluator '${name21}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
|
|
15194
|
+
);
|
|
15195
|
+
continue;
|
|
15196
|
+
}
|
|
15197
|
+
if (normalizedAggregatorType !== "weighted_average" && normalizedAggregatorType !== "code-grader" && normalizedAggregatorType !== "llm-grader" && normalizedAggregatorType !== "threshold") {
|
|
15168
15198
|
logWarning2(
|
|
15169
15199
|
`Skipping composite evaluator '${name21}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
15170
15200
|
);
|
|
@@ -15199,7 +15229,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15199
15229
|
continue;
|
|
15200
15230
|
}
|
|
15201
15231
|
let aggregator;
|
|
15202
|
-
if (
|
|
15232
|
+
if (normalizedAggregatorType === "weighted_average") {
|
|
15203
15233
|
const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
|
|
15204
15234
|
const parsedWeights = {};
|
|
15205
15235
|
if (weights) {
|
|
@@ -15213,7 +15243,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15213
15243
|
type: "weighted_average",
|
|
15214
15244
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
15215
15245
|
};
|
|
15216
|
-
} else if (
|
|
15246
|
+
} else if (normalizedAggregatorType === "code-grader") {
|
|
15217
15247
|
const aggregatorPath = asString(rawAggregator.path);
|
|
15218
15248
|
if (!aggregatorPath) {
|
|
15219
15249
|
logWarning2(
|
|
@@ -15226,7 +15256,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15226
15256
|
path: aggregatorPath,
|
|
15227
15257
|
cwd: searchRoots[0]
|
|
15228
15258
|
};
|
|
15229
|
-
} else if (
|
|
15259
|
+
} else if (normalizedAggregatorType === "threshold") {
|
|
15230
15260
|
const thresholdValue = rawAggregator.threshold;
|
|
15231
15261
|
if (typeof thresholdValue !== "number" || thresholdValue < 0 || thresholdValue > 1) {
|
|
15232
15262
|
logWarning2(
|
|
@@ -15974,10 +16004,15 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
15974
16004
|
return void 0;
|
|
15975
16005
|
}
|
|
15976
16006
|
const normalized = normalizeEvaluatorType(candidate);
|
|
16007
|
+
if (isDeprecatedJudgeType(normalized)) {
|
|
16008
|
+
throw new Error(
|
|
16009
|
+
`Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
|
|
16010
|
+
);
|
|
16011
|
+
}
|
|
15977
16012
|
if (isEvaluatorKind(normalized)) {
|
|
15978
16013
|
return normalized;
|
|
15979
16014
|
}
|
|
15980
|
-
logWarning2(`Unknown
|
|
16015
|
+
logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
|
|
15981
16016
|
return void 0;
|
|
15982
16017
|
}
|
|
15983
16018
|
function asString(value) {
|
|
@@ -16936,6 +16971,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
16936
16971
|
const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
|
|
16937
16972
|
const metadata = parseMetadata(parsed);
|
|
16938
16973
|
const failOnError = extractFailOnError(parsed);
|
|
16974
|
+
const threshold = extractThreshold(parsed);
|
|
16939
16975
|
return {
|
|
16940
16976
|
tests,
|
|
16941
16977
|
trials: extractTrialsConfig(parsed),
|
|
@@ -16944,7 +16980,8 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
16944
16980
|
cacheConfig: extractCacheConfig(parsed),
|
|
16945
16981
|
totalBudgetUsd: extractTotalBudgetUsd(parsed),
|
|
16946
16982
|
...metadata !== void 0 && { metadata },
|
|
16947
|
-
...failOnError !== void 0 && { failOnError }
|
|
16983
|
+
...failOnError !== void 0 && { failOnError },
|
|
16984
|
+
...threshold !== void 0 && { threshold }
|
|
16948
16985
|
};
|
|
16949
16986
|
}
|
|
16950
16987
|
var loadEvalSuite = loadTestSuite;
|
|
@@ -17380,9 +17417,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
17380
17417
|
case "ends_with":
|
|
17381
17418
|
return `Output ends with '${entry.value}'`;
|
|
17382
17419
|
case "llm-grader":
|
|
17383
|
-
case "llm_grader":
|
|
17384
|
-
case "llm-judge":
|
|
17385
|
-
case "llm_judge": {
|
|
17420
|
+
case "llm_grader": {
|
|
17386
17421
|
if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
|
|
17387
17422
|
return null;
|
|
17388
17423
|
}
|
|
@@ -17395,9 +17430,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
17395
17430
|
return tools ? `Agent called tools in order: ${tools}` : "Agent followed expected tool trajectory";
|
|
17396
17431
|
}
|
|
17397
17432
|
case "code-grader":
|
|
17398
|
-
case "code_grader":
|
|
17399
|
-
case "code-judge":
|
|
17400
|
-
case "code_judge": {
|
|
17433
|
+
case "code_grader": {
|
|
17401
17434
|
const graderName = entry.name ?? deriveGraderNameFromCommand(entry.command) ?? "code-grader";
|
|
17402
17435
|
const desc = typeof entry.description === "string" ? entry.description : void 0;
|
|
17403
17436
|
return codeGraderInstruction(graderName, desc);
|
|
@@ -17428,7 +17461,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
17428
17461
|
}
|
|
17429
17462
|
}
|
|
17430
17463
|
function assertionToNaturalLanguageList(entry) {
|
|
17431
|
-
if (entry.type === "llm-grader" || entry.type === "llm_grader"
|
|
17464
|
+
if (entry.type === "llm-grader" || entry.type === "llm_grader") {
|
|
17432
17465
|
if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
|
|
17433
17466
|
return entry.rubrics.map((r) => r.outcome ?? r.criteria ?? r.id).filter((s) => typeof s === "string");
|
|
17434
17467
|
}
|
|
@@ -24084,7 +24117,7 @@ function toCamelCaseDeep(obj) {
|
|
|
24084
24117
|
}
|
|
24085
24118
|
var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
|
|
24086
24119
|
var CodeEvaluator = class {
|
|
24087
|
-
kind = "code-
|
|
24120
|
+
kind = "code-grader";
|
|
24088
24121
|
command;
|
|
24089
24122
|
cwd;
|
|
24090
24123
|
agentTimeoutMs;
|
|
@@ -24103,7 +24136,7 @@ var CodeEvaluator = class {
|
|
|
24103
24136
|
if (outputForPayload) {
|
|
24104
24137
|
const serialized = JSON.stringify(outputForPayload);
|
|
24105
24138
|
if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
|
|
24106
|
-
const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-
|
|
24139
|
+
const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-grader-"));
|
|
24107
24140
|
outputPath = join(tmpDir, "output.json");
|
|
24108
24141
|
await writeFile6(outputPath, serialized);
|
|
24109
24142
|
outputForPayload = null;
|
|
@@ -24352,7 +24385,7 @@ var LlmGraderEvaluator = class {
|
|
|
24352
24385
|
return this.evaluateWithDelegatedAgent(context2, graderProvider);
|
|
24353
24386
|
}
|
|
24354
24387
|
const config = context2.evaluator;
|
|
24355
|
-
if (
|
|
24388
|
+
if (config?.type === "llm-grader" && config.rubrics && config.rubrics.length > 0) {
|
|
24356
24389
|
return this.evaluateWithRubrics(context2, graderProvider, config.rubrics);
|
|
24357
24390
|
}
|
|
24358
24391
|
return this.evaluateFreeform(context2, graderProvider);
|
|
@@ -24537,7 +24570,7 @@ ${context2.fileChanges}`;
|
|
|
24537
24570
|
const systemPrompt = this.buildAgentSystemPrompt(context2);
|
|
24538
24571
|
const userPrompt = this.buildAgentUserPrompt(context2);
|
|
24539
24572
|
const config = context2.evaluator;
|
|
24540
|
-
const rubrics = config?.type === "llm-grader"
|
|
24573
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
24541
24574
|
const fsTools = createFilesystemTools(workspacePath);
|
|
24542
24575
|
const evaluatorRawRequest = {
|
|
24543
24576
|
mode: "built-in",
|
|
@@ -24633,7 +24666,7 @@ ${context2.fileChanges}`;
|
|
|
24633
24666
|
};
|
|
24634
24667
|
}
|
|
24635
24668
|
const config = context2.evaluator;
|
|
24636
|
-
const rubrics = config?.type === "llm-grader"
|
|
24669
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
24637
24670
|
const details = {
|
|
24638
24671
|
mode: modeLabel,
|
|
24639
24672
|
grader_target: provider.targetName
|
|
@@ -24673,7 +24706,7 @@ ${context2.fileChanges}`;
|
|
|
24673
24706
|
*/
|
|
24674
24707
|
buildAgentSystemPrompt(context2) {
|
|
24675
24708
|
const config = context2.evaluator;
|
|
24676
|
-
const rubrics = config?.type === "llm-grader"
|
|
24709
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
24677
24710
|
const parts = [
|
|
24678
24711
|
"You are an expert evaluator with access to the workspace filesystem.",
|
|
24679
24712
|
"Use the provided tools to investigate the workspace and verify the criteria are met.",
|
|
@@ -24704,7 +24737,7 @@ ${context2.fileChanges}`;
|
|
|
24704
24737
|
return substituteVariables(this.evaluatorTemplate, variables);
|
|
24705
24738
|
}
|
|
24706
24739
|
const config = context2.evaluator;
|
|
24707
|
-
const rubrics = config?.type === "llm-grader"
|
|
24740
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
24708
24741
|
const parts = [
|
|
24709
24742
|
"Evaluate the candidate answer by investigating the workspace.",
|
|
24710
24743
|
"",
|
|
@@ -24747,7 +24780,7 @@ ${context2.fileChanges}`;
|
|
|
24747
24780
|
buildDelegatedPrompt(context2) {
|
|
24748
24781
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
24749
24782
|
const config = context2.evaluator;
|
|
24750
|
-
const rubrics = config?.type === "llm-grader"
|
|
24783
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
24751
24784
|
if (this.evaluatorTemplate) {
|
|
24752
24785
|
const variables = {
|
|
24753
24786
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
@@ -25242,10 +25275,8 @@ var CompositeEvaluator = class {
|
|
|
25242
25275
|
const aggregator = this.config.aggregator;
|
|
25243
25276
|
switch (aggregator.type) {
|
|
25244
25277
|
case "code-grader":
|
|
25245
|
-
case "code-judge":
|
|
25246
25278
|
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
25247
25279
|
case "llm-grader":
|
|
25248
|
-
case "llm-judge":
|
|
25249
25280
|
return this.runLlmAggregator(results, context2, aggregator);
|
|
25250
25281
|
case "threshold":
|
|
25251
25282
|
return this.runThreshold(results, aggregator.threshold);
|
|
@@ -27630,7 +27661,7 @@ var endsWithFactory = (config) => {
|
|
|
27630
27661
|
};
|
|
27631
27662
|
function createBuiltinRegistry() {
|
|
27632
27663
|
const registry = new EvaluatorRegistry();
|
|
27633
|
-
registry.register("llm-grader", llmGraderFactory).register("
|
|
27664
|
+
registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
|
|
27634
27665
|
const fn = config[INLINE_ASSERT_FN];
|
|
27635
27666
|
if (!fn) {
|
|
27636
27667
|
throw new Error(
|
|
@@ -30306,7 +30337,7 @@ function filterEvalCases(evalCases, filter2) {
|
|
|
30306
30337
|
return evalCases.filter((evalCase) => micromatch3.isMatch(evalCase.id, filter2));
|
|
30307
30338
|
}
|
|
30308
30339
|
function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
30309
|
-
const llmGrader = overrides?.["llm-grader"] ??
|
|
30340
|
+
const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
|
|
30310
30341
|
resolveGraderProvider: async (context2) => {
|
|
30311
30342
|
if (context2.graderProvider) {
|
|
30312
30343
|
return context2.graderProvider;
|
|
@@ -31127,10 +31158,10 @@ var OtelTraceExporter = class {
|
|
|
31127
31158
|
}
|
|
31128
31159
|
if (result.scores) {
|
|
31129
31160
|
for (const score of result.scores) {
|
|
31130
|
-
rootSpan.addEvent(`agentv.
|
|
31131
|
-
"agentv.
|
|
31132
|
-
"agentv.
|
|
31133
|
-
...score.verdict ? { "agentv.
|
|
31161
|
+
rootSpan.addEvent(`agentv.grader.${score.name}`, {
|
|
31162
|
+
"agentv.grader.score": score.score,
|
|
31163
|
+
"agentv.grader.type": score.type,
|
|
31164
|
+
...score.verdict ? { "agentv.grader.verdict": score.verdict } : {}
|
|
31134
31165
|
});
|
|
31135
31166
|
}
|
|
31136
31167
|
}
|
|
@@ -31480,6 +31511,7 @@ export {
|
|
|
31480
31511
|
extractTrialsConfig,
|
|
31481
31512
|
extractCacheConfig,
|
|
31482
31513
|
extractFailOnError,
|
|
31514
|
+
extractThreshold,
|
|
31483
31515
|
detectFormat,
|
|
31484
31516
|
buildPromptInputs,
|
|
31485
31517
|
readTestSuiteMetadata,
|
|
@@ -31590,4 +31622,4 @@ export {
|
|
|
31590
31622
|
OtelStreamingObserver,
|
|
31591
31623
|
createAgentKernel
|
|
31592
31624
|
};
|
|
31593
|
-
//# sourceMappingURL=chunk-
|
|
31625
|
+
//# sourceMappingURL=chunk-D3LNJUUB.js.map
|