@agentv/core 3.13.0 → 3.13.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-4XWPXNQM.js → chunk-ZB3AUPES.js} +1 -3
- package/dist/chunk-ZB3AUPES.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +0 -2
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +65 -32
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +21 -22
- package/dist/index.d.ts +21 -22
- package/dist/index.js +65 -31
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-4XWPXNQM.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1455,6 +1455,7 @@ __export(index_exports, {
|
|
|
1455
1455
|
extractTargetFromSuite: () => extractTargetFromSuite,
|
|
1456
1456
|
extractTargetsFromSuite: () => extractTargetsFromSuite,
|
|
1457
1457
|
extractTargetsFromTestCase: () => extractTargetsFromTestCase,
|
|
1458
|
+
extractThreshold: () => extractThreshold,
|
|
1458
1459
|
extractTrialsConfig: () => extractTrialsConfig,
|
|
1459
1460
|
extractWorkersFromSuite: () => extractWorkersFromSuite,
|
|
1460
1461
|
fileExists: () => fileExists2,
|
|
@@ -1581,8 +1582,6 @@ function isTestMessage(value) {
|
|
|
1581
1582
|
var EVALUATOR_KIND_VALUES = [
|
|
1582
1583
|
"code-grader",
|
|
1583
1584
|
"llm-grader",
|
|
1584
|
-
"code-judge",
|
|
1585
|
-
"llm-judge",
|
|
1586
1585
|
"rubric",
|
|
1587
1586
|
"composite",
|
|
1588
1587
|
"tool-trajectory",
|
|
@@ -2322,6 +2321,22 @@ function extractFailOnError(suite) {
|
|
|
2322
2321
|
logWarning(`Invalid execution.fail_on_error: ${raw}. Must be true or false. Ignoring.`);
|
|
2323
2322
|
return void 0;
|
|
2324
2323
|
}
|
|
2324
|
+
function extractThreshold(suite) {
|
|
2325
|
+
const execution = suite.execution;
|
|
2326
|
+
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
2327
|
+
return void 0;
|
|
2328
|
+
}
|
|
2329
|
+
const executionObj = execution;
|
|
2330
|
+
const raw = executionObj.threshold;
|
|
2331
|
+
if (raw === void 0 || raw === null) {
|
|
2332
|
+
return void 0;
|
|
2333
|
+
}
|
|
2334
|
+
if (typeof raw === "number" && raw >= 0 && raw <= 1) {
|
|
2335
|
+
return raw;
|
|
2336
|
+
}
|
|
2337
|
+
logWarning(`Invalid execution.threshold: ${raw}. Must be a number between 0 and 1. Ignoring.`);
|
|
2338
|
+
return void 0;
|
|
2339
|
+
}
|
|
2325
2340
|
function parseExecutionDefaults(raw, configPath) {
|
|
2326
2341
|
if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
|
|
2327
2342
|
return void 0;
|
|
@@ -2449,6 +2464,9 @@ var ANSI_RESET5 = "\x1B[0m";
|
|
|
2449
2464
|
function normalizeEvaluatorType(type) {
|
|
2450
2465
|
return type.replace(/_/g, "-");
|
|
2451
2466
|
}
|
|
2467
|
+
function isDeprecatedJudgeType(type) {
|
|
2468
|
+
return type === "code-judge" || type === "llm-judge";
|
|
2469
|
+
}
|
|
2452
2470
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
2453
2471
|
const execution = rawEvalCase.execution;
|
|
2454
2472
|
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
@@ -2511,6 +2529,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2511
2529
|
const rawName = asString(rawEvaluator.name);
|
|
2512
2530
|
const rawType = rawEvaluator.type;
|
|
2513
2531
|
const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
|
|
2532
|
+
if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
|
|
2533
|
+
logWarning2(
|
|
2534
|
+
`Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
|
|
2535
|
+
);
|
|
2536
|
+
continue;
|
|
2537
|
+
}
|
|
2514
2538
|
const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
|
|
2515
2539
|
if (typeof typeValue !== "string") {
|
|
2516
2540
|
logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
|
|
@@ -2543,7 +2567,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2543
2567
|
});
|
|
2544
2568
|
continue;
|
|
2545
2569
|
}
|
|
2546
|
-
if (typeValue === "code-grader"
|
|
2570
|
+
if (typeValue === "code-grader") {
|
|
2547
2571
|
let command;
|
|
2548
2572
|
if (rawEvaluator.script !== void 0 && rawEvaluator.command === void 0) {
|
|
2549
2573
|
console.warn(
|
|
@@ -2653,7 +2677,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2653
2677
|
continue;
|
|
2654
2678
|
}
|
|
2655
2679
|
const aggregatorType = asString(rawAggregator.type);
|
|
2656
|
-
|
|
2680
|
+
const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeEvaluatorType(aggregatorType) : aggregatorType;
|
|
2681
|
+
if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
|
|
2682
|
+
logWarning2(
|
|
2683
|
+
`Skipping composite evaluator '${name}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
|
|
2684
|
+
);
|
|
2685
|
+
continue;
|
|
2686
|
+
}
|
|
2687
|
+
if (normalizedAggregatorType !== "weighted_average" && normalizedAggregatorType !== "code-grader" && normalizedAggregatorType !== "llm-grader" && normalizedAggregatorType !== "threshold") {
|
|
2657
2688
|
logWarning2(
|
|
2658
2689
|
`Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
2659
2690
|
);
|
|
@@ -2688,7 +2719,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2688
2719
|
continue;
|
|
2689
2720
|
}
|
|
2690
2721
|
let aggregator;
|
|
2691
|
-
if (
|
|
2722
|
+
if (normalizedAggregatorType === "weighted_average") {
|
|
2692
2723
|
const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
|
|
2693
2724
|
const parsedWeights = {};
|
|
2694
2725
|
if (weights) {
|
|
@@ -2702,7 +2733,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2702
2733
|
type: "weighted_average",
|
|
2703
2734
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
2704
2735
|
};
|
|
2705
|
-
} else if (
|
|
2736
|
+
} else if (normalizedAggregatorType === "code-grader") {
|
|
2706
2737
|
const aggregatorPath = asString(rawAggregator.path);
|
|
2707
2738
|
if (!aggregatorPath) {
|
|
2708
2739
|
logWarning2(
|
|
@@ -2715,7 +2746,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2715
2746
|
path: aggregatorPath,
|
|
2716
2747
|
cwd: searchRoots[0]
|
|
2717
2748
|
};
|
|
2718
|
-
} else if (
|
|
2749
|
+
} else if (normalizedAggregatorType === "threshold") {
|
|
2719
2750
|
const thresholdValue = rawAggregator.threshold;
|
|
2720
2751
|
if (typeof thresholdValue !== "number" || thresholdValue < 0 || thresholdValue > 1) {
|
|
2721
2752
|
logWarning2(
|
|
@@ -3463,10 +3494,15 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
3463
3494
|
return void 0;
|
|
3464
3495
|
}
|
|
3465
3496
|
const normalized = normalizeEvaluatorType(candidate);
|
|
3497
|
+
if (isDeprecatedJudgeType(normalized)) {
|
|
3498
|
+
throw new Error(
|
|
3499
|
+
`Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
|
|
3500
|
+
);
|
|
3501
|
+
}
|
|
3466
3502
|
if (isEvaluatorKind(normalized)) {
|
|
3467
3503
|
return normalized;
|
|
3468
3504
|
}
|
|
3469
|
-
logWarning2(`Unknown
|
|
3505
|
+
logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
|
|
3470
3506
|
return void 0;
|
|
3471
3507
|
}
|
|
3472
3508
|
function asString(value) {
|
|
@@ -4450,6 +4486,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
4450
4486
|
const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
|
|
4451
4487
|
const metadata = parseMetadata(parsed);
|
|
4452
4488
|
const failOnError = extractFailOnError(parsed);
|
|
4489
|
+
const threshold = extractThreshold(parsed);
|
|
4453
4490
|
return {
|
|
4454
4491
|
tests,
|
|
4455
4492
|
trials: extractTrialsConfig(parsed),
|
|
@@ -4458,7 +4495,8 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
4458
4495
|
cacheConfig: extractCacheConfig(parsed),
|
|
4459
4496
|
totalBudgetUsd: extractTotalBudgetUsd(parsed),
|
|
4460
4497
|
...metadata !== void 0 && { metadata },
|
|
4461
|
-
...failOnError !== void 0 && { failOnError }
|
|
4498
|
+
...failOnError !== void 0 && { failOnError },
|
|
4499
|
+
...threshold !== void 0 && { threshold }
|
|
4462
4500
|
};
|
|
4463
4501
|
}
|
|
4464
4502
|
var loadEvalSuite = loadTestSuite;
|
|
@@ -4899,9 +4937,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
4899
4937
|
case "ends_with":
|
|
4900
4938
|
return `Output ends with '${entry.value}'`;
|
|
4901
4939
|
case "llm-grader":
|
|
4902
|
-
case "llm_grader":
|
|
4903
|
-
case "llm-judge":
|
|
4904
|
-
case "llm_judge": {
|
|
4940
|
+
case "llm_grader": {
|
|
4905
4941
|
if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
|
|
4906
4942
|
return null;
|
|
4907
4943
|
}
|
|
@@ -4914,9 +4950,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
4914
4950
|
return tools ? `Agent called tools in order: ${tools}` : "Agent followed expected tool trajectory";
|
|
4915
4951
|
}
|
|
4916
4952
|
case "code-grader":
|
|
4917
|
-
case "code_grader":
|
|
4918
|
-
case "code-judge":
|
|
4919
|
-
case "code_judge": {
|
|
4953
|
+
case "code_grader": {
|
|
4920
4954
|
const graderName = entry.name ?? deriveGraderNameFromCommand(entry.command) ?? "code-grader";
|
|
4921
4955
|
const desc = typeof entry.description === "string" ? entry.description : void 0;
|
|
4922
4956
|
return codeGraderInstruction(graderName, desc);
|
|
@@ -4947,7 +4981,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
4947
4981
|
}
|
|
4948
4982
|
}
|
|
4949
4983
|
function assertionToNaturalLanguageList(entry) {
|
|
4950
|
-
if (entry.type === "llm-grader" || entry.type === "llm_grader"
|
|
4984
|
+
if (entry.type === "llm-grader" || entry.type === "llm_grader") {
|
|
4951
4985
|
if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
|
|
4952
4986
|
return entry.rubrics.map((r) => r.outcome ?? r.criteria ?? r.id).filter((s) => typeof s === "string");
|
|
4953
4987
|
}
|
|
@@ -13168,7 +13202,7 @@ function toCamelCaseDeep(obj) {
|
|
|
13168
13202
|
// src/evaluation/evaluators/code-evaluator.ts
|
|
13169
13203
|
var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
|
|
13170
13204
|
var CodeEvaluator = class {
|
|
13171
|
-
kind = "code-
|
|
13205
|
+
kind = "code-grader";
|
|
13172
13206
|
command;
|
|
13173
13207
|
cwd;
|
|
13174
13208
|
agentTimeoutMs;
|
|
@@ -13187,7 +13221,7 @@ var CodeEvaluator = class {
|
|
|
13187
13221
|
if (outputForPayload) {
|
|
13188
13222
|
const serialized = JSON.stringify(outputForPayload);
|
|
13189
13223
|
if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
|
|
13190
|
-
const tmpDir = await (0, import_promises26.mkdtemp)((0, import_node_path36.join)((0, import_node_os5.tmpdir)(), "agentv-
|
|
13224
|
+
const tmpDir = await (0, import_promises26.mkdtemp)((0, import_node_path36.join)((0, import_node_os5.tmpdir)(), "agentv-grader-"));
|
|
13191
13225
|
outputPath = (0, import_node_path36.join)(tmpDir, "output.json");
|
|
13192
13226
|
await (0, import_promises26.writeFile)(outputPath, serialized);
|
|
13193
13227
|
outputForPayload = null;
|
|
@@ -13477,7 +13511,7 @@ var LlmGraderEvaluator = class {
|
|
|
13477
13511
|
return this.evaluateWithDelegatedAgent(context2, graderProvider);
|
|
13478
13512
|
}
|
|
13479
13513
|
const config = context2.evaluator;
|
|
13480
|
-
if (
|
|
13514
|
+
if (config?.type === "llm-grader" && config.rubrics && config.rubrics.length > 0) {
|
|
13481
13515
|
return this.evaluateWithRubrics(context2, graderProvider, config.rubrics);
|
|
13482
13516
|
}
|
|
13483
13517
|
return this.evaluateFreeform(context2, graderProvider);
|
|
@@ -13662,7 +13696,7 @@ ${context2.fileChanges}`;
|
|
|
13662
13696
|
const systemPrompt = this.buildAgentSystemPrompt(context2);
|
|
13663
13697
|
const userPrompt = this.buildAgentUserPrompt(context2);
|
|
13664
13698
|
const config = context2.evaluator;
|
|
13665
|
-
const rubrics = config?.type === "llm-grader"
|
|
13699
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
13666
13700
|
const fsTools = createFilesystemTools(workspacePath);
|
|
13667
13701
|
const evaluatorRawRequest = {
|
|
13668
13702
|
mode: "built-in",
|
|
@@ -13758,7 +13792,7 @@ ${context2.fileChanges}`;
|
|
|
13758
13792
|
};
|
|
13759
13793
|
}
|
|
13760
13794
|
const config = context2.evaluator;
|
|
13761
|
-
const rubrics = config?.type === "llm-grader"
|
|
13795
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
13762
13796
|
const details = {
|
|
13763
13797
|
mode: modeLabel,
|
|
13764
13798
|
grader_target: provider.targetName
|
|
@@ -13798,7 +13832,7 @@ ${context2.fileChanges}`;
|
|
|
13798
13832
|
*/
|
|
13799
13833
|
buildAgentSystemPrompt(context2) {
|
|
13800
13834
|
const config = context2.evaluator;
|
|
13801
|
-
const rubrics = config?.type === "llm-grader"
|
|
13835
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
13802
13836
|
const parts = [
|
|
13803
13837
|
"You are an expert evaluator with access to the workspace filesystem.",
|
|
13804
13838
|
"Use the provided tools to investigate the workspace and verify the criteria are met.",
|
|
@@ -13829,7 +13863,7 @@ ${context2.fileChanges}`;
|
|
|
13829
13863
|
return substituteVariables(this.evaluatorTemplate, variables);
|
|
13830
13864
|
}
|
|
13831
13865
|
const config = context2.evaluator;
|
|
13832
|
-
const rubrics = config?.type === "llm-grader"
|
|
13866
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
13833
13867
|
const parts = [
|
|
13834
13868
|
"Evaluate the candidate answer by investigating the workspace.",
|
|
13835
13869
|
"",
|
|
@@ -13872,7 +13906,7 @@ ${context2.fileChanges}`;
|
|
|
13872
13906
|
buildDelegatedPrompt(context2) {
|
|
13873
13907
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
13874
13908
|
const config = context2.evaluator;
|
|
13875
|
-
const rubrics = config?.type === "llm-grader"
|
|
13909
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
13876
13910
|
if (this.evaluatorTemplate) {
|
|
13877
13911
|
const variables = {
|
|
13878
13912
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
@@ -14369,10 +14403,8 @@ var CompositeEvaluator = class {
|
|
|
14369
14403
|
const aggregator = this.config.aggregator;
|
|
14370
14404
|
switch (aggregator.type) {
|
|
14371
14405
|
case "code-grader":
|
|
14372
|
-
case "code-judge":
|
|
14373
14406
|
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
14374
14407
|
case "llm-grader":
|
|
14375
|
-
case "llm-judge":
|
|
14376
14408
|
return this.runLlmAggregator(results, context2, aggregator);
|
|
14377
14409
|
case "threshold":
|
|
14378
14410
|
return this.runThreshold(results, aggregator.threshold);
|
|
@@ -16794,7 +16826,7 @@ var endsWithFactory = (config) => {
|
|
|
16794
16826
|
};
|
|
16795
16827
|
function createBuiltinRegistry() {
|
|
16796
16828
|
const registry = new EvaluatorRegistry();
|
|
16797
|
-
registry.register("llm-grader", llmGraderFactory).register("
|
|
16829
|
+
registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
|
|
16798
16830
|
const fn = config[INLINE_ASSERT_FN];
|
|
16799
16831
|
if (!fn) {
|
|
16800
16832
|
throw new Error(
|
|
@@ -19512,7 +19544,7 @@ function filterEvalCases(evalCases, filter) {
|
|
|
19512
19544
|
return evalCases.filter((evalCase) => import_micromatch3.default.isMatch(evalCase.id, filter));
|
|
19513
19545
|
}
|
|
19514
19546
|
function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
19515
|
-
const llmGrader = overrides?.["llm-grader"] ??
|
|
19547
|
+
const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
|
|
19516
19548
|
resolveGraderProvider: async (context2) => {
|
|
19517
19549
|
if (context2.graderProvider) {
|
|
19518
19550
|
return context2.graderProvider;
|
|
@@ -20356,10 +20388,10 @@ var OtelTraceExporter = class {
|
|
|
20356
20388
|
}
|
|
20357
20389
|
if (result.scores) {
|
|
20358
20390
|
for (const score of result.scores) {
|
|
20359
|
-
rootSpan.addEvent(`agentv.
|
|
20360
|
-
"agentv.
|
|
20361
|
-
"agentv.
|
|
20362
|
-
...score.verdict ? { "agentv.
|
|
20391
|
+
rootSpan.addEvent(`agentv.grader.${score.name}`, {
|
|
20392
|
+
"agentv.grader.score": score.score,
|
|
20393
|
+
"agentv.grader.type": score.type,
|
|
20394
|
+
...score.verdict ? { "agentv.grader.verdict": score.verdict } : {}
|
|
20363
20395
|
});
|
|
20364
20396
|
}
|
|
20365
20397
|
}
|
|
@@ -20749,6 +20781,7 @@ function createAgentKernel() {
|
|
|
20749
20781
|
extractTargetFromSuite,
|
|
20750
20782
|
extractTargetsFromSuite,
|
|
20751
20783
|
extractTargetsFromTestCase,
|
|
20784
|
+
extractThreshold,
|
|
20752
20785
|
extractTrialsConfig,
|
|
20753
20786
|
extractWorkersFromSuite,
|
|
20754
20787
|
fileExists,
|