@agentv/core 3.13.0 → 3.13.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-4XWPXNQM.js → chunk-ZB3AUPES.js} +1 -3
- package/dist/chunk-ZB3AUPES.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +0 -2
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +44 -31
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +12 -21
- package/dist/index.d.ts +12 -21
- package/dist/index.js +45 -30
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-4XWPXNQM.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -473,7 +473,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
473
473
|
* - Either content (string or array of objects) OR tool_calls (for assistant messages)
|
|
474
474
|
*/
|
|
475
475
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
476
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "
|
|
476
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
|
|
477
477
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
478
478
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
479
479
|
/**
|
|
@@ -576,7 +576,7 @@ type WorkspaceConfig = {
|
|
|
576
576
|
};
|
|
577
577
|
type CodeEvaluatorConfig = {
|
|
578
578
|
readonly name: string;
|
|
579
|
-
readonly type: 'code-
|
|
579
|
+
readonly type: 'code-grader';
|
|
580
580
|
readonly command: readonly string[];
|
|
581
581
|
/** @deprecated Use `command` instead */
|
|
582
582
|
readonly script?: readonly string[];
|
|
@@ -606,7 +606,7 @@ type PromptScriptConfig = {
|
|
|
606
606
|
};
|
|
607
607
|
type LlmGraderEvaluatorConfig = {
|
|
608
608
|
readonly name: string;
|
|
609
|
-
readonly type: 'llm-grader'
|
|
609
|
+
readonly type: 'llm-grader';
|
|
610
610
|
/** Text prompt (inline or file path) or executable script config */
|
|
611
611
|
readonly prompt?: string | PromptScriptConfig;
|
|
612
612
|
readonly promptPath?: string;
|
|
@@ -678,20 +678,11 @@ type CompositeAggregatorConfig = {
|
|
|
678
678
|
readonly type: 'code-grader';
|
|
679
679
|
readonly path: string;
|
|
680
680
|
readonly cwd?: string;
|
|
681
|
-
} | {
|
|
682
|
-
readonly type: 'code-judge';
|
|
683
|
-
readonly path: string;
|
|
684
|
-
readonly cwd?: string;
|
|
685
681
|
} | {
|
|
686
682
|
readonly type: 'llm-grader';
|
|
687
683
|
readonly prompt?: string;
|
|
688
684
|
readonly promptPath?: string;
|
|
689
685
|
readonly model?: string;
|
|
690
|
-
} | {
|
|
691
|
-
readonly type: 'llm-judge';
|
|
692
|
-
readonly prompt?: string;
|
|
693
|
-
readonly promptPath?: string;
|
|
694
|
-
readonly model?: string;
|
|
695
686
|
} | {
|
|
696
687
|
readonly type: 'threshold';
|
|
697
688
|
readonly threshold: number;
|
|
@@ -2117,7 +2108,7 @@ interface CodeEvaluatorOptions {
|
|
|
2117
2108
|
readonly target?: TargetAccessConfig;
|
|
2118
2109
|
}
|
|
2119
2110
|
declare class CodeEvaluator implements Evaluator {
|
|
2120
|
-
readonly kind = "code-
|
|
2111
|
+
readonly kind = "code-grader";
|
|
2121
2112
|
private readonly command;
|
|
2122
2113
|
private readonly cwd?;
|
|
2123
2114
|
private readonly agentTimeoutMs?;
|
|
@@ -2852,7 +2843,7 @@ interface EvalTestInput {
|
|
|
2852
2843
|
readonly expectedOutput?: string;
|
|
2853
2844
|
/** @deprecated Use `expectedOutput` instead */
|
|
2854
2845
|
readonly expected_output?: string;
|
|
2855
|
-
/** Assertion
|
|
2846
|
+
/** Assertion graders — accepts factory functions, config objects, or inline functions */
|
|
2856
2847
|
readonly assert?: readonly AssertEntry[];
|
|
2857
2848
|
/** Arbitrary metadata */
|
|
2858
2849
|
readonly metadata?: Record<string, unknown>;
|
|
@@ -2862,7 +2853,7 @@ interface EvalTestInput {
|
|
|
2862
2853
|
* Matches the YAML `assert` block structure.
|
|
2863
2854
|
*/
|
|
2864
2855
|
interface EvalAssertionInput {
|
|
2865
|
-
/** Assertion type (e.g., 'contains', 'llm-
|
|
2856
|
+
/** Assertion type (e.g., 'contains', 'llm-grader', 'code-grader') */
|
|
2866
2857
|
readonly type: string;
|
|
2867
2858
|
/** Display name */
|
|
2868
2859
|
readonly name?: string;
|
|
@@ -2872,9 +2863,9 @@ interface EvalAssertionInput {
|
|
|
2872
2863
|
readonly weight?: number;
|
|
2873
2864
|
/** Whether this assertion is required to pass */
|
|
2874
2865
|
readonly required?: boolean | number;
|
|
2875
|
-
/** Prompt file for
|
|
2866
|
+
/** Prompt file for llm_grader */
|
|
2876
2867
|
readonly prompt?: string;
|
|
2877
|
-
/** Script for
|
|
2868
|
+
/** Script for code_grader */
|
|
2878
2869
|
readonly script?: string | readonly string[];
|
|
2879
2870
|
/** Additional config passed to the assertion */
|
|
2880
2871
|
readonly config?: Record<string, unknown>;
|
|
@@ -3568,17 +3559,17 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
|
|
|
3568
3559
|
* Convention-based discovery of custom assertion scripts.
|
|
3569
3560
|
*
|
|
3570
3561
|
* Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
|
|
3571
|
-
* them as code
|
|
3572
|
-
* extension) becomes the
|
|
3562
|
+
* them as code graders in the registry. The file name (without
|
|
3563
|
+
* extension) becomes the grader type name.
|
|
3573
3564
|
*
|
|
3574
3565
|
* Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml
|
|
3575
3566
|
*/
|
|
3576
3567
|
|
|
3577
3568
|
/**
|
|
3578
3569
|
* Discover custom assertion scripts from `.agentv/assertions/` and register
|
|
3579
|
-
* them as
|
|
3570
|
+
* them as grader types in the registry.
|
|
3580
3571
|
*
|
|
3581
|
-
* @param registry - The
|
|
3572
|
+
* @param registry - The grader registry to register discovered assertions into
|
|
3582
3573
|
* @param baseDir - The base directory to search from (typically project root or eval file dir)
|
|
3583
3574
|
* @returns Names of discovered assertion types
|
|
3584
3575
|
*/
|
package/dist/index.d.ts
CHANGED
|
@@ -473,7 +473,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
473
473
|
* - Either content (string or array of objects) OR tool_calls (for assistant messages)
|
|
474
474
|
*/
|
|
475
475
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
476
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "
|
|
476
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
|
|
477
477
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
478
478
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
479
479
|
/**
|
|
@@ -576,7 +576,7 @@ type WorkspaceConfig = {
|
|
|
576
576
|
};
|
|
577
577
|
type CodeEvaluatorConfig = {
|
|
578
578
|
readonly name: string;
|
|
579
|
-
readonly type: 'code-
|
|
579
|
+
readonly type: 'code-grader';
|
|
580
580
|
readonly command: readonly string[];
|
|
581
581
|
/** @deprecated Use `command` instead */
|
|
582
582
|
readonly script?: readonly string[];
|
|
@@ -606,7 +606,7 @@ type PromptScriptConfig = {
|
|
|
606
606
|
};
|
|
607
607
|
type LlmGraderEvaluatorConfig = {
|
|
608
608
|
readonly name: string;
|
|
609
|
-
readonly type: 'llm-grader'
|
|
609
|
+
readonly type: 'llm-grader';
|
|
610
610
|
/** Text prompt (inline or file path) or executable script config */
|
|
611
611
|
readonly prompt?: string | PromptScriptConfig;
|
|
612
612
|
readonly promptPath?: string;
|
|
@@ -678,20 +678,11 @@ type CompositeAggregatorConfig = {
|
|
|
678
678
|
readonly type: 'code-grader';
|
|
679
679
|
readonly path: string;
|
|
680
680
|
readonly cwd?: string;
|
|
681
|
-
} | {
|
|
682
|
-
readonly type: 'code-judge';
|
|
683
|
-
readonly path: string;
|
|
684
|
-
readonly cwd?: string;
|
|
685
681
|
} | {
|
|
686
682
|
readonly type: 'llm-grader';
|
|
687
683
|
readonly prompt?: string;
|
|
688
684
|
readonly promptPath?: string;
|
|
689
685
|
readonly model?: string;
|
|
690
|
-
} | {
|
|
691
|
-
readonly type: 'llm-judge';
|
|
692
|
-
readonly prompt?: string;
|
|
693
|
-
readonly promptPath?: string;
|
|
694
|
-
readonly model?: string;
|
|
695
686
|
} | {
|
|
696
687
|
readonly type: 'threshold';
|
|
697
688
|
readonly threshold: number;
|
|
@@ -2117,7 +2108,7 @@ interface CodeEvaluatorOptions {
|
|
|
2117
2108
|
readonly target?: TargetAccessConfig;
|
|
2118
2109
|
}
|
|
2119
2110
|
declare class CodeEvaluator implements Evaluator {
|
|
2120
|
-
readonly kind = "code-
|
|
2111
|
+
readonly kind = "code-grader";
|
|
2121
2112
|
private readonly command;
|
|
2122
2113
|
private readonly cwd?;
|
|
2123
2114
|
private readonly agentTimeoutMs?;
|
|
@@ -2852,7 +2843,7 @@ interface EvalTestInput {
|
|
|
2852
2843
|
readonly expectedOutput?: string;
|
|
2853
2844
|
/** @deprecated Use `expectedOutput` instead */
|
|
2854
2845
|
readonly expected_output?: string;
|
|
2855
|
-
/** Assertion
|
|
2846
|
+
/** Assertion graders — accepts factory functions, config objects, or inline functions */
|
|
2856
2847
|
readonly assert?: readonly AssertEntry[];
|
|
2857
2848
|
/** Arbitrary metadata */
|
|
2858
2849
|
readonly metadata?: Record<string, unknown>;
|
|
@@ -2862,7 +2853,7 @@ interface EvalTestInput {
|
|
|
2862
2853
|
* Matches the YAML `assert` block structure.
|
|
2863
2854
|
*/
|
|
2864
2855
|
interface EvalAssertionInput {
|
|
2865
|
-
/** Assertion type (e.g., 'contains', 'llm-
|
|
2856
|
+
/** Assertion type (e.g., 'contains', 'llm-grader', 'code-grader') */
|
|
2866
2857
|
readonly type: string;
|
|
2867
2858
|
/** Display name */
|
|
2868
2859
|
readonly name?: string;
|
|
@@ -2872,9 +2863,9 @@ interface EvalAssertionInput {
|
|
|
2872
2863
|
readonly weight?: number;
|
|
2873
2864
|
/** Whether this assertion is required to pass */
|
|
2874
2865
|
readonly required?: boolean | number;
|
|
2875
|
-
/** Prompt file for
|
|
2866
|
+
/** Prompt file for llm_grader */
|
|
2876
2867
|
readonly prompt?: string;
|
|
2877
|
-
/** Script for
|
|
2868
|
+
/** Script for code_grader */
|
|
2878
2869
|
readonly script?: string | readonly string[];
|
|
2879
2870
|
/** Additional config passed to the assertion */
|
|
2880
2871
|
readonly config?: Record<string, unknown>;
|
|
@@ -3568,17 +3559,17 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
|
|
|
3568
3559
|
* Convention-based discovery of custom assertion scripts.
|
|
3569
3560
|
*
|
|
3570
3561
|
* Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
|
|
3571
|
-
* them as code
|
|
3572
|
-
* extension) becomes the
|
|
3562
|
+
* them as code graders in the registry. The file name (without
|
|
3563
|
+
* extension) becomes the grader type name.
|
|
3573
3564
|
*
|
|
3574
3565
|
* Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml
|
|
3575
3566
|
*/
|
|
3576
3567
|
|
|
3577
3568
|
/**
|
|
3578
3569
|
* Discover custom assertion scripts from `.agentv/assertions/` and register
|
|
3579
|
-
* them as
|
|
3570
|
+
* them as grader types in the registry.
|
|
3580
3571
|
*
|
|
3581
|
-
* @param registry - The
|
|
3572
|
+
* @param registry - The grader registry to register discovered assertions into
|
|
3582
3573
|
* @param baseDir - The base directory to search from (typically project root or eval file dir)
|
|
3583
3574
|
* @returns Names of discovered assertion types
|
|
3584
3575
|
*/
|
package/dist/index.js
CHANGED
|
@@ -19,7 +19,7 @@ import {
|
|
|
19
19
|
readTextFile,
|
|
20
20
|
resolveFileReference,
|
|
21
21
|
resolveTargetDefinition
|
|
22
|
-
} from "./chunk-
|
|
22
|
+
} from "./chunk-ZB3AUPES.js";
|
|
23
23
|
import {
|
|
24
24
|
AgentvProvider
|
|
25
25
|
} from "./chunk-W5YDZWT4.js";
|
|
@@ -728,6 +728,9 @@ var ANSI_RESET4 = "\x1B[0m";
|
|
|
728
728
|
function normalizeEvaluatorType(type) {
|
|
729
729
|
return type.replace(/_/g, "-");
|
|
730
730
|
}
|
|
731
|
+
function isDeprecatedJudgeType(type) {
|
|
732
|
+
return type === "code-judge" || type === "llm-judge";
|
|
733
|
+
}
|
|
731
734
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
732
735
|
const execution = rawEvalCase.execution;
|
|
733
736
|
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
@@ -790,6 +793,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
790
793
|
const rawName = asString(rawEvaluator.name);
|
|
791
794
|
const rawType = rawEvaluator.type;
|
|
792
795
|
const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
|
|
796
|
+
if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
|
|
797
|
+
logWarning2(
|
|
798
|
+
`Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
|
|
799
|
+
);
|
|
800
|
+
continue;
|
|
801
|
+
}
|
|
793
802
|
const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
|
|
794
803
|
if (typeof typeValue !== "string") {
|
|
795
804
|
logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
|
|
@@ -822,7 +831,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
822
831
|
});
|
|
823
832
|
continue;
|
|
824
833
|
}
|
|
825
|
-
if (typeValue === "code-grader"
|
|
834
|
+
if (typeValue === "code-grader") {
|
|
826
835
|
let command;
|
|
827
836
|
if (rawEvaluator.script !== void 0 && rawEvaluator.command === void 0) {
|
|
828
837
|
console.warn(
|
|
@@ -932,7 +941,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
932
941
|
continue;
|
|
933
942
|
}
|
|
934
943
|
const aggregatorType = asString(rawAggregator.type);
|
|
935
|
-
|
|
944
|
+
const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeEvaluatorType(aggregatorType) : aggregatorType;
|
|
945
|
+
if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
|
|
946
|
+
logWarning2(
|
|
947
|
+
`Skipping composite evaluator '${name}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
|
|
948
|
+
);
|
|
949
|
+
continue;
|
|
950
|
+
}
|
|
951
|
+
if (normalizedAggregatorType !== "weighted_average" && normalizedAggregatorType !== "code-grader" && normalizedAggregatorType !== "llm-grader" && normalizedAggregatorType !== "threshold") {
|
|
936
952
|
logWarning2(
|
|
937
953
|
`Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
938
954
|
);
|
|
@@ -967,7 +983,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
967
983
|
continue;
|
|
968
984
|
}
|
|
969
985
|
let aggregator;
|
|
970
|
-
if (
|
|
986
|
+
if (normalizedAggregatorType === "weighted_average") {
|
|
971
987
|
const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
|
|
972
988
|
const parsedWeights = {};
|
|
973
989
|
if (weights) {
|
|
@@ -981,7 +997,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
981
997
|
type: "weighted_average",
|
|
982
998
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
983
999
|
};
|
|
984
|
-
} else if (
|
|
1000
|
+
} else if (normalizedAggregatorType === "code-grader") {
|
|
985
1001
|
const aggregatorPath = asString(rawAggregator.path);
|
|
986
1002
|
if (!aggregatorPath) {
|
|
987
1003
|
logWarning2(
|
|
@@ -994,7 +1010,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
994
1010
|
path: aggregatorPath,
|
|
995
1011
|
cwd: searchRoots[0]
|
|
996
1012
|
};
|
|
997
|
-
} else if (
|
|
1013
|
+
} else if (normalizedAggregatorType === "threshold") {
|
|
998
1014
|
const thresholdValue = rawAggregator.threshold;
|
|
999
1015
|
if (typeof thresholdValue !== "number" || thresholdValue < 0 || thresholdValue > 1) {
|
|
1000
1016
|
logWarning2(
|
|
@@ -1742,10 +1758,15 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
1742
1758
|
return void 0;
|
|
1743
1759
|
}
|
|
1744
1760
|
const normalized = normalizeEvaluatorType(candidate);
|
|
1761
|
+
if (isDeprecatedJudgeType(normalized)) {
|
|
1762
|
+
throw new Error(
|
|
1763
|
+
`Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
|
|
1764
|
+
);
|
|
1765
|
+
}
|
|
1745
1766
|
if (isEvaluatorKind(normalized)) {
|
|
1746
1767
|
return normalized;
|
|
1747
1768
|
}
|
|
1748
|
-
logWarning2(`Unknown
|
|
1769
|
+
logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
|
|
1749
1770
|
return void 0;
|
|
1750
1771
|
}
|
|
1751
1772
|
function asString(value) {
|
|
@@ -3178,9 +3199,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
3178
3199
|
case "ends_with":
|
|
3179
3200
|
return `Output ends with '${entry.value}'`;
|
|
3180
3201
|
case "llm-grader":
|
|
3181
|
-
case "llm_grader":
|
|
3182
|
-
case "llm-judge":
|
|
3183
|
-
case "llm_judge": {
|
|
3202
|
+
case "llm_grader": {
|
|
3184
3203
|
if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
|
|
3185
3204
|
return null;
|
|
3186
3205
|
}
|
|
@@ -3193,9 +3212,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
3193
3212
|
return tools ? `Agent called tools in order: ${tools}` : "Agent followed expected tool trajectory";
|
|
3194
3213
|
}
|
|
3195
3214
|
case "code-grader":
|
|
3196
|
-
case "code_grader":
|
|
3197
|
-
case "code-judge":
|
|
3198
|
-
case "code_judge": {
|
|
3215
|
+
case "code_grader": {
|
|
3199
3216
|
const graderName = entry.name ?? deriveGraderNameFromCommand(entry.command) ?? "code-grader";
|
|
3200
3217
|
const desc = typeof entry.description === "string" ? entry.description : void 0;
|
|
3201
3218
|
return codeGraderInstruction(graderName, desc);
|
|
@@ -3226,7 +3243,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
3226
3243
|
}
|
|
3227
3244
|
}
|
|
3228
3245
|
function assertionToNaturalLanguageList(entry) {
|
|
3229
|
-
if (entry.type === "llm-grader" || entry.type === "llm_grader"
|
|
3246
|
+
if (entry.type === "llm-grader" || entry.type === "llm_grader") {
|
|
3230
3247
|
if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
|
|
3231
3248
|
return entry.rubrics.map((r) => r.outcome ?? r.criteria ?? r.id).filter((s) => typeof s === "string");
|
|
3232
3249
|
}
|
|
@@ -10083,7 +10100,7 @@ function toCamelCaseDeep(obj) {
|
|
|
10083
10100
|
// src/evaluation/evaluators/code-evaluator.ts
|
|
10084
10101
|
var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
|
|
10085
10102
|
var CodeEvaluator = class {
|
|
10086
|
-
kind = "code-
|
|
10103
|
+
kind = "code-grader";
|
|
10087
10104
|
command;
|
|
10088
10105
|
cwd;
|
|
10089
10106
|
agentTimeoutMs;
|
|
@@ -10102,7 +10119,7 @@ var CodeEvaluator = class {
|
|
|
10102
10119
|
if (outputForPayload) {
|
|
10103
10120
|
const serialized = JSON.stringify(outputForPayload);
|
|
10104
10121
|
if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
|
|
10105
|
-
const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-
|
|
10122
|
+
const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-grader-"));
|
|
10106
10123
|
outputPath = join(tmpDir, "output.json");
|
|
10107
10124
|
await writeFile6(outputPath, serialized);
|
|
10108
10125
|
outputForPayload = null;
|
|
@@ -10360,7 +10377,7 @@ var LlmGraderEvaluator = class {
|
|
|
10360
10377
|
return this.evaluateWithDelegatedAgent(context, graderProvider);
|
|
10361
10378
|
}
|
|
10362
10379
|
const config = context.evaluator;
|
|
10363
|
-
if (
|
|
10380
|
+
if (config?.type === "llm-grader" && config.rubrics && config.rubrics.length > 0) {
|
|
10364
10381
|
return this.evaluateWithRubrics(context, graderProvider, config.rubrics);
|
|
10365
10382
|
}
|
|
10366
10383
|
return this.evaluateFreeform(context, graderProvider);
|
|
@@ -10545,7 +10562,7 @@ ${context.fileChanges}`;
|
|
|
10545
10562
|
const systemPrompt = this.buildAgentSystemPrompt(context);
|
|
10546
10563
|
const userPrompt = this.buildAgentUserPrompt(context);
|
|
10547
10564
|
const config = context.evaluator;
|
|
10548
|
-
const rubrics = config?.type === "llm-grader"
|
|
10565
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
10549
10566
|
const fsTools = createFilesystemTools(workspacePath);
|
|
10550
10567
|
const evaluatorRawRequest = {
|
|
10551
10568
|
mode: "built-in",
|
|
@@ -10641,7 +10658,7 @@ ${context.fileChanges}`;
|
|
|
10641
10658
|
};
|
|
10642
10659
|
}
|
|
10643
10660
|
const config = context.evaluator;
|
|
10644
|
-
const rubrics = config?.type === "llm-grader"
|
|
10661
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
10645
10662
|
const details = {
|
|
10646
10663
|
mode: modeLabel,
|
|
10647
10664
|
grader_target: provider.targetName
|
|
@@ -10681,7 +10698,7 @@ ${context.fileChanges}`;
|
|
|
10681
10698
|
*/
|
|
10682
10699
|
buildAgentSystemPrompt(context) {
|
|
10683
10700
|
const config = context.evaluator;
|
|
10684
|
-
const rubrics = config?.type === "llm-grader"
|
|
10701
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
10685
10702
|
const parts = [
|
|
10686
10703
|
"You are an expert evaluator with access to the workspace filesystem.",
|
|
10687
10704
|
"Use the provided tools to investigate the workspace and verify the criteria are met.",
|
|
@@ -10712,7 +10729,7 @@ ${context.fileChanges}`;
|
|
|
10712
10729
|
return substituteVariables(this.evaluatorTemplate, variables);
|
|
10713
10730
|
}
|
|
10714
10731
|
const config = context.evaluator;
|
|
10715
|
-
const rubrics = config?.type === "llm-grader"
|
|
10732
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
10716
10733
|
const parts = [
|
|
10717
10734
|
"Evaluate the candidate answer by investigating the workspace.",
|
|
10718
10735
|
"",
|
|
@@ -10755,7 +10772,7 @@ ${context.fileChanges}`;
|
|
|
10755
10772
|
buildDelegatedPrompt(context) {
|
|
10756
10773
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
10757
10774
|
const config = context.evaluator;
|
|
10758
|
-
const rubrics = config?.type === "llm-grader"
|
|
10775
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
10759
10776
|
if (this.evaluatorTemplate) {
|
|
10760
10777
|
const variables = {
|
|
10761
10778
|
[TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
|
|
@@ -11252,10 +11269,8 @@ var CompositeEvaluator = class {
|
|
|
11252
11269
|
const aggregator = this.config.aggregator;
|
|
11253
11270
|
switch (aggregator.type) {
|
|
11254
11271
|
case "code-grader":
|
|
11255
|
-
case "code-judge":
|
|
11256
11272
|
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
11257
11273
|
case "llm-grader":
|
|
11258
|
-
case "llm-judge":
|
|
11259
11274
|
return this.runLlmAggregator(results, context, aggregator);
|
|
11260
11275
|
case "threshold":
|
|
11261
11276
|
return this.runThreshold(results, aggregator.threshold);
|
|
@@ -13677,7 +13692,7 @@ var endsWithFactory = (config) => {
|
|
|
13677
13692
|
};
|
|
13678
13693
|
function createBuiltinRegistry() {
|
|
13679
13694
|
const registry = new EvaluatorRegistry();
|
|
13680
|
-
registry.register("llm-grader", llmGraderFactory).register("
|
|
13695
|
+
registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
|
|
13681
13696
|
const fn = config[INLINE_ASSERT_FN];
|
|
13682
13697
|
if (!fn) {
|
|
13683
13698
|
throw new Error(
|
|
@@ -16395,7 +16410,7 @@ function filterEvalCases(evalCases, filter) {
|
|
|
16395
16410
|
return evalCases.filter((evalCase) => micromatch3.isMatch(evalCase.id, filter));
|
|
16396
16411
|
}
|
|
16397
16412
|
function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
16398
|
-
const llmGrader = overrides?.["llm-grader"] ??
|
|
16413
|
+
const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
|
|
16399
16414
|
resolveGraderProvider: async (context) => {
|
|
16400
16415
|
if (context.graderProvider) {
|
|
16401
16416
|
return context.graderProvider;
|
|
@@ -17239,10 +17254,10 @@ var OtelTraceExporter = class {
|
|
|
17239
17254
|
}
|
|
17240
17255
|
if (result.scores) {
|
|
17241
17256
|
for (const score of result.scores) {
|
|
17242
|
-
rootSpan.addEvent(`agentv.
|
|
17243
|
-
"agentv.
|
|
17244
|
-
"agentv.
|
|
17245
|
-
...score.verdict ? { "agentv.
|
|
17257
|
+
rootSpan.addEvent(`agentv.grader.${score.name}`, {
|
|
17258
|
+
"agentv.grader.score": score.score,
|
|
17259
|
+
"agentv.grader.type": score.type,
|
|
17260
|
+
...score.verdict ? { "agentv.grader.verdict": score.verdict } : {}
|
|
17246
17261
|
});
|
|
17247
17262
|
}
|
|
17248
17263
|
}
|