@agentv/core 4.17.1-next.1 → 4.18.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-6VZY3B6M.js → chunk-PYDBJOAO.js} +6 -6
- package/dist/chunk-PYDBJOAO.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +5 -5
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +3 -3
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +229 -238
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +156 -158
- package/dist/index.d.ts +156 -158
- package/dist/index.js +210 -216
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-6VZY3B6M.js.map +0 -1
package/dist/index.d.ts
CHANGED
|
@@ -341,7 +341,7 @@ type ArgsMatchMode = 'exact' | 'ignore' | 'subset' | 'superset';
|
|
|
341
341
|
/**
|
|
342
342
|
* Configuration for tool-trajectory evaluator.
|
|
343
343
|
*/
|
|
344
|
-
interface ToolTrajectoryEvaluatorConfig {
|
|
344
|
+
interface ToolTrajectoryGraderConfig {
|
|
345
345
|
readonly name: string;
|
|
346
346
|
readonly type: 'tool-trajectory';
|
|
347
347
|
/** Matching mode */
|
|
@@ -355,7 +355,7 @@ interface ToolTrajectoryEvaluatorConfig {
|
|
|
355
355
|
readonly required?: boolean | number;
|
|
356
356
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
357
357
|
readonly min_score?: number;
|
|
358
|
-
/** When true, inverts the
|
|
358
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
359
359
|
readonly negate?: boolean;
|
|
360
360
|
/** Default argument matching mode for all expected items (defaults to 'exact') */
|
|
361
361
|
readonly argsMatch?: ArgsMatchMode | readonly string[];
|
|
@@ -539,9 +539,9 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
539
539
|
* - Either content (string or array of objects) OR tool_calls (for assistant messages)
|
|
540
540
|
*/
|
|
541
541
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
542
|
-
declare const
|
|
543
|
-
type
|
|
544
|
-
declare function
|
|
542
|
+
declare const GRADER_KIND_VALUES: readonly ["code-grader", "llm-grader", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
|
|
543
|
+
type GraderKind = (typeof GRADER_KIND_VALUES)[number];
|
|
544
|
+
declare function isGraderKind(value: unknown): value is GraderKind;
|
|
545
545
|
/**
|
|
546
546
|
* Configuration for enabling target access in code-grader evaluators.
|
|
547
547
|
* When present, the runtime will start a local proxy server that allows
|
|
@@ -697,7 +697,7 @@ type WorkspaceConfig = {
|
|
|
697
697
|
* relative paths from their own directory, not the eval file's directory. */
|
|
698
698
|
readonly workspaceFileDir?: string;
|
|
699
699
|
};
|
|
700
|
-
type CodeEvaluatorConfig = {
|
|
700
|
+
type CodeGraderConfig = {
|
|
701
701
|
readonly name: string;
|
|
702
702
|
readonly type: 'code-grader';
|
|
703
703
|
readonly command: readonly string[];
|
|
@@ -710,7 +710,7 @@ type CodeEvaluatorConfig = {
|
|
|
710
710
|
readonly required?: boolean | number;
|
|
711
711
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
712
712
|
readonly min_score?: number;
|
|
713
|
-
/** When true, inverts the
|
|
713
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
714
714
|
readonly negate?: boolean;
|
|
715
715
|
/** Pass-through configuration for the code-grader (any unrecognized YAML properties) */
|
|
716
716
|
readonly config?: JsonObject;
|
|
@@ -739,7 +739,7 @@ type ContentPreprocessorConfig = {
|
|
|
739
739
|
/** Resolved absolute path for the command script (last argv element) */
|
|
740
740
|
readonly resolvedCommand?: readonly string[];
|
|
741
741
|
};
|
|
742
|
-
type LlmGraderEvaluatorConfig = {
|
|
742
|
+
type LlmGraderConfig = {
|
|
743
743
|
readonly name: string;
|
|
744
744
|
readonly type: 'llm-grader';
|
|
745
745
|
/** Text prompt (inline or file path) or executable script config */
|
|
@@ -754,7 +754,7 @@ type LlmGraderEvaluatorConfig = {
|
|
|
754
754
|
readonly required?: boolean | number;
|
|
755
755
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
756
756
|
readonly min_score?: number;
|
|
757
|
-
/** When true, inverts the
|
|
757
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
758
758
|
readonly negate?: boolean;
|
|
759
759
|
/** Optional target override for this grader (uses a named LLM target from targets.yaml). */
|
|
760
760
|
readonly target?: string;
|
|
@@ -767,8 +767,6 @@ type LlmGraderEvaluatorConfig = {
|
|
|
767
767
|
/** Optional content preprocessors for ContentFile blocks in assistant output */
|
|
768
768
|
readonly preprocessors?: readonly ContentPreprocessorConfig[];
|
|
769
769
|
};
|
|
770
|
-
/** @deprecated Use `LlmGraderEvaluatorConfig` instead */
|
|
771
|
-
type LlmJudgeEvaluatorConfig = LlmGraderEvaluatorConfig;
|
|
772
770
|
/**
|
|
773
771
|
* Score range definition for analytic rubric scoring.
|
|
774
772
|
* Each range maps an integer score band (0-10) to an outcome description.
|
|
@@ -830,16 +828,16 @@ type CompositeAggregatorConfig = {
|
|
|
830
828
|
readonly type: 'threshold';
|
|
831
829
|
readonly threshold: number;
|
|
832
830
|
};
|
|
833
|
-
type
|
|
831
|
+
type CompositeGraderConfig = {
|
|
834
832
|
readonly name: string;
|
|
835
833
|
readonly type: 'composite';
|
|
836
|
-
readonly assertions: readonly
|
|
834
|
+
readonly assertions: readonly GraderConfig[];
|
|
837
835
|
readonly aggregator: CompositeAggregatorConfig;
|
|
838
836
|
readonly weight?: number;
|
|
839
837
|
readonly required?: boolean | number;
|
|
840
838
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
841
839
|
readonly min_score?: number;
|
|
842
|
-
/** When true, inverts the
|
|
840
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
843
841
|
readonly negate?: boolean;
|
|
844
842
|
};
|
|
845
843
|
/**
|
|
@@ -874,7 +872,7 @@ type FieldConfig = {
|
|
|
874
872
|
/**
|
|
875
873
|
* Configuration for the field-accuracy evaluator.
|
|
876
874
|
*/
|
|
877
|
-
type FieldAccuracyEvaluatorConfig = {
|
|
875
|
+
type FieldAccuracyGraderConfig = {
|
|
878
876
|
readonly name: string;
|
|
879
877
|
readonly type: 'field-accuracy';
|
|
880
878
|
/** Fields to compare between candidate and expected */
|
|
@@ -885,14 +883,14 @@ type FieldAccuracyEvaluatorConfig = {
|
|
|
885
883
|
readonly required?: boolean | number;
|
|
886
884
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
887
885
|
readonly min_score?: number;
|
|
888
|
-
/** When true, inverts the
|
|
886
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
889
887
|
readonly negate?: boolean;
|
|
890
888
|
};
|
|
891
889
|
/**
|
|
892
890
|
* Configuration for the latency evaluator.
|
|
893
891
|
* Checks execution duration against a threshold.
|
|
894
892
|
*/
|
|
895
|
-
type LatencyEvaluatorConfig = {
|
|
893
|
+
type LatencyGraderConfig = {
|
|
896
894
|
readonly name: string;
|
|
897
895
|
readonly type: 'latency';
|
|
898
896
|
/** Maximum allowed duration in milliseconds */
|
|
@@ -901,14 +899,14 @@ type LatencyEvaluatorConfig = {
|
|
|
901
899
|
readonly required?: boolean | number;
|
|
902
900
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
903
901
|
readonly min_score?: number;
|
|
904
|
-
/** When true, inverts the
|
|
902
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
905
903
|
readonly negate?: boolean;
|
|
906
904
|
};
|
|
907
905
|
/**
|
|
908
906
|
* Configuration for the cost evaluator.
|
|
909
907
|
* Checks execution cost against a budget.
|
|
910
908
|
*/
|
|
911
|
-
type CostEvaluatorConfig = {
|
|
909
|
+
type CostGraderConfig = {
|
|
912
910
|
readonly name: string;
|
|
913
911
|
readonly type: 'cost';
|
|
914
912
|
/** Maximum allowed cost in USD */
|
|
@@ -917,14 +915,14 @@ type CostEvaluatorConfig = {
|
|
|
917
915
|
readonly required?: boolean | number;
|
|
918
916
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
919
917
|
readonly min_score?: number;
|
|
920
|
-
/** When true, inverts the
|
|
918
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
921
919
|
readonly negate?: boolean;
|
|
922
920
|
};
|
|
923
921
|
/**
|
|
924
922
|
* Configuration for the token-usage evaluator.
|
|
925
923
|
* Checks provider-reported token usage against configured limits.
|
|
926
924
|
*/
|
|
927
|
-
type TokenUsageEvaluatorConfig = {
|
|
925
|
+
type TokenUsageGraderConfig = {
|
|
928
926
|
readonly name: string;
|
|
929
927
|
readonly type: 'token-usage';
|
|
930
928
|
/** Maximum allowed total tokens (input + output + cached, when present) */
|
|
@@ -937,7 +935,7 @@ type TokenUsageEvaluatorConfig = {
|
|
|
937
935
|
readonly required?: boolean | number;
|
|
938
936
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
939
937
|
readonly min_score?: number;
|
|
940
|
-
/** When true, inverts the
|
|
938
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
941
939
|
readonly negate?: boolean;
|
|
942
940
|
};
|
|
943
941
|
/**
|
|
@@ -945,7 +943,7 @@ type TokenUsageEvaluatorConfig = {
|
|
|
945
943
|
* Provides declarative threshold-based checks on execution metrics.
|
|
946
944
|
* Only specified thresholds are checked; omitted ones are ignored.
|
|
947
945
|
*/
|
|
948
|
-
type ExecutionMetricsEvaluatorConfig = {
|
|
946
|
+
type ExecutionMetricsGraderConfig = {
|
|
949
947
|
readonly name: string;
|
|
950
948
|
readonly type: 'execution-metrics';
|
|
951
949
|
/** Maximum allowed number of tool calls */
|
|
@@ -966,14 +964,14 @@ type ExecutionMetricsEvaluatorConfig = {
|
|
|
966
964
|
readonly required?: boolean | number;
|
|
967
965
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
968
966
|
readonly min_score?: number;
|
|
969
|
-
/** When true, inverts the
|
|
967
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
970
968
|
readonly negate?: boolean;
|
|
971
969
|
};
|
|
972
970
|
/**
|
|
973
971
|
* Configuration for the contains assertion evaluator.
|
|
974
972
|
* Checks whether the candidate output contains a specified substring.
|
|
975
973
|
*/
|
|
976
|
-
type ContainsEvaluatorConfig = {
|
|
974
|
+
type ContainsGraderConfig = {
|
|
977
975
|
readonly name: string;
|
|
978
976
|
readonly type: 'contains';
|
|
979
977
|
readonly value: string;
|
|
@@ -981,14 +979,14 @@ type ContainsEvaluatorConfig = {
|
|
|
981
979
|
readonly required?: boolean | number;
|
|
982
980
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
983
981
|
readonly min_score?: number;
|
|
984
|
-
/** When true, inverts the
|
|
982
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
985
983
|
readonly negate?: boolean;
|
|
986
984
|
};
|
|
987
985
|
/**
|
|
988
986
|
* Configuration for the contains_any assertion evaluator.
|
|
989
987
|
* Checks whether the candidate output contains ANY of the specified substrings.
|
|
990
988
|
*/
|
|
991
|
-
type ContainsAnyEvaluatorConfig = {
|
|
989
|
+
type ContainsAnyGraderConfig = {
|
|
992
990
|
readonly name: string;
|
|
993
991
|
readonly type: 'contains-any';
|
|
994
992
|
readonly value: readonly string[];
|
|
@@ -996,14 +994,14 @@ type ContainsAnyEvaluatorConfig = {
|
|
|
996
994
|
readonly required?: boolean | number;
|
|
997
995
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
998
996
|
readonly min_score?: number;
|
|
999
|
-
/** When true, inverts the
|
|
997
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1000
998
|
readonly negate?: boolean;
|
|
1001
999
|
};
|
|
1002
1000
|
/**
|
|
1003
1001
|
* Configuration for the contains_all assertion evaluator.
|
|
1004
1002
|
* Checks whether the candidate output contains ALL of the specified substrings.
|
|
1005
1003
|
*/
|
|
1006
|
-
type ContainsAllEvaluatorConfig = {
|
|
1004
|
+
type ContainsAllGraderConfig = {
|
|
1007
1005
|
readonly name: string;
|
|
1008
1006
|
readonly type: 'contains-all';
|
|
1009
1007
|
readonly value: readonly string[];
|
|
@@ -1011,14 +1009,14 @@ type ContainsAllEvaluatorConfig = {
|
|
|
1011
1009
|
readonly required?: boolean | number;
|
|
1012
1010
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1013
1011
|
readonly min_score?: number;
|
|
1014
|
-
/** When true, inverts the
|
|
1012
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1015
1013
|
readonly negate?: boolean;
|
|
1016
1014
|
};
|
|
1017
1015
|
/**
|
|
1018
1016
|
* Configuration for the icontains assertion evaluator.
|
|
1019
1017
|
* Case-insensitive check whether the candidate output contains a specified substring.
|
|
1020
1018
|
*/
|
|
1021
|
-
type IcontainsEvaluatorConfig = {
|
|
1019
|
+
type IcontainsGraderConfig = {
|
|
1022
1020
|
readonly name: string;
|
|
1023
1021
|
readonly type: 'icontains';
|
|
1024
1022
|
readonly value: string;
|
|
@@ -1026,14 +1024,14 @@ type IcontainsEvaluatorConfig = {
|
|
|
1026
1024
|
readonly required?: boolean | number;
|
|
1027
1025
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1028
1026
|
readonly min_score?: number;
|
|
1029
|
-
/** When true, inverts the
|
|
1027
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1030
1028
|
readonly negate?: boolean;
|
|
1031
1029
|
};
|
|
1032
1030
|
/**
|
|
1033
1031
|
* Configuration for the icontains_any assertion evaluator.
|
|
1034
1032
|
* Case-insensitive check whether the candidate output contains ANY of the specified substrings.
|
|
1035
1033
|
*/
|
|
1036
|
-
type IcontainsAnyEvaluatorConfig = {
|
|
1034
|
+
type IcontainsAnyGraderConfig = {
|
|
1037
1035
|
readonly name: string;
|
|
1038
1036
|
readonly type: 'icontains-any';
|
|
1039
1037
|
readonly value: readonly string[];
|
|
@@ -1041,14 +1039,14 @@ type IcontainsAnyEvaluatorConfig = {
|
|
|
1041
1039
|
readonly required?: boolean | number;
|
|
1042
1040
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1043
1041
|
readonly min_score?: number;
|
|
1044
|
-
/** When true, inverts the
|
|
1042
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1045
1043
|
readonly negate?: boolean;
|
|
1046
1044
|
};
|
|
1047
1045
|
/**
|
|
1048
1046
|
* Configuration for the icontains_all assertion evaluator.
|
|
1049
1047
|
* Case-insensitive check whether the candidate output contains ALL of the specified substrings.
|
|
1050
1048
|
*/
|
|
1051
|
-
type IcontainsAllEvaluatorConfig = {
|
|
1049
|
+
type IcontainsAllGraderConfig = {
|
|
1052
1050
|
readonly name: string;
|
|
1053
1051
|
readonly type: 'icontains-all';
|
|
1054
1052
|
readonly value: readonly string[];
|
|
@@ -1056,14 +1054,14 @@ type IcontainsAllEvaluatorConfig = {
|
|
|
1056
1054
|
readonly required?: boolean | number;
|
|
1057
1055
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1058
1056
|
readonly min_score?: number;
|
|
1059
|
-
/** When true, inverts the
|
|
1057
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1060
1058
|
readonly negate?: boolean;
|
|
1061
1059
|
};
|
|
1062
1060
|
/**
|
|
1063
1061
|
* Configuration for the starts_with assertion evaluator.
|
|
1064
1062
|
* Checks whether the candidate output starts with a specified string (both trimmed).
|
|
1065
1063
|
*/
|
|
1066
|
-
type StartsWithEvaluatorConfig = {
|
|
1064
|
+
type StartsWithGraderConfig = {
|
|
1067
1065
|
readonly name: string;
|
|
1068
1066
|
readonly type: 'starts-with';
|
|
1069
1067
|
readonly value: string;
|
|
@@ -1071,14 +1069,14 @@ type StartsWithEvaluatorConfig = {
|
|
|
1071
1069
|
readonly required?: boolean | number;
|
|
1072
1070
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1073
1071
|
readonly min_score?: number;
|
|
1074
|
-
/** When true, inverts the
|
|
1072
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1075
1073
|
readonly negate?: boolean;
|
|
1076
1074
|
};
|
|
1077
1075
|
/**
|
|
1078
1076
|
* Configuration for the ends_with assertion evaluator.
|
|
1079
1077
|
* Checks whether the candidate output ends with a specified string (both trimmed).
|
|
1080
1078
|
*/
|
|
1081
|
-
type EndsWithEvaluatorConfig = {
|
|
1079
|
+
type EndsWithGraderConfig = {
|
|
1082
1080
|
readonly name: string;
|
|
1083
1081
|
readonly type: 'ends-with';
|
|
1084
1082
|
readonly value: string;
|
|
@@ -1086,14 +1084,14 @@ type EndsWithEvaluatorConfig = {
|
|
|
1086
1084
|
readonly required?: boolean | number;
|
|
1087
1085
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1088
1086
|
readonly min_score?: number;
|
|
1089
|
-
/** When true, inverts the
|
|
1087
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1090
1088
|
readonly negate?: boolean;
|
|
1091
1089
|
};
|
|
1092
1090
|
/**
|
|
1093
1091
|
* Configuration for the regex assertion evaluator.
|
|
1094
1092
|
* Checks whether the candidate output matches a regular expression pattern.
|
|
1095
1093
|
*/
|
|
1096
|
-
type RegexEvaluatorConfig = {
|
|
1094
|
+
type RegexGraderConfig = {
|
|
1097
1095
|
readonly name: string;
|
|
1098
1096
|
readonly type: 'regex';
|
|
1099
1097
|
readonly value: string;
|
|
@@ -1103,28 +1101,28 @@ type RegexEvaluatorConfig = {
|
|
|
1103
1101
|
readonly required?: boolean | number;
|
|
1104
1102
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1105
1103
|
readonly min_score?: number;
|
|
1106
|
-
/** When true, inverts the
|
|
1104
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1107
1105
|
readonly negate?: boolean;
|
|
1108
1106
|
};
|
|
1109
1107
|
/**
|
|
1110
1108
|
* Configuration for the is_json assertion evaluator.
|
|
1111
1109
|
* Checks whether the candidate output is valid JSON.
|
|
1112
1110
|
*/
|
|
1113
|
-
type
|
|
1111
|
+
type IsJsonGraderConfig = {
|
|
1114
1112
|
readonly name: string;
|
|
1115
1113
|
readonly type: 'is-json';
|
|
1116
1114
|
readonly weight?: number;
|
|
1117
1115
|
readonly required?: boolean | number;
|
|
1118
1116
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1119
1117
|
readonly min_score?: number;
|
|
1120
|
-
/** When true, inverts the
|
|
1118
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1121
1119
|
readonly negate?: boolean;
|
|
1122
1120
|
};
|
|
1123
1121
|
/**
|
|
1124
1122
|
* Configuration for the equals assertion evaluator.
|
|
1125
1123
|
* Checks whether the candidate output exactly equals a specified string.
|
|
1126
1124
|
*/
|
|
1127
|
-
type EqualsEvaluatorConfig = {
|
|
1125
|
+
type EqualsGraderConfig = {
|
|
1128
1126
|
readonly name: string;
|
|
1129
1127
|
readonly type: 'equals';
|
|
1130
1128
|
readonly value: string;
|
|
@@ -1132,7 +1130,7 @@ type EqualsEvaluatorConfig = {
|
|
|
1132
1130
|
readonly required?: boolean | number;
|
|
1133
1131
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1134
1132
|
readonly min_score?: number;
|
|
1135
|
-
/** When true, inverts the
|
|
1133
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1136
1134
|
readonly negate?: boolean;
|
|
1137
1135
|
};
|
|
1138
1136
|
/**
|
|
@@ -1147,7 +1145,7 @@ type RubricsEvaluatorConfig = {
|
|
|
1147
1145
|
readonly required?: boolean | number;
|
|
1148
1146
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1149
1147
|
readonly min_score?: number;
|
|
1150
|
-
/** When true, inverts the
|
|
1148
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1151
1149
|
readonly negate?: boolean;
|
|
1152
1150
|
};
|
|
1153
1151
|
/**
|
|
@@ -1156,7 +1154,7 @@ type RubricsEvaluatorConfig = {
|
|
|
1156
1154
|
* Tool-name resolution is automatic based on the provider kind.
|
|
1157
1155
|
* For providers not covered by the built-in mapping, use a code-grader.
|
|
1158
1156
|
*/
|
|
1159
|
-
type
|
|
1157
|
+
type SkillTriggerGraderConfig = {
|
|
1160
1158
|
readonly name: string;
|
|
1161
1159
|
readonly type: 'skill-trigger';
|
|
1162
1160
|
/** The skill name to check for (case-sensitive substring match) */
|
|
@@ -1182,7 +1180,7 @@ type InlineAssertEvaluatorConfig = {
|
|
|
1182
1180
|
readonly min_score?: number;
|
|
1183
1181
|
readonly negate?: boolean;
|
|
1184
1182
|
};
|
|
1185
|
-
type
|
|
1183
|
+
type GraderConfig = CodeGraderConfig | LlmGraderConfig | CompositeGraderConfig | ToolTrajectoryGraderConfig | FieldAccuracyGraderConfig | LatencyGraderConfig | CostGraderConfig | TokenUsageGraderConfig | ExecutionMetricsGraderConfig | SkillTriggerGraderConfig | ContainsGraderConfig | ContainsAnyGraderConfig | ContainsAllGraderConfig | IcontainsGraderConfig | IcontainsAnyGraderConfig | IcontainsAllGraderConfig | StartsWithGraderConfig | EndsWithGraderConfig | RegexGraderConfig | IsJsonGraderConfig | EqualsGraderConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
|
|
1186
1184
|
/**
|
|
1187
1185
|
* A single turn in a multi-turn conversation evaluation.
|
|
1188
1186
|
* Each turn is a user message. The runner generates the assistant response.
|
|
@@ -1193,7 +1191,7 @@ interface ConversationTurn {
|
|
|
1193
1191
|
/** Reference assistant response for grading (NOT carried forward — actual LLM response is used) */
|
|
1194
1192
|
readonly expected_output?: TestMessageContent;
|
|
1195
1193
|
/** Per-turn assertions. Strings become rubric criteria via shorthand. */
|
|
1196
|
-
readonly assertions?: readonly (string |
|
|
1194
|
+
readonly assertions?: readonly (string | GraderConfig)[];
|
|
1197
1195
|
}
|
|
1198
1196
|
/**
|
|
1199
1197
|
* Conversation evaluation mode.
|
|
@@ -1228,8 +1226,8 @@ interface EvalTest {
|
|
|
1228
1226
|
readonly reference_answer?: string;
|
|
1229
1227
|
readonly file_paths: readonly string[];
|
|
1230
1228
|
readonly criteria: string;
|
|
1231
|
-
readonly evaluator?:
|
|
1232
|
-
readonly assertions?: readonly
|
|
1229
|
+
readonly evaluator?: GraderKind;
|
|
1230
|
+
readonly assertions?: readonly GraderConfig[];
|
|
1233
1231
|
/** Suite-level preprocessors used by the implicit default llm-grader. */
|
|
1234
1232
|
readonly preprocessors?: readonly ContentPreprocessorConfig[];
|
|
1235
1233
|
/** Workspace configuration (merged from suite-level and case-level) */
|
|
@@ -1293,7 +1291,7 @@ interface TrialResult {
|
|
|
1293
1291
|
readonly attempt: number;
|
|
1294
1292
|
readonly score: number;
|
|
1295
1293
|
readonly verdict: EvaluationVerdict;
|
|
1296
|
-
readonly scores?: readonly
|
|
1294
|
+
readonly scores?: readonly GraderResult[];
|
|
1297
1295
|
readonly error?: string;
|
|
1298
1296
|
readonly costUsd?: number;
|
|
1299
1297
|
/** Primary classification for this trial attempt */
|
|
@@ -1359,7 +1357,7 @@ interface ExecutionError {
|
|
|
1359
1357
|
*/
|
|
1360
1358
|
type FailOnError = boolean;
|
|
1361
1359
|
/**
|
|
1362
|
-
*
|
|
1360
|
+
* Grader scorecard for a single eval case run.
|
|
1363
1361
|
*/
|
|
1364
1362
|
interface EvaluationResult {
|
|
1365
1363
|
readonly timestamp: string;
|
|
@@ -1390,7 +1388,7 @@ interface EvaluationResult {
|
|
|
1390
1388
|
readonly lm?: JsonObject;
|
|
1391
1389
|
readonly evaluator?: JsonObject;
|
|
1392
1390
|
};
|
|
1393
|
-
readonly scores?: readonly
|
|
1391
|
+
readonly scores?: readonly GraderResult[];
|
|
1394
1392
|
readonly error?: string;
|
|
1395
1393
|
/** Lightweight summary of the execution trace (always included when available) */
|
|
1396
1394
|
readonly trace?: TraceSummary;
|
|
@@ -1433,9 +1431,9 @@ interface EvaluationResult {
|
|
|
1433
1431
|
readonly executionError?: ExecutionError;
|
|
1434
1432
|
}
|
|
1435
1433
|
type EvaluationVerdict = 'pass' | 'fail' | 'skip';
|
|
1436
|
-
interface EvaluatorResult {
|
|
1434
|
+
interface GraderResult {
|
|
1437
1435
|
readonly name: string;
|
|
1438
|
-
readonly type:
|
|
1436
|
+
readonly type: GraderKind;
|
|
1439
1437
|
readonly score: number;
|
|
1440
1438
|
readonly weight?: number;
|
|
1441
1439
|
readonly verdict?: EvaluationVerdict;
|
|
@@ -1444,7 +1442,7 @@ interface EvaluatorResult {
|
|
|
1444
1442
|
readonly input?: JsonObject;
|
|
1445
1443
|
/** Target name used for grading (e.g., the LLM provider name). */
|
|
1446
1444
|
readonly target?: string;
|
|
1447
|
-
readonly scores?: readonly
|
|
1445
|
+
readonly scores?: readonly GraderResult[];
|
|
1448
1446
|
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
|
|
1449
1447
|
readonly details?: JsonObject;
|
|
1450
1448
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
@@ -1642,7 +1640,7 @@ type EvalSuiteResult = {
|
|
|
1642
1640
|
/** Suite-level metadata (name, description, version, etc.) */
|
|
1643
1641
|
readonly metadata?: EvalMetadata;
|
|
1644
1642
|
/** Suite-level total cost budget in USD */
|
|
1645
|
-
readonly
|
|
1643
|
+
readonly budgetUsd?: number;
|
|
1646
1644
|
/** Execution error tolerance: true or false */
|
|
1647
1645
|
readonly failOnError?: FailOnError;
|
|
1648
1646
|
/** Suite-level quality threshold (0-1) — suite fails if mean score is below */
|
|
@@ -2346,8 +2344,8 @@ interface EvaluationContext {
|
|
|
2346
2344
|
readonly graderProvider?: Provider;
|
|
2347
2345
|
/** @deprecated Use `graderProvider` instead */
|
|
2348
2346
|
readonly judgeProvider?: Provider;
|
|
2349
|
-
readonly
|
|
2350
|
-
readonly evaluator?:
|
|
2347
|
+
readonly graderTemplateOverride?: string;
|
|
2348
|
+
readonly evaluator?: GraderConfig;
|
|
2351
2349
|
/** Output messages from agent execution (primary source for tool trajectory) */
|
|
2352
2350
|
readonly output?: readonly Message[];
|
|
2353
2351
|
/** Lightweight summary of trace events (if available) */
|
|
@@ -2380,8 +2378,8 @@ interface EvaluationScore {
|
|
|
2380
2378
|
readonly verdict: EvaluationVerdict;
|
|
2381
2379
|
readonly assertions: readonly AssertionEntry[];
|
|
2382
2380
|
readonly expectedAspectCount: number;
|
|
2383
|
-
readonly
|
|
2384
|
-
readonly scores?: readonly
|
|
2381
|
+
readonly graderRawRequest?: JsonObject;
|
|
2382
|
+
readonly scores?: readonly ChildGraderResult[];
|
|
2385
2383
|
/** Optional structured details from evaluators (e.g., TP/TN/FP/FN counts, alignments, per-turn scores). */
|
|
2386
2384
|
readonly details?: JsonObject;
|
|
2387
2385
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
@@ -2389,26 +2387,26 @@ interface EvaluationScore {
|
|
|
2389
2387
|
/** Target name used for grading (e.g., the LLM provider). */
|
|
2390
2388
|
readonly graderTarget?: string;
|
|
2391
2389
|
}
|
|
2392
|
-
interface
|
|
2390
|
+
interface ChildGraderResult {
|
|
2393
2391
|
readonly name: string;
|
|
2394
2392
|
readonly type: string;
|
|
2395
2393
|
readonly score: number;
|
|
2396
2394
|
readonly weight?: number;
|
|
2397
2395
|
readonly verdict: EvaluationVerdict;
|
|
2398
2396
|
readonly assertions: readonly AssertionEntry[];
|
|
2399
|
-
readonly
|
|
2400
|
-
readonly scores?: readonly
|
|
2397
|
+
readonly graderRawRequest?: JsonObject;
|
|
2398
|
+
readonly scores?: readonly ChildGraderResult[];
|
|
2401
2399
|
/** Optional structured details from evaluators (e.g., TP/TN/FP/FN counts, alignments, per-turn scores). */
|
|
2402
2400
|
readonly details?: JsonObject;
|
|
2403
2401
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
2404
2402
|
readonly tokenUsage?: TokenUsage;
|
|
2405
2403
|
}
|
|
2406
|
-
interface
|
|
2404
|
+
interface Grader {
|
|
2407
2405
|
readonly kind: string;
|
|
2408
2406
|
evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
|
|
2409
2407
|
}
|
|
2410
|
-
interface
|
|
2411
|
-
create(config:
|
|
2408
|
+
interface GraderFactory {
|
|
2409
|
+
create(config: GraderConfig, context: EvaluationContext): Grader;
|
|
2412
2410
|
}
|
|
2413
2411
|
|
|
2414
2412
|
/**
|
|
@@ -2447,7 +2445,7 @@ declare function deepEqual(a: unknown, b: unknown): boolean;
|
|
|
2447
2445
|
*/
|
|
2448
2446
|
declare function negateScore(score: EvaluationScore): EvaluationScore;
|
|
2449
2447
|
|
|
2450
|
-
interface CodeEvaluatorOptions {
|
|
2448
|
+
interface CodeGraderOptions {
|
|
2451
2449
|
readonly command: readonly string[];
|
|
2452
2450
|
/** @deprecated Use `command` instead */
|
|
2453
2451
|
readonly script?: readonly string[];
|
|
@@ -2458,29 +2456,29 @@ interface CodeEvaluatorOptions {
|
|
|
2458
2456
|
/** Target access config - when present, enables target invocation */
|
|
2459
2457
|
readonly target?: TargetAccessConfig;
|
|
2460
2458
|
}
|
|
2461
|
-
declare class
|
|
2459
|
+
declare class CodeGrader implements Grader {
|
|
2462
2460
|
readonly kind = "code-grader";
|
|
2463
2461
|
private readonly command;
|
|
2464
2462
|
private readonly cwd?;
|
|
2465
2463
|
private readonly agentTimeoutMs?;
|
|
2466
2464
|
private readonly config?;
|
|
2467
2465
|
private readonly target?;
|
|
2468
|
-
constructor(options:
|
|
2466
|
+
constructor(options: CodeGraderOptions);
|
|
2469
2467
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
2470
2468
|
}
|
|
2471
2469
|
declare function executeScript(scriptPath: readonly string[] | string, input: string, agentTimeoutMs?: number, cwd?: string, env?: Record<string, string>): Promise<string>;
|
|
2472
2470
|
|
|
2473
|
-
interface
|
|
2474
|
-
readonly config:
|
|
2475
|
-
readonly evaluatorFactory:
|
|
2471
|
+
interface CompositeGraderOptions {
|
|
2472
|
+
readonly config: CompositeGraderConfig;
|
|
2473
|
+
readonly evaluatorFactory: GraderFactory;
|
|
2476
2474
|
readonly cwd?: string;
|
|
2477
2475
|
}
|
|
2478
|
-
declare class CompositeEvaluator implements Evaluator {
|
|
2476
|
+
declare class CompositeGrader implements Grader {
|
|
2479
2477
|
readonly kind = "composite";
|
|
2480
2478
|
private readonly config;
|
|
2481
2479
|
private readonly evaluatorFactory;
|
|
2482
2480
|
private readonly cwd?;
|
|
2483
|
-
constructor(options:
|
|
2481
|
+
constructor(options: CompositeGraderOptions);
|
|
2484
2482
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
2485
2483
|
private aggregate;
|
|
2486
2484
|
private runWeightedAverage;
|
|
@@ -2489,50 +2487,50 @@ declare class CompositeEvaluator implements Evaluator {
|
|
|
2489
2487
|
private runLlmAggregator;
|
|
2490
2488
|
}
|
|
2491
2489
|
|
|
2492
|
-
interface
|
|
2493
|
-
readonly config:
|
|
2490
|
+
interface CostGraderOptions {
|
|
2491
|
+
readonly config: CostGraderConfig;
|
|
2494
2492
|
}
|
|
2495
2493
|
/**
|
|
2496
|
-
*
|
|
2494
|
+
* Grader that checks execution cost against a budget.
|
|
2497
2495
|
* Uses costUsd from the evaluation context.
|
|
2498
2496
|
*/
|
|
2499
|
-
declare class
|
|
2497
|
+
declare class CostGrader implements Grader {
|
|
2500
2498
|
readonly kind = "cost";
|
|
2501
2499
|
private readonly config;
|
|
2502
|
-
constructor(options:
|
|
2500
|
+
constructor(options: CostGraderOptions);
|
|
2503
2501
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2504
2502
|
}
|
|
2505
2503
|
|
|
2506
|
-
interface
|
|
2507
|
-
readonly config:
|
|
2504
|
+
interface ExecutionMetricsGraderOptions {
|
|
2505
|
+
readonly config: ExecutionMetricsGraderConfig;
|
|
2508
2506
|
}
|
|
2509
2507
|
/**
|
|
2510
|
-
*
|
|
2508
|
+
* Grader that checks execution metrics against configured thresholds.
|
|
2511
2509
|
* Supports multiple threshold types: tool calls, LLM calls, tokens, cost, duration,
|
|
2512
2510
|
* and exploration ratio. Only specified thresholds are checked.
|
|
2513
2511
|
*
|
|
2514
2512
|
* Score is proportional: passed / total assertions
|
|
2515
2513
|
*/
|
|
2516
|
-
declare class
|
|
2514
|
+
declare class ExecutionMetricsGrader implements Grader {
|
|
2517
2515
|
readonly kind = "execution-metrics";
|
|
2518
2516
|
private readonly config;
|
|
2519
|
-
constructor(options:
|
|
2517
|
+
constructor(options: ExecutionMetricsGraderOptions);
|
|
2520
2518
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2521
2519
|
private extractConfiguredThresholds;
|
|
2522
2520
|
private filterDefinedMetrics;
|
|
2523
2521
|
}
|
|
2524
2522
|
|
|
2525
|
-
interface
|
|
2526
|
-
readonly config:
|
|
2523
|
+
interface FieldAccuracyGraderOptions {
|
|
2524
|
+
readonly config: FieldAccuracyGraderConfig;
|
|
2527
2525
|
}
|
|
2528
2526
|
/**
|
|
2529
|
-
*
|
|
2527
|
+
* FieldAccuracyGrader compares extracted structured data against expected values
|
|
2530
2528
|
* with configurable matching strategies (exact, fuzzy, numeric_tolerance, date).
|
|
2531
2529
|
*/
|
|
2532
|
-
declare class
|
|
2530
|
+
declare class FieldAccuracyGrader implements Grader {
|
|
2533
2531
|
readonly kind = "field-accuracy";
|
|
2534
2532
|
private readonly config;
|
|
2535
|
-
constructor(options:
|
|
2533
|
+
constructor(options: FieldAccuracyGraderOptions);
|
|
2536
2534
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2537
2535
|
/**
|
|
2538
2536
|
* Extract expected data from expected_output array.
|
|
@@ -2561,33 +2559,33 @@ declare class FieldAccuracyEvaluator implements Evaluator {
|
|
|
2561
2559
|
private aggregateResults;
|
|
2562
2560
|
}
|
|
2563
2561
|
|
|
2564
|
-
interface
|
|
2565
|
-
readonly config:
|
|
2562
|
+
interface LatencyGraderOptions {
|
|
2563
|
+
readonly config: LatencyGraderConfig;
|
|
2566
2564
|
}
|
|
2567
2565
|
/**
|
|
2568
|
-
*
|
|
2566
|
+
* Grader that checks execution duration against a threshold.
|
|
2569
2567
|
* Uses durationMs from the evaluation context.
|
|
2570
2568
|
*/
|
|
2571
|
-
declare class
|
|
2569
|
+
declare class LatencyGrader implements Grader {
|
|
2572
2570
|
readonly kind = "latency";
|
|
2573
2571
|
private readonly config;
|
|
2574
|
-
constructor(options:
|
|
2572
|
+
constructor(options: LatencyGraderOptions);
|
|
2575
2573
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2576
2574
|
}
|
|
2577
2575
|
|
|
2578
2576
|
/**
|
|
2579
|
-
* Default
|
|
2580
|
-
* Custom
|
|
2577
|
+
* Default grader template for the user prompt (variables will be substituted).
|
|
2578
|
+
* Custom graders can override this via graderTemplate option.
|
|
2581
2579
|
*/
|
|
2582
|
-
declare const
|
|
2580
|
+
declare const DEFAULT_GRADER_TEMPLATE: string;
|
|
2583
2581
|
type GraderProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
|
|
2584
|
-
interface
|
|
2582
|
+
interface LlmGraderOptions {
|
|
2585
2583
|
readonly resolveGraderProvider: GraderProviderResolver;
|
|
2586
2584
|
/** @deprecated Use `resolveGraderProvider` instead. */
|
|
2587
2585
|
readonly resolveJudgeProvider?: GraderProviderResolver;
|
|
2588
2586
|
readonly maxOutputTokens?: number;
|
|
2589
2587
|
readonly temperature?: number;
|
|
2590
|
-
readonly
|
|
2588
|
+
readonly graderTemplate?: string;
|
|
2591
2589
|
readonly maxSteps?: number;
|
|
2592
2590
|
readonly graderTargetProvider?: Provider;
|
|
2593
2591
|
/** @deprecated Use `graderTargetProvider` instead. */
|
|
@@ -2657,15 +2655,15 @@ declare const rubricEvaluationSchema: z.ZodObject<{
|
|
|
2657
2655
|
overall_reasoning: string;
|
|
2658
2656
|
}>;
|
|
2659
2657
|
|
|
2660
|
-
declare class
|
|
2658
|
+
declare class LlmGrader implements Grader {
|
|
2661
2659
|
readonly kind = "llm-grader";
|
|
2662
2660
|
private readonly resolveGraderProvider;
|
|
2663
2661
|
private readonly maxOutputTokens?;
|
|
2664
2662
|
private readonly temperature?;
|
|
2665
|
-
private readonly
|
|
2663
|
+
private readonly graderTemplate?;
|
|
2666
2664
|
private readonly maxSteps;
|
|
2667
2665
|
private readonly graderTargetProvider?;
|
|
2668
|
-
constructor(options:
|
|
2666
|
+
constructor(options: LlmGraderOptions);
|
|
2669
2667
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
2670
2668
|
private prepareContext;
|
|
2671
2669
|
private evaluateFreeform;
|
|
@@ -2722,7 +2720,7 @@ declare class LlmGraderEvaluator implements Evaluator {
|
|
|
2722
2720
|
}
|
|
2723
2721
|
/**
|
|
2724
2722
|
* Build the mandatory output schema that all evaluators must follow.
|
|
2725
|
-
* This schema is always appended to the
|
|
2723
|
+
* This schema is always appended to the grader template.
|
|
2726
2724
|
*/
|
|
2727
2725
|
declare function buildOutputSchema(): string;
|
|
2728
2726
|
declare function buildRubricOutputSchema(): string;
|
|
@@ -2766,10 +2764,10 @@ declare function extractImageBlocks(messages: readonly Message[]): ContentImage[
|
|
|
2766
2764
|
* names (input.skill, input.file_path) regardless of provider.
|
|
2767
2765
|
*/
|
|
2768
2766
|
|
|
2769
|
-
declare class
|
|
2767
|
+
declare class SkillTriggerGrader implements Grader {
|
|
2770
2768
|
readonly kind = "skill-trigger";
|
|
2771
2769
|
private readonly config;
|
|
2772
|
-
constructor(config:
|
|
2770
|
+
constructor(config: SkillTriggerGraderConfig);
|
|
2773
2771
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2774
2772
|
}
|
|
2775
2773
|
|
|
@@ -2783,33 +2781,33 @@ declare function assembleLlmGraderPrompt(input: {
|
|
|
2783
2781
|
evalCase: EvalTest;
|
|
2784
2782
|
candidate: string;
|
|
2785
2783
|
promptInputs: PromptInputs;
|
|
2786
|
-
evaluatorConfig?:
|
|
2784
|
+
evaluatorConfig?: LlmGraderConfig;
|
|
2787
2785
|
output?: readonly Message[];
|
|
2788
2786
|
fileChanges?: string;
|
|
2789
|
-
|
|
2787
|
+
graderTemplateOverride?: string;
|
|
2790
2788
|
}): LlmGraderPromptAssembly;
|
|
2791
2789
|
|
|
2792
|
-
interface
|
|
2793
|
-
readonly config:
|
|
2790
|
+
interface TokenUsageGraderOptions {
|
|
2791
|
+
readonly config: TokenUsageGraderConfig;
|
|
2794
2792
|
}
|
|
2795
2793
|
/**
|
|
2796
|
-
*
|
|
2794
|
+
* Grader that checks provider-reported token usage against configured limits.
|
|
2797
2795
|
* Uses tokenUsage from the evaluation context.
|
|
2798
2796
|
*/
|
|
2799
|
-
declare class
|
|
2797
|
+
declare class TokenUsageGrader implements Grader {
|
|
2800
2798
|
readonly kind = "token-usage";
|
|
2801
2799
|
private readonly config;
|
|
2802
|
-
constructor(options:
|
|
2800
|
+
constructor(options: TokenUsageGraderOptions);
|
|
2803
2801
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2804
2802
|
}
|
|
2805
2803
|
|
|
2806
|
-
interface
|
|
2807
|
-
readonly config:
|
|
2804
|
+
interface ToolTrajectoryGraderOptions {
|
|
2805
|
+
readonly config: ToolTrajectoryGraderConfig;
|
|
2808
2806
|
}
|
|
2809
|
-
declare class
|
|
2807
|
+
declare class ToolTrajectoryGrader implements Grader {
|
|
2810
2808
|
readonly kind = "tool-trajectory";
|
|
2811
2809
|
private readonly config;
|
|
2812
|
-
constructor(options:
|
|
2810
|
+
constructor(options: ToolTrajectoryGraderOptions);
|
|
2813
2811
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2814
2812
|
/**
|
|
2815
2813
|
* Extract tool calls from output messages.
|
|
@@ -2873,7 +2871,7 @@ declare function runIsJsonAssertion(output: string): AssertionResult;
|
|
|
2873
2871
|
declare function runEqualsAssertion(output: string, value: string): AssertionResult;
|
|
2874
2872
|
|
|
2875
2873
|
/**
|
|
2876
|
-
* Extensible
|
|
2874
|
+
* Extensible grader registry.
|
|
2877
2875
|
*
|
|
2878
2876
|
* Replaces the hardcoded switch/case dispatch in the orchestrator with
|
|
2879
2877
|
* a registry of named factory functions. Built-in evaluators are registered
|
|
@@ -2882,10 +2880,10 @@ declare function runEqualsAssertion(output: string, value: string): AssertionRes
|
|
|
2882
2880
|
*/
|
|
2883
2881
|
|
|
2884
2882
|
/**
|
|
2885
|
-
* Context passed to
|
|
2883
|
+
* Context passed to grader factory functions during creation.
|
|
2886
2884
|
* Contains shared resources needed by evaluator instances.
|
|
2887
2885
|
*/
|
|
2888
|
-
interface
|
|
2886
|
+
interface GraderDispatchContext {
|
|
2889
2887
|
/** Shared LLM grader provider (resolved at suite level) */
|
|
2890
2888
|
readonly graderProvider?: Provider;
|
|
2891
2889
|
/** @deprecated Use `graderProvider` instead */
|
|
@@ -2899,48 +2897,48 @@ interface EvaluatorDispatchContext {
|
|
|
2899
2897
|
/** Directory containing the eval file (for composite member resolution) */
|
|
2900
2898
|
readonly evalFileDir?: string;
|
|
2901
2899
|
/** Shared LLM grader evaluator instance */
|
|
2902
|
-
readonly llmGrader:
|
|
2900
|
+
readonly llmGrader: Grader;
|
|
2903
2901
|
/** @deprecated Use `llmGrader` instead */
|
|
2904
|
-
readonly llmJudge?:
|
|
2902
|
+
readonly llmJudge?: Grader;
|
|
2905
2903
|
/** Reference to the registry itself (for composite evaluators that need to create children) */
|
|
2906
|
-
readonly registry:
|
|
2904
|
+
readonly registry: GraderRegistry;
|
|
2907
2905
|
}
|
|
2908
2906
|
/**
|
|
2909
|
-
* Factory function that creates an
|
|
2907
|
+
* Factory function that creates an Grader instance from a config.
|
|
2910
2908
|
*
|
|
2911
2909
|
* Factory functions handle all type-specific initialization logic:
|
|
2912
2910
|
* - Reading prompt files for LLM graders
|
|
2913
2911
|
* - Resolving script paths for code graders
|
|
2914
2912
|
* - Creating adapter evaluators for deterministic assertions
|
|
2915
2913
|
*/
|
|
2916
|
-
type
|
|
2914
|
+
type GraderFactoryFn = (config: GraderConfig, context: GraderDispatchContext) => Grader | Promise<Grader>;
|
|
2917
2915
|
/**
|
|
2918
|
-
* Registry of
|
|
2916
|
+
* Registry of grader factory functions keyed by grader type name.
|
|
2919
2917
|
*
|
|
2920
2918
|
* Built-in evaluators are registered at startup. Custom evaluators can be
|
|
2921
2919
|
* registered via the `register()` method or discovered from `.agentv/assertions/`.
|
|
2922
2920
|
*/
|
|
2923
|
-
declare class
|
|
2921
|
+
declare class GraderRegistry {
|
|
2924
2922
|
private readonly factories;
|
|
2925
|
-
/** Register a factory function for an
|
|
2926
|
-
register(type: string, factory:
|
|
2927
|
-
/** Get the factory function for an
|
|
2928
|
-
get(type: string):
|
|
2923
|
+
/** Register a factory function for an grader type. */
|
|
2924
|
+
register(type: string, factory: GraderFactoryFn): this;
|
|
2925
|
+
/** Get the factory function for an grader type. */
|
|
2926
|
+
get(type: string): GraderFactoryFn | undefined;
|
|
2929
2927
|
/** Check if a factory is registered for the given type. */
|
|
2930
2928
|
has(type: string): boolean;
|
|
2931
|
-
/** List all registered
|
|
2929
|
+
/** List all registered grader type names. */
|
|
2932
2930
|
list(): string[];
|
|
2933
2931
|
/**
|
|
2934
2932
|
* Create an evaluator instance from a config, using the registered factory.
|
|
2935
|
-
* Throws if no factory is registered for the
|
|
2933
|
+
* Throws if no factory is registered for the grader type.
|
|
2936
2934
|
*/
|
|
2937
|
-
create(config:
|
|
2935
|
+
create(config: GraderConfig, context: GraderDispatchContext): Promise<Grader>;
|
|
2938
2936
|
}
|
|
2939
2937
|
/**
|
|
2940
|
-
* Adapter that wraps a synchronous assertion function as an
|
|
2938
|
+
* Adapter that wraps a synchronous assertion function as an Grader.
|
|
2941
2939
|
* Used for deterministic assertions (contains, regex, is-json, equals).
|
|
2942
2940
|
*/
|
|
2943
|
-
declare class
|
|
2941
|
+
declare class DeterministicAssertionGrader implements Grader {
|
|
2944
2942
|
private readonly assertFn;
|
|
2945
2943
|
readonly kind: string;
|
|
2946
2944
|
constructor(kind: string, assertFn: (context: EvaluationContext) => EvaluationScore);
|
|
@@ -2988,8 +2986,8 @@ interface RunEvalCaseOptions {
|
|
|
2988
2986
|
readonly evalCase: EvalTest;
|
|
2989
2987
|
readonly provider: Provider;
|
|
2990
2988
|
readonly target: ResolvedTarget;
|
|
2991
|
-
readonly evaluators: Partial<Record<string,
|
|
2992
|
-
readonly 'llm-grader':
|
|
2989
|
+
readonly evaluators: Partial<Record<string, Grader>> & {
|
|
2990
|
+
readonly 'llm-grader': Grader;
|
|
2993
2991
|
};
|
|
2994
2992
|
readonly now?: () => Date;
|
|
2995
2993
|
readonly maxRetries?: number;
|
|
@@ -3020,8 +3018,8 @@ interface RunEvalCaseOptions {
|
|
|
3020
3018
|
readonly suiteWorkspaceFile?: string;
|
|
3021
3019
|
/** Real-time observability callbacks passed to the provider */
|
|
3022
3020
|
readonly streamCallbacks?: ProviderStreamCallbacks;
|
|
3023
|
-
/**
|
|
3024
|
-
readonly typeRegistry?:
|
|
3021
|
+
/** Grader type registry (with custom assertions discovered) */
|
|
3022
|
+
readonly typeRegistry?: GraderRegistry;
|
|
3025
3023
|
/** RepoManager instance for repo lifecycle (shared workspace mode) */
|
|
3026
3024
|
readonly repoManager?: RepoManager;
|
|
3027
3025
|
/** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
|
|
@@ -3054,7 +3052,7 @@ interface RunEvaluationOptions {
|
|
|
3054
3052
|
readonly targets?: readonly TargetDefinition[];
|
|
3055
3053
|
readonly env?: EnvLookup;
|
|
3056
3054
|
readonly providerFactory?: (target: ResolvedTarget) => Provider;
|
|
3057
|
-
readonly evaluators?: Partial<Record<string,
|
|
3055
|
+
readonly evaluators?: Partial<Record<string, Grader>>;
|
|
3058
3056
|
readonly maxRetries?: number;
|
|
3059
3057
|
readonly agentTimeoutMs?: number;
|
|
3060
3058
|
readonly cache?: EvaluationCache;
|
|
@@ -3076,7 +3074,7 @@ interface RunEvaluationOptions {
|
|
|
3076
3074
|
/** Real-time observability callbacks passed to the provider */
|
|
3077
3075
|
readonly streamCallbacks?: ProviderStreamCallbacks;
|
|
3078
3076
|
/** Suite-level total cost budget in USD (stops dispatching when exceeded) */
|
|
3079
|
-
readonly
|
|
3077
|
+
readonly budgetUsd?: number;
|
|
3080
3078
|
/** Execution error tolerance: true halts on first error */
|
|
3081
3079
|
readonly failOnError?: FailOnError;
|
|
3082
3080
|
/** Workspace pooling: true (default) enables pool, false disables, undefined defaults to true */
|
|
@@ -3111,7 +3109,7 @@ declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationRes
|
|
|
3111
3109
|
* Types for inline assertion functions used in the evaluate() API.
|
|
3112
3110
|
*
|
|
3113
3111
|
* Inline functions are the escape hatch for custom evaluation logic
|
|
3114
|
-
* that doesn't fit a built-in
|
|
3112
|
+
* that doesn't fit a built-in grader type. For built-in assertions
|
|
3115
3113
|
* (contains, regex, is-json, etc.), use config objects instead:
|
|
3116
3114
|
*
|
|
3117
3115
|
* assert: [{ type: 'contains', value: 'hello' }]
|
|
@@ -4186,17 +4184,17 @@ declare class OtlpJsonFileExporter {
|
|
|
4186
4184
|
}
|
|
4187
4185
|
|
|
4188
4186
|
/**
|
|
4189
|
-
* Factory functions for all built-in
|
|
4187
|
+
* Factory functions for all built-in grader types.
|
|
4190
4188
|
*
|
|
4191
|
-
* Each factory creates an
|
|
4189
|
+
* Each factory creates an Grader instance from an GraderConfig,
|
|
4192
4190
|
* handling type-specific initialization logic. These are registered into
|
|
4193
|
-
* the
|
|
4191
|
+
* the GraderRegistry at startup.
|
|
4194
4192
|
*/
|
|
4195
4193
|
|
|
4196
4194
|
/**
|
|
4197
|
-
* Create a new
|
|
4195
|
+
* Create a new GraderRegistry with all built-in grader types registered.
|
|
4198
4196
|
*/
|
|
4199
|
-
declare function createBuiltinRegistry():
|
|
4197
|
+
declare function createBuiltinRegistry(): GraderRegistry;
|
|
4200
4198
|
|
|
4201
4199
|
/**
|
|
4202
4200
|
* Convention-based discovery of custom assertion scripts.
|
|
@@ -4216,27 +4214,27 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
|
|
|
4216
4214
|
* @param baseDir - The base directory to search from (typically project root or eval file dir)
|
|
4217
4215
|
* @returns Names of discovered assertion types
|
|
4218
4216
|
*/
|
|
4219
|
-
declare function discoverAssertions(registry:
|
|
4217
|
+
declare function discoverAssertions(registry: GraderRegistry, baseDir: string): Promise<string[]>;
|
|
4220
4218
|
|
|
4221
4219
|
/**
|
|
4222
4220
|
* Convention-based discovery of custom grader scripts.
|
|
4223
4221
|
*
|
|
4224
4222
|
* Scans `.agentv/graders/` (and legacy `.agentv/judges/`) for TypeScript/JavaScript
|
|
4225
|
-
* files and registers them as code
|
|
4226
|
-
* (without extension) becomes the
|
|
4223
|
+
* files and registers them as code graders in the registry. The file name
|
|
4224
|
+
* (without extension) becomes the grader type name.
|
|
4227
4225
|
*
|
|
4228
4226
|
* Example: `.agentv/graders/custom-grader.ts` → type "custom-grader" in EVAL.yaml
|
|
4229
4227
|
*/
|
|
4230
4228
|
|
|
4231
4229
|
/**
|
|
4232
4230
|
* Discover custom grader scripts from `.agentv/graders/` (and legacy `.agentv/judges/`)
|
|
4233
|
-
* and register them as
|
|
4231
|
+
* and register them as grader types in the registry.
|
|
4234
4232
|
*
|
|
4235
|
-
* @param registry - The
|
|
4233
|
+
* @param registry - The grader registry to register discovered graders into
|
|
4236
4234
|
* @param baseDir - The base directory to search from (typically project root or eval file dir)
|
|
4237
4235
|
* @returns Names of discovered grader types
|
|
4238
4236
|
*/
|
|
4239
|
-
declare function discoverGraders(registry:
|
|
4237
|
+
declare function discoverGraders(registry: GraderRegistry, baseDir: string): Promise<string[]>;
|
|
4240
4238
|
|
|
4241
4239
|
/**
|
|
4242
4240
|
* Core types for the transcript import pipeline.
|
|
@@ -4489,7 +4487,7 @@ declare function discoverClaudeSessions(opts?: ClaudeDiscoverOptions): Promise<C
|
|
|
4489
4487
|
* 1. Reads a transcript JSONL file (produced by `agentv import`)
|
|
4490
4488
|
* 2. Each invocation pops the next line from the transcript
|
|
4491
4489
|
* 3. Returns a ProviderResponse with pre-populated output, token usage, etc.
|
|
4492
|
-
* 4.
|
|
4490
|
+
* 4. Graders run identically to live eval — they see the same ProviderResponse
|
|
4493
4491
|
*
|
|
4494
4492
|
* The provider name in results is set to the source provider from the transcript
|
|
4495
4493
|
* (e.g., "claude", "codex", "copilot").
|
|
@@ -4555,4 +4553,4 @@ type AgentKernel = {
|
|
|
4555
4553
|
};
|
|
4556
4554
|
declare function createAgentKernel(): AgentKernel;
|
|
4557
4555
|
|
|
4558
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type
|
|
4556
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, 
type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type RepoCheckout, type RepoClone, type 
RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, 
createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, 
runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|