@agentv/core 4.17.1 → 4.19.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-6VZY3B6M.js → chunk-24ND5HZC.js} +102 -102
- package/dist/chunk-24ND5HZC.js.map +1 -0
- package/dist/chunk-QXX3IBYV.js +19740 -0
- package/dist/chunk-QXX3IBYV.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +5 -5
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +3 -3
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +22852 -21848
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1015 -974
- package/dist/index.d.ts +1015 -974
- package/dist/index.js +494 -19790
- package/dist/index.js.map +1 -1
- package/dist/ts-eval-loader-XFQ6S4DT.js +12 -0
- package/dist/ts-eval-loader-XFQ6S4DT.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-6VZY3B6M.js.map +0 -1
package/dist/index.d.ts
CHANGED
|
@@ -341,7 +341,7 @@ type ArgsMatchMode = 'exact' | 'ignore' | 'subset' | 'superset';
|
|
|
341
341
|
/**
|
|
342
342
|
* Configuration for tool-trajectory evaluator.
|
|
343
343
|
*/
|
|
344
|
-
interface ToolTrajectoryEvaluatorConfig {
|
|
344
|
+
interface ToolTrajectoryGraderConfig {
|
|
345
345
|
readonly name: string;
|
|
346
346
|
readonly type: 'tool-trajectory';
|
|
347
347
|
/** Matching mode */
|
|
@@ -355,7 +355,7 @@ interface ToolTrajectoryEvaluatorConfig {
|
|
|
355
355
|
readonly required?: boolean | number;
|
|
356
356
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
357
357
|
readonly min_score?: number;
|
|
358
|
-
/** When true, inverts the
|
|
358
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
359
359
|
readonly negate?: boolean;
|
|
360
360
|
/** Default argument matching mode for all expected items (defaults to 'exact') */
|
|
361
361
|
readonly argsMatch?: ArgsMatchMode | readonly string[];
|
|
@@ -539,9 +539,9 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
539
539
|
* - Either content (string or array of objects) OR tool_calls (for assistant messages)
|
|
540
540
|
*/
|
|
541
541
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
542
|
-
declare const
|
|
543
|
-
type
|
|
544
|
-
declare function
|
|
542
|
+
declare const GRADER_KIND_VALUES: readonly ["code-grader", "llm-grader", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
|
|
543
|
+
type GraderKind = (typeof GRADER_KIND_VALUES)[number];
|
|
544
|
+
declare function isGraderKind(value: unknown): value is GraderKind;
|
|
545
545
|
/**
|
|
546
546
|
* Configuration for enabling target access in code-grader evaluators.
|
|
547
547
|
* When present, the runtime will start a local proxy server that allows
|
|
@@ -697,7 +697,7 @@ type WorkspaceConfig = {
|
|
|
697
697
|
* relative paths from their own directory, not the eval file's directory. */
|
|
698
698
|
readonly workspaceFileDir?: string;
|
|
699
699
|
};
|
|
700
|
-
type CodeEvaluatorConfig = {
|
|
700
|
+
type CodeGraderConfig = {
|
|
701
701
|
readonly name: string;
|
|
702
702
|
readonly type: 'code-grader';
|
|
703
703
|
readonly command: readonly string[];
|
|
@@ -710,7 +710,7 @@ type CodeEvaluatorConfig = {
|
|
|
710
710
|
readonly required?: boolean | number;
|
|
711
711
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
712
712
|
readonly min_score?: number;
|
|
713
|
-
/** When true, inverts the
|
|
713
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
714
714
|
readonly negate?: boolean;
|
|
715
715
|
/** Pass-through configuration for the code-grader (any unrecognized YAML properties) */
|
|
716
716
|
readonly config?: JsonObject;
|
|
@@ -739,7 +739,7 @@ type ContentPreprocessorConfig = {
|
|
|
739
739
|
/** Resolved absolute path for the command script (last argv element) */
|
|
740
740
|
readonly resolvedCommand?: readonly string[];
|
|
741
741
|
};
|
|
742
|
-
type LlmGraderEvaluatorConfig = {
|
|
742
|
+
type LlmGraderConfig = {
|
|
743
743
|
readonly name: string;
|
|
744
744
|
readonly type: 'llm-grader';
|
|
745
745
|
/** Text prompt (inline or file path) or executable script config */
|
|
@@ -754,7 +754,7 @@ type LlmGraderEvaluatorConfig = {
|
|
|
754
754
|
readonly required?: boolean | number;
|
|
755
755
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
756
756
|
readonly min_score?: number;
|
|
757
|
-
/** When true, inverts the
|
|
757
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
758
758
|
readonly negate?: boolean;
|
|
759
759
|
/** Optional target override for this grader (uses a named LLM target from targets.yaml). */
|
|
760
760
|
readonly target?: string;
|
|
@@ -767,8 +767,6 @@ type LlmGraderEvaluatorConfig = {
|
|
|
767
767
|
/** Optional content preprocessors for ContentFile blocks in assistant output */
|
|
768
768
|
readonly preprocessors?: readonly ContentPreprocessorConfig[];
|
|
769
769
|
};
|
|
770
|
-
/** @deprecated Use `LlmGraderEvaluatorConfig` instead */
|
|
771
|
-
type LlmJudgeEvaluatorConfig = LlmGraderEvaluatorConfig;
|
|
772
770
|
/**
|
|
773
771
|
* Score range definition for analytic rubric scoring.
|
|
774
772
|
* Each range maps an integer score band (0-10) to an outcome description.
|
|
@@ -830,16 +828,16 @@ type CompositeAggregatorConfig = {
|
|
|
830
828
|
readonly type: 'threshold';
|
|
831
829
|
readonly threshold: number;
|
|
832
830
|
};
|
|
833
|
-
type
|
|
831
|
+
type CompositeGraderConfig = {
|
|
834
832
|
readonly name: string;
|
|
835
833
|
readonly type: 'composite';
|
|
836
|
-
readonly assertions: readonly
|
|
834
|
+
readonly assertions: readonly GraderConfig[];
|
|
837
835
|
readonly aggregator: CompositeAggregatorConfig;
|
|
838
836
|
readonly weight?: number;
|
|
839
837
|
readonly required?: boolean | number;
|
|
840
838
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
841
839
|
readonly min_score?: number;
|
|
842
|
-
/** When true, inverts the
|
|
840
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
843
841
|
readonly negate?: boolean;
|
|
844
842
|
};
|
|
845
843
|
/**
|
|
@@ -874,7 +872,7 @@ type FieldConfig = {
|
|
|
874
872
|
/**
|
|
875
873
|
* Configuration for the field-accuracy evaluator.
|
|
876
874
|
*/
|
|
877
|
-
type FieldAccuracyEvaluatorConfig = {
|
|
875
|
+
type FieldAccuracyGraderConfig = {
|
|
878
876
|
readonly name: string;
|
|
879
877
|
readonly type: 'field-accuracy';
|
|
880
878
|
/** Fields to compare between candidate and expected */
|
|
@@ -885,14 +883,14 @@ type FieldAccuracyEvaluatorConfig = {
|
|
|
885
883
|
readonly required?: boolean | number;
|
|
886
884
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
887
885
|
readonly min_score?: number;
|
|
888
|
-
/** When true, inverts the
|
|
886
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
889
887
|
readonly negate?: boolean;
|
|
890
888
|
};
|
|
891
889
|
/**
|
|
892
890
|
* Configuration for the latency evaluator.
|
|
893
891
|
* Checks execution duration against a threshold.
|
|
894
892
|
*/
|
|
895
|
-
type LatencyEvaluatorConfig = {
|
|
893
|
+
type LatencyGraderConfig = {
|
|
896
894
|
readonly name: string;
|
|
897
895
|
readonly type: 'latency';
|
|
898
896
|
/** Maximum allowed duration in milliseconds */
|
|
@@ -901,14 +899,14 @@ type LatencyEvaluatorConfig = {
|
|
|
901
899
|
readonly required?: boolean | number;
|
|
902
900
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
903
901
|
readonly min_score?: number;
|
|
904
|
-
/** When true, inverts the
|
|
902
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
905
903
|
readonly negate?: boolean;
|
|
906
904
|
};
|
|
907
905
|
/**
|
|
908
906
|
* Configuration for the cost evaluator.
|
|
909
907
|
* Checks execution cost against a budget.
|
|
910
908
|
*/
|
|
911
|
-
type CostEvaluatorConfig = {
|
|
909
|
+
type CostGraderConfig = {
|
|
912
910
|
readonly name: string;
|
|
913
911
|
readonly type: 'cost';
|
|
914
912
|
/** Maximum allowed cost in USD */
|
|
@@ -917,14 +915,14 @@ type CostEvaluatorConfig = {
|
|
|
917
915
|
readonly required?: boolean | number;
|
|
918
916
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
919
917
|
readonly min_score?: number;
|
|
920
|
-
/** When true, inverts the
|
|
918
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
921
919
|
readonly negate?: boolean;
|
|
922
920
|
};
|
|
923
921
|
/**
|
|
924
922
|
* Configuration for the token-usage evaluator.
|
|
925
923
|
* Checks provider-reported token usage against configured limits.
|
|
926
924
|
*/
|
|
927
|
-
type TokenUsageEvaluatorConfig = {
|
|
925
|
+
type TokenUsageGraderConfig = {
|
|
928
926
|
readonly name: string;
|
|
929
927
|
readonly type: 'token-usage';
|
|
930
928
|
/** Maximum allowed total tokens (input + output + cached, when present) */
|
|
@@ -937,7 +935,7 @@ type TokenUsageEvaluatorConfig = {
|
|
|
937
935
|
readonly required?: boolean | number;
|
|
938
936
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
939
937
|
readonly min_score?: number;
|
|
940
|
-
/** When true, inverts the
|
|
938
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
941
939
|
readonly negate?: boolean;
|
|
942
940
|
};
|
|
943
941
|
/**
|
|
@@ -945,7 +943,7 @@ type TokenUsageEvaluatorConfig = {
|
|
|
945
943
|
* Provides declarative threshold-based checks on execution metrics.
|
|
946
944
|
* Only specified thresholds are checked; omitted ones are ignored.
|
|
947
945
|
*/
|
|
948
|
-
type ExecutionMetricsEvaluatorConfig = {
|
|
946
|
+
type ExecutionMetricsGraderConfig = {
|
|
949
947
|
readonly name: string;
|
|
950
948
|
readonly type: 'execution-metrics';
|
|
951
949
|
/** Maximum allowed number of tool calls */
|
|
@@ -966,14 +964,14 @@ type ExecutionMetricsEvaluatorConfig = {
|
|
|
966
964
|
readonly required?: boolean | number;
|
|
967
965
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
968
966
|
readonly min_score?: number;
|
|
969
|
-
/** When true, inverts the
|
|
967
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
970
968
|
readonly negate?: boolean;
|
|
971
969
|
};
|
|
972
970
|
/**
|
|
973
971
|
* Configuration for the contains assertion evaluator.
|
|
974
972
|
* Checks whether the candidate output contains a specified substring.
|
|
975
973
|
*/
|
|
976
|
-
type ContainsEvaluatorConfig = {
|
|
974
|
+
type ContainsGraderConfig = {
|
|
977
975
|
readonly name: string;
|
|
978
976
|
readonly type: 'contains';
|
|
979
977
|
readonly value: string;
|
|
@@ -981,14 +979,14 @@ type ContainsEvaluatorConfig = {
|
|
|
981
979
|
readonly required?: boolean | number;
|
|
982
980
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
983
981
|
readonly min_score?: number;
|
|
984
|
-
/** When true, inverts the
|
|
982
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
985
983
|
readonly negate?: boolean;
|
|
986
984
|
};
|
|
987
985
|
/**
|
|
988
986
|
* Configuration for the contains_any assertion evaluator.
|
|
989
987
|
* Checks whether the candidate output contains ANY of the specified substrings.
|
|
990
988
|
*/
|
|
991
|
-
type ContainsAnyEvaluatorConfig = {
|
|
989
|
+
type ContainsAnyGraderConfig = {
|
|
992
990
|
readonly name: string;
|
|
993
991
|
readonly type: 'contains-any';
|
|
994
992
|
readonly value: readonly string[];
|
|
@@ -996,14 +994,14 @@ type ContainsAnyEvaluatorConfig = {
|
|
|
996
994
|
readonly required?: boolean | number;
|
|
997
995
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
998
996
|
readonly min_score?: number;
|
|
999
|
-
/** When true, inverts the
|
|
997
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1000
998
|
readonly negate?: boolean;
|
|
1001
999
|
};
|
|
1002
1000
|
/**
|
|
1003
1001
|
* Configuration for the contains_all assertion evaluator.
|
|
1004
1002
|
* Checks whether the candidate output contains ALL of the specified substrings.
|
|
1005
1003
|
*/
|
|
1006
|
-
type ContainsAllEvaluatorConfig = {
|
|
1004
|
+
type ContainsAllGraderConfig = {
|
|
1007
1005
|
readonly name: string;
|
|
1008
1006
|
readonly type: 'contains-all';
|
|
1009
1007
|
readonly value: readonly string[];
|
|
@@ -1011,14 +1009,14 @@ type ContainsAllEvaluatorConfig = {
|
|
|
1011
1009
|
readonly required?: boolean | number;
|
|
1012
1010
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1013
1011
|
readonly min_score?: number;
|
|
1014
|
-
/** When true, inverts the
|
|
1012
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1015
1013
|
readonly negate?: boolean;
|
|
1016
1014
|
};
|
|
1017
1015
|
/**
|
|
1018
1016
|
* Configuration for the icontains assertion evaluator.
|
|
1019
1017
|
* Case-insensitive check whether the candidate output contains a specified substring.
|
|
1020
1018
|
*/
|
|
1021
|
-
type IcontainsEvaluatorConfig = {
|
|
1019
|
+
type IcontainsGraderConfig = {
|
|
1022
1020
|
readonly name: string;
|
|
1023
1021
|
readonly type: 'icontains';
|
|
1024
1022
|
readonly value: string;
|
|
@@ -1026,14 +1024,14 @@ type IcontainsEvaluatorConfig = {
|
|
|
1026
1024
|
readonly required?: boolean | number;
|
|
1027
1025
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1028
1026
|
readonly min_score?: number;
|
|
1029
|
-
/** When true, inverts the
|
|
1027
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1030
1028
|
readonly negate?: boolean;
|
|
1031
1029
|
};
|
|
1032
1030
|
/**
|
|
1033
1031
|
* Configuration for the icontains_any assertion evaluator.
|
|
1034
1032
|
* Case-insensitive check whether the candidate output contains ANY of the specified substrings.
|
|
1035
1033
|
*/
|
|
1036
|
-
type IcontainsAnyEvaluatorConfig = {
|
|
1034
|
+
type IcontainsAnyGraderConfig = {
|
|
1037
1035
|
readonly name: string;
|
|
1038
1036
|
readonly type: 'icontains-any';
|
|
1039
1037
|
readonly value: readonly string[];
|
|
@@ -1041,14 +1039,14 @@ type IcontainsAnyEvaluatorConfig = {
|
|
|
1041
1039
|
readonly required?: boolean | number;
|
|
1042
1040
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1043
1041
|
readonly min_score?: number;
|
|
1044
|
-
/** When true, inverts the
|
|
1042
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1045
1043
|
readonly negate?: boolean;
|
|
1046
1044
|
};
|
|
1047
1045
|
/**
|
|
1048
1046
|
* Configuration for the icontains_all assertion evaluator.
|
|
1049
1047
|
* Case-insensitive check whether the candidate output contains ALL of the specified substrings.
|
|
1050
1048
|
*/
|
|
1051
|
-
type IcontainsAllEvaluatorConfig = {
|
|
1049
|
+
type IcontainsAllGraderConfig = {
|
|
1052
1050
|
readonly name: string;
|
|
1053
1051
|
readonly type: 'icontains-all';
|
|
1054
1052
|
readonly value: readonly string[];
|
|
@@ -1056,14 +1054,14 @@ type IcontainsAllEvaluatorConfig = {
|
|
|
1056
1054
|
readonly required?: boolean | number;
|
|
1057
1055
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1058
1056
|
readonly min_score?: number;
|
|
1059
|
-
/** When true, inverts the
|
|
1057
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1060
1058
|
readonly negate?: boolean;
|
|
1061
1059
|
};
|
|
1062
1060
|
/**
|
|
1063
1061
|
* Configuration for the starts_with assertion evaluator.
|
|
1064
1062
|
* Checks whether the candidate output starts with a specified string (both trimmed).
|
|
1065
1063
|
*/
|
|
1066
|
-
type StartsWithEvaluatorConfig = {
|
|
1064
|
+
type StartsWithGraderConfig = {
|
|
1067
1065
|
readonly name: string;
|
|
1068
1066
|
readonly type: 'starts-with';
|
|
1069
1067
|
readonly value: string;
|
|
@@ -1071,14 +1069,14 @@ type StartsWithEvaluatorConfig = {
|
|
|
1071
1069
|
readonly required?: boolean | number;
|
|
1072
1070
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1073
1071
|
readonly min_score?: number;
|
|
1074
|
-
/** When true, inverts the
|
|
1072
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1075
1073
|
readonly negate?: boolean;
|
|
1076
1074
|
};
|
|
1077
1075
|
/**
|
|
1078
1076
|
* Configuration for the ends_with assertion evaluator.
|
|
1079
1077
|
* Checks whether the candidate output ends with a specified string (both trimmed).
|
|
1080
1078
|
*/
|
|
1081
|
-
type EndsWithEvaluatorConfig = {
|
|
1079
|
+
type EndsWithGraderConfig = {
|
|
1082
1080
|
readonly name: string;
|
|
1083
1081
|
readonly type: 'ends-with';
|
|
1084
1082
|
readonly value: string;
|
|
@@ -1086,14 +1084,14 @@ type EndsWithEvaluatorConfig = {
|
|
|
1086
1084
|
readonly required?: boolean | number;
|
|
1087
1085
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1088
1086
|
readonly min_score?: number;
|
|
1089
|
-
/** When true, inverts the
|
|
1087
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1090
1088
|
readonly negate?: boolean;
|
|
1091
1089
|
};
|
|
1092
1090
|
/**
|
|
1093
1091
|
* Configuration for the regex assertion evaluator.
|
|
1094
1092
|
* Checks whether the candidate output matches a regular expression pattern.
|
|
1095
1093
|
*/
|
|
1096
|
-
type RegexEvaluatorConfig = {
|
|
1094
|
+
type RegexGraderConfig = {
|
|
1097
1095
|
readonly name: string;
|
|
1098
1096
|
readonly type: 'regex';
|
|
1099
1097
|
readonly value: string;
|
|
@@ -1103,28 +1101,28 @@ type RegexEvaluatorConfig = {
|
|
|
1103
1101
|
readonly required?: boolean | number;
|
|
1104
1102
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1105
1103
|
readonly min_score?: number;
|
|
1106
|
-
/** When true, inverts the
|
|
1104
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1107
1105
|
readonly negate?: boolean;
|
|
1108
1106
|
};
|
|
1109
1107
|
/**
|
|
1110
1108
|
* Configuration for the is_json assertion evaluator.
|
|
1111
1109
|
* Checks whether the candidate output is valid JSON.
|
|
1112
1110
|
*/
|
|
1113
|
-
type
|
|
1111
|
+
type IsJsonGraderConfig = {
|
|
1114
1112
|
readonly name: string;
|
|
1115
1113
|
readonly type: 'is-json';
|
|
1116
1114
|
readonly weight?: number;
|
|
1117
1115
|
readonly required?: boolean | number;
|
|
1118
1116
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1119
1117
|
readonly min_score?: number;
|
|
1120
|
-
/** When true, inverts the
|
|
1118
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1121
1119
|
readonly negate?: boolean;
|
|
1122
1120
|
};
|
|
1123
1121
|
/**
|
|
1124
1122
|
* Configuration for the equals assertion evaluator.
|
|
1125
1123
|
* Checks whether the candidate output exactly equals a specified string.
|
|
1126
1124
|
*/
|
|
1127
|
-
type EqualsEvaluatorConfig = {
|
|
1125
|
+
type EqualsGraderConfig = {
|
|
1128
1126
|
readonly name: string;
|
|
1129
1127
|
readonly type: 'equals';
|
|
1130
1128
|
readonly value: string;
|
|
@@ -1132,7 +1130,7 @@ type EqualsEvaluatorConfig = {
|
|
|
1132
1130
|
readonly required?: boolean | number;
|
|
1133
1131
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1134
1132
|
readonly min_score?: number;
|
|
1135
|
-
/** When true, inverts the
|
|
1133
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1136
1134
|
readonly negate?: boolean;
|
|
1137
1135
|
};
|
|
1138
1136
|
/**
|
|
@@ -1147,7 +1145,7 @@ type RubricsEvaluatorConfig = {
|
|
|
1147
1145
|
readonly required?: boolean | number;
|
|
1148
1146
|
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1149
1147
|
readonly min_score?: number;
|
|
1150
|
-
/** When true, inverts the
|
|
1148
|
+
/** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
|
|
1151
1149
|
readonly negate?: boolean;
|
|
1152
1150
|
};
|
|
1153
1151
|
/**
|
|
@@ -1156,7 +1154,7 @@ type RubricsEvaluatorConfig = {
|
|
|
1156
1154
|
* Tool-name resolution is automatic based on the provider kind.
|
|
1157
1155
|
* For providers not covered by the built-in mapping, use a code-grader.
|
|
1158
1156
|
*/
|
|
1159
|
-
type
|
|
1157
|
+
type SkillTriggerGraderConfig = {
|
|
1160
1158
|
readonly name: string;
|
|
1161
1159
|
readonly type: 'skill-trigger';
|
|
1162
1160
|
/** The skill name to check for (case-sensitive substring match) */
|
|
@@ -1182,7 +1180,7 @@ type InlineAssertEvaluatorConfig = {
|
|
|
1182
1180
|
readonly min_score?: number;
|
|
1183
1181
|
readonly negate?: boolean;
|
|
1184
1182
|
};
|
|
1185
|
-
type
|
|
1183
|
+
type GraderConfig = CodeGraderConfig | LlmGraderConfig | CompositeGraderConfig | ToolTrajectoryGraderConfig | FieldAccuracyGraderConfig | LatencyGraderConfig | CostGraderConfig | TokenUsageGraderConfig | ExecutionMetricsGraderConfig | SkillTriggerGraderConfig | ContainsGraderConfig | ContainsAnyGraderConfig | ContainsAllGraderConfig | IcontainsGraderConfig | IcontainsAnyGraderConfig | IcontainsAllGraderConfig | StartsWithGraderConfig | EndsWithGraderConfig | RegexGraderConfig | IsJsonGraderConfig | EqualsGraderConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
|
|
1186
1184
|
/**
|
|
1187
1185
|
* A single turn in a multi-turn conversation evaluation.
|
|
1188
1186
|
* Each turn is a user message. The runner generates the assistant response.
|
|
@@ -1193,7 +1191,7 @@ interface ConversationTurn {
|
|
|
1193
1191
|
/** Reference assistant response for grading (NOT carried forward — actual LLM response is used) */
|
|
1194
1192
|
readonly expected_output?: TestMessageContent;
|
|
1195
1193
|
/** Per-turn assertions. Strings become rubric criteria via shorthand. */
|
|
1196
|
-
readonly assertions?: readonly (string |
|
|
1194
|
+
readonly assertions?: readonly (string | GraderConfig)[];
|
|
1197
1195
|
}
|
|
1198
1196
|
/**
|
|
1199
1197
|
* Conversation evaluation mode.
|
|
@@ -1228,8 +1226,8 @@ interface EvalTest {
|
|
|
1228
1226
|
readonly reference_answer?: string;
|
|
1229
1227
|
readonly file_paths: readonly string[];
|
|
1230
1228
|
readonly criteria: string;
|
|
1231
|
-
readonly evaluator?:
|
|
1232
|
-
readonly assertions?: readonly
|
|
1229
|
+
readonly evaluator?: GraderKind;
|
|
1230
|
+
readonly assertions?: readonly GraderConfig[];
|
|
1233
1231
|
/** Suite-level preprocessors used by the implicit default llm-grader. */
|
|
1234
1232
|
readonly preprocessors?: readonly ContentPreprocessorConfig[];
|
|
1235
1233
|
/** Workspace configuration (merged from suite-level and case-level) */
|
|
@@ -1293,7 +1291,7 @@ interface TrialResult {
|
|
|
1293
1291
|
readonly attempt: number;
|
|
1294
1292
|
readonly score: number;
|
|
1295
1293
|
readonly verdict: EvaluationVerdict;
|
|
1296
|
-
readonly scores?: readonly
|
|
1294
|
+
readonly scores?: readonly GraderResult[];
|
|
1297
1295
|
readonly error?: string;
|
|
1298
1296
|
readonly costUsd?: number;
|
|
1299
1297
|
/** Primary classification for this trial attempt */
|
|
@@ -1359,7 +1357,7 @@ interface ExecutionError {
|
|
|
1359
1357
|
*/
|
|
1360
1358
|
type FailOnError = boolean;
|
|
1361
1359
|
/**
|
|
1362
|
-
*
|
|
1360
|
+
* Grader scorecard for a single eval case run.
|
|
1363
1361
|
*/
|
|
1364
1362
|
interface EvaluationResult {
|
|
1365
1363
|
readonly timestamp: string;
|
|
@@ -1390,7 +1388,7 @@ interface EvaluationResult {
|
|
|
1390
1388
|
readonly lm?: JsonObject;
|
|
1391
1389
|
readonly evaluator?: JsonObject;
|
|
1392
1390
|
};
|
|
1393
|
-
readonly scores?: readonly
|
|
1391
|
+
readonly scores?: readonly GraderResult[];
|
|
1394
1392
|
readonly error?: string;
|
|
1395
1393
|
/** Lightweight summary of the execution trace (always included when available) */
|
|
1396
1394
|
readonly trace?: TraceSummary;
|
|
@@ -1433,9 +1431,9 @@ interface EvaluationResult {
|
|
|
1433
1431
|
readonly executionError?: ExecutionError;
|
|
1434
1432
|
}
|
|
1435
1433
|
type EvaluationVerdict = 'pass' | 'fail' | 'skip';
|
|
1436
|
-
interface EvaluatorResult {
|
|
1434
|
+
interface GraderResult {
|
|
1437
1435
|
readonly name: string;
|
|
1438
|
-
readonly type:
|
|
1436
|
+
readonly type: GraderKind;
|
|
1439
1437
|
readonly score: number;
|
|
1440
1438
|
readonly weight?: number;
|
|
1441
1439
|
readonly verdict?: EvaluationVerdict;
|
|
@@ -1444,7 +1442,7 @@ interface EvaluatorResult {
|
|
|
1444
1442
|
readonly input?: JsonObject;
|
|
1445
1443
|
/** Target name used for grading (e.g., the LLM provider name). */
|
|
1446
1444
|
readonly target?: string;
|
|
1447
|
-
readonly scores?: readonly
|
|
1445
|
+
readonly scores?: readonly GraderResult[];
|
|
1448
1446
|
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
|
|
1449
1447
|
readonly details?: JsonObject;
|
|
1450
1448
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
@@ -1457,156 +1455,558 @@ interface EvaluatorResult {
|
|
|
1457
1455
|
readonly endedAt?: string;
|
|
1458
1456
|
}
|
|
1459
1457
|
|
|
1460
|
-
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1458
|
+
/**
|
|
1459
|
+
* Strict normalized schema for CLI target configuration.
|
|
1460
|
+
* This is the final validated shape after environment variable resolution
|
|
1461
|
+
* and internal field normalization.
|
|
1462
|
+
*
|
|
1463
|
+
* Uses .strict() to reject unknown properties, ensuring configuration
|
|
1464
|
+
* errors are caught early rather than silently ignored.
|
|
1465
|
+
*
|
|
1466
|
+
* @example
|
|
1467
|
+
* ```typescript
|
|
1468
|
+
* const config: CliNormalizedConfig = {
|
|
1469
|
+
* command: 'agent run {PROMPT}',
|
|
1470
|
+
* timeoutMs: 120000,
|
|
1471
|
+
* verbose: true,
|
|
1472
|
+
* };
|
|
1473
|
+
* CliTargetConfigSchema.parse(config); // Validates the normalized config
|
|
1474
|
+
* ```
|
|
1475
|
+
*/
|
|
1476
|
+
declare const CliTargetConfigSchema: z.ZodObject<{
|
|
1477
|
+
command: z.ZodString;
|
|
1478
|
+
filesFormat: z.ZodOptional<z.ZodString>;
|
|
1479
|
+
cwd: z.ZodOptional<z.ZodString>;
|
|
1480
|
+
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1481
|
+
healthcheck: z.ZodOptional<z.ZodUnion<[z.ZodObject<{
|
|
1482
|
+
url: z.ZodString;
|
|
1483
|
+
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1484
|
+
}, "strict", z.ZodTypeAny, {
|
|
1485
|
+
url: string;
|
|
1486
|
+
timeoutMs?: number | undefined;
|
|
1471
1487
|
}, {
|
|
1472
|
-
|
|
1473
|
-
|
|
1474
|
-
}
|
|
1475
|
-
|
|
1476
|
-
|
|
1477
|
-
|
|
1478
|
-
|
|
1479
|
-
|
|
1480
|
-
|
|
1481
|
-
|
|
1482
|
-
|
|
1488
|
+
url: string;
|
|
1489
|
+
timeoutMs?: number | undefined;
|
|
1490
|
+
}>, z.ZodObject<{
|
|
1491
|
+
command: z.ZodString;
|
|
1492
|
+
cwd: z.ZodOptional<z.ZodString>;
|
|
1493
|
+
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1494
|
+
}, "strict", z.ZodTypeAny, {
|
|
1495
|
+
command: string;
|
|
1496
|
+
timeoutMs?: number | undefined;
|
|
1497
|
+
cwd?: string | undefined;
|
|
1498
|
+
}, {
|
|
1499
|
+
command: string;
|
|
1500
|
+
timeoutMs?: number | undefined;
|
|
1501
|
+
cwd?: string | undefined;
|
|
1502
|
+
}>]>>;
|
|
1503
|
+
verbose: z.ZodOptional<z.ZodBoolean>;
|
|
1504
|
+
keepTempFiles: z.ZodOptional<z.ZodBoolean>;
|
|
1505
|
+
}, "strict", z.ZodTypeAny, {
|
|
1506
|
+
command: string;
|
|
1507
|
+
timeoutMs?: number | undefined;
|
|
1508
|
+
cwd?: string | undefined;
|
|
1509
|
+
verbose?: boolean | undefined;
|
|
1510
|
+
healthcheck?: {
|
|
1511
|
+
url: string;
|
|
1512
|
+
timeoutMs?: number | undefined;
|
|
1513
|
+
} | {
|
|
1514
|
+
command: string;
|
|
1515
|
+
timeoutMs?: number | undefined;
|
|
1516
|
+
cwd?: string | undefined;
|
|
1483
1517
|
} | undefined;
|
|
1518
|
+
filesFormat?: string | undefined;
|
|
1519
|
+
keepTempFiles?: boolean | undefined;
|
|
1484
1520
|
}, {
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
|
|
1491
|
-
|
|
1492
|
-
|
|
1521
|
+
command: string;
|
|
1522
|
+
timeoutMs?: number | undefined;
|
|
1523
|
+
cwd?: string | undefined;
|
|
1524
|
+
verbose?: boolean | undefined;
|
|
1525
|
+
healthcheck?: {
|
|
1526
|
+
url: string;
|
|
1527
|
+
timeoutMs?: number | undefined;
|
|
1528
|
+
} | {
|
|
1529
|
+
command: string;
|
|
1530
|
+
timeoutMs?: number | undefined;
|
|
1531
|
+
cwd?: string | undefined;
|
|
1493
1532
|
} | undefined;
|
|
1533
|
+
filesFormat?: string | undefined;
|
|
1534
|
+
keepTempFiles?: boolean | undefined;
|
|
1494
1535
|
}>;
|
|
1495
|
-
type
|
|
1496
|
-
|
|
1497
|
-
declare const DEFAULT_EVAL_PATTERNS: readonly string[];
|
|
1498
|
-
type ExecutionDefaults = {
|
|
1499
|
-
readonly verbose?: boolean;
|
|
1500
|
-
readonly keep_workspaces?: boolean;
|
|
1501
|
-
readonly otel_file?: string;
|
|
1502
|
-
readonly export_otel?: boolean;
|
|
1503
|
-
readonly otel_backend?: string;
|
|
1504
|
-
readonly otel_capture_content?: boolean;
|
|
1505
|
-
readonly otel_group_turns?: boolean;
|
|
1506
|
-
readonly pool_workspaces?: boolean;
|
|
1507
|
-
readonly pool_slots?: number;
|
|
1508
|
-
};
|
|
1509
|
-
type ResultsExportConfig = {
|
|
1510
|
-
readonly repo: string;
|
|
1511
|
-
readonly path: string;
|
|
1512
|
-
readonly auto_push?: boolean;
|
|
1513
|
-
readonly branch_prefix?: string;
|
|
1514
|
-
};
|
|
1515
|
-
type AgentVConfig$1 = {
|
|
1516
|
-
readonly required_version?: string;
|
|
1517
|
-
readonly eval_patterns?: readonly string[];
|
|
1518
|
-
readonly execution?: ExecutionDefaults;
|
|
1519
|
-
readonly results?: {
|
|
1520
|
-
readonly export?: ResultsExportConfig;
|
|
1521
|
-
};
|
|
1522
|
-
};
|
|
1523
|
-
/**
|
|
1524
|
-
* Load optional .agentv/config.yaml configuration file.
|
|
1525
|
-
* Searches from eval file directory up to repo root.
|
|
1526
|
-
*/
|
|
1527
|
-
declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<AgentVConfig$1 | null>;
|
|
1528
|
-
/**
|
|
1529
|
-
* Extract target name from parsed eval suite (checks execution.target then falls back to root-level target).
|
|
1530
|
-
*/
|
|
1531
|
-
declare function extractTargetFromSuite(suite: JsonObject): string | undefined;
|
|
1532
|
-
/**
|
|
1533
|
-
* Extract target refs from parsed eval suite.
|
|
1534
|
-
* Supports both string shorthand and object form with hooks.
|
|
1535
|
-
* Returns undefined when no targets array is specified.
|
|
1536
|
-
*/
|
|
1537
|
-
declare function extractTargetRefsFromSuite(suite: JsonObject): readonly EvalTargetRef[] | undefined;
|
|
1538
|
-
/**
|
|
1539
|
-
* Extract target names from parsed eval suite (backward-compat wrapper).
|
|
1540
|
-
* Precedence: execution.targets (array) > execution.target (singular).
|
|
1541
|
-
* Returns undefined when no targets array is specified.
|
|
1542
|
-
*/
|
|
1543
|
-
declare function extractTargetsFromSuite(suite: JsonObject): readonly string[] | undefined;
|
|
1544
|
-
/**
|
|
1545
|
-
* Extract workers count from suite-level execution block.
|
|
1546
|
-
*/
|
|
1547
|
-
declare function extractWorkersFromSuite(suite: JsonObject): number | undefined;
|
|
1548
|
-
/**
|
|
1549
|
-
* Extract per-test targets array from a raw test case object.
|
|
1550
|
-
*/
|
|
1551
|
-
declare function extractTargetsFromTestCase(testCase: JsonObject): readonly string[] | undefined;
|
|
1552
|
-
/**
|
|
1553
|
-
* Extract trials configuration from parsed eval suite's execution block.
|
|
1554
|
-
* Returns undefined when count is 1 or not specified (no-op).
|
|
1555
|
-
*/
|
|
1556
|
-
declare function extractTrialsConfig(suite: JsonObject): TrialsConfig | undefined;
|
|
1536
|
+
type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
|
|
1557
1537
|
/**
|
|
1558
|
-
*
|
|
1538
|
+
* Resolved CLI configuration type derived from CliTargetConfigSchema.
|
|
1539
|
+
* This is the final validated shape used by the CLI provider at runtime.
|
|
1540
|
+
* Using Readonly to ensure immutability for runtime safety.
|
|
1559
1541
|
*/
|
|
1560
|
-
|
|
1561
|
-
|
|
1562
|
-
readonly
|
|
1542
|
+
type CliResolvedConfig = Readonly<CliNormalizedConfig>;
|
|
1543
|
+
interface RetryConfig {
|
|
1544
|
+
readonly maxRetries?: number;
|
|
1545
|
+
readonly initialDelayMs?: number;
|
|
1546
|
+
readonly maxDelayMs?: number;
|
|
1547
|
+
readonly backoffFactor?: number;
|
|
1548
|
+
readonly retryableStatusCodes?: readonly number[];
|
|
1563
1549
|
}
|
|
1564
1550
|
/**
|
|
1565
|
-
*
|
|
1566
|
-
*
|
|
1567
|
-
|
|
1568
|
-
|
|
1569
|
-
|
|
1570
|
-
* Extract `execution.fail_on_error` from parsed eval suite.
|
|
1571
|
-
* Accepts `true` or `false`.
|
|
1572
|
-
* Returns undefined when not specified.
|
|
1551
|
+
* Selects which OpenAI-compatible API endpoint to use.
|
|
1552
|
+
* - "chat" (default): POST /chat/completions — universally supported by all OpenAI-compatible providers.
|
|
1553
|
+
* - "responses": POST /responses — only supported by api.openai.com.
|
|
1554
|
+
*
|
|
1555
|
+
* Maps to Vercel AI SDK methods: "chat" → provider.chat(model), "responses" → provider(model).
|
|
1573
1556
|
*/
|
|
1574
|
-
|
|
1557
|
+
type ApiFormat = 'chat' | 'responses';
|
|
1575
1558
|
/**
|
|
1576
|
-
*
|
|
1577
|
-
* Accepts a number in [0, 1] range.
|
|
1578
|
-
* Returns undefined when not specified.
|
|
1559
|
+
* Azure OpenAI settings used by the Vercel AI SDK.
|
|
1579
1560
|
*/
|
|
1580
|
-
|
|
1581
|
-
|
|
1561
|
+
interface AzureResolvedConfig {
|
|
1562
|
+
readonly resourceName: string;
|
|
1563
|
+
readonly deploymentName: string;
|
|
1564
|
+
readonly apiKey: string;
|
|
1565
|
+
readonly version?: string;
|
|
1566
|
+
readonly apiFormat?: ApiFormat;
|
|
1567
|
+
readonly temperature?: number;
|
|
1568
|
+
readonly maxOutputTokens?: number;
|
|
1569
|
+
readonly retry?: RetryConfig;
|
|
1570
|
+
}
|
|
1582
1571
|
/**
|
|
1583
|
-
*
|
|
1584
|
-
* - 'agent': File references only (for providers with filesystem access)
|
|
1585
|
-
* - 'lm': Embedded file content with XML tags (for language model providers)
|
|
1572
|
+
* OpenAI-compatible settings used by the Vercel AI SDK.
|
|
1586
1573
|
*/
|
|
1587
|
-
|
|
1588
|
-
|
|
1574
|
+
interface OpenAIResolvedConfig {
|
|
1575
|
+
readonly baseURL: string;
|
|
1576
|
+
readonly apiKey: string;
|
|
1577
|
+
readonly model: string;
|
|
1578
|
+
readonly apiFormat?: ApiFormat;
|
|
1579
|
+
readonly temperature?: number;
|
|
1580
|
+
readonly maxOutputTokens?: number;
|
|
1581
|
+
readonly retry?: RetryConfig;
|
|
1582
|
+
}
|
|
1589
1583
|
/**
|
|
1590
|
-
*
|
|
1584
|
+
* OpenRouter settings used by the Vercel AI SDK provider.
|
|
1591
1585
|
*/
|
|
1592
|
-
interface
|
|
1593
|
-
readonly
|
|
1594
|
-
readonly
|
|
1595
|
-
readonly
|
|
1586
|
+
interface OpenRouterResolvedConfig {
|
|
1587
|
+
readonly apiKey: string;
|
|
1588
|
+
readonly model: string;
|
|
1589
|
+
readonly temperature?: number;
|
|
1590
|
+
readonly maxOutputTokens?: number;
|
|
1591
|
+
readonly retry?: RetryConfig;
|
|
1596
1592
|
}
|
|
1597
1593
|
/**
|
|
1598
|
-
*
|
|
1599
|
-
*
|
|
1600
|
-
* @param testCase - The evaluation test case
|
|
1601
|
-
* @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
|
|
1594
|
+
* Anthropic Claude settings used by the Vercel AI SDK.
|
|
1602
1595
|
*/
|
|
1603
|
-
|
|
1604
|
-
|
|
1596
|
+
interface AnthropicResolvedConfig {
|
|
1597
|
+
readonly apiKey: string;
|
|
1598
|
+
readonly model: string;
|
|
1599
|
+
readonly temperature?: number;
|
|
1600
|
+
readonly maxOutputTokens?: number;
|
|
1601
|
+
readonly thinkingBudget?: number;
|
|
1602
|
+
readonly retry?: RetryConfig;
|
|
1603
|
+
}
|
|
1605
1604
|
/**
|
|
1606
|
-
*
|
|
1605
|
+
* Google Gemini settings used by the Vercel AI SDK.
|
|
1607
1606
|
*/
|
|
1608
|
-
|
|
1609
|
-
|
|
1607
|
+
interface GeminiResolvedConfig {
|
|
1608
|
+
readonly apiKey: string;
|
|
1609
|
+
readonly model: string;
|
|
1610
|
+
readonly temperature?: number;
|
|
1611
|
+
readonly maxOutputTokens?: number;
|
|
1612
|
+
readonly retry?: RetryConfig;
|
|
1613
|
+
}
|
|
1614
|
+
interface CodexResolvedConfig {
|
|
1615
|
+
readonly model?: string;
|
|
1616
|
+
readonly executable: string;
|
|
1617
|
+
readonly args?: readonly string[];
|
|
1618
|
+
readonly cwd?: string;
|
|
1619
|
+
readonly timeoutMs?: number;
|
|
1620
|
+
readonly logDir?: string;
|
|
1621
|
+
readonly logFormat?: 'summary' | 'json';
|
|
1622
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1623
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1624
|
+
readonly systemPrompt?: string;
|
|
1625
|
+
}
|
|
1626
|
+
interface CopilotCliResolvedConfig {
|
|
1627
|
+
readonly executable: string;
|
|
1628
|
+
readonly model?: string;
|
|
1629
|
+
readonly args?: readonly string[];
|
|
1630
|
+
readonly cwd?: string;
|
|
1631
|
+
readonly timeoutMs?: number;
|
|
1632
|
+
readonly logDir?: string;
|
|
1633
|
+
readonly logFormat?: 'summary' | 'json';
|
|
1634
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1635
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1636
|
+
readonly systemPrompt?: string;
|
|
1637
|
+
}
|
|
1638
|
+
interface CopilotSdkResolvedConfig {
|
|
1639
|
+
readonly cliUrl?: string;
|
|
1640
|
+
readonly cliPath?: string;
|
|
1641
|
+
readonly githubToken?: string;
|
|
1642
|
+
readonly model?: string;
|
|
1643
|
+
readonly cwd?: string;
|
|
1644
|
+
readonly timeoutMs?: number;
|
|
1645
|
+
readonly logDir?: string;
|
|
1646
|
+
readonly logFormat?: 'summary' | 'json';
|
|
1647
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1648
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1649
|
+
readonly systemPrompt?: string;
|
|
1650
|
+
/** BYOK provider type: "azure", "openai", or "anthropic". */
|
|
1651
|
+
readonly byokType?: string;
|
|
1652
|
+
/** BYOK base URL for the provider endpoint. */
|
|
1653
|
+
readonly byokBaseUrl?: string;
|
|
1654
|
+
/** BYOK API key for authenticating with the provider. */
|
|
1655
|
+
readonly byokApiKey?: string;
|
|
1656
|
+
/** BYOK bearer token (takes precedence over apiKey when set). */
|
|
1657
|
+
readonly byokBearerToken?: string;
|
|
1658
|
+
/** BYOK Azure API version (e.g. "2024-10-21"). Only used when byokType is "azure". */
|
|
1659
|
+
readonly byokApiVersion?: string;
|
|
1660
|
+
/** BYOK wire API format: "completions" or "responses". */
|
|
1661
|
+
readonly byokWireApi?: string;
|
|
1662
|
+
}
|
|
1663
|
+
interface CopilotLogResolvedConfig {
|
|
1664
|
+
/** Explicit path to a session directory containing events.jsonl. */
|
|
1665
|
+
readonly sessionDir?: string;
|
|
1666
|
+
/** Session UUID — combined with sessionStateDir to build the path. */
|
|
1667
|
+
readonly sessionId?: string;
|
|
1668
|
+
/** Auto-discovery mode. 'latest' picks the most recent session. */
|
|
1669
|
+
readonly discover?: 'latest';
|
|
1670
|
+
/** Override the default ~/.copilot/session-state directory. */
|
|
1671
|
+
readonly sessionStateDir?: string;
|
|
1672
|
+
/** Filter discovery by working directory. */
|
|
1673
|
+
readonly cwd?: string;
|
|
1674
|
+
}
|
|
1675
|
+
interface PiCodingAgentResolvedConfig {
|
|
1676
|
+
readonly subprovider?: string;
|
|
1677
|
+
readonly model?: string;
|
|
1678
|
+
readonly apiKey?: string;
|
|
1679
|
+
readonly baseUrl?: string;
|
|
1680
|
+
readonly tools?: string;
|
|
1681
|
+
readonly thinking?: string;
|
|
1682
|
+
readonly cwd?: string;
|
|
1683
|
+
readonly timeoutMs?: number;
|
|
1684
|
+
readonly logDir?: string;
|
|
1685
|
+
readonly logFormat?: 'summary' | 'json';
|
|
1686
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1687
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1688
|
+
readonly systemPrompt?: string;
|
|
1689
|
+
}
|
|
1690
|
+
interface PiCliResolvedConfig {
|
|
1691
|
+
readonly executable: string;
|
|
1692
|
+
readonly subprovider?: string;
|
|
1693
|
+
readonly model?: string;
|
|
1694
|
+
readonly apiKey?: string;
|
|
1695
|
+
readonly baseUrl?: string;
|
|
1696
|
+
readonly tools?: string;
|
|
1697
|
+
readonly thinking?: string;
|
|
1698
|
+
readonly args?: readonly string[];
|
|
1699
|
+
readonly cwd?: string;
|
|
1700
|
+
readonly timeoutMs?: number;
|
|
1701
|
+
readonly logDir?: string;
|
|
1702
|
+
readonly logFormat?: 'summary' | 'json';
|
|
1703
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1704
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1705
|
+
readonly systemPrompt?: string;
|
|
1706
|
+
}
|
|
1707
|
+
interface ClaudeResolvedConfig {
|
|
1708
|
+
readonly executable: string;
|
|
1709
|
+
readonly model?: string;
|
|
1710
|
+
readonly systemPrompt?: string;
|
|
1711
|
+
readonly cwd?: string;
|
|
1712
|
+
readonly timeoutMs?: number;
|
|
1713
|
+
readonly maxTurns?: number;
|
|
1714
|
+
readonly maxBudgetUsd?: number;
|
|
1715
|
+
readonly logDir?: string;
|
|
1716
|
+
readonly logFormat?: 'summary' | 'json';
|
|
1717
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1718
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1719
|
+
}
|
|
1720
|
+
interface MockResolvedConfig {
|
|
1721
|
+
readonly response?: string;
|
|
1722
|
+
readonly delayMs?: number;
|
|
1723
|
+
readonly delayMinMs?: number;
|
|
1724
|
+
readonly delayMaxMs?: number;
|
|
1725
|
+
}
|
|
1726
|
+
interface VSCodeResolvedConfig {
|
|
1727
|
+
readonly executable: string;
|
|
1728
|
+
readonly waitForResponse: boolean;
|
|
1729
|
+
readonly dryRun: boolean;
|
|
1730
|
+
readonly subagentRoot?: string;
|
|
1731
|
+
readonly timeoutMs?: number;
|
|
1732
|
+
}
|
|
1733
|
+
interface AgentVResolvedConfig {
|
|
1734
|
+
readonly model: string;
|
|
1735
|
+
readonly temperature: number;
|
|
1736
|
+
}
|
|
1737
|
+
/** Base fields shared by all resolved targets. */
|
|
1738
|
+
interface ResolvedTargetBase {
|
|
1739
|
+
readonly name: string;
|
|
1740
|
+
readonly graderTarget?: string;
|
|
1741
|
+
readonly workers?: number;
|
|
1742
|
+
readonly providerBatching?: boolean;
|
|
1743
|
+
/**
|
|
1744
|
+
* Whether this target can be executed via executor subagents in subagent mode.
|
|
1745
|
+
* Defaults to `true` for all non-CLI providers. Set `false` in targets.yaml
|
|
1746
|
+
* to force CLI invocation even in subagent mode.
|
|
1747
|
+
*/
|
|
1748
|
+
readonly subagentModeAllowed?: boolean;
|
|
1749
|
+
/**
|
|
1750
|
+
* Ordered list of target names to try when the primary target fails after
|
|
1751
|
+
* exhausting retries. Each fallback is attempted in order.
|
|
1752
|
+
*/
|
|
1753
|
+
readonly fallbackTargets?: readonly string[];
|
|
1754
|
+
}
|
|
1755
|
+
type ResolvedTarget = (ResolvedTargetBase & {
|
|
1756
|
+
readonly kind: 'openai';
|
|
1757
|
+
readonly config: OpenAIResolvedConfig;
|
|
1758
|
+
}) | (ResolvedTargetBase & {
|
|
1759
|
+
readonly kind: 'openrouter';
|
|
1760
|
+
readonly config: OpenRouterResolvedConfig;
|
|
1761
|
+
}) | (ResolvedTargetBase & {
|
|
1762
|
+
readonly kind: 'azure';
|
|
1763
|
+
readonly config: AzureResolvedConfig;
|
|
1764
|
+
}) | (ResolvedTargetBase & {
|
|
1765
|
+
readonly kind: 'anthropic';
|
|
1766
|
+
readonly config: AnthropicResolvedConfig;
|
|
1767
|
+
}) | (ResolvedTargetBase & {
|
|
1768
|
+
readonly kind: 'gemini';
|
|
1769
|
+
readonly config: GeminiResolvedConfig;
|
|
1770
|
+
}) | (ResolvedTargetBase & {
|
|
1771
|
+
readonly kind: 'codex';
|
|
1772
|
+
readonly config: CodexResolvedConfig;
|
|
1773
|
+
}) | (ResolvedTargetBase & {
|
|
1774
|
+
readonly kind: 'copilot-sdk';
|
|
1775
|
+
readonly config: CopilotSdkResolvedConfig;
|
|
1776
|
+
}) | (ResolvedTargetBase & {
|
|
1777
|
+
readonly kind: 'copilot-cli';
|
|
1778
|
+
readonly config: CopilotCliResolvedConfig;
|
|
1779
|
+
}) | (ResolvedTargetBase & {
|
|
1780
|
+
readonly kind: 'copilot-log';
|
|
1781
|
+
readonly config: CopilotLogResolvedConfig;
|
|
1782
|
+
}) | (ResolvedTargetBase & {
|
|
1783
|
+
readonly kind: 'pi-coding-agent';
|
|
1784
|
+
readonly config: PiCodingAgentResolvedConfig;
|
|
1785
|
+
}) | (ResolvedTargetBase & {
|
|
1786
|
+
readonly kind: 'pi-cli';
|
|
1787
|
+
readonly config: PiCliResolvedConfig;
|
|
1788
|
+
}) | (ResolvedTargetBase & {
|
|
1789
|
+
readonly kind: 'claude';
|
|
1790
|
+
readonly config: ClaudeResolvedConfig;
|
|
1791
|
+
}) | (ResolvedTargetBase & {
|
|
1792
|
+
readonly kind: 'claude-cli';
|
|
1793
|
+
readonly config: ClaudeResolvedConfig;
|
|
1794
|
+
}) | (ResolvedTargetBase & {
|
|
1795
|
+
readonly kind: 'claude-sdk';
|
|
1796
|
+
readonly config: ClaudeResolvedConfig;
|
|
1797
|
+
}) | (ResolvedTargetBase & {
|
|
1798
|
+
readonly kind: 'mock';
|
|
1799
|
+
readonly config: MockResolvedConfig;
|
|
1800
|
+
}) | (ResolvedTargetBase & {
|
|
1801
|
+
readonly kind: 'vscode' | 'vscode-insiders';
|
|
1802
|
+
readonly config: VSCodeResolvedConfig;
|
|
1803
|
+
}) | (ResolvedTargetBase & {
|
|
1804
|
+
readonly kind: 'agentv';
|
|
1805
|
+
readonly config: AgentVResolvedConfig;
|
|
1806
|
+
}) | (ResolvedTargetBase & {
|
|
1807
|
+
readonly kind: 'cli';
|
|
1808
|
+
readonly config: CliResolvedConfig;
|
|
1809
|
+
}) | (ResolvedTargetBase & {
|
|
1810
|
+
readonly kind: 'transcript';
|
|
1811
|
+
readonly config: Record<string, never>;
|
|
1812
|
+
});
|
|
1813
|
+
/**
|
|
1814
|
+
* Optional settings accepted on ALL target definitions regardless of provider.
|
|
1815
|
+
* Exported so the targets validator can reuse the same list — adding a field
|
|
1816
|
+
* here automatically makes it valid in targets.yaml without a separate update.
|
|
1817
|
+
*/
|
|
1818
|
+
declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "subagent_mode_allowed", "fallback_targets"];
|
|
1819
|
+
declare function resolveDelegatedTargetDefinition(name: string, definitions: ReadonlyMap<string, TargetDefinition>, env?: EnvLookup): TargetDefinition | undefined;
|
|
1820
|
+
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string, options?: {
|
|
1821
|
+
readonly emitDeprecationWarnings?: boolean;
|
|
1822
|
+
}): ResolvedTarget;
|
|
1823
|
+
|
|
1824
|
+
/**
|
|
1825
|
+
* Extensible provider registry.
|
|
1826
|
+
*
|
|
1827
|
+
* Replaces the hardcoded switch/case dispatch in createProvider() with
|
|
1828
|
+
* a registry of named factory functions. Built-in providers are registered
|
|
1829
|
+
* at startup; users can add custom providers via the registry API or by
|
|
1830
|
+
* dropping files in `.agentv/providers/`.
|
|
1831
|
+
*/
|
|
1832
|
+
|
|
1833
|
+
/**
|
|
1834
|
+
* Factory function that creates a Provider instance from a resolved target.
|
|
1835
|
+
*/
|
|
1836
|
+
type ProviderFactoryFn = (target: ResolvedTarget) => Provider;
|
|
1837
|
+
/**
|
|
1838
|
+
* Registry of provider factory functions keyed by provider kind.
|
|
1839
|
+
*
|
|
1840
|
+
* Built-in providers are registered at startup. Custom providers can be
|
|
1841
|
+
* registered via the `register()` method.
|
|
1842
|
+
*/
|
|
1843
|
+
declare class ProviderRegistry {
|
|
1844
|
+
private readonly factories;
|
|
1845
|
+
/** Register a factory function for a provider kind. */
|
|
1846
|
+
register(kind: string, factory: ProviderFactoryFn): this;
|
|
1847
|
+
/** Get the factory function for a provider kind. */
|
|
1848
|
+
get(kind: string): ProviderFactoryFn | undefined;
|
|
1849
|
+
/** Check if a factory is registered for the given kind. */
|
|
1850
|
+
has(kind: string): boolean;
|
|
1851
|
+
/** List all registered provider kind names. */
|
|
1852
|
+
list(): string[];
|
|
1853
|
+
/**
|
|
1854
|
+
* Create a provider instance from a resolved target.
|
|
1855
|
+
* Falls back to CLI provider for unknown kinds (custom provider escape hatch).
|
|
1856
|
+
*/
|
|
1857
|
+
create(target: ResolvedTarget): Provider;
|
|
1858
|
+
}
|
|
1859
|
+
|
|
1860
|
+
declare const MetadataSchema: z.ZodObject<{
|
|
1861
|
+
name: z.ZodString;
|
|
1862
|
+
description: z.ZodOptional<z.ZodString>;
|
|
1863
|
+
version: z.ZodOptional<z.ZodString>;
|
|
1864
|
+
author: z.ZodOptional<z.ZodString>;
|
|
1865
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
1866
|
+
license: z.ZodOptional<z.ZodString>;
|
|
1867
|
+
requires: z.ZodOptional<z.ZodObject<{
|
|
1868
|
+
agentv: z.ZodOptional<z.ZodString>;
|
|
1869
|
+
}, "strip", z.ZodTypeAny, {
|
|
1870
|
+
agentv?: string | undefined;
|
|
1871
|
+
}, {
|
|
1872
|
+
agentv?: string | undefined;
|
|
1873
|
+
}>>;
|
|
1874
|
+
}, "strip", z.ZodTypeAny, {
|
|
1875
|
+
name: string;
|
|
1876
|
+
description?: string | undefined;
|
|
1877
|
+
version?: string | undefined;
|
|
1878
|
+
author?: string | undefined;
|
|
1879
|
+
tags?: string[] | undefined;
|
|
1880
|
+
license?: string | undefined;
|
|
1881
|
+
requires?: {
|
|
1882
|
+
agentv?: string | undefined;
|
|
1883
|
+
} | undefined;
|
|
1884
|
+
}, {
|
|
1885
|
+
name: string;
|
|
1886
|
+
description?: string | undefined;
|
|
1887
|
+
version?: string | undefined;
|
|
1888
|
+
author?: string | undefined;
|
|
1889
|
+
tags?: string[] | undefined;
|
|
1890
|
+
license?: string | undefined;
|
|
1891
|
+
requires?: {
|
|
1892
|
+
agentv?: string | undefined;
|
|
1893
|
+
} | undefined;
|
|
1894
|
+
}>;
|
|
1895
|
+
type EvalMetadata = z.infer<typeof MetadataSchema>;
|
|
1896
|
+
|
|
1897
|
+
declare const DEFAULT_EVAL_PATTERNS: readonly string[];
|
|
1898
|
+
type ExecutionDefaults = {
|
|
1899
|
+
readonly verbose?: boolean;
|
|
1900
|
+
readonly keep_workspaces?: boolean;
|
|
1901
|
+
readonly otel_file?: string;
|
|
1902
|
+
readonly export_otel?: boolean;
|
|
1903
|
+
readonly otel_backend?: string;
|
|
1904
|
+
readonly otel_capture_content?: boolean;
|
|
1905
|
+
readonly otel_group_turns?: boolean;
|
|
1906
|
+
readonly pool_workspaces?: boolean;
|
|
1907
|
+
readonly pool_slots?: number;
|
|
1908
|
+
};
|
|
1909
|
+
type ResultsExportConfig = {
|
|
1910
|
+
readonly repo: string;
|
|
1911
|
+
readonly path: string;
|
|
1912
|
+
readonly auto_push?: boolean;
|
|
1913
|
+
readonly branch_prefix?: string;
|
|
1914
|
+
};
|
|
1915
|
+
type AgentVConfig$1 = {
|
|
1916
|
+
readonly required_version?: string;
|
|
1917
|
+
readonly eval_patterns?: readonly string[];
|
|
1918
|
+
readonly execution?: ExecutionDefaults;
|
|
1919
|
+
readonly results?: {
|
|
1920
|
+
readonly export?: ResultsExportConfig;
|
|
1921
|
+
};
|
|
1922
|
+
};
|
|
1923
|
+
/**
|
|
1924
|
+
* Load optional .agentv/config.yaml configuration file.
|
|
1925
|
+
* Searches from eval file directory up to repo root.
|
|
1926
|
+
*/
|
|
1927
|
+
declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<AgentVConfig$1 | null>;
|
|
1928
|
+
/**
|
|
1929
|
+
* Extract target name from parsed eval suite (checks execution.target then falls back to root-level target).
|
|
1930
|
+
*/
|
|
1931
|
+
declare function extractTargetFromSuite(suite: JsonObject): string | undefined;
|
|
1932
|
+
/**
|
|
1933
|
+
* Extract target refs from parsed eval suite.
|
|
1934
|
+
* Supports both string shorthand and object form with hooks.
|
|
1935
|
+
* Returns undefined when no targets array is specified.
|
|
1936
|
+
*/
|
|
1937
|
+
declare function extractTargetRefsFromSuite(suite: JsonObject): readonly EvalTargetRef[] | undefined;
|
|
1938
|
+
/**
|
|
1939
|
+
* Extract target names from parsed eval suite (backward-compat wrapper).
|
|
1940
|
+
* Precedence: execution.targets (array) > execution.target (singular).
|
|
1941
|
+
* Returns undefined when no targets array is specified.
|
|
1942
|
+
*/
|
|
1943
|
+
declare function extractTargetsFromSuite(suite: JsonObject): readonly string[] | undefined;
|
|
1944
|
+
/**
|
|
1945
|
+
* Extract workers count from suite-level execution block.
|
|
1946
|
+
*/
|
|
1947
|
+
declare function extractWorkersFromSuite(suite: JsonObject): number | undefined;
|
|
1948
|
+
/**
|
|
1949
|
+
* Extract per-test targets array from a raw test case object.
|
|
1950
|
+
*/
|
|
1951
|
+
declare function extractTargetsFromTestCase(testCase: JsonObject): readonly string[] | undefined;
|
|
1952
|
+
/**
|
|
1953
|
+
* Extract trials configuration from parsed eval suite's execution block.
|
|
1954
|
+
* Returns undefined when count is 1 or not specified (no-op).
|
|
1955
|
+
*/
|
|
1956
|
+
declare function extractTrialsConfig(suite: JsonObject): TrialsConfig | undefined;
|
|
1957
|
+
/**
|
|
1958
|
+
* Cache configuration parsed from execution block.
|
|
1959
|
+
*/
|
|
1960
|
+
interface CacheConfig {
|
|
1961
|
+
readonly enabled: boolean;
|
|
1962
|
+
readonly cachePath?: string;
|
|
1963
|
+
}
|
|
1964
|
+
/**
|
|
1965
|
+
* Extract cache configuration from parsed eval suite's execution block.
|
|
1966
|
+
* Returns undefined when no cache config is specified.
|
|
1967
|
+
*/
|
|
1968
|
+
declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
|
|
1969
|
+
/**
|
|
1970
|
+
* Extract `execution.fail_on_error` from parsed eval suite.
|
|
1971
|
+
* Accepts `true` or `false`.
|
|
1972
|
+
* Returns undefined when not specified.
|
|
1973
|
+
*/
|
|
1974
|
+
declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
|
|
1975
|
+
/**
|
|
1976
|
+
* Extract `execution.threshold` from parsed eval suite.
|
|
1977
|
+
* Accepts a number in [0, 1] range.
|
|
1978
|
+
* Returns undefined when not specified.
|
|
1979
|
+
*/
|
|
1980
|
+
declare function extractThreshold(suite: JsonObject): number | undefined;
|
|
1981
|
+
|
|
1982
|
+
/**
|
|
1983
|
+
* Formatting mode for segment content.
|
|
1984
|
+
* - 'agent': File references only (for providers with filesystem access)
|
|
1985
|
+
* - 'lm': Embedded file content with XML tags (for language model providers)
|
|
1986
|
+
*/
|
|
1987
|
+
type FormattingMode = 'agent' | 'lm';
|
|
1988
|
+
|
|
1989
|
+
/**
|
|
1990
|
+
* Build prompt inputs by consolidating user request context.
|
|
1991
|
+
*/
|
|
1992
|
+
interface PromptInputs {
|
|
1993
|
+
readonly question: string;
|
|
1994
|
+
readonly chatPrompt?: ChatPrompt;
|
|
1995
|
+
readonly systemMessage?: string;
|
|
1996
|
+
}
|
|
1997
|
+
/**
|
|
1998
|
+
* Build prompt inputs by consolidating user request context.
|
|
1999
|
+
*
|
|
2000
|
+
* @param testCase - The evaluation test case
|
|
2001
|
+
* @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
|
|
2002
|
+
*/
|
|
2003
|
+
declare function buildPromptInputs(testCase: EvalTest, mode?: FormattingMode): Promise<PromptInputs>;
|
|
2004
|
+
|
|
2005
|
+
/**
|
|
2006
|
+
* Detect file format by extension.
|
|
2007
|
+
*/
|
|
2008
|
+
declare function detectFormat(filePath: string): 'yaml' | 'jsonl' | 'agent-skills-json' | 'typescript';
|
|
2009
|
+
|
|
1610
2010
|
type LoadOptions = {
|
|
1611
2011
|
readonly verbose?: boolean;
|
|
1612
2012
|
/** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
|
|
@@ -1642,13 +2042,17 @@ type EvalSuiteResult = {
|
|
|
1642
2042
|
/** Suite-level metadata (name, description, version, etc.) */
|
|
1643
2043
|
readonly metadata?: EvalMetadata;
|
|
1644
2044
|
/** Suite-level total cost budget in USD */
|
|
1645
|
-
readonly
|
|
2045
|
+
readonly budgetUsd?: number;
|
|
1646
2046
|
/** Execution error tolerance: true or false */
|
|
1647
2047
|
readonly failOnError?: FailOnError;
|
|
1648
2048
|
/** Suite-level quality threshold (0-1) — suite fails if mean score is below */
|
|
1649
2049
|
readonly threshold?: number;
|
|
1650
2050
|
/** Resolved workspace.path from the eval YAML (after env-var expansion), if set */
|
|
1651
2051
|
readonly workspacePath?: string;
|
|
2052
|
+
/** Inline target definition from a TS eval config. */
|
|
2053
|
+
readonly inlineTarget?: TargetDefinition;
|
|
2054
|
+
/** Custom provider factory from a TS eval config task(). */
|
|
2055
|
+
readonly providerFactory?: ProviderFactoryFn;
|
|
1652
2056
|
};
|
|
1653
2057
|
/**
|
|
1654
2058
|
* Load tests and suite metadata from a single parse.
|
|
@@ -1695,495 +2099,370 @@ declare function isAgentSkillsFormat(parsed: unknown): parsed is AgentSkillsEval
|
|
|
1695
2099
|
declare function parseAgentSkillsEvals(parsed: unknown, source?: string, baseDir?: string): readonly EvalTest[];
|
|
1696
2100
|
|
|
1697
2101
|
/**
|
|
1698
|
-
*
|
|
2102
|
+
* Types for inline assertion functions used in the evaluate() API.
|
|
1699
2103
|
*
|
|
1700
|
-
*
|
|
1701
|
-
*
|
|
2104
|
+
* Inline functions are the escape hatch for custom evaluation logic
|
|
2105
|
+
* that doesn't fit a built-in grader type. For built-in assertions
|
|
2106
|
+
* (contains, regex, is-json, etc.), use config objects instead:
|
|
1702
2107
|
*
|
|
1703
|
-
*
|
|
1704
|
-
*/
|
|
1705
|
-
interface EvalsJsonCase {
|
|
1706
|
-
id: number;
|
|
1707
|
-
prompt: string;
|
|
1708
|
-
expected_output?: string;
|
|
1709
|
-
files?: string[];
|
|
1710
|
-
should_trigger?: boolean;
|
|
1711
|
-
assertions: string[];
|
|
1712
|
-
}
|
|
1713
|
-
interface EvalsJsonFile {
|
|
1714
|
-
skill_name: string;
|
|
1715
|
-
evals: EvalsJsonCase[];
|
|
1716
|
-
}
|
|
1717
|
-
/**
|
|
1718
|
-
* Result of transpiling a single EVAL.yaml.
|
|
1719
|
-
* May produce multiple evals.json files (one per skill).
|
|
1720
|
-
*/
|
|
1721
|
-
interface TranspileResult {
|
|
1722
|
-
/** Map from skill_name → EvalsJsonFile */
|
|
1723
|
-
files: Map<string, EvalsJsonFile>;
|
|
1724
|
-
/** Warning messages accumulated during transpilation */
|
|
1725
|
-
warnings: string[];
|
|
1726
|
-
}
|
|
1727
|
-
/**
|
|
1728
|
-
* Transpile a parsed EVAL.yaml object into one or more evals.json objects.
|
|
2108
|
+
* assert: [{ type: 'contains', value: 'hello' }]
|
|
1729
2109
|
*
|
|
1730
|
-
*
|
|
1731
|
-
* @param source Source identifier for error messages (e.g. file path)
|
|
1732
|
-
*/
|
|
1733
|
-
declare function transpileEvalYaml(suite: unknown, source?: string): TranspileResult;
|
|
1734
|
-
/**
|
|
1735
|
-
* Transpile an EVAL.yaml file into one or more evals.json objects.
|
|
1736
|
-
* Returns a map from output filename → JSON content.
|
|
2110
|
+
* Inline functions are for custom logic:
|
|
1737
2111
|
*
|
|
1738
|
-
*
|
|
1739
|
-
*/
|
|
1740
|
-
declare function transpileEvalYamlFile(evalYamlPath: string): TranspileResult;
|
|
1741
|
-
/**
|
|
1742
|
-
* Determine the output filename(s) for a transpile result.
|
|
1743
|
-
* Single skill → "evals.json"
|
|
1744
|
-
* Multiple skills → "<skill>.evals.json"
|
|
1745
|
-
*/
|
|
1746
|
-
declare function getOutputFilenames(result: TranspileResult): Map<string, string>;
|
|
1747
|
-
|
|
1748
|
-
declare function fileExists(filePath: string): Promise<boolean>;
|
|
1749
|
-
/**
|
|
1750
|
-
* Normalize line endings to LF (\n).
|
|
1751
|
-
* This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
|
|
1752
|
-
*/
|
|
1753
|
-
declare function normalizeLineEndings(content: string): string;
|
|
1754
|
-
/**
|
|
1755
|
-
* Read a text file and normalize line endings to LF (\n).
|
|
1756
|
-
* This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
|
|
1757
|
-
*/
|
|
1758
|
-
declare function readTextFile(filePath: string): Promise<string>;
|
|
1759
|
-
/**
|
|
1760
|
-
* Read a JSON file and parse it.
|
|
1761
|
-
*/
|
|
1762
|
-
declare function readJsonFile<T = unknown>(filePath: string): Promise<T>;
|
|
1763
|
-
/**
|
|
1764
|
-
* Find git repository root by walking up the directory tree.
|
|
1765
|
-
*/
|
|
1766
|
-
declare function findGitRoot(startPath: string): Promise<string | null>;
|
|
1767
|
-
/**
|
|
1768
|
-
* Build a chain of directories walking from a file's location up to repo root.
|
|
1769
|
-
* Used for discovering configuration files like targets.yaml or config.yaml.
|
|
1770
|
-
*/
|
|
1771
|
-
declare function buildDirectoryChain(filePath: string, repoRoot: string): readonly string[];
|
|
1772
|
-
/**
|
|
1773
|
-
* Build search roots for file resolution, matching yaml-parser behavior.
|
|
1774
|
-
* Searches from eval file directory up to repo root.
|
|
1775
|
-
*/
|
|
1776
|
-
declare function buildSearchRoots(evalPath: string, repoRoot: string): readonly string[];
|
|
1777
|
-
/**
|
|
1778
|
-
* Resolve a file reference using search roots, matching yaml-parser behavior.
|
|
2112
|
+
* assert: [({ output }) => ({ name: 'len', score: output.length > 5 ? 1 : 0 })]
|
|
1779
2113
|
*/
|
|
1780
|
-
|
|
1781
|
-
|
|
1782
|
-
readonly
|
|
1783
|
-
readonly
|
|
1784
|
-
|
|
2114
|
+
/** Context passed to inline assertion functions */
|
|
2115
|
+
interface AssertContext {
|
|
2116
|
+
readonly input: string;
|
|
2117
|
+
readonly output: string;
|
|
2118
|
+
readonly expectedOutput?: string;
|
|
2119
|
+
readonly criteria?: string;
|
|
2120
|
+
readonly metadata?: Record<string, unknown>;
|
|
2121
|
+
}
|
|
2122
|
+
/** Result from an inline assertion function */
|
|
2123
|
+
interface AssertResult {
|
|
2124
|
+
readonly name: string;
|
|
2125
|
+
readonly score: number;
|
|
2126
|
+
readonly metadata?: Record<string, unknown>;
|
|
2127
|
+
}
|
|
2128
|
+
/** Inline assertion function signature */
|
|
2129
|
+
type AssertFn = (ctx: AssertContext) => AssertResult | Promise<AssertResult>;
|
|
1785
2130
|
|
|
1786
2131
|
/**
|
|
1787
|
-
*
|
|
1788
|
-
* This is the final validated shape after environment variable resolution
|
|
1789
|
-
* and internal field normalization.
|
|
2132
|
+
* Programmatic API for running evaluations.
|
|
1790
2133
|
*
|
|
1791
|
-
*
|
|
1792
|
-
*
|
|
2134
|
+
* Provides `evaluate()` — a high-level function for using AgentV as a library
|
|
2135
|
+
* instead of a CLI. The config shape mirrors the YAML structure for easy
|
|
2136
|
+
* translation between file-based and programmatic usage.
|
|
1793
2137
|
*
|
|
1794
|
-
* @example
|
|
2138
|
+
* @example Inline tests with config objects
|
|
1795
2139
|
* ```typescript
|
|
1796
|
-
*
|
|
1797
|
-
*
|
|
1798
|
-
*
|
|
1799
|
-
*
|
|
1800
|
-
*
|
|
1801
|
-
*
|
|
2140
|
+
* import { evaluate } from '@agentv/core';
|
|
2141
|
+
*
|
|
2142
|
+
* const results = await evaluate({
|
|
2143
|
+
* tests: [
|
|
2144
|
+
* {
|
|
2145
|
+
* id: 'capital',
|
|
2146
|
+
* input: 'What is the capital of France?',
|
|
2147
|
+
* expectedOutput: 'Paris',
|
|
2148
|
+
* assert: [{ type: 'contains', value: 'Paris' }],
|
|
2149
|
+
* },
|
|
2150
|
+
* ],
|
|
2151
|
+
* target: { provider: 'mock_agent' },
|
|
2152
|
+
* });
|
|
2153
|
+
*
|
|
2154
|
+
* console.log(results.summary.passed, 'passed');
|
|
2155
|
+
* ```
|
|
2156
|
+
*
|
|
2157
|
+
* @example Inline tests with task function and custom assertion
|
|
2158
|
+
* ```typescript
|
|
2159
|
+
* import { evaluate } from '@agentv/core';
|
|
2160
|
+
*
|
|
2161
|
+
* const { summary } = await evaluate({
|
|
2162
|
+
* tests: [
|
|
2163
|
+
* {
|
|
2164
|
+
* id: 'echo',
|
|
2165
|
+
* input: 'hello',
|
|
2166
|
+
* expectedOutput: 'Echo: hello',
|
|
2167
|
+
* assert: [
|
|
2168
|
+
* { type: 'contains', value: 'hello' },
|
|
2169
|
+
* { type: 'equals' },
|
|
2170
|
+
* ({ output }) => ({ name: 'custom', score: output.length > 0 ? 1 : 0 }),
|
|
2171
|
+
* ],
|
|
2172
|
+
* },
|
|
2173
|
+
* ],
|
|
2174
|
+
* task: async (input) => `Echo: ${input}`,
|
|
2175
|
+
* });
|
|
1802
2176
|
* ```
|
|
1803
|
-
*/
|
|
1804
|
-
declare const CliTargetConfigSchema: z.ZodObject<{
|
|
1805
|
-
command: z.ZodString;
|
|
1806
|
-
filesFormat: z.ZodOptional<z.ZodString>;
|
|
1807
|
-
cwd: z.ZodOptional<z.ZodString>;
|
|
1808
|
-
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1809
|
-
healthcheck: z.ZodOptional<z.ZodUnion<[z.ZodObject<{
|
|
1810
|
-
url: z.ZodString;
|
|
1811
|
-
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1812
|
-
}, "strict", z.ZodTypeAny, {
|
|
1813
|
-
url: string;
|
|
1814
|
-
timeoutMs?: number | undefined;
|
|
1815
|
-
}, {
|
|
1816
|
-
url: string;
|
|
1817
|
-
timeoutMs?: number | undefined;
|
|
1818
|
-
}>, z.ZodObject<{
|
|
1819
|
-
command: z.ZodString;
|
|
1820
|
-
cwd: z.ZodOptional<z.ZodString>;
|
|
1821
|
-
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1822
|
-
}, "strict", z.ZodTypeAny, {
|
|
1823
|
-
command: string;
|
|
1824
|
-
timeoutMs?: number | undefined;
|
|
1825
|
-
cwd?: string | undefined;
|
|
1826
|
-
}, {
|
|
1827
|
-
command: string;
|
|
1828
|
-
timeoutMs?: number | undefined;
|
|
1829
|
-
cwd?: string | undefined;
|
|
1830
|
-
}>]>>;
|
|
1831
|
-
verbose: z.ZodOptional<z.ZodBoolean>;
|
|
1832
|
-
keepTempFiles: z.ZodOptional<z.ZodBoolean>;
|
|
1833
|
-
}, "strict", z.ZodTypeAny, {
|
|
1834
|
-
command: string;
|
|
1835
|
-
timeoutMs?: number | undefined;
|
|
1836
|
-
cwd?: string | undefined;
|
|
1837
|
-
verbose?: boolean | undefined;
|
|
1838
|
-
healthcheck?: {
|
|
1839
|
-
url: string;
|
|
1840
|
-
timeoutMs?: number | undefined;
|
|
1841
|
-
} | {
|
|
1842
|
-
command: string;
|
|
1843
|
-
timeoutMs?: number | undefined;
|
|
1844
|
-
cwd?: string | undefined;
|
|
1845
|
-
} | undefined;
|
|
1846
|
-
filesFormat?: string | undefined;
|
|
1847
|
-
keepTempFiles?: boolean | undefined;
|
|
1848
|
-
}, {
|
|
1849
|
-
command: string;
|
|
1850
|
-
timeoutMs?: number | undefined;
|
|
1851
|
-
cwd?: string | undefined;
|
|
1852
|
-
verbose?: boolean | undefined;
|
|
1853
|
-
healthcheck?: {
|
|
1854
|
-
url: string;
|
|
1855
|
-
timeoutMs?: number | undefined;
|
|
1856
|
-
} | {
|
|
1857
|
-
command: string;
|
|
1858
|
-
timeoutMs?: number | undefined;
|
|
1859
|
-
cwd?: string | undefined;
|
|
1860
|
-
} | undefined;
|
|
1861
|
-
filesFormat?: string | undefined;
|
|
1862
|
-
keepTempFiles?: boolean | undefined;
|
|
1863
|
-
}>;
|
|
1864
|
-
type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
|
|
1865
|
-
/**
|
|
1866
|
-
* Resolved CLI configuration type derived from CliTargetConfigSchema.
|
|
1867
|
-
* This is the final validated shape used by the CLI provider at runtime.
|
|
1868
|
-
* Using Readonly to ensure immutability for runtime safety.
|
|
1869
|
-
*/
|
|
1870
|
-
type CliResolvedConfig = Readonly<CliNormalizedConfig>;
|
|
1871
|
-
interface RetryConfig {
|
|
1872
|
-
readonly maxRetries?: number;
|
|
1873
|
-
readonly initialDelayMs?: number;
|
|
1874
|
-
readonly maxDelayMs?: number;
|
|
1875
|
-
readonly backoffFactor?: number;
|
|
1876
|
-
readonly retryableStatusCodes?: readonly number[];
|
|
1877
|
-
}
|
|
1878
|
-
/**
|
|
1879
|
-
* Selects which OpenAI-compatible API endpoint to use.
|
|
1880
|
-
* - "chat" (default): POST /chat/completions — universally supported by all OpenAI-compatible providers.
|
|
1881
|
-
* - "responses": POST /responses — only supported by api.openai.com.
|
|
1882
2177
|
*
|
|
1883
|
-
*
|
|
2178
|
+
* @example File-based
|
|
2179
|
+
* ```typescript
|
|
2180
|
+
* const results = await evaluate({
|
|
2181
|
+
* specFile: './evals/EVAL.yaml',
|
|
2182
|
+
* target: { provider: 'claude_agent' },
|
|
2183
|
+
* });
|
|
2184
|
+
* ```
|
|
2185
|
+
*
|
|
2186
|
+
* @module
|
|
1884
2187
|
*/
|
|
1885
|
-
|
|
2188
|
+
|
|
1886
2189
|
/**
|
|
1887
|
-
*
|
|
2190
|
+
* Inline test definition for the programmatic API.
|
|
2191
|
+
* Mirrors the YAML test structure.
|
|
1888
2192
|
*/
|
|
1889
|
-
interface
|
|
1890
|
-
|
|
1891
|
-
readonly
|
|
1892
|
-
|
|
1893
|
-
readonly
|
|
1894
|
-
|
|
1895
|
-
readonly
|
|
1896
|
-
|
|
1897
|
-
|
|
2193
|
+
interface EvalTestInput {
|
|
2194
|
+
/** Unique test identifier */
|
|
2195
|
+
readonly id: string;
|
|
2196
|
+
/** What the response should accomplish */
|
|
2197
|
+
readonly criteria?: string;
|
|
2198
|
+
/** Input to the agent (string or message array). Omit when using turns[]. */
|
|
2199
|
+
readonly input?: string | readonly {
|
|
2200
|
+
role: string;
|
|
2201
|
+
content: string;
|
|
2202
|
+
}[];
|
|
2203
|
+
/** Expected reference output (camelCase preferred) */
|
|
2204
|
+
readonly expectedOutput?: string;
|
|
2205
|
+
/** @deprecated Use `expectedOutput` instead */
|
|
2206
|
+
readonly expected_output?: string;
|
|
2207
|
+
/** Assertion graders — accepts factory functions, config objects, or inline functions */
|
|
2208
|
+
readonly assert?: readonly AssertEntry[];
|
|
2209
|
+
/** Arbitrary metadata */
|
|
2210
|
+
readonly metadata?: Record<string, unknown>;
|
|
2211
|
+
/** Enable multi-turn conversation mode. Inferred automatically when turns[] is provided. */
|
|
2212
|
+
readonly mode?: 'conversation';
|
|
2213
|
+
/** Ordered turns for conversation evaluation. Each turn generates a fresh LLM call. */
|
|
2214
|
+
readonly turns?: readonly ConversationTurnInput[];
|
|
2215
|
+
/** Score aggregation across turns: 'mean' (default), 'min', or 'max'. */
|
|
2216
|
+
readonly aggregation?: ConversationAggregation;
|
|
1898
2217
|
}
|
|
1899
2218
|
/**
|
|
1900
|
-
*
|
|
2219
|
+
* A single turn in a multi-turn conversation evaluation (programmatic API).
|
|
2220
|
+
* Mirrors the YAML `turns` structure with camelCase naming.
|
|
1901
2221
|
*/
|
|
1902
|
-
interface
|
|
1903
|
-
|
|
1904
|
-
readonly
|
|
1905
|
-
|
|
1906
|
-
|
|
1907
|
-
|
|
1908
|
-
|
|
1909
|
-
readonly
|
|
2222
|
+
interface ConversationTurnInput {
|
|
2223
|
+
/** Input for this turn (string or message array) */
|
|
2224
|
+
readonly input: string | readonly {
|
|
2225
|
+
role: string;
|
|
2226
|
+
content: string;
|
|
2227
|
+
}[];
|
|
2228
|
+
/** Expected reference output for this turn */
|
|
2229
|
+
readonly expectedOutput?: string;
|
|
2230
|
+
/** @deprecated Use `expectedOutput` instead */
|
|
2231
|
+
readonly expected_output?: string;
|
|
2232
|
+
/** Per-turn assertions (string criteria or grader config) */
|
|
2233
|
+
readonly assert?: readonly AssertEntry[];
|
|
1910
2234
|
}
|
|
1911
2235
|
/**
|
|
1912
|
-
*
|
|
2236
|
+
* Inline assertion definition for the programmatic API.
|
|
2237
|
+
* Matches the YAML `assert` block structure.
|
|
1913
2238
|
*/
|
|
1914
|
-
interface
|
|
1915
|
-
|
|
1916
|
-
readonly
|
|
1917
|
-
|
|
1918
|
-
readonly
|
|
1919
|
-
|
|
2239
|
+
interface EvalAssertionInput {
|
|
2240
|
+
/** Assertion type (e.g., 'contains', 'llm-grader', 'code-grader') */
|
|
2241
|
+
readonly type: string;
|
|
2242
|
+
/** Display name */
|
|
2243
|
+
readonly name?: string;
|
|
2244
|
+
/** Value for deterministic assertions (contains, equals, regex) */
|
|
2245
|
+
readonly value?: string;
|
|
2246
|
+
/** Weight for scoring */
|
|
2247
|
+
readonly weight?: number;
|
|
2248
|
+
/** Whether this assertion is required to pass */
|
|
2249
|
+
readonly required?: boolean | number;
|
|
2250
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
2251
|
+
readonly min_score?: number;
|
|
2252
|
+
/** Prompt file for llm_grader */
|
|
2253
|
+
readonly prompt?: string;
|
|
2254
|
+
/** Script for code_grader */
|
|
2255
|
+
readonly script?: string | readonly string[];
|
|
2256
|
+
/** Additional config passed to the assertion */
|
|
2257
|
+
readonly config?: Record<string, unknown>;
|
|
2258
|
+
/** Nested assertions for composite type */
|
|
2259
|
+
readonly assert?: readonly EvalAssertionInput[];
|
|
2260
|
+
/** Rubric criteria for rubrics type */
|
|
2261
|
+
readonly criteria?: readonly (string | {
|
|
2262
|
+
id?: string;
|
|
2263
|
+
outcome: string;
|
|
2264
|
+
weight?: number;
|
|
2265
|
+
})[];
|
|
2266
|
+
/** Additional properties */
|
|
2267
|
+
readonly [key: string]: unknown;
|
|
1920
2268
|
}
|
|
2269
|
+
/** Assert entry: inline function or config object */
|
|
2270
|
+
type AssertEntry = AssertFn | EvalAssertionInput;
|
|
1921
2271
|
/**
|
|
1922
|
-
*
|
|
2272
|
+
* Configuration for `evaluate()`.
|
|
2273
|
+
* Accepts either inline tests or a spec file path.
|
|
1923
2274
|
*/
|
|
1924
|
-
interface
|
|
1925
|
-
|
|
1926
|
-
readonly
|
|
1927
|
-
|
|
1928
|
-
readonly
|
|
1929
|
-
|
|
1930
|
-
readonly
|
|
2275
|
+
interface EvalConfig {
|
|
2276
|
+
/** Inline test definitions (mutually exclusive with specFile) */
|
|
2277
|
+
readonly tests?: readonly EvalTestInput[];
|
|
2278
|
+
/** Path to an EVAL.yaml spec file (mutually exclusive with tests) */
|
|
2279
|
+
readonly specFile?: string;
|
|
2280
|
+
/** Target provider configuration */
|
|
2281
|
+
readonly target?: TargetDefinition;
|
|
2282
|
+
/** Custom task function — mutually exclusive with target */
|
|
2283
|
+
readonly task?: (input: string) => string | Promise<string>;
|
|
2284
|
+
/** Suite-level assertions applied to all tests */
|
|
2285
|
+
readonly assert?: readonly AssertEntry[];
|
|
2286
|
+
/** Optional suite metadata used by CLI discovery, tagging, and reporting. */
|
|
2287
|
+
readonly metadata?: EvalMetadata;
|
|
2288
|
+
/** Filter tests by ID pattern(s) (glob supported). Arrays use OR logic. */
|
|
2289
|
+
readonly filter?: string | readonly string[];
|
|
2290
|
+
/** Maximum concurrent workers (default: 3) */
|
|
2291
|
+
readonly workers?: number;
|
|
2292
|
+
/** Maximum retries on failure (default: 2) */
|
|
2293
|
+
readonly maxRetries?: number;
|
|
2294
|
+
/** Agent timeout in milliseconds. No timeout if not set. */
|
|
2295
|
+
readonly agentTimeoutMs?: number;
|
|
2296
|
+
/** Enable response caching */
|
|
2297
|
+
readonly cache?: boolean;
|
|
2298
|
+
/** Verbose logging */
|
|
2299
|
+
readonly verbose?: boolean;
|
|
2300
|
+
/** Callback for each completed result */
|
|
2301
|
+
readonly onResult?: (result: EvaluationResult) => void;
|
|
2302
|
+
/** Score threshold for pass/fail (0-1). Default: 0.8 (DEFAULT_THRESHOLD). */
|
|
2303
|
+
readonly threshold?: number;
|
|
2304
|
+
/** Command(s) to run once before the suite starts. Same semantics as YAML before_all. */
|
|
2305
|
+
readonly beforeAll?: string | readonly string[];
|
|
2306
|
+
/** Suite-level cost cap in USD. Stops dispatching new tests when exceeded. */
|
|
2307
|
+
readonly budgetUsd?: number;
|
|
1931
2308
|
}
|
|
1932
2309
|
/**
|
|
1933
|
-
*
|
|
2310
|
+
* Summary statistics for an evaluation run.
|
|
1934
2311
|
*/
|
|
1935
|
-
interface
|
|
1936
|
-
|
|
1937
|
-
readonly
|
|
1938
|
-
|
|
1939
|
-
readonly
|
|
1940
|
-
|
|
1941
|
-
|
|
1942
|
-
|
|
1943
|
-
readonly
|
|
1944
|
-
|
|
1945
|
-
readonly
|
|
1946
|
-
readonly cwd?: string;
|
|
1947
|
-
readonly timeoutMs?: number;
|
|
1948
|
-
readonly logDir?: string;
|
|
1949
|
-
readonly logFormat?: 'summary' | 'json';
|
|
1950
|
-
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1951
|
-
readonly streamLog?: false | 'raw' | 'summary';
|
|
1952
|
-
readonly systemPrompt?: string;
|
|
1953
|
-
}
|
|
1954
|
-
interface CopilotCliResolvedConfig {
|
|
1955
|
-
readonly executable: string;
|
|
1956
|
-
readonly model?: string;
|
|
1957
|
-
readonly args?: readonly string[];
|
|
1958
|
-
readonly cwd?: string;
|
|
1959
|
-
readonly timeoutMs?: number;
|
|
1960
|
-
readonly logDir?: string;
|
|
1961
|
-
readonly logFormat?: 'summary' | 'json';
|
|
1962
|
-
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1963
|
-
readonly streamLog?: false | 'raw' | 'summary';
|
|
1964
|
-
readonly systemPrompt?: string;
|
|
1965
|
-
}
|
|
1966
|
-
interface CopilotSdkResolvedConfig {
|
|
1967
|
-
readonly cliUrl?: string;
|
|
1968
|
-
readonly cliPath?: string;
|
|
1969
|
-
readonly githubToken?: string;
|
|
1970
|
-
readonly model?: string;
|
|
1971
|
-
readonly cwd?: string;
|
|
1972
|
-
readonly timeoutMs?: number;
|
|
1973
|
-
readonly logDir?: string;
|
|
1974
|
-
readonly logFormat?: 'summary' | 'json';
|
|
1975
|
-
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1976
|
-
readonly streamLog?: false | 'raw' | 'summary';
|
|
1977
|
-
readonly systemPrompt?: string;
|
|
1978
|
-
/** BYOK provider type: "azure", "openai", or "anthropic". */
|
|
1979
|
-
readonly byokType?: string;
|
|
1980
|
-
/** BYOK base URL for the provider endpoint. */
|
|
1981
|
-
readonly byokBaseUrl?: string;
|
|
1982
|
-
/** BYOK API key for authenticating with the provider. */
|
|
1983
|
-
readonly byokApiKey?: string;
|
|
1984
|
-
/** BYOK bearer token (takes precedence over apiKey when set). */
|
|
1985
|
-
readonly byokBearerToken?: string;
|
|
1986
|
-
/** BYOK Azure API version (e.g. "2024-10-21"). Only used when byokType is "azure". */
|
|
1987
|
-
readonly byokApiVersion?: string;
|
|
1988
|
-
/** BYOK wire API format: "completions" or "responses". */
|
|
1989
|
-
readonly byokWireApi?: string;
|
|
1990
|
-
}
|
|
1991
|
-
interface CopilotLogResolvedConfig {
|
|
1992
|
-
/** Explicit path to a session directory containing events.jsonl. */
|
|
1993
|
-
readonly sessionDir?: string;
|
|
1994
|
-
/** Session UUID — combined with sessionStateDir to build the path. */
|
|
1995
|
-
readonly sessionId?: string;
|
|
1996
|
-
/** Auto-discovery mode. 'latest' picks the most recent session. */
|
|
1997
|
-
readonly discover?: 'latest';
|
|
1998
|
-
/** Override the default ~/.copilot/session-state directory. */
|
|
1999
|
-
readonly sessionStateDir?: string;
|
|
2000
|
-
/** Filter discovery by working directory. */
|
|
2001
|
-
readonly cwd?: string;
|
|
2002
|
-
}
|
|
2003
|
-
interface PiCodingAgentResolvedConfig {
|
|
2004
|
-
readonly subprovider?: string;
|
|
2005
|
-
readonly model?: string;
|
|
2006
|
-
readonly apiKey?: string;
|
|
2007
|
-
readonly baseUrl?: string;
|
|
2008
|
-
readonly tools?: string;
|
|
2009
|
-
readonly thinking?: string;
|
|
2010
|
-
readonly cwd?: string;
|
|
2011
|
-
readonly timeoutMs?: number;
|
|
2012
|
-
readonly logDir?: string;
|
|
2013
|
-
readonly logFormat?: 'summary' | 'json';
|
|
2014
|
-
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
2015
|
-
readonly streamLog?: false | 'raw' | 'summary';
|
|
2016
|
-
readonly systemPrompt?: string;
|
|
2017
|
-
}
|
|
2018
|
-
interface PiCliResolvedConfig {
|
|
2019
|
-
readonly executable: string;
|
|
2020
|
-
readonly subprovider?: string;
|
|
2021
|
-
readonly model?: string;
|
|
2022
|
-
readonly apiKey?: string;
|
|
2023
|
-
readonly baseUrl?: string;
|
|
2024
|
-
readonly tools?: string;
|
|
2025
|
-
readonly thinking?: string;
|
|
2026
|
-
readonly args?: readonly string[];
|
|
2027
|
-
readonly cwd?: string;
|
|
2028
|
-
readonly timeoutMs?: number;
|
|
2029
|
-
readonly logDir?: string;
|
|
2030
|
-
readonly logFormat?: 'summary' | 'json';
|
|
2031
|
-
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
2032
|
-
readonly streamLog?: false | 'raw' | 'summary';
|
|
2033
|
-
readonly systemPrompt?: string;
|
|
2312
|
+
interface EvalSummary {
|
|
2313
|
+
/** Total number of test cases */
|
|
2314
|
+
readonly total: number;
|
|
2315
|
+
/** Number of passing test cases (score >= threshold) */
|
|
2316
|
+
readonly passed: number;
|
|
2317
|
+
/** Number of failing test cases (score < threshold) */
|
|
2318
|
+
readonly failed: number;
|
|
2319
|
+
/** Total duration in milliseconds */
|
|
2320
|
+
readonly durationMs: number;
|
|
2321
|
+
/** Mean score across all cases */
|
|
2322
|
+
readonly meanScore: number;
|
|
2034
2323
|
}
|
|
2035
|
-
|
|
2036
|
-
|
|
2037
|
-
|
|
2038
|
-
|
|
2039
|
-
|
|
2040
|
-
readonly
|
|
2041
|
-
|
|
2042
|
-
readonly
|
|
2043
|
-
readonly logDir?: string;
|
|
2044
|
-
readonly logFormat?: 'summary' | 'json';
|
|
2045
|
-
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
2046
|
-
readonly streamLog?: false | 'raw' | 'summary';
|
|
2324
|
+
/**
|
|
2325
|
+
* Result of an `evaluate()` call.
|
|
2326
|
+
*/
|
|
2327
|
+
interface EvalRunResult {
|
|
2328
|
+
/** Individual test case results */
|
|
2329
|
+
readonly results: readonly EvaluationResult[];
|
|
2330
|
+
/** Aggregate summary statistics */
|
|
2331
|
+
readonly summary: EvalSummary;
|
|
2047
2332
|
}
|
|
2048
|
-
|
|
2049
|
-
|
|
2050
|
-
|
|
2051
|
-
|
|
2052
|
-
|
|
2333
|
+
/**
|
|
2334
|
+
* Run an evaluation suite against a target provider.
|
|
2335
|
+
*
|
|
2336
|
+
* Accepts either inline test definitions or a path to an EVAL.yaml spec file.
|
|
2337
|
+
* The config shape mirrors the YAML structure — users can translate between
|
|
2338
|
+
* file-based and programmatic usage 1:1.
|
|
2339
|
+
*
|
|
2340
|
+
* @param config - Evaluation configuration
|
|
2341
|
+
* @returns Typed evaluation results with summary statistics
|
|
2342
|
+
*
|
|
2343
|
+
* @example Inline tests with assertions
|
|
2344
|
+
* ```typescript
|
|
2345
|
+
* const { results, summary } = await evaluate({
|
|
2346
|
+
* tests: [
|
|
2347
|
+
* {
|
|
2348
|
+
* id: 'greeting',
|
|
2349
|
+
* input: 'Say hello',
|
|
2350
|
+
* assert: [{ type: 'contains', value: 'hello' }],
|
|
2351
|
+
* },
|
|
2352
|
+
* ],
|
|
2353
|
+
* target: { provider: 'mock_agent' },
|
|
2354
|
+
* });
|
|
2355
|
+
* console.log(`${summary.passed}/${summary.total} passed`);
|
|
2356
|
+
* ```
|
|
2357
|
+
*
|
|
2358
|
+
* @example Load from YAML
|
|
2359
|
+
* ```typescript
|
|
2360
|
+
* const { summary } = await evaluate({
|
|
2361
|
+
* specFile: './evals/my-eval.yaml',
|
|
2362
|
+
* filter: 'greeting-*',
|
|
2363
|
+
* });
|
|
2364
|
+
* ```
|
|
2365
|
+
*/
|
|
2366
|
+
declare function evaluate(config: EvalConfig): Promise<EvalRunResult>;
|
|
2367
|
+
|
|
2368
|
+
interface TsEvalResult {
|
|
2369
|
+
readonly config: EvalConfig;
|
|
2370
|
+
readonly filePath: string;
|
|
2053
2371
|
}
|
|
2054
|
-
|
|
2055
|
-
|
|
2056
|
-
|
|
2057
|
-
|
|
2058
|
-
|
|
2059
|
-
|
|
2372
|
+
/**
|
|
2373
|
+
* Import a *.eval.ts file and extract the EvalConfig export.
|
|
2374
|
+
* Tries default, `config`, and `evalConfig` named exports in priority order.
|
|
2375
|
+
*/
|
|
2376
|
+
declare function loadTsEvalFile(filePath: string): Promise<TsEvalResult>;
|
|
2377
|
+
|
|
2378
|
+
/**
|
|
2379
|
+
* EVAL.yaml → evals.json transpiler.
|
|
2380
|
+
*
|
|
2381
|
+
* Converts an AgentV EVAL.yaml file into Agent Skills evals.json format
|
|
2382
|
+
* for consumption by the skill-creator pipeline.
|
|
2383
|
+
*
|
|
2384
|
+
* Handles both `assertions:` (current) and `assert:` (deprecated alias).
|
|
2385
|
+
*/
|
|
2386
|
+
interface EvalsJsonCase {
|
|
2387
|
+
id: number;
|
|
2388
|
+
prompt: string;
|
|
2389
|
+
expected_output?: string;
|
|
2390
|
+
files?: string[];
|
|
2391
|
+
should_trigger?: boolean;
|
|
2392
|
+
assertions: string[];
|
|
2060
2393
|
}
|
|
2061
|
-
interface
|
|
2062
|
-
|
|
2063
|
-
|
|
2394
|
+
interface EvalsJsonFile {
|
|
2395
|
+
skill_name: string;
|
|
2396
|
+
evals: EvalsJsonCase[];
|
|
2064
2397
|
}
|
|
2065
|
-
/**
|
|
2066
|
-
|
|
2067
|
-
|
|
2068
|
-
|
|
2069
|
-
|
|
2070
|
-
|
|
2071
|
-
|
|
2072
|
-
|
|
2073
|
-
|
|
2074
|
-
* to force CLI invocation even in subagent mode.
|
|
2075
|
-
*/
|
|
2076
|
-
readonly subagentModeAllowed?: boolean;
|
|
2077
|
-
/**
|
|
2078
|
-
* Ordered list of target names to try when the primary target fails after
|
|
2079
|
-
* exhausting retries. Each fallback is attempted in order.
|
|
2080
|
-
*/
|
|
2081
|
-
readonly fallbackTargets?: readonly string[];
|
|
2398
|
+
/**
|
|
2399
|
+
* Result of transpiling a single EVAL.yaml.
|
|
2400
|
+
* May produce multiple evals.json files (one per skill).
|
|
2401
|
+
*/
|
|
2402
|
+
interface TranspileResult {
|
|
2403
|
+
/** Map from skill_name → EvalsJsonFile */
|
|
2404
|
+
files: Map<string, EvalsJsonFile>;
|
|
2405
|
+
/** Warning messages accumulated during transpilation */
|
|
2406
|
+
warnings: string[];
|
|
2082
2407
|
}
|
|
2083
|
-
type ResolvedTarget = (ResolvedTargetBase & {
|
|
2084
|
-
readonly kind: 'openai';
|
|
2085
|
-
readonly config: OpenAIResolvedConfig;
|
|
2086
|
-
}) | (ResolvedTargetBase & {
|
|
2087
|
-
readonly kind: 'openrouter';
|
|
2088
|
-
readonly config: OpenRouterResolvedConfig;
|
|
2089
|
-
}) | (ResolvedTargetBase & {
|
|
2090
|
-
readonly kind: 'azure';
|
|
2091
|
-
readonly config: AzureResolvedConfig;
|
|
2092
|
-
}) | (ResolvedTargetBase & {
|
|
2093
|
-
readonly kind: 'anthropic';
|
|
2094
|
-
readonly config: AnthropicResolvedConfig;
|
|
2095
|
-
}) | (ResolvedTargetBase & {
|
|
2096
|
-
readonly kind: 'gemini';
|
|
2097
|
-
readonly config: GeminiResolvedConfig;
|
|
2098
|
-
}) | (ResolvedTargetBase & {
|
|
2099
|
-
readonly kind: 'codex';
|
|
2100
|
-
readonly config: CodexResolvedConfig;
|
|
2101
|
-
}) | (ResolvedTargetBase & {
|
|
2102
|
-
readonly kind: 'copilot-sdk';
|
|
2103
|
-
readonly config: CopilotSdkResolvedConfig;
|
|
2104
|
-
}) | (ResolvedTargetBase & {
|
|
2105
|
-
readonly kind: 'copilot-cli';
|
|
2106
|
-
readonly config: CopilotCliResolvedConfig;
|
|
2107
|
-
}) | (ResolvedTargetBase & {
|
|
2108
|
-
readonly kind: 'copilot-log';
|
|
2109
|
-
readonly config: CopilotLogResolvedConfig;
|
|
2110
|
-
}) | (ResolvedTargetBase & {
|
|
2111
|
-
readonly kind: 'pi-coding-agent';
|
|
2112
|
-
readonly config: PiCodingAgentResolvedConfig;
|
|
2113
|
-
}) | (ResolvedTargetBase & {
|
|
2114
|
-
readonly kind: 'pi-cli';
|
|
2115
|
-
readonly config: PiCliResolvedConfig;
|
|
2116
|
-
}) | (ResolvedTargetBase & {
|
|
2117
|
-
readonly kind: 'claude';
|
|
2118
|
-
readonly config: ClaudeResolvedConfig;
|
|
2119
|
-
}) | (ResolvedTargetBase & {
|
|
2120
|
-
readonly kind: 'claude-cli';
|
|
2121
|
-
readonly config: ClaudeResolvedConfig;
|
|
2122
|
-
}) | (ResolvedTargetBase & {
|
|
2123
|
-
readonly kind: 'claude-sdk';
|
|
2124
|
-
readonly config: ClaudeResolvedConfig;
|
|
2125
|
-
}) | (ResolvedTargetBase & {
|
|
2126
|
-
readonly kind: 'mock';
|
|
2127
|
-
readonly config: MockResolvedConfig;
|
|
2128
|
-
}) | (ResolvedTargetBase & {
|
|
2129
|
-
readonly kind: 'vscode' | 'vscode-insiders';
|
|
2130
|
-
readonly config: VSCodeResolvedConfig;
|
|
2131
|
-
}) | (ResolvedTargetBase & {
|
|
2132
|
-
readonly kind: 'agentv';
|
|
2133
|
-
readonly config: AgentVResolvedConfig;
|
|
2134
|
-
}) | (ResolvedTargetBase & {
|
|
2135
|
-
readonly kind: 'cli';
|
|
2136
|
-
readonly config: CliResolvedConfig;
|
|
2137
|
-
}) | (ResolvedTargetBase & {
|
|
2138
|
-
readonly kind: 'transcript';
|
|
2139
|
-
readonly config: Record<string, never>;
|
|
2140
|
-
});
|
|
2141
2408
|
/**
|
|
2142
|
-
*
|
|
2143
|
-
*
|
|
2144
|
-
*
|
|
2409
|
+
* Transpile a parsed EVAL.yaml object into one or more evals.json objects.
|
|
2410
|
+
*
|
|
2411
|
+
* @param suite Parsed YAML object (already loaded, no file I/O here)
|
|
2412
|
+
* @param source Source identifier for error messages (e.g. file path)
|
|
2145
2413
|
*/
|
|
2146
|
-
declare
|
|
2147
|
-
declare function resolveDelegatedTargetDefinition(name: string, definitions: ReadonlyMap<string, TargetDefinition>, env?: EnvLookup): TargetDefinition | undefined;
|
|
2148
|
-
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string, options?: {
|
|
2149
|
-
readonly emitDeprecationWarnings?: boolean;
|
|
2150
|
-
}): ResolvedTarget;
|
|
2151
|
-
|
|
2414
|
+
declare function transpileEvalYaml(suite: unknown, source?: string): TranspileResult;
|
|
2152
2415
|
/**
|
|
2153
|
-
*
|
|
2416
|
+
* Transpile an EVAL.yaml file into one or more evals.json objects.
|
|
2417
|
+
* Returns a map from output filename → JSON content.
|
|
2154
2418
|
*
|
|
2155
|
-
*
|
|
2156
|
-
|
|
2157
|
-
|
|
2158
|
-
|
|
2419
|
+
* @param evalYamlPath Absolute path to the EVAL.yaml file
|
|
2420
|
+
*/
|
|
2421
|
+
declare function transpileEvalYamlFile(evalYamlPath: string): TranspileResult;
|
|
2422
|
+
/**
|
|
2423
|
+
* Determine the output filename(s) for a transpile result.
|
|
2424
|
+
* Single skill → "evals.json"
|
|
2425
|
+
* Multiple skills → "<skill>.evals.json"
|
|
2159
2426
|
*/
|
|
2427
|
+
declare function getOutputFilenames(result: TranspileResult): Map<string, string>;
|
|
2160
2428
|
|
|
2429
|
+
declare function fileExists(filePath: string): Promise<boolean>;
|
|
2161
2430
|
/**
|
|
2162
|
-
*
|
|
2431
|
+
* Normalize line endings to LF (\n).
|
|
2432
|
+
* This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
|
|
2163
2433
|
*/
|
|
2164
|
-
|
|
2434
|
+
declare function normalizeLineEndings(content: string): string;
|
|
2165
2435
|
/**
|
|
2166
|
-
*
|
|
2167
|
-
*
|
|
2168
|
-
* Built-in providers are registered at startup. Custom providers can be
|
|
2169
|
-
* registered via the `register()` method.
|
|
2436
|
+
* Read a text file and normalize line endings to LF (\n).
|
|
2437
|
+
* This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
|
|
2170
2438
|
*/
|
|
2171
|
-
declare
|
|
2172
|
-
|
|
2173
|
-
|
|
2174
|
-
|
|
2175
|
-
|
|
2176
|
-
|
|
2177
|
-
|
|
2178
|
-
|
|
2179
|
-
|
|
2180
|
-
|
|
2181
|
-
|
|
2182
|
-
|
|
2183
|
-
|
|
2184
|
-
|
|
2185
|
-
|
|
2186
|
-
|
|
2439
|
+
declare function readTextFile(filePath: string): Promise<string>;
|
|
2440
|
+
/**
|
|
2441
|
+
* Read a JSON file and parse it.
|
|
2442
|
+
*/
|
|
2443
|
+
declare function readJsonFile<T = unknown>(filePath: string): Promise<T>;
|
|
2444
|
+
/**
|
|
2445
|
+
* Find git repository root by walking up the directory tree.
|
|
2446
|
+
*/
|
|
2447
|
+
declare function findGitRoot(startPath: string): Promise<string | null>;
|
|
2448
|
+
/**
|
|
2449
|
+
* Build a chain of directories walking from a file's location up to repo root.
|
|
2450
|
+
* Used for discovering configuration files like targets.yaml or config.yaml.
|
|
2451
|
+
*/
|
|
2452
|
+
declare function buildDirectoryChain(filePath: string, repoRoot: string): readonly string[];
|
|
2453
|
+
/**
|
|
2454
|
+
* Build search roots for file resolution, matching yaml-parser behavior.
|
|
2455
|
+
* Searches from eval file directory up to repo root.
|
|
2456
|
+
*/
|
|
2457
|
+
declare function buildSearchRoots(evalPath: string, repoRoot: string): readonly string[];
|
|
2458
|
+
/**
|
|
2459
|
+
* Resolve a file reference using search roots, matching yaml-parser behavior.
|
|
2460
|
+
*/
|
|
2461
|
+
declare function resolveFileReference(rawValue: string, searchRoots: readonly string[]): Promise<{
|
|
2462
|
+
readonly displayPath: string;
|
|
2463
|
+
readonly resolvedPath?: string;
|
|
2464
|
+
readonly attempted: readonly string[];
|
|
2465
|
+
}>;
|
|
2187
2466
|
|
|
2188
2467
|
declare function readTargetDefinitions(filePath: string): Promise<readonly TargetDefinition[]>;
|
|
2189
2468
|
declare function listTargetNames(definitions: readonly TargetDefinition[]): readonly string[];
|
|
@@ -2346,8 +2625,8 @@ interface EvaluationContext {
|
|
|
2346
2625
|
readonly graderProvider?: Provider;
|
|
2347
2626
|
/** @deprecated Use `graderProvider` instead */
|
|
2348
2627
|
readonly judgeProvider?: Provider;
|
|
2349
|
-
readonly
|
|
2350
|
-
readonly evaluator?:
|
|
2628
|
+
readonly graderTemplateOverride?: string;
|
|
2629
|
+
readonly evaluator?: GraderConfig;
|
|
2351
2630
|
/** Output messages from agent execution (primary source for tool trajectory) */
|
|
2352
2631
|
readonly output?: readonly Message[];
|
|
2353
2632
|
/** Lightweight summary of trace events (if available) */
|
|
@@ -2380,8 +2659,8 @@ interface EvaluationScore {
|
|
|
2380
2659
|
readonly verdict: EvaluationVerdict;
|
|
2381
2660
|
readonly assertions: readonly AssertionEntry[];
|
|
2382
2661
|
readonly expectedAspectCount: number;
|
|
2383
|
-
readonly
|
|
2384
|
-
readonly scores?: readonly
|
|
2662
|
+
readonly graderRawRequest?: JsonObject;
|
|
2663
|
+
readonly scores?: readonly ChildGraderResult[];
|
|
2385
2664
|
/** Optional structured details from evaluators (e.g., TP/TN/FP/FN counts, alignments, per-turn scores). */
|
|
2386
2665
|
readonly details?: JsonObject;
|
|
2387
2666
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
@@ -2389,26 +2668,26 @@ interface EvaluationScore {
|
|
|
2389
2668
|
/** Target name used for grading (e.g., the LLM provider). */
|
|
2390
2669
|
readonly graderTarget?: string;
|
|
2391
2670
|
}
|
|
2392
|
-
interface
|
|
2671
|
+
interface ChildGraderResult {
|
|
2393
2672
|
readonly name: string;
|
|
2394
2673
|
readonly type: string;
|
|
2395
2674
|
readonly score: number;
|
|
2396
2675
|
readonly weight?: number;
|
|
2397
2676
|
readonly verdict: EvaluationVerdict;
|
|
2398
2677
|
readonly assertions: readonly AssertionEntry[];
|
|
2399
|
-
readonly
|
|
2400
|
-
readonly scores?: readonly
|
|
2678
|
+
readonly graderRawRequest?: JsonObject;
|
|
2679
|
+
readonly scores?: readonly ChildGraderResult[];
|
|
2401
2680
|
/** Optional structured details from evaluators (e.g., TP/TN/FP/FN counts, alignments, per-turn scores). */
|
|
2402
2681
|
readonly details?: JsonObject;
|
|
2403
2682
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
2404
2683
|
readonly tokenUsage?: TokenUsage;
|
|
2405
2684
|
}
|
|
2406
|
-
interface
|
|
2685
|
+
interface Grader {
|
|
2407
2686
|
readonly kind: string;
|
|
2408
2687
|
evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
|
|
2409
2688
|
}
|
|
2410
|
-
interface
|
|
2411
|
-
create(config:
|
|
2689
|
+
interface GraderFactory {
|
|
2690
|
+
create(config: GraderConfig, context: EvaluationContext): Grader;
|
|
2412
2691
|
}
|
|
2413
2692
|
|
|
2414
2693
|
/**
|
|
@@ -2447,7 +2726,7 @@ declare function deepEqual(a: unknown, b: unknown): boolean;
|
|
|
2447
2726
|
*/
|
|
2448
2727
|
declare function negateScore(score: EvaluationScore): EvaluationScore;
|
|
2449
2728
|
|
|
2450
|
-
interface
|
|
2729
|
+
interface CodeGraderOptions {
|
|
2451
2730
|
readonly command: readonly string[];
|
|
2452
2731
|
/** @deprecated Use `command` instead */
|
|
2453
2732
|
readonly script?: readonly string[];
|
|
@@ -2458,29 +2737,29 @@ interface CodeEvaluatorOptions {
|
|
|
2458
2737
|
/** Target access config - when present, enables target invocation */
|
|
2459
2738
|
readonly target?: TargetAccessConfig;
|
|
2460
2739
|
}
|
|
2461
|
-
declare class
|
|
2740
|
+
declare class CodeGrader implements Grader {
|
|
2462
2741
|
readonly kind = "code-grader";
|
|
2463
2742
|
private readonly command;
|
|
2464
2743
|
private readonly cwd?;
|
|
2465
2744
|
private readonly agentTimeoutMs?;
|
|
2466
2745
|
private readonly config?;
|
|
2467
2746
|
private readonly target?;
|
|
2468
|
-
constructor(options:
|
|
2747
|
+
constructor(options: CodeGraderOptions);
|
|
2469
2748
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
2470
2749
|
}
|
|
2471
2750
|
declare function executeScript(scriptPath: readonly string[] | string, input: string, agentTimeoutMs?: number, cwd?: string, env?: Record<string, string>): Promise<string>;
|
|
2472
2751
|
|
|
2473
|
-
interface
|
|
2474
|
-
readonly config:
|
|
2475
|
-
readonly evaluatorFactory:
|
|
2752
|
+
interface CompositeGraderOptions {
|
|
2753
|
+
readonly config: CompositeGraderConfig;
|
|
2754
|
+
readonly evaluatorFactory: GraderFactory;
|
|
2476
2755
|
readonly cwd?: string;
|
|
2477
2756
|
}
|
|
2478
|
-
declare class
|
|
2757
|
+
declare class CompositeGrader implements Grader {
|
|
2479
2758
|
readonly kind = "composite";
|
|
2480
2759
|
private readonly config;
|
|
2481
2760
|
private readonly evaluatorFactory;
|
|
2482
2761
|
private readonly cwd?;
|
|
2483
|
-
constructor(options:
|
|
2762
|
+
constructor(options: CompositeGraderOptions);
|
|
2484
2763
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
2485
2764
|
private aggregate;
|
|
2486
2765
|
private runWeightedAverage;
|
|
@@ -2489,50 +2768,50 @@ declare class CompositeEvaluator implements Evaluator {
|
|
|
2489
2768
|
private runLlmAggregator;
|
|
2490
2769
|
}
|
|
2491
2770
|
|
|
2492
|
-
interface
|
|
2493
|
-
readonly config:
|
|
2771
|
+
interface CostGraderOptions {
|
|
2772
|
+
readonly config: CostGraderConfig;
|
|
2494
2773
|
}
|
|
2495
2774
|
/**
|
|
2496
|
-
*
|
|
2775
|
+
* Grader that checks execution cost against a budget.
|
|
2497
2776
|
* Uses costUsd from the evaluation context.
|
|
2498
2777
|
*/
|
|
2499
|
-
declare class
|
|
2778
|
+
declare class CostGrader implements Grader {
|
|
2500
2779
|
readonly kind = "cost";
|
|
2501
2780
|
private readonly config;
|
|
2502
|
-
constructor(options:
|
|
2781
|
+
constructor(options: CostGraderOptions);
|
|
2503
2782
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2504
2783
|
}
|
|
2505
2784
|
|
|
2506
|
-
interface
|
|
2507
|
-
readonly config:
|
|
2785
|
+
interface ExecutionMetricsGraderOptions {
|
|
2786
|
+
readonly config: ExecutionMetricsGraderConfig;
|
|
2508
2787
|
}
|
|
2509
2788
|
/**
|
|
2510
|
-
*
|
|
2789
|
+
* Grader that checks execution metrics against configured thresholds.
|
|
2511
2790
|
* Supports multiple threshold types: tool calls, LLM calls, tokens, cost, duration,
|
|
2512
2791
|
* and exploration ratio. Only specified thresholds are checked.
|
|
2513
2792
|
*
|
|
2514
2793
|
* Score is proportional: passed / total assertions
|
|
2515
2794
|
*/
|
|
2516
|
-
declare class
|
|
2795
|
+
declare class ExecutionMetricsGrader implements Grader {
|
|
2517
2796
|
readonly kind = "execution-metrics";
|
|
2518
2797
|
private readonly config;
|
|
2519
|
-
constructor(options:
|
|
2798
|
+
constructor(options: ExecutionMetricsGraderOptions);
|
|
2520
2799
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2521
2800
|
private extractConfiguredThresholds;
|
|
2522
2801
|
private filterDefinedMetrics;
|
|
2523
2802
|
}
|
|
2524
2803
|
|
|
2525
|
-
interface
|
|
2526
|
-
readonly config:
|
|
2804
|
+
interface FieldAccuracyGraderOptions {
|
|
2805
|
+
readonly config: FieldAccuracyGraderConfig;
|
|
2527
2806
|
}
|
|
2528
2807
|
/**
|
|
2529
|
-
*
|
|
2808
|
+
* FieldAccuracyGrader compares extracted structured data against expected values
|
|
2530
2809
|
* with configurable matching strategies (exact, fuzzy, numeric_tolerance, date).
|
|
2531
2810
|
*/
|
|
2532
|
-
declare class
|
|
2811
|
+
declare class FieldAccuracyGrader implements Grader {
|
|
2533
2812
|
readonly kind = "field-accuracy";
|
|
2534
2813
|
private readonly config;
|
|
2535
|
-
constructor(options:
|
|
2814
|
+
constructor(options: FieldAccuracyGraderOptions);
|
|
2536
2815
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2537
2816
|
/**
|
|
2538
2817
|
* Extract expected data from expected_output array.
|
|
@@ -2561,33 +2840,33 @@ declare class FieldAccuracyEvaluator implements Evaluator {
|
|
|
2561
2840
|
private aggregateResults;
|
|
2562
2841
|
}
|
|
2563
2842
|
|
|
2564
|
-
interface
|
|
2565
|
-
readonly config:
|
|
2843
|
+
interface LatencyGraderOptions {
|
|
2844
|
+
readonly config: LatencyGraderConfig;
|
|
2566
2845
|
}
|
|
2567
2846
|
/**
|
|
2568
|
-
*
|
|
2847
|
+
* Grader that checks execution duration against a threshold.
|
|
2569
2848
|
* Uses durationMs from the evaluation context.
|
|
2570
2849
|
*/
|
|
2571
|
-
declare class
|
|
2850
|
+
declare class LatencyGrader implements Grader {
|
|
2572
2851
|
readonly kind = "latency";
|
|
2573
2852
|
private readonly config;
|
|
2574
|
-
constructor(options:
|
|
2853
|
+
constructor(options: LatencyGraderOptions);
|
|
2575
2854
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2576
2855
|
}
|
|
2577
2856
|
|
|
2578
2857
|
/**
|
|
2579
|
-
* Default
|
|
2580
|
-
* Custom
|
|
2858
|
+
* Default grader template for the user prompt (variables will be substituted).
|
|
2859
|
+
* Custom graders can override this via graderTemplate option.
|
|
2581
2860
|
*/
|
|
2582
|
-
declare const
|
|
2861
|
+
declare const DEFAULT_GRADER_TEMPLATE: string;
|
|
2583
2862
|
type GraderProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
|
|
2584
|
-
interface
|
|
2863
|
+
interface LlmGraderOptions {
|
|
2585
2864
|
readonly resolveGraderProvider: GraderProviderResolver;
|
|
2586
2865
|
/** @deprecated Use `resolveGraderProvider` instead. */
|
|
2587
2866
|
readonly resolveJudgeProvider?: GraderProviderResolver;
|
|
2588
2867
|
readonly maxOutputTokens?: number;
|
|
2589
2868
|
readonly temperature?: number;
|
|
2590
|
-
readonly
|
|
2869
|
+
readonly graderTemplate?: string;
|
|
2591
2870
|
readonly maxSteps?: number;
|
|
2592
2871
|
readonly graderTargetProvider?: Provider;
|
|
2593
2872
|
/** @deprecated Use `graderTargetProvider` instead. */
|
|
@@ -2633,39 +2912,39 @@ declare const rubricEvaluationSchema: z.ZodObject<{
|
|
|
2633
2912
|
reasoning: z.ZodString;
|
|
2634
2913
|
}, "strip", z.ZodTypeAny, {
|
|
2635
2914
|
id: string;
|
|
2636
|
-
reasoning: string;
|
|
2637
2915
|
satisfied: boolean;
|
|
2916
|
+
reasoning: string;
|
|
2638
2917
|
}, {
|
|
2639
2918
|
id: string;
|
|
2640
|
-
reasoning: string;
|
|
2641
2919
|
satisfied: boolean;
|
|
2920
|
+
reasoning: string;
|
|
2642
2921
|
}>, "many">;
|
|
2643
2922
|
overall_reasoning: z.ZodString;
|
|
2644
2923
|
}, "strip", z.ZodTypeAny, {
|
|
2645
2924
|
checks: {
|
|
2646
2925
|
id: string;
|
|
2647
|
-
reasoning: string;
|
|
2648
2926
|
satisfied: boolean;
|
|
2927
|
+
reasoning: string;
|
|
2649
2928
|
}[];
|
|
2650
2929
|
overall_reasoning: string;
|
|
2651
2930
|
}, {
|
|
2652
2931
|
checks: {
|
|
2653
2932
|
id: string;
|
|
2654
|
-
reasoning: string;
|
|
2655
2933
|
satisfied: boolean;
|
|
2934
|
+
reasoning: string;
|
|
2656
2935
|
}[];
|
|
2657
2936
|
overall_reasoning: string;
|
|
2658
2937
|
}>;
|
|
2659
2938
|
|
|
2660
|
-
declare class
|
|
2939
|
+
declare class LlmGrader implements Grader {
|
|
2661
2940
|
readonly kind = "llm-grader";
|
|
2662
2941
|
private readonly resolveGraderProvider;
|
|
2663
2942
|
private readonly maxOutputTokens?;
|
|
2664
2943
|
private readonly temperature?;
|
|
2665
|
-
private readonly
|
|
2944
|
+
private readonly graderTemplate?;
|
|
2666
2945
|
private readonly maxSteps;
|
|
2667
2946
|
private readonly graderTargetProvider?;
|
|
2668
|
-
constructor(options:
|
|
2947
|
+
constructor(options: LlmGraderOptions);
|
|
2669
2948
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
2670
2949
|
private prepareContext;
|
|
2671
2950
|
private evaluateFreeform;
|
|
@@ -2722,7 +3001,7 @@ declare class LlmGraderEvaluator implements Evaluator {
|
|
|
2722
3001
|
}
|
|
2723
3002
|
/**
|
|
2724
3003
|
* Build the mandatory output schema that all evaluators must follow.
|
|
2725
|
-
* This schema is always appended to the
|
|
3004
|
+
* This schema is always appended to the grader template.
|
|
2726
3005
|
*/
|
|
2727
3006
|
declare function buildOutputSchema(): string;
|
|
2728
3007
|
declare function buildRubricOutputSchema(): string;
|
|
@@ -2766,10 +3045,10 @@ declare function extractImageBlocks(messages: readonly Message[]): ContentImage[
|
|
|
2766
3045
|
* names (input.skill, input.file_path) regardless of provider.
|
|
2767
3046
|
*/
|
|
2768
3047
|
|
|
2769
|
-
declare class
|
|
3048
|
+
declare class SkillTriggerGrader implements Grader {
|
|
2770
3049
|
readonly kind = "skill-trigger";
|
|
2771
3050
|
private readonly config;
|
|
2772
|
-
constructor(config:
|
|
3051
|
+
constructor(config: SkillTriggerGraderConfig);
|
|
2773
3052
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2774
3053
|
}
|
|
2775
3054
|
|
|
@@ -2783,33 +3062,33 @@ declare function assembleLlmGraderPrompt(input: {
|
|
|
2783
3062
|
evalCase: EvalTest;
|
|
2784
3063
|
candidate: string;
|
|
2785
3064
|
promptInputs: PromptInputs;
|
|
2786
|
-
evaluatorConfig?:
|
|
3065
|
+
evaluatorConfig?: LlmGraderConfig;
|
|
2787
3066
|
output?: readonly Message[];
|
|
2788
3067
|
fileChanges?: string;
|
|
2789
|
-
|
|
3068
|
+
graderTemplateOverride?: string;
|
|
2790
3069
|
}): LlmGraderPromptAssembly;
|
|
2791
3070
|
|
|
2792
|
-
interface
|
|
2793
|
-
readonly config:
|
|
3071
|
+
interface TokenUsageGraderOptions {
|
|
3072
|
+
readonly config: TokenUsageGraderConfig;
|
|
2794
3073
|
}
|
|
2795
3074
|
/**
|
|
2796
|
-
*
|
|
3075
|
+
* Grader that checks provider-reported token usage against configured limits.
|
|
2797
3076
|
* Uses tokenUsage from the evaluation context.
|
|
2798
3077
|
*/
|
|
2799
|
-
declare class
|
|
3078
|
+
declare class TokenUsageGrader implements Grader {
|
|
2800
3079
|
readonly kind = "token-usage";
|
|
2801
3080
|
private readonly config;
|
|
2802
|
-
constructor(options:
|
|
3081
|
+
constructor(options: TokenUsageGraderOptions);
|
|
2803
3082
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2804
3083
|
}
|
|
2805
3084
|
|
|
2806
|
-
interface
|
|
2807
|
-
readonly config:
|
|
3085
|
+
interface ToolTrajectoryGraderOptions {
|
|
3086
|
+
readonly config: ToolTrajectoryGraderConfig;
|
|
2808
3087
|
}
|
|
2809
|
-
declare class
|
|
3088
|
+
declare class ToolTrajectoryGrader implements Grader {
|
|
2810
3089
|
readonly kind = "tool-trajectory";
|
|
2811
3090
|
private readonly config;
|
|
2812
|
-
constructor(options:
|
|
3091
|
+
constructor(options: ToolTrajectoryGraderOptions);
|
|
2813
3092
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2814
3093
|
/**
|
|
2815
3094
|
* Extract tool calls from output messages.
|
|
@@ -2873,7 +3152,7 @@ declare function runIsJsonAssertion(output: string): AssertionResult;
|
|
|
2873
3152
|
declare function runEqualsAssertion(output: string, value: string): AssertionResult;
|
|
2874
3153
|
|
|
2875
3154
|
/**
|
|
2876
|
-
* Extensible
|
|
3155
|
+
* Extensible grader registry.
|
|
2877
3156
|
*
|
|
2878
3157
|
* Replaces the hardcoded switch/case dispatch in the orchestrator with
|
|
2879
3158
|
* a registry of named factory functions. Built-in evaluators are registered
|
|
@@ -2882,10 +3161,10 @@ declare function runEqualsAssertion(output: string, value: string): AssertionRes
|
|
|
2882
3161
|
*/
|
|
2883
3162
|
|
|
2884
3163
|
/**
|
|
2885
|
-
* Context passed to
|
|
3164
|
+
* Context passed to grader factory functions during creation.
|
|
2886
3165
|
* Contains shared resources needed by evaluator instances.
|
|
2887
3166
|
*/
|
|
2888
|
-
interface
|
|
3167
|
+
interface GraderDispatchContext {
|
|
2889
3168
|
/** Shared LLM grader provider (resolved at suite level) */
|
|
2890
3169
|
readonly graderProvider?: Provider;
|
|
2891
3170
|
/** @deprecated Use `graderProvider` instead */
|
|
@@ -2899,48 +3178,48 @@ interface EvaluatorDispatchContext {
|
|
|
2899
3178
|
/** Directory containing the eval file (for composite member resolution) */
|
|
2900
3179
|
readonly evalFileDir?: string;
|
|
2901
3180
|
/** Shared LLM grader evaluator instance */
|
|
2902
|
-
readonly llmGrader:
|
|
3181
|
+
readonly llmGrader: Grader;
|
|
2903
3182
|
/** @deprecated Use `llmGrader` instead */
|
|
2904
|
-
readonly llmJudge?:
|
|
3183
|
+
readonly llmJudge?: Grader;
|
|
2905
3184
|
/** Reference to the registry itself (for composite evaluators that need to create children) */
|
|
2906
|
-
readonly registry:
|
|
3185
|
+
readonly registry: GraderRegistry;
|
|
2907
3186
|
}
|
|
2908
3187
|
/**
|
|
2909
|
-
* Factory function that creates an
|
|
3188
|
+
* Factory function that creates a Grader instance from a config.
|
|
2910
3189
|
*
|
|
2911
3190
|
* Factory functions handle all type-specific initialization logic:
|
|
2912
3191
|
* - Reading prompt files for LLM graders
|
|
2913
3192
|
* - Resolving script paths for code graders
|
|
2914
3193
|
* - Creating adapter evaluators for deterministic assertions
|
|
2915
3194
|
*/
|
|
2916
|
-
type
|
|
3195
|
+
type GraderFactoryFn = (config: GraderConfig, context: GraderDispatchContext) => Grader | Promise<Grader>;
|
|
2917
3196
|
/**
|
|
2918
|
-
* Registry of
|
|
3197
|
+
* Registry of grader factory functions keyed by grader type name.
|
|
2919
3198
|
*
|
|
2920
3199
|
* Built-in evaluators are registered at startup. Custom evaluators can be
|
|
2921
3200
|
* registered via the `register()` method or discovered from `.agentv/assertions/`.
|
|
2922
3201
|
*/
|
|
2923
|
-
declare class
|
|
3202
|
+
declare class GraderRegistry {
|
|
2924
3203
|
private readonly factories;
|
|
2925
|
-
/** Register a factory function for an
|
|
2926
|
-
register(type: string, factory:
|
|
2927
|
-
/** Get the factory function for an
|
|
2928
|
-
get(type: string):
|
|
3204
|
+
/** Register a factory function for a grader type. */
|
|
3205
|
+
register(type: string, factory: GraderFactoryFn): this;
|
|
3206
|
+
/** Get the factory function for a grader type. */
|
|
3207
|
+
get(type: string): GraderFactoryFn | undefined;
|
|
2929
3208
|
/** Check if a factory is registered for the given type. */
|
|
2930
3209
|
has(type: string): boolean;
|
|
2931
|
-
/** List all registered
|
|
3210
|
+
/** List all registered grader type names. */
|
|
2932
3211
|
list(): string[];
|
|
2933
3212
|
/**
|
|
2934
3213
|
* Create an evaluator instance from a config, using the registered factory.
|
|
2935
|
-
* Throws if no factory is registered for the
|
|
3214
|
+
* Throws if no factory is registered for the grader type.
|
|
2936
3215
|
*/
|
|
2937
|
-
create(config:
|
|
3216
|
+
create(config: GraderConfig, context: GraderDispatchContext): Promise<Grader>;
|
|
2938
3217
|
}
|
|
2939
3218
|
/**
|
|
2940
|
-
* Adapter that wraps a synchronous assertion function as an
|
|
3219
|
+
* Adapter that wraps a synchronous assertion function as a Grader.
|
|
2941
3220
|
* Used for deterministic assertions (contains, regex, is-json, equals).
|
|
2942
3221
|
*/
|
|
2943
|
-
declare class
|
|
3222
|
+
declare class DeterministicAssertionGrader implements Grader {
|
|
2944
3223
|
private readonly assertFn;
|
|
2945
3224
|
readonly kind: string;
|
|
2946
3225
|
constructor(kind: string, assertFn: (context: EvaluationContext) => EvaluationScore);
|
|
@@ -2988,8 +3267,8 @@ interface RunEvalCaseOptions {
|
|
|
2988
3267
|
readonly evalCase: EvalTest;
|
|
2989
3268
|
readonly provider: Provider;
|
|
2990
3269
|
readonly target: ResolvedTarget;
|
|
2991
|
-
readonly evaluators: Partial<Record<string,
|
|
2992
|
-
readonly 'llm-grader':
|
|
3270
|
+
readonly evaluators: Partial<Record<string, Grader>> & {
|
|
3271
|
+
readonly 'llm-grader': Grader;
|
|
2993
3272
|
};
|
|
2994
3273
|
readonly now?: () => Date;
|
|
2995
3274
|
readonly maxRetries?: number;
|
|
@@ -3020,8 +3299,8 @@ interface RunEvalCaseOptions {
|
|
|
3020
3299
|
readonly suiteWorkspaceFile?: string;
|
|
3021
3300
|
/** Real-time observability callbacks passed to the provider */
|
|
3022
3301
|
readonly streamCallbacks?: ProviderStreamCallbacks;
|
|
3023
|
-
/**
|
|
3024
|
-
readonly typeRegistry?:
|
|
3302
|
+
/** Grader type registry (with custom assertions discovered) */
|
|
3303
|
+
readonly typeRegistry?: GraderRegistry;
|
|
3025
3304
|
/** RepoManager instance for repo lifecycle (shared workspace mode) */
|
|
3026
3305
|
readonly repoManager?: RepoManager;
|
|
3027
3306
|
/** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
|
|
@@ -3054,7 +3333,7 @@ interface RunEvaluationOptions {
|
|
|
3054
3333
|
readonly targets?: readonly TargetDefinition[];
|
|
3055
3334
|
readonly env?: EnvLookup;
|
|
3056
3335
|
readonly providerFactory?: (target: ResolvedTarget) => Provider;
|
|
3057
|
-
readonly evaluators?: Partial<Record<string,
|
|
3336
|
+
readonly evaluators?: Partial<Record<string, Grader>>;
|
|
3058
3337
|
readonly maxRetries?: number;
|
|
3059
3338
|
readonly agentTimeoutMs?: number;
|
|
3060
3339
|
readonly cache?: EvaluationCache;
|
|
@@ -3076,7 +3355,7 @@ interface RunEvaluationOptions {
|
|
|
3076
3355
|
/** Real-time observability callbacks passed to the provider */
|
|
3077
3356
|
readonly streamCallbacks?: ProviderStreamCallbacks;
|
|
3078
3357
|
/** Suite-level total cost budget in USD (stops dispatching when exceeded) */
|
|
3079
|
-
readonly
|
|
3358
|
+
readonly budgetUsd?: number;
|
|
3080
3359
|
/** Execution error tolerance: true halts on first error */
|
|
3081
3360
|
readonly failOnError?: FailOnError;
|
|
3082
3361
|
/** Workspace pooling: true (default) enables pool, false disables, undefined defaults to true */
|
|
@@ -3107,244 +3386,6 @@ interface RunEvaluationOptions {
|
|
|
3107
3386
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
3108
3387
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
3109
3388
|
|
|
3110
|
-
/**
|
|
3111
|
-
* Types for inline assertion functions used in the evaluate() API.
|
|
3112
|
-
*
|
|
3113
|
-
* Inline functions are the escape hatch for custom evaluation logic
|
|
3114
|
-
* that doesn't fit a built-in evaluator type. For built-in assertions
|
|
3115
|
-
* (contains, regex, is-json, etc.), use config objects instead:
|
|
3116
|
-
*
|
|
3117
|
-
* assert: [{ type: 'contains', value: 'hello' }]
|
|
3118
|
-
*
|
|
3119
|
-
* Inline functions are for custom logic:
|
|
3120
|
-
*
|
|
3121
|
-
* assert: [({ output }) => ({ name: 'len', score: output.length > 5 ? 1 : 0 })]
|
|
3122
|
-
*/
|
|
3123
|
-
/** Context passed to inline assertion functions */
|
|
3124
|
-
interface AssertContext {
|
|
3125
|
-
readonly input: string;
|
|
3126
|
-
readonly output: string;
|
|
3127
|
-
readonly expectedOutput?: string;
|
|
3128
|
-
readonly criteria?: string;
|
|
3129
|
-
readonly metadata?: Record<string, unknown>;
|
|
3130
|
-
}
|
|
3131
|
-
/** Result from an inline assertion function */
|
|
3132
|
-
interface AssertResult {
|
|
3133
|
-
readonly name: string;
|
|
3134
|
-
readonly score: number;
|
|
3135
|
-
readonly metadata?: Record<string, unknown>;
|
|
3136
|
-
}
|
|
3137
|
-
/** Inline assertion function signature */
|
|
3138
|
-
type AssertFn = (ctx: AssertContext) => AssertResult | Promise<AssertResult>;
|
|
3139
|
-
|
|
3140
|
-
/**
|
|
3141
|
-
* Programmatic API for running evaluations.
|
|
3142
|
-
*
|
|
3143
|
-
* Provides `evaluate()` — a high-level function for using AgentV as a library
|
|
3144
|
-
* instead of a CLI. The config shape mirrors the YAML structure for easy
|
|
3145
|
-
* translation between file-based and programmatic usage.
|
|
3146
|
-
*
|
|
3147
|
-
* @example Inline tests with config objects
|
|
3148
|
-
* ```typescript
|
|
3149
|
-
* import { evaluate } from '@agentv/core';
|
|
3150
|
-
*
|
|
3151
|
-
* const results = await evaluate({
|
|
3152
|
-
* tests: [
|
|
3153
|
-
* {
|
|
3154
|
-
* id: 'capital',
|
|
3155
|
-
* input: 'What is the capital of France?',
|
|
3156
|
-
* expectedOutput: 'Paris',
|
|
3157
|
-
* assert: [{ type: 'contains', value: 'Paris' }],
|
|
3158
|
-
* },
|
|
3159
|
-
* ],
|
|
3160
|
-
* target: { provider: 'mock_agent' },
|
|
3161
|
-
* });
|
|
3162
|
-
*
|
|
3163
|
-
* console.log(results.summary.passed, 'passed');
|
|
3164
|
-
* ```
|
|
3165
|
-
*
|
|
3166
|
-
* @example Inline tests with task function and custom assertion
|
|
3167
|
-
* ```typescript
|
|
3168
|
-
* import { evaluate } from '@agentv/core';
|
|
3169
|
-
*
|
|
3170
|
-
* const { summary } = await evaluate({
|
|
3171
|
-
* tests: [
|
|
3172
|
-
* {
|
|
3173
|
-
* id: 'echo',
|
|
3174
|
-
* input: 'hello',
|
|
3175
|
-
* expectedOutput: 'Echo: hello',
|
|
3176
|
-
* assert: [
|
|
3177
|
-
* { type: 'contains', value: 'hello' },
|
|
3178
|
-
* { type: 'equals' },
|
|
3179
|
-
* ({ output }) => ({ name: 'custom', score: output.length > 0 ? 1 : 0 }),
|
|
3180
|
-
* ],
|
|
3181
|
-
* },
|
|
3182
|
-
* ],
|
|
3183
|
-
* task: async (input) => `Echo: ${input}`,
|
|
3184
|
-
* });
|
|
3185
|
-
* ```
|
|
3186
|
-
*
|
|
3187
|
-
* @example File-based
|
|
3188
|
-
* ```typescript
|
|
3189
|
-
* const results = await evaluate({
|
|
3190
|
-
* specFile: './evals/EVAL.yaml',
|
|
3191
|
-
* target: { provider: 'claude_agent' },
|
|
3192
|
-
* });
|
|
3193
|
-
* ```
|
|
3194
|
-
*
|
|
3195
|
-
* @module
|
|
3196
|
-
*/
|
|
3197
|
-
|
|
3198
|
-
/**
|
|
3199
|
-
* Inline test definition for the programmatic API.
|
|
3200
|
-
* Mirrors the YAML test structure.
|
|
3201
|
-
*/
|
|
3202
|
-
interface EvalTestInput {
|
|
3203
|
-
/** Unique test identifier */
|
|
3204
|
-
readonly id: string;
|
|
3205
|
-
/** What the response should accomplish */
|
|
3206
|
-
readonly criteria?: string;
|
|
3207
|
-
/** Input to the agent (string or message array) */
|
|
3208
|
-
readonly input: string | readonly {
|
|
3209
|
-
role: string;
|
|
3210
|
-
content: string;
|
|
3211
|
-
}[];
|
|
3212
|
-
/** Expected reference output (camelCase preferred) */
|
|
3213
|
-
readonly expectedOutput?: string;
|
|
3214
|
-
/** @deprecated Use `expectedOutput` instead */
|
|
3215
|
-
readonly expected_output?: string;
|
|
3216
|
-
/** Assertion graders — accepts factory functions, config objects, or inline functions */
|
|
3217
|
-
readonly assert?: readonly AssertEntry[];
|
|
3218
|
-
/** Arbitrary metadata */
|
|
3219
|
-
readonly metadata?: Record<string, unknown>;
|
|
3220
|
-
}
|
|
3221
|
-
/**
|
|
3222
|
-
* Inline assertion definition for the programmatic API.
|
|
3223
|
-
* Matches the YAML `assert` block structure.
|
|
3224
|
-
*/
|
|
3225
|
-
interface EvalAssertionInput {
|
|
3226
|
-
/** Assertion type (e.g., 'contains', 'llm-grader', 'code-grader') */
|
|
3227
|
-
readonly type: string;
|
|
3228
|
-
/** Display name */
|
|
3229
|
-
readonly name?: string;
|
|
3230
|
-
/** Value for deterministic assertions (contains, equals, regex) */
|
|
3231
|
-
readonly value?: string;
|
|
3232
|
-
/** Weight for scoring */
|
|
3233
|
-
readonly weight?: number;
|
|
3234
|
-
/** Whether this assertion is required to pass */
|
|
3235
|
-
readonly required?: boolean | number;
|
|
3236
|
-
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
3237
|
-
readonly min_score?: number;
|
|
3238
|
-
/** Prompt file for llm_grader */
|
|
3239
|
-
readonly prompt?: string;
|
|
3240
|
-
/** Script for code_grader */
|
|
3241
|
-
readonly script?: string | readonly string[];
|
|
3242
|
-
/** Additional config passed to the assertion */
|
|
3243
|
-
readonly config?: Record<string, unknown>;
|
|
3244
|
-
/** Nested assertions for composite type */
|
|
3245
|
-
readonly assert?: readonly EvalAssertionInput[];
|
|
3246
|
-
/** Rubric criteria for rubrics type */
|
|
3247
|
-
readonly criteria?: readonly (string | {
|
|
3248
|
-
id?: string;
|
|
3249
|
-
outcome: string;
|
|
3250
|
-
weight?: number;
|
|
3251
|
-
})[];
|
|
3252
|
-
/** Additional properties */
|
|
3253
|
-
readonly [key: string]: unknown;
|
|
3254
|
-
}
|
|
3255
|
-
/** Assert entry: inline function or config object */
|
|
3256
|
-
type AssertEntry = AssertFn | EvalAssertionInput;
|
|
3257
|
-
/**
|
|
3258
|
-
* Configuration for `evaluate()`.
|
|
3259
|
-
* Accepts either inline tests or a spec file path.
|
|
3260
|
-
*/
|
|
3261
|
-
interface EvalConfig {
|
|
3262
|
-
/** Inline test definitions (mutually exclusive with specFile) */
|
|
3263
|
-
readonly tests?: readonly EvalTestInput[];
|
|
3264
|
-
/** Path to an EVAL.yaml spec file (mutually exclusive with tests) */
|
|
3265
|
-
readonly specFile?: string;
|
|
3266
|
-
/** Target provider configuration */
|
|
3267
|
-
readonly target?: TargetDefinition;
|
|
3268
|
-
/** Custom task function — mutually exclusive with target */
|
|
3269
|
-
readonly task?: (input: string) => string | Promise<string>;
|
|
3270
|
-
/** Suite-level assertions applied to all tests */
|
|
3271
|
-
readonly assert?: readonly AssertEntry[];
|
|
3272
|
-
/** Filter tests by ID pattern(s) (glob supported). Arrays use OR logic. */
|
|
3273
|
-
readonly filter?: string | readonly string[];
|
|
3274
|
-
/** Maximum concurrent workers (default: 3) */
|
|
3275
|
-
readonly workers?: number;
|
|
3276
|
-
/** Maximum retries on failure (default: 2) */
|
|
3277
|
-
readonly maxRetries?: number;
|
|
3278
|
-
/** Agent timeout in milliseconds. No timeout if not set. */
|
|
3279
|
-
readonly agentTimeoutMs?: number;
|
|
3280
|
-
/** Enable response caching */
|
|
3281
|
-
readonly cache?: boolean;
|
|
3282
|
-
/** Verbose logging */
|
|
3283
|
-
readonly verbose?: boolean;
|
|
3284
|
-
/** Callback for each completed result */
|
|
3285
|
-
readonly onResult?: (result: EvaluationResult) => void;
|
|
3286
|
-
/** Score threshold for pass/fail (0-1). Default: 0.8 (DEFAULT_THRESHOLD). */
|
|
3287
|
-
readonly threshold?: number;
|
|
3288
|
-
}
|
|
3289
|
-
/**
|
|
3290
|
-
* Summary statistics for an evaluation run.
|
|
3291
|
-
*/
|
|
3292
|
-
interface EvalSummary {
|
|
3293
|
-
/** Total number of test cases */
|
|
3294
|
-
readonly total: number;
|
|
3295
|
-
/** Number of passing test cases (score >= threshold) */
|
|
3296
|
-
readonly passed: number;
|
|
3297
|
-
/** Number of failing test cases (score < threshold) */
|
|
3298
|
-
readonly failed: number;
|
|
3299
|
-
/** Total duration in milliseconds */
|
|
3300
|
-
readonly durationMs: number;
|
|
3301
|
-
/** Mean score across all cases */
|
|
3302
|
-
readonly meanScore: number;
|
|
3303
|
-
}
|
|
3304
|
-
/**
|
|
3305
|
-
* Result of an `evaluate()` call.
|
|
3306
|
-
*/
|
|
3307
|
-
interface EvalRunResult {
|
|
3308
|
-
/** Individual test case results */
|
|
3309
|
-
readonly results: readonly EvaluationResult[];
|
|
3310
|
-
/** Aggregate summary statistics */
|
|
3311
|
-
readonly summary: EvalSummary;
|
|
3312
|
-
}
|
|
3313
|
-
/**
|
|
3314
|
-
* Run an evaluation suite against a target provider.
|
|
3315
|
-
*
|
|
3316
|
-
* Accepts either inline test definitions or a path to an EVAL.yaml spec file.
|
|
3317
|
-
* The config shape mirrors the YAML structure — users can translate between
|
|
3318
|
-
* file-based and programmatic usage 1:1.
|
|
3319
|
-
*
|
|
3320
|
-
* @param config - Evaluation configuration
|
|
3321
|
-
* @returns Typed evaluation results with summary statistics
|
|
3322
|
-
*
|
|
3323
|
-
* @example Inline tests with assertions
|
|
3324
|
-
* ```typescript
|
|
3325
|
-
* const { results, summary } = await evaluate({
|
|
3326
|
-
* tests: [
|
|
3327
|
-
* {
|
|
3328
|
-
* id: 'greeting',
|
|
3329
|
-
* input: 'Say hello',
|
|
3330
|
-
* assert: [{ type: 'contains', value: 'hello' }],
|
|
3331
|
-
* },
|
|
3332
|
-
* ],
|
|
3333
|
-
* target: { provider: 'mock_agent' },
|
|
3334
|
-
* });
|
|
3335
|
-
* console.log(`${summary.passed}/${summary.total} passed`);
|
|
3336
|
-
* ```
|
|
3337
|
-
*
|
|
3338
|
-
* @example Load from YAML
|
|
3339
|
-
* ```typescript
|
|
3340
|
-
* const { summary } = await evaluate({
|
|
3341
|
-
* specFile: './evals/my-eval.yaml',
|
|
3342
|
-
* filter: 'greeting-*',
|
|
3343
|
-
* });
|
|
3344
|
-
* ```
|
|
3345
|
-
*/
|
|
3346
|
-
declare function evaluate(config: EvalConfig): Promise<EvalRunResult>;
|
|
3347
|
-
|
|
3348
3389
|
/**
|
|
3349
3390
|
* Typed configuration file support for AgentV.
|
|
3350
3391
|
*
|
|
@@ -4186,17 +4227,17 @@ declare class OtlpJsonFileExporter {
|
|
|
4186
4227
|
}
|
|
4187
4228
|
|
|
4188
4229
|
/**
|
|
4189
|
-
* Factory functions for all built-in
|
|
4230
|
+
* Factory functions for all built-in grader types.
|
|
4190
4231
|
*
|
|
4191
|
-
* Each factory creates an
|
|
4232
|
+
* Each factory creates a Grader instance from a GraderConfig,
|
|
4192
4233
|
* handling type-specific initialization logic. These are registered into
|
|
4193
|
-
* the
|
|
4234
|
+
* the GraderRegistry at startup.
|
|
4194
4235
|
*/
|
|
4195
4236
|
|
|
4196
4237
|
/**
|
|
4197
|
-
* Create a new
|
|
4238
|
+
* Create a new GraderRegistry with all built-in grader types registered.
|
|
4198
4239
|
*/
|
|
4199
|
-
declare function createBuiltinRegistry():
|
|
4240
|
+
declare function createBuiltinRegistry(): GraderRegistry;
|
|
4200
4241
|
|
|
4201
4242
|
/**
|
|
4202
4243
|
* Convention-based discovery of custom assertion scripts.
|
|
@@ -4216,27 +4257,27 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
|
|
|
4216
4257
|
* @param baseDir - The base directory to search from (typically project root or eval file dir)
|
|
4217
4258
|
* @returns Names of discovered assertion types
|
|
4218
4259
|
*/
|
|
4219
|
-
declare function discoverAssertions(registry:
|
|
4260
|
+
declare function discoverAssertions(registry: GraderRegistry, baseDir: string): Promise<string[]>;
|
|
4220
4261
|
|
|
4221
4262
|
/**
|
|
4222
4263
|
* Convention-based discovery of custom grader scripts.
|
|
4223
4264
|
*
|
|
4224
4265
|
* Scans `.agentv/graders/` (and legacy `.agentv/judges/`) for TypeScript/JavaScript
|
|
4225
|
-
* files and registers them as code
|
|
4226
|
-
* (without extension) becomes the
|
|
4266
|
+
* files and registers them as code graders in the registry. The file name
|
|
4267
|
+
* (without extension) becomes the grader type name.
|
|
4227
4268
|
*
|
|
4228
4269
|
* Example: `.agentv/graders/custom-grader.ts` → type "custom-grader" in EVAL.yaml
|
|
4229
4270
|
*/
|
|
4230
4271
|
|
|
4231
4272
|
/**
|
|
4232
4273
|
* Discover custom grader scripts from `.agentv/graders/` (and legacy `.agentv/judges/`)
|
|
4233
|
-
* and register them as
|
|
4274
|
+
* and register them as grader types in the registry.
|
|
4234
4275
|
*
|
|
4235
|
-
* @param registry - The
|
|
4276
|
+
* @param registry - The grader registry to register discovered graders into
|
|
4236
4277
|
* @param baseDir - The base directory to search from (typically project root or eval file dir)
|
|
4237
4278
|
* @returns Names of discovered grader types
|
|
4238
4279
|
*/
|
|
4239
|
-
declare function discoverGraders(registry:
|
|
4280
|
+
declare function discoverGraders(registry: GraderRegistry, baseDir: string): Promise<string[]>;
|
|
4240
4281
|
|
|
4241
4282
|
/**
|
|
4242
4283
|
* Core types for the transcript import pipeline.
|
|
@@ -4489,7 +4530,7 @@ declare function discoverClaudeSessions(opts?: ClaudeDiscoverOptions): Promise<C
|
|
|
4489
4530
|
* 1. Reads a transcript JSONL file (produced by `agentv import`)
|
|
4490
4531
|
* 2. Each invocation pops the next line from the transcript
|
|
4491
4532
|
* 3. Returns a ProviderResponse with pre-populated output, token usage, etc.
|
|
4492
|
-
* 4.
|
|
4533
|
+
* 4. Graders run identically to live eval — they see the same ProviderResponse
|
|
4493
4534
|
*
|
|
4494
4535
|
* The provider name in results is set to the source provider from the transcript
|
|
4495
4536
|
* (e.g., "claude", "codex", "copilot").
|
|
@@ -4555,4 +4596,4 @@ type AgentKernel = {
|
|
|
4555
4596
|
};
|
|
4556
4597
|
declare function createAgentKernel(): AgentKernel;
|
|
4557
4598
|
|
|
4558
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type
|
|
4599
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type 
EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type 
RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, 
createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, 
rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|