@agentv/core 4.17.1-next.1 → 4.18.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -341,7 +341,7 @@ type ArgsMatchMode = 'exact' | 'ignore' | 'subset' | 'superset';
341
341
  /**
342
342
  * Configuration for tool-trajectory evaluator.
343
343
  */
344
- interface ToolTrajectoryEvaluatorConfig {
344
+ interface ToolTrajectoryGraderConfig {
345
345
  readonly name: string;
346
346
  readonly type: 'tool-trajectory';
347
347
  /** Matching mode */
@@ -355,7 +355,7 @@ interface ToolTrajectoryEvaluatorConfig {
355
355
  readonly required?: boolean | number;
356
356
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
357
357
  readonly min_score?: number;
358
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
358
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
359
359
  readonly negate?: boolean;
360
360
  /** Default argument matching mode for all expected items (defaults to 'exact') */
361
361
  readonly argsMatch?: ArgsMatchMode | readonly string[];
@@ -539,9 +539,9 @@ declare function isJsonValue(value: unknown): value is JsonValue;
539
539
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
540
540
  */
541
541
  declare function isTestMessage(value: unknown): value is TestMessage;
542
- declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
543
- type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
544
- declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
542
+ declare const GRADER_KIND_VALUES: readonly ["code-grader", "llm-grader", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
543
+ type GraderKind = (typeof GRADER_KIND_VALUES)[number];
544
+ declare function isGraderKind(value: unknown): value is GraderKind;
545
545
  /**
546
546
  * Configuration for enabling target access in code-grader evaluators.
547
547
  * When present, the runtime will start a local proxy server that allows
@@ -697,7 +697,7 @@ type WorkspaceConfig = {
697
697
  * relative paths from their own directory, not the eval file's directory. */
698
698
  readonly workspaceFileDir?: string;
699
699
  };
700
- type CodeEvaluatorConfig = {
700
+ type CodeGraderConfig = {
701
701
  readonly name: string;
702
702
  readonly type: 'code-grader';
703
703
  readonly command: readonly string[];
@@ -710,7 +710,7 @@ type CodeEvaluatorConfig = {
710
710
  readonly required?: boolean | number;
711
711
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
712
712
  readonly min_score?: number;
713
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
713
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
714
714
  readonly negate?: boolean;
715
715
  /** Pass-through configuration for the code-grader (any unrecognized YAML properties) */
716
716
  readonly config?: JsonObject;
@@ -739,7 +739,7 @@ type ContentPreprocessorConfig = {
739
739
  /** Resolved absolute path for the command script (last argv element) */
740
740
  readonly resolvedCommand?: readonly string[];
741
741
  };
742
- type LlmGraderEvaluatorConfig = {
742
+ type LlmGraderConfig = {
743
743
  readonly name: string;
744
744
  readonly type: 'llm-grader';
745
745
  /** Text prompt (inline or file path) or executable script config */
@@ -754,7 +754,7 @@ type LlmGraderEvaluatorConfig = {
754
754
  readonly required?: boolean | number;
755
755
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
756
756
  readonly min_score?: number;
757
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
757
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
758
758
  readonly negate?: boolean;
759
759
  /** Optional target override for this grader (uses a named LLM target from targets.yaml). */
760
760
  readonly target?: string;
@@ -767,8 +767,6 @@ type LlmGraderEvaluatorConfig = {
767
767
  /** Optional content preprocessors for ContentFile blocks in assistant output */
768
768
  readonly preprocessors?: readonly ContentPreprocessorConfig[];
769
769
  };
770
- /** @deprecated Use `LlmGraderEvaluatorConfig` instead */
771
- type LlmJudgeEvaluatorConfig = LlmGraderEvaluatorConfig;
772
770
  /**
773
771
  * Score range definition for analytic rubric scoring.
774
772
  * Each range maps an integer score band (0-10) to an outcome description.
@@ -830,16 +828,16 @@ type CompositeAggregatorConfig = {
830
828
  readonly type: 'threshold';
831
829
  readonly threshold: number;
832
830
  };
833
- type CompositeEvaluatorConfig = {
831
+ type CompositeGraderConfig = {
834
832
  readonly name: string;
835
833
  readonly type: 'composite';
836
- readonly assertions: readonly EvaluatorConfig[];
834
+ readonly assertions: readonly GraderConfig[];
837
835
  readonly aggregator: CompositeAggregatorConfig;
838
836
  readonly weight?: number;
839
837
  readonly required?: boolean | number;
840
838
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
841
839
  readonly min_score?: number;
842
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
840
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
843
841
  readonly negate?: boolean;
844
842
  };
845
843
  /**
@@ -874,7 +872,7 @@ type FieldConfig = {
874
872
  /**
875
873
  * Configuration for the field-accuracy evaluator.
876
874
  */
877
- type FieldAccuracyEvaluatorConfig = {
875
+ type FieldAccuracyGraderConfig = {
878
876
  readonly name: string;
879
877
  readonly type: 'field-accuracy';
880
878
  /** Fields to compare between candidate and expected */
@@ -885,14 +883,14 @@ type FieldAccuracyEvaluatorConfig = {
885
883
  readonly required?: boolean | number;
886
884
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
887
885
  readonly min_score?: number;
888
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
886
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
889
887
  readonly negate?: boolean;
890
888
  };
891
889
  /**
892
890
  * Configuration for the latency evaluator.
893
891
  * Checks execution duration against a threshold.
894
892
  */
895
- type LatencyEvaluatorConfig = {
893
+ type LatencyGraderConfig = {
896
894
  readonly name: string;
897
895
  readonly type: 'latency';
898
896
  /** Maximum allowed duration in milliseconds */
@@ -901,14 +899,14 @@ type LatencyEvaluatorConfig = {
901
899
  readonly required?: boolean | number;
902
900
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
903
901
  readonly min_score?: number;
904
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
902
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
905
903
  readonly negate?: boolean;
906
904
  };
907
905
  /**
908
906
  * Configuration for the cost evaluator.
909
907
  * Checks execution cost against a budget.
910
908
  */
911
- type CostEvaluatorConfig = {
909
+ type CostGraderConfig = {
912
910
  readonly name: string;
913
911
  readonly type: 'cost';
914
912
  /** Maximum allowed cost in USD */
@@ -917,14 +915,14 @@ type CostEvaluatorConfig = {
917
915
  readonly required?: boolean | number;
918
916
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
919
917
  readonly min_score?: number;
920
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
918
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
921
919
  readonly negate?: boolean;
922
920
  };
923
921
  /**
924
922
  * Configuration for the token-usage evaluator.
925
923
  * Checks provider-reported token usage against configured limits.
926
924
  */
927
- type TokenUsageEvaluatorConfig = {
925
+ type TokenUsageGraderConfig = {
928
926
  readonly name: string;
929
927
  readonly type: 'token-usage';
930
928
  /** Maximum allowed total tokens (input + output + cached, when present) */
@@ -937,7 +935,7 @@ type TokenUsageEvaluatorConfig = {
937
935
  readonly required?: boolean | number;
938
936
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
939
937
  readonly min_score?: number;
940
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
938
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
941
939
  readonly negate?: boolean;
942
940
  };
943
941
  /**
@@ -945,7 +943,7 @@ type TokenUsageEvaluatorConfig = {
945
943
  * Provides declarative threshold-based checks on execution metrics.
946
944
  * Only specified thresholds are checked; omitted ones are ignored.
947
945
  */
948
- type ExecutionMetricsEvaluatorConfig = {
946
+ type ExecutionMetricsGraderConfig = {
949
947
  readonly name: string;
950
948
  readonly type: 'execution-metrics';
951
949
  /** Maximum allowed number of tool calls */
@@ -966,14 +964,14 @@ type ExecutionMetricsEvaluatorConfig = {
966
964
  readonly required?: boolean | number;
967
965
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
968
966
  readonly min_score?: number;
969
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
967
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
970
968
  readonly negate?: boolean;
971
969
  };
972
970
  /**
973
971
  * Configuration for the contains assertion evaluator.
974
972
  * Checks whether the candidate output contains a specified substring.
975
973
  */
976
- type ContainsEvaluatorConfig = {
974
+ type ContainsGraderConfig = {
977
975
  readonly name: string;
978
976
  readonly type: 'contains';
979
977
  readonly value: string;
@@ -981,14 +979,14 @@ type ContainsEvaluatorConfig = {
981
979
  readonly required?: boolean | number;
982
980
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
983
981
  readonly min_score?: number;
984
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
982
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
985
983
  readonly negate?: boolean;
986
984
  };
987
985
  /**
988
986
  * Configuration for the contains_any assertion evaluator.
989
987
  * Checks whether the candidate output contains ANY of the specified substrings.
990
988
  */
991
- type ContainsAnyEvaluatorConfig = {
989
+ type ContainsAnyGraderConfig = {
992
990
  readonly name: string;
993
991
  readonly type: 'contains-any';
994
992
  readonly value: readonly string[];
@@ -996,14 +994,14 @@ type ContainsAnyEvaluatorConfig = {
996
994
  readonly required?: boolean | number;
997
995
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
998
996
  readonly min_score?: number;
999
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
997
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1000
998
  readonly negate?: boolean;
1001
999
  };
1002
1000
  /**
1003
1001
  * Configuration for the contains_all assertion evaluator.
1004
1002
  * Checks whether the candidate output contains ALL of the specified substrings.
1005
1003
  */
1006
- type ContainsAllEvaluatorConfig = {
1004
+ type ContainsAllGraderConfig = {
1007
1005
  readonly name: string;
1008
1006
  readonly type: 'contains-all';
1009
1007
  readonly value: readonly string[];
@@ -1011,14 +1009,14 @@ type ContainsAllEvaluatorConfig = {
1011
1009
  readonly required?: boolean | number;
1012
1010
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1013
1011
  readonly min_score?: number;
1014
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1012
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1015
1013
  readonly negate?: boolean;
1016
1014
  };
1017
1015
  /**
1018
1016
  * Configuration for the icontains assertion evaluator.
1019
1017
  * Case-insensitive check whether the candidate output contains a specified substring.
1020
1018
  */
1021
- type IcontainsEvaluatorConfig = {
1019
+ type IcontainsGraderConfig = {
1022
1020
  readonly name: string;
1023
1021
  readonly type: 'icontains';
1024
1022
  readonly value: string;
@@ -1026,14 +1024,14 @@ type IcontainsEvaluatorConfig = {
1026
1024
  readonly required?: boolean | number;
1027
1025
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1028
1026
  readonly min_score?: number;
1029
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1027
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1030
1028
  readonly negate?: boolean;
1031
1029
  };
1032
1030
  /**
1033
1031
  * Configuration for the icontains_any assertion evaluator.
1034
1032
  * Case-insensitive check whether the candidate output contains ANY of the specified substrings.
1035
1033
  */
1036
- type IcontainsAnyEvaluatorConfig = {
1034
+ type IcontainsAnyGraderConfig = {
1037
1035
  readonly name: string;
1038
1036
  readonly type: 'icontains-any';
1039
1037
  readonly value: readonly string[];
@@ -1041,14 +1039,14 @@ type IcontainsAnyEvaluatorConfig = {
1041
1039
  readonly required?: boolean | number;
1042
1040
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1043
1041
  readonly min_score?: number;
1044
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1042
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1045
1043
  readonly negate?: boolean;
1046
1044
  };
1047
1045
  /**
1048
1046
  * Configuration for the icontains_all assertion evaluator.
1049
1047
  * Case-insensitive check whether the candidate output contains ALL of the specified substrings.
1050
1048
  */
1051
- type IcontainsAllEvaluatorConfig = {
1049
+ type IcontainsAllGraderConfig = {
1052
1050
  readonly name: string;
1053
1051
  readonly type: 'icontains-all';
1054
1052
  readonly value: readonly string[];
@@ -1056,14 +1054,14 @@ type IcontainsAllEvaluatorConfig = {
1056
1054
  readonly required?: boolean | number;
1057
1055
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1058
1056
  readonly min_score?: number;
1059
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1057
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1060
1058
  readonly negate?: boolean;
1061
1059
  };
1062
1060
  /**
1063
1061
  * Configuration for the starts_with assertion evaluator.
1064
1062
  * Checks whether the candidate output starts with a specified string (both trimmed).
1065
1063
  */
1066
- type StartsWithEvaluatorConfig = {
1064
+ type StartsWithGraderConfig = {
1067
1065
  readonly name: string;
1068
1066
  readonly type: 'starts-with';
1069
1067
  readonly value: string;
@@ -1071,14 +1069,14 @@ type StartsWithEvaluatorConfig = {
1071
1069
  readonly required?: boolean | number;
1072
1070
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1073
1071
  readonly min_score?: number;
1074
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1072
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1075
1073
  readonly negate?: boolean;
1076
1074
  };
1077
1075
  /**
1078
1076
  * Configuration for the ends_with assertion evaluator.
1079
1077
  * Checks whether the candidate output ends with a specified string (both trimmed).
1080
1078
  */
1081
- type EndsWithEvaluatorConfig = {
1079
+ type EndsWithGraderConfig = {
1082
1080
  readonly name: string;
1083
1081
  readonly type: 'ends-with';
1084
1082
  readonly value: string;
@@ -1086,14 +1084,14 @@ type EndsWithEvaluatorConfig = {
1086
1084
  readonly required?: boolean | number;
1087
1085
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1088
1086
  readonly min_score?: number;
1089
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1087
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1090
1088
  readonly negate?: boolean;
1091
1089
  };
1092
1090
  /**
1093
1091
  * Configuration for the regex assertion evaluator.
1094
1092
  * Checks whether the candidate output matches a regular expression pattern.
1095
1093
  */
1096
- type RegexEvaluatorConfig = {
1094
+ type RegexGraderConfig = {
1097
1095
  readonly name: string;
1098
1096
  readonly type: 'regex';
1099
1097
  readonly value: string;
@@ -1103,28 +1101,28 @@ type RegexEvaluatorConfig = {
1103
1101
  readonly required?: boolean | number;
1104
1102
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1105
1103
  readonly min_score?: number;
1106
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1104
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1107
1105
  readonly negate?: boolean;
1108
1106
  };
1109
1107
  /**
1110
1108
  * Configuration for the is_json assertion evaluator.
1111
1109
  * Checks whether the candidate output is valid JSON.
1112
1110
  */
1113
- type IsJsonEvaluatorConfig = {
1111
+ type IsJsonGraderConfig = {
1114
1112
  readonly name: string;
1115
1113
  readonly type: 'is-json';
1116
1114
  readonly weight?: number;
1117
1115
  readonly required?: boolean | number;
1118
1116
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1119
1117
  readonly min_score?: number;
1120
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1118
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1121
1119
  readonly negate?: boolean;
1122
1120
  };
1123
1121
  /**
1124
1122
  * Configuration for the equals assertion evaluator.
1125
1123
  * Checks whether the candidate output exactly equals a specified string.
1126
1124
  */
1127
- type EqualsEvaluatorConfig = {
1125
+ type EqualsGraderConfig = {
1128
1126
  readonly name: string;
1129
1127
  readonly type: 'equals';
1130
1128
  readonly value: string;
@@ -1132,7 +1130,7 @@ type EqualsEvaluatorConfig = {
1132
1130
  readonly required?: boolean | number;
1133
1131
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1134
1132
  readonly min_score?: number;
1135
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1133
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1136
1134
  readonly negate?: boolean;
1137
1135
  };
1138
1136
  /**
@@ -1147,7 +1145,7 @@ type RubricsEvaluatorConfig = {
1147
1145
  readonly required?: boolean | number;
1148
1146
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1149
1147
  readonly min_score?: number;
1150
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1148
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1151
1149
  readonly negate?: boolean;
1152
1150
  };
1153
1151
  /**
@@ -1156,7 +1154,7 @@ type RubricsEvaluatorConfig = {
1156
1154
  * Tool-name resolution is automatic based on the provider kind.
1157
1155
  * For providers not covered by the built-in mapping, use a code-grader.
1158
1156
  */
1159
- type SkillTriggerEvaluatorConfig = {
1157
+ type SkillTriggerGraderConfig = {
1160
1158
  readonly name: string;
1161
1159
  readonly type: 'skill-trigger';
1162
1160
  /** The skill name to check for (case-sensitive substring match) */
@@ -1182,7 +1180,7 @@ type InlineAssertEvaluatorConfig = {
1182
1180
  readonly min_score?: number;
1183
1181
  readonly negate?: boolean;
1184
1182
  };
1185
- type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | SkillTriggerEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
1183
+ type GraderConfig = CodeGraderConfig | LlmGraderConfig | CompositeGraderConfig | ToolTrajectoryGraderConfig | FieldAccuracyGraderConfig | LatencyGraderConfig | CostGraderConfig | TokenUsageGraderConfig | ExecutionMetricsGraderConfig | SkillTriggerGraderConfig | ContainsGraderConfig | ContainsAnyGraderConfig | ContainsAllGraderConfig | IcontainsGraderConfig | IcontainsAnyGraderConfig | IcontainsAllGraderConfig | StartsWithGraderConfig | EndsWithGraderConfig | RegexGraderConfig | IsJsonGraderConfig | EqualsGraderConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
1186
1184
  /**
1187
1185
  * A single turn in a multi-turn conversation evaluation.
1188
1186
  * Each turn is a user message. The runner generates the assistant response.
@@ -1193,7 +1191,7 @@ interface ConversationTurn {
1193
1191
  /** Reference assistant response for grading (NOT carried forward — actual LLM response is used) */
1194
1192
  readonly expected_output?: TestMessageContent;
1195
1193
  /** Per-turn assertions. Strings become rubric criteria via shorthand. */
1196
- readonly assertions?: readonly (string | EvaluatorConfig)[];
1194
+ readonly assertions?: readonly (string | GraderConfig)[];
1197
1195
  }
1198
1196
  /**
1199
1197
  * Conversation evaluation mode.
@@ -1228,8 +1226,8 @@ interface EvalTest {
1228
1226
  readonly reference_answer?: string;
1229
1227
  readonly file_paths: readonly string[];
1230
1228
  readonly criteria: string;
1231
- readonly evaluator?: EvaluatorKind;
1232
- readonly assertions?: readonly EvaluatorConfig[];
1229
+ readonly evaluator?: GraderKind;
1230
+ readonly assertions?: readonly GraderConfig[];
1233
1231
  /** Suite-level preprocessors used by the implicit default llm-grader. */
1234
1232
  readonly preprocessors?: readonly ContentPreprocessorConfig[];
1235
1233
  /** Workspace configuration (merged from suite-level and case-level) */
@@ -1293,7 +1291,7 @@ interface TrialResult {
1293
1291
  readonly attempt: number;
1294
1292
  readonly score: number;
1295
1293
  readonly verdict: EvaluationVerdict;
1296
- readonly scores?: readonly EvaluatorResult[];
1294
+ readonly scores?: readonly GraderResult[];
1297
1295
  readonly error?: string;
1298
1296
  readonly costUsd?: number;
1299
1297
  /** Primary classification for this trial attempt */
@@ -1359,7 +1357,7 @@ interface ExecutionError {
1359
1357
  */
1360
1358
  type FailOnError = boolean;
1361
1359
  /**
1362
- * Evaluator scorecard for a single eval case run.
1360
+ * Grader scorecard for a single eval case run.
1363
1361
  */
1364
1362
  interface EvaluationResult {
1365
1363
  readonly timestamp: string;
@@ -1390,7 +1388,7 @@ interface EvaluationResult {
1390
1388
  readonly lm?: JsonObject;
1391
1389
  readonly evaluator?: JsonObject;
1392
1390
  };
1393
- readonly scores?: readonly EvaluatorResult[];
1391
+ readonly scores?: readonly GraderResult[];
1394
1392
  readonly error?: string;
1395
1393
  /** Lightweight summary of the execution trace (always included when available) */
1396
1394
  readonly trace?: TraceSummary;
@@ -1433,9 +1431,9 @@ interface EvaluationResult {
1433
1431
  readonly executionError?: ExecutionError;
1434
1432
  }
1435
1433
  type EvaluationVerdict = 'pass' | 'fail' | 'skip';
1436
- interface EvaluatorResult {
1434
+ interface GraderResult {
1437
1435
  readonly name: string;
1438
- readonly type: EvaluatorKind;
1436
+ readonly type: GraderKind;
1439
1437
  readonly score: number;
1440
1438
  readonly weight?: number;
1441
1439
  readonly verdict?: EvaluationVerdict;
@@ -1444,7 +1442,7 @@ interface EvaluatorResult {
1444
1442
  readonly input?: JsonObject;
1445
1443
  /** Target name used for grading (e.g., the LLM provider name). */
1446
1444
  readonly target?: string;
1447
- readonly scores?: readonly EvaluatorResult[];
1445
+ readonly scores?: readonly GraderResult[];
1448
1446
  /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
1449
1447
  readonly details?: JsonObject;
1450
1448
  /** Token usage from LLM calls made by this evaluator (optional). */
@@ -1642,7 +1640,7 @@ type EvalSuiteResult = {
1642
1640
  /** Suite-level metadata (name, description, version, etc.) */
1643
1641
  readonly metadata?: EvalMetadata;
1644
1642
  /** Suite-level total cost budget in USD */
1645
- readonly totalBudgetUsd?: number;
1643
+ readonly budgetUsd?: number;
1646
1644
  /** Execution error tolerance: true or false */
1647
1645
  readonly failOnError?: FailOnError;
1648
1646
  /** Suite-level quality threshold (0-1) — suite fails if mean score is below */
@@ -2346,8 +2344,8 @@ interface EvaluationContext {
2346
2344
  readonly graderProvider?: Provider;
2347
2345
  /** @deprecated Use `graderProvider` instead */
2348
2346
  readonly judgeProvider?: Provider;
2349
- readonly evaluatorTemplateOverride?: string;
2350
- readonly evaluator?: EvaluatorConfig;
2347
+ readonly graderTemplateOverride?: string;
2348
+ readonly evaluator?: GraderConfig;
2351
2349
  /** Output messages from agent execution (primary source for tool trajectory) */
2352
2350
  readonly output?: readonly Message[];
2353
2351
  /** Lightweight summary of trace events (if available) */
@@ -2380,8 +2378,8 @@ interface EvaluationScore {
2380
2378
  readonly verdict: EvaluationVerdict;
2381
2379
  readonly assertions: readonly AssertionEntry[];
2382
2380
  readonly expectedAspectCount: number;
2383
- readonly evaluatorRawRequest?: JsonObject;
2384
- readonly scores?: readonly ChildEvaluatorResult[];
2381
+ readonly graderRawRequest?: JsonObject;
2382
+ readonly scores?: readonly ChildGraderResult[];
2385
2383
  /** Optional structured details from evaluators (e.g., TP/TN/FP/FN counts, alignments, per-turn scores). */
2386
2384
  readonly details?: JsonObject;
2387
2385
  /** Token usage from LLM calls made by this evaluator (optional). */
@@ -2389,26 +2387,26 @@ interface EvaluationScore {
2389
2387
  /** Target name used for grading (e.g., the LLM provider). */
2390
2388
  readonly graderTarget?: string;
2391
2389
  }
2392
- interface ChildEvaluatorResult {
2390
+ interface ChildGraderResult {
2393
2391
  readonly name: string;
2394
2392
  readonly type: string;
2395
2393
  readonly score: number;
2396
2394
  readonly weight?: number;
2397
2395
  readonly verdict: EvaluationVerdict;
2398
2396
  readonly assertions: readonly AssertionEntry[];
2399
- readonly evaluatorRawRequest?: JsonObject;
2400
- readonly scores?: readonly ChildEvaluatorResult[];
2397
+ readonly graderRawRequest?: JsonObject;
2398
+ readonly scores?: readonly ChildGraderResult[];
2401
2399
  /** Optional structured details from evaluators (e.g., TP/TN/FP/FN counts, alignments, per-turn scores). */
2402
2400
  readonly details?: JsonObject;
2403
2401
  /** Token usage from LLM calls made by this evaluator (optional). */
2404
2402
  readonly tokenUsage?: TokenUsage;
2405
2403
  }
2406
- interface Evaluator {
2404
+ interface Grader {
2407
2405
  readonly kind: string;
2408
2406
  evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
2409
2407
  }
2410
- interface EvaluatorFactory {
2411
- create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
2408
+ interface GraderFactory {
2409
+ create(config: GraderConfig, context: EvaluationContext): Grader;
2412
2410
  }
2413
2411
 
2414
2412
  /**
@@ -2447,7 +2445,7 @@ declare function deepEqual(a: unknown, b: unknown): boolean;
2447
2445
  */
2448
2446
  declare function negateScore(score: EvaluationScore): EvaluationScore;
2449
2447
 
2450
- interface CodeEvaluatorOptions {
2448
+ interface CodeGraderOptions {
2451
2449
  readonly command: readonly string[];
2452
2450
  /** @deprecated Use `command` instead */
2453
2451
  readonly script?: readonly string[];
@@ -2458,29 +2456,29 @@ interface CodeEvaluatorOptions {
2458
2456
  /** Target access config - when present, enables target invocation */
2459
2457
  readonly target?: TargetAccessConfig;
2460
2458
  }
2461
- declare class CodeEvaluator implements Evaluator {
2459
+ declare class CodeGrader implements Grader {
2462
2460
  readonly kind = "code-grader";
2463
2461
  private readonly command;
2464
2462
  private readonly cwd?;
2465
2463
  private readonly agentTimeoutMs?;
2466
2464
  private readonly config?;
2467
2465
  private readonly target?;
2468
- constructor(options: CodeEvaluatorOptions);
2466
+ constructor(options: CodeGraderOptions);
2469
2467
  evaluate(context: EvaluationContext): Promise<EvaluationScore>;
2470
2468
  }
2471
2469
  declare function executeScript(scriptPath: readonly string[] | string, input: string, agentTimeoutMs?: number, cwd?: string, env?: Record<string, string>): Promise<string>;
2472
2470
 
2473
- interface CompositeEvaluatorOptions {
2474
- readonly config: CompositeEvaluatorConfig;
2475
- readonly evaluatorFactory: EvaluatorFactory;
2471
+ interface CompositeGraderOptions {
2472
+ readonly config: CompositeGraderConfig;
2473
+ readonly evaluatorFactory: GraderFactory;
2476
2474
  readonly cwd?: string;
2477
2475
  }
2478
- declare class CompositeEvaluator implements Evaluator {
2476
+ declare class CompositeGrader implements Grader {
2479
2477
  readonly kind = "composite";
2480
2478
  private readonly config;
2481
2479
  private readonly evaluatorFactory;
2482
2480
  private readonly cwd?;
2483
- constructor(options: CompositeEvaluatorOptions);
2481
+ constructor(options: CompositeGraderOptions);
2484
2482
  evaluate(context: EvaluationContext): Promise<EvaluationScore>;
2485
2483
  private aggregate;
2486
2484
  private runWeightedAverage;
@@ -2489,50 +2487,50 @@ declare class CompositeEvaluator implements Evaluator {
2489
2487
  private runLlmAggregator;
2490
2488
  }
2491
2489
 
2492
- interface CostEvaluatorOptions {
2493
- readonly config: CostEvaluatorConfig;
2490
+ interface CostGraderOptions {
2491
+ readonly config: CostGraderConfig;
2494
2492
  }
2495
2493
  /**
2496
- * Evaluator that checks execution cost against a budget.
2494
+ * Grader that checks execution cost against a budget.
2497
2495
  * Uses costUsd from the evaluation context.
2498
2496
  */
2499
- declare class CostEvaluator implements Evaluator {
2497
+ declare class CostGrader implements Grader {
2500
2498
  readonly kind = "cost";
2501
2499
  private readonly config;
2502
- constructor(options: CostEvaluatorOptions);
2500
+ constructor(options: CostGraderOptions);
2503
2501
  evaluate(context: EvaluationContext): EvaluationScore;
2504
2502
  }
2505
2503
 
2506
- interface ExecutionMetricsEvaluatorOptions {
2507
- readonly config: ExecutionMetricsEvaluatorConfig;
2504
+ interface ExecutionMetricsGraderOptions {
2505
+ readonly config: ExecutionMetricsGraderConfig;
2508
2506
  }
2509
2507
  /**
2510
- * Evaluator that checks execution metrics against configured thresholds.
2508
+ * Grader that checks execution metrics against configured thresholds.
2511
2509
  * Supports multiple threshold types: tool calls, LLM calls, tokens, cost, duration,
2512
2510
  * and exploration ratio. Only specified thresholds are checked.
2513
2511
  *
2514
2512
  * Score is proportional: passed / total assertions
2515
2513
  */
2516
- declare class ExecutionMetricsEvaluator implements Evaluator {
2514
+ declare class ExecutionMetricsGrader implements Grader {
2517
2515
  readonly kind = "execution-metrics";
2518
2516
  private readonly config;
2519
- constructor(options: ExecutionMetricsEvaluatorOptions);
2517
+ constructor(options: ExecutionMetricsGraderOptions);
2520
2518
  evaluate(context: EvaluationContext): EvaluationScore;
2521
2519
  private extractConfiguredThresholds;
2522
2520
  private filterDefinedMetrics;
2523
2521
  }
2524
2522
 
2525
- interface FieldAccuracyEvaluatorOptions {
2526
- readonly config: FieldAccuracyEvaluatorConfig;
2523
+ interface FieldAccuracyGraderOptions {
2524
+ readonly config: FieldAccuracyGraderConfig;
2527
2525
  }
2528
2526
  /**
2529
- * FieldAccuracyEvaluator compares extracted structured data against expected values
2527
+ * FieldAccuracyGrader compares extracted structured data against expected values
2530
2528
  * with configurable matching strategies (exact, fuzzy, numeric_tolerance, date).
2531
2529
  */
2532
- declare class FieldAccuracyEvaluator implements Evaluator {
2530
+ declare class FieldAccuracyGrader implements Grader {
2533
2531
  readonly kind = "field-accuracy";
2534
2532
  private readonly config;
2535
- constructor(options: FieldAccuracyEvaluatorOptions);
2533
+ constructor(options: FieldAccuracyGraderOptions);
2536
2534
  evaluate(context: EvaluationContext): EvaluationScore;
2537
2535
  /**
2538
2536
  * Extract expected data from expected_output array.
@@ -2561,33 +2559,33 @@ declare class FieldAccuracyEvaluator implements Evaluator {
2561
2559
  private aggregateResults;
2562
2560
  }
2563
2561
 
2564
- interface LatencyEvaluatorOptions {
2565
- readonly config: LatencyEvaluatorConfig;
2562
+ interface LatencyGraderOptions {
2563
+ readonly config: LatencyGraderConfig;
2566
2564
  }
2567
2565
  /**
2568
- * Evaluator that checks execution duration against a threshold.
2566
+ * Grader that checks execution duration against a threshold.
2569
2567
  * Uses durationMs from the evaluation context.
2570
2568
  */
2571
- declare class LatencyEvaluator implements Evaluator {
2569
+ declare class LatencyGrader implements Grader {
2572
2570
  readonly kind = "latency";
2573
2571
  private readonly config;
2574
- constructor(options: LatencyEvaluatorOptions);
2572
+ constructor(options: LatencyGraderOptions);
2575
2573
  evaluate(context: EvaluationContext): EvaluationScore;
2576
2574
  }
2577
2575
 
2578
2576
  /**
2579
- * Default evaluator template for the user prompt (variables will be substituted).
2580
- * Custom evaluators can override this via evaluatorTemplate option.
2577
+ * Default grader template for the user prompt (variables will be substituted).
2578
+ * Custom graders can override this via graderTemplate option.
2581
2579
  */
2582
- declare const DEFAULT_EVALUATOR_TEMPLATE: string;
2580
+ declare const DEFAULT_GRADER_TEMPLATE: string;
2583
2581
  type GraderProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
2584
- interface LlmGraderEvaluatorOptions {
2582
+ interface LlmGraderOptions {
2585
2583
  readonly resolveGraderProvider: GraderProviderResolver;
2586
2584
  /** @deprecated Use `resolveGraderProvider` instead. */
2587
2585
  readonly resolveJudgeProvider?: GraderProviderResolver;
2588
2586
  readonly maxOutputTokens?: number;
2589
2587
  readonly temperature?: number;
2590
- readonly evaluatorTemplate?: string;
2588
+ readonly graderTemplate?: string;
2591
2589
  readonly maxSteps?: number;
2592
2590
  readonly graderTargetProvider?: Provider;
2593
2591
  /** @deprecated Use `graderTargetProvider` instead. */
@@ -2657,15 +2655,15 @@ declare const rubricEvaluationSchema: z.ZodObject<{
2657
2655
  overall_reasoning: string;
2658
2656
  }>;
2659
2657
 
2660
- declare class LlmGraderEvaluator implements Evaluator {
2658
+ declare class LlmGrader implements Grader {
2661
2659
  readonly kind = "llm-grader";
2662
2660
  private readonly resolveGraderProvider;
2663
2661
  private readonly maxOutputTokens?;
2664
2662
  private readonly temperature?;
2665
- private readonly evaluatorTemplate?;
2663
+ private readonly graderTemplate?;
2666
2664
  private readonly maxSteps;
2667
2665
  private readonly graderTargetProvider?;
2668
- constructor(options: LlmGraderEvaluatorOptions);
2666
+ constructor(options: LlmGraderOptions);
2669
2667
  evaluate(context: EvaluationContext): Promise<EvaluationScore>;
2670
2668
  private prepareContext;
2671
2669
  private evaluateFreeform;
@@ -2722,7 +2720,7 @@ declare class LlmGraderEvaluator implements Evaluator {
2722
2720
  }
2723
2721
  /**
2724
2722
  * Build the mandatory output schema that all evaluators must follow.
2725
- * This schema is always appended to the evaluator template.
2723
+ * This schema is always appended to the grader template.
2726
2724
  */
2727
2725
  declare function buildOutputSchema(): string;
2728
2726
  declare function buildRubricOutputSchema(): string;
@@ -2766,10 +2764,10 @@ declare function extractImageBlocks(messages: readonly Message[]): ContentImage[
2766
2764
  * names (input.skill, input.file_path) regardless of provider.
2767
2765
  */
2768
2766
 
2769
- declare class SkillTriggerEvaluator implements Evaluator {
2767
+ declare class SkillTriggerGrader implements Grader {
2770
2768
  readonly kind = "skill-trigger";
2771
2769
  private readonly config;
2772
- constructor(config: SkillTriggerEvaluatorConfig);
2770
+ constructor(config: SkillTriggerGraderConfig);
2773
2771
  evaluate(context: EvaluationContext): EvaluationScore;
2774
2772
  }
2775
2773
 
@@ -2783,33 +2781,33 @@ declare function assembleLlmGraderPrompt(input: {
2783
2781
  evalCase: EvalTest;
2784
2782
  candidate: string;
2785
2783
  promptInputs: PromptInputs;
2786
- evaluatorConfig?: LlmGraderEvaluatorConfig;
2784
+ evaluatorConfig?: LlmGraderConfig;
2787
2785
  output?: readonly Message[];
2788
2786
  fileChanges?: string;
2789
- evaluatorTemplateOverride?: string;
2787
+ graderTemplateOverride?: string;
2790
2788
  }): LlmGraderPromptAssembly;
2791
2789
 
2792
- interface TokenUsageEvaluatorOptions {
2793
- readonly config: TokenUsageEvaluatorConfig;
2790
+ interface TokenUsageGraderOptions {
2791
+ readonly config: TokenUsageGraderConfig;
2794
2792
  }
2795
2793
  /**
2796
- * Evaluator that checks provider-reported token usage against configured limits.
2794
+ * Grader that checks provider-reported token usage against configured limits.
2797
2795
  * Uses tokenUsage from the evaluation context.
2798
2796
  */
2799
- declare class TokenUsageEvaluator implements Evaluator {
2797
+ declare class TokenUsageGrader implements Grader {
2800
2798
  readonly kind = "token-usage";
2801
2799
  private readonly config;
2802
- constructor(options: TokenUsageEvaluatorOptions);
2800
+ constructor(options: TokenUsageGraderOptions);
2803
2801
  evaluate(context: EvaluationContext): EvaluationScore;
2804
2802
  }
2805
2803
 
2806
- interface ToolTrajectoryEvaluatorOptions {
2807
- readonly config: ToolTrajectoryEvaluatorConfig;
2804
+ interface ToolTrajectoryGraderOptions {
2805
+ readonly config: ToolTrajectoryGraderConfig;
2808
2806
  }
2809
- declare class ToolTrajectoryEvaluator implements Evaluator {
2807
+ declare class ToolTrajectoryGrader implements Grader {
2810
2808
  readonly kind = "tool-trajectory";
2811
2809
  private readonly config;
2812
- constructor(options: ToolTrajectoryEvaluatorOptions);
2810
+ constructor(options: ToolTrajectoryGraderOptions);
2813
2811
  evaluate(context: EvaluationContext): EvaluationScore;
2814
2812
  /**
2815
2813
  * Extract tool calls from output messages.
@@ -2873,7 +2871,7 @@ declare function runIsJsonAssertion(output: string): AssertionResult;
2873
2871
  declare function runEqualsAssertion(output: string, value: string): AssertionResult;
2874
2872
 
2875
2873
  /**
2876
- * Extensible evaluator registry.
2874
+ * Extensible grader registry.
2877
2875
  *
2878
2876
  * Replaces the hardcoded switch/case dispatch in the orchestrator with
2879
2877
  * a registry of named factory functions. Built-in evaluators are registered
@@ -2882,10 +2880,10 @@ declare function runEqualsAssertion(output: string, value: string): AssertionRes
2882
2880
  */
2883
2881
 
2884
2882
  /**
2885
- * Context passed to evaluator factory functions during creation.
2883
+ * Context passed to grader factory functions during creation.
2886
2884
  * Contains shared resources needed by evaluator instances.
2887
2885
  */
2888
- interface EvaluatorDispatchContext {
2886
+ interface GraderDispatchContext {
2889
2887
  /** Shared LLM grader provider (resolved at suite level) */
2890
2888
  readonly graderProvider?: Provider;
2891
2889
  /** @deprecated Use `graderProvider` instead */
@@ -2899,48 +2897,48 @@ interface EvaluatorDispatchContext {
2899
2897
  /** Directory containing the eval file (for composite member resolution) */
2900
2898
  readonly evalFileDir?: string;
2901
2899
  /** Shared LLM grader evaluator instance */
2902
- readonly llmGrader: Evaluator;
2900
+ readonly llmGrader: Grader;
2903
2901
  /** @deprecated Use `llmGrader` instead */
2904
- readonly llmJudge?: Evaluator;
2902
+ readonly llmJudge?: Grader;
2905
2903
  /** Reference to the registry itself (for composite evaluators that need to create children) */
2906
- readonly registry: EvaluatorRegistry;
2904
+ readonly registry: GraderRegistry;
2907
2905
  }
2908
2906
  /**
2909
- * Factory function that creates an Evaluator instance from a config.
2907
+ * Factory function that creates a Grader instance from a config.
2910
2908
  *
2911
2909
  * Factory functions handle all type-specific initialization logic:
2912
2910
  * - Reading prompt files for LLM graders
2913
2911
  * - Resolving script paths for code graders
2914
2912
  * - Creating adapter evaluators for deterministic assertions
2915
2913
  */
2916
- type EvaluatorFactoryFn = (config: EvaluatorConfig, context: EvaluatorDispatchContext) => Evaluator | Promise<Evaluator>;
2914
+ type GraderFactoryFn = (config: GraderConfig, context: GraderDispatchContext) => Grader | Promise<Grader>;
2917
2915
  /**
2918
- * Registry of evaluator factory functions keyed by evaluator type name.
2916
+ * Registry of grader factory functions keyed by grader type name.
2919
2917
  *
2920
2918
  * Built-in evaluators are registered at startup. Custom evaluators can be
2921
2919
  * registered via the `register()` method or discovered from `.agentv/assertions/`.
2922
2920
  */
2923
- declare class EvaluatorRegistry {
2921
+ declare class GraderRegistry {
2924
2922
  private readonly factories;
2925
- /** Register a factory function for an evaluator type. */
2926
- register(type: string, factory: EvaluatorFactoryFn): this;
2927
- /** Get the factory function for an evaluator type. */
2928
- get(type: string): EvaluatorFactoryFn | undefined;
2923
+ /** Register a factory function for a grader type. */
2924
+ register(type: string, factory: GraderFactoryFn): this;
2925
+ /** Get the factory function for a grader type. */
2926
+ get(type: string): GraderFactoryFn | undefined;
2929
2927
  /** Check if a factory is registered for the given type. */
2930
2928
  has(type: string): boolean;
2931
- /** List all registered evaluator type names. */
2929
+ /** List all registered grader type names. */
2932
2930
  list(): string[];
2933
2931
  /**
2934
2932
  * Create an evaluator instance from a config, using the registered factory.
2935
- * Throws if no factory is registered for the evaluator type.
2933
+ * Throws if no factory is registered for the grader type.
2936
2934
  */
2937
- create(config: EvaluatorConfig, context: EvaluatorDispatchContext): Promise<Evaluator>;
2935
+ create(config: GraderConfig, context: GraderDispatchContext): Promise<Grader>;
2938
2936
  }
2939
2937
  /**
2940
- * Adapter that wraps a synchronous assertion function as an Evaluator.
2938
+ * Adapter that wraps a synchronous assertion function as a Grader.
2941
2939
  * Used for deterministic assertions (contains, regex, is-json, equals).
2942
2940
  */
2943
- declare class DeterministicAssertionEvaluator implements Evaluator {
2941
+ declare class DeterministicAssertionGrader implements Grader {
2944
2942
  private readonly assertFn;
2945
2943
  readonly kind: string;
2946
2944
  constructor(kind: string, assertFn: (context: EvaluationContext) => EvaluationScore);
@@ -2988,8 +2986,8 @@ interface RunEvalCaseOptions {
2988
2986
  readonly evalCase: EvalTest;
2989
2987
  readonly provider: Provider;
2990
2988
  readonly target: ResolvedTarget;
2991
- readonly evaluators: Partial<Record<string, Evaluator>> & {
2992
- readonly 'llm-grader': Evaluator;
2989
+ readonly evaluators: Partial<Record<string, Grader>> & {
2990
+ readonly 'llm-grader': Grader;
2993
2991
  };
2994
2992
  readonly now?: () => Date;
2995
2993
  readonly maxRetries?: number;
@@ -3020,8 +3018,8 @@ interface RunEvalCaseOptions {
3020
3018
  readonly suiteWorkspaceFile?: string;
3021
3019
  /** Real-time observability callbacks passed to the provider */
3022
3020
  readonly streamCallbacks?: ProviderStreamCallbacks;
3023
- /** Evaluator type registry (with custom assertions discovered) */
3024
- readonly typeRegistry?: EvaluatorRegistry;
3021
+ /** Grader type registry (with custom assertions discovered) */
3022
+ readonly typeRegistry?: GraderRegistry;
3025
3023
  /** RepoManager instance for repo lifecycle (shared workspace mode) */
3026
3024
  readonly repoManager?: RepoManager;
3027
3025
  /** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
@@ -3054,7 +3052,7 @@ interface RunEvaluationOptions {
3054
3052
  readonly targets?: readonly TargetDefinition[];
3055
3053
  readonly env?: EnvLookup;
3056
3054
  readonly providerFactory?: (target: ResolvedTarget) => Provider;
3057
- readonly evaluators?: Partial<Record<string, Evaluator>>;
3055
+ readonly evaluators?: Partial<Record<string, Grader>>;
3058
3056
  readonly maxRetries?: number;
3059
3057
  readonly agentTimeoutMs?: number;
3060
3058
  readonly cache?: EvaluationCache;
@@ -3076,7 +3074,7 @@ interface RunEvaluationOptions {
3076
3074
  /** Real-time observability callbacks passed to the provider */
3077
3075
  readonly streamCallbacks?: ProviderStreamCallbacks;
3078
3076
  /** Suite-level total cost budget in USD (stops dispatching when exceeded) */
3079
- readonly totalBudgetUsd?: number;
3077
+ readonly budgetUsd?: number;
3080
3078
  /** Execution error tolerance: true halts on first error */
3081
3079
  readonly failOnError?: FailOnError;
3082
3080
  /** Workspace pooling: true (default) enables pool, false disables, undefined defaults to true */
@@ -3111,7 +3109,7 @@ declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationRes
3111
3109
  * Types for inline assertion functions used in the evaluate() API.
3112
3110
  *
3113
3111
  * Inline functions are the escape hatch for custom evaluation logic
3114
- * that doesn't fit a built-in evaluator type. For built-in assertions
3112
+ * that doesn't fit a built-in grader type. For built-in assertions
3115
3113
  * (contains, regex, is-json, etc.), use config objects instead:
3116
3114
  *
3117
3115
  * assert: [{ type: 'contains', value: 'hello' }]
@@ -4186,17 +4184,17 @@ declare class OtlpJsonFileExporter {
4186
4184
  }
4187
4185
 
4188
4186
  /**
4189
- * Factory functions for all built-in evaluator types.
4187
+ * Factory functions for all built-in grader types.
4190
4188
  *
4191
- * Each factory creates an Evaluator instance from an EvaluatorConfig,
4189
+ * Each factory creates a Grader instance from a GraderConfig,
4192
4190
  * handling type-specific initialization logic. These are registered into
4193
- * the EvaluatorRegistry at startup.
4191
+ * the GraderRegistry at startup.
4194
4192
  */
4195
4193
 
4196
4194
  /**
4197
- * Create a new EvaluatorRegistry with all built-in evaluator types registered.
4195
+ * Create a new GraderRegistry with all built-in grader types registered.
4198
4196
  */
4199
- declare function createBuiltinRegistry(): EvaluatorRegistry;
4197
+ declare function createBuiltinRegistry(): GraderRegistry;
4200
4198
 
4201
4199
  /**
4202
4200
  * Convention-based discovery of custom assertion scripts.
@@ -4216,27 +4214,27 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
4216
4214
  * @param baseDir - The base directory to search from (typically project root or eval file dir)
4217
4215
  * @returns Names of discovered assertion types
4218
4216
  */
4219
- declare function discoverAssertions(registry: EvaluatorRegistry, baseDir: string): Promise<string[]>;
4217
+ declare function discoverAssertions(registry: GraderRegistry, baseDir: string): Promise<string[]>;
4220
4218
 
4221
4219
  /**
4222
4220
  * Convention-based discovery of custom grader scripts.
4223
4221
  *
4224
4222
  * Scans `.agentv/graders/` (and legacy `.agentv/judges/`) for TypeScript/JavaScript
4225
- * files and registers them as code-grader evaluators in the registry. The file name
4226
- * (without extension) becomes the evaluator type name.
4223
+ * files and registers them as code graders in the registry. The file name
4224
+ * (without extension) becomes the grader type name.
4227
4225
  *
4228
4226
  * Example: `.agentv/graders/custom-grader.ts` → type "custom-grader" in EVAL.yaml
4229
4227
  */
4230
4228
 
4231
4229
  /**
4232
4230
  * Discover custom grader scripts from `.agentv/graders/` (and legacy `.agentv/judges/`)
4233
- * and register them as evaluator types in the registry.
4231
+ * and register them as grader types in the registry.
4234
4232
  *
4235
- * @param registry - The evaluator registry to register discovered graders into
4233
+ * @param registry - The grader registry to register discovered graders into
4236
4234
  * @param baseDir - The base directory to search from (typically project root or eval file dir)
4237
4235
  * @returns Names of discovered grader types
4238
4236
  */
4239
- declare function discoverGraders(registry: EvaluatorRegistry, baseDir: string): Promise<string[]>;
4237
+ declare function discoverGraders(registry: GraderRegistry, baseDir: string): Promise<string[]>;
4240
4238
 
4241
4239
  /**
4242
4240
  * Core types for the transcript import pipeline.
@@ -4489,7 +4487,7 @@ declare function discoverClaudeSessions(opts?: ClaudeDiscoverOptions): Promise<C
4489
4487
  * 1. Reads a transcript JSONL file (produced by `agentv import`)
4490
4488
  * 2. Each invocation pops the next line from the transcript
4491
4489
  * 3. Returns a ProviderResponse with pre-populated output, token usage, etc.
4492
- * 4. Evaluators run identically to live eval — they see the same ProviderResponse
4490
+ * 4. Graders run identically to live eval — they see the same ProviderResponse
4493
4491
  *
4494
4492
  * The provider name in results is set to the source provider from the transcript
4495
4493
  * (e.g., "claude", "codex", "copilot").
@@ -4555,4 +4553,4 @@ type AgentKernel = {
4555
4553
  };
4556
4554
  declare function createAgentKernel(): AgentKernel;
4557
4555
 
4558
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type 
EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type 
PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, 
calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, 
prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
4556
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, 
type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type RepoCheckout, type RepoClone, type 
RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, 
createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, 
runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };